From 3d1e12363d0d564dd570a8b654ddd698d42e0074 Mon Sep 17 00:00:00 2001
From: cinap_lenrek <cinap_lenrek@felloff.net>
Date: Thu, 28 May 2015 16:37:55 +0200
Subject: [PATCH] uhtml: check if document is valid utf8 even with charset
 specified

Documents often declare a charset but are really UTF-8 encoded.
We now always try to decode the buffered input as UTF-8 first, and
only if that fails do we assume the charset specified in the document
(falling back to latin1 when the document specifies none).
---
 sys/src/cmd/uhtml.c | 118 ++++++++++++++++++++++----------------------
 1 file changed, 59 insertions(+), 59 deletions(-)

diff --git a/sys/src/cmd/uhtml.c b/sys/src/cmd/uhtml.c
index ca8c154ee..21db3d5b8 100644
--- a/sys/src/cmd/uhtml.c
+++ b/sys/src/cmd/uhtml.c
@@ -49,7 +49,7 @@ void
 main(int argc, char *argv[])
 {
 	int n, q, pfd[2], pflag = 0;
-	char *arg[4], *s, *e, *p, *g, *a, t;
+	char *arg[4], *s, *g, *e, *p, *a, t;
 	Rune r;
 
 	ARGBEGIN {
@@ -69,79 +69,79 @@ main(int argc, char *argv[])
 			sysfatal("open: %r");
 	}
 	nbuf = 0;
-	p = buf;
-	g = buf;
 	while(nbuf < sizeof(buf)-1){
 		if((n = read(0, buf + nbuf, sizeof(buf)-1-nbuf)) <= 0)
 			break;
 		nbuf += n;
 		buf[nbuf] = 0;
-		if(nbuf == n){
-			if(memcmp(p, "\xEF\xBB\xBF", 3)==0){
-				p += 3;
-				nbuf -= 3;
-				cset = "utf";
-				goto Found;
-			}
-			if(memcmp(p, "\xFE\xFF", 2) == 0){
-				p += 2;
-				nbuf -= 2;
-				cset = "unicode-be";
-				goto Found;
-			}
-			if(memcmp(p, "\xFF\xFE", 2) == 0){
-				p += 2;
-				nbuf -= 2;
-				cset = "unicode-le";
-				goto Found;
-			}
-		}
-		s = g;
-		do {
-			if((s = strchr(s, '<')) == nil)
-				break;
-			q = 0;
-			g = ++s;
-			e = buf+nbuf;
-			while(s < e){
-				if(*s == '=' && q == 0)
-					q = '=';
-				else if(*s == '\'' || *s == '"'){
-					if(q == '=')
-						q = *s;
-					else if(q == *s)
-						q = 0;
-				}
-				else if(*s == '>' && q != '\'' && q != '"'){
-					e = s;
-					break;
-				}
-				else if(q == '=' && strchr(whitespace, *s) == nil)
-					q = 0;
-				s++;
-			}
-			t = *e;
-			*e = 0;
-			if((a = attr(g, "encoding")) || (a = attr(g, "charset"))){
-				*e = t;
-				cset = a;
-				goto Found;
-			}
-			*e = t;
-			s = ++e;
-		} while(t);
 	}
-	if(cset)
+
+	p = buf;
+	if(nbuf >= 3 && memcmp(p, "\xEF\xBB\xBF", 3)==0){
+		p += 3;
+		nbuf -= 3;
+		cset = "utf";
 		goto Found;
+	}
+	if(nbuf >= 2 && memcmp(p, "\xFE\xFF", 2) == 0){
+		p += 2;
+		nbuf -= 2;
+		cset = "unicode-be";
+		goto Found;
+	}
+	if(nbuf >= 2 && memcmp(p, "\xFF\xFE", 2) == 0){
+		p += 2;
+		nbuf -= 2;
+		cset = "unicode-le";
+		goto Found;
+	}
+
+	s = p;
+	do {
+		if((s = strchr(s, '<')) == nil)
+			break;
+		q = 0;
+		g = ++s;
+		e = buf+nbuf;
+		while(s < e){
+			if(*s == '=' && q == 0)
+				q = '=';
+			else if(*s == '\'' || *s == '"'){
+				if(q == '=')
+					q = *s;
+				else if(q == *s)
+					q = 0;
+			}
+			else if(*s == '>' && q != '\'' && q != '"'){
+				e = s;
+				break;
+			}
+			else if(q == '=' && strchr(whitespace, *s) == nil)
+				q = 0;
+			s++;
+		}
+		t = *e;
+		*e = 0;
+		if((a = attr(g, "encoding")) != nil || (a = attr(g, "charset")) != nil){
+			cset = a;
+			*e = t;
+			break;
+		}
+		*e = t;
+		s = ++e;
+	} while(t);
+
 	s = p;
 	while(s+UTFmax < p+nbuf){
 		s += chartorune(&r, s);
 		if(r == Runeerror){
-			cset = "latin1";
+			if(cset == nil)
+				cset = "latin1";
 			goto Found;
 		}
 	}
 	cset = "utf";
+
 Found:
 	if(pflag){
 		print("%s\n", cset);