uhtml: check if document is valid utf8 even with charset specified
Often, documents specify a charset but are really UTF-8 encoded. We now try to decode the document as UTF-8 first, and only if that fails do we assume the charset specified in the document.
This commit is contained in:
parent
e601e1605b
commit
3d1e12363d
|
@ -49,7 +49,7 @@ void
|
||||||
main(int argc, char *argv[])
|
main(int argc, char *argv[])
|
||||||
{
|
{
|
||||||
int n, q, pfd[2], pflag = 0;
|
int n, q, pfd[2], pflag = 0;
|
||||||
char *arg[4], *s, *e, *p, *g, *a, t;
|
char *arg[4], *s, *g, *e, *p, *a, t;
|
||||||
Rune r;
|
Rune r;
|
||||||
|
|
||||||
ARGBEGIN {
|
ARGBEGIN {
|
||||||
|
@ -69,34 +69,34 @@ main(int argc, char *argv[])
|
||||||
sysfatal("open: %r");
|
sysfatal("open: %r");
|
||||||
}
|
}
|
||||||
nbuf = 0;
|
nbuf = 0;
|
||||||
p = buf;
|
|
||||||
g = buf;
|
|
||||||
while(nbuf < sizeof(buf)-1){
|
while(nbuf < sizeof(buf)-1){
|
||||||
if((n = read(0, buf + nbuf, sizeof(buf)-1-nbuf)) <= 0)
|
if((n = read(0, buf + nbuf, sizeof(buf)-1-nbuf)) <= 0)
|
||||||
break;
|
break;
|
||||||
nbuf += n;
|
nbuf += n;
|
||||||
buf[nbuf] = 0;
|
buf[nbuf] = 0;
|
||||||
if(nbuf == n){
|
}
|
||||||
if(memcmp(p, "\xEF\xBB\xBF", 3)==0){
|
|
||||||
|
p = buf;
|
||||||
|
if(nbuf >= 3 && memcmp(p, "\xEF\xBB\xBF", 3)==0){
|
||||||
p += 3;
|
p += 3;
|
||||||
nbuf -= 3;
|
nbuf -= 3;
|
||||||
cset = "utf";
|
cset = "utf";
|
||||||
goto Found;
|
goto Found;
|
||||||
}
|
}
|
||||||
if(memcmp(p, "\xFE\xFF", 2) == 0){
|
if(nbuf >= 2 && memcmp(p, "\xFE\xFF", 2) == 0){
|
||||||
p += 2;
|
p += 2;
|
||||||
nbuf -= 2;
|
nbuf -= 2;
|
||||||
cset = "unicode-be";
|
cset = "unicode-be";
|
||||||
goto Found;
|
goto Found;
|
||||||
}
|
}
|
||||||
if(memcmp(p, "\xFF\xFE", 2) == 0){
|
if(nbuf >= 2 && memcmp(p, "\xFF\xFE", 2) == 0){
|
||||||
p += 2;
|
p += 2;
|
||||||
nbuf -= 2;
|
nbuf -= 2;
|
||||||
cset = "unicode-le";
|
cset = "unicode-le";
|
||||||
goto Found;
|
goto Found;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
s = g;
|
s = p;
|
||||||
do {
|
do {
|
||||||
if((s = strchr(s, '<')) == nil)
|
if((s = strchr(s, '<')) == nil)
|
||||||
break;
|
break;
|
||||||
|
@ -122,26 +122,26 @@ main(int argc, char *argv[])
|
||||||
}
|
}
|
||||||
t = *e;
|
t = *e;
|
||||||
*e = 0;
|
*e = 0;
|
||||||
if((a = attr(g, "encoding")) || (a = attr(g, "charset"))){
|
if((a = attr(g, "encoding")) != nil || (a = attr(g, "charset")) != nil){
|
||||||
*e = t;
|
|
||||||
cset = a;
|
cset = a;
|
||||||
goto Found;
|
*e = t;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
*e = t;
|
*e = t;
|
||||||
s = ++e;
|
s = ++e;
|
||||||
} while(t);
|
} while(t);
|
||||||
}
|
|
||||||
if(cset)
|
|
||||||
goto Found;
|
|
||||||
s = p;
|
s = p;
|
||||||
while(s+UTFmax < p+nbuf){
|
while(s+UTFmax < p+nbuf){
|
||||||
s += chartorune(&r, s);
|
s += chartorune(&r, s);
|
||||||
if(r == Runeerror){
|
if(r == Runeerror){
|
||||||
|
if(cset == nil)
|
||||||
cset = "latin1";
|
cset = "latin1";
|
||||||
goto Found;
|
goto Found;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
cset = "utf";
|
cset = "utf";
|
||||||
|
|
||||||
Found:
|
Found:
|
||||||
if(pflag){
|
if(pflag){
|
||||||
print("%s\n", cset);
|
print("%s\n", cset);
|
||||||
|
|
Loading…
Reference in a new issue