html2ms, tcs, mothra, uhtml: treat ' as special entity, add uhtml(1)

This commit is contained in:
cinap_lenrek 2011-09-24 17:06:45 +02:00
parent 6d6880cec9
commit 13304b7b96
5 changed files with 95 additions and 29 deletions

46
sys/man/1/uhtml Normal file
View file

@ -0,0 +1,46 @@
.TH UHTML 1
.SH NAME
uhtml \- convert foreign character set HTML file to unicode
.SH SYNOPSIS
.B uhtml
[
.B -p
] [
.B -c
.I charset
] [
.I file
]
.SH DESCRIPTION
HTML comes in various character set encodings
and has special forms to encode characters. To
make it easier to process html, uhtml is used
to normalize it to a unicode only form.
.LP
Uhtml detects the character set of the html input
.I file
and calls
.IR tcs (1)
to convert it to utf replacing html-entity forms
by their unicode character representations except for
.B lt
.B gt
.B amp
.B quot
and
.BR apos .
The converted html is written to
standard output. If no
.I file
was given, it is read from standard input. If the
.B -p
option is given, the detected character set is printed and
the program exits without conversion.
In case character set detection fails, the default (utf)
is assumed. This default can be changed with the
.B -c
option.
.SH SOURCE
.B /sys/src/cmd/uhtml.c
.SH SEE ALSO
.IR tcs (1)

View file

@ -680,6 +680,8 @@ parserune(int c)
return '>';
if(strcmp(buf, "quot") == 0)
return '"';
if(strcmp(buf, "apos") == 0)
return '\'';
if(strcmp(buf, "amp") == 0)
return '&';
/* use tcs -f html to handle the rest. */

View file

@ -272,6 +272,8 @@ void pl_rmentities(Hglob *g, char *s){
*t++='>';
else if(strcmp(u, "quot") == 0)
*t++='"';
else if(strcmp(u, "apos") == 0)
*t++='\'';
else if(strcmp(u, "amp") == 0)
*t++='&';
else {

View file

@ -11,8 +11,6 @@ struct Hchar
Rune r;
};
/* <, >, ", & intentionally omitted */
/*
* Names beginning with _ are names we recognize
* (without the underscore) but will not generate,
@ -86,7 +84,7 @@ static Hchar byname[] =
{"agrave", 224},
{"alefsym", 8501},
{"alpha", 945},
/* {"amp", 38}, */
{"amp", 38},
{"and", 8743},
{"ang", 8736},
{"aring", 229},
@ -141,7 +139,7 @@ static Hchar byname[] =
{"frasl", 8260},
{"gamma", 947},
{"ge", 8805},
/* {"gt", 62}, */
{"gt", 62},
{"hArr", 8660},
{"harr", 8596},
{"hearts", 9829},
@ -173,7 +171,7 @@ static Hchar byname[] =
{"lrm", 8206},
{"lsaquo", 8249},
{"lsquo", 8216},
/* {"lt", 60}, */
{"lt", 60},
{"macr", 175},
{"mdash", 8212},
{"micro", 181},
@ -219,7 +217,7 @@ static Hchar byname[] =
{"prop", 8733},
{"psi", 968},
{"quad", 8193},
/* {"quot", 34}, */
{"quot", 34},
{"rArr", 8658},
{"radic", 8730},
{"rang", 9002},
@ -416,10 +414,8 @@ html_in(int fd, long *x, struct convert *out)
}
buf[i] = 0;
if(i > 1){
if((c = findbyname(buf+1)) != Runeerror){
*r++ = c;
continue;
}
if((c = findbyname(buf+1)) != Runeerror)
goto out;
if(i > 2 && buf[1] == '#'){
if(i > 3 && strchr("xX", buf[2]))
c = strtol(buf+3, &p, 16);
@ -427,8 +423,7 @@ html_in(int fd, long *x, struct convert *out)
c = strtol(buf+2, &p, 10);
if(*p || c >= NRUNE || c < 0)
goto bad;
*r++ = c;
continue;
goto out;
}
}
bad:
@ -442,6 +437,12 @@ html_in(int fd, long *x, struct convert *out)
}
}
continue;
out:
if(strchr("<>&\"'", c)){
s = ';';
i = sprint(buf, "&%s", findbyrune(c));
goto bad;
}
}
*r++ = c;
}

View file

@ -41,7 +41,7 @@ void
main(int argc, char *argv[])
{
int pfd[2], pflag = 0;
char *arg[4], *s;
char *arg[4], *s, *p;
ARGBEGIN {
case 'h':
@ -59,42 +59,54 @@ main(int argc, char *argv[])
if(open(*argv, OREAD) != 1)
sysfatal("open: %r");
}
if((nbuf = read(0, buf, sizeof(buf)-1)) < 0)
if((nbuf = readn(0, buf, sizeof(buf)-1)) < 0)
sysfatal("read: %r");
buf[nbuf] = 0;
/* useless BOM marker */
if(memcmp(buf, "\xEF\xBB\xBF", 3)==0)
memmove(buf, buf+3, nbuf-3);
for(;;){
if(s = cistrstr(buf, "encoding="))
p = buf;
while(nbuf > 0){
if(memcmp(p, "\xEF\xBB\xBF", 3)==0){
p += 3;
cset = "utf";
break;
}
if(memcmp(p, "\xFE\xFF", 2) == 0){
p += 2;
cset = "unicode-be";
break;
}
if(memcmp(p, "\xFF\xFE", 2) == 0){
p += 2;
cset = "unicode-le";
break;
}
if(s = cistrstr(p, "encoding="))
if(s = strval(s+9)){
cset = s;
break;
}
if(s = cistrstr(buf, "charset="))
if(s = cistrstr(p, "charset="))
if(s = strval(s+8)){
cset = s;
break;
}
break;
}
nbuf -= p - buf;
if(pflag){
print("%s\n", cset);
exits(0);
}
if(pipe(pfd) < 0)
sysfatal("pipe: %r");
if(nbuf == 0){
write(1, buf, 0);
write(1, p, 0);
exits(0);
}
switch(rfork(RFFDG|RFREND|RFPROC|RFNOWAIT)){
if(pipe(pfd) < 0)
sysfatal("pipe: %r");
switch(rfork(RFFDG|RFREND|RFPROC)){
case -1:
sysfatal("fork: %r");
case 0:
@ -114,10 +126,13 @@ main(int argc, char *argv[])
close(pfd[1]);
while(nbuf > 0){
if(write(1, buf, nbuf) != nbuf)
if(write(1, p, nbuf) != nbuf)
sysfatal("write: %r");
if((nbuf = read(0, buf, sizeof(buf))) < 0)
p = buf;
if((nbuf = read(0, p, sizeof(buf))) < 0)
sysfatal("read: %r");
}
close(1);
waitpid();
exits(0);
}