html2ms, tcs, mothra, uhtml: treat ' as special entity, add uhtml(1)
This commit is contained in:
parent
6d6880cec9
commit
13304b7b96
5 changed files with 95 additions and 29 deletions
46
sys/man/1/uhtml
Normal file
46
sys/man/1/uhtml
Normal file
|
@ -0,0 +1,46 @@
|
||||||
|
.TH UHTML 1
|
||||||
|
.SH NAME
|
||||||
|
uhtml \- convert foreign character set HTML file to unicode
|
||||||
|
.SH SYNOPSIS
|
||||||
|
.B uhtml
|
||||||
|
[
|
||||||
|
.B -p
|
||||||
|
] [
|
||||||
|
.B -c
|
||||||
|
.I charset
|
||||||
|
] [
|
||||||
|
.I file
|
||||||
|
]
|
||||||
|
.SH DESCRIPTION
|
||||||
|
HTML comes in various character set encodings
|
||||||
|
and has special forms to encode characters. To
|
||||||
|
make it easier to process html, uhtml is used
|
||||||
|
to normalize it to a unicode only form.
|
||||||
|
.LP
|
||||||
|
Uhtml detects the character set of the html input
|
||||||
|
.I file
|
||||||
|
and calls
|
||||||
|
.IR tcs (1)
|
||||||
|
to convert it to utf replacing html-entity forms
|
||||||
|
by their unicode character representations except for
|
||||||
|
.B lt
|
||||||
|
.B gt
|
||||||
|
.B amp
|
||||||
|
.B quot
|
||||||
|
and
|
||||||
|
.BR apos .
|
||||||
|
The converted html is written to
|
||||||
|
standard output. If no
|
||||||
|
.I file
|
||||||
|
was given, it is read from standard input. If the
|
||||||
|
.B -p
|
||||||
|
option is given, the detected character set is printed and
|
||||||
|
the program exits without conversion.
|
||||||
|
In case character set detection fails, the default (utf)
|
||||||
|
is assumed. This default can be changed with the
|
||||||
|
.B -c
|
||||||
|
option.
|
||||||
|
.SH SOURCE
|
||||||
|
.B /sys/src/cmd/uhtml.c
|
||||||
|
.SH SEE ALSO
|
||||||
|
.IR tcs (1)
|
|
@ -680,6 +680,8 @@ parserune(int c)
|
||||||
return '>';
|
return '>';
|
||||||
if(strcmp(buf, "quot") == 0)
|
if(strcmp(buf, "quot") == 0)
|
||||||
return '"';
|
return '"';
|
||||||
|
if(strcmp(buf, "apos") == 0)
|
||||||
|
return '\'';
|
||||||
if(strcmp(buf, "amp") == 0)
|
if(strcmp(buf, "amp") == 0)
|
||||||
return '&';
|
return '&';
|
||||||
/* use tcs -f html to handle the rest. */
|
/* use tcs -f html to handle the rest. */
|
||||||
|
|
|
@ -272,6 +272,8 @@ void pl_rmentities(Hglob *g, char *s){
|
||||||
*t++='>';
|
*t++='>';
|
||||||
else if(strcmp(u, "quot") == 0)
|
else if(strcmp(u, "quot") == 0)
|
||||||
*t++='"';
|
*t++='"';
|
||||||
|
else if(strcmp(u, "apos") == 0)
|
||||||
|
*t++='\'';
|
||||||
else if(strcmp(u, "amp") == 0)
|
else if(strcmp(u, "amp") == 0)
|
||||||
*t++='&';
|
*t++='&';
|
||||||
else {
|
else {
|
||||||
|
|
|
@ -11,8 +11,6 @@ struct Hchar
|
||||||
Rune r;
|
Rune r;
|
||||||
};
|
};
|
||||||
|
|
||||||
/* <, >, ", & intentionally omitted */
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Names beginning with _ are names we recognize
|
* Names beginning with _ are names we recognize
|
||||||
* (without the underscore) but will not generate,
|
* (without the underscore) but will not generate,
|
||||||
|
@ -86,7 +84,7 @@ static Hchar byname[] =
|
||||||
{"agrave", 224},
|
{"agrave", 224},
|
||||||
{"alefsym", 8501},
|
{"alefsym", 8501},
|
||||||
{"alpha", 945},
|
{"alpha", 945},
|
||||||
/* {"amp", 38}, */
|
{"amp", 38},
|
||||||
{"and", 8743},
|
{"and", 8743},
|
||||||
{"ang", 8736},
|
{"ang", 8736},
|
||||||
{"aring", 229},
|
{"aring", 229},
|
||||||
|
@ -141,7 +139,7 @@ static Hchar byname[] =
|
||||||
{"frasl", 8260},
|
{"frasl", 8260},
|
||||||
{"gamma", 947},
|
{"gamma", 947},
|
||||||
{"ge", 8805},
|
{"ge", 8805},
|
||||||
/* {"gt", 62}, */
|
{"gt", 62},
|
||||||
{"hArr", 8660},
|
{"hArr", 8660},
|
||||||
{"harr", 8596},
|
{"harr", 8596},
|
||||||
{"hearts", 9829},
|
{"hearts", 9829},
|
||||||
|
@ -173,7 +171,7 @@ static Hchar byname[] =
|
||||||
{"lrm", 8206},
|
{"lrm", 8206},
|
||||||
{"lsaquo", 8249},
|
{"lsaquo", 8249},
|
||||||
{"lsquo", 8216},
|
{"lsquo", 8216},
|
||||||
/* {"lt", 60}, */
|
{"lt", 60},
|
||||||
{"macr", 175},
|
{"macr", 175},
|
||||||
{"mdash", 8212},
|
{"mdash", 8212},
|
||||||
{"micro", 181},
|
{"micro", 181},
|
||||||
|
@ -219,7 +217,7 @@ static Hchar byname[] =
|
||||||
{"prop", 8733},
|
{"prop", 8733},
|
||||||
{"psi", 968},
|
{"psi", 968},
|
||||||
{"quad", 8193},
|
{"quad", 8193},
|
||||||
/* {"quot", 34}, */
|
{"quot", 34},
|
||||||
{"rArr", 8658},
|
{"rArr", 8658},
|
||||||
{"radic", 8730},
|
{"radic", 8730},
|
||||||
{"rang", 9002},
|
{"rang", 9002},
|
||||||
|
@ -416,10 +414,8 @@ html_in(int fd, long *x, struct convert *out)
|
||||||
}
|
}
|
||||||
buf[i] = 0;
|
buf[i] = 0;
|
||||||
if(i > 1){
|
if(i > 1){
|
||||||
if((c = findbyname(buf+1)) != Runeerror){
|
if((c = findbyname(buf+1)) != Runeerror)
|
||||||
*r++ = c;
|
goto out;
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if(i > 2 && buf[1] == '#'){
|
if(i > 2 && buf[1] == '#'){
|
||||||
if(i > 3 && strchr("xX", buf[2]))
|
if(i > 3 && strchr("xX", buf[2]))
|
||||||
c = strtol(buf+3, &p, 16);
|
c = strtol(buf+3, &p, 16);
|
||||||
|
@ -427,8 +423,7 @@ html_in(int fd, long *x, struct convert *out)
|
||||||
c = strtol(buf+2, &p, 10);
|
c = strtol(buf+2, &p, 10);
|
||||||
if(*p || c >= NRUNE || c < 0)
|
if(*p || c >= NRUNE || c < 0)
|
||||||
goto bad;
|
goto bad;
|
||||||
*r++ = c;
|
goto out;
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
bad:
|
bad:
|
||||||
|
@ -442,6 +437,12 @@ html_in(int fd, long *x, struct convert *out)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
|
out:
|
||||||
|
if(strchr("<>&\"'", c)){
|
||||||
|
s = ';';
|
||||||
|
i = sprint(buf, "&%s", findbyrune(c));
|
||||||
|
goto bad;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
*r++ = c;
|
*r++ = c;
|
||||||
}
|
}
|
||||||
|
|
|
@ -41,7 +41,7 @@ void
|
||||||
main(int argc, char *argv[])
|
main(int argc, char *argv[])
|
||||||
{
|
{
|
||||||
int pfd[2], pflag = 0;
|
int pfd[2], pflag = 0;
|
||||||
char *arg[4], *s;
|
char *arg[4], *s, *p;
|
||||||
|
|
||||||
ARGBEGIN {
|
ARGBEGIN {
|
||||||
case 'h':
|
case 'h':
|
||||||
|
@ -59,42 +59,54 @@ main(int argc, char *argv[])
|
||||||
if(open(*argv, OREAD) != 1)
|
if(open(*argv, OREAD) != 1)
|
||||||
sysfatal("open: %r");
|
sysfatal("open: %r");
|
||||||
}
|
}
|
||||||
if((nbuf = read(0, buf, sizeof(buf)-1)) < 0)
|
if((nbuf = readn(0, buf, sizeof(buf)-1)) < 0)
|
||||||
sysfatal("read: %r");
|
sysfatal("read: %r");
|
||||||
buf[nbuf] = 0;
|
buf[nbuf] = 0;
|
||||||
|
p = buf;
|
||||||
/* useless BOM marker */
|
while(nbuf > 0){
|
||||||
if(memcmp(buf, "\xEF\xBB\xBF", 3)==0)
|
if(memcmp(p, "\xEF\xBB\xBF", 3)==0){
|
||||||
memmove(buf, buf+3, nbuf-3);
|
p += 3;
|
||||||
|
cset = "utf";
|
||||||
for(;;){
|
break;
|
||||||
if(s = cistrstr(buf, "encoding="))
|
}
|
||||||
|
if(memcmp(p, "\xFE\xFF", 2) == 0){
|
||||||
|
p += 2;
|
||||||
|
cset = "unicode-be";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if(memcmp(p, "\xFF\xFE", 2) == 0){
|
||||||
|
p += 2;
|
||||||
|
cset = "unicode-le";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if(s = cistrstr(p, "encoding="))
|
||||||
if(s = strval(s+9)){
|
if(s = strval(s+9)){
|
||||||
cset = s;
|
cset = s;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if(s = cistrstr(buf, "charset="))
|
if(s = cistrstr(p, "charset="))
|
||||||
if(s = strval(s+8)){
|
if(s = strval(s+8)){
|
||||||
cset = s;
|
cset = s;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
nbuf -= p - buf;
|
||||||
|
|
||||||
if(pflag){
|
if(pflag){
|
||||||
print("%s\n", cset);
|
print("%s\n", cset);
|
||||||
exits(0);
|
exits(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
if(pipe(pfd) < 0)
|
|
||||||
sysfatal("pipe: %r");
|
|
||||||
|
|
||||||
if(nbuf == 0){
|
if(nbuf == 0){
|
||||||
write(1, buf, 0);
|
write(1, p, 0);
|
||||||
exits(0);
|
exits(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
switch(rfork(RFFDG|RFREND|RFPROC|RFNOWAIT)){
|
if(pipe(pfd) < 0)
|
||||||
|
sysfatal("pipe: %r");
|
||||||
|
|
||||||
|
switch(rfork(RFFDG|RFREND|RFPROC)){
|
||||||
case -1:
|
case -1:
|
||||||
sysfatal("fork: %r");
|
sysfatal("fork: %r");
|
||||||
case 0:
|
case 0:
|
||||||
|
@ -114,10 +126,13 @@ main(int argc, char *argv[])
|
||||||
close(pfd[1]);
|
close(pfd[1]);
|
||||||
|
|
||||||
while(nbuf > 0){
|
while(nbuf > 0){
|
||||||
if(write(1, buf, nbuf) != nbuf)
|
if(write(1, p, nbuf) != nbuf)
|
||||||
sysfatal("write: %r");
|
sysfatal("write: %r");
|
||||||
if((nbuf = read(0, buf, sizeof(buf))) < 0)
|
p = buf;
|
||||||
|
if((nbuf = read(0, p, sizeof(buf))) < 0)
|
||||||
sysfatal("read: %r");
|
sysfatal("read: %r");
|
||||||
}
|
}
|
||||||
|
close(1);
|
||||||
|
waitpid();
|
||||||
exits(0);
|
exits(0);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue