html2ms, tcs, mothra, uhtml: treat ' as special entity, add uhtml(1)
This commit is contained in:
parent
6d6880cec9
commit
13304b7b96
5 changed files with 95 additions and 29 deletions
46
sys/man/1/uhtml
Normal file
46
sys/man/1/uhtml
Normal file
|
@ -0,0 +1,46 @@
|
|||
.TH UHTML 1
|
||||
.SH NAME
|
||||
uhtml \- convert foreign character set HTML file to unicode
|
||||
.SH SYNOPSIS
|
||||
.B uhtml
|
||||
[
|
||||
.B -p
|
||||
] [
|
||||
.B -c
|
||||
.I charset
|
||||
] [
|
||||
.I file
|
||||
]
|
||||
.SH DESCRIPTION
|
||||
HTML comes in various character set encodings
|
||||
and has special forms to encode characters. To
|
||||
make it easier to process html, uhtml is used
|
||||
to normalize it to a unicode only form.
|
||||
.LP
|
||||
Uhtml detects the character set of the html input
|
||||
.I file
|
||||
and calls
|
||||
.IR tcs (1)
|
||||
to convert it to utf replacing html-entity forms
|
||||
by their unicode character representations except for
|
||||
.B lt
|
||||
.B gt
|
||||
.B amp
|
||||
.B quot
|
||||
and
|
||||
.BR apos .
|
||||
The converted html is written to
|
||||
standard output. If no
|
||||
.I file
|
||||
was given, it is read from standard input. If the
|
||||
.B -p
|
||||
option is given, the detected character set is printed and
|
||||
the program exits without conversion.
|
||||
In case character set detection fails, the default (utf)
|
||||
is assumed. This default can be changed with the
|
||||
.B -c
|
||||
option.
|
||||
.SH SOURCE
|
||||
.B /sys/src/cmd/uhtml.c
|
||||
.SH SEE ALSO
|
||||
.IR tcs (1)
|
|
@ -680,6 +680,8 @@ parserune(int c)
|
|||
return '>';
|
||||
if(strcmp(buf, "quot") == 0)
|
||||
return '"';
|
||||
if(strcmp(buf, "apos") == 0)
|
||||
return '\'';
|
||||
if(strcmp(buf, "amp") == 0)
|
||||
return '&';
|
||||
/* use tcs -f html to handle the rest. */
|
||||
|
|
|
@ -272,6 +272,8 @@ void pl_rmentities(Hglob *g, char *s){
|
|||
*t++='>';
|
||||
else if(strcmp(u, "quot") == 0)
|
||||
*t++='"';
|
||||
else if(strcmp(u, "apos") == 0)
|
||||
*t++='\'';
|
||||
else if(strcmp(u, "amp") == 0)
|
||||
*t++='&';
|
||||
else {
|
||||
|
|
|
@ -11,8 +11,6 @@ struct Hchar
|
|||
Rune r;
|
||||
};
|
||||
|
||||
/* <, >, ", & intentionally omitted */
|
||||
|
||||
/*
|
||||
* Names beginning with _ are names we recognize
|
||||
* (without the underscore) but will not generate,
|
||||
|
@ -86,7 +84,7 @@ static Hchar byname[] =
|
|||
{"agrave", 224},
|
||||
{"alefsym", 8501},
|
||||
{"alpha", 945},
|
||||
/* {"amp", 38}, */
|
||||
{"amp", 38},
|
||||
{"and", 8743},
|
||||
{"ang", 8736},
|
||||
{"aring", 229},
|
||||
|
@ -141,7 +139,7 @@ static Hchar byname[] =
|
|||
{"frasl", 8260},
|
||||
{"gamma", 947},
|
||||
{"ge", 8805},
|
||||
/* {"gt", 62}, */
|
||||
{"gt", 62},
|
||||
{"hArr", 8660},
|
||||
{"harr", 8596},
|
||||
{"hearts", 9829},
|
||||
|
@ -173,7 +171,7 @@ static Hchar byname[] =
|
|||
{"lrm", 8206},
|
||||
{"lsaquo", 8249},
|
||||
{"lsquo", 8216},
|
||||
/* {"lt", 60}, */
|
||||
{"lt", 60},
|
||||
{"macr", 175},
|
||||
{"mdash", 8212},
|
||||
{"micro", 181},
|
||||
|
@ -219,7 +217,7 @@ static Hchar byname[] =
|
|||
{"prop", 8733},
|
||||
{"psi", 968},
|
||||
{"quad", 8193},
|
||||
/* {"quot", 34}, */
|
||||
{"quot", 34},
|
||||
{"rArr", 8658},
|
||||
{"radic", 8730},
|
||||
{"rang", 9002},
|
||||
|
@ -416,10 +414,8 @@ html_in(int fd, long *x, struct convert *out)
|
|||
}
|
||||
buf[i] = 0;
|
||||
if(i > 1){
|
||||
if((c = findbyname(buf+1)) != Runeerror){
|
||||
*r++ = c;
|
||||
continue;
|
||||
}
|
||||
if((c = findbyname(buf+1)) != Runeerror)
|
||||
goto out;
|
||||
if(i > 2 && buf[1] == '#'){
|
||||
if(i > 3 && strchr("xX", buf[2]))
|
||||
c = strtol(buf+3, &p, 16);
|
||||
|
@ -427,8 +423,7 @@ html_in(int fd, long *x, struct convert *out)
|
|||
c = strtol(buf+2, &p, 10);
|
||||
if(*p || c >= NRUNE || c < 0)
|
||||
goto bad;
|
||||
*r++ = c;
|
||||
continue;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
bad:
|
||||
|
@ -442,6 +437,12 @@ html_in(int fd, long *x, struct convert *out)
|
|||
}
|
||||
}
|
||||
continue;
|
||||
out:
|
||||
if(strchr("<>&\"'", c)){
|
||||
s = ';';
|
||||
i = sprint(buf, "&%s", findbyrune(c));
|
||||
goto bad;
|
||||
}
|
||||
}
|
||||
*r++ = c;
|
||||
}
|
||||
|
|
|
@ -41,7 +41,7 @@ void
|
|||
main(int argc, char *argv[])
|
||||
{
|
||||
int pfd[2], pflag = 0;
|
||||
char *arg[4], *s;
|
||||
char *arg[4], *s, *p;
|
||||
|
||||
ARGBEGIN {
|
||||
case 'h':
|
||||
|
@ -59,42 +59,54 @@ main(int argc, char *argv[])
|
|||
if(open(*argv, OREAD) != 1)
|
||||
sysfatal("open: %r");
|
||||
}
|
||||
if((nbuf = read(0, buf, sizeof(buf)-1)) < 0)
|
||||
if((nbuf = readn(0, buf, sizeof(buf)-1)) < 0)
|
||||
sysfatal("read: %r");
|
||||
buf[nbuf] = 0;
|
||||
|
||||
/* useless BOM marker */
|
||||
if(memcmp(buf, "\xEF\xBB\xBF", 3)==0)
|
||||
memmove(buf, buf+3, nbuf-3);
|
||||
|
||||
for(;;){
|
||||
if(s = cistrstr(buf, "encoding="))
|
||||
p = buf;
|
||||
while(nbuf > 0){
|
||||
if(memcmp(p, "\xEF\xBB\xBF", 3)==0){
|
||||
p += 3;
|
||||
cset = "utf";
|
||||
break;
|
||||
}
|
||||
if(memcmp(p, "\xFE\xFF", 2) == 0){
|
||||
p += 2;
|
||||
cset = "unicode-be";
|
||||
break;
|
||||
}
|
||||
if(memcmp(p, "\xFF\xFE", 2) == 0){
|
||||
p += 2;
|
||||
cset = "unicode-le";
|
||||
break;
|
||||
}
|
||||
if(s = cistrstr(p, "encoding="))
|
||||
if(s = strval(s+9)){
|
||||
cset = s;
|
||||
break;
|
||||
}
|
||||
if(s = cistrstr(buf, "charset="))
|
||||
if(s = cistrstr(p, "charset="))
|
||||
if(s = strval(s+8)){
|
||||
cset = s;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
nbuf -= p - buf;
|
||||
|
||||
if(pflag){
|
||||
print("%s\n", cset);
|
||||
exits(0);
|
||||
}
|
||||
|
||||
if(pipe(pfd) < 0)
|
||||
sysfatal("pipe: %r");
|
||||
|
||||
if(nbuf == 0){
|
||||
write(1, buf, 0);
|
||||
write(1, p, 0);
|
||||
exits(0);
|
||||
}
|
||||
|
||||
switch(rfork(RFFDG|RFREND|RFPROC|RFNOWAIT)){
|
||||
if(pipe(pfd) < 0)
|
||||
sysfatal("pipe: %r");
|
||||
|
||||
switch(rfork(RFFDG|RFREND|RFPROC)){
|
||||
case -1:
|
||||
sysfatal("fork: %r");
|
||||
case 0:
|
||||
|
@ -114,10 +126,13 @@ main(int argc, char *argv[])
|
|||
close(pfd[1]);
|
||||
|
||||
while(nbuf > 0){
|
||||
if(write(1, buf, nbuf) != nbuf)
|
||||
if(write(1, p, nbuf) != nbuf)
|
||||
sysfatal("write: %r");
|
||||
if((nbuf = read(0, buf, sizeof(buf))) < 0)
|
||||
p = buf;
|
||||
if((nbuf = read(0, p, sizeof(buf))) < 0)
|
||||
sysfatal("read: %r");
|
||||
}
|
||||
close(1);
|
||||
waitpid();
|
||||
exits(0);
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue