tcs: use character set alias names from abaco, more tolerant html entity support

This commit is contained in:
cinap_lenrek 2011-09-20 00:37:06 +02:00
parent 19070c5ce5
commit e7df0daa66
6 changed files with 2000 additions and 10 deletions

58
sys/src/cmd/tcs/alias.txt Normal file
View file

@ -0,0 +1,58 @@
iso_8859-1 8859-1
iso_8859-2 8859-2
iso_8859-3 8859-3
iso_8859-4 8859-4
iso_8859-5 8859-5
iso_8859-6 8859-6
iso_8859-7 8859-7
iso_8859-8 8859-8
iso_8859-9 8859-9
iso_8859-10 8859-10
iso_8859-15 8859-15
ascii ascii
atari
av
big5 big5
ibm437 ibm437
ibm720
ibm737
ibm775 ibm775
ibm850 ibm850
ibm852 ibm852
ibm855 ibm855
ibm857 ibm857
ibm858
ibm862 ibm862
ibm866 ibm866
ibm874
windows-1250 windows-1250
windows-1251 windows-1251
windows-1252 windows-1252
windows-1253 windows-1253
windows-1254 windows-1254
windows-1255 windows-1255
windows-1256 windows-1256
windows-1257 windows-1257
windows-1258 windows-1258
ebcdic
korean euc-k
euc-kr euc-k
gb2312 gb
gb_2312-80 gb
iso-2022-jp jis-kanji
koi8-r koi8
macintosh macrom
ibm865 msdos2
shift_jis ms-kanji
next
ov
se sf1
se2 sf2
tis-620 tis
ucode
euc-jp ujis
utf16 unicode
iso-10646-utf-1 utf1
viscii-1 viet1
viscii-2 viet2
viscii viscii

View file

@ -0,0 +1,36 @@
#!/bin/awk -f
# makes a table of character set aliases from
# http://www.iana.org/assignments/character-sets and tcs.txt
#
# usage: charsets.awk charsets.txt tcs.txt
#
# charsets.txt is scanned for "Name: x" lines, each optionally followed
# by "Alias: y" lines.  For every tcs.txt line whose first field is a
# known (lower-cased) name or alias, one line of the form
#	"alias", "second-field",
# is printed for the primary name and for each of its aliases.
BEGIN{
	if(ARGC != 3){
		print "Usage: " ARGV[0] " charsets.txt tcs.txt"
		exit 1
	}
	# getline yields -1 on a read/open error; testing "> 0" keeps an
	# unreadable charsets.txt from turning this into an endless loop.
	while((getline < ARGV[1]) > 0){
		if(/^Name:/){
			# start of a new record: the primary name is alias number 0
			i = 0
			name = tolower($2)
			names[name] = name
			alias[name, i] = name
			nalias[name] = ++i
		}
		if(/^Alias:/){
			a = tolower($2)
			# the registry writes "None" for sets without aliases
			if(a != "none"){
				names[a] = name
				alias[name, i] = a
				nalias[name] = ++i
			}
		}
	}
	close(ARGV[1])
	# charsets.txt has been consumed above; blank its ARGV slot so the
	# main rule below only runs over tcs.txt.
	ARGV[1] = ""
}
{
	tcs = $1
	if(tcs in names){
		name = names[tcs]
		for(i = 0; i < nalias[name]; i++)
			print "\"" alias[name, i] "\", \"" $2 "\","
	}
}

1868
sys/src/cmd/tcs/charsets.txt Normal file

File diff suppressed because it is too large Load diff

View file

@ -404,24 +404,25 @@ html_in(int fd, long *x, struct convert *out)
c = Bgetc(&b);
if(c == Beof)
break;
buf[i++] = c;
if(strchr("; \t\r\n", c))
if(strchr(";&</> \t\r\n", c)){
if(c != ';')
Bungetc(&b);
break;
}
buf[i++] = c;
}
buf[i] = 0;
if(buf[i-1] == ';'){
buf[i-1] = 0;
if(i > 1){
if((c = findbyname(buf+1)) != Runeerror){
*r++ = c;
continue;
}
buf[i-1] = ';';
if(buf[1] == '#'){
if(buf[2] == 'x')
if(i > 2 && buf[1] == '#'){
if(i > 3 && strchr("xX", buf[2]))
c = strtol(buf+3, &p, 16);
else
c = strtol(buf+2, &p, 10);
if(*p != ';' || c >= NRUNE || c < 0)
if(*p || c >= NRUNE || c < 0)
goto bad;
*r++ = c;
continue;

View file

@ -30,11 +30,18 @@ tcs.$O: conv.h
tcs.$O: 8859.h
tcs.$O: ms.h
tcs.$O: misc.h
tcs.$O: alias.h
conv%.$O: conv.h
conv_ksc.$O: ksc.h
charsets.txt:
hget http://www.iana.org/assignments/character-sets | sed 's/ //' >$target
alias.h: charsets.awk charsets.txt alias.txt
charsets.awk charsets.txt alias.txt >$target
clean:V:
rm -f *.[$OS] [$OS].out y.tab.? y.debug y.output $TARG
rm -f *.[$OS] [$OS].out y.tab.? y.debug y.output alias.h $TARG
nuke:V:
rm -f *.[$OS] [$OS].out y.tab.? y.debug y.output $TARG
rm -f *.[$OS] [$OS].out y.tab.? y.debug y.output alias.h $TARG

View file

@ -154,12 +154,32 @@ list(void)
EPR "\n");
}
/*
 * Translate a character set alias into the name it stands for.
 * The lookup table consists of the entries pulled in from alias.h
 * followed by a few hand-written ones; the comparison ignores case.
 * When no entry matches, the argument itself is returned.
 */
char*
aliasname(char *name)
{
	static struct {
		char *from;
		char *to;
	} map[] = {
#include "alias.h"
		/* not generated by the script */
		"euc_jp", "jis",
		"euc_kr", "euc-k",
		"windows-874", "tis",
	};
	int k;

	k = 0;
	while(k < nelem(map)){
		/* first matching entry wins */
		if(cistrcmp(map[k].from, name) == 0)
			return map[k].to;
		k++;
	}
	return name;
}
struct convert *
conv(char *name, int from)
{
struct convert *c;
name = aliasname(name);
for(c = convert; c->name; c++){
if(cistrcmp(c->name, name) != 0)
continue;