tcs: use character set alias names from abaco, more tolerant html entity support
This commit is contained in:
parent
19070c5ce5
commit
e7df0daa66
6 changed files with 2000 additions and 10 deletions
58
sys/src/cmd/tcs/alias.txt
Normal file
58
sys/src/cmd/tcs/alias.txt
Normal file
|
@ -0,0 +1,58 @@
|
|||
iso_8859-1 8859-1
|
||||
iso_8859-2 8859-2
|
||||
iso_8859-3 8859-3
|
||||
iso_8859-4 8859-4
|
||||
iso_8859-5 8859-5
|
||||
iso_8859-6 8859-6
|
||||
iso_8859-7 8859-7
|
||||
iso_8859-8 8859-8
|
||||
iso_8859-9 8859-9
|
||||
iso_8859-10 8859-10
|
||||
iso_8859-15 8859-15
|
||||
ascii ascii
|
||||
atari
|
||||
av
|
||||
big5 big5
|
||||
ibm437 ibm437
|
||||
ibm720
|
||||
ibm737
|
||||
ibm775 ibm775
|
||||
ibm850 ibm850
|
||||
ibm852 ibm852
|
||||
ibm855 ibm855
|
||||
ibm857 ibm857
|
||||
ibm858
|
||||
ibm862 ibm862
|
||||
ibm866 ibm866
|
||||
ibm874
|
||||
windows-1250 windows-1250
|
||||
windows-1251 windows-1251
|
||||
windows-1252 windows-1252
|
||||
windows-1253 windows-1253
|
||||
windows-1254 windows-1254
|
||||
windows-1255 windows-1255
|
||||
windows-1256 windows-1256
|
||||
windows-1257 windows-1257
|
||||
windows-1258 windows-1258
|
||||
ebcdic
|
||||
korean euc-k
|
||||
euc-kr euc-k
|
||||
gb2312 gb
|
||||
gb_2312-80 gb
|
||||
iso-2022-jp jis-kanji
|
||||
koi8-r koi8
|
||||
macintosh macrom
|
||||
ibm865 msdos2
|
||||
shift_jis ms-kanji
|
||||
next
|
||||
ov
|
||||
se sf1
|
||||
se2 sf2
|
||||
tis-620 tis
|
||||
ucode
|
||||
euc-jp ujis
|
||||
utf16 unicode
|
||||
iso-10646-utf-1 utf1
|
||||
viscii-1 viet1
|
||||
viscii-2 viet2
|
||||
viscii viscii
|
36
sys/src/cmd/tcs/charsets.awk
Normal file
36
sys/src/cmd/tcs/charsets.awk
Normal file
|
@ -0,0 +1,36 @@
|
|||
#!/bin/awk -f
# makes a table of character sets from http://www.iana.org/assignments/character-sets
# and tcs.txt
#
# usage: charsets.awk charsets.txt tcs.txt
#
# charsets.txt is scanned in BEGIN for "Name:" and "Alias:" lines; every
# lower-cased name or alias is recorded against its "Name:" entry.
# Each input line is then treated as "<charset> <tcs name>": if <charset>
# is a known name or alias, one `"alias", "tcs name",` initializer line
# is printed for every alias of that character set.

BEGIN{
	if(ARGC != 3){
		print "Usage: " ARGV[0] " charsets.txt tcs.txt"
		exit 1
	}
	# compare against 0 explicitly: getline returns -1 on a read error,
	# which a bare while(getline<file) treats as true and loops forever
	while((getline < ARGV[1]) > 0){
		if(/^Name:/){
			# a new registry entry: the name is its own alias number 0
			i = 0
			name = tolower($2)
			names[name] = name
			alias[name, i] = name
			nalias[name] = ++i
		}
		if(/^Alias:/){
			a = tolower($2)
			# the registry writes "None" when an entry has no alias
			if(a != "none"){
				names[a] = name
				# name, i (joined with SUBSEP) keeps the key unambiguous;
				# plain concatenation could collide for names ending in digits
				alias[name, i] = a
				nalias[name] = ++i
			}
		}
	}
}

# NOTE(review): awk reads every file operand as main input, so this rule
# sees the lines of ARGV[1] as well as those of ARGV[2].
{
	tcs = $1
	if(tcs in names){
		name = names[tcs]
		for(i=0; i<nalias[name]; i++)
			print "\"" alias[name, i] "\", \"" $2 "\","
	}
}
|
1868
sys/src/cmd/tcs/charsets.txt
Normal file
1868
sys/src/cmd/tcs/charsets.txt
Normal file
File diff suppressed because it is too large
Load diff
|
@ -404,24 +404,25 @@ html_in(int fd, long *x, struct convert *out)
|
|||
c = Bgetc(&b);
|
||||
if(c == Beof)
|
||||
break;
|
||||
buf[i++] = c;
|
||||
if(strchr("; \t\r\n", c))
|
||||
if(strchr(";&</> \t\r\n", c)){
|
||||
if(c != ';')
|
||||
Bungetc(&b);
|
||||
break;
|
||||
}
|
||||
buf[i++] = c;
|
||||
}
|
||||
buf[i] = 0;
|
||||
if(buf[i-1] == ';'){
|
||||
buf[i-1] = 0;
|
||||
if(i > 1){
|
||||
if((c = findbyname(buf+1)) != Runeerror){
|
||||
*r++ = c;
|
||||
continue;
|
||||
}
|
||||
buf[i-1] = ';';
|
||||
if(buf[1] == '#'){
|
||||
if(buf[2] == 'x')
|
||||
if(i > 2 && buf[1] == '#'){
|
||||
if(i > 3 && strchr("xX", buf[2]))
|
||||
c = strtol(buf+3, &p, 16);
|
||||
else
|
||||
c = strtol(buf+2, &p, 10);
|
||||
if(*p != ';' || c >= NRUNE || c < 0)
|
||||
if(*p || c >= NRUNE || c < 0)
|
||||
goto bad;
|
||||
*r++ = c;
|
||||
continue;
|
||||
|
|
|
@ -30,11 +30,18 @@ tcs.$O: conv.h
|
|||
tcs.$O: 8859.h
|
||||
tcs.$O: ms.h
|
||||
tcs.$O: misc.h
|
||||
tcs.$O: alias.h
|
||||
conv%.$O: conv.h
|
||||
conv_ksc.$O: ksc.h
|
||||
|
||||
charsets.txt:
|
||||
hget http://www.iana.org/assignments/character-sets | sed 's/
//' >$target
|
||||
|
||||
alias.h: charsets.awk charsets.txt alias.txt
|
||||
charsets.awk charsets.txt alias.txt >$target
|
||||
|
||||
clean:V:
|
||||
rm -f *.[$OS] [$OS].out y.tab.? y.debug y.output $TARG
|
||||
rm -f *.[$OS] [$OS].out y.tab.? y.debug y.output alias.h $TARG
|
||||
|
||||
nuke:V:
|
||||
rm -f *.[$OS] [$OS].out y.tab.? y.debug y.output $TARG
|
||||
rm -f *.[$OS] [$OS].out y.tab.? y.debug y.output alias.h $TARG
|
||||
|
|
|
@ -154,12 +154,32 @@ list(void)
|
|||
EPR "\n");
|
||||
}
|
||||
|
||||
/*
 * Translate a character set alias into the name used by the
 * conversion table.  The lookup compares with cistrcmp and the
 * first matching entry wins; when nothing matches, the caller's
 * own string is handed back unchanged.
 *
 * Most of the table comes from alias.h, which the mkfile builds
 * with charsets.awk from charsets.txt and alias.txt.
 */
char*
aliasname(char *name)
{
	static struct Alias {
		char *alias;
		char *canon;
	} tab[] = {
#include "alias.h"
		/* not generated by the script */
		"euc_jp", "jis",
		"euc_kr", "euc-k",
		"windows-874", "tis",
	};
	struct Alias *a;

	for(a = tab; a < tab + nelem(tab); a++){
		if(cistrcmp(a->alias, name) != 0)
			continue;
		return a->canon;
	}
	return name;
}
|
||||
|
||||
struct convert *
|
||||
conv(char *name, int from)
|
||||
{
|
||||
struct convert *c;
|
||||
|
||||
name = aliasname(name);
|
||||
for(c = convert; c->name; c++){
|
||||
if(cistrcmp(c->name, name) != 0)
|
||||
continue;
|
||||
|
|
Loading…
Reference in a new issue