abaco: use uhtml to handle charset conversions
This commit is contained in:
parent
0ca4c2ea45
commit
1af7323238
4 changed files with 9 additions and 2032 deletions
|
@ -1,36 +0,0 @@
|
|||
#!/bin/awk -f
|
||||
# makes a table of character sets from http://www.iana.org/assignments/character-sets
|
||||
# and tcs.txt
|
||||
|
||||
BEGIN{
|
||||
if(ARGC != 3){
|
||||
print "Usage: " ARGV[0] " charsets.txt tcs.txt"
|
||||
exit 1
|
||||
}
|
||||
while(getline<ARGV[1]){
|
||||
if(/^Name:/){
|
||||
i = 0
|
||||
name=tolower($2)
|
||||
names[name] = name
|
||||
alias[name i] = name
|
||||
nalias[name] = ++i
|
||||
|
||||
}
|
||||
if(/^Alias:/){
|
||||
a = tolower($2)
|
||||
if(a != "none"){
|
||||
names[a] = name
|
||||
alias[name i ] = a
|
||||
nalias[name] = ++i
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
tcs = $1
|
||||
if(tcs in names){
|
||||
name = names[tcs]
|
||||
for(i=0; i<nalias[name]; i++)
|
||||
print "\"" alias[name i] "\", \"" $2 "\","
|
||||
}
|
||||
}
|
File diff suppressed because it is too large
Load diff
|
@ -22,7 +22,6 @@ OFILES=\
|
|||
HFILES=\
|
||||
dat.h\
|
||||
fns.h\
|
||||
tcs.h\
|
||||
|
||||
UPDATE=\
|
||||
mkfile\
|
||||
|
@ -31,12 +30,6 @@ UPDATE=\
|
|||
|
||||
</sys/src/cmd/mkone
|
||||
|
||||
charsets.txt:
|
||||
hget http://www.iana.org/assignments/character-sets | sed 's/
//' > charsets.txt
|
||||
|
||||
tcs.h: charsets.awk charsets.txt tcs.txt
|
||||
charsets.awk charsets.txt tcs.txt > tcs.h
|
||||
|
||||
syms:V:
|
||||
8c -a $CFLAGS main.c > syms
|
||||
8c -aa ????.c >> syms
|
||||
|
|
|
@ -715,88 +715,22 @@ writeproc(void *v)
|
|||
sendul(sync, i);
|
||||
}
|
||||
|
||||
struct {
|
||||
char *mime;
|
||||
char *tcs;
|
||||
}tcstab[] = {
|
||||
|
||||
#include "tcs.h"
|
||||
|
||||
/* not generated by the script */
|
||||
"euc_jp", "jis",
|
||||
"euc_kr", "euc-k",
|
||||
"windows-874", "tis",
|
||||
nil, nil,
|
||||
};
|
||||
|
||||
enum {
|
||||
Winstart = 127,
|
||||
Winend = 159
|
||||
};
|
||||
|
||||
static int winchars[] = {
|
||||
8226, /* 8226 is a bullet */
|
||||
8226, 8226, 8218, 402, 8222, 8230, 8224, 8225,
|
||||
710, 8240, 352, 8249, 338, 8226, 8226, 8226,
|
||||
8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,
|
||||
732, 8482, 353, 8250, 339, 8226, 8226, 376
|
||||
};
|
||||
|
||||
char *
|
||||
tcs(char *cs, char *s, long *np)
|
||||
uhtml(char *cs, char *s, long *np)
|
||||
{
|
||||
Channel *sync;
|
||||
Exec *e;
|
||||
Rune r;
|
||||
long i, n;
|
||||
void **a;
|
||||
uchar *us;
|
||||
char buf[BUFSIZE], cmd[50];
|
||||
char *t, *u;
|
||||
char *t;
|
||||
int p[2], q[2];
|
||||
|
||||
|
||||
if(s==nil || *s=='\0' || *np==0){
|
||||
werrstr("tcs failed: no data");
|
||||
werrstr("uhtml failed: no data");
|
||||
return s;
|
||||
}
|
||||
|
||||
if(cs == nil){
|
||||
werrstr("tcs failed: no charset");
|
||||
return s;
|
||||
}
|
||||
|
||||
if(cistrncmp(cs, "utf-8", 5)==0 || cistrncmp(cs, "utf8", 4)==0)
|
||||
return s;
|
||||
|
||||
for(i=0; tcstab[i].mime!=nil; i++)
|
||||
if(cistrncmp(cs, tcstab[i].mime, strlen(tcstab[i].mime)) == 0)
|
||||
break;
|
||||
|
||||
if(tcstab[i].mime == nil){
|
||||
fprint(2, "abaco: charset: %s not supported\n", cs);
|
||||
goto latin1;
|
||||
}
|
||||
if(cistrcmp(tcstab[i].tcs, "8859-1")==0 || cistrcmp(tcstab[i].tcs, "ascii")==0){
|
||||
latin1:
|
||||
n = 0;
|
||||
for(us=(uchar*)s; *us; us++)
|
||||
n += runelen(*us);
|
||||
n++;
|
||||
t = emalloc(n);
|
||||
for(us=(uchar*)s, u=t; *us; us++){
|
||||
if(*us>=Winstart && *us<=Winend)
|
||||
*u++ = winchars[*us-Winstart];
|
||||
else{
|
||||
r = *us;
|
||||
u += runetochar(u, &r);
|
||||
}
|
||||
}
|
||||
*u = 0;
|
||||
free(s);
|
||||
return t;
|
||||
}
|
||||
|
||||
if(pipe(p)<0 || pipe(q)<0)
|
||||
error("can't create pipe");
|
||||
|
||||
|
@ -804,7 +738,7 @@ latin1:
|
|||
if(sync == nil)
|
||||
error("can't create channel");
|
||||
|
||||
snprint(cmd, sizeof cmd, "tcs -f %s", tcstab[i].tcs);
|
||||
/* fall back to plain uhtml (no -c hint) when no charset is known */
snprint(cmd, sizeof cmd, (cs != nil && *cs != '\0') ? "uhtml -c %s" : "uhtml", cs);
|
||||
e = emalloc(sizeof(Exec));
|
||||
e->p[0] = p[0];
|
||||
e->p[1] = p[1];
|
||||
|
@ -818,7 +752,7 @@ latin1:
|
|||
close(p[0]);
|
||||
close(q[1]);
|
||||
|
||||
/* in case tcs fails */
|
||||
/* in case uhtml fails */
|
||||
t = s;
|
||||
sync = chancreate(sizeof(ulong), 0);
|
||||
if(sync == nil)
|
||||
|
@ -831,6 +765,7 @@ latin1:
|
|||
a[3] = (void *)*np;
|
||||
proccreate(writeproc, a, STACK);
|
||||
|
||||
i = 0;
|
||||
s = nil;
|
||||
while((n = read(q[0], buf, sizeof(buf))) > 0){
|
||||
s = erealloc(s, i+n+1);
|
||||
|
@ -840,14 +775,14 @@ latin1:
|
|||
}
|
||||
n = recvul(sync);
|
||||
if(n != *np)
|
||||
fprint(2, "tcs: did not write %ld; wrote %uld\n", *np, n);
|
||||
fprint(2, "uhtml failed: did not write %ld; wrote %uld\n", *np, n);
|
||||
|
||||
*np = i;
|
||||
chanfree(sync);
|
||||
close(q[0]);
|
||||
|
||||
if(s == nil){
|
||||
fprint(2, "tcs failed: can't convert charset=%s to %s\n", cs, tcstab[i].tcs);
|
||||
fprint(2, "uhtml failed: can't convert charset=%s\n", cs);
|
||||
return t;
|
||||
}
|
||||
free(t);
|
||||
|
@ -901,46 +836,6 @@ findctype(char *b, int l, char *keyword, char *s)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static
|
||||
int
|
||||
finddocctype(char *b, int l, char *s)
|
||||
{
|
||||
char *p, *e;
|
||||
|
||||
p = cistrstr(s, "<meta");
|
||||
if(!p)
|
||||
return -1;
|
||||
p += 5;
|
||||
e = strchr(s, '>');
|
||||
if(!e)
|
||||
return -1;
|
||||
snprint(b, l, "%.*s", (int)(e-p), p);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static
|
||||
int
|
||||
findxmltype(char *b, int l, char *s)
|
||||
{
|
||||
char *p, *e;
|
||||
|
||||
p = cistrstr(s, "<?xml ");
|
||||
if(!p)
|
||||
return -1;
|
||||
|
||||
p += 6;
|
||||
e = strstr(p, "?>");
|
||||
if(!e)
|
||||
return -1;
|
||||
snprint(b, l, "%.*s", (int)(e-p), p);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* servers can lie about the charset,
|
||||
* so we use the charset based on the priority.
|
||||
*/
|
||||
char *
|
||||
convert(Runestr ctype, char *s, long *np)
|
||||
{
|
||||
|
@ -951,14 +846,7 @@ convert(Runestr ctype, char *s, long *np)
|
|||
snprint(buf, sizeof(buf), "%.*S", ctype.nr, ctype.r);
|
||||
findctype(t, sizeof(t), "charset", buf);
|
||||
}
|
||||
if(findxmltype(buf, sizeof(buf), s)==0)
|
||||
findctype(t, sizeof(t), "encoding", buf);
|
||||
if(finddocctype(buf, sizeof(buf), s) == 0)
|
||||
findctype(t, sizeof(t), "charset", buf);
|
||||
|
||||
if(*t == '\0')
|
||||
strcpy(t, charset);
|
||||
return tcs(t, s, np);
|
||||
return uhtml(t, s, np);
|
||||
}
|
||||
|
||||
int
|
||||
|
|
Loading…
Reference in a new issue