abaco: use uhtml to handle charset conversions

This commit is contained in:
cinap_lenrek 2013-07-14 16:31:59 +02:00
parent 0ca4c2ea45
commit 1af7323238
4 changed files with 9 additions and 2032 deletions

View file

@ -1,36 +0,0 @@
#!/bin/awk -f
# makes a table of character sets from http://www.iana.org/assignments/character-sets
# and tcs.txt
#
# usage: charsets.awk charsets.txt tcs.txt
#
# charsets.txt is read in BEGIN: every "Name:" record starts a new charset and
# every "Alias:" record (other than "none") adds another spelling for it.
# tcs.txt is then read by the main rule: when its first field is a known
# name or alias, one line  "spelling", "second-field",  is printed for every
# spelling recorded for that charset.
BEGIN{
	if(ARGC != 3){
		print "Usage: " ARGV[0] " charsets.txt tcs.txt"
		exit 1
	}
	# getline returns -1 on error; compare with > 0 so an unreadable
	# file ends the loop instead of spinning forever
	while((getline < ARGV[1]) > 0){
		if(/^Name:/){
			i = 0
			name = tolower($2)
			names[name] = name
			# use a real two-part subscript: plain concatenation
			# (name i) is ambiguous for names that end in digits
			alias[name, i] = name
			nalias[name] = ++i
		}
		if(/^Alias:/){
			a = tolower($2)
			if(a != "none"){
				names[a] = name
				alias[name, i] = a
				nalias[name] = ++i
			}
		}
	}
	close(ARGV[1])
	# charsets.txt has been consumed above; blank its argument so the
	# main rule only processes tcs.txt
	ARGV[1] = ""
}
{
	tcs = $1
	if(tcs in names){
		name = names[tcs]
		for(i=0; i<nalias[name]; i++)
			print "\"" alias[name, i] "\", \"" $2 "\","
	}
}

File diff suppressed because it is too large Load diff

View file

@ -22,7 +22,6 @@ OFILES=\
HFILES=\
dat.h\
fns.h\
tcs.h\
UPDATE=\
mkfile\
@ -31,12 +30,6 @@ UPDATE=\
</sys/src/cmd/mkone
charsets.txt:
hget http://www.iana.org/assignments/character-sets | sed 's/ //' > charsets.txt
tcs.h: charsets.awk charsets.txt tcs.txt
charsets.awk charsets.txt tcs.txt > tcs.h
syms:V:
8c -a $CFLAGS main.c > syms
8c -aa ????.c >> syms

View file

@ -715,88 +715,22 @@ writeproc(void *v)
sendul(sync, i);
}
/*
 * Charset name table: each entry pairs a charset name as it is matched
 * against the requested charset (mime) with the name handed to
 * "tcs -f" (tcs).  The bulk of the table comes from the generated
 * tcs.h; a few entries are added by hand below.  The table ends with
 * a nil, nil sentinel, which is what lookups test for.
 */
struct {
char *mime;
char *tcs;
}tcstab[] = {
#include "tcs.h"
/* not generated by the script */
"euc_jp", "jis",
"euc_kr", "euc-k",
"windows-874", "tis",
nil, nil,
};
/*
 * Inclusive byte range that is translated through winchars[]:
 * a byte b with Winstart <= b <= Winend maps to winchars[b-Winstart].
 */
enum {
Winstart = 127,
Winend = 159
};
/*
 * Replacement values for the bytes Winstart..Winend, one entry per
 * byte (33 entries).  8226 doubles as the filler for bytes that have
 * no better replacement.
 * NOTE(review): the values look like Unicode code points for the
 * Windows-1252 extension range -- confirm against the charset table.
 * NOTE(review): the visible lookup stores the entry through a char
 * pointer (*u++ = winchars[...]), which cannot hold values such as
 * 8226 -- confirm whether a runetochar conversion was intended.
 */
static int winchars[] = {
8226, /* 8226 is a bullet */
8226, 8226, 8218, 402, 8222, 8230, 8224, 8225,
710, 8240, 352, 8249, 338, 8226, 8226, 8226,
8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,
732, 8482, 353, 8250, 339, 8226, 8226, 376
};
char *
tcs(char *cs, char *s, long *np)
uhtml(char *cs, char *s, long *np)
{
Channel *sync;
Exec *e;
Rune r;
long i, n;
void **a;
uchar *us;
char buf[BUFSIZE], cmd[50];
char *t, *u;
char *t;
int p[2], q[2];
if(s==nil || *s=='\0' || *np==0){
werrstr("tcs failed: no data");
werrstr("uhtml failed: no data");
return s;
}
if(cs == nil){
werrstr("tcs failed: no charset");
return s;
}
if(cistrncmp(cs, "utf-8", 5)==0 || cistrncmp(cs, "utf8", 4)==0)
return s;
for(i=0; tcstab[i].mime!=nil; i++)
if(cistrncmp(cs, tcstab[i].mime, strlen(tcstab[i].mime)) == 0)
break;
if(tcstab[i].mime == nil){
fprint(2, "abaco: charset: %s not supported\n", cs);
goto latin1;
}
if(cistrcmp(tcstab[i].tcs, "8859-1")==0 || cistrcmp(tcstab[i].tcs, "ascii")==0){
latin1:
n = 0;
for(us=(uchar*)s; *us; us++)
n += runelen(*us);
n++;
t = emalloc(n);
for(us=(uchar*)s, u=t; *us; us++){
if(*us>=Winstart && *us<=Winend)
*u++ = winchars[*us-Winstart];
else{
r = *us;
u += runetochar(u, &r);
}
}
*u = 0;
free(s);
return t;
}
if(pipe(p)<0 || pipe(q)<0)
error("can't create pipe");
@ -804,7 +738,7 @@ latin1:
if(sync == nil)
error("can't create channel");
snprint(cmd, sizeof cmd, "tcs -f %s", tcstab[i].tcs);
/* add -c only when a charset was given; otherwise run plain uhtml (was misspelled "uthml") */
snprint(cmd, sizeof cmd, (cs != nil && *cs != '\0') ? "uhtml -c %s" : "uhtml", cs);
e = emalloc(sizeof(Exec));
e->p[0] = p[0];
e->p[1] = p[1];
@ -818,7 +752,7 @@ latin1:
close(p[0]);
close(q[1]);
/* in case tcs fails */
/* in case uhtml fails */
t = s;
sync = chancreate(sizeof(ulong), 0);
if(sync == nil)
@ -831,6 +765,7 @@ latin1:
a[3] = (void *)*np;
proccreate(writeproc, a, STACK);
i = 0;
s = nil;
while((n = read(q[0], buf, sizeof(buf))) > 0){
s = erealloc(s, i+n+1);
@ -840,14 +775,14 @@ latin1:
}
n = recvul(sync);
if(n != *np)
fprint(2, "tcs: did not write %ld; wrote %uld\n", *np, n);
fprint(2, "uhtml failed: did not write %ld; wrote %uld\n", *np, n);
*np = i;
chanfree(sync);
close(q[0]);
if(s == nil){
fprint(2, "tcs failed: can't convert charset=%s to %s\n", cs, tcstab[i].tcs);
fprint(2, "uhtml failed: can't convert charset=%s\n", cs);
return t;
}
free(t);
@ -901,46 +836,6 @@ findctype(char *b, int l, char *keyword, char *s)
return 0;
}
/*
 * finddocctype locates the first "<meta" (found with cistrstr) in s and
 * formats the text between it and the next '>' into b, whose size is l.
 * Returns 0 on success, -1 when there is no "<meta" or no '>' after it.
 */
static
int
finddocctype(char *b, int l, char *s)
{
	char *p, *e;

	p = cistrstr(s, "<meta");
	if(!p)
		return -1;
	p += 5;
	/*
	 * look for the closing '>' after the tag, not from the start of s:
	 * an earlier '>' (e.g. from <html>) would leave e before p and
	 * make the length passed to %.*s negative
	 */
	e = strchr(p, '>');
	if(!e)
		return -1;
	snprint(b, l, "%.*s", (int)(e-p), p);
	return 0;
}
static
int
findxmltype(char *b, int l, char *s)
{
char *p, *e;
p = cistrstr(s, "<?xml ");
if(!p)
return -1;
p += 6;
e = strstr(p, "?>");
if(!e)
return -1;
snprint(b, l, "%.*s", (int)(e-p), p);
return 0;
}
/*
* servers can lie about the charset,
* so we use the charset based on the priority.
*/
char *
convert(Runestr ctype, char *s, long *np)
{
@ -951,14 +846,7 @@ convert(Runestr ctype, char *s, long *np)
snprint(buf, sizeof(buf), "%.*S", ctype.nr, ctype.r);
findctype(t, sizeof(t), "charset", buf);
}
if(findxmltype(buf, sizeof(buf), s)==0)
findctype(t, sizeof(t), "encoding", buf);
if(finddocctype(buf, sizeof(buf), s) == 0)
findctype(t, sizeof(t), "charset", buf);
if(*t == '\0')
strcpy(t, charset);
return tcs(t, s, np);
return uhtml(t, s, np);
}
int