htmlfmt: use uhtml for character set conversion

This commit is contained in:
cinap_lenrek 2014-05-12 02:38:53 +02:00
parent 66f76c2821
commit 679b092ee0
3 changed files with 34 additions and 38 deletions

View file

@ -28,12 +28,10 @@ struct URLwin
extern char* url; extern char* url;
extern int aflag; extern int aflag;
extern int width; extern int width;
extern int defcharset;
extern char* loadhtml(int); extern char* loadhtml(int);
extern char* readfile(char*, char*, int*); extern char* readfile(char*, char*, int*);
extern int charset(char*);
extern void* emalloc(ulong); extern void* emalloc(ulong);
extern char* estrdup(char*); extern char* estrdup(char*);
extern char* estrstrdup(char*, char*); extern char* estrstrdup(char*, char*);

View file

@ -285,40 +285,13 @@ rerender(URLwin *u)
free(t); free(t);
} }
/*
 * Somewhat of a hack.  Not a full parse: just looks for a "<meta" tag and a
 * "charset=" string in the document text (the original note here says
 * cistrstr only looks at the beginning of the document).
 *
 * s is the NUL-terminated document text.
 * Returns UTF_8 when the value following "charset=" starts with "utf-8" or
 * "utf8" (case-insensitive, optionally quoted); otherwise returns defcharset,
 * which is set to ISO_8859_1 here if it was still 0.
 */
int
charset(char *s)
{
	char *meta, *charset;

	if(defcharset == 0)
		defcharset = ISO_8859_1;
	meta = cistrstr(s, "<meta");
	if(meta == nil)
		return defcharset;
	charset = cistrstr(s, "charset=");
	if(charset == nil)
		return defcharset;
	charset += 8;	/* skip over "charset=" */
	if(*charset == '"' || *charset == '\'')
		charset++;
	/*
	 * cistrncmp returns 0 on a match; testing the raw return value
	 * would report UTF_8 for every charset that is NOT utf-8.
	 */
	if(cistrncmp(charset, "utf-8", 5) == 0 || cistrncmp(charset, "utf8", 4) == 0)
		return UTF_8;
	return defcharset;
}
void void
rendertext(URLwin *u, Bytes *b) rendertext(URLwin *u, Bytes *b)
{ {
Rune *rurl; Rune *rurl;
rurl = toStr((uchar*)u->url, strlen(u->url), ISO_8859_1); rurl = toStr((uchar*)u->url, strlen(u->url), UTF_8);
u->items = parsehtml(b->b, b->n, rurl, u->type, charset((char*)b->b), &u->docinfo); u->items = parsehtml(b->b, b->n, rurl, u->type, UTF_8, &u->docinfo);
// free(rurl); // free(rurl);
rerender(u); rerender(u);

View file

@ -8,7 +8,34 @@
char *url = ""; char *url = "";
int aflag; int aflag;
int width = 70; int width = 70;
int defcharset; char *defcharset = "latin1";
/*
 * Interpose a character set converter on fd.
 *
 * Creates a pipe and forks.  The child runs /bin/uhtml -c defcharset with
 * fd as its standard input and the write end of the pipe as its standard
 * output; if that exec fails it falls back to /bin/cat so the data still
 * flows through unconverted.  In the parent, the read end of the pipe is
 * duplicated onto fd, so the caller keeps using the same descriptor number.
 *
 * Returns fd in all cases.  If pipe or fork fails, fd is left referring to
 * the original file.
 */
int
uhtml(int fd)
{
	int pfd[2], pid;

	if(pipe(pfd) < 0)
		return fd;

	pid = fork();
	if(pid == 0){
		/* child: read from fd, write converted text into the pipe */
		dup(fd, 0);
		dup(pfd[1], 1);
		close(pfd[1]);
		close(pfd[0]);
		execl("/bin/uhtml", "uhtml", "-c", defcharset, nil);
		execl("/bin/cat", "cat", nil);
		exits("exec");
	}
	if(pid > 0){
		/* parent: fd now reads the child's output */
		dup(pfd[0], fd);
	}

	/* the pipe's own descriptors are no longer needed here */
	close(pfd[0]);
	close(pfd[1]);
	return fd;
}
void void
usage(void) usage(void)
@ -21,7 +48,7 @@ void
main(int argc, char *argv[]) main(int argc, char *argv[])
{ {
int i, fd; int i, fd;
char *p, *err, *file; char *err, *file;
char errbuf[ERRMAX]; char errbuf[ERRMAX];
ARGBEGIN{ ARGBEGIN{
@ -29,9 +56,7 @@ main(int argc, char *argv[])
aflag++; aflag++;
break; break;
case 'c': case 'c':
p = smprint("<meta charset=\"%s\">", EARGF(usage())); defcharset = EARGF(usage());
defcharset = charset(p);
free(p);
break; break;
case 'l': case 'w': case 'l': case 'w':
err = EARGF(usage()); err = EARGF(usage());
@ -50,7 +75,7 @@ main(int argc, char *argv[])
err = nil; err = nil;
file = "<stdin>"; file = "<stdin>";
if(argc == 0) if(argc == 0)
err = loadhtml(0); err = loadhtml(uhtml(0));
else else
for(i=0; err==nil && i<argc; i++){ for(i=0; err==nil && i<argc; i++){
file = argv[i]; file = argv[i];
@ -60,7 +85,7 @@ main(int argc, char *argv[])
err = errbuf; err = errbuf;
break; break;
} }
err = loadhtml(fd); err = loadhtml(uhtml(fd));
close(fd); close(fd);
if(err) if(err)
break; break;