1503 lines
28 KiB
C
1503 lines
28 KiB
C
#include <u.h>
|
|
#include <libc.h>
|
|
#include <draw.h>
|
|
#include <ctype.h>
|
|
#include <html.h>
|
|
#include "impl.h"
|
|
|
|
typedef struct TokenSource TokenSource;
|
|
struct TokenSource
|
|
{
|
|
int i; // index of next byte to use
|
|
uchar* data; // all the data
|
|
int edata; // data[0:edata] is valid
|
|
int chset; // one of US_Ascii, etc.
|
|
int mtype; // TextHtml or TextPlain
|
|
};
|
|
|
|
// Negative sentinel values (no character is negative).
// NOTE(review): presumably "end of buffer" and "end of file"; the code
// visible in this file tests for c < 0 or -1 rather than using these names.
enum {
	EOB = -1,
	EOF = -2
};
|
|
|
|
// True if c (a value below 256) is a letter, a digit, '-' or '.',
// i.e. may continue a tag or attribute name.
// c is evaluated more than once, so pass an expression without side effects.
#define ISNAMCHAR(c) ((c) < 256 && (isalnum(c) || (c) == '-' || (c) == '.'))

// Element counts of the stack buffers used while lexing.
#define SMALLBUFSIZE 240
#define BIGBUFSIZE 2000
|
|
|
|
// HTML 4.0 tag names.
|
|
// Keep sorted, and in correspondence with enum in iparse.h.
|
|
Rune* tagnames[] = {
|
|
L" ",
|
|
L"!",
|
|
L"a",
|
|
L"abbr",
|
|
L"acronym",
|
|
L"address",
|
|
L"applet",
|
|
L"area",
|
|
L"b",
|
|
L"base",
|
|
L"basefont",
|
|
L"bdo",
|
|
L"big",
|
|
L"blink",
|
|
L"blockquote",
|
|
L"body",
|
|
L"bq",
|
|
L"br",
|
|
L"button",
|
|
L"caption",
|
|
L"center",
|
|
L"cite",
|
|
L"code",
|
|
L"col",
|
|
L"colgroup",
|
|
L"dd",
|
|
L"del",
|
|
L"dfn",
|
|
L"dir",
|
|
L"div",
|
|
L"dl",
|
|
L"dt",
|
|
L"em",
|
|
L"fieldset",
|
|
L"font",
|
|
L"form",
|
|
L"frame",
|
|
L"frameset",
|
|
L"h1",
|
|
L"h2",
|
|
L"h3",
|
|
L"h4",
|
|
L"h5",
|
|
L"h6",
|
|
L"head",
|
|
L"hr",
|
|
L"html",
|
|
L"i",
|
|
L"iframe",
|
|
L"img",
|
|
L"input",
|
|
L"ins",
|
|
L"isindex",
|
|
L"kbd",
|
|
L"label",
|
|
L"legend",
|
|
L"li",
|
|
L"link",
|
|
L"map",
|
|
L"menu",
|
|
L"meta",
|
|
L"nobr",
|
|
L"noframes",
|
|
L"noscript",
|
|
L"object",
|
|
L"ol",
|
|
L"optgroup",
|
|
L"option",
|
|
L"p",
|
|
L"param",
|
|
L"pre",
|
|
L"q",
|
|
L"s",
|
|
L"samp",
|
|
L"script",
|
|
L"select",
|
|
L"small",
|
|
L"span",
|
|
L"strike",
|
|
L"strong",
|
|
L"style",
|
|
L"sub",
|
|
L"sup",
|
|
L"table",
|
|
L"tbody",
|
|
L"td",
|
|
L"textarea",
|
|
L"tfoot",
|
|
L"th",
|
|
L"thead",
|
|
L"title",
|
|
L"tr",
|
|
L"tt",
|
|
L"u",
|
|
L"ul",
|
|
L"var"
|
|
};
|
|
|
|
// HTML 4.0 attribute names.
|
|
// Keep sorted, and in correspondence with enum in impl.h.
|
|
Rune* attrnames[] = {
|
|
L"abbr",
|
|
L"accept-charset",
|
|
L"access-key",
|
|
L"action",
|
|
L"align",
|
|
L"alink",
|
|
L"alt",
|
|
L"archive",
|
|
L"axis",
|
|
L"background",
|
|
L"bgcolor",
|
|
L"border",
|
|
L"cellpadding",
|
|
L"cellspacing",
|
|
L"char",
|
|
L"charoff",
|
|
L"charset",
|
|
L"checked",
|
|
L"cite",
|
|
L"class",
|
|
L"classid",
|
|
L"clear",
|
|
L"code",
|
|
L"codebase",
|
|
L"codetype",
|
|
L"color",
|
|
L"cols",
|
|
L"colspan",
|
|
L"compact",
|
|
L"content",
|
|
L"coords",
|
|
L"data",
|
|
L"datetime",
|
|
L"declare",
|
|
L"defer",
|
|
L"dir",
|
|
L"disabled",
|
|
L"enctype",
|
|
L"face",
|
|
L"for",
|
|
L"frame",
|
|
L"frameborder",
|
|
L"headers",
|
|
L"height",
|
|
L"href",
|
|
L"hreflang",
|
|
L"hspace",
|
|
L"http-equiv",
|
|
L"id",
|
|
L"ismap",
|
|
L"label",
|
|
L"lang",
|
|
L"link",
|
|
L"longdesc",
|
|
L"marginheight",
|
|
L"marginwidth",
|
|
L"maxlength",
|
|
L"media",
|
|
L"method",
|
|
L"multiple",
|
|
L"name",
|
|
L"nohref",
|
|
L"noresize",
|
|
L"noshade",
|
|
L"nowrap",
|
|
L"object",
|
|
L"onblur",
|
|
L"onchange",
|
|
L"onclick",
|
|
L"ondblclick",
|
|
L"onfocus",
|
|
L"onkeypress",
|
|
L"onkeyup",
|
|
L"onload",
|
|
L"onmousedown",
|
|
L"onmousemove",
|
|
L"onmouseout",
|
|
L"onmouseover",
|
|
L"onmouseup",
|
|
L"onreset",
|
|
L"onselect",
|
|
L"onsubmit",
|
|
L"onunload",
|
|
L"profile",
|
|
L"prompt",
|
|
L"readonly",
|
|
L"rel",
|
|
L"rev",
|
|
L"rows",
|
|
L"rowspan",
|
|
L"rules",
|
|
L"scheme",
|
|
L"scope",
|
|
L"scrolling",
|
|
L"selected",
|
|
L"shape",
|
|
L"size",
|
|
L"span",
|
|
L"src",
|
|
L"standby",
|
|
L"start",
|
|
L"style",
|
|
L"summary",
|
|
L"tabindex",
|
|
L"target",
|
|
L"text",
|
|
L"title",
|
|
L"type",
|
|
L"usemap",
|
|
L"valign",
|
|
L"value",
|
|
L"valuetype",
|
|
L"version",
|
|
L"vlink",
|
|
L"vspace",
|
|
L"width"
|
|
};
|
|
|
|
|
|
// Character entity to unicode character number map.
|
|
// Keep sorted by name.
|
|
StringInt chartab[]= {
|
|
{L"AElig", 198},
|
|
{L"Aacute", 193},
|
|
{L"Acirc", 194},
|
|
{L"Agrave", 192},
|
|
{L"Alpha", 913},
|
|
{L"Aring", 197},
|
|
{L"Atilde", 195},
|
|
{L"Auml", 196},
|
|
{L"Beta", 914},
|
|
{L"Ccedil", 199},
|
|
{L"Chi", 935},
|
|
{L"Dagger", 8225},
|
|
{L"Delta", 916},
|
|
{L"ETH", 208},
|
|
{L"Eacute", 201},
|
|
{L"Ecirc", 202},
|
|
{L"Egrave", 200},
|
|
{L"Epsilon", 917},
|
|
{L"Eta", 919},
|
|
{L"Euml", 203},
|
|
{L"Gamma", 915},
|
|
{L"Iacute", 205},
|
|
{L"Icirc", 206},
|
|
{L"Igrave", 204},
|
|
{L"Iota", 921},
|
|
{L"Iuml", 207},
|
|
{L"Kappa", 922},
|
|
{L"Lambda", 923},
|
|
{L"Mu", 924},
|
|
{L"Ntilde", 209},
|
|
{L"Nu", 925},
|
|
{L"OElig", 338},
|
|
{L"Oacute", 211},
|
|
{L"Ocirc", 212},
|
|
{L"Ograve", 210},
|
|
{L"Omega", 937},
|
|
{L"Omicron", 927},
|
|
{L"Oslash", 216},
|
|
{L"Otilde", 213},
|
|
{L"Ouml", 214},
|
|
{L"Phi", 934},
|
|
{L"Pi", 928},
|
|
{L"Prime", 8243},
|
|
{L"Psi", 936},
|
|
{L"Rho", 929},
|
|
{L"Scaron", 352},
|
|
{L"Sigma", 931},
|
|
{L"THORN", 222},
|
|
{L"Tau", 932},
|
|
{L"Theta", 920},
|
|
{L"Uacute", 218},
|
|
{L"Ucirc", 219},
|
|
{L"Ugrave", 217},
|
|
{L"Upsilon", 933},
|
|
{L"Uuml", 220},
|
|
{L"Xi", 926},
|
|
{L"Yacute", 221},
|
|
{L"Yuml", 376},
|
|
{L"Zeta", 918},
|
|
{L"aacute", 225},
|
|
{L"acirc", 226},
|
|
{L"acute", 180},
|
|
{L"aelig", 230},
|
|
{L"agrave", 224},
|
|
{L"alefsym", 8501},
|
|
{L"alpha", 945},
|
|
{L"amp", 38},
|
|
{L"and", 8743},
|
|
{L"ang", 8736},
|
|
{L"aring", 229},
|
|
{L"asymp", 8776},
|
|
{L"atilde", 227},
|
|
{L"auml", 228},
|
|
{L"bdquo", 8222},
|
|
{L"beta", 946},
|
|
{L"brvbar", 166},
|
|
{L"bull", 8226},
|
|
{L"cap", 8745},
|
|
{L"ccedil", 231},
|
|
{L"cdots", 8943},
|
|
{L"cedil", 184},
|
|
{L"cent", 162},
|
|
{L"chi", 967},
|
|
{L"circ", 710},
|
|
{L"clubs", 9827},
|
|
{L"cong", 8773},
|
|
{L"copy", 169},
|
|
{L"crarr", 8629},
|
|
{L"cup", 8746},
|
|
{L"curren", 164},
|
|
{L"dArr", 8659},
|
|
{L"dagger", 8224},
|
|
{L"darr", 8595},
|
|
{L"ddots", 8945},
|
|
{L"deg", 176},
|
|
{L"delta", 948},
|
|
{L"diams", 9830},
|
|
{L"divide", 247},
|
|
{L"eacute", 233},
|
|
{L"ecirc", 234},
|
|
{L"egrave", 232},
|
|
{L"emdash", 8212}, /* non-standard but commonly used */
|
|
{L"empty", 8709},
|
|
{L"emsp", 8195},
|
|
{L"endash", 8211}, /* non-standard but commonly used */
|
|
{L"ensp", 8194},
|
|
{L"epsilon", 949},
|
|
{L"equiv", 8801},
|
|
{L"eta", 951},
|
|
{L"eth", 240},
|
|
{L"euml", 235},
|
|
{L"euro", 8364},
|
|
{L"exist", 8707},
|
|
{L"fnof", 402},
|
|
{L"forall", 8704},
|
|
{L"frac12", 189},
|
|
{L"frac14", 188},
|
|
{L"frac34", 190},
|
|
{L"frasl", 8260},
|
|
{L"gamma", 947},
|
|
{L"ge", 8805},
|
|
{L"gt", 62},
|
|
{L"hArr", 8660},
|
|
{L"harr", 8596},
|
|
{L"hearts", 9829},
|
|
{L"hellip", 8230},
|
|
{L"iacute", 237},
|
|
{L"icirc", 238},
|
|
{L"iexcl", 161},
|
|
{L"igrave", 236},
|
|
{L"image", 8465},
|
|
{L"infin", 8734},
|
|
{L"int", 8747},
|
|
{L"iota", 953},
|
|
{L"iquest", 191},
|
|
{L"isin", 8712},
|
|
{L"iuml", 239},
|
|
{L"kappa", 954},
|
|
{L"lArr", 8656},
|
|
{L"lambda", 955},
|
|
{L"lang", 9001},
|
|
{L"laquo", 171},
|
|
{L"larr", 8592},
|
|
{L"lceil", 8968},
|
|
{L"ldots", 8230},
|
|
{L"ldquo", 8220},
|
|
{L"le", 8804},
|
|
{L"lfloor", 8970},
|
|
{L"lowast", 8727},
|
|
{L"loz", 9674},
|
|
{L"lrm", 8206},
|
|
{L"lsaquo", 8249},
|
|
{L"lsquo", 8216},
|
|
{L"lt", 60},
|
|
{L"macr", 175},
|
|
{L"mdash", 8212},
|
|
{L"micro", 181},
|
|
{L"middot", 183},
|
|
{L"minus", 8722},
|
|
{L"mu", 956},
|
|
{L"nabla", 8711},
|
|
{L"nbsp", 160},
|
|
{L"ndash", 8211},
|
|
{L"ne", 8800},
|
|
{L"ni", 8715},
|
|
{L"not", 172},
|
|
{L"notin", 8713},
|
|
{L"nsub", 8836},
|
|
{L"ntilde", 241},
|
|
{L"nu", 957},
|
|
{L"oacute", 243},
|
|
{L"ocirc", 244},
|
|
{L"oelig", 339},
|
|
{L"ograve", 242},
|
|
{L"oline", 8254},
|
|
{L"omega", 969},
|
|
{L"omicron", 959},
|
|
{L"oplus", 8853},
|
|
{L"or", 8744},
|
|
{L"ordf", 170},
|
|
{L"ordm", 186},
|
|
{L"oslash", 248},
|
|
{L"otilde", 245},
|
|
{L"otimes", 8855},
|
|
{L"ouml", 246},
|
|
{L"para", 182},
|
|
{L"part", 8706},
|
|
{L"permil", 8240},
|
|
{L"perp", 8869},
|
|
{L"phi", 966},
|
|
{L"pi", 960},
|
|
{L"piv", 982},
|
|
{L"plusmn", 177},
|
|
{L"pound", 163},
|
|
{L"prime", 8242},
|
|
{L"prod", 8719},
|
|
{L"prop", 8733},
|
|
{L"psi", 968},
|
|
{L"quad", 8193},
|
|
{L"quot", 34},
|
|
{L"rArr", 8658},
|
|
{L"radic", 8730},
|
|
{L"rang", 9002},
|
|
{L"raquo", 187},
|
|
{L"rarr", 8594},
|
|
{L"rceil", 8969},
|
|
{L"rdquo", 8221},
|
|
{L"real", 8476},
|
|
{L"reg", 174},
|
|
{L"rfloor", 8971},
|
|
{L"rho", 961},
|
|
{L"rlm", 8207},
|
|
{L"rsaquo", 8250},
|
|
{L"rsquo", 8217},
|
|
{L"sbquo", 8218},
|
|
{L"scaron", 353},
|
|
{L"sdot", 8901},
|
|
{L"sect", 167},
|
|
{L"shy", 173},
|
|
{L"sigma", 963},
|
|
{L"sigmaf", 962},
|
|
{L"sim", 8764},
|
|
{L"sp", 8194},
|
|
{L"spades", 9824},
|
|
{L"sub", 8834},
|
|
{L"sube", 8838},
|
|
{L"sum", 8721},
|
|
{L"sup", 8835},
|
|
{L"sup1", 185},
|
|
{L"sup2", 178},
|
|
{L"sup3", 179},
|
|
{L"supe", 8839},
|
|
{L"szlig", 223},
|
|
{L"tau", 964},
|
|
{L"there4", 8756},
|
|
{L"theta", 952},
|
|
{L"thetasym", 977},
|
|
{L"thinsp", 8201},
|
|
{L"thorn", 254},
|
|
{L"tilde", 732},
|
|
{L"times", 215},
|
|
{L"trade", 8482},
|
|
{L"uArr", 8657},
|
|
{L"uacute", 250},
|
|
{L"uarr", 8593},
|
|
{L"ucirc", 251},
|
|
{L"ugrave", 249},
|
|
{L"uml", 168},
|
|
{L"upsih", 978},
|
|
{L"upsilon", 965},
|
|
{L"uuml", 252},
|
|
{L"varepsilon", 8712},
|
|
{L"varphi", 981},
|
|
{L"varpi", 982},
|
|
{L"varrho", 1009},
|
|
{L"vdots", 8942},
|
|
{L"vsigma", 962},
|
|
{L"vtheta", 977},
|
|
{L"weierp", 8472},
|
|
{L"xi", 958},
|
|
{L"yacute", 253},
|
|
{L"yen", 165},
|
|
{L"yuml", 255},
|
|
{L"zeta", 950},
|
|
{L"zwj", 8205},
|
|
{L"zwnj", 8204}
|
|
};
|
|
#define NCHARTAB (sizeof(chartab)/sizeof(chartab[0]))
|
|
|
|
// Byte values Winstart..Winend are the ones Windows interpolates into
// the Latin1 set.  They aren't supposed to appear in HTML, but they do,
// so they are translated through winchars.
enum {
	Winstart = 127,
	Winend = 159
};

// winchars[c - Winstart] is the code point substituted for c.
// 8226 is a bullet.
static int winchars[] = {
	8226,							// 127
	8226, 8226, 8218, 402, 8222, 8230, 8224, 8225,	// 128..135
	710, 8240, 352, 8249, 338, 8226, 8226, 8226,	// 136..143
	8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,	// 144..151
	732, 8482, 353, 8250, 339, 8226, 8226, 376	// 152..159
};
|
|
|
|
static StringInt* tagtable; // initialized from tagnames
|
|
static StringInt* attrtable; // initialized from attrnames
|
|
|
|
static void lexinit(void);
|
|
static int getplaindata(TokenSource* ts, Token* a, int* pai);
|
|
static int getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
|
|
static int getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag);
|
|
static int gettag(TokenSource* ts, int starti, Token* a, int* pai);
|
|
static Rune* buftostr(Rune* s, Rune* buf, int j);
|
|
static int comment(TokenSource* ts);
|
|
static int findstr(TokenSource* ts, Rune* s);
|
|
static int ampersand(TokenSource* ts);
|
|
static int lowerc(int c);
|
|
static int getchar(TokenSource* ts);
|
|
static void ungetchar(TokenSource* ts, int c);
|
|
static void backup(TokenSource* ts, int savei);
|
|
static void freeinsidetoken(Token* t);
|
|
static void freeattrs(Attr* ahead);
|
|
static Attr* newattr(int attid, Rune* value, Attr* link);
|
|
static int Tconv(Fmt* f);
|
|
|
|
int dbglex = 0;
|
|
static int lexinited = 0;
|
|
|
|
static void
|
|
lexinit(void)
|
|
{
|
|
tagtable = _makestrinttab(tagnames, Numtags);
|
|
attrtable = _makestrinttab(attrnames, Numattrs);
|
|
fmtinstall('T', Tconv);
|
|
lexinited = 1;
|
|
}
|
|
|
|
static TokenSource*
|
|
newtokensource(uchar* data, int edata, int chset, int mtype)
|
|
{
|
|
TokenSource* ans;
|
|
|
|
assert(chset == US_Ascii || chset == ISO_8859_1 ||
|
|
chset == UTF_8 || chset == Unicode);
|
|
ans = (TokenSource*)emalloc(sizeof(TokenSource));
|
|
ans->i = 0;
|
|
ans->data = data;
|
|
ans->edata = edata;
|
|
ans->chset = chset;
|
|
ans->mtype = mtype;
|
|
return ans;
|
|
}
|
|
|
|
enum {
|
|
ToksChunk = 500,
|
|
};
|
|
|
|
// Call this to get the tokens.
|
|
// The number of returned tokens is returned in *plen.
|
|
Token*
|
|
_gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
|
|
{
|
|
TokenSource* ts;
|
|
Token* a;
|
|
int alen;
|
|
int ai;
|
|
int starti;
|
|
int c;
|
|
int tag;
|
|
|
|
if(!lexinited)
|
|
lexinit();
|
|
ts = newtokensource(data, datalen, chset, mtype);
|
|
if(dbglex)
|
|
fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
|
|
alen = 0;
|
|
ai = 0;
|
|
a = 0;
|
|
if(ts->mtype == TextHtml) {
|
|
for(;;) {
|
|
if(alen - ai < ToksChunk/32) {
|
|
alen += ToksChunk;
|
|
a = erealloc(a, alen*sizeof *a);
|
|
}
|
|
starti = ts->i;
|
|
c = getchar(ts);
|
|
if(c < 0)
|
|
break;
|
|
if(c == '<') {
|
|
tag = gettag(ts, starti, a, &ai);
|
|
if(tag == Tscript || tag == Tstyle) {
|
|
// special rules for getting Data after....
|
|
starti = ts->i;
|
|
c = getchar(ts);
|
|
tag = getscriptdata(ts, c, starti, a, &ai, tag);
|
|
}
|
|
}
|
|
else
|
|
tag = getdata(ts, c, starti, a, &ai);
|
|
if(tag == -1)
|
|
break;
|
|
else if(dbglex > 1 && tag != Comment)
|
|
fprint(2, "lex: got token %T\n", &a[ai-1]);
|
|
}
|
|
}
|
|
else {
|
|
// plain text (non-html) tokens
|
|
for(;;) {
|
|
if(alen - ai < ToksChunk/32) {
|
|
alen += ToksChunk;
|
|
a = erealloc(a, alen*sizeof *a);
|
|
}
|
|
tag = getplaindata(ts, a, &ai);
|
|
if(tag == -1)
|
|
break;
|
|
if(dbglex > 1)
|
|
fprint(2, "lex: got token %T\n", &a[ai]);
|
|
}
|
|
}
|
|
free(ts);
|
|
if(dbglex)
|
|
fprint(2, "lex: returning %d tokens\n", ai);
|
|
*plen = ai;
|
|
if(ai == 0){
|
|
free(a);
|
|
a = 0;
|
|
}
|
|
return a;
|
|
}
|
|
|
|
// For case where source isn't HTML.
|
|
// Just make data tokens, one per line (or partial line,
|
|
// at end of buffer), ignoring non-whitespace control
|
|
// characters and dumping \r's.
|
|
// If find non-empty token, fill in a[*pai], bump *pai, and return Data.
|
|
// Otherwise return -1;
|
|
static int
|
|
getplaindata(TokenSource* ts, Token* a, int* pai)
|
|
{
|
|
Rune* s;
|
|
int j;
|
|
int starti;
|
|
int c;
|
|
Token* tok;
|
|
Rune buf[BIGBUFSIZE];
|
|
|
|
s = nil;
|
|
j = 0;
|
|
starti = ts->i;
|
|
for(c = getchar(ts); c >= 0; c = getchar(ts)) {
|
|
if(c < ' ') {
|
|
if(isspace(c)) {
|
|
if(c == '\r') {
|
|
// ignore it unless no following '\n',
|
|
// in which case treat it like '\n'
|
|
c = getchar(ts);
|
|
if(c != '\n') {
|
|
if(c >= 0)
|
|
ungetchar(ts, c);
|
|
c = '\n';
|
|
}
|
|
}
|
|
}
|
|
else
|
|
c = 0;
|
|
}
|
|
if(c != 0) {
|
|
buf[j++] = c;
|
|
if(j == nelem(buf)-1) {
|
|
s = buftostr(s, buf, j);
|
|
j = 0;
|
|
}
|
|
}
|
|
if(c == '\n')
|
|
break;
|
|
}
|
|
s = buftostr(s, buf, j);
|
|
if(s == nil)
|
|
return -1;
|
|
tok = &a[(*pai)++];
|
|
tok->tag = Data;
|
|
tok->text = s;
|
|
tok->attr = nil;
|
|
tok->starti = starti;
|
|
return Data;
|
|
}
|
|
|
|
// Return concatenation of s and buf[0:j]
|
|
static Rune*
|
|
buftostr(Rune* s, Rune* buf, int j)
|
|
{
|
|
int i;
|
|
|
|
if(s == nil)
|
|
s = _Strndup(buf, j);
|
|
else {
|
|
i = _Strlen(s);
|
|
s = realloc(s, ( i+j+1)*sizeof *s);
|
|
memcpy(&s[i], buf, j*sizeof *s);
|
|
s[i+j] = 0;
|
|
}
|
|
return s;
|
|
}
|
|
|
|
// Gather data up to next start-of-tag or end-of-buffer.
|
|
// Translate entity references (&).
|
|
// Ignore non-whitespace control characters and get rid of \r's.
|
|
// If find non-empty token, fill in a[*pai], bump *pai, and return Data.
|
|
// Otherwise return -1;
|
|
static int
|
|
getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
|
|
{
|
|
Rune* s;
|
|
int j;
|
|
int c;
|
|
Token* tok;
|
|
Rune buf[SMALLBUFSIZE];
|
|
|
|
s = nil;
|
|
j = 0;
|
|
for(c = firstc; c >= 0; c = getchar(ts)){
|
|
if(c == '&') {
|
|
c = ampersand(ts);
|
|
if(c < 0)
|
|
break;
|
|
}
|
|
else if(c < ' ') {
|
|
if(isspace(c)) {
|
|
if(c == '\r') {
|
|
// ignore it unless no following '\n',
|
|
// in which case treat it like '\n'
|
|
c = getchar(ts);
|
|
if(c != '\n') {
|
|
if(c >= 0)
|
|
ungetchar(ts, c);
|
|
c = '\n';
|
|
}
|
|
}
|
|
}
|
|
else {
|
|
if(warn)
|
|
fprint(2, "warning: non-whitespace control character %d ignored\n", c);
|
|
c = 0;
|
|
}
|
|
}
|
|
else if(c == '<') {
|
|
ungetchar(ts, c);
|
|
break;
|
|
}
|
|
if(c != 0) {
|
|
buf[j++] = c;
|
|
if(j == nelem(buf)-1) {
|
|
s = buftostr(s, buf, j);
|
|
j = 0;
|
|
}
|
|
}
|
|
}
|
|
s = buftostr(s, buf, j);
|
|
if(s == nil)
|
|
return -1;
|
|
tok = &a[(*pai)++];
|
|
tok->tag = Data;
|
|
tok->text = s;
|
|
tok->attr = nil;
|
|
tok->starti = starti;
|
|
return Data;
|
|
}
|
|
|
|
// The rules for lexing scripts are different (ugh).
|
|
// Gather up everything until see an "</" tagnames[tok] ">"
|
|
static int
|
|
getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai, int findtag)
|
|
{
|
|
Rune* s;
|
|
int j;
|
|
int tstarti;
|
|
int savei;
|
|
int c;
|
|
int tag;
|
|
int done;
|
|
Token* tok;
|
|
Rune buf[BIGBUFSIZE];
|
|
|
|
s = nil;
|
|
j = 0;
|
|
tstarti = starti;
|
|
c = firstc;
|
|
done = 0;
|
|
while(c >= 0) {
|
|
if(c == '<') {
|
|
// other browsers ignore stuff to end of line after <!
|
|
savei = ts->i;
|
|
c = getchar(ts);
|
|
if(c == '!') {
|
|
if(comment(ts) == -1)
|
|
break;
|
|
if(c == '\r')
|
|
c = getchar(ts);
|
|
if(c == '\n')
|
|
c = getchar(ts);
|
|
}
|
|
else if(c >= 0) {
|
|
backup(ts, savei);
|
|
tag = gettag(ts, tstarti, a, pai);
|
|
if(tag == -1)
|
|
break;
|
|
if(tag != Comment)
|
|
(*pai)--;
|
|
backup(ts, tstarti);
|
|
if(tag == findtag + RBRA) {
|
|
done = 1;
|
|
break;
|
|
}
|
|
// here tag was not the one we were looking for, so take as regular data
|
|
c = getchar(ts);
|
|
}
|
|
}
|
|
if(c < 0)
|
|
break;
|
|
if(c != 0) {
|
|
buf[j++] = c;
|
|
if(j == nelem(buf)-1) {
|
|
s = buftostr(s, buf, j);
|
|
j = 0;
|
|
}
|
|
}
|
|
tstarti = ts->i;
|
|
c = getchar(ts);
|
|
}
|
|
if(done || ts->i == ts->edata) {
|
|
s = buftostr(s, buf, j);
|
|
tok = &a[(*pai)++];
|
|
tok->tag = Data;
|
|
tok->text = s;
|
|
tok->attr = nil;
|
|
tok->starti = starti;
|
|
return Data;
|
|
}
|
|
free(s);
|
|
backup(ts, starti);
|
|
return -1;
|
|
}
|
|
|
|
// We've just seen a '<'. Gather up stuff to closing '>' (if buffer
|
|
// ends before then, return -1).
|
|
// If it's a tag, look up the name, gather the attributes, and return
|
|
// the appropriate token.
|
|
// Else it's either just plain data or some kind of ignorable stuff:
|
|
// return Data or Comment as appropriate.
|
|
// If it's not a Comment, put it in a[*pai] and bump *pai.
|
|
static int
|
|
gettag(TokenSource* ts, int starti, Token* a, int* pai)
|
|
{
|
|
int rbra;
|
|
int ans;
|
|
Attr* al;
|
|
int nexti;
|
|
int c;
|
|
int ti;
|
|
int afnd;
|
|
int attid;
|
|
int quote;
|
|
Rune* val;
|
|
int nv;
|
|
int i;
|
|
int tag;
|
|
Token* tok;
|
|
Rune buf[BIGBUFSIZE];
|
|
|
|
rbra = 0;
|
|
nexti = ts->i;
|
|
tok = &a[*pai];
|
|
tok->tag = Notfound;
|
|
tok->text = nil;
|
|
tok->attr = nil;
|
|
tok->starti = starti;
|
|
c = getchar(ts);
|
|
if(c == '/') {
|
|
rbra = RBRA;
|
|
c = getchar(ts);
|
|
}
|
|
if(c < 0)
|
|
goto eob_done;
|
|
if(c >= 256 || !isalpha(c)) {
|
|
// not a tag
|
|
if(c == '!') {
|
|
ans = comment(ts);
|
|
if(ans != -1)
|
|
return ans;
|
|
goto eob_done;
|
|
}
|
|
else {
|
|
backup(ts, nexti);
|
|
tok->tag = Data;
|
|
tok->text = _Strdup(L"<");
|
|
(*pai)++;
|
|
return Data;
|
|
}
|
|
}
|
|
// c starts a tagname
|
|
buf[0] = c;
|
|
i = 1;
|
|
while(1) {
|
|
c = getchar(ts);
|
|
if(c < 0)
|
|
goto eob_done;
|
|
if(!ISNAMCHAR(c))
|
|
break;
|
|
// if name is bigger than buf it won't be found anyway...
|
|
if(i < BIGBUFSIZE)
|
|
buf[i++] = c;
|
|
}
|
|
if(_lookup(tagtable, Numtags, buf, i, &tag))
|
|
tok->tag = tag + rbra;
|
|
else
|
|
tok->text = _Strndup(buf, i); // for warning print, in build
|
|
// attribute gathering loop
|
|
al = nil;
|
|
while(1) {
|
|
// look for "ws name" or "ws name ws = ws val" (ws=whitespace)
|
|
// skip whitespace
|
|
attrloop_continue:
|
|
while(c < 256 && isspace(c)) {
|
|
c = getchar(ts);
|
|
if(c < 0)
|
|
goto eob_done;
|
|
}
|
|
if(c == '>')
|
|
goto attrloop_done;
|
|
if(c == '<') {
|
|
if(warn)
|
|
fprint(2, "warning: unclosed tag\n");
|
|
ungetchar(ts, c);
|
|
goto attrloop_done;
|
|
}
|
|
if(c >= 256 || !isalpha(c)) {
|
|
if(warn)
|
|
fprint(2, "warning: expected attribute name\n");
|
|
// skipt to next attribute name
|
|
while(1) {
|
|
c = getchar(ts);
|
|
if(c < 0)
|
|
goto eob_done;
|
|
if(c < 256 && isalpha(c))
|
|
goto attrloop_continue;
|
|
if(c == '<') {
|
|
if(warn)
|
|
fprint(2, "warning: unclosed tag\n");
|
|
ungetchar(ts, 60);
|
|
goto attrloop_done;
|
|
}
|
|
if(c == '>')
|
|
goto attrloop_done;
|
|
}
|
|
}
|
|
// gather attribute name
|
|
buf[0] = c;
|
|
i = 1;
|
|
while(1) {
|
|
c = getchar(ts);
|
|
if(c < 0)
|
|
goto eob_done;
|
|
if(!ISNAMCHAR(c))
|
|
break;
|
|
if(i < BIGBUFSIZE-1)
|
|
buf[i++] = c;
|
|
}
|
|
afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
|
|
if(warn && !afnd) {
|
|
buf[i] = 0;
|
|
fprint(2, "warning: unknown attribute name %S\n", buf);
|
|
}
|
|
// skip whitespace
|
|
while(c < 256 && isspace(c)) {
|
|
c = getchar(ts);
|
|
if(c < 0)
|
|
goto eob_done;
|
|
}
|
|
if(c != '=') {
|
|
if(afnd)
|
|
al = newattr(attid, nil, al);
|
|
goto attrloop_continue;
|
|
}
|
|
//# c is '=' here; skip whitespace
|
|
while(1) {
|
|
c = getchar(ts);
|
|
if(c < 0)
|
|
goto eob_done;
|
|
if(c >= 256 || !isspace(c))
|
|
break;
|
|
}
|
|
quote = 0;
|
|
if(c == '\'' || c == '"') {
|
|
quote = c;
|
|
c = getchar(ts);
|
|
if(c < 0)
|
|
goto eob_done;
|
|
}
|
|
val = nil;
|
|
nv = 0;
|
|
while(1) {
|
|
valloop_continue:
|
|
if(c < 0)
|
|
goto eob_done;
|
|
if(c == '>') {
|
|
if(quote) {
|
|
// c might be part of string (though not good style)
|
|
// but if line ends before close quote, assume
|
|
// there was an unmatched quote
|
|
ti = ts->i;
|
|
while(1) {
|
|
c = getchar(ts);
|
|
if(c < 0)
|
|
goto eob_done;
|
|
if(c == quote) {
|
|
backup(ts, ti);
|
|
buf[nv++] = '>';
|
|
if(nv == BIGBUFSIZE-1) {
|
|
val = buftostr(val, buf, nv);
|
|
nv = 0;
|
|
}
|
|
c = getchar(ts);
|
|
goto valloop_continue;
|
|
}
|
|
if(c == '\n') {
|
|
if(warn)
|
|
fprint(2, "warning: apparent unmatched quote\n");
|
|
backup(ts, ti);
|
|
c = '>';
|
|
goto valloop_done;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
goto valloop_done;
|
|
}
|
|
if(quote) {
|
|
if(c == quote) {
|
|
c = getchar(ts);
|
|
if(c < 0)
|
|
goto eob_done;
|
|
goto valloop_done;
|
|
}
|
|
if(c == '\r') {
|
|
c = getchar(ts);
|
|
goto valloop_continue;
|
|
}
|
|
if(c == '\t' || c == '\n')
|
|
c = ' ';
|
|
}
|
|
else {
|
|
if(c < 256 && isspace(c))
|
|
goto valloop_done;
|
|
}
|
|
if(c == '&') {
|
|
c = ampersand(ts);
|
|
if(c == -1)
|
|
goto eob_done;
|
|
}
|
|
buf[nv++] = c;
|
|
if(nv == BIGBUFSIZE-1) {
|
|
val = buftostr(val, buf, nv);
|
|
nv = 0;
|
|
}
|
|
c = getchar(ts);
|
|
}
|
|
valloop_done:
|
|
if(afnd) {
|
|
val = buftostr(val, buf, nv);
|
|
al = newattr(attid, val, al);
|
|
}
|
|
}
|
|
|
|
attrloop_done:
|
|
tok->attr = al;
|
|
(*pai)++;
|
|
return tok->tag;
|
|
|
|
eob_done:
|
|
if(warn)
|
|
fprint(2, "warning: incomplete tag at end of page\n");
|
|
backup(ts, nexti);
|
|
tok->tag = Data;
|
|
tok->text = _Strdup(L"<");
|
|
return Data;
|
|
}
|
|
|
|
// We've just read a '<!' at position starti,
|
|
// so this may be a comment or other ignored section, or it may
|
|
// be just a literal string if there is no close before end of file
|
|
// (other browsers do that).
|
|
// The accepted practice seems to be (note: contrary to SGML spec!):
|
|
// If see <!--, look for --> to close, or if none, > to close.
|
|
// If see <!(not --), look for > to close.
|
|
// If no close before end of file, leave original characters in as literal data.
|
|
//
|
|
// If we see ignorable stuff, return Comment.
|
|
// Else return nil (caller should back up and try again when more data arrives,
|
|
// unless at end of file, in which case caller should just make '<' a data token).
|
|
static int
|
|
comment(TokenSource* ts)
|
|
{
|
|
int nexti;
|
|
int havecomment;
|
|
int c;
|
|
|
|
nexti = ts->i;
|
|
havecomment = 0;
|
|
c = getchar(ts);
|
|
if(c == '-') {
|
|
c = getchar(ts);
|
|
if(c == '-') {
|
|
if(findstr(ts, L"-->"))
|
|
havecomment = 1;
|
|
else
|
|
backup(ts, nexti);
|
|
}
|
|
}
|
|
if(!havecomment) {
|
|
if(c == '>')
|
|
havecomment = 1;
|
|
else if(c >= 0) {
|
|
if(findstr(ts, L">"))
|
|
havecomment = 1;
|
|
}
|
|
}
|
|
if(havecomment)
|
|
return Comment;
|
|
return -1;
|
|
}
|
|
|
|
// Look for string s in token source.
|
|
// If found, return 1, with buffer at next char after s,
|
|
// else return 0 (caller should back up).
|
|
static int
|
|
findstr(TokenSource* ts, Rune* s)
|
|
{
|
|
int c0;
|
|
int n;
|
|
int nexti;
|
|
int i;
|
|
int c;
|
|
|
|
c0 = s[0];
|
|
n = runestrlen(s);
|
|
while(1) {
|
|
c = getchar(ts);
|
|
if(c < 0)
|
|
break;
|
|
if(c == c0) {
|
|
if(n == 1)
|
|
return 1;
|
|
nexti = ts->i;
|
|
for(i = 1; i < n; i++) {
|
|
c = getchar(ts);
|
|
if(c < 0)
|
|
goto mainloop_done;
|
|
if(c != s[i])
|
|
break;
|
|
}
|
|
if(i == n)
|
|
return 1;
|
|
backup(ts, nexti);
|
|
}
|
|
}
|
|
mainloop_done:
|
|
return 0;
|
|
}
|
|
|
|
// We've just read an '&'; look for an entity reference
|
|
// name, and if found, return translated char.
|
|
// if there is a complete entity name but it isn't known,
|
|
// back up to just past the '&' and return '&'.
|
|
// If the entity can't be completed in the current buffer, back up
|
|
// to the '&' and return -1.
|
|
static int
|
|
ampersand(TokenSource* ts)
|
|
{
|
|
int savei;
|
|
int c;
|
|
int fnd;
|
|
int ans;
|
|
int v;
|
|
int k;
|
|
Rune buf[25];
|
|
|
|
savei = ts->i;
|
|
c = getchar(ts);
|
|
fnd = 0;
|
|
ans = -1;
|
|
if(c == '#') {
|
|
c = getchar(ts);
|
|
v = 0;
|
|
if(c == 'X' || c == 'x')
|
|
for(c = getchar(ts); c < 256; c = getchar(ts))
|
|
if(c >= '0' && c <= '9')
|
|
v = v*16+c-'0';
|
|
else if(c >= 'A' && c<= 'F')
|
|
v = v*16+c-'A'+10;
|
|
else if(c >= 'a' && c <= 'f')
|
|
v = v*16+c-'a'+10;
|
|
else
|
|
break;
|
|
else
|
|
while(c >= 0) {
|
|
if(!(c < 256 && isdigit(c)))
|
|
break;
|
|
v = v*10 + c - 48;
|
|
c = getchar(ts);
|
|
}
|
|
if(c >= 0) {
|
|
if(!(c == ';' || c == '\n' || c == '\r'))
|
|
ungetchar(ts, c);
|
|
c = v;
|
|
if(c == 160)
|
|
c = 160;
|
|
if(c >= Winstart && c <= Winend) {
|
|
c = winchars[c - Winstart];
|
|
}
|
|
ans = c;
|
|
fnd = 1;
|
|
}
|
|
}
|
|
else if(c < 256 && isalpha(c)) {
|
|
buf[0] = c;
|
|
k = 1;
|
|
while(1) {
|
|
c = getchar(ts);
|
|
if(c < 0)
|
|
break;
|
|
if(c < 256 && (isalpha(c) || isdigit(c))) {
|
|
if(k < nelem(buf)-1)
|
|
buf[k++] = c;
|
|
}
|
|
else {
|
|
if(!(c == ';' || c == '\n' || c == '\r'))
|
|
ungetchar(ts, c);
|
|
break;
|
|
}
|
|
}
|
|
if(c >= 256 || c != '=' && !(isalpha(c) || isdigit(c)))
|
|
fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
|
|
}
|
|
if(!fnd) {
|
|
backup(ts, savei);
|
|
ans = '&';
|
|
}
|
|
return ans;
|
|
}
|
|
|
|
// Get next char, obeying ts.chset.
|
|
// Returns -1 if no complete character left before current end of data.
|
|
static int
|
|
getchar(TokenSource* ts)
|
|
{
|
|
uchar* buf;
|
|
int c;
|
|
int n;
|
|
int ok;
|
|
Rune r;
|
|
|
|
if(ts->i >= ts->edata)
|
|
return -1;
|
|
buf = ts->data;
|
|
c = buf[ts->i];
|
|
switch(ts->chset) {
|
|
case ISO_8859_1:
|
|
if(c >= Winstart && c <= Winend)
|
|
c = winchars[c - Winstart];
|
|
ts->i++;
|
|
break;
|
|
case US_Ascii:
|
|
if(c > 127) {
|
|
if(warn)
|
|
fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
|
|
}
|
|
ts->i++;
|
|
break;
|
|
case UTF_8:
|
|
ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);
|
|
n = chartorune(&r, (char*)(buf+ts->i));
|
|
if(ok) {
|
|
if(warn && c == 0x80)
|
|
fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
|
|
ts->i += n;
|
|
c = r;
|
|
}
|
|
else {
|
|
// not enough bytes in buf to complete utf-8 char
|
|
ts->i = ts->edata; // mark "all used"
|
|
c = -1;
|
|
}
|
|
break;
|
|
case Unicode:
|
|
if(ts->i < ts->edata - 1) {
|
|
//standards say most-significant byte first
|
|
c = (c << 8)|(buf[ts->i + 1]);
|
|
ts->i += 2;
|
|
}
|
|
else {
|
|
ts->i = ts->edata; // mark "all used"
|
|
c = -1;
|
|
}
|
|
break;
|
|
default:
|
|
return -1;
|
|
}
|
|
return c;
|
|
}
|
|
|
|
// Assuming c was the last character returned by getchar, set
|
|
// things up so that next getchar will get that same character
|
|
// followed by the current 'next character', etc.
|
|
static void
|
|
ungetchar(TokenSource* ts, int c)
|
|
{
|
|
int n;
|
|
Rune r;
|
|
char a[UTFmax];
|
|
|
|
n = 1;
|
|
switch(ts->chset) {
|
|
case UTF_8:
|
|
if(c >= 128) {
|
|
r = c;
|
|
n = runetochar(a, &r);
|
|
}
|
|
break;
|
|
case Unicode:
|
|
n = 2;
|
|
break;
|
|
}
|
|
ts->i -= n;
|
|
}
|
|
|
|
// Restore ts so that it is at the state where the index was savei.
|
|
static void
|
|
backup(TokenSource* ts, int savei)
|
|
{
|
|
if(dbglex)
|
|
fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei);
|
|
ts->i = savei;
|
|
}
|
|
|
|
|
|
// Look for value associated with attribute attid in token t.
|
|
// If there is one, return 1 and put the value in *pans,
|
|
// else return 0.
|
|
// If xfer is true, transfer ownership of the string to the caller
|
|
// (nil it out here); otherwise, caller must duplicate the answer
|
|
// if it needs to save it.
|
|
// OK to have pans==0, in which case this is just looking
|
|
// to see if token is present.
|
|
int
|
|
_tokaval(Token* t, int attid, Rune** pans, int xfer)
|
|
{
|
|
Attr* attr;
|
|
|
|
attr = t->attr;
|
|
while(attr != nil) {
|
|
if(attr->attid == attid) {
|
|
if(pans != nil)
|
|
*pans = attr->value;
|
|
if(xfer)
|
|
attr->value = nil;
|
|
return 1;
|
|
}
|
|
attr = attr->next;
|
|
}
|
|
if(pans != nil)
|
|
*pans = nil;
|
|
return 0;
|
|
}
|
|
|
|
static int
|
|
Tconv(Fmt *f)
|
|
{
|
|
Token* t;
|
|
int i;
|
|
int tag;
|
|
char* srbra;
|
|
Rune* aname;
|
|
Rune* tname;
|
|
Attr* a;
|
|
char buf[BIGBUFSIZE];
|
|
|
|
t = va_arg(f->args, Token*);
|
|
if(t == nil)
|
|
sprint(buf, "<null>");
|
|
else {
|
|
i = 0;
|
|
if(dbglex > 1)
|
|
i = snprint(buf, sizeof(buf), "[%d]", t->starti);
|
|
tag = t->tag;
|
|
if(tag == Data) {
|
|
i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text);
|
|
}
|
|
else {
|
|
srbra = "";
|
|
if(tag >= RBRA) {
|
|
tag -= RBRA;
|
|
srbra = "/";
|
|
}
|
|
tname = tagnames[tag];
|
|
if(tag == Notfound)
|
|
tname = L"?";
|
|
i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname);
|
|
for(a = t->attr; a != nil; a = a->next) {
|
|
aname = attrnames[a->attid];
|
|
i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname);
|
|
if(a->value != nil)
|
|
i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value);
|
|
}
|
|
i += snprint(buf+i, sizeof(buf)-i-1, ">");
|
|
}
|
|
buf[i] = 0;
|
|
}
|
|
return fmtstrcpy(f, buf);
|
|
}
|
|
|
|
// Attrs own their constituent strings, but build may eventually
|
|
// transfer some values to its items and nil them out in the Attr.
|
|
static Attr*
|
|
newattr(int attid, Rune* value, Attr* link)
|
|
{
|
|
Attr* ans;
|
|
|
|
ans = (Attr*)emalloc(sizeof(Attr));
|
|
ans->attid = attid;
|
|
ans->value = value;
|
|
ans->next = link;
|
|
return ans;
|
|
}
|
|
|
|
// Free list of Attrs linked through next field
|
|
static void
|
|
freeattrs(Attr* ahead)
|
|
{
|
|
Attr* a;
|
|
Attr* nexta;
|
|
|
|
a = ahead;
|
|
while(a != nil) {
|
|
nexta = a->next;
|
|
free(a->value);
|
|
free(a);
|
|
a = nexta;
|
|
}
|
|
}
|
|
|
|
// Free array of Tokens.
|
|
// Allocated space might have room for more than n tokens,
|
|
// but only n of them are initialized.
|
|
// If caller has transferred ownership of constitutent strings
|
|
// or attributes, it must have nil'd out the pointers in the Tokens.
|
|
void
|
|
_freetokens(Token* tarray, int n)
|
|
{
|
|
int i;
|
|
Token* t;
|
|
|
|
if(tarray == nil)
|
|
return;
|
|
for(i = 0; i < n; i++) {
|
|
t = &tarray[i];
|
|
free(t->text);
|
|
freeattrs(t->attr);
|
|
}
|
|
free(tarray);
|
|
}
|