plan9port/src/libhtml/lex.c

#include <u.h>
#include <libc.h>
#include <draw.h>
#include <ctype.h>
#include <html.h>
#include "impl.h"

/*
 * Read cursor over the raw bytes being lexed.
 * getchar() decodes the character starting at data[i] according to chset
 * and advances i; ungetchar() and backup() move i back again.
 * The data pointer is borrowed: newtokensource() stores the caller's
 * pointer and _gettoks() frees only the TokenSource itself.
 */
typedef struct TokenSource TokenSource;
struct TokenSource
{
	int			i;		/* index of next byte to use */
	uchar*		data;		/* all the data */
	int			edata;	/* data[0:edata] is valid */
	int			chset;	/* one of US_Ascii, ISO_8859_1, UTF_8, Unicode */
	int			mtype;	/* TextHtml or TextPlain */
};

/*
 * Negative sentinel values.  Neither name is referenced in the visible part
 * of this file; getchar() below returns a literal -1 at end of data.
 */
enum {
	EOF = -2,
	EOB = -1
};

/*
 * True if c may continue a tag, attribute or entity name:
 * a letter, a digit, '-' or '.', and below 256 so the ctype calls are in range.
 * Callers test c for negative values before using this.
 * The argument is evaluated more than once; do not pass an expression with side effects.
 */
#define ISNAMCHAR(c)	((c)<256 && (isalpha(c) || isdigit(c) || (c) == '-' || (c) == '.'))

/*
 * Sizes, in Runes, of the on-stack scratch buffers:
 * SMALLBUFSIZE for entity names (ampersand), BIGBUFSIZE for everything else.
 */
#define SMALLBUFSIZE 240
#define BIGBUFSIZE 2000

/* HTML 4.0 tag names. */
/* Keep sorted: lexinit() builds tagtable from the first Numtags entries */
/* and gettag() searches it with _lookup(). */
/* Keep in correspondence with the tag enum: Tconv() indexes tagnames[] */
/* directly by tag value. */
/* NOTE(review): the enum is presumably declared in impl.h (this comment */
/* used to say iparse.h) -- confirm. */
Rune **tagnames;
char *_tagnames[] = {
	" ",
	"!",
	"a",
	"abbr",
	"acronym",
	"address",
	"applet",
	"area",
	"b",
	"base",
	"basefont",
	"bdo",
	"big",
	"blink",
	"blockquote",
	"body",
	"bq",
	"br",
	"button",
	"caption",
	"center",
	"cite",
	"code",
	"col",
	"colgroup",
	"dd",
	"del",
	"dfn",
	"dir",
	"div",
	"dl",
	"dt",
	"em",
	"fieldset",
	"font",
	"form",
	"frame",
	"frameset",
	"h1",
	"h2",
	"h3",
	"h4",
	"h5",
	"h6",
	"head",
	"hr",
	"html",
	"i",
	"iframe",
	"img",
	"input",
	"ins",
	"isindex",
	"kbd",
	"label",
	"legend",
	"li",
	"link",
	"map",
	"menu",
	"meta",
	"nobr",
	"noframes",
	"noscript",
	"object",
	"ol",
	"optgroup",
	"option",
	"p",
	"param",
	"pre",
	"q",
	"s",
	"samp",
	"script",
	"select",
	"small",
	"span",
	"strike",
	"strong",
	"style",
	"sub",
	"sup",
	"table",
	"tbody",
	"td",
	"textarea",
	"tfoot",
	"th",
	"thead",
	"title",
	"tr",
	"tt",
	"u",
	"ul",
	"var"
};

/* HTML 4.0 attribute names. */
/* Keep sorted: lexinit() builds attrtable from the first Numattrs entries */
/* and gettag() searches it with _lookup(). */
/* Keep in correspondence with the attribute enum: Tconv() indexes */
/* attrnames[] directly by attid. */
/* NOTE(review): the enum is presumably declared in impl.h (this comment */
/* used to say i.h) -- confirm. */
Rune **attrnames;
char* _attrnames[] = {
	"abbr",
	"accept-charset",
	"access-key",
	"action",
	"align",
	"alink",
	"alt",
	"archive",
	"axis",
	"background",
	"bgcolor",
	"border",
	"cellpadding",
	"cellspacing",
	"char",
	"charoff",
	"charset",
	"checked",
	"cite",
	"class",
	"classid",
	"clear",
	"code",
	"codebase",
	"codetype",
	"color",
	"cols",
	"colspan",
	"compact",
	"content",
	"coords",
	"data",
	"datetime",
	"declare",
	"defer",
	"dir",
	"disabled",
	"enctype",
	"face",
	"for",
	"frame",
	"frameborder",
	"headers",
	"height",
	"href",
	"hreflang",
	"hspace",
	"http-equiv",
	"id",
	"ismap",
	"label",
	"lang",
	"link",
	"longdesc",
	"marginheight",
	"marginwidth",
	"maxlength",
	"media",
	"method",
	"multiple",
	"name",
	"nohref",
	"noresize",
	"noshade",
	"nowrap",
	"object",
	"onblur",
	"onchange",
	"onclick",
	"ondblclick",
	"onfocus",
	"onkeypress",
	"onkeyup",
	"onload",
	"onmousedown",
	"onmousemove",
	"onmouseout",
	"onmouseover",
	"onmouseup",
	"onreset",
	"onselect",
	"onsubmit",
	"onunload",
	"profile",
	"prompt",
	"readonly",
	"rel",
	"rev",
	"rows",
	"rowspan",
	"rules",
	"scheme",
	"scope",
	"scrolling",
	"selected",
	"shape",
	"size",
	"span",
	"src",
	"standby",
	"start",
	"style",
	"summary",
	"tabindex",
	"target",
	"text",
	"title",
	"type",
	"usemap",
	"valign",
	"value",
	"valuetype",
	"version",
	"vlink",
	"vspace",
	"width"
};


/* Character entity to unicode character number map. */
/* Keep sorted by name, in byte order (so the upper-case names come first): */
/* ampersand() searches this table with _lookup(), which presumably relies */
/* on the ordering.  "ldots" must therefore precede "ldquo". */
StringInt *chartab;
AsciiInt _chartab[] = {
	{"AElig", 198},
	{"Aacute", 193},
	{"Acirc", 194},
	{"Agrave", 192},
	{"Aring", 197},
	{"Atilde", 195},
	{"Auml", 196},
	{"Ccedil", 199},
	{"ETH", 208},
	{"Eacute", 201},
	{"Ecirc", 202},
	{"Egrave", 200},
	{"Euml", 203},
	{"Iacute", 205},
	{"Icirc", 206},
	{"Igrave", 204},
	{"Iuml", 207},
	{"Ntilde", 209},
	{"Oacute", 211},
	{"Ocirc", 212},
	{"Ograve", 210},
	{"Oslash", 216},
	{"Otilde", 213},
	{"Ouml", 214},
	{"THORN", 222},
	{"Uacute", 218},
	{"Ucirc", 219},
	{"Ugrave", 217},
	{"Uuml", 220},
	{"Yacute", 221},
	{"aacute", 225},
	{"acirc", 226},
	{"acute", 180},
	{"aelig", 230},
	{"agrave", 224},
	{"alpha", 945},
	{"amp", 38},
	{"aring", 229},
	{"atilde", 227},
	{"auml", 228},
	{"beta", 946},
	{"brvbar", 166},
	{"ccedil", 231},
	{"cdots", 8943},
	{"cedil", 184},
	{"cent", 162},
	{"chi", 967},
	{"copy", 169},
	{"curren", 164},
	{"ddots", 8945},
	{"deg", 176},
	{"delta", 948},
	{"divide", 247},
	{"eacute", 233},
	{"ecirc", 234},
	{"egrave", 232},
	{"emdash", 8212},	/* non-standard but commonly used */
	{"emsp", 8195},
	{"endash", 8211},	/* non-standard but commonly used */
	{"ensp", 8194},
	{"epsilon", 949},
	{"eta", 951},
	{"eth", 240},
	{"euml", 235},
	{"frac12", 189},
	{"frac14", 188},
	{"frac34", 190},
	{"gamma", 947},
	{"gt", 62},
	{"iacute", 237},
	{"icirc", 238},
	{"iexcl", 161},
	{"igrave", 236},
	{"iota", 953},
	{"iquest", 191},
	{"iuml", 239},
	{"kappa", 954},
	{"lambda", 955},
	{"laquo", 171},
	{"ldots", 8230},
	{"ldquo", 8220},
	{"lsquo", 8216},
	{"lt", 60},
	{"macr", 175},
	{"mdash", 8212},
	{"micro", 181},
	{"middot", 183},
	{"mu", 956},
	{"nbsp", 160},
	{"ndash", 8211},
	{"not", 172},
	{"ntilde", 241},
	{"nu", 957},
	{"oacute", 243},
	{"ocirc", 244},
	{"ograve", 242},
	{"omega", 969},
	{"omicron", 959},
	{"ordf", 170},
	{"ordm", 186},
	{"oslash", 248},
	{"otilde", 245},
	{"ouml", 246},
	{"para", 182},
	{"phi", 966},
	{"pi", 960},
	{"plusmn", 177},
	{"pound", 163},
	{"psi", 968},
	{"quad", 8193},
	{"quot", 34},
	{"raquo", 187},
	{"rdquo", 8221},
	{"reg", 174},
	{"rho", 961},
	{"rsquo", 8217},
	{"sect", 167},
	{"shy", 173},
	{"sigma", 963},
	{"sp", 8194},
	{"sup1", 185},
	{"sup2", 178},
	{"sup3", 179},
	{"szlig", 223},
	{"tau", 964},
	{"theta", 952},
	{"thinsp", 8201},
	{"thorn", 254},
	{"times", 215},
	{"trade", 8482},
	{"uacute", 250},
	{"ucirc", 251},
	{"ugrave", 249},
	{"uml", 168},
	{"upsilon", 965},
	{"uuml", 252},
	{"varepsilon", 8712},
	{"varphi", 981},
	{"varpi", 982},
	{"varrho", 1009},
	{"vdots", 8942},
	{"vsigma", 962},
	{"vtheta", 977},
	{"xi", 958},
	{"yacute", 253},
	{"yen", 165},
	{"yuml", 255},
	{"zeta", 950}
};
/* Number of entries in _chartab (and so in chartab once lexinit has run). */
#define NCHARTAB (sizeof(_chartab)/sizeof(_chartab[0]))

/* Characters Winstart..Winend are those that Windows */
/* uses interpolated into the Latin1 set. */
/* They aren't supposed to appear in HTML, but they do.... */
/* getchar() (for ISO_8859_1 input) and ampersand() (for numeric */
/* references) replace such a code c by winchars[c - Winstart]. */
enum {
	Winstart = 127,
	Winend = 159
};

/* One replacement code point per code in Winstart..Winend (33 entries). */
static int	winchars[]= { 8226,	/* 8226 is a bullet */
	8226, 8226, 8218, 402, 8222, 8230, 8224, 8225,
	710, 8240, 352, 8249, 338, 8226, 8226, 8226,
	8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,
	732, 8482, 353, 8250, 339, 8226, 8226, 376};

/* Name-to-number tables searched with _lookup(); built once by lexinit(). */
static StringInt*	tagtable;		/* initialized from tagnames */
static StringInt*	attrtable;		/* initialized from attrnames */

/* Forward declarations for the file-local helpers defined below. */
static void		lexinit(void);
static int		getplaindata(TokenSource* ts, Token* a, int* pai);
static int		getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
static int		getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
static int		gettag(TokenSource* ts, int starti, Token* a, int* pai);
static Rune*		buftostr(Rune* s, Rune* buf, int j);
static int		comment(TokenSource* ts);
static int		findstr(TokenSource* ts, Rune* s);
static int		ampersand(TokenSource* ts);
/*static int		lowerc(int c); */
static int		getchar(TokenSource* ts);
static void		ungetchar(TokenSource* ts, int c);
static void		backup(TokenSource* ts, int savei);
/*static void		freeinsidetoken(Token* t); */
static void		freeattrs(Attr* ahead);
static Attr*		newattr(int attid, Rune* value, Attr* link);
static int		Tconv(Fmt* f);

/* Debug level: nonzero traces _gettoks and backup; above 1 also prints each token. */
int	dbglex = 0;
/* Set by lexinit() so that _gettoks() runs the setup only once. */
static int lexinited = 0;

/*
 * One-time setup, run from _gettoks() on first use:
 * builds the Rune versions of the name tables, the lookup tables
 * derived from them, and installs the %T (Token) print verb.
 */
static void
lexinit(void)
{
	/* %T formats a Token; see Tconv */
	fmtinstall('T', Tconv);

	/* tag names, then the name->tag lookup table built from them */
	tagnames = _cvtstringtab(_tagnames, nelem(_tagnames));
	tagtable = _makestrinttab(tagnames, Numtags);

	/* attribute names, then the name->attid lookup table */
	attrnames = _cvtstringtab(_attrnames, nelem(_attrnames));
	attrtable = _makestrinttab(attrnames, Numattrs);

	/* character entity table */
	chartab = _cvtstringinttab(_chartab, nelem(_chartab));

	lexinited = 1;
}

/*
 * Allocate a TokenSource positioned at the start of data[0:edata].
 * The data pointer is stored, not copied.
 * chset must be one of the four character sets getchar() understands.
 */
static TokenSource*
newtokensource(uchar* data, int edata, int chset, int mtype)
{
	TokenSource*	src;

	assert(chset == US_Ascii || chset == ISO_8859_1 ||
			chset == UTF_8 || chset == Unicode);

	src = (TokenSource*)emalloc(sizeof(TokenSource));
	src->data = data;
	src->edata = edata;
	src->i = 0;
	src->mtype = mtype;
	src->chset = chset;
	return src;
}

enum {
	ToksChunk = 500		/* the token array grows by this many slots at a time */
};

/*
 * Call this to get the tokens.
 * Lexes data[0:datalen], decoded according to chset, as HTML when mtype is
 * TextHtml and otherwise as plain text (one Data token per line).
 * Returns a newly allocated array of tokens and stores its length in *plen;
 * returns nil (with *plen == 0) if no token was produced.
 * The caller releases the array with _freetokens().
 */
Token*
_gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
{
	TokenSource*	ts;
	Token*		a;
	int	alen;
	int	ai;
	int	starti;
	int	c;
	int	tag;

	if(!lexinited)
		lexinit();
	ts = newtokensource(data, datalen, chset, mtype);
	alen = ToksChunk;
	a = (Token*)emalloc(alen * sizeof(Token));
	ai = 0;
	if(dbglex)
		fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
	if(ts->mtype == TextHtml){
		for(;;){
			/*
			 * One pass can use two slots: gettag fills a[ai] with a
			 * <script> tag and getscriptdata then works in a[ai+1].
			 * Make sure both exist before starting.
			 */
			if(alen - ai < 2){
				alen += ToksChunk;
				a = (Token*)erealloc(a, alen*sizeof(Token));
			}
			starti = ts->i;
			c = getchar(ts);
			if(c < 0)
				break;
			if(c == '<'){
				tag = gettag(ts, starti, a, &ai);
				if(tag == Tscript){
					/* special rules for getting Data after.... */
					starti = ts->i;
					c = getchar(ts);
					tag = getscriptdata(ts, c, starti, a, &ai);
				}
			}
			else
				tag = getdata(ts, c, starti, a, &ai);
			if(tag == -1)
				break;
			/* ai > 0: never look at a[-1] if nothing has been stored yet */
			else if(dbglex > 1 && tag != Comment && ai > 0)
				fprint(2, "lex: got token %T\n", &a[ai-1]);
		}
	}
	else {
		/* plain text (non-html) tokens */
		for(;;){
			if(ai == alen){
				alen += ToksChunk;
				a = (Token*)erealloc(a, alen*sizeof(Token));
			}
			tag = getplaindata(ts, a, &ai);
			if(tag == -1)
				break;
			/* getplaindata has already bumped ai past the new token */
			if(dbglex > 1)
				fprint(2, "lex: got token %T\n", &a[ai-1]);
		}
	}
	if(dbglex)
		fprint(2, "lex: returning %d tokens\n", ai);
	*plen = ai;
	free(ts);
	if(ai == 0) {
		free(a);
		return nil;
	}
	return a;
}

/*
 * Plain-text (non-HTML) input: build one Data token per line, or per
 * partial line at the end of the data.
 * A '\r' becomes '\n' (a following '\n' is absorbed into it); other
 * control characters that are not whitespace are dropped.
 * The terminating '\n' is part of the token text.
 * If the text gathered is non-empty, fill in a[*pai], bump *pai and
 * return Data; otherwise return -1.
 */
static int
getplaindata(TokenSource* ts, Token* a, int* pai)
{
	Rune	buf[BIGBUFSIZE];
	Rune*	line;
	Token*	tok;
	int	n;
	int	c;
	int	next;
	int	starti;

	line = nil;
	n = 0;
	starti = ts->i;
	while((c = getchar(ts)) >= 0){
		if(c == '\r'){
			/* "\r\n" and a bare "\r" both count as one newline */
			next = getchar(ts);
			if(next != '\n' && next >= 0)
				ungetchar(ts, next);
			c = '\n';
		}
		else if(c < ' ' && !isspace(c))
			continue;	/* non-whitespace control character: ignore */

		buf[n++] = c;
		if(n == BIGBUFSIZE-1){
			/* buffer full (one slot is kept for the terminator): flush */
			line = buftostr(line, buf, n);
			n = 0;
		}
		if(c == '\n')
			break;
	}
	line = buftostr(line, buf, n);
	if(line == nil)
		return -1;

	tok = &a[*pai];
	(*pai)++;
	tok->tag = Data;
	tok->text = line;
	tok->attr = nil;
	tok->starti = starti;
	return Data;
}

/*
 * Return a new string holding s followed by buf[0:j]; s may be nil.
 * Frees s.  Writes a terminator at buf[j], so buf needs room for j+1 Runes.
 */
static Rune*
buftostr(Rune* s, Rune* buf, int j)
{
	Rune *joined;

	buf[j] = 0;
	joined = (s != nil) ? _Strdup2(s, buf) : _Strndup(buf, j);
	free(s);
	return joined;
}

/*
 * Gather data up to next start-of-tag or end-of-buffer.
 * firstc is the character the caller already read; starti is its position.
 * Translate entity references (&amp;).
 * Ignore non-whitespace control characters and get rid of \r's.
 * If find non-empty token, fill in a[*pai], bump *pai, and return Data.
 * If nothing was gathered but a '<' stopped the scan (the run consisted only
 * of ignored characters), return Comment: no token is stored, and the caller
 * carries on with the tag instead of treating this as the end of input.
 * Otherwise return -1.
 */
static int
getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
{
	Rune*	s;
	int	j;
	int	c;
	int	sawtag;
	Token*	tok;
	Rune	buf[BIGBUFSIZE];

	s = nil;
	j = 0;
	sawtag = 0;
	c = firstc;
	while(c >= 0){
		if(c == '&'){
			c = ampersand(ts);
			if(c < 0)
				break;
		}
		else if(c < ' '){
			if(isspace(c)){
				if(c == '\r'){
					/* ignore it unless no following '\n', */
					/* in which case treat it like '\n' */
					c = getchar(ts);
					if(c != '\n'){
						if(c >= 0)
							ungetchar(ts, c);
						c = '\n';
					}
				}
			}
			else {
				if(warn)
					fprint(2, "warning: non-whitespace control character %d ignored\n", c);
				c = 0;
			}
		}
		else if(c == '<'){
			/* leave the '<' for the caller's next getchar */
			ungetchar(ts, c);
			sawtag = 1;
			break;
		}
		if(c != 0){
			buf[j++] = c;
			if(j == BIGBUFSIZE-1){
				/* buffer full (one slot is kept for the terminator): flush */
				s = buftostr(s, buf, j);
				j = 0;
			}
		}
		c = getchar(ts);
	}
	s = buftostr(s, buf, j);
	if(s == nil){
		/* more input follows the '<', so this must not end the lexing */
		if(sawtag)
			return Comment;
		return -1;
	}
	tok = &a[(*pai)++];
	tok->tag = Data;
	tok->text = s;
	tok->attr = nil;
	tok->starti = starti;
	return Data;
}

/*
 * The rules for lexing scripts are different (ugh).
 * Gather up everything until see a </SCRIPT>.
 * firstc is the character the caller already read; starti is its position.
 * Every '<' is handed to gettag() just to find out what it starts; the
 * token gettag() builds is always discarded again.  If it was </SCRIPT>
 * the source is left positioned at its '<' and the script text ends there;
 * anything else is kept as literal script text.
 * Stores a Data token in a[*pai], bumps *pai and returns Data when the
 * script was closed or the data ran out; otherwise backs up to starti and
 * returns -1.  The token text may be nil for an empty script.
 */
static int
getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
{
	Rune*	s;
	int	j;
	int	tstarti;
	int	savei;
	int	saveai;
	int	c;
	int	tag;
	int	done;
	Token*	tok;
	Rune	buf[BIGBUFSIZE];

	s = nil;
	j = 0;
	tstarti = starti;
	c = firstc;
	done = 0;
	while(c >= 0){
		if(c == '<'){
			/* other browsers ignore stuff to end of line after <! */
			savei = ts->i;
			c = getchar(ts);
			if(c == '!'){
				while(c >= 0 && c != '\n' && c != '\r')
					c = getchar(ts);
				if(c == '\r')
					c = getchar(ts);
				if(c == '\n')
					c = getchar(ts);
			}
			else if(c >= 0){
				backup(ts, savei);
				saveai = *pai;
				tag = gettag(ts, tstarti, a, pai);
				if(tag == -1)
					break;
				/*
				 * Throw the probe token away.  gettag always initializes
				 * a[saveai], so release whatever it attached there and put
				 * the index back where it was, whether or not it was bumped.
				 */
				tok = &a[saveai];
				free(tok->text);
				freeattrs(tok->attr);
				tok->text = nil;
				tok->attr = nil;
				*pai = saveai;
				backup(ts, tstarti);
				if(tag == Tscript + RBRA){
					done = 1;
					break;
				}
				/* here tag was not </SCRIPT>, so take as regular data */
				c = getchar(ts);
			}
		}
		if(c < 0)
			break;
		if(c != 0){
			buf[j++] = c;
			if(j == BIGBUFSIZE-1){
				/* buffer full (one slot is kept for the terminator): flush */
				s = buftostr(s, buf, j);
				j = 0;
			}
		}
		/* remember where the next character starts, in case it is a '<' */
		tstarti = ts->i;
		c = getchar(ts);
	}
	if(done || ts->i == ts->edata){
		s = buftostr(s, buf, j);
		tok = &a[(*pai)++];
		tok->tag = Data;
		tok->text = s;
		tok->attr = nil;
		tok->starti = starti;
		return Data;
	}
	backup(ts, starti);
	return -1;
}

/*
 * We've just seen a '<' (at position starti).  Gather up stuff to the
 * closing '>'.
 * If it's a tag, look up the name, gather the attributes, and return
 * the tag number (plus RBRA for an end tag); an unknown tag name yields
 * Notfound with the name saved in the token text.  Attributes whose
 * names are unknown are parsed but not recorded.
 * Else it's either just plain data or some kind of ignorable stuff:
 * return Data or Comment as appropriate.
 * If the data ends before the tag is complete, back up to just after the
 * '<' and return a Data token holding that '<', so that the rest is
 * lexed as ordinary text.
 * If it's not a Comment, put it in a[*pai] and bump *pai.
 */
static int
gettag(TokenSource* ts, int starti, Token* a, int* pai)
{
	int	rbra;
	int	ans;
	Attr*	al;
	int	nexti;
	int	c;
	int	ti;
	int	afnd;
	int	attid;
	int	quote;
	Rune*	val;
	int	nv;
	int	i;
	int	tag;
	Token*	tok;
	Rune	buf[BIGBUFSIZE];

	rbra = 0;
	/* al and val are released at eob_done, so they must be valid from the start */
	al = nil;
	val = nil;
	nexti = ts->i;
	tok = &a[*pai];
	tok->tag = Notfound;
	tok->text = nil;
	tok->attr = nil;
	tok->starti = starti;
	c = getchar(ts);
	if(c == '/'){
		rbra = RBRA;
		c = getchar(ts);
	}
	if(c < 0)
		goto eob_done;
	if(c >= 256 || !isalpha(c)){
		/* not a tag */
		if(c == '!'){
			ans = comment(ts);
			if(ans != -1)
				return ans;
			goto eob_done;
		}
		else {
			backup(ts, nexti);
			tok->tag = Data;
			tok->text = _Strdup(L(Llt));
			(*pai)++;
			return Data;
		}
	}
	/* c starts a tagname */
	buf[0] = c;
	i = 1;
	for(;;){
		c = getchar(ts);
		if(c < 0)
			goto eob_done;
		if(!ISNAMCHAR(c))
			break;
		/* if name is bigger than buf it won't be found anyway... */
		if(i < BIGBUFSIZE)
			buf[i++] = c;
	}
	if(_lookup(tagtable, Numtags, buf, i, &tag))
		tok->tag = tag + rbra;
	else
		tok->text = _Strndup(buf, i);	/* for warning print, in build */

	/* attribute gathering loop */
	for(;;){
		/* look for "ws name" or "ws name ws = ws val"  (ws=whitespace) */
		/* skip whitespace */
attrloop_continue:
		while(c < 256 && isspace(c)){
			c = getchar(ts);
			if(c < 0)
				goto eob_done;
		}
		if(c == '>')
			goto attrloop_done;
		if(c == '<'){
			if(warn)
				fprint(2, "warning: unclosed tag\n");
			ungetchar(ts, c);
			goto attrloop_done;
		}
		if(c >= 256 || !isalpha(c)){
			if(warn)
				fprint(2, "warning: expected attribute name\n");
			/* skip to next attribute name */
			for(;;){
				c = getchar(ts);
				if(c < 0)
					goto eob_done;
				if(c < 256 && isalpha(c))
					goto attrloop_continue;
				if(c == '<'){
					if(warn)
						fprint(2, "warning: unclosed tag\n");
					ungetchar(ts, c);
					goto attrloop_done;
				}
				if(c == '>')
					goto attrloop_done;
			}
		}
		/* gather attribute name */
		buf[0] = c;
		i = 1;
		for(;;){
			c = getchar(ts);
			if(c < 0)
				goto eob_done;
			if(!ISNAMCHAR(c))
				break;
			/* one slot is kept free for the terminator used in the warning */
			if(i < BIGBUFSIZE-1)
				buf[i++] = c;
		}
		afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
		if(warn && !afnd){
			buf[i] = 0;
			fprint(2, "warning: unknown attribute name %S\n", buf);
		}
		/* skip whitespace */
		while(c < 256 && isspace(c)){
			c = getchar(ts);
			if(c < 0)
				goto eob_done;
		}
		if(c != '='){
			/* attribute without a value */
			if(afnd)
				al = newattr(attid, nil, al);
			goto attrloop_continue;
		}
		/*# c is '=' here;  skip whitespace */
		for(;;){
			c = getchar(ts);
			if(c < 0)
				goto eob_done;
			if(c >= 256 || !isspace(c))
				break;
		}
		quote = 0;
		if(c == '\'' || c == '"'){
			quote = c;
			c = getchar(ts);
			if(c < 0)
				goto eob_done;
		}
		val = nil;
		nv = 0;
		for(;;){
valloop_continue:
			if(c < 0)
				goto eob_done;
			if(c == '>'){
				if(quote){
					/* c might be part of string (though not good style) */
					/* but if line ends before close quote, assume */
					/* there was an unmatched quote */
					ti = ts->i;
					for(;;){
						c = getchar(ts);
						if(c < 0)
							goto eob_done;
						if(c == quote){
							backup(ts, ti);
							buf[nv++] = '>';
							if(nv == BIGBUFSIZE-1){
								val = buftostr(val, buf, nv);
								nv = 0;
							}
							c = getchar(ts);
							goto valloop_continue;
						}
						if(c == '\n'){
							if(warn)
								fprint(2, "warning: apparent unmatched quote\n");
							backup(ts, ti);
							c = '>';
							goto valloop_done;
						}
					}
				}
				else
					goto valloop_done;
			}
			if(quote){
				if(c == quote){
					c = getchar(ts);
					if(c < 0)
						goto eob_done;
					goto valloop_done;
				}
				if(c == '\r'){
					c = getchar(ts);
					goto valloop_continue;
				}
				if(c == '\t' || c == '\n')
					c = ' ';
			}
			else {
				if(c < 256 && isspace(c))
					goto valloop_done;
			}
			if(c == '&'){
				c = ampersand(ts);
				if(c == -1)
					goto eob_done;
			}
			buf[nv++] = c;
			if(nv == BIGBUFSIZE-1){
				val = buftostr(val, buf, nv);
				nv = 0;
			}
			c = getchar(ts);
		}
valloop_done:
		if(afnd){
			val = buftostr(val, buf, nv);
			al = newattr(attid, val, al);
		}
		else
			free(val);	/* value of an unknown attribute: drop any flushed part */
		/* the string now belongs to the Attr (or is gone) */
		val = nil;
	}

attrloop_done:
	tok->attr = al;
	(*pai)++;
	return tok->tag;

eob_done:
	if(warn)
		fprint(2, "warning: incomplete tag at end of page\n");
	backup(ts, nexti);
	/* release everything gathered for the tag that never completed */
	free(val);
	freeattrs(al);
	free(tok->text);
	tok->tag = Data;
	tok->text = _Strdup(L(Llt));
	tok->attr = nil;
	/* the '<' is a real token, like in the "not a tag" case above */
	(*pai)++;
	return Data;
}

/*
 * We've just read a '<!', so this may be a comment or other ignored
 * section, or it may be just a literal string if there is no close
 * before the end of the data (other browsers do that).
 * The accepted practice seems to be (note: contrary to SGML spec!):
 *	If see <!--, look for --> to close, or if none, > to close.
 *	If see <!(not --), look for > to close.
 *
 * Returns Comment, with the source positioned after the close, if
 * ignorable stuff was found.
 * Returns -1 if no close was found; the caller then backs up and turns
 * the '<' into a data token.
 */
static int
comment(TokenSource* ts)
{
	int	afterbang;
	int	c;

	afterbang = ts->i;
	c = getchar(ts);
	if(c == '-'){
		c = getchar(ts);
		if(c == '-'){
			if(findstr(ts, L(Larrow)))
				return Comment;
			/* no "-->": rescan from just after "<!" for a plain '>' */
			backup(ts, afterbang);
		}
	}
	if(c == '>')
		return Comment;
	if(c >= 0 && findstr(ts, L(Lgt)))
		return Comment;
	return -1;
}

/*
 * Look for string s in token source, starting at the current position.
 * If found, return 1, with the source at the next char after s;
 * else return 0 (caller should back up).
 */
static int
findstr(TokenSource* ts, Rune* s)
{
	int	first;
	int	len;
	int	resume;
	int	k;
	int	c;

	first = s[0];
	len = runestrlen(s);
	while((c = getchar(ts)) >= 0){
		if(c != first)
			continue;
		if(len == 1)
			return 1;
		/* first char matched: try the rest, remembering where to resume */
		resume = ts->i;
		for(k = 1; k < len; k++){
			c = getchar(ts);
			if(c < 0)
				return 0;
			if(c != s[k])
				break;
		}
		if(k == len)
			return 1;
		backup(ts, resume);
	}
	return 0;
}

/*
 * Value (0-15) of the hexadecimal digit c, in either case,
 * or -1 if c is not a hexadecimal digit.
 */
static int
xdigit(int c)
{
	if(c >= '0' && c <= '9')
		return c - '0';
	if(c >= 'a' && c <= 'f')
		return 10 + (c - 'a');
	if(c >= 'A' && c <= 'F')
		return 10 + (c - 'A');
	return -1;
}

/*
 * We've just read an '&'; look for an entity reference and, if found,
 * return the translated char.
 * Numeric references are "&#" + decimal digits or "&#x"/"&#X" + hex digits;
 * values in Winstart..Winend are mapped through winchars, and values beyond
 * the Unicode range yield U+FFFD.
 * Named references are looked up in chartab.  If there is a complete entity
 * name but it isn't known, try prefixes (gets around some buggy HTML out
 * there), pushing the unused tail of the name back.
 * A terminating ';', '\n' or '\r' is consumed along with the reference.
 * If no entity is recognized -- including when the data ends before the
 * reference is complete -- back up to just past the '&' and return '&'.
 */
static int
ampersand(TokenSource* ts)
{
	enum {
		MaxCode = 0x10FFFF,	/* largest Unicode code point */
		BadCode = 0xFFFD	/* U+FFFD, the replacement character */
	};
	int	savei;
	int	c;
	int	fnd;
	int	ans;
	int	v;
	int	i;
	int	k;
	Rune	buf[SMALLBUFSIZE];

	savei = ts->i;
	c = getchar(ts);
	fnd = 0;
	ans = -1;
	if(c == '#'){
		c = getchar(ts);
		v = 0;
		/*
		 * Keep consuming digits, but stop accumulating once v is out of
		 * range so that an absurdly long number cannot overflow the int.
		 */
		if(c == 'x' || c == 'X'){
			c = getchar(ts);
			while((i=xdigit(c)) != -1){
				if(v <= MaxCode)
					v = v*16 + i;
				c = getchar(ts);
			}
		}else{
			while('0' <= c && c <= '9'){
				if(v <= MaxCode)
					v = v*10 + c - '0';
				c = getchar(ts);
			}
		}
		if(c >= 0){
			if(!(c == ';' || c == '\n' || c == '\r'))
				ungetchar(ts, c);
			c = v;
			if(c > MaxCode)
				c = BadCode;
			else if(c >= Winstart && c <= Winend){
				c = winchars[c - Winstart];
			}
			ans = c;
			fnd = 1;
		}
	}
	else if(c < 256 && isalpha(c)){
		buf[0] = c;
		k = 1;
		for(;;){
			c = getchar(ts);
			if(c < 0)
				break;
			if(ISNAMCHAR(c)){
				/* overlong names are truncated; they will not be found anyway */
				if(k < SMALLBUFSIZE-1)
					buf[k++] = c;
			}
			else {
				if(!(c == ';' || c == '\n' || c == '\r'))
					ungetchar(ts, c);
				break;
			}
		}
		if(c >= 0){
			fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
			if(!fnd){
				/* Try prefixes of s */
				/* the terminator was consumed above; give it back first */
				if(c == ';' || c == '\n' || c == '\r')
					ungetchar(ts, c);
				i = k;
				while(--k > 0){
					fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
					if(fnd){
						/* push back the part of the name not used */
						while(i > k){
							i--;
							ungetchar(ts, buf[i]);
						}
						break;
					}
				}
			}
		}
	}
	if(!fnd){
		backup(ts, savei);
		ans = '&';
	}
	return ans;
}

/*
 * Get next char, obeying ts.chset, and advance ts->i past it.
 *	ISO_8859_1: one byte; codes in Winstart..Winend go through winchars.
 *	US_Ascii: one byte; bytes above 127 are passed through (with a warning).
 *	UTF_8: one encoded rune.
 *	Unicode: two bytes, most-significant byte first.
 * Returns -1 if no complete character left before current end of data;
 * for an incomplete trailing UTF_8 or Unicode character the remaining
 * bytes are marked as used.
 */
static int
getchar(TokenSource* ts)
{
	uchar*	buf;
	int	c;
	int	n;
	Rune	r;

	if(ts->i >= ts->edata)
		return -1;
	buf = ts->data;
	c = buf[ts->i];
	switch(ts->chset){
	case ISO_8859_1:
		if(c >= Winstart && c <= Winend)
			c = winchars[c - Winstart];
		ts->i++;
		break;
	case US_Ascii:
		if(c > 127){
			if(warn)
				fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
		}
		ts->i++;
		break;
	case UTF_8:
		/*
		 * Decode only once fullrune has confirmed that the whole sequence
		 * lies inside data[0:edata]; chartorune on a truncated sequence
		 * would read past the end of the data.
		 */
		if(fullrune((char*)(buf+ts->i), ts->edata-ts->i)){
			n = chartorune(&r, (char*)(buf+ts->i));
			/* NOTE(review): only a leading 0x80 byte is reported; other malformed sequences pass silently */
			if(warn && c == 0x80)
				fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
			ts->i += n;
			c = r;
		}
		else {
			/* not enough bytes in buf to complete utf-8 char */
			ts->i = ts->edata;	/* mark "all used" */
			c = -1;
		}
		break;
	case Unicode:
		if(ts->i < ts->edata - 1){
			/*standards say most-significant byte first */
			c = (c << 8)|(buf[ts->i + 1]);
			ts->i += 2;
		}
		else {
			ts->i = ts->edata;	/* mark "all used" */
			c = -1;
		}
		break;
	}
	return c;
}

/*
 * Assuming c was the last character returned by getchar, step the
 * source back over it, so that the next getchar returns that same
 * character followed by the current 'next character', etc.
 * The step is the encoded width of c: two bytes for Unicode, the UTF-8
 * length of c for UTF_8, one byte otherwise.
 */
static void
ungetchar(TokenSource* ts, int c)
{
	Rune	r;
	char	enc[UTFmax];
	int	width;

	if(ts->chset == Unicode)
		width = 2;
	else if(ts->chset == UTF_8 && c >= 128){
		/* encode c only to learn how many bytes it takes */
		r = c;
		width = runetochar(enc, &r);
	}
	else
		width = 1;
	ts->i -= width;
}

/*
 * Reposition ts at byte index savei, an index saved earlier from ts->i.
 * Traces the move when dbglex is set.
 */
static void
backup(TokenSource* ts, int savei)
{
	if(dbglex != 0)
		fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei);

	ts->i = savei;
}


/*
 * Look for the attribute attid in token t.
 * If it is there, return 1 and put its value (which may be nil) in *pans;
 * else return 0 and put nil in *pans.
 * If xfer is true, ownership of the value string passes to the caller
 * (it is nil'd out in the Attr); otherwise the caller must duplicate the
 * answer if it needs to save it.
 * pans may be nil, in which case this just tests whether the attribute
 * is present.
 */
int
_tokaval(Token* t, int attid, Rune** pans, int xfer)
{
	Attr*	at;

	for(at = t->attr; at != nil; at = at->next){
		if(at->attid != attid)
			continue;
		if(pans != nil)
			*pans = at->value;
		if(xfer)
			at->value = nil;
		return 1;
	}
	if(pans != nil)
		*pans = nil;
	return 0;
}

/*
 * Print verb %T (installed by lexinit): format a Token*.
 * A nil token prints as <null>, a Data token as its quoted text, and
 * any other token as <name attr=value ...> with a leading '/' for an
 * end tag.  When dbglex > 1 the token's start index is printed first.
 * The text is built in a fixed buffer, so long tokens are truncated.
 */
static int
Tconv(Fmt *f)
{
	Token*	tok;
	Attr*	at;
	Rune*	name;
	char*	slash;
	int	len;
	int	tag;
	char	buf[BIGBUFSIZE];

	tok = va_arg(f->args, Token*);
	if(tok == nil){
		sprint(buf, "<null>");
		return fmtstrcpy(f, buf);
	}

	len = 0;
	if(dbglex > 1)
		len = snprint(buf, sizeof(buf), "[%d]", tok->starti);
	tag = tok->tag;
	if(tag == Data)
		len += snprint(buf+len, sizeof(buf)-len-1, "'%S'", tok->text);
	else {
		/* strip the end-tag marker to get back the plain tag number */
		slash = "";
		if(tag >= RBRA){
			slash = "/";
			tag -= RBRA;
		}
		name = (tag == Notfound) ? L(Lquestion) : tagnames[tag];
		len += snprint(buf+len, sizeof(buf)-len-1, "<%s%S", slash, name);
		for(at = tok->attr; at != nil; at = at->next){
			len += snprint(buf+len, sizeof(buf)-len-1, " %S", attrnames[at->attid]);
			if(at->value != nil)
				len += snprint(buf+len, sizeof(buf)-len-1, "=%S", at->value);
		}
		len += snprint(buf+len, sizeof(buf)-len-1, ">");
	}
	buf[len] = 0;
	return fmtstrcpy(f, buf);
}

/*
 * Allocate an Attr for attid with the given value and put it in front
 * of the list link; returns the new head.
 * Attrs own their constituent strings, but build may eventually
 * transfer some values to its items and nil them out in the Attr.
 */
static Attr*
newattr(int attid, Rune* value, Attr* link)
{
	Attr* at;

	at = (Attr*)emalloc(sizeof(Attr));
	at->next = link;
	at->value = value;
	at->attid = attid;
	return at;
}

/*
 * Free a list of Attrs linked through the next field, together with
 * the value strings they still own.  A nil list is fine.
 */
static void
freeattrs(Attr* ahead)
{
	Attr* next;

	for(; ahead != nil; ahead = next){
		/* pick up the link before the node goes away */
		next = ahead->next;
		free(ahead->value);
		free(ahead);
	}
}

/*
 * Free an array of Tokens: the text and attributes of the first n
 * tokens, then the array itself.
 * Allocated space might have room for more than n tokens,
 * but only n of them are initialized.
 * If the caller has transferred ownership of constituent strings
 * or attributes, it must have nil'd out the pointers in the Tokens.
 * A nil tarray is ignored.
 */
void
_freetokens(Token* tarray, int n)
{
	Token* t;

	if(tarray == nil)
		return;
	for(t = tarray; n > 0; n--, t++){
		free(t->text);
		freeattrs(t->attr);
	}
	free(tarray);
}