Import version of libhtml that might actually work with ANSI C.

2004-04-06 19:06:52 +00:00 · 2004-04-06 19:06:52 +00:00 · 7cf289ca89
commit 7cf289ca89
parent 3e3817f7c8
13 changed files with 7206 additions and 0 deletions
--- a/src/cmd/htmlfmt/dat.h
+++ b/src/cmd/htmlfmt/dat.h
@ -0,0 +1,50 @@
 /* Short type names for the structures declared below. */
 typedef struct Bytes Bytes;
 typedef struct URLwin URLwin;
 /*
  * Size constants.  Neither is referenced by the htmlfmt sources visible
  * alongside this header (html.c, main.c, util.c).
  */
 enum
 {
 	STACK		= 8192,
 	EVENTSIZE	= 256,
 };
 /*
  * Growable byte buffer.  growbytes() appends to it, enlarging b as
  * needed and keeping a NUL byte after the last byte in use.
  */
 struct Bytes
 {
 	/* storage; nil in a zeroed Bytes until something is appended */
 	uchar	*b;
 	/* number of bytes in use (the trailing NUL is not counted) */
 	long		n;
 	/* number of bytes allocated for b */
 	long		nalloc;
 };
 /*
  * State for one document being formatted: where it comes from, where
  * the text goes, and the parse results returned by parsehtml().
  */
 struct URLwin
 {
 	/* descriptor the HTML is read from */
 	int		infd;
 	/* descriptor the rendered text is written to */
 	int		outfd;
 	/* media type passed to parsehtml(); loadhtml sets TextHtml */
 	int		type;
 	/* URL of the document, used to resolve relative links */
 	char		*url;
 	/* item list produced by parsehtml() */
 	Item		*items;
 	/* document info produced by parsehtml(); holds the anchor list */
 	Docinfo	*docinfo;
 };
 /* Option state, defined in main.c. */
 extern	char*	url;
 extern	int		aflag;
 extern	int		width;
 extern	int		defcharset;
 /* html.c */
 extern	char*	loadhtml(int);
 /* NOTE(review): readfile has no definition in the htmlfmt sources shown with this header -- confirm it is still needed */
 extern	char*	readfile(char*, char*, int*);
 extern	int	charset(char*);
 /* util.c: allocation and string helpers that exit via error() on failure */
 extern	void*	emalloc(ulong);
 extern	char*	estrdup(char*);
 extern	char*	estrstrdup(char*, char*);
 extern	char*	egrow(char*, char*, char*);
 extern	char*	eappend(char*, char*, char*);
 extern	void		error(char*, ...);
 extern	void		growbytes(Bytes*, char*, long);
 /* html.c: rendering and cleanup */
 extern	void		rendertext(URLwin*, Bytes*);
 extern	void		rerender(URLwin*);
 extern	void		freeurlwin(URLwin*);
 /* presumably lets the Plan 9 compilers check error()'s format string (argument 1) against its arguments -- confirm */
 #pragma	varargck	argpos	error	1
--- a/src/cmd/htmlfmt/html.c
+++ b/src/cmd/htmlfmt/html.c
@ -0,0 +1,331 @@
 #include <u.h>
 #include <libc.h>
 #include <bio.h>
 #include <draw.h>
 #include <regexp.h>
 #include <html.h>
 #include <ctype.h>
 #include "dat.h"
 /* Regular expression matching a scheme://host prefix; compiled lazily by baseurl(). */
 char urlexpr[] = "^(https?|ftp|file|gopher|mailto|news|nntp|telnet|wais|prospero)://([a-zA-Z0-9_@\\-]+([.:][a-zA-Z0-9_@\\-]+)*)";
 /* Compiled form of urlexpr; nil until baseurl() first needs it. */
 Reprog	*urlprog;
 /* Word-wrapping state shared by renderrunes() and emitword(); reset by render(). */
 /* nonzero while renderrunes() is inside a word that has not been emitted yet */
 int inword = 0;
 /* current output column, counted in runes by emitword() */
 int col = 0;
 /* index in the rune string being scanned at which the current word began */
 int wordi = 0;
 char*
 loadhtml(int fd)
 {
 	URLwin *u;
 	Bytes *b;
 	int n;
 	char buf[4096];
 	u = emalloc(sizeof(URLwin));
 	u->infd = fd;
 	u->outfd = 1;
 	u->url = estrdup(url);
 	u->type = TextHtml;
 	b = emalloc(sizeof(Bytes));
 	while((n = read(fd, buf, sizeof buf)) > 0)
 		growbytes(b, buf, n);
 	if(b->b == nil)
 		return nil;	/* empty file */
 	rendertext(u, b);
 	freeurlwin(u);
 	return nil;
 }
 /*
  * Convert the first n runes of r to a freshly allocated string.
  * n == 0 yields an empty string.  Exits via error() if allocation fails.
  */
 char*
 runetobyte(Rune *r, int n)
 {
 	char *out;
 	if(n != 0){
 		out = smprint("%.*S", n, r);
 		if(out == nil)
 			error("malloc failed");
 		return out;
 	}
 	/* emalloc zeroes the byte, giving "" */
 	return emalloc(1);
 }
 /*
  * Report whether c is a closing punctuation character, i.e. one that
  * should not be preceded by a space when words are joined.
  * Returns 1 if so, 0 otherwise.
  */
 int
 closingpunct(int c)
 {
 	/*
 	 * strchr converts its argument to char, so without this guard a
 	 * rune such as 0x22E would match '.', and 0 would match the
 	 * string's terminator.
 	 */
 	if(c <= 0 || c >= 0x80)
 		return 0;
 	return strchr(".,:;'\")]}>!?", c) ? 1 : 0;
 }
 /*
  * Append the word r[0:nr] to b, wrapping to a new line when the word
  * would pass the global width and otherwise separating it from the
  * previous output with one space where appropriate.
  * Updates the global column counter col and clears inword.
  */
 void
 emitword(Bytes *b, Rune *r, int nr)
 {
 	char *s;
 	int space;
 	if(nr == 0)
 		return;
 	/* runetobyte exits via error() rather than returning nil */
 	s = runetobyte(r, nr);
 	/* a space is wanted unless output is empty, already ends in white space, or the word starts with closing punctuation */
 	space = (b->n>0) && !isspace(b->b[b->n-1]) && !closingpunct(r[0]);
 	if(col>0 && col+space+nr > width){
 		growbytes(b, "\n", 1);
 		space = 0;
 		col = 0;
 	}
 	if(space && col>0){
 		growbytes(b, " ", 1);
 		col++;
 	}
 	growbytes(b, s, strlen(s));
 	/* columns are counted in runes, not bytes */
 	col += nr;
 	free(s);
 	inword = 0;
 }
 /*
  * Scan the NUL-terminated rune string r, splitting it into words at
  * spaces and newlines and handing each word to emitword().
  * A newline also resets the column and appends a newline to b, except
  * at the very start of the output or when b already ends in two newlines.
  */
 void
 renderrunes(Bytes *b, Rune *r)
 {
 	int pos, len;
 	Rune c;
 	len = runestrlen(r);
 	for(pos=0; pos<len; pos++){
 		c = r[pos];
 		if(c != '\n' && c != ' '){
 			/* ordinary rune: open a word here unless one is already open */
 			if(!inword)
 				wordi = pos;
 			inword = 1;
 			continue;
 		}
 		/* either separator closes the word in progress */
 		if(inword)
 			emitword(b, r+wordi, pos-wordi);
 		if(c == ' ')
 			continue;
 		col = 0;
 		if(b->n == 0)
 			continue;	/* don't start with blank lines */
 		if(b->n<2 || b->b[b->n-1]!='\n' || b->b[b->n-2]!='\n')
 			growbytes(b, "\n", 1);
 	}
 	/* flush a word that runs to the end of the string */
 	if(inword)
 		emitword(b, r+wordi, pos-wordi);
 }
 /*
  * Format fmt and its arguments into a rune string and render the
  * result into b with renderrunes().
  * Exits via error() if the formatted string cannot be allocated.
  */
 void
 renderbytes(Bytes *b, char *fmt, ...)
 {
 	Rune *r;
 	va_list arg;
 	va_start(arg, fmt);
 	r = runevsmprint(fmt, arg);
 	va_end(arg);
 	/* renderrunes calls runestrlen on r, so nil must not get through */
 	if(r == nil)
 		error("malloc failed");
 	renderrunes(b, r);
 	free(r);
 }
 /*
  * Return a freshly allocated copy of url cut back to its base: the
  * text before the last '/' that follows the scheme://host prefix, or
  * just that prefix if there is no such slash.
  * Returns nil if url is nil or does not match urlexpr.
  */
 char*
 baseurl(char *url)
 {
 	char *copy, *cut;
 	long prefixlen;
 	Resub match[10];
 	if(url == nil)
 		return nil;
 	/* compile the pattern on first use */
 	if(urlprog == nil){
 		urlprog = regcomp(urlexpr);
 		if(urlprog == nil)
 			error("can't compile URL regexp");
 	}
 	memset(match, 0, sizeof match);
 	if(regexec(urlprog, url, match, nelem(match)) == 0)
 		return nil;
 	/* length of the whole match, i.e. of the scheme://host prefix */
 	prefixlen = match[0].e.p-match[0].s.p;
 	copy = estrdup(url);
 	cut = strrchr(copy, '/');
 	/* a slash inside the prefix (the "//") does not count */
 	if(cut==nil || cut<&copy[prefixlen])
 		cut = &copy[prefixlen];
 	*cut = '\0';
 	return copy;
 }
 /*
  * Build a freshly allocated absolute URL for the link rhref found in
  * the document u.  If rhref has no scheme://host prefix of its own it
  * is appended to the base of u->url, with a '/' inserted when neither
  * side supplies one.
  * Returns the string "NULL URL" for a nil rhref.  The caller frees
  * the result.
  */
 char*
 fullurl(URLwin *u, Rune *rhref)
 {
 	char *base, *href, *hrefbase;
 	char *result;
 	if(rhref == nil)
 		return estrdup("NULL URL");
 	href = runetobyte(rhref, runestrlen(rhref));
 	hrefbase = baseurl(href);
 	result = nil;
 	/* only a relative href needs the document's base in front of it */
 	if(hrefbase==nil && (base = baseurl(u->url))!=nil){
 		result = estrdup(base);
 		if(base[strlen(base)-1]!='/' && (href==nil || href[0]!='/'))
 			result = eappend(result, "/", "");
 		free(base);
 	}
 	if(href){
 		if(result)
 			result = eappend(result, "", href);
 		else
 			result = estrdup(href);
 	}
 	free(hrefbase);
 	/* href was only ever copied from, so our conversion can go now */
 	free(href);
 	if(result == nil)
 		return estrdup("***unknown***");
 	return result;
 }
 /*
  * Render the item list items of document u as plain text appended to t.
  * curanchor is the id of the anchor the caller is already inside; when
  * an item belongs to a different anchor and aflag is set, the anchor's
  * URL is emitted in brackets after the item.
  * Table cells and floats are rendered by recursion.  Note that every
  * call, including recursive ones, resets the shared wrapping state
  * (inword, col, wordi).
  * Exits via error() on an item tag it does not know.
  */
 void
 render(URLwin *u, Bytes *t, Item *items, int curanchor)
 {
 	Item *il;
 	Itext *it;
 	Ifloat *ifl;
 	Ispacer *is;
 	Itable *ita;
 	Iimage *im;
 	Anchor *a;
 	Table *tab;
 	Tablecell *cell;
 	char *href;
 	inword = 0;
 	col = 0;
 	wordi = 0;
 	for(il=items; il!=nil; il=il->next){
 		/* each of these state flags contributes one newline before the item */
 		if(il->state & IFbrk)
 			renderbytes(t, "\n");
 		if(il->state & IFbrksp)
 			renderbytes(t, "\n");
 		switch(il->tag){
 		case Itexttag:
 			it = (Itext*)il;
 			renderrunes(t, it->s);
 			break;
 		case Iruletag:
 			/* a rule goes on a line of its own */
 			if(t->n>0 && t->b[t->n-1]!='\n')
 				renderbytes(t, "\n");
 			renderbytes(t, "=======\n");
 			break;
 		case Iimagetag:
 			/* images are only mentioned when -a (or -u) was given */
 			if(!aflag)
 				break;
 			im = (Iimage*)il;
 			if(im->imsrc){
 				href = fullurl(u, im->imsrc);
 				renderbytes(t, "[image %s]", href);
 				free(href);
 			}
 			break;
 		case Iformfieldtag:
 			if(aflag)
 				renderbytes(t, "[formfield]");
 			break;
 		case Itabletag:
 			/* tables are flattened: every cell's content is rendered in list order */
 			ita = (Itable*)il;
 			tab = ita->table;
 			for(cell=tab->cells; cell!=nil; cell=cell->next){
 				render(u, t, cell->content, curanchor);
 			}
 			if(t->n>0 && t->b[t->n-1]!='\n')
 				renderbytes(t, "\n");
 			break;
 		case Ifloattag:
 			ifl = (Ifloat*)il;
 			render(u, t, ifl->item, curanchor);
 			break;
 		case Ispacertag:
 			is = (Ispacer*)il;
 			if(is->spkind != ISPnull)
 				renderbytes(t, " ");
 			break;
 		default:
 			error("unknown item tag %d\n", il->tag);
 		}
 		/* on entering a different anchor, look it up and print its URL once */
 		if(il->anchorid != 0 && il->anchorid!=curanchor){
 			for(a=u->docinfo->anchors; a!=nil; a=a->next)
 				if(aflag && a->index == il->anchorid){
 					href = fullurl(u, a->href);
 					renderbytes(t, "[%s]", href);
 					free(href);
 					break;
 				}
 			curanchor = il->anchorid;
 		}
 	}
 	/* make sure the rendered text ends with a newline */
 	if(t->n>0 && t->b[t->n-1]!='\n')
 		renderbytes(t, "\n");
 }
 /*
  * Render u's parsed items into a temporary buffer and write the text,
  * if there is any, to u->outfd.
  */
 void
 rerender(URLwin *u)
 {
 	Bytes *text;
 	text = emalloc(sizeof(Bytes));
 	render(u, text, u->items, 0);
 	if(text->n != 0)
 		write(u->outfd, (char*)text->b, text->n);
 	free(text->b);
 	free(text);
 }
 /*
  * Guess the character set of the HTML text s.
  * Somewhat of a hack: this is not a full parse.  It finds the first
  * "<meta" (any case) and then the first "charset=" after it.
  * Returns UTF_8 if the value named there is utf-8 or utf8 (any case,
  * optionally quoted); otherwise returns defcharset, which is first
  * set to ISO_8859_1 if it was still 0.
  */
 int
 charset(char *s)
 {
 	char *meta, *charset;
 	if(defcharset == 0)
 		defcharset = ISO_8859_1;
 	meta = cistrstr(s, "<meta");
 	if(meta == nil)
 		return defcharset;
 	/* a charset= appearing before any <meta tag is not a declaration */
 	charset = cistrstr(meta, "charset=");
 	if(charset == nil)
 		return defcharset;
 	charset += 8;
 	if(*charset == '"' || *charset == '\'')
 		charset++;
 	/* cistrncmp returns 0 when the strings match */
 	if(cistrncmp(charset, "utf-8", 5) == 0 || cistrncmp(charset, "utf8", 4) == 0)
 		return UTF_8;
 	return defcharset;
 }
 /*
  * Parse the HTML held in b as document u and write its text rendering.
  * The parse results are stored in u->items and u->docinfo.
  */
 void
 rendertext(URLwin *u, Bytes *b)
 {
 	Rune *rurl;
 	int chset;
 	rurl = toStr((uchar*)u->url, strlen(u->url), ISO_8859_1);
 	chset = charset((char*)b->b);
 	u->items = parsehtml(b->b, b->n, rurl, u->type, chset, &u->docinfo);
 	/* rurl is intentionally left allocated here, as in the original (its free was commented out) */
 	rerender(u);
 }
 /*
  * Release a URLwin and everything it owns: the parsed items, the
  * document info, the URL string and the structure itself.
  * u must not be used afterwards.
  */
 void
 freeurlwin(URLwin *u)
 {
 	freeitems(u->items);
 	u->items = nil;
 	freedocinfo(u->docinfo);
 	u->docinfo = nil;
 	/* loadhtml stores an estrdup'd copy in url, so it is ours to free */
 	free(u->url);
 	u->url = nil;
 	free(u);
 }
--- a/src/cmd/htmlfmt/main.c
+++ b/src/cmd/htmlfmt/main.c
@ -0,0 +1,71 @@
 #include <u.h>
 #include <libc.h>
 #include <bio.h>
 #include <draw.h>
 #include <html.h>
 #include "dat.h"
 /* Document URL used to resolve relative links; set by -u. */
 char *url = "";
 /* Nonzero when link and image URLs should be shown; set by -a and by -u. */
 int aflag;
 /* Maximum output line width in runes; set by -l or -w. */
 int width = 70;
 /* Character set assumed when the document does not declare one; set by -c. */
 int defcharset;
 /* Print the command synopsis on descriptor 2 and exit with status "usage". */
 void
 usage(void)
 {
 	static char synopsis[] = "usage: htmlfmt [-c charset] [-u URL] [-a] [-l length] [file ...]\n";
 	fprint(2, "%s", synopsis);
 	exits("usage");
 }
 /*
  * Parse the options, then format each named file in turn, or standard
  * input if none is named.  Processing stops at the first file that
  * cannot be opened or loaded; the failure is reported on descriptor 2
  * and becomes the exit status.
  */
 void
 main(int argc, char *argv[])
 {
 	int i, fd;
 	char *p, *err, *file;
 	char errbuf[ERRMAX];
 	ARGBEGIN{
 	case 'u':
 		/* giving a URL implies showing links */
 		url = EARGF(usage());
 		aflag++;
 		break;
 	case 'a':
 		aflag++;
 		break;
 	case 'l': case 'w':
 		p = EARGF(usage());
 		width = atoi(p);
 		if(width <= 0)
 			usage();
 		break;
 	case 'c':
 		/* run the name through charset() by dressing it up as a meta tag */
 		p = smprint("<meta charset=\"%s\">", EARGF(usage()));
 		defcharset = charset(p);
 		free(p);
 		break;
 	default:
 		usage();
 	}ARGEND
 	err = nil;
 	file = "<stdin>";
 	if(argc == 0)
 		err = loadhtml(0);
 	/* with no file arguments this loop does not run at all */
 	for(i=0; i<argc && err==nil; i++){
 		file = argv[i];
 		fd = open(file, OREAD);
 		if(fd < 0){
 			errstr(errbuf, sizeof errbuf);
 			err = errbuf;
 		}else{
 			err = loadhtml(fd);
 			close(fd);
 		}
 	}
 	if(err)
 		fprint(2, "htmlfmt: processing %s: %s\n", file, err);
 	exits(err);
 }
--- a/src/cmd/htmlfmt/mkfile
+++ b/src/cmd/htmlfmt/mkfile
@ -0,0 +1,30 @@
 # mkfile for the htmlfmt command: builds the single target htmlfmt from
 # main, html and util, linked against libbio, libregexp, libhtml and lib9c.
 <$SYS9/$systype/$objtype/mkfile
 TARG=htmlfmt
 OFILES=\
 	main.$O\
 	html.$O\
 	util.$O\
 HFILES=\
 	dat.h\
 	$SYS9/sys/include/html.h\
 LIB=$SYS9/$systype/$objtype/lib/libbio.a\
 	$SYS9/$systype/$objtype/lib/libregexp.a\
 	$SYS9/$systype/$objtype/lib/libhtml.a\
 	$SYS9/$systype/$objtype/lib/lib9c.a
 # directory the built command is installed into
 BIN=$SYS9/$systype/$objtype/bin
 UPDATE=\
 	mkfile\
 	$HFILES\
 	${OFILES:%.$O=%.c}
 # generic rules for building one command
 <$SYS9/sys/src/cmd/mkone
 CFLAGS=$CFLAGS
 # explicit link rule, currently disabled
 #$O.out: $OFILES
 #	$LD -o $target  $LDFLAGS $OFILES
--- a/src/cmd/htmlfmt/util.c
+++ b/src/cmd/htmlfmt/util.c
@ -0,0 +1,120 @@
 #include <u.h>
 #include <libc.h>
 #include <bio.h>
 #include <draw.h>
 #include <html.h>
 #include "dat.h"
 /*
  * Allocate n bytes of zeroed memory.
  * Never returns nil: exits via error() if malloc fails.
  */
 void*
 emalloc(ulong n)
 {
 	void *mem;
 	if((mem = malloc(n)) == nil)
 		error("can't malloc: %r");
 	/* memset returns its first argument */
 	return memset(mem, 0, n);
 }
 /*
  * Resize the allocation p to n bytes; newly added bytes are not zeroed.
  * Exits via error() if realloc fails.  A nil result for n == 0 is
  * passed back to the caller rather than treated as a failure.
  */
 void*
 erealloc(void *p, ulong n)
 {
 	p = realloc(p, n);
 	if(p == nil && n != 0)
 		error("can't realloc: %r");
 	return p;
 }
 char*
 estrdup(char *s)
 {
 	char *t;
 	t = emalloc(strlen(s)+1);
 	strcpy(t, s);
 	return t;
 }
 /*
  * Return a freshly allocated string holding s followed by t.
  * Neither argument is freed.  Exits via error() if allocation fails.
  */
 char*
 estrstrdup(char *s, char *t)
 {
 	long slen, tlen;
 	char *joined;
 	slen = strlen(s);
 	tlen = strlen(t);
 	/* plain malloc: every byte is about to be overwritten, so zeroing would be wasted */
 	joined = malloc(slen+tlen+1);
 	if(joined == nil)
 		error("can't malloc: %r");
 	memmove(joined, s, slen);
 	/* tlen+1 brings t's terminating NUL along */
 	memmove(joined+slen, t, tlen+1);
 	return joined;
 }
 /*
  * Return a freshly allocated string holding s, then sep, then t
  * (t may be nil, in which case it contributes nothing).
  * s is freed; sep and t are not.  Exits via error() if allocation fails.
  */
 char*
 eappend(char *s, char *sep, char *t)
 {
 	long slen, seplen, tlen;
 	char *joined, *w;
 	if(t == nil){
 		joined = estrstrdup(s, sep);
 		free(s);
 		return joined;
 	}
 	slen = strlen(s);
 	seplen = strlen(sep);
 	tlen = strlen(t);
 	/* plain malloc: every byte is written below */
 	joined = malloc(slen+seplen+tlen+1);
 	if(joined == nil)
 		error("can't malloc: %r");
 	w = joined;
 	memmove(w, s, slen);
 	w += slen;
 	memmove(w, sep, seplen);
 	w += seplen;
 	memmove(w, t, tlen);
 	w[tlen] = '\0';
 	free(s);
 	return joined;
 }
 /*
  * Like eappend(), but takes ownership of t as well: both s and t are
  * freed and the joined string is returned.
  */
 char*
 egrow(char *s, char *sep, char *t)
 {
 	char *joined;
 	joined = eappend(s, sep, t);
 	free(t);
 	return joined;
 }
 /*
  * Print a formatted diagnostic, prefixed with the program name and
  * followed by a newline, on descriptor 2 and exit.
  * The exit status is the format string itself, not the formatted text.
  * Does not return.
  */
 void
 error(char *fmt, ...)
 {
 	va_list arg;
 	char buf[256];
 	Fmt f;
 	fmtfdinit(&f, 2, buf, sizeof buf);
 	/* this program is htmlfmt; the old "Mail: " prefix named a different program */
 	fmtprint(&f, "htmlfmt: ");
 	va_start(arg, fmt);
 	fmtvprint(&f, fmt, arg);
 	va_end(arg);
 	fmtprint(&f, "\n");
 	fmtfdflush(&f);
 	exits(fmt);
 }
 /*
  * Append the ns bytes at s to b, enlarging the buffer when needed, and
  * keep a NUL byte after the data so b->b can be used as a C string.
  * Exits via error() if the buffer cannot be enlarged.
  */
 void
 growbytes(Bytes *b, char *s, long ns)
 {
 	long need;
 	/* room for the existing data, the new data and the NUL */
 	need = b->n + ns + 1;
 	if(need > b->nalloc){
 		/* over-allocate so that small appends rarely reallocate */
 		b->nalloc = b->n + ns + 8000;
 		b->b = realloc(b->b, b->nalloc);
 		if(b->b == nil)
 			error("growbytes: can't realloc: %r");
 	}
 	memmove(b->b+b->n, s, ns);
 	b->n += ns;
 	b->b[b->n] = '\0';
 }
--- a/src/libhtml/build.c
+++ b/src/libhtml/build.c
--- a/src/libhtml/impl.h
+++ b/src/libhtml/impl.h
@ -0,0 +1,163 @@
 // UTILS
 typedef struct List List;
 typedef struct Strlist Strlist;
 // List of integers (and also generic list with next pointer at beginning)
 // _listlen and _revlist touch only the next field.
 struct List
 {
 	List*	next;	// following element, nil at the end of the list
 	int	val;	// this element's value
 };
 // List of Rune strings; laid out like List, with the next pointer first.
 struct Strlist
 {
 	Strlist*	next;	// following element, nil at the end of the list
 	Rune*	val;	// this element's string
 };
 // List, character-class and Rune-string helpers, defined in utils.c.
 // In the string routines a nil Rune* is generally treated as an empty string
 // (see _Strlen, _Strcmp, _Strdup).
 extern int		_inclass(Rune c, Rune* cl);
 extern int		_listlen(List* l);
 extern Rune*	_ltoStr(int n);
 extern List*	_newlist(int val, List* rest);
 extern Rune*	_newstr(int n);
 extern int		_prefix(Rune* pre, Rune* s);
 extern List*	_revlist(List* l);
 extern void	_splitl(Rune* s, int n, Rune* cl, Rune** p1, int* n1, Rune** p2, int* n2);
 extern void	_splitr(Rune* s, int n, Rune* cl, Rune** p1, int* n1, Rune** p2, int* n2);
 extern int		_splitall(Rune* s, int n, Rune* cl, Rune** strarr, int* lenarr, int alen);
 extern Rune*	_Stradd(Rune*s1, Rune* s2, int n);
 extern Rune*	_Strclass(Rune* s, Rune* cl);
 extern int		_Strcmp(Rune* s1, Rune* s2);
 extern Rune*	_Strdup(Rune* s);
 extern Rune*	_Strdup2(Rune* s, Rune* t);
 extern int		_Streqn(Rune* s1, int n1, Rune* s2);
 extern int		_Strlen(Rune* s);
 extern Rune*	_Strnclass(Rune* s, Rune* cl, int n);
 extern int		_Strncmpci(Rune* s1, int n1, Rune* s2);
 extern Rune*	_Strndup(Rune* s, int n);
 extern Rune*	_Strnrclass(Rune* s, Rune* cl, int n);
 extern Rune*	_Strrclass(Rune* s, Rune* cl);
 extern Rune*	_Strsubstr(Rune* s, int start, int stop);
 extern long	_Strtol(Rune* s, Rune** eptr, int base);
 extern void	_trimwhite(Rune* s, int n, Rune** pans, int* panslen);
 // Character classes for the routines above: "not white space" and "white space".
 extern Rune	notwhitespace[];
 extern Rune	whitespace[];
 // STRINTTAB
 typedef struct StringInt StringInt;
 // Element of String-Int table (used for keyword lookup)
 struct StringInt
 {
 	Rune*	key;	// keyword; _lookup compares it against lower-cased input
 	int	val;	// value associated with key
 };
 // Table routines, defined in strinttab.c.  _lookup does a binary search,
 // so the table it is given must be sorted by key.
 extern int			_lookup(StringInt* t, int n, Rune* key, int keylen, int* pans);
 extern StringInt*	_makestrinttab(Rune** a, int n);
 extern Rune*		_revlookup(StringInt* t, int n, int val);
 // Colors, in html format, not Plan 9 format.  (RGB values in bottom 3 bytes)
 // That is, 0xRRGGBB.
 enum {
 	White = 0xFFFFFF,
 	Black = 0x000000,
 	Blue = 0x0000CC,
 };
 // LEX
 // HTML 4.0 tags (plus blink, nobr)
 // sorted in lexical order; used as array indices
 // Numtags counts the entries before it.  RBRA has the same value as
 // Numtags, and Data is Numtags+RBRA, i.e. twice Numtags.
 // NOTE(review): presumably an end tag is encoded as its start tag
 // value plus RBRA, which keeps Data clear of both ranges -- confirm in lex.c.
 enum {
 	Notfound,
 	Comment,
 	Ta, Tabbr, Tacronym, Taddress, Tapplet, Tarea,
 	Tb, Tbase, Tbasefont, Tbdo, Tbig, Tblink,
 	Tblockquote, Tbody, Tbq, Tbr, Tbutton,
 	Tcaption, Tcenter, Tcite, Tcode, Tcol, Tcolgroup,
 	Tdd, Tdel, Tdfn, Tdir, Tdiv, Tdl, Tdt,
 	Tem,
 	Tfieldset, Tfont, Tform, Tframe, Tframeset,
 	Th1, Th2, Th3, Th4, Th5, Th6,
 	Thead, Thr, Thtml,
 	Ti, Tiframe, Timg, Tinput, Tins, Tisindex,
 	Tkbd,
 	Tlabel, Tlegend, Tli, Tlink,
 	Tmap, Tmenu, Tmeta,
 	Tnobr, Tnoframes, Tnoscript,
 	Tobject, Tol, Toptgroup, Toption,
 	Tp, Tparam, Tpre,
 	Tq,
 	Ts, Tsamp, Tscript, Tselect, Tsmall,
 	Tspan, Tstrike, Tstrong, Tstyle, Tsub, Tsup,
 	Ttable, Ttbody, Ttd, Ttextarea, Ttfoot,
 	Tth, Tthead, Ttitle, Ttr, Ttt,
 	Tu, Tul,
 	Tvar,
 	Numtags,
 	RBRA = Numtags,
 	Data = Numtags+RBRA
 };
 // HTML 4.0 tag attributes
 // Keep sorted in lexical order
 // Numattrs counts the entries before it.  Attr.attid holds one of these values.
 enum {
 	Aabbr, Aaccept_charset, Aaccess_key, Aaction,
 	Aalign, Aalink, Aalt, Aarchive, Aaxis,
 	Abackground, Abgcolor, Aborder,
 	Acellpadding, Acellspacing, Achar, Acharoff,
 	Acharset, Achecked, Acite, Aclass, Aclassid,
 	Aclear, Acode, Acodebase, Acodetype, Acolor,
 	Acols, Acolspan, Acompact, Acontent, Acoords,
 	Adata, Adatetime, Adeclare, Adefer, Adir, Adisabled,
 	Aenctype,
 	Aface, Afor, Aframe, Aframeborder,
 	Aheaders, Aheight, Ahref, Ahreflang, Ahspace, Ahttp_equiv,
 	Aid, Aismap,
 	Alabel, Alang, Alink, Alongdesc,
 	Amarginheight, Amarginwidth, Amaxlength,
 	Amedia, Amethod, Amultiple,
 	Aname, Anohref, Anoresize, Anoshade, Anowrap,
 	Aobject, Aonblur, Aonchange, Aonclick, Aondblclick,
 	Aonfocus, Aonkeypress, Aonkeyup, Aonload,
 	Aonmousedown, Aonmousemove, Aonmouseout,
 	Aonmouseover, Aonmouseup, Aonreset, Aonselect,
 	Aonsubmit, Aonunload,
 	Aprofile, Aprompt,
 	Areadonly, Arel, Arev, Arows, Arowspan, Arules,
 	Ascheme, Ascope, Ascrolling, Aselected, Ashape,
 	Asize, Aspan, Asrc, Astandby, Astart, Astyle, Asummary,
 	Atabindex, Atarget, Atext, Atitle, Atype,
 	Ausemap,
 	Avalign, Avalue, Avaluetype, Aversion, Avlink, Avspace,
 	Awidth,
 	Numattrs
 };
 // One attribute of a tag token; a token's attributes form a linked list.
 struct Attr
 {
 	Attr*		next;		// in list of attrs for a token
 	int		attid;		// Aabbr, etc.
 	Rune*	value;		// the attribute's value text
 };
 // One lexical token of the HTML source, as returned in the array built by _gettoks.
 struct Token
 {
 	int		tag;		// Ta, etc
 	Rune*	text;		// text in Data, attribute text in tag
 	Attr*		attr;		// list of Attrs
 	int		starti;	// index into source buffer of token start
 };
 // Lexer interface.
 // NOTE(review): tagnames and attrnames are presumably indexed by the tag
 // and attribute enums above -- confirm against lex.c.
 extern Rune**	tagnames;
 extern Rune**	attrnames;
 extern void	_freetokens(Token* tarray, int n);
 extern Token*	_gettoks(uchar* data, int datalen, int chset, int mtype, int* plen);
 extern int		_tokaval(Token* t, int attid, Rune** pans, int xfer);
 // Registers a "T" print verb taking a Token* with the Plan 9 format checker.
 #pragma varargck	type "T"	Token*
 // runetab.h uses StringInt, so it has to be included after the declarations above.
 #include "runetab.h"
--- a/src/libhtml/lex.c
+++ b/src/libhtml/lex.c
--- a/src/libhtml/mkfile
+++ b/src/libhtml/mkfile
@ -0,0 +1,22 @@
 # mkfile for libhtml: builds the library archive libhtml.a from build,
 # lex, strinttab, utils and runetab.
 <$SYS9/$systype/$objtype/mkfile
 LIB=$LIB9/libhtml.a
 OFILES=\
 	build.$O\
 	lex.$O\
 	strinttab.$O\
 	utils.$O\
 	runetab.$O\
 HFILES=\
 	$SYS9/sys/include/html.h\
 	impl.h\
 UPDATE=\
 	mkfile\
 	$HFILES\
 	${OFILES:%.$O=%.c}\
 	${LIB:$SYS9/$systype/$objtype/%=$SYS9/$systype/386/%}\
 <$SYS9/sys/src/cmd/mksyslib
--- a/src/libhtml/runetab.c
+++ b/src/libhtml/runetab.c
@ -0,0 +1,83 @@
 #include <u.h>
 #include <libc.h>
 #include <draw.h>
 #include <html.h>
 #include "impl.h"
 /* Rune-string versions of the constants below; filled in by runetabinit(). */
 Rune **runeconsttab;
 /*
  * Constant strings used by the library, in byte form.
  * The order must stay in step with the L... enum in runetab.h, whose
  * values index runeconsttab through the L() macro.
  */
 char *_runeconsttab[] = {
 	"        ",
 	" ",
 	"",
 	"#",
 	"+",
 	", ",
 	"-",
 	"-->",
 	"1",
 	"<",
 	">",
 	"?",
 	"Index search terms:",
 	"Reset",
 	"Submit",
 	"^0-9",
 	"_ISINDEX_",
 	"_blank",
 	"_fr",
 	"_no_name_submit_",
 	"_parent",
 	"_self",
 	"_top",
 	"application/x-www-form-urlencoded",
 	"circle",
 	"cm",
 	"content-script-type",
 	"disc",
 	"em",
 	"in",
 	"javascript",
 	"jscript",
 	"jscript1.1",
 	"mm",
 	"none",
 	"pi",
 	"pt",
 	"refresh",
 	"select",
 	"square",
 	"textarea",
 };
 Rune**
 cvtstringtab(char **tab, int n)
 {
 	int i;
 	Rune **rtab;
 	rtab = emalloc(n*sizeof(rtab[0]));
 	for(i=0; i<n; i++)
 		rtab[i] = toStr(tab[i], strlen(tab[i]), US_Ascii);
 	return rtab;
 }
 /*
  * Convert the n entries of the byte-string table tab into a StringInt
  * table with Rune-string keys and the same values.
  * Returns a newly allocated array of n entries, in the same order as tab.
  */
 StringInt*
 cvtstringinttab(AsciiInt *tab, int n)
 {
 	int i;
 	StringInt *stab;
 	stab = emalloc(n*sizeof(stab[0]));
 	for(i=0; i<n; i++){
 		/* toStr takes uchar*, so the char* key needs an explicit cast */
 		stab[i].key = toStr((uchar*)tab[i].key, strlen(tab[i].key), US_Ascii);
 		stab[i].val = tab[i].val;
 	}
 	return stab;
 }
 void
 runetabinit(void)
 {
 	runeconsttab = cvtstringtab(_runeconsttab, nelem(_runeconsttab));
 	return;
 }
--- a/src/libhtml/runetab.h
+++ b/src/libhtml/runetab.h
@ -0,0 +1,59 @@
 /*
  * Byte-string counterpart of StringInt; cvtstringinttab() converts a
  * table of these into a StringInt table.
  */
 typedef struct AsciiInt AsciiInt;
 struct AsciiInt {
 	char*	key;	/* keyword as a byte string */
 	int	val;	/* value associated with key */
 };
 /*
  * Indices into runeconsttab, for use with the L() macro.
  * The order here must stay in step with the strings of _runeconsttab
  * in runetab.c.
  */
 enum {
 	Ltab2space,
 	Lspace,
 	Lempty,
 	Lhash,
 	Lplus,
 	Lcommaspace,
 	Lminus,
 	Larrow,
 	Lone,
 	Llt,
 	Lgt,
 	Lquestion,
 	Lindex,
 	Lreset,
 	Lsubmit,
 	Lnot0to9,
 	Lisindex,
 	L_blank,
 	Lfr,
 	Lnoname,
 	L_parent,
 	L_self,
 	L_top,
 	Lappl_form,
 	Lcircle,
 	Lcm,
 	Lcontent,
 	Ldisc,
 	Lem,
 	Lin,
 	Ljavascript,
 	Ljscript,
 	Ljscript1,
 	Lmm,
 	Lnone,
 	Lpi,
 	Lpt,
 	Lrefresh,
 	Lselect,
 	Lsquare,
 	Ltextarea,
 };
 /* L(Lfoo) is the Rune string for constant Lfoo; valid only after runetabinit() has run. */
 #define L(x)	runeconsttab[(x)]
 extern	Rune	**runeconsttab;
 /* XXX: for unix port only */
 /* Converters from byte-string tables to Rune-string tables, and the table initializer (runetab.c). */
 Rune		**cvtstringtab(char**, int);
 StringInt	*cvtstringinttab(AsciiInt*, int);
 void		runetabinit(void);
--- a/src/libhtml/strinttab.c
+++ b/src/libhtml/strinttab.c
@ -0,0 +1,64 @@
 #include <u.h>
 #include <libc.h>
 #include <draw.h>
 #include <html.h>
 #include "impl.h"
 // Binary-search t[0:n] for key[0:keylen], ignoring the case of key.
 // t must be sorted in increasing lexicographic order of its keys.
 // On a hit, store the entry's val in *pans and return 1;
 // otherwise return 0 and leave *pans alone.
 int
 _lookup(StringInt* t, int n, Rune* key, int keylen, int* pans)
 {
 	int	lo;
 	int	hi;
 	int	mid;
 	int	c;
 	for(lo = 0, hi = n - 1; lo <= hi; ) {
 		mid = (lo + hi)/2;
 		c = _Strncmpci(key, keylen, t[mid].key);
 		if(c == 0) {
 			*pans = t[mid].val;
 			return 1;
 		}
 		// discard the half that cannot contain key
 		if(c > 0)
 			lo = mid + 1;
 		else
 			hi = mid - 1;
 	}
 	return 0;
 }
 // Linear search of t[0:n] for an entry whose val equals val.
 // Returns the key of the first such entry, nil if there is none.
 Rune*
 _revlookup(StringInt* t, int n, int val)
 {
 	StringInt*	e;
 	StringInt*	end;
 	end = t + n;
 	for(e = t; e < end; e++) {
 		if(e->val == val)
 			return e->key;
 	}
 	return nil;
 }
 // Build a newly allocated StringInt table from a[0:n] in which each
 // string maps to its own index.  The keys point at the strings of a;
 // they are not copied.
 // Asserts that a is in nondecreasing runestrcmp order.
 StringInt*
 _makestrinttab(Rune** a, int n)
 {
 	StringInt*	tab;
 	int	k;
 	tab = (StringInt*)emalloc(n * sizeof(StringInt));
 	for(k = 0; k < n; k++) {
 		assert(k == 0 || runestrcmp(a[k], a[k - 1]) >= 0);
 		tab[k].key = a[k];
 		tab[k].val = k;
 	}
 	return tab;
 }
--- a/src/libhtml/utils.c
+++ b/src/libhtml/utils.c
@ -0,0 +1,591 @@
 #include <u.h>
 #include <libc.h>
 #include <draw.h>
 #include <html.h>
 #include "impl.h"
 // Character classes, in the form taken by _inclass: space, tab,
 // newline and carriage return, and the negation of that set
 // (a leading '^' negates a class).
 Rune whitespace[] = { ' ', '\t', '\n', '\r', '\0' };
 Rune notwhitespace[] = { '^', ' ', '\t', '\n', '\r' , '\0'};
 // All lists start out like List structure.
 // List itself can be used as list of int.
 // _listlen returns the number of elements in l (0 for nil).
 int
 _listlen(List* l)
 {
 	int count;
 	for(count = 0; l != nil; l = l->next)
 		count++;
 	return count;
 }
 // Cons: return a newly allocated element holding val, with rest as its tail.
 List*
 _newlist(int val, List* rest)
 {
 	List* cell;
 	cell = (List*)emalloc(sizeof(List));
 	cell->next = rest;
 	cell->val = val;
 	return cell;
 }
 // Reverse a list in place by relinking its elements.
 // Returns the new head (the old last element), nil for an empty list.
 List*
 _revlist(List* l)
 {
 	List* done;
 	List* rest;
 	// move elements one at a time from the front of l to the front of done
 	for(done = nil; l != nil; l = rest) {
 		rest = l->next;
 		l->next = done;
 		done = l;
 	}
 	return done;
 }
 // The next few routines take a "character class" as argument.
 //    e.g., "a-zA-Z", or "^ \t\n"
 // (ranges are indicated by -, except in first position;
 //  ^ in first position means "not in" the following class)
 // Splitl splits s[0:n] just before the first character in class cl.
 // The part before the split is returned in (*p1, *n1), the rest in (*p2, *n2).
 // If nothing in s is in cl, the whole string goes in the first part
 // and the second is (nil, 0).
 // Note: the answers point into s; nothing is copied.
 void
 _splitl(Rune* s, int n, Rune* cl, Rune** p1, int* n1, Rune** p2, int* n2)
 {
 	Rune* hit;
 	hit = _Strnclass(s, cl, n);
 	*p1 = s;
 	if(hit != nil) {
 		*p2 = hit;
 		*n1 = hit-s;
 		*n2 = n-*n1;
 		return;
 	}
 	*n1 = n;
 	*p2 = nil;
 	*n2 = 0;
 }
 // Splitr splits s[0:n] just after the last character in class cl.
 // The part up to and including that character is returned in (*p1, *n1),
 // the rest in (*p2, *n2).
 // If nothing in s is in cl, the first part is (nil, 0) and the whole
 // string goes in the second.
 // Note: the answers point into s; nothing is copied.
 void
 _splitr(Rune* s, int n, Rune* cl, Rune** p1, int* n1, Rune** p2, int* n2)
 {
 	Rune* last;
 	last = _Strnrclass(s, cl, n);
 	if(last != nil) {
 		*p1 = s;
 		*p2 = last+1;
 		*n1 = *p2-s;
 		*n2 = n-*n1;
 		return;
 	}
 	*p1 = nil;
 	*n1 = 0;
 	*p2 = s;
 	*n2 = n;
 }
 // Splitall cuts s[0:n] into the parts that lie between characters of class cl.
 // Every part has nonzero length.
 // At most alen parts are recorded: a pointer to the start of part i is
 // stored in strarr[i] and its length in lenarr[i].
 // Returns the number of parts recorded (0 if s is nil or n is 0).
 int
 _splitall(Rune* s, int n, Rune* cl, Rune** strarr, int* lenarr, int alen)
 {
 	int count;
 	Rune* cur;
 	Rune* stop;
 	Rune* end;
 	if(s == nil || n == 0)
 		return 0;
 	end = s+n;
 	count = 0;
 	for(cur = s; cur < end && count < alen; cur = stop) {
 		// step over the separators in front of the next part
 		while(cur < end && _inclass(*cur, cl))
 			cur++;
 		if(cur == end)
 			break;
 		// the part runs up to the next separator, or to the end
 		stop = _Strnclass(cur, cl, end-cur);
 		if(stop == nil)
 			stop = end;
 		assert(stop > cur && stop <= end);
 		strarr[count] = cur;
 		lenarr[count] = stop-cur;
 		count++;
 	}
 	return count;
 }
 // Find the part of s[0:n] that excludes leading and trailing whitespace,
 // and return it in *pans with its length in *panslen.
 // The answer points into s; nothing is copied.
 // If s[0:n] is all whitespace the answer is (nil, 0).
 // If n <= 0 the answer is (nil, n).
 void
 _trimwhite(Rune* s, int n, Rune** pans, int* panslen)
 {
 	Rune* p;
 	Rune* q;
 	p = nil;
 	if(n > 0) {
 		p = _Strnclass(s, notwhitespace, n);
 		if(p != nil) {
 			q = _Strnrclass(s, notwhitespace, n);
 			assert(q != nil);
 			n = q+1-p;
 		}
 		else {
 			// nothing but whitespace: never pair a nil pointer
 			// with a nonzero length
 			n = 0;
 		}
 	}
 	*pans = p;
 	*panslen = n;
 }
 // _Strclass returns a pointer to the first element of the
 // null-terminated string s that is a member of class cl, nil if none.
 Rune*
 _Strclass(Rune* s, Rune* cl)
 {
 	Rune* p;
 	p = s;
 	while(*p != 0) {
 		if(_inclass(*p, cl))
 			return p;
 		p++;
 	}
 	return nil;
 }
 // _Strnclass returns a pointer to the first element of s[0:n] that is
 // a member of class cl, nil if none.
 // The scan also stops at a null rune, should one come before s[n].
 Rune*
 _Strnclass(Rune* s, Rune* cl, int n)
 {
 	int i;
 	for(i = 0; i != n && s[i] != 0; i++)
 		if(_inclass(s[i], cl))
 			return s+i;
 	return nil;
 }
 // _Strrclass returns a pointer to the last element of the
 // null-terminated string s that is a member of class cl.
 // Returns nil if there is none, or if s is nil or empty.
 Rune*
 _Strrclass(Rune* s, Rune* cl)
 {
 	Rune* p;
 	if(s == nil || *s == 0)
 		return nil;
 	// walk backwards from the last rune
 	for(p = s + runestrlen(s) - 1; p >= s; p--)
 		if(_inclass(*p, cl))
 			return p;
 	return nil;
 }
 // _Strnrclass returns a pointer to the last element of s[0:n] that is
 // a member of class cl.
 // Returns nil if there is none, or if s is nil, starts with a null
 // rune, or n is 0.
 Rune*
 _Strnrclass(Rune* s, Rune* cl, int n)
 {
 	Rune* p;
 	if(s == nil || *s == 0 || n == 0)
 		return nil;
 	// walk backwards from s[n-1]
 	for(p = s + n - 1; p >= s; p--)
 		if(_inclass(*p, cl))
 			return p;
 	return nil;
 }
 // Is c in the class cl?
 // cl lists characters and ranges (lo-hi); a leading '^' negates the
 // class.  A '-' that is first or last in the class (after any '^')
 // stands for itself.  An empty or nil class contains nothing.
 // Returns 1 or 0.
 int
 _inclass(Rune c, Rune* cl)
 {
 	int	len;
 	int	k;
 	int	negated;
 	int	found;
 	len = _Strlen(cl);
 	if(len == 0)
 		return 0;
 	negated = 0;
 	if(cl[0] == '^') {
 		negated = 1;
 		cl++;
 		len--;
 	}
 	found = 0;
 	for(k = 0; k < len && !found; k++) {
 		if(cl[k] == '-' && k > 0 && k < len - 1) {
 			if(cl[k - 1] <= c && c <= cl[k + 1])
 				found = 1;
 			else
 				k++;	// step over the range's upper bound
 		}
 		else if(c == cl[k])
 			found = 1;
 	}
 	if(negated)
 		return !found;
 	return found;
 }
 // Is pre a prefix of s?  Returns 1 if so, 0 if not.
 // A nil or empty pre is a prefix of everything.
 int
 _prefix(Rune* pre, Rune* s)
 {
 	int	plen;
 	int	i;
 	plen = _Strlen(pre);
 	// s has to be at least as long as pre
 	if(_Strlen(s) < plen)
 		return 0;
 	for(i = 0; i < plen; i++)
 		if(s[i] != pre[i])
 			return 0;
 	return 1;
 }
 // Number of runes in (null-terminated) s; a nil s counts as empty.
 int
 _Strlen(Rune* s)
 {
 	return (s == nil) ? 0 : runestrlen(s);
 }
 // Compare s1 and s2 lexicographically, treating nil as the empty string.
 // Returns a negative, zero or positive value as s1 is less than,
 // equal to or greater than s2 (runestrcmp's result when neither is nil,
 // otherwise -1, 0 or 1).
 int
 _Strcmp(Rune *s1, Rune *s2)
 {
 	if(s1 != nil && s2 != nil)
 		return runestrcmp(s1, s2);
 	if(s1 == nil)
 		return (s2 == nil || *s2 == 0) ? 0 : -1;
 	return (*s1 == 0) ? 0 : 1;
 }
 // Like Strcmp, but compares exactly n1 runes of s1 (s1 is assumed to
 // have at least that many) against the null-terminated s2.
 // The match ignores case, on the assumption that s2 has no chars in
 // [A-Z], only their lowercase versions: only s1's runes are folded.
 // (Used for in-place keyword lookup, where s2 is in a keyword list and
 // s1 is some substring, possibly mixed-case, in a buffer.)
 // Returns -1, 0 or 1.
 int
 _Strncmpci(Rune *s1, int n1, Rune *s2)
 {
 	Rune a, b;
 	while(n1-- != 0) {
 		a = *s1++;
 		b = *s2++;
 		if('A' <= a && a <= 'Z')
 			a = a - 'A' + 'a';
 		if(a != b)
 			return (a > b) ? 1 : -1;
 	}
 	// s1 is used up: equal only if s2 is used up too
 	return (*s2 == 0) ? 0 : -1;
 }
 // Return a newly allocated copy of s.
 // Returns nil if s is nil or empty.
 Rune*
 _Strdup(Rune* s)
 {
 	return (s == nil) ? nil : _Strndup(s, runestrlen(s));
 }
 // Return a newly allocated, null-terminated copy of the first n runes
 // of s (s is assumed to be at least that long).
 // Returns nil if n <= 0.
 Rune*
 _Strndup(Rune* s, int n)
 {
 	Rune* copy;
 	Rune* end;
 	if(n <= 0)
 		return nil;
 	copy = _newstr(n);
 	// _Stradd copies the runes and returns the position just after them
 	end = _Stradd(copy, s, n);
 	*end = 0;
 	return copy;
 }
 // Allocate room for n Runes plus 1 null terminator.
 // (Callers should not count on the contents being initialized.)
 Rune*
 _newstr(int n)
 {
 	Rune* str;
 	str = (Rune*)emalloc((n+1)*sizeof(Rune));
 	return str;
 }
 // Return a newly allocated string holding s followed by t.
 // nil arguments count as empty; returns nil if both are empty.
 Rune*
 _Strdup2(Rune* s, Rune* t)
 {
 	int slen, tlen;
 	Rune* joined;
 	Rune* end;
 	slen = _Strlen(s);
 	tlen = _Strlen(t);
 	if(slen == 0 && tlen == 0)
 		return nil;
 	joined = _newstr(slen+tlen);
 	end = _Stradd(_Stradd(joined, s, slen), t, tlen);
 	*end = 0;
 	return joined;
 }
 // Return a newly allocated copy of the substring s[start:stop].
 // Returns nil if the range is empty.
 Rune*
 _Strsubstr(Rune* s, int start, int stop)
 {
 	if(start == stop)
 		return nil;
 	return _Strndup(s+start, stop-start);
 }
 // Copy n runes from s2 to s1 and return s1+n, the position just after
 // the copied runes.  No terminator is written.
 Rune*
 _Stradd(Rune* s1, Rune* s2, int n)
 {
 	if(n != 0)
 		memmove(s1, s2, n*sizeof(Rune));
 	return s1+n;
 }
 // Like strtol, but converting from Rune* string.
 // Skips leading white space, accepts an optional sign, and converts
 // digits in the given base (2..36; 0 means decimal, or octal after a
 // leading 0, or hex after 0x/0X).
 // If endptr is not nil, *endptr is set to the first rune not converted,
 // or to nptr if no digits were found.
 // On overflow returns LONG_MAX, or LONG_MIN for a negative number.
 //#define LONG_MAX	2147483647L
 //#define LONG_MIN	-2147483648L
 long
 _Strtol(Rune* nptr, Rune** endptr, int base)
 {
 	Rune* p;
 	long n;
 	int c, ovfl, v, neg, ndig;
 	p = nptr;
 	neg = 0;
 	n = 0;
 	ndig = 0;
 	ovfl = 0;
 	/*
 	 * White space
 	 */
 	for(;;p++){
 		switch(*p){
 		case ' ':
 		case '\t':
 		case '\n':
 		case '\f':
 		case '\r':
 		case '\v':
 			continue;
 		}
 		break;
 	}
 	/*
 	 * Sign
 	 */
 	if(*p=='-' || *p=='+')
 		if(*p++ == '-')
 			neg = 1;
 	/*
 	 * Base
 	 */
 	if(base==0){
 		if(*p != '0')
 			base = 10;
 		else{
 			base = 8;
 			if(p[1]=='x' || p[1]=='X'){
 				p += 2;
 				base = 16;
 			}
 		}
 	}else if(base==16 && *p=='0'){
 		if(p[1]=='x' || p[1]=='X')
 			p += 2;
 	}else if(base<0 || 36<base)
 		goto Return;
 	/*
 	 * Non-empty sequence of digits
 	 */
 	for(;; p++,ndig++){
 		c = *p;
 		v = base;
 		if('0'<=c && c<='9')
 			v = c - '0';
 		else if('a'<=c && c<='z')
 			v = c - 'a' + 10;
 		else if('A'<=c && c<='Z')
 			v = c - 'A' + 10;
 		if(v >= base)
 			break;
 		/* once overflowed, keep consuming digits but stop accumulating */
 		if(ovfl)
 			continue;
 		/*
 		 * Test before multiplying: signed overflow is undefined, and
 		 * a wrapped product can come out larger than n and go unnoticed.
 		 */
 		if(n > (LONG_MAX - v)/base)
 			ovfl = 1;
 		else
 			n = n*base + v;
 	}
    Return:
 	if(ndig == 0)
 		p = nptr;
 	if(endptr)
 		*endptr = p;
 	if(ovfl){
 		if(neg)
 			return LONG_MIN;
 		return LONG_MAX;
 	}
 	if(neg)
 		return -n;
 	return n;
 }
 // Decode the rune that starts at buf[i] without looking past buf[n-1].
 // Stores the rune in *r and returns the number of bytes it occupies.
 // A sequence cut short by the end of the buffer decodes as Runeerror
 // and occupies 1 byte.
 static int
 decoderune(Rune* r, uchar* buf, int i, int n)
 {
 	if(!fullrune((char*)(buf+i), n-i)) {
 		*r = Runeerror;
 		return 1;
 	}
 	return chartorune(r, (char*)(buf+i));
 }
 // Convert buf[0:n], bytes whose character set is chset,
 // into a emalloc'd null-terminated Unicode string.
 // chset must be US_Ascii, ISO_8859_1 or UTF_8; anything else trips an assert.
 // buf need not be null-terminated: no byte at or beyond buf[n] is read.
 Rune*
 toStr(uchar* buf, int n, int chset)
 {
 	int i;
 	int m;
 	Rune ch;
 	Rune* ans;
 	switch(chset) {
 	case US_Ascii:
 	case ISO_8859_1:
 		// one byte per rune, values unchanged
 		ans = (Rune*)emalloc((n+1)*sizeof(Rune));
 		for(i = 0; i < n; i++)
 			ans[i] = buf[i];
 		ans[n] = 0;
 		break;
 	case UTF_8:
 		// first pass counts the runes, second pass stores them
 		m = 0;
 		for(i = 0; i < n; ) {
 			i += decoderune(&ch, buf, i, n);
 			m++;
 		}
 		ans = (Rune*)emalloc((m+1)*sizeof(Rune));
 		m = 0;
 		for(i = 0; i < n; ) {
 			i += decoderune(&ch, buf, i, n);
 			ans[m++] = ch;
 		}
 		ans[m] = 0;
 		break;
 	default:
 		ans = nil;
 		assert(0);
 	}
 	return ans;
 }
 // Convert buf[0:n], Unicode characters,
 // into an emalloc'd null-terminated string in character set chset.
 // For US_Ascii and ISO_8859_1, characters outside the set's range
 // (above 127 and 255 respectively) come out as 0x80.
 // Any other chset than those two or UTF_8 trips an assert.
 uchar*
 fromStr(Rune* buf, int n, int chset)
 {
 	uchar* out;
 	uchar* w;
 	uchar scratch[UTFmax];
 	int k, limit, nbytes;
 	out = nil;
 	if(chset == US_Ascii || chset == ISO_8859_1) {
 		out = (uchar*)emalloc(n+1);
 		limit = (chset==US_Ascii)? 127 : 255;
 		for(k = 0; k < n; k++) {
 			if(buf[k] > limit)
 				out[k] = 0x80;
 			else
 				out[k] = buf[k];
 		}
 		out[n] = 0;
 	}
 	else if(chset == UTF_8) {
 		// measure the encoded length first, then encode in place
 		nbytes = 0;
 		for(k = 0; k < n; k++)
 			nbytes += runetochar((char*)scratch, &buf[k]);
 		out = (uchar*)emalloc(nbytes+1);
 		w = out;
 		for(k = 0; k < n; k++)
 			w += runetochar((char*)w, &buf[k]);
 		*w = 0;
 	}
 	else
 		assert(0);
 	return out;
 }
 // Return n, formatted in decimal, as a newly allocated Rune string.
 Rune*
 _ltoStr(int n)
 {
 	uchar digits[20];
 	int len;
 	len = snprint((char*)digits, sizeof(digits), "%d", n);
 	return toStr(digits, len, US_Ascii);
 }