+#include "impl.h"
+
+// Capacity of every Stack declared in this file.
+enum {
+ Nestmax = 40 // max nesting level of lists, font styles, etc.
+};
+
+// A fixed-capacity stack of integer values.
+// Live entries occupy slots[0..n-1]; the stack is empty when n is 0 and
+// can hold at most Nestmax entries.
+struct Stack {
+ int n; // next available slot (top of stack is stack[n-1])
+ int slots[Nestmax]; // stack entries
+};
+
+// Parsing state.
+// A Pstate records the current rendering attributes (font, colors,
+// justification, ...), the item list being built, and one Stack per
+// attribute that can nest. Pstates are chained through next to form a
+// stack; ItemSource.psstk points at that stack.
+struct Pstate
+{
+ Pstate* next; // in stack of Pstates
+ int skipping; // true when we shouldn't add items
+ int skipwhite; // true when we should strip leading space
+ int curfont; // font index for current font
+ int curfg; // current foreground color
+ Background curbg; // current background
+ int curvoff; // current baseline offset
+ uchar curul; // current underline/strike state
+ uchar curjust; // current justify state
+ int curanchor; // current (href) anchor id (if in one), or 0
+ int curstate; // current value of item state
+ int literal; // current literal state
+ int inpar; // true when in a paragraph-like construct
+ int adjsize; // current font size adjustment
+ Item* items; // dummy head of item list we're building
+ Item* lastit; // tail of item list we're building
+ Item* prelastit; // item before lastit
+ Stack fntstylestk; // style stack
+ Stack fntsizestk; // size stack
+ Stack fgstk; // text color stack
+ Stack ulstk; // underline stack
+ Stack voffstk; // vertical offset stack
+ Stack listtypestk; // list type stack
+ Stack listcntstk; // list counter stack
+ Stack juststk; // justification stack
+ Stack hangstk; // hanging stack
+};
+
+// Per-document state for one parse: ties the Docinfo being filled in to
+// the parse-state stack and the running counters. Created by
+// newitemsource and released by parsehtml once getitems has returned.
+struct ItemSource
+{
+ Docinfo* doc; // document info for the page being parsed
+ Pstate* psstk; // parse state stack; getitems starts from this
+ int nforms; // starts at 0; presumably counts forms seen -- TODO confirm
+ int ntables; // starts at 0; presumably counts tables seen -- TODO confirm
+ int nanchors; // incremented per anchor/dest anchor; new value is its index
+ int nframes; // starts at 0; presumably counts frames seen -- TODO confirm
+ Form* curform; // nil initially; presumably the form currently open
+ Map* curmap; // nil initially; presumably the map currently open
+ Table* tabstk; // nil initially; getitems takes its current table from here
+ Kidinfo* kidstk; // nil initially; presumably open framesets -- TODO confirm
+};
+
+// Some layout parameters.
+// Defaults applied when the document does not specify a value.
+// NOTE(review): units are not visible here (presumably pixels for the
+// spacing values) -- confirm against the layout code.
+enum {
+ FRKIDMARGIN = 6, // default margin around kid frames
+ IMGHSPACE = 0, // default hspace for images (0 matches IE, Netscape)
+ IMGVSPACE = 0, // default vspace for images
+ FLTIMGHSPACE = 2, // default hspace for float images
+ TABSP = 5, // default cellspacing for tables
+ TABPAD = 1, // default cell padding for tables
+ LISTTAB = 1, // number of tabs to indent lists
+ BQTAB = 1, // number of tabs to indent blockquotes
+ HRSZ = 2, // thickness of horizontal rules
+ SUBOFF = 4, // vertical offset for subscripts
+ SUPOFF = 6, // vertical offset for superscripts
+ NBSP = 160 // non-breaking space character
+};
+
+// Attribute-value lookup tables.
+// Each _xxx_tab below is the ASCII master copy; buildinit converts it
+// with cvtstringinttab into the StringInt table xxx_tab used at run time.
+// These tables must be sorted by key.
+
+// Alignment keywords -> AL* codes.
+static StringInt *align_tab;
+static AsciiInt _align_tab[] = {
+	{"baseline", ALbaseline},
+	{"bottom", ALbottom},
+	{"center", ALcenter},
+	{"char", ALchar},
+	{"justify", ALjustify},
+	{"left", ALleft},
+	{"middle", ALmiddle},
+	{"right", ALright},
+	{"top", ALtop}
+};
+// Count the entries of the initializer array. align_tab is only a
+// pointer, so sizeof(align_tab) is the size of a pointer, not the table.
+#define NALIGNTAB (sizeof(_align_tab)/sizeof(_align_tab[0]))
+
+// <input type=...> keywords -> F* form field types (sorted by key).
+// _input_tab is the ASCII master copy; input_tab is filled in by buildinit.
+static StringInt *input_tab;
+static AsciiInt _input_tab[] = {
+	{"button", Fbutton},
+	{"checkbox", Fcheckbox},
+	{"file", Ffile},
+	{"hidden", Fhidden},
+	{"image", Fimage},
+	{"password", Fpassword},
+	{"radio", Fradio},
+	{"reset", Freset},
+	{"submit", Fsubmit},
+	{"text", Ftext}
+};
+// Count the entries of the initializer array. input_tab is only a
+// pointer, so sizeof(input_tab) is the size of a pointer, not the table.
+#define NINPUTTAB (sizeof(_input_tab)/sizeof(_input_tab[0]))
+
+// clear=... keywords -> IFcleft/IFcright flag sets (sorted by key).
+// _clear_tab is the ASCII master copy; clear_tab is filled in by buildinit.
+static StringInt *clear_tab;
+static AsciiInt _clear_tab[] = {
+	{"all", IFcleft|IFcright},
+	{"left", IFcleft},
+	{"right", IFcright}
+};
+// Count the entries of the initializer array. clear_tab is only a
+// pointer, so sizeof(clear_tab) is the size of a pointer, not the table.
+#define NCLEARTAB (sizeof(_clear_tab)/sizeof(_clear_tab[0]))
+
+// Frame scrolling keywords -> FR* scroll flags (sorted by key).
+// _fscroll_tab is the ASCII master copy; fscroll_tab is filled in by
+// buildinit.
+static StringInt *fscroll_tab;
+static AsciiInt _fscroll_tab[] = {
+	{"auto", FRhscrollauto|FRvscrollauto},
+	{"no", FRnoscroll},
+	{"yes", FRhscroll|FRvscroll}
+};
+// Count the entries of the initializer array. fscroll_tab is only a
+// pointer, so sizeof(fscroll_tab) is the size of a pointer, not the table.
+#define NFSCROLLTAB (sizeof(_fscroll_tab)/sizeof(_fscroll_tab[0]))
+
+// Shape keywords -> SH* codes (sorted by key). Both the short and the
+// long spelling of each shape map to the same code.
+// _shape_tab is the ASCII master copy; shape_tab is filled in by buildinit.
+static StringInt *shape_tab;
+static AsciiInt _shape_tab[] = {
+	{"circ", SHcircle},
+	{"circle", SHcircle},
+	{"poly", SHpoly},
+	{"polygon", SHpoly},
+	{"rect", SHrect},
+	{"rectangle", SHrect}
+};
+// Count the entries of the initializer array. shape_tab is only a
+// pointer, so sizeof(shape_tab) is the size of a pointer, not the table.
+#define NSHAPETAB (sizeof(_shape_tab)/sizeof(_shape_tab[0]))
+
+// Method keywords -> HGet/HPost (sorted by key).
+// _method_tab is the ASCII master copy; method_tab is filled in by
+// buildinit.
+static StringInt *method_tab;
+static AsciiInt _method_tab[] = {
+	{"get", HGet},
+	{"post", HPost}
+};
+// Count the entries of the initializer array. method_tab is only a
+// pointer, so sizeof(method_tab) is the size of a pointer, not the table.
+#define NMETHODTAB (sizeof(_method_tab)/sizeof(_method_tab[0]))
+
+// Upper-case roman numerals for 1..15: _roman[i] spells i+1.
+// _roman is the ASCII master copy; buildinit converts it with
+// cvtstringtab and stores the result in roman.
+static Rune** roman;
+static char* _roman[15]= {
+ "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X",
+ "XI", "XII", "XIII", "XIV", "XV"
+};
+// Number of entries in _roman; keep in step with the array bound above.
+#define NROMAN 15
+
+// List number types
+// NOTE(review): the meanings are not spelled out in this part of the
+// file. Presumably LTdisc/LTsquare/LTcircle are bullet styles, LT1 is
+// decimal, LTa/LTA are lower/upper-case letters and LTi/LTI are
+// lower/upper-case roman numerals -- confirm against listmark.
+enum {
+ LTdisc, LTsquare, LTcircle, LT1, LTa, LTA, LTi, LTI
+};
+
+// Bit flags stored in blockbrk[]; they are OR'd together.
+enum {
+ SPBefore = 2, // want a line of space before the element
+ SPAfter = 4, // want a line of space after the element
+ BL = 1, // element gets block-level break treatment
+ BLBA = (BL|SPBefore|SPAfter) // break, with space both before and after
+};
+
+// blockbrk[tag] is break info for a block level element, or one
+// of a few others that get the same treatment re ending open paragraphs
+// and requiring a line break / vertical space before them.
+// If we want a line of space before the given element, SPBefore is OR'd in.
+// If we want a line of space after the given element, SPAfter is OR'd in.
+// The table is indexed by start-tag value; getitems looks up an end tag
+// as blockbrk[tag - RBRA] and then tests SPAfter instead of SPBefore.
+// Tags not listed here are zero, meaning no break.
+
+static uchar blockbrk[Numtags]= {
+ [Taddress] BLBA, [Tblockquote] BLBA, [Tcenter] BL,
+ [Tdir] BLBA, [Tdiv] BL, [Tdd] BL, [Tdl] BLBA,
+ [Tdt] BL, [Tform] BLBA,
+ // headings and tables get breaks added manually
+ [Th1] BL, [Th2] BL, [Th3] BL,
+ [Th4] BL, [Th5] BL, [Th6] BL,
+ [Thr] BL, [Tisindex] BLBA, [Tli] BL, [Tmenu] BLBA,
+ [Tol] BLBA, [Tp] BLBA, [Tpre] BLBA,
+ [Tul] BLBA
+};
+
+// Flag value stored in attrinfo[].
+enum {
+ AGEN = 1
+};
+
+// attrinfo is information about attributes, indexed by attribute id.
+// The AGEN value means that the attribute is generic (applies to almost all elements)
+// Attributes not listed here are zero.
+static uchar attrinfo[Numattrs]= {
+ [Aid] AGEN, [Aclass] AGEN, [Astyle] AGEN, [Atitle] AGEN,
+ [Aonblur] AGEN, [Aonchange] AGEN, [Aonclick] AGEN,
+ [Aondblclick] AGEN, [Aonfocus] AGEN, [Aonkeypress] AGEN,
+ [Aonkeyup] AGEN, [Aonload] AGEN, [Aonmousedown] AGEN,
+ [Aonmousemove] AGEN, [Aonmouseout] AGEN, [Aonmouseover] AGEN,
+ [Aonmouseup] AGEN, [Aonreset] AGEN, [Aonselect] AGEN,
+ [Aonsubmit] AGEN, [Aonunload] AGEN
+};
+
+// scriptev[attid] is the SE* script event code that corresponds to an
+// on* event attribute. Attributes not listed here are zero.
+static uchar scriptev[Numattrs]= {
+ [Aonblur] SEonblur, [Aonchange] SEonchange, [Aonclick] SEonclick,
+ [Aondblclick] SEondblclick, [Aonfocus] SEonfocus, [Aonkeypress] SEonkeypress,
+ [Aonkeyup] SEonkeyup, [Aonload] SEonload, [Aonmousedown] SEonmousedown,
+ [Aonmousemove] SEonmousemove, [Aonmouseout] SEonmouseout, [Aonmouseover] SEonmouseover,
+ [Aonmouseup] SEonmouseup, [Aonreset] SEonreset, [Aonselect] SEonselect,
+ [Aonsubmit] SEonsubmit, [Aonunload] SEonunload
+};
+
+// Color lookup table: color name -> 0xRRGGBB value (sorted by key).
+// _color_tab is the ASCII master copy; color_tab is filled in by buildinit.
+// NOTE(review): HTML 4.01 defines "blue" as 0x0000FF; the 0x0000CC used
+// here looks deliberate but should be confirmed.
+static StringInt *color_tab;
+static AsciiInt _color_tab[] = {
+	{"aqua", 0x00FFFF},
+	{"black", 0x000000},
+	{"blue", 0x0000CC},
+	{"fuchsia", 0xFF00FF},
+	{"gray", 0x808080},
+	{"green", 0x008000},
+	{"lime", 0x00FF00},
+	{"maroon", 0x800000},
+	{"navy", 0x000080},
+	{"olive", 0x808000},
+	{"purple", 0x800080},
+	{"red", 0xFF0000},
+	{"silver", 0xC0C0C0},
+	{"teal", 0x008080},
+	{"white", 0xFFFFFF},
+	{"yellow", 0xFFFF00}
+};
+// Count the entries of the initializer array. color_tab is only a
+// pointer, so sizeof(color_tab) is the size of a pointer, not the table.
+#define NCOLORS (sizeof(_color_tab)/sizeof(_color_tab[0]))
+
+// Target map state; targetmapinit (called from buildinit) sets it up.
+// NOTE(review): presumably maps frame target names to ids, with
+// targetmapsize the allocated size and ntargets the number in use --
+// confirm against targetmapinit and atargval.
+static StringInt *targetmap;
+static int targetmapsize;
+static int ntargets;
+
+// Set to 1 at the end of buildinit; getitems calls buildinit while it is 0.
+static int buildinited = 0;
+
+// Buffer sizes; their users are not in this part of the file.
+#define SMALLBUFSIZE 240
+#define BIGBUFSIZE 2000
+
+// Diagnostics switches (not static, so visible to other files).
+// dbgbuild > 1 makes getitems print every token to fd 2.
+int dbgbuild = 0;
+// Nonzero makes getitems print warnings about questionable markup to fd 2.
+int warn = 0;
+
+// Forward declarations of the file-local (static) helpers, listed up
+// front so that they can be used before their definitions appear.
+static Align aalign(Token* tok);
+static int acolorval(Token* tok, int attid, int dflt);
+static void addbrk(Pstate* ps, int sp, int clr);
+static void additem(Pstate* ps, Item* it, Token* tok);
+static void addlinebrk(Pstate* ps, int clr);
+static void addnbsp(Pstate* ps);
+static void addtext(Pstate* ps, Rune* s);
+static Dimen adimen(Token* tok, int attid);
+static int aflagval(Token* tok, int attid);
+static int aintval(Token* tok, int attid, int dflt);
+static Rune* astrval(Token* tok, int attid, Rune* dflt);
+static int atabval(Token* tok, int attid, StringInt* tab, int ntab, int dflt);
+static int atargval(Token* tok, int dflt);
+static int auintval(Token* tok, int attid, int dflt);
+static Rune* aurlval(Token* tok, int attid, Rune* dflt, Rune* base);
+static Rune* aval(Token* tok, int attid);
+static void buildinit(void);
+static Pstate* cell_pstate(Pstate* oldps, int ishead);
+static void changehang(Pstate* ps, int delta);
+static void changeindent(Pstate* ps, int delta);
+static int color(Rune* s, int dflt);
+static void copystack(Stack* tostk, Stack* fromstk);
+static int dimprint(char* buf, int nbuf, Dimen d);
+static Pstate* finishcell(Table* curtab, Pstate* psstk);
+static void finish_table(Table* t);
+static void freeanchor(Anchor* a);
+static void freedestanchor(DestAnchor* da);
+static void freeform(Form* f);
+static void freeformfield(Formfield* ff);
+static void freeitem(Item* it);
+static void freepstate(Pstate* p);
+static void freepstatestack(Pstate* pshead);
+static void freescriptevents(SEvent* ehead);
+static void freetable(Table* t);
+static Map* getmap(Docinfo* di, Rune* name);
+static Rune* getpcdata(Token* toks, int tokslen, int* ptoki);
+static Pstate* lastps(Pstate* psl);
+static Rune* listmark(uchar ty, int n);
+static int listtyval(Token* tok, int dflt);
+static Align makealign(int halign, int valign);
+static Background makebackground(Rune* imgurl, int color);
+static Dimen makedimen(int kind, int spec);
+static Anchor* newanchor(int index, Rune* name, Rune* href, int target, Anchor* link);
+static Area* newarea(int shape, Rune* href, int target, Area* link);
+static DestAnchor* newdestanchor(int index, Rune* name, Item* item, DestAnchor* link);
+static Docinfo* newdocinfo(void);
+static Genattr* newgenattr(Rune* id, Rune* class, Rune* style, Rune* title, SEvent* events);
+static Form* newform(int formid, Rune* name, Rune* action,
+ int target, int method, Form* link);
+static Formfield* newformfield(int ftype, int fieldid, Form* form, Rune* name,
+ Rune* value, int size, int maxlength, Formfield* link);
+static Item* newifloat(Item* it, int side);
+static Item* newiformfield(Formfield* ff);
+static Item* newiimage(Rune* src, Rune* altrep, int align, int width, int height,
+ int hspace, int vspace, int border, int ismap, Map* map);
+static Item* newirule(int align, int size, int noshade, Dimen wspec);
+static Item* newispacer(int spkind);
+static Item* newitable(Table* t);
+static ItemSource* newitemsource(Docinfo* di);
+static Item* newitext(Rune* s, int fnt, int fg, int voff, int ul);
+static Kidinfo* newkidinfo(int isframeset, Kidinfo* link);
+static Option* newoption(int selected, Rune* value, Rune* display, Option* link);
+static Pstate* newpstate(Pstate* link);
+static SEvent* newscriptevent(int type, Rune* script, SEvent* link);
+static Table* newtable(int tableid, Align align, Dimen width, int border,
+ int cellspacing, int cellpadding, Background bg, Token* tok, Table* link);
+static Tablecell* newtablecell(int cellid, int rowspan, int colspan, Align align, Dimen wspec,
+ int hspec, Background bg, int flags, Tablecell* link);
+static Tablerow* newtablerow(Align align, Background bg, int flags, Tablerow* link);
+static Dimen parsedim(Rune* s, int ns);
+static void pop(Stack* stk);
+static void popfontsize(Pstate* ps);
+static void popfontstyle(Pstate* ps);
+static void popjust(Pstate* ps);
+static int popretnewtop(Stack* stk, int dflt);
+static int push(Stack* stk, int val);
+static void pushfontsize(Pstate* ps, int sz);
+static void pushfontstyle(Pstate* ps, int sty);
+static void pushjust(Pstate* ps, int j);
+static Item* textit(Pstate* ps, Rune* s);
+static Rune* removeallwhite(Rune* s);
+static void resetdocinfo(Docinfo* d);
+static void setcurfont(Pstate* ps);
+static void setcurjust(Pstate* ps);
+static void setdimarray(Token* tok, int attid, Dimen** pans, int* panslen);
+static Rune* stringalign(int a);
+static void targetmapinit(void);
+static int toint(Rune* s);
+static int top(Stack* stk, int dflt);
+static void trim_cell(Tablecell* c);
+static int validalign(Align a);
+static int validdimen(Dimen d);
+static int validformfield(Formfield* f);
+static int validhalign(int a);
+static int validptr(void* p);
+static int validStr(Rune* s);
+static int validtable(Table* t);
+static int validtablerow(Tablerow* r);
+static int validtablecol(Tablecol* c);
+static int validtablecell(Tablecell* c);
+static int validvalign(int a);
+static int Iconv(Fmt *f);
+
+// One-time setup for this file: converts each ASCII master table into
+// its run-time form, installs the 'I' print verb, builds the target map,
+// and finally sets buildinited so that getitems does not call this again.
+static void
+buildinit(void)
+{
+	runetabinit();
+
+	// keyword tables, in the order they are declared above
+	align_tab = cvtstringinttab(_align_tab, nelem(_align_tab));
+	input_tab = cvtstringinttab(_input_tab, nelem(_input_tab));
+	clear_tab = cvtstringinttab(_clear_tab, nelem(_clear_tab));
+	fscroll_tab = cvtstringinttab(_fscroll_tab, nelem(_fscroll_tab));
+	shape_tab = cvtstringinttab(_shape_tab, nelem(_shape_tab));
+	method_tab = cvtstringinttab(_method_tab, nelem(_method_tab));
+	color_tab = cvtstringinttab(_color_tab, nelem(_color_tab));
+
+	// plain string table
+	roman = cvtstringtab(_roman, nelem(_roman));
+
+	fmtinstall('I', Iconv);
+	targetmapinit();
+	buildinited = 1;
+}
+
+// Allocate and return an ItemSource for document di, with a fresh
+// one-entry parse state stack, all counters zero and no current form,
+// map, table or kid. For any media type other than TextHtml the initial
+// state has wrapping turned off, literal mode on, and the FntT font
+// style pushed.
+static ItemSource*
+newitemsource(Docinfo* di)
+{
+	Pstate* st;
+	ItemSource* src;
+
+	st = newpstate(nil);
+	if(di->mediatype != TextHtml) {
+		st->curstate &= ~IFwrap;
+		st->literal = 1;
+		pushfontstyle(st, FntT);
+	}
+
+	src = (ItemSource*)emalloc(sizeof *src);
+	src->doc = di;
+	src->psstk = st;
+	src->nforms = src->ntables = src->nanchors = src->nframes = 0;
+	src->curform = nil;
+	src->curmap = nil;
+	src->tabstk = nil;
+	src->kidstk = nil;
+	return src;
+}
+
+static Item *getitems(ItemSource* is, uchar* data, int datalen);
+
+// Parse the document held in data[0..datalen) and return the list of
+// layout items built from it.
+// A new Docinfo is allocated and stored in *pdi; its src and base both
+// start out as copies of pagesrc, and mtype/chset are recorded in it.
+// The ItemSource and parse state used during the parse are freed before
+// returning.
+// When the caller is done with the items, it should call freeitems on
+// the returned result, and then freedocinfo(*pdi).
+Item*
+parsehtml(uchar* data, int datalen, Rune* pagesrc, int mtype, int chset, Docinfo** pdi)
+{
+	Docinfo* info;
+	ItemSource* src;
+	Item* items;
+
+	info = newdocinfo();
+	info->src = _Strdup(pagesrc);
+	info->base = _Strdup(pagesrc);
+	info->mediatype = mtype;
+	info->chset = chset;
+	*pdi = info;
+
+	src = newitemsource(info);
+	items = getitems(src, data, datalen);
+
+	// the parse machinery is no longer needed once the items exist
+	freepstatestack(src->psstk);
+	free(src);
+	return items;
+}
+
+// Get a group of tokens for lexer, parse them, and create
+// a list of layout items.
+// When caller is done with the items, it should call
+// freeitems on the returned result.
+static Item*
+getitems(ItemSource* is, uchar* data, int datalen)
+{
+ int i;
+ int j;
+ int nt;
+ int pt;
+ int doscripts;
+ int tokslen;
+ int toki;
+ int h;
+ int sz;
+ int method;
+ int n;
+ int nblank;
+ int norsz;
+ int bramt;
+ int sty;
+ int nosh;
+ int oldcuranchor;
+ int dfltbd;
+ int v;
+ int hang;
+ int isempty;
+ int tag;
+ int brksp;
+ int target;
+ uchar brk;
+ uchar flags;
+ uchar align;
+ uchar al;
+ uchar ty;
+ uchar ty2;
+ Pstate* ps;
+ Pstate* nextps;
+ Pstate* outerps;
+ Table* curtab;
+ Token* tok;
+ Token* toks;
+ Docinfo* di;
+ Item* ans;
+ Item* img;
+ Item* ffit;
+ Item* tabitem;
+ Rune* s;
+ Rune* t;
+ Rune* name;
+ Rune* enctype;
+ Rune* usemap;
+ Rune* prompt;
+ Rune* equiv;
+ Rune* val;
+ Rune* nsz;
+ Rune* script;
+ Map* map;
+ Form* frm;
+ Iimage* ii;
+ Kidinfo* kd;
+ Kidinfo* ks;
+ Kidinfo* pks;
+ Dimen wd;
+ Option* option;
+ Table* tab;
+ Tablecell* c;
+ Tablerow* tr;
+ Formfield* field;
+ Formfield* ff;
+ Rune* href;
+ Rune* src;
+ Rune* scriptsrc;
+ Rune* bgurl;
+ Rune* action;
+ Background bg;
+
+ if(!buildinited)
+ buildinit();
+ doscripts = 0; // for now
+ ps = is->psstk;
+ curtab = is->tabstk;
+ di = is->doc;
+ toks = _gettoks(data, datalen, di->chset, di->mediatype, &tokslen);
+ toki = 0;
+ for(; toki < tokslen; toki++) {
+ tok = &toks[toki];
+ if(dbgbuild > 1)
+ fprint(2, "build: curstate %ux, token %T\n", ps->curstate, tok);
+ tag = tok->tag;
+ brk = 0;
+ brksp = 0;
+ if(tag < Numtags) {
+ brk = blockbrk[tag];
+ if(brk&SPBefore)
+ brksp = 1;
+ }
+ else if(tag < Numtags + RBRA) {
+ brk = blockbrk[tag - RBRA];
+ if(brk&SPAfter)
+ brksp = 1;
+ }
+ if(brk) {
+ addbrk(ps, brksp, 0);
+ if(ps->inpar) {
+ popjust(ps);
+ ps->inpar = 0;
+ }
+ }
+ // check common case first (Data), then switch statement on tag
+ if(tag == Data) {
+ // Lexing didn't pay attention to SGML record boundary rules:
+ // \n after start tag or before end tag to be discarded.
+ // (Lex has already discarded all \r's).
+ // Some pages assume this doesn't happen in text,
+ // so we won't do it if literal is true.
+ // BUG: won't discard \n before a start tag that begins
+ // the next bufferful of tokens.
+ s = tok->text;
+ n = _Strlen(s);
+ if(!ps->literal) {
+ i = 0;
+ j = n;
+ if(toki > 0) {
+ pt = toks[toki - 1].tag;
+ // IE and Netscape both ignore this rule (contrary to spec)
+ // if previous tag was img
+ if(pt < Numtags && pt != Timg && j > 0 && s[0] == '\n')
+ i++;
+ }
+ if(toki < tokslen - 1) {
+ nt = toks[toki + 1].tag;
+ if(nt >= RBRA && nt < Numtags + RBRA && j > i && s[j - 1] == '\n')
+ j--;
+ }
+ if(i > 0 || j < n) {
+ t = s;
+ s = _Strsubstr(s, i, j);
+ free(t);
+ n = j-i;
+ }
+ }
+ if(ps->skipwhite) {
+ _trimwhite(s, n, &t, &nt);
+ if(t == nil) {
+ free(s);
+ s = nil;
+ }
+ else if(t != s) {
+ t = _Strndup(t, nt);
+ free(s);
+ s = t;
+ }
+ if(s != nil)
+ ps->skipwhite = 0;
+ }
+ tok->text = nil; // token doesn't own string anymore
+ if(s != nil)
+ addtext(ps, s);
+ }
+ else
+ switch(tag) {
+ // Some abbrevs used in following DTD comments
+ // %text = #PCDATA
+ // | TT | I | B | U | STRIKE | BIG | SMALL | SUB | SUP
+ // | EM | STRONG | DFN | CODE | SAMP | KBD | VAR | CITE
+ // | A | IMG | APPLET | FONT | BASEFONT | BR | SCRIPT | MAP
+ // | INPUT | SELECT | TEXTAREA
+ // %block = P | UL | OL | DIR | MENU | DL | PRE | DL | DIV | CENTER
+ // | BLOCKQUOTE | FORM | ISINDEX | HR | TABLE
+ // %flow = (%text | %block)*
+ // %body.content = (%heading | %text | %block | ADDRESS)*
+
+ //
+ // Anchors are not supposed to be nested, but you sometimes see
+ // href anchors inside destination anchors.
+ case Ta:
+ if(ps->curanchor != 0) {
+ if(warn)
+ fprint(2, "warning: nested or missing \n");
+ ps->curanchor = 0;
+ }
+ name = aval(tok, Aname);
+ href = aurlval(tok, Ahref, nil, di->base);
+ // ignore rel, rev, and title attrs
+ if(href != nil) {
+ target = atargval(tok, di->target);
+ di->anchors = newanchor(++is->nanchors, name, href, target, di->anchors);
+ if(name != nil)
+ name = _Strdup(name); // for DestAnchor construction, below
+ ps->curanchor = is->nanchors;
+ ps->curfg = push(&ps->fgstk, di->link);
+ ps->curul = push(&ps->ulstk, ULunder);
+ }
+ if(name != nil) {
+ // add a null item to be destination
+ additem(ps, newispacer(ISPnull), tok);
+ di->dests = newdestanchor(++is->nanchors, name, ps->lastit, di->dests);
+ }
+ break;
+
+ case Ta+RBRA :
+ if(ps->curanchor != 0) {
+ ps->curfg = popretnewtop(&ps->fgstk, di->text);
+ ps->curul = popretnewtop(&ps->ulstk, ULnone);
+ ps->curanchor = 0;
+ }
+ break;
+
+ //
+ // We can't do applets, so ignore PARAMS, and let
+ // the %text contents appear for the alternative rep
+ case Tapplet:
+ case Tapplet+RBRA:
+ if(warn && tag == Tapplet)
+ fprint(2, "warning: