Import version of libhtml that might actually work with ANSI C.

This commit is contained in:
wkj 2004-04-06 19:06:52 +00:00
parent 3e3817f7c8
commit 7cf289ca89
13 changed files with 7206 additions and 0 deletions

50
src/cmd/htmlfmt/dat.h Normal file
View file

@ -0,0 +1,50 @@
typedef struct Bytes Bytes;
typedef struct URLwin URLwin;
enum
{
STACK = 8192,
EVENTSIZE = 256,
};
/* Growable byte buffer; growbytes appends to it and keeps it NUL-terminated. */
struct Bytes
{
uchar *b;	/* the data; nil until the first growbytes call */
long n;	/* bytes in use, not counting the terminating NUL */
long nalloc;	/* bytes currently allocated for b */
};
/* State for one HTML document being formatted. */
struct URLwin
{
int infd;	/* descriptor the HTML is read from (set by loadhtml) */
int outfd;	/* descriptor rerender writes the formatted text to */
int type;	/* media type handed to parsehtml (loadhtml uses TextHtml) */
char *url;	/* document URL; fullurl resolves relative links against it */
Item *items;	/* item list returned by parsehtml */
Docinfo *docinfo;	/* document info filled in by parsehtml; render walks its anchors */
};
extern char* url;
extern int aflag;
extern int width;
extern int defcharset;
extern char* loadhtml(int);
extern char* readfile(char*, char*, int*);
extern int charset(char*);
extern void* emalloc(ulong);
extern char* estrdup(char*);
extern char* estrstrdup(char*, char*);
extern char* egrow(char*, char*, char*);
extern char* eappend(char*, char*, char*);
extern void error(char*, ...);
extern void growbytes(Bytes*, char*, long);
extern void rendertext(URLwin*, Bytes*);
extern void rerender(URLwin*);
extern void freeurlwin(URLwin*);
#pragma varargck argpos error 1

331
src/cmd/htmlfmt/html.c Normal file
View file

@ -0,0 +1,331 @@
#include <u.h>
#include <libc.h>
#include <bio.h>
#include <draw.h>
#include <regexp.h>
#include <html.h>
#include <ctype.h>
#include "dat.h"
/* Regular expression for the scheme://host prefix of an absolute URL; compiled on demand by baseurl. */
char urlexpr[] = "^(https?|ftp|file|gopher|mailto|news|nntp|telnet|wais|prospero)://([a-zA-Z0-9_@\\-]+([.:][a-zA-Z0-9_@\\-]+)*)";
/* Compiled form of urlexpr; nil until baseurl first needs it. */
Reprog *urlprog;
/* Word-wrapping state shared by renderrunes and emitword; reset at the start of render. */
/* inword: nonzero while renderrunes is in the middle of a word. */
int inword = 0;
/* col: number of runes emitted so far on the current output line. */
int col = 0;
/* wordi: index in the rune string being rendered where the current word starts. */
int wordi = 0;
/*
 * Read all of fd, parse it as HTML and write the formatted text to
 * file descriptor 1 (via rendertext).  An empty input produces no output.
 * Always returns nil.
 */
char*
loadhtml(int fd)
{
	URLwin *u;
	Bytes *b;
	int n, parsed;
	char buf[4096];

	u = emalloc(sizeof(URLwin));
	u->infd = fd;
	u->outfd = 1;
	u->url = estrdup(url);
	u->type = TextHtml;
	b = emalloc(sizeof(Bytes));
	while((n = read(fd, buf, sizeof buf)) > 0)
		growbytes(b, buf, n);
	parsed = (b->b != nil);	/* b->b stays nil for an empty file */
	if(parsed)
		rendertext(u, b);
	/*
	 * freeurlwin does not release the URL copy made above, so do it here;
	 * clearing the pointer keeps this safe should it ever start doing so.
	 */
	free(u->url);
	u->url = nil;
	if(parsed)
		freeurlwin(u);	/* releases items, docinfo and u itself */
	else
		free(u);	/* nothing was parsed, only the struct to release */
	/* the input buffer is released last, after everything built from it */
	free(b->b);
	free(b);
	return nil;
}
/*
 * Convert the first n runes of r to a freshly allocated UTF-8 string.
 * n == 0 yields an allocated empty string; allocation failure is fatal.
 */
char*
runetobyte(Rune *r, int n)
{
	char *out;

	if(n != 0){
		out = smprint("%.*S", n, r);
		if(out == nil)
			error("malloc failed");
		return out;
	}
	return emalloc(1);
}
/*
 * Report whether c is a closing punctuation character, i.e. one that
 * emitword should not put a space in front of.  Returns 1 or 0.
 *
 * c is compared as a full rune value: a strchr lookup would convert it
 * to char (so 0x12E would match '.') and would also match the string
 * terminator when c is 0.
 */
int
closingpunct(int c)
{
	switch(c){
	case '.':
	case ',':
	case ':':
	case ';':
	case '\'':
	case '"':
	case ')':
	case ']':
	case '}':
	case '>':
	case '!':
	case '?':
		return 1;
	}
	return 0;
}
/*
 * Append the nr-rune word at r to b, wrapping at the global width.
 * A newline is emitted first if the word would not fit on the current
 * line; otherwise a separating space is added unless the buffer already
 * ends in white space or the word starts with closing punctuation.
 * Updates col and clears inword.  Does nothing when nr is 0.
 */
void
emitword(Bytes *b, Rune *r, int nr)
{
	char *s;
	int space;

	if(nr == 0)
		return;
	/* runetobyte checks the allocation, unlike a bare smprint */
	s = runetobyte(r, nr);
	space = (b->n>0) && !isspace(b->b[b->n-1]) && !closingpunct(r[0]);
	if(col>0 && col+space+nr > width){
		growbytes(b, "\n", 1);
		space = 0;
		col = 0;
	}
	if(space && col>0){
		growbytes(b, " ", 1);
		col++;
	}
	growbytes(b, s, strlen(s));
	col += nr;	/* columns are counted in runes, not bytes */
	free(s);
	inword = 0;
}
/*
 * Break the rune string r into words at spaces and newlines and hand
 * each word to emitword.  A newline resets the column and is copied to
 * the output, except at the very start of the buffer or when the
 * buffer already ends in two newlines.
 */
void
renderrunes(Bytes *b, Rune *r)
{
	int i, len;
	Rune c;

	len = runestrlen(r);
	for(i = 0; i < len; i++){
		c = r[i];
		if(c != '\n' && c != ' '){
			/* ordinary rune: remember where the word began */
			if(!inword){
				wordi = i;
				inword = 1;
			}
			continue;
		}
		/* a separator ends any word in progress */
		if(inword)
			emitword(b, r+wordi, i-wordi);
		if(c == ' ')
			continue;
		col = 0;
		if(b->n == 0)
			continue;	/* don't start with blank lines */
		if(b->n >= 2 && b->b[b->n-1] == '\n' && b->b[b->n-2] == '\n')
			continue;	/* at most one blank line in a row */
		growbytes(b, "\n", 1);
	}
	if(inword)
		emitword(b, r+wordi, i-wordi);
}
/*
 * Format the arguments print-style into a rune string and render it
 * into b with renderrunes.  Allocation failure is fatal.
 */
void
renderbytes(Bytes *b, char *fmt, ...)
{
	Rune *r;
	va_list arg;

	va_start(arg, fmt);
	r = runevsmprint(fmt, arg);
	va_end(arg);
	/* renderrunes calls runestrlen on r, so a nil result must not get through */
	if(r == nil)
		error("malloc failed");
	renderrunes(b, r);
	free(r);
}
/*
 * Return a freshly allocated copy of url cut back to its base:
 * everything before the last '/' that lies at or after the end of the
 * scheme://host prefix, or just that prefix if there is no such slash.
 * Returns nil if url is nil or does not match urlexpr.
 */
char*
baseurl(char *url)
{
	char *copy, *cut;
	long hostlen;
	Resub match[10];

	if(url == nil)
		return nil;
	if(urlprog == nil && (urlprog = regcomp(urlexpr)) == nil)
		error("can't compile URL regexp");
	memset(match, 0, sizeof match);
	if(regexec(urlprog, url, match, nelem(match)) == 0)
		return nil;
	/* length of the matched scheme://host prefix */
	hostlen = match[0].e.p - match[0].s.p;
	copy = estrdup(url);
	cut = strrchr(copy, '/');
	if(cut == nil || cut < &copy[hostlen])
		cut = &copy[hostlen];
	*cut = '\0';
	return copy;
}
/*
 * Turn the link target rhref into an allocated UTF-8 URL string.
 * If rhref is not itself an absolute URL (per baseurl) and the document
 * URL u->url has a base, the base is prepended, with a '/' inserted
 * when neither side supplies one.  A nil rhref yields "NULL URL".
 * The caller frees the result.
 */
char*
fullurl(URLwin *u, Rune *rhref)
{
	char *base, *href, *hrefbase;
	char *result;

	if(rhref == nil)
		return estrdup("NULL URL");
	href = runetobyte(rhref, runestrlen(rhref));
	hrefbase = baseurl(href);
	result = nil;
	if(hrefbase==nil && (base = baseurl(u->url))!=nil){
		result = estrdup(base);
		if(base[strlen(base)-1]!='/' && (href==nil || href[0]!='/'))
			result = eappend(result, "/", "");
		free(base);
	}
	if(href){
		if(result)
			result = eappend(result, "", href);
		else
			result = estrdup(href);
	}
	free(hrefbase);
	/* eappend and estrdup both copy href, so our copy can be released */
	free(href);
	if(result == nil)
		return estrdup("***unknown***");
	return result;
}
/*
 * Append a plain-text rendering of the item list items to t.
 * u supplies the document URL and anchor table used to expand links;
 * curanchor is the id of the anchor already announced, so that
 * consecutive items of the same anchor print its URL only once.
 * Images, form fields and anchor URLs are shown only when aflag is set.
 * Table cells and floats are rendered by recursive calls.
 */
void
render(URLwin *u, Bytes *t, Item *items, int curanchor)
{
Item *il;
Itext *it;
Ifloat *ifl;
Ispacer *is;
Itable *ita;
Iimage *im;
Anchor *a;
Table *tab;
Tablecell *cell;
char *href;
/* start with fresh word-wrapping state */
inword = 0;
col = 0;
wordi = 0;
for(il=items; il!=nil; il=il->next){
/* break flags on the item become newlines ahead of its content */
if(il->state & IFbrk)
renderbytes(t, "\n");
if(il->state & IFbrksp)
renderbytes(t, "\n");
switch(il->tag){
case Itexttag:
it = (Itext*)il;
renderrunes(t, it->s);
break;
case Iruletag:
/* a rule goes on a line of its own */
if(t->n>0 && t->b[t->n-1]!='\n')
renderbytes(t, "\n");
renderbytes(t, "=======\n");
break;
case Iimagetag:
if(!aflag)
break;
im = (Iimage*)il;
if(im->imsrc){
href = fullurl(u, im->imsrc);
renderbytes(t, "[image %s]", href);
free(href);
}
break;
case Iformfieldtag:
if(aflag)
renderbytes(t, "[formfield]");
break;
case Itabletag:
ita = (Itable*)il;
tab = ita->table;
/* cells are rendered one after another in list order */
for(cell=tab->cells; cell!=nil; cell=cell->next){
render(u, t, cell->content, curanchor);
}
if(t->n>0 && t->b[t->n-1]!='\n')
renderbytes(t, "\n");
break;
case Ifloattag:
ifl = (Ifloat*)il;
render(u, t, ifl->item, curanchor);
break;
case Ispacertag:
is = (Ispacer*)il;
if(is->spkind != ISPnull)
renderbytes(t, " ");
break;
default:
error("unknown item tag %d\n", il->tag);
}
/* on entering a different anchor, print its URL once (only with aflag) */
if(il->anchorid != 0 && il->anchorid!=curanchor){
for(a=u->docinfo->anchors; a!=nil; a=a->next)
if(aflag && a->index == il->anchorid){
href = fullurl(u, a->href);
renderbytes(t, "[%s]", href);
free(href);
break;
}
curanchor = il->anchorid;
}
}
/* make sure the rendered text ends with a newline */
if(t->n>0 && t->b[t->n-1]!='\n')
renderbytes(t, "\n");
}
/*
 * Render u->items to text and write the result to u->outfd.
 * A failed or short write is a fatal error.
 */
void
rerender(URLwin *u)
{
	Bytes *t;

	t = emalloc(sizeof(Bytes));
	render(u, t, u->items, 0);
	/* don't let output be lost silently */
	if(t->n && write(u->outfd, (char*)t->b, t->n) != t->n)
		error("write error: %r");
	free(t->b);
	free(t);
}
/*
 * Guess the character set of the HTML text s.
 * Somewhat of a hack: not a full parse, it only requires that a "<meta"
 * tag be present and then looks at the first "charset=" string.
 * Returns UTF_8 when the declared value starts with "utf-8" or "utf8"
 * (case-insensitive, optionally after a double quote); otherwise
 * returns defcharset, which is set to ISO_8859_1 on first use if unset.
 */
int
charset(char *s)
{
	char *cs;

	if(defcharset == 0)
		defcharset = ISO_8859_1;
	if(cistrstr(s, "<meta") == nil)
		return defcharset;
	cs = cistrstr(s, "charset=");
	if(cs == nil)
		return defcharset;
	cs += 8;	/* skip over "charset=" */
	if(*cs == '"')
		cs++;
	/* cistrncmp returns 0 on a match, so the result must be compared with 0 */
	if(cistrncmp(cs, "utf-8", 5) == 0 || cistrncmp(cs, "utf8", 4) == 0)
		return UTF_8;
	return defcharset;
}
/*
 * Parse the HTML bytes in b, storing the item list and document info
 * in u, then render them with rerender.
 */
void
rendertext(URLwin *u, Bytes *b)
{
	Rune *rurl;
	int chset;

	rurl = toStr((uchar*)u->url, strlen(u->url), ISO_8859_1);
	chset = charset((char*)b->b);
	u->items = parsehtml(b->b, b->n, rurl, u->type, chset, &u->docinfo);
	/* NOTE(review): rurl is not freed here; the free was commented out in the original */
	rerender(u);
}
/*
 * Release the parsed items and document info held by u, then u itself.
 * u->url is not freed here.
 */
void
freeurlwin(URLwin *u)
{
freeitems(u->items);
u->items = nil;
freedocinfo(u->docinfo);
u->docinfo = nil;
free(u);
}

71
src/cmd/htmlfmt/main.c Normal file
View file

@ -0,0 +1,71 @@
#include <u.h>
#include <libc.h>
#include <bio.h>
#include <draw.h>
#include <html.h>
#include "dat.h"
/* Document URL, set by -u; empty by default. */
char *url = "";
/* Nonzero when link, image and form-field annotations are wanted (-a, also implied by -u). */
int aflag;
/* Output line width in columns, set by -l or -w. */
int width = 70;
/* Default character set, set by -c; charset() fills in ISO_8859_1 if it is still 0. */
int defcharset;
void
usage(void)
{
fprint(2, "usage: htmlfmt [-c charset] [-u URL] [-a] [-l length] [file ...]\n");
exits("usage");
}
/*
 * htmlfmt: format each named HTML file (or standard input if none is
 * given) as plain text.  Stops at the first file that fails and exits
 * with that error string, or with nil on success.
 */
void
main(int argc, char *argv[])
{
int i, fd;
char *p, *err, *file;
char errbuf[ERRMAX];
ARGBEGIN{
case 'a':
aflag++;
break;
case 'c':
/* reuse charset() by wrapping the name in a synthetic meta tag */
p = smprint("<meta charset=\"%s\">", EARGF(usage()));
defcharset = charset(p);
free(p);
break;
case 'l': case 'w':
/* err is only borrowed here to hold the option argument */
err = EARGF(usage());
width = atoi(err);
if(width <= 0)
usage();
break;
case 'u':
url = EARGF(usage());
/* giving a URL also turns on link annotations */
aflag++;
break;
default:
usage();
}ARGEND
err = nil;
file = "<stdin>";
if(argc == 0)
err = loadhtml(0);
else
for(i=0; err==nil && i<argc; i++){
file = argv[i];
fd = open(file, OREAD);
if(fd < 0){
/* report the system error string for the failed open */
errstr(errbuf, sizeof errbuf);
err = errbuf;
break;
}
err = loadhtml(fd);
close(fd);
if(err)
break;
}
if(err)
fprint(2, "htmlfmt: processing %s: %s\n", file, err);
exits(err);
}

30
src/cmd/htmlfmt/mkfile Normal file
View file

@ -0,0 +1,30 @@
<$SYS9/$systype/$objtype/mkfile
TARG=htmlfmt
OFILES=\
main.$O\
html.$O\
util.$O\
HFILES=\
dat.h\
$SYS9/sys/include/html.h\
LIB=$SYS9/$systype/$objtype/lib/libbio.a\
$SYS9/$systype/$objtype/lib/libregexp.a\
$SYS9/$systype/$objtype/lib/libhtml.a\
$SYS9/$systype/$objtype/lib/lib9c.a
BIN=$SYS9/$systype/$objtype/bin
UPDATE=\
mkfile\
$HFILES\
${OFILES:%.$O=%.c}
<$SYS9/sys/src/cmd/mkone
CFLAGS=$CFLAGS
#$O.out: $OFILES
# $LD -o $target $LDFLAGS $OFILES

120
src/cmd/htmlfmt/util.c Normal file
View file

@ -0,0 +1,120 @@
#include <u.h>
#include <libc.h>
#include <bio.h>
#include <draw.h>
#include <html.h>
#include "dat.h"
/*
 * Allocate n zeroed bytes.  Allocation failure is reported through
 * error(), which does not return.
 */
void*
emalloc(ulong n)
{
	void *mem;

	if((mem = malloc(n)) == nil)
		error("can't malloc: %r");
	return memset(mem, 0, n);
}
/*
 * Resize the allocation p to n bytes and return the (possibly moved)
 * block.  Failure is reported through error(), which does not return.
 * A nil result for a zero-size request is not treated as a failure.
 */
void*
erealloc(void *p, ulong n)
{
	p = realloc(p, n);
	if(p == nil && n != 0)
		error("can't realloc: %r");
	return p;
}
char*
estrdup(char *s)
{
char *t;
t = emalloc(strlen(s)+1);
strcpy(t, s);
return t;
}
/*
 * Return a newly allocated string holding s followed by t.
 * Neither argument is freed.  Allocation failure is fatal.
 */
char*
estrstrdup(char *s, char *t)
{
	long slen, tlen;
	char *joined, *w;

	slen = strlen(s);
	tlen = strlen(t);
	/* plain malloc: every byte is written below, so zeroing is not needed */
	joined = malloc(slen+tlen+1);
	if(joined == nil)
		error("can't malloc: %r");
	w = joined;
	memmove(w, s, slen);
	w += slen;
	memmove(w, t, tlen);
	w += tlen;
	*w = '\0';
	return joined;
}
/*
 * Return a newly allocated string holding s, sep and t concatenated
 * (t may be nil, meaning empty).  s is freed; sep and t are not.
 * Allocation failure is fatal.
 */
char*
eappend(char *s, char *sep, char *t)
{
	long slen, seplen, tlen;
	char *joined, *w;

	if(t == nil)
		t = "";
	slen = strlen(s);
	seplen = strlen(sep);
	tlen = strlen(t);
	/* plain malloc: every byte is written below, so zeroing is not needed */
	joined = malloc(slen+seplen+tlen+1);
	if(joined == nil)
		error("can't malloc: %r");
	w = joined;
	memmove(w, s, slen);
	w += slen;
	memmove(w, sep, seplen);
	w += seplen;
	memmove(w, t, tlen);
	w += tlen;
	*w = '\0';
	free(s);
	return joined;
}
/*
 * Like eappend, but also frees t: returns a new string holding
 * s, sep and t, and releases both s and t.
 */
char*
egrow(char *s, char *sep, char *t)
{
s = eappend(s, sep, t);
free(t);
return s;
}
/*
 * Print a formatted diagnostic, prefixed with the program name and
 * followed by a newline, on file descriptor 2, then exit.
 * The exit status is the unformatted fmt string.  Never returns.
 */
void
error(char *fmt, ...)
{
	va_list arg;
	char buf[256];
	Fmt f;

	fmtfdinit(&f, 2, buf, sizeof buf);
	/* this program is htmlfmt (cf. the message printed by main), not Mail */
	fmtprint(&f, "htmlfmt: ");
	va_start(arg, fmt);
	fmtvprint(&f, fmt, arg);
	va_end(arg);
	fmtprint(&f, "\n");
	fmtfdflush(&f);
	exits(fmt);
}
/*
 * Append the ns bytes at s to b, enlarging the buffer as needed, and
 * keep the contents NUL-terminated (the NUL is not counted in b->n).
 * Allocation failure is fatal.
 */
void
growbytes(Bytes *b, char *s, long ns)
{
	long need;

	need = b->n + ns + 1;	/* new data plus the trailing NUL */
	if(need > b->nalloc){
		/* over-allocate so that small appends rarely reallocate */
		b->nalloc = b->n + ns + 8000;
		b->b = realloc(b->b, b->nalloc);
		if(b->b == nil)
			error("growbytes: can't realloc: %r");
	}
	memmove(&b->b[b->n], s, ns);
	b->n += ns;
	b->b[b->n] = '\0';
}

4238
src/libhtml/build.c Normal file

File diff suppressed because it is too large Load diff

163
src/libhtml/impl.h Normal file
View file

@ -0,0 +1,163 @@
// UTILS
typedef struct List List;
typedef struct Strlist Strlist;
// List of integers (and also generic list with next pointer at beginning)
struct List
{
List* next;	// following cell, nil at the end of the list
int val;	// the integer stored in this cell
};
// List of Rune strings; same layout as List, with next first
struct Strlist
{
Strlist* next;	// following cell, nil at the end of the list
Rune* val;	// the string stored in this cell
};
extern int _inclass(Rune c, Rune* cl);
extern int _listlen(List* l);
extern Rune* _ltoStr(int n);
extern List* _newlist(int val, List* rest);
extern Rune* _newstr(int n);
extern int _prefix(Rune* pre, Rune* s);
extern List* _revlist(List* l);
extern void _splitl(Rune* s, int n, Rune* cl, Rune** p1, int* n1, Rune** p2, int* n2);
extern void _splitr(Rune* s, int n, Rune* cl, Rune** p1, int* n1, Rune** p2, int* n2);
extern int _splitall(Rune* s, int n, Rune* cl, Rune** strarr, int* lenarr, int alen);
extern Rune* _Stradd(Rune*s1, Rune* s2, int n);
extern Rune* _Strclass(Rune* s, Rune* cl);
extern int _Strcmp(Rune* s1, Rune* s2);
extern Rune* _Strdup(Rune* s);
extern Rune* _Strdup2(Rune* s, Rune* t);
extern int _Streqn(Rune* s1, int n1, Rune* s2);
extern int _Strlen(Rune* s);
extern Rune* _Strnclass(Rune* s, Rune* cl, int n);
extern int _Strncmpci(Rune* s1, int n1, Rune* s2);
extern Rune* _Strndup(Rune* s, int n);
extern Rune* _Strnrclass(Rune* s, Rune* cl, int n);
extern Rune* _Strrclass(Rune* s, Rune* cl);
extern Rune* _Strsubstr(Rune* s, int start, int stop);
extern long _Strtol(Rune* s, Rune** eptr, int base);
extern void _trimwhite(Rune* s, int n, Rune** pans, int* panslen);
extern Rune notwhitespace[];
extern Rune whitespace[];
// STRINTTAB
typedef struct StringInt StringInt;
// Element of String-Int table (used for keyword lookup)
struct StringInt
{
Rune* key;	// the keyword; _lookup expects a table sorted by key
int val;	// the value associated with key
};
extern int _lookup(StringInt* t, int n, Rune* key, int keylen, int* pans);
extern StringInt* _makestrinttab(Rune** a, int n);
extern Rune* _revlookup(StringInt* t, int n, int val);
// Colors, in html format, not Plan 9 format. (RGB values in bottom 3 bytes)
enum {
White = 0xFFFFFF,
Black = 0x000000,
Blue = 0x0000CC,
};
// LEX
// HTML 4.0 tags (plus blink, nobr)
// sorted in lexical order; used as array indices
enum {
Notfound,
Comment,
Ta, Tabbr, Tacronym, Taddress, Tapplet, Tarea,
Tb, Tbase, Tbasefont, Tbdo, Tbig, Tblink,
Tblockquote, Tbody, Tbq, Tbr, Tbutton,
Tcaption, Tcenter, Tcite, Tcode, Tcol, Tcolgroup,
Tdd, Tdel, Tdfn, Tdir, Tdiv, Tdl, Tdt,
Tem,
Tfieldset, Tfont, Tform, Tframe, Tframeset,
Th1, Th2, Th3, Th4, Th5, Th6,
Thead, Thr, Thtml,
Ti, Tiframe, Timg, Tinput, Tins, Tisindex,
Tkbd,
Tlabel, Tlegend, Tli, Tlink,
Tmap, Tmenu, Tmeta,
Tnobr, Tnoframes, Tnoscript,
Tobject, Tol, Toptgroup, Toption,
Tp, Tparam, Tpre,
Tq,
Ts, Tsamp, Tscript, Tselect, Tsmall,
Tspan, Tstrike, Tstrong, Tstyle, Tsub, Tsup,
Ttable, Ttbody, Ttd, Ttextarea, Ttfoot,
Tth, Tthead, Ttitle, Ttr, Ttt,
Tu, Tul,
Tvar,
Numtags,
RBRA = Numtags,
Data = Numtags+RBRA
};
// HTML 4.0 tag attributes
// Keep sorted in lexical order
enum {
Aabbr, Aaccept_charset, Aaccess_key, Aaction,
Aalign, Aalink, Aalt, Aarchive, Aaxis,
Abackground, Abgcolor, Aborder,
Acellpadding, Acellspacing, Achar, Acharoff,
Acharset, Achecked, Acite, Aclass, Aclassid,
Aclear, Acode, Acodebase, Acodetype, Acolor,
Acols, Acolspan, Acompact, Acontent, Acoords,
Adata, Adatetime, Adeclare, Adefer, Adir, Adisabled,
Aenctype,
Aface, Afor, Aframe, Aframeborder,
Aheaders, Aheight, Ahref, Ahreflang, Ahspace, Ahttp_equiv,
Aid, Aismap,
Alabel, Alang, Alink, Alongdesc,
Amarginheight, Amarginwidth, Amaxlength,
Amedia, Amethod, Amultiple,
Aname, Anohref, Anoresize, Anoshade, Anowrap,
Aobject, Aonblur, Aonchange, Aonclick, Aondblclick,
Aonfocus, Aonkeypress, Aonkeyup, Aonload,
Aonmousedown, Aonmousemove, Aonmouseout,
Aonmouseover, Aonmouseup, Aonreset, Aonselect,
Aonsubmit, Aonunload,
Aprofile, Aprompt,
Areadonly, Arel, Arev, Arows, Arowspan, Arules,
Ascheme, Ascope, Ascrolling, Aselected, Ashape,
Asize, Aspan, Asrc, Astandby, Astart, Astyle, Asummary,
Atabindex, Atarget, Atext, Atitle, Atype,
Ausemap,
Avalign, Avalue, Avaluetype, Aversion, Avlink, Avspace,
Awidth,
Numattrs
};
// One attribute of a tag token
struct Attr
{
Attr* next; // in list of attrs for a token
int attid; // Aabbr, etc.
Rune* value;	// the attribute's value text
};
// One lexical token: either a tag or a run of data
struct Token
{
int tag; // Ta, etc
Rune* text; // text in Data, attribute text in tag
Attr* attr; // list of Attrs
int starti; // index into source buffer of token start
};
extern Rune** tagnames;
extern Rune** attrnames;
extern void _freetokens(Token* tarray, int n);
extern Token* _gettoks(uchar* data, int datalen, int chset, int mtype, int* plen);
extern int _tokaval(Token* t, int attid, Rune** pans, int xfer);
#pragma varargck type "T" Token*
#include "runetab.h"

1384
src/libhtml/lex.c Normal file

File diff suppressed because it is too large Load diff

22
src/libhtml/mkfile Normal file
View file

@ -0,0 +1,22 @@
<$SYS9/$systype/$objtype/mkfile
LIB=$LIB9/libhtml.a
OFILES=\
build.$O\
lex.$O\
strinttab.$O\
utils.$O\
runetab.$O\
HFILES=\
$SYS9/sys/include/html.h\
impl.h\
UPDATE=\
mkfile\
$HFILES\
${OFILES:%.$O=%.c}\
${LIB:$SYS9/$systype/$objtype/%=$SYS9/$systype/386/%}\
<$SYS9/sys/src/cmd/mksyslib

83
src/libhtml/runetab.c Normal file
View file

@ -0,0 +1,83 @@
#include <u.h>
#include <libc.h>
#include <draw.h>
#include <html.h>
#include "impl.h"
// Rune-string versions of the constants below; filled in by runetabinit
// and indexed through the L() macro in runetab.h.
Rune **runeconsttab;
// ASCII source strings for runeconsttab.
// NOTE(review): the order presumably has to match the L... enum in
// runetab.h entry for entry -- confirm before adding or reordering.
char *_runeconsttab[] = {
" ",
" ",
"",
"#",
"+",
", ",
"-",
"-->",
"1",
"<",
">",
"?",
"Index search terms:",
"Reset",
"Submit",
"^0-9",
"_ISINDEX_",
"_blank",
"_fr",
"_no_name_submit_",
"_parent",
"_self",
"_top",
"application/x-www-form-urlencoded",
"circle",
"cm",
"content-script-type",
"disc",
"em",
"in",
"javascript",
"jscript",
"jscript1.1",
"mm",
"none",
"pi",
"pt",
"refresh",
"select",
"square",
"textarea",
};
Rune**
cvtstringtab(char **tab, int n)
{
int i;
Rune **rtab;
rtab = emalloc(n*sizeof(rtab[0]));
for(i=0; i<n; i++)
rtab[i] = toStr(tab[i], strlen(tab[i]), US_Ascii);
return rtab;
}
// Convert the n-entry ASCII keyword table tab into an emalloc'd
// StringInt table with Rune-string keys and the same values and order.
StringInt*
cvtstringinttab(AsciiInt *tab, int n)
{
	int i;
	StringInt *stab;

	stab = emalloc(n*sizeof(stab[0]));
	for(i=0; i<n; i++){
		// toStr takes uchar*; cast explicitly rather than rely on an implicit conversion
		stab[i].key = toStr((uchar*)tab[i].key, strlen(tab[i].key), US_Ascii);
		stab[i].val = tab[i].val;
	}
	return stab;
}
// Build runeconsttab from the ASCII table _runeconsttab.
// Each call allocates a fresh table.
void
runetabinit(void)
{
runeconsttab = cvtstringtab(_runeconsttab, nelem(_runeconsttab));
return;
}

59
src/libhtml/runetab.h Normal file
View file

@ -0,0 +1,59 @@
/* ASCII counterpart of StringInt; cvtstringinttab converts a table of these. */
typedef struct AsciiInt AsciiInt;
struct AsciiInt {
char* key;	/* keyword as a plain ASCII string */
int val;	/* value associated with key */
};
enum {
Ltab2space,
Lspace,
Lempty,
Lhash,
Lplus,
Lcommaspace,
Lminus,
Larrow,
Lone,
Llt,
Lgt,
Lquestion,
Lindex,
Lreset,
Lsubmit,
Lnot0to9,
Lisindex,
L_blank,
Lfr,
Lnoname,
L_parent,
L_self,
L_top,
Lappl_form,
Lcircle,
Lcm,
Lcontent,
Ldisc,
Lem,
Lin,
Ljavascript,
Ljscript,
Ljscript1,
Lmm,
Lnone,
Lpi,
Lpt,
Lrefresh,
Lselect,
Lsquare,
Ltextarea,
};
#define L(x) runeconsttab[(x)]
extern Rune **runeconsttab;
/* XXX: for unix port only */
Rune **cvtstringtab(char**, int);
StringInt *cvtstringinttab(AsciiInt*, int);
void runetabinit(void);

64
src/libhtml/strinttab.c Normal file
View file

@ -0,0 +1,64 @@
#include <u.h>
#include <libc.h>
#include <draw.h>
#include <html.h>
#include "impl.h"
// Binary search, ignoring case, for key[0:keylen] among the keys of t[0:n].
// t must be sorted in increasing lexicographic order of key.
// Returns 1 and stores the matching val in *pans if found;
// returns 0 (leaving *pans alone) if not.
int
_lookup(StringInt* t, int n, Rune* key, int keylen, int* pans)
{
	int lo, hi, mid, c;

	for(lo = 0, hi = n - 1; lo <= hi; ) {
		mid = (lo + hi)/2;
		c = _Strncmpci(key, keylen, t[mid].key);
		if(c == 0) {
			*pans = t[mid].val;
			return 1;
		}
		if(c > 0)
			lo = mid + 1;
		else
			hi = mid - 1;
	}
	return 0;
}
// Linear search of t[0:n] for an entry whose val equals val.
// Returns the key of the first such entry, nil if there is none.
Rune*
_revlookup(StringInt* t, int n, int val)
{
	int k;

	k = 0;
	while(k < n) {
		if(t[k].val == val)
			return t[k].key;
		k++;
	}
	return nil;
}
// Build an emalloc'd StringInt table from a[0:n] in which each string
// maps to its own index.  The keys are shared with a, not copied.
// Asserts that the strings are in nondecreasing order.
StringInt*
_makestrinttab(Rune** a, int n)
{
	StringInt* tab;
	int k;

	tab = (StringInt*)emalloc(n * sizeof(StringInt));
	for(k = 0; k < n; k++) {
		tab[k].key = a[k];
		tab[k].val = k;
		if(k > 0) {
			assert(runestrcmp(a[k], a[k - 1]) >= 0);
		}
	}
	return tab;
}

591
src/libhtml/utils.c Normal file
View file

@ -0,0 +1,591 @@
#include <u.h>
#include <libc.h>
#include <draw.h>
#include <html.h>
#include "impl.h"
// Character class (in the syntax understood by _inclass) of space, tab, newline and carriage return
Rune whitespace[] = { ' ', '\t', '\n', '\r', '\0' };
// The complement of whitespace: the leading '^' negates the class
Rune notwhitespace[] = { '^', ' ', '\t', '\n', '\r' , '\0'};
// All lists start out like List structure.
// List itself can be used as list of int.

// Return the number of cells in list l (0 for nil).
int
_listlen(List* l)
{
	int count;

	for(count = 0; l != nil; l = l->next)
		count++;
	return count;
}
// Cons: return a new emalloc'd cell holding val, linked in front of rest.
List*
_newlist(int val, List* rest)
{
	List* cell;

	cell = (List*)emalloc(sizeof(List));
	cell->next = rest;
	cell->val = val;
	return cell;
}
// Reverse list l in place by relinking its cells; returns the new head.
List*
_revlist(List* l)
{
	List* rev;
	List* cur;

	rev = nil;
	while((cur = l) != nil) {
		l = cur->next;
		cur->next = rev;
		rev = cur;
	}
	return rev;
}
// The next few routines take a "character class" as argument.
// e.g., "a-zA-Z", or "^ \t\n"
// (ranges indicated by - except in first position;
// ^ in first position means "not in" the following class)
// Splitl splits s[0:n] just before the first character of class cl.
// The two pieces are returned in (*p1, *n1) and (*p2, *n2).
// If no character matches, the whole string is the first piece
// and the second is (nil, 0).
// Note: the answers point into the original string.
void
_splitl(Rune* s, int n, Rune* cl, Rune** p1, int* n1, Rune** p2, int* n2)
{
	Rune* hit;

	hit = _Strnclass(s, cl, n);
	*p1 = s;
	if(hit != nil) {
		*p2 = hit;
		*n1 = hit-s;
		*n2 = n-*n1;
		return;
	}
	*n1 = n;
	*p2 = nil;
	*n2 = 0;
}
// Splitr splits s[0:n] just after the last character of class cl.
// The two pieces are returned in (*p1, *n1) and (*p2, *n2).
// If no character matches, the first piece is (nil, 0)
// and the whole string is the second.
// Note: the answers point into the original string.
void
_splitr(Rune* s, int n, Rune* cl, Rune** p1, int* n1, Rune** p2, int* n2)
{
	Rune* hit;

	hit = _Strnrclass(s, cl, n);
	if(hit != nil) {
		*p1 = s;
		*p2 = hit+1;
		*n1 = *p2-s;
		*n2 = n-*n1;
		return;
	}
	*p1 = nil;
	*n1 = 0;
	*p2 = s;
	*n2 = n;
}
// Splitall cuts s[0:n] into the pieces lying between characters of class cl.
// Every piece has nonzero length.
// At most alen pieces are found; pointers to their starts are stored
// in strarr and their lengths in lenarr.
// Returns the number of pieces found.
int
_splitall(Rune* s, int n, Rune* cl, Rune** strarr, int* lenarr, int alen)
{
	int count;
	Rune* start;
	Rune* stop;
	Rune* end;

	if(s == nil || n == 0)
		return 0;
	end = s+n;
	start = s;
	for(count = 0; start < end && count < alen; count++) {
		// skip over separators in front of the next piece
		while(start < end && _inclass(*start, cl))
			start++;
		if(start == end)
			break;
		stop = _Strnclass(start, cl, end-start);
		if(stop == nil)
			stop = end;
		assert(stop > start && stop <= end);
		strarr[count] = start;
		lenarr[count] = stop-start;
		start = stop;
	}
	return count;
}
// Find the part of s[0:n] that excludes leading and trailing whitespace,
// and return a pointer to it in *pans and its length in *panslen.
// The pointer refers into s.  If s[0:n] is empty or all whitespace,
// the answer is (nil, 0).
void
_trimwhite(Rune* s, int n, Rune** pans, int* panslen)
{
	Rune* p;
	Rune* q;

	p = nil;
	if(n > 0) {
		p = _Strnclass(s, notwhitespace, n);
		if(p != nil) {
			q = _Strnrclass(s, notwhitespace, n);
			assert(q != nil);
			n = q+1-p;
		}
	}
	// never hand back a nil pointer together with a nonzero length
	if(p == nil)
		n = 0;
	*pans = p;
	*panslen = n;
}
// _Strclass returns a pointer to the first element of the
// null-terminated string s that is a member of class cl,
// nil if there is none or if s is nil.
Rune*
_Strclass(Rune* s, Rune* cl)
{
	Rune* p;

	// tolerate nil like _Strrclass does
	if(s == nil)
		return nil;
	for(p = s; *p != 0; p++)
		if(_inclass(*p, cl))
			return p;
	return nil;
}
// _Strnclass returns a pointer to the first element of s[0:n] that is
// a member of class cl, nil if there is none or if s is nil.
// The scan also stops at a null rune.
Rune*
_Strnclass(Rune* s, Rune* cl, int n)
{
	Rune* p;

	// tolerate nil like _Strnrclass does
	if(s == nil)
		return nil;
	for(p = s; n-- && *p != 0; p++)
		if(_inclass(*p, cl))
			return p;
	return nil;
}
// _Strrclass returns a pointer to the last element of the
// null-terminated string s that is a member of class cl,
// nil if there is none (or s is nil or empty).
Rune*
_Strrclass(Rune* s, Rune* cl)
{
	Rune* p;

	if(s == nil || *s == 0)
		return nil;
	for(p = s + runestrlen(s) - 1; p >= s; p--)
		if(_inclass(*p, cl))
			return p;
	return nil;
}
// _Strnrclass returns a pointer to the last element of s[0:n] that is
// a member of class cl, nil if there is none
// (or s is nil, s starts with a null rune, or n is 0).
Rune*
_Strnrclass(Rune* s, Rune* cl, int n)
{
	Rune* p;

	if(s == nil || *s == 0 || n == 0)
		return nil;
	for(p = s + n - 1; p >= s; p--)
		if(_inclass(*p, cl))
			return p;
	return nil;
}
// Is c in the class cl?
// cl is a null-terminated class string: a list of characters, with
// "a-b" denoting a range when the '-' is neither first nor last, and a
// leading '^' inverting the result.  Returns 1 or 0; an empty or nil
// class contains nothing.
int
_inclass(Rune c, Rune* cl)
{
int n;
int ans;
int negate;
int i;
n = _Strlen(cl);
if(n == 0)
return 0;
ans = 0;
negate = 0;
// a leading '^' is consumed and remembered
if(cl[0] == '^') {
negate = 1;
cl++;
n--;
}
for(i = 0; i < n; i++) {
// a '-' with a character on each side denotes the range cl[i-1]..cl[i+1]
if(cl[i] == '-' && i > 0 && i < n - 1) {
if(c >= cl[i - 1] && c <= cl[i + 1]) {
ans = 1;
break;
}
// skip the upper bound, which has now been dealt with
i++;
}
else if(c == cl[i]) {
ans = 1;
break;
}
}
if(negate)
ans = !ans;
return ans;
}
// Is pre a prefix of s?  Returns 1 or 0.
// A nil or empty pre is a prefix of everything.
int
_prefix(Rune* pre, Rune* s)
{
	int prelen;
	int k;

	prelen = _Strlen(pre);
	if(_Strlen(s) < prelen)
		return 0;
	for(k = 0; k < prelen && pre[k] == s[k]; k++)
		;
	return k == prelen;
}
// Number of runes in (null-terminated) s; a nil s counts as empty.
int
_Strlen(Rune* s)
{
	return (s == nil) ? 0 : runestrlen(s);
}
// Compare s1 and s2 lexicographically: negative, zero or positive as
// s1 is less than, equal to or greater than s2.
// A nil string compares equal to an empty one.
int
_Strcmp(Rune *s1, Rune *s2)
{
	if(s1 != nil && s2 != nil)
		return runestrcmp(s1, s2);
	if(s1 == nil) {
		if(s2 == nil || *s2 == 0)
			return 0;
		return -1;
	}
	// only s2 is nil
	return (*s1 == 0) ? 0 : 1;
}
// Like Strcmp, but compares exactly n1 chars of s1 (s1 is assumed to
// have at least that many) against the null-terminated s2.
// The match is case-insensitive, on the assumption that s2 contains no
// chars in [A-Z], only their lowercase versions.
// (Used for in-place keyword lookup, where s2 comes from a keyword
// list and s1 is some substring, possibly mixed-case, of a buffer.)
int
_Strncmpci(Rune *s1, int n1, Rune *s2)
{
	Rune a, b;

	while(n1-- != 0) {
		a = *s1++;
		b = *s2++;
		if(a >= 'A' && a <= 'Z')
			a = a - 'A' + 'a';
		if(a != b)
			return (a > b) ? 1 : -1;
	}
	// s1 is used up: equal only if s2 ends here as well
	return (*s2 == 0) ? 0 : -1;
}
// Return an emalloc'd copy of the null-terminated string s.
// Returns nil if s is nil (or, via _Strndup, empty).
Rune*
_Strdup(Rune* s)
{
	return (s != nil) ? _Strndup(s, runestrlen(s)) : nil;
}
// Return an emalloc'd, null-terminated copy of the first n chars of s
// (s is assumed to be at least that long).
// Returns nil if n <= 0.
Rune*
_Strndup(Rune* s, int n)
{
	Rune* copy;

	if(n > 0) {
		copy = _newstr(n);
		memmove(copy, s, n*sizeof(Rune));
		copy[n] = 0;
		return copy;
	}
	return nil;
}
// emalloc enough room for n Runes, plus 1 null terminator.
// (Not initialized to anything.)
// NOTE(review): whether the memory really is uninitialized depends on the
// emalloc this library links against -- callers should not rely on either.
Rune*
_newstr(int n)
{
return (Rune*)emalloc((n+1)*sizeof(Rune));
}
// Return an emalloc'd string holding s followed by t.
// Either may be nil (treated as empty); returns nil if both are empty.
Rune*
_Strdup2(Rune* s, Rune* t)
{
	int slen, tlen;
	Rune* joined;
	Rune* end;

	slen = _Strlen(s);
	tlen = _Strlen(t);
	if(slen+tlen == 0)
		return nil;
	joined = _newstr(slen+tlen);
	end = _Stradd(_Stradd(joined, s, slen), t, tlen);
	*end = 0;
	return joined;
}
// Return an emalloc'd copy of the substring s[start:stop],
// nil if the substring is empty.
Rune*
_Strsubstr(Rune* s, int start, int stop)
{
	return (start == stop) ? nil : _Strndup(s+start, stop-start);
}
// Copy n chars from s2 to s1 and return s1+n, the place to continue
// appending.  Nothing is touched when n is 0.
Rune*
_Stradd(Rune* s1, Rune* s2, int n)
{
	if(n != 0) {
		memmove(s1, s2, n*sizeof(Rune));
		s1 += n;
	}
	return s1;
}
// Like strtol, but converting from a Rune* string.
// Skips leading white space, accepts an optional sign and, when base is
// 0 or 16, an optional 0x/0X prefix (base 0 also takes a leading 0 to
// mean octal).  If endptr is not nil, *endptr is set to the first rune
// not converted, or to nptr if no digits were found.
// On overflow returns LONG_MAX, or LONG_MIN for a negative number.
long
_Strtol(Rune* nptr, Rune** endptr, int base)
{
	Rune* p;
	long n;
	int c, ovfl, v, neg, ndig;

	p = nptr;
	neg = 0;
	n = 0;
	ndig = 0;
	ovfl = 0;

	/*
	 * White space
	 */
	for(;;p++){
		switch(*p){
		case ' ':
		case '\t':
		case '\n':
		case '\f':
		case '\r':
		case '\v':
			continue;
		}
		break;
	}

	/*
	 * Sign
	 */
	if(*p=='-' || *p=='+')
		if(*p++ == '-')
			neg = 1;

	/*
	 * Base
	 */
	if(base==0){
		if(*p != '0')
			base = 10;
		else{
			base = 8;
			if(p[1]=='x' || p[1]=='X'){
				p += 2;
				base = 16;
			}
		}
	}else if(base==16 && *p=='0'){
		if(p[1]=='x' || p[1]=='X')
			p += 2;
	}else if(base<0 || 36<base)
		goto Return;

	/*
	 * Non-empty sequence of digits
	 */
	for(;; p++,ndig++){
		c = *p;
		v = base;
		if('0'<=c && c<='9')
			v = c - '0';
		else if('a'<=c && c<='z')
			v = c - 'a' + 10;
		else if('A'<=c && c<='Z')
			v = c - 'A' + 10;
		if(v >= base)
			break;
		/* after an overflow, keep consuming digits but stop accumulating */
		if(ovfl)
			continue;
		/*
		 * Test before multiplying: signed overflow is undefined, and a
		 * wrapped product can still come out larger than n.
		 * base is at least 1 here, so the division is safe.
		 */
		if(n > (LONG_MAX - v)/base){
			ovfl = 1;
			continue;
		}
		n = n*base + v;
	}

Return:
	if(ndig == 0)
		p = nptr;
	if(endptr)
		*endptr = p;
	if(ovfl){
		if(neg)
			return LONG_MIN;
		return LONG_MAX;
	}
	if(neg)
		return -n;
	return n;
}
// Decode one rune from the nleft bytes at buf into *pch and return the
// number of bytes consumed (always at least 1).  If those bytes do not
// hold a complete UTF-8 sequence, yield Runeerror and consume one byte
// rather than let chartorune read past the end of the buffer.
static int
utfstep(Rune* pch, uchar* buf, int nleft)
{
	if(!fullrune((char*)buf, nleft)) {
		*pch = Runeerror;
		return 1;
	}
	return chartorune(pch, (char*)buf);
}

// Convert buf[0:n], bytes whose character set is chset,
// into an emalloc'd null-terminated Unicode string.
// US_Ascii and ISO_8859_1 bytes map one-to-one onto runes;
// UTF_8 is decoded.  Any other chset is an assertion failure.
Rune*
toStr(uchar* buf, int n, int chset)
{
	int i;
	int m;
	Rune ch;
	Rune* ans;

	switch(chset) {
	case US_Ascii:
	case ISO_8859_1:
		ans = (Rune*)emalloc((n+1)*sizeof(Rune));
		for(i = 0; i < n; i++)
			ans[i] = buf[i];
		ans[n] = 0;
		break;

	case UTF_8:
		// first pass: count the runes so the result can be sized exactly
		m = 0;
		for(i = 0; i < n; ) {
			i += utfstep(&ch, buf+i, n-i);
			m++;
		}
		ans = (Rune*)emalloc((m+1)*sizeof(Rune));
		// second pass: decode into the result
		m = 0;
		for(i = 0; i < n; ) {
			i += utfstep(&ch, buf+i, n-i);
			ans[m++] = ch;
		}
		ans[m] = 0;
		break;

	default:
		ans = nil;
		assert(0);
	}
	return ans;
}
// Convert buf[0:n], Unicode characters,
// into an emalloc'd null-terminated string in character set chset.
// For US_Ascii and ISO_8859_1, characters above the set's limit
// (127 and 255 respectively) are replaced by 0x80.
// Any chset other than those two and UTF_8 is an assertion failure.
uchar*
fromStr(Rune* buf, int n, int chset)
{
	uchar* out;
	uchar* w;
	uchar scratch[UTFmax];
	int k, limit, nbytes;
	Rune r;

	out = nil;
	if(chset == US_Ascii || chset == ISO_8859_1) {
		limit = 255;
		if(chset == US_Ascii)
			limit = 127;
		out = (uchar*)emalloc(n+1);
		for(k = 0; k < n; k++) {
			r = buf[k];
			if(r > limit)
				r = 0x80;
			out[k] = r;
		}
		out[n] = 0;
	}
	else if(chset == UTF_8) {
		// first pass: measure the encoded length
		nbytes = 0;
		for(k = 0; k < n; k++)
			nbytes += runetochar((char*)scratch, &buf[k]);
		out = (uchar*)emalloc(nbytes+1);
		// second pass: encode into the result
		w = out;
		for(k = 0; k < n; k++)
			w += runetochar((char*)w, &buf[k]);
		*w = 0;
	}
	else {
		assert(0);
	}
	return out;
}
// Return the decimal representation of n as an emalloc'd Rune string.
Rune*
_ltoStr(int n)
{
	uchar digits[20];
	int len;

	len = snprint((char*)digits, sizeof(digits), "%d", n);
	return toStr(digits, len, US_Ascii);
}