Import version of libhtml that might actually work with ANSI C.

This commit is contained in:
wkj 2004-04-06 19:06:52 +00:00
parent 3e3817f7c8
commit 7cf289ca89
13 changed files with 7206 additions and 0 deletions

50
src/cmd/htmlfmt/dat.h Normal file
View file

@ -0,0 +1,50 @@
/* Short type names for the structures defined below. */
typedef struct Bytes Bytes;
typedef struct URLwin URLwin;
/*
 * Size constants.  Neither is referenced by the htmlfmt sources that
 * accompany this header.
 * NOTE(review): presumably inherited from the program this code was
 * derived from -- confirm before relying on or removing them.
 */
enum
{
STACK = 8192,
EVENTSIZE = 256,
};
/*
 * Growable byte buffer.  growbytes() appends to it and keeps the
 * contents NUL-terminated.
 */
struct Bytes
{
uchar *b;	/* the data; nil until something has been appended */
long n;	/* bytes in use, not counting the terminating NUL */
long nalloc;	/* bytes allocated for b */
};
/* State for one HTML document being formatted. */
struct URLwin
{
int infd;	/* descriptor the document is read from */
int outfd;	/* descriptor rerender() writes the formatted text to */
int type;	/* media type handed to parsehtml(); loadhtml() uses TextHtml */
char *url;	/* document URL; relative links are resolved against it */
Item *items;	/* item list returned by parsehtml() */
Docinfo *docinfo;	/* filled in by parsehtml(); render() walks its anchors */
};
/* Globals defined in main.c. */
extern char* url;	/* document URL (-u); "" by default */
extern int aflag;	/* nonzero: render() adds link, image and form-field annotations */
extern int width;	/* output line length (-l or -w); 70 by default */
extern int defcharset;	/* what charset() returns when it does not recognize one */
/* html.c */
extern char* loadhtml(int);
/* NOTE(review): no definition of readfile appears in the sources accompanying this header. */
extern char* readfile(char*, char*, int*);
extern int charset(char*);
/* util.c: allocation and string helpers; they call error() when out of memory. */
extern void* emalloc(ulong);
extern char* estrdup(char*);
extern char* estrstrdup(char*, char*);
extern char* egrow(char*, char*, char*);
extern char* eappend(char*, char*, char*);
/* util.c: print a formatted message on descriptor 2 and exit. */
extern void error(char*, ...);
extern void growbytes(Bytes*, char*, long);
/* html.c */
extern void rendertext(URLwin*, Bytes*);
extern void rerender(URLwin*);
extern void freeurlwin(URLwin*);
/* Tell the Plan 9 compiler that error's format string is its first argument. */
#pragma varargck argpos error 1

331
src/cmd/htmlfmt/html.c Normal file
View file

@ -0,0 +1,331 @@
#include <u.h>
#include <libc.h>
#include <bio.h>
#include <draw.h>
#include <regexp.h>
#include <html.h>
#include <ctype.h>
#include "dat.h"
/*
 * Matches the scheme://host prefix of an absolute URL.  baseurl()
 * compiles it into urlprog the first time it is needed.
 */
char urlexpr[] = "^(https?|ftp|file|gopher|mailto|news|nntp|telnet|wais|prospero)://([a-zA-Z0-9_@\\-]+([.:][a-zA-Z0-9_@\\-]+)*)";
Reprog *urlprog;	/* compiled urlexpr; nil until baseurl() compiles it */
/* Word-wrapping state shared by renderrunes() and emitword(); render() resets all three. */
int inword = 0;	/* nonzero while renderrunes() is inside a word */
int col = 0;	/* runes emitted so far on the current output line */
int wordi = 0;	/* index of the current word's first rune in the string being scanned */
char*
loadhtml(int fd)
{
URLwin *u;
Bytes *b;
int n;
char buf[4096];
u = emalloc(sizeof(URLwin));
u->infd = fd;
u->outfd = 1;
u->url = estrdup(url);
u->type = TextHtml;
b = emalloc(sizeof(Bytes));
while((n = read(fd, buf, sizeof buf)) > 0)
growbytes(b, buf, n);
if(b->b == nil)
return nil; /* empty file */
rendertext(u, b);
freeurlwin(u);
return nil;
}
/*
 * Format the first n runes of r into a newly allocated char string.
 * A count of zero yields an empty string.  Calls error() if smprint
 * cannot allocate the result.
 */
char*
runetobyte(Rune *r, int n)
{
	char *out;

	if(n != 0){
		out = smprint("%.*S", n, r);
		if(out == nil)
			error("malloc failed");
		return out;
	}
	/* emalloc zeroes its result, so one byte is the empty string */
	return emalloc(1);
}
/*
 * Report whether rune c is ASCII closing punctuation, i.e. one of
 *	. , : ; ' " ) ] } > ! ?
 * emitword() uses this to avoid putting a space before such a character.
 *
 * The comparison is done on the full int value.  Looking c up with
 * strchr would narrow it to a char first, so a rune such as U+043F
 * (low byte 0x3F, '?') or U+0421 (low byte 0x21, '!') would be taken
 * for punctuation, and 0 would match the string's terminator.
 *
 * Returns 1 for a match, 0 otherwise.
 */
int
closingpunct(int c)
{
	static const char punct[] = ".,:;'\")]}>!?";
	const char *p;

	for(p = punct; *p != '\0'; p++)
		if(c == *p)
			return 1;
	return 0;
}
/*
 * Append the nr-rune word at r to b, wrapping at the global width.
 *
 * A separating space is wanted when b is non-empty, does not already
 * end in white space, and the word does not start with closing
 * punctuation.  If the word (plus that space) would run past width, a
 * newline is emitted instead and the column restarts at 0.  col counts
 * runes, not bytes.  Clears inword; does nothing when nr is 0.
 */
void
emitword(Bytes *b, Rune *r, int nr)
{
	char *s;
	int space;

	if(nr == 0)
		return;
	s = smprint("%.*S", nr, r);
	if(s == nil)
		error("malloc failed");	/* otherwise strlen(nil) below */
	space = (b->n>0) && !isspace(b->b[b->n-1]) && !closingpunct(r[0]);
	if(col>0 && col+space+nr > width){
		growbytes(b, "\n", 1);
		space = 0;
		col = 0;
	}
	if(space && col>0){
		growbytes(b, " ", 1);
		col++;
	}
	growbytes(b, s, strlen(s));
	col += nr;
	free(s);
	inword = 0;
}
/*
 * Split the NUL-terminated rune string r into words at spaces and
 * newlines and hand each word to emitword().  A newline also resets the
 * column and is copied to b, except that output never starts with a
 * newline and never gets more than two in a row.
 */
void
renderrunes(Bytes *b, Rune *r)
{
	int pos, len;
	Rune c;

	len = runestrlen(r);
	for(pos = 0; pos < len; pos++){
		c = r[pos];
		if(c != '\n' && c != ' '){
			/* ordinary rune: note where a new word begins */
			if(!inword){
				wordi = pos;
				inword = 1;
			}
			continue;
		}

		/* a separator ends any word in progress */
		if(inword)
			emitword(b, r+wordi, pos-wordi);
		if(c == ' ')
			continue;

		col = 0;
		if(b->n == 0)
			continue;	/* don't start with blank lines */
		if(b->n >= 2 && b->b[b->n-1] == '\n' && b->b[b->n-2] == '\n')
			continue;	/* already one blank line */
		growbytes(b, "\n", 1);
	}
	if(inword)
		emitword(b, r+wordi, pos-wordi);
}
/*
 * Format the arguments print-style into a rune string and pass it
 * through renderrunes(), so the text is word-wrapped like document
 * text.  Calls error() if the formatted string cannot be allocated.
 */
void
renderbytes(Bytes *b, char *fmt, ...)
{
	Rune *r;
	va_list arg;

	va_start(arg, fmt);
	r = runevsmprint(fmt, arg);
	va_end(arg);
	if(r == nil)
		error("malloc failed");	/* renderrunes would dereference nil */
	renderrunes(b, r);
	free(r);
}
/*
 * Return the directory part of an absolute URL as a newly allocated
 * string, or nil if url is nil or does not start with a scheme://host
 * prefix matching urlexpr.
 *
 * The result is url cut at its last '/', provided that slash lies at or
 * beyond the end of the scheme://host match; otherwise it is cut at the
 * end of the match.  So "http://h/a/b.html" gives "http://h/a", and both
 * "http://h/" and "http://h" give "http://h".  The caller frees it.
 */
char*
baseurl(char *url)
{
	char *base, *slash;
	long hostlen;
	Resub rs[10];

	if(url == nil)
		return nil;
	if(urlprog == nil){
		urlprog = regcomp(urlexpr);
		if(urlprog == nil)
			error("can't compile URL regexp");
	}
	memset(rs, 0, sizeof rs);
	/* 0 is no match; a negative value is an internal failure with rs unset */
	if(regexec(urlprog, url, rs, nelem(rs)) <= 0)
		return nil;
	hostlen = rs[0].e.p - rs[0].s.p;
	base = estrdup(url);
	slash = strrchr(base, '/');
	if(slash != nil && slash >= &base[hostlen])
		*slash = '\0';
	else
		base[hostlen] = '\0';
	return base;
}
/*
 * Turn the link target rhref into a printable URL, returned as a newly
 * allocated string that the caller frees.
 *
 * If rhref already begins with scheme://host it is returned unchanged.
 * Otherwise, when the document URL u->url has a base (see baseurl()),
 * the result is that base followed by rhref, with a '/' inserted
 * between them unless the base ends in one or rhref starts with one.
 * If neither has a base, rhref is returned as is.  A nil rhref yields
 * the string "NULL URL".
 */
char*
fullurl(URLwin *u, Rune *rhref)
{
	char *base, *href, *hrefbase, *result;

	if(rhref == nil)
		return estrdup("NULL URL");
	/* runetobyte never returns nil: it calls error() on failure */
	href = runetobyte(rhref, runestrlen(rhref));
	hrefbase = baseurl(href);
	result = nil;
	if(hrefbase == nil && (base = baseurl(u->url)) != nil){
		result = estrdup(base);
		if(base[strlen(base)-1] != '/' && href[0] != '/')
			result = eappend(result, "/", "");
		free(base);
	}
	if(result != nil)
		result = eappend(result, "", href);	/* eappend frees the old result */
	else
		result = estrdup(href);
	free(href);	/* the temporary conversion is no longer needed */
	free(hrefbase);
	return result;
}
/* End the current output line unless t is empty or already ends in a newline. */
static void
finishline(Bytes *t)
{
	if(t->n>0 && t->b[t->n-1]!='\n')
		renderbytes(t, "\n");
}

/*
 * Append a plain-text rendering of the item list to t.
 *
 * Text items are word-wrapped, rules become a line of '=', table cells
 * and floats are rendered recursively, and non-null spacers become a
 * space.  When aflag is set, images and form fields are shown as
 * bracketed notes, and the first item seen for each new anchor is
 * followed by the anchor's URL in brackets.  curanchor is the id of the
 * anchor the caller is already inside (0 for none).  The wrapping state
 * (inword, col, wordi) is reset on entry, including on recursive calls,
 * and the output always ends with a newline if anything was written.
 */
void
render(URLwin *u, Bytes *t, Item *items, int curanchor)
{
	Item *it;
	Anchor *anc;
	Tablecell *c;
	char *link;

	inword = 0;
	col = 0;
	wordi = 0;

	for(it = items; it != nil; it = it->next){
		if(it->state & IFbrk)
			renderbytes(t, "\n");
		if(it->state & IFbrksp)
			renderbytes(t, "\n");

		switch(it->tag){
		case Itexttag:
			renderrunes(t, ((Itext*)it)->s);
			break;
		case Iruletag:
			finishline(t);
			renderbytes(t, "=======\n");
			break;
		case Iimagetag:
			if(aflag && ((Iimage*)it)->imsrc){
				link = fullurl(u, ((Iimage*)it)->imsrc);
				renderbytes(t, "[image %s]", link);
				free(link);
			}
			break;
		case Iformfieldtag:
			if(aflag)
				renderbytes(t, "[formfield]");
			break;
		case Itabletag:
			c = ((Itable*)it)->table->cells;
			while(c != nil){
				render(u, t, c->content, curanchor);
				c = c->next;
			}
			finishline(t);
			break;
		case Ifloattag:
			render(u, t, ((Ifloat*)it)->item, curanchor);
			break;
		case Ispacertag:
			if(((Ispacer*)it)->spkind != ISPnull)
				renderbytes(t, " ");
			break;
		default:
			error("unknown item tag %d\n", it->tag);
		}

		/* nothing more to do unless this item enters a different anchor */
		if(it->anchorid == 0 || it->anchorid == curanchor)
			continue;
		for(anc = u->docinfo->anchors; anc != nil; anc = anc->next){
			if(!aflag || anc->index != it->anchorid)
				continue;
			link = fullurl(u, anc->href);
			renderbytes(t, "[%s]", link);
			free(link);
			break;
		}
		curanchor = it->anchorid;
	}
	finishline(t);
}
/*
 * Render u->items to text and write the result to u->outfd.
 * Nothing is written when the rendering is empty.  A failed or short
 * write is reported through error(), which exits.
 */
void
rerender(URLwin *u)
{
	Bytes *t;

	t = emalloc(sizeof(Bytes));
	render(u, t, u->items, 0);
	if(t->n > 0 && write(u->outfd, (char*)t->b, t->n) != t->n)
		error("write error: %r");
	free(t->b);
	free(t);
}
/*
 * Guess the character set of the HTML text s.
 *
 * Somewhat of a hack: this is not a parse.  It looks for the first
 * "<meta" and then for "charset=" somewhere after it, optionally
 * followed by a quote.  Only "utf-8" and "utf8" (in any case) are
 * recognized and give UTF_8; everything else, including a document with
 * no meta tag or no charset, gives defcharset.  defcharset is set to
 * ISO_8859_1 here if it is still 0.
 */
int
charset(char *s)
{
	char *meta, *cs;

	if(defcharset == 0)
		defcharset = ISO_8859_1;
	meta = cistrstr(s, "<meta");
	if(meta == nil)
		return defcharset;
	/* only a charset= that follows a meta tag counts */
	cs = cistrstr(meta, "charset=");
	if(cs == nil)
		return defcharset;
	cs += 8;	/* strlen("charset=") */
	if(*cs == '"' || *cs == '\'')
		cs++;
	/* cistrncmp returns 0 when the strings match */
	if(cistrncmp(cs, "utf-8", 5) == 0 || cistrncmp(cs, "utf8", 4) == 0)
		return UTF_8;
	return defcharset;
}
/*
 * Parse the HTML held in b into u->items and u->docinfo, then render it
 * with rerender().  The document URL is converted to runes as
 * ISO_8859_1, and the character set is guessed from the document text.
 */
void
rendertext(URLwin *u, Bytes *b)
{
	Rune *src;
	int cs;

	src = toStr((uchar*)u->url, strlen(u->url), ISO_8859_1);
	cs = charset((char*)b->b);
	u->items = parsehtml(b->b, b->n, src, u->type, cs, &u->docinfo);
	/*
	 * src is not freed: the free(rurl) that used to be here was disabled.
	 * NOTE(review): confirm whether parsehtml keeps the pointer before
	 * restoring it.
	 */
	rerender(u);
}
/*
 * Release the parsed document held by u and then u itself.
 * u->url is not freed here.
 */
void
freeurlwin(URLwin *u)
{
	freeitems(u->items);
	freedocinfo(u->docinfo);
	/* clear the pointers before the structure goes away */
	u->items = nil;
	u->docinfo = nil;
	free(u);
}

71
src/cmd/htmlfmt/main.c Normal file
View file

@ -0,0 +1,71 @@
#include <u.h>
#include <libc.h>
#include <bio.h>
#include <draw.h>
#include <html.h>
#include "dat.h"
char *url = "";	/* document URL; replaced by the -u argument */
int aflag;	/* incremented by -a and by -u */
int width = 70;	/* output line length; replaced by the -l or -w argument */
int defcharset;	/* set from the -c argument; charset() makes it ISO_8859_1 if still 0 */
/* Print the command synopsis on descriptor 2 and exit with status "usage". */
void
usage(void)
{
	static char synopsis[] = "usage: htmlfmt [-c charset] [-u URL] [-a] [-l length] [file ...]\n";

	fprint(2, "%s", synopsis);
	exits("usage");
}
/*
 * htmlfmt: format HTML as plain text.
 *
 * Options:
 *	-a		annotate the output (sets aflag)
 *	-c charset	default character set, interpreted by charset()
 *	-l n, -w n	output line length; must be positive
 *	-u url		document URL; also sets aflag
 *
 * With no file arguments descriptor 0 is formatted.  Otherwise each
 * file is formatted in turn, stopping at the first one that cannot be
 * opened or that loadhtml() reports an error for.  The exit status is
 * that error string, or nil on success.
 */
void
main(int argc, char *argv[])
{
	int k, fd;
	char *meta, *arg, *err, *file;
	char errbuf[ERRMAX];

	ARGBEGIN{
	case 'a':
		aflag++;
		break;
	case 'c':
		/* wrap the name in a meta tag so charset() can interpret it */
		meta = smprint("<meta charset=\"%s\">", EARGF(usage()));
		defcharset = charset(meta);
		free(meta);
		break;
	case 'l':
	case 'w':
		arg = EARGF(usage());
		width = atoi(arg);
		if(width <= 0)
			usage();
		break;
	case 'u':
		url = EARGF(usage());
		aflag++;
		break;
	default:
		usage();
	}ARGEND

	err = nil;
	file = "<stdin>";
	if(argc == 0){
		err = loadhtml(0);
	}else{
		k = 0;
		while(err == nil && k < argc){
			file = argv[k++];
			fd = open(file, OREAD);
			if(fd < 0){
				errstr(errbuf, sizeof errbuf);
				err = errbuf;
			}else{
				err = loadhtml(fd);
				close(fd);
			}
		}
	}
	if(err)
		fprint(2, "htmlfmt: processing %s: %s\n", file, err);
	exits(err);
}

30
src/cmd/htmlfmt/mkfile Normal file
View file

@ -0,0 +1,30 @@
# mkfile for htmlfmt: builds main, html and util and links them with
# libbio, libregexp, libhtml and lib9c; the rules come from mkone.
<$SYS9/$systype/$objtype/mkfile
TARG=htmlfmt
OFILES=\
main.$O\
html.$O\
util.$O\
HFILES=\
dat.h\
$SYS9/sys/include/html.h\
LIB=$SYS9/$systype/$objtype/lib/libbio.a\
$SYS9/$systype/$objtype/lib/libregexp.a\
$SYS9/$systype/$objtype/lib/libhtml.a\
$SYS9/$systype/$objtype/lib/lib9c.a
BIN=$SYS9/$systype/$objtype/bin
UPDATE=\
mkfile\
$HFILES\
${OFILES:%.$O=%.c}
<$SYS9/sys/src/cmd/mkone
CFLAGS=$CFLAGS
#$O.out: $OFILES
# $LD -o $target $LDFLAGS $OFILES

120
src/cmd/htmlfmt/util.c Normal file
View file

@ -0,0 +1,120 @@
#include <u.h>
#include <libc.h>
#include <bio.h>
#include <draw.h>
#include <html.h>
#include "dat.h"
/*
 * Allocate n zeroed bytes.  Calls error(), which exits, if malloc
 * fails, so callers need not check the result.
 */
void*
emalloc(ulong n)
{
	void *mem;

	if((mem = malloc(n)) == nil)
		error("can't malloc: %r");
	return memset(mem, 0, n);
}
/*
 * Resize the allocation p to n bytes and return the new block.
 * Calls error(), which exits, if realloc fails.
 */
void*
erealloc(void *p, ulong n)
{
	void *q;

	q = realloc(p, n);
	if(q == nil)
		error("can't malloc: %r");
	return q;
}
char*
estrdup(char *s)
{
char *t;
t = emalloc(strlen(s)+1);
strcpy(t, s);
return t;
}
/*
 * Return a newly allocated string holding s followed by t.  Neither
 * argument is freed.  Calls error(), which exits, if malloc fails.
 */
char*
estrstrdup(char *s, char *t)
{
	long slen, tlen;
	char *joined;

	slen = strlen(s);
	tlen = strlen(t);
	/* plain malloc: every byte is overwritten below, so zeroing is wasted */
	joined = malloc(slen+tlen+1);
	if(joined == nil)
		error("can't malloc: %r");
	memmove(joined, s, slen);
	memmove(joined+slen, t, tlen+1);	/* brings t's NUL along */
	return joined;
}
/*
 * Return a newly allocated string holding s, then sep, then t, and
 * free s.  A nil t is treated as the empty string.  sep and t are left
 * alone.  Calls error(), which exits, if malloc fails.
 */
char*
eappend(char *s, char *sep, char *t)
{
	long slen, seplen, tlen;
	char *joined, *w;

	if(t == nil)
		t = "";
	slen = strlen(s);
	seplen = strlen(sep);
	tlen = strlen(t);
	/* plain malloc: every byte is overwritten below, so zeroing is wasted */
	joined = malloc(slen+seplen+tlen+1);
	if(joined == nil)
		error("can't malloc: %r");
	w = joined;
	memmove(w, s, slen);
	w += slen;
	memmove(w, sep, seplen);
	w += seplen;
	memmove(w, t, tlen);
	w[tlen] = '\0';
	free(s);
	return joined;
}
/*
 * Like eappend(), but takes ownership of t as well: returns s+sep+t in
 * new storage and frees both s and t.
 */
char*
egrow(char *s, char *sep, char *t)
{
	char *joined;

	joined = eappend(s, sep, t);	/* frees s */
	free(t);
	return joined;
}
/*
 * Print "htmlfmt: ", the print-style formatted message and a newline on
 * descriptor 2, then exit.  The exit status is the unformatted fmt
 * string.  Does not return.
 */
void
error(char *fmt, ...)
{
	va_list arg;
	char buf[256];
	Fmt f;

	fmtfdinit(&f, 2, buf, sizeof buf);
	/* this program is htmlfmt; the prefix used to name a different one */
	fmtprint(&f, "htmlfmt: ");
	va_start(arg, fmt);
	fmtvprint(&f, fmt, arg);
	va_end(arg);
	fmtprint(&f, "\n");
	fmtfdflush(&f);
	exits(fmt);
}
/*
 * Append the ns bytes at s to b, enlarging the buffer when needed, and
 * keep the contents NUL-terminated.  When it must grow, the buffer is
 * sized to the new length plus 8000 bytes so that small appends do not
 * reallocate each time.  Calls error(), which exits, if realloc fails.
 */
void
growbytes(Bytes *b, char *s, long ns)
{
	long need;
	uchar *dst;

	need = b->n + ns + 1;	/* +1 for the terminating NUL */
	if(need > b->nalloc){
		b->nalloc = b->n + ns + 8000;
		/* realloc, not emalloc: the new space need not be zeroed */
		b->b = realloc(b->b, b->nalloc);
		if(b->b == nil)
			error("growbytes: can't realloc: %r");
	}
	dst = b->b + b->n;
	memmove(dst, s, ns);
	b->n += ns;
	b->b[b->n] = '\0';
}