176 lines
No EOL
2.4 KiB
C
176 lines
No EOL
2.4 KiB
C
#include <u.h>
|
|
#include <libc.h>
|
|
#include <bio.h>
|
|
#include <regexp.h>
|
|
#include "dfa.h"
|
|
|
|
/***
 * Regular expressions for matching.
 */
|
|
|
|
/*
 * Patterns for input that is matched and skipped.  buildre() passes
 * each entry through strcpycase() and joins the results with '|'.
 *
 * strcpycase() only expands lower-case letters that sit outside a
 * character class, so any class that is meant to be case-insensitive
 * has to list both cases itself.
 */
char *ignore[] =
{
	/* HTML that isn't A, IMG, or FONT */
	/* Must have a space somewhere to avoid catching <email@address> */
	"<[ \n\r]*("
		/* both cases listed: <A ...>, <IMG ...>, <FONT ...> must not match here */
		"[^aifAIF]|"
		"a[^> \t\r\n]|"
		"i[^mM \t\r\n]|"
		"im[^gG \t\r\n]|"
		"img[^> \t\r\n]|"
		"f[^oO \t\r\n]|"
		"fo[^Nn \t\r\n]|"
		"fon[^tT \t\r\n]|"
		"font[^> \r\t\n]"
	")[^>]*[ \t\n\r][^>]*>",
	"<[ \n\r]*("
		"i|im|f|fo|fon"
	")[ \t\r\n][^>]*>",

	/* ignore html comments */
	"<!--([^\\-]|-[^\\-]|--[^>]|\n)*-->",

	/* random mail strings */
	"^message-id:.*\n([ ].*\n)*",
	"^in-reply-to:.*\n([ ].*\n)*",
	"^references:.*\n([ ].*\n)*",
	"^date:.*\n([ ].*\n)*",
	"^delivery-date:.*\n([ ].*\n)*",
	"e?smtp id .*",
	"^ id.*",
	"boundary=.*",
	"name=\"",
	"filename=\"",
	"news:<[^>]+>",
	"^--[^ ]*$",

	/* base64 encoding */
	"^[0-9a-zA-Z+\\-=/]+$",

	/* uu encoding */
	"^[!-Z]+$",

	/* little things */
	".",
	"\n"
};
|
|
|
|
/*
 * Patterns for the tokens of interest: one or more pieces, each either
 * a single letter or listed punctuation character, or a digit followed
 * by any number of '.'- or ','-separated digits.  buildre() compiles
 * the case-folded alternation of these entries.
 */
char *keywords[] = {
	"([a-zA-Z'`$!¡-]"
	"|[0-9]([.,][0-9])*"
	")+",
};
|
|
|
|
/* set by the -d flag in main(); makes dregcomp() print each pattern it compiles */
int debug;
|
|
|
|
Dreprog*
|
|
dregcomp(char *buf)
|
|
{
|
|
Reprog *r;
|
|
Dreprog *d;
|
|
|
|
if(debug)
|
|
print(">>> '%s'\n", buf);
|
|
|
|
r = regcomp(buf);
|
|
if(r == nil)
|
|
sysfatal("regcomp");
|
|
d = dregcvt(r);
|
|
if(d == nil)
|
|
sysfatal("dregcomp");
|
|
free(r);
|
|
return d;
|
|
}
|
|
|
|
/*
 * Copy the regular expression s to d, rewriting it so that it matches
 * without regard to case: every lower-case ASCII letter outside a
 * character class is replaced by the two-letter class "[xX]".
 *
 * Everything else is copied unchanged:
 *	- a backslash and the character it quotes, so an escaped '[' or
 *	  ']' does not open or close a class and an escaped letter is
 *	  not turned into "\[xX]";
 *	- the contents of a character class, including any '[' inside it.
 *
 * d needs room for up to four bytes per byte of s.  No terminating
 * NUL is written; the return value points just past the last byte
 * stored so that callers can keep appending.
 */
char*
strcpycase(char *d, char *s)
{
	int incc, esc;

	incc = 0;	/* inside [...] */
	esc = 0;	/* previous character was an unquoted backslash */
	for(; *s; s++){
		if(esc){
			/* quoted character: never special, never expanded */
			*d++ = *s;
			esc = 0;
			continue;
		}
		if(*s == '\\'){
			*d++ = *s;
			esc = 1;
			continue;
		}
		if(incc){
			/* only an unquoted ']' ends the class */
			if(*s == ']')
				incc = 0;
			*d++ = *s;
			continue;
		}
		if(*s == '['){
			incc = 1;
			*d++ = *s;
			continue;
		}
		if('a' <= *s && *s <= 'z'){
			*d++ = '[';
			*d++ = *s;
			*d++ = *s+'A'-'a';
			*d++ = ']';
		}else
			*d++ = *s;
	}
	return d;
}
|
|
|
|
/*
 * Report a regular expression error and terminate via sysfatal().
 * NOTE(review): presumably this is the hook the regexp library calls
 * when regcomp() rejects a pattern -- confirm against regexp(2).
 */
void
regerror(char *msg)
{
	sysfatal("regerror: %s", msg);
}
|
|
|
|
void
|
|
buildre(Dreprog *re[3])
|
|
{
|
|
int i;
|
|
static char buf[16384], *s;
|
|
|
|
re[0] = dregcomp("^From ");
|
|
|
|
s = buf;
|
|
for(i=0; i<nelem(keywords); i++){
|
|
if(i != 0)
|
|
*s++ = '|';
|
|
s = strcpycase(s, keywords[i]);
|
|
}
|
|
*s = 0;
|
|
re[1] = dregcomp(buf);
|
|
|
|
s = buf;
|
|
for(i=0; i<nelem(ignore); i++){
|
|
if(i != 0)
|
|
*s++ = '|';
|
|
s = strcpycase(s, ignore[i]);
|
|
}
|
|
*s = 0;
|
|
re[2] = dregcomp(buf);
|
|
}
|
|
|
|
/*
 * Print the command synopsis on file descriptor 2 and exit with
 * status "usage".
 */
void
usage(void)
{
	fprint(2, "usage: regen [-d]\n");
	exits("usage");
}
|
|
|
|
void
|
|
main(int argc, char **argv)
|
|
{
|
|
Dreprog *re[3];
|
|
Biobuf b;
|
|
|
|
ARGBEGIN{
|
|
default:
|
|
usage();
|
|
case 'd':
|
|
debug = 1;
|
|
}ARGEND
|
|
|
|
if(argc != 0)
|
|
usage();
|
|
|
|
buildre(re);
|
|
Binit(&b, 1, OWRITE);
|
|
Bprintdfa(&b, re[0]);
|
|
Bprintdfa(&b, re[1]);
|
|
Bprintdfa(&b, re[2]);
|
|
exits(0);
|
|
}
|
|
|
|
|