plan9port

fork of plan9port with libvec, libstr and libsdb
Log | Files | Refs | README | LICENSE

regen.c (2446B)


      1 #include <u.h>
      2 #include <libc.h>
      3 #include <bio.h>
      4 #include <regexp.h>
      5 #include "dfa.h"
      6 
      7 /***
      8  * Regular expression for matching.
      9  */
     10 
     11 char *ignore[] =
     12 {
     13 	/* HTML that isn't A, IMG, or FONT */
     14 	/* Must have a space somewhere to avoid catching <email@address> */
     15 	"<[ 	\n\r]*("
     16 		"[^aif]|"
     17 		"a[^> \t\r\n]|"
     18 		"i[^mM \t\r\n]|"
     19 		"im[^gG \t\r\n]|"
     20 		"img[^> \t\r\n]|"
     21 		"f[^oO \t\r\n]|"
     22 		"fo[^Nn \t\r\n]|"
     23 		"fon[^tT \t\r\n]|"
     24 		"font[^> \r\t\n]"
     25 	")[^>]*[ \t\n\r][^>]*>",
     26 	"<[ 	\n\r]*("
     27 		"i|im|f|fo|fon"
     28 	")[ \t\r\n][^>]*>",
     29 
     30 	/* ignore html comments */
     31 	"<!--([^\\-]|-[^\\-]|--[^>]|\n)*-->",
     32 
     33 	/* random mail strings */
     34 	"^message-id:.*\n([ 	].*\n)*",
     35 	"^in-reply-to:.*\n([ 	].*\n)*",
     36 	"^references:.*\n([ 	].*\n)*",
     37 	"^date:.*\n([ 	].*\n)*",
     38 	"^delivery-date:.*\n([ 	].*\n)*",
     39 	"e?smtp id .*",
     40 	"^	id.*",
     41 	"boundary=.*",
     42 	"name=\"",
     43 	"filename=\"",
     44 	"news:<[^>]+>",
     45 	"^--[^ 	]*$",
     46 
     47 	/* base64 encoding */
     48 	"^[0-9a-zA-Z+\\-=/]+$",
     49 
     50 	/* uu encoding */
     51 	"^[!-Z]+$",
     52 
     53 	/* little things */
     54 	".",
     55 	"\n"
     56 };
     57 
     58 char *keywords[] =
     59 {
     60 	"([a-zA-Z'`$!¡-￿]|[0-9]([.,][0-9])*)+"
     61 };
     62 
     63 int debug;
     64 
     65 Dreprog*
     66 dregcomp(char *buf)
     67 {
     68 	Reprog *r;
     69 	Dreprog *d;
     70 
     71 	if(debug)
     72 		print(">>> '%s'\n", buf);
     73 
     74 	r = regcomp(buf);
     75 	if(r == nil)
     76 		sysfatal("regcomp");
     77 	d = dregcvt(r);
     78 	if(d == nil)
     79 		sysfatal("dregcomp");
     80 	free(r);
     81 	return d;
     82 }
     83 
     84 char*
     85 strcpycase(char *d, char *s)
     86 {
     87 	int cc, esc;
     88 
     89 	cc = 0;
     90 	esc = 0;
     91 	while(*s){
     92 		if(*s == '[')
     93 			cc++;
     94 		if(*s == ']')
     95 			cc--;
     96 		if(!cc && 'a' <= *s && *s <= 'z'){
     97 			*d++ = '[';
     98 			*d++ = *s;
     99 			*d++ = *s+'A'-'a';
    100 			*d++ = ']';
    101 		}else
    102 			*d++ = *s;
    103 		if(*s == '\\')
    104 			esc++;
    105 		else if(esc)
    106 			esc--;
    107 		s++;
    108 	}
    109 	return d;
    110 }
    111 
    112 void
    113 regerror(char *msg)
    114 {
    115 	sysfatal("regerror: %s", msg);
    116 }
    117 
    118 void
    119 buildre(Dreprog *re[3])
    120 {
    121 	int i;
    122 	static char buf[16384], *s;
    123 
    124 	re[0] = dregcomp("^From ");
    125 
    126 	s = buf;
    127 	for(i=0; i<nelem(keywords); i++){
    128 		if(i != 0)
    129 			*s++ = '|';
    130 		s = strcpycase(s, keywords[i]);
    131 	}
    132 	*s = 0;
    133 	re[1] = dregcomp(buf);
    134 
    135 	s = buf;
    136 	for(i=0; i<nelem(ignore); i++){
    137 		if(i != 0)
    138 			*s++ = '|';
    139 		s = strcpycase(s, ignore[i]);
    140 	}
    141 	*s = 0;
    142 	re[2] = dregcomp(buf);
    143 }
    144 
    145 void
    146 usage(void)
    147 {
    148 	fprint(2, "usage: regen [-d]\n");
    149 	exits("usage");
    150 }
    151 
    152 void
    153 main(int argc, char **argv)
    154 {
    155 	Dreprog *re[3];
    156 	Biobuf b;
    157 
    158 	ARGBEGIN{
    159 	default:
    160 		usage();
    161 	case 'd':
    162 		debug = 1;
    163 	}ARGEND
    164 
    165 	if(argc != 0)
    166 		usage();
    167 
    168 	buildre(re);
    169 	Binit(&b, 1, OWRITE);
    170 	Bprintdfa(&b, re[0]);
    171 	Bprintdfa(&b, re[1]);
    172 	Bprintdfa(&b, re[2]);
    173 	exits(0);
    174 }