regen.c (2446B)
1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include <regexp.h> 5 #include "dfa.h" 6 7 /*** 8 * Regular expression for matching. 9 */ 10 11 char *ignore[] = 12 { 13 /* HTML that isn't A, IMG, or FONT */ 14 /* Must have a space somewhere to avoid catching <email@address> */ 15 "<[ \n\r]*(" 16 "[^aif]|" 17 "a[^> \t\r\n]|" 18 "i[^mM \t\r\n]|" 19 "im[^gG \t\r\n]|" 20 "img[^> \t\r\n]|" 21 "f[^oO \t\r\n]|" 22 "fo[^Nn \t\r\n]|" 23 "fon[^tT \t\r\n]|" 24 "font[^> \r\t\n]" 25 ")[^>]*[ \t\n\r][^>]*>", 26 "<[ \n\r]*(" 27 "i|im|f|fo|fon" 28 ")[ \t\r\n][^>]*>", 29 30 /* ignore html comments */ 31 "<!--([^\\-]|-[^\\-]|--[^>]|\n)*-->", 32 33 /* random mail strings */ 34 "^message-id:.*\n([ ].*\n)*", 35 "^in-reply-to:.*\n([ ].*\n)*", 36 "^references:.*\n([ ].*\n)*", 37 "^date:.*\n([ ].*\n)*", 38 "^delivery-date:.*\n([ ].*\n)*", 39 "e?smtp id .*", 40 "^ id.*", 41 "boundary=.*", 42 "name=\"", 43 "filename=\"", 44 "news:<[^>]+>", 45 "^--[^ ]*$", 46 47 /* base64 encoding */ 48 "^[0-9a-zA-Z+\\-=/]+$", 49 50 /* uu encoding */ 51 "^[!-Z]+$", 52 53 /* little things */ 54 ".", 55 "\n" 56 }; 57 58 char *keywords[] = 59 { 60 "([a-zA-Z'`$!¡-]|[0-9]([.,][0-9])*)+" 61 }; 62 63 int debug; 64 65 Dreprog* 66 dregcomp(char *buf) 67 { 68 Reprog *r; 69 Dreprog *d; 70 71 if(debug) 72 print(">>> '%s'\n", buf); 73 74 r = regcomp(buf); 75 if(r == nil) 76 sysfatal("regcomp"); 77 d = dregcvt(r); 78 if(d == nil) 79 sysfatal("dregcomp"); 80 free(r); 81 return d; 82 } 83 84 char* 85 strcpycase(char *d, char *s) 86 { 87 int cc, esc; 88 89 cc = 0; 90 esc = 0; 91 while(*s){ 92 if(*s == '[') 93 cc++; 94 if(*s == ']') 95 cc--; 96 if(!cc && 'a' <= *s && *s <= 'z'){ 97 *d++ = '['; 98 *d++ = *s; 99 *d++ = *s+'A'-'a'; 100 *d++ = ']'; 101 }else 102 *d++ = *s; 103 if(*s == '\\') 104 esc++; 105 else if(esc) 106 esc--; 107 s++; 108 } 109 return d; 110 } 111 112 void 113 regerror(char *msg) 114 { 115 sysfatal("regerror: %s", msg); 116 } 117 118 void 119 buildre(Dreprog *re[3]) 120 { 121 int i; 122 static char buf[16384], *s; 123 124 re[0] = dregcomp("^From "); 125 126 s = buf; 127 for(i=0; i<nelem(keywords); i++){ 128 if(i != 0) 129 *s++ = '|'; 130 s = strcpycase(s, keywords[i]); 131 } 132 *s = 0; 133 re[1] = dregcomp(buf); 134 135 s = buf; 136 for(i=0; i<nelem(ignore); i++){ 137 if(i != 0) 138 *s++ = '|'; 139 s = strcpycase(s, ignore[i]); 140 } 141 *s = 0; 142 re[2] = dregcomp(buf); 143 } 144 145 void 146 usage(void) 147 { 148 fprint(2, "usage: regen [-d]\n"); 149 exits("usage"); 150 } 151 152 void 153 main(int argc, char **argv) 154 { 155 Dreprog *re[3]; 156 Biobuf b; 157 158 ARGBEGIN{ 159 default: 160 usage(); 161 case 'd': 162 debug = 1; 163 }ARGEND 164 165 if(argc != 0) 166 usage(); 167 168 buildre(re); 169 Binit(&b, 1, OWRITE); 170 Bprintdfa(&b, re[0]); 171 Bprintdfa(&b, re[1]); 172 Bprintdfa(&b, re[2]); 173 exits(0); 174 }