msgtok.c (4209B)
1 /* 2 * RFC822 message tokenizer (really feature generator) for spam filter. 3 * 4 * See Paul Graham's musings on spam filtering for theory. 5 */ 6 7 #include <u.h> 8 #include <libc.h> 9 #include <bio.h> 10 #include <regexp.h> 11 #include <ctype.h> 12 #include "dfa.h" 13 14 void buildre(Dreprog*[3]); 15 int debug; 16 char *refile = "#9/mail/lib/classify.re"; 17 int maxtoklen = 20; 18 int trim(char*); 19 20 void 21 usage(void) 22 { 23 fprint(2, "usage: msgtok [-D] [-r /mail/lib/classify.re] [file]\n"); 24 exits("usage"); 25 } 26 27 void 28 main(int argc, char **argv) 29 { 30 int i, hdr, n, eof, off; 31 Dreprog *re[3]; 32 int m[3]; 33 char *p, *ep, *tag; 34 Biobuf bout, bin; 35 char msg[1024+1]; 36 char buf[1024]; 37 38 refile = unsharp(refile); 39 buildre(re); 40 ARGBEGIN{ 41 case 'D': 42 debug = 1; 43 break; 44 case 'n': 45 maxtoklen = atoi(EARGF(usage())); 46 break; 47 case 'r': 48 refile = EARGF(usage()); 49 break; 50 default: 51 usage(); 52 }ARGEND; 53 54 if(argc > 1) 55 usage(); 56 if(argc == 1){ 57 close(0); 58 if(open(argv[0], OREAD) < 0) 59 sysfatal("open %s: %r", argv[0]); 60 } 61 62 tag = nil; 63 Binit(&bin, 0, OREAD); 64 Binit(&bout, 1, OWRITE); 65 ep = msg; 66 p = msg; 67 eof = 0; 68 off = 0; 69 hdr = 1; 70 for(;;){ 71 /* replenish buffer */ 72 if(ep - p < 512 && !eof){ 73 if(p > msg + 1){ 74 n = ep - p; 75 memmove(msg, p-1, ep-(p-1)); 76 off += (p-1) - msg; 77 p = msg+1; 78 ep = p + n; 79 } 80 n = Bread(&bin, ep, msg+(sizeof msg - 1)- ep); 81 if(n < 0) 82 sysfatal("read error: %r"); 83 if(n == 0) 84 eof = 1; 85 ep += n; 86 *ep = 0; 87 } 88 if(p >= ep) 89 break; 90 91 if(*p == 0){ 92 p++; 93 continue; 94 } 95 96 if(hdr && p[-1]=='\n'){ 97 if(p[0]=='\n') 98 hdr = 0; 99 else if(cistrncmp(p-1, "\nfrom:", 6) == 0) 100 tag = "From*"; 101 else if(cistrncmp(p-1, "\nto:", 4) == 0) 102 tag = "To*"; 103 else if(cistrncmp(p-1, "\nsubject:", 9) == 0) 104 tag = "Subject*"; 105 else if(cistrncmp(p-1, "\nreturn-path:", 13) == 0) 106 tag = "Return-Path*"; 107 else 108 tag = nil; 109 } 110 m[0] = dregexec(re[0], p, p==msg || p[-1]=='\n'); 111 m[1] = dregexec(re[1], p, p==msg || p[-1]=='\n'); 112 m[2] = dregexec(re[2], p, p==msg || p[-1]=='\n'); 113 114 n = m[0]; 115 if(n < m[1]) 116 n = m[1]; 117 if(n < m[2]) 118 n = m[2]; 119 if(n <= 0){ 120 fprint(2, "«%s» %.2ux", p, p[0]); 121 sysfatal("no regexps matched at %ld", off + (p-msg)); 122 } 123 124 if(m[0] >= m[1] && m[0] >= m[2]){ 125 /* "From " marks start of new message */ 126 Bprint(&bout, "*From*\n"); 127 n = m[0]; 128 hdr = 1; 129 }else if(m[2] > 1){ 130 /* ignore */ 131 n = m[2]; 132 }else if(m[1] >= m[0] && m[1] >= m[2] && m[1] > 2 && m[1] <= maxtoklen){ 133 /* keyword */ 134 /* should do UTF-aware lowercasing, too much bother */ 135 /* 136 for(i=0; i<n; i++) 137 if('A' <= p[i] && p[i] <= 'Z') 138 p[i] += 'a' - 'A'; 139 */ 140 if(tag){ 141 i = strlen(tag); 142 memmove(buf, tag, i); 143 memmove(buf+i, p, m[1]); 144 buf[i+m[1]] = 0; 145 }else{ 146 memmove(buf, p, m[1]); 147 buf[m[1]] = 0; 148 } 149 Bprint(&bout, "%s\n", buf); 150 while(trim(buf) >= 0) 151 Bprint(&bout, "stem*%s\n", buf); 152 n = m[1]; 153 }else 154 n = m[2]; 155 if(debug) 156 fprint(2, "%.*s¦", utfnlen(p, n), p); 157 p += n; 158 } 159 Bterm(&bout); 160 exits(0); 161 } 162 163 void 164 buildre(Dreprog *re[3]) 165 { 166 Biobuf *b; 167 168 if((b = Bopen(refile, OREAD)) == nil) 169 sysfatal("open %s: %r", refile); 170 171 re[0] = Breaddfa(b); 172 re[1] = Breaddfa(b); 173 re[2] = Breaddfa(b); 174 175 if(re[0]==nil || re[1]==nil || re[2]==nil) 176 sysfatal("Breaddfa: %r"); 177 Bterm(b); 178 } 179 180 /* perhaps this belongs in the tokenizer */ 181 int 182 trim(char *s) 183 { 184 char *p, *op; 185 int mix, mix1; 186 187 if(*s == '*') 188 return -1; 189 190 /* strip leading punctuation */ 191 p = strchr(s, '*'); 192 if(p == nil) 193 p = s; 194 while(*p && !isalpha(*p)) 195 p++; 196 if(strlen(p) < 2) 197 { 198 return -1; 199 } 200 memmove(s, p, strlen(p)+1); 201 202 /* strip suffix of punctuation */ 203 p = s+strlen(s); 204 op = p; 205 while(p > s && (uchar)p[-1]<0x80 && !isalpha(p[-1])) 206 p--; 207 208 /* chop punctuation */ 209 if(p > s){ 210 /* free!!! -> free! */ 211 if(p+1 < op){ 212 p[1] = 0; 213 return 0; 214 } 215 /* free! -> free */ 216 if(p < op){ 217 p[0] = 0; 218 return 0; 219 } 220 } 221 222 mix = mix1 = 0; 223 if(isupper(s[0])) 224 mix = 1; 225 for(p=s+1; *p; p++) 226 if(isupper(*p)){ 227 mix1 = 1; 228 break; 229 } 230 231 /* turn FREE into Free */ 232 if(mix1){ 233 for(p=s+1; *p; p++) 234 if(isupper(*p)) 235 *p += 'a'-'A'; 236 return 0; 237 } 238 239 /* turn Free into free */ 240 if(mix){ 241 *s += 'a'-'A'; 242 return 0; 243 } 244 return -1; 245 }