plan9port

fork of plan9port with libvec, libstr and libsdb
Log | Files | Refs | README | LICENSE

msgtok.c (4209B)


      1 /*
      2  * RFC822 message tokenizer (really feature generator) for spam filter.
      3  *
      4  * See Paul Graham's musings on spam filtering for theory.
      5  */
      6 
      7 #include <u.h>
      8 #include <libc.h>
      9 #include <bio.h>
     10 #include <regexp.h>
     11 #include <ctype.h>
     12 #include "dfa.h"
     13 
     14 void buildre(Dreprog*[3]);	/* fills the array with the three DFAs read from refile; defined below */
     15 int debug;	/* set by -D; makes main() trace each consumed span on fd 2 */
     16 char *refile = "#9/mail/lib/classify.re";	/* file opened by buildre(); -r assigns a different path */
     17 int maxtoklen = 20;	/* longest re[1] match, in bytes, that main() emits as a keyword; set by -n */
     18 int trim(char*);	/* applies one stemming step to a token in place; defined below */
     19 
     20 void
     21 usage(void)
     22 {
     23 	fprint(2, "usage: msgtok [-D] [-r /mail/lib/classify.re] [file]\n");
     24 	exits("usage");
     25 }
     26 
     27 void
     28 main(int argc, char **argv)
     29 {
     30 	int i, hdr, n, eof, off;
     31 	Dreprog *re[3];
     32 	int m[3];
     33 	char *p, *ep, *tag;
     34 	Biobuf bout, bin;
     35 	char msg[1024+1];
     36 	char buf[1024];
     37 
     38 	refile = unsharp(refile);
     39 	buildre(re);
     40 	ARGBEGIN{
     41 	case 'D':
     42 		debug = 1;
     43 		break;
     44 	case 'n':
     45 		maxtoklen = atoi(EARGF(usage()));
     46 		break;
     47 	case 'r':
     48 		refile = EARGF(usage());
     49 		break;
     50 	default:
     51 		usage();
     52 	}ARGEND;
     53 
     54 	if(argc > 1)
     55 		usage();
     56 	if(argc == 1){
     57 		close(0);
     58 		if(open(argv[0], OREAD) < 0)
     59 			sysfatal("open %s: %r", argv[0]);
     60 	}
     61 
     62 	tag = nil;
     63 	Binit(&bin, 0, OREAD);
     64 	Binit(&bout, 1, OWRITE);
     65 	ep = msg;
     66 	p = msg;
     67 	eof = 0;
     68 	off = 0;
     69 	hdr = 1;
     70 	for(;;){
     71 		/* replenish buffer */
     72 		if(ep - p < 512 && !eof){
     73 			if(p > msg + 1){
     74 				n = ep - p;
     75 				memmove(msg, p-1, ep-(p-1));
     76 				off += (p-1) - msg;
     77 				p = msg+1;
     78 				ep = p + n;
     79 			}
     80 			n = Bread(&bin, ep, msg+(sizeof msg - 1)- ep);
     81 			if(n < 0)
     82 				sysfatal("read error: %r");
     83 			if(n == 0)
     84 				eof = 1;
     85 			ep += n;
     86 			*ep = 0;
     87 		}
     88 		if(p >= ep)
     89 			break;
     90 
     91 		if(*p == 0){
     92 			p++;
     93 			continue;
     94 		}
     95 
     96 		if(hdr && p[-1]=='\n'){
     97 			if(p[0]=='\n')
     98 				hdr = 0;
     99 			else if(cistrncmp(p-1, "\nfrom:", 6) == 0)
    100 				tag = "From*";
    101 			else if(cistrncmp(p-1, "\nto:", 4) == 0)
    102 				tag = "To*";
    103 			else if(cistrncmp(p-1, "\nsubject:", 9) == 0)
    104 				tag = "Subject*";
    105 			else if(cistrncmp(p-1, "\nreturn-path:", 13) == 0)
    106 				tag = "Return-Path*";
    107 			else
    108 				tag = nil;
    109 		}
    110 		m[0] = dregexec(re[0], p, p==msg || p[-1]=='\n');
    111 		m[1] = dregexec(re[1], p, p==msg || p[-1]=='\n');
    112 		m[2] = dregexec(re[2], p, p==msg || p[-1]=='\n');
    113 
    114 		n = m[0];
    115 		if(n < m[1])
    116 			n = m[1];
    117 		if(n < m[2])
    118 			n = m[2];
    119 		if(n <= 0){
    120 fprint(2, "«%s» %.2ux", p, p[0]);
    121 			sysfatal("no regexps matched at %ld", off + (p-msg));
    122 		}
    123 
    124 		if(m[0] >= m[1] && m[0] >= m[2]){
    125 			/* "From " marks start of new message */
    126 			Bprint(&bout, "*From*\n");
    127 			n = m[0];
    128 			hdr = 1;
    129 		}else if(m[2] > 1){
    130 			/* ignore */
    131 			n = m[2];
    132 		}else if(m[1] >= m[0] && m[1] >= m[2] && m[1] > 2 && m[1] <= maxtoklen){
    133 			/* keyword */
    134 			/* should do UTF-aware lowercasing, too much bother */
    135 /*
    136 			for(i=0; i<n; i++)
    137 				if('A' <= p[i] && p[i] <= 'Z')
    138 					p[i] += 'a' - 'A';
    139 */
    140 			if(tag){
    141 				i = strlen(tag);
    142 				memmove(buf, tag, i);
    143 				memmove(buf+i, p, m[1]);
    144 				buf[i+m[1]] = 0;
    145 			}else{
    146 				memmove(buf, p, m[1]);
    147 				buf[m[1]] = 0;
    148 			}
    149 			Bprint(&bout, "%s\n", buf);
    150 			while(trim(buf) >= 0)
    151 				Bprint(&bout, "stem*%s\n", buf);
    152 			n = m[1];
    153 		}else
    154 			n = m[2];
    155 		if(debug)
    156 			fprint(2, "%.*s¦", utfnlen(p, n), p);
    157 		p += n;
    158 	}
    159 	Bterm(&bout);
    160 	exits(0);
    161 }
    162 
    163 void
    164 buildre(Dreprog *re[3])
    165 {
    166 	Biobuf *b;
    167 
    168 	if((b = Bopen(refile, OREAD)) == nil)
    169 		sysfatal("open %s: %r", refile);
    170 
    171 	re[0] = Breaddfa(b);
    172 	re[1] = Breaddfa(b);
    173 	re[2] = Breaddfa(b);
    174 
    175 	if(re[0]==nil || re[1]==nil || re[2]==nil)
    176 		sysfatal("Breaddfa: %r");
    177 	Bterm(b);
    178 }
    179 
    180 /* perhaps this belongs in the tokenizer */
    181 int
    182 trim(char *s)
    183 {
    184 	char *p, *op;
    185 	int mix, mix1;
    186 
    187 	if(*s == '*')
    188 		return -1;
    189 
    190 	/* strip leading punctuation */
    191 	p = strchr(s, '*');
    192 	if(p == nil)
    193 		p = s;
    194 	while(*p && !isalpha(*p))
    195 		p++;
    196 	if(strlen(p) < 2)
    197 {
    198 		return -1;
    199 }
    200 	memmove(s, p, strlen(p)+1);
    201 
    202 	/* strip suffix of punctuation */
    203 	p = s+strlen(s);
    204 	op = p;
    205 	while(p > s && (uchar)p[-1]<0x80 && !isalpha(p[-1]))
    206 		p--;
    207 
    208 	/* chop punctuation */
    209 	if(p > s){
    210 		/* free!!! -> free! */
    211 		if(p+1 < op){
    212 			p[1] = 0;
    213 			return 0;
    214 		}
    215 		/* free! -> free */
    216 		if(p < op){
    217 			p[0] = 0;
    218 			return 0;
    219 		}
    220 	}
    221 
    222 	mix = mix1 = 0;
    223 	if(isupper(s[0]))
    224 		mix = 1;
    225 	for(p=s+1; *p; p++)
    226 		if(isupper(*p)){
    227 			mix1 = 1;
    228 			break;
    229 		}
    230 
    231 	/* turn FREE into Free */
    232 	if(mix1){
    233 		for(p=s+1; *p; p++)
    234 			if(isupper(*p))
    235 				*p += 'a'-'A';
    236 		return 0;
    237 	}
    238 
    239 	/* turn Free into free */
    240 	if(mix){
    241 		*s += 'a'-'A';
    242 		return 0;
    243 	}
    244 	return -1;
    245 }