msgclass.c (4777B)
1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include <ctype.h> 5 #include "msgdb.h" 6 7 void 8 usage(void) 9 { 10 fprint(2, "usage: upas/msgclass [-a] [-d name dbfile]... [-l lockfile] [-m mul] [-t thresh] [tokenfile ...]\n"); 11 exits("usage"); 12 } 13 14 enum 15 { 16 MAXBEST = 32, 17 MAXLEN = 64, 18 MAXTAB = 256 19 }; 20 21 typedef struct Ndb Ndb; 22 struct Ndb 23 { 24 char *name; 25 char *file; 26 Msgdb *db; 27 double p; 28 long nmsg; 29 }; 30 31 typedef struct Word Word; 32 struct Word 33 { 34 char s[MAXLEN]; 35 int count[MAXTAB]; 36 double p[MAXTAB]; 37 double mp; 38 int mi; /* w.p[w.mi] = w.mp */ 39 int nmsg; 40 }; 41 42 Ndb db[MAXTAB]; 43 int ndb; 44 45 int add; 46 int mul; 47 Msgdb *indb; 48 49 Word best[MAXBEST]; 50 int mbest = 15; 51 int nbest; 52 53 void process(Biobuf*, char*); 54 void lockfile(char*); 55 56 void 57 noteword(Word *w, char *s) 58 { 59 int i; 60 61 for(i=nbest-1; i>=0; i--) 62 if(w->mp < best[i].mp) 63 break; 64 i++; 65 66 if(i >= mbest) 67 return; 68 if(nbest == mbest) 69 nbest--; 70 if(i < nbest) 71 memmove(&best[i+1], &best[i], (nbest-i)*sizeof(best[0])); 72 best[i] = *w; 73 strecpy(best[i].s, best[i].s+MAXLEN, s); 74 nbest++; 75 } 76 77 void 78 main(int argc, char **argv) 79 { 80 int i, bad, m, tot, nn, j; 81 Biobuf bin, *b, bout; 82 char *s, *lf; 83 double totp, p, thresh; 84 long n; 85 Word w; 86 87 lf = nil; 88 thresh = 0; 89 ARGBEGIN{ 90 case 'a': 91 add = 1; 92 break; 93 case 'd': 94 if(ndb >= MAXTAB) 95 sysfatal("too many db classes"); 96 db[ndb].name = EARGF(usage()); 97 db[ndb].file = EARGF(usage()); 98 ndb++; 99 break; 100 case 'l': 101 lf = EARGF(usage()); 102 break; 103 case 'm': 104 mul = atoi(EARGF(usage())); 105 break; 106 case 't': 107 thresh = atof(EARGF(usage())); 108 break; 109 default: 110 usage(); 111 }ARGEND 112 113 if(ndb == 0){ 114 fprint(2, "must have at least one -d option\n"); 115 usage(); 116 } 117 118 indb = mdopen(nil, 1); 119 if(argc == 0){ 120 Binit(&bin, 0, OREAD); 121 process(&bin, "<stdin>"); 122 Bterm(&bin); 123 }else{ 124 bad = 0; 125 for(i=0; i<argc; i++){ 126 if((b = Bopen(argv[i], OREAD)) == nil){ 127 fprint(2, "opening %s: %r\n", argv[i]); 128 bad = 1; 129 continue; 130 } 131 process(b, argv[i]); 132 Bterm(b); 133 } 134 if(bad) 135 exits("open inputs"); 136 } 137 138 lockfile(lf); 139 bad = 0; 140 for(i=0; i<ndb; i++){ 141 if((db[i].db = mdopen(db[i].file, 0)) == nil){ 142 fprint(2, "opendb %s: %r\n", db[i].file); 143 bad = 1; 144 } 145 db[i].nmsg = mdget(db[i].db, "*From*"); 146 } 147 if(bad) 148 exits("open databases"); 149 150 /* run conditional probabilities of input words, getting 15 most specific */ 151 mdenum(indb); 152 nbest = 0; 153 while(mdnext(indb, &s, &n) >= 0){ 154 tot = 0; 155 totp = 0.0; 156 for(i=0; i<ndb; i++){ 157 nn = mdget(db[i].db, s)*(i==0 ? 3 : 1); 158 tot += nn; 159 w.count[i] = nn; 160 p = w.count[i]/(double)db[i].nmsg; 161 if(p >= 1.0) 162 p = 1.0; 163 w.p[i] = p; 164 totp += p; 165 } 166 /*fprint(2, "%s tot %d totp %g\n", s, tot, totp); */ 167 if(tot < 2) 168 continue; 169 w.mp = 0.0; 170 for(i=0; i<ndb; i++){ 171 p = w.p[i]; 172 p /= totp; 173 if(p < 0.001) 174 p = 0.001; 175 else if(p > 0.999) 176 p = 0.999; 177 if(p > w.mp){ 178 w.mp = p; 179 w.mi = i; 180 } 181 w.p[i] = p; 182 } 183 noteword(&w, s); 184 } 185 186 /* compute conditional probabilities of message classes using 15 most specific */ 187 totp = 0.0; 188 for(i=0; i<ndb; i++){ 189 p = 1.0; 190 for(j=0; j<nbest; j++) 191 p *= best[j].p[i]; 192 db[i].p = p; 193 totp += p; 194 } 195 for(i=0; i<ndb; i++) 196 db[i].p /= totp; 197 m = 0; 198 for(i=1; i<ndb; i++) 199 if(db[i].p > db[m].p) 200 m = i; 201 202 Binit(&bout, 1, OWRITE); 203 if(db[m].p < thresh) 204 m = -1; 205 if(m >= 0) 206 Bprint(&bout, "%s", db[m].name); 207 else 208 Bprint(&bout, "inconclusive"); 209 for(j=0; j<ndb; j++) 210 Bprint(&bout, " %s=%g", db[j].name, db[j].p); 211 Bprint(&bout, "\n"); 212 for(i=0; i<nbest; i++){ 213 Bprint(&bout, "%s", best[i].s); 214 for(j=0; j<ndb; j++) 215 Bprint(&bout, " %s=%g", db[j].name, best[i].p[j]); 216 Bprint(&bout, "\n"); 217 } 218 Bprint(&bout, "%s %g\n", best[i].s, best[i].p[m]); 219 Bterm(&bout); 220 221 if(m >= 0 && add){ 222 mdenum(indb); 223 while(mdnext(indb, &s, &n) >= 0) 224 mdput(db[m].db, s, mdget(db[m].db, s)+n*mul); 225 mdclose(db[m].db); 226 } 227 exits(nil); 228 } 229 230 void 231 process(Biobuf *b, char*) 232 { 233 char *s; 234 char *p; 235 long n; 236 237 while((s = Brdline(b, '\n')) != nil){ 238 s[Blinelen(b)-1] = 0; 239 if((p = strrchr(s, ' ')) != nil){ 240 *p++ = 0; 241 n = atoi(p); 242 }else 243 n = 1; 244 mdput(indb, s, mdget(indb, s)+n); 245 } 246 } 247 248 int tpid; 249 void 250 killtickle(void) 251 { 252 postnote(PNPROC, tpid, "die"); 253 } 254 255 void 256 lockfile(char *s) 257 { 258 int fd, t, w; 259 char err[ERRMAX]; 260 261 if(s == nil) 262 return; 263 w = 50; 264 t = 0; 265 for(;;){ 266 fd = open(s, OREAD); 267 if(fd >= 0) 268 break; 269 rerrstr(err, sizeof err); 270 if(strstr(err, "file is locked")==nil && strstr(err, "exclusive lock")==nil)) 271 break; 272 sleep(w); 273 t += w; 274 if(w < 1000) 275 w = (w*3)/2; 276 if(t > 120*1000) 277 break; 278 } 279 if(fd < 0) 280 sysfatal("could not lock %s", s); 281 switch(tpid = fork()){ 282 case -1: 283 sysfatal("fork: %r"); 284 case 0: 285 for(;;){ 286 sleep(30*1000); 287 free(dirfstat(fd)); 288 } 289 _exits(nil); 290 default: 291 break; 292 } 293 close(fd); 294 atexit(killtickle); 295 }