plan9port

fork of plan9port with libvec, libstr and libsdb
Log | Files | Refs | README | LICENSE

msgclass.c (4777B)


      1 #include <u.h>
      2 #include <libc.h>
      3 #include <bio.h>
      4 #include <ctype.h>
      5 #include "msgdb.h"
      6 
      7 void
      8 usage(void)
      9 {
     10 	fprint(2, "usage: upas/msgclass [-a] [-d name dbfile]... [-l lockfile] [-m mul] [-t thresh] [tokenfile ...]\n");
     11 	exits("usage");
     12 }
     13 
     14 enum
     15 {
     16 	MAXBEST = 32,
     17 	MAXLEN = 64,
     18 	MAXTAB = 256
     19 };
     20 
     21 typedef struct Ndb Ndb;
     22 struct Ndb
     23 {
     24 	char *name;
     25 	char *file;
     26 	Msgdb *db;
     27 	double p;
     28 	long nmsg;
     29 };
     30 
     31 typedef struct Word Word;
     32 struct Word
     33 {
     34 	char s[MAXLEN];
     35 	int count[MAXTAB];
     36 	double p[MAXTAB];
     37 	double mp;
     38 	int mi; /* w.p[w.mi] = w.mp */
     39 	int nmsg;
     40 };
     41 
     42 Ndb db[MAXTAB];
     43 int ndb;
     44 
     45 int add;
     46 int mul;
     47 Msgdb *indb;
     48 
     49 Word best[MAXBEST];
     50 int mbest = 15;
     51 int nbest;
     52 
     53 void process(Biobuf*, char*);
     54 void lockfile(char*);
     55 
     56 void
     57 noteword(Word *w, char *s)
     58 {
     59 	int i;
     60 
     61 	for(i=nbest-1; i>=0; i--)
     62 		if(w->mp < best[i].mp)
     63 			break;
     64 	i++;
     65 
     66 	if(i >= mbest)
     67 		return;
     68 	if(nbest == mbest)
     69 		nbest--;
     70 	if(i < nbest)
     71 		memmove(&best[i+1], &best[i], (nbest-i)*sizeof(best[0]));
     72 	best[i] = *w;
     73 	strecpy(best[i].s, best[i].s+MAXLEN, s);
     74 	nbest++;
     75 }
     76 
     77 void
     78 main(int argc, char **argv)
     79 {
     80 	int i, bad, m, tot, nn, j;
     81 	Biobuf bin, *b, bout;
     82 	char *s, *lf;
     83 	double totp, p, thresh;
     84 	long n;
     85 	Word w;
     86 
     87 	lf = nil;
     88 	thresh = 0;
     89 	ARGBEGIN{
     90 	case 'a':
     91 		add = 1;
     92 		break;
     93 	case 'd':
     94 		if(ndb >= MAXTAB)
     95 			sysfatal("too many db classes");
     96 		db[ndb].name = EARGF(usage());
     97 		db[ndb].file = EARGF(usage());
     98 		ndb++;
     99 		break;
    100 	case 'l':
    101 		lf = EARGF(usage());
    102 		break;
    103 	case 'm':
    104 		mul = atoi(EARGF(usage()));
    105 		break;
    106 	case 't':
    107 		thresh = atof(EARGF(usage()));
    108 		break;
    109 	default:
    110 		usage();
    111 	}ARGEND
    112 
    113 	if(ndb == 0){
    114 		fprint(2, "must have at least one -d option\n");
    115 		usage();
    116 	}
    117 
    118 	indb = mdopen(nil, 1);
    119 	if(argc == 0){
    120 		Binit(&bin, 0, OREAD);
    121 		process(&bin, "<stdin>");
    122 		Bterm(&bin);
    123 	}else{
    124 		bad = 0;
    125 		for(i=0; i<argc; i++){
    126 			if((b = Bopen(argv[i], OREAD)) == nil){
    127 				fprint(2, "opening %s: %r\n", argv[i]);
    128 				bad = 1;
    129 				continue;
    130 			}
    131 			process(b, argv[i]);
    132 			Bterm(b);
    133 		}
    134 		if(bad)
    135 			exits("open inputs");
    136 	}
    137 
    138 	lockfile(lf);
    139 	bad = 0;
    140 	for(i=0; i<ndb; i++){
    141 		if((db[i].db = mdopen(db[i].file, 0)) == nil){
    142 			fprint(2, "opendb %s: %r\n", db[i].file);
    143 			bad = 1;
    144 		}
    145 		db[i].nmsg = mdget(db[i].db, "*From*");
    146 	}
    147 	if(bad)
    148 		exits("open databases");
    149 
    150 	/* run conditional probabilities of input words, getting 15 most specific */
    151 	mdenum(indb);
    152 	nbest = 0;
    153 	while(mdnext(indb, &s, &n) >= 0){
    154 		tot = 0;
    155 		totp = 0.0;
    156 		for(i=0; i<ndb; i++){
    157 			nn = mdget(db[i].db, s)*(i==0 ? 3 : 1);
    158 			tot += nn;
    159 			w.count[i] = nn;
    160 			p = w.count[i]/(double)db[i].nmsg;
    161 			if(p >= 1.0)
    162 				p = 1.0;
    163 			w.p[i] = p;
    164 			totp += p;
    165 		}
    166 /*fprint(2, "%s tot %d totp %g\n", s, tot, totp); */
    167 		if(tot < 2)
    168 			continue;
    169 		w.mp = 0.0;
    170 		for(i=0; i<ndb; i++){
    171 			p = w.p[i];
    172 			p /= totp;
    173 			if(p < 0.001)
    174 				p = 0.001;
    175 			else if(p > 0.999)
    176 				p = 0.999;
    177 			if(p > w.mp){
    178 				w.mp = p;
    179 				w.mi = i;
    180 			}
    181 			w.p[i] = p;
    182 		}
    183 		noteword(&w, s);
    184 	}
    185 
    186 	/* compute conditional probabilities of message classes using 15 most specific */
    187 	totp = 0.0;
    188 	for(i=0; i<ndb; i++){
    189 		p = 1.0;
    190 		for(j=0; j<nbest; j++)
    191 			p *= best[j].p[i];
    192 		db[i].p = p;
    193 		totp += p;
    194 	}
    195 	for(i=0; i<ndb; i++)
    196 		db[i].p /= totp;
    197 	m = 0;
    198 	for(i=1; i<ndb; i++)
    199 		if(db[i].p > db[m].p)
    200 			m = i;
    201 
    202 	Binit(&bout, 1, OWRITE);
    203 	if(db[m].p < thresh)
    204 		m = -1;
    205 	if(m >= 0)
    206 		Bprint(&bout, "%s", db[m].name);
    207 	else
    208 		Bprint(&bout, "inconclusive");
    209 	for(j=0; j<ndb; j++)
    210 		Bprint(&bout, " %s=%g", db[j].name, db[j].p);
    211 	Bprint(&bout, "\n");
    212 	for(i=0; i<nbest; i++){
    213 		Bprint(&bout, "%s", best[i].s);
    214 		for(j=0; j<ndb; j++)
    215 			Bprint(&bout, " %s=%g", db[j].name, best[i].p[j]);
    216 		Bprint(&bout, "\n");
    217 	}
    218 		Bprint(&bout, "%s %g\n", best[i].s, best[i].p[m]);
    219 	Bterm(&bout);
    220 
    221 	if(m >= 0 && add){
    222 		mdenum(indb);
    223 		while(mdnext(indb, &s, &n) >= 0)
    224 			mdput(db[m].db, s, mdget(db[m].db, s)+n*mul);
    225 		mdclose(db[m].db);
    226 	}
    227 	exits(nil);
    228 }
    229 
    230 void
    231 process(Biobuf *b, char*)
    232 {
    233 	char *s;
    234 	char *p;
    235 	long n;
    236 
    237 	while((s = Brdline(b, '\n')) != nil){
    238 		s[Blinelen(b)-1] = 0;
    239 		if((p = strrchr(s, ' ')) != nil){
    240 			*p++ = 0;
    241 			n = atoi(p);
    242 		}else
    243 			n = 1;
    244 		mdput(indb, s, mdget(indb, s)+n);
    245 	}
    246 }
    247 
    248 int tpid;
    249 void
    250 killtickle(void)
    251 {
    252 	postnote(PNPROC, tpid, "die");
    253 }
    254 
    255 void
    256 lockfile(char *s)
    257 {
    258 	int fd, t, w;
    259 	char err[ERRMAX];
    260 
    261 	if(s == nil)
    262 		return;
    263 	w = 50;
    264 	t = 0;
    265 	for(;;){
    266 		fd = open(s, OREAD);
    267 		if(fd >= 0)
    268 			break;
    269 		rerrstr(err, sizeof err);
    270 		if(strstr(err, "file is locked")==nil && strstr(err, "exclusive lock")==nil))
    271 			break;
    272 		sleep(w);
    273 		t += w;
    274 		if(w < 1000)
    275 			w = (w*3)/2;
    276 		if(t > 120*1000)
    277 			break;
    278 	}
    279 	if(fd < 0)
    280 		sysfatal("could not lock %s", s);
    281 	switch(tpid = fork()){
    282 	case -1:
    283 		sysfatal("fork: %r");
    284 	case 0:
    285 		for(;;){
    286 			sleep(30*1000);
    287 			free(dirfstat(fd));
    288 		}
    289 		_exits(nil);
    290 	default:
    291 		break;
    292 	}
    293 	close(fd);
    294 	atexit(killtickle);
    295 }