dict.c - plan9port - fork of plan9port with libvec, libstr and libsdb

dict.c (12530B)
      1 #include <u.h>
      2 #include <libc.h>
      3 #include <bio.h>
      4 #include <regexp.h>
      5 #include <ctype.h>
      6 #include "dict.h"
      7 
      8 /*
      9  * Assumed index file structure: lines of form
     10  * 	[^\t]+\t[0-9]+
     11  * First field is key, second is byte offset into dictionary.
     12  * Should be sorted with args -u -t'	' +0f -1 +0 -1 +1n -2
     13  */
     14 typedef struct Addr Addr;
     15 
     16 struct Addr {
     17 	int	n;		/* number of offsets */
     18 	int	cur;		/* current position within doff array */
     19 	int	maxn;		/* actual current size of doff array */
     20 	ulong	doff[1];	/* doff[maxn], with 0..n-1 significant */
     21 };
     22 
     23 Biobuf	binbuf;
     24 Biobuf	boutbuf;
     25 Biobuf	*bin = &binbuf;		/* user cmd input */
     26 Biobuf	*bout = &boutbuf;	/* output */
     27 Biobuf	*bdict;			/* dictionary */
     28 Biobuf	*bindex;		/* index file */
     29 long	indextop;		/* index offset at end of file */
     30 int	lastcmd;		/* last executed command */
     31 Addr	*dot;			/* "current" address */
     32 Dict	*dict;			/* current dictionary */
     33 int	linelen;
     34 int	breaklen = 60;
     35 int	outinhibit;
     36 int	debug;
     37 
     38 void	execcmd(int);
     39 int	getpref(char*, Rune*);
     40 Entry	getentry(int);
     41 int	getfield(Rune*);
     42 long	locate(Rune*);
     43 int	parseaddr(char*, char**);
     44 int	parsecmd(char*);
     45 int	search(char*, int);
     46 long	seeknextline(Biobuf*, long);
     47 void	setdotnext(void);
     48 void	setdotprev(void);
     49 void	sortaddr(Addr*);
     50 void	usage(void);
     51 
     52 enum {
     53 	Plen=300,	/* max length of a search pattern */
     54 	Fieldlen=200,	/* max length of an index field */
     55 	Aslots=10	/* initial number of slots in an address */
     56 };
     57 
     58 void
     59 main(int argc, char **argv)
     60 {
     61 	int i, cmd, kflag;
     62 	char *line, *p;
     63 
     64 	Binit(&binbuf, 0, OREAD);
     65 	Binit(&boutbuf, 1, OWRITE);
     66 	kflag = 0;
     67 	line = 0;
     68 	dict = 0;
     69 
     70 	for(i=0; dicts[i].name; i++){
     71 		if(access(dictfile(dicts[i].path), 0)>=0 && access(dictfile(dicts[i].indexpath), 0)>=0){
     72 			dict = &dicts[i];
     73 			break;
     74 		}
     75 	}
     76 	ARGBEGIN {
     77 		case 'd':
     78 			p = ARGF();
     79 			dict = 0;
     80 			if(p) {
     81 				for(i=0; dicts[i].name; i++)
     82 					if(strcmp(p, dicts[i].name)==0) {
     83 						dict = &dicts[i];
     84 						break;
     85 					}
     86 			}
     87 			if(!dict)
     88 				usage();
     89 			break;
     90 		case 'c':
     91 			line = ARGF();
     92 			if(!line)
     93 				usage();
     94 			break;
     95 		case 'k':
     96 			kflag++;
     97 			break;
     98 		case 'D':
     99 			debug++;
    100 			break;
    101 		default:
    102 			usage();
    103 	ARGEND }
    104 	if(dict == 0){
    105 		err("no dictionaries present on this system");
    106 		exits("nodict");
    107 	}
    108 
    109 	if(kflag) {
    110 		(*dict->printkey)();
    111 		exits(0);
    112 	}
    113 	if(argc > 1)
    114 		usage();
    115 	else if(argc == 1) {
    116 		if(line)
    117 			usage();
    118 		p = argv[0];
    119 		line = malloc(strlen(p)+5);
    120 		sprint(line, "/%s/P\n", p);
    121 	}
    122 	dict->path = dictfile(dict->path);
    123 	dict->indexpath = dictfile(dict->indexpath);
    124 	bdict = Bopen(dict->path, OREAD);
    125 	if(!bdict) {
    126 		err("can't open dictionary %s", dict->path);
    127 		exits("nodict");
    128 	}
    129 	bindex = Bopen(dict->indexpath, OREAD);
    130 	if(!bindex) {
    131 		err("can't open index %s", dict->indexpath);
    132 		exits("noindex");
    133 	}
    134 	indextop = Bseek(bindex, 0L, 2);
    135 
    136 	dot = malloc(sizeof(Addr)+(Aslots-1)*sizeof(ulong));
    137 	dot->n = 0;
    138 	dot->cur = 0;
    139 	dot->maxn = Aslots;
    140 	lastcmd = 0;
    141 
    142 	if(line) {
    143 		cmd = parsecmd(line);
    144 		if(cmd)
    145 			execcmd(cmd);
    146 	} else {
    147 		for(;;) {
    148 			Bprint(bout, "*");
    149 			Bflush(bout);
    150 			line = Brdline(bin, '\n');
    151 			linelen = 0;
    152 			if(!line)
    153 				break;
    154 			cmd = parsecmd(line);
    155 			if(cmd) {
    156 				execcmd(cmd);
    157 				lastcmd = cmd;
    158 			}
    159 		}
    160 	}
    161 	exits(0);
    162 }
    163 
    164 void
    165 usage(void)
    166 {
    167 	int i;
    168 	char *a, *b;
    169 
    170 	Bprint(bout, "Usage: %s [-d dict] [-k] [-c cmd] [word]\n", argv0);
    171 	Bprint(bout, "dictionaries (brackets mark dictionaries not present on this system):\n");
    172 	for(i = 0; dicts[i].name; i++){
    173 		a = b = "";
    174 		if(access(dictfile(dicts[i].path), 0)<0 || access(dictfile(dicts[i].indexpath), 0)<0){
    175 			a = "[";
    176 			b = "]";
    177 		}
    178 		Bprint(bout, "   %s%s\t%s%s\n", a, dicts[i].name, dicts[i].desc, b);
    179 	}
    180 	exits("usage");
    181 }
    182 
    183 int
    184 parsecmd(char *line)
    185 {
    186 	char *e;
    187 	int cmd, ans;
    188 
    189 	if(parseaddr(line, &e) >= 0)
    190 		line = e;
    191 	else
    192 		return 0;
    193 	cmd = *line;
    194 	ans = cmd;
    195 	if(isupper(cmd))
    196 		cmd = tolower(cmd);
    197 	if(!(cmd == 'a' || cmd == 'h' || cmd == 'p' || cmd == 'r' ||
    198 	     cmd == '\n')) {
    199 		err("unknown command %c", cmd);
    200 		return 0;
    201 	}
    202 	if(cmd == '\n')
    203 		switch(lastcmd) {
    204 			case 0:	ans = 'H'; break;
    205 			case 'H':	ans = 'p'; break;
    206 			default :	ans = lastcmd; break;
    207 		}
    208 	else if(line[1] != '\n' && line[1] != 0)
    209 		err("extra stuff after command %c ignored", cmd);
    210 	return ans;
    211 }
    212 
    213 void
    214 execcmd(int cmd)
    215 {
    216 	Entry e;
    217 	int cur, doall;
    218 
    219 	if(isupper(cmd)) {
    220 		doall = 1;
    221 		cmd = tolower(cmd);
    222 		cur = 0;
    223 	} else {
    224 		doall = 0;
    225 		cur = dot->cur;
    226 	}
    227 	if(debug && doall && cmd == 'a')
    228 		Bprint(bout, "%d entries, cur=%d\n", dot->n, cur+1);
    229 	for(;;){
    230 		if(cur >= dot->n)
    231 			break;
    232 		if(doall) {
    233 			Bprint(bout, "%d\t", cur+1);
    234 			linelen += 4 + (cur >= 10);
    235 		}
    236 		switch(cmd) {
    237 		case 'a':
    238 			Bprint(bout, "#%lud\n", dot->doff[cur]);
    239 			break;
    240 		case 'h':
    241 		case 'p':
    242 		case 'r':
    243 			e = getentry(cur);
    244 			(*dict->printentry)(e, cmd);
    245 			break;
    246 		}
    247 		cur++;
    248 		if(doall) {
    249 			if(cmd == 'p' || cmd == 'r') {
    250 				Bputc(bout, '\n');
    251 				linelen = 0;
    252 			}
    253 		} else
    254 			break;
    255 	}
    256 	if(cur >= dot->n)
    257 		cur = 0;
    258 	dot->cur = cur;
    259 }
    260 
    261 /*
    262  * Address syntax: ('.' | '/' re '/' | '!' re '!' | number | '#' number) ('+' | '-')*
    263  * Answer goes in dot.
    264  * Return -1 if address starts, but get error.
    265  * Return 0 if no address.
    266  */
    267 int
    268 parseaddr(char *line, char **eptr)
    269 {
    270 	int delim, plen;
    271 	ulong v;
    272 	char *e;
    273 	char pat[Plen];
    274 
    275 	if(*line == '/' || *line == '!') {
    276 		/* anchored regular expression match; '!' means no folding */
    277 		if(*line == '/') {
    278 			delim = '/';
    279 			e = strpbrk(line+1, "/\n");
    280 		} else {
    281 			delim = '!';
    282 			e = strpbrk(line+1, "!\n");
    283 		}
    284 		plen = e-line-1;
    285 		if(plen >= Plen-3) {
    286 			err("pattern too big");
    287 			return -1;
    288 		}
    289 		pat[0] = '^';
    290 		memcpy(pat+1, line+1, plen);
    291 		pat[plen+1] = '$';
    292 		pat[plen+2] = 0;
    293 		if(*e == '\n')
    294 			line = e;
    295 		else
    296 			line = e+1;
    297 		if(!search(pat, delim == '/')) {
    298 			err("pattern not found");
    299 			return -1;
    300 		}
    301 	} else if(*line == '#') {
    302 		/* absolute byte offset into dictionary */
    303 		line++;
    304 		if(!isdigit((uchar)*line))
    305 			return -1;
    306 		v = strtoul(line, &e, 10);
    307 		line = e;
    308 		dot->doff[0] = v;
    309 		dot->n = 1;
    310 		dot->cur = 0;
    311 	} else if(isdigit((uchar)*line)) {
    312 		v = strtoul(line, &e, 10);
    313 		line = e;
    314 		if(v < 1 || v > dot->n)
    315 			err(".%d not in range [1,%d], ignored",
    316 				v, dot->n);
    317 		else
    318 			dot->cur = v-1;
    319 	} else if(*line == '.') {
    320 		line++;
    321 	} else {
    322 		*eptr = line;
    323 		return 0;
    324 	}
    325 	while(*line == '+' || *line == '-') {
    326 		if(*line == '+')
    327 			setdotnext();
    328 		else
    329 			setdotprev();
    330 		line++;
    331 	}
    332 	*eptr = line;
    333 	return 1;
    334 }
    335 
    336 /*
    337  * Index file is sorted by folded field1.
    338  * Method: find pre, a folded prefix of r.e. pat,
    339  * and then low = offset to beginning of
    340  * line in index file where first match of prefix occurs.
    341  * Then go through index until prefix no longer matches,
    342  * adding each line that matches real pattern to dot.
    343  * Finally, sort dot offsets (uniquing).
    344  * We know pat len < Plen, and that it is surrounded by ^..$
    345  */
    346 int
    347 search(char *pat, int dofold)
    348 {
    349 	int needre, prelen, match, n;
    350 	Reprog *re;
    351 	long ioff, v;
    352 	Rune pre[Plen];
    353 	Rune lit[Plen];
    354 	Rune entry[Fieldlen];
    355 	char fpat[Plen];
    356 
    357 	prelen = getpref(pat+1, pre);
    358 	if(pat[prelen+1] == 0 || pat[prelen+1] == '$') {
    359 		runescpy(lit, pre);
    360 		if(dofold)
    361 			fold(lit);
    362 		needre = 0;
    363 		SET(re);
    364 	} else {
    365 		needre = 1;
    366 		if(dofold) {
    367 			foldre(fpat, pat);
    368 			re = regcomp(fpat);
    369 		} else
    370 			re = regcomp(pat);
    371 	}
    372 	fold(pre);
    373 	ioff = locate(pre);
    374 	if(ioff < 0)
    375 		return 0;
    376 	dot->n = 0;
    377 	Bseek(bindex, ioff, 0);
    378 	for(;;) {
    379 		if(!getfield(entry))
    380 			break;
    381 		if(dofold)
    382 			fold(entry);
    383 		if(needre)
    384 			match = rregexec(re, entry, 0, 0);
    385 		else
    386 			match = (acomp(lit, entry) == 0);
    387 		if(match) {
    388 			if(!getfield(entry))
    389 				break;
    390 			v = runetol(entry);
    391 			if(dot->n >= dot->maxn) {
    392 				n = 2*dot->maxn;
    393 				dot = realloc(dot,
    394 					sizeof(Addr)+(n-1)*sizeof(long));
    395 				if(!dot) {
    396 					err("out of memory");
    397 					exits("nomem");
    398 				}
    399 				dot->maxn = n;
    400 			}
    401 			dot->doff[dot->n++] = v;
    402 		} else {
    403 			if(!dofold)
    404 				fold(entry);
    405 			if(*pre) {
    406 				n = acomp(pre, entry);
    407 				if(n < -1 || (!needre && n < 0))
    408 					break;
    409 			}
    410 			/* get to next index entry */
    411 			if(!getfield(entry))
    412 				break;
    413 		}
    414 	}
    415 	sortaddr(dot);
    416 	dot->cur = 0;
    417 	return dot->n;
    418 }
    419 
    420 /*
    421  * Return offset in index file of first line whose folded
    422  * first field has pre as a prefix.  -1 if none found.
    423  */
    424 long
    425 locate(Rune *pre)
    426 {
    427 	long top, bot, mid;
    428 	Rune entry[Fieldlen];
    429 
    430 	if(*pre == 0)
    431 		return 0;
    432 	bot = 0;
    433 	top = indextop;
    434 	if(debug>1)
    435 		fprint(2, "locate looking for prefix %S\n", pre);
    436 	for(;;) {
    437 		/*
    438 		 * Loop invariant: foldkey(bot) < pre <= foldkey(top)
    439 		 * and bot < top, and bot,top point at beginning of lines
    440 		 */
    441 		mid = (top+bot) / 2;
    442 		mid = seeknextline(bindex, mid);
    443 		if(debug > 1)
    444 			fprint(2, "bot=%ld, mid=%ld->%ld, top=%ld\n",
    445 				bot, (top+bot) / 2, mid, top);
    446 		if(mid == top || !getfield(entry))
    447 			break;
    448 		if(debug > 1)
    449 			fprint(2, "key=%S\n", entry);
    450 		/*
    451 		 * here mid is strictly between bot and top
    452 		 */
    453 		fold(entry);
    454 		if(acomp(pre, entry) <= 0)
    455 			top = mid;
    456 		else
    457 			bot = mid;
    458 	}
    459 	/*
    460 	 * bot < top, but they don't necessarily point at successive lines
    461 	 * Use linear search from bot to find first line that pre is a
    462 	 * prefix of
    463 	 */
    464 	while((bot = seeknextline(bindex, bot)) <= top) {
    465 		if(!getfield(entry))
    466 			return -1;
    467 		if(debug > 1)
    468 			fprint(2, "key=%S\n", entry);
    469 		fold(entry);
    470 		switch(acomp(pre, entry)) {
    471 		case -2:
    472 			return -1;
    473 		case -1:
    474 		case 0:
    475 			return bot;
    476 		case 1:
    477 		case 2:
    478 			continue;
    479 		}
    480 	}
    481 	return -1;
    482 
    483 }
    484 
    485 /*
    486  * Get prefix of non re-metacharacters, runified, into pre,
    487  * and return length
    488  */
    489 int
    490 getpref(char *pat, Rune *pre)
    491 {
    492 	int n, r;
    493 	char *p;
    494 
    495 	p = pat;
    496 	while(*p) {
    497 		n = chartorune(pre, p);
    498 		r = *pre;
    499 		switch(r) {
    500 		case 0x2e: case 0x2a: case 0x2b: case 0x3f:
    501 		case 0x5b: case 0x5d: case 0x28: case ')':
    502 		case 0x7c: case 0x5e: case 0x24:
    503 			*pre = 0;
    504 			return p-pat;
    505 		case '\\':
    506 			p += n;
    507 			p += chartorune(++pre, p);
    508 			pre++;
    509 			break;
    510 		default:
    511 			p += n;
    512 			pre++;
    513 		}
    514 	}
    515 	return p-pat;
    516 }
    517 
    518 long
    519 seeknextline(Biobuf *b, long off)
    520 {
    521 	long c;
    522 
    523 	Bseek(b, off, 0);
    524 	do {
    525 		c = Bgetrune(b);
    526 	} while(c>=0 && c!='\n');
    527 	return Boffset(b);
    528 }
    529 
    530 /*
    531  * Get next field out of index file (either tab- or nl- terminated)
    532  * Answer in *rp, assumed to be Fieldlen long.
    533  * Return 0 if read error first.
    534  */
    535 int
    536 getfield(Rune *rp)
    537 {
    538 	long c;
    539 	int n;
    540 
    541 	for(n=Fieldlen; n-- > 0; ) {
    542 		if ((c = Bgetrune(bindex)) < 0)
    543 			return 0;
    544 		if(c == '\t' || c == '\n') {
    545 			*rp = '\0';
    546 			return 1;
    547 		}
    548 		*rp++ = c;
    549 	}
    550 	err("word too long");
    551 	return 0;
    552 }
    553 
    554 /*
    555  * A compare longs function suitable for qsort
    556  */
    557 static int
    558 longcmp(const void *av, const void *bv)
    559 {
    560 	long v;
    561 	long *a, *b;
    562 
    563 	a = (long*)av;
    564 	b = (long*)bv;
    565 
    566 	v = *a - *b;
    567 	if(v < 0)
    568 		return -1;
    569 	else if(v == 0)
    570 		return 0;
    571 	else
    572 		return 1;
    573 }
    574 
    575 void
    576 sortaddr(Addr *a)
    577 {
    578 	int i, j;
    579 	long v;
    580 
    581 	if(a->n <= 1)
    582 		return;
    583 
    584 	qsort(a->doff, a->n, sizeof(long), longcmp);
    585 
    586 	/* remove duplicates */
    587 	for(i=0, j=0; j < a->n; j++) {
    588 		v = a->doff[j];
    589 		if(i > 0 && v == a->doff[i-1])
    590 			continue;
    591 		a->doff[i++] = v;
    592 	}
    593 	a->n = i;
    594 }
    595 
    596 Entry
    597 getentry(int i)
    598 {
    599 	long b, e, n;
    600 	static Entry ans;
    601 	static int anslen = 0;
    602 
    603 	b = dot->doff[i];
    604 	e = (*dict->nextoff)(b+1);
    605 	ans.doff = b;
    606 	if(e < 0) {
    607 		err("couldn't seek to entry");
    608 		ans.start = 0;
    609 		ans.end = 0;
    610 	} else {
    611 		n = e-b;
    612 		if(n+1 > anslen) {
    613 			ans.start = realloc(ans.start, n+1);
    614 			if(!ans.start) {
    615 				err("out of memory");
    616 				exits("nomem");
    617 			}
    618 			anslen = n+1;
    619 		}
    620 		Bseek(bdict, b, 0);
    621 		n = Bread(bdict, ans.start, n);
    622 		ans.end = ans.start + n;
    623 		*ans.end = 0;
    624 	}
    625 	return ans;
    626 }
    627 
    628 void
    629 setdotnext(void)
    630 {
    631 	long b;
    632 
    633 	b = (*dict->nextoff)(dot->doff[dot->cur]+1);
    634 	if(b < 0) {
    635 		err("couldn't find a next entry");
    636 		return;
    637 	}
    638 	dot->doff[0] = b;
    639 	dot->n = 1;
    640 	dot->cur = 0;
    641 }
    642 
    643 void
    644 setdotprev(void)
    645 {
    646 	int tryback;
    647 	long here, last, p;
    648 
    649 	if(dot->cur < 0 || dot->cur >= dot->n)
    650 		return;
    651 	tryback = 2000;
    652 	here = dot->doff[dot->cur];
    653 	last = 0;
    654 	while(last == 0) {
    655 		p = here - tryback;
    656 		if(p < 0)
    657 			p = 0;
    658 		for(;;) {
    659 			p = (*dict->nextoff)(p+1);
    660 			if(p < 0)
    661 				return; /* shouldn't happen */
    662 			if(p >= here)
    663 				break;
    664 			last = p;
    665 		}
    666 		if(!last) {
    667 			if(here - tryback < 0) {
    668 				err("can't find a previous entry");
    669 				return;
    670 			}
    671 			tryback = 2*tryback;
    672 		}
    673 	}
    674 	dot->doff[0] = last;
    675 	dot->n = 1;
    676 	dot->cur = 0;
    677 }
    678 
    679 /*
    680  * find the specified file and return a path.
    681  * default location is #9/dict, but can be
    682  * in $dictdir instead.
    683  */
    684 char*
    685 dictfile(char *f)
    686 {
    687 	static char *dict;
    688 	static int did;
    689 
    690 	if(!did){
    691 		dict = getenv("dictpath");
    692 		did = 1;
    693 	}
    694 
    695 	if(dict)
    696 		return smprint("%s/%s", dict, f);
    697 	return unsharp(smprint("#9/dict/%s", f));
    698 }
	plan9port fork of plan9port with libvec, libstr and libsdb
	Log \| Files \| Refs \| README \| LICENSE