dict.c (12530B)
1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include <regexp.h> 5 #include <ctype.h> 6 #include "dict.h" 7 8 /* 9 * Assumed index file structure: lines of form 10 * [^\t]+\t[0-9]+ 11 * First field is key, second is byte offset into dictionary. 12 * Should be sorted with args -u -t' ' +0f -1 +0 -1 +1n -2 13 */ 14 typedef struct Addr Addr; 15 16 struct Addr { 17 int n; /* number of offsets */ 18 int cur; /* current position within doff array */ 19 int maxn; /* actual current size of doff array */ 20 ulong doff[1]; /* doff[maxn], with 0..n-1 significant */ 21 }; 22 23 Biobuf binbuf; 24 Biobuf boutbuf; 25 Biobuf *bin = &binbuf; /* user cmd input */ 26 Biobuf *bout = &boutbuf; /* output */ 27 Biobuf *bdict; /* dictionary */ 28 Biobuf *bindex; /* index file */ 29 long indextop; /* index offset at end of file */ 30 int lastcmd; /* last executed command */ 31 Addr *dot; /* "current" address */ 32 Dict *dict; /* current dictionary */ 33 int linelen; 34 int breaklen = 60; 35 int outinhibit; 36 int debug; 37 38 void execcmd(int); 39 int getpref(char*, Rune*); 40 Entry getentry(int); 41 int getfield(Rune*); 42 long locate(Rune*); 43 int parseaddr(char*, char**); 44 int parsecmd(char*); 45 int search(char*, int); 46 long seeknextline(Biobuf*, long); 47 void setdotnext(void); 48 void setdotprev(void); 49 void sortaddr(Addr*); 50 void usage(void); 51 52 enum { 53 Plen=300, /* max length of a search pattern */ 54 Fieldlen=200, /* max length of an index field */ 55 Aslots=10 /* initial number of slots in an address */ 56 }; 57 58 void 59 main(int argc, char **argv) 60 { 61 int i, cmd, kflag; 62 char *line, *p; 63 64 Binit(&binbuf, 0, OREAD); 65 Binit(&boutbuf, 1, OWRITE); 66 kflag = 0; 67 line = 0; 68 dict = 0; 69 70 for(i=0; dicts[i].name; i++){ 71 if(access(dictfile(dicts[i].path), 0)>=0 && access(dictfile(dicts[i].indexpath), 0)>=0){ 72 dict = &dicts[i]; 73 break; 74 } 75 } 76 ARGBEGIN { 77 case 'd': 78 p = ARGF(); 79 dict = 0; 80 if(p) { 81 for(i=0; dicts[i].name; i++) 82 if(strcmp(p, dicts[i].name)==0) { 83 dict = &dicts[i]; 84 break; 85 } 86 } 87 if(!dict) 88 usage(); 89 break; 90 case 'c': 91 line = ARGF(); 92 if(!line) 93 usage(); 94 break; 95 case 'k': 96 kflag++; 97 break; 98 case 'D': 99 debug++; 100 break; 101 default: 102 usage(); 103 ARGEND } 104 if(dict == 0){ 105 err("no dictionaries present on this system"); 106 exits("nodict"); 107 } 108 109 if(kflag) { 110 (*dict->printkey)(); 111 exits(0); 112 } 113 if(argc > 1) 114 usage(); 115 else if(argc == 1) { 116 if(line) 117 usage(); 118 p = argv[0]; 119 line = malloc(strlen(p)+5); 120 sprint(line, "/%s/P\n", p); 121 } 122 dict->path = dictfile(dict->path); 123 dict->indexpath = dictfile(dict->indexpath); 124 bdict = Bopen(dict->path, OREAD); 125 if(!bdict) { 126 err("can't open dictionary %s", dict->path); 127 exits("nodict"); 128 } 129 bindex = Bopen(dict->indexpath, OREAD); 130 if(!bindex) { 131 err("can't open index %s", dict->indexpath); 132 exits("noindex"); 133 } 134 indextop = Bseek(bindex, 0L, 2); 135 136 dot = malloc(sizeof(Addr)+(Aslots-1)*sizeof(ulong)); 137 dot->n = 0; 138 dot->cur = 0; 139 dot->maxn = Aslots; 140 lastcmd = 0; 141 142 if(line) { 143 cmd = parsecmd(line); 144 if(cmd) 145 execcmd(cmd); 146 } else { 147 for(;;) { 148 Bprint(bout, "*"); 149 Bflush(bout); 150 line = Brdline(bin, '\n'); 151 linelen = 0; 152 if(!line) 153 break; 154 cmd = parsecmd(line); 155 if(cmd) { 156 execcmd(cmd); 157 lastcmd = cmd; 158 } 159 } 160 } 161 exits(0); 162 } 163 164 void 165 usage(void) 166 { 167 int i; 168 char *a, *b; 169 170 Bprint(bout, "Usage: %s [-d dict] [-k] [-c cmd] [word]\n", argv0); 171 Bprint(bout, "dictionaries (brackets mark dictionaries not present on this system):\n"); 172 for(i = 0; dicts[i].name; i++){ 173 a = b = ""; 174 if(access(dictfile(dicts[i].path), 0)<0 || access(dictfile(dicts[i].indexpath), 0)<0){ 175 a = "["; 176 b = "]"; 177 } 178 Bprint(bout, " %s%s\t%s%s\n", a, dicts[i].name, dicts[i].desc, b); 179 } 180 exits("usage"); 181 } 182 183 int 184 parsecmd(char *line) 185 { 186 char *e; 187 int cmd, ans; 188 189 if(parseaddr(line, &e) >= 0) 190 line = e; 191 else 192 return 0; 193 cmd = *line; 194 ans = cmd; 195 if(isupper(cmd)) 196 cmd = tolower(cmd); 197 if(!(cmd == 'a' || cmd == 'h' || cmd == 'p' || cmd == 'r' || 198 cmd == '\n')) { 199 err("unknown command %c", cmd); 200 return 0; 201 } 202 if(cmd == '\n') 203 switch(lastcmd) { 204 case 0: ans = 'H'; break; 205 case 'H': ans = 'p'; break; 206 default : ans = lastcmd; break; 207 } 208 else if(line[1] != '\n' && line[1] != 0) 209 err("extra stuff after command %c ignored", cmd); 210 return ans; 211 } 212 213 void 214 execcmd(int cmd) 215 { 216 Entry e; 217 int cur, doall; 218 219 if(isupper(cmd)) { 220 doall = 1; 221 cmd = tolower(cmd); 222 cur = 0; 223 } else { 224 doall = 0; 225 cur = dot->cur; 226 } 227 if(debug && doall && cmd == 'a') 228 Bprint(bout, "%d entries, cur=%d\n", dot->n, cur+1); 229 for(;;){ 230 if(cur >= dot->n) 231 break; 232 if(doall) { 233 Bprint(bout, "%d\t", cur+1); 234 linelen += 4 + (cur >= 10); 235 } 236 switch(cmd) { 237 case 'a': 238 Bprint(bout, "#%lud\n", dot->doff[cur]); 239 break; 240 case 'h': 241 case 'p': 242 case 'r': 243 e = getentry(cur); 244 (*dict->printentry)(e, cmd); 245 break; 246 } 247 cur++; 248 if(doall) { 249 if(cmd == 'p' || cmd == 'r') { 250 Bputc(bout, '\n'); 251 linelen = 0; 252 } 253 } else 254 break; 255 } 256 if(cur >= dot->n) 257 cur = 0; 258 dot->cur = cur; 259 } 260 261 /* 262 * Address syntax: ('.' | '/' re '/' | '!' re '!' | number | '#' number) ('+' | '-')* 263 * Answer goes in dot. 264 * Return -1 if address starts, but get error. 265 * Return 0 if no address. 266 */ 267 int 268 parseaddr(char *line, char **eptr) 269 { 270 int delim, plen; 271 ulong v; 272 char *e; 273 char pat[Plen]; 274 275 if(*line == '/' || *line == '!') { 276 /* anchored regular expression match; '!' means no folding */ 277 if(*line == '/') { 278 delim = '/'; 279 e = strpbrk(line+1, "/\n"); 280 } else { 281 delim = '!'; 282 e = strpbrk(line+1, "!\n"); 283 } 284 plen = e-line-1; 285 if(plen >= Plen-3) { 286 err("pattern too big"); 287 return -1; 288 } 289 pat[0] = '^'; 290 memcpy(pat+1, line+1, plen); 291 pat[plen+1] = '$'; 292 pat[plen+2] = 0; 293 if(*e == '\n') 294 line = e; 295 else 296 line = e+1; 297 if(!search(pat, delim == '/')) { 298 err("pattern not found"); 299 return -1; 300 } 301 } else if(*line == '#') { 302 /* absolute byte offset into dictionary */ 303 line++; 304 if(!isdigit((uchar)*line)) 305 return -1; 306 v = strtoul(line, &e, 10); 307 line = e; 308 dot->doff[0] = v; 309 dot->n = 1; 310 dot->cur = 0; 311 } else if(isdigit((uchar)*line)) { 312 v = strtoul(line, &e, 10); 313 line = e; 314 if(v < 1 || v > dot->n) 315 err(".%d not in range [1,%d], ignored", 316 v, dot->n); 317 else 318 dot->cur = v-1; 319 } else if(*line == '.') { 320 line++; 321 } else { 322 *eptr = line; 323 return 0; 324 } 325 while(*line == '+' || *line == '-') { 326 if(*line == '+') 327 setdotnext(); 328 else 329 setdotprev(); 330 line++; 331 } 332 *eptr = line; 333 return 1; 334 } 335 336 /* 337 * Index file is sorted by folded field1. 338 * Method: find pre, a folded prefix of r.e. pat, 339 * and then low = offset to beginning of 340 * line in index file where first match of prefix occurs. 341 * Then go through index until prefix no longer matches, 342 * adding each line that matches real pattern to dot. 343 * Finally, sort dot offsets (uniquing). 344 * We know pat len < Plen, and that it is surrounded by ^..$ 345 */ 346 int 347 search(char *pat, int dofold) 348 { 349 int needre, prelen, match, n; 350 Reprog *re; 351 long ioff, v; 352 Rune pre[Plen]; 353 Rune lit[Plen]; 354 Rune entry[Fieldlen]; 355 char fpat[Plen]; 356 357 prelen = getpref(pat+1, pre); 358 if(pat[prelen+1] == 0 || pat[prelen+1] == '$') { 359 runescpy(lit, pre); 360 if(dofold) 361 fold(lit); 362 needre = 0; 363 SET(re); 364 } else { 365 needre = 1; 366 if(dofold) { 367 foldre(fpat, pat); 368 re = regcomp(fpat); 369 } else 370 re = regcomp(pat); 371 } 372 fold(pre); 373 ioff = locate(pre); 374 if(ioff < 0) 375 return 0; 376 dot->n = 0; 377 Bseek(bindex, ioff, 0); 378 for(;;) { 379 if(!getfield(entry)) 380 break; 381 if(dofold) 382 fold(entry); 383 if(needre) 384 match = rregexec(re, entry, 0, 0); 385 else 386 match = (acomp(lit, entry) == 0); 387 if(match) { 388 if(!getfield(entry)) 389 break; 390 v = runetol(entry); 391 if(dot->n >= dot->maxn) { 392 n = 2*dot->maxn; 393 dot = realloc(dot, 394 sizeof(Addr)+(n-1)*sizeof(long)); 395 if(!dot) { 396 err("out of memory"); 397 exits("nomem"); 398 } 399 dot->maxn = n; 400 } 401 dot->doff[dot->n++] = v; 402 } else { 403 if(!dofold) 404 fold(entry); 405 if(*pre) { 406 n = acomp(pre, entry); 407 if(n < -1 || (!needre && n < 0)) 408 break; 409 } 410 /* get to next index entry */ 411 if(!getfield(entry)) 412 break; 413 } 414 } 415 sortaddr(dot); 416 dot->cur = 0; 417 return dot->n; 418 } 419 420 /* 421 * Return offset in index file of first line whose folded 422 * first field has pre as a prefix. -1 if none found. 423 */ 424 long 425 locate(Rune *pre) 426 { 427 long top, bot, mid; 428 Rune entry[Fieldlen]; 429 430 if(*pre == 0) 431 return 0; 432 bot = 0; 433 top = indextop; 434 if(debug>1) 435 fprint(2, "locate looking for prefix %S\n", pre); 436 for(;;) { 437 /* 438 * Loop invariant: foldkey(bot) < pre <= foldkey(top) 439 * and bot < top, and bot,top point at beginning of lines 440 */ 441 mid = (top+bot) / 2; 442 mid = seeknextline(bindex, mid); 443 if(debug > 1) 444 fprint(2, "bot=%ld, mid=%ld->%ld, top=%ld\n", 445 bot, (top+bot) / 2, mid, top); 446 if(mid == top || !getfield(entry)) 447 break; 448 if(debug > 1) 449 fprint(2, "key=%S\n", entry); 450 /* 451 * here mid is strictly between bot and top 452 */ 453 fold(entry); 454 if(acomp(pre, entry) <= 0) 455 top = mid; 456 else 457 bot = mid; 458 } 459 /* 460 * bot < top, but they don't necessarily point at successive lines 461 * Use linear search from bot to find first line that pre is a 462 * prefix of 463 */ 464 while((bot = seeknextline(bindex, bot)) <= top) { 465 if(!getfield(entry)) 466 return -1; 467 if(debug > 1) 468 fprint(2, "key=%S\n", entry); 469 fold(entry); 470 switch(acomp(pre, entry)) { 471 case -2: 472 return -1; 473 case -1: 474 case 0: 475 return bot; 476 case 1: 477 case 2: 478 continue; 479 } 480 } 481 return -1; 482 483 } 484 485 /* 486 * Get prefix of non re-metacharacters, runified, into pre, 487 * and return length 488 */ 489 int 490 getpref(char *pat, Rune *pre) 491 { 492 int n, r; 493 char *p; 494 495 p = pat; 496 while(*p) { 497 n = chartorune(pre, p); 498 r = *pre; 499 switch(r) { 500 case 0x2e: case 0x2a: case 0x2b: case 0x3f: 501 case 0x5b: case 0x5d: case 0x28: case ')': 502 case 0x7c: case 0x5e: case 0x24: 503 *pre = 0; 504 return p-pat; 505 case '\\': 506 p += n; 507 p += chartorune(++pre, p); 508 pre++; 509 break; 510 default: 511 p += n; 512 pre++; 513 } 514 } 515 return p-pat; 516 } 517 518 long 519 seeknextline(Biobuf *b, long off) 520 { 521 long c; 522 523 Bseek(b, off, 0); 524 do { 525 c = Bgetrune(b); 526 } while(c>=0 && c!='\n'); 527 return Boffset(b); 528 } 529 530 /* 531 * Get next field out of index file (either tab- or nl- terminated) 532 * Answer in *rp, assumed to be Fieldlen long. 533 * Return 0 if read error first. 534 */ 535 int 536 getfield(Rune *rp) 537 { 538 long c; 539 int n; 540 541 for(n=Fieldlen; n-- > 0; ) { 542 if ((c = Bgetrune(bindex)) < 0) 543 return 0; 544 if(c == '\t' || c == '\n') { 545 *rp = '\0'; 546 return 1; 547 } 548 *rp++ = c; 549 } 550 err("word too long"); 551 return 0; 552 } 553 554 /* 555 * A compare longs function suitable for qsort 556 */ 557 static int 558 longcmp(const void *av, const void *bv) 559 { 560 long v; 561 long *a, *b; 562 563 a = (long*)av; 564 b = (long*)bv; 565 566 v = *a - *b; 567 if(v < 0) 568 return -1; 569 else if(v == 0) 570 return 0; 571 else 572 return 1; 573 } 574 575 void 576 sortaddr(Addr *a) 577 { 578 int i, j; 579 long v; 580 581 if(a->n <= 1) 582 return; 583 584 qsort(a->doff, a->n, sizeof(long), longcmp); 585 586 /* remove duplicates */ 587 for(i=0, j=0; j < a->n; j++) { 588 v = a->doff[j]; 589 if(i > 0 && v == a->doff[i-1]) 590 continue; 591 a->doff[i++] = v; 592 } 593 a->n = i; 594 } 595 596 Entry 597 getentry(int i) 598 { 599 long b, e, n; 600 static Entry ans; 601 static int anslen = 0; 602 603 b = dot->doff[i]; 604 e = (*dict->nextoff)(b+1); 605 ans.doff = b; 606 if(e < 0) { 607 err("couldn't seek to entry"); 608 ans.start = 0; 609 ans.end = 0; 610 } else { 611 n = e-b; 612 if(n+1 > anslen) { 613 ans.start = realloc(ans.start, n+1); 614 if(!ans.start) { 615 err("out of memory"); 616 exits("nomem"); 617 } 618 anslen = n+1; 619 } 620 Bseek(bdict, b, 0); 621 n = Bread(bdict, ans.start, n); 622 ans.end = ans.start + n; 623 *ans.end = 0; 624 } 625 return ans; 626 } 627 628 void 629 setdotnext(void) 630 { 631 long b; 632 633 b = (*dict->nextoff)(dot->doff[dot->cur]+1); 634 if(b < 0) { 635 err("couldn't find a next entry"); 636 return; 637 } 638 dot->doff[0] = b; 639 dot->n = 1; 640 dot->cur = 0; 641 } 642 643 void 644 setdotprev(void) 645 { 646 int tryback; 647 long here, last, p; 648 649 if(dot->cur < 0 || dot->cur >= dot->n) 650 return; 651 tryback = 2000; 652 here = dot->doff[dot->cur]; 653 last = 0; 654 while(last == 0) { 655 p = here - tryback; 656 if(p < 0) 657 p = 0; 658 for(;;) { 659 p = (*dict->nextoff)(p+1); 660 if(p < 0) 661 return; /* shouldn't happen */ 662 if(p >= here) 663 break; 664 last = p; 665 } 666 if(!last) { 667 if(here - tryback < 0) { 668 err("can't find a previous entry"); 669 return; 670 } 671 tryback = 2*tryback; 672 } 673 } 674 dot->doff[0] = last; 675 dot->n = 1; 676 dot->cur = 0; 677 } 678 679 /* 680 * find the specified file and return a path. 681 * default location is #9/dict, but can be 682 * in $dictdir instead. 683 */ 684 char* 685 dictfile(char *f) 686 { 687 static char *dict; 688 static int did; 689 690 if(!did){ 691 dict = getenv("dictpath"); 692 did = 1; 693 } 694 695 if(dict) 696 return smprint("%s/%s", dict, f); 697 return unsharp(smprint("#9/dict/%s", f)); 698 }