/* sprog.c */
1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include <ctype.h> 5 #include "code.h" 6 7 /* fig leaves for possibly signed char quantities */ 8 #define ISUPPER(c) isupper((c)&0xff) 9 #define ISLOWER(c) islower((c)&0xff) 10 #define ISALPHA(c) isalpha((c)&0xff) 11 #define ISDIGIT(c) isdigit((c)&0xff) 12 #define ISVOWEL(c) voweltab[(c)&0xff] 13 #define Tolower(c) (ISUPPER(c)? (c)-'A'+'a': (c)) 14 #define pair(a,b) (((a)<<8) | (b)) 15 #define DLEV 2 16 #define DSIZ 40 17 18 typedef long Bits; 19 #define Set(h, f) ((long)(h) & (f)) 20 21 Bits nop(char*, char*, char*, int, int); 22 Bits strip(char*, char*, char*, int, int); 23 Bits ize(char*, char*, char*, int, int); 24 Bits i_to_y(char*, char*, char*, int, int); 25 Bits ily(char*, char*, char*, int, int); 26 Bits subst(char*, char*, char*, int, int); 27 Bits CCe(char*, char*, char*, int, int); 28 Bits tion(char*, char*, char*, int, int); 29 Bits an(char*, char*, char*, int, int); 30 Bits s(char*, char*, char*, int, int); 31 Bits es(char*, char*, char*, int, int); 32 Bits bility(char*, char*, char*, int, int); 33 Bits y_to_e(char*, char*, char*, int, int); 34 Bits VCe(char*, char*, char*, int, int); 35 36 Bits trypref(char*, char*, int, int); 37 Bits tryword(char*, char*, int, int); 38 Bits trysuff(char*, int, int); 39 Bits dict(char*, char*); 40 void typeprint(Bits); 41 void pcomma(char*); 42 43 void ise(void); 44 int ordinal(void); 45 char* skipv(char*); 46 int inun(char*, Bits); 47 char* ztos(char*); 48 void readdict(char*); 49 50 typedef struct Ptab Ptab; 51 struct Ptab 52 { 53 char* s; 54 int flag; 55 }; 56 57 typedef struct Suftab Suftab; 58 struct Suftab 59 { 60 char *suf; 61 Bits (*p1)(char*, char*, char*, int, int); 62 int n1; 63 char *d1; 64 char *a1; 65 int flag; 66 int affixable; 67 Bits (*p2)(char*, char*, char*, int, int); 68 int n2; 69 char *d2; 70 char *a2; 71 }; 72 73 Suftab staba[] = { 74 {"aibohp",subst,1,"-e+ia","",NOUN, NOUN}, 75 0 76 }; 77 78 Suftab stabc[] = 79 { 80 
{"cai",strip,1,"","+c",N_AFFIX, ADJ|NOUN}, 81 {"citsi",strip,2,"","+ic",N_AFFIX, ADJ | N_AFFIX | NOUN}, 82 {"citi",ize,1,"-e+ic","",N_AFFIX, ADJ }, 83 {"cihparg",i_to_y,1,"-y+ic","",NOUN, ADJ|NOUN }, 84 {"cipocs",ize,1,"-e+ic","",NOUN, ADJ }, 85 {"cirtem",i_to_y,1,"-y+ic","",NOUN, ADJ }, 86 {"cigol",i_to_y,1,"-y+ic","",NOUN, ADJ }, 87 {"cimono",i_to_y,1,"-y+ic","",NOUN, ADJ }, 88 {"cibohp",subst,1,"-e+ic","",NOUN, ADJ }, 89 0 90 }; 91 Suftab stabd[] = 92 { 93 {"de",strip,1,"","+d",ED,ADJ |COMP,i_to_y,2,"-y+ied","+ed"}, 94 {"dooh",ily,4,"-y+ihood","+hood",NOUN | ADV, NOUN}, 95 0 96 }; 97 Suftab stabe[] = 98 { 99 /* 100 * V_affix for comment ->commence->commentment?? 101 */ 102 {"ecna",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX}, 103 {"ecne",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX}, 104 {"elbaif",i_to_y,4,"-y+iable","",V_IRREG,ADJ}, 105 {"elba",CCe,4,"-e+able","+able",V_AFFIX,ADJ}, 106 {"evi",subst,0,"-ion+ive","",N_AFFIX | V_AFFIX,NOUN | N_AFFIX| ADJ}, 107 {"ezi",CCe,3,"-e+ize","+ize",N_AFFIX|ADJ ,V_AFFIX | VERB |ION | COMP}, 108 {"ekil",strip,4,"","+like",N_AFFIX ,ADJ}, 109 0 110 }; 111 Suftab stabg[] = 112 { 113 {"gniee",strip,3,"","+ing",V_IRREG ,ADJ|NOUN}, 114 {"gnikam",strip,6,"","+making",NOUN,NOUN}, 115 {"gnipeek",strip,7,"","+keeping",NOUN,NOUN}, 116 {"gni",CCe,3,"-e+ing","+ing",V_IRREG ,ADJ|ED|NOUN}, 117 0 118 }; 119 Suftab stabl[] = 120 { 121 {"ladio",strip,2,"","+al",NOUN |ADJ,ADJ}, 122 {"laci",strip,2,"","+al",NOUN |ADJ,ADJ |NOUN|N_AFFIX}, 123 {"latnem",strip,2,"","+al",N_AFFIX,ADJ}, 124 {"lanoi",strip,2,"","+al",N_AFFIX,ADJ|NOUN}, 125 {"luf",ily,3,"-y+iful","+ful",N_AFFIX,ADJ | NOUN}, 126 0 127 }; 128 Suftab stabm[] = 129 { 130 /* congregational + ism */ 131 {"msi",CCe,3,"-e+ism","ism",N_AFFIX|ADJ,NOUN}, 132 {"margo",subst,-1,"-ph+m","",NOUN,NOUN}, 133 0 134 }; 135 Suftab stabn[] = 136 { 137 {"noitacifi",i_to_y,6,"-y+ication","",ION,NOUN | N_AFFIX}, 138 {"noitazi",ize,4,"-e+ation","",ION,NOUN| N_AFFIX}, 139 
{"noit",tion,3,"-e+ion","+ion",ION,NOUN| N_AFFIX | V_AFFIX |VERB|ACTOR}, 140 {"naino",an,3,"","+ian",NOUN|PROP_COLLECT,NOUN| N_AFFIX}, 141 {"namow",strip,5,"","+woman",MAN,PROP_COLLECT|N_AFFIX}, 142 {"nam",strip,3,"","+man",MAN,PROP_COLLECT | N_AFFIX | VERB}, 143 {"na",an,1,"","+n",NOUN|PROP_COLLECT,NOUN | N_AFFIX}, 144 {"nemow",strip,5,"","+women",MAN,PROP_COLLECT}, 145 {"nem",strip,3,"","+man",MAN,PROP_COLLECT}, 146 {"nosrep",strip,6,"","+person",MAN,PROP_COLLECT}, 147 0 148 }; 149 Suftab stabp[] = 150 { 151 {"pihs",strip,4,"","+ship",NOUN|PROP_COLLECT,NOUN| N_AFFIX}, 152 0 153 }; 154 Suftab stabr[] = 155 { 156 {"rehparg",subst,1,"-y+er","",ACTOR,NOUN,strip,2,"","+er"}, 157 {"reyhparg",nop,0,"","",0,NOUN}, 158 {"reyl",nop,0,"","",0,NOUN}, 159 {"rekam",strip,5,"","+maker",NOUN,NOUN}, 160 {"repeek",strip,6,"","+keeper",NOUN,NOUN}, 161 {"re",strip,1,"","+r",ACTOR,NOUN | N_AFFIX|VERB|ADJ, i_to_y,2,"-y+ier","+er"}, 162 {"rota",tion,2,"-e+or","",ION,NOUN| N_AFFIX|_Y}, 163 {"rotc",tion,2,"","+or",ION,NOUN| N_AFFIX}, 164 {"rotp",tion,2,"","+or",ION,NOUN| N_AFFIX}, 165 0 166 }; 167 Suftab stabs[] = 168 { 169 {"ssen",ily,4,"-y+iness","+ness",ADJ|ADV,NOUN| N_AFFIX}, 170 {"ssel",ily,4,"-y+iless","+less",NOUN | PROP_COLLECT,ADJ }, 171 {"se",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH , es,2,"-y+ies","+es"}, 172 {"s'",s,2,"","+'s",PROP_COLLECT | NOUN,DONT_TOUCH }, 173 {"s",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH }, 174 0 175 }; 176 Suftab stabt[] = 177 { 178 {"tnem",strip,4,"","+ment",V_AFFIX,NOUN | N_AFFIX | ADJ|VERB}, 179 {"tse",strip,2,"","+st",EST,DONT_TOUCH, i_to_y,3,"-y+iest","+est" }, 180 {"tsigol",i_to_y,2,"-y+ist","",N_AFFIX,NOUN | N_AFFIX}, 181 {"tsi",CCe,3,"-e+ist","+ist",N_AFFIX|ADJ,NOUN | N_AFFIX|COMP}, 182 0 183 }; 184 Suftab staby[] = 185 { 186 {"ycna",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX}, 187 {"ycne",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX}, 188 {"ytilib",bility,5,"-le+ility","",ADJ | V_AFFIX,NOUN | N_AFFIX}, 189 {"ytisuo",nop,0,"","",NOUN}, 190 
{"ytilb",nop,0,"","",0,NOUN}, 191 {"yti",CCe,3,"-e+ity","+ity",ADJ ,NOUN | N_AFFIX }, 192 {"ylb",y_to_e,1,"-e+y","",ADJ,ADV}, 193 {"ylc",nop,0,"","",0}, 194 {"ylelb",nop,0,"","",0}, 195 {"ylelp",nop,0,"","",0}, 196 {"yl",ily,2,"-y+ily","+ly",ADJ,ADV|COMP}, 197 {"yrtem",subst,0,"-er+ry","",NOUN,NOUN | N_AFFIX}, 198 {"y",CCe,1,"-e+y","+y",_Y,ADJ|COMP}, 199 0 200 }; 201 Suftab stabz[] = 202 { 203 0 204 }; 205 Suftab* suftab[] = 206 { 207 staba, 208 stabz, 209 stabc, 210 stabd, 211 stabe, 212 stabz, 213 stabg, 214 stabz, 215 stabz, 216 stabz, 217 stabz, 218 stabl, 219 stabm, 220 stabn, 221 stabz, 222 stabp, 223 stabz, 224 stabr, 225 stabs, 226 stabt, 227 stabz, 228 stabz, 229 stabz, 230 stabz, 231 staby, 232 stabz 233 }; 234 235 Ptab ptaba[] = 236 { 237 "anti", 0, 238 "auto", 0, 239 0 240 }; 241 Ptab ptabb[] = 242 { 243 "bio", 0, 244 0 245 }; 246 Ptab ptabc[] = 247 { 248 "counter", 0, 249 0 250 }; 251 Ptab ptabd[] = 252 { 253 "dis", 0, 254 0 255 }; 256 Ptab ptabe[] = 257 { 258 "electro", 0, 259 0 260 }; 261 Ptab ptabf[] = 262 { 263 "femto", 0, 264 0 265 }; 266 Ptab ptabg[] = 267 { 268 "geo", 0, 269 "giga", 0, 270 0 271 }; 272 Ptab ptabh[] = 273 { 274 "hyper", 0, 275 0 276 }; 277 Ptab ptabi[] = 278 { 279 "immuno", 0, 280 "im", IN, 281 "intra", 0, 282 "inter", 0, 283 "in", IN, 284 "ir", IN, 285 "iso", 0, 286 0 287 }; 288 Ptab ptabj[] = 289 { 290 0 291 }; 292 Ptab ptabk[] = 293 { 294 "kilo", 0, 295 0 296 }; 297 Ptab ptabl[] = 298 { 299 0 300 }; 301 Ptab ptabm[] = 302 { 303 "magneto", 0, 304 "mega", 0, 305 "meta", 0, 306 "micro", 0, 307 "mid", 0, 308 "milli", 0, 309 "mini", 0, 310 "mis", 0, 311 "mono", 0, 312 "multi", 0, 313 0 314 }; 315 Ptab ptabn[] = 316 { 317 "nano", 0, 318 "neuro", 0, 319 "non", 0, 320 0 321 }; 322 Ptab ptabo[] = 323 { 324 "out", 0, 325 "over", 0, 326 0 327 }; 328 Ptab ptabp[] = 329 { 330 "para", 0, 331 "photo", 0, 332 "pico", 0, 333 "poly", 0, 334 "pre", 0, 335 "pseudo", 0, 336 "psycho", 0, 337 0 338 }; 339 Ptab ptabq[] = 340 { 341 "quasi", 0, 342 0 
343 }; 344 Ptab ptabr[] = 345 { 346 "radio", 0, 347 "re", 0, 348 0 349 }; 350 Ptab ptabs[] = 351 { 352 "semi", 0, 353 "stereo", 0, 354 "sub", 0, 355 "super", 0, 356 0 357 }; 358 Ptab ptabt[] = 359 { 360 "tele", 0, 361 "tera", 0, 362 "thermo", 0, 363 0 364 }; 365 Ptab ptabu[] = 366 { 367 "ultra", 0, 368 "under", 0, /*must precede un*/ 369 "un", IN, 370 0 371 }; 372 Ptab ptabv[] = 373 { 374 0 375 }; 376 Ptab ptabw[] = 377 { 378 0 379 }; 380 Ptab ptabx[] = 381 { 382 0 383 }; 384 Ptab ptaby[] = 385 { 386 0 387 }; 388 Ptab ptabz[] = 389 { 390 0 391 }; 392 393 Ptab* preftab[] = 394 { 395 ptaba, 396 ptabb, 397 ptabc, 398 ptabd, 399 ptabe, 400 ptabf, 401 ptabg, 402 ptabh, 403 ptabi, 404 ptabj, 405 ptabk, 406 ptabl, 407 ptabm, 408 ptabn, 409 ptabo, 410 ptabp, 411 ptabq, 412 ptabr, 413 ptabs, 414 ptabt, 415 ptabu, 416 ptabv, 417 ptabw, 418 ptabx, 419 ptaby, 420 ptabz 421 }; 422 423 typedef struct { 424 char *mesg; 425 enum { NONE, SUFF, PREF} type; 426 } Deriv; 427 428 int aflag; 429 int cflag; 430 int fflag; 431 int vflag; 432 int xflag; 433 int nflag; 434 char word[500]; 435 char* original; 436 Deriv emptyderiv; 437 Deriv deriv[DSIZ+3]; 438 char affix[DSIZ*10]; /* 10 is longest affix message */ 439 int prefcount; 440 int suffcount; 441 char* acmeid; 442 char space[300000]; /* must be as large as "words"+"space" in pcode run */ 443 Bits encode[2048]; /* must be as long as "codes" in pcode run */ 444 int nencode; 445 char voweltab[256]; 446 char* spacep[128*128+1]; /* pointer to words starting with 'xx' */ 447 Biobuf bin; 448 Biobuf bout; 449 450 char* codefile = "#9/lib/amspell"; 451 char* brfile = "#9/lib/brspell"; 452 char* Usage = "usage"; 453 454 void 455 main(int argc, char *argv[]) 456 { 457 char *ep, *cp; 458 char *dp; 459 int j, i, c; 460 int low; 461 Bits h; 462 463 codefile = unsharp(codefile); 464 brfile = unsharp(brfile); 465 466 Binit(&bin, 0, OREAD); 467 Binit(&bout, 1, OWRITE); 468 for(i=0; c = "aeiouyAEIOUY"[i]; i++) 469 voweltab[c] = 1; 470 while(argc > 1) 
{ 471 if(argv[1][0] != '-') 472 break; 473 for(i=1; c = argv[1][i]; i++) 474 switch(c) { 475 default: 476 fprint(2, "usage: spell [-bcCvx] [-f file]\n"); 477 exits(Usage); 478 479 case 'a': 480 aflag++; 481 continue; 482 483 case 'b': 484 ise(); 485 if(!fflag) 486 codefile = brfile; 487 continue; 488 489 case 'C': /* for "correct" */ 490 vflag++; 491 case 'c': /* for ocr */ 492 cflag++; 493 continue; 494 495 case 'v': 496 vflag++; 497 continue; 498 499 case 'x': 500 xflag++; 501 continue; 502 503 case 'f': 504 if(argc <= 2) { 505 fprint(2, "spell: -f requires another argument\n"); 506 exits(Usage); 507 } 508 argv++; 509 argc--; 510 codefile = argv[1]; 511 fflag++; 512 goto brk; 513 } 514 brk: 515 argv++; 516 argc--; 517 } 518 readdict(codefile); 519 if(argc > 1) { 520 fprint(2, "usage: spell [-bcCvx] [-f file]\n"); 521 exits(Usage); 522 } 523 if(aflag) 524 cflag = vflag = 0; 525 526 for(;;) { 527 affix[0] = 0; 528 original = Brdline(&bin, '\n'); 529 if(original == 0) 530 exits(0); 531 original[Blinelen(&bin)-1] = 0; 532 low = 0; 533 534 if(aflag) { 535 acmeid = original; 536 while(*original != ':') 537 if(*original++ == 0) 538 exits(0); 539 while(*++original != ':') 540 if(*original == 0) 541 exits(0); 542 *original++ = 0; 543 } 544 for(ep=word,dp=original; j = *dp; ep++,dp++) { 545 if(ISLOWER(j)) 546 low++; 547 if(ep >= word+sizeof(word)-1) 548 break; 549 *ep = j; 550 } 551 *ep = 0; 552 553 if(ISDIGIT(word[0]) && ordinal()) 554 continue; 555 556 h = 0; 557 if(!low && !(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH))) 558 for(cp=original+1,dp=word+1; dp<ep; dp++,cp++) 559 *dp = Tolower(*cp); 560 if(!h) 561 for(;;) { /* at most twice */ 562 if(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH)) 563 break; 564 if(h = trysuff(ep,0,ALL|STOP|DONT_TOUCH)) 565 break; 566 if(!ISUPPER(word[0])) 567 break; 568 cp = original; 569 dp = word; 570 while(*dp = *cp++) { 571 if(!low) 572 *dp = Tolower(*dp); 573 dp++; 574 } 575 word[0] = Tolower(word[0]); 576 } 577 578 if(cflag) { 579 if(!h || 
Set(h,STOP)) 580 print("-"); 581 else if(!vflag) 582 print("+"); 583 else 584 print("%c",'0' + (suffcount>0) + 585 (prefcount>4? 8: 2*prefcount)); 586 } else if(!h || Set(h,STOP)) { 587 if(aflag) 588 Bprint(&bout, "%s:%s\n", acmeid, original); 589 else 590 Bprint(&bout, "%s\n", original); 591 } else if(affix[0] != 0 && affix[0] != '.') 592 print("%s\t%s\n", affix, original); 593 } 594 } 595 596 /* strip exactly one suffix and do 597 * indicated routine(s), which may recursively 598 * strip suffixes 599 */ 600 Bits 601 trysuff(char* ep, int lev, int flag) 602 { 603 Suftab *t; 604 char *cp, *sp; 605 Bits h = 0; 606 int initchar = ep[-1]; 607 608 flag &= ~MONO; 609 lev += DLEV; 610 if(lev < DSIZ) { 611 deriv[lev] = emptyderiv; 612 deriv[lev-1] = emptyderiv; 613 } 614 if(!ISLOWER(initchar)) 615 return h; 616 for(t=suftab[initchar-'a']; sp=t->suf; t++) { 617 cp = ep; 618 while(*sp) 619 if(*--cp != *sp++) 620 goto next; 621 for(sp=ep-t->n1; --sp >= word && !ISVOWEL(*sp);) 622 ; 623 if(sp < word) 624 continue; 625 if(!(t->affixable & flag)) 626 return 0; 627 h = (*t->p1)(ep-t->n1, t->d1, t->a1, lev+1, t->flag|STOP); 628 if(!h && t->p2!=0) { 629 if(lev < DSIZ) { 630 deriv[lev] = emptyderiv; 631 deriv[lev+1] = emptyderiv; 632 } 633 h = (*t->p2)(ep-t->n2, t->d2, t->a2, lev, t->flag|STOP); 634 } 635 break; 636 next:; 637 } 638 return h; 639 } 640 641 Bits 642 nop(char* ep, char* d, char* a, int lev, int flag) 643 { 644 USED(ep); 645 USED(d); 646 USED(a); 647 USED(lev); 648 USED(flag); 649 return 0; 650 } 651 652 Bits 653 cstrip(char* ep, char* d, char* a, int lev, int flag) 654 { 655 int temp = ep[0]; 656 657 if(ISVOWEL(temp) && ISVOWEL(ep[-1])) { 658 switch(pair(ep[-1],ep[0])) { 659 case pair('a', 'a'): 660 case pair('a', 'e'): 661 case pair('a', 'i'): 662 case pair('e', 'a'): 663 case pair('e', 'e'): 664 case pair('e', 'i'): 665 case pair('i', 'i'): 666 case pair('o', 'a'): 667 return 0; 668 } 669 } else 670 if(temp==ep[-1]&&temp==ep[-2]) 671 return 0; 672 return 
strip(ep,d,a,lev,flag); 673 } 674 675 Bits 676 strip(char* ep, char* d, char* a, int lev, int flag) 677 { 678 Bits h = trypref(ep, a, lev, flag); 679 680 USED(d); 681 if(Set(h,MONO) && ISVOWEL(*ep) && ISVOWEL(ep[-2])) 682 h = 0; 683 if(h) 684 return h; 685 if(ISVOWEL(*ep) && !ISVOWEL(ep[-1]) && ep[-1]==ep[-2]) { 686 h = trypref(ep-1,a,lev,flag|MONO); 687 if(h) 688 return h; 689 } 690 return trysuff(ep,lev,flag); 691 } 692 693 Bits 694 s(char* ep, char* d, char* a, int lev, int flag) 695 { 696 if(lev > DLEV+1) 697 return 0; 698 if(*ep=='s') { 699 switch(ep[-1]) { 700 case 'y': 701 if(ISVOWEL(ep[-2])||ISUPPER(*word)) 702 break; /*says Kennedys*/ 703 case 'x': 704 case 'z': 705 case 's': 706 return 0; 707 case 'h': 708 switch(ep[-2]) { 709 case 'c': 710 case 's': 711 return 0; 712 } 713 } 714 } 715 return strip(ep,d,a,lev,flag); 716 } 717 718 Bits 719 an(char* ep, char* d, char* a, int lev, int flag) 720 { 721 USED(d); 722 if(!ISUPPER(*word)) /*must be proper name*/ 723 return 0; 724 return trypref(ep,a,lev,flag); 725 } 726 727 Bits 728 ize(char* ep, char* d, char* a, int lev, int flag) 729 { 730 int temp = ep[-1]; 731 Bits h; 732 733 USED(a); 734 ep[-1] = 'e'; 735 h = strip(ep,"",d,lev,flag); 736 ep[-1] = temp; 737 return h; 738 } 739 740 Bits 741 y_to_e(char* ep, char* d, char* a, int lev, int flag) 742 { 743 Bits h; 744 int temp; 745 746 USED(a); 747 switch(ep[-1]) { 748 case 'a': 749 case 'e': 750 case 'i': 751 return 0; 752 } 753 temp = *ep; 754 *ep++ = 'e'; 755 h = strip(ep,"",d,lev,flag); 756 ep[-1] = temp; 757 return h; 758 } 759 760 Bits 761 ily(char* ep, char* d, char* a, int lev, int flag) 762 { 763 int temp = ep[0]; 764 char *cp = ep; 765 766 if(temp==ep[-1]&&temp==ep[-2]) /* sillly */ 767 return 0; 768 if(*--cp=='y' && !ISVOWEL(*--cp)) /* happyly */ 769 while(cp>word) 770 if(ISVOWEL(*--cp)) /* shyness */ 771 return 0; 772 if(ep[-1]=='i') 773 return i_to_y(ep,d,a,lev,flag); 774 return cstrip(ep,d,a,lev,flag); 775 } 776 777 Bits 778 bility(char* ep, char* 
d, char* a, int lev, int flag) 779 { 780 *ep++ = 'l'; 781 return y_to_e(ep,d,a,lev,flag); 782 } 783 784 Bits 785 i_to_y(char* ep, char* d, char* a, int lev, int flag) 786 { 787 Bits h; 788 int temp; 789 790 if(ISUPPER(*word)) 791 return 0; 792 if((temp=ep[-1])=='i' && !ISVOWEL(ep[-2])) { 793 ep[-1] = 'y'; 794 a = d; 795 } 796 h = cstrip(ep,"",a,lev,flag); 797 ep[-1] = temp; 798 return h; 799 } 800 801 Bits 802 es(char* ep, char* d, char* a, int lev, int flag) 803 { 804 if(lev>DLEV) 805 return 0; 806 switch(ep[-1]) { 807 default: 808 return 0; 809 case 'i': 810 return i_to_y(ep,d,a,lev,flag); 811 case 'h': 812 switch(ep[-2]) { 813 default: 814 return 0; 815 case 'c': 816 case 's': 817 break; 818 } 819 case 's': 820 case 'z': 821 case 'x': 822 return strip(ep,d,a,lev,flag); 823 } 824 } 825 826 Bits 827 subst(char* ep, char* d, char* a, int lev, int flag) 828 { 829 char *u,*t; 830 Bits h; 831 832 USED(a); 833 if(skipv(skipv(ep-1)) < word) 834 return 0; 835 for(t=d; *t!='+'; t++) 836 continue; 837 for(u=ep; *--t!='-';) 838 *--u = *t; 839 h = strip(ep,"",d,lev,flag); 840 while(*++t != '+') 841 continue; 842 while(*++t) 843 *u++ = *t; 844 return h; 845 } 846 847 Bits 848 tion(char* ep, char* d, char* a, int lev, int flag) 849 { 850 switch(ep[-2]) { 851 default: 852 return trypref(ep,a,lev,flag); 853 case 'a': 854 case 'e': 855 case 'i': 856 case 'o': 857 case 'u': 858 return y_to_e(ep,d,a,lev,flag); 859 } 860 } 861 862 /* 863 * possible consonant-consonant-e ending 864 */ 865 Bits 866 CCe(char* ep, char* d, char* a, int lev, int flag) 867 { 868 Bits h; 869 870 switch(ep[-1]) { 871 case 'l': 872 if(ISVOWEL(ep[-2])) 873 break; 874 switch(ep[-2]) { 875 case 'l': 876 case 'r': 877 case 'w': 878 break; 879 default: 880 return y_to_e(ep,d,a,lev,flag); 881 } 882 break; 883 case 'c': 884 case 'g': 885 if(*ep == 'a') /* prevent -able for -eable */ 886 return 0; 887 case 's': 888 case 'v': 889 case 'z': 890 if(ep[-2]==ep[-1]) 891 break; 892 if(ISVOWEL(ep[-2])) 893 break; 894 case 
'u': 895 if(h = y_to_e(ep,d,a,lev,flag)) 896 return h; 897 if(!(ep[-2]=='n' && ep[-1]=='g')) 898 return 0; 899 } 900 return VCe(ep,d,a,lev,flag); 901 } 902 903 /* 904 * possible consonant-vowel-consonant-e ending 905 */ 906 Bits 907 VCe(char* ep, char* d, char* a, int lev, int flag) 908 { 909 int c; 910 Bits h; 911 912 c = ep[-1]; 913 if(c=='e') 914 return 0; 915 if(!ISVOWEL(c) && ISVOWEL(ep[-2])) { 916 c = *ep; 917 *ep++ = 'e'; 918 h = trypref(ep,d,lev,flag); 919 if(!h) 920 h = trysuff(ep,lev,flag); 921 if(h) 922 return h; 923 ep--; 924 *ep = c; 925 } 926 return cstrip(ep,d,a,lev,flag); 927 } 928 929 Ptab* 930 lookuppref(uchar** wp, char* ep) 931 { 932 Ptab *sp; 933 uchar *bp,*cp; 934 unsigned int initchar = Tolower(**wp); 935 936 if(!ISALPHA(initchar)) 937 return 0; 938 for(sp=preftab[initchar-'a'];sp->s;sp++) { 939 bp = *wp; 940 for(cp= (uchar*)sp->s;*cp; ) 941 if(*bp++!=*cp++) 942 goto next; 943 for(cp=bp;cp<(uchar*)ep;cp++) 944 if(ISVOWEL(*cp)) { 945 *wp = bp; 946 return sp; 947 } 948 next:; 949 } 950 return 0; 951 } 952 953 /* while word is not in dictionary try stripping 954 * prefixes. Fail if no more prefixes. 955 */ 956 Bits 957 trypref(char* ep, char* a, int lev, int flag) 958 { 959 Ptab *tp; 960 char *bp, *cp; 961 char *pp; 962 Bits h; 963 char space[20]; 964 965 if(lev<DSIZ) { 966 deriv[lev].mesg = a; 967 deriv[lev].type = *a=='.'? 
NONE: SUFF; 968 } 969 if(h = tryword(word,ep,lev,flag)) { 970 if(Set(h, flag&~MONO) && (flag&MONO) <= Set(h, MONO)) 971 return h; 972 h = 0; 973 } 974 bp = word; 975 pp = space; 976 if(lev<DSIZ) { 977 deriv[lev+1].mesg = pp; 978 deriv[lev+1].type = 0; 979 } 980 while(tp=lookuppref((uchar**)(void*)&bp,ep)) { 981 *pp++ = '+'; 982 cp = tp->s; 983 while(pp<space+sizeof(space) && (*pp = *cp++)) 984 pp++; 985 deriv[lev+1].type += PREF; 986 h = tryword(bp,ep,lev+1,flag); 987 if(Set(h,NOPREF) || 988 ((tp->flag&IN) && inun(bp-2,h)==0)) { 989 h = 0; 990 break; 991 } 992 if(Set(h,flag&~MONO) && (flag&MONO) <= Set(h, MONO)) 993 break; 994 h = 0; 995 } 996 if(lev < DSIZ) { 997 deriv[lev+1] = emptyderiv; 998 deriv[lev+2] = emptyderiv; 999 } 1000 return h; 1001 } 1002 1003 Bits 1004 tryword(char* bp, char* ep, int lev, int flag) 1005 { 1006 int j; 1007 Bits h = 0; 1008 char duple[3]; 1009 1010 if(ep-bp <= 1) 1011 return h; 1012 if(flag&MONO) { 1013 if(lev<DSIZ) { 1014 deriv[++lev].mesg = duple; 1015 deriv[lev].type = SUFF; 1016 } 1017 duple[0] = '+'; 1018 duple[1] = *ep; 1019 duple[2] = 0; 1020 } 1021 h = dict(bp, ep); 1022 if(vflag==0 || h==0) 1023 return h; 1024 /* 1025 * when derivations are wanted, collect them 1026 * for printing 1027 */ 1028 j = lev; 1029 prefcount = suffcount = 0; 1030 do { 1031 if(j<DSIZ && deriv[j].type) { 1032 strcat(affix, deriv[j].mesg); 1033 if(deriv[j].type == SUFF) 1034 suffcount++; 1035 else if(deriv[j].type != NONE) 1036 prefcount = deriv[j].type/PREF; 1037 } 1038 } while(--j > 0); 1039 return h; 1040 } 1041 1042 int 1043 inun(char* bp, Bits h) 1044 { 1045 if(*bp == 'u') 1046 return Set(h, IN) == 0; 1047 /* *bp == 'i' */ 1048 if(Set(h, IN) == 0) 1049 return 0; 1050 switch(bp[2]) { 1051 case 'r': 1052 return bp[1] == 'r'; 1053 case 'm': 1054 case 'p': 1055 return bp[1] == 'm'; 1056 } 1057 return bp[1] == 'n'; 1058 } 1059 1060 char* 1061 skipv(char *s) 1062 { 1063 if(s >= word && ISVOWEL(*s)) 1064 s--; 1065 while(s >= word && !ISVOWEL(*s)) 1066 
s--; 1067 return s; 1068 } 1069 1070 /* 1071 * crummy way to Britishise 1072 */ 1073 void 1074 ise(void) 1075 { 1076 Suftab *p; 1077 int i; 1078 1079 for(i=0; i<26; i++) 1080 for(p = suftab[i]; p->suf; p++) { 1081 p->suf = ztos(p->suf); 1082 p->d1 = ztos(p->d1); 1083 p->a1 = ztos(p->a1); 1084 } 1085 } 1086 1087 char* 1088 ztos(char *as) 1089 { 1090 char *s, *ds; 1091 1092 for(s=as; *s; s++) 1093 if(*s == 'z') 1094 goto copy; 1095 return as; 1096 1097 copy: 1098 ds = strdup(as); 1099 for(s=ds; *s; s++) 1100 if(*s == 'z') 1101 *s = 's'; 1102 return ds; 1103 } 1104 1105 Bits 1106 dict(char* bp, char* ep) 1107 { 1108 char *cp, *cp1, *w, *wp, *we; 1109 int n, f; 1110 1111 w = bp; 1112 we = ep; 1113 n = ep-bp; 1114 if(n <= 1) 1115 return NOUN; 1116 1117 f = w[0] & 0x7f; 1118 f *= 128; 1119 f += w[1] & 0x7f; 1120 bp = spacep[f]; 1121 ep = spacep[f+1]; 1122 1123 loop: 1124 if(bp >= ep) { 1125 if(xflag) 1126 fprint(2, "=%.*s\n", utfnlen(w, n), w); 1127 return 0; 1128 } 1129 /* 1130 * find the beginning of some word in the middle 1131 */ 1132 cp = bp + (ep-bp)/2; 1133 1134 while(cp > bp && !(*cp & 0x80)) 1135 cp--; 1136 while(cp > bp && (cp[-1] & 0x80)) 1137 cp--; 1138 1139 wp = w + 2; /* skip two letters */ 1140 cp1 = cp + 2; /* skip affix code */ 1141 for(;;) { 1142 if(wp >= we) { 1143 if(*cp1 & 0x80) 1144 goto found; 1145 else 1146 f = 1; 1147 break; 1148 } 1149 if(*cp1 & 0x80) { 1150 f = -1; 1151 break; 1152 } 1153 f = *cp1++ - *wp++; 1154 if(f != 0) 1155 break; 1156 } 1157 1158 if(f < 0) { 1159 while(!(*cp1 & 0x80)) 1160 cp1++; 1161 bp = cp1; 1162 goto loop; 1163 } 1164 ep = cp; 1165 goto loop; 1166 1167 found: 1168 f = ((cp[0] & 0x7) << 8) | 1169 (cp[1] & 0xff); 1170 if(xflag) { 1171 fprint(2, "=%.*s ", utfnlen(w, n), w); 1172 typeprint(encode[f]); 1173 } 1174 return encode[f]; 1175 } 1176 1177 void 1178 typeprint(Bits h) 1179 { 1180 1181 pcomma(""); 1182 if(h & NOUN) 1183 pcomma("n"); 1184 if(h & PROP_COLLECT) 1185 pcomma("pc"); 1186 if(h & VERB) { 1187 if((h & VERB) 
== VERB) 1188 pcomma("v"); 1189 else 1190 if((h & VERB) == V_IRREG) 1191 pcomma("vi"); 1192 else 1193 if(h & ED) 1194 pcomma("ed"); 1195 } 1196 if(h & ADJ) 1197 pcomma("a"); 1198 if(h & COMP) { 1199 if((h & COMP) == ACTOR) 1200 pcomma("er"); 1201 else 1202 pcomma("comp"); 1203 } 1204 if(h & DONT_TOUCH) 1205 pcomma("d"); 1206 if(h & N_AFFIX) 1207 pcomma("na"); 1208 if(h & ADV) 1209 pcomma("adv"); 1210 if(h & ION) 1211 pcomma("ion"); 1212 if(h & V_AFFIX) 1213 pcomma("va"); 1214 if(h & MAN) 1215 pcomma("man"); 1216 if(h & NOPREF) 1217 pcomma("nopref"); 1218 if(h & MONO) 1219 pcomma("ms"); 1220 if(h & IN) 1221 pcomma("in"); 1222 if(h & _Y) 1223 pcomma("y"); 1224 if(h & STOP) 1225 pcomma("s"); 1226 fprint(2, "\n"); 1227 } 1228 1229 void 1230 pcomma(char *s) 1231 { 1232 static int flag; 1233 1234 if(*s == 0) { 1235 flag = 0; 1236 return; 1237 } 1238 if(!flag) { 1239 fprint(2, "%s", s); 1240 flag = 1; 1241 } else 1242 fprint(2, ",%s", s); 1243 } 1244 1245 /* 1246 * is the word on of the following 1247 * 12th teen 1248 * 21st end in 1 1249 * 23rd end in 3 1250 * 77th default 1251 * called knowing word[0] is a digit 1252 */ 1253 int 1254 ordinal(void) 1255 { 1256 char *cp = word; 1257 static char sp[4]; 1258 1259 while(ISDIGIT(*cp)) 1260 cp++; 1261 strncpy(sp,cp,3); 1262 if(ISUPPER(cp[0]) && ISUPPER(cp[1])) { 1263 sp[0] = Tolower(cp[0]); 1264 sp[1] = Tolower(cp[1]); 1265 } 1266 return 0 == strncmp(sp, 1267 cp[-2]=='1'? "th": /* out of bounds if 1 digit */ 1268 *--cp=='1'? "st": /* harmless */ 1269 *cp=='2'? "nd": 1270 *cp=='3'? "rd": 1271 "th", 3); 1272 } 1273 1274 /* 1275 * read in the dictionary. 1276 * format is 1277 * { 1278 * short nencode; 1279 * long encode[nencode]; 1280 * char space[*]; 1281 * }; 1282 * 1283 * the encodings are a table all different 1284 * affixes. 1285 * the dictionary proper has 2 bytes 1286 * that demark and then the rest of the 1287 * word. 
the 2 bytes have the following 1288 * 0x80 0x00 flag 1289 * 0x78 0x00 count of prefix bytes 1290 * common with prev word 1291 * 0x07 0xff affix code 1292 * 1293 * all ints are big endians in the file. 1294 */ 1295 void 1296 readdict(char *file) 1297 { 1298 char *s, *is, *lasts, *ls; 1299 int c, i, sp, p; 1300 int f; 1301 long l; 1302 1303 lasts = 0; 1304 f = open(file, 0); 1305 if(f == -1) { 1306 fprint(2, "cannot open %s\n", file); 1307 exits("open"); 1308 } 1309 if(read(f, space, 2) != 2) 1310 goto bad; 1311 nencode = ((space[0]&0xff)<<8) | (space[1]&0xff); 1312 if(read(f, space, 4*nencode) != 4*nencode) 1313 goto bad; 1314 s = space; 1315 for(i=0; i<nencode; i++) { 1316 l = (long)(s[0] & 0xff) << 24; 1317 l |= (s[1] & 0xff) << 16; 1318 l |= (s[2] & 0xff) << 8; 1319 l |= s[3] & 0xff; 1320 encode[i] = (Bits)l; 1321 s += 4; 1322 } 1323 l = read(f, space, sizeof(space)); 1324 if(l == sizeof(space)) 1325 goto noroom; 1326 is = space + (sizeof(space) - l); 1327 memmove(is, space, l); 1328 1329 s = space; 1330 c = *is++ & 0xff; 1331 sp = -1; 1332 i = 0; 1333 1334 loop: 1335 if(s > is) 1336 goto noroom; 1337 if(c < 0) { 1338 close(f); 1339 while(sp < 128*128) 1340 spacep[++sp] = s; 1341 *s = (char)0x80; /* fence */ 1342 return; 1343 } 1344 p = (c>>3) & 0xf; 1345 *s++ = c; 1346 *s++ = *is++ & 0xff; 1347 if(p <= 0) 1348 i = (*is++ & 0xff)*128; 1349 if(p <= 1) { 1350 if(!(*is & 0x80)) 1351 i = i/128*128 + (*is++ & 0xff); 1352 if(i <= sp) { 1353 fprint(2, "the dict isnt sorted or \n"); 1354 fprint(2, "memmove didn't work\n"); 1355 goto bad; 1356 } 1357 while(sp < i) 1358 spacep[++sp] = s-2; 1359 } 1360 ls = lasts; 1361 lasts = s; 1362 for(p-=2; p>0; p--) 1363 *s++ = *ls++; 1364 for(;;) { 1365 if(is >= space+sizeof(space)) { 1366 c = -1; 1367 break; 1368 } 1369 c = *is++ & 0xff; 1370 if(c & 0x80) 1371 break; 1372 *s++ = c; 1373 } 1374 *s = 0; 1375 goto loop; 1376 1377 bad: 1378 fprint(2, "trouble reading %s\n", file); 1379 exits("read"); 1380 noroom: 1381 fprint(2, "not 
enough space for dictionary\n"); 1382 exits("space"); 1383 }