utils.c (14958B)
1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include "dict.h" 5 6 Dict dicts[] = { 7 {"oed", "Oxford English Dictionary, 2nd Ed.", 8 "oed2", "oed2index", 9 oednextoff, oedprintentry, oedprintkey}, 10 {"ahd", "American Heritage Dictionary, 2nd College Ed.", 11 "ahd/DICT.DB", "ahd/index", 12 ahdnextoff, ahdprintentry, ahdprintkey}, 13 {"pgw", "Project Gutenberg Webster Dictionary", 14 "pgw", "pgwindex", 15 pgwnextoff, pgwprintentry, pgwprintkey}, 16 {"thesaurus", "Collins Thesaurus", 17 "thesaurus", "thesindex", 18 thesnextoff, thesprintentry, thesprintkey}, 19 {"roget", "Project Gutenberg Roget's Thesaurus", 20 "roget", "rogetindex", 21 rogetnextoff, rogetprintentry, rogetprintkey}, 22 23 {"ce", "Gendai Chinese->English", 24 "world/sansdata/sandic24.dat", 25 "world/sansdata/ceindex", 26 worldnextoff, worldprintentry, worldprintkey}, 27 {"ceh", "Gendai Chinese->English (Hanzi index)", 28 "world/sansdata/sandic24.dat", 29 "world/sansdata/cehindex", 30 worldnextoff, worldprintentry, worldprintkey}, 31 {"ec", "Gendai English->Chinese", 32 "world/sansdata/sandic24.dat", 33 "world/sansdata/ecindex", 34 worldnextoff, worldprintentry, worldprintkey}, 35 36 {"dae", "Gyldendal Danish->English", 37 "world/gylddata/sandic30.dat", 38 "world/gylddata/daeindex", 39 worldnextoff, worldprintentry, worldprintkey}, 40 {"eda", "Gyldendal English->Danish", 41 "world/gylddata/sandic29.dat", 42 "world/gylddata/edaindex", 43 worldnextoff, worldprintentry, worldprintkey}, 44 45 {"due", "Wolters-Noordhoff Dutch->English", 46 "world/woltdata/sandic07.dat", 47 "world/woltdata/deindex", 48 worldnextoff, worldprintentry, worldprintkey}, 49 {"edu", "Wolters-Noordhoff English->Dutch", 50 "world/woltdata/sandic06.dat", 51 "world/woltdata/edindex", 52 worldnextoff, worldprintentry, worldprintkey}, 53 54 {"fie", "WSOY Finnish->English", 55 "world/werndata/sandic32.dat", 56 "world/werndata/fieindex", 57 worldnextoff, worldprintentry, worldprintkey}, 58 {"efi", "WSOY English->Finnish", 59 "world/werndata/sandic31.dat", 60 "world/werndata/efiindex", 61 worldnextoff, worldprintentry, worldprintkey}, 62 63 {"fe", "Collins French->English", 64 "fe", "feindex", 65 pcollnextoff, pcollprintentry, pcollprintkey}, 66 {"ef", "Collins English->French", 67 "ef", "efindex", 68 pcollnextoff, pcollprintentry, pcollprintkey}, 69 70 {"ge", "Collins German->English", 71 "ge", "geindex", 72 pcollgnextoff, pcollgprintentry, pcollgprintkey}, 73 {"eg", "Collins English->German", 74 "eg", "egindex", 75 pcollgnextoff, pcollgprintentry, pcollgprintkey}, 76 77 {"ie", "Collins Italian->English", 78 "ie", "ieindex", 79 pcollnextoff, pcollprintentry, pcollprintkey}, 80 {"ei", "Collins English->Italian", 81 "ei", "eiindex", 82 pcollnextoff, pcollprintentry, pcollprintkey}, 83 84 {"je", "Sanshusha Japanese->English", 85 "world/sansdata/sandic18.dat", 86 "world/sansdata/jeindex", 87 worldnextoff, worldprintentry, worldprintkey}, 88 {"jek", "Sanshusha Japanese->English (Kanji index)", 89 "world/sansdata/sandic18.dat", 90 "world/sansdata/jekindex", 91 worldnextoff, worldprintentry, worldprintkey}, 92 {"ej", "Sanshusha English->Japanese", 93 "world/sansdata/sandic18.dat", 94 "world/sansdata/ejindex", 95 worldnextoff, worldprintentry, worldprintkey}, 96 97 {"tjeg", "Sanshusha technical Japanese->English,German", 98 "world/sansdata/sandic16.dat", 99 "world/sansdata/tjegindex", 100 worldnextoff, worldprintentry, worldprintkey}, 101 {"tjegk", "Sanshusha technical Japanese->English,German (Kanji index)", 102 "world/sansdata/sandic16.dat", 103 "world/sansdata/tjegkindex", 104 worldnextoff, worldprintentry, worldprintkey}, 105 {"tegj", "Sanshusha technical English->German,Japanese", 106 "world/sansdata/sandic16.dat", 107 "world/sansdata/tegjindex", 108 worldnextoff, worldprintentry, worldprintkey}, 109 {"tgje", "Sanshusha technical German->Japanese,English", 110 "world/sansdata/sandic16.dat", 111 "world/sansdata/tgjeindex", 112 worldnextoff, worldprintentry, worldprintkey}, 113 114 {"ne", "Kunnskapforlaget Norwegian->English", 115 "world/kunndata/sandic28.dat", 116 "world/kunndata/neindex", 117 worldnextoff, worldprintentry, worldprintkey}, 118 {"en", "Kunnskapforlaget English->Norwegian", 119 "world/kunndata/sandic27.dat", 120 "world/kunndata/enindex", 121 worldnextoff, worldprintentry, worldprintkey}, 122 123 {"re", "Leon Ungier Russian->English", 124 "re", "reindex", 125 simplenextoff, simpleprintentry, simpleprintkey}, 126 {"er", "Leon Ungier English->Russian", 127 "re", "erindex", 128 simplenextoff, simpleprintentry, simpleprintkey}, 129 130 {"se", "Collins Spanish->English", 131 "se", "seindex", 132 pcollnextoff, pcollprintentry, pcollprintkey}, 133 {"es", "Collins English->Spanish", 134 "es", "esindex", 135 pcollnextoff, pcollprintentry, pcollprintkey}, 136 137 {"swe", "Esselte Studium Swedish->English", 138 "world/essedata/sandic34.dat", 139 "world/essedata/sweindex", 140 worldnextoff, worldprintentry, worldprintkey}, 141 {"esw", "Esselte Studium English->Swedish", 142 "world/essedata/sandic33.dat", 143 "world/essedata/eswindex", 144 worldnextoff, worldprintentry, worldprintkey}, 145 146 {"movie", "Movies -- by title", 147 "movie/data", "movtindex", 148 movienextoff, movieprintentry, movieprintkey}, 149 {"moviea", "Movies -- by actor", 150 "movie/data", "movaindex", 151 movienextoff, movieprintentry, movieprintkey}, 152 {"movied", "Movies -- by director", 153 "movie/data", "movdindex", 154 movienextoff, movieprintentry, movieprintkey}, 155 156 {"slang", "English Slang", 157 "slang", "slangindex", 158 slangnextoff, slangprintentry, slangprintkey}, 159 160 {"robert", "Robert Électronique", 161 "robert/_pointers", "robert/_index", 162 robertnextoff, robertindexentry, robertprintkey}, 163 {"robertv", "Robert Électronique - formes des verbes", 164 "robert/flex.rob", "robert/_flexindex", 165 robertnextflex, robertflexentry, robertprintkey}, 166 167 {0, 0, 0, 0, 0} 168 }; 169 170 typedef struct Lig Lig; 171 struct Lig { 172 Rune start; /* accent rune */ 173 Rune pairs[100]; /* <char,accented version> pairs */ 174 }; 175 176 /* keep in sync with dict.h */ 177 static Lig ligtab[Nligs] = { 178 {0xb4, {0x41, 0xc1, 0x61, 0xe1, 0x43, 0x106, 0x63, 0x107, 0x45, 0xc9, 0x65, 0xe9, 0x67, 0x123, 0x49, 0xcd, 0x69, 0xed, 0x131, 0xed, 0x4c, 0x139, 0x6c, 0x13a, 0x4e, 0x143, 0x6e, 0x144, 0x4f, 0xd3, 0x6f, 0xf3, 0x52, 0x154, 0x72, 0x155, 0x53, 0x15a, 0x73, 0x15b, 0x55, 0xda, 0x75, 0xfa, 0x59, 0xdd, 0x79, 0xfd, 0x5a, 0x179, 0x7a, 0x17a, 0}}, 179 {0x2cb, {0x41, 0xc0, 0x61, 0xe0, 0x45, 0xc8, 0x65, 0xe8, 0x49, 0xcc, 0x69, 0xec, 0x131, 0xec, 0x4f, 0xd2, 0x6f, 0xf2, 0x55, 0xd9, 0x75, 0xf9, 0}}, 180 {0xa8, {0x41, 0xc4, 0x61, 0xe4, 0x45, 0xcb, 0x65, 0xeb, 0x49, 0xcf, 0x69, 0xef, 0x4f, 0xd6, 0x6f, 0xf6, 0x55, 0xdc, 0x75, 0xfc, 0x59, 0x178, 0x79, 0xff, 0}}, 181 {0xb8, {0x43, 0xc7, 0x63, 0xe7, 0x47, 0x122, 0x4b, 0x136, 0x6b, 0x137, 0x4c, 0x13b, 0x6c, 0x13c, 0x4e, 0x145, 0x6e, 0x146, 0x52, 0x156, 0x72, 0x157, 0x53, 0x15e, 0x73, 0x15f, 0x54, 0x162, 0x74, 0x163, 0}}, 182 {0x2dc, {0x41, 0xc3, 0x61, 0xe3, 0x49, 0x128, 0x69, 0x129, 0x131, 0x129, 0x4e, 0xd1, 0x6e, 0xf1, 0x4f, 0xd5, 0x6f, 0xf5, 0x55, 0x168, 0x75, 0x169, 0}}, 183 {0x2d8, {0x41, 0x102, 0x61, 0x103, 0x45, 0x114, 0x65, 0x115, 0x47, 0x11e, 0x67, 0x11f, 0x49, 0x12c, 0x69, 0x12d, 0x131, 0x12d, 0x4f, 0x14e, 0x6f, 0x14f, 0x55, 0x16c, 0x75, 0x16d, 0}}, 184 {0x2da, {0x41, 0xc5, 0x61, 0xe5, 0x55, 0x16e, 0x75, 0x16f, 0}}, 185 {0x2d9, {0x43, 0x10a, 0x63, 0x10b, 0x45, 0x116, 0x65, 0x117, 0x47, 0x120, 0x67, 0x121, 0x49, 0x130, 0x4c, 0x13f, 0x6c, 0x140, 0x5a, 0x17b, 0x7a, 0x17c, 0}}, 186 {0x2e, {0}}, 187 {0x2322, {0x41, 0xc2, 0x61, 0xe2, 0x43, 0x108, 0x63, 0x109, 0x45, 0xca, 0x65, 0xea, 0x47, 0x11c, 0x67, 0x11d, 0x48, 0x124, 0x68, 0x125, 0x49, 0xce, 0x69, 0xee, 0x131, 0xee, 0x4a, 0x134, 0x6a, 0x135, 0x4f, 0xd4, 0x6f, 0xf4, 0x53, 0x15c, 0x73, 0x15d, 0x55, 0xdb, 0x75, 0xfb, 0x57, 0x174, 0x77, 0x175, 0x59, 0x176, 0x79, 0x177, 0}}, 188 {0x32f, {0}}, 189 {0x2db, {0x41, 0x104, 0x61, 0x105, 0x45, 0x118, 0x65, 0x119, 0x49, 0x12e, 0x69, 0x12f, 0x131, 0x12f, 0x55, 0x172, 0x75, 0x173, 0}}, 190 {0xaf, {0x41, 0x100, 0x61, 0x101, 0x45, 0x112, 0x65, 0x113, 0x49, 0x12a, 0x69, 0x12b, 0x131, 0x12b, 0x4f, 0x14c, 0x6f, 0x14d, 0x55, 0x16a, 0x75, 0x16b, 0}}, 191 {0x2c7, {0x43, 0x10c, 0x63, 0x10d, 0x44, 0x10e, 0x64, 0x10f, 0x45, 0x11a, 0x65, 0x11b, 0x4c, 0x13d, 0x6c, 0x13e, 0x4e, 0x147, 0x6e, 0x148, 0x52, 0x158, 0x72, 0x159, 0x53, 0x160, 0x73, 0x161, 0x54, 0x164, 0x74, 0x165, 0x5a, 0x17d, 0x7a, 0x17e, 0}}, 192 {0x2bd, {0}}, 193 {0x2bc, {0}}, 194 {0x32e, {0}} 195 }; 196 197 Rune multitab[Nmulti][5] = { 198 {0x2bd, 0x3b1, 0}, 199 {0x2bc, 0x3b1, 0}, 200 {0x61, 0x6e, 0x64, 0}, 201 {0x61, 0x2f, 0x71, 0}, 202 {0x3c, 0x7c, 0}, 203 {0x2e, 0x2e, 0}, 204 {0x2e, 0x2e, 0x2e, 0}, 205 {0x2bd, 0x3b5, 0}, 206 {0x2bc, 0x3b5, 0}, 207 {0x2014, 0x2014, 0}, 208 {0x2bd, 0x3b7, 0}, 209 {0x2bc, 0x3b7, 0}, 210 {0x2bd, 0x3b9, 0}, 211 {0x2bc, 0x3b9, 0}, 212 {0x63, 0x74, 0}, 213 {0x66, 0x66, 0}, 214 {0x66, 0x66, 0x69, 0}, 215 {0x66, 0x66, 0x6c, 0}, 216 {0x66, 0x6c, 0}, 217 {0x66, 0x69, 0}, 218 {0x26b, 0x26b, 0}, 219 {0x73, 0x74, 0}, 220 {0x2bd, 0x3bf, 0}, 221 {0x2bc, 0x3bf, 0}, 222 {0x6f, 0x72, 0}, 223 {0x2bd, 0x3c1, 0}, 224 {0x2bc, 0x3c1, 0}, 225 {0x7e, 0x7e, 0}, 226 {0x2bd, 0x3c5, 0}, 227 {0x2bc, 0x3c5, 0}, 228 {0x2bd, 0x3c9, 0}, 229 {0x2bc, 0x3c9, 0}, 230 {0x6f, 0x65, 0}, 231 {0x20, 0x20, 0} 232 }; 233 234 #define risupper(r) (0x41 <= (r) && (r) <= 0x5a) 235 #define rislatin1(r) (0xC0 <= (r) && (r) <= 0xFF) 236 #define rtolower(r) ((r)-'A'+'a') 237 238 static Rune latin_fold_tab[] = 239 { 240 /* Table to fold latin 1 characters to ASCII equivalents 241 based at Rune value 0xc0 242 243 À Á Â Ã Ä Å Æ Ç 244 È É Ê Ë Ì Í Î Ï 245 Ð Ñ Ò Ó Ô Õ Ö × 246 Ø Ù Ú Û Ü Ý Þ ß 247 à á â ã ä å æ ç 248 è é ê ë ì í î ï 249 ð ñ ò ó ô õ ö ÷ 250 ø ù ú û ü ý þ ÿ 251 */ 252 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 253 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i', 254 'd', 'n', 'o', 'o', 'o', 'o', 'o', 0 , 255 'o', 'u', 'u', 'u', 'u', 'y', 0 , 0 , 256 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'c', 257 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i', 258 'd', 'n', 'o', 'o', 'o', 'o', 'o', 0 , 259 'o', 'u', 'u', 'u', 'u', 'y', 0 , 'y' 260 }; 261 262 static Rune *ttabstack[20]; 263 static int ntt; 264 265 /* 266 * tab is an array of n Assoc's, sorted by key. 267 * Look for key in tab, and return corresponding val 268 * or -1 if not there 269 */ 270 long 271 lookassoc(Assoc *tab, int n, char *key) 272 { 273 Assoc *q; 274 long i, low, high; 275 int r; 276 277 for(low = -1, high = n; high > low+1; ){ 278 i = (high+low)/2; 279 q = &tab[i]; 280 if((r=strcmp(key, q->key))<0) 281 high = i; 282 else if(r == 0) 283 return q->val; 284 else 285 low=i; 286 } 287 return -1; 288 } 289 290 long 291 looknassoc(Nassoc *tab, int n, long key) 292 { 293 Nassoc *q; 294 long i, low, high; 295 296 for(low = -1, high = n; high > low+1; ){ 297 i = (high+low)/2; 298 q = &tab[i]; 299 if(key < q->key) 300 high = i; 301 else if(key == q->key) 302 return q->val; 303 else 304 low=i; 305 } 306 return -1; 307 } 308 309 void 310 err(char *fmt, ...) 311 { 312 char buf[1000]; 313 va_list v; 314 315 va_start(v, fmt); 316 vsnprint(buf, sizeof(buf), fmt, v); 317 va_end(v); 318 fprint(2, "%s: %s\n", argv0, buf); 319 } 320 321 /* 322 * Write the rune r to bout, keeping track of line length 323 * and breaking the lines (at blanks) when they get too long 324 */ 325 void 326 outrune(long r) 327 { 328 if(outinhibit) 329 return; 330 if(++linelen > breaklen && r == 0x20) { 331 Bputc(bout, '\n'); 332 linelen = 0; 333 } else 334 Bputrune(bout, r); 335 } 336 337 void 338 outrunes(Rune *rp) 339 { 340 Rune r; 341 342 while((r = *rp++) != 0) 343 outrune(r); 344 } 345 346 /* like outrune, but when arg is know to be a char */ 347 void 348 outchar(int c) 349 { 350 if(outinhibit) 351 return; 352 if(++linelen > breaklen && c == ' ') { 353 c ='\n'; 354 linelen = 0; 355 } 356 Bputc(bout, c); 357 } 358 359 void 360 outchars(char *s) 361 { 362 char c; 363 364 while((c = *s++) != 0) 365 outchar(c); 366 } 367 368 void 369 outprint(char *fmt, ...) 370 { 371 char buf[1000]; 372 va_list v; 373 374 va_start(v, fmt); 375 vsnprint(buf, sizeof(buf), fmt, v); 376 va_end(v); 377 outchars(buf); 378 } 379 380 void 381 outpiece(char *b, char *e) 382 { 383 int c, lastc; 384 385 lastc = 0; 386 while(b < e) { 387 c = *b++; 388 if(c == '\n') 389 c = ' '; 390 if(!(c == ' ' && lastc == ' ')) 391 outchar(c); 392 lastc = c; 393 } 394 } 395 396 /* 397 * Go to new line if not already there; indent if ind != 0. 398 * If ind > 1, leave a blank line too. 399 * Slight hack: assume if current line is only one or two 400 * characters long, then they were spaces. 401 */ 402 void 403 outnl(int ind) 404 { 405 if(outinhibit) 406 return; 407 if(ind) { 408 if(ind > 1) { 409 if(linelen > 2) 410 Bputc(bout, '\n'); 411 Bprint(bout, "\n "); 412 } else if(linelen == 0) 413 Bprint(bout, " "); 414 else if(linelen == 1) 415 Bputc(bout, ' '); 416 else if(linelen != 2) 417 Bprint(bout, "\n "); 418 linelen = 2; 419 } else { 420 if(linelen) { 421 Bputc(bout, '\n'); 422 linelen = 0; 423 } 424 } 425 } 426 427 /* 428 * Fold the runes in null-terminated rp. 429 * Use the sort(1) definition of folding (uppercase to lowercase, 430 * latin1-accented characters to corresponding unaccented chars) 431 */ 432 void 433 fold(Rune *rp) 434 { 435 Rune r; 436 437 while((r = *rp) != 0) { 438 if (rislatin1(r) && latin_fold_tab[r-0xc0]) 439 r = latin_fold_tab[r-0xc0]; 440 if(risupper(r)) 441 r = rtolower(r); 442 *rp++ = r; 443 } 444 } 445 446 /* 447 * Like fold, but put folded result into new 448 * (assumed to have enough space). 449 * old is a regular expression, but we know that 450 * metacharacters aren't affected 451 */ 452 void 453 foldre(char *new, char *old) 454 { 455 Rune r; 456 457 while(*old) { 458 old += chartorune(&r, old); 459 if (rislatin1(r) && latin_fold_tab[r-0xc0]) 460 r = latin_fold_tab[r-0xc0]; 461 if(risupper(r)) 462 r = rtolower(r); 463 new += runetochar(new, &r); 464 } 465 *new = 0; 466 } 467 468 /* 469 * acomp(s, t) returns: 470 * -2 if s strictly precedes t 471 * -1 if s is a prefix of t 472 * 0 if s is the same as t 473 * 1 if t is a prefix of s 474 * 2 if t strictly precedes s 475 */ 476 477 int 478 acomp(Rune *s, Rune *t) 479 { 480 int cs, ct; 481 482 for(;;) { 483 cs = *s; 484 ct = *t; 485 if(cs != ct) 486 break; 487 if(cs == 0) 488 return 0; 489 s++; 490 t++; 491 } 492 if(cs == 0) 493 return -1; 494 if(ct == 0) 495 return 1; 496 if(cs < ct) 497 return -2; 498 return 2; 499 } 500 501 /* 502 * Copy null terminated Runes from 'from' to 'to'. 503 */ 504 void 505 runescpy(Rune *to, Rune *from) 506 { 507 while((*to++ = *from++) != 0) 508 continue; 509 } 510 511 /* 512 * Conversion of unsigned number to long, no overflow detection 513 */ 514 long 515 runetol(Rune *r) 516 { 517 int c; 518 long n; 519 520 n = 0; 521 for(;; r++){ 522 c = *r; 523 if(0x30<=c && c<=0x39) 524 c -= '0'; 525 else 526 break; 527 n = n*10 + c; 528 } 529 return n; 530 } 531 532 /* 533 * See if there is a rune corresponding to the accented 534 * version of r with accent acc (acc in [LIGS..LIGE-1]), 535 * and return it if so, else return NONE. 536 */ 537 Rune 538 liglookup(Rune acc, Rune r) 539 { 540 Rune *p; 541 542 if(acc < LIGS || acc >= LIGE) 543 return NONE; 544 for(p = ligtab[acc-LIGS].pairs; *p; p += 2) 545 if(*p == r) 546 return *(p+1); 547 return NONE; 548 } 549 550 /* 551 * Maintain a translation table stack (a translation table 552 * is an array of Runes indexed by bytes or 7-bit bytes). 553 * If starting is true, push the curtab onto the stack 554 * and return newtab; else pop the top of the stack and 555 * return it. 556 * If curtab is 0, initialize the stack and return. 557 */ 558 Rune * 559 changett(Rune *curtab, Rune *newtab, int starting) 560 { 561 if(curtab == 0) { 562 ntt = 0; 563 return 0; 564 } 565 if(starting) { 566 if(ntt >= asize(ttabstack)) { 567 if(debug) 568 err("translation stack overflow"); 569 return curtab; 570 } 571 ttabstack[ntt++] = curtab; 572 return newtab; 573 } else { 574 if(ntt == 0) { 575 if(debug) 576 err("translation stack underflow"); 577 return curtab; 578 } 579 return ttabstack[--ntt]; 580 } 581 }