deroff.c (14583B)
1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 5 /* 6 * Deroff command -- strip troff, eqn, and tbl sequences from 7 * a file. Has three flags argument, -w, to cause output one word per line 8 * rather than in the original format. 9 * -mm (or -ms) causes the corresponding macro's to be interpreted 10 * so that just sentences are output 11 * -ml also gets rid of lists. 12 * -i causes deroff to ignore .so and .nx commands. 13 * Deroff follows .so and .nx commands, removes contents of macro 14 * definitions, equations (both .EQ ... .EN and $...$), 15 * Tbl command sequences, and Troff backslash vconstructions. 16 * 17 * All input is through the C macro; the most recently read character is in c. 18 */ 19 20 /* 21 #define C ((c = Bgetrune(infile)) < 0?\ 22 eof():\ 23 ((c == ldelim) && (filesp == files)?\ 24 skeqn():\ 25 (c == '\n'?\ 26 (linect++,c):\ 27 c))) 28 29 #define C1 ((c = Bgetrune(infile)) == Beof?\ 30 eof():\ 31 (c == '\n'?\ 32 (linect++,c):\ 33 c)) 34 */ 35 36 /* lose those macros! */ 37 #define C fC() 38 #define C1 fC1() 39 40 #define SKIP while(C != '\n') 41 #define SKIP1 while(C1 != '\n') 42 #define SKIP_TO_COM SKIP;\ 43 SKIP;\ 44 pc=c;\ 45 while(C != '.' || pc != '\n' || C > 'Z')\ 46 pc=c 47 48 #define YES 1 49 #define NO 0 50 #define MS 0 51 #define MM 1 52 #define ONE 1 53 #define TWO 2 54 55 #define NOCHAR -2 56 #define EXTENDED -1 /* All runes above 0x7F */ 57 #define SPECIAL 0 58 #define APOS 1 59 #define PUNCT 2 60 #define DIGIT 3 61 #define LETTER 4 62 63 64 int linect = 0; 65 int wordflag= NO; 66 int underscoreflag = NO; 67 int msflag = NO; 68 int iflag = NO; 69 int mac = MM; 70 int disp = 0; 71 int inmacro = NO; 72 int intable = NO; 73 int eqnflag = 0; 74 75 #define MAX_ASCII 0X80 76 77 char chars[MAX_ASCII]; /* SPECIAL, PUNCT, APOS, DIGIT, or LETTER */ 78 79 Rune line[30000]; 80 Rune* lp; 81 82 long c; 83 long pc; 84 int ldelim = NOCHAR; 85 int rdelim = NOCHAR; 86 87 88 char** argv; 89 90 char fname[50]; 91 Biobuf* files[15]; 92 Biobuf**filesp; 93 Biobuf* infile; 94 char* devnull = "/dev/null"; 95 Biobuf *infile; 96 Biobuf bout; 97 98 long skeqn(void); 99 Biobuf* opn(char *p); 100 int eof(void); 101 int charclass(int); 102 void getfname(void); 103 void fatal(char *s, char *p); 104 void usage(void); 105 void work(void); 106 void putmac(Rune *rp, int vconst); 107 void regline(int macline, int vconst); 108 void putwords(void); 109 void comline(void); 110 void macro(void); 111 void eqn(void); 112 void tbl(void); 113 void stbl(void); 114 void sdis(char a1, char a2); 115 void sce(void); 116 void backsl(void); 117 char* copys(char *s); 118 void refer(int c1); 119 void inpic(void); 120 121 int 122 fC(void) 123 { 124 c = Bgetrune(infile); 125 if(c < 0) 126 return eof(); 127 if(c == ldelim && filesp == files) 128 return skeqn(); 129 if(c == '\n') 130 linect++; 131 return c; 132 } 133 134 int 135 fC1(void) 136 { 137 c = Bgetrune(infile); 138 if(c == Beof) 139 return eof(); 140 if(c == '\n') 141 linect++; 142 return c; 143 } 144 145 void 146 main(int argc, char *av[]) 147 { 148 int i; 149 char *f; 150 151 argv = av; 152 Binit(&bout, 1, OWRITE); 153 ARGBEGIN{ 154 case 'w': 155 wordflag = YES; 156 break; 157 case '_': 158 wordflag = YES; 159 underscoreflag = YES; 160 break; 161 case 'm': 162 msflag = YES; 163 if(f = ARGF()) 164 switch(*f) 165 { 166 case 'm': mac = MM; break; 167 case 's': mac = MS; break; 168 case 'l': disp = 1; break; 169 default: usage(); 170 } 171 else 172 usage(); 173 break; 174 case 'i': 175 iflag = YES; 176 break; 177 default: 178 usage(); 179 }ARGEND 180 if(*argv) 181 infile = opn(*argv++); 182 else{ 183 infile = malloc(sizeof(Biobuf)); 184 Binit(infile, 0, OREAD); 185 } 186 files[0] = infile; 187 filesp = &files[0]; 188 189 for(i='a'; i<='z' ; ++i) 190 chars[i] = LETTER; 191 for(i='A'; i<='Z'; ++i) 192 chars[i] = LETTER; 193 for(i='0'; i<='9'; ++i) 194 chars[i] = DIGIT; 195 chars['\''] = APOS; 196 chars['&'] = APOS; 197 chars['\b'] = APOS; 198 chars['.'] = PUNCT; 199 chars[','] = PUNCT; 200 chars[';'] = PUNCT; 201 chars['?'] = PUNCT; 202 chars[':'] = PUNCT; 203 work(); 204 } 205 206 long 207 skeqn(void) 208 { 209 while(C1 != rdelim) 210 if(c == '\\') 211 c = C1; 212 else if(c == '"') 213 while(C1 != '"') 214 if(c == '\\') 215 C1; 216 if (msflag) 217 eqnflag = 1; 218 return(c = ' '); 219 } 220 221 Biobuf* 222 opn(char *p) 223 { 224 Biobuf *fd; 225 226 while ((fd = Bopen(p, OREAD)) == 0) { 227 if(msflag || p == devnull) 228 fatal("Cannot open file %s - quitting\n", p); 229 else { 230 fprint(2, "Deroff: Cannot open file %s - continuing\n", p); 231 p = devnull; 232 } 233 } 234 linect = 0; 235 return(fd); 236 } 237 238 int 239 eof(void) 240 { 241 if(Bfildes(infile) != 0) 242 Bterm(infile); 243 if(filesp > files) 244 infile = *--filesp; 245 else 246 if(*argv) 247 infile = opn(*argv++); 248 else 249 exits(0); 250 return(C); 251 } 252 253 void 254 getfname(void) 255 { 256 char *p; 257 Rune r; 258 Dir *dir; 259 struct chain 260 { 261 struct chain* nextp; 262 char* datap; 263 } *q; 264 265 static struct chain *namechain= 0; 266 267 while(C == ' ') 268 ; 269 for(p = fname; (r=c) != '\n' && r != ' ' && r != '\t' && r != '\\'; C) 270 p += runetochar(p, &r); 271 *p = '\0'; 272 while(c != '\n') 273 C; 274 if(!strcmp(fname, "/sys/lib/tmac/tmac.cs") 275 || !strcmp(fname, "/sys/lib/tmac/tmac.s")) { 276 fname[0] = '\0'; 277 return; 278 } 279 dir = dirstat(fname); 280 if(dir!=nil && ((dir->mode & DMDIR) || dir->type != 'M')) { 281 free(dir); 282 fname[0] = '\0'; 283 return; 284 } 285 free(dir); 286 /* 287 * see if this name has already been used 288 */ 289 290 for(q = namechain; q; q = q->nextp) 291 if( !strcmp(fname, q->datap)) { 292 fname[0] = '\0'; 293 return; 294 } 295 q = (struct chain*)malloc(sizeof(struct chain)); 296 q->nextp = namechain; 297 q->datap = copys(fname); 298 namechain = q; 299 } 300 301 void 302 usage(void) 303 { 304 fprint(2,"usage: deroff [-nw_pi] [-m (m s l)] [file ...] \n"); 305 exits("usage"); 306 } 307 308 void 309 fatal(char *s, char *p) 310 { 311 fprint(2, "deroff: "); 312 fprint(2, s, p); 313 exits(s); 314 } 315 316 void 317 work(void) 318 { 319 320 for(;;) { 321 eqnflag = 0; 322 if(C == '.' || c == '\'') 323 comline(); 324 else 325 regline(NO, TWO); 326 } 327 } 328 329 void 330 regline(int macline, int vconst) 331 { 332 line[0] = c; 333 lp = line; 334 for(;;) { 335 if(c == '\\') { 336 *lp = ' '; 337 backsl(); 338 if(c == '%') /* no blank for hyphenation char */ 339 lp--; 340 } 341 if(c == '\n') 342 break; 343 if(intable && c=='T') { 344 *++lp = C; 345 if(c=='{' || c=='}') { 346 lp[-1] = ' '; 347 *lp = C; 348 } 349 } else { 350 if(msflag == 1 && eqnflag == 1) { 351 eqnflag = 0; 352 *++lp = 'x'; 353 } 354 *++lp = C; 355 } 356 } 357 *lp = '\0'; 358 if(lp != line) { 359 if(wordflag) 360 putwords(); 361 else 362 if(macline) 363 putmac(line,vconst); 364 else 365 Bprint(&bout, "%S\n", line); 366 } 367 } 368 369 void 370 putmac(Rune *rp, int vconst) 371 { 372 Rune *t; 373 int found; 374 Rune last; 375 376 found = 0; 377 last = 0; 378 while(*rp) { 379 while(*rp == ' ' || *rp == '\t') 380 Bputrune(&bout, *rp++); 381 for(t = rp; *t != ' ' && *t != '\t' && *t != '\0'; t++) 382 ; 383 if(*rp == '\"') 384 rp++; 385 if(t > rp+vconst && charclass(*rp) == LETTER 386 && charclass(rp[1]) == LETTER) { 387 while(rp < t) 388 if(*rp == '\"') 389 rp++; 390 else 391 Bputrune(&bout, *rp++); 392 last = t[-1]; 393 found++; 394 } else 395 if(found && charclass(*rp) == PUNCT && rp[1] == '\0') 396 Bputrune(&bout, *rp++); 397 else { 398 last = t[-1]; 399 rp = t; 400 } 401 } 402 Bputc(&bout, '\n'); 403 if(msflag && charclass(last) == PUNCT) 404 Bprint(&bout, " %C\n", last); 405 } 406 407 /* 408 * break into words for -w option 409 */ 410 void 411 putwords(void) 412 { 413 Rune *p, *p1; 414 int i, nlet; 415 416 417 for(p1 = line;;) { 418 /* 419 * skip initial specials ampersands and apostrophes 420 */ 421 while((i = charclass(*p1)) != EXTENDED && i < DIGIT) 422 if(*p1++ == '\0') 423 return; 424 nlet = 0; 425 for(p = p1; (i = charclass(*p)) != SPECIAL || (underscoreflag && *p=='_'); p++) 426 if(i == LETTER || (underscoreflag && *p == '_')) 427 nlet++; 428 /* 429 * MDM definition of word 430 */ 431 if(nlet > 1) { 432 /* 433 * delete trailing ampersands and apostrophes 434 */ 435 while(*--p == '\'' || *p == '&' 436 || charclass(*p) == PUNCT) 437 ; 438 while(p1 <= p) 439 Bputrune(&bout, *p1++); 440 Bputc(&bout, '\n'); 441 } else 442 p1 = p; 443 } 444 } 445 446 void 447 comline(void) 448 { 449 long c1, c2; 450 451 while(C==' ' || c=='\t') 452 ; 453 comx: 454 if((c1=c) == '\n') 455 return; 456 c2 = C; 457 if(c1=='.' && c2!='.') 458 inmacro = NO; 459 if(msflag && c1 == '['){ 460 refer(c2); 461 return; 462 } 463 if(c2 == '\n') 464 return; 465 if(c1 == '\\' && c2 == '\"') 466 SKIP; 467 else 468 if (filesp==files && c1=='E' && c2=='Q') 469 eqn(); 470 else 471 if(filesp==files && c1=='T' && (c2=='S' || c2=='C' || c2=='&')) { 472 if(msflag) 473 stbl(); 474 else 475 tbl(); 476 } 477 else 478 if(c1=='T' && c2=='E') 479 intable = NO; 480 else if (!inmacro && 481 ((c1 == 'd' && c2 == 'e') || 482 (c1 == 'i' && c2 == 'g') || 483 (c1 == 'a' && c2 == 'm'))) 484 macro(); 485 else 486 if(c1=='s' && c2=='o') { 487 if(iflag) 488 SKIP; 489 else { 490 getfname(); 491 if(fname[0]) { 492 if(infile = opn(fname)) 493 *++filesp = infile; 494 else infile = *filesp; 495 } 496 } 497 } 498 else 499 if(c1=='n' && c2=='x') 500 if(iflag) 501 SKIP; 502 else { 503 getfname(); 504 if(fname[0] == '\0') 505 exits(0); 506 if(Bfildes(infile) != 0) 507 Bterm(infile); 508 infile = *filesp = opn(fname); 509 } 510 else 511 if(c1 == 't' && c2 == 'm') 512 SKIP; 513 else 514 if(c1=='h' && c2=='w') 515 SKIP; 516 else 517 if(msflag && c1 == 'T' && c2 == 'L') { 518 SKIP_TO_COM; 519 goto comx; 520 } 521 else 522 if(msflag && c1=='N' && c2 == 'R') 523 SKIP; 524 else 525 if(msflag && c1 == 'A' && (c2 == 'U' || c2 == 'I')){ 526 if(mac==MM)SKIP; 527 else { 528 SKIP_TO_COM; 529 goto comx; 530 } 531 } else 532 if(msflag && c1=='F' && c2=='S') { 533 SKIP_TO_COM; 534 goto comx; 535 } 536 else 537 if(msflag && (c1=='S' || c1=='N') && c2=='H') { 538 SKIP_TO_COM; 539 goto comx; 540 } else 541 if(c1 == 'U' && c2 == 'X') { 542 if(wordflag) 543 Bprint(&bout, "UNIX\n"); 544 else 545 Bprint(&bout, "UNIX "); 546 } else 547 if(msflag && c1=='O' && c2=='K') { 548 SKIP_TO_COM; 549 goto comx; 550 } else 551 if(msflag && c1=='N' && c2=='D') 552 SKIP; 553 else 554 if(msflag && mac==MM && c1=='H' && (c2==' '||c2=='U')) 555 SKIP; 556 else 557 if(msflag && mac==MM && c2=='L') { 558 if(disp || c1=='R') 559 sdis('L', 'E'); 560 else { 561 SKIP; 562 Bprint(&bout, " ."); 563 } 564 } else 565 if(!msflag && c1=='P' && c2=='S') { 566 inpic(); 567 } else 568 if(msflag && (c1=='D' || c1=='N' || c1=='K'|| c1=='P') && c2=='S') { 569 sdis(c1, 'E'); 570 } else 571 if(msflag && (c1 == 'K' && c2 == 'F')) { 572 sdis(c1,'E'); 573 } else 574 if(msflag && c1=='n' && c2=='f') 575 sdis('f','i'); 576 else 577 if(msflag && c1=='c' && c2=='e') 578 sce(); 579 else { 580 if(c1=='.' && c2=='.') { 581 if(msflag) { 582 SKIP; 583 return; 584 } 585 while(C == '.') 586 ; 587 } 588 inmacro++; 589 if(c1 <= 'Z' && msflag) 590 regline(YES,ONE); 591 else { 592 if(wordflag) 593 C; 594 regline(YES,TWO); 595 } 596 inmacro--; 597 } 598 } 599 600 void 601 macro(void) 602 { 603 if(msflag) { 604 do { 605 SKIP1; 606 } while(C1 != '.' || C1 != '.' || C1 == '.'); 607 if(c != '\n') 608 SKIP; 609 return; 610 } 611 SKIP; 612 inmacro = YES; 613 } 614 615 void 616 sdis(char a1, char a2) 617 { 618 int c1, c2; 619 int eqnf; 620 int lct; 621 622 if(a1 == 'P'){ 623 while(C1 == ' ') 624 ; 625 if(c == '<') { 626 SKIP1; 627 return; 628 } 629 } 630 lct = 0; 631 eqnf = 1; 632 if(c != '\n') 633 SKIP1; 634 for(;;) { 635 while(C1 != '.') 636 if(c == '\n') 637 continue; 638 else 639 SKIP1; 640 if((c1=C1) == '\n') 641 continue; 642 if((c2=C1) == '\n') { 643 if(a1 == 'f' && (c1 == 'P' || c1 == 'H')) 644 return; 645 continue; 646 } 647 if(c1==a1 && c2 == a2) { 648 SKIP1; 649 if(lct != 0){ 650 lct--; 651 continue; 652 } 653 if(eqnf) 654 Bprint(&bout, " ."); 655 Bputc(&bout, '\n'); 656 return; 657 } else 658 if(a1 == 'L' && c2 == 'L') { 659 lct++; 660 SKIP1; 661 } else 662 if(a1 == 'D' && c1 == 'E' && c2 == 'Q') { 663 eqn(); 664 eqnf = 0; 665 } else 666 if(a1 == 'f') { 667 if((mac == MS && c2 == 'P') || 668 (mac == MM && c1 == 'H' && c2 == 'U')){ 669 SKIP1; 670 return; 671 } 672 SKIP1; 673 } 674 else 675 SKIP1; 676 } 677 } 678 679 void 680 tbl(void) 681 { 682 while(C != '.') 683 ; 684 SKIP; 685 intable = YES; 686 } 687 688 void 689 stbl(void) 690 { 691 while(C != '.') 692 ; 693 SKIP_TO_COM; 694 if(c != 'T' || C != 'E') { 695 SKIP; 696 pc = c; 697 while(C != '.' || pc != '\n' || C != 'T' || C != 'E') 698 pc = c; 699 } 700 } 701 702 void 703 eqn(void) 704 { 705 long c1, c2; 706 int dflg; 707 char last; 708 709 last = 0; 710 dflg = 1; 711 SKIP; 712 713 for(;;) { 714 if(C1 == '.' || c == '\'') { 715 while(C1==' ' || c=='\t') 716 ; 717 if(c=='E' && C1=='N') { 718 SKIP; 719 if(msflag && dflg) { 720 Bputc(&bout, 'x'); 721 Bputc(&bout, ' '); 722 if(last) { 723 Bputc(&bout, last); 724 Bputc(&bout, '\n'); 725 } 726 } 727 return; 728 } 729 } else 730 if(c == 'd') { 731 if(C1=='e' && C1=='l') 732 if(C1=='i' && C1=='m') { 733 while(C1 == ' ') 734 ; 735 if((c1=c)=='\n' || (c2=C1)=='\n' || 736 (c1=='o' && c2=='f' && C1=='f')) { 737 ldelim = NOCHAR; 738 rdelim = NOCHAR; 739 } else { 740 ldelim = c1; 741 rdelim = c2; 742 } 743 } 744 dflg = 0; 745 } 746 if(c != '\n') 747 while(C1 != '\n') { 748 if(charclass(c) == PUNCT) 749 last = c; 750 else 751 if(c != ' ') 752 last = 0; 753 } 754 } 755 } 756 757 /* 758 * skip over a complete backslash vconstruction 759 */ 760 void 761 backsl(void) 762 { 763 int bdelim; 764 765 sw: 766 switch(C1) 767 { 768 case '"': 769 SKIP1; 770 return; 771 772 case 's': 773 if(C1 == '\\') 774 backsl(); 775 else { 776 while(C1>='0' && c<='9') 777 ; 778 Bungetrune(infile); 779 c = '0'; 780 } 781 lp--; 782 return; 783 784 case 'f': 785 case 'n': 786 case '*': 787 if(C1 != '(') 788 return; 789 790 case '(': 791 if(msflag) { 792 if(C == 'e') { 793 if(C1 == 'm') { 794 *lp = '-'; 795 return; 796 } 797 } else 798 if(c != '\n') 799 C1; 800 return; 801 } 802 if(C1 != '\n') 803 C1; 804 return; 805 806 case '$': 807 C1; /* discard argument number */ 808 return; 809 810 case 'b': 811 case 'x': 812 case 'v': 813 case 'h': 814 case 'w': 815 case 'o': 816 case 'l': 817 case 'L': 818 if((bdelim=C1) == '\n') 819 return; 820 while(C1!='\n' && c!=bdelim) 821 if(c == '\\') 822 backsl(); 823 return; 824 825 case '\\': 826 if(inmacro) 827 goto sw; 828 default: 829 return; 830 } 831 } 832 833 char* 834 copys(char *s) 835 { 836 char *t, *t0; 837 838 if((t0 = t = malloc((strlen(s)+1))) == 0) 839 fatal("Cannot allocate memory", (char*)0); 840 while(*t++ = *s++) 841 ; 842 return(t0); 843 } 844 845 void 846 sce(void) 847 { 848 int n = 1; 849 850 while (C != '\n' && !('0' <= c && c <= '9')) 851 ; 852 if (c != '\n') { 853 for (n = c-'0';'0' <= C && c <= '9';) 854 n = n*10 + c-'0'; 855 } 856 while(n) { 857 if(C == '.') { 858 if(C == 'c') { 859 if(C == 'e') { 860 while(C == ' ') 861 ; 862 if(c == '0') { 863 SKIP; 864 break; 865 } else 866 SKIP; 867 } else 868 SKIP; 869 } else 870 if(c == 'P' || C == 'P') { 871 if(c != '\n') 872 SKIP; 873 break; 874 } else 875 if(c != '\n') 876 SKIP; 877 } else { 878 SKIP; 879 n--; 880 } 881 } 882 } 883 884 void 885 refer(int c1) 886 { 887 int c2; 888 889 if(c1 != '\n') 890 SKIP; 891 c2 = 0; 892 for(;;) { 893 if(C != '.') 894 SKIP; 895 else { 896 if(C != ']') 897 SKIP; 898 else { 899 while(C != '\n') 900 c2 = c; 901 if(charclass(c2) == PUNCT) 902 Bprint(&bout, " %C",c2); 903 return; 904 } 905 } 906 } 907 } 908 909 void 910 inpic(void) 911 { 912 int c1; 913 Rune *p1; 914 915 /* SKIP1;*/ 916 while(C1 != '\n') 917 if(c == '<'){ 918 SKIP1; 919 return; 920 } 921 p1 = line; 922 c = '\n'; 923 for(;;) { 924 c1 = c; 925 if(C1 == '.' && c1 == '\n') { 926 if(C1 != 'P' || C1 != 'E') { 927 if(c != '\n'){ 928 SKIP1; 929 c = '\n'; 930 } 931 continue; 932 } 933 SKIP1; 934 return; 935 } else 936 if(c == '\"') { 937 while(C1 != '\"') { 938 if(c == '\\') { 939 if(C1 == '\"') 940 continue; 941 Bungetrune(infile); 942 backsl(); 943 } else 944 *p1++ = c; 945 } 946 *p1++ = ' '; 947 } else 948 if(c == '\n' && p1 != line) { 949 *p1 = '\0'; 950 if(wordflag) 951 putwords(); 952 else 953 Bprint(&bout, "%S\n\n", line); 954 p1 = line; 955 } 956 } 957 } 958 959 int 960 charclass(int c) 961 { 962 if(c < MAX_ASCII) 963 return chars[c]; 964 switch(c){ 965 case 0x2013: case 0x2014: /* en dash, em dash */ 966 return SPECIAL; 967 } 968 return EXTENDED; 969 }