regexp.c (15575B)
1 #include "sam.h" 2 3 Rangeset sel; 4 String lastregexp; 5 /* 6 * Machine Information 7 */ 8 typedef struct Inst Inst; 9 10 struct Inst 11 { 12 long type; /* < OPERATOR ==> literal, otherwise action */ 13 union { 14 int rsid; 15 int rsubid; 16 int class; 17 struct Inst *rother; 18 struct Inst *rright; 19 } r; 20 union{ 21 struct Inst *lleft; 22 struct Inst *lnext; 23 } l; 24 }; 25 #define sid r.rsid 26 #define subid r.rsubid 27 #define rclass r.class 28 #define other r.rother 29 #define right r.rright 30 #define left l.lleft 31 #define next l.lnext 32 33 #define NPROG 1024 34 Inst program[NPROG]; 35 Inst *progp; 36 Inst *startinst; /* First inst. of program; might not be program[0] */ 37 Inst *bstartinst; /* same for backwards machine */ 38 39 typedef struct Ilist Ilist; 40 struct Ilist 41 { 42 Inst *inst; /* Instruction of the thread */ 43 Rangeset se; 44 Posn startp; /* first char of match */ 45 }; 46 47 #define NLIST 127 48 49 Ilist *tl, *nl; /* This list, next list */ 50 Ilist list[2][NLIST+1]; /* +1 for trailing null */ 51 static Rangeset sempty; 52 53 /* 54 * Actions and Tokens 55 * 56 * 0x10000xx are operators, value == precedence 57 * 0x20000xx are tokens, i.e. operands for operators 58 */ 59 #define OPERATOR 0x1000000 /* Bit set in all operators */ 60 #define START (OPERATOR+0) /* Start, used for marker on stack */ 61 #define RBRA (OPERATOR+1) /* Right bracket, ) */ 62 #define LBRA (OPERATOR+2) /* Left bracket, ( */ 63 #define OR (OPERATOR+3) /* Alternation, | */ 64 #define CAT (OPERATOR+4) /* Concatentation, implicit operator */ 65 #define STAR (OPERATOR+5) /* Closure, * */ 66 #define PLUS (OPERATOR+6) /* a+ == aa* */ 67 #define QUEST (OPERATOR+7) /* a? == a|nothing, i.e. 0 or 1 a's */ 68 #define ANY 0x2000000 /* Any character but newline, . */ 69 #define NOP (ANY+1) /* No operation, internal use only */ 70 #define BOL (ANY+2) /* Beginning of line, ^ */ 71 #define EOL (ANY+3) /* End of line, $ */ 72 #define CCLASS (ANY+4) /* Character class, [] */ 73 #define NCCLASS (ANY+5) /* Negated character class, [^] */ 74 #define END (ANY+0x77) /* Terminate: match found */ 75 76 #define ISATOR OPERATOR 77 #define ISAND ANY 78 79 #define QUOTED 0x4000000 /* Bit set for \-ed lex characters */ 80 81 /* 82 * Parser Information 83 */ 84 typedef struct Node Node; 85 struct Node 86 { 87 Inst *first; 88 Inst *last; 89 }; 90 91 #define NSTACK 20 92 Node andstack[NSTACK]; 93 Node *andp; 94 int atorstack[NSTACK]; 95 int *atorp; 96 int lastwasand; /* Last token was operand */ 97 int cursubid; 98 int subidstack[NSTACK]; 99 int *subidp; 100 int backwards; 101 int nbra; 102 Rune *exprp; /* pointer to next character in source expression */ 103 #define DCLASS 10 /* allocation increment */ 104 int nclass; /* number active */ 105 int Nclass; /* high water mark */ 106 Rune **class; 107 int negateclass; 108 109 int addinst(Ilist *l, Inst *inst, Rangeset *sep); 110 void newmatch(Rangeset*); 111 void bnewmatch(Rangeset*); 112 void pushand(Inst*, Inst*); 113 void pushator(int); 114 Node *popand(int); 115 int popator(void); 116 void startlex(Rune*); 117 int lex(void); 118 void operator(int); 119 void operand(int); 120 void evaluntil(int); 121 void optimize(Inst*); 122 void bldcclass(void); 123 124 void 125 regerror(Err e) 126 { 127 Strzero(&lastregexp); 128 error(e); 129 } 130 131 void 132 regerror_c(Err e, int c) 133 { 134 Strzero(&lastregexp); 135 error_c(e, c); 136 } 137 138 Inst * 139 newinst(int t) 140 { 141 if(progp >= &program[NPROG]) 142 regerror(Etoolong); 143 progp->type = t; 144 progp->left = 0; 145 progp->right = 0; 146 return progp++; 147 } 148 149 Inst * 150 realcompile(Rune *s) 151 { 152 int token; 153 154 startlex(s); 155 atorp = atorstack; 156 andp = andstack; 157 subidp = subidstack; 158 cursubid = 0; 159 lastwasand = FALSE; 160 /* Start with a low priority operator to prime parser */ 161 pushator(START-1); 162 while((token=lex()) != END){ 163 if((token&ISATOR) == OPERATOR) 164 operator(token); 165 else 166 operand(token); 167 } 168 /* Close with a low priority operator */ 169 evaluntil(START); 170 /* Force END */ 171 operand(END); 172 evaluntil(START); 173 if(nbra) 174 regerror(Eleftpar); 175 --andp; /* points to first and only operand */ 176 return andp->first; 177 } 178 179 void 180 compile(String *s) 181 { 182 int i; 183 Inst *oprogp; 184 185 if(Strcmp(s, &lastregexp)==0) 186 return; 187 for(i=0; i<nclass; i++) 188 free(class[i]); 189 nclass = 0; 190 progp = program; 191 backwards = FALSE; 192 startinst = realcompile(s->s); 193 optimize(program); 194 oprogp = progp; 195 backwards = TRUE; 196 bstartinst = realcompile(s->s); 197 optimize(oprogp); 198 Strduplstr(&lastregexp, s); 199 } 200 201 void 202 operand(int t) 203 { 204 Inst *i; 205 if(lastwasand) 206 operator(CAT); /* catenate is implicit */ 207 i = newinst(t); 208 if(t == CCLASS){ 209 if(negateclass) 210 i->type = NCCLASS; /* UGH */ 211 i->rclass = nclass-1; /* UGH */ 212 } 213 pushand(i, i); 214 lastwasand = TRUE; 215 } 216 217 void 218 operator(int t) 219 { 220 if(t==RBRA && --nbra<0) 221 regerror(Erightpar); 222 if(t==LBRA){ 223 /* 224 * if(++cursubid >= NSUBEXP) 225 * regerror(Esubexp); 226 */ 227 cursubid++; /* silently ignored */ 228 nbra++; 229 if(lastwasand) 230 operator(CAT); 231 }else 232 evaluntil(t); 233 if(t!=RBRA) 234 pushator(t); 235 lastwasand = FALSE; 236 if(t==STAR || t==QUEST || t==PLUS || t==RBRA) 237 lastwasand = TRUE; /* these look like operands */ 238 } 239 240 void 241 cant(char *s) 242 { 243 char buf[100]; 244 245 sprint(buf, "regexp: can't happen: %s", s); 246 panic(buf); 247 } 248 249 void 250 pushand(Inst *f, Inst *l) 251 { 252 if(andp >= &andstack[NSTACK]) 253 cant("operand stack overflow"); 254 andp->first = f; 255 andp->last = l; 256 andp++; 257 } 258 259 void 260 pushator(int t) 261 { 262 if(atorp >= &atorstack[NSTACK]) 263 cant("operator stack overflow"); 264 *atorp++=t; 265 if(cursubid >= NSUBEXP) 266 *subidp++= -1; 267 else 268 *subidp++=cursubid; 269 } 270 271 Node * 272 popand(int op) 273 { 274 if(andp <= &andstack[0]) 275 if(op) 276 regerror_c(Emissop, op); 277 else 278 regerror(Ebadregexp); 279 return --andp; 280 } 281 282 int 283 popator(void) 284 { 285 if(atorp <= &atorstack[0]) 286 cant("operator stack underflow"); 287 --subidp; 288 return *--atorp; 289 } 290 291 void 292 evaluntil(int pri) 293 { 294 Node *op1, *op2, *t; 295 Inst *inst1, *inst2; 296 297 while(pri==RBRA || atorp[-1]>=pri){ 298 switch(popator()){ 299 case LBRA: 300 op1 = popand('('); 301 inst2 = newinst(RBRA); 302 inst2->subid = *subidp; 303 op1->last->next = inst2; 304 inst1 = newinst(LBRA); 305 inst1->subid = *subidp; 306 inst1->next = op1->first; 307 pushand(inst1, inst2); 308 return; /* must have been RBRA */ 309 default: 310 panic("unknown regexp operator"); 311 break; 312 case OR: 313 op2 = popand('|'); 314 op1 = popand('|'); 315 inst2 = newinst(NOP); 316 op2->last->next = inst2; 317 op1->last->next = inst2; 318 inst1 = newinst(OR); 319 inst1->right = op1->first; 320 inst1->left = op2->first; 321 pushand(inst1, inst2); 322 break; 323 case CAT: 324 op2 = popand(0); 325 op1 = popand(0); 326 if(backwards && op2->first->type!=END) 327 t = op1, op1 = op2, op2 = t; 328 op1->last->next = op2->first; 329 pushand(op1->first, op2->last); 330 break; 331 case STAR: 332 op2 = popand('*'); 333 inst1 = newinst(OR); 334 op2->last->next = inst1; 335 inst1->right = op2->first; 336 pushand(inst1, inst1); 337 break; 338 case PLUS: 339 op2 = popand('+'); 340 inst1 = newinst(OR); 341 op2->last->next = inst1; 342 inst1->right = op2->first; 343 pushand(op2->first, inst1); 344 break; 345 case QUEST: 346 op2 = popand('?'); 347 inst1 = newinst(OR); 348 inst2 = newinst(NOP); 349 inst1->left = inst2; 350 inst1->right = op2->first; 351 op2->last->next = inst2; 352 pushand(inst1, inst2); 353 break; 354 } 355 } 356 } 357 358 359 void 360 optimize(Inst *start) 361 { 362 Inst *inst, *target; 363 364 for(inst=start; inst->type!=END; inst++){ 365 target = inst->next; 366 while(target->type == NOP) 367 target = target->next; 368 inst->next = target; 369 } 370 } 371 372 #ifdef DEBUG 373 void 374 dumpstack(void){ 375 Node *stk; 376 int *ip; 377 378 dprint("operators\n"); 379 for(ip = atorstack; ip<atorp; ip++) 380 dprint("0%o\n", *ip); 381 dprint("operands\n"); 382 for(stk = andstack; stk<andp; stk++) 383 dprint("0%o\t0%o\n", stk->first->type, stk->last->type); 384 } 385 void 386 dump(void){ 387 Inst *l; 388 389 l = program; 390 do{ 391 dprint("%d:\t0%o\t%d\t%d\n", l-program, l->type, 392 l->left-program, l->right-program); 393 }while(l++->type); 394 } 395 #endif 396 397 void 398 startlex(Rune *s) 399 { 400 exprp = s; 401 nbra = 0; 402 } 403 404 405 int 406 lex(void){ 407 int c= *exprp++; 408 409 switch(c){ 410 case '\\': 411 if(*exprp) 412 if((c= *exprp++)=='n') 413 c='\n'; 414 break; 415 case 0: 416 c = END; 417 --exprp; /* In case we come here again */ 418 break; 419 case '*': 420 c = STAR; 421 break; 422 case '?': 423 c = QUEST; 424 break; 425 case '+': 426 c = PLUS; 427 break; 428 case '|': 429 c = OR; 430 break; 431 case '.': 432 c = ANY; 433 break; 434 case '(': 435 c = LBRA; 436 break; 437 case ')': 438 c = RBRA; 439 break; 440 case '^': 441 c = BOL; 442 break; 443 case '$': 444 c = EOL; 445 break; 446 case '[': 447 c = CCLASS; 448 bldcclass(); 449 break; 450 } 451 return c; 452 } 453 454 long 455 nextrec(void){ 456 if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0)) 457 regerror(Ebadclass); 458 if(exprp[0] == '\\'){ 459 exprp++; 460 if(*exprp=='n'){ 461 exprp++; 462 return '\n'; 463 } 464 return *exprp++|QUOTED; 465 } 466 return *exprp++; 467 } 468 469 void 470 bldcclass(void) 471 { 472 long c1, c2, n, na; 473 Rune *classp; 474 475 classp = emalloc(DCLASS*RUNESIZE); 476 n = 0; 477 na = DCLASS; 478 /* we have already seen the '[' */ 479 if(*exprp == '^'){ 480 classp[n++] = '\n'; /* don't match newline in negate case */ 481 negateclass = TRUE; 482 exprp++; 483 }else 484 negateclass = FALSE; 485 while((c1 = nextrec()) != ']'){ 486 if(c1 == '-'){ 487 Error: 488 free(classp); 489 regerror(Ebadclass); 490 } 491 if(n+4 >= na){ /* 3 runes plus NUL */ 492 na += DCLASS; 493 classp = erealloc(classp, na*RUNESIZE); 494 } 495 if(*exprp == '-'){ 496 exprp++; /* eat '-' */ 497 if((c2 = nextrec()) == ']') 498 goto Error; 499 classp[n+0] = Runemax; 500 classp[n+1] = c1; 501 classp[n+2] = c2; 502 n += 3; 503 }else 504 classp[n++] = c1 & ~QUOTED; 505 } 506 classp[n] = 0; 507 if(nclass == Nclass){ 508 Nclass += DCLASS; 509 class = erealloc(class, Nclass*sizeof(Rune*)); 510 } 511 class[nclass++] = classp; 512 } 513 514 int 515 classmatch(int classno, int c, int negate) 516 { 517 Rune *p; 518 519 p = class[classno]; 520 while(*p){ 521 if(*p == Runemax){ 522 if(p[1]<=c && c<=p[2]) 523 return !negate; 524 p += 3; 525 }else if(*p++ == c) 526 return !negate; 527 } 528 return negate; 529 } 530 531 /* 532 * Note optimization in addinst: 533 * *l must be pending when addinst called; if *l has been looked 534 * at already, the optimization is a bug. 535 */ 536 int 537 addinst(Ilist *l, Inst *inst, Rangeset *sep) 538 { 539 Ilist *p; 540 541 for(p = l; p->inst; p++){ 542 if(p->inst==inst){ 543 if((sep)->p[0].p1 < p->se.p[0].p1) 544 p->se= *sep; /* this would be bug */ 545 return 0; /* It's already there */ 546 } 547 } 548 p->inst = inst; 549 p->se= *sep; 550 (p+1)->inst = 0; 551 return 1; 552 } 553 554 int 555 execute(File *f, Posn startp, Posn eof) 556 { 557 int flag = 0; 558 Inst *inst; 559 Ilist *tlp; 560 Posn p = startp; 561 int nnl = 0, ntl; 562 int c; 563 int wrapped = 0; 564 int startchar = startinst->type<OPERATOR? startinst->type : 0; 565 566 list[0][0].inst = list[1][0].inst = 0; 567 sel.p[0].p1 = -1; 568 /* Execute machine once for each character */ 569 for(;;p++){ 570 doloop: 571 c = filereadc(f, p); 572 if(p>=eof || c<0){ 573 switch(wrapped++){ 574 case 0: /* let loop run one more click */ 575 case 2: 576 break; 577 case 1: /* expired; wrap to beginning */ 578 if(sel.p[0].p1>=0 || eof!=INFINITY) 579 goto Return; 580 list[0][0].inst = list[1][0].inst = 0; 581 p = 0; 582 goto doloop; 583 default: 584 goto Return; 585 } 586 }else if(((wrapped && p>=startp) || sel.p[0].p1>0) && nnl==0) 587 break; 588 /* fast check for first char */ 589 if(startchar && nnl==0 && c!=startchar) 590 continue; 591 tl = list[flag]; 592 nl = list[flag^=1]; 593 nl->inst = 0; 594 ntl = nnl; 595 nnl = 0; 596 if(sel.p[0].p1<0 && (!wrapped || p<startp || startp==eof)){ 597 /* Add first instruction to this list */ 598 sempty.p[0].p1 = p; 599 if(addinst(tl, startinst, &sempty)) 600 if(++ntl >= NLIST) 601 Overflow: 602 error(Eoverflow); 603 } 604 /* Execute machine until this list is empty */ 605 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */ 606 Switchstmt: 607 switch(inst->type){ 608 default: /* regular character */ 609 if(inst->type==c){ 610 Addinst: 611 if(addinst(nl, inst->next, &tlp->se)) 612 if(++nnl >= NLIST) 613 goto Overflow; 614 } 615 break; 616 case LBRA: 617 if(inst->subid>=0) 618 tlp->se.p[inst->subid].p1 = p; 619 inst = inst->next; 620 goto Switchstmt; 621 case RBRA: 622 if(inst->subid>=0) 623 tlp->se.p[inst->subid].p2 = p; 624 inst = inst->next; 625 goto Switchstmt; 626 case ANY: 627 if(c!='\n') 628 goto Addinst; 629 break; 630 case BOL: 631 if(p==0 || filereadc(f, p - 1)=='\n'){ 632 Step: 633 inst = inst->next; 634 goto Switchstmt; 635 } 636 break; 637 case EOL: 638 if(c == '\n') 639 goto Step; 640 break; 641 case CCLASS: 642 if(c>=0 && classmatch(inst->rclass, c, 0)) 643 goto Addinst; 644 break; 645 case NCCLASS: 646 if(c>=0 && classmatch(inst->rclass, c, 1)) 647 goto Addinst; 648 break; 649 case OR: 650 /* evaluate right choice later */ 651 if(addinst(tl, inst->right, &tlp->se)) 652 if(++ntl >= NLIST) 653 goto Overflow; 654 /* efficiency: advance and re-evaluate */ 655 inst = inst->left; 656 goto Switchstmt; 657 case END: /* Match! */ 658 tlp->se.p[0].p2 = p; 659 newmatch(&tlp->se); 660 break; 661 } 662 } 663 } 664 Return: 665 return sel.p[0].p1>=0; 666 } 667 668 void 669 newmatch(Rangeset *sp) 670 { 671 int i; 672 673 if(sel.p[0].p1<0 || sp->p[0].p1<sel.p[0].p1 || 674 (sp->p[0].p1==sel.p[0].p1 && sp->p[0].p2>sel.p[0].p2)) 675 for(i = 0; i<NSUBEXP; i++) 676 sel.p[i] = sp->p[i]; 677 } 678 679 int 680 bexecute(File *f, Posn startp) 681 { 682 int flag = 0; 683 Inst *inst; 684 Ilist *tlp; 685 Posn p = startp; 686 int nnl = 0, ntl; 687 int c; 688 int wrapped = 0; 689 int startchar = bstartinst->type<OPERATOR? bstartinst->type : 0; 690 691 list[0][0].inst = list[1][0].inst = 0; 692 sel.p[0].p1= -1; 693 /* Execute machine once for each character, including terminal NUL */ 694 for(;;--p){ 695 doloop: 696 if((c = filereadc(f, p - 1))==-1){ 697 switch(wrapped++){ 698 case 0: /* let loop run one more click */ 699 case 2: 700 break; 701 case 1: /* expired; wrap to end */ 702 if(sel.p[0].p1>=0) 703 goto Return; 704 list[0][0].inst = list[1][0].inst = 0; 705 p = f->b.nc; 706 goto doloop; 707 case 3: 708 default: 709 goto Return; 710 } 711 }else if(((wrapped && p<=startp) || sel.p[0].p1>0) && nnl==0) 712 break; 713 /* fast check for first char */ 714 if(startchar && nnl==0 && c!=startchar) 715 continue; 716 tl = list[flag]; 717 nl = list[flag^=1]; 718 nl->inst = 0; 719 ntl = nnl; 720 nnl = 0; 721 if(sel.p[0].p1<0 && (!wrapped || p>startp)){ 722 /* Add first instruction to this list */ 723 /* the minus is so the optimizations in addinst work */ 724 sempty.p[0].p1 = -p; 725 if(addinst(tl, bstartinst, &sempty)) 726 if(++ntl >= NLIST) 727 Overflow: 728 error(Eoverflow); 729 } 730 /* Execute machine until this list is empty */ 731 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */ 732 Switchstmt: 733 switch(inst->type){ 734 default: /* regular character */ 735 if(inst->type == c){ 736 Addinst: 737 if(addinst(nl, inst->next, &tlp->se)) 738 if(++nnl >= NLIST) 739 goto Overflow; 740 } 741 break; 742 case LBRA: 743 if(inst->subid>=0) 744 tlp->se.p[inst->subid].p1 = p; 745 inst = inst->next; 746 goto Switchstmt; 747 case RBRA: 748 if(inst->subid >= 0) 749 tlp->se.p[inst->subid].p2 = p; 750 inst = inst->next; 751 goto Switchstmt; 752 case ANY: 753 if(c != '\n') 754 goto Addinst; 755 break; 756 case BOL: 757 if(c=='\n' || p==0){ 758 Step: 759 inst = inst->next; 760 goto Switchstmt; 761 } 762 break; 763 case EOL: 764 if(p==f->b.nc || filereadc(f, p)=='\n') 765 goto Step; 766 break; 767 case CCLASS: 768 if(c>=0 && classmatch(inst->rclass, c, 0)) 769 goto Addinst; 770 break; 771 case NCCLASS: 772 if(c>=0 && classmatch(inst->rclass, c, 1)) 773 goto Addinst; 774 break; 775 case OR: 776 /* evaluate right choice later */ 777 if(addinst(tlp, inst->right, &tlp->se)) 778 if(++ntl >= NLIST) 779 goto Overflow; 780 /* efficiency: advance and re-evaluate */ 781 inst = inst->left; 782 goto Switchstmt; 783 case END: /* Match! */ 784 tlp->se.p[0].p1 = -tlp->se.p[0].p1; /* minus sign */ 785 tlp->se.p[0].p2 = p; 786 bnewmatch(&tlp->se); 787 break; 788 } 789 } 790 } 791 Return: 792 return sel.p[0].p1>=0; 793 } 794 795 void 796 bnewmatch(Rangeset *sp) 797 { 798 int i; 799 if(sel.p[0].p1<0 || sp->p[0].p1>sel.p[0].p2 || (sp->p[0].p1==sel.p[0].p2 && sp->p[0].p2<sel.p[0].p1)) 800 for(i = 0; i<NSUBEXP; i++){ /* note the reversal; p1<=p2 */ 801 sel.p[i].p1 = sp->p[i].p2; 802 sel.p[i].p2 = sp->p[i].p1; 803 } 804 }