regx.c (16508B)
1 #include <u.h> 2 #include <libc.h> 3 #include <draw.h> 4 #include <thread.h> 5 #include <cursor.h> 6 #include <mouse.h> 7 #include <keyboard.h> 8 #include <frame.h> 9 #include <fcall.h> 10 #include <plumb.h> 11 #include <libsec.h> 12 #include "dat.h" 13 #include "fns.h" 14 15 Rangeset sel; 16 Rune *lastregexp; 17 18 #undef class 19 #define class regxclass /* some systems declare "class" in system headers */ 20 21 /* 22 * Machine Information 23 */ 24 typedef struct Inst Inst; 25 struct Inst 26 { 27 uint type; /* < OPERATOR ==> literal, otherwise action */ 28 union { 29 int sid; 30 int subid; 31 int class; 32 Inst *other; 33 Inst *right; 34 } u; 35 union{ 36 Inst *left; 37 Inst *next; 38 } u1; 39 }; 40 41 #define NPROG 1024 42 Inst program[NPROG]; 43 Inst *progp; 44 Inst *startinst; /* First inst. of program; might not be program[0] */ 45 Inst *bstartinst; /* same for backwards machine */ 46 Channel *rechan; /* chan(Inst*) */ 47 48 typedef struct Ilist Ilist; 49 struct Ilist 50 { 51 Inst *inst; /* Instruction of the thread */ 52 Rangeset se; 53 uint startp; /* first char of match */ 54 }; 55 56 #define NLIST 127 57 58 Ilist *tl, *nl; /* This list, next list */ 59 Ilist list[2][NLIST+1]; /* +1 for trailing null */ 60 static Rangeset sempty; 61 62 /* 63 * Actions and Tokens 64 * 65 * 0x10000xx are operators, value == precedence 66 * 0x20000xx are tokens, i.e. operands for operators 67 */ 68 #define OPERATOR 0x1000000 /* Bit set in all operators */ 69 #define START (OPERATOR+0) /* Start, used for marker on stack */ 70 #define RBRA (OPERATOR+1) /* Right bracket, ) */ 71 #define LBRA (OPERATOR+2) /* Left bracket, ( */ 72 #define OR (OPERATOR+3) /* Alternation, | */ 73 #define CAT (OPERATOR+4) /* Concatentation, implicit operator */ 74 #define STAR (OPERATOR+5) /* Closure, * */ 75 #define PLUS (OPERATOR+6) /* a+ == aa* */ 76 #define QUEST (OPERATOR+7) /* a? == a|nothing, i.e. 0 or 1 a's */ 77 #define ANY 0x2000000 /* Any character but newline, . */ 78 #define NOP (ANY+1) /* No operation, internal use only */ 79 #define BOL (ANY+2) /* Beginning of line, ^ */ 80 #define EOL (ANY+3) /* End of line, $ */ 81 #define CCLASS (ANY+4) /* Character class, [] */ 82 #define NCCLASS (ANY+5) /* Negated character class, [^] */ 83 #define END (ANY+0x77) /* Terminate: match found */ 84 85 #define ISATOR OPERATOR 86 #define ISAND ANY 87 88 #define QUOTED 0x4000000 /* Bit set for \-ed lex characters */ 89 90 /* 91 * Parser Information 92 */ 93 typedef struct Node Node; 94 struct Node 95 { 96 Inst *first; 97 Inst *last; 98 }; 99 100 #define NSTACK 20 101 Node andstack[NSTACK]; 102 Node *andp; 103 int atorstack[NSTACK]; 104 int *atorp; 105 int lastwasand; /* Last token was operand */ 106 int cursubid; 107 int subidstack[NSTACK]; 108 int *subidp; 109 int backwards; 110 int nbra; 111 Rune *exprp; /* pointer to next character in source expression */ 112 #define DCLASS 10 /* allocation increment */ 113 int nclass; /* number active */ 114 int Nclass; /* high water mark */ 115 Rune **class; 116 int negateclass; 117 118 int addinst(Ilist *l, Inst *inst, Rangeset *sep); 119 void newmatch(Rangeset*); 120 void bnewmatch(Rangeset*); 121 void pushand(Inst*, Inst*); 122 void pushator(int); 123 Node *popand(int); 124 int popator(void); 125 void startlex(Rune*); 126 int lex(void); 127 void operator(int); 128 void operand(int); 129 void evaluntil(int); 130 void optimize(Inst*); 131 void bldcclass(void); 132 133 void 134 rxinit(void) 135 { 136 rechan = chancreate(sizeof(Inst*), 0); 137 chansetname(rechan, "rechan"); 138 lastregexp = runemalloc(1); 139 } 140 141 void 142 regerror(char *e) 143 { 144 lastregexp[0] = 0; 145 warning(nil, "regexp: %s\n", e); 146 sendp(rechan, nil); 147 threadexits(nil); 148 } 149 150 Inst * 151 newinst(int t) 152 { 153 if(progp >= &program[NPROG]) 154 regerror("expression too long"); 155 progp->type = t; 156 progp->u1.left = nil; 157 progp->u.right = nil; 158 return progp++; 159 } 160 161 void 162 realcompile(void *arg) 163 { 164 int token; 165 Rune *s; 166 167 threadsetname("regcomp"); 168 s = arg; 169 startlex(s); 170 atorp = atorstack; 171 andp = andstack; 172 subidp = subidstack; 173 cursubid = 0; 174 lastwasand = FALSE; 175 /* Start with a low priority operator to prime parser */ 176 pushator(START-1); 177 while((token=lex()) != END){ 178 if((token&ISATOR) == OPERATOR) 179 operator(token); 180 else 181 operand(token); 182 } 183 /* Close with a low priority operator */ 184 evaluntil(START); 185 /* Force END */ 186 operand(END); 187 evaluntil(START); 188 if(nbra) 189 regerror("unmatched `('"); 190 --andp; /* points to first and only operand */ 191 sendp(rechan, andp->first); 192 threadexits(nil); 193 } 194 195 /* r is null terminated */ 196 int 197 rxcompile(Rune *r) 198 { 199 int i, nr; 200 Inst *oprogp; 201 202 nr = runestrlen(r)+1; 203 if(runeeq(lastregexp, runestrlen(lastregexp)+1, r, nr)==TRUE) 204 return TRUE; 205 lastregexp[0] = 0; 206 for(i=0; i<nclass; i++) 207 free(class[i]); 208 nclass = 0; 209 progp = program; 210 backwards = FALSE; 211 bstartinst = nil; 212 threadcreate(realcompile, r, STACK); 213 startinst = recvp(rechan); 214 if(startinst == nil) 215 return FALSE; 216 optimize(program); 217 oprogp = progp; 218 backwards = TRUE; 219 threadcreate(realcompile, r, STACK); 220 bstartinst = recvp(rechan); 221 if(bstartinst == nil) 222 return FALSE; 223 optimize(oprogp); 224 lastregexp = runerealloc(lastregexp, nr); 225 runemove(lastregexp, r, nr); 226 return TRUE; 227 } 228 229 void 230 operand(int t) 231 { 232 Inst *i; 233 if(lastwasand) 234 operator(CAT); /* catenate is implicit */ 235 i = newinst(t); 236 if(t == CCLASS){ 237 if(negateclass) 238 i->type = NCCLASS; /* UGH */ 239 i->u.class = nclass-1; /* UGH */ 240 } 241 pushand(i, i); 242 lastwasand = TRUE; 243 } 244 245 void 246 operator(int t) 247 { 248 if(t==RBRA && --nbra<0) 249 regerror("unmatched `)'"); 250 if(t==LBRA){ 251 cursubid++; /* silently ignored */ 252 nbra++; 253 if(lastwasand) 254 operator(CAT); 255 }else 256 evaluntil(t); 257 if(t!=RBRA) 258 pushator(t); 259 lastwasand = FALSE; 260 if(t==STAR || t==QUEST || t==PLUS || t==RBRA) 261 lastwasand = TRUE; /* these look like operands */ 262 } 263 264 void 265 pushand(Inst *f, Inst *l) 266 { 267 if(andp >= &andstack[NSTACK]) 268 error("operand stack overflow"); 269 andp->first = f; 270 andp->last = l; 271 andp++; 272 } 273 274 void 275 pushator(int t) 276 { 277 if(atorp >= &atorstack[NSTACK]) 278 error("operator stack overflow"); 279 *atorp++=t; 280 if(cursubid >= NRange) 281 *subidp++= -1; 282 else 283 *subidp++=cursubid; 284 } 285 286 Node * 287 popand(int op) 288 { 289 char buf[64]; 290 291 if(andp <= &andstack[0]) 292 if(op){ 293 sprint(buf, "missing operand for %c", op); 294 regerror(buf); 295 }else 296 regerror("malformed regexp"); 297 return --andp; 298 } 299 300 int 301 popator() 302 { 303 if(atorp <= &atorstack[0]) 304 error("operator stack underflow"); 305 --subidp; 306 return *--atorp; 307 } 308 309 void 310 evaluntil(int pri) 311 { 312 Node *op1, *op2, *t; 313 Inst *inst1, *inst2; 314 315 while(pri==RBRA || atorp[-1]>=pri){ 316 switch(popator()){ 317 case LBRA: 318 op1 = popand('('); 319 inst2 = newinst(RBRA); 320 inst2->u.subid = *subidp; 321 op1->last->u1.next = inst2; 322 inst1 = newinst(LBRA); 323 inst1->u.subid = *subidp; 324 inst1->u1.next = op1->first; 325 pushand(inst1, inst2); 326 return; /* must have been RBRA */ 327 default: 328 error("unknown regexp operator"); 329 break; 330 case OR: 331 op2 = popand('|'); 332 op1 = popand('|'); 333 inst2 = newinst(NOP); 334 op2->last->u1.next = inst2; 335 op1->last->u1.next = inst2; 336 inst1 = newinst(OR); 337 inst1->u.right = op1->first; 338 inst1->u1.left = op2->first; 339 pushand(inst1, inst2); 340 break; 341 case CAT: 342 op2 = popand(0); 343 op1 = popand(0); 344 if(backwards && op2->first->type!=END){ 345 t = op1; 346 op1 = op2; 347 op2 = t; 348 } 349 op1->last->u1.next = op2->first; 350 pushand(op1->first, op2->last); 351 break; 352 case STAR: 353 op2 = popand('*'); 354 inst1 = newinst(OR); 355 op2->last->u1.next = inst1; 356 inst1->u.right = op2->first; 357 pushand(inst1, inst1); 358 break; 359 case PLUS: 360 op2 = popand('+'); 361 inst1 = newinst(OR); 362 op2->last->u1.next = inst1; 363 inst1->u.right = op2->first; 364 pushand(op2->first, inst1); 365 break; 366 case QUEST: 367 op2 = popand('?'); 368 inst1 = newinst(OR); 369 inst2 = newinst(NOP); 370 inst1->u1.left = inst2; 371 inst1->u.right = op2->first; 372 op2->last->u1.next = inst2; 373 pushand(inst1, inst2); 374 break; 375 } 376 } 377 } 378 379 380 void 381 optimize(Inst *start) 382 { 383 Inst *inst, *target; 384 385 for(inst=start; inst->type!=END; inst++){ 386 target = inst->u1.next; 387 while(target->type == NOP) 388 target = target->u1.next; 389 inst->u1.next = target; 390 } 391 } 392 393 void 394 startlex(Rune *s) 395 { 396 exprp = s; 397 nbra = 0; 398 } 399 400 401 int 402 lex(void){ 403 int c; 404 405 c = *exprp++; 406 switch(c){ 407 case '\\': 408 if(*exprp) 409 if((c= *exprp++)=='n') 410 c='\n'; 411 break; 412 case 0: 413 c = END; 414 --exprp; /* In case we come here again */ 415 break; 416 case '*': 417 c = STAR; 418 break; 419 case '?': 420 c = QUEST; 421 break; 422 case '+': 423 c = PLUS; 424 break; 425 case '|': 426 c = OR; 427 break; 428 case '.': 429 c = ANY; 430 break; 431 case '(': 432 c = LBRA; 433 break; 434 case ')': 435 c = RBRA; 436 break; 437 case '^': 438 c = BOL; 439 break; 440 case '$': 441 c = EOL; 442 break; 443 case '[': 444 c = CCLASS; 445 bldcclass(); 446 break; 447 } 448 return c; 449 } 450 451 int 452 nextrec(void) 453 { 454 if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0)) 455 regerror("malformed `[]'"); 456 if(exprp[0] == '\\'){ 457 exprp++; 458 if(*exprp=='n'){ 459 exprp++; 460 return '\n'; 461 } 462 return *exprp++|QUOTED; 463 } 464 return *exprp++; 465 } 466 467 void 468 bldcclass(void) 469 { 470 int c1, c2, n, na; 471 Rune *classp; 472 473 classp = runemalloc(DCLASS); 474 n = 0; 475 na = DCLASS; 476 /* we have already seen the '[' */ 477 if(*exprp == '^'){ 478 classp[n++] = '\n'; /* don't match newline in negate case */ 479 negateclass = TRUE; 480 exprp++; 481 }else 482 negateclass = FALSE; 483 while((c1 = nextrec()) != ']'){ 484 if(c1 == '-'){ 485 Error: 486 free(classp); 487 regerror("malformed `[]'"); 488 } 489 if(n+4 >= na){ /* 3 runes plus NUL */ 490 na += DCLASS; 491 classp = runerealloc(classp, na); 492 } 493 if(*exprp == '-'){ 494 exprp++; /* eat '-' */ 495 if((c2 = nextrec()) == ']') 496 goto Error; 497 classp[n+0] = Runemax; 498 classp[n+1] = c1; 499 classp[n+2] = c2; 500 n += 3; 501 }else 502 classp[n++] = c1 & ~QUOTED; 503 } 504 classp[n] = 0; 505 if(nclass == Nclass){ 506 Nclass += DCLASS; 507 class = realloc(class, Nclass*sizeof(Rune*)); 508 } 509 class[nclass++] = classp; 510 } 511 512 int 513 classmatch(int classno, int c, int negate) 514 { 515 Rune *p; 516 517 p = class[classno]; 518 while(*p){ 519 if(*p == Runemax){ 520 if(p[1]<=c && c<=p[2]) 521 return !negate; 522 p += 3; 523 }else if(*p++ == c) 524 return !negate; 525 } 526 return negate; 527 } 528 529 /* 530 * Note optimization in addinst: 531 * *l must be pending when addinst called; if *l has been looked 532 * at already, the optimization is a bug. 533 */ 534 int 535 addinst(Ilist *l, Inst *inst, Rangeset *sep) 536 { 537 Ilist *p; 538 539 for(p = l; p->inst; p++){ 540 if(p->inst==inst){ 541 if((sep)->r[0].q0 < p->se.r[0].q0) 542 p->se= *sep; /* this would be bug */ 543 return 0; /* It's already there */ 544 } 545 } 546 p->inst = inst; 547 p->se= *sep; 548 (p+1)->inst = nil; 549 return 1; 550 } 551 552 int 553 rxnull(void) 554 { 555 return startinst==nil || bstartinst==nil; 556 } 557 558 /* either t!=nil or r!=nil, and we match the string in the appropriate place */ 559 int 560 rxexecute(Text *t, Rune *r, uint startp, uint eof, Rangeset *rp) 561 { 562 int flag; 563 Inst *inst; 564 Ilist *tlp; 565 uint p; 566 int nnl, ntl; 567 int nc, c; 568 int wrapped; 569 int startchar; 570 571 flag = 0; 572 p = startp; 573 startchar = 0; 574 wrapped = 0; 575 nnl = 0; 576 if(startinst->type<OPERATOR) 577 startchar = startinst->type; 578 list[0][0].inst = list[1][0].inst = nil; 579 sel.r[0].q0 = -1; 580 if(t != nil) 581 nc = t->file->b.nc; 582 else 583 nc = runestrlen(r); 584 /* Execute machine once for each character */ 585 for(;;p++){ 586 doloop: 587 if(p>=eof || p>=nc){ 588 switch(wrapped++){ 589 case 0: /* let loop run one more click */ 590 case 2: 591 break; 592 case 1: /* expired; wrap to beginning */ 593 if(sel.r[0].q0>=0 || eof!=Infinity) 594 goto Return; 595 list[0][0].inst = list[1][0].inst = nil; 596 p = 0; 597 goto doloop; 598 default: 599 goto Return; 600 } 601 c = 0; 602 }else{ 603 if(((wrapped && p>=startp) || sel.r[0].q0>0) && nnl==0) 604 break; 605 if(t != nil) 606 c = textreadc(t, p); 607 else 608 c = r[p]; 609 } 610 /* fast check for first char */ 611 if(startchar && nnl==0 && c!=startchar) 612 continue; 613 tl = list[flag]; 614 nl = list[flag^=1]; 615 nl->inst = nil; 616 ntl = nnl; 617 nnl = 0; 618 if(sel.r[0].q0<0 && (!wrapped || p<startp || startp==eof)){ 619 /* Add first instruction to this list */ 620 sempty.r[0].q0 = p; 621 if(addinst(tl, startinst, &sempty)) 622 if(++ntl >= NLIST){ 623 Overflow: 624 warning(nil, "regexp list overflow\n"); 625 sel.r[0].q0 = -1; 626 goto Return; 627 } 628 } 629 /* Execute machine until this list is empty */ 630 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */ 631 Switchstmt: 632 switch(inst->type){ 633 default: /* regular character */ 634 if(inst->type==c){ 635 Addinst: 636 if(addinst(nl, inst->u1.next, &tlp->se)) 637 if(++nnl >= NLIST) 638 goto Overflow; 639 } 640 break; 641 case LBRA: 642 if(inst->u.subid>=0) 643 tlp->se.r[inst->u.subid].q0 = p; 644 inst = inst->u1.next; 645 goto Switchstmt; 646 case RBRA: 647 if(inst->u.subid>=0) 648 tlp->se.r[inst->u.subid].q1 = p; 649 inst = inst->u1.next; 650 goto Switchstmt; 651 case ANY: 652 if(c!='\n') 653 goto Addinst; 654 break; 655 case BOL: 656 if(p==0 || (t!=nil && textreadc(t, p-1)=='\n') || (r!=nil && r[p-1]=='\n')){ 657 Step: 658 inst = inst->u1.next; 659 goto Switchstmt; 660 } 661 break; 662 case EOL: 663 if(c == '\n') 664 goto Step; 665 break; 666 case CCLASS: 667 if(c>=0 && classmatch(inst->u.class, c, 0)) 668 goto Addinst; 669 break; 670 case NCCLASS: 671 if(c>=0 && classmatch(inst->u.class, c, 1)) 672 goto Addinst; 673 break; 674 case OR: 675 /* evaluate right choice later */ 676 if(addinst(tlp, inst->u.right, &tlp->se)) 677 if(++ntl >= NLIST) 678 goto Overflow; 679 /* efficiency: advance and re-evaluate */ 680 inst = inst->u1.left; 681 goto Switchstmt; 682 case END: /* Match! */ 683 tlp->se.r[0].q1 = p; 684 newmatch(&tlp->se); 685 break; 686 } 687 } 688 } 689 Return: 690 *rp = sel; 691 return sel.r[0].q0 >= 0; 692 } 693 694 void 695 newmatch(Rangeset *sp) 696 { 697 if(sel.r[0].q0<0 || sp->r[0].q0<sel.r[0].q0 || 698 (sp->r[0].q0==sel.r[0].q0 && sp->r[0].q1>sel.r[0].q1)) 699 sel = *sp; 700 } 701 702 int 703 rxbexecute(Text *t, uint startp, Rangeset *rp) 704 { 705 int flag; 706 Inst *inst; 707 Ilist *tlp; 708 int p; 709 int nnl, ntl; 710 int c; 711 int wrapped; 712 int startchar; 713 714 flag = 0; 715 nnl = 0; 716 wrapped = 0; 717 p = startp; 718 startchar = 0; 719 if(bstartinst->type<OPERATOR) 720 startchar = bstartinst->type; 721 list[0][0].inst = list[1][0].inst = nil; 722 sel.r[0].q0= -1; 723 /* Execute machine once for each character, including terminal NUL */ 724 for(;;--p){ 725 doloop: 726 if(p <= 0){ 727 switch(wrapped++){ 728 case 0: /* let loop run one more click */ 729 case 2: 730 break; 731 case 1: /* expired; wrap to end */ 732 if(sel.r[0].q0>=0) 733 goto Return; 734 list[0][0].inst = list[1][0].inst = nil; 735 p = t->file->b.nc; 736 goto doloop; 737 case 3: 738 default: 739 goto Return; 740 } 741 c = 0; 742 }else{ 743 if(((wrapped && p<=startp) || sel.r[0].q0>0) && nnl==0) 744 break; 745 c = textreadc(t, p-1); 746 } 747 /* fast check for first char */ 748 if(startchar && nnl==0 && c!=startchar) 749 continue; 750 tl = list[flag]; 751 nl = list[flag^=1]; 752 nl->inst = nil; 753 ntl = nnl; 754 nnl = 0; 755 if(sel.r[0].q0<0 && (!wrapped || p>startp)){ 756 /* Add first instruction to this list */ 757 /* the minus is so the optimizations in addinst work */ 758 sempty.r[0].q0 = -p; 759 if(addinst(tl, bstartinst, &sempty)) 760 if(++ntl >= NLIST){ 761 Overflow: 762 warning(nil, "regexp list overflow\n"); 763 sel.r[0].q0 = -1; 764 goto Return; 765 } 766 } 767 /* Execute machine until this list is empty */ 768 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */ 769 Switchstmt: 770 switch(inst->type){ 771 default: /* regular character */ 772 if(inst->type == c){ 773 Addinst: 774 if(addinst(nl, inst->u1.next, &tlp->se)) 775 if(++nnl >= NLIST) 776 goto Overflow; 777 } 778 break; 779 case LBRA: 780 if(inst->u.subid>=0) 781 tlp->se.r[inst->u.subid].q0 = p; 782 inst = inst->u1.next; 783 goto Switchstmt; 784 case RBRA: 785 if(inst->u.subid >= 0) 786 tlp->se.r[inst->u.subid].q1 = p; 787 inst = inst->u1.next; 788 goto Switchstmt; 789 case ANY: 790 if(c != '\n') 791 goto Addinst; 792 break; 793 case BOL: 794 if(c=='\n' || p==0){ 795 Step: 796 inst = inst->u1.next; 797 goto Switchstmt; 798 } 799 break; 800 case EOL: 801 if(p<t->file->b.nc && textreadc(t, p)=='\n') 802 goto Step; 803 break; 804 case CCLASS: 805 if(c>0 && classmatch(inst->u.class, c, 0)) 806 goto Addinst; 807 break; 808 case NCCLASS: 809 if(c>0 && classmatch(inst->u.class, c, 1)) 810 goto Addinst; 811 break; 812 case OR: 813 /* evaluate right choice later */ 814 if(addinst(tl, inst->u.right, &tlp->se)) 815 if(++ntl >= NLIST) 816 goto Overflow; 817 /* efficiency: advance and re-evaluate */ 818 inst = inst->u1.left; 819 goto Switchstmt; 820 case END: /* Match! */ 821 tlp->se.r[0].q0 = -tlp->se.r[0].q0; /* minus sign */ 822 tlp->se.r[0].q1 = p; 823 bnewmatch(&tlp->se); 824 break; 825 } 826 } 827 } 828 Return: 829 *rp = sel; 830 return sel.r[0].q0 >= 0; 831 } 832 833 void 834 bnewmatch(Rangeset *sp) 835 { 836 int i; 837 838 if(sel.r[0].q0<0 || sp->r[0].q0>sel.r[0].q1 || (sp->r[0].q0==sel.r[0].q1 && sp->r[0].q1<sel.r[0].q0)) 839 for(i = 0; i<NRange; i++){ /* note the reversal; q0<=q1 */ 840 sel.r[i].q0 = sp->r[i].q1; 841 sel.r[i].q1 = sp->r[i].q0; 842 } 843 }