dfa.c (14742B)
1 #include <u.h> 2 #include <libc.h> 3 #include <bin.h> 4 #include <bio.h> 5 #include <regexp.h> 6 #include "../../../libregexp/regcomp.h" 7 #include "dfa.h" 8 9 void rdump(Reprog*); 10 void dump(Dreprog*); 11 12 /* 13 * Standard NFA determinization and DFA minimization. 14 */ 15 typedef struct Deter Deter; 16 typedef struct Reiset Reiset; 17 18 void ddump(Deter*); 19 20 /* state of determinization */ 21 struct Deter 22 { 23 jmp_buf kaboom; /* jmp on error */ 24 25 Bin *bin; /* bin for temporary allocations */ 26 27 Reprog *p; /* program being determinized */ 28 uint ninst; /* number of instructions in program */ 29 30 Reiset *alloc; /* chain of all Reisets */ 31 Reiset **last; 32 33 Reiset **hash; /* hash of all Reisets */ 34 uint nhash; 35 36 Reiset *tmp; /* temporaries for walk */ 37 uchar *bits; 38 39 Rune *c; /* ``interesting'' characters */ 40 uint nc; 41 }; 42 43 /* set of Reinsts: perhaps we should use a bit list instead of the indices? */ 44 struct Reiset 45 { 46 uint *inst; /* indices of instructions in set */ 47 uint ninst; /* size of set */ 48 49 Reiset *next; /* d.alloc chain */ 50 Reiset *hash; /* d.hash chain */ 51 Reiset **delta; /* where to go on each interesting char */ 52 uint id; /* assigned id during minimization */ 53 uint isfinal; /* is an accepting (final) state */ 54 }; 55 56 static Reiset* 57 ralloc(Deter *d, int ninst) 58 { 59 Reiset *t; 60 61 t = binalloc(&d->bin, sizeof(Reiset)+2*d->nc*sizeof(Reiset*)+sizeof(uint)*ninst, 0); 62 if(t == nil) 63 longjmp(d->kaboom, 1); 64 t->delta = (Reiset**)&t[1]; 65 t->inst = (uint*)&t->delta[2*d->nc]; 66 return t; 67 } 68 69 /* find the canonical form a given Reiset */ 70 static Reiset* 71 findreiset(Deter *d, Reiset *s) 72 { 73 int i, szinst; 74 uint h; 75 Reiset *t; 76 77 h = 0; 78 for(i=0; i<s->ninst; i++) 79 h = h*1000003 + s->inst[i]; 80 h %= d->nhash; 81 82 szinst = s->ninst*sizeof(s->inst[0]); 83 for(t=d->hash[h]; t; t=t->hash) 84 if(t->ninst==s->ninst && memcmp(t->inst, s->inst, szinst)==0) 85 return t; 86 87 t = ralloc(d, s->ninst); 88 t->hash = d->hash[h]; 89 d->hash[h] = t; 90 91 *d->last = t; 92 d->last = &t->next; 93 t->next = 0; 94 95 t->ninst = s->ninst; 96 memmove(t->inst, s->inst, szinst); 97 98 /* delta is filled in later */ 99 100 return t; 101 } 102 103 /* convert bits to a real reiset */ 104 static Reiset* 105 bits2reiset(Deter *d, uchar *bits) 106 { 107 int k; 108 Reiset *s; 109 110 s = d->tmp; 111 s->ninst = 0; 112 for(k=0; k<d->ninst; k++) 113 if(bits[k]) 114 s->inst[s->ninst++] = k; 115 return findreiset(d, s); 116 } 117 118 /* add n to state set; if n < k, need to go around again */ 119 static int 120 add(int n, uchar *bits, int k) 121 { 122 if(bits[n]) 123 return 0; 124 bits[n] = 1; 125 return n < k; 126 } 127 128 /* update bits to follow all the empty (non-character-related) transitions possible */ 129 static void 130 followempty(Deter *d, uchar *bits, int bol, int eol) 131 { 132 int again, k; 133 Reinst *i; 134 135 do{ 136 again = 0; 137 for(i=d->p->firstinst, k=0; k < d->ninst; i++, k++){ 138 if(!bits[k]) 139 continue; 140 switch(i->type){ 141 case RBRA: 142 case LBRA: 143 again |= add(i->u2.next - d->p->firstinst, bits, k); 144 break; 145 case OR: 146 again |= add(i->u2.left - d->p->firstinst, bits, k); 147 again |= add(i->u1.right - d->p->firstinst, bits, k); 148 break; 149 case BOL: 150 if(bol) 151 again |= add(i->u2.next - d->p->firstinst, bits, k); 152 break; 153 case EOL: 154 if(eol) 155 again |= add(i->u2.next - d->p->firstinst, bits, k); 156 break; 157 } 158 } 159 }while(again); 160 161 /* 162 * Clear bits for useless transitions. We could do this during 163 * the switch above, but then we have no guarantee of termination 164 * if we get a loop in the regexp. 165 */ 166 for(i=d->p->firstinst, k=0; k < d->ninst; i++, k++){ 167 if(!bits[k]) 168 continue; 169 switch(i->type){ 170 case RBRA: 171 case LBRA: 172 case OR: 173 case BOL: 174 case EOL: 175 bits[k] = 0; 176 break; 177 } 178 } 179 } 180 181 /* 182 * Where does s go if it sees rune r? 183 * Eol is true if a $ matches the string at the position just after r. 184 */ 185 static Reiset* 186 transition(Deter *d, Reiset *s, Rune r, uint eol) 187 { 188 int k; 189 uchar *bits; 190 Reinst *i, *inst0; 191 Rune *rp, *ep; 192 193 bits = d->bits; 194 memset(bits, 0, d->ninst); 195 196 inst0 = d->p->firstinst; 197 for(k=0; k < s->ninst; k++){ 198 i = inst0 + s->inst[k]; 199 switch(i->type){ 200 default: 201 werrstr("bad reprog: got type %d", i->type); 202 longjmp(d->kaboom, 1); 203 case RBRA: 204 case LBRA: 205 case OR: 206 case BOL: 207 case EOL: 208 werrstr("internal error: got type %d", i->type); 209 longjmp(d->kaboom, 1); 210 211 case RUNE: 212 if(r == i->u1.r) 213 bits[i->u2.next - inst0] = 1; 214 break; 215 case ANY: 216 if(r != L'\n') 217 bits[i->u2.next - inst0] = 1; 218 break; 219 case ANYNL: 220 bits[i->u2.next - inst0] = 1; 221 break; 222 case NCCLASS: 223 if(r == L'\n') 224 break; 225 /* fall through */ 226 case CCLASS: 227 ep = i->u1.cp->end; 228 for(rp = i->u1.cp->spans; rp < ep; rp += 2) 229 if(rp[0] <= r && r <= rp[1]) 230 break; 231 if((rp < ep) ^! (i->type == CCLASS)) 232 bits[i->u2.next - inst0] = 1; 233 break; 234 case END: 235 break; 236 } 237 } 238 239 followempty(d, bits, r=='\n', eol); 240 return bits2reiset(d, bits); 241 } 242 243 static int 244 countinst(Reprog *pp) 245 { 246 int n; 247 Reinst *l; 248 249 n = 0; 250 l = pp->firstinst; 251 while(l++->type) 252 n++; 253 return n; 254 } 255 256 static void 257 set(Deter *d, u32int **tab, Rune r) 258 { 259 u32int *u; 260 261 if((u = tab[r/4096]) == nil){ 262 u = binalloc(&d->bin, 4096/8, 1); 263 if(u == nil) 264 longjmp(d->kaboom, 1); 265 tab[r/4096] = u; 266 } 267 u[(r%4096)/32] |= 1<<(r%32); 268 } 269 270 /* 271 * Compute the list of important characters. 272 * Other characters behave like the ones that surround them. 273 */ 274 static void 275 findchars(Deter *d, Reprog *p) 276 { 277 u32int *tab[65536/4096], *u, x; 278 Reinst *i; 279 Rune *rp, *ep; 280 int k, m, n, a; 281 282 memset(tab, 0, sizeof tab); 283 set(d, tab, 0); 284 set(d, tab, 0xFFFF); 285 for(i=p->firstinst; i->type; i++){ 286 switch(i->type){ 287 case ANY: 288 set(d, tab, L'\n'-1); 289 set(d, tab, L'\n'); 290 set(d, tab, L'\n'+1); 291 break; 292 case RUNE: 293 set(d, tab, i->u1.r-1); 294 set(d, tab, i->u1.r); 295 set(d, tab, i->u1.r+1); 296 break; 297 case NCCLASS: 298 set(d, tab, L'\n'-1); 299 set(d, tab, L'\n'); 300 set(d, tab, L'\n'+1); 301 /* fall through */ 302 case CCLASS: 303 ep = i->u1.cp->end; 304 for(rp = i->u1.cp->spans; rp < ep; rp += 2){ 305 set(d, tab, rp[0]-1); 306 set(d, tab, rp[0]); 307 set(d, tab, rp[1]); 308 set(d, tab, rp[1]+1); 309 } 310 break; 311 } 312 } 313 314 n = 0; 315 for(k=0; k<nelem(tab); k++){ 316 if((u = tab[k]) == nil) 317 continue; 318 for(m=0; m<4096/32; m++){ 319 if((x = u[m]) == 0) 320 continue; 321 for(a=0; a<32; a++) 322 if(x&(1<<a)) 323 n++; 324 } 325 } 326 327 d->c = binalloc(&d->bin, (n+1)*sizeof(Rune), 0); 328 if(d->c == 0) 329 longjmp(d->kaboom, 1); 330 d->nc = n; 331 332 n = 0; 333 for(k=0; k<nelem(tab); k++){ 334 if((u = tab[k]) == nil) 335 continue; 336 for(m=0; m<4096/32; m++){ 337 if((x = u[m]) == 0) 338 continue; 339 for(a=0; a<32; a++) 340 if(x&(1<<a)) 341 d->c[n++] = k*4096+m*32+a; 342 } 343 } 344 345 d->c[n] = 0; 346 if(n != d->nc) 347 abort(); 348 } 349 350 /* 351 * convert the Deter and Reisets into a Dreprog. 352 * if dp and c are nil, just return the count of Drecases needed. 353 */ 354 static int 355 buildprog(Deter *d, Reiset **id2set, int nid, Dreprog *dp, Drecase *c) 356 { 357 int i, j, id, n, nn; 358 Dreinst *di; 359 Reiset *s; 360 361 nn = 0; 362 di = 0; 363 for(i=0; i<nid; i++){ 364 s = id2set[i]; 365 if(c){ 366 di = &dp->inst[i]; 367 di->isfinal = s->isfinal; 368 } 369 n = 0; 370 id = -1; 371 for(j=0; j<2*d->nc; j++){ 372 if(s->delta[j]->id != id){ 373 id = s->delta[j]->id; 374 if(c){ 375 c[n].start = ((j/d->nc)<<16) | d->c[j%d->nc]; 376 c[n].next = &dp->inst[id]; 377 } 378 n++; 379 } 380 } 381 if(c){ 382 if(n == 1 && c[0].next == di) 383 di->isloop = 1; 384 di->c = c; 385 di->nc = n; 386 c += n; 387 } 388 nn += n; 389 } 390 return nn; 391 } 392 393 Dreprog* 394 dregcvt(Reprog *p) 395 { 396 uchar *bits; 397 uint again, n, nid, id; 398 Deter d; 399 Reiset **id2set, *s, *t, *start[4]; 400 Dreprog *dp; 401 Drecase *c; 402 403 memset(&d, 0, sizeof d); 404 405 if(setjmp(d.kaboom)){ 406 binfree(&d.bin); 407 return nil; 408 } 409 410 d.p = p; 411 d.ninst = countinst(p); 412 413 d.last = &d.alloc; 414 415 n = d.ninst; 416 /* round up to power of two; this loop is the least of our efficiency problems */ 417 while(n&(n-1)) 418 n++; 419 d.nhash = n; 420 d.hash = binalloc(&d.bin, d.nhash*sizeof(Reinst*), 1); 421 422 /* get list of important runes */ 423 findchars(&d, p); 424 425 #ifdef DUMP 426 print("relevant chars are: «%S»\n", d.c+1); 427 #endif 428 429 d.bits = bits = binalloc(&d.bin, d.ninst, 0); 430 d.tmp = ralloc(&d, d.ninst); 431 432 /* 433 * Convert to DFA 434 */ 435 436 /* 4 start states, depending on initial bol, eol */ 437 for(n=0; n<4; n++){ 438 memset(bits, 0, d.ninst); 439 bits[p->startinst - p->firstinst] = 1; 440 followempty(&d, bits, n&1, n&2); 441 start[n] = bits2reiset(&d, bits); 442 } 443 444 /* explore the reiset space */ 445 for(s=d.alloc; s; s=s->next) 446 for(n=0; n<2*d.nc; n++) 447 s->delta[n] = transition(&d, s, d.c[n%d.nc], n/d.nc); 448 449 #ifdef DUMP 450 nid = 0; 451 for(s=d.alloc; s; s=s->next) 452 s->id = nid++; 453 ddump(&d); 454 #endif 455 456 /* 457 * Minimize. 458 */ 459 460 /* first class division is final or not */ 461 for(s=d.alloc; s; s=s->next){ 462 s->isfinal = 0; 463 for(n=0; n<s->ninst; n++) 464 if(p->firstinst[s->inst[n]].type == END) 465 s->isfinal = 1; 466 s->id = s->isfinal; 467 } 468 469 /* divide states with different transition tables in id space */ 470 nid = 2; 471 do{ 472 again = 0; 473 for(s=d.alloc; s; s=s->next){ 474 id = -1; 475 for(t=s->next; t; t=t->next){ 476 if(s->id != t->id) 477 continue; 478 for(n=0; n<2*d.nc; n++){ 479 /* until we finish the for(t) loop, s->id and id are same */ 480 if((s->delta[n]->id == t->delta[n]->id) 481 || (s->delta[n]->id == s->id && t->delta[n]->id == id) 482 || (s->delta[n]->id == id && t->delta[n]->id == s->id)) 483 continue; 484 break; 485 } 486 if(n == 2*d.nc) 487 continue; 488 if(id == -1) 489 id = nid++; 490 t->id = id; 491 again = 1; 492 } 493 } 494 }while(again); 495 496 #ifdef DUMP 497 ddump(&d); 498 #endif 499 500 /* build dreprog */ 501 id2set = binalloc(&d.bin, nid*sizeof(Reiset*), 1); 502 if(id2set == nil) 503 longjmp(d.kaboom, 1); 504 for(s=d.alloc; s; s=s->next) 505 id2set[s->id] = s; 506 507 n = buildprog(&d, id2set, nid, nil, nil); 508 dp = mallocz(sizeof(Dreprog)+nid*sizeof(Dreinst)+n*sizeof(Drecase), 1); 509 if(dp == nil) 510 longjmp(d.kaboom, 1); 511 c = (Drecase*)&dp->inst[nid]; 512 buildprog(&d, id2set, nid, dp, c); 513 514 for(n=0; n<4; n++) 515 dp->start[n] = &dp->inst[start[n]->id]; 516 dp->ninst = nid; 517 518 binfree(&d.bin); 519 return dp; 520 } 521 522 int 523 dregexec(Dreprog *p, char *s, int bol) 524 { 525 Rune r; 526 ulong rr; 527 Dreinst *i; 528 Drecase *c, *ec; 529 int best, n; 530 char *os; 531 532 i = p->start[(bol ? 1 : 0) | (s[1]=='\n' ? 2 : 0)]; 533 best = -1; 534 os = s; 535 for(; *s; s+=n){ 536 if(i->isfinal) 537 best = s - os; 538 if(i->isloop){ 539 if(i->isfinal) 540 return strlen(os); 541 else 542 return best; 543 } 544 if((*s&0xFF) < Runeself){ 545 r = *s; 546 n = 1; 547 }else 548 n = chartorune(&r, s); 549 c = i->c; 550 ec = c+i->nc; 551 rr = r; 552 if(s[n] == '\n' || s[n] == '\0') 553 rr |= 0x10000; 554 for(; c<ec; c++){ 555 if(c->start > rr){ 556 i = c[-1].next; 557 goto Out; 558 } 559 } 560 i = ec[-1].next; 561 Out:; 562 } 563 if(i->isfinal) 564 best = s - os; 565 return best; 566 } 567 568 569 #ifdef DUMP 570 void 571 ddump(Deter *d) 572 { 573 int i, id; 574 Reiset *s; 575 576 for(s=d->alloc; s; s=s->next){ 577 print("%d ", s->id); 578 id = -1; 579 for(i=0; i<2*d->nc; i++){ 580 if(id != s->delta[i]->id){ 581 if(i==0) 582 print(" ["); 583 else if(i/d->nc) 584 print(" [%C$", d->c[i%d->nc]); 585 else 586 print(" [%C", d->c[i%d->nc]); 587 print(" %d]", s->delta[i]->id); 588 id = s->delta[i]->id; 589 } 590 } 591 print("\n"); 592 } 593 } 594 595 void 596 rdump(Reprog *pp) 597 { 598 Reinst *l; 599 Rune *p; 600 601 l = pp->firstinst; 602 do{ 603 print("%ld:\t0%o\t%ld\t%ld", l-pp->firstinst, l->type, 604 l->left-pp->firstinst, l->right-pp->firstinst); 605 if(l->type == RUNE) 606 print("\t%C\n", l->r); 607 else if(l->type == CCLASS || l->type == NCCLASS){ 608 print("\t["); 609 if(l->type == NCCLASS) 610 print("^"); 611 for(p = l->cp->spans; p < l->cp->end; p += 2) 612 if(p[0] == p[1]) 613 print("%C", p[0]); 614 else 615 print("%C-%C", p[0], p[1]); 616 print("]\n"); 617 } else 618 print("\n"); 619 }while(l++->type); 620 } 621 622 void 623 dump(Dreprog *pp) 624 { 625 int i, j; 626 Dreinst *l; 627 628 print("start %ld %ld %ld %ld\n", 629 pp->start[0]-pp->inst, 630 pp->start[1]-pp->inst, 631 pp->start[2]-pp->inst, 632 pp->start[3]-pp->inst); 633 634 for(i=0; i<pp->ninst; i++){ 635 l = &pp->inst[i]; 636 print("%d:", i); 637 for(j=0; j<l->nc; j++){ 638 print(" ["); 639 if(j == 0) 640 if(l->c[j].start != 1) 641 abort(); 642 if(j != 0) 643 print("%C%s", l->c[j].start&0xFFFF, (l->c[j].start&0x10000) ? "$" : ""); 644 print("-"); 645 if(j != l->nc-1) 646 print("%C%s", (l->c[j+1].start&0xFFFF)-1, (l->c[j+1].start&0x10000) ? "$" : ""); 647 print("] %ld", l->c[j].next - pp->inst); 648 } 649 if(l->isfinal) 650 print(" final"); 651 if(l->isloop) 652 print(" loop"); 653 print("\n"); 654 } 655 } 656 657 658 void 659 main(int argc, char **argv) 660 { 661 int i; 662 Reprog *p; 663 Dreprog *dp; 664 665 i = 1; 666 p = regcomp(argv[i]); 667 if(p == 0){ 668 print("=== %s: bad regexp\n", argv[i]); 669 } 670 /* print("=== %s\n", argv[i]); */ 671 /* rdump(p); */ 672 dp = dregcvt(p); 673 print("=== dfa\n"); 674 dump(dp); 675 676 for(i=2; i<argc; i++) 677 print("match %d\n", dregexec(dp, argv[i], 0)); 678 exits(0); 679 } 680 #endif 681 682 void 683 Bprintdfa(Biobuf *b, Dreprog *p) 684 { 685 int i, j, nc; 686 687 Bprint(b, "# dreprog\n"); 688 nc = 0; 689 for(i=0; i<p->ninst; i++) 690 nc += p->inst[i].nc; 691 Bprint(b, "%d %d %ld %ld %ld %ld\n", p->ninst, nc, 692 p->start[0]-p->inst, p->start[1]-p->inst, 693 p->start[2]-p->inst, p->start[3]-p->inst); 694 for(i=0; i<p->ninst; i++){ 695 Bprint(b, "%d %d %d", p->inst[i].isfinal, p->inst[i].isloop, p->inst[i].nc); 696 for(j=0; j<p->inst[i].nc; j++) 697 Bprint(b, " %d %ld", p->inst[i].c[j].start, p->inst[i].c[j].next-p->inst); 698 Bprint(b, "\n"); 699 } 700 } 701 702 static char* 703 egetline(Biobuf *b, int c, jmp_buf jb) 704 { 705 char *p; 706 707 p = Brdline(b, c); 708 if(p == nil) 709 longjmp(jb, 1); 710 p[Blinelen(b)-1] = '\0'; 711 return p; 712 } 713 714 static void 715 egetc(Biobuf *b, int c, jmp_buf jb) 716 { 717 if(Bgetc(b) != c) 718 longjmp(jb, 1); 719 } 720 721 static int 722 egetnum(Biobuf *b, int want, jmp_buf jb) 723 { 724 int c; 725 int n, first; 726 727 n = 0; 728 first = 1; 729 while((c = Bgetc(b)) != Beof){ 730 if(c < '0' || c > '9'){ 731 if(want == 0){ 732 Bungetc(b); 733 c = 0; 734 } 735 if(first || c != want){ 736 werrstr("format error"); 737 longjmp(jb, 1); 738 } 739 return n; 740 } 741 n = n*10 + c - '0'; 742 first = 0; 743 } 744 werrstr("unexpected eof"); 745 longjmp(jb, 1); 746 return -1; 747 } 748 749 Dreprog* 750 Breaddfa(Biobuf *b) 751 { 752 char *s; 753 int ninst, nc; 754 jmp_buf jb; 755 Dreprog *p; 756 Drecase *c; 757 Dreinst *l; 758 int j, k; 759 760 p = nil; 761 if(setjmp(jb)){ 762 free(p); 763 return nil; 764 } 765 766 s = egetline(b, '\n', jb); 767 if(strcmp(s, "# dreprog") != 0){ 768 werrstr("format error"); 769 longjmp(jb, 1); 770 } 771 772 ninst = egetnum(b, ' ', jb); 773 nc = egetnum(b, ' ', jb); 774 775 p = mallocz(sizeof(Dreprog)+ninst*sizeof(Dreinst)+nc*sizeof(Drecase), 1); 776 if(p == nil) 777 longjmp(jb, 1); 778 c = (Drecase*)&p->inst[ninst]; 779 780 p->start[0] = &p->inst[egetnum(b, ' ', jb)]; 781 p->start[1] = &p->inst[egetnum(b, ' ', jb)]; 782 p->start[2] = &p->inst[egetnum(b, ' ', jb)]; 783 p->start[3] = &p->inst[egetnum(b, '\n', jb)]; 784 785 for(j=0; j<ninst; j++){ 786 l = &p->inst[j]; 787 l->isfinal = egetnum(b, ' ', jb); 788 l->isloop = egetnum(b, ' ', jb); 789 l->nc = egetnum(b, 0, jb); 790 l->c = c; 791 for(k=0; k<l->nc; k++){ 792 egetc(b, ' ', jb); 793 c->start = egetnum(b, ' ', jb); 794 c->next = &p->inst[egetnum(b, 0, jb)]; 795 c++; 796 } 797 egetc(b, '\n', jb); 798 } 799 return p; 800 }