regcomp.c (10053B)
1 #include "lib9.h" 2 #include "regexp9.h" 3 #include "regcomp.h" 4 5 #define TRUE 1 6 #define FALSE 0 7 8 /* 9 * Parser Information 10 */ 11 typedef 12 struct Node 13 { 14 Reinst* first; 15 Reinst* last; 16 }Node; 17 18 /* max rune ranges per character class is nelem(classp->spans)/2 */ 19 #define NCCRUNE nelem(classp->spans) 20 21 #define NSTACK 20 22 static Node andstack[NSTACK]; 23 static Node *andp; 24 static int atorstack[NSTACK]; 25 static int* atorp; 26 static int cursubid; /* id of current subexpression */ 27 static int subidstack[NSTACK]; /* parallel to atorstack */ 28 static int* subidp; 29 static int lastwasand; /* Last token was operand */ 30 static int nbra; 31 static char* exprp; /* pointer to next character in source expression */ 32 static int lexdone; 33 static int nclass; 34 static Reclass*classp; 35 static Reinst* freep; 36 static int errors; 37 static Rune yyrune; /* last lex'd rune */ 38 static Reclass*yyclassp; /* last lex'd class */ 39 40 /* predeclared crap */ 41 static void operator(int); 42 static void pushand(Reinst*, Reinst*); 43 static void pushator(int); 44 static void evaluntil(int); 45 static int bldcclass(void); 46 47 static jmp_buf regkaboom; 48 49 static void 50 rcerror(char *s) 51 { 52 errors++; 53 regerror(s); 54 longjmp(regkaboom, 1); 55 } 56 57 static Reinst* 58 newinst(int t) 59 { 60 freep->type = t; 61 freep->u2.left = 0; 62 freep->u1.right = 0; 63 return freep++; 64 } 65 66 static void 67 operand(int t) 68 { 69 Reinst *i; 70 71 if(lastwasand) 72 operator(CAT); /* catenate is implicit */ 73 i = newinst(t); 74 75 if(t == CCLASS || t == NCCLASS) 76 i->u1.cp = yyclassp; 77 if(t == RUNE) 78 i->u1.r = yyrune; 79 80 pushand(i, i); 81 lastwasand = TRUE; 82 } 83 84 static void 85 operator(int t) 86 { 87 if(t==RBRA && --nbra<0) 88 rcerror("unmatched right paren"); 89 if(t==LBRA){ 90 if(++cursubid >= NSUBEXP) 91 rcerror ("too many subexpressions"); 92 nbra++; 93 if(lastwasand) 94 operator(CAT); 95 } else 96 evaluntil(t); 97 if(t != RBRA) 98 pushator(t); 99 lastwasand = FALSE; 100 if(t==STAR || t==QUEST || t==PLUS || t==RBRA) 101 lastwasand = TRUE; /* these look like operands */ 102 } 103 104 static void 105 regerr2(char *s, int c) 106 { 107 char buf[100]; 108 char *cp = buf; 109 while(*s) 110 *cp++ = *s++; 111 *cp++ = c; 112 *cp = '\0'; 113 rcerror(buf); 114 } 115 116 static void 117 cant(char *s) 118 { 119 char buf[100]; 120 strcpy(buf, "can't happen: "); 121 strcat(buf, s); 122 rcerror(buf); 123 } 124 125 static void 126 pushand(Reinst *f, Reinst *l) 127 { 128 if(andp >= &andstack[NSTACK]) 129 cant("operand stack overflow"); 130 andp->first = f; 131 andp->last = l; 132 andp++; 133 } 134 135 static void 136 pushator(int t) 137 { 138 if(atorp >= &atorstack[NSTACK]) 139 cant("operator stack overflow"); 140 *atorp++ = t; 141 *subidp++ = cursubid; 142 } 143 144 static Node* 145 popand(int op) 146 { 147 Reinst *inst; 148 149 if(andp <= &andstack[0]){ 150 regerr2("missing operand for ", op); 151 inst = newinst(NOP); 152 pushand(inst,inst); 153 } 154 return --andp; 155 } 156 157 static int 158 popator(void) 159 { 160 if(atorp <= &atorstack[0]) 161 cant("operator stack underflow"); 162 --subidp; 163 return *--atorp; 164 } 165 166 static void 167 evaluntil(int pri) 168 { 169 Node *op1, *op2; 170 Reinst *inst1, *inst2; 171 172 while(pri==RBRA || atorp[-1]>=pri){ 173 switch(popator()){ 174 default: 175 rcerror("unknown operator in evaluntil"); 176 break; 177 case LBRA: /* must have been RBRA */ 178 op1 = popand('('); 179 inst2 = newinst(RBRA); 180 inst2->u1.subid = *subidp; 181 op1->last->u2.next = inst2; 182 inst1 = newinst(LBRA); 183 inst1->u1.subid = *subidp; 184 inst1->u2.next = op1->first; 185 pushand(inst1, inst2); 186 return; 187 case OR: 188 op2 = popand('|'); 189 op1 = popand('|'); 190 inst2 = newinst(NOP); 191 op2->last->u2.next = inst2; 192 op1->last->u2.next = inst2; 193 inst1 = newinst(OR); 194 inst1->u1.right = op1->first; 195 inst1->u2.left = op2->first; 196 pushand(inst1, inst2); 197 break; 198 case CAT: 199 op2 = popand(0); 200 op1 = popand(0); 201 op1->last->u2.next = op2->first; 202 pushand(op1->first, op2->last); 203 break; 204 case STAR: 205 op2 = popand('*'); 206 inst1 = newinst(OR); 207 op2->last->u2.next = inst1; 208 inst1->u1.right = op2->first; 209 pushand(inst1, inst1); 210 break; 211 case PLUS: 212 op2 = popand('+'); 213 inst1 = newinst(OR); 214 op2->last->u2.next = inst1; 215 inst1->u1.right = op2->first; 216 pushand(op2->first, inst1); 217 break; 218 case QUEST: 219 op2 = popand('?'); 220 inst1 = newinst(OR); 221 inst2 = newinst(NOP); 222 inst1->u2.left = inst2; 223 inst1->u1.right = op2->first; 224 op2->last->u2.next = inst2; 225 pushand(inst1, inst2); 226 break; 227 } 228 } 229 } 230 231 static Reprog* 232 optimize(Reprog *pp) 233 { 234 Reinst *inst, *target; 235 int size; 236 Reprog *npp; 237 Reclass *cl; 238 ptrdiff_t diff; 239 240 /* 241 * get rid of NOOP chains 242 */ 243 for(inst=pp->firstinst; inst->type!=END; inst++){ 244 target = inst->u2.next; 245 while(target->type == NOP) 246 target = target->u2.next; 247 inst->u2.next = target; 248 } 249 250 /* 251 * The original allocation is for an area larger than 252 * necessary. Reallocate to the actual space used 253 * and then relocate the code. 254 */ 255 size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst); 256 npp = realloc(pp, size); 257 if(npp==0 || npp==pp) 258 return pp; 259 diff = (char *)npp - (char *)pp; 260 freep = (Reinst *)((char *)freep + diff); 261 for(inst=npp->firstinst; inst<freep; inst++){ 262 switch(inst->type){ 263 case OR: 264 case STAR: 265 case PLUS: 266 case QUEST: 267 inst->u1.right = (void*)((char*)inst->u1.right + diff); 268 break; 269 case CCLASS: 270 case NCCLASS: 271 inst->u1.right = (void*)((char*)inst->u1.right + diff); 272 cl = inst->u1.cp; 273 cl->end = (void*)((char*)cl->end + diff); 274 break; 275 } 276 inst->u2.left = (void*)((char*)inst->u2.left + diff); 277 } 278 npp->startinst = (void*)((char*)npp->startinst + diff); 279 return npp; 280 } 281 282 #ifdef DEBUG 283 static void 284 dumpstack(void){ 285 Node *stk; 286 int *ip; 287 288 print("operators\n"); 289 for(ip=atorstack; ip<atorp; ip++) 290 print("0%o\n", *ip); 291 print("operands\n"); 292 for(stk=andstack; stk<andp; stk++) 293 print("0%o\t0%o\n", stk->first->type, stk->last->type); 294 } 295 296 static void 297 dump(Reprog *pp) 298 { 299 Reinst *l; 300 Rune *p; 301 302 l = pp->firstinst; 303 do{ 304 print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type, 305 l->u2.left-pp->firstinst, l->u1.right-pp->firstinst); 306 if(l->type == RUNE) 307 print("\t%C\n", l->u1.r); 308 else if(l->type == CCLASS || l->type == NCCLASS){ 309 print("\t["); 310 if(l->type == NCCLASS) 311 print("^"); 312 for(p = l->u1.cp->spans; p < l->u1.cp->end; p += 2) 313 if(p[0] == p[1]) 314 print("%C", p[0]); 315 else 316 print("%C-%C", p[0], p[1]); 317 print("]\n"); 318 } else 319 print("\n"); 320 }while(l++->type); 321 } 322 #endif 323 324 static Reclass* 325 newclass(void) 326 { 327 if(nclass >= nelem(((Reprog*)0)->class)) 328 rcerror("too many character classes; increase Reprog.class size"); 329 return &(classp[nclass++]); 330 } 331 332 static int 333 nextc(Rune *rp) 334 { 335 if(lexdone){ 336 *rp = 0; 337 return 1; 338 } 339 exprp += chartorune(rp, exprp); 340 if(*rp == '\\'){ 341 exprp += chartorune(rp, exprp); 342 return 1; 343 } 344 if(*rp == 0) 345 lexdone = 1; 346 return 0; 347 } 348 349 static int 350 lex(int literal, int dot_type) 351 { 352 int quoted; 353 354 quoted = nextc(&yyrune); 355 if(literal || quoted){ 356 if(yyrune == 0) 357 return END; 358 return RUNE; 359 } 360 361 switch(yyrune){ 362 case 0: 363 return END; 364 case '*': 365 return STAR; 366 case '?': 367 return QUEST; 368 case '+': 369 return PLUS; 370 case '|': 371 return OR; 372 case '.': 373 return dot_type; 374 case '(': 375 return LBRA; 376 case ')': 377 return RBRA; 378 case '^': 379 return BOL; 380 case '$': 381 return EOL; 382 case '[': 383 return bldcclass(); 384 } 385 return RUNE; 386 } 387 388 static int 389 bldcclass(void) 390 { 391 int type; 392 Rune r[NCCRUNE]; 393 Rune *p, *ep, *np; 394 Rune rune; 395 int quoted; 396 397 /* we have already seen the '[' */ 398 type = CCLASS; 399 yyclassp = newclass(); 400 401 /* look ahead for negation */ 402 /* SPECIAL CASE!!! negated classes don't match \n */ 403 ep = r; 404 quoted = nextc(&rune); 405 if(!quoted && rune == '^'){ 406 type = NCCLASS; 407 quoted = nextc(&rune); 408 *ep++ = '\n'; 409 *ep++ = '\n'; 410 } 411 412 /* parse class into a set of spans */ 413 while(ep < &r[NCCRUNE-1]){ 414 if(rune == 0){ 415 rcerror("malformed '[]'"); 416 return 0; 417 } 418 if(!quoted && rune == ']') 419 break; 420 if(!quoted && rune == '-'){ 421 if(ep == r){ 422 rcerror("malformed '[]'"); 423 return 0; 424 } 425 quoted = nextc(&rune); 426 if((!quoted && rune == ']') || rune == 0){ 427 rcerror("malformed '[]'"); 428 return 0; 429 } 430 *(ep-1) = rune; 431 } else { 432 *ep++ = rune; 433 *ep++ = rune; 434 } 435 quoted = nextc(&rune); 436 } 437 if(ep >= &r[NCCRUNE-1]) { 438 rcerror("char class too large; increase Reclass.spans size"); 439 return 0; 440 } 441 442 /* sort on span start */ 443 for(p = r; p < ep; p += 2){ 444 for(np = p; np < ep; np += 2) 445 if(*np < *p){ 446 rune = np[0]; 447 np[0] = p[0]; 448 p[0] = rune; 449 rune = np[1]; 450 np[1] = p[1]; 451 p[1] = rune; 452 } 453 } 454 455 /* merge spans */ 456 np = yyclassp->spans; 457 p = r; 458 if(r == ep) 459 yyclassp->end = np; 460 else { 461 np[0] = *p++; 462 np[1] = *p++; 463 for(; p < ep; p += 2) 464 /* overlapping or adjacent ranges? */ 465 if(p[0] <= np[1] + 1){ 466 if(p[1] >= np[1]) 467 np[1] = p[1]; /* coalesce */ 468 } else { 469 np += 2; 470 np[0] = p[0]; 471 np[1] = p[1]; 472 } 473 yyclassp->end = np+2; 474 } 475 476 return type; 477 } 478 479 static Reprog* 480 regcomp1(char *s, int literal, int dot_type) 481 { 482 int token; 483 Reprog *volatile pp; 484 485 /* get memory for the program */ 486 pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s)); 487 if(pp == 0){ 488 regerror("out of memory"); 489 return 0; 490 } 491 freep = pp->firstinst; 492 classp = pp->class; 493 errors = 0; 494 495 if(setjmp(regkaboom)) 496 goto out; 497 498 /* go compile the sucker */ 499 lexdone = 0; 500 exprp = s; 501 nclass = 0; 502 nbra = 0; 503 atorp = atorstack; 504 andp = andstack; 505 subidp = subidstack; 506 lastwasand = FALSE; 507 cursubid = 0; 508 509 /* Start with a low priority operator to prime parser */ 510 pushator(START-1); 511 while((token = lex(literal, dot_type)) != END){ 512 if((token&0300) == OPERATOR) 513 operator(token); 514 else 515 operand(token); 516 } 517 518 /* Close with a low priority operator */ 519 evaluntil(START); 520 521 /* Force END */ 522 operand(END); 523 evaluntil(START); 524 #ifdef DEBUG 525 dumpstack(); 526 #endif 527 if(nbra) 528 rcerror("unmatched left paren"); 529 --andp; /* points to first and only operand */ 530 pp->startinst = andp->first; 531 #ifdef DEBUG 532 dump(pp); 533 #endif 534 pp = optimize(pp); 535 #ifdef DEBUG 536 print("start: %d\n", andp->first-pp->firstinst); 537 dump(pp); 538 #endif 539 out: 540 if(errors){ 541 free(pp); 542 pp = 0; 543 } 544 return pp; 545 } 546 547 extern Reprog* 548 regcomp(char *s) 549 { 550 return regcomp1(s, 0, ANY); 551 } 552 553 extern Reprog* 554 regcomplit(char *s) 555 { 556 return regcomp1(s, 1, ANY); 557 } 558 559 extern Reprog* 560 regcompnl(char *s) 561 { 562 return regcomp1(s, 0, ANYNL); 563 }