plan9port

fork of plan9port with libvec, libstr and libsdb
Log | Files | Refs | README | LICENSE

regcomp.c (10053B)


      1 #include "lib9.h"
      2 #include "regexp9.h"
      3 #include "regcomp.h"
      4 
      5 #define	TRUE	1
      6 #define	FALSE	0
      7 
      8 /*
      9  * Parser Information
     10  */
     11 typedef
     12 struct Node
     13 {
     14 	Reinst*	first;
     15 	Reinst*	last;
     16 }Node;
     17 
     18 /* max rune ranges per character class is nelem(classp->spans)/2 */
     19 #define NCCRUNE	nelem(classp->spans)
     20 
     21 #define	NSTACK	20
     22 static	Node	andstack[NSTACK];
     23 static	Node	*andp;
     24 static	int	atorstack[NSTACK];
     25 static	int*	atorp;
     26 static	int	cursubid;		/* id of current subexpression */
     27 static	int	subidstack[NSTACK];	/* parallel to atorstack */
     28 static	int*	subidp;
     29 static	int	lastwasand;	/* Last token was operand */
     30 static	int	nbra;
     31 static	char*	exprp;		/* pointer to next character in source expression */
     32 static	int	lexdone;
     33 static	int	nclass;
     34 static	Reclass*classp;
     35 static	Reinst*	freep;
     36 static	int	errors;
     37 static	Rune	yyrune;		/* last lex'd rune */
     38 static	Reclass*yyclassp;	/* last lex'd class */
     39 
     40 /* predeclared crap */
     41 static	void	operator(int);
     42 static	void	pushand(Reinst*, Reinst*);
     43 static	void	pushator(int);
     44 static	void	evaluntil(int);
     45 static	int	bldcclass(void);
     46 
     47 static jmp_buf regkaboom;
     48 
     49 static	void
     50 rcerror(char *s)
     51 {
     52 	errors++;
     53 	regerror(s);
     54 	longjmp(regkaboom, 1);
     55 }
     56 
     57 static	Reinst*
     58 newinst(int t)
     59 {
     60 	freep->type = t;
     61 	freep->u2.left = 0;
     62 	freep->u1.right = 0;
     63 	return freep++;
     64 }
     65 
     66 static	void
     67 operand(int t)
     68 {
     69 	Reinst *i;
     70 
     71 	if(lastwasand)
     72 		operator(CAT);	/* catenate is implicit */
     73 	i = newinst(t);
     74 
     75 	if(t == CCLASS || t == NCCLASS)
     76 		i->u1.cp = yyclassp;
     77 	if(t == RUNE)
     78 		i->u1.r = yyrune;
     79 
     80 	pushand(i, i);
     81 	lastwasand = TRUE;
     82 }
     83 
     84 static	void
     85 operator(int t)
     86 {
     87 	if(t==RBRA && --nbra<0)
     88 		rcerror("unmatched right paren");
     89 	if(t==LBRA){
     90 		if(++cursubid >= NSUBEXP)
     91 			rcerror ("too many subexpressions");
     92 		nbra++;
     93 		if(lastwasand)
     94 			operator(CAT);
     95 	} else
     96 		evaluntil(t);
     97 	if(t != RBRA)
     98 		pushator(t);
     99 	lastwasand = FALSE;
    100 	if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
    101 		lastwasand = TRUE;	/* these look like operands */
    102 }
    103 
    104 static	void
    105 regerr2(char *s, int c)
    106 {
    107 	char buf[100];
    108 	char *cp = buf;
    109 	while(*s)
    110 		*cp++ = *s++;
    111 	*cp++ = c;
    112 	*cp = '\0';
    113 	rcerror(buf);
    114 }
    115 
    116 static	void
    117 cant(char *s)
    118 {
    119 	char buf[100];
    120 	strcpy(buf, "can't happen: ");
    121 	strcat(buf, s);
    122 	rcerror(buf);
    123 }
    124 
    125 static	void
    126 pushand(Reinst *f, Reinst *l)
    127 {
    128 	if(andp >= &andstack[NSTACK])
    129 		cant("operand stack overflow");
    130 	andp->first = f;
    131 	andp->last = l;
    132 	andp++;
    133 }
    134 
    135 static	void
    136 pushator(int t)
    137 {
    138 	if(atorp >= &atorstack[NSTACK])
    139 		cant("operator stack overflow");
    140 	*atorp++ = t;
    141 	*subidp++ = cursubid;
    142 }
    143 
    144 static	Node*
    145 popand(int op)
    146 {
    147 	Reinst *inst;
    148 
    149 	if(andp <= &andstack[0]){
    150 		regerr2("missing operand for ", op);
    151 		inst = newinst(NOP);
    152 		pushand(inst,inst);
    153 	}
    154 	return --andp;
    155 }
    156 
    157 static	int
    158 popator(void)
    159 {
    160 	if(atorp <= &atorstack[0])
    161 		cant("operator stack underflow");
    162 	--subidp;
    163 	return *--atorp;
    164 }
    165 
    166 static	void
    167 evaluntil(int pri)
    168 {
    169 	Node *op1, *op2;
    170 	Reinst *inst1, *inst2;
    171 
    172 	while(pri==RBRA || atorp[-1]>=pri){
    173 		switch(popator()){
    174 		default:
    175 			rcerror("unknown operator in evaluntil");
    176 			break;
    177 		case LBRA:		/* must have been RBRA */
    178 			op1 = popand('(');
    179 			inst2 = newinst(RBRA);
    180 			inst2->u1.subid = *subidp;
    181 			op1->last->u2.next = inst2;
    182 			inst1 = newinst(LBRA);
    183 			inst1->u1.subid = *subidp;
    184 			inst1->u2.next = op1->first;
    185 			pushand(inst1, inst2);
    186 			return;
    187 		case OR:
    188 			op2 = popand('|');
    189 			op1 = popand('|');
    190 			inst2 = newinst(NOP);
    191 			op2->last->u2.next = inst2;
    192 			op1->last->u2.next = inst2;
    193 			inst1 = newinst(OR);
    194 			inst1->u1.right = op1->first;
    195 			inst1->u2.left = op2->first;
    196 			pushand(inst1, inst2);
    197 			break;
    198 		case CAT:
    199 			op2 = popand(0);
    200 			op1 = popand(0);
    201 			op1->last->u2.next = op2->first;
    202 			pushand(op1->first, op2->last);
    203 			break;
    204 		case STAR:
    205 			op2 = popand('*');
    206 			inst1 = newinst(OR);
    207 			op2->last->u2.next = inst1;
    208 			inst1->u1.right = op2->first;
    209 			pushand(inst1, inst1);
    210 			break;
    211 		case PLUS:
    212 			op2 = popand('+');
    213 			inst1 = newinst(OR);
    214 			op2->last->u2.next = inst1;
    215 			inst1->u1.right = op2->first;
    216 			pushand(op2->first, inst1);
    217 			break;
    218 		case QUEST:
    219 			op2 = popand('?');
    220 			inst1 = newinst(OR);
    221 			inst2 = newinst(NOP);
    222 			inst1->u2.left = inst2;
    223 			inst1->u1.right = op2->first;
    224 			op2->last->u2.next = inst2;
    225 			pushand(inst1, inst2);
    226 			break;
    227 		}
    228 	}
    229 }
    230 
    231 static	Reprog*
    232 optimize(Reprog *pp)
    233 {
    234 	Reinst *inst, *target;
    235 	int size;
    236 	Reprog *npp;
    237 	Reclass *cl;
    238 	ptrdiff_t diff;
    239 
    240 	/*
    241 	 *  get rid of NOOP chains
    242 	 */
    243 	for(inst=pp->firstinst; inst->type!=END; inst++){
    244 		target = inst->u2.next;
    245 		while(target->type == NOP)
    246 			target = target->u2.next;
    247 		inst->u2.next = target;
    248 	}
    249 
    250 	/*
    251 	 *  The original allocation is for an area larger than
    252 	 *  necessary.  Reallocate to the actual space used
    253 	 *  and then relocate the code.
    254 	 */
    255 	size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
    256 	npp = realloc(pp, size);
    257 	if(npp==0 || npp==pp)
    258 		return pp;
    259 	diff = (char *)npp - (char *)pp;
    260 	freep = (Reinst *)((char *)freep + diff);
    261 	for(inst=npp->firstinst; inst<freep; inst++){
    262 		switch(inst->type){
    263 		case OR:
    264 		case STAR:
    265 		case PLUS:
    266 		case QUEST:
    267 			inst->u1.right = (void*)((char*)inst->u1.right + diff);
    268 			break;
    269 		case CCLASS:
    270 		case NCCLASS:
    271 			inst->u1.right = (void*)((char*)inst->u1.right + diff);
    272 			cl = inst->u1.cp;
    273 			cl->end = (void*)((char*)cl->end + diff);
    274 			break;
    275 		}
    276 		inst->u2.left = (void*)((char*)inst->u2.left + diff);
    277 	}
    278 	npp->startinst = (void*)((char*)npp->startinst + diff);
    279 	return npp;
    280 }
    281 
    282 #ifdef	DEBUG
    283 static	void
    284 dumpstack(void){
    285 	Node *stk;
    286 	int *ip;
    287 
    288 	print("operators\n");
    289 	for(ip=atorstack; ip<atorp; ip++)
    290 		print("0%o\n", *ip);
    291 	print("operands\n");
    292 	for(stk=andstack; stk<andp; stk++)
    293 		print("0%o\t0%o\n", stk->first->type, stk->last->type);
    294 }
    295 
    296 static	void
    297 dump(Reprog *pp)
    298 {
    299 	Reinst *l;
    300 	Rune *p;
    301 
    302 	l = pp->firstinst;
    303 	do{
    304 		print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
    305 			l->u2.left-pp->firstinst, l->u1.right-pp->firstinst);
    306 		if(l->type == RUNE)
    307 			print("\t%C\n", l->u1.r);
    308 		else if(l->type == CCLASS || l->type == NCCLASS){
    309 			print("\t[");
    310 			if(l->type == NCCLASS)
    311 				print("^");
    312 			for(p = l->u1.cp->spans; p < l->u1.cp->end; p += 2)
    313 				if(p[0] == p[1])
    314 					print("%C", p[0]);
    315 				else
    316 					print("%C-%C", p[0], p[1]);
    317 			print("]\n");
    318 		} else
    319 			print("\n");
    320 	}while(l++->type);
    321 }
    322 #endif
    323 
    324 static	Reclass*
    325 newclass(void)
    326 {
    327 	if(nclass >= nelem(((Reprog*)0)->class))
    328 		rcerror("too many character classes; increase Reprog.class size");
    329 	return &(classp[nclass++]);
    330 }
    331 
    332 static	int
    333 nextc(Rune *rp)
    334 {
    335 	if(lexdone){
    336 		*rp = 0;
    337 		return 1;
    338 	}
    339 	exprp += chartorune(rp, exprp);
    340 	if(*rp == '\\'){
    341 		exprp += chartorune(rp, exprp);
    342 		return 1;
    343 	}
    344 	if(*rp == 0)
    345 		lexdone = 1;
    346 	return 0;
    347 }
    348 
    349 static	int
    350 lex(int literal, int dot_type)
    351 {
    352 	int quoted;
    353 
    354 	quoted = nextc(&yyrune);
    355 	if(literal || quoted){
    356 		if(yyrune == 0)
    357 			return END;
    358 		return RUNE;
    359 	}
    360 
    361 	switch(yyrune){
    362 	case 0:
    363 		return END;
    364 	case '*':
    365 		return STAR;
    366 	case '?':
    367 		return QUEST;
    368 	case '+':
    369 		return PLUS;
    370 	case '|':
    371 		return OR;
    372 	case '.':
    373 		return dot_type;
    374 	case '(':
    375 		return LBRA;
    376 	case ')':
    377 		return RBRA;
    378 	case '^':
    379 		return BOL;
    380 	case '$':
    381 		return EOL;
    382 	case '[':
    383 		return bldcclass();
    384 	}
    385 	return RUNE;
    386 }
    387 
    388 static int
    389 bldcclass(void)
    390 {
    391 	int type;
    392 	Rune r[NCCRUNE];
    393 	Rune *p, *ep, *np;
    394 	Rune rune;
    395 	int quoted;
    396 
    397 	/* we have already seen the '[' */
    398 	type = CCLASS;
    399 	yyclassp = newclass();
    400 
    401 	/* look ahead for negation */
    402 	/* SPECIAL CASE!!! negated classes don't match \n */
    403 	ep = r;
    404 	quoted = nextc(&rune);
    405 	if(!quoted && rune == '^'){
    406 		type = NCCLASS;
    407 		quoted = nextc(&rune);
    408 		*ep++ = '\n';
    409 		*ep++ = '\n';
    410 	}
    411 
    412 	/* parse class into a set of spans */
    413 	while(ep < &r[NCCRUNE-1]){
    414 		if(rune == 0){
    415 			rcerror("malformed '[]'");
    416 			return 0;
    417 		}
    418 		if(!quoted && rune == ']')
    419 			break;
    420 		if(!quoted && rune == '-'){
    421 			if(ep == r){
    422 				rcerror("malformed '[]'");
    423 				return 0;
    424 			}
    425 			quoted = nextc(&rune);
    426 			if((!quoted && rune == ']') || rune == 0){
    427 				rcerror("malformed '[]'");
    428 				return 0;
    429 			}
    430 			*(ep-1) = rune;
    431 		} else {
    432 			*ep++ = rune;
    433 			*ep++ = rune;
    434 		}
    435 		quoted = nextc(&rune);
    436 	}
    437 	if(ep >= &r[NCCRUNE-1]) {
    438 		rcerror("char class too large; increase Reclass.spans size");
    439 		return 0;
    440 	}
    441 
    442 	/* sort on span start */
    443 	for(p = r; p < ep; p += 2){
    444 		for(np = p; np < ep; np += 2)
    445 			if(*np < *p){
    446 				rune = np[0];
    447 				np[0] = p[0];
    448 				p[0] = rune;
    449 				rune = np[1];
    450 				np[1] = p[1];
    451 				p[1] = rune;
    452 			}
    453 	}
    454 
    455 	/* merge spans */
    456 	np = yyclassp->spans;
    457 	p = r;
    458 	if(r == ep)
    459 		yyclassp->end = np;
    460 	else {
    461 		np[0] = *p++;
    462 		np[1] = *p++;
    463 		for(; p < ep; p += 2)
    464 			/* overlapping or adjacent ranges? */
    465 			if(p[0] <= np[1] + 1){
    466 				if(p[1] >= np[1])
    467 					np[1] = p[1];	/* coalesce */
    468 			} else {
    469 				np += 2;
    470 				np[0] = p[0];
    471 				np[1] = p[1];
    472 			}
    473 		yyclassp->end = np+2;
    474 	}
    475 
    476 	return type;
    477 }
    478 
    479 static	Reprog*
    480 regcomp1(char *s, int literal, int dot_type)
    481 {
    482 	int token;
    483 	Reprog *volatile pp;
    484 
    485 	/* get memory for the program */
    486 	pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
    487 	if(pp == 0){
    488 		regerror("out of memory");
    489 		return 0;
    490 	}
    491 	freep = pp->firstinst;
    492 	classp = pp->class;
    493 	errors = 0;
    494 
    495 	if(setjmp(regkaboom))
    496 		goto out;
    497 
    498 	/* go compile the sucker */
    499 	lexdone = 0;
    500 	exprp = s;
    501 	nclass = 0;
    502 	nbra = 0;
    503 	atorp = atorstack;
    504 	andp = andstack;
    505 	subidp = subidstack;
    506 	lastwasand = FALSE;
    507 	cursubid = 0;
    508 
    509 	/* Start with a low priority operator to prime parser */
    510 	pushator(START-1);
    511 	while((token = lex(literal, dot_type)) != END){
    512 		if((token&0300) == OPERATOR)
    513 			operator(token);
    514 		else
    515 			operand(token);
    516 	}
    517 
    518 	/* Close with a low priority operator */
    519 	evaluntil(START);
    520 
    521 	/* Force END */
    522 	operand(END);
    523 	evaluntil(START);
    524 #ifdef DEBUG
    525 	dumpstack();
    526 #endif
    527 	if(nbra)
    528 		rcerror("unmatched left paren");
    529 	--andp;	/* points to first and only operand */
    530 	pp->startinst = andp->first;
    531 #ifdef DEBUG
    532 	dump(pp);
    533 #endif
    534 	pp = optimize(pp);
    535 #ifdef DEBUG
    536 	print("start: %d\n", andp->first-pp->firstinst);
    537 	dump(pp);
    538 #endif
    539 out:
    540 	if(errors){
    541 		free(pp);
    542 		pp = 0;
    543 	}
    544 	return pp;
    545 }
    546 
    547 extern	Reprog*
    548 regcomp(char *s)
    549 {
    550 	return regcomp1(s, 0, ANY);
    551 }
    552 
    553 extern	Reprog*
    554 regcomplit(char *s)
    555 {
    556 	return regcomp1(s, 1, ANY);
    557 }
    558 
    559 extern	Reprog*
    560 regcompnl(char *s)
    561 {
    562 	return regcomp1(s, 0, ANYNL);
    563 }