plan9port

fork of plan9port with libvec, libstr and libsdb
Log | Files | Refs | README | LICENSE

common.c (12410B)


      1 #include <u.h>
      2 #include <libc.h>
      3 #include <bio.h>
      4 #include <regexp.h>
      5 #include "spam.h"
      6 
/* Sizes used while reading a message and hunting for the end of its header. */
enum {
	Quanta	= 8192,		/* bytes requested from Bread per pass of readmsg's loop */
	Minbody = 6000,		/* readmsg tries to have at least this much body in the buffer */
	HdrMax	= 15		/* size of endofhdr's scratch buffer for a candidate keyword */
};
     12 
typedef struct keyword Keyword;
typedef struct word Word;

/* A literal prefix to test for; tables of these end with a 0 string. */
struct word{
	char	*string;	/* text compared by isword */
	int	n;		/* number of bytes of string that isword compares */
};

/* Maps a name to an integer code; see findkey. */
struct	keyword{
	char	*string;	/* name compared by findkey */
	int	value;		/* what findkey returns for that name */
};
     25 
/*
 *	Tag prefixes that htmlchk tests (through htmlmatch) to decide
 *	that the text being converted is HTML.
 */
Word	htmlcmds[] =
{
	"html",		4,
	"!doctype html", 13,
	0,

};
     33 
/*
 *	Tag prefixes that htmlchk copies through rather than skips
 *	once it has decided the text is HTML.
 */
Word	hrefs[] =
{
	"a href=",	7,
	"a title=",	8,
	"a target=",	9,
	"base href=",	10,
	"img src=",	8,
	"img border=",	11,
	"form action=", 12,
	"!--",		3,
	0,

};
     47 
/*
 *	RFC822 header keywords that indicate a fractured header:
 *	when one of them follows a blank line, endofhdr does not treat
 *	that blank line as the end of the header and keeps scanning.
 *	All lengths must be less than HdrMax defined above.
 */
Word	hdrwords[] =
{
	"cc:",			3,
	"bcc:", 		4,
	"to:",			3,
	0,			0,

};
     60 
/*
 *	Action names recognized before the ':' on a pattern-file line.
 *	The terminating entry carries Nactions, which is therefore what
 *	findkey returns for an unknown name.
 */
Keyword	keywords[] =
{
	"header",	HoldHeader,
	"line",		SaveLine,
	"hold",		Hold,
	"dump",		Dump,
	"loff",		Lineoff,
	0,		Nactions
};
     70 
/*
 *	Per-action pattern lists, indexed by action code.  parsepats
 *	links compiled patterns into the regexps and strings members.
 *	NOTE(review): the leading string looks like a label for the
 *	action; Patterns is declared outside this file - confirm.
 */
Patterns patterns[] = {
[Dump]		{ "DUMP:", 0, 0 },
[HoldHeader]	{ "HEADER:", 0, 0 },
[Hold]		{ "HOLD:", 0, 0 },
[SaveLine]	{ "LINE:", 0, 0 },
[Lineoff]	{ "LINEOFF:", 0, 0 },
[Nactions]	{ 0, 0, 0 }
};
     79 
     80 static char*	endofhdr(char*, char*);
     81 static	int	escape(char**);
     82 static	int	extract(char*);
     83 static	int	findkey(char*);
     84 static	int	hash(int);
     85 static	int	isword(Word*, char*, int);
     86 static	void	parsealt(Biobuf*, char*, Spat**);
     87 
     88 /*
     89  *	The canonicalizer: convert input to canonical representation
     90  */
     91 char*
     92 readmsg(Biobuf *bp, int *hsize, int *bufsize)
     93 {
     94 	char *p, *buf;
     95 	int n, offset, eoh, bsize, delta;
     96 
     97 	buf = 0;
     98 	offset = 0;
     99 	if(bufsize)
    100 		*bufsize = 0;
    101 	if(hsize)
    102 		*hsize = 0;
    103 	for(;;) {
    104 		buf = Realloc(buf, offset+Quanta+1);
    105 		n = Bread(bp, buf+offset, Quanta);
    106 		if(n < 0){
    107 			free(buf);
    108 			return 0;
    109 		}
    110 		p = buf+offset;			/* start of this chunk */
    111 		offset += n;			/* end of this chunk */
    112 		buf[offset] = 0;
    113 		if(n == 0){
    114 			if(offset == 0)
    115 				return 0;
    116 			break;
    117 		}
    118 
    119 		if(hsize == 0)			/* don't process header */
    120 			break;
    121 		if(p != buf && p[-1] == '\n')	/* check for EOH across buffer split */
    122 			p--;
    123 		p = endofhdr(p, buf+offset);
    124 		if(p)
    125 			break;
    126 		if(offset >= Maxread)		/* gargantuan header - just punt*/
    127 		{
    128 			if(hsize)
    129 				*hsize = offset;
    130 			if(bufsize)
    131 				*bufsize = offset;
    132 			return buf;
    133 		}
    134 	}
    135 	eoh = p-buf;				/* End of header */
    136 	bsize = offset - eoh;			/* amount of body already read */
    137 
    138 		/* Read at least Minbody bytes of the body */
    139 	if (bsize < Minbody){
    140 		delta = Minbody-bsize;
    141 		buf = Realloc(buf, offset+delta+1);
    142 		n = Bread(bp, buf+offset, delta);
    143 		if(n > 0) {
    144 			offset += n;
    145 			buf[offset] = 0;
    146 		}
    147 	}
    148 	if(hsize)
    149 		*hsize = eoh;
    150 	if(bufsize)
    151 		*bufsize = offset;
    152 	return buf;
    153 }
    154 
    155 static	int
    156 isword(Word *wp, char *text, int len)
    157 {
    158 	for(;wp->string; wp++)
    159 		if(len >= wp->n && strncmp(text, wp->string, wp->n) == 0)
    160 			return 1;
    161 	return 0;
    162 }
    163 
    164 static char*
    165 endofhdr(char *raw, char *end)
    166 {
    167 	int i;
    168 	char *p, *q;
    169 	char buf[HdrMax];
    170 
    171 	/*
    172  	 * can't use strchr to search for newlines because
    173 	 * there may be embedded NULL's.
    174 	 */
    175 	for(p = raw; p < end; p++){
    176 		if(*p != '\n' || p[1] != '\n')
    177 			continue;
    178 		p++;
    179 		for(i = 0, q = p+1; i < sizeof(buf) && *q; q++){
    180 			buf[i++] = tolower(*q);
    181 			if(*q == ':' || *q == '\n')
    182 				break;
    183 		}
    184 		if(!isword(hdrwords, buf, i))
    185 			return p+1;
    186 	}
    187 	return 0;
    188 }
    189 
    190 static	int
    191 htmlmatch(Word *wp, char *text, char *end, int *n)
    192 {
    193 	char *cp;
    194 	int i, c, lastc;
    195 	char buf[MaxHtml];
    196 
    197 	/*
    198 	 * extract a string up to '>'
    199 	 */
    200 
    201 	i = lastc = 0;
    202 	cp = text;
    203 	while (cp < end && i < sizeof(buf)-1){
    204 		c = *cp++;
    205 		if(c == '=')
    206 			c = escape(&cp);
    207 		switch(c){
    208 		case 0:
    209 		case '\r':
    210 			continue;
    211 		case '>':
    212 			goto out;
    213 		case '\n':
    214 		case ' ':
    215 		case '\t':
    216 			if(lastc == ' ')
    217 				continue;
    218 			c = ' ';
    219 			break;
    220 		default:
    221 			c = tolower(c);
    222 			break;
    223 		}
    224 		buf[i++] = lastc = c;
    225 	}
    226 out:
    227 	buf[i] = 0;
    228 	if(n)
    229 		*n = cp-text;
    230 	return isword(wp, buf, i);
    231 }
    232 
    233 static int
    234 escape(char **msg)
    235 {
    236 	int c;
    237 	char *p;
    238 
    239 	p = *msg;
    240 	c = *p;
    241 	if(c == '\n'){
    242 		p++;
    243 		c = *p++;
    244 	} else
    245 	if(c == '2'){
    246 		c = tolower(p[1]);
    247 		if(c == 'e'){
    248 			p += 2;
    249 			c = '.';
    250 		}else
    251 		if(c == 'f'){
    252 			p += 2;
    253 			c = '/';
    254 		}else
    255 		if(c == '0'){
    256 			p += 2;
    257 			c = ' ';
    258 		}
    259 		else c = '=';
    260 	} else {
    261 		if(c == '3' && tolower(p[1]) == 'd')
    262 			p += 2;
    263 		c = '=';
    264 	}
    265 	*msg = p;
    266 	return c;
    267 }
    268 
    269 static int
    270 htmlchk(char **msg, char *end)
    271 {
    272 	int n;
    273 	char *p;
    274 
    275 	static int ishtml;
    276 
    277 	p = *msg;
    278 	if(ishtml == 0){
    279 		ishtml = htmlmatch(htmlcmds, p, end, &n);
    280 
    281 		/* If not an HTML keyword, check if it's
    282 		 * an HTML comment (<!comment>).  if so,
    283 		 * skip over it; otherwise copy it in.
    284 		 */
    285 		if(ishtml == 0 && *p != '!')	/* not comment */
    286 			return '<';		/* copy it */
    287 
    288 	} else if(htmlmatch(hrefs, p, end, &n))	/* if special HTML string  */
    289 		return '<';			/* copy it */
    290 
    291 	/*
    292 	 * this is an uninteresting HTML command; skip over it.
    293 	 */
    294 	p += n;
    295 	*msg = p+1;
    296 	return *p;
    297 }
    298 
    299 /*
    300  * decode a base 64 encode body
    301  */
    302 void
    303 conv64(char *msg, char *end, char *buf, int bufsize)
    304 {
    305 	int len, i;
    306 	char *cp;
    307 
    308 	len = end - msg;
    309 	i = (len*3)/4+1;	/* room for max chars + null */
    310 	cp = Malloc(i);
    311 	len = dec64((uchar*)cp, i, msg, len);
    312 	convert(cp, cp+len, buf, bufsize, 1);
    313 	free(cp);
    314 }
    315 
    316 int
    317 convert(char *msg, char *end, char *buf, int bufsize, int isbody)
    318 {
    319 
    320 	char *p;
    321 	int c, lastc, base64;
    322 
    323 	lastc = 0;
    324 	base64 = 0;
    325 	while(msg < end && bufsize > 0){
    326 		c = *msg++;
    327 
    328 		/*
    329 		 * In the body only, try to strip most HTML and
    330 		 * replace certain MIME escape sequences with the character
    331 		 */
    332 		if(isbody) {
    333 			do{
    334 				p = msg;
    335 				if(c == '<')
    336 					c = htmlchk(&msg, end);
    337 				if(c == '=')
    338 					c = escape(&msg);
    339 			} while(p != msg && p < end);
    340 		}
    341 		switch(c){
    342 		case 0:
    343 		case '\r':
    344 			continue;
    345 		case '\t':
    346 		case ' ':
    347 		case '\n':
    348 			if(lastc == ' ')
    349 				continue;
    350 			c = ' ';
    351 			break;
    352 		case 'C':	/* check for MIME base 64 encoding in header */
    353 		case 'c':
    354 			if(isbody == 0)
    355 			if(msg < end-32 && *msg == 'o' && msg[1] == 'n')
    356 			if(cistrncmp(msg+2, "tent-transfer-encoding: base64", 30) == 0)
    357 				base64 = 1;
    358 			c = 'c';
    359 			break;
    360 		default:
    361 			c = tolower(c);
    362 			break;
    363 		}
    364 		*buf++ = c;
    365 		lastc = c;
    366 		bufsize--;
    367 	}
    368 	*buf = 0;
    369 	return base64;
    370 }
    371 
    372 /*
    373  *	The pattern parser: build data structures from the pattern file
    374  */
    375 
    376 static int
    377 hash(int c)
    378 {
    379 	return c & 127;
    380 }
    381 
    382 static	int
    383 findkey(char *val)
    384 {
    385 	Keyword *kp;
    386 
    387 	for(kp = keywords; kp->string; kp++)
    388 		if(strcmp(val, kp->string) == 0)
    389 				break;
    390 	return kp->value;
    391 }
    392 
    393 #define	whitespace(c)	((c) == ' ' || (c) == '\t')
    394 
    395 void
    396 parsepats(Biobuf *bp)
    397 {
    398 	Pattern *p, *new;
    399 	char *cp, *qp;
    400 	int type, action, n, h;
    401 	Spat *spat;
    402 
    403 	for(;;){
    404 		cp = Brdline(bp, '\n');
    405 		if(cp == 0)
    406 			break;
    407 		cp[Blinelen(bp)-1] = 0;
    408 		while(*cp == ' ' || *cp == '\t')
    409 			cp++;
    410 		if(*cp == '#' || *cp == 0)
    411 			continue;
    412 		type = regexp;
    413 		if(*cp == '*'){
    414 			type = string;
    415 			cp++;
    416 		}
    417 		qp = strchr(cp, ':');
    418 		if(qp == 0)
    419 			continue;
    420 		*qp = 0;
    421 		if(debug)
    422 			fprint(2, "action = %s\n", cp);
    423 		action = findkey(cp);
    424 		if(action >= Nactions)
    425 			continue;
    426 		cp = qp+1;
    427 		n = extract(cp);
    428 		if(n <= 0 || *cp == 0)
    429 			continue;
    430 
    431 		qp = strstr(cp, "~~");
    432 		if(qp){
    433 			*qp = 0;
    434 			n = strlen(cp);
    435 		}
    436 		if(debug)
    437 			fprint(2, " Pattern: `%s'\n", cp);
    438 
    439 			/* Hook regexps into a chain */
    440 		if(type == regexp) {
    441 			new = Malloc(sizeof(Pattern));
    442 			new->action = action;
    443 			new->pat = regcomp(cp);
    444 			if(new->pat == 0){
    445 				free(new);
    446 				continue;
    447 			}
    448 			new->type = regexp;
    449 			new->alt = 0;
    450 			new->next = 0;
    451 
    452 			if(qp)
    453 				parsealt(bp, qp+2, &new->alt);
    454 
    455 			new->next = patterns[action].regexps;
    456 			patterns[action].regexps = new;
    457 			continue;
    458 
    459 		}
    460 			/* not a Regexp - hook strings into Pattern hash chain */
    461 		spat = Malloc(sizeof(*spat));
    462 		spat->next = 0;
    463 		spat->alt = 0;
    464 		spat->len = n;
    465 		spat->string = Malloc(n+1);
    466 		spat->c1 = cp[1];
    467 		strcpy(spat->string, cp);
    468 
    469 		if(qp)
    470 			parsealt(bp, qp+2, &spat->alt);
    471 
    472 		p = patterns[action].strings;
    473 		if(p == 0) {
    474 			p = Malloc(sizeof(Pattern));
    475 			memset(p, 0, sizeof(*p));
    476 			p->action = action;
    477 			p->type = string;
    478 			patterns[action].strings = p;
    479 		}
    480 		h = hash(*spat->string);
    481 		spat->next = p->spat[h];
    482 		p->spat[h] = spat;
    483 	}
    484 }
    485 
    486 static void
    487 parsealt(Biobuf *bp, char *cp, Spat** head)
    488 {
    489 	char *p;
    490 	Spat *alt;
    491 
    492 	while(cp){
    493 		if(*cp == 0){		/*escaped newline*/
    494 			do{
    495 				cp = Brdline(bp, '\n');
    496 				if(cp == 0)
    497 					return;
    498 				cp[Blinelen(bp)-1] = 0;
    499 			} while(extract(cp) <= 0 || *cp == 0);
    500 		}
    501 
    502 		p = cp;
    503 		cp = strstr(p, "~~");
    504 		if(cp){
    505 			*cp = 0;
    506 			cp += 2;
    507 		}
    508 		if(strlen(p)){
    509 			alt = Malloc(sizeof(*alt));
    510 			alt->string = strdup(p);
    511 			alt->next = *head;
    512 			*head = alt;
    513 		}
    514 	}
    515 }
    516 
    517 static int
    518 extract(char *cp)
    519 {
    520 	int c;
    521 	char *p, *q, *r;
    522 
    523 	p = q = r = cp;
    524 	while(whitespace(*p))
    525 		p++;
    526 	while(c = *p++){
    527 		if (c == '#')
    528 			break;
    529 		if(c == '"'){
    530 			while(*p && *p != '"'){
    531 				if(*p == '\\' && p[1] == '"')
    532 					p++;
    533 				if('A' <= *p && *p <= 'Z')
    534 					*q++ = *p++ + ('a'-'A');
    535 				else
    536 					*q++ = *p++;
    537 			}
    538 			if(*p)
    539 				p++;
    540 			r = q;		/* never back up over a quoted string */
    541 		} else {
    542 			if('A' <= c && c <= 'Z')
    543 				c += ('a'-'A');
    544 			*q++ = c;
    545 		}
    546 	}
    547 	while(q > r && whitespace(q[-1]))
    548 		q--;
    549 	*q = 0;
    550 	return q-cp;
    551 }
    552 
    553 /*
    554  *	The matching engine: compare canonical input to pattern structures
    555  */
    556 
    557 static Spat*
    558 isalt(char *message, Spat *alt)
    559 {
    560 	while(alt) {
    561 		if(*cmd)
    562 		if(message != cmd && strstr(cmd, alt->string))
    563 			break;
    564 		if(message != header+1 && strstr(header+1, alt->string))
    565 			break;
    566 		if(strstr(message, alt->string))
    567 			break;
    568 		alt = alt->next;
    569 	}
    570 	return alt;
    571 }
    572 
    573 int
    574 matchpat(Pattern *p, char *message, Resub *m)
    575 {
    576 	Spat *spat;
    577 	char *s;
    578 	int c, c1;
    579 
    580 	if(p->type == string){
    581 		c1 = *message;
    582 		for(s=message; c=c1; s++){
    583 			c1 = s[1];
    584 			for(spat=p->spat[hash(c)]; spat; spat=spat->next){
    585 				if(c1 == spat->c1)
    586 				if(memcmp(s, spat->string, spat->len) == 0)
    587 				if(!isalt(message, spat->alt)){
    588 					m->s.sp = s;
    589 					m->e.ep = s + spat->len;
    590 					return 1;
    591 				}
    592 			}
    593 		}
    594 		return 0;
    595 	}
    596 	m->s.sp = m->e.ep = 0;
    597 	if(regexec(p->pat, message, m, 1) == 0)
    598 		return 0;
    599 	if(isalt(message, p->alt))
    600 		return 0;
    601 	return 1;
    602 }
    603 
    604 
    605 void
    606 xprint(int fd, char *type, Resub *m)
    607 {
    608 	char *p, *q;
    609 	int i;
    610 
    611 	if(m->s.sp == 0 || m->e.ep == 0)
    612 		return;
    613 
    614 		/* back up approx 30 characters to whitespace */
    615 	for(p = m->s.sp, i = 0; *p && i < 30; i++, p--)
    616 			;
    617 	while(*p && *p != ' ')
    618 		p--;
    619 	p++;
    620 
    621 		/* grab about 30 more chars beyond the end of the match */
    622 	for(q = m->e.ep, i = 0; *q && i < 30; i++, q++)
    623 			;
    624 	while(*q && *q != ' ')
    625 		q++;
    626 
    627 	fprint(fd, "%s %.*s~%.*s~%.*s\n", type, (int)(m->s.sp-p), p, (int)(m->e.ep-m->s.sp), m->s.sp, (int)(q-m->e.ep), m->e.ep);
    628 }
    629 
/* Marks a byte that has no entry in the table below. */
enum {
	INVAL=	255
};

/*
 *	Table indexed by byte value: 'A'-'Z' give 0-25, 'a'-'z' give
 *	26-51, '0'-'9' give 52-61, '+' gives 62 and '/' gives 63;
 *	every other byte gives INVAL.
 *	NOTE(review): no code in the visible part of this file refers
 *	to this table - confirm whether it is still needed.
 */
static uchar t64d[256] = {
/*00 */	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*10*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*20*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL,    62, INVAL, INVAL, INVAL,    63,
/*30*/	   52,	  53,	 54,	55,    56,    57,    58,    59,
	   60,	  61, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*40*/	INVAL,    0,      1,     2,     3,     4,     5,     6,
	    7,    8,      9,    10,    11,    12,    13,    14,
/*50*/	   15,   16,     17,    18,    19,    20,    21,    22,
	   23,   24,     25, INVAL, INVAL, INVAL, INVAL, INVAL,
/*60*/	INVAL,   26,     27,    28,    29,    30,    31,    32,
	   33,   34,     35,    36,    37,    38,    39,    40,
/*70*/	   41,   42,     43,    44,    45,    46,    47,    48,
	   49,   50,     51, INVAL, INVAL, INVAL, INVAL, INVAL,
/*80*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*90*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*A0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*B0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*C0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*D0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*E0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
/*F0*/	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
	INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL
};