common.c (12410B)
1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include <regexp.h> 5 #include "spam.h" 6 7 enum { 8 Quanta = 8192, 9 Minbody = 6000, 10 HdrMax = 15 11 }; 12 13 typedef struct keyword Keyword; 14 typedef struct word Word; 15 16 struct word{ 17 char *string; 18 int n; 19 }; 20 21 struct keyword{ 22 char *string; 23 int value; 24 }; 25 26 Word htmlcmds[] = 27 { 28 "html", 4, 29 "!doctype html", 13, 30 0, 31 32 }; 33 34 Word hrefs[] = 35 { 36 "a href=", 7, 37 "a title=", 8, 38 "a target=", 9, 39 "base href=", 10, 40 "img src=", 8, 41 "img border=", 11, 42 "form action=", 12, 43 "!--", 3, 44 0, 45 46 }; 47 48 /* 49 * RFC822 header keywords to look for for fractured header. 50 * all lengths must be less than HdrMax defined above. 51 */ 52 Word hdrwords[] = 53 { 54 "cc:", 3, 55 "bcc:", 4, 56 "to:", 3, 57 0, 0, 58 59 }; 60 61 Keyword keywords[] = 62 { 63 "header", HoldHeader, 64 "line", SaveLine, 65 "hold", Hold, 66 "dump", Dump, 67 "loff", Lineoff, 68 0, Nactions 69 }; 70 71 Patterns patterns[] = { 72 [Dump] { "DUMP:", 0, 0 }, 73 [HoldHeader] { "HEADER:", 0, 0 }, 74 [Hold] { "HOLD:", 0, 0 }, 75 [SaveLine] { "LINE:", 0, 0 }, 76 [Lineoff] { "LINEOFF:", 0, 0 }, 77 [Nactions] { 0, 0, 0 } 78 }; 79 80 static char* endofhdr(char*, char*); 81 static int escape(char**); 82 static int extract(char*); 83 static int findkey(char*); 84 static int hash(int); 85 static int isword(Word*, char*, int); 86 static void parsealt(Biobuf*, char*, Spat**); 87 88 /* 89 * The canonicalizer: convert input to canonical representation 90 */ 91 char* 92 readmsg(Biobuf *bp, int *hsize, int *bufsize) 93 { 94 char *p, *buf; 95 int n, offset, eoh, bsize, delta; 96 97 buf = 0; 98 offset = 0; 99 if(bufsize) 100 *bufsize = 0; 101 if(hsize) 102 *hsize = 0; 103 for(;;) { 104 buf = Realloc(buf, offset+Quanta+1); 105 n = Bread(bp, buf+offset, Quanta); 106 if(n < 0){ 107 free(buf); 108 return 0; 109 } 110 p = buf+offset; /* start of this chunk */ 111 offset += n; /* end of this chunk */ 112 buf[offset] = 0; 113 if(n == 0){ 114 if(offset == 0) 115 return 0; 116 break; 117 } 118 119 if(hsize == 0) /* don't process header */ 120 break; 121 if(p != buf && p[-1] == '\n') /* check for EOH across buffer split */ 122 p--; 123 p = endofhdr(p, buf+offset); 124 if(p) 125 break; 126 if(offset >= Maxread) /* gargantuan header - just punt*/ 127 { 128 if(hsize) 129 *hsize = offset; 130 if(bufsize) 131 *bufsize = offset; 132 return buf; 133 } 134 } 135 eoh = p-buf; /* End of header */ 136 bsize = offset - eoh; /* amount of body already read */ 137 138 /* Read at least Minbody bytes of the body */ 139 if (bsize < Minbody){ 140 delta = Minbody-bsize; 141 buf = Realloc(buf, offset+delta+1); 142 n = Bread(bp, buf+offset, delta); 143 if(n > 0) { 144 offset += n; 145 buf[offset] = 0; 146 } 147 } 148 if(hsize) 149 *hsize = eoh; 150 if(bufsize) 151 *bufsize = offset; 152 return buf; 153 } 154 155 static int 156 isword(Word *wp, char *text, int len) 157 { 158 for(;wp->string; wp++) 159 if(len >= wp->n && strncmp(text, wp->string, wp->n) == 0) 160 return 1; 161 return 0; 162 } 163 164 static char* 165 endofhdr(char *raw, char *end) 166 { 167 int i; 168 char *p, *q; 169 char buf[HdrMax]; 170 171 /* 172 * can't use strchr to search for newlines because 173 * there may be embedded NULL's. 174 */ 175 for(p = raw; p < end; p++){ 176 if(*p != '\n' || p[1] != '\n') 177 continue; 178 p++; 179 for(i = 0, q = p+1; i < sizeof(buf) && *q; q++){ 180 buf[i++] = tolower(*q); 181 if(*q == ':' || *q == '\n') 182 break; 183 } 184 if(!isword(hdrwords, buf, i)) 185 return p+1; 186 } 187 return 0; 188 } 189 190 static int 191 htmlmatch(Word *wp, char *text, char *end, int *n) 192 { 193 char *cp; 194 int i, c, lastc; 195 char buf[MaxHtml]; 196 197 /* 198 * extract a string up to '>' 199 */ 200 201 i = lastc = 0; 202 cp = text; 203 while (cp < end && i < sizeof(buf)-1){ 204 c = *cp++; 205 if(c == '=') 206 c = escape(&cp); 207 switch(c){ 208 case 0: 209 case '\r': 210 continue; 211 case '>': 212 goto out; 213 case '\n': 214 case ' ': 215 case '\t': 216 if(lastc == ' ') 217 continue; 218 c = ' '; 219 break; 220 default: 221 c = tolower(c); 222 break; 223 } 224 buf[i++] = lastc = c; 225 } 226 out: 227 buf[i] = 0; 228 if(n) 229 *n = cp-text; 230 return isword(wp, buf, i); 231 } 232 233 static int 234 escape(char **msg) 235 { 236 int c; 237 char *p; 238 239 p = *msg; 240 c = *p; 241 if(c == '\n'){ 242 p++; 243 c = *p++; 244 } else 245 if(c == '2'){ 246 c = tolower(p[1]); 247 if(c == 'e'){ 248 p += 2; 249 c = '.'; 250 }else 251 if(c == 'f'){ 252 p += 2; 253 c = '/'; 254 }else 255 if(c == '0'){ 256 p += 2; 257 c = ' '; 258 } 259 else c = '='; 260 } else { 261 if(c == '3' && tolower(p[1]) == 'd') 262 p += 2; 263 c = '='; 264 } 265 *msg = p; 266 return c; 267 } 268 269 static int 270 htmlchk(char **msg, char *end) 271 { 272 int n; 273 char *p; 274 275 static int ishtml; 276 277 p = *msg; 278 if(ishtml == 0){ 279 ishtml = htmlmatch(htmlcmds, p, end, &n); 280 281 /* If not an HTML keyword, check if it's 282 * an HTML comment (<!comment>). if so, 283 * skip over it; otherwise copy it in. 284 */ 285 if(ishtml == 0 && *p != '!') /* not comment */ 286 return '<'; /* copy it */ 287 288 } else if(htmlmatch(hrefs, p, end, &n)) /* if special HTML string */ 289 return '<'; /* copy it */ 290 291 /* 292 * this is an uninteresting HTML command; skip over it. 293 */ 294 p += n; 295 *msg = p+1; 296 return *p; 297 } 298 299 /* 300 * decode a base 64 encode body 301 */ 302 void 303 conv64(char *msg, char *end, char *buf, int bufsize) 304 { 305 int len, i; 306 char *cp; 307 308 len = end - msg; 309 i = (len*3)/4+1; /* room for max chars + null */ 310 cp = Malloc(i); 311 len = dec64((uchar*)cp, i, msg, len); 312 convert(cp, cp+len, buf, bufsize, 1); 313 free(cp); 314 } 315 316 int 317 convert(char *msg, char *end, char *buf, int bufsize, int isbody) 318 { 319 320 char *p; 321 int c, lastc, base64; 322 323 lastc = 0; 324 base64 = 0; 325 while(msg < end && bufsize > 0){ 326 c = *msg++; 327 328 /* 329 * In the body only, try to strip most HTML and 330 * replace certain MIME escape sequences with the character 331 */ 332 if(isbody) { 333 do{ 334 p = msg; 335 if(c == '<') 336 c = htmlchk(&msg, end); 337 if(c == '=') 338 c = escape(&msg); 339 } while(p != msg && p < end); 340 } 341 switch(c){ 342 case 0: 343 case '\r': 344 continue; 345 case '\t': 346 case ' ': 347 case '\n': 348 if(lastc == ' ') 349 continue; 350 c = ' '; 351 break; 352 case 'C': /* check for MIME base 64 encoding in header */ 353 case 'c': 354 if(isbody == 0) 355 if(msg < end-32 && *msg == 'o' && msg[1] == 'n') 356 if(cistrncmp(msg+2, "tent-transfer-encoding: base64", 30) == 0) 357 base64 = 1; 358 c = 'c'; 359 break; 360 default: 361 c = tolower(c); 362 break; 363 } 364 *buf++ = c; 365 lastc = c; 366 bufsize--; 367 } 368 *buf = 0; 369 return base64; 370 } 371 372 /* 373 * The pattern parser: build data structures from the pattern file 374 */ 375 376 static int 377 hash(int c) 378 { 379 return c & 127; 380 } 381 382 static int 383 findkey(char *val) 384 { 385 Keyword *kp; 386 387 for(kp = keywords; kp->string; kp++) 388 if(strcmp(val, kp->string) == 0) 389 break; 390 return kp->value; 391 } 392 393 #define whitespace(c) ((c) == ' ' || (c) == '\t') 394 395 void 396 parsepats(Biobuf *bp) 397 { 398 Pattern *p, *new; 399 char *cp, *qp; 400 int type, action, n, h; 401 Spat *spat; 402 403 for(;;){ 404 cp = Brdline(bp, '\n'); 405 if(cp == 0) 406 break; 407 cp[Blinelen(bp)-1] = 0; 408 while(*cp == ' ' || *cp == '\t') 409 cp++; 410 if(*cp == '#' || *cp == 0) 411 continue; 412 type = regexp; 413 if(*cp == '*'){ 414 type = string; 415 cp++; 416 } 417 qp = strchr(cp, ':'); 418 if(qp == 0) 419 continue; 420 *qp = 0; 421 if(debug) 422 fprint(2, "action = %s\n", cp); 423 action = findkey(cp); 424 if(action >= Nactions) 425 continue; 426 cp = qp+1; 427 n = extract(cp); 428 if(n <= 0 || *cp == 0) 429 continue; 430 431 qp = strstr(cp, "~~"); 432 if(qp){ 433 *qp = 0; 434 n = strlen(cp); 435 } 436 if(debug) 437 fprint(2, " Pattern: `%s'\n", cp); 438 439 /* Hook regexps into a chain */ 440 if(type == regexp) { 441 new = Malloc(sizeof(Pattern)); 442 new->action = action; 443 new->pat = regcomp(cp); 444 if(new->pat == 0){ 445 free(new); 446 continue; 447 } 448 new->type = regexp; 449 new->alt = 0; 450 new->next = 0; 451 452 if(qp) 453 parsealt(bp, qp+2, &new->alt); 454 455 new->next = patterns[action].regexps; 456 patterns[action].regexps = new; 457 continue; 458 459 } 460 /* not a Regexp - hook strings into Pattern hash chain */ 461 spat = Malloc(sizeof(*spat)); 462 spat->next = 0; 463 spat->alt = 0; 464 spat->len = n; 465 spat->string = Malloc(n+1); 466 spat->c1 = cp[1]; 467 strcpy(spat->string, cp); 468 469 if(qp) 470 parsealt(bp, qp+2, &spat->alt); 471 472 p = patterns[action].strings; 473 if(p == 0) { 474 p = Malloc(sizeof(Pattern)); 475 memset(p, 0, sizeof(*p)); 476 p->action = action; 477 p->type = string; 478 patterns[action].strings = p; 479 } 480 h = hash(*spat->string); 481 spat->next = p->spat[h]; 482 p->spat[h] = spat; 483 } 484 } 485 486 static void 487 parsealt(Biobuf *bp, char *cp, Spat** head) 488 { 489 char *p; 490 Spat *alt; 491 492 while(cp){ 493 if(*cp == 0){ /*escaped newline*/ 494 do{ 495 cp = Brdline(bp, '\n'); 496 if(cp == 0) 497 return; 498 cp[Blinelen(bp)-1] = 0; 499 } while(extract(cp) <= 0 || *cp == 0); 500 } 501 502 p = cp; 503 cp = strstr(p, "~~"); 504 if(cp){ 505 *cp = 0; 506 cp += 2; 507 } 508 if(strlen(p)){ 509 alt = Malloc(sizeof(*alt)); 510 alt->string = strdup(p); 511 alt->next = *head; 512 *head = alt; 513 } 514 } 515 } 516 517 static int 518 extract(char *cp) 519 { 520 int c; 521 char *p, *q, *r; 522 523 p = q = r = cp; 524 while(whitespace(*p)) 525 p++; 526 while(c = *p++){ 527 if (c == '#') 528 break; 529 if(c == '"'){ 530 while(*p && *p != '"'){ 531 if(*p == '\\' && p[1] == '"') 532 p++; 533 if('A' <= *p && *p <= 'Z') 534 *q++ = *p++ + ('a'-'A'); 535 else 536 *q++ = *p++; 537 } 538 if(*p) 539 p++; 540 r = q; /* never back up over a quoted string */ 541 } else { 542 if('A' <= c && c <= 'Z') 543 c += ('a'-'A'); 544 *q++ = c; 545 } 546 } 547 while(q > r && whitespace(q[-1])) 548 q--; 549 *q = 0; 550 return q-cp; 551 } 552 553 /* 554 * The matching engine: compare canonical input to pattern structures 555 */ 556 557 static Spat* 558 isalt(char *message, Spat *alt) 559 { 560 while(alt) { 561 if(*cmd) 562 if(message != cmd && strstr(cmd, alt->string)) 563 break; 564 if(message != header+1 && strstr(header+1, alt->string)) 565 break; 566 if(strstr(message, alt->string)) 567 break; 568 alt = alt->next; 569 } 570 return alt; 571 } 572 573 int 574 matchpat(Pattern *p, char *message, Resub *m) 575 { 576 Spat *spat; 577 char *s; 578 int c, c1; 579 580 if(p->type == string){ 581 c1 = *message; 582 for(s=message; c=c1; s++){ 583 c1 = s[1]; 584 for(spat=p->spat[hash(c)]; spat; spat=spat->next){ 585 if(c1 == spat->c1) 586 if(memcmp(s, spat->string, spat->len) == 0) 587 if(!isalt(message, spat->alt)){ 588 m->s.sp = s; 589 m->e.ep = s + spat->len; 590 return 1; 591 } 592 } 593 } 594 return 0; 595 } 596 m->s.sp = m->e.ep = 0; 597 if(regexec(p->pat, message, m, 1) == 0) 598 return 0; 599 if(isalt(message, p->alt)) 600 return 0; 601 return 1; 602 } 603 604 605 void 606 xprint(int fd, char *type, Resub *m) 607 { 608 char *p, *q; 609 int i; 610 611 if(m->s.sp == 0 || m->e.ep == 0) 612 return; 613 614 /* back up approx 30 characters to whitespace */ 615 for(p = m->s.sp, i = 0; *p && i < 30; i++, p--) 616 ; 617 while(*p && *p != ' ') 618 p--; 619 p++; 620 621 /* grab about 30 more chars beyond the end of the match */ 622 for(q = m->e.ep, i = 0; *q && i < 30; i++, q++) 623 ; 624 while(*q && *q != ' ') 625 q++; 626 627 fprint(fd, "%s %.*s~%.*s~%.*s\n", type, (int)(m->s.sp-p), p, (int)(m->e.ep-m->s.sp), m->s.sp, (int)(q-m->e.ep), m->e.ep); 628 } 629 630 enum { 631 INVAL= 255 632 }; 633 634 static uchar t64d[256] = { 635 /*00 */ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, 636 INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, 637 /*10*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, 638 INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, 639 /*20*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, 640 INVAL, INVAL, INVAL, 62, INVAL, INVAL, INVAL, 63, 641 /*30*/ 52, 53, 54, 55, 56, 57, 58, 59, 642 60, 61, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, 643 /*40*/ INVAL, 0, 1, 2, 3, 4, 5, 6, 644 7, 8, 9, 10, 11, 12, 13, 14, 645 /*50*/ 15, 16, 17, 18, 19, 20, 21, 22, 646 23, 24, 25, INVAL, INVAL, INVAL, INVAL, INVAL, 647 /*60*/ INVAL, 26, 27, 28, 29, 30, 31, 32, 648 33, 34, 35, 36, 37, 38, 39, 40, 649 /*70*/ 41, 42, 43, 44, 45, 46, 47, 48, 650 49, 50, 51, INVAL, INVAL, INVAL, INVAL, INVAL, 651 /*80*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, 652 INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, 653 /*90*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, 654 INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, 655 /*A0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, 656 INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, 657 /*B0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, 658 INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, 659 /*C0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, 660 INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, 661 /*D0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, 662 INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, 663 /*E0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, 664 INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, 665 /*F0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, 666 INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL 667 };