lex.c (26465B)
1 #include <u.h> 2 #include <libc.h> 3 #include <draw.h> 4 #include <ctype.h> 5 #include <html.h> 6 #include "impl.h" 7 8 typedef struct TokenSource TokenSource; 9 struct TokenSource 10 { 11 int i; /* index of next byte to use */ 12 uchar* data; /* all the data */ 13 int edata; /* data[0:edata] is valid */ 14 int chset; /* one of US_Ascii, etc. */ 15 int mtype; /* TextHtml or TextPlain */ 16 }; 17 18 enum { 19 EOF = -2, 20 EOB = -1 21 }; 22 23 #define ISNAMCHAR(c) ((c)<256 && (isalpha(c) || isdigit(c) || (c) == '-' || (c) == '.')) 24 25 #define SMALLBUFSIZE 240 26 #define BIGBUFSIZE 2000 27 28 /* HTML 4.0 tag names. */ 29 /* Keep sorted, and in correspondence with enum in iparse.h. */ 30 Rune **tagnames; 31 char *_tagnames[] = { 32 " ", 33 "!", 34 "a", 35 "abbr", 36 "acronym", 37 "address", 38 "applet", 39 "area", 40 "b", 41 "base", 42 "basefont", 43 "bdo", 44 "big", 45 "blink", 46 "blockquote", 47 "body", 48 "bq", 49 "br", 50 "button", 51 "caption", 52 "center", 53 "cite", 54 "code", 55 "col", 56 "colgroup", 57 "dd", 58 "del", 59 "dfn", 60 "dir", 61 "div", 62 "dl", 63 "dt", 64 "em", 65 "fieldset", 66 "font", 67 "form", 68 "frame", 69 "frameset", 70 "h1", 71 "h2", 72 "h3", 73 "h4", 74 "h5", 75 "h6", 76 "head", 77 "hr", 78 "html", 79 "i", 80 "iframe", 81 "img", 82 "input", 83 "ins", 84 "isindex", 85 "kbd", 86 "label", 87 "legend", 88 "li", 89 "link", 90 "map", 91 "menu", 92 "meta", 93 "nobr", 94 "noframes", 95 "noscript", 96 "object", 97 "ol", 98 "optgroup", 99 "option", 100 "p", 101 "param", 102 "pre", 103 "q", 104 "s", 105 "samp", 106 "script", 107 "select", 108 "small", 109 "span", 110 "strike", 111 "strong", 112 "style", 113 "sub", 114 "sup", 115 "table", 116 "tbody", 117 "td", 118 "textarea", 119 "tfoot", 120 "th", 121 "thead", 122 "title", 123 "tr", 124 "tt", 125 "u", 126 "ul", 127 "var" 128 }; 129 130 /* HTML 4.0 attribute names. */ 131 /* Keep sorted, and in correspondence with enum in i.h. 
*/
/* attrnames is the Rune form of _attrnames, built by lexinit. */
Rune **attrnames;
char* _attrnames[] = {
	"abbr", "accept-charset", "access-key", "action", "align", "alink",
	"alt", "archive", "axis", "background", "bgcolor", "border",
	"cellpadding", "cellspacing", "char", "charoff", "charset", "checked",
	"cite", "class", "classid", "clear", "code", "codebase",
	"codetype", "color", "cols", "colspan", "compact", "content",
	"coords", "data", "datetime", "declare", "defer", "dir",
	"disabled", "enctype", "face", "for", "frame", "frameborder",
	"headers", "height", "href", "hreflang", "hspace", "http-equiv",
	"id", "ismap", "label", "lang", "link", "longdesc",
	"marginheight", "marginwidth", "maxlength", "media", "method", "multiple",
	"name", "nohref", "noresize", "noshade", "nowrap", "object",
	"onblur", "onchange", "onclick", "ondblclick", "onfocus", "onkeypress",
	"onkeyup", "onload", "onmousedown", "onmousemove", "onmouseout", "onmouseover",
	"onmouseup", "onreset", "onselect", "onsubmit", "onunload", "profile",
	"prompt", "readonly", "rel", "rev", "rows", "rowspan",
	"rules", "scheme", "scope", "scrolling", "selected", "shape",
	"size", "span", "src", "standby", "start", "style",
	"summary", "tabindex", "target", "text", "title", "type",
	"usemap", "valign", "value", "valuetype", "version", "vlink",
	"vspace", "width"
};


/* Character entity to unicode character number map. */
/* Keep sorted by name. 
*/ 255 StringInt *chartab; 256 AsciiInt _chartab[] = { 257 {"AElig", 198}, 258 {"Aacute", 193}, 259 {"Acirc", 194}, 260 {"Agrave", 192}, 261 {"Aring", 197}, 262 {"Atilde", 195}, 263 {"Auml", 196}, 264 {"Ccedil", 199}, 265 {"ETH", 208}, 266 {"Eacute", 201}, 267 {"Ecirc", 202}, 268 {"Egrave", 200}, 269 {"Euml", 203}, 270 {"Iacute", 205}, 271 {"Icirc", 206}, 272 {"Igrave", 204}, 273 {"Iuml", 207}, 274 {"Ntilde", 209}, 275 {"Oacute", 211}, 276 {"Ocirc", 212}, 277 {"Ograve", 210}, 278 {"Oslash", 216}, 279 {"Otilde", 213}, 280 {"Ouml", 214}, 281 {"THORN", 222}, 282 {"Uacute", 218}, 283 {"Ucirc", 219}, 284 {"Ugrave", 217}, 285 {"Uuml", 220}, 286 {"Yacute", 221}, 287 {"aacute", 225}, 288 {"acirc", 226}, 289 {"acute", 180}, 290 {"aelig", 230}, 291 {"agrave", 224}, 292 {"alpha", 945}, 293 {"amp", 38}, 294 {"aring", 229}, 295 {"atilde", 227}, 296 {"auml", 228}, 297 {"beta", 946}, 298 {"brvbar", 166}, 299 {"ccedil", 231}, 300 {"cdots", 8943}, 301 {"cedil", 184}, 302 {"cent", 162}, 303 {"chi", 967}, 304 {"copy", 169}, 305 {"curren", 164}, 306 {"ddots", 8945}, 307 {"deg", 176}, 308 {"delta", 948}, 309 {"divide", 247}, 310 {"eacute", 233}, 311 {"ecirc", 234}, 312 {"egrave", 232}, 313 {"emdash", 8212}, /* non-standard but commonly used */ 314 {"emsp", 8195}, 315 {"endash", 8211}, /* non-standard but commonly used */ 316 {"ensp", 8194}, 317 {"epsilon", 949}, 318 {"eta", 951}, 319 {"eth", 240}, 320 {"euml", 235}, 321 {"frac12", 189}, 322 {"frac14", 188}, 323 {"frac34", 190}, 324 {"gamma", 947}, 325 {"gt", 62}, 326 {"iacute", 237}, 327 {"icirc", 238}, 328 {"iexcl", 161}, 329 {"igrave", 236}, 330 {"iota", 953}, 331 {"iquest", 191}, 332 {"iuml", 239}, 333 {"kappa", 954}, 334 {"lambda", 955}, 335 {"laquo", 171}, 336 {"ldquo", 8220}, 337 {"ldots", 8230}, 338 {"lsquo", 8216}, 339 {"lt", 60}, 340 {"macr", 175}, 341 {"mdash", 8212}, 342 {"micro", 181}, 343 {"middot", 183}, 344 {"mu", 956}, 345 {"nbsp", 160}, 346 {"ndash", 8211}, 347 {"not", 172}, 348 {"ntilde", 241}, 349 {"nu", 957}, 350 
{"oacute", 243}, 351 {"ocirc", 244}, 352 {"ograve", 242}, 353 {"omega", 969}, 354 {"omicron", 959}, 355 {"ordf", 170}, 356 {"ordm", 186}, 357 {"oslash", 248}, 358 {"otilde", 245}, 359 {"ouml", 246}, 360 {"para", 182}, 361 {"phi", 966}, 362 {"pi", 960}, 363 {"plusmn", 177}, 364 {"pound", 163}, 365 {"psi", 968}, 366 {"quad", 8193}, 367 {"quot", 34}, 368 {"raquo", 187}, 369 {"rdquo", 8221}, 370 {"reg", 174}, 371 {"rho", 961}, 372 {"rsquo", 8217}, 373 {"sect", 167}, 374 {"shy", 173}, 375 {"sigma", 963}, 376 {"sp", 8194}, 377 {"sup1", 185}, 378 {"sup2", 178}, 379 {"sup3", 179}, 380 {"szlig", 223}, 381 {"tau", 964}, 382 {"theta", 952}, 383 {"thinsp", 8201}, 384 {"thorn", 254}, 385 {"times", 215}, 386 {"trade", 8482}, 387 {"uacute", 250}, 388 {"ucirc", 251}, 389 {"ugrave", 249}, 390 {"uml", 168}, 391 {"upsilon", 965}, 392 {"uuml", 252}, 393 {"varepsilon", 8712}, 394 {"varphi", 981}, 395 {"varpi", 982}, 396 {"varrho", 1009}, 397 {"vdots", 8942}, 398 {"vsigma", 962}, 399 {"vtheta", 977}, 400 {"xi", 958}, 401 {"yacute", 253}, 402 {"yen", 165}, 403 {"yuml", 255}, 404 {"zeta", 950} 405 }; 406 #define NCHARTAB (sizeof(_chartab)/sizeof(_chartab[0])) 407 408 /* Characters Winstart..Winend are those that Windows */ 409 /* uses interpolated into the Latin1 set. */ 410 /* They aren't supposed to appear in HTML, but they do.... 
*/ 411 enum { 412 Winstart = 127, 413 Winend = 159 414 }; 415 416 static int winchars[]= { 8226, /* 8226 is a bullet */ 417 8226, 8226, 8218, 402, 8222, 8230, 8224, 8225, 418 710, 8240, 352, 8249, 338, 8226, 8226, 8226, 419 8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212, 420 732, 8482, 353, 8250, 339, 8226, 8226, 376}; 421 422 static StringInt* tagtable; /* initialized from tagnames */ 423 static StringInt* attrtable; /* initialized from attrnames */ 424 425 static void lexinit(void); 426 static int getplaindata(TokenSource* ts, Token* a, int* pai); 427 static int getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai); 428 static int getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai); 429 static int gettag(TokenSource* ts, int starti, Token* a, int* pai); 430 static Rune* buftostr(Rune* s, Rune* buf, int j); 431 static int comment(TokenSource* ts); 432 static int findstr(TokenSource* ts, Rune* s); 433 static int ampersand(TokenSource* ts); 434 /*static int lowerc(int c); */ 435 static int getchar(TokenSource* ts); 436 static void ungetchar(TokenSource* ts, int c); 437 static void backup(TokenSource* ts, int savei); 438 /*static void freeinsidetoken(Token* t); */ 439 static void freeattrs(Attr* ahead); 440 static Attr* newattr(int attid, Rune* value, Attr* link); 441 static int Tconv(Fmt* f); 442 443 int dbglex = 0; 444 static int lexinited = 0; 445 446 static void 447 lexinit(void) 448 { 449 chartab = _cvtstringinttab(_chartab, nelem(_chartab)); 450 tagnames = _cvtstringtab(_tagnames, nelem(_tagnames)); 451 tagtable = _makestrinttab(tagnames, Numtags); 452 attrnames = _cvtstringtab(_attrnames, nelem(_attrnames)); 453 attrtable = _makestrinttab(attrnames, Numattrs); 454 fmtinstall('T', Tconv); 455 lexinited = 1; 456 } 457 458 static TokenSource* 459 newtokensource(uchar* data, int edata, int chset, int mtype) 460 { 461 TokenSource* ans; 462 463 assert(chset == US_Ascii || chset == ISO_8859_1 || 464 chset == UTF_8 || chset == 
Unicode); 465 ans = (TokenSource*)emalloc(sizeof(TokenSource)); 466 ans->i = 0; 467 ans->data = data; 468 ans->edata = edata; 469 ans->chset = chset; 470 ans->mtype = mtype; 471 return ans; 472 } 473 474 enum { 475 ToksChunk = 500 476 }; 477 478 /* Call this to get the tokens. */ 479 /* The number of returned tokens is returned in *plen. */ 480 Token* 481 _gettoks(uchar* data, int datalen, int chset, int mtype, int* plen) 482 { 483 TokenSource* ts; 484 Token* a; 485 int alen; 486 int ai; 487 int starti; 488 int c; 489 int tag; 490 491 if(!lexinited) 492 lexinit(); 493 ts = newtokensource(data, datalen, chset, mtype); 494 alen = ToksChunk; 495 a = (Token*)emalloc(alen * sizeof(Token)); 496 ai = 0; 497 if(dbglex) 498 fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata); 499 if(ts->mtype == TextHtml){ 500 for(;;){ 501 if(ai == alen){ 502 a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token)); 503 alen += ToksChunk; 504 } 505 starti = ts->i; 506 c = getchar(ts); 507 if(c < 0) 508 break; 509 if(c == '<'){ 510 tag = gettag(ts, starti, a, &ai); 511 if(tag == Tscript){ 512 /* special rules for getting Data after.... */ 513 starti = ts->i; 514 c = getchar(ts); 515 tag = getscriptdata(ts, c, starti, a, &ai); 516 } 517 } 518 else 519 tag = getdata(ts, c, starti, a, &ai); 520 if(tag == -1) 521 break; 522 else if(dbglex > 1 && tag != Comment) 523 fprint(2, "lex: got token %T\n", &a[ai-1]); 524 } 525 } 526 else { 527 /* plain text (non-html) tokens */ 528 for(;;){ 529 if(ai == alen){ 530 a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token)); 531 alen += ToksChunk; 532 } 533 tag = getplaindata(ts, a, &ai); 534 if(tag == -1) 535 break; 536 if(dbglex > 1) 537 fprint(2, "lex: got token %T\n", &a[ai]); 538 } 539 } 540 if(dbglex) 541 fprint(2, "lex: returning %d tokens\n", ai); 542 *plen = ai; 543 free(ts); 544 if(ai == 0) { 545 free(a); 546 return nil; 547 } 548 return a; 549 } 550 551 /* For case where source isn't HTML. 
*/ 552 /* Just make data tokens, one per line (or partial line, */ 553 /* at end of buffer), ignoring non-whitespace control */ 554 /* characters and dumping \r's. */ 555 /* If find non-empty token, fill in a[*pai], bump *pai, and return Data. */ 556 /* Otherwise return -1; */ 557 static int 558 getplaindata(TokenSource* ts, Token* a, int* pai) 559 { 560 Rune* s; 561 int j; 562 int starti; 563 int c; 564 Token* tok; 565 Rune buf[BIGBUFSIZE]; 566 567 s = nil; 568 j = 0; 569 starti = ts->i; 570 for(c = getchar(ts); c >= 0; c = getchar(ts)){ 571 if(c < ' '){ 572 if(isspace(c)){ 573 if(c == '\r'){ 574 /* ignore it unless no following '\n', */ 575 /* in which case treat it like '\n' */ 576 c = getchar(ts); 577 if(c != '\n'){ 578 if(c >= 0) 579 ungetchar(ts, c); 580 c = '\n'; 581 } 582 } 583 } 584 else 585 c = 0; 586 } 587 if(c != 0){ 588 buf[j++] = c; 589 if(j == BIGBUFSIZE-1){ 590 s = buftostr(s, buf, j); 591 j = 0; 592 } 593 } 594 if(c == '\n') 595 break; 596 } 597 s = buftostr(s, buf, j); 598 if(s == nil) 599 return -1; 600 tok = &a[(*pai)++]; 601 tok->tag = Data; 602 tok->text = s; 603 tok->attr = nil; 604 tok->starti = starti; 605 return Data; 606 } 607 608 /* Return concatenation of s and buf[0:j] */ 609 /* Frees s. */ 610 static Rune* 611 buftostr(Rune* s, Rune* buf, int j) 612 { 613 Rune *tmp; 614 buf[j] = 0; 615 if(s == nil) 616 tmp = _Strndup(buf, j); 617 else 618 tmp = _Strdup2(s, buf); 619 free(s); 620 return tmp; 621 } 622 623 /* Gather data up to next start-of-tag or end-of-buffer. */ 624 /* Translate entity references (&). */ 625 /* Ignore non-whitespace control characters and get rid of \r's. */ 626 /* If find non-empty token, fill in a[*pai], bump *pai, and return Data. 
*/ 627 /* Otherwise return -1; */ 628 static int 629 getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai) 630 { 631 Rune* s; 632 int j; 633 int c; 634 Token* tok; 635 Rune buf[BIGBUFSIZE]; 636 637 s = nil; 638 j = 0; 639 c = firstc; 640 while(c >= 0){ 641 if(c == '&'){ 642 c = ampersand(ts); 643 if(c < 0) 644 break; 645 } 646 else if(c < ' '){ 647 if(isspace(c)){ 648 if(c == '\r'){ 649 /* ignore it unless no following '\n', */ 650 /* in which case treat it like '\n' */ 651 c = getchar(ts); 652 if(c != '\n'){ 653 if(c >= 0) 654 ungetchar(ts, c); 655 c = '\n'; 656 } 657 } 658 } 659 else { 660 if(warn) 661 fprint(2, "warning: non-whitespace control character %d ignored\n", c); 662 c = 0; 663 } 664 } 665 else if(c == '<'){ 666 ungetchar(ts, c); 667 break; 668 } 669 if(c != 0){ 670 buf[j++] = c; 671 if(j == BIGBUFSIZE-1){ 672 s = buftostr(s, buf, j); 673 j = 0; 674 } 675 } 676 c = getchar(ts); 677 } 678 s = buftostr(s, buf, j); 679 if(s == nil) 680 return -1; 681 tok = &a[(*pai)++]; 682 tok->tag = Data; 683 tok->text = s; 684 tok->attr = nil; 685 tok->starti = starti; 686 return Data; 687 } 688 689 /* The rules for lexing scripts are different (ugh). */ 690 /* Gather up everything until see a </SCRIPT>. */ 691 static int 692 getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai) 693 { 694 Rune* s; 695 int j; 696 int tstarti; 697 int savei; 698 int c; 699 int tag; 700 int done; 701 Token* tok; 702 Rune buf[BIGBUFSIZE]; 703 704 s = nil; 705 j = 0; 706 tstarti = starti; 707 c = firstc; 708 done = 0; 709 while(c >= 0){ 710 if(c == '<'){ 711 /* other browsers ignore stuff to end of line after <! 
*/ 712 savei = ts->i; 713 c = getchar(ts); 714 if(c == '!'){ 715 while(c >= 0 && c != '\n' && c != '\r') 716 c = getchar(ts); 717 if(c == '\r') 718 c = getchar(ts); 719 if(c == '\n') 720 c = getchar(ts); 721 } 722 else if(c >= 0){ 723 backup(ts, savei); 724 tag = gettag(ts, tstarti, a, pai); 725 if(tag == -1) 726 break; 727 if(tag != Comment) 728 (*pai)--; 729 backup(ts, tstarti); 730 if(tag == Tscript + RBRA){ 731 done = 1; 732 break; 733 } 734 /* here tag was not </SCRIPT>, so take as regular data */ 735 c = getchar(ts); 736 } 737 } 738 if(c < 0) 739 break; 740 if(c != 0){ 741 buf[j++] = c; 742 if(j == BIGBUFSIZE-1){ 743 s = buftostr(s, buf, j); 744 j = 0; 745 } 746 } 747 tstarti = ts->i; 748 c = getchar(ts); 749 } 750 if(done || ts->i == ts->edata){ 751 s = buftostr(s, buf, j); 752 tok = &a[(*pai)++]; 753 tok->tag = Data; 754 tok->text = s; 755 tok->attr = nil; 756 tok->starti = starti; 757 return Data; 758 } 759 backup(ts, starti); 760 return -1; 761 } 762 763 /* We've just seen a '<'. Gather up stuff to closing '>' (if buffer */ 764 /* ends before then, return -1). */ 765 /* If it's a tag, look up the name, gather the attributes, and return */ 766 /* the appropriate token. */ 767 /* Else it's either just plain data or some kind of ignorable stuff: */ 768 /* return Data or Comment as appropriate. */ 769 /* If it's not a Comment, put it in a[*pai] and bump *pai. 
*/
/* starti is the offset of the '<' and is recorded in the token. */
/* Results, as implemented here: */
/*	known tag: returns its tag value (plus RBRA for </name>), with */
/*		tok->attr the recognized attributes, most recent first; */
/*	unknown tag: returns Notfound with the name in tok->text; */
/*	'<' not followed by a letter or '!': returns Data with text "<" and */
/*		leaves the input just past the '<'; */
/*	<!...> closed before end of data: returns Comment, nothing stored; */
/*	data ends inside the tag or comment: same as the "not a tag" case -- */
/*		a Data token "<" is stored and counted, the input is left just */
/*		past the '<', and anything allocated for the partial tag is freed. */
static int
gettag(TokenSource* ts, int starti, Token* a, int* pai)
{
	int	rbra;
	int	ans;
	Attr*	al;
	int	nexti;
	int	c;
	int	ti;
	int	afnd;
	int	attid;
	int	quote;
	Rune*	val;
	int	nv;
	int	i;
	int	tag;
	Token*	tok;
	Rune	buf[BIGBUFSIZE];

	rbra = 0;
	al = nil;		/* both are released at eob_done, which */
	val = nil;		/* can be reached before the attribute loop */
	nexti = ts->i;
	tok = &a[*pai];
	tok->tag = Notfound;
	tok->text = nil;
	tok->attr = nil;
	tok->starti = starti;
	c = getchar(ts);
	if(c == '/'){
		rbra = RBRA;
		c = getchar(ts);
	}
	if(c < 0)
		goto eob_done;
	if(c >= 256 || !isalpha(c)){
		/* not a tag */
		if(c == '!'){
			ans = comment(ts);
			if(ans != -1)
				return ans;
			goto eob_done;
		}
		else {
			backup(ts, nexti);
			tok->tag = Data;
			tok->text = _Strdup(L(Llt));
			(*pai)++;
			return Data;
		}
	}
	/* c starts a tagname */
	buf[0] = c;
	i = 1;
	for(;;){
		c = getchar(ts);
		if(c < 0)
			goto eob_done;
		if(!ISNAMCHAR(c))
			break;
		/* if name is bigger than buf it won't be found anyway... */
		if(i < BIGBUFSIZE)
			buf[i++] = c;
	}
	if(_lookup(tagtable, Numtags, buf, i, &tag))
		tok->tag = tag + rbra;
	else
		tok->text = _Strndup(buf, i);	/* for warning print, in build */

	/* attribute gathering loop */
	for(;;){
		/* look for "ws name" or "ws name ws = ws val"  (ws=whitespace) */
		/* skip whitespace */
attrloop_continue:
		while(c < 256 && isspace(c)){
			c = getchar(ts);
			if(c < 0)
				goto eob_done;
		}
		if(c == '>')
			goto attrloop_done;
		if(c == '<'){
			if(warn)
				fprint(2, "warning: unclosed tag\n");
			ungetchar(ts, c);
			goto attrloop_done;
		}
		if(c >= 256 || !isalpha(c)){
			if(warn)
				fprint(2, "warning: expected attribute name\n");
			/* skip to next attribute name */
			for(;;){
				c = getchar(ts);
				if(c < 0)
					goto eob_done;
				if(c < 256 && isalpha(c))
					goto attrloop_continue;
				if(c == '<'){
					if(warn)
						fprint(2, "warning: unclosed tag\n");
					ungetchar(ts, '<');
					goto attrloop_done;
				}
				if(c == '>')
					goto attrloop_done;
			}
		}
		/* gather attribute name */
		buf[0] = c;
		i = 1;
		for(;;){
			c = getchar(ts);
			if(c < 0)
				goto eob_done;
			if(!ISNAMCHAR(c))
				break;
			/* leave room for the 0 stored before the warning print */
			if(i < BIGBUFSIZE-1)
				buf[i++] = c;
		}
		afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
		if(warn && !afnd){
			buf[i] = 0;
			fprint(2, "warning: unknown attribute name %S\n", buf);
		}
		/* skip whitespace */
		while(c < 256 && isspace(c)){
			c = getchar(ts);
			if(c < 0)
				goto eob_done;
		}
		if(c != '='){
			if(afnd)
				al = newattr(attid, nil, al);
			goto attrloop_continue;
		}
		/* c is '=' here;  skip whitespace */
		for(;;){
			c = getchar(ts);
			if(c < 0)
				goto eob_done;
			if(c >= 256 || !isspace(c))
				break;
		}
		quote = 0;
		if(c == '\'' || c == '"'){
			quote = c;
			c = getchar(ts);
			if(c < 0)
				goto eob_done;
		}
		val = nil;
		nv = 0;
		for(;;){
valloop_continue:
			if(c < 0)
				goto eob_done;
			if(c == '>'){
				if(quote){
					/* c might be part of string (though not good style) */
					/* but if line ends before close quote, assume */
					/* there was an unmatched quote */
					ti = ts->i;
					for(;;){
						c = getchar(ts);
						if(c < 0)
							goto eob_done;
						if(c == quote){
							backup(ts, ti);
							buf[nv++] = '>';
							if(nv == BIGBUFSIZE-1){
								val = buftostr(val, buf, nv);
								nv = 0;
							}
							c = getchar(ts);
							goto valloop_continue;
						}
						if(c == '\n'){
							if(warn)
								fprint(2, "warning: apparent unmatched quote\n");
							backup(ts, ti);
							c = '>';
							goto valloop_done;
						}
					}
				}
				else
					goto valloop_done;
			}
			if(quote){
				if(c == quote){
					c = getchar(ts);
					if(c < 0)
						goto eob_done;
					goto valloop_done;
				}
				if(c == '\r'){
					c = getchar(ts);
					goto valloop_continue;
				}
				if(c == '\t' || c == '\n')
					c = ' ';
			}
			else {
				if(c < 256 && isspace(c))
					goto valloop_done;
			}
			if(c == '&'){
				c = ampersand(ts);
				if(c == -1)
					goto eob_done;
			}
			buf[nv++] = c;
			if(nv == BIGBUFSIZE-1){
				val = buftostr(val, buf, nv);
				nv = 0;
			}
			c = getchar(ts);
		}
valloop_done:
		if(afnd){
			val = buftostr(val, buf, nv);
			al = newattr(attid, val, al);
		}
		else
			free(val);	/* value of an unknown attribute is not kept */
		val = nil;	/* owned by al (or freed): eob_done must not free it again */
	}

attrloop_done:
	tok->attr = al;
	(*pai)++;
	return tok->tag;

eob_done:
	if(warn)
		fprint(2, "warning: incomplete tag at end of page\n");
	backup(ts, nexti);
	/* drop whatever was built for the partial tag */
	free(val);
	freeattrs(al);
	free(tok->text);
	tok->tag = Data;
	tok->text = _Strdup(L(Llt));
	tok->attr = nil;
	/* the '<' is a real Data token, so count it like every other non-Comment result */
	(*pai)++;
	return Data;
}

/* We've just read a '<!' at position starti, */
/* so this may be a comment or other ignored section, or it may */
/* be just a literal string if there is no close before end of file */
/* (other browsers do that). 
*/
/* The accepted practice seems to be (note: contrary to SGML spec!): */
/* If see <!--, look for --> to close, or if none, > to close. */
/* If see <!(not --), look for > to close. */
/* If no close before end of file, leave original characters in as literal data. */
/* */
/* If we see ignorable stuff, return Comment, with the input just past the close. */
/* Else return -1 (the input position is then unspecified; the caller backs up). */
static int
comment(TokenSource* ts)
{
	int	mark;
	int	closed;
	int	ch;

	mark = ts->i;
	closed = 0;
	ch = getchar(ts);
	if(ch == '-'){
		ch = getchar(ts);
		if(ch == '-'){
			/* <!-- : prefer a --> close; if none, rescan from mark for a plain > */
			if(findstr(ts, L(Larrow)))
				closed = 1;
			else
				backup(ts, mark);
		}
	}
	if(!closed){
		if(ch == '>')
			closed = 1;
		else if(ch >= 0 && findstr(ts, L(Lgt)))
			closed = 1;
	}
	return closed ? Comment : -1;
}

/* Look for string s in token source. */
/* If found, return 1, with buffer at next char after s, */
/* else return 0 (caller should back up). 
*/
/* s must be non-empty.  After a partial match the scan resumes at the */
/* character following the one that matched s[0]. */
static int
findstr(TokenSource* ts, Rune* s)
{
	int	first;
	int	len;
	int	resume;
	int	k;
	int	ch;

	first = s[0];
	len = runestrlen(s);
	while((ch = getchar(ts)) >= 0){
		if(ch != first)
			continue;
		if(len == 1)
			return 1;
		resume = ts->i;
		for(k = 1; k < len; k++){
			ch = getchar(ts);
			if(ch < 0)
				return 0;	/* data ended in the middle of a candidate */
			if(ch != s[k])
				break;
		}
		if(k == len)
			return 1;
		backup(ts, resume);
	}
	return 0;
}

/* Value of hexadecimal digit c (either case), or -1 if c is not one. */
static int
xdigit(int c)
{
	if(c >= '0' && c <= '9')
		return c - '0';
	if(c >= 'a' && c <= 'f')
		return 10 + (c - 'a');
	if(c >= 'A' && c <= 'F')
		return 10 + (c - 'A');
	return -1;
}

/* We've just read an '&'; look for an entity reference */
/* name, and if found, return translated char. */
/* if there is a complete entity name but it isn't known, */
/* try prefixes (gets around some buggy HTML out there), */
/* and if that fails, back up to just past the '&' and return '&'. */
/* If the entity can't be completed in the current buffer, back up */
/* to the '&' and return -1. 
*/
/* As implemented, the result is never negative: when the data ends in the */
/* middle of a reference, or the reference is not recognized, the input is */
/* put back to just past the '&' and '&' itself is returned. */
/* Numeric forms are &#ddd and &#xhhh (lower-case x only); a reference is */
/* ended by ';', '\n' or '\r', which is consumed, or by any other */
/* character, which is left in the input.  Numeric values in */
/* Winstart..Winend are mapped through winchars. */
static int
ampersand(TokenSource* ts)
{
	enum { Maxcode = 0x10FFFF };	/* stop accumulating digits beyond this so v cannot overflow */
	int	savei;
	int	c;
	int	fnd;
	int	ans;
	int	v;
	int	ndig;
	int	i;
	int	k;
	Rune	buf[SMALLBUFSIZE];

	savei = ts->i;
	c = getchar(ts);
	fnd = 0;
	ans = -1;
	if(c == '#'){
		c = getchar(ts);
		v = 0;
		ndig = 0;
		if(c == 'x'){
			c = getchar(ts);
			while((i=xdigit(c)) != -1){
				if(v <= Maxcode)
					v = v*16 + i;
				ndig = 1;
				c = getchar(ts);
			}
		}else{
			while('0' <= c && c <= '9'){
				if(v <= Maxcode)
					v = v*10 + c - '0';
				ndig = 1;
				c = getchar(ts);
			}
		}
		/* "&#" with no digits is not a reference: fall through to the literal '&' */
		if(c >= 0 && ndig){
			if(!(c == ';' || c == '\n' || c == '\r'))
				ungetchar(ts, c);
			c = v;
			if(c >= Winstart && c <= Winend){
				c = winchars[c - Winstart];
			}
			ans = c;
			fnd = 1;
		}
	}
	else if(c < 256 && isalpha(c)){
		buf[0] = c;
		k = 1;
		for(;;){
			c = getchar(ts);
			if(c < 0)
				break;
			if(ISNAMCHAR(c)){
				/* over-long names are truncated; they cannot match anyway */
				if(k < SMALLBUFSIZE-1)
					buf[k++] = c;
			}
			else {
				if(!(c == ';' || c == '\n' || c == '\r'))
					ungetchar(ts, c);
				break;
			}
		}
		if(c >= 0){
			fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
			if(!fnd){
				/* Try proper prefixes of the name, longest first */
				while(--k > 0){
					fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
					if(fnd){
						/* leave the input just past the k characters that */
						/* matched: rewind to after the '&' and re-read them */
						/* (correct even if the name was truncated in buf) */
						backup(ts, savei);
						for(i = 0; i < k; i++)
							getchar(ts);
						break;
					}
				}
			}
		}
	}
	if(!fnd){
		backup(ts, savei);
		ans = '&';
	}
	return ans;
}

/* Get next char, obeying ts.chset. */
/* Returns -1 if no complete character left before current end of data. 
*/ 1199 static int 1200 getchar(TokenSource* ts) 1201 { 1202 uchar* buf; 1203 int c; 1204 int n; 1205 int ok; 1206 Rune r; 1207 1208 if(ts->i >= ts->edata) 1209 return -1; 1210 buf = ts->data; 1211 c = buf[ts->i]; 1212 switch(ts->chset){ 1213 case ISO_8859_1: 1214 if(c >= Winstart && c <= Winend) 1215 c = winchars[c - Winstart]; 1216 ts->i++; 1217 break; 1218 case US_Ascii: 1219 if(c > 127){ 1220 if(warn) 1221 fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c); 1222 } 1223 ts->i++; 1224 break; 1225 case UTF_8: 1226 ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i); 1227 n = chartorune(&r, (char*)(buf+ts->i)); 1228 if(ok){ 1229 if(warn && c == 0x80) 1230 fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]); 1231 ts->i += n; 1232 c = r; 1233 } 1234 else { 1235 /* not enough bytes in buf to complete utf-8 char */ 1236 ts->i = ts->edata; /* mark "all used" */ 1237 c = -1; 1238 } 1239 break; 1240 case Unicode: 1241 if(ts->i < ts->edata - 1){ 1242 /*standards say most-significant byte first */ 1243 c = (c << 8)|(buf[ts->i + 1]); 1244 ts->i += 2; 1245 } 1246 else { 1247 ts->i = ts->edata; /* mark "all used" */ 1248 c = -1; 1249 } 1250 break; 1251 } 1252 return c; 1253 } 1254 1255 /* Assuming c was the last character returned by getchar, set */ 1256 /* things up so that next getchar will get that same character */ 1257 /* followed by the current 'next character', etc. */ 1258 static void 1259 ungetchar(TokenSource* ts, int c) 1260 { 1261 int n; 1262 Rune r; 1263 char a[UTFmax]; 1264 1265 n = 1; 1266 switch(ts->chset){ 1267 case UTF_8: 1268 if(c >= 128){ 1269 r = c; 1270 n = runetochar(a, &r); 1271 } 1272 break; 1273 case Unicode: 1274 n = 2; 1275 break; 1276 } 1277 ts->i -= n; 1278 } 1279 1280 /* Restore ts so that it is at the state where the index was savei. 
*/ 1281 static void 1282 backup(TokenSource* ts, int savei) 1283 { 1284 if(dbglex) 1285 fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei); 1286 ts->i = savei; 1287 } 1288 1289 1290 /* Look for value associated with attribute attid in token t. */ 1291 /* If there is one, return 1 and put the value in *pans, */ 1292 /* else return 0. */ 1293 /* If xfer is true, transfer ownership of the string to the caller */ 1294 /* (nil it out here); otherwise, caller must duplicate the answer */ 1295 /* if it needs to save it. */ 1296 /* OK to have pans==0, in which case this is just looking */ 1297 /* to see if token is present. */ 1298 int 1299 _tokaval(Token* t, int attid, Rune** pans, int xfer) 1300 { 1301 Attr* attr; 1302 1303 attr = t->attr; 1304 while(attr != nil){ 1305 if(attr->attid == attid){ 1306 if(pans != nil) 1307 *pans = attr->value; 1308 if(xfer) 1309 attr->value = nil; 1310 return 1; 1311 } 1312 attr = attr->next; 1313 } 1314 if(pans != nil) 1315 *pans = nil; 1316 return 0; 1317 } 1318 1319 static int 1320 Tconv(Fmt *f) 1321 { 1322 Token* t; 1323 int i; 1324 int tag; 1325 char* srbra; 1326 Rune* aname; 1327 Rune* tname; 1328 Attr* a; 1329 char buf[BIGBUFSIZE]; 1330 1331 t = va_arg(f->args, Token*); 1332 if(t == nil) 1333 sprint(buf, "<null>"); 1334 else { 1335 i = 0; 1336 if(dbglex > 1) 1337 i = snprint(buf, sizeof(buf), "[%d]", t->starti); 1338 tag = t->tag; 1339 if(tag == Data){ 1340 i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text); 1341 } 1342 else { 1343 srbra = ""; 1344 if(tag >= RBRA){ 1345 tag -= RBRA; 1346 srbra = "/"; 1347 } 1348 tname = tagnames[tag]; 1349 if(tag == Notfound) 1350 tname = L(Lquestion); 1351 i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname); 1352 for(a = t->attr; a != nil; a = a->next){ 1353 aname = attrnames[a->attid]; 1354 i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname); 1355 if(a->value != nil) 1356 i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value); 1357 } 1358 i += snprint(buf+i, sizeof(buf)-i-1, ">"); 
1359 } 1360 buf[i] = 0; 1361 } 1362 return fmtstrcpy(f, buf); 1363 } 1364 1365 /* Attrs own their constituent strings, but build may eventually */ 1366 /* transfer some values to its items and nil them out in the Attr. */ 1367 static Attr* 1368 newattr(int attid, Rune* value, Attr* link) 1369 { 1370 Attr* ans; 1371 1372 ans = (Attr*)emalloc(sizeof(Attr)); 1373 ans->attid = attid; 1374 ans->value = value; 1375 ans->next = link; 1376 return ans; 1377 } 1378 1379 /* Free list of Attrs linked through next field */ 1380 static void 1381 freeattrs(Attr* ahead) 1382 { 1383 Attr* a; 1384 Attr* nexta; 1385 1386 a = ahead; 1387 while(a != nil){ 1388 nexta = a->next; 1389 free(a->value); 1390 free(a); 1391 a = nexta; 1392 } 1393 } 1394 1395 /* Free array of Tokens. */ 1396 /* Allocated space might have room for more than n tokens, */ 1397 /* but only n of them are initialized. */ 1398 /* If caller has transferred ownership of constitutent strings */ 1399 /* or attributes, it must have nil'd out the pointers in the Tokens. */ 1400 void 1401 _freetokens(Token* tarray, int n) 1402 { 1403 int i; 1404 Token* t; 1405 1406 if(tarray == nil) 1407 return; 1408 for(i = 0; i < n; i++){ 1409 t = &tarray[i]; 1410 free(t->text); 1411 freeattrs(t->attr); 1412 } 1413 free(tarray); 1414 }
