html.c (7691B)
1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include "hdr.h" 5 #include "conv.h" 6 7 typedef struct Hchar Hchar; 8 struct Hchar 9 { 10 char *s; 11 Rune r; 12 }; 13 14 /* <, >, ", & intentionally omitted */ 15 16 /* 17 * Names beginning with _ are names we recognize 18 * (without the underscore) but will not generate, 19 * because they are nonstandard. 20 */ 21 static Hchar byname[] = 22 { 23 {"AElig", 198}, 24 {"Aacute", 193}, 25 {"Acirc", 194}, 26 {"Agrave", 192}, 27 {"Alpha", 913}, 28 {"Aring", 197}, 29 {"Atilde", 195}, 30 {"Auml", 196}, 31 {"Beta", 914}, 32 {"Ccedil", 199}, 33 {"Chi", 935}, 34 {"Dagger", 8225}, 35 {"Delta", 916}, 36 {"ETH", 208}, 37 {"Eacute", 201}, 38 {"Ecirc", 202}, 39 {"Egrave", 200}, 40 {"Epsilon", 917}, 41 {"Eta", 919}, 42 {"Euml", 203}, 43 {"Gamma", 915}, 44 {"Iacute", 205}, 45 {"Icirc", 206}, 46 {"Igrave", 204}, 47 {"Iota", 921}, 48 {"Iuml", 207}, 49 {"Kappa", 922}, 50 {"Lambda", 923}, 51 {"Mu", 924}, 52 {"Ntilde", 209}, 53 {"Nu", 925}, 54 {"OElig", 338}, 55 {"Oacute", 211}, 56 {"Ocirc", 212}, 57 {"Ograve", 210}, 58 {"Omega", 937}, 59 {"Omicron", 927}, 60 {"Oslash", 216}, 61 {"Otilde", 213}, 62 {"Ouml", 214}, 63 {"Phi", 934}, 64 {"Pi", 928}, 65 {"Prime", 8243}, 66 {"Psi", 936}, 67 {"Rho", 929}, 68 {"Scaron", 352}, 69 {"Sigma", 931}, 70 {"THORN", 222}, 71 {"Tau", 932}, 72 {"Theta", 920}, 73 {"Uacute", 218}, 74 {"Ucirc", 219}, 75 {"Ugrave", 217}, 76 {"Upsilon", 933}, 77 {"Uuml", 220}, 78 {"Xi", 926}, 79 {"Yacute", 221}, 80 {"Yuml", 376}, 81 {"Zeta", 918}, 82 {"aacute", 225}, 83 {"acirc", 226}, 84 {"acute", 180}, 85 {"aelig", 230}, 86 {"agrave", 224}, 87 {"alefsym", 8501}, 88 {"alpha", 945}, 89 {"amp", 38}, 90 {"and", 8743}, 91 {"ang", 8736}, 92 {"aring", 229}, 93 {"asymp", 8776}, 94 {"atilde", 227}, 95 {"auml", 228}, 96 {"bdquo", 8222}, 97 {"beta", 946}, 98 {"brvbar", 166}, 99 {"bull", 8226}, 100 {"cap", 8745}, 101 {"ccedil", 231}, 102 {"cdots", 8943}, 103 {"cedil", 184}, 104 {"cent", 162}, 105 {"chi", 967}, 106 {"circ", 710}, 107 {"clubs", 9827}, 108 {"cong", 8773}, 109 {"copy", 169}, 110 {"crarr", 8629}, 111 {"cup", 8746}, 112 {"curren", 164}, 113 {"dArr", 8659}, 114 {"dagger", 8224}, 115 {"darr", 8595}, 116 {"ddots", 8945}, 117 {"deg", 176}, 118 {"delta", 948}, 119 {"diams", 9830}, 120 {"divide", 247}, 121 {"eacute", 233}, 122 {"ecirc", 234}, 123 {"egrave", 232}, 124 {"_emdash", 8212}, /* non-standard but commonly used */ 125 {"empty", 8709}, 126 {"emsp", 8195}, 127 {"_endash", 8211}, /* non-standard but commonly used */ 128 {"ensp", 8194}, 129 {"epsilon", 949}, 130 {"equiv", 8801}, 131 {"eta", 951}, 132 {"eth", 240}, 133 {"euml", 235}, 134 {"euro", 8364}, 135 {"exist", 8707}, 136 {"fnof", 402}, 137 {"forall", 8704}, 138 {"frac12", 189}, 139 {"frac14", 188}, 140 {"frac34", 190}, 141 {"frasl", 8260}, 142 {"gamma", 947}, 143 {"ge", 8805}, 144 {"gt", 62}, 145 {"hArr", 8660}, 146 {"harr", 8596}, 147 {"hearts", 9829}, 148 {"hellip", 8230}, 149 {"iacute", 237}, 150 {"icirc", 238}, 151 {"iexcl", 161}, 152 {"igrave", 236}, 153 {"image", 8465}, 154 {"infin", 8734}, 155 {"int", 8747}, 156 {"iota", 953}, 157 {"iquest", 191}, 158 {"isin", 8712}, 159 {"iuml", 239}, 160 {"kappa", 954}, 161 {"lArr", 8656}, 162 {"lambda", 955}, 163 {"lang", 9001}, 164 {"laquo", 171}, 165 {"larr", 8592}, 166 {"lceil", 8968}, 167 {"_ldots", 8230}, 168 {"ldquo", 8220}, 169 {"le", 8804}, 170 {"lfloor", 8970}, 171 {"lowast", 8727}, 172 {"loz", 9674}, 173 {"lrm", 8206}, 174 {"lsaquo", 8249}, 175 {"lsquo", 8216}, 176 {"lt", 60}, 177 {"macr", 175}, 178 {"mdash", 8212}, 179 {"micro", 181}, 180 {"middot", 183}, 181 {"minus", 8722}, 182 {"mu", 956}, 183 {"nabla", 8711}, 184 {"nbsp", 160}, 185 {"ndash", 8211}, 186 {"ne", 8800}, 187 {"ni", 8715}, 188 {"not", 172}, 189 {"notin", 8713}, 190 {"nsub", 8836}, 191 {"ntilde", 241}, 192 {"nu", 957}, 193 {"oacute", 243}, 194 {"ocirc", 244}, 195 {"oelig", 339}, 196 {"ograve", 242}, 197 {"oline", 8254}, 198 {"omega", 969}, 199 {"omicron", 959}, 200 {"oplus", 8853}, 201 {"or", 8744}, 202 {"ordf", 170}, 203 {"ordm", 186}, 204 {"oslash", 248}, 205 {"otilde", 245}, 206 {"otimes", 8855}, 207 {"ouml", 246}, 208 {"para", 182}, 209 {"part", 8706}, 210 {"permil", 8240}, 211 {"perp", 8869}, 212 {"phi", 966}, 213 {"pi", 960}, 214 {"piv", 982}, 215 {"plusmn", 177}, 216 {"pound", 163}, 217 {"prime", 8242}, 218 {"prod", 8719}, 219 {"prop", 8733}, 220 {"psi", 968}, 221 {"quad", 8193}, 222 {"quot", 34}, 223 {"rArr", 8658}, 224 {"radic", 8730}, 225 {"rang", 9002}, 226 {"raquo", 187}, 227 {"rarr", 8594}, 228 {"rceil", 8969}, 229 {"rdquo", 8221}, 230 {"real", 8476}, 231 {"reg", 174}, 232 {"rfloor", 8971}, 233 {"rho", 961}, 234 {"rlm", 8207}, 235 {"rsaquo", 8250}, 236 {"rsquo", 8217}, 237 {"sbquo", 8218}, 238 {"scaron", 353}, 239 {"sdot", 8901}, 240 {"sect", 167}, 241 {"shy", 173}, 242 {"sigma", 963}, 243 {"sigmaf", 962}, 244 {"sim", 8764}, 245 {"_sp", 8194}, 246 {"spades", 9824}, 247 {"sub", 8834}, 248 {"sube", 8838}, 249 {"sum", 8721}, 250 {"sup", 8835}, 251 {"sup1", 185}, 252 {"sup2", 178}, 253 {"sup3", 179}, 254 {"supe", 8839}, 255 {"szlig", 223}, 256 {"tau", 964}, 257 {"there4", 8756}, 258 {"theta", 952}, 259 {"thetasym", 977}, 260 {"thinsp", 8201}, 261 {"thorn", 254}, 262 {"tilde", 732}, 263 {"times", 215}, 264 {"trade", 8482}, 265 {"uArr", 8657}, 266 {"uacute", 250}, 267 {"uarr", 8593}, 268 {"ucirc", 251}, 269 {"ugrave", 249}, 270 {"uml", 168}, 271 {"upsih", 978}, 272 {"upsilon", 965}, 273 {"uuml", 252}, 274 {"_varepsilon", 8712}, 275 {"varphi", 981}, 276 {"_varpi", 982}, 277 {"varrho", 1009}, 278 {"vdots", 8942}, 279 {"_vsigma", 962}, 280 {"_vtheta", 977}, 281 {"weierp", 8472}, 282 {"xi", 958}, 283 {"yacute", 253}, 284 {"yen", 165}, 285 {"yuml", 255}, 286 {"zeta", 950}, 287 {"zwj", 8205}, 288 {"zwnj", 8204} 289 }; 290 291 static Hchar byrune[nelem(byname)]; 292 293 static int 294 hnamecmp(const void *va, const void *vb) 295 { 296 Hchar *a, *b; 297 298 a = (Hchar*)va; 299 b = (Hchar*)vb; 300 return strcmp(a->s, b->s); 301 } 302 303 static int 304 hrunecmp(const void *va, const void *vb) 305 { 306 Hchar *a, *b; 307 308 a = (Hchar*)va; 309 b = (Hchar*)vb; 310 return a->r - b->r; 311 } 312 313 static void 314 html_init(void) 315 { 316 static int init; 317 int i; 318 319 if(init) 320 return; 321 init = 1; 322 memmove(byrune, byname, sizeof byrune); 323 324 /* Eliminate names we aren't allowed to generate. */ 325 for(i=0; i<nelem(byrune); i++){ 326 if(byrune[i].s[0] == '_'){ 327 byrune[i].r = Runeerror; 328 byname[i].s++; 329 } 330 } 331 332 qsort(byname, nelem(byname), sizeof byname[0], hnamecmp); 333 qsort(byrune, nelem(byrune), sizeof byrune[0], hrunecmp); 334 } 335 336 static Rune 337 findbyname(char *s) 338 { 339 Hchar *h; 340 int n, m, x; 341 342 h = byname; 343 n = nelem(byname); 344 while(n > 0){ 345 m = n/2; 346 x = strcmp(h[m].s, s); 347 if(x == 0) 348 return h[m].r; 349 if(x < 0){ 350 h += m+1; 351 n -= m+1; 352 }else 353 n = m; 354 } 355 return Runeerror; 356 } 357 358 static char* 359 findbyrune(Rune r) 360 { 361 Hchar *h; 362 int n, m; 363 364 if(r == Runeerror) 365 return nil; 366 h = byrune; 367 n = nelem(byrune); 368 while(n > 0){ 369 m = n/2; 370 if(h[m].r == r) 371 return h[m].s; 372 if(h[m].r < r){ 373 h += m+1; 374 n -= m+1; 375 }else 376 n = m; 377 } 378 return nil; 379 } 380 381 void 382 html_in(int fd, long *x, struct convert *out) 383 { 384 char buf[100], *p; 385 Biobuf b; 386 Rune rbuf[N]; 387 Rune *r, *er; 388 int c, i; 389 390 USED(x); 391 392 html_init(); 393 r = rbuf; 394 er = rbuf+N; 395 Binit(&b, fd, OREAD); 396 while((c = Bgetrune(&b)) != Beof){ 397 if(r >= er){ 398 OUT(out, rbuf, r-rbuf); 399 r = rbuf; 400 } 401 if(c == '&'){ 402 buf[0] = c; 403 for(i=1; i<nelem(buf)-1;){ 404 c = Bgetc(&b); 405 if(c == Beof) 406 break; 407 buf[i++] = c; 408 if(strchr("; \t\r\n", c)) 409 break; 410 } 411 buf[i] = 0; 412 if(buf[i-1] == ';'){ 413 buf[i-1] = 0; 414 if((c = findbyname(buf+1)) != Runeerror){ 415 *r++ = c; 416 continue; 417 } 418 buf[i-1] = ';'; 419 if(buf[1] == '#'){ 420 if(buf[2] == 'x') 421 c = strtol(buf+3, &p, 16); 422 else 423 c = strtol(buf+2, &p, 10); 424 if(*p != ';' || c >= NRUNE || c < 0) 425 goto bad; 426 *r++ = c; 427 continue; 428 } 429 } 430 bad: 431 for(p=buf; p<buf+i; ){ 432 p += chartorune(r++, p); 433 if(r >= er){ 434 OUT(out, rbuf, r-rbuf); 435 r = rbuf; 436 } 437 } 438 continue; 439 } 440 *r++ = c; 441 } 442 if(r > rbuf) 443 OUT(out, rbuf, r-rbuf); 444 OUT(out, rbuf, 0); 445 } 446 447 /* 448 * use biobuf because can use more than UTFmax bytes per rune 449 */ 450 void 451 html_out(Rune *r, int n, long *x) 452 { 453 char *s; 454 Biobuf b; 455 Rune *er; 456 457 USED(x); 458 html_init(); 459 Binit(&b, 1, OWRITE); 460 er = r+n; 461 for(; r<er; r++){ 462 if(*r < Runeself) 463 Bputrune(&b, *r); 464 else if((s = findbyrune(*r)) != nil) 465 Bprint(&b, "&%s;", s); 466 else 467 Bprint(&b, "&#%d;", *r); 468 } 469 Bflush(&b); 470 }