plan9port

fork of plan9port with libvec, libstr and libsdb
Log | Files | Refs | README | LICENSE

html.c (7691B)


      1 #include <u.h>
      2 #include <libc.h>
      3 #include <bio.h>
      4 #include "hdr.h"
      5 #include "conv.h"
      6 
      7 typedef struct Hchar Hchar;
      8 struct Hchar
      9 {
     10 	char *s;
     11 	Rune r;
     12 };
     13 
/*
 * &lt;, &gt;, &quot;, and &amp; are recognized on input but
 * never generated on output: html_out writes every rune below
 * Runeself literally.
 */
     15 
     16 /*
     17  * Names beginning with _ are names we recognize
     18  * (without the underscore) but will not generate,
     19  * because they are nonstandard.
     20  */
     21 static Hchar byname[] =
     22 {
     23 	{"AElig", 198},
     24 	{"Aacute", 193},
     25 	{"Acirc", 194},
     26 	{"Agrave", 192},
     27 	{"Alpha", 913},
     28 	{"Aring", 197},
     29 	{"Atilde", 195},
     30 	{"Auml", 196},
     31 	{"Beta", 914},
     32 	{"Ccedil", 199},
     33 	{"Chi", 935},
     34 	{"Dagger", 8225},
     35 	{"Delta", 916},
     36 	{"ETH", 208},
     37 	{"Eacute", 201},
     38 	{"Ecirc", 202},
     39 	{"Egrave", 200},
     40 	{"Epsilon", 917},
     41 	{"Eta", 919},
     42 	{"Euml", 203},
     43 	{"Gamma", 915},
     44 	{"Iacute", 205},
     45 	{"Icirc", 206},
     46 	{"Igrave", 204},
     47 	{"Iota", 921},
     48 	{"Iuml", 207},
     49 	{"Kappa", 922},
     50 	{"Lambda", 923},
     51 	{"Mu", 924},
     52 	{"Ntilde", 209},
     53 	{"Nu", 925},
     54 	{"OElig", 338},
     55 	{"Oacute", 211},
     56 	{"Ocirc", 212},
     57 	{"Ograve", 210},
     58 	{"Omega", 937},
     59 	{"Omicron", 927},
     60 	{"Oslash", 216},
     61 	{"Otilde", 213},
     62 	{"Ouml", 214},
     63 	{"Phi", 934},
     64 	{"Pi", 928},
     65 	{"Prime", 8243},
     66 	{"Psi", 936},
     67 	{"Rho", 929},
     68 	{"Scaron", 352},
     69 	{"Sigma", 931},
     70 	{"THORN", 222},
     71 	{"Tau", 932},
     72 	{"Theta", 920},
     73 	{"Uacute", 218},
     74 	{"Ucirc", 219},
     75 	{"Ugrave", 217},
     76 	{"Upsilon", 933},
     77 	{"Uuml", 220},
     78 	{"Xi", 926},
     79 	{"Yacute", 221},
     80 	{"Yuml", 376},
     81 	{"Zeta", 918},
     82 	{"aacute", 225},
     83 	{"acirc", 226},
     84 	{"acute", 180},
     85 	{"aelig", 230},
     86 	{"agrave", 224},
     87 	{"alefsym", 8501},
     88 	{"alpha", 945},
     89 	{"amp", 38},
     90 	{"and", 8743},
     91 	{"ang", 8736},
     92 	{"aring", 229},
     93 	{"asymp", 8776},
     94 	{"atilde", 227},
     95 	{"auml", 228},
     96 	{"bdquo", 8222},
     97 	{"beta", 946},
     98 	{"brvbar", 166},
     99 	{"bull", 8226},
    100 	{"cap", 8745},
    101 	{"ccedil", 231},
    102 	{"cdots", 8943},
    103 	{"cedil", 184},
    104 	{"cent", 162},
    105 	{"chi", 967},
    106 	{"circ", 710},
    107 	{"clubs", 9827},
    108 	{"cong", 8773},
    109 	{"copy", 169},
    110 	{"crarr", 8629},
    111 	{"cup", 8746},
    112 	{"curren", 164},
    113 	{"dArr", 8659},
    114 	{"dagger", 8224},
    115 	{"darr", 8595},
    116 	{"ddots", 8945},
    117 	{"deg", 176},
    118 	{"delta", 948},
    119 	{"diams", 9830},
    120 	{"divide", 247},
    121 	{"eacute", 233},
    122 	{"ecirc", 234},
    123 	{"egrave", 232},
    124 	{"_emdash", 8212},	/* non-standard but commonly used */
    125 	{"empty", 8709},
    126 	{"emsp", 8195},
    127 	{"_endash", 8211},	/* non-standard but commonly used */
    128 	{"ensp", 8194},
    129 	{"epsilon", 949},
    130 	{"equiv", 8801},
    131 	{"eta", 951},
    132 	{"eth", 240},
    133 	{"euml", 235},
    134 	{"euro", 8364},
    135 	{"exist", 8707},
    136 	{"fnof", 402},
    137 	{"forall", 8704},
    138 	{"frac12", 189},
    139 	{"frac14", 188},
    140 	{"frac34", 190},
    141 	{"frasl", 8260},
    142 	{"gamma", 947},
    143 	{"ge", 8805},
    144 	{"gt", 62},
    145 	{"hArr", 8660},
    146 	{"harr", 8596},
    147 	{"hearts", 9829},
    148 	{"hellip", 8230},
    149 	{"iacute", 237},
    150 	{"icirc", 238},
    151 	{"iexcl", 161},
    152 	{"igrave", 236},
    153 	{"image", 8465},
    154 	{"infin", 8734},
    155 	{"int", 8747},
    156 	{"iota", 953},
    157 	{"iquest", 191},
    158 	{"isin", 8712},
    159 	{"iuml", 239},
    160 	{"kappa", 954},
    161 	{"lArr", 8656},
    162 	{"lambda", 955},
    163 	{"lang", 9001},
    164 	{"laquo", 171},
    165 	{"larr", 8592},
    166 	{"lceil", 8968},
    167 	{"_ldots", 8230},
    168 	{"ldquo", 8220},
    169 	{"le", 8804},
    170 	{"lfloor", 8970},
    171 	{"lowast", 8727},
    172 	{"loz", 9674},
    173 	{"lrm", 8206},
    174 	{"lsaquo", 8249},
    175 	{"lsquo", 8216},
    176 	{"lt", 60},
    177 	{"macr", 175},
    178 	{"mdash", 8212},
    179 	{"micro", 181},
    180 	{"middot", 183},
    181 	{"minus", 8722},
    182 	{"mu", 956},
    183 	{"nabla", 8711},
    184 	{"nbsp", 160},
    185 	{"ndash", 8211},
    186 	{"ne", 8800},
    187 	{"ni", 8715},
    188 	{"not", 172},
    189 	{"notin", 8713},
    190 	{"nsub", 8836},
    191 	{"ntilde", 241},
    192 	{"nu", 957},
    193 	{"oacute", 243},
    194 	{"ocirc", 244},
    195 	{"oelig", 339},
    196 	{"ograve", 242},
    197 	{"oline", 8254},
    198 	{"omega", 969},
    199 	{"omicron", 959},
    200 	{"oplus", 8853},
    201 	{"or", 8744},
    202 	{"ordf", 170},
    203 	{"ordm", 186},
    204 	{"oslash", 248},
    205 	{"otilde", 245},
    206 	{"otimes", 8855},
    207 	{"ouml", 246},
    208 	{"para", 182},
    209 	{"part", 8706},
    210 	{"permil", 8240},
    211 	{"perp", 8869},
    212 	{"phi", 966},
    213 	{"pi", 960},
    214 	{"piv", 982},
    215 	{"plusmn", 177},
    216 	{"pound", 163},
    217 	{"prime", 8242},
    218 	{"prod", 8719},
    219 	{"prop", 8733},
    220 	{"psi", 968},
    221 	{"quad", 8193},
    222 	{"quot", 34},
    223 	{"rArr", 8658},
    224 	{"radic", 8730},
    225 	{"rang", 9002},
    226 	{"raquo", 187},
    227 	{"rarr", 8594},
    228 	{"rceil", 8969},
    229 	{"rdquo", 8221},
    230 	{"real", 8476},
    231 	{"reg", 174},
    232 	{"rfloor", 8971},
    233 	{"rho", 961},
    234 	{"rlm", 8207},
    235 	{"rsaquo", 8250},
    236 	{"rsquo", 8217},
    237 	{"sbquo", 8218},
    238 	{"scaron", 353},
    239 	{"sdot", 8901},
    240 	{"sect", 167},
    241 	{"shy", 173},
    242 	{"sigma", 963},
    243 	{"sigmaf", 962},
    244 	{"sim", 8764},
    245 	{"_sp", 8194},
    246 	{"spades", 9824},
    247 	{"sub", 8834},
    248 	{"sube", 8838},
    249 	{"sum", 8721},
    250 	{"sup", 8835},
    251 	{"sup1", 185},
    252 	{"sup2", 178},
    253 	{"sup3", 179},
    254 	{"supe", 8839},
    255 	{"szlig", 223},
    256 	{"tau", 964},
    257 	{"there4", 8756},
    258 	{"theta", 952},
    259 	{"thetasym", 977},
    260 	{"thinsp", 8201},
    261 	{"thorn", 254},
    262 	{"tilde", 732},
    263 	{"times", 215},
    264 	{"trade", 8482},
    265 	{"uArr", 8657},
    266 	{"uacute", 250},
    267 	{"uarr", 8593},
    268 	{"ucirc", 251},
    269 	{"ugrave", 249},
    270 	{"uml", 168},
    271 	{"upsih", 978},
    272 	{"upsilon", 965},
    273 	{"uuml", 252},
    274 	{"_varepsilon", 8712},
    275 	{"varphi", 981},
    276 	{"_varpi", 982},
    277 	{"varrho", 1009},
    278 	{"vdots", 8942},
    279 	{"_vsigma", 962},
    280 	{"_vtheta", 977},
    281 	{"weierp", 8472},
    282 	{"xi", 958},
    283 	{"yacute", 253},
    284 	{"yen", 165},
    285 	{"yuml", 255},
    286 	{"zeta", 950},
    287 	{"zwj", 8205},
    288 	{"zwnj", 8204}
    289 };
    290 
    291 static Hchar byrune[nelem(byname)];
    292 
    293 static int
    294 hnamecmp(const void *va, const void *vb)
    295 {
    296 	Hchar *a, *b;
    297 
    298 	a = (Hchar*)va;
    299 	b = (Hchar*)vb;
    300 	return strcmp(a->s, b->s);
    301 }
    302 
    303 static int
    304 hrunecmp(const void *va, const void *vb)
    305 {
    306 	Hchar *a, *b;
    307 
    308 	a = (Hchar*)va;
    309 	b = (Hchar*)vb;
    310 	return a->r - b->r;
    311 }
    312 
    313 static void
    314 html_init(void)
    315 {
    316 	static int init;
    317 	int i;
    318 
    319 	if(init)
    320 		return;
    321 	init = 1;
    322 	memmove(byrune, byname, sizeof byrune);
    323 
    324 	/* Eliminate names we aren't allowed to generate. */
    325 	for(i=0; i<nelem(byrune); i++){
    326 		if(byrune[i].s[0] == '_'){
    327 			byrune[i].r = Runeerror;
    328 			byname[i].s++;
    329 		}
    330 	}
    331 
    332 	qsort(byname, nelem(byname), sizeof byname[0], hnamecmp);
    333 	qsort(byrune, nelem(byrune), sizeof byrune[0], hrunecmp);
    334 }
    335 
    336 static Rune
    337 findbyname(char *s)
    338 {
    339 	Hchar *h;
    340 	int n, m, x;
    341 
    342 	h = byname;
    343 	n = nelem(byname);
    344 	while(n > 0){
    345 		m = n/2;
    346 		x = strcmp(h[m].s, s);
    347 		if(x == 0)
    348 			return h[m].r;
    349 		if(x < 0){
    350 			h += m+1;
    351 			n -= m+1;
    352 		}else
    353 			n = m;
    354 	}
    355 	return Runeerror;
    356 }
    357 
    358 static char*
    359 findbyrune(Rune r)
    360 {
    361 	Hchar *h;
    362 	int n, m;
    363 
    364 	if(r == Runeerror)
    365 		return nil;
    366 	h = byrune;
    367 	n = nelem(byrune);
    368 	while(n > 0){
    369 		m = n/2;
    370 		if(h[m].r == r)
    371 			return h[m].s;
    372 		if(h[m].r < r){
    373 			h += m+1;
    374 			n -= m+1;
    375 		}else
    376 			n = m;
    377 	}
    378 	return nil;
    379 }
    380 
    381 void
    382 html_in(int fd, long *x, struct convert *out)
    383 {
    384 	char buf[100], *p;
    385 	Biobuf b;
    386 	Rune rbuf[N];
    387 	Rune *r, *er;
    388 	int c, i;
    389 
    390 	USED(x);
    391 
    392 	html_init();
    393 	r = rbuf;
    394 	er = rbuf+N;
    395 	Binit(&b, fd, OREAD);
    396 	while((c = Bgetrune(&b)) != Beof){
    397 		if(r >= er){
    398 			OUT(out, rbuf, r-rbuf);
    399 			r = rbuf;
    400 		}
    401 		if(c == '&'){
    402 			buf[0] = c;
    403 			for(i=1; i<nelem(buf)-1;){
    404 				c = Bgetc(&b);
    405 				if(c == Beof)
    406 					break;
    407 				buf[i++] = c;
    408 				if(strchr("; \t\r\n", c))
    409 					break;
    410 			}
    411 			buf[i] = 0;
    412 			if(buf[i-1] == ';'){
    413 				buf[i-1] = 0;
    414 				if((c = findbyname(buf+1)) != Runeerror){
    415 					*r++ = c;
    416 					continue;
    417 				}
    418 				buf[i-1] = ';';
    419 				if(buf[1] == '#'){
    420 					if(buf[2] == 'x')
    421 						c = strtol(buf+3, &p, 16);
    422 					else
    423 						c = strtol(buf+2, &p, 10);
    424 					if(*p != ';' || c >= NRUNE || c < 0)
    425 						goto bad;
    426 					*r++ = c;
    427 					continue;
    428 				}
    429 			}
    430 		bad:
    431 			for(p=buf; p<buf+i; ){
    432 				p += chartorune(r++, p);
    433 				if(r >= er){
    434 					OUT(out, rbuf, r-rbuf);
    435 					r = rbuf;
    436 				}
    437 			}
    438 			continue;
    439 		}
    440 		*r++ = c;
    441 	}
    442 	if(r > rbuf)
    443 		OUT(out, rbuf, r-rbuf);
    444 	OUT(out, rbuf, 0);
    445 }
    446 
    447 /*
    448  * use biobuf because can use more than UTFmax bytes per rune
    449  */
    450 void
    451 html_out(Rune *r, int n, long *x)
    452 {
    453 	char *s;
    454 	Biobuf b;
    455 	Rune *er;
    456 
    457 	USED(x);
    458 	html_init();
    459 	Binit(&b, 1, OWRITE);
    460 	er = r+n;
    461 	for(; r<er; r++){
    462 		if(*r < Runeself)
    463 			Bputrune(&b, *r);
    464 		else if((s = findbyrune(*r)) != nil)
    465 			Bprint(&b, "&%s;", s);
    466 		else
    467 			Bprint(&b, "&#%d;", *r);
    468 	}
    469 	Bflush(&b);
    470 }