plan9port

fork of plan9port with libvec, libstr and libsdb
Log | Files | Refs | README | LICENSE

html.c (5965B)


      1 #include <u.h>
      2 #include <libc.h>
      3 #include <bio.h>
      4 #include <draw.h>
      5 #include <regexp.h>
      6 #include <html.h>
      7 #include <ctype.h>
      8 #include "dat.h"
      9 
/*
 * Pattern that baseurl matches against the start of a URL: a known
 * scheme, "://", and a host-like run of name characters separated by
 * '.' or ':'.  baseurl compiles it into urlprog on first use.
 */
char urlexpr[] = "^(https?|ftp|file|gopher|mailto|news|nntp|telnet|wais|prospero)://([a-zA-Z0-9_@\\-]+([.:][a-zA-Z0-9_@\\-]+)*)";
Reprog	*urlprog;

/* Word-filling state shared by render, renderrunes and emitword. */
int inword = 0;	/* nonzero while renderrunes is scanning a word not yet emitted */
int col = 0;	/* width used on the current output line, counted by emitword */
int wordi = 0;	/* index in the rune string where the pending word starts */
     16 
     17 char*
     18 loadhtml(int fd)
     19 {
     20 	URLwin *u;
     21 	Bytes *b;
     22 	int n;
     23 	char buf[4096];
     24 
     25 	u = emalloc(sizeof(URLwin));
     26 	u->infd = fd;
     27 	u->outfd = 1;
     28 	u->url = estrdup(url);
     29 	u->type = TextHtml;
     30 
     31 	b = emalloc(sizeof(Bytes));
     32 	while((n = read(fd, buf, sizeof buf)) > 0)
     33 		growbytes(b, buf, n);
     34 	if(b->b == nil)
     35 		return nil;	/* empty file */
     36 	rendertext(u, b);
     37 	freeurlwin(u);
     38 	return nil;
     39 }
     40 
     41 char*
     42 runetobyte(Rune *r, int n)
     43 {
     44 	char *s;
     45 
     46 	if(n == 0)
     47 		return emalloc(1);
     48 	s = smprint("%.*S", n, r);
     49 	if(s == nil)
     50 		error("malloc failed");
     51 	return s;
     52 }
     53 
     54 int
     55 closingpunct(int c)
     56 {
     57 	return strchr(".,:;'\")]}>!?", c) != nil;
     58 }
     59 
     60 void
     61 emitword(Bytes *b, Rune *r, int nr)
     62 {
     63 	char *s;
     64 	int space;
     65 
     66 	if(nr == 0)
     67 		return;
     68 	s = smprint("%.*S", nr, r);
     69 	space = (b->n>0) && !isspace(b->b[b->n-1]) && !closingpunct(r[0]);
     70 	if(col>0 && col+space+nr > width){
     71 		growbytes(b, "\n", 1);
     72 		space = 0;
     73 		col = 0;
     74 	}
     75 	if(space && col>0){
     76 		growbytes(b, " ", 1);
     77 		col++;
     78 	}
     79 	growbytes(b, s, strlen(s));
     80 	col += nr;
     81 	free(s);
     82 	inword = 0;
     83 }
     84 
     85 void
     86 renderrunes(Bytes *b, Rune *r)
     87 {
     88 	int i, n;
     89 
     90 	n = runestrlen(r);
     91 	for(i=0; i<n; i++){
     92 		switch(r[i]){
     93 		case '\n':
     94 			if(inword)
     95 				emitword(b, r+wordi, i-wordi);
     96 			col = 0;
     97 			if(b->n == 0)
     98 				break;	/* don't start with blank lines */
     99 			if(b->n<2 || b->b[b->n-1]!='\n' || b->b[b->n-2]!='\n')
    100 				growbytes(b, "\n", 1);
    101 			break;
    102 		case ' ':
    103 			if(inword)
    104 				emitword(b, r+wordi, i-wordi);
    105 			break;
    106 		default:
    107 			if(!inword)
    108 				wordi = i;
    109 			inword = 1;
    110 			break;
    111 		}
    112 	}
    113 	if(inword)
    114 		emitword(b, r+wordi, i-wordi);
    115 }
    116 
    117 void
    118 renderbytes(Bytes *b, char *fmt, ...)
    119 {
    120 	Rune *r;
    121 	va_list arg;
    122 
    123 	va_start(arg, fmt);
    124 	r = runevsmprint(fmt, arg);
    125 	va_end(arg);
    126 	renderrunes(b, r);
    127 	free(r);
    128 }
    129 
    130 char*
    131 baseurl(char *url)
    132 {
    133 	char *base, *slash;
    134 	Resub rs[10];
    135 
    136 	if(url == nil)
    137 		return nil;
    138 	if(urlprog == nil){
    139 		urlprog = regcomp(urlexpr);
    140 		if(urlprog == nil)
    141 			error("can't compile URL regexp");
    142 	}
    143 	memset(rs, 0, sizeof rs);
    144 	if(regexec(urlprog, url, rs, nelem(rs)) == 0)
    145 		return nil;
    146 	base = estrdup(url);
    147 	slash = strrchr(base, '/');
    148 	if(slash!=nil && slash>=&base[rs[0].e.ep-rs[0].s.sp])
    149 		*slash = '\0';
    150 	else
    151 		base[rs[0].e.ep-rs[0].s.sp] = '\0';
    152 	return base;
    153 }
    154 
    155 char*
    156 fullurl(URLwin *u, Rune *rhref)
    157 {
    158 	char *base, *href, *hrefbase;
    159 	char *result;
    160 
    161 	if(rhref == nil)
    162 		return estrdup("NULL URL");
    163 	href = runetobyte(rhref, runestrlen(rhref));
    164 	hrefbase = baseurl(href);
    165 	result = nil;
    166 	if(hrefbase==nil && (base = baseurl(u->url))!=nil){
    167 		result = estrdup(base);
    168 		if(base[strlen(base)-1]!='/' && (href==nil || href[0]!='/'))
    169 			result = eappend(result, "/", "");
    170 		free(base);
    171 	}
    172 	if(href){
    173 		if(result)
    174 			result = eappend(result, "", href);
    175 		else
    176 			result = estrdup(href);
    177 	}
    178 	free(hrefbase);
    179 	if(result == nil)
    180 		return estrdup("***unknown***");
    181 	return result;
    182 }
    183 
    184 void
    185 render(URLwin *u, Bytes *t, Item *items, int curanchor)
    186 {
    187 	Item *il;
    188 	Itext *it;
    189 	Ifloat *ifl;
    190 	Ispacer *is;
    191 	Itable *ita;
    192 	Iimage *im;
    193 	Anchor *a;
    194 	Table *tab;
    195 	Tablecell *cell;
    196 	char *href;
    197 
    198 	inword = 0;
    199 	col = 0;
    200 	wordi = 0;
    201 
    202 	for(il=items; il!=nil; il=il->next){
    203 		if(il->state & IFbrk)
    204 			renderbytes(t, "\n");
    205 		if(il->state & IFbrksp)
    206 			renderbytes(t, "\n");
    207 
    208 		switch(il->tag){
    209 		case Itexttag:
    210 			it = (Itext*)il;
    211 			renderrunes(t, it->s);
    212 			break;
    213 		case Iruletag:
    214 			if(t->n>0 && t->b[t->n-1]!='\n')
    215 				renderbytes(t, "\n");
    216 			renderbytes(t, "=======\n");
    217 			break;
    218 		case Iimagetag:
    219 			if(!aflag)
    220 				break;
    221 			im = (Iimage*)il;
    222 			if(im->imsrc){
    223 				href = fullurl(u, im->imsrc);
    224 				renderbytes(t, "[image %s]", href);
    225 				free(href);
    226 			}
    227 			break;
    228 		case Iformfieldtag:
    229 			if(aflag)
    230 				renderbytes(t, "[formfield]");
    231 			break;
    232 		case Itabletag:
    233 			ita = (Itable*)il;
    234 			tab = ita->table;
    235 			for(cell=tab->cells; cell!=nil; cell=cell->next){
    236 				render(u, t, cell->content, curanchor);
    237 			}
    238 			if(t->n>0 && t->b[t->n-1]!='\n')
    239 				renderbytes(t, "\n");
    240 			break;
    241 		case Ifloattag:
    242 			ifl = (Ifloat*)il;
    243 			render(u, t, ifl->item, curanchor);
    244 			break;
    245 		case Ispacertag:
    246 			is = (Ispacer*)il;
    247 			if(is->spkind != ISPnull)
    248 				renderbytes(t, " ");
    249 			break;
    250 		default:
    251 			error("unknown item tag %d\n", il->tag);
    252 		}
    253 		if(il->anchorid != 0 && il->anchorid!=curanchor){
    254 			for(a=u->docinfo->anchors; a!=nil; a=a->next)
    255 				if(aflag && a->index == il->anchorid){
    256 					href = fullurl(u, a->href);
    257 					renderbytes(t, "[%s]", href);
    258 					free(href);
    259 					break;
    260 				}
    261 			curanchor = il->anchorid;
    262 		}
    263 	}
    264 	if(t->n>0 && t->b[t->n-1]!='\n')
    265 		renderbytes(t, "\n");
    266 }
    267 
    268 void
    269 rerender(URLwin *u)
    270 {
    271 	Bytes *t;
    272 
    273 	t = emalloc(sizeof(Bytes));
    274 
    275 	render(u, t, u->items, 0);
    276 
    277 	if(t->n)
    278 		write(u->outfd, (char*)t->b, t->n);
    279 	free(t->b);
    280 	free(t);
    281 }
    282 
    283 /*
    284  * Somewhat of a hack.  Not a full parse, just looks for strings in the beginning
    285  * of the document (cistrstr only looks at first somewhat bytes).
    286  */
    287 int
    288 charset(char *s)
    289 {
    290 	char *meta, *emeta, *charset;
    291 
    292 	if(defcharset == 0)
    293 		defcharset = ISO_8859_1;
    294 	meta = cistrstr(s, "<meta");
    295 	if(meta == nil)
    296 		return defcharset;
    297 	for(emeta=meta; *emeta!='>' && *emeta!='\0'; emeta++)
    298 		;
    299 	charset = cistrstr(s, "charset=");
    300 	if(charset == nil)
    301 		return defcharset;
    302 	charset += 8;
    303 	if(*charset == '"')
    304 		charset++;
    305 	if(cistrncmp(charset, "utf-8", 5) || cistrncmp(charset, "utf8", 4))
    306 		return UTF_8;
    307 	return defcharset;
    308 }
    309 
    310 void
    311 rendertext(URLwin *u, Bytes *b)
    312 {
    313 	Rune *rurl;
    314 
    315 	rurl = toStr((uchar*)u->url, strlen(u->url), ISO_8859_1);
    316 	u->items = parsehtml(b->b, b->n, rurl, u->type, charset((char*)b->b), &u->docinfo);
    317 /*	free(rurl); */
    318 
    319 	rerender(u);
    320 }
    321 
    322 
    323 void
    324 freeurlwin(URLwin *u)
    325 {
    326 	freeitems(u->items);
    327 	u->items = nil;
    328 	freedocinfo(u->docinfo);
    329 	u->docinfo = nil;
    330 	free(u);
    331 }