html.c (5965B)
1 #include <u.h> 2 #include <libc.h> 3 #include <bio.h> 4 #include <draw.h> 5 #include <regexp.h> 6 #include <html.h> 7 #include <ctype.h> 8 #include "dat.h" 9 10 char urlexpr[] = "^(https?|ftp|file|gopher|mailto|news|nntp|telnet|wais|prospero)://([a-zA-Z0-9_@\\-]+([.:][a-zA-Z0-9_@\\-]+)*)"; 11 Reprog *urlprog; 12 13 int inword = 0; 14 int col = 0; 15 int wordi = 0; 16 17 char* 18 loadhtml(int fd) 19 { 20 URLwin *u; 21 Bytes *b; 22 int n; 23 char buf[4096]; 24 25 u = emalloc(sizeof(URLwin)); 26 u->infd = fd; 27 u->outfd = 1; 28 u->url = estrdup(url); 29 u->type = TextHtml; 30 31 b = emalloc(sizeof(Bytes)); 32 while((n = read(fd, buf, sizeof buf)) > 0) 33 growbytes(b, buf, n); 34 if(b->b == nil) 35 return nil; /* empty file */ 36 rendertext(u, b); 37 freeurlwin(u); 38 return nil; 39 } 40 41 char* 42 runetobyte(Rune *r, int n) 43 { 44 char *s; 45 46 if(n == 0) 47 return emalloc(1); 48 s = smprint("%.*S", n, r); 49 if(s == nil) 50 error("malloc failed"); 51 return s; 52 } 53 54 int 55 closingpunct(int c) 56 { 57 return strchr(".,:;'\")]}>!?", c) != nil; 58 } 59 60 void 61 emitword(Bytes *b, Rune *r, int nr) 62 { 63 char *s; 64 int space; 65 66 if(nr == 0) 67 return; 68 s = smprint("%.*S", nr, r); 69 space = (b->n>0) && !isspace(b->b[b->n-1]) && !closingpunct(r[0]); 70 if(col>0 && col+space+nr > width){ 71 growbytes(b, "\n", 1); 72 space = 0; 73 col = 0; 74 } 75 if(space && col>0){ 76 growbytes(b, " ", 1); 77 col++; 78 } 79 growbytes(b, s, strlen(s)); 80 col += nr; 81 free(s); 82 inword = 0; 83 } 84 85 void 86 renderrunes(Bytes *b, Rune *r) 87 { 88 int i, n; 89 90 n = runestrlen(r); 91 for(i=0; i<n; i++){ 92 switch(r[i]){ 93 case '\n': 94 if(inword) 95 emitword(b, r+wordi, i-wordi); 96 col = 0; 97 if(b->n == 0) 98 break; /* don't start with blank lines */ 99 if(b->n<2 || b->b[b->n-1]!='\n' || b->b[b->n-2]!='\n') 100 growbytes(b, "\n", 1); 101 break; 102 case ' ': 103 if(inword) 104 emitword(b, r+wordi, i-wordi); 105 break; 106 default: 107 if(!inword) 108 wordi = i; 109 inword = 1; 110 break; 111 } 112 } 113 if(inword) 114 emitword(b, r+wordi, i-wordi); 115 } 116 117 void 118 renderbytes(Bytes *b, char *fmt, ...) 119 { 120 Rune *r; 121 va_list arg; 122 123 va_start(arg, fmt); 124 r = runevsmprint(fmt, arg); 125 va_end(arg); 126 renderrunes(b, r); 127 free(r); 128 } 129 130 char* 131 baseurl(char *url) 132 { 133 char *base, *slash; 134 Resub rs[10]; 135 136 if(url == nil) 137 return nil; 138 if(urlprog == nil){ 139 urlprog = regcomp(urlexpr); 140 if(urlprog == nil) 141 error("can't compile URL regexp"); 142 } 143 memset(rs, 0, sizeof rs); 144 if(regexec(urlprog, url, rs, nelem(rs)) == 0) 145 return nil; 146 base = estrdup(url); 147 slash = strrchr(base, '/'); 148 if(slash!=nil && slash>=&base[rs[0].e.ep-rs[0].s.sp]) 149 *slash = '\0'; 150 else 151 base[rs[0].e.ep-rs[0].s.sp] = '\0'; 152 return base; 153 } 154 155 char* 156 fullurl(URLwin *u, Rune *rhref) 157 { 158 char *base, *href, *hrefbase; 159 char *result; 160 161 if(rhref == nil) 162 return estrdup("NULL URL"); 163 href = runetobyte(rhref, runestrlen(rhref)); 164 hrefbase = baseurl(href); 165 result = nil; 166 if(hrefbase==nil && (base = baseurl(u->url))!=nil){ 167 result = estrdup(base); 168 if(base[strlen(base)-1]!='/' && (href==nil || href[0]!='/')) 169 result = eappend(result, "/", ""); 170 free(base); 171 } 172 if(href){ 173 if(result) 174 result = eappend(result, "", href); 175 else 176 result = estrdup(href); 177 } 178 free(hrefbase); 179 if(result == nil) 180 return estrdup("***unknown***"); 181 return result; 182 } 183 184 void 185 render(URLwin *u, Bytes *t, Item *items, int curanchor) 186 { 187 Item *il; 188 Itext *it; 189 Ifloat *ifl; 190 Ispacer *is; 191 Itable *ita; 192 Iimage *im; 193 Anchor *a; 194 Table *tab; 195 Tablecell *cell; 196 char *href; 197 198 inword = 0; 199 col = 0; 200 wordi = 0; 201 202 for(il=items; il!=nil; il=il->next){ 203 if(il->state & IFbrk) 204 renderbytes(t, "\n"); 205 if(il->state & IFbrksp) 206 renderbytes(t, "\n"); 207 208 switch(il->tag){ 209 case Itexttag: 210 it = (Itext*)il; 211 renderrunes(t, it->s); 212 break; 213 case Iruletag: 214 if(t->n>0 && t->b[t->n-1]!='\n') 215 renderbytes(t, "\n"); 216 renderbytes(t, "=======\n"); 217 break; 218 case Iimagetag: 219 if(!aflag) 220 break; 221 im = (Iimage*)il; 222 if(im->imsrc){ 223 href = fullurl(u, im->imsrc); 224 renderbytes(t, "[image %s]", href); 225 free(href); 226 } 227 break; 228 case Iformfieldtag: 229 if(aflag) 230 renderbytes(t, "[formfield]"); 231 break; 232 case Itabletag: 233 ita = (Itable*)il; 234 tab = ita->table; 235 for(cell=tab->cells; cell!=nil; cell=cell->next){ 236 render(u, t, cell->content, curanchor); 237 } 238 if(t->n>0 && t->b[t->n-1]!='\n') 239 renderbytes(t, "\n"); 240 break; 241 case Ifloattag: 242 ifl = (Ifloat*)il; 243 render(u, t, ifl->item, curanchor); 244 break; 245 case Ispacertag: 246 is = (Ispacer*)il; 247 if(is->spkind != ISPnull) 248 renderbytes(t, " "); 249 break; 250 default: 251 error("unknown item tag %d\n", il->tag); 252 } 253 if(il->anchorid != 0 && il->anchorid!=curanchor){ 254 for(a=u->docinfo->anchors; a!=nil; a=a->next) 255 if(aflag && a->index == il->anchorid){ 256 href = fullurl(u, a->href); 257 renderbytes(t, "[%s]", href); 258 free(href); 259 break; 260 } 261 curanchor = il->anchorid; 262 } 263 } 264 if(t->n>0 && t->b[t->n-1]!='\n') 265 renderbytes(t, "\n"); 266 } 267 268 void 269 rerender(URLwin *u) 270 { 271 Bytes *t; 272 273 t = emalloc(sizeof(Bytes)); 274 275 render(u, t, u->items, 0); 276 277 if(t->n) 278 write(u->outfd, (char*)t->b, t->n); 279 free(t->b); 280 free(t); 281 } 282 283 /* 284 * Somewhat of a hack. Not a full parse, just looks for strings in the beginning 285 * of the document (cistrstr only looks at first somewhat bytes). 286 */ 287 int 288 charset(char *s) 289 { 290 char *meta, *emeta, *charset; 291 292 if(defcharset == 0) 293 defcharset = ISO_8859_1; 294 meta = cistrstr(s, "<meta"); 295 if(meta == nil) 296 return defcharset; 297 for(emeta=meta; *emeta!='>' && *emeta!='\0'; emeta++) 298 ; 299 charset = cistrstr(s, "charset="); 300 if(charset == nil) 301 return defcharset; 302 charset += 8; 303 if(*charset == '"') 304 charset++; 305 if(cistrncmp(charset, "utf-8", 5) || cistrncmp(charset, "utf8", 4)) 306 return UTF_8; 307 return defcharset; 308 } 309 310 void 311 rendertext(URLwin *u, Bytes *b) 312 { 313 Rune *rurl; 314 315 rurl = toStr((uchar*)u->url, strlen(u->url), ISO_8859_1); 316 u->items = parsehtml(b->b, b->n, rurl, u->type, charset((char*)b->b), &u->docinfo); 317 /* free(rurl); */ 318 319 rerender(u); 320 } 321 322 323 void 324 freeurlwin(URLwin *u) 325 { 326 freeitems(u->items); 327 u->items = nil; 328 freedocinfo(u->docinfo); 329 u->docinfo = nil; 330 free(u); 331 }