utf.c (5985B)
1 #ifdef PLAN9 2 #include <u.h> 3 #include <libc.h> 4 #include <bio.h> 5 #ifdef PLAN9PORT 6 #include <errno.h> 7 #else 8 extern int errno; 9 #endif 10 #else 11 #include <sys/types.h> 12 #include <stdio.h> 13 #include <stdlib.h> 14 #include <string.h> 15 #include <unistd.h> 16 #include <errno.h> 17 #include "plan9.h" 18 #endif 19 #include "hdr.h" 20 #ifndef EILSEQ 21 #define EILSEQ 9998 22 #endif 23 24 /* 25 the our_* routines are implementations for the corresponding library 26 routines. for a while, i tried to actually name them wctomb etc 27 but stopped that after i found a system which made wchar_t an 28 unsigned char. 29 */ 30 31 int our_wctomb(char *s, unsigned long wc); 32 int our_mbtowc(unsigned long *p, char *s, unsigned n); 33 int runetoisoutf(char *str, Rune *rune); 34 int fullisorune(char *str, int n); 35 int isochartorune(Rune *rune, char *str); 36 37 void 38 utf_in(int fd, long *notused, struct convert *out) 39 { 40 char buf[N]; 41 int i, j, c, n, tot; 42 ulong l; 43 44 USED(notused); 45 tot = 0; 46 while((n = read(fd, buf+tot, N-tot)) >= 0){ 47 tot += n; 48 for(i=j=0; i<=tot-UTFmax || (i<tot && (n==0 || fullrune(buf+i, tot-i))); ){ 49 c = our_mbtowc(&l, buf+i, tot-i); 50 if(c == -1){ 51 if(squawk) 52 EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i); 53 if(clean){ 54 i++; 55 continue; 56 } 57 nerrors++; 58 l = Runeerror; 59 c = 1; 60 } 61 runes[j++] = l; 62 i += c; 63 } 64 OUT(out, runes, j); 65 tot -= i; 66 ninput += i; 67 if(tot) 68 memmove(buf, buf+i, tot); 69 if(n == 0) 70 break; 71 } 72 OUT(out, runes, 0); 73 } 74 75 void 76 utf_out(Rune *base, int n, long *notused) 77 { 78 char *p; 79 Rune *r; 80 81 USED(notused); 82 nrunes += n; 83 for(r = base, p = obuf; n-- > 0; r++){ 84 p += our_wctomb(p, *r); 85 } 86 noutput += p-obuf; 87 write(1, obuf, p-obuf); 88 } 89 90 void 91 isoutf_in(int fd, long *notused, struct convert *out) 92 { 93 char buf[N]; 94 int i, j, c, n, tot; 95 96 USED(notused); 97 tot = 0; 98 while((n = read(fd, buf+tot, N-tot)) >= 0){ 99 tot += n; 100 for(i=j=0; i<tot; ){ 101 if(!fullisorune(buf+i, tot-i)) 102 break; 103 c = isochartorune(&runes[j], buf+i); 104 if(runes[j] == Runeerror && c == 1){ 105 if(squawk) 106 EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i); 107 if(clean){ 108 i++; 109 continue; 110 } 111 nerrors++; 112 } 113 j++; 114 i += c; 115 } 116 OUT(out, runes, j); 117 tot -= i; 118 ninput += i; 119 if(tot) 120 memmove(buf, buf+i, tot); 121 if(n == 0) 122 break; 123 } 124 OUT(out, runes, 0); 125 } 126 127 void 128 isoutf_out(Rune *base, int n, long *notused) 129 { 130 char *p; 131 Rune *r; 132 133 USED(notused); 134 nrunes += n; 135 for(r = base, p = obuf; n-- > 0; r++) 136 p += runetoisoutf(p, r); 137 noutput += p-obuf; 138 write(1, obuf, p-obuf); 139 } 140 141 142 int 143 isochartorune(Rune *rune, char *str) 144 { 145 return chartorune(rune, str); 146 } 147 148 int 149 runetoisoutf(char *str, Rune *rune) 150 { 151 return runetochar(str, rune); 152 } 153 154 int 155 fullisorune(char *str, int n) 156 { 157 return fullrune(str, n); 158 } 159 160 enum 161 { 162 T1 = 0x00, 163 Tx = 0x80, 164 T2 = 0xC0, 165 T3 = 0xE0, 166 T4 = 0xF0, 167 T5 = 0xF8, 168 T6 = 0xFC, 169 170 Bit1 = 7, 171 Bitx = 6, 172 Bit2 = 5, 173 Bit3 = 4, 174 Bit4 = 3, 175 Bit5 = 2, 176 Bit6 = 2, 177 178 Mask1 = (1<<Bit1)-1, 179 Maskx = (1<<Bitx)-1, 180 Mask2 = (1<<Bit2)-1, 181 Mask3 = (1<<Bit3)-1, 182 Mask4 = (1<<Bit4)-1, 183 Mask5 = (1<<Bit5)-1, 184 Mask6 = (1<<Bit6)-1, 185 186 Wchar1 = (1UL<<Bit1)-1, 187 Wchar2 = (1UL<<(Bit2+Bitx))-1, 188 Wchar3 = (1UL<<(Bit3+2*Bitx))-1, 189 Wchar4 = (1UL<<(Bit4+3*Bitx))-1, 190 Wchar5 = (1UL<<(Bit5+4*Bitx))-1 191 }; 192 193 int 194 our_wctomb(char *s, unsigned long wc) 195 { 196 if(s == 0) 197 return 0; /* no shift states */ 198 if(wc & ~Wchar2) { 199 if(wc & ~Wchar4) { 200 if(wc & ~Wchar5) { 201 /* 6 bytes */ 202 s[0] = T6 | ((wc >> 5*Bitx) & Mask6); 203 s[1] = Tx | ((wc >> 4*Bitx) & Maskx); 204 s[2] = Tx | ((wc >> 3*Bitx) & Maskx); 205 s[3] = Tx | ((wc >> 2*Bitx) & Maskx); 206 s[4] = Tx | ((wc >> 1*Bitx) & Maskx); 207 s[5] = Tx | (wc & Maskx); 208 return 6; 209 } 210 /* 5 bytes */ 211 s[0] = T5 | (wc >> 4*Bitx); 212 s[1] = Tx | ((wc >> 3*Bitx) & Maskx); 213 s[2] = Tx | ((wc >> 2*Bitx) & Maskx); 214 s[3] = Tx | ((wc >> 1*Bitx) & Maskx); 215 s[4] = Tx | (wc & Maskx); 216 return 5; 217 } 218 if(wc & ~Wchar3) { 219 /* 4 bytes */ 220 s[0] = T4 | (wc >> 3*Bitx); 221 s[1] = Tx | ((wc >> 2*Bitx) & Maskx); 222 s[2] = Tx | ((wc >> 1*Bitx) & Maskx); 223 s[3] = Tx | (wc & Maskx); 224 return 4; 225 } 226 /* 3 bytes */ 227 s[0] = T3 | (wc >> 2*Bitx); 228 s[1] = Tx | ((wc >> 1*Bitx) & Maskx); 229 s[2] = Tx | (wc & Maskx); 230 return 3; 231 } 232 if(wc & ~Wchar1) { 233 /* 2 bytes */ 234 s[0] = T2 | (wc >> 1*Bitx); 235 s[1] = Tx | (wc & Maskx); 236 return 2; 237 } 238 /* 1 byte */ 239 s[0] = T1 | wc; 240 return 1; 241 } 242 243 int 244 our_mbtowc(unsigned long *p, char *s, unsigned n) 245 { 246 uchar *us; 247 int c0, c1, c2, c3, c4, c5; 248 unsigned long wc; 249 250 if(s == 0) 251 return 0; /* no shift states */ 252 253 if(n < 1) 254 goto bad; 255 us = (uchar*)s; 256 c0 = us[0]; 257 if(c0 >= T3) { 258 if(n < 3) 259 goto bad; 260 c1 = us[1] ^ Tx; 261 c2 = us[2] ^ Tx; 262 if((c1|c2) & T2) 263 goto bad; 264 if(c0 >= T5) { 265 if(n < 5) 266 goto bad; 267 c3 = us[3] ^ Tx; 268 c4 = us[4] ^ Tx; 269 if((c3|c4) & T2) 270 goto bad; 271 if(c0 >= T6) { 272 /* 6 bytes */ 273 if(n < 6) 274 goto bad; 275 c5 = us[5] ^ Tx; 276 if(c5 & T2) 277 goto bad; 278 wc = ((((((((((c0 & Mask6) << Bitx) | 279 c1) << Bitx) | c2) << Bitx) | 280 c3) << Bitx) | c4) << Bitx) | c5; 281 if(wc <= Wchar5) 282 goto bad; 283 *p = wc; 284 return 6; 285 } 286 /* 5 bytes */ 287 wc = ((((((((c0 & Mask5) << Bitx) | 288 c1) << Bitx) | c2) << Bitx) | 289 c3) << Bitx) | c4; 290 if(wc <= Wchar4) 291 goto bad; 292 *p = wc; 293 return 5; 294 } 295 if(c0 >= T4) { 296 /* 4 bytes */ 297 if(n < 4) 298 goto bad; 299 c3 = us[3] ^ Tx; 300 if(c3 & T2) 301 goto bad; 302 wc = ((((((c0 & Mask4) << Bitx) | 303 c1) << Bitx) | c2) << Bitx) | 304 c3; 305 if(wc <= Wchar3) 306 goto bad; 307 *p = wc; 308 return 4; 309 } 310 /* 3 bytes */ 311 wc = ((((c0 & Mask3) << Bitx) | 312 c1) << Bitx) | c2; 313 if(wc <= Wchar2) 314 goto bad; 315 *p = wc; 316 return 3; 317 } 318 if(c0 >= T2) { 319 /* 2 bytes */ 320 if(n < 2) 321 goto bad; 322 c1 = us[1] ^ Tx; 323 if(c1 & T2) 324 goto bad; 325 wc = ((c0 & Mask2) << Bitx) | 326 c1; 327 if(wc <= Wchar1) 328 goto bad; 329 *p = wc; 330 return 2; 331 } 332 /* 1 byte */ 333 if(c0 >= Tx) 334 goto bad; 335 *p = c0; 336 return 1; 337 338 bad: 339 errno = EILSEQ; 340 return -1; 341 }