plan9port

fork of plan9port with libvec, libstr and libsdb
Log | Files | Refs | README | LICENSE

utf.c (5985B)


      1 #ifdef PLAN9
      2 #include	<u.h>
      3 #include	<libc.h>
      4 #include	<bio.h>
      5 #ifdef PLAN9PORT
      6 #include	<errno.h>
      7 #else
      8 extern int errno;
      9 #endif
     10 #else
     11 #include	<sys/types.h>
     12 #include	<stdio.h>
     13 #include	<stdlib.h>
     14 #include	<string.h>
     15 #include	<unistd.h>
     16 #include	<errno.h>
     17 #include	"plan9.h"
     18 #endif
     19 #include	"hdr.h"
     20 #ifndef EILSEQ
     21 #define EILSEQ 9998
     22 #endif
     23 
     24 /*
     25 	the our_* routines are implementations for the corresponding library
     26 	routines. for a while, i tried to actually name them wctomb etc
     27 	but stopped that after i found a system which made wchar_t an
     28 	unsigned char.
     29 */
     30 
/* UTF-8 encoder and decoder; both are defined further down in this file. */
int our_wctomb(char *s, unsigned long wc);
int our_mbtowc(unsigned long *p, char *s, unsigned n);
/* One-line wrappers (defined below) around runetochar, fullrune and chartorune. */
int runetoisoutf(char *str, Rune *rune);
int fullisorune(char *str, int n);
int isochartorune(Rune *rune, char *str);
     36 
     37 void
     38 utf_in(int fd, long *notused, struct convert *out)
     39 {
     40 	char buf[N];
     41 	int i, j, c, n, tot;
     42 	ulong l;
     43 
     44 	USED(notused);
     45 	tot = 0;
     46 	while((n = read(fd, buf+tot, N-tot)) >= 0){
     47 		tot += n;
     48 		for(i=j=0; i<=tot-UTFmax || (i<tot && (n==0 || fullrune(buf+i, tot-i))); ){
     49 			c = our_mbtowc(&l, buf+i, tot-i);
     50 			if(c == -1){
     51 				if(squawk)
     52 					EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
     53 				if(clean){
     54 					i++;
     55 					continue;
     56 				}
     57 				nerrors++;
     58 				l = Runeerror;
     59 				c = 1;
     60 			}
     61 			runes[j++] = l;
     62 			i += c;
     63 		}
     64 		OUT(out, runes, j);
     65 		tot -= i;
     66 		ninput += i;
     67 		if(tot)
     68 			memmove(buf, buf+i, tot);
     69 		if(n == 0)
     70 			break;
     71 	}
     72 	OUT(out, runes, 0);
     73 }
     74 
     75 void
     76 utf_out(Rune *base, int n, long *notused)
     77 {
     78 	char *p;
     79 	Rune *r;
     80 
     81 	USED(notused);
     82 	nrunes += n;
     83 	for(r = base, p = obuf; n-- > 0; r++){
     84 		p += our_wctomb(p, *r);
     85 	}
     86 	noutput += p-obuf;
     87 	write(1, obuf, p-obuf);
     88 }
     89 
     90 void
     91 isoutf_in(int fd, long *notused, struct convert *out)
     92 {
     93 	char buf[N];
     94 	int i, j, c, n, tot;
     95 
     96 	USED(notused);
     97 	tot = 0;
     98 	while((n = read(fd, buf+tot, N-tot)) >= 0){
     99 		tot += n;
    100 		for(i=j=0; i<tot; ){
    101 			if(!fullisorune(buf+i, tot-i))
    102 				break;
    103 			c = isochartorune(&runes[j], buf+i);
    104 			if(runes[j] == Runeerror && c == 1){
    105 				if(squawk)
    106 					EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
    107 				if(clean){
    108 					i++;
    109 					continue;
    110 				}
    111 				nerrors++;
    112 			}
    113 			j++;
    114 			i += c;
    115 		}
    116 		OUT(out, runes, j);
    117 		tot -= i;
    118 		ninput += i;
    119 		if(tot)
    120 			memmove(buf, buf+i, tot);
    121 		if(n == 0)
    122 			break;
    123 	}
    124 	OUT(out, runes, 0);
    125 }
    126 
    127 void
    128 isoutf_out(Rune *base, int n, long *notused)
    129 {
    130 	char *p;
    131 	Rune *r;
    132 
    133 	USED(notused);
    134 	nrunes += n;
    135 	for(r = base, p = obuf; n-- > 0; r++)
    136 		p += runetoisoutf(p, r);
    137 	noutput += p-obuf;
    138 	write(1, obuf, p-obuf);
    139 }
    140 
    141 
    142 int
    143 isochartorune(Rune *rune, char *str)
    144 {
    145 	return chartorune(rune, str);
    146 }
    147 
    148 int
    149 runetoisoutf(char *str, Rune *rune)
    150 {
    151 	return runetochar(str, rune);
    152 }
    153 
    154 int
    155 fullisorune(char *str, int n)
    156 {
    157 	return fullrune(str, n);
    158 }
    159 
    160 enum
    161 {
    162 	T1	= 0x00,
    163 	Tx	= 0x80,
    164 	T2	= 0xC0,
    165 	T3	= 0xE0,
    166 	T4	= 0xF0,
    167 	T5	= 0xF8,
    168 	T6	= 0xFC,
    169 
    170 	Bit1	= 7,
    171 	Bitx	= 6,
    172 	Bit2	= 5,
    173 	Bit3	= 4,
    174 	Bit4	= 3,
    175 	Bit5	= 2,
    176 	Bit6	= 2,
    177 
    178 	Mask1	= (1<<Bit1)-1,
    179 	Maskx	= (1<<Bitx)-1,
    180 	Mask2	= (1<<Bit2)-1,
    181 	Mask3	= (1<<Bit3)-1,
    182 	Mask4	= (1<<Bit4)-1,
    183 	Mask5	= (1<<Bit5)-1,
    184 	Mask6	= (1<<Bit6)-1,
    185 
    186 	Wchar1	= (1UL<<Bit1)-1,
    187 	Wchar2	= (1UL<<(Bit2+Bitx))-1,
    188 	Wchar3	= (1UL<<(Bit3+2*Bitx))-1,
    189 	Wchar4	= (1UL<<(Bit4+3*Bitx))-1,
    190 	Wchar5	= (1UL<<(Bit5+4*Bitx))-1
    191 };
    192 
    193 int
    194 our_wctomb(char *s, unsigned long wc)
    195 {
    196 	if(s == 0)
    197 		return 0;		/* no shift states */
    198 	if(wc & ~Wchar2) {
    199 		if(wc & ~Wchar4) {
    200 			if(wc & ~Wchar5) {
    201 				/* 6 bytes */
    202 				s[0] = T6 | ((wc >> 5*Bitx) & Mask6);
    203 				s[1] = Tx | ((wc >> 4*Bitx) & Maskx);
    204 				s[2] = Tx | ((wc >> 3*Bitx) & Maskx);
    205 				s[3] = Tx | ((wc >> 2*Bitx) & Maskx);
    206 				s[4] = Tx | ((wc >> 1*Bitx) & Maskx);
    207 				s[5] = Tx |  (wc & Maskx);
    208 				return 6;
    209 			}
    210 			/* 5 bytes */
    211 			s[0] = T5 |  (wc >> 4*Bitx);
    212 			s[1] = Tx | ((wc >> 3*Bitx) & Maskx);
    213 			s[2] = Tx | ((wc >> 2*Bitx) & Maskx);
    214 			s[3] = Tx | ((wc >> 1*Bitx) & Maskx);
    215 			s[4] = Tx |  (wc & Maskx);
    216 			return 5;
    217 		}
    218 		if(wc & ~Wchar3) {
    219 			/* 4 bytes */
    220 			s[0] = T4 |  (wc >> 3*Bitx);
    221 			s[1] = Tx | ((wc >> 2*Bitx) & Maskx);
    222 			s[2] = Tx | ((wc >> 1*Bitx) & Maskx);
    223 			s[3] = Tx |  (wc & Maskx);
    224 			return 4;
    225 		}
    226 		/* 3 bytes */
    227 		s[0] = T3 |  (wc >> 2*Bitx);
    228 		s[1] = Tx | ((wc >> 1*Bitx) & Maskx);
    229 		s[2] = Tx |  (wc & Maskx);
    230 		return 3;
    231 	}
    232 	if(wc & ~Wchar1) {
    233 		/* 2 bytes */
    234 		s[0] = T2 | (wc >> 1*Bitx);
    235 		s[1] = Tx | (wc & Maskx);
    236 		return 2;
    237 	}
    238 	/* 1 byte */
    239 	s[0] = T1 | wc;
    240 	return 1;
    241 }
    242 
    243 int
    244 our_mbtowc(unsigned long *p, char *s, unsigned n)
    245 {
    246 	uchar *us;
    247 	int c0, c1, c2, c3, c4, c5;
    248 	unsigned long wc;
    249 
    250 	if(s == 0)
    251 		return 0;		/* no shift states */
    252 
    253 	if(n < 1)
    254 		goto bad;
    255 	us = (uchar*)s;
    256 	c0 = us[0];
    257 	if(c0 >= T3) {
    258 		if(n < 3)
    259 			goto bad;
    260 		c1 = us[1] ^ Tx;
    261 		c2 = us[2] ^ Tx;
    262 		if((c1|c2) & T2)
    263 			goto bad;
    264 		if(c0 >= T5) {
    265 			if(n < 5)
    266 				goto bad;
    267 			c3 = us[3] ^ Tx;
    268 			c4 = us[4] ^ Tx;
    269 			if((c3|c4) & T2)
    270 				goto bad;
    271 			if(c0 >= T6) {
    272 				/* 6 bytes */
    273 				if(n < 6)
    274 					goto bad;
    275 				c5 = us[5] ^ Tx;
    276 				if(c5 & T2)
    277 					goto bad;
    278 				wc = ((((((((((c0 & Mask6) << Bitx) |
    279 					c1) << Bitx) | c2) << Bitx) |
    280 					c3) << Bitx) | c4) << Bitx) | c5;
    281 				if(wc <= Wchar5)
    282 					goto bad;
    283 				*p = wc;
    284 				return 6;
    285 			}
    286 			/* 5 bytes */
    287 			wc = ((((((((c0 & Mask5) << Bitx) |
    288 				c1) << Bitx) | c2) << Bitx) |
    289 				c3) << Bitx) | c4;
    290 			if(wc <= Wchar4)
    291 				goto bad;
    292 			*p = wc;
    293 			return 5;
    294 		}
    295 		if(c0 >= T4) {
    296 			/* 4 bytes */
    297 			if(n < 4)
    298 				goto bad;
    299 			c3 = us[3] ^ Tx;
    300 			if(c3 & T2)
    301 				goto bad;
    302 			wc = ((((((c0 & Mask4) << Bitx) |
    303 				c1) << Bitx) | c2) << Bitx) |
    304 				c3;
    305 			if(wc <= Wchar3)
    306 				goto bad;
    307 			*p = wc;
    308 			return 4;
    309 		}
    310 		/* 3 bytes */
    311 		wc = ((((c0 & Mask3) << Bitx) |
    312 			c1) << Bitx) | c2;
    313 		if(wc <= Wchar2)
    314 			goto bad;
    315 		*p = wc;
    316 		return 3;
    317 	}
    318 	if(c0 >= T2) {
    319 		/* 2 bytes */
    320 		if(n < 2)
    321 			goto bad;
    322 		c1 = us[1] ^ Tx;
    323 		if(c1 & T2)
    324 			goto bad;
    325 		wc = ((c0 & Mask2) << Bitx) |
    326 			c1;
    327 		if(wc <= Wchar1)
    328 			goto bad;
    329 		*p = wc;
    330 		return 2;
    331 	}
    332 	/* 1 byte */
    333 	if(c0 >= Tx)
    334 		goto bad;
    335 	*p = c0;
    336 	return 1;
    337 
    338 bad:
    339 	errno = EILSEQ;
    340 	return -1;
    341 }