plan9port

fork of plan9port with libvec, libstr and libsdb
Log | Files | Refs | README | LICENSE

conv_jis.c (11016B)


      1 #ifdef	PLAN9
      2 #include	<u.h>
      3 #include	<libc.h>
      4 #include	<bio.h>
      5 #else
      6 #include	<stdio.h>
      7 #include	<unistd.h>
      8 #include	"plan9.h"
      9 #endif
     10 #include	"hdr.h"
     11 #include	"conv.h"
     12 #include	"kuten208.h"
     13 #include	"jis.h"
     14 
     15 /*
     16 	a state machine for interpreting all sorts of encodings
     17 */
     18 static void
     19 alljis(int c, Rune **r, long input_loc)
     20 {
     21 	static enum { state0, state1, state2, state3, state4 } state = state0;
     22 	static int set8 = 0;
     23 	static int japan646 = 0;
     24 	static int lastc;
     25 	int n;
     26 	long l;
     27 
     28 again:
     29 	switch(state)
     30 	{
     31 	case state0:	/* idle state */
     32 		if(c == ESC){ state = state1; return; }
     33 		if(c < 0) return;
     34 		if(!set8 && (c < 128)){
     35 			if(japan646){
     36 				switch(c)
     37 				{
     38 				case '\\':	emit(0xA5); return;	/* yen */
     39 				case '~':	emit(0xAF); return;	/* spacing macron */
     40 				default:	emit(c); return;
     41 				}
     42 			} else {
     43 				emit(c);
     44 				return;
     45 			}
     46 		}
     47 		if(c < 0x21){	/* guard against bogus characters in JIS mode */
     48 			if(squawk)
     49 				EPR "%s: non-JIS character %02x in %s near byte %ld\n", argv0, c, file, input_loc);
     50 			emit(c);
     51 			return;
     52 		}
     53 		lastc = c; state = state4; return;
     54 
     55 	case state1:	/* seen an escape */
     56 		if(c == '$'){ state = state2; return; }
     57 		if(c == '('){ state = state3; return; }
     58 		emit(ESC); state = state0; goto again;
     59 
     60 	case state2:	/* may be shifting into JIS */
     61 		if((c == '@') || (c == 'B')){
     62 			set8 = 1; state = state0; return;
     63 		}
     64 		emit(ESC); emit('$'); state = state0; goto again;
     65 
     66 	case state3:	/* may be shifting out of JIS */
     67 		if((c == 'J') || (c == 'H') || (c == 'B')){
     68 			japan646 = (c == 'J');
     69 			set8 = 0; state = state0; return;
     70 		}
     71 		emit(ESC); emit('('); state = state0; goto again;
     72 
     73 	case state4:	/* two part char */
     74 		if(c < 0){
     75 			if(squawk)
     76 				EPR "%s: unexpected EOF in %s\n", argv0, file);
     77 			c = 0x21 | (lastc&0x80);
     78 		}
     79 		if(CANS2J(lastc, c)){	/* ms dos sjis */
     80 			int hi = lastc, lo = c;
     81 			S2J(hi, lo);			/* convert to 208 */
     82 			n = hi*100 + lo - 3232;		/* convert to kuten208 */
     83 		} else
     84 			n = (lastc&0x7F)*100 + (c&0x7f) - 3232;	/* kuten208 */
     85 		if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
     86 			nerrors++;
     87 			if(squawk)
     88 				EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
     89 			if(!clean)
     90 				emit(BADMAP);
     91 		} else {
     92 			if(l < 0){
     93 				l = -l;
     94 				if(squawk)
     95 					EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
     96 			}
     97 			emit(l);
     98 		}
     99 		state = state0;
    100 	}
    101 }
    102 
    103 /*
    104 	a state machine for interpreting ms-kanji == shift-jis.
    105 */
    106 static void
    107 ms(int c, Rune **r, long input_loc)
    108 {
    109 	static enum { state0, state1, state2, state3, state4 } state = state0;
    110 	static int set8 = 0;
    111 	static int japan646 = 0;
    112 	static int lastc;
    113 	int n;
    114 	long l;
    115 
    116 again:
    117 	switch(state)
    118 	{
    119 	case state0:	/* idle state */
    120 		if(c == ESC){ state = state1; return; }
    121 		if(c < 0) return;
    122 		if(!set8 && (c < 128)){
    123 			if(japan646){
    124 				switch(c)
    125 				{
    126 				case '\\':	emit(0xA5); return;	/* yen */
    127 				case '~':	emit(0xAF); return;	/* spacing macron */
    128 				default:	emit(c); return;
    129 				}
    130 			} else {
    131 				emit(c);
    132 				return;
    133 			}
    134 		}
    135 		lastc = c; state = state4; return;
    136 
    137 	case state1:	/* seen an escape */
    138 		if(c == '$'){ state = state2; return; }
    139 		if(c == '('){ state = state3; return; }
    140 		emit(ESC); state = state0; goto again;
    141 
    142 	case state2:	/* may be shifting into JIS */
    143 		if((c == '@') || (c == 'B')){
    144 			set8 = 1; state = state0; return;
    145 		}
    146 		emit(ESC); emit('$'); state = state0; goto again;
    147 
    148 	case state3:	/* may be shifting out of JIS */
    149 		if((c == 'J') || (c == 'H') || (c == 'B')){
    150 			japan646 = (c == 'J');
    151 			set8 = 0; state = state0; return;
    152 		}
    153 		emit(ESC); emit('('); state = state0; goto again;
    154 
    155 	case state4:	/* two part char */
    156 		if(c < 0){
    157 			if(squawk)
    158 				EPR "%s: unexpected EOF in %s\n", argv0, file);
    159 			c = 0x21 | (lastc&0x80);
    160 		}
    161 		if(CANS2J(lastc, c)){	/* ms dos sjis */
    162 			int hi = lastc, lo = c;
    163 			S2J(hi, lo);			/* convert to 208 */
    164 			n = hi*100 + lo - 3232;		/* convert to kuten208 */
    165 		} else {
    166 			nerrors++;
    167 			if(squawk)
    168 				EPR "%s: illegal byte pair (0x%x,0x%x) near byte %ld in %s\n", argv0, lastc, c, input_loc, file);
    169 			if(!clean)
    170 				emit(BADMAP);
    171 			state = state0;
    172 			goto again;
    173 		}
    174 		if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
    175 			nerrors++;
    176 			if(squawk)
    177 				EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
    178 			if(!clean)
    179 				emit(BADMAP);
    180 		} else {
    181 			if(l < 0){
    182 				l = -l;
    183 				if(squawk)
    184 					EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
    185 			}
    186 			emit(l);
    187 		}
    188 		state = state0;
    189 	}
    190 }
    191 
    192 /*
    193 	a state machine for interpreting ujis == EUC
    194 */
    195 static void
    196 ujis(int c, Rune **r, long input_loc)
    197 {
    198 	static enum { state0, state1 } state = state0;
    199 	static int lastc;
    200 	int n;
    201 	long l;
    202 
    203 	switch(state)
    204 	{
    205 	case state0:	/* idle state */
    206 		if(c < 0) return;
    207 		if(c < 128){
    208 			emit(c);
    209 			return;
    210 		}
    211 		if(c == 0x8e){	/* codeset 2 */
    212 			nerrors++;
    213 			if(squawk)
    214 				EPR "%s: unknown codeset 2 near byte %ld in %s\n", argv0, input_loc, file);
    215 			if(!clean)
    216 				emit(BADMAP);
    217 			return;
    218 		}
    219 		if(c == 0x8f){	/* codeset 3 */
    220 			nerrors++;
    221 			if(squawk)
    222 				EPR "%s: unknown codeset 3 near byte %ld in %s\n", argv0, input_loc, file);
    223 			if(!clean)
    224 				emit(BADMAP);
    225 			return;
    226 		}
    227 		lastc = c;
    228 		state = state1;
    229 		return;
    230 
    231 	case state1:	/* two part char */
    232 		if(c < 0){
    233 			if(squawk)
    234 				EPR "%s: unexpected EOF in %s\n", argv0, file);
    235 			c = 0xA1;
    236 		}
    237 		n = (lastc&0x7F)*100 + (c&0x7F) - 3232;	/* kuten208 */
    238 		if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
    239 			nerrors++;
    240 			if(squawk)
    241 				EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
    242 			if(!clean)
    243 				emit(BADMAP);
    244 		} else {
    245 			if(l < 0){
    246 				l = -l;
    247 				if(squawk)
    248 					EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
    249 			}
    250 			emit(l);
    251 		}
    252 		state = state0;
    253 	}
    254 }
    255 
    256 /*
    257 	a state machine for interpreting jis-kanji == 2022-JP
    258 */
    259 static void
    260 jis(int c, Rune **r, long input_loc)
    261 {
    262 	static enum { state0, state1, state2, state3, state4 } state = state0;
    263 	static int set8 = 0;
    264 	static int japan646 = 0;
    265 	static int lastc;
    266 	int n;
    267 	long l;
    268 
    269 again:
    270 	switch(state)
    271 	{
    272 	case state0:	/* idle state */
    273 		if(c == ESC){ state = state1; return; }
    274 		if(c < 0) return;
    275 		if(!set8 && (c < 128)){
    276 			if(japan646){
    277 				switch(c)
    278 				{
    279 				case '\\':	emit(0xA5); return;	/* yen */
    280 				case '~':	emit(0xAF); return;	/* spacing macron */
    281 				default:	emit(c); return;
    282 				}
    283 			} else {
    284 				emit(c);
    285 				return;
    286 			}
    287 		}
    288 		lastc = c; state = state4; return;
    289 
    290 	case state1:	/* seen an escape */
    291 		if(c == '$'){ state = state2; return; }
    292 		if(c == '('){ state = state3; return; }
    293 		emit(ESC); state = state0; goto again;
    294 
    295 	case state2:	/* may be shifting into JIS */
    296 		if((c == '@') || (c == 'B')){
    297 			set8 = 1; state = state0; return;
    298 		}
    299 		emit(ESC); emit('$'); state = state0; goto again;
    300 
    301 	case state3:	/* may be shifting out of JIS */
    302 		if((c == 'J') || (c == 'H') || (c == 'B')){
    303 			japan646 = (c == 'J');
    304 			set8 = 0; state = state0; return;
    305 		}
    306 		emit(ESC); emit('('); state = state0; goto again;
    307 
    308 	case state4:	/* two part char */
    309 		if(c < 0){
    310 			if(squawk)
    311 				EPR "%s: unexpected EOF in %s\n", argv0, file);
    312 			c = 0x21 | (lastc&0x80);
    313 		}
    314 		if((lastc&0x80) != (c&0x80)){	/* guard against latin1 in jis */
    315 			emit(lastc);
    316 			state = state0;
    317 			goto again;
    318 		}
    319 		n = (lastc&0x7F)*100 + (c&0x7f) - 3232;	/* kuten208 */
    320 		if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
    321 			nerrors++;
    322 			if(squawk)
    323 				EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
    324 			if(!clean)
    325 				emit(BADMAP);
    326 		} else {
    327 			if(l < 0){
    328 				l = -l;
    329 				if(squawk)
    330 					EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
    331 			}
    332 			emit(l);
    333 		}
    334 		state = state0;
    335 	}
    336 }
    337 
    338 static void
    339 do_in(int fd, void (*procfn)(int, Rune **, long), struct convert *out)
    340 {
    341 	Rune ob[N];
    342 	Rune *r, *re;
    343 	uchar ibuf[N];
    344 	int n, i;
    345 	long nin;
    346 
    347 	r = ob;
    348 	re = ob+N-3;
    349 	nin = 0;
    350 	while((n = read(fd, ibuf, sizeof ibuf)) > 0){
    351 		for(i = 0; i < n; i++){
    352 			(*procfn)(ibuf[i], &r, nin++);
    353 			if(r >= re){
    354 				OUT(out, ob, r-ob);
    355 				r = ob;
    356 			}
    357 		}
    358 		if(r > ob){
    359 			OUT(out, ob, r-ob);
    360 			r = ob;
    361 		}
    362 	}
    363 	(*procfn)(-1, &r, nin);
    364 	if(r > ob)
    365 		OUT(out, ob, r-ob);
    366 	OUT(out, ob, 0);
    367 }
    368 
    369 void
    370 jis_in(int fd, long *notused, struct convert *out)
    371 {
    372 	USED(notused);
    373 	do_in(fd, alljis, out);
    374 }
    375 
    376 void
    377 ujis_in(int fd, long *notused, struct convert *out)
    378 {
    379 	USED(notused);
    380 	do_in(fd, ujis, out);
    381 }
    382 
    383 void
    384 msjis_in(int fd, long *notused, struct convert *out)
    385 {
    386 	USED(notused);
    387 	do_in(fd, ms, out);
    388 }
    389 
    390 void
    391 jisjis_in(int fd, long *notused, struct convert *out)
    392 {
    393 	USED(notused);
    394 	do_in(fd, jis, out);
    395 }
    396 
    397 static int first = 1;
    398 
    399 static void
    400 tab_init(void)
    401 {
    402 	int i;
    403 	long l;
    404 
    405 	first = 0;
    406 	for(i = 0; i < NRUNE; i++)
    407 		tab[i] = -1;
    408 	for(i = 0; i < KUTEN208MAX; i++)
    409 		if((l = tabkuten208[i]) != -1){
    410 			if(l < 0)
    411 				tab[-l] = i;
    412 			else
    413 				tab[l] = i;
    414 		}
    415 }
    416 
    417 
    418 /*	jis-kanji, or ISO 2022-JP	*/
    419 void
    420 jisjis_out(Rune *base, int n, long *notused)
    421 {
    422 	char *p;
    423 	int i;
    424 	Rune r;
    425 	static enum { ascii, japan646, jp2022 } state = ascii;
    426 
    427 	USED(notused);
    428 	if(first)
    429 		tab_init();
    430 	nrunes += n;
    431 	p = obuf;
    432 	for(i = 0; i < n; i++){
    433 		r = base[i];
    434 		if(r < 128){
    435 			if(state == jp2022){
    436 				*p++ = ESC; *p++ = '('; *p++ = 'B';
    437 				state = ascii;
    438 			}
    439 			*p++ = r;
    440 		} else {
    441 			if(tab[r] != -1){
    442 				if(state != jp2022){
    443 					*p++ = ESC; *p++ = '$'; *p++ = 'B';
    444 					state = jp2022;
    445 				}
    446 				*p++ = tab[r]/100 + ' ';
    447 				*p++ = tab[r]%100 + ' ';
    448 				continue;
    449 			}
    450 			if(squawk)
    451 				EPR "%s: rune 0x%x not in output cs\n", argv0, r);
    452 			nerrors++;
    453 			if(clean)
    454 				continue;
    455 			*p++ = BYTEBADMAP;
    456 		}
    457 	}
    458 	noutput += p-obuf;
    459 	if(p > obuf)
    460 		write(1, obuf, p-obuf);
    461 }
    462 
    463 /*	ms-kanji, or Shift-JIS	*/
    464 void
    465 msjis_out(Rune *base, int n, long *notused)
    466 {
    467 	char *p;
    468 	int i, hi, lo;
    469 	Rune r;
    470 
    471 	USED(notused);
    472 	if(first)
    473 		tab_init();
    474 	nrunes += n;
    475 	p = obuf;
    476 	for(i = 0; i < n; i++){
    477 		r = base[i];
    478 		if(r < 128)
    479 			*p++ = r;
    480 		else {
    481 			if(tab[r] != -1){
    482 				hi = tab[r]/100 + ' ';
    483 				lo = tab[r]%100 + ' ';
    484 				J2S(hi, lo);
    485 				*p++ = hi;
    486 				*p++ = lo;
    487 				continue;
    488 			}
    489 			if(squawk)
    490 				EPR "%s: rune 0x%x not in output cs\n", argv0, r);
    491 			nerrors++;
    492 			if(clean)
    493 				continue;
    494 			*p++ = BYTEBADMAP;
    495 		}
    496 	}
    497 	noutput += p-obuf;
    498 	if(p > obuf)
    499 		write(1, obuf, p-obuf);
    500 }
    501 
    502 /*	ujis, or EUC	*/
    503 void
    504 ujis_out(Rune *base, int n, long *notused)
    505 {
    506 	char *p;
    507 	int i;
    508 	Rune r;
    509 
    510 	USED(notused);
    511 	if(first)
    512 		tab_init();
    513 	nrunes += n;
    514 	p = obuf;
    515 	for(i = 0; i < n; i++){
    516 		r = base[i];
    517 		if(r < 128)
    518 			*p++ = r;
    519 		else {
    520 			if(tab[r] != -1){
    521 				*p++ = 0x80 | (tab[r]/100 + ' ');
    522 				*p++ = 0x80 | (tab[r]%100 + ' ');
    523 				continue;
    524 			}
    525 			if(squawk)
    526 				EPR "%s: rune 0x%x not in output cs\n", argv0, r);
    527 			nerrors++;
    528 			if(clean)
    529 				continue;
    530 			*p++ = BYTEBADMAP;
    531 		}
    532 	}
    533 	noutput += p-obuf;
    534 	if(p > obuf)
    535 		write(1, obuf, p-obuf);
    536 }