conv_jis.c (11016B)
1 #ifdef PLAN9 2 #include <u.h> 3 #include <libc.h> 4 #include <bio.h> 5 #else 6 #include <stdio.h> 7 #include <unistd.h> 8 #include "plan9.h" 9 #endif 10 #include "hdr.h" 11 #include "conv.h" 12 #include "kuten208.h" 13 #include "jis.h" 14 15 /* 16 a state machine for interpreting all sorts of encodings 17 */ 18 static void 19 alljis(int c, Rune **r, long input_loc) 20 { 21 static enum { state0, state1, state2, state3, state4 } state = state0; 22 static int set8 = 0; 23 static int japan646 = 0; 24 static int lastc; 25 int n; 26 long l; 27 28 again: 29 switch(state) 30 { 31 case state0: /* idle state */ 32 if(c == ESC){ state = state1; return; } 33 if(c < 0) return; 34 if(!set8 && (c < 128)){ 35 if(japan646){ 36 switch(c) 37 { 38 case '\\': emit(0xA5); return; /* yen */ 39 case '~': emit(0xAF); return; /* spacing macron */ 40 default: emit(c); return; 41 } 42 } else { 43 emit(c); 44 return; 45 } 46 } 47 if(c < 0x21){ /* guard against bogus characters in JIS mode */ 48 if(squawk) 49 EPR "%s: non-JIS character %02x in %s near byte %ld\n", argv0, c, file, input_loc); 50 emit(c); 51 return; 52 } 53 lastc = c; state = state4; return; 54 55 case state1: /* seen an escape */ 56 if(c == '$'){ state = state2; return; } 57 if(c == '('){ state = state3; return; } 58 emit(ESC); state = state0; goto again; 59 60 case state2: /* may be shifting into JIS */ 61 if((c == '@') || (c == 'B')){ 62 set8 = 1; state = state0; return; 63 } 64 emit(ESC); emit('$'); state = state0; goto again; 65 66 case state3: /* may be shifting out of JIS */ 67 if((c == 'J') || (c == 'H') || (c == 'B')){ 68 japan646 = (c == 'J'); 69 set8 = 0; state = state0; return; 70 } 71 emit(ESC); emit('('); state = state0; goto again; 72 73 case state4: /* two part char */ 74 if(c < 0){ 75 if(squawk) 76 EPR "%s: unexpected EOF in %s\n", argv0, file); 77 c = 0x21 | (lastc&0x80); 78 } 79 if(CANS2J(lastc, c)){ /* ms dos sjis */ 80 int hi = lastc, lo = c; 81 S2J(hi, lo); /* convert to 208 */ 82 n = hi*100 + lo - 3232; /* convert to kuten208 */ 83 } else 84 n = (lastc&0x7F)*100 + (c&0x7f) - 3232; /* kuten208 */ 85 if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){ 86 nerrors++; 87 if(squawk) 88 EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file); 89 if(!clean) 90 emit(BADMAP); 91 } else { 92 if(l < 0){ 93 l = -l; 94 if(squawk) 95 EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file); 96 } 97 emit(l); 98 } 99 state = state0; 100 } 101 } 102 103 /* 104 a state machine for interpreting ms-kanji == shift-jis. 105 */ 106 static void 107 ms(int c, Rune **r, long input_loc) 108 { 109 static enum { state0, state1, state2, state3, state4 } state = state0; 110 static int set8 = 0; 111 static int japan646 = 0; 112 static int lastc; 113 int n; 114 long l; 115 116 again: 117 switch(state) 118 { 119 case state0: /* idle state */ 120 if(c == ESC){ state = state1; return; } 121 if(c < 0) return; 122 if(!set8 && (c < 128)){ 123 if(japan646){ 124 switch(c) 125 { 126 case '\\': emit(0xA5); return; /* yen */ 127 case '~': emit(0xAF); return; /* spacing macron */ 128 default: emit(c); return; 129 } 130 } else { 131 emit(c); 132 return; 133 } 134 } 135 lastc = c; state = state4; return; 136 137 case state1: /* seen an escape */ 138 if(c == '$'){ state = state2; return; } 139 if(c == '('){ state = state3; return; } 140 emit(ESC); state = state0; goto again; 141 142 case state2: /* may be shifting into JIS */ 143 if((c == '@') || (c == 'B')){ 144 set8 = 1; state = state0; return; 145 } 146 emit(ESC); emit('$'); state = state0; goto again; 147 148 case state3: /* may be shifting out of JIS */ 149 if((c == 'J') || (c == 'H') || (c == 'B')){ 150 japan646 = (c == 'J'); 151 set8 = 0; state = state0; return; 152 } 153 emit(ESC); emit('('); state = state0; goto again; 154 155 case state4: /* two part char */ 156 if(c < 0){ 157 if(squawk) 158 EPR "%s: unexpected EOF in %s\n", argv0, file); 159 c = 0x21 | (lastc&0x80); 160 } 161 if(CANS2J(lastc, c)){ /* ms dos sjis */ 162 int hi = lastc, lo = c; 163 S2J(hi, lo); /* convert to 208 */ 164 n = hi*100 + lo - 3232; /* convert to kuten208 */ 165 } else { 166 nerrors++; 167 if(squawk) 168 EPR "%s: illegal byte pair (0x%x,0x%x) near byte %ld in %s\n", argv0, lastc, c, input_loc, file); 169 if(!clean) 170 emit(BADMAP); 171 state = state0; 172 goto again; 173 } 174 if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){ 175 nerrors++; 176 if(squawk) 177 EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file); 178 if(!clean) 179 emit(BADMAP); 180 } else { 181 if(l < 0){ 182 l = -l; 183 if(squawk) 184 EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file); 185 } 186 emit(l); 187 } 188 state = state0; 189 } 190 } 191 192 /* 193 a state machine for interpreting ujis == EUC 194 */ 195 static void 196 ujis(int c, Rune **r, long input_loc) 197 { 198 static enum { state0, state1 } state = state0; 199 static int lastc; 200 int n; 201 long l; 202 203 switch(state) 204 { 205 case state0: /* idle state */ 206 if(c < 0) return; 207 if(c < 128){ 208 emit(c); 209 return; 210 } 211 if(c == 0x8e){ /* codeset 2 */ 212 nerrors++; 213 if(squawk) 214 EPR "%s: unknown codeset 2 near byte %ld in %s\n", argv0, input_loc, file); 215 if(!clean) 216 emit(BADMAP); 217 return; 218 } 219 if(c == 0x8f){ /* codeset 3 */ 220 nerrors++; 221 if(squawk) 222 EPR "%s: unknown codeset 3 near byte %ld in %s\n", argv0, input_loc, file); 223 if(!clean) 224 emit(BADMAP); 225 return; 226 } 227 lastc = c; 228 state = state1; 229 return; 230 231 case state1: /* two part char */ 232 if(c < 0){ 233 if(squawk) 234 EPR "%s: unexpected EOF in %s\n", argv0, file); 235 c = 0xA1; 236 } 237 n = (lastc&0x7F)*100 + (c&0x7F) - 3232; /* kuten208 */ 238 if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){ 239 nerrors++; 240 if(squawk) 241 EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file); 242 if(!clean) 243 emit(BADMAP); 244 } else { 245 if(l < 0){ 246 l = -l; 247 if(squawk) 248 EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file); 249 } 250 emit(l); 251 } 252 state = state0; 253 } 254 } 255 256 /* 257 a state machine for interpreting jis-kanji == 2022-JP 258 */ 259 static void 260 jis(int c, Rune **r, long input_loc) 261 { 262 static enum { state0, state1, state2, state3, state4 } state = state0; 263 static int set8 = 0; 264 static int japan646 = 0; 265 static int lastc; 266 int n; 267 long l; 268 269 again: 270 switch(state) 271 { 272 case state0: /* idle state */ 273 if(c == ESC){ state = state1; return; } 274 if(c < 0) return; 275 if(!set8 && (c < 128)){ 276 if(japan646){ 277 switch(c) 278 { 279 case '\\': emit(0xA5); return; /* yen */ 280 case '~': emit(0xAF); return; /* spacing macron */ 281 default: emit(c); return; 282 } 283 } else { 284 emit(c); 285 return; 286 } 287 } 288 lastc = c; state = state4; return; 289 290 case state1: /* seen an escape */ 291 if(c == '$'){ state = state2; return; } 292 if(c == '('){ state = state3; return; } 293 emit(ESC); state = state0; goto again; 294 295 case state2: /* may be shifting into JIS */ 296 if((c == '@') || (c == 'B')){ 297 set8 = 1; state = state0; return; 298 } 299 emit(ESC); emit('$'); state = state0; goto again; 300 301 case state3: /* may be shifting out of JIS */ 302 if((c == 'J') || (c == 'H') || (c == 'B')){ 303 japan646 = (c == 'J'); 304 set8 = 0; state = state0; return; 305 } 306 emit(ESC); emit('('); state = state0; goto again; 307 308 case state4: /* two part char */ 309 if(c < 0){ 310 if(squawk) 311 EPR "%s: unexpected EOF in %s\n", argv0, file); 312 c = 0x21 | (lastc&0x80); 313 } 314 if((lastc&0x80) != (c&0x80)){ /* guard against latin1 in jis */ 315 emit(lastc); 316 state = state0; 317 goto again; 318 } 319 n = (lastc&0x7F)*100 + (c&0x7f) - 3232; /* kuten208 */ 320 if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){ 321 nerrors++; 322 if(squawk) 323 EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file); 324 if(!clean) 325 emit(BADMAP); 326 } else { 327 if(l < 0){ 328 l = -l; 329 if(squawk) 330 EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file); 331 } 332 emit(l); 333 } 334 state = state0; 335 } 336 } 337 338 static void 339 do_in(int fd, void (*procfn)(int, Rune **, long), struct convert *out) 340 { 341 Rune ob[N]; 342 Rune *r, *re; 343 uchar ibuf[N]; 344 int n, i; 345 long nin; 346 347 r = ob; 348 re = ob+N-3; 349 nin = 0; 350 while((n = read(fd, ibuf, sizeof ibuf)) > 0){ 351 for(i = 0; i < n; i++){ 352 (*procfn)(ibuf[i], &r, nin++); 353 if(r >= re){ 354 OUT(out, ob, r-ob); 355 r = ob; 356 } 357 } 358 if(r > ob){ 359 OUT(out, ob, r-ob); 360 r = ob; 361 } 362 } 363 (*procfn)(-1, &r, nin); 364 if(r > ob) 365 OUT(out, ob, r-ob); 366 OUT(out, ob, 0); 367 } 368 369 void 370 jis_in(int fd, long *notused, struct convert *out) 371 { 372 USED(notused); 373 do_in(fd, alljis, out); 374 } 375 376 void 377 ujis_in(int fd, long *notused, struct convert *out) 378 { 379 USED(notused); 380 do_in(fd, ujis, out); 381 } 382 383 void 384 msjis_in(int fd, long *notused, struct convert *out) 385 { 386 USED(notused); 387 do_in(fd, ms, out); 388 } 389 390 void 391 jisjis_in(int fd, long *notused, struct convert *out) 392 { 393 USED(notused); 394 do_in(fd, jis, out); 395 } 396 397 static int first = 1; 398 399 static void 400 tab_init(void) 401 { 402 int i; 403 long l; 404 405 first = 0; 406 for(i = 0; i < NRUNE; i++) 407 tab[i] = -1; 408 for(i = 0; i < KUTEN208MAX; i++) 409 if((l = tabkuten208[i]) != -1){ 410 if(l < 0) 411 tab[-l] = i; 412 else 413 tab[l] = i; 414 } 415 } 416 417 418 /* jis-kanji, or ISO 2022-JP */ 419 void 420 jisjis_out(Rune *base, int n, long *notused) 421 { 422 char *p; 423 int i; 424 Rune r; 425 static enum { ascii, japan646, jp2022 } state = ascii; 426 427 USED(notused); 428 if(first) 429 tab_init(); 430 nrunes += n; 431 p = obuf; 432 for(i = 0; i < n; i++){ 433 r = base[i]; 434 if(r < 128){ 435 if(state == jp2022){ 436 *p++ = ESC; *p++ = '('; *p++ = 'B'; 437 state = ascii; 438 } 439 *p++ = r; 440 } else { 441 if(tab[r] != -1){ 442 if(state != jp2022){ 443 *p++ = ESC; *p++ = '$'; *p++ = 'B'; 444 state = jp2022; 445 } 446 *p++ = tab[r]/100 + ' '; 447 *p++ = tab[r]%100 + ' '; 448 continue; 449 } 450 if(squawk) 451 EPR "%s: rune 0x%x not in output cs\n", argv0, r); 452 nerrors++; 453 if(clean) 454 continue; 455 *p++ = BYTEBADMAP; 456 } 457 } 458 noutput += p-obuf; 459 if(p > obuf) 460 write(1, obuf, p-obuf); 461 } 462 463 /* ms-kanji, or Shift-JIS */ 464 void 465 msjis_out(Rune *base, int n, long *notused) 466 { 467 char *p; 468 int i, hi, lo; 469 Rune r; 470 471 USED(notused); 472 if(first) 473 tab_init(); 474 nrunes += n; 475 p = obuf; 476 for(i = 0; i < n; i++){ 477 r = base[i]; 478 if(r < 128) 479 *p++ = r; 480 else { 481 if(tab[r] != -1){ 482 hi = tab[r]/100 + ' '; 483 lo = tab[r]%100 + ' '; 484 J2S(hi, lo); 485 *p++ = hi; 486 *p++ = lo; 487 continue; 488 } 489 if(squawk) 490 EPR "%s: rune 0x%x not in output cs\n", argv0, r); 491 nerrors++; 492 if(clean) 493 continue; 494 *p++ = BYTEBADMAP; 495 } 496 } 497 noutput += p-obuf; 498 if(p > obuf) 499 write(1, obuf, p-obuf); 500 } 501 502 /* ujis, or EUC */ 503 void 504 ujis_out(Rune *base, int n, long *notused) 505 { 506 char *p; 507 int i; 508 Rune r; 509 510 USED(notused); 511 if(first) 512 tab_init(); 513 nrunes += n; 514 p = obuf; 515 for(i = 0; i < n; i++){ 516 r = base[i]; 517 if(r < 128) 518 *p++ = r; 519 else { 520 if(tab[r] != -1){ 521 *p++ = 0x80 | (tab[r]/100 + ' '); 522 *p++ = 0x80 | (tab[r]%100 + ' '); 523 continue; 524 } 525 if(squawk) 526 EPR "%s: rune 0x%x not in output cs\n", argv0, r); 527 nerrors++; 528 if(clean) 529 continue; 530 *p++ = BYTEBADMAP; 531 } 532 } 533 noutput += p-obuf; 534 if(p > obuf) 535 write(1, obuf, p-obuf); 536 }