Annotation of embedaddon/libiconv/lib/iso2022_jp2.h, revision 1.1.1.2
1.1 misho 1: /*
1.1.1.2 ! misho 2: * Copyright (C) 1999-2001, 2008, 2016 Free Software Foundation, Inc.
1.1 misho 3: * This file is part of the GNU LIBICONV Library.
4: *
5: * The GNU LIBICONV Library is free software; you can redistribute it
6: * and/or modify it under the terms of the GNU Library General Public
7: * License as published by the Free Software Foundation; either version 2
8: * of the License, or (at your option) any later version.
9: *
10: * The GNU LIBICONV Library is distributed in the hope that it will be
11: * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
12: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13: * Library General Public License for more details.
14: *
15: * You should have received a copy of the GNU Library General Public
16: * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
1.1.1.2 ! misho 17: * If not, see <https://www.gnu.org/licenses/>.
1.1 misho 18: */
19:
20: /*
21: * ISO-2022-JP-2
22: */
23:
24: /* Specification: RFC 1554 */
25: /* ESC '(' 'I' for JISX0201 Katakana is an extension not found in RFC 1554 or
26: CJK.INF, but implemented in glibc-2.1 and qt-2.0. */
27:
/* The escape byte (0x1b) that starts every control sequence parsed or
   emitted by the converters in this file. */
28: #define ESC 0x1b
29:
30: /*
31: * The state is composed of one of the following values
32: */
/* These occupy the low 8 bits of the state (see SPLIT_STATE below) and name
   the character set selected by the most recent ESC ( x / ESC $ x sequence. */
33: #define STATE_ASCII 0
34: #define STATE_JISX0201ROMAN 1
35: #define STATE_JISX0201KATAKANA 2
36: #define STATE_JISX0208 3
37: #define STATE_JISX0212 4
38: #define STATE_GB2312 5
39: #define STATE_KSC5601 6
40: /*
41: * and one of the following values, << 8
42: */
/* These name the set selected by ESC . A / ESC . F; it is the set used to
   decode or encode the single character that follows ESC N. */
43: #define STATE_G2_NONE 0
44: #define STATE_G2_ISO8859_1 1
45: #define STATE_G2_ISO8859_7 2
46:
/* SPLIT_STATE declares the locals state1 and state2 from a local variable
   named `state`.  In this first definition state2 is everything above the
   low byte (no mask is applied).  COMBINE_STATE packs the two locals back
   into `state`; the caller still has to store `state` into the conv_t. */
47: #define SPLIT_STATE \
48: unsigned int state1 = state & 0xff, state2 = state >> 8
49: #define COMBINE_STATE \
50: state = (state2 << 8) | state1
51:
52: static int
1.1.1.2 ! misho 53: iso2022_jp2_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, size_t n)
1.1 misho 54: {
55: state_t state = conv->istate;
56: SPLIT_STATE;
57: int count = 0;
58: unsigned char c;
59: for (;;) {
60: c = *s;
61: if (c == ESC) {
62: if (n < count+3)
63: goto none;
64: if (s[1] == '(') {
65: if (s[2] == 'B') {
66: state1 = STATE_ASCII;
67: s += 3; count += 3;
68: if (n < count+1)
69: goto none;
70: continue;
71: }
72: if (s[2] == 'J') {
73: state1 = STATE_JISX0201ROMAN;
74: s += 3; count += 3;
75: if (n < count+1)
76: goto none;
77: continue;
78: }
79: if (s[2] == 'I') {
80: state1 = STATE_JISX0201KATAKANA;
81: s += 3; count += 3;
82: if (n < count+1)
83: goto none;
84: continue;
85: }
86: goto ilseq;
87: }
88: if (s[1] == '$') {
89: if (s[2] == '@' || s[2] == 'B') {
90: /* We don't distinguish JIS X 0208-1978 and JIS X 0208-1983. */
91: state1 = STATE_JISX0208;
92: s += 3; count += 3;
93: if (n < count+1)
94: goto none;
95: continue;
96: }
97: if (s[2] == 'A') {
98: state1 = STATE_GB2312;
99: s += 3; count += 3;
100: if (n < count+1)
101: goto none;
102: continue;
103: }
104: if (s[2] == '(') {
105: if (n < count+4)
106: goto none;
107: if (s[3] == 'D') {
108: state1 = STATE_JISX0212;
109: s += 4; count += 4;
110: if (n < count+1)
111: goto none;
112: continue;
113: }
114: if (s[3] == 'C') {
115: state1 = STATE_KSC5601;
116: s += 4; count += 4;
117: if (n < count+1)
118: goto none;
119: continue;
120: }
121: goto ilseq;
122: }
123: goto ilseq;
124: }
125: if (s[1] == '.') {
126: if (n < count+3)
127: goto none;
128: if (s[2] == 'A') {
129: state2 = STATE_G2_ISO8859_1;
130: s += 3; count += 3;
131: if (n < count+1)
132: goto none;
133: continue;
134: }
135: if (s[2] == 'F') {
136: state2 = STATE_G2_ISO8859_7;
137: s += 3; count += 3;
138: if (n < count+1)
139: goto none;
140: continue;
141: }
142: goto ilseq;
143: }
144: if (s[1] == 'N') {
145: switch (state2) {
146: case STATE_G2_NONE:
147: goto ilseq;
148: case STATE_G2_ISO8859_1:
149: if (s[2] < 0x80) {
150: unsigned char buf = s[2]+0x80;
151: int ret = iso8859_1_mbtowc(conv,pwc,&buf,1);
152: if (ret == RET_ILSEQ)
153: goto ilseq;
154: if (ret != 1) abort();
155: COMBINE_STATE;
156: conv->istate = state;
157: return count+3;
158: } else
159: goto ilseq;
160: case STATE_G2_ISO8859_7:
161: if (s[2] < 0x80) {
162: unsigned char buf = s[2]+0x80;
163: int ret = iso8859_7_mbtowc(conv,pwc,&buf,1);
164: if (ret == RET_ILSEQ)
165: goto ilseq;
166: if (ret != 1) abort();
167: COMBINE_STATE;
168: conv->istate = state;
169: return count+3;
170: } else
171: goto ilseq;
172: default: abort();
173: }
174: }
175: goto ilseq;
176: }
177: break;
178: }
179: switch (state1) {
180: case STATE_ASCII:
181: if (c < 0x80) {
182: int ret = ascii_mbtowc(conv,pwc,s,1);
183: if (ret == RET_ILSEQ)
184: goto ilseq;
185: if (ret != 1) abort();
186: if (*pwc == 0x000a || *pwc == 0x000d)
187: state2 = STATE_G2_NONE;
188: COMBINE_STATE;
189: conv->istate = state;
190: return count+1;
191: } else
192: goto ilseq;
193: case STATE_JISX0201ROMAN:
194: if (c < 0x80) {
195: int ret = jisx0201_mbtowc(conv,pwc,s,1);
196: if (ret == RET_ILSEQ)
197: goto ilseq;
198: if (ret != 1) abort();
199: if (*pwc == 0x000a || *pwc == 0x000d)
200: state2 = STATE_G2_NONE;
201: COMBINE_STATE;
202: conv->istate = state;
203: return count+1;
204: } else
205: goto ilseq;
206: case STATE_JISX0201KATAKANA:
207: if (c < 0x80) {
208: unsigned char buf = c+0x80;
209: int ret = jisx0201_mbtowc(conv,pwc,&buf,1);
210: if (ret == RET_ILSEQ)
211: goto ilseq;
212: if (ret != 1) abort();
213: COMBINE_STATE;
214: conv->istate = state;
215: return count+1;
216: } else
217: goto ilseq;
218: case STATE_JISX0208:
219: if (n < count+2)
220: goto none;
221: if (s[0] < 0x80 && s[1] < 0x80) {
222: int ret = jisx0208_mbtowc(conv,pwc,s,2);
223: if (ret == RET_ILSEQ)
224: goto ilseq;
225: if (ret != 2) abort();
226: COMBINE_STATE;
227: conv->istate = state;
228: return count+2;
229: } else
230: goto ilseq;
231: case STATE_JISX0212:
232: if (n < count+2)
233: goto none;
234: if (s[0] < 0x80 && s[1] < 0x80) {
235: int ret = jisx0212_mbtowc(conv,pwc,s,2);
236: if (ret == RET_ILSEQ)
237: goto ilseq;
238: if (ret != 2) abort();
239: COMBINE_STATE;
240: conv->istate = state;
241: return count+2;
242: } else
243: goto ilseq;
244: case STATE_GB2312:
245: if (n < count+2)
246: goto none;
247: if (s[0] < 0x80 && s[1] < 0x80) {
248: int ret = gb2312_mbtowc(conv,pwc,s,2);
249: if (ret == RET_ILSEQ)
250: goto ilseq;
251: if (ret != 2) abort();
252: COMBINE_STATE;
253: conv->istate = state;
254: return count+2;
255: } else
256: goto ilseq;
257: case STATE_KSC5601:
258: if (n < count+2)
259: goto none;
260: if (s[0] < 0x80 && s[1] < 0x80) {
261: int ret = ksc5601_mbtowc(conv,pwc,s,2);
262: if (ret == RET_ILSEQ)
263: goto ilseq;
264: if (ret != 2) abort();
265: COMBINE_STATE;
266: conv->istate = state;
267: return count+2;
268: } else
269: goto ilseq;
270: default: abort();
271: }
272:
273: none:
274: COMBINE_STATE;
275: conv->istate = state;
276: return RET_TOOFEW(count);
277:
278: ilseq:
279: COMBINE_STATE;
280: conv->istate = state;
281: return RET_SHIFT_ILSEQ(count);
282: }
283:
/* Drop the two-field state helpers used by the decoder; the encoder below
   redefines them with a third field. */
284: #undef COMBINE_STATE
285: #undef SPLIT_STATE
286:
287: /*
288: * The state can also contain one of the following values, << 16.
289: * Values >= STATE_TAG_LANGUAGE are temporary tag parsing states.
290: */
/* The completed language values (ja, ko, zh) and STATE_TAG_NONE are 0..3,
   i.e. below STATE_TAG_LANGUAGE, so they can index the conversion_lists
   table in iso2022_jp2_wctomb, which has STATE_TAG_LANGUAGE entries.
   The intermediate parsing states are numbered from STATE_TAG_LANGUAGE up. */
291: #define STATE_TAG_NONE 0
292: #define STATE_TAG_LANGUAGE 4
293: #define STATE_TAG_LANGUAGE_j 5
294: #define STATE_TAG_LANGUAGE_ja 1
295: #define STATE_TAG_LANGUAGE_k 6
296: #define STATE_TAG_LANGUAGE_ko 2
297: #define STATE_TAG_LANGUAGE_z 7
298: #define STATE_TAG_LANGUAGE_zh 3
299:
/* Three-field variants: state1 = bits 0..7, state2 = bits 8..15,
   state3 = everything from bit 16 up.  Both macros expect a local variable
   named `state`; COMBINE_STATE only repacks it, it does not store it. */
300: #define SPLIT_STATE \
301: unsigned int state1 = state & 0xff, state2 = (state >> 8) & 0xff, state3 = state >> 16
302: #define COMBINE_STATE \
303: state = (state3 << 16) | (state2 << 8) | state1
304:
305: static int
1.1.1.2 ! misho 306: iso2022_jp2_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
1.1 misho 307: {
308: state_t state = conv->ostate;
309: SPLIT_STATE;
310: unsigned char buf[2];
311: int ret;
312: /* This defines the conversion preferences depending on the current
313: langauge tag. */
314: enum conversion { none = 0, european, japanese, chinese, korean, other };
315: static const unsigned int conversion_lists[STATE_TAG_LANGUAGE] = {
316: /* STATE_TAG_NONE */
317: japanese + (european << 3) + (chinese << 6) + (korean << 9) + (other << 12),
318: /* STATE_TAG_LANGUAGE_ja */
319: japanese + (european << 3) + (chinese << 6) + (korean << 9) + (other << 12),
320: /* STATE_TAG_LANGUAGE_ko */
321: korean + (european << 3) + (japanese << 6) + (chinese << 9) + (other << 12),
322: /* STATE_TAG_LANGUAGE_zh */
323: chinese + (european << 3) + (japanese << 6) + (korean << 9) + (other << 12)
324: };
325: unsigned int conversion_list;
326:
327: /* Handle Unicode tag characters (range U+E0000..U+E007F). */
328: if ((wc >> 7) == (0xe0000 >> 7)) {
329: char c = wc & 0x7f;
330: if (c >= 'A' && c <= 'Z')
331: c += 'a'-'A';
332: switch (c) {
333: case 0x01:
334: state3 = STATE_TAG_LANGUAGE;
335: COMBINE_STATE;
336: conv->ostate = state;
337: return 0;
338: case 'j':
339: if (state3 == STATE_TAG_LANGUAGE) {
340: state3 = STATE_TAG_LANGUAGE_j;
341: COMBINE_STATE;
342: conv->ostate = state;
343: return 0;
344: }
345: break;
346: case 'a':
347: if (state3 == STATE_TAG_LANGUAGE_j) {
348: state3 = STATE_TAG_LANGUAGE_ja;
349: COMBINE_STATE;
350: conv->ostate = state;
351: return 0;
352: }
353: break;
354: case 'k':
355: if (state3 == STATE_TAG_LANGUAGE) {
356: state3 = STATE_TAG_LANGUAGE_k;
357: COMBINE_STATE;
358: conv->ostate = state;
359: return 0;
360: }
361: break;
362: case 'o':
363: if (state3 == STATE_TAG_LANGUAGE_k) {
364: state3 = STATE_TAG_LANGUAGE_ko;
365: COMBINE_STATE;
366: conv->ostate = state;
367: return 0;
368: }
369: break;
370: case 'z':
371: if (state3 == STATE_TAG_LANGUAGE) {
372: state3 = STATE_TAG_LANGUAGE_z;
373: COMBINE_STATE;
374: conv->ostate = state;
375: return 0;
376: }
377: break;
378: case 'h':
379: if (state3 == STATE_TAG_LANGUAGE_z) {
380: state3 = STATE_TAG_LANGUAGE_zh;
381: COMBINE_STATE;
382: conv->ostate = state;
383: return 0;
384: }
385: break;
386: case 0x7f:
387: state3 = STATE_TAG_NONE;
388: COMBINE_STATE;
389: conv->ostate = state;
390: return 0;
391: default:
392: break;
393: }
394: /* Other tag characters reset the tag parsing state or are ignored. */
395: if (state3 >= STATE_TAG_LANGUAGE)
396: state3 = STATE_TAG_NONE;
397: COMBINE_STATE;
398: conv->ostate = state;
399: return 0;
400: }
401: if (state3 >= STATE_TAG_LANGUAGE)
402: state3 = STATE_TAG_NONE;
403:
404: /* Try ASCII. */
405: ret = ascii_wctomb(conv,buf,wc,1);
406: if (ret != RET_ILUNI) {
407: if (ret != 1) abort();
408: if (buf[0] < 0x80) {
409: int count = (state1 == STATE_ASCII ? 1 : 4);
410: if (n < count)
411: return RET_TOOSMALL;
412: if (state1 != STATE_ASCII) {
413: r[0] = ESC;
414: r[1] = '(';
415: r[2] = 'B';
416: r += 3;
417: state1 = STATE_ASCII;
418: }
419: r[0] = buf[0];
420: if (wc == 0x000a || wc == 0x000d)
421: state2 = STATE_G2_NONE;
422: COMBINE_STATE;
423: conv->ostate = state;
424: return count;
425: }
426: }
427:
428: conversion_list = conversion_lists[state3];
429:
430: do {
431: switch (conversion_list & ((1 << 3) - 1)) {
432:
433: case european:
434:
435: /* Try ISO-8859-1. */
436: ret = iso8859_1_wctomb(conv,buf,wc,1);
437: if (ret != RET_ILUNI) {
438: if (ret != 1) abort();
439: if (buf[0] >= 0x80) {
440: int count = (state2 == STATE_G2_ISO8859_1 ? 3 : 6);
441: if (n < count)
442: return RET_TOOSMALL;
443: if (state2 != STATE_G2_ISO8859_1) {
444: r[0] = ESC;
445: r[1] = '.';
446: r[2] = 'A';
447: r += 3;
448: state2 = STATE_G2_ISO8859_1;
449: }
450: r[0] = ESC;
451: r[1] = 'N';
452: r[2] = buf[0]-0x80;
453: COMBINE_STATE;
454: conv->ostate = state;
455: return count;
456: }
457: }
458:
459: /* Try ISO-8859-7. */
460: ret = iso8859_7_wctomb(conv,buf,wc,1);
461: if (ret != RET_ILUNI) {
462: if (ret != 1) abort();
463: if (buf[0] >= 0x80) {
464: int count = (state2 == STATE_G2_ISO8859_7 ? 3 : 6);
465: if (n < count)
466: return RET_TOOSMALL;
467: if (state2 != STATE_G2_ISO8859_7) {
468: r[0] = ESC;
469: r[1] = '.';
470: r[2] = 'F';
471: r += 3;
472: state2 = STATE_G2_ISO8859_7;
473: }
474: r[0] = ESC;
475: r[1] = 'N';
476: r[2] = buf[0]-0x80;
477: COMBINE_STATE;
478: conv->ostate = state;
479: return count;
480: }
481: }
482:
483: break;
484:
485: case japanese:
486:
487: /* Try JIS X 0201-1976 Roman. */
488: ret = jisx0201_wctomb(conv,buf,wc,1);
489: if (ret != RET_ILUNI) {
490: if (ret != 1) abort();
491: if (buf[0] < 0x80) {
492: int count = (state1 == STATE_JISX0201ROMAN ? 1 : 4);
493: if (n < count)
494: return RET_TOOSMALL;
495: if (state1 != STATE_JISX0201ROMAN) {
496: r[0] = ESC;
497: r[1] = '(';
498: r[2] = 'J';
499: r += 3;
500: state1 = STATE_JISX0201ROMAN;
501: }
502: r[0] = buf[0];
503: if (wc == 0x000a || wc == 0x000d)
504: state2 = STATE_G2_NONE;
505: COMBINE_STATE;
506: conv->ostate = state;
507: return count;
508: }
509: }
510:
511: /* Try JIS X 0208-1990 in place of JIS X 0208-1978 and
512: JIS X 0208-1983. */
513: ret = jisx0208_wctomb(conv,buf,wc,2);
514: if (ret != RET_ILUNI) {
515: if (ret != 2) abort();
516: if (buf[0] < 0x80 && buf[1] < 0x80) {
517: int count = (state1 == STATE_JISX0208 ? 2 : 5);
518: if (n < count)
519: return RET_TOOSMALL;
520: if (state1 != STATE_JISX0208) {
521: r[0] = ESC;
522: r[1] = '$';
523: r[2] = 'B';
524: r += 3;
525: state1 = STATE_JISX0208;
526: }
527: r[0] = buf[0];
528: r[1] = buf[1];
529: COMBINE_STATE;
530: conv->ostate = state;
531: return count;
532: }
533: }
534:
535: /* Try JIS X 0212-1990. */
536: ret = jisx0212_wctomb(conv,buf,wc,2);
537: if (ret != RET_ILUNI) {
538: if (ret != 2) abort();
539: if (buf[0] < 0x80 && buf[1] < 0x80) {
540: int count = (state1 == STATE_JISX0212 ? 2 : 6);
541: if (n < count)
542: return RET_TOOSMALL;
543: if (state1 != STATE_JISX0212) {
544: r[0] = ESC;
545: r[1] = '$';
546: r[2] = '(';
547: r[3] = 'D';
548: r += 4;
549: state1 = STATE_JISX0212;
550: }
551: r[0] = buf[0];
552: r[1] = buf[1];
553: COMBINE_STATE;
554: conv->ostate = state;
555: return count;
556: }
557: }
558:
559: break;
560:
561: case chinese:
562:
563: /* Try GB 2312-1980. */
564: ret = gb2312_wctomb(conv,buf,wc,2);
565: if (ret != RET_ILUNI) {
566: if (ret != 2) abort();
567: if (buf[0] < 0x80 && buf[1] < 0x80) {
568: int count = (state1 == STATE_GB2312 ? 2 : 5);
569: if (n < count)
570: return RET_TOOSMALL;
571: if (state1 != STATE_GB2312) {
572: r[0] = ESC;
573: r[1] = '$';
574: r[2] = 'A';
575: r += 3;
576: state1 = STATE_GB2312;
577: }
578: r[0] = buf[0];
579: r[1] = buf[1];
580: COMBINE_STATE;
581: conv->ostate = state;
582: return count;
583: }
584: }
585:
586: break;
587:
588: case korean:
589:
590: /* Try KS C 5601-1992. */
591: ret = ksc5601_wctomb(conv,buf,wc,2);
592: if (ret != RET_ILUNI) {
593: if (ret != 2) abort();
594: if (buf[0] < 0x80 && buf[1] < 0x80) {
595: int count = (state1 == STATE_KSC5601 ? 2 : 6);
596: if (n < count)
597: return RET_TOOSMALL;
598: if (state1 != STATE_KSC5601) {
599: r[0] = ESC;
600: r[1] = '$';
601: r[2] = '(';
602: r[3] = 'C';
603: r += 4;
604: state1 = STATE_KSC5601;
605: }
606: r[0] = buf[0];
607: r[1] = buf[1];
608: COMBINE_STATE;
609: conv->ostate = state;
610: return count;
611: }
612: }
613:
614: break;
615:
616: case other:
617:
618: /* Try JIS X 0201-1976 Kana. This is not officially part of
619: ISO-2022-JP-2, according to RFC 1554. Therefore we try this
620: only after all other attempts. */
621: ret = jisx0201_wctomb(conv,buf,wc,1);
622: if (ret != RET_ILUNI) {
623: if (ret != 1) abort();
624: if (buf[0] >= 0x80) {
625: int count = (state1 == STATE_JISX0201KATAKANA ? 1 : 4);
626: if (n < count)
627: return RET_TOOSMALL;
628: if (state1 != STATE_JISX0201KATAKANA) {
629: r[0] = ESC;
630: r[1] = '(';
631: r[2] = 'I';
632: r += 3;
633: state1 = STATE_JISX0201KATAKANA;
634: }
635: r[0] = buf[0]-0x80;
636: COMBINE_STATE;
637: conv->ostate = state;
638: return count;
639: }
640: }
641:
642: break;
643:
644: default:
645: abort();
646: }
647:
648: conversion_list = conversion_list >> 3;
649: } while (conversion_list != 0);
650:
651: return RET_ILUNI;
652: }
653:
654: static int
1.1.1.2 ! misho 655: iso2022_jp2_reset (conv_t conv, unsigned char *r, size_t n)
1.1 misho 656: {
657: state_t state = conv->ostate;
658: SPLIT_STATE;
659: (void)state2;
660: (void)state3;
661: if (state1 != STATE_ASCII) {
662: if (n < 3)
663: return RET_TOOSMALL;
664: r[0] = ESC;
665: r[1] = '(';
666: r[2] = 'B';
667: /* conv->ostate = 0; will be done by the caller */
668: return 3;
669: } else
670: return 0;
671: }
672:
/* Remove the helper and state macros defined in this file, in reverse order
   of definition.  ESC is not undefined here.
   NOTE(review): presumably this keeps the short STATE_* names from clashing
   with other converter headers compiled in the same translation unit -
   confirm against the including file. */
673: #undef COMBINE_STATE
674: #undef SPLIT_STATE
675: #undef STATE_TAG_LANGUAGE_zh
676: #undef STATE_TAG_LANGUAGE_z
677: #undef STATE_TAG_LANGUAGE_ko
678: #undef STATE_TAG_LANGUAGE_k
679: #undef STATE_TAG_LANGUAGE_ja
680: #undef STATE_TAG_LANGUAGE_j
681: #undef STATE_TAG_LANGUAGE
682: #undef STATE_TAG_NONE
683: #undef STATE_G2_ISO8859_7
684: #undef STATE_G2_ISO8859_1
685: #undef STATE_G2_NONE
686: #undef STATE_KSC5601
687: #undef STATE_GB2312
688: #undef STATE_JISX0212
689: #undef STATE_JISX0208
690: #undef STATE_JISX0201KATAKANA
691: #undef STATE_JISX0201ROMAN
692: #undef STATE_ASCII
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>