Annotation of embedaddon/libiconv/lib/iso2022_jpms.h, revision 1.1.1.1
1.1 misho 1: /*
2: * Copyright (C) 1999-2001, 2008, 2011-2012, 2016, 2018 Free Software Foundation, Inc.
3: * This file is part of the GNU LIBICONV Library.
4: *
5: * The GNU LIBICONV Library is free software; you can redistribute it
6: * and/or modify it under the terms of the GNU Library General Public
7: * License as published by the Free Software Foundation; either version 2
8: * of the License, or (at your option) any later version.
9: *
10: * The GNU LIBICONV Library is distributed in the hope that it will be
11: * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
12: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13: * Library General Public License for more details.
14: *
15: * You should have received a copy of the GNU Library General Public
16: * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
17: * If not, see <https://www.gnu.org/licenses/>.
18: */
19:
20: /*
21: * ISO-2022-JP-MS
22: * alias CP50221
23: *
24: * This is an extension of ISO-2022-JP-1 with larger character sets.
25: * It uses ESC $ B and ESC $ ( D to denote *extensions* of JIS X 0208 and
26: * JIS X 0212, respectively. This violates the principles of ISO 2022,
27: * where
28: * 1. character sets to be used by ISO 2022 have to be registered at the
29: * ISO IR registry <https://www.itscj.ipsj.or.jp/ISO-IR/>,
30: * 2. different character sets are designated by different escape
31: * sequences.
32: * It's a typical instance of the "embrace and extend" strategy by Microsoft
33: * <https://en.wikipedia.org/wiki/Embrace,_extend_and_extinguish>.
34: */
35:
36: /*
37: * Windows has three encodings CP50220, CP50221, CP50222.
38: * The common parts are:
39: * - US-ASCII (0x00..0x7F)
40: * - JIS X 0208 extended by
41: * - one row (0x2D),
42: * - a private use area (rows 0x75..0x7E = U+E000..U+E3AB),
43: * enabled with ESC $ B, disabled with ESC ( B.
44: * - JIS X 0212 extended by
45: * - two rows (0x73..0x74),
46: * - a private use area (rows 0x75..0x7E = U+E3AC..U+E757),
47: * enabled with ESC $ ( D, disabled with ESC ( B.
48: * They differ in the handling of JIS X 0201 characters (halfwidth Katakana)
49: * in the conversion direction Unicode -> CP5022x:
50: * * CP50220 maps the halfwidth Katakana to fullwidth Katakana characters.
51: * * CP50221 contains the JIS X 0201 halfwidth Katakana characters,
52: * enabled with ESC ( I, disabled with ESC ( B.
53: * * CP50222 contains the JIS X 0201 halfwidth Katakana characters,
54: * enabled with ESC ( J 0x0E, disabled with ESC ( B.
55: * In the conversion direction CP5022x -> Unicode, all three operate the same:
56: * - ESC ( I is supported and understood.
57: * - ESC ( J 0x0E is not accepted. (Tested on Windows XP SP3.)
58: * Conclusion:
59: * - CP50222 should not be used, because the multibyte sequence that it
60: * produces cannot be parsed by any of the three encodings.
61: * - CP50221 is preferable to CP50220, because it can faithfully represent
62: * the halfwidth Katakana characters.
63: * We therefore implement CP50221. As an extension, in the mbtowc conversion
64: * direction, we also support ESC ( J 0x0E, just in case.
65: */
66:
67: #include "cp50221_0208_ext.h"
68: #include "cp50221_0212_ext.h"
69:
/*
 * Control bytes recognized by the decoder.  ESC introduces a designation
 * sequence; SO and SI toggle between the two JIS X 0201 halves.
 */
#define ESC 0x1b
#define SO 0x0e
#define SI 0x0f

/*
 * Shift states of the converter.  The trailing comment on each line gives
 * the escape sequence that selects the state.
 */
#define STATE_ASCII 0                /* Esc ( B */
#define STATE_JISX0201ROMAN 1        /* Esc ( J */ /* only in mbtowc direction */
#define STATE_JISX0201KATAKANA 2     /* Esc ( I */
#define STATE_JISX0208MS 3           /* Esc $ @ or Esc $ B */
#define STATE_JISX0212MS 4           /* Esc $ ( D */
82:
83: static int
84: iso2022_jpms_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, size_t n)
85: {
86: state_t state = conv->istate;
87: int count = 0;
88: unsigned char c;
89: for (;;) {
90: c = *s;
91: if (c == ESC) {
92: if (n < count+3)
93: goto none;
94: if (s[1] == '(') {
95: if (s[2] == 'B') {
96: state = STATE_ASCII;
97: s += 3; count += 3;
98: if (n < count+1)
99: goto none;
100: continue;
101: }
102: if (s[2] == 'I') {
103: state = STATE_JISX0201KATAKANA;
104: s += 3; count += 3;
105: if (n < count+1)
106: goto none;
107: continue;
108: }
109: if (s[2] == 'J') {
110: state = STATE_JISX0201ROMAN;
111: s += 3; count += 3;
112: if (n < count+1)
113: goto none;
114: continue;
115: }
116: goto ilseq;
117: }
118: if (s[1] == '$') {
119: if (s[2] == '@' || s[2] == 'B') {
120: /* We don't distinguish JIS X 0208-1978 and JIS X 0208-1983. */
121: state = STATE_JISX0208MS;
122: s += 3; count += 3;
123: if (n < count+1)
124: goto none;
125: continue;
126: }
127: if (s[2] == '(') {
128: if (n < count+4)
129: goto none;
130: if (s[3] == 'D') {
131: state = STATE_JISX0212MS;
132: s += 4; count += 4;
133: if (n < count+1)
134: goto none;
135: continue;
136: }
137: }
138: goto ilseq;
139: }
140: goto ilseq;
141: }
142: if (c == SO) {
143: if (state == STATE_JISX0201ROMAN)
144: state = STATE_JISX0201KATAKANA;
145: s += 1; count += 1;
146: if (n < count+1)
147: goto none;
148: continue;
149: }
150: if (c == SI) {
151: if (state == STATE_JISX0201KATAKANA)
152: state = STATE_JISX0201ROMAN;
153: s += 1; count += 1;
154: if (n < count+1)
155: goto none;
156: continue;
157: }
158: break;
159: }
160: switch (state) {
161: case STATE_ASCII:
162: if (c < 0x80) {
163: int ret = ascii_mbtowc(conv,pwc,s,1);
164: if (ret == RET_ILSEQ)
165: goto ilseq;
166: if (ret != 1) abort();
167: conv->istate = state;
168: return count+1;
169: } else
170: goto ilseq;
171: case STATE_JISX0201ROMAN:
172: if (c < 0x80) {
173: int ret = jisx0201_mbtowc(conv,pwc,s,1);
174: if (ret == RET_ILSEQ)
175: goto ilseq;
176: if (ret != 1) abort();
177: conv->istate = state;
178: return count+1;
179: } else
180: goto ilseq;
181: case STATE_JISX0201KATAKANA:
182: if (c < 0x80) {
183: unsigned char buf = c+0x80;
184: int ret = jisx0201_mbtowc(conv,pwc,&buf,1);
185: if (ret == RET_ILSEQ)
186: goto ilseq;
187: if (ret != 1) abort();
188: conv->istate = state;
189: return count+1;
190: } else
191: goto ilseq;
192: case STATE_JISX0208MS:
193: if (n < count+2)
194: goto none;
195: if (s[0] < 0x80 && s[1] < 0x80) {
196: int ret;
197: if (s[0] < 0x75) {
198: if (s[0] == 0x2d) {
199: /* Extension of JIS X 0208. */
200: if (s[1] >= 0x21 && s[1] <= 0x79) {
201: unsigned char i = (s[1] - 0x21) + 1;
202: ret = cp50221_0208_ext_mbtowc(conv,pwc,&i,1);
203: if (ret == 1)
204: ret = 2;
205: } else
206: ret = RET_ILSEQ;
207: } else {
208: /* JIS X 0208. */
209: ret = jisx0208_mbtowc(conv,pwc,s,2);
210: }
211: } else {
212: /* Extension of JIS X 0208.
213: 0x{75..7E}{21..8E} maps to U+E000..U+E3AB.
214: But some rows maps to characters present in CP932. */
215: if (s[0] <= 0x7e && (s[1] >= 0x21 && s[1] <= 0x7e)) {
216: unsigned short wc = 0xfffd;
217: if (s[0] >= 0x79 && s[0] <= 0x7c)
218: wc = cp932ext_2uni_pageed[(s[0] - 0x79) * 94 + (s[1] - 0x21)];
219: if (wc == 0xfffd)
220: wc = (s[0] - 0x75) * 94 + (s[1] - 0x21) + 0xe000;
221: *pwc = wc;
222: ret = 2;
223: } else
224: ret = RET_ILSEQ;
225: }
226: if (ret == RET_ILSEQ)
227: goto ilseq;
228: if (ret != 2) abort();
229: conv->istate = state;
230: return count+2;
231: } else
232: goto ilseq;
233: case STATE_JISX0212MS:
234: if (n < count+2)
235: goto none;
236: if (s[0] < 0x80 && s[1] < 0x80) {
237: int ret;
238: if (s[0] < 0x73) {
239: /* JIS X 0212. */
240: ret = jisx0212_mbtowc(conv,pwc,s,2);
241: } else {
242: if (s[0] < 0x75) {
243: /* Extension of JIS X 0212. */
244: if (s[1] >= 0x21 && s[1] <= 0x7e) {
245: unsigned char i = (s[0] - 0x73) * 94 + (s[1] - 0x21) + 1;
246: ret = cp50221_0212_ext_mbtowc(conv,pwc,&i,1);
247: if (ret == 1)
248: ret = 2;
249: } else
250: ret = RET_ILSEQ;
251: } else {
252: /* Extension of JIS X 0208.
253: 0x{75..7E}{21..8E} maps to U+E3AC..U+E757. */
254: if (s[0] <= 0x7e && (s[1] >= 0x21 && s[1] <= 0x7e)) {
255: *pwc = (s[0] - 0x75) * 94 + (s[1] - 0x21) + 0xe3ac;
256: ret = 2;
257: } else
258: ret = RET_ILSEQ;
259: }
260: }
261: if (ret == RET_ILSEQ)
262: goto ilseq;
263: if (ret != 2) abort();
264: conv->istate = state;
265: return count+2;
266: } else
267: goto ilseq;
268: default: abort();
269: }
270:
271: none:
272: conv->istate = state;
273: return RET_TOOFEW(count);
274:
275: ilseq:
276: conv->istate = state;
277: return RET_SHIFT_ILSEQ(count);
278: }
279:
280: static int
281: iso2022_jpms_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
282: {
283: state_t state = conv->ostate;
284: unsigned char buf[2];
285: int ret;
286:
287: /* Try ASCII. */
288: ret = ascii_wctomb(conv,buf,wc,1);
289: if (ret != RET_ILUNI) {
290: if (ret != 1) abort();
291: if (buf[0] < 0x80) {
292: int count = (state == STATE_ASCII ? 1 : 4);
293: if (n < count)
294: return RET_TOOSMALL;
295: if (state != STATE_ASCII) {
296: r[0] = ESC;
297: r[1] = '(';
298: r[2] = 'B';
299: r += 3;
300: state = STATE_ASCII;
301: }
302: r[0] = buf[0];
303: conv->ostate = state;
304: return count;
305: }
306: }
307:
308: /* Try JIS X 0201-1976 Katakana. */
309: ret = jisx0201_wctomb(conv,buf,wc,1);
310: if (ret != RET_ILUNI) {
311: if (ret != 1) abort();
312: if (buf[0] >= 0x80) {
313: int count = (state == STATE_JISX0201KATAKANA ? 1 : 4);
314: if (n < count)
315: return RET_TOOSMALL;
316: if (state != STATE_JISX0201KATAKANA) {
317: r[0] = ESC;
318: r[1] = '(';
319: r[2] = 'I';
320: r += 3;
321: state = STATE_JISX0201KATAKANA;
322: }
323: r[0] = buf[0]-0x80;
324: conv->ostate = state;
325: return count;
326: }
327: }
328:
329: /* Try JIS X 0208-1990, in place of JIS X 0208-1978 and JIS X 0208-1983,
330: and the extensions mentioned above. */
331: if (wc >= 0xe000 && wc < 0xe3ac) {
332: unsigned short i = wc - 0xe000;
333: buf[0] = (i / 94) + 0x75;
334: buf[1] = (i % 94) + 0x21;
335: ret = 2;
336: } else {
337: ret = jisx0208_wctomb(conv,buf,wc,2);
338: if (ret == RET_ILUNI) {
339: /* Extension of JIS X 0208. */
340: unsigned char i;
341: ret = cp50221_0208_ext_wctomb(conv,&i,wc,1);
342: if (ret == 1) {
343: buf[0] = 0x2d;
344: buf[1] = i-1 + 0x21;
345: ret = 2;
346: } else if (wc == 0x663B) {
347: buf[0] = 0x7a;
348: buf[1] = 0x36;
349: ret = 2;
350: } else if (wc == 0xffe2) {
351: buf[0] = 0x7c;
352: buf[1] = 0x7b;
353: ret = 2;
354: } else if (wc == 0xffe4) {
355: buf[0] = 0x7c;
356: buf[1] = 0x7c;
357: ret = 2;
358: }
359: }
360: }
361: if (ret != RET_ILUNI) {
362: if (ret != 2) abort();
363: if (buf[0] < 0x80 && buf[1] < 0x80) {
364: int count = (state == STATE_JISX0208MS ? 2 : 5);
365: if (n < count)
366: return RET_TOOSMALL;
367: if (state != STATE_JISX0208MS) {
368: r[0] = ESC;
369: r[1] = '$';
370: r[2] = 'B';
371: r += 3;
372: state = STATE_JISX0208MS;
373: }
374: r[0] = buf[0];
375: r[1] = buf[1];
376: conv->ostate = state;
377: return count;
378: }
379: }
380:
381: /* Try JIS X 0212-1990 and the extensions mentioned above. */
382: if (wc >= 0xe3ac && wc < 0xe758) {
383: unsigned short i = wc - 0xe3ac;
384: buf[0] = (i / 94) + 0x75;
385: buf[1] = (i % 94) + 0x21;
386: ret = 2;
387: } else {
388: ret = jisx0212_wctomb(conv,buf,wc,2);
389: if (ret == RET_ILUNI) {
390: /* Extension of JIS X 0212. */
391: unsigned char i;
392: ret = cp50221_0212_ext_wctomb(conv,&i,wc,1);
393: if (ret == 1) {
394: i -= 1;
395: buf[0] = (i / 94) + 0x73;
396: buf[1] = (i % 94) + 0x21;
397: ret = 2;
398: }
399: }
400: }
401: if (ret != RET_ILUNI) {
402: if (ret != 2) abort();
403: if (buf[0] < 0x80 && buf[1] < 0x80) {
404: int count = (state == STATE_JISX0212MS ? 2 : 6);
405: if (n < count)
406: return RET_TOOSMALL;
407: if (state != STATE_JISX0212MS) {
408: r[0] = ESC;
409: r[1] = '$';
410: r[2] = '(';
411: r[3] = 'D';
412: r += 4;
413: state = STATE_JISX0212MS;
414: }
415: r[0] = buf[0];
416: r[1] = buf[1];
417: conv->ostate = state;
418: return count;
419: }
420: }
421:
422: return RET_ILUNI;
423: }
424:
425: static int
426: iso2022_jpms_reset (conv_t conv, unsigned char *r, size_t n)
427: {
428: state_t state = conv->ostate;
429: if (state != STATE_ASCII) {
430: if (n < 3)
431: return RET_TOOSMALL;
432: r[0] = ESC;
433: r[1] = '(';
434: r[2] = 'B';
435: /* conv->ostate = 0; will be done by the caller */
436: return 3;
437: } else
438: return 0;
439: }
440:
/* Remove the shift-state macros defined in this file. */
#undef STATE_ASCII
#undef STATE_JISX0201ROMAN
#undef STATE_JISX0201KATAKANA
#undef STATE_JISX0208MS
#undef STATE_JISX0212MS
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>