Annotation of embedaddon/libiconv/lib/iso2022_jpms.h, revision 1.1

1.1     ! misho       1: /*
        !             2:  * Copyright (C) 1999-2001, 2008, 2011-2012, 2016, 2018 Free Software Foundation, Inc.
        !             3:  * This file is part of the GNU LIBICONV Library.
        !             4:  *
        !             5:  * The GNU LIBICONV Library is free software; you can redistribute it
        !             6:  * and/or modify it under the terms of the GNU Library General Public
        !             7:  * License as published by the Free Software Foundation; either version 2
        !             8:  * of the License, or (at your option) any later version.
        !             9:  *
        !            10:  * The GNU LIBICONV Library is distributed in the hope that it will be
        !            11:  * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            12:  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            13:  * Library General Public License for more details.
        !            14:  *
        !            15:  * You should have received a copy of the GNU Library General Public
        !            16:  * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
        !            17:  * If not, see <https://www.gnu.org/licenses/>.
        !            18:  */
        !            19: 
        !            20: /*
        !            21:  * ISO-2022-JP-MS
        !            22:  * alias CP50221
        !            23:  *
        !            24:  * This is an extension of ISO-2022-JP-1 with larger character sets.
        !            25:  * It uses ESC $ B and ESC $ ( D to denote *extensions* of JIS X 0208 and
        !            26:  * JIS X 0212, respectively.  This violates the principles of ISO 2022,
        !            27:  * where
        !            28:  *   1. character sets to be used by ISO 2022 have to be registered at the
        !            29:  *      ISO IR registry <https://www.itscj.ipsj.or.jp/ISO-IR/>,
        !            30:  *   2. different character sets are designated by different escape
        !            31:  *      sequences.
        !            32:  * It's a typical instance of the "embrace and extend" strategy by Microsoft
        !            33:  * <https://en.wikipedia.org/wiki/Embrace,_extend_and_extinguish>.
        !            34:  */
        !            35: 
        !            36: /*
        !            37:  * Windows has three encodings CP50220, CP50221, CP50222.
        !            38:  * The common parts are:
        !            39:  *   - US-ASCII (0x00..0x7F)
        !            40:  *   - JIS X 0208 extended by
        !            41:  *       - one row (0x2D),
        !            42:  *       - a private use area (rows 0x75..0x7E = U+E000..U+E3AB),
        !            43:  *     enabled with ESC $ B, disabled with ESC ( B.
        !            44:  *   - JIS X 0212 extended by
        !            45:  *       - two rows (0x73..0x74),
        !            46:  *       - a private use area (rows 0x75..0x7E = U+E3AC..U+E757),
        !            47:  *     enabled with ESC $ ( D, disabled with ESC ( B.
        !            48:  * They differ in the handling of JIS X 0201 characters (halfwidth Katakana)
        !            49:  * in the conversion direction Unicode -> CP5022x:
        !            50:  *   * CP50220 maps the halfwidth Katakana to fullwidth Katakana characters.
        !            51:  *   * CP50221 contains the JIS X 0201 halfwidth Katakana characters,
        !            52:  *     enabled with ESC ( I, disabled with ESC ( B.
        !            53:  *   * CP50222 contains the JIS X 0201 halfwidth Katakana characters,
        !            54:  *     enabled with ESC ( J 0x0E, disabled with ESC ( B.
        !            55:  * In the conversion direction CP5022x -> Unicode, all three operate the same:
        !            56:  *   - ESC ( I is supported and understood.
        !            57:  *   - ESC ( J 0x0E is not accepted.  (Tested on Windows XP SP3.)
        !            58:  * Conclusion:
        !            59:  *   - CP50222 should not be used, because the multibyte sequence that it
        !            60:  *     produces cannot be parsed by either of the three encodings.
        !            61:  *   - CP50221 is preferrable to CP50220, because it can faithfully represent
        !            62:  *     the halfwidth Katakana characters.
        !            63:  * We therefore implement CP50221.  As an extension, in the mbtowc conversion
        !            64:  * direction, we support also ESC ( J 0x0E, just in case.
        !            65:  */
        !            66: 
        !            67: #include "cp50221_0208_ext.h"
        !            68: #include "cp50221_0212_ext.h"
        !            69: 
        !            70: #define ESC 0x1b
        !            71: #define SO  0x0e
        !            72: #define SI  0x0f
        !            73: 
        !            74: /*
        !            75:  * The state can be one of the following values.
        !            76:  */
        !            77: #define STATE_ASCII             0  /* Esc ( B */
        !            78: #define STATE_JISX0201ROMAN     1  /* Esc ( J */ /* only in mbtowc direction */
        !            79: #define STATE_JISX0201KATAKANA  2  /* Esc ( I */
        !            80: #define STATE_JISX0208MS        3  /* Esc $ @ or Esc $ B */
        !            81: #define STATE_JISX0212MS        4  /* Esc $ ( D */
        !            82: 
        !            83: static int
        !            84: iso2022_jpms_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, size_t n)
        !            85: {
        !            86:   state_t state = conv->istate;
        !            87:   int count = 0;
        !            88:   unsigned char c;
        !            89:   for (;;) {
        !            90:     c = *s;
        !            91:     if (c == ESC) {
        !            92:       if (n < count+3)
        !            93:         goto none;
        !            94:       if (s[1] == '(') {
        !            95:         if (s[2] == 'B') {
        !            96:           state = STATE_ASCII;
        !            97:           s += 3; count += 3;
        !            98:           if (n < count+1)
        !            99:             goto none;
        !           100:           continue;
        !           101:         }
        !           102:         if (s[2] == 'I') {
        !           103:           state = STATE_JISX0201KATAKANA;
        !           104:           s += 3; count += 3;
        !           105:           if (n < count+1)
        !           106:             goto none;
        !           107:           continue;
        !           108:         }
        !           109:         if (s[2] == 'J') {
        !           110:           state = STATE_JISX0201ROMAN;
        !           111:           s += 3; count += 3;
        !           112:           if (n < count+1)
        !           113:             goto none;
        !           114:           continue;
        !           115:         }
        !           116:         goto ilseq;
        !           117:       }
        !           118:       if (s[1] == '$') {
        !           119:         if (s[2] == '@' || s[2] == 'B') {
        !           120:           /* We don't distinguish JIS X 0208-1978 and JIS X 0208-1983. */
        !           121:           state = STATE_JISX0208MS;
        !           122:           s += 3; count += 3;
        !           123:           if (n < count+1)
        !           124:             goto none;
        !           125:           continue;
        !           126:         }
        !           127:         if (s[2] == '(') {
        !           128:           if (n < count+4)
        !           129:             goto none;
        !           130:           if (s[3] == 'D') {
        !           131:             state = STATE_JISX0212MS;
        !           132:             s += 4; count += 4;
        !           133:             if (n < count+1)
        !           134:               goto none;
        !           135:             continue;
        !           136:           }
        !           137:         }
        !           138:         goto ilseq;
        !           139:       }
        !           140:       goto ilseq;
        !           141:     }
        !           142:     if (c == SO) {
        !           143:       if (state == STATE_JISX0201ROMAN)
        !           144:         state = STATE_JISX0201KATAKANA;
        !           145:       s += 1; count += 1;
        !           146:       if (n < count+1)
        !           147:         goto none;
        !           148:       continue;
        !           149:     }
        !           150:     if (c == SI) {
        !           151:       if (state == STATE_JISX0201KATAKANA)
        !           152:         state = STATE_JISX0201ROMAN;
        !           153:       s += 1; count += 1;
        !           154:       if (n < count+1)
        !           155:         goto none;
        !           156:       continue;
        !           157:     }
        !           158:     break;
        !           159:   }
        !           160:   switch (state) {
        !           161:     case STATE_ASCII:
        !           162:       if (c < 0x80) {
        !           163:         int ret = ascii_mbtowc(conv,pwc,s,1);
        !           164:         if (ret == RET_ILSEQ)
        !           165:           goto ilseq;
        !           166:         if (ret != 1) abort();
        !           167:         conv->istate = state;
        !           168:         return count+1;
        !           169:       } else
        !           170:         goto ilseq;
        !           171:     case STATE_JISX0201ROMAN:
        !           172:       if (c < 0x80) {
        !           173:         int ret = jisx0201_mbtowc(conv,pwc,s,1);
        !           174:         if (ret == RET_ILSEQ)
        !           175:           goto ilseq;
        !           176:         if (ret != 1) abort();
        !           177:         conv->istate = state;
        !           178:         return count+1;
        !           179:       } else
        !           180:         goto ilseq;
        !           181:     case STATE_JISX0201KATAKANA:
        !           182:       if (c < 0x80) {
        !           183:         unsigned char buf = c+0x80;
        !           184:         int ret = jisx0201_mbtowc(conv,pwc,&buf,1);
        !           185:         if (ret == RET_ILSEQ)
        !           186:           goto ilseq;
        !           187:         if (ret != 1) abort();
        !           188:         conv->istate = state;
        !           189:         return count+1;
        !           190:       } else
        !           191:         goto ilseq;
        !           192:     case STATE_JISX0208MS:
        !           193:       if (n < count+2)
        !           194:         goto none;
        !           195:       if (s[0] < 0x80 && s[1] < 0x80) {
        !           196:         int ret;
        !           197:         if (s[0] < 0x75) {
        !           198:           if (s[0] == 0x2d) {
        !           199:             /* Extension of JIS X 0208.  */
        !           200:             if (s[1] >= 0x21 && s[1] <= 0x79) {
        !           201:               unsigned char i = (s[1] - 0x21) + 1;
        !           202:               ret = cp50221_0208_ext_mbtowc(conv,pwc,&i,1);
        !           203:               if (ret == 1)
        !           204:                 ret = 2;
        !           205:             } else
        !           206:               ret = RET_ILSEQ;
        !           207:           } else {
        !           208:             /* JIS X 0208.  */
        !           209:             ret = jisx0208_mbtowc(conv,pwc,s,2);
        !           210:           }
        !           211:         } else {
        !           212:           /* Extension of JIS X 0208.
        !           213:              0x{75..7E}{21..8E} maps to U+E000..U+E3AB.
        !           214:              But some rows maps to characters present in CP932.  */
        !           215:           if (s[0] <= 0x7e && (s[1] >= 0x21 && s[1] <= 0x7e)) {
        !           216:             unsigned short wc = 0xfffd;
        !           217:             if (s[0] >= 0x79 && s[0] <= 0x7c)
        !           218:               wc = cp932ext_2uni_pageed[(s[0] - 0x79) * 94 + (s[1] - 0x21)];
        !           219:             if (wc == 0xfffd)
        !           220:               wc = (s[0] - 0x75) * 94 + (s[1] - 0x21) + 0xe000;
        !           221:             *pwc = wc;
        !           222:             ret = 2;
        !           223:           } else
        !           224:             ret = RET_ILSEQ;
        !           225:         }
        !           226:         if (ret == RET_ILSEQ)
        !           227:           goto ilseq;
        !           228:         if (ret != 2) abort();
        !           229:         conv->istate = state;
        !           230:         return count+2;
        !           231:       } else
        !           232:         goto ilseq;
        !           233:     case STATE_JISX0212MS:
        !           234:       if (n < count+2)
        !           235:         goto none;
        !           236:       if (s[0] < 0x80 && s[1] < 0x80) {
        !           237:         int ret;
        !           238:         if (s[0] < 0x73) {
        !           239:           /* JIS X 0212.  */
        !           240:           ret = jisx0212_mbtowc(conv,pwc,s,2);
        !           241:         } else {
        !           242:           if (s[0] < 0x75) {
        !           243:             /* Extension of JIS X 0212.  */
        !           244:             if (s[1] >= 0x21 && s[1] <= 0x7e) {
        !           245:               unsigned char i = (s[0] - 0x73) * 94 + (s[1] - 0x21) + 1;
        !           246:               ret = cp50221_0212_ext_mbtowc(conv,pwc,&i,1);
        !           247:               if (ret == 1)
        !           248:                 ret = 2;
        !           249:             } else
        !           250:               ret = RET_ILSEQ;
        !           251:           } else {
        !           252:             /* Extension of JIS X 0208.
        !           253:                0x{75..7E}{21..8E} maps to U+E3AC..U+E757.  */
        !           254:             if (s[0] <= 0x7e && (s[1] >= 0x21 && s[1] <= 0x7e)) {
        !           255:               *pwc = (s[0] - 0x75) * 94 + (s[1] - 0x21) + 0xe3ac;
        !           256:               ret = 2;
        !           257:             } else
        !           258:               ret = RET_ILSEQ;
        !           259:           }
        !           260:         }
        !           261:         if (ret == RET_ILSEQ)
        !           262:           goto ilseq;
        !           263:         if (ret != 2) abort();
        !           264:         conv->istate = state;
        !           265:         return count+2;
        !           266:       } else
        !           267:         goto ilseq;
        !           268:     default: abort();
        !           269:   }
        !           270: 
        !           271: none:
        !           272:   conv->istate = state;
        !           273:   return RET_TOOFEW(count);
        !           274: 
        !           275: ilseq:
        !           276:   conv->istate = state;
        !           277:   return RET_SHIFT_ILSEQ(count);
        !           278: }
        !           279: 
        !           280: static int
        !           281: iso2022_jpms_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
        !           282: {
        !           283:   state_t state = conv->ostate;
        !           284:   unsigned char buf[2];
        !           285:   int ret;
        !           286: 
        !           287:   /* Try ASCII. */
        !           288:   ret = ascii_wctomb(conv,buf,wc,1);
        !           289:   if (ret != RET_ILUNI) {
        !           290:     if (ret != 1) abort();
        !           291:     if (buf[0] < 0x80) {
        !           292:       int count = (state == STATE_ASCII ? 1 : 4);
        !           293:       if (n < count)
        !           294:         return RET_TOOSMALL;
        !           295:       if (state != STATE_ASCII) {
        !           296:         r[0] = ESC;
        !           297:         r[1] = '(';
        !           298:         r[2] = 'B';
        !           299:         r += 3;
        !           300:         state = STATE_ASCII;
        !           301:       }
        !           302:       r[0] = buf[0];
        !           303:       conv->ostate = state;
        !           304:       return count;
        !           305:     }
        !           306:   }
        !           307: 
        !           308:   /* Try JIS X 0201-1976 Katakana. */
        !           309:   ret = jisx0201_wctomb(conv,buf,wc,1);
        !           310:   if (ret != RET_ILUNI) {
        !           311:     if (ret != 1) abort();
        !           312:     if (buf[0] >= 0x80) {
        !           313:       int count = (state == STATE_JISX0201KATAKANA ? 1 : 4);
        !           314:       if (n < count)
        !           315:         return RET_TOOSMALL;
        !           316:       if (state != STATE_JISX0201KATAKANA) {
        !           317:         r[0] = ESC;
        !           318:         r[1] = '(';
        !           319:         r[2] = 'I';
        !           320:         r += 3;
        !           321:         state = STATE_JISX0201KATAKANA;
        !           322:       }
        !           323:       r[0] = buf[0]-0x80;
        !           324:       conv->ostate = state;
        !           325:       return count;
        !           326:     }
        !           327:   }
        !           328: 
        !           329:   /* Try JIS X 0208-1990, in place of JIS X 0208-1978 and JIS X 0208-1983,
        !           330:      and the extensions mentioned above.  */
        !           331:   if (wc >= 0xe000 && wc < 0xe3ac) {
        !           332:     unsigned short i = wc - 0xe000;
        !           333:     buf[0] = (i / 94) + 0x75;
        !           334:     buf[1] = (i % 94) + 0x21;
        !           335:     ret = 2;
        !           336:   } else {
        !           337:     ret = jisx0208_wctomb(conv,buf,wc,2);
        !           338:     if (ret == RET_ILUNI) {
        !           339:       /* Extension of JIS X 0208.  */
        !           340:       unsigned char i;
        !           341:       ret = cp50221_0208_ext_wctomb(conv,&i,wc,1);
        !           342:       if (ret == 1) {
        !           343:         buf[0] = 0x2d;
        !           344:         buf[1] = i-1 + 0x21;
        !           345:         ret = 2;
        !           346:       } else if (wc == 0x663B) {
        !           347:         buf[0] = 0x7a;
        !           348:         buf[1] = 0x36;
        !           349:         ret = 2;
        !           350:       } else if (wc == 0xffe2) {
        !           351:         buf[0] = 0x7c;
        !           352:         buf[1] = 0x7b;
        !           353:         ret = 2;
        !           354:       } else if (wc == 0xffe4) {
        !           355:         buf[0] = 0x7c;
        !           356:         buf[1] = 0x7c;
        !           357:         ret = 2;
        !           358:       }
        !           359:     }
        !           360:   }
        !           361:   if (ret != RET_ILUNI) {
        !           362:     if (ret != 2) abort();
        !           363:     if (buf[0] < 0x80 && buf[1] < 0x80) {
        !           364:       int count = (state == STATE_JISX0208MS ? 2 : 5);
        !           365:       if (n < count)
        !           366:         return RET_TOOSMALL;
        !           367:       if (state != STATE_JISX0208MS) {
        !           368:         r[0] = ESC;
        !           369:         r[1] = '$';
        !           370:         r[2] = 'B';
        !           371:         r += 3;
        !           372:         state = STATE_JISX0208MS;
        !           373:       }
        !           374:       r[0] = buf[0];
        !           375:       r[1] = buf[1];
        !           376:       conv->ostate = state;
        !           377:       return count;
        !           378:     }
        !           379:   }
        !           380: 
        !           381:   /* Try JIS X 0212-1990 and the extensions mentioned above. */
        !           382:   if (wc >= 0xe3ac && wc < 0xe758) {
        !           383:     unsigned short i = wc - 0xe3ac;
        !           384:     buf[0] = (i / 94) + 0x75;
        !           385:     buf[1] = (i % 94) + 0x21;
        !           386:     ret = 2;
        !           387:   } else {
        !           388:     ret = jisx0212_wctomb(conv,buf,wc,2);
        !           389:     if (ret == RET_ILUNI) {
        !           390:       /* Extension of JIS X 0212.  */
        !           391:       unsigned char i;
        !           392:       ret = cp50221_0212_ext_wctomb(conv,&i,wc,1);
        !           393:       if (ret == 1) {
        !           394:         i -= 1;
        !           395:         buf[0] = (i / 94) + 0x73;
        !           396:         buf[1] = (i % 94) + 0x21;
        !           397:         ret = 2;
        !           398:       }
        !           399:     }
        !           400:   }
        !           401:   if (ret != RET_ILUNI) {
        !           402:     if (ret != 2) abort();
        !           403:     if (buf[0] < 0x80 && buf[1] < 0x80) {
        !           404:       int count = (state == STATE_JISX0212MS ? 2 : 6);
        !           405:       if (n < count)
        !           406:         return RET_TOOSMALL;
        !           407:       if (state != STATE_JISX0212MS) {
        !           408:         r[0] = ESC;
        !           409:         r[1] = '$';
        !           410:         r[2] = '(';
        !           411:         r[3] = 'D';
        !           412:         r += 4;
        !           413:         state = STATE_JISX0212MS;
        !           414:       }
        !           415:       r[0] = buf[0];
        !           416:       r[1] = buf[1];
        !           417:       conv->ostate = state;
        !           418:       return count;
        !           419:     }
        !           420:   }
        !           421: 
        !           422:   return RET_ILUNI;
        !           423: }
        !           424: 
        !           425: static int
        !           426: iso2022_jpms_reset (conv_t conv, unsigned char *r, size_t n)
        !           427: {
        !           428:   state_t state = conv->ostate;
        !           429:   if (state != STATE_ASCII) {
        !           430:     if (n < 3)
        !           431:       return RET_TOOSMALL;
        !           432:     r[0] = ESC;
        !           433:     r[1] = '(';
        !           434:     r[2] = 'B';
        !           435:     /* conv->ostate = 0; will be done by the caller */
        !           436:     return 3;
        !           437:   } else
        !           438:     return 0;
        !           439: }
        !           440: 
        !           441: #undef STATE_JISX0212MS
        !           442: #undef STATE_JISX0208MS
        !           443: #undef STATE_JISX0201KATAKANA
        !           444: #undef STATE_JISX0201ROMAN
        !           445: #undef STATE_ASCII

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>