File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / libiconv / lib / iso2022_jpms.h
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Wed Mar 17 13:38:46 2021 UTC (3 years, 4 months ago) by misho
Branches: libiconv, MAIN
CVS tags: v1_16p0, HEAD
libiconv 1.16

    1: /*
    2:  * Copyright (C) 1999-2001, 2008, 2011-2012, 2016, 2018 Free Software Foundation, Inc.
    3:  * This file is part of the GNU LIBICONV Library.
    4:  *
    5:  * The GNU LIBICONV Library is free software; you can redistribute it
    6:  * and/or modify it under the terms of the GNU Library General Public
    7:  * License as published by the Free Software Foundation; either version 2
    8:  * of the License, or (at your option) any later version.
    9:  *
   10:  * The GNU LIBICONV Library is distributed in the hope that it will be
   11:  * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
   12:  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   13:  * Library General Public License for more details.
   14:  *
   15:  * You should have received a copy of the GNU Library General Public
   16:  * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
   17:  * If not, see <https://www.gnu.org/licenses/>.
   18:  */
   19: 
   20: /*
   21:  * ISO-2022-JP-MS
   22:  * alias CP50221
   23:  *
   24:  * This is an extension of ISO-2022-JP-1 with larger character sets.
   25:  * It uses ESC $ B and ESC $ ( D to denote *extensions* of JIS X 0208 and
   26:  * JIS X 0212, respectively.  This violates the principles of ISO 2022,
   27:  * where
   28:  *   1. character sets to be used by ISO 2022 have to be registered at the
   29:  *      ISO IR registry <https://www.itscj.ipsj.or.jp/ISO-IR/>,
   30:  *   2. different character sets are designated by different escape
   31:  *      sequences.
   32:  * It's a typical instance of the "embrace and extend" strategy by Microsoft
   33:  * <https://en.wikipedia.org/wiki/Embrace,_extend_and_extinguish>.
   34:  */
   35: 
   36: /*
   37:  * Windows has three encodings CP50220, CP50221, CP50222.
   38:  * The common parts are:
   39:  *   - US-ASCII (0x00..0x7F)
   40:  *   - JIS X 0208 extended by
   41:  *       - one row (0x2D),
   42:  *       - a private use area (rows 0x75..0x7E = U+E000..U+E3AB),
   43:  *     enabled with ESC $ B, disabled with ESC ( B.
   44:  *   - JIS X 0212 extended by
   45:  *       - two rows (0x73..0x74),
   46:  *       - a private use area (rows 0x75..0x7E = U+E3AC..U+E757),
   47:  *     enabled with ESC $ ( D, disabled with ESC ( B.
   48:  * They differ in the handling of JIS X 0201 characters (halfwidth Katakana)
   49:  * in the conversion direction Unicode -> CP5022x:
   50:  *   * CP50220 maps the halfwidth Katakana to fullwidth Katakana characters.
   51:  *   * CP50221 contains the JIS X 0201 halfwidth Katakana characters,
   52:  *     enabled with ESC ( I, disabled with ESC ( B.
   53:  *   * CP50222 contains the JIS X 0201 halfwidth Katakana characters,
   54:  *     enabled with ESC ( J 0x0E, disabled with ESC ( B.
   55:  * In the conversion direction CP5022x -> Unicode, all three operate the same:
   56:  *   - ESC ( I is supported and understood.
   57:  *   - ESC ( J 0x0E is not accepted.  (Tested on Windows XP SP3.)
   58:  * Conclusion:
   59:  *   - CP50222 should not be used, because the multibyte sequence that it
   60:  *     produces cannot be parsed by any of the three encodings.
   61:  *   - CP50221 is preferable to CP50220, because it can faithfully represent
   62:  *     the halfwidth Katakana characters.
   63:  * We therefore implement CP50221.  As an extension, in the mbtowc conversion
   64:  * direction, we support also ESC ( J 0x0E, just in case.
   65:  */
   66: 
   67: #include "cp50221_0208_ext.h"
   68: #include "cp50221_0212_ext.h"
   69: 
   70: #define ESC 0x1b  /* first byte of every designation sequence handled below */
   71: #define SO  0x0e  /* mbtowc only: switches JIS X 0201 Roman to Katakana */
   72: #define SI  0x0f  /* mbtowc only: switches JIS X 0201 Katakana to Roman */
   73: 
   74: /* The state (conv->istate when decoding, conv->ostate when encoding)
   75:  * can be one of the following values.  The comment next to each value
   76:  * shows the escape sequence that selects it.  */
   77: #define STATE_ASCII             0  /* Esc ( B */
   78: #define STATE_JISX0201ROMAN     1  /* Esc ( J */ /* only in mbtowc direction */
   79: #define STATE_JISX0201KATAKANA  2  /* Esc ( I */
   80: #define STATE_JISX0208MS        3  /* Esc $ @ or Esc $ B */
   81: #define STATE_JISX0212MS        4  /* Esc $ ( D */
   82: 
   83: static int
   84: iso2022_jpms_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, size_t n)
   85: {
   86:   state_t state = conv->istate;
   87:   int count = 0;
   88:   unsigned char c;
   89:   for (;;) {
   90:     c = *s;
   91:     if (c == ESC) {
   92:       if (n < count+3)
   93:         goto none;
   94:       if (s[1] == '(') {
   95:         if (s[2] == 'B') {
   96:           state = STATE_ASCII;
   97:           s += 3; count += 3;
   98:           if (n < count+1)
   99:             goto none;
  100:           continue;
  101:         }
  102:         if (s[2] == 'I') {
  103:           state = STATE_JISX0201KATAKANA;
  104:           s += 3; count += 3;
  105:           if (n < count+1)
  106:             goto none;
  107:           continue;
  108:         }
  109:         if (s[2] == 'J') {
  110:           state = STATE_JISX0201ROMAN;
  111:           s += 3; count += 3;
  112:           if (n < count+1)
  113:             goto none;
  114:           continue;
  115:         }
  116:         goto ilseq;
  117:       }
  118:       if (s[1] == '$') {
  119:         if (s[2] == '@' || s[2] == 'B') {
  120:           /* We don't distinguish JIS X 0208-1978 and JIS X 0208-1983. */
  121:           state = STATE_JISX0208MS;
  122:           s += 3; count += 3;
  123:           if (n < count+1)
  124:             goto none;
  125:           continue;
  126:         }
  127:         if (s[2] == '(') {
  128:           if (n < count+4)
  129:             goto none;
  130:           if (s[3] == 'D') {
  131:             state = STATE_JISX0212MS;
  132:             s += 4; count += 4;
  133:             if (n < count+1)
  134:               goto none;
  135:             continue;
  136:           }
  137:         }
  138:         goto ilseq;
  139:       }
  140:       goto ilseq;
  141:     }
  142:     if (c == SO) {
  143:       if (state == STATE_JISX0201ROMAN)
  144:         state = STATE_JISX0201KATAKANA;
  145:       s += 1; count += 1;
  146:       if (n < count+1)
  147:         goto none;
  148:       continue;
  149:     }
  150:     if (c == SI) {
  151:       if (state == STATE_JISX0201KATAKANA)
  152:         state = STATE_JISX0201ROMAN;
  153:       s += 1; count += 1;
  154:       if (n < count+1)
  155:         goto none;
  156:       continue;
  157:     }
  158:     break;
  159:   }
  160:   switch (state) {
  161:     case STATE_ASCII:
  162:       if (c < 0x80) {
  163:         int ret = ascii_mbtowc(conv,pwc,s,1);
  164:         if (ret == RET_ILSEQ)
  165:           goto ilseq;
  166:         if (ret != 1) abort();
  167:         conv->istate = state;
  168:         return count+1;
  169:       } else
  170:         goto ilseq;
  171:     case STATE_JISX0201ROMAN:
  172:       if (c < 0x80) {
  173:         int ret = jisx0201_mbtowc(conv,pwc,s,1);
  174:         if (ret == RET_ILSEQ)
  175:           goto ilseq;
  176:         if (ret != 1) abort();
  177:         conv->istate = state;
  178:         return count+1;
  179:       } else
  180:         goto ilseq;
  181:     case STATE_JISX0201KATAKANA:
  182:       if (c < 0x80) {
  183:         unsigned char buf = c+0x80;
  184:         int ret = jisx0201_mbtowc(conv,pwc,&buf,1);
  185:         if (ret == RET_ILSEQ)
  186:           goto ilseq;
  187:         if (ret != 1) abort();
  188:         conv->istate = state;
  189:         return count+1;
  190:       } else
  191:         goto ilseq;
  192:     case STATE_JISX0208MS:
  193:       if (n < count+2)
  194:         goto none;
  195:       if (s[0] < 0x80 && s[1] < 0x80) {
  196:         int ret;
  197:         if (s[0] < 0x75) {
  198:           if (s[0] == 0x2d) {
  199:             /* Extension of JIS X 0208.  */
  200:             if (s[1] >= 0x21 && s[1] <= 0x79) {
  201:               unsigned char i = (s[1] - 0x21) + 1;
  202:               ret = cp50221_0208_ext_mbtowc(conv,pwc,&i,1);
  203:               if (ret == 1)
  204:                 ret = 2;
  205:             } else
  206:               ret = RET_ILSEQ;
  207:           } else {
  208:             /* JIS X 0208.  */
  209:             ret = jisx0208_mbtowc(conv,pwc,s,2);
  210:           }
  211:         } else {
  212:           /* Extension of JIS X 0208.
  213:              0x{75..7E}{21..8E} maps to U+E000..U+E3AB.
  214:              But some rows maps to characters present in CP932.  */
  215:           if (s[0] <= 0x7e && (s[1] >= 0x21 && s[1] <= 0x7e)) {
  216:             unsigned short wc = 0xfffd;
  217:             if (s[0] >= 0x79 && s[0] <= 0x7c)
  218:               wc = cp932ext_2uni_pageed[(s[0] - 0x79) * 94 + (s[1] - 0x21)];
  219:             if (wc == 0xfffd)
  220:               wc = (s[0] - 0x75) * 94 + (s[1] - 0x21) + 0xe000;
  221:             *pwc = wc;
  222:             ret = 2;
  223:           } else
  224:             ret = RET_ILSEQ;
  225:         }
  226:         if (ret == RET_ILSEQ)
  227:           goto ilseq;
  228:         if (ret != 2) abort();
  229:         conv->istate = state;
  230:         return count+2;
  231:       } else
  232:         goto ilseq;
  233:     case STATE_JISX0212MS:
  234:       if (n < count+2)
  235:         goto none;
  236:       if (s[0] < 0x80 && s[1] < 0x80) {
  237:         int ret;
  238:         if (s[0] < 0x73) {
  239:           /* JIS X 0212.  */
  240:           ret = jisx0212_mbtowc(conv,pwc,s,2);
  241:         } else {
  242:           if (s[0] < 0x75) {
  243:             /* Extension of JIS X 0212.  */
  244:             if (s[1] >= 0x21 && s[1] <= 0x7e) {
  245:               unsigned char i = (s[0] - 0x73) * 94 + (s[1] - 0x21) + 1;
  246:               ret = cp50221_0212_ext_mbtowc(conv,pwc,&i,1);
  247:               if (ret == 1)
  248:                 ret = 2;
  249:             } else
  250:               ret = RET_ILSEQ;
  251:           } else {
  252:             /* Extension of JIS X 0208.
  253:                0x{75..7E}{21..8E} maps to U+E3AC..U+E757.  */
  254:             if (s[0] <= 0x7e && (s[1] >= 0x21 && s[1] <= 0x7e)) {
  255:               *pwc = (s[0] - 0x75) * 94 + (s[1] - 0x21) + 0xe3ac;
  256:               ret = 2;
  257:             } else
  258:               ret = RET_ILSEQ;
  259:           }
  260:         }
  261:         if (ret == RET_ILSEQ)
  262:           goto ilseq;
  263:         if (ret != 2) abort();
  264:         conv->istate = state;
  265:         return count+2;
  266:       } else
  267:         goto ilseq;
  268:     default: abort();
  269:   }
  270: 
  271: none:
  272:   conv->istate = state;
  273:   return RET_TOOFEW(count);
  274: 
  275: ilseq:
  276:   conv->istate = state;
  277:   return RET_SHIFT_ILSEQ(count);
  278: }
  279: 
  280: static int
  281: iso2022_jpms_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
  282: {
  283:   state_t state = conv->ostate;
  284:   unsigned char buf[2];
  285:   int ret;
  286: 
  287:   /* Try ASCII. */
  288:   ret = ascii_wctomb(conv,buf,wc,1);
  289:   if (ret != RET_ILUNI) {
  290:     if (ret != 1) abort();
  291:     if (buf[0] < 0x80) {
  292:       int count = (state == STATE_ASCII ? 1 : 4);
  293:       if (n < count)
  294:         return RET_TOOSMALL;
  295:       if (state != STATE_ASCII) {
  296:         r[0] = ESC;
  297:         r[1] = '(';
  298:         r[2] = 'B';
  299:         r += 3;
  300:         state = STATE_ASCII;
  301:       }
  302:       r[0] = buf[0];
  303:       conv->ostate = state;
  304:       return count;
  305:     }
  306:   }
  307: 
  308:   /* Try JIS X 0201-1976 Katakana. */
  309:   ret = jisx0201_wctomb(conv,buf,wc,1);
  310:   if (ret != RET_ILUNI) {
  311:     if (ret != 1) abort();
  312:     if (buf[0] >= 0x80) {
  313:       int count = (state == STATE_JISX0201KATAKANA ? 1 : 4);
  314:       if (n < count)
  315:         return RET_TOOSMALL;
  316:       if (state != STATE_JISX0201KATAKANA) {
  317:         r[0] = ESC;
  318:         r[1] = '(';
  319:         r[2] = 'I';
  320:         r += 3;
  321:         state = STATE_JISX0201KATAKANA;
  322:       }
  323:       r[0] = buf[0]-0x80;
  324:       conv->ostate = state;
  325:       return count;
  326:     }
  327:   }
  328: 
  329:   /* Try JIS X 0208-1990, in place of JIS X 0208-1978 and JIS X 0208-1983,
  330:      and the extensions mentioned above.  */
  331:   if (wc >= 0xe000 && wc < 0xe3ac) {
  332:     unsigned short i = wc - 0xe000;
  333:     buf[0] = (i / 94) + 0x75;
  334:     buf[1] = (i % 94) + 0x21;
  335:     ret = 2;
  336:   } else {
  337:     ret = jisx0208_wctomb(conv,buf,wc,2);
  338:     if (ret == RET_ILUNI) {
  339:       /* Extension of JIS X 0208.  */
  340:       unsigned char i;
  341:       ret = cp50221_0208_ext_wctomb(conv,&i,wc,1);
  342:       if (ret == 1) {
  343:         buf[0] = 0x2d;
  344:         buf[1] = i-1 + 0x21;
  345:         ret = 2;
  346:       } else if (wc == 0x663B) {
  347:         buf[0] = 0x7a;
  348:         buf[1] = 0x36;
  349:         ret = 2;
  350:       } else if (wc == 0xffe2) {
  351:         buf[0] = 0x7c;
  352:         buf[1] = 0x7b;
  353:         ret = 2;
  354:       } else if (wc == 0xffe4) {
  355:         buf[0] = 0x7c;
  356:         buf[1] = 0x7c;
  357:         ret = 2;
  358:       }
  359:     }
  360:   }
  361:   if (ret != RET_ILUNI) {
  362:     if (ret != 2) abort();
  363:     if (buf[0] < 0x80 && buf[1] < 0x80) {
  364:       int count = (state == STATE_JISX0208MS ? 2 : 5);
  365:       if (n < count)
  366:         return RET_TOOSMALL;
  367:       if (state != STATE_JISX0208MS) {
  368:         r[0] = ESC;
  369:         r[1] = '$';
  370:         r[2] = 'B';
  371:         r += 3;
  372:         state = STATE_JISX0208MS;
  373:       }
  374:       r[0] = buf[0];
  375:       r[1] = buf[1];
  376:       conv->ostate = state;
  377:       return count;
  378:     }
  379:   }
  380: 
  381:   /* Try JIS X 0212-1990 and the extensions mentioned above. */
  382:   if (wc >= 0xe3ac && wc < 0xe758) {
  383:     unsigned short i = wc - 0xe3ac;
  384:     buf[0] = (i / 94) + 0x75;
  385:     buf[1] = (i % 94) + 0x21;
  386:     ret = 2;
  387:   } else {
  388:     ret = jisx0212_wctomb(conv,buf,wc,2);
  389:     if (ret == RET_ILUNI) {
  390:       /* Extension of JIS X 0212.  */
  391:       unsigned char i;
  392:       ret = cp50221_0212_ext_wctomb(conv,&i,wc,1);
  393:       if (ret == 1) {
  394:         i -= 1;
  395:         buf[0] = (i / 94) + 0x73;
  396:         buf[1] = (i % 94) + 0x21;
  397:         ret = 2;
  398:       }
  399:     }
  400:   }
  401:   if (ret != RET_ILUNI) {
  402:     if (ret != 2) abort();
  403:     if (buf[0] < 0x80 && buf[1] < 0x80) {
  404:       int count = (state == STATE_JISX0212MS ? 2 : 6);
  405:       if (n < count)
  406:         return RET_TOOSMALL;
  407:       if (state != STATE_JISX0212MS) {
  408:         r[0] = ESC;
  409:         r[1] = '$';
  410:         r[2] = '(';
  411:         r[3] = 'D';
  412:         r += 4;
  413:         state = STATE_JISX0212MS;
  414:       }
  415:       r[0] = buf[0];
  416:       r[1] = buf[1];
  417:       conv->ostate = state;
  418:       return count;
  419:     }
  420:   }
  421: 
  422:   return RET_ILUNI;
  423: }
  424: 
  425: static int
  426: iso2022_jpms_reset (conv_t conv, unsigned char *r, size_t n)
  427: {
  428:   state_t state = conv->ostate;
  429:   if (state != STATE_ASCII) {
  430:     if (n < 3)
  431:       return RET_TOOSMALL;
  432:     r[0] = ESC;
  433:     r[1] = '(';
  434:     r[2] = 'B';
  435:     /* conv->ostate = 0; will be done by the caller */
  436:     return 3;
  437:   } else
  438:     return 0;
  439: }
  440: 
  441: #undef STATE_JISX0212MS  /* the STATE_* names are dropped here; ESC, SO and SI stay defined */
  442: #undef STATE_JISX0208MS
  443: #undef STATE_JISX0201KATAKANA
  444: #undef STATE_JISX0201ROMAN
  445: #undef STATE_ASCII

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>