File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / libiconv / lib / iso2022_jp3.h
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Tue Feb 21 22:57:48 2012 UTC (12 years, 5 months ago) by misho
Branches: libiconv, MAIN
CVS tags: v1_14p0, v1_14, v1_13_1, HEAD
libiconv

    1: /*
    2:  * Copyright (C) 1999-2004, 2008 Free Software Foundation, Inc.
    3:  * This file is part of the GNU LIBICONV Library.
    4:  *
    5:  * The GNU LIBICONV Library is free software; you can redistribute it
    6:  * and/or modify it under the terms of the GNU Library General Public
    7:  * License as published by the Free Software Foundation; either version 2
    8:  * of the License, or (at your option) any later version.
    9:  *
   10:  * The GNU LIBICONV Library is distributed in the hope that it will be
   11:  * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
   12:  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   13:  * Library General Public License for more details.
   14:  *
   15:  * You should have received a copy of the GNU Library General Public
   16:  * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
   17:  * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,
   18:  * Fifth Floor, Boston, MA 02110-1301, USA.
   19:  */
   20: 
   21: /*
   22:  * ISO-2022-JP-3
   23:  */
   24: 
   25: #include "jisx0213.h"
   26: 
/* The escape character that introduces every designation sequence. */
#define ESC 0x1b

/*
 * The state is composed of one of the following values.
 * The comment next to each value shows the escape sequence(s) that select it.
 * All values fit in 3 bits: the converters in this file keep the state in
 * the low 3 bits of conv->istate / conv->ostate and store buffered data in
 * the bits above.
 */
#define STATE_ASCII             0  /* Esc ( B */
#define STATE_JISX0201ROMAN     1  /* Esc ( J */
#define STATE_JISX0201KATAKANA  2  /* Esc ( I */
#define STATE_JISX0208          3  /* Esc $ @ or Esc $ B */
#define STATE_JISX02131         4  /* Esc $ ( O or Esc $ ( Q */
#define STATE_JISX02132         5  /* Esc $ ( P */
   38: 
   39: /*
   40:  * In the ISO-2022-JP-3 to UCS-4 direction, the state also holds the last
   41:  * character to be output, shifted by 3 bits.
   42:  */
   43: 
   44: static int
   45: iso2022_jp3_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)
   46: {
   47:   ucs4_t last_wc = conv->istate >> 3;
   48:   if (last_wc) {
   49:     /* Output the buffered character. */
   50:     conv->istate &= 7;
   51:     *pwc = last_wc;
   52:     return 0; /* Don't advance the input pointer. */
   53:   } else {
   54:     state_t state = conv->istate;
   55:     int count = 0;
   56:     unsigned char c;
   57:     for (;;) {
   58:       c = *s;
   59:       if (c == ESC) {
   60:         if (n < count+3)
   61:           goto none;
   62:         if (s[1] == '(') {
   63:           if (s[2] == 'B') {
   64:             state = STATE_ASCII;
   65:             s += 3; count += 3;
   66:             if (n < count+1)
   67:               goto none;
   68:             continue;
   69:           }
   70:           if (s[2] == 'J') {
   71:             state = STATE_JISX0201ROMAN;
   72:             s += 3; count += 3;
   73:             if (n < count+1)
   74:               goto none;
   75:             continue;
   76:           }
   77:           if (s[2] == 'I') {
   78:             state = STATE_JISX0201KATAKANA;
   79:             s += 3; count += 3;
   80:             if (n < count+1)
   81:               goto none;
   82:             continue;
   83:           }
   84:           goto ilseq;
   85:         }
   86:         if (s[1] == '$') {
   87:           if (s[2] == '@' || s[2] == 'B') {
   88:             /* We don't distinguish JIS X 0208-1978 and JIS X 0208-1983. */
   89:             state = STATE_JISX0208;
   90:             s += 3; count += 3;
   91:             if (n < count+1)
   92:               goto none;
   93:             continue;
   94:           }
   95:           if (s[2] == '(') {
   96:             if (n < count+4)
   97:               goto none;
   98:             if (s[3] == 'O' || s[3] == 'Q') {
   99:               state = STATE_JISX02131;
  100:               s += 4; count += 4;
  101:               if (n < count+1)
  102:                 goto none;
  103:               continue;
  104:             }
  105:             if (s[3] == 'P') {
  106:               state = STATE_JISX02132;
  107:               s += 4; count += 4;
  108:               if (n < count+1)
  109:                 goto none;
  110:               continue;
  111:             }
  112:           }
  113:           goto ilseq;
  114:         }
  115:         goto ilseq;
  116:       }
  117:       break;
  118:     }
  119:     switch (state) {
  120:       case STATE_ASCII:
  121:         if (c < 0x80) {
  122:           int ret = ascii_mbtowc(conv,pwc,s,1);
  123:           if (ret == RET_ILSEQ)
  124:             goto ilseq;
  125:           if (ret != 1) abort();
  126:           conv->istate = state;
  127:           return count+1;
  128:         } else
  129:           goto ilseq;
  130:       case STATE_JISX0201ROMAN:
  131:         if (c < 0x80) {
  132:           int ret = jisx0201_mbtowc(conv,pwc,s,1);
  133:           if (ret == RET_ILSEQ)
  134:             goto ilseq;
  135:           if (ret != 1) abort();
  136:           conv->istate = state;
  137:           return count+1;
  138:         } else
  139:           goto ilseq;
  140:       case STATE_JISX0201KATAKANA:
  141:         if (c < 0x80) {
  142:           unsigned char buf = c+0x80;
  143:           int ret = jisx0201_mbtowc(conv,pwc,&buf,1);
  144:           if (ret == RET_ILSEQ)
  145:             goto ilseq;
  146:           if (ret != 1) abort();
  147:           conv->istate = state;
  148:           return count+1;
  149:         } else
  150:           goto ilseq;
  151:       case STATE_JISX0208:
  152:         if (n < count+2)
  153:           goto none;
  154:         if (s[0] < 0x80 && s[1] < 0x80) {
  155:           int ret = jisx0208_mbtowc(conv,pwc,s,2);
  156:           if (ret == RET_ILSEQ)
  157:             goto ilseq;
  158:           if (ret != 2) abort();
  159:           conv->istate = state;
  160:           return count+2;
  161:         } else
  162:           goto ilseq;
  163:       case STATE_JISX02131:
  164:       case STATE_JISX02132:
  165:         if (n < count+2)
  166:           goto none;
  167:         if (s[0] < 0x80 && s[1] < 0x80) {
  168:           ucs4_t wc = jisx0213_to_ucs4(((state-STATE_JISX02131+1)<<8)+s[0],s[1]);
  169:           if (wc) {
  170:             if (wc < 0x80) {
  171:               /* It's a combining character. */
  172:               ucs4_t wc1 = jisx0213_to_ucs_combining[wc - 1][0];
  173:               ucs4_t wc2 = jisx0213_to_ucs_combining[wc - 1][1];
  174:               /* We cannot output two Unicode characters at once. So,
  175:                  output the first character and buffer the second one. */
  176:               *pwc = wc1;
  177:               conv->istate = (wc2 << 3) | state;
  178:             } else {
  179:               *pwc = wc;
  180:               conv->istate = state;
  181:             }
  182:             return count+2;
  183:           }
  184:         }
  185:         goto ilseq;
  186:       default: abort();
  187:     }
  188:   none:
  189:     conv->istate = state;
  190:     return RET_TOOFEW(count);
  191: 
  192:   ilseq:
  193:     conv->istate = state;
  194:     return RET_SHIFT_ILSEQ(count);
  195:   }
  196: }
  197: 
  198: static int
  199: iso2022_jp3_flushwc (conv_t conv, ucs4_t *pwc)
  200: {
  201:   ucs4_t last_wc = conv->istate >> 3;
  202:   if (last_wc) {
  203:     /* Output the buffered character. */
  204:     conv->istate &= 7;
  205:     *pwc = last_wc;
  206:     return 1;
  207:   } else
  208:     return 0;
  209: }
  210: 
  211: /*
  212:  * In the UCS-4 to ISO-2022-JP-3 direction, the state also holds the last two
  213:  * bytes to be output, shifted by 3 bits, and the STATE_xxxxx value that was
  214:  * effective before this buffered character, shifted by 19 bits.
  215:  */
  216: 
/* Composition tables for each of the relevant combining characters.
 *
 * All tables live in one array.  For every combining character U+XXXX the
 * macros iso2022_jp3_comp_tableXXXX_idx and iso2022_jp3_comp_tableXXXX_len
 * give the start index and the number of entries of its slice.
 *
 * In each entry, 'base' is the two-byte code of a character that may have
 * been buffered by the encoder, and 'composed' is the two-byte code that is
 * output instead when that base is followed by the combining character.
 * The trailing comments read "composed = base + combining character"; the
 * leading digit 1 in them presumably denotes JIS X 0213 plane 1.  */
static const struct { unsigned short base; unsigned short composed; } iso2022_jp3_comp_table_data[] = {
#define iso2022_jp3_comp_table02e5_idx 0
#define iso2022_jp3_comp_table02e5_len 1
  { 0x2b64, 0x2b65 }, /* 0x12B65 = 0x12B64 U+02E5 */
#define iso2022_jp3_comp_table02e9_idx (iso2022_jp3_comp_table02e5_idx+iso2022_jp3_comp_table02e5_len)
#define iso2022_jp3_comp_table02e9_len 1
  { 0x2b60, 0x2b66 }, /* 0x12B66 = 0x12B60 U+02E9 */
#define iso2022_jp3_comp_table0300_idx (iso2022_jp3_comp_table02e9_idx+iso2022_jp3_comp_table02e9_len)
#define iso2022_jp3_comp_table0300_len 5
  { 0x295c, 0x2b44 }, /* 0x12B44 = 0x1295C U+0300 */
  { 0x2b38, 0x2b48 }, /* 0x12B48 = 0x12B38 U+0300 */
  { 0x2b37, 0x2b4a }, /* 0x12B4A = 0x12B37 U+0300 */
  { 0x2b30, 0x2b4c }, /* 0x12B4C = 0x12B30 U+0300 */
  { 0x2b43, 0x2b4e }, /* 0x12B4E = 0x12B43 U+0300 */
#define iso2022_jp3_comp_table0301_idx (iso2022_jp3_comp_table0300_idx+iso2022_jp3_comp_table0300_len)
#define iso2022_jp3_comp_table0301_len 4
  { 0x2b38, 0x2b49 }, /* 0x12B49 = 0x12B38 U+0301 */
  { 0x2b37, 0x2b4b }, /* 0x12B4B = 0x12B37 U+0301 */
  { 0x2b30, 0x2b4d }, /* 0x12B4D = 0x12B30 U+0301 */
  { 0x2b43, 0x2b4f }, /* 0x12B4F = 0x12B43 U+0301 */
#define iso2022_jp3_comp_table309a_idx (iso2022_jp3_comp_table0301_idx+iso2022_jp3_comp_table0301_len)
#define iso2022_jp3_comp_table309a_len 14
  { 0x242b, 0x2477 }, /* 0x12477 = 0x1242B U+309A */
  { 0x242d, 0x2478 }, /* 0x12478 = 0x1242D U+309A */
  { 0x242f, 0x2479 }, /* 0x12479 = 0x1242F U+309A */
  { 0x2431, 0x247a }, /* 0x1247A = 0x12431 U+309A */
  { 0x2433, 0x247b }, /* 0x1247B = 0x12433 U+309A */
  { 0x252b, 0x2577 }, /* 0x12577 = 0x1252B U+309A */
  { 0x252d, 0x2578 }, /* 0x12578 = 0x1252D U+309A */
  { 0x252f, 0x2579 }, /* 0x12579 = 0x1252F U+309A */
  { 0x2531, 0x257a }, /* 0x1257A = 0x12531 U+309A */
  { 0x2533, 0x257b }, /* 0x1257B = 0x12533 U+309A */
  { 0x253b, 0x257c }, /* 0x1257C = 0x1253B U+309A */
  { 0x2544, 0x257d }, /* 0x1257D = 0x12544 U+309A */
  { 0x2548, 0x257e }, /* 0x1257E = 0x12548 U+309A */
  { 0x2675, 0x2678 }, /* 0x12678 = 0x12675 U+309A */
};
  255: 
/* Unpack the variable 'state' of the enclosing function: declares 'lasttwo'
   (state >> 3, truncated to unsigned short) and 'prevstate' (state >> 19),
   then reduces 'state' itself to its low 3 bits, the STATE_xxxxx value.
   Because it declares variables, it must be used where declarations are
   allowed and at most once per scope.  */
#define SPLIT_STATE \
  unsigned short lasttwo = state >> 3; state_t prevstate = state >> 19; state &= 7
/* Inverse of SPLIT_STATE: merge 'prevstate' and 'lasttwo' back into 'state'.
   Expects 'state' to hold only the STATE_xxxxx value in its low 3 bits.  */
#define COMBINE_STATE \
  state |= (prevstate << 19) | (lasttwo << 3)
/* Marker for the places where nothing is buffered: expands to nothing, so
   'state' is stored as the bare STATE_xxxxx value.  */
#define COMBINE_STATE_NO_LASTTWO \
  /* assume lasttwo == 0, then prevstate is ignored */
  262: 
/*
 * Convert the UCS-4 character wc to ISO-2022-JP-3 and store the resulting
 * bytes in r, which has room for n bytes.
 *
 * conv->ostate packs three fields (see SPLIT_STATE): the current STATE_xxxxx
 * value, a buffered two-byte character (lasttwo) whose output was deferred
 * because a following combining character might merge with it, and the
 * STATE_xxxxx value that was in effect before that buffered character.
 *
 * Returns the number of bytes stored in r.  This includes the bytes of a
 * previously buffered character that this call flushes, and excludes the two
 * bytes of wc itself when wc gets buffered (so the result can be 0).
 * Returns RET_TOOSMALL if n is too small, or RET_ILUNI if none of the
 * character sets tried can represent wc.  conv->ostate is updated only on
 * the paths that return a byte count; on RET_TOOSMALL and RET_ILUNI it is
 * left as it was, so a buffered character stays buffered.
 */
static int
iso2022_jp3_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)
{
  int count = 0;
  unsigned char buf[2];
  unsigned short jch;
  int ret;
  state_t state = conv->ostate;
  SPLIT_STATE; /* declares lasttwo and prevstate; state keeps its low 3 bits */

  if (lasttwo) {
    /* Attempt to combine the last character with this one. */
    unsigned int idx;
    unsigned int len;

    /* Select the slice of the composition table that belongs to wc. */
    if (wc == 0x02e5)
      idx = iso2022_jp3_comp_table02e5_idx,
      len = iso2022_jp3_comp_table02e5_len;
    else if (wc == 0x02e9)
      idx = iso2022_jp3_comp_table02e9_idx,
      len = iso2022_jp3_comp_table02e9_len;
    else if (wc == 0x0300)
      idx = iso2022_jp3_comp_table0300_idx,
      len = iso2022_jp3_comp_table0300_len;
    else if (wc == 0x0301)
      idx = iso2022_jp3_comp_table0301_idx,
      len = iso2022_jp3_comp_table0301_len;
    else if (wc == 0x309a)
      idx = iso2022_jp3_comp_table309a_idx,
      len = iso2022_jp3_comp_table309a_len;
    else
      goto not_combining;

    /* Linear search of that slice for the buffered character.  len stays
       positive only if the loop is left through the break, i.e. on a match. */
    do
      if (iso2022_jp3_comp_table_data[idx].base == lasttwo)
        break;
    while (++idx, --len > 0);

    if (len > 0) {
      /* Output the combined character. */
      /* We know the combined character is in JISX0213 plane 1, but
         the buffered character may have been in JISX0208 or in
         JISX0213 plane 1. */
      count = (state != STATE_JISX02131 ? 4 : 0) + 2;
      if (n < count)
        return RET_TOOSMALL;
      if (state != STATE_JISX02131) {
        r[0] = ESC;
        r[1] = '$';
        r[2] = '(';
        r[3] = 'Q';
        r += 4;
        state = STATE_JISX02131;
      }
      lasttwo = iso2022_jp3_comp_table_data[idx].composed;
      r[0] = (lasttwo >> 8) & 0xff;
      r[1] = lasttwo & 0xff;
      COMBINE_STATE_NO_LASTTWO;
      conv->ostate = state;
      return count;
    }

    /* Falling through to here: wc is a combining character, but the
       buffered character is not one of its bases. */
  not_combining:
    /* Output the buffered character. */
    /* We know it is in JISX0208 or in JISX0213 plane 1. */
    /* prevstate != state means the escape sequence for the buffered
       character has not been written yet; the code below only expects this
       for JISX0208. */
    count = (prevstate != state ? 3 : 0) + 2;
    if (n < count)
      return RET_TOOSMALL;
    if (prevstate != state) {
      if (state != STATE_JISX0208) abort();
      r[0] = ESC;
      r[1] = '$';
      r[2] = 'B';
      r += 3;
    }
    r[0] = (lasttwo >> 8) & 0xff;
    r[1] = lasttwo & 0xff;
    r += 2;
    /* From here on, count and r already account for the flushed character;
       wc itself is handled by the code below. */
  }

  /* Try ASCII. */
  ret = ascii_wctomb(conv,buf,wc,1);
  if (ret != RET_ILUNI) {
    if (ret != 1) abort();
    if (buf[0] < 0x80) {
      /* One byte, plus three for the escape sequence if a switch is needed. */
      count += (state == STATE_ASCII ? 1 : 4);
      if (n < count)
        return RET_TOOSMALL;
      if (state != STATE_ASCII) {
        r[0] = ESC;
        r[1] = '(';
        r[2] = 'B';
        r += 3;
        state = STATE_ASCII;
      }
      r[0] = buf[0];
      COMBINE_STATE_NO_LASTTWO;
      conv->ostate = state;
      return count;
    }
  }

  /* Try JIS X 0201-1976 Roman. */
  ret = jisx0201_wctomb(conv,buf,wc,1);
  if (ret != RET_ILUNI) {
    if (ret != 1) abort();
    /* Only the 7-bit (Roman) half is accepted here; the Katakana half is
       tried last, further below. */
    if (buf[0] < 0x80) {
      count += (state == STATE_JISX0201ROMAN ? 1 : 4);
      if (n < count)
        return RET_TOOSMALL;
      if (state != STATE_JISX0201ROMAN) {
        r[0] = ESC;
        r[1] = '(';
        r[2] = 'J';
        r += 3;
        state = STATE_JISX0201ROMAN;
      }
      r[0] = buf[0];
      COMBINE_STATE_NO_LASTTWO;
      conv->ostate = state;
      return count;
    }
  }

  /* Looked up before the JIS X 0208 attempt because bit 0x0080 of the result
     decides whether the character has to be buffered. */
  jch = ucs4_to_jisx0213(wc);

  /* Try JIS X 0208-1990 in place of JIS X 0208-1978 and JIS X 0208-1983. */
  ret = jisx0208_wctomb(conv,buf,wc,2);
  if (ret != RET_ILUNI) {
    if (ret != 2) abort();
    if (buf[0] < 0x80 && buf[1] < 0x80) {
      if (jch & 0x0080) {
        /* A possible match in comp_table_data. Buffer it. */
        /* Nothing is written for wc now; the Esc $ B sequence, if needed,
           is written when the buffered character is flushed. */
        prevstate = state;
        lasttwo = jch & 0x7f7f;
        state = STATE_JISX0208;
        COMBINE_STATE;
        conv->ostate = state;
        return count;
      } else {
        /* Two bytes, plus three for the escape sequence if needed. */
        count += (state == STATE_JISX0208 ? 2 : 5);
        if (n < count)
          return RET_TOOSMALL;
        if (state != STATE_JISX0208) {
          r[0] = ESC;
          r[1] = '$';
          r[2] = 'B';
          r += 3;
          state = STATE_JISX0208;
        }
        r[0] = buf[0];
        r[1] = buf[1];
        COMBINE_STATE_NO_LASTTWO;
        conv->ostate = state;
        return count;
      }
    }
  }

  /* Try JISX 0213 plane 1 and JISX 0213 plane 2. */
  if (jch != 0) {
    /* Bit 0x8000 of jch selects the plane.  The escape sequence is written
       right away, even if the character itself gets buffered below. */
    if (jch & 0x8000) {
      /* JISX 0213 plane 2. */
      if (state != STATE_JISX02132) {
        count += 4;
        if (n < count)
          return RET_TOOSMALL;
        r[0] = ESC;
        r[1] = '$';
        r[2] = '(';
        r[3] = 'P';
        r += 4;
        state = STATE_JISX02132;
      }
    } else {
      /* JISX 0213 plane 1. */
      if (state != STATE_JISX02131) {
        count += 4;
        if (n < count)
          return RET_TOOSMALL;
        r[0] = ESC;
        r[1] = '$';
        r[2] = '(';
        r[3] = 'Q';
        r += 4;
        state = STATE_JISX02131;
      }
    }
    if (jch & 0x0080) {
      /* A possible match in comp_table_data. We have to buffer it. */
      /* We know it's a JISX 0213 plane 1 character. */
      if (jch & 0x8000) abort();
      /* prevstate == state records that the escape sequence is already out. */
      prevstate = state;
      lasttwo = jch & 0x7f7f;
      COMBINE_STATE;
      conv->ostate = state;
      return count;
    }
    count += 2;
    if (n < count)
      return RET_TOOSMALL;
    /* Strip the plane bit and the buffering flag from the two bytes. */
    r[0] = (jch >> 8) & 0x7f;
    r[1] = jch & 0x7f;
    COMBINE_STATE_NO_LASTTWO;
    conv->ostate = state;
    return count;
  }

  /* Try JIS X 0201-1976 Katakana. This is not officially part of
     ISO-2022-JP-3. Therefore we try it after all other attempts. */
  ret = jisx0201_wctomb(conv,buf,wc,1);
  if (ret != RET_ILUNI) {
    if (ret != 1) abort();
    if (buf[0] >= 0x80) {
      count += (state == STATE_JISX0201KATAKANA ? 1 : 4);
      if (n < count)
        return RET_TOOSMALL;
      if (state != STATE_JISX0201KATAKANA) {
        r[0] = ESC;
        r[1] = '(';
        r[2] = 'I';
        r += 3;
        state = STATE_JISX0201KATAKANA;
      }
      /* The byte is emitted in 7-bit form. */
      r[0] = buf[0]-0x80;
      COMBINE_STATE_NO_LASTTWO;
      conv->ostate = state;
      return count;
    }
  }

  return RET_ILUNI;
}
  496: 
  497: static int
  498: iso2022_jp3_reset (conv_t conv, unsigned char *r, int n)
  499: {
  500:   state_t state = conv->ostate;
  501:   SPLIT_STATE;
  502:   {
  503:     int count =
  504:       (lasttwo ? (prevstate != state ? 3 : 0) + 2 : 0)
  505:       + (state != STATE_ASCII ? 3 : 0);
  506:     if (n < count)
  507:       return RET_TOOSMALL;
  508:     if (lasttwo) {
  509:       if (prevstate != state) {
  510:         if (state != STATE_JISX0208) abort();
  511:         r[0] = ESC;
  512:         r[1] = '$';
  513:         r[2] = 'B';
  514:         r += 3;
  515:       }
  516:       r[0] = (lasttwo >> 8) & 0xff;
  517:       r[1] = lasttwo & 0xff;
  518:       r += 2;
  519:     }
  520:     if (state != STATE_ASCII) {
  521:       r[0] = ESC;
  522:       r[1] = '(';
  523:       r[2] = 'B';
  524:     }
  525:     /* conv->ostate = 0; will be done by the caller */
  526:     return count;
  527:   }
  528: }
  529: 
/* Drop the helper macros and the STATE_xxxxx constants defined in this file.
   ESC is not undefined here and stays defined.  */
#undef COMBINE_STATE_NO_LASTTWO
#undef COMBINE_STATE
#undef SPLIT_STATE
#undef STATE_JISX02132
#undef STATE_JISX02131
#undef STATE_JISX0208
#undef STATE_JISX0201KATAKANA
#undef STATE_JISX0201ROMAN
#undef STATE_ASCII

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>