Annotation of embedaddon/libiconv/lib/iso2022_jp3.h, revision 1.1.1.1

1.1       misho       1: /*
                      2:  * Copyright (C) 1999-2004, 2008 Free Software Foundation, Inc.
                      3:  * This file is part of the GNU LIBICONV Library.
                      4:  *
                      5:  * The GNU LIBICONV Library is free software; you can redistribute it
                      6:  * and/or modify it under the terms of the GNU Library General Public
                      7:  * License as published by the Free Software Foundation; either version 2
                      8:  * of the License, or (at your option) any later version.
                      9:  *
                     10:  * The GNU LIBICONV Library is distributed in the hope that it will be
                     11:  * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
                     12:  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     13:  * Library General Public License for more details.
                     14:  *
                     15:  * You should have received a copy of the GNU Library General Public
                     16:  * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
                     17:  * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,
                     18:  * Fifth Floor, Boston, MA 02110-1301, USA.
                     19:  */
                     20: 
                     21: /*
                     22:  * ISO-2022-JP-3
                     23:  */
                     24: 
                     25: #include "jisx0213.h"
                     26: 
                     27: #define ESC 0x1b  /* first byte of every designation sequence parsed or emitted below */
                     28: 
                     29: /*
                     30:  * The state is composed of one of the following values
                          * (each fits in 3 bits: the converters below keep it in the low 3 bits
                          * of conv->istate / conv->ostate and use the higher bits for buffered
                          * data).  The comment on each line shows the escape sequence(s) that
                          * the decoder accepts for selecting that character set.
                     31:  */
                     32: #define STATE_ASCII             0  /* Esc ( B */
                     33: #define STATE_JISX0201ROMAN     1  /* Esc ( J */
                     34: #define STATE_JISX0201KATAKANA  2  /* Esc ( I */
                     35: #define STATE_JISX0208          3  /* Esc $ @ or Esc $ B */
                     36: #define STATE_JISX02131         4  /* Esc $ ( O or Esc $ ( Q */
                     37: #define STATE_JISX02132         5  /* Esc $ ( P */
                     38: 
                     39: /*
                     40:  * In the ISO-2022-JP-3 to UCS-4 direction, the state also holds the last
                     41:  * character to be output, shifted by 3 bits.
                     42:  */
                     43: 
                     44: static int
                     45: iso2022_jp3_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)
                     46: {
                           /*
                            * Decode one character from the ISO-2022-JP-3 byte stream.
                            *
                            * conv  conversion descriptor; conv->istate holds the current STATE_*
                            *       value in its low 3 bits and, above them, a pending second
                            *       code point left over from a two-code-point sequence.
                            * pwc   receives the decoded code point.
                            * s, n  input bytes and the number of bytes available.
                            *
                            * Returns:
                            *   0                       the pending code point was stored in *pwc
                            *                           and no input was consumed;
                            *   count+1 / count+2       bytes consumed, where count is the length
                            *                           of the escape sequences skipped first;
                            *   RET_TOOFEW(count)       the input ends inside an escape sequence
                            *                           or a two-byte character;
                            *   RET_SHIFT_ILSEQ(count)  an unknown escape sequence or an invalid
                            *                           byte was found.
                            * In the last two cases conv->istate is still updated to the charset
                            * selected by the escape sequences that were parsed completely.
                            *
                            * NOTE(review): *s is read before n is compared against anything, so
                            * this presumably relies on the caller passing n >= 1 -- confirm.
                            */
                     47:   ucs4_t last_wc = conv->istate >> 3;
                     48:   if (last_wc) {
                     49:     /* Output the buffered character. */
                             /* Clearing the upper bits keeps only the charset id, so the next
                                call proceeds with normal decoding. */
                     50:     conv->istate &= 7;
                     51:     *pwc = last_wc;
                     52:     return 0; /* Don't advance the input pointer. */
                     53:   } else {
                     54:     state_t state = conv->istate;
                     55:     int count = 0;
                     56:     unsigned char c;
                             /* Skip over any run of escape sequences, tracking the selected
                                charset in the local 'state'.  The loop ends at the first byte
                                that is not ESC; that byte is left in c and s points at it. */
                     57:     for (;;) {
                     58:       c = *s;
                     59:       if (c == ESC) {
                                 /* The shortest designation sequence is 3 bytes long. */
                     60:         if (n < count+3)
                     61:           goto none;
                     62:         if (s[1] == '(') {
                     63:           if (s[2] == 'B') {
                     64:             state = STATE_ASCII;
                     65:             s += 3; count += 3;
                                     /* At least one more byte is needed for the character that
                                        follows the escape sequence. */
                     66:             if (n < count+1)
                     67:               goto none;
                     68:             continue;
                     69:           }
                     70:           if (s[2] == 'J') {
                     71:             state = STATE_JISX0201ROMAN;
                     72:             s += 3; count += 3;
                     73:             if (n < count+1)
                     74:               goto none;
                     75:             continue;
                     76:           }
                     77:           if (s[2] == 'I') {
                     78:             state = STATE_JISX0201KATAKANA;
                     79:             s += 3; count += 3;
                     80:             if (n < count+1)
                     81:               goto none;
                     82:             continue;
                     83:           }
                     84:           goto ilseq;
                     85:         }
                     86:         if (s[1] == '$') {
                     87:           if (s[2] == '@' || s[2] == 'B') {
                     88:             /* We don't distinguish JIS X 0208-1978 and JIS X 0208-1983. */
                     89:             state = STATE_JISX0208;
                     90:             s += 3; count += 3;
                     91:             if (n < count+1)
                     92:               goto none;
                     93:             continue;
                     94:           }
                     95:           if (s[2] == '(') {
                                     /* The JIS X 0213 designations are 4 bytes long. */
                     96:             if (n < count+4)
                     97:               goto none;
                     98:             if (s[3] == 'O' || s[3] == 'Q') {
                     99:               state = STATE_JISX02131;
                    100:               s += 4; count += 4;
                    101:               if (n < count+1)
                    102:                 goto none;
                    103:               continue;
                    104:             }
                    105:             if (s[3] == 'P') {
                    106:               state = STATE_JISX02132;
                    107:               s += 4; count += 4;
                    108:               if (n < count+1)
                    109:                 goto none;
                    110:               continue;
                    111:             }
                    112:           }
                    113:           goto ilseq;
                    114:         }
                    115:         goto ilseq;
                    116:       }
                    117:       break;
                    118:     }
                             /* Decode the character at s according to the selected charset.
                                Every charset here is 7-bit: bytes >= 0x80 are rejected. */
                    119:     switch (state) {
                    120:       case STATE_ASCII:
                    121:         if (c < 0x80) {
                    122:           int ret = ascii_mbtowc(conv,pwc,s,1);
                    123:           if (ret == RET_ILSEQ)
                    124:             goto ilseq;
                                   /* Given exactly one byte, any other result is treated as an
                                      internal error. */
                    125:           if (ret != 1) abort();
                    126:           conv->istate = state;
                    127:           return count+1;
                    128:         } else
                    129:           goto ilseq;
                    130:       case STATE_JISX0201ROMAN:
                    131:         if (c < 0x80) {
                    132:           int ret = jisx0201_mbtowc(conv,pwc,s,1);
                    133:           if (ret == RET_ILSEQ)
                    134:             goto ilseq;
                    135:           if (ret != 1) abort();
                    136:           conv->istate = state;
                    137:           return count+1;
                    138:         } else
                    139:           goto ilseq;
                    140:       case STATE_JISX0201KATAKANA:
                    141:         if (c < 0x80) {
                                   /* The byte is offset by 0x80 before it is handed to
                                      jisx0201_mbtowc (presumably that is where the helper
                                      expects katakana -- confirm against jisx0201.h). */
                    142:           unsigned char buf = c+0x80;
                    143:           int ret = jisx0201_mbtowc(conv,pwc,&buf,1);
                    144:           if (ret == RET_ILSEQ)
                    145:             goto ilseq;
                    146:           if (ret != 1) abort();
                    147:           conv->istate = state;
                    148:           return count+1;
                    149:         } else
                    150:           goto ilseq;
                    151:       case STATE_JISX0208:
                                 /* Two-byte charset: both bytes must be available. */
                    152:         if (n < count+2)
                    153:           goto none;
                    154:         if (s[0] < 0x80 && s[1] < 0x80) {
                    155:           int ret = jisx0208_mbtowc(conv,pwc,s,2);
                    156:           if (ret == RET_ILSEQ)
                    157:             goto ilseq;
                    158:           if (ret != 2) abort();
                    159:           conv->istate = state;
                    160:           return count+2;
                    161:         } else
                    162:           goto ilseq;
                    163:       case STATE_JISX02131:
                    164:       case STATE_JISX02132:
                    165:         if (n < count+2)
                    166:           goto none;
                    167:         if (s[0] < 0x80 && s[1] < 0x80) {
                                   /* (state-STATE_JISX02131+1) is 1 for plane 1 and 2 for
                                      plane 2; it is placed above the first byte in the first
                                      argument.  A zero result means "no mapping". */
                    168:           ucs4_t wc = jisx0213_to_ucs4(((state-STATE_JISX02131+1)<<8)+s[0],s[1]);
                    169:           if (wc) {
                    170:             if (wc < 0x80) {
                    171:               /* It's a combining character. */
                                       /* Here wc is not a code point but a 1-based index into
                                          jisx0213_to_ucs_combining, which holds the pair of
                                          code points to produce. */
                    172:               ucs4_t wc1 = jisx0213_to_ucs_combining[wc - 1][0];
                    173:               ucs4_t wc2 = jisx0213_to_ucs_combining[wc - 1][1];
                    174:               /* We cannot output two Unicode characters at once. So,
                    175:                  output the first character and buffer the second one. */
                    176:               *pwc = wc1;
                    177:               conv->istate = (wc2 << 3) | state;
                    178:             } else {
                    179:               *pwc = wc;
                    180:               conv->istate = state;
                    181:             }
                    182:             return count+2;
                    183:           }
                    184:         }
                    185:         goto ilseq;
                    186:       default: abort();
                    187:     }
                           /* Input exhausted: remember the charset selected so far. */
                    188:   none:
                    189:     conv->istate = state;
                    190:     return RET_TOOFEW(count);
                    191: 
                           /* Invalid input: the charset selected so far is still remembered. */
                    192:   ilseq:
                    193:     conv->istate = state;
                    194:     return RET_SHIFT_ILSEQ(count);
                    195:   }
                    196: }
                    197: 
                    198: static int
                    199: iso2022_jp3_flushwc (conv_t conv, ucs4_t *pwc)
                    200: {
                           /*
                            * Emit the code point still buffered in conv->istate, if any.
                            *
                            * The bits above the low 3 of conv->istate hold a pending code point
                            * (stored by iso2022_jp3_mbtowc for two-code-point sequences).
                            * Returns 1 after storing it in *pwc and clearing those bits,
                            * or 0 when nothing is pending (*pwc is then left untouched).
                            */
                    201:   ucs4_t last_wc = conv->istate >> 3;
                    202:   if (last_wc) {
                    203:     /* Output the buffered character. */
                    204:     conv->istate &= 7;
                    205:     *pwc = last_wc;
                    206:     return 1;
                    207:   } else
                    208:     return 0;
                    209: }
                    210: 
                    211: /*
                    212:  * In the UCS-4 to ISO-2022-JP-3 direction, the state also holds the last two
                    213:  * bytes to be output, shifted by 3 bits, and the STATE_xxxxx value that was
                    214:  * effective before this buffered character, shifted by 19 bits.
                    215:  */
                    216: 
                    217: /* Composition tables for each of the relevant combining characters.  */
                         /* All tables live in one array.  For each combining character U+XXXX the
                            macros iso2022_jp3_comp_tableXXXX_idx and ..._len give the start index
                            and the number of entries of its slice; each _idx is computed from the
                            previous slice so the slices are contiguous.
                            In an entry, 'base' is the two-byte code of the preceding character as
                            iso2022_jp3_wctomb buffers it, and 'composed' is the two-byte code
                            written instead when that base is followed by the combining character
                            (high byte first, in JIS X 0213 plane 1).  The trailing comments give
                            the same codes with a leading plane digit 1.  */
                    218: static const struct { unsigned short base; unsigned short composed; } iso2022_jp3_comp_table_data[] = {
                    219: #define iso2022_jp3_comp_table02e5_idx 0
                    220: #define iso2022_jp3_comp_table02e5_len 1
                    221:   { 0x2b64, 0x2b65 }, /* 0x12B65 = 0x12B64 U+02E5 */
                    222: #define iso2022_jp3_comp_table02e9_idx (iso2022_jp3_comp_table02e5_idx+iso2022_jp3_comp_table02e5_len)
                    223: #define iso2022_jp3_comp_table02e9_len 1
                    224:   { 0x2b60, 0x2b66 }, /* 0x12B66 = 0x12B60 U+02E9 */
                    225: #define iso2022_jp3_comp_table0300_idx (iso2022_jp3_comp_table02e9_idx+iso2022_jp3_comp_table02e9_len)
                    226: #define iso2022_jp3_comp_table0300_len 5
                    227:   { 0x295c, 0x2b44 }, /* 0x12B44 = 0x1295C U+0300 */
                    228:   { 0x2b38, 0x2b48 }, /* 0x12B48 = 0x12B38 U+0300 */
                    229:   { 0x2b37, 0x2b4a }, /* 0x12B4A = 0x12B37 U+0300 */
                    230:   { 0x2b30, 0x2b4c }, /* 0x12B4C = 0x12B30 U+0300 */
                    231:   { 0x2b43, 0x2b4e }, /* 0x12B4E = 0x12B43 U+0300 */
                    232: #define iso2022_jp3_comp_table0301_idx (iso2022_jp3_comp_table0300_idx+iso2022_jp3_comp_table0300_len)
                    233: #define iso2022_jp3_comp_table0301_len 4
                    234:   { 0x2b38, 0x2b49 }, /* 0x12B49 = 0x12B38 U+0301 */
                    235:   { 0x2b37, 0x2b4b }, /* 0x12B4B = 0x12B37 U+0301 */
                    236:   { 0x2b30, 0x2b4d }, /* 0x12B4D = 0x12B30 U+0301 */
                    237:   { 0x2b43, 0x2b4f }, /* 0x12B4F = 0x12B43 U+0301 */
                    238: #define iso2022_jp3_comp_table309a_idx (iso2022_jp3_comp_table0301_idx+iso2022_jp3_comp_table0301_len)
                    239: #define iso2022_jp3_comp_table309a_len 14
                    240:   { 0x242b, 0x2477 }, /* 0x12477 = 0x1242B U+309A */
                    241:   { 0x242d, 0x2478 }, /* 0x12478 = 0x1242D U+309A */
                    242:   { 0x242f, 0x2479 }, /* 0x12479 = 0x1242F U+309A */
                    243:   { 0x2431, 0x247a }, /* 0x1247A = 0x12431 U+309A */
                    244:   { 0x2433, 0x247b }, /* 0x1247B = 0x12433 U+309A */
                    245:   { 0x252b, 0x2577 }, /* 0x12577 = 0x1252B U+309A */
                    246:   { 0x252d, 0x2578 }, /* 0x12578 = 0x1252D U+309A */
                    247:   { 0x252f, 0x2579 }, /* 0x12579 = 0x1252F U+309A */
                    248:   { 0x2531, 0x257a }, /* 0x1257A = 0x12531 U+309A */
                    249:   { 0x2533, 0x257b }, /* 0x1257B = 0x12533 U+309A */
                    250:   { 0x253b, 0x257c }, /* 0x1257C = 0x1253B U+309A */
                    251:   { 0x2544, 0x257d }, /* 0x1257D = 0x12544 U+309A */
                    252:   { 0x2548, 0x257e }, /* 0x1257E = 0x12548 U+309A */
                    253:   { 0x2675, 0x2678 }, /* 0x12678 = 0x12675 U+309A */
                    254: };
                    255: 
                    256: #define SPLIT_STATE \
                    257:   unsigned short lasttwo = state >> 3; state_t prevstate = state >> 19; state &= 7
                         /* SPLIT_STATE (above) unpacks a local variable named 'state': it declares
                            'lasttwo' (the bits from bit 3 upwards, truncated to unsigned short:
                            the two buffered output bytes) and 'prevstate' (the bits from bit 19
                            upwards: the charset in effect before the buffered character), then
                            reduces 'state' to its low 3 bits.  Because it declares variables it
                            has to be used where declarations are allowed.
                            COMBINE_STATE (below) is the inverse: it packs 'prevstate' and
                            'lasttwo' back into 'state'.  */
                    258: #define COMBINE_STATE \
                    259:   state |= (prevstate << 19) | (lasttwo << 3)
                         /* COMBINE_STATE_NO_LASTTWO (below) expands to nothing, since its body is
                            only a comment: 'state' then carries just the charset id.  */
                    260: #define COMBINE_STATE_NO_LASTTWO \
                    261:   /* assume lasttwo == 0, then prevstate is ignored */
                    262: 
                    263: static int
                    264: iso2022_jp3_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)
                    265: {
                           /*
                            * Encode the code point wc as ISO-2022-JP-3 bytes.
                            *
                            * conv  conversion descriptor; conv->ostate packs the current STATE_*
                            *       value, an optional buffered two-byte character and the
                            *       charset that was in effect before it (see SPLIT_STATE).
                            * r, n  output buffer and its capacity in bytes.
                            * wc    code point to encode.
                            *
                            * Returns the number of bytes written to r, RET_TOOSMALL when n is
                            * insufficient, or RET_ILUNI when wc has no representation.
                            * The count can be 0, or cover only an escape sequence, when wc
                            * itself was only buffered: a character that may be the base of an
                            * entry in iso2022_jp3_comp_table_data is held back until the next
                            * call (or iso2022_jp3_reset) shows whether a combining character
                            * follows.
                            *
                            * Charsets are tried in this order: ASCII, JIS X 0201 Roman,
                            * JIS X 0208, JIS X 0213 plane 1 / plane 2, JIS X 0201 Katakana.
                            *
                            * conv->ostate is only written on the successful paths.
                            * NOTE(review): a buffered character may already have been copied
                            * into r when a later size check returns RET_TOOSMALL; since ostate
                            * is untouched it stays buffered, so the caller presumably retries
                            * the whole call with more room -- confirm.
                            */
                    266:   int count = 0;
                    267:   unsigned char buf[2];
                    268:   unsigned short jch;
                    269:   int ret;
                    270:   state_t state = conv->ostate;
                    271:   SPLIT_STATE;
                    272: 
                    273:   if (lasttwo) {
                    274:     /* Attempt to combine the last character with this one. */
                    275:     unsigned int idx;
                    276:     unsigned int len;
                    277: 
                             /* Select the table slice belonging to this combining character. */
                    278:     if (wc == 0x02e5)
                    279:       idx = iso2022_jp3_comp_table02e5_idx,
                    280:       len = iso2022_jp3_comp_table02e5_len;
                    281:     else if (wc == 0x02e9)
                    282:       idx = iso2022_jp3_comp_table02e9_idx,
                    283:       len = iso2022_jp3_comp_table02e9_len;
                    284:     else if (wc == 0x0300)
                    285:       idx = iso2022_jp3_comp_table0300_idx,
                    286:       len = iso2022_jp3_comp_table0300_len;
                    287:     else if (wc == 0x0301)
                    288:       idx = iso2022_jp3_comp_table0301_idx,
                    289:       len = iso2022_jp3_comp_table0301_len;
                    290:     else if (wc == 0x309a)
                    291:       idx = iso2022_jp3_comp_table309a_idx,
                    292:       len = iso2022_jp3_comp_table309a_len;
                    293:     else
                    294:       goto not_combining;
                    295: 
                             /* Linear search of the slice for the buffered base character.
                                On a match the loop is left with len > 0; otherwise len
                                reaches 0. */
                    296:     do
                    297:       if (iso2022_jp3_comp_table_data[idx].base == lasttwo)
                    298:         break;
                    299:     while (++idx, --len > 0);
                    300: 
                    301:     if (len > 0) {
                    302:       /* Output the combined character. */
                    303:       /* We know the combined character is in JISX0213 plane 1, but
                    304:          the buffered character may have been in JISX0208 or in
                    305:          JISX0213 plane 1. */
                               /* 2 bytes for the character, plus 4 for ESC $ ( Q if needed. */
                    306:       count = (state != STATE_JISX02131 ? 4 : 0) + 2;
                    307:       if (n < count)
                    308:         return RET_TOOSMALL;
                    309:       if (state != STATE_JISX02131) {
                    310:         r[0] = ESC;
                    311:         r[1] = '$';
                    312:         r[2] = '(';
                    313:         r[3] = 'Q';
                    314:         r += 4;
                    315:         state = STATE_JISX02131;
                    316:       }
                    317:       lasttwo = iso2022_jp3_comp_table_data[idx].composed;
                    318:       r[0] = (lasttwo >> 8) & 0xff;
                    319:       r[1] = lasttwo & 0xff;
                               /* The buffer is now empty: ostate keeps only the charset id. */
                    320:       COMBINE_STATE_NO_LASTTWO;
                    321:       conv->ostate = state;
                    322:       return count;
                    323:     }
                    324: 
                    325:   not_combining:
                    326:     /* Output the buffered character. */
                    327:     /* We know it is in JISX0208 or in JISX0213 plane 1. */
                             /* prevstate != state means the escape sequence for the buffered
                                character was not written when it was buffered (the JIS X 0208
                                branch below defers it), so it has to be written now. */
                    328:     count = (prevstate != state ? 3 : 0) + 2;
                    329:     if (n < count)
                    330:       return RET_TOOSMALL;
                    331:     if (prevstate != state) {
                    332:       if (state != STATE_JISX0208) abort();
                    333:       r[0] = ESC;
                    334:       r[1] = '$';
                    335:       r[2] = 'B';
                    336:       r += 3;
                    337:     }
                    338:     r[0] = (lasttwo >> 8) & 0xff;
                    339:     r[1] = lasttwo & 0xff;
                    340:     r += 2;
                             /* Fall through to encode wc itself; count already includes the
                                bytes written for the flushed character. */
                    341:   }
                    342: 
                    343:   /* Try ASCII. */
                    344:   ret = ascii_wctomb(conv,buf,wc,1);
                    345:   if (ret != RET_ILUNI) {
                    346:     if (ret != 1) abort();
                    347:     if (buf[0] < 0x80) {
                               /* 1 byte, plus 3 for ESC ( B when the charset must change. */
                    348:       count += (state == STATE_ASCII ? 1 : 4);
                    349:       if (n < count)
                    350:         return RET_TOOSMALL;
                    351:       if (state != STATE_ASCII) {
                    352:         r[0] = ESC;
                    353:         r[1] = '(';
                    354:         r[2] = 'B';
                    355:         r += 3;
                    356:         state = STATE_ASCII;
                    357:       }
                    358:       r[0] = buf[0];
                    359:       COMBINE_STATE_NO_LASTTWO;
                    360:       conv->ostate = state;
                    361:       return count;
                    362:     }
                    363:   }
                    364: 
                    365:   /* Try JIS X 0201-1976 Roman. */
                    366:   ret = jisx0201_wctomb(conv,buf,wc,1);
                    367:   if (ret != RET_ILUNI) {
                    368:     if (ret != 1) abort();
                             /* Only results below 0x80 are accepted here; results >= 0x80 are
                                handled by the Katakana attempt at the end. */
                    369:     if (buf[0] < 0x80) {
                    370:       count += (state == STATE_JISX0201ROMAN ? 1 : 4);
                    371:       if (n < count)
                    372:         return RET_TOOSMALL;
                    373:       if (state != STATE_JISX0201ROMAN) {
                    374:         r[0] = ESC;
                    375:         r[1] = '(';
                    376:         r[2] = 'J';
                    377:         r += 3;
                    378:         state = STATE_JISX0201ROMAN;
                    379:       }
                    380:       r[0] = buf[0];
                    381:       COMBINE_STATE_NO_LASTTWO;
                    382:       conv->ostate = state;
                    383:       return count;
                    384:     }
                    385:   }
                    386: 
                           /* As used below: jch == 0 means no JIS X 0213 mapping, bit 0x8000
                              marks plane 2, bit 0x0080 marks a character that may be the base
                              of a combination, and (jch & 0x7f7f) is the two-byte code. */
                    387:   jch = ucs4_to_jisx0213(wc);
                    388: 
                    389:   /* Try JIS X 0208-1990 in place of JIS X 0208-1978 and JIS X 0208-1983. */
                    390:   ret = jisx0208_wctomb(conv,buf,wc,2);
                    391:   if (ret != RET_ILUNI) {
                    392:     if (ret != 2) abort();
                    393:     if (buf[0] < 0x80 && buf[1] < 0x80) {
                    394:       if (jch & 0x0080) {
                    395:         /* A possible match in comp_table_data. Buffer it. */
                                 /* No escape sequence is written yet: prevstate records the
                                    charset really in effect, and the switch to JIS X 0208 is
                                    emitted only if the character is later flushed as is. */
                    396:         prevstate = state;
                    397:         lasttwo = jch & 0x7f7f;
                    398:         state = STATE_JISX0208;
                    399:         COMBINE_STATE;
                    400:         conv->ostate = state;
                    401:         return count;
                    402:       } else {
                                 /* 2 bytes, plus 3 for ESC $ B when the charset must change. */
                    403:         count += (state == STATE_JISX0208 ? 2 : 5);
                    404:         if (n < count)
                    405:           return RET_TOOSMALL;
                    406:         if (state != STATE_JISX0208) {
                    407:           r[0] = ESC;
                    408:           r[1] = '$';
                    409:           r[2] = 'B';
                    410:           r += 3;
                    411:           state = STATE_JISX0208;
                    412:         }
                    413:         r[0] = buf[0];
                    414:         r[1] = buf[1];
                    415:         COMBINE_STATE_NO_LASTTWO;
                    416:         conv->ostate = state;
                    417:         return count;
                    418:       }
                    419:     }
                    420:   }
                    421: 
                    422:   /* Try JISX 0213 plane 1 and JISX 0213 plane 2. */
                    423:   if (jch != 0) {
                             /* First make sure the right plane is designated; the 4-byte
                                escape sequence is written immediately in this branch. */
                    424:     if (jch & 0x8000) {
                    425:       /* JISX 0213 plane 2. */
                    426:       if (state != STATE_JISX02132) {
                    427:         count += 4;
                    428:         if (n < count)
                    429:           return RET_TOOSMALL;
                    430:         r[0] = ESC;
                    431:         r[1] = '$';
                    432:         r[2] = '(';
                    433:         r[3] = 'P';
                    434:         r += 4;
                    435:         state = STATE_JISX02132;
                    436:       }
                    437:     } else {
                    438:       /* JISX 0213 plane 1. */
                    439:       if (state != STATE_JISX02131) {
                    440:         count += 4;
                    441:         if (n < count)
                    442:           return RET_TOOSMALL;
                    443:         r[0] = ESC;
                    444:         r[1] = '$';
                    445:         r[2] = '(';
                    446:         r[3] = 'Q';
                    447:         r += 4;
                    448:         state = STATE_JISX02131;
                    449:       }
                    450:     }
                    451:     if (jch & 0x0080) {
                    452:       /* A possible match in comp_table_data. We have to buffer it. */
                    453:       /* We know it's a JISX 0213 plane 1 character. */
                    454:       if (jch & 0x8000) abort();
                               /* The escape sequence is already out, so prevstate equals
                                  state and no escape is needed when the buffer is flushed. */
                    455:       prevstate = state;
                    456:       lasttwo = jch & 0x7f7f;
                    457:       COMBINE_STATE;
                    458:       conv->ostate = state;
                    459:       return count;
                    460:     }
                    461:     count += 2;
                    462:     if (n < count)
                    463:       return RET_TOOSMALL;
                             /* Strip the flag bits: only the two 7-bit code bytes are written. */
                    464:     r[0] = (jch >> 8) & 0x7f;
                    465:     r[1] = jch & 0x7f;
                    466:     COMBINE_STATE_NO_LASTTWO;
                    467:     conv->ostate = state;
                    468:     return count;
                    469:   }
                    470: 
                    471:   /* Try JIS X 0201-1976 Katakana. This is not officially part of
                    472:      ISO-2022-JP-3. Therefore we try it after all other attempts. */
                    473:   ret = jisx0201_wctomb(conv,buf,wc,1);
                    474:   if (ret != RET_ILUNI) {
                    475:     if (ret != 1) abort();
                    476:     if (buf[0] >= 0x80) {
                    477:       count += (state == STATE_JISX0201KATAKANA ? 1 : 4);
                    478:       if (n < count)
                    479:         return RET_TOOSMALL;
                    480:       if (state != STATE_JISX0201KATAKANA) {
                    481:         r[0] = ESC;
                    482:         r[1] = '(';
                    483:         r[2] = 'I';
                    484:         r += 3;
                    485:         state = STATE_JISX0201KATAKANA;
                    486:       }
                               /* Written as a 7-bit byte; the decoder adds 0x80 back. */
                    487:       r[0] = buf[0]-0x80;
                    488:       COMBINE_STATE_NO_LASTTWO;
                    489:       conv->ostate = state;
                    490:       return count;
                    491:     }
                    492:   }
                    493: 
                    494:   return RET_ILUNI;
                    495: }
                    496: 
                    497: static int
                    498: iso2022_jp3_reset (conv_t conv, unsigned char *r, int n)
                    499: {
                           /*
                            * Write the bytes needed to finish the output stream: first the
                            * character still buffered in conv->ostate (if any), then ESC ( B
                            * to return to ASCII unless ASCII is already selected.
                            *
                            * r, n  output buffer and its capacity in bytes.
                            * Returns the number of bytes written (possibly 0), or RET_TOOSMALL
                            * if n is insufficient, in which case nothing is written.
                            * conv->ostate is not modified here.
                            */
                    500:   state_t state = conv->ostate;
                    501:   SPLIT_STATE;
                    502:   {
                             /* Total size is computed up front so that the size check happens
                                before any byte is stored: buffered character (2 bytes, plus 3
                                for a deferred ESC $ B) and the final ESC ( B (3 bytes). */
                    503:     int count =
                    504:       (lasttwo ? (prevstate != state ? 3 : 0) + 2 : 0)
                    505:       + (state != STATE_ASCII ? 3 : 0);
                    506:     if (n < count)
                    507:       return RET_TOOSMALL;
                    508:     if (lasttwo) {
                               /* Same flush logic as the not_combining path of
                                  iso2022_jp3_wctomb: a deferred escape can only be the one
                                  for JIS X 0208. */
                    509:       if (prevstate != state) {
                    510:         if (state != STATE_JISX0208) abort();
                    511:         r[0] = ESC;
                    512:         r[1] = '$';
                    513:         r[2] = 'B';
                    514:         r += 3;
                    515:       }
                    516:       r[0] = (lasttwo >> 8) & 0xff;
                    517:       r[1] = lasttwo & 0xff;
                    518:       r += 2;
                    519:     }
                    520:     if (state != STATE_ASCII) {
                    521:       r[0] = ESC;
                    522:       r[1] = '(';
                    523:       r[2] = 'B';
                    524:     }
                    525:     /* conv->ostate = 0; will be done by the caller */
                    526:     return count;
                    527:   }
                    528: }
                    529: 
                    530: #undef COMBINE_STATE_NO_LASTTWO  /* drop this converter's private state-packing macros ... */
                    531: #undef COMBINE_STATE
                    532: #undef SPLIT_STATE
                    533: #undef STATE_JISX02132  /* ... and its charset state ids (ESC is not undefined in the lines shown here) */
                    534: #undef STATE_JISX02131
                    535: #undef STATE_JISX0208
                    536: #undef STATE_JISX0201KATAKANA
                    537: #undef STATE_JISX0201ROMAN
                    538: #undef STATE_ASCII

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>