Annotation of embedaddon/libiconv/lib/iso2022_jp3.h, revision 1.1.1.2

1.1       misho       1: /*
1.1.1.2 ! misho       2:  * Copyright (C) 1999-2004, 2008, 2016 Free Software Foundation, Inc.
1.1       misho       3:  * This file is part of the GNU LIBICONV Library.
                      4:  *
                      5:  * The GNU LIBICONV Library is free software; you can redistribute it
                      6:  * and/or modify it under the terms of the GNU Library General Public
                      7:  * License as published by the Free Software Foundation; either version 2
                      8:  * of the License, or (at your option) any later version.
                      9:  *
                     10:  * The GNU LIBICONV Library is distributed in the hope that it will be
                     11:  * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
                     12:  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     13:  * Library General Public License for more details.
                     14:  *
                     15:  * You should have received a copy of the GNU Library General Public
                     16:  * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
1.1.1.2 ! misho      17:  * If not, see <https://www.gnu.org/licenses/>.
1.1       misho      18:  */
                     19: 
                     20: /*
                     21:  * ISO-2022-JP-3
                     22:  */
                     23: 
                     24: #include "jisx0213.h"
                     25: 
                     26: #define ESC 0x1b  /* first byte of every designation sequence handled below */
                     27: 
                     28: /*
                     29:  * The low 3 bits of the conversion state hold one of the following values; each trailing comment shows the escape sequence(s) that select it.
                     30:  */
                     31: #define STATE_ASCII             0  /* Esc ( B */
                     32: #define STATE_JISX0201ROMAN     1  /* Esc ( J */
                     33: #define STATE_JISX0201KATAKANA  2  /* Esc ( I */
                     34: #define STATE_JISX0208          3  /* Esc $ @ or Esc $ B */
                     35: #define STATE_JISX02131         4  /* Esc $ ( O or Esc $ ( Q */
                     36: #define STATE_JISX02132         5  /* Esc $ ( P */
                     37: 
                     38: /*
                     39:  * In the ISO-2022-JP-3 to UCS-4 direction, the state also holds the last
                     40:  * character to be output, shifted by 3 bits.
                     41:  */
                     42: 
                         /*
                          * Decode one character from the ISO-2022-JP-3 byte sequence s (n bytes
                          * available) and store it in *pwc.
                          *
                          * conv->istate keeps the active STATE_xxxxx value in its low 3 bits; the
                          * bits above them hold a character buffered by a previous call, if any.
                          *
                          * Return value:
                          *   0                       a buffered character was stored in *pwc and no
                          *                           input was consumed;
                          *   count+1 or count+2      number of input bytes consumed, where count is
                          *                           the total length of the escape sequences that
                          *                           preceded the character;
                          *   RET_TOOFEW(count)       the input ends inside an escape sequence or
                          *                           before a complete character;
                          *   RET_SHIFT_ILSEQ(count)  an unknown escape sequence or an invalid byte
                          *                           was found.
                          * RET_TOOFEW and RET_SHIFT_ILSEQ are macros defined outside this file.
                          * On both of these paths conv->istate is still updated with the state
                          * selected by the escape sequences processed so far.
                          *
                          * NOTE(review): the loop reads s[0] before n is tested, so n >= 1 is
                          * assumed on entry -- confirm against the caller.
                          */
                     43: static int
1.1.1.2 ! misho      44: iso2022_jp3_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, size_t n)
1.1       misho      45: {
                     46:   ucs4_t last_wc = conv->istate >> 3;
                     47:   if (last_wc) {
                     48:     /* Output the buffered character. */
                     49:     conv->istate &= 7;
                     50:     *pwc = last_wc;
                     51:     return 0; /* Don't advance the input pointer. */
                     52:   } else {
                             /* Nothing is buffered, so istate is a plain STATE_xxxxx value. */
                     53:     state_t state = conv->istate;
                     54:     int count = 0;
                     55:     unsigned char c;
                             /* Skip any run of escape sequences, tracking the character set they
                                designate in the local state; count is the number of bytes skipped. */
                     56:     for (;;) {
                     57:       c = *s;
                     58:       if (c == ESC) {
                                 /* Every sequence recognized here is at least 3 bytes long. */
                     59:         if (n < count+3)
                     60:           goto none;
                     61:         if (s[1] == '(') {
                     62:           if (s[2] == 'B') {
                     63:             state = STATE_ASCII;
                     64:             s += 3; count += 3;
                                     /* The next iteration reads *s, so one more byte is needed. */
                     65:             if (n < count+1)
                     66:               goto none;
                     67:             continue;
                     68:           }
                     69:           if (s[2] == 'J') {
                     70:             state = STATE_JISX0201ROMAN;
                     71:             s += 3; count += 3;
                     72:             if (n < count+1)
                     73:               goto none;
                     74:             continue;
                     75:           }
                     76:           if (s[2] == 'I') {
                     77:             state = STATE_JISX0201KATAKANA;
                     78:             s += 3; count += 3;
                     79:             if (n < count+1)
                     80:               goto none;
                     81:             continue;
                     82:           }
                     83:           goto ilseq;
                     84:         }
                     85:         if (s[1] == '$') {
                     86:           if (s[2] == '@' || s[2] == 'B') {
                     87:             /* We don't distinguish JIS X 0208-1978 and JIS X 0208-1983. */
                     88:             state = STATE_JISX0208;
                     89:             s += 3; count += 3;
                     90:             if (n < count+1)
                     91:               goto none;
                     92:             continue;
                     93:           }
                     94:           if (s[2] == '(') {
                                     /* Esc $ ( is followed by a fourth, final byte. */
                     95:             if (n < count+4)
                     96:               goto none;
                     97:             if (s[3] == 'O' || s[3] == 'Q') {
                     98:               state = STATE_JISX02131;
                     99:               s += 4; count += 4;
                    100:               if (n < count+1)
                    101:                 goto none;
                    102:               continue;
                    103:             }
                    104:             if (s[3] == 'P') {
                    105:               state = STATE_JISX02132;
                    106:               s += 4; count += 4;
                    107:               if (n < count+1)
                    108:                 goto none;
                    109:               continue;
                    110:             }
                    111:           }
                    112:           goto ilseq;
                    113:         }
                    114:         goto ilseq;
                    115:       }
                    116:       break;
                    117:     }
                             /* c is the first byte that is not ESC; decode the character that
                                starts there according to the designated character set. */
                    118:     switch (state) {
                    119:       case STATE_ASCII:
                    120:         if (c < 0x80) {
                    121:           int ret = ascii_mbtowc(conv,pwc,s,1);
                    122:           if (ret == RET_ILSEQ)
                    123:             goto ilseq;
                                   /* Given a single byte, any other result is treated as a bug. */
                    124:           if (ret != 1) abort();
                    125:           conv->istate = state;
                    126:           return count+1;
                    127:         } else
                    128:           goto ilseq;
                    129:       case STATE_JISX0201ROMAN:
                    130:         if (c < 0x80) {
                    131:           int ret = jisx0201_mbtowc(conv,pwc,s,1);
                    132:           if (ret == RET_ILSEQ)
                    133:             goto ilseq;
                    134:           if (ret != 1) abort();
                    135:           conv->istate = state;
                    136:           return count+1;
                    137:         } else
                    138:           goto ilseq;
                    139:       case STATE_JISX0201KATAKANA:
                    140:         if (c < 0x80) {
                                   /* The 7-bit input byte is offset by 0x80 before the lookup
                                      (presumably jisx0201_mbtowc expects katakana in the upper
                                      half -- confirm in jisx0201.h). */
                    141:           unsigned char buf = c+0x80;
                    142:           int ret = jisx0201_mbtowc(conv,pwc,&buf,1);
                    143:           if (ret == RET_ILSEQ)
                    144:             goto ilseq;
                    145:           if (ret != 1) abort();
                    146:           conv->istate = state;
                    147:           return count+1;
                    148:         } else
                    149:           goto ilseq;
                    150:       case STATE_JISX0208:
                    151:         if (n < count+2)
                    152:           goto none;
                    153:         if (s[0] < 0x80 && s[1] < 0x80) {
                    154:           int ret = jisx0208_mbtowc(conv,pwc,s,2);
                    155:           if (ret == RET_ILSEQ)
                    156:             goto ilseq;
                    157:           if (ret != 2) abort();
                    158:           conv->istate = state;
                    159:           return count+2;
                    160:         } else
                    161:           goto ilseq;
                    162:       case STATE_JISX02131:
                    163:       case STATE_JISX02132:
                    164:         if (n < count+2)
                    165:           goto none;
                    166:         if (s[0] < 0x80 && s[1] < 0x80) {
                                   /* First argument: 0x100 for plane 1 or 0x200 for plane 2, plus
                                      the first byte.  A result of 0 falls through to ilseq. */
                    167:           ucs4_t wc = jisx0213_to_ucs4(((state-STATE_JISX02131+1)<<8)+s[0],s[1]);
                    168:           if (wc) {
                                     /* Results below 0x80 are used as 1-based indices into
                                        jisx0213_to_ucs_combining, not as code points. */
                    169:             if (wc < 0x80) {
                    170:               /* It's a combining character. */
                    171:               ucs4_t wc1 = jisx0213_to_ucs_combining[wc - 1][0];
                    172:               ucs4_t wc2 = jisx0213_to_ucs_combining[wc - 1][1];
                    173:               /* We cannot output two Unicode characters at once. So,
                    174:                  output the first character and buffer the second one. */
                    175:               *pwc = wc1;
                                       /* wc2 sits above the 3 state bits; the next call (or
                                          iso2022_jp3_flushwc) returns it. */
                    176:               conv->istate = (wc2 << 3) | state;
                    177:             } else {
                    178:               *pwc = wc;
                    179:               conv->istate = state;
                    180:             }
                    181:             return count+2;
                    182:           }
                    183:         }
                    184:         goto ilseq;
                    185:       default: abort();
                    186:     }
                           /* Both exits below record the designation reached so far even though
                              no character was produced. */
                    187:   none:
                    188:     conv->istate = state;
                    189:     return RET_TOOFEW(count);
                    190: 
                    191:   ilseq:
                    192:     conv->istate = state;
                    193:     return RET_SHIFT_ILSEQ(count);
                    194:   }
                    195: }
                    196: 
                         /*
                          * Emit the character still buffered in conv->istate, if any.
                          *
                          * When the bits above the low 3 state bits are nonzero, they are stored
                          * in *pwc, cleared from conv->istate (the STATE_xxxxx value is kept) and
                          * 1 is returned.  Otherwise *pwc is left untouched and 0 is returned.
                          */
                    197: static int
                    198: iso2022_jp3_flushwc (conv_t conv, ucs4_t *pwc)
                    199: {
                    200:   ucs4_t last_wc = conv->istate >> 3;
                    201:   if (last_wc) {
                    202:     /* Output the buffered character. */
                    203:     conv->istate &= 7;
                    204:     *pwc = last_wc;
                    205:     return 1;
                    206:   } else
                    207:     return 0;
                    208: }
                    209: 
                    210: /*
                    211:  * In the UCS-4 to ISO-2022-JP-3 direction, the state also holds the last two
                    212:  * bytes to be output, shifted by 3 bits, and the STATE_xxxxx value that was
                    213:  * effective before this buffered character, shifted by 19 bits.
                    214:  */
                    215: 
                    216: /* Composition tables for each of the relevant combining characters.  */
                         /*
                          * Each entry pairs the two-byte code of a base character (in the form in
                          * which iso2022_jp3_wctomb buffers it) with the two-byte JIS X 0213
                          * plane 1 code of the precomposed character that results from appending
                          * the combining character of the group.
                          *
                          * Entries are grouped by combining character.  For the group of U+XXXX,
                          * iso2022_jp3_comp_tableXXXX_idx is the index of its first entry and
                          * iso2022_jp3_comp_tableXXXX_len is its number of entries; each _idx is
                          * derived from the previous group, so groups must stay contiguous and
                          * in this order.
                          */
                    217: static const struct { unsigned short base; unsigned short composed; } iso2022_jp3_comp_table_data[] = {
                    218: #define iso2022_jp3_comp_table02e5_idx 0
                    219: #define iso2022_jp3_comp_table02e5_len 1
                    220:   { 0x2b64, 0x2b65 }, /* 0x12B65 = 0x12B64 U+02E5 */
                    221: #define iso2022_jp3_comp_table02e9_idx (iso2022_jp3_comp_table02e5_idx+iso2022_jp3_comp_table02e5_len)
                    222: #define iso2022_jp3_comp_table02e9_len 1
                    223:   { 0x2b60, 0x2b66 }, /* 0x12B66 = 0x12B60 U+02E9 */
                    224: #define iso2022_jp3_comp_table0300_idx (iso2022_jp3_comp_table02e9_idx+iso2022_jp3_comp_table02e9_len)
                    225: #define iso2022_jp3_comp_table0300_len 5
                    226:   { 0x295c, 0x2b44 }, /* 0x12B44 = 0x1295C U+0300 */
                    227:   { 0x2b38, 0x2b48 }, /* 0x12B48 = 0x12B38 U+0300 */
                    228:   { 0x2b37, 0x2b4a }, /* 0x12B4A = 0x12B37 U+0300 */
                    229:   { 0x2b30, 0x2b4c }, /* 0x12B4C = 0x12B30 U+0300 */
                    230:   { 0x2b43, 0x2b4e }, /* 0x12B4E = 0x12B43 U+0300 */
                    231: #define iso2022_jp3_comp_table0301_idx (iso2022_jp3_comp_table0300_idx+iso2022_jp3_comp_table0300_len)
                    232: #define iso2022_jp3_comp_table0301_len 4
                    233:   { 0x2b38, 0x2b49 }, /* 0x12B49 = 0x12B38 U+0301 */
                    234:   { 0x2b37, 0x2b4b }, /* 0x12B4B = 0x12B37 U+0301 */
                    235:   { 0x2b30, 0x2b4d }, /* 0x12B4D = 0x12B30 U+0301 */
                    236:   { 0x2b43, 0x2b4f }, /* 0x12B4F = 0x12B43 U+0301 */
                    237: #define iso2022_jp3_comp_table309a_idx (iso2022_jp3_comp_table0301_idx+iso2022_jp3_comp_table0301_len)
                    238: #define iso2022_jp3_comp_table309a_len 14
                    239:   { 0x242b, 0x2477 }, /* 0x12477 = 0x1242B U+309A */
                    240:   { 0x242d, 0x2478 }, /* 0x12478 = 0x1242D U+309A */
                    241:   { 0x242f, 0x2479 }, /* 0x12479 = 0x1242F U+309A */
                    242:   { 0x2431, 0x247a }, /* 0x1247A = 0x12431 U+309A */
                    243:   { 0x2433, 0x247b }, /* 0x1247B = 0x12433 U+309A */
                    244:   { 0x252b, 0x2577 }, /* 0x12577 = 0x1252B U+309A */
                    245:   { 0x252d, 0x2578 }, /* 0x12578 = 0x1252D U+309A */
                    246:   { 0x252f, 0x2579 }, /* 0x12579 = 0x1252F U+309A */
                    247:   { 0x2531, 0x257a }, /* 0x1257A = 0x12531 U+309A */
                    248:   { 0x2533, 0x257b }, /* 0x1257B = 0x12533 U+309A */
                    249:   { 0x253b, 0x257c }, /* 0x1257C = 0x1253B U+309A */
                    250:   { 0x2544, 0x257d }, /* 0x1257D = 0x12544 U+309A */
                    251:   { 0x2548, 0x257e }, /* 0x1257E = 0x12548 U+309A */
                    252:   { 0x2675, 0x2678 }, /* 0x12678 = 0x12675 U+309A */
                    253: };
                    254: 
                         /*
                          * Helpers for the packed output state.  All three expect a variable
                          * named state to be in scope.
                          *
                          * SPLIT_STATE declares lasttwo (state >> 3, reduced to unsigned short)
                          * and prevstate (state >> 19), then masks state down to its low 3 bits.
                          * Because it expands to declarations followed by a statement, it is
                          * used right after the declaration of state.
                          */
                    255: #define SPLIT_STATE \
                    256:   unsigned short lasttwo = state >> 3; state_t prevstate = state >> 19; state &= 7
                         /* Inverse of SPLIT_STATE: ORs prevstate and lasttwo back into state,
                            which is expected to hold only a STATE_xxxxx value at that point. */
                    257: #define COMBINE_STATE \
                    258:   state |= (prevstate << 19) | (lasttwo << 3)
                         /* Expands to nothing: it marks the places where state is stored without
                            a buffered character. */
                    259: #define COMBINE_STATE_NO_LASTTWO \
                    260:   /* assume lasttwo == 0, then prevstate is ignored */
                    261: 
                         /*
                          * Encode the Unicode character wc as ISO-2022-JP-3 into r (n bytes
                          * available), emitting a designation escape sequence first whenever the
                          * character set changes.
                          *
                          * A character whose jch value has bit 0x0080 set may be the base of a
                          * composition.  It is not written at once but buffered in conv->ostate;
                          * the next call either replaces it by the precomposed character (if wc is
                          * a matching combining character) or writes it unchanged before handling
                          * wc.
                          *
                          * Return value: the number of bytes written to r (this can be 0 when wc
                          * was only buffered), RET_TOOSMALL when n is too small, or RET_ILUNI when
                          * wc cannot be encoded.  conv->ostate is assigned only on the paths that
                          * return a byte count; on the two error returns it is left unchanged,
                          * although part of r may already have been written.
                          */
                    262: static int
1.1.1.2 ! misho     263: iso2022_jp3_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)
1.1       misho     264: {
                    265:   int count = 0;
                    266:   unsigned char buf[2];
                    267:   unsigned short jch;
                    268:   int ret;
                    269:   state_t state = conv->ostate;
                    270:   SPLIT_STATE;
                           /* From here on state is a plain STATE_xxxxx value; a nonzero lasttwo
                              is the two-byte code of the buffered character. */
                    271: 
                    272:   if (lasttwo) {
                    273:     /* Attempt to combine the last character with this one. */
                    274:     unsigned int idx;
                    275:     unsigned int len;
                    276: 
                             /* Select the table slice that belongs to wc, if wc is one of the
                                five combining characters. */
                    277:     if (wc == 0x02e5)
                    278:       idx = iso2022_jp3_comp_table02e5_idx,
                    279:       len = iso2022_jp3_comp_table02e5_len;
                    280:     else if (wc == 0x02e9)
                    281:       idx = iso2022_jp3_comp_table02e9_idx,
                    282:       len = iso2022_jp3_comp_table02e9_len;
                    283:     else if (wc == 0x0300)
                    284:       idx = iso2022_jp3_comp_table0300_idx,
                    285:       len = iso2022_jp3_comp_table0300_len;
                    286:     else if (wc == 0x0301)
                    287:       idx = iso2022_jp3_comp_table0301_idx,
                    288:       len = iso2022_jp3_comp_table0301_len;
                    289:     else if (wc == 0x309a)
                    290:       idx = iso2022_jp3_comp_table309a_idx,
                    291:       len = iso2022_jp3_comp_table309a_len;
                    292:     else
                    293:       goto not_combining;
                    294: 
                             /* Linear search of the slice.  len is still > 0 afterwards only if
                                the loop was left through the break, i.e. a base matched. */
                    295:     do
                    296:       if (iso2022_jp3_comp_table_data[idx].base == lasttwo)
                    297:         break;
                    298:     while (++idx, --len > 0);
                    299: 
                    300:     if (len > 0) {
                    301:       /* Output the combined character. */
                    302:       /* We know the combined character is in JISX0213 plane 1, but
                    303:          the buffered character may have been in JISX0208 or in
                    304:          JISX0213 plane 1. */
                               /* 4 bytes for Esc $ ( Q when needed, plus the 2-byte character. */
                    305:       count = (state != STATE_JISX02131 ? 4 : 0) + 2;
                    306:       if (n < count)
                    307:         return RET_TOOSMALL;
                    308:       if (state != STATE_JISX02131) {
                    309:         r[0] = ESC;
                    310:         r[1] = '$';
                    311:         r[2] = '(';
                    312:         r[3] = 'Q';
                    313:         r += 4;
                    314:         state = STATE_JISX02131;
                    315:       }
                    316:       lasttwo = iso2022_jp3_comp_table_data[idx].composed;
                    317:       r[0] = (lasttwo >> 8) & 0xff;
                    318:       r[1] = lasttwo & 0xff;
                               /* The buffer is consumed: only the STATE_xxxxx value is stored. */
                    319:       COMBINE_STATE_NO_LASTTWO;
                    320:       conv->ostate = state;
                    321:       return count;
                    322:     }
                    323: 
                    324:   not_combining:
                    325:     /* Output the buffered character. */
                    326:     /* We know it is in JISX0208 or in JISX0213 plane 1. */
                             /* prevstate != state means the character was buffered by the
                                JIS X 0208 branch below, which postpones its Esc $ B; that escape
                                is written now, ahead of the character. */
                    327:     count = (prevstate != state ? 3 : 0) + 2;
                    328:     if (n < count)
                    329:       return RET_TOOSMALL;
                    330:     if (prevstate != state) {
                    331:       if (state != STATE_JISX0208) abort();
                    332:       r[0] = ESC;
                    333:       r[1] = '$';
                    334:       r[2] = 'B';
                    335:       r += 3;
                    336:     }
                    337:     r[0] = (lasttwo >> 8) & 0xff;
                    338:     r[1] = lasttwo & 0xff;
                    339:     r += 2;
                             /* count and r now account for the flushed bytes; the code below
                                adds the encoding of wc on top of them. */
                    340:   }
                    341: 
                    342:   /* Try ASCII. */
                    343:   ret = ascii_wctomb(conv,buf,wc,1);
                    344:   if (ret != RET_ILUNI) {
                    345:     if (ret != 1) abort();
                    346:     if (buf[0] < 0x80) {
                               /* 1 byte, or 3 bytes of Esc ( B plus 1 byte. */
                    347:       count += (state == STATE_ASCII ? 1 : 4);
                    348:       if (n < count)
                    349:         return RET_TOOSMALL;
                    350:       if (state != STATE_ASCII) {
                    351:         r[0] = ESC;
                    352:         r[1] = '(';
                    353:         r[2] = 'B';
                    354:         r += 3;
                    355:         state = STATE_ASCII;
                    356:       }
                    357:       r[0] = buf[0];
                    358:       COMBINE_STATE_NO_LASTTWO;
                    359:       conv->ostate = state;
                    360:       return count;
                    361:     }
                    362:   }
                    363: 
                    364:   /* Try JIS X 0201-1976 Roman. */
                    365:   ret = jisx0201_wctomb(conv,buf,wc,1);
                    366:   if (ret != RET_ILUNI) {
                    367:     if (ret != 1) abort();
                             /* Only the 7-bit results are accepted here; results >= 0x80 are
                                handled by the katakana attempt at the end. */
                    368:     if (buf[0] < 0x80) {
                    369:       count += (state == STATE_JISX0201ROMAN ? 1 : 4);
                    370:       if (n < count)
                    371:         return RET_TOOSMALL;
                    372:       if (state != STATE_JISX0201ROMAN) {
                    373:         r[0] = ESC;
                    374:         r[1] = '(';
                    375:         r[2] = 'J';
                    376:         r += 3;
                    377:         state = STATE_JISX0201ROMAN;
                    378:       }
                    379:       r[0] = buf[0];
                    380:       COMBINE_STATE_NO_LASTTWO;
                    381:       conv->ostate = state;
                    382:       return count;
                    383:     }
                    384:   }
                    385: 
                           /* Looked up before the JIS X 0208 attempt because that attempt tests
                              bit 0x0080 of jch to decide whether the character must be buffered. */
                    386:   jch = ucs4_to_jisx0213(wc);
                    387: 
                    388:   /* Try JIS X 0208-1990 in place of JIS X 0208-1978 and JIS X 0208-1983. */
                    389:   ret = jisx0208_wctomb(conv,buf,wc,2);
                    390:   if (ret != RET_ILUNI) {
                    391:     if (ret != 2) abort();
                    392:     if (buf[0] < 0x80 && buf[1] < 0x80) {
                    393:       if (jch & 0x0080) {
                    394:         /* A possible match in comp_table_data. Buffer it. */
                                 /* No escape sequence is written yet.  The old state is kept in
                                    prevstate so that the code flushing the buffer can tell
                                    whether Esc $ B is still owed. */
                    395:         prevstate = state;
                    396:         lasttwo = jch & 0x7f7f;
                    397:         state = STATE_JISX0208;
                    398:         COMBINE_STATE;
                    399:         conv->ostate = state;
                    400:         return count;
                    401:       } else {
                                 /* 2 bytes, or 3 bytes of Esc $ B plus 2 bytes. */
                    402:         count += (state == STATE_JISX0208 ? 2 : 5);
                    403:         if (n < count)
                    404:           return RET_TOOSMALL;
                    405:         if (state != STATE_JISX0208) {
                    406:           r[0] = ESC;
                    407:           r[1] = '$';
                    408:           r[2] = 'B';
                    409:           r += 3;
                    410:           state = STATE_JISX0208;
                    411:         }
                    412:         r[0] = buf[0];
                    413:         r[1] = buf[1];
                    414:         COMBINE_STATE_NO_LASTTWO;
                    415:         conv->ostate = state;
                    416:         return count;
                    417:       }
                    418:     }
                    419:   }
                    420: 
                    421:   /* Try JISX 0213 plane 1 and JISX 0213 plane 2. */
                    422:   if (jch != 0) {
                    423:     if (jch & 0x8000) {
                    424:       /* JISX 0213 plane 2. */
                    425:       if (state != STATE_JISX02132) {
                    426:         count += 4;
                    427:         if (n < count)
                    428:           return RET_TOOSMALL;
                    429:         r[0] = ESC;
                    430:         r[1] = '$';
                    431:         r[2] = '(';
                    432:         r[3] = 'P';
                    433:         r += 4;
                    434:         state = STATE_JISX02132;
                    435:       }
                    436:     } else {
                    437:       /* JISX 0213 plane 1. */
                    438:       if (state != STATE_JISX02131) {
                    439:         count += 4;
                    440:         if (n < count)
                    441:           return RET_TOOSMALL;
                    442:         r[0] = ESC;
                    443:         r[1] = '$';
                    444:         r[2] = '(';
                    445:         r[3] = 'Q';
                    446:         r += 4;
                    447:         state = STATE_JISX02131;
                    448:       }
                    449:     }
                    450:     if (jch & 0x0080) {
                    451:       /* A possible match in comp_table_data. We have to buffer it. */
                    452:       /* We know it's a JISX 0213 plane 1 character. */
                    453:       if (jch & 0x8000) abort();
                               /* Unlike the JIS X 0208 case, the escape sequence (if one was
                                  needed) has already been written above, so prevstate is set
                                  equal to state and nothing is owed at flush time. */
                    454:       prevstate = state;
                    455:       lasttwo = jch & 0x7f7f;
                    456:       COMBINE_STATE;
                    457:       conv->ostate = state;
                    458:       return count;
                    459:     }
                    460:     count += 2;
                    461:     if (n < count)
                    462:       return RET_TOOSMALL;
                             /* The masks drop the flag bits 0x8000 and 0x0080 from jch. */
                    463:     r[0] = (jch >> 8) & 0x7f;
                    464:     r[1] = jch & 0x7f;
                    465:     COMBINE_STATE_NO_LASTTWO;
                    466:     conv->ostate = state;
                    467:     return count;
                    468:   }
                    469: 
                    470:   /* Try JIS X 0201-1976 Katakana. This is not officially part of
                    471:      ISO-2022-JP-3. Therefore we try it after all other attempts. */
                    472:   ret = jisx0201_wctomb(conv,buf,wc,1);
                    473:   if (ret != RET_ILUNI) {
                    474:     if (ret != 1) abort();
                    475:     if (buf[0] >= 0x80) {
                    476:       count += (state == STATE_JISX0201KATAKANA ? 1 : 4);
                    477:       if (n < count)
                    478:         return RET_TOOSMALL;
                    479:       if (state != STATE_JISX0201KATAKANA) {
                    480:         r[0] = ESC;
                    481:         r[1] = '(';
                    482:         r[2] = 'I';
                    483:         r += 3;
                    484:         state = STATE_JISX0201KATAKANA;
                    485:       }
                               /* Written in 7-bit form, mirroring the +0x80 applied by the
                                  decoder in STATE_JISX0201KATAKANA. */
                    486:       r[0] = buf[0]-0x80;
                    487:       COMBINE_STATE_NO_LASTTWO;
                    488:       conv->ostate = state;
                    489:       return count;
                    490:     }
                    491:   }
                    492: 
                    493:   return RET_ILUNI;
                    494: }
                    495: 
                         /*
                          * Write to r (n bytes available) the bytes that terminate the output:
                          * first the buffered character, if any, preceded by Esc $ B when that
                          * escape sequence was postponed at buffering time; then Esc ( B unless
                          * the state is already STATE_ASCII.
                          *
                          * Returns the number of bytes written, or RET_TOOSMALL (with nothing
                          * written) when n is smaller than that.  conv->ostate is not modified
                          * here.
                          */
                    496: static int
1.1.1.2 ! misho     497: iso2022_jp3_reset (conv_t conv, unsigned char *r, size_t n)
1.1       misho     498: {
                    499:   state_t state = conv->ostate;
                    500:   SPLIT_STATE;
                    501:   {
                             /* Total size is computed up front so that nothing is written when
                                the buffer is too small. */
                    502:     int count =
                    503:       (lasttwo ? (prevstate != state ? 3 : 0) + 2 : 0)
                    504:       + (state != STATE_ASCII ? 3 : 0);
                    505:     if (n < count)
                    506:       return RET_TOOSMALL;
                    507:     if (lasttwo) {
                               /* Same flush as the not_combining path of iso2022_jp3_wctomb. */
                    508:       if (prevstate != state) {
                    509:         if (state != STATE_JISX0208) abort();
                    510:         r[0] = ESC;
                    511:         r[1] = '$';
                    512:         r[2] = 'B';
                    513:         r += 3;
                    514:       }
                    515:       r[0] = (lasttwo >> 8) & 0xff;
                    516:       r[1] = lasttwo & 0xff;
                    517:       r += 2;
                    518:     }
                    519:     if (state != STATE_ASCII) {
                    520:       r[0] = ESC;
                    521:       r[1] = '(';
                    522:       r[2] = 'B';
                    523:     }
                    524:     /* conv->ostate = 0; will be done by the caller */
                    525:     return count;
                    526:   }
                    527: }
                    528: 
                         /*
                          * Remove the helper macros defined above (presumably so that they do not
                          * leak into other code compiled in the same translation unit -- confirm
                          * how this header is included).
                          * NOTE(review): ESC and the iso2022_jp3_comp_table*_idx / _len macros are
                          * not undefined here.
                          */
                    529: #undef COMBINE_STATE_NO_LASTTWO
                    530: #undef COMBINE_STATE
                    531: #undef SPLIT_STATE
                    532: #undef STATE_JISX02132
                    533: #undef STATE_JISX02131
                    534: #undef STATE_JISX0208
                    535: #undef STATE_JISX0201KATAKANA
                    536: #undef STATE_JISX0201ROMAN
                    537: #undef STATE_ASCII

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>