Annotation of embedaddon/libiconv/lib/johab_hangul.h, revision 1.1

1.1     ! misho       1: /*
        !             2:  * Copyright (C) 1999-2001 Free Software Foundation, Inc.
        !             3:  * This file is part of the GNU LIBICONV Library.
        !             4:  *
        !             5:  * The GNU LIBICONV Library is free software; you can redistribute it
        !             6:  * and/or modify it under the terms of the GNU Library General Public
        !             7:  * License as published by the Free Software Foundation; either version 2
        !             8:  * of the License, or (at your option) any later version.
        !             9:  *
        !            10:  * The GNU LIBICONV Library is distributed in the hope that it will be
        !            11:  * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            12:  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            13:  * Library General Public License for more details.
        !            14:  *
        !            15:  * You should have received a copy of the GNU Library General Public
        !            16:  * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
        !            17:  * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,
        !            18:  * Fifth Floor, Boston, MA 02110-1301, USA.
        !            19:  */
        !            20: 
        !            21: /*
        !            22:  * JOHAB Hangul
        !            23:  *
        !            24:  * Ken Lunde writes in his "CJKV Information Processing" book, p. 114:
        !            25:  * "Hangul can be composed of two or three jamo (some jamo are considered
        !            26:  *  compound). Johab uses 19 initial jamo (consonants), 21 medial jamo (vowels)
        !            27:  *  and 27 final jamo (consonants; 28 when you include the "fill" character
        !            28:  *  for Hangul containing only two jamo). Multiplying these numbers results in
        !            29:  *  11172."
        !            30:  *
        !            31:  * Structure of the Johab encoding (see p. 181-184):
        !            32:  *   bit 15 = 1
        !            33:  *   bit 14..10 = initial jamo, only 19+1 out of 32 possible values are used
        !            34:  *   bit 9..5 = medial jamo, only 21+1 out of 32 possible values are used
        !            35:  *   bit 4..0 = final jamo, only 27+1 out of 32 possible values are used
        !            36:  * 
        !            37:  * Structure of the Unicode encoding:
        !            38:  * grep '^0x\([8-C]...\|D[0-7]..\)' unicode.org-mappings/EASTASIA/KSC/JOHAB.TXT
        !            39:  * You see that all characters there are marked "HANGUL LETTER" or "HANGUL
        !            40:  * SYLLABLE". If you eliminate the "HANGUL LETTER"s, the table is sorted
        !            41:  * in ascending order according to Johab encoding and according to the Unicode
        !            42:  * encoding. Now look a little more carefully, and you see that the following
        !            43:  * formula holds:
        !            44:  *     unicode == 0xAC00
        !            45:  *                + 21 * 28 * (jamo_initial_index[(johab >> 10) & 31] - 1)
        !            46:  *                + 28 * (jamo_medial_index[(johab >> 5) & 31] - 1)
        !            47:  *                + jamo_final_index[johab & 31]
        !            48:  * where the index tables are defined as below.
        !            49:  */
        !            50: 
        !            51: /* Tables mapping 5-bit groups to jamo letters. */
        !            52: /* Note that Jamo XX = UHC 0xA4A0+XX = Unicode 0x3130+XX */
        !            53: #define NONE 0xfd
        !            54: #define FILL 0xff
        !            55: static const unsigned char jamo_initial[32] = {
        !            56:   NONE, FILL, 0x01, 0x02, 0x04, 0x07, 0x08, 0x09,
        !            57:   0x11, 0x12, 0x13, 0x15, 0x16, 0x17, 0x18, 0x19,
        !            58:   0x1a, 0x1b, 0x1c, 0x1d, 0x1e, NONE, NONE, NONE,
        !            59:   NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
        !            60: };
        !            61: static const unsigned char jamo_medial[32] = {
        !            62:   NONE, NONE, FILL, 0x1f, 0x20, 0x21, 0x22, 0x23,
        !            63:   NONE, NONE, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29,
        !            64:   NONE, NONE, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
        !            65:   NONE, NONE, 0x30, 0x31, 0x32, 0x33, NONE, NONE,
        !            66: };
        !            67: static const unsigned char jamo_final[32] = {
        !            68:   NONE, FILL, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
        !            69:   0x07, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
        !            70:   0x10, 0x11, NONE, 0x12, 0x14, 0x15, 0x16, 0x17,
        !            71:   0x18, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, NONE, NONE,
        !            72: };
        !            73: /* Same as jamo_final, except that it excludes characters already
        !            74:    contained in jamo_initial. 11 characters instead of 27. */
        !            75: static const unsigned char jamo_final_notinitial[32] = {
        !            76:   NONE, NONE, NONE, NONE, 0x03, NONE, 0x05, 0x06,
        !            77:   NONE, NONE, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
        !            78:   0x10, NONE, NONE, NONE, 0x14, NONE, NONE, NONE,
        !            79:   NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,
        !            80: };
        !            81: 
        !            82: /* Tables mapping 5-bit groups to packed indices. */
        !            83: #define none -1
        !            84: #define fill 0
        !            85: static const signed char jamo_initial_index[32] = {
        !            86:   none, fill, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
        !            87:   0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
        !            88:   0x0f, 0x10, 0x11, 0x12, 0x13, none, none, none,
        !            89:   none, none, none, none, none, none, none, none,
        !            90: };
        !            91: static const signed char jamo_medial_index[32] = {
        !            92:   none, none, fill, 0x01, 0x02, 0x03, 0x04, 0x05,
        !            93:   none, none, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
        !            94:   none, none, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11,
        !            95:   none, none, 0x12, 0x13, 0x14, 0x15, none, none,
        !            96: };
        !            97: static const signed char jamo_final_index[32] = {
        !            98:   none, fill, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
        !            99:   0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
        !           100:   0x0f, 0x10, none, 0x11, 0x12, 0x13, 0x14, 0x15,
        !           101:   0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, none, none,
        !           102: };
        !           103: 
        !           104: static int
        !           105: johab_hangul_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)
        !           106: {
        !           107:   unsigned char c1 = s[0];
        !           108:   if ((c1 >= 0x84 && c1 <= 0xd3)) {
        !           109:     if (n >= 2) {
        !           110:       unsigned char c2 = s[1];
        !           111:       if ((c2 >= 0x41 && c2 < 0x7f) || (c2 >= 0x81 && c2 < 0xff)) {
        !           112:         unsigned int johab = (c1 << 8) | c2;
        !           113:         unsigned int bitspart1 = (johab >> 10) & 31;
        !           114:         unsigned int bitspart2 = (johab >> 5) & 31;
        !           115:         unsigned int bitspart3 = johab & 31;
        !           116:         int index1 = jamo_initial_index[bitspart1];
        !           117:         int index2 = jamo_medial_index[bitspart2];
        !           118:         int index3 = jamo_final_index[bitspart3];
        !           119:         /* Exclude "none" values. */
        !           120:         if (index1 >= 0 && index2 >= 0 && index3 >= 0) {
        !           121:           /* Deal with "fill" values in initial or medial position. */
        !           122:           if (index1 == fill) {
        !           123:             if (index2 == fill) {
        !           124:               unsigned char jamo3 = jamo_final_notinitial[bitspart3];
        !           125:               if (jamo3 != NONE) {
        !           126:                 *pwc = (ucs4_t) 0x3130 + jamo3;
        !           127:                 return 2;
        !           128:               }
        !           129:             } else if (index3 == fill) {
        !           130:               unsigned char jamo2 = jamo_medial[bitspart2];
        !           131:               if (jamo2 != NONE && jamo2 != FILL) {
        !           132:                 *pwc = (ucs4_t) 0x3130 + jamo2;
        !           133:                 return 2;
        !           134:               }
        !           135:             }
        !           136:             /* Syllables composed only of medial and final don't exist. */
        !           137:           } else if (index2 == fill) {
        !           138:             if (index3 == fill) {
        !           139:               unsigned char jamo1 = jamo_initial[bitspart1];
        !           140:               if (jamo1 != NONE && jamo1 != FILL) {
        !           141:                 *pwc = (ucs4_t) 0x3130 + jamo1;
        !           142:                 return 2;
        !           143:               }
        !           144:             }
        !           145:             /* Syllables composed only of initial and final don't exist. */
        !           146:           } else {
        !           147:              /* index1 and index2 are not fill, but index3 may be fill. */
        !           148:              /* Nothing more to exclude. All 11172 code points are valid. */
        !           149:              *pwc = 0xac00 + ((index1 - 1) * 21 + (index2 - 1)) * 28 + index3;
        !           150:              return 2;
        !           151:           }
        !           152:         }
        !           153:       }
        !           154:       return RET_ILSEQ;
        !           155:     }
        !           156:     return RET_TOOFEW(0);
        !           157:   }
        !           158:   return RET_ILSEQ;
        !           159: }
        !           160: 
        !           161: /* 51 Jamo: 19 initial, 21 medial, 11 final not initial. */
        !           162: static const unsigned short johab_hangul_page31[51] = {
        !           163:           0x8841, 0x8c41, 0x8444, 0x9041, 0x8446, 0x8447, 0x9441, /*0x30-0x37*/
        !           164:   0x9841, 0x9c41, 0x844a, 0x844b, 0x844c, 0x844d, 0x844e, 0x844f, /*0x38-0x3f*/
        !           165:   0x8450, 0xa041, 0xa441, 0xa841, 0x8454, 0xac41, 0xb041, 0xb441, /*0x40-0x47*/
        !           166:   0xb841, 0xbc41, 0xc041, 0xc441, 0xc841, 0xcc41, 0xd041, 0x8461, /*0x48-0x4f*/
        !           167:   0x8481, 0x84a1, 0x84c1, 0x84e1, 0x8541, 0x8561, 0x8581, 0x85a1, /*0x50-0x57*/
        !           168:   0x85c1, 0x85e1, 0x8641, 0x8661, 0x8681, 0x86a1, 0x86c1, 0x86e1, /*0x58-0x5f*/
        !           169:   0x8741, 0x8761, 0x8781, 0x87a1,                                 /*0x60-0x67*/
        !           170: };
        !           171: 
        !           172: /* Tables mapping packed indices to 5-bit groups. */
        !           173: /* index1+1 = jamo_initial_index[bitspart1]  <==>
        !           174:    bitspart1 = jamo_initial_index_inverse[index1] */
        !           175: static const char jamo_initial_index_inverse[19] = {
        !           176:               0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
        !           177:   0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
        !           178:   0x10, 0x11, 0x12, 0x13, 0x14,
        !           179: };
        !           180: /* index2+1 = jamo_medial_index[bitspart2]  <==>
        !           181:    bitspart2 = jamo_medial_index_inverse[index2] */
        !           182: static const char jamo_medial_index_inverse[21] = {
        !           183:                     0x03, 0x04, 0x05, 0x06, 0x07,
        !           184:               0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
        !           185:               0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
        !           186:               0x1a, 0x1b, 0x1c, 0x1d,
        !           187: };
        !           188: /* index3 = jamo_final_index[bitspart3]  <==>
        !           189:    bitspart3 = jamo_final_index_inverse[index3] */
        !           190: static const char jamo_final_index_inverse[28] = {
        !           191:         0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
        !           192:   0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
        !           193:   0x10, 0x11,       0x13, 0x14, 0x15, 0x16, 0x17,
        !           194:   0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d,
        !           195: };
        !           196: 
        !           197: static int
        !           198: johab_hangul_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)
        !           199: {
        !           200:   if (n >= 2) {
        !           201:     if (wc >= 0x3131 && wc < 0x3164) {
        !           202:       unsigned short c = johab_hangul_page31[wc-0x3131];
        !           203:       r[0] = (c >> 8); r[1] = (c & 0xff);
        !           204:       return 2;
        !           205:     } else if (wc >= 0xac00 && wc < 0xd7a4) {
        !           206:       unsigned int index1;
        !           207:       unsigned int index2;
        !           208:       unsigned int index3;
        !           209:       unsigned short c;
        !           210:       unsigned int tmp = wc - 0xac00;
        !           211:       index3 = tmp % 28; tmp = tmp / 28;
        !           212:       index2 = tmp % 21; tmp = tmp / 21;
        !           213:       index1 = tmp;
        !           214:       c = (((((1 << 5)
        !           215:               | jamo_initial_index_inverse[index1]) << 5)
        !           216:             | jamo_medial_index_inverse[index2]) << 5)
        !           217:           | jamo_final_index_inverse[index3];
        !           218:       r[0] = (c >> 8); r[1] = (c & 0xff);
        !           219:       return 2;
        !           220:     }
        !           221:     return RET_ILUNI;
        !           222:   }
        !           223:   return RET_TOOSMALL;
        !           224: }
        !           225: 
        !           226: /*
        !           227:  * Decomposition of JOHAB Hangul in one to three Johab Jamo elements.
        !           228:  */
        !           229: 
        !           230: /* Decompose wc into r[0..2], and return the number of resulting Jamo elements.
        !           231:    Return RET_ILUNI if decomposition is not possible. */
        !           232: 
        !           233: static int johab_hangul_decompose (conv_t conv, ucs4_t* r, ucs4_t wc)
        !           234: {
        !           235:   unsigned char buf[2];
        !           236:   int ret = johab_hangul_wctomb(conv,buf,wc,2);
        !           237:   if (ret != RET_ILUNI) {
        !           238:     unsigned int hangul = (buf[0] << 8) | buf[1];
        !           239:     unsigned char jamo1 = jamo_initial[(hangul >> 10) & 31];
        !           240:     unsigned char jamo2 = jamo_medial[(hangul >> 5) & 31];
        !           241:     unsigned char jamo3 = jamo_final[hangul & 31];
        !           242:     if ((hangul >> 15) != 1) abort();
        !           243:     if (jamo1 != NONE && jamo2 != NONE && jamo3 != NONE) {
        !           244:       /* They are not all three == FILL because that would correspond to
        !           245:          johab = 0x8441, which doesn't exist. */
        !           246:       ucs4_t* p = r;
        !           247:       if (jamo1 != FILL)
        !           248:         *p++ = 0x3130 + jamo1;
        !           249:       if (jamo2 != FILL)
        !           250:         *p++ = 0x3130 + jamo2;
        !           251:       if (jamo3 != FILL)
        !           252:         *p++ = 0x3130 + jamo3;
        !           253:       return p-r;
        !           254:     }
        !           255:   }
        !           256:   return RET_ILUNI;
        !           257: }
        !           258: 
/* Drop the table-marker macros defined earlier in this file so that they
   are no longer defined past this point. */
#undef fill
#undef none
#undef FILL
#undef NONE

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>