Annotation of embedaddon/libiconv/lib/johab_hangul.h, revision 1.1
1.1 ! misho 1: /*
! 2: * Copyright (C) 1999-2001 Free Software Foundation, Inc.
! 3: * This file is part of the GNU LIBICONV Library.
! 4: *
! 5: * The GNU LIBICONV Library is free software; you can redistribute it
! 6: * and/or modify it under the terms of the GNU Library General Public
! 7: * License as published by the Free Software Foundation; either version 2
! 8: * of the License, or (at your option) any later version.
! 9: *
! 10: * The GNU LIBICONV Library is distributed in the hope that it will be
! 11: * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
! 12: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 13: * Library General Public License for more details.
! 14: *
! 15: * You should have received a copy of the GNU Library General Public
! 16: * License along with the GNU LIBICONV Library; see the file COPYING.LIB.
! 17: * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,
! 18: * Fifth Floor, Boston, MA 02110-1301, USA.
! 19: */
! 20:
! 21: /*
! 22: * JOHAB Hangul
! 23: *
! 24: * Ken Lunde writes in his "CJKV Information Processing" book, p. 114:
! 25: * "Hangul can be composed of two or three jamo (some jamo are considered
! 26: * compound). Johab uses 19 initial jamo (consonants), 21 medial jamo (vowels)
! 27: * and 27 final jamo (consonants; 28 when you include the "fill" character
! 28: * for Hangul containing only two jamo). Multiplying these numbers results in
! 29: * 11172."
! 30: *
! 31: * Structure of the Johab encoding (see p. 181-184):
! 32: * bit 15 = 1
! 33: * bit 14..10 = initial jamo, only 19+1 out of 32 possible values are used
! 34: * bit 9..5 = medial jamo, only 21+1 out of 32 possible values are used
! 35: * bit 4..0 = final jamo, only 27+1 out of 32 possible values are used
! 36: *
! 37: * Structure of the Unicode encoding:
! 38: * grep '^0x\([8-C]...\|D[0-7]..\)' unicode.org-mappings/EASTASIA/KSC/JOHAB.TXT
! 39: * You see that all characters there are marked "HANGUL LETTER" or "HANGUL
! 40: * SYLLABLE". If you eliminate the "HANGUL LETTER"s, the table is sorted
! 41: * in ascending order according to Johab encoding and according to the Unicode
! 42: * encoding. Now look a little more carefully, and you see that the following
! 43: * formula holds:
! 44: * unicode == 0xAC00
! 45: * + 21 * 28 * (jamo_initial_index[(johab >> 10) & 31] - 1)
! 46: * + 28 * (jamo_medial_index[(johab >> 5) & 31] - 1)
! 47: * + jamo_final_index[johab & 31]
! 48: * where the index tables are defined as below.
! 49: */
! 50:
/*
 * Lookup tables: 5-bit Johab field value -> jamo letter number.
 *
 * A jamo letter number XX corresponds to UHC 0xA4A0+XX and to
 * Unicode 0x3130+XX.  Two out-of-band markers are used:
 *   NONE - the 5-bit value is not assigned in this position,
 *   FILL - the 5-bit value is the "fill" code (no jamo in this position).
 */
#define NONE 0xfd
#define FILL 0xff

/* Initial (leading consonant) field, bits 14..10. */
static const unsigned char jamo_initial[32] = {
  NONE, FILL, 0x01, 0x02, 0x04, 0x07, 0x08, 0x09,   /* 0x00-0x07 */
  0x11, 0x12, 0x13, 0x15, 0x16, 0x17, 0x18, 0x19,   /* 0x08-0x0f */
  0x1a, 0x1b, 0x1c, 0x1d, 0x1e, NONE, NONE, NONE,   /* 0x10-0x17 */
  NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,   /* 0x18-0x1f */
};

/* Medial (vowel) field, bits 9..5. */
static const unsigned char jamo_medial[32] = {
  NONE, NONE, FILL, 0x1f, 0x20, 0x21, 0x22, 0x23,   /* 0x00-0x07 */
  NONE, NONE, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29,   /* 0x08-0x0f */
  NONE, NONE, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,   /* 0x10-0x17 */
  NONE, NONE, 0x30, 0x31, 0x32, 0x33, NONE, NONE,   /* 0x18-0x1f */
};

/* Final (trailing consonant) field, bits 4..0. */
static const unsigned char jamo_final[32] = {
  NONE, FILL, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,   /* 0x00-0x07 */
  0x07, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,   /* 0x08-0x0f */
  0x10, 0x11, NONE, 0x12, 0x14, 0x15, 0x16, 0x17,   /* 0x10-0x17 */
  0x18, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, NONE, NONE,   /* 0x18-0x1f */
};

/* Variant of jamo_final that keeps only the letters which do not also
   occur in jamo_initial: 11 letters remain out of 27. */
static const unsigned char jamo_final_notinitial[32] = {
  NONE, NONE, NONE, NONE, 0x03, NONE, 0x05, 0x06,   /* 0x00-0x07 */
  NONE, NONE, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,   /* 0x08-0x0f */
  0x10, NONE, NONE, NONE, 0x14, NONE, NONE, NONE,   /* 0x10-0x17 */
  NONE, NONE, NONE, NONE, NONE, NONE, NONE, NONE,   /* 0x18-0x1f */
};
! 81:
/*
 * Lookup tables: 5-bit Johab field value -> packed index.
 *
 *   none (-1) - the 5-bit value is not assigned in this position,
 *   fill (0)  - the 5-bit value is the "fill" code,
 *   1..N      - consecutive number of the jamo within its position.
 */
#define none -1
#define fill 0

/* Initial field: 19 jamo are numbered 1..19. */
static const signed char jamo_initial_index[32] = {
  none, fill, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,   /* 0x00-0x07 */
  0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,   /* 0x08-0x0f */
  0x0f, 0x10, 0x11, 0x12, 0x13, none, none, none,   /* 0x10-0x17 */
  none, none, none, none, none, none, none, none,   /* 0x18-0x1f */
};

/* Medial field: 21 jamo are numbered 1..21. */
static const signed char jamo_medial_index[32] = {
  none, none, fill, 0x01, 0x02, 0x03, 0x04, 0x05,   /* 0x00-0x07 */
  none, none, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,   /* 0x08-0x0f */
  none, none, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11,   /* 0x10-0x17 */
  none, none, 0x12, 0x13, 0x14, 0x15, none, none,   /* 0x18-0x1f */
};

/* Final field: 27 jamo are numbered 1..27. */
static const signed char jamo_final_index[32] = {
  none, fill, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,   /* 0x00-0x07 */
  0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,   /* 0x08-0x0f */
  0x0f, 0x10, none, 0x11, 0x12, 0x13, 0x14, 0x15,   /* 0x10-0x17 */
  0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, none, none,   /* 0x18-0x1f */
};
! 103:
! 104: static int
! 105: johab_hangul_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)
! 106: {
! 107: unsigned char c1 = s[0];
! 108: if ((c1 >= 0x84 && c1 <= 0xd3)) {
! 109: if (n >= 2) {
! 110: unsigned char c2 = s[1];
! 111: if ((c2 >= 0x41 && c2 < 0x7f) || (c2 >= 0x81 && c2 < 0xff)) {
! 112: unsigned int johab = (c1 << 8) | c2;
! 113: unsigned int bitspart1 = (johab >> 10) & 31;
! 114: unsigned int bitspart2 = (johab >> 5) & 31;
! 115: unsigned int bitspart3 = johab & 31;
! 116: int index1 = jamo_initial_index[bitspart1];
! 117: int index2 = jamo_medial_index[bitspart2];
! 118: int index3 = jamo_final_index[bitspart3];
! 119: /* Exclude "none" values. */
! 120: if (index1 >= 0 && index2 >= 0 && index3 >= 0) {
! 121: /* Deal with "fill" values in initial or medial position. */
! 122: if (index1 == fill) {
! 123: if (index2 == fill) {
! 124: unsigned char jamo3 = jamo_final_notinitial[bitspart3];
! 125: if (jamo3 != NONE) {
! 126: *pwc = (ucs4_t) 0x3130 + jamo3;
! 127: return 2;
! 128: }
! 129: } else if (index3 == fill) {
! 130: unsigned char jamo2 = jamo_medial[bitspart2];
! 131: if (jamo2 != NONE && jamo2 != FILL) {
! 132: *pwc = (ucs4_t) 0x3130 + jamo2;
! 133: return 2;
! 134: }
! 135: }
! 136: /* Syllables composed only of medial and final don't exist. */
! 137: } else if (index2 == fill) {
! 138: if (index3 == fill) {
! 139: unsigned char jamo1 = jamo_initial[bitspart1];
! 140: if (jamo1 != NONE && jamo1 != FILL) {
! 141: *pwc = (ucs4_t) 0x3130 + jamo1;
! 142: return 2;
! 143: }
! 144: }
! 145: /* Syllables composed only of initial and final don't exist. */
! 146: } else {
! 147: /* index1 and index2 are not fill, but index3 may be fill. */
! 148: /* Nothing more to exclude. All 11172 code points are valid. */
! 149: *pwc = 0xac00 + ((index1 - 1) * 21 + (index2 - 1)) * 28 + index3;
! 150: return 2;
! 151: }
! 152: }
! 153: }
! 154: return RET_ILSEQ;
! 155: }
! 156: return RET_TOOFEW(0);
! 157: }
! 158: return RET_ILSEQ;
! 159: }
! 160:
/* Johab codes of the 51 lone jamo (19 initial, 21 medial, 11 final that
   are not also initial), indexed by Unicode code point minus 0x3131. */
static const unsigned short johab_hangul_page31[51] = {
  /* U+3131..U+3137 */
  0x8841, 0x8c41, 0x8444, 0x9041, 0x8446, 0x8447, 0x9441,
  /* U+3138..U+313F */
  0x9841, 0x9c41, 0x844a, 0x844b, 0x844c, 0x844d, 0x844e, 0x844f,
  /* U+3140..U+3147 */
  0x8450, 0xa041, 0xa441, 0xa841, 0x8454, 0xac41, 0xb041, 0xb441,
  /* U+3148..U+314F */
  0xb841, 0xbc41, 0xc041, 0xc441, 0xc841, 0xcc41, 0xd041, 0x8461,
  /* U+3150..U+3157 */
  0x8481, 0x84a1, 0x84c1, 0x84e1, 0x8541, 0x8561, 0x8581, 0x85a1,
  /* U+3158..U+315F */
  0x85c1, 0x85e1, 0x8641, 0x8661, 0x8681, 0x86a1, 0x86c1, 0x86e1,
  /* U+3160..U+3163 */
  0x8741, 0x8761, 0x8781, 0x87a1,
};
! 171:
/*
 * Reverse lookup tables: packed index -> 5-bit Johab field value.
 */

/* jamo_initial_index[bitspart1] == index1+1
     <==>  jamo_initial_index_inverse[index1] == bitspart1 */
static const char jamo_initial_index_inverse[19] = {
  0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09,
  0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11,
  0x12, 0x13, 0x14,
};

/* jamo_medial_index[bitspart2] == index2+1
     <==>  jamo_medial_index_inverse[index2] == bitspart2 */
static const char jamo_medial_index_inverse[21] = {
  0x03, 0x04, 0x05, 0x06, 0x07, 0x0a, 0x0b, 0x0c,
  0x0d, 0x0e, 0x0f, 0x12, 0x13, 0x14, 0x15, 0x16,
  0x17, 0x1a, 0x1b, 0x1c, 0x1d,
};

/* jamo_final_index[bitspart3] == index3
     <==>  jamo_final_index_inverse[index3] == bitspart3 */
static const char jamo_final_index_inverse[28] = {
  0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
  0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10,
  0x11, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
  0x1a, 0x1b, 0x1c, 0x1d,
};
! 196:
! 197: static int
! 198: johab_hangul_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)
! 199: {
! 200: if (n >= 2) {
! 201: if (wc >= 0x3131 && wc < 0x3164) {
! 202: unsigned short c = johab_hangul_page31[wc-0x3131];
! 203: r[0] = (c >> 8); r[1] = (c & 0xff);
! 204: return 2;
! 205: } else if (wc >= 0xac00 && wc < 0xd7a4) {
! 206: unsigned int index1;
! 207: unsigned int index2;
! 208: unsigned int index3;
! 209: unsigned short c;
! 210: unsigned int tmp = wc - 0xac00;
! 211: index3 = tmp % 28; tmp = tmp / 28;
! 212: index2 = tmp % 21; tmp = tmp / 21;
! 213: index1 = tmp;
! 214: c = (((((1 << 5)
! 215: | jamo_initial_index_inverse[index1]) << 5)
! 216: | jamo_medial_index_inverse[index2]) << 5)
! 217: | jamo_final_index_inverse[index3];
! 218: r[0] = (c >> 8); r[1] = (c & 0xff);
! 219: return 2;
! 220: }
! 221: return RET_ILUNI;
! 222: }
! 223: return RET_TOOSMALL;
! 224: }
! 225:
! 226: /*
! 227: * Decomposition of JOHAB Hangul in one to three Johab Jamo elements.
! 228: */
! 229:
! 230: /* Decompose wc into r[0..2], and return the number of resulting Jamo elements.
! 231: Return RET_ILUNI if decomposition is not possible. */
! 232:
! 233: static int johab_hangul_decompose (conv_t conv, ucs4_t* r, ucs4_t wc)
! 234: {
! 235: unsigned char buf[2];
! 236: int ret = johab_hangul_wctomb(conv,buf,wc,2);
! 237: if (ret != RET_ILUNI) {
! 238: unsigned int hangul = (buf[0] << 8) | buf[1];
! 239: unsigned char jamo1 = jamo_initial[(hangul >> 10) & 31];
! 240: unsigned char jamo2 = jamo_medial[(hangul >> 5) & 31];
! 241: unsigned char jamo3 = jamo_final[hangul & 31];
! 242: if ((hangul >> 15) != 1) abort();
! 243: if (jamo1 != NONE && jamo2 != NONE && jamo3 != NONE) {
! 244: /* They are not all three == FILL because that would correspond to
! 245: johab = 0x8441, which doesn't exist. */
! 246: ucs4_t* p = r;
! 247: if (jamo1 != FILL)
! 248: *p++ = 0x3130 + jamo1;
! 249: if (jamo2 != FILL)
! 250: *p++ = 0x3130 + jamo2;
! 251: if (jamo3 != FILL)
! 252: *p++ = 0x3130 + jamo3;
! 253: return p-r;
! 254: }
! 255: }
! 256: return RET_ILUNI;
! 257: }
! 258:
/* Drop the table marker macros so they do not leak into the code that
   includes this file. */
#undef NONE
#undef FILL
#undef none
#undef fill
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>