Annotation of embedaddon/libiconv/tools/cjk_tab_to_h.c, revision 1.1.1.1
1.1 misho 1: /* Copyright (C) 1999-2004, 2006-2007 Free Software Foundation, Inc.
2: This file is part of the GNU LIBICONV Tools.
3:
4: This program is free software: you can redistribute it and/or modify
5: it under the terms of the GNU General Public License as published by
6: the Free Software Foundation; either version 3 of the License, or
7: (at your option) any later version.
8:
9: This program is distributed in the hope that it will be useful,
10: but WITHOUT ANY WARRANTY; without even the implied warranty of
11: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12: GNU General Public License for more details.
13:
14: You should have received a copy of the GNU General Public License
15: along with this program. If not, see
16: <http://www.gnu.org/licenses/>. */
17:
18: /*
19: * Generates a CJK character set table from a .TXT table as found on
20: * ftp.unicode.org or in the X nls directory.
21: * Examples:
22: *
23: * ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < gb2312
24: * ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < jis0208
25: * ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < ksc5601
26: *
27: * ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < GB2312.TXT
28: * ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < JIS0208.TXT
29: * ./cjk_tab_to_h JISX0212.1990-0 jisx0212 > jisx0212.h < JIS0212.TXT
30: * ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < KSC5601.TXT
31: * ./cjk_tab_to_h KSX1001.1992-0 ksc5601 > ksc5601.h < KSX1001.TXT
32: *
33: * ./cjk_tab_to_h BIG5 big5 > big5.h < BIG5.TXT
34: *
35: * ./cjk_tab_to_h JOHAB johab > johab.h < JOHAB.TXT
36: *
37: * ./cjk_tab_to_h JISX0213:2004 jisx0213 > jisx0213.h < JISX0213.TXT
38: */
39:
40: #include <stdio.h>
41: #include <stdlib.h>
42: #include <stdbool.h>
43: #include <string.h>
44: #include <ctype.h>
45: #include <assert.h>
46:
/* A half-open range [start, end) of linear table indices. */
typedef struct {
  int start;   /* first index belonging to the block */
  int end;     /* one past the last index belonging to the block */
} Block;
51:
52: typedef struct {
53: int rows; /* number of possible values for the 1st byte */
54: int cols; /* number of possible values for the 2nd byte */
55: int (*row_byte) (int row); /* returns the 1st byte value for a given row */
56: int (*col_byte) (int col); /* returns the 2nd byte value for a given col */
57: int (*byte_row) (int byte); /* converts a 1st byte value to a row, else -1 */
58: int (*byte_col) (int byte); /* converts a 2nd byte value to a col, else -1 */
59: const char* check_row_expr; /* format string for 1st byte value checking */
60: const char* check_col_expr; /* format string for 2nd byte value checking */
61: const char* byte_row_expr; /* format string for 1st byte value to row */
62: const char* byte_col_expr; /* format string for 2nd byte value to col */
63: int** charset2uni; /* charset2uni[0..rows-1][0..cols-1] is valid */
64: /* You'll understand the terms "row" and "col" when you buy Ken Lunde's book.
65: Once a row is fixed, choosing a "col" is the same as choosing a "cell". */
66: int* charsetpage; /* charsetpage[0..rows]: how large is a page for a row */
67: int ncharsetblocks;
68: Block* charsetblocks; /* blocks[0..nblocks-1] */
69: int* uni2charset; /* uni2charset[0x0000..0xffff] */
70: int fffd; /* uni representation of the invalid character */
71: } Encoding;
72:
/*
 * Writes the header of the generated file to standard output: the library
 * licence notice followed by a short comment naming the character set.
 */
static void output_title (const char *charsetname)
{
  /* Everything that precedes the character set name, one entry per line. */
  static const char *const preamble[] = {
    "/*\n",
    " * Copyright (C) 1999-2007 Free Software Foundation, Inc.\n",
    " * This file is part of the GNU LIBICONV Library.\n",
    " *\n",
    " * The GNU LIBICONV Library is free software; you can redistribute it\n",
    " * and/or modify it under the terms of the GNU Library General Public\n",
    " * License as published by the Free Software Foundation; either version 2\n",
    " * of the License, or (at your option) any later version.\n",
    " *\n",
    " * The GNU LIBICONV Library is distributed in the hope that it will be\n",
    " * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
    " * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\n",
    " * Library General Public License for more details.\n",
    " *\n",
    " * You should have received a copy of the GNU Library General Public\n",
    " * License along with the GNU LIBICONV Library; see the file COPYING.LIB.\n",
    " * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,\n",
    " * Fifth Floor, Boston, MA 02110-1301, USA.\n",
    " */\n",
    "\n",
    "/*\n"
  };
  size_t k;

  for (k = 0; k < sizeof(preamble) / sizeof(preamble[0]); k++)
    fputs(preamble[k], stdout);

  printf(" * %s\n", charsetname);
  fputs(" */\n", stdout);
  fputs("\n", stdout);
}
103:
104: /*
105: * Reads the charset2uni table from standard input.
106: */
107: static void read_table (Encoding* enc)
108: {
109: int row, col, i, i1, i2, c, j;
110:
111: enc->charset2uni = (int**) malloc(enc->rows*sizeof(int*));
112: for (row = 0; row < enc->rows; row++)
113: enc->charset2uni[row] = (int*) malloc(enc->cols*sizeof(int));
114:
115: for (row = 0; row < enc->rows; row++)
116: for (col = 0; col < enc->cols; col++)
117: enc->charset2uni[row][col] = 0xfffd;
118:
119: c = getc(stdin);
120: ungetc(c,stdin);
121: if (c == '#') {
122: /* Read a unicode.org style .TXT file. */
123: for (;;) {
124: c = getc(stdin);
125: if (c == EOF)
126: break;
127: if (c == '\n' || c == ' ' || c == '\t')
128: continue;
129: if (c == '#') {
130: do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
131: continue;
132: }
133: ungetc(c,stdin);
134: if (scanf("0x%x", &j) != 1)
135: exit(1);
136: i1 = j >> 8;
137: i2 = j & 0xff;
138: row = enc->byte_row(i1);
139: col = enc->byte_col(i2);
140: if (row < 0 || col < 0) {
141: fprintf(stderr, "lost entry for %02x %02x\n", i1, i2);
142: exit(1);
143: }
144: if (scanf(" 0x%x", &enc->charset2uni[row][col]) != 1)
145: exit(1);
146: }
147: } else {
148: /* Read a table of hexadecimal Unicode values. */
149: for (i1 = 32; i1 < 132; i1++)
150: for (i2 = 32; i2 < 132; i2++) {
151: i = scanf("%x", &j);
152: if (i == EOF)
153: goto read_done;
154: if (i != 1)
155: exit(1);
156: if (j < 0 || j == 0xffff)
157: j = 0xfffd;
158: if (j != 0xfffd) {
159: if (enc->byte_row(i1) < 0 || enc->byte_col(i2) < 0) {
160: fprintf(stderr, "lost entry at %02x %02x\n", i1, i2);
161: exit (1);
162: }
163: enc->charset2uni[enc->byte_row(i1)][enc->byte_col(i2)] = j;
164: }
165: }
166: read_done: ;
167: }
168: }
169:
170: /*
171: * Determine whether the Unicode range goes outside the BMP.
172: */
173: static bool is_charset2uni_large (Encoding* enc)
174: {
175: int row, col;
176:
177: for (row = 0; row < enc->rows; row++)
178: for (col = 0; col < enc->cols; col++)
179: if (enc->charset2uni[row][col] >= 0x10000)
180: return true;
181: return false;
182: }
183:
184: /*
185: * Compactify the Unicode range by use of an auxiliary table,
186: * so 16 bits suffice to store each value.
187: */
188: static int compact_large_charset2uni (Encoding* enc, unsigned int **urows, unsigned int *urowshift)
189: {
190: unsigned int shift;
191:
192: for (shift = 8; ; shift--) {
193: int *upages = (int *) malloc((0x110000>>shift) * sizeof(int));
194: int i, row, col, nurows;
195:
196: for (i = 0; i < 0x110000>>shift; i++)
197: upages[i] = -1;
198:
199: for (row = 0; row < enc->rows; row++)
200: for (col = 0; col < enc->cols; col++)
201: upages[enc->charset2uni[row][col] >> shift] = 0;
202:
203: nurows = 0;
204: for (i = 0; i < 0x110000>>shift; i++)
205: if (upages[i] == 0)
206: nurows++;
207:
208: /* We want all table entries to fit in an 'unsigned short'. */
209: if (nurows <= 1<<(16-shift)) {
210: int** old_charset2uni;
211:
212: *urows = (unsigned int *) malloc(nurows * sizeof(unsigned int));
213: *urowshift = shift;
214:
215: nurows = 0;
216: for (i = 0; i < 0x110000>>shift; i++)
217: if (upages[i] == 0) {
218: upages[i] = nurows;
219: (*urows)[nurows] = i;
220: nurows++;
221: }
222:
223: old_charset2uni = enc->charset2uni;
224: enc->charset2uni = (int**) malloc(enc->rows*sizeof(int*));
225: for (row = 0; row < enc->rows; row++)
226: enc->charset2uni[row] = (int*) malloc(enc->cols*sizeof(int));
227: for (row = 0; row < enc->rows; row++)
228: for (col = 0; col < enc->cols; col++) {
229: int u = old_charset2uni[row][col];
230: enc->charset2uni[row][col] =
231: (upages[u >> shift] << shift) | (u & ((1 << shift) - 1));
232: }
233: enc->fffd =
234: (upages[0xfffd >> shift] << shift) | (0xfffd & ((1 << shift) - 1));
235:
236: return nurows;
237: }
238: }
239: abort();
240: }
241:
242: /*
243: * Computes the charsetpage[0..rows] array.
244: */
245: static void find_charset2uni_pages (Encoding* enc)
246: {
247: int row, col;
248:
249: enc->charsetpage = (int*) malloc((enc->rows+1)*sizeof(int));
250:
251: for (row = 0; row <= enc->rows; row++)
252: enc->charsetpage[row] = 0;
253:
254: for (row = 0; row < enc->rows; row++) {
255: int used = 0;
256: for (col = 0; col < enc->cols; col++)
257: if (enc->charset2uni[row][col] != enc->fffd)
258: used = col+1;
259: enc->charsetpage[row] = used;
260: }
261: }
262:
263: /*
264: * Fills in nblocks and blocks.
265: */
266: static void find_charset2uni_blocks (Encoding* enc)
267: {
268: int n, row, lastrow;
269:
270: enc->charsetblocks = (Block*) malloc(enc->rows*sizeof(Block));
271:
272: n = 0;
273: for (row = 0; row < enc->rows; row++)
274: if (enc->charsetpage[row] > 0 && (row == 0 || enc->charsetpage[row-1] == 0)) {
275: for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
276: enc->charsetblocks[n].start = row * enc->cols;
277: enc->charsetblocks[n].end = lastrow * enc->cols + enc->charsetpage[lastrow];
278: n++;
279: }
280: enc->ncharsetblocks = n;
281: }
282:
283: /*
284: * Outputs the charset to unicode table and function.
285: */
286: static void output_charset2uni (const char* name, Encoding* enc)
287: {
288: int nurows, row, col, lastrow, col_max, i, i1_min, i1_max;
289: bool is_large;
290: unsigned int* urows;
291: unsigned int urowshift;
292: Encoding tmpenc;
293:
294: is_large = is_charset2uni_large(enc);
295: if (is_large) {
296: /* Use a temporary copy of enc. */
297: tmpenc = *enc;
298: enc = &tmpenc;
299: nurows = compact_large_charset2uni(enc,&urows,&urowshift);
300: } else {
301: nurows = 0; urows = NULL; urowshift = 0; enc->fffd = 0xfffd;
302: }
303:
304: find_charset2uni_pages(enc);
305:
306: find_charset2uni_blocks(enc);
307:
308: for (row = 0; row < enc->rows; row++)
309: if (enc->charsetpage[row] > 0) {
310: if (row == 0 || enc->charsetpage[row-1] == 0) {
311: /* Start a new block. */
312: for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
313: printf("static const unsigned short %s_2uni_page%02x[%d] = {\n",
314: name, enc->row_byte(row),
315: (lastrow-row) * enc->cols + enc->charsetpage[lastrow]);
316: }
317: printf(" /""* 0x%02x *""/\n ", enc->row_byte(row));
318: col_max = (enc->charsetpage[row+1] > 0 ? enc->cols : enc->charsetpage[row]);
319: for (col = 0; col < col_max; col++) {
320: printf(" 0x%04x,", enc->charset2uni[row][col]);
321: if ((col % 8) == 7 && (col+1 < col_max)) printf("\n ");
322: }
323: printf("\n");
324: if (enc->charsetpage[row+1] == 0) {
325: /* End a block. */
326: printf("};\n");
327: }
328: }
329: printf("\n");
330:
331: if (is_large) {
332: printf("static const ucs4_t %s_2uni_upages[%d] = {\n ", name, nurows);
333: for (i = 0; i < nurows; i++) {
334: printf(" 0x%05x,", urows[i] << urowshift);
335: if ((i % 8) == 7 && (i+1 < nurows)) printf("\n ");
336: }
337: printf("\n");
338: printf("};\n");
339: printf("\n");
340: }
341:
342: printf("static int\n");
343: printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name);
344: printf("{\n");
345: printf(" unsigned char c1 = s[0];\n");
346: printf(" if (");
347: for (i = 0; i < enc->ncharsetblocks; i++) {
348: i1_min = enc->row_byte(enc->charsetblocks[i].start / enc->cols);
349: i1_max = enc->row_byte((enc->charsetblocks[i].end-1) / enc->cols);
350: if (i > 0)
351: printf(" || ");
352: if (i1_min == i1_max)
353: printf("(c1 == 0x%02x)", i1_min);
354: else
355: printf("(c1 >= 0x%02x && c1 <= 0x%02x)", i1_min, i1_max);
356: }
357: printf(") {\n");
358: printf(" if (n >= 2) {\n");
359: printf(" unsigned char c2 = s[1];\n");
360: printf(" if (");
361: printf(enc->check_col_expr, "c2");
362: printf(") {\n");
363: printf(" unsigned int i = %d * (", enc->cols);
364: printf(enc->byte_row_expr, "c1");
365: printf(") + (");
366: printf(enc->byte_col_expr, "c2");
367: printf(");\n");
368: printf(" %s wc = 0xfffd;\n", is_large ? "ucs4_t" : "unsigned short");
369: if (is_large) printf(" unsigned short swc;\n");
370: for (i = 0; i < enc->ncharsetblocks; i++) {
371: printf(" ");
372: if (i > 0)
373: printf("} else ");
374: if (i < enc->ncharsetblocks-1)
375: printf("if (i < %d) ", enc->charsetblocks[i+1].start);
376: printf("{\n");
377: printf(" if (i < %d)\n", enc->charsetblocks[i].end);
378: printf(" %s = ", is_large ? "swc" : "wc");
379: printf("%s_2uni_page%02x[i", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols));
380: if (enc->charsetblocks[i].start > 0)
381: printf("-%d", enc->charsetblocks[i].start);
382: printf("]");
383: if (is_large) printf(",\n wc = %s_2uni_upages[swc>>%d] | (swc & 0x%x)", name, urowshift, (1 << urowshift) - 1);
384: printf(";\n");
385: }
386: printf(" }\n");
387: printf(" if (wc != 0xfffd) {\n");
388: printf(" *pwc = %swc;\n", is_large ? "" : "(ucs4_t) ");
389: printf(" return 2;\n");
390: printf(" }\n");
391: printf(" }\n");
392: printf(" return RET_ILSEQ;\n");
393: printf(" }\n");
394: printf(" return RET_TOOFEW(0);\n");
395: printf(" }\n");
396: printf(" return RET_ILSEQ;\n");
397: printf("}\n");
398: printf("\n");
399: }
400:
401: /*
402: * Outputs the charset to unicode table and function.
403: * (Suitable if the mapping function is well defined, i.e. has no holes, and
404: * is monotonically increasing with small gaps only.)
405: */
406: static void output_charset2uni_noholes_monotonic (const char* name, Encoding* enc)
407: {
408: int row, col, lastrow, r, col_max, i, i1_min, i1_max;
409:
410: /* Choose stepsize so that stepsize*steps_per_row >= enc->cols, and
411: enc->charset2uni[row][col] - enc->charset2uni[row][col/stepsize*stepsize]
412: is always < 0x100. */
413: int steps_per_row = 2;
414: int stepsize = (enc->cols + steps_per_row-1) / steps_per_row;
415:
416: find_charset2uni_pages(enc);
417:
418: find_charset2uni_blocks(enc);
419:
420: for (row = 0; row < enc->rows; row++)
421: if (enc->charsetpage[row] > 0) {
422: if (row == 0 || enc->charsetpage[row-1] == 0) {
423: /* Start a new block. */
424: for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
425: printf("static const unsigned short %s_2uni_main_page%02x[%d] = {\n ",
426: name, enc->row_byte(row),
427: steps_per_row*(lastrow-row+1));
428: for (r = row; r <= lastrow; r++) {
429: for (i = 0; i < steps_per_row; i++)
430: printf(" 0x%04x,", enc->charset2uni[r][i*stepsize]);
431: if (((r-row) % 4) == 3 && (r < lastrow)) printf("\n ");
432: }
433: printf("\n");
434: printf("};\n");
435: printf("static const unsigned char %s_2uni_page%02x[%d] = {\n",
436: name, enc->row_byte(row),
437: (lastrow-row) * enc->cols + enc->charsetpage[lastrow]);
438: }
439: printf(" /""* 0x%02x *""/\n ", enc->row_byte(row));
440: col_max = (enc->charsetpage[row+1] > 0 ? enc->cols : enc->charsetpage[row]);
441: for (col = 0; col < col_max; col++) {
442: printf(" 0x%02x,", enc->charset2uni[row][col] - enc->charset2uni[row][col/stepsize*stepsize]);
443: if ((col % 8) == 7 && (col+1 < col_max)) printf("\n ");
444: }
445: printf("\n");
446: if (enc->charsetpage[row+1] == 0) {
447: /* End a block. */
448: printf("};\n");
449: }
450: }
451: printf("\n");
452:
453: printf("static int\n");
454: printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name);
455: printf("{\n");
456: printf(" unsigned char c1 = s[0];\n");
457: printf(" if (");
458: for (i = 0; i < enc->ncharsetblocks; i++) {
459: i1_min = enc->row_byte(enc->charsetblocks[i].start / enc->cols);
460: i1_max = enc->row_byte((enc->charsetblocks[i].end-1) / enc->cols);
461: if (i > 0)
462: printf(" || ");
463: if (i1_min == i1_max)
464: printf("(c1 == 0x%02x)", i1_min);
465: else
466: printf("(c1 >= 0x%02x && c1 <= 0x%02x)", i1_min, i1_max);
467: }
468: printf(") {\n");
469: printf(" if (n >= 2) {\n");
470: printf(" unsigned char c2 = s[1];\n");
471: printf(" if (");
472: printf(enc->check_col_expr, "c2");
473: printf(") {\n");
474: printf(" unsigned int row = ");
475: printf(enc->byte_row_expr, "c1");
476: printf(";\n");
477: printf(" unsigned int col = ");
478: printf(enc->byte_col_expr, "c2");
479: printf(";\n");
480: printf(" unsigned int i = %d * row + col;\n", enc->cols);
481: printf(" unsigned short wc = 0xfffd;\n");
482: for (i = 0; i < enc->ncharsetblocks; i++) {
483: printf(" ");
484: if (i > 0)
485: printf("} else ");
486: if (i < enc->ncharsetblocks-1)
487: printf("if (i < %d) ", enc->charsetblocks[i+1].start);
488: printf("{\n");
489: printf(" if (i < %d)\n", enc->charsetblocks[i].end);
490: printf(" wc = %s_2uni_main_page%02x[%d*", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols), steps_per_row);
491: if (enc->charsetblocks[i].start > 0)
492: printf("(row-%d)", enc->charsetblocks[i].start / enc->cols);
493: else
494: printf("row");
495: printf("+");
496: if (steps_per_row == 2)
497: printf("(col>=%d?1:0)", stepsize);
498: else
499: printf("col/%d", stepsize);
500: printf("] + %s_2uni_page%02x[i", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols));
501: if (enc->charsetblocks[i].start > 0)
502: printf("-%d", enc->charsetblocks[i].start);
503: printf("];\n");
504: }
505: printf(" }\n");
506: printf(" if (wc != 0xfffd) {\n");
507: printf(" *pwc = (ucs4_t) wc;\n");
508: printf(" return 2;\n");
509: printf(" }\n");
510: printf(" }\n");
511: printf(" return RET_ILSEQ;\n");
512: printf(" }\n");
513: printf(" return RET_TOOFEW(0);\n");
514: printf(" }\n");
515: printf(" return RET_ILSEQ;\n");
516: printf("}\n");
517: printf("\n");
518: }
519:
520: /*
521: * Computes the uni2charset[0x0000..0x2ffff] array.
522: */
523: static void invert (Encoding* enc)
524: {
525: int row, col, j;
526:
527: enc->uni2charset = (int*) malloc(0x30000*sizeof(int));
528:
529: for (j = 0; j < 0x30000; j++)
530: enc->uni2charset[j] = 0;
531:
532: for (row = 0; row < enc->rows; row++)
533: for (col = 0; col < enc->cols; col++) {
534: j = enc->charset2uni[row][col];
535: if (j != 0xfffd)
536: enc->uni2charset[j] = 0x100 * enc->row_byte(row) + enc->col_byte(col);
537: }
538: }
539:
540: /*
541: * Outputs the unicode to charset table and function, using a linear array.
542: * (Suitable if the table is dense.)
543: */
544: static void output_uni2charset_dense (const char* name, Encoding* enc)
545: {
546: /* Like in 8bit_tab_to_h.c */
547: bool pages[0x300];
548: int line[0x6000];
549: int tableno;
550: struct { int minline; int maxline; int usecount; } tables[0x6000];
551: bool first;
552: int row, col, j, p, j1, j2, t;
553:
554: for (p = 0; p < 0x300; p++)
555: pages[p] = false;
556: for (row = 0; row < enc->rows; row++)
557: for (col = 0; col < enc->cols; col++) {
558: j = enc->charset2uni[row][col];
559: if (j != 0xfffd)
560: pages[j>>8] = true;
561: }
562: for (j1 = 0; j1 < 0x6000; j1++) {
563: bool all_invalid = true;
564: for (j2 = 0; j2 < 8; j2++) {
565: j = 8*j1+j2;
566: if (enc->uni2charset[j] != 0)
567: all_invalid = false;
568: }
569: if (all_invalid)
570: line[j1] = -1;
571: else
572: line[j1] = 0;
573: }
574: tableno = 0;
575: for (j1 = 0; j1 < 0x6000; j1++) {
576: if (line[j1] >= 0) {
577: if (tableno > 0
578: && ((j1 > 0 && line[j1-1] == tableno-1)
579: || ((tables[tableno-1].maxline >> 5) == (j1 >> 5)
580: && j1 - tables[tableno-1].maxline <= 8))) {
581: line[j1] = tableno-1;
582: tables[tableno-1].maxline = j1;
583: } else {
584: tableno++;
585: line[j1] = tableno-1;
586: tables[tableno-1].minline = tables[tableno-1].maxline = j1;
587: }
588: }
589: }
590: for (t = 0; t < tableno; t++) {
591: tables[t].usecount = 0;
592: j1 = 8*tables[t].minline;
593: j2 = 8*(tables[t].maxline+1);
594: for (j = j1; j < j2; j++)
595: if (enc->uni2charset[j] != 0)
596: tables[t].usecount++;
597: }
598: {
599: p = -1;
600: for (t = 0; t < tableno; t++)
601: if (tables[t].usecount > 1) {
602: p = tables[t].minline >> 5;
603: printf("static const unsigned short %s_page%02x[%d] = {\n", name, p, 8*(tables[t].maxline-tables[t].minline+1));
604: for (j1 = tables[t].minline; j1 <= tables[t].maxline; j1++) {
605: if ((j1 % 0x20) == 0 && j1 > tables[t].minline)
606: printf(" /* 0x%04x */\n", 8*j1);
607: printf(" ");
608: for (j2 = 0; j2 < 8; j2++) {
609: j = 8*j1+j2;
610: printf(" 0x%04x,", enc->uni2charset[j]);
611: }
612: printf(" /*0x%02x-0x%02x*/\n", 8*(j1 % 0x20), 8*(j1 % 0x20)+7);
613: }
614: printf("};\n");
615: }
616: if (p >= 0)
617: printf("\n");
618: }
619: printf("static int\n%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
620: printf("{\n");
621: printf(" if (n >= 2) {\n");
622: printf(" unsigned short c = 0;\n");
623: first = true;
624: for (j1 = 0; j1 < 0x6000;) {
625: t = line[j1];
626: for (j2 = j1; j2 < 0x6000 && line[j2] == t; j2++);
627: if (t >= 0) {
628: if (j1 != tables[t].minline) abort();
629: if (j2 > tables[t].maxline+1) abort();
630: j2 = tables[t].maxline+1;
631: if (first)
632: printf(" ");
633: else
634: printf(" else ");
635: first = false;
636: if (tables[t].usecount == 0) abort();
637: if (tables[t].usecount == 1) {
638: if (j2 != j1+1) abort();
639: for (j = 8*j1; j < 8*j2; j++)
640: if (enc->uni2charset[j] != 0) {
641: printf("if (wc == 0x%04x)\n c = 0x%02x;\n", j, enc->uni2charset[j]);
642: break;
643: }
644: } else {
645: if (j1 == 0) {
646: printf("if (wc < 0x%04x)", 8*j2);
647: } else {
648: printf("if (wc >= 0x%04x && wc < 0x%04x)", 8*j1, 8*j2);
649: }
650: printf("\n c = %s_page%02x[wc", name, j1 >> 5);
651: if (tables[t].minline > 0)
652: printf("-0x%04x", 8*j1);
653: printf("];\n");
654: }
655: }
656: j1 = j2;
657: }
658: printf(" if (c != 0) {\n");
659: printf(" r[0] = (c >> 8); r[1] = (c & 0xff);\n");
660: printf(" return 2;\n");
661: printf(" }\n");
662: printf(" return RET_ILUNI;\n");
663: printf(" }\n");
664: printf(" return RET_TOOSMALL;\n");
665: printf("}\n");
666: }
667:
668: /*
669: * Outputs the unicode to charset table and function, using a packed array.
670: * (Suitable if the table is sparse.)
671: * The argument 'monotonic' may be set to true if the mapping is monotonically
672: * increasing with small gaps only.
673: */
674: static void output_uni2charset_sparse (const char* name, Encoding* enc, bool monotonic)
675: {
676: bool pages[0x300];
677: Block pageblocks[0x300]; int npageblocks;
678: int indx2charset[0x30000];
679: int summary_indx[0x3000];
680: int summary_used[0x3000];
681: int i, row, col, j, p, j1, j2, indx;
682: bool is_large;
683: /* for monotonic: */
684: int log2_stepsize = (!strcmp(name,"uhc_2") ? 6 : 7);
685: int stepsize = 1 << log2_stepsize;
686: int indxsteps;
687:
688: /* Fill pages[0x300]. */
689: for (p = 0; p < 0x300; p++)
690: pages[p] = false;
691: for (row = 0; row < enc->rows; row++)
692: for (col = 0; col < enc->cols; col++) {
693: j = enc->charset2uni[row][col];
694: if (j != 0xfffd)
695: pages[j>>8] = true;
696: }
697:
698: /* Determine whether two or three bytes are needed for each character. */
699: is_large = false;
700: for (j = 0; j < 0x30000; j++)
701: if (enc->uni2charset[j] >= 0x10000)
702: is_large = true;
703:
704: #if 0
705: for (p = 0; p < 0x300; p++)
706: if (pages[p]) {
707: printf("static const unsigned short %s_page%02x[256] = {\n", name, p);
708: for (j1 = 0; j1 < 32; j1++) {
709: printf(" ");
710: for (j2 = 0; j2 < 8; j2++)
711: printf("0x%04x, ", enc->uni2charset[256*p+8*j1+j2]);
712: printf("/""*0x%02x-0x%02x*""/\n", 8*j1, 8*j1+7);
713: }
714: printf("};\n");
715: }
716: printf("\n");
717: #endif
718:
719: /* Fill summary_indx[] and summary_used[]. */
720: indx = 0;
721: for (j1 = 0; j1 < 0x3000; j1++) {
722: summary_indx[j1] = indx;
723: summary_used[j1] = 0;
724: for (j2 = 0; j2 < 16; j2++) {
725: j = 16*j1+j2;
726: if (enc->uni2charset[j] != 0) {
727: indx2charset[indx++] = enc->uni2charset[j];
728: summary_used[j1] |= (1 << j2);
729: }
730: }
731: }
732:
733: /* Fill npageblocks and pageblocks[]. */
734: npageblocks = 0;
735: for (p = 0; p < 0x300; ) {
736: if (pages[p] && (p == 0 || !pages[p-1])) {
737: pageblocks[npageblocks].start = 16*p;
738: do p++; while (p < 0x300 && pages[p]);
739: j1 = 16*p;
740: while (summary_used[j1-1] == 0) j1--;
741: pageblocks[npageblocks].end = j1;
742: npageblocks++;
743: } else
744: p++;
745: }
746:
747: if (monotonic) {
748: indxsteps = (indx + stepsize-1) / stepsize;
749: printf("static const unsigned short %s_2charset_main[%d] = {\n", name, indxsteps);
750: for (i = 0; i < indxsteps; ) {
751: if ((i % 8) == 0) printf(" ");
752: printf(" 0x%04x,", indx2charset[i*stepsize]);
753: i++;
754: if ((i % 8) == 0 || i == indxsteps) printf("\n");
755: }
756: printf("};\n");
757: printf("static const unsigned char %s_2charset[%d] = {\n", name, indx);
758: for (i = 0; i < indx; ) {
759: if ((i % 8) == 0) printf(" ");
760: printf(" 0x%02x,", indx2charset[i] - indx2charset[i/stepsize*stepsize]);
761: i++;
762: if ((i % 8) == 0 || i == indx) printf("\n");
763: }
764: printf("};\n");
765: } else {
766: if (is_large) {
767: printf("static const unsigned char %s_2charset[3*%d] = {\n", name, indx);
768: for (i = 0; i < indx; ) {
769: if ((i % 4) == 0) printf(" ");
770: printf(" 0x%1x,0x%02x,0x%02x,", indx2charset[i] >> 16,
771: (indx2charset[i] >> 8) & 0xff, indx2charset[i] & 0xff);
772: i++;
773: if ((i % 4) == 0 || i == indx) printf("\n");
774: }
775: printf("};\n");
776: } else {
777: printf("static const unsigned short %s_2charset[%d] = {\n", name, indx);
778: for (i = 0; i < indx; ) {
779: if ((i % 8) == 0) printf(" ");
780: printf(" 0x%04x,", indx2charset[i]);
781: i++;
782: if ((i % 8) == 0 || i == indx) printf("\n");
783: }
784: printf("};\n");
785: }
786: }
787: printf("\n");
788: for (i = 0; i < npageblocks; i++) {
789: printf("static const Summary16 %s_uni2indx_page%02x[%d] = {\n", name,
790: pageblocks[i].start/16, pageblocks[i].end-pageblocks[i].start);
791: for (j1 = pageblocks[i].start; j1 < pageblocks[i].end; ) {
792: if (((16*j1) % 0x100) == 0) printf(" /""* 0x%04x *""/\n", 16*j1);
793: if ((j1 % 4) == 0) printf(" ");
794: printf(" { %4d, 0x%04x },", summary_indx[j1], summary_used[j1]);
795: j1++;
796: if ((j1 % 4) == 0 || j1 == pageblocks[i].end) printf("\n");
797: }
798: printf("};\n");
799: }
800: printf("\n");
801:
802: printf("static int\n");
803: printf("%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
804: printf("{\n");
805: printf(" if (n >= 2) {\n");
806: printf(" const Summary16 *summary = NULL;\n");
807: for (i = 0; i < npageblocks; i++) {
808: printf(" ");
809: if (i > 0)
810: printf("else ");
811: printf("if (wc >= 0x%04x && wc < 0x%04x)\n",
812: 16*pageblocks[i].start, 16*pageblocks[i].end);
813: printf(" summary = &%s_uni2indx_page%02x[(wc>>4)", name,
814: pageblocks[i].start/16);
815: if (pageblocks[i].start > 0)
816: printf("-0x%03x", pageblocks[i].start);
817: printf("];\n");
818: }
819: printf(" if (summary) {\n");
820: printf(" unsigned short used = summary->used;\n");
821: printf(" unsigned int i = wc & 0x0f;\n");
822: printf(" if (used & ((unsigned short) 1 << i)) {\n");
823: if (monotonic || !is_large)
824: printf(" unsigned short c;\n");
825: printf(" /* Keep in `used' only the bits 0..i-1. */\n");
826: printf(" used &= ((unsigned short) 1 << i) - 1;\n");
827: printf(" /* Add `summary->indx' and the number of bits set in `used'. */\n");
828: printf(" used = (used & 0x5555) + ((used & 0xaaaa) >> 1);\n");
829: printf(" used = (used & 0x3333) + ((used & 0xcccc) >> 2);\n");
830: printf(" used = (used & 0x0f0f) + ((used & 0xf0f0) >> 4);\n");
831: printf(" used = (used & 0x00ff) + (used >> 8);\n");
832: if (monotonic) {
833: printf(" used += summary->indx;\n");
834: printf(" c = %s_2charset_main[used>>%d] + %s_2charset[used];\n", name, log2_stepsize, name);
835: printf(" r[0] = (c >> 8); r[1] = (c & 0xff);\n");
836: printf(" return 2;\n");
837: } else {
838: if (is_large) {
839: printf(" used += summary->indx;\n");
840: printf(" r[0] = %s_2charset[3*used];\n", name);
841: printf(" r[1] = %s_2charset[3*used+1];\n", name);
842: printf(" r[2] = %s_2charset[3*used+2];\n", name);
843: printf(" return 3;\n");
844: } else {
845: printf(" c = %s_2charset[summary->indx + used];\n", name);
846: printf(" r[0] = (c >> 8); r[1] = (c & 0xff);\n");
847: printf(" return 2;\n");
848: }
849: }
850: printf(" }\n");
851: printf(" }\n");
852: printf(" return RET_ILUNI;\n");
853: printf(" }\n");
854: printf(" return RET_TOOSMALL;\n");
855: printf("}\n");
856: }
857:
/* ISO-2022/EUC specifics */

/* Rows and columns 0, 1, 2, ... correspond to the byte values 0x21, 0x22,
   0x23, ... The reverse conversions perform no range check. */

static int row_byte_normal (int row)
{
  return row + 0x21;
}

static int col_byte_normal (int col)
{
  return col + 0x21;
}

static int byte_row_normal (int byte)
{
  return byte - 0x21;
}

static int byte_col_normal (int byte)
{
  return byte - 0x21;
}
864:
865: static void do_normal (const char* name)
866: {
867: Encoding enc;
868:
869: enc.rows = 94;
870: enc.cols = 94;
871: enc.row_byte = row_byte_normal;
872: enc.col_byte = col_byte_normal;
873: enc.byte_row = byte_row_normal;
874: enc.byte_col = byte_col_normal;
875: enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
876: enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
877: enc.byte_row_expr = "%1$s - 0x21";
878: enc.byte_col_expr = "%1$s - 0x21";
879:
880: read_table(&enc);
881: output_charset2uni(name,&enc);
882: invert(&enc); output_uni2charset_sparse(name,&enc,false);
883: }
884:
885: /* Note: On first sight, the jisx0212_2charset[] table seems to be in order,
886: starting from the charset=0x3021/uni=0x4e02 pair. But it's only mostly in
887: order. There are 75 out-of-order values, scattered all throughout the table.
888: */
889:
890: static void do_normal_only_charset2uni (const char* name)
891: {
892: Encoding enc;
893:
894: enc.rows = 94;
895: enc.cols = 94;
896: enc.row_byte = row_byte_normal;
897: enc.col_byte = col_byte_normal;
898: enc.byte_row = byte_row_normal;
899: enc.byte_col = byte_col_normal;
900: enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
901: enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
902: enc.byte_row_expr = "%1$s - 0x21";
903: enc.byte_col_expr = "%1$s - 0x21";
904:
905: read_table(&enc);
906: output_charset2uni(name,&enc);
907: }
908:
/* CNS 11643 specifics - trick to put two tables into one */

/* A row index counts 94 rows per group; the corresponding "byte" value
   carries the group number in the bits above the low 8 and the usual
   0x21-based row byte in the low 8 bits. */

static int row_byte_cns11643 (int row)
{
  int group = row / 94;
  int within = row % 94;

  return 0x100 * group + within + 0x21;
}

static int byte_row_cns11643 (int byte)
{
  int group = byte >> 8;
  int low = byte & 0xff;

  return group * 94 + low - 0x21;
}
917:
918: static void do_cns11643_only_uni2charset (const char* name)
919: {
920: Encoding enc;
921:
922: enc.rows = 16*94;
923: enc.cols = 94;
924: enc.row_byte = row_byte_cns11643;
925: enc.col_byte = col_byte_normal;
926: enc.byte_row = byte_row_cns11643;
927: enc.byte_col = byte_col_normal;
928: enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
929: enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
930: enc.byte_row_expr = "%1$s - 0x21";
931: enc.byte_col_expr = "%1$s - 0x21";
932:
933: read_table(&enc);
934: invert(&enc);
935: output_uni2charset_sparse(name,&enc,false);
936: }
937:
/* GBK specifics */

/* 1st byte: 0x81..0xfe <-> row 0..125.
   2nd byte: 0x40..0x7e <-> col 0..62 and 0x80..0xfe <-> col 63..189;
   the byte 0x7f is skipped. */

static int row_byte_gbk1 (int row)
{
  return row + 0x81;
}

static int col_byte_gbk1 (int col)
{
  if (col >= 0x3f)
    return col + 0x41;
  return col + 0x40;
}

static int byte_row_gbk1 (int byte)
{
  if (byte < 0x81 || byte >= 0xff)
    return -1;
  return byte - 0x81;
}

static int byte_col_gbk1 (int byte)
{
  if (byte >= 0x40 && byte < 0x7f)
    return byte - 0x40;
  if (byte >= 0x80 && byte < 0xff)
    return byte - 0x41;
  return -1;
}
960:
961: static void do_gbk1 (const char* name)
962: {
963: Encoding enc;
964:
965: enc.rows = 126;
966: enc.cols = 190;
967: enc.row_byte = row_byte_gbk1;
968: enc.col_byte = col_byte_gbk1;
969: enc.byte_row = byte_row_gbk1;
970: enc.byte_col = byte_col_gbk1;
971: enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
972: enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
973: enc.byte_row_expr = "%1$s - 0x81";
974: enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
975:
976: read_table(&enc);
977: output_charset2uni(name,&enc);
978: invert(&enc); output_uni2charset_dense(name,&enc);
979: }
980:
981: static void do_gbk1_only_charset2uni (const char* name)
982: {
983: Encoding enc;
984:
985: enc.rows = 126;
986: enc.cols = 190;
987: enc.row_byte = row_byte_gbk1;
988: enc.col_byte = col_byte_gbk1;
989: enc.byte_row = byte_row_gbk1;
990: enc.byte_col = byte_col_gbk1;
991: enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
992: enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
993: enc.byte_row_expr = "%1$s - 0x81";
994: enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
995:
996: read_table(&enc);
997: output_charset2uni(name,&enc);
998: }
999:
/* Row index -> lead byte; row 0 corresponds to byte 0x81.  */
static int row_byte_gbk2 (int row)
{
  const int first_lead_byte = 0x81;

  return first_lead_byte + row;
}
/* Column index -> trail byte.  Columns below 0x3f start at 0x40; the
   remaining columns start at 0x80, so byte 0x7f is never produced.  */
static int col_byte_gbk2 (int col)
{
  int base;

  if (col >= 0x3f)
    base = 0x41;
  else
    base = 0x40;
  return base + col;
}
/* Lead byte -> row index; returns -1 for bytes outside 0x81..0xfe.  */
static int byte_row_gbk2 (int byte)
{
  return (byte >= 0x81 && byte < 0xff) ? byte - 0x81 : -1;
}
/* Trail byte -> column index.  Accepts 0x40..0x7e and 0x80..0xa0;
   every other byte yields -1.  */
static int byte_col_gbk2 (int byte)
{
  return (byte >= 0x40 && byte < 0x7f) ? byte - 0x40
       : (byte >= 0x80 && byte < 0xa1) ? byte - 0x41
       : -1;
}
1020:
1021: static void do_gbk2_only_charset2uni (const char* name)
1022: {
1023: Encoding enc;
1024:
1025: enc.rows = 126;
1026: enc.cols = 96;
1027: enc.row_byte = row_byte_gbk2;
1028: enc.col_byte = col_byte_gbk2;
1029: enc.byte_row = byte_row_gbk2;
1030: enc.byte_col = byte_col_gbk2;
1031: enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
1032: enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xa1)";
1033: enc.byte_row_expr = "%1$s - 0x81";
1034: enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
1035:
1036: read_table(&enc);
1037: output_charset2uni(name,&enc);
1038: }
1039:
1040: static void do_gbk1_only_uni2charset (const char* name)
1041: {
1042: Encoding enc;
1043:
1044: enc.rows = 126;
1045: enc.cols = 190;
1046: enc.row_byte = row_byte_gbk1;
1047: enc.col_byte = col_byte_gbk1;
1048: enc.byte_row = byte_row_gbk1;
1049: enc.byte_col = byte_col_gbk1;
1050: enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
1051: enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
1052: enc.byte_row_expr = "%1$s - 0x81";
1053: enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
1054:
1055: read_table(&enc);
1056: invert(&enc); output_uni2charset_sparse(name,&enc,false);
1057: }
1058:
1059: /* KSC 5601 specifics */
1060:
1061: /*
1062: * Reads the charset2uni table from standard input.
1063: */
1064: static void read_table_ksc5601 (Encoding* enc)
1065: {
1066: int row, col, i, i1, i2, c, j;
1067:
1068: enc->charset2uni = (int**) malloc(enc->rows*sizeof(int*));
1069: for (row = 0; row < enc->rows; row++)
1070: enc->charset2uni[row] = (int*) malloc(enc->cols*sizeof(int));
1071:
1072: for (row = 0; row < enc->rows; row++)
1073: for (col = 0; col < enc->cols; col++)
1074: enc->charset2uni[row][col] = 0xfffd;
1075:
1076: c = getc(stdin);
1077: ungetc(c,stdin);
1078: if (c == '#') {
1079: /* Read a unicode.org style .TXT file. */
1080: for (;;) {
1081: c = getc(stdin);
1082: if (c == EOF)
1083: break;
1084: if (c == '\n' || c == ' ' || c == '\t')
1085: continue;
1086: if (c == '#') {
1087: do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
1088: continue;
1089: }
1090: ungetc(c,stdin);
1091: if (scanf("0x%x", &j) != 1)
1092: exit(1);
1093: i1 = j >> 8;
1094: i2 = j & 0xff;
1095: if (scanf(" 0x%x", &j) != 1)
1096: exit(1);
1097: /* Take only the range covered by KS C 5601.1987-0 = KS C 5601.1989-0
1098: = KS X 1001.1992, ignore the rest. */
1099: if (!(i1 >= 128+33 && i1 < 128+127 && i2 >= 128+33 && i2 < 128+127))
1100: continue; /* KSC5601 specific */
1101: i1 &= 0x7f; /* KSC5601 specific */
1102: i2 &= 0x7f; /* KSC5601 specific */
1103: row = enc->byte_row(i1);
1104: col = enc->byte_col(i2);
1105: if (row < 0 || col < 0) {
1106: fprintf(stderr, "lost entry for %02x %02x\n", i1, i2);
1107: exit(1);
1108: }
1109: enc->charset2uni[row][col] = j;
1110: }
1111: } else {
1112: /* Read a table of hexadecimal Unicode values. */
1113: for (i1 = 33; i1 < 127; i1++)
1114: for (i2 = 33; i2 < 127; i2++) {
1115: i = scanf("%x", &j);
1116: if (i == EOF)
1117: goto read_done;
1118: if (i != 1)
1119: exit(1);
1120: if (j < 0 || j == 0xffff)
1121: j = 0xfffd;
1122: if (j != 0xfffd) {
1123: if (enc->byte_row(i1) < 0 || enc->byte_col(i2) < 0) {
1124: fprintf(stderr, "lost entry at %02x %02x\n", i1, i2);
1125: exit (1);
1126: }
1127: enc->charset2uni[enc->byte_row(i1)][enc->byte_col(i2)] = j;
1128: }
1129: }
1130: read_done: ;
1131: }
1132: }
1133:
1134: static void do_ksc5601 (const char* name)
1135: {
1136: Encoding enc;
1137:
1138: enc.rows = 94;
1139: enc.cols = 94;
1140: enc.row_byte = row_byte_normal;
1141: enc.col_byte = col_byte_normal;
1142: enc.byte_row = byte_row_normal;
1143: enc.byte_col = byte_col_normal;
1144: enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
1145: enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
1146: enc.byte_row_expr = "%1$s - 0x21";
1147: enc.byte_col_expr = "%1$s - 0x21";
1148:
1149: read_table_ksc5601(&enc);
1150: output_charset2uni(name,&enc);
1151: invert(&enc); output_uni2charset_sparse(name,&enc,false);
1152: }
1153:
1154: /* UHC specifics */
1155:
1156: /* UHC part 1: 0x{81..A0}{41..5A,61..7A,81..FE} */
1157:
/* Row index -> lead byte; row 0 corresponds to byte 0x81.  */
static int row_byte_uhc_1 (int row)
{
  const int first_lead_byte = 0x81;

  return first_lead_byte + row;
}
/* Column index -> trail byte.  The columns fall into three runs that
   start at bytes 0x41, 0x61 and 0x81 respectively.  */
static int col_byte_uhc_1 (int col)
{
  int shift;

  if (col >= 0x34)
    shift = 0x4d;
  else if (col >= 0x1a)
    shift = 0x47;
  else
    shift = 0x41;
  return shift + col;
}
/* Lead byte -> row index; returns -1 for bytes outside 0x81..0xa0.  */
static int byte_row_uhc_1 (int byte)
{
  if (byte < 0x81 || byte >= 0xa1)
    return -1;
  return byte - 0x81;
}
/* Trail byte -> column index.  Accepts 0x41..0x5a, 0x61..0x7a and
   0x81..0xfe; every other byte yields -1.  */
static int byte_col_uhc_1 (int byte)
{
  return (byte >= 0x41 && byte < 0x5b) ? byte - 0x41
       : (byte >= 0x61 && byte < 0x7b) ? byte - 0x47
       : (byte >= 0x81 && byte < 0xff) ? byte - 0x4d
       : -1;
}
1180:
1181: static void do_uhc_1 (const char* name)
1182: {
1183: Encoding enc;
1184:
1185: enc.rows = 32;
1186: enc.cols = 178;
1187: enc.row_byte = row_byte_uhc_1;
1188: enc.col_byte = col_byte_uhc_1;
1189: enc.byte_row = byte_row_uhc_1;
1190: enc.byte_col = byte_col_uhc_1;
1191: enc.check_row_expr = "(%1$s >= 0x81 && %1$s < 0xa1)";
1192: enc.check_col_expr = "(%1$s >= 0x41 && %1$s < 0x5b) || (%1$s >= 0x61 && %1$s < 0x7b) || (%1$s >= 0x81 && %1$s < 0xff)";
1193: enc.byte_row_expr = "%1$s - 0x81";
1194: enc.byte_col_expr = "%1$s - (%1$s >= 0x81 ? 0x4d : %1$s >= 0x61 ? 0x47 : 0x41)";
1195:
1196: read_table(&enc);
1197: output_charset2uni_noholes_monotonic(name,&enc);
1198: invert(&enc); output_uni2charset_sparse(name,&enc,true);
1199: }
1200:
1201: /* UHC part 2: 0x{A1..C6}{41..5A,61..7A,81..A0} */
1202:
/* Row index -> lead byte; row 0 corresponds to byte 0xa1.  */
static int row_byte_uhc_2 (int row)
{
  const int first_lead_byte = 0xa1;

  return first_lead_byte + row;
}
/* Column index -> trail byte.  The columns fall into three runs that
   start at bytes 0x41, 0x61 and 0x81 respectively.  */
static int col_byte_uhc_2 (int col)
{
  if (col >= 0x34)
    return 0x4d + col;
  if (col >= 0x1a)
    return 0x47 + col;
  return 0x41 + col;
}
/* Lead byte -> row index; returns -1 for bytes outside 0xa1..0xfe.  */
static int byte_row_uhc_2 (int byte)
{
  return (byte >= 0xa1 && byte < 0xff) ? byte - 0xa1 : -1;
}
/* Trail byte -> column index.  Accepts 0x41..0x5a, 0x61..0x7a and
   0x81..0xa0; every other byte yields -1.  */
static int byte_col_uhc_2 (int byte)
{
  int base;

  if (byte >= 0x41 && byte < 0x5b)
    base = 0x41;
  else if (byte >= 0x61 && byte < 0x7b)
    base = 0x47;
  else if (byte >= 0x81 && byte < 0xa1)
    base = 0x4d;
  else
    return -1;
  return byte - base;
}
1225:
1226: static void do_uhc_2 (const char* name)
1227: {
1228: Encoding enc;
1229:
1230: enc.rows = 94;
1231: enc.cols = 84;
1232: enc.row_byte = row_byte_uhc_2;
1233: enc.col_byte = col_byte_uhc_2;
1234: enc.byte_row = byte_row_uhc_2;
1235: enc.byte_col = byte_col_uhc_2;
1236: enc.check_row_expr = "(%1$s >= 0xa1 && %1$s < 0xff)";
1237: enc.check_col_expr = "(%1$s >= 0x41 && %1$s < 0x5b) || (%1$s >= 0x61 && %1$s < 0x7b) || (%1$s >= 0x81 && %1$s < 0xa1)";
1238: enc.byte_row_expr = "%1$s - 0xa1";
1239: enc.byte_col_expr = "%1$s - (%1$s >= 0x81 ? 0x4d : %1$s >= 0x61 ? 0x47 : 0x41)";
1240:
1241: read_table(&enc);
1242: output_charset2uni_noholes_monotonic(name,&enc);
1243: invert(&enc); output_uni2charset_sparse(name,&enc,true);
1244: }
1245:
1246: /* Big5 specifics */
1247:
/* Row index -> lead byte; row 0 corresponds to byte 0xa1.  */
static int row_byte_big5 (int row)
{
  const int first_lead_byte = 0xa1;

  return first_lead_byte + row;
}
/* Column index -> trail byte.  Columns below 0x3f map to 0x40..0x7e;
   the remaining columns start at byte 0xa1.  */
static int col_byte_big5 (int col)
{
  if (col < 0x3f)
    return col + 0x40;
  return col + 0x62;
}
/* Lead byte -> row index; returns -1 for bytes outside 0xa1..0xfe.  */
static int byte_row_big5 (int byte)
{
  if (byte < 0xa1 || byte >= 0xff)
    return -1;
  return byte - 0xa1;
}
/* Trail byte -> column index.  Accepts 0x40..0x7e and 0xa1..0xfe;
   every other byte yields -1.  */
static int byte_col_big5 (int byte)
{
  return (byte >= 0x40 && byte < 0x7f) ? byte - 0x40
       : (byte >= 0xa1 && byte < 0xff) ? byte - 0x62
       : -1;
}
1268:
1269: static void do_big5 (const char* name)
1270: {
1271: Encoding enc;
1272:
1273: enc.rows = 94;
1274: enc.cols = 157;
1275: enc.row_byte = row_byte_big5;
1276: enc.col_byte = col_byte_big5;
1277: enc.byte_row = byte_row_big5;
1278: enc.byte_col = byte_col_big5;
1279: enc.check_row_expr = "%1$s >= 0xa1 && %1$s < 0xff";
1280: enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0xa1 && %1$s < 0xff)";
1281: enc.byte_row_expr = "%1$s - 0xa1";
1282: enc.byte_col_expr = "%1$s - (%1$s >= 0xa1 ? 0x62 : 0x40)";
1283:
1284: read_table(&enc);
1285: output_charset2uni(name,&enc);
1286: invert(&enc); output_uni2charset_sparse(name,&enc,false);
1287: }
1288:
1289: /* HKSCS specifics */
1290:
/* Row index -> lead byte; row 0 corresponds to byte 0x80.  */
static int row_byte_hkscs (int row)
{
  const int first_lead_byte = 0x80;

  return first_lead_byte + row;
}
/* Lead byte -> row index; returns -1 for bytes outside 0x80..0xfe.  */
static int byte_row_hkscs (int byte)
{
  if (byte < 0x80 || byte >= 0xff)
    return -1;
  return byte - 0x80;
}
1300:
1301: static void do_hkscs (const char* name)
1302: {
1303: Encoding enc;
1304:
1305: enc.rows = 128;
1306: enc.cols = 157;
1307: enc.row_byte = row_byte_hkscs;
1308: enc.col_byte = col_byte_big5;
1309: enc.byte_row = byte_row_hkscs;
1310: enc.byte_col = byte_col_big5;
1311: enc.check_row_expr = "%1$s >= 0x80 && %1$s < 0xff";
1312: enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0xa1 && %1$s < 0xff)";
1313: enc.byte_row_expr = "%1$s - 0x80";
1314: enc.byte_col_expr = "%1$s - (%1$s >= 0xa1 ? 0x62 : 0x40)";
1315:
1316: read_table(&enc);
1317: output_charset2uni(name,&enc);
1318: invert(&enc); output_uni2charset_sparse(name,&enc,false);
1319: }
1320:
1321: /* Johab Hangul specifics */
1322:
/* Row index -> lead byte; row 0 corresponds to byte 0x84.  */
static int row_byte_johab_hangul (int row)
{
  const int first_lead_byte = 0x84;

  return first_lead_byte + row;
}
/* Column index -> trail byte.  Columns below 0x3e map to 0x41..0x7e;
   the remaining columns start at byte 0x81.  */
static int col_byte_johab_hangul (int col)
{
  if (col < 0x3e)
    return col + 0x41;
  return col + 0x43;
}
/* Lead byte -> row index; returns -1 for bytes outside 0x84..0xd3.  */
static int byte_row_johab_hangul (int byte)
{
  return (byte >= 0x84 && byte < 0xd4) ? byte - 0x84 : -1;
}
/* Trail byte -> column index.  Accepts 0x41..0x7e and 0x81..0xfe;
   every other byte yields -1.  */
static int byte_col_johab_hangul (int byte)
{
  int base;

  if (byte >= 0x41 && byte < 0x7f)
    base = 0x41;
  else if (byte >= 0x81 && byte < 0xff)
    base = 0x43;
  else
    return -1;
  return byte - base;
}
1343:
1344: static void do_johab_hangul (const char* name)
1345: {
1346: Encoding enc;
1347:
1348: enc.rows = 80;
1349: enc.cols = 188;
1350: enc.row_byte = row_byte_johab_hangul;
1351: enc.col_byte = col_byte_johab_hangul;
1352: enc.byte_row = byte_row_johab_hangul;
1353: enc.byte_col = byte_col_johab_hangul;
1354: enc.check_row_expr = "%1$s >= 0x84 && %1$s < 0xd4";
1355: enc.check_col_expr = "(%1$s >= 0x41 && %1$s < 0x7f) || (%1$s >= 0x81 && %1$s < 0xff)";
1356: enc.byte_row_expr = "%1$s - 0x84";
1357: enc.byte_col_expr = "%1$s - (%1$s >= 0x81 ? 0x43 : 0x41)";
1358:
1359: read_table(&enc);
1360: output_charset2uni(name,&enc);
1361: invert(&enc); output_uni2charset_dense(name,&enc);
1362: }
1363:
1364: /* SJIS specifics */
1365:
/* Row index -> lead byte.  Rows below 0x1f map to 0x81..0x9f; the
   remaining rows start at byte 0xe0.  */
static int row_byte_sjis (int row)
{
  if (row < 0x1f)
    return row + 0x81;
  return row + 0xc1;
}
/* Column index -> trail byte.  Columns below 0x3f map to 0x40..0x7e;
   later columns are shifted by one more so that byte 0x7f is skipped.  */
static int col_byte_sjis (int col)
{
  if (col < 0x3f)
    return col + 0x40;
  return col + 0x41;
}
/* Lead byte -> row index.  Bytes 0x81..0x9f give rows from 0, bytes
   from 0xe0 upward (no upper bound is checked) continue from row 0x1f;
   anything else yields -1.  */
static int byte_row_sjis (int byte)
{
  if (byte >= 0xe0)
    return byte - 0xc1;
  if (byte >= 0x81 && byte < 0xa0)
    return byte - 0x81;
  return -1;
}
/* Trail byte -> column index.  Accepts 0x40..0x7e and 0x80..0xfc;
   every other byte yields -1.  */
static int byte_col_sjis (int byte)
{
  if (byte < 0x40 || byte == 0x7f || byte >= 0xfd)
    return -1;
  return byte - (byte < 0x7f ? 0x40 : 0x41);
}
1388:
1389: static void do_sjis (const char* name)
1390: {
1391: Encoding enc;
1392:
1393: enc.rows = 94;
1394: enc.cols = 188;
1395: enc.row_byte = row_byte_sjis;
1396: enc.col_byte = col_byte_sjis;
1397: enc.byte_row = byte_row_sjis;
1398: enc.byte_col = byte_col_sjis;
1399: enc.check_row_expr = "(%1$s >= 0x81 && %1$s < 0xa0) || (%1$s >= 0xe0)";
1400: enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xfd)";
1401: enc.byte_row_expr = "%1$s - (%1$s >= 0xe0 ? 0xc1 : 0x81)";
1402: enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
1403:
1404: read_table(&enc);
1405: output_charset2uni(name,&enc);
1406: invert(&enc); output_uni2charset_sparse(name,&enc,false);
1407: }
1408:
1409: /* GB18030 Unicode specifics */
1410:
/*
 * Generates the conversion tables and functions for four-byte GB18030
 * codes.
 *
 * Reads a unicode.org style .TXT file ("0xBBBBBBBB 0xUUUU" pairs, '#'
 * comments) from standard input.  Every code must have bytes in
 * 0x81..0x84, 0x30..0x39, 0x81..0xfe, 0x30..0x39 and every Unicode value
 * must be below 0x10000.  The four bytes are folded into a linear index
 * i, and the mapping i -> unicode is compressed into at most 256 ranges
 * with a constant difference each.
 *
 * Writes to standard output, all prefixed with NAME:
 *   NAME_charset2uni_ranges, NAME_uni2charset_ranges, NAME_ranges,
 *   NAME_bitmap, and the functions NAME_mbtowc and NAME_wctomb.
 *
 * Exits with status 1 on malformed input, when the mapping does not have
 * the expected monotonic shape, or when the input contains no entries.
 */
static void do_gb18030uni (const char* name)
{
  int c;
  unsigned int bytes;
  unsigned int uc;
  int i1, i2, i3, i4, i, j, k;
  int charset2uni[4*10*126*10];
  struct { int low; int high; int diff; int total; } ranges[256];
  int ranges_count, ranges_total;

  /* 0 marks an index that has no mapping. */
  for (i = 0; i < 4*10*126*10; i++)
    charset2uni[i] = 0;

  /* Read a unicode.org style .TXT file. */
  for (;;) {
    c = getc(stdin);
    if (c == EOF)
      break;
    if (c == '\n' || c == ' ' || c == '\t')
      continue;
    if (c == '#') {
      do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
      continue;
    }
    ungetc(c,stdin);
    if (scanf("0x%x", &bytes) != 1)
      exit(1);
    i1 = (bytes >> 24) & 0xff;
    i2 = (bytes >> 16) & 0xff;
    i3 = (bytes >> 8) & 0xff;
    i4 = bytes & 0xff;
    if (!(i1 >= 0x81 && i1 <= 0x84
          && i2 >= 0x30 && i2 <= 0x39
          && i3 >= 0x81 && i3 <= 0xfe
          && i4 >= 0x30 && i4 <= 0x39)) {
      fprintf(stderr, "lost entry for %02x %02x %02x %02x\n", i1, i2, i3, i4);
      exit(1);
    }
    i = (((i1-0x81) * 10 + (i2-0x30)) * 126 + (i3-0x81)) * 10 + (i4-0x30);
    /* %x requires an unsigned int target; range-check before narrowing. */
    if (scanf(" 0x%x", &uc) != 1)
      exit(1);
    if (!(uc < 0x10000))
      exit(1);
    j = (int) uc;
    charset2uni[i] = j;
  }

  /* Verify that the mapping i -> j is monotonically increasing and
     of the form
        low[k] <= i <= high[k]  =>  j = diff[k] + i
     with a set of disjoint intervals (low[k], high[k]). */
  ranges_count = 0;
  for (i = 0; i < 4*10*126*10; i++)
    if (charset2uni[i] != 0) {
      int diff;
      j = charset2uni[i];
      diff = j - i;
      if (ranges_count > 0) {
        if (!(i > ranges[ranges_count-1].high))
          exit(1);
        if (!(j > ranges[ranges_count-1].high + ranges[ranges_count-1].diff))
          exit(1);
        /* Additional property: The diffs are also increasing. */
        if (!(diff >= ranges[ranges_count-1].diff))
          exit(1);
      }
      if (ranges_count > 0 && diff == ranges[ranges_count-1].diff)
        ranges[ranges_count-1].high = i;
      else {
        if (ranges_count == 256)
          exit(1);
        ranges[ranges_count].low = i;
        ranges[ranges_count].high = i;
        ranges[ranges_count].diff = diff;
        ranges_count++;
      }
    }

  /* Everything below indexes ranges[0] and ranges[ranges_count-1]; an
     empty table would read outside the array and emit empty arrays. */
  if (ranges_count == 0) {
    fprintf(stderr, "no table entries read\n");
    exit(1);
  }

  /* Determine size of bitmap. */
  ranges_total = 0;
  for (k = 0; k < ranges_count; k++) {
    ranges[k].total = ranges_total;
    ranges_total += ranges[k].high - ranges[k].low + 1;
  }

  printf("static const unsigned short %s_charset2uni_ranges[%d] = {\n", name, 2*ranges_count);
  for (k = 0; k < ranges_count; k++) {
    printf(" 0x%04x, 0x%04x", ranges[k].low, ranges[k].high);
    if (k+1 < ranges_count) printf(",");
    if ((k % 4) == 3 && k+1 < ranges_count) printf("\n");
  }
  printf("\n");
  printf("};\n");

  printf("\n");

  printf("static const unsigned short %s_uni2charset_ranges[%d] = {\n", name, 2*ranges_count);
  for (k = 0; k < ranges_count; k++) {
    printf(" 0x%04x, 0x%04x", ranges[k].low + ranges[k].diff, ranges[k].high + ranges[k].diff);
    if (k+1 < ranges_count) printf(",");
    if ((k % 4) == 3 && k+1 < ranges_count) printf("\n");
  }
  printf("\n");
  printf("};\n");

  printf("\n");

  printf("static const struct { unsigned short diff; unsigned short bitmap_offset; } %s_ranges[%d] = {\n ", name, ranges_count);
  for (k = 0; k < ranges_count; k++) {
    printf(" { %5d, 0x%04x }", ranges[k].diff, ranges[k].total);
    if (k+1 < ranges_count) printf(",");
    if ((k % 4) == 3 && k+1 < ranges_count) printf("\n ");
  }
  printf("\n");
  printf("};\n");

  printf("\n");

  printf("static const unsigned char %s_bitmap[%d] = {\n ", name, (ranges_total + 7) / 8);
  {
    /* One bit per index covered by a range, packed LSB first. */
    int accu = 0;
    for (k = 0; k < ranges_count; k++) {
      for (i = ranges[k].total; i <= ranges[k].total + (ranges[k].high - ranges[k].low);) {
        if (charset2uni[i - ranges[k].total + ranges[k].low] != 0)
          accu |= (1 << (i % 8));
        i++;
        if ((i % 8) == 0) {
          printf(" 0x%02x", accu);
          if ((i / 8) < (ranges_total + 7) / 8) printf(",");
          if (((i / 8) % 12) == 0)
            printf("\n ");
          accu = 0;
        }
      }
      /* Consistency check: the bit position must line up with the next range. */
      if (i != (k+1 < ranges_count ? ranges[k+1].total : ranges_total)) abort();
    }
    if ((ranges_total % 8) != 0)
      printf(" 0x%02x", accu);
    printf("\n");
  }
  printf("};\n");

  printf("\n");

  printf("static int\n");
  printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name);
  printf("{\n");
  printf(" unsigned char c1 = s[0];\n");
  printf(" if (c1 >= 0x81 && c1 <= 0x84) {\n");
  printf(" if (n >= 2) {\n");
  printf(" unsigned char c2 = s[1];\n");
  printf(" if (c2 >= 0x30 && c2 <= 0x39) {\n");
  printf(" if (n >= 3) {\n");
  printf(" unsigned char c3 = s[2];\n");
  printf(" if (c3 >= 0x81 && c3 <= 0xfe) {\n");
  printf(" if (n >= 4) {\n");
  printf(" unsigned char c4 = s[3];\n");
  printf(" if (c4 >= 0x30 && c4 <= 0x39) {\n");
  printf(" unsigned int i = (((c1 - 0x81) * 10 + (c2 - 0x30)) * 126 + (c3 - 0x81)) * 10 + (c4 - 0x30);\n");
  printf(" if (i >= %d && i <= %d) {\n", ranges[0].low, ranges[ranges_count-1].high);
  printf(" unsigned int k1 = 0;\n");
  printf(" unsigned int k2 = %d;\n", ranges_count-1);
  printf(" while (k1 < k2) {\n");
  printf(" unsigned int k = (k1 + k2) / 2;\n");
  printf(" if (i <= %s_charset2uni_ranges[2*k+1])\n", name);
  printf(" k2 = k;\n");
  printf(" else if (i >= %s_charset2uni_ranges[2*k+2])\n", name);
  printf(" k1 = k + 1;\n");
  printf(" else\n");
  printf(" return RET_ILSEQ;\n");
  printf(" }\n");
  printf(" {\n");
  printf(" unsigned int bitmap_index = i - %s_charset2uni_ranges[2*k1] + %s_ranges[k1].bitmap_offset;\n", name, name);
  printf(" if ((%s_bitmap[bitmap_index >> 3] >> (bitmap_index & 7)) & 1) {\n", name);
  printf(" unsigned int diff = %s_ranges[k1].diff;\n", name);
  printf(" *pwc = (ucs4_t) (i + diff);\n");
  printf(" return 4;\n");
  printf(" }\n");
  printf(" }\n");
  printf(" }\n");
  printf(" }\n");
  printf(" return RET_ILSEQ;\n");
  printf(" }\n");
  printf(" return RET_TOOFEW(0);\n");
  printf(" }\n");
  printf(" return RET_ILSEQ;\n");
  printf(" }\n");
  printf(" return RET_TOOFEW(0);\n");
  printf(" }\n");
  printf(" return RET_ILSEQ;\n");
  printf(" }\n");
  printf(" return RET_TOOFEW(0);\n");
  printf(" }\n");
  printf(" return RET_ILSEQ;\n");
  printf("}\n");

  printf("\n");

  printf("static int\n");
  printf("%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
  printf("{\n");
  printf(" if (n >= 4) {\n");
  printf(" unsigned int i = wc;\n");
  printf(" if (i >= 0x%04x && i <= 0x%04x) {\n", ranges[0].low + ranges[0].diff, ranges[ranges_count-1].high + ranges[ranges_count-1].diff);
  printf(" unsigned int k1 = 0;\n");
  printf(" unsigned int k2 = %d;\n", ranges_count-1);
  printf(" while (k1 < k2) {\n");
  printf(" unsigned int k = (k1 + k2) / 2;\n");
  printf(" if (i <= %s_uni2charset_ranges[2*k+1])\n", name);
  printf(" k2 = k;\n");
  printf(" else if (i >= %s_uni2charset_ranges[2*k+2])\n", name);
  printf(" k1 = k + 1;\n");
  printf(" else\n");
  printf(" return RET_ILUNI;\n");
  printf(" }\n");
  printf(" {\n");
  printf(" unsigned int bitmap_index = i - %s_uni2charset_ranges[2*k1] + %s_ranges[k1].bitmap_offset;\n", name, name);
  printf(" if ((%s_bitmap[bitmap_index >> 3] >> (bitmap_index & 7)) & 1) {\n", name);
  printf(" unsigned int diff = %s_ranges[k1].diff;\n", name);
  printf(" i -= diff;\n");
  printf(" r[3] = (i %% 10) + 0x30; i = i / 10;\n");
  printf(" r[2] = (i %% 126) + 0x81; i = i / 126;\n");
  printf(" r[1] = (i %% 10) + 0x30; i = i / 10;\n");
  printf(" r[0] = i + 0x81;\n");
  printf(" return 4;\n");
  printf(" }\n");
  printf(" }\n");
  printf(" }\n");
  printf(" return RET_ILUNI;\n");
  printf(" }\n");
  printf(" return RET_TOOSMALL;\n");
  printf("}\n");
}
1646:
1647: /* JISX0213 specifics */
1648:
1649: static void do_jisx0213 (const char* name)
1650: {
1651: printf("#ifndef _JISX0213_H\n");
1652: printf("#define _JISX0213_H\n");
1653: printf("\n");
1654: printf("/* JISX0213 plane 1 (= ISO-IR-233) characters are in the range\n");
1655: printf(" 0x{21..7E}{21..7E}.\n");
1656: printf(" JISX0213 plane 2 (= ISO-IR-229) characters are in the range\n");
1657: printf(" 0x{21,23..25,28,2C..2F,6E..7E}{21..7E}.\n");
1658: printf(" Together this makes 120 rows of 94 characters.\n");
1659: printf("*/\n");
1660: printf("\n");
1661: {
1662: #define row_convert(row) \
1663: ((row) >= 0x121 && (row) <= 0x17E ? row-289 : /* 0..93 */ \
1664: (row) == 0x221 ? row-451 : /* 94 */ \
1665: (row) >= 0x223 && (row) <= 0x225 ? row-452 : /* 95..97 */ \
1666: (row) == 0x228 ? row-454 : /* 98 */ \
1667: (row) >= 0x22C && (row) <= 0x22F ? row-457 : /* 99..102 */ \
1668: (row) >= 0x26E && (row) <= 0x27E ? row-519 : /* 103..119 */ \
1669: -1)
1670: unsigned int table[120][94];
1671: int pagemin[0x1100];
1672: int pagemax[0x1100];
1673: int pageidx[0x1100];
1674: unsigned int pagestart[0x1100];
1675: unsigned int pagestart_len = 0;
1676: {
1677: unsigned int rowc, colc;
1678: for (rowc = 0; rowc < 120; rowc++)
1679: for (colc = 0; colc < 94; colc++)
1680: table[rowc][colc] = 0;
1681: }
1682: {
1683: unsigned int page;
1684: for (page = 0; page < 0x1100; page++)
1685: pagemin[page] = -1;
1686: for (page = 0; page < 0x1100; page++)
1687: pagemax[page] = -1;
1688: for (page = 0; page < 0x1100; page++)
1689: pageidx[page] = -1;
1690: }
1691: printf("static const unsigned short jisx0213_to_ucs_combining[][2] = {\n");
1692: {
1693: int private_use = 0x0001;
1694: for (;;) {
1695: char line[30];
1696: unsigned int row, col;
1697: unsigned int ucs;
1698: memset(line,0,sizeof(line));
1699: if (scanf("%[^\n]\n",line) < 1)
1700: break;
1701: assert(line[0]=='0');
1702: assert(line[1]=='x');
1703: assert(isxdigit(line[2]));
1704: assert(isxdigit(line[3]));
1705: assert(isxdigit(line[4]));
1706: assert(isxdigit(line[5]));
1707: assert(isxdigit(line[6]));
1708: assert(line[7]=='\t');
1709: line[7] = '\0';
1710: col = strtoul(&line[5],NULL,16);
1711: line[5] = '\0';
1712: row = strtoul(&line[2],NULL,16);
1713: if (line[20] != '\0' && line[21] == '\0') {
1714: unsigned int u1, u2;
1715: assert(line[8]=='0');
1716: assert(line[9]=='x');
1717: assert(isxdigit(line[10]));
1718: assert(isxdigit(line[11]));
1719: assert(isxdigit(line[12]));
1720: assert(isxdigit(line[13]));
1721: assert(line[14]==' ');
1722: assert(line[15]=='0');
1723: assert(line[16]=='x');
1724: assert(isxdigit(line[17]));
1725: assert(isxdigit(line[18]));
1726: assert(isxdigit(line[19]));
1727: assert(isxdigit(line[20]));
1728: u2 = strtoul(&line[17],NULL,16);
1729: line[14] = '\0';
1730: u1 = strtoul(&line[10],NULL,16);
1731: printf(" { 0x%04x, 0x%04x },\n", u1, u2);
1732: ucs = private_use++;
1733: } else {
1734: assert(line[8]=='0');
1735: assert(line[9]=='x');
1736: assert(isxdigit(line[10]));
1737: assert(isxdigit(line[11]));
1738: assert(isxdigit(line[12]));
1739: assert(isxdigit(line[13]));
1740: ucs = strtoul(&line[10],NULL,16);
1741: }
1742: assert((unsigned int) row_convert(row) < 120);
1743: assert((unsigned int) (col-0x21) < 94);
1744: table[row_convert(row)][col-0x21] = ucs;
1745: }
1746: }
1747: printf("};\n");
1748: printf("\n");
1749: {
1750: unsigned int rowc, colc;
1751: for (rowc = 0; rowc < 120; rowc++) {
1752: for (colc = 0; colc < 94; colc++) {
1753: unsigned int value = table[rowc][colc];
1754: unsigned int page = value >> 8;
1755: unsigned int rest = value & 0xff;
1756: if (pagemin[page] < 0 || pagemin[page] > rest) pagemin[page] = rest;
1757: if (pagemax[page] < 0 || pagemax[page] < rest) pagemax[page] = rest;
1758: }
1759: }
1760: }
1761: {
1762: unsigned int index = 0;
1763: unsigned int i;
1764: for (i = 0; i < 0x1100; ) {
1765: if (pagemin[i] >= 0) {
1766: if (pagemin[i+1] >= 0 && pagemin[i] >= 0x80 && pagemax[i+1] < 0x80) {
1767: /* Combine two pages into a single one. */
1768: assert(pagestart_len < sizeof(pagestart)/sizeof(pagestart[0]));
1769: pagestart[pagestart_len++] = (i<<8)+0x80;
1770: pageidx[i] = index;
1771: pageidx[i+1] = index;
1772: index++;
1773: i += 2;
1774: } else {
1775: /* A single page. */
1776: assert(pagestart_len < sizeof(pagestart)/sizeof(pagestart[0]));
1777: pagestart[pagestart_len++] = i<<8;
1778: pageidx[i] = index;
1779: index++;
1780: i += 1;
1781: }
1782: } else
1783: i++;
1784: }
1785: }
1786: printf("static const unsigned short jisx0213_to_ucs_main[120 * 94] = {\n");
1787: {
1788: unsigned int row;
1789: for (row = 0; row < 0x300; row++) {
1790: unsigned int rowc = row_convert(row);
1791: if (rowc != (unsigned int) (-1)) {
1792: printf(" /* 0x%X21..0x%X7E */\n",row,row);
1793: {
1794: unsigned int count = 0;
1795: unsigned int colc;
1796: for (colc = 0; colc < 94; colc++) {
1797: if ((count % 8) == 0) printf(" ");
1798: {
1799: unsigned int value = table[rowc][colc];
1800: unsigned int page = value >> 8;
1801: unsigned int index = pageidx[page];
1802: assert(value-pagestart[index] < 0x100);
1803: printf(" 0x%04x,",(index<<8)|(value-pagestart[index]));
1804: }
1805: count++;
1806: if ((count % 8) == 0) printf("\n");
1807: }
1808: }
1809: printf("\n");
1810: }
1811: }
1812: }
1813: printf("};\n");
1814: printf("\n");
1815: printf("static const ucs4_t jisx0213_to_ucs_pagestart[] = {\n");
1816: {
1817: unsigned int count = 0;
1818: unsigned int i;
1819: for (i = 0; i < pagestart_len; i++) {
1820: char buf[10];
1821: if ((count % 8) == 0) printf(" ");
1822: printf(" ");
1823: sprintf(buf,"0x%04x",pagestart[i]);
1824: if (strlen(buf) < 7) printf("%*s",7-strlen(buf),"");
1825: printf("%s,",buf);
1826: count++;
1827: if ((count % 8) == 0) printf("\n");
1828: }
1829: }
1830: printf("\n");
1831: printf("};\n");
1832: #undef row_convert
1833: }
1834: rewind(stdin);
1835: printf("\n");
1836: {
1837: int table[0x110000];
1838: bool pages[0x4400];
1839: int maxpage = -1;
1840: unsigned int combining_prefixes[100];
1841: unsigned int combining_prefixes_len = 0;
1842: {
1843: unsigned int i;
1844: for (i = 0; i < 0x110000; i++)
1845: table[i] = -1;
1846: for (i = 0; i < 0x4400; i++)
1847: pages[i] = false;
1848: }
1849: for (;;) {
1850: char line[30];
1851: unsigned int plane, row, col;
1852: memset(line,0,sizeof(line));
1853: if (scanf("%[^\n]\n",line) < 1)
1854: break;
1855: assert(line[0]=='0');
1856: assert(line[1]=='x');
1857: assert(isxdigit(line[2]));
1858: assert(isxdigit(line[3]));
1859: assert(isxdigit(line[4]));
1860: assert(isxdigit(line[5]));
1861: assert(isxdigit(line[6]));
1862: assert(line[7]=='\t');
1863: line[7] = '\0';
1864: col = strtoul(&line[5],NULL,16);
1865: line[5] = '\0';
1866: row = strtoul(&line[3],NULL,16);
1867: line[3] = '\0';
1868: plane = strtoul(&line[2],NULL,16) - 1;
1869: if (line[20] != '\0' && line[21] == '\0') {
1870: unsigned int u1, u2;
1871: assert(line[8]=='0');
1872: assert(line[9]=='x');
1873: assert(isxdigit(line[10]));
1874: assert(isxdigit(line[11]));
1875: assert(isxdigit(line[12]));
1876: assert(isxdigit(line[13]));
1877: assert(line[14]==' ');
1878: assert(line[15]=='0');
1879: assert(line[16]=='x');
1880: assert(isxdigit(line[17]));
1881: assert(isxdigit(line[18]));
1882: assert(isxdigit(line[19]));
1883: assert(isxdigit(line[20]));
1884: u2 = strtoul(&line[17],NULL,16);
1885: line[14] = '\0';
1886: u1 = strtoul(&line[10],NULL,16);
1887: assert(u2 == 0x02E5 || u2 == 0x02E9 || u2 == 0x0300 || u2 == 0x0301
1888: || u2 == 0x309A);
1889: assert(combining_prefixes_len < sizeof(combining_prefixes)/sizeof(combining_prefixes[0]));
1890: combining_prefixes[combining_prefixes_len++] = u1;
1891: } else {
1892: unsigned int ucs;
1893: assert(line[8]=='0');
1894: assert(line[9]=='x');
1895: assert(isxdigit(line[10]));
1896: assert(isxdigit(line[11]));
1897: assert(isxdigit(line[12]));
1898: assert(isxdigit(line[13]));
1899: ucs = strtoul(&line[10],NULL,16);
1900: /* Add an entry. */
1901: assert(plane <= 1);
1902: assert(row <= 0x7f);
1903: assert(col <= 0x7f);
1904: table[ucs] = (plane << 15) | (row << 8) | col;
1905: pages[ucs>>6] = true;
1906: if (maxpage < 0 || (ucs>>6) > maxpage) maxpage = ucs>>6;
1907: }
1908: }
1909: {
1910: unsigned int i;
1911: for (i = 0; i < combining_prefixes_len; i++) {
1912: unsigned int u1 = combining_prefixes[i];
1913: assert(table[u1] >= 0);
1914: table[u1] |= 0x0080;
1915: }
1916: }
1917: printf("static const short jisx0213_from_ucs_level1[%d] = {\n",maxpage+1);
1918: {
1919: unsigned int index = 0;
1920: unsigned int i;
1921: for (i = 0; i <= maxpage; i++) {
1922: if ((i % 8) == 0) printf(" ");
1923: if (pages[i]) {
1924: printf(" %3u,",index);
1925: index++;
1926: } else {
1927: printf(" %3d,",-1);
1928: }
1929: if (((i+1) % 8) == 0) printf("\n");
1930: }
1931: }
1932: printf("\n");
1933: printf("};\n");
1934: printf("\n");
1935: #if 0 /* Dense array */
1936: printf("static const unsigned short jisx0213_from_ucs_level2[] = {\n");
1937: {
1938: unsigned int i;
1939: for (i = 0; i <= maxpage; i++) {
1940: if (pages[i]) {
1941: printf(" /* 0x%04X */\n",i<<6);
1942: {
1943: unsigned int j;
1944: for (j = 0; j < 0x40; ) {
1945: unsigned int ucs = (i<<6)+j;
1946: int value = table[ucs];
1947: if (value < 0) value = 0;
1948: if ((j % 8) == 0) printf(" ");
1949: printf(" 0x%04x,",value);
1950: j++;
1951: if ((j % 8) == 0) printf("\n");
1952: }
1953: }
1954: }
1955: }
1956: }
1957: printf("};\n");
1958: #else /* Sparse array */
1959: {
1960: int summary_indx[0x11000];
1961: int summary_used[0x11000];
1962: unsigned int i, k, indx;
1963: printf("static const unsigned short jisx0213_from_ucs_level2_data[] = {\n");
1964: /* Fill summary_indx[] and summary_used[]. */
1965: indx = 0;
1966: for (i = 0, k = 0; i <= maxpage; i++) {
1967: if (pages[i]) {
1968: unsigned int j1, j2;
1969: unsigned int count = 0;
1970: printf(" /* 0x%04X */\n",i<<6);
1971: for (j1 = 0; j1 < 4; j1++) {
1972: summary_indx[4*k+j1] = indx;
1973: summary_used[4*k+j1] = 0;
1974: for (j2 = 0; j2 < 16; j2++) {
1975: unsigned int j = 16*j1+j2;
1976: unsigned int ucs = (i<<6)+j;
1977: int value = table[ucs];
1978: if (value < 0) value = 0;
1979: if (value > 0) {
1980: summary_used[4*k+j1] |= (1 << j2);
1981: if ((count % 8) == 0) printf(" ");
1982: printf(" 0x%04x,",value);
1983: count++;
1984: if ((count % 8) == 0) printf("\n");
1985: indx++;
1986: }
1987: }
1988: }
1989: if ((count % 8) > 0)
1990: printf("\n");
1991: k++;
1992: }
1993: }
1994: printf("};\n");
1995: printf("\n");
1996: printf("static const Summary16 jisx0213_from_ucs_level2_2indx[] = {\n");
1997: for (i = 0, k = 0; i <= maxpage; i++) {
1998: if (pages[i]) {
1999: unsigned int j1;
2000: printf(" /* 0x%04X */\n",i<<6);
2001: printf(" ");
2002: for (j1 = 0; j1 < 4; j1++) {
2003: printf(" { %4d, 0x%04x },", summary_indx[4*k+j1], summary_used[4*k+j1]);
2004: }
2005: printf("\n");
2006: k++;
2007: }
2008: }
2009: printf("};\n");
2010: }
2011: #endif
2012: printf("\n");
2013: }
2014: printf("#ifdef __GNUC__\n");
2015: printf("__inline\n");
2016: printf("#else\n");
2017: printf("#ifdef __cplusplus\n");
2018: printf("inline\n");
2019: printf("#endif\n");
2020: printf("#endif\n");
2021: printf("static ucs4_t jisx0213_to_ucs4 (unsigned int row, unsigned int col)\n");
2022: printf("{\n");
2023: printf(" ucs4_t val;\n");
2024: printf("\n");
2025: printf(" if (row >= 0x121 && row <= 0x17e)\n");
2026: printf(" row -= 289;\n");
2027: printf(" else if (row == 0x221)\n");
2028: printf(" row -= 451;\n");
2029: printf(" else if (row >= 0x223 && row <= 0x225)\n");
2030: printf(" row -= 452;\n");
2031: printf(" else if (row == 0x228)\n");
2032: printf(" row -= 454;\n");
2033: printf(" else if (row >= 0x22c && row <= 0x22f)\n");
2034: printf(" row -= 457;\n");
2035: printf(" else if (row >= 0x26e && row <= 0x27e)\n");
2036: printf(" row -= 519;\n");
2037: printf(" else\n");
2038: printf(" return 0x0000;\n");
2039: printf("\n");
2040: printf(" if (col >= 0x21 && col <= 0x7e)\n");
2041: printf(" col -= 0x21;\n");
2042: printf(" else\n");
2043: printf(" return 0x0000;\n");
2044: printf("\n");
2045: printf(" val = jisx0213_to_ucs_main[row * 94 + col];\n");
2046: printf(" val = jisx0213_to_ucs_pagestart[val >> 8] + (val & 0xff);\n");
2047: printf(" if (val == 0xfffd)\n");
2048: printf(" val = 0x0000;\n");
2049: printf(" return val;\n");
2050: printf("}\n");
2051: printf("\n");
2052: printf("#ifdef __GNUC__\n");
2053: printf("__inline\n");
2054: printf("#else\n");
2055: printf("#ifdef __cplusplus\n");
2056: printf("inline\n");
2057: printf("#endif\n");
2058: printf("#endif\n");
2059: printf("static unsigned short ucs4_to_jisx0213 (ucs4_t ucs)\n");
2060: printf("{\n");
2061: printf(" if (ucs < (sizeof(jisx0213_from_ucs_level1)/sizeof(jisx0213_from_ucs_level1[0])) << 6) {\n");
2062: printf(" int index1 = jisx0213_from_ucs_level1[ucs >> 6];\n");
2063: printf(" if (index1 >= 0)");
2064: #if 0 /* Dense array */
2065: printf("\n");
2066: printf(" return jisx0213_from_ucs_level2[(index1 << 6) + (ucs & 0x3f)];\n");
2067: #else /* Sparse array */
2068: printf(" {\n");
2069: printf(" const Summary16 *summary = &jisx0213_from_ucs_level2_2indx[((index1 << 6) + (ucs & 0x3f)) >> 4];\n");
2070: printf(" unsigned short used = summary->used;\n");
2071: printf(" unsigned int i = ucs & 0x0f;\n");
2072: printf(" if (used & ((unsigned short) 1 << i)) {\n");
2073: printf(" /* Keep in `used' only the bits 0..i-1. */\n");
2074: printf(" used &= ((unsigned short) 1 << i) - 1;\n");
2075: printf(" /* Add `summary->indx' and the number of bits set in `used'. */\n");
2076: printf(" used = (used & 0x5555) + ((used & 0xaaaa) >> 1);\n");
2077: printf(" used = (used & 0x3333) + ((used & 0xcccc) >> 2);\n");
2078: printf(" used = (used & 0x0f0f) + ((used & 0xf0f0) >> 4);\n");
2079: printf(" used = (used & 0x00ff) + (used >> 8);\n");
2080: printf(" return jisx0213_from_ucs_level2_data[summary->indx + used];\n");
2081: printf(" };\n");
2082: printf(" };\n");
2083: #endif
2084: printf(" }\n");
2085: printf(" return 0x0000;\n");
2086: printf("}\n");
2087: printf("\n");
2088: printf("#endif /* _JISX0213_H */\n");
2089: }
2090:
2091: /* Main program */
2092:
/*
 * Program entry point.
 *
 * Expects exactly two arguments:
 *   argv[1]  charset name, handed to output_title();
 *   argv[2]  table name, which selects the generator routine to run and is
 *            passed on to that routine.
 *
 * Returns 0 after the selected generator has run.  Exits with status 1,
 * after printing a diagnostic on stderr, when the argument count is wrong
 * or the table name is not one of the recognized names.
 */
int main (int argc, char *argv[])
{
  const char* progname;
  const char* charsetname;
  const char* name;

  /* argv[0] is not guaranteed to be present; fall back to a fixed name. */
  progname = (argc > 0 && argv[0] != NULL ? argv[0] : "cjk_tab_to_h");

  if (argc != 3) {
    fprintf(stderr, "Usage: %s charsetname name < table > name.h\n", progname);
    exit(1);
  }
  charsetname = argv[1];
  name = argv[2];

  output_title(charsetname);

  if (!strcmp(name,"gb2312")
      || !strcmp(name,"isoir165ext") || !strcmp(name,"gb12345ext")
      || !strcmp(name,"jisx0208") || !strcmp(name,"jisx0212"))
    do_normal(name);
  else if (!strcmp(name,"cns11643_1") || !strcmp(name,"cns11643_2")
           || !strcmp(name,"cns11643_3") || !strcmp(name,"cns11643_4a")
           || !strcmp(name,"cns11643_4b") || !strcmp(name,"cns11643_5")
           || !strcmp(name,"cns11643_6") || !strcmp(name,"cns11643_7")
           || !strcmp(name,"cns11643_15"))
    do_normal_only_charset2uni(name);
  else if (!strcmp(name,"cns11643_inv"))
    do_cns11643_only_uni2charset(name);
  else if (!strcmp(name,"gbkext1"))
    do_gbk1_only_charset2uni(name);
  else if (!strcmp(name,"gbkext2"))
    do_gbk2_only_charset2uni(name);
  else if (!strcmp(name,"gbkext_inv"))
    do_gbk1_only_uni2charset(name);
  else if (!strcmp(name,"cp936ext") || !strcmp(name,"gb18030ext"))
    do_gbk1(name);
  else if (!strcmp(name,"ksc5601"))
    do_ksc5601(name);
  else if (!strcmp(name,"uhc_1"))
    do_uhc_1(name);
  else if (!strcmp(name,"uhc_2"))
    do_uhc_2(name);
  else if (!strcmp(name,"big5") || !strcmp(name,"cp950ext"))
    do_big5(name);
  else if (!strcmp(name,"hkscs1999") || !strcmp(name,"hkscs2001")
           || !strcmp(name,"hkscs2004"))
    do_hkscs(name);
  else if (!strcmp(name,"johab_hangul"))
    do_johab_hangul(name);
  else if (!strcmp(name,"cp932ext"))
    do_sjis(name);
  else if (!strcmp(name,"gb18030uni"))
    do_gb18030uni(name);
  else if (!strcmp(name,"jisx0213"))
    do_jisx0213(name);
  else {
    /* The title has already gone to stdout by now, so say on stderr why
       the output stops here instead of failing silently. */
    fprintf(stderr, "%s: unknown table name '%s'\n", progname, name);
    exit(1);
  }

  return 0;
}
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>