Annotation of embedaddon/libiconv/tools/cjk_tab_to_h.c, revision 1.1.1.3
1.1.1.3 ! misho 1: /* Copyright (C) 1999-2004, 2006-2007, 2010, 2012, 2016, 2018 Free Software Foundation, Inc.
1.1 misho 2: This file is part of the GNU LIBICONV Tools.
3:
4: This program is free software: you can redistribute it and/or modify
5: it under the terms of the GNU General Public License as published by
6: the Free Software Foundation; either version 3 of the License, or
7: (at your option) any later version.
8:
9: This program is distributed in the hope that it will be useful,
10: but WITHOUT ANY WARRANTY; without even the implied warranty of
11: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12: GNU General Public License for more details.
13:
14: You should have received a copy of the GNU General Public License
1.1.1.3 ! misho 15: along with this program; if not, see <https://www.gnu.org/licenses/>. */
1.1 misho 16:
17: /*
18: * Generates a CJK character set table from a .TXT table as found on
19: * ftp.unicode.org or in the X nls directory.
20: * Examples:
21: *
22: * ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < gb2312
23: * ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < jis0208
24: * ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < ksc5601
25: *
26: * ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < GB2312.TXT
27: * ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < JIS0208.TXT
28: * ./cjk_tab_to_h JISX0212.1990-0 jisx0212 > jisx0212.h < JIS0212.TXT
29: * ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < KSC5601.TXT
30: * ./cjk_tab_to_h KSX1001.1992-0 ksc5601 > ksc5601.h < KSX1001.TXT
31: *
32: * ./cjk_tab_to_h BIG5 big5 > big5.h < BIG5.TXT
33: *
34: * ./cjk_tab_to_h JOHAB johab > johab.h < JOHAB.TXT
35: *
36: * ./cjk_tab_to_h JISX0213:2004 jisx0213 > jisx0213.h < JISX0213.TXT
37: */
38:
39: #include <stdio.h>
40: #include <stdlib.h>
41: #include <stdbool.h>
42: #include <string.h>
43: #include <ctype.h>
44: #include <assert.h>
45:
/* A contiguous range of table indices; 'end' is exclusive. */
typedef struct {
  int start;    /* index of the first entry belonging to the block */
  int end;      /* index just past the last entry of the block */
} Block;
50:
51: typedef struct {
52: int rows; /* number of possible values for the 1st byte */
53: int cols; /* number of possible values for the 2nd byte */
54: int (*row_byte) (int row); /* returns the 1st byte value for a given row */
55: int (*col_byte) (int col); /* returns the 2nd byte value for a given col */
56: int (*byte_row) (int byte); /* converts a 1st byte value to a row, else -1 */
57: int (*byte_col) (int byte); /* converts a 2nd byte value to a col, else -1 */
58: const char* check_row_expr; /* format string for 1st byte value checking */
59: const char* check_col_expr; /* format string for 2nd byte value checking */
60: const char* byte_row_expr; /* format string for 1st byte value to row */
61: const char* byte_col_expr; /* format string for 2nd byte value to col */
62: int** charset2uni; /* charset2uni[0..rows-1][0..cols-1] is valid */
63: /* You'll understand the terms "row" and "col" when you buy Ken Lunde's book.
64: Once a row is fixed, choosing a "col" is the same as choosing a "cell". */
65: int* charsetpage; /* charsetpage[0..rows]: how large is a page for a row */
66: int ncharsetblocks;
67: Block* charsetblocks; /* blocks[0..nblocks-1] */
68: int* uni2charset; /* uni2charset[0x0000..0xffff] */
69: int fffd; /* uni representation of the invalid character */
70: } Encoding;
71:
/*
 * Writes the header of the generated file to standard output: the fixed
 * license comment, followed by a short comment that names the character set.
 */
static void output_title (const char *charsetname)
{
  /* The license comment, one output line per element. */
  static const char * const license[] = {
    "/*\n",
    " * Copyright (C) 1999-2016 Free Software Foundation, Inc.\n",
    " * This file is part of the GNU LIBICONV Library.\n",
    " *\n",
    " * The GNU LIBICONV Library is free software; you can redistribute it\n",
    " * and/or modify it under the terms of the GNU Library General Public\n",
    " * License as published by the Free Software Foundation; either version 2\n",
    " * of the License, or (at your option) any later version.\n",
    " *\n",
    " * The GNU LIBICONV Library is distributed in the hope that it will be\n",
    " * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of\n",
    " * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\n",
    " * Library General Public License for more details.\n",
    " *\n",
    " * You should have received a copy of the GNU Library General Public\n",
    " * License along with the GNU LIBICONV Library; see the file COPYING.LIB.\n",
    " * If not, see <https://www.gnu.org/licenses/>.\n",
    " */\n",
    "\n"
  };
  size_t k;

  for (k = 0; k < sizeof(license) / sizeof(license[0]); k++)
    fputs(license[k], stdout);

  /* Second comment: just the name of the character set. */
  fputs("/*\n", stdout);
  printf(" * %s\n", charsetname);
  fputs(" */\n", stdout);
  fputs("\n", stdout);
}
101:
102: /*
103: * Reads the charset2uni table from standard input.
104: */
105: static void read_table (Encoding* enc)
106: {
107: int row, col, i, i1, i2, c, j;
108:
109: enc->charset2uni = (int**) malloc(enc->rows*sizeof(int*));
110: for (row = 0; row < enc->rows; row++)
111: enc->charset2uni[row] = (int*) malloc(enc->cols*sizeof(int));
112:
113: for (row = 0; row < enc->rows; row++)
114: for (col = 0; col < enc->cols; col++)
115: enc->charset2uni[row][col] = 0xfffd;
116:
117: c = getc(stdin);
118: ungetc(c,stdin);
119: if (c == '#') {
120: /* Read a unicode.org style .TXT file. */
121: for (;;) {
122: c = getc(stdin);
123: if (c == EOF)
124: break;
125: if (c == '\n' || c == ' ' || c == '\t')
126: continue;
127: if (c == '#') {
128: do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
129: continue;
130: }
131: ungetc(c,stdin);
132: if (scanf("0x%x", &j) != 1)
133: exit(1);
134: i1 = j >> 8;
135: i2 = j & 0xff;
136: row = enc->byte_row(i1);
137: col = enc->byte_col(i2);
138: if (row < 0 || col < 0) {
139: fprintf(stderr, "lost entry for %02x %02x\n", i1, i2);
140: exit(1);
141: }
142: if (scanf(" 0x%x", &enc->charset2uni[row][col]) != 1)
143: exit(1);
144: }
145: } else {
146: /* Read a table of hexadecimal Unicode values. */
147: for (i1 = 32; i1 < 132; i1++)
148: for (i2 = 32; i2 < 132; i2++) {
149: i = scanf("%x", &j);
150: if (i == EOF)
151: goto read_done;
152: if (i != 1)
153: exit(1);
154: if (j < 0 || j == 0xffff)
155: j = 0xfffd;
156: if (j != 0xfffd) {
157: if (enc->byte_row(i1) < 0 || enc->byte_col(i2) < 0) {
158: fprintf(stderr, "lost entry at %02x %02x\n", i1, i2);
159: exit (1);
160: }
161: enc->charset2uni[enc->byte_row(i1)][enc->byte_col(i2)] = j;
162: }
163: }
164: read_done: ;
165: }
166: }
167:
168: /*
169: * Determine whether the Unicode range goes outside the BMP.
170: */
171: static bool is_charset2uni_large (Encoding* enc)
172: {
173: int row, col;
174:
175: for (row = 0; row < enc->rows; row++)
176: for (col = 0; col < enc->cols; col++)
177: if (enc->charset2uni[row][col] >= 0x10000)
178: return true;
179: return false;
180: }
181:
182: /*
183: * Compactify the Unicode range by use of an auxiliary table,
184: * so 16 bits suffice to store each value.
185: */
186: static int compact_large_charset2uni (Encoding* enc, unsigned int **urows, unsigned int *urowshift)
187: {
188: unsigned int shift;
189:
190: for (shift = 8; ; shift--) {
191: int *upages = (int *) malloc((0x110000>>shift) * sizeof(int));
192: int i, row, col, nurows;
193:
194: for (i = 0; i < 0x110000>>shift; i++)
195: upages[i] = -1;
196:
197: for (row = 0; row < enc->rows; row++)
198: for (col = 0; col < enc->cols; col++)
199: upages[enc->charset2uni[row][col] >> shift] = 0;
200:
201: nurows = 0;
202: for (i = 0; i < 0x110000>>shift; i++)
203: if (upages[i] == 0)
204: nurows++;
205:
206: /* We want all table entries to fit in an 'unsigned short'. */
207: if (nurows <= 1<<(16-shift)) {
208: int** old_charset2uni;
209:
210: *urows = (unsigned int *) malloc(nurows * sizeof(unsigned int));
211: *urowshift = shift;
212:
213: nurows = 0;
214: for (i = 0; i < 0x110000>>shift; i++)
215: if (upages[i] == 0) {
216: upages[i] = nurows;
217: (*urows)[nurows] = i;
218: nurows++;
219: }
220:
221: old_charset2uni = enc->charset2uni;
222: enc->charset2uni = (int**) malloc(enc->rows*sizeof(int*));
223: for (row = 0; row < enc->rows; row++)
224: enc->charset2uni[row] = (int*) malloc(enc->cols*sizeof(int));
225: for (row = 0; row < enc->rows; row++)
226: for (col = 0; col < enc->cols; col++) {
227: int u = old_charset2uni[row][col];
228: enc->charset2uni[row][col] =
229: (upages[u >> shift] << shift) | (u & ((1 << shift) - 1));
230: }
231: enc->fffd =
232: (upages[0xfffd >> shift] << shift) | (0xfffd & ((1 << shift) - 1));
233:
234: return nurows;
235: }
236: }
237: abort();
238: }
239:
240: /*
241: * Computes the charsetpage[0..rows] array.
242: */
243: static void find_charset2uni_pages (Encoding* enc)
244: {
245: int row, col;
246:
247: enc->charsetpage = (int*) malloc((enc->rows+1)*sizeof(int));
248:
249: for (row = 0; row <= enc->rows; row++)
250: enc->charsetpage[row] = 0;
251:
252: for (row = 0; row < enc->rows; row++) {
253: int used = 0;
254: for (col = 0; col < enc->cols; col++)
255: if (enc->charset2uni[row][col] != enc->fffd)
256: used = col+1;
257: enc->charsetpage[row] = used;
258: }
259: }
260:
261: /*
262: * Fills in nblocks and blocks.
263: */
264: static void find_charset2uni_blocks (Encoding* enc)
265: {
266: int n, row, lastrow;
267:
268: enc->charsetblocks = (Block*) malloc(enc->rows*sizeof(Block));
269:
270: n = 0;
271: for (row = 0; row < enc->rows; row++)
272: if (enc->charsetpage[row] > 0 && (row == 0 || enc->charsetpage[row-1] == 0)) {
273: for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
274: enc->charsetblocks[n].start = row * enc->cols;
275: enc->charsetblocks[n].end = lastrow * enc->cols + enc->charsetpage[lastrow];
276: n++;
277: }
278: enc->ncharsetblocks = n;
279: }
280:
281: /*
282: * Outputs the charset to unicode table and function.
283: */
284: static void output_charset2uni (const char* name, Encoding* enc)
285: {
286: int nurows, row, col, lastrow, col_max, i, i1_min, i1_max;
287: bool is_large;
288: unsigned int* urows;
289: unsigned int urowshift;
290: Encoding tmpenc;
291:
292: is_large = is_charset2uni_large(enc);
293: if (is_large) {
294: /* Use a temporary copy of enc. */
295: tmpenc = *enc;
296: enc = &tmpenc;
297: nurows = compact_large_charset2uni(enc,&urows,&urowshift);
298: } else {
299: nurows = 0; urows = NULL; urowshift = 0; enc->fffd = 0xfffd;
300: }
301:
302: find_charset2uni_pages(enc);
303:
304: find_charset2uni_blocks(enc);
305:
306: for (row = 0; row < enc->rows; row++)
307: if (enc->charsetpage[row] > 0) {
308: if (row == 0 || enc->charsetpage[row-1] == 0) {
309: /* Start a new block. */
310: for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
311: printf("static const unsigned short %s_2uni_page%02x[%d] = {\n",
312: name, enc->row_byte(row),
313: (lastrow-row) * enc->cols + enc->charsetpage[lastrow]);
314: }
315: printf(" /""* 0x%02x *""/\n ", enc->row_byte(row));
316: col_max = (enc->charsetpage[row+1] > 0 ? enc->cols : enc->charsetpage[row]);
317: for (col = 0; col < col_max; col++) {
318: printf(" 0x%04x,", enc->charset2uni[row][col]);
319: if ((col % 8) == 7 && (col+1 < col_max)) printf("\n ");
320: }
321: printf("\n");
322: if (enc->charsetpage[row+1] == 0) {
323: /* End a block. */
324: printf("};\n");
325: }
326: }
327: printf("\n");
328:
329: if (is_large) {
330: printf("static const ucs4_t %s_2uni_upages[%d] = {\n ", name, nurows);
331: for (i = 0; i < nurows; i++) {
332: printf(" 0x%05x,", urows[i] << urowshift);
333: if ((i % 8) == 7 && (i+1 < nurows)) printf("\n ");
334: }
335: printf("\n");
336: printf("};\n");
337: printf("\n");
338: }
339:
340: printf("static int\n");
1.1.1.3 ! misho 341: printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, size_t n)\n", name);
1.1 misho 342: printf("{\n");
343: printf(" unsigned char c1 = s[0];\n");
344: printf(" if (");
345: for (i = 0; i < enc->ncharsetblocks; i++) {
346: i1_min = enc->row_byte(enc->charsetblocks[i].start / enc->cols);
347: i1_max = enc->row_byte((enc->charsetblocks[i].end-1) / enc->cols);
348: if (i > 0)
349: printf(" || ");
350: if (i1_min == i1_max)
351: printf("(c1 == 0x%02x)", i1_min);
352: else
353: printf("(c1 >= 0x%02x && c1 <= 0x%02x)", i1_min, i1_max);
354: }
355: printf(") {\n");
356: printf(" if (n >= 2) {\n");
357: printf(" unsigned char c2 = s[1];\n");
358: printf(" if (");
359: printf(enc->check_col_expr, "c2");
360: printf(") {\n");
361: printf(" unsigned int i = %d * (", enc->cols);
362: printf(enc->byte_row_expr, "c1");
363: printf(") + (");
364: printf(enc->byte_col_expr, "c2");
365: printf(");\n");
366: printf(" %s wc = 0xfffd;\n", is_large ? "ucs4_t" : "unsigned short");
367: if (is_large) printf(" unsigned short swc;\n");
368: for (i = 0; i < enc->ncharsetblocks; i++) {
369: printf(" ");
370: if (i > 0)
371: printf("} else ");
372: if (i < enc->ncharsetblocks-1)
373: printf("if (i < %d) ", enc->charsetblocks[i+1].start);
374: printf("{\n");
375: printf(" if (i < %d)\n", enc->charsetblocks[i].end);
376: printf(" %s = ", is_large ? "swc" : "wc");
377: printf("%s_2uni_page%02x[i", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols));
378: if (enc->charsetblocks[i].start > 0)
379: printf("-%d", enc->charsetblocks[i].start);
380: printf("]");
381: if (is_large) printf(",\n wc = %s_2uni_upages[swc>>%d] | (swc & 0x%x)", name, urowshift, (1 << urowshift) - 1);
382: printf(";\n");
383: }
384: printf(" }\n");
385: printf(" if (wc != 0xfffd) {\n");
386: printf(" *pwc = %swc;\n", is_large ? "" : "(ucs4_t) ");
387: printf(" return 2;\n");
388: printf(" }\n");
389: printf(" }\n");
390: printf(" return RET_ILSEQ;\n");
391: printf(" }\n");
392: printf(" return RET_TOOFEW(0);\n");
393: printf(" }\n");
394: printf(" return RET_ILSEQ;\n");
395: printf("}\n");
396: printf("\n");
397: }
398:
399: /*
400: * Outputs the charset to unicode table and function.
401: * (Suitable if the mapping function is well defined, i.e. has no holes, and
402: * is monotonically increasing with small gaps only.)
403: */
404: static void output_charset2uni_noholes_monotonic (const char* name, Encoding* enc)
405: {
406: int row, col, lastrow, r, col_max, i, i1_min, i1_max;
407:
408: /* Choose stepsize so that stepsize*steps_per_row >= enc->cols, and
409: enc->charset2uni[row][col] - enc->charset2uni[row][col/stepsize*stepsize]
410: is always < 0x100. */
411: int steps_per_row = 2;
412: int stepsize = (enc->cols + steps_per_row-1) / steps_per_row;
413:
414: find_charset2uni_pages(enc);
415:
416: find_charset2uni_blocks(enc);
417:
418: for (row = 0; row < enc->rows; row++)
419: if (enc->charsetpage[row] > 0) {
420: if (row == 0 || enc->charsetpage[row-1] == 0) {
421: /* Start a new block. */
422: for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
423: printf("static const unsigned short %s_2uni_main_page%02x[%d] = {\n ",
424: name, enc->row_byte(row),
425: steps_per_row*(lastrow-row+1));
426: for (r = row; r <= lastrow; r++) {
427: for (i = 0; i < steps_per_row; i++)
428: printf(" 0x%04x,", enc->charset2uni[r][i*stepsize]);
429: if (((r-row) % 4) == 3 && (r < lastrow)) printf("\n ");
430: }
431: printf("\n");
432: printf("};\n");
433: printf("static const unsigned char %s_2uni_page%02x[%d] = {\n",
434: name, enc->row_byte(row),
435: (lastrow-row) * enc->cols + enc->charsetpage[lastrow]);
436: }
437: printf(" /""* 0x%02x *""/\n ", enc->row_byte(row));
438: col_max = (enc->charsetpage[row+1] > 0 ? enc->cols : enc->charsetpage[row]);
439: for (col = 0; col < col_max; col++) {
440: printf(" 0x%02x,", enc->charset2uni[row][col] - enc->charset2uni[row][col/stepsize*stepsize]);
441: if ((col % 8) == 7 && (col+1 < col_max)) printf("\n ");
442: }
443: printf("\n");
444: if (enc->charsetpage[row+1] == 0) {
445: /* End a block. */
446: printf("};\n");
447: }
448: }
449: printf("\n");
450:
451: printf("static int\n");
1.1.1.3 ! misho 452: printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, size_t n)\n", name);
1.1 misho 453: printf("{\n");
454: printf(" unsigned char c1 = s[0];\n");
455: printf(" if (");
456: for (i = 0; i < enc->ncharsetblocks; i++) {
457: i1_min = enc->row_byte(enc->charsetblocks[i].start / enc->cols);
458: i1_max = enc->row_byte((enc->charsetblocks[i].end-1) / enc->cols);
459: if (i > 0)
460: printf(" || ");
461: if (i1_min == i1_max)
462: printf("(c1 == 0x%02x)", i1_min);
463: else
464: printf("(c1 >= 0x%02x && c1 <= 0x%02x)", i1_min, i1_max);
465: }
466: printf(") {\n");
467: printf(" if (n >= 2) {\n");
468: printf(" unsigned char c2 = s[1];\n");
469: printf(" if (");
470: printf(enc->check_col_expr, "c2");
471: printf(") {\n");
472: printf(" unsigned int row = ");
473: printf(enc->byte_row_expr, "c1");
474: printf(";\n");
475: printf(" unsigned int col = ");
476: printf(enc->byte_col_expr, "c2");
477: printf(";\n");
478: printf(" unsigned int i = %d * row + col;\n", enc->cols);
479: printf(" unsigned short wc = 0xfffd;\n");
480: for (i = 0; i < enc->ncharsetblocks; i++) {
481: printf(" ");
482: if (i > 0)
483: printf("} else ");
484: if (i < enc->ncharsetblocks-1)
485: printf("if (i < %d) ", enc->charsetblocks[i+1].start);
486: printf("{\n");
487: printf(" if (i < %d)\n", enc->charsetblocks[i].end);
488: printf(" wc = %s_2uni_main_page%02x[%d*", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols), steps_per_row);
489: if (enc->charsetblocks[i].start > 0)
490: printf("(row-%d)", enc->charsetblocks[i].start / enc->cols);
491: else
492: printf("row");
493: printf("+");
494: if (steps_per_row == 2)
495: printf("(col>=%d?1:0)", stepsize);
496: else
497: printf("col/%d", stepsize);
498: printf("] + %s_2uni_page%02x[i", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols));
499: if (enc->charsetblocks[i].start > 0)
500: printf("-%d", enc->charsetblocks[i].start);
501: printf("];\n");
502: }
503: printf(" }\n");
504: printf(" if (wc != 0xfffd) {\n");
505: printf(" *pwc = (ucs4_t) wc;\n");
506: printf(" return 2;\n");
507: printf(" }\n");
508: printf(" }\n");
509: printf(" return RET_ILSEQ;\n");
510: printf(" }\n");
511: printf(" return RET_TOOFEW(0);\n");
512: printf(" }\n");
513: printf(" return RET_ILSEQ;\n");
514: printf("}\n");
515: printf("\n");
516: }
517:
518: /*
519: * Computes the uni2charset[0x0000..0x2ffff] array.
520: */
521: static void invert (Encoding* enc)
522: {
523: int row, col, j;
524:
525: enc->uni2charset = (int*) malloc(0x30000*sizeof(int));
526:
527: for (j = 0; j < 0x30000; j++)
528: enc->uni2charset[j] = 0;
529:
530: for (row = 0; row < enc->rows; row++)
531: for (col = 0; col < enc->cols; col++) {
532: j = enc->charset2uni[row][col];
533: if (j != 0xfffd)
534: enc->uni2charset[j] = 0x100 * enc->row_byte(row) + enc->col_byte(col);
535: }
536: }
537:
538: /*
539: * Outputs the unicode to charset table and function, using a linear array.
540: * (Suitable if the table is dense.)
541: */
542: static void output_uni2charset_dense (const char* name, Encoding* enc)
543: {
544: /* Like in 8bit_tab_to_h.c */
545: bool pages[0x300];
546: int line[0x6000];
547: int tableno;
548: struct { int minline; int maxline; int usecount; } tables[0x6000];
549: bool first;
550: int row, col, j, p, j1, j2, t;
551:
552: for (p = 0; p < 0x300; p++)
553: pages[p] = false;
554: for (row = 0; row < enc->rows; row++)
555: for (col = 0; col < enc->cols; col++) {
556: j = enc->charset2uni[row][col];
557: if (j != 0xfffd)
558: pages[j>>8] = true;
559: }
560: for (j1 = 0; j1 < 0x6000; j1++) {
561: bool all_invalid = true;
562: for (j2 = 0; j2 < 8; j2++) {
563: j = 8*j1+j2;
564: if (enc->uni2charset[j] != 0)
565: all_invalid = false;
566: }
567: if (all_invalid)
568: line[j1] = -1;
569: else
570: line[j1] = 0;
571: }
572: tableno = 0;
573: for (j1 = 0; j1 < 0x6000; j1++) {
574: if (line[j1] >= 0) {
575: if (tableno > 0
576: && ((j1 > 0 && line[j1-1] == tableno-1)
577: || ((tables[tableno-1].maxline >> 5) == (j1 >> 5)
578: && j1 - tables[tableno-1].maxline <= 8))) {
579: line[j1] = tableno-1;
580: tables[tableno-1].maxline = j1;
581: } else {
582: tableno++;
583: line[j1] = tableno-1;
584: tables[tableno-1].minline = tables[tableno-1].maxline = j1;
585: }
586: }
587: }
588: for (t = 0; t < tableno; t++) {
589: tables[t].usecount = 0;
590: j1 = 8*tables[t].minline;
591: j2 = 8*(tables[t].maxline+1);
592: for (j = j1; j < j2; j++)
593: if (enc->uni2charset[j] != 0)
594: tables[t].usecount++;
595: }
596: {
597: p = -1;
598: for (t = 0; t < tableno; t++)
599: if (tables[t].usecount > 1) {
600: p = tables[t].minline >> 5;
601: printf("static const unsigned short %s_page%02x[%d] = {\n", name, p, 8*(tables[t].maxline-tables[t].minline+1));
602: for (j1 = tables[t].minline; j1 <= tables[t].maxline; j1++) {
603: if ((j1 % 0x20) == 0 && j1 > tables[t].minline)
604: printf(" /* 0x%04x */\n", 8*j1);
605: printf(" ");
606: for (j2 = 0; j2 < 8; j2++) {
607: j = 8*j1+j2;
608: printf(" 0x%04x,", enc->uni2charset[j]);
609: }
610: printf(" /*0x%02x-0x%02x*/\n", 8*(j1 % 0x20), 8*(j1 % 0x20)+7);
611: }
612: printf("};\n");
613: }
614: if (p >= 0)
615: printf("\n");
616: }
1.1.1.3 ! misho 617: printf("static int\n%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)\n", name);
1.1 misho 618: printf("{\n");
619: printf(" if (n >= 2) {\n");
620: printf(" unsigned short c = 0;\n");
621: first = true;
622: for (j1 = 0; j1 < 0x6000;) {
623: t = line[j1];
624: for (j2 = j1; j2 < 0x6000 && line[j2] == t; j2++);
625: if (t >= 0) {
626: if (j1 != tables[t].minline) abort();
627: if (j2 > tables[t].maxline+1) abort();
628: j2 = tables[t].maxline+1;
629: if (first)
630: printf(" ");
631: else
632: printf(" else ");
633: first = false;
634: if (tables[t].usecount == 0) abort();
635: if (tables[t].usecount == 1) {
636: if (j2 != j1+1) abort();
637: for (j = 8*j1; j < 8*j2; j++)
638: if (enc->uni2charset[j] != 0) {
639: printf("if (wc == 0x%04x)\n c = 0x%02x;\n", j, enc->uni2charset[j]);
640: break;
641: }
642: } else {
643: if (j1 == 0) {
644: printf("if (wc < 0x%04x)", 8*j2);
645: } else {
646: printf("if (wc >= 0x%04x && wc < 0x%04x)", 8*j1, 8*j2);
647: }
648: printf("\n c = %s_page%02x[wc", name, j1 >> 5);
649: if (tables[t].minline > 0)
650: printf("-0x%04x", 8*j1);
651: printf("];\n");
652: }
653: }
654: j1 = j2;
655: }
656: printf(" if (c != 0) {\n");
657: printf(" r[0] = (c >> 8); r[1] = (c & 0xff);\n");
658: printf(" return 2;\n");
659: printf(" }\n");
660: printf(" return RET_ILUNI;\n");
661: printf(" }\n");
662: printf(" return RET_TOOSMALL;\n");
663: printf("}\n");
664: }
665:
666: /*
667: * Outputs the unicode to charset table and function, using a packed array.
668: * (Suitable if the table is sparse.)
669: * The argument 'monotonic' may be set to true if the mapping is monotonically
670: * increasing with small gaps only.
671: */
672: static void output_uni2charset_sparse (const char* name, Encoding* enc, bool monotonic)
673: {
674: bool pages[0x300];
675: Block pageblocks[0x300]; int npageblocks;
676: int indx2charset[0x30000];
677: int summary_indx[0x3000];
678: int summary_used[0x3000];
679: int i, row, col, j, p, j1, j2, indx;
680: bool is_large;
681: /* for monotonic: */
682: int log2_stepsize = (!strcmp(name,"uhc_2") ? 6 : 7);
683: int stepsize = 1 << log2_stepsize;
684: int indxsteps;
685:
686: /* Fill pages[0x300]. */
687: for (p = 0; p < 0x300; p++)
688: pages[p] = false;
689: for (row = 0; row < enc->rows; row++)
690: for (col = 0; col < enc->cols; col++) {
691: j = enc->charset2uni[row][col];
692: if (j != 0xfffd)
693: pages[j>>8] = true;
694: }
695:
696: /* Determine whether two or three bytes are needed for each character. */
697: is_large = false;
698: for (j = 0; j < 0x30000; j++)
699: if (enc->uni2charset[j] >= 0x10000)
700: is_large = true;
701:
702: #if 0
703: for (p = 0; p < 0x300; p++)
704: if (pages[p]) {
705: printf("static const unsigned short %s_page%02x[256] = {\n", name, p);
706: for (j1 = 0; j1 < 32; j1++) {
707: printf(" ");
708: for (j2 = 0; j2 < 8; j2++)
709: printf("0x%04x, ", enc->uni2charset[256*p+8*j1+j2]);
710: printf("/""*0x%02x-0x%02x*""/\n", 8*j1, 8*j1+7);
711: }
712: printf("};\n");
713: }
714: printf("\n");
715: #endif
716:
717: /* Fill summary_indx[] and summary_used[]. */
718: indx = 0;
719: for (j1 = 0; j1 < 0x3000; j1++) {
720: summary_indx[j1] = indx;
721: summary_used[j1] = 0;
722: for (j2 = 0; j2 < 16; j2++) {
723: j = 16*j1+j2;
724: if (enc->uni2charset[j] != 0) {
725: indx2charset[indx++] = enc->uni2charset[j];
726: summary_used[j1] |= (1 << j2);
727: }
728: }
729: }
730:
731: /* Fill npageblocks and pageblocks[]. */
732: npageblocks = 0;
733: for (p = 0; p < 0x300; ) {
734: if (pages[p] && (p == 0 || !pages[p-1])) {
735: pageblocks[npageblocks].start = 16*p;
736: do p++; while (p < 0x300 && pages[p]);
737: j1 = 16*p;
738: while (summary_used[j1-1] == 0) j1--;
739: pageblocks[npageblocks].end = j1;
740: npageblocks++;
741: } else
742: p++;
743: }
744:
745: if (monotonic) {
746: indxsteps = (indx + stepsize-1) / stepsize;
747: printf("static const unsigned short %s_2charset_main[%d] = {\n", name, indxsteps);
748: for (i = 0; i < indxsteps; ) {
749: if ((i % 8) == 0) printf(" ");
750: printf(" 0x%04x,", indx2charset[i*stepsize]);
751: i++;
752: if ((i % 8) == 0 || i == indxsteps) printf("\n");
753: }
754: printf("};\n");
755: printf("static const unsigned char %s_2charset[%d] = {\n", name, indx);
756: for (i = 0; i < indx; ) {
757: if ((i % 8) == 0) printf(" ");
758: printf(" 0x%02x,", indx2charset[i] - indx2charset[i/stepsize*stepsize]);
759: i++;
760: if ((i % 8) == 0 || i == indx) printf("\n");
761: }
762: printf("};\n");
763: } else {
764: if (is_large) {
765: printf("static const unsigned char %s_2charset[3*%d] = {\n", name, indx);
766: for (i = 0; i < indx; ) {
767: if ((i % 4) == 0) printf(" ");
768: printf(" 0x%1x,0x%02x,0x%02x,", indx2charset[i] >> 16,
769: (indx2charset[i] >> 8) & 0xff, indx2charset[i] & 0xff);
770: i++;
771: if ((i % 4) == 0 || i == indx) printf("\n");
772: }
773: printf("};\n");
774: } else {
775: printf("static const unsigned short %s_2charset[%d] = {\n", name, indx);
776: for (i = 0; i < indx; ) {
777: if ((i % 8) == 0) printf(" ");
778: printf(" 0x%04x,", indx2charset[i]);
779: i++;
780: if ((i % 8) == 0 || i == indx) printf("\n");
781: }
782: printf("};\n");
783: }
784: }
785: printf("\n");
786: for (i = 0; i < npageblocks; i++) {
787: printf("static const Summary16 %s_uni2indx_page%02x[%d] = {\n", name,
788: pageblocks[i].start/16, pageblocks[i].end-pageblocks[i].start);
789: for (j1 = pageblocks[i].start; j1 < pageblocks[i].end; ) {
790: if (((16*j1) % 0x100) == 0) printf(" /""* 0x%04x *""/\n", 16*j1);
791: if ((j1 % 4) == 0) printf(" ");
792: printf(" { %4d, 0x%04x },", summary_indx[j1], summary_used[j1]);
793: j1++;
794: if ((j1 % 4) == 0 || j1 == pageblocks[i].end) printf("\n");
795: }
796: printf("};\n");
797: }
798: printf("\n");
799:
800: printf("static int\n");
1.1.1.3 ! misho 801: printf("%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)\n", name);
1.1 misho 802: printf("{\n");
803: printf(" if (n >= 2) {\n");
804: printf(" const Summary16 *summary = NULL;\n");
805: for (i = 0; i < npageblocks; i++) {
806: printf(" ");
807: if (i > 0)
808: printf("else ");
809: printf("if (wc >= 0x%04x && wc < 0x%04x)\n",
810: 16*pageblocks[i].start, 16*pageblocks[i].end);
811: printf(" summary = &%s_uni2indx_page%02x[(wc>>4)", name,
812: pageblocks[i].start/16);
813: if (pageblocks[i].start > 0)
814: printf("-0x%03x", pageblocks[i].start);
815: printf("];\n");
816: }
817: printf(" if (summary) {\n");
818: printf(" unsigned short used = summary->used;\n");
819: printf(" unsigned int i = wc & 0x0f;\n");
820: printf(" if (used & ((unsigned short) 1 << i)) {\n");
821: if (monotonic || !is_large)
822: printf(" unsigned short c;\n");
1.1.1.3 ! misho 823: printf(" /* Keep in 'used' only the bits 0..i-1. */\n");
1.1 misho 824: printf(" used &= ((unsigned short) 1 << i) - 1;\n");
1.1.1.3 ! misho 825: printf(" /* Add 'summary->indx' and the number of bits set in 'used'. */\n");
1.1 misho 826: printf(" used = (used & 0x5555) + ((used & 0xaaaa) >> 1);\n");
827: printf(" used = (used & 0x3333) + ((used & 0xcccc) >> 2);\n");
828: printf(" used = (used & 0x0f0f) + ((used & 0xf0f0) >> 4);\n");
829: printf(" used = (used & 0x00ff) + (used >> 8);\n");
830: if (monotonic) {
831: printf(" used += summary->indx;\n");
832: printf(" c = %s_2charset_main[used>>%d] + %s_2charset[used];\n", name, log2_stepsize, name);
833: printf(" r[0] = (c >> 8); r[1] = (c & 0xff);\n");
834: printf(" return 2;\n");
835: } else {
836: if (is_large) {
837: printf(" used += summary->indx;\n");
838: printf(" r[0] = %s_2charset[3*used];\n", name);
839: printf(" r[1] = %s_2charset[3*used+1];\n", name);
840: printf(" r[2] = %s_2charset[3*used+2];\n", name);
841: printf(" return 3;\n");
842: } else {
843: printf(" c = %s_2charset[summary->indx + used];\n", name);
844: printf(" r[0] = (c >> 8); r[1] = (c & 0xff);\n");
845: printf(" return 2;\n");
846: }
847: }
848: printf(" }\n");
849: printf(" }\n");
850: printf(" return RET_ILUNI;\n");
851: printf(" }\n");
852: printf(" return RET_TOOSMALL;\n");
853: printf("}\n");
854: }
855:
856: /* ISO-2022/EUC specifics */
857:
/* Returns the 1st byte value of a row: rows are numbered from byte 0x21. */
static int row_byte_normal (int row)
{
  return row + 0x21;
}
/* Returns the 2nd byte value of a col: cols are numbered from byte 0x21. */
static int col_byte_normal (int col)
{
  return col + 0x21;
}
/* Converts a 1st byte value to a row by subtracting 0x21.  No range check is
   done: bytes below 0x21 give a negative result, larger bytes map linearly. */
static int byte_row_normal (int byte)
{
  const int first_byte = 0x21;

  return byte - first_byte;
}
/* Converts a 2nd byte value to a col by subtracting 0x21.  No range check is
   done: bytes below 0x21 give a negative result, larger bytes map linearly. */
static int byte_col_normal (int byte)
{
  const int first_byte = 0x21;

  return byte - first_byte;
}
862:
863: static void do_normal (const char* name)
864: {
865: Encoding enc;
866:
867: enc.rows = 94;
868: enc.cols = 94;
869: enc.row_byte = row_byte_normal;
870: enc.col_byte = col_byte_normal;
871: enc.byte_row = byte_row_normal;
872: enc.byte_col = byte_col_normal;
873: enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
874: enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
875: enc.byte_row_expr = "%1$s - 0x21";
876: enc.byte_col_expr = "%1$s - 0x21";
877:
878: read_table(&enc);
879: output_charset2uni(name,&enc);
880: invert(&enc); output_uni2charset_sparse(name,&enc,false);
881: }
882:
883: /* Note: On first sight, the jisx0212_2charset[] table seems to be in order,
884: starting from the charset=0x3021/uni=0x4e02 pair. But it's only mostly in
885: order. There are 75 out-of-order values, scattered all throughout the table.
886: */
887:
888: static void do_normal_only_charset2uni (const char* name)
889: {
890: Encoding enc;
891:
892: enc.rows = 94;
893: enc.cols = 94;
894: enc.row_byte = row_byte_normal;
895: enc.col_byte = col_byte_normal;
896: enc.byte_row = byte_row_normal;
897: enc.byte_col = byte_col_normal;
898: enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
899: enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
900: enc.byte_row_expr = "%1$s - 0x21";
901: enc.byte_col_expr = "%1$s - 0x21";
902:
903: read_table(&enc);
904: output_charset2uni(name,&enc);
905: }
906:
907: /* CNS 11643 specifics - trick to put two tables into one */
908:
/* Returns the "1st byte" value of a row: the index of the group of 94 rows
   goes into bits 8 and up, the position inside the group (plus 0x21) into
   the low byte. */
static int row_byte_cns11643 (int row) {
  int group = row / 94;
  int within = row % 94;

  return (group << 8) + within + 0x21;
}
/* Converts a "1st byte" value back to a row: bits 8 and up select the group
   of 94 rows, the low byte minus 0x21 is the position inside the group. */
static int byte_row_cns11643 (int byte) {
  int group = byte >> 8;
  int low = byte & 0xff;

  return group * 94 + low - 0x21;
}
915:
916: static void do_cns11643_only_uni2charset (const char* name)
917: {
918: Encoding enc;
919:
920: enc.rows = 16*94;
921: enc.cols = 94;
922: enc.row_byte = row_byte_cns11643;
923: enc.col_byte = col_byte_normal;
924: enc.byte_row = byte_row_cns11643;
925: enc.byte_col = byte_col_normal;
926: enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
927: enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
928: enc.byte_row_expr = "%1$s - 0x21";
929: enc.byte_col_expr = "%1$s - 0x21";
930:
931: read_table(&enc);
932: invert(&enc);
933: output_uni2charset_sparse(name,&enc,false);
934: }
935:
936: /* GBK specifics */
937:
/* Returns the 1st byte value of a row: rows are numbered from byte 0x81. */
static int row_byte_gbk1 (int row) {
  return row + 0x81;
}
/* Returns the 2nd byte value of a col.  Cols 0..0x3e are the bytes
   0x40..0x7e; the byte 0x7f is skipped, so col 0x3f is the byte 0x80. */
static int col_byte_gbk1 (int col) {
  if (col < 0x3f)
    return col + 0x40;
  return col + 0x41;
}
/* Converts a 1st byte value in 0x81..0xfe to its row, any other value
   to -1. */
static int byte_row_gbk1 (int byte) {
  return (byte >= 0x81 && byte <= 0xfe) ? byte - 0x81 : -1;
}
/* Column byte -> column index.  Bytes 0x40..0x7E map to 0..0x3E and
   bytes 0x80..0xFE continue at 0x3F; every other byte yields -1. */
static int byte_col_gbk1 (int byte)
{
  if (byte >= 0x80 && byte < 0xff)
    return byte - 0x41;
  if (byte >= 0x40 && byte < 0x7f)
    return byte - 0x40;
  return -1;
}
958:
959: static void do_gbk1 (const char* name)
960: {
961: Encoding enc;
962:
963: enc.rows = 126;
964: enc.cols = 190;
965: enc.row_byte = row_byte_gbk1;
966: enc.col_byte = col_byte_gbk1;
967: enc.byte_row = byte_row_gbk1;
968: enc.byte_col = byte_col_gbk1;
969: enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
970: enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
971: enc.byte_row_expr = "%1$s - 0x81";
972: enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
973:
974: read_table(&enc);
975: output_charset2uni(name,&enc);
976: invert(&enc); output_uni2charset_dense(name,&enc);
977: }
978:
979: static void do_gbk1_only_charset2uni (const char* name)
980: {
981: Encoding enc;
982:
983: enc.rows = 126;
984: enc.cols = 190;
985: enc.row_byte = row_byte_gbk1;
986: enc.col_byte = col_byte_gbk1;
987: enc.byte_row = byte_row_gbk1;
988: enc.byte_col = byte_col_gbk1;
989: enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
990: enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
991: enc.byte_row_expr = "%1$s - 0x81";
992: enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
993:
994: read_table(&enc);
995: output_charset2uni(name,&enc);
996: }
997:
/* Row index -> row byte: row 0 corresponds to byte 0x81. */
static int row_byte_gbk2 (int row)
{
  const int first_row_byte = 0x81;

  return first_row_byte + row;
}
/* Column index -> column byte.  Columns below 0x3f start at byte 0x40;
   from column 0x3f on the result is shifted by one more, so that byte
   0x7f is never produced. */
static int col_byte_gbk2 (int col)
{
  if (col < 0x3f)
    return col + 0x40;
  return col + 0x41;
}
/* Row byte -> row index.  Valid row bytes are 0x81..0xFE; anything else
   yields -1. */
static int byte_row_gbk2 (int byte)
{
  if (byte < 0x81 || byte >= 0xff)
    return -1;
  return byte - 0x81;
}
/* Column byte -> column index.  Bytes 0x40..0x7E map to 0..0x3E and
   bytes 0x80..0xA0 continue at 0x3F; every other byte yields -1. */
static int byte_col_gbk2 (int byte)
{
  if (byte >= 0x80 && byte < 0xa1)
    return byte - 0x41;
  if (byte >= 0x40 && byte < 0x7f)
    return byte - 0x40;
  return -1;
}
1018:
1019: static void do_gbk2_only_charset2uni (const char* name)
1020: {
1021: Encoding enc;
1022:
1023: enc.rows = 126;
1024: enc.cols = 96;
1025: enc.row_byte = row_byte_gbk2;
1026: enc.col_byte = col_byte_gbk2;
1027: enc.byte_row = byte_row_gbk2;
1028: enc.byte_col = byte_col_gbk2;
1029: enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
1030: enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xa1)";
1031: enc.byte_row_expr = "%1$s - 0x81";
1032: enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
1033:
1034: read_table(&enc);
1035: output_charset2uni(name,&enc);
1036: }
1037:
1038: static void do_gbk1_only_uni2charset (const char* name)
1039: {
1040: Encoding enc;
1041:
1042: enc.rows = 126;
1043: enc.cols = 190;
1044: enc.row_byte = row_byte_gbk1;
1045: enc.col_byte = col_byte_gbk1;
1046: enc.byte_row = byte_row_gbk1;
1047: enc.byte_col = byte_col_gbk1;
1048: enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff";
1049: enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)";
1050: enc.byte_row_expr = "%1$s - 0x81";
1051: enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
1052:
1053: read_table(&enc);
1054: invert(&enc); output_uni2charset_sparse(name,&enc,false);
1055: }
1056:
1057: /* KSC 5601 specifics */
1058:
1059: /*
1060: * Reads the charset2uni table from standard input.
1061: */
1062: static void read_table_ksc5601 (Encoding* enc)
1063: {
1064: int row, col, i, i1, i2, c, j;
1065:
1066: enc->charset2uni = (int**) malloc(enc->rows*sizeof(int*));
1067: for (row = 0; row < enc->rows; row++)
1068: enc->charset2uni[row] = (int*) malloc(enc->cols*sizeof(int));
1069:
1070: for (row = 0; row < enc->rows; row++)
1071: for (col = 0; col < enc->cols; col++)
1072: enc->charset2uni[row][col] = 0xfffd;
1073:
1074: c = getc(stdin);
1075: ungetc(c,stdin);
1076: if (c == '#') {
1077: /* Read a unicode.org style .TXT file. */
1078: for (;;) {
1079: c = getc(stdin);
1080: if (c == EOF)
1081: break;
1082: if (c == '\n' || c == ' ' || c == '\t')
1083: continue;
1084: if (c == '#') {
1085: do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
1086: continue;
1087: }
1088: ungetc(c,stdin);
1089: if (scanf("0x%x", &j) != 1)
1090: exit(1);
1091: i1 = j >> 8;
1092: i2 = j & 0xff;
1093: if (scanf(" 0x%x", &j) != 1)
1094: exit(1);
1095: /* Take only the range covered by KS C 5601.1987-0 = KS C 5601.1989-0
1096: = KS X 1001.1992, ignore the rest. */
1097: if (!(i1 >= 128+33 && i1 < 128+127 && i2 >= 128+33 && i2 < 128+127))
1098: continue; /* KSC5601 specific */
1099: i1 &= 0x7f; /* KSC5601 specific */
1100: i2 &= 0x7f; /* KSC5601 specific */
1101: row = enc->byte_row(i1);
1102: col = enc->byte_col(i2);
1103: if (row < 0 || col < 0) {
1104: fprintf(stderr, "lost entry for %02x %02x\n", i1, i2);
1105: exit(1);
1106: }
1107: enc->charset2uni[row][col] = j;
1108: }
1109: } else {
1110: /* Read a table of hexadecimal Unicode values. */
1111: for (i1 = 33; i1 < 127; i1++)
1112: for (i2 = 33; i2 < 127; i2++) {
1113: i = scanf("%x", &j);
1114: if (i == EOF)
1115: goto read_done;
1116: if (i != 1)
1117: exit(1);
1118: if (j < 0 || j == 0xffff)
1119: j = 0xfffd;
1120: if (j != 0xfffd) {
1121: if (enc->byte_row(i1) < 0 || enc->byte_col(i2) < 0) {
1122: fprintf(stderr, "lost entry at %02x %02x\n", i1, i2);
1123: exit (1);
1124: }
1125: enc->charset2uni[enc->byte_row(i1)][enc->byte_col(i2)] = j;
1126: }
1127: }
1128: read_done: ;
1129: }
1130: }
1131:
1132: static void do_ksc5601 (const char* name)
1133: {
1134: Encoding enc;
1135:
1136: enc.rows = 94;
1137: enc.cols = 94;
1138: enc.row_byte = row_byte_normal;
1139: enc.col_byte = col_byte_normal;
1140: enc.byte_row = byte_row_normal;
1141: enc.byte_col = byte_col_normal;
1142: enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
1143: enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
1144: enc.byte_row_expr = "%1$s - 0x21";
1145: enc.byte_col_expr = "%1$s - 0x21";
1146:
1147: read_table_ksc5601(&enc);
1148: output_charset2uni(name,&enc);
1149: invert(&enc); output_uni2charset_sparse(name,&enc,false);
1150: }
1151:
1152: /* UHC specifics */
1153:
1154: /* UHC part 1: 0x{81..A0}{41..5A,61..7A,81..FE} */
1155:
/* Row index -> row byte: row 0 corresponds to byte 0x81. */
static int row_byte_uhc_1 (int row)
{
  const int first_row_byte = 0x81;

  return first_row_byte + row;
}
/* Column index -> column byte.  The columns are split into three runs:
   0..0x19 start at byte 0x41, 0x1a..0x33 start at byte 0x61, and
   0x34 onwards start at byte 0x81. */
static int col_byte_uhc_1 (int col)
{
  int offset;

  if (col >= 0x34)
    offset = 0x4d;
  else if (col >= 0x1a)
    offset = 0x47;
  else
    offset = 0x41;
  return offset + col;
}
/* Row byte -> row index.  Valid row bytes are 0x81..0xA0; anything else
   yields -1. */
static int byte_row_uhc_1 (int byte)
{
  if (byte < 0x81 || byte >= 0xa1)
    return -1;
  return byte - 0x81;
}
/* Column byte -> column index.  Accepts the three byte runs 0x41..0x5A,
   0x61..0x7A and 0x81..0xFE, which map to consecutive column indices;
   every other byte yields -1. */
static int byte_col_uhc_1 (int byte)
{
  int offset;

  if (byte >= 0x41 && byte < 0x5b)
    offset = 0x41;
  else if (byte >= 0x61 && byte < 0x7b)
    offset = 0x47;
  else if (byte >= 0x81 && byte < 0xff)
    offset = 0x4d;
  else
    return -1;
  return byte - offset;
}
1178:
1179: static void do_uhc_1 (const char* name)
1180: {
1181: Encoding enc;
1182:
1183: enc.rows = 32;
1184: enc.cols = 178;
1185: enc.row_byte = row_byte_uhc_1;
1186: enc.col_byte = col_byte_uhc_1;
1187: enc.byte_row = byte_row_uhc_1;
1188: enc.byte_col = byte_col_uhc_1;
1189: enc.check_row_expr = "(%1$s >= 0x81 && %1$s < 0xa1)";
1190: enc.check_col_expr = "(%1$s >= 0x41 && %1$s < 0x5b) || (%1$s >= 0x61 && %1$s < 0x7b) || (%1$s >= 0x81 && %1$s < 0xff)";
1191: enc.byte_row_expr = "%1$s - 0x81";
1192: enc.byte_col_expr = "%1$s - (%1$s >= 0x81 ? 0x4d : %1$s >= 0x61 ? 0x47 : 0x41)";
1193:
1194: read_table(&enc);
1195: output_charset2uni_noholes_monotonic(name,&enc);
1196: invert(&enc); output_uni2charset_sparse(name,&enc,true);
1197: }
1198:
1199: /* UHC part 2: 0x{A1..C6}{41..5A,61..7A,81..A0} */
1200:
/* Row index -> row byte: row 0 corresponds to byte 0xA1. */
static int row_byte_uhc_2 (int row)
{
  const int first_row_byte = 0xa1;

  return first_row_byte + row;
}
/* Column index -> column byte.  The columns are split into three runs:
   0..0x19 start at byte 0x41, 0x1a..0x33 start at byte 0x61, and
   0x34 onwards start at byte 0x81. */
static int col_byte_uhc_2 (int col)
{
  int offset;

  if (col >= 0x34)
    offset = 0x4d;
  else if (col >= 0x1a)
    offset = 0x47;
  else
    offset = 0x41;
  return offset + col;
}
/* Row byte -> row index.  Row bytes 0xA1..0xFE are accepted; anything
   else yields -1. */
static int byte_row_uhc_2 (int byte)
{
  if (byte < 0xa1 || byte >= 0xff)
    return -1;
  return byte - 0xa1;
}
/* Column byte -> column index.  Accepts the three byte runs 0x41..0x5A,
   0x61..0x7A and 0x81..0xA0, which map to consecutive column indices;
   every other byte yields -1. */
static int byte_col_uhc_2 (int byte)
{
  int offset;

  if (byte >= 0x41 && byte < 0x5b)
    offset = 0x41;
  else if (byte >= 0x61 && byte < 0x7b)
    offset = 0x47;
  else if (byte >= 0x81 && byte < 0xa1)
    offset = 0x4d;
  else
    return -1;
  return byte - offset;
}
1223:
1224: static void do_uhc_2 (const char* name)
1225: {
1226: Encoding enc;
1227:
1228: enc.rows = 94;
1229: enc.cols = 84;
1230: enc.row_byte = row_byte_uhc_2;
1231: enc.col_byte = col_byte_uhc_2;
1232: enc.byte_row = byte_row_uhc_2;
1233: enc.byte_col = byte_col_uhc_2;
1234: enc.check_row_expr = "(%1$s >= 0xa1 && %1$s < 0xff)";
1235: enc.check_col_expr = "(%1$s >= 0x41 && %1$s < 0x5b) || (%1$s >= 0x61 && %1$s < 0x7b) || (%1$s >= 0x81 && %1$s < 0xa1)";
1236: enc.byte_row_expr = "%1$s - 0xa1";
1237: enc.byte_col_expr = "%1$s - (%1$s >= 0x81 ? 0x4d : %1$s >= 0x61 ? 0x47 : 0x41)";
1238:
1239: read_table(&enc);
1240: output_charset2uni_noholes_monotonic(name,&enc);
1241: invert(&enc); output_uni2charset_sparse(name,&enc,true);
1242: }
1243:
1244: /* Big5 specifics */
1245:
/* Row index -> row byte: row 0 corresponds to byte 0xA1. */
static int row_byte_big5 (int row)
{
  const int first_row_byte = 0xa1;

  return first_row_byte + row;
}
/* Column index -> column byte.  Columns 0..0x3E map to bytes starting
   at 0x40; columns from 0x3F on map to bytes starting at 0xA1. */
static int col_byte_big5 (int col)
{
  if (col < 0x3f)
    return col + 0x40;
  return col + 0x62;
}
/* Row byte -> row index.  Valid row bytes are 0xA1..0xFE; anything else
   yields -1. */
static int byte_row_big5 (int byte)
{
  if (byte < 0xa1 || byte >= 0xff)
    return -1;
  return byte - 0xa1;
}
/* Column byte -> column index.  Bytes 0x40..0x7E map to 0..0x3E and
   bytes 0xA1..0xFE continue at 0x3F; every other byte yields -1. */
static int byte_col_big5 (int byte)
{
  if (byte >= 0xa1 && byte < 0xff)
    return byte - 0x62;
  if (byte >= 0x40 && byte < 0x7f)
    return byte - 0x40;
  return -1;
}
1266:
1267: static void do_big5 (const char* name)
1268: {
1269: Encoding enc;
1270:
1271: enc.rows = 94;
1272: enc.cols = 157;
1273: enc.row_byte = row_byte_big5;
1274: enc.col_byte = col_byte_big5;
1275: enc.byte_row = byte_row_big5;
1276: enc.byte_col = byte_col_big5;
1277: enc.check_row_expr = "%1$s >= 0xa1 && %1$s < 0xff";
1278: enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0xa1 && %1$s < 0xff)";
1279: enc.byte_row_expr = "%1$s - 0xa1";
1280: enc.byte_col_expr = "%1$s - (%1$s >= 0xa1 ? 0x62 : 0x40)";
1281:
1282: read_table(&enc);
1283: output_charset2uni(name,&enc);
1284: invert(&enc); output_uni2charset_sparse(name,&enc,false);
1285: }
1286:
1287: /* HKSCS specifics */
1288:
/* Row index -> row byte: row 0 corresponds to byte 0x80. */
static int row_byte_hkscs (int row)
{
  const int first_row_byte = 0x80;

  return first_row_byte + row;
}
/* Row byte -> row index.  Valid row bytes are 0x80..0xFE; anything else
   yields -1. */
static int byte_row_hkscs (int byte)
{
  if (byte < 0x80 || byte >= 0xff)
    return -1;
  return byte - 0x80;
}
1298:
1299: static void do_hkscs (const char* name)
1300: {
1301: Encoding enc;
1302:
1303: enc.rows = 128;
1304: enc.cols = 157;
1305: enc.row_byte = row_byte_hkscs;
1306: enc.col_byte = col_byte_big5;
1307: enc.byte_row = byte_row_hkscs;
1308: enc.byte_col = byte_col_big5;
1309: enc.check_row_expr = "%1$s >= 0x80 && %1$s < 0xff";
1310: enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0xa1 && %1$s < 0xff)";
1311: enc.byte_row_expr = "%1$s - 0x80";
1312: enc.byte_col_expr = "%1$s - (%1$s >= 0xa1 ? 0x62 : 0x40)";
1313:
1314: read_table(&enc);
1315: output_charset2uni(name,&enc);
1316: invert(&enc); output_uni2charset_sparse(name,&enc,false);
1317: }
1318:
1319: /* Johab Hangul specifics */
1320:
/* Row index -> row byte: row 0 corresponds to byte 0x84. */
static int row_byte_johab_hangul (int row)
{
  const int first_row_byte = 0x84;

  return first_row_byte + row;
}
/* Column index -> column byte.  Columns 0..0x3D map to bytes starting
   at 0x41; columns from 0x3E on map to bytes starting at 0x81. */
static int col_byte_johab_hangul (int col)
{
  if (col < 0x3e)
    return col + 0x41;
  return col + 0x43;
}
/* Row byte -> row index.  Valid row bytes are 0x84..0xD3; anything else
   yields -1. */
static int byte_row_johab_hangul (int byte)
{
  if (byte < 0x84 || byte >= 0xd4)
    return -1;
  return byte - 0x84;
}
/* Column byte -> column index.  Bytes 0x41..0x7E map to 0..0x3D and
   bytes 0x81..0xFE continue at 0x3E; every other byte yields -1. */
static int byte_col_johab_hangul (int byte)
{
  if (byte >= 0x81 && byte < 0xff)
    return byte - 0x43;
  if (byte >= 0x41 && byte < 0x7f)
    return byte - 0x41;
  return -1;
}
1341:
1342: static void do_johab_hangul (const char* name)
1343: {
1344: Encoding enc;
1345:
1346: enc.rows = 80;
1347: enc.cols = 188;
1348: enc.row_byte = row_byte_johab_hangul;
1349: enc.col_byte = col_byte_johab_hangul;
1350: enc.byte_row = byte_row_johab_hangul;
1351: enc.byte_col = byte_col_johab_hangul;
1352: enc.check_row_expr = "%1$s >= 0x84 && %1$s < 0xd4";
1353: enc.check_col_expr = "(%1$s >= 0x41 && %1$s < 0x7f) || (%1$s >= 0x81 && %1$s < 0xff)";
1354: enc.byte_row_expr = "%1$s - 0x84";
1355: enc.byte_col_expr = "%1$s - (%1$s >= 0x81 ? 0x43 : 0x41)";
1356:
1357: read_table(&enc);
1358: output_charset2uni(name,&enc);
1359: invert(&enc); output_uni2charset_dense(name,&enc);
1360: }
1361:
1362: /* SJIS specifics */
1363:
/* Row index -> row byte.  Rows 0..0x1E map to bytes starting at 0x81;
   rows from 0x1F on map to bytes starting at 0xE0. */
static int row_byte_sjis (int row)
{
  if (row < 0x1f)
    return row + 0x81;
  return row + 0xc1;
}
/* Column index -> column byte.  Columns below 0x3f start at byte 0x40;
   from column 0x3f on the result is shifted by one more, so that byte
   0x7f is never produced. */
static int col_byte_sjis (int col)
{
  if (col < 0x3f)
    return col + 0x40;
  return col + 0x41;
}
/* Row byte -> row index.  Bytes 0x81..0x9F map to 0..0x1E, and every
   byte from 0xE0 upwards (no upper bound is checked) continues at 0x1F;
   all other bytes yield -1. */
static int byte_row_sjis (int byte)
{
  if (byte >= 0xe0)
    return byte - 0xc1;
  if (byte >= 0x81 && byte < 0xa0)
    return byte - 0x81;
  return -1;
}
/* Column byte -> column index.  Bytes 0x40..0x7E map to 0..0x3E and
   bytes 0x80..0xFC continue at 0x3F; every other byte yields -1. */
static int byte_col_sjis (int byte)
{
  if (byte >= 0x80 && byte < 0xfd)
    return byte - 0x41;
  if (byte >= 0x40 && byte < 0x7f)
    return byte - 0x40;
  return -1;
}
1386:
1387: static void do_sjis (const char* name)
1388: {
1389: Encoding enc;
1390:
1391: enc.rows = 94;
1392: enc.cols = 188;
1393: enc.row_byte = row_byte_sjis;
1394: enc.col_byte = col_byte_sjis;
1395: enc.byte_row = byte_row_sjis;
1396: enc.byte_col = byte_col_sjis;
1397: enc.check_row_expr = "(%1$s >= 0x81 && %1$s < 0xa0) || (%1$s >= 0xe0)";
1398: enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xfd)";
1399: enc.byte_row_expr = "%1$s - (%1$s >= 0xe0 ? 0xc1 : 0x81)";
1400: enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)";
1401:
1402: read_table(&enc);
1403: output_charset2uni(name,&enc);
1404: invert(&enc); output_uni2charset_sparse(name,&enc,false);
1405: }
1406:
1407: /* GB18030 Unicode specifics */
1408:
/*
 * Generates the GB18030 four-byte <-> Unicode range tables and the
 * accompanying <name>_mbtowc / <name>_wctomb converter functions.
 *
 * Input (stdin): a unicode.org style .TXT file with lines of the form
 * "0x<b1b2b3b4> 0x<unicode>" and '#' comments.  Each code must satisfy
 * b1 in 0x81..0x84, b2 in 0x30..0x39, b3 in 0x81..0xFE, b4 in 0x30..0x39
 * and is linearized as
 *   i = (((b1-0x81)*10 + (b2-0x30))*126 + (b3-0x81))*10 + (b4-0x30).
 * The Unicode value must be below 0x10000.  A value of 0 is
 * indistinguishable from "unmapped", because 0 is the empty-slot marker.
 *
 * Output (stdout): C source containing the arrays <name>_charset2uni_ranges,
 * <name>_uni2charset_ranges, <name>_ranges and <name>_bitmap, followed by
 * the two converter functions.
 *
 * The process exits with status 1 on malformed input, when the mapping is
 * not made of monotonically increasing "i + diff" ranges, when more than
 * 256 ranges would be needed, or when the input contains no mapping at
 * all (the emitted code refers to the first and last range).
 */
static void do_gb18030uni (const char* name)
{
  int c;
  unsigned int bytes;
  unsigned int ucs; /* scanf's %x stores through an unsigned int pointer */
  int i1, i2, i3, i4, i, j, k;
  int charset2uni[4*10*126*10];
  struct { int low; int high; int diff; int total; } ranges[256];
  int ranges_count, ranges_total;

  for (i = 0; i < 4*10*126*10; i++)
    charset2uni[i] = 0;

  /* Read a unicode.org style .TXT file. */
  for (;;) {
    c = getc(stdin);
    if (c == EOF)
      break;
    if (c == '\n' || c == ' ' || c == '\t')
      continue;
    if (c == '#') {
      /* Skip the comment up to the end of the line. */
      do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
      continue;
    }
    ungetc(c,stdin);
    if (scanf("0x%x", &bytes) != 1)
      exit(1);
    i1 = (bytes >> 24) & 0xff;
    i2 = (bytes >> 16) & 0xff;
    i3 = (bytes >> 8) & 0xff;
    i4 = bytes & 0xff;
    if (!(i1 >= 0x81 && i1 <= 0x84
          && i2 >= 0x30 && i2 <= 0x39
          && i3 >= 0x81 && i3 <= 0xfe
          && i4 >= 0x30 && i4 <= 0x39)) {
      fprintf(stderr, "lost entry for %02x %02x %02x %02x\n", i1, i2, i3, i4);
      exit(1);
    }
    i = (((i1-0x81) * 10 + (i2-0x30)) * 126 + (i3-0x81)) * 10 + (i4-0x30);
    if (scanf(" 0x%x", &ucs) != 1)
      exit(1);
    if (!(ucs < 0x10000))
      exit(1);
    j = (int) ucs;
    charset2uni[i] = j;
  }

  /* Verify that the mapping i -> j is monotonically increasing and
     of the form
        low[k] <= i <= high[k]  =>  j = diff[k] + i
     with a set of disjoint intervals (low[k], high[k]). */
  ranges_count = 0;
  for (i = 0; i < 4*10*126*10; i++)
    if (charset2uni[i] != 0) {
      int diff;
      j = charset2uni[i];
      diff = j - i;
      if (ranges_count > 0) {
        if (!(i > ranges[ranges_count-1].high))
          exit(1);
        if (!(j > ranges[ranges_count-1].high + ranges[ranges_count-1].diff))
          exit(1);
        /* Additional property: The diffs are also increasing. */
        if (!(diff >= ranges[ranges_count-1].diff))
          exit(1);
      }
      if (ranges_count > 0 && diff == ranges[ranges_count-1].diff)
        ranges[ranges_count-1].high = i;
      else {
        if (ranges_count == 256)
          exit(1);
        ranges[ranges_count].low = i;
        ranges[ranges_count].high = i;
        ranges[ranges_count].diff = diff;
        ranges_count++;
      }
    }

  /* The code emitted below reads ranges[0] and ranges[ranges_count-1];
     with no ranges at all that would access uninitialized and
     out-of-bounds memory, so refuse an empty table. */
  if (ranges_count == 0) {
    fprintf(stderr, "no mappings found in input\n");
    exit(1);
  }

  /* Determine size of bitmap. */
  ranges_total = 0;
  for (k = 0; k < ranges_count; k++) {
    ranges[k].total = ranges_total;
    ranges_total += ranges[k].high - ranges[k].low + 1;
  }

  printf("static const unsigned short %s_charset2uni_ranges[%d] = {\n", name, 2*ranges_count);
  for (k = 0; k < ranges_count; k++) {
    printf(" 0x%04x, 0x%04x", ranges[k].low, ranges[k].high);
    if (k+1 < ranges_count) printf(",");
    if ((k % 4) == 3 && k+1 < ranges_count) printf("\n");
  }
  printf("\n");
  printf("};\n");

  printf("\n");

  printf("static const unsigned short %s_uni2charset_ranges[%d] = {\n", name, 2*ranges_count);
  for (k = 0; k < ranges_count; k++) {
    printf(" 0x%04x, 0x%04x", ranges[k].low + ranges[k].diff, ranges[k].high + ranges[k].diff);
    if (k+1 < ranges_count) printf(",");
    if ((k % 4) == 3 && k+1 < ranges_count) printf("\n");
  }
  printf("\n");
  printf("};\n");

  printf("\n");

  printf("static const struct { unsigned short diff; unsigned short bitmap_offset; } %s_ranges[%d] = {\n ", name, ranges_count);
  for (k = 0; k < ranges_count; k++) {
    printf(" { %5d, 0x%04x }", ranges[k].diff, ranges[k].total);
    if (k+1 < ranges_count) printf(",");
    if ((k % 4) == 3 && k+1 < ranges_count) printf("\n ");
  }
  printf("\n");
  printf("};\n");

  printf("\n");

  /* One bit per slot of every range, packed LSB first; the bit index
     runs continuously across the ranges (ranges[k].total is the index of
     the first bit of range k). */
  printf("static const unsigned char %s_bitmap[%d] = {\n ", name, (ranges_total + 7) / 8);
  {
    int accu = 0;
    for (k = 0; k < ranges_count; k++) {
      for (i = ranges[k].total; i <= ranges[k].total + (ranges[k].high - ranges[k].low);) {
        if (charset2uni[i - ranges[k].total + ranges[k].low] != 0)
          accu |= (1 << (i % 8));
        i++;
        if ((i % 8) == 0) {
          printf(" 0x%02x", accu);
          if ((i / 8) < (ranges_total + 7) / 8) printf(",");
          if (((i / 8) % 12) == 0)
            printf("\n ");
          accu = 0;
        }
      }
      /* Consistency check: the next range must start where this one ended. */
      if (i != (k+1 < ranges_count ? ranges[k+1].total : ranges_total)) abort();
    }
    if ((ranges_total % 8) != 0)
      printf(" 0x%02x", accu);
    printf("\n");
  }
  printf("};\n");

  printf("\n");

  /* Emit the multibyte -> wide character converter. */
  printf("static int\n");
  printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, size_t n)\n", name);
  printf("{\n");
  printf(" unsigned char c1 = s[0];\n");
  printf(" if (c1 >= 0x81 && c1 <= 0x84) {\n");
  printf(" if (n >= 2) {\n");
  printf(" unsigned char c2 = s[1];\n");
  printf(" if (c2 >= 0x30 && c2 <= 0x39) {\n");
  printf(" if (n >= 3) {\n");
  printf(" unsigned char c3 = s[2];\n");
  printf(" if (c3 >= 0x81 && c3 <= 0xfe) {\n");
  printf(" if (n >= 4) {\n");
  printf(" unsigned char c4 = s[3];\n");
  printf(" if (c4 >= 0x30 && c4 <= 0x39) {\n");
  printf(" unsigned int i = (((c1 - 0x81) * 10 + (c2 - 0x30)) * 126 + (c3 - 0x81)) * 10 + (c4 - 0x30);\n");
  printf(" if (i >= %d && i <= %d) {\n", ranges[0].low, ranges[ranges_count-1].high);
  printf(" unsigned int k1 = 0;\n");
  printf(" unsigned int k2 = %d;\n", ranges_count-1);
  printf(" while (k1 < k2) {\n");
  printf(" unsigned int k = (k1 + k2) / 2;\n");
  printf(" if (i <= %s_charset2uni_ranges[2*k+1])\n", name);
  printf(" k2 = k;\n");
  printf(" else if (i >= %s_charset2uni_ranges[2*k+2])\n", name);
  printf(" k1 = k + 1;\n");
  printf(" else\n");
  printf(" return RET_ILSEQ;\n");
  printf(" }\n");
  printf(" {\n");
  printf(" unsigned int bitmap_index = i - %s_charset2uni_ranges[2*k1] + %s_ranges[k1].bitmap_offset;\n", name, name);
  printf(" if ((%s_bitmap[bitmap_index >> 3] >> (bitmap_index & 7)) & 1) {\n", name);
  printf(" unsigned int diff = %s_ranges[k1].diff;\n", name);
  printf(" *pwc = (ucs4_t) (i + diff);\n");
  printf(" return 4;\n");
  printf(" }\n");
  printf(" }\n");
  printf(" }\n");
  printf(" }\n");
  printf(" return RET_ILSEQ;\n");
  printf(" }\n");
  printf(" return RET_TOOFEW(0);\n");
  printf(" }\n");
  printf(" return RET_ILSEQ;\n");
  printf(" }\n");
  printf(" return RET_TOOFEW(0);\n");
  printf(" }\n");
  printf(" return RET_ILSEQ;\n");
  printf(" }\n");
  printf(" return RET_TOOFEW(0);\n");
  printf(" }\n");
  printf(" return RET_ILSEQ;\n");
  printf("}\n");

  printf("\n");

  /* Emit the wide character -> multibyte converter. */
  printf("static int\n");
  printf("%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)\n", name);
  printf("{\n");
  printf(" if (n >= 4) {\n");
  printf(" unsigned int i = wc;\n");
  printf(" if (i >= 0x%04x && i <= 0x%04x) {\n", ranges[0].low + ranges[0].diff, ranges[ranges_count-1].high + ranges[ranges_count-1].diff);
  printf(" unsigned int k1 = 0;\n");
  printf(" unsigned int k2 = %d;\n", ranges_count-1);
  printf(" while (k1 < k2) {\n");
  printf(" unsigned int k = (k1 + k2) / 2;\n");
  printf(" if (i <= %s_uni2charset_ranges[2*k+1])\n", name);
  printf(" k2 = k;\n");
  printf(" else if (i >= %s_uni2charset_ranges[2*k+2])\n", name);
  printf(" k1 = k + 1;\n");
  printf(" else\n");
  printf(" return RET_ILUNI;\n");
  printf(" }\n");
  printf(" {\n");
  printf(" unsigned int bitmap_index = i - %s_uni2charset_ranges[2*k1] + %s_ranges[k1].bitmap_offset;\n", name, name);
  printf(" if ((%s_bitmap[bitmap_index >> 3] >> (bitmap_index & 7)) & 1) {\n", name);
  printf(" unsigned int diff = %s_ranges[k1].diff;\n", name);
  printf(" i -= diff;\n");
  printf(" r[3] = (i %% 10) + 0x30; i = i / 10;\n");
  printf(" r[2] = (i %% 126) + 0x81; i = i / 126;\n");
  printf(" r[1] = (i %% 10) + 0x30; i = i / 10;\n");
  printf(" r[0] = i + 0x81;\n");
  printf(" return 4;\n");
  printf(" }\n");
  printf(" }\n");
  printf(" }\n");
  printf(" return RET_ILUNI;\n");
  printf(" }\n");
  printf(" return RET_TOOSMALL;\n");
  printf("}\n");
}
1644:
1645: /* JISX0213 specifics */
1646:
1647: static void do_jisx0213 (const char* name)
1648: {
1649: printf("#ifndef _JISX0213_H\n");
1650: printf("#define _JISX0213_H\n");
1651: printf("\n");
1652: printf("/* JISX0213 plane 1 (= ISO-IR-233) characters are in the range\n");
1653: printf(" 0x{21..7E}{21..7E}.\n");
1654: printf(" JISX0213 plane 2 (= ISO-IR-229) characters are in the range\n");
1655: printf(" 0x{21,23..25,28,2C..2F,6E..7E}{21..7E}.\n");
1656: printf(" Together this makes 120 rows of 94 characters.\n");
1657: printf("*/\n");
1658: printf("\n");
1659: {
1660: #define row_convert(row) \
1661: ((row) >= 0x121 && (row) <= 0x17E ? row-289 : /* 0..93 */ \
1662: (row) == 0x221 ? row-451 : /* 94 */ \
1663: (row) >= 0x223 && (row) <= 0x225 ? row-452 : /* 95..97 */ \
1664: (row) == 0x228 ? row-454 : /* 98 */ \
1665: (row) >= 0x22C && (row) <= 0x22F ? row-457 : /* 99..102 */ \
1666: (row) >= 0x26E && (row) <= 0x27E ? row-519 : /* 103..119 */ \
1667: -1)
1668: unsigned int table[120][94];
1669: int pagemin[0x1100];
1670: int pagemax[0x1100];
1671: int pageidx[0x1100];
1672: unsigned int pagestart[0x1100];
1673: unsigned int pagestart_len = 0;
1674: {
1675: unsigned int rowc, colc;
1676: for (rowc = 0; rowc < 120; rowc++)
1677: for (colc = 0; colc < 94; colc++)
1678: table[rowc][colc] = 0;
1679: }
1680: {
1681: unsigned int page;
1682: for (page = 0; page < 0x1100; page++)
1683: pagemin[page] = -1;
1684: for (page = 0; page < 0x1100; page++)
1685: pagemax[page] = -1;
1686: for (page = 0; page < 0x1100; page++)
1687: pageidx[page] = -1;
1688: }
1689: printf("static const unsigned short jisx0213_to_ucs_combining[][2] = {\n");
1690: {
1691: int private_use = 0x0001;
1692: for (;;) {
1693: char line[30];
1694: unsigned int row, col;
1695: unsigned int ucs;
1696: memset(line,0,sizeof(line));
1697: if (scanf("%[^\n]\n",line) < 1)
1698: break;
1699: assert(line[0]=='0');
1700: assert(line[1]=='x');
1701: assert(isxdigit(line[2]));
1702: assert(isxdigit(line[3]));
1703: assert(isxdigit(line[4]));
1704: assert(isxdigit(line[5]));
1705: assert(isxdigit(line[6]));
1706: assert(line[7]=='\t');
1707: line[7] = '\0';
1708: col = strtoul(&line[5],NULL,16);
1709: line[5] = '\0';
1710: row = strtoul(&line[2],NULL,16);
1711: if (line[20] != '\0' && line[21] == '\0') {
1712: unsigned int u1, u2;
1713: assert(line[8]=='0');
1714: assert(line[9]=='x');
1715: assert(isxdigit(line[10]));
1716: assert(isxdigit(line[11]));
1717: assert(isxdigit(line[12]));
1718: assert(isxdigit(line[13]));
1719: assert(line[14]==' ');
1720: assert(line[15]=='0');
1721: assert(line[16]=='x');
1722: assert(isxdigit(line[17]));
1723: assert(isxdigit(line[18]));
1724: assert(isxdigit(line[19]));
1725: assert(isxdigit(line[20]));
1726: u2 = strtoul(&line[17],NULL,16);
1727: line[14] = '\0';
1728: u1 = strtoul(&line[10],NULL,16);
1729: printf(" { 0x%04x, 0x%04x },\n", u1, u2);
1730: ucs = private_use++;
1731: } else {
1732: assert(line[8]=='0');
1733: assert(line[9]=='x');
1734: assert(isxdigit(line[10]));
1735: assert(isxdigit(line[11]));
1736: assert(isxdigit(line[12]));
1737: assert(isxdigit(line[13]));
1738: ucs = strtoul(&line[10],NULL,16);
1739: }
1740: assert((unsigned int) row_convert(row) < 120);
1741: assert((unsigned int) (col-0x21) < 94);
1742: table[row_convert(row)][col-0x21] = ucs;
1743: }
1744: }
1745: printf("};\n");
1746: printf("\n");
1747: {
1748: unsigned int rowc, colc;
1749: for (rowc = 0; rowc < 120; rowc++) {
1750: for (colc = 0; colc < 94; colc++) {
1751: unsigned int value = table[rowc][colc];
1752: unsigned int page = value >> 8;
1753: unsigned int rest = value & 0xff;
1754: if (pagemin[page] < 0 || pagemin[page] > rest) pagemin[page] = rest;
1755: if (pagemax[page] < 0 || pagemax[page] < rest) pagemax[page] = rest;
1756: }
1757: }
1758: }
1759: {
1760: unsigned int index = 0;
1761: unsigned int i;
1762: for (i = 0; i < 0x1100; ) {
1763: if (pagemin[i] >= 0) {
1764: if (pagemin[i+1] >= 0 && pagemin[i] >= 0x80 && pagemax[i+1] < 0x80) {
1765: /* Combine two pages into a single one. */
1766: assert(pagestart_len < sizeof(pagestart)/sizeof(pagestart[0]));
1767: pagestart[pagestart_len++] = (i<<8)+0x80;
1768: pageidx[i] = index;
1769: pageidx[i+1] = index;
1770: index++;
1771: i += 2;
1772: } else {
1773: /* A single page. */
1774: assert(pagestart_len < sizeof(pagestart)/sizeof(pagestart[0]));
1775: pagestart[pagestart_len++] = i<<8;
1776: pageidx[i] = index;
1777: index++;
1778: i += 1;
1779: }
1780: } else
1781: i++;
1782: }
1783: }
1784: printf("static const unsigned short jisx0213_to_ucs_main[120 * 94] = {\n");
1785: {
1786: unsigned int row;
1787: for (row = 0; row < 0x300; row++) {
1788: unsigned int rowc = row_convert(row);
1789: if (rowc != (unsigned int) (-1)) {
1790: printf(" /* 0x%X21..0x%X7E */\n",row,row);
1791: {
1792: unsigned int count = 0;
1793: unsigned int colc;
1794: for (colc = 0; colc < 94; colc++) {
1795: if ((count % 8) == 0) printf(" ");
1796: {
1797: unsigned int value = table[rowc][colc];
1798: unsigned int page = value >> 8;
1799: unsigned int index = pageidx[page];
1800: assert(value-pagestart[index] < 0x100);
1801: printf(" 0x%04x,",(index<<8)|(value-pagestart[index]));
1802: }
1803: count++;
1804: if ((count % 8) == 0) printf("\n");
1805: }
1806: }
1807: printf("\n");
1808: }
1809: }
1810: }
1811: printf("};\n");
1812: printf("\n");
1813: printf("static const ucs4_t jisx0213_to_ucs_pagestart[] = {\n");
1814: {
1815: unsigned int count = 0;
1816: unsigned int i;
1817: for (i = 0; i < pagestart_len; i++) {
1818: char buf[10];
1819: if ((count % 8) == 0) printf(" ");
1820: printf(" ");
1821: sprintf(buf,"0x%04x",pagestart[i]);
1.1.1.2 misho 1822: if (strlen(buf) < 7) printf("%*s",(int)(7-strlen(buf)),"");
1.1 misho 1823: printf("%s,",buf);
1824: count++;
1825: if ((count % 8) == 0) printf("\n");
1826: }
1827: }
1828: printf("\n");
1829: printf("};\n");
1830: #undef row_convert
1831: }
1832: rewind(stdin);
1833: printf("\n");
1834: {
1835: int table[0x110000];
1836: bool pages[0x4400];
1837: int maxpage = -1;
1838: unsigned int combining_prefixes[100];
1839: unsigned int combining_prefixes_len = 0;
1840: {
1841: unsigned int i;
1842: for (i = 0; i < 0x110000; i++)
1843: table[i] = -1;
1844: for (i = 0; i < 0x4400; i++)
1845: pages[i] = false;
1846: }
1847: for (;;) {
1848: char line[30];
1849: unsigned int plane, row, col;
1850: memset(line,0,sizeof(line));
1851: if (scanf("%[^\n]\n",line) < 1)
1852: break;
1853: assert(line[0]=='0');
1854: assert(line[1]=='x');
1855: assert(isxdigit(line[2]));
1856: assert(isxdigit(line[3]));
1857: assert(isxdigit(line[4]));
1858: assert(isxdigit(line[5]));
1859: assert(isxdigit(line[6]));
1860: assert(line[7]=='\t');
1861: line[7] = '\0';
1862: col = strtoul(&line[5],NULL,16);
1863: line[5] = '\0';
1864: row = strtoul(&line[3],NULL,16);
1865: line[3] = '\0';
1866: plane = strtoul(&line[2],NULL,16) - 1;
1867: if (line[20] != '\0' && line[21] == '\0') {
1868: unsigned int u1, u2;
1869: assert(line[8]=='0');
1870: assert(line[9]=='x');
1871: assert(isxdigit(line[10]));
1872: assert(isxdigit(line[11]));
1873: assert(isxdigit(line[12]));
1874: assert(isxdigit(line[13]));
1875: assert(line[14]==' ');
1876: assert(line[15]=='0');
1877: assert(line[16]=='x');
1878: assert(isxdigit(line[17]));
1879: assert(isxdigit(line[18]));
1880: assert(isxdigit(line[19]));
1881: assert(isxdigit(line[20]));
1882: u2 = strtoul(&line[17],NULL,16);
1883: line[14] = '\0';
1884: u1 = strtoul(&line[10],NULL,16);
1885: assert(u2 == 0x02E5 || u2 == 0x02E9 || u2 == 0x0300 || u2 == 0x0301
1886: || u2 == 0x309A);
1887: assert(combining_prefixes_len < sizeof(combining_prefixes)/sizeof(combining_prefixes[0]));
1888: combining_prefixes[combining_prefixes_len++] = u1;
1889: } else {
1890: unsigned int ucs;
1891: assert(line[8]=='0');
1892: assert(line[9]=='x');
1893: assert(isxdigit(line[10]));
1894: assert(isxdigit(line[11]));
1895: assert(isxdigit(line[12]));
1896: assert(isxdigit(line[13]));
1897: ucs = strtoul(&line[10],NULL,16);
1898: /* Add an entry. */
1899: assert(plane <= 1);
1900: assert(row <= 0x7f);
1901: assert(col <= 0x7f);
1902: table[ucs] = (plane << 15) | (row << 8) | col;
1903: pages[ucs>>6] = true;
1904: if (maxpage < 0 || (ucs>>6) > maxpage) maxpage = ucs>>6;
1905: }
1906: }
1907: {
1908: unsigned int i;
1909: for (i = 0; i < combining_prefixes_len; i++) {
1910: unsigned int u1 = combining_prefixes[i];
1911: assert(table[u1] >= 0);
1912: table[u1] |= 0x0080;
1913: }
1914: }
1915: printf("static const short jisx0213_from_ucs_level1[%d] = {\n",maxpage+1);
1916: {
1917: unsigned int index = 0;
1918: unsigned int i;
1919: for (i = 0; i <= maxpage; i++) {
1920: if ((i % 8) == 0) printf(" ");
1921: if (pages[i]) {
1922: printf(" %3u,",index);
1923: index++;
1924: } else {
1925: printf(" %3d,",-1);
1926: }
1927: if (((i+1) % 8) == 0) printf("\n");
1928: }
1929: }
1930: printf("\n");
1931: printf("};\n");
1932: printf("\n");
1933: #if 0 /* Dense array */
1934: printf("static const unsigned short jisx0213_from_ucs_level2[] = {\n");
1935: {
1936: unsigned int i;
1937: for (i = 0; i <= maxpage; i++) {
1938: if (pages[i]) {
1939: printf(" /* 0x%04X */\n",i<<6);
1940: {
1941: unsigned int j;
1942: for (j = 0; j < 0x40; ) {
1943: unsigned int ucs = (i<<6)+j;
1944: int value = table[ucs];
1945: if (value < 0) value = 0;
1946: if ((j % 8) == 0) printf(" ");
1947: printf(" 0x%04x,",value);
1948: j++;
1949: if ((j % 8) == 0) printf("\n");
1950: }
1951: }
1952: }
1953: }
1954: }
1955: printf("};\n");
1956: #else /* Sparse array */
1957: {
1958: int summary_indx[0x11000];
1959: int summary_used[0x11000];
1960: unsigned int i, k, indx;
1961: printf("static const unsigned short jisx0213_from_ucs_level2_data[] = {\n");
1962: /* Fill summary_indx[] and summary_used[]. */
1963: indx = 0;
1964: for (i = 0, k = 0; i <= maxpage; i++) {
1965: if (pages[i]) {
1966: unsigned int j1, j2;
1967: unsigned int count = 0;
1968: printf(" /* 0x%04X */\n",i<<6);
1969: for (j1 = 0; j1 < 4; j1++) {
1970: summary_indx[4*k+j1] = indx;
1971: summary_used[4*k+j1] = 0;
1972: for (j2 = 0; j2 < 16; j2++) {
1973: unsigned int j = 16*j1+j2;
1974: unsigned int ucs = (i<<6)+j;
1975: int value = table[ucs];
1976: if (value < 0) value = 0;
1977: if (value > 0) {
1978: summary_used[4*k+j1] |= (1 << j2);
1979: if ((count % 8) == 0) printf(" ");
1980: printf(" 0x%04x,",value);
1981: count++;
1982: if ((count % 8) == 0) printf("\n");
1983: indx++;
1984: }
1985: }
1986: }
1987: if ((count % 8) > 0)
1988: printf("\n");
1989: k++;
1990: }
1991: }
1992: printf("};\n");
1993: printf("\n");
1994: printf("static const Summary16 jisx0213_from_ucs_level2_2indx[] = {\n");
1995: for (i = 0, k = 0; i <= maxpage; i++) {
1996: if (pages[i]) {
1997: unsigned int j1;
1998: printf(" /* 0x%04X */\n",i<<6);
1999: printf(" ");
2000: for (j1 = 0; j1 < 4; j1++) {
2001: printf(" { %4d, 0x%04x },", summary_indx[4*k+j1], summary_used[4*k+j1]);
2002: }
2003: printf("\n");
2004: k++;
2005: }
2006: }
2007: printf("};\n");
2008: }
2009: #endif
2010: printf("\n");
2011: }
2012: printf("#ifdef __GNUC__\n");
2013: printf("__inline\n");
2014: printf("#else\n");
2015: printf("#ifdef __cplusplus\n");
2016: printf("inline\n");
2017: printf("#endif\n");
2018: printf("#endif\n");
2019: printf("static ucs4_t jisx0213_to_ucs4 (unsigned int row, unsigned int col)\n");
2020: printf("{\n");
2021: printf(" ucs4_t val;\n");
2022: printf("\n");
2023: printf(" if (row >= 0x121 && row <= 0x17e)\n");
2024: printf(" row -= 289;\n");
2025: printf(" else if (row == 0x221)\n");
2026: printf(" row -= 451;\n");
2027: printf(" else if (row >= 0x223 && row <= 0x225)\n");
2028: printf(" row -= 452;\n");
2029: printf(" else if (row == 0x228)\n");
2030: printf(" row -= 454;\n");
2031: printf(" else if (row >= 0x22c && row <= 0x22f)\n");
2032: printf(" row -= 457;\n");
2033: printf(" else if (row >= 0x26e && row <= 0x27e)\n");
2034: printf(" row -= 519;\n");
2035: printf(" else\n");
2036: printf(" return 0x0000;\n");
2037: printf("\n");
2038: printf(" if (col >= 0x21 && col <= 0x7e)\n");
2039: printf(" col -= 0x21;\n");
2040: printf(" else\n");
2041: printf(" return 0x0000;\n");
2042: printf("\n");
2043: printf(" val = jisx0213_to_ucs_main[row * 94 + col];\n");
2044: printf(" val = jisx0213_to_ucs_pagestart[val >> 8] + (val & 0xff);\n");
2045: printf(" if (val == 0xfffd)\n");
2046: printf(" val = 0x0000;\n");
2047: printf(" return val;\n");
2048: printf("}\n");
2049: printf("\n");
2050: printf("#ifdef __GNUC__\n");
2051: printf("__inline\n");
2052: printf("#else\n");
2053: printf("#ifdef __cplusplus\n");
2054: printf("inline\n");
2055: printf("#endif\n");
2056: printf("#endif\n");
2057: printf("static unsigned short ucs4_to_jisx0213 (ucs4_t ucs)\n");
2058: printf("{\n");
2059: printf(" if (ucs < (sizeof(jisx0213_from_ucs_level1)/sizeof(jisx0213_from_ucs_level1[0])) << 6) {\n");
2060: printf(" int index1 = jisx0213_from_ucs_level1[ucs >> 6];\n");
2061: printf(" if (index1 >= 0)");
2062: #if 0 /* Dense array */
2063: printf("\n");
2064: printf(" return jisx0213_from_ucs_level2[(index1 << 6) + (ucs & 0x3f)];\n");
2065: #else /* Sparse array */
2066: printf(" {\n");
2067: printf(" const Summary16 *summary = &jisx0213_from_ucs_level2_2indx[((index1 << 6) + (ucs & 0x3f)) >> 4];\n");
2068: printf(" unsigned short used = summary->used;\n");
2069: printf(" unsigned int i = ucs & 0x0f;\n");
2070: printf(" if (used & ((unsigned short) 1 << i)) {\n");
1.1.1.3 ! misho 2071: printf(" /* Keep in 'used' only the bits 0..i-1. */\n");
1.1 misho 2072: printf(" used &= ((unsigned short) 1 << i) - 1;\n");
1.1.1.3 ! misho 2073: printf(" /* Add 'summary->indx' and the number of bits set in 'used'. */\n");
1.1 misho 2074: printf(" used = (used & 0x5555) + ((used & 0xaaaa) >> 1);\n");
2075: printf(" used = (used & 0x3333) + ((used & 0xcccc) >> 2);\n");
2076: printf(" used = (used & 0x0f0f) + ((used & 0xf0f0) >> 4);\n");
2077: printf(" used = (used & 0x00ff) + (used >> 8);\n");
2078: printf(" return jisx0213_from_ucs_level2_data[summary->indx + used];\n");
2079: printf(" };\n");
2080: printf(" };\n");
2081: #endif
2082: printf(" }\n");
2083: printf(" return 0x0000;\n");
2084: printf("}\n");
2085: printf("\n");
2086: printf("#endif /* _JISX0213_H */\n");
2087: }
2088:
2089: /* Main program */
2090:
/* Entry point.
   Usage: cjk_tab_to_h CHARSETNAME NAME < table > header
   argv[1] (CHARSETNAME) is handed to output_title(); argv[2] (NAME) selects
   which generator function runs and is passed to it unchanged.
   Exits with status 1, after printing a diagnostic on stderr, when the
   argument count is wrong or NAME is not a recognized table name.
   Returns 0 otherwise. */
int main (int argc, char *argv[])
{
  const char* progname;
  const char* charsetname;
  const char* name;

  /* argv[0] is not guaranteed to be present; fall back to a fixed name. */
  progname = (argc > 0 && argv[0] != NULL ? argv[0] : "cjk_tab_to_h");

  if (argc != 3) {
    fprintf(stderr, "Usage: %s charsetname name < table > header\n", progname);
    exit(1);
  }
  charsetname = argv[1];
  name = argv[2];

  /* The title is emitted before NAME is validated, so an unknown NAME still
     results in output_title() having run before the failure exit. */
  output_title(charsetname);

  if (strcmp(name,"gb2312") == 0
      || strcmp(name,"isoir165ext") == 0
      || strcmp(name,"gb12345ext") == 0
      || strcmp(name,"jisx0208") == 0
      || strcmp(name,"jisx0212") == 0) {
    do_normal(name);
  } else if (strcmp(name,"cns11643_1") == 0
             || strcmp(name,"cns11643_2") == 0
             || strcmp(name,"cns11643_3") == 0
             || strcmp(name,"cns11643_4a") == 0
             || strcmp(name,"cns11643_4b") == 0
             || strcmp(name,"cns11643_5") == 0
             || strcmp(name,"cns11643_6") == 0
             || strcmp(name,"cns11643_7") == 0
             || strcmp(name,"cns11643_15") == 0) {
    do_normal_only_charset2uni(name);
  } else if (strcmp(name,"cns11643_inv") == 0) {
    do_cns11643_only_uni2charset(name);
  } else if (strcmp(name,"gbkext1") == 0) {
    do_gbk1_only_charset2uni(name);
  } else if (strcmp(name,"gbkext2") == 0) {
    do_gbk2_only_charset2uni(name);
  } else if (strcmp(name,"gbkext_inv") == 0) {
    do_gbk1_only_uni2charset(name);
  } else if (strcmp(name,"cp936ext") == 0
             || strcmp(name,"gb18030ext") == 0) {
    do_gbk1(name);
  } else if (strcmp(name,"ksc5601") == 0) {
    do_ksc5601(name);
  } else if (strcmp(name,"uhc_1") == 0) {
    do_uhc_1(name);
  } else if (strcmp(name,"uhc_2") == 0) {
    do_uhc_2(name);
  } else if (strcmp(name,"big5") == 0
             || strcmp(name,"cp950ext") == 0) {
    do_big5(name);
  } else if (strcmp(name,"hkscs1999") == 0
             || strcmp(name,"hkscs2001") == 0
             || strcmp(name,"hkscs2004") == 0
             || strcmp(name,"hkscs2008") == 0) {
    do_hkscs(name);
  } else if (strcmp(name,"johab_hangul") == 0) {
    do_johab_hangul(name);
  } else if (strcmp(name,"cp932ext") == 0) {
    do_sjis(name);
  } else if (strcmp(name,"gb18030uni") == 0) {
    do_gb18030uni(name);
  } else if (strcmp(name,"jisx0213") == 0) {
    do_jisx0213(name);
  } else {
    /* Previously a silent exit(1); say which name was rejected. */
    fprintf(stderr, "%s: unknown table name '%s'\n", progname, name);
    exit(1);
  }

  return 0;
}
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>