Annotation of embedaddon/libiconv/tools/8bit_tab_to_h.c, revision 1.1.1.1
1.1 misho 1: /* Copyright (C) 1999-2002 Free Software Foundation, Inc.
2: This file is part of the GNU LIBICONV Tools.
3:
4: This program is free software: you can redistribute it and/or modify
5: it under the terms of the GNU General Public License as published by
6: the Free Software Foundation; either version 3 of the License, or
7: (at your option) any later version.
8:
9: This program is distributed in the hope that it will be useful,
10: but WITHOUT ANY WARRANTY; without even the implied warranty of
11: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12: GNU General Public License for more details.
13:
14: You should have received a copy of the GNU General Public License
15: along with this program. If not, see
16: <http://www.gnu.org/licenses/>. */
17:
18: /*
19: * Generates an 8-bit character set table from a .TXT table as found on
20: * ftp.unicode.org or from a table containing the 256 Unicode values as
21: * hexadecimal integers.
22: * Examples:
23: *
24: * ./8bit_tab_to_h ISO-8859-1 iso8859_1 < tab8859_1
25: * ./8bit_tab_to_h ISO-8859-2 iso8859_2 < tab8859_2
26: * ./8bit_tab_to_h ISO-8859-3 iso8859_3 < tab8859_3
27: * ./8bit_tab_to_h ISO-8859-4 iso8859_4 < tab8859_4
28: * ./8bit_tab_to_h ISO-8859-5 iso8859_5 < tab8859_5
29: * ./8bit_tab_to_h ISO-8859-6 iso8859_6 < tab8859_6
30: * ./8bit_tab_to_h ISO-8859-7 iso8859_7 < tab8859_7
31: * ./8bit_tab_to_h ISO-8859-8 iso8859_8 < tab8859_8
32: * ./8bit_tab_to_h ISO-8859-9 iso8859_9 < tab8859_9
33: * ./8bit_tab_to_h ISO-8859-10 iso8859_10 < tab8859_10
34: * ./8bit_tab_to_h ISO-8859-14 iso8859_14 < tab8859_14
35: * ./8bit_tab_to_h ISO-8859-15 iso8859_15 < tab8859_15
36: * ./8bit_tab_to_h JISX0201.1976-0 jisx0201 < jis0201
37: * ./8bit_tab_to_h TIS620.2533-1 tis620 < tabtis620
38: * ./8bit_tab_to_h KOI8-R koi8_r < tabkoi8_r
39: * ./8bit_tab_to_h KOI8-U koi8_u < tabkoi8_u
40: * ./8bit_tab_to_h ARMSCII-8 armscii_8 < tabarmscii_8
41: * ./8bit_tab_to_h CP1133 cp1133 < tabibm_cp1133
42: * ./8bit_tab_to_h MULELAO-1 mulelao < tabmulelao_1
43: * ./8bit_tab_to_h VISCII1.1-1 viscii1 < tabviscii
44: * ./8bit_tab_to_h TCVN-5712 tcvn < tabtcvn
45: * ./8bit_tab_to_h GEORGIAN-ACADEMY georgian_ac < tabgeorgian_academy
46: * ./8bit_tab_to_h GEORGIAN-PS georgian_ps < tabgeorgian_ps
47: *
48: * ./8bit_tab_to_h ISO-8859-1 iso8859_1 < 8859-1.TXT
49: * ./8bit_tab_to_h ISO-8859-2 iso8859_2 < 8859-2.TXT
50: * ./8bit_tab_to_h ISO-8859-3 iso8859_3 < 8859-3.TXT
51: * ./8bit_tab_to_h ISO-8859-4 iso8859_4 < 8859-4.TXT
52: * ./8bit_tab_to_h ISO-8859-5 iso8859_5 < 8859-5.TXT
53: * ./8bit_tab_to_h ISO-8859-6 iso8859_6 < 8859-6.TXT
54: * ./8bit_tab_to_h ISO-8859-7 iso8859_7 < 8859-7.TXT
55: * ./8bit_tab_to_h ISO-8859-8 iso8859_8 < 8859-8.TXT
56: * ./8bit_tab_to_h ISO-8859-9 iso8859_9 < 8859-9.TXT
57: * ./8bit_tab_to_h ISO-8859-10 iso8859_10 < 8859-10.TXT
58: * ./8bit_tab_to_h ISO-8859-14 iso8859_14 < 8859-14.TXT
59: * ./8bit_tab_to_h ISO-8859-15 iso8859_15 < 8859-15.TXT
60: * ./8bit_tab_to_h JISX0201.1976-0 jisx0201 < JIS0201.TXT
61: * ./8bit_tab_to_h KOI8-R koi8_r < KOI8-R.TXT
62: */
63:
64: #include <stdio.h>
65: #include <stdlib.h>
66: #include <stdbool.h>
67: #include <string.h>
68:
/* Print MSG on stderr, prefixed with the tool name, and exit with status 1. */
static void fail (const char *msg)
{
  fprintf(stderr, "8bit_tab_to_h: %s\n", msg);
  exit(1);
}

/*
 * Fill CHARSET2UNI (256 entries, indexed by byte value) from stdin.
 *
 * Two input formats are accepted, chosen by looking at the first byte:
 *  - first byte is '#': a unicode.org style .TXT file, i.e. lines of the
 *    form "0xXX 0xYYYY", with '#' starting a comment. Bytes that are not
 *    listed, or that are listed without a Unicode value, stay 0xfffd.
 *  - otherwise: exactly 256 whitespace separated hexadecimal values.
 *    0xffff and values with the top bit set are stored as 0xfffd.
 *
 * Any Unicode value above 0xffff is rejected: the emitted tables are
 * "unsigned short" and the reverse map built later has 0x10000 entries.
 * Malformed input terminates the program with exit status 1.
 */
static void read_charset2uni (int charset2uni[0x100])
{
  int c;
  int i;

  /* Peek at the first byte without consuming it. */
  c = getc(stdin);
  if (c != EOF)
    ungetc(c,stdin);

  if (c == '#') {
    /* Read a unicode.org style .TXT file. */
    for (i = 0; i < 0x100; i++)
      charset2uni[i] = 0xfffd;
    for (;;) {
      unsigned int byte;
      unsigned int uc;

      c = getc(stdin);
      if (c == EOF)
        break;
      if (c == '\n' || c == ' ' || c == '\t')
        continue;
      if (c == '#') {
        /* Skip the comment up to the end of the line. */
        do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
        continue;
      }
      ungetc(c,stdin);
      /* %x stores through an unsigned int pointer. */
      if (scanf("0x%x", &byte) != 1 || byte >= 0x100)
        fail("invalid byte value in .TXT input");
      do { c = getc(stdin); } while (c == ' ' || c == '\t');
      if (c != EOF)
        ungetc(c,stdin);
      /* A byte listed without a Unicode value stays unmapped. */
      if (c == '\n' || c == '#')
        continue;
      if (scanf("0x%x", &uc) != 1)
        fail("invalid Unicode value in .TXT input");
      if (uc > 0xffff)
        fail("Unicode value above 0xffff in .TXT input");
      charset2uni[byte] = (int) uc;
    }
  } else {
    /* Read a table of hexadecimal Unicode values. */
    /* Largest value that is still non-negative when viewed as an int. */
    const unsigned int int_max_as_unsigned = ~0u >> 1;
    unsigned int extra;

    for (i = 0; i < 0x100; i++) {
      unsigned int uc;

      if (scanf("%x", &uc) != 1)
        fail("expected 256 hexadecimal values on input");
      if (uc == 0xffff || uc > int_max_as_unsigned)
        /* 0xffff and "negative" entries both mean: no mapping. */
        charset2uni[i] = 0xfffd;
      else if (uc > 0xffff)
        fail("Unicode value above 0xffff in table input");
      else
        charset2uni[i] = (int) uc;
    }
    /* Nothing but whitespace may follow the 256th value. */
    if (scanf("%x", &extra) != EOF)
      fail("trailing data after the 256th value");
  }
}

/* Write the license comment and the charset name comment to F. */
static void write_file_header (FILE *f, const char *charsetname)
{
  fputs("/*\n"
        " * Copyright (C) 1999-2002 Free Software Foundation, Inc.\n"
        " * This file is part of the GNU LIBICONV Library.\n"
        " *\n"
        " * The GNU LIBICONV Library is free software; you can redistribute it\n"
        " * and/or modify it under the terms of the GNU Library General Public\n"
        " * License as published by the Free Software Foundation; either version 2\n"
        " * of the License, or (at your option) any later version.\n"
        " *\n"
        " * The GNU LIBICONV Library is distributed in the hope that it will be\n"
        " * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
        " * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\n"
        " * Library General Public License for more details.\n"
        " *\n"
        " * You should have received a copy of the GNU Library General Public\n"
        " * License along with the GNU LIBICONV Library; see the file COPYING.LIB.\n"
        " * If not, write to the Free Software Foundation, Inc., 51 Franklin Street,\n"
        " * Fifth Floor, Boston, MA 02110-1301, USA.\n"
        " */\n"
        "\n", f);
  fprintf(f, "/*\n");
  fprintf(f, " * %s\n", charsetname);
  fprintf(f, " */\n");
  fprintf(f, "\n");
}

/*
 * Write the <name>_2uni lookup table(s) and the <name>_mbtowc function to F.
 *
 * The 256 byte values are handled as 16 rows of 16. A row is classified as
 *   -2  every entry is 0xfffd (no mapping),
 *   -1  every entry maps to itself,
 *  >=0  anything else; the value is the index of the table holding the row.
 * Consecutive rows of the third kind share one table.
 */
static void emit_mbtowc (FILE *f, const char *c_charsetname,
                         const int charset2uni[0x100])
{
  int i, i1, i2, i3;
  int line[16];
  int tableno;
  struct { int minline; int maxline; } tables[16];
  bool some_invalid;
  bool final_ret_reached;

  /* Classify each row. */
  for (i1 = 0; i1 < 16; i1++) {
    bool all_invalid = true;
    bool all_identity = true;
    for (i2 = 0; i2 < 16; i2++) {
      i = 16*i1+i2;
      if (charset2uni[i] != 0xfffd)
        all_invalid = false;
      if (charset2uni[i] != i)
        all_identity = false;
    }
    if (all_invalid)
      line[i1] = -2;
    else if (all_identity)
      line[i1] = -1;
    else
      line[i1] = 0;
  }

  /* Group adjacent table rows and give every group a table index. */
  tableno = 0;
  for (i1 = 0; i1 < 16; i1++) {
    if (line[i1] >= 0) {
      if (i1 > 0 && tableno > 0 && line[i1-1] == tableno-1) {
        line[i1] = tableno-1;
        tables[tableno-1].maxline = i1;
      } else {
        tableno++;
        line[i1] = tableno-1;
        tables[tableno-1].minline = tables[tableno-1].maxline = i1;
      }
    }
  }

  some_invalid = false;
  for (i = 0; i < 0x100; i++)
    if (charset2uni[i] == 0xfffd)
      some_invalid = true;

  /* Emit the tables; they get a numeric suffix only if there are several. */
  if (tableno > 0) {
    int t;
    for (t = 0; t < tableno; t++) {
      fprintf(f, "static const unsigned short %s_2uni", c_charsetname);
      if (tableno > 1)
        fprintf(f, "_%d", t+1);
      fprintf(f, "[%d] = {\n", 16*(tables[t].maxline-tables[t].minline+1));
      for (i1 = tables[t].minline; i1 <= tables[t].maxline; i1++) {
        fprintf(f, " /* 0x%02x */\n", (unsigned int) (16*i1));
        for (i2 = 0; i2 < 2; i2++) {
          fprintf(f, " ");
          for (i3 = 0; i3 < 8; i3++) {
            i = 16*i1+8*i2+i3;
            fprintf(f, " 0x%04x,", (unsigned int) charset2uni[i]);
          }
          fprintf(f, "\n");
        }
      }
      fprintf(f, "};\n");
    }
    fprintf(f, "\n");
  }

  /* Emit the conversion function: one branch per run of equal rows. */
  final_ret_reached = false;
  fprintf(f, "static int\n%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", c_charsetname);
  fprintf(f, "{\n");
  fprintf(f, " unsigned char c = *s;\n");
  if (some_invalid) {
    /* Some bytes are unmapped, so the function may return RET_ILSEQ. */
    for (i1 = 0; i1 < 16;) {
      int t = line[i1];
      const char* indent;
      bool range_has_invalid;

      /* [i1,i2) is the maximal run of rows with the same classification. */
      for (i2 = i1; i2 < 16 && line[i2] == t; i2++);
      /* NOTE(review): both alternatives are the same string here, which
         suggests that runs of spaces were collapsed when this source was
         extracted - confirm against a pristine copy. */
      indent = (i1 == 0 && i2 == 16 ? " " : " ");
      if (i1 == 0) {
        if (i2 != 16)
          fprintf(f, " if (c < 0x%02x) {\n", (unsigned int) (16*i2));
      } else {
        if (i2 == 16)
          fprintf(f, " else {\n");
        else
          fprintf(f, " else if (c < 0x%02x) {\n", (unsigned int) (16*i2));
      }
      if (t == -2) {
        /* Nothing maps here; fall through to the final return. */
        final_ret_reached = true;
      } else if (t == -1) {
        fprintf(f, "%s*pwc = (ucs4_t) c;\n", indent);
        fprintf(f, "%sreturn 1;\n", indent);
      } else {
        fprintf(f, "%s", indent);
        /* Only ranges that contain a gap need the 0xfffd check. */
        range_has_invalid = false;
        for (i = 16*i1; i < 16*i2; i++)
          if (charset2uni[i] == 0xfffd)
            range_has_invalid = true;
        if (range_has_invalid)
          fprintf(f, "unsigned short wc = ");
        else
          fprintf(f, "*pwc = (ucs4_t) ");
        fprintf(f, "%s_2uni", c_charsetname);
        if (tableno > 1)
          fprintf(f, "_%d", t+1);
        fprintf(f, "[c");
        if (tables[t].minline > 0)
          fprintf(f, "-0x%02x", (unsigned int) (16*tables[t].minline));
        fprintf(f, "];\n");
        if (range_has_invalid) {
          fprintf(f, "%sif (wc != 0xfffd) {\n", indent);
          fprintf(f, "%s *pwc = (ucs4_t) wc;\n", indent);
          fprintf(f, "%s return 1;\n", indent);
          fprintf(f, "%s}\n", indent);
          final_ret_reached = true;
        } else {
          fprintf(f, "%sreturn 1;\n", indent);
        }
      }
      if (!(i1 == 0 && i2 == 16))
        fprintf(f, " }\n");
      i1 = i2;
    }
    if (final_ret_reached)
      fprintf(f, " return RET_ILSEQ;\n");
  } else {
    /* Every byte is mapped: plain assignments, then a single return. */
    for (i1 = 0; i1 < 16;) {
      int t = line[i1];
      for (i2 = i1; i2 < 16 && line[i2] == t; i2++);
      if (i1 == 0) {
        if (i2 == 16)
          fprintf(f, " ");
        else
          fprintf(f, " if (c < 0x%02x)\n ", (unsigned int) (16*i2));
      } else {
        if (i2 == 16)
          fprintf(f, " else\n ");
        else
          fprintf(f, " else if (c < 0x%02x)\n ", (unsigned int) (16*i2));
      }
      if (t == -1)
        fprintf(f, "*pwc = (ucs4_t) c;\n");
      else {
        fprintf(f, "*pwc = (ucs4_t) %s_2uni", c_charsetname);
        if (tableno > 1)
          fprintf(f, "_%d", t+1);
        fprintf(f, "[c");
        if (tables[t].minline > 0)
          fprintf(f, "-0x%02x", (unsigned int) (16*tables[t].minline));
        fprintf(f, "];\n");
      }
      i1 = i2;
    }
    fprintf(f, " return 1;\n");
  }
  fprintf(f, "}\n");
}

/*
 * Write the <name>_pageXX reverse tables and the <name>_wctomb function to F.
 *
 * The reverse map covers 0x0000..0xffff and is handled as 0x2000 rows of 8.
 * A row is classified as
 *   -2  every entry is 0 (nothing maps here),
 *   -1  every entry maps to itself,
 *  >=0  anything else; the value is the index of the table holding the row.
 * A row joins the previous table when it is adjacent to it, or when it lies
 * in the same 256-value page and at most 8 rows after the table's last row.
 */
static void emit_wctomb (FILE *f, const char *c_charsetname,
                         const int charset2uni[0x100])
{
  enum { SUFFIX_SIZE = 16 };
  int uni2charset[0x10000];
  int line[0x2000];
  int tableno;
  struct { int minline; int maxline; int usecount; char* suffix; } tables[0x2000];
  bool need_c;
  bool fix_0000;
  bool any_page_table;
  int i, j, p, j1, j2, t;

  /* Invert the map; 0 doubles as the "no mapping" marker. */
  for (j = 0; j < 0x10000; j++)
    uni2charset[j] = 0;
  for (i = 0; i < 0x100; i++) {
    j = charset2uni[i];
    if (j != 0xfffd) {
      /* The reader enforces this range; never write outside the array. */
      if (j < 0 || j >= 0x10000) abort();
      uni2charset[j] = i;
    }
  }

  /* Classify each row of 8 Unicode values. */
  for (j1 = 0; j1 < 0x2000; j1++) {
    bool all_invalid = true;
    bool all_identity = true;
    for (j2 = 0; j2 < 8; j2++) {
      j = 8*j1+j2;
      if (uni2charset[j] != 0)
        all_invalid = false;
      if (uni2charset[j] != j)
        all_identity = false;
    }
    if (all_invalid)
      line[j1] = -2;
    else if (all_identity)
      line[j1] = -1;
    else
      line[j1] = 0;
  }

  /* Group table rows into tables. */
  tableno = 0;
  for (j1 = 0; j1 < 0x2000; j1++) {
    if (line[j1] >= 0) {
      if (tableno > 0
          && ((j1 > 0 && line[j1-1] == tableno-1)
              || ((tables[tableno-1].maxline >> 5) == (j1 >> 5)
                  && j1 - tables[tableno-1].maxline <= 8))) {
        line[j1] = tableno-1;
        tables[tableno-1].maxline = j1;
      } else {
        tableno++;
        line[j1] = tableno-1;
        tables[tableno-1].minline = tables[tableno-1].maxline = j1;
      }
    }
  }

  /* Count the mapped values covered by each table. */
  for (t = 0; t < tableno; t++) {
    tables[t].usecount = 0;
    j1 = 8*tables[t].minline;
    j2 = 8*(tables[t].maxline+1);
    for (j = j1; j < j2; j++)
      if (uni2charset[j] != 0)
        tables[t].usecount++;
  }

  /* Name the tables that are emitted as arrays: "XX" for the first one in
     a page, "XX_N" for further ones, where XX is the page in hex. Tables
     with a single mapped value become a plain comparison and get no name. */
  for (t = 0, p = -1, i = 0; t < tableno; t++) {
    if (tables[t].usecount > 1) {
      char* s = malloc(SUFFIX_SIZE);
      if (s == NULL)
        fail("out of memory");
      if (p == tables[t].minline >> 5) {
        snprintf(s, SUFFIX_SIZE, "%02x_%d", (unsigned int) p, ++i);
      } else {
        p = tables[t].minline >> 5;
        snprintf(s, SUFFIX_SIZE, "%02x", (unsigned int) p);
      }
      tables[t].suffix = s;
    } else
      tables[t].suffix = NULL;
  }

  /* Emit the page tables. */
  any_page_table = false;
  for (t = 0; t < tableno; t++)
    if (tables[t].usecount > 1) {
      any_page_table = true;
      fprintf(f, "static const unsigned char %s_page%s[%d] = {\n", c_charsetname, tables[t].suffix, 8*(tables[t].maxline-tables[t].minline+1));
      for (j1 = tables[t].minline; j1 <= tables[t].maxline; j1++) {
        if ((j1 % 0x20) == 0 && j1 > tables[t].minline)
          fprintf(f, " /* 0x%04x */\n", (unsigned int) (8*j1));
        fprintf(f, " ");
        for (j2 = 0; j2 < 8; j2++) {
          j = 8*j1+j2;
          fprintf(f, " 0x%02x,", (unsigned int) uni2charset[j]);
        }
        fprintf(f, " /* 0x%02x-0x%02x */\n", (unsigned int) (8*(j1 % 0x20)), (unsigned int) (8*(j1 % 0x20)+7));
      }
      fprintf(f, "};\n");
    }
  if (any_page_table)
    fprintf(f, "\n");

  /* The local "c" is needed unless the only non-empty range is an identity
     range starting at 0, which returns directly. */
  need_c = false;
  for (j1 = 0; j1 < 0x2000;) {
    t = line[j1];
    for (j2 = j1; j2 < 0x2000 && line[j2] == t; j2++);
    if (t >= 0)
      j2 = tables[t].maxline+1;
    if (!(t == -2 || (t == -1 && j1 == 0)))
      need_c = true;
    j1 = j2;
  }

  /* Emit the conversion function: one branch per range. */
  fix_0000 = false;
  fprintf(f, "static int\n%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", c_charsetname);
  fprintf(f, "{\n");
  if (need_c)
    fprintf(f, " unsigned char c = 0;\n");
  for (j1 = 0; j1 < 0x2000;) {
    t = line[j1];
    for (j2 = j1; j2 < 0x2000 && line[j2] == t; j2++);
    if (t >= 0) {
      /* A table may contain rows of other kinds; step over all of it. */
      if (j1 != tables[t].minline) abort();
      if (j2 > tables[t].maxline+1) abort();
      j2 = tables[t].maxline+1;
    }
    if (t != -2) {
      /* NOTE(review): if row 0 is of kind -2, the first branch emitted
         starts with "else" - confirm whether such input can occur. */
      if (j1 == 0)
        fprintf(f, " ");
      else
        fprintf(f, " else ");
      if (t >= 0 && tables[t].usecount == 0) abort();
      if (t >= 0 && tables[t].usecount == 1) {
        /* A single mapped value: compare instead of indexing a table. */
        if (j2 != j1+1) abort();
        for (j = 8*j1; j < 8*j2; j++)
          if (uni2charset[j] != 0) {
            fprintf(f, "if (wc == 0x%04x)\n c = 0x%02x;\n", (unsigned int) j, (unsigned int) uni2charset[j]);
            break;
          }
      } else {
        if (j1 == 0) {
          fprintf(f, "if (wc < 0x%04x)", (unsigned int) (8*j2));
        } else {
          fprintf(f, "if (wc >= 0x%04x && wc < 0x%04x)", (unsigned int) (8*j1), (unsigned int) (8*j2));
        }
        if (t == -1) {
          if (j1 == 0)
            /* If wc == 0, the function must return 1, not -1. */
            fprintf(f, " {\n *r = wc;\n return 1;\n }\n");
          else
            fprintf(f, "\n c = wc;\n");
        } else {
          fprintf(f, "\n c = %s_page%s[wc", c_charsetname, tables[t].suffix);
          if (tables[t].minline > 0)
            fprintf(f, "-0x%04x", (unsigned int) (8*j1));
          fprintf(f, "];\n");
          if (j1 == 0 && uni2charset[0] == 0)
            /* If wc == 0, the function must return 1, not -1. */
            fix_0000 = true;
        }
      }
    }
    j1 = j2;
  }
  if (need_c) {
    if (fix_0000)
      fprintf(f, " if (c != 0 || wc == 0) {\n");
    else
      fprintf(f, " if (c != 0) {\n");
    fprintf(f, " *r = c;\n");
    fprintf(f, " return 1;\n");
    fprintf(f, " }\n");
  }
  fprintf(f, " return RET_ILUNI;\n");
  fprintf(f, "}\n");

  for (t = 0; t < tableno; t++)
    free(tables[t].suffix);
}

/*
 * Usage: 8bit_tab_to_h CHARSETNAME C_CHARSETNAME [FILENAME [DIRECTORY]]
 *
 * Reads the charset table from stdin and writes a header with the
 * conversion tables and the C_CHARSETNAME_mbtowc / C_CHARSETNAME_wctomb
 * functions. The output path is DIRECTORY and FILENAME concatenated as
 * is; FILENAME defaults to C_CHARSETNAME.h and DIRECTORY to "".
 * Exits with status 0 on success and 1 on any error.
 */
int main (int argc, char *argv[])
{
  const char* charsetname;
  const char* c_charsetname;
  const char* filename;
  const char* directory;
  int charset2uni[0x100];
  FILE* f;

  if (argc < 3 || argc > 5)
    fail("usage: 8bit_tab_to_h CHARSETNAME C_CHARSETNAME [FILENAME [DIRECTORY]]");
  charsetname = argv[1];
  c_charsetname = argv[2];
  if (argc > 3) {
    filename = argv[3];
  } else {
    char* s = malloc(strlen(c_charsetname)+strlen(".h")+1);
    if (s == NULL)
      fail("out of memory");
    strcpy(s,c_charsetname); strcat(s,".h");
    filename = s;
  }
  directory = (argc > 4 ? argv[4] : "");

  fprintf(stderr, "Creating %s%s\n", directory, filename);

  read_charset2uni(charset2uni);

  /* Write the output file. */
  {
    char* fname = malloc(strlen(directory)+strlen(filename)+1);
    if (fname == NULL)
      fail("out of memory");
    strcpy(fname,directory); strcat(fname,filename);
    f = fopen(fname,"w");
    if (f == NULL) {
      perror(fname);
      exit(1);
    }
    free(fname);
  }

  write_file_header(f, charsetname);
  emit_mbtowc(f, c_charsetname, charset2uni);
  fprintf(f, "\n");
  emit_wctomb(f, c_charsetname, charset2uni);

  if (ferror(f) || fclose(f))
    fail("error writing the output file");

  /* Disabled code, kept as found. It refers to names that are not declared
     in this function and is never compiled. */
#if 0

    int i1, i2, i3, i1_min, i1_max, j1, j2;

    i1_min = 16;
    i1_max = -1;
    for (i1 = 0; i1 < 16; i1++)
      for (i2 = 0; i2 < 16; i2++)
        if (charset2uni[16*i1+i2] != 0xfffd) {
          if (i1_min > i1) i1_min = i1;
          if (i1_max < i1) i1_max = i1;
        }
    printf("static const unsigned short %s_2uni[%d] = {\n",
           name, 16*(i1_max-i1_min+1));
    for (i1 = i1_min; i1 <= i1_max; i1++) {
      printf(" /""* 0x%02x *""/\n", 16*i1);
      for (i2 = 0; i2 < 2; i2++) {
        printf(" ");
        for (i3 = 0; i3 < 8; i3++) {
          if (i3 > 0) printf(" ");
          printf("0x%04x,", charset2uni[16*i1+8*i2+i3]);
        }
        printf("\n");
      }
    }
    printf("};\n");
    printf("\n");

    for (p = 0; p < 0x100; p++)
      pages[p] = 0;
    for (i = 0; i < 0x100; i++)
      if (charset2uni[i] != 0xfffd)
        pages[charset2uni[i]>>8] = 1;
    for (p = 0; p < 0x100; p++)
      if (pages[p]) {
        int j1_min = 32;
        int j1_max = -1;
        for (j1 = 0; j1 < 32; j1++)
          for (j2 = 0; j2 < 8; j2++)
            if (uni2charset[256*p+8*j1+j2] != 0) {
              if (j1_min > j1) j1_min = j1;
              if (j1_max < j1) j1_max = j1;
            }
        printf("static const unsigned char %s_page%02x[%d] = {\n",
               name, p, 8*(j1_max-j1_min+1));
        for (j1 = j1_min; j1 <= j1_max; j1++) {
          printf(" ");
          for (j2 = 0; j2 < 8; j2++)
            printf("0x%02x, ", uni2charset[256*p+8*j1+j2]);
          printf("/""* 0x%02x-0x%02x *""/\n", 8*j1, 8*j1+7);
        }
        printf("};\n");
      }
    printf("\n");

  }
#endif

  exit(0);
}
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>