Annotation of embedaddon/libiconv/tools/8bit_tab_to_h.c, revision 1.1.1.2
1.1.1.2 ! misho 1: /* Copyright (C) 1999-2002, 2011-2012, 2016, 2018 Free Software Foundation, Inc.
1.1 misho 2: This file is part of the GNU LIBICONV Tools.
3:
4: This program is free software: you can redistribute it and/or modify
5: it under the terms of the GNU General Public License as published by
6: the Free Software Foundation; either version 3 of the License, or
7: (at your option) any later version.
8:
9: This program is distributed in the hope that it will be useful,
10: but WITHOUT ANY WARRANTY; without even the implied warranty of
11: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12: GNU General Public License for more details.
13:
14: You should have received a copy of the GNU General Public License
1.1.1.2 ! misho 15: along with this program; if not, see <https://www.gnu.org/licenses/>. */
1.1 misho 16:
17: /*
18: * Generates an 8-bit character set table from a .TXT table as found on
19: * ftp.unicode.org or from a table containing the 256 Unicode values as
20: * hexadecimal integers.
21: * Examples:
22: *
23: * ./8bit_tab_to_h ISO-8859-1 iso8859_1 < tab8859_1
24: * ./8bit_tab_to_h ISO-8859-2 iso8859_2 < tab8859_2
25: * ./8bit_tab_to_h ISO-8859-3 iso8859_3 < tab8859_3
26: * ./8bit_tab_to_h ISO-8859-4 iso8859_4 < tab8859_4
27: * ./8bit_tab_to_h ISO-8859-5 iso8859_5 < tab8859_5
28: * ./8bit_tab_to_h ISO-8859-6 iso8859_6 < tab8859_6
29: * ./8bit_tab_to_h ISO-8859-7 iso8859_7 < tab8859_7
30: * ./8bit_tab_to_h ISO-8859-8 iso8859_8 < tab8859_8
31: * ./8bit_tab_to_h ISO-8859-9 iso8859_9 < tab8859_9
32: * ./8bit_tab_to_h ISO-8859-10 iso8859_10 < tab8859_10
33: * ./8bit_tab_to_h ISO-8859-14 iso8859_14 < tab8859_14
34: * ./8bit_tab_to_h ISO-8859-15 iso8859_15 < tab8859_15
35: * ./8bit_tab_to_h JISX0201.1976-0 jisx0201 < jis0201
36: * ./8bit_tab_to_h TIS620.2533-1 tis620 < tabtis620
37: * ./8bit_tab_to_h KOI8-R koi8_r < tabkoi8_r
38: * ./8bit_tab_to_h KOI8-U koi8_u < tabkoi8_u
39: * ./8bit_tab_to_h ARMSCII-8 armscii_8 < tabarmscii_8
40: * ./8bit_tab_to_h CP1133 cp1133 < tabibm_cp1133
41: * ./8bit_tab_to_h MULELAO-1 mulelao < tabmulelao_1
42: * ./8bit_tab_to_h VISCII1.1-1 viscii1 < tabviscii
43: * ./8bit_tab_to_h TCVN-5712 tcvn < tabtcvn
44: * ./8bit_tab_to_h GEORGIAN-ACADEMY georgian_ac < tabgeorgian_academy
45: * ./8bit_tab_to_h GEORGIAN-PS georgian_ps < tabgeorgian_ps
46: *
47: * ./8bit_tab_to_h ISO-8859-1 iso8859_1 < 8859-1.TXT
48: * ./8bit_tab_to_h ISO-8859-2 iso8859_2 < 8859-2.TXT
49: * ./8bit_tab_to_h ISO-8859-3 iso8859_3 < 8859-3.TXT
50: * ./8bit_tab_to_h ISO-8859-4 iso8859_4 < 8859-4.TXT
51: * ./8bit_tab_to_h ISO-8859-5 iso8859_5 < 8859-5.TXT
52: * ./8bit_tab_to_h ISO-8859-6 iso8859_6 < 8859-6.TXT
53: * ./8bit_tab_to_h ISO-8859-7 iso8859_7 < 8859-7.TXT
54: * ./8bit_tab_to_h ISO-8859-8 iso8859_8 < 8859-8.TXT
55: * ./8bit_tab_to_h ISO-8859-9 iso8859_9 < 8859-9.TXT
56: * ./8bit_tab_to_h ISO-8859-10 iso8859_10 < 8859-10.TXT
57: * ./8bit_tab_to_h ISO-8859-14 iso8859_14 < 8859-14.TXT
58: * ./8bit_tab_to_h ISO-8859-15 iso8859_15 < 8859-15.TXT
59: * ./8bit_tab_to_h JISX0201.1976-0 jisx0201 < JIS0201.TXT
60: * ./8bit_tab_to_h KOI8-R koi8_r < KOI8-R.TXT
1.1.1.2 ! misho 61: *
! 62: * ./8bit_tab_to_h 'CP50221 JISX0208 extensions' cp50221_0208_ext < CP50221-0208-EXT.TXT
! 63: * ./8bit_tab_to_h 'CP50221 JISX0212 extensions' cp50221_0212_ext < CP50221-0212-EXT.TXT
1.1 misho 64: */
65:
66: #include <stdio.h>
67: #include <stdlib.h>
68: #include <stdbool.h>
69: #include <string.h>
70:
/*
 * Program entry point.
 *
 * Usage: 8bit_tab_to_h CHARSETNAME C_CHARSETNAME [FILENAME [DIRECTORY]]
 *
 *   argv[1]  charset name; only written into a comment of the output file.
 *   argv[2]  identifier prefix used for every generated table and function;
 *            also the output file name (with ".h" appended) when argv[3]
 *            is not given.
 *   argv[3]  optional output file name.
 *   argv[4]  optional prefix concatenated directly in front of the file
 *            name (no separator is inserted).
 *
 * The 256-entry byte -> Unicode mapping is read from standard input, either
 * in unicode.org .TXT form (first character of the input is '#') or as 256
 * whitespace-separated hexadecimal values.  The program then writes a C
 * fragment containing <prefix>_2uni tables, <prefix>_mbtowc, <prefix>_page*
 * tables and <prefix>_wctomb.
 *
 * Exit status is 0 on success and 1 on any usage, input, allocation or
 * output error.  Internal consistency failures call abort().
 */
int main (int argc, char *argv[])
{
  const char* charsetname;
  const char* c_charsetname;
  const char* filename;
  const char* directory;
  int charset2uni[0x100];

  if (argc != 3 && argc != 4 && argc != 5)
    exit(1);
  charsetname = argv[1];
  c_charsetname = argv[2];
  if (argc > 3) {
    filename = argv[3];
  } else {
    char* s = (char*) malloc(strlen(c_charsetname)+strlen(".h")+1);
    if (s == NULL)
      exit(1);
    strcpy(s,c_charsetname); strcat(s,".h");
    filename = s;
  }
  directory = (argc > 4 ? argv[4] : "");

  fprintf(stderr, "Creating %s%s\n", directory, filename);

  /* Read the mapping from standard input into charset2uni[].
     0xfffd marks a byte without a Unicode mapping. */
  {
    int i, c;
    /* Peek at the first character to detect the input format. */
    c = getc(stdin);
    ungetc(c,stdin);
    if (c == '#') {
      /* Read a unicode.org style .TXT file. */
      for (i = 0; i < 0x100; i++)
        charset2uni[i] = 0xfffd;
      for (;;) {
        c = getc(stdin);
        if (c == EOF)
          break;
        if (c == '\n' || c == ' ' || c == '\t')
          continue;
        if (c == '#') {
          /* Skip the rest of a comment line. */
          do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
          continue;
        }
        ungetc(c,stdin);
        if (scanf("0x%x", &i) != 1 || !(i >= 0 && i < 0x100))
          exit(1);
        do { c = getc(stdin); } while (c == ' ' || c == '\t');
        if (c != EOF)
          ungetc(c,stdin);
        /* A byte value that is not followed by a Unicode value stays
           unmapped. */
        if (c == '\n' || c == '#')
          continue;
        if (scanf("0x%x", &charset2uni[i]) != 1)
          exit(1);
        /* The value is later used as an index into a 0x10000-entry array
           and is emitted as an unsigned short, so it must fit in 16 bits. */
        if (!(charset2uni[i] >= 0 && charset2uni[i] < 0x10000)) {
          fprintf(stderr, "Unicode value 0x%x for byte 0x%02x is outside 0x0000..0xffff\n",
                  (unsigned int) charset2uni[i], (unsigned int) i);
          exit(1);
        }
      }
    } else {
      /* Read a table of hexadecimal Unicode values. */
      for (i = 0; i < 0x100; i++) {
        if (scanf("%x", &charset2uni[i]) != 1)
          exit(1);
        if (charset2uni[i] < 0 || charset2uni[i] == 0xffff)
          charset2uni[i] = 0xfffd;
        /* Same 16-bit requirement as for the .TXT format. */
        if (charset2uni[i] > 0xffff) {
          fprintf(stderr, "Unicode value 0x%x for byte 0x%02x is outside 0x0000..0xffff\n",
                  (unsigned int) charset2uni[i], (unsigned int) i);
          exit(1);
        }
      }
      /* Exactly 256 values are expected; anything further is an error. */
      if (scanf("%x", &i) != EOF)
        exit(1);
    }
  }

  /* Write the output file. */
  {
    FILE* f;

    {
      char* fname = malloc(strlen(directory)+strlen(filename)+1);
      if (fname == NULL)
        exit(1);
      strcpy(fname,directory); strcat(fname,filename);
      f = fopen(fname,"w");
      free(fname);
      if (f == NULL)
        exit(1);
    }

    fprintf(f, "/*\n");
    fprintf(f, " * Copyright (C) 1999-2002 Free Software Foundation, Inc.\n");
    fprintf(f, " * This file is part of the GNU LIBICONV Library.\n");
    fprintf(f, " *\n");
    fprintf(f, " * The GNU LIBICONV Library is free software; you can redistribute it\n");
    fprintf(f, " * and/or modify it under the terms of the GNU Library General Public\n");
    fprintf(f, " * License as published by the Free Software Foundation; either version 2\n");
    fprintf(f, " * of the License, or (at your option) any later version.\n");
    fprintf(f, " *\n");
    fprintf(f, " * The GNU LIBICONV Library is distributed in the hope that it will be\n");
    fprintf(f, " * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of\n");
    fprintf(f, " * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU\n");
    fprintf(f, " * Library General Public License for more details.\n");
    fprintf(f, " *\n");
    fprintf(f, " * You should have received a copy of the GNU Library General Public\n");
    fprintf(f, " * License along with the GNU LIBICONV Library; see the file COPYING.LIB.\n");
    fprintf(f, " * If not, see <https://www.gnu.org/licenses/>.\n");
    fprintf(f, " */\n");
    fprintf(f, "\n");
    fprintf(f, "/*\n");
    fprintf(f, " * %s\n", charsetname);
    fprintf(f, " */\n");
    fprintf(f, "\n");

    /* Byte -> Unicode direction: <prefix>_2uni tables and <prefix>_mbtowc. */
    {
      int i, i1, i2, i3;
      /* Classification of each row of 16 bytes:
         -2 = every byte unmapped, -1 = every byte maps to itself,
         >= 0 = index of the table that holds this row. */
      int line[16];
      int tableno;
      struct { int minline; int maxline; } tables[16];
      bool some_invalid;
      bool final_ret_reached;

      for (i1 = 0; i1 < 16; i1++) {
        bool all_invalid = true;
        bool all_identity = true;
        for (i2 = 0; i2 < 16; i2++) {
          i = 16*i1+i2;
          if (charset2uni[i] != 0xfffd)
            all_invalid = false;
          if (charset2uni[i] != i)
            all_identity = false;
        }
        if (all_invalid)
          line[i1] = -2;
        else if (all_identity)
          line[i1] = -1;
        else
          line[i1] = 0;
      }
      /* Merge directly adjacent table rows into one table. */
      tableno = 0;
      for (i1 = 0; i1 < 16; i1++) {
        if (line[i1] >= 0) {
          if (i1 > 0 && tableno > 0 && line[i1-1] == tableno-1) {
            line[i1] = tableno-1;
            tables[tableno-1].maxline = i1;
          } else {
            tableno++;
            line[i1] = tableno-1;
            tables[tableno-1].minline = tables[tableno-1].maxline = i1;
          }
        }
      }
      some_invalid = false;
      for (i = 0; i < 0x100; i++)
        if (charset2uni[i] == 0xfffd)
          some_invalid = true;
      if (tableno > 0) {
        int t;
        for (t = 0; t < tableno; t++) {
          fprintf(f, "static const unsigned short %s_2uni", c_charsetname);
          /* Tables are numbered only when there is more than one. */
          if (tableno > 1)
            fprintf(f, "_%d", t+1);
          fprintf(f, "[%d] = {\n", 16*(tables[t].maxline-tables[t].minline+1));
          for (i1 = tables[t].minline; i1 <= tables[t].maxline; i1++) {
            fprintf(f, " /* 0x%02x */\n", 16*i1);
            for (i2 = 0; i2 < 2; i2++) {
              fprintf(f, " ");
              for (i3 = 0; i3 < 8; i3++) {
                i = 16*i1+8*i2+i3;
                fprintf(f, " 0x%04x,", charset2uni[i]);
              }
              fprintf(f, "\n");
            }
          }
          fprintf(f, "};\n");
        }
        fprintf(f, "\n");
      }
      final_ret_reached = false;
      fprintf(f, "static int\n%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, size_t n)\n", c_charsetname);
      fprintf(f, "{\n");
      fprintf(f, " unsigned char c = *s;\n");
      if (some_invalid) {
        /* At least one byte is unmapped: emit braced branches, one per run
           of rows with the same classification. */
        for (i1 = 0; i1 < 16;) {
          int t = line[i1];
          const char* indent;
          for (i2 = i1; i2 < 16 && line[i2] == t; i2++);
          /* NOTE(review): both alternatives are the same string as seen
             here; presumably they originally differed in the number of
             spaces - confirm against upstream. */
          indent = (i1 == 0 && i2 == 16 ? " " : " ");
          if (i1 == 0) {
            if (i2 == 16) {
            } else {
              fprintf(f, " if (c < 0x%02x) {\n", 16*i2);
            }
          } else {
            if (i2 == 16) {
              fprintf(f, " else {\n");
            } else {
              fprintf(f, " else if (c < 0x%02x) {\n", 16*i2);
            }
          }
          if (t == -2) {
            final_ret_reached = true;
          } else if (t == -1) {
            fprintf(f, "%s*pwc = (ucs4_t) c;\n", indent);
            fprintf(f, "%sreturn 1;\n", indent);
          } else {
            fprintf(f, "%s", indent);
            /* From here on some_invalid refers to this run of rows only. */
            some_invalid = false;
            for (i = 16*i1; i < 16*i2; i++)
              if (charset2uni[i] == 0xfffd)
                some_invalid = true;
            if (some_invalid)
              fprintf(f, "unsigned short wc = ");
            else
              fprintf(f, "*pwc = (ucs4_t) ");
            fprintf(f, "%s_2uni", c_charsetname);
            if (tableno > 1)
              fprintf(f, "_%d", t+1);
            fprintf(f, "[c");
            if (tables[t].minline > 0)
              fprintf(f, "-0x%02x", 16*tables[t].minline);
            fprintf(f, "];\n");
            if (some_invalid) {
              fprintf(f, "%sif (wc != 0xfffd) {\n", indent);
              fprintf(f, "%s *pwc = (ucs4_t) wc;\n", indent);
              fprintf(f, "%s return 1;\n", indent);
              fprintf(f, "%s}\n", indent);
              final_ret_reached = true;
            } else {
              fprintf(f, "%sreturn 1;\n", indent);
            }
          }
          if (!(i1 == 0 && i2 == 16))
            fprintf(f, " }\n");
          i1 = i2;
        }
        /* Emit the failure return only if some branch can fall through. */
        if (final_ret_reached)
          fprintf(f, " return RET_ILSEQ;\n");
      } else {
        /* Every byte is mapped: emit unbraced branches and one common
           "return 1". */
        for (i1 = 0; i1 < 16;) {
          int t = line[i1];
          for (i2 = i1; i2 < 16 && line[i2] == t; i2++);
          if (i1 == 0) {
            if (i2 == 16) {
              fprintf(f, " ");
            } else {
              fprintf(f, " if (c < 0x%02x)\n ", 16*i2);
            }
          } else {
            if (i2 == 16) {
              fprintf(f, " else\n ");
            } else {
              fprintf(f, " else if (c < 0x%02x)\n ", 16*i2);
            }
          }
          if (t == -1)
            fprintf(f, "*pwc = (ucs4_t) c;\n");
          else {
            fprintf(f, "*pwc = (ucs4_t) %s_2uni", c_charsetname);
            if (tableno > 1)
              fprintf(f, "_%d", t+1);
            fprintf(f, "[c");
            if (tables[t].minline > 0)
              fprintf(f, "-0x%02x", 16*tables[t].minline);
            fprintf(f, "];\n");
          }
          i1 = i2;
        }
        fprintf(f, " return 1;\n");
      }
      fprintf(f, "}\n");

    }

    fprintf(f, "\n");

    /* Unicode -> byte direction: <prefix>_page* tables and <prefix>_wctomb. */
    {
      /* Reverse mapping; 0 means "no byte maps to this Unicode value". */
      int uni2charset[0x10000];
      /* Classification of each group of 8 consecutive Unicode values:
         -2 = none mapped, -1 = all map to themselves,
         >= 0 = index of the table that holds this group. */
      int line[0x2000];
      int tableno;
      struct { int minline; int maxline; int usecount; const char* suffix; } tables[0x2000];
      bool need_c;
      bool fix_0000;
      int i, j, p, j1, j2, t;

      for (j = 0; j < 0x10000; j++)
        uni2charset[j] = 0;
      for (i = 0; i < 0x100; i++) {
        j = charset2uni[i];
        /* j was validated to be within 0..0xffff when it was read. */
        if (j != 0xfffd)
          uni2charset[j] = i;
      }
      for (j1 = 0; j1 < 0x2000; j1++) {
        bool all_invalid = true;
        bool all_identity = true;
        for (j2 = 0; j2 < 8; j2++) {
          j = 8*j1+j2;
          if (uni2charset[j] != 0)
            all_invalid = false;
          if (uni2charset[j] != j)
            all_identity = false;
        }
        if (all_invalid)
          line[j1] = -2;
        else if (all_identity)
          line[j1] = -1;
        else
          line[j1] = 0;
      }
      /* Group lines into tables.  A line joins the previous table when it
         directly follows it, or when it lies in the same block of 32 lines
         (256 Unicode values) and at most 8 lines after the table's end. */
      tableno = 0;
      for (j1 = 0; j1 < 0x2000; j1++) {
        if (line[j1] >= 0) {
          if (tableno > 0
              && ((j1 > 0 && line[j1-1] == tableno-1)
                  || ((tables[tableno-1].maxline >> 5) == (j1 >> 5)
                      && j1 - tables[tableno-1].maxline <= 8))) {
            line[j1] = tableno-1;
            tables[tableno-1].maxline = j1;
          } else {
            tableno++;
            line[j1] = tableno-1;
            tables[tableno-1].minline = tables[tableno-1].maxline = j1;
          }
        }
      }
      /* Count the mapped entries of each table. */
      for (t = 0; t < tableno; t++) {
        tables[t].usecount = 0;
        j1 = 8*tables[t].minline;
        j2 = 8*(tables[t].maxline+1);
        for (j = j1; j < j2; j++)
          if (uni2charset[j] != 0)
            tables[t].usecount++;
      }
      /* Name the tables that hold more than one entry: "<page>" for the
         first table of a 256-value page, "<page>_<n>" for further tables
         in the same page.  The counter n is not reset between pages. */
      for (t = 0, p = -1, i = 0; t < tableno; t++) {
        if (tables[t].usecount > 1) {
          /* Large enough for two hex digits, '_', any int and the NUL. */
          const size_t suffix_size = 16;
          char* s = (char*) malloc(suffix_size);
          if (s == NULL)
            exit(1);
          if (p == tables[t].minline >> 5) {
            snprintf(s, suffix_size, "%02x_%d", p, ++i);
          } else {
            p = tables[t].minline >> 5;
            snprintf(s, suffix_size, "%02x", p);
          }
          tables[t].suffix = s;
        } else
          tables[t].suffix = NULL;
      }
      {
        /* p records whether any page table was written (0) or not (-1). */
        p = -1;
        for (t = 0; t < tableno; t++)
          if (tables[t].usecount > 1) {
            p = 0;
            fprintf(f, "static const unsigned char %s_page%s[%d] = {\n", c_charsetname, tables[t].suffix, 8*(tables[t].maxline-tables[t].minline+1));
            for (j1 = tables[t].minline; j1 <= tables[t].maxline; j1++) {
              if ((j1 % 0x20) == 0 && j1 > tables[t].minline)
                fprintf(f, " /* 0x%04x */\n", 8*j1);
              fprintf(f, " ");
              for (j2 = 0; j2 < 8; j2++) {
                j = 8*j1+j2;
                fprintf(f, " 0x%02x,", uni2charset[j]);
              }
              fprintf(f, " /* 0x%02x-0x%02x */\n", 8*(j1 % 0x20), 8*(j1 % 0x20)+7);
            }
            fprintf(f, "};\n");
          }
        if (p >= 0)
          fprintf(f, "\n");
      }
      /* The generated function needs the local variable c unless the only
         mapped range is an identity range starting at 0. */
      need_c = false;
      for (j1 = 0; j1 < 0x2000;) {
        t = line[j1];
        for (j2 = j1; j2 < 0x2000 && line[j2] == t; j2++);
        if (t >= 0)
          j2 = tables[t].maxline+1;
        if (!(t == -2 || (t == -1 && j1 == 0)))
          need_c = true;
        j1 = j2;
      }
      fix_0000 = false;
      fprintf(f, "static int\n%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, size_t n)\n", c_charsetname);
      fprintf(f, "{\n");
      if (need_c)
        fprintf(f, " unsigned char c = 0;\n");
      for (j1 = 0; j1 < 0x2000;) {
        t = line[j1];
        for (j2 = j1; j2 < 0x2000 && line[j2] == t; j2++);
        if (t >= 0) {
          /* Sanity checks on the grouping computed above; a table may
             span unmapped lines, so extend the run to the table's end. */
          if (j1 != tables[t].minline) abort();
          if (j2 > tables[t].maxline+1) abort();
          j2 = tables[t].maxline+1;
        }
        if (t == -2) {
        } else {
          if (j1 == 0)
            fprintf(f, " ");
          else
            fprintf(f, " else ");
          if (t >= 0 && tables[t].usecount == 0) abort();
          if (t >= 0 && tables[t].usecount == 1) {
            /* A single mapped value is compared directly, without a
               table. */
            if (j2 != j1+1) abort();
            for (j = 8*j1; j < 8*j2; j++)
              if (uni2charset[j] != 0) {
                fprintf(f, "if (wc == 0x%04x)\n c = 0x%02x;\n", j, uni2charset[j]);
                break;
              }
          } else {
            if (j1 == 0) {
              fprintf(f, "if (wc < 0x%04x)", 8*j2);
            } else {
              fprintf(f, "if (wc >= 0x%04x && wc < 0x%04x)", 8*j1, 8*j2);
            }
            if (t == -1) {
              if (j1 == 0)
                /* If wc == 0, the function must return 1, not -1. */
                fprintf(f, " {\n *r = wc;\n return 1;\n }\n");
              else
                fprintf(f, "\n c = wc;\n");
            } else {
              fprintf(f, "\n c = %s_page%s[wc", c_charsetname, tables[t].suffix);
              if (tables[t].minline > 0)
                fprintf(f, "-0x%04x", 8*j1);
              fprintf(f, "];\n");
              if (j1 == 0 && uni2charset[0] == 0)
                /* If wc == 0, the function must return 1, not -1. */
                fix_0000 = true;
            }
          }
        }
        j1 = j2;
      }
      if (need_c) {
        if (fix_0000)
          fprintf(f, " if (c != 0 || wc == 0) {\n");
        else
          fprintf(f, " if (c != 0) {\n");
        fprintf(f, " *r = c;\n");
        fprintf(f, " return 1;\n");
        fprintf(f, " }\n");
      }
      fprintf(f, " return RET_ILUNI;\n");
      fprintf(f, "}\n");

    }

    /* Report write errors that occurred at any point above. */
    if (ferror(f) || fclose(f))
      exit(1);
  }

  exit(0);
}
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>