Annotation of embedaddon/libxml2/genUnicode.py, revision 1.1.1.1
1.1 misho 1: #!/usr/bin/python -u
2: #
3: # Original script modified in November 2003 to take advantage of
4: # the character-validation range routines, and updated to the
5: # current Unicode information (Version 4.0.1)
6: #
7: # NOTE: there is an 'alias' facility for blocks which are not present in
8: # the current release, but are needed for ABI compatibility. This
9: # must be accomplished MANUALLY! Please see the comments below under
10: # 'blockAliases'
11: #
12: import sys
13: import string
14: import time
15:
# Provenance of the Unicode Character Database files consumed by this script.
# Both strings are substituted verbatim into the banner comments of the
# generated header and C file.
webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"

#
# blockAliases is a small hack - it is used for mapping block names which
# were used in the 3.1 release, but are missing or changed in the current
# release.  The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
blockAliases = [
    "CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols",
    "Greek:GreekandCoptic",
    "PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A,"
    "SupplementaryPrivateUseArea-B",
]

# minTableSize is the threshold for emitting a range table: a category gets
# a table only when it has MORE than this many ranges; with this many or
# fewer, inline comparisons are generated instead.
minTableSize = 8

# `sources` names exactly two whitespace-separated files: the blocks file
# first, then the character data (categories) file.
(blockfile, catfile) = sources.split()
35:
36:
#
# Now process the "blocks" file, reducing it to a dictionary indexed by
# block name (with spaces removed).  Each entry is a list of (start, end)
# tuples, both ends kept as "0x"-prefixed hexadecimal strings so that they
# can be pasted directly into the generated C code.
#
BlockNames = {}
try:
    blocks = open(blockfile, "r")
except IOError:
    print("Missing %s, aborting ..." % blockfile)
    sys.exit(1)

try:
    for line in blocks:
        # Comment lines start with '#' in the first column.
        if line[0] == '#':
            continue
        line = line.strip()
        if line == '':
            continue
        try:
            # Expected shape: "START..END; Block Name"
            fields = line.split(';')
            (start, end) = fields[0].strip().split("..")
            name = fields[1].strip().replace(' ', '')
        except (IndexError, ValueError):
            # IndexError: no ';' separated name field.
            # ValueError: the range is not exactly "START..END".
            print("Failed to process line: %s" % (line))
            continue
        BlockNames.setdefault(name, []).append(("0x" + start, "0x" + end))
finally:
    # Release the file even if an unexpected error interrupts the loop.
    blocks.close()
print("Parsed %d blocks descriptions" % (len(BlockNames)))
72:
# Register every legacy block name from blockAliases as the concatenation of
# the ranges of its replacement blocks.  Replacement names that are absent
# from BlockNames are reported and skipped.
for block in blockAliases:
    # alias[0] is the old name, alias[1] the comma-separated new names.
    alias = block.split(':')
    for comp in alias[1].split(','):
        if comp not in BlockNames:
            print("Alias %s: %s not in Blocks" % (alias[0], comp))
            continue
        BlockNames.setdefault(alias[0], []).extend(BlockNames[comp])
85:
#
# Next process the Categories file.  This is more complex, since
# the file is in code sequence, and we need to invert it.  We use
# a dictionary with index category-name, with each entry containing
# all the code points of that category.  Note that category
# names comprise two parts - the general category, and the "subclass"
# within that category.  Therefore, both "general category" (which is
# the first character of the 2-character category-name) and the full
# (2-character) name are entered into this dictionary.
#
try:
    data = open(catfile, "r")
except IOError:
    print("Missing %s, aborting ..." % catfile)
    sys.exit(1)

nbchar = 0
Categories = {}
try:
    for line in data:
        if line[0] == '#':
            continue
        line = line.strip()
        if line == '':
            continue
        try:
            fields = line.split(';')
            # Field 0 is the code point in hexadecimal.  int() rejects
            # malformed digits instead of silently skipping them.
            value = int(fields[0].strip(), 16)
            # Field 2 is the category name.
            name = fields[2]
        except (IndexError, ValueError):
            print("Failed to process line: %s" % (line))
            continue
        if name == '':
            # An empty category has no general-category letter and would
            # produce an unusable (empty) table key.
            print("Failed to process line: %s" % (line))
            continue

        nbchar = nbchar + 1
        # update entry for "full name"
        Categories.setdefault(name, []).append(value)
        # update "general category" name
        Categories.setdefault(name[0], []).append(value)
finally:
    # Close the categories file itself (the blocks file was already closed
    # once its own parsing finished).
    data.close()
print("Parsed %d char generating %d categories" % (nbchar, len(Categories)))
148:
#
# The data is now all read.  Time to process it into a more useful form.
#
def _collapse_ranges(values):
    """Collapse a sequence of code points into inclusive (first, last) runs.

    Consecutive entries that each equal the previous one plus 1 are merged
    into a single tuple; any other step starts a new run.  The input order
    is preserved, and an empty input gives an empty list.
    """
    ranges = []
    start = prev = None
    for val in values:
        if start is None:
            start = prev = val
        elif val == prev + 1:
            prev = val
        else:
            ranges.append((start, prev))
            start = prev = val
    if start is not None:
        ranges.append((start, prev))
    return ranges


# reduce the number list into ranges
# (iterate over a snapshot of the keys, since the values are replaced)
for cat in list(Categories.keys()):
    Categories[cat] = _collapse_ranges(Categories[cat])
182:
#
# Assure all data is in alphabetic order, since we will be doing binary
# searches on the tables.
#
# list() makes an independent, sortable copy whether keys() returns a list
# or a view object.
bkeys = list(BlockNames.keys())
bkeys.sort()

ckeys = list(Categories.keys())
ckeys.sort()
192:
#
# Generate the resulting files
#
# Both paths are relative to the current working directory.  Only I/O
# failures are turned into the friendly message; anything else propagates.
try:
    header = open("include/libxml/xmlunicode.h", "w")
except IOError:
    print("Failed to open include/libxml/xmlunicode.h")
    sys.exit(1)

try:
    output = open("xmlunicode.c", "w")
except IOError:
    print("Failed to open xmlunicode.c")
    sys.exit(1)

# Local-time timestamp embedded in the banners of the generated files
# (asctime() with no argument formats the current local time).
date = time.asctime()
209:
# Values substituted, in this order, into the three %s slots of each banner:
# the UCD web page, the generation timestamp and the source file names.
_banner_args = (webpage, date, sources)

# Opening part of the generated header: banner comment, include guard,
# feature test and the start of the extern "C" section.
_header_preamble = """/*
* Summary: Unicode character APIs
* Description: API for the Unicode character APIs
*
* This file is automatically generated from the
* UCS description files of the Unicode Character Database
* %s
* using the genUnicode.py Python script.
*
* Generation date: %s
* Sources: %s
* Author: Daniel Veillard
*/

#ifndef __XML_UNICODE_H__
#define __XML_UNICODE_H__

#include <libxml/xmlversion.h>

#ifdef LIBXML_UNICODE_ENABLED

#ifdef __cplusplus
extern "C" {
#endif

"""
header.write(_header_preamble % _banner_args)

# Opening part of the generated C file: banner comment, includes, the
# lookup-table types, and the first line of the xmlUnicodeBlocks array
# (its entries are written by the code that follows).
_output_preamble = """/*
* xmlunicode.c: this module implements the Unicode character APIs
*
* This file is automatically generated from the
* UCS description files of the Unicode Character Database
* %s
* using the genUnicode.py Python script.
*
* Generation date: %s
* Sources: %s
* Daniel Veillard <veillard@redhat.com>
*/

#define IN_LIBXML
#include "libxml.h"

#ifdef LIBXML_UNICODE_ENABLED

#include <string.h>
#include <libxml/xmlversion.h>
#include <libxml/xmlunicode.h>
#include <libxml/chvalid.h>

typedef int (xmlIntFunc)(int); /* just to keep one's mind untwisted */

typedef struct {
const char *rangename;
xmlIntFunc *func;
} xmlUnicodeRange;

typedef struct {
xmlUnicodeRange *table;
int numentries;
} xmlUnicodeNameTable;


static xmlIntFunc *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname);

static xmlUnicodeRange xmlUnicodeBlocks[] = {
"""
output.write(_output_preamble % _banner_args)
279:
# Entries of xmlUnicodeBlocks (the array was opened by the preamble): one
# {"name", function} pair per block, in sorted order.  The C identifier
# drops any '-' from the block name; the string key keeps it.
block_entries = [' {"%s", xmlUCSIs%s}' % (block, block.replace('-', ''))
                 for block in bkeys]
output.write(',\n'.join(block_entries))
output.write('};\n\n')

# Same layout for the categories table.
output.write('static xmlUnicodeRange xmlUnicodeCats[] = {\n')
cat_entries = [' {"%s", xmlUCSIsCat%s}' % (name, name) for name in ckeys]
output.write(',\n'.join(cat_entries))
output.write('};\n\n')
299:
#
# For any categories with more than minTableSize ranges we generate
# a range table suitable for xmlCharInRange
#
def _write_range_group(out, name, cat_ranges):
    """Write the xmlChSRange/xmlChLRange arrays and the xmlChRangeGroup
    for one category to `out`.

    Ranges whose upper bound is below 0x10000 go into the "S" array, all
    others into the "L" array; an array that receives no entry is not
    emitted and is referenced as NULL in the group.
    """
    short_count = 0
    long_count = 0
    short_ref = "NULL"
    long_ref = "NULL"
    for (low, high) in cat_ranges:
        if high < 0x10000:
            if short_count == 0:
                pline = "static const xmlChSRange xml%sS[] = {" % name
                short_ref = "xml%sS" % name
            else:
                pline += ", "
            short_count += 1
        else:
            if long_count == 0:
                # First long range: finish the short array, if one was begun.
                if short_count > 0:
                    out.write(pline + " };\n")
                pline = "static const xmlChLRange xml%sL[] = {" % name
                long_ref = "xml%sL" % name
            else:
                pline += ", "
            long_count += 1
        # Break the output line once it has grown past 60 characters.
        if len(pline) > 60:
            out.write(pline + "\n")
            pline = " "
        pline += "{%s, %s}" % (hex(low), hex(high))
    out.write(pline + " };\nstatic xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
              % (name, short_count, long_count, short_ref, long_ref))


for name in ckeys:
    if len(Categories[name]) > minTableSize:
        _write_range_group(output, name, Categories[name])
335:
336:
# C source for the two name-table descriptors and the binary-search helper
# xmlUnicodeLookup.  The two %s slots receive the entry counts of the block
# table and of the category table, in that order.
_lookup_template = """static xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
static xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};

/**
* xmlUnicodeLookup:
* @tptr: pointer to the name table
* @name: name to be found
*
* binary table lookup for user-supplied name
*
* Returns pointer to range function if found, otherwise NULL
*/
static xmlIntFunc
*xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname) {
int low, high, mid, cmp;
xmlUnicodeRange *sptr;

if ((tptr == NULL) || (tname == NULL)) return(NULL);

low = 0;
high = tptr->numentries - 1;
sptr = tptr->table;
while (low <= high) {
mid = (low + high) / 2;
if ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
return (sptr[mid].func);
if (cmp < 0)
high = mid - 1;
else
low = mid + 1;
}
return (NULL);
}

"""
_table_sizes = (len(BlockNames), len(Categories))
output.write(_lookup_template % _table_sizes)
373:
# One predicate per block: a prototype in the header and, in the C file, a
# documented function whose body ORs together a bounds test for every range
# recorded for that block.
for block in bkeys:
    # '-' cannot appear in a C identifier, so it is dropped from the name.
    name = block.replace('-', '')
    header.write("XMLPUBFUN int XMLCALL xmlUCSIs%s\t(int code);\n" % name)
    output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name))
    output.write(" *\n * Check whether the character is part of %s UCS Block\n"
                 % (block))
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIs%s(int code) {\n return(" % name)
    range_tests = ["((code >= %s) && (code <= %s))" % (start, end)
                   for (start, end) in BlockNames[block]]
    output.write(" ||\n ".join(range_tests))
    output.write(");\n}\n\n")
390:
# Prototype and definition of xmlUCSIsBlock, the by-name entry point that
# looks the block up in xmlUnicodeBlockTbl and calls the matching predicate.
_is_block_prototype = "\nXMLPUBFUN int XMLCALL xmlUCSIsBlock\t(int code, const char *block);\n\n"
header.write(_is_block_prototype)

_is_block_source = """/**
* xmlUCSIsBlock:
* @code: UCS code point
* @block: UCS block name
*
* Check whether the character is part of the UCS Block
*
* Returns 1 if true, 0 if false and -1 on unknown block
*/
int
xmlUCSIsBlock(int code, const char *block) {
xmlIntFunc *func;

func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
if (func == NULL)
return (-1);
return (func(code));
}

"""
output.write(_is_block_source)
413:
# One predicate per category: a prototype in the header and a documented
# function in the C file.  Categories with more than minTableSize ranges
# delegate to xmlCharInRange on their range group; the others test the
# ranges inline.
for name in ckeys:
    ranges = Categories[name]
    header.write("XMLPUBFUN int XMLCALL xmlUCSIsCat%s\t(int code);\n" % name)
    output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
    output.write(" *\n * Check whether the character is part of %s UCS Category\n"
                 % (name))
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
    if len(ranges) > minTableSize:
        output.write(" return(xmlCharInRange((unsigned int)code, &xml%sG)"
                     % name)
    else:
        inline_tests = []
        for (begin, end) in ranges:
            if begin == end:
                # A single code point needs only an equality test.
                inline_tests.append("(code == %s)" % (hex(begin)))
            else:
                inline_tests.append("((code >= %s) && (code <= %s))" % (
                    hex(begin), hex(end)))
        # "return(" is written only when there is at least one test.
        if inline_tests:
            output.write(" return(" + " ||\n ".join(inline_tests))
    output.write(");\n}\n\n")
440:
# Prototype and definition of xmlUCSIsCat, the by-name entry point that
# looks the category up in xmlUnicodeCatTbl and calls the matching
# predicate, followed by the closing lines of the C file.
_is_cat_prototype = "\nXMLPUBFUN int XMLCALL xmlUCSIsCat\t(int code, const char *cat);\n"
header.write(_is_cat_prototype)

_is_cat_source = """/**
* xmlUCSIsCat:
* @code: UCS code point
* @cat: UCS Category name
*
* Check whether the character is part of the UCS Category
*
* Returns 1 if true, 0 if false and -1 on unknown category
*/
int
xmlUCSIsCat(int code, const char *cat) {
xmlIntFunc *func;

func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
if (func == NULL)
return (-1);
return (func(code));
}

#define bottom_xmlunicode
#include "elfgcchack.h"
#endif /* LIBXML_UNICODE_ENABLED */
"""
output.write(_is_cat_source)
466:
# Close the extern "C" section, the feature test and the include guard that
# the header preamble opened, then release both generated files.
_header_trailer = """
#ifdef __cplusplus
}
#endif

#endif /* LIBXML_UNICODE_ENABLED */

#endif /* __XML_UNICODE_H__ */
"""
header.write(_header_trailer)

header.close()
output.close()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>