embedaddon/libxml2/genUnicode.py - view

File: [ELWIX - Embedded LightWeight unIX -] / embedaddon / libxml2 / genUnicode.py
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Tue Feb 21 23:37:58 2012 UTC (13 years, 4 months ago) by misho
Branches: libxml2, MAIN
CVS tags: v2_9_1p0, v2_9_1, v2_8_0p0, v2_8_0, v2_7_8, HEAD

libxml2

#!/usr/bin/python -u
#
# Original script modified in November 2003 to take advantage of
# the character-validation range routines, and updated to the
# current Unicode information (Version 4.0.1)
#
# NOTE: there is an 'alias' facility for blocks which are not present in
#       the current release, but are needed for ABI compatibility.  This
#       must be accomplished MANUALLY!  Please see the comments below under
#       'blockAliases'
#
# The script reads the two Unicode data files named in 'sources' from the
# current directory and writes include/libxml/xmlunicode.h and xmlunicode.c.
# It only uses constructs that are valid in both Python 2 and Python 3.
#
import sys
import string  # not used any more; kept so the import list is unchanged
import time

webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"

#
# blockAliases is a small hack - it is used for mapping block names which
# were used in the 3.1 release, but are missing or changed in the current
# release.  The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
blockAliases = []
blockAliases.append("CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols")
blockAliases.append("Greek:GreekandCoptic")
blockAliases.append("PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A," +
                    "SupplementaryPrivateUseArea-B")

# minTableSize is the threshold for producing a range table: a category
# gets a table only when it has MORE than this number of ranges.  With
# this number of ranges or fewer, inline comparisons are generated.
minTableSize = 8

(blockfile, catfile) = sources.split()

#
# C templates for the generated files.
#
# NOTE(review): the leading whitespace of the lines inside the multi-line
# templates below was re-created using conventional C indentation, and the
# one-line C fragments further down are kept exactly as found -- TODO
# confirm both against the checked-in xmlunicode.c / xmlunicode.h before
# regenerating, to avoid a whitespace-only diff.
#

# Start of xmlunicode.h; takes (webpage, date, sources).
HEADER_PREAMBLE = """/*
 * Summary: Unicode character APIs
 * Description: API for the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Author: Daniel Veillard
 */

#ifndef __XML_UNICODE_H__
#define __XML_UNICODE_H__

#include <libxml/xmlversion.h>

#ifdef LIBXML_UNICODE_ENABLED

#ifdef __cplusplus
extern "C" {
#endif

"""

# End of xmlunicode.h.
HEADER_FOOTER = """
#ifdef __cplusplus
}
#endif

#endif /* LIBXML_UNICODE_ENABLED */

#endif /* __XML_UNICODE_H__ */
"""

# Start of xmlunicode.c, up to and including the opening line of the
# block name table; takes (webpage, date, sources).
SOURCE_PREAMBLE = """/*
 * xmlunicode.c: this module implements the Unicode character APIs
 *
 * This file is automatically generated from the
 * UCS description files of the Unicode Character Database
 * %s
 * using the genUnicode.py Python script.
 *
 * Generation date: %s
 * Sources: %s
 * Daniel Veillard <veillard@redhat.com>
 */

#define IN_LIBXML
#include "libxml.h"

#ifdef LIBXML_UNICODE_ENABLED

#include <string.h>
#include <libxml/xmlversion.h>
#include <libxml/xmlunicode.h>
#include <libxml/chvalid.h>

typedef int (xmlIntFunc)(int); /* just to keep one's mind untwisted */

typedef struct {
    const char *rangename;
    xmlIntFunc *func;
} xmlUnicodeRange;

typedef struct {
    xmlUnicodeRange *table;
    int numentries;
} xmlUnicodeNameTable;


static xmlIntFunc *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname);

static xmlUnicodeRange xmlUnicodeBlocks[] = {
"""

# Name-table descriptors and the binary-search helper; takes
# (number of block names, number of category names).
SOURCE_LOOKUP = """static xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
static xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};

/**
 * xmlUnicodeLookup:
 * @tptr: pointer to the name table
 * @name: name to be found
 *
 * binary table lookup for user-supplied name
 *
 * Returns pointer to range function if found, otherwise NULL
 */
static xmlIntFunc
*xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname) {
    int low, high, mid, cmp;
    xmlUnicodeRange *sptr;

    if ((tptr == NULL) || (tname == NULL)) return(NULL);

    low = 0;
    high = tptr->numentries - 1;
    sptr = tptr->table;
    while (low <= high) {
        mid = (low + high) / 2;
        if ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
            return (sptr[mid].func);
        if (cmp < 0)
            high = mid - 1;
        else
            low = mid + 1;
    }
    return (NULL);
}

"""

# By-name block test, written after the per-block functions.
SOURCE_IS_BLOCK = """/**
 * xmlUCSIsBlock:
 * @code: UCS code point
 * @block: UCS block name
 *
 * Check whether the character is part of the UCS Block
 *
 * Returns 1 if true, 0 if false and -1 on unknown block
 */
int
xmlUCSIsBlock(int code, const char *block) {
    xmlIntFunc *func;

    func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
    if (func == NULL)
        return (-1);
    return (func(code));
}

"""

# By-name category test plus the end of xmlunicode.c.
SOURCE_IS_CAT = """/**
 * xmlUCSIsCat:
 * @code: UCS code point
 * @cat: UCS Category name
 *
 * Check whether the character is part of the UCS Category
 *
 * Returns 1 if true, 0 if false and -1 on unknown category
 */
int
xmlUCSIsCat(int code, const char *cat) {
    xmlIntFunc *func;

    func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
    if (func == NULL)
        return (-1);
    return (func(code));
}

#define bottom_xmlunicode
#include "elfgcchack.h"
#endif /* LIBXML_UNICODE_ENABLED */
"""


def open_or_exit(path, mode, message):
    """Open *path* with *mode*; if that fails print *message* and exit(1)."""
    try:
        return open(path, mode)
    except IOError:
        print(message)
        sys.exit(1)


def read_blocks(lines):
    """Reduce the lines of the "blocks" file to a dictionary.

    The dictionary is indexed by block name (with spaces removed); each
    entry is a list of (start, end) tuples holding the block range as
    "0x"-prefixed hexadecimal strings.  Lines starting with '#' and blank
    lines are skipped; a line that cannot be split into "start..end; name"
    is reported on stdout and skipped.
    """
    block_names = {}
    for line in lines:
        # The comment test is made before stripping, so only a '#' in the
        # very first column marks a comment.
        if line.startswith('#'):
            continue
        line = line.strip()
        if line == '':
            continue
        try:
            fields = line.split(';')
            (start, end) = fields[0].strip().split("..")
            name = fields[1].strip().replace(' ', '')
        except (IndexError, ValueError):
            print("Failed to process line: %s" % (line))
            continue
        block_names.setdefault(name, []).append(("0x" + start, "0x" + end))
    return block_names


def apply_block_aliases(block_names, aliases):
    """Add the old block names described by *aliases* to *block_names*.

    Each alias is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]".  The
    old name receives the ranges of every new name that is present in
    *block_names*; a new name that is missing is reported on stdout.
    *block_names* is modified in place.
    """
    for entry in aliases:
        alias = entry.split(':')
        for comp in alias[1].split(','):
            if comp in block_names:
                # list() takes a snapshot in case old and new name coincide
                block_names.setdefault(alias[0], []).extend(
                    list(block_names[comp]))
            else:
                print("Alias %s: %s not in Blocks" % (alias[0], comp))


def read_categories(lines):
    """Invert the lines of the Categories file.

    The file is in code sequence; the result is a dictionary indexed by
    category name, each entry containing all the codepoints of that
    category in file order.  Category names comprise two parts - the
    general category, and the "subclass" within that category.  Therefore
    both the general category (the first character of the name) and the
    full name are entered into the dictionary.

    Returns a tuple (number of characters accepted, dictionary).  A line
    whose first field is not a hexadecimal number, which has fewer than
    three fields, or whose category field is empty, is reported on stdout
    and skipped.
    """
    # NOTE(review): only the first and third field of each line are read.
    # UnicodeData.txt presumably describes some large ranges as a pair of
    # "<..., First>" / "<..., Last>" lines; those would be recorded here
    # as two single code points -- confirm whether that is intended.
    nbchar = 0
    categories = {}
    for line in lines:
        if line.startswith('#'):
            continue
        line = line.strip()
        if line == '':
            continue
        try:
            fields = line.split(';')
            value = int(fields[0].strip(), 16)
            name = fields[2]
        except (IndexError, ValueError):
            print("Failed to process line: %s" % (line))
            continue
        if not name:
            # an empty category would otherwise produce a function called
            # xmlUCSIsCat, clashing with the by-name lookup function
            print("Failed to process line: %s" % (line))
            continue

        nbchar = nbchar + 1
        # update entry for "full name"
        categories.setdefault(name, []).append(value)
        # update "general category" name
        categories.setdefault(name[0], []).append(value)
    return (nbchar, categories)


def collapse_ranges(values):
    """Reduce a number list into a list of inclusive (first, last) ranges.

    Consecutive numbers (each one greater than its predecessor by exactly
    one) are merged; the order of *values* is kept, nothing is sorted.
    """
    ranges = []
    start = None
    prev = None
    for val in values:
        if start is not None and val != prev + 1:
            # the run ended with the previous value
            ranges.append((start, prev))
            start = val
        elif start is None:
            start = val
        prev = val
    if start is not None:
        ranges.append((start, prev))
    return ranges


def format_range_tables(name, ranges):
    """Return the C range tables for category *name*.

    Ranges whose upper bound is below 0x10000 go into an xmlChSRange
    array "xml<name>S", all others into an xmlChLRange array
    "xml<name>L"; an xmlChRangeGroup "xml<name>G" referencing both (or
    NULL for a missing one) comes last.  This layout is the one used with
    xmlCharInRange in the generated category functions.
    """
    chunks = []
    numshort = 0
    numlong = 0
    sptr = "NULL"
    lptr = "NULL"
    pline = ""
    for (low, high) in ranges:
        if high < 0x10000:
            if numshort == 0:
                pline = "static const xmlChSRange xml%sS[] = {" % name
                sptr = "xml%sS" % name
            else:
                pline += ", "
            numshort += 1
        else:
            if numlong == 0:
                if numshort > 0:
                    # first long range: finish the short table
                    chunks.append(pline + " };\n")
                pline = "static const xmlChLRange xml%sL[] = {" % name
                lptr = "xml%sL" % name
            else:
                pline += ", "
            numlong += 1
        # start a new output line once the current one is long enough
        if len(pline) > 60:
            chunks.append(pline + "\n")
            pline = " "
        pline += "{%s, %s}" % (hex(low), hex(high))
    chunks.append(pline + " };\nstatic xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
                  % (name, numshort, numlong, sptr, lptr))
    return "".join(chunks)


def write_header(header, bkeys, ckeys, date):
    """Write the whole of xmlunicode.h to the open file *header*.

    *bkeys* and *ckeys* are the sorted block and category names, *date*
    the generation date placed in the leading comment.
    """
    header.write(HEADER_PREAMBLE % (webpage, date, sources))
    for block in bkeys:
        # '-' cannot be part of a C identifier
        name = block.replace('-', '')
        header.write("XMLPUBFUN int XMLCALL xmlUCSIs%s\t(int code);\n" % name)
    header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsBlock\t(int code, const char *block);\n\n")
    for name in ckeys:
        header.write("XMLPUBFUN int XMLCALL xmlUCSIsCat%s\t(int code);\n" % name)
    header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsCat\t(int code, const char *cat);\n")
    header.write(HEADER_FOOTER)


def write_source(output, block_names, categories, bkeys, ckeys, date):
    """Write the whole of xmlunicode.c to the open file *output*.

    *block_names* maps a block name to its list of (start, end) strings,
    *categories* maps a category name to its list of (first, last)
    integer ranges; *bkeys* and *ckeys* are their sorted keys and *date*
    is the generation date placed in the leading comment.
    """
    output.write(SOURCE_PREAMBLE % (webpage, date, sources))

    # The two name tables; their entries must be in sorted order because
    # the generated xmlUnicodeLookup does a binary search on them.
    output.write(',\n'.join([' {"%s", xmlUCSIs%s}' % (block, block.replace('-', ''))
                             for block in bkeys]))
    output.write('};\n\n')

    output.write('static xmlUnicodeRange xmlUnicodeCats[] = {\n')
    output.write(',\n'.join([' {"%s", xmlUCSIsCat%s}' % (name, name)
                             for name in ckeys]))
    output.write('};\n\n')

    #
    # For any categories with more than minTableSize ranges we generate
    # a range table suitable for xmlCharInRange
    #
    for name in ckeys:
        if len(categories[name]) > minTableSize:
            output.write(format_range_tables(name, categories[name]))

    output.write(SOURCE_LOOKUP % (len(block_names), len(categories)))

    # One test function per block, always using inline comparisons.
    for block in bkeys:
        name = block.replace('-', '')
        output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name))
        output.write(" *\n * Check whether the character is part of %s UCS Block\n" %
                     (block))
        output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
        output.write("int\nxmlUCSIs%s(int code) {\n return(" % name)
        output.write(" ||\n ".join(["((code >= %s) && (code <= %s))" % (start, end)
                                    for (start, end) in block_names[block]]))
        output.write(");\n}\n\n")

    output.write(SOURCE_IS_BLOCK)

    # One test function per category, using the range table when one was
    # generated above and inline comparisons otherwise.
    for name in ckeys:
        ranges = categories[name]
        output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
        output.write(" *\n * Check whether the character is part of %s UCS Category\n" %
                     (name))
        output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
        output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
        if len(ranges) > minTableSize:
            output.write(" return(xmlCharInRange((unsigned int)code, &xml%sG)"
                         % name)
        else:
            terms = []
            for (begin, end) in ranges:
                if begin == end:
                    terms.append("(code == %s)" % (hex(begin)))
                else:
                    terms.append("((code >= %s) && (code <= %s))" % (
                        hex(begin), hex(end)))
            if terms:
                output.write(" return(" + " ||\n ".join(terms))
        output.write(");\n}\n\n")

    output.write(SOURCE_IS_CAT)


def main(blockfile=blockfile, catfile=catfile,
         header_path="include/libxml/xmlunicode.h",
         source_path="xmlunicode.c"):
    """Read the Unicode data files and generate the C header and source.

    With the default arguments the input files are the ones named in
    'sources' and the output goes to include/libxml/xmlunicode.h and
    xmlunicode.c, all relative to the current directory.  The process
    exits with status 1 when a file cannot be opened.
    """
    #
    # Process the "blocks" file, then add the aliases for old block names
    #
    blocks = open_or_exit(blockfile, "r", "Missing %s, aborting ..." % blockfile)
    try:
        block_names = read_blocks(blocks)
    finally:
        blocks.close()
    print("Parsed %d blocks descriptions" % (len(block_names)))

    apply_block_aliases(block_names, blockAliases)

    #
    # Next process the Categories file
    #
    data = open_or_exit(catfile, "r", "Missing %s, aborting ..." % catfile)
    try:
        (nbchar, categories) = read_categories(data)
    finally:
        # the category file itself is closed here (the blocks file was
        # already closed above)
        data.close()
    print("Parsed %d char generating %d categories" % (nbchar, len(categories)))

    #
    # The data is now all read.  Time to process it into a more useful form.
    #
    # reduce the number list into ranges
    for cat in list(categories):
        categories[cat] = collapse_ranges(categories[cat])

    #
    # Assure all data is in alphabetic order, since we will be doing binary
    # searches on the tables.
    #
    bkeys = sorted(block_names)
    ckeys = sorted(categories)

    #
    # Generate the resulting files
    #
    header = open_or_exit(header_path, "w", "Failed to open %s" % header_path)
    try:
        output = open_or_exit(source_path, "w", "Failed to open %s" % source_path)
        try:
            date = time.asctime(time.localtime(time.time()))
            write_header(header, bkeys, ckeys, date)
            write_source(output, block_names, categories, bkeys, ckeys, date)
        finally:
            output.close()
    finally:
        header.close()


if __name__ == "__main__":
    main()