Annotation of embedaddon/libxml2/genUnicode.py, revision 1.1
1.1 ! misho 1: #!/usr/bin/python -u
! 2: #
! 3: # Original script modified in November 2003 to take advantage of
! 4: # the character-validation range routines, and updated to the
! 5: # current Unicode information (Version 4.0.1)
! 6: #
! 7: # NOTE: there is an 'alias' facility for blocks which are not present in
! 8: # the current release, but are needed for ABI compatibility. This
! 9: # must be accomplished MANUALLY! Please see the comments below under
! 10: # 'blockAliases'
! 11: #
! 12: import sys
! 13: import string
! 14: import time
! 15:
# Where the Unicode Character Database release used here is described, and
# the two data files (block ranges, then per-character data) read below.
webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"

#
# blockAliases is a small hack - it maps block names which were used in
# the 3.1 release, but are missing or changed in the current release.
# The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
blockAliases = [
    "CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols",
    "Greek:GreekandCoptic",
    "PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A,"
    "SupplementaryPrivateUseArea-B",
]

# minTableSize gives the minimum number of ranges which must be present
# before a range table is produced.  If there are fewer than this
# number, inline comparisons are generated
minTableSize = 8

# "sources" holds exactly two whitespace-separated file names.
blockfile, catfile = sources.split()
! 35:
! 36:
#
# Now process the "blocks" file, reducing it to a dictionary indexed by
# blockname (spaces removed).  Each entry is a list of (start, end)
# tuples holding the block range as "0x..." strings, ready to be pasted
# into the generated C code.
#
BlockNames = {}
try:
    blocks = open(blockfile, "r")
except IOError:
    # Only a failure to open the file is treated as "missing"; any other
    # error is allowed to propagate instead of being misreported.
    print("Missing %s, aborting ..." % blockfile)
    sys.exit(1)

try:
    for line in blocks:
        line = line.strip()
        # skip blank lines and comments
        if line == '' or line.startswith('#'):
            continue
        try:
            # expected shape: "0000..007F; Basic Latin"
            fields = line.split(';')
            (start, end) = fields[0].strip().split("..")
            name = fields[1].strip().replace(' ', '')
        except (ValueError, IndexError):
            # ValueError: the range did not split into exactly two parts;
            # IndexError: there was no name field.
            print("Failed to process line: %s" % (line))
            continue
        BlockNames.setdefault(name, []).append(("0x" + start, "0x" + end))
finally:
    # release the file even if processing is interrupted
    blocks.close()
print("Parsed %d blocks descriptions" % (len(BlockNames)))
! 72:
# Expand each "Old:New1,New2,..." alias: the old name receives a copy of
# the ranges of every listed new block that is present in BlockNames.
for entry in blockAliases:
    parts = entry.split(':')
    old_name = parts[0]
    for comp in parts[1].split(','):
        if comp not in BlockNames:
            print("Alias %s: %s not in Blocks" % (old_name, comp))
            continue
        if old_name not in BlockNames:
            BlockNames[old_name] = []
        for r in BlockNames[comp]:
            BlockNames[old_name].append(r)
! 85:
#
# Next process the Categories file.  This is more complex, since
# the file is in code sequence, and we need to invert it.  We use
# a dictionary with index category-name, with each entry containing
# all the codepoints of that category.  Note that category
# names comprise two parts - the general category, and the "subclass"
# within that category.  Therefore, both "general category" (which is
# the first character of the 2-character category-name) and the full
# (2-character) name are entered into this dictionary.
#
# Large ranges are not listed character by character in the data file:
# they appear as a pair of lines whose name fields end in ", First>" and
# ", Last>".  Such a pair is expanded here into every codepoint from the
# first to the last, so that the whole range lands in the category.
#
try:
    data = open(catfile, "r")
except IOError:
    print("Missing %s, aborting ..." % catfile)
    sys.exit(1)

nbchar = 0
Categories = {}
pending_first = None    # codepoint of an open "<..., First>" line, if any
try:
    for line in data:
        line = line.strip()
        if line == '' or line.startswith('#'):
            continue
        try:
            fields = line.split(';')
            # int() rejects anything which is not a hex number, instead
            # of silently skipping over unexpected characters
            value = int(fields[0].strip(), 16)
            label = fields[1]
            name = fields[2]
            if name == '':
                # an empty category cannot be turned into a C identifier
                raise ValueError("empty category name")
        except (ValueError, IndexError):
            print("Failed to process line: %s" % (line))
            continue

        # A "Last" line which closes an open "First" line contributes all
        # the codepoints after the first one; any other line contributes
        # only its own codepoint.
        low = value
        if pending_first is not None and label.endswith(", Last>"):
            low = pending_first + 1
        if label.endswith(", First>"):
            pending_first = value
        else:
            pending_first = None

        full = Categories.setdefault(name, [])          # e.g. "Lu"
        general = Categories.setdefault(name[0], [])    # e.g. "L"
        # explicit loop: the name "range" is rebound as a plain variable
        # by other parts of this script, so the builtin is not used here
        point = low
        while point <= value:
            full.append(point)
            general.append(point)
            nbchar = nbchar + 1
            point = point + 1
finally:
    # close the categories file itself (not the blocks file)
    data.close()
print("Parsed %d char generating %d categories" % (nbchar, len(Categories)))
! 148:
#
# The data is now all read.  Time to process it into a more useful form.
#
# Replace each category's list of codepoints by a list of inclusive
# (first, last) tuples, one per run of consecutive codepoints.  A lone
# codepoint becomes a tuple whose two members are equal.
for cat in Categories.keys():
    spans = []
    first = -1      # -1 means that no run has been opened yet
    last = -1
    for cp in Categories[cat]:
        if first == -1:
            first = cp
        elif cp != last + 1:
            # the run is broken: record it and open a new one
            spans.append((first, last))
            first = cp
        last = cp
    # record the run which was still open at the end of the list
    spans.append((first, last))
    Categories[cat] = spans
! 182:
#
# Assure all data is in alphabetic order, since we will be doing binary
# searches on the tables.
#
bkeys = sorted(BlockNames.keys())
ckeys = sorted(Categories.keys())
! 192:
#
# Generate the resulting files
#
# Only a failure to open a file is reported as such; anything else
# (including an interrupt) is allowed to propagate.
try:
    header = open("include/libxml/xmlunicode.h", "w")
except IOError:
    print("Failed to open include/libxml/xmlunicode.h")
    sys.exit(1)

try:
    output = open("xmlunicode.c", "w")
except IOError:
    # do not leave the already opened header file dangling
    header.close()
    print("Failed to open xmlunicode.c")
    sys.exit(1)

# timestamp recorded in the banner of both generated files
date = time.asctime(time.localtime(time.time()))
! 209:
# Banner and opening guards of the generated header file.  The three %s
# slots receive the database web page, the generation date and the names
# of the source files, in that order.
header_preamble = """/*
* Summary: Unicode character APIs
* Description: API for the Unicode character APIs
*
* This file is automatically generated from the
* UCS description files of the Unicode Character Database
* %s
* using the genUnicode.py Python script.
*
* Generation date: %s
* Sources: %s
* Author: Daniel Veillard
*/

#ifndef __XML_UNICODE_H__
#define __XML_UNICODE_H__

#include <libxml/xmlversion.h>

#ifdef LIBXML_UNICODE_ENABLED

#ifdef __cplusplus
extern "C" {
#endif

"""
header.write(header_preamble % (webpage, date, sources))
! 237:
# Banner, includes and type declarations of the generated C file, ending
# with the opening line of the block-name table.  The three %s slots
# receive the database web page, the generation date and the names of
# the source files, in that order.
output_preamble = """/*
* xmlunicode.c: this module implements the Unicode character APIs
*
* This file is automatically generated from the
* UCS description files of the Unicode Character Database
* %s
* using the genUnicode.py Python script.
*
* Generation date: %s
* Sources: %s
* Daniel Veillard <veillard@redhat.com>
*/

#define IN_LIBXML
#include "libxml.h"

#ifdef LIBXML_UNICODE_ENABLED

#include <string.h>
#include <libxml/xmlversion.h>
#include <libxml/xmlunicode.h>
#include <libxml/chvalid.h>

typedef int (xmlIntFunc)(int); /* just to keep one's mind untwisted */

typedef struct {
const char *rangename;
xmlIntFunc *func;
} xmlUnicodeRange;

typedef struct {
xmlUnicodeRange *table;
int numentries;
} xmlUnicodeNameTable;


static xmlIntFunc *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname);

static xmlUnicodeRange xmlUnicodeBlocks[] = {
"""
output.write(output_preamble % (webpage, date, sources))
! 279:
# Rows of the block-name table (its opening line has already been
# written): each pairs the block name with its predicate function, whose
# name is the block name without '-'.
block_rows = [' {"%s", xmlUCSIs%s}' % (block, block.replace('-', ''))
              for block in bkeys]
output.write(',\n'.join(block_rows))
output.write('};\n\n')

# The category-name table, built the same way.
output.write('static xmlUnicodeRange xmlUnicodeCats[] = {\n')
cat_rows = [' {"%s", xmlUCSIsCat%s}' % (name, name) for name in ckeys]
output.write(',\n'.join(cat_rows))
output.write('};\n\n')
! 299:
#
# For any categories with more than minTableSize ranges we generate
# a range table suitable for xmlCharInRange
#
for name in ckeys:
    ranges = Categories[name]
    if len(ranges) <= minTableSize:
        # small categories get inline comparisons instead of a table
        continue
    numshort = 0
    numlong = 0
    sptr = "NULL"
    lptr = "NULL"
    for (low, high) in ranges:
        if high >= 0x10000:
            # entry of the "long" table
            if numlong != 0:
                pline += ", "
            else:
                if numshort > 0:
                    # terminate the "short" table before starting this one
                    output.write(pline + " };\n")
                pline = "static const xmlChLRange xml%sL[] = {" % name
                lptr = "xml%sL" % name
            numlong += 1
        else:
            # entry of the "short" table
            if numshort != 0:
                pline += ", "
            else:
                pline = "static const xmlChSRange xml%sS[] = {" % name
                sptr = "xml%sS" % name
            numshort += 1
        # flush the pending line once it has grown past 60 characters
        if len(pline) > 60:
            output.write(pline + "\n")
            pline = " "
        pline += "{%s, %s}" % (hex(low), hex(high))
    # terminate the last table and emit the group tying both tables together
    output.write(pline + " };\nstatic xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
                 % (name, numshort, numlong, sptr, lptr))
! 335:
! 336:
# Emit the two name-table descriptors (the %s slots receive the number
# of block names and of category names) followed by the binary-search
# helper.  The generated doc comment names the second parameter @tname,
# which is the name the C function actually uses.
output.write(
"""static xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
static xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};

/**
* xmlUnicodeLookup:
* @tptr: pointer to the name table
* @tname: name to be found
*
* binary table lookup for user-supplied name
*
* Returns pointer to range function if found, otherwise NULL
*/
static xmlIntFunc
*xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname) {
int low, high, mid, cmp;
xmlUnicodeRange *sptr;

if ((tptr == NULL) || (tname == NULL)) return(NULL);

low = 0;
high = tptr->numentries - 1;
sptr = tptr->table;
while (low <= high) {
mid = (low + high) / 2;
if ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
return (sptr[mid].func);
if (cmp < 0)
high = mid - 1;
else
low = mid + 1;
}
return (NULL);
}

""" % (len(BlockNames), len(Categories)) )
! 373:
# For every block: declare its predicate in the header and define it in
# the C file as an "or" of one interval test per range of the block.
for block in bkeys:
    ident = block.replace('-', '')
    tests = ["((code >= %s) && (code <= %s))" % (first, last)
             for (first, last) in BlockNames[block]]
    header.write("XMLPUBFUN int XMLCALL xmlUCSIs%s\t(int code);\n" % ident)
    output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (ident))
    output.write(" *\n * Check whether the character is part of %s UCS Block\n"%
                 (block))
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIs%s(int code) {\n return(" % ident)
    output.write(" ||\n ".join(tests))
    output.write(");\n}\n\n")
! 390:
# Declaration, then definition, of the lookup-by-name entry point for blocks.
header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsBlock\t(int code, const char *block);\n\n")
is_block_source = """/**
* xmlUCSIsBlock:
* @code: UCS code point
* @block: UCS block name
*
* Check whether the character is part of the UCS Block
*
* Returns 1 if true, 0 if false and -1 on unknown block
*/
int
xmlUCSIsBlock(int code, const char *block) {
xmlIntFunc *func;

func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
if (func == NULL)
return (-1);
return (func(code));
}

"""
output.write(is_block_source)
! 413:
# For every category: declare its predicate in the header and define it
# in the C file.  Categories with more than minTableSize ranges go
# through xmlCharInRange and their range group; the others get one
# inline comparison per range.
for name in ckeys:
    ranges = Categories[name]
    header.write("XMLPUBFUN int XMLCALL xmlUCSIsCat%s\t(int code);\n" % name)
    output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
    output.write(" *\n * Check whether the character is part of %s UCS Category\n"%
                 (name))
    output.write(" *\n * Returns 1 if true 0 otherwise\n */\n")
    output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
    if len(ranges) > minTableSize:
        output.write(" return(xmlCharInRange((unsigned int)code, &xml%sG)"
                     % name)
    else:
        tests = []
        for (begin, end) in ranges:
            if begin != end:
                tests.append("((code >= %s) && (code <= %s))" % (
                    hex(begin), hex(end)))
            else:
                tests.append("(code == %s)" % (hex(begin)))
        if tests:
            output.write(" return(")
            output.write(" ||\n ".join(tests))
    # closes the return expression opened by either branch above
    output.write(");\n}\n\n")
! 440:
# Declaration, then definition, of the lookup-by-name entry point for
# categories; this is also the tail of the generated C file.
header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsCat\t(int code, const char *cat);\n")
is_cat_source = """/**
* xmlUCSIsCat:
* @code: UCS code point
* @cat: UCS Category name
*
* Check whether the character is part of the UCS Category
*
* Returns 1 if true, 0 if false and -1 on unknown category
*/
int
xmlUCSIsCat(int code, const char *cat) {
xmlIntFunc *func;

func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
if (func == NULL)
return (-1);
return (func(code));
}

#define bottom_xmlunicode
#include "elfgcchack.h"
#endif /* LIBXML_UNICODE_ENABLED */
"""
output.write(is_cat_source)
! 466:
# Close the guards opened at the top of the header, then release both
# generated files.
header_footer = """
#ifdef __cplusplus
}
#endif

#endif /* LIBXML_UNICODE_ENABLED */

#endif /* __XML_UNICODE_H__ */
"""
header.write(header_footer)

header.close()
output.close()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>