Annotation of embedaddon/libxml2/genUnicode.py, revision 1.1

1.1     ! misho       1: #!/usr/bin/python -u
        !             2: #
        !             3: # Original script modified in November 2003 to take advantage of
        !             4: # the character-validation range routines, and updated to the
        !             5: # current Unicode information (Version 4.0.1)
        !             6: #
        !             7: # NOTE: there is an 'alias' facility for blocks which are not present in
        !             8: #      the current release, but are needed for ABI compatibility.  This
        !             9: #      must be accomplished MANUALLY!  Please see the comments below under
        !            10: #     'blockAliases'
        !            11: #
        !            12: import sys
        !            13: import string
        !            14: import time
        !            15: 
        !            16: webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
        !            17: sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"
        !            18: 
        !            19: #
        !            20: # blockAliases is a small hack - it is used for mapping block names which
        !            21: # were were used in the 3.1 release, but are missing or changed in the current
        !            22: # release.  The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
        !            23: blockAliases = []
        !            24: blockAliases.append("CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols")
        !            25: blockAliases.append("Greek:GreekandCoptic")
        !            26: blockAliases.append("PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A," + 
        !            27:        "SupplementaryPrivateUseArea-B")
        !            28: 
        !            29: # minTableSize gives the minimum number of ranges which must be present
        !            30: # before a range table is produced.  If there are less than this
        !            31: # number, inline comparisons are generated
        !            32: minTableSize = 8
        !            33: 
        !            34: (blockfile, catfile) = string.split(sources)
        !            35: 
        !            36: 
        !            37: #
        !            38: # Now process the "blocks" file, reducing it to a dictionary
        !            39: # indexed by blockname, containing a tuple with the applicable
        !            40: # block range
        !            41: #
        !            42: BlockNames = {}
        !            43: try:
        !            44:     blocks = open(blockfile, "r")
        !            45: except:
        !            46:     print "Missing %s, aborting ..." % blockfile
        !            47:     sys.exit(1)
        !            48: 
        !            49: for line in blocks.readlines():
        !            50:     if line[0] == '#':
        !            51:         continue
        !            52:     line = string.strip(line)
        !            53:     if line == '':
        !            54:         continue
        !            55:     try:
        !            56:         fields = string.split(line, ';')
        !            57:         range = string.strip(fields[0])
        !            58:         (start, end) = string.split(range, "..")
        !            59:         name = string.strip(fields[1])
        !            60:         name = string.replace(name, ' ', '')
        !            61:     except:
        !            62:         print "Failed to process line: %s" % (line)
        !            63:         continue
        !            64:     start = "0x" + start
        !            65:     end = "0x" + end
        !            66:     try:
        !            67:         BlockNames[name].append((start, end))
        !            68:     except:
        !            69:         BlockNames[name] = [(start, end)]
        !            70: blocks.close()
        !            71: print "Parsed %d blocks descriptions" % (len(BlockNames.keys()))
        !            72: 
        !            73: for block in blockAliases:
        !            74:     alias = string.split(block,':')
        !            75:     alist = string.split(alias[1],',')
        !            76:     for comp in alist:
        !            77:         if BlockNames.has_key(comp):
        !            78:             if alias[0] not in BlockNames:
        !            79:                 BlockNames[alias[0]] = []
        !            80:             for r in BlockNames[comp]:
        !            81:                 BlockNames[alias[0]].append(r)
        !            82:         else:
        !            83:             print "Alias %s: %s not in Blocks" % (alias[0], comp)
        !            84:             continue
        !            85: 
        !            86: #
        !            87: # Next process the Categories file. This is more complex, since
        !            88: # the file is in code sequence, and we need to invert it.  We use
        !            89: # a dictionary with index category-name, with each entry containing
        !            90: # all the ranges (codepoints) of that category.  Note that category
        !            91: # names comprise two parts - the general category, and the "subclass"
        !            92: # within that category.  Therefore, both "general category" (which is
        !            93: # the first character of the 2-character category-name) and the full
        !            94: # (2-character) name are entered into this dictionary.
        !            95: #
        !            96: try:
        !            97:     data = open(catfile, "r")
        !            98: except:
        !            99:     print "Missing %s, aborting ..." % catfile
        !           100:     sys.exit(1)
        !           101: 
        !           102: nbchar = 0;
        !           103: Categories = {}
        !           104: for line in data.readlines():
        !           105:     if line[0] == '#':
        !           106:         continue
        !           107:     line = string.strip(line)
        !           108:     if line == '':
        !           109:         continue
        !           110:     try:
        !           111:         fields = string.split(line, ';')
        !           112:         point = string.strip(fields[0])
        !           113:         value = 0
        !           114:         while point != '':
        !           115:             value = value * 16
        !           116:             if point[0] >= '0' and point[0] <= '9':
        !           117:                 value = value + ord(point[0]) - ord('0')
        !           118:             elif point[0] >= 'A' and point[0] <= 'F':
        !           119:                 value = value + 10 + ord(point[0]) - ord('A')
        !           120:             elif point[0] >= 'a' and point[0] <= 'f':
        !           121:                 value = value + 10 + ord(point[0]) - ord('a')
        !           122:             point = point[1:]
        !           123:         name = fields[2]
        !           124:     except:
        !           125:         print "Failed to process line: %s" % (line)
        !           126:         continue
        !           127:     
        !           128:     nbchar = nbchar + 1
        !           129:     # update entry for "full name"
        !           130:     try:
        !           131:         Categories[name].append(value)
        !           132:     except:
        !           133:         try:
        !           134:             Categories[name] = [value]
        !           135:         except:
        !           136:             print "Failed to process line: %s" % (line)
        !           137:     # update "general category" name
        !           138:     try:
        !           139:         Categories[name[0]].append(value)
        !           140:     except:
        !           141:         try:
        !           142:             Categories[name[0]] = [value]
        !           143:         except:
        !           144:             print "Failed to process line: %s" % (line)
        !           145: 
        !           146: blocks.close()
        !           147: print "Parsed %d char generating %d categories" % (nbchar, len(Categories.keys()))
        !           148: 
        !           149: #
        !           150: # The data is now all read.  Time to process it into a more useful form.
        !           151: #
        !           152: # reduce the number list into ranges
        !           153: for cat in Categories.keys():
        !           154:     list = Categories[cat]
        !           155:     start = -1
        !           156:     prev = -1
        !           157:     end = -1
        !           158:     ranges = []
        !           159:     for val in list:
        !           160:         if start == -1:
        !           161:             start = val
        !           162:             prev = val
        !           163:             continue
        !           164:         elif val == prev + 1:
        !           165:             prev = val
        !           166:             continue
        !           167:         elif prev == start:
        !           168:             ranges.append((prev, prev))
        !           169:             start = val
        !           170:             prev = val
        !           171:             continue
        !           172:         else:
        !           173:             ranges.append((start, prev))
        !           174:             start = val
        !           175:             prev = val
        !           176:             continue
        !           177:     if prev == start:
        !           178:         ranges.append((prev, prev))
        !           179:     else:
        !           180:         ranges.append((start, prev))
        !           181:     Categories[cat] = ranges
        !           182: 
        !           183: #
        !           184: # Assure all data is in alphabetic order, since we will be doing binary
        !           185: # searches on the tables.
        !           186: #
        !           187: bkeys = BlockNames.keys()
        !           188: bkeys.sort()
        !           189: 
        !           190: ckeys = Categories.keys()
        !           191: ckeys.sort()
        !           192: 
        !           193: #
        !           194: # Generate the resulting files
        !           195: #
        !           196: try:
        !           197:     header = open("include/libxml/xmlunicode.h", "w")
        !           198: except:
        !           199:     print "Failed to open include/libxml/xmlunicode.h"
        !           200:     sys.exit(1)
        !           201: 
        !           202: try:
        !           203:     output = open("xmlunicode.c", "w")
        !           204: except:
        !           205:     print "Failed to open xmlunicode.c"
        !           206:     sys.exit(1)
        !           207: 
        !           208: date = time.asctime(time.localtime(time.time()))
        !           209: 
        !           210: header.write(
        !           211: """/*
        !           212:  * Summary: Unicode character APIs
        !           213:  * Description: API for the Unicode character APIs
        !           214:  *
        !           215:  * This file is automatically generated from the
        !           216:  * UCS description files of the Unicode Character Database
        !           217:  * %s
        !           218:  * using the genUnicode.py Python script.
        !           219:  *
        !           220:  * Generation date: %s
        !           221:  * Sources: %s
        !           222:  * Author: Daniel Veillard
        !           223:  */
        !           224: 
        !           225: #ifndef __XML_UNICODE_H__
        !           226: #define __XML_UNICODE_H__
        !           227: 
        !           228: #include <libxml/xmlversion.h>
        !           229: 
        !           230: #ifdef LIBXML_UNICODE_ENABLED
        !           231: 
        !           232: #ifdef __cplusplus
        !           233: extern "C" {
        !           234: #endif
        !           235: 
        !           236: """ % (webpage, date, sources));
        !           237: 
        !           238: output.write(
        !           239: """/*
        !           240:  * xmlunicode.c: this module implements the Unicode character APIs
        !           241:  *
        !           242:  * This file is automatically generated from the
        !           243:  * UCS description files of the Unicode Character Database
        !           244:  * %s
        !           245:  * using the genUnicode.py Python script.
        !           246:  *
        !           247:  * Generation date: %s
        !           248:  * Sources: %s
        !           249:  * Daniel Veillard <veillard@redhat.com>
        !           250:  */
        !           251: 
        !           252: #define IN_LIBXML
        !           253: #include "libxml.h"
        !           254: 
        !           255: #ifdef LIBXML_UNICODE_ENABLED
        !           256: 
        !           257: #include <string.h>
        !           258: #include <libxml/xmlversion.h>
        !           259: #include <libxml/xmlunicode.h>
        !           260: #include <libxml/chvalid.h>
        !           261: 
        !           262: typedef int (xmlIntFunc)(int); /* just to keep one's mind untwisted */
        !           263: 
        !           264: typedef struct {
        !           265:     const char *rangename;
        !           266:     xmlIntFunc *func;
        !           267: } xmlUnicodeRange;
        !           268: 
        !           269: typedef struct {
        !           270:     xmlUnicodeRange *table;
        !           271:     int                    numentries;
        !           272: } xmlUnicodeNameTable;
        !           273: 
        !           274: 
        !           275: static xmlIntFunc *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname);
        !           276: 
        !           277: static xmlUnicodeRange xmlUnicodeBlocks[] = {
        !           278: """ % (webpage, date, sources));
        !           279: 
        !           280: flag = 0
        !           281: for block in bkeys:
        !           282:     name = string.replace(block, '-', '')
        !           283:     if flag:
        !           284:         output.write(',\n')
        !           285:     else:
        !           286:         flag = 1
        !           287:     output.write('  {"%s", xmlUCSIs%s}' % (block, name))
        !           288: output.write('};\n\n')
        !           289: 
        !           290: output.write('static xmlUnicodeRange xmlUnicodeCats[] = {\n')
        !           291: flag = 0;
        !           292: for name in ckeys:
        !           293:     if flag:
        !           294:         output.write(',\n')
        !           295:     else:
        !           296:         flag = 1
        !           297:     output.write('  {"%s", xmlUCSIsCat%s}' % (name, name))
        !           298: output.write('};\n\n')
        !           299: 
        !           300: #
        !           301: # For any categories with more than minTableSize ranges we generate
        !           302: # a range table suitable for xmlCharInRange
        !           303: #
        !           304: for name in ckeys:
        !           305:   if len(Categories[name]) > minTableSize:
        !           306:     numshort = 0
        !           307:     numlong = 0
        !           308:     ranges = Categories[name]
        !           309:     sptr = "NULL"
        !           310:     lptr = "NULL"
        !           311:     for range in ranges:
        !           312:       (low, high) = range
        !           313:       if high < 0x10000:
        !           314:         if numshort == 0:
        !           315:           pline = "static const xmlChSRange xml%sS[] = {" % name
        !           316:           sptr = "xml%sS" % name
        !           317:         else:
        !           318:           pline += ", "
        !           319:         numshort += 1
        !           320:       else:
        !           321:         if numlong == 0:
        !           322:           if numshort > 0:
        !           323:             output.write(pline + " };\n")
        !           324:           pline = "static const xmlChLRange xml%sL[] = {" % name
        !           325:           lptr = "xml%sL" % name
        !           326:         else:
        !           327:           pline += ", "
        !           328:         numlong += 1
        !           329:       if len(pline) > 60:
        !           330:         output.write(pline + "\n")
        !           331:         pline = "    "
        !           332:       pline += "{%s, %s}" % (hex(low), hex(high))
        !           333:     output.write(pline + " };\nstatic xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
        !           334:          % (name, numshort, numlong, sptr, lptr))
        !           335: 
        !           336: 
        !           337: output.write(
        !           338: """static xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
        !           339: static xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};
        !           340: 
        !           341: /**
        !           342:  * xmlUnicodeLookup:
        !           343:  * @tptr: pointer to the name table
        !           344:  * @name: name to be found
        !           345:  *
        !           346:  * binary table lookup for user-supplied name
        !           347:  *
        !           348:  * Returns pointer to range function if found, otherwise NULL
        !           349:  */
        !           350: static xmlIntFunc
        !           351: *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname) {
        !           352:     int low, high, mid, cmp;
        !           353:     xmlUnicodeRange *sptr;
        !           354: 
        !           355:     if ((tptr == NULL) || (tname == NULL)) return(NULL);
        !           356: 
        !           357:     low = 0;
        !           358:     high = tptr->numentries - 1;
        !           359:     sptr = tptr->table;
        !           360:     while (low <= high) {
        !           361:        mid = (low + high) / 2;
        !           362:        if ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
        !           363:            return (sptr[mid].func);
        !           364:        if (cmp < 0)
        !           365:            high = mid - 1;
        !           366:        else
        !           367:            low = mid + 1;
        !           368:     }
        !           369:     return (NULL);    
        !           370: }
        !           371: 
        !           372: """ % (len(BlockNames), len(Categories)) )
        !           373: 
        !           374: for block in bkeys:
        !           375:     name = string.replace(block, '-', '')
        !           376:     header.write("XMLPUBFUN int XMLCALL xmlUCSIs%s\t(int code);\n" % name)
        !           377:     output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name))
        !           378:     output.write(" *\n * Check whether the character is part of %s UCS Block\n"%
        !           379:                  (block))
        !           380:     output.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
        !           381:     output.write("int\nxmlUCSIs%s(int code) {\n    return(" % name)
        !           382:     flag = 0
        !           383:     for (start, end) in BlockNames[block]:
        !           384:         if flag:
        !           385:             output.write(" ||\n           ")
        !           386:         else:
        !           387:             flag = 1
        !           388:         output.write("((code >= %s) && (code <= %s))" % (start, end))
        !           389:     output.write(");\n}\n\n")
        !           390: 
        !           391: header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsBlock\t(int code, const char *block);\n\n")
        !           392: output.write(
        !           393: """/**
        !           394:  * xmlUCSIsBlock:
        !           395:  * @code: UCS code point
        !           396:  * @block: UCS block name
        !           397:  *
        !           398:  * Check whether the character is part of the UCS Block
        !           399:  *
        !           400:  * Returns 1 if true, 0 if false and -1 on unknown block
        !           401:  */
        !           402: int
        !           403: xmlUCSIsBlock(int code, const char *block) {
        !           404:     xmlIntFunc *func;
        !           405: 
        !           406:     func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
        !           407:     if (func == NULL)
        !           408:        return (-1);
        !           409:     return (func(code));
        !           410: }
        !           411: 
        !           412: """)
        !           413: 
        !           414: for name in ckeys:
        !           415:     ranges = Categories[name]
        !           416:     header.write("XMLPUBFUN int XMLCALL xmlUCSIsCat%s\t(int code);\n" % name)
        !           417:     output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
        !           418:     output.write(" *\n * Check whether the character is part of %s UCS Category\n"%
        !           419:                  (name))
        !           420:     output.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
        !           421:     output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
        !           422:     if len(Categories[name]) > minTableSize:
        !           423:         output.write("    return(xmlCharInRange((unsigned int)code, &xml%sG)"
        !           424:             % name)
        !           425:     else:
        !           426:         start = 1
        !           427:         for range in ranges:
        !           428:             (begin, end) = range;
        !           429:             if start:
        !           430:                 output.write("    return(");
        !           431:                 start = 0
        !           432:             else:
        !           433:                 output.write(" ||\n           ");
        !           434:             if (begin == end):
        !           435:                 output.write("(code == %s)" % (hex(begin)))
        !           436:             else:
        !           437:                 output.write("((code >= %s) && (code <= %s))" % (
        !           438:                          hex(begin), hex(end)))
        !           439:     output.write(");\n}\n\n")
        !           440: 
        !           441: header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsCat\t(int code, const char *cat);\n")
        !           442: output.write(
        !           443: """/**
        !           444:  * xmlUCSIsCat:
        !           445:  * @code: UCS code point
        !           446:  * @cat: UCS Category name
        !           447:  *
        !           448:  * Check whether the character is part of the UCS Category
        !           449:  *
        !           450:  * Returns 1 if true, 0 if false and -1 on unknown category
        !           451:  */
        !           452: int
        !           453: xmlUCSIsCat(int code, const char *cat) {
        !           454:     xmlIntFunc *func;
        !           455: 
        !           456:     func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
        !           457:     if (func == NULL)
        !           458:        return (-1);
        !           459:     return (func(code));
        !           460: }
        !           461: 
        !           462: #define bottom_xmlunicode
        !           463: #include "elfgcchack.h"
        !           464: #endif /* LIBXML_UNICODE_ENABLED */
        !           465: """)
        !           466: 
        !           467: header.write("""
        !           468: #ifdef __cplusplus
        !           469: }
        !           470: #endif
        !           471: 
        !           472: #endif /* LIBXML_UNICODE_ENABLED */
        !           473: 
        !           474: #endif /* __XML_UNICODE_H__ */
        !           475: """);
        !           476: 
        !           477: header.close()
        !           478: output.close()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>