Annotation of embedaddon/libxml2/genUnicode.py, revision 1.1.1.1

1.1       misho       1: #!/usr/bin/python -u
                      2: #
                      3: # Original script modified in November 2003 to take advantage of
                      4: # the character-validation range routines, and updated to the
                      5: # current Unicode information (Version 4.0.1)
                      6: #
                      7: # NOTE: there is an 'alias' facility for blocks which are not present in
                      8: #      the current release, but are needed for ABI compatibility.  This
                      9: #      must be accomplished MANUALLY!  Please see the comments below under
                     10: #     'blockAliases'
                     11: #
                     12: import sys
                     13: import string
                     14: import time
                     15: 
                     16: webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
                     17: sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"
                     18: 
                     19: #
                     20: # blockAliases is a small hack - it is used for mapping block names which
                     21: # were used in the 3.1 release, but are missing or changed in the current
                     22: # release.  The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
                     23: blockAliases = []
                     24: blockAliases.append("CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols")
                     25: blockAliases.append("Greek:GreekandCoptic")
                     26: blockAliases.append("PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A," + 
                     27:        "SupplementaryPrivateUseArea-B")
                     28: 
                     29: # minTableSize is the range-count threshold for producing a range table:
                     30: # a table is generated only when a category has more than this many
                     31: # ranges; otherwise inline comparisons are generated
                     32: minTableSize = 8
                     33: 
                     34: (blockfile, catfile) = string.split(sources)
                     35: 
                     36: 
                     37: #
                     38: # Now process the "blocks" file, reducing it to a dictionary
                     39: # indexed by blockname, containing a tuple with the applicable
                     40: # block range
                     41: #
                     42: BlockNames = {}
                     43: try:
                     44:     blocks = open(blockfile, "r")
                     45: except:
                     46:     print "Missing %s, aborting ..." % blockfile
                     47:     sys.exit(1)
                     48: 
                     49: for line in blocks.readlines():
                     50:     if line[0] == '#':
                     51:         continue
                     52:     line = string.strip(line)
                     53:     if line == '':
                     54:         continue
                     55:     try:
                     56:         fields = string.split(line, ';')
                     57:         range = string.strip(fields[0])
                     58:         (start, end) = string.split(range, "..")
                     59:         name = string.strip(fields[1])
                     60:         name = string.replace(name, ' ', '')
                     61:     except:
                     62:         print "Failed to process line: %s" % (line)
                     63:         continue
                     64:     start = "0x" + start
                     65:     end = "0x" + end
                     66:     try:
                     67:         BlockNames[name].append((start, end))
                     68:     except:
                     69:         BlockNames[name] = [(start, end)]
                     70: blocks.close()
                     71: print "Parsed %d blocks descriptions" % (len(BlockNames.keys()))
                     72: 
                     73: for block in blockAliases:
                     74:     alias = string.split(block,':')
                     75:     alist = string.split(alias[1],',')
                     76:     for comp in alist:
                     77:         if BlockNames.has_key(comp):
                     78:             if alias[0] not in BlockNames:
                     79:                 BlockNames[alias[0]] = []
                     80:             for r in BlockNames[comp]:
                     81:                 BlockNames[alias[0]].append(r)
                     82:         else:
                     83:             print "Alias %s: %s not in Blocks" % (alias[0], comp)
                     84:             continue
                     85: 
                     86: #
                     87: # Next process the Categories file. This is more complex, since
                     88: # the file is in code sequence, and we need to invert it.  We use
                     89: # a dictionary with index category-name, with each entry containing
                     90: # all the ranges (codepoints) of that category.  Note that category
                     91: # names comprise two parts - the general category, and the "subclass"
                     92: # within that category.  Therefore, both "general category" (which is
                     93: # the first character of the 2-character category-name) and the full
                     94: # (2-character) name are entered into this dictionary.
                     95: #
                     96: try:
                     97:     data = open(catfile, "r")
                     98: except:
                     99:     print "Missing %s, aborting ..." % catfile
                    100:     sys.exit(1)
                    101: 
                    102: nbchar = 0;
                    103: Categories = {}
                    104: for line in data.readlines():
                    105:     if line[0] == '#':
                    106:         continue
                    107:     line = string.strip(line)
                    108:     if line == '':
                    109:         continue
                    110:     try:
                    111:         fields = string.split(line, ';')
                    112:         point = string.strip(fields[0])
                    113:         value = 0
                    114:         while point != '':
                    115:             value = value * 16
                    116:             if point[0] >= '0' and point[0] <= '9':
                    117:                 value = value + ord(point[0]) - ord('0')
                    118:             elif point[0] >= 'A' and point[0] <= 'F':
                    119:                 value = value + 10 + ord(point[0]) - ord('A')
                    120:             elif point[0] >= 'a' and point[0] <= 'f':
                    121:                 value = value + 10 + ord(point[0]) - ord('a')
                    122:             point = point[1:]
                    123:         name = fields[2]
                    124:     except:
                    125:         print "Failed to process line: %s" % (line)
                    126:         continue
                    127:     
                    128:     nbchar = nbchar + 1
                    129:     # update entry for "full name"
                    130:     try:
                    131:         Categories[name].append(value)
                    132:     except:
                    133:         try:
                    134:             Categories[name] = [value]
                    135:         except:
                    136:             print "Failed to process line: %s" % (line)
                    137:     # update "general category" name
                    138:     try:
                    139:         Categories[name[0]].append(value)
                    140:     except:
                    141:         try:
                    142:             Categories[name[0]] = [value]
                    143:         except:
                    144:             print "Failed to process line: %s" % (line)
                    145: 
                    146: blocks.close()
                    147: print "Parsed %d char generating %d categories" % (nbchar, len(Categories.keys()))
                    148: 
                    149: #
                    150: # The data is now all read.  Time to process it into a more useful form.
                    151: #
                    152: # reduce the number list into ranges
                    153: for cat in Categories.keys():
                    154:     list = Categories[cat]
                    155:     start = -1
                    156:     prev = -1
                    157:     end = -1
                    158:     ranges = []
                    159:     for val in list:
                    160:         if start == -1:
                    161:             start = val
                    162:             prev = val
                    163:             continue
                    164:         elif val == prev + 1:
                    165:             prev = val
                    166:             continue
                    167:         elif prev == start:
                    168:             ranges.append((prev, prev))
                    169:             start = val
                    170:             prev = val
                    171:             continue
                    172:         else:
                    173:             ranges.append((start, prev))
                    174:             start = val
                    175:             prev = val
                    176:             continue
                    177:     if prev == start:
                    178:         ranges.append((prev, prev))
                    179:     else:
                    180:         ranges.append((start, prev))
                    181:     Categories[cat] = ranges
                    182: 
                    183: #
                    184: # Ensure all data is in alphabetic order, since we will be doing binary
                    185: # searches on the tables.
                    186: #
                    187: bkeys = BlockNames.keys()
                    188: bkeys.sort()
                    189: 
                    190: ckeys = Categories.keys()
                    191: ckeys.sort()
                    192: 
                    193: #
                    194: # Generate the resulting files
                    195: #
                    196: try:
                    197:     header = open("include/libxml/xmlunicode.h", "w")
                    198: except:
                    199:     print "Failed to open include/libxml/xmlunicode.h"
                    200:     sys.exit(1)
                    201: 
                    202: try:
                    203:     output = open("xmlunicode.c", "w")
                    204: except:
                    205:     print "Failed to open xmlunicode.c"
                    206:     sys.exit(1)
                    207: 
                    208: date = time.asctime(time.localtime(time.time()))
                    209: 
                    210: header.write(
                    211: """/*
                    212:  * Summary: Unicode character APIs
                    213:  * Description: API for the Unicode character APIs
                    214:  *
                    215:  * This file is automatically generated from the
                    216:  * UCS description files of the Unicode Character Database
                    217:  * %s
                    218:  * using the genUnicode.py Python script.
                    219:  *
                    220:  * Generation date: %s
                    221:  * Sources: %s
                    222:  * Author: Daniel Veillard
                    223:  */
                    224: 
                    225: #ifndef __XML_UNICODE_H__
                    226: #define __XML_UNICODE_H__
                    227: 
                    228: #include <libxml/xmlversion.h>
                    229: 
                    230: #ifdef LIBXML_UNICODE_ENABLED
                    231: 
                    232: #ifdef __cplusplus
                    233: extern "C" {
                    234: #endif
                    235: 
                    236: """ % (webpage, date, sources));
                    237: 
                    238: output.write(
                    239: """/*
                    240:  * xmlunicode.c: this module implements the Unicode character APIs
                    241:  *
                    242:  * This file is automatically generated from the
                    243:  * UCS description files of the Unicode Character Database
                    244:  * %s
                    245:  * using the genUnicode.py Python script.
                    246:  *
                    247:  * Generation date: %s
                    248:  * Sources: %s
                    249:  * Daniel Veillard <veillard@redhat.com>
                    250:  */
                    251: 
                    252: #define IN_LIBXML
                    253: #include "libxml.h"
                    254: 
                    255: #ifdef LIBXML_UNICODE_ENABLED
                    256: 
                    257: #include <string.h>
                    258: #include <libxml/xmlversion.h>
                    259: #include <libxml/xmlunicode.h>
                    260: #include <libxml/chvalid.h>
                    261: 
                    262: typedef int (xmlIntFunc)(int); /* just to keep one's mind untwisted */
                    263: 
                    264: typedef struct {
                    265:     const char *rangename;
                    266:     xmlIntFunc *func;
                    267: } xmlUnicodeRange;
                    268: 
                    269: typedef struct {
                    270:     xmlUnicodeRange *table;
                    271:     int                    numentries;
                    272: } xmlUnicodeNameTable;
                    273: 
                    274: 
                    275: static xmlIntFunc *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname);
                    276: 
                    277: static xmlUnicodeRange xmlUnicodeBlocks[] = {
                    278: """ % (webpage, date, sources));
                    279: 
                    280: flag = 0
                    281: for block in bkeys:
                    282:     name = string.replace(block, '-', '')
                    283:     if flag:
                    284:         output.write(',\n')
                    285:     else:
                    286:         flag = 1
                    287:     output.write('  {"%s", xmlUCSIs%s}' % (block, name))
                    288: output.write('};\n\n')
                    289: 
                    290: output.write('static xmlUnicodeRange xmlUnicodeCats[] = {\n')
                    291: flag = 0;
                    292: for name in ckeys:
                    293:     if flag:
                    294:         output.write(',\n')
                    295:     else:
                    296:         flag = 1
                    297:     output.write('  {"%s", xmlUCSIsCat%s}' % (name, name))
                    298: output.write('};\n\n')
                    299: 
                    300: #
                    301: # For any categories with more than minTableSize ranges we generate
                    302: # a range table suitable for xmlCharInRange
                    303: #
                    304: for name in ckeys:
                    305:   if len(Categories[name]) > minTableSize:
                    306:     numshort = 0
                    307:     numlong = 0
                    308:     ranges = Categories[name]
                    309:     sptr = "NULL"
                    310:     lptr = "NULL"
                    311:     for range in ranges:
                    312:       (low, high) = range
                    313:       if high < 0x10000:
                    314:         if numshort == 0:
                    315:           pline = "static const xmlChSRange xml%sS[] = {" % name
                    316:           sptr = "xml%sS" % name
                    317:         else:
                    318:           pline += ", "
                    319:         numshort += 1
                    320:       else:
                    321:         if numlong == 0:
                    322:           if numshort > 0:
                    323:             output.write(pline + " };\n")
                    324:           pline = "static const xmlChLRange xml%sL[] = {" % name
                    325:           lptr = "xml%sL" % name
                    326:         else:
                    327:           pline += ", "
                    328:         numlong += 1
                    329:       if len(pline) > 60:
                    330:         output.write(pline + "\n")
                    331:         pline = "    "
                    332:       pline += "{%s, %s}" % (hex(low), hex(high))
                    333:     output.write(pline + " };\nstatic xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
                    334:          % (name, numshort, numlong, sptr, lptr))
                    335: 
                    336: 
                    337: output.write(
                    338: """static xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
                    339: static xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};
                    340: 
                    341: /**
                    342:  * xmlUnicodeLookup:
                    343:  * @tptr: pointer to the name table
                    344:  * @name: name to be found
                    345:  *
                    346:  * binary table lookup for user-supplied name
                    347:  *
                    348:  * Returns pointer to range function if found, otherwise NULL
                    349:  */
                    350: static xmlIntFunc
                    351: *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname) {
                    352:     int low, high, mid, cmp;
                    353:     xmlUnicodeRange *sptr;
                    354: 
                    355:     if ((tptr == NULL) || (tname == NULL)) return(NULL);
                    356: 
                    357:     low = 0;
                    358:     high = tptr->numentries - 1;
                    359:     sptr = tptr->table;
                    360:     while (low <= high) {
                    361:        mid = (low + high) / 2;
                    362:        if ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
                    363:            return (sptr[mid].func);
                    364:        if (cmp < 0)
                    365:            high = mid - 1;
                    366:        else
                    367:            low = mid + 1;
                    368:     }
                    369:     return (NULL);    
                    370: }
                    371: 
                    372: """ % (len(BlockNames), len(Categories)) )
                    373: 
                    374: for block in bkeys:
                    375:     name = string.replace(block, '-', '')
                    376:     header.write("XMLPUBFUN int XMLCALL xmlUCSIs%s\t(int code);\n" % name)
                    377:     output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name))
                    378:     output.write(" *\n * Check whether the character is part of %s UCS Block\n"%
                    379:                  (block))
                    380:     output.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
                    381:     output.write("int\nxmlUCSIs%s(int code) {\n    return(" % name)
                    382:     flag = 0
                    383:     for (start, end) in BlockNames[block]:
                    384:         if flag:
                    385:             output.write(" ||\n           ")
                    386:         else:
                    387:             flag = 1
                    388:         output.write("((code >= %s) && (code <= %s))" % (start, end))
                    389:     output.write(");\n}\n\n")
                    390: 
                    391: header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsBlock\t(int code, const char *block);\n\n")
                    392: output.write(
                    393: """/**
                    394:  * xmlUCSIsBlock:
                    395:  * @code: UCS code point
                    396:  * @block: UCS block name
                    397:  *
                    398:  * Check whether the character is part of the UCS Block
                    399:  *
                    400:  * Returns 1 if true, 0 if false and -1 on unknown block
                    401:  */
                    402: int
                    403: xmlUCSIsBlock(int code, const char *block) {
                    404:     xmlIntFunc *func;
                    405: 
                    406:     func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
                    407:     if (func == NULL)
                    408:        return (-1);
                    409:     return (func(code));
                    410: }
                    411: 
                    412: """)
                    413: 
                    414: for name in ckeys:
                    415:     ranges = Categories[name]
                    416:     header.write("XMLPUBFUN int XMLCALL xmlUCSIsCat%s\t(int code);\n" % name)
                    417:     output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
                    418:     output.write(" *\n * Check whether the character is part of %s UCS Category\n"%
                    419:                  (name))
                    420:     output.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
                    421:     output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
                    422:     if len(Categories[name]) > minTableSize:
                    423:         output.write("    return(xmlCharInRange((unsigned int)code, &xml%sG)"
                    424:             % name)
                    425:     else:
                    426:         start = 1
                    427:         for range in ranges:
                    428:             (begin, end) = range;
                    429:             if start:
                    430:                 output.write("    return(");
                    431:                 start = 0
                    432:             else:
                    433:                 output.write(" ||\n           ");
                    434:             if (begin == end):
                    435:                 output.write("(code == %s)" % (hex(begin)))
                    436:             else:
                    437:                 output.write("((code >= %s) && (code <= %s))" % (
                    438:                          hex(begin), hex(end)))
                    439:     output.write(");\n}\n\n")
                    440: 
                    441: header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsCat\t(int code, const char *cat);\n")
                    442: output.write(
                    443: """/**
                    444:  * xmlUCSIsCat:
                    445:  * @code: UCS code point
                    446:  * @cat: UCS Category name
                    447:  *
                    448:  * Check whether the character is part of the UCS Category
                    449:  *
                    450:  * Returns 1 if true, 0 if false and -1 on unknown category
                    451:  */
                    452: int
                    453: xmlUCSIsCat(int code, const char *cat) {
                    454:     xmlIntFunc *func;
                    455: 
                    456:     func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
                    457:     if (func == NULL)
                    458:        return (-1);
                    459:     return (func(code));
                    460: }
                    461: 
                    462: #define bottom_xmlunicode
                    463: #include "elfgcchack.h"
                    464: #endif /* LIBXML_UNICODE_ENABLED */
                    465: """)
                    466: 
                    467: header.write("""
                    468: #ifdef __cplusplus
                    469: }
                    470: #endif
                    471: 
                    472: #endif /* LIBXML_UNICODE_ENABLED */
                    473: 
                    474: #endif /* __XML_UNICODE_H__ */
                    475: """);
                    476: 
                    477: header.close()
                    478: output.close()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>