File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / libxml2 / genUnicode.py
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Tue Feb 21 23:37:58 2012 UTC (12 years, 4 months ago) by misho
Branches: libxml2, MAIN
CVS tags: v2_9_1p0, v2_9_1, v2_8_0p0, v2_8_0, v2_7_8, HEAD
libxml2

    1: #!/usr/bin/python -u
    2: #
    3: # Original script modified in November 2003 to take advantage of
    4: # the character-validation range routines, and updated to the
    5: # current Unicode information (Version 4.0.1)
    6: #
    7: # NOTE: there is an 'alias' facility for blocks which are not present in
    8: #	the current release, but are needed for ABI compatibility.  This
    9: #	must be accomplished MANUALLY!  Please see the comments below under
   10: #     'blockAliases'
   11: #
   12: import sys
   13: import string
   14: import time
   15: 
   16: webpage = "http://www.unicode.org/Public/4.0-Update1/UCD-4.0.1.html"
   17: sources = "Blocks-4.0.1.txt UnicodeData-4.0.1.txt"
   18: 
   19: #
   20: # blockAliases is a small hack - it is used for mapping block names which
   21: # were were used in the 3.1 release, but are missing or changed in the current
   22: # release.  The format is "OldBlockName:NewBlockName1[,NewBlockName2[,...]]"
   23: blockAliases = []
   24: blockAliases.append("CombiningMarksforSymbols:CombiningDiacriticalMarksforSymbols")
   25: blockAliases.append("Greek:GreekandCoptic")
   26: blockAliases.append("PrivateUse:PrivateUseArea,SupplementaryPrivateUseArea-A," + 
   27: 	"SupplementaryPrivateUseArea-B")
   28: 
   29: # minTableSize gives the minimum number of ranges which must be present
   30: # before a range table is produced.  If there are less than this
   31: # number, inline comparisons are generated
   32: minTableSize = 8
   33: 
   34: (blockfile, catfile) = string.split(sources)
   35: 
   36: 
   37: #
   38: # Now process the "blocks" file, reducing it to a dictionary
   39: # indexed by blockname, containing a tuple with the applicable
   40: # block range
   41: #
   42: BlockNames = {}
   43: try:
   44:     blocks = open(blockfile, "r")
   45: except:
   46:     print "Missing %s, aborting ..." % blockfile
   47:     sys.exit(1)
   48: 
   49: for line in blocks.readlines():
   50:     if line[0] == '#':
   51:         continue
   52:     line = string.strip(line)
   53:     if line == '':
   54:         continue
   55:     try:
   56:         fields = string.split(line, ';')
   57:         range = string.strip(fields[0])
   58:         (start, end) = string.split(range, "..")
   59:         name = string.strip(fields[1])
   60:         name = string.replace(name, ' ', '')
   61:     except:
   62:         print "Failed to process line: %s" % (line)
   63:         continue
   64:     start = "0x" + start
   65:     end = "0x" + end
   66:     try:
   67:         BlockNames[name].append((start, end))
   68:     except:
   69:         BlockNames[name] = [(start, end)]
   70: blocks.close()
   71: print "Parsed %d blocks descriptions" % (len(BlockNames.keys()))
   72: 
   73: for block in blockAliases:
   74:     alias = string.split(block,':')
   75:     alist = string.split(alias[1],',')
   76:     for comp in alist:
   77:         if BlockNames.has_key(comp):
   78:             if alias[0] not in BlockNames:
   79:                 BlockNames[alias[0]] = []
   80:             for r in BlockNames[comp]:
   81:                 BlockNames[alias[0]].append(r)
   82:         else:
   83:             print "Alias %s: %s not in Blocks" % (alias[0], comp)
   84:             continue
   85: 
   86: #
   87: # Next process the Categories file. This is more complex, since
   88: # the file is in code sequence, and we need to invert it.  We use
   89: # a dictionary with index category-name, with each entry containing
   90: # all the ranges (codepoints) of that category.  Note that category
   91: # names comprise two parts - the general category, and the "subclass"
   92: # within that category.  Therefore, both "general category" (which is
   93: # the first character of the 2-character category-name) and the full
   94: # (2-character) name are entered into this dictionary.
   95: #
   96: try:
   97:     data = open(catfile, "r")
   98: except:
   99:     print "Missing %s, aborting ..." % catfile
  100:     sys.exit(1)
  101: 
  102: nbchar = 0;
  103: Categories = {}
  104: for line in data.readlines():
  105:     if line[0] == '#':
  106:         continue
  107:     line = string.strip(line)
  108:     if line == '':
  109:         continue
  110:     try:
  111:         fields = string.split(line, ';')
  112:         point = string.strip(fields[0])
  113:         value = 0
  114:         while point != '':
  115:             value = value * 16
  116:             if point[0] >= '0' and point[0] <= '9':
  117:                 value = value + ord(point[0]) - ord('0')
  118:             elif point[0] >= 'A' and point[0] <= 'F':
  119:                 value = value + 10 + ord(point[0]) - ord('A')
  120:             elif point[0] >= 'a' and point[0] <= 'f':
  121:                 value = value + 10 + ord(point[0]) - ord('a')
  122:             point = point[1:]
  123:         name = fields[2]
  124:     except:
  125:         print "Failed to process line: %s" % (line)
  126:         continue
  127:     
  128:     nbchar = nbchar + 1
  129:     # update entry for "full name"
  130:     try:
  131:         Categories[name].append(value)
  132:     except:
  133:         try:
  134:             Categories[name] = [value]
  135:         except:
  136:             print "Failed to process line: %s" % (line)
  137:     # update "general category" name
  138:     try:
  139:         Categories[name[0]].append(value)
  140:     except:
  141:         try:
  142:             Categories[name[0]] = [value]
  143:         except:
  144:             print "Failed to process line: %s" % (line)
  145: 
  146: blocks.close()
  147: print "Parsed %d char generating %d categories" % (nbchar, len(Categories.keys()))
  148: 
  149: #
  150: # The data is now all read.  Time to process it into a more useful form.
  151: #
  152: # reduce the number list into ranges
  153: for cat in Categories.keys():
  154:     list = Categories[cat]
  155:     start = -1
  156:     prev = -1
  157:     end = -1
  158:     ranges = []
  159:     for val in list:
  160:         if start == -1:
  161:             start = val
  162:             prev = val
  163:             continue
  164:         elif val == prev + 1:
  165:             prev = val
  166:             continue
  167:         elif prev == start:
  168:             ranges.append((prev, prev))
  169:             start = val
  170:             prev = val
  171:             continue
  172:         else:
  173:             ranges.append((start, prev))
  174:             start = val
  175:             prev = val
  176:             continue
  177:     if prev == start:
  178:         ranges.append((prev, prev))
  179:     else:
  180:         ranges.append((start, prev))
  181:     Categories[cat] = ranges
  182: 
  183: #
  184: # Assure all data is in alphabetic order, since we will be doing binary
  185: # searches on the tables.
  186: #
  187: bkeys = BlockNames.keys()
  188: bkeys.sort()
  189: 
  190: ckeys = Categories.keys()
  191: ckeys.sort()
  192: 
  193: #
  194: # Generate the resulting files
  195: #
  196: try:
  197:     header = open("include/libxml/xmlunicode.h", "w")
  198: except:
  199:     print "Failed to open include/libxml/xmlunicode.h"
  200:     sys.exit(1)
  201: 
  202: try:
  203:     output = open("xmlunicode.c", "w")
  204: except:
  205:     print "Failed to open xmlunicode.c"
  206:     sys.exit(1)
  207: 
  208: date = time.asctime(time.localtime(time.time()))
  209: 
  210: header.write(
  211: """/*
  212:  * Summary: Unicode character APIs
  213:  * Description: API for the Unicode character APIs
  214:  *
  215:  * This file is automatically generated from the
  216:  * UCS description files of the Unicode Character Database
  217:  * %s
  218:  * using the genUnicode.py Python script.
  219:  *
  220:  * Generation date: %s
  221:  * Sources: %s
  222:  * Author: Daniel Veillard
  223:  */
  224: 
  225: #ifndef __XML_UNICODE_H__
  226: #define __XML_UNICODE_H__
  227: 
  228: #include <libxml/xmlversion.h>
  229: 
  230: #ifdef LIBXML_UNICODE_ENABLED
  231: 
  232: #ifdef __cplusplus
  233: extern "C" {
  234: #endif
  235: 
  236: """ % (webpage, date, sources));
  237: 
  238: output.write(
  239: """/*
  240:  * xmlunicode.c: this module implements the Unicode character APIs
  241:  *
  242:  * This file is automatically generated from the
  243:  * UCS description files of the Unicode Character Database
  244:  * %s
  245:  * using the genUnicode.py Python script.
  246:  *
  247:  * Generation date: %s
  248:  * Sources: %s
  249:  * Daniel Veillard <veillard@redhat.com>
  250:  */
  251: 
  252: #define IN_LIBXML
  253: #include "libxml.h"
  254: 
  255: #ifdef LIBXML_UNICODE_ENABLED
  256: 
  257: #include <string.h>
  258: #include <libxml/xmlversion.h>
  259: #include <libxml/xmlunicode.h>
  260: #include <libxml/chvalid.h>
  261: 
  262: typedef int (xmlIntFunc)(int);	/* just to keep one's mind untwisted */
  263: 
  264: typedef struct {
  265:     const char *rangename;
  266:     xmlIntFunc *func;
  267: } xmlUnicodeRange;
  268: 
  269: typedef struct {
  270:     xmlUnicodeRange *table;
  271:     int		    numentries;
  272: } xmlUnicodeNameTable;
  273: 
  274: 
  275: static xmlIntFunc *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname);
  276: 
  277: static xmlUnicodeRange xmlUnicodeBlocks[] = {
  278: """ % (webpage, date, sources));
  279: 
  280: flag = 0
  281: for block in bkeys:
  282:     name = string.replace(block, '-', '')
  283:     if flag:
  284:         output.write(',\n')
  285:     else:
  286:         flag = 1
  287:     output.write('  {"%s", xmlUCSIs%s}' % (block, name))
  288: output.write('};\n\n')
  289: 
  290: output.write('static xmlUnicodeRange xmlUnicodeCats[] = {\n')
  291: flag = 0;
  292: for name in ckeys:
  293:     if flag:
  294:         output.write(',\n')
  295:     else:
  296:         flag = 1
  297:     output.write('  {"%s", xmlUCSIsCat%s}' % (name, name))
  298: output.write('};\n\n')
  299: 
  300: #
  301: # For any categories with more than minTableSize ranges we generate
  302: # a range table suitable for xmlCharInRange
  303: #
  304: for name in ckeys:
  305:   if len(Categories[name]) > minTableSize:
  306:     numshort = 0
  307:     numlong = 0
  308:     ranges = Categories[name]
  309:     sptr = "NULL"
  310:     lptr = "NULL"
  311:     for range in ranges:
  312:       (low, high) = range
  313:       if high < 0x10000:
  314:         if numshort == 0:
  315:           pline = "static const xmlChSRange xml%sS[] = {" % name
  316:           sptr = "xml%sS" % name
  317:         else:
  318:           pline += ", "
  319:         numshort += 1
  320:       else:
  321:         if numlong == 0:
  322:           if numshort > 0:
  323:             output.write(pline + " };\n")
  324:           pline = "static const xmlChLRange xml%sL[] = {" % name
  325:           lptr = "xml%sL" % name
  326:         else:
  327:           pline += ", "
  328:         numlong += 1
  329:       if len(pline) > 60:
  330:         output.write(pline + "\n")
  331:         pline = "    "
  332:       pline += "{%s, %s}" % (hex(low), hex(high))
  333:     output.write(pline + " };\nstatic xmlChRangeGroup xml%sG = {%s,%s,%s,%s};\n\n"
  334:          % (name, numshort, numlong, sptr, lptr))
  335: 
  336: 
  337: output.write(
  338: """static xmlUnicodeNameTable xmlUnicodeBlockTbl = {xmlUnicodeBlocks, %s};
  339: static xmlUnicodeNameTable xmlUnicodeCatTbl = {xmlUnicodeCats, %s};
  340: 
  341: /**
  342:  * xmlUnicodeLookup:
  343:  * @tptr: pointer to the name table
  344:  * @name: name to be found
  345:  *
  346:  * binary table lookup for user-supplied name
  347:  *
  348:  * Returns pointer to range function if found, otherwise NULL
  349:  */
  350: static xmlIntFunc
  351: *xmlUnicodeLookup(xmlUnicodeNameTable *tptr, const char *tname) {
  352:     int low, high, mid, cmp;
  353:     xmlUnicodeRange *sptr;
  354: 
  355:     if ((tptr == NULL) || (tname == NULL)) return(NULL);
  356: 
  357:     low = 0;
  358:     high = tptr->numentries - 1;
  359:     sptr = tptr->table;
  360:     while (low <= high) {
  361: 	mid = (low + high) / 2;
  362: 	if ((cmp=strcmp(tname, sptr[mid].rangename)) == 0)
  363: 	    return (sptr[mid].func);
  364: 	if (cmp < 0)
  365: 	    high = mid - 1;
  366: 	else
  367: 	    low = mid + 1;
  368:     }
  369:     return (NULL);    
  370: }
  371: 
  372: """ % (len(BlockNames), len(Categories)) )
  373: 
  374: for block in bkeys:
  375:     name = string.replace(block, '-', '')
  376:     header.write("XMLPUBFUN int XMLCALL xmlUCSIs%s\t(int code);\n" % name)
  377:     output.write("/**\n * xmlUCSIs%s:\n * @code: UCS code point\n" % (name))
  378:     output.write(" *\n * Check whether the character is part of %s UCS Block\n"%
  379:                  (block))
  380:     output.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
  381:     output.write("int\nxmlUCSIs%s(int code) {\n    return(" % name)
  382:     flag = 0
  383:     for (start, end) in BlockNames[block]:
  384:         if flag:
  385:             output.write(" ||\n           ")
  386:         else:
  387:             flag = 1
  388:         output.write("((code >= %s) && (code <= %s))" % (start, end))
  389:     output.write(");\n}\n\n")
  390: 
  391: header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsBlock\t(int code, const char *block);\n\n")
  392: output.write(
  393: """/**
  394:  * xmlUCSIsBlock:
  395:  * @code: UCS code point
  396:  * @block: UCS block name
  397:  *
  398:  * Check whether the character is part of the UCS Block
  399:  *
  400:  * Returns 1 if true, 0 if false and -1 on unknown block
  401:  */
  402: int
  403: xmlUCSIsBlock(int code, const char *block) {
  404:     xmlIntFunc *func;
  405: 
  406:     func = xmlUnicodeLookup(&xmlUnicodeBlockTbl, block);
  407:     if (func == NULL)
  408: 	return (-1);
  409:     return (func(code));
  410: }
  411: 
  412: """)
  413: 
  414: for name in ckeys:
  415:     ranges = Categories[name]
  416:     header.write("XMLPUBFUN int XMLCALL xmlUCSIsCat%s\t(int code);\n" % name)
  417:     output.write("/**\n * xmlUCSIsCat%s:\n * @code: UCS code point\n" % (name))
  418:     output.write(" *\n * Check whether the character is part of %s UCS Category\n"%
  419:                  (name))
  420:     output.write(" *\n * Returns 1 if true 0 otherwise\n */\n");
  421:     output.write("int\nxmlUCSIsCat%s(int code) {\n" % name)
  422:     if len(Categories[name]) > minTableSize:
  423:         output.write("    return(xmlCharInRange((unsigned int)code, &xml%sG)"
  424:             % name)
  425:     else:
  426:         start = 1
  427:         for range in ranges:
  428:             (begin, end) = range;
  429:             if start:
  430:                 output.write("    return(");
  431:                 start = 0
  432:             else:
  433:                 output.write(" ||\n           ");
  434:             if (begin == end):
  435:                 output.write("(code == %s)" % (hex(begin)))
  436:             else:
  437:                 output.write("((code >= %s) && (code <= %s))" % (
  438:                          hex(begin), hex(end)))
  439:     output.write(");\n}\n\n")
  440: 
  441: header.write("\nXMLPUBFUN int XMLCALL xmlUCSIsCat\t(int code, const char *cat);\n")
  442: output.write(
  443: """/**
  444:  * xmlUCSIsCat:
  445:  * @code: UCS code point
  446:  * @cat: UCS Category name
  447:  *
  448:  * Check whether the character is part of the UCS Category
  449:  *
  450:  * Returns 1 if true, 0 if false and -1 on unknown category
  451:  */
  452: int
  453: xmlUCSIsCat(int code, const char *cat) {
  454:     xmlIntFunc *func;
  455: 
  456:     func = xmlUnicodeLookup(&xmlUnicodeCatTbl, cat);
  457:     if (func == NULL)
  458: 	return (-1);
  459:     return (func(code));
  460: }
  461: 
  462: #define bottom_xmlunicode
  463: #include "elfgcchack.h"
  464: #endif /* LIBXML_UNICODE_ENABLED */
  465: """)
  466: 
  467: header.write("""
  468: #ifdef __cplusplus
  469: }
  470: #endif
  471: 
  472: #endif /* LIBXML_UNICODE_ENABLED */
  473: 
  474: #endif /* __XML_UNICODE_H__ */
  475: """);
  476: 
  477: header.close()
  478: output.close()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>