File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / libxml2 / python / drv_libxml2.py
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Tue Feb 21 23:38:00 2012 UTC (12 years, 7 months ago) by misho
Branches: libxml2, MAIN
CVS tags: v2_9_1p0, v2_9_1, v2_8_0p0, v2_8_0, v2_7_8, HEAD
libxml2

    1: # -*- coding: iso-8859-1 -*-
    2: """ A SAX2 driver for libxml2, on top of its XmlReader API
    3: 
    4: USAGE
    5:     # put this file (drv_libxml2.py) in PYTHONPATH
    6:     import xml.sax
    7:     reader = xml.sax.make_parser(["drv_libxml2"])
    8:     # ...and the rest is standard python sax.
    9: 
   10: CAVEATS
   11:     - Lexical handlers are supported, except for start/endEntity
   12:       (waiting for XmlReader.ResolveEntity) and start/endDTD
   13:     - Error callbacks are not exactly synchronous, they tend
   14:       to be invoked before the corresponding content callback,
   15:       because the underlying reader interface parses
   16:       data by chunks of 512 bytes
   17:     
   18: TODO
   19:     - search for TODO
   20:     - some ErrorHandler events (warning)
   21:     - some ContentHandler events (setDocumentLocator, skippedEntity)
   22:     - EntityResolver (using libxml2.?)
   23:     - DTDHandler (if/when libxml2 exposes such node types)
   24:     - DeclHandler (if/when libxml2 exposes such node types)
   25:     - property_xml_string?
   26:     - feature_string_interning?
   27:     - Incremental parser
   28:     - additional performance tuning:
   29:       - one might cache callbacks to avoid some name lookups
   30:       - one might implement a smarter way to pass attributes to startElement
   31:         (some kind of lazy evaluation?)
   32:       - there might be room for improvement in start/endPrefixMapping
   33:       - other?
   34: 
   35: """
   36: 
   37: __author__  = u"Stéphane Bidoul <sbi@skynet.be>"
   38: __version__ = "0.3"
   39: 
   40: import codecs
   41: from types import StringType, UnicodeType
   42: StringTypes = (StringType,UnicodeType)
   43: 
   44: from xml.sax._exceptions import *
   45: from xml.sax import xmlreader, saxutils
   46: from xml.sax.handler import \
   47:      feature_namespaces, \
   48:      feature_namespace_prefixes, \
   49:      feature_string_interning, \
   50:      feature_validation, \
   51:      feature_external_ges, \
   52:      feature_external_pes, \
   53:      property_lexical_handler, \
   54:      property_declaration_handler, \
   55:      property_dom_node, \
   56:      property_xml_string
   57: 
   58: # libxml2 returns strings as UTF8
   59: _decoder = codecs.lookup("utf8")[1]
   60: def _d(s):
   61:     if s is None:
   62:         return s
   63:     else:
   64:         return _decoder(s)[0]
   65: 
   66: try:
   67:     import libxml2
   68: except ImportError, e:
   69:     raise SAXReaderNotAvailable("libxml2 not available: " \
   70:                                 "import error was: %s" % e)
   71: 
   72: class Locator(xmlreader.Locator):
   73:     """SAX Locator adapter for libxml2.xmlTextReaderLocator"""
   74: 
   75:     def __init__(self,locator):
   76:         self.__locator = locator
   77: 
   78:     def getColumnNumber(self):
   79:         "Return the column number where the current event ends."
   80:         return -1
   81: 
   82:     def getLineNumber(self):
   83:         "Return the line number where the current event ends."
   84:         return self.__locator.LineNumber()
   85: 
   86:     def getPublicId(self):
   87:         "Return the public identifier for the current event."
   88:         return None
   89: 
   90:     def getSystemId(self):
   91:         "Return the system identifier for the current event."
   92:         return self.__locator.BaseURI()
   93: 
   94: class LibXml2Reader(xmlreader.XMLReader):
   95: 
   96:     def __init__(self):
   97:         xmlreader.XMLReader.__init__(self)
   98:         # features
   99:         self.__ns = 0
  100:         self.__nspfx = 0
  101:         self.__validate = 0
  102:         self.__extparams = 1
  103:         # parsing flag
  104:         self.__parsing = 0
  105:         # additional handlers
  106:         self.__lex_handler = None
  107:         self.__decl_handler = None
  108:         # error messages accumulator
  109:         self.__errors = None
  110: 
  111:     def _errorHandler(self,arg,msg,severity,locator):
  112:         if self.__errors is None:
  113:             self.__errors = []
  114:         self.__errors.append((severity,
  115:                               SAXParseException(msg,None,
  116:                                                 Locator(locator))))
  117: 
  118:     def _reportErrors(self,fatal):
  119:         for severity,exception in self.__errors:
  120:             if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
  121:                             libxml2.PARSER_SEVERITY_WARNING):
  122:                 self._err_handler.warning(exception)
  123:             else:
  124:                 # when fatal is set, the parse will stop;
  125:                 # we consider that the last error reported
  126:                 # is the fatal one.
  127:                 if fatal and exception is self.__errors[-1][1]:
  128:                     self._err_handler.fatalError(exception)
  129:                 else:
  130:                     self._err_handler.error(exception)
  131:         self.__errors = None
  132: 
  133:     def parse(self, source):
  134:         self.__parsing = 1
  135:         try:
  136:             # prepare source and create reader
  137:             if type(source) in StringTypes:
  138:                 reader = libxml2.newTextReaderFilename(source)
  139:             else:
  140:                 source = saxutils.prepare_input_source(source)
  141:                 input = libxml2.inputBuffer(source.getByteStream())
  142:                 reader = input.newTextReader(source.getSystemId())
  143:             reader.SetErrorHandler(self._errorHandler,None)
  144:             # configure reader
  145:             if self.__extparams:
  146:                 reader.SetParserProp(libxml2.PARSER_LOADDTD,1)
  147:                 reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS,1)
  148:                 reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES,1)
  149:                 reader.SetParserProp(libxml2.PARSER_VALIDATE,self.__validate)
  150:             else:
  151:                 reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
  152:             # we reuse attribute maps (for a slight performance gain)
  153:             if self.__ns:
  154:                 attributesNSImpl = xmlreader.AttributesNSImpl({},{})
  155:             else:
  156:                 attributesImpl = xmlreader.AttributesImpl({})
  157:             # prefixes to pop (for endPrefixMapping)
  158:             prefixes = []
  159:             # start loop
  160:             self._cont_handler.startDocument()
  161:             while 1:
  162:                 r = reader.Read()
  163:                 # check for errors
  164:                 if r == 1:
  165:                     if not self.__errors is None:
  166:                         self._reportErrors(0)
  167:                 elif r == 0:
  168:                     if not self.__errors is None:
  169:                         self._reportErrors(0)
  170:                     break # end of parse
  171:                 else:
  172:                     if not self.__errors is None:
  173:                         self._reportErrors(1)
  174:                     else:
  175:                         self._err_handler.fatalError(\
  176:                             SAXException("Read failed (no details available)"))
  177:                     break # fatal parse error
  178:                 # get node type
  179:                 nodeType = reader.NodeType()
  180:                 # Element
  181:                 if nodeType == 1: 
  182:                     if self.__ns:
  183:                         eltName = (_d(reader.NamespaceUri()),\
  184:                                    _d(reader.LocalName()))
  185:                         eltQName = _d(reader.Name())
  186:                         attributesNSImpl._attrs = attrs = {}
  187:                         attributesNSImpl._qnames = qnames = {}
  188:                         newPrefixes = []
  189:                         while reader.MoveToNextAttribute():
  190:                             qname = _d(reader.Name())
  191:                             value = _d(reader.Value())
  192:                             if qname.startswith("xmlns"):
  193:                                 if len(qname) > 5:
  194:                                     newPrefix = qname[6:]
  195:                                 else:
  196:                                     newPrefix = None
  197:                                 newPrefixes.append(newPrefix)
  198:                                 self._cont_handler.startPrefixMapping(\
  199:                                     newPrefix,value)
  200:                                 if not self.__nspfx:
  201:                                     continue # don't report xmlns attribute
  202:                             attName = (_d(reader.NamespaceUri()),
  203:                                        _d(reader.LocalName()))
  204:                             qnames[attName] = qname
  205:                             attrs[attName] = value
  206:                         reader.MoveToElement()
  207:                         self._cont_handler.startElementNS( \
  208:                             eltName,eltQName,attributesNSImpl) 
  209:                         if reader.IsEmptyElement():
  210:                             self._cont_handler.endElementNS(eltName,eltQName)
  211:                             for newPrefix in newPrefixes:
  212:                                 self._cont_handler.endPrefixMapping(newPrefix)
  213:                         else:
  214:                             prefixes.append(newPrefixes)
  215:                     else:
  216:                         eltName = _d(reader.Name())
  217:                         attributesImpl._attrs = attrs = {}
  218:                         while reader.MoveToNextAttribute():
  219:                             attName = _d(reader.Name())
  220:                             attrs[attName] = _d(reader.Value())
  221:                         reader.MoveToElement()
  222:                         self._cont_handler.startElement( \
  223:                             eltName,attributesImpl)
  224:                         if reader.IsEmptyElement():
  225:                             self._cont_handler.endElement(eltName)
  226:                 # EndElement
  227:                 elif nodeType == 15: 
  228:                     if self.__ns:
  229:                         self._cont_handler.endElementNS( \
  230:                              (_d(reader.NamespaceUri()),_d(reader.LocalName())),
  231:                              _d(reader.Name()))
  232:                         for prefix in prefixes.pop():
  233:                             self._cont_handler.endPrefixMapping(prefix)
  234:                     else:
  235:                         self._cont_handler.endElement(_d(reader.Name()))
  236:                 # Text
  237:                 elif nodeType == 3: 
  238:                     self._cont_handler.characters(_d(reader.Value()))
  239:                 # Whitespace
  240:                 elif nodeType == 13: 
  241:                     self._cont_handler.ignorableWhitespace(_d(reader.Value()))
  242:                 # SignificantWhitespace
  243:                 elif nodeType == 14:
  244:                     self._cont_handler.characters(_d(reader.Value()))
  245:                 # CDATA
  246:                 elif nodeType == 4:
  247:                     if not self.__lex_handler is None:
  248:                         self.__lex_handler.startCDATA()
  249:                     self._cont_handler.characters(_d(reader.Value()))
  250:                     if not self.__lex_handler is None:
  251:                         self.__lex_handler.endCDATA()
  252:                 # EntityReference
  253:                 elif nodeType == 5:
  254:                     if not self.__lex_handler is None:
  255:                         self.startEntity(_d(reader.Name()))
  256:                     reader.ResolveEntity()
  257:                 # EndEntity
  258:                 elif nodeType == 16:
  259:                     if not self.__lex_handler is None:
  260:                         self.endEntity(_d(reader.Name()))
  261:                 # ProcessingInstruction
  262:                 elif nodeType == 7: 
  263:                     self._cont_handler.processingInstruction( \
  264:                         _d(reader.Name()),_d(reader.Value()))
  265:                 # Comment
  266:                 elif nodeType == 8:
  267:                     if not self.__lex_handler is None:
  268:                         self.__lex_handler.comment(_d(reader.Value()))
  269:                 # DocumentType
  270:                 elif nodeType == 10:
  271:                     #if not self.__lex_handler is None:
  272:                     #    self.__lex_handler.startDTD()
  273:                     pass # TODO (how to detect endDTD? on first non-dtd event?)
  274:                 # XmlDeclaration
  275:                 elif nodeType == 17:
  276:                     pass # TODO
  277:                 # Entity
  278:                 elif nodeType == 6:
  279:                     pass # TODO (entity decl)
  280:                 # Notation (decl)
  281:                 elif nodeType == 12:
  282:                     pass # TODO
  283:                 # Attribute (never in this loop)
  284:                 #elif nodeType == 2: 
  285:                 #    pass
  286:                 # Document (not exposed)
  287:                 #elif nodeType == 9: 
  288:                 #    pass
  289:                 # DocumentFragment (never returned by XmlReader)
  290:                 #elif nodeType == 11:
  291:                 #    pass
  292:                 # None
  293:                 #elif nodeType == 0:
  294:                 #    pass
  295:                 # -
  296:                 else:
  297:                     raise SAXException("Unexpected node type %d" % nodeType)
  298:             if r == 0:
  299:                 self._cont_handler.endDocument()
  300:             reader.Close()
  301:         finally:
  302:             self.__parsing = 0
  303: 
  304:     def setDTDHandler(self, handler):
  305:         # TODO (when supported, the inherited method works just fine)
  306:         raise SAXNotSupportedException("DTDHandler not supported")
  307: 
  308:     def setEntityResolver(self, resolver):
  309:         # TODO (when supported, the inherited method works just fine)
  310:         raise SAXNotSupportedException("EntityResolver not supported")
  311: 
  312:     def getFeature(self, name):
  313:         if name == feature_namespaces:
  314:             return self.__ns
  315:         elif name == feature_namespace_prefixes:
  316:             return self.__nspfx
  317:         elif name == feature_validation:
  318:             return self.__validate
  319:         elif name == feature_external_ges:
  320:             return 1 # TODO (does that relate to PARSER_LOADDTD)?
  321:         elif name == feature_external_pes:
  322:             return self.__extparams
  323:         else:
  324:             raise SAXNotRecognizedException("Feature '%s' not recognized" % \
  325:                                             name)
  326: 
  327:     def setFeature(self, name, state):
  328:         if self.__parsing:
  329:             raise SAXNotSupportedException("Cannot set feature %s " \
  330:                                            "while parsing" % name)
  331:         if name == feature_namespaces:
  332:             self.__ns = state
  333:         elif name == feature_namespace_prefixes:
  334:             self.__nspfx = state
  335:         elif name == feature_validation:
  336:             self.__validate = state
  337:         elif name == feature_external_ges:
  338:             if state == 0:
  339:                 # TODO (does that relate to PARSER_LOADDTD)?
  340:                 raise SAXNotSupportedException("Feature '%s' not supported" % \
  341:                                                name)
  342:         elif name == feature_external_pes:
  343:             self.__extparams = state
  344:         else:
  345:             raise SAXNotRecognizedException("Feature '%s' not recognized" % \
  346:                                             name)
  347: 
  348:     def getProperty(self, name):
  349:         if name == property_lexical_handler:
  350:             return self.__lex_handler
  351:         elif name == property_declaration_handler:
  352:             return self.__decl_handler
  353:         else:
  354:             raise SAXNotRecognizedException("Property '%s' not recognized" % \
  355:                                             name)
  356: 
  357:     def setProperty(self, name, value):     
  358:         if name == property_lexical_handler:
  359:             self.__lex_handler = value
  360:         elif name == property_declaration_handler:
  361:             # TODO: remove if/when libxml2 supports dtd events
  362:             raise SAXNotSupportedException("Property '%s' not supported" % \
  363:                                            name)
  364:             self.__decl_handler = value
  365:         else:
  366:             raise SAXNotRecognizedException("Property '%s' not recognized" % \
  367:                                             name)
  368: 
  369: def create_parser():
  370:     return LibXml2Reader()
  371: 

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>