Annotation of embedaddon/libxml2/python/drv_libxml2.py, revision 1.1
1.1 ! misho 1: # -*- coding: iso-8859-1 -*-
! 2: """ A SAX2 driver for libxml2, on top of its XmlReader API
! 3:
! 4: USAGE
! 5: # put this file (drv_libxml2.py) in PYTHONPATH
! 6: import xml.sax
! 7: reader = xml.sax.make_parser(["drv_libxml2"])
! 8: # ...and the rest is standard python sax.
! 9:
! 10: CAVEATS
! 11: - Lexical handlers are supported, except for start/endEntity
! 12: (waiting for XmlReader.ResolveEntity) and start/endDTD
! 13: - Error callbacks are not exactly synchronous, they tend
! 14: to be invoked before the corresponding content callback,
! 15: because the underlying reader interface parses
! 16: data by chunks of 512 bytes
! 17:
! 18: TODO
! 19: - search for TODO
! 20: - some ErrorHandler events (warning)
! 21: - some ContentHandler events (setDocumentLocator, skippedEntity)
! 22: - EntityResolver (using libxml2.?)
! 23: - DTDHandler (if/when libxml2 exposes such node types)
! 24: - DeclHandler (if/when libxml2 exposes such node types)
! 25: - property_xml_string?
! 26: - feature_string_interning?
! 27: - Incremental parser
! 28: - additional performance tuning:
! 29: - one might cache callbacks to avoid some name lookups
! 30: - one might implement a smarter way to pass attributes to startElement
! 31: (some kind of lazy evaluation?)
! 32: - there might be room for improvement in start/endPrefixMapping
! 33: - other?
! 34:
! 35: """
! 36:
! 37: __author__ = u"Stéphane Bidoul <sbi@skynet.be>"
! 38: __version__ = "0.3"
! 39:
! 40: import codecs
! 41: from types import StringType, UnicodeType
! 42: StringTypes = (StringType,UnicodeType)
! 43:
! 44: from xml.sax._exceptions import *
! 45: from xml.sax import xmlreader, saxutils
! 46: from xml.sax.handler import \
! 47: feature_namespaces, \
! 48: feature_namespace_prefixes, \
! 49: feature_string_interning, \
! 50: feature_validation, \
! 51: feature_external_ges, \
! 52: feature_external_pes, \
! 53: property_lexical_handler, \
! 54: property_declaration_handler, \
! 55: property_dom_node, \
! 56: property_xml_string
! 57:
! 58: # libxml2 returns strings as UTF8
! 59: _decoder = codecs.lookup("utf8")[1]
! 60: def _d(s):
! 61: if s is None:
! 62: return s
! 63: else:
! 64: return _decoder(s)[0]
! 65:
! 66: try:
! 67: import libxml2
! 68: except ImportError, e:
! 69: raise SAXReaderNotAvailable("libxml2 not available: " \
! 70: "import error was: %s" % e)
! 71:
class Locator(xmlreader.Locator):
    """Expose a libxml2.xmlTextReaderLocator through the SAX Locator API.

    Line number and system identifier are read from the wrapped libxml2
    locator; column number and public identifier are fixed values.
    """

    def __init__(self, locator):
        # the libxml2 locator object all position queries are forwarded to
        self.__impl = locator

    def getColumnNumber(self):
        """Column where the current event ends; this adapter always gives -1."""
        return -1

    def getLineNumber(self):
        """Line where the current event ends, as reported by LineNumber()."""
        line = self.__impl.LineNumber()
        return line

    def getPublicId(self):
        """Public identifier for the current event; this adapter always gives None."""
        return None

    def getSystemId(self):
        """System identifier for the current event, taken from BaseURI()."""
        base_uri = self.__impl.BaseURI()
        return base_uri
! 93:
class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader implemented on top of libxml2's text reader.

    parse() pulls nodes one by one from a libxml2 text reader and turns
    them into ContentHandler / ErrorHandler / lexical-handler callbacks.

    Feature defaults (see __init__): namespaces, namespace prefixes and
    validation are off; external parameter entities are on.  DTDHandler,
    EntityResolver and the declaration-handler property are rejected with
    SAXNotSupportedException.
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag (non-zero while parse() is running)
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator: None, or a list of
        # (severity, SAXParseException) pairs awaiting _reportErrors()
        self.__errors = None

    def _errorHandler(self, arg, msg, severity, locator):
        """Queue one error reported by the libxml2 reader.

        parse() registers this method with reader.SetErrorHandler (passing
        None as the extra argument, received here as `arg` and unused).
        The message is wrapped in a SAXParseException together with a
        Locator adapter and stored until _reportErrors() is called.
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append(
            (severity, SAXParseException(msg, None, Locator(locator))))

    def _reportErrors(self, fatal):
        """Forward all queued errors to the SAX error handler.

        Warning severities go to warning(); everything else goes to
        error(), except that when `fatal` is true the last queued
        exception is reported through fatalError().
        """
        # Detach the queue *before* dispatching: the stock SAX error
        # handler raises from error()/fatalError(), and the queue must not
        # survive that exception, otherwise the stale entries would be
        # reported again by the next parse().
        errors, self.__errors = self.__errors, None
        if not errors:
            return
        warning_levels = (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                          libxml2.PARSER_SEVERITY_WARNING)
        # when fatal is set, the parse will stop; we consider that the
        # last error reported is the fatal one.
        last_exception = errors[-1][1]
        for severity, exception in errors:
            if severity in warning_levels:
                self._err_handler.warning(exception)
            elif fatal and exception is last_exception:
                self._err_handler.fatalError(exception)
            else:
                self._err_handler.error(exception)

    def parse(self, source):
        """Parse `source` and fire SAX events on the registered handlers.

        `source` is either a string (handed to libxml2 as a file name) or
        anything accepted by saxutils.prepare_input_source().

        Raises SAXException for a node type this driver does not know;
        whatever the handlers raise propagates to the caller.  The libxml2
        reader is closed and the parsing flag reset in every case.
        """
        self.__parsing = 1
        reader = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                input_buffer = libxml2.inputBuffer(source.getByteStream())
                reader = input_buffer.newTextReader(source.getSystemId())
            reader.SetErrorHandler(self._errorHandler, None)
            # configure reader
            if self.__extparams:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
                reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
                reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
                reader.SetParserProp(libxml2.PARSER_VALIDATE, self.__validate)
            else:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
            # we reuse attribute maps (for a slight performance gain)
            if self.__ns:
                attributesNSImpl = xmlreader.AttributesNSImpl({}, {})
            else:
                attributesImpl = xmlreader.AttributesImpl({})
            # prefixes to pop (for endPrefixMapping): one list of prefixes
            # per currently open element, in namespace mode only
            prefixes = []
            # start loop
            self._cont_handler.startDocument()
            while 1:
                r = reader.Read()
                # check for errors
                if r == 1:
                    if self.__errors is not None:
                        self._reportErrors(0)
                elif r == 0:
                    if self.__errors is not None:
                        self._reportErrors(0)
                    break  # end of parse
                else:
                    if self.__errors is not None:
                        self._reportErrors(1)
                    else:
                        self._err_handler.fatalError(
                            SAXException("Read failed (no details available)"))
                    break  # fatal parse error
                # get node type
                nodeType = reader.NodeType()
                # Element
                if nodeType == 1:
                    if self.__ns:
                        eltName = (_d(reader.NamespaceUri()),
                                   _d(reader.LocalName()))
                        eltQName = _d(reader.Name())
                        attributesNSImpl._attrs = attrs = {}
                        attributesNSImpl._qnames = qnames = {}
                        newPrefixes = []
                        while reader.MoveToNextAttribute():
                            qname = _d(reader.Name())
                            value = _d(reader.Value())
                            # Only "xmlns" and "xmlns:<prefix>" declare a
                            # namespace; an ordinary attribute whose name
                            # merely starts with "xmlns" must be reported
                            # as a regular attribute.
                            if qname == "xmlns" or qname.startswith("xmlns:"):
                                if len(qname) > 5:
                                    newPrefix = qname[6:]
                                else:
                                    newPrefix = None
                                newPrefixes.append(newPrefix)
                                self._cont_handler.startPrefixMapping(
                                    newPrefix, value)
                                if not self.__nspfx:
                                    continue  # don't report xmlns attribute
                            attName = (_d(reader.NamespaceUri()),
                                       _d(reader.LocalName()))
                            qnames[attName] = qname
                            attrs[attName] = value
                        reader.MoveToElement()
                        self._cont_handler.startElementNS(
                            eltName, eltQName, attributesNSImpl)
                        if reader.IsEmptyElement():
                            # no EndElement node follows an empty element,
                            # so close it (and its prefixes) right away
                            self._cont_handler.endElementNS(eltName, eltQName)
                            for newPrefix in newPrefixes:
                                self._cont_handler.endPrefixMapping(newPrefix)
                        else:
                            prefixes.append(newPrefixes)
                    else:
                        eltName = _d(reader.Name())
                        attributesImpl._attrs = attrs = {}
                        while reader.MoveToNextAttribute():
                            attName = _d(reader.Name())
                            attrs[attName] = _d(reader.Value())
                        reader.MoveToElement()
                        self._cont_handler.startElement(
                            eltName, attributesImpl)
                        if reader.IsEmptyElement():
                            self._cont_handler.endElement(eltName)
                # EndElement
                elif nodeType == 15:
                    if self.__ns:
                        self._cont_handler.endElementNS(
                            (_d(reader.NamespaceUri()),
                             _d(reader.LocalName())),
                            _d(reader.Name()))
                        for prefix in prefixes.pop():
                            self._cont_handler.endPrefixMapping(prefix)
                    else:
                        self._cont_handler.endElement(_d(reader.Name()))
                # Text
                elif nodeType == 3:
                    self._cont_handler.characters(_d(reader.Value()))
                # Whitespace
                elif nodeType == 13:
                    self._cont_handler.ignorableWhitespace(_d(reader.Value()))
                # SignificantWhitespace
                elif nodeType == 14:
                    self._cont_handler.characters(_d(reader.Value()))
                # CDATA
                elif nodeType == 4:
                    if self.__lex_handler is not None:
                        self.__lex_handler.startCDATA()
                    self._cont_handler.characters(_d(reader.Value()))
                    if self.__lex_handler is not None:
                        self.__lex_handler.endCDATA()
                # EntityReference
                elif nodeType == 5:
                    # entity events belong to the lexical handler; the
                    # reader itself has no startEntity/endEntity methods
                    if self.__lex_handler is not None:
                        self.__lex_handler.startEntity(_d(reader.Name()))
                    reader.ResolveEntity()
                # EndEntity
                elif nodeType == 16:
                    if self.__lex_handler is not None:
                        self.__lex_handler.endEntity(_d(reader.Name()))
                # ProcessingInstruction
                elif nodeType == 7:
                    self._cont_handler.processingInstruction(
                        _d(reader.Name()), _d(reader.Value()))
                # Comment
                elif nodeType == 8:
                    if self.__lex_handler is not None:
                        self.__lex_handler.comment(_d(reader.Value()))
                # DocumentType
                elif nodeType == 10:
                    # TODO: startDTD on the lexical handler
                    # (how to detect endDTD? on first non-dtd event?)
                    pass
                # XmlDeclaration
                elif nodeType == 17:
                    pass  # TODO
                # Entity
                elif nodeType == 6:
                    pass  # TODO (entity decl)
                # Notation (decl)
                elif nodeType == 12:
                    pass  # TODO
                # Not handled on purpose: Attribute (2, never in this
                # loop), Document (9, not exposed), DocumentFragment (11,
                # never returned by XmlReader) and None (0).
                else:
                    raise SAXException("Unexpected node type %d" % nodeType)
            if r == 0:
                self._cont_handler.endDocument()
        finally:
            # Close the native reader even when a handler raised, and make
            # sure the parsing flag is reset whatever Close() does.
            try:
                if reader is not None:
                    reader.Close()
            finally:
                self.__parsing = 0

    def setDTDHandler(self, handler):
        """Always raises SAXNotSupportedException (DTD events unavailable)."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Always raises SAXNotSupportedException (no resolver hook yet)."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the current state of SAX feature `name`.

        Raises SAXNotRecognizedException for an unknown feature name.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1  # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException(
                "Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        """Set SAX feature `name` to `state`.

        Raises SAXNotSupportedException while a parse is in progress or
        when asked to switch external general entities off, and
        SAXNotRecognizedException for an unknown feature name.
        """
        if self.__parsing:
            raise SAXNotSupportedException(
                "Cannot set feature %s while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException(
                    "Feature '%s' not supported" % name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException(
                "Feature '%s' not recognized" % name)

    def getProperty(self, name):
        """Return the value of SAX property `name`.

        Raises SAXNotRecognizedException for an unknown property name.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException(
                "Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        """Set SAX property `name`; only the lexical handler can be set.

        Raises SAXNotSupportedException for the declaration handler and
        SAXNotRecognizedException for an unknown property name.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: if/when libxml2 supports dtd events, drop this raise
            # and store the handler in self.__decl_handler instead
            raise SAXNotSupportedException(
                "Property '%s' not supported" % name)
        else:
            raise SAXNotRecognizedException(
                "Property '%s' not recognized" % name)
! 368:
def create_parser():
    """Build and return a fresh LibXml2Reader.

    This is the factory used when the module is named as a driver, e.g.
    xml.sax.make_parser(["drv_libxml2"]).
    """
    parser = LibXml2Reader()
    return parser
! 371:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>