Annotation of embedaddon/libxml2/python/drv_libxml2.py, revision 1.1.1.1.4.1
1.1 misho 1: # -*- coding: iso-8859-1 -*-
2: """ A SAX2 driver for libxml2, on top of its XmlReader API
3:
4: USAGE
5: # put this file (drv_libxml2.py) in PYTHONPATH
6: import xml.sax
7: reader = xml.sax.make_parser(["drv_libxml2"])
8: # ...and the rest is standard python sax.
9:
10: CAVEATS
11: - Lexical handlers are supported, except for start/endEntity
12: (waiting for XmlReader.ResolveEntity) and start/endDTD
13: - Error callbacks are not exactly synchronous, they tend
14: to be invoked before the corresponding content callback,
15: because the underlying reader interface parses
16: data by chunks of 512 bytes
17:
18: TODO
19: - search for TODO
20: - some ErrorHandler events (warning)
21: - some ContentHandler events (setDocumentLocator, skippedEntity)
22: - EntityResolver (using libxml2.?)
23: - DTDHandler (if/when libxml2 exposes such node types)
24: - DeclHandler (if/when libxml2 exposes such node types)
25: - property_xml_string?
26: - feature_string_interning?
27: - Incremental parser
28: - additional performance tuning:
29: - one might cache callbacks to avoid some name lookups
30: - one might implement a smarter way to pass attributes to startElement
31: (some kind of lazy evaluation?)
32: - there might be room for improvement in start/endPrefixMapping
33: - other?
34:
35: """
36:
__author__ = "Stéphane Bidoul <sbi@skynet.be>"
__version__ = "0.3"

import sys
import codecs

# Python 2/3 compatibility shims.
# Compare the numeric major version: comparing the sys.version *string*
# against "3" is lexicographic and is not a reliable version test.
if sys.version_info[0] < 3:
    # Python 2: convert the author byte string to a unicode object.
    __author__ = codecs.unicode_escape_decode(__author__)[0]

    from types import StringType, UnicodeType
    StringTypes = (StringType, UnicodeType)
else:
    # The trailing comma is required: "(str)" is just "str", and a
    # membership test against a bare type raises TypeError.
    StringTypes = (str,)
1.1 misho 51:
52: from xml.sax._exceptions import *
53: from xml.sax import xmlreader, saxutils
54: from xml.sax.handler import \
55: feature_namespaces, \
56: feature_namespace_prefixes, \
57: feature_string_interning, \
58: feature_validation, \
59: feature_external_ges, \
60: feature_external_pes, \
61: property_lexical_handler, \
62: property_declaration_handler, \
63: property_dom_node, \
64: property_xml_string
65:
# libxml2 returns strings as UTF8
_decoder = codecs.lookup("utf8")[1]

def _d(s):
    """Return *s* as text, decoding it from UTF-8 when it is a byte string.

    None is passed through unchanged so callers can forward optional
    values (e.g. a missing namespace URI) without checking first.
    Values that are not byte strings are also returned unchanged: the
    UTF-8 decoder only accepts bytes-like input on Python 3, so feeding
    it an already-decoded string would raise TypeError.
    NOTE(review): presumably the Python 3 bindings already return text --
    confirm against the libxml2 build in use.
    """
    if s is None or not isinstance(s, bytes):
        return s
    return _decoder(s)[0]
73:
# The driver is unusable without the libxml2 bindings; report that with the
# SAX "reader not available" exception rather than a bare ImportError.
try:
    import libxml2
except ImportError as e:
    raise SAXReaderNotAvailable("libxml2 not available: "
                                "import error was: %s" % e)
79:
class Locator(xmlreader.Locator):
    """SAX Locator adapter for libxml2.xmlTextReaderLocator"""

    def __init__(self, locator):
        # Wrapped locator object; only LineNumber() and BaseURI() are
        # called on it.
        self.__locator = locator

    def getColumnNumber(self):
        "Return the column number where the current event ends."
        # No column is read from the wrapped locator; always -1.
        return -1

    def getLineNumber(self):
        "Return the line number where the current event ends."
        return self.__locator.LineNumber()

    def getPublicId(self):
        "Return the public identifier for the current event."
        # No public identifier is read from the wrapped locator.
        return None

    def getSystemId(self):
        "Return the system identifier for the current event."
        return self.__locator.BaseURI()
101:
class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader that pulls nodes from a libxml2 text reader.

    parse() walks the document with reader.Read() and translates each
    node type into the matching ContentHandler / LexicalHandler call.
    Errors reported by libxml2 are collected by _errorHandler() and
    forwarded to the SAX ErrorHandler after each Read().
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator (None when nothing is pending)
        self.__errors = None

    def _errorHandler(self, arg, msg, severity, locator):
        """Callback registered with the reader: queue one error.

        arg is the user data given to SetErrorHandler (None here) and is
        ignored.  The error is stored as (severity, SAXParseException)
        until _reportErrors() is called.
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg, None,
                                                Locator(locator))))

    def _reportErrors(self, fatal):
        """Forward queued errors to the SAX ErrorHandler and clear the queue.

        Warnings go to warning(); everything else goes to error(), except
        that when *fatal* is true the last queued entry goes to
        fatalError().
        """
        # Detach the queue first: an error handler may raise, and the
        # pending entries must not be replayed by a later parse().
        errors, self.__errors = self.__errors, None
        for severity, exception in errors:
            if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                            libxml2.PARSER_SEVERITY_WARNING):
                self._err_handler.warning(exception)
            else:
                # when fatal is set, the parse will stop;
                # we consider that the last error reported
                # is the fatal one.
                if fatal and exception is errors[-1][1]:
                    self._err_handler.fatalError(exception)
                else:
                    self._err_handler.error(exception)

    def parse(self, source):
        """Parse *source* and emit SAX events.

        *source* is either a string (handed to libxml2 as a file name /
        URL) or anything accepted by saxutils.prepare_input_source().
        Raises SAXException on an unexpected node type; whatever the
        installed handlers raise propagates to the caller.
        """
        self.__parsing = 1
        self.__errors = None
        reader = None
        try:
            # prepare source and create reader
            # (isinstance works whether StringTypes is a type or a tuple)
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                input_buffer = libxml2.inputBuffer(source.getByteStream())
                reader = input_buffer.newTextReader(source.getSystemId())
            reader.SetErrorHandler(self._errorHandler, None)
            # configure reader
            if self.__extparams:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
                reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
                reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
                reader.SetParserProp(libxml2.PARSER_VALIDATE, self.__validate)
            else:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
            # we reuse attribute maps (for a slight performance gain)
            if self.__ns:
                attributesNSImpl = xmlreader.AttributesNSImpl({}, {})
            else:
                attributesImpl = xmlreader.AttributesImpl({})
            # prefixes to pop (for endPrefixMapping); one list per open element
            prefixes = []
            # start loop
            self._cont_handler.startDocument()
            while 1:
                r = reader.Read()
                # check for errors
                if r == 1:
                    if self.__errors is not None:
                        self._reportErrors(0)
                elif r == 0:
                    if self.__errors is not None:
                        self._reportErrors(0)
                    break  # end of parse
                else:
                    if self.__errors is not None:
                        self._reportErrors(1)
                    else:
                        self._err_handler.fatalError(
                            SAXException("Read failed (no details available)"))
                    break  # fatal parse error
                # get node type
                nodeType = reader.NodeType()
                # Element
                if nodeType == 1:
                    if self.__ns:
                        eltName = (_d(reader.NamespaceUri()),
                                   _d(reader.LocalName()))
                        eltQName = _d(reader.Name())
                        attributesNSImpl._attrs = attrs = {}
                        attributesNSImpl._qnames = qnames = {}
                        newPrefixes = []
                        while reader.MoveToNextAttribute():
                            qname = _d(reader.Name())
                            value = _d(reader.Value())
                            # Only "xmlns" and "xmlns:prefix" declare a
                            # namespace; a plain startswith("xmlns") would
                            # also catch attributes such as "xmlnsfoo".
                            if qname == "xmlns" or qname.startswith("xmlns:"):
                                if len(qname) > 5:
                                    newPrefix = qname[6:]
                                else:
                                    newPrefix = None
                                newPrefixes.append(newPrefix)
                                self._cont_handler.startPrefixMapping(
                                    newPrefix, value)
                                if not self.__nspfx:
                                    continue  # don't report xmlns attribute
                            attName = (_d(reader.NamespaceUri()),
                                       _d(reader.LocalName()))
                            qnames[attName] = qname
                            attrs[attName] = value
                        reader.MoveToElement()
                        self._cont_handler.startElementNS(
                            eltName, eltQName, attributesNSImpl)
                        if reader.IsEmptyElement():
                            # no EndElement node follows an empty element
                            self._cont_handler.endElementNS(eltName, eltQName)
                            for newPrefix in newPrefixes:
                                self._cont_handler.endPrefixMapping(newPrefix)
                        else:
                            prefixes.append(newPrefixes)
                    else:
                        eltName = _d(reader.Name())
                        attributesImpl._attrs = attrs = {}
                        while reader.MoveToNextAttribute():
                            attName = _d(reader.Name())
                            attrs[attName] = _d(reader.Value())
                        reader.MoveToElement()
                        self._cont_handler.startElement(
                            eltName, attributesImpl)
                        if reader.IsEmptyElement():
                            self._cont_handler.endElement(eltName)
                # EndElement
                elif nodeType == 15:
                    if self.__ns:
                        self._cont_handler.endElementNS(
                            (_d(reader.NamespaceUri()), _d(reader.LocalName())),
                            _d(reader.Name()))
                        for prefix in prefixes.pop():
                            self._cont_handler.endPrefixMapping(prefix)
                    else:
                        self._cont_handler.endElement(_d(reader.Name()))
                # Text
                elif nodeType == 3:
                    self._cont_handler.characters(_d(reader.Value()))
                # Whitespace
                elif nodeType == 13:
                    self._cont_handler.ignorableWhitespace(_d(reader.Value()))
                # SignificantWhitespace
                elif nodeType == 14:
                    self._cont_handler.characters(_d(reader.Value()))
                # CDATA
                elif nodeType == 4:
                    if self.__lex_handler is not None:
                        self.__lex_handler.startCDATA()
                    self._cont_handler.characters(_d(reader.Value()))
                    if self.__lex_handler is not None:
                        self.__lex_handler.endCDATA()
                # EntityReference
                elif nodeType == 5:
                    if self.__lex_handler is not None:
                        # entity events belong to the lexical handler;
                        # this reader has no startEntity method of its own
                        self.__lex_handler.startEntity(_d(reader.Name()))
                    # NOTE(review): the module docstring says the reader's
                    # ResolveEntity is still awaited -- confirm this call
                    # exists in the bindings in use.
                    reader.ResolveEntity()
                # EndEntity
                elif nodeType == 16:
                    if self.__lex_handler is not None:
                        self.__lex_handler.endEntity(_d(reader.Name()))
                # ProcessingInstruction
                elif nodeType == 7:
                    self._cont_handler.processingInstruction(
                        _d(reader.Name()), _d(reader.Value()))
                # Comment
                elif nodeType == 8:
                    if self.__lex_handler is not None:
                        self.__lex_handler.comment(_d(reader.Value()))
                # DocumentType
                elif nodeType == 10:
                    #if not self.__lex_handler is None:
                    #    self.__lex_handler.startDTD()
                    pass  # TODO (how to detect endDTD? on first non-dtd event?)
                # XmlDeclaration
                elif nodeType == 17:
                    pass  # TODO
                # Entity
                elif nodeType == 6:
                    pass  # TODO (entity decl)
                # Notation (decl)
                elif nodeType == 12:
                    pass  # TODO
                # Not handled on purpose:
                #   2  Attribute (never in this loop)
                #   9  Document (not exposed)
                #   11 DocumentFragment (never returned by XmlReader)
                #   0  None
                else:
                    raise SAXException("Unexpected node type %d" % nodeType)
            if r == 0:
                self._cont_handler.endDocument()
        finally:
            # Close the reader on every exit path, including exceptions
            # raised by handlers, and always clear the parsing flag.
            try:
                if reader is not None:
                    reader.Close()
            finally:
                self.__parsing = 0

    def setDTDHandler(self, handler):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the state of feature *name*.

        Raises SAXNotRecognizedException for an unknown feature.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1  # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def setFeature(self, name, state):
        """Set feature *name* to *state*.

        Raises SAXNotSupportedException while a parse is running or when
        external general entities are switched off, and
        SAXNotRecognizedException for an unknown feature.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s "
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException("Feature '%s' not supported" %
                                               name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException("Feature '%s' not recognized" %
                                            name)

    def getProperty(self, name):
        """Return the value of property *name*.

        Raises SAXNotRecognizedException for an unknown property.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)

    def setProperty(self, name, value):
        """Set property *name* to *value*.

        Only the lexical handler can be set; the declaration handler
        raises SAXNotSupportedException and any other name raises
        SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: store value in self.__decl_handler instead of raising
            # if/when libxml2 supports dtd events
            raise SAXNotSupportedException("Property '%s' not supported" %
                                           name)
        else:
            raise SAXNotRecognizedException("Property '%s' not recognized" %
                                            name)
376:
def create_parser():
    """Return a new LibXml2Reader instance (the driver's factory function)."""
    return LibXml2Reader()
379:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>