1: # -*- coding: iso-8859-1 -*-
2: """ A SAX2 driver for libxml2, on top of its XmlReader API
3:
4: USAGE
5: # put this file (drv_libxml2.py) in PYTHONPATH
6: import xml.sax
7: reader = xml.sax.make_parser(["drv_libxml2"])
8: # ...and the rest is standard python sax.
9:
10: CAVEATS
11: - Lexical handlers are supported, except for start/endEntity
12: (waiting for XmlReader.ResolveEntity) and start/endDTD
13: - Error callbacks are not exactly synchronous, they tend
14: to be invoked before the corresponding content callback,
15: because the underlying reader interface parses
16: data by chunks of 512 bytes
17:
18: TODO
19: - search for TODO
20: - some ErrorHandler events (warning)
21: - some ContentHandler events (setDocumentLocator, skippedEntity)
22: - EntityResolver (using libxml2.?)
23: - DTDHandler (if/when libxml2 exposes such node types)
24: - DeclHandler (if/when libxml2 exposes such node types)
25: - property_xml_string?
26: - feature_string_interning?
27: - Incremental parser
28: - additional performance tuning:
29: - one might cache callbacks to avoid some name lookups
30: - one might implement a smarter way to pass attributes to startElement
31: (some kind of lazy evaluation?)
32: - there might be room for improvement in start/endPrefixMapping
33: - other?
34:
35: """
36:
37: __author__ = u"Stéphane Bidoul <sbi@skynet.be>"
38: __version__ = "0.3"
39:
40: import codecs
41: from types import StringType, UnicodeType
42: StringTypes = (StringType,UnicodeType)
43:
44: from xml.sax._exceptions import *
45: from xml.sax import xmlreader, saxutils
46: from xml.sax.handler import \
47: feature_namespaces, \
48: feature_namespace_prefixes, \
49: feature_string_interning, \
50: feature_validation, \
51: feature_external_ges, \
52: feature_external_pes, \
53: property_lexical_handler, \
54: property_declaration_handler, \
55: property_dom_node, \
56: property_xml_string
57:
# libxml2 returns strings as UTF-8 encoded byte strings
_decoder = codecs.lookup("utf8")[1]


def _d(s):
    """Return `s` as decoded text.

    None is passed through untouched so that "no value" results coming
    from the reader keep their meaning.  Byte strings are decoded as
    UTF-8.  Values that are already text are returned as they are,
    because pushing them through the UTF-8 decoder would fail.
    """
    if s is None:
        return None
    if isinstance(s, type(u"")):
        # already decoded: nothing to do
        return s
    return _decoder(s)[0]
65:
66: try:
67: import libxml2
68: except ImportError, e:
69: raise SAXReaderNotAvailable("libxml2 not available: " \
70: "import error was: %s" % e)
71:
class Locator(xmlreader.Locator):
    """Expose a libxml2.xmlTextReaderLocator through the SAX Locator API."""

    def __init__(self, locator):
        # wrapped libxml2 locator; it is only queried when a getter is called
        self.__locator = locator

    def getSystemId(self):
        "System identifier of the current event: the locator's base URI."
        wrapped = self.__locator
        return wrapped.BaseURI()

    def getPublicId(self):
        "Public identifier of the current event; this adapter always gives None."
        return None

    def getLineNumber(self):
        "Line number of the current event, as given by the wrapped locator."
        wrapped = self.__locator
        return wrapped.LineNumber()

    def getColumnNumber(self):
        "Column number of the current event; this adapter always gives -1."
        return -1
93:
class LibXml2Reader(xmlreader.XMLReader):
    """SAX2 XMLReader that pulls nodes from a libxml2 text reader.

    parse() creates a libxml2 text reader for the source, reads it node
    by node and translates every node into the matching ContentHandler
    (and, when one is set, lexical handler) callback.  Errors signalled
    by libxml2 are queued by _errorHandler() and forwarded to the SAX
    ErrorHandler after each Read() call by _reportErrors().
    """

    def __init__(self):
        xmlreader.XMLReader.__init__(self)
        # features
        self.__ns = 0
        self.__nspfx = 0
        self.__validate = 0
        self.__extparams = 1
        # parsing flag: true while parse() runs, setFeature() refuses then
        self.__parsing = 0
        # additional handlers
        self.__lex_handler = None
        self.__decl_handler = None
        # error messages accumulator: None, or a list of
        # (severity, SAXParseException) tuples
        self.__errors = None

    def _errorHandler(self, arg, msg, severity, locator):
        """libxml2 error callback: queue the error for _reportErrors().

        `arg` is the user data given to SetErrorHandler() (unused).
        """
        if self.__errors is None:
            self.__errors = []
        self.__errors.append((severity,
                              SAXParseException(msg, None,
                                                Locator(locator))))

    def _reportErrors(self, fatal):
        """Forward the queued libxml2 errors to the SAX ErrorHandler.

        Warnings go to warning().  Any other error goes to error(),
        except that when `fatal` is true the last queued error goes to
        fatalError(): the parse is about to stop and the last error
        reported is considered to be the fatal one.

        Does nothing when no error is queued.
        """
        # Detach the queue before dispatching: an ErrorHandler is allowed
        # to raise, and the errors must not be reported a second time by
        # a later call.
        errors = self.__errors
        self.__errors = None
        if errors is None:
            return
        for severity, exception in errors:
            if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
                            libxml2.PARSER_SEVERITY_WARNING):
                self._err_handler.warning(exception)
            elif fatal and exception is errors[-1][1]:
                self._err_handler.fatalError(exception)
            else:
                self._err_handler.error(exception)

    def parse(self, source):
        """Parse `source` and fire the SAX callbacks for its content.

        `source` is either a string, handed to
        libxml2.newTextReaderFilename(), or anything accepted by
        saxutils.prepare_input_source().

        endDocument() is only sent when the end of the input is reached
        without a fatal read error.  The libxml2 reader is closed and
        the parsing flag is cleared even when a handler raises.

        Raises SAXException when the reader yields a node type this
        driver does not know.
        """
        self.__parsing = 1
        # forget errors a previous, aborted parse may have left behind
        self.__errors = None
        reader = None
        try:
            # prepare source and create reader
            if isinstance(source, StringTypes):
                reader = libxml2.newTextReaderFilename(source)
            else:
                source = saxutils.prepare_input_source(source)
                inputBuffer = libxml2.inputBuffer(source.getByteStream())
                reader = inputBuffer.newTextReader(source.getSystemId())
            reader.SetErrorHandler(self._errorHandler, None)
            # configure reader
            if self.__extparams:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 1)
                reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS, 1)
                reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES, 1)
                reader.SetParserProp(libxml2.PARSER_VALIDATE,
                                     self.__validate)
            else:
                reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
            # we reuse attribute maps (for a slight performance gain)
            if self.__ns:
                attributesNSImpl = xmlreader.AttributesNSImpl({}, {})
            else:
                attributesImpl = xmlreader.AttributesImpl({})
            # prefixes to pop (for endPrefixMapping): one list of
            # prefixes per element that is currently open
            prefixes = []
            # start loop
            self._cont_handler.startDocument()
            while 1:
                r = reader.Read()
                # check for errors
                if r == 1:
                    if self.__errors is not None:
                        self._reportErrors(0)
                elif r == 0:
                    if self.__errors is not None:
                        self._reportErrors(0)
                    break  # end of parse
                else:
                    if self.__errors is not None:
                        self._reportErrors(1)
                    else:
                        self._err_handler.fatalError(
                            SAXException(
                                "Read failed (no details available)"))
                    break  # fatal parse error
                # get node type
                nodeType = reader.NodeType()
                # Element
                if nodeType == 1:
                    if self.__ns:
                        eltName = (_d(reader.NamespaceUri()),
                                   _d(reader.LocalName()))
                        eltQName = _d(reader.Name())
                        attributesNSImpl._attrs = attrs = {}
                        attributesNSImpl._qnames = qnames = {}
                        newPrefixes = []
                        while reader.MoveToNextAttribute():
                            qname = _d(reader.Name())
                            value = _d(reader.Value())
                            # only "xmlns" and "xmlns:prefix" declare a
                            # namespace; a name that merely starts with
                            # "xmlns" is an ordinary attribute
                            if qname == "xmlns" or \
                               qname.startswith("xmlns:"):
                                if len(qname) > 5:
                                    newPrefix = qname[6:]
                                else:
                                    newPrefix = None
                                newPrefixes.append(newPrefix)
                                self._cont_handler.startPrefixMapping(
                                    newPrefix, value)
                                if not self.__nspfx:
                                    continue  # don't report xmlns attribute
                            attName = (_d(reader.NamespaceUri()),
                                       _d(reader.LocalName()))
                            qnames[attName] = qname
                            attrs[attName] = value
                        reader.MoveToElement()
                        self._cont_handler.startElementNS(
                            eltName, eltQName, attributesNSImpl)
                        if reader.IsEmptyElement():
                            # no EndElement node will follow: close now
                            self._cont_handler.endElementNS(eltName,
                                                            eltQName)
                            for newPrefix in newPrefixes:
                                self._cont_handler.endPrefixMapping(
                                    newPrefix)
                        else:
                            prefixes.append(newPrefixes)
                    else:
                        eltName = _d(reader.Name())
                        attributesImpl._attrs = attrs = {}
                        while reader.MoveToNextAttribute():
                            attName = _d(reader.Name())
                            attrs[attName] = _d(reader.Value())
                        reader.MoveToElement()
                        self._cont_handler.startElement(
                            eltName, attributesImpl)
                        if reader.IsEmptyElement():
                            self._cont_handler.endElement(eltName)
                # EndElement
                elif nodeType == 15:
                    if self.__ns:
                        self._cont_handler.endElementNS(
                            (_d(reader.NamespaceUri()),
                             _d(reader.LocalName())),
                            _d(reader.Name()))
                        for prefix in prefixes.pop():
                            self._cont_handler.endPrefixMapping(prefix)
                    else:
                        self._cont_handler.endElement(_d(reader.Name()))
                # Text
                elif nodeType == 3:
                    self._cont_handler.characters(_d(reader.Value()))
                # Whitespace
                elif nodeType == 13:
                    self._cont_handler.ignorableWhitespace(
                        _d(reader.Value()))
                # SignificantWhitespace
                elif nodeType == 14:
                    self._cont_handler.characters(_d(reader.Value()))
                # CDATA
                elif nodeType == 4:
                    if self.__lex_handler is not None:
                        self.__lex_handler.startCDATA()
                    self._cont_handler.characters(_d(reader.Value()))
                    if self.__lex_handler is not None:
                        self.__lex_handler.endCDATA()
                # EntityReference
                elif nodeType == 5:
                    # entity events belong to the lexical handler; the
                    # reader itself has no startEntity/endEntity methods
                    if self.__lex_handler is not None:
                        self.__lex_handler.startEntity(_d(reader.Name()))
                    # NOTE(review): the module CAVEATS say the driver is
                    # still waiting for XmlReader.ResolveEntity; confirm
                    # that the libxml2 binding in use provides it.
                    reader.ResolveEntity()
                # EndEntity
                elif nodeType == 16:
                    if self.__lex_handler is not None:
                        self.__lex_handler.endEntity(_d(reader.Name()))
                # ProcessingInstruction
                elif nodeType == 7:
                    self._cont_handler.processingInstruction(
                        _d(reader.Name()), _d(reader.Value()))
                # Comment
                elif nodeType == 8:
                    if self.__lex_handler is not None:
                        self.__lex_handler.comment(_d(reader.Value()))
                # DocumentType
                elif nodeType == 10:
                    # TODO: startDTD (how to detect endDTD? on first
                    # non-dtd event?)
                    pass
                # XmlDeclaration
                elif nodeType == 17:
                    pass  # TODO
                # Entity
                elif nodeType == 6:
                    pass  # TODO (entity decl)
                # Notation (decl)
                elif nodeType == 12:
                    pass  # TODO
                # Not handled on purpose, they end up in the else below:
                #   2 Attribute (never in this loop)
                #   9 Document (not exposed)
                #  11 DocumentFragment (never returned by XmlReader)
                #   0 None
                else:
                    raise SAXException("Unexpected node type %d" % nodeType)
            if r == 0:
                self._cont_handler.endDocument()
        finally:
            self.__parsing = 0
            # release the libxml2 reader on every exit path, including
            # exceptions raised by handlers
            if reader is not None:
                reader.Close()

    def setDTDHandler(self, handler):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("DTDHandler not supported")

    def setEntityResolver(self, resolver):
        """Not supported: always raises SAXNotSupportedException."""
        # TODO (when supported, the inherited method works just fine)
        raise SAXNotSupportedException("EntityResolver not supported")

    def getFeature(self, name):
        """Return the current state of the feature `name`.

        Raises SAXNotRecognizedException for a feature this driver does
        not know.
        """
        if name == feature_namespaces:
            return self.__ns
        elif name == feature_namespace_prefixes:
            return self.__nspfx
        elif name == feature_validation:
            return self.__validate
        elif name == feature_external_ges:
            return 1  # TODO (does that relate to PARSER_LOADDTD)?
        elif name == feature_external_pes:
            return self.__extparams
        else:
            raise SAXNotRecognizedException(
                "Feature '%s' not recognized" % name)

    def setFeature(self, name, state):
        """Set the feature `name` to `state`.

        Raises SAXNotSupportedException while a parse is in progress, or
        when asked to turn external general entities off, and
        SAXNotRecognizedException for a feature this driver does not
        know.
        """
        if self.__parsing:
            raise SAXNotSupportedException("Cannot set feature %s "
                                           "while parsing" % name)
        if name == feature_namespaces:
            self.__ns = state
        elif name == feature_namespace_prefixes:
            self.__nspfx = state
        elif name == feature_validation:
            self.__validate = state
        elif name == feature_external_ges:
            if state == 0:
                # TODO (does that relate to PARSER_LOADDTD)?
                raise SAXNotSupportedException(
                    "Feature '%s' not supported" % name)
        elif name == feature_external_pes:
            self.__extparams = state
        else:
            raise SAXNotRecognizedException(
                "Feature '%s' not recognized" % name)

    def getProperty(self, name):
        """Return the value of the property `name`.

        Raises SAXNotRecognizedException for a property this driver does
        not know.
        """
        if name == property_lexical_handler:
            return self.__lex_handler
        elif name == property_declaration_handler:
            return self.__decl_handler
        else:
            raise SAXNotRecognizedException(
                "Property '%s' not recognized" % name)

    def setProperty(self, name, value):
        """Set the property `name` to `value`.

        Only the lexical handler can be set.  The declaration handler is
        refused with SAXNotSupportedException, and any other property
        raises SAXNotRecognizedException.
        """
        if name == property_lexical_handler:
            self.__lex_handler = value
        elif name == property_declaration_handler:
            # TODO: store the handler in self.__decl_handler instead
            # if/when libxml2 supports dtd events
            raise SAXNotSupportedException(
                "Property '%s' not supported" % name)
        else:
            raise SAXNotRecognizedException(
                "Property '%s' not recognized" % name)
368:
def create_parser():
    """Build and return a fresh LibXml2Reader.

    This is the factory xml.sax.make_parser() calls on a driver module.
    """
    parser = LibXml2Reader()
    return parser
371:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>