Annotation of embedaddon/libxml2/include/libxml/HTMLparser.h, revision 1.1.1.1
1.1 misho 1: /*
2: * Summary: interface for an HTML 4.0 non-verifying parser
3: * Description: this module implements an HTML 4.0 non-verifying parser
4: * with an API compatible with that of the XML parser. It should
5: * be able to parse "real world" HTML, even if it is severely
6: * broken from a specification point of view.
7: *
8: * Copy: See Copyright for the status of this software.
9: *
10: * Author: Daniel Veillard
11: */
12:
13: #ifndef __HTML_PARSER_H__
14: #define __HTML_PARSER_H__
15: #include <libxml/xmlversion.h>
16: #include <libxml/parser.h>
17:
18: #ifdef LIBXML_HTML_ENABLED
19:
20: #ifdef __cplusplus
21: extern "C" {
22: #endif
23:
24: /*
25: * Most of the back-end structures are shared between XML and HTML:
26: * every html* name below is a typedef alias of the xml* type beside it. */
27: typedef xmlParserCtxt htmlParserCtxt;
28: typedef xmlParserCtxtPtr htmlParserCtxtPtr;
29: typedef xmlParserNodeInfo htmlParserNodeInfo;
30: typedef xmlSAXHandler htmlSAXHandler;
31: typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;
32: typedef xmlParserInput htmlParserInput;
33: typedef xmlParserInputPtr htmlParserInputPtr;
34: typedef xmlDocPtr htmlDocPtr; /* document handle used by the html* parsing calls */
35: typedef xmlNodePtr htmlNodePtr; /* node handle used by the auto-close and status calls */
36:
37: /*
38: * Internal description of an HTML element, representing HTML 4.01
39: * and XHTML 1.0 (which share the same structure). htmlElemDescPtr is
40: * the plain (non-const) pointer form of htmlElemDesc. */
41: typedef struct _htmlElemDesc htmlElemDesc;
42: typedef htmlElemDesc *htmlElemDescPtr;
43: struct _htmlElemDesc {
44: const char *name; /* The tag name */
45: char startTag; /* Whether the start tag can be implied */
46: char endTag; /* Whether the end tag can be implied */
47: char saveEndTag; /* Whether the end tag should be saved */
48: char empty; /* Whether this is an empty element */
49: char depr; /* Whether this is a deprecated element */
50: char dtd; /* 1: only in the Loose DTD, 2: only in the Frameset DTD (other values not described here) */
51: char isinline; /* 0: block element, 1: inline element */
52: const char *desc; /* Description of the element */
53:
54: /* NRK Jan.2003
55: * New fields encapsulating HTML structure
56: *
57: * Bugs:
58: * This is a very limited representation. It fails to tell us when
59: * an element *requires* subelements (we only have whether they're
60: * allowed or not), and it doesn't tell us where CDATA and PCDATA
61: * are allowed. Some element relationships are not fully represented:
62: * these are flagged with the word MODIFIER
63: */
64: const char** subelts; /* Allowed sub-elements of this element (array of names; presumably NULL-terminated - confirm) */
65: const char* defaultsubelt; /* Sub-element suggested for auto-repair
66: when one is necessary, or NULL */
67: const char** attrs_opt; /* Optional attributes */
68: const char** attrs_depr; /* Additional deprecated attributes */
69: const char** attrs_req; /* Required attributes (read by the htmlRequiredAttrs() macro) */
70: };
71:
72: /*
73: * Internal description of an HTML entity. htmlEntityLookup() and
74: * htmlEntityValueLookup() return const pointers to this structure. */
75: typedef struct _htmlEntityDesc htmlEntityDesc;
76: typedef htmlEntityDesc *htmlEntityDescPtr;
77: struct _htmlEntityDesc {
78: unsigned int value; /* The Unicode value of the character */
79: const char *name; /* The entity name */
80: const char *desc; /* Description of the entity */
81: };
82:
83: /*
84: * There are only a few public functions. The three lookups that come
85: * first return pointers to const descriptor structures. */
86: XMLPUBFUN const htmlElemDesc * XMLCALL
87: htmlTagLookup (const xmlChar *tag);
88: XMLPUBFUN const htmlEntityDesc * XMLCALL
89: htmlEntityLookup(const xmlChar *name);
90: XMLPUBFUN const htmlEntityDesc * XMLCALL
91: htmlEntityValueLookup(unsigned int value);
92: /* The auto-close calls take a document and a node; the htmlParse*() calls take a parser context. */
93: XMLPUBFUN int XMLCALL
94: htmlIsAutoClosed(htmlDocPtr doc,
95: htmlNodePtr elem);
96: XMLPUBFUN int XMLCALL
97: htmlAutoCloseTag(htmlDocPtr doc,
98: const xmlChar *name,
99: htmlNodePtr elem);
100: XMLPUBFUN const htmlEntityDesc * XMLCALL
101: htmlParseEntityRef(htmlParserCtxtPtr ctxt,
102: const xmlChar **str); /* str is passed by address - presumably an output; confirm in HTMLparser.c */
103: XMLPUBFUN int XMLCALL
104: htmlParseCharRef(htmlParserCtxtPtr ctxt);
105: XMLPUBFUN void XMLCALL
106: htmlParseElement(htmlParserCtxtPtr ctxt);
107: /* Parser context constructors: one without arguments, one from a buffer/size pair. */
108: XMLPUBFUN htmlParserCtxtPtr XMLCALL
109: htmlNewParserCtxt(void);
110:
111: XMLPUBFUN htmlParserCtxtPtr XMLCALL
112: htmlCreateMemoryParserCtxt(const char *buffer,
113: int size);
114: /* htmlParseDocument() works on an existing context and returns int; the *Doc/*File entry points below return an htmlDocPtr. */
115: XMLPUBFUN int XMLCALL
116: htmlParseDocument(htmlParserCtxtPtr ctxt);
117: XMLPUBFUN htmlDocPtr XMLCALL
118: htmlSAXParseDoc (xmlChar *cur, /* not const-qualified, unlike the cur argument of htmlReadDoc() */
119: const char *encoding,
120: htmlSAXHandlerPtr sax,
121: void *userData);
122: XMLPUBFUN htmlDocPtr XMLCALL
123: htmlParseDoc (xmlChar *cur, /* not const-qualified, unlike the cur argument of htmlReadDoc() */
124: const char *encoding);
125: XMLPUBFUN htmlDocPtr XMLCALL
126: htmlSAXParseFile(const char *filename,
127: const char *encoding,
128: htmlSAXHandlerPtr sax,
129: void *userData);
130: XMLPUBFUN htmlDocPtr XMLCALL
131: htmlParseFile (const char *filename,
132: const char *encoding);
133: XMLPUBFUN int XMLCALL
134: UTF8ToHtml (unsigned char *out,
135: int *outlen,
136: const unsigned char *in,
137: int *inlen); /* outlen/inlen are pointers - presumably in/out byte counts; confirm in HTMLparser.c */
138: XMLPUBFUN int XMLCALL
139: htmlEncodeEntities(unsigned char *out,
140: int *outlen,
141: const unsigned char *in,
142: int *inlen, int quoteChar); /* same buffer arguments as UTF8ToHtml() plus quoteChar */
143: XMLPUBFUN int XMLCALL
144: htmlIsScriptAttribute(const xmlChar *name);
145: XMLPUBFUN int XMLCALL
146: htmlHandleOmittedElem(int val);
147:
148: #ifdef LIBXML_PUSH_ENABLED
149: /**
150: * Interfaces for the push mode; declared only when LIBXML_PUSH_ENABLED
151: * is defined. htmlParseChunk() takes the context that htmlCreatePushParserCtxt() returns. */
152: XMLPUBFUN htmlParserCtxtPtr XMLCALL
153: htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,
154: void *user_data,
155: const char *chunk,
156: int size,
157: const char *filename,
158: xmlCharEncoding enc);
159: XMLPUBFUN int XMLCALL
160: htmlParseChunk (htmlParserCtxtPtr ctxt,
161: const char *chunk,
162: int size,
163: int terminate); /* presumably nonzero on the last chunk - confirm in HTMLparser.c */
164: #endif /* LIBXML_PUSH_ENABLED */
165: /* Takes an htmlParserCtxtPtr such as the htmlNewParserCtxt()/htmlCreate*ParserCtxt() calls return; presumably releases it - confirm in HTMLparser.c. */
166: XMLPUBFUN void XMLCALL
167: htmlFreeParserCtxt (htmlParserCtxtPtr ctxt);
168:
169: /*
170: * New set of simpler/more flexible APIs
171: */
172: /**
173: * htmlParserOption:
174: *
175: * This is the set of HTML parser options that can be passed down
176: * to the htmlReadDoc() and similar calls. Each option is a distinct bit.
177: */
178: typedef enum {
179: HTML_PARSE_RECOVER = 1<<0, /* Relaxed parsing */
180: HTML_PARSE_NODEFDTD = 1<<2, /* Do not default a doctype if not found */
181: HTML_PARSE_NOERROR = 1<<5, /* Suppress error reports */
182: HTML_PARSE_NOWARNING= 1<<6, /* Suppress warning reports */
183: HTML_PARSE_PEDANTIC = 1<<7, /* Pedantic error reporting */
184: HTML_PARSE_NOBLANKS = 1<<8, /* Remove blank nodes */
185: HTML_PARSE_NONET = 1<<11,/* Forbid network access */
186: HTML_PARSE_NOIMPLIED= 1<<13,/* Do not add implied html/body... elements */
187: HTML_PARSE_COMPACT = 1<<16 /* Compact small text nodes */
188: } htmlParserOption;
189: /* Context reset and option setting, then the htmlRead*() readers, which take no parser context and return an htmlDocPtr. */
190: XMLPUBFUN void XMLCALL
191: htmlCtxtReset (htmlParserCtxtPtr ctxt);
192: XMLPUBFUN int XMLCALL
193: htmlCtxtUseOptions (htmlParserCtxtPtr ctxt,
194: int options); /* an int, like the options argument of the readers below; see htmlParserOption */
195: XMLPUBFUN htmlDocPtr XMLCALL
196: htmlReadDoc (const xmlChar *cur,
197: const char *URL,
198: const char *encoding,
199: int options);
200: XMLPUBFUN htmlDocPtr XMLCALL
201: htmlReadFile (const char *URL,
202: const char *encoding,
203: int options);
204: XMLPUBFUN htmlDocPtr XMLCALL
205: htmlReadMemory (const char *buffer,
206: int size,
207: const char *URL,
208: const char *encoding,
209: int options);
210: XMLPUBFUN htmlDocPtr XMLCALL
211: htmlReadFd (int fd,
212: const char *URL,
213: const char *encoding,
214: int options);
215: XMLPUBFUN htmlDocPtr XMLCALL
216: htmlReadIO (xmlInputReadCallback ioread,
217: xmlInputCloseCallback ioclose,
218: void *ioctx, /* presumably handed to ioread/ioclose - confirm in HTMLparser.c */
219: const char *URL,
220: const char *encoding,
221: int options);
222: XMLPUBFUN htmlDocPtr XMLCALL
223: htmlCtxtReadDoc (xmlParserCtxtPtr ctxt, /* xmlParserCtxtPtr is the same type as htmlParserCtxtPtr (typedef alias) */
224: const xmlChar *cur,
225: const char *URL,
226: const char *encoding,
227: int options);
228: XMLPUBFUN htmlDocPtr XMLCALL
229: htmlCtxtReadFile (xmlParserCtxtPtr ctxt,
230: const char *filename, /* named URL in htmlReadFile() */
231: const char *encoding,
232: int options);
233: XMLPUBFUN htmlDocPtr XMLCALL
234: htmlCtxtReadMemory (xmlParserCtxtPtr ctxt,
235: const char *buffer,
236: int size,
237: const char *URL,
238: const char *encoding,
239: int options);
240: XMLPUBFUN htmlDocPtr XMLCALL
241: htmlCtxtReadFd (xmlParserCtxtPtr ctxt,
242: int fd,
243: const char *URL,
244: const char *encoding,
245: int options);
246: XMLPUBFUN htmlDocPtr XMLCALL
247: htmlCtxtReadIO (xmlParserCtxtPtr ctxt, /* as for the other htmlCtxtRead*(): htmlRead*() argument types with ctxt prepended */
248: xmlInputReadCallback ioread,
249: xmlInputCloseCallback ioclose,
250: void *ioctx,
251: const char *URL,
252: const char *encoding,
253: int options);
254:
255: /* NRK/Jan2003: further knowledge of HTML structure.
256: * htmlStatus is the return type of htmlAttrAllowed(), htmlElementStatusHere() and htmlNodeStatus(). */
257: typedef enum {
258: HTML_NA = 0 , /* something we don't check at all */
259: HTML_INVALID = 0x1 ,
260: HTML_DEPRECATED = 0x2 ,
261: HTML_VALID = 0x4 , /* the bit tested by ( status & HTML_VALID ) */
262: HTML_REQUIRED = 0xc /* 0x8 | HTML_VALID: VALID bit set so ( & HTML_VALID ) is TRUE */
263: } htmlStatus ;
264:
264: /* Using htmlElemDesc rather than name here, to emphasise the fact
265: that otherwise there's a lookup overhead. Parameters are named as in
266: the libxml2 API reference; legacy presumably selects whether deprecated items are tolerated - confirm there. */
267: XMLPUBFUN htmlStatus XMLCALL htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) ;
268: XMLPUBFUN int XMLCALL htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) ;
269: XMLPUBFUN htmlStatus XMLCALL htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) ;
270: XMLPUBFUN htmlStatus XMLCALL htmlNodeStatus(const htmlNodePtr node, int legacy) ;
272: /**
273: * htmlDefaultSubelement:
274: * @elt: pointer to an htmlElemDesc; any pointer expression is accepted
275: *
276: * Returns the defaultsubelt field of @elt: the default subelement, or NULL.
277: */
278: #define htmlDefaultSubelement(elt) ((elt)->defaultsubelt)
279: /**
280: * htmlElementAllowedHereDesc:
281: * @parent: description (htmlElemDesc pointer) of the HTML parent element
282: * @elt: description (htmlElemDesc pointer) of the HTML element; only its name is used
283: *
284: * Checks whether an HTML element description may be a
285: * direct child of the specified element, by calling htmlElementAllowedHere() with @elt's name.
286: *
287: * Returns 1 if allowed; 0 otherwise.
288: */
289: #define htmlElementAllowedHereDesc(parent,elt) \
290: htmlElementAllowedHere((parent), (elt)->name)
291: /**
292: * htmlRequiredAttrs:
293: * @elt: pointer to an htmlElemDesc
294: *
295: * Returns the attrs_req field of @elt: the attributes required for the specified element.
296: */
297: #define htmlRequiredAttrs(elt) (elt)->attrs_req
298:
299:
300: #ifdef __cplusplus
301: }
302: #endif
303:
304: #endif /* LIBXML_HTML_ENABLED */
305: #endif /* __HTML_PARSER_H__ */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>