Return to HTMLparser.h CVS log | Up to [ELWIX - Embedded LightWeight unIX -] / embedaddon / libxml2 / include / libxml |
1.1 ! misho 1: /* ! 2: * Summary: interface for an HTML 4.0 non-verifying parser ! 3: * Description: this module implements an HTML 4.0 non-verifying parser ! 4: * with API compatible with the XML parser ones. It should ! 5: * be able to parse "real world" HTML, even if severely ! 6: * broken from a specification point of view. ! 7: * ! 8: * Copy: See Copyright for the status of this software. ! 9: * ! 10: * Author: Daniel Veillard ! 11: */ ! 12: ! 13: #ifndef __HTML_PARSER_H__ ! 14: #define __HTML_PARSER_H__ ! 15: #include <libxml/xmlversion.h> ! 16: #include <libxml/parser.h> ! 17: ! 18: #ifdef LIBXML_HTML_ENABLED ! 19: ! 20: #ifdef __cplusplus ! 21: extern "C" { ! 22: #endif ! 23: ! 24: /* ! 25: * Most of the back-end structures from XML and HTML are shared. ! 26: */ ! 27: typedef xmlParserCtxt htmlParserCtxt; ! 28: typedef xmlParserCtxtPtr htmlParserCtxtPtr; ! 29: typedef xmlParserNodeInfo htmlParserNodeInfo; ! 30: typedef xmlSAXHandler htmlSAXHandler; ! 31: typedef xmlSAXHandlerPtr htmlSAXHandlerPtr; ! 32: typedef xmlParserInput htmlParserInput; ! 33: typedef xmlParserInputPtr htmlParserInputPtr; ! 34: typedef xmlDocPtr htmlDocPtr; ! 35: typedef xmlNodePtr htmlNodePtr; ! 36: ! 37: /* ! 38: * Internal description of an HTML element, representing HTML 4.01 ! 39: * and XHTML 1.0 (which share the same structure). ! 40: */ ! 41: typedef struct _htmlElemDesc htmlElemDesc; ! 42: typedef htmlElemDesc *htmlElemDescPtr; ! 43: struct _htmlElemDesc { ! 44: const char *name; /* The tag name */ ! 45: char startTag; /* Whether the start tag can be implied */ ! 46: char endTag; /* Whether the end tag can be implied */ ! 47: char saveEndTag; /* Whether the end tag should be saved */ ! 48: char empty; /* Is this an empty element ? */ ! 49: char depr; /* Is this a deprecated element ? */ ! 50: char dtd; /* 1: only in Loose DTD, 2: only Frameset one */ ! 51: char isinline; /* is this a block 0 or inline 1 element */ ! 52: const char *desc; /* the description */ ! 53: ! 54: /* NRK Jan.2003 ! 55: * New fields encapsulating HTML structure ! 56: * ! 57: * Bugs: ! 58: * This is a very limited representation. It fails to tell us when ! 59: * an element *requires* subelements (we only have whether they're ! 60: * allowed or not), and it doesn't tell us where CDATA and PCDATA ! 61: * are allowed. Some element relationships are not fully represented: ! 62: * these are flagged with the word MODIFIER ! 63: */ ! 64: const char** subelts; /* allowed sub-elements of this element */ ! 65: const char* defaultsubelt; /* subelement for suggested auto-repair ! 66: if necessary or NULL */ ! 67: const char** attrs_opt; /* Optional Attributes */ ! 68: const char** attrs_depr; /* Additional deprecated attributes */ ! 69: const char** attrs_req; /* Required attributes */ ! 70: }; ! 71: ! 72: /* ! 73: * Internal description of an HTML entity. ! 74: */ ! 75: typedef struct _htmlEntityDesc htmlEntityDesc; ! 76: typedef htmlEntityDesc *htmlEntityDescPtr; ! 77: struct _htmlEntityDesc { ! 78: unsigned int value; /* the UNICODE value for the character */ ! 79: const char *name; /* The entity name */ ! 80: const char *desc; /* the description */ ! 81: }; ! 82: ! 83: /* ! 84: * There is only few public functions. ! 85: */ ! 86: XMLPUBFUN const htmlElemDesc * XMLCALL ! 87: htmlTagLookup (const xmlChar *tag); ! 88: XMLPUBFUN const htmlEntityDesc * XMLCALL ! 89: htmlEntityLookup(const xmlChar *name); ! 90: XMLPUBFUN const htmlEntityDesc * XMLCALL ! 91: htmlEntityValueLookup(unsigned int value); ! 92: ! 93: XMLPUBFUN int XMLCALL ! 94: htmlIsAutoClosed(htmlDocPtr doc, ! 95: htmlNodePtr elem); ! 96: XMLPUBFUN int XMLCALL ! 97: htmlAutoCloseTag(htmlDocPtr doc, ! 98: const xmlChar *name, ! 99: htmlNodePtr elem); ! 100: XMLPUBFUN const htmlEntityDesc * XMLCALL ! 101: htmlParseEntityRef(htmlParserCtxtPtr ctxt, ! 102: const xmlChar **str); ! 103: XMLPUBFUN int XMLCALL ! 104: htmlParseCharRef(htmlParserCtxtPtr ctxt); ! 105: XMLPUBFUN void XMLCALL ! 106: htmlParseElement(htmlParserCtxtPtr ctxt); ! 107: ! 108: XMLPUBFUN htmlParserCtxtPtr XMLCALL ! 109: htmlNewParserCtxt(void); ! 110: ! 111: XMLPUBFUN htmlParserCtxtPtr XMLCALL ! 112: htmlCreateMemoryParserCtxt(const char *buffer, ! 113: int size); ! 114: ! 115: XMLPUBFUN int XMLCALL ! 116: htmlParseDocument(htmlParserCtxtPtr ctxt); ! 117: XMLPUBFUN htmlDocPtr XMLCALL ! 118: htmlSAXParseDoc (xmlChar *cur, ! 119: const char *encoding, ! 120: htmlSAXHandlerPtr sax, ! 121: void *userData); ! 122: XMLPUBFUN htmlDocPtr XMLCALL ! 123: htmlParseDoc (xmlChar *cur, ! 124: const char *encoding); ! 125: XMLPUBFUN htmlDocPtr XMLCALL ! 126: htmlSAXParseFile(const char *filename, ! 127: const char *encoding, ! 128: htmlSAXHandlerPtr sax, ! 129: void *userData); ! 130: XMLPUBFUN htmlDocPtr XMLCALL ! 131: htmlParseFile (const char *filename, ! 132: const char *encoding); ! 133: XMLPUBFUN int XMLCALL ! 134: UTF8ToHtml (unsigned char *out, ! 135: int *outlen, ! 136: const unsigned char *in, ! 137: int *inlen); ! 138: XMLPUBFUN int XMLCALL ! 139: htmlEncodeEntities(unsigned char *out, ! 140: int *outlen, ! 141: const unsigned char *in, ! 142: int *inlen, int quoteChar); ! 143: XMLPUBFUN int XMLCALL ! 144: htmlIsScriptAttribute(const xmlChar *name); ! 145: XMLPUBFUN int XMLCALL ! 146: htmlHandleOmittedElem(int val); ! 147: ! 148: #ifdef LIBXML_PUSH_ENABLED ! 149: /** ! 150: * Interfaces for the Push mode. ! 151: */ ! 152: XMLPUBFUN htmlParserCtxtPtr XMLCALL ! 153: htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, ! 154: void *user_data, ! 155: const char *chunk, ! 156: int size, ! 157: const char *filename, ! 158: xmlCharEncoding enc); ! 159: XMLPUBFUN int XMLCALL ! 160: htmlParseChunk (htmlParserCtxtPtr ctxt, ! 161: const char *chunk, ! 162: int size, ! 163: int terminate); ! 164: #endif /* LIBXML_PUSH_ENABLED */ ! 165: ! 166: XMLPUBFUN void XMLCALL ! 167: htmlFreeParserCtxt (htmlParserCtxtPtr ctxt); ! 168: ! 169: /* ! 170: * New set of simpler/more flexible APIs ! 171: */ ! 172: /** ! 173: * xmlParserOption: ! 174: * ! 175: * This is the set of XML parser options that can be passed down ! 176: * to the xmlReadDoc() and similar calls. ! 177: */ ! 178: typedef enum { ! 179: HTML_PARSE_RECOVER = 1<<0, /* Relaxed parsing */ ! 180: HTML_PARSE_NODEFDTD = 1<<2, /* do not default a doctype if not found */ ! 181: HTML_PARSE_NOERROR = 1<<5, /* suppress error reports */ ! 182: HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */ ! 183: HTML_PARSE_PEDANTIC = 1<<7, /* pedantic error reporting */ ! 184: HTML_PARSE_NOBLANKS = 1<<8, /* remove blank nodes */ ! 185: HTML_PARSE_NONET = 1<<11,/* Forbid network access */ ! 186: HTML_PARSE_NOIMPLIED= 1<<13,/* Do not add implied html/body... elements */ ! 187: HTML_PARSE_COMPACT = 1<<16 /* compact small text nodes */ ! 188: } htmlParserOption; ! 189: ! 190: XMLPUBFUN void XMLCALL ! 191: htmlCtxtReset (htmlParserCtxtPtr ctxt); ! 192: XMLPUBFUN int XMLCALL ! 193: htmlCtxtUseOptions (htmlParserCtxtPtr ctxt, ! 194: int options); ! 195: XMLPUBFUN htmlDocPtr XMLCALL ! 196: htmlReadDoc (const xmlChar *cur, ! 197: const char *URL, ! 198: const char *encoding, ! 199: int options); ! 200: XMLPUBFUN htmlDocPtr XMLCALL ! 201: htmlReadFile (const char *URL, ! 202: const char *encoding, ! 203: int options); ! 204: XMLPUBFUN htmlDocPtr XMLCALL ! 205: htmlReadMemory (const char *buffer, ! 206: int size, ! 207: const char *URL, ! 208: const char *encoding, ! 209: int options); ! 210: XMLPUBFUN htmlDocPtr XMLCALL ! 211: htmlReadFd (int fd, ! 212: const char *URL, ! 213: const char *encoding, ! 214: int options); ! 215: XMLPUBFUN htmlDocPtr XMLCALL ! 216: htmlReadIO (xmlInputReadCallback ioread, ! 217: xmlInputCloseCallback ioclose, ! 218: void *ioctx, ! 219: const char *URL, ! 220: const char *encoding, ! 221: int options); ! 222: XMLPUBFUN htmlDocPtr XMLCALL ! 223: htmlCtxtReadDoc (xmlParserCtxtPtr ctxt, ! 224: const xmlChar *cur, ! 225: const char *URL, ! 226: const char *encoding, ! 227: int options); ! 228: XMLPUBFUN htmlDocPtr XMLCALL ! 229: htmlCtxtReadFile (xmlParserCtxtPtr ctxt, ! 230: const char *filename, ! 231: const char *encoding, ! 232: int options); ! 233: XMLPUBFUN htmlDocPtr XMLCALL ! 234: htmlCtxtReadMemory (xmlParserCtxtPtr ctxt, ! 235: const char *buffer, ! 236: int size, ! 237: const char *URL, ! 238: const char *encoding, ! 239: int options); ! 240: XMLPUBFUN htmlDocPtr XMLCALL ! 241: htmlCtxtReadFd (xmlParserCtxtPtr ctxt, ! 242: int fd, ! 243: const char *URL, ! 244: const char *encoding, ! 245: int options); ! 246: XMLPUBFUN htmlDocPtr XMLCALL ! 247: htmlCtxtReadIO (xmlParserCtxtPtr ctxt, ! 248: xmlInputReadCallback ioread, ! 249: xmlInputCloseCallback ioclose, ! 250: void *ioctx, ! 251: const char *URL, ! 252: const char *encoding, ! 253: int options); ! 254: ! 255: /* NRK/Jan2003: further knowledge of HTML structure ! 256: */ ! 257: typedef enum { ! 258: HTML_NA = 0 , /* something we don't check at all */ ! 259: HTML_INVALID = 0x1 , ! 260: HTML_DEPRECATED = 0x2 , ! 261: HTML_VALID = 0x4 , ! 262: HTML_REQUIRED = 0xc /* VALID bit set so ( & HTML_VALID ) is TRUE */ ! 263: } htmlStatus ; ! 264: ! 265: /* Using htmlElemDesc rather than name here, to emphasise the fact ! 266: that otherwise there's a lookup overhead ! 267: */ ! 268: XMLPUBFUN htmlStatus XMLCALL htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ; ! 269: XMLPUBFUN int XMLCALL htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ; ! 270: XMLPUBFUN htmlStatus XMLCALL htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ; ! 271: XMLPUBFUN htmlStatus XMLCALL htmlNodeStatus(const htmlNodePtr, int) ; ! 272: /** ! 273: * htmlDefaultSubelement: ! 274: * @elt: HTML element ! 275: * ! 276: * Returns the default subelement for this element ! 277: */ ! 278: #define htmlDefaultSubelement(elt) elt->defaultsubelt ! 279: /** ! 280: * htmlElementAllowedHereDesc: ! 281: * @parent: HTML parent element ! 282: * @elt: HTML element ! 283: * ! 284: * Checks whether an HTML element description may be a ! 285: * direct child of the specified element. ! 286: * ! 287: * Returns 1 if allowed; 0 otherwise. ! 288: */ ! 289: #define htmlElementAllowedHereDesc(parent,elt) \ ! 290: htmlElementAllowedHere((parent), (elt)->name) ! 291: /** ! 292: * htmlRequiredAttrs: ! 293: * @elt: HTML element ! 294: * ! 295: * Returns the attributes required for the specified element. ! 296: */ ! 297: #define htmlRequiredAttrs(elt) (elt)->attrs_req ! 298: ! 299: ! 300: #ifdef __cplusplus ! 301: } ! 302: #endif ! 303: ! 304: #endif /* LIBXML_HTML_ENABLED */ ! 305: #endif /* __HTML_PARSER_H__ */