embedaddon/libxml2/HTMLparser.c - view

File: [ELWIX - Embedded LightWeight unIX -] / embedaddon / libxml2 / HTMLparser.c
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Tue Feb 21 23:37:58 2012 UTC (12 years, 4 months ago) by misho
Branches: libxml2, MAIN
CVS tags: v2_7_8, HEAD

libxml2

1: /* 2: * HTMLparser.c : an HTML 4.0 non-verifying parser 3: * 4: * See Copyright for the status of this software. 5: * 6: * daniel@veillard.com 7: */ 8: 9: #define IN_LIBXML 10: #include "libxml.h" 11: #ifdef LIBXML_HTML_ENABLED 12: 13: #include <string.h> 14: #ifdef HAVE_CTYPE_H 15: #include <ctype.h> 16: #endif 17: #ifdef HAVE_STDLIB_H 18: #include <stdlib.h> 19: #endif 20: #ifdef HAVE_SYS_STAT_H 21: #include <sys/stat.h> 22: #endif 23: #ifdef HAVE_FCNTL_H 24: #include <fcntl.h> 25: #endif 26: #ifdef HAVE_UNISTD_H 27: #include <unistd.h> 28: #endif 29: #ifdef HAVE_ZLIB_H 30: #include <zlib.h> 31: #endif 32: 33: #include <libxml/xmlmemory.h> 34: #include <libxml/tree.h> 35: #include <libxml/parser.h> 36: #include <libxml/parserInternals.h> 37: #include <libxml/xmlerror.h> 38: #include <libxml/HTMLparser.h> 39: #include <libxml/HTMLtree.h> 40: #include <libxml/entities.h> 41: #include <libxml/encoding.h> 42: #include <libxml/valid.h> 43: #include <libxml/xmlIO.h> 44: #include <libxml/globals.h> 45: #include <libxml/uri.h> 46: 47: #define HTML_MAX_NAMELEN 1000 48: #define HTML_PARSER_BIG_BUFFER_SIZE 1000 49: #define HTML_PARSER_BUFFER_SIZE 100 50: 51: /* #define DEBUG */ 52: /* #define DEBUG_PUSH */ 53: 54: static int htmlOmittedDefaultValue = 1; 55: 56: xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len, 57: xmlChar end, xmlChar end2, xmlChar end3); 58: static void htmlParseComment(htmlParserCtxtPtr ctxt); 59: 60: /************************************************************************ 61: * * 62: * Some factorized error routines * 63: * * 64: ************************************************************************/ 65: 66: /** 67: * htmlErrMemory: 68: * @ctxt: an HTML parser context 69: * @extra: extra informations 70: * 71: * Handle a redefinition of attribute error 72: */ 73: static void 74: htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra) 75: { 76: if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 77: (ctxt->instate == XML_PARSER_EOF)) 78: 
return; 79: if (ctxt != NULL) { 80: ctxt->errNo = XML_ERR_NO_MEMORY; 81: ctxt->instate = XML_PARSER_EOF; 82: ctxt->disableSAX = 1; 83: } 84: if (extra) 85: __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, 86: XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, 87: NULL, NULL, 0, 0, 88: "Memory allocation failed : %s\n", extra); 89: else 90: __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, 91: XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL, 92: NULL, NULL, 0, 0, "Memory allocation failed\n"); 93: } 94: 95: /** 96: * htmlParseErr: 97: * @ctxt: an HTML parser context 98: * @error: the error number 99: * @msg: the error message 100: * @str1: string infor 101: * @str2: string infor 102: * 103: * Handle a fatal parser error, i.e. violating Well-Formedness constraints 104: */ 105: static void 106: htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error, 107: const char *msg, const xmlChar *str1, const xmlChar *str2) 108: { 109: if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 110: (ctxt->instate == XML_PARSER_EOF)) 111: return; 112: if (ctxt != NULL) 113: ctxt->errNo = error; 114: __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, 115: XML_ERR_ERROR, NULL, 0, 116: (const char *) str1, (const char *) str2, 117: NULL, 0, 0, 118: msg, str1, str2); 119: if (ctxt != NULL) 120: ctxt->wellFormed = 0; 121: } 122: 123: /** 124: * htmlParseErrInt: 125: * @ctxt: an HTML parser context 126: * @error: the error number 127: * @msg: the error message 128: * @val: integer info 129: * 130: * Handle a fatal parser error, i.e. 
violating Well-Formedness constraints 131: */ 132: static void 133: htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error, 134: const char *msg, int val) 135: { 136: if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 137: (ctxt->instate == XML_PARSER_EOF)) 138: return; 139: if (ctxt != NULL) 140: ctxt->errNo = error; 141: __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, 142: XML_ERR_ERROR, NULL, 0, NULL, NULL, 143: NULL, val, 0, msg, val); 144: if (ctxt != NULL) 145: ctxt->wellFormed = 0; 146: } 147: 148: /************************************************************************ 149: * * 150: * Parser stacks related functions and macros * 151: * * 152: ************************************************************************/ 153: 154: /** 155: * htmlnamePush: 156: * @ctxt: an HTML parser context 157: * @value: the element name 158: * 159: * Pushes a new element name on top of the name stack 160: * 161: * Returns 0 in case of error, the index in the stack otherwise 162: */ 163: static int 164: htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value) 165: { 166: if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head"))) 167: ctxt->html = 3; 168: if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body"))) 169: ctxt->html = 10; 170: if (ctxt->nameNr >= ctxt->nameMax) { 171: ctxt->nameMax *= 2; 172: ctxt->nameTab = (const xmlChar * *) 173: xmlRealloc((xmlChar * *)ctxt->nameTab, 174: ctxt->nameMax * 175: sizeof(ctxt->nameTab[0])); 176: if (ctxt->nameTab == NULL) { 177: htmlErrMemory(ctxt, NULL); 178: return (0); 179: } 180: } 181: ctxt->nameTab[ctxt->nameNr] = value; 182: ctxt->name = value; 183: return (ctxt->nameNr++); 184: } 185: /** 186: * htmlnamePop: 187: * @ctxt: an HTML parser context 188: * 189: * Pops the top element name from the name stack 190: * 191: * Returns the name just removed 192: */ 193: static const xmlChar * 194: htmlnamePop(htmlParserCtxtPtr ctxt) 195: { 196: const xmlChar *ret; 197: 198: if (ctxt->nameNr <= 0) 
199: return (NULL); 200: ctxt->nameNr--; 201: if (ctxt->nameNr < 0) 202: return (NULL); 203: if (ctxt->nameNr > 0) 204: ctxt->name = ctxt->nameTab[ctxt->nameNr - 1]; 205: else 206: ctxt->name = NULL; 207: ret = ctxt->nameTab[ctxt->nameNr]; 208: ctxt->nameTab[ctxt->nameNr] = NULL; 209: return (ret); 210: } 211: 212: /** 213: * htmlNodeInfoPush: 214: * @ctxt: an HTML parser context 215: * @value: the node info 216: * 217: * Pushes a new element name on top of the node info stack 218: * 219: * Returns 0 in case of error, the index in the stack otherwise 220: */ 221: static int 222: htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value) 223: { 224: if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) { 225: if (ctxt->nodeInfoMax == 0) 226: ctxt->nodeInfoMax = 5; 227: ctxt->nodeInfoMax *= 2; 228: ctxt->nodeInfoTab = (htmlParserNodeInfo *) 229: xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab, 230: ctxt->nodeInfoMax * 231: sizeof(ctxt->nodeInfoTab[0])); 232: if (ctxt->nodeInfoTab == NULL) { 233: htmlErrMemory(ctxt, NULL); 234: return (0); 235: } 236: } 237: ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value; 238: ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr]; 239: return (ctxt->nodeInfoNr++); 240: } 241: 242: /** 243: * htmlNodeInfoPop: 244: * @ctxt: an HTML parser context 245: * 246: * Pops the top element name from the node info stack 247: * 248: * Returns 0 in case of error, the pointer to NodeInfo otherwise 249: */ 250: static htmlParserNodeInfo * 251: htmlNodeInfoPop(htmlParserCtxtPtr ctxt) 252: { 253: if (ctxt->nodeInfoNr <= 0) 254: return (NULL); 255: ctxt->nodeInfoNr--; 256: if (ctxt->nodeInfoNr < 0) 257: return (NULL); 258: if (ctxt->nodeInfoNr > 0) 259: ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1]; 260: else 261: ctxt->nodeInfo = NULL; 262: return &ctxt->nodeInfoTab[ctxt->nodeInfoNr]; 263: } 264: 265: /* 266: * Macros for accessing the content. Those should be used only by the parser, 267: * and not exported. 268: * 269: * Dirty macros, i.e. 
one needs to make assumptions on the context to use them
 *
 *   CUR_PTR returns the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values, otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used
 *           only to compare on ASCII based substrings.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR,
 *           it should be used only to compare on ASCII based substrings.
 *   SKIP(n) skips n xmlChars, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser.
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    skips to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pops up unfinished entities on the fly.
 *   NEXTL(l) skips the current unicode character of l xmlChars long.
291: * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly 292: */ 293: 294: #define UPPER (toupper(*ctxt->input->cur)) 295: 296: #define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val) 297: 298: #define NXT(val) ctxt->input->cur[(val)] 299: 300: #define UPP(val) (toupper(ctxt->input->cur[(val)])) 301: 302: #define CUR_PTR ctxt->input->cur 303: 304: #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \ 305: (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \ 306: xmlParserInputShrink(ctxt->input) 307: 308: #define GROW if ((ctxt->progressive == 0) && \ 309: (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \ 310: xmlParserInputGrow(ctxt->input, INPUT_CHUNK) 311: 312: #define CURRENT ((int) (*ctxt->input->cur)) 313: 314: #define SKIP_BLANKS htmlSkipBlankChars(ctxt) 315: 316: /* Inported from XML */ 317: 318: /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */ 319: #define CUR ((int) (*ctxt->input->cur)) 320: #define NEXT xmlNextChar(ctxt) 321: 322: #define RAW (ctxt->token ? 
-1 : (*ctxt->input->cur)) 323: 324: 325: #define NEXTL(l) do { \ 326: if (*(ctxt->input->cur) == '\n') { \ 327: ctxt->input->line++; ctxt->input->col = 1; \ 328: } else ctxt->input->col++; \ 329: ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \ 330: } while (0) 331: 332: /************ 333: \ 334: if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \ 335: if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt); 336: ************/ 337: 338: #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l) 339: #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l) 340: 341: #define COPY_BUF(l,b,i,v) \ 342: if (l == 1) b[i++] = (xmlChar) v; \ 343: else i += xmlCopyChar(l,&b[i],v) 344: 345: /** 346: * htmlFindEncoding: 347: * @the HTML parser context 348: * 349: * Ty to find and encoding in the current data available in the input 350: * buffer this is needed to try to switch to the proper encoding when 351: * one face a character error. 352: * That's an heuristic, since it's operating outside of parsing it could 353: * try to use a meta which had been commented out, that's the reason it 354: * should only be used in case of error, not as a default. 
355: * 356: * Returns an encoding string or NULL if not found, the string need to 357: * be freed 358: */ 359: static xmlChar * 360: htmlFindEncoding(xmlParserCtxtPtr ctxt) { 361: const xmlChar *start, *cur, *end; 362: 363: if ((ctxt == NULL) || (ctxt->input == NULL) || 364: (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) || 365: (ctxt->input->buf->encoder != NULL)) 366: return(NULL); 367: if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL)) 368: return(NULL); 369: 370: start = ctxt->input->cur; 371: end = ctxt->input->end; 372: /* we also expect the input buffer to be zero terminated */ 373: if (*end != 0) 374: return(NULL); 375: 376: cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV"); 377: if (cur == NULL) 378: return(NULL); 379: cur = xmlStrcasestr(cur, BAD_CAST "CONTENT"); 380: if (cur == NULL) 381: return(NULL); 382: cur = xmlStrcasestr(cur, BAD_CAST "CHARSET="); 383: if (cur == NULL) 384: return(NULL); 385: cur += 8; 386: start = cur; 387: while (((*cur >= 'A') && (*cur <= 'Z')) || 388: ((*cur >= 'a') && (*cur <= 'z')) || 389: ((*cur >= '0') && (*cur <= '9')) || 390: (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/')) 391: cur++; 392: if (cur == start) 393: return(NULL); 394: return(xmlStrndup(start, cur - start)); 395: } 396: 397: /** 398: * htmlCurrentChar: 399: * @ctxt: the HTML parser context 400: * @len: pointer to the length of the char read 401: * 402: * The current char value, if using UTF-8 this may actually span multiple 403: * bytes in the input buffer. Implement the end of line normalization: 404: * 2.11 End-of-Line Handling 405: * If the encoding is unspecified, in the case we find an ISO-Latin-1 406: * char, then the encoding converter is plugged in automatically. 
407: * 408: * Returns the current char value and its length 409: */ 410: 411: static int 412: htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { 413: if (ctxt->instate == XML_PARSER_EOF) 414: return(0); 415: 416: if (ctxt->token != 0) { 417: *len = 0; 418: return(ctxt->token); 419: } 420: if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { 421: /* 422: * We are supposed to handle UTF8, check it's valid 423: * From rfc2044: encoding of the Unicode values on UTF-8: 424: * 425: * UCS-4 range (hex.) UTF-8 octet sequence (binary) 426: * 0000 0000-0000 007F 0xxxxxxx 427: * 0000 0080-0000 07FF 110xxxxx 10xxxxxx 428: * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx 429: * 430: * Check for the 0x110000 limit too 431: */ 432: const unsigned char *cur = ctxt->input->cur; 433: unsigned char c; 434: unsigned int val; 435: 436: c = *cur; 437: if (c & 0x80) { 438: if (cur[1] == 0) { 439: xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 440: cur = ctxt->input->cur; 441: } 442: if ((cur[1] & 0xc0) != 0x80) 443: goto encoding_error; 444: if ((c & 0xe0) == 0xe0) { 445: 446: if (cur[2] == 0) { 447: xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 448: cur = ctxt->input->cur; 449: } 450: if ((cur[2] & 0xc0) != 0x80) 451: goto encoding_error; 452: if ((c & 0xf0) == 0xf0) { 453: if (cur[3] == 0) { 454: xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 455: cur = ctxt->input->cur; 456: } 457: if (((c & 0xf8) != 0xf0) || 458: ((cur[3] & 0xc0) != 0x80)) 459: goto encoding_error; 460: /* 4-byte code */ 461: *len = 4; 462: val = (cur[0] & 0x7) << 18; 463: val |= (cur[1] & 0x3f) << 12; 464: val |= (cur[2] & 0x3f) << 6; 465: val |= cur[3] & 0x3f; 466: } else { 467: /* 3-byte code */ 468: *len = 3; 469: val = (cur[0] & 0xf) << 12; 470: val |= (cur[1] & 0x3f) << 6; 471: val |= cur[2] & 0x3f; 472: } 473: } else { 474: /* 2-byte code */ 475: *len = 2; 476: val = (cur[0] & 0x1f) << 6; 477: val |= cur[1] & 0x3f; 478: } 479: if (!IS_CHAR(val)) { 480: htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 481: "Char 0x%X out of 
allowed range\n", val); 482: } 483: return(val); 484: } else { 485: if ((*ctxt->input->cur == 0) && 486: (ctxt->input->cur < ctxt->input->end)) { 487: htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 488: "Char 0x%X out of allowed range\n", 0); 489: *len = 1; 490: return(' '); 491: } 492: /* 1-byte code */ 493: *len = 1; 494: return((int) *ctxt->input->cur); 495: } 496: } 497: /* 498: * Assume it's a fixed length encoding (1) with 499: * a compatible encoding for the ASCII set, since 500: * XML constructs only use < 128 chars 501: */ 502: *len = 1; 503: if ((int) *ctxt->input->cur < 0x80) 504: return((int) *ctxt->input->cur); 505: 506: /* 507: * Humm this is bad, do an automatic flow conversion 508: */ 509: { 510: xmlChar * guess; 511: xmlCharEncodingHandlerPtr handler; 512: 513: guess = htmlFindEncoding(ctxt); 514: if (guess == NULL) { 515: xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); 516: } else { 517: if (ctxt->input->encoding != NULL) 518: xmlFree((xmlChar *) ctxt->input->encoding); 519: ctxt->input->encoding = guess; 520: handler = xmlFindCharEncodingHandler((const char *) guess); 521: if (handler != NULL) { 522: xmlSwitchToEncoding(ctxt, handler); 523: } else { 524: htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 525: "Unsupported encoding %s", guess, NULL); 526: } 527: } 528: ctxt->charset = XML_CHAR_ENCODING_UTF8; 529: } 530: 531: return(xmlCurrentChar(ctxt, len)); 532: 533: encoding_error: 534: /* 535: * If we detect an UTF8 error that probably mean that the 536: * input encoding didn't get properly advertized in the 537: * declaration header. Report the error and switch the encoding 538: * to ISO-Latin-1 (if you don't like this policy, just declare the 539: * encoding !) 
540: */ 541: { 542: char buffer[150]; 543: 544: if (ctxt->input->end - ctxt->input->cur >= 4) { 545: snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", 546: ctxt->input->cur[0], ctxt->input->cur[1], 547: ctxt->input->cur[2], ctxt->input->cur[3]); 548: } else { 549: snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]); 550: } 551: htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 552: "Input is not proper UTF-8, indicate encoding !\n", 553: BAD_CAST buffer, NULL); 554: } 555: 556: ctxt->charset = XML_CHAR_ENCODING_8859_1; 557: *len = 1; 558: return((int) *ctxt->input->cur); 559: } 560: 561: /** 562: * htmlSkipBlankChars: 563: * @ctxt: the HTML parser context 564: * 565: * skip all blanks character found at that point in the input streams. 566: * 567: * Returns the number of space chars skipped 568: */ 569: 570: static int 571: htmlSkipBlankChars(xmlParserCtxtPtr ctxt) { 572: int res = 0; 573: 574: while (IS_BLANK_CH(*(ctxt->input->cur))) { 575: if ((*ctxt->input->cur == 0) && 576: (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) { 577: xmlPopInput(ctxt); 578: } else { 579: if (*(ctxt->input->cur) == '\n') { 580: ctxt->input->line++; ctxt->input->col = 1; 581: } else ctxt->input->col++; 582: ctxt->input->cur++; 583: ctxt->nbChars++; 584: if (*ctxt->input->cur == 0) 585: xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 586: } 587: res++; 588: } 589: return(res); 590: } 591: 592: 593: 594: /************************************************************************ 595: * * 596: * The list of HTML elements and their properties * 597: * * 598: ************************************************************************/ 599: 600: /* 601: * Start Tag: 1 means the start tag can be ommited 602: * End Tag: 1 means the end tag can be ommited 603: * 2 means it's forbidden (empty elements) 604: * 3 means the tag is stylistic and should be closed easily 605: * Depr: this element is deprecated 606: * DTD: 1 means that this element is valid only in the Loose DTD 607: * 2 
means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
        , subElements , impliedsubelt , Attributes, userdata
 */

/*
 * Definitions and a couple of vars for HTML Elements
 *
 * Each list macro expands to a comma separated run of string literals that
 * is spliced into the NULL terminated tables below; the NB_* macros give
 * the number of entries of the matching list and are parenthesized so they
 * can be used inside larger expressions.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata


/* ... and for HTML Attributes */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
/* was NB_NB_COREATTRS, a name that is not defined anywhere in this list */
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;


/* Other declarations that should go inline ... */
static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
        "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
        "tabindex", "onfocus", "onblur", NULL } ;
static const char* const target_attr[] = { "target", NULL } ;
static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
static const char* const alt_attr[] = { "alt", NULL } ;
static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
static const char* const href_attrs[] = { "href", NULL } ;
static const char* const clear_attrs[] = { "clear", NULL } ;
static const char* const inline_p[] = { INLINE, "p", NULL } ;

static const char* const flow_param[] = { FLOW, "param", NULL } ;
static const char* const applet_attrs[] = { COREATTRS , "codebase",
        "archive", "alt", "name", "height", "width", "align",
        "hspace", "vspace", NULL } ;
static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
        "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const basefont_attrs[] =
        { "id", "size", "color", "face", NULL } ;
static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
static const char* const body_depr[] = { "background", "bgcolor", "text",
        "link", "vlink", "alink", NULL } ;
static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
        "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;


static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const col_elt[] = { "col", NULL } ;
static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
static const char* const dl_contents[] = { "dt", "dd", NULL } ;
static const char* const compact_attr[] = { "compact", NULL } ;
static const char* const label_attr[] = { "label", NULL } ;
/* every list is NULL terminated; the terminator was missing on this one */
static const char* const fieldset_contents[] = { FLOW, "legend", NULL } ;
static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* const head_attrs[] = { I18N, "profile", NULL } ;
static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* const version_attr[] = { "version", NULL } ;
static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* const align_attr[] = { "align", NULL } ;
static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
static const char* const name_attr[] = { "name", NULL } ;
static const char* const action_attr[] = { "action", NULL } ;
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ; 731: static const char* const content_attr[] = { "content", NULL } ; 732: static const char* const type_attr[] = { "type", NULL } ; 733: static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ; 734: static const char* const object_contents[] = { FLOW, "param", NULL } ; 735: static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ; 736: static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ; 737: static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ; 738: static const char* const option_elt[] = { "option", NULL } ; 739: static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ; 740: static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ; 741: static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ; 742: static const char* const width_attr[] = { "width", NULL } ; 743: static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ; 744: static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ; 745: static const char* const language_attr[] = { "language", NULL } ; 746: static const char* const select_content[] = { "optgroup", "option", NULL } ; 747: static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ; 748: static const char* const style_attrs[] = { I18N, "media", "title", NULL } ; 749: static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ; 750: static const char* const 
table_depr[] = { "align", "bgcolor", NULL } ; 751: static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ; 752: static const char* const tr_elt[] = { "tr", NULL } ; 753: static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ; 754: static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ; 755: static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ; 756: static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ; 757: static const char* const tr_contents[] = { "th", "td", NULL } ; 758: static const char* const bgcolor_attr[] = { "bgcolor", NULL } ; 759: static const char* const li_elt[] = { "li", NULL } ; 760: static const char* const ul_depr[] = { "type", "compact", NULL} ; 761: static const char* const dir_attr[] = { "dir", NULL} ; 762: 763: #define DECL (const char**) 764: 765: static const htmlElemDesc 766: html40ElementTable[] = { 767: { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ", 768: DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL 769: }, 770: { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form", 771: DECL html_inline , NULL , DECL html_attrs, NULL, NULL 772: }, 773: { "acronym", 0, 0, 0, 0, 0, 0, 1, "", 774: DECL html_inline , NULL , DECL html_attrs, NULL, NULL 775: }, 776: { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ", 777: DECL inline_p , NULL , DECL html_attrs, NULL, NULL 778: }, 779: { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ", 780: DECL flow_param , NULL , NULL , DECL applet_attrs, NULL 781: }, 782: { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ", 783: EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr 784: }, 785: { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style", 786: DECL html_inline , NULL , DECL 
html_attrs, NULL, NULL 787: }, 788: { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ", 789: EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs 790: }, 791: { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " , 792: EMPTY , NULL , NULL, DECL basefont_attrs, NULL 793: }, 794: { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ", 795: DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr 796: }, 797: { "big", 0, 3, 0, 0, 0, 0, 1, "large text style", 798: DECL html_inline , NULL , DECL html_attrs, NULL, NULL 799: }, 800: { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ", 801: DECL html_flow , NULL , DECL quote_attrs , NULL, NULL 802: }, 803: { "body", 1, 1, 0, 0, 0, 0, 0, "document body ", 804: DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL 805: }, 806: { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ", 807: EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL 808: }, 809: { "button", 0, 0, 0, 0, 0, 0, 2, "push button ", 810: DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL 811: }, 812: { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ", 813: DECL html_inline , NULL , DECL html_attrs, NULL, NULL 814: }, 815: { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ", 816: DECL html_flow , NULL , NULL, DECL html_attrs, NULL 817: }, 818: { "cite", 0, 0, 0, 0, 0, 0, 1, "citation", 819: DECL html_inline , NULL , DECL html_attrs, NULL, NULL 820: }, 821: { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment", 822: DECL html_inline , NULL , DECL html_attrs, NULL, NULL 823: }, 824: { "col", 0, 2, 2, 1, 0, 0, 0, "table column ", 825: EMPTY , NULL , DECL col_attrs , NULL, NULL 826: }, 827: { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ", 828: DECL col_elt , "col" , DECL col_attrs , NULL, NULL 829: }, 830: { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ", 831: DECL html_flow , NULL , DECL html_attrs, NULL, NULL 832: }, 833: { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ", 834: DECL html_flow , 
NULL , DECL edit_attrs , NULL, NULL 835: }, 836: { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition", 837: DECL html_inline , NULL , DECL html_attrs, NULL, NULL 838: }, 839: { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list", 840: DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL 841: }, 842: { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container", 843: DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL 844: }, 845: { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ", 846: DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL 847: }, 848: { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ", 849: DECL html_inline, NULL, DECL html_attrs, NULL, NULL 850: }, 851: { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis", 852: DECL html_inline, NULL, DECL html_attrs, NULL, NULL 853: }, 854: { "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ", 855: EMPTY, NULL, DECL embed_attrs, NULL, NULL 856: }, 857: { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ", 858: DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL 859: }, 860: { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ", 861: DECL html_inline, NULL, NULL, DECL font_attrs, NULL 862: }, 863: { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ", 864: DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr 865: }, 866: { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " , 867: EMPTY, NULL, NULL, DECL frame_attrs, NULL 868: }, 869: { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" , 870: DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL 871: }, 872: { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ", 873: DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 874: }, 875: { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ", 876: DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 877: }, 878: { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ", 879: DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 880: }, 881: { "h4", 0, 0, 0, 0, 0, 0, 0, "heading 
", 882: DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 883: }, 884: { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ", 885: DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 886: }, 887: { "h6", 0, 0, 0, 0, 0, 0, 0, "heading ", 888: DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 889: }, 890: { "head", 1, 1, 0, 0, 0, 0, 0, "document head ", 891: DECL head_contents, NULL, DECL head_attrs, NULL, NULL 892: }, 893: { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " , 894: EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL 895: }, 896: { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ", 897: DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL 898: }, 899: { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style", 900: DECL html_inline, NULL, DECL html_attrs, NULL, NULL 901: }, 902: { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ", 903: DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL 904: }, 905: { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ", 906: EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs 907: }, 908: { "input", 0, 2, 2, 1, 0, 0, 1, "form control ", 909: EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL 910: }, 911: { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text", 912: DECL html_flow, NULL, DECL edit_attrs, NULL, NULL 913: }, 914: { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ", 915: EMPTY, NULL, NULL, DECL prompt_attrs, NULL 916: }, 917: { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user", 918: DECL html_inline, NULL, DECL html_attrs, NULL, NULL 919: }, 920: { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ", 921: DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL 922: }, 923: { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ", 924: DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL 925: }, 926: { "li", 0, 1, 1, 0, 0, 0, 0, "list item ", 927: DECL html_flow, NULL, DECL html_attrs, NULL, NULL 928: }, 929: { "link", 0, 2, 2, 1, 0, 0, 0, "a 
media-independent link ", 930: EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL 931: }, 932: { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ", 933: DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr 934: }, 935: { "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ", 936: DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL 937: }, 938: { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ", 939: EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr 940: }, 941: { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ", 942: DECL noframes_content, "body" , DECL html_attrs, NULL, NULL 943: }, 944: { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ", 945: DECL html_flow, "div", DECL html_attrs, NULL, NULL 946: }, 947: { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ", 948: DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL 949: }, 950: { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ", 951: DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL 952: }, 953: { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ", 954: DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr 955: }, 956: { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " , 957: DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL 958: }, 959: { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ", 960: DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 961: }, 962: { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ", 963: EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr 964: }, 965: { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ", 966: DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL 967: }, 968: { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ", 969: DECL html_inline, NULL, DECL quote_attrs, NULL, NULL 970: }, 971: { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style", 972: DECL html_inline, NULL, NULL, DECL html_attrs, NULL 
973: }, 974: { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.", 975: DECL html_inline, NULL, DECL html_attrs, NULL, NULL 976: }, 977: { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ", 978: DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr 979: }, 980: { "select", 0, 0, 0, 0, 0, 0, 1, "option selector ", 981: DECL select_content, NULL, DECL select_attrs, NULL, NULL 982: }, 983: { "small", 0, 3, 0, 0, 0, 0, 1, "small text style", 984: DECL html_inline, NULL, DECL html_attrs, NULL, NULL 985: }, 986: { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ", 987: DECL html_inline, NULL, DECL html_attrs, NULL, NULL 988: }, 989: { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text", 990: DECL html_inline, NULL, NULL, DECL html_attrs, NULL 991: }, 992: { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis", 993: DECL html_inline, NULL, DECL html_attrs, NULL, NULL 994: }, 995: { "style", 0, 0, 0, 0, 0, 0, 0, "style info ", 996: DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr 997: }, 998: { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript", 999: DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1000: }, 1001: { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ", 1002: DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1003: }, 1004: { "table", 0, 0, 0, 0, 0, 0, 0, "", 1005: DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL 1006: }, 1007: { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ", 1008: DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 1009: }, 1010: { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell", 1011: DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL 1012: }, 1013: { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ", 1014: DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr 1015: }, 1016: { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ", 1017: DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 1018: }, 1019: { "th", 0, 1, 0, 0, 0, 0, 0, "table header 
cell", 1020: DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL 1021: }, 1022: { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ", 1023: DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 1024: }, 1025: { "title", 0, 0, 0, 0, 0, 0, 0, "document title ", 1026: DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL 1027: }, 1028: { "tr", 0, 0, 0, 0, 0, 0, 0, "table row ", 1029: DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL 1030: }, 1031: { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style", 1032: DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1033: }, 1034: { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style", 1035: DECL html_inline, NULL, NULL, DECL html_attrs, NULL 1036: }, 1037: { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ", 1038: DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL 1039: }, 1040: { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument", 1041: DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1042: } 1043: }; 1044: 1045: /* 1046: * start tags that imply the end of current element 1047: */ 1048: static const char * const htmlStartClose[] = { 1049: "form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6", 1050: "dl", "ul", "ol", "menu", "dir", "address", "pre", 1051: "listing", "xmp", "head", NULL, 1052: "head", "p", NULL, 1053: "title", "p", NULL, 1054: "body", "head", "style", "link", "title", "p", NULL, 1055: "frameset", "head", "style", "link", "title", "p", NULL, 1056: "li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address", 1057: "pre", "listing", "xmp", "head", "li", NULL, 1058: "hr", "p", "head", NULL, 1059: "h1", "p", "head", NULL, 1060: "h2", "p", "head", NULL, 1061: "h3", "p", "head", NULL, 1062: "h4", "p", "head", NULL, 1063: "h5", "p", "head", NULL, 1064: "h6", "p", "head", NULL, 1065: "dir", "p", "head", NULL, 1066: "address", "p", "head", "ul", NULL, 1067: "pre", "p", "head", "ul", NULL, 1068: "listing", "p", "head", NULL, 1069: "xmp", "p", "head", NULL, 
1070: "blockquote", "p", "head", NULL, 1071: "dl", "p", "dt", "menu", "dir", "address", "pre", "listing", 1072: "xmp", "head", NULL, 1073: "dt", "p", "menu", "dir", "address", "pre", "listing", "xmp", 1074: "head", "dd", NULL, 1075: "dd", "p", "menu", "dir", "address", "pre", "listing", "xmp", 1076: "head", "dt", NULL, 1077: "ul", "p", "head", "ol", "menu", "dir", "address", "pre", 1078: "listing", "xmp", NULL, 1079: "ol", "p", "head", "ul", NULL, 1080: "menu", "p", "head", "ul", NULL, 1081: "p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL, 1082: "div", "p", "head", NULL, 1083: "noscript", "p", "head", NULL, 1084: "center", "font", "b", "i", "p", "head", NULL, 1085: "a", "a", NULL, 1086: "caption", "p", NULL, 1087: "colgroup", "caption", "colgroup", "col", "p", NULL, 1088: "col", "caption", "col", "p", NULL, 1089: "table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", 1090: "listing", "xmp", "a", NULL, 1091: "th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL, 1092: "td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL, 1093: "tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL, 1094: "thead", "caption", "col", "colgroup", NULL, 1095: "tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead", 1096: "tbody", "p", NULL, 1097: "tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead", 1098: "tfoot", "tbody", "p", NULL, 1099: "optgroup", "option", NULL, 1100: "option", "option", NULL, 1101: "fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", 1102: "pre", "listing", "xmp", "a", NULL, 1103: NULL 1104: }; 1105: 1106: /* 1107: * The list of HTML elements which are supposed not to have 1108: * CDATA content and where a p element will be implied 1109: * 1110: * TODO: extend that list by reading the HTML SGML DTD on 1111: * implied paragraph 1112: */ 1113: static const char *const htmlNoContentElements[] = { 1114: "html", 1115: "head", 1116: NULL 1117: }; 1118: 1119: /* 1120: * 
The list of HTML attributes which are of content %Script; 1121: * NOTE: when adding ones, check htmlIsScriptAttribute() since 1122: * it assumes the name starts with 'on' 1123: */ 1124: static const char *const htmlScriptAttributes[] = { 1125: "onclick", 1126: "ondblclick", 1127: "onmousedown", 1128: "onmouseup", 1129: "onmouseover", 1130: "onmousemove", 1131: "onmouseout", 1132: "onkeypress", 1133: "onkeydown", 1134: "onkeyup", 1135: "onload", 1136: "onunload", 1137: "onfocus", 1138: "onblur", 1139: "onsubmit", 1140: "onrest", 1141: "onchange", 1142: "onselect" 1143: }; 1144: 1145: /* 1146: * This table is used by the htmlparser to know what to do with 1147: * broken html pages. By assigning different priorities to different 1148: * elements the parser can decide how to handle extra endtags. 1149: * Endtags are only allowed to close elements with lower or equal 1150: * priority. 1151: */ 1152: 1153: typedef struct { 1154: const char *name; 1155: int priority; 1156: } elementPriority; 1157: 1158: static const elementPriority htmlEndPriority[] = { 1159: {"div", 150}, 1160: {"td", 160}, 1161: {"th", 160}, 1162: {"tr", 170}, 1163: {"thead", 180}, 1164: {"tbody", 180}, 1165: {"tfoot", 180}, 1166: {"table", 190}, 1167: {"head", 200}, 1168: {"body", 200}, 1169: {"html", 220}, 1170: {NULL, 100} /* Default priority */ 1171: }; 1172: 1173: static const char** htmlStartCloseIndex[100]; 1174: static int htmlStartCloseIndexinitialized = 0; 1175: 1176: /************************************************************************ 1177: * * 1178: * functions to handle HTML specific data * 1179: * * 1180: ************************************************************************/ 1181: 1182: /** 1183: * htmlInitAutoClose: 1184: * 1185: * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. 1186: * This is not reentrant. Call xmlInitParser() once before processing in 1187: * case of use in multithreaded programs. 
1188: */ 1189: void 1190: htmlInitAutoClose(void) { 1191: int indx, i = 0; 1192: 1193: if (htmlStartCloseIndexinitialized) return; 1194: 1195: for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL; 1196: indx = 0; 1197: while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) { 1198: htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i]; 1199: while (htmlStartClose[i] != NULL) i++; 1200: i++; 1201: } 1202: htmlStartCloseIndexinitialized = 1; 1203: } 1204: 1205: /** 1206: * htmlTagLookup: 1207: * @tag: The tag name in lowercase 1208: * 1209: * Lookup the HTML tag in the ElementTable 1210: * 1211: * Returns the related htmlElemDescPtr or NULL if not found. 1212: */ 1213: const htmlElemDesc * 1214: htmlTagLookup(const xmlChar *tag) { 1215: unsigned int i; 1216: 1217: for (i = 0; i < (sizeof(html40ElementTable) / 1218: sizeof(html40ElementTable[0]));i++) { 1219: if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name)) 1220: return((htmlElemDescPtr) &html40ElementTable[i]); 1221: } 1222: return(NULL); 1223: } 1224: 1225: /** 1226: * htmlGetEndPriority: 1227: * @name: The name of the element to look up the priority for. 1228: * 1229: * Return value: The "endtag" priority. 1230: **/ 1231: static int 1232: htmlGetEndPriority (const xmlChar *name) { 1233: int i = 0; 1234: 1235: while ((htmlEndPriority[i].name != NULL) && 1236: (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name))) 1237: i++; 1238: 1239: return(htmlEndPriority[i].priority); 1240: } 1241: 1242: 1243: /** 1244: * htmlCheckAutoClose: 1245: * @newtag: The new tag name 1246: * @oldtag: The old tag name 1247: * 1248: * Checks whether the new tag is one of the registered valid tags for 1249: * closing old. 1250: * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. 1251: * 1252: * Returns 0 if no, 1 if yes. 
1253: */ 1254: static int 1255: htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag) 1256: { 1257: int i, indx; 1258: const char **closed = NULL; 1259: 1260: if (htmlStartCloseIndexinitialized == 0) 1261: htmlInitAutoClose(); 1262: 1263: /* inefficient, but not a big deal */ 1264: for (indx = 0; indx < 100; indx++) { 1265: closed = htmlStartCloseIndex[indx]; 1266: if (closed == NULL) 1267: return (0); 1268: if (xmlStrEqual(BAD_CAST * closed, newtag)) 1269: break; 1270: } 1271: 1272: i = closed - htmlStartClose; 1273: i++; 1274: while (htmlStartClose[i] != NULL) { 1275: if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) { 1276: return (1); 1277: } 1278: i++; 1279: } 1280: return (0); 1281: } 1282: 1283: /** 1284: * htmlAutoCloseOnClose: 1285: * @ctxt: an HTML parser context 1286: * @newtag: The new tag name 1287: * @force: force the tag closure 1288: * 1289: * The HTML DTD allows an ending tag to implicitly close other tags. 1290: */ 1291: static void 1292: htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) 1293: { 1294: const htmlElemDesc *info; 1295: int i, priority; 1296: 1297: priority = htmlGetEndPriority(newtag); 1298: 1299: for (i = (ctxt->nameNr - 1); i >= 0; i--) { 1300: 1301: if (xmlStrEqual(newtag, ctxt->nameTab[i])) 1302: break; 1303: /* 1304: * A missplaced endtag can only close elements with lower 1305: * or equal priority, so if we find an element with higher 1306: * priority before we find an element with 1307: * matching name, we just ignore this endtag 1308: */ 1309: if (htmlGetEndPriority(ctxt->nameTab[i]) > priority) 1310: return; 1311: } 1312: if (i < 0) 1313: return; 1314: 1315: while (!xmlStrEqual(newtag, ctxt->name)) { 1316: info = htmlTagLookup(ctxt->name); 1317: if ((info != NULL) && (info->endTag == 3)) { 1318: htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 1319: "Opening and ending tag mismatch: %s and %s\n", 1320: newtag, ctxt->name); 1321: } 1322: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != 
NULL)) 1323: ctxt->sax->endElement(ctxt->userData, ctxt->name); 1324: htmlnamePop(ctxt); 1325: } 1326: } 1327: 1328: /** 1329: * htmlAutoCloseOnEnd: 1330: * @ctxt: an HTML parser context 1331: * 1332: * Close all remaining tags at the end of the stream 1333: */ 1334: static void 1335: htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) 1336: { 1337: int i; 1338: 1339: if (ctxt->nameNr == 0) 1340: return; 1341: for (i = (ctxt->nameNr - 1); i >= 0; i--) { 1342: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1343: ctxt->sax->endElement(ctxt->userData, ctxt->name); 1344: htmlnamePop(ctxt); 1345: } 1346: } 1347: 1348: /** 1349: * htmlAutoClose: 1350: * @ctxt: an HTML parser context 1351: * @newtag: The new tag name or NULL 1352: * 1353: * The HTML DTD allows a tag to implicitly close other tags. 1354: * The list is kept in htmlStartClose array. This function is 1355: * called when a new tag has been detected and generates the 1356: * appropriates closes if possible/needed. 1357: * If newtag is NULL this mean we are at the end of the resource 1358: * and we should check 1359: */ 1360: static void 1361: htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) 1362: { 1363: while ((newtag != NULL) && (ctxt->name != NULL) && 1364: (htmlCheckAutoClose(newtag, ctxt->name))) { 1365: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1366: ctxt->sax->endElement(ctxt->userData, ctxt->name); 1367: htmlnamePop(ctxt); 1368: } 1369: if (newtag == NULL) { 1370: htmlAutoCloseOnEnd(ctxt); 1371: return; 1372: } 1373: while ((newtag == NULL) && (ctxt->name != NULL) && 1374: ((xmlStrEqual(ctxt->name, BAD_CAST "head")) || 1375: (xmlStrEqual(ctxt->name, BAD_CAST "body")) || 1376: (xmlStrEqual(ctxt->name, BAD_CAST "html")))) { 1377: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1378: ctxt->sax->endElement(ctxt->userData, ctxt->name); 1379: htmlnamePop(ctxt); 1380: } 1381: } 1382: 1383: /** 1384: * htmlAutoCloseTag: 1385: * @doc: the HTML document 1386: * 
@name: The tag name 1387: * @elem: the HTML element 1388: * 1389: * The HTML DTD allows a tag to implicitly close other tags. 1390: * The list is kept in htmlStartClose array. This function checks 1391: * if the element or one of it's children would autoclose the 1392: * given tag. 1393: * 1394: * Returns 1 if autoclose, 0 otherwise 1395: */ 1396: int 1397: htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) { 1398: htmlNodePtr child; 1399: 1400: if (elem == NULL) return(1); 1401: if (xmlStrEqual(name, elem->name)) return(0); 1402: if (htmlCheckAutoClose(elem->name, name)) return(1); 1403: child = elem->children; 1404: while (child != NULL) { 1405: if (htmlAutoCloseTag(doc, name, child)) return(1); 1406: child = child->next; 1407: } 1408: return(0); 1409: } 1410: 1411: /** 1412: * htmlIsAutoClosed: 1413: * @doc: the HTML document 1414: * @elem: the HTML element 1415: * 1416: * The HTML DTD allows a tag to implicitly close other tags. 1417: * The list is kept in htmlStartClose array. 
This function checks 1418: * if a tag is autoclosed by one of it's child 1419: * 1420: * Returns 1 if autoclosed, 0 otherwise 1421: */ 1422: int 1423: htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) { 1424: htmlNodePtr child; 1425: 1426: if (elem == NULL) return(1); 1427: child = elem->children; 1428: while (child != NULL) { 1429: if (htmlAutoCloseTag(doc, elem->name, child)) return(1); 1430: child = child->next; 1431: } 1432: return(0); 1433: } 1434: 1435: /** 1436: * htmlCheckImplied: 1437: * @ctxt: an HTML parser context 1438: * @newtag: The new tag name 1439: * 1440: * The HTML DTD allows a tag to exists only implicitly 1441: * called when a new tag has been detected and generates the 1442: * appropriates implicit tags if missing 1443: */ 1444: static void 1445: htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) { 1446: int i; 1447: 1448: if (ctxt->options & HTML_PARSE_NOIMPLIED) 1449: return; 1450: if (!htmlOmittedDefaultValue) 1451: return; 1452: if (xmlStrEqual(newtag, BAD_CAST"html")) 1453: return; 1454: if (ctxt->nameNr <= 0) { 1455: htmlnamePush(ctxt, BAD_CAST"html"); 1456: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1457: ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL); 1458: } 1459: if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head"))) 1460: return; 1461: if ((ctxt->nameNr <= 1) && 1462: ((xmlStrEqual(newtag, BAD_CAST"script")) || 1463: (xmlStrEqual(newtag, BAD_CAST"style")) || 1464: (xmlStrEqual(newtag, BAD_CAST"meta")) || 1465: (xmlStrEqual(newtag, BAD_CAST"link")) || 1466: (xmlStrEqual(newtag, BAD_CAST"title")) || 1467: (xmlStrEqual(newtag, BAD_CAST"base")))) { 1468: if (ctxt->html >= 3) { 1469: /* we already saw or generated an <head> before */ 1470: return; 1471: } 1472: /* 1473: * dropped OBJECT ... i you put it first BODY will be 1474: * assumed ! 
1475: */ 1476: htmlnamePush(ctxt, BAD_CAST"head"); 1477: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1478: ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL); 1479: } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) && 1480: (!xmlStrEqual(newtag, BAD_CAST"frame")) && 1481: (!xmlStrEqual(newtag, BAD_CAST"frameset"))) { 1482: if (ctxt->html >= 10) { 1483: /* we already saw or generated a <body> before */ 1484: return; 1485: } 1486: for (i = 0;i < ctxt->nameNr;i++) { 1487: if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) { 1488: return; 1489: } 1490: if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) { 1491: return; 1492: } 1493: } 1494: 1495: htmlnamePush(ctxt, BAD_CAST"body"); 1496: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1497: ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL); 1498: } 1499: } 1500: 1501: /** 1502: * htmlCheckParagraph 1503: * @ctxt: an HTML parser context 1504: * 1505: * Check whether a p element need to be implied before inserting 1506: * characters in the current element. 1507: * 1508: * Returns 1 if a paragraph has been inserted, 0 if not and -1 1509: * in case of error. 
1510: */ 1511: 1512: static int 1513: htmlCheckParagraph(htmlParserCtxtPtr ctxt) { 1514: const xmlChar *tag; 1515: int i; 1516: 1517: if (ctxt == NULL) 1518: return(-1); 1519: tag = ctxt->name; 1520: if (tag == NULL) { 1521: htmlAutoClose(ctxt, BAD_CAST"p"); 1522: htmlCheckImplied(ctxt, BAD_CAST"p"); 1523: htmlnamePush(ctxt, BAD_CAST"p"); 1524: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1525: ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); 1526: return(1); 1527: } 1528: if (!htmlOmittedDefaultValue) 1529: return(0); 1530: for (i = 0; htmlNoContentElements[i] != NULL; i++) { 1531: if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) { 1532: htmlAutoClose(ctxt, BAD_CAST"p"); 1533: htmlCheckImplied(ctxt, BAD_CAST"p"); 1534: htmlnamePush(ctxt, BAD_CAST"p"); 1535: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1536: ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); 1537: return(1); 1538: } 1539: } 1540: return(0); 1541: } 1542: 1543: /** 1544: * htmlIsScriptAttribute: 1545: * @name: an attribute name 1546: * 1547: * Check if an attribute is of content type Script 1548: * 1549: * Returns 1 is the attribute is a script 0 otherwise 1550: */ 1551: int 1552: htmlIsScriptAttribute(const xmlChar *name) { 1553: unsigned int i; 1554: 1555: if (name == NULL) 1556: return(0); 1557: /* 1558: * all script attributes start with 'on' 1559: */ 1560: if ((name[0] != 'o') || (name[1] != 'n')) 1561: return(0); 1562: for (i = 0; 1563: i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]); 1564: i++) { 1565: if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i])) 1566: return(1); 1567: } 1568: return(0); 1569: } 1570: 1571: /************************************************************************ 1572: * * 1573: * The list of HTML predefined entities * 1574: * * 1575: ************************************************************************/ 1576: 1577: 1578: static const htmlEntityDesc html40EntitiesTable[] = { 
1579: /* 1580: * the 4 absolute ones, plus apostrophe. 1581: */ 1582: { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" }, 1583: { 38, "amp", "ampersand, U+0026 ISOnum" }, 1584: { 39, "apos", "single quote" }, 1585: { 60, "lt", "less-than sign, U+003C ISOnum" }, 1586: { 62, "gt", "greater-than sign, U+003E ISOnum" }, 1587: 1588: /* 1589: * A bunch still in the 128-255 range 1590: * Replacing them depend really on the charset used. 1591: */ 1592: { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" }, 1593: { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" }, 1594: { 162, "cent", "cent sign, U+00A2 ISOnum" }, 1595: { 163, "pound","pound sign, U+00A3 ISOnum" }, 1596: { 164, "curren","currency sign, U+00A4 ISOnum" }, 1597: { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" }, 1598: { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" }, 1599: { 167, "sect", "section sign, U+00A7 ISOnum" }, 1600: { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" }, 1601: { 169, "copy", "copyright sign, U+00A9 ISOnum" }, 1602: { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" }, 1603: { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" }, 1604: { 172, "not", "not sign, U+00AC ISOnum" }, 1605: { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" }, 1606: { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" }, 1607: { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" }, 1608: { 176, "deg", "degree sign, U+00B0 ISOnum" }, 1609: { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" }, 1610: { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" }, 1611: { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" }, 1612: { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" }, 1613: { 181, "micro","micro sign, U+00B5 ISOnum" }, 1614: { 182, 
"para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" }, 1615: { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" }, 1616: { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" }, 1617: { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" }, 1618: { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" }, 1619: { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" }, 1620: { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" }, 1621: { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" }, 1622: { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" }, 1623: { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" }, 1624: { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" }, 1625: { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" }, 1626: { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" }, 1627: { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" }, 1628: { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" }, 1629: { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" }, 1630: { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" }, 1631: { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" }, 1632: { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" }, 1633: { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" }, 1634: { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" }, 1635: { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" }, 1636: { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" }, 1637: { 205, "Iacute","latin capital letter I with acute, 
U+00CD ISOlat1" }, 1638: { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" }, 1639: { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" }, 1640: { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" }, 1641: { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" }, 1642: { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" }, 1643: { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" }, 1644: { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" }, 1645: { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" }, 1646: { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" }, 1647: { 215, "times","multiplication sign, U+00D7 ISOnum" }, 1648: { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" }, 1649: { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" }, 1650: { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" }, 1651: { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" }, 1652: { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" }, 1653: { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" }, 1654: { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" }, 1655: { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" }, 1656: { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" }, 1657: { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" }, 1658: { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" }, 1659: { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" }, 1660: { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" }, 1661: { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" }, 1662: { 230, "aelig","latin small letter ae = latin small 
ligature ae, U+00E6 ISOlat1" }, 1663: { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" }, 1664: { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" }, 1665: { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" }, 1666: { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" }, 1667: { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" }, 1668: { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" }, 1669: { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" }, 1670: { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" }, 1671: { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" }, 1672: { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" }, 1673: { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" }, 1674: { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" }, 1675: { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" }, 1676: { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" }, 1677: { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" }, 1678: { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" }, 1679: { 247, "divide","division sign, U+00F7 ISOnum" }, 1680: { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" }, 1681: { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" }, 1682: { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" }, 1683: { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" }, 1684: { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" }, 1685: { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" }, 1686: { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" }, 1687: { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" }, 1688: 1689: { 338, "OElig","latin capital ligature OE, U+0152 
ISOlat2" }, 1690: { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" }, 1691: { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" }, 1692: { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" }, 1693: { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" }, 1694: 1695: /* 1696: * Anything below should really be kept as entities references 1697: */ 1698: { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" }, 1699: 1700: { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" }, 1701: { 732, "tilde","small tilde, U+02DC ISOdia" }, 1702: 1703: { 913, "Alpha","greek capital letter alpha, U+0391" }, 1704: { 914, "Beta", "greek capital letter beta, U+0392" }, 1705: { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" }, 1706: { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" }, 1707: { 917, "Epsilon","greek capital letter epsilon, U+0395" }, 1708: { 918, "Zeta", "greek capital letter zeta, U+0396" }, 1709: { 919, "Eta", "greek capital letter eta, U+0397" }, 1710: { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" }, 1711: { 921, "Iota", "greek capital letter iota, U+0399" }, 1712: { 922, "Kappa","greek capital letter kappa, U+039A" }, 1713: { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" }, 1714: { 924, "Mu", "greek capital letter mu, U+039C" }, 1715: { 925, "Nu", "greek capital letter nu, U+039D" }, 1716: { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" }, 1717: { 927, "Omicron","greek capital letter omicron, U+039F" }, 1718: { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" }, 1719: { 929, "Rho", "greek capital letter rho, U+03A1" }, 1720: { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" }, 1721: { 932, "Tau", "greek capital letter tau, U+03A4" }, 1722: { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" }, 1723: { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" }, 1724: { 935, "Chi", "greek capital 
letter chi, U+03A7" }, 1725: { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" }, 1726: { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" }, 1727: 1728: { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" }, 1729: { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" }, 1730: { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" }, 1731: { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" }, 1732: { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" }, 1733: { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" }, 1734: { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" }, 1735: { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" }, 1736: { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" }, 1737: { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" }, 1738: { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" }, 1739: { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" }, 1740: { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" }, 1741: { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" }, 1742: { 959, "omicron","greek small letter omicron, U+03BF NEW" }, 1743: { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" }, 1744: { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" }, 1745: { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" }, 1746: { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" }, 1747: { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" }, 1748: { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" }, 1749: { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" }, 1750: { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" }, 1751: { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" }, 1752: { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" }, 1753: { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" }, 1754: { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" }, 1755: { 982, "piv", "greek pi symbol, 
U+03D6 ISOgrk3" }, 1756: 1757: { 8194, "ensp", "en space, U+2002 ISOpub" }, 1758: { 8195, "emsp", "em space, U+2003 ISOpub" }, 1759: { 8201, "thinsp","thin space, U+2009 ISOpub" }, 1760: { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" }, 1761: { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" }, 1762: { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" }, 1763: { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" }, 1764: { 8211, "ndash","en dash, U+2013 ISOpub" }, 1765: { 8212, "mdash","em dash, U+2014 ISOpub" }, 1766: { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" }, 1767: { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" }, 1768: { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" }, 1769: { 8220, "ldquo","left double quotation mark, U+201C ISOnum" }, 1770: { 8221, "rdquo","right double quotation mark, U+201D ISOnum" }, 1771: { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" }, 1772: { 8224, "dagger","dagger, U+2020 ISOpub" }, 1773: { 8225, "Dagger","double dagger, U+2021 ISOpub" }, 1774: 1775: { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" }, 1776: { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" }, 1777: 1778: { 8240, "permil","per mille sign, U+2030 ISOtech" }, 1779: 1780: { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" }, 1781: { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" }, 1782: 1783: { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" }, 1784: { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" }, 1785: 1786: { 8254, "oline","overline = spacing overscore, U+203E NEW" }, 1787: { 8260, "frasl","fraction slash, U+2044 NEW" }, 1788: 1789: { 8364, "euro", "euro sign, U+20AC NEW" }, 1790: 1791: { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" }, 1792: { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" }, 1793: { 8476, 
"real", "blackletter capital R = real part symbol, U+211C ISOamso" }, 1794: { 8482, "trade","trade mark sign, U+2122 ISOnum" }, 1795: { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" }, 1796: { 8592, "larr", "leftwards arrow, U+2190 ISOnum" }, 1797: { 8593, "uarr", "upwards arrow, U+2191 ISOnum" }, 1798: { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" }, 1799: { 8595, "darr", "downwards arrow, U+2193 ISOnum" }, 1800: { 8596, "harr", "left right arrow, U+2194 ISOamsa" }, 1801: { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" }, 1802: { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" }, 1803: { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" }, 1804: { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" }, 1805: { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" }, 1806: { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" }, 1807: 1808: { 8704, "forall","for all, U+2200 ISOtech" }, 1809: { 8706, "part", "partial differential, U+2202 ISOtech" }, 1810: { 8707, "exist","there exists, U+2203 ISOtech" }, 1811: { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" }, 1812: { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" }, 1813: { 8712, "isin", "element of, U+2208 ISOtech" }, 1814: { 8713, "notin","not an element of, U+2209 ISOtech" }, 1815: { 8715, "ni", "contains as member, U+220B ISOtech" }, 1816: { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" }, 1817: { 8721, "sum", "n-ary summation, U+2211 ISOamsb" }, 1818: { 8722, "minus","minus sign, U+2212 ISOtech" }, 1819: { 8727, "lowast","asterisk operator, U+2217 ISOtech" }, 1820: { 8730, "radic","square root = radical sign, U+221A ISOtech" }, 1821: { 8733, "prop", "proportional to, U+221D ISOtech" }, 1822: { 8734, "infin","infinity, U+221E ISOtech" }, 1823: { 8736, "ang", "angle, U+2220 ISOamso" }, 1824: { 8743, "and", "logical and = wedge, U+2227 ISOtech" }, 1825: { 8744, "or", "logical or = 
vee, U+2228 ISOtech" }, 1826: { 8745, "cap", "intersection = cap, U+2229 ISOtech" }, 1827: { 8746, "cup", "union = cup, U+222A ISOtech" }, 1828: { 8747, "int", "integral, U+222B ISOtech" }, 1829: { 8756, "there4","therefore, U+2234 ISOtech" }, 1830: { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" }, 1831: { 8773, "cong", "approximately equal to, U+2245 ISOtech" }, 1832: { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" }, 1833: { 8800, "ne", "not equal to, U+2260 ISOtech" }, 1834: { 8801, "equiv","identical to, U+2261 ISOtech" }, 1835: { 8804, "le", "less-than or equal to, U+2264 ISOtech" }, 1836: { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" }, 1837: { 8834, "sub", "subset of, U+2282 ISOtech" }, 1838: { 8835, "sup", "superset of, U+2283 ISOtech" }, 1839: { 8836, "nsub", "not a subset of, U+2284 ISOamsn" }, 1840: { 8838, "sube", "subset of or equal to, U+2286 ISOtech" }, 1841: { 8839, "supe", "superset of or equal to, U+2287 ISOtech" }, 1842: { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" }, 1843: { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" }, 1844: { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" }, 1845: { 8901, "sdot", "dot operator, U+22C5 ISOamsb" }, 1846: { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" }, 1847: { 8969, "rceil","right ceiling, U+2309 ISOamsc" }, 1848: { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" }, 1849: { 8971, "rfloor","right floor, U+230B ISOamsc" }, 1850: { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" }, 1851: { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" }, 1852: { 9674, "loz", "lozenge, U+25CA ISOpub" }, 1853: 1854: { 9824, "spades","black spade suit, U+2660 ISOpub" }, 1855: { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" }, 1856: { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" }, 1857: { 9830, "diams","black diamond suit, 
U+2666 ISOpub" }, 1858: 1859: }; 1860: 1861: /************************************************************************ 1862: * * 1863: * Commodity functions to handle entities * 1864: * * 1865: ************************************************************************/ 1866: 1867: /* 1868: * Macro used to grow the current buffer. 1869: */ 1870: #define growBuffer(buffer) { \ 1871: xmlChar *tmp; \ 1872: buffer##_size *= 2; \ 1873: tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \ 1874: if (tmp == NULL) { \ 1875: htmlErrMemory(ctxt, "growing buffer\n"); \ 1876: xmlFree(buffer); \ 1877: return(NULL); \ 1878: } \ 1879: buffer = tmp; \ 1880: } 1881: 1882: /** 1883: * htmlEntityLookup: 1884: * @name: the entity name 1885: * 1886: * Lookup the given entity in EntitiesTable 1887: * 1888: * TODO: the linear scan is really ugly, an hash table is really needed. 1889: * 1890: * Returns the associated htmlEntityDescPtr if found, NULL otherwise. 1891: */ 1892: const htmlEntityDesc * 1893: htmlEntityLookup(const xmlChar *name) { 1894: unsigned int i; 1895: 1896: for (i = 0;i < (sizeof(html40EntitiesTable)/ 1897: sizeof(html40EntitiesTable[0]));i++) { 1898: if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) { 1899: return((htmlEntityDescPtr) &html40EntitiesTable[i]); 1900: } 1901: } 1902: return(NULL); 1903: } 1904: 1905: /** 1906: * htmlEntityValueLookup: 1907: * @value: the entity's unicode value 1908: * 1909: * Lookup the given entity in EntitiesTable 1910: * 1911: * TODO: the linear scan is really ugly, an hash table is really needed. 1912: * 1913: * Returns the associated htmlEntityDescPtr if found, NULL otherwise. 
1914: */ 1915: const htmlEntityDesc * 1916: htmlEntityValueLookup(unsigned int value) { 1917: unsigned int i; 1918: 1919: for (i = 0;i < (sizeof(html40EntitiesTable)/ 1920: sizeof(html40EntitiesTable[0]));i++) { 1921: if (html40EntitiesTable[i].value >= value) { 1922: if (html40EntitiesTable[i].value > value) 1923: break; 1924: return((htmlEntityDescPtr) &html40EntitiesTable[i]); 1925: } 1926: } 1927: return(NULL); 1928: } 1929: 1930: /** 1931: * UTF8ToHtml: 1932: * @out: a pointer to an array of bytes to store the result 1933: * @outlen: the length of @out 1934: * @in: a pointer to an array of UTF-8 chars 1935: * @inlen: the length of @in 1936: * 1937: * Take a block of UTF-8 chars in and try to convert it to an ASCII 1938: * plus HTML entities block of chars out. 1939: * 1940: * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise 1941: * The value of @inlen after return is the number of octets consumed 1942: * as the return value is positive, else unpredictable. 1943: * The value of @outlen after return is the number of octets consumed. 
1944: */ 1945: int 1946: UTF8ToHtml(unsigned char* out, int *outlen, 1947: const unsigned char* in, int *inlen) { 1948: const unsigned char* processed = in; 1949: const unsigned char* outend; 1950: const unsigned char* outstart = out; 1951: const unsigned char* instart = in; 1952: const unsigned char* inend; 1953: unsigned int c, d; 1954: int trailing; 1955: 1956: if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1); 1957: if (in == NULL) { 1958: /* 1959: * initialization nothing to do 1960: */ 1961: *outlen = 0; 1962: *inlen = 0; 1963: return(0); 1964: } 1965: inend = in + (*inlen); 1966: outend = out + (*outlen); 1967: while (in < inend) { 1968: d = *in++; 1969: if (d < 0x80) { c= d; trailing= 0; } 1970: else if (d < 0xC0) { 1971: /* trailing byte in leading position */ 1972: *outlen = out - outstart; 1973: *inlen = processed - instart; 1974: return(-2); 1975: } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } 1976: else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } 1977: else if (d < 0xF8) { c= d & 0x07; trailing= 3; } 1978: else { 1979: /* no chance for this in Ascii */ 1980: *outlen = out - outstart; 1981: *inlen = processed - instart; 1982: return(-2); 1983: } 1984: 1985: if (inend - in < trailing) { 1986: break; 1987: } 1988: 1989: for ( ; trailing; trailing--) { 1990: if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) 1991: break; 1992: c <<= 6; 1993: c |= d & 0x3F; 1994: } 1995: 1996: /* assertion: c is a single UTF-4 value */ 1997: if (c < 0x80) { 1998: if (out + 1 >= outend) 1999: break; 2000: *out++ = c; 2001: } else { 2002: int len; 2003: const htmlEntityDesc * ent; 2004: const char *cp; 2005: char nbuf[16]; 2006: 2007: /* 2008: * Try to lookup a predefined HTML entity for it 2009: */ 2010: 2011: ent = htmlEntityValueLookup(c); 2012: if (ent == NULL) { 2013: snprintf(nbuf, sizeof(nbuf), "#%u", c); 2014: cp = nbuf; 2015: } 2016: else 2017: cp = ent->name; 2018: len = strlen(cp); 2019: if (out + 2 + len >= outend) 2020: break; 2021: *out++ 
= '&'; 2022: memcpy(out, cp, len); 2023: out += len; 2024: *out++ = ';'; 2025: } 2026: processed = in; 2027: } 2028: *outlen = out - outstart; 2029: *inlen = processed - instart; 2030: return(0); 2031: } 2032: 2033: /** 2034: * htmlEncodeEntities: 2035: * @out: a pointer to an array of bytes to store the result 2036: * @outlen: the length of @out 2037: * @in: a pointer to an array of UTF-8 chars 2038: * @inlen: the length of @in 2039: * @quoteChar: the quote character to escape (' or ") or zero. 2040: * 2041: * Take a block of UTF-8 chars in and try to convert it to an ASCII 2042: * plus HTML entities block of chars out. 2043: * 2044: * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise 2045: * The value of @inlen after return is the number of octets consumed 2046: * as the return value is positive, else unpredictable. 2047: * The value of @outlen after return is the number of octets consumed. 2048: */ 2049: int 2050: htmlEncodeEntities(unsigned char* out, int *outlen, 2051: const unsigned char* in, int *inlen, int quoteChar) { 2052: const unsigned char* processed = in; 2053: const unsigned char* outend; 2054: const unsigned char* outstart = out; 2055: const unsigned char* instart = in; 2056: const unsigned char* inend; 2057: unsigned int c, d; 2058: int trailing; 2059: 2060: if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) 2061: return(-1); 2062: outend = out + (*outlen); 2063: inend = in + (*inlen); 2064: while (in < inend) { 2065: d = *in++; 2066: if (d < 0x80) { c= d; trailing= 0; } 2067: else if (d < 0xC0) { 2068: /* trailing byte in leading position */ 2069: *outlen = out - outstart; 2070: *inlen = processed - instart; 2071: return(-2); 2072: } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } 2073: else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } 2074: else if (d < 0xF8) { c= d & 0x07; trailing= 3; } 2075: else { 2076: /* no chance for this in Ascii */ 2077: *outlen = out - outstart; 2078: *inlen = processed - instart; 
2079: return(-2); 2080: } 2081: 2082: if (inend - in < trailing) 2083: break; 2084: 2085: while (trailing--) { 2086: if (((d= *in++) & 0xC0) != 0x80) { 2087: *outlen = out - outstart; 2088: *inlen = processed - instart; 2089: return(-2); 2090: } 2091: c <<= 6; 2092: c |= d & 0x3F; 2093: } 2094: 2095: /* assertion: c is a single UTF-4 value */ 2096: if ((c < 0x80) && (c != (unsigned int) quoteChar) && 2097: (c != '&') && (c != '<') && (c != '>')) { 2098: if (out >= outend) 2099: break; 2100: *out++ = c; 2101: } else { 2102: const htmlEntityDesc * ent; 2103: const char *cp; 2104: char nbuf[16]; 2105: int len; 2106: 2107: /* 2108: * Try to lookup a predefined HTML entity for it 2109: */ 2110: ent = htmlEntityValueLookup(c); 2111: if (ent == NULL) { 2112: snprintf(nbuf, sizeof(nbuf), "#%u", c); 2113: cp = nbuf; 2114: } 2115: else 2116: cp = ent->name; 2117: len = strlen(cp); 2118: if (out + 2 + len > outend) 2119: break; 2120: *out++ = '&'; 2121: memcpy(out, cp, len); 2122: out += len; 2123: *out++ = ';'; 2124: } 2125: processed = in; 2126: } 2127: *outlen = out - outstart; 2128: *inlen = processed - instart; 2129: return(0); 2130: } 2131: 2132: /************************************************************************ 2133: * * 2134: * Commodity functions to handle streams * 2135: * * 2136: ************************************************************************/ 2137: 2138: /** 2139: * htmlNewInputStream: 2140: * @ctxt: an HTML parser context 2141: * 2142: * Create a new input stream structure 2143: * Returns the new input stream or NULL 2144: */ 2145: static htmlParserInputPtr 2146: htmlNewInputStream(htmlParserCtxtPtr ctxt) { 2147: htmlParserInputPtr input; 2148: 2149: input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput)); 2150: if (input == NULL) { 2151: htmlErrMemory(ctxt, "couldn't allocate a new input stream\n"); 2152: return(NULL); 2153: } 2154: memset(input, 0, sizeof(htmlParserInput)); 2155: input->filename = NULL; 2156: input->directory = NULL; 
2157: input->base = NULL; 2158: input->cur = NULL; 2159: input->buf = NULL; 2160: input->line = 1; 2161: input->col = 1; 2162: input->buf = NULL; 2163: input->free = NULL; 2164: input->version = NULL; 2165: input->consumed = 0; 2166: input->length = 0; 2167: return(input); 2168: } 2169: 2170: 2171: /************************************************************************ 2172: * * 2173: * Commodity functions, cleanup needed ? * 2174: * * 2175: ************************************************************************/ 2176: /* 2177: * all tags allowing pc data from the html 4.01 loose dtd 2178: * NOTE: it might be more apropriate to integrate this information 2179: * into the html40ElementTable array but I don't want to risk any 2180: * binary incomptibility 2181: */ 2182: static const char *allowPCData[] = { 2183: "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big", 2184: "blockquote", "body", "button", "caption", "center", "cite", "code", 2185: "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2", 2186: "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend", 2187: "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp", 2188: "small", "span", "strike", "strong", "td", "th", "tt", "u", "var" 2189: }; 2190: 2191: /** 2192: * areBlanks: 2193: * @ctxt: an HTML parser context 2194: * @str: a xmlChar * 2195: * @len: the size of @str 2196: * 2197: * Is this a sequence of blank chars that one can ignore ? 2198: * 2199: * Returns 1 if ignorable 0 otherwise. 
2200: */ 2201: 2202: static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) { 2203: unsigned int i; 2204: int j; 2205: xmlNodePtr lastChild; 2206: xmlDtdPtr dtd; 2207: 2208: for (j = 0;j < len;j++) 2209: if (!(IS_BLANK_CH(str[j]))) return(0); 2210: 2211: if (CUR == 0) return(1); 2212: if (CUR != '<') return(0); 2213: if (ctxt->name == NULL) 2214: return(1); 2215: if (xmlStrEqual(ctxt->name, BAD_CAST"html")) 2216: return(1); 2217: if (xmlStrEqual(ctxt->name, BAD_CAST"head")) 2218: return(1); 2219: 2220: /* Only strip CDATA children of the body tag for strict HTML DTDs */ 2221: if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) { 2222: dtd = xmlGetIntSubset(ctxt->myDoc); 2223: if (dtd != NULL && dtd->ExternalID != NULL) { 2224: if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") || 2225: !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN")) 2226: return(1); 2227: } 2228: } 2229: 2230: if (ctxt->node == NULL) return(0); 2231: lastChild = xmlGetLastChild(ctxt->node); 2232: while ((lastChild) && (lastChild->type == XML_COMMENT_NODE)) 2233: lastChild = lastChild->prev; 2234: if (lastChild == NULL) { 2235: if ((ctxt->node->type != XML_ELEMENT_NODE) && 2236: (ctxt->node->content != NULL)) return(0); 2237: /* keep ws in constructs like ... ... 
2238: for all tags "b" allowing PCDATA */ 2239: for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) { 2240: if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) { 2241: return(0); 2242: } 2243: } 2244: } else if (xmlNodeIsText(lastChild)) { 2245: return(0); 2246: } else { 2247: /* keep ws in constructs like xy z 2248: for all tags "p" allowing PCDATA */ 2249: for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) { 2250: if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) { 2251: return(0); 2252: } 2253: } 2254: } 2255: return(1); 2256: } 2257: 2258: /** 2259: * htmlNewDocNoDtD: 2260: * @URI: URI for the dtd, or NULL 2261: * @ExternalID: the external ID of the DTD, or NULL 2262: * 2263: * Creates a new HTML document without a DTD node if @URI and @ExternalID 2264: * are NULL 2265: * 2266: * Returns a new document, do not initialize the DTD if not provided 2267: */ 2268: htmlDocPtr 2269: htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) { 2270: xmlDocPtr cur; 2271: 2272: /* 2273: * Allocate a new document and fill the fields. 
2274: */ 2275: cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc)); 2276: if (cur == NULL) { 2277: htmlErrMemory(NULL, "HTML document creation failed\n"); 2278: return(NULL); 2279: } 2280: memset(cur, 0, sizeof(xmlDoc)); 2281: 2282: cur->type = XML_HTML_DOCUMENT_NODE; 2283: cur->version = NULL; 2284: cur->intSubset = NULL; 2285: cur->doc = cur; 2286: cur->name = NULL; 2287: cur->children = NULL; 2288: cur->extSubset = NULL; 2289: cur->oldNs = NULL; 2290: cur->encoding = NULL; 2291: cur->standalone = 1; 2292: cur->compression = 0; 2293: cur->ids = NULL; 2294: cur->refs = NULL; 2295: cur->_private = NULL; 2296: cur->charset = XML_CHAR_ENCODING_UTF8; 2297: cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT; 2298: if ((ExternalID != NULL) || 2299: (URI != NULL)) 2300: xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI); 2301: return(cur); 2302: } 2303: 2304: /** 2305: * htmlNewDoc: 2306: * @URI: URI for the dtd, or NULL 2307: * @ExternalID: the external ID of the DTD, or NULL 2308: * 2309: * Creates a new HTML document 2310: * 2311: * Returns a new document 2312: */ 2313: htmlDocPtr 2314: htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) { 2315: if ((URI == NULL) && (ExternalID == NULL)) 2316: return(htmlNewDocNoDtD( 2317: BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd", 2318: BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN")); 2319: 2320: return(htmlNewDocNoDtD(URI, ExternalID)); 2321: } 2322: 2323: 2324: /************************************************************************ 2325: * * 2326: * The parser itself * 2327: * Relates to http://www.w3.org/TR/html40 * 2328: * * 2329: ************************************************************************/ 2330: 2331: /************************************************************************ 2332: * * 2333: * The parser itself * 2334: * * 2335: ************************************************************************/ 2336: 2337: static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt); 2338: 2339: /** 
2340: * htmlParseHTMLName: 2341: * @ctxt: an HTML parser context 2342: * 2343: * parse an HTML tag or attribute name, note that we convert it to lowercase 2344: * since HTML names are not case-sensitive. 2345: * 2346: * Returns the Tag Name parsed or NULL 2347: */ 2348: 2349: static const xmlChar * 2350: htmlParseHTMLName(htmlParserCtxtPtr ctxt) { 2351: int i = 0; 2352: xmlChar loc[HTML_PARSER_BUFFER_SIZE]; 2353: 2354: if (!IS_ASCII_LETTER(CUR) && (CUR != '_') && 2355: (CUR != ':') && (CUR != '.')) return(NULL); 2356: 2357: while ((i < HTML_PARSER_BUFFER_SIZE) && 2358: ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) || 2359: (CUR == ':') || (CUR == '-') || (CUR == '_') || 2360: (CUR == '.'))) { 2361: if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20; 2362: else loc[i] = CUR; 2363: i++; 2364: 2365: NEXT; 2366: } 2367: 2368: return(xmlDictLookup(ctxt->dict, loc, i)); 2369: } 2370: 2371: 2372: /** 2373: * htmlParseHTMLName_nonInvasive: 2374: * @ctxt: an HTML parser context 2375: * 2376: * parse an HTML tag or attribute name, note that we convert it to lowercase 2377: * since HTML names are not case-sensitive, this doesn't consume the data 2378: * from the stream, it's a look-ahead 2379: * 2380: * Returns the Tag Name parsed or NULL 2381: */ 2382: 2383: static const xmlChar * 2384: htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) { 2385: int i = 0; 2386: xmlChar loc[HTML_PARSER_BUFFER_SIZE]; 2387: 2388: if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') && 2389: (NXT(1) != ':')) return(NULL); 2390: 2391: while ((i < HTML_PARSER_BUFFER_SIZE) && 2392: ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) || 2393: (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) { 2394: if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20; 2395: else loc[i] = NXT(1+i); 2396: i++; 2397: } 2398: 2399: return(xmlDictLookup(ctxt->dict, loc, i)); 2400: } 2401: 2402: 2403: /** 2404: * htmlParseName: 2405: * @ctxt: an HTML parser context 2406: * 2407: * 
parse an HTML name, this routine is case sensitive. 2408: * 2409: * Returns the Name parsed or NULL 2410: */ 2411: 2412: static const xmlChar * 2413: htmlParseName(htmlParserCtxtPtr ctxt) { 2414: const xmlChar *in; 2415: const xmlChar *ret; 2416: int count = 0; 2417: 2418: GROW; 2419: 2420: /* 2421: * Accelerator for simple ASCII names 2422: */ 2423: in = ctxt->input->cur; 2424: if (((*in >= 0x61) && (*in <= 0x7A)) || 2425: ((*in >= 0x41) && (*in <= 0x5A)) || 2426: (*in == '_') || (*in == ':')) { 2427: in++; 2428: while (((*in >= 0x61) && (*in <= 0x7A)) || 2429: ((*in >= 0x41) && (*in <= 0x5A)) || 2430: ((*in >= 0x30) && (*in <= 0x39)) || 2431: (*in == '_') || (*in == '-') || 2432: (*in == ':') || (*in == '.')) 2433: in++; 2434: if ((*in > 0) && (*in < 0x80)) { 2435: count = in - ctxt->input->cur; 2436: ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count); 2437: ctxt->input->cur = in; 2438: ctxt->nbChars += count; 2439: ctxt->input->col += count; 2440: return(ret); 2441: } 2442: } 2443: return(htmlParseNameComplex(ctxt)); 2444: } 2445: 2446: static const xmlChar * 2447: htmlParseNameComplex(xmlParserCtxtPtr ctxt) { 2448: int len = 0, l; 2449: int c; 2450: int count = 0; 2451: 2452: /* 2453: * Handler for more complex cases 2454: */ 2455: GROW; 2456: c = CUR_CHAR(l); 2457: if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */ 2458: (!IS_LETTER(c) && (c != '_') && 2459: (c != ':'))) { 2460: return(NULL); 2461: } 2462: 2463: while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */ 2464: ((IS_LETTER(c)) || (IS_DIGIT(c)) || 2465: (c == '.') || (c == '-') || 2466: (c == '_') || (c == ':') || 2467: (IS_COMBINING(c)) || 2468: (IS_EXTENDER(c)))) { 2469: if (count++ > 100) { 2470: count = 0; 2471: GROW; 2472: } 2473: len += l; 2474: NEXTL(l); 2475: c = CUR_CHAR(l); 2476: } 2477: return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len)); 2478: } 2479: 2480: 2481: /** 2482: * htmlParseHTMLAttribute: 2483: * @ctxt: an HTML parser context 2484: 
* @stop: a char stop value 2485: * 2486: * parse an HTML attribute value till the stop (quote), if 2487: * stop is 0 then it stops at the first space 2488: * 2489: * Returns the attribute parsed or NULL 2490: */ 2491: 2492: static xmlChar * 2493: htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) { 2494: xmlChar *buffer = NULL; 2495: int buffer_size = 0; 2496: xmlChar *out = NULL; 2497: const xmlChar *name = NULL; 2498: const xmlChar *cur = NULL; 2499: const htmlEntityDesc * ent; 2500: 2501: /* 2502: * allocate a translation buffer. 2503: */ 2504: buffer_size = HTML_PARSER_BUFFER_SIZE; 2505: buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar)); 2506: if (buffer == NULL) { 2507: htmlErrMemory(ctxt, "buffer allocation failed\n"); 2508: return(NULL); 2509: } 2510: out = buffer; 2511: 2512: /* 2513: * Ok loop until we reach one of the ending chars 2514: */ 2515: while ((CUR != 0) && (CUR != stop)) { 2516: if ((stop == 0) && (CUR == '>')) break; 2517: if ((stop == 0) && (IS_BLANK_CH(CUR))) break; 2518: if (CUR == '&') { 2519: if (NXT(1) == '#') { 2520: unsigned int c; 2521: int bits; 2522: 2523: c = htmlParseCharRef(ctxt); 2524: if (c < 0x80) 2525: { *out++ = c; bits= -6; } 2526: else if (c < 0x800) 2527: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2528: else if (c < 0x10000) 2529: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2530: else 2531: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2532: 2533: for ( ; bits >= 0; bits-= 6) { 2534: *out++ = ((c >> bits) & 0x3F) | 0x80; 2535: } 2536: 2537: if (out - buffer > buffer_size - 100) { 2538: int indx = out - buffer; 2539: 2540: growBuffer(buffer); 2541: out = &buffer[indx]; 2542: } 2543: } else { 2544: ent = htmlParseEntityRef(ctxt, &name); 2545: if (name == NULL) { 2546: *out++ = '&'; 2547: if (out - buffer > buffer_size - 100) { 2548: int indx = out - buffer; 2549: 2550: growBuffer(buffer); 2551: out = &buffer[indx]; 2552: } 2553: } else if (ent == NULL) { 2554: *out++ = '&'; 2555: 
cur = name; 2556: while (*cur != 0) { 2557: if (out - buffer > buffer_size - 100) { 2558: int indx = out - buffer; 2559: 2560: growBuffer(buffer); 2561: out = &buffer[indx]; 2562: } 2563: *out++ = *cur++; 2564: } 2565: } else { 2566: unsigned int c; 2567: int bits; 2568: 2569: if (out - buffer > buffer_size - 100) { 2570: int indx = out - buffer; 2571: 2572: growBuffer(buffer); 2573: out = &buffer[indx]; 2574: } 2575: c = ent->value; 2576: if (c < 0x80) 2577: { *out++ = c; bits= -6; } 2578: else if (c < 0x800) 2579: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2580: else if (c < 0x10000) 2581: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2582: else 2583: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2584: 2585: for ( ; bits >= 0; bits-= 6) { 2586: *out++ = ((c >> bits) & 0x3F) | 0x80; 2587: } 2588: } 2589: } 2590: } else { 2591: unsigned int c; 2592: int bits, l; 2593: 2594: if (out - buffer > buffer_size - 100) { 2595: int indx = out - buffer; 2596: 2597: growBuffer(buffer); 2598: out = &buffer[indx]; 2599: } 2600: c = CUR_CHAR(l); 2601: if (c < 0x80) 2602: { *out++ = c; bits= -6; } 2603: else if (c < 0x800) 2604: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2605: else if (c < 0x10000) 2606: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2607: else 2608: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2609: 2610: for ( ; bits >= 0; bits-= 6) { 2611: *out++ = ((c >> bits) & 0x3F) | 0x80; 2612: } 2613: NEXT; 2614: } 2615: } 2616: *out = 0; 2617: return(buffer); 2618: } 2619: 2620: /** 2621: * htmlParseEntityRef: 2622: * @ctxt: an HTML parser context 2623: * @str: location to store the entity name 2624: * 2625: * parse an HTML ENTITY references 2626: * 2627: * [68] EntityRef ::= '&' Name ';' 2628: * 2629: * Returns the associated htmlEntityDescPtr if found, or NULL otherwise, 2630: * if non-NULL *str will have to be freed by the caller. 
2631: */ 2632: const htmlEntityDesc * 2633: htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) { 2634: const xmlChar *name; 2635: const htmlEntityDesc * ent = NULL; 2636: 2637: if (str != NULL) *str = NULL; 2638: if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL); 2639: 2640: if (CUR == '&') { 2641: NEXT; 2642: name = htmlParseName(ctxt); 2643: if (name == NULL) { 2644: htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 2645: "htmlParseEntityRef: no name\n", NULL, NULL); 2646: } else { 2647: GROW; 2648: if (CUR == ';') { 2649: if (str != NULL) 2650: *str = name; 2651: 2652: /* 2653: * Lookup the entity in the table. 2654: */ 2655: ent = htmlEntityLookup(name); 2656: if (ent != NULL) /* OK that's ugly !!! */ 2657: NEXT; 2658: } else { 2659: htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING, 2660: "htmlParseEntityRef: expecting ';'\n", 2661: NULL, NULL); 2662: if (str != NULL) 2663: *str = name; 2664: } 2665: } 2666: } 2667: return(ent); 2668: } 2669: 2670: /** 2671: * htmlParseAttValue: 2672: * @ctxt: an HTML parser context 2673: * 2674: * parse a value for an attribute 2675: * Note: the parser won't do substitution of entities here, this 2676: * will be handled later in xmlStringGetNodeList, unless it was 2677: * asked for ctxt->replaceEntities != 0 2678: * 2679: * Returns the AttValue parsed or NULL. 
2680: */ 2681: 2682: static xmlChar * 2683: htmlParseAttValue(htmlParserCtxtPtr ctxt) { 2684: xmlChar *ret = NULL; 2685: 2686: if (CUR == '"') { 2687: NEXT; 2688: ret = htmlParseHTMLAttribute(ctxt, '"'); 2689: if (CUR != '"') { 2690: htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, 2691: "AttValue: \" expected\n", NULL, NULL); 2692: } else 2693: NEXT; 2694: } else if (CUR == '\'') { 2695: NEXT; 2696: ret = htmlParseHTMLAttribute(ctxt, '\''); 2697: if (CUR != '\'') { 2698: htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, 2699: "AttValue: ' expected\n", NULL, NULL); 2700: } else 2701: NEXT; 2702: } else { 2703: /* 2704: * That's an HTMLism, the attribute value may not be quoted 2705: */ 2706: ret = htmlParseHTMLAttribute(ctxt, 0); 2707: if (ret == NULL) { 2708: htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE, 2709: "AttValue: no value found\n", NULL, NULL); 2710: } 2711: } 2712: return(ret); 2713: } 2714: 2715: /** 2716: * htmlParseSystemLiteral: 2717: * @ctxt: an HTML parser context 2718: * 2719: * parse an HTML Literal 2720: * 2721: * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") 2722: * 2723: * Returns the SystemLiteral parsed or NULL 2724: */ 2725: 2726: static xmlChar * 2727: htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) { 2728: const xmlChar *q; 2729: xmlChar *ret = NULL; 2730: 2731: if (CUR == '"') { 2732: NEXT; 2733: q = CUR_PTR; 2734: while ((IS_CHAR_CH(CUR)) && (CUR != '"')) 2735: NEXT; 2736: if (!IS_CHAR_CH(CUR)) { 2737: htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2738: "Unfinished SystemLiteral\n", NULL, NULL); 2739: } else { 2740: ret = xmlStrndup(q, CUR_PTR - q); 2741: NEXT; 2742: } 2743: } else if (CUR == '\'') { 2744: NEXT; 2745: q = CUR_PTR; 2746: while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) 2747: NEXT; 2748: if (!IS_CHAR_CH(CUR)) { 2749: htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2750: "Unfinished SystemLiteral\n", NULL, NULL); 2751: } else { 2752: ret = xmlStrndup(q, CUR_PTR - q); 2753: NEXT; 2754: } 2755: } else { 2756: 
htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, 2757: " or ' expected\n", NULL, NULL); 2758: } 2759: 2760: return(ret); 2761: } 2762: 2763: /** 2764: * htmlParsePubidLiteral: 2765: * @ctxt: an HTML parser context 2766: * 2767: * parse an HTML public literal 2768: * 2769: * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" 2770: * 2771: * Returns the PubidLiteral parsed or NULL. 2772: */ 2773: 2774: static xmlChar * 2775: htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) { 2776: const xmlChar *q; 2777: xmlChar *ret = NULL; 2778: /* 2779: * Name ::= (Letter | '_') (NameChar)* 2780: */ 2781: if (CUR == '"') { 2782: NEXT; 2783: q = CUR_PTR; 2784: while (IS_PUBIDCHAR_CH(CUR)) NEXT; 2785: if (CUR != '"') { 2786: htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2787: "Unfinished PubidLiteral\n", NULL, NULL); 2788: } else { 2789: ret = xmlStrndup(q, CUR_PTR - q); 2790: NEXT; 2791: } 2792: } else if (CUR == '\'') { 2793: NEXT; 2794: q = CUR_PTR; 2795: while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')) 2796: NEXT; 2797: if (CUR != '\'') { 2798: htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2799: "Unfinished PubidLiteral\n", NULL, NULL); 2800: } else { 2801: ret = xmlStrndup(q, CUR_PTR - q); 2802: NEXT; 2803: } 2804: } else { 2805: htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, 2806: "PubidLiteral \" or ' expected\n", NULL, NULL); 2807: } 2808: 2809: return(ret); 2810: } 2811: 2812: /** 2813: * htmlParseScript: 2814: * @ctxt: an HTML parser context 2815: * 2816: * parse the content of an HTML SCRIPT or STYLE element 2817: * http://www.w3.org/TR/html4/sgml/dtd.html#Script 2818: * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet 2819: * http://www.w3.org/TR/html4/types.html#type-script 2820: * http://www.w3.org/TR/html4/types.html#h-6.15 2821: * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1 2822: * 2823: * Script data ( %Script; in the DTD) can be the content of the SCRIPT 2824: * element and the value of intrinsic event attributes. 
User agents must 2825: * not evaluate script data as HTML markup but instead must pass it on as 2826: * data to a script engine. 2827: * NOTES: 2828: * - The content is passed like CDATA 2829: * - the attributes for style and scripting "onXXX" are also described 2830: * as CDATA but SGML allows entities references in attributes so their 2831: * processing is identical as other attributes 2832: */ 2833: static void 2834: htmlParseScript(htmlParserCtxtPtr ctxt) { 2835: xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; 2836: int nbchar = 0; 2837: int cur,l; 2838: 2839: SHRINK; 2840: cur = CUR_CHAR(l); 2841: while (IS_CHAR_CH(cur)) { 2842: if ((cur == '<') && (NXT(1) == '/')) { 2843: /* 2844: * One should break here, the specification is clear: 2845: * Authors should therefore escape "</" within the content. 2846: * Escape mechanisms are specific to each scripting or 2847: * style sheet language. 2848: * 2849: * In recovery mode, only break if end tag match the 2850: * current tag, effectively ignoring all tags inside the 2851: * script/style block and treating the entire block as 2852: * CDATA. 
2853: */ 2854: if (ctxt->recovery) { 2855: if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2, 2856: xmlStrlen(ctxt->name)) == 0) 2857: { 2858: break; /* while */ 2859: } else { 2860: htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 2861: "Element %s embeds close tag\n", 2862: ctxt->name, NULL); 2863: } 2864: } else { 2865: if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) || 2866: ((NXT(2) >= 'a') && (NXT(2) <= 'z'))) 2867: { 2868: break; /* while */ 2869: } 2870: } 2871: } 2872: COPY_BUF(l,buf,nbchar,cur); 2873: if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { 2874: if (ctxt->sax->cdataBlock!= NULL) { 2875: /* 2876: * Insert as CDATA, which is the same as HTML_PRESERVE_NODE 2877: */ 2878: ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); 2879: } else if (ctxt->sax->characters != NULL) { 2880: ctxt->sax->characters(ctxt->userData, buf, nbchar); 2881: } 2882: nbchar = 0; 2883: } 2884: GROW; 2885: NEXTL(l); 2886: cur = CUR_CHAR(l); 2887: } 2888: 2889: if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) { 2890: htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 2891: "Invalid char in CDATA 0x%X\n", cur); 2892: if (ctxt->input->cur < ctxt->input->end) { 2893: NEXT; 2894: } 2895: } 2896: 2897: if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) { 2898: if (ctxt->sax->cdataBlock!= NULL) { 2899: /* 2900: * Insert as CDATA, which is the same as HTML_PRESERVE_NODE 2901: */ 2902: ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); 2903: } else if (ctxt->sax->characters != NULL) { 2904: ctxt->sax->characters(ctxt->userData, buf, nbchar); 2905: } 2906: } 2907: } 2908: 2909: 2910: /** 2911: * htmlParseCharData: 2912: * @ctxt: an HTML parser context 2913: * 2914: * parse a CharData section. 2915: * if we are within a CDATA section ']]>' marks an end of section. 
2916: * 2917: * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) 2918: */ 2919: 2920: static void 2921: htmlParseCharData(htmlParserCtxtPtr ctxt) { 2922: xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; 2923: int nbchar = 0; 2924: int cur, l; 2925: int chunk = 0; 2926: 2927: SHRINK; 2928: cur = CUR_CHAR(l); 2929: while (((cur != '<') || (ctxt->token == '<')) && 2930: ((cur != '&') || (ctxt->token == '&')) && 2931: (cur != 0)) { 2932: if (!(IS_CHAR(cur))) { 2933: htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 2934: "Invalid char in CDATA 0x%X\n", cur); 2935: } else { 2936: COPY_BUF(l,buf,nbchar,cur); 2937: } 2938: if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { 2939: /* 2940: * Ok the segment is to be consumed as chars. 2941: */ 2942: if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { 2943: if (areBlanks(ctxt, buf, nbchar)) { 2944: if (ctxt->sax->ignorableWhitespace != NULL) 2945: ctxt->sax->ignorableWhitespace(ctxt->userData, 2946: buf, nbchar); 2947: } else { 2948: htmlCheckParagraph(ctxt); 2949: if (ctxt->sax->characters != NULL) 2950: ctxt->sax->characters(ctxt->userData, buf, nbchar); 2951: } 2952: } 2953: nbchar = 0; 2954: } 2955: NEXTL(l); 2956: chunk++; 2957: if (chunk > HTML_PARSER_BUFFER_SIZE) { 2958: chunk = 0; 2959: SHRINK; 2960: GROW; 2961: } 2962: cur = CUR_CHAR(l); 2963: if (cur == 0) { 2964: SHRINK; 2965: GROW; 2966: cur = CUR_CHAR(l); 2967: } 2968: } 2969: if (nbchar != 0) { 2970: buf[nbchar] = 0; 2971: 2972: /* 2973: * Ok the segment is to be consumed as chars. 
2974: */ 2975: if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { 2976: if (areBlanks(ctxt, buf, nbchar)) { 2977: if (ctxt->sax->ignorableWhitespace != NULL) 2978: ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar); 2979: } else { 2980: htmlCheckParagraph(ctxt); 2981: if (ctxt->sax->characters != NULL) 2982: ctxt->sax->characters(ctxt->userData, buf, nbchar); 2983: } 2984: } 2985: } else { 2986: /* 2987: * Loop detection 2988: */ 2989: if (cur == 0) 2990: ctxt->instate = XML_PARSER_EOF; 2991: } 2992: } 2993: 2994: /** 2995: * htmlParseExternalID: 2996: * @ctxt: an HTML parser context 2997: * @publicID: a xmlChar** receiving PubidLiteral 2998: * 2999: * Parse an External ID or a Public ID 3000: * 3001: * [75] ExternalID ::= 'SYSTEM' S SystemLiteral 3002: * | 'PUBLIC' S PubidLiteral S SystemLiteral 3003: * 3004: * [83] PublicID ::= 'PUBLIC' S PubidLiteral 3005: * 3006: * Returns the function returns SystemLiteral and in the second 3007: * case publicID receives PubidLiteral, is strict is off 3008: * it is possible to return NULL and have publicID set. 
3009: */ 3010: 3011: static xmlChar * 3012: htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) { 3013: xmlChar *URI = NULL; 3014: 3015: if ((UPPER == 'S') && (UPP(1) == 'Y') && 3016: (UPP(2) == 'S') && (UPP(3) == 'T') && 3017: (UPP(4) == 'E') && (UPP(5) == 'M')) { 3018: SKIP(6); 3019: if (!IS_BLANK_CH(CUR)) { 3020: htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 3021: "Space required after 'SYSTEM'\n", NULL, NULL); 3022: } 3023: SKIP_BLANKS; 3024: URI = htmlParseSystemLiteral(ctxt); 3025: if (URI == NULL) { 3026: htmlParseErr(ctxt, XML_ERR_URI_REQUIRED, 3027: "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL); 3028: } 3029: } else if ((UPPER == 'P') && (UPP(1) == 'U') && 3030: (UPP(2) == 'B') && (UPP(3) == 'L') && 3031: (UPP(4) == 'I') && (UPP(5) == 'C')) { 3032: SKIP(6); 3033: if (!IS_BLANK_CH(CUR)) { 3034: htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 3035: "Space required after 'PUBLIC'\n", NULL, NULL); 3036: } 3037: SKIP_BLANKS; 3038: *publicID = htmlParsePubidLiteral(ctxt); 3039: if (*publicID == NULL) { 3040: htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED, 3041: "htmlParseExternalID: PUBLIC, no Public Identifier\n", 3042: NULL, NULL); 3043: } 3044: SKIP_BLANKS; 3045: if ((CUR == '"') || (CUR == '\'')) { 3046: URI = htmlParseSystemLiteral(ctxt); 3047: } 3048: } 3049: return(URI); 3050: } 3051: 3052: /** 3053: * xmlParsePI: 3054: * @ctxt: an XML parser context 3055: * 3056: * parse an XML Processing Instruction. 3057: * 3058: * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>' 3059: */ 3060: static void 3061: htmlParsePI(htmlParserCtxtPtr ctxt) { 3062: xmlChar *buf = NULL; 3063: int len = 0; 3064: int size = HTML_PARSER_BUFFER_SIZE; 3065: int cur, l; 3066: const xmlChar *target; 3067: xmlParserInputState state; 3068: int count = 0; 3069: 3070: if ((RAW == '<') && (NXT(1) == '?')) { 3071: state = ctxt->instate; 3072: ctxt->instate = XML_PARSER_PI; 3073: /* 3074: * this is a Processing Instruction. 
3075: */ 3076: SKIP(2); 3077: SHRINK; 3078: 3079: /* 3080: * Parse the target name and check for special support like 3081: * namespace. 3082: */ 3083: target = htmlParseName(ctxt); 3084: if (target != NULL) { 3085: if (RAW == '>') { 3086: SKIP(1); 3087: 3088: /* 3089: * SAX: PI detected. 3090: */ 3091: if ((ctxt->sax) && (!ctxt->disableSAX) && 3092: (ctxt->sax->processingInstruction != NULL)) 3093: ctxt->sax->processingInstruction(ctxt->userData, 3094: target, NULL); 3095: ctxt->instate = state; 3096: return; 3097: } 3098: buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar)); 3099: if (buf == NULL) { 3100: htmlErrMemory(ctxt, NULL); 3101: ctxt->instate = state; 3102: return; 3103: } 3104: cur = CUR; 3105: if (!IS_BLANK(cur)) { 3106: htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 3107: "ParsePI: PI %s space expected\n", target, NULL); 3108: } 3109: SKIP_BLANKS; 3110: cur = CUR_CHAR(l); 3111: while (IS_CHAR(cur) && (cur != '>')) { 3112: if (len + 5 >= size) { 3113: xmlChar *tmp; 3114: 3115: size *= 2; 3116: tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); 3117: if (tmp == NULL) { 3118: htmlErrMemory(ctxt, NULL); 3119: xmlFree(buf); 3120: ctxt->instate = state; 3121: return; 3122: } 3123: buf = tmp; 3124: } 3125: count++; 3126: if (count > 50) { 3127: GROW; 3128: count = 0; 3129: } 3130: COPY_BUF(l,buf,len,cur); 3131: NEXTL(l); 3132: cur = CUR_CHAR(l); 3133: if (cur == 0) { 3134: SHRINK; 3135: GROW; 3136: cur = CUR_CHAR(l); 3137: } 3138: } 3139: buf[len] = 0; 3140: if (cur != '>') { 3141: htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED, 3142: "ParsePI: PI %s never end ...\n", target, NULL); 3143: } else { 3144: SKIP(1); 3145: 3146: /* 3147: * SAX: PI detected. 
3148: */ 3149: if ((ctxt->sax) && (!ctxt->disableSAX) && 3150: (ctxt->sax->processingInstruction != NULL)) 3151: ctxt->sax->processingInstruction(ctxt->userData, 3152: target, buf); 3153: } 3154: xmlFree(buf); 3155: } else { 3156: htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED, 3157: "PI is not started correctly", NULL, NULL); 3158: } 3159: ctxt->instate = state; 3160: } 3161: } 3162: 3163: /** 3164: * htmlParseComment: 3165: * @ctxt: an HTML parser context 3166: * 3167: * Parse an XML (SGML) comment  3168: * 3169: * [15] Comment ::= '' 3170: */ 3171: static void 3172: htmlParseComment(htmlParserCtxtPtr ctxt) { 3173: xmlChar *buf = NULL; 3174: int len; 3175: int size = HTML_PARSER_BUFFER_SIZE; 3176: int q, ql; 3177: int r, rl; 3178: int cur, l; 3179: xmlParserInputState state; 3180: 3181: /* 3182: * Check that there is a comment right here. 3183: */ 3184: if ((RAW != '<') || (NXT(1) != '!') || 3185: (NXT(2) != '-') || (NXT(3) != '-')) return; 3186: 3187: state = ctxt->instate; 3188: ctxt->instate = XML_PARSER_COMMENT; 3189: SHRINK; 3190: SKIP(4); 3191: buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar)); 3192: if (buf == NULL) { 3193: htmlErrMemory(ctxt, "buffer allocation failed\n"); 3194: ctxt->instate = state; 3195: return; 3196: } 3197: q = CUR_CHAR(ql); 3198: NEXTL(ql); 3199: r = CUR_CHAR(rl); 3200: NEXTL(rl); 3201: cur = CUR_CHAR(l); 3202: len = 0; 3203: while (IS_CHAR(cur) && 3204: ((cur != '>') || 3205: (r != '-') || (q != '-'))) { 3206: if (len + 5 >= size) { 3207: xmlChar *tmp; 3208: 3209: size *= 2; 3210: tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); 3211: if (tmp == NULL) { 3212: xmlFree(buf); 3213: htmlErrMemory(ctxt, "growing buffer failed\n"); 3214: ctxt->instate = state; 3215: return; 3216: } 3217: buf = tmp; 3218: } 3219: COPY_BUF(ql,buf,len,q); 3220: q = r; 3221: ql = rl; 3222: r = cur; 3223: rl = l; 3224: NEXTL(l); 3225: cur = CUR_CHAR(l); 3226: if (cur == 0) { 3227: SHRINK; 3228: GROW; 3229: cur = CUR_CHAR(l); 3230: } 3231: } 3232: 
buf[len] = 0; 3233: if (!IS_CHAR(cur)) { 3234: htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED, 3235: "Comment not terminated \n */ 5014: base += 2; 5015: } 5016: } 5017: if (ignoreattrval) { 5018: if (buf[base] == '"' || buf[base] == '\'') { 5019: if (invalue) { 5020: if (buf[base] == valdellim) { 5021: invalue = 0; 5022: continue; 5023: } 5024: } else { 5025: valdellim = buf[base]; 5026: invalue = 1; 5027: continue; 5028: } 5029: } else if (invalue) { 5030: continue; 5031: } 5032: } 5033: if (incomment) { 5034: if (base + 3 > len) 5035: return (-1); 5036: if ((buf[base] == '-') && (buf[base + 1] == '-') && 5037: (buf[base + 2] == '>')) { 5038: incomment = 0; 5039: base += 2; 5040: } 5041: continue; 5042: } 5043: if (buf[base] == first) { 5044: if (third != 0) { 5045: if ((buf[base + 1] != next) || (buf[base + 2] != third)) 5046: continue; 5047: } else if (next != 0) { 5048: if (buf[base + 1] != next) 5049: continue; 5050: } 5051: ctxt->checkIndex = 0; 5052: #ifdef DEBUG_PUSH 5053: if (next == 0) 5054: xmlGenericError(xmlGenericErrorContext, 5055: "HPP: lookup '%c' found at %d\n", 5056: first, base); 5057: else if (third == 0) 5058: xmlGenericError(xmlGenericErrorContext, 5059: "HPP: lookup '%c%c' found at %d\n", 5060: first, next, base); 5061: else 5062: xmlGenericError(xmlGenericErrorContext, 5063: "HPP: lookup '%c%c%c' found at %d\n", 5064: first, next, third, base); 5065: #endif 5066: return (base - (in->cur - in->base)); 5067: } 5068: } 5069: if ((!incomment) && (!invalue)) 5070: ctxt->checkIndex = base; 5071: #ifdef DEBUG_PUSH 5072: if (next == 0) 5073: xmlGenericError(xmlGenericErrorContext, 5074: "HPP: lookup '%c' failed\n", first); 5075: else if (third == 0) 5076: xmlGenericError(xmlGenericErrorContext, 5077: "HPP: lookup '%c%c' failed\n", first, next); 5078: else 5079: xmlGenericError(xmlGenericErrorContext, 5080: "HPP: lookup '%c%c%c' failed\n", first, next, 5081: third); 5082: #endif 5083: return (-1); 5084: } 5085: 5086: /** 5087: * 
htmlParseLookupChars: 5088: * @ctxt: an HTML parser context 5089: * @stop: Array of chars, which stop the lookup. 5090: * @stopLen: Length of stop-Array 5091: * 5092: * Try to find if any char of the stop-Array is available in the input 5093: * stream. 5094: * This function has a side effect of (possibly) incrementing ctxt->checkIndex 5095: * to avoid rescanning sequences of bytes, it DOES change the state of the 5096: * parser, do not use liberally. 5097: * 5098: * Returns the index to the current parsing point if a stopChar 5099: * is available, -1 otherwise. 5100: */ 5101: static int 5102: htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop, 5103: int stopLen) 5104: { 5105: int base, len; 5106: htmlParserInputPtr in; 5107: const xmlChar *buf; 5108: int incomment = 0; 5109: int i; 5110: 5111: in = ctxt->input; 5112: if (in == NULL) 5113: return (-1); 5114: 5115: base = in->cur - in->base; 5116: if (base < 0) 5117: return (-1); 5118: 5119: if (ctxt->checkIndex > base) 5120: base = ctxt->checkIndex; 5121: 5122: if (in->buf == NULL) { 5123: buf = in->base; 5124: len = in->length; 5125: } else { 5126: buf = in->buf->buffer->content; 5127: len = in->buf->buffer->use; 5128: } 5129: 5130: for (; base < len; base++) { 5131: if (!incomment && (base + 4 < len)) { 5132: if ((buf[base] == '<') && (buf[base + 1] == '!') && 5133: (buf[base + 2] == '-') && (buf[base + 3] == '-')) { 5134: incomment = 1; 5135: /* do not increment past <! 
- some people use <!--> */ 5136: base += 2; 5137: } 5138: } 5139: if (incomment) { 5140: if (base + 3 > len) 5141: return (-1); 5142: if ((buf[base] == '-') && (buf[base + 1] == '-') && 5143: (buf[base + 2] == '>')) { 5144: incomment = 0; 5145: base += 2; 5146: } 5147: continue; 5148: } 5149: for (i = 0; i < stopLen; ++i) { 5150: if (buf[base] == stop[i]) { 5151: ctxt->checkIndex = 0; 5152: return (base - (in->cur - in->base)); 5153: } 5154: } 5155: } 5156: ctxt->checkIndex = base; 5157: return (-1); 5158: } 5159: 5160: /** 5161: * htmlParseTryOrFinish: 5162: * @ctxt: an HTML parser context 5163: * @terminate: last chunk indicator 5164: * 5165: * Try to progress on parsing 5166: * 5167: * Returns zero if no parsing was possible 5168: */ 5169: static int 5170: htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { 5171: int ret = 0; 5172: htmlParserInputPtr in; 5173: int avail = 0; 5174: xmlChar cur, next; 5175: 5176: #ifdef DEBUG_PUSH 5177: switch (ctxt->instate) { 5178: case XML_PARSER_EOF: 5179: xmlGenericError(xmlGenericErrorContext, 5180: "HPP: try EOF\n"); break; 5181: case XML_PARSER_START: 5182: xmlGenericError(xmlGenericErrorContext, 5183: "HPP: try START\n"); break; 5184: case XML_PARSER_MISC: 5185: xmlGenericError(xmlGenericErrorContext, 5186: "HPP: try MISC\n");break; 5187: case XML_PARSER_COMMENT: 5188: xmlGenericError(xmlGenericErrorContext, 5189: "HPP: try COMMENT\n");break; 5190: case XML_PARSER_PROLOG: 5191: xmlGenericError(xmlGenericErrorContext, 5192: "HPP: try PROLOG\n");break; 5193: case XML_PARSER_START_TAG: 5194: xmlGenericError(xmlGenericErrorContext, 5195: "HPP: try START_TAG\n");break; 5196: case XML_PARSER_CONTENT: 5197: xmlGenericError(xmlGenericErrorContext, 5198: "HPP: try CONTENT\n");break; 5199: case XML_PARSER_CDATA_SECTION: 5200: xmlGenericError(xmlGenericErrorContext, 5201: "HPP: try CDATA_SECTION\n");break; 5202: case XML_PARSER_END_TAG: 5203: xmlGenericError(xmlGenericErrorContext, 5204: "HPP: try END_TAG\n");break; 5205: 
case XML_PARSER_ENTITY_DECL: 5206: xmlGenericError(xmlGenericErrorContext, 5207: "HPP: try ENTITY_DECL\n");break; 5208: case XML_PARSER_ENTITY_VALUE: 5209: xmlGenericError(xmlGenericErrorContext, 5210: "HPP: try ENTITY_VALUE\n");break; 5211: case XML_PARSER_ATTRIBUTE_VALUE: 5212: xmlGenericError(xmlGenericErrorContext, 5213: "HPP: try ATTRIBUTE_VALUE\n");break; 5214: case XML_PARSER_DTD: 5215: xmlGenericError(xmlGenericErrorContext, 5216: "HPP: try DTD\n");break; 5217: case XML_PARSER_EPILOG: 5218: xmlGenericError(xmlGenericErrorContext, 5219: "HPP: try EPILOG\n");break; 5220: case XML_PARSER_PI: 5221: xmlGenericError(xmlGenericErrorContext, 5222: "HPP: try PI\n");break; 5223: case XML_PARSER_SYSTEM_LITERAL: 5224: xmlGenericError(xmlGenericErrorContext, 5225: "HPP: try SYSTEM_LITERAL\n");break; 5226: } 5227: #endif 5228: 5229: while (1) { 5230: 5231: in = ctxt->input; 5232: if (in == NULL) break; 5233: if (in->buf == NULL) 5234: avail = in->length - (in->cur - in->base); 5235: else 5236: avail = in->buf->buffer->use - (in->cur - in->base); 5237: if ((avail == 0) && (terminate)) { 5238: htmlAutoCloseOnEnd(ctxt); 5239: if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) { 5240: /* 5241: * SAX: end of the document processing. 5242: */ 5243: ctxt->instate = XML_PARSER_EOF; 5244: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 5245: ctxt->sax->endDocument(ctxt->userData); 5246: } 5247: } 5248: if (avail < 1) 5249: goto done; 5250: cur = in->cur[0]; 5251: if (cur == 0) { 5252: SKIP(1); 5253: continue; 5254: } 5255: 5256: switch (ctxt->instate) { 5257: case XML_PARSER_EOF: 5258: /* 5259: * Document parsing is done ! 5260: */ 5261: goto done; 5262: case XML_PARSER_START: 5263: /* 5264: * Very first chars read from the document flow. 
5265: */ 5266: cur = in->cur[0]; 5267: if (IS_BLANK_CH(cur)) { 5268: SKIP_BLANKS; 5269: if (in->buf == NULL) 5270: avail = in->length - (in->cur - in->base); 5271: else 5272: avail = in->buf->buffer->use - (in->cur - in->base); 5273: } 5274: if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) 5275: ctxt->sax->setDocumentLocator(ctxt->userData, 5276: &xmlDefaultSAXLocator); 5277: if ((ctxt->sax) && (ctxt->sax->startDocument) && 5278: (!ctxt->disableSAX)) 5279: ctxt->sax->startDocument(ctxt->userData); 5280: 5281: cur = in->cur[0]; 5282: next = in->cur[1]; 5283: if ((cur == '<') && (next == '!') && 5284: (UPP(2) == 'D') && (UPP(3) == 'O') && 5285: (UPP(4) == 'C') && (UPP(5) == 'T') && 5286: (UPP(6) == 'Y') && (UPP(7) == 'P') && 5287: (UPP(8) == 'E')) { 5288: if ((!terminate) && 5289: (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5290: goto done; 5291: #ifdef DEBUG_PUSH 5292: xmlGenericError(xmlGenericErrorContext, 5293: "HPP: Parsing internal subset\n"); 5294: #endif 5295: htmlParseDocTypeDecl(ctxt); 5296: ctxt->instate = XML_PARSER_PROLOG; 5297: #ifdef DEBUG_PUSH 5298: xmlGenericError(xmlGenericErrorContext, 5299: "HPP: entering PROLOG\n"); 5300: #endif 5301: } else { 5302: ctxt->instate = XML_PARSER_MISC; 5303: #ifdef DEBUG_PUSH 5304: xmlGenericError(xmlGenericErrorContext, 5305: "HPP: entering MISC\n"); 5306: #endif 5307: } 5308: break; 5309: case XML_PARSER_MISC: 5310: SKIP_BLANKS; 5311: if (in->buf == NULL) 5312: avail = in->length - (in->cur - in->base); 5313: else 5314: avail = in->buf->buffer->use - (in->cur - in->base); 5315: if (avail < 2) 5316: goto done; 5317: cur = in->cur[0]; 5318: next = in->cur[1]; 5319: if ((cur == '<') && (next == '!') && 5320: (in->cur[2] == '-') && (in->cur[3] == '-')) { 5321: if ((!terminate) && 5322: (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) 5323: goto done; 5324: #ifdef DEBUG_PUSH 5325: xmlGenericError(xmlGenericErrorContext, 5326: "HPP: Parsing Comment\n"); 5327: #endif 5328: htmlParseComment(ctxt); 
5329: ctxt->instate = XML_PARSER_MISC; 5330: } else if ((cur == '<') && (next == '?')) { 5331: if ((!terminate) && 5332: (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5333: goto done; 5334: #ifdef DEBUG_PUSH 5335: xmlGenericError(xmlGenericErrorContext, 5336: "HPP: Parsing PI\n"); 5337: #endif 5338: htmlParsePI(ctxt); 5339: ctxt->instate = XML_PARSER_MISC; 5340: } else if ((cur == '<') && (next == '!') && 5341: (UPP(2) == 'D') && (UPP(3) == 'O') && 5342: (UPP(4) == 'C') && (UPP(5) == 'T') && 5343: (UPP(6) == 'Y') && (UPP(7) == 'P') && 5344: (UPP(8) == 'E')) { 5345: if ((!terminate) && 5346: (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5347: goto done; 5348: #ifdef DEBUG_PUSH 5349: xmlGenericError(xmlGenericErrorContext, 5350: "HPP: Parsing internal subset\n"); 5351: #endif 5352: htmlParseDocTypeDecl(ctxt); 5353: ctxt->instate = XML_PARSER_PROLOG; 5354: #ifdef DEBUG_PUSH 5355: xmlGenericError(xmlGenericErrorContext, 5356: "HPP: entering PROLOG\n"); 5357: #endif 5358: } else if ((cur == '<') && (next == '!') && 5359: (avail < 9)) { 5360: goto done; 5361: } else { 5362: ctxt->instate = XML_PARSER_START_TAG; 5363: #ifdef DEBUG_PUSH 5364: xmlGenericError(xmlGenericErrorContext, 5365: "HPP: entering START_TAG\n"); 5366: #endif 5367: } 5368: break; 5369: case XML_PARSER_PROLOG: 5370: SKIP_BLANKS; 5371: if (in->buf == NULL) 5372: avail = in->length - (in->cur - in->base); 5373: else 5374: avail = in->buf->buffer->use - (in->cur - in->base); 5375: if (avail < 2) 5376: goto done; 5377: cur = in->cur[0]; 5378: next = in->cur[1]; 5379: if ((cur == '<') && (next == '!') && 5380: (in->cur[2] == '-') && (in->cur[3] == '-')) { 5381: if ((!terminate) && 5382: (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) 5383: goto done; 5384: #ifdef DEBUG_PUSH 5385: xmlGenericError(xmlGenericErrorContext, 5386: "HPP: Parsing Comment\n"); 5387: #endif 5388: htmlParseComment(ctxt); 5389: ctxt->instate = XML_PARSER_PROLOG; 5390: } else if ((cur == '<') && (next == '?')) { 
5391: if ((!terminate) && 5392: (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5393: goto done; 5394: #ifdef DEBUG_PUSH 5395: xmlGenericError(xmlGenericErrorContext, 5396: "HPP: Parsing PI\n"); 5397: #endif 5398: htmlParsePI(ctxt); 5399: ctxt->instate = XML_PARSER_PROLOG; 5400: } else if ((cur == '<') && (next == '!') && 5401: (avail < 4)) { 5402: goto done; 5403: } else { 5404: ctxt->instate = XML_PARSER_START_TAG; 5405: #ifdef DEBUG_PUSH 5406: xmlGenericError(xmlGenericErrorContext, 5407: "HPP: entering START_TAG\n"); 5408: #endif 5409: } 5410: break; 5411: case XML_PARSER_EPILOG: 5412: if (in->buf == NULL) 5413: avail = in->length - (in->cur - in->base); 5414: else 5415: avail = in->buf->buffer->use - (in->cur - in->base); 5416: if (avail < 1) 5417: goto done; 5418: cur = in->cur[0]; 5419: if (IS_BLANK_CH(cur)) { 5420: htmlParseCharData(ctxt); 5421: goto done; 5422: } 5423: if (avail < 2) 5424: goto done; 5425: next = in->cur[1]; 5426: if ((cur == '<') && (next == '!') && 5427: (in->cur[2] == '-') && (in->cur[3] == '-')) { 5428: if ((!terminate) && 5429: (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) 5430: goto done; 5431: #ifdef DEBUG_PUSH 5432: xmlGenericError(xmlGenericErrorContext, 5433: "HPP: Parsing Comment\n"); 5434: #endif 5435: htmlParseComment(ctxt); 5436: ctxt->instate = XML_PARSER_EPILOG; 5437: } else if ((cur == '<') && (next == '?')) { 5438: if ((!terminate) && 5439: (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5440: goto done; 5441: #ifdef DEBUG_PUSH 5442: xmlGenericError(xmlGenericErrorContext, 5443: "HPP: Parsing PI\n"); 5444: #endif 5445: htmlParsePI(ctxt); 5446: ctxt->instate = XML_PARSER_EPILOG; 5447: } else if ((cur == '<') && (next == '!') && 5448: (avail < 4)) { 5449: goto done; 5450: } else { 5451: ctxt->errNo = XML_ERR_DOCUMENT_END; 5452: ctxt->wellFormed = 0; 5453: ctxt->instate = XML_PARSER_EOF; 5454: #ifdef DEBUG_PUSH 5455: xmlGenericError(xmlGenericErrorContext, 5456: "HPP: entering EOF\n"); 5457: #endif 
5458: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 5459: ctxt->sax->endDocument(ctxt->userData); 5460: goto done; 5461: } 5462: break; 5463: case XML_PARSER_START_TAG: { 5464: const xmlChar *name; 5465: int failed; 5466: const htmlElemDesc * info; 5467: 5468: if (avail < 2) 5469: goto done; 5470: cur = in->cur[0]; 5471: if (cur != '<') { 5472: ctxt->instate = XML_PARSER_CONTENT; 5473: #ifdef DEBUG_PUSH 5474: xmlGenericError(xmlGenericErrorContext, 5475: "HPP: entering CONTENT\n"); 5476: #endif 5477: break; 5478: } 5479: if (in->cur[1] == '/') { 5480: ctxt->instate = XML_PARSER_END_TAG; 5481: ctxt->checkIndex = 0; 5482: #ifdef DEBUG_PUSH 5483: xmlGenericError(xmlGenericErrorContext, 5484: "HPP: entering END_TAG\n"); 5485: #endif 5486: break; 5487: } 5488: if ((!terminate) && 5489: (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5490: goto done; 5491: 5492: failed = htmlParseStartTag(ctxt); 5493: name = ctxt->name; 5494: if ((failed == -1) || 5495: (name == NULL)) { 5496: if (CUR == '>') 5497: NEXT; 5498: break; 5499: } 5500: 5501: /* 5502: * Lookup the info for that element. 5503: */ 5504: info = htmlTagLookup(name); 5505: if (info == NULL) { 5506: htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, 5507: "Tag %s invalid\n", name, NULL); 5508: } 5509: 5510: /* 5511: * Check for an Empty Element labeled the XML/SGML way 5512: */ 5513: if ((CUR == '/') && (NXT(1) == '>')) { 5514: SKIP(2); 5515: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 5516: ctxt->sax->endElement(ctxt->userData, name); 5517: htmlnamePop(ctxt); 5518: ctxt->instate = XML_PARSER_CONTENT; 5519: #ifdef DEBUG_PUSH 5520: xmlGenericError(xmlGenericErrorContext, 5521: "HPP: entering CONTENT\n"); 5522: #endif 5523: break; 5524: } 5525: 5526: if (CUR == '>') { 5527: NEXT; 5528: } else { 5529: htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 5530: "Couldn't find end of Start Tag %s\n", 5531: name, NULL); 5532: 5533: /* 5534: * end of parsing of this node. 
5535: */ 5536: if (xmlStrEqual(name, ctxt->name)) { 5537: nodePop(ctxt); 5538: htmlnamePop(ctxt); 5539: } 5540: 5541: ctxt->instate = XML_PARSER_CONTENT; 5542: #ifdef DEBUG_PUSH 5543: xmlGenericError(xmlGenericErrorContext, 5544: "HPP: entering CONTENT\n"); 5545: #endif 5546: break; 5547: } 5548: 5549: /* 5550: * Check for an Empty Element from DTD definition 5551: */ 5552: if ((info != NULL) && (info->empty)) { 5553: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 5554: ctxt->sax->endElement(ctxt->userData, name); 5555: htmlnamePop(ctxt); 5556: } 5557: ctxt->instate = XML_PARSER_CONTENT; 5558: #ifdef DEBUG_PUSH 5559: xmlGenericError(xmlGenericErrorContext, 5560: "HPP: entering CONTENT\n"); 5561: #endif 5562: break; 5563: } 5564: case XML_PARSER_CONTENT: { 5565: long cons; 5566: /* 5567: * Handle preparsed entities and charRef 5568: */ 5569: if (ctxt->token != 0) { 5570: xmlChar chr[2] = { 0 , 0 } ; 5571: 5572: chr[0] = (xmlChar) ctxt->token; 5573: htmlCheckParagraph(ctxt); 5574: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 5575: ctxt->sax->characters(ctxt->userData, chr, 1); 5576: ctxt->token = 0; 5577: ctxt->checkIndex = 0; 5578: } 5579: if ((avail == 1) && (terminate)) { 5580: cur = in->cur[0]; 5581: if ((cur != '<') && (cur != '&')) { 5582: if (ctxt->sax != NULL) { 5583: if (IS_BLANK_CH(cur)) { 5584: if (ctxt->sax->ignorableWhitespace != NULL) 5585: ctxt->sax->ignorableWhitespace( 5586: ctxt->userData, &cur, 1); 5587: } else { 5588: htmlCheckParagraph(ctxt); 5589: if (ctxt->sax->characters != NULL) 5590: ctxt->sax->characters( 5591: ctxt->userData, &cur, 1); 5592: } 5593: } 5594: ctxt->token = 0; 5595: ctxt->checkIndex = 0; 5596: in->cur++; 5597: break; 5598: } 5599: } 5600: if (avail < 2) 5601: goto done; 5602: cur = in->cur[0]; 5603: next = in->cur[1]; 5604: cons = ctxt->nbChars; 5605: if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) || 5606: (xmlStrEqual(ctxt->name, BAD_CAST"style"))) { 5607: /* 5608: * Handle SCRIPT/STYLE 
separately 5609: */ 5610: if (!terminate) { 5611: int idx; 5612: xmlChar val; 5613: 5614: idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0); 5615: if (idx < 0) 5616: goto done; 5617: val = in->cur[idx + 2]; 5618: if (val == 0) /* bad cut of input */ 5619: goto done; 5620: } 5621: htmlParseScript(ctxt); 5622: if ((cur == '<') && (next == '/')) { 5623: ctxt->instate = XML_PARSER_END_TAG; 5624: ctxt->checkIndex = 0; 5625: #ifdef DEBUG_PUSH 5626: xmlGenericError(xmlGenericErrorContext, 5627: "HPP: entering END_TAG\n"); 5628: #endif 5629: break; 5630: } 5631: } else { 5632: /* 5633: * Sometimes DOCTYPE arrives in the middle of the document 5634: */ 5635: if ((cur == '<') && (next == '!') && 5636: (UPP(2) == 'D') && (UPP(3) == 'O') && 5637: (UPP(4) == 'C') && (UPP(5) == 'T') && 5638: (UPP(6) == 'Y') && (UPP(7) == 'P') && 5639: (UPP(8) == 'E')) { 5640: if ((!terminate) && 5641: (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5642: goto done; 5643: htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 5644: "Misplaced DOCTYPE declaration\n", 5645: BAD_CAST "DOCTYPE" , NULL); 5646: htmlParseDocTypeDecl(ctxt); 5647: } else if ((cur == '<') && (next == '!') && 5648: (in->cur[2] == '-') && (in->cur[3] == '-')) { 5649: if ((!terminate) && 5650: (htmlParseLookupSequence( 5651: ctxt, '-', '-', '>', 1, 1) < 0)) 5652: goto done; 5653: #ifdef DEBUG_PUSH 5654: xmlGenericError(xmlGenericErrorContext, 5655: "HPP: Parsing Comment\n"); 5656: #endif 5657: htmlParseComment(ctxt); 5658: ctxt->instate = XML_PARSER_CONTENT; 5659: } else if ((cur == '<') && (next == '?')) { 5660: if ((!terminate) && 5661: (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5662: goto done; 5663: #ifdef DEBUG_PUSH 5664: xmlGenericError(xmlGenericErrorContext, 5665: "HPP: Parsing PI\n"); 5666: #endif 5667: htmlParsePI(ctxt); 5668: ctxt->instate = XML_PARSER_CONTENT; 5669: } else if ((cur == '<') && (next == '!') && (avail < 4)) { 5670: goto done; 5671: } else if ((cur == '<') && (next == '/')) { 5672: 
ctxt->instate = XML_PARSER_END_TAG; 5673: ctxt->checkIndex = 0; 5674: #ifdef DEBUG_PUSH 5675: xmlGenericError(xmlGenericErrorContext, 5676: "HPP: entering END_TAG\n"); 5677: #endif 5678: break; 5679: } else if (cur == '<') { 5680: ctxt->instate = XML_PARSER_START_TAG; 5681: ctxt->checkIndex = 0; 5682: #ifdef DEBUG_PUSH 5683: xmlGenericError(xmlGenericErrorContext, 5684: "HPP: entering START_TAG\n"); 5685: #endif 5686: break; 5687: } else if (cur == '&') { 5688: if ((!terminate) && 5689: (htmlParseLookupChars(ctxt, 5690: BAD_CAST "; >/", 4) < 0)) 5691: goto done; 5692: #ifdef DEBUG_PUSH 5693: xmlGenericError(xmlGenericErrorContext, 5694: "HPP: Parsing Reference\n"); 5695: #endif 5696: /* TODO: check generation of subtrees if noent !!! */ 5697: htmlParseReference(ctxt); 5698: } else { 5699: /* 5700: * check that the text sequence is complete 5701: * before handing out the data to the parser 5702: * to avoid problems with erroneous end of 5703: * data detection. 5704: */ 5705: if ((!terminate) && 5706: (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0)) 5707: goto done; 5708: ctxt->checkIndex = 0; 5709: #ifdef DEBUG_PUSH 5710: xmlGenericError(xmlGenericErrorContext, 5711: "HPP: Parsing char data\n"); 5712: #endif 5713: htmlParseCharData(ctxt); 5714: } 5715: } 5716: if (cons == ctxt->nbChars) { 5717: if (ctxt->node != NULL) { 5718: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5719: "detected an error in element content\n", 5720: NULL, NULL); 5721: } 5722: NEXT; 5723: break; 5724: } 5725: 5726: break; 5727: } 5728: case XML_PARSER_END_TAG: 5729: if (avail < 2) 5730: goto done; 5731: if ((!terminate) && 5732: (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5733: goto done; 5734: htmlParseEndTag(ctxt); 5735: if (ctxt->nameNr == 0) { 5736: ctxt->instate = XML_PARSER_EPILOG; 5737: } else { 5738: ctxt->instate = XML_PARSER_CONTENT; 5739: } 5740: ctxt->checkIndex = 0; 5741: #ifdef DEBUG_PUSH 5742: xmlGenericError(xmlGenericErrorContext, 5743: "HPP: entering CONTENT\n"); 
5744: #endif 5745: break; 5746: case XML_PARSER_CDATA_SECTION: 5747: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5748: "HPP: internal error, state == CDATA\n", 5749: NULL, NULL); 5750: ctxt->instate = XML_PARSER_CONTENT; 5751: ctxt->checkIndex = 0; 5752: #ifdef DEBUG_PUSH 5753: xmlGenericError(xmlGenericErrorContext, 5754: "HPP: entering CONTENT\n"); 5755: #endif 5756: break; 5757: case XML_PARSER_DTD: 5758: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5759: "HPP: internal error, state == DTD\n", 5760: NULL, NULL); 5761: ctxt->instate = XML_PARSER_CONTENT; 5762: ctxt->checkIndex = 0; 5763: #ifdef DEBUG_PUSH 5764: xmlGenericError(xmlGenericErrorContext, 5765: "HPP: entering CONTENT\n"); 5766: #endif 5767: break; 5768: case XML_PARSER_COMMENT: 5769: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5770: "HPP: internal error, state == COMMENT\n", 5771: NULL, NULL); 5772: ctxt->instate = XML_PARSER_CONTENT; 5773: ctxt->checkIndex = 0; 5774: #ifdef DEBUG_PUSH 5775: xmlGenericError(xmlGenericErrorContext, 5776: "HPP: entering CONTENT\n"); 5777: #endif 5778: break; 5779: case XML_PARSER_PI: 5780: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5781: "HPP: internal error, state == PI\n", 5782: NULL, NULL); 5783: ctxt->instate = XML_PARSER_CONTENT; 5784: ctxt->checkIndex = 0; 5785: #ifdef DEBUG_PUSH 5786: xmlGenericError(xmlGenericErrorContext, 5787: "HPP: entering CONTENT\n"); 5788: #endif 5789: break; 5790: case XML_PARSER_ENTITY_DECL: 5791: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5792: "HPP: internal error, state == ENTITY_DECL\n", 5793: NULL, NULL); 5794: ctxt->instate = XML_PARSER_CONTENT; 5795: ctxt->checkIndex = 0; 5796: #ifdef DEBUG_PUSH 5797: xmlGenericError(xmlGenericErrorContext, 5798: "HPP: entering CONTENT\n"); 5799: #endif 5800: break; 5801: case XML_PARSER_ENTITY_VALUE: 5802: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5803: "HPP: internal error, state == ENTITY_VALUE\n", 5804: NULL, NULL); 5805: ctxt->instate = XML_PARSER_CONTENT; 5806: ctxt->checkIndex = 0; 5807: 
#ifdef DEBUG_PUSH 5808: xmlGenericError(xmlGenericErrorContext, 5809: "HPP: entering DTD\n"); 5810: #endif 5811: break; 5812: case XML_PARSER_ATTRIBUTE_VALUE: 5813: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5814: "HPP: internal error, state == ATTRIBUTE_VALUE\n", 5815: NULL, NULL); 5816: ctxt->instate = XML_PARSER_START_TAG; 5817: ctxt->checkIndex = 0; 5818: #ifdef DEBUG_PUSH 5819: xmlGenericError(xmlGenericErrorContext, 5820: "HPP: entering START_TAG\n"); 5821: #endif 5822: break; 5823: case XML_PARSER_SYSTEM_LITERAL: 5824: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5825: "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n", 5826: NULL, NULL); 5827: ctxt->instate = XML_PARSER_CONTENT; 5828: ctxt->checkIndex = 0; 5829: #ifdef DEBUG_PUSH 5830: xmlGenericError(xmlGenericErrorContext, 5831: "HPP: entering CONTENT\n"); 5832: #endif 5833: break; 5834: case XML_PARSER_IGNORE: 5835: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5836: "HPP: internal error, state == XML_PARSER_IGNORE\n", 5837: NULL, NULL); 5838: ctxt->instate = XML_PARSER_CONTENT; 5839: ctxt->checkIndex = 0; 5840: #ifdef DEBUG_PUSH 5841: xmlGenericError(xmlGenericErrorContext, 5842: "HPP: entering CONTENT\n"); 5843: #endif 5844: break; 5845: case XML_PARSER_PUBLIC_LITERAL: 5846: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5847: "HPP: internal error, state == XML_PARSER_LITERAL\n", 5848: NULL, NULL); 5849: ctxt->instate = XML_PARSER_CONTENT; 5850: ctxt->checkIndex = 0; 5851: #ifdef DEBUG_PUSH 5852: xmlGenericError(xmlGenericErrorContext, 5853: "HPP: entering CONTENT\n"); 5854: #endif 5855: break; 5856: 5857: } 5858: } 5859: done: 5860: if ((avail == 0) && (terminate)) { 5861: htmlAutoCloseOnEnd(ctxt); 5862: if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) { 5863: /* 5864: * SAX: end of the document processing. 
5865: */ 5866: ctxt->instate = XML_PARSER_EOF; 5867: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 5868: ctxt->sax->endDocument(ctxt->userData); 5869: } 5870: } 5871: if ((ctxt->myDoc != NULL) && 5872: ((terminate) || (ctxt->instate == XML_PARSER_EOF) || 5873: (ctxt->instate == XML_PARSER_EPILOG))) { 5874: xmlDtdPtr dtd; 5875: dtd = xmlGetIntSubset(ctxt->myDoc); 5876: if (dtd == NULL) 5877: ctxt->myDoc->intSubset = 5878: xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html", 5879: BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN", 5880: BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"); 5881: } 5882: #ifdef DEBUG_PUSH 5883: xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret); 5884: #endif 5885: return(ret); 5886: } 5887: 5888: /** 5889: * htmlParseChunk: 5890: * @ctxt: an HTML parser context 5891: * @chunk: an char array 5892: * @size: the size in byte of the chunk 5893: * @terminate: last chunk indicator 5894: * 5895: * Parse a Chunk of memory 5896: * 5897: * Returns zero if no error, the xmlParserErrors otherwise. 
5898: */ 5899: int 5900: htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size, 5901: int terminate) { 5902: if ((ctxt == NULL) || (ctxt->input == NULL)) { 5903: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5904: "htmlParseChunk: context error\n", NULL, NULL); 5905: return(XML_ERR_INTERNAL_ERROR); 5906: } 5907: if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && 5908: (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) { 5909: int base = ctxt->input->base - ctxt->input->buf->buffer->content; 5910: int cur = ctxt->input->cur - ctxt->input->base; 5911: int res; 5912: 5913: res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk); 5914: if (res < 0) { 5915: ctxt->errNo = XML_PARSER_EOF; 5916: ctxt->disableSAX = 1; 5917: return (XML_PARSER_EOF); 5918: } 5919: ctxt->input->base = ctxt->input->buf->buffer->content + base; 5920: ctxt->input->cur = ctxt->input->base + cur; 5921: ctxt->input->end = 5922: &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use]; 5923: #ifdef DEBUG_PUSH 5924: xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); 5925: #endif 5926: 5927: #if 0 5928: if ((terminate) || (ctxt->input->buf->buffer->use > 80)) 5929: htmlParseTryOrFinish(ctxt, terminate); 5930: #endif 5931: } else if (ctxt->instate != XML_PARSER_EOF) { 5932: if ((ctxt->input != NULL) && ctxt->input->buf != NULL) { 5933: xmlParserInputBufferPtr in = ctxt->input->buf; 5934: if ((in->encoder != NULL) && (in->buffer != NULL) && 5935: (in->raw != NULL)) { 5936: int nbchars; 5937: 5938: nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw); 5939: if (nbchars < 0) { 5940: htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 5941: "encoder error\n", NULL, NULL); 5942: return(XML_ERR_INVALID_ENCODING); 5943: } 5944: } 5945: } 5946: } 5947: htmlParseTryOrFinish(ctxt, terminate); 5948: if (terminate) { 5949: if ((ctxt->instate != XML_PARSER_EOF) && 5950: (ctxt->instate != XML_PARSER_EPILOG) && 5951: (ctxt->instate != XML_PARSER_MISC)) 
{ 5952: ctxt->errNo = XML_ERR_DOCUMENT_END; 5953: ctxt->wellFormed = 0; 5954: } 5955: if (ctxt->instate != XML_PARSER_EOF) { 5956: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 5957: ctxt->sax->endDocument(ctxt->userData); 5958: } 5959: ctxt->instate = XML_PARSER_EOF; 5960: } 5961: return((xmlParserErrors) ctxt->errNo); 5962: } 5963: 5964: /************************************************************************ 5965: * * 5966: * User entry points * 5967: * * 5968: ************************************************************************/ 5969: 5970: /** 5971: * htmlCreatePushParserCtxt: 5972: * @sax: a SAX handler 5973: * @user_data: The user data returned on SAX callbacks 5974: * @chunk: a pointer to an array of chars 5975: * @size: number of chars in the array 5976: * @filename: an optional file name or URI 5977: * @enc: an optional encoding 5978: * 5979: * Create a parser context for using the HTML parser in push mode 5980: * The value of @filename is used for fetching external entities 5981: * and error/warning reports. 
5982: * 5983: * Returns the new parser context or NULL 5984: */ 5985: htmlParserCtxtPtr 5986: htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data, 5987: const char *chunk, int size, const char *filename, 5988: xmlCharEncoding enc) { 5989: htmlParserCtxtPtr ctxt; 5990: htmlParserInputPtr inputStream; 5991: xmlParserInputBufferPtr buf; 5992: 5993: xmlInitParser(); 5994: 5995: buf = xmlAllocParserInputBuffer(enc); 5996: if (buf == NULL) return(NULL); 5997: 5998: ctxt = htmlNewParserCtxt(); 5999: if (ctxt == NULL) { 6000: xmlFreeParserInputBuffer(buf); 6001: return(NULL); 6002: } 6003: if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder) 6004: ctxt->charset=XML_CHAR_ENCODING_UTF8; 6005: if (sax != NULL) { 6006: if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler) 6007: xmlFree(ctxt->sax); 6008: ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler)); 6009: if (ctxt->sax == NULL) { 6010: xmlFree(buf); 6011: xmlFree(ctxt); 6012: return(NULL); 6013: } 6014: memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler)); 6015: if (user_data != NULL) 6016: ctxt->userData = user_data; 6017: } 6018: if (filename == NULL) { 6019: ctxt->directory = NULL; 6020: } else { 6021: ctxt->directory = xmlParserGetDirectory(filename); 6022: } 6023: 6024: inputStream = htmlNewInputStream(ctxt); 6025: if (inputStream == NULL) { 6026: xmlFreeParserCtxt(ctxt); 6027: xmlFree(buf); 6028: return(NULL); 6029: } 6030: 6031: if (filename == NULL) 6032: inputStream->filename = NULL; 6033: else 6034: inputStream->filename = (char *) 6035: xmlCanonicPath((const xmlChar *) filename); 6036: inputStream->buf = buf; 6037: inputStream->base = inputStream->buf->buffer->content; 6038: inputStream->cur = inputStream->buf->buffer->content; 6039: inputStream->end = 6040: &inputStream->buf->buffer->content[inputStream->buf->buffer->use]; 6041: 6042: inputPush(ctxt, inputStream); 6043: 6044: if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && 6045: (ctxt->input->buf != NULL)) { 6046: int base = 
ctxt->input->base - ctxt->input->buf->buffer->content; 6047: int cur = ctxt->input->cur - ctxt->input->base; 6048: 6049: xmlParserInputBufferPush(ctxt->input->buf, size, chunk); 6050: 6051: ctxt->input->base = ctxt->input->buf->buffer->content + base; 6052: ctxt->input->cur = ctxt->input->base + cur; 6053: ctxt->input->end = 6054: &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use]; 6055: #ifdef DEBUG_PUSH 6056: xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); 6057: #endif 6058: } 6059: ctxt->progressive = 1; 6060: 6061: return(ctxt); 6062: } 6063: #endif /* LIBXML_PUSH_ENABLED */ 6064: 6065: /** 6066: * htmlSAXParseDoc: 6067: * @cur: a pointer to an array of xmlChar 6068: * @encoding: a free form C string describing the HTML document encoding, or NULL 6069: * @sax: the SAX handler block 6070: * @userData: if using SAX, this pointer will be provided on callbacks. 6071: * 6072: * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks 6073: * to handle parse events. If sax is NULL, fallback to the default DOM 6074: * behavior and return a tree. 6075: * 6076: * Returns the resulting document tree unless SAX is NULL or the document is 6077: * not well formed. 
6078: */ 6079: 6080: htmlDocPtr 6081: htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) { 6082: htmlDocPtr ret; 6083: htmlParserCtxtPtr ctxt; 6084: 6085: xmlInitParser(); 6086: 6087: if (cur == NULL) return(NULL); 6088: 6089: 6090: ctxt = htmlCreateDocParserCtxt(cur, encoding); 6091: if (ctxt == NULL) return(NULL); 6092: if (sax != NULL) { 6093: if (ctxt->sax != NULL) xmlFree (ctxt->sax); 6094: ctxt->sax = sax; 6095: ctxt->userData = userData; 6096: } 6097: 6098: htmlParseDocument(ctxt); 6099: ret = ctxt->myDoc; 6100: if (sax != NULL) { 6101: ctxt->sax = NULL; 6102: ctxt->userData = NULL; 6103: } 6104: htmlFreeParserCtxt(ctxt); 6105: 6106: return(ret); 6107: } 6108: 6109: /** 6110: * htmlParseDoc: 6111: * @cur: a pointer to an array of xmlChar 6112: * @encoding: a free form C string describing the HTML document encoding, or NULL 6113: * 6114: * parse an HTML in-memory document and build a tree. 6115: * 6116: * Returns the resulting document tree 6117: */ 6118: 6119: htmlDocPtr 6120: htmlParseDoc(xmlChar *cur, const char *encoding) { 6121: return(htmlSAXParseDoc(cur, encoding, NULL, NULL)); 6122: } 6123: 6124: 6125: /** 6126: * htmlCreateFileParserCtxt: 6127: * @filename: the filename 6128: * @encoding: a free form C string describing the HTML document encoding, or NULL 6129: * 6130: * Create a parser context for a file content. 6131: * Automatic support for ZLIB/Compress compressed document is provided 6132: * by default if found at compile-time. 
6133: * 6134: * Returns the new parser context or NULL 6135: */ 6136: htmlParserCtxtPtr 6137: htmlCreateFileParserCtxt(const char *filename, const char *encoding) 6138: { 6139: htmlParserCtxtPtr ctxt; 6140: htmlParserInputPtr inputStream; 6141: char *canonicFilename; 6142: /* htmlCharEncoding enc; */ 6143: xmlChar *content, *content_line = (xmlChar *) "charset="; 6144: 6145: if (filename == NULL) 6146: return(NULL); 6147: 6148: ctxt = htmlNewParserCtxt(); 6149: if (ctxt == NULL) { 6150: return(NULL); 6151: } 6152: canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename); 6153: if (canonicFilename == NULL) { 6154: #ifdef LIBXML_SAX1_ENABLED 6155: if (xmlDefaultSAXHandler.error != NULL) { 6156: xmlDefaultSAXHandler.error(NULL, "out of memory\n"); 6157: } 6158: #endif 6159: xmlFreeParserCtxt(ctxt); 6160: return(NULL); 6161: } 6162: 6163: inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt); 6164: xmlFree(canonicFilename); 6165: if (inputStream == NULL) { 6166: xmlFreeParserCtxt(ctxt); 6167: return(NULL); 6168: } 6169: 6170: inputPush(ctxt, inputStream); 6171: 6172: /* set encoding */ 6173: if (encoding) { 6174: content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1); 6175: if (content) { 6176: strcpy ((char *)content, (char *)content_line); 6177: strcat ((char *)content, (char *)encoding); 6178: htmlCheckEncoding (ctxt, content); 6179: xmlFree (content); 6180: } 6181: } 6182: 6183: return(ctxt); 6184: } 6185: 6186: /** 6187: * htmlSAXParseFile: 6188: * @filename: the filename 6189: * @encoding: a free form C string describing the HTML document encoding, or NULL 6190: * @sax: the SAX handler block 6191: * @userData: if using SAX, this pointer will be provided on callbacks. 6192: * 6193: * parse an HTML file and build a tree. Automatic support for ZLIB/Compress 6194: * compressed document is provided by default if found at compile-time. 6195: * It use the given SAX function block to handle the parsing callback. 
6196: * If sax is NULL, fallback to the default DOM tree building routines. 6197: * 6198: * Returns the resulting document tree unless SAX is NULL or the document is 6199: * not well formed. 6200: */ 6201: 6202: htmlDocPtr 6203: htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax, 6204: void *userData) { 6205: htmlDocPtr ret; 6206: htmlParserCtxtPtr ctxt; 6207: htmlSAXHandlerPtr oldsax = NULL; 6208: 6209: xmlInitParser(); 6210: 6211: ctxt = htmlCreateFileParserCtxt(filename, encoding); 6212: if (ctxt == NULL) return(NULL); 6213: if (sax != NULL) { 6214: oldsax = ctxt->sax; 6215: ctxt->sax = sax; 6216: ctxt->userData = userData; 6217: } 6218: 6219: htmlParseDocument(ctxt); 6220: 6221: ret = ctxt->myDoc; 6222: if (sax != NULL) { 6223: ctxt->sax = oldsax; 6224: ctxt->userData = NULL; 6225: } 6226: htmlFreeParserCtxt(ctxt); 6227: 6228: return(ret); 6229: } 6230: 6231: /** 6232: * htmlParseFile: 6233: * @filename: the filename 6234: * @encoding: a free form C string describing the HTML document encoding, or NULL 6235: * 6236: * parse an HTML file and build a tree. Automatic support for ZLIB/Compress 6237: * compressed document is provided by default if found at compile-time. 6238: * 6239: * Returns the resulting document tree 6240: */ 6241: 6242: htmlDocPtr 6243: htmlParseFile(const char *filename, const char *encoding) { 6244: return(htmlSAXParseFile(filename, encoding, NULL, NULL)); 6245: } 6246: 6247: /** 6248: * htmlHandleOmittedElem: 6249: * @val: int 0 or 1 6250: * 6251: * Set and return the previous value for handling HTML omitted tags. 6252: * 6253: * Returns the last value for 0 for no handling, 1 for auto insertion. 
6254: */ 6255: 6256: int 6257: htmlHandleOmittedElem(int val) { 6258: int old = htmlOmittedDefaultValue; 6259: 6260: htmlOmittedDefaultValue = val; 6261: return(old); 6262: } 6263: 6264: /** 6265: * htmlElementAllowedHere: 6266: * @parent: HTML parent element 6267: * @elt: HTML element 6268: * 6269: * Checks whether an HTML element may be a direct child of a parent element. 6270: * Note - doesn't check for deprecated elements 6271: * 6272: * Returns 1 if allowed; 0 otherwise. 6273: */ 6274: int 6275: htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) { 6276: const char** p ; 6277: 6278: if ( ! elt || ! parent || ! parent->subelts ) 6279: return 0 ; 6280: 6281: for ( p = parent->subelts; *p; ++p ) 6282: if ( !xmlStrcmp((const xmlChar *)*p, elt) ) 6283: return 1 ; 6284: 6285: return 0 ; 6286: } 6287: /** 6288: * htmlElementStatusHere: 6289: * @parent: HTML parent element 6290: * @elt: HTML element 6291: * 6292: * Checks whether an HTML element may be a direct child of a parent element. 6293: * and if so whether it is valid or deprecated. 6294: * 6295: * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID 6296: */ 6297: htmlStatus 6298: htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) { 6299: if ( ! parent || ! elt ) 6300: return HTML_INVALID ; 6301: if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) ) 6302: return HTML_INVALID ; 6303: 6304: return ( elt->dtd == 0 ) ? 
HTML_VALID : HTML_DEPRECATED ; 6305: } 6306: /** 6307: * htmlAttrAllowed: 6308: * @elt: HTML element 6309: * @attr: HTML attribute 6310: * @legacy: whether to allow deprecated attributes 6311: * 6312: * Checks whether an attribute is valid for an element 6313: * Has full knowledge of Required and Deprecated attributes 6314: * 6315: * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID 6316: */ 6317: htmlStatus 6318: htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) { 6319: const char** p ; 6320: 6321: if ( !elt || ! attr ) 6322: return HTML_INVALID ; 6323: 6324: if ( elt->attrs_req ) 6325: for ( p = elt->attrs_req; *p; ++p) 6326: if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 6327: return HTML_REQUIRED ; 6328: 6329: if ( elt->attrs_opt ) 6330: for ( p = elt->attrs_opt; *p; ++p) 6331: if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 6332: return HTML_VALID ; 6333: 6334: if ( legacy && elt->attrs_depr ) 6335: for ( p = elt->attrs_depr; *p; ++p) 6336: if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 6337: return HTML_DEPRECATED ; 6338: 6339: return HTML_INVALID ; 6340: } 6341: /** 6342: * htmlNodeStatus: 6343: * @node: an htmlNodePtr in a tree 6344: * @legacy: whether to allow deprecated elements (YES is faster here 6345: * for Element nodes) 6346: * 6347: * Checks whether the tree node is valid. Experimental (the author 6348: * only uses the HTML enhancements in a SAX parser) 6349: * 6350: * Return: for Element nodes, a return from htmlElementAllowedHere (if 6351: * legacy allowed) or htmlElementStatusHere (otherwise). 6352: * for Attribute nodes, a return from htmlAttrAllowed 6353: * for other nodes, HTML_NA (no checks performed) 6354: */ 6355: htmlStatus 6356: htmlNodeStatus(const htmlNodePtr node, int legacy) { 6357: if ( ! node ) 6358: return HTML_INVALID ; 6359: 6360: switch ( node->type ) { 6361: case XML_ELEMENT_NODE: 6362: return legacy 6363: ? 
( htmlElementAllowedHere ( 6364: htmlTagLookup(node->parent->name) , node->name 6365: ) ? HTML_VALID : HTML_INVALID ) 6366: : htmlElementStatusHere( 6367: htmlTagLookup(node->parent->name) , 6368: htmlTagLookup(node->name) ) 6369: ; 6370: case XML_ATTRIBUTE_NODE: 6371: return htmlAttrAllowed( 6372: htmlTagLookup(node->parent->name) , node->name, legacy) ; 6373: default: return HTML_NA ; 6374: } 6375: } 6376: /************************************************************************ 6377: * * 6378: * New set (2.6.0) of simpler and more flexible APIs * 6379: * * 6380: ************************************************************************/ 6381: /** 6382: * DICT_FREE: 6383: * @str: a string 6384: * 6385: * Free a string if it is not owned by the "dict" dictionnary in the 6386: * current scope 6387: */ 6388: #define DICT_FREE(str) \ 6389: if ((str) && ((!dict) || \ 6390: (xmlDictOwns(dict, (const xmlChar *)(str)) == 0))) \ 6391: xmlFree((char *)(str)); 6392: 6393: /** 6394: * htmlCtxtReset: 6395: * @ctxt: an HTML parser context 6396: * 6397: * Reset a parser context 6398: */ 6399: void 6400: htmlCtxtReset(htmlParserCtxtPtr ctxt) 6401: { 6402: xmlParserInputPtr input; 6403: xmlDictPtr dict; 6404: 6405: if (ctxt == NULL) 6406: return; 6407: 6408: xmlInitParser(); 6409: dict = ctxt->dict; 6410: 6411: while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */ 6412: xmlFreeInputStream(input); 6413: } 6414: ctxt->inputNr = 0; 6415: ctxt->input = NULL; 6416: 6417: ctxt->spaceNr = 0; 6418: if (ctxt->spaceTab != NULL) { 6419: ctxt->spaceTab[0] = -1; 6420: ctxt->space = &ctxt->spaceTab[0]; 6421: } else { 6422: ctxt->space = NULL; 6423: } 6424: 6425: 6426: ctxt->nodeNr = 0; 6427: ctxt->node = NULL; 6428: 6429: ctxt->nameNr = 0; 6430: ctxt->name = NULL; 6431: 6432: DICT_FREE(ctxt->version); 6433: ctxt->version = NULL; 6434: DICT_FREE(ctxt->encoding); 6435: ctxt->encoding = NULL; 6436: DICT_FREE(ctxt->directory); 6437: ctxt->directory = NULL; 6438: DICT_FREE(ctxt->extSubURI); 
6439: ctxt->extSubURI = NULL; 6440: DICT_FREE(ctxt->extSubSystem); 6441: ctxt->extSubSystem = NULL; 6442: if (ctxt->myDoc != NULL) 6443: xmlFreeDoc(ctxt->myDoc); 6444: ctxt->myDoc = NULL; 6445: 6446: ctxt->standalone = -1; 6447: ctxt->hasExternalSubset = 0; 6448: ctxt->hasPErefs = 0; 6449: ctxt->html = 1; 6450: ctxt->external = 0; 6451: ctxt->instate = XML_PARSER_START; 6452: ctxt->token = 0; 6453: 6454: ctxt->wellFormed = 1; 6455: ctxt->nsWellFormed = 1; 6456: ctxt->disableSAX = 0; 6457: ctxt->valid = 1; 6458: ctxt->vctxt.userData = ctxt; 6459: ctxt->vctxt.error = xmlParserValidityError; 6460: ctxt->vctxt.warning = xmlParserValidityWarning; 6461: ctxt->record_info = 0; 6462: ctxt->nbChars = 0; 6463: ctxt->checkIndex = 0; 6464: ctxt->inSubset = 0; 6465: ctxt->errNo = XML_ERR_OK; 6466: ctxt->depth = 0; 6467: ctxt->charset = XML_CHAR_ENCODING_NONE; 6468: ctxt->catalogs = NULL; 6469: xmlInitNodeInfoSeq(&ctxt->node_seq); 6470: 6471: if (ctxt->attsDefault != NULL) { 6472: xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree); 6473: ctxt->attsDefault = NULL; 6474: } 6475: if (ctxt->attsSpecial != NULL) { 6476: xmlHashFree(ctxt->attsSpecial, NULL); 6477: ctxt->attsSpecial = NULL; 6478: } 6479: } 6480: 6481: /** 6482: * htmlCtxtUseOptions: 6483: * @ctxt: an HTML parser context 6484: * @options: a combination of htmlParserOption(s) 6485: * 6486: * Applies the options to the parser context 6487: * 6488: * Returns 0 in case of success, the set of unknown or unimplemented options 6489: * in case of error. 
6490: */ 6491: int 6492: htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options) 6493: { 6494: if (ctxt == NULL) 6495: return(-1); 6496: 6497: if (options & HTML_PARSE_NOWARNING) { 6498: ctxt->sax->warning = NULL; 6499: ctxt->vctxt.warning = NULL; 6500: options -= XML_PARSE_NOWARNING; 6501: ctxt->options |= XML_PARSE_NOWARNING; 6502: } 6503: if (options & HTML_PARSE_NOERROR) { 6504: ctxt->sax->error = NULL; 6505: ctxt->vctxt.error = NULL; 6506: ctxt->sax->fatalError = NULL; 6507: options -= XML_PARSE_NOERROR; 6508: ctxt->options |= XML_PARSE_NOERROR; 6509: } 6510: if (options & HTML_PARSE_PEDANTIC) { 6511: ctxt->pedantic = 1; 6512: options -= XML_PARSE_PEDANTIC; 6513: ctxt->options |= XML_PARSE_PEDANTIC; 6514: } else 6515: ctxt->pedantic = 0; 6516: if (options & XML_PARSE_NOBLANKS) { 6517: ctxt->keepBlanks = 0; 6518: ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace; 6519: options -= XML_PARSE_NOBLANKS; 6520: ctxt->options |= XML_PARSE_NOBLANKS; 6521: } else 6522: ctxt->keepBlanks = 1; 6523: if (options & HTML_PARSE_RECOVER) { 6524: ctxt->recovery = 1; 6525: options -= HTML_PARSE_RECOVER; 6526: } else 6527: ctxt->recovery = 0; 6528: if (options & HTML_PARSE_COMPACT) { 6529: ctxt->options |= HTML_PARSE_COMPACT; 6530: options -= HTML_PARSE_COMPACT; 6531: } 6532: if (options & XML_PARSE_HUGE) { 6533: ctxt->options |= XML_PARSE_HUGE; 6534: options -= XML_PARSE_HUGE; 6535: } 6536: if (options & HTML_PARSE_NODEFDTD) { 6537: ctxt->options |= HTML_PARSE_NODEFDTD; 6538: options -= HTML_PARSE_NODEFDTD; 6539: } 6540: ctxt->dictNames = 0; 6541: return (options); 6542: } 6543: 6544: /** 6545: * htmlDoRead: 6546: * @ctxt: an HTML parser context 6547: * @URL: the base URL to use for the document 6548: * @encoding: the document encoding, or NULL 6549: * @options: a combination of htmlParserOption(s) 6550: * @reuse: keep the context for reuse 6551: * 6552: * Common front-end for the htmlRead functions 6553: * 6554: * Returns the resulting document tree or NULL 6555: */ 
6556: static htmlDocPtr 6557: htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding, 6558: int options, int reuse) 6559: { 6560: htmlDocPtr ret; 6561: 6562: htmlCtxtUseOptions(ctxt, options); 6563: ctxt->html = 1; 6564: if (encoding != NULL) { 6565: xmlCharEncodingHandlerPtr hdlr; 6566: 6567: hdlr = xmlFindCharEncodingHandler(encoding); 6568: if (hdlr != NULL) { 6569: xmlSwitchToEncoding(ctxt, hdlr); 6570: if (ctxt->input->encoding != NULL) 6571: xmlFree((xmlChar *) ctxt->input->encoding); 6572: ctxt->input->encoding = xmlStrdup((xmlChar *)encoding); 6573: } 6574: } 6575: if ((URL != NULL) && (ctxt->input != NULL) && 6576: (ctxt->input->filename == NULL)) 6577: ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL); 6578: htmlParseDocument(ctxt); 6579: ret = ctxt->myDoc; 6580: ctxt->myDoc = NULL; 6581: if (!reuse) { 6582: if ((ctxt->dictNames) && 6583: (ret != NULL) && 6584: (ret->dict == ctxt->dict)) 6585: ctxt->dict = NULL; 6586: xmlFreeParserCtxt(ctxt); 6587: } 6588: return (ret); 6589: } 6590: 6591: /** 6592: * htmlReadDoc: 6593: * @cur: a pointer to a zero terminated string 6594: * @URL: the base URL to use for the document 6595: * @encoding: the document encoding, or NULL 6596: * @options: a combination of htmlParserOption(s) 6597: * 6598: * parse an XML in-memory document and build a tree. 
6599: * 6600: * Returns the resulting document tree 6601: */ 6602: htmlDocPtr 6603: htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options) 6604: { 6605: htmlParserCtxtPtr ctxt; 6606: 6607: if (cur == NULL) 6608: return (NULL); 6609: 6610: xmlInitParser(); 6611: ctxt = htmlCreateDocParserCtxt(cur, NULL); 6612: if (ctxt == NULL) 6613: return (NULL); 6614: return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6615: } 6616: 6617: /** 6618: * htmlReadFile: 6619: * @filename: a file or URL 6620: * @encoding: the document encoding, or NULL 6621: * @options: a combination of htmlParserOption(s) 6622: * 6623: * parse an XML file from the filesystem or the network. 6624: * 6625: * Returns the resulting document tree 6626: */ 6627: htmlDocPtr 6628: htmlReadFile(const char *filename, const char *encoding, int options) 6629: { 6630: htmlParserCtxtPtr ctxt; 6631: 6632: xmlInitParser(); 6633: ctxt = htmlCreateFileParserCtxt(filename, encoding); 6634: if (ctxt == NULL) 6635: return (NULL); 6636: return (htmlDoRead(ctxt, NULL, NULL, options, 0)); 6637: } 6638: 6639: /** 6640: * htmlReadMemory: 6641: * @buffer: a pointer to a char array 6642: * @size: the size of the array 6643: * @URL: the base URL to use for the document 6644: * @encoding: the document encoding, or NULL 6645: * @options: a combination of htmlParserOption(s) 6646: * 6647: * parse an XML in-memory document and build a tree. 
6648: * 6649: * Returns the resulting document tree 6650: */ 6651: htmlDocPtr 6652: htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options) 6653: { 6654: htmlParserCtxtPtr ctxt; 6655: 6656: xmlInitParser(); 6657: ctxt = xmlCreateMemoryParserCtxt(buffer, size); 6658: if (ctxt == NULL) 6659: return (NULL); 6660: htmlDefaultSAXHandlerInit(); 6661: if (ctxt->sax != NULL) 6662: memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1)); 6663: return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6664: } 6665: 6666: /** 6667: * htmlReadFd: 6668: * @fd: an open file descriptor 6669: * @URL: the base URL to use for the document 6670: * @encoding: the document encoding, or NULL 6671: * @options: a combination of htmlParserOption(s) 6672: * 6673: * parse an XML from a file descriptor and build a tree. 6674: * 6675: * Returns the resulting document tree 6676: */ 6677: htmlDocPtr 6678: htmlReadFd(int fd, const char *URL, const char *encoding, int options) 6679: { 6680: htmlParserCtxtPtr ctxt; 6681: xmlParserInputBufferPtr input; 6682: xmlParserInputPtr stream; 6683: 6684: if (fd < 0) 6685: return (NULL); 6686: 6687: xmlInitParser(); 6688: input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); 6689: if (input == NULL) 6690: return (NULL); 6691: ctxt = xmlNewParserCtxt(); 6692: if (ctxt == NULL) { 6693: xmlFreeParserInputBuffer(input); 6694: return (NULL); 6695: } 6696: stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6697: if (stream == NULL) { 6698: xmlFreeParserInputBuffer(input); 6699: xmlFreeParserCtxt(ctxt); 6700: return (NULL); 6701: } 6702: inputPush(ctxt, stream); 6703: return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6704: } 6705: 6706: /** 6707: * htmlReadIO: 6708: * @ioread: an I/O read function 6709: * @ioclose: an I/O close function 6710: * @ioctx: an I/O handler 6711: * @URL: the base URL to use for the document 6712: * @encoding: the document encoding, or NULL 6713: * @options: a 
combination of htmlParserOption(s)
 *
 * parse an HTML document from I/O functions and source and build a tree.
 *
 * Returns the resulting document tree, or NULL on failure
 */
htmlDocPtr
htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
           void *ioctx, const char *URL, const char *encoding, int options)
{
    htmlParserCtxtPtr parser;
    xmlParserInputBufferPtr iobuf;
    xmlParserInputPtr in;

    /* A read callback is mandatory. */
    if (ioread == NULL)
        return (NULL);
    xmlInitParser();

    iobuf = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
                                         XML_CHAR_ENCODING_NONE);
    if (iobuf == NULL)
        return (NULL);

    parser = htmlNewParserCtxt();
    if (parser == NULL) {
        xmlFreeParserInputBuffer(iobuf);
        return (NULL);
    }

    in = xmlNewIOInputStream(parser, iobuf, XML_CHAR_ENCODING_NONE);
    if (in == NULL) {
        /* Release both the buffer and the context created above. */
        xmlFreeParserInputBuffer(iobuf);
        xmlFreeParserCtxt(parser);
        return (NULL);
    }

    inputPush(parser, in);
    return (htmlDoRead(parser, URL, encoding, options, 0));
}

/**
 * htmlCtxtReadDoc:
 * @ctxt:  an HTML parser context
 * @cur:  a pointer to a zero terminated string
 * @URL:  the base URL to use for the document
 * @encoding:  the document encoding, or NULL
 * @options:  a combination of htmlParserOption(s)
 *
 * parse an HTML in-memory document and build a tree.
6759: * This reuses the existing @ctxt parser context 6760: * 6761: * Returns the resulting document tree 6762: */ 6763: htmlDocPtr 6764: htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur, 6765: const char *URL, const char *encoding, int options) 6766: { 6767: xmlParserInputPtr stream; 6768: 6769: if (cur == NULL) 6770: return (NULL); 6771: if (ctxt == NULL) 6772: return (NULL); 6773: 6774: htmlCtxtReset(ctxt); 6775: 6776: stream = xmlNewStringInputStream(ctxt, cur); 6777: if (stream == NULL) { 6778: return (NULL); 6779: } 6780: inputPush(ctxt, stream); 6781: return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6782: } 6783: 6784: /** 6785: * htmlCtxtReadFile: 6786: * @ctxt: an HTML parser context 6787: * @filename: a file or URL 6788: * @encoding: the document encoding, or NULL 6789: * @options: a combination of htmlParserOption(s) 6790: * 6791: * parse an XML file from the filesystem or the network. 6792: * This reuses the existing @ctxt parser context 6793: * 6794: * Returns the resulting document tree 6795: */ 6796: htmlDocPtr 6797: htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename, 6798: const char *encoding, int options) 6799: { 6800: xmlParserInputPtr stream; 6801: 6802: if (filename == NULL) 6803: return (NULL); 6804: if (ctxt == NULL) 6805: return (NULL); 6806: 6807: htmlCtxtReset(ctxt); 6808: 6809: stream = xmlLoadExternalEntity(filename, NULL, ctxt); 6810: if (stream == NULL) { 6811: return (NULL); 6812: } 6813: inputPush(ctxt, stream); 6814: return (htmlDoRead(ctxt, NULL, encoding, options, 1)); 6815: } 6816: 6817: /** 6818: * htmlCtxtReadMemory: 6819: * @ctxt: an HTML parser context 6820: * @buffer: a pointer to a char array 6821: * @size: the size of the array 6822: * @URL: the base URL to use for the document 6823: * @encoding: the document encoding, or NULL 6824: * @options: a combination of htmlParserOption(s) 6825: * 6826: * parse an XML in-memory document and build a tree. 
6827: * This reuses the existing @ctxt parser context 6828: * 6829: * Returns the resulting document tree 6830: */ 6831: htmlDocPtr 6832: htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size, 6833: const char *URL, const char *encoding, int options) 6834: { 6835: xmlParserInputBufferPtr input; 6836: xmlParserInputPtr stream; 6837: 6838: if (ctxt == NULL) 6839: return (NULL); 6840: if (buffer == NULL) 6841: return (NULL); 6842: 6843: htmlCtxtReset(ctxt); 6844: 6845: input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE); 6846: if (input == NULL) { 6847: return(NULL); 6848: } 6849: 6850: stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6851: if (stream == NULL) { 6852: xmlFreeParserInputBuffer(input); 6853: return(NULL); 6854: } 6855: 6856: inputPush(ctxt, stream); 6857: return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6858: } 6859: 6860: /** 6861: * htmlCtxtReadFd: 6862: * @ctxt: an HTML parser context 6863: * @fd: an open file descriptor 6864: * @URL: the base URL to use for the document 6865: * @encoding: the document encoding, or NULL 6866: * @options: a combination of htmlParserOption(s) 6867: * 6868: * parse an XML from a file descriptor and build a tree. 
6869: * This reuses the existing @ctxt parser context 6870: * 6871: * Returns the resulting document tree 6872: */ 6873: htmlDocPtr 6874: htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd, 6875: const char *URL, const char *encoding, int options) 6876: { 6877: xmlParserInputBufferPtr input; 6878: xmlParserInputPtr stream; 6879: 6880: if (fd < 0) 6881: return (NULL); 6882: if (ctxt == NULL) 6883: return (NULL); 6884: 6885: htmlCtxtReset(ctxt); 6886: 6887: 6888: input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); 6889: if (input == NULL) 6890: return (NULL); 6891: stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6892: if (stream == NULL) { 6893: xmlFreeParserInputBuffer(input); 6894: return (NULL); 6895: } 6896: inputPush(ctxt, stream); 6897: return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6898: } 6899: 6900: /** 6901: * htmlCtxtReadIO: 6902: * @ctxt: an HTML parser context 6903: * @ioread: an I/O read function 6904: * @ioclose: an I/O close function 6905: * @ioctx: an I/O handler 6906: * @URL: the base URL to use for the document 6907: * @encoding: the document encoding, or NULL 6908: * @options: a combination of htmlParserOption(s) 6909: * 6910: * parse an HTML document from I/O functions and source and build a tree. 
6911: * This reuses the existing @ctxt parser context 6912: * 6913: * Returns the resulting document tree 6914: */ 6915: htmlDocPtr 6916: htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread, 6917: xmlInputCloseCallback ioclose, void *ioctx, 6918: const char *URL, 6919: const char *encoding, int options) 6920: { 6921: xmlParserInputBufferPtr input; 6922: xmlParserInputPtr stream; 6923: 6924: if (ioread == NULL) 6925: return (NULL); 6926: if (ctxt == NULL) 6927: return (NULL); 6928: 6929: htmlCtxtReset(ctxt); 6930: 6931: input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx, 6932: XML_CHAR_ENCODING_NONE); 6933: if (input == NULL) 6934: return (NULL); 6935: stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6936: if (stream == NULL) { 6937: xmlFreeParserInputBuffer(input); 6938: return (NULL); 6939: } 6940: inputPush(ctxt, stream); 6941: return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6942: } 6943: 6944: #define bottom_HTMLparser 6945: #include "elfgcchack.h" 6946: #endif /* LIBXML_HTML_ENABLED */