embedaddon/libxml2/HTMLparser.c - view

File: [ELWIX - Embedded LightWeight unIX -] / embedaddon / libxml2 / HTMLparser.c
Revision 1.1.1.2 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Mon Jul 22 01:22:19 2013 UTC (11 years ago) by misho
Branches: libxml2, MAIN
CVS tags: v2_8_0p0, v2_8_0, HEAD

2.8.0

/*
 * HTMLparser.c : an HTML 4.0 non-verifying parser
 *
 * See Copyright for the status of this software.
 *
 * daniel@veillard.com
 */

#define IN_LIBXML
#include "libxml.h"
#ifdef LIBXML_HTML_ENABLED

#include <string.h>
#ifdef HAVE_CTYPE_H
#include <ctype.h>
#endif
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_ZLIB_H
#include <zlib.h>
#endif

#include <libxml/xmlmemory.h>
#include <libxml/tree.h>
#include <libxml/parser.h>
#include <libxml/parserInternals.h>
#include <libxml/xmlerror.h>
#include <libxml/HTMLparser.h>
#include <libxml/HTMLtree.h>
#include <libxml/entities.h>
#include <libxml/encoding.h>
#include <libxml/valid.h>
#include <libxml/xmlIO.h>
#include <libxml/globals.h>
#include <libxml/uri.h>

#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/* #define DEBUG */
/* #define DEBUG_PUSH */

static int htmlOmittedDefaultValue = 1;

xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
                             xmlChar end, xmlChar end2, xmlChar end3);
static void htmlParseComment(htmlParserCtxtPtr ctxt);

/************************************************************************
 *                                                                      *
 *              Some factorized error routines                          *
 *                                                                      *
 ************************************************************************/

/**
 * htmlErrMemory:
 * @ctxt:  an HTML parser context, may be NULL
 * @extra:  extra information appended to the message, may be NULL
 *
 * Handle an out of memory condition. Nothing is done when the context
 * already has SAX disabled and is in the XML_PARSER_EOF state. Otherwise
 * XML_ERR_NO_MEMORY is recorded in the context, the parser is moved to the
 * XML_PARSER_EOF state with SAX disabled, and a fatal error is raised in
 * the XML_FROM_PARSER domain. With a NULL context only the error is raised.
 */
static void
htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
{
    /* the context was already stopped: stay silent */
    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
        (ctxt->instate == XML_PARSER_EOF))
        return;
    if (ctxt != NULL) {
        ctxt->errNo = XML_ERR_NO_MEMORY;
        ctxt->instate = XML_PARSER_EOF;
        ctxt->disableSAX = 1;
    }
    /* the two calls differ only by the message and the @extra string */
    if (extra)
        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
                        NULL, NULL, 0, 0,
                        "Memory allocation failed : %s\n", extra);
    else
        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
                        NULL, NULL, 0, 0, "Memory allocation failed\n");
}

/**
 * htmlParseErr:
 * @ctxt:  an HTML parser context, may be NULL
 * @error:  the error number
 * @msg:  the error message, used as the format string
 * @str1:  first string argument for @msg
 * @str2:  second string argument for @msg
 *
 * Report a parser error in the XML_FROM_HTML domain at the XML_ERR_ERROR
 * level. @error is recorded in the context and the document is flagged as
 * not well formed. Nothing is done when the context already has SAX
 * disabled and is in the XML_PARSER_EOF state.
 */
static void
htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
             const char *msg, const xmlChar *str1, const xmlChar *str2)
{
    /* the context was already stopped: stay silent */
    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
        (ctxt->instate == XML_PARSER_EOF))
        return;
    if (ctxt != NULL)
        ctxt->errNo = error;
    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
                    XML_ERR_ERROR, NULL, 0,
                    (const char *) str1, (const char *) str2,
                    NULL, 0, 0,
                    msg, str1, str2);
    if (ctxt != NULL)
        ctxt->wellFormed = 0;
}

/**
 * htmlParseErrInt:
 * @ctxt:  an HTML parser context
 * @error:  the error number
 * @msg:  the error message
 * @val:  integer info
 *
 * Handle a fatal parser error, i.e.
violating Well-Formedness constraints 131: */ 132: static void 133: htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error, 134: const char *msg, int val) 135: { 136: if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 137: (ctxt->instate == XML_PARSER_EOF)) 138: return; 139: if (ctxt != NULL) 140: ctxt->errNo = error; 141: __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, 142: XML_ERR_ERROR, NULL, 0, NULL, NULL, 143: NULL, val, 0, msg, val); 144: if (ctxt != NULL) 145: ctxt->wellFormed = 0; 146: } 147: 148: /************************************************************************ 149: * * 150: * Parser stacks related functions and macros * 151: * * 152: ************************************************************************/ 153: 154: /** 155: * htmlnamePush: 156: * @ctxt: an HTML parser context 157: * @value: the element name 158: * 159: * Pushes a new element name on top of the name stack 160: * 161: * Returns 0 in case of error, the index in the stack otherwise 162: */ 163: static int 164: htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value) 165: { 166: if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head"))) 167: ctxt->html = 3; 168: if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body"))) 169: ctxt->html = 10; 170: if (ctxt->nameNr >= ctxt->nameMax) { 171: ctxt->nameMax *= 2; 172: ctxt->nameTab = (const xmlChar * *) 173: xmlRealloc((xmlChar * *)ctxt->nameTab, 174: ctxt->nameMax * 175: sizeof(ctxt->nameTab[0])); 176: if (ctxt->nameTab == NULL) { 177: htmlErrMemory(ctxt, NULL); 178: return (0); 179: } 180: } 181: ctxt->nameTab[ctxt->nameNr] = value; 182: ctxt->name = value; 183: return (ctxt->nameNr++); 184: } 185: /** 186: * htmlnamePop: 187: * @ctxt: an HTML parser context 188: * 189: * Pops the top element name from the name stack 190: * 191: * Returns the name just removed 192: */ 193: static const xmlChar * 194: htmlnamePop(htmlParserCtxtPtr ctxt) 195: { 196: const xmlChar *ret; 197: 198: if (ctxt->nameNr <= 0) 
199: return (NULL); 200: ctxt->nameNr--; 201: if (ctxt->nameNr < 0) 202: return (NULL); 203: if (ctxt->nameNr > 0) 204: ctxt->name = ctxt->nameTab[ctxt->nameNr - 1]; 205: else 206: ctxt->name = NULL; 207: ret = ctxt->nameTab[ctxt->nameNr]; 208: ctxt->nameTab[ctxt->nameNr] = NULL; 209: return (ret); 210: } 211: 212: /** 213: * htmlNodeInfoPush: 214: * @ctxt: an HTML parser context 215: * @value: the node info 216: * 217: * Pushes a new element name on top of the node info stack 218: * 219: * Returns 0 in case of error, the index in the stack otherwise 220: */ 221: static int 222: htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value) 223: { 224: if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) { 225: if (ctxt->nodeInfoMax == 0) 226: ctxt->nodeInfoMax = 5; 227: ctxt->nodeInfoMax *= 2; 228: ctxt->nodeInfoTab = (htmlParserNodeInfo *) 229: xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab, 230: ctxt->nodeInfoMax * 231: sizeof(ctxt->nodeInfoTab[0])); 232: if (ctxt->nodeInfoTab == NULL) { 233: htmlErrMemory(ctxt, NULL); 234: return (0); 235: } 236: } 237: ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value; 238: ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr]; 239: return (ctxt->nodeInfoNr++); 240: } 241: 242: /** 243: * htmlNodeInfoPop: 244: * @ctxt: an HTML parser context 245: * 246: * Pops the top element name from the node info stack 247: * 248: * Returns 0 in case of error, the pointer to NodeInfo otherwise 249: */ 250: static htmlParserNodeInfo * 251: htmlNodeInfoPop(htmlParserCtxtPtr ctxt) 252: { 253: if (ctxt->nodeInfoNr <= 0) 254: return (NULL); 255: ctxt->nodeInfoNr--; 256: if (ctxt->nodeInfoNr < 0) 257: return (NULL); 258: if (ctxt->nodeInfoNr > 0) 259: ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1]; 260: else 261: ctxt->nodeInfo = NULL; 262: return &ctxt->nodeInfoTab[ctxt->nodeInfoNr]; 263: } 264: 265: /* 266: * Macros for accessing the content. Those should be used only by the parser, 267: * and not exported. 268: * 269: * Dirty macros, i.e. 
one need to make assumption on the context to use them 270: * 271: * CUR_PTR return the current pointer to the xmlChar to be parsed. 272: * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled 273: * in ISO-Latin or UTF-8, and the current 16 bit value if compiled 274: * in UNICODE mode. This should be used internally by the parser 275: * only to compare to ASCII values otherwise it would break when 276: * running with UTF-8 encoding. 277: * NXT(n) returns the n'th next xmlChar. Same as CUR is should be used only 278: * to compare on ASCII based substring. 279: * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR 280: * it should be used only to compare on ASCII based substring. 281: * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined 282: * strings without newlines within the parser. 283: * 284: * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding 285: * 286: * CURRENT Returns the current char value, with the full decoding of 287: * UTF-8 if we are using this mode. It returns an int. 288: * NEXT Skip to the next character, this does the proper decoding 289: * in UTF-8 mode. It also pop-up unfinished entities on the fly. 290: * NEXTL(l) Skip the current unicode character of l xmlChars long. 
291: * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly 292: */ 293: 294: #define UPPER (toupper(*ctxt->input->cur)) 295: 296: #define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val) 297: 298: #define NXT(val) ctxt->input->cur[(val)] 299: 300: #define UPP(val) (toupper(ctxt->input->cur[(val)])) 301: 302: #define CUR_PTR ctxt->input->cur 303: 304: #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \ 305: (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \ 306: xmlParserInputShrink(ctxt->input) 307: 308: #define GROW if ((ctxt->progressive == 0) && \ 309: (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \ 310: xmlParserInputGrow(ctxt->input, INPUT_CHUNK) 311: 312: #define CURRENT ((int) (*ctxt->input->cur)) 313: 314: #define SKIP_BLANKS htmlSkipBlankChars(ctxt) 315: 316: /* Inported from XML */ 317: 318: /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */ 319: #define CUR ((int) (*ctxt->input->cur)) 320: #define NEXT xmlNextChar(ctxt) 321: 322: #define RAW (ctxt->token ? 
-1 : (*ctxt->input->cur)) 323: 324: 325: #define NEXTL(l) do { \ 326: if (*(ctxt->input->cur) == '\n') { \ 327: ctxt->input->line++; ctxt->input->col = 1; \ 328: } else ctxt->input->col++; \ 329: ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \ 330: } while (0) 331: 332: /************ 333: \ 334: if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \ 335: if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt); 336: ************/ 337: 338: #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l) 339: #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l) 340: 341: #define COPY_BUF(l,b,i,v) \ 342: if (l == 1) b[i++] = (xmlChar) v; \ 343: else i += xmlCopyChar(l,&b[i],v) 344: 345: /** 346: * htmlFindEncoding: 347: * @the HTML parser context 348: * 349: * Ty to find and encoding in the current data available in the input 350: * buffer this is needed to try to switch to the proper encoding when 351: * one face a character error. 352: * That's an heuristic, since it's operating outside of parsing it could 353: * try to use a meta which had been commented out, that's the reason it 354: * should only be used in case of error, not as a default. 
355: * 356: * Returns an encoding string or NULL if not found, the string need to 357: * be freed 358: */ 359: static xmlChar * 360: htmlFindEncoding(xmlParserCtxtPtr ctxt) { 361: const xmlChar *start, *cur, *end; 362: 363: if ((ctxt == NULL) || (ctxt->input == NULL) || 364: (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) || 365: (ctxt->input->buf->encoder != NULL)) 366: return(NULL); 367: if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL)) 368: return(NULL); 369: 370: start = ctxt->input->cur; 371: end = ctxt->input->end; 372: /* we also expect the input buffer to be zero terminated */ 373: if (*end != 0) 374: return(NULL); 375: 376: cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV"); 377: if (cur == NULL) 378: return(NULL); 379: cur = xmlStrcasestr(cur, BAD_CAST "CONTENT"); 380: if (cur == NULL) 381: return(NULL); 382: cur = xmlStrcasestr(cur, BAD_CAST "CHARSET="); 383: if (cur == NULL) 384: return(NULL); 385: cur += 8; 386: start = cur; 387: while (((*cur >= 'A') && (*cur <= 'Z')) || 388: ((*cur >= 'a') && (*cur <= 'z')) || 389: ((*cur >= '0') && (*cur <= '9')) || 390: (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/')) 391: cur++; 392: if (cur == start) 393: return(NULL); 394: return(xmlStrndup(start, cur - start)); 395: } 396: 397: /** 398: * htmlCurrentChar: 399: * @ctxt: the HTML parser context 400: * @len: pointer to the length of the char read 401: * 402: * The current char value, if using UTF-8 this may actually span multiple 403: * bytes in the input buffer. Implement the end of line normalization: 404: * 2.11 End-of-Line Handling 405: * If the encoding is unspecified, in the case we find an ISO-Latin-1 406: * char, then the encoding converter is plugged in automatically. 
*
 * Returns the current char value and its length. When the parser is
 * already in the XML_PARSER_EOF state 0 is returned and @len is not
 * written. When a token is pending, the token is returned with a length
 * of 0.
 */

static int
htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
    /* note: *len is left untouched on this path */
    if (ctxt->instate == XML_PARSER_EOF)
        return(0);

    if (ctxt->token != 0) {
        *len = 0;
        return(ctxt->token);
    }
    if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
        /*
         * We are supposed to handle UTF8, check it's valid
         * From rfc2044: encoding of the Unicode values on UTF-8:
         *
         * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
         * 0000 0000-0000 007F   0xxxxxxx
         * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
         * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
         *
         * Check for the 0x110000 limit too
         */
        const unsigned char *cur = ctxt->input->cur;
        unsigned char c;
        unsigned int val;

        c = *cur;
        if (c & 0x80) {
            /*
             * A zero where a continuation byte is expected triggers an
             * attempt to grow the input; cur is re-read afterwards since
             * the grow call is given the input and may update it.
             */
            if (cur[1] == 0) {
                xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
                cur = ctxt->input->cur;
            }
            if ((cur[1] & 0xc0) != 0x80)
                goto encoding_error;
            if ((c & 0xe0) == 0xe0) {

                if (cur[2] == 0) {
                    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
                    cur = ctxt->input->cur;
                }
                if ((cur[2] & 0xc0) != 0x80)
                    goto encoding_error;
                if ((c & 0xf0) == 0xf0) {
                    if (cur[3] == 0) {
                        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
                        cur = ctxt->input->cur;
                    }
                    /* lead bytes 0xf8 and above are rejected here */
                    if (((c & 0xf8) != 0xf0) ||
                        ((cur[3] & 0xc0) != 0x80))
                        goto encoding_error;
                    /* 4-byte code */
                    *len = 4;
                    val = (cur[0] & 0x7) << 18;
                    val |= (cur[1] & 0x3f) << 12;
                    val |= (cur[2] & 0x3f) << 6;
                    val |= cur[3] & 0x3f;
                } else {
                    /* 3-byte code */
                    *len = 3;
                    val = (cur[0] & 0xf) << 12;
                    val |= (cur[1] & 0x3f) << 6;
                    val |= cur[2] & 0x3f;
                }
            } else {
                /* 2-byte code */
                *len = 2;
                val = (cur[0] & 0x1f) << 6;
                val |= cur[1] & 0x3f;
            }
            /* an out of range value is reported but still returned */
            if (!IS_CHAR(val)) {
                htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
                                "Char 0x%X out of allowed range\n", val);
            }
            return(val);
        } else {
            /*
             * A zero byte located before the end of the input is
             * reported and handed back to the caller as a space.
             */
            if ((*ctxt->input->cur == 0) &&
                (ctxt->input->cur < ctxt->input->end)) {
                htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
                                "Char 0x%X out of allowed range\n", 0);
                *len = 1;
                return(' ');
            }
            /* 1-byte code */
            *len = 1;
            return((int) *ctxt->input->cur);
        }
    }
    /*
     * Assume it's a fixed length encoding (1) with
     * a compatible encoding for the ASCII set, since
     * XML constructs only use < 128 chars
     */
    *len = 1;
    if ((int) *ctxt->input->cur < 0x80)
        return((int) *ctxt->input->cur);

    /*
     * Humm this is bad, do an automatic flow conversion:
     * use the encoding guessed by htmlFindEncoding() if there is one,
     * ISO-Latin-1 otherwise, then treat the input as UTF-8 from now on.
     */
    {
        xmlChar * guess;
        xmlCharEncodingHandlerPtr handler;

        guess = htmlFindEncoding(ctxt);
        if (guess == NULL) {
            xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
        } else {
            /* the guessed string is stored in, and now owned by, the input */
            if (ctxt->input->encoding != NULL)
                xmlFree((xmlChar *) ctxt->input->encoding);
            ctxt->input->encoding = guess;
            handler = xmlFindCharEncodingHandler((const char *) guess);
            if (handler != NULL) {
                xmlSwitchToEncoding(ctxt, handler);
            } else {
                htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
                             "Unsupported encoding %s", guess, NULL);
            }
        }
        ctxt->charset = XML_CHAR_ENCODING_UTF8;
    }

    return(xmlCurrentChar(ctxt, len));

encoding_error:
    /*
     * If we detect an UTF8 error that probably mean that the
     * input encoding didn't get properly advertized in the
     * declaration header. Report the error and switch the encoding
     * to ISO-Latin-1 (if you don't like this policy, just declare the
     * encoding !)
     */
    {
        char buffer[150];

        /* dump up to four of the offending bytes when that many remain */
        if (ctxt->input->end - ctxt->input->cur >= 4) {
            snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
                     ctxt->input->cur[0], ctxt->input->cur[1],
                     ctxt->input->cur[2], ctxt->input->cur[3]);
        } else {
            snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
        }
        htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
                     "Input is not proper UTF-8, indicate encoding !\n",
                     BAD_CAST buffer, NULL);
    }

    /* only the charset flag is changed here; the raw byte is returned */
    ctxt->charset = XML_CHAR_ENCODING_8859_1;
    *len = 1;
    return((int) *ctxt->input->cur);
}

/**
 * htmlSkipBlankChars:
 * @ctxt:  the HTML parser context
 *
 * skip all blanks character found at that point in the input streams,
 * keeping the line and column counters of the input up to date.
 *
 * Returns the number of space chars skipped
 */

static int
htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
    int res = 0;

    while (IS_BLANK_CH(*(ctxt->input->cur))) {
        /*
         * NOTE(review): this branch needs IS_BLANK_CH() to accept a zero
         * byte; the macro is defined outside this file section, confirm
         * whether the branch can be reached at all.
         */
        if ((*ctxt->input->cur == 0) &&
            (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
            xmlPopInput(ctxt);
        } else {
            if (*(ctxt->input->cur) == '\n') {
                ctxt->input->line++; ctxt->input->col = 1;
            } else ctxt->input->col++;
            ctxt->input->cur++;
            ctxt->nbChars++;
            /* a zero after the blank: try to get more data */
            if (*ctxt->input->cur == 0)
                xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
        }
        res++;
    }
    return(res);
}



/************************************************************************
 *                                                                      *
 *              The list of HTML elements and their properties          *
 *                                                                      *
 ************************************************************************/

/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2
means that this element is valid only in the Frameset DTD 608: * 609: * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description 610: , subElements , impliedsubelt , Attributes, userdata 611: */ 612: 613: /* Definitions and a couple of vars for HTML Elements */ 614: 615: #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small" 616: #define NB_FONTSTYLE 8 617: #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym" 618: #define NB_PHRASE 10 619: #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe" 620: #define NB_SPECIAL 16 621: #define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL 622: #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL 623: #define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address" 624: #define NB_BLOCK NB_HEADING + NB_LIST + 14 625: #define FORMCTRL "input", "select", "textarea", "label", "button" 626: #define NB_FORMCTRL 5 627: #define PCDATA 628: #define NB_PCDATA 0 629: #define HEADING "h1", "h2", "h3", "h4", "h5", "h6" 630: #define NB_HEADING 6 631: #define LIST "ul", "ol", "dir", "menu" 632: #define NB_LIST 4 633: #define MODIFIER 634: #define NB_MODIFIER 0 635: #define FLOW BLOCK,INLINE 636: #define NB_FLOW NB_BLOCK + NB_INLINE 637: #define EMPTY NULL 638: 639: 640: static const char* const html_flow[] = { FLOW, NULL } ; 641: static const char* const html_inline[] = { INLINE, NULL } ; 642: 643: /* placeholders: elts with content but no subelements */ 644: static const char* const html_pcdata[] = { NULL } ; 645: #define html_cdata html_pcdata 646: 647: 648: /* ... 
and for HTML Attributes */ 649: 650: #define COREATTRS "id", "class", "style", "title" 651: #define NB_COREATTRS 4 652: #define I18N "lang", "dir" 653: #define NB_I18N 2 654: #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup" 655: #define NB_EVENTS 9 656: #define ATTRS COREATTRS,I18N,EVENTS 657: #define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS 658: #define CELLHALIGN "align", "char", "charoff" 659: #define NB_CELLHALIGN 3 660: #define CELLVALIGN "valign" 661: #define NB_CELLVALIGN 1 662: 663: static const char* const html_attrs[] = { ATTRS, NULL } ; 664: static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ; 665: static const char* const core_attrs[] = { COREATTRS, NULL } ; 666: static const char* const i18n_attrs[] = { I18N, NULL } ; 667: 668: 669: /* Other declarations that should go inline ... */ 670: static const char* const a_attrs[] = { ATTRS, "charset", "type", "name", 671: "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords", 672: "tabindex", "onfocus", "onblur", NULL } ; 673: static const char* const target_attr[] = { "target", NULL } ; 674: static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ; 675: static const char* const alt_attr[] = { "alt", NULL } ; 676: static const char* const src_alt_attrs[] = { "src", "alt", NULL } ; 677: static const char* const href_attrs[] = { "href", NULL } ; 678: static const char* const clear_attrs[] = { "clear", NULL } ; 679: static const char* const inline_p[] = { INLINE, "p", NULL } ; 680: 681: static const char* const flow_param[] = { FLOW, "param", NULL } ; 682: static const char* const applet_attrs[] = { COREATTRS , "codebase", 683: "archive", "alt", "name", "height", "width", "align", 684: "hspace", "vspace", NULL } ; 685: static const char* const area_attrs[] = { "shape", "coords", "href", "nohref", 686: "tabindex", "accesskey", "onfocus", "onblur", NULL } ; 687: static const char* 
const basefont_attrs[] = 688: { "id", "size", "color", "face", NULL } ; 689: static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ; 690: static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ; 691: static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ; 692: static const char* const body_depr[] = { "background", "bgcolor", "text", 693: "link", "vlink", "alink", NULL } ; 694: static const char* const button_attrs[] = { ATTRS, "name", "value", "type", 695: "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ; 696: 697: 698: static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ; 699: static const char* const col_elt[] = { "col", NULL } ; 700: static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ; 701: static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ; 702: static const char* const dl_contents[] = { "dt", "dd", NULL } ; 703: static const char* const compact_attr[] = { "compact", NULL } ; 704: static const char* const label_attr[] = { "label", NULL } ; 705: static const char* const fieldset_contents[] = { FLOW, "legend" } ; 706: static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ; 707: static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ; 708: static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ; 709: static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ; 710: static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ; 711: static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } 
; 712: static const char* const head_attrs[] = { I18N, "profile", NULL } ; 713: static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ; 714: static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ; 715: static const char* const version_attr[] = { "version", NULL } ; 716: static const char* const html_content[] = { "head", "body", "frameset", NULL } ; 717: static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ; 718: static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ; 719: static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ; 720: static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ; 721: static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ; 722: static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ; 723: static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ; 724: static const char* const align_attr[] = { "align", NULL } ; 725: static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ; 726: static const char* const map_contents[] = { BLOCK, "area", NULL } ; 727: static const char* const name_attr[] = { "name", NULL } ; 728: static const char* const action_attr[] = { "action", NULL } ; 729: static const char* const blockli_elt[] = { BLOCK, "li", NULL } ; 730: 
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ; 731: static const char* const content_attr[] = { "content", NULL } ; 732: static const char* const type_attr[] = { "type", NULL } ; 733: static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ; 734: static const char* const object_contents[] = { FLOW, "param", NULL } ; 735: static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ; 736: static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ; 737: static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ; 738: static const char* const option_elt[] = { "option", NULL } ; 739: static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ; 740: static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ; 741: static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ; 742: static const char* const width_attr[] = { "width", NULL } ; 743: static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ; 744: static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ; 745: static const char* const language_attr[] = { "language", NULL } ; 746: static const char* const select_content[] = { "optgroup", "option", NULL } ; 747: static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ; 748: static const char* const style_attrs[] = { I18N, "media", "title", NULL } ; 749: static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ; 750: static const 
char* const table_depr[] = { "align", "bgcolor", NULL } ; 751: static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ; 752: static const char* const tr_elt[] = { "tr", NULL } ; 753: static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ; 754: static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ; 755: static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ; 756: static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ; 757: static const char* const tr_contents[] = { "th", "td", NULL } ; 758: static const char* const bgcolor_attr[] = { "bgcolor", NULL } ; 759: static const char* const li_elt[] = { "li", NULL } ; 760: static const char* const ul_depr[] = { "type", "compact", NULL} ; 761: static const char* const dir_attr[] = { "dir", NULL} ; 762: 763: #define DECL (const char**) 764: 765: static const htmlElemDesc 766: html40ElementTable[] = { 767: { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ", 768: DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL 769: }, 770: { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form", 771: DECL html_inline , NULL , DECL html_attrs, NULL, NULL 772: }, 773: { "acronym", 0, 0, 0, 0, 0, 0, 1, "", 774: DECL html_inline , NULL , DECL html_attrs, NULL, NULL 775: }, 776: { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ", 777: DECL inline_p , NULL , DECL html_attrs, NULL, NULL 778: }, 779: { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ", 780: DECL flow_param , NULL , NULL , DECL applet_attrs, NULL 781: }, 782: { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ", 783: EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr 784: }, 785: { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style", 786: DECL html_inline , NULL 
, DECL html_attrs, NULL, NULL 787: }, 788: { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ", 789: EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs 790: }, 791: { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " , 792: EMPTY , NULL , NULL, DECL basefont_attrs, NULL 793: }, 794: { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ", 795: DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr 796: }, 797: { "big", 0, 3, 0, 0, 0, 0, 1, "large text style", 798: DECL html_inline , NULL , DECL html_attrs, NULL, NULL 799: }, 800: { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ", 801: DECL html_flow , NULL , DECL quote_attrs , NULL, NULL 802: }, 803: { "body", 1, 1, 0, 0, 0, 0, 0, "document body ", 804: DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL 805: }, 806: { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ", 807: EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL 808: }, 809: { "button", 0, 0, 0, 0, 0, 0, 2, "push button ", 810: DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL 811: }, 812: { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ", 813: DECL html_inline , NULL , DECL html_attrs, NULL, NULL 814: }, 815: { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ", 816: DECL html_flow , NULL , NULL, DECL html_attrs, NULL 817: }, 818: { "cite", 0, 0, 0, 0, 0, 0, 1, "citation", 819: DECL html_inline , NULL , DECL html_attrs, NULL, NULL 820: }, 821: { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment", 822: DECL html_inline , NULL , DECL html_attrs, NULL, NULL 823: }, 824: { "col", 0, 2, 2, 1, 0, 0, 0, "table column ", 825: EMPTY , NULL , DECL col_attrs , NULL, NULL 826: }, 827: { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ", 828: DECL col_elt , "col" , DECL col_attrs , NULL, NULL 829: }, 830: { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ", 831: DECL html_flow , NULL , DECL html_attrs, NULL, NULL 832: }, 833: { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ", 834: DECL 
html_flow , NULL , DECL edit_attrs , NULL, NULL 835: }, 836: { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition", 837: DECL html_inline , NULL , DECL html_attrs, NULL, NULL 838: }, 839: { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list", 840: DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL 841: }, 842: { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container", 843: DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL 844: }, 845: { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ", 846: DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL 847: }, 848: { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ", 849: DECL html_inline, NULL, DECL html_attrs, NULL, NULL 850: }, 851: { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis", 852: DECL html_inline, NULL, DECL html_attrs, NULL, NULL 853: }, 854: { "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ", 855: EMPTY, NULL, DECL embed_attrs, NULL, NULL 856: }, 857: { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ", 858: DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL 859: }, 860: { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ", 861: DECL html_inline, NULL, NULL, DECL font_attrs, NULL 862: }, 863: { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ", 864: DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr 865: }, 866: { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " , 867: EMPTY, NULL, NULL, DECL frame_attrs, NULL 868: }, 869: { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" , 870: DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL 871: }, 872: { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ", 873: DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 874: }, 875: { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ", 876: DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 877: }, 878: { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ", 879: DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 880: }, 881: { "h4", 0, 0, 0, 0, 0, 0, 
0, "heading ", 882: DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 883: }, 884: { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ", 885: DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 886: }, 887: { "h6", 0, 0, 0, 0, 0, 0, 0, "heading ", 888: DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 889: }, 890: { "head", 1, 1, 0, 0, 0, 0, 0, "document head ", 891: DECL head_contents, NULL, DECL head_attrs, NULL, NULL 892: }, 893: { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " , 894: EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL 895: }, 896: { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ", 897: DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL 898: }, 899: { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style", 900: DECL html_inline, NULL, DECL html_attrs, NULL, NULL 901: }, 902: { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ", 903: DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL 904: }, 905: { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ", 906: EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs 907: }, 908: { "input", 0, 2, 2, 1, 0, 0, 1, "form control ", 909: EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL 910: }, 911: { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text", 912: DECL html_flow, NULL, DECL edit_attrs, NULL, NULL 913: }, 914: { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ", 915: EMPTY, NULL, NULL, DECL prompt_attrs, NULL 916: }, 917: { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user", 918: DECL html_inline, NULL, DECL html_attrs, NULL, NULL 919: }, 920: { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ", 921: DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL 922: }, 923: { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ", 924: DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL 925: }, 926: { "li", 0, 1, 1, 0, 0, 0, 0, "list item ", 927: DECL html_flow, NULL, DECL html_attrs, NULL, NULL 928: }, 929: { "link", 0, 2, 2, 1, 0, 0, 
0, "a media-independent link ", 930: EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL 931: }, 932: { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ", 933: DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr 934: }, 935: { "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ", 936: DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL 937: }, 938: { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ", 939: EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr 940: }, 941: { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ", 942: DECL noframes_content, "body" , DECL html_attrs, NULL, NULL 943: }, 944: { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ", 945: DECL html_flow, "div", DECL html_attrs, NULL, NULL 946: }, 947: { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ", 948: DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL 949: }, 950: { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ", 951: DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL 952: }, 953: { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ", 954: DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr 955: }, 956: { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " , 957: DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL 958: }, 959: { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ", 960: DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL 961: }, 962: { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ", 963: EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr 964: }, 965: { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ", 966: DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL 967: }, 968: { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ", 969: DECL html_inline, NULL, DECL quote_attrs, NULL, NULL 970: }, 971: { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style", 972: DECL html_inline, NULL, NULL, DECL html_attrs, 
NULL 973: }, 974: { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.", 975: DECL html_inline, NULL, DECL html_attrs, NULL, NULL 976: }, 977: { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ", 978: DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr 979: }, 980: { "select", 0, 0, 0, 0, 0, 0, 1, "option selector ", 981: DECL select_content, NULL, DECL select_attrs, NULL, NULL 982: }, 983: { "small", 0, 3, 0, 0, 0, 0, 1, "small text style", 984: DECL html_inline, NULL, DECL html_attrs, NULL, NULL 985: }, 986: { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ", 987: DECL html_inline, NULL, DECL html_attrs, NULL, NULL 988: }, 989: { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text", 990: DECL html_inline, NULL, NULL, DECL html_attrs, NULL 991: }, 992: { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis", 993: DECL html_inline, NULL, DECL html_attrs, NULL, NULL 994: }, 995: { "style", 0, 0, 0, 0, 0, 0, 0, "style info ", 996: DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr 997: }, 998: { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript", 999: DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1000: }, 1001: { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ", 1002: DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1003: }, 1004: { "table", 0, 0, 0, 0, 0, 0, 0, "", 1005: DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL 1006: }, 1007: { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ", 1008: DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 1009: }, 1010: { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell", 1011: DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL 1012: }, 1013: { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ", 1014: DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr 1015: }, 1016: { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ", 1017: DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 1018: }, 1019: { "th", 0, 1, 0, 0, 0, 0, 0, "table header 
cell", 1020: DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL 1021: }, 1022: { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ", 1023: DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL 1024: }, 1025: { "title", 0, 0, 0, 0, 0, 0, 0, "document title ", 1026: DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL 1027: }, 1028: { "tr", 0, 0, 0, 0, 0, 0, 0, "table row ", 1029: DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL 1030: }, 1031: { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style", 1032: DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1033: }, 1034: { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style", 1035: DECL html_inline, NULL, NULL, DECL html_attrs, NULL 1036: }, 1037: { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ", 1038: DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL 1039: }, 1040: { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument", 1041: DECL html_inline, NULL, DECL html_attrs, NULL, NULL 1042: } 1043: }; 1044: 1045: /* 1046: * start tags that imply the end of current element 1047: */ 1048: static const char * const htmlStartClose[] = { 1049: "form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6", 1050: "dl", "ul", "ol", "menu", "dir", "address", "pre", 1051: "listing", "xmp", "head", NULL, 1052: "head", "p", NULL, 1053: "title", "p", NULL, 1054: "body", "head", "style", "link", "title", "p", NULL, 1055: "frameset", "head", "style", "link", "title", "p", NULL, 1056: "li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address", 1057: "pre", "listing", "xmp", "head", "li", NULL, 1058: "hr", "p", "head", NULL, 1059: "h1", "p", "head", NULL, 1060: "h2", "p", "head", NULL, 1061: "h3", "p", "head", NULL, 1062: "h4", "p", "head", NULL, 1063: "h5", "p", "head", NULL, 1064: "h6", "p", "head", NULL, 1065: "dir", "p", "head", NULL, 1066: "address", "p", "head", "ul", NULL, 1067: "pre", "p", "head", "ul", NULL, 1068: "listing", "p", "head", NULL, 1069: "xmp", "p", "head", NULL, 
1070: "blockquote", "p", "head", NULL, 1071: "dl", "p", "dt", "menu", "dir", "address", "pre", "listing", 1072: "xmp", "head", NULL, 1073: "dt", "p", "menu", "dir", "address", "pre", "listing", "xmp", 1074: "head", "dd", NULL, 1075: "dd", "p", "menu", "dir", "address", "pre", "listing", "xmp", 1076: "head", "dt", NULL, 1077: "ul", "p", "head", "ol", "menu", "dir", "address", "pre", 1078: "listing", "xmp", NULL, 1079: "ol", "p", "head", "ul", NULL, 1080: "menu", "p", "head", "ul", NULL, 1081: "p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL, 1082: "div", "p", "head", NULL, 1083: "noscript", "p", NULL, 1084: "center", "font", "b", "i", "p", "head", NULL, 1085: "a", "a", NULL, 1086: "caption", "p", NULL, 1087: "colgroup", "caption", "colgroup", "col", "p", NULL, 1088: "col", "caption", "col", "p", NULL, 1089: "table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", 1090: "listing", "xmp", "a", NULL, 1091: "th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL, 1092: "td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL, 1093: "tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL, 1094: "thead", "caption", "col", "colgroup", NULL, 1095: "tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead", 1096: "tbody", "p", NULL, 1097: "tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead", 1098: "tfoot", "tbody", "p", NULL, 1099: "optgroup", "option", NULL, 1100: "option", "option", NULL, 1101: "fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", 1102: "pre", "listing", "xmp", "a", NULL, 1103: NULL 1104: }; 1105: 1106: /* 1107: * The list of HTML elements which are supposed not to have 1108: * CDATA content and where a p element will be implied 1109: * 1110: * TODO: extend that list by reading the HTML SGML DTD on 1111: * implied paragraph 1112: */ 1113: static const char *const htmlNoContentElements[] = { 1114: "html", 1115: "head", 1116: NULL 1117: }; 1118: 1119: /* 1120: * The list 
of HTML attributes which are of content %Script; 1121: * NOTE: when adding ones, check htmlIsScriptAttribute() since 1122: * it assumes the name starts with 'on' 1123: */ 1124: static const char *const htmlScriptAttributes[] = { 1125: "onclick", 1126: "ondblclick", 1127: "onmousedown", 1128: "onmouseup", 1129: "onmouseover", 1130: "onmousemove", 1131: "onmouseout", 1132: "onkeypress", 1133: "onkeydown", 1134: "onkeyup", 1135: "onload", 1136: "onunload", 1137: "onfocus", 1138: "onblur", 1139: "onsubmit", 1140: "onrest", 1141: "onchange", 1142: "onselect" 1143: }; 1144: 1145: /* 1146: * This table is used by the htmlparser to know what to do with 1147: * broken html pages. By assigning different priorities to different 1148: * elements the parser can decide how to handle extra endtags. 1149: * Endtags are only allowed to close elements with lower or equal 1150: * priority. 1151: */ 1152: 1153: typedef struct { 1154: const char *name; 1155: int priority; 1156: } elementPriority; 1157: 1158: static const elementPriority htmlEndPriority[] = { 1159: {"div", 150}, 1160: {"td", 160}, 1161: {"th", 160}, 1162: {"tr", 170}, 1163: {"thead", 180}, 1164: {"tbody", 180}, 1165: {"tfoot", 180}, 1166: {"table", 190}, 1167: {"head", 200}, 1168: {"body", 200}, 1169: {"html", 220}, 1170: {NULL, 100} /* Default priority */ 1171: }; 1172: 1173: static const char** htmlStartCloseIndex[100]; 1174: static int htmlStartCloseIndexinitialized = 0; 1175: 1176: /************************************************************************ 1177: * * 1178: * functions to handle HTML specific data * 1179: * * 1180: ************************************************************************/ 1181: 1182: /** 1183: * htmlInitAutoClose: 1184: * 1185: * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. 1186: * This is not reentrant. Call xmlInitParser() once before processing in 1187: * case of use in multithreaded programs. 
1188: */ 1189: void 1190: htmlInitAutoClose(void) { 1191: int indx, i = 0; 1192: 1193: if (htmlStartCloseIndexinitialized) return; 1194: 1195: for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL; 1196: indx = 0; 1197: while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) { 1198: htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i]; 1199: while (htmlStartClose[i] != NULL) i++; 1200: i++; 1201: } 1202: htmlStartCloseIndexinitialized = 1; 1203: } 1204: 1205: /** 1206: * htmlTagLookup: 1207: * @tag: The tag name in lowercase 1208: * 1209: * Lookup the HTML tag in the ElementTable 1210: * 1211: * Returns the related htmlElemDescPtr or NULL if not found. 1212: */ 1213: const htmlElemDesc * 1214: htmlTagLookup(const xmlChar *tag) { 1215: unsigned int i; 1216: 1217: for (i = 0; i < (sizeof(html40ElementTable) / 1218: sizeof(html40ElementTable[0]));i++) { 1219: if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name)) 1220: return((htmlElemDescPtr) &html40ElementTable[i]); 1221: } 1222: return(NULL); 1223: } 1224: 1225: /** 1226: * htmlGetEndPriority: 1227: * @name: The name of the element to look up the priority for. 1228: * 1229: * Return value: The "endtag" priority. 1230: **/ 1231: static int 1232: htmlGetEndPriority (const xmlChar *name) { 1233: int i = 0; 1234: 1235: while ((htmlEndPriority[i].name != NULL) && 1236: (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name))) 1237: i++; 1238: 1239: return(htmlEndPriority[i].priority); 1240: } 1241: 1242: 1243: /** 1244: * htmlCheckAutoClose: 1245: * @newtag: The new tag name 1246: * @oldtag: The old tag name 1247: * 1248: * Checks whether the new tag is one of the registered valid tags for 1249: * closing old. 1250: * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. 1251: * 1252: * Returns 0 if no, 1 if yes. 
1253: */ 1254: static int 1255: htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag) 1256: { 1257: int i, indx; 1258: const char **closed = NULL; 1259: 1260: if (htmlStartCloseIndexinitialized == 0) 1261: htmlInitAutoClose(); 1262: 1263: /* inefficient, but not a big deal */ 1264: for (indx = 0; indx < 100; indx++) { 1265: closed = htmlStartCloseIndex[indx]; 1266: if (closed == NULL) 1267: return (0); 1268: if (xmlStrEqual(BAD_CAST * closed, newtag)) 1269: break; 1270: } 1271: 1272: i = closed - htmlStartClose; 1273: i++; 1274: while (htmlStartClose[i] != NULL) { 1275: if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) { 1276: return (1); 1277: } 1278: i++; 1279: } 1280: return (0); 1281: } 1282: 1283: /** 1284: * htmlAutoCloseOnClose: 1285: * @ctxt: an HTML parser context 1286: * @newtag: The new tag name 1287: * @force: force the tag closure 1288: * 1289: * The HTML DTD allows an ending tag to implicitly close other tags. 1290: */ 1291: static void 1292: htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) 1293: { 1294: const htmlElemDesc *info; 1295: int i, priority; 1296: 1297: priority = htmlGetEndPriority(newtag); 1298: 1299: for (i = (ctxt->nameNr - 1); i >= 0; i--) { 1300: 1301: if (xmlStrEqual(newtag, ctxt->nameTab[i])) 1302: break; 1303: /* 1304: * A missplaced endtag can only close elements with lower 1305: * or equal priority, so if we find an element with higher 1306: * priority before we find an element with 1307: * matching name, we just ignore this endtag 1308: */ 1309: if (htmlGetEndPriority(ctxt->nameTab[i]) > priority) 1310: return; 1311: } 1312: if (i < 0) 1313: return; 1314: 1315: while (!xmlStrEqual(newtag, ctxt->name)) { 1316: info = htmlTagLookup(ctxt->name); 1317: if ((info != NULL) && (info->endTag == 3)) { 1318: htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 1319: "Opening and ending tag mismatch: %s and %s\n", 1320: newtag, ctxt->name); 1321: } 1322: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != 
NULL)) 1323: ctxt->sax->endElement(ctxt->userData, ctxt->name); 1324: htmlnamePop(ctxt); 1325: } 1326: } 1327: 1328: /** 1329: * htmlAutoCloseOnEnd: 1330: * @ctxt: an HTML parser context 1331: * 1332: * Close all remaining tags at the end of the stream 1333: */ 1334: static void 1335: htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) 1336: { 1337: int i; 1338: 1339: if (ctxt->nameNr == 0) 1340: return; 1341: for (i = (ctxt->nameNr - 1); i >= 0; i--) { 1342: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1343: ctxt->sax->endElement(ctxt->userData, ctxt->name); 1344: htmlnamePop(ctxt); 1345: } 1346: } 1347: 1348: /** 1349: * htmlAutoClose: 1350: * @ctxt: an HTML parser context 1351: * @newtag: The new tag name or NULL 1352: * 1353: * The HTML DTD allows a tag to implicitly close other tags. 1354: * The list is kept in htmlStartClose array. This function is 1355: * called when a new tag has been detected and generates the 1356: * appropriates closes if possible/needed. 1357: * If newtag is NULL this mean we are at the end of the resource 1358: * and we should check 1359: */ 1360: static void 1361: htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) 1362: { 1363: while ((newtag != NULL) && (ctxt->name != NULL) && 1364: (htmlCheckAutoClose(newtag, ctxt->name))) { 1365: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1366: ctxt->sax->endElement(ctxt->userData, ctxt->name); 1367: htmlnamePop(ctxt); 1368: } 1369: if (newtag == NULL) { 1370: htmlAutoCloseOnEnd(ctxt); 1371: return; 1372: } 1373: while ((newtag == NULL) && (ctxt->name != NULL) && 1374: ((xmlStrEqual(ctxt->name, BAD_CAST "head")) || 1375: (xmlStrEqual(ctxt->name, BAD_CAST "body")) || 1376: (xmlStrEqual(ctxt->name, BAD_CAST "html")))) { 1377: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1378: ctxt->sax->endElement(ctxt->userData, ctxt->name); 1379: htmlnamePop(ctxt); 1380: } 1381: } 1382: 1383: /** 1384: * htmlAutoCloseTag: 1385: * @doc: the HTML document 1386: * 
@name: The tag name 1387: * @elem: the HTML element 1388: * 1389: * The HTML DTD allows a tag to implicitly close other tags. 1390: * The list is kept in htmlStartClose array. This function checks 1391: * if the element or one of it's children would autoclose the 1392: * given tag. 1393: * 1394: * Returns 1 if autoclose, 0 otherwise 1395: */ 1396: int 1397: htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) { 1398: htmlNodePtr child; 1399: 1400: if (elem == NULL) return(1); 1401: if (xmlStrEqual(name, elem->name)) return(0); 1402: if (htmlCheckAutoClose(elem->name, name)) return(1); 1403: child = elem->children; 1404: while (child != NULL) { 1405: if (htmlAutoCloseTag(doc, name, child)) return(1); 1406: child = child->next; 1407: } 1408: return(0); 1409: } 1410: 1411: /** 1412: * htmlIsAutoClosed: 1413: * @doc: the HTML document 1414: * @elem: the HTML element 1415: * 1416: * The HTML DTD allows a tag to implicitly close other tags. 1417: * The list is kept in htmlStartClose array. 
This function checks 1418: * if a tag is autoclosed by one of it's child 1419: * 1420: * Returns 1 if autoclosed, 0 otherwise 1421: */ 1422: int 1423: htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) { 1424: htmlNodePtr child; 1425: 1426: if (elem == NULL) return(1); 1427: child = elem->children; 1428: while (child != NULL) { 1429: if (htmlAutoCloseTag(doc, elem->name, child)) return(1); 1430: child = child->next; 1431: } 1432: return(0); 1433: } 1434: 1435: /** 1436: * htmlCheckImplied: 1437: * @ctxt: an HTML parser context 1438: * @newtag: The new tag name 1439: * 1440: * The HTML DTD allows a tag to exists only implicitly 1441: * called when a new tag has been detected and generates the 1442: * appropriates implicit tags if missing 1443: */ 1444: static void 1445: htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) { 1446: int i; 1447: 1448: if (ctxt->options & HTML_PARSE_NOIMPLIED) 1449: return; 1450: if (!htmlOmittedDefaultValue) 1451: return; 1452: if (xmlStrEqual(newtag, BAD_CAST"html")) 1453: return; 1454: if (ctxt->nameNr <= 0) { 1455: htmlnamePush(ctxt, BAD_CAST"html"); 1456: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1457: ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL); 1458: } 1459: if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head"))) 1460: return; 1461: if ((ctxt->nameNr <= 1) && 1462: ((xmlStrEqual(newtag, BAD_CAST"script")) || 1463: (xmlStrEqual(newtag, BAD_CAST"style")) || 1464: (xmlStrEqual(newtag, BAD_CAST"meta")) || 1465: (xmlStrEqual(newtag, BAD_CAST"link")) || 1466: (xmlStrEqual(newtag, BAD_CAST"title")) || 1467: (xmlStrEqual(newtag, BAD_CAST"base")))) { 1468: if (ctxt->html >= 3) { 1469: /* we already saw or generated an <head> before */ 1470: return; 1471: } 1472: /* 1473: * dropped OBJECT ... i you put it first BODY will be 1474: * assumed ! 
1475: */ 1476: htmlnamePush(ctxt, BAD_CAST"head"); 1477: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1478: ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL); 1479: } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) && 1480: (!xmlStrEqual(newtag, BAD_CAST"frame")) && 1481: (!xmlStrEqual(newtag, BAD_CAST"frameset"))) { 1482: if (ctxt->html >= 10) { 1483: /* we already saw or generated a <body> before */ 1484: return; 1485: } 1486: for (i = 0;i < ctxt->nameNr;i++) { 1487: if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) { 1488: return; 1489: } 1490: if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) { 1491: return; 1492: } 1493: } 1494: 1495: htmlnamePush(ctxt, BAD_CAST"body"); 1496: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1497: ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL); 1498: } 1499: } 1500: 1501: /** 1502: * htmlCheckParagraph 1503: * @ctxt: an HTML parser context 1504: * 1505: * Check whether a p element need to be implied before inserting 1506: * characters in the current element. 1507: * 1508: * Returns 1 if a paragraph has been inserted, 0 if not and -1 1509: * in case of error. 
1510: */ 1511: 1512: static int 1513: htmlCheckParagraph(htmlParserCtxtPtr ctxt) { 1514: const xmlChar *tag; 1515: int i; 1516: 1517: if (ctxt == NULL) 1518: return(-1); 1519: tag = ctxt->name; 1520: if (tag == NULL) { 1521: htmlAutoClose(ctxt, BAD_CAST"p"); 1522: htmlCheckImplied(ctxt, BAD_CAST"p"); 1523: htmlnamePush(ctxt, BAD_CAST"p"); 1524: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1525: ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); 1526: return(1); 1527: } 1528: if (!htmlOmittedDefaultValue) 1529: return(0); 1530: for (i = 0; htmlNoContentElements[i] != NULL; i++) { 1531: if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) { 1532: htmlAutoClose(ctxt, BAD_CAST"p"); 1533: htmlCheckImplied(ctxt, BAD_CAST"p"); 1534: htmlnamePush(ctxt, BAD_CAST"p"); 1535: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1536: ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); 1537: return(1); 1538: } 1539: } 1540: return(0); 1541: } 1542: 1543: /** 1544: * htmlIsScriptAttribute: 1545: * @name: an attribute name 1546: * 1547: * Check if an attribute is of content type Script 1548: * 1549: * Returns 1 is the attribute is a script 0 otherwise 1550: */ 1551: int 1552: htmlIsScriptAttribute(const xmlChar *name) { 1553: unsigned int i; 1554: 1555: if (name == NULL) 1556: return(0); 1557: /* 1558: * all script attributes start with 'on' 1559: */ 1560: if ((name[0] != 'o') || (name[1] != 'n')) 1561: return(0); 1562: for (i = 0; 1563: i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]); 1564: i++) { 1565: if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i])) 1566: return(1); 1567: } 1568: return(0); 1569: } 1570: 1571: /************************************************************************ 1572: * * 1573: * The list of HTML predefined entities * 1574: * * 1575: ************************************************************************/ 1576: 1577: 1578: static const htmlEntityDesc html40EntitiesTable[] = { 
1579: /* 1580: * the 4 absolute ones, plus apostrophe. 1581: */ 1582: { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" }, 1583: { 38, "amp", "ampersand, U+0026 ISOnum" }, 1584: { 39, "apos", "single quote" }, 1585: { 60, "lt", "less-than sign, U+003C ISOnum" }, 1586: { 62, "gt", "greater-than sign, U+003E ISOnum" }, 1587: 1588: /* 1589: * A bunch still in the 128-255 range 1590: * Replacing them depend really on the charset used. 1591: */ 1592: { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" }, 1593: { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" }, 1594: { 162, "cent", "cent sign, U+00A2 ISOnum" }, 1595: { 163, "pound","pound sign, U+00A3 ISOnum" }, 1596: { 164, "curren","currency sign, U+00A4 ISOnum" }, 1597: { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" }, 1598: { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" }, 1599: { 167, "sect", "section sign, U+00A7 ISOnum" }, 1600: { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" }, 1601: { 169, "copy", "copyright sign, U+00A9 ISOnum" }, 1602: { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" }, 1603: { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" }, 1604: { 172, "not", "not sign, U+00AC ISOnum" }, 1605: { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" }, 1606: { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" }, 1607: { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" }, 1608: { 176, "deg", "degree sign, U+00B0 ISOnum" }, 1609: { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" }, 1610: { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" }, 1611: { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" }, 1612: { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" }, 1613: { 181, "micro","micro sign, U+00B5 ISOnum" }, 1614: { 182, 
"para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" }, 1615: { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" }, 1616: { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" }, 1617: { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" }, 1618: { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" }, 1619: { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" }, 1620: { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" }, 1621: { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" }, 1622: { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" }, 1623: { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" }, 1624: { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" }, 1625: { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" }, 1626: { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" }, 1627: { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" }, 1628: { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" }, 1629: { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" }, 1630: { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" }, 1631: { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" }, 1632: { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" }, 1633: { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" }, 1634: { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" }, 1635: { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" }, 1636: { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" }, 1637: { 205, "Iacute","latin capital letter I with acute, 
U+00CD ISOlat1" }, 1638: { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" }, 1639: { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" }, 1640: { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" }, 1641: { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" }, 1642: { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" }, 1643: { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" }, 1644: { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" }, 1645: { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" }, 1646: { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" }, 1647: { 215, "times","multiplication sign, U+00D7 ISOnum" }, 1648: { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" }, 1649: { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" }, 1650: { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" }, 1651: { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" }, 1652: { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" }, 1653: { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" }, 1654: { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" }, 1655: { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" }, 1656: { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" }, 1657: { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" }, 1658: { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" }, 1659: { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" }, 1660: { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" }, 1661: { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" }, 1662: { 230, "aelig","latin small letter ae = latin small 
ligature ae, U+00E6 ISOlat1" }, 1663: { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" }, 1664: { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" }, 1665: { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" }, 1666: { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" }, 1667: { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" }, 1668: { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" }, 1669: { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" }, 1670: { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" }, 1671: { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" }, 1672: { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" }, 1673: { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" }, 1674: { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" }, 1675: { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" }, 1676: { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" }, 1677: { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" }, 1678: { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" }, 1679: { 247, "divide","division sign, U+00F7 ISOnum" }, 1680: { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" }, 1681: { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" }, 1682: { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" }, 1683: { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" }, 1684: { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" }, 1685: { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" }, 1686: { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" }, 1687: { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" }, 1688: 1689: { 338, "OElig","latin capital ligature OE, U+0152 
ISOlat2" }, 1690: { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" }, 1691: { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" }, 1692: { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" }, 1693: { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" }, 1694: 1695: /* 1696: * Anything below should really be kept as entities references 1697: */ 1698: { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" }, 1699: 1700: { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" }, 1701: { 732, "tilde","small tilde, U+02DC ISOdia" }, 1702: 1703: { 913, "Alpha","greek capital letter alpha, U+0391" }, 1704: { 914, "Beta", "greek capital letter beta, U+0392" }, 1705: { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" }, 1706: { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" }, 1707: { 917, "Epsilon","greek capital letter epsilon, U+0395" }, 1708: { 918, "Zeta", "greek capital letter zeta, U+0396" }, 1709: { 919, "Eta", "greek capital letter eta, U+0397" }, 1710: { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" }, 1711: { 921, "Iota", "greek capital letter iota, U+0399" }, 1712: { 922, "Kappa","greek capital letter kappa, U+039A" }, 1713: { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" }, 1714: { 924, "Mu", "greek capital letter mu, U+039C" }, 1715: { 925, "Nu", "greek capital letter nu, U+039D" }, 1716: { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" }, 1717: { 927, "Omicron","greek capital letter omicron, U+039F" }, 1718: { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" }, 1719: { 929, "Rho", "greek capital letter rho, U+03A1" }, 1720: { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" }, 1721: { 932, "Tau", "greek capital letter tau, U+03A4" }, 1722: { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" }, 1723: { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" }, 1724: { 935, "Chi", "greek capital 
letter chi, U+03A7" }, 1725: { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" }, 1726: { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" }, 1727: 1728: { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" }, 1729: { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" }, 1730: { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" }, 1731: { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" }, 1732: { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" }, 1733: { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" }, 1734: { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" }, 1735: { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" }, 1736: { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" }, 1737: { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" }, 1738: { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" }, 1739: { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" }, 1740: { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" }, 1741: { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" }, 1742: { 959, "omicron","greek small letter omicron, U+03BF NEW" }, 1743: { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" }, 1744: { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" }, 1745: { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" }, 1746: { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" }, 1747: { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" }, 1748: { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" }, 1749: { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" }, 1750: { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" }, 1751: { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" }, 1752: { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" }, 1753: { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" }, 1754: { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" }, 1755: { 982, "piv", "greek pi symbol, 
U+03D6 ISOgrk3" }, 1756: 1757: { 8194, "ensp", "en space, U+2002 ISOpub" }, 1758: { 8195, "emsp", "em space, U+2003 ISOpub" }, 1759: { 8201, "thinsp","thin space, U+2009 ISOpub" }, 1760: { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" }, 1761: { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" }, 1762: { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" }, 1763: { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" }, 1764: { 8211, "ndash","en dash, U+2013 ISOpub" }, 1765: { 8212, "mdash","em dash, U+2014 ISOpub" }, 1766: { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" }, 1767: { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" }, 1768: { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" }, 1769: { 8220, "ldquo","left double quotation mark, U+201C ISOnum" }, 1770: { 8221, "rdquo","right double quotation mark, U+201D ISOnum" }, 1771: { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" }, 1772: { 8224, "dagger","dagger, U+2020 ISOpub" }, 1773: { 8225, "Dagger","double dagger, U+2021 ISOpub" }, 1774: 1775: { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" }, 1776: { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" }, 1777: 1778: { 8240, "permil","per mille sign, U+2030 ISOtech" }, 1779: 1780: { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" }, 1781: { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" }, 1782: 1783: { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" }, 1784: { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" }, 1785: 1786: { 8254, "oline","overline = spacing overscore, U+203E NEW" }, 1787: { 8260, "frasl","fraction slash, U+2044 NEW" }, 1788: 1789: { 8364, "euro", "euro sign, U+20AC NEW" }, 1790: 1791: { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" }, 1792: { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" }, 1793: { 8476, 
"real", "blackletter capital R = real part symbol, U+211C ISOamso" }, 1794: { 8482, "trade","trade mark sign, U+2122 ISOnum" }, 1795: { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" }, 1796: { 8592, "larr", "leftwards arrow, U+2190 ISOnum" }, 1797: { 8593, "uarr", "upwards arrow, U+2191 ISOnum" }, 1798: { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" }, 1799: { 8595, "darr", "downwards arrow, U+2193 ISOnum" }, 1800: { 8596, "harr", "left right arrow, U+2194 ISOamsa" }, 1801: { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" }, 1802: { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" }, 1803: { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" }, 1804: { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" }, 1805: { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" }, 1806: { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" }, 1807: 1808: { 8704, "forall","for all, U+2200 ISOtech" }, 1809: { 8706, "part", "partial differential, U+2202 ISOtech" }, 1810: { 8707, "exist","there exists, U+2203 ISOtech" }, 1811: { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" }, 1812: { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" }, 1813: { 8712, "isin", "element of, U+2208 ISOtech" }, 1814: { 8713, "notin","not an element of, U+2209 ISOtech" }, 1815: { 8715, "ni", "contains as member, U+220B ISOtech" }, 1816: { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" }, 1817: { 8721, "sum", "n-ary summation, U+2211 ISOamsb" }, 1818: { 8722, "minus","minus sign, U+2212 ISOtech" }, 1819: { 8727, "lowast","asterisk operator, U+2217 ISOtech" }, 1820: { 8730, "radic","square root = radical sign, U+221A ISOtech" }, 1821: { 8733, "prop", "proportional to, U+221D ISOtech" }, 1822: { 8734, "infin","infinity, U+221E ISOtech" }, 1823: { 8736, "ang", "angle, U+2220 ISOamso" }, 1824: { 8743, "and", "logical and = wedge, U+2227 ISOtech" }, 1825: { 8744, "or", "logical or = 
vee, U+2228 ISOtech" }, 1826: { 8745, "cap", "intersection = cap, U+2229 ISOtech" }, 1827: { 8746, "cup", "union = cup, U+222A ISOtech" }, 1828: { 8747, "int", "integral, U+222B ISOtech" }, 1829: { 8756, "there4","therefore, U+2234 ISOtech" }, 1830: { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" }, 1831: { 8773, "cong", "approximately equal to, U+2245 ISOtech" }, 1832: { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" }, 1833: { 8800, "ne", "not equal to, U+2260 ISOtech" }, 1834: { 8801, "equiv","identical to, U+2261 ISOtech" }, 1835: { 8804, "le", "less-than or equal to, U+2264 ISOtech" }, 1836: { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" }, 1837: { 8834, "sub", "subset of, U+2282 ISOtech" }, 1838: { 8835, "sup", "superset of, U+2283 ISOtech" }, 1839: { 8836, "nsub", "not a subset of, U+2284 ISOamsn" }, 1840: { 8838, "sube", "subset of or equal to, U+2286 ISOtech" }, 1841: { 8839, "supe", "superset of or equal to, U+2287 ISOtech" }, 1842: { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" }, 1843: { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" }, 1844: { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" }, 1845: { 8901, "sdot", "dot operator, U+22C5 ISOamsb" }, 1846: { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" }, 1847: { 8969, "rceil","right ceiling, U+2309 ISOamsc" }, 1848: { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" }, 1849: { 8971, "rfloor","right floor, U+230B ISOamsc" }, 1850: { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" }, 1851: { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" }, 1852: { 9674, "loz", "lozenge, U+25CA ISOpub" }, 1853: 1854: { 9824, "spades","black spade suit, U+2660 ISOpub" }, 1855: { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" }, 1856: { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" }, 1857: { 9830, "diams","black diamond suit, 
U+2666 ISOpub" }, 1858: 1859: }; 1860: 1861: /************************************************************************ 1862: * * 1863: * Commodity functions to handle entities * 1864: * * 1865: ************************************************************************/ 1866: 1867: /* 1868: * Macro used to grow the current buffer. 1869: */ 1870: #define growBuffer(buffer) { \ 1871: xmlChar *tmp; \ 1872: buffer##_size *= 2; \ 1873: tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \ 1874: if (tmp == NULL) { \ 1875: htmlErrMemory(ctxt, "growing buffer\n"); \ 1876: xmlFree(buffer); \ 1877: return(NULL); \ 1878: } \ 1879: buffer = tmp; \ 1880: } 1881: 1882: /** 1883: * htmlEntityLookup: 1884: * @name: the entity name 1885: * 1886: * Lookup the given entity in EntitiesTable 1887: * 1888: * TODO: the linear scan is really ugly, an hash table is really needed. 1889: * 1890: * Returns the associated htmlEntityDescPtr if found, NULL otherwise. 1891: */ 1892: const htmlEntityDesc * 1893: htmlEntityLookup(const xmlChar *name) { 1894: unsigned int i; 1895: 1896: for (i = 0;i < (sizeof(html40EntitiesTable)/ 1897: sizeof(html40EntitiesTable[0]));i++) { 1898: if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) { 1899: return((htmlEntityDescPtr) &html40EntitiesTable[i]); 1900: } 1901: } 1902: return(NULL); 1903: } 1904: 1905: /** 1906: * htmlEntityValueLookup: 1907: * @value: the entity's unicode value 1908: * 1909: * Lookup the given entity in EntitiesTable 1910: * 1911: * TODO: the linear scan is really ugly, an hash table is really needed. 1912: * 1913: * Returns the associated htmlEntityDescPtr if found, NULL otherwise. 
1914: */ 1915: const htmlEntityDesc * 1916: htmlEntityValueLookup(unsigned int value) { 1917: unsigned int i; 1918: 1919: for (i = 0;i < (sizeof(html40EntitiesTable)/ 1920: sizeof(html40EntitiesTable[0]));i++) { 1921: if (html40EntitiesTable[i].value >= value) { 1922: if (html40EntitiesTable[i].value > value) 1923: break; 1924: return((htmlEntityDescPtr) &html40EntitiesTable[i]); 1925: } 1926: } 1927: return(NULL); 1928: } 1929: 1930: /** 1931: * UTF8ToHtml: 1932: * @out: a pointer to an array of bytes to store the result 1933: * @outlen: the length of @out 1934: * @in: a pointer to an array of UTF-8 chars 1935: * @inlen: the length of @in 1936: * 1937: * Take a block of UTF-8 chars in and try to convert it to an ASCII 1938: * plus HTML entities block of chars out. 1939: * 1940: * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise 1941: * The value of @inlen after return is the number of octets consumed 1942: * as the return value is positive, else unpredictable. 1943: * The value of @outlen after return is the number of octets consumed. 
1944: */ 1945: int 1946: UTF8ToHtml(unsigned char* out, int *outlen, 1947: const unsigned char* in, int *inlen) { 1948: const unsigned char* processed = in; 1949: const unsigned char* outend; 1950: const unsigned char* outstart = out; 1951: const unsigned char* instart = in; 1952: const unsigned char* inend; 1953: unsigned int c, d; 1954: int trailing; 1955: 1956: if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1); 1957: if (in == NULL) { 1958: /* 1959: * initialization nothing to do 1960: */ 1961: *outlen = 0; 1962: *inlen = 0; 1963: return(0); 1964: } 1965: inend = in + (*inlen); 1966: outend = out + (*outlen); 1967: while (in < inend) { 1968: d = *in++; 1969: if (d < 0x80) { c= d; trailing= 0; } 1970: else if (d < 0xC0) { 1971: /* trailing byte in leading position */ 1972: *outlen = out - outstart; 1973: *inlen = processed - instart; 1974: return(-2); 1975: } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } 1976: else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } 1977: else if (d < 0xF8) { c= d & 0x07; trailing= 3; } 1978: else { 1979: /* no chance for this in Ascii */ 1980: *outlen = out - outstart; 1981: *inlen = processed - instart; 1982: return(-2); 1983: } 1984: 1985: if (inend - in < trailing) { 1986: break; 1987: } 1988: 1989: for ( ; trailing; trailing--) { 1990: if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) 1991: break; 1992: c <<= 6; 1993: c |= d & 0x3F; 1994: } 1995: 1996: /* assertion: c is a single UTF-4 value */ 1997: if (c < 0x80) { 1998: if (out + 1 >= outend) 1999: break; 2000: *out++ = c; 2001: } else { 2002: int len; 2003: const htmlEntityDesc * ent; 2004: const char *cp; 2005: char nbuf[16]; 2006: 2007: /* 2008: * Try to lookup a predefined HTML entity for it 2009: */ 2010: 2011: ent = htmlEntityValueLookup(c); 2012: if (ent == NULL) { 2013: snprintf(nbuf, sizeof(nbuf), "#%u", c); 2014: cp = nbuf; 2015: } 2016: else 2017: cp = ent->name; 2018: len = strlen(cp); 2019: if (out + 2 + len >= outend) 2020: break; 2021: *out++ 
= '&'; 2022: memcpy(out, cp, len); 2023: out += len; 2024: *out++ = ';'; 2025: } 2026: processed = in; 2027: } 2028: *outlen = out - outstart; 2029: *inlen = processed - instart; 2030: return(0); 2031: } 2032: 2033: /** 2034: * htmlEncodeEntities: 2035: * @out: a pointer to an array of bytes to store the result 2036: * @outlen: the length of @out 2037: * @in: a pointer to an array of UTF-8 chars 2038: * @inlen: the length of @in 2039: * @quoteChar: the quote character to escape (' or ") or zero. 2040: * 2041: * Take a block of UTF-8 chars in and try to convert it to an ASCII 2042: * plus HTML entities block of chars out. 2043: * 2044: * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise 2045: * The value of @inlen after return is the number of octets consumed 2046: * as the return value is positive, else unpredictable. 2047: * The value of @outlen after return is the number of octets consumed. 2048: */ 2049: int 2050: htmlEncodeEntities(unsigned char* out, int *outlen, 2051: const unsigned char* in, int *inlen, int quoteChar) { 2052: const unsigned char* processed = in; 2053: const unsigned char* outend; 2054: const unsigned char* outstart = out; 2055: const unsigned char* instart = in; 2056: const unsigned char* inend; 2057: unsigned int c, d; 2058: int trailing; 2059: 2060: if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) 2061: return(-1); 2062: outend = out + (*outlen); 2063: inend = in + (*inlen); 2064: while (in < inend) { 2065: d = *in++; 2066: if (d < 0x80) { c= d; trailing= 0; } 2067: else if (d < 0xC0) { 2068: /* trailing byte in leading position */ 2069: *outlen = out - outstart; 2070: *inlen = processed - instart; 2071: return(-2); 2072: } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } 2073: else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } 2074: else if (d < 0xF8) { c= d & 0x07; trailing= 3; } 2075: else { 2076: /* no chance for this in Ascii */ 2077: *outlen = out - outstart; 2078: *inlen = processed - instart; 
2079: return(-2); 2080: } 2081: 2082: if (inend - in < trailing) 2083: break; 2084: 2085: while (trailing--) { 2086: if (((d= *in++) & 0xC0) != 0x80) { 2087: *outlen = out - outstart; 2088: *inlen = processed - instart; 2089: return(-2); 2090: } 2091: c <<= 6; 2092: c |= d & 0x3F; 2093: } 2094: 2095: /* assertion: c is a single UTF-4 value */ 2096: if ((c < 0x80) && (c != (unsigned int) quoteChar) && 2097: (c != '&') && (c != '<') && (c != '>')) { 2098: if (out >= outend) 2099: break; 2100: *out++ = c; 2101: } else { 2102: const htmlEntityDesc * ent; 2103: const char *cp; 2104: char nbuf[16]; 2105: int len; 2106: 2107: /* 2108: * Try to lookup a predefined HTML entity for it 2109: */ 2110: ent = htmlEntityValueLookup(c); 2111: if (ent == NULL) { 2112: snprintf(nbuf, sizeof(nbuf), "#%u", c); 2113: cp = nbuf; 2114: } 2115: else 2116: cp = ent->name; 2117: len = strlen(cp); 2118: if (out + 2 + len > outend) 2119: break; 2120: *out++ = '&'; 2121: memcpy(out, cp, len); 2122: out += len; 2123: *out++ = ';'; 2124: } 2125: processed = in; 2126: } 2127: *outlen = out - outstart; 2128: *inlen = processed - instart; 2129: return(0); 2130: } 2131: 2132: /************************************************************************ 2133: * * 2134: * Commodity functions to handle streams * 2135: * * 2136: ************************************************************************/ 2137: 2138: /** 2139: * htmlNewInputStream: 2140: * @ctxt: an HTML parser context 2141: * 2142: * Create a new input stream structure 2143: * Returns the new input stream or NULL 2144: */ 2145: static htmlParserInputPtr 2146: htmlNewInputStream(htmlParserCtxtPtr ctxt) { 2147: htmlParserInputPtr input; 2148: 2149: input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput)); 2150: if (input == NULL) { 2151: htmlErrMemory(ctxt, "couldn't allocate a new input stream\n"); 2152: return(NULL); 2153: } 2154: memset(input, 0, sizeof(htmlParserInput)); 2155: input->filename = NULL; 2156: input->directory = NULL; 
2157: input->base = NULL; 2158: input->cur = NULL; 2159: input->buf = NULL; 2160: input->line = 1; 2161: input->col = 1; 2162: input->buf = NULL; 2163: input->free = NULL; 2164: input->version = NULL; 2165: input->consumed = 0; 2166: input->length = 0; 2167: return(input); 2168: } 2169: 2170: 2171: /************************************************************************ 2172: * * 2173: * Commodity functions, cleanup needed ? * 2174: * * 2175: ************************************************************************/ 2176: /* 2177: * all tags allowing pc data from the html 4.01 loose dtd 2178: * NOTE: it might be more apropriate to integrate this information 2179: * into the html40ElementTable array but I don't want to risk any 2180: * binary incomptibility 2181: */ 2182: static const char *allowPCData[] = { 2183: "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big", 2184: "blockquote", "body", "button", "caption", "center", "cite", "code", 2185: "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2", 2186: "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend", 2187: "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp", 2188: "small", "span", "strike", "strong", "td", "th", "tt", "u", "var" 2189: }; 2190: 2191: /** 2192: * areBlanks: 2193: * @ctxt: an HTML parser context 2194: * @str: a xmlChar * 2195: * @len: the size of @str 2196: * 2197: * Is this a sequence of blank chars that one can ignore ? 2198: * 2199: * Returns 1 if ignorable 0 otherwise. 
2200: */ 2201: 2202: static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) { 2203: unsigned int i; 2204: int j; 2205: xmlNodePtr lastChild; 2206: xmlDtdPtr dtd; 2207: 2208: for (j = 0;j < len;j++) 2209: if (!(IS_BLANK_CH(str[j]))) return(0); 2210: 2211: if (CUR == 0) return(1); 2212: if (CUR != '<') return(0); 2213: if (ctxt->name == NULL) 2214: return(1); 2215: if (xmlStrEqual(ctxt->name, BAD_CAST"html")) 2216: return(1); 2217: if (xmlStrEqual(ctxt->name, BAD_CAST"head")) 2218: return(1); 2219: 2220: /* Only strip CDATA children of the body tag for strict HTML DTDs */ 2221: if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) { 2222: dtd = xmlGetIntSubset(ctxt->myDoc); 2223: if (dtd != NULL && dtd->ExternalID != NULL) { 2224: if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") || 2225: !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN")) 2226: return(1); 2227: } 2228: } 2229: 2230: if (ctxt->node == NULL) return(0); 2231: lastChild = xmlGetLastChild(ctxt->node); 2232: while ((lastChild) && (lastChild->type == XML_COMMENT_NODE)) 2233: lastChild = lastChild->prev; 2234: if (lastChild == NULL) { 2235: if ((ctxt->node->type != XML_ELEMENT_NODE) && 2236: (ctxt->node->content != NULL)) return(0); 2237: /* keep ws in constructs like ... ... 
2238: for all tags "b" allowing PCDATA */ 2239: for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) { 2240: if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) { 2241: return(0); 2242: } 2243: } 2244: } else if (xmlNodeIsText(lastChild)) { 2245: return(0); 2246: } else { 2247: /* keep ws in constructs like xy z 2248: for all tags "p" allowing PCDATA */ 2249: for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) { 2250: if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) { 2251: return(0); 2252: } 2253: } 2254: } 2255: return(1); 2256: } 2257: 2258: /** 2259: * htmlNewDocNoDtD: 2260: * @URI: URI for the dtd, or NULL 2261: * @ExternalID: the external ID of the DTD, or NULL 2262: * 2263: * Creates a new HTML document without a DTD node if @URI and @ExternalID 2264: * are NULL 2265: * 2266: * Returns a new document, do not initialize the DTD if not provided 2267: */ 2268: htmlDocPtr 2269: htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) { 2270: xmlDocPtr cur; 2271: 2272: /* 2273: * Allocate a new document and fill the fields. 
2274: */ 2275: cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc)); 2276: if (cur == NULL) { 2277: htmlErrMemory(NULL, "HTML document creation failed\n"); 2278: return(NULL); 2279: } 2280: memset(cur, 0, sizeof(xmlDoc)); 2281: 2282: cur->type = XML_HTML_DOCUMENT_NODE; 2283: cur->version = NULL; 2284: cur->intSubset = NULL; 2285: cur->doc = cur; 2286: cur->name = NULL; 2287: cur->children = NULL; 2288: cur->extSubset = NULL; 2289: cur->oldNs = NULL; 2290: cur->encoding = NULL; 2291: cur->standalone = 1; 2292: cur->compression = 0; 2293: cur->ids = NULL; 2294: cur->refs = NULL; 2295: cur->_private = NULL; 2296: cur->charset = XML_CHAR_ENCODING_UTF8; 2297: cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT; 2298: if ((ExternalID != NULL) || 2299: (URI != NULL)) 2300: xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI); 2301: return(cur); 2302: } 2303: 2304: /** 2305: * htmlNewDoc: 2306: * @URI: URI for the dtd, or NULL 2307: * @ExternalID: the external ID of the DTD, or NULL 2308: * 2309: * Creates a new HTML document 2310: * 2311: * Returns a new document 2312: */ 2313: htmlDocPtr 2314: htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) { 2315: if ((URI == NULL) && (ExternalID == NULL)) 2316: return(htmlNewDocNoDtD( 2317: BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd", 2318: BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN")); 2319: 2320: return(htmlNewDocNoDtD(URI, ExternalID)); 2321: } 2322: 2323: 2324: /************************************************************************ 2325: * * 2326: * The parser itself * 2327: * Relates to http://www.w3.org/TR/html40 * 2328: * * 2329: ************************************************************************/ 2330: 2331: /************************************************************************ 2332: * * 2333: * The parser itself * 2334: * * 2335: ************************************************************************/ 2336: 2337: static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt); 2338: 2339: /** 
2340: * htmlParseHTMLName: 2341: * @ctxt: an HTML parser context 2342: * 2343: * parse an HTML tag or attribute name, note that we convert it to lowercase 2344: * since HTML names are not case-sensitive. 2345: * 2346: * Returns the Tag Name parsed or NULL 2347: */ 2348: 2349: static const xmlChar * 2350: htmlParseHTMLName(htmlParserCtxtPtr ctxt) { 2351: int i = 0; 2352: xmlChar loc[HTML_PARSER_BUFFER_SIZE]; 2353: 2354: if (!IS_ASCII_LETTER(CUR) && (CUR != '_') && 2355: (CUR != ':') && (CUR != '.')) return(NULL); 2356: 2357: while ((i < HTML_PARSER_BUFFER_SIZE) && 2358: ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) || 2359: (CUR == ':') || (CUR == '-') || (CUR == '_') || 2360: (CUR == '.'))) { 2361: if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20; 2362: else loc[i] = CUR; 2363: i++; 2364: 2365: NEXT; 2366: } 2367: 2368: return(xmlDictLookup(ctxt->dict, loc, i)); 2369: } 2370: 2371: 2372: /** 2373: * htmlParseHTMLName_nonInvasive: 2374: * @ctxt: an HTML parser context 2375: * 2376: * parse an HTML tag or attribute name, note that we convert it to lowercase 2377: * since HTML names are not case-sensitive, this doesn't consume the data 2378: * from the stream, it's a look-ahead 2379: * 2380: * Returns the Tag Name parsed or NULL 2381: */ 2382: 2383: static const xmlChar * 2384: htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) { 2385: int i = 0; 2386: xmlChar loc[HTML_PARSER_BUFFER_SIZE]; 2387: 2388: if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') && 2389: (NXT(1) != ':')) return(NULL); 2390: 2391: while ((i < HTML_PARSER_BUFFER_SIZE) && 2392: ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) || 2393: (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) { 2394: if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20; 2395: else loc[i] = NXT(1+i); 2396: i++; 2397: } 2398: 2399: return(xmlDictLookup(ctxt->dict, loc, i)); 2400: } 2401: 2402: 2403: /** 2404: * htmlParseName: 2405: * @ctxt: an HTML parser context 2406: * 2407: * 
parse an HTML name, this routine is case sensitive. 2408: * 2409: * Returns the Name parsed or NULL 2410: */ 2411: 2412: static const xmlChar * 2413: htmlParseName(htmlParserCtxtPtr ctxt) { 2414: const xmlChar *in; 2415: const xmlChar *ret; 2416: int count = 0; 2417: 2418: GROW; 2419: 2420: /* 2421: * Accelerator for simple ASCII names 2422: */ 2423: in = ctxt->input->cur; 2424: if (((*in >= 0x61) && (*in <= 0x7A)) || 2425: ((*in >= 0x41) && (*in <= 0x5A)) || 2426: (*in == '_') || (*in == ':')) { 2427: in++; 2428: while (((*in >= 0x61) && (*in <= 0x7A)) || 2429: ((*in >= 0x41) && (*in <= 0x5A)) || 2430: ((*in >= 0x30) && (*in <= 0x39)) || 2431: (*in == '_') || (*in == '-') || 2432: (*in == ':') || (*in == '.')) 2433: in++; 2434: if ((*in > 0) && (*in < 0x80)) { 2435: count = in - ctxt->input->cur; 2436: ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count); 2437: ctxt->input->cur = in; 2438: ctxt->nbChars += count; 2439: ctxt->input->col += count; 2440: return(ret); 2441: } 2442: } 2443: return(htmlParseNameComplex(ctxt)); 2444: } 2445: 2446: static const xmlChar * 2447: htmlParseNameComplex(xmlParserCtxtPtr ctxt) { 2448: int len = 0, l; 2449: int c; 2450: int count = 0; 2451: 2452: /* 2453: * Handler for more complex cases 2454: */ 2455: GROW; 2456: c = CUR_CHAR(l); 2457: if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */ 2458: (!IS_LETTER(c) && (c != '_') && 2459: (c != ':'))) { 2460: return(NULL); 2461: } 2462: 2463: while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */ 2464: ((IS_LETTER(c)) || (IS_DIGIT(c)) || 2465: (c == '.') || (c == '-') || 2466: (c == '_') || (c == ':') || 2467: (IS_COMBINING(c)) || 2468: (IS_EXTENDER(c)))) { 2469: if (count++ > 100) { 2470: count = 0; 2471: GROW; 2472: } 2473: len += l; 2474: NEXTL(l); 2475: c = CUR_CHAR(l); 2476: } 2477: return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len)); 2478: } 2479: 2480: 2481: /** 2482: * htmlParseHTMLAttribute: 2483: * @ctxt: an HTML parser context 2484: 
* @stop: a char stop value 2485: * 2486: * parse an HTML attribute value till the stop (quote), if 2487: * stop is 0 then it stops at the first space 2488: * 2489: * Returns the attribute parsed or NULL 2490: */ 2491: 2492: static xmlChar * 2493: htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) { 2494: xmlChar *buffer = NULL; 2495: int buffer_size = 0; 2496: xmlChar *out = NULL; 2497: const xmlChar *name = NULL; 2498: const xmlChar *cur = NULL; 2499: const htmlEntityDesc * ent; 2500: 2501: /* 2502: * allocate a translation buffer. 2503: */ 2504: buffer_size = HTML_PARSER_BUFFER_SIZE; 2505: buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar)); 2506: if (buffer == NULL) { 2507: htmlErrMemory(ctxt, "buffer allocation failed\n"); 2508: return(NULL); 2509: } 2510: out = buffer; 2511: 2512: /* 2513: * Ok loop until we reach one of the ending chars 2514: */ 2515: while ((CUR != 0) && (CUR != stop)) { 2516: if ((stop == 0) && (CUR == '>')) break; 2517: if ((stop == 0) && (IS_BLANK_CH(CUR))) break; 2518: if (CUR == '&') { 2519: if (NXT(1) == '#') { 2520: unsigned int c; 2521: int bits; 2522: 2523: c = htmlParseCharRef(ctxt); 2524: if (c < 0x80) 2525: { *out++ = c; bits= -6; } 2526: else if (c < 0x800) 2527: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2528: else if (c < 0x10000) 2529: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2530: else 2531: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2532: 2533: for ( ; bits >= 0; bits-= 6) { 2534: *out++ = ((c >> bits) & 0x3F) | 0x80; 2535: } 2536: 2537: if (out - buffer > buffer_size - 100) { 2538: int indx = out - buffer; 2539: 2540: growBuffer(buffer); 2541: out = &buffer[indx]; 2542: } 2543: } else { 2544: ent = htmlParseEntityRef(ctxt, &name); 2545: if (name == NULL) { 2546: *out++ = '&'; 2547: if (out - buffer > buffer_size - 100) { 2548: int indx = out - buffer; 2549: 2550: growBuffer(buffer); 2551: out = &buffer[indx]; 2552: } 2553: } else if (ent == NULL) { 2554: *out++ = '&'; 2555: 
cur = name; 2556: while (*cur != 0) { 2557: if (out - buffer > buffer_size - 100) { 2558: int indx = out - buffer; 2559: 2560: growBuffer(buffer); 2561: out = &buffer[indx]; 2562: } 2563: *out++ = *cur++; 2564: } 2565: } else { 2566: unsigned int c; 2567: int bits; 2568: 2569: if (out - buffer > buffer_size - 100) { 2570: int indx = out - buffer; 2571: 2572: growBuffer(buffer); 2573: out = &buffer[indx]; 2574: } 2575: c = ent->value; 2576: if (c < 0x80) 2577: { *out++ = c; bits= -6; } 2578: else if (c < 0x800) 2579: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2580: else if (c < 0x10000) 2581: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2582: else 2583: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2584: 2585: for ( ; bits >= 0; bits-= 6) { 2586: *out++ = ((c >> bits) & 0x3F) | 0x80; 2587: } 2588: } 2589: } 2590: } else { 2591: unsigned int c; 2592: int bits, l; 2593: 2594: if (out - buffer > buffer_size - 100) { 2595: int indx = out - buffer; 2596: 2597: growBuffer(buffer); 2598: out = &buffer[indx]; 2599: } 2600: c = CUR_CHAR(l); 2601: if (c < 0x80) 2602: { *out++ = c; bits= -6; } 2603: else if (c < 0x800) 2604: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2605: else if (c < 0x10000) 2606: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2607: else 2608: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2609: 2610: for ( ; bits >= 0; bits-= 6) { 2611: *out++ = ((c >> bits) & 0x3F) | 0x80; 2612: } 2613: NEXT; 2614: } 2615: } 2616: *out = 0; 2617: return(buffer); 2618: } 2619: 2620: /** 2621: * htmlParseEntityRef: 2622: * @ctxt: an HTML parser context 2623: * @str: location to store the entity name 2624: * 2625: * parse an HTML ENTITY references 2626: * 2627: * [68] EntityRef ::= '&' Name ';' 2628: * 2629: * Returns the associated htmlEntityDescPtr if found, or NULL otherwise, 2630: * if non-NULL *str will have to be freed by the caller. 
2631: */ 2632: const htmlEntityDesc * 2633: htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) { 2634: const xmlChar *name; 2635: const htmlEntityDesc * ent = NULL; 2636: 2637: if (str != NULL) *str = NULL; 2638: if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL); 2639: 2640: if (CUR == '&') { 2641: NEXT; 2642: name = htmlParseName(ctxt); 2643: if (name == NULL) { 2644: htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 2645: "htmlParseEntityRef: no name\n", NULL, NULL); 2646: } else { 2647: GROW; 2648: if (CUR == ';') { 2649: if (str != NULL) 2650: *str = name; 2651: 2652: /* 2653: * Lookup the entity in the table. 2654: */ 2655: ent = htmlEntityLookup(name); 2656: if (ent != NULL) /* OK that's ugly !!! */ 2657: NEXT; 2658: } else { 2659: htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING, 2660: "htmlParseEntityRef: expecting ';'\n", 2661: NULL, NULL); 2662: if (str != NULL) 2663: *str = name; 2664: } 2665: } 2666: } 2667: return(ent); 2668: } 2669: 2670: /** 2671: * htmlParseAttValue: 2672: * @ctxt: an HTML parser context 2673: * 2674: * parse a value for an attribute 2675: * Note: the parser won't do substitution of entities here, this 2676: * will be handled later in xmlStringGetNodeList, unless it was 2677: * asked for ctxt->replaceEntities != 0 2678: * 2679: * Returns the AttValue parsed or NULL. 
2680: */ 2681: 2682: static xmlChar * 2683: htmlParseAttValue(htmlParserCtxtPtr ctxt) { 2684: xmlChar *ret = NULL; 2685: 2686: if (CUR == '"') { 2687: NEXT; 2688: ret = htmlParseHTMLAttribute(ctxt, '"'); 2689: if (CUR != '"') { 2690: htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, 2691: "AttValue: \" expected\n", NULL, NULL); 2692: } else 2693: NEXT; 2694: } else if (CUR == '\'') { 2695: NEXT; 2696: ret = htmlParseHTMLAttribute(ctxt, '\''); 2697: if (CUR != '\'') { 2698: htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, 2699: "AttValue: ' expected\n", NULL, NULL); 2700: } else 2701: NEXT; 2702: } else { 2703: /* 2704: * That's an HTMLism, the attribute value may not be quoted 2705: */ 2706: ret = htmlParseHTMLAttribute(ctxt, 0); 2707: if (ret == NULL) { 2708: htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE, 2709: "AttValue: no value found\n", NULL, NULL); 2710: } 2711: } 2712: return(ret); 2713: } 2714: 2715: /** 2716: * htmlParseSystemLiteral: 2717: * @ctxt: an HTML parser context 2718: * 2719: * parse an HTML Literal 2720: * 2721: * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") 2722: * 2723: * Returns the SystemLiteral parsed or NULL 2724: */ 2725: 2726: static xmlChar * 2727: htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) { 2728: const xmlChar *q; 2729: xmlChar *ret = NULL; 2730: 2731: if (CUR == '"') { 2732: NEXT; 2733: q = CUR_PTR; 2734: while ((IS_CHAR_CH(CUR)) && (CUR != '"')) 2735: NEXT; 2736: if (!IS_CHAR_CH(CUR)) { 2737: htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2738: "Unfinished SystemLiteral\n", NULL, NULL); 2739: } else { 2740: ret = xmlStrndup(q, CUR_PTR - q); 2741: NEXT; 2742: } 2743: } else if (CUR == '\'') { 2744: NEXT; 2745: q = CUR_PTR; 2746: while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) 2747: NEXT; 2748: if (!IS_CHAR_CH(CUR)) { 2749: htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2750: "Unfinished SystemLiteral\n", NULL, NULL); 2751: } else { 2752: ret = xmlStrndup(q, CUR_PTR - q); 2753: NEXT; 2754: } 2755: } else { 2756: 
htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, 2757: " or ' expected\n", NULL, NULL); 2758: } 2759: 2760: return(ret); 2761: } 2762: 2763: /** 2764: * htmlParsePubidLiteral: 2765: * @ctxt: an HTML parser context 2766: * 2767: * parse an HTML public literal 2768: * 2769: * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" 2770: * 2771: * Returns the PubidLiteral parsed or NULL. 2772: */ 2773: 2774: static xmlChar * 2775: htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) { 2776: const xmlChar *q; 2777: xmlChar *ret = NULL; 2778: /* 2779: * Name ::= (Letter | '_') (NameChar)* 2780: */ 2781: if (CUR == '"') { 2782: NEXT; 2783: q = CUR_PTR; 2784: while (IS_PUBIDCHAR_CH(CUR)) NEXT; 2785: if (CUR != '"') { 2786: htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2787: "Unfinished PubidLiteral\n", NULL, NULL); 2788: } else { 2789: ret = xmlStrndup(q, CUR_PTR - q); 2790: NEXT; 2791: } 2792: } else if (CUR == '\'') { 2793: NEXT; 2794: q = CUR_PTR; 2795: while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')) 2796: NEXT; 2797: if (CUR != '\'') { 2798: htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2799: "Unfinished PubidLiteral\n", NULL, NULL); 2800: } else { 2801: ret = xmlStrndup(q, CUR_PTR - q); 2802: NEXT; 2803: } 2804: } else { 2805: htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, 2806: "PubidLiteral \" or ' expected\n", NULL, NULL); 2807: } 2808: 2809: return(ret); 2810: } 2811: 2812: /** 2813: * htmlParseScript: 2814: * @ctxt: an HTML parser context 2815: * 2816: * parse the content of an HTML SCRIPT or STYLE element 2817: * http://www.w3.org/TR/html4/sgml/dtd.html#Script 2818: * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet 2819: * http://www.w3.org/TR/html4/types.html#type-script 2820: * http://www.w3.org/TR/html4/types.html#h-6.15 2821: * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1 2822: * 2823: * Script data ( %Script; in the DTD) can be the content of the SCRIPT 2824: * element and the value of intrinsic event attributes. 
User agents must 2825: * not evaluate script data as HTML markup but instead must pass it on as 2826: * data to a script engine. 2827: * NOTES: 2828: * - The content is passed like CDATA 2829: * - the attributes for style and scripting "onXXX" are also described 2830: * as CDATA but SGML allows entities references in attributes so their 2831: * processing is identical as other attributes 2832: */ 2833: static void 2834: htmlParseScript(htmlParserCtxtPtr ctxt) { 2835: xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; 2836: int nbchar = 0; 2837: int cur,l; 2838: 2839: SHRINK; 2840: cur = CUR_CHAR(l); 2841: while (IS_CHAR_CH(cur)) { 2842: if ((cur == '<') && (NXT(1) == '/')) { 2843: /* 2844: * One should break here, the specification is clear: 2845: * Authors should therefore escape "</" within the content. 2846: * Escape mechanisms are specific to each scripting or 2847: * style sheet language. 2848: * 2849: * In recovery mode, only break if end tag match the 2850: * current tag, effectively ignoring all tags inside the 2851: * script/style block and treating the entire block as 2852: * CDATA. 
2853: */ 2854: if (ctxt->recovery) { 2855: if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2, 2856: xmlStrlen(ctxt->name)) == 0) 2857: { 2858: break; /* while */ 2859: } else { 2860: htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 2861: "Element %s embeds close tag\n", 2862: ctxt->name, NULL); 2863: } 2864: } else { 2865: if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) || 2866: ((NXT(2) >= 'a') && (NXT(2) <= 'z'))) 2867: { 2868: break; /* while */ 2869: } 2870: } 2871: } 2872: COPY_BUF(l,buf,nbchar,cur); 2873: if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { 2874: if (ctxt->sax->cdataBlock!= NULL) { 2875: /* 2876: * Insert as CDATA, which is the same as HTML_PRESERVE_NODE 2877: */ 2878: ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); 2879: } else if (ctxt->sax->characters != NULL) { 2880: ctxt->sax->characters(ctxt->userData, buf, nbchar); 2881: } 2882: nbchar = 0; 2883: } 2884: GROW; 2885: NEXTL(l); 2886: cur = CUR_CHAR(l); 2887: } 2888: 2889: if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) { 2890: htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 2891: "Invalid char in CDATA 0x%X\n", cur); 2892: if (ctxt->input->cur < ctxt->input->end) { 2893: NEXT; 2894: } 2895: } 2896: 2897: if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) { 2898: if (ctxt->sax->cdataBlock!= NULL) { 2899: /* 2900: * Insert as CDATA, which is the same as HTML_PRESERVE_NODE 2901: */ 2902: ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); 2903: } else if (ctxt->sax->characters != NULL) { 2904: ctxt->sax->characters(ctxt->userData, buf, nbchar); 2905: } 2906: } 2907: } 2908: 2909: 2910: /** 2911: * htmlParseCharData: 2912: * @ctxt: an HTML parser context 2913: * 2914: * parse a CharData section. 2915: * if we are within a CDATA section ']]>' marks an end of section. 
2916: * 2917: * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) 2918: */ 2919: 2920: static void 2921: htmlParseCharData(htmlParserCtxtPtr ctxt) { 2922: xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; 2923: int nbchar = 0; 2924: int cur, l; 2925: int chunk = 0; 2926: 2927: SHRINK; 2928: cur = CUR_CHAR(l); 2929: while (((cur != '<') || (ctxt->token == '<')) && 2930: ((cur != '&') || (ctxt->token == '&')) && 2931: (cur != 0)) { 2932: if (!(IS_CHAR(cur))) { 2933: htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 2934: "Invalid char in CDATA 0x%X\n", cur); 2935: } else { 2936: COPY_BUF(l,buf,nbchar,cur); 2937: } 2938: if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { 2939: /* 2940: * Ok the segment is to be consumed as chars. 2941: */ 2942: if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { 2943: if (areBlanks(ctxt, buf, nbchar)) { 2944: if (ctxt->sax->ignorableWhitespace != NULL) 2945: ctxt->sax->ignorableWhitespace(ctxt->userData, 2946: buf, nbchar); 2947: } else { 2948: htmlCheckParagraph(ctxt); 2949: if (ctxt->sax->characters != NULL) 2950: ctxt->sax->characters(ctxt->userData, buf, nbchar); 2951: } 2952: } 2953: nbchar = 0; 2954: } 2955: NEXTL(l); 2956: chunk++; 2957: if (chunk > HTML_PARSER_BUFFER_SIZE) { 2958: chunk = 0; 2959: SHRINK; 2960: GROW; 2961: } 2962: cur = CUR_CHAR(l); 2963: if (cur == 0) { 2964: SHRINK; 2965: GROW; 2966: cur = CUR_CHAR(l); 2967: } 2968: } 2969: if (nbchar != 0) { 2970: buf[nbchar] = 0; 2971: 2972: /* 2973: * Ok the segment is to be consumed as chars. 
2974: */ 2975: if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { 2976: if (areBlanks(ctxt, buf, nbchar)) { 2977: if (ctxt->sax->ignorableWhitespace != NULL) 2978: ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar); 2979: } else { 2980: htmlCheckParagraph(ctxt); 2981: if (ctxt->sax->characters != NULL) 2982: ctxt->sax->characters(ctxt->userData, buf, nbchar); 2983: } 2984: } 2985: } else { 2986: /* 2987: * Loop detection 2988: */ 2989: if (cur == 0) 2990: ctxt->instate = XML_PARSER_EOF; 2991: } 2992: } 2993: 2994: /** 2995: * htmlParseExternalID: 2996: * @ctxt: an HTML parser context 2997: * @publicID: a xmlChar** receiving PubidLiteral 2998: * 2999: * Parse an External ID or a Public ID 3000: * 3001: * [75] ExternalID ::= 'SYSTEM' S SystemLiteral 3002: * | 'PUBLIC' S PubidLiteral S SystemLiteral 3003: * 3004: * [83] PublicID ::= 'PUBLIC' S PubidLiteral 3005: * 3006: * Returns the function returns SystemLiteral and in the second 3007: * case publicID receives PubidLiteral, is strict is off 3008: * it is possible to return NULL and have publicID set. 
3009: */ 3010: 3011: static xmlChar * 3012: htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) { 3013: xmlChar *URI = NULL; 3014: 3015: if ((UPPER == 'S') && (UPP(1) == 'Y') && 3016: (UPP(2) == 'S') && (UPP(3) == 'T') && 3017: (UPP(4) == 'E') && (UPP(5) == 'M')) { 3018: SKIP(6); 3019: if (!IS_BLANK_CH(CUR)) { 3020: htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 3021: "Space required after 'SYSTEM'\n", NULL, NULL); 3022: } 3023: SKIP_BLANKS; 3024: URI = htmlParseSystemLiteral(ctxt); 3025: if (URI == NULL) { 3026: htmlParseErr(ctxt, XML_ERR_URI_REQUIRED, 3027: "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL); 3028: } 3029: } else if ((UPPER == 'P') && (UPP(1) == 'U') && 3030: (UPP(2) == 'B') && (UPP(3) == 'L') && 3031: (UPP(4) == 'I') && (UPP(5) == 'C')) { 3032: SKIP(6); 3033: if (!IS_BLANK_CH(CUR)) { 3034: htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 3035: "Space required after 'PUBLIC'\n", NULL, NULL); 3036: } 3037: SKIP_BLANKS; 3038: *publicID = htmlParsePubidLiteral(ctxt); 3039: if (*publicID == NULL) { 3040: htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED, 3041: "htmlParseExternalID: PUBLIC, no Public Identifier\n", 3042: NULL, NULL); 3043: } 3044: SKIP_BLANKS; 3045: if ((CUR == '"') || (CUR == '\'')) { 3046: URI = htmlParseSystemLiteral(ctxt); 3047: } 3048: } 3049: return(URI); 3050: } 3051: 3052: /** 3053: * xmlParsePI: 3054: * @ctxt: an XML parser context 3055: * 3056: * parse an XML Processing Instruction. 3057: * 3058: * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>' 3059: */ 3060: static void 3061: htmlParsePI(htmlParserCtxtPtr ctxt) { 3062: xmlChar *buf = NULL; 3063: int len = 0; 3064: int size = HTML_PARSER_BUFFER_SIZE; 3065: int cur, l; 3066: const xmlChar *target; 3067: xmlParserInputState state; 3068: int count = 0; 3069: 3070: if ((RAW == '<') && (NXT(1) == '?')) { 3071: state = ctxt->instate; 3072: ctxt->instate = XML_PARSER_PI; 3073: /* 3074: * this is a Processing Instruction. 
3075: */ 3076: SKIP(2); 3077: SHRINK; 3078: 3079: /* 3080: * Parse the target name and check for special support like 3081: * namespace. 3082: */ 3083: target = htmlParseName(ctxt); 3084: if (target != NULL) { 3085: if (RAW == '>') { 3086: SKIP(1); 3087: 3088: /* 3089: * SAX: PI detected. 3090: */ 3091: if ((ctxt->sax) && (!ctxt->disableSAX) && 3092: (ctxt->sax->processingInstruction != NULL)) 3093: ctxt->sax->processingInstruction(ctxt->userData, 3094: target, NULL); 3095: ctxt->instate = state; 3096: return; 3097: } 3098: buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar)); 3099: if (buf == NULL) { 3100: htmlErrMemory(ctxt, NULL); 3101: ctxt->instate = state; 3102: return; 3103: } 3104: cur = CUR; 3105: if (!IS_BLANK(cur)) { 3106: htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 3107: "ParsePI: PI %s space expected\n", target, NULL); 3108: } 3109: SKIP_BLANKS; 3110: cur = CUR_CHAR(l); 3111: while (IS_CHAR(cur) && (cur != '>')) { 3112: if (len + 5 >= size) { 3113: xmlChar *tmp; 3114: 3115: size *= 2; 3116: tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); 3117: if (tmp == NULL) { 3118: htmlErrMemory(ctxt, NULL); 3119: xmlFree(buf); 3120: ctxt->instate = state; 3121: return; 3122: } 3123: buf = tmp; 3124: } 3125: count++; 3126: if (count > 50) { 3127: GROW; 3128: count = 0; 3129: } 3130: COPY_BUF(l,buf,len,cur); 3131: NEXTL(l); 3132: cur = CUR_CHAR(l); 3133: if (cur == 0) { 3134: SHRINK; 3135: GROW; 3136: cur = CUR_CHAR(l); 3137: } 3138: } 3139: buf[len] = 0; 3140: if (cur != '>') { 3141: htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED, 3142: "ParsePI: PI %s never end ...\n", target, NULL); 3143: } else { 3144: SKIP(1); 3145: 3146: /* 3147: * SAX: PI detected. 
3148: */ 3149: if ((ctxt->sax) && (!ctxt->disableSAX) && 3150: (ctxt->sax->processingInstruction != NULL)) 3151: ctxt->sax->processingInstruction(ctxt->userData, 3152: target, buf); 3153: } 3154: xmlFree(buf); 3155: } else { 3156: htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED, 3157: "PI is not started correctly", NULL, NULL); 3158: } 3159: ctxt->instate = state; 3160: } 3161: } 3162: 3163: /** 3164: * htmlParseComment: 3165: * @ctxt: an HTML parser context 3166: * 3167: * Parse an XML (SGML) comment  3168: * 3169: * [15] Comment ::= '' 3170: */ 3171: static void 3172: htmlParseComment(htmlParserCtxtPtr ctxt) { 3173: xmlChar *buf = NULL; 3174: int len; 3175: int size = HTML_PARSER_BUFFER_SIZE; 3176: int q, ql; 3177: int r, rl; 3178: int cur, l; 3179: xmlParserInputState state; 3180: 3181: /* 3182: * Check that there is a comment right here. 3183: */ 3184: if ((RAW != '<') || (NXT(1) != '!') || 3185: (NXT(2) != '-') || (NXT(3) != '-')) return; 3186: 3187: state = ctxt->instate; 3188: ctxt->instate = XML_PARSER_COMMENT; 3189: SHRINK; 3190: SKIP(4); 3191: buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar)); 3192: if (buf == NULL) { 3193: htmlErrMemory(ctxt, "buffer allocation failed\n"); 3194: ctxt->instate = state; 3195: return; 3196: } 3197: q = CUR_CHAR(ql); 3198: NEXTL(ql); 3199: r = CUR_CHAR(rl); 3200: NEXTL(rl); 3201: cur = CUR_CHAR(l); 3202: len = 0; 3203: while (IS_CHAR(cur) && 3204: ((cur != '>') || 3205: (r != '-') || (q != '-'))) { 3206: if (len + 5 >= size) { 3207: xmlChar *tmp; 3208: 3209: size *= 2; 3210: tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); 3211: if (tmp == NULL) { 3212: xmlFree(buf); 3213: htmlErrMemory(ctxt, "growing buffer failed\n"); 3214: ctxt->instate = state; 3215: return; 3216: } 3217: buf = tmp; 3218: } 3219: COPY_BUF(ql,buf,len,q); 3220: q = r; 3221: ql = rl; 3222: r = cur; 3223: rl = l; 3224: NEXTL(l); 3225: cur = CUR_CHAR(l); 3226: if (cur == 0) { 3227: SHRINK; 3228: GROW; 3229: cur = CUR_CHAR(l); 3230: } 3231: } 3232: 
buf[len] = 0; 3233: if (!IS_CHAR(cur)) { 3234: htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED, 3235: "Comment not terminated \n */ 5043: base += 2; 5044: } 5045: } 5046: if (ignoreattrval) { 5047: if (buf[base] == '"' || buf[base] == '\'') { 5048: if (invalue) { 5049: if (buf[base] == valdellim) { 5050: invalue = 0; 5051: continue; 5052: } 5053: } else { 5054: valdellim = buf[base]; 5055: invalue = 1; 5056: continue; 5057: } 5058: } else if (invalue) { 5059: continue; 5060: } 5061: } 5062: if (incomment) { 5063: if (base + 3 > len) 5064: return (-1); 5065: if ((buf[base] == '-') && (buf[base + 1] == '-') && 5066: (buf[base + 2] == '>')) { 5067: incomment = 0; 5068: base += 2; 5069: } 5070: continue; 5071: } 5072: if (buf[base] == first) { 5073: if (third != 0) { 5074: if ((buf[base + 1] != next) || (buf[base + 2] != third)) 5075: continue; 5076: } else if (next != 0) { 5077: if (buf[base + 1] != next) 5078: continue; 5079: } 5080: ctxt->checkIndex = 0; 5081: #ifdef DEBUG_PUSH 5082: if (next == 0) 5083: xmlGenericError(xmlGenericErrorContext, 5084: "HPP: lookup '%c' found at %d\n", 5085: first, base); 5086: else if (third == 0) 5087: xmlGenericError(xmlGenericErrorContext, 5088: "HPP: lookup '%c%c' found at %d\n", 5089: first, next, base); 5090: else 5091: xmlGenericError(xmlGenericErrorContext, 5092: "HPP: lookup '%c%c%c' found at %d\n", 5093: first, next, third, base); 5094: #endif 5095: return (base - (in->cur - in->base)); 5096: } 5097: } 5098: if ((!incomment) && (!invalue)) 5099: ctxt->checkIndex = base; 5100: #ifdef DEBUG_PUSH 5101: if (next == 0) 5102: xmlGenericError(xmlGenericErrorContext, 5103: "HPP: lookup '%c' failed\n", first); 5104: else if (third == 0) 5105: xmlGenericError(xmlGenericErrorContext, 5106: "HPP: lookup '%c%c' failed\n", first, next); 5107: else 5108: xmlGenericError(xmlGenericErrorContext, 5109: "HPP: lookup '%c%c%c' failed\n", first, next, 5110: third); 5111: #endif 5112: return (-1); 5113: } 5114: 5115: /** 5116: * 
htmlParseLookupChars: 5117: * @ctxt: an HTML parser context 5118: * @stop: Array of chars, which stop the lookup. 5119: * @stopLen: Length of stop-Array 5120: * 5121: * Try to find if any char of the stop-Array is available in the input 5122: * stream. 5123: * This function has a side effect of (possibly) incrementing ctxt->checkIndex 5124: * to avoid rescanning sequences of bytes, it DOES change the state of the 5125: * parser, do not use liberally. 5126: * 5127: * Returns the index to the current parsing point if a stopChar 5128: * is available, -1 otherwise. 5129: */ 5130: static int 5131: htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop, 5132: int stopLen) 5133: { 5134: int base, len; 5135: htmlParserInputPtr in; 5136: const xmlChar *buf; 5137: int incomment = 0; 5138: int i; 5139: 5140: in = ctxt->input; 5141: if (in == NULL) 5142: return (-1); 5143: 5144: base = in->cur - in->base; 5145: if (base < 0) 5146: return (-1); 5147: 5148: if (ctxt->checkIndex > base) 5149: base = ctxt->checkIndex; 5150: 5151: if (in->buf == NULL) { 5152: buf = in->base; 5153: len = in->length; 5154: } else { 5155: buf = in->buf->buffer->content; 5156: len = in->buf->buffer->use; 5157: } 5158: 5159: for (; base < len; base++) { 5160: if (!incomment && (base + 4 < len)) { 5161: if ((buf[base] == '<') && (buf[base + 1] == '!') && 5162: (buf[base + 2] == '-') && (buf[base + 3] == '-')) { 5163: incomment = 1; 5164: /* do not increment past <! 
- some people use <!--> */ 5165: base += 2; 5166: } 5167: } 5168: if (incomment) { 5169: if (base + 3 > len) 5170: return (-1); 5171: if ((buf[base] == '-') && (buf[base + 1] == '-') && 5172: (buf[base + 2] == '>')) { 5173: incomment = 0; 5174: base += 2; 5175: } 5176: continue; 5177: } 5178: for (i = 0; i < stopLen; ++i) { 5179: if (buf[base] == stop[i]) { 5180: ctxt->checkIndex = 0; 5181: return (base - (in->cur - in->base)); 5182: } 5183: } 5184: } 5185: ctxt->checkIndex = base; 5186: return (-1); 5187: } 5188: 5189: /** 5190: * htmlParseTryOrFinish: 5191: * @ctxt: an HTML parser context 5192: * @terminate: last chunk indicator 5193: * 5194: * Try to progress on parsing 5195: * 5196: * Returns zero if no parsing was possible 5197: */ 5198: static int 5199: htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { 5200: int ret = 0; 5201: htmlParserInputPtr in; 5202: int avail = 0; 5203: xmlChar cur, next; 5204: 5205: htmlParserNodeInfo node_info; 5206: 5207: #ifdef DEBUG_PUSH 5208: switch (ctxt->instate) { 5209: case XML_PARSER_EOF: 5210: xmlGenericError(xmlGenericErrorContext, 5211: "HPP: try EOF\n"); break; 5212: case XML_PARSER_START: 5213: xmlGenericError(xmlGenericErrorContext, 5214: "HPP: try START\n"); break; 5215: case XML_PARSER_MISC: 5216: xmlGenericError(xmlGenericErrorContext, 5217: "HPP: try MISC\n");break; 5218: case XML_PARSER_COMMENT: 5219: xmlGenericError(xmlGenericErrorContext, 5220: "HPP: try COMMENT\n");break; 5221: case XML_PARSER_PROLOG: 5222: xmlGenericError(xmlGenericErrorContext, 5223: "HPP: try PROLOG\n");break; 5224: case XML_PARSER_START_TAG: 5225: xmlGenericError(xmlGenericErrorContext, 5226: "HPP: try START_TAG\n");break; 5227: case XML_PARSER_CONTENT: 5228: xmlGenericError(xmlGenericErrorContext, 5229: "HPP: try CONTENT\n");break; 5230: case XML_PARSER_CDATA_SECTION: 5231: xmlGenericError(xmlGenericErrorContext, 5232: "HPP: try CDATA_SECTION\n");break; 5233: case XML_PARSER_END_TAG: 5234: xmlGenericError(xmlGenericErrorContext, 
5235: "HPP: try END_TAG\n");break; 5236: case XML_PARSER_ENTITY_DECL: 5237: xmlGenericError(xmlGenericErrorContext, 5238: "HPP: try ENTITY_DECL\n");break; 5239: case XML_PARSER_ENTITY_VALUE: 5240: xmlGenericError(xmlGenericErrorContext, 5241: "HPP: try ENTITY_VALUE\n");break; 5242: case XML_PARSER_ATTRIBUTE_VALUE: 5243: xmlGenericError(xmlGenericErrorContext, 5244: "HPP: try ATTRIBUTE_VALUE\n");break; 5245: case XML_PARSER_DTD: 5246: xmlGenericError(xmlGenericErrorContext, 5247: "HPP: try DTD\n");break; 5248: case XML_PARSER_EPILOG: 5249: xmlGenericError(xmlGenericErrorContext, 5250: "HPP: try EPILOG\n");break; 5251: case XML_PARSER_PI: 5252: xmlGenericError(xmlGenericErrorContext, 5253: "HPP: try PI\n");break; 5254: case XML_PARSER_SYSTEM_LITERAL: 5255: xmlGenericError(xmlGenericErrorContext, 5256: "HPP: try SYSTEM_LITERAL\n");break; 5257: } 5258: #endif 5259: 5260: while (1) { 5261: 5262: in = ctxt->input; 5263: if (in == NULL) break; 5264: if (in->buf == NULL) 5265: avail = in->length - (in->cur - in->base); 5266: else 5267: avail = in->buf->buffer->use - (in->cur - in->base); 5268: if ((avail == 0) && (terminate)) { 5269: htmlAutoCloseOnEnd(ctxt); 5270: if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) { 5271: /* 5272: * SAX: end of the document processing. 5273: */ 5274: ctxt->instate = XML_PARSER_EOF; 5275: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 5276: ctxt->sax->endDocument(ctxt->userData); 5277: } 5278: } 5279: if (avail < 1) 5280: goto done; 5281: cur = in->cur[0]; 5282: if (cur == 0) { 5283: SKIP(1); 5284: continue; 5285: } 5286: 5287: switch (ctxt->instate) { 5288: case XML_PARSER_EOF: 5289: /* 5290: * Document parsing is done ! 5291: */ 5292: goto done; 5293: case XML_PARSER_START: 5294: /* 5295: * Very first chars read from the document flow. 
5296: */ 5297: cur = in->cur[0]; 5298: if (IS_BLANK_CH(cur)) { 5299: SKIP_BLANKS; 5300: if (in->buf == NULL) 5301: avail = in->length - (in->cur - in->base); 5302: else 5303: avail = in->buf->buffer->use - (in->cur - in->base); 5304: } 5305: if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) 5306: ctxt->sax->setDocumentLocator(ctxt->userData, 5307: &xmlDefaultSAXLocator); 5308: if ((ctxt->sax) && (ctxt->sax->startDocument) && 5309: (!ctxt->disableSAX)) 5310: ctxt->sax->startDocument(ctxt->userData); 5311: 5312: cur = in->cur[0]; 5313: next = in->cur[1]; 5314: if ((cur == '<') && (next == '!') && 5315: (UPP(2) == 'D') && (UPP(3) == 'O') && 5316: (UPP(4) == 'C') && (UPP(5) == 'T') && 5317: (UPP(6) == 'Y') && (UPP(7) == 'P') && 5318: (UPP(8) == 'E')) { 5319: if ((!terminate) && 5320: (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5321: goto done; 5322: #ifdef DEBUG_PUSH 5323: xmlGenericError(xmlGenericErrorContext, 5324: "HPP: Parsing internal subset\n"); 5325: #endif 5326: htmlParseDocTypeDecl(ctxt); 5327: ctxt->instate = XML_PARSER_PROLOG; 5328: #ifdef DEBUG_PUSH 5329: xmlGenericError(xmlGenericErrorContext, 5330: "HPP: entering PROLOG\n"); 5331: #endif 5332: } else { 5333: ctxt->instate = XML_PARSER_MISC; 5334: #ifdef DEBUG_PUSH 5335: xmlGenericError(xmlGenericErrorContext, 5336: "HPP: entering MISC\n"); 5337: #endif 5338: } 5339: break; 5340: case XML_PARSER_MISC: 5341: SKIP_BLANKS; 5342: if (in->buf == NULL) 5343: avail = in->length - (in->cur - in->base); 5344: else 5345: avail = in->buf->buffer->use - (in->cur - in->base); 5346: /* 5347: * no chars in buffer 5348: */ 5349: if (avail < 1) 5350: goto done; 5351: /* 5352: * not enouth chars in buffer 5353: */ 5354: if (avail < 2) { 5355: if (!terminate) 5356: goto done; 5357: else 5358: next = ' '; 5359: } else { 5360: next = in->cur[1]; 5361: } 5362: cur = in->cur[0]; 5363: if ((cur == '<') && (next == '!') && 5364: (in->cur[2] == '-') && (in->cur[3] == '-')) { 5365: if ((!terminate) && 5366: 
(htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) 5367: goto done; 5368: #ifdef DEBUG_PUSH 5369: xmlGenericError(xmlGenericErrorContext, 5370: "HPP: Parsing Comment\n"); 5371: #endif 5372: htmlParseComment(ctxt); 5373: ctxt->instate = XML_PARSER_MISC; 5374: } else if ((cur == '<') && (next == '?')) { 5375: if ((!terminate) && 5376: (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5377: goto done; 5378: #ifdef DEBUG_PUSH 5379: xmlGenericError(xmlGenericErrorContext, 5380: "HPP: Parsing PI\n"); 5381: #endif 5382: htmlParsePI(ctxt); 5383: ctxt->instate = XML_PARSER_MISC; 5384: } else if ((cur == '<') && (next == '!') && 5385: (UPP(2) == 'D') && (UPP(3) == 'O') && 5386: (UPP(4) == 'C') && (UPP(5) == 'T') && 5387: (UPP(6) == 'Y') && (UPP(7) == 'P') && 5388: (UPP(8) == 'E')) { 5389: if ((!terminate) && 5390: (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5391: goto done; 5392: #ifdef DEBUG_PUSH 5393: xmlGenericError(xmlGenericErrorContext, 5394: "HPP: Parsing internal subset\n"); 5395: #endif 5396: htmlParseDocTypeDecl(ctxt); 5397: ctxt->instate = XML_PARSER_PROLOG; 5398: #ifdef DEBUG_PUSH 5399: xmlGenericError(xmlGenericErrorContext, 5400: "HPP: entering PROLOG\n"); 5401: #endif 5402: } else if ((cur == '<') && (next == '!') && 5403: (avail < 9)) { 5404: goto done; 5405: } else { 5406: ctxt->instate = XML_PARSER_START_TAG; 5407: #ifdef DEBUG_PUSH 5408: xmlGenericError(xmlGenericErrorContext, 5409: "HPP: entering START_TAG\n"); 5410: #endif 5411: } 5412: break; 5413: case XML_PARSER_PROLOG: 5414: SKIP_BLANKS; 5415: if (in->buf == NULL) 5416: avail = in->length - (in->cur - in->base); 5417: else 5418: avail = in->buf->buffer->use - (in->cur - in->base); 5419: if (avail < 2) 5420: goto done; 5421: cur = in->cur[0]; 5422: next = in->cur[1]; 5423: if ((cur == '<') && (next == '!') && 5424: (in->cur[2] == '-') && (in->cur[3] == '-')) { 5425: if ((!terminate) && 5426: (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) 5427: goto done; 5428: #ifdef 
DEBUG_PUSH 5429: xmlGenericError(xmlGenericErrorContext, 5430: "HPP: Parsing Comment\n"); 5431: #endif 5432: htmlParseComment(ctxt); 5433: ctxt->instate = XML_PARSER_PROLOG; 5434: } else if ((cur == '<') && (next == '?')) { 5435: if ((!terminate) && 5436: (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5437: goto done; 5438: #ifdef DEBUG_PUSH 5439: xmlGenericError(xmlGenericErrorContext, 5440: "HPP: Parsing PI\n"); 5441: #endif 5442: htmlParsePI(ctxt); 5443: ctxt->instate = XML_PARSER_PROLOG; 5444: } else if ((cur == '<') && (next == '!') && 5445: (avail < 4)) { 5446: goto done; 5447: } else { 5448: ctxt->instate = XML_PARSER_START_TAG; 5449: #ifdef DEBUG_PUSH 5450: xmlGenericError(xmlGenericErrorContext, 5451: "HPP: entering START_TAG\n"); 5452: #endif 5453: } 5454: break; 5455: case XML_PARSER_EPILOG: 5456: if (in->buf == NULL) 5457: avail = in->length - (in->cur - in->base); 5458: else 5459: avail = in->buf->buffer->use - (in->cur - in->base); 5460: if (avail < 1) 5461: goto done; 5462: cur = in->cur[0]; 5463: if (IS_BLANK_CH(cur)) { 5464: htmlParseCharData(ctxt); 5465: goto done; 5466: } 5467: if (avail < 2) 5468: goto done; 5469: next = in->cur[1]; 5470: if ((cur == '<') && (next == '!') && 5471: (in->cur[2] == '-') && (in->cur[3] == '-')) { 5472: if ((!terminate) && 5473: (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) 5474: goto done; 5475: #ifdef DEBUG_PUSH 5476: xmlGenericError(xmlGenericErrorContext, 5477: "HPP: Parsing Comment\n"); 5478: #endif 5479: htmlParseComment(ctxt); 5480: ctxt->instate = XML_PARSER_EPILOG; 5481: } else if ((cur == '<') && (next == '?')) { 5482: if ((!terminate) && 5483: (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5484: goto done; 5485: #ifdef DEBUG_PUSH 5486: xmlGenericError(xmlGenericErrorContext, 5487: "HPP: Parsing PI\n"); 5488: #endif 5489: htmlParsePI(ctxt); 5490: ctxt->instate = XML_PARSER_EPILOG; 5491: } else if ((cur == '<') && (next == '!') && 5492: (avail < 4)) { 5493: goto done; 5494: } else { 
5495: ctxt->errNo = XML_ERR_DOCUMENT_END; 5496: ctxt->wellFormed = 0; 5497: ctxt->instate = XML_PARSER_EOF; 5498: #ifdef DEBUG_PUSH 5499: xmlGenericError(xmlGenericErrorContext, 5500: "HPP: entering EOF\n"); 5501: #endif 5502: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 5503: ctxt->sax->endDocument(ctxt->userData); 5504: goto done; 5505: } 5506: break; 5507: case XML_PARSER_START_TAG: { 5508: const xmlChar *name; 5509: int failed; 5510: const htmlElemDesc * info; 5511: 5512: /* 5513: * no chars in buffer 5514: */ 5515: if (avail < 1) 5516: goto done; 5517: /* 5518: * not enouth chars in buffer 5519: */ 5520: if (avail < 2) { 5521: if (!terminate) 5522: goto done; 5523: else 5524: next = ' '; 5525: } else { 5526: next = in->cur[1]; 5527: } 5528: cur = in->cur[0]; 5529: if (cur != '<') { 5530: ctxt->instate = XML_PARSER_CONTENT; 5531: #ifdef DEBUG_PUSH 5532: xmlGenericError(xmlGenericErrorContext, 5533: "HPP: entering CONTENT\n"); 5534: #endif 5535: break; 5536: } 5537: if (next == '/') { 5538: ctxt->instate = XML_PARSER_END_TAG; 5539: ctxt->checkIndex = 0; 5540: #ifdef DEBUG_PUSH 5541: xmlGenericError(xmlGenericErrorContext, 5542: "HPP: entering END_TAG\n"); 5543: #endif 5544: break; 5545: } 5546: if ((!terminate) && 5547: (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5548: goto done; 5549: 5550: /* Capture start position */ 5551: if (ctxt->record_info) { 5552: node_info.begin_pos = ctxt->input->consumed + 5553: (CUR_PTR - ctxt->input->base); 5554: node_info.begin_line = ctxt->input->line; 5555: } 5556: 5557: 5558: failed = htmlParseStartTag(ctxt); 5559: name = ctxt->name; 5560: if ((failed == -1) || 5561: (name == NULL)) { 5562: if (CUR == '>') 5563: NEXT; 5564: break; 5565: } 5566: 5567: /* 5568: * Lookup the info for that element. 
5569: */ 5570: info = htmlTagLookup(name); 5571: if (info == NULL) { 5572: htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, 5573: "Tag %s invalid\n", name, NULL); 5574: } 5575: 5576: /* 5577: * Check for an Empty Element labeled the XML/SGML way 5578: */ 5579: if ((CUR == '/') && (NXT(1) == '>')) { 5580: SKIP(2); 5581: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 5582: ctxt->sax->endElement(ctxt->userData, name); 5583: htmlnamePop(ctxt); 5584: ctxt->instate = XML_PARSER_CONTENT; 5585: #ifdef DEBUG_PUSH 5586: xmlGenericError(xmlGenericErrorContext, 5587: "HPP: entering CONTENT\n"); 5588: #endif 5589: break; 5590: } 5591: 5592: if (CUR == '>') { 5593: NEXT; 5594: } else { 5595: htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 5596: "Couldn't find end of Start Tag %s\n", 5597: name, NULL); 5598: 5599: /* 5600: * end of parsing of this node. 5601: */ 5602: if (xmlStrEqual(name, ctxt->name)) { 5603: nodePop(ctxt); 5604: htmlnamePop(ctxt); 5605: } 5606: 5607: if (ctxt->record_info) 5608: htmlNodeInfoPush(ctxt, &node_info); 5609: 5610: ctxt->instate = XML_PARSER_CONTENT; 5611: #ifdef DEBUG_PUSH 5612: xmlGenericError(xmlGenericErrorContext, 5613: "HPP: entering CONTENT\n"); 5614: #endif 5615: break; 5616: } 5617: 5618: /* 5619: * Check for an Empty Element from DTD definition 5620: */ 5621: if ((info != NULL) && (info->empty)) { 5622: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 5623: ctxt->sax->endElement(ctxt->userData, name); 5624: htmlnamePop(ctxt); 5625: } 5626: 5627: if (ctxt->record_info) 5628: htmlNodeInfoPush(ctxt, &node_info); 5629: 5630: ctxt->instate = XML_PARSER_CONTENT; 5631: #ifdef DEBUG_PUSH 5632: xmlGenericError(xmlGenericErrorContext, 5633: "HPP: entering CONTENT\n"); 5634: #endif 5635: break; 5636: } 5637: case XML_PARSER_CONTENT: { 5638: long cons; 5639: /* 5640: * Handle preparsed entities and charRef 5641: */ 5642: if (ctxt->token != 0) { 5643: xmlChar chr[2] = { 0 , 0 } ; 5644: 5645: chr[0] = (xmlChar) ctxt->token; 5646: 
htmlCheckParagraph(ctxt); 5647: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 5648: ctxt->sax->characters(ctxt->userData, chr, 1); 5649: ctxt->token = 0; 5650: ctxt->checkIndex = 0; 5651: } 5652: if ((avail == 1) && (terminate)) { 5653: cur = in->cur[0]; 5654: if ((cur != '<') && (cur != '&')) { 5655: if (ctxt->sax != NULL) { 5656: if (IS_BLANK_CH(cur)) { 5657: if (ctxt->sax->ignorableWhitespace != NULL) 5658: ctxt->sax->ignorableWhitespace( 5659: ctxt->userData, &cur, 1); 5660: } else { 5661: htmlCheckParagraph(ctxt); 5662: if (ctxt->sax->characters != NULL) 5663: ctxt->sax->characters( 5664: ctxt->userData, &cur, 1); 5665: } 5666: } 5667: ctxt->token = 0; 5668: ctxt->checkIndex = 0; 5669: in->cur++; 5670: break; 5671: } 5672: } 5673: if (avail < 2) 5674: goto done; 5675: cur = in->cur[0]; 5676: next = in->cur[1]; 5677: cons = ctxt->nbChars; 5678: if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) || 5679: (xmlStrEqual(ctxt->name, BAD_CAST"style"))) { 5680: /* 5681: * Handle SCRIPT/STYLE separately 5682: */ 5683: if (!terminate) { 5684: int idx; 5685: xmlChar val; 5686: 5687: idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0); 5688: if (idx < 0) 5689: goto done; 5690: val = in->cur[idx + 2]; 5691: if (val == 0) /* bad cut of input */ 5692: goto done; 5693: } 5694: htmlParseScript(ctxt); 5695: if ((cur == '<') && (next == '/')) { 5696: ctxt->instate = XML_PARSER_END_TAG; 5697: ctxt->checkIndex = 0; 5698: #ifdef DEBUG_PUSH 5699: xmlGenericError(xmlGenericErrorContext, 5700: "HPP: entering END_TAG\n"); 5701: #endif 5702: break; 5703: } 5704: } else { 5705: /* 5706: * Sometimes DOCTYPE arrives in the middle of the document 5707: */ 5708: if ((cur == '<') && (next == '!') && 5709: (UPP(2) == 'D') && (UPP(3) == 'O') && 5710: (UPP(4) == 'C') && (UPP(5) == 'T') && 5711: (UPP(6) == 'Y') && (UPP(7) == 'P') && 5712: (UPP(8) == 'E')) { 5713: if ((!terminate) && 5714: (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5715: goto done; 5716: htmlParseErr(ctxt, 
XML_HTML_STRUCURE_ERROR, 5717: "Misplaced DOCTYPE declaration\n", 5718: BAD_CAST "DOCTYPE" , NULL); 5719: htmlParseDocTypeDecl(ctxt); 5720: } else if ((cur == '<') && (next == '!') && 5721: (in->cur[2] == '-') && (in->cur[3] == '-')) { 5722: if ((!terminate) && 5723: (htmlParseLookupSequence( 5724: ctxt, '-', '-', '>', 1, 1) < 0)) 5725: goto done; 5726: #ifdef DEBUG_PUSH 5727: xmlGenericError(xmlGenericErrorContext, 5728: "HPP: Parsing Comment\n"); 5729: #endif 5730: htmlParseComment(ctxt); 5731: ctxt->instate = XML_PARSER_CONTENT; 5732: } else if ((cur == '<') && (next == '?')) { 5733: if ((!terminate) && 5734: (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5735: goto done; 5736: #ifdef DEBUG_PUSH 5737: xmlGenericError(xmlGenericErrorContext, 5738: "HPP: Parsing PI\n"); 5739: #endif 5740: htmlParsePI(ctxt); 5741: ctxt->instate = XML_PARSER_CONTENT; 5742: } else if ((cur == '<') && (next == '!') && (avail < 4)) { 5743: goto done; 5744: } else if ((cur == '<') && (next == '/')) { 5745: ctxt->instate = XML_PARSER_END_TAG; 5746: ctxt->checkIndex = 0; 5747: #ifdef DEBUG_PUSH 5748: xmlGenericError(xmlGenericErrorContext, 5749: "HPP: entering END_TAG\n"); 5750: #endif 5751: break; 5752: } else if (cur == '<') { 5753: ctxt->instate = XML_PARSER_START_TAG; 5754: ctxt->checkIndex = 0; 5755: #ifdef DEBUG_PUSH 5756: xmlGenericError(xmlGenericErrorContext, 5757: "HPP: entering START_TAG\n"); 5758: #endif 5759: break; 5760: } else if (cur == '&') { 5761: if ((!terminate) && 5762: (htmlParseLookupChars(ctxt, 5763: BAD_CAST "; >/", 4) < 0)) 5764: goto done; 5765: #ifdef DEBUG_PUSH 5766: xmlGenericError(xmlGenericErrorContext, 5767: "HPP: Parsing Reference\n"); 5768: #endif 5769: /* TODO: check generation of subtrees if noent !!! */ 5770: htmlParseReference(ctxt); 5771: } else { 5772: /* 5773: * check that the text sequence is complete 5774: * before handing out the data to the parser 5775: * to avoid problems with erroneous end of 5776: * data detection. 
5777: */ 5778: if ((!terminate) && 5779: (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0)) 5780: goto done; 5781: ctxt->checkIndex = 0; 5782: #ifdef DEBUG_PUSH 5783: xmlGenericError(xmlGenericErrorContext, 5784: "HPP: Parsing char data\n"); 5785: #endif 5786: htmlParseCharData(ctxt); 5787: } 5788: } 5789: if (cons == ctxt->nbChars) { 5790: if (ctxt->node != NULL) { 5791: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5792: "detected an error in element content\n", 5793: NULL, NULL); 5794: } 5795: NEXT; 5796: break; 5797: } 5798: 5799: break; 5800: } 5801: case XML_PARSER_END_TAG: 5802: if (avail < 2) 5803: goto done; 5804: if ((!terminate) && 5805: (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5806: goto done; 5807: htmlParseEndTag(ctxt); 5808: if (ctxt->nameNr == 0) { 5809: ctxt->instate = XML_PARSER_EPILOG; 5810: } else { 5811: ctxt->instate = XML_PARSER_CONTENT; 5812: } 5813: ctxt->checkIndex = 0; 5814: #ifdef DEBUG_PUSH 5815: xmlGenericError(xmlGenericErrorContext, 5816: "HPP: entering CONTENT\n"); 5817: #endif 5818: break; 5819: case XML_PARSER_CDATA_SECTION: 5820: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5821: "HPP: internal error, state == CDATA\n", 5822: NULL, NULL); 5823: ctxt->instate = XML_PARSER_CONTENT; 5824: ctxt->checkIndex = 0; 5825: #ifdef DEBUG_PUSH 5826: xmlGenericError(xmlGenericErrorContext, 5827: "HPP: entering CONTENT\n"); 5828: #endif 5829: break; 5830: case XML_PARSER_DTD: 5831: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5832: "HPP: internal error, state == DTD\n", 5833: NULL, NULL); 5834: ctxt->instate = XML_PARSER_CONTENT; 5835: ctxt->checkIndex = 0; 5836: #ifdef DEBUG_PUSH 5837: xmlGenericError(xmlGenericErrorContext, 5838: "HPP: entering CONTENT\n"); 5839: #endif 5840: break; 5841: case XML_PARSER_COMMENT: 5842: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5843: "HPP: internal error, state == COMMENT\n", 5844: NULL, NULL); 5845: ctxt->instate = XML_PARSER_CONTENT; 5846: ctxt->checkIndex = 0; 5847: #ifdef DEBUG_PUSH 5848: 
xmlGenericError(xmlGenericErrorContext, 5849: "HPP: entering CONTENT\n"); 5850: #endif 5851: break; 5852: case XML_PARSER_PI: 5853: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5854: "HPP: internal error, state == PI\n", 5855: NULL, NULL); 5856: ctxt->instate = XML_PARSER_CONTENT; 5857: ctxt->checkIndex = 0; 5858: #ifdef DEBUG_PUSH 5859: xmlGenericError(xmlGenericErrorContext, 5860: "HPP: entering CONTENT\n"); 5861: #endif 5862: break; 5863: case XML_PARSER_ENTITY_DECL: 5864: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5865: "HPP: internal error, state == ENTITY_DECL\n", 5866: NULL, NULL); 5867: ctxt->instate = XML_PARSER_CONTENT; 5868: ctxt->checkIndex = 0; 5869: #ifdef DEBUG_PUSH 5870: xmlGenericError(xmlGenericErrorContext, 5871: "HPP: entering CONTENT\n"); 5872: #endif 5873: break; 5874: case XML_PARSER_ENTITY_VALUE: 5875: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5876: "HPP: internal error, state == ENTITY_VALUE\n", 5877: NULL, NULL); 5878: ctxt->instate = XML_PARSER_CONTENT; 5879: ctxt->checkIndex = 0; 5880: #ifdef DEBUG_PUSH 5881: xmlGenericError(xmlGenericErrorContext, 5882: "HPP: entering DTD\n"); 5883: #endif 5884: break; 5885: case XML_PARSER_ATTRIBUTE_VALUE: 5886: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5887: "HPP: internal error, state == ATTRIBUTE_VALUE\n", 5888: NULL, NULL); 5889: ctxt->instate = XML_PARSER_START_TAG; 5890: ctxt->checkIndex = 0; 5891: #ifdef DEBUG_PUSH 5892: xmlGenericError(xmlGenericErrorContext, 5893: "HPP: entering START_TAG\n"); 5894: #endif 5895: break; 5896: case XML_PARSER_SYSTEM_LITERAL: 5897: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5898: "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n", 5899: NULL, NULL); 5900: ctxt->instate = XML_PARSER_CONTENT; 5901: ctxt->checkIndex = 0; 5902: #ifdef DEBUG_PUSH 5903: xmlGenericError(xmlGenericErrorContext, 5904: "HPP: entering CONTENT\n"); 5905: #endif 5906: break; 5907: case XML_PARSER_IGNORE: 5908: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5909: "HPP: internal error, 
state == XML_PARSER_IGNORE\n", 5910: NULL, NULL); 5911: ctxt->instate = XML_PARSER_CONTENT; 5912: ctxt->checkIndex = 0; 5913: #ifdef DEBUG_PUSH 5914: xmlGenericError(xmlGenericErrorContext, 5915: "HPP: entering CONTENT\n"); 5916: #endif 5917: break; 5918: case XML_PARSER_PUBLIC_LITERAL: 5919: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5920: "HPP: internal error, state == XML_PARSER_LITERAL\n", 5921: NULL, NULL); 5922: ctxt->instate = XML_PARSER_CONTENT; 5923: ctxt->checkIndex = 0; 5924: #ifdef DEBUG_PUSH 5925: xmlGenericError(xmlGenericErrorContext, 5926: "HPP: entering CONTENT\n"); 5927: #endif 5928: break; 5929: 5930: } 5931: } 5932: done: 5933: if ((avail == 0) && (terminate)) { 5934: htmlAutoCloseOnEnd(ctxt); 5935: if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) { 5936: /* 5937: * SAX: end of the document processing. 5938: */ 5939: ctxt->instate = XML_PARSER_EOF; 5940: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 5941: ctxt->sax->endDocument(ctxt->userData); 5942: } 5943: } 5944: if ((ctxt->myDoc != NULL) && 5945: ((terminate) || (ctxt->instate == XML_PARSER_EOF) || 5946: (ctxt->instate == XML_PARSER_EPILOG))) { 5947: xmlDtdPtr dtd; 5948: dtd = xmlGetIntSubset(ctxt->myDoc); 5949: if (dtd == NULL) 5950: ctxt->myDoc->intSubset = 5951: xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html", 5952: BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN", 5953: BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"); 5954: } 5955: #ifdef DEBUG_PUSH 5956: xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret); 5957: #endif 5958: return(ret); 5959: } 5960: 5961: /** 5962: * htmlParseChunk: 5963: * @ctxt: an HTML parser context 5964: * @chunk: an char array 5965: * @size: the size in byte of the chunk 5966: * @terminate: last chunk indicator 5967: * 5968: * Parse a Chunk of memory 5969: * 5970: * Returns zero if no error, the xmlParserErrors otherwise. 
5971: */ 5972: int 5973: htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size, 5974: int terminate) { 5975: if ((ctxt == NULL) || (ctxt->input == NULL)) { 5976: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5977: "htmlParseChunk: context error\n", NULL, NULL); 5978: return(XML_ERR_INTERNAL_ERROR); 5979: } 5980: if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && 5981: (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) { 5982: int base = ctxt->input->base - ctxt->input->buf->buffer->content; 5983: int cur = ctxt->input->cur - ctxt->input->base; 5984: int res; 5985: 5986: res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk); 5987: if (res < 0) { 5988: ctxt->errNo = XML_PARSER_EOF; 5989: ctxt->disableSAX = 1; 5990: return (XML_PARSER_EOF); 5991: } 5992: ctxt->input->base = ctxt->input->buf->buffer->content + base; 5993: ctxt->input->cur = ctxt->input->base + cur; 5994: ctxt->input->end = 5995: &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use]; 5996: #ifdef DEBUG_PUSH 5997: xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); 5998: #endif 5999: 6000: #if 0 6001: if ((terminate) || (ctxt->input->buf->buffer->use > 80)) 6002: htmlParseTryOrFinish(ctxt, terminate); 6003: #endif 6004: } else if (ctxt->instate != XML_PARSER_EOF) { 6005: if ((ctxt->input != NULL) && ctxt->input->buf != NULL) { 6006: xmlParserInputBufferPtr in = ctxt->input->buf; 6007: if ((in->encoder != NULL) && (in->buffer != NULL) && 6008: (in->raw != NULL)) { 6009: int nbchars; 6010: 6011: nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw); 6012: if (nbchars < 0) { 6013: htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 6014: "encoder error\n", NULL, NULL); 6015: return(XML_ERR_INVALID_ENCODING); 6016: } 6017: } 6018: } 6019: } 6020: htmlParseTryOrFinish(ctxt, terminate); 6021: if (terminate) { 6022: if ((ctxt->instate != XML_PARSER_EOF) && 6023: (ctxt->instate != XML_PARSER_EPILOG) && 6024: (ctxt->instate != XML_PARSER_MISC)) 
{ 6025: ctxt->errNo = XML_ERR_DOCUMENT_END; 6026: ctxt->wellFormed = 0; 6027: } 6028: if (ctxt->instate != XML_PARSER_EOF) { 6029: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 6030: ctxt->sax->endDocument(ctxt->userData); 6031: } 6032: ctxt->instate = XML_PARSER_EOF; 6033: } 6034: return((xmlParserErrors) ctxt->errNo); 6035: } 6036: 6037: /************************************************************************ 6038: * * 6039: * User entry points * 6040: * * 6041: ************************************************************************/ 6042: 6043: /** 6044: * htmlCreatePushParserCtxt: 6045: * @sax: a SAX handler 6046: * @user_data: The user data returned on SAX callbacks 6047: * @chunk: a pointer to an array of chars 6048: * @size: number of chars in the array 6049: * @filename: an optional file name or URI 6050: * @enc: an optional encoding 6051: * 6052: * Create a parser context for using the HTML parser in push mode 6053: * The value of @filename is used for fetching external entities 6054: * and error/warning reports. 
6055: * 6056: * Returns the new parser context or NULL 6057: */ 6058: htmlParserCtxtPtr 6059: htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data, 6060: const char *chunk, int size, const char *filename, 6061: xmlCharEncoding enc) { 6062: htmlParserCtxtPtr ctxt; 6063: htmlParserInputPtr inputStream; 6064: xmlParserInputBufferPtr buf; 6065: 6066: xmlInitParser(); 6067: 6068: buf = xmlAllocParserInputBuffer(enc); 6069: if (buf == NULL) return(NULL); 6070: 6071: ctxt = htmlNewParserCtxt(); 6072: if (ctxt == NULL) { 6073: xmlFreeParserInputBuffer(buf); 6074: return(NULL); 6075: } 6076: if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder) 6077: ctxt->charset=XML_CHAR_ENCODING_UTF8; 6078: if (sax != NULL) { 6079: if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler) 6080: xmlFree(ctxt->sax); 6081: ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler)); 6082: if (ctxt->sax == NULL) { 6083: xmlFree(buf); 6084: xmlFree(ctxt); 6085: return(NULL); 6086: } 6087: memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler)); 6088: if (user_data != NULL) 6089: ctxt->userData = user_data; 6090: } 6091: if (filename == NULL) { 6092: ctxt->directory = NULL; 6093: } else { 6094: ctxt->directory = xmlParserGetDirectory(filename); 6095: } 6096: 6097: inputStream = htmlNewInputStream(ctxt); 6098: if (inputStream == NULL) { 6099: xmlFreeParserCtxt(ctxt); 6100: xmlFree(buf); 6101: return(NULL); 6102: } 6103: 6104: if (filename == NULL) 6105: inputStream->filename = NULL; 6106: else 6107: inputStream->filename = (char *) 6108: xmlCanonicPath((const xmlChar *) filename); 6109: inputStream->buf = buf; 6110: inputStream->base = inputStream->buf->buffer->content; 6111: inputStream->cur = inputStream->buf->buffer->content; 6112: inputStream->end = 6113: &inputStream->buf->buffer->content[inputStream->buf->buffer->use]; 6114: 6115: inputPush(ctxt, inputStream); 6116: 6117: if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && 6118: (ctxt->input->buf != NULL)) { 6119: int base = 
ctxt->input->base - ctxt->input->buf->buffer->content; 6120: int cur = ctxt->input->cur - ctxt->input->base; 6121: 6122: xmlParserInputBufferPush(ctxt->input->buf, size, chunk); 6123: 6124: ctxt->input->base = ctxt->input->buf->buffer->content + base; 6125: ctxt->input->cur = ctxt->input->base + cur; 6126: ctxt->input->end = 6127: &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use]; 6128: #ifdef DEBUG_PUSH 6129: xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); 6130: #endif 6131: } 6132: ctxt->progressive = 1; 6133: 6134: return(ctxt); 6135: } 6136: #endif /* LIBXML_PUSH_ENABLED */ 6137: 6138: /** 6139: * htmlSAXParseDoc: 6140: * @cur: a pointer to an array of xmlChar 6141: * @encoding: a free form C string describing the HTML document encoding, or NULL 6142: * @sax: the SAX handler block 6143: * @userData: if using SAX, this pointer will be provided on callbacks. 6144: * 6145: * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks 6146: * to handle parse events. If sax is NULL, fallback to the default DOM 6147: * behavior and return a tree. 6148: * 6149: * Returns the resulting document tree unless SAX is NULL or the document is 6150: * not well formed. 
6151: */ 6152: 6153: htmlDocPtr 6154: htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) { 6155: htmlDocPtr ret; 6156: htmlParserCtxtPtr ctxt; 6157: 6158: xmlInitParser(); 6159: 6160: if (cur == NULL) return(NULL); 6161: 6162: 6163: ctxt = htmlCreateDocParserCtxt(cur, encoding); 6164: if (ctxt == NULL) return(NULL); 6165: if (sax != NULL) { 6166: if (ctxt->sax != NULL) xmlFree (ctxt->sax); 6167: ctxt->sax = sax; 6168: ctxt->userData = userData; 6169: } 6170: 6171: htmlParseDocument(ctxt); 6172: ret = ctxt->myDoc; 6173: if (sax != NULL) { 6174: ctxt->sax = NULL; 6175: ctxt->userData = NULL; 6176: } 6177: htmlFreeParserCtxt(ctxt); 6178: 6179: return(ret); 6180: } 6181: 6182: /** 6183: * htmlParseDoc: 6184: * @cur: a pointer to an array of xmlChar 6185: * @encoding: a free form C string describing the HTML document encoding, or NULL 6186: * 6187: * parse an HTML in-memory document and build a tree. 6188: * 6189: * Returns the resulting document tree 6190: */ 6191: 6192: htmlDocPtr 6193: htmlParseDoc(xmlChar *cur, const char *encoding) { 6194: return(htmlSAXParseDoc(cur, encoding, NULL, NULL)); 6195: } 6196: 6197: 6198: /** 6199: * htmlCreateFileParserCtxt: 6200: * @filename: the filename 6201: * @encoding: a free form C string describing the HTML document encoding, or NULL 6202: * 6203: * Create a parser context for a file content. 6204: * Automatic support for ZLIB/Compress compressed document is provided 6205: * by default if found at compile-time. 
6206: * 6207: * Returns the new parser context or NULL 6208: */ 6209: htmlParserCtxtPtr 6210: htmlCreateFileParserCtxt(const char *filename, const char *encoding) 6211: { 6212: htmlParserCtxtPtr ctxt; 6213: htmlParserInputPtr inputStream; 6214: char *canonicFilename; 6215: /* htmlCharEncoding enc; */ 6216: xmlChar *content, *content_line = (xmlChar *) "charset="; 6217: 6218: if (filename == NULL) 6219: return(NULL); 6220: 6221: ctxt = htmlNewParserCtxt(); 6222: if (ctxt == NULL) { 6223: return(NULL); 6224: } 6225: canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename); 6226: if (canonicFilename == NULL) { 6227: #ifdef LIBXML_SAX1_ENABLED 6228: if (xmlDefaultSAXHandler.error != NULL) { 6229: xmlDefaultSAXHandler.error(NULL, "out of memory\n"); 6230: } 6231: #endif 6232: xmlFreeParserCtxt(ctxt); 6233: return(NULL); 6234: } 6235: 6236: inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt); 6237: xmlFree(canonicFilename); 6238: if (inputStream == NULL) { 6239: xmlFreeParserCtxt(ctxt); 6240: return(NULL); 6241: } 6242: 6243: inputPush(ctxt, inputStream); 6244: 6245: /* set encoding */ 6246: if (encoding) { 6247: content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1); 6248: if (content) { 6249: strcpy ((char *)content, (char *)content_line); 6250: strcat ((char *)content, (char *)encoding); 6251: htmlCheckEncoding (ctxt, content); 6252: xmlFree (content); 6253: } 6254: } 6255: 6256: return(ctxt); 6257: } 6258: 6259: /** 6260: * htmlSAXParseFile: 6261: * @filename: the filename 6262: * @encoding: a free form C string describing the HTML document encoding, or NULL 6263: * @sax: the SAX handler block 6264: * @userData: if using SAX, this pointer will be provided on callbacks. 6265: * 6266: * parse an HTML file and build a tree. Automatic support for ZLIB/Compress 6267: * compressed document is provided by default if found at compile-time. 6268: * It use the given SAX function block to handle the parsing callback. 
6269: * If sax is NULL, fallback to the default DOM tree building routines. 6270: * 6271: * Returns the resulting document tree unless SAX is NULL or the document is 6272: * not well formed. 6273: */ 6274: 6275: htmlDocPtr 6276: htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax, 6277: void *userData) { 6278: htmlDocPtr ret; 6279: htmlParserCtxtPtr ctxt; 6280: htmlSAXHandlerPtr oldsax = NULL; 6281: 6282: xmlInitParser(); 6283: 6284: ctxt = htmlCreateFileParserCtxt(filename, encoding); 6285: if (ctxt == NULL) return(NULL); 6286: if (sax != NULL) { 6287: oldsax = ctxt->sax; 6288: ctxt->sax = sax; 6289: ctxt->userData = userData; 6290: } 6291: 6292: htmlParseDocument(ctxt); 6293: 6294: ret = ctxt->myDoc; 6295: if (sax != NULL) { 6296: ctxt->sax = oldsax; 6297: ctxt->userData = NULL; 6298: } 6299: htmlFreeParserCtxt(ctxt); 6300: 6301: return(ret); 6302: } 6303: 6304: /** 6305: * htmlParseFile: 6306: * @filename: the filename 6307: * @encoding: a free form C string describing the HTML document encoding, or NULL 6308: * 6309: * parse an HTML file and build a tree. Automatic support for ZLIB/Compress 6310: * compressed document is provided by default if found at compile-time. 6311: * 6312: * Returns the resulting document tree 6313: */ 6314: 6315: htmlDocPtr 6316: htmlParseFile(const char *filename, const char *encoding) { 6317: return(htmlSAXParseFile(filename, encoding, NULL, NULL)); 6318: } 6319: 6320: /** 6321: * htmlHandleOmittedElem: 6322: * @val: int 0 or 1 6323: * 6324: * Set and return the previous value for handling HTML omitted tags. 6325: * 6326: * Returns the last value for 0 for no handling, 1 for auto insertion. 
6327: */ 6328: 6329: int 6330: htmlHandleOmittedElem(int val) { 6331: int old = htmlOmittedDefaultValue; 6332: 6333: htmlOmittedDefaultValue = val; 6334: return(old); 6335: } 6336: 6337: /** 6338: * htmlElementAllowedHere: 6339: * @parent: HTML parent element 6340: * @elt: HTML element 6341: * 6342: * Checks whether an HTML element may be a direct child of a parent element. 6343: * Note - doesn't check for deprecated elements 6344: * 6345: * Returns 1 if allowed; 0 otherwise. 6346: */ 6347: int 6348: htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) { 6349: const char** p ; 6350: 6351: if ( ! elt || ! parent || ! parent->subelts ) 6352: return 0 ; 6353: 6354: for ( p = parent->subelts; *p; ++p ) 6355: if ( !xmlStrcmp((const xmlChar *)*p, elt) ) 6356: return 1 ; 6357: 6358: return 0 ; 6359: } 6360: /** 6361: * htmlElementStatusHere: 6362: * @parent: HTML parent element 6363: * @elt: HTML element 6364: * 6365: * Checks whether an HTML element may be a direct child of a parent element. 6366: * and if so whether it is valid or deprecated. 6367: * 6368: * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID 6369: */ 6370: htmlStatus 6371: htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) { 6372: if ( ! parent || ! elt ) 6373: return HTML_INVALID ; 6374: if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) ) 6375: return HTML_INVALID ; 6376: 6377: return ( elt->dtd == 0 ) ? 
HTML_VALID : HTML_DEPRECATED ; 6378: } 6379: /** 6380: * htmlAttrAllowed: 6381: * @elt: HTML element 6382: * @attr: HTML attribute 6383: * @legacy: whether to allow deprecated attributes 6384: * 6385: * Checks whether an attribute is valid for an element 6386: * Has full knowledge of Required and Deprecated attributes 6387: * 6388: * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID 6389: */ 6390: htmlStatus 6391: htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) { 6392: const char** p ; 6393: 6394: if ( !elt || ! attr ) 6395: return HTML_INVALID ; 6396: 6397: if ( elt->attrs_req ) 6398: for ( p = elt->attrs_req; *p; ++p) 6399: if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 6400: return HTML_REQUIRED ; 6401: 6402: if ( elt->attrs_opt ) 6403: for ( p = elt->attrs_opt; *p; ++p) 6404: if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 6405: return HTML_VALID ; 6406: 6407: if ( legacy && elt->attrs_depr ) 6408: for ( p = elt->attrs_depr; *p; ++p) 6409: if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 6410: return HTML_DEPRECATED ; 6411: 6412: return HTML_INVALID ; 6413: } 6414: /** 6415: * htmlNodeStatus: 6416: * @node: an htmlNodePtr in a tree 6417: * @legacy: whether to allow deprecated elements (YES is faster here 6418: * for Element nodes) 6419: * 6420: * Checks whether the tree node is valid. Experimental (the author 6421: * only uses the HTML enhancements in a SAX parser) 6422: * 6423: * Return: for Element nodes, a return from htmlElementAllowedHere (if 6424: * legacy allowed) or htmlElementStatusHere (otherwise). 6425: * for Attribute nodes, a return from htmlAttrAllowed 6426: * for other nodes, HTML_NA (no checks performed) 6427: */ 6428: htmlStatus 6429: htmlNodeStatus(const htmlNodePtr node, int legacy) { 6430: if ( ! node ) 6431: return HTML_INVALID ; 6432: 6433: switch ( node->type ) { 6434: case XML_ELEMENT_NODE: 6435: return legacy 6436: ? 
( htmlElementAllowedHere ( 6437: htmlTagLookup(node->parent->name) , node->name 6438: ) ? HTML_VALID : HTML_INVALID ) 6439: : htmlElementStatusHere( 6440: htmlTagLookup(node->parent->name) , 6441: htmlTagLookup(node->name) ) 6442: ; 6443: case XML_ATTRIBUTE_NODE: 6444: return htmlAttrAllowed( 6445: htmlTagLookup(node->parent->name) , node->name, legacy) ; 6446: default: return HTML_NA ; 6447: } 6448: } 6449: /************************************************************************ 6450: * * 6451: * New set (2.6.0) of simpler and more flexible APIs * 6452: * * 6453: ************************************************************************/ 6454: /** 6455: * DICT_FREE: 6456: * @str: a string 6457: * 6458: * Free a string if it is not owned by the "dict" dictionnary in the 6459: * current scope 6460: */ 6461: #define DICT_FREE(str) \ 6462: if ((str) && ((!dict) || \ 6463: (xmlDictOwns(dict, (const xmlChar *)(str)) == 0))) \ 6464: xmlFree((char *)(str)); 6465: 6466: /** 6467: * htmlCtxtReset: 6468: * @ctxt: an HTML parser context 6469: * 6470: * Reset a parser context 6471: */ 6472: void 6473: htmlCtxtReset(htmlParserCtxtPtr ctxt) 6474: { 6475: xmlParserInputPtr input; 6476: xmlDictPtr dict; 6477: 6478: if (ctxt == NULL) 6479: return; 6480: 6481: xmlInitParser(); 6482: dict = ctxt->dict; 6483: 6484: while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */ 6485: xmlFreeInputStream(input); 6486: } 6487: ctxt->inputNr = 0; 6488: ctxt->input = NULL; 6489: 6490: ctxt->spaceNr = 0; 6491: if (ctxt->spaceTab != NULL) { 6492: ctxt->spaceTab[0] = -1; 6493: ctxt->space = &ctxt->spaceTab[0]; 6494: } else { 6495: ctxt->space = NULL; 6496: } 6497: 6498: 6499: ctxt->nodeNr = 0; 6500: ctxt->node = NULL; 6501: 6502: ctxt->nameNr = 0; 6503: ctxt->name = NULL; 6504: 6505: DICT_FREE(ctxt->version); 6506: ctxt->version = NULL; 6507: DICT_FREE(ctxt->encoding); 6508: ctxt->encoding = NULL; 6509: DICT_FREE(ctxt->directory); 6510: ctxt->directory = NULL; 6511: DICT_FREE(ctxt->extSubURI); 
6512: ctxt->extSubURI = NULL; 6513: DICT_FREE(ctxt->extSubSystem); 6514: ctxt->extSubSystem = NULL; 6515: if (ctxt->myDoc != NULL) 6516: xmlFreeDoc(ctxt->myDoc); 6517: ctxt->myDoc = NULL; 6518: 6519: ctxt->standalone = -1; 6520: ctxt->hasExternalSubset = 0; 6521: ctxt->hasPErefs = 0; 6522: ctxt->html = 1; 6523: ctxt->external = 0; 6524: ctxt->instate = XML_PARSER_START; 6525: ctxt->token = 0; 6526: 6527: ctxt->wellFormed = 1; 6528: ctxt->nsWellFormed = 1; 6529: ctxt->disableSAX = 0; 6530: ctxt->valid = 1; 6531: ctxt->vctxt.userData = ctxt; 6532: ctxt->vctxt.error = xmlParserValidityError; 6533: ctxt->vctxt.warning = xmlParserValidityWarning; 6534: ctxt->record_info = 0; 6535: ctxt->nbChars = 0; 6536: ctxt->checkIndex = 0; 6537: ctxt->inSubset = 0; 6538: ctxt->errNo = XML_ERR_OK; 6539: ctxt->depth = 0; 6540: ctxt->charset = XML_CHAR_ENCODING_NONE; 6541: ctxt->catalogs = NULL; 6542: xmlInitNodeInfoSeq(&ctxt->node_seq); 6543: 6544: if (ctxt->attsDefault != NULL) { 6545: xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree); 6546: ctxt->attsDefault = NULL; 6547: } 6548: if (ctxt->attsSpecial != NULL) { 6549: xmlHashFree(ctxt->attsSpecial, NULL); 6550: ctxt->attsSpecial = NULL; 6551: } 6552: } 6553: 6554: /** 6555: * htmlCtxtUseOptions: 6556: * @ctxt: an HTML parser context 6557: * @options: a combination of htmlParserOption(s) 6558: * 6559: * Applies the options to the parser context 6560: * 6561: * Returns 0 in case of success, the set of unknown or unimplemented options 6562: * in case of error. 
6563: */ 6564: int 6565: htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options) 6566: { 6567: if (ctxt == NULL) 6568: return(-1); 6569: 6570: if (options & HTML_PARSE_NOWARNING) { 6571: ctxt->sax->warning = NULL; 6572: ctxt->vctxt.warning = NULL; 6573: options -= XML_PARSE_NOWARNING; 6574: ctxt->options |= XML_PARSE_NOWARNING; 6575: } 6576: if (options & HTML_PARSE_NOERROR) { 6577: ctxt->sax->error = NULL; 6578: ctxt->vctxt.error = NULL; 6579: ctxt->sax->fatalError = NULL; 6580: options -= XML_PARSE_NOERROR; 6581: ctxt->options |= XML_PARSE_NOERROR; 6582: } 6583: if (options & HTML_PARSE_PEDANTIC) { 6584: ctxt->pedantic = 1; 6585: options -= XML_PARSE_PEDANTIC; 6586: ctxt->options |= XML_PARSE_PEDANTIC; 6587: } else 6588: ctxt->pedantic = 0; 6589: if (options & XML_PARSE_NOBLANKS) { 6590: ctxt->keepBlanks = 0; 6591: ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace; 6592: options -= XML_PARSE_NOBLANKS; 6593: ctxt->options |= XML_PARSE_NOBLANKS; 6594: } else 6595: ctxt->keepBlanks = 1; 6596: if (options & HTML_PARSE_RECOVER) { 6597: ctxt->recovery = 1; 6598: options -= HTML_PARSE_RECOVER; 6599: } else 6600: ctxt->recovery = 0; 6601: if (options & HTML_PARSE_COMPACT) { 6602: ctxt->options |= HTML_PARSE_COMPACT; 6603: options -= HTML_PARSE_COMPACT; 6604: } 6605: if (options & XML_PARSE_HUGE) { 6606: ctxt->options |= XML_PARSE_HUGE; 6607: options -= XML_PARSE_HUGE; 6608: } 6609: if (options & HTML_PARSE_NODEFDTD) { 6610: ctxt->options |= HTML_PARSE_NODEFDTD; 6611: options -= HTML_PARSE_NODEFDTD; 6612: } 6613: if (options & HTML_PARSE_IGNORE_ENC) { 6614: ctxt->options |= HTML_PARSE_IGNORE_ENC; 6615: options -= HTML_PARSE_IGNORE_ENC; 6616: } 6617: if (options & HTML_PARSE_NOIMPLIED) { 6618: ctxt->options |= HTML_PARSE_NOIMPLIED; 6619: options -= HTML_PARSE_NOIMPLIED; 6620: } 6621: ctxt->dictNames = 0; 6622: return (options); 6623: } 6624: 6625: /** 6626: * htmlDoRead: 6627: * @ctxt: an HTML parser context 6628: * @URL: the base URL to use for the document 
6629: * @encoding: the document encoding, or NULL 6630: * @options: a combination of htmlParserOption(s) 6631: * @reuse: keep the context for reuse 6632: * 6633: * Common front-end for the htmlRead functions 6634: * 6635: * Returns the resulting document tree or NULL 6636: */ 6637: static htmlDocPtr 6638: htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding, 6639: int options, int reuse) 6640: { 6641: htmlDocPtr ret; 6642: 6643: htmlCtxtUseOptions(ctxt, options); 6644: ctxt->html = 1; 6645: if (encoding != NULL) { 6646: xmlCharEncodingHandlerPtr hdlr; 6647: 6648: hdlr = xmlFindCharEncodingHandler(encoding); 6649: if (hdlr != NULL) { 6650: xmlSwitchToEncoding(ctxt, hdlr); 6651: if (ctxt->input->encoding != NULL) 6652: xmlFree((xmlChar *) ctxt->input->encoding); 6653: ctxt->input->encoding = xmlStrdup((xmlChar *)encoding); 6654: } 6655: } 6656: if ((URL != NULL) && (ctxt->input != NULL) && 6657: (ctxt->input->filename == NULL)) 6658: ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL); 6659: htmlParseDocument(ctxt); 6660: ret = ctxt->myDoc; 6661: ctxt->myDoc = NULL; 6662: if (!reuse) { 6663: if ((ctxt->dictNames) && 6664: (ret != NULL) && 6665: (ret->dict == ctxt->dict)) 6666: ctxt->dict = NULL; 6667: xmlFreeParserCtxt(ctxt); 6668: } 6669: return (ret); 6670: } 6671: 6672: /** 6673: * htmlReadDoc: 6674: * @cur: a pointer to a zero terminated string 6675: * @URL: the base URL to use for the document 6676: * @encoding: the document encoding, or NULL 6677: * @options: a combination of htmlParserOption(s) 6678: * 6679: * parse an XML in-memory document and build a tree. 
6680: * 6681: * Returns the resulting document tree 6682: */ 6683: htmlDocPtr 6684: htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options) 6685: { 6686: htmlParserCtxtPtr ctxt; 6687: 6688: if (cur == NULL) 6689: return (NULL); 6690: 6691: xmlInitParser(); 6692: ctxt = htmlCreateDocParserCtxt(cur, NULL); 6693: if (ctxt == NULL) 6694: return (NULL); 6695: return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6696: } 6697: 6698: /** 6699: * htmlReadFile: 6700: * @filename: a file or URL 6701: * @encoding: the document encoding, or NULL 6702: * @options: a combination of htmlParserOption(s) 6703: * 6704: * parse an XML file from the filesystem or the network. 6705: * 6706: * Returns the resulting document tree 6707: */ 6708: htmlDocPtr 6709: htmlReadFile(const char *filename, const char *encoding, int options) 6710: { 6711: htmlParserCtxtPtr ctxt; 6712: 6713: xmlInitParser(); 6714: ctxt = htmlCreateFileParserCtxt(filename, encoding); 6715: if (ctxt == NULL) 6716: return (NULL); 6717: return (htmlDoRead(ctxt, NULL, NULL, options, 0)); 6718: } 6719: 6720: /** 6721: * htmlReadMemory: 6722: * @buffer: a pointer to a char array 6723: * @size: the size of the array 6724: * @URL: the base URL to use for the document 6725: * @encoding: the document encoding, or NULL 6726: * @options: a combination of htmlParserOption(s) 6727: * 6728: * parse an XML in-memory document and build a tree. 
6729: * 6730: * Returns the resulting document tree 6731: */ 6732: htmlDocPtr 6733: htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options) 6734: { 6735: htmlParserCtxtPtr ctxt; 6736: 6737: xmlInitParser(); 6738: ctxt = xmlCreateMemoryParserCtxt(buffer, size); 6739: if (ctxt == NULL) 6740: return (NULL); 6741: htmlDefaultSAXHandlerInit(); 6742: if (ctxt->sax != NULL) 6743: memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1)); 6744: return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6745: } 6746: 6747: /** 6748: * htmlReadFd: 6749: * @fd: an open file descriptor 6750: * @URL: the base URL to use for the document 6751: * @encoding: the document encoding, or NULL 6752: * @options: a combination of htmlParserOption(s) 6753: * 6754: * parse an XML from a file descriptor and build a tree. 6755: * 6756: * Returns the resulting document tree 6757: */ 6758: htmlDocPtr 6759: htmlReadFd(int fd, const char *URL, const char *encoding, int options) 6760: { 6761: htmlParserCtxtPtr ctxt; 6762: xmlParserInputBufferPtr input; 6763: xmlParserInputPtr stream; 6764: 6765: if (fd < 0) 6766: return (NULL); 6767: 6768: xmlInitParser(); 6769: input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); 6770: if (input == NULL) 6771: return (NULL); 6772: ctxt = xmlNewParserCtxt(); 6773: if (ctxt == NULL) { 6774: xmlFreeParserInputBuffer(input); 6775: return (NULL); 6776: } 6777: stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6778: if (stream == NULL) { 6779: xmlFreeParserInputBuffer(input); 6780: xmlFreeParserCtxt(ctxt); 6781: return (NULL); 6782: } 6783: inputPush(ctxt, stream); 6784: return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6785: } 6786: 6787: /** 6788: * htmlReadIO: 6789: * @ioread: an I/O read function 6790: * @ioclose: an I/O close function 6791: * @ioctx: an I/O handler 6792: * @URL: the base URL to use for the document 6793: * @encoding: the document encoding, or NULL 6794: * @options: a 
combination of htmlParserOption(s) 6795: * 6796: * parse an HTML document from I/O functions and source and build a tree. 6797: * 6798: * Returns the resulting document tree 6799: */ 6800: htmlDocPtr 6801: htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, 6802: void *ioctx, const char *URL, const char *encoding, int options) 6803: { 6804: htmlParserCtxtPtr ctxt; 6805: xmlParserInputBufferPtr input; 6806: xmlParserInputPtr stream; 6807: 6808: if (ioread == NULL) 6809: return (NULL); 6810: xmlInitParser(); 6811: 6812: input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx, 6813: XML_CHAR_ENCODING_NONE); 6814: if (input == NULL) { 6815: if (ioclose != NULL) 6816: ioclose(ioctx); 6817: return (NULL); 6818: } 6819: ctxt = htmlNewParserCtxt(); 6820: if (ctxt == NULL) { 6821: xmlFreeParserInputBuffer(input); 6822: return (NULL); 6823: } 6824: stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6825: if (stream == NULL) { 6826: xmlFreeParserInputBuffer(input); 6827: xmlFreeParserCtxt(ctxt); 6828: return (NULL); 6829: } 6830: inputPush(ctxt, stream); 6831: return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6832: } 6833: 6834: /** 6835: * htmlCtxtReadDoc: 6836: * @ctxt: an HTML parser context 6837: * @cur: a pointer to a zero terminated string 6838: * @URL: the base URL to use for the document 6839: * @encoding: the document encoding, or NULL 6840: * @options: a combination of htmlParserOption(s) 6841: * 6842: * parse an XML in-memory document and build a tree. 
6843: * This reuses the existing @ctxt parser context 6844: * 6845: * Returns the resulting document tree 6846: */ 6847: htmlDocPtr 6848: htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur, 6849: const char *URL, const char *encoding, int options) 6850: { 6851: xmlParserInputPtr stream; 6852: 6853: if (cur == NULL) 6854: return (NULL); 6855: if (ctxt == NULL) 6856: return (NULL); 6857: 6858: htmlCtxtReset(ctxt); 6859: 6860: stream = xmlNewStringInputStream(ctxt, cur); 6861: if (stream == NULL) { 6862: return (NULL); 6863: } 6864: inputPush(ctxt, stream); 6865: return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6866: } 6867: 6868: /** 6869: * htmlCtxtReadFile: 6870: * @ctxt: an HTML parser context 6871: * @filename: a file or URL 6872: * @encoding: the document encoding, or NULL 6873: * @options: a combination of htmlParserOption(s) 6874: * 6875: * parse an XML file from the filesystem or the network. 6876: * This reuses the existing @ctxt parser context 6877: * 6878: * Returns the resulting document tree 6879: */ 6880: htmlDocPtr 6881: htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename, 6882: const char *encoding, int options) 6883: { 6884: xmlParserInputPtr stream; 6885: 6886: if (filename == NULL) 6887: return (NULL); 6888: if (ctxt == NULL) 6889: return (NULL); 6890: 6891: htmlCtxtReset(ctxt); 6892: 6893: stream = xmlLoadExternalEntity(filename, NULL, ctxt); 6894: if (stream == NULL) { 6895: return (NULL); 6896: } 6897: inputPush(ctxt, stream); 6898: return (htmlDoRead(ctxt, NULL, encoding, options, 1)); 6899: } 6900: 6901: /** 6902: * htmlCtxtReadMemory: 6903: * @ctxt: an HTML parser context 6904: * @buffer: a pointer to a char array 6905: * @size: the size of the array 6906: * @URL: the base URL to use for the document 6907: * @encoding: the document encoding, or NULL 6908: * @options: a combination of htmlParserOption(s) 6909: * 6910: * parse an XML in-memory document and build a tree. 
6911: * This reuses the existing @ctxt parser context 6912: * 6913: * Returns the resulting document tree 6914: */ 6915: htmlDocPtr 6916: htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size, 6917: const char *URL, const char *encoding, int options) 6918: { 6919: xmlParserInputBufferPtr input; 6920: xmlParserInputPtr stream; 6921: 6922: if (ctxt == NULL) 6923: return (NULL); 6924: if (buffer == NULL) 6925: return (NULL); 6926: 6927: htmlCtxtReset(ctxt); 6928: 6929: input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE); 6930: if (input == NULL) { 6931: return(NULL); 6932: } 6933: 6934: stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6935: if (stream == NULL) { 6936: xmlFreeParserInputBuffer(input); 6937: return(NULL); 6938: } 6939: 6940: inputPush(ctxt, stream); 6941: return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6942: } 6943: 6944: /** 6945: * htmlCtxtReadFd: 6946: * @ctxt: an HTML parser context 6947: * @fd: an open file descriptor 6948: * @URL: the base URL to use for the document 6949: * @encoding: the document encoding, or NULL 6950: * @options: a combination of htmlParserOption(s) 6951: * 6952: * parse an XML from a file descriptor and build a tree. 
6953: * This reuses the existing @ctxt parser context 6954: * 6955: * Returns the resulting document tree 6956: */ 6957: htmlDocPtr 6958: htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd, 6959: const char *URL, const char *encoding, int options) 6960: { 6961: xmlParserInputBufferPtr input; 6962: xmlParserInputPtr stream; 6963: 6964: if (fd < 0) 6965: return (NULL); 6966: if (ctxt == NULL) 6967: return (NULL); 6968: 6969: htmlCtxtReset(ctxt); 6970: 6971: 6972: input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); 6973: if (input == NULL) 6974: return (NULL); 6975: stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6976: if (stream == NULL) { 6977: xmlFreeParserInputBuffer(input); 6978: return (NULL); 6979: } 6980: inputPush(ctxt, stream); 6981: return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6982: } 6983: 6984: /** 6985: * htmlCtxtReadIO: 6986: * @ctxt: an HTML parser context 6987: * @ioread: an I/O read function 6988: * @ioclose: an I/O close function 6989: * @ioctx: an I/O handler 6990: * @URL: the base URL to use for the document 6991: * @encoding: the document encoding, or NULL 6992: * @options: a combination of htmlParserOption(s) 6993: * 6994: * parse an HTML document from I/O functions and source and build a tree. 
6995: * This reuses the existing @ctxt parser context 6996: * 6997: * Returns the resulting document tree 6998: */ 6999: htmlDocPtr 7000: htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread, 7001: xmlInputCloseCallback ioclose, void *ioctx, 7002: const char *URL, 7003: const char *encoding, int options) 7004: { 7005: xmlParserInputBufferPtr input; 7006: xmlParserInputPtr stream; 7007: 7008: if (ioread == NULL) 7009: return (NULL); 7010: if (ctxt == NULL) 7011: return (NULL); 7012: 7013: htmlCtxtReset(ctxt); 7014: 7015: input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx, 7016: XML_CHAR_ENCODING_NONE); 7017: if (input == NULL) { 7018: if (ioclose != NULL) 7019: ioclose(ioctx); 7020: return (NULL); 7021: } 7022: stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 7023: if (stream == NULL) { 7024: xmlFreeParserInputBuffer(input); 7025: return (NULL); 7026: } 7027: inputPush(ctxt, stream); 7028: return (htmlDoRead(ctxt, URL, encoding, options, 1)); 7029: } 7030: 7031: #define bottom_HTMLparser 7032: #include "elfgcchack.h" 7033: #endif /* LIBXML_HTML_ENABLED */