embedaddon/libxml2/HTMLparser.c - view

File: [ELWIX - Embedded LightWeight unIX -] / embedaddon / libxml2 / HTMLparser.c
Revision 1.1.1.3 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Sun Jun 15 19:53:28 2014 UTC (10 years ago) by misho
Branches: libxml2, MAIN
CVS tags: v2_9_1p0, v2_9_1, HEAD

libxml2 2.9.1

1: /* 2: * HTMLparser.c : an HTML 4.0 non-verifying parser 3: * 4: * See Copyright for the status of this software. 5: * 6: * daniel@veillard.com 7: */ 8: 9: #define IN_LIBXML 10: #include "libxml.h" 11: #ifdef LIBXML_HTML_ENABLED 12: 13: #include <string.h> 14: #ifdef HAVE_CTYPE_H 15: #include <ctype.h> 16: #endif 17: #ifdef HAVE_STDLIB_H 18: #include <stdlib.h> 19: #endif 20: #ifdef HAVE_SYS_STAT_H 21: #include <sys/stat.h> 22: #endif 23: #ifdef HAVE_FCNTL_H 24: #include <fcntl.h> 25: #endif 26: #ifdef HAVE_UNISTD_H 27: #include <unistd.h> 28: #endif 29: #ifdef HAVE_ZLIB_H 30: #include <zlib.h> 31: #endif 32: 33: #include <libxml/xmlmemory.h> 34: #include <libxml/tree.h> 35: #include <libxml/parser.h> 36: #include <libxml/parserInternals.h> 37: #include <libxml/xmlerror.h> 38: #include <libxml/HTMLparser.h> 39: #include <libxml/HTMLtree.h> 40: #include <libxml/entities.h> 41: #include <libxml/encoding.h> 42: #include <libxml/valid.h> 43: #include <libxml/xmlIO.h> 44: #include <libxml/globals.h> 45: #include <libxml/uri.h> 46: 47: #include "buf.h" 48: #include "enc.h" 49: 50: #define HTML_MAX_NAMELEN 1000 51: #define HTML_PARSER_BIG_BUFFER_SIZE 1000 52: #define HTML_PARSER_BUFFER_SIZE 100 53: 54: /* #define DEBUG */ 55: /* #define DEBUG_PUSH */ 56: 57: static int htmlOmittedDefaultValue = 1; 58: 59: xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len, 60: xmlChar end, xmlChar end2, xmlChar end3); 61: static void htmlParseComment(htmlParserCtxtPtr ctxt); 62: 63: /************************************************************************ 64: * * 65: * Some factorized error routines * 66: * * 67: ************************************************************************/ 68: 69: /** 70: * htmlErrMemory: 71: * @ctxt: an HTML parser context 72: * @extra: extra informations 73: * 74: * Handle a redefinition of attribute error 75: */ 76: static void 77: htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra) 78: { 79: if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 
80: (ctxt->instate == XML_PARSER_EOF)) 81: return; 82: if (ctxt != NULL) { 83: ctxt->errNo = XML_ERR_NO_MEMORY; 84: ctxt->instate = XML_PARSER_EOF; 85: ctxt->disableSAX = 1; 86: } 87: if (extra) 88: __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, 89: XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, 90: NULL, NULL, 0, 0, 91: "Memory allocation failed : %s\n", extra); 92: else 93: __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, 94: XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL, 95: NULL, NULL, 0, 0, "Memory allocation failed\n"); 96: } 97: 98: /** 99: * htmlParseErr: 100: * @ctxt: an HTML parser context 101: * @error: the error number 102: * @msg: the error message 103: * @str1: string infor 104: * @str2: string infor 105: * 106: * Handle a fatal parser error, i.e. violating Well-Formedness constraints 107: */ 108: static void 109: htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error, 110: const char *msg, const xmlChar *str1, const xmlChar *str2) 111: { 112: if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 113: (ctxt->instate == XML_PARSER_EOF)) 114: return; 115: if (ctxt != NULL) 116: ctxt->errNo = error; 117: __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, 118: XML_ERR_ERROR, NULL, 0, 119: (const char *) str1, (const char *) str2, 120: NULL, 0, 0, 121: msg, str1, str2); 122: if (ctxt != NULL) 123: ctxt->wellFormed = 0; 124: } 125: 126: /** 127: * htmlParseErrInt: 128: * @ctxt: an HTML parser context 129: * @error: the error number 130: * @msg: the error message 131: * @val: integer info 132: * 133: * Handle a fatal parser error, i.e. 
violating Well-Formedness constraints 134: */ 135: static void 136: htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error, 137: const char *msg, int val) 138: { 139: if ((ctxt != NULL) && (ctxt->disableSAX != 0) && 140: (ctxt->instate == XML_PARSER_EOF)) 141: return; 142: if (ctxt != NULL) 143: ctxt->errNo = error; 144: __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, 145: XML_ERR_ERROR, NULL, 0, NULL, NULL, 146: NULL, val, 0, msg, val); 147: if (ctxt != NULL) 148: ctxt->wellFormed = 0; 149: } 150: 151: /************************************************************************ 152: * * 153: * Parser stacks related functions and macros * 154: * * 155: ************************************************************************/ 156: 157: /** 158: * htmlnamePush: 159: * @ctxt: an HTML parser context 160: * @value: the element name 161: * 162: * Pushes a new element name on top of the name stack 163: * 164: * Returns 0 in case of error, the index in the stack otherwise 165: */ 166: static int 167: htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value) 168: { 169: if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head"))) 170: ctxt->html = 3; 171: if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body"))) 172: ctxt->html = 10; 173: if (ctxt->nameNr >= ctxt->nameMax) { 174: ctxt->nameMax *= 2; 175: ctxt->nameTab = (const xmlChar * *) 176: xmlRealloc((xmlChar * *)ctxt->nameTab, 177: ctxt->nameMax * 178: sizeof(ctxt->nameTab[0])); 179: if (ctxt->nameTab == NULL) { 180: htmlErrMemory(ctxt, NULL); 181: return (0); 182: } 183: } 184: ctxt->nameTab[ctxt->nameNr] = value; 185: ctxt->name = value; 186: return (ctxt->nameNr++); 187: } 188: /** 189: * htmlnamePop: 190: * @ctxt: an HTML parser context 191: * 192: * Pops the top element name from the name stack 193: * 194: * Returns the name just removed 195: */ 196: static const xmlChar * 197: htmlnamePop(htmlParserCtxtPtr ctxt) 198: { 199: const xmlChar *ret; 200: 201: if (ctxt->nameNr <= 0) 
202: return (NULL); 203: ctxt->nameNr--; 204: if (ctxt->nameNr < 0) 205: return (NULL); 206: if (ctxt->nameNr > 0) 207: ctxt->name = ctxt->nameTab[ctxt->nameNr - 1]; 208: else 209: ctxt->name = NULL; 210: ret = ctxt->nameTab[ctxt->nameNr]; 211: ctxt->nameTab[ctxt->nameNr] = NULL; 212: return (ret); 213: } 214: 215: /** 216: * htmlNodeInfoPush: 217: * @ctxt: an HTML parser context 218: * @value: the node info 219: * 220: * Pushes a new element name on top of the node info stack 221: * 222: * Returns 0 in case of error, the index in the stack otherwise 223: */ 224: static int 225: htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value) 226: { 227: if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) { 228: if (ctxt->nodeInfoMax == 0) 229: ctxt->nodeInfoMax = 5; 230: ctxt->nodeInfoMax *= 2; 231: ctxt->nodeInfoTab = (htmlParserNodeInfo *) 232: xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab, 233: ctxt->nodeInfoMax * 234: sizeof(ctxt->nodeInfoTab[0])); 235: if (ctxt->nodeInfoTab == NULL) { 236: htmlErrMemory(ctxt, NULL); 237: return (0); 238: } 239: } 240: ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value; 241: ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr]; 242: return (ctxt->nodeInfoNr++); 243: } 244: 245: /** 246: * htmlNodeInfoPop: 247: * @ctxt: an HTML parser context 248: * 249: * Pops the top element name from the node info stack 250: * 251: * Returns 0 in case of error, the pointer to NodeInfo otherwise 252: */ 253: static htmlParserNodeInfo * 254: htmlNodeInfoPop(htmlParserCtxtPtr ctxt) 255: { 256: if (ctxt->nodeInfoNr <= 0) 257: return (NULL); 258: ctxt->nodeInfoNr--; 259: if (ctxt->nodeInfoNr < 0) 260: return (NULL); 261: if (ctxt->nodeInfoNr > 0) 262: ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1]; 263: else 264: ctxt->nodeInfo = NULL; 265: return &ctxt->nodeInfoTab[ctxt->nodeInfoNr]; 266: } 267: 268: /* 269: * Macros for accessing the content. Those should be used only by the parser, 270: * and not exported. 271: * 272: * Dirty macros, i.e. 
one need to make assumption on the context to use them 273: * 274: * CUR_PTR return the current pointer to the xmlChar to be parsed. 275: * CUR returns the current xmlChar value, i.e. a 8 bit value if compiled 276: * in ISO-Latin or UTF-8, and the current 16 bit value if compiled 277: * in UNICODE mode. This should be used internally by the parser 278: * only to compare to ASCII values otherwise it would break when 279: * running with UTF-8 encoding. 280: * NXT(n) returns the n'th next xmlChar. Same as CUR is should be used only 281: * to compare on ASCII based substring. 282: * UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR 283: * it should be used only to compare on ASCII based substring. 284: * SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined 285: * strings without newlines within the parser. 286: * 287: * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding 288: * 289: * CURRENT Returns the current char value, with the full decoding of 290: * UTF-8 if we are using this mode. It returns an int. 291: * NEXT Skip to the next character, this does the proper decoding 292: * in UTF-8 mode. It also pop-up unfinished entities on the fly. 293: * NEXTL(l) Skip the current unicode character of l xmlChars long. 
294: * COPY(to) copy one char to *to, increment CUR_PTR and to accordingly 295: */ 296: 297: #define UPPER (toupper(*ctxt->input->cur)) 298: 299: #define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val) 300: 301: #define NXT(val) ctxt->input->cur[(val)] 302: 303: #define UPP(val) (toupper(ctxt->input->cur[(val)])) 304: 305: #define CUR_PTR ctxt->input->cur 306: 307: #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \ 308: (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \ 309: xmlParserInputShrink(ctxt->input) 310: 311: #define GROW if ((ctxt->progressive == 0) && \ 312: (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \ 313: xmlParserInputGrow(ctxt->input, INPUT_CHUNK) 314: 315: #define CURRENT ((int) (*ctxt->input->cur)) 316: 317: #define SKIP_BLANKS htmlSkipBlankChars(ctxt) 318: 319: /* Inported from XML */ 320: 321: /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */ 322: #define CUR ((int) (*ctxt->input->cur)) 323: #define NEXT xmlNextChar(ctxt) 324: 325: #define RAW (ctxt->token ? 
-1 : (*ctxt->input->cur)) 326: 327: 328: #define NEXTL(l) do { \ 329: if (*(ctxt->input->cur) == '\n') { \ 330: ctxt->input->line++; ctxt->input->col = 1; \ 331: } else ctxt->input->col++; \ 332: ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \ 333: } while (0) 334: 335: /************ 336: \ 337: if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \ 338: if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt); 339: ************/ 340: 341: #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l) 342: #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l) 343: 344: #define COPY_BUF(l,b,i,v) \ 345: if (l == 1) b[i++] = (xmlChar) v; \ 346: else i += xmlCopyChar(l,&b[i],v) 347: 348: /** 349: * htmlFindEncoding: 350: * @the HTML parser context 351: * 352: * Ty to find and encoding in the current data available in the input 353: * buffer this is needed to try to switch to the proper encoding when 354: * one face a character error. 355: * That's an heuristic, since it's operating outside of parsing it could 356: * try to use a meta which had been commented out, that's the reason it 357: * should only be used in case of error, not as a default. 
358: * 359: * Returns an encoding string or NULL if not found, the string need to 360: * be freed 361: */ 362: static xmlChar * 363: htmlFindEncoding(xmlParserCtxtPtr ctxt) { 364: const xmlChar *start, *cur, *end; 365: 366: if ((ctxt == NULL) || (ctxt->input == NULL) || 367: (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) || 368: (ctxt->input->buf->encoder != NULL)) 369: return(NULL); 370: if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL)) 371: return(NULL); 372: 373: start = ctxt->input->cur; 374: end = ctxt->input->end; 375: /* we also expect the input buffer to be zero terminated */ 376: if (*end != 0) 377: return(NULL); 378: 379: cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV"); 380: if (cur == NULL) 381: return(NULL); 382: cur = xmlStrcasestr(cur, BAD_CAST "CONTENT"); 383: if (cur == NULL) 384: return(NULL); 385: cur = xmlStrcasestr(cur, BAD_CAST "CHARSET="); 386: if (cur == NULL) 387: return(NULL); 388: cur += 8; 389: start = cur; 390: while (((*cur >= 'A') && (*cur <= 'Z')) || 391: ((*cur >= 'a') && (*cur <= 'z')) || 392: ((*cur >= '0') && (*cur <= '9')) || 393: (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/')) 394: cur++; 395: if (cur == start) 396: return(NULL); 397: return(xmlStrndup(start, cur - start)); 398: } 399: 400: /** 401: * htmlCurrentChar: 402: * @ctxt: the HTML parser context 403: * @len: pointer to the length of the char read 404: * 405: * The current char value, if using UTF-8 this may actually span multiple 406: * bytes in the input buffer. Implement the end of line normalization: 407: * 2.11 End-of-Line Handling 408: * If the encoding is unspecified, in the case we find an ISO-Latin-1 409: * char, then the encoding converter is plugged in automatically. 
410: * 411: * Returns the current char value and its length 412: */ 413: 414: static int 415: htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { 416: if (ctxt->instate == XML_PARSER_EOF) 417: return(0); 418: 419: if (ctxt->token != 0) { 420: *len = 0; 421: return(ctxt->token); 422: } 423: if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { 424: /* 425: * We are supposed to handle UTF8, check it's valid 426: * From rfc2044: encoding of the Unicode values on UTF-8: 427: * 428: * UCS-4 range (hex.) UTF-8 octet sequence (binary) 429: * 0000 0000-0000 007F 0xxxxxxx 430: * 0000 0080-0000 07FF 110xxxxx 10xxxxxx 431: * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx 432: * 433: * Check for the 0x110000 limit too 434: */ 435: const unsigned char *cur = ctxt->input->cur; 436: unsigned char c; 437: unsigned int val; 438: 439: c = *cur; 440: if (c & 0x80) { 441: if (cur[1] == 0) { 442: xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 443: cur = ctxt->input->cur; 444: } 445: if ((cur[1] & 0xc0) != 0x80) 446: goto encoding_error; 447: if ((c & 0xe0) == 0xe0) { 448: 449: if (cur[2] == 0) { 450: xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 451: cur = ctxt->input->cur; 452: } 453: if ((cur[2] & 0xc0) != 0x80) 454: goto encoding_error; 455: if ((c & 0xf0) == 0xf0) { 456: if (cur[3] == 0) { 457: xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 458: cur = ctxt->input->cur; 459: } 460: if (((c & 0xf8) != 0xf0) || 461: ((cur[3] & 0xc0) != 0x80)) 462: goto encoding_error; 463: /* 4-byte code */ 464: *len = 4; 465: val = (cur[0] & 0x7) << 18; 466: val |= (cur[1] & 0x3f) << 12; 467: val |= (cur[2] & 0x3f) << 6; 468: val |= cur[3] & 0x3f; 469: } else { 470: /* 3-byte code */ 471: *len = 3; 472: val = (cur[0] & 0xf) << 12; 473: val |= (cur[1] & 0x3f) << 6; 474: val |= cur[2] & 0x3f; 475: } 476: } else { 477: /* 2-byte code */ 478: *len = 2; 479: val = (cur[0] & 0x1f) << 6; 480: val |= cur[1] & 0x3f; 481: } 482: if (!IS_CHAR(val)) { 483: htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 484: "Char 0x%X out of 
allowed range\n", val); 485: } 486: return(val); 487: } else { 488: if ((*ctxt->input->cur == 0) && 489: (ctxt->input->cur < ctxt->input->end)) { 490: htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 491: "Char 0x%X out of allowed range\n", 0); 492: *len = 1; 493: return(' '); 494: } 495: /* 1-byte code */ 496: *len = 1; 497: return((int) *ctxt->input->cur); 498: } 499: } 500: /* 501: * Assume it's a fixed length encoding (1) with 502: * a compatible encoding for the ASCII set, since 503: * XML constructs only use < 128 chars 504: */ 505: *len = 1; 506: if ((int) *ctxt->input->cur < 0x80) 507: return((int) *ctxt->input->cur); 508: 509: /* 510: * Humm this is bad, do an automatic flow conversion 511: */ 512: { 513: xmlChar * guess; 514: xmlCharEncodingHandlerPtr handler; 515: 516: guess = htmlFindEncoding(ctxt); 517: if (guess == NULL) { 518: xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); 519: } else { 520: if (ctxt->input->encoding != NULL) 521: xmlFree((xmlChar *) ctxt->input->encoding); 522: ctxt->input->encoding = guess; 523: handler = xmlFindCharEncodingHandler((const char *) guess); 524: if (handler != NULL) { 525: xmlSwitchToEncoding(ctxt, handler); 526: } else { 527: htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 528: "Unsupported encoding %s", guess, NULL); 529: } 530: } 531: ctxt->charset = XML_CHAR_ENCODING_UTF8; 532: } 533: 534: return(xmlCurrentChar(ctxt, len)); 535: 536: encoding_error: 537: /* 538: * If we detect an UTF8 error that probably mean that the 539: * input encoding didn't get properly advertized in the 540: * declaration header. Report the error and switch the encoding 541: * to ISO-Latin-1 (if you don't like this policy, just declare the 542: * encoding !) 
543: */ 544: { 545: char buffer[150]; 546: 547: if (ctxt->input->end - ctxt->input->cur >= 4) { 548: snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", 549: ctxt->input->cur[0], ctxt->input->cur[1], 550: ctxt->input->cur[2], ctxt->input->cur[3]); 551: } else { 552: snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]); 553: } 554: htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 555: "Input is not proper UTF-8, indicate encoding !\n", 556: BAD_CAST buffer, NULL); 557: } 558: 559: ctxt->charset = XML_CHAR_ENCODING_8859_1; 560: *len = 1; 561: return((int) *ctxt->input->cur); 562: } 563: 564: /** 565: * htmlSkipBlankChars: 566: * @ctxt: the HTML parser context 567: * 568: * skip all blanks character found at that point in the input streams. 569: * 570: * Returns the number of space chars skipped 571: */ 572: 573: static int 574: htmlSkipBlankChars(xmlParserCtxtPtr ctxt) { 575: int res = 0; 576: 577: while (IS_BLANK_CH(*(ctxt->input->cur))) { 578: if ((*ctxt->input->cur == 0) && 579: (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) { 580: xmlPopInput(ctxt); 581: } else { 582: if (*(ctxt->input->cur) == '\n') { 583: ctxt->input->line++; ctxt->input->col = 1; 584: } else ctxt->input->col++; 585: ctxt->input->cur++; 586: ctxt->nbChars++; 587: if (*ctxt->input->cur == 0) 588: xmlParserInputGrow(ctxt->input, INPUT_CHUNK); 589: } 590: res++; 591: } 592: return(res); 593: } 594: 595: 596: 597: /************************************************************************ 598: * * 599: * The list of HTML elements and their properties * 600: * * 601: ************************************************************************/ 602: 603: /* 604: * Start Tag: 1 means the start tag can be ommited 605: * End Tag: 1 means the end tag can be ommited 606: * 2 means it's forbidden (empty elements) 607: * 3 means the tag is stylistic and should be closed easily 608: * Depr: this element is deprecated 609: * DTD: 1 means that this element is valid only in the Loose DTD 610: * 2 
means that this element is valid only in the Frameset DTD 611: * 612: * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description 613: , subElements , impliedsubelt , Attributes, userdata 614: */ 615: 616: /* Definitions and a couple of vars for HTML Elements */ 617: 618: #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small" 619: #define NB_FONTSTYLE 8 620: #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym" 621: #define NB_PHRASE 10 622: #define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe" 623: #define NB_SPECIAL 16 624: #define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL 625: #define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL 626: #define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address" 627: #define NB_BLOCK NB_HEADING + NB_LIST + 14 628: #define FORMCTRL "input", "select", "textarea", "label", "button" 629: #define NB_FORMCTRL 5 630: #define PCDATA 631: #define NB_PCDATA 0 632: #define HEADING "h1", "h2", "h3", "h4", "h5", "h6" 633: #define NB_HEADING 6 634: #define LIST "ul", "ol", "dir", "menu" 635: #define NB_LIST 4 636: #define MODIFIER 637: #define NB_MODIFIER 0 638: #define FLOW BLOCK,INLINE 639: #define NB_FLOW NB_BLOCK + NB_INLINE 640: #define EMPTY NULL 641: 642: 643: static const char* const html_flow[] = { FLOW, NULL } ; 644: static const char* const html_inline[] = { INLINE, NULL } ; 645: 646: /* placeholders: elts with content but no subelements */ 647: static const char* const html_pcdata[] = { NULL } ; 648: #define html_cdata html_pcdata 649: 650: 651: /* ... 
and for HTML Attributes */ 652: 653: #define COREATTRS "id", "class", "style", "title" 654: #define NB_COREATTRS 4 655: #define I18N "lang", "dir" 656: #define NB_I18N 2 657: #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup" 658: #define NB_EVENTS 9 659: #define ATTRS COREATTRS,I18N,EVENTS 660: #define NB_ATTRS NB_NB_COREATTRS + NB_I18N + NB_EVENTS 661: #define CELLHALIGN "align", "char", "charoff" 662: #define NB_CELLHALIGN 3 663: #define CELLVALIGN "valign" 664: #define NB_CELLVALIGN 1 665: 666: static const char* const html_attrs[] = { ATTRS, NULL } ; 667: static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ; 668: static const char* const core_attrs[] = { COREATTRS, NULL } ; 669: static const char* const i18n_attrs[] = { I18N, NULL } ; 670: 671: 672: /* Other declarations that should go inline ... */ 673: static const char* const a_attrs[] = { ATTRS, "charset", "type", "name", 674: "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords", 675: "tabindex", "onfocus", "onblur", NULL } ; 676: static const char* const target_attr[] = { "target", NULL } ; 677: static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ; 678: static const char* const alt_attr[] = { "alt", NULL } ; 679: static const char* const src_alt_attrs[] = { "src", "alt", NULL } ; 680: static const char* const href_attrs[] = { "href", NULL } ; 681: static const char* const clear_attrs[] = { "clear", NULL } ; 682: static const char* const inline_p[] = { INLINE, "p", NULL } ; 683: 684: static const char* const flow_param[] = { FLOW, "param", NULL } ; 685: static const char* const applet_attrs[] = { COREATTRS , "codebase", 686: "archive", "alt", "name", "height", "width", "align", 687: "hspace", "vspace", NULL } ; 688: static const char* const area_attrs[] = { "shape", "coords", "href", "nohref", 689: "tabindex", "accesskey", "onfocus", "onblur", NULL } ; 690: static const char* 
const basefont_attrs[] = 691: { "id", "size", "color", "face", NULL } ; 692: static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ; 693: static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ; 694: static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ; 695: static const char* const body_depr[] = { "background", "bgcolor", "text", 696: "link", "vlink", "alink", NULL } ; 697: static const char* const button_attrs[] = { ATTRS, "name", "value", "type", 698: "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ; 699: 700: 701: static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ; 702: static const char* const col_elt[] = { "col", NULL } ; 703: static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ; 704: static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ; 705: static const char* const dl_contents[] = { "dt", "dd", NULL } ; 706: static const char* const compact_attr[] = { "compact", NULL } ; 707: static const char* const label_attr[] = { "label", NULL } ; 708: static const char* const fieldset_contents[] = { FLOW, "legend" } ; 709: static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ; 710: static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ; 711: static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ; 712: static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ; 713: static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ; 714: static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } 
; 715: static const char* const head_attrs[] = { I18N, "profile", NULL } ; 716: static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ; 717: static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ; 718: static const char* const version_attr[] = { "version", NULL } ; 719: static const char* const html_content[] = { "head", "body", "frameset", NULL } ; 720: static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ; 721: static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ; 722: static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ; 723: static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ; 724: static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ; 725: static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ; 726: static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ; 727: static const char* const align_attr[] = { "align", NULL } ; 728: static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ; 729: static const char* const map_contents[] = { BLOCK, "area", NULL } ; 730: static const char* const name_attr[] = { "name", NULL } ; 731: static const char* const action_attr[] = { "action", NULL } ; 732: static const char* const blockli_elt[] = { BLOCK, "li", NULL } ; 733: 
/*
 * NULL-terminated name lists (attributes and allowed sub-elements) that the
 * rows of html40ElementTable below point to.
 */
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
static const char* const content_attr[] = { "content", NULL } ;
static const char* const type_attr[] = { "type", NULL } ;
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* const object_contents[] = { FLOW, "param", NULL } ;
static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* const option_elt[] = { "option", NULL } ;
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* const width_attr[] = { "width", NULL } ;
static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
static const char* const language_attr[] = { "language", NULL } ;
static const char* const select_content[] = { "optgroup", "option", NULL } ;
static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
static const char* const tr_elt[] = { "tr", NULL } ;
static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
static const char* const tr_contents[] = { "th", "td", NULL } ;
static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
static const char* const li_elt[] = { "li", NULL } ;
static const char* const ul_depr[] = { "type", "compact", NULL} ;
static const char* const dir_attr[] = { "dir", NULL} ;

/* Cast applied to the lists above when they are stored in the table rows. */
#define DECL (const char**)

/*
 * One row per known element; htmlTagLookup() scans this table comparing the
 * first column (the element name) case-insensitively.
 * NOTE(review): the meaning of the numeric columns and of the five trailing
 * pointers is fixed by the htmlElemDesc declaration, which is not visible in
 * this part of the file - confirm against libxml/HTMLparser.h before
 * editing a row.
 */
static const htmlElemDesc
html40ElementTable[] = {
{ "a",          0, 0, 0, 0, 0, 0, 1, "anchor ",
        DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
},
{ "abbr",       0, 0, 0, 0, 0, 0, 1, "abbreviated form",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "acronym",    0, 0, 0, 0, 0, 0, 1, "",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "address",    0, 0, 0, 0, 0, 0, 0, "information on author ",
        DECL inline_p , NULL , DECL html_attrs, NULL, NULL
},
{ "applet",     0, 0, 0, 0, 1, 1, 2, "java applet ",
        DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
},
{ "area",       0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
        EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
},
{ "b",          0, 3, 0, 0, 0, 0, 1, "bold text style",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "base",       0, 2, 2, 1, 0, 0, 0, "document base uri ",
        EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
},
{ "basefont",   0, 2, 2, 1, 1, 1, 1, "base font size " ,
        EMPTY , NULL , NULL, DECL basefont_attrs, NULL
},
{ "bdo",        0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
        DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
},
{ "big",        0, 3, 0, 0, 0, 0, 1, "large text style",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
        DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
},
{ "body",       1, 1, 0, 0, 0, 0, 0, "document body ",
        DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
},
{ "br",         0, 2, 2, 1, 0, 0, 1, "forced line break ",
        EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
},
{ "button",     0, 0, 0, 0, 0, 0, 2, "push button ",
        DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
},
{ "caption",    0, 0, 0, 0, 0, 0, 0, "table caption ",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "center",     0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
        DECL html_flow , NULL , NULL, DECL html_attrs, NULL
},
{ "cite",       0, 0, 0, 0, 0, 0, 1, "citation",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "code",       0, 0, 0, 0, 0, 0, 1, "computer code fragment",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "col",        0, 2, 2, 1, 0, 0, 0, "table column ",
        EMPTY , NULL , DECL col_attrs , NULL, NULL
},
{ "colgroup",   0, 1, 0, 0, 0, 0, 0, "table column group ",
        DECL col_elt , "col" , DECL col_attrs , NULL, NULL
},
{ "dd",         0, 1, 0, 0, 0, 0, 0, "definition description ",
        DECL html_flow , NULL , DECL html_attrs, NULL, NULL
},
{ "del",        0, 0, 0, 0, 0, 0, 2, "deleted text ",
        DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
},
{ "dfn",        0, 0, 0, 0, 0, 0, 1, "instance definition",
        DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "dir",        0, 0, 0, 0, 1, 1, 0, "directory list",
        DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
},
{ "div",        0, 0, 0, 0, 0, 0, 0, "generic language/style container",
        DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "dl",         0, 0, 0, 0, 0, 0, 0, "definition list ",
        DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
},
{ "dt",         0, 1, 0, 0, 0, 0, 0, "definition term ",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "em",         0, 3, 0, 0, 0, 0, 1, "emphasis",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "embed",      0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
        EMPTY, NULL, DECL embed_attrs, NULL, NULL
},
{ "fieldset",   0, 0, 0, 0, 0, 0, 0, "form control group ",
        DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
},
{ "font",       0, 3, 0, 0, 1, 1, 1, "local change to font ",
        DECL html_inline, NULL, NULL, DECL font_attrs, NULL
},
{ "form",       0, 0, 0, 0, 0, 0, 0, "interactive form ",
        DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
},
{ "frame",      0, 2, 2, 1, 0, 2, 0, "subwindow " ,
        EMPTY, NULL, NULL, DECL frame_attrs, NULL
},
{ "frameset",   0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
        DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
},
{ "h1",         0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h2",         0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h3",         0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h4",         0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h5",         0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h6",         0, 0, 0, 0, 0, 0, 0, "heading ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "head",       1, 1, 0, 0, 0, 0, 0, "document head ",
        DECL head_contents, NULL, DECL head_attrs, NULL, NULL
},
{ "hr",         0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
        EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
},
{ "html",       1, 1, 0, 0, 0, 0, 0, "document root element ",
        DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
},
{ "i",          0, 3, 0, 0, 0, 0, 1, "italic text style",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "iframe",     0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
        DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
},
{ "img",        0, 2, 2, 1, 0, 0, 1, "embedded image ",
        EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
},
{ "input",      0, 2, 2, 1, 0, 0, 1, "form control ",
        EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
},
{ "ins",        0, 0, 0, 0, 0, 0, 2, "inserted text",
        DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
},
{ "isindex",    0, 2, 2, 1, 1, 1, 0, "single line prompt ",
        EMPTY, NULL, NULL, DECL prompt_attrs, NULL
},
{ "kbd",        0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "label",      0, 0, 0, 0, 0, 0, 1, "form field label text ",
        DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
},
{ "legend",     0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
        DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
},
{ "li",         0, 1, 1, 0, 0, 0, 0, "list item ",
        DECL html_flow, NULL, DECL html_attrs, NULL, NULL
},
{ "link",       0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
        EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
},
{ "map",        0, 0, 0, 0, 0, 0, 2, "client-side image map ",
        DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
},
{ "menu",       0, 0, 0, 0, 1, 1, 0, "menu list ",
        DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
},
{ "meta",       0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
        EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
},
{ "noframes",   0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
        DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
},
{ "noscript",   0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
        DECL html_flow, "div", DECL html_attrs, NULL, NULL
},
{ "object",     0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
        DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
},
{ "ol",         0, 0, 0, 0, 0, 0, 0, "ordered list ",
        DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
},
{ "optgroup",   0, 0, 0, 0, 0, 0, 0, "option group ",
        DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
},
{ "option",     0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
        DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
},
{ "p",          0, 1, 0, 0, 0, 0, 0, "paragraph ",
        DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "param",      0, 2, 2, 1, 0, 0, 0, "named property value ",
        EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
},
{ "pre",        0, 0, 0, 0, 0, 0, 0, "preformatted text ",
        DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
},
{ "q",          0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
        DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
},
{ "s",          0, 3, 0, 0, 1, 1, 1, "strike-through text style",
        DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "samp",       0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "script",     0, 0, 0, 0, 0, 0, 2, "script statements ",
        DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
},
{ "select",     0, 0, 0, 0, 0, 0, 1, "option selector ",
        DECL select_content, NULL, DECL select_attrs, NULL, NULL
},
{ "small",      0, 3, 0, 0, 0, 0, 1, "small text style",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "span",       0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "strike",     0, 3, 0, 0, 1, 1, 1, "strike-through text",
        DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "strong",     0, 3, 0, 0, 0, 0, 1, "strong emphasis",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "style",      0, 0, 0, 0, 0, 0, 0, "style info ",
        DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
},
{ "sub",        0, 3, 0, 0, 0, 0, 1, "subscript",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "sup",        0, 3, 0, 0, 0, 0, 1, "superscript ",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "table",      0, 0, 0, 0, 0, 0, 0, "",
        DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
},
{ "tbody",      1, 0, 0, 0, 0, 0, 0, "table body ",
        DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "td",         0, 0, 0, 0, 0, 0, 0, "table data cell",
        DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
},
{ "textarea",   0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
        DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
},
{ "tfoot",      0, 1, 0, 0, 0, 0, 0, "table footer ",
        DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "th",         0, 1, 0, 0, 0, 0, 0, "table header cell",
        DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
},
{ "thead",      0, 1, 0, 0, 0, 0, 0, "table header ",
        DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "title",      0, 0, 0, 0, 0, 0, 0, "document title ",
        DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
},
{ "tr",         0, 0, 0, 0, 0, 0, 0, "table row ",
        DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
},
{ "tt",         0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "u",          0, 3, 0, 0, 1, 1, 1, "underlined text style",
        DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "ul",         0, 0, 0, 0, 0, 0, 0, "unordered list ",
        DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
},
{ "var",        0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
        DECL html_inline, NULL, DECL html_attrs, NULL, NULL
}
};

/*
 * start tags that imply the end of current element
 *
 * Layout: a sequence of NULL-terminated groups. The first name of a group
 * is the start tag being opened; the names after it are the open elements
 * that this start tag closes (see htmlCheckAutoClose()). htmlInitAutoClose()
 * records the address of the first name of every group.
 */
static const char * const htmlStartClose[] = {
"form",         "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                "dl", "ul", "ol", "menu", "dir", "address", "pre",
                "listing", "xmp", "head", NULL,
"head",         "p", NULL,
"title",        "p", NULL,
"body",         "head", "style", "link", "title", "p", NULL,
"frameset",     "head", "style", "link", "title", "p", NULL,
"li",           "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                "pre", "listing", "xmp", "head", "li", NULL,
"hr",           "p", "head", NULL,
"h1",           "p", "head", NULL,
"h2",           "p", "head", NULL,
"h3",           "p", "head", NULL,
"h4",           "p", "head", NULL,
"h5",           "p", "head", NULL,
"h6",           "p", "head", NULL,
"dir",          "p", "head", NULL,
"address",      "p", "head", "ul", NULL,
"pre",          "p", "head", "ul", NULL,
"listing",      "p", "head", NULL,
"xmp",          "p", "head",
NULL, 1073: "blockquote", "p", "head", NULL, 1074: "dl", "p", "dt", "menu", "dir", "address", "pre", "listing", 1075: "xmp", "head", NULL, 1076: "dt", "p", "menu", "dir", "address", "pre", "listing", "xmp", 1077: "head", "dd", NULL, 1078: "dd", "p", "menu", "dir", "address", "pre", "listing", "xmp", 1079: "head", "dt", NULL, 1080: "ul", "p", "head", "ol", "menu", "dir", "address", "pre", 1081: "listing", "xmp", NULL, 1082: "ol", "p", "head", "ul", NULL, 1083: "menu", "p", "head", "ul", NULL, 1084: "p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL, 1085: "div", "p", "head", NULL, 1086: "noscript", "p", NULL, 1087: "center", "font", "b", "i", "p", "head", NULL, 1088: "a", "a", "head", NULL, 1089: "caption", "p", NULL, 1090: "colgroup", "caption", "colgroup", "col", "p", NULL, 1091: "col", "caption", "col", "p", NULL, 1092: "table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", 1093: "listing", "xmp", "a", NULL, 1094: "th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL, 1095: "td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL, 1096: "tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL, 1097: "thead", "caption", "col", "colgroup", NULL, 1098: "tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead", 1099: "tbody", "p", NULL, 1100: "tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead", 1101: "tfoot", "tbody", "p", NULL, 1102: "optgroup", "option", NULL, 1103: "option", "option", NULL, 1104: "fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", 1105: "pre", "listing", "xmp", "a", NULL, 1106: /* most tags in in FONTSTYLE, PHRASE and SPECIAL should close <head> */ 1107: "tt", "head", NULL, 1108: "i", "head", NULL, 1109: "b", "head", NULL, 1110: "u", "head", NULL, 1111: "s", "head", NULL, 1112: "strike", "head", NULL, 1113: "big", "head", NULL, 1114: "small", "head", NULL, 1115: 1116: "em", "head", NULL, 1117: "strong", "head", NULL, 1118: "dfn", "head", NULL, 1119: "code", 
"head", NULL, 1120: "samp", "head", NULL, 1121: "kbd", "head", NULL, 1122: "var", "head", NULL, 1123: "cite", "head", NULL, 1124: "abbr", "head", NULL, 1125: "acronym", "head", NULL, 1126: 1127: /* "a" */ 1128: "img", "head", NULL, 1129: /* "applet" */ 1130: /* "embed" */ 1131: /* "object" */ 1132: "font", "head", NULL, 1133: /* "basefont" */ 1134: "br", "head", NULL, 1135: /* "script" */ 1136: "map", "head", NULL, 1137: "q", "head", NULL, 1138: "sub", "head", NULL, 1139: "sup", "head", NULL, 1140: "span", "head", NULL, 1141: "bdo", "head", NULL, 1142: "iframe", "head", NULL, 1143: NULL 1144: }; 1145: 1146: /* 1147: * The list of HTML elements which are supposed not to have 1148: * CDATA content and where a p element will be implied 1149: * 1150: * TODO: extend that list by reading the HTML SGML DTD on 1151: * implied paragraph 1152: */ 1153: static const char *const htmlNoContentElements[] = { 1154: "html", 1155: "head", 1156: NULL 1157: }; 1158: 1159: /* 1160: * The list of HTML attributes which are of content %Script; 1161: * NOTE: when adding ones, check htmlIsScriptAttribute() since 1162: * it assumes the name starts with 'on' 1163: */ 1164: static const char *const htmlScriptAttributes[] = { 1165: "onclick", 1166: "ondblclick", 1167: "onmousedown", 1168: "onmouseup", 1169: "onmouseover", 1170: "onmousemove", 1171: "onmouseout", 1172: "onkeypress", 1173: "onkeydown", 1174: "onkeyup", 1175: "onload", 1176: "onunload", 1177: "onfocus", 1178: "onblur", 1179: "onsubmit", 1180: "onrest", 1181: "onchange", 1182: "onselect" 1183: }; 1184: 1185: /* 1186: * This table is used by the htmlparser to know what to do with 1187: * broken html pages. By assigning different priorities to different 1188: * elements the parser can decide how to handle extra endtags. 1189: * Endtags are only allowed to close elements with lower or equal 1190: * priority. 
1191: */ 1192: 1193: typedef struct { 1194: const char *name; 1195: int priority; 1196: } elementPriority; 1197: 1198: static const elementPriority htmlEndPriority[] = { 1199: {"div", 150}, 1200: {"td", 160}, 1201: {"th", 160}, 1202: {"tr", 170}, 1203: {"thead", 180}, 1204: {"tbody", 180}, 1205: {"tfoot", 180}, 1206: {"table", 190}, 1207: {"head", 200}, 1208: {"body", 200}, 1209: {"html", 220}, 1210: {NULL, 100} /* Default priority */ 1211: }; 1212: 1213: static const char** htmlStartCloseIndex[100]; 1214: static int htmlStartCloseIndexinitialized = 0; 1215: 1216: /************************************************************************ 1217: * * 1218: * functions to handle HTML specific data * 1219: * * 1220: ************************************************************************/ 1221: 1222: /** 1223: * htmlInitAutoClose: 1224: * 1225: * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. 1226: * This is not reentrant. Call xmlInitParser() once before processing in 1227: * case of use in multithreaded programs. 1228: */ 1229: void 1230: htmlInitAutoClose(void) { 1231: int indx, i = 0; 1232: 1233: if (htmlStartCloseIndexinitialized) return; 1234: 1235: for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL; 1236: indx = 0; 1237: while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) { 1238: htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i]; 1239: while (htmlStartClose[i] != NULL) i++; 1240: i++; 1241: } 1242: htmlStartCloseIndexinitialized = 1; 1243: } 1244: 1245: /** 1246: * htmlTagLookup: 1247: * @tag: The tag name in lowercase 1248: * 1249: * Lookup the HTML tag in the ElementTable 1250: * 1251: * Returns the related htmlElemDescPtr or NULL if not found. 
1252: */ 1253: const htmlElemDesc * 1254: htmlTagLookup(const xmlChar *tag) { 1255: unsigned int i; 1256: 1257: for (i = 0; i < (sizeof(html40ElementTable) / 1258: sizeof(html40ElementTable[0]));i++) { 1259: if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name)) 1260: return((htmlElemDescPtr) &html40ElementTable[i]); 1261: } 1262: return(NULL); 1263: } 1264: 1265: /** 1266: * htmlGetEndPriority: 1267: * @name: The name of the element to look up the priority for. 1268: * 1269: * Return value: The "endtag" priority. 1270: **/ 1271: static int 1272: htmlGetEndPriority (const xmlChar *name) { 1273: int i = 0; 1274: 1275: while ((htmlEndPriority[i].name != NULL) && 1276: (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name))) 1277: i++; 1278: 1279: return(htmlEndPriority[i].priority); 1280: } 1281: 1282: 1283: /** 1284: * htmlCheckAutoClose: 1285: * @newtag: The new tag name 1286: * @oldtag: The old tag name 1287: * 1288: * Checks whether the new tag is one of the registered valid tags for 1289: * closing old. 1290: * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. 1291: * 1292: * Returns 0 if no, 1 if yes. 
1293: */ 1294: static int 1295: htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag) 1296: { 1297: int i, indx; 1298: const char **closed = NULL; 1299: 1300: if (htmlStartCloseIndexinitialized == 0) 1301: htmlInitAutoClose(); 1302: 1303: /* inefficient, but not a big deal */ 1304: for (indx = 0; indx < 100; indx++) { 1305: closed = htmlStartCloseIndex[indx]; 1306: if (closed == NULL) 1307: return (0); 1308: if (xmlStrEqual(BAD_CAST * closed, newtag)) 1309: break; 1310: } 1311: 1312: i = closed - htmlStartClose; 1313: i++; 1314: while (htmlStartClose[i] != NULL) { 1315: if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) { 1316: return (1); 1317: } 1318: i++; 1319: } 1320: return (0); 1321: } 1322: 1323: /** 1324: * htmlAutoCloseOnClose: 1325: * @ctxt: an HTML parser context 1326: * @newtag: The new tag name 1327: * @force: force the tag closure 1328: * 1329: * The HTML DTD allows an ending tag to implicitly close other tags. 1330: */ 1331: static void 1332: htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) 1333: { 1334: const htmlElemDesc *info; 1335: int i, priority; 1336: 1337: priority = htmlGetEndPriority(newtag); 1338: 1339: for (i = (ctxt->nameNr - 1); i >= 0; i--) { 1340: 1341: if (xmlStrEqual(newtag, ctxt->nameTab[i])) 1342: break; 1343: /* 1344: * A missplaced endtag can only close elements with lower 1345: * or equal priority, so if we find an element with higher 1346: * priority before we find an element with 1347: * matching name, we just ignore this endtag 1348: */ 1349: if (htmlGetEndPriority(ctxt->nameTab[i]) > priority) 1350: return; 1351: } 1352: if (i < 0) 1353: return; 1354: 1355: while (!xmlStrEqual(newtag, ctxt->name)) { 1356: info = htmlTagLookup(ctxt->name); 1357: if ((info != NULL) && (info->endTag == 3)) { 1358: htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 1359: "Opening and ending tag mismatch: %s and %s\n", 1360: newtag, ctxt->name); 1361: } 1362: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != 
NULL)) 1363: ctxt->sax->endElement(ctxt->userData, ctxt->name); 1364: htmlnamePop(ctxt); 1365: } 1366: } 1367: 1368: /** 1369: * htmlAutoCloseOnEnd: 1370: * @ctxt: an HTML parser context 1371: * 1372: * Close all remaining tags at the end of the stream 1373: */ 1374: static void 1375: htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) 1376: { 1377: int i; 1378: 1379: if (ctxt->nameNr == 0) 1380: return; 1381: for (i = (ctxt->nameNr - 1); i >= 0; i--) { 1382: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1383: ctxt->sax->endElement(ctxt->userData, ctxt->name); 1384: htmlnamePop(ctxt); 1385: } 1386: } 1387: 1388: /** 1389: * htmlAutoClose: 1390: * @ctxt: an HTML parser context 1391: * @newtag: The new tag name or NULL 1392: * 1393: * The HTML DTD allows a tag to implicitly close other tags. 1394: * The list is kept in htmlStartClose array. This function is 1395: * called when a new tag has been detected and generates the 1396: * appropriates closes if possible/needed. 1397: * If newtag is NULL this mean we are at the end of the resource 1398: * and we should check 1399: */ 1400: static void 1401: htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) 1402: { 1403: while ((newtag != NULL) && (ctxt->name != NULL) && 1404: (htmlCheckAutoClose(newtag, ctxt->name))) { 1405: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1406: ctxt->sax->endElement(ctxt->userData, ctxt->name); 1407: htmlnamePop(ctxt); 1408: } 1409: if (newtag == NULL) { 1410: htmlAutoCloseOnEnd(ctxt); 1411: return; 1412: } 1413: while ((newtag == NULL) && (ctxt->name != NULL) && 1414: ((xmlStrEqual(ctxt->name, BAD_CAST "head")) || 1415: (xmlStrEqual(ctxt->name, BAD_CAST "body")) || 1416: (xmlStrEqual(ctxt->name, BAD_CAST "html")))) { 1417: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 1418: ctxt->sax->endElement(ctxt->userData, ctxt->name); 1419: htmlnamePop(ctxt); 1420: } 1421: } 1422: 1423: /** 1424: * htmlAutoCloseTag: 1425: * @doc: the HTML document 1426: * 
@name: The tag name 1427: * @elem: the HTML element 1428: * 1429: * The HTML DTD allows a tag to implicitly close other tags. 1430: * The list is kept in htmlStartClose array. This function checks 1431: * if the element or one of it's children would autoclose the 1432: * given tag. 1433: * 1434: * Returns 1 if autoclose, 0 otherwise 1435: */ 1436: int 1437: htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) { 1438: htmlNodePtr child; 1439: 1440: if (elem == NULL) return(1); 1441: if (xmlStrEqual(name, elem->name)) return(0); 1442: if (htmlCheckAutoClose(elem->name, name)) return(1); 1443: child = elem->children; 1444: while (child != NULL) { 1445: if (htmlAutoCloseTag(doc, name, child)) return(1); 1446: child = child->next; 1447: } 1448: return(0); 1449: } 1450: 1451: /** 1452: * htmlIsAutoClosed: 1453: * @doc: the HTML document 1454: * @elem: the HTML element 1455: * 1456: * The HTML DTD allows a tag to implicitly close other tags. 1457: * The list is kept in htmlStartClose array. 
This function checks 1458: * if a tag is autoclosed by one of it's child 1459: * 1460: * Returns 1 if autoclosed, 0 otherwise 1461: */ 1462: int 1463: htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) { 1464: htmlNodePtr child; 1465: 1466: if (elem == NULL) return(1); 1467: child = elem->children; 1468: while (child != NULL) { 1469: if (htmlAutoCloseTag(doc, elem->name, child)) return(1); 1470: child = child->next; 1471: } 1472: return(0); 1473: } 1474: 1475: /** 1476: * htmlCheckImplied: 1477: * @ctxt: an HTML parser context 1478: * @newtag: The new tag name 1479: * 1480: * The HTML DTD allows a tag to exists only implicitly 1481: * called when a new tag has been detected and generates the 1482: * appropriates implicit tags if missing 1483: */ 1484: static void 1485: htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) { 1486: int i; 1487: 1488: if (ctxt->options & HTML_PARSE_NOIMPLIED) 1489: return; 1490: if (!htmlOmittedDefaultValue) 1491: return; 1492: if (xmlStrEqual(newtag, BAD_CAST"html")) 1493: return; 1494: if (ctxt->nameNr <= 0) { 1495: htmlnamePush(ctxt, BAD_CAST"html"); 1496: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1497: ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL); 1498: } 1499: if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head"))) 1500: return; 1501: if ((ctxt->nameNr <= 1) && 1502: ((xmlStrEqual(newtag, BAD_CAST"script")) || 1503: (xmlStrEqual(newtag, BAD_CAST"style")) || 1504: (xmlStrEqual(newtag, BAD_CAST"meta")) || 1505: (xmlStrEqual(newtag, BAD_CAST"link")) || 1506: (xmlStrEqual(newtag, BAD_CAST"title")) || 1507: (xmlStrEqual(newtag, BAD_CAST"base")))) { 1508: if (ctxt->html >= 3) { 1509: /* we already saw or generated an <head> before */ 1510: return; 1511: } 1512: /* 1513: * dropped OBJECT ... i you put it first BODY will be 1514: * assumed ! 
1515: */ 1516: htmlnamePush(ctxt, BAD_CAST"head"); 1517: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1518: ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL); 1519: } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) && 1520: (!xmlStrEqual(newtag, BAD_CAST"frame")) && 1521: (!xmlStrEqual(newtag, BAD_CAST"frameset"))) { 1522: if (ctxt->html >= 10) { 1523: /* we already saw or generated a <body> before */ 1524: return; 1525: } 1526: for (i = 0;i < ctxt->nameNr;i++) { 1527: if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) { 1528: return; 1529: } 1530: if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) { 1531: return; 1532: } 1533: } 1534: 1535: htmlnamePush(ctxt, BAD_CAST"body"); 1536: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1537: ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL); 1538: } 1539: } 1540: 1541: /** 1542: * htmlCheckParagraph 1543: * @ctxt: an HTML parser context 1544: * 1545: * Check whether a p element need to be implied before inserting 1546: * characters in the current element. 1547: * 1548: * Returns 1 if a paragraph has been inserted, 0 if not and -1 1549: * in case of error. 
1550: */ 1551: 1552: static int 1553: htmlCheckParagraph(htmlParserCtxtPtr ctxt) { 1554: const xmlChar *tag; 1555: int i; 1556: 1557: if (ctxt == NULL) 1558: return(-1); 1559: tag = ctxt->name; 1560: if (tag == NULL) { 1561: htmlAutoClose(ctxt, BAD_CAST"p"); 1562: htmlCheckImplied(ctxt, BAD_CAST"p"); 1563: htmlnamePush(ctxt, BAD_CAST"p"); 1564: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1565: ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); 1566: return(1); 1567: } 1568: if (!htmlOmittedDefaultValue) 1569: return(0); 1570: for (i = 0; htmlNoContentElements[i] != NULL; i++) { 1571: if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) { 1572: htmlAutoClose(ctxt, BAD_CAST"p"); 1573: htmlCheckImplied(ctxt, BAD_CAST"p"); 1574: htmlnamePush(ctxt, BAD_CAST"p"); 1575: if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) 1576: ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); 1577: return(1); 1578: } 1579: } 1580: return(0); 1581: } 1582: 1583: /** 1584: * htmlIsScriptAttribute: 1585: * @name: an attribute name 1586: * 1587: * Check if an attribute is of content type Script 1588: * 1589: * Returns 1 is the attribute is a script 0 otherwise 1590: */ 1591: int 1592: htmlIsScriptAttribute(const xmlChar *name) { 1593: unsigned int i; 1594: 1595: if (name == NULL) 1596: return(0); 1597: /* 1598: * all script attributes start with 'on' 1599: */ 1600: if ((name[0] != 'o') || (name[1] != 'n')) 1601: return(0); 1602: for (i = 0; 1603: i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]); 1604: i++) { 1605: if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i])) 1606: return(1); 1607: } 1608: return(0); 1609: } 1610: 1611: /************************************************************************ 1612: * * 1613: * The list of HTML predefined entities * 1614: * * 1615: ************************************************************************/ 1616: 1617: 1618: static const htmlEntityDesc html40EntitiesTable[] = { 
1619: /* 1620: * the 4 absolute ones, plus apostrophe. 1621: */ 1622: { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" }, 1623: { 38, "amp", "ampersand, U+0026 ISOnum" }, 1624: { 39, "apos", "single quote" }, 1625: { 60, "lt", "less-than sign, U+003C ISOnum" }, 1626: { 62, "gt", "greater-than sign, U+003E ISOnum" }, 1627: 1628: /* 1629: * A bunch still in the 128-255 range 1630: * Replacing them depend really on the charset used. 1631: */ 1632: { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" }, 1633: { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" }, 1634: { 162, "cent", "cent sign, U+00A2 ISOnum" }, 1635: { 163, "pound","pound sign, U+00A3 ISOnum" }, 1636: { 164, "curren","currency sign, U+00A4 ISOnum" }, 1637: { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" }, 1638: { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" }, 1639: { 167, "sect", "section sign, U+00A7 ISOnum" }, 1640: { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" }, 1641: { 169, "copy", "copyright sign, U+00A9 ISOnum" }, 1642: { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" }, 1643: { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" }, 1644: { 172, "not", "not sign, U+00AC ISOnum" }, 1645: { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" }, 1646: { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" }, 1647: { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" }, 1648: { 176, "deg", "degree sign, U+00B0 ISOnum" }, 1649: { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" }, 1650: { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" }, 1651: { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" }, 1652: { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" }, 1653: { 181, "micro","micro sign, U+00B5 ISOnum" }, 1654: { 182, 
"para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" }, 1655: { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" }, 1656: { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" }, 1657: { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" }, 1658: { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" }, 1659: { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" }, 1660: { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" }, 1661: { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" }, 1662: { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" }, 1663: { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" }, 1664: { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" }, 1665: { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" }, 1666: { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" }, 1667: { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" }, 1668: { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" }, 1669: { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" }, 1670: { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" }, 1671: { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" }, 1672: { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" }, 1673: { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" }, 1674: { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" }, 1675: { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" }, 1676: { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" }, 1677: { 205, "Iacute","latin capital letter I with acute, 
U+00CD ISOlat1" }, 1678: { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" }, 1679: { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" }, 1680: { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" }, 1681: { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" }, 1682: { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" }, 1683: { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" }, 1684: { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" }, 1685: { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" }, 1686: { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" }, 1687: { 215, "times","multiplication sign, U+00D7 ISOnum" }, 1688: { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" }, 1689: { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" }, 1690: { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" }, 1691: { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" }, 1692: { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" }, 1693: { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" }, 1694: { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" }, 1695: { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" }, 1696: { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" }, 1697: { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" }, 1698: { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" }, 1699: { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" }, 1700: { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" }, 1701: { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" }, 1702: { 230, "aelig","latin small letter ae = latin small 
ligature ae, U+00E6 ISOlat1" }, 1703: { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" }, 1704: { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" }, 1705: { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" }, 1706: { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" }, 1707: { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" }, 1708: { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" }, 1709: { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" }, 1710: { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" }, 1711: { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" }, 1712: { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" }, 1713: { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" }, 1714: { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" }, 1715: { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" }, 1716: { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" }, 1717: { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" }, 1718: { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" }, 1719: { 247, "divide","division sign, U+00F7 ISOnum" }, 1720: { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" }, 1721: { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" }, 1722: { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" }, 1723: { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" }, 1724: { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" }, 1725: { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" }, 1726: { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" }, 1727: { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" }, 1728: 1729: { 338, "OElig","latin capital ligature OE, U+0152 
ISOlat2" }, 1730: { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" }, 1731: { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" }, 1732: { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" }, 1733: { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" }, 1734: 1735: /* 1736: * Anything below should really be kept as entities references 1737: */ 1738: { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" }, 1739: 1740: { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" }, 1741: { 732, "tilde","small tilde, U+02DC ISOdia" }, 1742: 1743: { 913, "Alpha","greek capital letter alpha, U+0391" }, 1744: { 914, "Beta", "greek capital letter beta, U+0392" }, 1745: { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" }, 1746: { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" }, 1747: { 917, "Epsilon","greek capital letter epsilon, U+0395" }, 1748: { 918, "Zeta", "greek capital letter zeta, U+0396" }, 1749: { 919, "Eta", "greek capital letter eta, U+0397" }, 1750: { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" }, 1751: { 921, "Iota", "greek capital letter iota, U+0399" }, 1752: { 922, "Kappa","greek capital letter kappa, U+039A" }, 1753: { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" }, 1754: { 924, "Mu", "greek capital letter mu, U+039C" }, 1755: { 925, "Nu", "greek capital letter nu, U+039D" }, 1756: { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" }, 1757: { 927, "Omicron","greek capital letter omicron, U+039F" }, 1758: { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" }, 1759: { 929, "Rho", "greek capital letter rho, U+03A1" }, 1760: { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" }, 1761: { 932, "Tau", "greek capital letter tau, U+03A4" }, 1762: { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" }, 1763: { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" }, 1764: { 935, "Chi", "greek capital 
letter chi, U+03A7" }, 1765: { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" }, 1766: { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" }, 1767: 1768: { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" }, 1769: { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" }, 1770: { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" }, 1771: { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" }, 1772: { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" }, 1773: { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" }, 1774: { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" }, 1775: { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" }, 1776: { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" }, 1777: { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" }, 1778: { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" }, 1779: { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" }, 1780: { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" }, 1781: { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" }, 1782: { 959, "omicron","greek small letter omicron, U+03BF NEW" }, 1783: { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" }, 1784: { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" }, 1785: { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" }, 1786: { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" }, 1787: { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" }, 1788: { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" }, 1789: { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" }, 1790: { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" }, 1791: { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" }, 1792: { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" }, 1793: { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" }, 1794: { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" }, 1795: { 982, "piv", "greek pi symbol, 
U+03D6 ISOgrk3" }, 1796: 1797: { 8194, "ensp", "en space, U+2002 ISOpub" }, 1798: { 8195, "emsp", "em space, U+2003 ISOpub" }, 1799: { 8201, "thinsp","thin space, U+2009 ISOpub" }, 1800: { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" }, 1801: { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" }, 1802: { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" }, 1803: { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" }, 1804: { 8211, "ndash","en dash, U+2013 ISOpub" }, 1805: { 8212, "mdash","em dash, U+2014 ISOpub" }, 1806: { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" }, 1807: { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" }, 1808: { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" }, 1809: { 8220, "ldquo","left double quotation mark, U+201C ISOnum" }, 1810: { 8221, "rdquo","right double quotation mark, U+201D ISOnum" }, 1811: { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" }, 1812: { 8224, "dagger","dagger, U+2020 ISOpub" }, 1813: { 8225, "Dagger","double dagger, U+2021 ISOpub" }, 1814: 1815: { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" }, 1816: { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" }, 1817: 1818: { 8240, "permil","per mille sign, U+2030 ISOtech" }, 1819: 1820: { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" }, 1821: { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" }, 1822: 1823: { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" }, 1824: { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" }, 1825: 1826: { 8254, "oline","overline = spacing overscore, U+203E NEW" }, 1827: { 8260, "frasl","fraction slash, U+2044 NEW" }, 1828: 1829: { 8364, "euro", "euro sign, U+20AC NEW" }, 1830: 1831: { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" }, 1832: { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" }, 1833: { 8476, 
"real", "blackletter capital R = real part symbol, U+211C ISOamso" }, 1834: { 8482, "trade","trade mark sign, U+2122 ISOnum" }, 1835: { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" }, 1836: { 8592, "larr", "leftwards arrow, U+2190 ISOnum" }, 1837: { 8593, "uarr", "upwards arrow, U+2191 ISOnum" }, 1838: { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" }, 1839: { 8595, "darr", "downwards arrow, U+2193 ISOnum" }, 1840: { 8596, "harr", "left right arrow, U+2194 ISOamsa" }, 1841: { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" }, 1842: { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" }, 1843: { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" }, 1844: { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" }, 1845: { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" }, 1846: { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" }, 1847: 1848: { 8704, "forall","for all, U+2200 ISOtech" }, 1849: { 8706, "part", "partial differential, U+2202 ISOtech" }, 1850: { 8707, "exist","there exists, U+2203 ISOtech" }, 1851: { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" }, 1852: { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" }, 1853: { 8712, "isin", "element of, U+2208 ISOtech" }, 1854: { 8713, "notin","not an element of, U+2209 ISOtech" }, 1855: { 8715, "ni", "contains as member, U+220B ISOtech" }, 1856: { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" }, 1857: { 8721, "sum", "n-ary summation, U+2211 ISOamsb" }, 1858: { 8722, "minus","minus sign, U+2212 ISOtech" }, 1859: { 8727, "lowast","asterisk operator, U+2217 ISOtech" }, 1860: { 8730, "radic","square root = radical sign, U+221A ISOtech" }, 1861: { 8733, "prop", "proportional to, U+221D ISOtech" }, 1862: { 8734, "infin","infinity, U+221E ISOtech" }, 1863: { 8736, "ang", "angle, U+2220 ISOamso" }, 1864: { 8743, "and", "logical and = wedge, U+2227 ISOtech" }, 1865: { 8744, "or", "logical or = 
vee, U+2228 ISOtech" }, 1866: { 8745, "cap", "intersection = cap, U+2229 ISOtech" }, 1867: { 8746, "cup", "union = cup, U+222A ISOtech" }, 1868: { 8747, "int", "integral, U+222B ISOtech" }, 1869: { 8756, "there4","therefore, U+2234 ISOtech" }, 1870: { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" }, 1871: { 8773, "cong", "approximately equal to, U+2245 ISOtech" }, 1872: { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" }, 1873: { 8800, "ne", "not equal to, U+2260 ISOtech" }, 1874: { 8801, "equiv","identical to, U+2261 ISOtech" }, 1875: { 8804, "le", "less-than or equal to, U+2264 ISOtech" }, 1876: { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" }, 1877: { 8834, "sub", "subset of, U+2282 ISOtech" }, 1878: { 8835, "sup", "superset of, U+2283 ISOtech" }, 1879: { 8836, "nsub", "not a subset of, U+2284 ISOamsn" }, 1880: { 8838, "sube", "subset of or equal to, U+2286 ISOtech" }, 1881: { 8839, "supe", "superset of or equal to, U+2287 ISOtech" }, 1882: { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" }, 1883: { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" }, 1884: { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" }, 1885: { 8901, "sdot", "dot operator, U+22C5 ISOamsb" }, 1886: { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" }, 1887: { 8969, "rceil","right ceiling, U+2309 ISOamsc" }, 1888: { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" }, 1889: { 8971, "rfloor","right floor, U+230B ISOamsc" }, 1890: { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" }, 1891: { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" }, 1892: { 9674, "loz", "lozenge, U+25CA ISOpub" }, 1893: 1894: { 9824, "spades","black spade suit, U+2660 ISOpub" }, 1895: { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" }, 1896: { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" }, 1897: { 9830, "diams","black diamond suit, 
U+2666 ISOpub" }, 1898: 1899: }; 1900: 1901: /************************************************************************ 1902: * * 1903: * Commodity functions to handle entities * 1904: * * 1905: ************************************************************************/ 1906: 1907: /* 1908: * Macro used to grow the current buffer. 1909: */ 1910: #define growBuffer(buffer) { \ 1911: xmlChar *tmp; \ 1912: buffer##_size *= 2; \ 1913: tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \ 1914: if (tmp == NULL) { \ 1915: htmlErrMemory(ctxt, "growing buffer\n"); \ 1916: xmlFree(buffer); \ 1917: return(NULL); \ 1918: } \ 1919: buffer = tmp; \ 1920: } 1921: 1922: /** 1923: * htmlEntityLookup: 1924: * @name: the entity name 1925: * 1926: * Lookup the given entity in EntitiesTable 1927: * 1928: * TODO: the linear scan is really ugly, an hash table is really needed. 1929: * 1930: * Returns the associated htmlEntityDescPtr if found, NULL otherwise. 1931: */ 1932: const htmlEntityDesc * 1933: htmlEntityLookup(const xmlChar *name) { 1934: unsigned int i; 1935: 1936: for (i = 0;i < (sizeof(html40EntitiesTable)/ 1937: sizeof(html40EntitiesTable[0]));i++) { 1938: if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) { 1939: return((htmlEntityDescPtr) &html40EntitiesTable[i]); 1940: } 1941: } 1942: return(NULL); 1943: } 1944: 1945: /** 1946: * htmlEntityValueLookup: 1947: * @value: the entity's unicode value 1948: * 1949: * Lookup the given entity in EntitiesTable 1950: * 1951: * TODO: the linear scan is really ugly, an hash table is really needed. 1952: * 1953: * Returns the associated htmlEntityDescPtr if found, NULL otherwise. 
1954: */ 1955: const htmlEntityDesc * 1956: htmlEntityValueLookup(unsigned int value) { 1957: unsigned int i; 1958: 1959: for (i = 0;i < (sizeof(html40EntitiesTable)/ 1960: sizeof(html40EntitiesTable[0]));i++) { 1961: if (html40EntitiesTable[i].value >= value) { 1962: if (html40EntitiesTable[i].value > value) 1963: break; 1964: return((htmlEntityDescPtr) &html40EntitiesTable[i]); 1965: } 1966: } 1967: return(NULL); 1968: } 1969: 1970: /** 1971: * UTF8ToHtml: 1972: * @out: a pointer to an array of bytes to store the result 1973: * @outlen: the length of @out 1974: * @in: a pointer to an array of UTF-8 chars 1975: * @inlen: the length of @in 1976: * 1977: * Take a block of UTF-8 chars in and try to convert it to an ASCII 1978: * plus HTML entities block of chars out. 1979: * 1980: * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise 1981: * The value of @inlen after return is the number of octets consumed 1982: * as the return value is positive, else unpredictable. 1983: * The value of @outlen after return is the number of octets consumed. 
1984: */ 1985: int 1986: UTF8ToHtml(unsigned char* out, int *outlen, 1987: const unsigned char* in, int *inlen) { 1988: const unsigned char* processed = in; 1989: const unsigned char* outend; 1990: const unsigned char* outstart = out; 1991: const unsigned char* instart = in; 1992: const unsigned char* inend; 1993: unsigned int c, d; 1994: int trailing; 1995: 1996: if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1); 1997: if (in == NULL) { 1998: /* 1999: * initialization nothing to do 2000: */ 2001: *outlen = 0; 2002: *inlen = 0; 2003: return(0); 2004: } 2005: inend = in + (*inlen); 2006: outend = out + (*outlen); 2007: while (in < inend) { 2008: d = *in++; 2009: if (d < 0x80) { c= d; trailing= 0; } 2010: else if (d < 0xC0) { 2011: /* trailing byte in leading position */ 2012: *outlen = out - outstart; 2013: *inlen = processed - instart; 2014: return(-2); 2015: } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } 2016: else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } 2017: else if (d < 0xF8) { c= d & 0x07; trailing= 3; } 2018: else { 2019: /* no chance for this in Ascii */ 2020: *outlen = out - outstart; 2021: *inlen = processed - instart; 2022: return(-2); 2023: } 2024: 2025: if (inend - in < trailing) { 2026: break; 2027: } 2028: 2029: for ( ; trailing; trailing--) { 2030: if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) 2031: break; 2032: c <<= 6; 2033: c |= d & 0x3F; 2034: } 2035: 2036: /* assertion: c is a single UTF-4 value */ 2037: if (c < 0x80) { 2038: if (out + 1 >= outend) 2039: break; 2040: *out++ = c; 2041: } else { 2042: int len; 2043: const htmlEntityDesc * ent; 2044: const char *cp; 2045: char nbuf[16]; 2046: 2047: /* 2048: * Try to lookup a predefined HTML entity for it 2049: */ 2050: 2051: ent = htmlEntityValueLookup(c); 2052: if (ent == NULL) { 2053: snprintf(nbuf, sizeof(nbuf), "#%u", c); 2054: cp = nbuf; 2055: } 2056: else 2057: cp = ent->name; 2058: len = strlen(cp); 2059: if (out + 2 + len >= outend) 2060: break; 2061: *out++ 
= '&'; 2062: memcpy(out, cp, len); 2063: out += len; 2064: *out++ = ';'; 2065: } 2066: processed = in; 2067: } 2068: *outlen = out - outstart; 2069: *inlen = processed - instart; 2070: return(0); 2071: } 2072: 2073: /** 2074: * htmlEncodeEntities: 2075: * @out: a pointer to an array of bytes to store the result 2076: * @outlen: the length of @out 2077: * @in: a pointer to an array of UTF-8 chars 2078: * @inlen: the length of @in 2079: * @quoteChar: the quote character to escape (' or ") or zero. 2080: * 2081: * Take a block of UTF-8 chars in and try to convert it to an ASCII 2082: * plus HTML entities block of chars out. 2083: * 2084: * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise 2085: * The value of @inlen after return is the number of octets consumed 2086: * as the return value is positive, else unpredictable. 2087: * The value of @outlen after return is the number of octets consumed. 2088: */ 2089: int 2090: htmlEncodeEntities(unsigned char* out, int *outlen, 2091: const unsigned char* in, int *inlen, int quoteChar) { 2092: const unsigned char* processed = in; 2093: const unsigned char* outend; 2094: const unsigned char* outstart = out; 2095: const unsigned char* instart = in; 2096: const unsigned char* inend; 2097: unsigned int c, d; 2098: int trailing; 2099: 2100: if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) 2101: return(-1); 2102: outend = out + (*outlen); 2103: inend = in + (*inlen); 2104: while (in < inend) { 2105: d = *in++; 2106: if (d < 0x80) { c= d; trailing= 0; } 2107: else if (d < 0xC0) { 2108: /* trailing byte in leading position */ 2109: *outlen = out - outstart; 2110: *inlen = processed - instart; 2111: return(-2); 2112: } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } 2113: else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } 2114: else if (d < 0xF8) { c= d & 0x07; trailing= 3; } 2115: else { 2116: /* no chance for this in Ascii */ 2117: *outlen = out - outstart; 2118: *inlen = processed - instart; 
2119: return(-2); 2120: } 2121: 2122: if (inend - in < trailing) 2123: break; 2124: 2125: while (trailing--) { 2126: if (((d= *in++) & 0xC0) != 0x80) { 2127: *outlen = out - outstart; 2128: *inlen = processed - instart; 2129: return(-2); 2130: } 2131: c <<= 6; 2132: c |= d & 0x3F; 2133: } 2134: 2135: /* assertion: c is a single UTF-4 value */ 2136: if ((c < 0x80) && (c != (unsigned int) quoteChar) && 2137: (c != '&') && (c != '<') && (c != '>')) { 2138: if (out >= outend) 2139: break; 2140: *out++ = c; 2141: } else { 2142: const htmlEntityDesc * ent; 2143: const char *cp; 2144: char nbuf[16]; 2145: int len; 2146: 2147: /* 2148: * Try to lookup a predefined HTML entity for it 2149: */ 2150: ent = htmlEntityValueLookup(c); 2151: if (ent == NULL) { 2152: snprintf(nbuf, sizeof(nbuf), "#%u", c); 2153: cp = nbuf; 2154: } 2155: else 2156: cp = ent->name; 2157: len = strlen(cp); 2158: if (out + 2 + len > outend) 2159: break; 2160: *out++ = '&'; 2161: memcpy(out, cp, len); 2162: out += len; 2163: *out++ = ';'; 2164: } 2165: processed = in; 2166: } 2167: *outlen = out - outstart; 2168: *inlen = processed - instart; 2169: return(0); 2170: } 2171: 2172: /************************************************************************ 2173: * * 2174: * Commodity functions to handle streams * 2175: * * 2176: ************************************************************************/ 2177: 2178: /** 2179: * htmlNewInputStream: 2180: * @ctxt: an HTML parser context 2181: * 2182: * Create a new input stream structure 2183: * Returns the new input stream or NULL 2184: */ 2185: static htmlParserInputPtr 2186: htmlNewInputStream(htmlParserCtxtPtr ctxt) { 2187: htmlParserInputPtr input; 2188: 2189: input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput)); 2190: if (input == NULL) { 2191: htmlErrMemory(ctxt, "couldn't allocate a new input stream\n"); 2192: return(NULL); 2193: } 2194: memset(input, 0, sizeof(htmlParserInput)); 2195: input->filename = NULL; 2196: input->directory = NULL; 
2197: input->base = NULL; 2198: input->cur = NULL; 2199: input->buf = NULL; 2200: input->line = 1; 2201: input->col = 1; 2202: input->buf = NULL; 2203: input->free = NULL; 2204: input->version = NULL; 2205: input->consumed = 0; 2206: input->length = 0; 2207: return(input); 2208: } 2209: 2210: 2211: /************************************************************************ 2212: * * 2213: * Commodity functions, cleanup needed ? * 2214: * * 2215: ************************************************************************/ 2216: /* 2217: * all tags allowing pc data from the html 4.01 loose dtd 2218: * NOTE: it might be more apropriate to integrate this information 2219: * into the html40ElementTable array but I don't want to risk any 2220: * binary incomptibility 2221: */ 2222: static const char *allowPCData[] = { 2223: "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big", 2224: "blockquote", "body", "button", "caption", "center", "cite", "code", 2225: "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2", 2226: "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend", 2227: "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp", 2228: "small", "span", "strike", "strong", "td", "th", "tt", "u", "var" 2229: }; 2230: 2231: /** 2232: * areBlanks: 2233: * @ctxt: an HTML parser context 2234: * @str: a xmlChar * 2235: * @len: the size of @str 2236: * 2237: * Is this a sequence of blank chars that one can ignore ? 2238: * 2239: * Returns 1 if ignorable 0 otherwise. 
2240: */ 2241: 2242: static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) { 2243: unsigned int i; 2244: int j; 2245: xmlNodePtr lastChild; 2246: xmlDtdPtr dtd; 2247: 2248: for (j = 0;j < len;j++) 2249: if (!(IS_BLANK_CH(str[j]))) return(0); 2250: 2251: if (CUR == 0) return(1); 2252: if (CUR != '<') return(0); 2253: if (ctxt->name == NULL) 2254: return(1); 2255: if (xmlStrEqual(ctxt->name, BAD_CAST"html")) 2256: return(1); 2257: if (xmlStrEqual(ctxt->name, BAD_CAST"head")) 2258: return(1); 2259: 2260: /* Only strip CDATA children of the body tag for strict HTML DTDs */ 2261: if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) { 2262: dtd = xmlGetIntSubset(ctxt->myDoc); 2263: if (dtd != NULL && dtd->ExternalID != NULL) { 2264: if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") || 2265: !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN")) 2266: return(1); 2267: } 2268: } 2269: 2270: if (ctxt->node == NULL) return(0); 2271: lastChild = xmlGetLastChild(ctxt->node); 2272: while ((lastChild) && (lastChild->type == XML_COMMENT_NODE)) 2273: lastChild = lastChild->prev; 2274: if (lastChild == NULL) { 2275: if ((ctxt->node->type != XML_ELEMENT_NODE) && 2276: (ctxt->node->content != NULL)) return(0); 2277: /* keep ws in constructs like ... ... 
2278: for all tags "b" allowing PCDATA */ 2279: for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) { 2280: if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) { 2281: return(0); 2282: } 2283: } 2284: } else if (xmlNodeIsText(lastChild)) { 2285: return(0); 2286: } else { 2287: /* keep ws in constructs like xy z 2288: for all tags "p" allowing PCDATA */ 2289: for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) { 2290: if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) { 2291: return(0); 2292: } 2293: } 2294: } 2295: return(1); 2296: } 2297: 2298: /** 2299: * htmlNewDocNoDtD: 2300: * @URI: URI for the dtd, or NULL 2301: * @ExternalID: the external ID of the DTD, or NULL 2302: * 2303: * Creates a new HTML document without a DTD node if @URI and @ExternalID 2304: * are NULL 2305: * 2306: * Returns a new document, do not initialize the DTD if not provided 2307: */ 2308: htmlDocPtr 2309: htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) { 2310: xmlDocPtr cur; 2311: 2312: /* 2313: * Allocate a new document and fill the fields. 
2314: */ 2315: cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc)); 2316: if (cur == NULL) { 2317: htmlErrMemory(NULL, "HTML document creation failed\n"); 2318: return(NULL); 2319: } 2320: memset(cur, 0, sizeof(xmlDoc)); 2321: 2322: cur->type = XML_HTML_DOCUMENT_NODE; 2323: cur->version = NULL; 2324: cur->intSubset = NULL; 2325: cur->doc = cur; 2326: cur->name = NULL; 2327: cur->children = NULL; 2328: cur->extSubset = NULL; 2329: cur->oldNs = NULL; 2330: cur->encoding = NULL; 2331: cur->standalone = 1; 2332: cur->compression = 0; 2333: cur->ids = NULL; 2334: cur->refs = NULL; 2335: cur->_private = NULL; 2336: cur->charset = XML_CHAR_ENCODING_UTF8; 2337: cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT; 2338: if ((ExternalID != NULL) || 2339: (URI != NULL)) 2340: xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI); 2341: return(cur); 2342: } 2343: 2344: /** 2345: * htmlNewDoc: 2346: * @URI: URI for the dtd, or NULL 2347: * @ExternalID: the external ID of the DTD, or NULL 2348: * 2349: * Creates a new HTML document 2350: * 2351: * Returns a new document 2352: */ 2353: htmlDocPtr 2354: htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) { 2355: if ((URI == NULL) && (ExternalID == NULL)) 2356: return(htmlNewDocNoDtD( 2357: BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd", 2358: BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN")); 2359: 2360: return(htmlNewDocNoDtD(URI, ExternalID)); 2361: } 2362: 2363: 2364: /************************************************************************ 2365: * * 2366: * The parser itself * 2367: * Relates to http://www.w3.org/TR/html40 * 2368: * * 2369: ************************************************************************/ 2370: 2371: /************************************************************************ 2372: * * 2373: * The parser itself * 2374: * * 2375: ************************************************************************/ 2376: 2377: static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt); 2378: 2379: /** 
2380: * htmlParseHTMLName: 2381: * @ctxt: an HTML parser context 2382: * 2383: * parse an HTML tag or attribute name, note that we convert it to lowercase 2384: * since HTML names are not case-sensitive. 2385: * 2386: * Returns the Tag Name parsed or NULL 2387: */ 2388: 2389: static const xmlChar * 2390: htmlParseHTMLName(htmlParserCtxtPtr ctxt) { 2391: int i = 0; 2392: xmlChar loc[HTML_PARSER_BUFFER_SIZE]; 2393: 2394: if (!IS_ASCII_LETTER(CUR) && (CUR != '_') && 2395: (CUR != ':') && (CUR != '.')) return(NULL); 2396: 2397: while ((i < HTML_PARSER_BUFFER_SIZE) && 2398: ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) || 2399: (CUR == ':') || (CUR == '-') || (CUR == '_') || 2400: (CUR == '.'))) { 2401: if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20; 2402: else loc[i] = CUR; 2403: i++; 2404: 2405: NEXT; 2406: } 2407: 2408: return(xmlDictLookup(ctxt->dict, loc, i)); 2409: } 2410: 2411: 2412: /** 2413: * htmlParseHTMLName_nonInvasive: 2414: * @ctxt: an HTML parser context 2415: * 2416: * parse an HTML tag or attribute name, note that we convert it to lowercase 2417: * since HTML names are not case-sensitive, this doesn't consume the data 2418: * from the stream, it's a look-ahead 2419: * 2420: * Returns the Tag Name parsed or NULL 2421: */ 2422: 2423: static const xmlChar * 2424: htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) { 2425: int i = 0; 2426: xmlChar loc[HTML_PARSER_BUFFER_SIZE]; 2427: 2428: if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') && 2429: (NXT(1) != ':')) return(NULL); 2430: 2431: while ((i < HTML_PARSER_BUFFER_SIZE) && 2432: ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) || 2433: (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) { 2434: if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20; 2435: else loc[i] = NXT(1+i); 2436: i++; 2437: } 2438: 2439: return(xmlDictLookup(ctxt->dict, loc, i)); 2440: } 2441: 2442: 2443: /** 2444: * htmlParseName: 2445: * @ctxt: an HTML parser context 2446: * 2447: * 
parse an HTML name, this routine is case sensitive. 2448: * 2449: * Returns the Name parsed or NULL 2450: */ 2451: 2452: static const xmlChar * 2453: htmlParseName(htmlParserCtxtPtr ctxt) { 2454: const xmlChar *in; 2455: const xmlChar *ret; 2456: int count = 0; 2457: 2458: GROW; 2459: 2460: /* 2461: * Accelerator for simple ASCII names 2462: */ 2463: in = ctxt->input->cur; 2464: if (((*in >= 0x61) && (*in <= 0x7A)) || 2465: ((*in >= 0x41) && (*in <= 0x5A)) || 2466: (*in == '_') || (*in == ':')) { 2467: in++; 2468: while (((*in >= 0x61) && (*in <= 0x7A)) || 2469: ((*in >= 0x41) && (*in <= 0x5A)) || 2470: ((*in >= 0x30) && (*in <= 0x39)) || 2471: (*in == '_') || (*in == '-') || 2472: (*in == ':') || (*in == '.')) 2473: in++; 2474: if ((*in > 0) && (*in < 0x80)) { 2475: count = in - ctxt->input->cur; 2476: ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count); 2477: ctxt->input->cur = in; 2478: ctxt->nbChars += count; 2479: ctxt->input->col += count; 2480: return(ret); 2481: } 2482: } 2483: return(htmlParseNameComplex(ctxt)); 2484: } 2485: 2486: static const xmlChar * 2487: htmlParseNameComplex(xmlParserCtxtPtr ctxt) { 2488: int len = 0, l; 2489: int c; 2490: int count = 0; 2491: 2492: /* 2493: * Handler for more complex cases 2494: */ 2495: GROW; 2496: c = CUR_CHAR(l); 2497: if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */ 2498: (!IS_LETTER(c) && (c != '_') && 2499: (c != ':'))) { 2500: return(NULL); 2501: } 2502: 2503: while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */ 2504: ((IS_LETTER(c)) || (IS_DIGIT(c)) || 2505: (c == '.') || (c == '-') || 2506: (c == '_') || (c == ':') || 2507: (IS_COMBINING(c)) || 2508: (IS_EXTENDER(c)))) { 2509: if (count++ > 100) { 2510: count = 0; 2511: GROW; 2512: } 2513: len += l; 2514: NEXTL(l); 2515: c = CUR_CHAR(l); 2516: } 2517: return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len)); 2518: } 2519: 2520: 2521: /** 2522: * htmlParseHTMLAttribute: 2523: * @ctxt: an HTML parser context 2524: 
* @stop: a char stop value 2525: * 2526: * parse an HTML attribute value till the stop (quote), if 2527: * stop is 0 then it stops at the first space 2528: * 2529: * Returns the attribute parsed or NULL 2530: */ 2531: 2532: static xmlChar * 2533: htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) { 2534: xmlChar *buffer = NULL; 2535: int buffer_size = 0; 2536: xmlChar *out = NULL; 2537: const xmlChar *name = NULL; 2538: const xmlChar *cur = NULL; 2539: const htmlEntityDesc * ent; 2540: 2541: /* 2542: * allocate a translation buffer. 2543: */ 2544: buffer_size = HTML_PARSER_BUFFER_SIZE; 2545: buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar)); 2546: if (buffer == NULL) { 2547: htmlErrMemory(ctxt, "buffer allocation failed\n"); 2548: return(NULL); 2549: } 2550: out = buffer; 2551: 2552: /* 2553: * Ok loop until we reach one of the ending chars 2554: */ 2555: while ((CUR != 0) && (CUR != stop)) { 2556: if ((stop == 0) && (CUR == '>')) break; 2557: if ((stop == 0) && (IS_BLANK_CH(CUR))) break; 2558: if (CUR == '&') { 2559: if (NXT(1) == '#') { 2560: unsigned int c; 2561: int bits; 2562: 2563: c = htmlParseCharRef(ctxt); 2564: if (c < 0x80) 2565: { *out++ = c; bits= -6; } 2566: else if (c < 0x800) 2567: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2568: else if (c < 0x10000) 2569: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2570: else 2571: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2572: 2573: for ( ; bits >= 0; bits-= 6) { 2574: *out++ = ((c >> bits) & 0x3F) | 0x80; 2575: } 2576: 2577: if (out - buffer > buffer_size - 100) { 2578: int indx = out - buffer; 2579: 2580: growBuffer(buffer); 2581: out = &buffer[indx]; 2582: } 2583: } else { 2584: ent = htmlParseEntityRef(ctxt, &name); 2585: if (name == NULL) { 2586: *out++ = '&'; 2587: if (out - buffer > buffer_size - 100) { 2588: int indx = out - buffer; 2589: 2590: growBuffer(buffer); 2591: out = &buffer[indx]; 2592: } 2593: } else if (ent == NULL) { 2594: *out++ = '&'; 2595: 
cur = name; 2596: while (*cur != 0) { 2597: if (out - buffer > buffer_size - 100) { 2598: int indx = out - buffer; 2599: 2600: growBuffer(buffer); 2601: out = &buffer[indx]; 2602: } 2603: *out++ = *cur++; 2604: } 2605: } else { 2606: unsigned int c; 2607: int bits; 2608: 2609: if (out - buffer > buffer_size - 100) { 2610: int indx = out - buffer; 2611: 2612: growBuffer(buffer); 2613: out = &buffer[indx]; 2614: } 2615: c = ent->value; 2616: if (c < 0x80) 2617: { *out++ = c; bits= -6; } 2618: else if (c < 0x800) 2619: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2620: else if (c < 0x10000) 2621: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2622: else 2623: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2624: 2625: for ( ; bits >= 0; bits-= 6) { 2626: *out++ = ((c >> bits) & 0x3F) | 0x80; 2627: } 2628: } 2629: } 2630: } else { 2631: unsigned int c; 2632: int bits, l; 2633: 2634: if (out - buffer > buffer_size - 100) { 2635: int indx = out - buffer; 2636: 2637: growBuffer(buffer); 2638: out = &buffer[indx]; 2639: } 2640: c = CUR_CHAR(l); 2641: if (c < 0x80) 2642: { *out++ = c; bits= -6; } 2643: else if (c < 0x800) 2644: { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } 2645: else if (c < 0x10000) 2646: { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } 2647: else 2648: { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } 2649: 2650: for ( ; bits >= 0; bits-= 6) { 2651: *out++ = ((c >> bits) & 0x3F) | 0x80; 2652: } 2653: NEXT; 2654: } 2655: } 2656: *out = 0; 2657: return(buffer); 2658: } 2659: 2660: /** 2661: * htmlParseEntityRef: 2662: * @ctxt: an HTML parser context 2663: * @str: location to store the entity name 2664: * 2665: * parse an HTML ENTITY references 2666: * 2667: * [68] EntityRef ::= '&' Name ';' 2668: * 2669: * Returns the associated htmlEntityDescPtr if found, or NULL otherwise, 2670: * if non-NULL *str will have to be freed by the caller. 
2671: */ 2672: const htmlEntityDesc * 2673: htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) { 2674: const xmlChar *name; 2675: const htmlEntityDesc * ent = NULL; 2676: 2677: if (str != NULL) *str = NULL; 2678: if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL); 2679: 2680: if (CUR == '&') { 2681: NEXT; 2682: name = htmlParseName(ctxt); 2683: if (name == NULL) { 2684: htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, 2685: "htmlParseEntityRef: no name\n", NULL, NULL); 2686: } else { 2687: GROW; 2688: if (CUR == ';') { 2689: if (str != NULL) 2690: *str = name; 2691: 2692: /* 2693: * Lookup the entity in the table. 2694: */ 2695: ent = htmlEntityLookup(name); 2696: if (ent != NULL) /* OK that's ugly !!! */ 2697: NEXT; 2698: } else { 2699: htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING, 2700: "htmlParseEntityRef: expecting ';'\n", 2701: NULL, NULL); 2702: if (str != NULL) 2703: *str = name; 2704: } 2705: } 2706: } 2707: return(ent); 2708: } 2709: 2710: /** 2711: * htmlParseAttValue: 2712: * @ctxt: an HTML parser context 2713: * 2714: * parse a value for an attribute 2715: * Note: the parser won't do substitution of entities here, this 2716: * will be handled later in xmlStringGetNodeList, unless it was 2717: * asked for ctxt->replaceEntities != 0 2718: * 2719: * Returns the AttValue parsed or NULL. 
2720: */ 2721: 2722: static xmlChar * 2723: htmlParseAttValue(htmlParserCtxtPtr ctxt) { 2724: xmlChar *ret = NULL; 2725: 2726: if (CUR == '"') { 2727: NEXT; 2728: ret = htmlParseHTMLAttribute(ctxt, '"'); 2729: if (CUR != '"') { 2730: htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, 2731: "AttValue: \" expected\n", NULL, NULL); 2732: } else 2733: NEXT; 2734: } else if (CUR == '\'') { 2735: NEXT; 2736: ret = htmlParseHTMLAttribute(ctxt, '\''); 2737: if (CUR != '\'') { 2738: htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, 2739: "AttValue: ' expected\n", NULL, NULL); 2740: } else 2741: NEXT; 2742: } else { 2743: /* 2744: * That's an HTMLism, the attribute value may not be quoted 2745: */ 2746: ret = htmlParseHTMLAttribute(ctxt, 0); 2747: if (ret == NULL) { 2748: htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE, 2749: "AttValue: no value found\n", NULL, NULL); 2750: } 2751: } 2752: return(ret); 2753: } 2754: 2755: /** 2756: * htmlParseSystemLiteral: 2757: * @ctxt: an HTML parser context 2758: * 2759: * parse an HTML Literal 2760: * 2761: * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") 2762: * 2763: * Returns the SystemLiteral parsed or NULL 2764: */ 2765: 2766: static xmlChar * 2767: htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) { 2768: const xmlChar *q; 2769: xmlChar *ret = NULL; 2770: 2771: if (CUR == '"') { 2772: NEXT; 2773: q = CUR_PTR; 2774: while ((IS_CHAR_CH(CUR)) && (CUR != '"')) 2775: NEXT; 2776: if (!IS_CHAR_CH(CUR)) { 2777: htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2778: "Unfinished SystemLiteral\n", NULL, NULL); 2779: } else { 2780: ret = xmlStrndup(q, CUR_PTR - q); 2781: NEXT; 2782: } 2783: } else if (CUR == '\'') { 2784: NEXT; 2785: q = CUR_PTR; 2786: while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) 2787: NEXT; 2788: if (!IS_CHAR_CH(CUR)) { 2789: htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2790: "Unfinished SystemLiteral\n", NULL, NULL); 2791: } else { 2792: ret = xmlStrndup(q, CUR_PTR - q); 2793: NEXT; 2794: } 2795: } else { 2796: 
htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, 2797: " or ' expected\n", NULL, NULL); 2798: } 2799: 2800: return(ret); 2801: } 2802: 2803: /** 2804: * htmlParsePubidLiteral: 2805: * @ctxt: an HTML parser context 2806: * 2807: * parse an HTML public literal 2808: * 2809: * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" 2810: * 2811: * Returns the PubidLiteral parsed or NULL. 2812: */ 2813: 2814: static xmlChar * 2815: htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) { 2816: const xmlChar *q; 2817: xmlChar *ret = NULL; 2818: /* 2819: * Name ::= (Letter | '_') (NameChar)* 2820: */ 2821: if (CUR == '"') { 2822: NEXT; 2823: q = CUR_PTR; 2824: while (IS_PUBIDCHAR_CH(CUR)) NEXT; 2825: if (CUR != '"') { 2826: htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2827: "Unfinished PubidLiteral\n", NULL, NULL); 2828: } else { 2829: ret = xmlStrndup(q, CUR_PTR - q); 2830: NEXT; 2831: } 2832: } else if (CUR == '\'') { 2833: NEXT; 2834: q = CUR_PTR; 2835: while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')) 2836: NEXT; 2837: if (CUR != '\'') { 2838: htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, 2839: "Unfinished PubidLiteral\n", NULL, NULL); 2840: } else { 2841: ret = xmlStrndup(q, CUR_PTR - q); 2842: NEXT; 2843: } 2844: } else { 2845: htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, 2846: "PubidLiteral \" or ' expected\n", NULL, NULL); 2847: } 2848: 2849: return(ret); 2850: } 2851: 2852: /** 2853: * htmlParseScript: 2854: * @ctxt: an HTML parser context 2855: * 2856: * parse the content of an HTML SCRIPT or STYLE element 2857: * http://www.w3.org/TR/html4/sgml/dtd.html#Script 2858: * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet 2859: * http://www.w3.org/TR/html4/types.html#type-script 2860: * http://www.w3.org/TR/html4/types.html#h-6.15 2861: * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1 2862: * 2863: * Script data ( %Script; in the DTD) can be the content of the SCRIPT 2864: * element and the value of intrinsic event attributes. 
User agents must 2865: * not evaluate script data as HTML markup but instead must pass it on as 2866: * data to a script engine. 2867: * NOTES: 2868: * - The content is passed like CDATA 2869: * - the attributes for style and scripting "onXXX" are also described 2870: * as CDATA but SGML allows entities references in attributes so their 2871: * processing is identical as other attributes 2872: */ 2873: static void 2874: htmlParseScript(htmlParserCtxtPtr ctxt) { 2875: xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; 2876: int nbchar = 0; 2877: int cur,l; 2878: 2879: SHRINK; 2880: cur = CUR_CHAR(l); 2881: while (IS_CHAR_CH(cur)) { 2882: if ((cur == '<') && (NXT(1) == '/')) { 2883: /* 2884: * One should break here, the specification is clear: 2885: * Authors should therefore escape "</" within the content. 2886: * Escape mechanisms are specific to each scripting or 2887: * style sheet language. 2888: * 2889: * In recovery mode, only break if end tag match the 2890: * current tag, effectively ignoring all tags inside the 2891: * script/style block and treating the entire block as 2892: * CDATA. 
2893: */ 2894: if (ctxt->recovery) { 2895: if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2, 2896: xmlStrlen(ctxt->name)) == 0) 2897: { 2898: break; /* while */ 2899: } else { 2900: htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, 2901: "Element %s embeds close tag\n", 2902: ctxt->name, NULL); 2903: } 2904: } else { 2905: if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) || 2906: ((NXT(2) >= 'a') && (NXT(2) <= 'z'))) 2907: { 2908: break; /* while */ 2909: } 2910: } 2911: } 2912: COPY_BUF(l,buf,nbchar,cur); 2913: if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { 2914: if (ctxt->sax->cdataBlock!= NULL) { 2915: /* 2916: * Insert as CDATA, which is the same as HTML_PRESERVE_NODE 2917: */ 2918: ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); 2919: } else if (ctxt->sax->characters != NULL) { 2920: ctxt->sax->characters(ctxt->userData, buf, nbchar); 2921: } 2922: nbchar = 0; 2923: } 2924: GROW; 2925: NEXTL(l); 2926: cur = CUR_CHAR(l); 2927: } 2928: 2929: if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) { 2930: htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 2931: "Invalid char in CDATA 0x%X\n", cur); 2932: if (ctxt->input->cur < ctxt->input->end) { 2933: NEXT; 2934: } 2935: } 2936: 2937: if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) { 2938: if (ctxt->sax->cdataBlock!= NULL) { 2939: /* 2940: * Insert as CDATA, which is the same as HTML_PRESERVE_NODE 2941: */ 2942: ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); 2943: } else if (ctxt->sax->characters != NULL) { 2944: ctxt->sax->characters(ctxt->userData, buf, nbchar); 2945: } 2946: } 2947: } 2948: 2949: 2950: /** 2951: * htmlParseCharData: 2952: * @ctxt: an HTML parser context 2953: * 2954: * parse a CharData section. 2955: * if we are within a CDATA section ']]>' marks an end of section. 
2956: * 2957: * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) 2958: */ 2959: 2960: static void 2961: htmlParseCharData(htmlParserCtxtPtr ctxt) { 2962: xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; 2963: int nbchar = 0; 2964: int cur, l; 2965: int chunk = 0; 2966: 2967: SHRINK; 2968: cur = CUR_CHAR(l); 2969: while (((cur != '<') || (ctxt->token == '<')) && 2970: ((cur != '&') || (ctxt->token == '&')) && 2971: (cur != 0)) { 2972: if (!(IS_CHAR(cur))) { 2973: htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, 2974: "Invalid char in CDATA 0x%X\n", cur); 2975: } else { 2976: COPY_BUF(l,buf,nbchar,cur); 2977: } 2978: if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { 2979: /* 2980: * Ok the segment is to be consumed as chars. 2981: */ 2982: if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { 2983: if (areBlanks(ctxt, buf, nbchar)) { 2984: if (ctxt->keepBlanks) { 2985: if (ctxt->sax->characters != NULL) 2986: ctxt->sax->characters(ctxt->userData, buf, nbchar); 2987: } else { 2988: if (ctxt->sax->ignorableWhitespace != NULL) 2989: ctxt->sax->ignorableWhitespace(ctxt->userData, 2990: buf, nbchar); 2991: } 2992: } else { 2993: htmlCheckParagraph(ctxt); 2994: if (ctxt->sax->characters != NULL) 2995: ctxt->sax->characters(ctxt->userData, buf, nbchar); 2996: } 2997: } 2998: nbchar = 0; 2999: } 3000: NEXTL(l); 3001: chunk++; 3002: if (chunk > HTML_PARSER_BUFFER_SIZE) { 3003: chunk = 0; 3004: SHRINK; 3005: GROW; 3006: } 3007: cur = CUR_CHAR(l); 3008: if (cur == 0) { 3009: SHRINK; 3010: GROW; 3011: cur = CUR_CHAR(l); 3012: } 3013: } 3014: if (nbchar != 0) { 3015: buf[nbchar] = 0; 3016: 3017: /* 3018: * Ok the segment is to be consumed as chars. 
3019: */ 3020: if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { 3021: if (areBlanks(ctxt, buf, nbchar)) { 3022: if (ctxt->keepBlanks) { 3023: if (ctxt->sax->characters != NULL) 3024: ctxt->sax->characters(ctxt->userData, buf, nbchar); 3025: } else { 3026: if (ctxt->sax->ignorableWhitespace != NULL) 3027: ctxt->sax->ignorableWhitespace(ctxt->userData, 3028: buf, nbchar); 3029: } 3030: } else { 3031: htmlCheckParagraph(ctxt); 3032: if (ctxt->sax->characters != NULL) 3033: ctxt->sax->characters(ctxt->userData, buf, nbchar); 3034: } 3035: } 3036: } else { 3037: /* 3038: * Loop detection 3039: */ 3040: if (cur == 0) 3041: ctxt->instate = XML_PARSER_EOF; 3042: } 3043: } 3044: 3045: /** 3046: * htmlParseExternalID: 3047: * @ctxt: an HTML parser context 3048: * @publicID: a xmlChar** receiving PubidLiteral 3049: * 3050: * Parse an External ID or a Public ID 3051: * 3052: * [75] ExternalID ::= 'SYSTEM' S SystemLiteral 3053: * | 'PUBLIC' S PubidLiteral S SystemLiteral 3054: * 3055: * [83] PublicID ::= 'PUBLIC' S PubidLiteral 3056: * 3057: * Returns the function returns SystemLiteral and in the second 3058: * case publicID receives PubidLiteral, is strict is off 3059: * it is possible to return NULL and have publicID set. 
3060: */ 3061: 3062: static xmlChar * 3063: htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) { 3064: xmlChar *URI = NULL; 3065: 3066: if ((UPPER == 'S') && (UPP(1) == 'Y') && 3067: (UPP(2) == 'S') && (UPP(3) == 'T') && 3068: (UPP(4) == 'E') && (UPP(5) == 'M')) { 3069: SKIP(6); 3070: if (!IS_BLANK_CH(CUR)) { 3071: htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 3072: "Space required after 'SYSTEM'\n", NULL, NULL); 3073: } 3074: SKIP_BLANKS; 3075: URI = htmlParseSystemLiteral(ctxt); 3076: if (URI == NULL) { 3077: htmlParseErr(ctxt, XML_ERR_URI_REQUIRED, 3078: "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL); 3079: } 3080: } else if ((UPPER == 'P') && (UPP(1) == 'U') && 3081: (UPP(2) == 'B') && (UPP(3) == 'L') && 3082: (UPP(4) == 'I') && (UPP(5) == 'C')) { 3083: SKIP(6); 3084: if (!IS_BLANK_CH(CUR)) { 3085: htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 3086: "Space required after 'PUBLIC'\n", NULL, NULL); 3087: } 3088: SKIP_BLANKS; 3089: *publicID = htmlParsePubidLiteral(ctxt); 3090: if (*publicID == NULL) { 3091: htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED, 3092: "htmlParseExternalID: PUBLIC, no Public Identifier\n", 3093: NULL, NULL); 3094: } 3095: SKIP_BLANKS; 3096: if ((CUR == '"') || (CUR == '\'')) { 3097: URI = htmlParseSystemLiteral(ctxt); 3098: } 3099: } 3100: return(URI); 3101: } 3102: 3103: /** 3104: * xmlParsePI: 3105: * @ctxt: an XML parser context 3106: * 3107: * parse an XML Processing Instruction. 3108: * 3109: * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>' 3110: */ 3111: static void 3112: htmlParsePI(htmlParserCtxtPtr ctxt) { 3113: xmlChar *buf = NULL; 3114: int len = 0; 3115: int size = HTML_PARSER_BUFFER_SIZE; 3116: int cur, l; 3117: const xmlChar *target; 3118: xmlParserInputState state; 3119: int count = 0; 3120: 3121: if ((RAW == '<') && (NXT(1) == '?')) { 3122: state = ctxt->instate; 3123: ctxt->instate = XML_PARSER_PI; 3124: /* 3125: * this is a Processing Instruction. 
3126: */ 3127: SKIP(2); 3128: SHRINK; 3129: 3130: /* 3131: * Parse the target name and check for special support like 3132: * namespace. 3133: */ 3134: target = htmlParseName(ctxt); 3135: if (target != NULL) { 3136: if (RAW == '>') { 3137: SKIP(1); 3138: 3139: /* 3140: * SAX: PI detected. 3141: */ 3142: if ((ctxt->sax) && (!ctxt->disableSAX) && 3143: (ctxt->sax->processingInstruction != NULL)) 3144: ctxt->sax->processingInstruction(ctxt->userData, 3145: target, NULL); 3146: ctxt->instate = state; 3147: return; 3148: } 3149: buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar)); 3150: if (buf == NULL) { 3151: htmlErrMemory(ctxt, NULL); 3152: ctxt->instate = state; 3153: return; 3154: } 3155: cur = CUR; 3156: if (!IS_BLANK(cur)) { 3157: htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, 3158: "ParsePI: PI %s space expected\n", target, NULL); 3159: } 3160: SKIP_BLANKS; 3161: cur = CUR_CHAR(l); 3162: while (IS_CHAR(cur) && (cur != '>')) { 3163: if (len + 5 >= size) { 3164: xmlChar *tmp; 3165: 3166: size *= 2; 3167: tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); 3168: if (tmp == NULL) { 3169: htmlErrMemory(ctxt, NULL); 3170: xmlFree(buf); 3171: ctxt->instate = state; 3172: return; 3173: } 3174: buf = tmp; 3175: } 3176: count++; 3177: if (count > 50) { 3178: GROW; 3179: count = 0; 3180: } 3181: COPY_BUF(l,buf,len,cur); 3182: NEXTL(l); 3183: cur = CUR_CHAR(l); 3184: if (cur == 0) { 3185: SHRINK; 3186: GROW; 3187: cur = CUR_CHAR(l); 3188: } 3189: } 3190: buf[len] = 0; 3191: if (cur != '>') { 3192: htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED, 3193: "ParsePI: PI %s never end ...\n", target, NULL); 3194: } else { 3195: SKIP(1); 3196: 3197: /* 3198: * SAX: PI detected. 
3199: */ 3200: if ((ctxt->sax) && (!ctxt->disableSAX) && 3201: (ctxt->sax->processingInstruction != NULL)) 3202: ctxt->sax->processingInstruction(ctxt->userData, 3203: target, buf); 3204: } 3205: xmlFree(buf); 3206: } else { 3207: htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED, 3208: "PI is not started correctly", NULL, NULL); 3209: } 3210: ctxt->instate = state; 3211: } 3212: } 3213: 3214: /** 3215: * htmlParseComment: 3216: * @ctxt: an HTML parser context 3217: * 3218: * Parse an XML (SGML) comment  3219: * 3220: * [15] Comment ::= '' 3221: */ 3222: static void 3223: htmlParseComment(htmlParserCtxtPtr ctxt) { 3224: xmlChar *buf = NULL; 3225: int len; 3226: int size = HTML_PARSER_BUFFER_SIZE; 3227: int q, ql; 3228: int r, rl; 3229: int cur, l; 3230: xmlParserInputState state; 3231: 3232: /* 3233: * Check that there is a comment right here. 3234: */ 3235: if ((RAW != '<') || (NXT(1) != '!') || 3236: (NXT(2) != '-') || (NXT(3) != '-')) return; 3237: 3238: state = ctxt->instate; 3239: ctxt->instate = XML_PARSER_COMMENT; 3240: SHRINK; 3241: SKIP(4); 3242: buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar)); 3243: if (buf == NULL) { 3244: htmlErrMemory(ctxt, "buffer allocation failed\n"); 3245: ctxt->instate = state; 3246: return; 3247: } 3248: q = CUR_CHAR(ql); 3249: NEXTL(ql); 3250: r = CUR_CHAR(rl); 3251: NEXTL(rl); 3252: cur = CUR_CHAR(l); 3253: len = 0; 3254: while (IS_CHAR(cur) && 3255: ((cur != '>') || 3256: (r != '-') || (q != '-'))) { 3257: if (len + 5 >= size) { 3258: xmlChar *tmp; 3259: 3260: size *= 2; 3261: tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar)); 3262: if (tmp == NULL) { 3263: xmlFree(buf); 3264: htmlErrMemory(ctxt, "growing buffer failed\n"); 3265: ctxt->instate = state; 3266: return; 3267: } 3268: buf = tmp; 3269: } 3270: COPY_BUF(ql,buf,len,q); 3271: q = r; 3272: ql = rl; 3273: r = cur; 3274: rl = l; 3275: NEXTL(l); 3276: cur = CUR_CHAR(l); 3277: if (cur == 0) { 3278: SHRINK; 3279: GROW; 3280: cur = CUR_CHAR(l); 3281: } 3282: } 3283: 
buf[len] = 0; 3284: if (!IS_CHAR(cur)) { 3285: htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED, 3286: "Comment not terminated \n */ 5087: base += 2; 5088: } 5089: } 5090: if (ignoreattrval) { 5091: if (buf[base] == '"' || buf[base] == '\'') { 5092: if (invalue) { 5093: if (buf[base] == valdellim) { 5094: invalue = 0; 5095: continue; 5096: } 5097: } else { 5098: valdellim = buf[base]; 5099: invalue = 1; 5100: continue; 5101: } 5102: } else if (invalue) { 5103: continue; 5104: } 5105: } 5106: if (incomment) { 5107: if (base + 3 > len) 5108: return (-1); 5109: if ((buf[base] == '-') && (buf[base + 1] == '-') && 5110: (buf[base + 2] == '>')) { 5111: incomment = 0; 5112: base += 2; 5113: } 5114: continue; 5115: } 5116: if (buf[base] == first) { 5117: if (third != 0) { 5118: if ((buf[base + 1] != next) || (buf[base + 2] != third)) 5119: continue; 5120: } else if (next != 0) { 5121: if (buf[base + 1] != next) 5122: continue; 5123: } 5124: ctxt->checkIndex = 0; 5125: #ifdef DEBUG_PUSH 5126: if (next == 0) 5127: xmlGenericError(xmlGenericErrorContext, 5128: "HPP: lookup '%c' found at %d\n", 5129: first, base); 5130: else if (third == 0) 5131: xmlGenericError(xmlGenericErrorContext, 5132: "HPP: lookup '%c%c' found at %d\n", 5133: first, next, base); 5134: else 5135: xmlGenericError(xmlGenericErrorContext, 5136: "HPP: lookup '%c%c%c' found at %d\n", 5137: first, next, third, base); 5138: #endif 5139: return (base - (in->cur - in->base)); 5140: } 5141: } 5142: if ((!incomment) && (!invalue)) 5143: ctxt->checkIndex = base; 5144: #ifdef DEBUG_PUSH 5145: if (next == 0) 5146: xmlGenericError(xmlGenericErrorContext, 5147: "HPP: lookup '%c' failed\n", first); 5148: else if (third == 0) 5149: xmlGenericError(xmlGenericErrorContext, 5150: "HPP: lookup '%c%c' failed\n", first, next); 5151: else 5152: xmlGenericError(xmlGenericErrorContext, 5153: "HPP: lookup '%c%c%c' failed\n", first, next, 5154: third); 5155: #endif 5156: return (-1); 5157: } 5158: 5159: /** 5160: * 
htmlParseLookupChars: 5161: * @ctxt: an HTML parser context 5162: * @stop: Array of chars, which stop the lookup. 5163: * @stopLen: Length of stop-Array 5164: * 5165: * Try to find if any char of the stop-Array is available in the input 5166: * stream. 5167: * This function has a side effect of (possibly) incrementing ctxt->checkIndex 5168: * to avoid rescanning sequences of bytes, it DOES change the state of the 5169: * parser, do not use liberally. 5170: * 5171: * Returns the index to the current parsing point if a stopChar 5172: * is available, -1 otherwise. 5173: */ 5174: static int 5175: htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop, 5176: int stopLen) 5177: { 5178: int base, len; 5179: htmlParserInputPtr in; 5180: const xmlChar *buf; 5181: int incomment = 0; 5182: int i; 5183: 5184: in = ctxt->input; 5185: if (in == NULL) 5186: return (-1); 5187: 5188: base = in->cur - in->base; 5189: if (base < 0) 5190: return (-1); 5191: 5192: if (ctxt->checkIndex > base) 5193: base = ctxt->checkIndex; 5194: 5195: if (in->buf == NULL) { 5196: buf = in->base; 5197: len = in->length; 5198: } else { 5199: buf = xmlBufContent(in->buf->buffer); 5200: len = xmlBufUse(in->buf->buffer); 5201: } 5202: 5203: for (; base < len; base++) { 5204: if (!incomment && (base + 4 < len)) { 5205: if ((buf[base] == '<') && (buf[base + 1] == '!') && 5206: (buf[base + 2] == '-') && (buf[base + 3] == '-')) { 5207: incomment = 1; 5208: /* do not increment past <! 
- some people use <!--> */ 5209: base += 2; 5210: } 5211: } 5212: if (incomment) { 5213: if (base + 3 > len) 5214: return (-1); 5215: if ((buf[base] == '-') && (buf[base + 1] == '-') && 5216: (buf[base + 2] == '>')) { 5217: incomment = 0; 5218: base += 2; 5219: } 5220: continue; 5221: } 5222: for (i = 0; i < stopLen; ++i) { 5223: if (buf[base] == stop[i]) { 5224: ctxt->checkIndex = 0; 5225: return (base - (in->cur - in->base)); 5226: } 5227: } 5228: } 5229: ctxt->checkIndex = base; 5230: return (-1); 5231: } 5232: 5233: /** 5234: * htmlParseTryOrFinish: 5235: * @ctxt: an HTML parser context 5236: * @terminate: last chunk indicator 5237: * 5238: * Try to progress on parsing 5239: * 5240: * Returns zero if no parsing was possible 5241: */ 5242: static int 5243: htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { 5244: int ret = 0; 5245: htmlParserInputPtr in; 5246: int avail = 0; 5247: xmlChar cur, next; 5248: 5249: htmlParserNodeInfo node_info; 5250: 5251: #ifdef DEBUG_PUSH 5252: switch (ctxt->instate) { 5253: case XML_PARSER_EOF: 5254: xmlGenericError(xmlGenericErrorContext, 5255: "HPP: try EOF\n"); break; 5256: case XML_PARSER_START: 5257: xmlGenericError(xmlGenericErrorContext, 5258: "HPP: try START\n"); break; 5259: case XML_PARSER_MISC: 5260: xmlGenericError(xmlGenericErrorContext, 5261: "HPP: try MISC\n");break; 5262: case XML_PARSER_COMMENT: 5263: xmlGenericError(xmlGenericErrorContext, 5264: "HPP: try COMMENT\n");break; 5265: case XML_PARSER_PROLOG: 5266: xmlGenericError(xmlGenericErrorContext, 5267: "HPP: try PROLOG\n");break; 5268: case XML_PARSER_START_TAG: 5269: xmlGenericError(xmlGenericErrorContext, 5270: "HPP: try START_TAG\n");break; 5271: case XML_PARSER_CONTENT: 5272: xmlGenericError(xmlGenericErrorContext, 5273: "HPP: try CONTENT\n");break; 5274: case XML_PARSER_CDATA_SECTION: 5275: xmlGenericError(xmlGenericErrorContext, 5276: "HPP: try CDATA_SECTION\n");break; 5277: case XML_PARSER_END_TAG: 5278: xmlGenericError(xmlGenericErrorContext, 
5279: "HPP: try END_TAG\n");break; 5280: case XML_PARSER_ENTITY_DECL: 5281: xmlGenericError(xmlGenericErrorContext, 5282: "HPP: try ENTITY_DECL\n");break; 5283: case XML_PARSER_ENTITY_VALUE: 5284: xmlGenericError(xmlGenericErrorContext, 5285: "HPP: try ENTITY_VALUE\n");break; 5286: case XML_PARSER_ATTRIBUTE_VALUE: 5287: xmlGenericError(xmlGenericErrorContext, 5288: "HPP: try ATTRIBUTE_VALUE\n");break; 5289: case XML_PARSER_DTD: 5290: xmlGenericError(xmlGenericErrorContext, 5291: "HPP: try DTD\n");break; 5292: case XML_PARSER_EPILOG: 5293: xmlGenericError(xmlGenericErrorContext, 5294: "HPP: try EPILOG\n");break; 5295: case XML_PARSER_PI: 5296: xmlGenericError(xmlGenericErrorContext, 5297: "HPP: try PI\n");break; 5298: case XML_PARSER_SYSTEM_LITERAL: 5299: xmlGenericError(xmlGenericErrorContext, 5300: "HPP: try SYSTEM_LITERAL\n");break; 5301: } 5302: #endif 5303: 5304: while (1) { 5305: 5306: in = ctxt->input; 5307: if (in == NULL) break; 5308: if (in->buf == NULL) 5309: avail = in->length - (in->cur - in->base); 5310: else 5311: avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); 5312: if ((avail == 0) && (terminate)) { 5313: htmlAutoCloseOnEnd(ctxt); 5314: if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) { 5315: /* 5316: * SAX: end of the document processing. 5317: */ 5318: ctxt->instate = XML_PARSER_EOF; 5319: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 5320: ctxt->sax->endDocument(ctxt->userData); 5321: } 5322: } 5323: if (avail < 1) 5324: goto done; 5325: cur = in->cur[0]; 5326: if (cur == 0) { 5327: SKIP(1); 5328: continue; 5329: } 5330: 5331: switch (ctxt->instate) { 5332: case XML_PARSER_EOF: 5333: /* 5334: * Document parsing is done ! 5335: */ 5336: goto done; 5337: case XML_PARSER_START: 5338: /* 5339: * Very first chars read from the document flow. 
5340: */ 5341: cur = in->cur[0]; 5342: if (IS_BLANK_CH(cur)) { 5343: SKIP_BLANKS; 5344: if (in->buf == NULL) 5345: avail = in->length - (in->cur - in->base); 5346: else 5347: avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); 5348: } 5349: if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) 5350: ctxt->sax->setDocumentLocator(ctxt->userData, 5351: &xmlDefaultSAXLocator); 5352: if ((ctxt->sax) && (ctxt->sax->startDocument) && 5353: (!ctxt->disableSAX)) 5354: ctxt->sax->startDocument(ctxt->userData); 5355: 5356: cur = in->cur[0]; 5357: next = in->cur[1]; 5358: if ((cur == '<') && (next == '!') && 5359: (UPP(2) == 'D') && (UPP(3) == 'O') && 5360: (UPP(4) == 'C') && (UPP(5) == 'T') && 5361: (UPP(6) == 'Y') && (UPP(7) == 'P') && 5362: (UPP(8) == 'E')) { 5363: if ((!terminate) && 5364: (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5365: goto done; 5366: #ifdef DEBUG_PUSH 5367: xmlGenericError(xmlGenericErrorContext, 5368: "HPP: Parsing internal subset\n"); 5369: #endif 5370: htmlParseDocTypeDecl(ctxt); 5371: ctxt->instate = XML_PARSER_PROLOG; 5372: #ifdef DEBUG_PUSH 5373: xmlGenericError(xmlGenericErrorContext, 5374: "HPP: entering PROLOG\n"); 5375: #endif 5376: } else { 5377: ctxt->instate = XML_PARSER_MISC; 5378: #ifdef DEBUG_PUSH 5379: xmlGenericError(xmlGenericErrorContext, 5380: "HPP: entering MISC\n"); 5381: #endif 5382: } 5383: break; 5384: case XML_PARSER_MISC: 5385: SKIP_BLANKS; 5386: if (in->buf == NULL) 5387: avail = in->length - (in->cur - in->base); 5388: else 5389: avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); 5390: /* 5391: * no chars in buffer 5392: */ 5393: if (avail < 1) 5394: goto done; 5395: /* 5396: * not enouth chars in buffer 5397: */ 5398: if (avail < 2) { 5399: if (!terminate) 5400: goto done; 5401: else 5402: next = ' '; 5403: } else { 5404: next = in->cur[1]; 5405: } 5406: cur = in->cur[0]; 5407: if ((cur == '<') && (next == '!') && 5408: (in->cur[2] == '-') && (in->cur[3] == '-')) { 5409: if ((!terminate) && 5410: 
(htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) 5411: goto done; 5412: #ifdef DEBUG_PUSH 5413: xmlGenericError(xmlGenericErrorContext, 5414: "HPP: Parsing Comment\n"); 5415: #endif 5416: htmlParseComment(ctxt); 5417: ctxt->instate = XML_PARSER_MISC; 5418: } else if ((cur == '<') && (next == '?')) { 5419: if ((!terminate) && 5420: (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5421: goto done; 5422: #ifdef DEBUG_PUSH 5423: xmlGenericError(xmlGenericErrorContext, 5424: "HPP: Parsing PI\n"); 5425: #endif 5426: htmlParsePI(ctxt); 5427: ctxt->instate = XML_PARSER_MISC; 5428: } else if ((cur == '<') && (next == '!') && 5429: (UPP(2) == 'D') && (UPP(3) == 'O') && 5430: (UPP(4) == 'C') && (UPP(5) == 'T') && 5431: (UPP(6) == 'Y') && (UPP(7) == 'P') && 5432: (UPP(8) == 'E')) { 5433: if ((!terminate) && 5434: (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5435: goto done; 5436: #ifdef DEBUG_PUSH 5437: xmlGenericError(xmlGenericErrorContext, 5438: "HPP: Parsing internal subset\n"); 5439: #endif 5440: htmlParseDocTypeDecl(ctxt); 5441: ctxt->instate = XML_PARSER_PROLOG; 5442: #ifdef DEBUG_PUSH 5443: xmlGenericError(xmlGenericErrorContext, 5444: "HPP: entering PROLOG\n"); 5445: #endif 5446: } else if ((cur == '<') && (next == '!') && 5447: (avail < 9)) { 5448: goto done; 5449: } else { 5450: ctxt->instate = XML_PARSER_START_TAG; 5451: #ifdef DEBUG_PUSH 5452: xmlGenericError(xmlGenericErrorContext, 5453: "HPP: entering START_TAG\n"); 5454: #endif 5455: } 5456: break; 5457: case XML_PARSER_PROLOG: 5458: SKIP_BLANKS; 5459: if (in->buf == NULL) 5460: avail = in->length - (in->cur - in->base); 5461: else 5462: avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); 5463: if (avail < 2) 5464: goto done; 5465: cur = in->cur[0]; 5466: next = in->cur[1]; 5467: if ((cur == '<') && (next == '!') && 5468: (in->cur[2] == '-') && (in->cur[3] == '-')) { 5469: if ((!terminate) && 5470: (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) 5471: goto done; 5472: 
#ifdef DEBUG_PUSH 5473: xmlGenericError(xmlGenericErrorContext, 5474: "HPP: Parsing Comment\n"); 5475: #endif 5476: htmlParseComment(ctxt); 5477: ctxt->instate = XML_PARSER_PROLOG; 5478: } else if ((cur == '<') && (next == '?')) { 5479: if ((!terminate) && 5480: (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5481: goto done; 5482: #ifdef DEBUG_PUSH 5483: xmlGenericError(xmlGenericErrorContext, 5484: "HPP: Parsing PI\n"); 5485: #endif 5486: htmlParsePI(ctxt); 5487: ctxt->instate = XML_PARSER_PROLOG; 5488: } else if ((cur == '<') && (next == '!') && 5489: (avail < 4)) { 5490: goto done; 5491: } else { 5492: ctxt->instate = XML_PARSER_START_TAG; 5493: #ifdef DEBUG_PUSH 5494: xmlGenericError(xmlGenericErrorContext, 5495: "HPP: entering START_TAG\n"); 5496: #endif 5497: } 5498: break; 5499: case XML_PARSER_EPILOG: 5500: if (in->buf == NULL) 5501: avail = in->length - (in->cur - in->base); 5502: else 5503: avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); 5504: if (avail < 1) 5505: goto done; 5506: cur = in->cur[0]; 5507: if (IS_BLANK_CH(cur)) { 5508: htmlParseCharData(ctxt); 5509: goto done; 5510: } 5511: if (avail < 2) 5512: goto done; 5513: next = in->cur[1]; 5514: if ((cur == '<') && (next == '!') && 5515: (in->cur[2] == '-') && (in->cur[3] == '-')) { 5516: if ((!terminate) && 5517: (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) 5518: goto done; 5519: #ifdef DEBUG_PUSH 5520: xmlGenericError(xmlGenericErrorContext, 5521: "HPP: Parsing Comment\n"); 5522: #endif 5523: htmlParseComment(ctxt); 5524: ctxt->instate = XML_PARSER_EPILOG; 5525: } else if ((cur == '<') && (next == '?')) { 5526: if ((!terminate) && 5527: (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5528: goto done; 5529: #ifdef DEBUG_PUSH 5530: xmlGenericError(xmlGenericErrorContext, 5531: "HPP: Parsing PI\n"); 5532: #endif 5533: htmlParsePI(ctxt); 5534: ctxt->instate = XML_PARSER_EPILOG; 5535: } else if ((cur == '<') && (next == '!') && 5536: (avail < 4)) { 5537: goto done; 
5538: } else { 5539: ctxt->errNo = XML_ERR_DOCUMENT_END; 5540: ctxt->wellFormed = 0; 5541: ctxt->instate = XML_PARSER_EOF; 5542: #ifdef DEBUG_PUSH 5543: xmlGenericError(xmlGenericErrorContext, 5544: "HPP: entering EOF\n"); 5545: #endif 5546: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 5547: ctxt->sax->endDocument(ctxt->userData); 5548: goto done; 5549: } 5550: break; 5551: case XML_PARSER_START_TAG: { 5552: const xmlChar *name; 5553: int failed; 5554: const htmlElemDesc * info; 5555: 5556: /* 5557: * no chars in buffer 5558: */ 5559: if (avail < 1) 5560: goto done; 5561: /* 5562: * not enouth chars in buffer 5563: */ 5564: if (avail < 2) { 5565: if (!terminate) 5566: goto done; 5567: else 5568: next = ' '; 5569: } else { 5570: next = in->cur[1]; 5571: } 5572: cur = in->cur[0]; 5573: if (cur != '<') { 5574: ctxt->instate = XML_PARSER_CONTENT; 5575: #ifdef DEBUG_PUSH 5576: xmlGenericError(xmlGenericErrorContext, 5577: "HPP: entering CONTENT\n"); 5578: #endif 5579: break; 5580: } 5581: if (next == '/') { 5582: ctxt->instate = XML_PARSER_END_TAG; 5583: ctxt->checkIndex = 0; 5584: #ifdef DEBUG_PUSH 5585: xmlGenericError(xmlGenericErrorContext, 5586: "HPP: entering END_TAG\n"); 5587: #endif 5588: break; 5589: } 5590: if ((!terminate) && 5591: (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5592: goto done; 5593: 5594: /* Capture start position */ 5595: if (ctxt->record_info) { 5596: node_info.begin_pos = ctxt->input->consumed + 5597: (CUR_PTR - ctxt->input->base); 5598: node_info.begin_line = ctxt->input->line; 5599: } 5600: 5601: 5602: failed = htmlParseStartTag(ctxt); 5603: name = ctxt->name; 5604: if ((failed == -1) || 5605: (name == NULL)) { 5606: if (CUR == '>') 5607: NEXT; 5608: break; 5609: } 5610: 5611: /* 5612: * Lookup the info for that element. 
5613: */ 5614: info = htmlTagLookup(name); 5615: if (info == NULL) { 5616: htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, 5617: "Tag %s invalid\n", name, NULL); 5618: } 5619: 5620: /* 5621: * Check for an Empty Element labeled the XML/SGML way 5622: */ 5623: if ((CUR == '/') && (NXT(1) == '>')) { 5624: SKIP(2); 5625: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 5626: ctxt->sax->endElement(ctxt->userData, name); 5627: htmlnamePop(ctxt); 5628: ctxt->instate = XML_PARSER_CONTENT; 5629: #ifdef DEBUG_PUSH 5630: xmlGenericError(xmlGenericErrorContext, 5631: "HPP: entering CONTENT\n"); 5632: #endif 5633: break; 5634: } 5635: 5636: if (CUR == '>') { 5637: NEXT; 5638: } else { 5639: htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, 5640: "Couldn't find end of Start Tag %s\n", 5641: name, NULL); 5642: 5643: /* 5644: * end of parsing of this node. 5645: */ 5646: if (xmlStrEqual(name, ctxt->name)) { 5647: nodePop(ctxt); 5648: htmlnamePop(ctxt); 5649: } 5650: 5651: if (ctxt->record_info) 5652: htmlNodeInfoPush(ctxt, &node_info); 5653: 5654: ctxt->instate = XML_PARSER_CONTENT; 5655: #ifdef DEBUG_PUSH 5656: xmlGenericError(xmlGenericErrorContext, 5657: "HPP: entering CONTENT\n"); 5658: #endif 5659: break; 5660: } 5661: 5662: /* 5663: * Check for an Empty Element from DTD definition 5664: */ 5665: if ((info != NULL) && (info->empty)) { 5666: if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) 5667: ctxt->sax->endElement(ctxt->userData, name); 5668: htmlnamePop(ctxt); 5669: } 5670: 5671: if (ctxt->record_info) 5672: htmlNodeInfoPush(ctxt, &node_info); 5673: 5674: ctxt->instate = XML_PARSER_CONTENT; 5675: #ifdef DEBUG_PUSH 5676: xmlGenericError(xmlGenericErrorContext, 5677: "HPP: entering CONTENT\n"); 5678: #endif 5679: break; 5680: } 5681: case XML_PARSER_CONTENT: { 5682: long cons; 5683: /* 5684: * Handle preparsed entities and charRef 5685: */ 5686: if (ctxt->token != 0) { 5687: xmlChar chr[2] = { 0 , 0 } ; 5688: 5689: chr[0] = (xmlChar) ctxt->token; 5690: 
htmlCheckParagraph(ctxt); 5691: if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) 5692: ctxt->sax->characters(ctxt->userData, chr, 1); 5693: ctxt->token = 0; 5694: ctxt->checkIndex = 0; 5695: } 5696: if ((avail == 1) && (terminate)) { 5697: cur = in->cur[0]; 5698: if ((cur != '<') && (cur != '&')) { 5699: if (ctxt->sax != NULL) { 5700: if (IS_BLANK_CH(cur)) { 5701: if (ctxt->keepBlanks) { 5702: if (ctxt->sax->characters != NULL) 5703: ctxt->sax->characters( 5704: ctxt->userData, &cur, 1); 5705: } else { 5706: if (ctxt->sax->ignorableWhitespace != NULL) 5707: ctxt->sax->ignorableWhitespace( 5708: ctxt->userData, &cur, 1); 5709: } 5710: } else { 5711: htmlCheckParagraph(ctxt); 5712: if (ctxt->sax->characters != NULL) 5713: ctxt->sax->characters( 5714: ctxt->userData, &cur, 1); 5715: } 5716: } 5717: ctxt->token = 0; 5718: ctxt->checkIndex = 0; 5719: in->cur++; 5720: break; 5721: } 5722: } 5723: if (avail < 2) 5724: goto done; 5725: cur = in->cur[0]; 5726: next = in->cur[1]; 5727: cons = ctxt->nbChars; 5728: if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) || 5729: (xmlStrEqual(ctxt->name, BAD_CAST"style"))) { 5730: /* 5731: * Handle SCRIPT/STYLE separately 5732: */ 5733: if (!terminate) { 5734: int idx; 5735: xmlChar val; 5736: 5737: idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0); 5738: if (idx < 0) 5739: goto done; 5740: val = in->cur[idx + 2]; 5741: if (val == 0) /* bad cut of input */ 5742: goto done; 5743: } 5744: htmlParseScript(ctxt); 5745: if ((cur == '<') && (next == '/')) { 5746: ctxt->instate = XML_PARSER_END_TAG; 5747: ctxt->checkIndex = 0; 5748: #ifdef DEBUG_PUSH 5749: xmlGenericError(xmlGenericErrorContext, 5750: "HPP: entering END_TAG\n"); 5751: #endif 5752: break; 5753: } 5754: } else { 5755: /* 5756: * Sometimes DOCTYPE arrives in the middle of the document 5757: */ 5758: if ((cur == '<') && (next == '!') && 5759: (UPP(2) == 'D') && (UPP(3) == 'O') && 5760: (UPP(4) == 'C') && (UPP(5) == 'T') && 5761: (UPP(6) == 'Y') && (UPP(7) == 'P') 
&& 5762: (UPP(8) == 'E')) { 5763: if ((!terminate) && 5764: (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5765: goto done; 5766: htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, 5767: "Misplaced DOCTYPE declaration\n", 5768: BAD_CAST "DOCTYPE" , NULL); 5769: htmlParseDocTypeDecl(ctxt); 5770: } else if ((cur == '<') && (next == '!') && 5771: (in->cur[2] == '-') && (in->cur[3] == '-')) { 5772: if ((!terminate) && 5773: (htmlParseLookupSequence( 5774: ctxt, '-', '-', '>', 1, 1) < 0)) 5775: goto done; 5776: #ifdef DEBUG_PUSH 5777: xmlGenericError(xmlGenericErrorContext, 5778: "HPP: Parsing Comment\n"); 5779: #endif 5780: htmlParseComment(ctxt); 5781: ctxt->instate = XML_PARSER_CONTENT; 5782: } else if ((cur == '<') && (next == '?')) { 5783: if ((!terminate) && 5784: (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5785: goto done; 5786: #ifdef DEBUG_PUSH 5787: xmlGenericError(xmlGenericErrorContext, 5788: "HPP: Parsing PI\n"); 5789: #endif 5790: htmlParsePI(ctxt); 5791: ctxt->instate = XML_PARSER_CONTENT; 5792: } else if ((cur == '<') && (next == '!') && (avail < 4)) { 5793: goto done; 5794: } else if ((cur == '<') && (next == '/')) { 5795: ctxt->instate = XML_PARSER_END_TAG; 5796: ctxt->checkIndex = 0; 5797: #ifdef DEBUG_PUSH 5798: xmlGenericError(xmlGenericErrorContext, 5799: "HPP: entering END_TAG\n"); 5800: #endif 5801: break; 5802: } else if (cur == '<') { 5803: ctxt->instate = XML_PARSER_START_TAG; 5804: ctxt->checkIndex = 0; 5805: #ifdef DEBUG_PUSH 5806: xmlGenericError(xmlGenericErrorContext, 5807: "HPP: entering START_TAG\n"); 5808: #endif 5809: break; 5810: } else if (cur == '&') { 5811: if ((!terminate) && 5812: (htmlParseLookupChars(ctxt, 5813: BAD_CAST "; >/", 4) < 0)) 5814: goto done; 5815: #ifdef DEBUG_PUSH 5816: xmlGenericError(xmlGenericErrorContext, 5817: "HPP: Parsing Reference\n"); 5818: #endif 5819: /* TODO: check generation of subtrees if noent !!! 
*/ 5820: htmlParseReference(ctxt); 5821: } else { 5822: /* 5823: * check that the text sequence is complete 5824: * before handing out the data to the parser 5825: * to avoid problems with erroneous end of 5826: * data detection. 5827: */ 5828: if ((!terminate) && 5829: (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0)) 5830: goto done; 5831: ctxt->checkIndex = 0; 5832: #ifdef DEBUG_PUSH 5833: xmlGenericError(xmlGenericErrorContext, 5834: "HPP: Parsing char data\n"); 5835: #endif 5836: htmlParseCharData(ctxt); 5837: } 5838: } 5839: if (cons == ctxt->nbChars) { 5840: if (ctxt->node != NULL) { 5841: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5842: "detected an error in element content\n", 5843: NULL, NULL); 5844: } 5845: NEXT; 5846: break; 5847: } 5848: 5849: break; 5850: } 5851: case XML_PARSER_END_TAG: 5852: if (avail < 2) 5853: goto done; 5854: if ((!terminate) && 5855: (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) 5856: goto done; 5857: htmlParseEndTag(ctxt); 5858: if (ctxt->nameNr == 0) { 5859: ctxt->instate = XML_PARSER_EPILOG; 5860: } else { 5861: ctxt->instate = XML_PARSER_CONTENT; 5862: } 5863: ctxt->checkIndex = 0; 5864: #ifdef DEBUG_PUSH 5865: xmlGenericError(xmlGenericErrorContext, 5866: "HPP: entering CONTENT\n"); 5867: #endif 5868: break; 5869: case XML_PARSER_CDATA_SECTION: 5870: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5871: "HPP: internal error, state == CDATA\n", 5872: NULL, NULL); 5873: ctxt->instate = XML_PARSER_CONTENT; 5874: ctxt->checkIndex = 0; 5875: #ifdef DEBUG_PUSH 5876: xmlGenericError(xmlGenericErrorContext, 5877: "HPP: entering CONTENT\n"); 5878: #endif 5879: break; 5880: case XML_PARSER_DTD: 5881: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5882: "HPP: internal error, state == DTD\n", 5883: NULL, NULL); 5884: ctxt->instate = XML_PARSER_CONTENT; 5885: ctxt->checkIndex = 0; 5886: #ifdef DEBUG_PUSH 5887: xmlGenericError(xmlGenericErrorContext, 5888: "HPP: entering CONTENT\n"); 5889: #endif 5890: break; 5891: case 
XML_PARSER_COMMENT: 5892: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5893: "HPP: internal error, state == COMMENT\n", 5894: NULL, NULL); 5895: ctxt->instate = XML_PARSER_CONTENT; 5896: ctxt->checkIndex = 0; 5897: #ifdef DEBUG_PUSH 5898: xmlGenericError(xmlGenericErrorContext, 5899: "HPP: entering CONTENT\n"); 5900: #endif 5901: break; 5902: case XML_PARSER_PI: 5903: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5904: "HPP: internal error, state == PI\n", 5905: NULL, NULL); 5906: ctxt->instate = XML_PARSER_CONTENT; 5907: ctxt->checkIndex = 0; 5908: #ifdef DEBUG_PUSH 5909: xmlGenericError(xmlGenericErrorContext, 5910: "HPP: entering CONTENT\n"); 5911: #endif 5912: break; 5913: case XML_PARSER_ENTITY_DECL: 5914: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5915: "HPP: internal error, state == ENTITY_DECL\n", 5916: NULL, NULL); 5917: ctxt->instate = XML_PARSER_CONTENT; 5918: ctxt->checkIndex = 0; 5919: #ifdef DEBUG_PUSH 5920: xmlGenericError(xmlGenericErrorContext, 5921: "HPP: entering CONTENT\n"); 5922: #endif 5923: break; 5924: case XML_PARSER_ENTITY_VALUE: 5925: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5926: "HPP: internal error, state == ENTITY_VALUE\n", 5927: NULL, NULL); 5928: ctxt->instate = XML_PARSER_CONTENT; 5929: ctxt->checkIndex = 0; 5930: #ifdef DEBUG_PUSH 5931: xmlGenericError(xmlGenericErrorContext, 5932: "HPP: entering DTD\n"); 5933: #endif 5934: break; 5935: case XML_PARSER_ATTRIBUTE_VALUE: 5936: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5937: "HPP: internal error, state == ATTRIBUTE_VALUE\n", 5938: NULL, NULL); 5939: ctxt->instate = XML_PARSER_START_TAG; 5940: ctxt->checkIndex = 0; 5941: #ifdef DEBUG_PUSH 5942: xmlGenericError(xmlGenericErrorContext, 5943: "HPP: entering START_TAG\n"); 5944: #endif 5945: break; 5946: case XML_PARSER_SYSTEM_LITERAL: 5947: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5948: "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n", 5949: NULL, NULL); 5950: ctxt->instate = XML_PARSER_CONTENT; 5951: ctxt->checkIndex = 0; 
5952: #ifdef DEBUG_PUSH 5953: xmlGenericError(xmlGenericErrorContext, 5954: "HPP: entering CONTENT\n"); 5955: #endif 5956: break; 5957: case XML_PARSER_IGNORE: 5958: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5959: "HPP: internal error, state == XML_PARSER_IGNORE\n", 5960: NULL, NULL); 5961: ctxt->instate = XML_PARSER_CONTENT; 5962: ctxt->checkIndex = 0; 5963: #ifdef DEBUG_PUSH 5964: xmlGenericError(xmlGenericErrorContext, 5965: "HPP: entering CONTENT\n"); 5966: #endif 5967: break; 5968: case XML_PARSER_PUBLIC_LITERAL: 5969: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 5970: "HPP: internal error, state == XML_PARSER_LITERAL\n", 5971: NULL, NULL); 5972: ctxt->instate = XML_PARSER_CONTENT; 5973: ctxt->checkIndex = 0; 5974: #ifdef DEBUG_PUSH 5975: xmlGenericError(xmlGenericErrorContext, 5976: "HPP: entering CONTENT\n"); 5977: #endif 5978: break; 5979: 5980: } 5981: } 5982: done: 5983: if ((avail == 0) && (terminate)) { 5984: htmlAutoCloseOnEnd(ctxt); 5985: if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) { 5986: /* 5987: * SAX: end of the document processing. 
5988: */ 5989: ctxt->instate = XML_PARSER_EOF; 5990: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 5991: ctxt->sax->endDocument(ctxt->userData); 5992: } 5993: } 5994: if ((ctxt->myDoc != NULL) && 5995: ((terminate) || (ctxt->instate == XML_PARSER_EOF) || 5996: (ctxt->instate == XML_PARSER_EPILOG))) { 5997: xmlDtdPtr dtd; 5998: dtd = xmlGetIntSubset(ctxt->myDoc); 5999: if (dtd == NULL) 6000: ctxt->myDoc->intSubset = 6001: xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html", 6002: BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN", 6003: BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"); 6004: } 6005: #ifdef DEBUG_PUSH 6006: xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret); 6007: #endif 6008: return(ret); 6009: } 6010: 6011: /** 6012: * htmlParseChunk: 6013: * @ctxt: an HTML parser context 6014: * @chunk: an char array 6015: * @size: the size in byte of the chunk 6016: * @terminate: last chunk indicator 6017: * 6018: * Parse a Chunk of memory 6019: * 6020: * Returns zero if no error, the xmlParserErrors otherwise. 
6021: */ 6022: int 6023: htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size, 6024: int terminate) { 6025: if ((ctxt == NULL) || (ctxt->input == NULL)) { 6026: htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, 6027: "htmlParseChunk: context error\n", NULL, NULL); 6028: return(XML_ERR_INTERNAL_ERROR); 6029: } 6030: if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && 6031: (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) { 6032: size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input); 6033: size_t cur = ctxt->input->cur - ctxt->input->base; 6034: int res; 6035: 6036: res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk); 6037: if (res < 0) { 6038: ctxt->errNo = XML_PARSER_EOF; 6039: ctxt->disableSAX = 1; 6040: return (XML_PARSER_EOF); 6041: } 6042: xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur); 6043: #ifdef DEBUG_PUSH 6044: xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); 6045: #endif 6046: 6047: #if 0 6048: if ((terminate) || (ctxt->input->buf->buffer->use > 80)) 6049: htmlParseTryOrFinish(ctxt, terminate); 6050: #endif 6051: } else if (ctxt->instate != XML_PARSER_EOF) { 6052: if ((ctxt->input != NULL) && ctxt->input->buf != NULL) { 6053: xmlParserInputBufferPtr in = ctxt->input->buf; 6054: if ((in->encoder != NULL) && (in->buffer != NULL) && 6055: (in->raw != NULL)) { 6056: int nbchars; 6057: size_t base = xmlBufGetInputBase(in->buffer, ctxt->input); 6058: size_t current = ctxt->input->cur - ctxt->input->base; 6059: 6060: nbchars = xmlCharEncInput(in, terminate); 6061: if (nbchars < 0) { 6062: htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, 6063: "encoder error\n", NULL, NULL); 6064: return(XML_ERR_INVALID_ENCODING); 6065: } 6066: xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current); 6067: } 6068: } 6069: } 6070: htmlParseTryOrFinish(ctxt, terminate); 6071: if (terminate) { 6072: if ((ctxt->instate != XML_PARSER_EOF) && 6073: (ctxt->instate != 
XML_PARSER_EPILOG) && 6074: (ctxt->instate != XML_PARSER_MISC)) { 6075: ctxt->errNo = XML_ERR_DOCUMENT_END; 6076: ctxt->wellFormed = 0; 6077: } 6078: if (ctxt->instate != XML_PARSER_EOF) { 6079: if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) 6080: ctxt->sax->endDocument(ctxt->userData); 6081: } 6082: ctxt->instate = XML_PARSER_EOF; 6083: } 6084: return((xmlParserErrors) ctxt->errNo); 6085: } 6086: 6087: /************************************************************************ 6088: * * 6089: * User entry points * 6090: * * 6091: ************************************************************************/ 6092: 6093: /** 6094: * htmlCreatePushParserCtxt: 6095: * @sax: a SAX handler 6096: * @user_data: The user data returned on SAX callbacks 6097: * @chunk: a pointer to an array of chars 6098: * @size: number of chars in the array 6099: * @filename: an optional file name or URI 6100: * @enc: an optional encoding 6101: * 6102: * Create a parser context for using the HTML parser in push mode 6103: * The value of @filename is used for fetching external entities 6104: * and error/warning reports. 
6105: * 6106: * Returns the new parser context or NULL 6107: */ 6108: htmlParserCtxtPtr 6109: htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data, 6110: const char *chunk, int size, const char *filename, 6111: xmlCharEncoding enc) { 6112: htmlParserCtxtPtr ctxt; 6113: htmlParserInputPtr inputStream; 6114: xmlParserInputBufferPtr buf; 6115: 6116: xmlInitParser(); 6117: 6118: buf = xmlAllocParserInputBuffer(enc); 6119: if (buf == NULL) return(NULL); 6120: 6121: ctxt = htmlNewParserCtxt(); 6122: if (ctxt == NULL) { 6123: xmlFreeParserInputBuffer(buf); 6124: return(NULL); 6125: } 6126: if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder) 6127: ctxt->charset=XML_CHAR_ENCODING_UTF8; 6128: if (sax != NULL) { 6129: if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler) 6130: xmlFree(ctxt->sax); 6131: ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler)); 6132: if (ctxt->sax == NULL) { 6133: xmlFree(buf); 6134: xmlFree(ctxt); 6135: return(NULL); 6136: } 6137: memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler)); 6138: if (user_data != NULL) 6139: ctxt->userData = user_data; 6140: } 6141: if (filename == NULL) { 6142: ctxt->directory = NULL; 6143: } else { 6144: ctxt->directory = xmlParserGetDirectory(filename); 6145: } 6146: 6147: inputStream = htmlNewInputStream(ctxt); 6148: if (inputStream == NULL) { 6149: xmlFreeParserCtxt(ctxt); 6150: xmlFree(buf); 6151: return(NULL); 6152: } 6153: 6154: if (filename == NULL) 6155: inputStream->filename = NULL; 6156: else 6157: inputStream->filename = (char *) 6158: xmlCanonicPath((const xmlChar *) filename); 6159: inputStream->buf = buf; 6160: xmlBufResetInput(buf->buffer, inputStream); 6161: 6162: inputPush(ctxt, inputStream); 6163: 6164: if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && 6165: (ctxt->input->buf != NULL)) { 6166: size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input); 6167: size_t cur = ctxt->input->cur - ctxt->input->base; 6168: 6169: 
xmlParserInputBufferPush(ctxt->input->buf, size, chunk); 6170: 6171: xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur); 6172: #ifdef DEBUG_PUSH 6173: xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); 6174: #endif 6175: } 6176: ctxt->progressive = 1; 6177: 6178: return(ctxt); 6179: } 6180: #endif /* LIBXML_PUSH_ENABLED */ 6181: 6182: /** 6183: * htmlSAXParseDoc: 6184: * @cur: a pointer to an array of xmlChar 6185: * @encoding: a free form C string describing the HTML document encoding, or NULL 6186: * @sax: the SAX handler block 6187: * @userData: if using SAX, this pointer will be provided on callbacks. 6188: * 6189: * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks 6190: * to handle parse events. If sax is NULL, fallback to the default DOM 6191: * behavior and return a tree. 6192: * 6193: * Returns the resulting document tree unless SAX is NULL or the document is 6194: * not well formed. 6195: */ 6196: 6197: htmlDocPtr 6198: htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) { 6199: htmlDocPtr ret; 6200: htmlParserCtxtPtr ctxt; 6201: 6202: xmlInitParser(); 6203: 6204: if (cur == NULL) return(NULL); 6205: 6206: 6207: ctxt = htmlCreateDocParserCtxt(cur, encoding); 6208: if (ctxt == NULL) return(NULL); 6209: if (sax != NULL) { 6210: if (ctxt->sax != NULL) xmlFree (ctxt->sax); 6211: ctxt->sax = sax; 6212: ctxt->userData = userData; 6213: } 6214: 6215: htmlParseDocument(ctxt); 6216: ret = ctxt->myDoc; 6217: if (sax != NULL) { 6218: ctxt->sax = NULL; 6219: ctxt->userData = NULL; 6220: } 6221: htmlFreeParserCtxt(ctxt); 6222: 6223: return(ret); 6224: } 6225: 6226: /** 6227: * htmlParseDoc: 6228: * @cur: a pointer to an array of xmlChar 6229: * @encoding: a free form C string describing the HTML document encoding, or NULL 6230: * 6231: * parse an HTML in-memory document and build a tree. 
6232: * 6233: * Returns the resulting document tree 6234: */ 6235: 6236: htmlDocPtr 6237: htmlParseDoc(xmlChar *cur, const char *encoding) { 6238: return(htmlSAXParseDoc(cur, encoding, NULL, NULL)); 6239: } 6240: 6241: 6242: /** 6243: * htmlCreateFileParserCtxt: 6244: * @filename: the filename 6245: * @encoding: a free form C string describing the HTML document encoding, or NULL 6246: * 6247: * Create a parser context for a file content. 6248: * Automatic support for ZLIB/Compress compressed document is provided 6249: * by default if found at compile-time. 6250: * 6251: * Returns the new parser context or NULL 6252: */ 6253: htmlParserCtxtPtr 6254: htmlCreateFileParserCtxt(const char *filename, const char *encoding) 6255: { 6256: htmlParserCtxtPtr ctxt; 6257: htmlParserInputPtr inputStream; 6258: char *canonicFilename; 6259: /* htmlCharEncoding enc; */ 6260: xmlChar *content, *content_line = (xmlChar *) "charset="; 6261: 6262: if (filename == NULL) 6263: return(NULL); 6264: 6265: ctxt = htmlNewParserCtxt(); 6266: if (ctxt == NULL) { 6267: return(NULL); 6268: } 6269: canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename); 6270: if (canonicFilename == NULL) { 6271: #ifdef LIBXML_SAX1_ENABLED 6272: if (xmlDefaultSAXHandler.error != NULL) { 6273: xmlDefaultSAXHandler.error(NULL, "out of memory\n"); 6274: } 6275: #endif 6276: xmlFreeParserCtxt(ctxt); 6277: return(NULL); 6278: } 6279: 6280: inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt); 6281: xmlFree(canonicFilename); 6282: if (inputStream == NULL) { 6283: xmlFreeParserCtxt(ctxt); 6284: return(NULL); 6285: } 6286: 6287: inputPush(ctxt, inputStream); 6288: 6289: /* set encoding */ 6290: if (encoding) { 6291: content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1); 6292: if (content) { 6293: strcpy ((char *)content, (char *)content_line); 6294: strcat ((char *)content, (char *)encoding); 6295: htmlCheckEncoding (ctxt, content); 6296: xmlFree (content); 6297: } 6298: } 
6299: 6300: return(ctxt); 6301: } 6302: 6303: /** 6304: * htmlSAXParseFile: 6305: * @filename: the filename 6306: * @encoding: a free form C string describing the HTML document encoding, or NULL 6307: * @sax: the SAX handler block 6308: * @userData: if using SAX, this pointer will be provided on callbacks. 6309: * 6310: * parse an HTML file and build a tree. Automatic support for ZLIB/Compress 6311: * compressed document is provided by default if found at compile-time. 6312: * It use the given SAX function block to handle the parsing callback. 6313: * If sax is NULL, fallback to the default DOM tree building routines. 6314: * 6315: * Returns the resulting document tree unless SAX is NULL or the document is 6316: * not well formed. 6317: */ 6318: 6319: htmlDocPtr 6320: htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax, 6321: void *userData) { 6322: htmlDocPtr ret; 6323: htmlParserCtxtPtr ctxt; 6324: htmlSAXHandlerPtr oldsax = NULL; 6325: 6326: xmlInitParser(); 6327: 6328: ctxt = htmlCreateFileParserCtxt(filename, encoding); 6329: if (ctxt == NULL) return(NULL); 6330: if (sax != NULL) { 6331: oldsax = ctxt->sax; 6332: ctxt->sax = sax; 6333: ctxt->userData = userData; 6334: } 6335: 6336: htmlParseDocument(ctxt); 6337: 6338: ret = ctxt->myDoc; 6339: if (sax != NULL) { 6340: ctxt->sax = oldsax; 6341: ctxt->userData = NULL; 6342: } 6343: htmlFreeParserCtxt(ctxt); 6344: 6345: return(ret); 6346: } 6347: 6348: /** 6349: * htmlParseFile: 6350: * @filename: the filename 6351: * @encoding: a free form C string describing the HTML document encoding, or NULL 6352: * 6353: * parse an HTML file and build a tree. Automatic support for ZLIB/Compress 6354: * compressed document is provided by default if found at compile-time. 
6355: * 6356: * Returns the resulting document tree 6357: */ 6358: 6359: htmlDocPtr 6360: htmlParseFile(const char *filename, const char *encoding) { 6361: return(htmlSAXParseFile(filename, encoding, NULL, NULL)); 6362: } 6363: 6364: /** 6365: * htmlHandleOmittedElem: 6366: * @val: int 0 or 1 6367: * 6368: * Set and return the previous value for handling HTML omitted tags. 6369: * 6370: * Returns the last value for 0 for no handling, 1 for auto insertion. 6371: */ 6372: 6373: int 6374: htmlHandleOmittedElem(int val) { 6375: int old = htmlOmittedDefaultValue; 6376: 6377: htmlOmittedDefaultValue = val; 6378: return(old); 6379: } 6380: 6381: /** 6382: * htmlElementAllowedHere: 6383: * @parent: HTML parent element 6384: * @elt: HTML element 6385: * 6386: * Checks whether an HTML element may be a direct child of a parent element. 6387: * Note - doesn't check for deprecated elements 6388: * 6389: * Returns 1 if allowed; 0 otherwise. 6390: */ 6391: int 6392: htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) { 6393: const char** p ; 6394: 6395: if ( ! elt || ! parent || ! parent->subelts ) 6396: return 0 ; 6397: 6398: for ( p = parent->subelts; *p; ++p ) 6399: if ( !xmlStrcmp((const xmlChar *)*p, elt) ) 6400: return 1 ; 6401: 6402: return 0 ; 6403: } 6404: /** 6405: * htmlElementStatusHere: 6406: * @parent: HTML parent element 6407: * @elt: HTML element 6408: * 6409: * Checks whether an HTML element may be a direct child of a parent element. 6410: * and if so whether it is valid or deprecated. 6411: * 6412: * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID 6413: */ 6414: htmlStatus 6415: htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) { 6416: if ( ! parent || ! elt ) 6417: return HTML_INVALID ; 6418: if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) ) 6419: return HTML_INVALID ; 6420: 6421: return ( elt->dtd == 0 ) ? 
HTML_VALID : HTML_DEPRECATED ; 6422: } 6423: /** 6424: * htmlAttrAllowed: 6425: * @elt: HTML element 6426: * @attr: HTML attribute 6427: * @legacy: whether to allow deprecated attributes 6428: * 6429: * Checks whether an attribute is valid for an element 6430: * Has full knowledge of Required and Deprecated attributes 6431: * 6432: * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID 6433: */ 6434: htmlStatus 6435: htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) { 6436: const char** p ; 6437: 6438: if ( !elt || ! attr ) 6439: return HTML_INVALID ; 6440: 6441: if ( elt->attrs_req ) 6442: for ( p = elt->attrs_req; *p; ++p) 6443: if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 6444: return HTML_REQUIRED ; 6445: 6446: if ( elt->attrs_opt ) 6447: for ( p = elt->attrs_opt; *p; ++p) 6448: if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 6449: return HTML_VALID ; 6450: 6451: if ( legacy && elt->attrs_depr ) 6452: for ( p = elt->attrs_depr; *p; ++p) 6453: if ( !xmlStrcmp((const xmlChar*)*p, attr) ) 6454: return HTML_DEPRECATED ; 6455: 6456: return HTML_INVALID ; 6457: } 6458: /** 6459: * htmlNodeStatus: 6460: * @node: an htmlNodePtr in a tree 6461: * @legacy: whether to allow deprecated elements (YES is faster here 6462: * for Element nodes) 6463: * 6464: * Checks whether the tree node is valid. Experimental (the author 6465: * only uses the HTML enhancements in a SAX parser) 6466: * 6467: * Return: for Element nodes, a return from htmlElementAllowedHere (if 6468: * legacy allowed) or htmlElementStatusHere (otherwise). 6469: * for Attribute nodes, a return from htmlAttrAllowed 6470: * for other nodes, HTML_NA (no checks performed) 6471: */ 6472: htmlStatus 6473: htmlNodeStatus(const htmlNodePtr node, int legacy) { 6474: if ( ! node ) 6475: return HTML_INVALID ; 6476: 6477: switch ( node->type ) { 6478: case XML_ELEMENT_NODE: 6479: return legacy 6480: ? 
( htmlElementAllowedHere ( 6481: htmlTagLookup(node->parent->name) , node->name 6482: ) ? HTML_VALID : HTML_INVALID ) 6483: : htmlElementStatusHere( 6484: htmlTagLookup(node->parent->name) , 6485: htmlTagLookup(node->name) ) 6486: ; 6487: case XML_ATTRIBUTE_NODE: 6488: return htmlAttrAllowed( 6489: htmlTagLookup(node->parent->name) , node->name, legacy) ; 6490: default: return HTML_NA ; 6491: } 6492: } 6493: /************************************************************************ 6494: * * 6495: * New set (2.6.0) of simpler and more flexible APIs * 6496: * * 6497: ************************************************************************/ 6498: /** 6499: * DICT_FREE: 6500: * @str: a string 6501: * 6502: * Free a string if it is not owned by the "dict" dictionnary in the 6503: * current scope 6504: */ 6505: #define DICT_FREE(str) \ 6506: if ((str) && ((!dict) || \ 6507: (xmlDictOwns(dict, (const xmlChar *)(str)) == 0))) \ 6508: xmlFree((char *)(str)); 6509: 6510: /** 6511: * htmlCtxtReset: 6512: * @ctxt: an HTML parser context 6513: * 6514: * Reset a parser context 6515: */ 6516: void 6517: htmlCtxtReset(htmlParserCtxtPtr ctxt) 6518: { 6519: xmlParserInputPtr input; 6520: xmlDictPtr dict; 6521: 6522: if (ctxt == NULL) 6523: return; 6524: 6525: xmlInitParser(); 6526: dict = ctxt->dict; 6527: 6528: while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */ 6529: xmlFreeInputStream(input); 6530: } 6531: ctxt->inputNr = 0; 6532: ctxt->input = NULL; 6533: 6534: ctxt->spaceNr = 0; 6535: if (ctxt->spaceTab != NULL) { 6536: ctxt->spaceTab[0] = -1; 6537: ctxt->space = &ctxt->spaceTab[0]; 6538: } else { 6539: ctxt->space = NULL; 6540: } 6541: 6542: 6543: ctxt->nodeNr = 0; 6544: ctxt->node = NULL; 6545: 6546: ctxt->nameNr = 0; 6547: ctxt->name = NULL; 6548: 6549: DICT_FREE(ctxt->version); 6550: ctxt->version = NULL; 6551: DICT_FREE(ctxt->encoding); 6552: ctxt->encoding = NULL; 6553: DICT_FREE(ctxt->directory); 6554: ctxt->directory = NULL; 6555: DICT_FREE(ctxt->extSubURI); 
6556: ctxt->extSubURI = NULL; 6557: DICT_FREE(ctxt->extSubSystem); 6558: ctxt->extSubSystem = NULL; 6559: if (ctxt->myDoc != NULL) 6560: xmlFreeDoc(ctxt->myDoc); 6561: ctxt->myDoc = NULL; 6562: 6563: ctxt->standalone = -1; 6564: ctxt->hasExternalSubset = 0; 6565: ctxt->hasPErefs = 0; 6566: ctxt->html = 1; 6567: ctxt->external = 0; 6568: ctxt->instate = XML_PARSER_START; 6569: ctxt->token = 0; 6570: 6571: ctxt->wellFormed = 1; 6572: ctxt->nsWellFormed = 1; 6573: ctxt->disableSAX = 0; 6574: ctxt->valid = 1; 6575: ctxt->vctxt.userData = ctxt; 6576: ctxt->vctxt.error = xmlParserValidityError; 6577: ctxt->vctxt.warning = xmlParserValidityWarning; 6578: ctxt->record_info = 0; 6579: ctxt->nbChars = 0; 6580: ctxt->checkIndex = 0; 6581: ctxt->inSubset = 0; 6582: ctxt->errNo = XML_ERR_OK; 6583: ctxt->depth = 0; 6584: ctxt->charset = XML_CHAR_ENCODING_NONE; 6585: ctxt->catalogs = NULL; 6586: xmlInitNodeInfoSeq(&ctxt->node_seq); 6587: 6588: if (ctxt->attsDefault != NULL) { 6589: xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree); 6590: ctxt->attsDefault = NULL; 6591: } 6592: if (ctxt->attsSpecial != NULL) { 6593: xmlHashFree(ctxt->attsSpecial, NULL); 6594: ctxt->attsSpecial = NULL; 6595: } 6596: } 6597: 6598: /** 6599: * htmlCtxtUseOptions: 6600: * @ctxt: an HTML parser context 6601: * @options: a combination of htmlParserOption(s) 6602: * 6603: * Applies the options to the parser context 6604: * 6605: * Returns 0 in case of success, the set of unknown or unimplemented options 6606: * in case of error. 
6607: */ 6608: int 6609: htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options) 6610: { 6611: if (ctxt == NULL) 6612: return(-1); 6613: 6614: if (options & HTML_PARSE_NOWARNING) { 6615: ctxt->sax->warning = NULL; 6616: ctxt->vctxt.warning = NULL; 6617: options -= XML_PARSE_NOWARNING; 6618: ctxt->options |= XML_PARSE_NOWARNING; 6619: } 6620: if (options & HTML_PARSE_NOERROR) { 6621: ctxt->sax->error = NULL; 6622: ctxt->vctxt.error = NULL; 6623: ctxt->sax->fatalError = NULL; 6624: options -= XML_PARSE_NOERROR; 6625: ctxt->options |= XML_PARSE_NOERROR; 6626: } 6627: if (options & HTML_PARSE_PEDANTIC) { 6628: ctxt->pedantic = 1; 6629: options -= XML_PARSE_PEDANTIC; 6630: ctxt->options |= XML_PARSE_PEDANTIC; 6631: } else 6632: ctxt->pedantic = 0; 6633: if (options & XML_PARSE_NOBLANKS) { 6634: ctxt->keepBlanks = 0; 6635: ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace; 6636: options -= XML_PARSE_NOBLANKS; 6637: ctxt->options |= XML_PARSE_NOBLANKS; 6638: } else 6639: ctxt->keepBlanks = 1; 6640: if (options & HTML_PARSE_RECOVER) { 6641: ctxt->recovery = 1; 6642: options -= HTML_PARSE_RECOVER; 6643: } else 6644: ctxt->recovery = 0; 6645: if (options & HTML_PARSE_COMPACT) { 6646: ctxt->options |= HTML_PARSE_COMPACT; 6647: options -= HTML_PARSE_COMPACT; 6648: } 6649: if (options & XML_PARSE_HUGE) { 6650: ctxt->options |= XML_PARSE_HUGE; 6651: options -= XML_PARSE_HUGE; 6652: } 6653: if (options & HTML_PARSE_NODEFDTD) { 6654: ctxt->options |= HTML_PARSE_NODEFDTD; 6655: options -= HTML_PARSE_NODEFDTD; 6656: } 6657: if (options & HTML_PARSE_IGNORE_ENC) { 6658: ctxt->options |= HTML_PARSE_IGNORE_ENC; 6659: options -= HTML_PARSE_IGNORE_ENC; 6660: } 6661: if (options & HTML_PARSE_NOIMPLIED) { 6662: ctxt->options |= HTML_PARSE_NOIMPLIED; 6663: options -= HTML_PARSE_NOIMPLIED; 6664: } 6665: ctxt->dictNames = 0; 6666: return (options); 6667: } 6668: 6669: /** 6670: * htmlDoRead: 6671: * @ctxt: an HTML parser context 6672: * @URL: the base URL to use for the document 
6673: * @encoding: the document encoding, or NULL 6674: * @options: a combination of htmlParserOption(s) 6675: * @reuse: keep the context for reuse 6676: * 6677: * Common front-end for the htmlRead functions 6678: * 6679: * Returns the resulting document tree or NULL 6680: */ 6681: static htmlDocPtr 6682: htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding, 6683: int options, int reuse) 6684: { 6685: htmlDocPtr ret; 6686: 6687: htmlCtxtUseOptions(ctxt, options); 6688: ctxt->html = 1; 6689: if (encoding != NULL) { 6690: xmlCharEncodingHandlerPtr hdlr; 6691: 6692: hdlr = xmlFindCharEncodingHandler(encoding); 6693: if (hdlr != NULL) { 6694: xmlSwitchToEncoding(ctxt, hdlr); 6695: if (ctxt->input->encoding != NULL) 6696: xmlFree((xmlChar *) ctxt->input->encoding); 6697: ctxt->input->encoding = xmlStrdup((xmlChar *)encoding); 6698: } 6699: } 6700: if ((URL != NULL) && (ctxt->input != NULL) && 6701: (ctxt->input->filename == NULL)) 6702: ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL); 6703: htmlParseDocument(ctxt); 6704: ret = ctxt->myDoc; 6705: ctxt->myDoc = NULL; 6706: if (!reuse) { 6707: if ((ctxt->dictNames) && 6708: (ret != NULL) && 6709: (ret->dict == ctxt->dict)) 6710: ctxt->dict = NULL; 6711: xmlFreeParserCtxt(ctxt); 6712: } 6713: return (ret); 6714: } 6715: 6716: /** 6717: * htmlReadDoc: 6718: * @cur: a pointer to a zero terminated string 6719: * @URL: the base URL to use for the document 6720: * @encoding: the document encoding, or NULL 6721: * @options: a combination of htmlParserOption(s) 6722: * 6723: * parse an XML in-memory document and build a tree. 
6724: * 6725: * Returns the resulting document tree 6726: */ 6727: htmlDocPtr 6728: htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options) 6729: { 6730: htmlParserCtxtPtr ctxt; 6731: 6732: if (cur == NULL) 6733: return (NULL); 6734: 6735: xmlInitParser(); 6736: ctxt = htmlCreateDocParserCtxt(cur, NULL); 6737: if (ctxt == NULL) 6738: return (NULL); 6739: return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6740: } 6741: 6742: /** 6743: * htmlReadFile: 6744: * @filename: a file or URL 6745: * @encoding: the document encoding, or NULL 6746: * @options: a combination of htmlParserOption(s) 6747: * 6748: * parse an XML file from the filesystem or the network. 6749: * 6750: * Returns the resulting document tree 6751: */ 6752: htmlDocPtr 6753: htmlReadFile(const char *filename, const char *encoding, int options) 6754: { 6755: htmlParserCtxtPtr ctxt; 6756: 6757: xmlInitParser(); 6758: ctxt = htmlCreateFileParserCtxt(filename, encoding); 6759: if (ctxt == NULL) 6760: return (NULL); 6761: return (htmlDoRead(ctxt, NULL, NULL, options, 0)); 6762: } 6763: 6764: /** 6765: * htmlReadMemory: 6766: * @buffer: a pointer to a char array 6767: * @size: the size of the array 6768: * @URL: the base URL to use for the document 6769: * @encoding: the document encoding, or NULL 6770: * @options: a combination of htmlParserOption(s) 6771: * 6772: * parse an XML in-memory document and build a tree. 
6773: * 6774: * Returns the resulting document tree 6775: */ 6776: htmlDocPtr 6777: htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options) 6778: { 6779: htmlParserCtxtPtr ctxt; 6780: 6781: xmlInitParser(); 6782: ctxt = xmlCreateMemoryParserCtxt(buffer, size); 6783: if (ctxt == NULL) 6784: return (NULL); 6785: htmlDefaultSAXHandlerInit(); 6786: if (ctxt->sax != NULL) 6787: memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1)); 6788: return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6789: } 6790: 6791: /** 6792: * htmlReadFd: 6793: * @fd: an open file descriptor 6794: * @URL: the base URL to use for the document 6795: * @encoding: the document encoding, or NULL 6796: * @options: a combination of htmlParserOption(s) 6797: * 6798: * parse an XML from a file descriptor and build a tree. 6799: * 6800: * Returns the resulting document tree 6801: */ 6802: htmlDocPtr 6803: htmlReadFd(int fd, const char *URL, const char *encoding, int options) 6804: { 6805: htmlParserCtxtPtr ctxt; 6806: xmlParserInputBufferPtr input; 6807: xmlParserInputPtr stream; 6808: 6809: if (fd < 0) 6810: return (NULL); 6811: 6812: xmlInitParser(); 6813: input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); 6814: if (input == NULL) 6815: return (NULL); 6816: ctxt = xmlNewParserCtxt(); 6817: if (ctxt == NULL) { 6818: xmlFreeParserInputBuffer(input); 6819: return (NULL); 6820: } 6821: stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6822: if (stream == NULL) { 6823: xmlFreeParserInputBuffer(input); 6824: xmlFreeParserCtxt(ctxt); 6825: return (NULL); 6826: } 6827: inputPush(ctxt, stream); 6828: return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6829: } 6830: 6831: /** 6832: * htmlReadIO: 6833: * @ioread: an I/O read function 6834: * @ioclose: an I/O close function 6835: * @ioctx: an I/O handler 6836: * @URL: the base URL to use for the document 6837: * @encoding: the document encoding, or NULL 6838: * @options: a 
combination of htmlParserOption(s) 6839: * 6840: * parse an HTML document from I/O functions and source and build a tree. 6841: * 6842: * Returns the resulting document tree 6843: */ 6844: htmlDocPtr 6845: htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, 6846: void *ioctx, const char *URL, const char *encoding, int options) 6847: { 6848: htmlParserCtxtPtr ctxt; 6849: xmlParserInputBufferPtr input; 6850: xmlParserInputPtr stream; 6851: 6852: if (ioread == NULL) 6853: return (NULL); 6854: xmlInitParser(); 6855: 6856: input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx, 6857: XML_CHAR_ENCODING_NONE); 6858: if (input == NULL) { 6859: if (ioclose != NULL) 6860: ioclose(ioctx); 6861: return (NULL); 6862: } 6863: ctxt = htmlNewParserCtxt(); 6864: if (ctxt == NULL) { 6865: xmlFreeParserInputBuffer(input); 6866: return (NULL); 6867: } 6868: stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6869: if (stream == NULL) { 6870: xmlFreeParserInputBuffer(input); 6871: xmlFreeParserCtxt(ctxt); 6872: return (NULL); 6873: } 6874: inputPush(ctxt, stream); 6875: return (htmlDoRead(ctxt, URL, encoding, options, 0)); 6876: } 6877: 6878: /** 6879: * htmlCtxtReadDoc: 6880: * @ctxt: an HTML parser context 6881: * @cur: a pointer to a zero terminated string 6882: * @URL: the base URL to use for the document 6883: * @encoding: the document encoding, or NULL 6884: * @options: a combination of htmlParserOption(s) 6885: * 6886: * parse an XML in-memory document and build a tree. 
6887: * This reuses the existing @ctxt parser context 6888: * 6889: * Returns the resulting document tree 6890: */ 6891: htmlDocPtr 6892: htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur, 6893: const char *URL, const char *encoding, int options) 6894: { 6895: xmlParserInputPtr stream; 6896: 6897: if (cur == NULL) 6898: return (NULL); 6899: if (ctxt == NULL) 6900: return (NULL); 6901: 6902: htmlCtxtReset(ctxt); 6903: 6904: stream = xmlNewStringInputStream(ctxt, cur); 6905: if (stream == NULL) { 6906: return (NULL); 6907: } 6908: inputPush(ctxt, stream); 6909: return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6910: } 6911: 6912: /** 6913: * htmlCtxtReadFile: 6914: * @ctxt: an HTML parser context 6915: * @filename: a file or URL 6916: * @encoding: the document encoding, or NULL 6917: * @options: a combination of htmlParserOption(s) 6918: * 6919: * parse an XML file from the filesystem or the network. 6920: * This reuses the existing @ctxt parser context 6921: * 6922: * Returns the resulting document tree 6923: */ 6924: htmlDocPtr 6925: htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename, 6926: const char *encoding, int options) 6927: { 6928: xmlParserInputPtr stream; 6929: 6930: if (filename == NULL) 6931: return (NULL); 6932: if (ctxt == NULL) 6933: return (NULL); 6934: 6935: htmlCtxtReset(ctxt); 6936: 6937: stream = xmlLoadExternalEntity(filename, NULL, ctxt); 6938: if (stream == NULL) { 6939: return (NULL); 6940: } 6941: inputPush(ctxt, stream); 6942: return (htmlDoRead(ctxt, NULL, encoding, options, 1)); 6943: } 6944: 6945: /** 6946: * htmlCtxtReadMemory: 6947: * @ctxt: an HTML parser context 6948: * @buffer: a pointer to a char array 6949: * @size: the size of the array 6950: * @URL: the base URL to use for the document 6951: * @encoding: the document encoding, or NULL 6952: * @options: a combination of htmlParserOption(s) 6953: * 6954: * parse an XML in-memory document and build a tree. 
6955: * This reuses the existing @ctxt parser context 6956: * 6957: * Returns the resulting document tree 6958: */ 6959: htmlDocPtr 6960: htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size, 6961: const char *URL, const char *encoding, int options) 6962: { 6963: xmlParserInputBufferPtr input; 6964: xmlParserInputPtr stream; 6965: 6966: if (ctxt == NULL) 6967: return (NULL); 6968: if (buffer == NULL) 6969: return (NULL); 6970: 6971: htmlCtxtReset(ctxt); 6972: 6973: input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE); 6974: if (input == NULL) { 6975: return(NULL); 6976: } 6977: 6978: stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 6979: if (stream == NULL) { 6980: xmlFreeParserInputBuffer(input); 6981: return(NULL); 6982: } 6983: 6984: inputPush(ctxt, stream); 6985: return (htmlDoRead(ctxt, URL, encoding, options, 1)); 6986: } 6987: 6988: /** 6989: * htmlCtxtReadFd: 6990: * @ctxt: an HTML parser context 6991: * @fd: an open file descriptor 6992: * @URL: the base URL to use for the document 6993: * @encoding: the document encoding, or NULL 6994: * @options: a combination of htmlParserOption(s) 6995: * 6996: * parse an XML from a file descriptor and build a tree. 
6997: * This reuses the existing @ctxt parser context 6998: * 6999: * Returns the resulting document tree 7000: */ 7001: htmlDocPtr 7002: htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd, 7003: const char *URL, const char *encoding, int options) 7004: { 7005: xmlParserInputBufferPtr input; 7006: xmlParserInputPtr stream; 7007: 7008: if (fd < 0) 7009: return (NULL); 7010: if (ctxt == NULL) 7011: return (NULL); 7012: 7013: htmlCtxtReset(ctxt); 7014: 7015: 7016: input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); 7017: if (input == NULL) 7018: return (NULL); 7019: stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 7020: if (stream == NULL) { 7021: xmlFreeParserInputBuffer(input); 7022: return (NULL); 7023: } 7024: inputPush(ctxt, stream); 7025: return (htmlDoRead(ctxt, URL, encoding, options, 1)); 7026: } 7027: 7028: /** 7029: * htmlCtxtReadIO: 7030: * @ctxt: an HTML parser context 7031: * @ioread: an I/O read function 7032: * @ioclose: an I/O close function 7033: * @ioctx: an I/O handler 7034: * @URL: the base URL to use for the document 7035: * @encoding: the document encoding, or NULL 7036: * @options: a combination of htmlParserOption(s) 7037: * 7038: * parse an HTML document from I/O functions and source and build a tree. 
7039: * This reuses the existing @ctxt parser context 7040: * 7041: * Returns the resulting document tree 7042: */ 7043: htmlDocPtr 7044: htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread, 7045: xmlInputCloseCallback ioclose, void *ioctx, 7046: const char *URL, 7047: const char *encoding, int options) 7048: { 7049: xmlParserInputBufferPtr input; 7050: xmlParserInputPtr stream; 7051: 7052: if (ioread == NULL) 7053: return (NULL); 7054: if (ctxt == NULL) 7055: return (NULL); 7056: 7057: htmlCtxtReset(ctxt); 7058: 7059: input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx, 7060: XML_CHAR_ENCODING_NONE); 7061: if (input == NULL) { 7062: if (ioclose != NULL) 7063: ioclose(ioctx); 7064: return (NULL); 7065: } 7066: stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); 7067: if (stream == NULL) { 7068: xmlFreeParserInputBuffer(input); 7069: return (NULL); 7070: } 7071: inputPush(ctxt, stream); 7072: return (htmlDoRead(ctxt, URL, encoding, options, 1)); 7073: } 7074: 7075: #define bottom_HTMLparser 7076: #include "elfgcchack.h" 7077: #endif /* LIBXML_HTML_ENABLED */