--- embedaddon/libxml2/HTMLparser.c 2012/02/21 23:37:58 1.1 +++ embedaddon/libxml2/HTMLparser.c 2014/06/15 19:53:28 1.1.1.3 @@ -44,6 +44,9 @@ #include #include +#include "buf.h" +#include "enc.h" + #define HTML_MAX_NAMELEN 1000 #define HTML_PARSER_BIG_BUFFER_SIZE 1000 #define HTML_PARSER_BUFFER_SIZE 100 @@ -727,7 +730,7 @@ static const char* const map_contents[] = { BLOCK, "ar static const char* const name_attr[] = { "name", NULL } ; static const char* const action_attr[] = { "action", NULL } ; static const char* const blockli_elt[] = { BLOCK, "li", NULL } ; -static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ; +static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ; static const char* const content_attr[] = { "content", NULL } ; static const char* const type_attr[] = { "type", NULL } ; static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ; @@ -1080,9 +1083,9 @@ static const char * const htmlStartClose[] = { "menu", "p", "head", "ul", NULL, "p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL, "div", "p", "head", NULL, -"noscript", "p", "head", NULL, +"noscript", "p", NULL, "center", "font", "b", "i", "p", "head", NULL, -"a", "a", NULL, +"a", "a", "head", NULL, "caption", "p", NULL, "colgroup", "caption", "colgroup", "col", "p", NULL, "col", "caption", "col", "p", NULL, @@ -1100,6 +1103,43 @@ static const char * const htmlStartClose[] = { "option", "option", NULL, "fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "listing", "xmp", "a", NULL, +/* most tags in in FONTSTYLE, PHRASE and SPECIAL should close */ +"tt", "head", NULL, +"i", "head", NULL, +"b", "head", NULL, +"u", "head", NULL, +"s", "head", NULL, +"strike", "head", NULL, +"big", "head", NULL, +"small", "head", NULL, + +"em", "head", NULL, +"strong", "head", NULL, +"dfn", "head", NULL, +"code", "head", NULL, +"samp", "head", NULL, +"kbd", "head", NULL, +"var", "head", NULL, +"cite", "head", NULL, +"abbr", "head", NULL, +"acronym", "head", NULL, + +/* "a" */ +"img", "head", NULL, +/* "applet" */ +/* "embed" */ +/* "object" */ +"font", "head", NULL, +/* "basefont" */ +"br", "head", NULL, +/* "script" */ +"map", "head", NULL, +"q", "head", NULL, +"sub", "head", NULL, +"sup", "head", NULL, +"span", "head", NULL, +"bdo", "head", NULL, +"iframe", "head", NULL, NULL }; @@ -2941,9 +2981,14 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) { */ if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { if (areBlanks(ctxt, buf, nbchar)) { - if (ctxt->sax->ignorableWhitespace != NULL) - ctxt->sax->ignorableWhitespace(ctxt->userData, - buf, nbchar); + if (ctxt->keepBlanks) { + if (ctxt->sax->characters != NULL) + ctxt->sax->characters(ctxt->userData, buf, nbchar); + } else { + if (ctxt->sax->ignorableWhitespace != NULL) + ctxt->sax->ignorableWhitespace(ctxt->userData, + buf, nbchar); + } } else { htmlCheckParagraph(ctxt); if (ctxt->sax->characters != NULL) @@ -2974,8 +3019,14 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) { */ if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { if (areBlanks(ctxt, buf, nbchar)) { - if (ctxt->sax->ignorableWhitespace != NULL) - ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar); + if (ctxt->keepBlanks) { + if (ctxt->sax->characters != NULL) + ctxt->sax->characters(ctxt->userData, buf, nbchar); + } else { + if (ctxt->sax->ignorableWhitespace != NULL) + ctxt->sax->ignorableWhitespace(ctxt->userData, + buf, nbchar); + } } else { htmlCheckParagraph(ctxt); if (ctxt->sax->characters != NULL) @@ -3435,35 +3486,27 @@ htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **v } /** - * htmlCheckEncoding: + * htmlCheckEncodingDirect: * @ctxt: an HTML parser context * @attvalue: the attribute value * - * Checks an http-equiv attribute from a Meta tag to detect + * Checks an attribute value to detect * the encoding * If a new encoding is detected the parser is switched to decode * it and pass UTF8 */ static void -htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) { - const xmlChar *encoding; +htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) { - if ((ctxt == NULL) || (attvalue == NULL)) + if ((ctxt == NULL) || (encoding == NULL) || + (ctxt->options & HTML_PARSE_IGNORE_ENC)) return; /* do not change encoding */ if (ctxt->input->encoding != NULL) return; - encoding = xmlStrcasestr(attvalue, BAD_CAST"charset="); if (encoding != NULL) { - encoding += 8; - } else { - encoding = xmlStrcasestr(attvalue, BAD_CAST"charset ="); - if (encoding != NULL) - encoding += 9; - } - if (encoding != NULL) { xmlCharEncoding enc; xmlCharEncodingHandlerPtr handler; @@ -3500,7 +3543,9 @@ htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlCha xmlSwitchToEncoding(ctxt, handler); ctxt->charset = XML_CHAR_ENCODING_UTF8; } else { - ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; + htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, + "htmlCheckEncoding: unknown encoding %s\n", + encoding, NULL); } } @@ -3515,24 +3560,51 @@ htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlCha * convert as much as possible to the parser reading buffer. */ processed = ctxt->input->cur - ctxt->input->base; - xmlBufferShrink(ctxt->input->buf->buffer, processed); - nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder, - ctxt->input->buf->buffer, - ctxt->input->buf->raw); + xmlBufShrink(ctxt->input->buf->buffer, processed); + nbchars = xmlCharEncInput(ctxt->input->buf, 1); if (nbchars < 0) { htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, "htmlCheckEncoding: encoder error\n", NULL, NULL); } - ctxt->input->base = - ctxt->input->cur = ctxt->input->buf->buffer->content; - ctxt->input->end = - &ctxt->input->base[ctxt->input->buf->buffer->use]; + xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input); } } } /** + * htmlCheckEncoding: + * @ctxt: an HTML parser context + * @attvalue: the attribute value + * + * Checks an http-equiv attribute from a Meta tag to detect + * the encoding + * If a new encoding is detected the parser is switched to decode + * it and pass UTF8 + */ +static void +htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) { + const xmlChar *encoding; + + if (!attvalue) + return; + + encoding = xmlStrcasestr(attvalue, BAD_CAST"charset"); + if (encoding != NULL) { + encoding += 7; + } + /* + * skip blank + */ + if (encoding && IS_BLANK_CH(*encoding)) + encoding = xmlStrcasestr(attvalue, BAD_CAST"="); + if (encoding && *encoding == '=') { + encoding ++; + htmlCheckEncodingDirect(ctxt, encoding); + } +} + +/** * htmlCheckMeta: * @ctxt: an HTML parser context * @atts: the attributes values @@ -3556,6 +3628,8 @@ htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar ** if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv")) && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) http = 1; + else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset"))) + htmlCheckEncodingDirect(ctxt, value); else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content"))) content = value; att = atts[i++]; @@ -3885,6 +3959,7 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt) if ((oldname != NULL) && (xmlStrEqual(oldname, name))) { if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) ctxt->sax->endElement(ctxt->userData, name); + htmlNodeInfoPop(ctxt); htmlnamePop(ctxt); ret = 1; } else { @@ -4877,9 +4952,7 @@ htmlCreateMemoryParserCtxt(const char *buffer, int siz input->filename = NULL; input->buf = buf; - input->base = input->buf->buffer->content; - input->cur = input->buf->buffer->content; - input->end = &input->buf->buffer->content[input->buf->buffer->use]; + xmlBufResetInput(buf->buffer, input); inputPush(ctxt, input); return(ctxt); @@ -4996,8 +5069,8 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlCha buf = in->base; len = in->length; } else { - buf = in->buf->buffer->content; - len = in->buf->buffer->use; + buf = xmlBufContent(in->buf->buffer); + len = xmlBufUse(in->buf->buffer); } /* take into account the sequence length */ @@ -5089,13 +5162,13 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlCha * @stop: Array of chars, which stop the lookup. * @stopLen: Length of stop-Array * - * Try to find if any char of the stop-Array is available in the input + * Try to find if any char of the stop-Array is available in the input * stream. * This function has a side effect of (possibly) incrementing ctxt->checkIndex * to avoid rescanning sequences of bytes, it DOES change the state of the * parser, do not use liberally. * - * Returns the index to the current parsing point if a stopChar + * Returns the index to the current parsing point if a stopChar * is available, -1 otherwise. */ static int @@ -5123,8 +5196,8 @@ htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xml buf = in->base; len = in->length; } else { - buf = in->buf->buffer->content; - len = in->buf->buffer->use; + buf = xmlBufContent(in->buf->buffer); + len = xmlBufUse(in->buf->buffer); } for (; base < len; base++) { @@ -5173,6 +5246,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int termi int avail = 0; xmlChar cur, next; + htmlParserNodeInfo node_info; + #ifdef DEBUG_PUSH switch (ctxt->instate) { case XML_PARSER_EOF: @@ -5233,7 +5308,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int termi if (in->buf == NULL) avail = in->length - (in->cur - in->base); else - avail = in->buf->buffer->use - (in->cur - in->base); + avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); if ((avail == 0) && (terminate)) { htmlAutoCloseOnEnd(ctxt); if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) { @@ -5269,7 +5344,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int termi if (in->buf == NULL) avail = in->length - (in->cur - in->base); else - avail = in->buf->buffer->use - (in->cur - in->base); + avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); } if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) ctxt->sax->setDocumentLocator(ctxt->userData, @@ -5311,11 +5386,24 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int termi if (in->buf == NULL) avail = in->length - (in->cur - in->base); else - avail = in->buf->buffer->use - (in->cur - in->base); - if (avail < 2) + avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); + /* + * no chars in buffer + */ + if (avail < 1) goto done; + /* + * not enouth chars in buffer + */ + if (avail < 2) { + if (!terminate) + goto done; + else + next = ' '; + } else { + next = in->cur[1]; + } cur = in->cur[0]; - next = in->cur[1]; if ((cur == '<') && (next == '!') && (in->cur[2] == '-') && (in->cur[3] == '-')) { if ((!terminate) && @@ -5371,7 +5459,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int termi if (in->buf == NULL) avail = in->length - (in->cur - in->base); else - avail = in->buf->buffer->use - (in->cur - in->base); + avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); if (avail < 2) goto done; cur = in->cur[0]; @@ -5412,7 +5500,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int termi if (in->buf == NULL) avail = in->length - (in->cur - in->base); else - avail = in->buf->buffer->use - (in->cur - in->base); + avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); if (avail < 1) goto done; cur = in->cur[0]; @@ -5465,8 +5553,22 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int termi int failed; const htmlElemDesc * info; - if (avail < 2) + /* + * no chars in buffer + */ + if (avail < 1) goto done; + /* + * not enouth chars in buffer + */ + if (avail < 2) { + if (!terminate) + goto done; + else + next = ' '; + } else { + next = in->cur[1]; + } cur = in->cur[0]; if (cur != '<') { ctxt->instate = XML_PARSER_CONTENT; @@ -5476,7 +5578,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int termi #endif break; } - if (in->cur[1] == '/') { + if (next == '/') { ctxt->instate = XML_PARSER_END_TAG; ctxt->checkIndex = 0; #ifdef DEBUG_PUSH @@ -5489,6 +5591,14 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int termi (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) goto done; + /* Capture start position */ + if (ctxt->record_info) { + node_info.begin_pos = ctxt->input->consumed + + (CUR_PTR - ctxt->input->base); + node_info.begin_line = ctxt->input->line; + } + + failed = htmlParseStartTag(ctxt); name = ctxt->name; if ((failed == -1) || @@ -5538,6 +5648,9 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int termi htmlnamePop(ctxt); } + if (ctxt->record_info) + htmlNodeInfoPush(ctxt, &node_info); + ctxt->instate = XML_PARSER_CONTENT; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, @@ -5554,6 +5667,10 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int termi ctxt->sax->endElement(ctxt->userData, name); htmlnamePop(ctxt); } + + if (ctxt->record_info) + htmlNodeInfoPush(ctxt, &node_info); + ctxt->instate = XML_PARSER_CONTENT; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, @@ -5581,9 +5698,15 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int termi if ((cur != '<') && (cur != '&')) { if (ctxt->sax != NULL) { if (IS_BLANK_CH(cur)) { - if (ctxt->sax->ignorableWhitespace != NULL) - ctxt->sax->ignorableWhitespace( - ctxt->userData, &cur, 1); + if (ctxt->keepBlanks) { + if (ctxt->sax->characters != NULL) + ctxt->sax->characters( + ctxt->userData, &cur, 1); + } else { + if (ctxt->sax->ignorableWhitespace != NULL) + ctxt->sax->ignorableWhitespace( + ctxt->userData, &cur, 1); + } } else { htmlCheckParagraph(ctxt); if (ctxt->sax->characters != NULL) @@ -5906,8 +6029,8 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chu } if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) { - int base = ctxt->input->base - ctxt->input->buf->buffer->content; - int cur = ctxt->input->cur - ctxt->input->base; + size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input); + size_t cur = ctxt->input->cur - ctxt->input->base; int res; res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk); @@ -5916,10 +6039,7 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chu ctxt->disableSAX = 1; return (XML_PARSER_EOF); } - ctxt->input->base = ctxt->input->buf->buffer->content + base; - ctxt->input->cur = ctxt->input->base + cur; - ctxt->input->end = - &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use]; + xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur); #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); #endif @@ -5934,13 +6054,16 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chu if ((in->encoder != NULL) && (in->buffer != NULL) && (in->raw != NULL)) { int nbchars; + size_t base = xmlBufGetInputBase(in->buffer, ctxt->input); + size_t current = ctxt->input->cur - ctxt->input->base; - nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw); + nbchars = xmlCharEncInput(in, terminate); if (nbchars < 0) { htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, "encoder error\n", NULL, NULL); return(XML_ERR_INVALID_ENCODING); } + xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current); } } } @@ -6034,24 +6157,18 @@ htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void * inputStream->filename = (char *) xmlCanonicPath((const xmlChar *) filename); inputStream->buf = buf; - inputStream->base = inputStream->buf->buffer->content; - inputStream->cur = inputStream->buf->buffer->content; - inputStream->end = - &inputStream->buf->buffer->content[inputStream->buf->buffer->use]; + xmlBufResetInput(buf->buffer, inputStream); inputPush(ctxt, inputStream); if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && (ctxt->input->buf != NULL)) { - int base = ctxt->input->base - ctxt->input->buf->buffer->content; - int cur = ctxt->input->cur - ctxt->input->base; + size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input); + size_t cur = ctxt->input->cur - ctxt->input->base; xmlParserInputBufferPush(ctxt->input->buf, size, chunk); - ctxt->input->base = ctxt->input->buf->buffer->content + base; - ctxt->input->cur = ctxt->input->base + cur; - ctxt->input->end = - &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use]; + xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur); #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); #endif @@ -6537,6 +6654,14 @@ htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options ctxt->options |= HTML_PARSE_NODEFDTD; options -= HTML_PARSE_NODEFDTD; } + if (options & HTML_PARSE_IGNORE_ENC) { + ctxt->options |= HTML_PARSE_IGNORE_ENC; + options -= HTML_PARSE_IGNORE_ENC; + } + if (options & HTML_PARSE_NOIMPLIED) { + ctxt->options |= HTML_PARSE_NOIMPLIED; + options -= HTML_PARSE_NOIMPLIED; + } ctxt->dictNames = 0; return (options); } @@ -6730,8 +6855,11 @@ htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseC input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx, XML_CHAR_ENCODING_NONE); - if (input == NULL) + if (input == NULL) { + if (ioclose != NULL) + ioclose(ioctx); return (NULL); + } ctxt = htmlNewParserCtxt(); if (ctxt == NULL) { xmlFreeParserInputBuffer(input); @@ -6930,8 +7058,11 @@ htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCal input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx, XML_CHAR_ENCODING_NONE); - if (input == NULL) + if (input == NULL) { + if (ioclose != NULL) + ioclose(ioctx); return (NULL); + } stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); if (stream == NULL) { xmlFreeParserInputBuffer(input);