--- embedaddon/libxml2/HTMLparser.c 2012/02/21 23:37:58 1.1
+++ embedaddon/libxml2/HTMLparser.c 2014/06/15 19:53:28 1.1.1.3
@@ -44,6 +44,9 @@
#include
#include
+#include "buf.h"
+#include "enc.h"
+
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100
@@ -727,7 +730,7 @@ static const char* const map_contents[] = { BLOCK, "ar
static const char* const name_attr[] = { "name", NULL } ;
static const char* const action_attr[] = { "action", NULL } ;
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
-static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
+static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
static const char* const content_attr[] = { "content", NULL } ;
static const char* const type_attr[] = { "type", NULL } ;
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
@@ -1080,9 +1083,9 @@ static const char * const htmlStartClose[] = {
"menu", "p", "head", "ul", NULL,
"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
"div", "p", "head", NULL,
-"noscript", "p", "head", NULL,
+"noscript", "p", NULL,
"center", "font", "b", "i", "p", "head", NULL,
-"a", "a", NULL,
+"a", "a", "head", NULL,
"caption", "p", NULL,
"colgroup", "caption", "colgroup", "col", "p", NULL,
"col", "caption", "col", "p", NULL,
@@ -1100,6 +1103,43 @@ static const char * const htmlStartClose[] = {
"option", "option", NULL,
"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
"pre", "listing", "xmp", "a", NULL,
+/* most tags in FONTSTYLE, PHRASE and SPECIAL should close */
+"tt", "head", NULL,
+"i", "head", NULL,
+"b", "head", NULL,
+"u", "head", NULL,
+"s", "head", NULL,
+"strike", "head", NULL,
+"big", "head", NULL,
+"small", "head", NULL,
+
+"em", "head", NULL,
+"strong", "head", NULL,
+"dfn", "head", NULL,
+"code", "head", NULL,
+"samp", "head", NULL,
+"kbd", "head", NULL,
+"var", "head", NULL,
+"cite", "head", NULL,
+"abbr", "head", NULL,
+"acronym", "head", NULL,
+
+/* "a" */
+"img", "head", NULL,
+/* "applet" */
+/* "embed" */
+/* "object" */
+"font", "head", NULL,
+/* "basefont" */
+"br", "head", NULL,
+/* "script" */
+"map", "head", NULL,
+"q", "head", NULL,
+"sub", "head", NULL,
+"sup", "head", NULL,
+"span", "head", NULL,
+"bdo", "head", NULL,
+"iframe", "head", NULL,
NULL
};
@@ -2941,9 +2981,14 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
*/
if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
if (areBlanks(ctxt, buf, nbchar)) {
- if (ctxt->sax->ignorableWhitespace != NULL)
- ctxt->sax->ignorableWhitespace(ctxt->userData,
- buf, nbchar);
+ if (ctxt->keepBlanks) {
+ if (ctxt->sax->characters != NULL)
+ ctxt->sax->characters(ctxt->userData, buf, nbchar);
+ } else {
+ if (ctxt->sax->ignorableWhitespace != NULL)
+ ctxt->sax->ignorableWhitespace(ctxt->userData,
+ buf, nbchar);
+ }
} else {
htmlCheckParagraph(ctxt);
if (ctxt->sax->characters != NULL)
@@ -2974,8 +3019,14 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
*/
if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
if (areBlanks(ctxt, buf, nbchar)) {
- if (ctxt->sax->ignorableWhitespace != NULL)
- ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
+ if (ctxt->keepBlanks) {
+ if (ctxt->sax->characters != NULL)
+ ctxt->sax->characters(ctxt->userData, buf, nbchar);
+ } else {
+ if (ctxt->sax->ignorableWhitespace != NULL)
+ ctxt->sax->ignorableWhitespace(ctxt->userData,
+ buf, nbchar);
+ }
} else {
htmlCheckParagraph(ctxt);
if (ctxt->sax->characters != NULL)
@@ -3435,35 +3486,27 @@ htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **v
}
/**
- * htmlCheckEncoding:
+ * htmlCheckEncodingDirect:
* @ctxt: an HTML parser context
* @attvalue: the attribute value
*
- * Checks an http-equiv attribute from a Meta tag to detect
+ * Checks an attribute value to detect
* the encoding
* If a new encoding is detected the parser is switched to decode
* it and pass UTF8
*/
static void
-htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
- const xmlChar *encoding;
+htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
- if ((ctxt == NULL) || (attvalue == NULL))
+ if ((ctxt == NULL) || (encoding == NULL) ||
+ (ctxt->options & HTML_PARSE_IGNORE_ENC))
return;
/* do not change encoding */
if (ctxt->input->encoding != NULL)
return;
- encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
if (encoding != NULL) {
- encoding += 8;
- } else {
- encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
- if (encoding != NULL)
- encoding += 9;
- }
- if (encoding != NULL) {
xmlCharEncoding enc;
xmlCharEncodingHandlerPtr handler;
@@ -3500,7 +3543,9 @@ htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlCha
xmlSwitchToEncoding(ctxt, handler);
ctxt->charset = XML_CHAR_ENCODING_UTF8;
} else {
- ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
+ htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
+ "htmlCheckEncoding: unknown encoding %s\n",
+ encoding, NULL);
}
}
@@ -3515,24 +3560,51 @@ htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlCha
* convert as much as possible to the parser reading buffer.
*/
processed = ctxt->input->cur - ctxt->input->base;
- xmlBufferShrink(ctxt->input->buf->buffer, processed);
- nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
- ctxt->input->buf->buffer,
- ctxt->input->buf->raw);
+ xmlBufShrink(ctxt->input->buf->buffer, processed);
+ nbchars = xmlCharEncInput(ctxt->input->buf, 1);
if (nbchars < 0) {
htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
"htmlCheckEncoding: encoder error\n",
NULL, NULL);
}
- ctxt->input->base =
- ctxt->input->cur = ctxt->input->buf->buffer->content;
- ctxt->input->end =
- &ctxt->input->base[ctxt->input->buf->buffer->use];
+ xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
}
}
}
/**
+ * htmlCheckEncoding:
+ * @ctxt: an HTML parser context
+ * @attvalue: the attribute value
+ *
+ * Checks an http-equiv attribute value from a Meta tag for a
+ * "charset" token, optionally followed by blanks, then '='.
+ * The text after '=' is handed to htmlCheckEncodingDirect() so the
+ * parser can be switched to decode it and pass UTF8
+ */
+static void
+htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
+ const xmlChar *encoding;
+
+ if (!attvalue)
+ return;
+
+ encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
+ if (encoding != NULL) {
+ encoding += 7;
+ }
+ /*
+ * skip blanks after "charset"; never rescan attvalue for an earlier '='
+ */
+ while (encoding && IS_BLANK_CH(*encoding))
+ encoding ++;
+ if (encoding && *encoding == '=') {
+ encoding ++;
+ htmlCheckEncodingDirect(ctxt, encoding);
+ }
+}
+
+/**
* htmlCheckMeta:
* @ctxt: an HTML parser context
* @atts: the attributes values
@@ -3556,6 +3628,8 @@ htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **
if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
http = 1;
+ else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
+ htmlCheckEncodingDirect(ctxt, value);
else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
content = value;
att = atts[i++];
@@ -3885,6 +3959,7 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)
if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
ctxt->sax->endElement(ctxt->userData, name);
+ htmlNodeInfoPop(ctxt);
htmlnamePop(ctxt);
ret = 1;
} else {
@@ -4877,9 +4952,7 @@ htmlCreateMemoryParserCtxt(const char *buffer, int siz
input->filename = NULL;
input->buf = buf;
- input->base = input->buf->buffer->content;
- input->cur = input->buf->buffer->content;
- input->end = &input->buf->buffer->content[input->buf->buffer->use];
+ xmlBufResetInput(buf->buffer, input);
inputPush(ctxt, input);
return(ctxt);
@@ -4996,8 +5069,8 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlCha
buf = in->base;
len = in->length;
} else {
- buf = in->buf->buffer->content;
- len = in->buf->buffer->use;
+ buf = xmlBufContent(in->buf->buffer);
+ len = xmlBufUse(in->buf->buffer);
}
/* take into account the sequence length */
@@ -5089,13 +5162,13 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlCha
* @stop: Array of chars, which stop the lookup.
* @stopLen: Length of stop-Array
*
- * Try to find if any char of the stop-Array is available in the input
+ * Try to find if any char of the stop-Array is available in the input
* stream.
* This function has a side effect of (possibly) incrementing ctxt->checkIndex
* to avoid rescanning sequences of bytes, it DOES change the state of the
* parser, do not use liberally.
*
- * Returns the index to the current parsing point if a stopChar
+ * Returns the index to the current parsing point if a stopChar
* is available, -1 otherwise.
*/
static int
@@ -5123,8 +5196,8 @@ htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xml
buf = in->base;
len = in->length;
} else {
- buf = in->buf->buffer->content;
- len = in->buf->buffer->use;
+ buf = xmlBufContent(in->buf->buffer);
+ len = xmlBufUse(in->buf->buffer);
}
for (; base < len; base++) {
@@ -5173,6 +5246,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int termi
int avail = 0;
xmlChar cur, next;
+ htmlParserNodeInfo node_info;
+
#ifdef DEBUG_PUSH
switch (ctxt->instate) {
case XML_PARSER_EOF:
@@ -5233,7 +5308,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int termi
if (in->buf == NULL)
avail = in->length - (in->cur - in->base);
else
- avail = in->buf->buffer->use - (in->cur - in->base);
+ avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
if ((avail == 0) && (terminate)) {
htmlAutoCloseOnEnd(ctxt);
if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
@@ -5269,7 +5344,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int termi
if (in->buf == NULL)
avail = in->length - (in->cur - in->base);
else
- avail = in->buf->buffer->use - (in->cur - in->base);
+ avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
}
if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
ctxt->sax->setDocumentLocator(ctxt->userData,
@@ -5311,11 +5386,24 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int termi
if (in->buf == NULL)
avail = in->length - (in->cur - in->base);
else
- avail = in->buf->buffer->use - (in->cur - in->base);
- if (avail < 2)
+ avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
+ /*
+ * no chars in buffer
+ */
+ if (avail < 1)
goto done;
+ /*
+ * not enough chars in buffer
+ */
+ if (avail < 2) {
+ if (!terminate)
+ goto done;
+ else
+ next = ' ';
+ } else {
+ next = in->cur[1];
+ }
cur = in->cur[0];
- next = in->cur[1];
if ((cur == '<') && (next == '!') &&
(in->cur[2] == '-') && (in->cur[3] == '-')) {
if ((!terminate) &&
@@ -5371,7 +5459,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int termi
if (in->buf == NULL)
avail = in->length - (in->cur - in->base);
else
- avail = in->buf->buffer->use - (in->cur - in->base);
+ avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
if (avail < 2)
goto done;
cur = in->cur[0];
@@ -5412,7 +5500,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int termi
if (in->buf == NULL)
avail = in->length - (in->cur - in->base);
else
- avail = in->buf->buffer->use - (in->cur - in->base);
+ avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
if (avail < 1)
goto done;
cur = in->cur[0];
@@ -5465,8 +5553,22 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int termi
int failed;
const htmlElemDesc * info;
- if (avail < 2)
+ /*
+ * no chars in buffer
+ */
+ if (avail < 1)
goto done;
+ /*
+ * not enough chars in buffer
+ */
+ if (avail < 2) {
+ if (!terminate)
+ goto done;
+ else
+ next = ' ';
+ } else {
+ next = in->cur[1];
+ }
cur = in->cur[0];
if (cur != '<') {
ctxt->instate = XML_PARSER_CONTENT;
@@ -5476,7 +5578,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int termi
#endif
break;
}
- if (in->cur[1] == '/') {
+ if (next == '/') {
ctxt->instate = XML_PARSER_END_TAG;
ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
@@ -5489,6 +5591,14 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int termi
(htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
goto done;
+ /* Capture start position */
+ if (ctxt->record_info) {
+ node_info.begin_pos = ctxt->input->consumed +
+ (CUR_PTR - ctxt->input->base);
+ node_info.begin_line = ctxt->input->line;
+ }
+
+
failed = htmlParseStartTag(ctxt);
name = ctxt->name;
if ((failed == -1) ||
@@ -5538,6 +5648,9 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int termi
htmlnamePop(ctxt);
}
+ if (ctxt->record_info)
+ htmlNodeInfoPush(ctxt, &node_info);
+
ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
@@ -5554,6 +5667,10 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int termi
ctxt->sax->endElement(ctxt->userData, name);
htmlnamePop(ctxt);
}
+
+ if (ctxt->record_info)
+ htmlNodeInfoPush(ctxt, &node_info);
+
ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
@@ -5581,9 +5698,15 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int termi
if ((cur != '<') && (cur != '&')) {
if (ctxt->sax != NULL) {
if (IS_BLANK_CH(cur)) {
- if (ctxt->sax->ignorableWhitespace != NULL)
- ctxt->sax->ignorableWhitespace(
- ctxt->userData, &cur, 1);
+ if (ctxt->keepBlanks) {
+ if (ctxt->sax->characters != NULL)
+ ctxt->sax->characters(
+ ctxt->userData, &cur, 1);
+ } else {
+ if (ctxt->sax->ignorableWhitespace != NULL)
+ ctxt->sax->ignorableWhitespace(
+ ctxt->userData, &cur, 1);
+ }
} else {
htmlCheckParagraph(ctxt);
if (ctxt->sax->characters != NULL)
@@ -5906,8 +6029,8 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chu
}
if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
(ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
- int base = ctxt->input->base - ctxt->input->buf->buffer->content;
- int cur = ctxt->input->cur - ctxt->input->base;
+ size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
+ size_t cur = ctxt->input->cur - ctxt->input->base;
int res;
res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
@@ -5916,10 +6039,7 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chu
ctxt->disableSAX = 1;
return (XML_PARSER_EOF);
}
- ctxt->input->base = ctxt->input->buf->buffer->content + base;
- ctxt->input->cur = ctxt->input->base + cur;
- ctxt->input->end =
- &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
+ xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
#endif
@@ -5934,13 +6054,16 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chu
if ((in->encoder != NULL) && (in->buffer != NULL) &&
(in->raw != NULL)) {
int nbchars;
+ size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
+ size_t current = ctxt->input->cur - ctxt->input->base;
- nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
+ nbchars = xmlCharEncInput(in, terminate);
if (nbchars < 0) {
htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
"encoder error\n", NULL, NULL);
return(XML_ERR_INVALID_ENCODING);
}
+ xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
}
}
}
@@ -6034,24 +6157,18 @@ htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *
inputStream->filename = (char *)
xmlCanonicPath((const xmlChar *) filename);
inputStream->buf = buf;
- inputStream->base = inputStream->buf->buffer->content;
- inputStream->cur = inputStream->buf->buffer->content;
- inputStream->end =
- &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
+ xmlBufResetInput(buf->buffer, inputStream);
inputPush(ctxt, inputStream);
if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
(ctxt->input->buf != NULL)) {
- int base = ctxt->input->base - ctxt->input->buf->buffer->content;
- int cur = ctxt->input->cur - ctxt->input->base;
+ size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
+ size_t cur = ctxt->input->cur - ctxt->input->base;
xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
- ctxt->input->base = ctxt->input->buf->buffer->content + base;
- ctxt->input->cur = ctxt->input->base + cur;
- ctxt->input->end =
- &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
+ xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
#endif
@@ -6537,6 +6654,14 @@ htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options
ctxt->options |= HTML_PARSE_NODEFDTD;
options -= HTML_PARSE_NODEFDTD;
}
+ if (options & HTML_PARSE_IGNORE_ENC) {
+ ctxt->options |= HTML_PARSE_IGNORE_ENC;
+ options -= HTML_PARSE_IGNORE_ENC;
+ }
+ if (options & HTML_PARSE_NOIMPLIED) {
+ ctxt->options |= HTML_PARSE_NOIMPLIED;
+ options -= HTML_PARSE_NOIMPLIED;
+ }
ctxt->dictNames = 0;
return (options);
}
@@ -6730,8 +6855,11 @@ htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseC
input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
XML_CHAR_ENCODING_NONE);
- if (input == NULL)
+ if (input == NULL) {
+ if (ioclose != NULL)
+ ioclose(ioctx);
return (NULL);
+ }
ctxt = htmlNewParserCtxt();
if (ctxt == NULL) {
xmlFreeParserInputBuffer(input);
@@ -6930,8 +7058,11 @@ htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCal
input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
XML_CHAR_ENCODING_NONE);
- if (input == NULL)
+ if (input == NULL) {
+ if (ioclose != NULL)
+ ioclose(ioctx);
return (NULL);
+ }
stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
if (stream == NULL) {
xmlFreeParserInputBuffer(input);