--- embedaddon/libxml2/encoding.c 2013/07/22 01:22:22 1.1.1.2 +++ embedaddon/libxml2/encoding.c 2014/06/15 19:53:29 1.1.1.3 @@ -24,6 +24,7 @@ #include "libxml.h" #include +#include #ifdef HAVE_CTYPE_H #include @@ -44,6 +45,9 @@ #include #include +#include "buf.h" +#include "enc.h" + static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL; static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL; @@ -1513,6 +1517,8 @@ xmlGetCharEncodingHandler(xmlCharEncoding enc) { if (handler != NULL) return(handler); handler = xmlFindCharEncodingHandler("EBCDIC-US"); if (handler != NULL) return(handler); + handler = xmlFindCharEncodingHandler("IBM-037"); + if (handler != NULL) return(handler); break; case XML_CHAR_ENCODING_UCS4BE: handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4"); @@ -1825,7 +1831,7 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *o /************************************************************************ * * - * ICU based generic conversion functions * + * ICU based generic conversion functions * * * ************************************************************************/ @@ -1897,9 +1903,6 @@ xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned c * The real API used by libxml for on-the-fly conversion * * * ************************************************************************/ -int -xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out, - xmlBufferPtr in, int len); /** * xmlCharEncFirstLineInt: @@ -1946,7 +1949,7 @@ xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler toconv = 180; } if (toconv * 2 >= written) { - xmlBufferGrow(out, toconv); + xmlBufferGrow(out, toconv * 2); written = out->size - out->use - 1; } @@ -2029,6 +2032,252 @@ xmlCharEncFirstLine(xmlCharEncodingHandler *handler, x } /** + * xmlCharEncFirstLineInput: + * @input: a parser input buffer + * @len: number of bytes to convert for the first line, or -1 + * + * Front-end for the encoding handler input function, but handle only + * the very first line. Point is that this is based on autodetection + * of the encoding and once that first line is converted we may find + * out that a different decoder is needed to process the input. + * + * Returns the number of byte written if success, or + * -1 general error + * -2 if the transcoding fails (for *in is not valid utf8 string or + * the result of transformation can't fit into the encoding we want), or + */ +int +xmlCharEncFirstLineInput(xmlParserInputBufferPtr input, int len) +{ + int ret = -2; + size_t written; + size_t toconv; + int c_in; + int c_out; + xmlBufPtr in; + xmlBufPtr out; + + if ((input == NULL) || (input->encoder == NULL) || + (input->buffer == NULL) || (input->raw == NULL)) + return (-1); + out = input->buffer; + in = input->raw; + + toconv = xmlBufUse(in); + if (toconv == 0) + return (0); + written = xmlBufAvail(out) - 1; /* count '\0' */ + /* + * echo '' | wc -c => 38 + * 45 chars should be sufficient to reach the end of the encoding + * declaration without going too far inside the document content. + * on UTF-16 this means 90bytes, on UCS4 this means 180 + * The actual value depending on guessed encoding is passed as @len + * if provided + */ + if (len >= 0) { + if (toconv > (unsigned int) len) + toconv = len; + } else { + if (toconv > 180) + toconv = 180; + } + if (toconv * 2 >= written) { + xmlBufGrow(out, toconv * 2); + written = xmlBufAvail(out) - 1; + } + if (written > 360) + written = 360; + + c_in = toconv; + c_out = written; + if (input->encoder->input != NULL) { + ret = input->encoder->input(xmlBufEnd(out), &c_out, + xmlBufContent(in), &c_in); + xmlBufShrink(in, c_in); + xmlBufAddLen(out, c_out); + } +#ifdef LIBXML_ICONV_ENABLED + else if (input->encoder->iconv_in != NULL) { + ret = xmlIconvWrapper(input->encoder->iconv_in, xmlBufEnd(out), + &c_out, xmlBufContent(in), &c_in); + xmlBufShrink(in, c_in); + xmlBufAddLen(out, c_out); + if (ret == -1) + ret = -3; + } +#endif /* LIBXML_ICONV_ENABLED */ +#ifdef LIBXML_ICU_ENABLED + else if (input->encoder->uconv_in != NULL) { + ret = xmlUconvWrapper(input->encoder->uconv_in, 1, xmlBufEnd(out), + &c_out, xmlBufContent(in), &c_in); + xmlBufShrink(in, c_in); + xmlBufAddLen(out, c_out); + if (ret == -1) + ret = -3; + } +#endif /* LIBXML_ICU_ENABLED */ + switch (ret) { + case 0: +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "converted %d bytes to %d bytes of input\n", + c_in, c_out); +#endif + break; + case -1: +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "converted %d bytes to %d bytes of input, %d left\n", + c_in, c_out, (int)xmlBufUse(in)); +#endif + break; + case -3: +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "converted %d bytes to %d bytes of input, %d left\n", + c_in, c_out, (int)xmlBufUse(in)); +#endif + break; + case -2: { + char buf[50]; + const xmlChar *content = xmlBufContent(in); + + snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X", + content[0], content[1], + content[2], content[3]); + buf[49] = 0; + xmlEncodingErr(XML_I18N_CONV_FAILED, + "input conversion failed due to input error, bytes %s\n", + buf); + } + } + /* + * Ignore when input buffer is not on a boundary + */ + if (ret == -3) ret = 0; + if (ret == -1) ret = 0; + return(ret); +} + +/** + * xmlCharEncInput: + * @input: a parser input buffer + * @flush: try to flush all the raw buffer + * + * Generic front-end for the encoding handler on parser input + * + * Returns the number of byte written if success, or + * -1 general error + * -2 if the transcoding fails (for *in is not valid utf8 string or + * the result of transformation can't fit into the encoding we want), or + */ +int +xmlCharEncInput(xmlParserInputBufferPtr input, int flush) +{ + int ret = -2; + size_t written; + size_t toconv; + int c_in; + int c_out; + xmlBufPtr in; + xmlBufPtr out; + + if ((input == NULL) || (input->encoder == NULL) || + (input->buffer == NULL) || (input->raw == NULL)) + return (-1); + out = input->buffer; + in = input->raw; + + toconv = xmlBufUse(in); + if (toconv == 0) + return (0); + if ((toconv > 64 * 1024) && (flush == 0)) + toconv = 64 * 1024; + written = xmlBufAvail(out); + if (written > 0) + written--; /* count '\0' */ + if (toconv * 2 >= written) { + xmlBufGrow(out, toconv * 2); + written = xmlBufAvail(out); + if (written > 0) + written--; /* count '\0' */ + } + if ((written > 128 * 1024) && (flush == 0)) + written = 128 * 1024; + + c_in = toconv; + c_out = written; + if (input->encoder->input != NULL) { + ret = input->encoder->input(xmlBufEnd(out), &c_out, + xmlBufContent(in), &c_in); + xmlBufShrink(in, c_in); + xmlBufAddLen(out, c_out); + } +#ifdef LIBXML_ICONV_ENABLED + else if (input->encoder->iconv_in != NULL) { + ret = xmlIconvWrapper(input->encoder->iconv_in, xmlBufEnd(out), + &c_out, xmlBufContent(in), &c_in); + xmlBufShrink(in, c_in); + xmlBufAddLen(out, c_out); + if (ret == -1) + ret = -3; + } +#endif /* LIBXML_ICONV_ENABLED */ +#ifdef LIBXML_ICU_ENABLED + else if (input->encoder->uconv_in != NULL) { + ret = xmlUconvWrapper(input->encoder->uconv_in, 1, xmlBufEnd(out), + &c_out, xmlBufContent(in), &c_in); + xmlBufShrink(in, c_in); + xmlBufAddLen(out, c_out); + if (ret == -1) + ret = -3; + } +#endif /* LIBXML_ICU_ENABLED */ + switch (ret) { + case 0: +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "converted %d bytes to %d bytes of input\n", + c_in, c_out); +#endif + break; + case -1: +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "converted %d bytes to %d bytes of input, %d left\n", + c_in, c_out, (int)xmlBufUse(in)); +#endif + break; + case -3: +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "converted %d bytes to %d bytes of input, %d left\n", + c_in, c_out, (int)xmlBufUse(in)); +#endif + break; + case -2: { + char buf[50]; + const xmlChar *content = xmlBufContent(in); + + snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X", + content[0], content[1], + content[2], content[3]); + buf[49] = 0; + xmlEncodingErr(XML_I18N_CONV_FAILED, + "input conversion failed due to input error, bytes %s\n", + buf); + } + } + /* + * Ignore when input buffer is not on a boundary + */ + if (ret == -3) + ret = 0; + return (c_out? c_out : ret); +} + +/** * xmlCharEncInFunc: * @handler: char encoding transformation data structure * @out: an xmlBuffer for the output. @@ -2136,6 +2385,235 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xml } /** + * xmlCharEncOutput: + * @output: a parser output buffer + * @init: is this an initialization call without data + * + * Generic front-end for the encoding handler on parser output + * a first call with @init == 1 has to be made first to initiate the + * output in case of non-stateless encoding needing to initiate their + * state or the output (like the BOM in UTF16). + * In case of UTF8 sequence conversion errors for the given encoder, + * the content will be automatically remapped to a CharRef sequence. + * + * Returns the number of byte written if success, or + * -1 general error + * -2 if the transcoding fails (for *in is not valid utf8 string or + * the result of transformation can't fit into the encoding we want), or + */ +int +xmlCharEncOutput(xmlOutputBufferPtr output, int init) +{ + int ret = -2; + size_t written; + size_t writtentot = 0; + size_t toconv; + int c_in; + int c_out; + xmlBufPtr in; + xmlBufPtr out; + int charref_len = 0; + + if ((output == NULL) || (output->encoder == NULL) || + (output->buffer == NULL) || (output->conv == NULL)) + return (-1); + out = output->conv; + in = output->buffer; + +retry: + + written = xmlBufAvail(out); + if (written > 0) + written--; /* count '\0' */ + + /* + * First specific handling of the initialization call + */ + if (init) { + c_in = 0; + c_out = written; + if (output->encoder->output != NULL) { + ret = output->encoder->output(xmlBufEnd(out), &c_out, + NULL, &c_in); + if (ret > 0) /* Gennady: check return value */ + xmlBufAddLen(out, c_out); + } +#ifdef LIBXML_ICONV_ENABLED + else if (output->encoder->iconv_out != NULL) { + ret = xmlIconvWrapper(output->encoder->iconv_out, xmlBufEnd(out), + &c_out, NULL, &c_in); + xmlBufAddLen(out, c_out); + } +#endif /* LIBXML_ICONV_ENABLED */ +#ifdef LIBXML_ICU_ENABLED + else if (output->encoder->uconv_out != NULL) { + ret = xmlUconvWrapper(output->encoder->uconv_out, 0, xmlBufEnd(out), + &c_out, NULL, &c_in); + xmlBufAddLen(out, c_out); + } +#endif /* LIBXML_ICU_ENABLED */ +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "initialized encoder\n"); +#endif + return(0); + } + + /* + * Conversion itself. + */ + toconv = xmlBufUse(in); + if (toconv == 0) + return (0); + if (toconv > 64 * 1024) + toconv = 64 * 1024; + if (toconv * 4 >= written) { + xmlBufGrow(out, toconv * 4); + written = xmlBufAvail(out) - 1; + } + if (written > 256 * 1024) + written = 256 * 1024; + + c_in = toconv; + c_out = written; + if (output->encoder->output != NULL) { + ret = output->encoder->output(xmlBufEnd(out), &c_out, + xmlBufContent(in), &c_in); + if (c_out > 0) { + xmlBufShrink(in, c_in); + xmlBufAddLen(out, c_out); + writtentot += c_out; + } + } +#ifdef LIBXML_ICONV_ENABLED + else if (output->encoder->iconv_out != NULL) { + ret = xmlIconvWrapper(output->encoder->iconv_out, xmlBufEnd(out), + &c_out, xmlBufContent(in), &c_in); + xmlBufShrink(in, c_in); + xmlBufAddLen(out, c_out); + writtentot += c_out; + if (ret == -1) { + if (c_out > 0) { + /* + * Can be a limitation of iconv + */ + charref_len = 0; + goto retry; + } + ret = -3; + } + } +#endif /* LIBXML_ICONV_ENABLED */ +#ifdef LIBXML_ICU_ENABLED + else if (output->encoder->uconv_out != NULL) { + ret = xmlUconvWrapper(output->encoder->uconv_out, 0, xmlBufEnd(out), + &c_out, xmlBufContent(in), &c_in); + xmlBufShrink(in, c_in); + xmlBufAddLen(out, c_out); + writtentot += c_out; + if (ret == -1) { + if (c_out > 0) { + /* + * Can be a limitation of uconv + */ + charref_len = 0; + goto retry; + } + ret = -3; + } + } +#endif /* LIBXML_ICU_ENABLED */ + else { + xmlEncodingErr(XML_I18N_NO_OUTPUT, + "xmlCharEncOutFunc: no output function !\n", NULL); + return(-1); + } + + if (ret >= 0) output += ret; + + /* + * Attempt to handle error cases + */ + switch (ret) { + case 0: +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "converted %d bytes to %d bytes of output\n", + c_in, c_out); +#endif + break; + case -1: +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "output conversion failed by lack of space\n"); +#endif + break; + case -3: +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n", + c_in, c_out, (int) xmlBufUse(in)); +#endif + break; + case -2: { + int len = (int) xmlBufUse(in); + xmlChar *content = xmlBufContent(in); + int cur; + + cur = xmlGetUTF8Char(content, &len); + if ((charref_len != 0) && (c_out < charref_len)) { + /* + * We attempted to insert a character reference and failed. + * Undo what was written and skip the remaining charref. + */ + xmlBufErase(out, c_out); + writtentot -= c_out; + xmlBufShrink(in, charref_len - c_out); + charref_len = 0; + + ret = -1; + break; + } else if (cur > 0) { + xmlChar charref[20]; + +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "handling output conversion error\n"); + xmlGenericError(xmlGenericErrorContext, + "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", + content[0], content[1], + content[2], content[3]); +#endif + /* + * Removes the UTF8 sequence, and replace it by a charref + * and continue the transcoding phase, hoping the error + * did not mangle the encoder state. + */ + charref_len = snprintf((char *) &charref[0], sizeof(charref), + "&#%d;", cur); + xmlBufShrink(in, len); + xmlBufAddHead(in, charref, -1); + + goto retry; + } else { + char buf[50]; + + snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X", + content[0], content[1], + content[2], content[3]); + buf[49] = 0; + xmlEncodingErr(XML_I18N_CONV_FAILED, + "output conversion failed due to conv error, bytes %s\n", + buf); + if (xmlBufGetAllocationScheme(in) != XML_BUFFER_ALLOC_IMMUTABLE) + content[0] = ' '; + } + break; + } + } + return(ret); +} + +/** * xmlCharEncOutFunc: * @handler: char enconding transformation data structure * @out: an xmlBuffer for the output. @@ -2198,7 +2676,7 @@ retry: else if (handler->uconv_out != NULL) { ret = xmlUconvWrapper(handler->uconv_out, 0, &out->content[out->use], - &written, NULL, &toconv); + &written, NULL, &toconv); out->use += written; out->content[out->use] = 0; } @@ -2619,7 +3097,7 @@ UTF8ToISO8859x(unsigned char* out, int *outlen, c2 = c2 & 0x3F; d = d & 0x0F; d = xlattable [48 + c2 + xlattable [48 + c1 + - xlattable [32 + d] * 64] * 64]; + xlattable [32 + d] * 64] * 64]; if (d == 0) { /* not in character set */ *outlen = out - outstart;