version 1.1.1.2, 2013/07/22 01:22:22
|
version 1.1.1.3, 2014/06/15 19:53:29
|
Line 24
|
Line 24
|
#include "libxml.h" |
#include "libxml.h" |
|
|
#include <string.h> |
#include <string.h> |
|
#include <limits.h> |
|
|
#ifdef HAVE_CTYPE_H |
#ifdef HAVE_CTYPE_H |
#include <ctype.h> |
#include <ctype.h> |
Line 44
|
Line 45
|
#include <libxml/globals.h> |
#include <libxml/globals.h> |
#include <libxml/xmlerror.h> |
#include <libxml/xmlerror.h> |
|
|
|
#include "buf.h" |
|
#include "enc.h" |
|
|
static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL; |
static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL; |
static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL; |
static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL; |
|
|
Line 1513 xmlGetCharEncodingHandler(xmlCharEncoding enc) {
|
Line 1517 xmlGetCharEncodingHandler(xmlCharEncoding enc) {
|
if (handler != NULL) return(handler); |
if (handler != NULL) return(handler); |
handler = xmlFindCharEncodingHandler("EBCDIC-US"); |
handler = xmlFindCharEncodingHandler("EBCDIC-US"); |
if (handler != NULL) return(handler); |
if (handler != NULL) return(handler); |
|
handler = xmlFindCharEncodingHandler("IBM-037"); |
|
if (handler != NULL) return(handler); |
break; |
break; |
case XML_CHAR_ENCODING_UCS4BE: |
case XML_CHAR_ENCODING_UCS4BE: |
handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4"); |
handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4"); |
Line 1825 xmlIconvWrapper(iconv_t cd, unsigned char *out, int *o
|
Line 1831 xmlIconvWrapper(iconv_t cd, unsigned char *out, int *o
|
|
|
/************************************************************************ |
/************************************************************************ |
* * |
* * |
* ICU based generic conversion functions * | * ICU based generic conversion functions * |
* * |
* * |
************************************************************************/ |
************************************************************************/ |
|
|
Line 1897 xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned c
|
Line 1903 xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned c
|
* The real API used by libxml for on-the-fly conversion * |
* The real API used by libxml for on-the-fly conversion * |
* * |
* * |
************************************************************************/ |
************************************************************************/ |
int |
|
xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out, |
|
xmlBufferPtr in, int len); |
|
|
|
/** |
/** |
* xmlCharEncFirstLineInt: |
* xmlCharEncFirstLineInt: |
Line 1946 xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler
|
Line 1949 xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler
|
toconv = 180; |
toconv = 180; |
} |
} |
if (toconv * 2 >= written) { |
if (toconv * 2 >= written) { |
xmlBufferGrow(out, toconv); | xmlBufferGrow(out, toconv * 2); |
written = out->size - out->use - 1; |
written = out->size - out->use - 1; |
} |
} |
|
|
Line 2029 xmlCharEncFirstLine(xmlCharEncodingHandler *handler, x
|
Line 2032 xmlCharEncFirstLine(xmlCharEncodingHandler *handler, x
|
} |
} |
|
|
/** |
/** |
|
* xmlCharEncFirstLineInput: |
|
* @input: a parser input buffer |
|
* @len: number of bytes to convert for the first line, or -1 |
|
* |
|
* Front-end for the encoding handler input function, but handle only |
|
* the very first line. Point is that this is based on autodetection |
|
* of the encoding and once that first line is converted we may find |
|
* out that a different decoder is needed to process the input. |
|
* |
|
* Returns the number of byte written if success, or |
|
* -1 general error |
|
* -2 if the transcoding fails (for *in is not valid utf8 string or |
|
* the result of transformation can't fit into the encoding we want), or |
|
*/ |
|
int |
|
xmlCharEncFirstLineInput(xmlParserInputBufferPtr input, int len) |
|
{ |
|
int ret = -2; |
|
size_t written; |
|
size_t toconv; |
|
int c_in; |
|
int c_out; |
|
xmlBufPtr in; |
|
xmlBufPtr out; |
|
|
|
if ((input == NULL) || (input->encoder == NULL) || |
|
(input->buffer == NULL) || (input->raw == NULL)) |
|
return (-1); |
|
out = input->buffer; |
|
in = input->raw; |
|
|
|
toconv = xmlBufUse(in); |
|
if (toconv == 0) |
|
return (0); |
|
written = xmlBufAvail(out) - 1; /* count '\0' */ |
|
/* |
|
* echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38 |
|
* 45 chars should be sufficient to reach the end of the encoding |
|
* declaration without going too far inside the document content. |
|
* on UTF-16 this means 90bytes, on UCS4 this means 180 |
|
* The actual value depending on guessed encoding is passed as @len |
|
* if provided |
|
*/ |
|
if (len >= 0) { |
|
if (toconv > (unsigned int) len) |
|
toconv = len; |
|
} else { |
|
if (toconv > 180) |
|
toconv = 180; |
|
} |
|
if (toconv * 2 >= written) { |
|
xmlBufGrow(out, toconv * 2); |
|
written = xmlBufAvail(out) - 1; |
|
} |
|
if (written > 360) |
|
written = 360; |
|
|
|
c_in = toconv; |
|
c_out = written; |
|
if (input->encoder->input != NULL) { |
|
ret = input->encoder->input(xmlBufEnd(out), &c_out, |
|
xmlBufContent(in), &c_in); |
|
xmlBufShrink(in, c_in); |
|
xmlBufAddLen(out, c_out); |
|
} |
|
#ifdef LIBXML_ICONV_ENABLED |
|
else if (input->encoder->iconv_in != NULL) { |
|
ret = xmlIconvWrapper(input->encoder->iconv_in, xmlBufEnd(out), |
|
&c_out, xmlBufContent(in), &c_in); |
|
xmlBufShrink(in, c_in); |
|
xmlBufAddLen(out, c_out); |
|
if (ret == -1) |
|
ret = -3; |
|
} |
|
#endif /* LIBXML_ICONV_ENABLED */ |
|
#ifdef LIBXML_ICU_ENABLED |
|
else if (input->encoder->uconv_in != NULL) { |
|
ret = xmlUconvWrapper(input->encoder->uconv_in, 1, xmlBufEnd(out), |
|
&c_out, xmlBufContent(in), &c_in); |
|
xmlBufShrink(in, c_in); |
|
xmlBufAddLen(out, c_out); |
|
if (ret == -1) |
|
ret = -3; |
|
} |
|
#endif /* LIBXML_ICU_ENABLED */ |
|
switch (ret) { |
|
case 0: |
|
#ifdef DEBUG_ENCODING |
|
xmlGenericError(xmlGenericErrorContext, |
|
"converted %d bytes to %d bytes of input\n", |
|
c_in, c_out); |
|
#endif |
|
break; |
|
case -1: |
|
#ifdef DEBUG_ENCODING |
|
xmlGenericError(xmlGenericErrorContext, |
|
"converted %d bytes to %d bytes of input, %d left\n", |
|
c_in, c_out, (int)xmlBufUse(in)); |
|
#endif |
|
break; |
|
case -3: |
|
#ifdef DEBUG_ENCODING |
|
xmlGenericError(xmlGenericErrorContext, |
|
"converted %d bytes to %d bytes of input, %d left\n", |
|
c_in, c_out, (int)xmlBufUse(in)); |
|
#endif |
|
break; |
|
case -2: { |
|
char buf[50]; |
|
const xmlChar *content = xmlBufContent(in); |
|
|
|
snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X", |
|
content[0], content[1], |
|
content[2], content[3]); |
|
buf[49] = 0; |
|
xmlEncodingErr(XML_I18N_CONV_FAILED, |
|
"input conversion failed due to input error, bytes %s\n", |
|
buf); |
|
} |
|
} |
|
/* |
|
* Ignore when input buffer is not on a boundary |
|
*/ |
|
if (ret == -3) ret = 0; |
|
if (ret == -1) ret = 0; |
|
return(ret); |
|
} |
|
|
|
/** |
|
* xmlCharEncInput: |
|
* @input: a parser input buffer |
|
* @flush: try to flush all the raw buffer |
|
* |
|
* Generic front-end for the encoding handler on parser input |
|
* |
|
* Returns the number of byte written if success, or |
|
* -1 general error |
|
* -2 if the transcoding fails (for *in is not valid utf8 string or |
|
* the result of transformation can't fit into the encoding we want), or |
|
*/ |
|
int |
|
xmlCharEncInput(xmlParserInputBufferPtr input, int flush) |
|
{ |
|
int ret = -2; |
|
size_t written; |
|
size_t toconv; |
|
int c_in; |
|
int c_out; |
|
xmlBufPtr in; |
|
xmlBufPtr out; |
|
|
|
if ((input == NULL) || (input->encoder == NULL) || |
|
(input->buffer == NULL) || (input->raw == NULL)) |
|
return (-1); |
|
out = input->buffer; |
|
in = input->raw; |
|
|
|
toconv = xmlBufUse(in); |
|
if (toconv == 0) |
|
return (0); |
|
if ((toconv > 64 * 1024) && (flush == 0)) |
|
toconv = 64 * 1024; |
|
written = xmlBufAvail(out); |
|
if (written > 0) |
|
written--; /* count '\0' */ |
|
if (toconv * 2 >= written) { |
|
xmlBufGrow(out, toconv * 2); |
|
written = xmlBufAvail(out); |
|
if (written > 0) |
|
written--; /* count '\0' */ |
|
} |
|
if ((written > 128 * 1024) && (flush == 0)) |
|
written = 128 * 1024; |
|
|
|
c_in = toconv; |
|
c_out = written; |
|
if (input->encoder->input != NULL) { |
|
ret = input->encoder->input(xmlBufEnd(out), &c_out, |
|
xmlBufContent(in), &c_in); |
|
xmlBufShrink(in, c_in); |
|
xmlBufAddLen(out, c_out); |
|
} |
|
#ifdef LIBXML_ICONV_ENABLED |
|
else if (input->encoder->iconv_in != NULL) { |
|
ret = xmlIconvWrapper(input->encoder->iconv_in, xmlBufEnd(out), |
|
&c_out, xmlBufContent(in), &c_in); |
|
xmlBufShrink(in, c_in); |
|
xmlBufAddLen(out, c_out); |
|
if (ret == -1) |
|
ret = -3; |
|
} |
|
#endif /* LIBXML_ICONV_ENABLED */ |
|
#ifdef LIBXML_ICU_ENABLED |
|
else if (input->encoder->uconv_in != NULL) { |
|
ret = xmlUconvWrapper(input->encoder->uconv_in, 1, xmlBufEnd(out), |
|
&c_out, xmlBufContent(in), &c_in); |
|
xmlBufShrink(in, c_in); |
|
xmlBufAddLen(out, c_out); |
|
if (ret == -1) |
|
ret = -3; |
|
} |
|
#endif /* LIBXML_ICU_ENABLED */ |
|
switch (ret) { |
|
case 0: |
|
#ifdef DEBUG_ENCODING |
|
xmlGenericError(xmlGenericErrorContext, |
|
"converted %d bytes to %d bytes of input\n", |
|
c_in, c_out); |
|
#endif |
|
break; |
|
case -1: |
|
#ifdef DEBUG_ENCODING |
|
xmlGenericError(xmlGenericErrorContext, |
|
"converted %d bytes to %d bytes of input, %d left\n", |
|
c_in, c_out, (int)xmlBufUse(in)); |
|
#endif |
|
break; |
|
case -3: |
|
#ifdef DEBUG_ENCODING |
|
xmlGenericError(xmlGenericErrorContext, |
|
"converted %d bytes to %d bytes of input, %d left\n", |
|
c_in, c_out, (int)xmlBufUse(in)); |
|
#endif |
|
break; |
|
case -2: { |
|
char buf[50]; |
|
const xmlChar *content = xmlBufContent(in); |
|
|
|
snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X", |
|
content[0], content[1], |
|
content[2], content[3]); |
|
buf[49] = 0; |
|
xmlEncodingErr(XML_I18N_CONV_FAILED, |
|
"input conversion failed due to input error, bytes %s\n", |
|
buf); |
|
} |
|
} |
|
/* |
|
* Ignore when input buffer is not on a boundary |
|
*/ |
|
if (ret == -3) |
|
ret = 0; |
|
return (c_out? c_out : ret); |
|
} |
|
|
|
/** |
* xmlCharEncInFunc: |
* xmlCharEncInFunc: |
* @handler: char encoding transformation data structure |
* @handler: char encoding transformation data structure |
* @out: an xmlBuffer for the output. |
* @out: an xmlBuffer for the output. |
Line 2136 xmlCharEncInFunc(xmlCharEncodingHandler * handler, xml
|
Line 2385 xmlCharEncInFunc(xmlCharEncodingHandler * handler, xml
|
} |
} |
|
|
/** |
/** |
|
* xmlCharEncOutput: |
|
* @output: a parser output buffer |
|
* @init: is this an initialization call without data |
|
* |
|
* Generic front-end for the encoding handler on parser output |
|
* a first call with @init == 1 has to be made first to initiate the |
|
* output in case of non-stateless encoding needing to initiate their |
|
* state or the output (like the BOM in UTF16). |
|
* In case of UTF8 sequence conversion errors for the given encoder, |
|
* the content will be automatically remapped to a CharRef sequence. |
|
* |
|
* Returns the number of byte written if success, or |
|
* -1 general error |
|
* -2 if the transcoding fails (for *in is not valid utf8 string or |
|
* the result of transformation can't fit into the encoding we want), or |
|
*/ |
|
int |
|
xmlCharEncOutput(xmlOutputBufferPtr output, int init) |
|
{ |
|
int ret = -2; |
|
size_t written; |
|
size_t writtentot = 0; |
|
size_t toconv; |
|
int c_in; |
|
int c_out; |
|
xmlBufPtr in; |
|
xmlBufPtr out; |
|
int charref_len = 0; |
|
|
|
if ((output == NULL) || (output->encoder == NULL) || |
|
(output->buffer == NULL) || (output->conv == NULL)) |
|
return (-1); |
|
out = output->conv; |
|
in = output->buffer; |
|
|
|
retry: |
|
|
|
written = xmlBufAvail(out); |
|
if (written > 0) |
|
written--; /* count '\0' */ |
|
|
|
/* |
|
* First specific handling of the initialization call |
|
*/ |
|
if (init) { |
|
c_in = 0; |
|
c_out = written; |
|
if (output->encoder->output != NULL) { |
|
ret = output->encoder->output(xmlBufEnd(out), &c_out, |
|
NULL, &c_in); |
|
if (ret > 0) /* Gennady: check return value */ |
|
xmlBufAddLen(out, c_out); |
|
} |
|
#ifdef LIBXML_ICONV_ENABLED |
|
else if (output->encoder->iconv_out != NULL) { |
|
ret = xmlIconvWrapper(output->encoder->iconv_out, xmlBufEnd(out), |
|
&c_out, NULL, &c_in); |
|
xmlBufAddLen(out, c_out); |
|
} |
|
#endif /* LIBXML_ICONV_ENABLED */ |
|
#ifdef LIBXML_ICU_ENABLED |
|
else if (output->encoder->uconv_out != NULL) { |
|
ret = xmlUconvWrapper(output->encoder->uconv_out, 0, xmlBufEnd(out), |
|
&c_out, NULL, &c_in); |
|
xmlBufAddLen(out, c_out); |
|
} |
|
#endif /* LIBXML_ICU_ENABLED */ |
|
#ifdef DEBUG_ENCODING |
|
xmlGenericError(xmlGenericErrorContext, |
|
"initialized encoder\n"); |
|
#endif |
|
return(0); |
|
} |
|
|
|
/* |
|
* Conversion itself. |
|
*/ |
|
toconv = xmlBufUse(in); |
|
if (toconv == 0) |
|
return (0); |
|
if (toconv > 64 * 1024) |
|
toconv = 64 * 1024; |
|
if (toconv * 4 >= written) { |
|
xmlBufGrow(out, toconv * 4); |
|
written = xmlBufAvail(out) - 1; |
|
} |
|
if (written > 256 * 1024) |
|
written = 256 * 1024; |
|
|
|
c_in = toconv; |
|
c_out = written; |
|
if (output->encoder->output != NULL) { |
|
ret = output->encoder->output(xmlBufEnd(out), &c_out, |
|
xmlBufContent(in), &c_in); |
|
if (c_out > 0) { |
|
xmlBufShrink(in, c_in); |
|
xmlBufAddLen(out, c_out); |
|
writtentot += c_out; |
|
} |
|
} |
|
#ifdef LIBXML_ICONV_ENABLED |
|
else if (output->encoder->iconv_out != NULL) { |
|
ret = xmlIconvWrapper(output->encoder->iconv_out, xmlBufEnd(out), |
|
&c_out, xmlBufContent(in), &c_in); |
|
xmlBufShrink(in, c_in); |
|
xmlBufAddLen(out, c_out); |
|
writtentot += c_out; |
|
if (ret == -1) { |
|
if (c_out > 0) { |
|
/* |
|
* Can be a limitation of iconv |
|
*/ |
|
charref_len = 0; |
|
goto retry; |
|
} |
|
ret = -3; |
|
} |
|
} |
|
#endif /* LIBXML_ICONV_ENABLED */ |
|
#ifdef LIBXML_ICU_ENABLED |
|
else if (output->encoder->uconv_out != NULL) { |
|
ret = xmlUconvWrapper(output->encoder->uconv_out, 0, xmlBufEnd(out), |
|
&c_out, xmlBufContent(in), &c_in); |
|
xmlBufShrink(in, c_in); |
|
xmlBufAddLen(out, c_out); |
|
writtentot += c_out; |
|
if (ret == -1) { |
|
if (c_out > 0) { |
|
/* |
|
* Can be a limitation of uconv |
|
*/ |
|
charref_len = 0; |
|
goto retry; |
|
} |
|
ret = -3; |
|
} |
|
} |
|
#endif /* LIBXML_ICU_ENABLED */ |
|
else { |
|
xmlEncodingErr(XML_I18N_NO_OUTPUT, |
|
"xmlCharEncOutFunc: no output function !\n", NULL); |
|
return(-1); |
|
} |
|
|
|
if (ret >= 0) output += ret; |
|
|
|
/* |
|
* Attempt to handle error cases |
|
*/ |
|
switch (ret) { |
|
case 0: |
|
#ifdef DEBUG_ENCODING |
|
xmlGenericError(xmlGenericErrorContext, |
|
"converted %d bytes to %d bytes of output\n", |
|
c_in, c_out); |
|
#endif |
|
break; |
|
case -1: |
|
#ifdef DEBUG_ENCODING |
|
xmlGenericError(xmlGenericErrorContext, |
|
"output conversion failed by lack of space\n"); |
|
#endif |
|
break; |
|
case -3: |
|
#ifdef DEBUG_ENCODING |
|
xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n", |
|
c_in, c_out, (int) xmlBufUse(in)); |
|
#endif |
|
break; |
|
case -2: { |
|
int len = (int) xmlBufUse(in); |
|
xmlChar *content = xmlBufContent(in); |
|
int cur; |
|
|
|
cur = xmlGetUTF8Char(content, &len); |
|
if ((charref_len != 0) && (c_out < charref_len)) { |
|
/* |
|
* We attempted to insert a character reference and failed. |
|
* Undo what was written and skip the remaining charref. |
|
*/ |
|
xmlBufErase(out, c_out); |
|
writtentot -= c_out; |
|
xmlBufShrink(in, charref_len - c_out); |
|
charref_len = 0; |
|
|
|
ret = -1; |
|
break; |
|
} else if (cur > 0) { |
|
xmlChar charref[20]; |
|
|
|
#ifdef DEBUG_ENCODING |
|
xmlGenericError(xmlGenericErrorContext, |
|
"handling output conversion error\n"); |
|
xmlGenericError(xmlGenericErrorContext, |
|
"Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", |
|
content[0], content[1], |
|
content[2], content[3]); |
|
#endif |
|
/* |
|
* Removes the UTF8 sequence, and replace it by a charref |
|
* and continue the transcoding phase, hoping the error |
|
* did not mangle the encoder state. |
|
*/ |
|
charref_len = snprintf((char *) &charref[0], sizeof(charref), |
|
"&#%d;", cur); |
|
xmlBufShrink(in, len); |
|
xmlBufAddHead(in, charref, -1); |
|
|
|
goto retry; |
|
} else { |
|
char buf[50]; |
|
|
|
snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X", |
|
content[0], content[1], |
|
content[2], content[3]); |
|
buf[49] = 0; |
|
xmlEncodingErr(XML_I18N_CONV_FAILED, |
|
"output conversion failed due to conv error, bytes %s\n", |
|
buf); |
|
if (xmlBufGetAllocationScheme(in) != XML_BUFFER_ALLOC_IMMUTABLE) |
|
content[0] = ' '; |
|
} |
|
break; |
|
} |
|
} |
|
return(ret); |
|
} |
|
|
|
/** |
* xmlCharEncOutFunc: |
* xmlCharEncOutFunc: |
* @handler: char enconding transformation data structure |
* @handler: char enconding transformation data structure |
* @out: an xmlBuffer for the output. |
* @out: an xmlBuffer for the output. |
Line 2198 retry:
|
Line 2676 retry:
|
else if (handler->uconv_out != NULL) { |
else if (handler->uconv_out != NULL) { |
ret = xmlUconvWrapper(handler->uconv_out, 0, |
ret = xmlUconvWrapper(handler->uconv_out, 0, |
&out->content[out->use], |
&out->content[out->use], |
&written, NULL, &toconv); | &written, NULL, &toconv); |
out->use += written; |
out->use += written; |
out->content[out->use] = 0; |
out->content[out->use] = 0; |
} |
} |
Line 2619 UTF8ToISO8859x(unsigned char* out, int *outlen,
|
Line 3097 UTF8ToISO8859x(unsigned char* out, int *outlen,
|
c2 = c2 & 0x3F; |
c2 = c2 & 0x3F; |
d = d & 0x0F; |
d = d & 0x0F; |
d = xlattable [48 + c2 + xlattable [48 + c1 + |
d = xlattable [48 + c2 + xlattable [48 + c1 + |
xlattable [32 + d] * 64] * 64]; | xlattable [32 + d] * 64] * 64]; |
if (d == 0) { |
if (d == 0) { |
/* not in character set */ |
/* not in character set */ |
*outlen = out - outstart; |
*outlen = out - outstart; |