Annotation of gpl/axl/babel/axl_babel.c, revision 1.1
1.1 ! misho 1: /*
! 2: * LibAxl: Another XML library
! 3: * Copyright (C) 2008 Advanced Software Production Line, S.L.
! 4: *
! 5: * This program is free software; you can redistribute it and/or
! 6: * modify it under the terms of the GNU Lesser General Public License
! 7: * as published by the Free Software Foundation; either version 2.1 of
! 8: * the License, or (at your option) any later version.
! 9: *
! 10: * This program is distributed in the hope that it will be useful,
! 11: * but WITHOUT ANY WARRANTY; without even the implied warranty of
! 12: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! 13: * GNU Lesser General Public License for more details.
! 14: *
! 15: * You should have received a copy of the GNU Lesser General Public
! 16: * License along with this program; if not, write to the Free
! 17: * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
! 18: * 02111-1307 USA
! 19: *
! 20: * You may find a copy of the license under this software is released
! 21: * at COPYING file. This is LGPL software: you are welcome to
! 22: * develop proprietary applications using this library without any
! 23: * royalty or fee but returning back any change, improvement or
! 24: * addition in the form of source code, project image, documentation
! 25: * patches, etc.
! 26: *
! 27: * For commercial support on build XML enabled solutions contact us:
! 28: *
! 29: * Postal address:
! 30: * Advanced Software Production Line, S.L.
! 31: * Edificio Alius A, Oficina 102,
! 32: * C/ Antonio Suarez Nº 10,
! 33: * Alcalá de Henares 28802 Madrid
! 34: * Spain
! 35: *
! 36: * Email address:
! 37: * info@aspl.es - http://www.aspl.es/xml
! 38: */
! 39:
! 40: #include <axl_babel.h>
! 41:
! 42: /* include local headers */
! 43: #include <axl_babel_iso88591.h>
! 44: #include <axl_babel_iso88592.h>
! 45: #include <axl_babel_iso88593.h>
! 46: #include <axl_babel_iso88594.h>
! 47: #include <axl_babel_iso88595.h>
! 48: #include <axl_babel_iso88596.h>
! 49: #include <axl_babel_iso88597.h>
! 50: #include <axl_babel_iso88598.h>
! 51: #include <axl_babel_iso88599.h>
! 52: #include <axl_babel_iso885915.h>
! 53:
! 54: #define LOG_DOMAIN "axl-babel"
! 55:
! 56: /**
! 57: * \defgroup axl_babel Axl Babel: Main functions to enable axl babel support
! 58: */
! 59:
! 60: /**
! 61: * \addtogroup axl_babel
! 62: * @{
! 63: */
! 64:
! 65: /**
! 66: * @brief Allows to configure babel encoding functions making axl
! 67: * library to use its API to support encoding formats.
! 68: *
! 69: * Current encoding format supported at:
! 70: *
! 71: * - utf-8, ascii.
! 72: *
! 73: * - iso-8859-1, iso-8859-2, iso-8859-3, iso-8859-4, iso-8859-5,
! 74: * iso-8859-6, iso-8859-7, iso-8859-8, iso-8859-9, iso-8859-15
! 75: *
! 76: * @param error An optional reference to an axlError where failure
! 77: * will be notified.
! 78: *
! 79: * @return axl_true if the init operation was properly implemented,
! 80: * otherwise axl_false is returned.
! 81: */
! 82: axl_bool axl_babel_init (axlError ** error)
! 83: {
! 84: /* call to configure babel */
! 85: axl_doc_set_detect_codification_func (axl_babel_detect_codification, NULL);
! 86: axl_doc_set_configure_codification_func (axl_babel_configure_encoding, NULL);
! 87:
! 88: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "configure axl-babel handlers..");
! 89:
! 90: return axl_true;
! 91: }
! 92:
! 93: /**
! 94: * @brief Remove handlers installed and babel configuration from base
! 95: * library.
! 96: */
! 97: void axl_babel_finish ()
! 98: {
! 99: /* call to configure babel */
! 100: axl_doc_set_detect_codification_func (NULL, NULL);
! 101: axl_doc_set_configure_codification_func (NULL, NULL);
! 102:
! 103: return;
! 104: }
! 105:
! 106: /**
! 107: * @internal Library function that allows to detect entity codification
! 108: * found to use the appropiate built-in decoder handler until the
! 109: * right codification is found (due to encoding header
! 110: * declaration). The intention is to move the content read from the
! 111: * stream abstraction into a utf-8 unified representation inside
! 112: * memory.
! 113: *
! 114: * @param doc The document that is about to be checked for the
! 115: * appropiate codification.
! 116: *
! 117: * @param encoding Detected encoding by the function.
! 118: *
! 119: * @param error The reference where errors will be reported.
! 120: *
! 121: * @return axl_true if the codification detection was performed properly,
! 122: * otherwise axl_false is returned if an error is found.
! 123: */
! 124: axl_bool axl_babel_detect_codification (axlStream * stream,
! 125: const char ** encoding,
! 126: axlPointer user_data,
! 127: axlError ** error)
! 128: {
! 129: /* check basic case where the stream have no content to
! 130: * parse */
! 131: if (axl_stream_get_size (stream) < 4) {
! 132: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "unable to detect codification, stream received doesn't have enough content to parse");
! 133: return axl_false;
! 134: } /* end if */
! 135:
! 136: /* clear encoding */
! 137: if (encoding)
! 138: (*encoding) = NULL;
! 139:
! 140: /* Check built-in supported formats. First check for documents
! 141: * with the BOM mark configured */
! 142:
! 143: /* check UTF-8 BOM: EF BB BF */
! 144: if (axl_stream_inspect_code (stream, 0xEF, 0) &&
! 145: axl_stream_inspect_code (stream, 0xBB, 1) &&
! 146: axl_stream_inspect_code (stream, 0xBF, 2)) {
! 147:
! 148: /* configure encoding detected */
! 149: if (encoding)
! 150: (*encoding) = "utf8";
! 151:
! 152: /* update stream */
! 153: axl_stream_move (stream, axl_stream_get_index (stream) + 3);
! 154:
! 155: /* found utf-8 encoding, install associated filter */
! 156: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "utf-8 BOM mark found, assuming utf-8 content");
! 157: return axl_true;
! 158: } /* end if */
! 159:
! 160: /* check UTF-16 (little-endian) BOM: FF FE */
! 161: if (axl_stream_inspect_code (stream, 0xFF, 0) &&
! 162: axl_stream_inspect_code (stream, 0xFE, 1)) {
! 163: /* configure encoding detected */
! 164: if (encoding)
! 165: (*encoding) = "utf16";
! 166:
! 167: /* update stream */
! 168: axl_stream_move (stream, axl_stream_get_index (stream) + 2);
! 169:
! 170: /* found utf-16 encoding, install associated filter */
! 171: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "utf-16 BOM mark found, assuming utf-16 content");
! 172: return axl_true;
! 173: }
! 174:
! 175: /* check UTF-32 (little-endian) BOM: FF FE 00 00 */
! 176: if (axl_stream_inspect_code (stream, 0xFF, 0) &&
! 177: axl_stream_inspect_code (stream, 0xFE, 1) &&
! 178: axl_stream_inspect_code (stream, 0x00, 2) &&
! 179: axl_stream_inspect_code (stream, 0x00, 3)) {
! 180: /* configure encoding detected */
! 181: if (encoding)
! 182: (*encoding) = "utf32";
! 183:
! 184: /* update stream */
! 185: axl_stream_move (stream, axl_stream_get_index (stream) + 4);
! 186:
! 187: /* found utf-16 encoding, install associated filter */
! 188: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "utf-32 BOM mark found, assuming utf-8 content");
! 189: return axl_true;
! 190: } /* end if */
! 191:
! 192: /* NO BOM MARK SECTION */
! 193:
! 194: /* detect utf-8, iso 646, ascii,...*/
! 195: if (axl_stream_inspect_code (stream, 0x3C, 0) &&
! 196: axl_stream_inspect_code (stream, 0X3F, 1) &&
! 197: axl_stream_inspect_code (stream, 0x78, 2) &&
! 198: axl_stream_inspect_code (stream, 0x6D, 3)) {
! 199: assume_utf8:
! 200: /* no encoding detected we are not sure */
! 201:
! 202: /* found utf-16 encoding, install associated filter */
! 203: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "found utf-8, iso 646, ascii or something similiar without mark, assuming utf-8 until encoding declaration..");
! 204: return axl_true;
! 205: } /* end if */
! 206:
! 207: /* check last case where an utf-8 document could be found without xml header */
! 208: if (axl_stream_inspect_code (stream, 0x3C, 0) &&
! 209: ! axl_stream_inspect_code (stream, 0x3C, 1) &&
! 210: ! axl_stream_inspect_code (stream, 0x3E, 1)) {
! 211: goto assume_utf8;
! 212: }
! 213:
! 214: /* unable to detect the encoding format */
! 215: __axl_log (LOG_DOMAIN, AXL_LEVEL_CRITICAL,
! 216: "unable to detect encoding format, failed to detect encoding format");
! 217: axl_error_new (-1, "unable to detect encoding format, failed to detect encoding format", NULL, error);
! 218: return axl_false;
! 219:
! 220: }
! 221:
! 222: /**
! 223: * @internal Function that implements the identity operation. Does
! 224: * nothing just translates data from source to output.
! 225: */
! 226: int axl_babel_identity_utf8 (const char * source, int source_size,
! 227: const char * source_encoding,
! 228: char * output, int output_size,
! 229: int * output_converted,
! 230: int * remain_source_index,
! 231: axlPointer user_data)
! 232: {
! 233: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "received request to translate source code from iso-8859-15 to utf-8: source size=%d: %s",
! 234: source_size, source);
! 235:
! 236: /* identity operation */
! 237: memcpy (output, source, source_size);
! 238: (*output_converted) = source_size;
! 239:
! 240: /* operation completed */
! 241: return 1;
! 242: }
! 243:
! 244: /**
! 245: * @internal Function that performs translation from encoding
! 246: * representations using 1 octect (0..255) into utf-8.
! 247: *
! 248: * @return The handler must return 1 if the operation was completed, 2
! 249: * if the operation was completed but not enough size was found on
! 250: * output buffer to store the content or 0 if the function fails.
! 251: */
! 252: int axl_babel_single_to_utf8 (const char * source, int source_size,
! 253: const char * source_encoding,
! 254: char * output, int output_size,
! 255: int * output_converted,
! 256: int * remain_source_index,
! 257: axlPointer user_data)
! 258: {
! 259: axlBabelTable * table = (axlBabelTable *)user_data;
! 260: int iterator;
! 261: int iterator2;
! 262: int desp;
! 263: unsigned char value;
! 264:
! 265:
! 266: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "received request to translate source code from %s to utf-8: source size=%d on output size=%d: %s",
! 267: source_encoding, source_size, output_size, source);
! 268:
! 269: iterator = 0;
! 270: iterator2 = 0;
! 271: while (iterator < source_size && iterator2 < output_size) {
! 272:
! 273: /* get the value */
! 274: value = source[iterator];
! 275:
! 276: /* __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "translating value=%c (%d)", source[iterator], value); */
! 277:
! 278: /* check if we are able to place all the encoded item */
! 279: if ((iterator2 + table[value].size) > output_size) {
! 280: /* __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "unable to completely decode following sequence (%d size, on remain: %d)",
! 281: table[value].size, output_size - iterator2); */
! 282: break;
! 283: }
! 284:
! 285: desp = 0;
! 286: while ((desp < table[value].size)) {
! 287: /* __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, " configuring at %d value: %d",
! 288: iterator2 + desp,
! 289: table[value].buffer[desp]); */
! 290: output[iterator2] = table[value].buffer[desp];
! 291: desp++;
! 292: iterator2++;
! 293: }
! 294:
! 295: /* next */
! 296: iterator++;
! 297:
! 298: } /* end while */
! 299:
! 300: /* update output converted */
! 301: *output_converted = iterator2;
! 302: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "output converted=%d: %s (iterator:%d == source_size:%d)",
! 303: iterator2, output, iterator, source_size);
! 304:
! 305: /* check exit condition (if all output was converted) */
! 306: if (iterator == source_size)
! 307: return 1;
! 308:
! 309: /* source to be converted remains */
! 310: *remain_source_index = iterator;
! 311: return 2;
! 312: }
! 313:
! 314: /**
! 315: * @internal Function that performs translation from encoding
! 316: * representations using 1 octect (0..255) into utf-8.
! 317: *
! 318: * @return The handler must return 1 if the operation was completed, 2
! 319: * if the operation was completed but not enough size was found on
! 320: * output buffer to store the content or 0 if the function fails.
! 321: */
! 322: int axl_babel_utf8_check (const char * source,
! 323: int source_size,
! 324: const char * source_encoding,
! 325: axlPointer user_data,
! 326: axlError ** error)
! 327: {
! 328: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "received notification to check content size=%d to have valid utf-8: %s", source_size, source);
! 329: return axl_babel_check_utf8_content (source, source_size, NULL) ? 1 : 0;
! 330: }
! 331:
! 332:
! 333: /**
! 334: * @internal Function that tries to check encoding found to configure the
! 335: * proper set of functions to translate from and to utf-8.
! 336: *
! 337: * @param stream Stream to be configured.
! 338: *
! 339: * @param encoding Encoding declaration found at the xml header.
! 340: *
! 341: * @param detected Detected encoding found by the detect codification
! 342: * configured.
! 343: *
! 344: * @param user_data User defined pointer.
! 345: *
! 346: * @param error An optional error that will be filled in the case an
! 347: * error is found.
! 348: *
! 349: * @return axl_true if the operation was completed, otherwise axl_false is
! 350: * returned.
! 351: */
! 352: axl_bool axl_babel_configure_encoding (axlStream * stream,
! 353: const char * encoding,
! 354: const char * detected,
! 355: axlPointer user_data, axlError ** error)
! 356: {
! 357: axlBabelTable * table = NULL;
! 358:
! 359: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "configuring final document enconding, previously detected=%s, declared=%s",
! 360: detected ? detected : "none",
! 361: encoding ? encoding : "none");
! 362:
! 363: /* check case were a encoding was detected (the entity content
! 364: * is encoded as detected due to marks or other means) */
! 365: if (detected && encoding == NULL)
! 366: detected = encoding;
! 367:
! 368: /* check encoding found (either detected or defined) */
! 369: if (axl_cmp (encoding, "iso88591")) {
! 370: /* install a translator handler */
! 371: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "installed handler encoding for iso-8859-1");
! 372: table = axl_babel_build_iso88591_table ();
! 373: } /* end if */
! 374:
! 375: if (axl_cmp (encoding, "iso88592")) {
! 376: /* install a translator handler */
! 377: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "installed handler encoding for iso-8859-2");
! 378: table = axl_babel_build_iso88592_table ();
! 379: } /* end if */
! 380:
! 381: if (axl_cmp (encoding, "iso88593")) {
! 382: /* install a translator handler */
! 383: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "installed handler encoding for iso-8859-3");
! 384: table = axl_babel_build_iso88593_table ();
! 385: } /* end if */
! 386:
! 387: if (axl_cmp (encoding, "iso88594")) {
! 388: /* install a translator handler */
! 389: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "installed handler encoding for iso-8859-4");
! 390: table = axl_babel_build_iso88594_table ();
! 391: } /* end if */
! 392:
! 393: if (axl_cmp (encoding, "iso88595")) {
! 394: /* install a translator handler */
! 395: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "installed handler encoding for iso-8859-5");
! 396: table = axl_babel_build_iso88595_table ();
! 397: } /* end if */
! 398:
! 399: if (axl_cmp (encoding, "iso88596")) {
! 400: /* install a translator handler */
! 401: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "installed handler encoding for iso-8859-6");
! 402: table = axl_babel_build_iso88596_table ();
! 403: } /* end if */
! 404:
! 405: if (axl_cmp (encoding, "iso88597")) {
! 406: /* install a translator handler */
! 407: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "installed handler encoding for iso-8859-7");
! 408: table = axl_babel_build_iso88597_table ();
! 409: } /* end if */
! 410:
! 411: if (axl_cmp (encoding, "iso88598")) {
! 412: /* install a translator handler */
! 413: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "installed handler encoding for iso-8859-8");
! 414: table = axl_babel_build_iso88598_table ();
! 415: } /* end if */
! 416:
! 417: if (axl_cmp (encoding, "iso88599")) {
! 418: /* install a translator handler */
! 419: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "installed handler encoding for iso-8859-9");
! 420: table = axl_babel_build_iso88599_table ();
! 421: } /* end if */
! 422:
! 423: if (axl_cmp (encoding, "iso885915")) {
! 424: /* install a translator handler */
! 425: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "installed handler encoding for iso-8859-15");
! 426: table = axl_babel_build_iso885915_table ();
! 427: } /* end if */
! 428:
! 429: if (axl_cmp (encoding, "utf8")) {
! 430: /* install a translator handler */
! 431: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "installed handler encoding for utf-8");
! 432:
! 433: /* install checker without table */
! 434: if (! axl_stream_setup_check (stream, encoding, axl_babel_utf8_check, NULL, error))
! 435: return axl_false;
! 436: return axl_true;
! 437: } /* end if */
! 438:
! 439: if (table == NULL) {
! 440: /* format not defined, use default utf-8 */
! 441: __axl_log (LOG_DOMAIN, AXL_LEVEL_WARNING, "encoding='%s' (detected: '%s') not supported, falling back into utf-8 without restriction",
! 442: encoding ? encoding : "",
! 443: detected ? detected : "");
! 444:
! 445: return axl_true;
! 446: } /* end if */
! 447:
! 448: /* associate to the stream */
! 449: axl_stream_link_full (stream, table, axl_free, axl_true);
! 450:
! 451: if (! axl_stream_setup_decode (stream, encoding, axl_babel_single_to_utf8, table, error))
! 452: return axl_false;
! 453:
! 454: return axl_true;
! 455: }
! 456:
! 457: /**
! 458: * @brief Allows to check if the provided string is in utf-8 coding
! 459: * form.
! 460: *
! 461: * @param content The content to length.
! 462: *
! 463: * @param content_length Length (in octets) of the string received. If
! 464: * provided -1, content will be calculated using strlen function.
! 465: *
! 466: * @param index_error Optional reference where will be reported the
! 467: * index position that caused the error.
! 468: *
! 469: * @return axl_true if the content provided is all in utf-8 otherwise
! 470: * axl_false is returned. In the case index_error or error is defined and
! 471: * an error is found, they are defined to the appropriate value.
! 472: */
! 473: axl_bool axl_babel_check_utf8_content (const char * content,
! 474: int content_length,
! 475: int * index_error)
! 476: {
! 477: int iterator = 0;
! 478: unsigned char value;
! 479:
! 480:
! 481: if (index_error)
! 482: *index_error = 0;
! 483:
! 484: axl_return_val_if_fail (content, axl_false);
! 485: axl_return_val_if_fail (content_length >= -1, axl_false);
! 486:
! 487: /* check and calculate content */
! 488: if (content_length == -1)
! 489: content_length = strlen (content);
! 490:
! 491: while (iterator < content_length) {
! 492: /* utf with 4 octects */
! 493: value = content[iterator];
! 494: if (value >= 240 && value <= 247 && (iterator + 1) < content_length ) {
! 495:
! 496: /* get next value */
! 497: value = content[iterator + 1];
! 498: if (value >= 128 && value <= 191 && (iterator + 2) < content_length ) {
! 499:
! 500: /* get next value */
! 501: value = content[iterator + 2];
! 502: if (value >= 128 && value <= 191 && (iterator + 3) < content_length ) {
! 503:
! 504: /* get next value */
! 505: value = content[iterator + 3];
! 506: if (value >= 128 && value <= 191) {
! 507: iterator += 4;
! 508: continue;
! 509: } /* end if */
! 510: }
! 511: }
! 512:
! 513: __axl_log (LOG_DOMAIN, AXL_LEVEL_CRITICAL, "found error while detecting 4 octect utf-8 format..");
! 514: /* found error */
! 515: if (index_error)
! 516: *index_error = iterator;
! 517: return axl_false;
! 518: } /* end if */
! 519:
! 520: /* utf with 3 octects */
! 521: if (value >= 224 && value <= 239 && (iterator + 1) < content_length ) {
! 522: /* get next value */
! 523: value = content[iterator + 1];
! 524: if (value >= 128 && value <= 191 && (iterator + 2) < content_length ) {
! 525:
! 526: /* get next value */
! 527: value = content[iterator + 2];
! 528: if (value >= 128 && value <= 191) {
! 529: iterator += 3;
! 530: continue;
! 531: } /* end if */
! 532: }
! 533:
! 534: __axl_log (LOG_DOMAIN, AXL_LEVEL_CRITICAL, "found error while detecting 3 octect utf-8 format..");
! 535:
! 536: /* found error */
! 537: if (index_error)
! 538: *index_error = iterator;
! 539: return axl_false;
! 540: }
! 541:
! 542: /* utf with 2 octects */
! 543: if (value >= 192 && value <= 223 && (iterator + 1) < content_length ) {
! 544: /* get next value */
! 545: value = content[iterator + 1];
! 546: if (value >= 128 && value <= 191) {
! 547: iterator += 2;
! 548: continue;
! 549: } /* end if */
! 550:
! 551: __axl_log (LOG_DOMAIN, AXL_LEVEL_CRITICAL, "found error while detecting 2 octect utf-8 format value=%d..", value);
! 552:
! 553: /* found error */
! 554: if (index_error)
! 555: *index_error = iterator;
! 556: return axl_false;
! 557: }
! 558:
! 559: if (value <= 127 ) {
! 560: iterator++;
! 561: continue;
! 562: } /* end if */
! 563:
! 564: __axl_log (LOG_DOMAIN, AXL_LEVEL_CRITICAL, "found error while detecting single octect utf-8 format..");
! 565:
! 566: /* found error */
! 567: if (index_error)
! 568: *index_error = iterator;
! 569: return axl_false;
! 570:
! 571:
! 572: } /* end while */
! 573:
! 574: return axl_true;
! 575: }
! 576:
! 577: /**
! 578: * @}
! 579: */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>