Annotation of gpl/axl/babel/axl_babel.c, revision 1.1.1.2
1.1 misho 1: /*
2: * LibAxl: Another XML library
3: * Copyright (C) 2008 Advanced Software Production Line, S.L.
4: *
5: * This program is free software; you can redistribute it and/or
6: * modify it under the terms of the GNU Lesser General Public License
7: * as published by the Free Software Foundation; either version 2.1 of
8: * the License, or (at your option) any later version.
9: *
10: * This program is distributed in the hope that it will be useful,
11: * but WITHOUT ANY WARRANTY; without even the implied warranty of
12: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13: * GNU Lesser General Public License for more details.
14: *
15: * You should have received a copy of the GNU Lesser General Public
16: * License along with this program; if not, write to the Free
17: * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
18: * 02111-1307 USA
19: *
20: * You may find a copy of the license under this software is released
21: * at COPYING file. This is LGPL software: you are welcome to
22: * develop proprietary applications using this library without any
23: * royalty or fee but returning back any change, improvement or
24: * addition in the form of source code, project image, documentation
25: * patches, etc.
26: *
27: * For commercial support on build XML enabled solutions contact us:
28: *
29: * Postal address:
30: * Advanced Software Production Line, S.L.
31: * Edificio Alius A, Oficina 102,
32: * C/ Antonio Suarez Nº 10,
33: * Alcalá de Henares 28802 Madrid
34: * Spain
35: *
36: * Email address:
37: * info@aspl.es - http://www.aspl.es/xml
38: */
39:
40: #include <axl_babel.h>
41:
42: /* include local headers */
43: #include <axl_babel_iso88591.h>
44: #include <axl_babel_iso88592.h>
45: #include <axl_babel_iso88593.h>
46: #include <axl_babel_iso88594.h>
47: #include <axl_babel_iso88595.h>
48: #include <axl_babel_iso88596.h>
49: #include <axl_babel_iso88597.h>
50: #include <axl_babel_iso88598.h>
51: #include <axl_babel_iso88599.h>
52: #include <axl_babel_iso885915.h>
53:
54: #define LOG_DOMAIN "axl-babel"
55:
56: /**
57: * \defgroup axl_babel Axl Babel: Main functions to enable axl babel support
58: */
59:
60: /**
61: * \addtogroup axl_babel
62: * @{
63: */
64:
65: /**
66: * @brief Allows to configure babel encoding functions making axl
67: * library to use its API to support encoding formats.
68: *
69: * Current encoding format supported at:
70: *
71: * - utf-8, ascii.
72: *
73: * - iso-8859-1, iso-8859-2, iso-8859-3, iso-8859-4, iso-8859-5,
74: * iso-8859-6, iso-8859-7, iso-8859-8, iso-8859-9, iso-8859-15
75: *
76: * @param error An optional reference to an axlError where failure
77: * will be notified.
78: *
79: * @return axl_true if the init operation was properly implemented,
80: * otherwise axl_false is returned.
81: */
82: axl_bool axl_babel_init (axlError ** error)
83: {
84: /* call to configure babel */
85: axl_doc_set_detect_codification_func (axl_babel_detect_codification, NULL);
86: axl_doc_set_configure_codification_func (axl_babel_configure_encoding, NULL);
87:
88: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "configure axl-babel handlers..");
89:
90: return axl_true;
91: }
92:
93: /**
94: * @brief Remove handlers installed and babel configuration from base
95: * library.
96: */
1.1.1.2 ! misho 97: void axl_babel_finish (void)
1.1 misho 98: {
99: /* call to configure babel */
100: axl_doc_set_detect_codification_func (NULL, NULL);
101: axl_doc_set_configure_codification_func (NULL, NULL);
102:
103: return;
104: }
105:
106: /**
107: * @internal Library function that allows to detect entity codification
108: * found to use the appropiate built-in decoder handler until the
109: * right codification is found (due to encoding header
110: * declaration). The intention is to move the content read from the
111: * stream abstraction into a utf-8 unified representation inside
112: * memory.
113: *
114: * @param doc The document that is about to be checked for the
115: * appropiate codification.
116: *
117: * @param encoding Detected encoding by the function.
118: *
119: * @param error The reference where errors will be reported.
120: *
121: * @return axl_true if the codification detection was performed properly,
122: * otherwise axl_false is returned if an error is found.
123: */
124: axl_bool axl_babel_detect_codification (axlStream * stream,
125: const char ** encoding,
126: axlPointer user_data,
127: axlError ** error)
128: {
129: /* check basic case where the stream have no content to
130: * parse */
131: if (axl_stream_get_size (stream) < 4) {
132: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "unable to detect codification, stream received doesn't have enough content to parse");
133: return axl_false;
134: } /* end if */
135:
136: /* clear encoding */
137: if (encoding)
138: (*encoding) = NULL;
139:
140: /* Check built-in supported formats. First check for documents
141: * with the BOM mark configured */
142:
143: /* check UTF-8 BOM: EF BB BF */
144: if (axl_stream_inspect_code (stream, 0xEF, 0) &&
145: axl_stream_inspect_code (stream, 0xBB, 1) &&
146: axl_stream_inspect_code (stream, 0xBF, 2)) {
147:
148: /* configure encoding detected */
149: if (encoding)
150: (*encoding) = "utf8";
151:
152: /* update stream */
153: axl_stream_move (stream, axl_stream_get_index (stream) + 3);
154:
155: /* found utf-8 encoding, install associated filter */
156: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "utf-8 BOM mark found, assuming utf-8 content");
157: return axl_true;
158: } /* end if */
159:
160: /* check UTF-16 (little-endian) BOM: FF FE */
161: if (axl_stream_inspect_code (stream, 0xFF, 0) &&
162: axl_stream_inspect_code (stream, 0xFE, 1)) {
163: /* configure encoding detected */
164: if (encoding)
165: (*encoding) = "utf16";
166:
167: /* update stream */
168: axl_stream_move (stream, axl_stream_get_index (stream) + 2);
169:
170: /* found utf-16 encoding, install associated filter */
171: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "utf-16 BOM mark found, assuming utf-16 content");
172: return axl_true;
173: }
174:
175: /* check UTF-32 (little-endian) BOM: FF FE 00 00 */
176: if (axl_stream_inspect_code (stream, 0xFF, 0) &&
177: axl_stream_inspect_code (stream, 0xFE, 1) &&
178: axl_stream_inspect_code (stream, 0x00, 2) &&
179: axl_stream_inspect_code (stream, 0x00, 3)) {
180: /* configure encoding detected */
181: if (encoding)
182: (*encoding) = "utf32";
183:
184: /* update stream */
185: axl_stream_move (stream, axl_stream_get_index (stream) + 4);
186:
187: /* found utf-16 encoding, install associated filter */
188: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "utf-32 BOM mark found, assuming utf-8 content");
189: return axl_true;
190: } /* end if */
191:
192: /* NO BOM MARK SECTION */
193:
194: /* detect utf-8, iso 646, ascii,...*/
195: if (axl_stream_inspect_code (stream, 0x3C, 0) &&
196: axl_stream_inspect_code (stream, 0X3F, 1) &&
197: axl_stream_inspect_code (stream, 0x78, 2) &&
198: axl_stream_inspect_code (stream, 0x6D, 3)) {
199: assume_utf8:
200: /* no encoding detected we are not sure */
201:
202: /* found utf-16 encoding, install associated filter */
203: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "found utf-8, iso 646, ascii or something similiar without mark, assuming utf-8 until encoding declaration..");
204: return axl_true;
205: } /* end if */
206:
207: /* check last case where an utf-8 document could be found without xml header */
208: if (axl_stream_inspect_code (stream, 0x3C, 0) &&
209: ! axl_stream_inspect_code (stream, 0x3C, 1) &&
210: ! axl_stream_inspect_code (stream, 0x3E, 1)) {
211: goto assume_utf8;
212: }
213:
214: /* unable to detect the encoding format */
215: __axl_log (LOG_DOMAIN, AXL_LEVEL_CRITICAL,
216: "unable to detect encoding format, failed to detect encoding format");
217: axl_error_new (-1, "unable to detect encoding format, failed to detect encoding format", NULL, error);
218: return axl_false;
219:
220: }
221:
222: /**
223: * @internal Function that implements the identity operation. Does
224: * nothing just translates data from source to output.
225: */
226: int axl_babel_identity_utf8 (const char * source, int source_size,
227: const char * source_encoding,
228: char * output, int output_size,
229: int * output_converted,
230: int * remain_source_index,
231: axlPointer user_data)
232: {
233: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "received request to translate source code from iso-8859-15 to utf-8: source size=%d: %s",
234: source_size, source);
235:
236: /* identity operation */
237: memcpy (output, source, source_size);
238: (*output_converted) = source_size;
239:
240: /* operation completed */
241: return 1;
242: }
243:
244: /**
245: * @internal Function that performs translation from encoding
246: * representations using 1 octect (0..255) into utf-8.
247: *
248: * @return The handler must return 1 if the operation was completed, 2
249: * if the operation was completed but not enough size was found on
250: * output buffer to store the content or 0 if the function fails.
251: */
252: int axl_babel_single_to_utf8 (const char * source, int source_size,
253: const char * source_encoding,
254: char * output, int output_size,
255: int * output_converted,
256: int * remain_source_index,
257: axlPointer user_data)
258: {
259: axlBabelTable * table = (axlBabelTable *)user_data;
260: int iterator;
261: int iterator2;
262: int desp;
263: unsigned char value;
264:
265:
266: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "received request to translate source code from %s to utf-8: source size=%d on output size=%d: %s",
267: source_encoding, source_size, output_size, source);
268:
269: iterator = 0;
270: iterator2 = 0;
271: while (iterator < source_size && iterator2 < output_size) {
272:
273: /* get the value */
274: value = source[iterator];
275:
276: /* __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "translating value=%c (%d)", source[iterator], value); */
277:
278: /* check if we are able to place all the encoded item */
279: if ((iterator2 + table[value].size) > output_size) {
280: /* __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "unable to completely decode following sequence (%d size, on remain: %d)",
281: table[value].size, output_size - iterator2); */
282: break;
283: }
284:
285: desp = 0;
286: while ((desp < table[value].size)) {
287: /* __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, " configuring at %d value: %d",
288: iterator2 + desp,
289: table[value].buffer[desp]); */
290: output[iterator2] = table[value].buffer[desp];
291: desp++;
292: iterator2++;
293: }
294:
295: /* next */
296: iterator++;
297:
298: } /* end while */
299:
300: /* update output converted */
301: *output_converted = iterator2;
302: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "output converted=%d: %s (iterator:%d == source_size:%d)",
303: iterator2, output, iterator, source_size);
304:
305: /* check exit condition (if all output was converted) */
306: if (iterator == source_size)
307: return 1;
308:
309: /* source to be converted remains */
310: *remain_source_index = iterator;
311: return 2;
312: }
313:
314: /**
315: * @internal Function that performs translation from encoding
316: * representations using 1 octect (0..255) into utf-8.
317: *
318: * @return The handler must return 1 if the operation was completed, 2
319: * if the operation was completed but not enough size was found on
320: * output buffer to store the content or 0 if the function fails.
321: */
322: int axl_babel_utf8_check (const char * source,
323: int source_size,
324: const char * source_encoding,
325: axlPointer user_data,
326: axlError ** error)
327: {
328: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "received notification to check content size=%d to have valid utf-8: %s", source_size, source);
329: return axl_babel_check_utf8_content (source, source_size, NULL) ? 1 : 0;
330: }
331:
332:
333: /**
334: * @internal Function that tries to check encoding found to configure the
335: * proper set of functions to translate from and to utf-8.
336: *
337: * @param stream Stream to be configured.
338: *
339: * @param encoding Encoding declaration found at the xml header.
340: *
341: * @param detected Detected encoding found by the detect codification
342: * configured.
343: *
344: * @param user_data User defined pointer.
345: *
346: * @param error An optional error that will be filled in the case an
347: * error is found.
348: *
349: * @return axl_true if the operation was completed, otherwise axl_false is
350: * returned.
351: */
352: axl_bool axl_babel_configure_encoding (axlStream * stream,
353: const char * encoding,
354: const char * detected,
355: axlPointer user_data, axlError ** error)
356: {
357: axlBabelTable * table = NULL;
358:
359: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "configuring final document enconding, previously detected=%s, declared=%s",
360: detected ? detected : "none",
361: encoding ? encoding : "none");
362:
363: /* check case were a encoding was detected (the entity content
364: * is encoded as detected due to marks or other means) */
365: if (detected && encoding == NULL)
366: detected = encoding;
367:
368: /* check encoding found (either detected or defined) */
369: if (axl_cmp (encoding, "iso88591")) {
370: /* install a translator handler */
371: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "installed handler encoding for iso-8859-1");
372: table = axl_babel_build_iso88591_table ();
373: } /* end if */
374:
375: if (axl_cmp (encoding, "iso88592")) {
376: /* install a translator handler */
377: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "installed handler encoding for iso-8859-2");
378: table = axl_babel_build_iso88592_table ();
379: } /* end if */
380:
381: if (axl_cmp (encoding, "iso88593")) {
382: /* install a translator handler */
383: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "installed handler encoding for iso-8859-3");
384: table = axl_babel_build_iso88593_table ();
385: } /* end if */
386:
387: if (axl_cmp (encoding, "iso88594")) {
388: /* install a translator handler */
389: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "installed handler encoding for iso-8859-4");
390: table = axl_babel_build_iso88594_table ();
391: } /* end if */
392:
393: if (axl_cmp (encoding, "iso88595")) {
394: /* install a translator handler */
395: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "installed handler encoding for iso-8859-5");
396: table = axl_babel_build_iso88595_table ();
397: } /* end if */
398:
399: if (axl_cmp (encoding, "iso88596")) {
400: /* install a translator handler */
401: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "installed handler encoding for iso-8859-6");
402: table = axl_babel_build_iso88596_table ();
403: } /* end if */
404:
405: if (axl_cmp (encoding, "iso88597")) {
406: /* install a translator handler */
407: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "installed handler encoding for iso-8859-7");
408: table = axl_babel_build_iso88597_table ();
409: } /* end if */
410:
411: if (axl_cmp (encoding, "iso88598")) {
412: /* install a translator handler */
413: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "installed handler encoding for iso-8859-8");
414: table = axl_babel_build_iso88598_table ();
415: } /* end if */
416:
417: if (axl_cmp (encoding, "iso88599")) {
418: /* install a translator handler */
419: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "installed handler encoding for iso-8859-9");
420: table = axl_babel_build_iso88599_table ();
421: } /* end if */
422:
423: if (axl_cmp (encoding, "iso885915")) {
424: /* install a translator handler */
425: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "installed handler encoding for iso-8859-15");
426: table = axl_babel_build_iso885915_table ();
427: } /* end if */
428:
429: if (axl_cmp (encoding, "utf8")) {
430: /* install a translator handler */
431: __axl_log (LOG_DOMAIN, AXL_LEVEL_DEBUG, "installed handler encoding for utf-8");
432:
433: /* install checker without table */
434: if (! axl_stream_setup_check (stream, encoding, axl_babel_utf8_check, NULL, error))
435: return axl_false;
436: return axl_true;
437: } /* end if */
438:
439: if (table == NULL) {
440: /* format not defined, use default utf-8 */
441: __axl_log (LOG_DOMAIN, AXL_LEVEL_WARNING, "encoding='%s' (detected: '%s') not supported, falling back into utf-8 without restriction",
442: encoding ? encoding : "",
443: detected ? detected : "");
444:
445: return axl_true;
446: } /* end if */
447:
448: /* associate to the stream */
449: axl_stream_link_full (stream, table, axl_free, axl_true);
450:
451: if (! axl_stream_setup_decode (stream, encoding, axl_babel_single_to_utf8, table, error))
452: return axl_false;
453:
454: return axl_true;
455: }
456:
457: /**
458: * @brief Allows to check if the provided string is in utf-8 coding
459: * form.
460: *
461: * @param content The content to length.
462: *
463: * @param content_length Length (in octets) of the string received. If
464: * provided -1, content will be calculated using strlen function.
465: *
466: * @param index_error Optional reference where will be reported the
467: * index position that caused the error.
468: *
469: * @return axl_true if the content provided is all in utf-8 otherwise
470: * axl_false is returned. In the case index_error or error is defined and
471: * an error is found, they are defined to the appropriate value.
472: */
473: axl_bool axl_babel_check_utf8_content (const char * content,
474: int content_length,
475: int * index_error)
476: {
477: int iterator = 0;
478: unsigned char value;
479:
480:
481: if (index_error)
482: *index_error = 0;
483:
484: axl_return_val_if_fail (content, axl_false);
485: axl_return_val_if_fail (content_length >= -1, axl_false);
486:
487: /* check and calculate content */
488: if (content_length == -1)
489: content_length = strlen (content);
490:
491: while (iterator < content_length) {
492: /* utf with 4 octects */
493: value = content[iterator];
494: if (value >= 240 && value <= 247 && (iterator + 1) < content_length ) {
495:
496: /* get next value */
497: value = content[iterator + 1];
498: if (value >= 128 && value <= 191 && (iterator + 2) < content_length ) {
499:
500: /* get next value */
501: value = content[iterator + 2];
502: if (value >= 128 && value <= 191 && (iterator + 3) < content_length ) {
503:
504: /* get next value */
505: value = content[iterator + 3];
506: if (value >= 128 && value <= 191) {
507: iterator += 4;
508: continue;
509: } /* end if */
510: }
511: }
512:
513: __axl_log (LOG_DOMAIN, AXL_LEVEL_CRITICAL, "found error while detecting 4 octect utf-8 format..");
514: /* found error */
515: if (index_error)
516: *index_error = iterator;
517: return axl_false;
518: } /* end if */
519:
520: /* utf with 3 octects */
521: if (value >= 224 && value <= 239 && (iterator + 1) < content_length ) {
522: /* get next value */
523: value = content[iterator + 1];
524: if (value >= 128 && value <= 191 && (iterator + 2) < content_length ) {
525:
526: /* get next value */
527: value = content[iterator + 2];
528: if (value >= 128 && value <= 191) {
529: iterator += 3;
530: continue;
531: } /* end if */
532: }
533:
534: __axl_log (LOG_DOMAIN, AXL_LEVEL_CRITICAL, "found error while detecting 3 octect utf-8 format..");
535:
536: /* found error */
537: if (index_error)
538: *index_error = iterator;
539: return axl_false;
540: }
541:
542: /* utf with 2 octects */
543: if (value >= 192 && value <= 223 && (iterator + 1) < content_length ) {
544: /* get next value */
545: value = content[iterator + 1];
546: if (value >= 128 && value <= 191) {
547: iterator += 2;
548: continue;
549: } /* end if */
550:
551: __axl_log (LOG_DOMAIN, AXL_LEVEL_CRITICAL, "found error while detecting 2 octect utf-8 format value=%d..", value);
552:
553: /* found error */
554: if (index_error)
555: *index_error = iterator;
556: return axl_false;
557: }
558:
559: if (value <= 127 ) {
560: iterator++;
561: continue;
562: } /* end if */
563:
564: __axl_log (LOG_DOMAIN, AXL_LEVEL_CRITICAL, "found error while detecting single octect utf-8 format..");
565:
566: /* found error */
567: if (index_error)
568: *index_error = iterator;
569: return axl_false;
570:
571:
572: } /* end while */
573:
574: return axl_true;
575: }
576:
577: /**
578: * @}
579: */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>