Annotation of embedaddon/libxml2/xmlstring.c, revision 1.1.1.1
1.1 misho 1: /*
2: * string.c : an XML string utilities module
3: *
4: * This module provides various utility functions for manipulating
5: * the xmlChar* type. All functions named xmlStr* have been moved here
6: * from the parser.c file (their original home).
7: *
8: * See Copyright for the status of this software.
9: *
10: * UTF8 string routines from:
11: * William Brack <wbrack@mmm.com.hk>
12: *
13: * daniel@veillard.com
14: */
15:
16: #define IN_LIBXML
17: #include "libxml.h"
18:
19: #include <stdlib.h>
20: #include <string.h>
21: #include <libxml/xmlmemory.h>
22: #include <libxml/parserInternals.h>
23: #include <libxml/xmlstring.h>
24:
25: /************************************************************************
26: * *
27: * Commodity functions to handle xmlChars *
28: * *
29: ************************************************************************/
30:
31: /**
32: * xmlStrndup:
33: * @cur: the input xmlChar *
34: * @len: the len of @cur
35: *
36: * a strndup for array of xmlChar's
37: *
38: * Returns a new xmlChar * or NULL
39: */
40: xmlChar *
41: xmlStrndup(const xmlChar *cur, int len) {
42: xmlChar *ret;
43:
44: if ((cur == NULL) || (len < 0)) return(NULL);
45: ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
46: if (ret == NULL) {
47: xmlErrMemory(NULL, NULL);
48: return(NULL);
49: }
50: memcpy(ret, cur, len * sizeof(xmlChar));
51: ret[len] = 0;
52: return(ret);
53: }
54:
55: /**
56: * xmlStrdup:
57: * @cur: the input xmlChar *
58: *
59: * a strdup for array of xmlChar's. Since they are supposed to be
60: * encoded in UTF-8 or an encoding with 8bit based chars, we assume
61: * a termination mark of '0'.
62: *
63: * Returns a new xmlChar * or NULL
64: */
65: xmlChar *
66: xmlStrdup(const xmlChar *cur) {
67: const xmlChar *p = cur;
68:
69: if (cur == NULL) return(NULL);
70: while (*p != 0) p++; /* non input consuming */
71: return(xmlStrndup(cur, p - cur));
72: }
73:
74: /**
75: * xmlCharStrndup:
76: * @cur: the input char *
77: * @len: the len of @cur
78: *
79: * a strndup for char's to xmlChar's
80: *
81: * Returns a new xmlChar * or NULL
82: */
83:
84: xmlChar *
85: xmlCharStrndup(const char *cur, int len) {
86: int i;
87: xmlChar *ret;
88:
89: if ((cur == NULL) || (len < 0)) return(NULL);
90: ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
91: if (ret == NULL) {
92: xmlErrMemory(NULL, NULL);
93: return(NULL);
94: }
95: for (i = 0;i < len;i++) {
96: ret[i] = (xmlChar) cur[i];
97: if (ret[i] == 0) return(ret);
98: }
99: ret[len] = 0;
100: return(ret);
101: }
102:
103: /**
104: * xmlCharStrdup:
105: * @cur: the input char *
106: *
107: * a strdup for char's to xmlChar's
108: *
109: * Returns a new xmlChar * or NULL
110: */
111:
112: xmlChar *
113: xmlCharStrdup(const char *cur) {
114: const char *p = cur;
115:
116: if (cur == NULL) return(NULL);
117: while (*p != '\0') p++; /* non input consuming */
118: return(xmlCharStrndup(cur, p - cur));
119: }
120:
121: /**
122: * xmlStrcmp:
123: * @str1: the first xmlChar *
124: * @str2: the second xmlChar *
125: *
126: * a strcmp for xmlChar's
127: *
128: * Returns the integer result of the comparison
129: */
130:
131: int
132: xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
133: register int tmp;
134:
135: if (str1 == str2) return(0);
136: if (str1 == NULL) return(-1);
137: if (str2 == NULL) return(1);
138: do {
139: tmp = *str1++ - *str2;
140: if (tmp != 0) return(tmp);
141: } while (*str2++ != 0);
142: return 0;
143: }
144:
145: /**
146: * xmlStrEqual:
147: * @str1: the first xmlChar *
148: * @str2: the second xmlChar *
149: *
150: * Check if both strings are equal of have same content.
151: * Should be a bit more readable and faster than xmlStrcmp()
152: *
153: * Returns 1 if they are equal, 0 if they are different
154: */
155:
156: int
157: xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
158: if (str1 == str2) return(1);
159: if (str1 == NULL) return(0);
160: if (str2 == NULL) return(0);
161: do {
162: if (*str1++ != *str2) return(0);
163: } while (*str2++);
164: return(1);
165: }
166:
167: /**
168: * xmlStrQEqual:
169: * @pref: the prefix of the QName
170: * @name: the localname of the QName
171: * @str: the second xmlChar *
172: *
173: * Check if a QName is Equal to a given string
174: *
175: * Returns 1 if they are equal, 0 if they are different
176: */
177:
178: int
179: xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
180: if (pref == NULL) return(xmlStrEqual(name, str));
181: if (name == NULL) return(0);
182: if (str == NULL) return(0);
183:
184: do {
185: if (*pref++ != *str) return(0);
186: } while ((*str++) && (*pref));
187: if (*str++ != ':') return(0);
188: do {
189: if (*name++ != *str) return(0);
190: } while (*str++);
191: return(1);
192: }
193:
/**
 * xmlStrncmp:
 * @str1:  the first xmlChar *
 * @str2:  the second xmlChar *
 * @len:  the max comparison length
 *
 * A strncmp for xmlChar's. Nothing is compared when @len is <= 0. A
 * NULL string sorts before any non-NULL string.
 *
 * Returns the integer result of the comparison (0 when the first @len
 * items are equal)
 */

int
xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
    if (len <= 0) return(0);
    if (str1 == str2) return(0);
    if (str1 == NULL) return(-1);
    if (str2 == NULL) return(1);
#ifdef __GNUC__
    /* Delegate to the C library when building with GCC. */
    return(strncmp((const char *) str1, (const char *) str2, len));
#else
    {
        int diff;

        for (;;) {
            diff = *str1 - *str2;
            if (diff != 0)
                return(diff);
            /* Budget exhausted or terminator reached: equal so far. */
            if (--len == 0)
                return(0);
            if (*str2 == 0)
                return(0);
            str1++;
            str2++;
        }
    }
#endif
}
224:
/*
 * Case-folding table used by the case-insensitive comparisons: the ASCII
 * upper-case letters 'A'..'Z' (0x41..0x5A) map to 'a'..'z' (0x61..0x7A),
 * every other byte value maps to itself. In particular '[' (0x5B) maps
 * to itself and not to '{' (0x7B).
 */
static const xmlChar casemap[256] = {
    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
    0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
    0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
    0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
    0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
    0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
    0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
    0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
    0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
    0x78,0x79,0x7A,0x5B,0x5C,0x5D,0x5E,0x5F,
    0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
    0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
    0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
    0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
    0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
    0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
    0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
    0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
    0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
    0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
    0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
    0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
    0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
    0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
    0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
    0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
    0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
    0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
};
259:
260: /**
261: * xmlStrcasecmp:
262: * @str1: the first xmlChar *
263: * @str2: the second xmlChar *
264: *
265: * a strcasecmp for xmlChar's
266: *
267: * Returns the integer result of the comparison
268: */
269:
270: int
271: xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
272: register int tmp;
273:
274: if (str1 == str2) return(0);
275: if (str1 == NULL) return(-1);
276: if (str2 == NULL) return(1);
277: do {
278: tmp = casemap[*str1++] - casemap[*str2];
279: if (tmp != 0) return(tmp);
280: } while (*str2++ != 0);
281: return 0;
282: }
283:
284: /**
285: * xmlStrncasecmp:
286: * @str1: the first xmlChar *
287: * @str2: the second xmlChar *
288: * @len: the max comparison length
289: *
290: * a strncasecmp for xmlChar's
291: *
292: * Returns the integer result of the comparison
293: */
294:
295: int
296: xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
297: register int tmp;
298:
299: if (len <= 0) return(0);
300: if (str1 == str2) return(0);
301: if (str1 == NULL) return(-1);
302: if (str2 == NULL) return(1);
303: do {
304: tmp = casemap[*str1++] - casemap[*str2];
305: if (tmp != 0 || --len == 0) return(tmp);
306: } while (*str2++ != 0);
307: return 0;
308: }
309:
310: /**
311: * xmlStrchr:
312: * @str: the xmlChar * array
313: * @val: the xmlChar to search
314: *
315: * a strchr for xmlChar's
316: *
317: * Returns the xmlChar * for the first occurrence or NULL.
318: */
319:
320: const xmlChar *
321: xmlStrchr(const xmlChar *str, xmlChar val) {
322: if (str == NULL) return(NULL);
323: while (*str != 0) { /* non input consuming */
324: if (*str == val) return((xmlChar *) str);
325: str++;
326: }
327: return(NULL);
328: }
329:
330: /**
331: * xmlStrstr:
332: * @str: the xmlChar * array (haystack)
333: * @val: the xmlChar to search (needle)
334: *
335: * a strstr for xmlChar's
336: *
337: * Returns the xmlChar * for the first occurrence or NULL.
338: */
339:
340: const xmlChar *
341: xmlStrstr(const xmlChar *str, const xmlChar *val) {
342: int n;
343:
344: if (str == NULL) return(NULL);
345: if (val == NULL) return(NULL);
346: n = xmlStrlen(val);
347:
348: if (n == 0) return(str);
349: while (*str != 0) { /* non input consuming */
350: if (*str == *val) {
351: if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
352: }
353: str++;
354: }
355: return(NULL);
356: }
357:
358: /**
359: * xmlStrcasestr:
360: * @str: the xmlChar * array (haystack)
361: * @val: the xmlChar to search (needle)
362: *
363: * a case-ignoring strstr for xmlChar's
364: *
365: * Returns the xmlChar * for the first occurrence or NULL.
366: */
367:
368: const xmlChar *
369: xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
370: int n;
371:
372: if (str == NULL) return(NULL);
373: if (val == NULL) return(NULL);
374: n = xmlStrlen(val);
375:
376: if (n == 0) return(str);
377: while (*str != 0) { /* non input consuming */
378: if (casemap[*str] == casemap[*val])
379: if (!xmlStrncasecmp(str, val, n)) return(str);
380: str++;
381: }
382: return(NULL);
383: }
384:
385: /**
386: * xmlStrsub:
387: * @str: the xmlChar * array (haystack)
388: * @start: the index of the first char (zero based)
389: * @len: the length of the substring
390: *
391: * Extract a substring of a given string
392: *
393: * Returns the xmlChar * for the first occurrence or NULL.
394: */
395:
396: xmlChar *
397: xmlStrsub(const xmlChar *str, int start, int len) {
398: int i;
399:
400: if (str == NULL) return(NULL);
401: if (start < 0) return(NULL);
402: if (len < 0) return(NULL);
403:
404: for (i = 0;i < start;i++) {
405: if (*str == 0) return(NULL);
406: str++;
407: }
408: if (*str == 0) return(NULL);
409: return(xmlStrndup(str, len));
410: }
411:
412: /**
413: * xmlStrlen:
414: * @str: the xmlChar * array
415: *
416: * length of a xmlChar's string
417: *
418: * Returns the number of xmlChar contained in the ARRAY.
419: */
420:
421: int
422: xmlStrlen(const xmlChar *str) {
423: int len = 0;
424:
425: if (str == NULL) return(0);
426: while (*str != 0) { /* non input consuming */
427: str++;
428: len++;
429: }
430: return(len);
431: }
432:
433: /**
434: * xmlStrncat:
435: * @cur: the original xmlChar * array
436: * @add: the xmlChar * array added
437: * @len: the length of @add
438: *
439: * a strncat for array of xmlChar's, it will extend @cur with the len
440: * first bytes of @add. Note that if @len < 0 then this is an API error
441: * and NULL will be returned.
442: *
443: * Returns a new xmlChar *, the original @cur is reallocated if needed
444: * and should not be freed
445: */
446:
447: xmlChar *
448: xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
449: int size;
450: xmlChar *ret;
451:
452: if ((add == NULL) || (len == 0))
453: return(cur);
454: if (len < 0)
455: return(NULL);
456: if (cur == NULL)
457: return(xmlStrndup(add, len));
458:
459: size = xmlStrlen(cur);
460: ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar));
461: if (ret == NULL) {
462: xmlErrMemory(NULL, NULL);
463: return(cur);
464: }
465: memcpy(&ret[size], add, len * sizeof(xmlChar));
466: ret[size + len] = 0;
467: return(ret);
468: }
469:
470: /**
471: * xmlStrncatNew:
472: * @str1: first xmlChar string
473: * @str2: second xmlChar string
474: * @len: the len of @str2 or < 0
475: *
476: * same as xmlStrncat, but creates a new string. The original
477: * two strings are not freed. If @len is < 0 then the length
478: * will be calculated automatically.
479: *
480: * Returns a new xmlChar * or NULL
481: */
482: xmlChar *
483: xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
484: int size;
485: xmlChar *ret;
486:
487: if (len < 0)
488: len = xmlStrlen(str2);
489: if ((str2 == NULL) || (len == 0))
490: return(xmlStrdup(str1));
491: if (str1 == NULL)
492: return(xmlStrndup(str2, len));
493:
494: size = xmlStrlen(str1);
495: ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar));
496: if (ret == NULL) {
497: xmlErrMemory(NULL, NULL);
498: return(xmlStrndup(str1, size));
499: }
500: memcpy(ret, str1, size * sizeof(xmlChar));
501: memcpy(&ret[size], str2, len * sizeof(xmlChar));
502: ret[size + len] = 0;
503: return(ret);
504: }
505:
506: /**
507: * xmlStrcat:
508: * @cur: the original xmlChar * array
509: * @add: the xmlChar * array added
510: *
511: * a strcat for array of xmlChar's. Since they are supposed to be
512: * encoded in UTF-8 or an encoding with 8bit based chars, we assume
513: * a termination mark of '0'.
514: *
515: * Returns a new xmlChar * containing the concatenated string.
516: */
517: xmlChar *
518: xmlStrcat(xmlChar *cur, const xmlChar *add) {
519: const xmlChar *p = add;
520:
521: if (add == NULL) return(cur);
522: if (cur == NULL)
523: return(xmlStrdup(add));
524:
525: while (*p != 0) p++; /* non input consuming */
526: return(xmlStrncat(cur, add, p - add));
527: }
528:
/**
 * xmlStrPrintf:
 * @buf:  the result buffer.
 * @len:  the result buffer length.
 * @msg:  the message with printf formatting.
 * @...:  extra parameters for the message.
 *
 * Formats @msg and places the result into @buf. The output is always
 * 0-terminated within the @len items of @buf.
 *
 * Returns the value reported by vsnprintf() for the formatted message,
 * or -1 if @buf or @msg is NULL or @len is not positive.
 */
int XMLCDECL
xmlStrPrintf(xmlChar *buf, int len, const xmlChar *msg, ...) {
    va_list args;
    int ret;

    /* len <= 0 would make the terminator store below land before buf. */
    if ((buf == NULL) || (msg == NULL) || (len <= 0)) {
        return(-1);
    }

    va_start(args, msg);
    ret = vsnprintf((char *) buf, len, (const char *) msg, args);
    va_end(args);
    buf[len - 1] = 0; /* be safe ! */

    return(ret);
}
556:
/**
 * xmlStrVPrintf:
 * @buf:  the result buffer.
 * @len:  the result buffer length.
 * @msg:  the message with printf formatting.
 * @ap:  extra parameters for the message.
 *
 * Formats @msg and places the result into @buf. The output is always
 * 0-terminated within the @len items of @buf.
 *
 * Returns the value reported by vsnprintf() for the formatted message,
 * or -1 if @buf or @msg is NULL or @len is not positive.
 */
int
xmlStrVPrintf(xmlChar *buf, int len, const xmlChar *msg, va_list ap) {
    int ret;

    /* len <= 0 would make the terminator store below land before buf. */
    if ((buf == NULL) || (msg == NULL) || (len <= 0)) {
        return(-1);
    }

    ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
    buf[len - 1] = 0; /* be safe ! */

    return(ret);
}
581:
/************************************************************************
 *                                                                      *
 *              Generic UTF8 handling routines                          *
 *                                                                      *
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
 *                                                                      *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
 * 0000 0000-0000 007F   0xxxxxxx                                       *
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx            *
 *                                                                      *
 * I hope we won't use values > 0xFFFF anytime soon !                   *
 * (The routines below nevertheless handle 4-byte sequences.)           *
 *                                                                      *
 ************************************************************************/
596:
597:
598: /**
599: * xmlUTF8Size:
600: * @utf: pointer to the UTF8 character
601: *
602: * calculates the internal size of a UTF8 character
603: *
604: * returns the numbers of bytes in the character, -1 on format error
605: */
606: int
607: xmlUTF8Size(const xmlChar *utf) {
608: xmlChar mask;
609: int len;
610:
611: if (utf == NULL)
612: return -1;
613: if (*utf < 0x80)
614: return 1;
615: /* check valid UTF8 character */
616: if (!(*utf & 0x40))
617: return -1;
618: /* determine number of bytes in char */
619: len = 2;
620: for (mask=0x20; mask != 0; mask>>=1) {
621: if (!(*utf & mask))
622: return len;
623: len++;
624: }
625: return -1;
626: }
627:
628: /**
629: * xmlUTF8Charcmp:
630: * @utf1: pointer to first UTF8 char
631: * @utf2: pointer to second UTF8 char
632: *
633: * compares the two UCS4 values
634: *
635: * returns result of the compare as with xmlStrncmp
636: */
637: int
638: xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
639:
640: if (utf1 == NULL ) {
641: if (utf2 == NULL)
642: return 0;
643: return -1;
644: }
645: return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
646: }
647:
648: /**
649: * xmlUTF8Strlen:
650: * @utf: a sequence of UTF-8 encoded bytes
651: *
652: * compute the length of an UTF8 string, it doesn't do a full UTF8
653: * checking of the content of the string.
654: *
655: * Returns the number of characters in the string or -1 in case of error
656: */
657: int
658: xmlUTF8Strlen(const xmlChar *utf) {
659: int ret = 0;
660:
661: if (utf == NULL)
662: return(-1);
663:
664: while (*utf != 0) {
665: if (utf[0] & 0x80) {
666: if ((utf[1] & 0xc0) != 0x80)
667: return(-1);
668: if ((utf[0] & 0xe0) == 0xe0) {
669: if ((utf[2] & 0xc0) != 0x80)
670: return(-1);
671: if ((utf[0] & 0xf0) == 0xf0) {
672: if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
673: return(-1);
674: utf += 4;
675: } else {
676: utf += 3;
677: }
678: } else {
679: utf += 2;
680: }
681: } else {
682: utf++;
683: }
684: ret++;
685: }
686: return(ret);
687: }
688:
689: /**
690: * xmlGetUTF8Char:
691: * @utf: a sequence of UTF-8 encoded bytes
692: * @len: a pointer to the minimum number of bytes present in
693: * the sequence. This is used to assure the next character
694: * is completely contained within the sequence.
695: *
696: * Read the first UTF8 character from @utf
697: *
698: * Returns the char value or -1 in case of error, and sets *len to
699: * the actual number of bytes consumed (0 in case of error)
700: */
701: int
702: xmlGetUTF8Char(const unsigned char *utf, int *len) {
703: unsigned int c;
704:
705: if (utf == NULL)
706: goto error;
707: if (len == NULL)
708: goto error;
709: if (*len < 1)
710: goto error;
711:
712: c = utf[0];
713: if (c & 0x80) {
714: if (*len < 2)
715: goto error;
716: if ((utf[1] & 0xc0) != 0x80)
717: goto error;
718: if ((c & 0xe0) == 0xe0) {
719: if (*len < 3)
720: goto error;
721: if ((utf[2] & 0xc0) != 0x80)
722: goto error;
723: if ((c & 0xf0) == 0xf0) {
724: if (*len < 4)
725: goto error;
726: if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
727: goto error;
728: *len = 4;
729: /* 4-byte code */
730: c = (utf[0] & 0x7) << 18;
731: c |= (utf[1] & 0x3f) << 12;
732: c |= (utf[2] & 0x3f) << 6;
733: c |= utf[3] & 0x3f;
734: } else {
735: /* 3-byte code */
736: *len = 3;
737: c = (utf[0] & 0xf) << 12;
738: c |= (utf[1] & 0x3f) << 6;
739: c |= utf[2] & 0x3f;
740: }
741: } else {
742: /* 2-byte code */
743: *len = 2;
744: c = (utf[0] & 0x1f) << 6;
745: c |= utf[1] & 0x3f;
746: }
747: } else {
748: /* 1-byte code */
749: *len = 1;
750: }
751: return(c);
752:
753: error:
754: if (len != NULL)
755: *len = 0;
756: return(-1);
757: }
758:
759: /**
760: * xmlCheckUTF8:
761: * @utf: Pointer to putative UTF-8 encoded string.
762: *
763: * Checks @utf for being valid UTF-8. @utf is assumed to be
764: * null-terminated. This function is not super-strict, as it will
765: * allow longer UTF-8 sequences than necessary. Note that Java is
766: * capable of producing these sequences if provoked. Also note, this
767: * routine checks for the 4-byte maximum size, but does not check for
768: * 0x10ffff maximum value.
769: *
770: * Return value: true if @utf is valid.
771: **/
772: int
773: xmlCheckUTF8(const unsigned char *utf)
774: {
775: int ix;
776: unsigned char c;
777:
778: if (utf == NULL)
779: return(0);
780: /*
781: * utf is a string of 1, 2, 3 or 4 bytes. The valid strings
782: * are as follows (in "bit format"):
783: * 0xxxxxxx valid 1-byte
784: * 110xxxxx 10xxxxxx valid 2-byte
785: * 1110xxxx 10xxxxxx 10xxxxxx valid 3-byte
786: * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx valid 4-byte
787: */
788: for (ix = 0; (c = utf[ix]);) { /* string is 0-terminated */
789: if ((c & 0x80) == 0x00) { /* 1-byte code, starts with 10 */
790: ix++;
791: } else if ((c & 0xe0) == 0xc0) {/* 2-byte code, starts with 110 */
792: if ((utf[ix+1] & 0xc0 ) != 0x80)
793: return 0;
794: ix += 2;
795: } else if ((c & 0xf0) == 0xe0) {/* 3-byte code, starts with 1110 */
796: if (((utf[ix+1] & 0xc0) != 0x80) ||
797: ((utf[ix+2] & 0xc0) != 0x80))
798: return 0;
799: ix += 3;
800: } else if ((c & 0xf8) == 0xf0) {/* 4-byte code, starts with 11110 */
801: if (((utf[ix+1] & 0xc0) != 0x80) ||
802: ((utf[ix+2] & 0xc0) != 0x80) ||
803: ((utf[ix+3] & 0xc0) != 0x80))
804: return 0;
805: ix += 4;
806: } else /* unknown encoding */
807: return 0;
808: }
809: return(1);
810: }
811:
812: /**
813: * xmlUTF8Strsize:
814: * @utf: a sequence of UTF-8 encoded bytes
815: * @len: the number of characters in the array
816: *
817: * storage size of an UTF8 string
818: * the behaviour is not garanteed if the input string is not UTF-8
819: *
820: * Returns the storage size of
821: * the first 'len' characters of ARRAY
822: */
823:
824: int
825: xmlUTF8Strsize(const xmlChar *utf, int len) {
826: const xmlChar *ptr=utf;
827: xmlChar ch;
828:
829: if (utf == NULL)
830: return(0);
831:
832: if (len <= 0)
833: return(0);
834:
835: while ( len-- > 0) {
836: if ( !*ptr )
837: break;
838: if ( (ch = *ptr++) & 0x80)
839: while ((ch<<=1) & 0x80 ) {
840: ptr++;
841: if (*ptr == 0) break;
842: }
843: }
844: return (ptr - utf);
845: }
846:
847:
848: /**
849: * xmlUTF8Strndup:
850: * @utf: the input UTF8 *
851: * @len: the len of @utf (in chars)
852: *
853: * a strndup for array of UTF8's
854: *
855: * Returns a new UTF8 * or NULL
856: */
857: xmlChar *
858: xmlUTF8Strndup(const xmlChar *utf, int len) {
859: xmlChar *ret;
860: int i;
861:
862: if ((utf == NULL) || (len < 0)) return(NULL);
863: i = xmlUTF8Strsize(utf, len);
864: ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar));
865: if (ret == NULL) {
866: xmlGenericError(xmlGenericErrorContext,
867: "malloc of %ld byte failed\n",
868: (len + 1) * (long)sizeof(xmlChar));
869: return(NULL);
870: }
871: memcpy(ret, utf, i * sizeof(xmlChar));
872: ret[i] = 0;
873: return(ret);
874: }
875:
876: /**
877: * xmlUTF8Strpos:
878: * @utf: the input UTF8 *
879: * @pos: the position of the desired UTF8 char (in chars)
880: *
881: * a function to provide the equivalent of fetching a
882: * character from a string array
883: *
884: * Returns a pointer to the UTF8 character or NULL
885: */
886: const xmlChar *
887: xmlUTF8Strpos(const xmlChar *utf, int pos) {
888: xmlChar ch;
889:
890: if (utf == NULL) return(NULL);
891: if (pos < 0)
892: return(NULL);
893: while (pos--) {
894: if ((ch=*utf++) == 0) return(NULL);
895: if ( ch & 0x80 ) {
896: /* if not simple ascii, verify proper format */
897: if ( (ch & 0xc0) != 0xc0 )
898: return(NULL);
899: /* then skip over remaining bytes for this char */
900: while ( (ch <<= 1) & 0x80 )
901: if ( (*utf++ & 0xc0) != 0x80 )
902: return(NULL);
903: }
904: }
905: return((xmlChar *)utf);
906: }
907:
908: /**
909: * xmlUTF8Strloc:
910: * @utf: the input UTF8 *
911: * @utfchar: the UTF8 character to be found
912: *
913: * a function to provide the relative location of a UTF8 char
914: *
915: * Returns the relative character position of the desired char
916: * or -1 if not found
917: */
918: int
919: xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
920: int i, size;
921: xmlChar ch;
922:
923: if (utf==NULL || utfchar==NULL) return -1;
924: size = xmlUTF8Strsize(utfchar, 1);
925: for(i=0; (ch=*utf) != 0; i++) {
926: if (xmlStrncmp(utf, utfchar, size)==0)
927: return(i);
928: utf++;
929: if ( ch & 0x80 ) {
930: /* if not simple ascii, verify proper format */
931: if ( (ch & 0xc0) != 0xc0 )
932: return(-1);
933: /* then skip over remaining bytes for this char */
934: while ( (ch <<= 1) & 0x80 )
935: if ( (*utf++ & 0xc0) != 0x80 )
936: return(-1);
937: }
938: }
939:
940: return(-1);
941: }
942: /**
943: * xmlUTF8Strsub:
944: * @utf: a sequence of UTF-8 encoded bytes
945: * @start: relative pos of first char
946: * @len: total number to copy
947: *
948: * Create a substring from a given UTF-8 string
949: * Note: positions are given in units of UTF-8 chars
950: *
951: * Returns a pointer to a newly created string
952: * or NULL if any problem
953: */
954:
955: xmlChar *
956: xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
957: int i;
958: xmlChar ch;
959:
960: if (utf == NULL) return(NULL);
961: if (start < 0) return(NULL);
962: if (len < 0) return(NULL);
963:
964: /*
965: * Skip over any leading chars
966: */
967: for (i = 0;i < start;i++) {
968: if ((ch=*utf++) == 0) return(NULL);
969: if ( ch & 0x80 ) {
970: /* if not simple ascii, verify proper format */
971: if ( (ch & 0xc0) != 0xc0 )
972: return(NULL);
973: /* then skip over remaining bytes for this char */
974: while ( (ch <<= 1) & 0x80 )
975: if ( (*utf++ & 0xc0) != 0x80 )
976: return(NULL);
977: }
978: }
979:
980: return(xmlUTF8Strndup(utf, len));
981: }
982:
983: #define bottom_xmlstring
984: #include "elfgcchack.h"
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>