embedaddon/libxml2/xmlstring.c - view

File: [ELWIX - Embedded LightWeight unIX -] / embedaddon / libxml2 / xmlstring.c
Revision 1.1.1.2 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Sun Jun 15 19:53:28 2014 UTC (9 years, 11 months ago) by misho
Branches: libxml2, MAIN
CVS tags: v2_9_1p0, v2_9_1, HEAD

libxml2 2.9.1

1: /* 2: * string.c : an XML string utilities module 3: * 4: * This module provides various utility functions for manipulating 5: * the xmlChar* type. All functions named xmlStr* have been moved here 6: * from the parser.c file (their original home). 7: * 8: * See Copyright for the status of this software. 9: * 10: * UTF8 string routines from: 11: * William Brack <wbrack@mmm.com.hk> 12: * 13: * daniel@veillard.com 14: */ 15: 16: #define IN_LIBXML 17: #include "libxml.h" 18: 19: #include <stdlib.h> 20: #include <string.h> 21: #include <libxml/xmlmemory.h> 22: #include <libxml/parserInternals.h> 23: #include <libxml/xmlstring.h> 24: 25: /************************************************************************ 26: * * 27: * Commodity functions to handle xmlChars * 28: * * 29: ************************************************************************/ 30: 31: /** 32: * xmlStrndup: 33: * @cur: the input xmlChar * 34: * @len: the len of @cur 35: * 36: * a strndup for array of xmlChar's 37: * 38: * Returns a new xmlChar * or NULL 39: */ 40: xmlChar * 41: xmlStrndup(const xmlChar *cur, int len) { 42: xmlChar *ret; 43: 44: if ((cur == NULL) || (len < 0)) return(NULL); 45: ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar)); 46: if (ret == NULL) { 47: xmlErrMemory(NULL, NULL); 48: return(NULL); 49: } 50: memcpy(ret, cur, len * sizeof(xmlChar)); 51: ret[len] = 0; 52: return(ret); 53: } 54: 55: /** 56: * xmlStrdup: 57: * @cur: the input xmlChar * 58: * 59: * a strdup for array of xmlChar's. Since they are supposed to be 60: * encoded in UTF-8 or an encoding with 8bit based chars, we assume 61: * a termination mark of '0'. 62: * 63: * Returns a new xmlChar * or NULL 64: */ 65: xmlChar * 66: xmlStrdup(const xmlChar *cur) { 67: const xmlChar *p = cur; 68: 69: if (cur == NULL) return(NULL); 70: while (*p != 0) p++; /* non input consuming */ 71: return(xmlStrndup(cur, p - cur)); 72: } 73: 74: /** 75: * xmlCharStrndup: 76: * @cur: the input char * 77: * @len: the len of @cur 78: * 79: * a strndup for char's to xmlChar's 80: * 81: * Returns a new xmlChar * or NULL 82: */ 83: 84: xmlChar * 85: xmlCharStrndup(const char *cur, int len) { 86: int i; 87: xmlChar *ret; 88: 89: if ((cur == NULL) || (len < 0)) return(NULL); 90: ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar)); 91: if (ret == NULL) { 92: xmlErrMemory(NULL, NULL); 93: return(NULL); 94: } 95: for (i = 0;i < len;i++) { 96: ret[i] = (xmlChar) cur[i]; 97: if (ret[i] == 0) return(ret); 98: } 99: ret[len] = 0; 100: return(ret); 101: } 102: 103: /** 104: * xmlCharStrdup: 105: * @cur: the input char * 106: * 107: * a strdup for char's to xmlChar's 108: * 109: * Returns a new xmlChar * or NULL 110: */ 111: 112: xmlChar * 113: xmlCharStrdup(const char *cur) { 114: const char *p = cur; 115: 116: if (cur == NULL) return(NULL); 117: while (*p != '\0') p++; /* non input consuming */ 118: return(xmlCharStrndup(cur, p - cur)); 119: } 120: 121: /** 122: * xmlStrcmp: 123: * @str1: the first xmlChar * 124: * @str2: the second xmlChar * 125: * 126: * a strcmp for xmlChar's 127: * 128: * Returns the integer result of the comparison 129: */ 130: 131: int 132: xmlStrcmp(const xmlChar *str1, const xmlChar *str2) { 133: register int tmp; 134: 135: if (str1 == str2) return(0); 136: if (str1 == NULL) return(-1); 137: if (str2 == NULL) return(1); 138: do { 139: tmp = *str1++ - *str2; 140: if (tmp != 0) return(tmp); 141: } while (*str2++ != 0); 142: return 0; 143: } 144: 145: /** 146: * xmlStrEqual: 147: * @str1: the first xmlChar * 148: * @str2: the second xmlChar * 149: * 150: * Check if both strings are equal of have same content. 151: * Should be a bit more readable and faster than xmlStrcmp() 152: * 153: * Returns 1 if they are equal, 0 if they are different 154: */ 155: 156: int 157: xmlStrEqual(const xmlChar *str1, const xmlChar *str2) { 158: if (str1 == str2) return(1); 159: if (str1 == NULL) return(0); 160: if (str2 == NULL) return(0); 161: do { 162: if (*str1++ != *str2) return(0); 163: } while (*str2++); 164: return(1); 165: } 166: 167: /** 168: * xmlStrQEqual: 169: * @pref: the prefix of the QName 170: * @name: the localname of the QName 171: * @str: the second xmlChar * 172: * 173: * Check if a QName is Equal to a given string 174: * 175: * Returns 1 if they are equal, 0 if they are different 176: */ 177: 178: int 179: xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) { 180: if (pref == NULL) return(xmlStrEqual(name, str)); 181: if (name == NULL) return(0); 182: if (str == NULL) return(0); 183: 184: do { 185: if (*pref++ != *str) return(0); 186: } while ((*str++) && (*pref)); 187: if (*str++ != ':') return(0); 188: do { 189: if (*name++ != *str) return(0); 190: } while (*str++); 191: return(1); 192: } 193: 194: /** 195: * xmlStrncmp: 196: * @str1: the first xmlChar * 197: * @str2: the second xmlChar * 198: * @len: the max comparison length 199: * 200: * a strncmp for xmlChar's 201: * 202: * Returns the integer result of the comparison 203: */ 204: 205: int 206: xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) { 207: register int tmp; 208: 209: if (len <= 0) return(0); 210: if (str1 == str2) return(0); 211: if (str1 == NULL) return(-1); 212: if (str2 == NULL) return(1); 213: #ifdef __GNUC__ 214: tmp = strncmp((const char *)str1, (const char *)str2, len); 215: return tmp; 216: #else 217: do { 218: tmp = *str1++ - *str2; 219: if (tmp != 0 || --len == 0) return(tmp); 220: } while (*str2++ != 0); 221: return 0; 222: #endif 223: } 224: 225: static const xmlChar casemap[256] = { 226: 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 227: 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, 228: 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17, 229: 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, 230: 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27, 231: 0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F, 232: 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37, 233: 0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F, 234: 0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 235: 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F, 236: 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 237: 0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F, 238: 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 239: 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F, 240: 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 241: 0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F, 242: 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87, 243: 0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F, 244: 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97, 245: 0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F, 246: 0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7, 247: 0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF, 248: 0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7, 249: 0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF, 250: 0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7, 251: 0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF, 252: 0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7, 253: 0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF, 254: 0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7, 255: 0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF, 256: 0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7, 257: 0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF 258: }; 259: 260: /** 261: * xmlStrcasecmp: 262: * @str1: the first xmlChar * 263: * @str2: the second xmlChar * 264: * 265: * a strcasecmp for xmlChar's 266: * 267: * Returns the integer result of the comparison 268: */ 269: 270: int 271: xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) { 272: register int tmp; 273: 274: if (str1 == str2) return(0); 275: if (str1 == NULL) return(-1); 276: if (str2 == NULL) return(1); 277: do { 278: tmp = casemap[*str1++] - casemap[*str2]; 279: if (tmp != 0) return(tmp); 280: } while (*str2++ != 0); 281: return 0; 282: } 283: 284: /** 285: * xmlStrncasecmp: 286: * @str1: the first xmlChar * 287: * @str2: the second xmlChar * 288: * @len: the max comparison length 289: * 290: * a strncasecmp for xmlChar's 291: * 292: * Returns the integer result of the comparison 293: */ 294: 295: int 296: xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) { 297: register int tmp; 298: 299: if (len <= 0) return(0); 300: if (str1 == str2) return(0); 301: if (str1 == NULL) return(-1); 302: if (str2 == NULL) return(1); 303: do { 304: tmp = casemap[*str1++] - casemap[*str2]; 305: if (tmp != 0 || --len == 0) return(tmp); 306: } while (*str2++ != 0); 307: return 0; 308: } 309: 310: /** 311: * xmlStrchr: 312: * @str: the xmlChar * array 313: * @val: the xmlChar to search 314: * 315: * a strchr for xmlChar's 316: * 317: * Returns the xmlChar * for the first occurrence or NULL. 318: */ 319: 320: const xmlChar * 321: xmlStrchr(const xmlChar *str, xmlChar val) { 322: if (str == NULL) return(NULL); 323: while (*str != 0) { /* non input consuming */ 324: if (*str == val) return((xmlChar *) str); 325: str++; 326: } 327: return(NULL); 328: } 329: 330: /** 331: * xmlStrstr: 332: * @str: the xmlChar * array (haystack) 333: * @val: the xmlChar to search (needle) 334: * 335: * a strstr for xmlChar's 336: * 337: * Returns the xmlChar * for the first occurrence or NULL. 338: */ 339: 340: const xmlChar * 341: xmlStrstr(const xmlChar *str, const xmlChar *val) { 342: int n; 343: 344: if (str == NULL) return(NULL); 345: if (val == NULL) return(NULL); 346: n = xmlStrlen(val); 347: 348: if (n == 0) return(str); 349: while (*str != 0) { /* non input consuming */ 350: if (*str == *val) { 351: if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str); 352: } 353: str++; 354: } 355: return(NULL); 356: } 357: 358: /** 359: * xmlStrcasestr: 360: * @str: the xmlChar * array (haystack) 361: * @val: the xmlChar to search (needle) 362: * 363: * a case-ignoring strstr for xmlChar's 364: * 365: * Returns the xmlChar * for the first occurrence or NULL. 366: */ 367: 368: const xmlChar * 369: xmlStrcasestr(const xmlChar *str, const xmlChar *val) { 370: int n; 371: 372: if (str == NULL) return(NULL); 373: if (val == NULL) return(NULL); 374: n = xmlStrlen(val); 375: 376: if (n == 0) return(str); 377: while (*str != 0) { /* non input consuming */ 378: if (casemap[*str] == casemap[*val]) 379: if (!xmlStrncasecmp(str, val, n)) return(str); 380: str++; 381: } 382: return(NULL); 383: } 384: 385: /** 386: * xmlStrsub: 387: * @str: the xmlChar * array (haystack) 388: * @start: the index of the first char (zero based) 389: * @len: the length of the substring 390: * 391: * Extract a substring of a given string 392: * 393: * Returns the xmlChar * for the first occurrence or NULL. 394: */ 395: 396: xmlChar * 397: xmlStrsub(const xmlChar *str, int start, int len) { 398: int i; 399: 400: if (str == NULL) return(NULL); 401: if (start < 0) return(NULL); 402: if (len < 0) return(NULL); 403: 404: for (i = 0;i < start;i++) { 405: if (*str == 0) return(NULL); 406: str++; 407: } 408: if (*str == 0) return(NULL); 409: return(xmlStrndup(str, len)); 410: } 411: 412: /** 413: * xmlStrlen: 414: * @str: the xmlChar * array 415: * 416: * length of a xmlChar's string 417: * 418: * Returns the number of xmlChar contained in the ARRAY. 419: */ 420: 421: int 422: xmlStrlen(const xmlChar *str) { 423: int len = 0; 424: 425: if (str == NULL) return(0); 426: while (*str != 0) { /* non input consuming */ 427: str++; 428: len++; 429: } 430: return(len); 431: } 432: 433: /** 434: * xmlStrncat: 435: * @cur: the original xmlChar * array 436: * @add: the xmlChar * array added 437: * @len: the length of @add 438: * 439: * a strncat for array of xmlChar's, it will extend @cur with the len 440: * first bytes of @add. Note that if @len < 0 then this is an API error 441: * and NULL will be returned. 442: * 443: * Returns a new xmlChar *, the original @cur is reallocated if needed 444: * and should not be freed 445: */ 446: 447: xmlChar * 448: xmlStrncat(xmlChar *cur, const xmlChar *add, int len) { 449: int size; 450: xmlChar *ret; 451: 452: if ((add == NULL) || (len == 0)) 453: return(cur); 454: if (len < 0) 455: return(NULL); 456: if (cur == NULL) 457: return(xmlStrndup(add, len)); 458: 459: size = xmlStrlen(cur); 460: ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar)); 461: if (ret == NULL) { 462: xmlErrMemory(NULL, NULL); 463: return(cur); 464: } 465: memcpy(&ret[size], add, len * sizeof(xmlChar)); 466: ret[size + len] = 0; 467: return(ret); 468: } 469: 470: /** 471: * xmlStrncatNew: 472: * @str1: first xmlChar string 473: * @str2: second xmlChar string 474: * @len: the len of @str2 or < 0 475: * 476: * same as xmlStrncat, but creates a new string. The original 477: * two strings are not freed. If @len is < 0 then the length 478: * will be calculated automatically. 479: * 480: * Returns a new xmlChar * or NULL 481: */ 482: xmlChar * 483: xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) { 484: int size; 485: xmlChar *ret; 486: 487: if (len < 0) 488: len = xmlStrlen(str2); 489: if ((str2 == NULL) || (len == 0)) 490: return(xmlStrdup(str1)); 491: if (str1 == NULL) 492: return(xmlStrndup(str2, len)); 493: 494: size = xmlStrlen(str1); 495: ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar)); 496: if (ret == NULL) { 497: xmlErrMemory(NULL, NULL); 498: return(xmlStrndup(str1, size)); 499: } 500: memcpy(ret, str1, size * sizeof(xmlChar)); 501: memcpy(&ret[size], str2, len * sizeof(xmlChar)); 502: ret[size + len] = 0; 503: return(ret); 504: } 505: 506: /** 507: * xmlStrcat: 508: * @cur: the original xmlChar * array 509: * @add: the xmlChar * array added 510: * 511: * a strcat for array of xmlChar's. Since they are supposed to be 512: * encoded in UTF-8 or an encoding with 8bit based chars, we assume 513: * a termination mark of '0'. 514: * 515: * Returns a new xmlChar * containing the concatenated string. 516: */ 517: xmlChar * 518: xmlStrcat(xmlChar *cur, const xmlChar *add) { 519: const xmlChar *p = add; 520: 521: if (add == NULL) return(cur); 522: if (cur == NULL) 523: return(xmlStrdup(add)); 524: 525: while (*p != 0) p++; /* non input consuming */ 526: return(xmlStrncat(cur, add, p - add)); 527: } 528: 529: /** 530: * xmlStrPrintf: 531: * @buf: the result buffer. 532: * @len: the result buffer length. 533: * @msg: the message with printf formatting. 534: * @...: extra parameters for the message. 535: * 536: * Formats @msg and places result into @buf. 537: * 538: * Returns the number of characters written to @buf or -1 if an error occurs. 539: */ 540: int XMLCDECL 541: xmlStrPrintf(xmlChar *buf, int len, const xmlChar *msg, ...) { 542: va_list args; 543: int ret; 544: 545: if((buf == NULL) || (msg == NULL)) { 546: return(-1); 547: } 548: 549: va_start(args, msg); 550: ret = vsnprintf((char *) buf, len, (const char *) msg, args); 551: va_end(args); 552: buf[len - 1] = 0; /* be safe ! */ 553: 554: return(ret); 555: } 556: 557: /** 558: * xmlStrVPrintf: 559: * @buf: the result buffer. 560: * @len: the result buffer length. 561: * @msg: the message with printf formatting. 562: * @ap: extra parameters for the message. 563: * 564: * Formats @msg and places result into @buf. 565: * 566: * Returns the number of characters written to @buf or -1 if an error occurs. 567: */ 568: int 569: xmlStrVPrintf(xmlChar *buf, int len, const xmlChar *msg, va_list ap) { 570: int ret; 571: 572: if((buf == NULL) || (msg == NULL)) { 573: return(-1); 574: } 575: 576: ret = vsnprintf((char *) buf, len, (const char *) msg, ap); 577: buf[len - 1] = 0; /* be safe ! */ 578: 579: return(ret); 580: } 581: 582: /************************************************************************ 583: * * 584: * Generic UTF8 handling routines * 585: * * 586: * From rfc2044: encoding of the Unicode values on UTF-8: * 587: * * 588: * UCS-4 range (hex.) UTF-8 octet sequence (binary) * 589: * 0000 0000-0000 007F 0xxxxxxx * 590: * 0000 0080-0000 07FF 110xxxxx 10xxxxxx * 591: * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx * 592: * * 593: * I hope we won't use values > 0xFFFF anytime soon ! * 594: * * 595: ************************************************************************/ 596: 597: 598: /** 599: * xmlUTF8Size: 600: * @utf: pointer to the UTF8 character 601: * 602: * calculates the internal size of a UTF8 character 603: * 604: * returns the numbers of bytes in the character, -1 on format error 605: */ 606: int 607: xmlUTF8Size(const xmlChar *utf) { 608: xmlChar mask; 609: int len; 610: 611: if (utf == NULL) 612: return -1; 613: if (*utf < 0x80) 614: return 1; 615: /* check valid UTF8 character */ 616: if (!(*utf & 0x40)) 617: return -1; 618: /* determine number of bytes in char */ 619: len = 2; 620: for (mask=0x20; mask != 0; mask>>=1) { 621: if (!(*utf & mask)) 622: return len; 623: len++; 624: } 625: return -1; 626: } 627: 628: /** 629: * xmlUTF8Charcmp: 630: * @utf1: pointer to first UTF8 char 631: * @utf2: pointer to second UTF8 char 632: * 633: * compares the two UCS4 values 634: * 635: * returns result of the compare as with xmlStrncmp 636: */ 637: int 638: xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) { 639: 640: if (utf1 == NULL ) { 641: if (utf2 == NULL) 642: return 0; 643: return -1; 644: } 645: return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1)); 646: } 647: 648: /** 649: * xmlUTF8Strlen: 650: * @utf: a sequence of UTF-8 encoded bytes 651: * 652: * compute the length of an UTF8 string, it doesn't do a full UTF8 653: * checking of the content of the string. 654: * 655: * Returns the number of characters in the string or -1 in case of error 656: */ 657: int 658: xmlUTF8Strlen(const xmlChar *utf) { 659: int ret = 0; 660: 661: if (utf == NULL) 662: return(-1); 663: 664: while (*utf != 0) { 665: if (utf[0] & 0x80) { 666: if ((utf[1] & 0xc0) != 0x80) 667: return(-1); 668: if ((utf[0] & 0xe0) == 0xe0) { 669: if ((utf[2] & 0xc0) != 0x80) 670: return(-1); 671: if ((utf[0] & 0xf0) == 0xf0) { 672: if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80) 673: return(-1); 674: utf += 4; 675: } else { 676: utf += 3; 677: } 678: } else { 679: utf += 2; 680: } 681: } else { 682: utf++; 683: } 684: ret++; 685: } 686: return(ret); 687: } 688: 689: /** 690: * xmlGetUTF8Char: 691: * @utf: a sequence of UTF-8 encoded bytes 692: * @len: a pointer to the minimum number of bytes present in 693: * the sequence. This is used to assure the next character 694: * is completely contained within the sequence. 695: * 696: * Read the first UTF8 character from @utf 697: * 698: * Returns the char value or -1 in case of error, and sets *len to 699: * the actual number of bytes consumed (0 in case of error) 700: */ 701: int 702: xmlGetUTF8Char(const unsigned char *utf, int *len) { 703: unsigned int c; 704: 705: if (utf == NULL) 706: goto error; 707: if (len == NULL) 708: goto error; 709: if (*len < 1) 710: goto error; 711: 712: c = utf[0]; 713: if (c & 0x80) { 714: if (*len < 2) 715: goto error; 716: if ((utf[1] & 0xc0) != 0x80) 717: goto error; 718: if ((c & 0xe0) == 0xe0) { 719: if (*len < 3) 720: goto error; 721: if ((utf[2] & 0xc0) != 0x80) 722: goto error; 723: if ((c & 0xf0) == 0xf0) { 724: if (*len < 4) 725: goto error; 726: if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80) 727: goto error; 728: *len = 4; 729: /* 4-byte code */ 730: c = (utf[0] & 0x7) << 18; 731: c |= (utf[1] & 0x3f) << 12; 732: c |= (utf[2] & 0x3f) << 6; 733: c |= utf[3] & 0x3f; 734: } else { 735: /* 3-byte code */ 736: *len = 3; 737: c = (utf[0] & 0xf) << 12; 738: c |= (utf[1] & 0x3f) << 6; 739: c |= utf[2] & 0x3f; 740: } 741: } else { 742: /* 2-byte code */ 743: *len = 2; 744: c = (utf[0] & 0x1f) << 6; 745: c |= utf[1] & 0x3f; 746: } 747: } else { 748: /* 1-byte code */ 749: *len = 1; 750: } 751: return(c); 752: 753: error: 754: if (len != NULL) 755: *len = 0; 756: return(-1); 757: } 758: 759: /** 760: * xmlCheckUTF8: 761: * @utf: Pointer to putative UTF-8 encoded string. 762: * 763: * Checks @utf for being valid UTF-8. @utf is assumed to be 764: * null-terminated. This function is not super-strict, as it will 765: * allow longer UTF-8 sequences than necessary. Note that Java is 766: * capable of producing these sequences if provoked. Also note, this 767: * routine checks for the 4-byte maximum size, but does not check for 768: * 0x10ffff maximum value. 769: * 770: * Return value: true if @utf is valid. 771: **/ 772: int 773: xmlCheckUTF8(const unsigned char *utf) 774: { 775: int ix; 776: unsigned char c; 777: 778: if (utf == NULL) 779: return(0); 780: /* 781: * utf is a string of 1, 2, 3 or 4 bytes. The valid strings 782: * are as follows (in "bit format"): 783: * 0xxxxxxx valid 1-byte 784: * 110xxxxx 10xxxxxx valid 2-byte 785: * 1110xxxx 10xxxxxx 10xxxxxx valid 3-byte 786: * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx valid 4-byte 787: */ 788: for (ix = 0; (c = utf[ix]);) { /* string is 0-terminated */ 789: if ((c & 0x80) == 0x00) { /* 1-byte code, starts with 10 */ 790: ix++; 791: } else if ((c & 0xe0) == 0xc0) {/* 2-byte code, starts with 110 */ 792: if ((utf[ix+1] & 0xc0 ) != 0x80) 793: return 0; 794: ix += 2; 795: } else if ((c & 0xf0) == 0xe0) {/* 3-byte code, starts with 1110 */ 796: if (((utf[ix+1] & 0xc0) != 0x80) || 797: ((utf[ix+2] & 0xc0) != 0x80)) 798: return 0; 799: ix += 3; 800: } else if ((c & 0xf8) == 0xf0) {/* 4-byte code, starts with 11110 */ 801: if (((utf[ix+1] & 0xc0) != 0x80) || 802: ((utf[ix+2] & 0xc0) != 0x80) || 803: ((utf[ix+3] & 0xc0) != 0x80)) 804: return 0; 805: ix += 4; 806: } else /* unknown encoding */ 807: return 0; 808: } 809: return(1); 810: } 811: 812: /** 813: * xmlUTF8Strsize: 814: * @utf: a sequence of UTF-8 encoded bytes 815: * @len: the number of characters in the array 816: * 817: * storage size of an UTF8 string 818: * the behaviour is not garanteed if the input string is not UTF-8 819: * 820: * Returns the storage size of 821: * the first 'len' characters of ARRAY 822: */ 823: 824: int 825: xmlUTF8Strsize(const xmlChar *utf, int len) { 826: const xmlChar *ptr=utf; 827: xmlChar ch; 828: 829: if (utf == NULL) 830: return(0); 831: 832: if (len <= 0) 833: return(0); 834: 835: while ( len-- > 0) { 836: if ( !*ptr ) 837: break; 838: if ( (ch = *ptr++) & 0x80) 839: while ((ch<<=1) & 0x80 ) { 840: ptr++; 841: if (*ptr == 0) break; 842: } 843: } 844: return (ptr - utf); 845: } 846: 847: 848: /** 849: * xmlUTF8Strndup: 850: * @utf: the input UTF8 * 851: * @len: the len of @utf (in chars) 852: * 853: * a strndup for array of UTF8's 854: * 855: * Returns a new UTF8 * or NULL 856: */ 857: xmlChar * 858: xmlUTF8Strndup(const xmlChar *utf, int len) { 859: xmlChar *ret; 860: int i; 861: 862: if ((utf == NULL) || (len < 0)) return(NULL); 863: i = xmlUTF8Strsize(utf, len); 864: ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar)); 865: if (ret == NULL) { 866: xmlGenericError(xmlGenericErrorContext, 867: "malloc of %ld byte failed\n", 868: (len + 1) * (long)sizeof(xmlChar)); 869: return(NULL); 870: } 871: memcpy(ret, utf, i * sizeof(xmlChar)); 872: ret[i] = 0; 873: return(ret); 874: } 875: 876: /** 877: * xmlUTF8Strpos: 878: * @utf: the input UTF8 * 879: * @pos: the position of the desired UTF8 char (in chars) 880: * 881: * a function to provide the equivalent of fetching a 882: * character from a string array 883: * 884: * Returns a pointer to the UTF8 character or NULL 885: */ 886: const xmlChar * 887: xmlUTF8Strpos(const xmlChar *utf, int pos) { 888: xmlChar ch; 889: 890: if (utf == NULL) return(NULL); 891: if (pos < 0) 892: return(NULL); 893: while (pos--) { 894: if ((ch=*utf++) == 0) return(NULL); 895: if ( ch & 0x80 ) { 896: /* if not simple ascii, verify proper format */ 897: if ( (ch & 0xc0) != 0xc0 ) 898: return(NULL); 899: /* then skip over remaining bytes for this char */ 900: while ( (ch <<= 1) & 0x80 ) 901: if ( (*utf++ & 0xc0) != 0x80 ) 902: return(NULL); 903: } 904: } 905: return((xmlChar *)utf); 906: } 907: 908: /** 909: * xmlUTF8Strloc: 910: * @utf: the input UTF8 * 911: * @utfchar: the UTF8 character to be found 912: * 913: * a function to provide the relative location of a UTF8 char 914: * 915: * Returns the relative character position of the desired char 916: * or -1 if not found 917: */ 918: int 919: xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) { 920: int i, size; 921: xmlChar ch; 922: 923: if (utf==NULL || utfchar==NULL) return -1; 924: size = xmlUTF8Strsize(utfchar, 1); 925: for(i=0; (ch=*utf) != 0; i++) { 926: if (xmlStrncmp(utf, utfchar, size)==0) 927: return(i); 928: utf++; 929: if ( ch & 0x80 ) { 930: /* if not simple ascii, verify proper format */ 931: if ( (ch & 0xc0) != 0xc0 ) 932: return(-1); 933: /* then skip over remaining bytes for this char */ 934: while ( (ch <<= 1) & 0x80 ) 935: if ( (*utf++ & 0xc0) != 0x80 ) 936: return(-1); 937: } 938: } 939: 940: return(-1); 941: } 942: /** 943: * xmlUTF8Strsub: 944: * @utf: a sequence of UTF-8 encoded bytes 945: * @start: relative pos of first char 946: * @len: total number to copy 947: * 948: * Create a substring from a given UTF-8 string 949: * Note: positions are given in units of UTF-8 chars 950: * 951: * Returns a pointer to a newly created string 952: * or NULL if any problem 953: */ 954: 955: xmlChar * 956: xmlUTF8Strsub(const xmlChar *utf, int start, int len) { 957: int i; 958: xmlChar ch; 959: 960: if (utf == NULL) return(NULL); 961: if (start < 0) return(NULL); 962: if (len < 0) return(NULL); 963: 964: /* 965: * Skip over any leading chars 966: */ 967: for (i = 0;i < start;i++) { 968: if ((ch=*utf++) == 0) return(NULL); 969: if ( ch & 0x80 ) { 970: /* if not simple ascii, verify proper format */ 971: if ( (ch & 0xc0) != 0xc0 ) 972: return(NULL); 973: /* then skip over remaining bytes for this char */ 974: while ( (ch <<= 1) & 0x80 ) 975: if ( (*utf++ & 0xc0) != 0x80 ) 976: return(NULL); 977: } 978: } 979: 980: return(xmlUTF8Strndup(utf, len)); 981: } 982: 983: #define bottom_xmlstring 984: #include "elfgcchack.h"