Annotation of embedaddon/libxml2/xmlstring.c, revision 1.1
1.1 ! misho 1: /*
! 2: * string.c : an XML string utilities module
! 3: *
! 4: * This module provides various utility functions for manipulating
! 5: * the xmlChar* type. All functions named xmlStr* have been moved here
! 6: * from the parser.c file (their original home).
! 7: *
! 8: * See Copyright for the status of this software.
! 9: *
! 10: * UTF8 string routines from:
! 11: * William Brack <wbrack@mmm.com.hk>
! 12: *
! 13: * daniel@veillard.com
! 14: */
! 15:
! 16: #define IN_LIBXML
! 17: #include "libxml.h"
! 18:
! 19: #include <stdlib.h>
! 20: #include <string.h>
! 21: #include <libxml/xmlmemory.h>
! 22: #include <libxml/parserInternals.h>
! 23: #include <libxml/xmlstring.h>
! 24:
! 25: /************************************************************************
! 26: * *
! 27: * Commodity functions to handle xmlChars *
! 28: * *
! 29: ************************************************************************/
! 30:
! 31: /**
! 32: * xmlStrndup:
! 33: * @cur: the input xmlChar *
! 34: * @len: the len of @cur
! 35: *
! 36: * a strndup for array of xmlChar's
! 37: *
! 38: * Returns a new xmlChar * or NULL
! 39: */
! 40: xmlChar *
! 41: xmlStrndup(const xmlChar *cur, int len) {
! 42: xmlChar *ret;
! 43:
! 44: if ((cur == NULL) || (len < 0)) return(NULL);
! 45: ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
! 46: if (ret == NULL) {
! 47: xmlErrMemory(NULL, NULL);
! 48: return(NULL);
! 49: }
! 50: memcpy(ret, cur, len * sizeof(xmlChar));
! 51: ret[len] = 0;
! 52: return(ret);
! 53: }
! 54:
! 55: /**
! 56: * xmlStrdup:
! 57: * @cur: the input xmlChar *
! 58: *
! 59: * a strdup for array of xmlChar's. Since they are supposed to be
! 60: * encoded in UTF-8 or an encoding with 8bit based chars, we assume
! 61: * a termination mark of '0'.
! 62: *
! 63: * Returns a new xmlChar * or NULL
! 64: */
! 65: xmlChar *
! 66: xmlStrdup(const xmlChar *cur) {
! 67: const xmlChar *p = cur;
! 68:
! 69: if (cur == NULL) return(NULL);
! 70: while (*p != 0) p++; /* non input consuming */
! 71: return(xmlStrndup(cur, p - cur));
! 72: }
! 73:
! 74: /**
! 75: * xmlCharStrndup:
! 76: * @cur: the input char *
! 77: * @len: the len of @cur
! 78: *
! 79: * a strndup for char's to xmlChar's
! 80: *
! 81: * Returns a new xmlChar * or NULL
! 82: */
! 83:
! 84: xmlChar *
! 85: xmlCharStrndup(const char *cur, int len) {
! 86: int i;
! 87: xmlChar *ret;
! 88:
! 89: if ((cur == NULL) || (len < 0)) return(NULL);
! 90: ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
! 91: if (ret == NULL) {
! 92: xmlErrMemory(NULL, NULL);
! 93: return(NULL);
! 94: }
! 95: for (i = 0;i < len;i++) {
! 96: ret[i] = (xmlChar) cur[i];
! 97: if (ret[i] == 0) return(ret);
! 98: }
! 99: ret[len] = 0;
! 100: return(ret);
! 101: }
! 102:
! 103: /**
! 104: * xmlCharStrdup:
! 105: * @cur: the input char *
! 106: *
! 107: * a strdup for char's to xmlChar's
! 108: *
! 109: * Returns a new xmlChar * or NULL
! 110: */
! 111:
! 112: xmlChar *
! 113: xmlCharStrdup(const char *cur) {
! 114: const char *p = cur;
! 115:
! 116: if (cur == NULL) return(NULL);
! 117: while (*p != '\0') p++; /* non input consuming */
! 118: return(xmlCharStrndup(cur, p - cur));
! 119: }
! 120:
! 121: /**
! 122: * xmlStrcmp:
! 123: * @str1: the first xmlChar *
! 124: * @str2: the second xmlChar *
! 125: *
! 126: * a strcmp for xmlChar's
! 127: *
! 128: * Returns the integer result of the comparison
! 129: */
! 130:
! 131: int
! 132: xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
! 133: register int tmp;
! 134:
! 135: if (str1 == str2) return(0);
! 136: if (str1 == NULL) return(-1);
! 137: if (str2 == NULL) return(1);
! 138: do {
! 139: tmp = *str1++ - *str2;
! 140: if (tmp != 0) return(tmp);
! 141: } while (*str2++ != 0);
! 142: return 0;
! 143: }
! 144:
! 145: /**
! 146: * xmlStrEqual:
! 147: * @str1: the first xmlChar *
! 148: * @str2: the second xmlChar *
! 149: *
! 150: * Check if both strings are equal of have same content.
! 151: * Should be a bit more readable and faster than xmlStrcmp()
! 152: *
! 153: * Returns 1 if they are equal, 0 if they are different
! 154: */
! 155:
! 156: int
! 157: xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
! 158: if (str1 == str2) return(1);
! 159: if (str1 == NULL) return(0);
! 160: if (str2 == NULL) return(0);
! 161: do {
! 162: if (*str1++ != *str2) return(0);
! 163: } while (*str2++);
! 164: return(1);
! 165: }
! 166:
! 167: /**
! 168: * xmlStrQEqual:
! 169: * @pref: the prefix of the QName
! 170: * @name: the localname of the QName
! 171: * @str: the second xmlChar *
! 172: *
! 173: * Check if a QName is Equal to a given string
! 174: *
! 175: * Returns 1 if they are equal, 0 if they are different
! 176: */
! 177:
! 178: int
! 179: xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
! 180: if (pref == NULL) return(xmlStrEqual(name, str));
! 181: if (name == NULL) return(0);
! 182: if (str == NULL) return(0);
! 183:
! 184: do {
! 185: if (*pref++ != *str) return(0);
! 186: } while ((*str++) && (*pref));
! 187: if (*str++ != ':') return(0);
! 188: do {
! 189: if (*name++ != *str) return(0);
! 190: } while (*str++);
! 191: return(1);
! 192: }
! 193:
! 194: /**
! 195: * xmlStrncmp:
! 196: * @str1: the first xmlChar *
! 197: * @str2: the second xmlChar *
! 198: * @len: the max comparison length
! 199: *
! 200: * a strncmp for xmlChar's
! 201: *
! 202: * Returns the integer result of the comparison
! 203: */
! 204:
! 205: int
! 206: xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
! 207: register int tmp;
! 208:
! 209: if (len <= 0) return(0);
! 210: if (str1 == str2) return(0);
! 211: if (str1 == NULL) return(-1);
! 212: if (str2 == NULL) return(1);
! 213: #ifdef __GNUC__
! 214: tmp = strncmp((const char *)str1, (const char *)str2, len);
! 215: return tmp;
! 216: #else
! 217: do {
! 218: tmp = *str1++ - *str2;
! 219: if (tmp != 0 || --len == 0) return(tmp);
! 220: } while (*str2++ != 0);
! 221: return 0;
! 222: #endif
! 223: }
! 224:
! 225: static const xmlChar casemap[256] = {
! 226: 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
! 227: 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
! 228: 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
! 229: 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
! 230: 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
! 231: 0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
! 232: 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
! 233: 0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
! 234: 0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
! 235: 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
! 236: 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
! 237: 0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
! 238: 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
! 239: 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
! 240: 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
! 241: 0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
! 242: 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
! 243: 0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
! 244: 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
! 245: 0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
! 246: 0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
! 247: 0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
! 248: 0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
! 249: 0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
! 250: 0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
! 251: 0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
! 252: 0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
! 253: 0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
! 254: 0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
! 255: 0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
! 256: 0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
! 257: 0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
! 258: };
! 259:
! 260: /**
! 261: * xmlStrcasecmp:
! 262: * @str1: the first xmlChar *
! 263: * @str2: the second xmlChar *
! 264: *
! 265: * a strcasecmp for xmlChar's
! 266: *
! 267: * Returns the integer result of the comparison
! 268: */
! 269:
! 270: int
! 271: xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
! 272: register int tmp;
! 273:
! 274: if (str1 == str2) return(0);
! 275: if (str1 == NULL) return(-1);
! 276: if (str2 == NULL) return(1);
! 277: do {
! 278: tmp = casemap[*str1++] - casemap[*str2];
! 279: if (tmp != 0) return(tmp);
! 280: } while (*str2++ != 0);
! 281: return 0;
! 282: }
! 283:
! 284: /**
! 285: * xmlStrncasecmp:
! 286: * @str1: the first xmlChar *
! 287: * @str2: the second xmlChar *
! 288: * @len: the max comparison length
! 289: *
! 290: * a strncasecmp for xmlChar's
! 291: *
! 292: * Returns the integer result of the comparison
! 293: */
! 294:
! 295: int
! 296: xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
! 297: register int tmp;
! 298:
! 299: if (len <= 0) return(0);
! 300: if (str1 == str2) return(0);
! 301: if (str1 == NULL) return(-1);
! 302: if (str2 == NULL) return(1);
! 303: do {
! 304: tmp = casemap[*str1++] - casemap[*str2];
! 305: if (tmp != 0 || --len == 0) return(tmp);
! 306: } while (*str2++ != 0);
! 307: return 0;
! 308: }
! 309:
! 310: /**
! 311: * xmlStrchr:
! 312: * @str: the xmlChar * array
! 313: * @val: the xmlChar to search
! 314: *
! 315: * a strchr for xmlChar's
! 316: *
! 317: * Returns the xmlChar * for the first occurrence or NULL.
! 318: */
! 319:
! 320: const xmlChar *
! 321: xmlStrchr(const xmlChar *str, xmlChar val) {
! 322: if (str == NULL) return(NULL);
! 323: while (*str != 0) { /* non input consuming */
! 324: if (*str == val) return((xmlChar *) str);
! 325: str++;
! 326: }
! 327: return(NULL);
! 328: }
! 329:
! 330: /**
! 331: * xmlStrstr:
! 332: * @str: the xmlChar * array (haystack)
! 333: * @val: the xmlChar to search (needle)
! 334: *
! 335: * a strstr for xmlChar's
! 336: *
! 337: * Returns the xmlChar * for the first occurrence or NULL.
! 338: */
! 339:
! 340: const xmlChar *
! 341: xmlStrstr(const xmlChar *str, const xmlChar *val) {
! 342: int n;
! 343:
! 344: if (str == NULL) return(NULL);
! 345: if (val == NULL) return(NULL);
! 346: n = xmlStrlen(val);
! 347:
! 348: if (n == 0) return(str);
! 349: while (*str != 0) { /* non input consuming */
! 350: if (*str == *val) {
! 351: if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
! 352: }
! 353: str++;
! 354: }
! 355: return(NULL);
! 356: }
! 357:
! 358: /**
! 359: * xmlStrcasestr:
! 360: * @str: the xmlChar * array (haystack)
! 361: * @val: the xmlChar to search (needle)
! 362: *
! 363: * a case-ignoring strstr for xmlChar's
! 364: *
! 365: * Returns the xmlChar * for the first occurrence or NULL.
! 366: */
! 367:
! 368: const xmlChar *
! 369: xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
! 370: int n;
! 371:
! 372: if (str == NULL) return(NULL);
! 373: if (val == NULL) return(NULL);
! 374: n = xmlStrlen(val);
! 375:
! 376: if (n == 0) return(str);
! 377: while (*str != 0) { /* non input consuming */
! 378: if (casemap[*str] == casemap[*val])
! 379: if (!xmlStrncasecmp(str, val, n)) return(str);
! 380: str++;
! 381: }
! 382: return(NULL);
! 383: }
! 384:
! 385: /**
! 386: * xmlStrsub:
! 387: * @str: the xmlChar * array (haystack)
! 388: * @start: the index of the first char (zero based)
! 389: * @len: the length of the substring
! 390: *
! 391: * Extract a substring of a given string
! 392: *
! 393: * Returns the xmlChar * for the first occurrence or NULL.
! 394: */
! 395:
! 396: xmlChar *
! 397: xmlStrsub(const xmlChar *str, int start, int len) {
! 398: int i;
! 399:
! 400: if (str == NULL) return(NULL);
! 401: if (start < 0) return(NULL);
! 402: if (len < 0) return(NULL);
! 403:
! 404: for (i = 0;i < start;i++) {
! 405: if (*str == 0) return(NULL);
! 406: str++;
! 407: }
! 408: if (*str == 0) return(NULL);
! 409: return(xmlStrndup(str, len));
! 410: }
! 411:
! 412: /**
! 413: * xmlStrlen:
! 414: * @str: the xmlChar * array
! 415: *
! 416: * length of a xmlChar's string
! 417: *
! 418: * Returns the number of xmlChar contained in the ARRAY.
! 419: */
! 420:
! 421: int
! 422: xmlStrlen(const xmlChar *str) {
! 423: int len = 0;
! 424:
! 425: if (str == NULL) return(0);
! 426: while (*str != 0) { /* non input consuming */
! 427: str++;
! 428: len++;
! 429: }
! 430: return(len);
! 431: }
! 432:
! 433: /**
! 434: * xmlStrncat:
! 435: * @cur: the original xmlChar * array
! 436: * @add: the xmlChar * array added
! 437: * @len: the length of @add
! 438: *
! 439: * a strncat for array of xmlChar's, it will extend @cur with the len
! 440: * first bytes of @add. Note that if @len < 0 then this is an API error
! 441: * and NULL will be returned.
! 442: *
! 443: * Returns a new xmlChar *, the original @cur is reallocated if needed
! 444: * and should not be freed
! 445: */
! 446:
! 447: xmlChar *
! 448: xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
! 449: int size;
! 450: xmlChar *ret;
! 451:
! 452: if ((add == NULL) || (len == 0))
! 453: return(cur);
! 454: if (len < 0)
! 455: return(NULL);
! 456: if (cur == NULL)
! 457: return(xmlStrndup(add, len));
! 458:
! 459: size = xmlStrlen(cur);
! 460: ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar));
! 461: if (ret == NULL) {
! 462: xmlErrMemory(NULL, NULL);
! 463: return(cur);
! 464: }
! 465: memcpy(&ret[size], add, len * sizeof(xmlChar));
! 466: ret[size + len] = 0;
! 467: return(ret);
! 468: }
! 469:
! 470: /**
! 471: * xmlStrncatNew:
! 472: * @str1: first xmlChar string
! 473: * @str2: second xmlChar string
! 474: * @len: the len of @str2 or < 0
! 475: *
! 476: * same as xmlStrncat, but creates a new string. The original
! 477: * two strings are not freed. If @len is < 0 then the length
! 478: * will be calculated automatically.
! 479: *
! 480: * Returns a new xmlChar * or NULL
! 481: */
! 482: xmlChar *
! 483: xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
! 484: int size;
! 485: xmlChar *ret;
! 486:
! 487: if (len < 0)
! 488: len = xmlStrlen(str2);
! 489: if ((str2 == NULL) || (len == 0))
! 490: return(xmlStrdup(str1));
! 491: if (str1 == NULL)
! 492: return(xmlStrndup(str2, len));
! 493:
! 494: size = xmlStrlen(str1);
! 495: ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar));
! 496: if (ret == NULL) {
! 497: xmlErrMemory(NULL, NULL);
! 498: return(xmlStrndup(str1, size));
! 499: }
! 500: memcpy(ret, str1, size * sizeof(xmlChar));
! 501: memcpy(&ret[size], str2, len * sizeof(xmlChar));
! 502: ret[size + len] = 0;
! 503: return(ret);
! 504: }
! 505:
! 506: /**
! 507: * xmlStrcat:
! 508: * @cur: the original xmlChar * array
! 509: * @add: the xmlChar * array added
! 510: *
! 511: * a strcat for array of xmlChar's. Since they are supposed to be
! 512: * encoded in UTF-8 or an encoding with 8bit based chars, we assume
! 513: * a termination mark of '0'.
! 514: *
! 515: * Returns a new xmlChar * containing the concatenated string.
! 516: */
! 517: xmlChar *
! 518: xmlStrcat(xmlChar *cur, const xmlChar *add) {
! 519: const xmlChar *p = add;
! 520:
! 521: if (add == NULL) return(cur);
! 522: if (cur == NULL)
! 523: return(xmlStrdup(add));
! 524:
! 525: while (*p != 0) p++; /* non input consuming */
! 526: return(xmlStrncat(cur, add, p - add));
! 527: }
! 528:
! 529: /**
! 530: * xmlStrPrintf:
! 531: * @buf: the result buffer.
! 532: * @len: the result buffer length.
! 533: * @msg: the message with printf formatting.
! 534: * @...: extra parameters for the message.
! 535: *
! 536: * Formats @msg and places result into @buf.
! 537: *
! 538: * Returns the number of characters written to @buf or -1 if an error occurs.
! 539: */
! 540: int XMLCDECL
! 541: xmlStrPrintf(xmlChar *buf, int len, const xmlChar *msg, ...) {
! 542: va_list args;
! 543: int ret;
! 544:
! 545: if((buf == NULL) || (msg == NULL)) {
! 546: return(-1);
! 547: }
! 548:
! 549: va_start(args, msg);
! 550: ret = vsnprintf((char *) buf, len, (const char *) msg, args);
! 551: va_end(args);
! 552: buf[len - 1] = 0; /* be safe ! */
! 553:
! 554: return(ret);
! 555: }
! 556:
! 557: /**
! 558: * xmlStrVPrintf:
! 559: * @buf: the result buffer.
! 560: * @len: the result buffer length.
! 561: * @msg: the message with printf formatting.
! 562: * @ap: extra parameters for the message.
! 563: *
! 564: * Formats @msg and places result into @buf.
! 565: *
! 566: * Returns the number of characters written to @buf or -1 if an error occurs.
! 567: */
! 568: int
! 569: xmlStrVPrintf(xmlChar *buf, int len, const xmlChar *msg, va_list ap) {
! 570: int ret;
! 571:
! 572: if((buf == NULL) || (msg == NULL)) {
! 573: return(-1);
! 574: }
! 575:
! 576: ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
! 577: buf[len - 1] = 0; /* be safe ! */
! 578:
! 579: return(ret);
! 580: }
! 581:
! 582: /************************************************************************
! 583: * *
! 584: * Generic UTF8 handling routines *
! 585: * *
! 586: * From rfc2044: encoding of the Unicode values on UTF-8: *
! 587: * *
! 588: * UCS-4 range (hex.) UTF-8 octet sequence (binary) *
! 589: * 0000 0000-0000 007F 0xxxxxxx *
! 590: * 0000 0080-0000 07FF 110xxxxx 10xxxxxx *
! 591: * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx *
! 592: * *
! 593: * I hope we won't use values > 0xFFFF anytime soon ! *
! 594: * *
! 595: ************************************************************************/
! 596:
! 597:
! 598: /**
! 599: * xmlUTF8Size:
! 600: * @utf: pointer to the UTF8 character
! 601: *
! 602: * calculates the internal size of a UTF8 character
! 603: *
! 604: * returns the numbers of bytes in the character, -1 on format error
! 605: */
! 606: int
! 607: xmlUTF8Size(const xmlChar *utf) {
! 608: xmlChar mask;
! 609: int len;
! 610:
! 611: if (utf == NULL)
! 612: return -1;
! 613: if (*utf < 0x80)
! 614: return 1;
! 615: /* check valid UTF8 character */
! 616: if (!(*utf & 0x40))
! 617: return -1;
! 618: /* determine number of bytes in char */
! 619: len = 2;
! 620: for (mask=0x20; mask != 0; mask>>=1) {
! 621: if (!(*utf & mask))
! 622: return len;
! 623: len++;
! 624: }
! 625: return -1;
! 626: }
! 627:
! 628: /**
! 629: * xmlUTF8Charcmp:
! 630: * @utf1: pointer to first UTF8 char
! 631: * @utf2: pointer to second UTF8 char
! 632: *
! 633: * compares the two UCS4 values
! 634: *
! 635: * returns result of the compare as with xmlStrncmp
! 636: */
! 637: int
! 638: xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
! 639:
! 640: if (utf1 == NULL ) {
! 641: if (utf2 == NULL)
! 642: return 0;
! 643: return -1;
! 644: }
! 645: return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
! 646: }
! 647:
! 648: /**
! 649: * xmlUTF8Strlen:
! 650: * @utf: a sequence of UTF-8 encoded bytes
! 651: *
! 652: * compute the length of an UTF8 string, it doesn't do a full UTF8
! 653: * checking of the content of the string.
! 654: *
! 655: * Returns the number of characters in the string or -1 in case of error
! 656: */
! 657: int
! 658: xmlUTF8Strlen(const xmlChar *utf) {
! 659: int ret = 0;
! 660:
! 661: if (utf == NULL)
! 662: return(-1);
! 663:
! 664: while (*utf != 0) {
! 665: if (utf[0] & 0x80) {
! 666: if ((utf[1] & 0xc0) != 0x80)
! 667: return(-1);
! 668: if ((utf[0] & 0xe0) == 0xe0) {
! 669: if ((utf[2] & 0xc0) != 0x80)
! 670: return(-1);
! 671: if ((utf[0] & 0xf0) == 0xf0) {
! 672: if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
! 673: return(-1);
! 674: utf += 4;
! 675: } else {
! 676: utf += 3;
! 677: }
! 678: } else {
! 679: utf += 2;
! 680: }
! 681: } else {
! 682: utf++;
! 683: }
! 684: ret++;
! 685: }
! 686: return(ret);
! 687: }
! 688:
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Read the first UTF8 character from @utf. Only the shape of the
 * sequence is verified (continuation bytes, at most 4 bytes); the
 * decoded value itself is not range checked.
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int val;
    int avail;

    /* without @len there is nowhere to report the consumed size */
    if (len == NULL)
        return(-1);
    if (utf == NULL)
        goto fail;

    avail = *len;
    if (avail < 1)
        goto fail;

    val = utf[0];
    if ((val & 0x80) == 0) {
        /* 1-byte code */
        *len = 1;
        return(val);
    }

    if ((avail < 2) || ((utf[1] & 0xc0) != 0x80))
        goto fail;
    if ((val & 0xe0) != 0xe0) {
        /* 2-byte code */
        *len = 2;
        val = (utf[0] & 0x1f) << 6;
        val |= utf[1] & 0x3f;
        return(val);
    }

    if ((avail < 3) || ((utf[2] & 0xc0) != 0x80))
        goto fail;
    if ((val & 0xf0) != 0xf0) {
        /* 3-byte code */
        *len = 3;
        val = (utf[0] & 0xf) << 12;
        val |= (utf[1] & 0x3f) << 6;
        val |= utf[2] & 0x3f;
        return(val);
    }

    if (avail < 4)
        goto fail;
    if (((val & 0xf8) != 0xf0) || ((utf[3] & 0xc0) != 0x80))
        goto fail;
    /* 4-byte code */
    *len = 4;
    val = (utf[0] & 0x7) << 18;
    val |= (utf[1] & 0x3f) << 12;
    val |= (utf[2] & 0x3f) << 6;
    val |= utf[3] & 0x3f;
    return(val);

fail:
    *len = 0;
    return(-1);
}
! 758:
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true (1) if @utf is valid, 0 otherwise or if @utf is NULL.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    const unsigned char *cur;

    if (utf == NULL)
        return(0);
    /*
     * Each character is one of the following (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     */
    cur = utf;
    while (*cur != 0) {                 /* string is 0-terminated */
        unsigned char lead = *cur;
        int trailing;
        int k;

        if ((lead & 0x80) == 0x00)      /* 1-byte code, starts with 0 */
            trailing = 0;
        else if ((lead & 0xe0) == 0xc0) /* 2-byte code, starts with 110 */
            trailing = 1;
        else if ((lead & 0xf0) == 0xe0) /* 3-byte code, starts with 1110 */
            trailing = 2;
        else if ((lead & 0xf8) == 0xf0) /* 4-byte code, starts with 11110 */
            trailing = 3;
        else                            /* unknown encoding */
            return(0);

        /*
         * Continuation bytes are checked in order, so a terminating 0
         * in the middle of a sequence fails before anything past it
         * is read.
         */
        for (k = 1; k <= trailing; k++) {
            if ((cur[k] & 0xc0) != 0x80)
                return(0);
        }
        cur += trailing + 1;
    }
    return(1);
}
! 811:
! 812: /**
! 813: * xmlUTF8Strsize:
! 814: * @utf: a sequence of UTF-8 encoded bytes
! 815: * @len: the number of characters in the array
! 816: *
! 817: * storage size of an UTF8 string
! 818: * the behaviour is not garanteed if the input string is not UTF-8
! 819: *
! 820: * Returns the storage size of
! 821: * the first 'len' characters of ARRAY
! 822: */
! 823:
! 824: int
! 825: xmlUTF8Strsize(const xmlChar *utf, int len) {
! 826: const xmlChar *ptr=utf;
! 827: xmlChar ch;
! 828:
! 829: if (utf == NULL)
! 830: return(0);
! 831:
! 832: if (len <= 0)
! 833: return(0);
! 834:
! 835: while ( len-- > 0) {
! 836: if ( !*ptr )
! 837: break;
! 838: if ( (ch = *ptr++) & 0x80)
! 839: while ((ch<<=1) & 0x80 ) {
! 840: ptr++;
! 841: if (*ptr == 0) break;
! 842: }
! 843: }
! 844: return (ptr - utf);
! 845: }
! 846:
! 847:
! 848: /**
! 849: * xmlUTF8Strndup:
! 850: * @utf: the input UTF8 *
! 851: * @len: the len of @utf (in chars)
! 852: *
! 853: * a strndup for array of UTF8's
! 854: *
! 855: * Returns a new UTF8 * or NULL
! 856: */
! 857: xmlChar *
! 858: xmlUTF8Strndup(const xmlChar *utf, int len) {
! 859: xmlChar *ret;
! 860: int i;
! 861:
! 862: if ((utf == NULL) || (len < 0)) return(NULL);
! 863: i = xmlUTF8Strsize(utf, len);
! 864: ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar));
! 865: if (ret == NULL) {
! 866: xmlGenericError(xmlGenericErrorContext,
! 867: "malloc of %ld byte failed\n",
! 868: (len + 1) * (long)sizeof(xmlChar));
! 869: return(NULL);
! 870: }
! 871: memcpy(ret, utf, i * sizeof(xmlChar));
! 872: ret[i] = 0;
! 873: return(ret);
! 874: }
! 875:
! 876: /**
! 877: * xmlUTF8Strpos:
! 878: * @utf: the input UTF8 *
! 879: * @pos: the position of the desired UTF8 char (in chars)
! 880: *
! 881: * a function to provide the equivalent of fetching a
! 882: * character from a string array
! 883: *
! 884: * Returns a pointer to the UTF8 character or NULL
! 885: */
! 886: const xmlChar *
! 887: xmlUTF8Strpos(const xmlChar *utf, int pos) {
! 888: xmlChar ch;
! 889:
! 890: if (utf == NULL) return(NULL);
! 891: if (pos < 0)
! 892: return(NULL);
! 893: while (pos--) {
! 894: if ((ch=*utf++) == 0) return(NULL);
! 895: if ( ch & 0x80 ) {
! 896: /* if not simple ascii, verify proper format */
! 897: if ( (ch & 0xc0) != 0xc0 )
! 898: return(NULL);
! 899: /* then skip over remaining bytes for this char */
! 900: while ( (ch <<= 1) & 0x80 )
! 901: if ( (*utf++ & 0xc0) != 0x80 )
! 902: return(NULL);
! 903: }
! 904: }
! 905: return((xmlChar *)utf);
! 906: }
! 907:
! 908: /**
! 909: * xmlUTF8Strloc:
! 910: * @utf: the input UTF8 *
! 911: * @utfchar: the UTF8 character to be found
! 912: *
! 913: * a function to provide the relative location of a UTF8 char
! 914: *
! 915: * Returns the relative character position of the desired char
! 916: * or -1 if not found
! 917: */
! 918: int
! 919: xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
! 920: int i, size;
! 921: xmlChar ch;
! 922:
! 923: if (utf==NULL || utfchar==NULL) return -1;
! 924: size = xmlUTF8Strsize(utfchar, 1);
! 925: for(i=0; (ch=*utf) != 0; i++) {
! 926: if (xmlStrncmp(utf, utfchar, size)==0)
! 927: return(i);
! 928: utf++;
! 929: if ( ch & 0x80 ) {
! 930: /* if not simple ascii, verify proper format */
! 931: if ( (ch & 0xc0) != 0xc0 )
! 932: return(-1);
! 933: /* then skip over remaining bytes for this char */
! 934: while ( (ch <<= 1) & 0x80 )
! 935: if ( (*utf++ & 0xc0) != 0x80 )
! 936: return(-1);
! 937: }
! 938: }
! 939:
! 940: return(-1);
! 941: }
! 942: /**
! 943: * xmlUTF8Strsub:
! 944: * @utf: a sequence of UTF-8 encoded bytes
! 945: * @start: relative pos of first char
! 946: * @len: total number to copy
! 947: *
! 948: * Create a substring from a given UTF-8 string
! 949: * Note: positions are given in units of UTF-8 chars
! 950: *
! 951: * Returns a pointer to a newly created string
! 952: * or NULL if any problem
! 953: */
! 954:
! 955: xmlChar *
! 956: xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
! 957: int i;
! 958: xmlChar ch;
! 959:
! 960: if (utf == NULL) return(NULL);
! 961: if (start < 0) return(NULL);
! 962: if (len < 0) return(NULL);
! 963:
! 964: /*
! 965: * Skip over any leading chars
! 966: */
! 967: for (i = 0;i < start;i++) {
! 968: if ((ch=*utf++) == 0) return(NULL);
! 969: if ( ch & 0x80 ) {
! 970: /* if not simple ascii, verify proper format */
! 971: if ( (ch & 0xc0) != 0xc0 )
! 972: return(NULL);
! 973: /* then skip over remaining bytes for this char */
! 974: while ( (ch <<= 1) & 0x80 )
! 975: if ( (*utf++ & 0xc0) != 0x80 )
! 976: return(NULL);
! 977: }
! 978: }
! 979:
! 980: return(xmlUTF8Strndup(utf, len));
! 981: }
! 982:
! 983: #define bottom_xmlstring
! 984: #include "elfgcchack.h"
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>