Annotation of embedaddon/libxml2/xmlstring.c, revision 1.1

1.1     ! misho       1: /*
        !             2:  * string.c : an XML string utilities module
        !             3:  *
        !             4:  * This module provides various utility functions for manipulating
        !             5:  * the xmlChar* type. All functions named xmlStr* have been moved here
        !             6:  * from the parser.c file (their original home). 
        !             7:  *
        !             8:  * See Copyright for the status of this software.
        !             9:  *
        !            10:  * UTF8 string routines from:
        !            11:  * William Brack <wbrack@mmm.com.hk>
        !            12:  *
        !            13:  * daniel@veillard.com
        !            14:  */
        !            15: 
        !            16: #define IN_LIBXML
        !            17: #include "libxml.h"
        !            18: 
        !            19: #include <stdlib.h>
        !            20: #include <string.h>
        !            21: #include <libxml/xmlmemory.h>
        !            22: #include <libxml/parserInternals.h>
        !            23: #include <libxml/xmlstring.h>
        !            24: 
        !            25: /************************************************************************
        !            26:  *                                                                      *
        !            27:  *                Commodity functions to handle xmlChars                *
        !            28:  *                                                                      *
        !            29:  ************************************************************************/
        !            30: 
        !            31: /**
        !            32:  * xmlStrndup:
        !            33:  * @cur:  the input xmlChar *
        !            34:  * @len:  the len of @cur
        !            35:  *
        !            36:  * a strndup for array of xmlChar's
        !            37:  *
        !            38:  * Returns a new xmlChar * or NULL
        !            39:  */
        !            40: xmlChar *
        !            41: xmlStrndup(const xmlChar *cur, int len) {
        !            42:     xmlChar *ret;
        !            43:     
        !            44:     if ((cur == NULL) || (len < 0)) return(NULL);
        !            45:     ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
        !            46:     if (ret == NULL) {
        !            47:         xmlErrMemory(NULL, NULL);
        !            48:         return(NULL);
        !            49:     }
        !            50:     memcpy(ret, cur, len * sizeof(xmlChar));
        !            51:     ret[len] = 0;
        !            52:     return(ret);
        !            53: }
        !            54: 
        !            55: /**
        !            56:  * xmlStrdup:
        !            57:  * @cur:  the input xmlChar *
        !            58:  *
        !            59:  * a strdup for array of xmlChar's. Since they are supposed to be
        !            60:  * encoded in UTF-8 or an encoding with 8bit based chars, we assume
        !            61:  * a termination mark of '0'.
        !            62:  *
        !            63:  * Returns a new xmlChar * or NULL
        !            64:  */
        !            65: xmlChar *
        !            66: xmlStrdup(const xmlChar *cur) {
        !            67:     const xmlChar *p = cur;
        !            68: 
        !            69:     if (cur == NULL) return(NULL);
        !            70:     while (*p != 0) p++; /* non input consuming */
        !            71:     return(xmlStrndup(cur, p - cur));
        !            72: }
        !            73: 
        !            74: /**
        !            75:  * xmlCharStrndup:
        !            76:  * @cur:  the input char *
        !            77:  * @len:  the len of @cur
        !            78:  *
        !            79:  * a strndup for char's to xmlChar's
        !            80:  *
        !            81:  * Returns a new xmlChar * or NULL
        !            82:  */
        !            83: 
        !            84: xmlChar *
        !            85: xmlCharStrndup(const char *cur, int len) {
        !            86:     int i;
        !            87:     xmlChar *ret;
        !            88:     
        !            89:     if ((cur == NULL) || (len < 0)) return(NULL);
        !            90:     ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
        !            91:     if (ret == NULL) {
        !            92:         xmlErrMemory(NULL, NULL);
        !            93:         return(NULL);
        !            94:     }
        !            95:     for (i = 0;i < len;i++) {
        !            96:         ret[i] = (xmlChar) cur[i];
        !            97:         if (ret[i] == 0) return(ret);
        !            98:     }
        !            99:     ret[len] = 0;
        !           100:     return(ret);
        !           101: }
        !           102: 
        !           103: /**
        !           104:  * xmlCharStrdup:
        !           105:  * @cur:  the input char *
        !           106:  *
        !           107:  * a strdup for char's to xmlChar's
        !           108:  *
        !           109:  * Returns a new xmlChar * or NULL
        !           110:  */
        !           111: 
        !           112: xmlChar *
        !           113: xmlCharStrdup(const char *cur) {
        !           114:     const char *p = cur;
        !           115: 
        !           116:     if (cur == NULL) return(NULL);
        !           117:     while (*p != '\0') p++; /* non input consuming */
        !           118:     return(xmlCharStrndup(cur, p - cur));
        !           119: }
        !           120: 
        !           121: /**
        !           122:  * xmlStrcmp:
        !           123:  * @str1:  the first xmlChar *
        !           124:  * @str2:  the second xmlChar *
        !           125:  *
        !           126:  * a strcmp for xmlChar's
        !           127:  *
        !           128:  * Returns the integer result of the comparison
        !           129:  */
        !           130: 
        !           131: int
        !           132: xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
        !           133:     register int tmp;
        !           134: 
        !           135:     if (str1 == str2) return(0);
        !           136:     if (str1 == NULL) return(-1);
        !           137:     if (str2 == NULL) return(1);
        !           138:     do {
        !           139:         tmp = *str1++ - *str2;
        !           140:         if (tmp != 0) return(tmp);
        !           141:     } while (*str2++ != 0);
        !           142:     return 0;
        !           143: }
        !           144: 
        !           145: /**
        !           146:  * xmlStrEqual:
        !           147:  * @str1:  the first xmlChar *
        !           148:  * @str2:  the second xmlChar *
        !           149:  *
        !           150:  * Check if both strings are equal of have same content.
        !           151:  * Should be a bit more readable and faster than xmlStrcmp()
        !           152:  *
        !           153:  * Returns 1 if they are equal, 0 if they are different
        !           154:  */
        !           155: 
        !           156: int
        !           157: xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
        !           158:     if (str1 == str2) return(1);
        !           159:     if (str1 == NULL) return(0);
        !           160:     if (str2 == NULL) return(0);
        !           161:     do {
        !           162:         if (*str1++ != *str2) return(0);
        !           163:     } while (*str2++);
        !           164:     return(1);
        !           165: }
        !           166: 
        !           167: /**
        !           168:  * xmlStrQEqual:
        !           169:  * @pref:  the prefix of the QName
        !           170:  * @name:  the localname of the QName
        !           171:  * @str:  the second xmlChar *
        !           172:  *
        !           173:  * Check if a QName is Equal to a given string 
        !           174:  *
        !           175:  * Returns 1 if they are equal, 0 if they are different
        !           176:  */
        !           177: 
        !           178: int
        !           179: xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
        !           180:     if (pref == NULL) return(xmlStrEqual(name, str));
        !           181:     if (name == NULL) return(0);
        !           182:     if (str == NULL) return(0);
        !           183: 
        !           184:     do {
        !           185:         if (*pref++ != *str) return(0);
        !           186:     } while ((*str++) && (*pref));
        !           187:     if (*str++ != ':') return(0);
        !           188:     do {
        !           189:         if (*name++ != *str) return(0);
        !           190:     } while (*str++);
        !           191:     return(1);
        !           192: }
        !           193: 
        !           194: /**
        !           195:  * xmlStrncmp:
        !           196:  * @str1:  the first xmlChar *
        !           197:  * @str2:  the second xmlChar *
        !           198:  * @len:  the max comparison length
        !           199:  *
        !           200:  * a strncmp for xmlChar's
        !           201:  *
        !           202:  * Returns the integer result of the comparison
        !           203:  */
        !           204: 
        !           205: int
        !           206: xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
        !           207:     register int tmp;
        !           208: 
        !           209:     if (len <= 0) return(0);
        !           210:     if (str1 == str2) return(0);
        !           211:     if (str1 == NULL) return(-1);
        !           212:     if (str2 == NULL) return(1);
        !           213: #ifdef __GNUC__
        !           214:     tmp = strncmp((const char *)str1, (const char *)str2, len);
        !           215:     return tmp;
        !           216: #else
        !           217:     do {
        !           218:         tmp = *str1++ - *str2;
        !           219:         if (tmp != 0 || --len == 0) return(tmp);
        !           220:     } while (*str2++ != 0);
        !           221:     return 0;
        !           222: #endif
        !           223: }
        !           224: 
        !           225: static const xmlChar casemap[256] = {
        !           226:     0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
        !           227:     0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
        !           228:     0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
        !           229:     0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
        !           230:     0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
        !           231:     0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
        !           232:     0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
        !           233:     0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
        !           234:     0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
        !           235:     0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
        !           236:     0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
        !           237:     0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
        !           238:     0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
        !           239:     0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
        !           240:     0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
        !           241:     0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
        !           242:     0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
        !           243:     0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
        !           244:     0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
        !           245:     0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
        !           246:     0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
        !           247:     0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
        !           248:     0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
        !           249:     0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
        !           250:     0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
        !           251:     0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
        !           252:     0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
        !           253:     0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
        !           254:     0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
        !           255:     0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
        !           256:     0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
        !           257:     0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
        !           258: };
        !           259: 
        !           260: /**
        !           261:  * xmlStrcasecmp:
        !           262:  * @str1:  the first xmlChar *
        !           263:  * @str2:  the second xmlChar *
        !           264:  *
        !           265:  * a strcasecmp for xmlChar's
        !           266:  *
        !           267:  * Returns the integer result of the comparison
        !           268:  */
        !           269: 
        !           270: int
        !           271: xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
        !           272:     register int tmp;
        !           273: 
        !           274:     if (str1 == str2) return(0);
        !           275:     if (str1 == NULL) return(-1);
        !           276:     if (str2 == NULL) return(1);
        !           277:     do {
        !           278:         tmp = casemap[*str1++] - casemap[*str2];
        !           279:         if (tmp != 0) return(tmp);
        !           280:     } while (*str2++ != 0);
        !           281:     return 0;
        !           282: }
        !           283: 
        !           284: /**
        !           285:  * xmlStrncasecmp:
        !           286:  * @str1:  the first xmlChar *
        !           287:  * @str2:  the second xmlChar *
        !           288:  * @len:  the max comparison length
        !           289:  *
        !           290:  * a strncasecmp for xmlChar's
        !           291:  *
        !           292:  * Returns the integer result of the comparison
        !           293:  */
        !           294: 
        !           295: int
        !           296: xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
        !           297:     register int tmp;
        !           298: 
        !           299:     if (len <= 0) return(0);
        !           300:     if (str1 == str2) return(0);
        !           301:     if (str1 == NULL) return(-1);
        !           302:     if (str2 == NULL) return(1);
        !           303:     do {
        !           304:         tmp = casemap[*str1++] - casemap[*str2];
        !           305:         if (tmp != 0 || --len == 0) return(tmp);
        !           306:     } while (*str2++ != 0);
        !           307:     return 0;
        !           308: }
        !           309: 
        !           310: /**
        !           311:  * xmlStrchr:
        !           312:  * @str:  the xmlChar * array
        !           313:  * @val:  the xmlChar to search
        !           314:  *
        !           315:  * a strchr for xmlChar's
        !           316:  *
        !           317:  * Returns the xmlChar * for the first occurrence or NULL.
        !           318:  */
        !           319: 
        !           320: const xmlChar *
        !           321: xmlStrchr(const xmlChar *str, xmlChar val) {
        !           322:     if (str == NULL) return(NULL);
        !           323:     while (*str != 0) { /* non input consuming */
        !           324:         if (*str == val) return((xmlChar *) str);
        !           325:         str++;
        !           326:     }
        !           327:     return(NULL);
        !           328: }
        !           329: 
        !           330: /**
        !           331:  * xmlStrstr:
        !           332:  * @str:  the xmlChar * array (haystack)
        !           333:  * @val:  the xmlChar to search (needle)
        !           334:  *
        !           335:  * a strstr for xmlChar's
        !           336:  *
        !           337:  * Returns the xmlChar * for the first occurrence or NULL.
        !           338:  */
        !           339: 
        !           340: const xmlChar *
        !           341: xmlStrstr(const xmlChar *str, const xmlChar *val) {
        !           342:     int n;
        !           343:     
        !           344:     if (str == NULL) return(NULL);
        !           345:     if (val == NULL) return(NULL);
        !           346:     n = xmlStrlen(val);
        !           347: 
        !           348:     if (n == 0) return(str);
        !           349:     while (*str != 0) { /* non input consuming */
        !           350:         if (*str == *val) {
        !           351:             if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
        !           352:         }
        !           353:         str++;
        !           354:     }
        !           355:     return(NULL);
        !           356: }
        !           357: 
        !           358: /**
        !           359:  * xmlStrcasestr:
        !           360:  * @str:  the xmlChar * array (haystack)
        !           361:  * @val:  the xmlChar to search (needle)
        !           362:  *
        !           363:  * a case-ignoring strstr for xmlChar's
        !           364:  *
        !           365:  * Returns the xmlChar * for the first occurrence or NULL.
        !           366:  */
        !           367: 
        !           368: const xmlChar *
        !           369: xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
        !           370:     int n;
        !           371:     
        !           372:     if (str == NULL) return(NULL);
        !           373:     if (val == NULL) return(NULL);
        !           374:     n = xmlStrlen(val);
        !           375: 
        !           376:     if (n == 0) return(str);
        !           377:     while (*str != 0) { /* non input consuming */
        !           378:         if (casemap[*str] == casemap[*val])
        !           379:             if (!xmlStrncasecmp(str, val, n)) return(str);
        !           380:         str++;
        !           381:     }
        !           382:     return(NULL);
        !           383: }
        !           384: 
        !           385: /**
        !           386:  * xmlStrsub:
        !           387:  * @str:  the xmlChar * array (haystack)
        !           388:  * @start:  the index of the first char (zero based)
        !           389:  * @len:  the length of the substring
        !           390:  *
        !           391:  * Extract a substring of a given string
        !           392:  *
        !           393:  * Returns the xmlChar * for the first occurrence or NULL.
        !           394:  */
        !           395: 
        !           396: xmlChar *
        !           397: xmlStrsub(const xmlChar *str, int start, int len) {
        !           398:     int i;
        !           399:     
        !           400:     if (str == NULL) return(NULL);
        !           401:     if (start < 0) return(NULL);
        !           402:     if (len < 0) return(NULL);
        !           403: 
        !           404:     for (i = 0;i < start;i++) {
        !           405:         if (*str == 0) return(NULL);
        !           406:         str++;
        !           407:     }
        !           408:     if (*str == 0) return(NULL);
        !           409:     return(xmlStrndup(str, len));
        !           410: }
        !           411: 
        !           412: /**
        !           413:  * xmlStrlen:
        !           414:  * @str:  the xmlChar * array
        !           415:  *
        !           416:  * length of a xmlChar's string
        !           417:  *
        !           418:  * Returns the number of xmlChar contained in the ARRAY.
        !           419:  */
        !           420: 
        !           421: int
        !           422: xmlStrlen(const xmlChar *str) {
        !           423:     int len = 0;
        !           424: 
        !           425:     if (str == NULL) return(0);
        !           426:     while (*str != 0) { /* non input consuming */
        !           427:         str++;
        !           428:         len++;
        !           429:     }
        !           430:     return(len);
        !           431: }
        !           432: 
        !           433: /**
        !           434:  * xmlStrncat:
        !           435:  * @cur:  the original xmlChar * array
        !           436:  * @add:  the xmlChar * array added
        !           437:  * @len:  the length of @add
        !           438:  *
        !           439:  * a strncat for array of xmlChar's, it will extend @cur with the len
        !           440:  * first bytes of @add. Note that if @len < 0 then this is an API error
        !           441:  * and NULL will be returned.
        !           442:  *
        !           443:  * Returns a new xmlChar *, the original @cur is reallocated if needed
        !           444:  * and should not be freed
        !           445:  */
        !           446: 
        !           447: xmlChar *
        !           448: xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
        !           449:     int size;
        !           450:     xmlChar *ret;
        !           451: 
        !           452:     if ((add == NULL) || (len == 0))
        !           453:         return(cur);
        !           454:     if (len < 0)
        !           455:        return(NULL);
        !           456:     if (cur == NULL)
        !           457:         return(xmlStrndup(add, len));
        !           458: 
        !           459:     size = xmlStrlen(cur);
        !           460:     ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar));
        !           461:     if (ret == NULL) {
        !           462:         xmlErrMemory(NULL, NULL);
        !           463:         return(cur);
        !           464:     }
        !           465:     memcpy(&ret[size], add, len * sizeof(xmlChar));
        !           466:     ret[size + len] = 0;
        !           467:     return(ret);
        !           468: }
        !           469: 
        !           470: /**
        !           471:  * xmlStrncatNew:
        !           472:  * @str1:  first xmlChar string
        !           473:  * @str2:  second xmlChar string
        !           474:  * @len:  the len of @str2 or < 0
        !           475:  *
        !           476:  * same as xmlStrncat, but creates a new string.  The original
        !           477:  * two strings are not freed. If @len is < 0 then the length
        !           478:  * will be calculated automatically.
        !           479:  *
        !           480:  * Returns a new xmlChar * or NULL
        !           481:  */
        !           482: xmlChar *
        !           483: xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
        !           484:     int size;
        !           485:     xmlChar *ret;
        !           486: 
        !           487:     if (len < 0)
        !           488:         len = xmlStrlen(str2);
        !           489:     if ((str2 == NULL) || (len == 0))
        !           490:         return(xmlStrdup(str1));
        !           491:     if (str1 == NULL)
        !           492:         return(xmlStrndup(str2, len));
        !           493: 
        !           494:     size = xmlStrlen(str1);
        !           495:     ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar));
        !           496:     if (ret == NULL) {
        !           497:         xmlErrMemory(NULL, NULL);
        !           498:         return(xmlStrndup(str1, size));
        !           499:     }
        !           500:     memcpy(ret, str1, size * sizeof(xmlChar));
        !           501:     memcpy(&ret[size], str2, len * sizeof(xmlChar));
        !           502:     ret[size + len] = 0;
        !           503:     return(ret);
        !           504: }
        !           505: 
        !           506: /**
        !           507:  * xmlStrcat:
        !           508:  * @cur:  the original xmlChar * array
        !           509:  * @add:  the xmlChar * array added
        !           510:  *
        !           511:  * a strcat for array of xmlChar's. Since they are supposed to be
        !           512:  * encoded in UTF-8 or an encoding with 8bit based chars, we assume
        !           513:  * a termination mark of '0'.
        !           514:  *
        !           515:  * Returns a new xmlChar * containing the concatenated string.
        !           516:  */
        !           517: xmlChar *
        !           518: xmlStrcat(xmlChar *cur, const xmlChar *add) {
        !           519:     const xmlChar *p = add;
        !           520: 
        !           521:     if (add == NULL) return(cur);
        !           522:     if (cur == NULL) 
        !           523:         return(xmlStrdup(add));
        !           524: 
        !           525:     while (*p != 0) p++; /* non input consuming */
        !           526:     return(xmlStrncat(cur, add, p - add));
        !           527: }
        !           528: 
        !           529: /**
        !           530:  * xmlStrPrintf:
        !           531:  * @buf:   the result buffer.
        !           532:  * @len:   the result buffer length.
        !           533:  * @msg:   the message with printf formatting.
        !           534:  * @...:   extra parameters for the message.
        !           535:  *
        !           536:  * Formats @msg and places result into @buf.
        !           537:  *
        !           538:  * Returns the number of characters written to @buf or -1 if an error occurs.
        !           539:  */
        !           540: int XMLCDECL 
        !           541: xmlStrPrintf(xmlChar *buf, int len, const xmlChar *msg, ...) {
        !           542:     va_list args;
        !           543:     int ret;
        !           544:     
        !           545:     if((buf == NULL) || (msg == NULL)) {
        !           546:         return(-1);
        !           547:     }
        !           548:     
        !           549:     va_start(args, msg);
        !           550:     ret = vsnprintf((char *) buf, len, (const char *) msg, args);
        !           551:     va_end(args);
        !           552:     buf[len - 1] = 0; /* be safe ! */
        !           553:     
        !           554:     return(ret);
        !           555: }
        !           556: 
        !           557: /**
        !           558:  * xmlStrVPrintf:
        !           559:  * @buf:   the result buffer.
        !           560:  * @len:   the result buffer length.
        !           561:  * @msg:   the message with printf formatting.
        !           562:  * @ap:    extra parameters for the message.
        !           563:  *
        !           564:  * Formats @msg and places result into @buf.
        !           565:  *
        !           566:  * Returns the number of characters written to @buf or -1 if an error occurs.
        !           567:  */
        !           568: int 
        !           569: xmlStrVPrintf(xmlChar *buf, int len, const xmlChar *msg, va_list ap) {
        !           570:     int ret;
        !           571:     
        !           572:     if((buf == NULL) || (msg == NULL)) {
        !           573:         return(-1);
        !           574:     }
        !           575:     
        !           576:     ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
        !           577:     buf[len - 1] = 0; /* be safe ! */
        !           578:     
        !           579:     return(ret);
        !           580: }
        !           581: 
        !           582: /************************************************************************
        !           583:  *                                                                      *
        !           584:  *              Generic UTF8 handling routines                          *
        !           585:  *                                                                      *
        !           586:  * From rfc2044: encoding of the Unicode values on UTF-8:               *
        !           587:  *                                                                      *
        !           588:  * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
        !           589:  * 0000 0000-0000 007F   0xxxxxxx                                       *
        !           590:  * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
        !           591:  * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
        !           592:  *                                                                      *
        !           593:  * I hope we won't use values > 0xFFFF anytime soon !                   *
        !           594:  *                                                                      *
        !           595:  ************************************************************************/
        !           596: 
        !           597: 
        !           598: /**
        !           599:  * xmlUTF8Size:
        !           600:  * @utf: pointer to the UTF8 character
        !           601:  *
        !           602:  * calculates the internal size of a UTF8 character
        !           603:  *
        !           604:  * returns the numbers of bytes in the character, -1 on format error
        !           605:  */
        !           606: int
        !           607: xmlUTF8Size(const xmlChar *utf) {
        !           608:     xmlChar mask;
        !           609:     int len;
        !           610: 
        !           611:     if (utf == NULL)
        !           612:         return -1;
        !           613:     if (*utf < 0x80)
        !           614:         return 1;
        !           615:     /* check valid UTF8 character */
        !           616:     if (!(*utf & 0x40))
        !           617:         return -1;
        !           618:     /* determine number of bytes in char */
        !           619:     len = 2;
        !           620:     for (mask=0x20; mask != 0; mask>>=1) {
        !           621:         if (!(*utf & mask))
        !           622:             return len;
        !           623:         len++;
        !           624:     }
        !           625:     return -1;
        !           626: }
        !           627: 
        !           628: /**
        !           629:  * xmlUTF8Charcmp:
        !           630:  * @utf1: pointer to first UTF8 char
        !           631:  * @utf2: pointer to second UTF8 char
        !           632:  *
        !           633:  * compares the two UCS4 values
        !           634:  *
        !           635:  * returns result of the compare as with xmlStrncmp
        !           636:  */
        !           637: int
        !           638: xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
        !           639: 
        !           640:     if (utf1 == NULL ) {
        !           641:         if (utf2 == NULL)
        !           642:             return 0;
        !           643:         return -1;
        !           644:     }
        !           645:     return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
        !           646: }
        !           647: 
        !           648: /**
        !           649:  * xmlUTF8Strlen:
        !           650:  * @utf:  a sequence of UTF-8 encoded bytes
        !           651:  *
        !           652:  * compute the length of an UTF8 string, it doesn't do a full UTF8
        !           653:  * checking of the content of the string.
        !           654:  *
        !           655:  * Returns the number of characters in the string or -1 in case of error
        !           656:  */
        !           657: int
        !           658: xmlUTF8Strlen(const xmlChar *utf) {
        !           659:     int ret = 0;
        !           660: 
        !           661:     if (utf == NULL)
        !           662:         return(-1);
        !           663: 
        !           664:     while (*utf != 0) {
        !           665:         if (utf[0] & 0x80) {
        !           666:             if ((utf[1] & 0xc0) != 0x80)
        !           667:                 return(-1);
        !           668:             if ((utf[0] & 0xe0) == 0xe0) {
        !           669:                 if ((utf[2] & 0xc0) != 0x80)
        !           670:                     return(-1);
        !           671:                 if ((utf[0] & 0xf0) == 0xf0) {
        !           672:                     if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
        !           673:                         return(-1);
        !           674:                     utf += 4;
        !           675:                 } else {
        !           676:                     utf += 3;
        !           677:                 }
        !           678:             } else {
        !           679:                 utf += 2;
        !           680:             }
        !           681:         } else {
        !           682:             utf++;
        !           683:         }
        !           684:         ret++;
        !           685:     }
        !           686:     return(ret);
        !           687: }
        !           688: 
        /**
         * xmlGetUTF8Char:
         * @utf:  a sequence of UTF-8 encoded bytes
         * @len:  a pointer to the minimum number of bytes present in
         *        the sequence.  This is used to assure the next character
         *        is completely contained within the sequence.
         *
         * Read the first UTF8 character from @utf.  Only well-formed input is
         * decoded: a continuation byte in lead position, an overlong encoding,
         * a UTF-16 surrogate (0xD800-0xDFFF), a value above 0x10FFFF or a
         * sequence longer than 4 bytes is reported as an error.
         *
         * Returns the char value or -1 in case of error, and sets *len to
         *        the actual number of bytes consumed (0 in case of error)
         */
        int
        xmlGetUTF8Char(const unsigned char *utf, int *len) {
            unsigned int c;

            if (utf == NULL)
                goto error;
            if (len == NULL)
                goto error;
            if (*len < 1)
                goto error;

            c = utf[0];
            if (c < 0x80) {
                /* 1-byte code */
                *len = 1;
                return((int) c);
            }

            /* every multi-byte code has at least one continuation byte */
            if ((*len < 2) || ((utf[1] & 0xc0) != 0x80))
                goto error;

            if (c < 0xe0) {
                /*
                 * 0x80..0xBF are continuation bytes and cannot lead a sequence;
                 * 0xC0 and 0xC1 could only encode values below 0x80 (overlong).
                 */
                if (c < 0xc2)
                    goto error;
                /* 2-byte code */
                c = (c & 0x1f) << 6;
                c |= utf[1] & 0x3f;
                *len = 2;
                return((int) c);
            }

            if ((*len < 3) || ((utf[2] & 0xc0) != 0x80))
                goto error;

            if (c < 0xf0) {
                /* 3-byte code */
                c = (c & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
                /* refuse overlong forms and UTF-16 surrogates */
                if ((c < 0x800) || ((c >= 0xd800) && (c <= 0xdfff)))
                    goto error;
                *len = 3;
                return((int) c);
            }

            /* the lead byte must be 11110xxx: 4 bytes is the maximum size */
            if ((c & 0xf8) != 0xf0)
                goto error;
            if ((*len < 4) || ((utf[3] & 0xc0) != 0x80))
                goto error;

            /* 4-byte code */
            c = (c & 0x7) << 18;
            c |= (utf[1] & 0x3f) << 12;
            c |= (utf[2] & 0x3f) << 6;
            c |= utf[3] & 0x3f;
            /* refuse overlong forms and values beyond the Unicode range */
            if ((c < 0x10000) || (c > 0x10ffff))
                goto error;
            *len = 4;
            return((int) c);

        error:
            if (len != NULL)
                *len = 0;
            return(-1);
        }
        !           758: 
        /**
         * xmlCheckUTF8:
         * @utf: Pointer to putative UTF-8 encoded string.
         *
         * Checks @utf for being valid UTF-8. @utf is assumed to be
         * null-terminated. This function is not super-strict, as it will
         * allow longer UTF-8 sequences than necessary. Note that Java is
         * capable of producing these sequences if provoked. Also note, this
         * routine checks for the 4-byte maximum size, but does not check for
         * 0x10ffff maximum value.
         *
         * The string is walked by advancing the pointer rather than a signed
         * index, so its length is not limited by the range of int.
         *
         * Return value: true if @utf is valid.
         **/
        int
        xmlCheckUTF8(const unsigned char *utf)
        {
            unsigned char c;

            if (utf == NULL)
                return(0);
            /*
             * utf is a string of 1, 2, 3 or 4 bytes.  The valid strings
             * are as follows (in "bit format"):
             *    0xxxxxxx                                      valid 1-byte
             *    110xxxxx 10xxxxxx                             valid 2-byte
             *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
             *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
             */
            while ((c = *utf) != 0) {           /* string is 0-terminated */
                if ((c & 0x80) == 0x00) {       /* 1-byte code, starts with 0 */
                    utf += 1;
                } else if ((c & 0xe0) == 0xc0) {/* 2-byte code, starts with 110 */
                    if ((utf[1] & 0xc0) != 0x80)
                        return(0);
                    utf += 2;
                } else if ((c & 0xf0) == 0xe0) {/* 3-byte code, starts with 1110 */
                    /* || stops at the first bad byte, so a NUL is never passed */
                    if (((utf[1] & 0xc0) != 0x80) ||
                        ((utf[2] & 0xc0) != 0x80))
                        return(0);
                    utf += 3;
                } else if ((c & 0xf8) == 0xf0) {/* 4-byte code, starts with 11110 */
                    if (((utf[1] & 0xc0) != 0x80) ||
                        ((utf[2] & 0xc0) != 0x80) ||
                        ((utf[3] & 0xc0) != 0x80))
                        return(0);
                    utf += 4;
                } else {                        /* unknown encoding */
                    return(0);
                }
            }
            return(1);
        }
        !           811: 
        !           812: /**
        !           813:  * xmlUTF8Strsize:
        !           814:  * @utf:  a sequence of UTF-8 encoded bytes
        !           815:  * @len:  the number of characters in the array
        !           816:  *
        !           817:  * storage size of an UTF8 string
        !           818:  * the behaviour is not garanteed if the input string is not UTF-8
        !           819:  *
        !           820:  * Returns the storage size of
        !           821:  * the first 'len' characters of ARRAY
        !           822:  */
        !           823: 
        !           824: int
        !           825: xmlUTF8Strsize(const xmlChar *utf, int len) {
        !           826:     const xmlChar   *ptr=utf;
        !           827:     xmlChar         ch;
        !           828: 
        !           829:     if (utf == NULL)
        !           830:         return(0);
        !           831: 
        !           832:     if (len <= 0)
        !           833:         return(0);
        !           834: 
        !           835:     while ( len-- > 0) {
        !           836:         if ( !*ptr )
        !           837:             break;
        !           838:         if ( (ch = *ptr++) & 0x80)
        !           839:             while ((ch<<=1) & 0x80 ) {
        !           840:                 ptr++;
        !           841:                if (*ptr == 0) break;
        !           842:            }
        !           843:     }
        !           844:     return (ptr - utf);
        !           845: }
        !           846: 
        !           847: 
        !           848: /**
        !           849:  * xmlUTF8Strndup:
        !           850:  * @utf:  the input UTF8 *
        !           851:  * @len:  the len of @utf (in chars)
        !           852:  *
        !           853:  * a strndup for array of UTF8's
        !           854:  *
        !           855:  * Returns a new UTF8 * or NULL
        !           856:  */
        !           857: xmlChar *
        !           858: xmlUTF8Strndup(const xmlChar *utf, int len) {
        !           859:     xmlChar *ret;
        !           860:     int i;
        !           861:     
        !           862:     if ((utf == NULL) || (len < 0)) return(NULL);
        !           863:     i = xmlUTF8Strsize(utf, len);
        !           864:     ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar));
        !           865:     if (ret == NULL) {
        !           866:         xmlGenericError(xmlGenericErrorContext,
        !           867:                 "malloc of %ld byte failed\n",
        !           868:                 (len + 1) * (long)sizeof(xmlChar));
        !           869:         return(NULL);
        !           870:     }
        !           871:     memcpy(ret, utf, i * sizeof(xmlChar));
        !           872:     ret[i] = 0;
        !           873:     return(ret);
        !           874: }
        !           875: 
        !           876: /**
        !           877:  * xmlUTF8Strpos:
        !           878:  * @utf:  the input UTF8 *
        !           879:  * @pos:  the position of the desired UTF8 char (in chars)
        !           880:  *
        !           881:  * a function to provide the equivalent of fetching a
        !           882:  * character from a string array
        !           883:  *
        !           884:  * Returns a pointer to the UTF8 character or NULL
        !           885:  */
        !           886: const xmlChar *
        !           887: xmlUTF8Strpos(const xmlChar *utf, int pos) {
        !           888:     xmlChar ch;
        !           889: 
        !           890:     if (utf == NULL) return(NULL);
        !           891:     if (pos < 0)
        !           892:         return(NULL);
        !           893:     while (pos--) {
        !           894:         if ((ch=*utf++) == 0) return(NULL);
        !           895:         if ( ch & 0x80 ) {
        !           896:             /* if not simple ascii, verify proper format */
        !           897:             if ( (ch & 0xc0) != 0xc0 )
        !           898:                 return(NULL);
        !           899:             /* then skip over remaining bytes for this char */
        !           900:             while ( (ch <<= 1) & 0x80 )
        !           901:                 if ( (*utf++ & 0xc0) != 0x80 )
        !           902:                     return(NULL);
        !           903:         }
        !           904:     }
        !           905:     return((xmlChar *)utf);
        !           906: }
        !           907: 
        !           908: /**
        !           909:  * xmlUTF8Strloc:
        !           910:  * @utf:  the input UTF8 *
        !           911:  * @utfchar:  the UTF8 character to be found
        !           912:  *
        !           913:  * a function to provide the relative location of a UTF8 char
        !           914:  *
        !           915:  * Returns the relative character position of the desired char
        !           916:  * or -1 if not found
        !           917:  */
        !           918: int
        !           919: xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
        !           920:     int i, size;
        !           921:     xmlChar ch;
        !           922: 
        !           923:     if (utf==NULL || utfchar==NULL) return -1;
        !           924:     size = xmlUTF8Strsize(utfchar, 1);
        !           925:         for(i=0; (ch=*utf) != 0; i++) {
        !           926:             if (xmlStrncmp(utf, utfchar, size)==0)
        !           927:                 return(i);
        !           928:             utf++;
        !           929:             if ( ch & 0x80 ) {
        !           930:                 /* if not simple ascii, verify proper format */
        !           931:                 if ( (ch & 0xc0) != 0xc0 )
        !           932:                     return(-1);
        !           933:                 /* then skip over remaining bytes for this char */
        !           934:                 while ( (ch <<= 1) & 0x80 )
        !           935:                     if ( (*utf++ & 0xc0) != 0x80 )
        !           936:                         return(-1);
        !           937:             }
        !           938:         }
        !           939: 
        !           940:     return(-1);
        !           941: }
        !           942: /**
        !           943:  * xmlUTF8Strsub:
        !           944:  * @utf:  a sequence of UTF-8 encoded bytes
        !           945:  * @start: relative pos of first char
        !           946:  * @len:   total number to copy
        !           947:  *
        !           948:  * Create a substring from a given UTF-8 string
        !           949:  * Note:  positions are given in units of UTF-8 chars
        !           950:  *
        !           951:  * Returns a pointer to a newly created string
        !           952:  * or NULL if any problem
        !           953:  */
        !           954: 
        !           955: xmlChar *
        !           956: xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
        !           957:     int            i;
        !           958:     xmlChar ch;
        !           959: 
        !           960:     if (utf == NULL) return(NULL);
        !           961:     if (start < 0) return(NULL);
        !           962:     if (len < 0) return(NULL);
        !           963: 
        !           964:     /*
        !           965:      * Skip over any leading chars
        !           966:      */
        !           967:     for (i = 0;i < start;i++) {
        !           968:         if ((ch=*utf++) == 0) return(NULL);
        !           969:         if ( ch & 0x80 ) {
        !           970:             /* if not simple ascii, verify proper format */
        !           971:             if ( (ch & 0xc0) != 0xc0 )
        !           972:                 return(NULL);
        !           973:             /* then skip over remaining bytes for this char */
        !           974:             while ( (ch <<= 1) & 0x80 )
        !           975:                 if ( (*utf++ & 0xc0) != 0x80 )
        !           976:                     return(NULL);
        !           977:         }
        !           978:     }
        !           979: 
        !           980:     return(xmlUTF8Strndup(utf, len));
        !           981: }
        !           982: 
        !           983: #define bottom_xmlstring
        !           984: #include "elfgcchack.h"

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>