Annotation of embedaddon/php/ext/intl/grapheme/grapheme_string.c, revision 1.1
1.1 ! misho 1: /*
! 2: +----------------------------------------------------------------------+
! 3: | PHP Version 5 |
! 4: +----------------------------------------------------------------------+
! 5: | This source file is subject to version 3.01 of the PHP license, |
! 6: | that is bundled with this package in the file LICENSE, and is |
! 7: | available through the world-wide-web at the following url: |
! 8: | http://www.php.net/license/3_01.txt |
! 9: | If you did not receive a copy of the PHP license and are unable to |
! 10: | obtain it through the world-wide-web, please send a note to |
! 11: | license@php.net so we can mail you a copy immediately. |
! 12: +----------------------------------------------------------------------+
! 13: | Author: Ed Batutis <ed@batutis.com> |
! 14: +----------------------------------------------------------------------+
! 15: */
! 16:
! 17: /* {{{ includes */
! 18: #ifdef HAVE_CONFIG_H
! 19: #include "config.h"
! 20: #endif
! 21:
! 22: #include <php.h>
! 23: #include "grapheme.h"
! 24: #include "grapheme_util.h"
! 25:
! 26: #include <unicode/utypes.h>
! 27: #include <unicode/ucol.h>
! 28: #include <unicode/ustring.h>
! 29: #include <unicode/ubrk.h>
! 30:
! 31: #include "ext/standard/php_string.h"
! 32:
! 33: /* }}} */
! 34:
! 35: #define GRAPHEME_EXTRACT_TYPE_COUNT 0
! 36: #define GRAPHEME_EXTRACT_TYPE_MAXBYTES 1
! 37: #define GRAPHEME_EXTRACT_TYPE_MAXCHARS 2
! 38: #define GRAPHEME_EXTRACT_TYPE_MIN GRAPHEME_EXTRACT_TYPE_COUNT
! 39: #define GRAPHEME_EXTRACT_TYPE_MAX GRAPHEME_EXTRACT_TYPE_MAXCHARS
! 40:
! 41:
! 42: /* {{{ grapheme_register_constants
! 43: * Register API constants
! 44: */
! 45: void grapheme_register_constants( INIT_FUNC_ARGS )
! 46: {
! 47: REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT);
! 48: REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT);
! 49: REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT);
! 50: }
! 51: /* }}} */
! 52:
! 53: /* {{{ proto int grapheme_strlen(string str)
! 54: Get number of graphemes in a string */
! 55: PHP_FUNCTION(grapheme_strlen)
! 56: {
! 57: unsigned char* string;
! 58: int string_len;
! 59: UChar* ustring = NULL;
! 60: int ustring_len = 0;
! 61: int ret_len;
! 62: UErrorCode status;
! 63:
! 64: if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", (char **)&string, &string_len) == FAILURE) {
! 65:
! 66: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
! 67: "grapheme_strlen: unable to parse input param", 0 TSRMLS_CC );
! 68:
! 69: RETURN_FALSE;
! 70: }
! 71:
! 72: ret_len = grapheme_ascii_check(string, string_len);
! 73:
! 74: if ( ret_len >= 0 )
! 75: RETURN_LONG(ret_len);
! 76:
! 77: /* convert the string to UTF-16. */
! 78: status = U_ZERO_ERROR;
! 79: intl_convert_utf8_to_utf16(&ustring, &ustring_len, (char*) string, string_len, &status );
! 80:
! 81: if ( U_FAILURE( status ) ) {
! 82: /* Set global error code. */
! 83: intl_error_set_code( NULL, status TSRMLS_CC );
! 84:
! 85: /* Set error messages. */
! 86: intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
! 87: if (ustring) {
! 88: efree( ustring );
! 89: }
! 90: RETURN_NULL();
! 91: }
! 92:
! 93: ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 TSRMLS_CC );
! 94:
! 95: if (ustring) {
! 96: efree( ustring );
! 97: }
! 98:
! 99: if (ret_len >= 0) {
! 100: RETVAL_LONG(ret_len);
! 101: } else {
! 102: RETVAL_FALSE;
! 103: }
! 104: }
! 105: /* }}} */
! 106:
! 107: /* {{{ proto int grapheme_strpos(string haystack, string needle [, int offset ])
! 108: Find position of first occurrence of a string within another */
! 109: PHP_FUNCTION(grapheme_strpos)
! 110: {
! 111: unsigned char *haystack, *needle;
! 112: int haystack_len, needle_len;
! 113: unsigned char *found;
! 114: long loffset = 0;
! 115: int32_t offset = 0;
! 116: int ret_pos, uchar_pos;
! 117:
! 118: if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
! 119:
! 120: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
! 121: "grapheme_strpos: unable to parse input param", 0 TSRMLS_CC );
! 122:
! 123: RETURN_FALSE;
! 124: }
! 125:
! 126: if ( OUTSIDE_STRING(loffset, haystack_len) ) {
! 127:
! 128: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
! 129:
! 130: RETURN_FALSE;
! 131: }
! 132:
! 133: /* we checked that it will fit: */
! 134: offset = (int32_t) loffset;
! 135:
! 136: /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
! 137:
! 138: if (needle_len == 0) {
! 139:
! 140: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
! 141:
! 142: RETURN_FALSE;
! 143: }
! 144:
! 145:
! 146: /* quick check to see if the string might be there
! 147: * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
! 148: */
! 149: found = (unsigned char *)php_memnstr((char *)haystack + offset, (char *)needle, needle_len, (char *)haystack + haystack_len);
! 150:
! 151: /* if it isn't there the we are done */
! 152: if (!found) {
! 153: RETURN_FALSE;
! 154: }
! 155:
! 156: /* if it is there, and if the haystack is ascii, we are all done */
! 157: if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
! 158:
! 159: RETURN_LONG(found - haystack);
! 160: }
! 161:
! 162: /* do utf16 part of the strpos */
! 163: ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, &uchar_pos, 0 /* fIgnoreCase */ TSRMLS_CC );
! 164:
! 165: if ( ret_pos >= 0 ) {
! 166: RETURN_LONG(ret_pos + offset);
! 167: } else {
! 168: RETURN_FALSE;
! 169: }
! 170:
! 171: }
! 172: /* }}} */
! 173:
! 174: /* {{{ proto int grapheme_stripos(string haystack, string needle [, int offset ])
! 175: Find position of first occurrence of a string within another, ignoring case differences */
! 176: PHP_FUNCTION(grapheme_stripos)
! 177: {
! 178: unsigned char *haystack, *needle, *haystack_dup, *needle_dup;
! 179: int haystack_len, needle_len;
! 180: unsigned char *found;
! 181: long loffset = 0;
! 182: int32_t offset = 0;
! 183: int ret_pos, uchar_pos;
! 184: int is_ascii;
! 185:
! 186: if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
! 187:
! 188: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
! 189: "grapheme_stripos: unable to parse input param", 0 TSRMLS_CC );
! 190:
! 191: RETURN_FALSE;
! 192: }
! 193:
! 194: if ( OUTSIDE_STRING(loffset, haystack_len) ) {
! 195:
! 196: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Offset not contained in string", 1 TSRMLS_CC );
! 197:
! 198: RETURN_FALSE;
! 199: }
! 200:
! 201: /* we checked that it will fit: */
! 202: offset = (int32_t) loffset;
! 203:
! 204: /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
! 205:
! 206: if (needle_len == 0) {
! 207:
! 208: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Empty delimiter", 1 TSRMLS_CC );
! 209:
! 210: RETURN_FALSE;
! 211: }
! 212:
! 213:
! 214: is_ascii = ( grapheme_ascii_check(haystack, haystack_len) >= 0 );
! 215:
! 216: if ( is_ascii ) {
! 217: needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
! 218: php_strtolower((char *)needle_dup, needle_len);
! 219: haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
! 220: php_strtolower((char *)haystack_dup, haystack_len);
! 221:
! 222: found = (unsigned char*) php_memnstr((char *)haystack_dup + offset, (char *)needle_dup, needle_len, (char *)haystack_dup + haystack_len);
! 223:
! 224: efree(haystack_dup);
! 225: efree(needle_dup);
! 226:
! 227: if (found) {
! 228: RETURN_LONG(found - haystack_dup);
! 229: }
! 230:
! 231: /* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
! 232: if ( grapheme_ascii_check(needle, needle_len) >= 0 ) {
! 233: RETURN_FALSE;
! 234: }
! 235: }
! 236:
! 237: /* do utf16 part of the strpos */
! 238: ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, &uchar_pos, 1 /* fIgnoreCase */ TSRMLS_CC );
! 239:
! 240: if ( ret_pos >= 0 ) {
! 241: RETURN_LONG(ret_pos + offset);
! 242: } else {
! 243: RETURN_FALSE;
! 244: }
! 245:
! 246: }
! 247: /* }}} */
! 248:
! 249: /* {{{ proto int grapheme_strrpos(string haystack, string needle [, int offset])
! 250: Find position of last occurrence of a string within another */
! 251: PHP_FUNCTION(grapheme_strrpos)
! 252: {
! 253: unsigned char *haystack, *needle;
! 254: int haystack_len, needle_len;
! 255: long loffset = 0;
! 256: int32_t offset = 0;
! 257: int32_t ret_pos;
! 258: int is_ascii;
! 259:
! 260: if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
! 261:
! 262: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
! 263: "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC );
! 264:
! 265: RETURN_FALSE;
! 266: }
! 267:
! 268: if ( OUTSIDE_STRING(loffset, haystack_len) ) {
! 269:
! 270: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
! 271:
! 272: RETURN_FALSE;
! 273: }
! 274:
! 275: /* we checked that it will fit: */
! 276: offset = (int32_t) loffset;
! 277:
! 278: /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
! 279:
! 280: if (needle_len == 0) {
! 281:
! 282: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
! 283:
! 284: RETURN_FALSE;
! 285: }
! 286:
! 287: is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
! 288:
! 289: if ( is_ascii ) {
! 290:
! 291: ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
! 292:
! 293:
! 294: if ( ret_pos >= 0 ) {
! 295: RETURN_LONG(ret_pos);
! 296: }
! 297:
! 298: /* if the needle was ascii too, we are done */
! 299:
! 300: if ( grapheme_ascii_check(needle, needle_len) >= 0 ) {
! 301: RETURN_FALSE;
! 302: }
! 303:
! 304: /* else we need to continue via utf16 */
! 305: }
! 306:
! 307: ret_pos = grapheme_strrpos_utf16(haystack, haystack_len, needle, needle_len, offset, 0 /* f_ignore_case */ TSRMLS_CC);
! 308:
! 309: if ( ret_pos >= 0 ) {
! 310: RETURN_LONG(ret_pos);
! 311: } else {
! 312: RETURN_FALSE;
! 313: }
! 314:
! 315:
! 316: }
! 317: /* }}} */
! 318:
! 319: /* {{{ proto int grapheme_strripos(string haystack, string needle [, int offset])
! 320: Find position of last occurrence of a string within another, ignoring case */
! 321: PHP_FUNCTION(grapheme_strripos)
! 322: {
! 323: unsigned char *haystack, *needle;
! 324: int haystack_len, needle_len;
! 325: long loffset = 0;
! 326: int32_t offset = 0;
! 327: int32_t ret_pos;
! 328: int is_ascii;
! 329:
! 330: if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
! 331:
! 332: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
! 333: "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC );
! 334:
! 335: RETURN_FALSE;
! 336: }
! 337:
! 338: if ( OUTSIDE_STRING(loffset, haystack_len) ) {
! 339:
! 340: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
! 341:
! 342: RETURN_FALSE;
! 343: }
! 344:
! 345: /* we checked that it will fit: */
! 346: offset = (int32_t) loffset;
! 347:
! 348: /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
! 349:
! 350: if (needle_len == 0) {
! 351:
! 352: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
! 353:
! 354: RETURN_FALSE;
! 355: }
! 356:
! 357: is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
! 358:
! 359: if ( is_ascii ) {
! 360: unsigned char *needle_dup, *haystack_dup;
! 361:
! 362: needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
! 363: php_strtolower((char *)needle_dup, needle_len);
! 364: haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
! 365: php_strtolower((char *)haystack_dup, haystack_len);
! 366:
! 367: ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
! 368:
! 369: efree(haystack_dup);
! 370: efree(needle_dup);
! 371:
! 372: if ( ret_pos >= 0 ) {
! 373: RETURN_LONG(ret_pos);
! 374: }
! 375:
! 376: /* if the needle was ascii too, we are done */
! 377:
! 378: if ( grapheme_ascii_check(needle, needle_len) >= 0 ) {
! 379: RETURN_FALSE;
! 380: }
! 381:
! 382: /* else we need to continue via utf16 */
! 383: }
! 384:
! 385: ret_pos = grapheme_strrpos_utf16(haystack, haystack_len, needle, needle_len, offset, 1 /* f_ignore_case */ TSRMLS_CC);
! 386:
! 387: if ( ret_pos >= 0 ) {
! 388: RETURN_LONG(ret_pos);
! 389: } else {
! 390: RETURN_FALSE;
! 391: }
! 392:
! 393:
! 394: }
! 395: /* }}} */
! 396:
! 397: /* {{{ proto string grapheme_substr(string str, int start [, int length])
! 398: Returns part of a string */
! 399: PHP_FUNCTION(grapheme_substr)
! 400: {
! 401: unsigned char *str, *sub_str;
! 402: UChar *ustr;
! 403: int str_len, sub_str_len, ustr_len;
! 404: long lstart = 0, length = 0;
! 405: int32_t start = 0;
! 406: int iter_val;
! 407: UErrorCode status;
! 408: unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
! 409: UBreakIterator* bi = NULL;
! 410: int sub_str_start_pos, sub_str_end_pos;
! 411: int32_t (*iter_func)(UBreakIterator *);
! 412:
! 413: if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|l", (char **)&str, &str_len, &lstart, &length) == FAILURE) {
! 414:
! 415: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
! 416: "grapheme_substr: unable to parse input param", 0 TSRMLS_CC );
! 417:
! 418: RETURN_FALSE;
! 419: }
! 420:
! 421: if ( OUTSIDE_STRING(lstart, str_len) ) {
! 422:
! 423: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
! 424:
! 425: RETURN_FALSE;
! 426: }
! 427:
! 428: /* we checked that it will fit: */
! 429: start = (int32_t) lstart;
! 430:
! 431: /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
! 432:
! 433: if ( grapheme_ascii_check(str, str_len) >= 0 ) {
! 434: grapheme_substr_ascii((char *)str, str_len, start, length, ZEND_NUM_ARGS(), (char **) &sub_str, &sub_str_len);
! 435:
! 436: if ( NULL == sub_str ) {
! 437: RETURN_FALSE;
! 438: }
! 439:
! 440: RETURN_STRINGL(((char *)sub_str), sub_str_len, 1);
! 441: }
! 442:
! 443: ustr = NULL;
! 444: ustr_len = 0;
! 445: status = U_ZERO_ERROR;
! 446: intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)str, str_len, &status);
! 447:
! 448: if ( U_FAILURE( status ) ) {
! 449: /* Set global error code. */
! 450: intl_error_set_code( NULL, status TSRMLS_CC );
! 451:
! 452: /* Set error messages. */
! 453: intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
! 454: if (ustr) {
! 455: efree( ustr );
! 456: }
! 457: RETURN_FALSE;
! 458: }
! 459:
! 460: bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status TSRMLS_CC );
! 461:
! 462: if( U_FAILURE(status) ) {
! 463: RETURN_FALSE;
! 464: }
! 465:
! 466: ubrk_setText(bi, ustr, ustr_len, &status);
! 467:
! 468: if ( start < 0 ) {
! 469: iter_func = ubrk_previous;
! 470: ubrk_last(bi);
! 471: iter_val = 1;
! 472: }
! 473: else {
! 474: iter_func = ubrk_next;
! 475: iter_val = -1;
! 476: }
! 477:
! 478: sub_str_start_pos = 0;
! 479:
! 480: while ( start ) {
! 481: sub_str_start_pos = iter_func(bi);
! 482:
! 483: if ( UBRK_DONE == sub_str_start_pos ) {
! 484: break;
! 485: }
! 486:
! 487: start += iter_val;
! 488: }
! 489:
! 490: if ( 0 != start || sub_str_start_pos >= ustr_len ) {
! 491:
! 492: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
! 493:
! 494: if (ustr) {
! 495: efree(ustr);
! 496: }
! 497: ubrk_close(bi);
! 498: RETURN_FALSE;
! 499: }
! 500:
! 501: if (ZEND_NUM_ARGS() <= 2) {
! 502:
! 503: /* no length supplied, return the rest of the string */
! 504:
! 505: sub_str = NULL;
! 506: sub_str_len = 0;
! 507: status = U_ZERO_ERROR;
! 508: intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
! 509:
! 510: if (ustr) {
! 511: efree( ustr );
! 512: }
! 513: ubrk_close( bi );
! 514:
! 515: if ( U_FAILURE( status ) ) {
! 516: /* Set global error code. */
! 517: intl_error_set_code( NULL, status TSRMLS_CC );
! 518:
! 519: /* Set error messages. */
! 520: intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC );
! 521:
! 522: if (sub_str) {
! 523: efree( sub_str );
! 524: }
! 525:
! 526: RETURN_FALSE;
! 527: }
! 528:
! 529: /* return the allocated string, not a duplicate */
! 530: RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
! 531: }
! 532:
! 533: /* find the end point of the string to return */
! 534:
! 535: if ( length < 0 ) {
! 536: iter_func = ubrk_previous;
! 537: ubrk_last(bi);
! 538: iter_val = 1;
! 539: }
! 540: else {
! 541: iter_func = ubrk_next;
! 542: iter_val = -1;
! 543: }
! 544:
! 545: sub_str_end_pos = 0;
! 546:
! 547: while ( length ) {
! 548: sub_str_end_pos = iter_func(bi);
! 549:
! 550: if ( UBRK_DONE == sub_str_end_pos ) {
! 551: break;
! 552: }
! 553:
! 554: length += iter_val;
! 555: }
! 556:
! 557: if ( UBRK_DONE == sub_str_end_pos && length < 0) {
! 558:
! 559: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length not contained in string", 1 TSRMLS_CC );
! 560:
! 561: efree(ustr);
! 562: ubrk_close(bi);
! 563: RETURN_FALSE;
! 564: }
! 565:
! 566: sub_str = NULL;
! 567: status = U_ZERO_ERROR;
! 568: intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
! 569:
! 570: efree( ustr );
! 571: ubrk_close( bi );
! 572:
! 573: if ( U_FAILURE( status ) ) {
! 574: /* Set global error code. */
! 575: intl_error_set_code( NULL, status TSRMLS_CC );
! 576:
! 577: /* Set error messages. */
! 578: intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC );
! 579:
! 580: if ( NULL != sub_str )
! 581: efree( sub_str );
! 582:
! 583: RETURN_FALSE;
! 584: }
! 585:
! 586: /* return the allocated string, not a duplicate */
! 587: RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
! 588:
! 589: }
! 590: /* }}} */
! 591:
! 592: /* {{{ strstr_common_handler */
! 593: static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
! 594: {
! 595: unsigned char *haystack, *needle, *found;
! 596: int haystack_len, needle_len;
! 597: int ret_pos, uchar_pos;
! 598: zend_bool part = 0;
! 599:
! 600: if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|b", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &part) == FAILURE) {
! 601:
! 602: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
! 603: "grapheme_strstr: unable to parse input param", 0 TSRMLS_CC );
! 604:
! 605: RETURN_FALSE;
! 606: }
! 607:
! 608: if (needle_len == 0) {
! 609:
! 610: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
! 611:
! 612: RETURN_FALSE;
! 613: }
! 614:
! 615:
! 616: if ( !f_ignore_case ) {
! 617:
! 618: /* ASCII optimization: quick check to see if the string might be there
! 619: * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
! 620: */
! 621: found = (unsigned char *)php_memnstr((char *)haystack, (char *)needle, needle_len, (char *)haystack + haystack_len);
! 622:
! 623: /* if it isn't there the we are done */
! 624: if ( !found ) {
! 625: RETURN_FALSE;
! 626: }
! 627:
! 628: /* if it is there, and if the haystack is ascii, we are all done */
! 629: if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
! 630: size_t found_offset = found - haystack;
! 631:
! 632: if (part) {
! 633: RETURN_STRINGL(((char *)haystack) , found_offset, 1);
! 634: } else {
! 635: RETURN_STRINGL(((char *)found), haystack_len - found_offset, 1);
! 636: }
! 637: }
! 638:
! 639: }
! 640:
! 641: /* need to work in utf16 */
! 642: ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case TSRMLS_CC );
! 643:
! 644: if ( ret_pos < 0 ) {
! 645: RETURN_FALSE;
! 646: }
! 647:
! 648: /* uchar_pos is the 'nth' Unicode character position of the needle */
! 649:
! 650: ret_pos = 0;
! 651: U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos);
! 652:
! 653: if (part) {
! 654: RETURN_STRINGL(((char *)haystack), ret_pos, 1);
! 655: }
! 656: else {
! 657: RETURN_STRINGL(((char *)haystack) + ret_pos, haystack_len - ret_pos, 1);
! 658: }
! 659:
! 660: }
! 661: /* }}} */
! 662:
/* {{{ proto string grapheme_strstr(string haystack, string needle[, bool part])
   Finds first occurrence of a string within another (case-sensitive).
   Thin wrapper: all argument handling and searching is done by
   strstr_common_handler() with f_ignore_case = 0. */
PHP_FUNCTION(grapheme_strstr)
{
	strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
}
/* }}} */
! 670:
/* {{{ proto string grapheme_stristr(string haystack, string needle[, bool part])
   Finds first occurrence of a string within another, ignoring case.
   Thin wrapper: all argument handling and searching is done by
   strstr_common_handler() with f_ignore_case = 1. */
PHP_FUNCTION(grapheme_stristr)
{
	strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
}
/* }}} */
! 678:
! 679: /* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
! 680: static inline int32_t
! 681: grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
! 682: {
! 683: int pos = 0, prev_pos = 0;
! 684: int ret_pos = 0, prev_ret_pos = 0;
! 685:
! 686: while ( 1 ) {
! 687: pos = ubrk_next(bi);
! 688:
! 689: if ( UBRK_DONE == pos ) {
! 690: break;
! 691: }
! 692:
! 693: /* if we are beyond our limit, then the loop is done */
! 694: if ( pos > csize ) {
! 695: break;
! 696: }
! 697:
! 698: /* update our pointer in the original UTF-8 buffer by as many characters
! 699: as ubrk_next iterated over */
! 700:
! 701: prev_ret_pos = ret_pos;
! 702: U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
! 703:
! 704: if ( prev_ret_pos == ret_pos ) {
! 705: /* something wrong - malformed utf8? */
! 706: break;
! 707: }
! 708:
! 709: prev_pos = pos;
! 710: }
! 711:
! 712: return ret_pos;
! 713: }
! 714: /* }}} */
! 715:
! 716: /* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
! 717: static inline int32_t
! 718: grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
! 719: {
! 720: int pos = 0, prev_pos = 0;
! 721: int ret_pos = 0, prev_ret_pos = 0;
! 722:
! 723: while ( 1 ) {
! 724: pos = ubrk_next(bi);
! 725:
! 726: if ( UBRK_DONE == pos ) {
! 727: break;
! 728: }
! 729:
! 730: prev_ret_pos = ret_pos;
! 731: U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
! 732:
! 733: if ( ret_pos > bsize ) {
! 734: ret_pos = prev_ret_pos;
! 735: break;
! 736: }
! 737:
! 738: if ( prev_ret_pos == ret_pos ) {
! 739: /* something wrong - malformed utf8? */
! 740: break;
! 741: }
! 742:
! 743: prev_pos = pos;
! 744: }
! 745:
! 746: return ret_pos;
! 747: }
! 748: /* }}} */
! 749:
! 750: /* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
! 751: static inline int32_t
! 752: grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
! 753: {
! 754: int pos = 0, next_pos = 0;
! 755: int ret_pos = 0;
! 756:
! 757: while ( size ) {
! 758: next_pos = ubrk_next(bi);
! 759:
! 760: if ( UBRK_DONE == next_pos ) {
! 761: break;
! 762: }
! 763: pos = next_pos;
! 764: size--;
! 765: }
! 766:
! 767: /* pos is one past the last UChar - and represent the number of code units to
! 768: advance in the utf-8 buffer
! 769: */
! 770:
! 771: U8_FWD_N(pstr, ret_pos, str_len, pos);
! 772:
! 773: return ret_pos;
! 774: }
! 775: /* }}} */
! 776:
! 777: /* {{{ grapheme extract iter function pointer array */
! 778: typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
! 779:
! 780: static grapheme_extract_iter grapheme_extract_iters[] = {
! 781: &grapheme_extract_count_iter,
! 782: &grapheme_extract_bytecount_iter,
! 783: &grapheme_extract_charcount_iter,
! 784: };
! 785: /* }}} */
! 786:
! 787: /* {{{ proto string grapheme_extract(string str, int size[, int extract_type[, int start[, int next]]])
! 788: Function to extract a sequence of default grapheme clusters */
! 789: PHP_FUNCTION(grapheme_extract)
! 790: {
! 791: unsigned char *str, *pstr;
! 792: UChar *ustr;
! 793: int str_len, ustr_len;
! 794: long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
! 795: long lstart = 0; /* starting position in str in bytes */
! 796: int32_t start = 0;
! 797: long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
! 798: UErrorCode status;
! 799: unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
! 800: UBreakIterator* bi = NULL;
! 801: int ret_pos;
! 802: zval *next = NULL; /* return offset of next part of the string */
! 803:
! 804: if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|llz", (char **)&str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
! 805:
! 806: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
! 807: "grapheme_extract: unable to parse input param", 0 TSRMLS_CC );
! 808:
! 809: RETURN_FALSE;
! 810: }
! 811:
! 812: if ( NULL != next ) {
! 813: if ( !PZVAL_IS_REF(next) ) {
! 814: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
! 815: "grapheme_extract: 'next' was not passed by reference", 0 TSRMLS_CC );
! 816:
! 817: RETURN_FALSE;
! 818: }
! 819: else {
! 820: /* initialize next */
! 821: ZVAL_LONG(next, lstart);
! 822: }
! 823: }
! 824:
! 825: if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
! 826:
! 827: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
! 828: "grapheme_extract: unknown extract type param", 0 TSRMLS_CC );
! 829:
! 830: RETURN_FALSE;
! 831: }
! 832:
! 833: if ( lstart > INT32_MAX || lstart < 0 || lstart >= str_len ) {
! 834: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 TSRMLS_CC );
! 835: RETURN_FALSE;
! 836: }
! 837:
! 838: if ( size > INT32_MAX || size < 0) {
! 839: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: size is invalid", 0 TSRMLS_CC );
! 840: RETURN_FALSE;
! 841: }
! 842: if (size == 0) {
! 843: RETURN_EMPTY_STRING();
! 844: }
! 845:
! 846: /* we checked that it will fit: */
! 847: start = (int32_t) lstart;
! 848:
! 849: pstr = str + start;
! 850:
! 851: /* just in case pstr points in the middle of a character, move forward to the start of the next char */
! 852: if ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
! 853: unsigned char *str_end = str + str_len;
! 854:
! 855: while ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
! 856: pstr++;
! 857: if ( pstr >= str_end ) {
! 858: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
! 859: "grapheme_extract: invalid input string", 0 TSRMLS_CC );
! 860:
! 861: RETURN_FALSE;
! 862: }
! 863: }
! 864: }
! 865:
! 866: str_len -= (pstr - str);
! 867:
! 868: /* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
! 869: (size + 1 because the size-th character might be the beginning of a grapheme cluster)
! 870: */
! 871:
! 872: if ( -1 != grapheme_ascii_check(pstr, size + 1 < str_len ? size + 1 : str_len ) ) {
! 873: long nsize = ( size < str_len ? size : str_len );
! 874: if ( NULL != next ) {
! 875: ZVAL_LONG(next, start+nsize);
! 876: }
! 877: RETURN_STRINGL(((char *)pstr), nsize, 1);
! 878: }
! 879:
! 880: /* convert the strings to UTF-16. */
! 881: ustr = NULL;
! 882: ustr_len = 0;
! 883: status = U_ZERO_ERROR;
! 884: intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)pstr, str_len, &status );
! 885:
! 886: if ( U_FAILURE( status ) ) {
! 887: /* Set global error code. */
! 888: intl_error_set_code( NULL, status TSRMLS_CC );
! 889:
! 890: /* Set error messages. */
! 891: intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
! 892:
! 893: if ( NULL != ustr )
! 894: efree( ustr );
! 895:
! 896: RETURN_FALSE;
! 897: }
! 898:
! 899: bi = NULL;
! 900: status = U_ZERO_ERROR;
! 901: bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );
! 902:
! 903: ubrk_setText(bi, ustr, ustr_len, &status);
! 904:
! 905: /* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
! 906: can't back up. So, we will not do anything. */
! 907:
! 908: /* now we need to find the end of the chunk the user wants us to return */
! 909:
! 910: ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, pstr, str_len);
! 911:
! 912: if (ustr) {
! 913: efree(ustr);
! 914: }
! 915: ubrk_close(bi);
! 916:
! 917: if ( NULL != next ) {
! 918: ZVAL_LONG(next, start+ret_pos);
! 919: }
! 920:
! 921: RETURN_STRINGL(((char *)pstr), ret_pos, 1);
! 922: }
! 923:
! 924: /* }}} */
! 925:
! 926: /*
! 927: * Local variables:
! 928: * tab-width: 4
! 929: * c-basic-offset: 4
! 930: * End:
! 931: * vim600: fdm=marker
! 932: * vim: noet sw=4 ts=4
! 933: */
! 934:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>