embedaddon/php/ext/intl/grapheme/grapheme_string.c - annotate

Return to grapheme_string.c CVS log
Up to [ELWIX - Embedded LightWeight unIX -] / embedaddon / php / ext / intl / grapheme
Annotation of embedaddon/php/ext/intl/grapheme/grapheme_string.c, revision 1.1.1.1

1.1       misho       1: /*
                      2:    +----------------------------------------------------------------------+
                      3:    | PHP Version 5                                                                                                               |
                      4:    +----------------------------------------------------------------------+
                      5:    | This source file is subject to version 3.01 of the PHP license,     |
                      6:    | that is bundled with this package in the file LICENSE, and is               |
                      7:    | available through the world-wide-web at the following url:                          |
                      8:    | http://www.php.net/license/3_01.txt                                                                 |
                      9:    | If you did not receive a copy of the PHP license and are unable to   |
                     10:    | obtain it through the world-wide-web, please send a note to                 |
                     11:    | license@php.net so we can mail you a copy immediately.                              |
                     12:    +----------------------------------------------------------------------+
                     13:    | Author: Ed Batutis <ed@batutis.com>                                                                 |
                     14:    +----------------------------------------------------------------------+
                     15:  */
                     16: 
                     17: /* {{{ includes */
                     18: #ifdef HAVE_CONFIG_H
                     19: #include "config.h"
                     20: #endif
                     21: 
                     22: #include <php.h>
                     23: #include "grapheme.h"
                     24: #include "grapheme_util.h"
                     25: 
                     26: #include <unicode/utypes.h>
                     27: #include <unicode/ucol.h>
                     28: #include <unicode/ustring.h>
                     29: #include <unicode/ubrk.h>
                     30: 
                     31: #include "ext/standard/php_string.h"
                     32: 
                     33: /* }}} */
                     34: 
                     35: #define GRAPHEME_EXTRACT_TYPE_COUNT            0
                     36: #define GRAPHEME_EXTRACT_TYPE_MAXBYTES 1
                     37: #define GRAPHEME_EXTRACT_TYPE_MAXCHARS 2
                     38: #define GRAPHEME_EXTRACT_TYPE_MIN      GRAPHEME_EXTRACT_TYPE_COUNT
                     39: #define GRAPHEME_EXTRACT_TYPE_MAX      GRAPHEME_EXTRACT_TYPE_MAXCHARS
                     40: 
                     41: 
                     42: /* {{{ grapheme_register_constants
                     43:  * Register API constants
                     44:  */
                     45: void grapheme_register_constants( INIT_FUNC_ARGS )
                     46: {
                     47:        REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT);
                     48:        REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT);
                     49:        REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT);
                     50: }
                     51: /* }}} */
                     52: 
                     53: /* {{{ proto int grapheme_strlen(string str)
                     54:    Get number of graphemes in a string */
                     55: PHP_FUNCTION(grapheme_strlen)
                     56: {
                     57:        unsigned char* string;
                     58:        int string_len;
                     59:        UChar* ustring = NULL;
                     60:        int ustring_len = 0;
                     61:        int ret_len;
                     62:        UErrorCode status;
                     63: 
                     64:        if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", (char **)&string, &string_len) == FAILURE) {
                     65: 
                     66:                intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
                     67:                         "grapheme_strlen: unable to parse input param", 0 TSRMLS_CC );
                     68: 
                     69:                RETURN_FALSE;
                     70:        }
                     71: 
                     72:        ret_len = grapheme_ascii_check(string, string_len);
                     73: 
                     74:        if ( ret_len >= 0 )
                     75:                RETURN_LONG(ret_len);
                     76: 
                     77:        /* convert the string to UTF-16. */
                     78:        status = U_ZERO_ERROR;
                     79:        intl_convert_utf8_to_utf16(&ustring, &ustring_len, (char*) string, string_len, &status );
                     80: 
                     81:        if ( U_FAILURE( status ) ) {
                     82:                /* Set global error code. */
                     83:                intl_error_set_code( NULL, status TSRMLS_CC );
                     84: 
                     85:                /* Set error messages. */
                     86:                intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
                     87:                if (ustring) {
                     88:                        efree( ustring );
                     89:                }
                     90:                RETURN_NULL();
                     91:        }
                     92: 
                     93:        ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 TSRMLS_CC );
                     94: 
                     95:        if (ustring) {
                     96:                efree( ustring );
                     97:        }
                     98: 
                     99:        if (ret_len >= 0) {
                    100:                RETVAL_LONG(ret_len);
                    101:        } else {
                    102:                RETVAL_FALSE;
                    103:        }
                    104: }
                    105: /* }}} */
                    106: 
                    107: /* {{{ proto int grapheme_strpos(string haystack, string needle [, int offset ])
                    108:    Find position of first occurrence of a string within another */
                    109: PHP_FUNCTION(grapheme_strpos)
                    110: {
                    111:        unsigned char *haystack, *needle;
                    112:        int haystack_len, needle_len;
                    113:        unsigned char *found;
                    114:        long loffset = 0;
                    115:        int32_t offset = 0;
                    116:        int ret_pos, uchar_pos;
                    117: 
                    118:        if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
                    119: 
                    120:                intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
                    121:                         "grapheme_strpos: unable to parse input param", 0 TSRMLS_CC );
                    122: 
                    123:                RETURN_FALSE;
                    124:        }
                    125: 
                    126:        if ( OUTSIDE_STRING(loffset, haystack_len) ) {
                    127: 
                    128:                intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
                    129: 
                    130:                RETURN_FALSE;
                    131:        }
                    132: 
                    133:        /* we checked that it will fit: */
                    134:        offset = (int32_t) loffset;
                    135: 
                    136:        /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
                    137: 
                    138:        if (needle_len == 0) {
                    139: 
                    140:                intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
                    141: 
                    142:                RETURN_FALSE;
                    143:        }
                    144: 
                    145: 
                    146:        /* quick check to see if the string might be there
                    147:         * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
                    148:        */
                    149:        found = (unsigned char *)php_memnstr((char *)haystack + offset, (char *)needle, needle_len, (char *)haystack + haystack_len);
                    150: 
                    151:        /* if it isn't there the we are done */
                    152:        if (!found) {
                    153:                RETURN_FALSE;
                    154:        }
                    155: 
                    156:        /* if it is there, and if the haystack is ascii, we are all done */
                    157:        if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
                    158: 
                    159:                RETURN_LONG(found - haystack);
                    160:        }
                    161: 
                    162:        /* do utf16 part of the strpos */
                    163:        ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, &uchar_pos, 0 /* fIgnoreCase */ TSRMLS_CC );
                    164: 
                    165:        if ( ret_pos >= 0 ) {
                    166:                RETURN_LONG(ret_pos + offset);
                    167:        } else {
                    168:                RETURN_FALSE;
                    169:        }
                    170: 
                    171: }
                    172: /* }}} */
                    173: 
                    174: /* {{{ proto int grapheme_stripos(string haystack, string needle [, int offset ])
                    175:    Find position of first occurrence of a string within another, ignoring case differences */
                    176: PHP_FUNCTION(grapheme_stripos)
                    177: {
                    178:        unsigned char *haystack, *needle, *haystack_dup, *needle_dup;
                    179:        int haystack_len, needle_len;
                    180:        unsigned char *found;
                    181:        long loffset = 0;
                    182:        int32_t offset = 0;
                    183:        int ret_pos, uchar_pos;
                    184:        int is_ascii;
                    185: 
                    186:        if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
                    187: 
                    188:                intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
                    189:                         "grapheme_stripos: unable to parse input param", 0 TSRMLS_CC );
                    190: 
                    191:                RETURN_FALSE;
                    192:        }
                    193: 
                    194:        if ( OUTSIDE_STRING(loffset, haystack_len) ) {
                    195: 
                    196:                intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Offset not contained in string", 1 TSRMLS_CC );
                    197: 
                    198:                RETURN_FALSE;
                    199:        }
                    200: 
                    201:        /* we checked that it will fit: */
                    202:        offset = (int32_t) loffset;
                    203: 
                    204:        /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
                    205: 
                    206:        if (needle_len == 0) {
                    207: 
                    208:                intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Empty delimiter", 1 TSRMLS_CC );
                    209: 
                    210:                RETURN_FALSE;
                    211:        }
                    212: 
                    213: 
                    214:        is_ascii = ( grapheme_ascii_check(haystack, haystack_len) >= 0 );
                    215: 
                    216:        if ( is_ascii ) {
                    217:                needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
                    218:                php_strtolower((char *)needle_dup, needle_len);
                    219:                haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
                    220:                php_strtolower((char *)haystack_dup, haystack_len);
                    221: 
                    222:                found = (unsigned char*) php_memnstr((char *)haystack_dup + offset, (char *)needle_dup, needle_len, (char *)haystack_dup + haystack_len);
                    223: 
                    224:                efree(haystack_dup);
                    225:                efree(needle_dup);
                    226: 
                    227:                if (found) {
                    228:                        RETURN_LONG(found - haystack_dup);
                    229:                }
                    230: 
                    231:                /* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
                    232:                if ( grapheme_ascii_check(needle, needle_len) >= 0 ) {
                    233:                        RETURN_FALSE;
                    234:                }
                    235:        }
                    236: 
                    237:        /* do utf16 part of the strpos */
                    238:        ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, &uchar_pos, 1 /* fIgnoreCase */ TSRMLS_CC );
                    239: 
                    240:        if ( ret_pos >= 0 ) {
                    241:                RETURN_LONG(ret_pos + offset);
                    242:        } else {
                    243:                RETURN_FALSE;
                    244:        }
                    245: 
                    246: }
                    247: /* }}} */
                    248: 
                    249: /* {{{ proto int grapheme_strrpos(string haystack, string needle [, int offset])
                    250:    Find position of last occurrence of a string within another */
                    251: PHP_FUNCTION(grapheme_strrpos)
                    252: {
                    253:        unsigned char *haystack, *needle;
                    254:        int haystack_len, needle_len;
                    255:        long loffset = 0;
                    256:        int32_t offset = 0;
                    257:        int32_t ret_pos;
                    258:        int is_ascii;
                    259: 
                    260:        if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
                    261: 
                    262:                intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
                    263:                         "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC );
                    264: 
                    265:                RETURN_FALSE;
                    266:        }
                    267: 
                    268:        if ( OUTSIDE_STRING(loffset, haystack_len) ) {
                    269: 
                    270:                intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
                    271: 
                    272:                RETURN_FALSE;
                    273:        }
                    274: 
                    275:        /* we checked that it will fit: */
                    276:        offset = (int32_t) loffset;
                    277: 
                    278:        /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
                    279: 
                    280:        if (needle_len == 0) {
                    281: 
                    282:                intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
                    283: 
                    284:                RETURN_FALSE;
                    285:        }
                    286: 
                    287:        is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
                    288: 
                    289:        if ( is_ascii ) {
                    290: 
                    291:                ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
                    292: 
                    293: 
                    294:                if ( ret_pos >= 0 ) {
                    295:                        RETURN_LONG(ret_pos);
                    296:                }
                    297: 
                    298:                /* if the needle was ascii too, we are done */
                    299: 
                    300:                if (  grapheme_ascii_check(needle, needle_len) >= 0 ) {
                    301:                        RETURN_FALSE;
                    302:                }
                    303: 
                    304:                /* else we need to continue via utf16 */
                    305:        }
                    306: 
                    307:        ret_pos = grapheme_strrpos_utf16(haystack, haystack_len, needle, needle_len, offset, 0 /* f_ignore_case */ TSRMLS_CC);
                    308: 
                    309:        if ( ret_pos >= 0 ) {
                    310:                RETURN_LONG(ret_pos);
                    311:        } else {
                    312:                RETURN_FALSE;
                    313:        }
                    314: 
                    315: 
                    316: }
                    317: /* }}} */
                    318: 
                    319: /* {{{ proto int grapheme_strripos(string haystack, string needle [, int offset])
                    320:    Find position of last occurrence of a string within another, ignoring case */
                    321: PHP_FUNCTION(grapheme_strripos)
                    322: {
                    323:        unsigned char *haystack, *needle;
                    324:        int haystack_len, needle_len;
                    325:        long loffset = 0;
                    326:        int32_t offset = 0;
                    327:        int32_t ret_pos;
                    328:        int is_ascii;
                    329: 
                    330:        if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
                    331: 
                    332:                intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
                    333:                         "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC );
                    334: 
                    335:                RETURN_FALSE;
                    336:        }
                    337: 
                    338:        if ( OUTSIDE_STRING(loffset, haystack_len) ) {
                    339: 
                    340:                intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
                    341: 
                    342:                RETURN_FALSE;
                    343:        }
                    344: 
                    345:        /* we checked that it will fit: */
                    346:        offset = (int32_t) loffset;
                    347: 
                    348:        /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
                    349: 
                    350:        if (needle_len == 0) {
                    351: 
                    352:                intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
                    353: 
                    354:                RETURN_FALSE;
                    355:        }
                    356: 
                    357:        is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
                    358: 
                    359:        if ( is_ascii ) {
                    360:                unsigned char *needle_dup, *haystack_dup;
                    361: 
                    362:                needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
                    363:                php_strtolower((char *)needle_dup, needle_len);
                    364:                haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
                    365:                php_strtolower((char *)haystack_dup, haystack_len);
                    366: 
                    367:                ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
                    368: 
                    369:                efree(haystack_dup);
                    370:                efree(needle_dup);
                    371: 
                    372:                if ( ret_pos >= 0 ) {
                    373:                        RETURN_LONG(ret_pos);
                    374:                }
                    375: 
                    376:                /* if the needle was ascii too, we are done */
                    377: 
                    378:                if (  grapheme_ascii_check(needle, needle_len) >= 0 ) {
                    379:                        RETURN_FALSE;
                    380:                }
                    381: 
                    382:                /* else we need to continue via utf16 */
                    383:        }
                    384: 
                    385:        ret_pos = grapheme_strrpos_utf16(haystack, haystack_len, needle, needle_len, offset, 1 /* f_ignore_case */ TSRMLS_CC);
                    386: 
                    387:        if ( ret_pos >= 0 ) {
                    388:                RETURN_LONG(ret_pos);
                    389:        } else {
                    390:                RETURN_FALSE;
                    391:        }
                    392: 
                    393: 
                    394: }
                    395: /* }}} */
                    396: 
                    397: /* {{{ proto string grapheme_substr(string str, int start [, int length])
                    398:    Returns part of a string */
                    399: PHP_FUNCTION(grapheme_substr)
                    400: {
                    401:        unsigned char *str, *sub_str;
                    402:        UChar *ustr;
                    403:        int str_len, sub_str_len, ustr_len;
                    404:        long lstart = 0, length = 0;
                    405:        int32_t start = 0;
                    406:        int iter_val;
                    407:        UErrorCode status;
                    408:        unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
                    409:        UBreakIterator* bi = NULL;
                    410:        int sub_str_start_pos, sub_str_end_pos;
                    411:        int32_t (*iter_func)(UBreakIterator *);
                    412: 
                    413:        if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|l", (char **)&str, &str_len, &lstart, &length) == FAILURE) {
                    414: 
                    415:                intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
                    416:                         "grapheme_substr: unable to parse input param", 0 TSRMLS_CC );
                    417: 
                    418:                RETURN_FALSE;
                    419:        }
                    420: 
                    421:        if ( OUTSIDE_STRING(lstart, str_len) ) {
                    422: 
                    423:                intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
                    424: 
                    425:                RETURN_FALSE;
                    426:        }
                    427: 
                    428:        /* we checked that it will fit: */
                    429:        start = (int32_t) lstart;
                    430: 
                    431:        /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
                    432: 
                    433:        if ( grapheme_ascii_check(str, str_len) >= 0 ) {
                    434:                grapheme_substr_ascii((char *)str, str_len, start, length, ZEND_NUM_ARGS(), (char **) &sub_str, &sub_str_len);
                    435: 
                    436:                if ( NULL == sub_str ) {
                    437:                        RETURN_FALSE;
                    438:                }
                    439: 
                    440:                RETURN_STRINGL(((char *)sub_str), sub_str_len, 1);
                    441:        }
                    442: 
                    443:        ustr = NULL;
                    444:        ustr_len = 0;
                    445:        status = U_ZERO_ERROR;
                    446:        intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)str, str_len, &status);
                    447: 
                    448:        if ( U_FAILURE( status ) ) {
                    449:                /* Set global error code. */
                    450:                intl_error_set_code( NULL, status TSRMLS_CC );
                    451: 
                    452:                /* Set error messages. */
                    453:                intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
                    454:                if (ustr) {
                    455:                        efree( ustr );
                    456:                }
                    457:                RETURN_FALSE;
                    458:        }
                    459: 
                    460:        bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status TSRMLS_CC );
                    461: 
                    462:        if( U_FAILURE(status) ) {
                    463:                RETURN_FALSE;
                    464:        }
                    465: 
                    466:        ubrk_setText(bi, ustr, ustr_len,        &status);
                    467: 
                    468:        if ( start < 0 ) {
                    469:                iter_func = ubrk_previous;
                    470:                ubrk_last(bi);
                    471:                iter_val = 1;
                    472:        }
                    473:        else {
                    474:                iter_func = ubrk_next;
                    475:                iter_val = -1;
                    476:        }
                    477: 
                    478:        sub_str_start_pos = 0;
                    479: 
                    480:        while ( start ) {
                    481:                sub_str_start_pos = iter_func(bi);
                    482: 
                    483:                if ( UBRK_DONE == sub_str_start_pos ) {
                    484:                        break;
                    485:                }
                    486: 
                    487:                start += iter_val;
                    488:        }
                    489: 
                    490:        if ( 0 != start || sub_str_start_pos >= ustr_len ) {
                    491: 
                    492:                intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
                    493: 
                    494:                if (ustr) {
                    495:                        efree(ustr);
                    496:                }
                    497:                ubrk_close(bi);
                    498:                RETURN_FALSE;
                    499:        }
                    500: 
                    501:        if (ZEND_NUM_ARGS() <= 2) {
                    502: 
                    503:                /* no length supplied, return the rest of the string */
                    504: 
                    505:                sub_str = NULL;
                    506:                sub_str_len = 0;
                    507:                status = U_ZERO_ERROR;
                    508:                intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
                    509: 
                    510:                if (ustr) {
                    511:                        efree( ustr );
                    512:                }
                    513:                ubrk_close( bi );
                    514: 
                    515:                if ( U_FAILURE( status ) ) {
                    516:                        /* Set global error code. */
                    517:                        intl_error_set_code( NULL, status TSRMLS_CC );
                    518: 
                    519:                        /* Set error messages. */
                    520:                        intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC );
                    521: 
                    522:                        if (sub_str) {
                    523:                                efree( sub_str );
                    524:                        }
                    525: 
                    526:                        RETURN_FALSE;
                    527:                }
                    528: 
                    529:                /* return the allocated string, not a duplicate */
                    530:                RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
                    531:        }
                    532: 
                    533:        /* find the end point of the string to return */
                    534: 
                    535:        if ( length < 0 ) {
                    536:                iter_func = ubrk_previous;
                    537:                ubrk_last(bi);
                    538:                iter_val = 1;
                    539:        }
                    540:        else {
                    541:                iter_func = ubrk_next;
                    542:                iter_val = -1;
                    543:        }
                    544: 
                    545:        sub_str_end_pos = 0;
                    546: 
                    547:        while ( length ) {
                    548:                sub_str_end_pos = iter_func(bi);
                    549: 
                    550:                if ( UBRK_DONE == sub_str_end_pos ) {
                    551:                        break;
                    552:                }
                    553: 
                    554:                length += iter_val;
                    555:        }
                    556: 
                    557:        if ( UBRK_DONE == sub_str_end_pos && length < 0) {
                    558: 
                    559:                intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length not contained in string", 1 TSRMLS_CC );
                    560: 
                    561:                efree(ustr);
                    562:                ubrk_close(bi);
                    563:                RETURN_FALSE;
                    564:        }
                    565: 
                    566:        sub_str = NULL;
                    567:        status = U_ZERO_ERROR;
                    568:        intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
                    569: 
                    570:        efree( ustr );
                    571:        ubrk_close( bi );
                    572: 
                    573:        if ( U_FAILURE( status ) ) {
                    574:                /* Set global error code. */
                    575:                intl_error_set_code( NULL, status TSRMLS_CC );
                    576: 
                    577:                /* Set error messages. */
                    578:                intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC );
                    579: 
                    580:                if ( NULL != sub_str )
                    581:                        efree( sub_str );
                    582: 
                    583:                RETURN_FALSE;
                    584:        }
                    585: 
                    586:         /* return the allocated string, not a duplicate */
                    587:        RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
                    588: 
                    589: }
                    590: /* }}} */
                    591: 
                    592: /* {{{ strstr_common_handler */
                    593: static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
                    594: {
                    595:        unsigned char *haystack, *needle, *found;
                    596:        int haystack_len, needle_len;
                    597:        int ret_pos, uchar_pos;
                    598:        zend_bool part = 0;
                    599: 
                    600:        if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|b", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &part) == FAILURE) {
                    601: 
                    602:                intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
                    603:                         "grapheme_strstr: unable to parse input param", 0 TSRMLS_CC );
                    604: 
                    605:                RETURN_FALSE;
                    606:        }
                    607: 
                    608:        if (needle_len == 0) {
                    609: 
                    610:                intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
                    611: 
                    612:                RETURN_FALSE;
                    613:        }
                    614: 
                    615: 
                    616:        if ( !f_ignore_case ) {
                    617: 
                    618:                /* ASCII optimization: quick check to see if the string might be there
                    619:                 * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
                    620:                */
                    621:                found = (unsigned char *)php_memnstr((char *)haystack, (char *)needle, needle_len, (char *)haystack + haystack_len);
                    622: 
                    623:                /* if it isn't there the we are done */
                    624:                if ( !found ) {
                    625:                        RETURN_FALSE;
                    626:                }
                    627: 
                    628:                /* if it is there, and if the haystack is ascii, we are all done */
                    629:                if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
                    630:                        size_t found_offset = found - haystack;
                    631: 
                    632:                        if (part) {
                    633:                                RETURN_STRINGL(((char *)haystack) , found_offset, 1);
                    634:                        } else {
                    635:                                RETURN_STRINGL(((char *)found), haystack_len - found_offset, 1);
                    636:                        }
                    637:                }
                    638: 
                    639:        }
                    640: 
                    641:        /* need to work in utf16 */
                    642:        ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case TSRMLS_CC );
                    643: 
                    644:        if ( ret_pos < 0 ) {
                    645:                RETURN_FALSE;
                    646:        }
                    647: 
                    648:        /* uchar_pos is the 'nth' Unicode character position of the needle */
                    649: 
                    650:        ret_pos = 0;
                    651:        U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos);
                    652: 
                    653:        if (part) {
                    654:                RETURN_STRINGL(((char *)haystack), ret_pos, 1);
                    655:        }
                    656:        else {
                    657:                RETURN_STRINGL(((char *)haystack) + ret_pos, haystack_len - ret_pos, 1);
                    658:        }
                    659: 
                    660: }
                    661: /* }}} */
                    662: 
                    663: /* {{{ proto string grapheme_strstr(string haystack, string needle[, bool part])
                    664:    Finds first occurrence of a string within another */
                    665: PHP_FUNCTION(grapheme_strstr)
                    666: {
                    667:        strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
                    668: }
                    669: /* }}} */
                    670: 
                    671: /* {{{ proto string grapheme_stristr(string haystack, string needle[, bool part])
                    672:    Finds first occurrence of a string within another, ignoring case */
                    673: PHP_FUNCTION(grapheme_stristr)
                    674: {
                    675:        strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
                    676: }
                    677: /* }}} */
                    678: 
                    679: /* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
                    680: static inline int32_t
                    681: grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
                    682: {
                    683:        int pos = 0, prev_pos = 0;
                    684:        int ret_pos = 0, prev_ret_pos = 0;
                    685: 
                    686:        while ( 1 ) {
                    687:                pos = ubrk_next(bi);
                    688: 
                    689:                if ( UBRK_DONE == pos ) {
                    690:                        break;
                    691:                }
                    692: 
                    693:                /* if we are beyond our limit, then the loop is done */
                    694:                if ( pos > csize ) {
                    695:                        break;
                    696:                }
                    697: 
                    698:                /* update our pointer in the original UTF-8 buffer by as many characters
                    699:                   as ubrk_next iterated over */
                    700: 
                    701:                prev_ret_pos = ret_pos;
                    702:                U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
                    703: 
                    704:                if ( prev_ret_pos == ret_pos ) {
                    705:                        /* something wrong - malformed utf8? */
                    706:                        break;
                    707:                }
                    708: 
                    709:                prev_pos = pos;
                    710:        }
                    711: 
                    712:        return ret_pos;
                    713: }
                    714: /* }}} */
                    715: 
                    716: /* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
                    717: static inline int32_t
                    718: grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
                    719: {
                    720:        int pos = 0, prev_pos = 0;
                    721:        int ret_pos = 0, prev_ret_pos = 0;
                    722: 
                    723:        while ( 1 ) {
                    724:                pos = ubrk_next(bi);
                    725: 
                    726:                if ( UBRK_DONE == pos ) {
                    727:                        break;
                    728:                }
                    729: 
                    730:                prev_ret_pos = ret_pos;
                    731:                U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
                    732: 
                    733:                if ( ret_pos > bsize ) {
                    734:                        ret_pos = prev_ret_pos;
                    735:                        break;
                    736:                }
                    737: 
                    738:                if ( prev_ret_pos == ret_pos ) {
                    739:                        /* something wrong - malformed utf8? */
                    740:                        break;
                    741:                }
                    742: 
                    743:                prev_pos = pos;
                    744:        }
                    745: 
                    746:        return ret_pos;
                    747: }
                    748: /* }}} */
                    749: 
                    750: /* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
                    751: static inline int32_t
                    752: grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
                    753: {
                    754:        int pos = 0, next_pos = 0;
                    755:        int ret_pos = 0;
                    756: 
                    757:        while ( size ) {
                    758:                next_pos = ubrk_next(bi);
                    759: 
                    760:                if ( UBRK_DONE == next_pos ) {
                    761:                        break;
                    762:                }
                    763:                pos = next_pos;
                    764:                size--;
                    765:        }
                    766: 
                    767:        /* pos is one past the last UChar - and represent the number of code units to
                    768:                advance in the utf-8 buffer
                    769:        */
                    770: 
                    771:        U8_FWD_N(pstr, ret_pos, str_len, pos);
                    772: 
                    773:        return ret_pos;
                    774: }
                    775: /* }}} */
                    776: 
                    777: /* {{{ grapheme extract iter function pointer array */
                    778: typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
                    779: 
                    780: static grapheme_extract_iter grapheme_extract_iters[] = {
                    781:        &grapheme_extract_count_iter,
                    782:        &grapheme_extract_bytecount_iter,
                    783:        &grapheme_extract_charcount_iter,
                    784: };
                    785: /* }}} */
                    786: 
                    787: /* {{{ proto string grapheme_extract(string str, int size[, int extract_type[, int start[, int &next]]])
                    788:        Function to extract a sequence of default grapheme clusters */
                    789: PHP_FUNCTION(grapheme_extract)
                    790: {
                    791:        unsigned char *str, *pstr;
                    792:        UChar *ustr;
                    793:        int str_len, ustr_len;
                    794:        long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
                    795:        long lstart = 0; /* starting position in str in bytes */
                    796:        int32_t start = 0;
                    797:        long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
                    798:        UErrorCode status;
                    799:        unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
                    800:        UBreakIterator* bi = NULL;
                    801:        int ret_pos;
                    802:        zval *next = NULL; /* return offset of next part of the string */
                    803: 
                    804:        if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|llz", (char **)&str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
                    805: 
                    806:                intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
                    807:                         "grapheme_extract: unable to parse input param", 0 TSRMLS_CC );
                    808: 
                    809:                RETURN_FALSE;
                    810:        }
                    811: 
                    812:        if ( NULL != next ) {
                    813:                if ( !PZVAL_IS_REF(next) ) {
                    814:                        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
                    815:                                 "grapheme_extract: 'next' was not passed by reference", 0 TSRMLS_CC );
                    816: 
                    817:                        RETURN_FALSE;
                    818:                }
                    819:                else {
                    820:                        /* initialize next */
                    821:             ZVAL_LONG(next, lstart);
                    822:                }
                    823:        }
                    824: 
                    825:        if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
                    826: 
                    827:                intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
                    828:                         "grapheme_extract: unknown extract type param", 0 TSRMLS_CC );
                    829: 
                    830:                RETURN_FALSE;
                    831:        }
                    832: 
                    833:        if ( lstart > INT32_MAX || lstart < 0 || lstart >= str_len ) {
                    834:                intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 TSRMLS_CC );
                    835:                RETURN_FALSE;
                    836:        }
                    837: 
                    838:        if ( size > INT32_MAX || size < 0) {
                    839:                intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: size is invalid", 0 TSRMLS_CC );
                    840:                RETURN_FALSE;
                    841:        }
                    842:        if (size == 0) {
                    843:                RETURN_EMPTY_STRING();
                    844:        }
                    845: 
                    846:        /* we checked that it will fit: */
                    847:        start = (int32_t) lstart;
                    848: 
                    849:        pstr = str + start;
                    850: 
                    851:        /* just in case pstr points in the middle of a character, move forward to the start of the next char */
                    852:        if ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
                    853:                unsigned char *str_end = str + str_len;
                    854: 
                    855:                while ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
                    856:                        pstr++;
                    857:                        if ( pstr >= str_end ) {
                    858:                                intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
                    859:                                                                "grapheme_extract: invalid input string", 0 TSRMLS_CC );
                    860: 
                    861:                                RETURN_FALSE;
                    862:                        }
                    863:                }
                    864:        }
                    865: 
                    866:        str_len -= (pstr - str);
                    867: 
                    868:        /* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
                    869:                (size + 1 because the size-th character might be the beginning of a grapheme cluster)
                    870:         */
                    871: 
                    872:        if ( -1 != grapheme_ascii_check(pstr, size + 1 < str_len ? size + 1 : str_len ) ) {
                    873:         long nsize = ( size < str_len ? size : str_len );
                    874:                if ( NULL != next ) {
                    875:                        ZVAL_LONG(next, start+nsize);
                    876:                }
                    877:                RETURN_STRINGL(((char *)pstr), nsize, 1);
                    878:        }
                    879: 
                    880:        /* convert the strings to UTF-16. */
                    881:        ustr = NULL;
                    882:        ustr_len = 0;
                    883:        status = U_ZERO_ERROR;
                    884:        intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)pstr, str_len, &status );
                    885: 
                    886:        if ( U_FAILURE( status ) ) {
                    887:                /* Set global error code. */
                    888:                intl_error_set_code( NULL, status TSRMLS_CC );
                    889: 
                    890:                /* Set error messages. */
                    891:                intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
                    892: 
                    893:                if ( NULL != ustr )
                    894:                        efree( ustr );
                    895: 
                    896:                RETURN_FALSE;
                    897:        }
                    898: 
                    899:        bi = NULL;
                    900:        status = U_ZERO_ERROR;
                    901:        bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );
                    902: 
                    903:        ubrk_setText(bi, ustr, ustr_len, &status);
                    904: 
                    905:        /* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
                    906:                can't back up. So, we will not do anything. */
                    907: 
                    908:        /* now we need to find the end of the chunk the user wants us to return */
                    909: 
                    910:        ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, pstr, str_len);
                    911: 
                    912:        if (ustr) {
                    913:                efree(ustr);
                    914:        }
                    915:        ubrk_close(bi);
                    916: 
                    917:        if ( NULL != next ) {
                    918:                ZVAL_LONG(next, start+ret_pos);
                    919:        }
                    920: 
                    921:        RETURN_STRINGL(((char *)pstr), ret_pos, 1);
                    922: }
                    923: 
                    924: /* }}} */
                    925: 
                    926: /*
                    927:  * Local variables:
                    928:  * tab-width: 4
                    929:  * c-basic-offset: 4
                    930:  * End:
                    931:  * vim600: fdm=marker
                    932:  * vim: noet sw=4 ts=4
                    933:  */
                    934:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>