Annotation of embedaddon/php/ext/intl/grapheme/grapheme_string.c, revision 1.1.1.1
1.1 misho 1: /*
2: +----------------------------------------------------------------------+
3: | PHP Version 5 |
4: +----------------------------------------------------------------------+
5: | This source file is subject to version 3.01 of the PHP license, |
6: | that is bundled with this package in the file LICENSE, and is |
7: | available through the world-wide-web at the following url: |
8: | http://www.php.net/license/3_01.txt |
9: | If you did not receive a copy of the PHP license and are unable to |
10: | obtain it through the world-wide-web, please send a note to |
11: | license@php.net so we can mail you a copy immediately. |
12: +----------------------------------------------------------------------+
13: | Author: Ed Batutis <ed@batutis.com> |
14: +----------------------------------------------------------------------+
15: */
16:
17: /* {{{ includes */
18: #ifdef HAVE_CONFIG_H
19: #include "config.h"
20: #endif
21:
22: #include <php.h>
23: #include "grapheme.h"
24: #include "grapheme_util.h"
25:
26: #include <unicode/utypes.h>
27: #include <unicode/ucol.h>
28: #include <unicode/ustring.h>
29: #include <unicode/ubrk.h>
30:
31: #include "ext/standard/php_string.h"
32:
33: /* }}} */
34:
35: #define GRAPHEME_EXTRACT_TYPE_COUNT 0
36: #define GRAPHEME_EXTRACT_TYPE_MAXBYTES 1
37: #define GRAPHEME_EXTRACT_TYPE_MAXCHARS 2
38: #define GRAPHEME_EXTRACT_TYPE_MIN GRAPHEME_EXTRACT_TYPE_COUNT
39: #define GRAPHEME_EXTRACT_TYPE_MAX GRAPHEME_EXTRACT_TYPE_MAXCHARS
40:
41:
42: /* {{{ grapheme_register_constants
43: * Register API constants
44: */
45: void grapheme_register_constants( INIT_FUNC_ARGS )
46: {
47: REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT);
48: REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT);
49: REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT);
50: }
51: /* }}} */
52:
53: /* {{{ proto int grapheme_strlen(string str)
54: Get number of graphemes in a string */
55: PHP_FUNCTION(grapheme_strlen)
56: {
57: unsigned char* string;
58: int string_len;
59: UChar* ustring = NULL;
60: int ustring_len = 0;
61: int ret_len;
62: UErrorCode status;
63:
64: if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", (char **)&string, &string_len) == FAILURE) {
65:
66: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
67: "grapheme_strlen: unable to parse input param", 0 TSRMLS_CC );
68:
69: RETURN_FALSE;
70: }
71:
72: ret_len = grapheme_ascii_check(string, string_len);
73:
74: if ( ret_len >= 0 )
75: RETURN_LONG(ret_len);
76:
77: /* convert the string to UTF-16. */
78: status = U_ZERO_ERROR;
79: intl_convert_utf8_to_utf16(&ustring, &ustring_len, (char*) string, string_len, &status );
80:
81: if ( U_FAILURE( status ) ) {
82: /* Set global error code. */
83: intl_error_set_code( NULL, status TSRMLS_CC );
84:
85: /* Set error messages. */
86: intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
87: if (ustring) {
88: efree( ustring );
89: }
90: RETURN_NULL();
91: }
92:
93: ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 TSRMLS_CC );
94:
95: if (ustring) {
96: efree( ustring );
97: }
98:
99: if (ret_len >= 0) {
100: RETVAL_LONG(ret_len);
101: } else {
102: RETVAL_FALSE;
103: }
104: }
105: /* }}} */
106:
107: /* {{{ proto int grapheme_strpos(string haystack, string needle [, int offset ])
108: Find position of first occurrence of a string within another */
109: PHP_FUNCTION(grapheme_strpos)
110: {
111: unsigned char *haystack, *needle;
112: int haystack_len, needle_len;
113: unsigned char *found;
114: long loffset = 0;
115: int32_t offset = 0;
116: int ret_pos, uchar_pos;
117:
118: if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
119:
120: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
121: "grapheme_strpos: unable to parse input param", 0 TSRMLS_CC );
122:
123: RETURN_FALSE;
124: }
125:
126: if ( OUTSIDE_STRING(loffset, haystack_len) ) {
127:
128: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
129:
130: RETURN_FALSE;
131: }
132:
133: /* we checked that it will fit: */
134: offset = (int32_t) loffset;
135:
136: /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
137:
138: if (needle_len == 0) {
139:
140: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
141:
142: RETURN_FALSE;
143: }
144:
145:
146: /* quick check to see if the string might be there
147: * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
148: */
149: found = (unsigned char *)php_memnstr((char *)haystack + offset, (char *)needle, needle_len, (char *)haystack + haystack_len);
150:
151: /* if it isn't there the we are done */
152: if (!found) {
153: RETURN_FALSE;
154: }
155:
156: /* if it is there, and if the haystack is ascii, we are all done */
157: if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
158:
159: RETURN_LONG(found - haystack);
160: }
161:
162: /* do utf16 part of the strpos */
163: ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, &uchar_pos, 0 /* fIgnoreCase */ TSRMLS_CC );
164:
165: if ( ret_pos >= 0 ) {
166: RETURN_LONG(ret_pos + offset);
167: } else {
168: RETURN_FALSE;
169: }
170:
171: }
172: /* }}} */
173:
174: /* {{{ proto int grapheme_stripos(string haystack, string needle [, int offset ])
175: Find position of first occurrence of a string within another, ignoring case differences */
176: PHP_FUNCTION(grapheme_stripos)
177: {
178: unsigned char *haystack, *needle, *haystack_dup, *needle_dup;
179: int haystack_len, needle_len;
180: unsigned char *found;
181: long loffset = 0;
182: int32_t offset = 0;
183: int ret_pos, uchar_pos;
184: int is_ascii;
185:
186: if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
187:
188: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
189: "grapheme_stripos: unable to parse input param", 0 TSRMLS_CC );
190:
191: RETURN_FALSE;
192: }
193:
194: if ( OUTSIDE_STRING(loffset, haystack_len) ) {
195:
196: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Offset not contained in string", 1 TSRMLS_CC );
197:
198: RETURN_FALSE;
199: }
200:
201: /* we checked that it will fit: */
202: offset = (int32_t) loffset;
203:
204: /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
205:
206: if (needle_len == 0) {
207:
208: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Empty delimiter", 1 TSRMLS_CC );
209:
210: RETURN_FALSE;
211: }
212:
213:
214: is_ascii = ( grapheme_ascii_check(haystack, haystack_len) >= 0 );
215:
216: if ( is_ascii ) {
217: needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
218: php_strtolower((char *)needle_dup, needle_len);
219: haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
220: php_strtolower((char *)haystack_dup, haystack_len);
221:
222: found = (unsigned char*) php_memnstr((char *)haystack_dup + offset, (char *)needle_dup, needle_len, (char *)haystack_dup + haystack_len);
223:
224: efree(haystack_dup);
225: efree(needle_dup);
226:
227: if (found) {
228: RETURN_LONG(found - haystack_dup);
229: }
230:
231: /* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
232: if ( grapheme_ascii_check(needle, needle_len) >= 0 ) {
233: RETURN_FALSE;
234: }
235: }
236:
237: /* do utf16 part of the strpos */
238: ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, &uchar_pos, 1 /* fIgnoreCase */ TSRMLS_CC );
239:
240: if ( ret_pos >= 0 ) {
241: RETURN_LONG(ret_pos + offset);
242: } else {
243: RETURN_FALSE;
244: }
245:
246: }
247: /* }}} */
248:
249: /* {{{ proto int grapheme_strrpos(string haystack, string needle [, int offset])
250: Find position of last occurrence of a string within another */
251: PHP_FUNCTION(grapheme_strrpos)
252: {
253: unsigned char *haystack, *needle;
254: int haystack_len, needle_len;
255: long loffset = 0;
256: int32_t offset = 0;
257: int32_t ret_pos;
258: int is_ascii;
259:
260: if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
261:
262: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
263: "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC );
264:
265: RETURN_FALSE;
266: }
267:
268: if ( OUTSIDE_STRING(loffset, haystack_len) ) {
269:
270: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
271:
272: RETURN_FALSE;
273: }
274:
275: /* we checked that it will fit: */
276: offset = (int32_t) loffset;
277:
278: /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
279:
280: if (needle_len == 0) {
281:
282: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
283:
284: RETURN_FALSE;
285: }
286:
287: is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
288:
289: if ( is_ascii ) {
290:
291: ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
292:
293:
294: if ( ret_pos >= 0 ) {
295: RETURN_LONG(ret_pos);
296: }
297:
298: /* if the needle was ascii too, we are done */
299:
300: if ( grapheme_ascii_check(needle, needle_len) >= 0 ) {
301: RETURN_FALSE;
302: }
303:
304: /* else we need to continue via utf16 */
305: }
306:
307: ret_pos = grapheme_strrpos_utf16(haystack, haystack_len, needle, needle_len, offset, 0 /* f_ignore_case */ TSRMLS_CC);
308:
309: if ( ret_pos >= 0 ) {
310: RETURN_LONG(ret_pos);
311: } else {
312: RETURN_FALSE;
313: }
314:
315:
316: }
317: /* }}} */
318:
319: /* {{{ proto int grapheme_strripos(string haystack, string needle [, int offset])
320: Find position of last occurrence of a string within another, ignoring case */
321: PHP_FUNCTION(grapheme_strripos)
322: {
323: unsigned char *haystack, *needle;
324: int haystack_len, needle_len;
325: long loffset = 0;
326: int32_t offset = 0;
327: int32_t ret_pos;
328: int is_ascii;
329:
330: if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
331:
332: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
333: "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC );
334:
335: RETURN_FALSE;
336: }
337:
338: if ( OUTSIDE_STRING(loffset, haystack_len) ) {
339:
340: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
341:
342: RETURN_FALSE;
343: }
344:
345: /* we checked that it will fit: */
346: offset = (int32_t) loffset;
347:
348: /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
349:
350: if (needle_len == 0) {
351:
352: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
353:
354: RETURN_FALSE;
355: }
356:
357: is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
358:
359: if ( is_ascii ) {
360: unsigned char *needle_dup, *haystack_dup;
361:
362: needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
363: php_strtolower((char *)needle_dup, needle_len);
364: haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
365: php_strtolower((char *)haystack_dup, haystack_len);
366:
367: ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
368:
369: efree(haystack_dup);
370: efree(needle_dup);
371:
372: if ( ret_pos >= 0 ) {
373: RETURN_LONG(ret_pos);
374: }
375:
376: /* if the needle was ascii too, we are done */
377:
378: if ( grapheme_ascii_check(needle, needle_len) >= 0 ) {
379: RETURN_FALSE;
380: }
381:
382: /* else we need to continue via utf16 */
383: }
384:
385: ret_pos = grapheme_strrpos_utf16(haystack, haystack_len, needle, needle_len, offset, 1 /* f_ignore_case */ TSRMLS_CC);
386:
387: if ( ret_pos >= 0 ) {
388: RETURN_LONG(ret_pos);
389: } else {
390: RETURN_FALSE;
391: }
392:
393:
394: }
395: /* }}} */
396:
397: /* {{{ proto string grapheme_substr(string str, int start [, int length])
398: Returns part of a string */
399: PHP_FUNCTION(grapheme_substr)
400: {
401: unsigned char *str, *sub_str;
402: UChar *ustr;
403: int str_len, sub_str_len, ustr_len;
404: long lstart = 0, length = 0;
405: int32_t start = 0;
406: int iter_val;
407: UErrorCode status;
408: unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
409: UBreakIterator* bi = NULL;
410: int sub_str_start_pos, sub_str_end_pos;
411: int32_t (*iter_func)(UBreakIterator *);
412:
413: if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|l", (char **)&str, &str_len, &lstart, &length) == FAILURE) {
414:
415: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
416: "grapheme_substr: unable to parse input param", 0 TSRMLS_CC );
417:
418: RETURN_FALSE;
419: }
420:
421: if ( OUTSIDE_STRING(lstart, str_len) ) {
422:
423: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
424:
425: RETURN_FALSE;
426: }
427:
428: /* we checked that it will fit: */
429: start = (int32_t) lstart;
430:
431: /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
432:
433: if ( grapheme_ascii_check(str, str_len) >= 0 ) {
434: grapheme_substr_ascii((char *)str, str_len, start, length, ZEND_NUM_ARGS(), (char **) &sub_str, &sub_str_len);
435:
436: if ( NULL == sub_str ) {
437: RETURN_FALSE;
438: }
439:
440: RETURN_STRINGL(((char *)sub_str), sub_str_len, 1);
441: }
442:
443: ustr = NULL;
444: ustr_len = 0;
445: status = U_ZERO_ERROR;
446: intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)str, str_len, &status);
447:
448: if ( U_FAILURE( status ) ) {
449: /* Set global error code. */
450: intl_error_set_code( NULL, status TSRMLS_CC );
451:
452: /* Set error messages. */
453: intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
454: if (ustr) {
455: efree( ustr );
456: }
457: RETURN_FALSE;
458: }
459:
460: bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status TSRMLS_CC );
461:
462: if( U_FAILURE(status) ) {
463: RETURN_FALSE;
464: }
465:
466: ubrk_setText(bi, ustr, ustr_len, &status);
467:
468: if ( start < 0 ) {
469: iter_func = ubrk_previous;
470: ubrk_last(bi);
471: iter_val = 1;
472: }
473: else {
474: iter_func = ubrk_next;
475: iter_val = -1;
476: }
477:
478: sub_str_start_pos = 0;
479:
480: while ( start ) {
481: sub_str_start_pos = iter_func(bi);
482:
483: if ( UBRK_DONE == sub_str_start_pos ) {
484: break;
485: }
486:
487: start += iter_val;
488: }
489:
490: if ( 0 != start || sub_str_start_pos >= ustr_len ) {
491:
492: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
493:
494: if (ustr) {
495: efree(ustr);
496: }
497: ubrk_close(bi);
498: RETURN_FALSE;
499: }
500:
501: if (ZEND_NUM_ARGS() <= 2) {
502:
503: /* no length supplied, return the rest of the string */
504:
505: sub_str = NULL;
506: sub_str_len = 0;
507: status = U_ZERO_ERROR;
508: intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
509:
510: if (ustr) {
511: efree( ustr );
512: }
513: ubrk_close( bi );
514:
515: if ( U_FAILURE( status ) ) {
516: /* Set global error code. */
517: intl_error_set_code( NULL, status TSRMLS_CC );
518:
519: /* Set error messages. */
520: intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC );
521:
522: if (sub_str) {
523: efree( sub_str );
524: }
525:
526: RETURN_FALSE;
527: }
528:
529: /* return the allocated string, not a duplicate */
530: RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
531: }
532:
533: /* find the end point of the string to return */
534:
535: if ( length < 0 ) {
536: iter_func = ubrk_previous;
537: ubrk_last(bi);
538: iter_val = 1;
539: }
540: else {
541: iter_func = ubrk_next;
542: iter_val = -1;
543: }
544:
545: sub_str_end_pos = 0;
546:
547: while ( length ) {
548: sub_str_end_pos = iter_func(bi);
549:
550: if ( UBRK_DONE == sub_str_end_pos ) {
551: break;
552: }
553:
554: length += iter_val;
555: }
556:
557: if ( UBRK_DONE == sub_str_end_pos && length < 0) {
558:
559: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length not contained in string", 1 TSRMLS_CC );
560:
561: efree(ustr);
562: ubrk_close(bi);
563: RETURN_FALSE;
564: }
565:
566: sub_str = NULL;
567: status = U_ZERO_ERROR;
568: intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
569:
570: efree( ustr );
571: ubrk_close( bi );
572:
573: if ( U_FAILURE( status ) ) {
574: /* Set global error code. */
575: intl_error_set_code( NULL, status TSRMLS_CC );
576:
577: /* Set error messages. */
578: intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC );
579:
580: if ( NULL != sub_str )
581: efree( sub_str );
582:
583: RETURN_FALSE;
584: }
585:
586: /* return the allocated string, not a duplicate */
587: RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
588:
589: }
590: /* }}} */
591:
592: /* {{{ strstr_common_handler */
593: static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
594: {
595: unsigned char *haystack, *needle, *found;
596: int haystack_len, needle_len;
597: int ret_pos, uchar_pos;
598: zend_bool part = 0;
599:
600: if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|b", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &part) == FAILURE) {
601:
602: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
603: "grapheme_strstr: unable to parse input param", 0 TSRMLS_CC );
604:
605: RETURN_FALSE;
606: }
607:
608: if (needle_len == 0) {
609:
610: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
611:
612: RETURN_FALSE;
613: }
614:
615:
616: if ( !f_ignore_case ) {
617:
618: /* ASCII optimization: quick check to see if the string might be there
619: * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
620: */
621: found = (unsigned char *)php_memnstr((char *)haystack, (char *)needle, needle_len, (char *)haystack + haystack_len);
622:
623: /* if it isn't there the we are done */
624: if ( !found ) {
625: RETURN_FALSE;
626: }
627:
628: /* if it is there, and if the haystack is ascii, we are all done */
629: if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
630: size_t found_offset = found - haystack;
631:
632: if (part) {
633: RETURN_STRINGL(((char *)haystack) , found_offset, 1);
634: } else {
635: RETURN_STRINGL(((char *)found), haystack_len - found_offset, 1);
636: }
637: }
638:
639: }
640:
641: /* need to work in utf16 */
642: ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case TSRMLS_CC );
643:
644: if ( ret_pos < 0 ) {
645: RETURN_FALSE;
646: }
647:
648: /* uchar_pos is the 'nth' Unicode character position of the needle */
649:
650: ret_pos = 0;
651: U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos);
652:
653: if (part) {
654: RETURN_STRINGL(((char *)haystack), ret_pos, 1);
655: }
656: else {
657: RETURN_STRINGL(((char *)haystack) + ret_pos, haystack_len - ret_pos, 1);
658: }
659:
660: }
661: /* }}} */
662:
663: /* {{{ proto string grapheme_strstr(string haystack, string needle[, bool part])
664: Finds first occurrence of a string within another */
665: PHP_FUNCTION(grapheme_strstr)
666: {
667: strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
668: }
669: /* }}} */
670:
671: /* {{{ proto string grapheme_stristr(string haystack, string needle[, bool part])
672: Finds first occurrence of a string within another */
673: PHP_FUNCTION(grapheme_stristr)
674: {
675: strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
676: }
677: /* }}} */
678:
679: /* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
680: static inline int32_t
681: grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
682: {
683: int pos = 0, prev_pos = 0;
684: int ret_pos = 0, prev_ret_pos = 0;
685:
686: while ( 1 ) {
687: pos = ubrk_next(bi);
688:
689: if ( UBRK_DONE == pos ) {
690: break;
691: }
692:
693: /* if we are beyond our limit, then the loop is done */
694: if ( pos > csize ) {
695: break;
696: }
697:
698: /* update our pointer in the original UTF-8 buffer by as many characters
699: as ubrk_next iterated over */
700:
701: prev_ret_pos = ret_pos;
702: U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
703:
704: if ( prev_ret_pos == ret_pos ) {
705: /* something wrong - malformed utf8? */
706: break;
707: }
708:
709: prev_pos = pos;
710: }
711:
712: return ret_pos;
713: }
714: /* }}} */
715:
716: /* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
717: static inline int32_t
718: grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
719: {
720: int pos = 0, prev_pos = 0;
721: int ret_pos = 0, prev_ret_pos = 0;
722:
723: while ( 1 ) {
724: pos = ubrk_next(bi);
725:
726: if ( UBRK_DONE == pos ) {
727: break;
728: }
729:
730: prev_ret_pos = ret_pos;
731: U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
732:
733: if ( ret_pos > bsize ) {
734: ret_pos = prev_ret_pos;
735: break;
736: }
737:
738: if ( prev_ret_pos == ret_pos ) {
739: /* something wrong - malformed utf8? */
740: break;
741: }
742:
743: prev_pos = pos;
744: }
745:
746: return ret_pos;
747: }
748: /* }}} */
749:
750: /* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
751: static inline int32_t
752: grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
753: {
754: int pos = 0, next_pos = 0;
755: int ret_pos = 0;
756:
757: while ( size ) {
758: next_pos = ubrk_next(bi);
759:
760: if ( UBRK_DONE == next_pos ) {
761: break;
762: }
763: pos = next_pos;
764: size--;
765: }
766:
767: /* pos is one past the last UChar - and represent the number of code units to
768: advance in the utf-8 buffer
769: */
770:
771: U8_FWD_N(pstr, ret_pos, str_len, pos);
772:
773: return ret_pos;
774: }
775: /* }}} */
776:
777: /* {{{ grapheme extract iter function pointer array */
778: typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
779:
780: static grapheme_extract_iter grapheme_extract_iters[] = {
781: &grapheme_extract_count_iter,
782: &grapheme_extract_bytecount_iter,
783: &grapheme_extract_charcount_iter,
784: };
785: /* }}} */
786:
787: /* {{{ proto string grapheme_extract(string str, int size[, int extract_type[, int start[, int next]]])
788: Function to extract a sequence of default grapheme clusters */
789: PHP_FUNCTION(grapheme_extract)
790: {
791: unsigned char *str, *pstr;
792: UChar *ustr;
793: int str_len, ustr_len;
794: long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
795: long lstart = 0; /* starting position in str in bytes */
796: int32_t start = 0;
797: long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
798: UErrorCode status;
799: unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
800: UBreakIterator* bi = NULL;
801: int ret_pos;
802: zval *next = NULL; /* return offset of next part of the string */
803:
804: if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|llz", (char **)&str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
805:
806: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
807: "grapheme_extract: unable to parse input param", 0 TSRMLS_CC );
808:
809: RETURN_FALSE;
810: }
811:
812: if ( NULL != next ) {
813: if ( !PZVAL_IS_REF(next) ) {
814: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
815: "grapheme_extract: 'next' was not passed by reference", 0 TSRMLS_CC );
816:
817: RETURN_FALSE;
818: }
819: else {
820: /* initialize next */
821: ZVAL_LONG(next, lstart);
822: }
823: }
824:
825: if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
826:
827: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
828: "grapheme_extract: unknown extract type param", 0 TSRMLS_CC );
829:
830: RETURN_FALSE;
831: }
832:
833: if ( lstart > INT32_MAX || lstart < 0 || lstart >= str_len ) {
834: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 TSRMLS_CC );
835: RETURN_FALSE;
836: }
837:
838: if ( size > INT32_MAX || size < 0) {
839: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: size is invalid", 0 TSRMLS_CC );
840: RETURN_FALSE;
841: }
842: if (size == 0) {
843: RETURN_EMPTY_STRING();
844: }
845:
846: /* we checked that it will fit: */
847: start = (int32_t) lstart;
848:
849: pstr = str + start;
850:
851: /* just in case pstr points in the middle of a character, move forward to the start of the next char */
852: if ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
853: unsigned char *str_end = str + str_len;
854:
855: while ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
856: pstr++;
857: if ( pstr >= str_end ) {
858: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
859: "grapheme_extract: invalid input string", 0 TSRMLS_CC );
860:
861: RETURN_FALSE;
862: }
863: }
864: }
865:
866: str_len -= (pstr - str);
867:
868: /* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
869: (size + 1 because the size-th character might be the beginning of a grapheme cluster)
870: */
871:
872: if ( -1 != grapheme_ascii_check(pstr, size + 1 < str_len ? size + 1 : str_len ) ) {
873: long nsize = ( size < str_len ? size : str_len );
874: if ( NULL != next ) {
875: ZVAL_LONG(next, start+nsize);
876: }
877: RETURN_STRINGL(((char *)pstr), nsize, 1);
878: }
879:
880: /* convert the strings to UTF-16. */
881: ustr = NULL;
882: ustr_len = 0;
883: status = U_ZERO_ERROR;
884: intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)pstr, str_len, &status );
885:
886: if ( U_FAILURE( status ) ) {
887: /* Set global error code. */
888: intl_error_set_code( NULL, status TSRMLS_CC );
889:
890: /* Set error messages. */
891: intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
892:
893: if ( NULL != ustr )
894: efree( ustr );
895:
896: RETURN_FALSE;
897: }
898:
899: bi = NULL;
900: status = U_ZERO_ERROR;
901: bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );
902:
903: ubrk_setText(bi, ustr, ustr_len, &status);
904:
905: /* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
906: can't back up. So, we will not do anything. */
907:
908: /* now we need to find the end of the chunk the user wants us to return */
909:
910: ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, pstr, str_len);
911:
912: if (ustr) {
913: efree(ustr);
914: }
915: ubrk_close(bi);
916:
917: if ( NULL != next ) {
918: ZVAL_LONG(next, start+ret_pos);
919: }
920:
921: RETURN_STRINGL(((char *)pstr), ret_pos, 1);
922: }
923:
924: /* }}} */
925:
926: /*
927: * Local variables:
928: * tab-width: 4
929: * c-basic-offset: 4
930: * End:
931: * vim600: fdm=marker
932: * vim: noet sw=4 ts=4
933: */
934:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>