|
|
1.1 misho 1: /*
2: +----------------------------------------------------------------------+
3: | PHP Version 5 |
4: +----------------------------------------------------------------------+
5: | This source file is subject to version 3.01 of the PHP license, |
6: | that is bundled with this package in the file LICENSE, and is |
7: | available through the world-wide-web at the following url: |
8: | http://www.php.net/license/3_01.txt |
9: | If you did not receive a copy of the PHP license and are unable to |
10: | obtain it through the world-wide-web, please send a note to |
11: | license@php.net so we can mail you a copy immediately. |
12: +----------------------------------------------------------------------+
13: | Author: Ed Batutis <ed@batutis.com> |
14: +----------------------------------------------------------------------+
15: */
16:
17: /* {{{ includes */
18: #ifdef HAVE_CONFIG_H
19: #include "config.h"
20: #endif
21:
22: #include <php.h>
23: #include "grapheme.h"
24: #include "grapheme_util.h"
25: #include "intl_common.h"
26:
27: #include <unicode/utypes.h>
28: #include <unicode/ucol.h>
29: #include <unicode/ustring.h>
30: #include <unicode/ubrk.h>
31:
32: #include "ext/standard/php_string.h"
33:
34: ZEND_EXTERN_MODULE_GLOBALS( intl )
35:
36: /* }}} */
37:
38: /* {{{ grapheme_close_global_iterator - clean up */
39: void
40: grapheme_close_global_iterator( TSRMLS_D )
41: {
42: UBreakIterator *global_break_iterator = INTL_G( grapheme_iterator );
43:
44: if ( NULL != global_break_iterator ) {
45: ubrk_close(global_break_iterator);
46: }
47: }
48: /* }}} */
49:
50: /* {{{ grapheme_intl_case_fold: convert string to lowercase */
51: void
52: grapheme_intl_case_fold(UChar** ptr_to_free, UChar **str, int32_t *str_len, UErrorCode *pstatus )
53: {
54: UChar *dest;
55: int32_t dest_len, size_required;
56:
57: /* allocate a destination string that is a bit larger than the src, hoping that is enough */
58: dest_len = (*str_len) + ( *str_len / 10 );
59: dest = (UChar*) eumalloc(dest_len);
60:
61: *pstatus = U_ZERO_ERROR;
62: size_required = u_strFoldCase(dest, dest_len, *str, *str_len, U_FOLD_CASE_DEFAULT, pstatus);
63:
64: dest_len = size_required;
65:
66: if ( U_BUFFER_OVERFLOW_ERROR == *pstatus ) {
67:
68: dest = (UChar*) eurealloc(dest, dest_len);
69:
70: *pstatus = U_ZERO_ERROR;
71: size_required = u_strFoldCase(dest, dest_len, *str, *str_len, U_FOLD_CASE_DEFAULT, pstatus);
72: }
73:
74: if ( U_FAILURE(*pstatus) ) {
75: return;
76: }
77:
78: if ( NULL != ptr_to_free) {
79: efree(*ptr_to_free);
80: *ptr_to_free = dest;
81: }
82:
83: *str = dest;
84: *str_len = dest_len;
85:
86: return;
87: }
88: /* }}} */
89:
90: /* {{{ grapheme_substr_ascii f='from' - starting point, l='length' */
/*
 * Compute a substring of an ASCII string without copying.
 *
 * str, str_len   source string and its length in bytes
 * f              'from': start position; negative counts from the end
 * l              'length': negative means "stop that many chars before the end"
 * argc           number of arguments the caller received; l is only honoured
 *                when argc > 2, otherwise the rest of the string is taken
 * sub_str        out: pointer into str at the start of the substring, or NULL
 *                when the request lies outside the string
 * sub_str_len    out: length of the substring (0 when *sub_str is NULL)
 *
 * All range checks are written so that no intermediate value can overflow,
 * including f or l equal to INT_MIN.
 */
void
grapheme_substr_ascii(char *str, int str_len, int f, int l, int argc, char **sub_str, int *sub_str_len)
{
	*sub_str = NULL;
	*sub_str_len = 0;

	if (argc > 2) {
		/* same test as "-l > str_len", without negating l */
		if (l < 0 && l < -str_len) {
			return;
		} else if (l > str_len) {
			l = str_len;
		}
	} else {
		l = str_len;
	}

	/* same test as "-f > str_len", without negating f */
	if (f > str_len || (f < 0 && f < -str_len)) {
		return;
	}

	/* here l is in [-str_len, -1], so l + str_len cannot overflow */
	if (l < 0 && (l + str_len) < f) {
		return;
	}

	/* if "from" position is negative, count start position from the end
	 * of the string
	 */
	if (f < 0) {
		f = str_len + f;
		if (f < 0) {
			f = 0;
		}
	}

	/* if "length" position is negative, set it to the length
	 * needed to stop that many chars from the end of the string
	 */
	if (l < 0) {
		l = (str_len - f) + l;
		if (l < 0) {
			l = 0;
		}
	}

	if (f >= str_len) {
		return;
	}

	/* clamp to the end of the string; same as "(f + l) > str_len" without the addition */
	if (l > str_len - f) {
		l = str_len - f;
	}

	*sub_str = str + f;
	*sub_str_len = l;

	return;
}
148: /* }}} */
149:
150: /* {{{ grapheme_strrpos_utf16 - strrpos using utf16 */
151: int
152: grapheme_strrpos_utf16(unsigned char *haystack, int32_t haystack_len, unsigned char*needle, int32_t needle_len, int32_t offset, int f_ignore_case TSRMLS_DC)
153: {
154: UChar *uhaystack, *puhaystack, *uhaystack_end, *uneedle;
155: int32_t uhaystack_len, uneedle_len;
156: UErrorCode status;
157: unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
158: UBreakIterator* bi = NULL;
159: int ret_pos, pos;
160:
161: /* convert the strings to UTF-16. */
162: uhaystack = NULL;
163: uhaystack_len = 0;
164: status = U_ZERO_ERROR;
165: intl_convert_utf8_to_utf16(&uhaystack, &uhaystack_len, (char *) haystack, haystack_len, &status );
166:
167: if ( U_FAILURE( status ) ) {
168: /* Set global error code. */
169: intl_error_set_code( NULL, status TSRMLS_CC );
170:
171: /* Set error messages. */
172: intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
173: if (uhaystack) {
174: efree( uhaystack );
175: }
176: return -1;
177: }
178:
179: if ( f_ignore_case ) {
180: grapheme_intl_case_fold(&uhaystack, &uhaystack, &uhaystack_len, &status );
181: }
182:
183: /* get a pointer to the haystack taking into account the offset */
184: bi = NULL;
185: status = U_ZERO_ERROR;
186: bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );
187:
188: puhaystack = grapheme_get_haystack_offset(bi, uhaystack, uhaystack_len, offset);
189:
190: if ( NULL == puhaystack ) {
191: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
192: if (uhaystack) {
193: efree( uhaystack );
194: }
195: ubrk_close (bi);
196: return -1;
197: }
198:
199: uneedle = NULL;
200: uneedle_len = 0;
201: status = U_ZERO_ERROR;
202: intl_convert_utf8_to_utf16(&uneedle, &uneedle_len, (char *) needle, needle_len, &status );
203:
204: if ( U_FAILURE( status ) ) {
205: /* Set global error code. */
206: intl_error_set_code( NULL, status TSRMLS_CC );
207:
208: /* Set error messages. */
209: intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
210: if (uhaystack) {
211: efree( uhaystack );
212: }
213: if (uneedle) {
214: efree( uneedle );
215: }
216: ubrk_close (bi);
217: return -1;
218: }
219:
220: if ( f_ignore_case ) {
221: grapheme_intl_case_fold(&uneedle, &uneedle, &uneedle_len, &status );
222: }
223:
224: ret_pos = -1; /* -1 represents 'not found' */
225:
226: /* back up until there's needle_len characters to compare */
227:
228: uhaystack_end = uhaystack + uhaystack_len;
229: pos = ubrk_last(bi);
230: puhaystack = uhaystack + pos;
231:
232: while ( uhaystack_end - puhaystack < uneedle_len ) {
233:
234: pos = ubrk_previous(bi);
235:
236: if ( UBRK_DONE == pos ) {
237: break;
238: }
239:
240: puhaystack = uhaystack + pos;
241: }
242:
243: /* is there enough haystack left to hold the needle? */
244: if ( ( uhaystack_end - puhaystack ) < uneedle_len ) {
245: /* not enough, not found */
246: goto exit;
247: }
248:
249: while ( UBRK_DONE != pos ) {
250:
251: if (!u_memcmp(uneedle, puhaystack, uneedle_len)) { /* needle_len - 1 in zend memnstr? */
252:
253: /* does the grapheme in the haystack end at the same place as the last grapheme in the needle? */
254:
255: if ( ubrk_isBoundary(bi, pos + uneedle_len) ) {
256:
257: /* found it, get grapheme count offset */
258: ret_pos = grapheme_count_graphemes(bi, uhaystack, pos);
259: break;
260: }
261:
262: /* set position back */
263: ubrk_isBoundary(bi, pos);
264: }
265:
266: pos = ubrk_previous(bi);
267: puhaystack = uhaystack + pos;
268: }
269:
270: exit:
271: if (uhaystack) {
272: efree( uhaystack );
273: }
274: if (uneedle) {
275: efree( uneedle );
276: }
277: ubrk_close (bi);
278:
279: return ret_pos;
280: }
281:
282: /* }}} */
283:
284: /* {{{ grapheme_strpos_utf16 - strpos using utf16 */
285: int
286: grapheme_strpos_utf16(unsigned char *haystack, int32_t haystack_len, unsigned char*needle, int32_t needle_len, int32_t offset, int32_t *puchar_pos, int f_ignore_case TSRMLS_DC)
287: {
288: UChar *uhaystack, *puhaystack, *uneedle;
289: int32_t uhaystack_len, uneedle_len;
290: int ret_pos;
291: unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
292: UBreakIterator* bi;
293: UErrorCode status;
294:
295: *puchar_pos = -1;
296:
297: /* convert the strings to UTF-16. */
298:
299: uhaystack = NULL;
300: uhaystack_len = 0;
301: status = U_ZERO_ERROR;
302: intl_convert_utf8_to_utf16(&uhaystack, &uhaystack_len, (char *) haystack, haystack_len, &status );
303:
304: if ( U_FAILURE( status ) ) {
305: /* Set global error code. */
306: intl_error_set_code( NULL, status TSRMLS_CC );
307:
308: /* Set error messages. */
309: intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
310: if (uhaystack) {
311: efree( uhaystack );
312: }
313: return -1;
314: }
315:
316: /* get a pointer to the haystack taking into account the offset */
317: bi = NULL;
318: status = U_ZERO_ERROR;
319: bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );
320:
321: puhaystack = grapheme_get_haystack_offset(bi, uhaystack, uhaystack_len, offset);
322: uhaystack_len = (uhaystack_len - ( puhaystack - uhaystack));
323:
324: if ( NULL == puhaystack ) {
325:
326: intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
327: if (uhaystack) {
328: efree( uhaystack );
329: }
330: ubrk_close (bi);
331:
332: return -1;
333: }
334:
335: if ( f_ignore_case ) {
336: grapheme_intl_case_fold(&uhaystack, &puhaystack, &uhaystack_len, &status );
337: }
338:
339: uneedle = NULL;
340: uneedle_len = 0;
341: status = U_ZERO_ERROR;
342: intl_convert_utf8_to_utf16(&uneedle, &uneedle_len, (char *) needle, needle_len, &status );
343:
344: if ( U_FAILURE( status ) ) {
345: /* Set global error code. */
346: intl_error_set_code( NULL, status TSRMLS_CC );
347:
348: /* Set error messages. */
349: intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
350: if (uhaystack) {
351: efree( uhaystack );
352: }
353: if (uneedle) {
354: efree( uneedle );
355: }
356: ubrk_close (bi);
357:
358: return -1;
359: }
360:
361: if ( f_ignore_case ) {
362: grapheme_intl_case_fold(&uneedle, &uneedle, &uneedle_len, &status );
363: }
364:
365: ret_pos = grapheme_memnstr_grapheme(bi, puhaystack, uneedle, uneedle_len, puhaystack + uhaystack_len );
366:
367: *puchar_pos = ubrk_current(bi);
368:
369: if (uhaystack) {
370: efree( uhaystack );
371: }
372: if (uneedle) {
373: efree( uneedle );
374: }
375: ubrk_close (bi);
376:
377: return ret_pos;
378: }
379:
380: /* }}} */
381:
382: /* {{{ grapheme_ascii_check: ASCII check */
/*
 * Check whether a byte buffer contains only 7-bit ASCII.
 *
 * Returns len when every one of the len bytes is <= 0x7f, or -1 as soon as
 * a byte with the high bit set is encountered.
 */
int grapheme_ascii_check(const unsigned char *day, int32_t len)
{
	const unsigned char *cursor = day;
	int32_t remaining;

	for ( remaining = len; remaining != 0; remaining-- ) {
		if ( *cursor >= 0x80 ) {
			return -1;
		}
		cursor++;
	}

	return len;
}
393:
394: /* }}} */
395:
396: /* {{{ grapheme_split_string: find and optionally return grapheme boundaries */
397: int grapheme_split_string(const UChar *text, int32_t text_length, int boundary_array[], int boundary_array_len TSRMLS_DC )
398: {
399: unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
400: UErrorCode status = U_ZERO_ERROR;
401: int ret_len, pos;
402: UBreakIterator* bi;
403:
404: bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status TSRMLS_CC );
405:
406: if( U_FAILURE(status) ) {
407: return -1;
408: }
409:
410: ubrk_setText(bi, text, text_length, &status);
411:
412: pos = 0;
413:
414: for ( ret_len = 0; pos != UBRK_DONE; ) {
415:
416: pos = ubrk_next(bi);
417:
418: if ( pos != UBRK_DONE ) {
419:
420: if ( NULL != boundary_array && ret_len < boundary_array_len ) {
421: boundary_array[ret_len] = pos;
422: }
423:
424: ret_len++;
425: }
426: }
427:
428: ubrk_close(bi);
429:
430: return ret_len;
431: }
432: /* }}} */
433:
434: /* {{{ grapheme_count_graphemes */
435: int32_t
436: grapheme_count_graphemes(UBreakIterator *bi, UChar *string, int32_t string_len)
437: {
438: int ret_len = 0;
439: int pos = 0;
440: UErrorCode status = U_ZERO_ERROR;
441:
442: ubrk_setText(bi, string, string_len, &status);
443:
444: do {
445:
446: pos = ubrk_next(bi);
447:
448: if ( UBRK_DONE != pos ) {
449: ret_len++;
450: }
451:
452: } while ( UBRK_DONE != pos );
453:
454: return ret_len;
455: }
456: /* }}} */
457:
458: /* {{{ grapheme_memnstr_grapheme: find needle in haystack using grapheme boundaries */
459: int32_t
460: grapheme_memnstr_grapheme(UBreakIterator *bi, UChar *haystack, UChar *needle, int32_t needle_len, UChar *end)
461: {
462: UChar *p = haystack;
463: UChar ne = needle[needle_len-1];
464: UErrorCode status;
465: int32_t grapheme_offset;
466:
467: end -= needle_len;
468:
469: while (p <= end) {
470:
471: if ((p = u_memchr(p, *needle, (end-p+1))) && ne == p[needle_len-1]) {
472:
473: if (!u_memcmp(needle, p, needle_len - 1)) { /* needle_len - 1 works because if needle_len is 1, we've already tested the char */
474:
475: /* does the grapheme end here? */
476:
477: status = U_ZERO_ERROR;
478: ubrk_setText (bi, haystack, (end - haystack) + needle_len, &status);
479:
480: if ( ubrk_isBoundary (bi, (p - haystack) + needle_len) ) {
481:
482: /* found it, get grapheme count offset */
483: grapheme_offset = grapheme_count_graphemes(bi, haystack, (p - haystack));
484:
485: return grapheme_offset;
486: }
487: }
488: }
489:
490: if (p == NULL) {
491: return -1;
492: }
493:
494: p++;
495: }
496:
497: return -1;
498: }
499:
500: /* }}} */
501:
502: /* {{{ grapheme_memrchr_grapheme: reverse find a byte in a memory block (memrchr equivalent) */
/*
 * Locate the last occurrence of byte c in the first n bytes of s.
 *
 * Returns a pointer to the matching byte, or NULL when n <= 0 or the byte
 * does not occur.
 *
 * Defined without `inline`: under C99/C11 a plain `inline` definition does
 * not provide an external definition, which breaks linking for callers.
 */
void *grapheme_memrchr_grapheme(const void *s, int c, int32_t n)
{
	const unsigned char *start = (const unsigned char *)s;
	const unsigned char *e;
	const unsigned char target = (unsigned char)c;

	if (n <= 0) {
		return NULL;
	}

	/* walk back from one-past-the-end so the pointer never drops below start */
	e = start + n;
	while (e > start) {
		--e;
		if (*e == target) {
			return (void *)e;
		}
	}

	return NULL;
}
519: /* }}} */
520:
521: /* {{{ grapheme_get_haystack_offset - bump the haystack pointer based on the grapheme count offset */
522: UChar *
523: grapheme_get_haystack_offset(UBreakIterator* bi, UChar *uhaystack, int32_t uhaystack_len, int32_t offset)
524: {
525: UErrorCode status;
526: int32_t pos;
527: int32_t (*iter_op)(UBreakIterator* bi);
528: int iter_incr;
529:
530: if ( NULL != bi ) {
531: status = U_ZERO_ERROR;
532: ubrk_setText (bi, uhaystack, uhaystack_len, &status);
533: }
534:
535: if ( 0 == offset ) {
536: return uhaystack;
537: }
538:
539: if ( offset < 0 ) {
540: iter_op = ubrk_previous;
541: ubrk_last(bi); /* one past the end */
542: iter_incr = 1;
543: }
544: else {
545: iter_op = ubrk_next;
546: iter_incr = -1;
547: }
548:
549: pos = 0;
550:
551: while ( pos != UBRK_DONE && offset != 0 ) {
552:
553: pos = iter_op(bi);
554:
555: if ( UBRK_DONE != pos ) {
556: offset += iter_incr;
557: }
558: }
559:
560: if ( offset != 0 ) {
561: return NULL;
562: }
563:
564: return uhaystack + pos;
565: }
566: /* }}} */
567:
568: /* {{{ grapheme_strrpos_ascii: borrowed from the php ext/standard/string.c */
/*
 * Find the last occurrence of needle in an ASCII haystack (byte-wise).
 *
 * haystack/haystack_len   bytes to search and their count
 * needle/needle_len       bytes to look for and their count
 * offset                  >= 0: matches must start at or after this index;
 *                         <  0: matches must start no later than
 *                               haystack_len + offset, unless the needle is
 *                               longer than -offset, in which case the last
 *                               position where the needle fits is used
 *
 * Returns the byte index of the match from the start of the haystack, or -1
 * when there is none.
 *
 * The search runs on indices rather than pointers so that no pointer before
 * the start of the haystack is ever formed, and offset is never negated.
 */
int32_t
grapheme_strrpos_ascii(unsigned char *haystack, int32_t haystack_len, unsigned char *needle, int32_t needle_len, int32_t offset)
{
	int32_t first; /* lowest candidate start index */
	int32_t last;  /* highest candidate start index */
	int32_t i;

	if (offset >= 0) {
		first = offset;
		last = haystack_len - needle_len;
	} else {
		first = 0;
		/* same test as "needle_len > -offset", without negating offset */
		if (offset > -needle_len) {
			last = haystack_len - needle_len;
		} else {
			last = haystack_len + offset;
		}
	}

	if (needle_len == 1) {
		/* Single character search can shortcut memcmps */
		for (i = last; i >= first; i--) {
			if (haystack[i] == *needle) {
				return i;
			}
		}
		return -1;
	}

	for (i = last; i >= first; i--) {
		if (memcmp(haystack + i, needle, needle_len) == 0) {
			return i;
		}
	}

	return -1;
}
606:
607: /* }}} */
608:
609: /* {{{ grapheme_get_break_iterator: get a clone of the global character break iterator */
610: UBreakIterator*
611: grapheme_get_break_iterator(void *stack_buffer, UErrorCode *status TSRMLS_DC )
612: {
613: int32_t buffer_size;
614:
615: UBreakIterator *global_break_iterator = INTL_G( grapheme_iterator );
616:
617: if ( NULL == global_break_iterator ) {
618:
619: global_break_iterator = ubrk_open(UBRK_CHARACTER,
620: NULL, /* icu default locale - locale has no effect on this iterator */
621: NULL, /* text not set in global iterator */
622: 0, /* text length = 0 */
623: status);
624:
625: INTL_G(grapheme_iterator) = global_break_iterator;
626: }
627:
628: buffer_size = U_BRK_SAFECLONE_BUFFERSIZE;
629:
630: return ubrk_safeClone(global_break_iterator, stack_buffer, &buffer_size, status);
631: }
632: /* }}} */
633:
634: /*
635: * Local variables:
636: * tab-width: 4
637: * c-basic-offset: 4
638: * End:
639: * vim600: fdm=marker
640: * vim: noet sw=4 ts=4
641: */
642: