Return to php_unicode.c CVS log | Up to [ELWIX - Embedded LightWeight unIX -] / embedaddon / php / ext / mbstring |
1.1 misho 1: /*
2: +----------------------------------------------------------------------+
3: | PHP Version 5 |
4: +----------------------------------------------------------------------+
1.1.1.3 ! misho 5: | Copyright (c) 1997-2014 The PHP Group |
1.1 misho 6: +----------------------------------------------------------------------+
7: | This source file is subject to version 3.01 of the PHP license, |
8: | that is bundled with this package in the file LICENSE, and is |
9: | available through the world-wide-web at the following url: |
10: | http://www.php.net/license/3_01.txt |
11: | If you did not receive a copy of the PHP license and are unable to |
12: | obtain it through the world-wide-web, please send a note to |
13: | license@php.net so we can mail you a copy immediately. |
14: +----------------------------------------------------------------------+
15: | Author: Wez Furlong (wez@thebrainroom.com) |
16: +----------------------------------------------------------------------+
17:
18: Based on code from ucdata-2.5, which has the following Copyright:
19:
20: Copyright 2001 Computing Research Labs, New Mexico State University
21:
22: Permission is hereby granted, free of charge, to any person obtaining a
23: copy of this software and associated documentation files (the "Software"),
24: to deal in the Software without restriction, including without limitation
25: the rights to use, copy, modify, merge, publish, distribute, sublicense,
26: and/or sell copies of the Software, and to permit persons to whom the
27: Software is furnished to do so, subject to the following conditions:
28:
29: The above copyright notice and this permission notice shall be included in
30: all copies or substantial portions of the Software.
31: */
32:
33: #ifdef HAVE_CONFIG_H
34: #include "config.h"
35: #endif
36:
37: #include "php.h"
38: #include "php_ini.h"
39:
40: #if HAVE_MBSTRING
41:
42: /* include case folding data generated from the official UnicodeData.txt file */
43: #include "mbstring.h"
44: #include "php_unicode.h"
45: #include "unicode_data.h"
46:
47: ZEND_EXTERN_MODULE_GLOBALS(mbstring)
48:
/*
 * Single-bit masks for a 32-bit word: masks32[i] has only bit i set
 * (masks32[0] == 0x00000001 ... masks32[31] == 0x80000000).
 *
 * The table is only ever read, so it is declared const; this lets the
 * compiler place it in read-only storage and rejects accidental writes.
 */
static const unsigned long masks32[32] = {
	0x00000001UL, 0x00000002UL, 0x00000004UL, 0x00000008UL,
	0x00000010UL, 0x00000020UL, 0x00000040UL, 0x00000080UL,
	0x00000100UL, 0x00000200UL, 0x00000400UL, 0x00000800UL,
	0x00001000UL, 0x00002000UL, 0x00004000UL, 0x00008000UL,
	0x00010000UL, 0x00020000UL, 0x00040000UL, 0x00080000UL,
	0x00100000UL, 0x00200000UL, 0x00400000UL, 0x00800000UL,
	0x01000000UL, 0x02000000UL, 0x04000000UL, 0x08000000UL,
	0x10000000UL, 0x20000000UL, 0x40000000UL, 0x80000000UL
};
60:
61:
62: static int prop_lookup(unsigned long code, unsigned long n)
63: {
64: long l, r, m;
65:
66: /*
67: * There is an extra node on the end of the offsets to allow this routine
68: * to work right. If the index is 0xffff, then there are no nodes for the
69: * property.
70: */
71: if ((l = _ucprop_offsets[n]) == 0xffff)
72: return 0;
73:
74: /*
75: * Locate the next offset that is not 0xffff. The sentinel at the end of
76: * the array is the max index value.
77: */
78: for (m = 1; n + m < _ucprop_size && _ucprop_offsets[n + m] == 0xffff; m++)
79: ;
80:
81: r = _ucprop_offsets[n + m] - 1;
82:
83: while (l <= r) {
84: /*
85: * Determine a "mid" point and adjust to make sure the mid point is at
86: * the beginning of a range pair.
87: */
88: m = (l + r) >> 1;
89: m -= (m & 1);
90: if (code > _ucprop_ranges[m + 1])
91: l = m + 2;
92: else if (code < _ucprop_ranges[m])
93: r = m - 2;
94: else if (code >= _ucprop_ranges[m] && code <= _ucprop_ranges[m + 1])
95: return 1;
96: }
97: return 0;
98:
99: }
100:
101: MBSTRING_API int php_unicode_is_prop(unsigned long code, unsigned long mask1,
102: unsigned long mask2)
103: {
104: unsigned long i;
105:
106: if (mask1 == 0 && mask2 == 0)
107: return 0;
108:
109: for (i = 0; mask1 && i < 32; i++) {
110: if ((mask1 & masks32[i]) && prop_lookup(code, i))
111: return 1;
112: }
113:
114: for (i = 32; mask2 && i < _ucprop_size; i++) {
115: if ((mask2 & masks32[i & 31]) && prop_lookup(code, i))
116: return 1;
117: }
118:
119: return 0;
120: }
121:
122: static unsigned long case_lookup(unsigned long code, long l, long r, int field)
123: {
124: long m;
125:
126: /*
127: * Do the binary search.
128: */
129: while (l <= r) {
130: /*
131: * Determine a "mid" point and adjust to make sure the mid point is at
132: * the beginning of a case mapping triple.
133: */
134: m = (l + r) >> 1;
135: m -= (m % 3);
136: if (code > _uccase_map[m])
137: l = m + 3;
138: else if (code < _uccase_map[m])
139: r = m - 3;
140: else if (code == _uccase_map[m])
141: return _uccase_map[m + field];
142: }
143:
144: return code;
145: }
146:
147: MBSTRING_API unsigned long php_turkish_toupper(unsigned long code, long l, long r, int field)
148: {
149: if (code == 0x0069L) {
150: return 0x0130L;
151: }
152: return case_lookup(code, l, r, field);
153: }
154:
155: MBSTRING_API unsigned long php_turkish_tolower(unsigned long code, long l, long r, int field)
156: {
157: if (code == 0x0049L) {
158: return 0x0131L;
159: }
160: return case_lookup(code, l, r, field);
161: }
162:
163: MBSTRING_API unsigned long php_unicode_toupper(unsigned long code, enum mbfl_no_encoding enc TSRMLS_DC)
164: {
165: int field;
166: long l, r;
167:
168: if (php_unicode_is_upper(code))
169: return code;
170:
171: if (php_unicode_is_lower(code)) {
172: /*
173: * The character is lower case.
174: */
175: field = 2;
176: l = _uccase_len[0];
177: r = (l + _uccase_len[1]) - 3;
178:
179: if (enc == mbfl_no_encoding_8859_9) {
180: return php_turkish_toupper(code, l, r, field);
181: }
182:
183: } else {
184: /*
185: * The character is title case.
186: */
187: field = 1;
188: l = _uccase_len[0] + _uccase_len[1];
189: r = _uccase_size - 3;
190: }
191: return case_lookup(code, l, r, field);
192: }
193:
194: MBSTRING_API unsigned long php_unicode_tolower(unsigned long code, enum mbfl_no_encoding enc TSRMLS_DC)
195: {
196: int field;
197: long l, r;
198:
199: if (php_unicode_is_lower(code))
200: return code;
201:
202: if (php_unicode_is_upper(code)) {
203: /*
204: * The character is upper case.
205: */
206: field = 1;
207: l = 0;
208: r = _uccase_len[0] - 3;
209:
210: if (enc == mbfl_no_encoding_8859_9) {
211: return php_turkish_tolower(code, l, r, field);
212: }
213:
214: } else {
215: /*
216: * The character is title case.
217: */
218: field = 2;
219: l = _uccase_len[0] + _uccase_len[1];
220: r = _uccase_size - 3;
221: }
222: return case_lookup(code, l, r, field);
223: }
224:
225: MBSTRING_API unsigned long php_unicode_totitle(unsigned long code, enum mbfl_no_encoding enc TSRMLS_DC)
226: {
227: int field;
228: long l, r;
229:
230: if (php_unicode_is_title(code))
231: return code;
232:
233: /*
234: * The offset will always be the same for converting to title case.
235: */
236: field = 2;
237:
238: if (php_unicode_is_upper(code)) {
239: /*
240: * The character is upper case.
241: */
242: l = 0;
243: r = _uccase_len[0] - 3;
244: } else {
245: /*
246: * The character is lower case.
247: */
248: l = _uccase_len[0];
249: r = (l + _uccase_len[1]) - 3;
250: }
251: return case_lookup(code, l, r, field);
252:
253: }
254:
255:
/*
 * Assemble the four bytes starting at `ptr` (most significant byte first)
 * into one unsigned 32-bit value.
 *
 * Each byte is widened to unsigned int BEFORE it is shifted.  Without that,
 * the byte is promoted to (signed) int, and shifting a value >= 0x80 left by
 * 24 moves a bit into the sign position, which is undefined behaviour and
 * yields a negative int that sign-extends when passed as unsigned long.
 *
 * `ptr` is evaluated four times; do not pass an expression with side effects.
 */
#define BE_ARY_TO_UINT32(ptr) (\
	((unsigned int)((const unsigned char *)(ptr))[0] << 24) |\
	((unsigned int)((const unsigned char *)(ptr))[1] << 16) |\
	((unsigned int)((const unsigned char *)(ptr))[2] <<  8) |\
	((unsigned int)((const unsigned char *)(ptr))[3]      ) )
261:
/*
 * Store the low 32 bits of `val` into the four bytes starting at `ptr`,
 * most significant byte first.
 *
 * `val` is evaluated exactly once, before any byte is written, so it may
 * safely read from the same location that `ptr` designates.  `ptr` is
 * evaluated four times; do not pass an expression with side effects.
 *
 * The body is wrapped in do { ... } while (0) so that the macro followed by
 * a semicolon behaves as a single statement, including inside an unbraced
 * if/else.
 */
#define UINT32_TO_BE_ARY(ptr,val) do { \
	unsigned int be_ary_val_ = (val); \
	((unsigned char*)(ptr))[0] = (unsigned char)((be_ary_val_ >> 24) & 0xff); \
	((unsigned char*)(ptr))[1] = (unsigned char)((be_ary_val_ >> 16) & 0xff); \
	((unsigned char*)(ptr))[2] = (unsigned char)((be_ary_val_ >>  8) & 0xff); \
	((unsigned char*)(ptr))[3] = (unsigned char)((be_ary_val_      ) & 0xff); \
} while (0)
269:
270: MBSTRING_API char *php_unicode_convert_case(int case_mode, const char *srcstr, size_t srclen, size_t *ret_len,
271: const char *src_encoding TSRMLS_DC)
272: {
273: char *unicode, *newstr;
274: size_t unicode_len;
275: unsigned char *unicode_ptr;
276: size_t i;
277: enum mbfl_no_encoding _src_encoding = mbfl_name2no_encoding(src_encoding);
278:
279: if (_src_encoding == mbfl_no_encoding_invalid) {
280: php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unknown encoding \"%s\"", src_encoding);
281: return NULL;
282: }
283:
284: unicode = php_mb_convert_encoding(srcstr, srclen, "UCS-4BE", src_encoding, &unicode_len TSRMLS_CC);
285: if (unicode == NULL)
286: return NULL;
287:
288: unicode_ptr = (unsigned char *)unicode;
289:
290: switch(case_mode) {
291: case PHP_UNICODE_CASE_UPPER:
292: for (i = 0; i < unicode_len; i+=4) {
293: UINT32_TO_BE_ARY(&unicode_ptr[i],
294: php_unicode_toupper(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding TSRMLS_CC));
295: }
296: break;
297:
298: case PHP_UNICODE_CASE_LOWER:
299: for (i = 0; i < unicode_len; i+=4) {
300: UINT32_TO_BE_ARY(&unicode_ptr[i],
301: php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding TSRMLS_CC));
302: }
303: break;
304:
305: case PHP_UNICODE_CASE_TITLE: {
306: int mode = 0;
307:
308: for (i = 0; i < unicode_len; i+=4) {
309: int res = php_unicode_is_prop(
310: BE_ARY_TO_UINT32(&unicode_ptr[i]),
311: UC_MN|UC_ME|UC_CF|UC_LM|UC_SK|UC_LU|UC_LL|UC_LT|UC_PO|UC_OS, 0);
312: if (mode) {
313: if (res) {
314: UINT32_TO_BE_ARY(&unicode_ptr[i],
315: php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding TSRMLS_CC));
316: } else {
317: mode = 0;
318: }
319: } else {
320: if (res) {
321: mode = 1;
322: UINT32_TO_BE_ARY(&unicode_ptr[i],
323: php_unicode_totitle(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding TSRMLS_CC));
324: }
325: }
326: }
327: } break;
328:
329: }
330:
331: newstr = php_mb_convert_encoding(unicode, unicode_len, src_encoding, "UCS-4BE", ret_len TSRMLS_CC);
332: efree(unicode);
333:
334: return newstr;
335: }
336:
337:
338: #endif /* HAVE_MBSTRING */
339:
340: /*
341: * Local variables:
342: * tab-width: 4
343: * c-basic-offset: 4
344: * End:
345: * vim600: sw=4 ts=4 fdm=marker
346: * vim<600: sw=4 ts=4
347: */