Annotation of embedaddon/sqlite3/ext/icu/icu.c, revision 1.1.1.1
1.1 misho 1: /*
2: ** 2007 May 6
3: **
4: ** The author disclaims copyright to this source code. In place of
5: ** a legal notice, here is a blessing:
6: **
7: ** May you do good and not evil.
8: ** May you find forgiveness for yourself and forgive others.
9: ** May you share freely, never taking more than you give.
10: **
11: *************************************************************************
12: ** $Id: icu.c,v 1.7 2007/12/13 21:54:11 drh Exp $
13: **
14: ** This file implements an integration between the ICU library
15: ** ("International Components for Unicode", an open-source library
16: ** for handling unicode data) and SQLite. The integration uses
17: ** ICU to provide the following to SQLite:
18: **
19: ** * An implementation of the SQL regexp() function (and hence REGEXP
20: ** operator) using the ICU uregex_XX() APIs.
21: **
22: ** * Implementations of the SQL scalar upper() and lower() functions
23: ** for case mapping.
24: **
25: ** * Integration of ICU and SQLite collation sequences.
26: **
27: ** * An implementation of the LIKE operator that uses ICU to
28: ** provide case-independent matching.
29: */
30:
31: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU)
32:
33: /* Include ICU headers */
34: #include <unicode/utypes.h>
35: #include <unicode/uregex.h>
36: #include <unicode/ustring.h>
37: #include <unicode/ucol.h>
38:
39: #include <assert.h>
40:
41: #ifndef SQLITE_CORE
42: #include "sqlite3ext.h"
43: SQLITE_EXTENSION_INIT1
44: #else
45: #include "sqlite3.h"
46: #endif
47:
48: /*
49: ** Maximum length (in bytes) of the pattern in a LIKE or GLOB
50: ** operator.
51: */
52: #ifndef SQLITE_MAX_LIKE_PATTERN_LENGTH
53: # define SQLITE_MAX_LIKE_PATTERN_LENGTH 50000
54: #endif
55:
/*
** Thin wrapper around sqlite3_free() that is guaranteed to be a real
** function rather than a macro, so that its address can be handed to
** SQLite as a destructor callback.
*/
static void xFree(void *pMem){
  sqlite3_free(pMem);
}
62:
63: /*
64: ** Compare two UTF-8 strings for equality where the first string is
65: ** a "LIKE" expression. Return true (1) if they are the same and
66: ** false (0) if they are different.
67: */
68: static int icuLikeCompare(
69: const uint8_t *zPattern, /* LIKE pattern */
70: const uint8_t *zString, /* The UTF-8 string to compare against */
71: const UChar32 uEsc /* The escape character */
72: ){
73: static const int MATCH_ONE = (UChar32)'_';
74: static const int MATCH_ALL = (UChar32)'%';
75:
76: int iPattern = 0; /* Current byte index in zPattern */
77: int iString = 0; /* Current byte index in zString */
78:
79: int prevEscape = 0; /* True if the previous character was uEsc */
80:
81: while( zPattern[iPattern]!=0 ){
82:
83: /* Read (and consume) the next character from the input pattern. */
84: UChar32 uPattern;
85: U8_NEXT_UNSAFE(zPattern, iPattern, uPattern);
86: assert(uPattern!=0);
87:
88: /* There are now 4 possibilities:
89: **
90: ** 1. uPattern is an unescaped match-all character "%",
91: ** 2. uPattern is an unescaped match-one character "_",
92: ** 3. uPattern is an unescaped escape character, or
93: ** 4. uPattern is to be handled as an ordinary character
94: */
95: if( !prevEscape && uPattern==MATCH_ALL ){
96: /* Case 1. */
97: uint8_t c;
98:
99: /* Skip any MATCH_ALL or MATCH_ONE characters that follow a
100: ** MATCH_ALL. For each MATCH_ONE, skip one character in the
101: ** test string.
102: */
103: while( (c=zPattern[iPattern]) == MATCH_ALL || c == MATCH_ONE ){
104: if( c==MATCH_ONE ){
105: if( zString[iString]==0 ) return 0;
106: U8_FWD_1_UNSAFE(zString, iString);
107: }
108: iPattern++;
109: }
110:
111: if( zPattern[iPattern]==0 ) return 1;
112:
113: while( zString[iString] ){
114: if( icuLikeCompare(&zPattern[iPattern], &zString[iString], uEsc) ){
115: return 1;
116: }
117: U8_FWD_1_UNSAFE(zString, iString);
118: }
119: return 0;
120:
121: }else if( !prevEscape && uPattern==MATCH_ONE ){
122: /* Case 2. */
123: if( zString[iString]==0 ) return 0;
124: U8_FWD_1_UNSAFE(zString, iString);
125:
126: }else if( !prevEscape && uPattern==uEsc){
127: /* Case 3. */
128: prevEscape = 1;
129:
130: }else{
131: /* Case 4. */
132: UChar32 uString;
133: U8_NEXT_UNSAFE(zString, iString, uString);
134: uString = u_foldCase(uString, U_FOLD_CASE_DEFAULT);
135: uPattern = u_foldCase(uPattern, U_FOLD_CASE_DEFAULT);
136: if( uString!=uPattern ){
137: return 0;
138: }
139: prevEscape = 0;
140: }
141: }
142:
143: return zString[iString]==0;
144: }
145:
146: /*
147: ** Implementation of the like() SQL function. This function implements
148: ** the build-in LIKE operator. The first argument to the function is the
149: ** pattern and the second argument is the string. So, the SQL statements:
150: **
151: ** A LIKE B
152: **
153: ** is implemented as like(B, A). If there is an escape character E,
154: **
155: ** A LIKE B ESCAPE E
156: **
157: ** is mapped to like(B, A, E).
158: */
159: static void icuLikeFunc(
160: sqlite3_context *context,
161: int argc,
162: sqlite3_value **argv
163: ){
164: const unsigned char *zA = sqlite3_value_text(argv[0]);
165: const unsigned char *zB = sqlite3_value_text(argv[1]);
166: UChar32 uEsc = 0;
167:
168: /* Limit the length of the LIKE or GLOB pattern to avoid problems
169: ** of deep recursion and N*N behavior in patternCompare().
170: */
171: if( sqlite3_value_bytes(argv[0])>SQLITE_MAX_LIKE_PATTERN_LENGTH ){
172: sqlite3_result_error(context, "LIKE or GLOB pattern too complex", -1);
173: return;
174: }
175:
176:
177: if( argc==3 ){
178: /* The escape character string must consist of a single UTF-8 character.
179: ** Otherwise, return an error.
180: */
181: int nE= sqlite3_value_bytes(argv[2]);
182: const unsigned char *zE = sqlite3_value_text(argv[2]);
183: int i = 0;
184: if( zE==0 ) return;
185: U8_NEXT(zE, i, nE, uEsc);
186: if( i!=nE){
187: sqlite3_result_error(context,
188: "ESCAPE expression must be a single character", -1);
189: return;
190: }
191: }
192:
193: if( zA && zB ){
194: sqlite3_result_int(context, icuLikeCompare(zA, zB, uEsc));
195: }
196: }
197:
198: /*
199: ** This function is called when an ICU function called from within
200: ** the implementation of an SQL scalar function returns an error.
201: **
202: ** The scalar function context passed as the first argument is
203: ** loaded with an error message based on the following two args.
204: */
205: static void icuFunctionError(
206: sqlite3_context *pCtx, /* SQLite scalar function context */
207: const char *zName, /* Name of ICU function that failed */
208: UErrorCode e /* Error code returned by ICU function */
209: ){
210: char zBuf[128];
211: sqlite3_snprintf(128, zBuf, "ICU error: %s(): %s", zName, u_errorName(e));
212: zBuf[127] = '\0';
213: sqlite3_result_error(pCtx, zBuf, -1);
214: }
215:
216: /*
217: ** Function to delete compiled regexp objects. Registered as
218: ** a destructor function with sqlite3_set_auxdata().
219: */
220: static void icuRegexpDelete(void *p){
221: URegularExpression *pExpr = (URegularExpression *)p;
222: uregex_close(pExpr);
223: }
224:
225: /*
226: ** Implementation of SQLite REGEXP operator. This scalar function takes
227: ** two arguments. The first is a regular expression pattern to compile
228: ** the second is a string to match against that pattern. If either
229: ** argument is an SQL NULL, then NULL Is returned. Otherwise, the result
230: ** is 1 if the string matches the pattern, or 0 otherwise.
231: **
232: ** SQLite maps the regexp() function to the regexp() operator such
233: ** that the following two are equivalent:
234: **
235: ** zString REGEXP zPattern
236: ** regexp(zPattern, zString)
237: **
238: ** Uses the following ICU regexp APIs:
239: **
240: ** uregex_open()
241: ** uregex_matches()
242: ** uregex_close()
243: */
244: static void icuRegexpFunc(sqlite3_context *p, int nArg, sqlite3_value **apArg){
245: UErrorCode status = U_ZERO_ERROR;
246: URegularExpression *pExpr;
247: UBool res;
248: const UChar *zString = sqlite3_value_text16(apArg[1]);
249:
250: (void)nArg; /* Unused parameter */
251:
252: /* If the left hand side of the regexp operator is NULL,
253: ** then the result is also NULL.
254: */
255: if( !zString ){
256: return;
257: }
258:
259: pExpr = sqlite3_get_auxdata(p, 0);
260: if( !pExpr ){
261: const UChar *zPattern = sqlite3_value_text16(apArg[0]);
262: if( !zPattern ){
263: return;
264: }
265: pExpr = uregex_open(zPattern, -1, 0, 0, &status);
266:
267: if( U_SUCCESS(status) ){
268: sqlite3_set_auxdata(p, 0, pExpr, icuRegexpDelete);
269: }else{
270: assert(!pExpr);
271: icuFunctionError(p, "uregex_open", status);
272: return;
273: }
274: }
275:
276: /* Configure the text that the regular expression operates on. */
277: uregex_setText(pExpr, zString, -1, &status);
278: if( !U_SUCCESS(status) ){
279: icuFunctionError(p, "uregex_setText", status);
280: return;
281: }
282:
283: /* Attempt the match */
284: res = uregex_matches(pExpr, 0, &status);
285: if( !U_SUCCESS(status) ){
286: icuFunctionError(p, "uregex_matches", status);
287: return;
288: }
289:
290: /* Set the text that the regular expression operates on to a NULL
291: ** pointer. This is not really necessary, but it is tidier than
292: ** leaving the regular expression object configured with an invalid
293: ** pointer after this function returns.
294: */
295: uregex_setText(pExpr, 0, 0, &status);
296:
297: /* Return 1 or 0. */
298: sqlite3_result_int(p, res ? 1 : 0);
299: }
300:
301: /*
302: ** Implementations of scalar functions for case mapping - upper() and
303: ** lower(). Function upper() converts its input to upper-case (ABC).
304: ** Function lower() converts to lower-case (abc).
305: **
306: ** ICU provides two types of case mapping, "general" case mapping and
307: ** "language specific". Refer to ICU documentation for the differences
308: ** between the two.
309: **
310: ** To utilise "general" case mapping, the upper() or lower() scalar
311: ** functions are invoked with one argument:
312: **
313: ** upper('ABC') -> 'abc'
314: ** lower('abc') -> 'ABC'
315: **
316: ** To access ICU "language specific" case mapping, upper() or lower()
317: ** should be invoked with two arguments. The second argument is the name
318: ** of the locale to use. Passing an empty string ("") or SQL NULL value
319: ** as the second argument is the same as invoking the 1 argument version
320: ** of upper() or lower().
321: **
322: ** lower('I', 'en_us') -> 'i'
323: ** lower('I', 'tr_tr') -> 'ı' (small dotless i)
324: **
325: ** http://www.icu-project.org/userguide/posix.html#case_mappings
326: */
327: static void icuCaseFunc16(sqlite3_context *p, int nArg, sqlite3_value **apArg){
328: const UChar *zInput;
329: UChar *zOutput;
330: int nInput;
331: int nOutput;
332:
333: UErrorCode status = U_ZERO_ERROR;
334: const char *zLocale = 0;
335:
336: assert(nArg==1 || nArg==2);
337: if( nArg==2 ){
338: zLocale = (const char *)sqlite3_value_text(apArg[1]);
339: }
340:
341: zInput = sqlite3_value_text16(apArg[0]);
342: if( !zInput ){
343: return;
344: }
345: nInput = sqlite3_value_bytes16(apArg[0]);
346:
347: nOutput = nInput * 2 + 2;
348: zOutput = sqlite3_malloc(nOutput);
349: if( !zOutput ){
350: return;
351: }
352:
353: if( sqlite3_user_data(p) ){
354: u_strToUpper(zOutput, nOutput/2, zInput, nInput/2, zLocale, &status);
355: }else{
356: u_strToLower(zOutput, nOutput/2, zInput, nInput/2, zLocale, &status);
357: }
358:
359: if( !U_SUCCESS(status) ){
360: icuFunctionError(p, "u_strToLower()/u_strToUpper", status);
361: return;
362: }
363:
364: sqlite3_result_text16(p, zOutput, -1, xFree);
365: }
366:
367: /*
368: ** Collation sequence destructor function. The pCtx argument points to
369: ** a UCollator structure previously allocated using ucol_open().
370: */
371: static void icuCollationDel(void *pCtx){
372: UCollator *p = (UCollator *)pCtx;
373: ucol_close(p);
374: }
375:
376: /*
377: ** Collation sequence comparison function. The pCtx argument points to
378: ** a UCollator structure previously allocated using ucol_open().
379: */
380: static int icuCollationColl(
381: void *pCtx,
382: int nLeft,
383: const void *zLeft,
384: int nRight,
385: const void *zRight
386: ){
387: UCollationResult res;
388: UCollator *p = (UCollator *)pCtx;
389: res = ucol_strcoll(p, (UChar *)zLeft, nLeft/2, (UChar *)zRight, nRight/2);
390: switch( res ){
391: case UCOL_LESS: return -1;
392: case UCOL_GREATER: return +1;
393: case UCOL_EQUAL: return 0;
394: }
395: assert(!"Unexpected return value from ucol_strcoll()");
396: return 0;
397: }
398:
399: /*
400: ** Implementation of the scalar function icu_load_collation().
401: **
402: ** This scalar function is used to add ICU collation based collation
403: ** types to an SQLite database connection. It is intended to be called
404: ** as follows:
405: **
406: ** SELECT icu_load_collation(<locale>, <collation-name>);
407: **
408: ** Where <locale> is a string containing an ICU locale identifier (i.e.
409: ** "en_AU", "tr_TR" etc.) and <collation-name> is the name of the
410: ** collation sequence to create.
411: */
412: static void icuLoadCollation(
413: sqlite3_context *p,
414: int nArg,
415: sqlite3_value **apArg
416: ){
417: sqlite3 *db = (sqlite3 *)sqlite3_user_data(p);
418: UErrorCode status = U_ZERO_ERROR;
419: const char *zLocale; /* Locale identifier - (eg. "jp_JP") */
420: const char *zName; /* SQL Collation sequence name (eg. "japanese") */
421: UCollator *pUCollator; /* ICU library collation object */
422: int rc; /* Return code from sqlite3_create_collation_x() */
423:
424: assert(nArg==2);
425: zLocale = (const char *)sqlite3_value_text(apArg[0]);
426: zName = (const char *)sqlite3_value_text(apArg[1]);
427:
428: if( !zLocale || !zName ){
429: return;
430: }
431:
432: pUCollator = ucol_open(zLocale, &status);
433: if( !U_SUCCESS(status) ){
434: icuFunctionError(p, "ucol_open", status);
435: return;
436: }
437: assert(p);
438:
439: rc = sqlite3_create_collation_v2(db, zName, SQLITE_UTF16, (void *)pUCollator,
440: icuCollationColl, icuCollationDel
441: );
442: if( rc!=SQLITE_OK ){
443: ucol_close(pUCollator);
444: sqlite3_result_error(p, "Error registering collation function", -1);
445: }
446: }
447:
448: /*
449: ** Register the ICU extension functions with database db.
450: */
451: int sqlite3IcuInit(sqlite3 *db){
452: struct IcuScalar {
453: const char *zName; /* Function name */
454: int nArg; /* Number of arguments */
455: int enc; /* Optimal text encoding */
456: void *pContext; /* sqlite3_user_data() context */
457: void (*xFunc)(sqlite3_context*,int,sqlite3_value**);
458: } scalars[] = {
459: {"regexp", 2, SQLITE_ANY, 0, icuRegexpFunc},
460:
461: {"lower", 1, SQLITE_UTF16, 0, icuCaseFunc16},
462: {"lower", 2, SQLITE_UTF16, 0, icuCaseFunc16},
463: {"upper", 1, SQLITE_UTF16, (void*)1, icuCaseFunc16},
464: {"upper", 2, SQLITE_UTF16, (void*)1, icuCaseFunc16},
465:
466: {"lower", 1, SQLITE_UTF8, 0, icuCaseFunc16},
467: {"lower", 2, SQLITE_UTF8, 0, icuCaseFunc16},
468: {"upper", 1, SQLITE_UTF8, (void*)1, icuCaseFunc16},
469: {"upper", 2, SQLITE_UTF8, (void*)1, icuCaseFunc16},
470:
471: {"like", 2, SQLITE_UTF8, 0, icuLikeFunc},
472: {"like", 3, SQLITE_UTF8, 0, icuLikeFunc},
473:
474: {"icu_load_collation", 2, SQLITE_UTF8, (void*)db, icuLoadCollation},
475: };
476:
477: int rc = SQLITE_OK;
478: int i;
479:
480: for(i=0; rc==SQLITE_OK && i<(int)(sizeof(scalars)/sizeof(scalars[0])); i++){
481: struct IcuScalar *p = &scalars[i];
482: rc = sqlite3_create_function(
483: db, p->zName, p->nArg, p->enc, p->pContext, p->xFunc, 0, 0
484: );
485: }
486:
487: return rc;
488: }
489:
490: #if !SQLITE_CORE
491: int sqlite3_extension_init(
492: sqlite3 *db,
493: char **pzErrMsg,
494: const sqlite3_api_routines *pApi
495: ){
496: SQLITE_EXTENSION_INIT2(pApi)
497: return sqlite3IcuInit(db);
498: }
499: #endif
500:
501: #endif
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>