embedaddon/sqlite3/ext/icu/icu.c - annotate

Return to icu.c CVS log
Up to [ELWIX - Embedded LightWeight unIX -] / embedaddon / sqlite3 / ext / icu
Annotation of embedaddon/sqlite3/ext/icu/icu.c, revision 1.1.1.1

1.1       misho       1: /*
                      2: ** 2007 May 6
                      3: **
                      4: ** The author disclaims copyright to this source code.  In place of
                      5: ** a legal notice, here is a blessing:
                      6: **
                      7: **    May you do good and not evil.
                      8: **    May you find forgiveness for yourself and forgive others.
                      9: **    May you share freely, never taking more than you give.
                     10: **
                     11: *************************************************************************
                     12: ** $Id: icu.c,v 1.7 2007/12/13 21:54:11 drh Exp $
                     13: **
                     14: ** This file implements an integration between the ICU library 
                     15: ** ("International Components for Unicode", an open-source library 
                     16: ** for handling unicode data) and SQLite. The integration uses 
                     17: ** ICU to provide the following to SQLite:
                     18: **
                     19: **   * An implementation of the SQL regexp() function (and hence REGEXP
                     20: **     operator) using the ICU uregex_XX() APIs.
                     21: **
                     22: **   * Implementations of the SQL scalar upper() and lower() functions
                     23: **     for case mapping.
                     24: **
                     25: **   * Integration of ICU and SQLite collation sequences.
                     26: **
                     27: **   * An implementation of the LIKE operator that uses ICU to 
                     28: **     provide case-independent matching.
                     29: */
                     30: 
                     31: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU)
                     32: 
                     33: /* Include ICU headers */
                     34: #include <unicode/utypes.h>
                     35: #include <unicode/uregex.h>
                     36: #include <unicode/ustring.h>
                     37: #include <unicode/ucol.h>
                     38: 
                     39: #include <assert.h>
                     40: 
                     41: #ifndef SQLITE_CORE
                     42:   #include "sqlite3ext.h"
                     43:   SQLITE_EXTENSION_INIT1
                     44: #else
                     45:   #include "sqlite3.h"
                     46: #endif
                     47: 
                     48: /*
                     49: ** Maximum length (in bytes) of the pattern in a LIKE or GLOB
                     50: ** operator.
                     51: */
                     52: #ifndef SQLITE_MAX_LIKE_PATTERN_LENGTH
                     53: # define SQLITE_MAX_LIKE_PATTERN_LENGTH 50000
                     54: #endif
                     55: 
                     /*
                     ** Function form of sqlite3_free(). Because this is guaranteed to be
                     ** a real function and never a macro, its address can be taken; this
                     ** file passes it as the destructor argument of sqlite3_result_text16().
                     */
                     static void xFree(void *pMem){
                       sqlite3_free(pMem);
                     }
                     62: 
                     63: /*
                     64: ** Compare two UTF-8 strings for equality where the first string is
                     65: ** a "LIKE" expression. Return true (1) if they are the same and 
                     66: ** false (0) if they are different.
                     67: */
                     68: static int icuLikeCompare(
                     69:   const uint8_t *zPattern,   /* LIKE pattern */
                     70:   const uint8_t *zString,    /* The UTF-8 string to compare against */
                     71:   const UChar32 uEsc         /* The escape character */
                     72: ){
                     73:   static const int MATCH_ONE = (UChar32)'_';
                     74:   static const int MATCH_ALL = (UChar32)'%';
                     75: 
                     76:   int iPattern = 0;       /* Current byte index in zPattern */
                     77:   int iString = 0;        /* Current byte index in zString */
                     78: 
                     79:   int prevEscape = 0;     /* True if the previous character was uEsc */
                     80: 
                     81:   while( zPattern[iPattern]!=0 ){
                     82: 
                     83:     /* Read (and consume) the next character from the input pattern. */
                     84:     UChar32 uPattern;
                     85:     U8_NEXT_UNSAFE(zPattern, iPattern, uPattern);
                     86:     assert(uPattern!=0);
                     87: 
                     88:     /* There are now 4 possibilities:
                     89:     **
                     90:     **     1. uPattern is an unescaped match-all character "%",
                     91:     **     2. uPattern is an unescaped match-one character "_",
                     92:     **     3. uPattern is an unescaped escape character, or
                     93:     **     4. uPattern is to be handled as an ordinary character
                     94:     */
                     95:     if( !prevEscape && uPattern==MATCH_ALL ){
                     96:       /* Case 1. */
                     97:       uint8_t c;
                     98: 
                     99:       /* Skip any MATCH_ALL or MATCH_ONE characters that follow a
                    100:       ** MATCH_ALL. For each MATCH_ONE, skip one character in the 
                    101:       ** test string.
                    102:       */
                    103:       while( (c=zPattern[iPattern]) == MATCH_ALL || c == MATCH_ONE ){
                    104:         if( c==MATCH_ONE ){
                    105:           if( zString[iString]==0 ) return 0;
                    106:           U8_FWD_1_UNSAFE(zString, iString);
                    107:         }
                    108:         iPattern++;
                    109:       }
                    110: 
                    111:       if( zPattern[iPattern]==0 ) return 1;
                    112: 
                    113:       while( zString[iString] ){
                    114:         if( icuLikeCompare(&zPattern[iPattern], &zString[iString], uEsc) ){
                    115:           return 1;
                    116:         }
                    117:         U8_FWD_1_UNSAFE(zString, iString);
                    118:       }
                    119:       return 0;
                    120: 
                    121:     }else if( !prevEscape && uPattern==MATCH_ONE ){
                    122:       /* Case 2. */
                    123:       if( zString[iString]==0 ) return 0;
                    124:       U8_FWD_1_UNSAFE(zString, iString);
                    125: 
                    126:     }else if( !prevEscape && uPattern==uEsc){
                    127:       /* Case 3. */
                    128:       prevEscape = 1;
                    129: 
                    130:     }else{
                    131:       /* Case 4. */
                    132:       UChar32 uString;
                    133:       U8_NEXT_UNSAFE(zString, iString, uString);
                    134:       uString = u_foldCase(uString, U_FOLD_CASE_DEFAULT);
                    135:       uPattern = u_foldCase(uPattern, U_FOLD_CASE_DEFAULT);
                    136:       if( uString!=uPattern ){
                    137:         return 0;
                    138:       }
                    139:       prevEscape = 0;
                    140:     }
                    141:   }
                    142: 
                    143:   return zString[iString]==0;
                    144: }
                    145: 
                    146: /*
                    147: ** Implementation of the like() SQL function.  This function implements
                    148: ** the build-in LIKE operator.  The first argument to the function is the
                    149: ** pattern and the second argument is the string.  So, the SQL statements:
                    150: **
                    151: **       A LIKE B
                    152: **
                    153: ** is implemented as like(B, A). If there is an escape character E, 
                    154: **
                    155: **       A LIKE B ESCAPE E
                    156: **
                    157: ** is mapped to like(B, A, E).
                    158: */
                    159: static void icuLikeFunc(
                    160:   sqlite3_context *context, 
                    161:   int argc, 
                    162:   sqlite3_value **argv
                    163: ){
                    164:   const unsigned char *zA = sqlite3_value_text(argv[0]);
                    165:   const unsigned char *zB = sqlite3_value_text(argv[1]);
                    166:   UChar32 uEsc = 0;
                    167: 
                    168:   /* Limit the length of the LIKE or GLOB pattern to avoid problems
                    169:   ** of deep recursion and N*N behavior in patternCompare().
                    170:   */
                    171:   if( sqlite3_value_bytes(argv[0])>SQLITE_MAX_LIKE_PATTERN_LENGTH ){
                    172:     sqlite3_result_error(context, "LIKE or GLOB pattern too complex", -1);
                    173:     return;
                    174:   }
                    175: 
                    176: 
                    177:   if( argc==3 ){
                    178:     /* The escape character string must consist of a single UTF-8 character.
                    179:     ** Otherwise, return an error.
                    180:     */
                    181:     int nE= sqlite3_value_bytes(argv[2]);
                    182:     const unsigned char *zE = sqlite3_value_text(argv[2]);
                    183:     int i = 0;
                    184:     if( zE==0 ) return;
                    185:     U8_NEXT(zE, i, nE, uEsc);
                    186:     if( i!=nE){
                    187:       sqlite3_result_error(context, 
                    188:           "ESCAPE expression must be a single character", -1);
                    189:       return;
                    190:     }
                    191:   }
                    192: 
                    193:   if( zA && zB ){
                    194:     sqlite3_result_int(context, icuLikeCompare(zA, zB, uEsc));
                    195:   }
                    196: }
                    197: 
                    198: /*
                    199: ** This function is called when an ICU function called from within
                    200: ** the implementation of an SQL scalar function returns an error.
                    201: **
                    202: ** The scalar function context passed as the first argument is 
                    203: ** loaded with an error message based on the following two args.
                    204: */
                    205: static void icuFunctionError(
                    206:   sqlite3_context *pCtx,       /* SQLite scalar function context */
                    207:   const char *zName,           /* Name of ICU function that failed */
                    208:   UErrorCode e                 /* Error code returned by ICU function */
                    209: ){
                    210:   char zBuf[128];
                    211:   sqlite3_snprintf(128, zBuf, "ICU error: %s(): %s", zName, u_errorName(e));
                    212:   zBuf[127] = '\0';
                    213:   sqlite3_result_error(pCtx, zBuf, -1);
                    214: }
                    215: 
                    216: /*
                    217: ** Function to delete compiled regexp objects. Registered as
                    218: ** a destructor function with sqlite3_set_auxdata().
                    219: */
                    220: static void icuRegexpDelete(void *p){
                    221:   URegularExpression *pExpr = (URegularExpression *)p;
                    222:   uregex_close(pExpr);
                    223: }
                    224: 
                    225: /*
                    226: ** Implementation of SQLite REGEXP operator. This scalar function takes
                    227: ** two arguments. The first is a regular expression pattern to compile
                    228: ** the second is a string to match against that pattern. If either 
                    229: ** argument is an SQL NULL, then NULL Is returned. Otherwise, the result
                    230: ** is 1 if the string matches the pattern, or 0 otherwise.
                    231: **
                    232: ** SQLite maps the regexp() function to the regexp() operator such
                    233: ** that the following two are equivalent:
                    234: **
                    235: **     zString REGEXP zPattern
                    236: **     regexp(zPattern, zString)
                    237: **
                    238: ** Uses the following ICU regexp APIs:
                    239: **
                    240: **     uregex_open()
                    241: **     uregex_matches()
                    242: **     uregex_close()
                    243: */
                    244: static void icuRegexpFunc(sqlite3_context *p, int nArg, sqlite3_value **apArg){
                    245:   UErrorCode status = U_ZERO_ERROR;
                    246:   URegularExpression *pExpr;
                    247:   UBool res;
                    248:   const UChar *zString = sqlite3_value_text16(apArg[1]);
                    249: 
                    250:   (void)nArg;  /* Unused parameter */
                    251: 
                    252:   /* If the left hand side of the regexp operator is NULL, 
                    253:   ** then the result is also NULL. 
                    254:   */
                    255:   if( !zString ){
                    256:     return;
                    257:   }
                    258: 
                    259:   pExpr = sqlite3_get_auxdata(p, 0);
                    260:   if( !pExpr ){
                    261:     const UChar *zPattern = sqlite3_value_text16(apArg[0]);
                    262:     if( !zPattern ){
                    263:       return;
                    264:     }
                    265:     pExpr = uregex_open(zPattern, -1, 0, 0, &status);
                    266: 
                    267:     if( U_SUCCESS(status) ){
                    268:       sqlite3_set_auxdata(p, 0, pExpr, icuRegexpDelete);
                    269:     }else{
                    270:       assert(!pExpr);
                    271:       icuFunctionError(p, "uregex_open", status);
                    272:       return;
                    273:     }
                    274:   }
                    275: 
                    276:   /* Configure the text that the regular expression operates on. */
                    277:   uregex_setText(pExpr, zString, -1, &status);
                    278:   if( !U_SUCCESS(status) ){
                    279:     icuFunctionError(p, "uregex_setText", status);
                    280:     return;
                    281:   }
                    282: 
                    283:   /* Attempt the match */
                    284:   res = uregex_matches(pExpr, 0, &status);
                    285:   if( !U_SUCCESS(status) ){
                    286:     icuFunctionError(p, "uregex_matches", status);
                    287:     return;
                    288:   }
                    289: 
                    290:   /* Set the text that the regular expression operates on to a NULL
                    291:   ** pointer. This is not really necessary, but it is tidier than 
                    292:   ** leaving the regular expression object configured with an invalid
                    293:   ** pointer after this function returns.
                    294:   */
                    295:   uregex_setText(pExpr, 0, 0, &status);
                    296: 
                    297:   /* Return 1 or 0. */
                    298:   sqlite3_result_int(p, res ? 1 : 0);
                    299: }
                    300: 
                    301: /*
                    302: ** Implementations of scalar functions for case mapping - upper() and 
                    303: ** lower(). Function upper() converts its input to upper-case (ABC).
                    304: ** Function lower() converts to lower-case (abc).
                    305: **
                    306: ** ICU provides two types of case mapping, "general" case mapping and
                    307: ** "language specific". Refer to ICU documentation for the differences
                    308: ** between the two.
                    309: **
                    310: ** To utilise "general" case mapping, the upper() or lower() scalar 
                    311: ** functions are invoked with one argument:
                    312: **
                    313: **     upper('ABC') -> 'abc'
                    314: **     lower('abc') -> 'ABC'
                    315: **
                    316: ** To access ICU "language specific" case mapping, upper() or lower()
                    317: ** should be invoked with two arguments. The second argument is the name
                    318: ** of the locale to use. Passing an empty string ("") or SQL NULL value
                    319: ** as the second argument is the same as invoking the 1 argument version
                    320: ** of upper() or lower().
                    321: **
                    322: **     lower('I', 'en_us') -> 'i'
                    323: **     lower('I', 'tr_tr') -> 'ı' (small dotless i)
                    324: **
                    325: ** http://www.icu-project.org/userguide/posix.html#case_mappings
                    326: */
                    327: static void icuCaseFunc16(sqlite3_context *p, int nArg, sqlite3_value **apArg){
                    328:   const UChar *zInput;
                    329:   UChar *zOutput;
                    330:   int nInput;
                    331:   int nOutput;
                    332: 
                    333:   UErrorCode status = U_ZERO_ERROR;
                    334:   const char *zLocale = 0;
                    335: 
                    336:   assert(nArg==1 || nArg==2);
                    337:   if( nArg==2 ){
                    338:     zLocale = (const char *)sqlite3_value_text(apArg[1]);
                    339:   }
                    340: 
                    341:   zInput = sqlite3_value_text16(apArg[0]);
                    342:   if( !zInput ){
                    343:     return;
                    344:   }
                    345:   nInput = sqlite3_value_bytes16(apArg[0]);
                    346: 
                    347:   nOutput = nInput * 2 + 2;
                    348:   zOutput = sqlite3_malloc(nOutput);
                    349:   if( !zOutput ){
                    350:     return;
                    351:   }
                    352: 
                    353:   if( sqlite3_user_data(p) ){
                    354:     u_strToUpper(zOutput, nOutput/2, zInput, nInput/2, zLocale, &status);
                    355:   }else{
                    356:     u_strToLower(zOutput, nOutput/2, zInput, nInput/2, zLocale, &status);
                    357:   }
                    358: 
                    359:   if( !U_SUCCESS(status) ){
                    360:     icuFunctionError(p, "u_strToLower()/u_strToUpper", status);
                    361:     return;
                    362:   }
                    363: 
                    364:   sqlite3_result_text16(p, zOutput, -1, xFree);
                    365: }
                    366: 
                    367: /*
                    368: ** Collation sequence destructor function. The pCtx argument points to
                    369: ** a UCollator structure previously allocated using ucol_open().
                    370: */
                    371: static void icuCollationDel(void *pCtx){
                    372:   UCollator *p = (UCollator *)pCtx;
                    373:   ucol_close(p);
                    374: }
                    375: 
                    376: /*
                    377: ** Collation sequence comparison function. The pCtx argument points to
                    378: ** a UCollator structure previously allocated using ucol_open().
                    379: */
                    380: static int icuCollationColl(
                    381:   void *pCtx,
                    382:   int nLeft,
                    383:   const void *zLeft,
                    384:   int nRight,
                    385:   const void *zRight
                    386: ){
                    387:   UCollationResult res;
                    388:   UCollator *p = (UCollator *)pCtx;
                    389:   res = ucol_strcoll(p, (UChar *)zLeft, nLeft/2, (UChar *)zRight, nRight/2);
                    390:   switch( res ){
                    391:     case UCOL_LESS:    return -1;
                    392:     case UCOL_GREATER: return +1;
                    393:     case UCOL_EQUAL:   return 0;
                    394:   }
                    395:   assert(!"Unexpected return value from ucol_strcoll()");
                    396:   return 0;
                    397: }
                    398: 
                    399: /*
                    400: ** Implementation of the scalar function icu_load_collation().
                    401: **
                    402: ** This scalar function is used to add ICU collation based collation 
                    403: ** types to an SQLite database connection. It is intended to be called
                    404: ** as follows:
                    405: **
                    406: **     SELECT icu_load_collation(<locale>, <collation-name>);
                    407: **
                    408: ** Where <locale> is a string containing an ICU locale identifier (i.e.
                    409: ** "en_AU", "tr_TR" etc.) and <collation-name> is the name of the
                    410: ** collation sequence to create.
                    411: */
                    412: static void icuLoadCollation(
                    413:   sqlite3_context *p, 
                    414:   int nArg, 
                    415:   sqlite3_value **apArg
                    416: ){
                    417:   sqlite3 *db = (sqlite3 *)sqlite3_user_data(p);
                    418:   UErrorCode status = U_ZERO_ERROR;
                    419:   const char *zLocale;      /* Locale identifier - (eg. "jp_JP") */
                    420:   const char *zName;        /* SQL Collation sequence name (eg. "japanese") */
                    421:   UCollator *pUCollator;    /* ICU library collation object */
                    422:   int rc;                   /* Return code from sqlite3_create_collation_x() */
                    423: 
                    424:   assert(nArg==2);
                    425:   zLocale = (const char *)sqlite3_value_text(apArg[0]);
                    426:   zName = (const char *)sqlite3_value_text(apArg[1]);
                    427: 
                    428:   if( !zLocale || !zName ){
                    429:     return;
                    430:   }
                    431: 
                    432:   pUCollator = ucol_open(zLocale, &status);
                    433:   if( !U_SUCCESS(status) ){
                    434:     icuFunctionError(p, "ucol_open", status);
                    435:     return;
                    436:   }
                    437:   assert(p);
                    438: 
                    439:   rc = sqlite3_create_collation_v2(db, zName, SQLITE_UTF16, (void *)pUCollator, 
                    440:       icuCollationColl, icuCollationDel
                    441:   );
                    442:   if( rc!=SQLITE_OK ){
                    443:     ucol_close(pUCollator);
                    444:     sqlite3_result_error(p, "Error registering collation function", -1);
                    445:   }
                    446: }
                    447: 
                    448: /*
                    449: ** Register the ICU extension functions with database db.
                    450: */
                    451: int sqlite3IcuInit(sqlite3 *db){
                    452:   struct IcuScalar {
                    453:     const char *zName;                        /* Function name */
                    454:     int nArg;                                 /* Number of arguments */
                    455:     int enc;                                  /* Optimal text encoding */
                    456:     void *pContext;                           /* sqlite3_user_data() context */
                    457:     void (*xFunc)(sqlite3_context*,int,sqlite3_value**);
                    458:   } scalars[] = {
                    459:     {"regexp", 2, SQLITE_ANY,          0, icuRegexpFunc},
                    460: 
                    461:     {"lower",  1, SQLITE_UTF16,        0, icuCaseFunc16},
                    462:     {"lower",  2, SQLITE_UTF16,        0, icuCaseFunc16},
                    463:     {"upper",  1, SQLITE_UTF16, (void*)1, icuCaseFunc16},
                    464:     {"upper",  2, SQLITE_UTF16, (void*)1, icuCaseFunc16},
                    465: 
                    466:     {"lower",  1, SQLITE_UTF8,         0, icuCaseFunc16},
                    467:     {"lower",  2, SQLITE_UTF8,         0, icuCaseFunc16},
                    468:     {"upper",  1, SQLITE_UTF8,  (void*)1, icuCaseFunc16},
                    469:     {"upper",  2, SQLITE_UTF8,  (void*)1, icuCaseFunc16},
                    470: 
                    471:     {"like",   2, SQLITE_UTF8,         0, icuLikeFunc},
                    472:     {"like",   3, SQLITE_UTF8,         0, icuLikeFunc},
                    473: 
                    474:     {"icu_load_collation",  2, SQLITE_UTF8, (void*)db, icuLoadCollation},
                    475:   };
                    476: 
                    477:   int rc = SQLITE_OK;
                    478:   int i;
                    479: 
                    480:   for(i=0; rc==SQLITE_OK && i<(int)(sizeof(scalars)/sizeof(scalars[0])); i++){
                    481:     struct IcuScalar *p = &scalars[i];
                    482:     rc = sqlite3_create_function(
                    483:         db, p->zName, p->nArg, p->enc, p->pContext, p->xFunc, 0, 0
                    484:     );
                    485:   }
                    486: 
                    487:   return rc;
                    488: }
                    489: 
                    490: #if !SQLITE_CORE
                    491: int sqlite3_extension_init(
                    492:   sqlite3 *db, 
                    493:   char **pzErrMsg,
                    494:   const sqlite3_api_routines *pApi
                    495: ){
                    496:   SQLITE_EXTENSION_INIT2(pApi)
                    497:   return sqlite3IcuInit(db);
                    498: }
                    499: #endif
                    500: 
                    501: #endif
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>