File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / sqlite3 / ext / icu / icu.c
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Tue Feb 21 17:04:17 2012 UTC (13 years, 1 month ago) by misho
Branches: sqlite3, MAIN
CVS tags: v3_7_10, HEAD
sqlite3

    1: /*
    2: ** 2007 May 6
    3: **
    4: ** The author disclaims copyright to this source code.  In place of
    5: ** a legal notice, here is a blessing:
    6: **
    7: **    May you do good and not evil.
    8: **    May you find forgiveness for yourself and forgive others.
    9: **    May you share freely, never taking more than you give.
   10: **
   11: *************************************************************************
   12: ** $Id: icu.c,v 1.1.1.1 2012/02/21 17:04:17 misho Exp $
   13: **
   14: ** This file implements an integration between the ICU library 
   15: ** ("International Components for Unicode", an open-source library 
   16: ** for handling unicode data) and SQLite. The integration uses 
   17: ** ICU to provide the following to SQLite:
   18: **
   19: **   * An implementation of the SQL regexp() function (and hence REGEXP
   20: **     operator) using the ICU uregex_XX() APIs.
   21: **
   22: **   * Implementations of the SQL scalar upper() and lower() functions
   23: **     for case mapping.
   24: **
   25: **   * Integration of ICU and SQLite collation sequences.
   26: **
   27: **   * An implementation of the LIKE operator that uses ICU to 
   28: **     provide case-independent matching.
   29: */
   30: 
   31: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ICU)
   32: 
   33: /* Include ICU headers */
   34: #include <unicode/utypes.h>
   35: #include <unicode/uregex.h>
   36: #include <unicode/ustring.h>
   37: #include <unicode/ucol.h>
   38: 
   39: #include <assert.h>
   40: 
   41: #ifndef SQLITE_CORE
   42:   #include "sqlite3ext.h"
   43:   SQLITE_EXTENSION_INIT1
   44: #else
   45:   #include "sqlite3.h"
   46: #endif
   47: 
   /*
   ** Upper bound, in bytes, on the length of a pattern accepted by the
   ** LIKE implementation in this file. The default below applies only
   ** when the build has not already defined the macro.
   */
   #if !defined(SQLITE_MAX_LIKE_PATTERN_LENGTH)
   # define SQLITE_MAX_LIKE_PATTERN_LENGTH 50000
   #endif
   55: 
   /*
   ** Thin wrapper around sqlite3_free(). Because this is a real function
   ** (sqlite3_free may be a macro), its address can be handed to SQLite
   ** as a destructor callback.
   */
   static void xFree(void *p){ sqlite3_free(p); }
   62: 
   63: /*
   64: ** Compare two UTF-8 strings for equality where the first string is
   65: ** a "LIKE" expression. Return true (1) if they are the same and 
   66: ** false (0) if they are different.
   67: */
   68: static int icuLikeCompare(
   69:   const uint8_t *zPattern,   /* LIKE pattern */
   70:   const uint8_t *zString,    /* The UTF-8 string to compare against */
   71:   const UChar32 uEsc         /* The escape character */
   72: ){
   73:   static const int MATCH_ONE = (UChar32)'_';
   74:   static const int MATCH_ALL = (UChar32)'%';
   75: 
   76:   int iPattern = 0;       /* Current byte index in zPattern */
   77:   int iString = 0;        /* Current byte index in zString */
   78: 
   79:   int prevEscape = 0;     /* True if the previous character was uEsc */
   80: 
   81:   while( zPattern[iPattern]!=0 ){
   82: 
   83:     /* Read (and consume) the next character from the input pattern. */
   84:     UChar32 uPattern;
   85:     U8_NEXT_UNSAFE(zPattern, iPattern, uPattern);
   86:     assert(uPattern!=0);
   87: 
   88:     /* There are now 4 possibilities:
   89:     **
   90:     **     1. uPattern is an unescaped match-all character "%",
   91:     **     2. uPattern is an unescaped match-one character "_",
   92:     **     3. uPattern is an unescaped escape character, or
   93:     **     4. uPattern is to be handled as an ordinary character
   94:     */
   95:     if( !prevEscape && uPattern==MATCH_ALL ){
   96:       /* Case 1. */
   97:       uint8_t c;
   98: 
   99:       /* Skip any MATCH_ALL or MATCH_ONE characters that follow a
  100:       ** MATCH_ALL. For each MATCH_ONE, skip one character in the 
  101:       ** test string.
  102:       */
  103:       while( (c=zPattern[iPattern]) == MATCH_ALL || c == MATCH_ONE ){
  104:         if( c==MATCH_ONE ){
  105:           if( zString[iString]==0 ) return 0;
  106:           U8_FWD_1_UNSAFE(zString, iString);
  107:         }
  108:         iPattern++;
  109:       }
  110: 
  111:       if( zPattern[iPattern]==0 ) return 1;
  112: 
  113:       while( zString[iString] ){
  114:         if( icuLikeCompare(&zPattern[iPattern], &zString[iString], uEsc) ){
  115:           return 1;
  116:         }
  117:         U8_FWD_1_UNSAFE(zString, iString);
  118:       }
  119:       return 0;
  120: 
  121:     }else if( !prevEscape && uPattern==MATCH_ONE ){
  122:       /* Case 2. */
  123:       if( zString[iString]==0 ) return 0;
  124:       U8_FWD_1_UNSAFE(zString, iString);
  125: 
  126:     }else if( !prevEscape && uPattern==uEsc){
  127:       /* Case 3. */
  128:       prevEscape = 1;
  129: 
  130:     }else{
  131:       /* Case 4. */
  132:       UChar32 uString;
  133:       U8_NEXT_UNSAFE(zString, iString, uString);
  134:       uString = u_foldCase(uString, U_FOLD_CASE_DEFAULT);
  135:       uPattern = u_foldCase(uPattern, U_FOLD_CASE_DEFAULT);
  136:       if( uString!=uPattern ){
  137:         return 0;
  138:       }
  139:       prevEscape = 0;
  140:     }
  141:   }
  142: 
  143:   return zString[iString]==0;
  144: }
  145: 
  146: /*
  147: ** Implementation of the like() SQL function.  This function implements
  148: ** the build-in LIKE operator.  The first argument to the function is the
  149: ** pattern and the second argument is the string.  So, the SQL statements:
  150: **
  151: **       A LIKE B
  152: **
  153: ** is implemented as like(B, A). If there is an escape character E, 
  154: **
  155: **       A LIKE B ESCAPE E
  156: **
  157: ** is mapped to like(B, A, E).
  158: */
  159: static void icuLikeFunc(
  160:   sqlite3_context *context, 
  161:   int argc, 
  162:   sqlite3_value **argv
  163: ){
  164:   const unsigned char *zA = sqlite3_value_text(argv[0]);
  165:   const unsigned char *zB = sqlite3_value_text(argv[1]);
  166:   UChar32 uEsc = 0;
  167: 
  168:   /* Limit the length of the LIKE or GLOB pattern to avoid problems
  169:   ** of deep recursion and N*N behavior in patternCompare().
  170:   */
  171:   if( sqlite3_value_bytes(argv[0])>SQLITE_MAX_LIKE_PATTERN_LENGTH ){
  172:     sqlite3_result_error(context, "LIKE or GLOB pattern too complex", -1);
  173:     return;
  174:   }
  175: 
  176: 
  177:   if( argc==3 ){
  178:     /* The escape character string must consist of a single UTF-8 character.
  179:     ** Otherwise, return an error.
  180:     */
  181:     int nE= sqlite3_value_bytes(argv[2]);
  182:     const unsigned char *zE = sqlite3_value_text(argv[2]);
  183:     int i = 0;
  184:     if( zE==0 ) return;
  185:     U8_NEXT(zE, i, nE, uEsc);
  186:     if( i!=nE){
  187:       sqlite3_result_error(context, 
  188:           "ESCAPE expression must be a single character", -1);
  189:       return;
  190:     }
  191:   }
  192: 
  193:   if( zA && zB ){
  194:     sqlite3_result_int(context, icuLikeCompare(zA, zB, uEsc));
  195:   }
  196: }
  197: 
  198: /*
  199: ** This function is called when an ICU function called from within
  200: ** the implementation of an SQL scalar function returns an error.
  201: **
  202: ** The scalar function context passed as the first argument is 
  203: ** loaded with an error message based on the following two args.
  204: */
  205: static void icuFunctionError(
  206:   sqlite3_context *pCtx,       /* SQLite scalar function context */
  207:   const char *zName,           /* Name of ICU function that failed */
  208:   UErrorCode e                 /* Error code returned by ICU function */
  209: ){
  210:   char zBuf[128];
  211:   sqlite3_snprintf(128, zBuf, "ICU error: %s(): %s", zName, u_errorName(e));
  212:   zBuf[127] = '\0';
  213:   sqlite3_result_error(pCtx, zBuf, -1);
  214: }
  215: 
  216: /*
  217: ** Function to delete compiled regexp objects. Registered as
  218: ** a destructor function with sqlite3_set_auxdata().
  219: */
  220: static void icuRegexpDelete(void *p){
  221:   URegularExpression *pExpr = (URegularExpression *)p;
  222:   uregex_close(pExpr);
  223: }
  224: 
  225: /*
  226: ** Implementation of SQLite REGEXP operator. This scalar function takes
  227: ** two arguments. The first is a regular expression pattern to compile
  228: ** the second is a string to match against that pattern. If either 
  229: ** argument is an SQL NULL, then NULL Is returned. Otherwise, the result
  230: ** is 1 if the string matches the pattern, or 0 otherwise.
  231: **
  232: ** SQLite maps the regexp() function to the regexp() operator such
  233: ** that the following two are equivalent:
  234: **
  235: **     zString REGEXP zPattern
  236: **     regexp(zPattern, zString)
  237: **
  238: ** Uses the following ICU regexp APIs:
  239: **
  240: **     uregex_open()
  241: **     uregex_matches()
  242: **     uregex_close()
  243: */
  244: static void icuRegexpFunc(sqlite3_context *p, int nArg, sqlite3_value **apArg){
  245:   UErrorCode status = U_ZERO_ERROR;
  246:   URegularExpression *pExpr;
  247:   UBool res;
  248:   const UChar *zString = sqlite3_value_text16(apArg[1]);
  249: 
  250:   (void)nArg;  /* Unused parameter */
  251: 
  252:   /* If the left hand side of the regexp operator is NULL, 
  253:   ** then the result is also NULL. 
  254:   */
  255:   if( !zString ){
  256:     return;
  257:   }
  258: 
  259:   pExpr = sqlite3_get_auxdata(p, 0);
  260:   if( !pExpr ){
  261:     const UChar *zPattern = sqlite3_value_text16(apArg[0]);
  262:     if( !zPattern ){
  263:       return;
  264:     }
  265:     pExpr = uregex_open(zPattern, -1, 0, 0, &status);
  266: 
  267:     if( U_SUCCESS(status) ){
  268:       sqlite3_set_auxdata(p, 0, pExpr, icuRegexpDelete);
  269:     }else{
  270:       assert(!pExpr);
  271:       icuFunctionError(p, "uregex_open", status);
  272:       return;
  273:     }
  274:   }
  275: 
  276:   /* Configure the text that the regular expression operates on. */
  277:   uregex_setText(pExpr, zString, -1, &status);
  278:   if( !U_SUCCESS(status) ){
  279:     icuFunctionError(p, "uregex_setText", status);
  280:     return;
  281:   }
  282: 
  283:   /* Attempt the match */
  284:   res = uregex_matches(pExpr, 0, &status);
  285:   if( !U_SUCCESS(status) ){
  286:     icuFunctionError(p, "uregex_matches", status);
  287:     return;
  288:   }
  289: 
  290:   /* Set the text that the regular expression operates on to a NULL
  291:   ** pointer. This is not really necessary, but it is tidier than 
  292:   ** leaving the regular expression object configured with an invalid
  293:   ** pointer after this function returns.
  294:   */
  295:   uregex_setText(pExpr, 0, 0, &status);
  296: 
  297:   /* Return 1 or 0. */
  298:   sqlite3_result_int(p, res ? 1 : 0);
  299: }
  300: 
  301: /*
  302: ** Implementations of scalar functions for case mapping - upper() and 
  303: ** lower(). Function upper() converts its input to upper-case (ABC).
  304: ** Function lower() converts to lower-case (abc).
  305: **
  306: ** ICU provides two types of case mapping, "general" case mapping and
  307: ** "language specific". Refer to ICU documentation for the differences
  308: ** between the two.
  309: **
  310: ** To utilise "general" case mapping, the upper() or lower() scalar 
  311: ** functions are invoked with one argument:
  312: **
  313: **     upper('ABC') -> 'abc'
  314: **     lower('abc') -> 'ABC'
  315: **
  316: ** To access ICU "language specific" case mapping, upper() or lower()
  317: ** should be invoked with two arguments. The second argument is the name
  318: ** of the locale to use. Passing an empty string ("") or SQL NULL value
  319: ** as the second argument is the same as invoking the 1 argument version
  320: ** of upper() or lower().
  321: **
  322: **     lower('I', 'en_us') -> 'i'
  323: **     lower('I', 'tr_tr') -> 'ı' (small dotless i)
  324: **
  325: ** http://www.icu-project.org/userguide/posix.html#case_mappings
  326: */
  327: static void icuCaseFunc16(sqlite3_context *p, int nArg, sqlite3_value **apArg){
  328:   const UChar *zInput;
  329:   UChar *zOutput;
  330:   int nInput;
  331:   int nOutput;
  332: 
  333:   UErrorCode status = U_ZERO_ERROR;
  334:   const char *zLocale = 0;
  335: 
  336:   assert(nArg==1 || nArg==2);
  337:   if( nArg==2 ){
  338:     zLocale = (const char *)sqlite3_value_text(apArg[1]);
  339:   }
  340: 
  341:   zInput = sqlite3_value_text16(apArg[0]);
  342:   if( !zInput ){
  343:     return;
  344:   }
  345:   nInput = sqlite3_value_bytes16(apArg[0]);
  346: 
  347:   nOutput = nInput * 2 + 2;
  348:   zOutput = sqlite3_malloc(nOutput);
  349:   if( !zOutput ){
  350:     return;
  351:   }
  352: 
  353:   if( sqlite3_user_data(p) ){
  354:     u_strToUpper(zOutput, nOutput/2, zInput, nInput/2, zLocale, &status);
  355:   }else{
  356:     u_strToLower(zOutput, nOutput/2, zInput, nInput/2, zLocale, &status);
  357:   }
  358: 
  359:   if( !U_SUCCESS(status) ){
  360:     icuFunctionError(p, "u_strToLower()/u_strToUpper", status);
  361:     return;
  362:   }
  363: 
  364:   sqlite3_result_text16(p, zOutput, -1, xFree);
  365: }
  366: 
  367: /*
  368: ** Collation sequence destructor function. The pCtx argument points to
  369: ** a UCollator structure previously allocated using ucol_open().
  370: */
  371: static void icuCollationDel(void *pCtx){
  372:   UCollator *p = (UCollator *)pCtx;
  373:   ucol_close(p);
  374: }
  375: 
  376: /*
  377: ** Collation sequence comparison function. The pCtx argument points to
  378: ** a UCollator structure previously allocated using ucol_open().
  379: */
  380: static int icuCollationColl(
  381:   void *pCtx,
  382:   int nLeft,
  383:   const void *zLeft,
  384:   int nRight,
  385:   const void *zRight
  386: ){
  387:   UCollationResult res;
  388:   UCollator *p = (UCollator *)pCtx;
  389:   res = ucol_strcoll(p, (UChar *)zLeft, nLeft/2, (UChar *)zRight, nRight/2);
  390:   switch( res ){
  391:     case UCOL_LESS:    return -1;
  392:     case UCOL_GREATER: return +1;
  393:     case UCOL_EQUAL:   return 0;
  394:   }
  395:   assert(!"Unexpected return value from ucol_strcoll()");
  396:   return 0;
  397: }
  398: 
  399: /*
  400: ** Implementation of the scalar function icu_load_collation().
  401: **
  402: ** This scalar function is used to add ICU collation based collation 
  403: ** types to an SQLite database connection. It is intended to be called
  404: ** as follows:
  405: **
  406: **     SELECT icu_load_collation(<locale>, <collation-name>);
  407: **
  408: ** Where <locale> is a string containing an ICU locale identifier (i.e.
  409: ** "en_AU", "tr_TR" etc.) and <collation-name> is the name of the
  410: ** collation sequence to create.
  411: */
  412: static void icuLoadCollation(
  413:   sqlite3_context *p, 
  414:   int nArg, 
  415:   sqlite3_value **apArg
  416: ){
  417:   sqlite3 *db = (sqlite3 *)sqlite3_user_data(p);
  418:   UErrorCode status = U_ZERO_ERROR;
  419:   const char *zLocale;      /* Locale identifier - (eg. "jp_JP") */
  420:   const char *zName;        /* SQL Collation sequence name (eg. "japanese") */
  421:   UCollator *pUCollator;    /* ICU library collation object */
  422:   int rc;                   /* Return code from sqlite3_create_collation_x() */
  423: 
  424:   assert(nArg==2);
  425:   zLocale = (const char *)sqlite3_value_text(apArg[0]);
  426:   zName = (const char *)sqlite3_value_text(apArg[1]);
  427: 
  428:   if( !zLocale || !zName ){
  429:     return;
  430:   }
  431: 
  432:   pUCollator = ucol_open(zLocale, &status);
  433:   if( !U_SUCCESS(status) ){
  434:     icuFunctionError(p, "ucol_open", status);
  435:     return;
  436:   }
  437:   assert(p);
  438: 
  439:   rc = sqlite3_create_collation_v2(db, zName, SQLITE_UTF16, (void *)pUCollator, 
  440:       icuCollationColl, icuCollationDel
  441:   );
  442:   if( rc!=SQLITE_OK ){
  443:     ucol_close(pUCollator);
  444:     sqlite3_result_error(p, "Error registering collation function", -1);
  445:   }
  446: }
  447: 
  448: /*
  449: ** Register the ICU extension functions with database db.
  450: */
  451: int sqlite3IcuInit(sqlite3 *db){
  452:   struct IcuScalar {
  453:     const char *zName;                        /* Function name */
  454:     int nArg;                                 /* Number of arguments */
  455:     int enc;                                  /* Optimal text encoding */
  456:     void *pContext;                           /* sqlite3_user_data() context */
  457:     void (*xFunc)(sqlite3_context*,int,sqlite3_value**);
  458:   } scalars[] = {
  459:     {"regexp", 2, SQLITE_ANY,          0, icuRegexpFunc},
  460: 
  461:     {"lower",  1, SQLITE_UTF16,        0, icuCaseFunc16},
  462:     {"lower",  2, SQLITE_UTF16,        0, icuCaseFunc16},
  463:     {"upper",  1, SQLITE_UTF16, (void*)1, icuCaseFunc16},
  464:     {"upper",  2, SQLITE_UTF16, (void*)1, icuCaseFunc16},
  465: 
  466:     {"lower",  1, SQLITE_UTF8,         0, icuCaseFunc16},
  467:     {"lower",  2, SQLITE_UTF8,         0, icuCaseFunc16},
  468:     {"upper",  1, SQLITE_UTF8,  (void*)1, icuCaseFunc16},
  469:     {"upper",  2, SQLITE_UTF8,  (void*)1, icuCaseFunc16},
  470: 
  471:     {"like",   2, SQLITE_UTF8,         0, icuLikeFunc},
  472:     {"like",   3, SQLITE_UTF8,         0, icuLikeFunc},
  473: 
  474:     {"icu_load_collation",  2, SQLITE_UTF8, (void*)db, icuLoadCollation},
  475:   };
  476: 
  477:   int rc = SQLITE_OK;
  478:   int i;
  479: 
  480:   for(i=0; rc==SQLITE_OK && i<(int)(sizeof(scalars)/sizeof(scalars[0])); i++){
  481:     struct IcuScalar *p = &scalars[i];
  482:     rc = sqlite3_create_function(
  483:         db, p->zName, p->nArg, p->enc, p->pContext, p->xFunc, 0, 0
  484:     );
  485:   }
  486: 
  487:   return rc;
  488: }
  489: 
  490: #if !SQLITE_CORE
  491: int sqlite3_extension_init(
  492:   sqlite3 *db, 
  493:   char **pzErrMsg,
  494:   const sqlite3_api_routines *pApi
  495: ){
  496:   SQLITE_EXTENSION_INIT2(pApi)
  497:   return sqlite3IcuInit(db);
  498: }
  499: #endif
  500: 
  501: #endif

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>