File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / sqlite3 / ext / fts3 / fts3_expr.c
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Tue Feb 21 17:04:17 2012 UTC (12 years, 4 months ago) by misho
Branches: sqlite3, MAIN
CVS tags: v3_7_10, HEAD
sqlite3

    1: /*
    2: ** 2008 Nov 28
    3: **
    4: ** The author disclaims copyright to this source code.  In place of
    5: ** a legal notice, here is a blessing:
    6: **
    7: **    May you do good and not evil.
    8: **    May you find forgiveness for yourself and forgive others.
    9: **    May you share freely, never taking more than you give.
   10: **
   11: ******************************************************************************
   12: **
   13: ** This module contains code that implements a parser for fts3 query strings
   14: ** (the right-hand argument to the MATCH operator). Because the supported 
   15: ** syntax is relatively simple, the whole tokenizer/parser system is
   16: ** hand-coded. 
   17: */
   18: #include "fts3Int.h"
   19: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
   20: 
   21: /*
   22: ** By default, this module parses the legacy syntax that has been 
   23: ** traditionally used by fts3. Or, if SQLITE_ENABLE_FTS3_PARENTHESIS
   24: ** is defined, then it uses the new syntax. The differences between
   25: ** the new and the old syntaxes are:
   26: **
   27: **  a) The new syntax supports parenthesis. The old does not.
   28: **
   29: **  b) The new syntax supports the AND and NOT operators. The old does not.
   30: **
   31: **  c) The old syntax supports the "-" token qualifier. This is not 
   32: **     supported by the new syntax (it is replaced by the NOT operator).
   33: **
   34: **  d) When using the old syntax, the OR operator has a greater precedence
   35: **     than an implicit AND. When using the new, both implicity and explicit
   36: **     AND operators have a higher precedence than OR.
   37: **
   38: ** If compiled with SQLITE_TEST defined, then this module exports the
   39: ** symbol "int sqlite3_fts3_enable_parentheses". Setting this variable
   40: ** to zero causes the module to use the old syntax. If it is set to 
   41: ** non-zero the new syntax is activated. This is so both syntaxes can
   42: ** be tested using a single build of testfixture.
   43: **
   44: ** The following describes the syntax supported by the fts3 MATCH
   45: ** operator in a similar format to that used by the lemon parser
   46: ** generator. This module does not use actually lemon, it uses a
   47: ** custom parser.
   48: **
   49: **   query ::= andexpr (OR andexpr)*.
   50: **
   51: **   andexpr ::= notexpr (AND? notexpr)*.
   52: **
   53: **   notexpr ::= nearexpr (NOT nearexpr|-TOKEN)*.
   54: **   notexpr ::= LP query RP.
   55: **
   56: **   nearexpr ::= phrase (NEAR distance_opt nearexpr)*.
   57: **
   58: **   distance_opt ::= .
   59: **   distance_opt ::= / INTEGER.
   60: **
   61: **   phrase ::= TOKEN.
   62: **   phrase ::= COLUMN:TOKEN.
   63: **   phrase ::= "TOKEN TOKEN TOKEN...".
   64: */
   65: 
   66: #ifdef SQLITE_TEST
   67: int sqlite3_fts3_enable_parentheses = 0;
   68: #else
   69: # ifdef SQLITE_ENABLE_FTS3_PARENTHESIS 
   70: #  define sqlite3_fts3_enable_parentheses 1
   71: # else
   72: #  define sqlite3_fts3_enable_parentheses 0
   73: # endif
   74: #endif
   75: 
   76: /*
   77: ** Default span for NEAR operators.
   78: */
   79: #define SQLITE_FTS3_DEFAULT_NEAR_PARAM 10
   80: 
   81: #include <string.h>
   82: #include <assert.h>
   83: 
   84: /*
   85: ** isNot:
   86: **   This variable is used by function getNextNode(). When getNextNode() is
   87: **   called, it sets ParseContext.isNot to true if the 'next node' is a 
   88: **   FTSQUERY_PHRASE with a unary "-" attached to it. i.e. "mysql" in the
   89: **   FTS3 query "sqlite -mysql". Otherwise, ParseContext.isNot is set to
   90: **   zero.
   91: */
   92: typedef struct ParseContext ParseContext;
   93: struct ParseContext {
   94:   sqlite3_tokenizer *pTokenizer;      /* Tokenizer module */
   95:   const char **azCol;                 /* Array of column names for fts3 table */
   96:   int bFts4;                          /* True to allow FTS4-only syntax */
   97:   int nCol;                           /* Number of entries in azCol[] */
   98:   int iDefaultCol;                    /* Default column to query */
   99:   int isNot;                          /* True if getNextNode() sees a unary - */
  100:   sqlite3_context *pCtx;              /* Write error message here */
  101:   int nNest;                          /* Number of nested brackets */
  102: };
  103: 
  104: /*
  105: ** This function is equivalent to the standard isspace() function. 
  106: **
  107: ** The standard isspace() can be awkward to use safely, because although it
  108: ** is defined to accept an argument of type int, its behaviour when passed
  109: ** an integer that falls outside of the range of the unsigned char type
  110: ** is undefined (and sometimes, "undefined" means segfault). This wrapper
  111: ** is defined to accept an argument of type char, and always returns 0 for
  112: ** any values that fall outside of the range of the unsigned char type (i.e.
  113: ** negative values).
  114: */
  115: static int fts3isspace(char c){
  116:   return c==' ' || c=='\t' || c=='\n' || c=='\r' || c=='\v' || c=='\f';
  117: }
  118: 
  119: /*
  120: ** Allocate nByte bytes of memory using sqlite3_malloc(). If successful,
  121: ** zero the memory before returning a pointer to it. If unsuccessful, 
  122: ** return NULL.
  123: */
  124: static void *fts3MallocZero(int nByte){
  125:   void *pRet = sqlite3_malloc(nByte);
  126:   if( pRet ) memset(pRet, 0, nByte);
  127:   return pRet;
  128: }
  129: 
  130: 
  131: /*
  132: ** Extract the next token from buffer z (length n) using the tokenizer
  133: ** and other information (column names etc.) in pParse. Create an Fts3Expr
  134: ** structure of type FTSQUERY_PHRASE containing a phrase consisting of this
  135: ** single token and set *ppExpr to point to it. If the end of the buffer is
  136: ** reached before a token is found, set *ppExpr to zero. It is the
  137: ** responsibility of the caller to eventually deallocate the allocated 
  138: ** Fts3Expr structure (if any) by passing it to sqlite3_free().
  139: **
  140: ** Return SQLITE_OK if successful, or SQLITE_NOMEM if a memory allocation
  141: ** fails.
  142: */
  143: static int getNextToken(
  144:   ParseContext *pParse,                   /* fts3 query parse context */
  145:   int iCol,                               /* Value for Fts3Phrase.iColumn */
  146:   const char *z, int n,                   /* Input string */
  147:   Fts3Expr **ppExpr,                      /* OUT: expression */
  148:   int *pnConsumed                         /* OUT: Number of bytes consumed */
  149: ){
  150:   sqlite3_tokenizer *pTokenizer = pParse->pTokenizer;
  151:   sqlite3_tokenizer_module const *pModule = pTokenizer->pModule;
  152:   int rc;
  153:   sqlite3_tokenizer_cursor *pCursor;
  154:   Fts3Expr *pRet = 0;
  155:   int nConsumed = 0;
  156: 
  157:   rc = pModule->xOpen(pTokenizer, z, n, &pCursor);
  158:   if( rc==SQLITE_OK ){
  159:     const char *zToken;
  160:     int nToken, iStart, iEnd, iPosition;
  161:     int nByte;                               /* total space to allocate */
  162: 
  163:     pCursor->pTokenizer = pTokenizer;
  164:     rc = pModule->xNext(pCursor, &zToken, &nToken, &iStart, &iEnd, &iPosition);
  165: 
  166:     if( rc==SQLITE_OK ){
  167:       nByte = sizeof(Fts3Expr) + sizeof(Fts3Phrase) + nToken;
  168:       pRet = (Fts3Expr *)fts3MallocZero(nByte);
  169:       if( !pRet ){
  170:         rc = SQLITE_NOMEM;
  171:       }else{
  172:         pRet->eType = FTSQUERY_PHRASE;
  173:         pRet->pPhrase = (Fts3Phrase *)&pRet[1];
  174:         pRet->pPhrase->nToken = 1;
  175:         pRet->pPhrase->iColumn = iCol;
  176:         pRet->pPhrase->aToken[0].n = nToken;
  177:         pRet->pPhrase->aToken[0].z = (char *)&pRet->pPhrase[1];
  178:         memcpy(pRet->pPhrase->aToken[0].z, zToken, nToken);
  179: 
  180:         if( iEnd<n && z[iEnd]=='*' ){
  181:           pRet->pPhrase->aToken[0].isPrefix = 1;
  182:           iEnd++;
  183:         }
  184: 
  185:         while( 1 ){
  186:           if( !sqlite3_fts3_enable_parentheses 
  187:            && iStart>0 && z[iStart-1]=='-' 
  188:           ){
  189:             pParse->isNot = 1;
  190:             iStart--;
  191:           }else if( pParse->bFts4 && iStart>0 && z[iStart-1]=='^' ){
  192:             pRet->pPhrase->aToken[0].bFirst = 1;
  193:             iStart--;
  194:           }else{
  195:             break;
  196:           }
  197:         }
  198: 
  199:       }
  200:       nConsumed = iEnd;
  201:     }
  202: 
  203:     pModule->xClose(pCursor);
  204:   }
  205:   
  206:   *pnConsumed = nConsumed;
  207:   *ppExpr = pRet;
  208:   return rc;
  209: }
  210: 
  211: 
  212: /*
  213: ** Enlarge a memory allocation.  If an out-of-memory allocation occurs,
  214: ** then free the old allocation.
  215: */
  216: static void *fts3ReallocOrFree(void *pOrig, int nNew){
  217:   void *pRet = sqlite3_realloc(pOrig, nNew);
  218:   if( !pRet ){
  219:     sqlite3_free(pOrig);
  220:   }
  221:   return pRet;
  222: }
  223: 
  224: /*
  225: ** Buffer zInput, length nInput, contains the contents of a quoted string
  226: ** that appeared as part of an fts3 query expression. Neither quote character
  227: ** is included in the buffer. This function attempts to tokenize the entire
  228: ** input buffer and create an Fts3Expr structure of type FTSQUERY_PHRASE 
  229: ** containing the results.
  230: **
  231: ** If successful, SQLITE_OK is returned and *ppExpr set to point at the
  232: ** allocated Fts3Expr structure. Otherwise, either SQLITE_NOMEM (out of memory
  233: ** error) or SQLITE_ERROR (tokenization error) is returned and *ppExpr set
  234: ** to 0.
  235: */
  236: static int getNextString(
  237:   ParseContext *pParse,                   /* fts3 query parse context */
  238:   const char *zInput, int nInput,         /* Input string */
  239:   Fts3Expr **ppExpr                       /* OUT: expression */
  240: ){
  241:   sqlite3_tokenizer *pTokenizer = pParse->pTokenizer;
  242:   sqlite3_tokenizer_module const *pModule = pTokenizer->pModule;
  243:   int rc;
  244:   Fts3Expr *p = 0;
  245:   sqlite3_tokenizer_cursor *pCursor = 0;
  246:   char *zTemp = 0;
  247:   int nTemp = 0;
  248: 
  249:   const int nSpace = sizeof(Fts3Expr) + sizeof(Fts3Phrase);
  250:   int nToken = 0;
  251: 
  252:   /* The final Fts3Expr data structure, including the Fts3Phrase,
  253:   ** Fts3PhraseToken structures token buffers are all stored as a single 
  254:   ** allocation so that the expression can be freed with a single call to
  255:   ** sqlite3_free(). Setting this up requires a two pass approach.
  256:   **
  257:   ** The first pass, in the block below, uses a tokenizer cursor to iterate
  258:   ** through the tokens in the expression. This pass uses fts3ReallocOrFree()
  259:   ** to assemble data in two dynamic buffers:
  260:   **
  261:   **   Buffer p: Points to the Fts3Expr structure, followed by the Fts3Phrase
  262:   **             structure, followed by the array of Fts3PhraseToken 
  263:   **             structures. This pass only populates the Fts3PhraseToken array.
  264:   **
  265:   **   Buffer zTemp: Contains copies of all tokens.
  266:   **
  267:   ** The second pass, in the block that begins "if( rc==SQLITE_DONE )" below,
  268:   ** appends buffer zTemp to buffer p, and fills in the Fts3Expr and Fts3Phrase
  269:   ** structures.
  270:   */
  271:   rc = pModule->xOpen(pTokenizer, zInput, nInput, &pCursor);
  272:   if( rc==SQLITE_OK ){
  273:     int ii;
  274:     pCursor->pTokenizer = pTokenizer;
  275:     for(ii=0; rc==SQLITE_OK; ii++){
  276:       const char *zByte;
  277:       int nByte, iBegin, iEnd, iPos;
  278:       rc = pModule->xNext(pCursor, &zByte, &nByte, &iBegin, &iEnd, &iPos);
  279:       if( rc==SQLITE_OK ){
  280:         Fts3PhraseToken *pToken;
  281: 
  282:         p = fts3ReallocOrFree(p, nSpace + ii*sizeof(Fts3PhraseToken));
  283:         if( !p ) goto no_mem;
  284: 
  285:         zTemp = fts3ReallocOrFree(zTemp, nTemp + nByte);
  286:         if( !zTemp ) goto no_mem;
  287: 
  288:         assert( nToken==ii );
  289:         pToken = &((Fts3Phrase *)(&p[1]))->aToken[ii];
  290:         memset(pToken, 0, sizeof(Fts3PhraseToken));
  291: 
  292:         memcpy(&zTemp[nTemp], zByte, nByte);
  293:         nTemp += nByte;
  294: 
  295:         pToken->n = nByte;
  296:         pToken->isPrefix = (iEnd<nInput && zInput[iEnd]=='*');
  297:         pToken->bFirst = (iBegin>0 && zInput[iBegin-1]=='^');
  298:         nToken = ii+1;
  299:       }
  300:     }
  301: 
  302:     pModule->xClose(pCursor);
  303:     pCursor = 0;
  304:   }
  305: 
  306:   if( rc==SQLITE_DONE ){
  307:     int jj;
  308:     char *zBuf = 0;
  309: 
  310:     p = fts3ReallocOrFree(p, nSpace + nToken*sizeof(Fts3PhraseToken) + nTemp);
  311:     if( !p ) goto no_mem;
  312:     memset(p, 0, (char *)&(((Fts3Phrase *)&p[1])->aToken[0])-(char *)p);
  313:     p->eType = FTSQUERY_PHRASE;
  314:     p->pPhrase = (Fts3Phrase *)&p[1];
  315:     p->pPhrase->iColumn = pParse->iDefaultCol;
  316:     p->pPhrase->nToken = nToken;
  317: 
  318:     zBuf = (char *)&p->pPhrase->aToken[nToken];
  319:     if( zTemp ){
  320:       memcpy(zBuf, zTemp, nTemp);
  321:       sqlite3_free(zTemp);
  322:     }else{
  323:       assert( nTemp==0 );
  324:     }
  325: 
  326:     for(jj=0; jj<p->pPhrase->nToken; jj++){
  327:       p->pPhrase->aToken[jj].z = zBuf;
  328:       zBuf += p->pPhrase->aToken[jj].n;
  329:     }
  330:     rc = SQLITE_OK;
  331:   }
  332: 
  333:   *ppExpr = p;
  334:   return rc;
  335: no_mem:
  336: 
  337:   if( pCursor ){
  338:     pModule->xClose(pCursor);
  339:   }
  340:   sqlite3_free(zTemp);
  341:   sqlite3_free(p);
  342:   *ppExpr = 0;
  343:   return SQLITE_NOMEM;
  344: }
  345: 
  346: /*
  347: ** Function getNextNode(), which is called by fts3ExprParse(), may itself
  348: ** call fts3ExprParse(). So this forward declaration is required.
  349: */
  350: static int fts3ExprParse(ParseContext *, const char *, int, Fts3Expr **, int *);
  351: 
  352: /*
  353: ** The output variable *ppExpr is populated with an allocated Fts3Expr 
  354: ** structure, or set to 0 if the end of the input buffer is reached.
  355: **
  356: ** Returns an SQLite error code. SQLITE_OK if everything works, SQLITE_NOMEM
  357: ** if a malloc failure occurs, or SQLITE_ERROR if a parse error is encountered.
  358: ** If SQLITE_ERROR is returned, pContext is populated with an error message.
  359: */
  360: static int getNextNode(
  361:   ParseContext *pParse,                   /* fts3 query parse context */
  362:   const char *z, int n,                   /* Input string */
  363:   Fts3Expr **ppExpr,                      /* OUT: expression */
  364:   int *pnConsumed                         /* OUT: Number of bytes consumed */
  365: ){
  366:   static const struct Fts3Keyword {
  367:     char *z;                              /* Keyword text */
  368:     unsigned char n;                      /* Length of the keyword */
  369:     unsigned char parenOnly;              /* Only valid in paren mode */
  370:     unsigned char eType;                  /* Keyword code */
  371:   } aKeyword[] = {
  372:     { "OR" ,  2, 0, FTSQUERY_OR   },
  373:     { "AND",  3, 1, FTSQUERY_AND  },
  374:     { "NOT",  3, 1, FTSQUERY_NOT  },
  375:     { "NEAR", 4, 0, FTSQUERY_NEAR }
  376:   };
  377:   int ii;
  378:   int iCol;
  379:   int iColLen;
  380:   int rc;
  381:   Fts3Expr *pRet = 0;
  382: 
  383:   const char *zInput = z;
  384:   int nInput = n;
  385: 
  386:   pParse->isNot = 0;
  387: 
  388:   /* Skip over any whitespace before checking for a keyword, an open or
  389:   ** close bracket, or a quoted string. 
  390:   */
  391:   while( nInput>0 && fts3isspace(*zInput) ){
  392:     nInput--;
  393:     zInput++;
  394:   }
  395:   if( nInput==0 ){
  396:     return SQLITE_DONE;
  397:   }
  398: 
  399:   /* See if we are dealing with a keyword. */
  400:   for(ii=0; ii<(int)(sizeof(aKeyword)/sizeof(struct Fts3Keyword)); ii++){
  401:     const struct Fts3Keyword *pKey = &aKeyword[ii];
  402: 
  403:     if( (pKey->parenOnly & ~sqlite3_fts3_enable_parentheses)!=0 ){
  404:       continue;
  405:     }
  406: 
  407:     if( nInput>=pKey->n && 0==memcmp(zInput, pKey->z, pKey->n) ){
  408:       int nNear = SQLITE_FTS3_DEFAULT_NEAR_PARAM;
  409:       int nKey = pKey->n;
  410:       char cNext;
  411: 
  412:       /* If this is a "NEAR" keyword, check for an explicit nearness. */
  413:       if( pKey->eType==FTSQUERY_NEAR ){
  414:         assert( nKey==4 );
  415:         if( zInput[4]=='/' && zInput[5]>='0' && zInput[5]<='9' ){
  416:           nNear = 0;
  417:           for(nKey=5; zInput[nKey]>='0' && zInput[nKey]<='9'; nKey++){
  418:             nNear = nNear * 10 + (zInput[nKey] - '0');
  419:           }
  420:         }
  421:       }
  422: 
  423:       /* At this point this is probably a keyword. But for that to be true,
  424:       ** the next byte must contain either whitespace, an open or close
  425:       ** parenthesis, a quote character, or EOF. 
  426:       */
  427:       cNext = zInput[nKey];
  428:       if( fts3isspace(cNext) 
  429:        || cNext=='"' || cNext=='(' || cNext==')' || cNext==0
  430:       ){
  431:         pRet = (Fts3Expr *)fts3MallocZero(sizeof(Fts3Expr));
  432:         if( !pRet ){
  433:           return SQLITE_NOMEM;
  434:         }
  435:         pRet->eType = pKey->eType;
  436:         pRet->nNear = nNear;
  437:         *ppExpr = pRet;
  438:         *pnConsumed = (int)((zInput - z) + nKey);
  439:         return SQLITE_OK;
  440:       }
  441: 
  442:       /* Turns out that wasn't a keyword after all. This happens if the
  443:       ** user has supplied a token such as "ORacle". Continue.
  444:       */
  445:     }
  446:   }
  447: 
  448:   /* Check for an open bracket. */
  449:   if( sqlite3_fts3_enable_parentheses ){
  450:     if( *zInput=='(' ){
  451:       int nConsumed;
  452:       pParse->nNest++;
  453:       rc = fts3ExprParse(pParse, &zInput[1], nInput-1, ppExpr, &nConsumed);
  454:       if( rc==SQLITE_OK && !*ppExpr ){
  455:         rc = SQLITE_DONE;
  456:       }
  457:       *pnConsumed = (int)((zInput - z) + 1 + nConsumed);
  458:       return rc;
  459:     }
  460:   
  461:     /* Check for a close bracket. */
  462:     if( *zInput==')' ){
  463:       pParse->nNest--;
  464:       *pnConsumed = (int)((zInput - z) + 1);
  465:       return SQLITE_DONE;
  466:     }
  467:   }
  468: 
  469:   /* See if we are dealing with a quoted phrase. If this is the case, then
  470:   ** search for the closing quote and pass the whole string to getNextString()
  471:   ** for processing. This is easy to do, as fts3 has no syntax for escaping
  472:   ** a quote character embedded in a string.
  473:   */
  474:   if( *zInput=='"' ){
  475:     for(ii=1; ii<nInput && zInput[ii]!='"'; ii++);
  476:     *pnConsumed = (int)((zInput - z) + ii + 1);
  477:     if( ii==nInput ){
  478:       return SQLITE_ERROR;
  479:     }
  480:     return getNextString(pParse, &zInput[1], ii-1, ppExpr);
  481:   }
  482: 
  483: 
  484:   /* If control flows to this point, this must be a regular token, or 
  485:   ** the end of the input. Read a regular token using the sqlite3_tokenizer
  486:   ** interface. Before doing so, figure out if there is an explicit
  487:   ** column specifier for the token. 
  488:   **
  489:   ** TODO: Strangely, it is not possible to associate a column specifier
  490:   ** with a quoted phrase, only with a single token. Not sure if this was
  491:   ** an implementation artifact or an intentional decision when fts3 was
  492:   ** first implemented. Whichever it was, this module duplicates the 
  493:   ** limitation.
  494:   */
  495:   iCol = pParse->iDefaultCol;
  496:   iColLen = 0;
  497:   for(ii=0; ii<pParse->nCol; ii++){
  498:     const char *zStr = pParse->azCol[ii];
  499:     int nStr = (int)strlen(zStr);
  500:     if( nInput>nStr && zInput[nStr]==':' 
  501:      && sqlite3_strnicmp(zStr, zInput, nStr)==0 
  502:     ){
  503:       iCol = ii;
  504:       iColLen = (int)((zInput - z) + nStr + 1);
  505:       break;
  506:     }
  507:   }
  508:   rc = getNextToken(pParse, iCol, &z[iColLen], n-iColLen, ppExpr, pnConsumed);
  509:   *pnConsumed += iColLen;
  510:   return rc;
  511: }
  512: 
  513: /*
  514: ** The argument is an Fts3Expr structure for a binary operator (any type
  515: ** except an FTSQUERY_PHRASE). Return an integer value representing the
  516: ** precedence of the operator. Lower values have a higher precedence (i.e.
  517: ** group more tightly). For example, in the C language, the == operator
  518: ** groups more tightly than ||, and would therefore have a higher precedence.
  519: **
  520: ** When using the new fts3 query syntax (when SQLITE_ENABLE_FTS3_PARENTHESIS
  521: ** is defined), the order of the operators in precedence from highest to
  522: ** lowest is:
  523: **
  524: **   NEAR
  525: **   NOT
  526: **   AND (including implicit ANDs)
  527: **   OR
  528: **
  529: ** Note that when using the old query syntax, the OR operator has a higher
  530: ** precedence than the AND operator.
  531: */
  532: static int opPrecedence(Fts3Expr *p){
  533:   assert( p->eType!=FTSQUERY_PHRASE );
  534:   if( sqlite3_fts3_enable_parentheses ){
  535:     return p->eType;
  536:   }else if( p->eType==FTSQUERY_NEAR ){
  537:     return 1;
  538:   }else if( p->eType==FTSQUERY_OR ){
  539:     return 2;
  540:   }
  541:   assert( p->eType==FTSQUERY_AND );
  542:   return 3;
  543: }
  544: 
  545: /*
  546: ** Argument ppHead contains a pointer to the current head of a query 
  547: ** expression tree being parsed. pPrev is the expression node most recently
  548: ** inserted into the tree. This function adds pNew, which is always a binary
  549: ** operator node, into the expression tree based on the relative precedence
  550: ** of pNew and the existing nodes of the tree. This may result in the head
  551: ** of the tree changing, in which case *ppHead is set to the new root node.
  552: */
  553: static void insertBinaryOperator(
  554:   Fts3Expr **ppHead,       /* Pointer to the root node of a tree */
  555:   Fts3Expr *pPrev,         /* Node most recently inserted into the tree */
  556:   Fts3Expr *pNew           /* New binary node to insert into expression tree */
  557: ){
  558:   Fts3Expr *pSplit = pPrev;
  559:   while( pSplit->pParent && opPrecedence(pSplit->pParent)<=opPrecedence(pNew) ){
  560:     pSplit = pSplit->pParent;
  561:   }
  562: 
  563:   if( pSplit->pParent ){
  564:     assert( pSplit->pParent->pRight==pSplit );
  565:     pSplit->pParent->pRight = pNew;
  566:     pNew->pParent = pSplit->pParent;
  567:   }else{
  568:     *ppHead = pNew;
  569:   }
  570:   pNew->pLeft = pSplit;
  571:   pSplit->pParent = pNew;
  572: }
  573: 
  574: /*
  575: ** Parse the fts3 query expression found in buffer z, length n. This function
  576: ** returns either when the end of the buffer is reached or an unmatched 
  577: ** closing bracket - ')' - is encountered.
  578: **
  579: ** If successful, SQLITE_OK is returned, *ppExpr is set to point to the
  580: ** parsed form of the expression and *pnConsumed is set to the number of
  581: ** bytes read from buffer z. Otherwise, *ppExpr is set to 0 and SQLITE_NOMEM
  582: ** (out of memory error) or SQLITE_ERROR (parse error) is returned.
  583: */
  584: static int fts3ExprParse(
  585:   ParseContext *pParse,                   /* fts3 query parse context */
  586:   const char *z, int n,                   /* Text of MATCH query */
  587:   Fts3Expr **ppExpr,                      /* OUT: Parsed query structure */
  588:   int *pnConsumed                         /* OUT: Number of bytes consumed */
  589: ){
  590:   Fts3Expr *pRet = 0;
  591:   Fts3Expr *pPrev = 0;
  592:   Fts3Expr *pNotBranch = 0;               /* Only used in legacy parse mode */
  593:   int nIn = n;
  594:   const char *zIn = z;
  595:   int rc = SQLITE_OK;
  596:   int isRequirePhrase = 1;
  597: 
  598:   while( rc==SQLITE_OK ){
  599:     Fts3Expr *p = 0;
  600:     int nByte = 0;
  601:     rc = getNextNode(pParse, zIn, nIn, &p, &nByte);
  602:     if( rc==SQLITE_OK ){
  603:       int isPhrase;
  604: 
  605:       if( !sqlite3_fts3_enable_parentheses 
  606:        && p->eType==FTSQUERY_PHRASE && pParse->isNot 
  607:       ){
  608:         /* Create an implicit NOT operator. */
  609:         Fts3Expr *pNot = fts3MallocZero(sizeof(Fts3Expr));
  610:         if( !pNot ){
  611:           sqlite3Fts3ExprFree(p);
  612:           rc = SQLITE_NOMEM;
  613:           goto exprparse_out;
  614:         }
  615:         pNot->eType = FTSQUERY_NOT;
  616:         pNot->pRight = p;
  617:         if( pNotBranch ){
  618:           pNot->pLeft = pNotBranch;
  619:         }
  620:         pNotBranch = pNot;
  621:         p = pPrev;
  622:       }else{
  623:         int eType = p->eType;
  624:         isPhrase = (eType==FTSQUERY_PHRASE || p->pLeft);
  625: 
  626:         /* The isRequirePhrase variable is set to true if a phrase or
  627:         ** an expression contained in parenthesis is required. If a
  628:         ** binary operator (AND, OR, NOT or NEAR) is encounted when
  629:         ** isRequirePhrase is set, this is a syntax error.
  630:         */
  631:         if( !isPhrase && isRequirePhrase ){
  632:           sqlite3Fts3ExprFree(p);
  633:           rc = SQLITE_ERROR;
  634:           goto exprparse_out;
  635:         }
  636:   
  637:         if( isPhrase && !isRequirePhrase ){
  638:           /* Insert an implicit AND operator. */
  639:           Fts3Expr *pAnd;
  640:           assert( pRet && pPrev );
  641:           pAnd = fts3MallocZero(sizeof(Fts3Expr));
  642:           if( !pAnd ){
  643:             sqlite3Fts3ExprFree(p);
  644:             rc = SQLITE_NOMEM;
  645:             goto exprparse_out;
  646:           }
  647:           pAnd->eType = FTSQUERY_AND;
  648:           insertBinaryOperator(&pRet, pPrev, pAnd);
  649:           pPrev = pAnd;
  650:         }
  651: 
  652:         /* This test catches attempts to make either operand of a NEAR
  653:         ** operator something other than a phrase. For example, either of
  654:         ** the following:
  655:         **
  656:         **    (bracketed expression) NEAR phrase
  657:         **    phrase NEAR (bracketed expression)
  658:         **
  659:         ** Return an error in either case.
  660:         */
  661:         if( pPrev && (
  662:             (eType==FTSQUERY_NEAR && !isPhrase && pPrev->eType!=FTSQUERY_PHRASE)
  663:          || (eType!=FTSQUERY_PHRASE && isPhrase && pPrev->eType==FTSQUERY_NEAR)
  664:         )){
  665:           sqlite3Fts3ExprFree(p);
  666:           rc = SQLITE_ERROR;
  667:           goto exprparse_out;
  668:         }
  669:   
  670:         if( isPhrase ){
  671:           if( pRet ){
  672:             assert( pPrev && pPrev->pLeft && pPrev->pRight==0 );
  673:             pPrev->pRight = p;
  674:             p->pParent = pPrev;
  675:           }else{
  676:             pRet = p;
  677:           }
  678:         }else{
  679:           insertBinaryOperator(&pRet, pPrev, p);
  680:         }
  681:         isRequirePhrase = !isPhrase;
  682:       }
  683:       assert( nByte>0 );
  684:     }
  685:     assert( rc!=SQLITE_OK || (nByte>0 && nByte<=nIn) );
  686:     nIn -= nByte;
  687:     zIn += nByte;
  688:     pPrev = p;
  689:   }
  690: 
  691:   if( rc==SQLITE_DONE && pRet && isRequirePhrase ){
  692:     rc = SQLITE_ERROR;
  693:   }
  694: 
  695:   if( rc==SQLITE_DONE ){
  696:     rc = SQLITE_OK;
  697:     if( !sqlite3_fts3_enable_parentheses && pNotBranch ){
  698:       if( !pRet ){
  699:         rc = SQLITE_ERROR;
  700:       }else{
  701:         Fts3Expr *pIter = pNotBranch;
  702:         while( pIter->pLeft ){
  703:           pIter = pIter->pLeft;
  704:         }
  705:         pIter->pLeft = pRet;
  706:         pRet = pNotBranch;
  707:       }
  708:     }
  709:   }
  710:   *pnConsumed = n - nIn;
  711: 
  712: exprparse_out:
  713:   if( rc!=SQLITE_OK ){
  714:     sqlite3Fts3ExprFree(pRet);
  715:     sqlite3Fts3ExprFree(pNotBranch);
  716:     pRet = 0;
  717:   }
  718:   *ppExpr = pRet;
  719:   return rc;
  720: }
  721: 
  722: /*
  723: ** Parameters z and n contain a pointer to and length of a buffer containing
  724: ** an fts3 query expression, respectively. This function attempts to parse the
  725: ** query expression and create a tree of Fts3Expr structures representing the
  726: ** parsed expression. If successful, *ppExpr is set to point to the head
  727: ** of the parsed expression tree and SQLITE_OK is returned. If an error
  728: ** occurs, either SQLITE_NOMEM (out-of-memory error) or SQLITE_ERROR (parse
  729: ** error) is returned and *ppExpr is set to 0.
  730: **
  731: ** If parameter n is a negative number, then z is assumed to point to a
  732: ** nul-terminated string and the length is determined using strlen().
  733: **
  734: ** The first parameter, pTokenizer, is passed the fts3 tokenizer module to
  735: ** use to normalize query tokens while parsing the expression. The azCol[]
  736: ** array, which is assumed to contain nCol entries, should contain the names
  737: ** of each column in the target fts3 table, in order from left to right. 
  738: ** Column names must be nul-terminated strings.
  739: **
  740: ** The iDefaultCol parameter should be passed the index of the table column
  741: ** that appears on the left-hand-side of the MATCH operator (the default
  742: ** column to match against for tokens for which a column name is not explicitly
  743: ** specified as part of the query string), or -1 if tokens may by default
  744: ** match any table column.
  745: */
  746: int sqlite3Fts3ExprParse(
  747:   sqlite3_tokenizer *pTokenizer,      /* Tokenizer module */
  748:   char **azCol,                       /* Array of column names for fts3 table */
  749:   int bFts4,                          /* True to allow FTS4-only syntax */
  750:   int nCol,                           /* Number of entries in azCol[] */
  751:   int iDefaultCol,                    /* Default column to query */
  752:   const char *z, int n,               /* Text of MATCH query */
  753:   Fts3Expr **ppExpr                   /* OUT: Parsed query structure */
  754: ){
  755:   int nParsed;
  756:   int rc;
  757:   ParseContext sParse;
  758:   sParse.pTokenizer = pTokenizer;
  759:   sParse.azCol = (const char **)azCol;
  760:   sParse.nCol = nCol;
  761:   sParse.iDefaultCol = iDefaultCol;
  762:   sParse.nNest = 0;
  763:   sParse.bFts4 = bFts4;
  764:   if( z==0 ){
  765:     *ppExpr = 0;
  766:     return SQLITE_OK;
  767:   }
  768:   if( n<0 ){
  769:     n = (int)strlen(z);
  770:   }
  771:   rc = fts3ExprParse(&sParse, z, n, ppExpr, &nParsed);
  772: 
  773:   /* Check for mismatched parenthesis */
  774:   if( rc==SQLITE_OK && sParse.nNest ){
  775:     rc = SQLITE_ERROR;
  776:     sqlite3Fts3ExprFree(*ppExpr);
  777:     *ppExpr = 0;
  778:   }
  779: 
  780:   return rc;
  781: }
  782: 
  783: /*
  784: ** Free a parsed fts3 query expression allocated by sqlite3Fts3ExprParse().
  785: */
  786: void sqlite3Fts3ExprFree(Fts3Expr *p){
  787:   if( p ){
  788:     assert( p->eType==FTSQUERY_PHRASE || p->pPhrase==0 );
  789:     sqlite3Fts3ExprFree(p->pLeft);
  790:     sqlite3Fts3ExprFree(p->pRight);
  791:     sqlite3Fts3EvalPhraseCleanup(p->pPhrase);
  792:     sqlite3_free(p->aMI);
  793:     sqlite3_free(p);
  794:   }
  795: }
  796: 
  797: /****************************************************************************
  798: *****************************************************************************
  799: ** Everything after this point is just test code.
  800: */
  801: 
  802: #ifdef SQLITE_TEST
  803: 
  804: #include <stdio.h>
  805: 
  806: /*
  807: ** Function to query the hash-table of tokenizers (see README.tokenizers).
  808: */
  809: static int queryTestTokenizer(
  810:   sqlite3 *db, 
  811:   const char *zName,  
  812:   const sqlite3_tokenizer_module **pp
  813: ){
  814:   int rc;
  815:   sqlite3_stmt *pStmt;
  816:   const char zSql[] = "SELECT fts3_tokenizer(?)";
  817: 
  818:   *pp = 0;
  819:   rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
  820:   if( rc!=SQLITE_OK ){
  821:     return rc;
  822:   }
  823: 
  824:   sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
  825:   if( SQLITE_ROW==sqlite3_step(pStmt) ){
  826:     if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
  827:       memcpy((void *)pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
  828:     }
  829:   }
  830: 
  831:   return sqlite3_finalize(pStmt);
  832: }
  833: 
  834: /*
  835: ** Return a pointer to a buffer containing a text representation of the
  836: ** expression passed as the first argument. The buffer is obtained from
  837: ** sqlite3_malloc(). It is the responsibility of the caller to use 
  838: ** sqlite3_free() to release the memory. If an OOM condition is encountered,
  839: ** NULL is returned.
  840: **
  841: ** If the second argument is not NULL, then its contents are prepended to 
  842: ** the returned expression text and then freed using sqlite3_free().
  843: */
  844: static char *exprToString(Fts3Expr *pExpr, char *zBuf){
  845:   switch( pExpr->eType ){
  846:     case FTSQUERY_PHRASE: {
  847:       Fts3Phrase *pPhrase = pExpr->pPhrase;
  848:       int i;
  849:       zBuf = sqlite3_mprintf(
  850:           "%zPHRASE %d 0", zBuf, pPhrase->iColumn);
  851:       for(i=0; zBuf && i<pPhrase->nToken; i++){
  852:         zBuf = sqlite3_mprintf("%z %.*s%s", zBuf, 
  853:             pPhrase->aToken[i].n, pPhrase->aToken[i].z,
  854:             (pPhrase->aToken[i].isPrefix?"+":"")
  855:         );
  856:       }
  857:       return zBuf;
  858:     }
  859: 
  860:     case FTSQUERY_NEAR:
  861:       zBuf = sqlite3_mprintf("%zNEAR/%d ", zBuf, pExpr->nNear);
  862:       break;
  863:     case FTSQUERY_NOT:
  864:       zBuf = sqlite3_mprintf("%zNOT ", zBuf);
  865:       break;
  866:     case FTSQUERY_AND:
  867:       zBuf = sqlite3_mprintf("%zAND ", zBuf);
  868:       break;
  869:     case FTSQUERY_OR:
  870:       zBuf = sqlite3_mprintf("%zOR ", zBuf);
  871:       break;
  872:   }
  873: 
  874:   if( zBuf ) zBuf = sqlite3_mprintf("%z{", zBuf);
  875:   if( zBuf ) zBuf = exprToString(pExpr->pLeft, zBuf);
  876:   if( zBuf ) zBuf = sqlite3_mprintf("%z} {", zBuf);
  877: 
  878:   if( zBuf ) zBuf = exprToString(pExpr->pRight, zBuf);
  879:   if( zBuf ) zBuf = sqlite3_mprintf("%z}", zBuf);
  880: 
  881:   return zBuf;
  882: }
  883: 
  884: /*
  885: ** This is the implementation of a scalar SQL function used to test the 
  886: ** expression parser. It should be called as follows:
  887: **
  888: **   fts3_exprtest(<tokenizer>, <expr>, <column 1>, ...);
  889: **
  890: ** The first argument, <tokenizer>, is the name of the fts3 tokenizer used
  891: ** to parse the query expression (see README.tokenizers). The second argument
  892: ** is the query expression to parse. Each subsequent argument is the name
  893: ** of a column of the fts3 table that the query expression may refer to.
  894: ** For example:
  895: **
  896: **   SELECT fts3_exprtest('simple', 'Bill col2:Bloggs', 'col1', 'col2');
  897: */
  898: static void fts3ExprTest(
  899:   sqlite3_context *context,
  900:   int argc,
  901:   sqlite3_value **argv
  902: ){
  903:   sqlite3_tokenizer_module const *pModule = 0;
  904:   sqlite3_tokenizer *pTokenizer = 0;
  905:   int rc;
  906:   char **azCol = 0;
  907:   const char *zExpr;
  908:   int nExpr;
  909:   int nCol;
  910:   int ii;
  911:   Fts3Expr *pExpr;
  912:   char *zBuf = 0;
  913:   sqlite3 *db = sqlite3_context_db_handle(context);
  914: 
  915:   if( argc<3 ){
  916:     sqlite3_result_error(context, 
  917:         "Usage: fts3_exprtest(tokenizer, expr, col1, ...", -1
  918:     );
  919:     return;
  920:   }
  921: 
  922:   rc = queryTestTokenizer(db,
  923:                           (const char *)sqlite3_value_text(argv[0]), &pModule);
  924:   if( rc==SQLITE_NOMEM ){
  925:     sqlite3_result_error_nomem(context);
  926:     goto exprtest_out;
  927:   }else if( !pModule ){
  928:     sqlite3_result_error(context, "No such tokenizer module", -1);
  929:     goto exprtest_out;
  930:   }
  931: 
  932:   rc = pModule->xCreate(0, 0, &pTokenizer);
  933:   assert( rc==SQLITE_NOMEM || rc==SQLITE_OK );
  934:   if( rc==SQLITE_NOMEM ){
  935:     sqlite3_result_error_nomem(context);
  936:     goto exprtest_out;
  937:   }
  938:   pTokenizer->pModule = pModule;
  939: 
  940:   zExpr = (const char *)sqlite3_value_text(argv[1]);
  941:   nExpr = sqlite3_value_bytes(argv[1]);
  942:   nCol = argc-2;
  943:   azCol = (char **)sqlite3_malloc(nCol*sizeof(char *));
  944:   if( !azCol ){
  945:     sqlite3_result_error_nomem(context);
  946:     goto exprtest_out;
  947:   }
  948:   for(ii=0; ii<nCol; ii++){
  949:     azCol[ii] = (char *)sqlite3_value_text(argv[ii+2]);
  950:   }
  951: 
  952:   rc = sqlite3Fts3ExprParse(
  953:       pTokenizer, azCol, 0, nCol, nCol, zExpr, nExpr, &pExpr
  954:   );
  955:   if( rc!=SQLITE_OK && rc!=SQLITE_NOMEM ){
  956:     sqlite3_result_error(context, "Error parsing expression", -1);
  957:   }else if( rc==SQLITE_NOMEM || !(zBuf = exprToString(pExpr, 0)) ){
  958:     sqlite3_result_error_nomem(context);
  959:   }else{
  960:     sqlite3_result_text(context, zBuf, -1, SQLITE_TRANSIENT);
  961:     sqlite3_free(zBuf);
  962:   }
  963: 
  964:   sqlite3Fts3ExprFree(pExpr);
  965: 
  966: exprtest_out:
  967:   if( pModule && pTokenizer ){
  968:     rc = pModule->xDestroy(pTokenizer);
  969:   }
  970:   sqlite3_free(azCol);
  971: }
  972: 
  973: /*
  974: ** Register the query expression parser test function fts3_exprtest() 
  975: ** with database connection db. 
  976: */
  977: int sqlite3Fts3ExprInitTestInterface(sqlite3* db){
  978:   return sqlite3_create_function(
  979:       db, "fts3_exprtest", -1, SQLITE_UTF8, 0, fts3ExprTest, 0, 0
  980:   );
  981: }
  982: 
  983: #endif
  984: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>