File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / sqlite3 / ext / fts3 / fts3_snippet.c
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Tue Feb 21 17:04:17 2012 UTC (12 years, 4 months ago) by misho
Branches: sqlite3, MAIN
CVS tags: v3_7_10, HEAD
sqlite3

    1: /*
    2: ** 2009 Oct 23
    3: **
    4: ** The author disclaims copyright to this source code.  In place of
    5: ** a legal notice, here is a blessing:
    6: **
    7: **    May you do good and not evil.
    8: **    May you find forgiveness for yourself and forgive others.
    9: **    May you share freely, never taking more than you give.
   10: **
   11: ******************************************************************************
   12: */
   13: 
   14: #include "fts3Int.h"
   15: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
   16: 
   17: #include <string.h>
   18: #include <assert.h>
   19: 
   20: /*
   21: ** Characters that may appear in the second argument to matchinfo().
   22: */
   23: #define FTS3_MATCHINFO_NPHRASE   'p'        /* 1 value (cf. MatchInfo.nPhrase) */
   24: #define FTS3_MATCHINFO_NCOL      'c'        /* 1 value (cf. MatchInfo.nCol) */
   25: #define FTS3_MATCHINFO_NDOC      'n'        /* 1 value (cf. MatchInfo.nDoc) */
   26: #define FTS3_MATCHINFO_AVGLENGTH 'a'        /* nCol values */
   27: #define FTS3_MATCHINFO_LENGTH    'l'        /* nCol values */
   28: #define FTS3_MATCHINFO_LCS       's'        /* nCol values */
   29: #define FTS3_MATCHINFO_HITS      'x'        /* 3*nCol*nPhrase values: one triple per (phrase,column) */
   30: 
   31: /*
   32: ** Default second argument to matchinfo(): NPHRASE, NCOL, then HITS.
   33: */
   34: #define FTS3_MATCHINFO_DEFAULT   "pcx"
   35: 
   36: 
   37: /*
   38: ** Used as an fts3ExprIterate() context when loading phrase doclists to
   39: ** Fts3Expr.aDoclist[]/nDoclist.
   40: */
   41: typedef struct LoadDoclistCtx LoadDoclistCtx;
   42: struct LoadDoclistCtx {
   43:   Fts3Cursor *pCsr;               /* FTS3 Cursor */
   44:   int nPhrase;                    /* Number of phrases seen so far */
   45:   int nToken;                     /* Number of tokens seen so far */
   46: };
   47: 
   48: /*
   49: ** The following types are used as part of the implementation of the 
   50: ** fts3BestSnippet() routine.
   51: */
   52: typedef struct SnippetIter SnippetIter;
   53: typedef struct SnippetPhrase SnippetPhrase;
   54: typedef struct SnippetFragment SnippetFragment;
   55: 
   56: struct SnippetIter {
   57:   Fts3Cursor *pCsr;               /* Cursor snippet is being generated from */
   58:   int iCol;                       /* Extract snippet from this column */
   59:   int nSnippet;                   /* Requested snippet length (in tokens) */
   60:   int nPhrase;                    /* Number of phrases in query */
   61:   SnippetPhrase *aPhrase;         /* Array of size nPhrase */
   62:   int iCurrent;                   /* First token of current snippet */
   63: };
   64: 
   65: struct SnippetPhrase {
   66:   int nToken;                     /* Number of tokens in phrase */
   67:   char *pList;                    /* Pointer to start of phrase position list */
   68:   int iHead;                      /* Next value in position list */
   69:   char *pHead;                    /* Position list data following iHead */
   70:   int iTail;                      /* Next value in trailing position list */
   71:   char *pTail;                    /* Position list data following iTail */
   72: };
   73: 
   74: struct SnippetFragment {
   75:   int iCol;                       /* Column snippet is extracted from */
   76:   int iPos;                       /* Index of first token in snippet */
   77:   u64 covered;                    /* Mask of query phrases covered */
   78:   u64 hlmask;                     /* Mask of snippet terms to highlight */
   79: };
   80: 
   81: /*
   82: ** This type is used as an fts3ExprIterate() context object while 
   83: ** accumulating the data returned by the matchinfo() function.
   84: */
   85: typedef struct MatchInfo MatchInfo;
   86: struct MatchInfo {
   87:   Fts3Cursor *pCursor;            /* FTS3 Cursor */
   88:   int nCol;                       /* Number of columns in table */
   89:   int nPhrase;                    /* Number of matchable phrases in query */
   90:   sqlite3_int64 nDoc;             /* Number of docs in database */
   91:   u32 *aMatchinfo;                /* Pre-allocated buffer */
   92: };
   93: 
   94: 
   95: 
   96: /*
   97: ** The snippet() and offsets() functions both return text values. An instance
   98: ** of the following structure is used to accumulate those values while the
   99: ** functions are running. See fts3StringAppend() for details.
  100: */
  101: typedef struct StrBuffer StrBuffer;
  102: struct StrBuffer {
  103:   char *z;                        /* Pointer to buffer containing string */
  104:   int n;                          /* Length of z in bytes (excl. nul-term) */
  105:   int nAlloc;                     /* Allocated size of buffer z in bytes */
  106: };
  107: 
  108: 
  109: /*
  110: ** This function is used to help iterate through a position-list. A position
  111: ** list is a list of unique integers, sorted from smallest to largest. Each
  112: ** element of the list is represented by an FTS3 varint that takes the value
  113: ** of the difference between the current element and the previous one plus
  114: ** two. For example, to store the position-list:
  115: **
  116: **     4 9 113
  117: **
  118: ** the three varints:
  119: **
  120: **     6 7 106
  121: **
  122: ** are encoded.
  123: **
  124: ** When this function is called, *pp points to the start of an element of
  125: ** the list. *piPos contains the value of the previous entry in the list.
  126: ** After it returns, *piPos contains the value of the next element of the
  127: ** list and *pp is advanced to the following varint.
  128: */
  129: static void fts3GetDeltaPosition(char **pp, int *piPos){
  130:   int iVal;
  131:   *pp += sqlite3Fts3GetVarint32(*pp, &iVal);
  132:   *piPos += (iVal-2);
  133: }
  134: 
  135: /*
  136: ** Helper function for fts3ExprIterate() (see below).
  137: */
  138: static int fts3ExprIterate2(
  139:   Fts3Expr *pExpr,                /* Expression to iterate phrases of */
  140:   int *piPhrase,                  /* Pointer to phrase counter */
  141:   int (*x)(Fts3Expr*,int,void*),  /* Callback function to invoke for phrases */
  142:   void *pCtx                      /* Second argument to pass to callback */
  143: ){
  144:   int rc;                         /* Return code */
  145:   int eType = pExpr->eType;       /* Type of expression node pExpr */
  146: 
  147:   if( eType!=FTSQUERY_PHRASE ){
  148:     assert( pExpr->pLeft && pExpr->pRight );
  149:     rc = fts3ExprIterate2(pExpr->pLeft, piPhrase, x, pCtx);
  150:     if( rc==SQLITE_OK && eType!=FTSQUERY_NOT ){
  151:       rc = fts3ExprIterate2(pExpr->pRight, piPhrase, x, pCtx);
  152:     }
  153:   }else{
  154:     rc = x(pExpr, *piPhrase, pCtx);
  155:     (*piPhrase)++;
  156:   }
  157:   return rc;
  158: }
  159: 
  160: /*
  161: ** Iterate through all phrase nodes in an FTS3 query, except those that
  162: ** are part of a sub-tree that is the right-hand-side of a NOT operator.
  163: ** For each phrase node found, the supplied callback function is invoked.
  164: **
  165: ** If the callback function returns anything other than SQLITE_OK, 
  166: ** the iteration is abandoned and the error code returned immediately.
  167: ** Otherwise, SQLITE_OK is returned after a callback has been made for
  168: ** all eligible phrase nodes.
  169: */
  170: static int fts3ExprIterate(
  171:   Fts3Expr *pExpr,                /* Expression to iterate phrases of */
  172:   int (*x)(Fts3Expr*,int,void*),  /* Callback function to invoke for phrases */
  173:   void *pCtx                      /* Second argument to pass to callback */
  174: ){
  175:   int iPhrase = 0;                /* Variable used as the phrase counter */
  176:   return fts3ExprIterate2(pExpr, &iPhrase, x, pCtx);
  177: }
  178: 
  179: /*
  180: ** This is an fts3ExprIterate() callback used while loading the doclists
  181: ** for each phrase into Fts3Expr.aDoclist[]/nDoclist. See also
  182: ** fts3ExprLoadDoclists().
  183: */
  184: static int fts3ExprLoadDoclistsCb(Fts3Expr *pExpr, int iPhrase, void *ctx){
  185:   int rc = SQLITE_OK;
  186:   Fts3Phrase *pPhrase = pExpr->pPhrase;
  187:   LoadDoclistCtx *p = (LoadDoclistCtx *)ctx;
  188: 
  189:   UNUSED_PARAMETER(iPhrase);
  190: 
  191:   p->nPhrase++;
  192:   p->nToken += pPhrase->nToken;
  193: 
  194:   return rc;
  195: }
  196: 
  197: /*
  198: ** Load the doclists for each phrase in the query associated with FTS3 cursor
  199: ** pCsr. 
  200: **
  201: ** If pnPhrase is not NULL, then *pnPhrase is set to the number of matchable 
  202: ** phrases in the expression (all phrases except those directly or 
  203: ** indirectly descended from the right-hand-side of a NOT operator). If 
  204: ** pnToken is not NULL, then it is set to the number of tokens in all
  205: ** matchable phrases of the expression.
  206: */
  207: static int fts3ExprLoadDoclists(
  208:   Fts3Cursor *pCsr,               /* Fts3 cursor for current query */
  209:   int *pnPhrase,                  /* OUT: Number of phrases in query */
  210:   int *pnToken                    /* OUT: Number of tokens in query */
  211: ){
  212:   int rc;                         /* Return Code */
  213:   LoadDoclistCtx sCtx = {0,0,0};  /* Context for fts3ExprIterate() */
  214:   sCtx.pCsr = pCsr;
  215:   rc = fts3ExprIterate(pCsr->pExpr, fts3ExprLoadDoclistsCb, (void *)&sCtx);
  216:   if( pnPhrase ) *pnPhrase = sCtx.nPhrase;
  217:   if( pnToken ) *pnToken = sCtx.nToken;
  218:   return rc;
  219: }
  220: 
  221: static int fts3ExprPhraseCountCb(Fts3Expr *pExpr, int iPhrase, void *ctx){
  222:   (*(int *)ctx)++;
  223:   UNUSED_PARAMETER(pExpr);
  224:   UNUSED_PARAMETER(iPhrase);
  225:   return SQLITE_OK;
  226: }
  227: static int fts3ExprPhraseCount(Fts3Expr *pExpr){
  228:   int nPhrase = 0;
  229:   (void)fts3ExprIterate(pExpr, fts3ExprPhraseCountCb, (void *)&nPhrase);
  230:   return nPhrase;
  231: }
  232: 
  233: /*
  234: ** Advance the position list iterator specified by the first two 
  235: ** arguments so that it points to the first element with a value greater
  236: ** than or equal to parameter iNext.
  237: */
  238: static void fts3SnippetAdvance(char **ppIter, int *piIter, int iNext){
  239:   char *pIter = *ppIter;
  240:   if( pIter ){
  241:     int iIter = *piIter;
  242: 
  243:     while( iIter<iNext ){
  244:       if( 0==(*pIter & 0xFE) ){
  245:         iIter = -1;
  246:         pIter = 0;
  247:         break;
  248:       }
  249:       fts3GetDeltaPosition(&pIter, &iIter);
  250:     }
  251: 
  252:     *piIter = iIter;
  253:     *ppIter = pIter;
  254:   }
  255: }
  256: 
  257: /*
  258: ** Advance the snippet iterator to the next candidate snippet.
  259: */
  260: static int fts3SnippetNextCandidate(SnippetIter *pIter){
  261:   int i;                          /* Loop counter */
  262: 
  263:   if( pIter->iCurrent<0 ){
  264:     /* The SnippetIter object has just been initialized. The first snippet
  265:     ** candidate always starts at offset 0 (even if this candidate has a
  266:     ** score of 0.0).
  267:     */
  268:     pIter->iCurrent = 0;
  269: 
  270:     /* Advance the 'head' iterator of each phrase to the first offset that
  271:     ** is greater than or equal to (iNext+nSnippet).
  272:     */
  273:     for(i=0; i<pIter->nPhrase; i++){
  274:       SnippetPhrase *pPhrase = &pIter->aPhrase[i];
  275:       fts3SnippetAdvance(&pPhrase->pHead, &pPhrase->iHead, pIter->nSnippet);
  276:     }
  277:   }else{
  278:     int iStart;
  279:     int iEnd = 0x7FFFFFFF;
  280: 
  281:     for(i=0; i<pIter->nPhrase; i++){
  282:       SnippetPhrase *pPhrase = &pIter->aPhrase[i];
  283:       if( pPhrase->pHead && pPhrase->iHead<iEnd ){
  284:         iEnd = pPhrase->iHead;
  285:       }
  286:     }
  287:     if( iEnd==0x7FFFFFFF ){
  288:       return 1;
  289:     }
  290: 
  291:     pIter->iCurrent = iStart = iEnd - pIter->nSnippet + 1;
  292:     for(i=0; i<pIter->nPhrase; i++){
  293:       SnippetPhrase *pPhrase = &pIter->aPhrase[i];
  294:       fts3SnippetAdvance(&pPhrase->pHead, &pPhrase->iHead, iEnd+1);
  295:       fts3SnippetAdvance(&pPhrase->pTail, &pPhrase->iTail, iStart);
  296:     }
  297:   }
  298: 
  299:   return 0;
  300: }
  301: 
  302: /*
  303: ** Retrieve information about the current candidate snippet of snippet 
  304: ** iterator pIter.
  305: */
  306: static void fts3SnippetDetails(
  307:   SnippetIter *pIter,             /* Snippet iterator */
  308:   u64 mCovered,                   /* Bitmask of phrases already covered */
  309:   int *piToken,                   /* OUT: First token of proposed snippet */
  310:   int *piScore,                   /* OUT: "Score" for this snippet */
  311:   u64 *pmCover,                   /* OUT: Bitmask of phrases covered */
  312:   u64 *pmHighlight                /* OUT: Bitmask of terms to highlight */
  313: ){
  314:   int iStart = pIter->iCurrent;   /* First token of snippet */
  315:   int iScore = 0;                 /* Score of this snippet */
  316:   int i;                          /* Loop counter */
  317:   u64 mCover = 0;                 /* Mask of phrases covered by this snippet */
  318:   u64 mHighlight = 0;             /* Mask of tokens to highlight in snippet */
  319: 
  320:   for(i=0; i<pIter->nPhrase; i++){
  321:     SnippetPhrase *pPhrase = &pIter->aPhrase[i];
  322:     if( pPhrase->pTail ){
  323:       char *pCsr = pPhrase->pTail;
  324:       int iCsr = pPhrase->iTail;
  325: 
  326:       while( iCsr<(iStart+pIter->nSnippet) ){
  327:         int j;
  328:         u64 mPhrase = (u64)1 << i;
  329:         u64 mPos = (u64)1 << (iCsr - iStart);
  330:         assert( iCsr>=iStart );
  331:         if( (mCover|mCovered)&mPhrase ){
  332:           iScore++;
  333:         }else{
  334:           iScore += 1000;
  335:         }
  336:         mCover |= mPhrase;
  337: 
  338:         for(j=0; j<pPhrase->nToken; j++){
  339:           mHighlight |= (mPos>>j);
  340:         }
  341: 
  342:         if( 0==(*pCsr & 0x0FE) ) break;
  343:         fts3GetDeltaPosition(&pCsr, &iCsr);
  344:       }
  345:     }
  346:   }
  347: 
  348:   /* Set the output variables before returning. */
  349:   *piToken = iStart;
  350:   *piScore = iScore;
  351:   *pmCover = mCover;
  352:   *pmHighlight = mHighlight;
  353: }
  354: 
  355: /*
  356: ** This function is an fts3ExprIterate() callback used by fts3BestSnippet().
  357: ** Each invocation populates an element of the SnippetIter.aPhrase[] array.
  358: */
  359: static int fts3SnippetFindPositions(Fts3Expr *pExpr, int iPhrase, void *ctx){
  360:   SnippetIter *p = (SnippetIter *)ctx;
  361:   SnippetPhrase *pPhrase = &p->aPhrase[iPhrase];
  362:   char *pCsr;
  363: 
  364:   pPhrase->nToken = pExpr->pPhrase->nToken;
  365: 
  366:   pCsr = sqlite3Fts3EvalPhrasePoslist(p->pCsr, pExpr, p->iCol);
  367:   if( pCsr ){
  368:     int iFirst = 0;
  369:     pPhrase->pList = pCsr;
  370:     fts3GetDeltaPosition(&pCsr, &iFirst);
  371:     assert( iFirst>=0 );
  372:     pPhrase->pHead = pCsr;
  373:     pPhrase->pTail = pCsr;
  374:     pPhrase->iHead = iFirst;
  375:     pPhrase->iTail = iFirst;
  376:   }else{
  377:     assert( pPhrase->pList==0 && pPhrase->pHead==0 && pPhrase->pTail==0 );
  378:   }
  379: 
  380:   return SQLITE_OK;
  381: }
  382: 
  383: /*
  384: ** Select the fragment of text consisting of nSnippet contiguous tokens 
  385: ** from column iCol that represent the "best" snippet. The best snippet
  386: ** is the snippet with the highest score, where scores are calculated
  387: ** by adding:
  388: **
  389: **   (a) +1 point for each occurrence of a matchable phrase in the snippet.
  390: **
  391: **   (b) +1000 points for the first occurrence of each matchable phrase in 
  392: **       the snippet for which the corresponding mCovered bit is not set.
  393: **
  394: ** The selected snippet parameters are stored in structure *pFragment before
  395: ** returning. The score of the selected snippet is stored in *piScore
  396: ** before returning.
  397: */
  398: static int fts3BestSnippet(
  399:   int nSnippet,                   /* Desired snippet length */
  400:   Fts3Cursor *pCsr,               /* Cursor to create snippet for */
  401:   int iCol,                       /* Index of column to create snippet from */
  402:   u64 mCovered,                   /* Mask of phrases already covered */
  403:   u64 *pmSeen,                    /* IN/OUT: Mask of phrases seen */
  404:   SnippetFragment *pFragment,     /* OUT: Best snippet found */
  405:   int *piScore                    /* OUT: Score of snippet pFragment */
  406: ){
  407:   int rc;                         /* Return Code */
  408:   int nList;                      /* Number of phrases in expression */
  409:   SnippetIter sIter;              /* Iterates through snippet candidates */
  410:   int nByte;                      /* Number of bytes of space to allocate */
  411:   int iBestScore = -1;            /* Best snippet score found so far */
  412:   int i;                          /* Loop counter */
  413: 
  414:   memset(&sIter, 0, sizeof(sIter));
  415: 
  416:   /* Iterate through the phrases in the expression to count them. The same
  417:   ** callback makes sure the doclists are loaded for each phrase.
  418:   */
  419:   rc = fts3ExprLoadDoclists(pCsr, &nList, 0);
  420:   if( rc!=SQLITE_OK ){
  421:     return rc;
  422:   }
  423: 
  424:   /* Now that it is known how many phrases there are, allocate and zero
  425:   ** the required space using malloc().
  426:   */
  427:   nByte = sizeof(SnippetPhrase) * nList;
  428:   sIter.aPhrase = (SnippetPhrase *)sqlite3_malloc(nByte);
  429:   if( !sIter.aPhrase ){
  430:     return SQLITE_NOMEM;
  431:   }
  432:   memset(sIter.aPhrase, 0, nByte);
  433: 
  434:   /* Initialize the contents of the SnippetIter object. Then iterate through
  435:   ** the set of phrases in the expression to populate the aPhrase[] array.
  436:   */
  437:   sIter.pCsr = pCsr;
  438:   sIter.iCol = iCol;
  439:   sIter.nSnippet = nSnippet;
  440:   sIter.nPhrase = nList;
  441:   sIter.iCurrent = -1;
  442:   (void)fts3ExprIterate(pCsr->pExpr, fts3SnippetFindPositions, (void *)&sIter);
  443: 
  444:   /* Set the *pmSeen output variable. */
  445:   for(i=0; i<nList; i++){
  446:     if( sIter.aPhrase[i].pHead ){
  447:       *pmSeen |= (u64)1 << i;
  448:     }
  449:   }
  450: 
  451:   /* Loop through all candidate snippets. Store the best snippet in 
  452:   ** *pFragment. Store its associated 'score' in iBestScore.
  453:   */
  454:   pFragment->iCol = iCol;
  455:   while( !fts3SnippetNextCandidate(&sIter) ){
  456:     int iPos;
  457:     int iScore;
  458:     u64 mCover;
  459:     u64 mHighlight;
  460:     fts3SnippetDetails(&sIter, mCovered, &iPos, &iScore, &mCover, &mHighlight);
  461:     assert( iScore>=0 );
  462:     if( iScore>iBestScore ){
  463:       pFragment->iPos = iPos;
  464:       pFragment->hlmask = mHighlight;
  465:       pFragment->covered = mCover;
  466:       iBestScore = iScore;
  467:     }
  468:   }
  469: 
  470:   sqlite3_free(sIter.aPhrase);
  471:   *piScore = iBestScore;
  472:   return SQLITE_OK;
  473: }
  474: 
  475: 
  476: /*
  477: ** Append a string to the string-buffer passed as the first argument.
  478: **
  479: ** If nAppend is negative, then the length of the string zAppend is
  480: ** determined using strlen().
  481: */
  482: static int fts3StringAppend(
  483:   StrBuffer *pStr,                /* Buffer to append to */
  484:   const char *zAppend,            /* Pointer to data to append to buffer */
  485:   int nAppend                     /* Size of zAppend in bytes (or -1) */
  486: ){
  487:   if( nAppend<0 ){
  488:     nAppend = (int)strlen(zAppend);
  489:   }
  490: 
  491:   /* If there is insufficient space allocated at StrBuffer.z, use realloc()
  492:   ** to grow the buffer until so that it is big enough to accomadate the
  493:   ** appended data.
  494:   */
  495:   if( pStr->n+nAppend+1>=pStr->nAlloc ){
  496:     int nAlloc = pStr->nAlloc+nAppend+100;
  497:     char *zNew = sqlite3_realloc(pStr->z, nAlloc);
  498:     if( !zNew ){
  499:       return SQLITE_NOMEM;
  500:     }
  501:     pStr->z = zNew;
  502:     pStr->nAlloc = nAlloc;
  503:   }
  504: 
  505:   /* Append the data to the string buffer. */
  506:   memcpy(&pStr->z[pStr->n], zAppend, nAppend);
  507:   pStr->n += nAppend;
  508:   pStr->z[pStr->n] = '\0';
  509: 
  510:   return SQLITE_OK;
  511: }
  512: 
  513: /*
  514: ** The fts3BestSnippet() function often selects snippets that end with a
  515: ** query term. That is, the final term of the snippet is always a term
  516: ** that requires highlighting. For example, if 'X' is a highlighted term
  517: ** and '.' is a non-highlighted term, BestSnippet() may select:
  518: **
  519: **     ........X.....X
  520: **
  521: ** This function "shifts" the beginning of the snippet forward in the 
  522: ** document so that there are approximately the same number of 
  523: ** non-highlighted terms to the right of the final highlighted term as there
  524: ** are to the left of the first highlighted term. For example, to this:
  525: **
  526: **     ....X.....X....
  527: **
  528: ** This is done as part of extracting the snippet text, not when selecting
  529: ** the snippet. Snippet selection is done based on doclists only, so there
  530: ** is no way for fts3BestSnippet() to know whether or not the document 
  531: ** actually contains terms that follow the final highlighted term. 
  532: */
  533: static int fts3SnippetShift(
  534:   Fts3Table *pTab,                /* FTS3 table snippet comes from */
  535:   int nSnippet,                   /* Number of tokens desired for snippet */
  536:   const char *zDoc,               /* Document text to extract snippet from */
  537:   int nDoc,                       /* Size of buffer zDoc in bytes */
  538:   int *piPos,                     /* IN/OUT: First token of snippet */
  539:   u64 *pHlmask                    /* IN/OUT: Mask of tokens to highlight */
  540: ){
  541:   u64 hlmask = *pHlmask;          /* Local copy of initial highlight-mask */
  542: 
  543:   if( hlmask ){
  544:     int nLeft;                    /* Tokens to the left of first highlight */
  545:     int nRight;                   /* Tokens to the right of last highlight */
  546:     int nDesired;                 /* Ideal number of tokens to shift forward */
  547: 
  548:     for(nLeft=0; !(hlmask & ((u64)1 << nLeft)); nLeft++);
  549:     for(nRight=0; !(hlmask & ((u64)1 << (nSnippet-1-nRight))); nRight++);
  550:     nDesired = (nLeft-nRight)/2;
  551: 
  552:     /* Ideally, the start of the snippet should be pushed forward in the
  553:     ** document nDesired tokens. This block checks if there are actually
  554:     ** nDesired tokens to the right of the snippet. If so, *piPos and
  555:     ** *pHlMask are updated to shift the snippet nDesired tokens to the
  556:     ** right. Otherwise, the snippet is shifted by the number of tokens
  557:     ** available.
  558:     */
  559:     if( nDesired>0 ){
  560:       int nShift;                 /* Number of tokens to shift snippet by */
  561:       int iCurrent = 0;           /* Token counter */
  562:       int rc;                     /* Return Code */
  563:       sqlite3_tokenizer_module *pMod;
  564:       sqlite3_tokenizer_cursor *pC;
  565:       pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule;
  566: 
  567:       /* Open a cursor on zDoc/nDoc. Check if there are (nSnippet+nDesired)
  568:       ** or more tokens in zDoc/nDoc.
  569:       */
  570:       rc = pMod->xOpen(pTab->pTokenizer, zDoc, nDoc, &pC);
  571:       if( rc!=SQLITE_OK ){
  572:         return rc;
  573:       }
  574:       pC->pTokenizer = pTab->pTokenizer;
  575:       while( rc==SQLITE_OK && iCurrent<(nSnippet+nDesired) ){
  576:         const char *ZDUMMY; int DUMMY1, DUMMY2, DUMMY3;
  577:         rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &DUMMY2, &DUMMY3, &iCurrent);
  578:       }
  579:       pMod->xClose(pC);
  580:       if( rc!=SQLITE_OK && rc!=SQLITE_DONE ){ return rc; }
  581: 
  582:       nShift = (rc==SQLITE_DONE)+iCurrent-nSnippet;
  583:       assert( nShift<=nDesired );
  584:       if( nShift>0 ){
  585:         *piPos += nShift;
  586:         *pHlmask = hlmask >> nShift;
  587:       }
  588:     }
  589:   }
  590:   return SQLITE_OK;
  591: }
  592: 
  593: /*
  594: ** Extract the snippet text for fragment pFragment from cursor pCsr and
  595: ** append it to string buffer pOut.
  596: */
  597: static int fts3SnippetText(
  598:   Fts3Cursor *pCsr,               /* FTS3 Cursor */
  599:   SnippetFragment *pFragment,     /* Snippet to extract */
  600:   int iFragment,                  /* Fragment number */
  601:   int isLast,                     /* True for final fragment in snippet */
  602:   int nSnippet,                   /* Number of tokens in extracted snippet */
  603:   const char *zOpen,              /* String inserted before highlighted term */
  604:   const char *zClose,             /* String inserted after highlighted term */
  605:   const char *zEllipsis,          /* String inserted between snippets */
  606:   StrBuffer *pOut                 /* Write output here */
  607: ){
  608:   Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
  609:   int rc;                         /* Return code */
  610:   const char *zDoc;               /* Document text to extract snippet from */
  611:   int nDoc;                       /* Size of zDoc in bytes */
  612:   int iCurrent = 0;               /* Current token number of document */
  613:   int iEnd = 0;                   /* Byte offset of end of current token */
  614:   int isShiftDone = 0;            /* True after snippet is shifted */
  615:   int iPos = pFragment->iPos;     /* First token of snippet */
  616:   u64 hlmask = pFragment->hlmask; /* Highlight-mask for snippet */
  617:   int iCol = pFragment->iCol+1;   /* Query column to extract text from */
  618:   sqlite3_tokenizer_module *pMod; /* Tokenizer module methods object */
  619:   sqlite3_tokenizer_cursor *pC;   /* Tokenizer cursor open on zDoc/nDoc */
  620:   const char *ZDUMMY;             /* Dummy argument used with tokenizer */
  621:   int DUMMY1;                     /* Dummy argument used with tokenizer */
  622:   
  623:   zDoc = (const char *)sqlite3_column_text(pCsr->pStmt, iCol);
  624:   if( zDoc==0 ){
  625:     if( sqlite3_column_type(pCsr->pStmt, iCol)!=SQLITE_NULL ){
  626:       return SQLITE_NOMEM;
  627:     }
  628:     return SQLITE_OK;
  629:   }
  630:   nDoc = sqlite3_column_bytes(pCsr->pStmt, iCol);
  631: 
  632:   /* Open a token cursor on the document. */
  633:   pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule;
  634:   rc = pMod->xOpen(pTab->pTokenizer, zDoc, nDoc, &pC);
  635:   if( rc!=SQLITE_OK ){
  636:     return rc;
  637:   }
  638:   pC->pTokenizer = pTab->pTokenizer;
  639: 
  640:   while( rc==SQLITE_OK ){
  641:     int iBegin;                   /* Offset in zDoc of start of token */
  642:     int iFin;                     /* Offset in zDoc of end of token */
  643:     int isHighlight;              /* True for highlighted terms */
  644: 
  645:     rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &iBegin, &iFin, &iCurrent);
  646:     if( rc!=SQLITE_OK ){
  647:       if( rc==SQLITE_DONE ){
  648:         /* Special case - the last token of the snippet is also the last token
  649:         ** of the column. Append any punctuation that occurred between the end
  650:         ** of the previous token and the end of the document to the output. 
  651:         ** Then break out of the loop. */
  652:         rc = fts3StringAppend(pOut, &zDoc[iEnd], -1);
  653:       }
  654:       break;
  655:     }
  656:     if( iCurrent<iPos ){ continue; }
  657: 
  658:     if( !isShiftDone ){
  659:       int n = nDoc - iBegin;
  660:       rc = fts3SnippetShift(pTab, nSnippet, &zDoc[iBegin], n, &iPos, &hlmask);
  661:       isShiftDone = 1;
  662: 
  663:       /* Now that the shift has been done, check if the initial "..." are
  664:       ** required. They are required if (a) this is not the first fragment,
  665:       ** or (b) this fragment does not begin at position 0 of its column. 
  666:       */
  667:       if( rc==SQLITE_OK && (iPos>0 || iFragment>0) ){
  668:         rc = fts3StringAppend(pOut, zEllipsis, -1);
  669:       }
  670:       if( rc!=SQLITE_OK || iCurrent<iPos ) continue;
  671:     }
  672: 
  673:     if( iCurrent>=(iPos+nSnippet) ){
  674:       if( isLast ){
  675:         rc = fts3StringAppend(pOut, zEllipsis, -1);
  676:       }
  677:       break;
  678:     }
  679: 
  680:     /* Set isHighlight to true if this term should be highlighted. */
  681:     isHighlight = (hlmask & ((u64)1 << (iCurrent-iPos)))!=0;
  682: 
  683:     if( iCurrent>iPos ) rc = fts3StringAppend(pOut, &zDoc[iEnd], iBegin-iEnd);
  684:     if( rc==SQLITE_OK && isHighlight ) rc = fts3StringAppend(pOut, zOpen, -1);
  685:     if( rc==SQLITE_OK ) rc = fts3StringAppend(pOut, &zDoc[iBegin], iFin-iBegin);
  686:     if( rc==SQLITE_OK && isHighlight ) rc = fts3StringAppend(pOut, zClose, -1);
  687: 
  688:     iEnd = iFin;
  689:   }
  690: 
  691:   pMod->xClose(pC);
  692:   return rc;
  693: }
  694: 
  695: 
  696: /*
  697: ** This function is used to count the entries in a column-list (a 
  698: ** delta-encoded list of term offsets within a single column of a single 
  699: ** row). When this function is called, *ppCollist should point to the
  700: ** beginning of the first varint in the column-list (the varint that
  701: ** contains the position of the first matching term in the column data).
  702: ** Before returning, *ppCollist is set to point to the first byte after
  703: ** the last varint in the column-list (either the 0x00 signifying the end
  704: ** of the position-list, or the 0x01 that precedes the column number of
  705: ** the next column in the position-list).
  706: **
  707: ** The number of elements in the column-list is returned.
  708: */
  709: static int fts3ColumnlistCount(char **ppCollist){
  710:   char *pEnd = *ppCollist;
  711:   char c = 0;
  712:   int nEntry = 0;
  713: 
  714:   /* A column-list is terminated by either a 0x01 or 0x00. */
  715:   while( 0xFE & (*pEnd | c) ){
  716:     c = *pEnd++ & 0x80;
  717:     if( !c ) nEntry++;
  718:   }
  719: 
  720:   *ppCollist = pEnd;
  721:   return nEntry;
  722: }
  723: 
  724: /*
  725: ** fts3ExprIterate() callback used to collect the "global" matchinfo stats
  726: ** for a single query. 
  727: **
  728: ** fts3ExprIterate() callback to load the 'global' elements of a
  729: ** FTS3_MATCHINFO_HITS matchinfo array. The global stats are those elements 
  730: ** of the matchinfo array that are constant for all rows returned by the 
  731: ** current query.
  732: **
  733: ** Argument pCtx is actually a pointer to a struct of type MatchInfo. This
  734: ** function populates Matchinfo.aMatchinfo[] as follows:
  735: **
  736: **   for(iCol=0; iCol<nCol; iCol++){
  737: **     aMatchinfo[3*iPhrase*nCol + 3*iCol + 1] = X;
  738: **     aMatchinfo[3*iPhrase*nCol + 3*iCol + 2] = Y;
  739: **   }
  740: **
  741: ** where X is the number of matches for phrase iPhrase in column iCol of all
  742: ** rows of the table. Y is the number of rows for which column iCol contains
  743: ** at least one instance of phrase iPhrase.
  744: **
  745: ** If the phrase pExpr consists entirely of deferred tokens, then all X and
  746: ** Y values are set to nDoc, where nDoc is the number of documents in the 
  747: ** file system. This is done because the full-text index doclist is required
  748: ** to calculate these values properly, and the full-text index doclist is
  749: ** not available for deferred tokens.
  750: */
  751: static int fts3ExprGlobalHitsCb(
  752:   Fts3Expr *pExpr,                /* Phrase expression node */
  753:   int iPhrase,                    /* Phrase number (numbered from zero) */
  754:   void *pCtx                      /* Pointer to MatchInfo structure */
  755: ){
  756:   MatchInfo *p = (MatchInfo *)pCtx;
  757:   return sqlite3Fts3EvalPhraseStats(
  758:       p->pCursor, pExpr, &p->aMatchinfo[3*iPhrase*p->nCol]
  759:   );
  760: }
  761: 
  762: /*
  763: ** fts3ExprIterate() callback used to collect the "local" part of the
  764: ** FTS3_MATCHINFO_HITS array. The local stats are those elements of the 
  765: ** array that are different for each row returned by the query.
  766: */
  767: static int fts3ExprLocalHitsCb(
  768:   Fts3Expr *pExpr,                /* Phrase expression node */
  769:   int iPhrase,                    /* Phrase number */
  770:   void *pCtx                      /* Pointer to MatchInfo structure */
  771: ){
  772:   MatchInfo *p = (MatchInfo *)pCtx;
  773:   int iStart = iPhrase * p->nCol * 3;
  774:   int i;
  775: 
  776:   for(i=0; i<p->nCol; i++){
  777:     char *pCsr;
  778:     pCsr = sqlite3Fts3EvalPhrasePoslist(p->pCursor, pExpr, i);
  779:     if( pCsr ){
  780:       p->aMatchinfo[iStart+i*3] = fts3ColumnlistCount(&pCsr);
  781:     }else{
  782:       p->aMatchinfo[iStart+i*3] = 0;
  783:     }
  784:   }
  785: 
  786:   return SQLITE_OK;
  787: }
  788: 
  789: static int fts3MatchinfoCheck(
  790:   Fts3Table *pTab, 
  791:   char cArg,
  792:   char **pzErr
  793: ){
  794:   if( (cArg==FTS3_MATCHINFO_NPHRASE)
  795:    || (cArg==FTS3_MATCHINFO_NCOL)
  796:    || (cArg==FTS3_MATCHINFO_NDOC && pTab->bHasStat)
  797:    || (cArg==FTS3_MATCHINFO_AVGLENGTH && pTab->bHasStat)
  798:    || (cArg==FTS3_MATCHINFO_LENGTH && pTab->bHasDocsize)
  799:    || (cArg==FTS3_MATCHINFO_LCS)
  800:    || (cArg==FTS3_MATCHINFO_HITS)
  801:   ){
  802:     return SQLITE_OK;
  803:   }
  804:   *pzErr = sqlite3_mprintf("unrecognized matchinfo request: %c", cArg);
  805:   return SQLITE_ERROR;
  806: }
  807: 
  808: static int fts3MatchinfoSize(MatchInfo *pInfo, char cArg){
  809:   int nVal;                       /* Number of integers output by cArg */
  810: 
  811:   switch( cArg ){
  812:     case FTS3_MATCHINFO_NDOC:
  813:     case FTS3_MATCHINFO_NPHRASE: 
  814:     case FTS3_MATCHINFO_NCOL: 
  815:       nVal = 1;
  816:       break;
  817: 
  818:     case FTS3_MATCHINFO_AVGLENGTH:
  819:     case FTS3_MATCHINFO_LENGTH:
  820:     case FTS3_MATCHINFO_LCS:
  821:       nVal = pInfo->nCol;
  822:       break;
  823: 
  824:     default:
  825:       assert( cArg==FTS3_MATCHINFO_HITS );
  826:       nVal = pInfo->nCol * pInfo->nPhrase * 3;
  827:       break;
  828:   }
  829: 
  830:   return nVal;
  831: }
  832: 
  833: static int fts3MatchinfoSelectDoctotal(
  834:   Fts3Table *pTab,
  835:   sqlite3_stmt **ppStmt,
  836:   sqlite3_int64 *pnDoc,
  837:   const char **paLen
  838: ){
  839:   sqlite3_stmt *pStmt;
  840:   const char *a;
  841:   sqlite3_int64 nDoc;
  842: 
  843:   if( !*ppStmt ){
  844:     int rc = sqlite3Fts3SelectDoctotal(pTab, ppStmt);
  845:     if( rc!=SQLITE_OK ) return rc;
  846:   }
  847:   pStmt = *ppStmt;
  848:   assert( sqlite3_data_count(pStmt)==1 );
  849: 
  850:   a = sqlite3_column_blob(pStmt, 0);
  851:   a += sqlite3Fts3GetVarint(a, &nDoc);
  852:   if( nDoc==0 ) return FTS_CORRUPT_VTAB;
  853:   *pnDoc = (u32)nDoc;
  854: 
  855:   if( paLen ) *paLen = a;
  856:   return SQLITE_OK;
  857: }
  858: 
/*
** An instance of the following structure is used to store state while 
** iterating through a multi-column position-list corresponding to the
** hits for a single phrase on a single row in order to calculate the
** values for a matchinfo() FTS3_MATCHINFO_LCS request.
**
** fts3MatchinfoLcs() allocates one LcsIterator per phrase in the query.
*/
typedef struct LcsIterator LcsIterator;
struct LcsIterator {
  Fts3Expr *pExpr;                /* Pointer to phrase expression */
  int iPosOffset;                 /* Negated count of tokens in this phrase
                                  ** and all phrases before it; the value
                                  ** iPos starts from for each column */
  char *pRead;                    /* Read cursor within the position list for
                                  ** the current column; 0 once at EOF */
  int iPos;                       /* Current position (includes iPosOffset) */
};
  872: 
  873: /* 
  874: ** If LcsIterator.iCol is set to the following value, the iterator has
  875: ** finished iterating through all offsets for all columns.
  876: */
  877: #define LCS_ITERATOR_FINISHED 0x7FFFFFFF;
  878: 
  879: static int fts3MatchinfoLcsCb(
  880:   Fts3Expr *pExpr,                /* Phrase expression node */
  881:   int iPhrase,                    /* Phrase number (numbered from zero) */
  882:   void *pCtx                      /* Pointer to MatchInfo structure */
  883: ){
  884:   LcsIterator *aIter = (LcsIterator *)pCtx;
  885:   aIter[iPhrase].pExpr = pExpr;
  886:   return SQLITE_OK;
  887: }
  888: 
  889: /*
  890: ** Advance the iterator passed as an argument to the next position. Return
  891: ** 1 if the iterator is at EOF or if it now points to the start of the
  892: ** position list for the next column.
  893: */
  894: static int fts3LcsIteratorAdvance(LcsIterator *pIter){
  895:   char *pRead = pIter->pRead;
  896:   sqlite3_int64 iRead;
  897:   int rc = 0;
  898: 
  899:   pRead += sqlite3Fts3GetVarint(pRead, &iRead);
  900:   if( iRead==0 || iRead==1 ){
  901:     pRead = 0;
  902:     rc = 1;
  903:   }else{
  904:     pIter->iPos += (int)(iRead-2);
  905:   }
  906: 
  907:   pIter->pRead = pRead;
  908:   return rc;
  909: }
  910:   
  911: /*
  912: ** This function implements the FTS3_MATCHINFO_LCS matchinfo() flag. 
  913: **
  914: ** If the call is successful, the longest-common-substring lengths for each
  915: ** column are written into the first nCol elements of the pInfo->aMatchinfo[] 
  916: ** array before returning. SQLITE_OK is returned in this case.
  917: **
  918: ** Otherwise, if an error occurs, an SQLite error code is returned and the
  919: ** data written to the first nCol elements of pInfo->aMatchinfo[] is 
  920: ** undefined.
  921: */
  922: static int fts3MatchinfoLcs(Fts3Cursor *pCsr, MatchInfo *pInfo){
  923:   LcsIterator *aIter;
  924:   int i;
  925:   int iCol;
  926:   int nToken = 0;
  927: 
  928:   /* Allocate and populate the array of LcsIterator objects. The array
  929:   ** contains one element for each matchable phrase in the query.
  930:   **/
  931:   aIter = sqlite3_malloc(sizeof(LcsIterator) * pCsr->nPhrase);
  932:   if( !aIter ) return SQLITE_NOMEM;
  933:   memset(aIter, 0, sizeof(LcsIterator) * pCsr->nPhrase);
  934:   (void)fts3ExprIterate(pCsr->pExpr, fts3MatchinfoLcsCb, (void*)aIter);
  935: 
  936:   for(i=0; i<pInfo->nPhrase; i++){
  937:     LcsIterator *pIter = &aIter[i];
  938:     nToken -= pIter->pExpr->pPhrase->nToken;
  939:     pIter->iPosOffset = nToken;
  940:   }
  941: 
  942:   for(iCol=0; iCol<pInfo->nCol; iCol++){
  943:     int nLcs = 0;                 /* LCS value for this column */
  944:     int nLive = 0;                /* Number of iterators in aIter not at EOF */
  945: 
  946:     for(i=0; i<pInfo->nPhrase; i++){
  947:       LcsIterator *pIt = &aIter[i];
  948:       pIt->pRead = sqlite3Fts3EvalPhrasePoslist(pCsr, pIt->pExpr, iCol);
  949:       if( pIt->pRead ){
  950:         pIt->iPos = pIt->iPosOffset;
  951:         fts3LcsIteratorAdvance(&aIter[i]);
  952:         nLive++;
  953:       }
  954:     }
  955: 
  956:     while( nLive>0 ){
  957:       LcsIterator *pAdv = 0;      /* The iterator to advance by one position */
  958:       int nThisLcs = 0;           /* LCS for the current iterator positions */
  959: 
  960:       for(i=0; i<pInfo->nPhrase; i++){
  961:         LcsIterator *pIter = &aIter[i];
  962:         if( pIter->pRead==0 ){
  963:           /* This iterator is already at EOF for this column. */
  964:           nThisLcs = 0;
  965:         }else{
  966:           if( pAdv==0 || pIter->iPos<pAdv->iPos ){
  967:             pAdv = pIter;
  968:           }
  969:           if( nThisLcs==0 || pIter->iPos==pIter[-1].iPos ){
  970:             nThisLcs++;
  971:           }else{
  972:             nThisLcs = 1;
  973:           }
  974:           if( nThisLcs>nLcs ) nLcs = nThisLcs;
  975:         }
  976:       }
  977:       if( fts3LcsIteratorAdvance(pAdv) ) nLive--;
  978:     }
  979: 
  980:     pInfo->aMatchinfo[iCol] = nLcs;
  981:   }
  982: 
  983:   sqlite3_free(aIter);
  984:   return SQLITE_OK;
  985: }
  986: 
  987: /*
  988: ** Populate the buffer pInfo->aMatchinfo[] with an array of integers to
  989: ** be returned by the matchinfo() function. Argument zArg contains the 
  990: ** format string passed as the second argument to matchinfo (or the
  991: ** default value "pcx" if no second argument was specified). The format
  992: ** string has already been validated and the pInfo->aMatchinfo[] array
  993: ** is guaranteed to be large enough for the output.
  994: **
  995: ** If bGlobal is true, then populate all fields of the matchinfo() output.
  996: ** If it is false, then assume that those fields that do not change between
  997: ** rows (i.e. FTS3_MATCHINFO_NPHRASE, NCOL, NDOC, AVGLENGTH and part of HITS)
  998: ** have already been populated.
  999: **
 1000: ** Return SQLITE_OK if successful, or an SQLite error code if an error 
 1001: ** occurs. If a value other than SQLITE_OK is returned, the state the
 1002: ** pInfo->aMatchinfo[] buffer is left in is undefined.
 1003: */
 1004: static int fts3MatchinfoValues(
 1005:   Fts3Cursor *pCsr,               /* FTS3 cursor object */
 1006:   int bGlobal,                    /* True to grab the global stats */
 1007:   MatchInfo *pInfo,               /* Matchinfo context object */
 1008:   const char *zArg                /* Matchinfo format string */
 1009: ){
 1010:   int rc = SQLITE_OK;
 1011:   int i;
 1012:   Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
 1013:   sqlite3_stmt *pSelect = 0;
 1014: 
 1015:   for(i=0; rc==SQLITE_OK && zArg[i]; i++){
 1016: 
 1017:     switch( zArg[i] ){
 1018:       case FTS3_MATCHINFO_NPHRASE:
 1019:         if( bGlobal ) pInfo->aMatchinfo[0] = pInfo->nPhrase;
 1020:         break;
 1021: 
 1022:       case FTS3_MATCHINFO_NCOL:
 1023:         if( bGlobal ) pInfo->aMatchinfo[0] = pInfo->nCol;
 1024:         break;
 1025:         
 1026:       case FTS3_MATCHINFO_NDOC:
 1027:         if( bGlobal ){
 1028:           sqlite3_int64 nDoc = 0;
 1029:           rc = fts3MatchinfoSelectDoctotal(pTab, &pSelect, &nDoc, 0);
 1030:           pInfo->aMatchinfo[0] = (u32)nDoc;
 1031:         }
 1032:         break;
 1033: 
 1034:       case FTS3_MATCHINFO_AVGLENGTH: 
 1035:         if( bGlobal ){
 1036:           sqlite3_int64 nDoc;     /* Number of rows in table */
 1037:           const char *a;          /* Aggregate column length array */
 1038: 
 1039:           rc = fts3MatchinfoSelectDoctotal(pTab, &pSelect, &nDoc, &a);
 1040:           if( rc==SQLITE_OK ){
 1041:             int iCol;
 1042:             for(iCol=0; iCol<pInfo->nCol; iCol++){
 1043:               u32 iVal;
 1044:               sqlite3_int64 nToken;
 1045:               a += sqlite3Fts3GetVarint(a, &nToken);
 1046:               iVal = (u32)(((u32)(nToken&0xffffffff)+nDoc/2)/nDoc);
 1047:               pInfo->aMatchinfo[iCol] = iVal;
 1048:             }
 1049:           }
 1050:         }
 1051:         break;
 1052: 
 1053:       case FTS3_MATCHINFO_LENGTH: {
 1054:         sqlite3_stmt *pSelectDocsize = 0;
 1055:         rc = sqlite3Fts3SelectDocsize(pTab, pCsr->iPrevId, &pSelectDocsize);
 1056:         if( rc==SQLITE_OK ){
 1057:           int iCol;
 1058:           const char *a = sqlite3_column_blob(pSelectDocsize, 0);
 1059:           for(iCol=0; iCol<pInfo->nCol; iCol++){
 1060:             sqlite3_int64 nToken;
 1061:             a += sqlite3Fts3GetVarint(a, &nToken);
 1062:             pInfo->aMatchinfo[iCol] = (u32)nToken;
 1063:           }
 1064:         }
 1065:         sqlite3_reset(pSelectDocsize);
 1066:         break;
 1067:       }
 1068: 
 1069:       case FTS3_MATCHINFO_LCS:
 1070:         rc = fts3ExprLoadDoclists(pCsr, 0, 0);
 1071:         if( rc==SQLITE_OK ){
 1072:           rc = fts3MatchinfoLcs(pCsr, pInfo);
 1073:         }
 1074:         break;
 1075: 
 1076:       default: {
 1077:         Fts3Expr *pExpr;
 1078:         assert( zArg[i]==FTS3_MATCHINFO_HITS );
 1079:         pExpr = pCsr->pExpr;
 1080:         rc = fts3ExprLoadDoclists(pCsr, 0, 0);
 1081:         if( rc!=SQLITE_OK ) break;
 1082:         if( bGlobal ){
 1083:           if( pCsr->pDeferred ){
 1084:             rc = fts3MatchinfoSelectDoctotal(pTab, &pSelect, &pInfo->nDoc, 0);
 1085:             if( rc!=SQLITE_OK ) break;
 1086:           }
 1087:           rc = fts3ExprIterate(pExpr, fts3ExprGlobalHitsCb,(void*)pInfo);
 1088:           if( rc!=SQLITE_OK ) break;
 1089:         }
 1090:         (void)fts3ExprIterate(pExpr, fts3ExprLocalHitsCb,(void*)pInfo);
 1091:         break;
 1092:       }
 1093:     }
 1094: 
 1095:     pInfo->aMatchinfo += fts3MatchinfoSize(pInfo, zArg[i]);
 1096:   }
 1097: 
 1098:   sqlite3_reset(pSelect);
 1099:   return rc;
 1100: }
 1101: 
 1102: 
 1103: /*
 1104: ** Populate pCsr->aMatchinfo[] with data for the current row. The 
 1105: ** 'matchinfo' data is an array of 32-bit unsigned integers (C type u32).
 1106: */
 1107: static int fts3GetMatchinfo(
 1108:   Fts3Cursor *pCsr,               /* FTS3 Cursor object */
 1109:   const char *zArg                /* Second argument to matchinfo() function */
 1110: ){
 1111:   MatchInfo sInfo;
 1112:   Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
 1113:   int rc = SQLITE_OK;
 1114:   int bGlobal = 0;                /* Collect 'global' stats as well as local */
 1115: 
 1116:   memset(&sInfo, 0, sizeof(MatchInfo));
 1117:   sInfo.pCursor = pCsr;
 1118:   sInfo.nCol = pTab->nColumn;
 1119: 
 1120:   /* If there is cached matchinfo() data, but the format string for the 
 1121:   ** cache does not match the format string for this request, discard 
 1122:   ** the cached data. */
 1123:   if( pCsr->zMatchinfo && strcmp(pCsr->zMatchinfo, zArg) ){
 1124:     assert( pCsr->aMatchinfo );
 1125:     sqlite3_free(pCsr->aMatchinfo);
 1126:     pCsr->zMatchinfo = 0;
 1127:     pCsr->aMatchinfo = 0;
 1128:   }
 1129: 
 1130:   /* If Fts3Cursor.aMatchinfo[] is NULL, then this is the first time the
 1131:   ** matchinfo function has been called for this query. In this case 
 1132:   ** allocate the array used to accumulate the matchinfo data and
 1133:   ** initialize those elements that are constant for every row.
 1134:   */
 1135:   if( pCsr->aMatchinfo==0 ){
 1136:     int nMatchinfo = 0;           /* Number of u32 elements in match-info */
 1137:     int nArg;                     /* Bytes in zArg */
 1138:     int i;                        /* Used to iterate through zArg */
 1139: 
 1140:     /* Determine the number of phrases in the query */
 1141:     pCsr->nPhrase = fts3ExprPhraseCount(pCsr->pExpr);
 1142:     sInfo.nPhrase = pCsr->nPhrase;
 1143: 
 1144:     /* Determine the number of integers in the buffer returned by this call. */
 1145:     for(i=0; zArg[i]; i++){
 1146:       nMatchinfo += fts3MatchinfoSize(&sInfo, zArg[i]);
 1147:     }
 1148: 
 1149:     /* Allocate space for Fts3Cursor.aMatchinfo[] and Fts3Cursor.zMatchinfo. */
 1150:     nArg = (int)strlen(zArg);
 1151:     pCsr->aMatchinfo = (u32 *)sqlite3_malloc(sizeof(u32)*nMatchinfo + nArg + 1);
 1152:     if( !pCsr->aMatchinfo ) return SQLITE_NOMEM;
 1153: 
 1154:     pCsr->zMatchinfo = (char *)&pCsr->aMatchinfo[nMatchinfo];
 1155:     pCsr->nMatchinfo = nMatchinfo;
 1156:     memcpy(pCsr->zMatchinfo, zArg, nArg+1);
 1157:     memset(pCsr->aMatchinfo, 0, sizeof(u32)*nMatchinfo);
 1158:     pCsr->isMatchinfoNeeded = 1;
 1159:     bGlobal = 1;
 1160:   }
 1161: 
 1162:   sInfo.aMatchinfo = pCsr->aMatchinfo;
 1163:   sInfo.nPhrase = pCsr->nPhrase;
 1164:   if( pCsr->isMatchinfoNeeded ){
 1165:     rc = fts3MatchinfoValues(pCsr, bGlobal, &sInfo, zArg);
 1166:     pCsr->isMatchinfoNeeded = 0;
 1167:   }
 1168: 
 1169:   return rc;
 1170: }
 1171: 
 1172: /*
 1173: ** Implementation of snippet() function.
 1174: */
 1175: void sqlite3Fts3Snippet(
 1176:   sqlite3_context *pCtx,          /* SQLite function call context */
 1177:   Fts3Cursor *pCsr,               /* Cursor object */
 1178:   const char *zStart,             /* Snippet start text - "<b>" */
 1179:   const char *zEnd,               /* Snippet end text - "</b>" */
 1180:   const char *zEllipsis,          /* Snippet ellipsis text - "<b>...</b>" */
 1181:   int iCol,                       /* Extract snippet from this column */
 1182:   int nToken                      /* Approximate number of tokens in snippet */
 1183: ){
 1184:   Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
 1185:   int rc = SQLITE_OK;
 1186:   int i;
 1187:   StrBuffer res = {0, 0, 0};
 1188: 
 1189:   /* The returned text includes up to four fragments of text extracted from
 1190:   ** the data in the current row. The first iteration of the for(...) loop
 1191:   ** below attempts to locate a single fragment of text nToken tokens in 
 1192:   ** size that contains at least one instance of all phrases in the query
 1193:   ** expression that appear in the current row. If such a fragment of text
 1194:   ** cannot be found, the second iteration of the loop attempts to locate
 1195:   ** a pair of fragments, and so on.
 1196:   */
 1197:   int nSnippet = 0;               /* Number of fragments in this snippet */
 1198:   SnippetFragment aSnippet[4];    /* Maximum of 4 fragments per snippet */
 1199:   int nFToken = -1;               /* Number of tokens in each fragment */
 1200: 
 1201:   if( !pCsr->pExpr ){
 1202:     sqlite3_result_text(pCtx, "", 0, SQLITE_STATIC);
 1203:     return;
 1204:   }
 1205: 
 1206:   for(nSnippet=1; 1; nSnippet++){
 1207: 
 1208:     int iSnip;                    /* Loop counter 0..nSnippet-1 */
 1209:     u64 mCovered = 0;             /* Bitmask of phrases covered by snippet */
 1210:     u64 mSeen = 0;                /* Bitmask of phrases seen by BestSnippet() */
 1211: 
 1212:     if( nToken>=0 ){
 1213:       nFToken = (nToken+nSnippet-1) / nSnippet;
 1214:     }else{
 1215:       nFToken = -1 * nToken;
 1216:     }
 1217: 
 1218:     for(iSnip=0; iSnip<nSnippet; iSnip++){
 1219:       int iBestScore = -1;        /* Best score of columns checked so far */
 1220:       int iRead;                  /* Used to iterate through columns */
 1221:       SnippetFragment *pFragment = &aSnippet[iSnip];
 1222: 
 1223:       memset(pFragment, 0, sizeof(*pFragment));
 1224: 
 1225:       /* Loop through all columns of the table being considered for snippets.
 1226:       ** If the iCol argument to this function was negative, this means all
 1227:       ** columns of the FTS3 table. Otherwise, only column iCol is considered.
 1228:       */
 1229:       for(iRead=0; iRead<pTab->nColumn; iRead++){
 1230:         SnippetFragment sF = {0, 0, 0, 0};
 1231:         int iS;
 1232:         if( iCol>=0 && iRead!=iCol ) continue;
 1233: 
 1234:         /* Find the best snippet of nFToken tokens in column iRead. */
 1235:         rc = fts3BestSnippet(nFToken, pCsr, iRead, mCovered, &mSeen, &sF, &iS);
 1236:         if( rc!=SQLITE_OK ){
 1237:           goto snippet_out;
 1238:         }
 1239:         if( iS>iBestScore ){
 1240:           *pFragment = sF;
 1241:           iBestScore = iS;
 1242:         }
 1243:       }
 1244: 
 1245:       mCovered |= pFragment->covered;
 1246:     }
 1247: 
 1248:     /* If all query phrases seen by fts3BestSnippet() are present in at least
 1249:     ** one of the nSnippet snippet fragments, break out of the loop.
 1250:     */
 1251:     assert( (mCovered&mSeen)==mCovered );
 1252:     if( mSeen==mCovered || nSnippet==SizeofArray(aSnippet) ) break;
 1253:   }
 1254: 
 1255:   assert( nFToken>0 );
 1256: 
 1257:   for(i=0; i<nSnippet && rc==SQLITE_OK; i++){
 1258:     rc = fts3SnippetText(pCsr, &aSnippet[i], 
 1259:         i, (i==nSnippet-1), nFToken, zStart, zEnd, zEllipsis, &res
 1260:     );
 1261:   }
 1262: 
 1263:  snippet_out:
 1264:   sqlite3Fts3SegmentsClose(pTab);
 1265:   if( rc!=SQLITE_OK ){
 1266:     sqlite3_result_error_code(pCtx, rc);
 1267:     sqlite3_free(res.z);
 1268:   }else{
 1269:     sqlite3_result_text(pCtx, res.z, -1, sqlite3_free);
 1270:   }
 1271: }
 1272: 
 1273: 
/*
** State used by the offsets() implementation, sqlite3Fts3Offsets().
**
** A TermOffset tracks the position list for one term (token) of the
** query within the column currently being processed. A TermOffsetCtx is
** the context passed to the fts3ExprTermOffsetInit() callback, which
** fills in the aTerm[] array for one column.
*/
typedef struct TermOffset TermOffset;
typedef struct TermOffsetCtx TermOffsetCtx;

struct TermOffset {
  char *pList;                    /* Position-list (0 if none, or exhausted) */
  int iPos;                       /* Position just read from pList */
  int iOff;                       /* Offset of this term from read positions:
                                  ** the number of tokens that follow this
                                  ** term within its phrase */
};

struct TermOffsetCtx {
  Fts3Cursor *pCsr;               /* Cursor the offsets are computed for */
  int iCol;                       /* Column of table to populate aTerm for */
  int iTerm;                      /* Index of next aTerm[] entry to fill in */
  sqlite3_int64 iDocid;           /* Set from pCsr->iPrevId by the caller */
  TermOffset *aTerm;              /* Array with one entry per query term */
};
 1290: 
 1291: /*
 1292: ** This function is an fts3ExprIterate() callback used by sqlite3Fts3Offsets().
 1293: */
 1294: static int fts3ExprTermOffsetInit(Fts3Expr *pExpr, int iPhrase, void *ctx){
 1295:   TermOffsetCtx *p = (TermOffsetCtx *)ctx;
 1296:   int nTerm;                      /* Number of tokens in phrase */
 1297:   int iTerm;                      /* For looping through nTerm phrase terms */
 1298:   char *pList;                    /* Pointer to position list for phrase */
 1299:   int iPos = 0;                   /* First position in position-list */
 1300: 
 1301:   UNUSED_PARAMETER(iPhrase);
 1302:   pList = sqlite3Fts3EvalPhrasePoslist(p->pCsr, pExpr, p->iCol);
 1303:   nTerm = pExpr->pPhrase->nToken;
 1304:   if( pList ){
 1305:     fts3GetDeltaPosition(&pList, &iPos);
 1306:     assert( iPos>=0 );
 1307:   }
 1308: 
 1309:   for(iTerm=0; iTerm<nTerm; iTerm++){
 1310:     TermOffset *pT = &p->aTerm[p->iTerm++];
 1311:     pT->iOff = nTerm-iTerm-1;
 1312:     pT->pList = pList;
 1313:     pT->iPos = iPos;
 1314:   }
 1315: 
 1316:   return SQLITE_OK;
 1317: }
 1318: 
 1319: /*
 1320: ** Implementation of offsets() function.
 1321: */
 1322: void sqlite3Fts3Offsets(
 1323:   sqlite3_context *pCtx,          /* SQLite function call context */
 1324:   Fts3Cursor *pCsr                /* Cursor object */
 1325: ){
 1326:   Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
 1327:   sqlite3_tokenizer_module const *pMod = pTab->pTokenizer->pModule;
 1328:   const char *ZDUMMY;             /* Dummy argument used with xNext() */
 1329:   int NDUMMY;                     /* Dummy argument used with xNext() */
 1330:   int rc;                         /* Return Code */
 1331:   int nToken;                     /* Number of tokens in query */
 1332:   int iCol;                       /* Column currently being processed */
 1333:   StrBuffer res = {0, 0, 0};      /* Result string */
 1334:   TermOffsetCtx sCtx;             /* Context for fts3ExprTermOffsetInit() */
 1335: 
 1336:   if( !pCsr->pExpr ){
 1337:     sqlite3_result_text(pCtx, "", 0, SQLITE_STATIC);
 1338:     return;
 1339:   }
 1340: 
 1341:   memset(&sCtx, 0, sizeof(sCtx));
 1342:   assert( pCsr->isRequireSeek==0 );
 1343: 
 1344:   /* Count the number of terms in the query */
 1345:   rc = fts3ExprLoadDoclists(pCsr, 0, &nToken);
 1346:   if( rc!=SQLITE_OK ) goto offsets_out;
 1347: 
 1348:   /* Allocate the array of TermOffset iterators. */
 1349:   sCtx.aTerm = (TermOffset *)sqlite3_malloc(sizeof(TermOffset)*nToken);
 1350:   if( 0==sCtx.aTerm ){
 1351:     rc = SQLITE_NOMEM;
 1352:     goto offsets_out;
 1353:   }
 1354:   sCtx.iDocid = pCsr->iPrevId;
 1355:   sCtx.pCsr = pCsr;
 1356: 
 1357:   /* Loop through the table columns, appending offset information to 
 1358:   ** string-buffer res for each column.
 1359:   */
 1360:   for(iCol=0; iCol<pTab->nColumn; iCol++){
 1361:     sqlite3_tokenizer_cursor *pC; /* Tokenizer cursor */
 1362:     int iStart;
 1363:     int iEnd;
 1364:     int iCurrent;
 1365:     const char *zDoc;
 1366:     int nDoc;
 1367: 
 1368:     /* Initialize the contents of sCtx.aTerm[] for column iCol. There is 
 1369:     ** no way that this operation can fail, so the return code from
 1370:     ** fts3ExprIterate() can be discarded.
 1371:     */
 1372:     sCtx.iCol = iCol;
 1373:     sCtx.iTerm = 0;
 1374:     (void)fts3ExprIterate(pCsr->pExpr, fts3ExprTermOffsetInit, (void *)&sCtx);
 1375: 
 1376:     /* Retreive the text stored in column iCol. If an SQL NULL is stored 
 1377:     ** in column iCol, jump immediately to the next iteration of the loop.
 1378:     ** If an OOM occurs while retrieving the data (this can happen if SQLite
 1379:     ** needs to transform the data from utf-16 to utf-8), return SQLITE_NOMEM 
 1380:     ** to the caller. 
 1381:     */
 1382:     zDoc = (const char *)sqlite3_column_text(pCsr->pStmt, iCol+1);
 1383:     nDoc = sqlite3_column_bytes(pCsr->pStmt, iCol+1);
 1384:     if( zDoc==0 ){
 1385:       if( sqlite3_column_type(pCsr->pStmt, iCol+1)==SQLITE_NULL ){
 1386:         continue;
 1387:       }
 1388:       rc = SQLITE_NOMEM;
 1389:       goto offsets_out;
 1390:     }
 1391: 
 1392:     /* Initialize a tokenizer iterator to iterate through column iCol. */
 1393:     rc = pMod->xOpen(pTab->pTokenizer, zDoc, nDoc, &pC);
 1394:     if( rc!=SQLITE_OK ) goto offsets_out;
 1395:     pC->pTokenizer = pTab->pTokenizer;
 1396: 
 1397:     rc = pMod->xNext(pC, &ZDUMMY, &NDUMMY, &iStart, &iEnd, &iCurrent);
 1398:     while( rc==SQLITE_OK ){
 1399:       int i;                      /* Used to loop through terms */
 1400:       int iMinPos = 0x7FFFFFFF;   /* Position of next token */
 1401:       TermOffset *pTerm = 0;      /* TermOffset associated with next token */
 1402: 
 1403:       for(i=0; i<nToken; i++){
 1404:         TermOffset *pT = &sCtx.aTerm[i];
 1405:         if( pT->pList && (pT->iPos-pT->iOff)<iMinPos ){
 1406:           iMinPos = pT->iPos-pT->iOff;
 1407:           pTerm = pT;
 1408:         }
 1409:       }
 1410: 
 1411:       if( !pTerm ){
 1412:         /* All offsets for this column have been gathered. */
 1413:         rc = SQLITE_DONE;
 1414:       }else{
 1415:         assert( iCurrent<=iMinPos );
 1416:         if( 0==(0xFE&*pTerm->pList) ){
 1417:           pTerm->pList = 0;
 1418:         }else{
 1419:           fts3GetDeltaPosition(&pTerm->pList, &pTerm->iPos);
 1420:         }
 1421:         while( rc==SQLITE_OK && iCurrent<iMinPos ){
 1422:           rc = pMod->xNext(pC, &ZDUMMY, &NDUMMY, &iStart, &iEnd, &iCurrent);
 1423:         }
 1424:         if( rc==SQLITE_OK ){
 1425:           char aBuffer[64];
 1426:           sqlite3_snprintf(sizeof(aBuffer), aBuffer, 
 1427:               "%d %d %d %d ", iCol, pTerm-sCtx.aTerm, iStart, iEnd-iStart
 1428:           );
 1429:           rc = fts3StringAppend(&res, aBuffer, -1);
 1430:         }else if( rc==SQLITE_DONE && pTab->zContentTbl==0 ){
 1431:           rc = FTS_CORRUPT_VTAB;
 1432:         }
 1433:       }
 1434:     }
 1435:     if( rc==SQLITE_DONE ){
 1436:       rc = SQLITE_OK;
 1437:     }
 1438: 
 1439:     pMod->xClose(pC);
 1440:     if( rc!=SQLITE_OK ) goto offsets_out;
 1441:   }
 1442: 
 1443:  offsets_out:
 1444:   sqlite3_free(sCtx.aTerm);
 1445:   assert( rc!=SQLITE_DONE );
 1446:   sqlite3Fts3SegmentsClose(pTab);
 1447:   if( rc!=SQLITE_OK ){
 1448:     sqlite3_result_error_code(pCtx,  rc);
 1449:     sqlite3_free(res.z);
 1450:   }else{
 1451:     sqlite3_result_text(pCtx, res.z, res.n-1, sqlite3_free);
 1452:   }
 1453:   return;
 1454: }
 1455: 
 1456: /*
 1457: ** Implementation of matchinfo() function.
 1458: */
 1459: void sqlite3Fts3Matchinfo(
 1460:   sqlite3_context *pContext,      /* Function call context */
 1461:   Fts3Cursor *pCsr,               /* FTS3 table cursor */
 1462:   const char *zArg                /* Second arg to matchinfo() function */
 1463: ){
 1464:   Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
 1465:   int rc;
 1466:   int i;
 1467:   const char *zFormat;
 1468: 
 1469:   if( zArg ){
 1470:     for(i=0; zArg[i]; i++){
 1471:       char *zErr = 0;
 1472:       if( fts3MatchinfoCheck(pTab, zArg[i], &zErr) ){
 1473:         sqlite3_result_error(pContext, zErr, -1);
 1474:         sqlite3_free(zErr);
 1475:         return;
 1476:       }
 1477:     }
 1478:     zFormat = zArg;
 1479:   }else{
 1480:     zFormat = FTS3_MATCHINFO_DEFAULT;
 1481:   }
 1482: 
 1483:   if( !pCsr->pExpr ){
 1484:     sqlite3_result_blob(pContext, "", 0, SQLITE_STATIC);
 1485:     return;
 1486:   }
 1487: 
 1488:   /* Retrieve matchinfo() data. */
 1489:   rc = fts3GetMatchinfo(pCsr, zFormat);
 1490:   sqlite3Fts3SegmentsClose(pTab);
 1491: 
 1492:   if( rc!=SQLITE_OK ){
 1493:     sqlite3_result_error_code(pContext, rc);
 1494:   }else{
 1495:     int n = pCsr->nMatchinfo * sizeof(u32);
 1496:     sqlite3_result_blob(pContext, pCsr->aMatchinfo, n, SQLITE_TRANSIENT);
 1497:   }
 1498: }
 1499: 
 1500: #endif

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>