File:  [ELWIX - Embedded LightWeight unIX -] / embedaddon / sqlite3 / ext / fts1 / fulltext.c
Revision 1.1.1.1 (vendor branch): download - view: text, annotated - select for diffs - revision graph
Tue Feb 21 17:04:17 2012 UTC (13 years, 1 month ago) by misho
Branches: sqlite3, MAIN
CVS tags: v3_7_10, HEAD
sqlite3

    1: /* The author disclaims copyright to this source code.
    2:  *
    3:  * This is an SQLite module implementing full-text search.
    4:  */
    5: 
    6: #include <assert.h>
    7: #if !defined(__APPLE__)
    8: #include <malloc.h>
    9: #else
   10: #include <stdlib.h>
   11: #endif
   12: #include <stdio.h>
   13: #include <string.h>
   14: #include <ctype.h>
   15: 
   16: #include "fulltext.h"
   17: #include "ft_hash.h"
   18: #include "tokenizer.h"
   19: #include "sqlite3.h"
   20: #include "sqlite3ext.h"
   21: SQLITE_EXTENSION_INIT1
   22: 
   23: /* utility functions */
   24: 
   25: /* We encode variable-length integers in little-endian order using seven bits
   26:  * per byte as follows:
   27: **
   28: ** KEY:
   29: **         A = 0xxxxxxx    7 bits of data and one flag bit
   30: **         B = 1xxxxxxx    7 bits of data and one flag bit
   31: **
   32: **  7 bits - A
   33: ** 14 bits - BA
   34: ** 21 bits - BBA
   35: ** and so on.
   36: */
   37: 
   38: /* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */
   39: #define VARINT_MAX 10
   40: 
   41: /* Write a 64-bit variable-length integer to memory starting at p[0].
   42:  * The length of data written will be between 1 and VARINT_MAX bytes.
   43:  * The number of bytes written is returned. */
   44: static int putVarint(char *p, sqlite_int64 v){
   45:   unsigned char *q = (unsigned char *) p;
   46:   sqlite_uint64 vu = v;
   47:   do{
   48:     *q++ = (unsigned char) ((vu & 0x7f) | 0x80);
   49:     vu >>= 7;
   50:   }while( vu!=0 );
   51:   q[-1] &= 0x7f;  /* turn off high bit in final byte */
   52:   assert( q - (unsigned char *)p <= VARINT_MAX );
   53:   return (int) (q - (unsigned char *)p);
   54: }
   55: 
   56: /* Read a 64-bit variable-length integer from memory starting at p[0].
   57:  * Return the number of bytes read, or 0 on error.
   58:  * The value is stored in *v. */
   59: static int getVarint(const char *p, sqlite_int64 *v){
   60:   const unsigned char *q = (const unsigned char *) p;
   61:   sqlite_uint64 x = 0, y = 1;
   62:   while( (*q & 0x80) == 0x80 ){
   63:     x += y * (*q++ & 0x7f);
   64:     y <<= 7;
   65:     if( q - (unsigned char *)p >= VARINT_MAX ){  /* bad data */
   66:       assert( 0 );
   67:       return 0;
   68:     }
   69:   }
   70:   x += y * (*q++);
   71:   *v = (sqlite_int64) x;
   72:   return (int) (q - (unsigned char *)p);
   73: }
   74: 
   75: static int getVarint32(const char *p, int *pi){
   76:  sqlite_int64 i;
   77:  int ret = getVarint(p, &i);
   78:  *pi = (int) i;
   79:  assert( *pi==i );
   80:  return ret;
   81: }
   82: 
   83: /*** Document lists ***
   84:  *
   85:  * A document list holds a sorted list of varint-encoded document IDs.
   86:  *
   87:  * A doclist with type DL_POSITIONS_OFFSETS is stored like this:
   88:  *
   89:  * array {
   90:  *   varint docid;
   91:  *   array {
   92:  *     varint position;     (delta from previous position plus 1, or 0 for end)
   93:  *     varint startOffset;  (delta from previous startOffset)
   94:  *     varint endOffset;    (delta from startOffset)
   95:  *   }
   96:  * }
   97:  *
   98:  * Here, array { X } means zero or more occurrences of X, adjacent in memory.
   99:  *
  100:  * A doclist with type DL_POSITIONS is like the above, but holds only docids
  101:  * and positions without offset information.
  102:  *
  103:  * A doclist with type DL_DOCIDS is like the above, but holds only docids
  104:  * without positions or offset information.
  105:  *
  106:  * On disk, every document list has positions and offsets, so we don't bother
  107:  * to serialize a doclist's type.
  108:  * 
  109:  * We don't yet delta-encode document IDs; doing so will probably be a
  110:  * modest win.
  111:  *
  112:  * NOTE(shess) I've thought of a slightly (1%) better offset encoding.
  113:  * After the first offset, estimate the next offset by using the
  114:  * current token position and the previous token position and offset,
  115:  * offset to handle some variance.  So the estimate would be
  116:  * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded
  117:  * as normal.  Offsets more than 64 chars from the estimate are
  118:  * encoded as the delta to the previous start offset + 128.  An
  119:  * additional tiny increment can be gained by using the end offset of
  120:  * the previous token to make the estimate a tiny bit more precise.
  121: */
  122: 
  123: typedef enum DocListType {
  124:   DL_DOCIDS,              /* docids only */
  125:   DL_POSITIONS,           /* docids + positions */
  126:   DL_POSITIONS_OFFSETS    /* docids + positions + offsets */
  127: } DocListType;
  128: 
  129: typedef struct DocList {
  130:   char *pData;
  131:   int nData;
  132:   DocListType iType;
  133:   int iLastPos;       /* the last position written */
  134:   int iLastOffset;    /* the last start offset written */
  135: } DocList;
  136: 
  137: /* Initialize a new DocList to hold the given data. */
  138: static void docListInit(DocList *d, DocListType iType,
  139:                         const char *pData, int nData){
  140:   d->nData = nData;
  141:   if( nData>0 ){
  142:     d->pData = malloc(nData);
  143:     memcpy(d->pData, pData, nData);
  144:   } else {
  145:     d->pData = NULL;
  146:   }
  147:   d->iType = iType;
  148:   d->iLastPos = 0;
  149:   d->iLastOffset = 0;
  150: }
  151: 
  152: /* Create a new dynamically-allocated DocList. */
  153: static DocList *docListNew(DocListType iType){
  154:   DocList *d = (DocList *) malloc(sizeof(DocList));
  155:   docListInit(d, iType, 0, 0);
  156:   return d;
  157: }
  158: 
  159: static void docListDestroy(DocList *d){
  160:   free(d->pData);
  161: #ifndef NDEBUG
  162:   memset(d, 0x55, sizeof(*d));
  163: #endif
  164: }
  165: 
  166: static void docListDelete(DocList *d){
  167:   docListDestroy(d);
  168:   free(d);
  169: }
  170: 
  171: static char *docListEnd(DocList *d){
  172:   return d->pData + d->nData;
  173: }
  174: 
  175: /* Append a varint to a DocList's data. */
  176: static void appendVarint(DocList *d, sqlite_int64 i){
  177:   char c[VARINT_MAX];
  178:   int n = putVarint(c, i);
  179:   d->pData = realloc(d->pData, d->nData + n);
  180:   memcpy(d->pData + d->nData, c, n);
  181:   d->nData += n;
  182: }
  183: 
  184: static void docListAddDocid(DocList *d, sqlite_int64 iDocid){
  185:   appendVarint(d, iDocid);
  186:   d->iLastPos = 0;
  187: }
  188: 
  189: /* Add a position to the last position list in a doclist. */
  190: static void docListAddPos(DocList *d, int iPos){
  191:   assert( d->iType>=DL_POSITIONS );
  192:   appendVarint(d, iPos-d->iLastPos+1);
  193:   d->iLastPos = iPos;
  194: }
  195: 
  196: static void docListAddPosOffset(DocList *d, int iPos,
  197:                                 int iStartOffset, int iEndOffset){
  198:   assert( d->iType==DL_POSITIONS_OFFSETS );
  199:   docListAddPos(d, iPos);
  200:   appendVarint(d, iStartOffset-d->iLastOffset);
  201:   d->iLastOffset = iStartOffset;
  202:   appendVarint(d, iEndOffset-iStartOffset);
  203: }
  204: 
  205: /* Terminate the last position list in the given doclist. */
  206: static void docListAddEndPos(DocList *d){
  207:   appendVarint(d, 0);
  208: }
  209: 
  210: typedef struct DocListReader {
  211:   DocList *pDoclist;
  212:   char *p;
  213:   int iLastPos;    /* the last position read */
  214: } DocListReader;
  215: 
  216: static void readerInit(DocListReader *r, DocList *pDoclist){
  217:   r->pDoclist = pDoclist;
  218:   if( pDoclist!=NULL ){
  219:     r->p = pDoclist->pData;
  220:   }
  221:   r->iLastPos = 0;
  222: }
  223: 
  224: static int readerAtEnd(DocListReader *pReader){
  225:   return pReader->p >= docListEnd(pReader->pDoclist);
  226: }
  227: 
  228: /* Peek at the next docid without advancing the read pointer. */
  229: static sqlite_int64 peekDocid(DocListReader *pReader){
  230:   sqlite_int64 ret;
  231:   assert( !readerAtEnd(pReader) );
  232:   getVarint(pReader->p, &ret);
  233:   return ret;
  234: }
  235: 
  236: /* Read the next docid. */
  237: static sqlite_int64 readDocid(DocListReader *pReader){
  238:   sqlite_int64 ret;
  239:   assert( !readerAtEnd(pReader) );
  240:   pReader->p += getVarint(pReader->p, &ret);
  241:   pReader->iLastPos = 0;
  242:   return ret;
  243: }
  244: 
  245: /* Read the next position from a position list.
  246:  * Returns the position, or -1 at the end of the list. */
  247: static int readPosition(DocListReader *pReader){
  248:   int i;
  249:   int iType = pReader->pDoclist->iType;
  250:   assert( iType>=DL_POSITIONS );
  251:   assert( !readerAtEnd(pReader) );
  252: 
  253:   pReader->p += getVarint32(pReader->p, &i);
  254:   if( i==0 ){
  255:     pReader->iLastPos = -1;
  256:     return -1;
  257:   }
  258:   pReader->iLastPos += ((int) i)-1;
  259:   if( iType>=DL_POSITIONS_OFFSETS ){
  260:     /* Skip over offsets, ignoring them for now. */
  261:     int iStart, iEnd;
  262:     pReader->p += getVarint32(pReader->p, &iStart);
  263:     pReader->p += getVarint32(pReader->p, &iEnd);
  264:   }
  265:   return pReader->iLastPos;
  266: }
  267: 
  268: /* Skip past the end of a position list. */
  269: static void skipPositionList(DocListReader *pReader){
  270:   while( readPosition(pReader)!=-1 )
  271:     ;
  272: }
  273: 
  274: /* Skip over a docid, including its position list if the doclist has
  275:  * positions. */
  276: static void skipDocument(DocListReader *pReader){
  277:   readDocid(pReader);
  278:   if( pReader->pDoclist->iType >= DL_POSITIONS ){
  279:     skipPositionList(pReader);
  280:   }
  281: }
  282: 
  283: static sqlite_int64 firstDocid(DocList *d){
  284:   DocListReader r;
  285:   readerInit(&r, d);
  286:   return readDocid(&r);
  287: }
  288: 
  289: /* Doclist multi-tool.  Pass pUpdate==NULL to delete the indicated docid;
  290:  * otherwise pUpdate, which must contain only the single docid [iDocid], is
  291:  * inserted (if not present) or updated (if already present). */
  292: static int docListUpdate(DocList *d, sqlite_int64 iDocid, DocList *pUpdate){
  293:   int modified = 0;
  294:   DocListReader reader;
  295:   char *p;
  296: 
  297:   if( pUpdate!=NULL ){
  298:     assert( d->iType==pUpdate->iType);
  299:     assert( iDocid==firstDocid(pUpdate) );
  300:   }
  301: 
  302:   readerInit(&reader, d);
  303:   while( !readerAtEnd(&reader) && peekDocid(&reader)<iDocid ){
  304:     skipDocument(&reader);
  305:   }
  306: 
  307:   p = reader.p;
  308:   /* Delete if there is a matching element. */
  309:   if( !readerAtEnd(&reader) && iDocid==peekDocid(&reader) ){
  310:     skipDocument(&reader);
  311:     memmove(p, reader.p, docListEnd(d) - reader.p);
  312:     d->nData -= (reader.p - p);
  313:     modified = 1;
  314:   }
  315: 
  316:   /* Insert if indicated. */
  317:   if( pUpdate!=NULL ){
  318:     int iDoclist = p-d->pData;
  319:     docListAddEndPos(pUpdate);
  320: 
  321:     d->pData = realloc(d->pData, d->nData+pUpdate->nData);
  322:     p = d->pData + iDoclist;
  323: 
  324:     memmove(p+pUpdate->nData, p, docListEnd(d) - p);
  325:     memcpy(p, pUpdate->pData, pUpdate->nData);
  326:     d->nData += pUpdate->nData;
  327:     modified = 1;
  328:   }
  329: 
  330:   return modified;
  331: }
  332: 
  333: /* Split the second half of doclist d into a separate doclist d2.  Returns 1
  334:  * if successful, or 0 if d contains a single document and hence can't be
  335:  * split. */
  336: static int docListSplit(DocList *d, DocList *d2){
  337:   const char *pSplitPoint = d->pData + d->nData / 2;
  338:   DocListReader reader;
  339: 
  340:   readerInit(&reader, d);
  341:   while( reader.p<pSplitPoint ){
  342:     skipDocument(&reader);
  343:   }
  344:   if( readerAtEnd(&reader) ) return 0;
  345:   docListInit(d2, d->iType, reader.p, docListEnd(d) - reader.p);
  346:   d->nData = reader.p - d->pData;
  347:   d->pData = realloc(d->pData, d->nData);
  348:   return 1;
  349: }
  350: 
  351: /* A DocListMerge computes the AND of an in-memory DocList [in] and a chunked
  352:  * on-disk doclist, resulting in another in-memory DocList [out].  [in]
  353:  * and [out] may or may not store position information according to the
  354:  * caller's wishes.  The on-disk doclist always comes with positions.
  355:  *
  356:  * The caller must read each chunk of the on-disk doclist in succession and
  357:  * pass it to mergeBlock().
  358:  *
  359:  * If [in] has positions, then the merge output contains only documents with
  360:  * matching positions in the two input doclists.  If [in] does not have
  361:  * positions, then the merge output contains all documents common to the two
  362:  * input doclists.
  363:  *
  364:  * If [in] is NULL, then the on-disk doclist is copied to [out] directly.
  365:  *
  366:  * A merge is performed using an integer [iOffset] provided by the caller.
  367:  * [iOffset] is subtracted from each position in the on-disk doclist for the
  368:  * purpose of position comparison; this is helpful in implementing phrase
  369:  * searches.
  370:  *
  371:  * A DocListMerge is not yet able to propagate offsets through query
  372:  * processing; we should add that capability soon.
  373: */
  374: typedef struct DocListMerge {
  375:   DocListReader in;
  376:   DocList *pOut;
  377:   int iOffset;
  378: } DocListMerge;
  379: 
  380: static void mergeInit(DocListMerge *m,
  381:                       DocList *pIn, int iOffset, DocList *pOut){
  382:   readerInit(&m->in, pIn);
  383:   m->pOut = pOut;
  384:   m->iOffset = iOffset;
  385: 
  386:   /* can't handle offsets yet */
  387:   assert( pIn==NULL || pIn->iType <= DL_POSITIONS );
  388:   assert( pOut->iType <= DL_POSITIONS );
  389: }
  390: 
  391: /* A helper function for mergeBlock(), below.  Merge the position lists
  392:  * pointed to by m->in and pBlockReader.
  393:  * If the merge matches, write [iDocid] to m->pOut; if m->pOut
  394:  * has positions then write all matching positions as well. */
  395: static void mergePosList(DocListMerge *m, sqlite_int64 iDocid,
  396:                   DocListReader *pBlockReader){
  397:   int block_pos = readPosition(pBlockReader);
  398:   int in_pos = readPosition(&m->in);
  399:   int match = 0;
  400:   while( block_pos!=-1 || in_pos!=-1 ){
  401:     if( block_pos-m->iOffset==in_pos ){
  402:       if( !match ){
  403:         docListAddDocid(m->pOut, iDocid);
  404:         match = 1;
  405:       }
  406:       if( m->pOut->iType >= DL_POSITIONS ){
  407:         docListAddPos(m->pOut, in_pos);
  408:       }
  409:       block_pos = readPosition(pBlockReader);
  410:       in_pos = readPosition(&m->in);
  411:     } else if( in_pos==-1 || (block_pos!=-1 && block_pos-m->iOffset<in_pos) ){
  412:       block_pos = readPosition(pBlockReader);
  413:     } else {
  414:       in_pos = readPosition(&m->in);
  415:     }
  416:   }
  417:   if( m->pOut->iType >= DL_POSITIONS && match ){
  418:     docListAddEndPos(m->pOut);
  419:   }
  420: }
  421: 
  422: /* Merge one block of an on-disk doclist into a DocListMerge. */
  423: static void mergeBlock(DocListMerge *m, DocList *pBlock){
  424:   DocListReader blockReader;
  425:   assert( pBlock->iType >= DL_POSITIONS );
  426:   readerInit(&blockReader, pBlock);
  427:   while( !readerAtEnd(&blockReader) ){
  428:     sqlite_int64 iDocid = readDocid(&blockReader);
  429:     if( m->in.pDoclist!=NULL ){
  430:       while( 1 ){
  431:         if( readerAtEnd(&m->in) ) return;  /* nothing more to merge */
  432:         if( peekDocid(&m->in)>=iDocid ) break;
  433:         skipDocument(&m->in);
  434:       }
  435:       if( peekDocid(&m->in)>iDocid ){  /* [pIn] has no match with iDocid */
  436:         skipPositionList(&blockReader);  /* skip this docid in the block */
  437:         continue;
  438:       }
  439:       readDocid(&m->in);
  440:     }
  441:     /* We have a document match. */
  442:     if( m->in.pDoclist==NULL || m->in.pDoclist->iType < DL_POSITIONS ){
  443:       /* We don't need to do a poslist merge. */
  444:       docListAddDocid(m->pOut, iDocid);
  445:       if( m->pOut->iType >= DL_POSITIONS ){
  446:         /* Copy all positions to the output doclist. */
  447:         while( 1 ){
  448:           int pos = readPosition(&blockReader);
  449:           if( pos==-1 ) break;
  450:           docListAddPos(m->pOut, pos);
  451:         }
  452:         docListAddEndPos(m->pOut);
  453:       } else skipPositionList(&blockReader);
  454:       continue;
  455:     }
  456:     mergePosList(m, iDocid, &blockReader);
  457:   }
  458: }
  459: 
  460: static char *string_dup_n(const char *s, int n){
  461:   char *str = malloc(n + 1);
  462:   memcpy(str, s, n);
  463:   str[n] = '\0';
  464:   return str;
  465: }
  466: 
  467: /* Duplicate a string; the caller must free() the returned string.
  468:  * (We don't use strdup() since it's not part of the standard C library and
  469:  * may not be available everywhere.) */
  470: static char *string_dup(const char *s){
  471:   return string_dup_n(s, strlen(s));
  472: }
  473: 
  474: /* Format a string, replacing each occurrence of the % character with
  475:  * zName.  This may be more convenient than sqlite_mprintf()
  476:  * when one string is used repeatedly in a format string.
  477:  * The caller must free() the returned string. */
  478: static char *string_format(const char *zFormat, const char *zName){
  479:   const char *p;
  480:   size_t len = 0;
  481:   size_t nName = strlen(zName);
  482:   char *result;
  483:   char *r;
  484: 
  485:   /* first compute length needed */
  486:   for(p = zFormat ; *p ; ++p){
  487:     len += (*p=='%' ? nName : 1);
  488:   }
  489:   len += 1;  /* for null terminator */
  490: 
  491:   r = result = malloc(len);
  492:   for(p = zFormat; *p; ++p){
  493:     if( *p=='%' ){
  494:       memcpy(r, zName, nName);
  495:       r += nName;
  496:     } else {
  497:       *r++ = *p;
  498:     }
  499:   }
  500:   *r++ = '\0';
  501:   assert( r == result + len );
  502:   return result;
  503: }
  504: 
  505: static int sql_exec(sqlite3 *db, const char *zName, const char *zFormat){
  506:   char *zCommand = string_format(zFormat, zName);
  507:   int rc = sqlite3_exec(db, zCommand, NULL, 0, NULL);
  508:   free(zCommand);
  509:   return rc;
  510: }
  511: 
  512: static int sql_prepare(sqlite3 *db, const char *zName, sqlite3_stmt **ppStmt,
  513:                 const char *zFormat){
  514:   char *zCommand = string_format(zFormat, zName);
  515:   int rc = sqlite3_prepare(db, zCommand, -1, ppStmt, NULL);
  516:   free(zCommand);
  517:   return rc;
  518: }
  519: 
  520: /* end utility functions */
  521: 
  522: #define QUERY_GENERIC 0
  523: #define QUERY_FULLTEXT 1
  524: 
  525: #define CHUNK_MAX 1024
  526: 
  527: typedef enum fulltext_statement {
  528:   CONTENT_INSERT_STMT,
  529:   CONTENT_SELECT_STMT,
  530:   CONTENT_DELETE_STMT,
  531: 
  532:   TERM_SELECT_STMT,
  533:   TERM_CHUNK_SELECT_STMT,
  534:   TERM_INSERT_STMT,
  535:   TERM_UPDATE_STMT,
  536:   TERM_DELETE_STMT,
  537: 
  538:   MAX_STMT                     /* Always at end! */
  539: } fulltext_statement;
  540: 
  541: /* These must exactly match the enum above. */
  542: /* TODO(adam): Is there some risk that a statement (in particular,
  543: ** pTermSelectStmt) will be used in two cursors at once, e.g.  if a
  544: ** query joins a virtual table to itself?  If so perhaps we should
  545: ** move some of these to the cursor object.
  546: */
  547: static const char *fulltext_zStatement[MAX_STMT] = {
  548:   /* CONTENT_INSERT */ "insert into %_content (rowid, content) values (?, ?)",
  549:   /* CONTENT_SELECT */ "select content from %_content where rowid = ?",
  550:   /* CONTENT_DELETE */ "delete from %_content where rowid = ?",
  551: 
  552:   /* TERM_SELECT */
  553:   "select rowid, doclist from %_term where term = ? and first = ?",
  554:   /* TERM_CHUNK_SELECT */
  555:   "select max(first) from %_term where term = ? and first <= ?",
  556:   /* TERM_INSERT */
  557:   "insert into %_term (term, first, doclist) values (?, ?, ?)",
  558:   /* TERM_UPDATE */ "update %_term set doclist = ? where rowid = ?",
  559:   /* TERM_DELETE */ "delete from %_term where rowid = ?",
  560: };
  561: 
  562: typedef struct fulltext_vtab {
  563:   sqlite3_vtab base;
  564:   sqlite3 *db;
  565:   const char *zName;               /* virtual table name */
  566:   sqlite3_tokenizer *pTokenizer;   /* tokenizer for inserts and queries */
  567: 
  568:   /* Precompiled statements which we keep as long as the table is
  569:   ** open.
  570:   */
  571:   sqlite3_stmt *pFulltextStatements[MAX_STMT];
  572: } fulltext_vtab;
  573: 
  574: typedef struct fulltext_cursor {
  575:   sqlite3_vtab_cursor base;
  576:   int iCursorType;  /* QUERY_GENERIC or QUERY_FULLTEXT */
  577: 
  578:   sqlite3_stmt *pStmt;
  579: 
  580:   int eof;
  581: 
  582:   /* The following is used only when iCursorType == QUERY_FULLTEXT. */
  583:   DocListReader result;
  584: } fulltext_cursor;
  585: 
  586: static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){
  587:   return (fulltext_vtab *) c->base.pVtab;
  588: }
  589: 
  590: static sqlite3_module fulltextModule;   /* forward declaration */
  591: 
  592: /* Puts a freshly-prepared statement determined by iStmt in *ppStmt.
  593: ** If the indicated statement has never been prepared, it is prepared
  594: ** and cached, otherwise the cached version is reset.
  595: */
  596: static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt,
  597:                              sqlite3_stmt **ppStmt){
  598:   assert( iStmt<MAX_STMT );
  599:   if( v->pFulltextStatements[iStmt]==NULL ){
  600:     int rc = sql_prepare(v->db, v->zName, &v->pFulltextStatements[iStmt],
  601:                          fulltext_zStatement[iStmt]);
  602:     if( rc!=SQLITE_OK ) return rc;
  603:   } else {
  604:     int rc = sqlite3_reset(v->pFulltextStatements[iStmt]);
  605:     if( rc!=SQLITE_OK ) return rc;
  606:   }
  607: 
  608:   *ppStmt = v->pFulltextStatements[iStmt];
  609:   return SQLITE_OK;
  610: }
  611: 
  612: /* Step the indicated statement, handling errors SQLITE_BUSY (by
  613: ** retrying) and SQLITE_SCHEMA (by re-preparing and transferring
  614: ** bindings to the new statement).
  615: ** TODO(adam): We should extend this function so that it can work with
  616: ** statements declared locally, not only globally cached statements.
  617: */
  618: static int sql_step_statement(fulltext_vtab *v, fulltext_statement iStmt,
  619:                               sqlite3_stmt **ppStmt){
  620:   int rc;
  621:   sqlite3_stmt *s = *ppStmt;
  622:   assert( iStmt<MAX_STMT );
  623:   assert( s==v->pFulltextStatements[iStmt] );
  624: 
  625:   while( (rc=sqlite3_step(s))!=SQLITE_DONE && rc!=SQLITE_ROW ){
  626:     sqlite3_stmt *pNewStmt;
  627: 
  628:     if( rc==SQLITE_BUSY ) continue;
  629:     if( rc!=SQLITE_ERROR ) return rc;
  630: 
  631:     rc = sqlite3_reset(s);
  632:     if( rc!=SQLITE_SCHEMA ) return SQLITE_ERROR;
  633: 
  634:     v->pFulltextStatements[iStmt] = NULL;   /* Still in s */
  635:     rc = sql_get_statement(v, iStmt, &pNewStmt);
  636:     if( rc!=SQLITE_OK ) goto err;
  637:     *ppStmt = pNewStmt;
  638: 
  639:     rc = sqlite3_transfer_bindings(s, pNewStmt);
  640:     if( rc!=SQLITE_OK ) goto err;
  641: 
  642:     rc = sqlite3_finalize(s);
  643:     if( rc!=SQLITE_OK ) return rc;
  644:     s = pNewStmt;
  645:   }
  646:   return rc;
  647: 
  648:  err:
  649:   sqlite3_finalize(s);
  650:   return rc;
  651: }
  652: 
  653: /* Like sql_step_statement(), but convert SQLITE_DONE to SQLITE_OK.
  654: ** Useful for statements like UPDATE, where we expect no results.
  655: */
  656: static int sql_single_step_statement(fulltext_vtab *v,
  657:                                      fulltext_statement iStmt,
  658:                                      sqlite3_stmt **ppStmt){
  659:   int rc = sql_step_statement(v, iStmt, ppStmt);
  660:   return (rc==SQLITE_DONE) ? SQLITE_OK : rc;
  661: }
  662: 
  663: /* insert into %_content (rowid, content) values ([rowid], [zContent]) */
  664: static int content_insert(fulltext_vtab *v, sqlite3_value *rowid,
  665:                           const char *zContent, int nContent){
  666:   sqlite3_stmt *s;
  667:   int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s);
  668:   if( rc!=SQLITE_OK ) return rc;
  669: 
  670:   rc = sqlite3_bind_value(s, 1, rowid);
  671:   if( rc!=SQLITE_OK ) return rc;
  672: 
  673:   rc = sqlite3_bind_text(s, 2, zContent, nContent, SQLITE_STATIC);
  674:   if( rc!=SQLITE_OK ) return rc;
  675: 
  676:   return sql_single_step_statement(v, CONTENT_INSERT_STMT, &s);
  677: }
  678: 
  679: /* select content from %_content where rowid = [iRow]
  680:  * The caller must delete the returned string. */
  681: static int content_select(fulltext_vtab *v, sqlite_int64 iRow,
  682:                           char **pzContent){
  683:   sqlite3_stmt *s;
  684:   int rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s);
  685:   if( rc!=SQLITE_OK ) return rc;
  686: 
  687:   rc = sqlite3_bind_int64(s, 1, iRow);
  688:   if( rc!=SQLITE_OK ) return rc;
  689: 
  690:   rc = sql_step_statement(v, CONTENT_SELECT_STMT, &s);
  691:   if( rc!=SQLITE_ROW ) return rc;
  692: 
  693:   *pzContent = string_dup((const char *)sqlite3_column_text(s, 0));
  694: 
  695:   /* We expect only one row.  We must execute another sqlite3_step()
  696:    * to complete the iteration; otherwise the table will remain locked. */
  697:   rc = sqlite3_step(s);
  698:   if( rc==SQLITE_DONE ) return SQLITE_OK;
  699: 
  700:   free(*pzContent);
  701:   return rc;
  702: }
  703: 
  704: /* delete from %_content where rowid = [iRow ] */
  705: static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){
  706:   sqlite3_stmt *s;
  707:   int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s);
  708:   if( rc!=SQLITE_OK ) return rc;
  709: 
  710:   rc = sqlite3_bind_int64(s, 1, iRow);
  711:   if( rc!=SQLITE_OK ) return rc;
  712: 
  713:   return sql_single_step_statement(v, CONTENT_DELETE_STMT, &s);
  714: }
  715: 
  716: /* select rowid, doclist from %_term where term = [zTerm] and first = [iFirst]
  717:  * If found, returns SQLITE_OK; the caller must free the returned doclist.
  718:  * If no rows found, returns SQLITE_ERROR. */
  719: static int term_select(fulltext_vtab *v, const char *zTerm, int nTerm,
  720:                        sqlite_int64 iFirst,
  721:                        sqlite_int64 *rowid,
  722:                        DocList *out){
  723:   sqlite3_stmt *s;
  724:   int rc = sql_get_statement(v, TERM_SELECT_STMT, &s);
  725:   if( rc!=SQLITE_OK ) return rc;
  726: 
  727:   rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_TRANSIENT);
  728:   if( rc!=SQLITE_OK ) return rc;
  729: 
  730:   rc = sqlite3_bind_int64(s, 2, iFirst);
  731:   if( rc!=SQLITE_OK ) return rc;
  732: 
  733:   rc = sql_step_statement(v, TERM_SELECT_STMT, &s);
  734:   if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
  735: 
  736:   *rowid = sqlite3_column_int64(s, 0);
  737:   docListInit(out, DL_POSITIONS_OFFSETS,
  738:               sqlite3_column_blob(s, 1), sqlite3_column_bytes(s, 1));
  739: 
  740:   /* We expect only one row.  We must execute another sqlite3_step()
  741:    * to complete the iteration; otherwise the table will remain locked. */
  742:   rc = sqlite3_step(s);
  743:   return rc==SQLITE_DONE ? SQLITE_OK : rc;
  744: }
  745: 
  746: /* select max(first) from %_term where term = [zTerm] and first <= [iFirst]
  747:  * If found, returns SQLITE_ROW and result in *piResult; if the query returns
  748:  * NULL (meaning no row found) returns SQLITE_DONE.
  749:  */
  750: static int term_chunk_select(fulltext_vtab *v, const char *zTerm, int nTerm,
  751:                            sqlite_int64 iFirst, sqlite_int64 *piResult){
  752:   sqlite3_stmt *s;
  753:   int rc = sql_get_statement(v, TERM_CHUNK_SELECT_STMT, &s);
  754:   if( rc!=SQLITE_OK ) return rc;
  755: 
  756:   rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC);
  757:   if( rc!=SQLITE_OK ) return rc;
  758: 
  759:   rc = sqlite3_bind_int64(s, 2, iFirst);
  760:   if( rc!=SQLITE_OK ) return rc;
  761: 
  762:   rc = sql_step_statement(v, TERM_CHUNK_SELECT_STMT, &s);
  763:   if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
  764: 
  765:   switch( sqlite3_column_type(s, 0) ){
  766:     case SQLITE_NULL:
  767:       rc = SQLITE_DONE;
  768:       break;
  769:     case SQLITE_INTEGER:
  770:      *piResult = sqlite3_column_int64(s, 0);
  771:      break;
  772:     default:
  773:       return SQLITE_ERROR;
  774:   }
  775:   /* We expect only one row.  We must execute another sqlite3_step()
  776:    * to complete the iteration; otherwise the table will remain locked. */
  777:   if( sqlite3_step(s) != SQLITE_DONE ) return SQLITE_ERROR;
  778:   return rc;
  779: }
  780: 
  781: /* insert into %_term (term, first, doclist)
  782:                values ([zTerm], [iFirst], [doclist]) */
  783: static int term_insert(fulltext_vtab *v, const char *zTerm, int nTerm,
  784:                        sqlite_int64 iFirst, DocList *doclist){
  785:   sqlite3_stmt *s;
  786:   int rc = sql_get_statement(v, TERM_INSERT_STMT, &s);
  787:   if( rc!=SQLITE_OK ) return rc;
  788: 
  789:   rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC);
  790:   if( rc!=SQLITE_OK ) return rc;
  791: 
  792:   rc = sqlite3_bind_int64(s, 2, iFirst);
  793:   if( rc!=SQLITE_OK ) return rc;
  794: 
  795:   rc = sqlite3_bind_blob(s, 3, doclist->pData, doclist->nData, SQLITE_STATIC);
  796:   if( rc!=SQLITE_OK ) return rc;
  797: 
  798:   return sql_single_step_statement(v, TERM_INSERT_STMT, &s);
  799: }
  800: 
  801: /* update %_term set doclist = [doclist] where rowid = [rowid] */
  802: static int term_update(fulltext_vtab *v, sqlite_int64 rowid,
  803:                        DocList *doclist){
  804:   sqlite3_stmt *s;
  805:   int rc = sql_get_statement(v, TERM_UPDATE_STMT, &s);
  806:   if( rc!=SQLITE_OK ) return rc;
  807: 
  808:   rc = sqlite3_bind_blob(s, 1, doclist->pData, doclist->nData,
  809:                          SQLITE_STATIC);
  810:   if( rc!=SQLITE_OK ) return rc;
  811: 
  812:   rc = sqlite3_bind_int64(s, 2, rowid);
  813:   if( rc!=SQLITE_OK ) return rc;
  814: 
  815:   return sql_single_step_statement(v, TERM_UPDATE_STMT, &s);
  816: }
  817: 
  818: static int term_delete(fulltext_vtab *v, sqlite_int64 rowid){
  819:   sqlite3_stmt *s;
  820:   int rc = sql_get_statement(v, TERM_DELETE_STMT, &s);
  821:   if( rc!=SQLITE_OK ) return rc;
  822: 
  823:   rc = sqlite3_bind_int64(s, 1, rowid);
  824:   if( rc!=SQLITE_OK ) return rc;
  825: 
  826:   return sql_single_step_statement(v, TERM_DELETE_STMT, &s);
  827: }
  828: 
  829: static void fulltext_vtab_destroy(fulltext_vtab *v){
  830:   int iStmt;
  831: 
  832:   for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){
  833:     if( v->pFulltextStatements[iStmt]!=NULL ){
  834:       sqlite3_finalize(v->pFulltextStatements[iStmt]);
  835:       v->pFulltextStatements[iStmt] = NULL;
  836:     }
  837:   }
  838: 
  839:   if( v->pTokenizer!=NULL ){
  840:     v->pTokenizer->pModule->xDestroy(v->pTokenizer);
  841:     v->pTokenizer = NULL;
  842:   }
  843: 
  844:   free((void *) v->zName);
  845:   free(v);
  846: }
  847: 
  848: /* Current interface:
  849: ** argv[0] - module name
  850: ** argv[1] - database name
  851: ** argv[2] - table name
  852: ** argv[3] - tokenizer name (optional, a sensible default is provided)
  853: ** argv[4..] - passed to tokenizer (optional based on tokenizer)
  854: **/
  855: static int fulltextConnect(sqlite3 *db, void *pAux, int argc, char **argv,
  856:                            sqlite3_vtab **ppVTab){
  857:   int rc;
  858:   fulltext_vtab *v;
  859:   sqlite3_tokenizer_module *m = NULL;
  860: 
  861:   assert( argc>=3 );
  862:   v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab));
  863:   /* sqlite will initialize v->base */
  864:   v->db = db;
  865:   v->zName = string_dup(argv[2]);
  866:   v->pTokenizer = NULL;
  867: 
  868:   if( argc==3 ){
  869:     get_simple_tokenizer_module(&m);
  870:   } else {
  871:     /* TODO(shess) For now, add new tokenizers as else if clauses. */
  872:     if( !strcmp(argv[3], "simple") ){
  873:       get_simple_tokenizer_module(&m);
  874:     } else {
  875:       assert( "unrecognized tokenizer"==NULL );
  876:     }
  877:   }
  878: 
  879:   /* TODO(shess) Since tokenization impacts the index, the parameters
  880:   ** to the tokenizer need to be identical when a persistent virtual
  881:   ** table is re-created.  One solution would be a meta-table to track
  882:   ** such information in the database.  Then we could verify that the
  883:   ** information is identical on subsequent creates.
  884:   */
  885:   /* TODO(shess) Why isn't argv already (const char **)? */
  886:   rc = m->xCreate(argc-3, (const char **) (argv+3), &v->pTokenizer);
  887:   if( rc!=SQLITE_OK ) return rc;
  888:   v->pTokenizer->pModule = m;
  889: 
  890:   /* TODO: verify the existence of backing tables foo_content, foo_term */
  891: 
  892:   rc = sqlite3_declare_vtab(db, "create table x(content text)");
  893:   if( rc!=SQLITE_OK ) return rc;
  894: 
  895:   memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements));
  896: 
  897:   *ppVTab = &v->base;
  898:   return SQLITE_OK;
  899: }
  900: 
  901: static int fulltextCreate(sqlite3 *db, void *pAux, int argc, char **argv,
  902:                           sqlite3_vtab **ppVTab){
  903:   int rc;
  904:   assert( argc>=3 );
  905: 
  906:   /* The %_content table holds the text of each full-text item, with
  907:   ** the rowid used as the docid.
  908:   **
  909:   ** The %_term table maps each term to a document list blob
  910:   ** containing elements sorted by ascending docid, each element
  911:   ** encoded as:
  912:   **
  913:   **   docid varint-encoded
  914:   **   token count varint-encoded
  915:   **   "count" token elements (poslist):
  916:   **     position varint-encoded as delta from previous position
  917:   **     start offset varint-encoded as delta from previous start offset
  918:   **     end offset varint-encoded as delta from start offset
  919:   **
  920:   ** Additionally, doclist blobs can be chunked into multiple rows,
  921:   ** using "first" to order the blobs.  "first" is simply the first
  922:   ** docid in the blob.
  923:   */
  924:   /*
  925:   ** NOTE(shess) That last sentence is incorrect in the face of
  926:   ** deletion, which can leave a doclist that doesn't contain the
  927:   ** first from that row.  I _believe_ this does not matter to the
  928:   ** operation of the system, but it might be reasonable to update
  929:   ** appropriately in case this assumption becomes more important.
  930:   */
  931:   rc = sql_exec(db, argv[2],
  932:     "create table %_content(content text);"
  933:     "create table %_term(term text, first integer, doclist blob);"
  934:     "create index %_index on %_term(term, first)");
  935:   if( rc!=SQLITE_OK ) return rc;
  936: 
  937:   return fulltextConnect(db, pAux, argc, argv, ppVTab);
  938: }
  939: 
  940: /* Decide how to handle an SQL query.
  941:  * At the moment, MATCH queries can include implicit boolean ANDs; we
  942:  * haven't implemented phrase searches or OR yet. */
  943: static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
  944:   int i;
  945: 
  946:   for(i=0; i<pInfo->nConstraint; ++i){
  947:     const struct sqlite3_index_constraint *pConstraint;
  948:     pConstraint = &pInfo->aConstraint[i];
  949:     if( pConstraint->iColumn==0 &&
  950:         pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH &&
  951:         pConstraint->usable ){   /* a full-text search */
  952:       pInfo->aConstraintUsage[i].argvIndex = 1;
  953:       pInfo->aConstraintUsage[i].omit = 1;
  954:       pInfo->idxNum = QUERY_FULLTEXT;
  955:       pInfo->estimatedCost = 1.0;   /* an arbitrary value for now */
  956:       return SQLITE_OK;
  957:     }
  958:   }
  959:   pInfo->idxNum = QUERY_GENERIC;
  960:   return SQLITE_OK;
  961: }
  962: 
  963: static int fulltextDisconnect(sqlite3_vtab *pVTab){
  964:   fulltext_vtab_destroy((fulltext_vtab *)pVTab);
  965:   return SQLITE_OK;
  966: }
  967: 
  968: static int fulltextDestroy(sqlite3_vtab *pVTab){
  969:   fulltext_vtab *v = (fulltext_vtab *)pVTab;
  970: 
  971:   int rc = sql_exec(v->db, v->zName,
  972:                     "drop table %_content; drop table %_term");
  973:   if( rc!=SQLITE_OK ) return rc;
  974: 
  975:   fulltext_vtab_destroy((fulltext_vtab *)pVTab);
  976:   return SQLITE_OK;
  977: }
  978: 
  979: static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
  980:   fulltext_cursor *c;
  981: 
  982:   c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1);
  983:   /* sqlite will initialize c->base */
  984:   *ppCursor = &c->base;
  985: 
  986:   return SQLITE_OK;
  987: }
  988: 
  989: static int fulltextClose(sqlite3_vtab_cursor *pCursor){
  990:   fulltext_cursor *c = (fulltext_cursor *) pCursor;
  991:   sqlite3_finalize(c->pStmt);
  992:   if( c->result.pDoclist!=NULL ){
  993:     docListDelete(c->result.pDoclist);
  994:   }
  995:   free(c);
  996:   return SQLITE_OK;
  997: }
  998: 
  999: static int fulltextNext(sqlite3_vtab_cursor *pCursor){
 1000:   fulltext_cursor *c = (fulltext_cursor *) pCursor;
 1001:   sqlite_int64 iDocid;
 1002:   int rc;
 1003: 
 1004:   switch( c->iCursorType ){
 1005:     case QUERY_GENERIC:
 1006:       /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
 1007:       rc = sqlite3_step(c->pStmt);
 1008:       switch( rc ){
 1009:         case SQLITE_ROW:
 1010:           c->eof = 0;
 1011:           return SQLITE_OK;
 1012:         case SQLITE_DONE:
 1013:           c->eof = 1;
 1014:           return SQLITE_OK;
 1015:         default:
 1016:           c->eof = 1;
 1017:           return rc;
 1018:       }
 1019:     case QUERY_FULLTEXT:
 1020:       rc = sqlite3_reset(c->pStmt);
 1021:       if( rc!=SQLITE_OK ) return rc;
 1022: 
 1023:       if( readerAtEnd(&c->result)){
 1024:         c->eof = 1;
 1025:         return SQLITE_OK;
 1026:       }
 1027:       iDocid = readDocid(&c->result);
 1028:       rc = sqlite3_bind_int64(c->pStmt, 1, iDocid);
 1029:       if( rc!=SQLITE_OK ) return rc;
 1030:       /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
 1031:       rc = sqlite3_step(c->pStmt);
 1032:       if( rc==SQLITE_ROW ){   /* the case we expect */
 1033:         c->eof = 0;
 1034:         return SQLITE_OK;
 1035:       }
 1036:       /* an error occurred; abort */
 1037:       return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
 1038:     default:
 1039:       assert( 0 );
 1040:       return SQLITE_ERROR;  /* not reached */
 1041:   }
 1042: }
 1043: 
 1044: static int term_select_doclist(fulltext_vtab *v, const char *pTerm, int nTerm,
 1045:                                sqlite3_stmt **ppStmt){
 1046:   int rc;
 1047:   if( *ppStmt ){
 1048:     rc = sqlite3_reset(*ppStmt);
 1049:   } else {
 1050:     rc = sql_prepare(v->db, v->zName, ppStmt,
 1051:       "select doclist from %_term where term = ? order by first");
 1052:   }
 1053:   if( rc!=SQLITE_OK ) return rc;
 1054: 
 1055:   rc = sqlite3_bind_text(*ppStmt, 1, pTerm, nTerm, SQLITE_TRANSIENT);
 1056:   if( rc!=SQLITE_OK ) return rc;
 1057: 
 1058:   return sqlite3_step(*ppStmt);   /* TODO(adamd): handle schema error */
 1059: }
 1060: 
 1061: /* Read the posting list for [zTerm]; AND it with the doclist [in] to
 1062:  * produce the doclist [out], using the given offset [iOffset] for phrase
 1063:  * matching.
 1064:  * (*pSelect) is used to hold an SQLite statement used inside this function;
 1065:  * the caller should initialize *pSelect to NULL before the first call.
 1066:  */
 1067: static int query_merge(fulltext_vtab *v, sqlite3_stmt **pSelect,
 1068:                        const char *zTerm,
 1069:                        DocList *pIn, int iOffset, DocList *out){
 1070:   int rc;
 1071:   DocListMerge merge;
 1072: 
 1073:   if( pIn!=NULL && !pIn->nData ){
 1074:     /* If [pIn] is already empty, there's no point in reading the
 1075:      * posting list to AND it in; return immediately. */
 1076:       return SQLITE_OK;
 1077:   }
 1078: 
 1079:   rc = term_select_doclist(v, zTerm, -1, pSelect);
 1080:   if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ) return rc;
 1081: 
 1082:   mergeInit(&merge, pIn, iOffset, out);
 1083:   while( rc==SQLITE_ROW ){
 1084:     DocList block;
 1085:     docListInit(&block, DL_POSITIONS_OFFSETS,
 1086:                 sqlite3_column_blob(*pSelect, 0),
 1087:                 sqlite3_column_bytes(*pSelect, 0));
 1088:     mergeBlock(&merge, &block);
 1089:     docListDestroy(&block);
 1090: 
 1091:     rc = sqlite3_step(*pSelect);
 1092:     if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ){
 1093:       return rc;
 1094:     }
 1095:   }
 1096:   
 1097:   return SQLITE_OK;
 1098: }
 1099: 
 1100: typedef struct QueryTerm {
 1101:   int is_phrase;    /* true if this term begins a new phrase */
 1102:   const char *zTerm;
 1103: } QueryTerm;
 1104: 
 1105: /* A parsed query.
 1106:  *
 1107:  * As an example, parsing the query ["four score" years "new nation"] will
 1108:  * yield a Query with 5 terms:
 1109:  *   "four",   is_phrase = 1
 1110:  *   "score",  is_phrase = 0
 1111:  *   "years",  is_phrase = 1
 1112:  *   "new",    is_phrase = 1
 1113:  *   "nation", is_phrase = 0
 1114:  */
 1115: typedef struct Query {
 1116:   int nTerms;
 1117:   QueryTerm *pTerm;
 1118: } Query;
 1119: 
 1120: static void query_add(Query *q, int is_phrase, const char *zTerm){
 1121:   QueryTerm *t;
 1122:   ++q->nTerms;
 1123:   q->pTerm = realloc(q->pTerm, q->nTerms * sizeof(q->pTerm[0]));
 1124:   t = &q->pTerm[q->nTerms - 1];
 1125:   t->is_phrase = is_phrase;
 1126:   t->zTerm = zTerm;
 1127: }
 1128:     
 1129: static void query_free(Query *q){
 1130:   int i;
 1131:   for(i = 0; i < q->nTerms; ++i){
 1132:     free((void *) q->pTerm[i].zTerm);
 1133:   }
 1134:   free(q->pTerm);
 1135: }
 1136: 
 1137: static int tokenize_segment(sqlite3_tokenizer *pTokenizer,
 1138:                             const char *zQuery, int in_phrase,
 1139:                             Query *pQuery){
 1140:   sqlite3_tokenizer_module *pModule = pTokenizer->pModule;
 1141:   sqlite3_tokenizer_cursor *pCursor;
 1142:   int is_first = 1;
 1143:   
 1144:   int rc = pModule->xOpen(pTokenizer, zQuery, -1, &pCursor);
 1145:   if( rc!=SQLITE_OK ) return rc;
 1146:   pCursor->pTokenizer = pTokenizer;
 1147: 
 1148:   while( 1 ){
 1149:     const char *zToken;
 1150:     int nToken, iStartOffset, iEndOffset, dummy_pos;
 1151: 
 1152:     rc = pModule->xNext(pCursor,
 1153:                         &zToken, &nToken,
 1154:                         &iStartOffset, &iEndOffset,
 1155:                         &dummy_pos);
 1156:     if( rc!=SQLITE_OK ) break;
 1157:     query_add(pQuery, !in_phrase || is_first, string_dup_n(zToken, nToken));
 1158:     is_first = 0;
 1159:   }
 1160: 
 1161:   return pModule->xClose(pCursor);
 1162: }
 1163: 
 1164: /* Parse a query string, yielding a Query object. */
 1165: static int parse_query(fulltext_vtab *v, const char *zQuery, Query *pQuery){
 1166:   char *zQuery1 = string_dup(zQuery);
 1167:   int in_phrase = 0;
 1168:   char *s = zQuery1;
 1169:   pQuery->nTerms = 0;
 1170:   pQuery->pTerm = NULL;
 1171: 
 1172:   while( *s ){
 1173:     char *t = s;
 1174:     while( *t ){
 1175:       if( *t=='"' ){
 1176:         *t++ = '\0';
 1177:         break;
 1178:       }
 1179:       ++t;
 1180:     }
 1181:     if( *s ){
 1182:       tokenize_segment(v->pTokenizer, s, in_phrase, pQuery);
 1183:     }
 1184:     s = t;
 1185:     in_phrase = !in_phrase;
 1186:   }
 1187:   
 1188:   free(zQuery1);
 1189:   return SQLITE_OK;
 1190: }
 1191: 
 1192: /* Perform a full-text query; return a list of documents in [pResult]. */
 1193: static int fulltext_query(fulltext_vtab *v, const char *zQuery,
 1194:                           DocList **pResult){
 1195:   Query q;
 1196:   int phrase_start = -1;
 1197:   int i;
 1198:   sqlite3_stmt *pSelect = NULL;
 1199:   DocList *d = NULL;
 1200: 
 1201:   int rc = parse_query(v, zQuery, &q);
 1202:   if( rc!=SQLITE_OK ) return rc;
 1203: 
 1204:   /* Merge terms. */
 1205:   for(i = 0 ; i < q.nTerms ; ++i){
 1206:     /* In each merge step, we need to generate positions whenever we're
 1207:      * processing a phrase which hasn't ended yet. */
 1208:     int need_positions = i<q.nTerms-1 && !q.pTerm[i+1].is_phrase;
 1209:     DocList *next = docListNew(need_positions ? DL_POSITIONS : DL_DOCIDS);
 1210:     if( q.pTerm[i].is_phrase ){
 1211:       phrase_start = i;
 1212:     }
 1213:     rc = query_merge(v, &pSelect, q.pTerm[i].zTerm, d, i - phrase_start, next);
 1214:     if( rc!=SQLITE_OK ) break;
 1215:     if( d!=NULL ){
 1216:       docListDelete(d);
 1217:     }
 1218:     d = next;
 1219:   }
 1220: 
 1221:   sqlite3_finalize(pSelect);
 1222:   query_free(&q);
 1223:   *pResult = d;
 1224:   return rc;
 1225: }
 1226: 
 1227: static int fulltextFilter(sqlite3_vtab_cursor *pCursor,
 1228:                           int idxNum, const char *idxStr,
 1229:                           int argc, sqlite3_value **argv){
 1230:   fulltext_cursor *c = (fulltext_cursor *) pCursor;
 1231:   fulltext_vtab *v = cursor_vtab(c);
 1232:   int rc;
 1233:   const char *zStatement;
 1234: 
 1235:   c->iCursorType = idxNum;
 1236:   switch( idxNum ){
 1237:     case QUERY_GENERIC:
 1238:       zStatement = "select rowid, content from %_content";
 1239:       break;
 1240: 
 1241:     case QUERY_FULLTEXT:   /* full-text search */
 1242:     {
 1243:       const char *zQuery = (const char *)sqlite3_value_text(argv[0]);
 1244:       DocList *pResult;
 1245:       assert( argc==1 );
 1246:       rc = fulltext_query(v, zQuery, &pResult);
 1247:       if( rc!=SQLITE_OK ) return rc;
 1248:       readerInit(&c->result, pResult);
 1249:       zStatement = "select rowid, content from %_content where rowid = ?";
 1250:       break;
 1251:     }
 1252: 
 1253:     default:
 1254:       assert( 0 );
 1255:   }
 1256: 
 1257:   rc = sql_prepare(v->db, v->zName, &c->pStmt, zStatement);
 1258:   if( rc!=SQLITE_OK ) return rc;
 1259: 
 1260:   return fulltextNext(pCursor);
 1261: }
 1262: 
 1263: static int fulltextEof(sqlite3_vtab_cursor *pCursor){
 1264:   fulltext_cursor *c = (fulltext_cursor *) pCursor;
 1265:   return c->eof;
 1266: }
 1267: 
 1268: static int fulltextColumn(sqlite3_vtab_cursor *pCursor,
 1269:                           sqlite3_context *pContext, int idxCol){
 1270:   fulltext_cursor *c = (fulltext_cursor *) pCursor;
 1271:   const char *s;
 1272: 
 1273:   assert( idxCol==0 );
 1274:   s = (const char *) sqlite3_column_text(c->pStmt, 1);
 1275:   sqlite3_result_text(pContext, s, -1, SQLITE_TRANSIENT);
 1276: 
 1277:   return SQLITE_OK;
 1278: }
 1279: 
 1280: static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){
 1281:   fulltext_cursor *c = (fulltext_cursor *) pCursor;
 1282: 
 1283:   *pRowid = sqlite3_column_int64(c->pStmt, 0);
 1284:   return SQLITE_OK;
 1285: }
 1286: 
 1287: /* Build a hash table containing all terms in zText. */
 1288: static int build_terms(Hash *terms, sqlite3_tokenizer *pTokenizer,
 1289:                        const char *zText, sqlite_int64 iDocid){
 1290:   sqlite3_tokenizer_cursor *pCursor;
 1291:   const char *pToken;
 1292:   int nTokenBytes;
 1293:   int iStartOffset, iEndOffset, iPosition;
 1294: 
 1295:   int rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor);
 1296:   if( rc!=SQLITE_OK ) return rc;
 1297: 
 1298:   pCursor->pTokenizer = pTokenizer;
 1299:   HashInit(terms, HASH_STRING, 1);
 1300:   while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor,
 1301:                                                &pToken, &nTokenBytes,
 1302:                                                &iStartOffset, &iEndOffset,
 1303:                                                &iPosition) ){
 1304:     DocList *p;
 1305: 
 1306:     /* Positions can't be negative; we use -1 as a terminator internally. */
 1307:     if( iPosition<0 ) {
 1308:       rc = SQLITE_ERROR;  
 1309:       goto err;
 1310:     }
 1311: 
 1312:     p = HashFind(terms, pToken, nTokenBytes);
 1313:     if( p==NULL ){
 1314:       p = docListNew(DL_POSITIONS_OFFSETS);
 1315:       docListAddDocid(p, iDocid);
 1316:       HashInsert(terms, pToken, nTokenBytes, p);
 1317:     }
 1318:     docListAddPosOffset(p, iPosition, iStartOffset, iEndOffset);
 1319:   }
 1320: 
 1321: err:
 1322:   /* TODO(shess) Check return?  Should this be able to cause errors at
 1323:   ** this point?  Actually, same question about sqlite3_finalize(),
 1324:   ** though one could argue that failure there means that the data is
 1325:   ** not durable.  *ponder*
 1326:   */
 1327:   pTokenizer->pModule->xClose(pCursor);
 1328:   return rc;
 1329: }
 1330: /* Update the %_terms table to map the term [zTerm] to the given rowid. */
 1331: static int index_insert_term(fulltext_vtab *v, const char *zTerm, int nTerm,
 1332:                              sqlite_int64 iDocid, DocList *p){
 1333:   sqlite_int64 iFirst;
 1334:   sqlite_int64 iIndexRow;
 1335:   DocList doclist;
 1336: 
 1337:   int rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iFirst);
 1338:   if( rc==SQLITE_DONE ){
 1339:     docListInit(&doclist, DL_POSITIONS_OFFSETS, 0, 0);
 1340:     if( docListUpdate(&doclist, iDocid, p) ){
 1341:       rc = term_insert(v, zTerm, nTerm, iDocid, &doclist);
 1342:       docListDestroy(&doclist);
 1343:       return rc;
 1344:     }
 1345:     return SQLITE_OK;
 1346:   }
 1347:   if( rc!=SQLITE_ROW ) return SQLITE_ERROR;
 1348: 
 1349:   /* This word is in the index; add this document ID to its blob. */
 1350: 
 1351:   rc = term_select(v, zTerm, nTerm, iFirst, &iIndexRow, &doclist);
 1352:   if( rc!=SQLITE_OK ) return rc;
 1353: 
 1354:   if( docListUpdate(&doclist, iDocid, p) ){
 1355:     /* If the blob is too big, split it in half. */
 1356:     if( doclist.nData>CHUNK_MAX ){
 1357:       DocList half;
 1358:       if( docListSplit(&doclist, &half) ){
 1359:         rc = term_insert(v, zTerm, nTerm, firstDocid(&half), &half);
 1360:         docListDestroy(&half);
 1361:         if( rc!=SQLITE_OK ) goto err;
 1362:       }
 1363:     }
 1364:     rc = term_update(v, iIndexRow, &doclist);
 1365:   }
 1366: 
 1367: err:
 1368:   docListDestroy(&doclist);
 1369:   return rc;
 1370: }
 1371: 
 1372: /* Insert a row into the full-text index; set *piRowid to be the ID of the
 1373:  * new row. */
 1374: static int index_insert(fulltext_vtab *v,
 1375:                         sqlite3_value *pRequestRowid, const char *zText,
 1376:                         sqlite_int64 *piRowid){
 1377:   Hash terms;  /* maps term string -> PosList */
 1378:   HashElem *e;
 1379: 
 1380:   int rc = content_insert(v, pRequestRowid, zText, -1);
 1381:   if( rc!=SQLITE_OK ) return rc;
 1382:   *piRowid = sqlite3_last_insert_rowid(v->db);
 1383: 
 1384:   if( !zText ) return SQLITE_OK;   /* nothing to index */
 1385: 
 1386:   rc = build_terms(&terms, v->pTokenizer, zText, *piRowid);
 1387:   if( rc!=SQLITE_OK ) return rc;
 1388: 
 1389:   for(e=HashFirst(&terms); e; e=HashNext(e)){
 1390:     DocList *p = HashData(e);
 1391:     rc = index_insert_term(v, HashKey(e), HashKeysize(e), *piRowid, p);
 1392:     if( rc!=SQLITE_OK ) break;
 1393:   }
 1394: 
 1395:   for(e=HashFirst(&terms); e; e=HashNext(e)){
 1396:     DocList *p = HashData(e);
 1397:     docListDelete(p);
 1398:   }
 1399:   HashClear(&terms);
 1400:   return rc;
 1401: }
 1402: 
 1403: static int index_delete_term(fulltext_vtab *v, const char *zTerm, int nTerm,
 1404:                              sqlite_int64 iDocid){
 1405:   sqlite_int64 iFirst;
 1406:   sqlite_int64 iIndexRow;
 1407:   DocList doclist;
 1408: 
 1409:   int rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iFirst);
 1410:   if( rc!=SQLITE_ROW ) return SQLITE_ERROR;
 1411: 
 1412:   rc = term_select(v, zTerm, nTerm, iFirst, &iIndexRow, &doclist);
 1413:   if( rc!=SQLITE_OK ) return rc;
 1414: 
 1415:   if( docListUpdate(&doclist, iDocid, NULL) ){
 1416:     if( doclist.nData>0 ){
 1417:       rc = term_update(v, iIndexRow, &doclist);
 1418:     } else {  /* empty posting list */
 1419:       rc = term_delete(v, iIndexRow);
 1420:     }
 1421:   }
 1422:   docListDestroy(&doclist);
 1423:   return rc;
 1424: }
 1425: 
 1426: /* Delete a row from the full-text index. */
 1427: static int index_delete(fulltext_vtab *v, sqlite_int64 iRow){
 1428:   char *zText;
 1429:   Hash terms;
 1430:   HashElem *e;
 1431: 
 1432:   int rc = content_select(v, iRow, &zText);
 1433:   if( rc!=SQLITE_OK ) return rc;
 1434: 
 1435:   rc = build_terms(&terms, v->pTokenizer, zText, iRow);
 1436:   free(zText);
 1437:   if( rc!=SQLITE_OK ) return rc;
 1438: 
 1439:   for(e=HashFirst(&terms); e; e=HashNext(e)){
 1440:     rc = index_delete_term(v, HashKey(e), HashKeysize(e), iRow);
 1441:     if( rc!=SQLITE_OK ) break;
 1442:   }
 1443:   for(e=HashFirst(&terms); e; e=HashNext(e)){
 1444:     DocList *p = HashData(e);
 1445:     docListDelete(p);
 1446:   }
 1447:   HashClear(&terms);
 1448: 
 1449:   return content_delete(v, iRow);
 1450: }
 1451: 
 1452: static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg,
 1453:                    sqlite_int64 *pRowid){
 1454:   fulltext_vtab *v = (fulltext_vtab *) pVtab;
 1455: 
 1456:   if( nArg<2 ){
 1457:     return index_delete(v, sqlite3_value_int64(ppArg[0]));
 1458:   }
 1459: 
 1460:   if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){
 1461:     return SQLITE_ERROR;   /* an update; not yet supported */
 1462:   }
 1463: 
 1464:   assert( nArg==3 );    /* ppArg[1] = rowid, ppArg[2] = content */
 1465:   return index_insert(v, ppArg[1],
 1466:                       (const char *)sqlite3_value_text(ppArg[2]), pRowid);
 1467: }
 1468: 
 1469: static sqlite3_module fulltextModule = {
 1470:   0,
 1471:   fulltextCreate,
 1472:   fulltextConnect,
 1473:   fulltextBestIndex,
 1474:   fulltextDisconnect,
 1475:   fulltextDestroy,
 1476:   fulltextOpen,
 1477:   fulltextClose,
 1478:   fulltextFilter,
 1479:   fulltextNext,
 1480:   fulltextEof,
 1481:   fulltextColumn,
 1482:   fulltextRowid,
 1483:   fulltextUpdate
 1484: };
 1485: 
 1486: int fulltext_init(sqlite3 *db){
 1487:  return sqlite3_create_module(db, "fulltext", &fulltextModule, 0);
 1488: }
 1489: 
 1490: #if !SQLITE_CORE
 1491: int sqlite3_extension_init(sqlite3 *db, char **pzErrMsg,
 1492:                            const sqlite3_api_routines *pApi){
 1493:  SQLITE_EXTENSION_INIT2(pApi)
 1494:  return fulltext_init(db);
 1495: }
 1496: #endif

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>