Annotation of embedaddon/sqlite3/ext/fts1/fulltext.c, revision 1.1.1.1

1.1       misho       1: /* The author disclaims copyright to this source code.
                      2:  *
                      3:  * This is an SQLite module implementing full-text search.
                      4:  */
                      5: 
                      6: #include <assert.h>
                      7: #if !defined(__APPLE__)
                      8: #include <malloc.h>
                      9: #else
                     10: #include <stdlib.h>
                     11: #endif
                     12: #include <stdio.h>
                     13: #include <string.h>
                     14: #include <ctype.h>
                     15: 
                     16: #include "fulltext.h"
                     17: #include "ft_hash.h"
                     18: #include "tokenizer.h"
                     19: #include "sqlite3.h"
                     20: #include "sqlite3ext.h"
                     21: SQLITE_EXTENSION_INIT1
                     22: 
                     23: /* utility functions */
                     24: 
                     25: /* We encode variable-length integers in little-endian order using seven bits
                     26:  * per byte as follows:
                     27: **
                     28: ** KEY:
                     29: **         A = 0xxxxxxx    7 bits of data and one flag bit
                     30: **         B = 1xxxxxxx    7 bits of data and one flag bit
                     31: **
                     32: **  7 bits - A
                     33: ** 14 bits - BA
                     34: ** 21 bits - BBA
                     35: ** and so on.
                     36: */
                     37: 
                     38: /* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */
                     39: #define VARINT_MAX 10
                     40: 
                     41: /* Write a 64-bit variable-length integer to memory starting at p[0].
                     42:  * The length of data written will be between 1 and VARINT_MAX bytes.
                     43:  * The number of bytes written is returned. */
                     44: static int putVarint(char *p, sqlite_int64 v){
                     45:   unsigned char *q = (unsigned char *) p;
                     46:   sqlite_uint64 vu = v;
                     47:   do{
                     48:     *q++ = (unsigned char) ((vu & 0x7f) | 0x80);
                     49:     vu >>= 7;
                     50:   }while( vu!=0 );
                     51:   q[-1] &= 0x7f;  /* turn off high bit in final byte */
                     52:   assert( q - (unsigned char *)p <= VARINT_MAX );
                     53:   return (int) (q - (unsigned char *)p);
                     54: }
                     55: 
                     56: /* Read a 64-bit variable-length integer from memory starting at p[0].
                     57:  * Return the number of bytes read, or 0 on error.
                     58:  * The value is stored in *v. */
                     59: static int getVarint(const char *p, sqlite_int64 *v){
                     60:   const unsigned char *q = (const unsigned char *) p;
                     61:   sqlite_uint64 x = 0, y = 1;
                     62:   while( (*q & 0x80) == 0x80 ){
                     63:     x += y * (*q++ & 0x7f);
                     64:     y <<= 7;
                     65:     if( q - (unsigned char *)p >= VARINT_MAX ){  /* bad data */
                     66:       assert( 0 );
                     67:       return 0;
                     68:     }
                     69:   }
                     70:   x += y * (*q++);
                     71:   *v = (sqlite_int64) x;
                     72:   return (int) (q - (unsigned char *)p);
                     73: }
                     74: 
                     75: static int getVarint32(const char *p, int *pi){
                     76:  sqlite_int64 i;
                     77:  int ret = getVarint(p, &i);
                     78:  *pi = (int) i;
                     79:  assert( *pi==i );
                     80:  return ret;
                     81: }
                     82: 
                     83: /*** Document lists ***
                     84:  *
                     85:  * A document list holds a sorted list of varint-encoded document IDs.
                     86:  *
                     87:  * A doclist with type DL_POSITIONS_OFFSETS is stored like this:
                     88:  *
                     89:  * array {
                     90:  *   varint docid;
                     91:  *   array {
                     92:  *     varint position;     (delta from previous position plus 1, or 0 for end)
                     93:  *     varint startOffset;  (delta from previous startOffset)
                     94:  *     varint endOffset;    (delta from startOffset)
                     95:  *   }
                     96:  * }
                     97:  *
                     98:  * Here, array { X } means zero or more occurrences of X, adjacent in memory.
                     99:  *
                    100:  * A doclist with type DL_POSITIONS is like the above, but holds only docids
                    101:  * and positions without offset information.
                    102:  *
                    103:  * A doclist with type DL_DOCIDS is like the above, but holds only docids
                    104:  * without positions or offset information.
                    105:  *
                    106:  * On disk, every document list has positions and offsets, so we don't bother
                    107:  * to serialize a doclist's type.
                    108:  * 
                    109:  * We don't yet delta-encode document IDs; doing so will probably be a
                    110:  * modest win.
                    111:  *
                    112:  * NOTE(shess) I've thought of a slightly (1%) better offset encoding.
                    113:  * After the first offset, estimate the next offset by using the
                    114:  * current token position and the previous token position and offset,
                    115:  * offset to handle some variance.  So the estimate would be
                    116:  * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded
                    117:  * as normal.  Offsets more than 64 chars from the estimate are
                    118:  * encoded as the delta to the previous start offset + 128.  An
                    119:  * additional tiny increment can be gained by using the end offset of
                    120:  * the previous token to make the estimate a tiny bit more precise.
                    121: */
                    122: 
                    123: typedef enum DocListType {
                    124:   DL_DOCIDS,              /* docids only */
                    125:   DL_POSITIONS,           /* docids + positions */
                    126:   DL_POSITIONS_OFFSETS    /* docids + positions + offsets */
                    127: } DocListType;
                    128: 
                    129: typedef struct DocList {
                    130:   char *pData;
                    131:   int nData;
                    132:   DocListType iType;
                    133:   int iLastPos;       /* the last position written */
                    134:   int iLastOffset;    /* the last start offset written */
                    135: } DocList;
                    136: 
                    137: /* Initialize a new DocList to hold the given data. */
                    138: static void docListInit(DocList *d, DocListType iType,
                    139:                         const char *pData, int nData){
                    140:   d->nData = nData;
                    141:   if( nData>0 ){
                    142:     d->pData = malloc(nData);
                    143:     memcpy(d->pData, pData, nData);
                    144:   } else {
                    145:     d->pData = NULL;
                    146:   }
                    147:   d->iType = iType;
                    148:   d->iLastPos = 0;
                    149:   d->iLastOffset = 0;
                    150: }
                    151: 
                    152: /* Create a new dynamically-allocated DocList. */
                    153: static DocList *docListNew(DocListType iType){
                    154:   DocList *d = (DocList *) malloc(sizeof(DocList));
                    155:   docListInit(d, iType, 0, 0);
                    156:   return d;
                    157: }
                    158: 
                    159: static void docListDestroy(DocList *d){
                    160:   free(d->pData);
                    161: #ifndef NDEBUG
                    162:   memset(d, 0x55, sizeof(*d));
                    163: #endif
                    164: }
                    165: 
                    166: static void docListDelete(DocList *d){
                    167:   docListDestroy(d);
                    168:   free(d);
                    169: }
                    170: 
                    171: static char *docListEnd(DocList *d){
                    172:   return d->pData + d->nData;
                    173: }
                    174: 
                    175: /* Append a varint to a DocList's data. */
                    176: static void appendVarint(DocList *d, sqlite_int64 i){
                    177:   char c[VARINT_MAX];
                    178:   int n = putVarint(c, i);
                    179:   d->pData = realloc(d->pData, d->nData + n);
                    180:   memcpy(d->pData + d->nData, c, n);
                    181:   d->nData += n;
                    182: }
                    183: 
                    184: static void docListAddDocid(DocList *d, sqlite_int64 iDocid){
                    185:   appendVarint(d, iDocid);
                    186:   d->iLastPos = 0;
                    187: }
                    188: 
                    189: /* Add a position to the last position list in a doclist. */
                    190: static void docListAddPos(DocList *d, int iPos){
                    191:   assert( d->iType>=DL_POSITIONS );
                    192:   appendVarint(d, iPos-d->iLastPos+1);
                    193:   d->iLastPos = iPos;
                    194: }
                    195: 
                    196: static void docListAddPosOffset(DocList *d, int iPos,
                    197:                                 int iStartOffset, int iEndOffset){
                    198:   assert( d->iType==DL_POSITIONS_OFFSETS );
                    199:   docListAddPos(d, iPos);
                    200:   appendVarint(d, iStartOffset-d->iLastOffset);
                    201:   d->iLastOffset = iStartOffset;
                    202:   appendVarint(d, iEndOffset-iStartOffset);
                    203: }
                    204: 
                    205: /* Terminate the last position list in the given doclist. */
                    206: static void docListAddEndPos(DocList *d){
                    207:   appendVarint(d, 0);
                    208: }
                    209: 
                    210: typedef struct DocListReader {
                    211:   DocList *pDoclist;
                    212:   char *p;
                    213:   int iLastPos;    /* the last position read */
                    214: } DocListReader;
                    215: 
                    216: static void readerInit(DocListReader *r, DocList *pDoclist){
                    217:   r->pDoclist = pDoclist;
                    218:   if( pDoclist!=NULL ){
                    219:     r->p = pDoclist->pData;
                    220:   }
                    221:   r->iLastPos = 0;
                    222: }
                    223: 
                    224: static int readerAtEnd(DocListReader *pReader){
                    225:   return pReader->p >= docListEnd(pReader->pDoclist);
                    226: }
                    227: 
                    228: /* Peek at the next docid without advancing the read pointer. */
                    229: static sqlite_int64 peekDocid(DocListReader *pReader){
                    230:   sqlite_int64 ret;
                    231:   assert( !readerAtEnd(pReader) );
                    232:   getVarint(pReader->p, &ret);
                    233:   return ret;
                    234: }
                    235: 
                    236: /* Read the next docid. */
                    237: static sqlite_int64 readDocid(DocListReader *pReader){
                    238:   sqlite_int64 ret;
                    239:   assert( !readerAtEnd(pReader) );
                    240:   pReader->p += getVarint(pReader->p, &ret);
                    241:   pReader->iLastPos = 0;
                    242:   return ret;
                    243: }
                    244: 
                    245: /* Read the next position from a position list.
                    246:  * Returns the position, or -1 at the end of the list. */
                    247: static int readPosition(DocListReader *pReader){
                    248:   int i;
                    249:   int iType = pReader->pDoclist->iType;
                    250:   assert( iType>=DL_POSITIONS );
                    251:   assert( !readerAtEnd(pReader) );
                    252: 
                    253:   pReader->p += getVarint32(pReader->p, &i);
                    254:   if( i==0 ){
                    255:     pReader->iLastPos = -1;
                    256:     return -1;
                    257:   }
                    258:   pReader->iLastPos += ((int) i)-1;
                    259:   if( iType>=DL_POSITIONS_OFFSETS ){
                    260:     /* Skip over offsets, ignoring them for now. */
                    261:     int iStart, iEnd;
                    262:     pReader->p += getVarint32(pReader->p, &iStart);
                    263:     pReader->p += getVarint32(pReader->p, &iEnd);
                    264:   }
                    265:   return pReader->iLastPos;
                    266: }
                    267: 
                    268: /* Skip past the end of a position list. */
                    269: static void skipPositionList(DocListReader *pReader){
                    270:   while( readPosition(pReader)!=-1 )
                    271:     ;
                    272: }
                    273: 
                    274: /* Skip over a docid, including its position list if the doclist has
                    275:  * positions. */
                    276: static void skipDocument(DocListReader *pReader){
                    277:   readDocid(pReader);
                    278:   if( pReader->pDoclist->iType >= DL_POSITIONS ){
                    279:     skipPositionList(pReader);
                    280:   }
                    281: }
                    282: 
                    283: static sqlite_int64 firstDocid(DocList *d){
                    284:   DocListReader r;
                    285:   readerInit(&r, d);
                    286:   return readDocid(&r);
                    287: }
                    288: 
                    289: /* Doclist multi-tool.  Pass pUpdate==NULL to delete the indicated docid;
                    290:  * otherwise pUpdate, which must contain only the single docid [iDocid], is
                    291:  * inserted (if not present) or updated (if already present). */
                    292: static int docListUpdate(DocList *d, sqlite_int64 iDocid, DocList *pUpdate){
                    293:   int modified = 0;
                    294:   DocListReader reader;
                    295:   char *p;
                    296: 
                    297:   if( pUpdate!=NULL ){
                    298:     assert( d->iType==pUpdate->iType);
                    299:     assert( iDocid==firstDocid(pUpdate) );
                    300:   }
                    301: 
                    302:   readerInit(&reader, d);
                    303:   while( !readerAtEnd(&reader) && peekDocid(&reader)<iDocid ){
                    304:     skipDocument(&reader);
                    305:   }
                    306: 
                    307:   p = reader.p;
                    308:   /* Delete if there is a matching element. */
                    309:   if( !readerAtEnd(&reader) && iDocid==peekDocid(&reader) ){
                    310:     skipDocument(&reader);
                    311:     memmove(p, reader.p, docListEnd(d) - reader.p);
                    312:     d->nData -= (reader.p - p);
                    313:     modified = 1;
                    314:   }
                    315: 
                    316:   /* Insert if indicated. */
                    317:   if( pUpdate!=NULL ){
                    318:     int iDoclist = p-d->pData;
                    319:     docListAddEndPos(pUpdate);
                    320: 
                    321:     d->pData = realloc(d->pData, d->nData+pUpdate->nData);
                    322:     p = d->pData + iDoclist;
                    323: 
                    324:     memmove(p+pUpdate->nData, p, docListEnd(d) - p);
                    325:     memcpy(p, pUpdate->pData, pUpdate->nData);
                    326:     d->nData += pUpdate->nData;
                    327:     modified = 1;
                    328:   }
                    329: 
                    330:   return modified;
                    331: }
                    332: 
                    333: /* Split the second half of doclist d into a separate doclist d2.  Returns 1
                    334:  * if successful, or 0 if d contains a single document and hence can't be
                    335:  * split. */
                    336: static int docListSplit(DocList *d, DocList *d2){
                    337:   const char *pSplitPoint = d->pData + d->nData / 2;
                    338:   DocListReader reader;
                    339: 
                    340:   readerInit(&reader, d);
                    341:   while( reader.p<pSplitPoint ){
                    342:     skipDocument(&reader);
                    343:   }
                    344:   if( readerAtEnd(&reader) ) return 0;
                    345:   docListInit(d2, d->iType, reader.p, docListEnd(d) - reader.p);
                    346:   d->nData = reader.p - d->pData;
                    347:   d->pData = realloc(d->pData, d->nData);
                    348:   return 1;
                    349: }
                    350: 
                    351: /* A DocListMerge computes the AND of an in-memory DocList [in] and a chunked
                    352:  * on-disk doclist, resulting in another in-memory DocList [out].  [in]
                    353:  * and [out] may or may not store position information according to the
                    354:  * caller's wishes.  The on-disk doclist always comes with positions.
                    355:  *
                    356:  * The caller must read each chunk of the on-disk doclist in succession and
                    357:  * pass it to mergeBlock().
                    358:  *
                    359:  * If [in] has positions, then the merge output contains only documents with
                    360:  * matching positions in the two input doclists.  If [in] does not have
                    361:  * positions, then the merge output contains all documents common to the two
                    362:  * input doclists.
                    363:  *
                    364:  * If [in] is NULL, then the on-disk doclist is copied to [out] directly.
                    365:  *
                    366:  * A merge is performed using an integer [iOffset] provided by the caller.
                    367:  * [iOffset] is subtracted from each position in the on-disk doclist for the
                    368:  * purpose of position comparison; this is helpful in implementing phrase
                    369:  * searches.
                    370:  *
                    371:  * A DocListMerge is not yet able to propagate offsets through query
                    372:  * processing; we should add that capability soon.
                    373: */
                    374: typedef struct DocListMerge {
                    375:   DocListReader in;
                    376:   DocList *pOut;
                    377:   int iOffset;
                    378: } DocListMerge;
                    379: 
                    380: static void mergeInit(DocListMerge *m,
                    381:                       DocList *pIn, int iOffset, DocList *pOut){
                    382:   readerInit(&m->in, pIn);
                    383:   m->pOut = pOut;
                    384:   m->iOffset = iOffset;
                    385: 
                    386:   /* can't handle offsets yet */
                    387:   assert( pIn==NULL || pIn->iType <= DL_POSITIONS );
                    388:   assert( pOut->iType <= DL_POSITIONS );
                    389: }
                    390: 
                    391: /* A helper function for mergeBlock(), below.  Merge the position lists
                    392:  * pointed to by m->in and pBlockReader.
                    393:  * If the merge matches, write [iDocid] to m->pOut; if m->pOut
                    394:  * has positions then write all matching positions as well. */
                    395: static void mergePosList(DocListMerge *m, sqlite_int64 iDocid,
                    396:                   DocListReader *pBlockReader){
                    397:   int block_pos = readPosition(pBlockReader);
                    398:   int in_pos = readPosition(&m->in);
                    399:   int match = 0;
                    400:   while( block_pos!=-1 || in_pos!=-1 ){
                    401:     if( block_pos-m->iOffset==in_pos ){
                    402:       if( !match ){
                    403:         docListAddDocid(m->pOut, iDocid);
                    404:         match = 1;
                    405:       }
                    406:       if( m->pOut->iType >= DL_POSITIONS ){
                    407:         docListAddPos(m->pOut, in_pos);
                    408:       }
                    409:       block_pos = readPosition(pBlockReader);
                    410:       in_pos = readPosition(&m->in);
                    411:     } else if( in_pos==-1 || (block_pos!=-1 && block_pos-m->iOffset<in_pos) ){
                    412:       block_pos = readPosition(pBlockReader);
                    413:     } else {
                    414:       in_pos = readPosition(&m->in);
                    415:     }
                    416:   }
                    417:   if( m->pOut->iType >= DL_POSITIONS && match ){
                    418:     docListAddEndPos(m->pOut);
                    419:   }
                    420: }
                    421: 
                    422: /* Merge one block of an on-disk doclist into a DocListMerge. */
                    423: static void mergeBlock(DocListMerge *m, DocList *pBlock){
                    424:   DocListReader blockReader;
                    425:   assert( pBlock->iType >= DL_POSITIONS );
                    426:   readerInit(&blockReader, pBlock);
                    427:   while( !readerAtEnd(&blockReader) ){
                    428:     sqlite_int64 iDocid = readDocid(&blockReader);
                    429:     if( m->in.pDoclist!=NULL ){
                    430:       while( 1 ){
                    431:         if( readerAtEnd(&m->in) ) return;  /* nothing more to merge */
                    432:         if( peekDocid(&m->in)>=iDocid ) break;
                    433:         skipDocument(&m->in);
                    434:       }
                    435:       if( peekDocid(&m->in)>iDocid ){  /* [pIn] has no match with iDocid */
                    436:         skipPositionList(&blockReader);  /* skip this docid in the block */
                    437:         continue;
                    438:       }
                    439:       readDocid(&m->in);
                    440:     }
                    441:     /* We have a document match. */
                    442:     if( m->in.pDoclist==NULL || m->in.pDoclist->iType < DL_POSITIONS ){
                    443:       /* We don't need to do a poslist merge. */
                    444:       docListAddDocid(m->pOut, iDocid);
                    445:       if( m->pOut->iType >= DL_POSITIONS ){
                    446:         /* Copy all positions to the output doclist. */
                    447:         while( 1 ){
                    448:           int pos = readPosition(&blockReader);
                    449:           if( pos==-1 ) break;
                    450:           docListAddPos(m->pOut, pos);
                    451:         }
                    452:         docListAddEndPos(m->pOut);
                    453:       } else skipPositionList(&blockReader);
                    454:       continue;
                    455:     }
                    456:     mergePosList(m, iDocid, &blockReader);
                    457:   }
                    458: }
                    459: 
                    460: static char *string_dup_n(const char *s, int n){
                    461:   char *str = malloc(n + 1);
                    462:   memcpy(str, s, n);
                    463:   str[n] = '\0';
                    464:   return str;
                    465: }
                    466: 
                    467: /* Duplicate a string; the caller must free() the returned string.
                    468:  * (We don't use strdup() since it's not part of the standard C library and
                    469:  * may not be available everywhere.) */
                    470: static char *string_dup(const char *s){
                    471:   return string_dup_n(s, strlen(s));
                    472: }
                    473: 
                    474: /* Format a string, replacing each occurrence of the % character with
                    475:  * zName.  This may be more convenient than sqlite_mprintf()
                    476:  * when one string is used repeatedly in a format string.
                    477:  * The caller must free() the returned string. */
                    478: static char *string_format(const char *zFormat, const char *zName){
                    479:   const char *p;
                    480:   size_t len = 0;
                    481:   size_t nName = strlen(zName);
                    482:   char *result;
                    483:   char *r;
                    484: 
                    485:   /* first compute length needed */
                    486:   for(p = zFormat ; *p ; ++p){
                    487:     len += (*p=='%' ? nName : 1);
                    488:   }
                    489:   len += 1;  /* for null terminator */
                    490: 
                    491:   r = result = malloc(len);
                    492:   for(p = zFormat; *p; ++p){
                    493:     if( *p=='%' ){
                    494:       memcpy(r, zName, nName);
                    495:       r += nName;
                    496:     } else {
                    497:       *r++ = *p;
                    498:     }
                    499:   }
                    500:   *r++ = '\0';
                    501:   assert( r == result + len );
                    502:   return result;
                    503: }
                    504: 
                    505: static int sql_exec(sqlite3 *db, const char *zName, const char *zFormat){
                    506:   char *zCommand = string_format(zFormat, zName);
                    507:   int rc = sqlite3_exec(db, zCommand, NULL, 0, NULL);
                    508:   free(zCommand);
                    509:   return rc;
                    510: }
                    511: 
                    512: static int sql_prepare(sqlite3 *db, const char *zName, sqlite3_stmt **ppStmt,
                    513:                 const char *zFormat){
                    514:   char *zCommand = string_format(zFormat, zName);
                    515:   int rc = sqlite3_prepare(db, zCommand, -1, ppStmt, NULL);
                    516:   free(zCommand);
                    517:   return rc;
                    518: }
                    519: 
                    520: /* end utility functions */
                    521: 
                    522: #define QUERY_GENERIC 0
                    523: #define QUERY_FULLTEXT 1
                    524: 
                    525: #define CHUNK_MAX 1024
                    526: 
                    527: typedef enum fulltext_statement {
                    528:   CONTENT_INSERT_STMT,
                    529:   CONTENT_SELECT_STMT,
                    530:   CONTENT_DELETE_STMT,
                    531: 
                    532:   TERM_SELECT_STMT,
                    533:   TERM_CHUNK_SELECT_STMT,
                    534:   TERM_INSERT_STMT,
                    535:   TERM_UPDATE_STMT,
                    536:   TERM_DELETE_STMT,
                    537: 
                    538:   MAX_STMT                     /* Always at end! */
                    539: } fulltext_statement;
                    540: 
                    541: /* These must exactly match the enum above. */
                    542: /* TODO(adam): Is there some risk that a statement (in particular,
                    543: ** pTermSelectStmt) will be used in two cursors at once, e.g.  if a
                    544: ** query joins a virtual table to itself?  If so perhaps we should
                    545: ** move some of these to the cursor object.
                    546: */
                    547: static const char *fulltext_zStatement[MAX_STMT] = {
                    548:   /* CONTENT_INSERT */ "insert into %_content (rowid, content) values (?, ?)",
                    549:   /* CONTENT_SELECT */ "select content from %_content where rowid = ?",
                    550:   /* CONTENT_DELETE */ "delete from %_content where rowid = ?",
                    551: 
                    552:   /* TERM_SELECT */
                    553:   "select rowid, doclist from %_term where term = ? and first = ?",
                    554:   /* TERM_CHUNK_SELECT */
                    555:   "select max(first) from %_term where term = ? and first <= ?",
                    556:   /* TERM_INSERT */
                    557:   "insert into %_term (term, first, doclist) values (?, ?, ?)",
                    558:   /* TERM_UPDATE */ "update %_term set doclist = ? where rowid = ?",
                    559:   /* TERM_DELETE */ "delete from %_term where rowid = ?",
                    560: };
                    561: 
                    562: typedef struct fulltext_vtab {
                    563:   sqlite3_vtab base;
                    564:   sqlite3 *db;
                    565:   const char *zName;               /* virtual table name */
                    566:   sqlite3_tokenizer *pTokenizer;   /* tokenizer for inserts and queries */
                    567: 
                    568:   /* Precompiled statements which we keep as long as the table is
                    569:   ** open.
                    570:   */
                    571:   sqlite3_stmt *pFulltextStatements[MAX_STMT];
                    572: } fulltext_vtab;
                    573: 
                    574: typedef struct fulltext_cursor {
                    575:   sqlite3_vtab_cursor base;
                    576:   int iCursorType;  /* QUERY_GENERIC or QUERY_FULLTEXT */
                    577: 
                    578:   sqlite3_stmt *pStmt;
                    579: 
                    580:   int eof;
                    581: 
                    582:   /* The following is used only when iCursorType == QUERY_FULLTEXT. */
                    583:   DocListReader result;
                    584: } fulltext_cursor;
                    585: 
                    586: static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){
                    587:   return (fulltext_vtab *) c->base.pVtab;
                    588: }
                    589: 
                    590: static sqlite3_module fulltextModule;   /* forward declaration */
                    591: 
                    592: /* Puts a freshly-prepared statement determined by iStmt in *ppStmt.
                    593: ** If the indicated statement has never been prepared, it is prepared
                    594: ** and cached, otherwise the cached version is reset.
                    595: */
                    596: static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt,
                    597:                              sqlite3_stmt **ppStmt){
                    598:   assert( iStmt<MAX_STMT );
                    599:   if( v->pFulltextStatements[iStmt]==NULL ){
                    600:     int rc = sql_prepare(v->db, v->zName, &v->pFulltextStatements[iStmt],
                    601:                          fulltext_zStatement[iStmt]);
                    602:     if( rc!=SQLITE_OK ) return rc;
                    603:   } else {
                    604:     int rc = sqlite3_reset(v->pFulltextStatements[iStmt]);
                    605:     if( rc!=SQLITE_OK ) return rc;
                    606:   }
                    607: 
                    608:   *ppStmt = v->pFulltextStatements[iStmt];
                    609:   return SQLITE_OK;
                    610: }
                    611: 
                    612: /* Step the indicated statement, handling errors SQLITE_BUSY (by
                    613: ** retrying) and SQLITE_SCHEMA (by re-preparing and transferring
                    614: ** bindings to the new statement).
                    615: ** TODO(adam): We should extend this function so that it can work with
                    616: ** statements declared locally, not only globally cached statements.
                    617: */
                    618: static int sql_step_statement(fulltext_vtab *v, fulltext_statement iStmt,
                    619:                               sqlite3_stmt **ppStmt){
                    620:   int rc;
                    621:   sqlite3_stmt *s = *ppStmt;
                    622:   assert( iStmt<MAX_STMT );
                    623:   assert( s==v->pFulltextStatements[iStmt] );
                    624: 
                    625:   while( (rc=sqlite3_step(s))!=SQLITE_DONE && rc!=SQLITE_ROW ){
                    626:     sqlite3_stmt *pNewStmt;
                    627: 
                    628:     if( rc==SQLITE_BUSY ) continue;
                    629:     if( rc!=SQLITE_ERROR ) return rc;
                    630: 
                    631:     rc = sqlite3_reset(s);
                    632:     if( rc!=SQLITE_SCHEMA ) return SQLITE_ERROR;
                    633: 
                    634:     v->pFulltextStatements[iStmt] = NULL;   /* Still in s */
                    635:     rc = sql_get_statement(v, iStmt, &pNewStmt);
                    636:     if( rc!=SQLITE_OK ) goto err;
                    637:     *ppStmt = pNewStmt;
                    638: 
                    639:     rc = sqlite3_transfer_bindings(s, pNewStmt);
                    640:     if( rc!=SQLITE_OK ) goto err;
                    641: 
                    642:     rc = sqlite3_finalize(s);
                    643:     if( rc!=SQLITE_OK ) return rc;
                    644:     s = pNewStmt;
                    645:   }
                    646:   return rc;
                    647: 
                    648:  err:
                    649:   sqlite3_finalize(s);
                    650:   return rc;
                    651: }
                    652: 
                    653: /* Like sql_step_statement(), but convert SQLITE_DONE to SQLITE_OK.
                    654: ** Useful for statements like UPDATE, where we expect no results.
                    655: */
                    656: static int sql_single_step_statement(fulltext_vtab *v,
                    657:                                      fulltext_statement iStmt,
                    658:                                      sqlite3_stmt **ppStmt){
                    659:   int rc = sql_step_statement(v, iStmt, ppStmt);
                    660:   return (rc==SQLITE_DONE) ? SQLITE_OK : rc;
                    661: }
                    662: 
                    663: /* insert into %_content (rowid, content) values ([rowid], [zContent]) */
                    664: static int content_insert(fulltext_vtab *v, sqlite3_value *rowid,
                    665:                           const char *zContent, int nContent){
                    666:   sqlite3_stmt *s;
                    667:   int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s);
                    668:   if( rc!=SQLITE_OK ) return rc;
                    669: 
                    670:   rc = sqlite3_bind_value(s, 1, rowid);
                    671:   if( rc!=SQLITE_OK ) return rc;
                    672: 
                    673:   rc = sqlite3_bind_text(s, 2, zContent, nContent, SQLITE_STATIC);
                    674:   if( rc!=SQLITE_OK ) return rc;
                    675: 
                    676:   return sql_single_step_statement(v, CONTENT_INSERT_STMT, &s);
                    677: }
                    678: 
                    679: /* select content from %_content where rowid = [iRow]
                    680:  * The caller must delete the returned string. */
                    681: static int content_select(fulltext_vtab *v, sqlite_int64 iRow,
                    682:                           char **pzContent){
                    683:   sqlite3_stmt *s;
                    684:   int rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s);
                    685:   if( rc!=SQLITE_OK ) return rc;
                    686: 
                    687:   rc = sqlite3_bind_int64(s, 1, iRow);
                    688:   if( rc!=SQLITE_OK ) return rc;
                    689: 
                    690:   rc = sql_step_statement(v, CONTENT_SELECT_STMT, &s);
                    691:   if( rc!=SQLITE_ROW ) return rc;
                    692: 
                    693:   *pzContent = string_dup((const char *)sqlite3_column_text(s, 0));
                    694: 
                    695:   /* We expect only one row.  We must execute another sqlite3_step()
                    696:    * to complete the iteration; otherwise the table will remain locked. */
                    697:   rc = sqlite3_step(s);
                    698:   if( rc==SQLITE_DONE ) return SQLITE_OK;
                    699: 
                    700:   free(*pzContent);
                    701:   return rc;
                    702: }
                    703: 
                    704: /* delete from %_content where rowid = [iRow ] */
                    705: static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){
                    706:   sqlite3_stmt *s;
                    707:   int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s);
                    708:   if( rc!=SQLITE_OK ) return rc;
                    709: 
                    710:   rc = sqlite3_bind_int64(s, 1, iRow);
                    711:   if( rc!=SQLITE_OK ) return rc;
                    712: 
                    713:   return sql_single_step_statement(v, CONTENT_DELETE_STMT, &s);
                    714: }
                    715: 
                    716: /* select rowid, doclist from %_term where term = [zTerm] and first = [iFirst]
                    717:  * If found, returns SQLITE_OK; the caller must free the returned doclist.
                    718:  * If no rows found, returns SQLITE_ERROR. */
                    719: static int term_select(fulltext_vtab *v, const char *zTerm, int nTerm,
                    720:                        sqlite_int64 iFirst,
                    721:                        sqlite_int64 *rowid,
                    722:                        DocList *out){
                    723:   sqlite3_stmt *s;
                    724:   int rc = sql_get_statement(v, TERM_SELECT_STMT, &s);
                    725:   if( rc!=SQLITE_OK ) return rc;
                    726: 
                    727:   rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_TRANSIENT);
                    728:   if( rc!=SQLITE_OK ) return rc;
                    729: 
                    730:   rc = sqlite3_bind_int64(s, 2, iFirst);
                    731:   if( rc!=SQLITE_OK ) return rc;
                    732: 
                    733:   rc = sql_step_statement(v, TERM_SELECT_STMT, &s);
                    734:   if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
                    735: 
                    736:   *rowid = sqlite3_column_int64(s, 0);
                    737:   docListInit(out, DL_POSITIONS_OFFSETS,
                    738:               sqlite3_column_blob(s, 1), sqlite3_column_bytes(s, 1));
                    739: 
                    740:   /* We expect only one row.  We must execute another sqlite3_step()
                    741:    * to complete the iteration; otherwise the table will remain locked. */
                    742:   rc = sqlite3_step(s);
                    743:   return rc==SQLITE_DONE ? SQLITE_OK : rc;
                    744: }
                    745: 
                    746: /* select max(first) from %_term where term = [zTerm] and first <= [iFirst]
                    747:  * If found, returns SQLITE_ROW and result in *piResult; if the query returns
                    748:  * NULL (meaning no row found) returns SQLITE_DONE.
                    749:  */
                    750: static int term_chunk_select(fulltext_vtab *v, const char *zTerm, int nTerm,
                    751:                            sqlite_int64 iFirst, sqlite_int64 *piResult){
                    752:   sqlite3_stmt *s;
                    753:   int rc = sql_get_statement(v, TERM_CHUNK_SELECT_STMT, &s);
                    754:   if( rc!=SQLITE_OK ) return rc;
                    755: 
                    756:   rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC);
                    757:   if( rc!=SQLITE_OK ) return rc;
                    758: 
                    759:   rc = sqlite3_bind_int64(s, 2, iFirst);
                    760:   if( rc!=SQLITE_OK ) return rc;
                    761: 
                    762:   rc = sql_step_statement(v, TERM_CHUNK_SELECT_STMT, &s);
                    763:   if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
                    764: 
                    765:   switch( sqlite3_column_type(s, 0) ){
                    766:     case SQLITE_NULL:
                    767:       rc = SQLITE_DONE;
                    768:       break;
                    769:     case SQLITE_INTEGER:
                    770:      *piResult = sqlite3_column_int64(s, 0);
                    771:      break;
                    772:     default:
                    773:       return SQLITE_ERROR;
                    774:   }
                    775:   /* We expect only one row.  We must execute another sqlite3_step()
                    776:    * to complete the iteration; otherwise the table will remain locked. */
                    777:   if( sqlite3_step(s) != SQLITE_DONE ) return SQLITE_ERROR;
                    778:   return rc;
                    779: }
                    780: 
                    781: /* insert into %_term (term, first, doclist)
                    782:                values ([zTerm], [iFirst], [doclist]) */
                    783: static int term_insert(fulltext_vtab *v, const char *zTerm, int nTerm,
                    784:                        sqlite_int64 iFirst, DocList *doclist){
                    785:   sqlite3_stmt *s;
                    786:   int rc = sql_get_statement(v, TERM_INSERT_STMT, &s);
                    787:   if( rc!=SQLITE_OK ) return rc;
                    788: 
                    789:   rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC);
                    790:   if( rc!=SQLITE_OK ) return rc;
                    791: 
                    792:   rc = sqlite3_bind_int64(s, 2, iFirst);
                    793:   if( rc!=SQLITE_OK ) return rc;
                    794: 
                    795:   rc = sqlite3_bind_blob(s, 3, doclist->pData, doclist->nData, SQLITE_STATIC);
                    796:   if( rc!=SQLITE_OK ) return rc;
                    797: 
                    798:   return sql_single_step_statement(v, TERM_INSERT_STMT, &s);
                    799: }
                    800: 
                    801: /* update %_term set doclist = [doclist] where rowid = [rowid] */
                    802: static int term_update(fulltext_vtab *v, sqlite_int64 rowid,
                    803:                        DocList *doclist){
                    804:   sqlite3_stmt *s;
                    805:   int rc = sql_get_statement(v, TERM_UPDATE_STMT, &s);
                    806:   if( rc!=SQLITE_OK ) return rc;
                    807: 
                    808:   rc = sqlite3_bind_blob(s, 1, doclist->pData, doclist->nData,
                    809:                          SQLITE_STATIC);
                    810:   if( rc!=SQLITE_OK ) return rc;
                    811: 
                    812:   rc = sqlite3_bind_int64(s, 2, rowid);
                    813:   if( rc!=SQLITE_OK ) return rc;
                    814: 
                    815:   return sql_single_step_statement(v, TERM_UPDATE_STMT, &s);
                    816: }
                    817: 
                    818: static int term_delete(fulltext_vtab *v, sqlite_int64 rowid){
                    819:   sqlite3_stmt *s;
                    820:   int rc = sql_get_statement(v, TERM_DELETE_STMT, &s);
                    821:   if( rc!=SQLITE_OK ) return rc;
                    822: 
                    823:   rc = sqlite3_bind_int64(s, 1, rowid);
                    824:   if( rc!=SQLITE_OK ) return rc;
                    825: 
                    826:   return sql_single_step_statement(v, TERM_DELETE_STMT, &s);
                    827: }
                    828: 
                    829: static void fulltext_vtab_destroy(fulltext_vtab *v){
                    830:   int iStmt;
                    831: 
                    832:   for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){
                    833:     if( v->pFulltextStatements[iStmt]!=NULL ){
                    834:       sqlite3_finalize(v->pFulltextStatements[iStmt]);
                    835:       v->pFulltextStatements[iStmt] = NULL;
                    836:     }
                    837:   }
                    838: 
                    839:   if( v->pTokenizer!=NULL ){
                    840:     v->pTokenizer->pModule->xDestroy(v->pTokenizer);
                    841:     v->pTokenizer = NULL;
                    842:   }
                    843: 
                    844:   free((void *) v->zName);
                    845:   free(v);
                    846: }
                    847: 
                    848: /* Current interface:
                    849: ** argv[0] - module name
                    850: ** argv[1] - database name
                    851: ** argv[2] - table name
                    852: ** argv[3] - tokenizer name (optional, a sensible default is provided)
                    853: ** argv[4..] - passed to tokenizer (optional based on tokenizer)
                    854: **/
                    855: static int fulltextConnect(sqlite3 *db, void *pAux, int argc, char **argv,
                    856:                            sqlite3_vtab **ppVTab){
                    857:   int rc;
                    858:   fulltext_vtab *v;
                    859:   sqlite3_tokenizer_module *m = NULL;
                    860: 
                    861:   assert( argc>=3 );
                    862:   v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab));
                    863:   /* sqlite will initialize v->base */
                    864:   v->db = db;
                    865:   v->zName = string_dup(argv[2]);
                    866:   v->pTokenizer = NULL;
                    867: 
                    868:   if( argc==3 ){
                    869:     get_simple_tokenizer_module(&m);
                    870:   } else {
                    871:     /* TODO(shess) For now, add new tokenizers as else if clauses. */
                    872:     if( !strcmp(argv[3], "simple") ){
                    873:       get_simple_tokenizer_module(&m);
                    874:     } else {
                    875:       assert( "unrecognized tokenizer"==NULL );
                    876:     }
                    877:   }
                    878: 
                    879:   /* TODO(shess) Since tokenization impacts the index, the parameters
                    880:   ** to the tokenizer need to be identical when a persistent virtual
                    881:   ** table is re-created.  One solution would be a meta-table to track
                    882:   ** such information in the database.  Then we could verify that the
                    883:   ** information is identical on subsequent creates.
                    884:   */
                    885:   /* TODO(shess) Why isn't argv already (const char **)? */
                    886:   rc = m->xCreate(argc-3, (const char **) (argv+3), &v->pTokenizer);
                    887:   if( rc!=SQLITE_OK ) return rc;
                    888:   v->pTokenizer->pModule = m;
                    889: 
                    890:   /* TODO: verify the existence of backing tables foo_content, foo_term */
                    891: 
                    892:   rc = sqlite3_declare_vtab(db, "create table x(content text)");
                    893:   if( rc!=SQLITE_OK ) return rc;
                    894: 
                    895:   memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements));
                    896: 
                    897:   *ppVTab = &v->base;
                    898:   return SQLITE_OK;
                    899: }
                    900: 
                    901: static int fulltextCreate(sqlite3 *db, void *pAux, int argc, char **argv,
                    902:                           sqlite3_vtab **ppVTab){
                    903:   int rc;
                    904:   assert( argc>=3 );
                    905: 
                    906:   /* The %_content table holds the text of each full-text item, with
                    907:   ** the rowid used as the docid.
                    908:   **
                    909:   ** The %_term table maps each term to a document list blob
                    910:   ** containing elements sorted by ascending docid, each element
                    911:   ** encoded as:
                    912:   **
                    913:   **   docid varint-encoded
                    914:   **   token count varint-encoded
                    915:   **   "count" token elements (poslist):
                    916:   **     position varint-encoded as delta from previous position
                    917:   **     start offset varint-encoded as delta from previous start offset
                    918:   **     end offset varint-encoded as delta from start offset
                    919:   **
                    920:   ** Additionally, doclist blobs can be chunked into multiple rows,
                    921:   ** using "first" to order the blobs.  "first" is simply the first
                    922:   ** docid in the blob.
                    923:   */
                    924:   /*
                    925:   ** NOTE(shess) That last sentence is incorrect in the face of
                    926:   ** deletion, which can leave a doclist that doesn't contain the
                    927:   ** first from that row.  I _believe_ this does not matter to the
                    928:   ** operation of the system, but it might be reasonable to update
                    929:   ** appropriately in case this assumption becomes more important.
                    930:   */
                    931:   rc = sql_exec(db, argv[2],
                    932:     "create table %_content(content text);"
                    933:     "create table %_term(term text, first integer, doclist blob);"
                    934:     "create index %_index on %_term(term, first)");
                    935:   if( rc!=SQLITE_OK ) return rc;
                    936: 
                    937:   return fulltextConnect(db, pAux, argc, argv, ppVTab);
                    938: }
                    939: 
                    940: /* Decide how to handle an SQL query.
                    941:  * At the moment, MATCH queries can include implicit boolean ANDs; we
                    942:  * haven't implemented phrase searches or OR yet. */
                    943: static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
                    944:   int i;
                    945: 
                    946:   for(i=0; i<pInfo->nConstraint; ++i){
                    947:     const struct sqlite3_index_constraint *pConstraint;
                    948:     pConstraint = &pInfo->aConstraint[i];
                    949:     if( pConstraint->iColumn==0 &&
                    950:         pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH &&
                    951:         pConstraint->usable ){   /* a full-text search */
                    952:       pInfo->aConstraintUsage[i].argvIndex = 1;
                    953:       pInfo->aConstraintUsage[i].omit = 1;
                    954:       pInfo->idxNum = QUERY_FULLTEXT;
                    955:       pInfo->estimatedCost = 1.0;   /* an arbitrary value for now */
                    956:       return SQLITE_OK;
                    957:     }
                    958:   }
                    959:   pInfo->idxNum = QUERY_GENERIC;
                    960:   return SQLITE_OK;
                    961: }
                    962: 
                    963: static int fulltextDisconnect(sqlite3_vtab *pVTab){
                    964:   fulltext_vtab_destroy((fulltext_vtab *)pVTab);
                    965:   return SQLITE_OK;
                    966: }
                    967: 
                    968: static int fulltextDestroy(sqlite3_vtab *pVTab){
                    969:   fulltext_vtab *v = (fulltext_vtab *)pVTab;
                    970: 
                    971:   int rc = sql_exec(v->db, v->zName,
                    972:                     "drop table %_content; drop table %_term");
                    973:   if( rc!=SQLITE_OK ) return rc;
                    974: 
                    975:   fulltext_vtab_destroy((fulltext_vtab *)pVTab);
                    976:   return SQLITE_OK;
                    977: }
                    978: 
                    979: static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
                    980:   fulltext_cursor *c;
                    981: 
                    982:   c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1);
                    983:   /* sqlite will initialize c->base */
                    984:   *ppCursor = &c->base;
                    985: 
                    986:   return SQLITE_OK;
                    987: }
                    988: 
                    989: static int fulltextClose(sqlite3_vtab_cursor *pCursor){
                    990:   fulltext_cursor *c = (fulltext_cursor *) pCursor;
                    991:   sqlite3_finalize(c->pStmt);
                    992:   if( c->result.pDoclist!=NULL ){
                    993:     docListDelete(c->result.pDoclist);
                    994:   }
                    995:   free(c);
                    996:   return SQLITE_OK;
                    997: }
                    998: 
                    999: static int fulltextNext(sqlite3_vtab_cursor *pCursor){
                   1000:   fulltext_cursor *c = (fulltext_cursor *) pCursor;
                   1001:   sqlite_int64 iDocid;
                   1002:   int rc;
                   1003: 
                   1004:   switch( c->iCursorType ){
                   1005:     case QUERY_GENERIC:
                   1006:       /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
                   1007:       rc = sqlite3_step(c->pStmt);
                   1008:       switch( rc ){
                   1009:         case SQLITE_ROW:
                   1010:           c->eof = 0;
                   1011:           return SQLITE_OK;
                   1012:         case SQLITE_DONE:
                   1013:           c->eof = 1;
                   1014:           return SQLITE_OK;
                   1015:         default:
                   1016:           c->eof = 1;
                   1017:           return rc;
                   1018:       }
                   1019:     case QUERY_FULLTEXT:
                   1020:       rc = sqlite3_reset(c->pStmt);
                   1021:       if( rc!=SQLITE_OK ) return rc;
                   1022: 
                   1023:       if( readerAtEnd(&c->result)){
                   1024:         c->eof = 1;
                   1025:         return SQLITE_OK;
                   1026:       }
                   1027:       iDocid = readDocid(&c->result);
                   1028:       rc = sqlite3_bind_int64(c->pStmt, 1, iDocid);
                   1029:       if( rc!=SQLITE_OK ) return rc;
                   1030:       /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
                   1031:       rc = sqlite3_step(c->pStmt);
                   1032:       if( rc==SQLITE_ROW ){   /* the case we expect */
                   1033:         c->eof = 0;
                   1034:         return SQLITE_OK;
                   1035:       }
                   1036:       /* an error occurred; abort */
                   1037:       return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
                   1038:     default:
                   1039:       assert( 0 );
                   1040:       return SQLITE_ERROR;  /* not reached */
                   1041:   }
                   1042: }
                   1043: 
                   1044: static int term_select_doclist(fulltext_vtab *v, const char *pTerm, int nTerm,
                   1045:                                sqlite3_stmt **ppStmt){
                   1046:   int rc;
                   1047:   if( *ppStmt ){
                   1048:     rc = sqlite3_reset(*ppStmt);
                   1049:   } else {
                   1050:     rc = sql_prepare(v->db, v->zName, ppStmt,
                   1051:       "select doclist from %_term where term = ? order by first");
                   1052:   }
                   1053:   if( rc!=SQLITE_OK ) return rc;
                   1054: 
                   1055:   rc = sqlite3_bind_text(*ppStmt, 1, pTerm, nTerm, SQLITE_TRANSIENT);
                   1056:   if( rc!=SQLITE_OK ) return rc;
                   1057: 
                   1058:   return sqlite3_step(*ppStmt);   /* TODO(adamd): handle schema error */
                   1059: }
                   1060: 
                   1061: /* Read the posting list for [zTerm]; AND it with the doclist [in] to
                   1062:  * produce the doclist [out], using the given offset [iOffset] for phrase
                   1063:  * matching.
                   1064:  * (*pSelect) is used to hold an SQLite statement used inside this function;
                   1065:  * the caller should initialize *pSelect to NULL before the first call.
                   1066:  */
                   1067: static int query_merge(fulltext_vtab *v, sqlite3_stmt **pSelect,
                   1068:                        const char *zTerm,
                   1069:                        DocList *pIn, int iOffset, DocList *out){
                   1070:   int rc;
                   1071:   DocListMerge merge;
                   1072: 
                   1073:   if( pIn!=NULL && !pIn->nData ){
                   1074:     /* If [pIn] is already empty, there's no point in reading the
                   1075:      * posting list to AND it in; return immediately. */
                   1076:       return SQLITE_OK;
                   1077:   }
                   1078: 
                   1079:   rc = term_select_doclist(v, zTerm, -1, pSelect);
                   1080:   if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ) return rc;
                   1081: 
                   1082:   mergeInit(&merge, pIn, iOffset, out);
                   1083:   while( rc==SQLITE_ROW ){
                   1084:     DocList block;
                   1085:     docListInit(&block, DL_POSITIONS_OFFSETS,
                   1086:                 sqlite3_column_blob(*pSelect, 0),
                   1087:                 sqlite3_column_bytes(*pSelect, 0));
                   1088:     mergeBlock(&merge, &block);
                   1089:     docListDestroy(&block);
                   1090: 
                   1091:     rc = sqlite3_step(*pSelect);
                   1092:     if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ){
                   1093:       return rc;
                   1094:     }
                   1095:   }
                   1096:   
                   1097:   return SQLITE_OK;
                   1098: }
                   1099: 
                   1100: typedef struct QueryTerm {
                   1101:   int is_phrase;    /* true if this term begins a new phrase */
                   1102:   const char *zTerm;
                   1103: } QueryTerm;
                   1104: 
                   1105: /* A parsed query.
                   1106:  *
                   1107:  * As an example, parsing the query ["four score" years "new nation"] will
                   1108:  * yield a Query with 5 terms:
                   1109:  *   "four",   is_phrase = 1
                   1110:  *   "score",  is_phrase = 0
                   1111:  *   "years",  is_phrase = 1
                   1112:  *   "new",    is_phrase = 1
                   1113:  *   "nation", is_phrase = 0
                   1114:  */
                   1115: typedef struct Query {
                   1116:   int nTerms;
                   1117:   QueryTerm *pTerm;
                   1118: } Query;
                   1119: 
                   1120: static void query_add(Query *q, int is_phrase, const char *zTerm){
                   1121:   QueryTerm *t;
                   1122:   ++q->nTerms;
                   1123:   q->pTerm = realloc(q->pTerm, q->nTerms * sizeof(q->pTerm[0]));
                   1124:   t = &q->pTerm[q->nTerms - 1];
                   1125:   t->is_phrase = is_phrase;
                   1126:   t->zTerm = zTerm;
                   1127: }
                   1128:     
                   1129: static void query_free(Query *q){
                   1130:   int i;
                   1131:   for(i = 0; i < q->nTerms; ++i){
                   1132:     free((void *) q->pTerm[i].zTerm);
                   1133:   }
                   1134:   free(q->pTerm);
                   1135: }
                   1136: 
                   1137: static int tokenize_segment(sqlite3_tokenizer *pTokenizer,
                   1138:                             const char *zQuery, int in_phrase,
                   1139:                             Query *pQuery){
                   1140:   sqlite3_tokenizer_module *pModule = pTokenizer->pModule;
                   1141:   sqlite3_tokenizer_cursor *pCursor;
                   1142:   int is_first = 1;
                   1143:   
                   1144:   int rc = pModule->xOpen(pTokenizer, zQuery, -1, &pCursor);
                   1145:   if( rc!=SQLITE_OK ) return rc;
                   1146:   pCursor->pTokenizer = pTokenizer;
                   1147: 
                   1148:   while( 1 ){
                   1149:     const char *zToken;
                   1150:     int nToken, iStartOffset, iEndOffset, dummy_pos;
                   1151: 
                   1152:     rc = pModule->xNext(pCursor,
                   1153:                         &zToken, &nToken,
                   1154:                         &iStartOffset, &iEndOffset,
                   1155:                         &dummy_pos);
                   1156:     if( rc!=SQLITE_OK ) break;
                   1157:     query_add(pQuery, !in_phrase || is_first, string_dup_n(zToken, nToken));
                   1158:     is_first = 0;
                   1159:   }
                   1160: 
                   1161:   return pModule->xClose(pCursor);
                   1162: }
                   1163: 
                   1164: /* Parse a query string, yielding a Query object. */
                   1165: static int parse_query(fulltext_vtab *v, const char *zQuery, Query *pQuery){
                   1166:   char *zQuery1 = string_dup(zQuery);
                   1167:   int in_phrase = 0;
                   1168:   char *s = zQuery1;
                   1169:   pQuery->nTerms = 0;
                   1170:   pQuery->pTerm = NULL;
                   1171: 
                   1172:   while( *s ){
                   1173:     char *t = s;
                   1174:     while( *t ){
                   1175:       if( *t=='"' ){
                   1176:         *t++ = '\0';
                   1177:         break;
                   1178:       }
                   1179:       ++t;
                   1180:     }
                   1181:     if( *s ){
                   1182:       tokenize_segment(v->pTokenizer, s, in_phrase, pQuery);
                   1183:     }
                   1184:     s = t;
                   1185:     in_phrase = !in_phrase;
                   1186:   }
                   1187:   
                   1188:   free(zQuery1);
                   1189:   return SQLITE_OK;
                   1190: }
                   1191: 
                   1192: /* Perform a full-text query; return a list of documents in [pResult]. */
                   1193: static int fulltext_query(fulltext_vtab *v, const char *zQuery,
                   1194:                           DocList **pResult){
                   1195:   Query q;
                   1196:   int phrase_start = -1;
                   1197:   int i;
                   1198:   sqlite3_stmt *pSelect = NULL;
                   1199:   DocList *d = NULL;
                   1200: 
                   1201:   int rc = parse_query(v, zQuery, &q);
                   1202:   if( rc!=SQLITE_OK ) return rc;
                   1203: 
                   1204:   /* Merge terms. */
                   1205:   for(i = 0 ; i < q.nTerms ; ++i){
                   1206:     /* In each merge step, we need to generate positions whenever we're
                   1207:      * processing a phrase which hasn't ended yet. */
                   1208:     int need_positions = i<q.nTerms-1 && !q.pTerm[i+1].is_phrase;
                   1209:     DocList *next = docListNew(need_positions ? DL_POSITIONS : DL_DOCIDS);
                   1210:     if( q.pTerm[i].is_phrase ){
                   1211:       phrase_start = i;
                   1212:     }
                   1213:     rc = query_merge(v, &pSelect, q.pTerm[i].zTerm, d, i - phrase_start, next);
                   1214:     if( rc!=SQLITE_OK ) break;
                   1215:     if( d!=NULL ){
                   1216:       docListDelete(d);
                   1217:     }
                   1218:     d = next;
                   1219:   }
                   1220: 
                   1221:   sqlite3_finalize(pSelect);
                   1222:   query_free(&q);
                   1223:   *pResult = d;
                   1224:   return rc;
                   1225: }
                   1226: 
                   1227: static int fulltextFilter(sqlite3_vtab_cursor *pCursor,
                   1228:                           int idxNum, const char *idxStr,
                   1229:                           int argc, sqlite3_value **argv){
                   1230:   fulltext_cursor *c = (fulltext_cursor *) pCursor;
                   1231:   fulltext_vtab *v = cursor_vtab(c);
                   1232:   int rc;
                   1233:   const char *zStatement;
                   1234: 
                   1235:   c->iCursorType = idxNum;
                   1236:   switch( idxNum ){
                   1237:     case QUERY_GENERIC:
                   1238:       zStatement = "select rowid, content from %_content";
                   1239:       break;
                   1240: 
                   1241:     case QUERY_FULLTEXT:   /* full-text search */
                   1242:     {
                   1243:       const char *zQuery = (const char *)sqlite3_value_text(argv[0]);
                   1244:       DocList *pResult;
                   1245:       assert( argc==1 );
                   1246:       rc = fulltext_query(v, zQuery, &pResult);
                   1247:       if( rc!=SQLITE_OK ) return rc;
                   1248:       readerInit(&c->result, pResult);
                   1249:       zStatement = "select rowid, content from %_content where rowid = ?";
                   1250:       break;
                   1251:     }
                   1252: 
                   1253:     default:
                   1254:       assert( 0 );
                   1255:   }
                   1256: 
                   1257:   rc = sql_prepare(v->db, v->zName, &c->pStmt, zStatement);
                   1258:   if( rc!=SQLITE_OK ) return rc;
                   1259: 
                   1260:   return fulltextNext(pCursor);
                   1261: }
                   1262: 
                   1263: static int fulltextEof(sqlite3_vtab_cursor *pCursor){
                   1264:   fulltext_cursor *c = (fulltext_cursor *) pCursor;
                   1265:   return c->eof;
                   1266: }
                   1267: 
                   1268: static int fulltextColumn(sqlite3_vtab_cursor *pCursor,
                   1269:                           sqlite3_context *pContext, int idxCol){
                   1270:   fulltext_cursor *c = (fulltext_cursor *) pCursor;
                   1271:   const char *s;
                   1272: 
                   1273:   assert( idxCol==0 );
                   1274:   s = (const char *) sqlite3_column_text(c->pStmt, 1);
                   1275:   sqlite3_result_text(pContext, s, -1, SQLITE_TRANSIENT);
                   1276: 
                   1277:   return SQLITE_OK;
                   1278: }
                   1279: 
                   1280: static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){
                   1281:   fulltext_cursor *c = (fulltext_cursor *) pCursor;
                   1282: 
                   1283:   *pRowid = sqlite3_column_int64(c->pStmt, 0);
                   1284:   return SQLITE_OK;
                   1285: }
                   1286: 
                   1287: /* Build a hash table containing all terms in zText. */
                   1288: static int build_terms(Hash *terms, sqlite3_tokenizer *pTokenizer,
                   1289:                        const char *zText, sqlite_int64 iDocid){
                   1290:   sqlite3_tokenizer_cursor *pCursor;
                   1291:   const char *pToken;
                   1292:   int nTokenBytes;
                   1293:   int iStartOffset, iEndOffset, iPosition;
                   1294: 
                   1295:   int rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor);
                   1296:   if( rc!=SQLITE_OK ) return rc;
                   1297: 
                   1298:   pCursor->pTokenizer = pTokenizer;
                   1299:   HashInit(terms, HASH_STRING, 1);
                   1300:   while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor,
                   1301:                                                &pToken, &nTokenBytes,
                   1302:                                                &iStartOffset, &iEndOffset,
                   1303:                                                &iPosition) ){
                   1304:     DocList *p;
                   1305: 
                   1306:     /* Positions can't be negative; we use -1 as a terminator internally. */
                   1307:     if( iPosition<0 ) {
                   1308:       rc = SQLITE_ERROR;  
                   1309:       goto err;
                   1310:     }
                   1311: 
                   1312:     p = HashFind(terms, pToken, nTokenBytes);
                   1313:     if( p==NULL ){
                   1314:       p = docListNew(DL_POSITIONS_OFFSETS);
                   1315:       docListAddDocid(p, iDocid);
                   1316:       HashInsert(terms, pToken, nTokenBytes, p);
                   1317:     }
                   1318:     docListAddPosOffset(p, iPosition, iStartOffset, iEndOffset);
                   1319:   }
                   1320: 
                   1321: err:
                   1322:   /* TODO(shess) Check return?  Should this be able to cause errors at
                   1323:   ** this point?  Actually, same question about sqlite3_finalize(),
                   1324:   ** though one could argue that failure there means that the data is
                   1325:   ** not durable.  *ponder*
                   1326:   */
                   1327:   pTokenizer->pModule->xClose(pCursor);
                   1328:   return rc;
                   1329: }
                   1330: /* Update the %_terms table to map the term [zTerm] to the given rowid. */
                   1331: static int index_insert_term(fulltext_vtab *v, const char *zTerm, int nTerm,
                   1332:                              sqlite_int64 iDocid, DocList *p){
                   1333:   sqlite_int64 iFirst;
                   1334:   sqlite_int64 iIndexRow;
                   1335:   DocList doclist;
                   1336: 
                   1337:   int rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iFirst);
                   1338:   if( rc==SQLITE_DONE ){
                   1339:     docListInit(&doclist, DL_POSITIONS_OFFSETS, 0, 0);
                   1340:     if( docListUpdate(&doclist, iDocid, p) ){
                   1341:       rc = term_insert(v, zTerm, nTerm, iDocid, &doclist);
                   1342:       docListDestroy(&doclist);
                   1343:       return rc;
                   1344:     }
                   1345:     return SQLITE_OK;
                   1346:   }
                   1347:   if( rc!=SQLITE_ROW ) return SQLITE_ERROR;
                   1348: 
                   1349:   /* This word is in the index; add this document ID to its blob. */
                   1350: 
                   1351:   rc = term_select(v, zTerm, nTerm, iFirst, &iIndexRow, &doclist);
                   1352:   if( rc!=SQLITE_OK ) return rc;
                   1353: 
                   1354:   if( docListUpdate(&doclist, iDocid, p) ){
                   1355:     /* If the blob is too big, split it in half. */
                   1356:     if( doclist.nData>CHUNK_MAX ){
                   1357:       DocList half;
                   1358:       if( docListSplit(&doclist, &half) ){
                   1359:         rc = term_insert(v, zTerm, nTerm, firstDocid(&half), &half);
                   1360:         docListDestroy(&half);
                   1361:         if( rc!=SQLITE_OK ) goto err;
                   1362:       }
                   1363:     }
                   1364:     rc = term_update(v, iIndexRow, &doclist);
                   1365:   }
                   1366: 
                   1367: err:
                   1368:   docListDestroy(&doclist);
                   1369:   return rc;
                   1370: }
                   1371: 
                   1372: /* Insert a row into the full-text index; set *piRowid to be the ID of the
                   1373:  * new row. */
                   1374: static int index_insert(fulltext_vtab *v,
                   1375:                         sqlite3_value *pRequestRowid, const char *zText,
                   1376:                         sqlite_int64 *piRowid){
                   1377:   Hash terms;  /* maps term string -> PosList */
                   1378:   HashElem *e;
                   1379: 
                   1380:   int rc = content_insert(v, pRequestRowid, zText, -1);
                   1381:   if( rc!=SQLITE_OK ) return rc;
                   1382:   *piRowid = sqlite3_last_insert_rowid(v->db);
                   1383: 
                   1384:   if( !zText ) return SQLITE_OK;   /* nothing to index */
                   1385: 
                   1386:   rc = build_terms(&terms, v->pTokenizer, zText, *piRowid);
                   1387:   if( rc!=SQLITE_OK ) return rc;
                   1388: 
                   1389:   for(e=HashFirst(&terms); e; e=HashNext(e)){
                   1390:     DocList *p = HashData(e);
                   1391:     rc = index_insert_term(v, HashKey(e), HashKeysize(e), *piRowid, p);
                   1392:     if( rc!=SQLITE_OK ) break;
                   1393:   }
                   1394: 
                   1395:   for(e=HashFirst(&terms); e; e=HashNext(e)){
                   1396:     DocList *p = HashData(e);
                   1397:     docListDelete(p);
                   1398:   }
                   1399:   HashClear(&terms);
                   1400:   return rc;
                   1401: }
                   1402: 
                   1403: static int index_delete_term(fulltext_vtab *v, const char *zTerm, int nTerm,
                   1404:                              sqlite_int64 iDocid){
                   1405:   sqlite_int64 iFirst;
                   1406:   sqlite_int64 iIndexRow;
                   1407:   DocList doclist;
                   1408: 
                   1409:   int rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iFirst);
                   1410:   if( rc!=SQLITE_ROW ) return SQLITE_ERROR;
                   1411: 
                   1412:   rc = term_select(v, zTerm, nTerm, iFirst, &iIndexRow, &doclist);
                   1413:   if( rc!=SQLITE_OK ) return rc;
                   1414: 
                   1415:   if( docListUpdate(&doclist, iDocid, NULL) ){
                   1416:     if( doclist.nData>0 ){
                   1417:       rc = term_update(v, iIndexRow, &doclist);
                   1418:     } else {  /* empty posting list */
                   1419:       rc = term_delete(v, iIndexRow);
                   1420:     }
                   1421:   }
                   1422:   docListDestroy(&doclist);
                   1423:   return rc;
                   1424: }
                   1425: 
                   1426: /* Delete a row from the full-text index. */
                   1427: static int index_delete(fulltext_vtab *v, sqlite_int64 iRow){
                   1428:   char *zText;
                   1429:   Hash terms;
                   1430:   HashElem *e;
                   1431: 
                   1432:   int rc = content_select(v, iRow, &zText);
                   1433:   if( rc!=SQLITE_OK ) return rc;
                   1434: 
                   1435:   rc = build_terms(&terms, v->pTokenizer, zText, iRow);
                   1436:   free(zText);
                   1437:   if( rc!=SQLITE_OK ) return rc;
                   1438: 
                   1439:   for(e=HashFirst(&terms); e; e=HashNext(e)){
                   1440:     rc = index_delete_term(v, HashKey(e), HashKeysize(e), iRow);
                   1441:     if( rc!=SQLITE_OK ) break;
                   1442:   }
                   1443:   for(e=HashFirst(&terms); e; e=HashNext(e)){
                   1444:     DocList *p = HashData(e);
                   1445:     docListDelete(p);
                   1446:   }
                   1447:   HashClear(&terms);
                   1448: 
                   1449:   return content_delete(v, iRow);
                   1450: }
                   1451: 
                   1452: static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg,
                   1453:                    sqlite_int64 *pRowid){
                   1454:   fulltext_vtab *v = (fulltext_vtab *) pVtab;
                   1455: 
                   1456:   if( nArg<2 ){
                   1457:     return index_delete(v, sqlite3_value_int64(ppArg[0]));
                   1458:   }
                   1459: 
                   1460:   if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){
                   1461:     return SQLITE_ERROR;   /* an update; not yet supported */
                   1462:   }
                   1463: 
                   1464:   assert( nArg==3 );    /* ppArg[1] = rowid, ppArg[2] = content */
                   1465:   return index_insert(v, ppArg[1],
                   1466:                       (const char *)sqlite3_value_text(ppArg[2]), pRowid);
                   1467: }
                   1468: 
                   1469: static sqlite3_module fulltextModule = {
                   1470:   0,
                   1471:   fulltextCreate,
                   1472:   fulltextConnect,
                   1473:   fulltextBestIndex,
                   1474:   fulltextDisconnect,
                   1475:   fulltextDestroy,
                   1476:   fulltextOpen,
                   1477:   fulltextClose,
                   1478:   fulltextFilter,
                   1479:   fulltextNext,
                   1480:   fulltextEof,
                   1481:   fulltextColumn,
                   1482:   fulltextRowid,
                   1483:   fulltextUpdate
                   1484: };
                   1485: 
                   1486: int fulltext_init(sqlite3 *db){
                   1487:  return sqlite3_create_module(db, "fulltext", &fulltextModule, 0);
                   1488: }
                   1489: 
                   1490: #if !SQLITE_CORE
                   1491: int sqlite3_extension_init(sqlite3 *db, char **pzErrMsg,
                   1492:                            const sqlite3_api_routines *pApi){
                   1493:  SQLITE_EXTENSION_INIT2(pApi)
                   1494:  return fulltext_init(db);
                   1495: }
                   1496: #endif

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>