Annotation of embedaddon/sqlite3/ext/fts2/fts2.c, revision 1.1.1.1

1.1       misho       1: /* fts2 has a design flaw which can lead to database corruption (see
                      2: ** below).  It is recommended not to use it any longer, instead use
                      3: ** fts3 (or higher).  If you believe that your use of fts2 is safe,
                      4: ** add -DSQLITE_ENABLE_BROKEN_FTS2=1 to your CFLAGS.
                      5: */
                      6: #if (!defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)) \
                      7:         && !defined(SQLITE_ENABLE_BROKEN_FTS2)
                      8: #error fts2 has a design flaw and has been deprecated.
                      9: #endif
                     10: /* The flaw is that fts2 uses the content table's unaliased rowid as
                     11: ** the unique docid.  fts2 embeds the rowid in the index it builds,
                     12: ** and expects the rowid to not change.  The SQLite VACUUM operation
                     13: ** will renumber such rowids, thereby breaking fts2.  If you are using
                     14: ** fts2 in a system which has disabled VACUUM, then you can continue
                     15: ** to use it safely.  Note that PRAGMA auto_vacuum does NOT disable
                     16: ** VACUUM, though systems using auto_vacuum are unlikely to invoke
                     17: ** VACUUM.
                     18: **
                     19: ** Unlike fts1, which is safe across VACUUM if you never delete
                     20: ** documents, fts2 has a second exposure to this flaw, in the segments
                     21: ** table.  So fts2 should be considered unsafe across VACUUM in all
                     22: ** cases.
                     23: */
                     24: 
                     25: /*
                     26: ** 2006 Oct 10
                     27: **
                     28: ** The author disclaims copyright to this source code.  In place of
                     29: ** a legal notice, here is a blessing:
                     30: **
                     31: **    May you do good and not evil.
                     32: **    May you find forgiveness for yourself and forgive others.
                     33: **    May you share freely, never taking more than you give.
                     34: **
                     35: ******************************************************************************
                     36: **
                     37: ** This is an SQLite module implementing full-text search.
                     38: */
                     39: 
                     40: /*
                     41: ** The code in this file is only compiled if:
                     42: **
                     43: **     * The FTS2 module is being built as an extension
                     44: **       (in which case SQLITE_CORE is not defined), or
                     45: **
                     46: **     * The FTS2 module is being built into the core of
                     47: **       SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
                     48: */
                     49: 
                     50: /* TODO(shess) Consider exporting this comment to an HTML file or the
                     51: ** wiki.
                     52: */
                     53: /* The full-text index is stored in a series of b+tree (-like)
                     54: ** structures called segments which map terms to doclists.  The
                     55: ** structures are like b+trees in layout, but are constructed from the
                     56: ** bottom up in optimal fashion and are not updatable.  Since trees
                     57: ** are built from the bottom up, things will be described from the
                     58: ** bottom up.
                     59: **
                     60: **
                     61: **** Varints ****
                     62: ** The basic unit of encoding is a variable-length integer called a
                     63: ** varint.  We encode variable-length integers in little-endian order
                      64: ** using seven bits per byte as follows:
                     65: **
                     66: ** KEY:
                     67: **         A = 0xxxxxxx    7 bits of data and one flag bit
                     68: **         B = 1xxxxxxx    7 bits of data and one flag bit
                     69: **
                     70: **  7 bits - A
                     71: ** 14 bits - BA
                     72: ** 21 bits - BBA
                     73: ** and so on.
                     74: **
                     75: ** This is identical to how sqlite encodes varints (see util.c).
                     76: **
                     77: **
                     78: **** Document lists ****
                     79: ** A doclist (document list) holds a docid-sorted list of hits for a
                     80: ** given term.  Doclists hold docids, and can optionally associate
                     81: ** token positions and offsets with docids.
                     82: **
                     83: ** A DL_POSITIONS_OFFSETS doclist is stored like this:
                     84: **
                     85: ** array {
                     86: **   varint docid;
                     87: **   array {                (position list for column 0)
                     88: **     varint position;     (delta from previous position plus POS_BASE)
                     89: **     varint startOffset;  (delta from previous startOffset)
                     90: **     varint endOffset;    (delta from startOffset)
                     91: **   }
                     92: **   array {
                     93: **     varint POS_COLUMN;   (marks start of position list for new column)
                     94: **     varint column;       (index of new column)
                     95: **     array {
                     96: **       varint position;   (delta from previous position plus POS_BASE)
                     97: **       varint startOffset;(delta from previous startOffset)
                     98: **       varint endOffset;  (delta from startOffset)
                     99: **     }
                    100: **   }
                     101: **   varint POS_END;        (marks end of positions for this document.)
                    102: ** }
                    103: **
                    104: ** Here, array { X } means zero or more occurrences of X, adjacent in
                    105: ** memory.  A "position" is an index of a token in the token stream
                    106: ** generated by the tokenizer, while an "offset" is a byte offset,
                    107: ** both based at 0.  Note that POS_END and POS_COLUMN occur in the
                     108: ** same logical place as the position element, and act as sentinels
                    109: ** ending a position list array.
                    110: **
                    111: ** A DL_POSITIONS doclist omits the startOffset and endOffset
                    112: ** information.  A DL_DOCIDS doclist omits both the position and
                    113: ** offset information, becoming an array of varint-encoded docids.
                    114: **
                    115: ** On-disk data is stored as type DL_DEFAULT, so we don't serialize
                    116: ** the type.  Due to how deletion is implemented in the segmentation
                    117: ** system, on-disk doclists MUST store at least positions.
                    118: **
                    119: **
                    120: **** Segment leaf nodes ****
                    121: ** Segment leaf nodes store terms and doclists, ordered by term.  Leaf
                    122: ** nodes are written using LeafWriter, and read using LeafReader (to
                    123: ** iterate through a single leaf node's data) and LeavesReader (to
                    124: ** iterate through a segment's entire leaf layer).  Leaf nodes have
                    125: ** the format:
                    126: **
                    127: ** varint iHeight;             (height from leaf level, always 0)
                    128: ** varint nTerm;               (length of first term)
                    129: ** char pTerm[nTerm];          (content of first term)
                    130: ** varint nDoclist;            (length of term's associated doclist)
                    131: ** char pDoclist[nDoclist];    (content of doclist)
                    132: ** array {
                    133: **                             (further terms are delta-encoded)
                    134: **   varint nPrefix;           (length of prefix shared with previous term)
                    135: **   varint nSuffix;           (length of unshared suffix)
                    136: **   char pTermSuffix[nSuffix];(unshared suffix of next term)
                    137: **   varint nDoclist;          (length of term's associated doclist)
                    138: **   char pDoclist[nDoclist];  (content of doclist)
                    139: ** }
                    140: **
                    141: ** Here, array { X } means zero or more occurrences of X, adjacent in
                    142: ** memory.
                    143: **
                    144: ** Leaf nodes are broken into blocks which are stored contiguously in
                    145: ** the %_segments table in sorted order.  This means that when the end
                    146: ** of a node is reached, the next term is in the node with the next
                    147: ** greater node id.
                    148: **
                    149: ** New data is spilled to a new leaf node when the current node
                    150: ** exceeds LEAF_MAX bytes (default 2048).  New data which itself is
                    151: ** larger than STANDALONE_MIN (default 1024) is placed in a standalone
                    152: ** node (a leaf node with a single term and doclist).  The goal of
                    153: ** these settings is to pack together groups of small doclists while
                    154: ** making it efficient to directly access large doclists.  The
                    155: ** assumption is that large doclists represent terms which are more
                    156: ** likely to be query targets.
                    157: **
                    158: ** TODO(shess) It may be useful for blocking decisions to be more
                    159: ** dynamic.  For instance, it may make more sense to have a 2.5k leaf
                    160: ** node rather than splitting into 2k and .5k nodes.  My intuition is
                    161: ** that this might extend through 2x or 4x the pagesize.
                    162: **
                    163: **
                    164: **** Segment interior nodes ****
                    165: ** Segment interior nodes store blockids for subtree nodes and terms
                     166: ** to describe what data is stored by each subtree.  Interior
                    167: ** nodes are written using InteriorWriter, and read using
                    168: ** InteriorReader.  InteriorWriters are created as needed when
                    169: ** SegmentWriter creates new leaf nodes, or when an interior node
                    170: ** itself grows too big and must be split.  The format of interior
                    171: ** nodes:
                    172: **
                    173: ** varint iHeight;           (height from leaf level, always >0)
                    174: ** varint iBlockid;          (block id of node's leftmost subtree)
                    175: ** optional {
                    176: **   varint nTerm;           (length of first term)
                    177: **   char pTerm[nTerm];      (content of first term)
                    178: **   array {
                    179: **                                (further terms are delta-encoded)
                    180: **     varint nPrefix;            (length of shared prefix with previous term)
                    181: **     varint nSuffix;            (length of unshared suffix)
                    182: **     char pTermSuffix[nSuffix]; (unshared suffix of next term)
                    183: **   }
                    184: ** }
                    185: **
                    186: ** Here, optional { X } means an optional element, while array { X }
                    187: ** means zero or more occurrences of X, adjacent in memory.
                    188: **
                    189: ** An interior node encodes n terms separating n+1 subtrees.  The
                    190: ** subtree blocks are contiguous, so only the first subtree's blockid
                    191: ** is encoded.  The subtree at iBlockid will contain all terms less
                    192: ** than the first term encoded (or all terms if no term is encoded).
                    193: ** Otherwise, for terms greater than or equal to pTerm[i] but less
                    194: ** than pTerm[i+1], the subtree for that term will be rooted at
                    195: ** iBlockid+i.  Interior nodes only store enough term data to
                    196: ** distinguish adjacent children (if the rightmost term of the left
                    197: ** child is "something", and the leftmost term of the right child is
                    198: ** "wicked", only "w" is stored).
                    199: **
                    200: ** New data is spilled to a new interior node at the same height when
                    201: ** the current node exceeds INTERIOR_MAX bytes (default 2048).
                    202: ** INTERIOR_MIN_TERMS (default 7) keeps large terms from monopolizing
                    203: ** interior nodes and making the tree too skinny.  The interior nodes
                    204: ** at a given height are naturally tracked by interior nodes at
                    205: ** height+1, and so on.
                    206: **
                    207: **
                    208: **** Segment directory ****
                    209: ** The segment directory in table %_segdir stores meta-information for
                    210: ** merging and deleting segments, and also the root node of the
                    211: ** segment's tree.
                    212: **
                    213: ** The root node is the top node of the segment's tree after encoding
                    214: ** the entire segment, restricted to ROOT_MAX bytes (default 1024).
                    215: ** This could be either a leaf node or an interior node.  If the top
                    216: ** node requires more than ROOT_MAX bytes, it is flushed to %_segments
                    217: ** and a new root interior node is generated (which should always fit
                    218: ** within ROOT_MAX because it only needs space for 2 varints, the
                    219: ** height and the blockid of the previous root).
                    220: **
                    221: ** The meta-information in the segment directory is:
                    222: **   level               - segment level (see below)
                    223: **   idx                 - index within level
                    224: **                       - (level,idx uniquely identify a segment)
                    225: **   start_block         - first leaf node
                    226: **   leaves_end_block    - last leaf node
                    227: **   end_block           - last block (including interior nodes)
                    228: **   root                - contents of root node
                    229: **
                    230: ** If the root node is a leaf node, then start_block,
                    231: ** leaves_end_block, and end_block are all 0.
                    232: **
                    233: **
                    234: **** Segment merging ****
                     235: ** To amortize update costs, segments are grouped into levels and
                     236: ** merged in batches.  Each increase in level represents exponentially
                    237: ** more documents.
                    238: **
                    239: ** New documents (actually, document updates) are tokenized and
                    240: ** written individually (using LeafWriter) to a level 0 segment, with
                    241: ** incrementing idx.  When idx reaches MERGE_COUNT (default 16), all
                    242: ** level 0 segments are merged into a single level 1 segment.  Level 1
                    243: ** is populated like level 0, and eventually MERGE_COUNT level 1
                    244: ** segments are merged to a single level 2 segment (representing
                    245: ** MERGE_COUNT^2 updates), and so on.
                    246: **
                    247: ** A segment merge traverses all segments at a given level in
                    248: ** parallel, performing a straightforward sorted merge.  Since segment
                     249: ** leaf nodes are written into the %_segments table in order, this
                    250: ** merge traverses the underlying sqlite disk structures efficiently.
                    251: ** After the merge, all segment blocks from the merged level are
                    252: ** deleted.
                    253: **
                    254: ** MERGE_COUNT controls how often we merge segments.  16 seems to be
                    255: ** somewhat of a sweet spot for insertion performance.  32 and 64 show
                    256: ** very similar performance numbers to 16 on insertion, though they're
                    257: ** a tiny bit slower (perhaps due to more overhead in merge-time
                    258: ** sorting).  8 is about 20% slower than 16, 4 about 50% slower than
                    259: ** 16, 2 about 66% slower than 16.
                    260: **
                    261: ** At query time, high MERGE_COUNT increases the number of segments
                    262: ** which need to be scanned and merged.  For instance, with 100k docs
                    263: ** inserted:
                    264: **
                    265: **    MERGE_COUNT   segments
                    266: **       16           25
                    267: **        8           12
                    268: **        4           10
                    269: **        2            6
                    270: **
                    271: ** This appears to have only a moderate impact on queries for very
                    272: ** frequent terms (which are somewhat dominated by segment merge
                    273: ** costs), and infrequent and non-existent terms still seem to be fast
                    274: ** even with many segments.
                    275: **
                    276: ** TODO(shess) That said, it would be nice to have a better query-side
                    277: ** argument for MERGE_COUNT of 16.  Also, it is possible/likely that
                    278: ** optimizations to things like doclist merging will swing the sweet
                    279: ** spot around.
                    280: **
                    281: **
                    282: **
                    283: **** Handling of deletions and updates ****
                    284: ** Since we're using a segmented structure, with no docid-oriented
                    285: ** index into the term index, we clearly cannot simply update the term
                    286: ** index when a document is deleted or updated.  For deletions, we
                    287: ** write an empty doclist (varint(docid) varint(POS_END)), for updates
                    288: ** we simply write the new doclist.  Segment merges overwrite older
                    289: ** data for a particular docid with newer data, so deletes or updates
                    290: ** will eventually overtake the earlier data and knock it out.  The
                    291: ** query logic likewise merges doclists so that newer data knocks out
                    292: ** older data.
                    293: **
                    294: ** TODO(shess) Provide a VACUUM type operation to clear out all
                    295: ** deletions and duplications.  This would basically be a forced merge
                    296: ** into a single segment.
                    297: */
                    298: 
                    299: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
                    300: 
                    301: #if defined(SQLITE_ENABLE_FTS2) && !defined(SQLITE_CORE)
                    302: # define SQLITE_CORE 1
                    303: #endif
                    304: 
                    305: #include <assert.h>
                    306: #include <stdlib.h>
                    307: #include <stdio.h>
                    308: #include <string.h>
                    309: #include "fts2.h"
                    310: #include "fts2_hash.h"
                    311: #include "fts2_tokenizer.h"
                    312: #include "sqlite3.h"
                    313: #include "sqlite3ext.h"
                    314: SQLITE_EXTENSION_INIT1
                    315: 
                    316: 
                    317: /* TODO(shess) MAN, this thing needs some refactoring.  At minimum, it
                    318: ** would be nice to order the file better, perhaps something along the
                    319: ** lines of:
                    320: **
                    321: **  - utility functions
                    322: **  - table setup functions
                    323: **  - table update functions
                    324: **  - table query functions
                    325: **
                    326: ** Put the query functions last because they're likely to reference
                    327: ** typedefs or functions from the table update section.
                    328: */
                    329: 
                    330: #if 0
                    331: # define TRACE(A)  printf A; fflush(stdout)
                    332: #else
                    333: # define TRACE(A)
                    334: #endif
                    335: 
                    336: /* It is not safe to call isspace(), tolower(), or isalnum() on
                    337: ** hi-bit-set characters.  This is the same solution used in the
                    338: ** tokenizer.
                    339: */
                    340: /* TODO(shess) The snippet-generation code should be using the
                    341: ** tokenizer-generated tokens rather than doing its own local
                    342: ** tokenization.
                    343: */
                    344: /* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
                    345: static int safe_isspace(char c){
                    346:   return c==' ' || c=='\t' || c=='\n' || c=='\r' || c=='\v' || c=='\f';
                    347: }
                    348: static int safe_tolower(char c){
                    349:   return (c>='A' && c<='Z') ? (c - 'A' + 'a') : c;
                    350: }
                    351: static int safe_isalnum(char c){
                    352:   return (c>='0' && c<='9') || (c>='A' && c<='Z') || (c>='a' && c<='z');
                    353: }
                    354: 
/* Which optional per-docid payloads a doclist carries.  See the
** "Document lists" section of the header comment for the encoding.
** Order matters: each successive type is a superset of the previous.
*/
typedef enum DocListType {
  DL_DOCIDS,              /* docids only */
  DL_POSITIONS,           /* docids + positions */
  DL_POSITIONS_OFFSETS    /* docids + positions + offsets */
} DocListType;
                    360: 
                    361: /*
                    362: ** By default, only positions and not offsets are stored in the doclists.
                    363: ** To change this so that offsets are stored too, compile with
                    364: **
                    365: **          -DDL_DEFAULT=DL_POSITIONS_OFFSETS
                    366: **
                    367: ** If DL_DEFAULT is set to DL_DOCIDS, your table can only be inserted
                    368: ** into (no deletes or updates).
                    369: */
                    370: #ifndef DL_DEFAULT
                    371: # define DL_DEFAULT DL_POSITIONS
                    372: #endif
                    373: 
/* Sentinel values used inside position lists.  Real token positions
** are stored with POS_BASE added (see the header comment), which
** keeps 0 and 1 free to mark end-of-list and start-of-new-column.
*/
enum {
  POS_END = 0,        /* end of this position list */
  POS_COLUMN,         /* followed by new column number */
  POS_BASE
};
                    379: 
                    380: /* MERGE_COUNT controls how often we merge segments (see comment at
                    381: ** top of file).
                    382: */
                    383: #define MERGE_COUNT 16
                    384: 
                    385: /* utility functions */
                    386: 
                    387: /* CLEAR() and SCRAMBLE() abstract memset() on a pointer to a single
                    388: ** record to prevent errors of the form:
                    389: **
                    390: ** my_function(SomeType *b){
                    391: **   memset(b, '\0', sizeof(b));  // sizeof(b)!=sizeof(*b)
                    392: ** }
                    393: */
                    394: /* TODO(shess) Obvious candidates for a header file. */
                    395: #define CLEAR(b) memset(b, '\0', sizeof(*(b)))
                    396: 
                    397: #ifndef NDEBUG
                    398: #  define SCRAMBLE(b) memset(b, 0x55, sizeof(*(b)))
                    399: #else
                    400: #  define SCRAMBLE(b)
                    401: #endif
                    402: 
                    403: /* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */
                    404: #define VARINT_MAX 10
                    405: 
                    406: /* Write a 64-bit variable-length integer to memory starting at p[0].
                    407:  * The length of data written will be between 1 and VARINT_MAX bytes.
                    408:  * The number of bytes written is returned. */
                    409: static int putVarint(char *p, sqlite_int64 v){
                    410:   unsigned char *q = (unsigned char *) p;
                    411:   sqlite_uint64 vu = v;
                    412:   do{
                    413:     *q++ = (unsigned char) ((vu & 0x7f) | 0x80);
                    414:     vu >>= 7;
                    415:   }while( vu!=0 );
                    416:   q[-1] &= 0x7f;  /* turn off high bit in final byte */
                    417:   assert( q - (unsigned char *)p <= VARINT_MAX );
                    418:   return (int) (q - (unsigned char *)p);
                    419: }
                    420: 
                    421: /* Read a 64-bit variable-length integer from memory starting at p[0].
                    422:  * Return the number of bytes read, or 0 on error.
                    423:  * The value is stored in *v. */
static int getVarint(const char *p, sqlite_int64 *v){
  const unsigned char *q = (const unsigned char *) p;
  sqlite_uint64 x = 0, y = 1;  /* x = value so far; y = place value 2^(7*i) */
  /* Bytes with the high bit set are continuation bytes; accumulate
  ** their low 7 bits little-endian. */
  while( (*q & 0x80) == 0x80 ){
    x += y * (*q++ & 0x7f);
    y <<= 7;
    if( q - (unsigned char *)p >= VARINT_MAX ){  /* bad data */
      assert( 0 );
      return 0;
    }
  }
  x += y * (*q++);  /* final byte has the high bit clear */
  *v = (sqlite_int64) x;
  return (int) (q - (unsigned char *)p);
}
                    439: 
                    440: static int getVarint32(const char *p, int *pi){
                    441:  sqlite_int64 i;
                    442:  int ret = getVarint(p, &i);
                    443:  *pi = (int) i;
                    444:  assert( *pi==i );
                    445:  return ret;
                    446: }
                    447: 
                    448: /*******************************************************************/
                    449: /* DataBuffer is used to collect data into a buffer in piecemeal
                    450: ** fashion.  It implements the usual distinction between amount of
                    451: ** data currently stored (nData) and buffer capacity (nCapacity).
                    452: **
                    453: ** dataBufferInit - create a buffer with given initial capacity.
                    454: ** dataBufferReset - forget buffer's data, retaining capacity.
                    455: ** dataBufferDestroy - free buffer's data.
                    456: ** dataBufferSwap - swap contents of two buffers.
                    457: ** dataBufferExpand - expand capacity without adding data.
                    458: ** dataBufferAppend - append data.
                    459: ** dataBufferAppend2 - append two pieces of data at once.
                    460: ** dataBufferReplace - replace buffer's data.
                    461: */
typedef struct DataBuffer {
  char *pData;          /* Pointer to malloc'ed buffer. */
  int nCapacity;        /* Size of pData buffer. */
  int nData;            /* End of data loaded into pData; nData<=nCapacity. */
} DataBuffer;
                    467: 
                    468: static void dataBufferInit(DataBuffer *pBuffer, int nCapacity){
                    469:   assert( nCapacity>=0 );
                    470:   pBuffer->nData = 0;
                    471:   pBuffer->nCapacity = nCapacity;
                    472:   pBuffer->pData = nCapacity==0 ? NULL : sqlite3_malloc(nCapacity);
                    473: }
/* Discard the buffer's contents, retaining the allocation for reuse. */
static void dataBufferReset(DataBuffer *pBuffer){
  pBuffer->nData = 0;
}
                    477: static void dataBufferDestroy(DataBuffer *pBuffer){
                    478:   if( pBuffer->pData!=NULL ) sqlite3_free(pBuffer->pData);
                    479:   SCRAMBLE(pBuffer);
                    480: }
                    481: static void dataBufferSwap(DataBuffer *pBuffer1, DataBuffer *pBuffer2){
                    482:   DataBuffer tmp = *pBuffer1;
                    483:   *pBuffer1 = *pBuffer2;
                    484:   *pBuffer2 = tmp;
                    485: }
/* Ensure the buffer has room for nAddCapacity more bytes beyond the
** data already present, reallocating if necessary.  nData is not
** changed.  NOTE(review): the sqlite3_realloc() result is not checked
** for NULL; this matches the rest of this file's OOM handling --
** confirm before tightening.
*/
static void dataBufferExpand(DataBuffer *pBuffer, int nAddCapacity){
  assert( nAddCapacity>0 );
  /* TODO(shess) Consider expanding more aggressively.  Note that the
  ** underlying malloc implementation may take care of such things for
  ** us already.
  */
  if( pBuffer->nData+nAddCapacity>pBuffer->nCapacity ){
    pBuffer->nCapacity = pBuffer->nData+nAddCapacity;
    pBuffer->pData = sqlite3_realloc(pBuffer->pData, pBuffer->nCapacity);
  }
}
                    497: static void dataBufferAppend(DataBuffer *pBuffer,
                    498:                              const char *pSource, int nSource){
                    499:   assert( nSource>0 && pSource!=NULL );
                    500:   dataBufferExpand(pBuffer, nSource);
                    501:   memcpy(pBuffer->pData+pBuffer->nData, pSource, nSource);
                    502:   pBuffer->nData += nSource;
                    503: }
                    504: static void dataBufferAppend2(DataBuffer *pBuffer,
                    505:                               const char *pSource1, int nSource1,
                    506:                               const char *pSource2, int nSource2){
                    507:   assert( nSource1>0 && pSource1!=NULL );
                    508:   assert( nSource2>0 && pSource2!=NULL );
                    509:   dataBufferExpand(pBuffer, nSource1+nSource2);
                    510:   memcpy(pBuffer->pData+pBuffer->nData, pSource1, nSource1);
                    511:   memcpy(pBuffer->pData+pBuffer->nData+nSource1, pSource2, nSource2);
                    512:   pBuffer->nData += nSource1+nSource2;
                    513: }
/* Discard the buffer's current contents and replace them with the
** nSource bytes at pSource.  nSource must be >0 because
** dataBufferAppend() asserts a non-empty source.
*/
static void dataBufferReplace(DataBuffer *pBuffer,
                              const char *pSource, int nSource){
  dataBufferReset(pBuffer);
  dataBufferAppend(pBuffer, pSource, nSource);
}
                    519: 
/* StringBuffer is a null-terminated version of DataBuffer.  The
** invariant is b.nData>0 with b.pData[b.nData-1]=='\0', so the string
** length is always b.nData-1.
*/
typedef struct StringBuffer {
  DataBuffer b;            /* Includes null terminator. */
} StringBuffer;
                    524: 
/* Initialize sb to hold the empty string (a single '\0' byte). */
static void initStringBuffer(StringBuffer *sb){
  dataBufferInit(&sb->b, 100);
  dataBufferReplace(&sb->b, "", 1);
}
/* Length of the string, excluding the trailing '\0'. */
static int stringBufferLength(StringBuffer *sb){
  return sb->b.nData-1;
}
/* Pointer to the null-terminated string data. */
static char *stringBufferData(StringBuffer *sb){
  return sb->b.pData;
}
/* Free the string's underlying buffer. */
static void stringBufferDestroy(StringBuffer *sb){
  dataBufferDestroy(&sb->b);
}
                    538: 
/* Append the nFrom bytes at zFrom to sb, keeping the result
** null-terminated.  The existing terminator is dropped by backing
** nData up one byte, then the new text plus a fresh '\0' are appended
** in a single call.
*/
static void nappend(StringBuffer *sb, const char *zFrom, int nFrom){
  assert( sb->b.nData>0 );
  if( nFrom>0 ){
    sb->b.nData--;
    dataBufferAppend2(&sb->b, zFrom, nFrom, "", 1);
  }
}
/* Append the null-terminated string zFrom to sb. */
static void append(StringBuffer *sb, const char *zFrom){
  nappend(sb, zFrom, strlen(zFrom));
}
                    549: 
                    550: /* Append a list of strings separated by commas. */
                    551: static void appendList(StringBuffer *sb, int nString, char **azString){
                    552:   int i;
                    553:   for(i=0; i<nString; ++i){
                    554:     if( i>0 ) append(sb, ", ");
                    555:     append(sb, azString[i]);
                    556:   }
                    557: }
                    558: 
                    559: static int endsInWhiteSpace(StringBuffer *p){
                    560:   return stringBufferLength(p)>0 &&
                    561:     safe_isspace(stringBufferData(p)[stringBufferLength(p)-1]);
                    562: }
                    563: 
                    564: /* If the StringBuffer ends in something other than white space, add a
                    565: ** single space character to the end.
                    566: */
                    567: static void appendWhiteSpace(StringBuffer *p){
                    568:   if( stringBufferLength(p)==0 ) return;
                    569:   if( !endsInWhiteSpace(p) ) append(p, " ");
                    570: }
                    571: 
/* Remove white space from the end of the StringBuffer */
static void trimWhiteSpace(StringBuffer *p){
  while( endsInWhiteSpace(p) ){
    /* Shrink by one byte and write a new '\0' terminator over the
    ** trailing whitespace character (nData counts the terminator).
    */
    p->b.pData[--p->b.nData-1] = '\0';
  }
}
                    578: 
                    579: /*******************************************************************/
                    580: /* DLReader is used to read document elements from a doclist.  The
                    581: ** current docid is cached, so dlrDocid() is fast.  DLReader does not
                    582: ** own the doclist buffer.
                    583: **
                    584: ** dlrAtEnd - true if there's no more data to read.
                    585: ** dlrDocid - docid of current document.
                    586: ** dlrDocData - doclist data for current document (including docid).
                    587: ** dlrDocDataBytes - length of same.
                    588: ** dlrAllDataBytes - length of all remaining data.
                    589: ** dlrPosData - position data for current document.
                    590: ** dlrPosDataLen - length of pos data for current document (incl POS_END).
                    591: ** dlrStep - step to current document.
** dlrInit - initialize for a doclist of the given type over the given data.
                    593: ** dlrDestroy - clean up.
                    594: **
                    595: ** Expected usage is something like:
                    596: **
                    597: **   DLReader reader;
                    598: **   dlrInit(&reader, pData, nData);
                    599: **   while( !dlrAtEnd(&reader) ){
                    600: **     // calls to dlrDocid() and kin.
                    601: **     dlrStep(&reader);
                    602: **   }
                    603: **   dlrDestroy(&reader);
                    604: */
typedef struct DLReader {
  DocListType iType;       /* doclist flavor (docids/positions/offsets) */
  const char *pData;       /* unread data; buffer is not owned */
  int nData;               /* bytes remaining at pData; 0 at end */

  sqlite_int64 iDocid;     /* docid of the current element */
  int nElement;            /* byte length of the current element */
} DLReader;
                    613: 
/* True when the entire doclist has been consumed. */
static int dlrAtEnd(DLReader *pReader){
  assert( pReader->nData>=0 );
  return pReader->nData==0;
}
/* Docid of the current element. */
static sqlite_int64 dlrDocid(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->iDocid;
}
/* Raw data of the current element, including its docid varint. */
static const char *dlrDocData(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->pData;
}
/* Byte length of the current element. */
static int dlrDocDataBytes(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->nElement;
}
/* Byte length of all remaining data, current element included. */
static int dlrAllDataBytes(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->nData;
}
                    634: /* TODO(shess) Consider adding a field to track iDocid varint length
                    635: ** to make these two functions faster.  This might matter (a tiny bit)
                    636: ** for queries.
                    637: */
/* Position data for the current document: skip past the docid varint. */
static const char *dlrPosData(DLReader *pReader){
  sqlite_int64 iDummy;
  int n = getVarint(pReader->pData, &iDummy);
  assert( !dlrAtEnd(pReader) );
  return pReader->pData+n;
}
/* Length of the current position data, including its POS_END marker. */
static int dlrPosDataLen(DLReader *pReader){
  sqlite_int64 iDummy;
  int n = getVarint(pReader->pData, &iDummy);
  assert( !dlrAtEnd(pReader) );
  return pReader->nElement-n;
}
/* Advance to the next element: skip the current one, then decode the
** next docid delta and scan its position list (if any) to determine
** nElement.  Leaves the reader at end-of-data if nothing remains.
*/
static void dlrStep(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );

  /* Skip past current doclist element. */
  assert( pReader->nElement<=pReader->nData );
  pReader->pData += pReader->nElement;
  pReader->nData -= pReader->nElement;

  /* If there is more data, read the next doclist element. */
  if( pReader->nData!=0 ){
    sqlite_int64 iDocidDelta;
    int iDummy, n = getVarint(pReader->pData, &iDocidDelta);
    /* Docids are delta-encoded against the previous element. */
    pReader->iDocid += iDocidDelta;
    if( pReader->iType>=DL_POSITIONS ){
      assert( n<pReader->nData );
      /* Scan forward to the POS_END marker to find the element's end. */
      while( 1 ){
        n += getVarint32(pReader->pData+n, &iDummy);
        assert( n<=pReader->nData );
        if( iDummy==POS_END ) break;
        if( iDummy==POS_COLUMN ){
          /* A column marker is followed by the column number. */
          n += getVarint32(pReader->pData+n, &iDummy);
          assert( n<pReader->nData );
        }else if( pReader->iType==DL_POSITIONS_OFFSETS ){
          /* Offset doclists carry start/end offset deltas per position. */
          n += getVarint32(pReader->pData+n, &iDummy);
          n += getVarint32(pReader->pData+n, &iDummy);
          assert( n<pReader->nData );
        }
      }
    }
    pReader->nElement = n;
    assert( pReader->nElement<=pReader->nData );
  }
}
                    683: static void dlrInit(DLReader *pReader, DocListType iType,
                    684:                     const char *pData, int nData){
                    685:   assert( pData!=NULL && nData!=0 );
                    686:   pReader->iType = iType;
                    687:   pReader->pData = pData;
                    688:   pReader->nData = nData;
                    689:   pReader->nElement = 0;
                    690:   pReader->iDocid = 0;
                    691: 
                    692:   /* Load the first element's data.  There must be a first element. */
                    693:   dlrStep(pReader);
                    694: }
/* Invalidate the reader.  SCRAMBLE presumably poisons the structure in
** debug builds -- see its definition earlier in the file.
*/
static void dlrDestroy(DLReader *pReader){
  SCRAMBLE(pReader);
}
                    698: 
                    699: #ifndef NDEBUG
                    700: /* Verify that the doclist can be validly decoded.  Also returns the
                    701: ** last docid found because it is convenient in other assertions for
                    702: ** DLWriter.
                    703: */
static void docListValidate(DocListType iType, const char *pData, int nData,
                            sqlite_int64 *pLastDocid){
  sqlite_int64 iPrevDocid = 0;  /* accumulates delta-encoded docids */
  assert( nData>0 );
  assert( pData!=0 );
  assert( pData+nData>pData );  /* guard against pointer wraparound */
  while( nData!=0 ){
    sqlite_int64 iDocidDelta;
    int n = getVarint(pData, &iDocidDelta);
    iPrevDocid += iDocidDelta;
    if( iType>DL_DOCIDS ){
      int iDummy;
      /* Walk the position list through its POS_END marker. */
      while( 1 ){
        n += getVarint32(pData+n, &iDummy);
        if( iDummy==POS_END ) break;
        if( iDummy==POS_COLUMN ){
          n += getVarint32(pData+n, &iDummy);
        }else if( iType>DL_POSITIONS ){
          /* Offsets variant: start and end offset deltas follow. */
          n += getVarint32(pData+n, &iDummy);
          n += getVarint32(pData+n, &iDummy);
        }
        assert( n<=nData );
      }
    }
    assert( n<=nData );
    pData += n;
    nData -= n;
  }
  if( pLastDocid ) *pLastDocid = iPrevDocid;
}
                    734: #define ASSERT_VALID_DOCLIST(i, p, n, o) docListValidate(i, p, n, o)
                    735: #else
                    736: #define ASSERT_VALID_DOCLIST(i, p, n, o) assert( 1 )
                    737: #endif
                    738: 
                    739: /*******************************************************************/
                    740: /* DLWriter is used to write doclist data to a DataBuffer.  DLWriter
                    741: ** always appends to the buffer and does not own it.
                    742: **
** dlwInit - initialize to write a given type doclist to a buffer.
                    744: ** dlwDestroy - clear the writer's memory.  Does not free buffer.
                    745: ** dlwAppend - append raw doclist data to buffer.
                    746: ** dlwCopy - copy next doclist from reader to writer.
                    747: ** dlwAdd - construct doclist element and append to buffer.
                    748: **    Only apply dlwAdd() to DL_DOCIDS doclists (else use PLWriter).
                    749: */
typedef struct DLWriter {
  DocListType iType;         /* type of doclist being written */
  DataBuffer *b;             /* output buffer; not owned by the writer */
  sqlite_int64 iPrevDocid;   /* last docid written, for delta-encoding */
#ifndef NDEBUG
  int has_iPrevDocid;        /* true once any docid has been written */
#endif
} DLWriter;
                    758: 
                    759: static void dlwInit(DLWriter *pWriter, DocListType iType, DataBuffer *b){
                    760:   pWriter->b = b;
                    761:   pWriter->iType = iType;
                    762:   pWriter->iPrevDocid = 0;
                    763: #ifndef NDEBUG
                    764:   pWriter->has_iPrevDocid = 0;
                    765: #endif
                    766: }
/* Invalidate the writer; does not free the underlying buffer. */
static void dlwDestroy(DLWriter *pWriter){
  SCRAMBLE(pWriter);
}
                    770: /* iFirstDocid is the first docid in the doclist in pData.  It is
                    771: ** needed because pData may point within a larger doclist, in which
                    772: ** case the first item would be delta-encoded.
                    773: **
                    774: ** iLastDocid is the final docid in the doclist in pData.  It is
                    775: ** needed to create the new iPrevDocid for future delta-encoding.  The
                    776: ** code could decode the passed doclist to recreate iLastDocid, but
                    777: ** the only current user (docListMerge) already has decoded this
                    778: ** information.
                    779: */
                    780: /* TODO(shess) This has become just a helper for docListMerge.
                    781: ** Consider a refactor to make this cleaner.
                    782: */
/* Append the doclist in pData/nData to pWriter's buffer, recoding the
** first docid as a delta from pWriter->iPrevDocid (see the comment
** above for why iFirstDocid/iLastDocid are passed in).
*/
static void dlwAppend(DLWriter *pWriter,
                      const char *pData, int nData,
                      sqlite_int64 iFirstDocid, sqlite_int64 iLastDocid){
  sqlite_int64 iDocid = 0;
  char c[VARINT_MAX];
  int nFirstOld, nFirstNew;     /* Old and new varint len of first docid. */
#ifndef NDEBUG
  sqlite_int64 iLastDocidDelta;
#endif

  /* Recode the initial docid as delta from iPrevDocid. */
  nFirstOld = getVarint(pData, &iDocid);
  assert( nFirstOld<nData || (nFirstOld==nData && pWriter->iType==DL_DOCIDS) );
  nFirstNew = putVarint(c, iFirstDocid-pWriter->iPrevDocid);

  /* Verify that the incoming doclist is valid AND that it ends with
  ** the expected docid.  This is essential because we'll trust this
  ** docid in future delta-encoding.
  */
  ASSERT_VALID_DOCLIST(pWriter->iType, pData, nData, &iLastDocidDelta);
  assert( iLastDocid==iFirstDocid-iDocid+iLastDocidDelta );

  /* Append recoded initial docid and everything else.  Rest of docids
  ** should have been delta-encoded from previous initial docid.
  */
  if( nFirstOld<nData ){
    dataBufferAppend2(pWriter->b, c, nFirstNew,
                      pData+nFirstOld, nData-nFirstOld);
  }else{
    dataBufferAppend(pWriter->b, c, nFirstNew);
  }
  pWriter->iPrevDocid = iLastDocid;
}
/* Copy the current element of pReader to pWriter.  A single element's
** first and last docids are the same value.
*/
static void dlwCopy(DLWriter *pWriter, DLReader *pReader){
  dlwAppend(pWriter, dlrDocData(pReader), dlrDocDataBytes(pReader),
            dlrDocid(pReader), dlrDocid(pReader));
}
/* Append a bare docid to a DL_DOCIDS doclist.  Docids must be added in
** ascending order so the delta-encoding stays positive.
*/
static void dlwAdd(DLWriter *pWriter, sqlite_int64 iDocid){
  char c[VARINT_MAX];
  int n = putVarint(c, iDocid-pWriter->iPrevDocid);

  /* Docids must ascend. */
  assert( !pWriter->has_iPrevDocid || iDocid>pWriter->iPrevDocid );
  assert( pWriter->iType==DL_DOCIDS );

  dataBufferAppend(pWriter->b, c, n);
  pWriter->iPrevDocid = iDocid;
#ifndef NDEBUG
  pWriter->has_iPrevDocid = 1;
#endif
}
                    834: 
                    835: /*******************************************************************/
                    836: /* PLReader is used to read data from a document's position list.  As
                    837: ** the caller steps through the list, data is cached so that varints
                    838: ** only need to be decoded once.
                    839: **
                    840: ** plrInit, plrDestroy - create/destroy a reader.
                    841: ** plrColumn, plrPosition, plrStartOffset, plrEndOffset - accessors
                    842: ** plrAtEnd - at end of stream, only call plrDestroy once true.
                    843: ** plrStep - step to the next element.
                    844: */
typedef struct PLReader {
  /* These refer to the next position's data.  nData will reach 0 when
  ** reading the last position, so plrStep() signals EOF by setting
  ** pData to NULL.
  */
  const char *pData;
  int nData;

  DocListType iType;   /* doclist type; controls offset decoding */
  int iColumn;         /* the last column read */
  int iPosition;       /* the last position read */
  int iStartOffset;    /* the last start offset read */
  int iEndOffset;      /* the last end offset read */
} PLReader;
                    859: 
/* True once the position list has been fully consumed. */
static int plrAtEnd(PLReader *pReader){
  return pReader->pData==NULL;
}
/* Column of the current position. */
static int plrColumn(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iColumn;
}
/* Token position within the current column. */
static int plrPosition(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iPosition;
}
/* Byte offset where the token starts (offset doclists only). */
static int plrStartOffset(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iStartOffset;
}
/* Byte offset where the token ends (offset doclists only). */
static int plrEndOffset(PLReader *pReader){
  assert( !plrAtEnd(pReader) );
  return pReader->iEndOffset;
}
/* Decode the next entry of the position list, updating the cached
** column/position/offset values.  Sets pData to NULL at end of list.
*/
static void plrStep(PLReader *pReader){
  int i, n;

  assert( !plrAtEnd(pReader) );

  /* nData hit 0 while reading the final position last time around. */
  if( pReader->nData==0 ){
    pReader->pData = NULL;
    return;
  }

  n = getVarint32(pReader->pData, &i);
  if( i==POS_COLUMN ){
    /* Column change: read the new column, reset per-column state. */
    n += getVarint32(pReader->pData+n, &pReader->iColumn);
    pReader->iPosition = 0;
    pReader->iStartOffset = 0;
    n += getVarint32(pReader->pData+n, &i);
  }
  /* Should never see adjacent column changes. */
  assert( i!=POS_COLUMN );

  if( i==POS_END ){
    pReader->nData = 0;
    pReader->pData = NULL;
    return;
  }

  /* Positions are delta-encoded, biased by POS_BASE. */
  pReader->iPosition += i-POS_BASE;
  if( pReader->iType==DL_POSITIONS_OFFSETS ){
    /* Start offset is delta-encoded; end offset is relative to start. */
    n += getVarint32(pReader->pData+n, &i);
    pReader->iStartOffset += i;
    n += getVarint32(pReader->pData+n, &i);
    pReader->iEndOffset = pReader->iStartOffset+i;
  }
  assert( n<=pReader->nData );
  pReader->pData += n;
  pReader->nData -= n;
}
                    916: 
                    917: static void plrInit(PLReader *pReader, DLReader *pDLReader){
                    918:   pReader->pData = dlrPosData(pDLReader);
                    919:   pReader->nData = dlrPosDataLen(pDLReader);
                    920:   pReader->iType = pDLReader->iType;
                    921:   pReader->iColumn = 0;
                    922:   pReader->iPosition = 0;
                    923:   pReader->iStartOffset = 0;
                    924:   pReader->iEndOffset = 0;
                    925:   plrStep(pReader);
                    926: }
/* Invalidate the reader; the underlying data is not owned or freed. */
static void plrDestroy(PLReader *pReader){
  SCRAMBLE(pReader);
}
                    930: 
                    931: /*******************************************************************/
                    932: /* PLWriter is used in constructing a document's position list.  As a
                    933: ** convenience, if iType is DL_DOCIDS, PLWriter becomes a no-op.
                    934: ** PLWriter writes to the associated DLWriter's buffer.
                    935: **
                    936: ** plwInit - init for writing a document's poslist.
                    937: ** plwDestroy - clear a writer.
                    938: ** plwAdd - append position and offset information.
                    939: ** plwCopy - copy next position's data from reader to writer.
                    940: ** plwTerminate - add any necessary doclist terminator.
                    941: **
                    942: ** Calling plwAdd() after plwTerminate() may result in a corrupt
                    943: ** doclist.
                    944: */
                    945: /* TODO(shess) Until we've written the second item, we can cache the
                    946: ** first item's information.  Then we'd have three states:
                    947: **
                    948: ** - initialized with docid, no positions.
                    949: ** - docid and one position.
                    950: ** - docid and multiple positions.
                    951: **
                    952: ** Only the last state needs to actually write to dlw->b, which would
                    953: ** be an improvement in the DLCollector case.
                    954: */
typedef struct PLWriter {
  DLWriter *dlw;  /* doclist writer whose buffer receives the output */

  int iColumn;    /* the last column written */
  int iPos;       /* the last position written */
  int iOffset;    /* the last start offset written */
} PLWriter;
                    962: 
                    963: /* TODO(shess) In the case where the parent is reading these values
                    964: ** from a PLReader, we could optimize to a copy if that PLReader has
                    965: ** the same type as pWriter.
                    966: */
/* Append one position (and, for offset doclists, its start/end
** offsets) for column iColumn.  Values are delta-encoded against the
** previous entry in the same column.  No-op for DL_DOCIDS doclists.
*/
static void plwAdd(PLWriter *pWriter, int iColumn, int iPos,
                   int iStartOffset, int iEndOffset){
  /* Worst-case space for POS_COLUMN, iColumn, iPosDelta,
  ** iStartOffsetDelta, and iEndOffsetDelta.
  */
  char c[5*VARINT_MAX];
  int n = 0;

  /* Ban plwAdd() after plwTerminate(). */
  assert( pWriter->iPos!=-1 );

  if( pWriter->dlw->iType==DL_DOCIDS ) return;

  if( iColumn!=pWriter->iColumn ){
    /* Emit a column-change marker and reset per-column deltas. */
    n += putVarint(c+n, POS_COLUMN);
    n += putVarint(c+n, iColumn);
    pWriter->iColumn = iColumn;
    pWriter->iPos = 0;
    pWriter->iOffset = 0;
  }
  assert( iPos>=pWriter->iPos );
  /* Positions are delta-encoded, biased by POS_BASE. */
  n += putVarint(c+n, POS_BASE+(iPos-pWriter->iPos));
  pWriter->iPos = iPos;
  if( pWriter->dlw->iType==DL_POSITIONS_OFFSETS ){
    assert( iStartOffset>=pWriter->iOffset );
    n += putVarint(c+n, iStartOffset-pWriter->iOffset);
    pWriter->iOffset = iStartOffset;
    assert( iEndOffset>=iStartOffset );
    /* End offset is stored relative to its own start offset. */
    n += putVarint(c+n, iEndOffset-iStartOffset);
  }
  dataBufferAppend(pWriter->dlw->b, c, n);
}
/* Copy the current position of pReader into pWriter. */
static void plwCopy(PLWriter *pWriter, PLReader *pReader){
  plwAdd(pWriter, plrColumn(pReader), plrPosition(pReader),
         plrStartOffset(pReader), plrEndOffset(pReader));
}
/* Begin a new document's poslist: write the (delta-encoded) docid to
** the doclist writer's buffer and reset the per-document state.
*/
static void plwInit(PLWriter *pWriter, DLWriter *dlw, sqlite_int64 iDocid){
  char c[VARINT_MAX];
  int n;

  pWriter->dlw = dlw;

  /* Docids must ascend. */
  assert( !pWriter->dlw->has_iPrevDocid || iDocid>pWriter->dlw->iPrevDocid );
  n = putVarint(c, iDocid-pWriter->dlw->iPrevDocid);
  dataBufferAppend(pWriter->dlw->b, c, n);
  pWriter->dlw->iPrevDocid = iDocid;
#ifndef NDEBUG
  pWriter->dlw->has_iPrevDocid = 1;
#endif

  pWriter->iColumn = 0;
  pWriter->iPos = 0;
  pWriter->iOffset = 0;
}
                   1022: /* TODO(shess) Should plwDestroy() also terminate the doclist?  But
                   1023: ** then plwDestroy() would no longer be just a destructor, it would
                   1024: ** also be doing work, which isn't consistent with the overall idiom.
                   1025: ** Another option would be for plwAdd() to always append any necessary
                   1026: ** terminator, so that the output is always correct.  But that would
                   1027: ** add incremental work to the common case with the only benefit being
                   1028: ** API elegance.  Punt for now.
                   1029: */
/* Write the POS_END terminator for the current document's poslist.
** Docid-only doclists carry no poslist and need no terminator.
*/
static void plwTerminate(PLWriter *pWriter){
  if( pWriter->dlw->iType>DL_DOCIDS ){
    char c[VARINT_MAX];
    int n = putVarint(c, POS_END);
    dataBufferAppend(pWriter->dlw->b, c, n);
  }
#ifndef NDEBUG
  /* Mark as terminated for assert in plwAdd(). */
  pWriter->iPos = -1;
#endif
}
/* Invalidate the writer; the DLWriter and its buffer are untouched. */
static void plwDestroy(PLWriter *pWriter){
  SCRAMBLE(pWriter);
}
                   1044: 
                   1045: /*******************************************************************/
                   1046: /* DLCollector wraps PLWriter and DLWriter to provide a
                   1047: ** dynamically-allocated doclist area to use during tokenization.
                   1048: **
                   1049: ** dlcNew - malloc up and initialize a collector.
                   1050: ** dlcDelete - destroy a collector and all contained items.
                   1051: ** dlcAddPos - append position and offset information.
                   1052: ** dlcAddDoclist - add the collected doclist to the given buffer.
                   1053: ** dlcNext - terminate the current document and open another.
                   1054: */
typedef struct DLCollector {
  DataBuffer b;    /* accumulates the doclist under construction */
  DLWriter dlw;    /* doclist writer targeting b */
  PLWriter plw;    /* poslist writer layered on dlw */
} DLCollector;
                   1060: 
                   1061: /* TODO(shess) This could also be done by calling plwTerminate() and
                   1062: ** dataBufferAppend().  I tried that, expecting nominal performance
                   1063: ** differences, but it seemed to pretty reliably be worth 1% to code
                   1064: ** it this way.  I suspect it is the incremental malloc overhead (some
                   1065: ** percentage of the plwTerminate() calls will cause a realloc), so
                   1066: ** this might be worth revisiting if the DataBuffer implementation
                   1067: ** changes.
                   1068: */
                   1069: static void dlcAddDoclist(DLCollector *pCollector, DataBuffer *b){
                   1070:   if( pCollector->dlw.iType>DL_DOCIDS ){
                   1071:     char c[VARINT_MAX];
                   1072:     int n = putVarint(c, POS_END);
                   1073:     dataBufferAppend2(b, pCollector->b.pData, pCollector->b.nData, c, n);
                   1074:   }else{
                   1075:     dataBufferAppend(b, pCollector->b.pData, pCollector->b.nData);
                   1076:   }
                   1077: }
/* Terminate the position list for the current document and open a new
** one for iDocid.  The PLWriter is torn down and re-initialized so the
** new document's data follows the old in the collector's buffer.
*/
static void dlcNext(DLCollector *pCollector, sqlite_int64 iDocid){
  plwTerminate(&pCollector->plw);
  plwDestroy(&pCollector->plw);
  plwInit(&pCollector->plw, &pCollector->dlw, iDocid);
}
/* Append one position/offset tuple for iColumn to the doclist under
** collection.  Thin wrapper around plwAdd().
*/
static void dlcAddPos(DLCollector *pCollector, int iColumn, int iPos,
                      int iStartOffset, int iEndOffset){
  plwAdd(&pCollector->plw, iColumn, iPos, iStartOffset, iEndOffset);
}
                   1087: 
                   1088: static DLCollector *dlcNew(sqlite_int64 iDocid, DocListType iType){
                   1089:   DLCollector *pCollector = sqlite3_malloc(sizeof(DLCollector));
                   1090:   dataBufferInit(&pCollector->b, 0);
                   1091:   dlwInit(&pCollector->dlw, iType, &pCollector->b);
                   1092:   plwInit(&pCollector->plw, &pCollector->dlw, iDocid);
                   1093:   return pCollector;
                   1094: }
/* Destroy a collector and everything it contains.  Teardown runs in
** reverse of dlcNew()'s initialization order, then the struct memory
** is scrambled (debug aid) and freed.
*/
static void dlcDelete(DLCollector *pCollector){
  plwDestroy(&pCollector->plw);
  dlwDestroy(&pCollector->dlw);
  dataBufferDestroy(&pCollector->b);
  SCRAMBLE(pCollector);
  sqlite3_free(pCollector);
}
                   1102: 
                   1103: 
                   1104: /* Copy the doclist data of iType in pData/nData into *out, trimming
                   1105: ** unnecessary data as we go.  Only columns matching iColumn are
                   1106: ** copied, all columns copied if iColumn is -1.  Elements with no
                   1107: ** matching columns are dropped.  The output is an iOutType doclist.
                   1108: */
                   1109: /* NOTE(shess) This code is only valid after all doclists are merged.
                   1110: ** If this is run before merges, then doclist items which represent
                   1111: ** deletion will be trimmed, and will thus not effect a deletion
                   1112: ** during the merge.
                   1113: */
static void docListTrim(DocListType iType, const char *pData, int nData,
                        int iColumn, DocListType iOutType, DataBuffer *out){
  DLReader dlReader;
  DLWriter dlWriter;

  /* Trimming can only drop information, never synthesize it. */
  assert( iOutType<=iType );

  dlrInit(&dlReader, iType, pData, nData);
  dlwInit(&dlWriter, iOutType, out);

  while( !dlrAtEnd(&dlReader) ){
    PLReader plReader;
    PLWriter plWriter;
    int match = 0;   /* True once this document has a matching element. */

    plrInit(&plReader, &dlReader);

    while( !plrAtEnd(&plReader) ){
      if( iColumn==-1 || plrColumn(&plReader)==iColumn ){
        /* Lazily open the output document so documents with no
        ** matching columns are dropped entirely.
        */
        if( !match ){
          plwInit(&plWriter, &dlWriter, dlrDocid(&dlReader));
          match = 1;
        }
        plwAdd(&plWriter, plrColumn(&plReader), plrPosition(&plReader),
               plrStartOffset(&plReader), plrEndOffset(&plReader));
      }
      plrStep(&plReader);
    }
    if( match ){
      plwTerminate(&plWriter);
      plwDestroy(&plWriter);
    }

    plrDestroy(&plReader);
    dlrStep(&dlReader);
  }
  dlwDestroy(&dlWriter);
  dlrDestroy(&dlReader);
}
                   1153: 
                   1154: /* Used by docListMerge() to keep doclists in the ascending order by
                   1155: ** docid, then ascending order by age (so the newest comes first).
                   1156: */
typedef struct OrderedDLReader {
  DLReader *pReader;   /* Underlying doclist reader being ordered. */

  /* TODO(shess) If we assume that docListMerge pReaders is ordered by
  ** age (which we do), then we could use pReader comparisons to break
  ** ties.
  */
  int idx;             /* Position in the original (age-ordered) reader array. */
} OrderedDLReader;
                   1166: 
                   1167: /* Order eof to end, then by docid asc, idx desc. */
                   1168: static int orderedDLReaderCmp(OrderedDLReader *r1, OrderedDLReader *r2){
                   1169:   if( dlrAtEnd(r1->pReader) ){
                   1170:     if( dlrAtEnd(r2->pReader) ) return 0;  /* Both atEnd(). */
                   1171:     return 1;                              /* Only r1 atEnd(). */
                   1172:   }
                   1173:   if( dlrAtEnd(r2->pReader) ) return -1;   /* Only r2 atEnd(). */
                   1174: 
                   1175:   if( dlrDocid(r1->pReader)<dlrDocid(r2->pReader) ) return -1;
                   1176:   if( dlrDocid(r1->pReader)>dlrDocid(r2->pReader) ) return 1;
                   1177: 
                   1178:   /* Descending on idx. */
                   1179:   return r2->idx-r1->idx;
                   1180: }
                   1181: 
                   1182: /* Bubble p[0] to appropriate place in p[1..n-1].  Assumes that
                   1183: ** p[1..n-1] is already sorted.
                   1184: */
                   1185: /* TODO(shess) Is this frequent enough to warrant a binary search?
                   1186: ** Before implementing that, instrument the code to check.  In most
                   1187: ** current usage, I expect that p[0] will be less than p[1] a very
                   1188: ** high proportion of the time.
                   1189: */
                   1190: static void orderedDLReaderReorder(OrderedDLReader *p, int n){
                   1191:   while( n>1 && orderedDLReaderCmp(p, p+1)>0 ){
                   1192:     OrderedDLReader tmp = p[0];
                   1193:     p[0] = p[1];
                   1194:     p[1] = tmp;
                   1195:     n--;
                   1196:     p++;
                   1197:   }
                   1198: }
                   1199: 
                   1200: /* Given an array of doclist readers, merge their doclist elements
                   1201: ** into out in sorted order (by docid), dropping elements from older
                   1202: ** readers when there is a duplicate docid.  pReaders is assumed to be
                   1203: ** ordered by age, oldest first.
                   1204: */
                   1205: /* TODO(shess) nReaders must be <= MERGE_COUNT.  This should probably
                   1206: ** be fixed.
                   1207: */
static void docListMerge(DataBuffer *out,
                         DLReader *pReaders, int nReaders){
  OrderedDLReader readers[MERGE_COUNT];
  DLWriter writer;
  int i, n;
  const char *pStart = 0;   /* Start of a pending run of contiguous raw data. */
  int nStart = 0;           /* Byte length of the pending run. */
  sqlite_int64 iFirstDocid = 0, iLastDocid = 0;  /* Docid range of the run. */

  assert( nReaders>0 );
  if( nReaders==1 ){
    /* Single input: copy its remaining data through unchanged. */
    dataBufferAppend(out, dlrDocData(pReaders), dlrAllDataBytes(pReaders));
    return;
  }

  assert( nReaders<=MERGE_COUNT );
  n = 0;
  for(i=0; i<nReaders; i++){
    assert( pReaders[i].iType==pReaders[0].iType );
    readers[i].pReader = pReaders+i;
    readers[i].idx = i;
    n += dlrAllDataBytes(&pReaders[i]);
  }
  /* Conservatively size output to sum of inputs.  Output should end
  ** up strictly smaller than input.
  */
  dataBufferExpand(out, n);

  /* Get the readers into sorted order.  Note that i==nReaders here
  ** (left over from the loop above), so this sorts from the back of
  ** the array forward.
  */
  while( i-->0 ){
    orderedDLReaderReorder(readers+i, nReaders-i);
  }

  dlwInit(&writer, pReaders[0].iType, out);
  while( !dlrAtEnd(readers[0].pReader) ){
    sqlite_int64 iDocid = dlrDocid(readers[0].pReader);

    /* If this is a continuation of the current buffer to copy, extend
    ** that buffer.  memcpy() seems to be more efficient if it has a
    ** lots of data to copy.
    */
    if( dlrDocData(readers[0].pReader)==pStart+nStart ){
      nStart += dlrDocDataBytes(readers[0].pReader);
    }else{
      /* Flush the previous run before starting a new one. */
      if( pStart!=0 ){
        dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
      }
      pStart = dlrDocData(readers[0].pReader);
      nStart = dlrDocDataBytes(readers[0].pReader);
      iFirstDocid = iDocid;
    }
    iLastDocid = iDocid;
    dlrStep(readers[0].pReader);

    /* Drop all of the older elements with the same docid. */
    for(i=1; i<nReaders &&
             !dlrAtEnd(readers[i].pReader) &&
             dlrDocid(readers[i].pReader)==iDocid; i++){
      dlrStep(readers[i].pReader);
    }

    /* Get the readers back into order.  Only the first i readers were
    ** stepped, so only those can be out of place.
    */
    while( i-->0 ){
      orderedDLReaderReorder(readers+i, nReaders-i);
    }
  }

  /* Copy over any remaining elements. */
  if( nStart>0 ) dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
  dlwDestroy(&writer);
}
                   1279: 
                   1280: /* Helper function for posListUnion().  Compares the current position
                   1281: ** between left and right, returning as standard C idiom of <0 if
                   1282: ** left<right, >0 if left>right, and 0 if left==right.  "End" always
                   1283: ** compares greater.
                   1284: */
                   1285: static int posListCmp(PLReader *pLeft, PLReader *pRight){
                   1286:   assert( pLeft->iType==pRight->iType );
                   1287:   if( pLeft->iType==DL_DOCIDS ) return 0;
                   1288: 
                   1289:   if( plrAtEnd(pLeft) ) return plrAtEnd(pRight) ? 0 : 1;
                   1290:   if( plrAtEnd(pRight) ) return -1;
                   1291: 
                   1292:   if( plrColumn(pLeft)<plrColumn(pRight) ) return -1;
                   1293:   if( plrColumn(pLeft)>plrColumn(pRight) ) return 1;
                   1294: 
                   1295:   if( plrPosition(pLeft)<plrPosition(pRight) ) return -1;
                   1296:   if( plrPosition(pLeft)>plrPosition(pRight) ) return 1;
                   1297:   if( pLeft->iType==DL_POSITIONS ) return 0;
                   1298: 
                   1299:   if( plrStartOffset(pLeft)<plrStartOffset(pRight) ) return -1;
                   1300:   if( plrStartOffset(pLeft)>plrStartOffset(pRight) ) return 1;
                   1301: 
                   1302:   if( plrEndOffset(pLeft)<plrEndOffset(pRight) ) return -1;
                   1303:   if( plrEndOffset(pLeft)>plrEndOffset(pRight) ) return 1;
                   1304: 
                   1305:   return 0;
                   1306: }
                   1307: 
                   1308: /* Write the union of position lists in pLeft and pRight to pOut.
                   1309: ** "Union" in this case meaning "All unique position tuples".  Should
                   1310: ** work with any doclist type, though both inputs and the output
                   1311: ** should be the same type.
                   1312: */
static void posListUnion(DLReader *pLeft, DLReader *pRight, DLWriter *pOut){
  PLReader left, right;
  PLWriter writer;

  /* Both readers must be positioned on the same document, and all
  ** three lists must share one doclist type.
  */
  assert( dlrDocid(pLeft)==dlrDocid(pRight) );
  assert( pLeft->iType==pRight->iType );
  assert( pLeft->iType==pOut->iType );

  plrInit(&left, pLeft);
  plrInit(&right, pRight);
  plwInit(&writer, pOut, dlrDocid(pLeft));

  /* Standard sorted-merge; duplicate tuples are emitted once. */
  while( !plrAtEnd(&left) || !plrAtEnd(&right) ){
    int c = posListCmp(&left, &right);
    if( c<0 ){
      plwCopy(&writer, &left);
      plrStep(&left);
    }else if( c>0 ){
      plwCopy(&writer, &right);
      plrStep(&right);
    }else{
      /* Equal tuples: copy one, advance both. */
      plwCopy(&writer, &left);
      plrStep(&left);
      plrStep(&right);
    }
  }

  plwTerminate(&writer);
  plwDestroy(&writer);
  plrDestroy(&left);
  plrDestroy(&right);
}
                   1345: 
                   1346: /* Write the union of doclists in pLeft and pRight to pOut.  For
                   1347: ** docids in common between the inputs, the union of the position
                   1348: ** lists is written.  Inputs and outputs are always type DL_DEFAULT.
                   1349: */
                   1350: static void docListUnion(
                   1351:   const char *pLeft, int nLeft,
                   1352:   const char *pRight, int nRight,
                   1353:   DataBuffer *pOut      /* Write the combined doclist here */
                   1354: ){
                   1355:   DLReader left, right;
                   1356:   DLWriter writer;
                   1357: 
                   1358:   if( nLeft==0 ){
                   1359:     if( nRight!=0) dataBufferAppend(pOut, pRight, nRight);
                   1360:     return;
                   1361:   }
                   1362:   if( nRight==0 ){
                   1363:     dataBufferAppend(pOut, pLeft, nLeft);
                   1364:     return;
                   1365:   }
                   1366: 
                   1367:   dlrInit(&left, DL_DEFAULT, pLeft, nLeft);
                   1368:   dlrInit(&right, DL_DEFAULT, pRight, nRight);
                   1369:   dlwInit(&writer, DL_DEFAULT, pOut);
                   1370: 
                   1371:   while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){
                   1372:     if( dlrAtEnd(&right) ){
                   1373:       dlwCopy(&writer, &left);
                   1374:       dlrStep(&left);
                   1375:     }else if( dlrAtEnd(&left) ){
                   1376:       dlwCopy(&writer, &right);
                   1377:       dlrStep(&right);
                   1378:     }else if( dlrDocid(&left)<dlrDocid(&right) ){
                   1379:       dlwCopy(&writer, &left);
                   1380:       dlrStep(&left);
                   1381:     }else if( dlrDocid(&left)>dlrDocid(&right) ){
                   1382:       dlwCopy(&writer, &right);
                   1383:       dlrStep(&right);
                   1384:     }else{
                   1385:       posListUnion(&left, &right, &writer);
                   1386:       dlrStep(&left);
                   1387:       dlrStep(&right);
                   1388:     }
                   1389:   }
                   1390: 
                   1391:   dlrDestroy(&left);
                   1392:   dlrDestroy(&right);
                   1393:   dlwDestroy(&writer);
                   1394: }
                   1395: 
                   1396: /* pLeft and pRight are DLReaders positioned to the same docid.
                   1397: **
                   1398: ** If there are no instances in pLeft or pRight where the position
                   1399: ** of pLeft is one less than the position of pRight, then this
                   1400: ** routine adds nothing to pOut.
                   1401: **
                   1402: ** If there are one or more instances where positions from pLeft
                   1403: ** are exactly one less than positions from pRight, then add a new
                   1404: ** document record to pOut.  If pOut wants to hold positions, then
                   1405: ** include the positions from pRight that are one more than a
                   1406: ** position in pLeft.  In other words:  pRight.iPos==pLeft.iPos+1.
                   1407: */
static void posListPhraseMerge(DLReader *pLeft, DLReader *pRight,
                               DLWriter *pOut){
  PLReader left, right;
  PLWriter writer;
  int match = 0;   /* True once a phrase match has opened the output doc. */

  assert( dlrDocid(pLeft)==dlrDocid(pRight) );
  assert( pOut->iType!=DL_POSITIONS_OFFSETS );

  plrInit(&left, pLeft);
  plrInit(&right, pRight);

  /* Advance both position lists in step, looking for pairs in the
  ** same column where right's position is exactly left's position+1.
  */
  while( !plrAtEnd(&left) && !plrAtEnd(&right) ){
    if( plrColumn(&left)<plrColumn(&right) ){
      plrStep(&left);
    }else if( plrColumn(&left)>plrColumn(&right) ){
      plrStep(&right);
    }else if( plrPosition(&left)+1<plrPosition(&right) ){
      plrStep(&left);
    }else if( plrPosition(&left)+1>plrPosition(&right) ){
      plrStep(&right);
    }else{
      /* Lazily open the output document on the first match so that
      ** documents with no phrase matches contribute nothing.
      */
      if( !match ){
        plwInit(&writer, pOut, dlrDocid(pLeft));
        match = 1;
      }
      /* Record the phrase position using right's position; offsets
      ** are not propagated (see the DL_POSITIONS_OFFSETS assert).
      */
      plwAdd(&writer, plrColumn(&right), plrPosition(&right), 0, 0);
      plrStep(&left);
      plrStep(&right);
    }
  }

  if( match ){
    plwTerminate(&writer);
    plwDestroy(&writer);
  }

  plrDestroy(&left);
  plrDestroy(&right);
}
                   1448: 
                   1449: /* We have two doclists with positions:  pLeft and pRight.
                   1450: ** Write the phrase intersection of these two doclists into pOut.
                   1451: **
                   1452: ** A phrase intersection means that two documents only match
                   1453: ** if pLeft.iPos+1==pRight.iPos.
                   1454: **
                   1455: ** iType controls the type of data written to pOut.  If iType is
                   1456: ** DL_POSITIONS, the positions are those from pRight.
                   1457: */
                   1458: static void docListPhraseMerge(
                   1459:   const char *pLeft, int nLeft,
                   1460:   const char *pRight, int nRight,
                   1461:   DocListType iType,
                   1462:   DataBuffer *pOut      /* Write the combined doclist here */
                   1463: ){
                   1464:   DLReader left, right;
                   1465:   DLWriter writer;
                   1466: 
                   1467:   if( nLeft==0 || nRight==0 ) return;
                   1468: 
                   1469:   assert( iType!=DL_POSITIONS_OFFSETS );
                   1470: 
                   1471:   dlrInit(&left, DL_POSITIONS, pLeft, nLeft);
                   1472:   dlrInit(&right, DL_POSITIONS, pRight, nRight);
                   1473:   dlwInit(&writer, iType, pOut);
                   1474: 
                   1475:   while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){
                   1476:     if( dlrDocid(&left)<dlrDocid(&right) ){
                   1477:       dlrStep(&left);
                   1478:     }else if( dlrDocid(&right)<dlrDocid(&left) ){
                   1479:       dlrStep(&right);
                   1480:     }else{
                   1481:       posListPhraseMerge(&left, &right, &writer);
                   1482:       dlrStep(&left);
                   1483:       dlrStep(&right);
                   1484:     }
                   1485:   }
                   1486: 
                   1487:   dlrDestroy(&left);
                   1488:   dlrDestroy(&right);
                   1489:   dlwDestroy(&writer);
                   1490: }
                   1491: 
                   1492: /* We have two DL_DOCIDS doclists:  pLeft and pRight.
                   1493: ** Write the intersection of these two doclists into pOut as a
                   1494: ** DL_DOCIDS doclist.
                   1495: */
                   1496: static void docListAndMerge(
                   1497:   const char *pLeft, int nLeft,
                   1498:   const char *pRight, int nRight,
                   1499:   DataBuffer *pOut      /* Write the combined doclist here */
                   1500: ){
                   1501:   DLReader left, right;
                   1502:   DLWriter writer;
                   1503: 
                   1504:   if( nLeft==0 || nRight==0 ) return;
                   1505: 
                   1506:   dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
                   1507:   dlrInit(&right, DL_DOCIDS, pRight, nRight);
                   1508:   dlwInit(&writer, DL_DOCIDS, pOut);
                   1509: 
                   1510:   while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){
                   1511:     if( dlrDocid(&left)<dlrDocid(&right) ){
                   1512:       dlrStep(&left);
                   1513:     }else if( dlrDocid(&right)<dlrDocid(&left) ){
                   1514:       dlrStep(&right);
                   1515:     }else{
                   1516:       dlwAdd(&writer, dlrDocid(&left));
                   1517:       dlrStep(&left);
                   1518:       dlrStep(&right);
                   1519:     }
                   1520:   }
                   1521: 
                   1522:   dlrDestroy(&left);
                   1523:   dlrDestroy(&right);
                   1524:   dlwDestroy(&writer);
                   1525: }
                   1526: 
                   1527: /* We have two DL_DOCIDS doclists:  pLeft and pRight.
                   1528: ** Write the union of these two doclists into pOut as a
                   1529: ** DL_DOCIDS doclist.
                   1530: */
                   1531: static void docListOrMerge(
                   1532:   const char *pLeft, int nLeft,
                   1533:   const char *pRight, int nRight,
                   1534:   DataBuffer *pOut      /* Write the combined doclist here */
                   1535: ){
                   1536:   DLReader left, right;
                   1537:   DLWriter writer;
                   1538: 
                   1539:   if( nLeft==0 ){
                   1540:     if( nRight!=0 ) dataBufferAppend(pOut, pRight, nRight);
                   1541:     return;
                   1542:   }
                   1543:   if( nRight==0 ){
                   1544:     dataBufferAppend(pOut, pLeft, nLeft);
                   1545:     return;
                   1546:   }
                   1547: 
                   1548:   dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
                   1549:   dlrInit(&right, DL_DOCIDS, pRight, nRight);
                   1550:   dlwInit(&writer, DL_DOCIDS, pOut);
                   1551: 
                   1552:   while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){
                   1553:     if( dlrAtEnd(&right) ){
                   1554:       dlwAdd(&writer, dlrDocid(&left));
                   1555:       dlrStep(&left);
                   1556:     }else if( dlrAtEnd(&left) ){
                   1557:       dlwAdd(&writer, dlrDocid(&right));
                   1558:       dlrStep(&right);
                   1559:     }else if( dlrDocid(&left)<dlrDocid(&right) ){
                   1560:       dlwAdd(&writer, dlrDocid(&left));
                   1561:       dlrStep(&left);
                   1562:     }else if( dlrDocid(&right)<dlrDocid(&left) ){
                   1563:       dlwAdd(&writer, dlrDocid(&right));
                   1564:       dlrStep(&right);
                   1565:     }else{
                   1566:       dlwAdd(&writer, dlrDocid(&left));
                   1567:       dlrStep(&left);
                   1568:       dlrStep(&right);
                   1569:     }
                   1570:   }
                   1571: 
                   1572:   dlrDestroy(&left);
                   1573:   dlrDestroy(&right);
                   1574:   dlwDestroy(&writer);
                   1575: }
                   1576: 
                   1577: /* We have two DL_DOCIDS doclists:  pLeft and pRight.
                   1578: ** Write into pOut as DL_DOCIDS doclist containing all documents that
                   1579: ** occur in pLeft but not in pRight.
                   1580: */
                   1581: static void docListExceptMerge(
                   1582:   const char *pLeft, int nLeft,
                   1583:   const char *pRight, int nRight,
                   1584:   DataBuffer *pOut      /* Write the combined doclist here */
                   1585: ){
                   1586:   DLReader left, right;
                   1587:   DLWriter writer;
                   1588: 
                   1589:   if( nLeft==0 ) return;
                   1590:   if( nRight==0 ){
                   1591:     dataBufferAppend(pOut, pLeft, nLeft);
                   1592:     return;
                   1593:   }
                   1594: 
                   1595:   dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
                   1596:   dlrInit(&right, DL_DOCIDS, pRight, nRight);
                   1597:   dlwInit(&writer, DL_DOCIDS, pOut);
                   1598: 
                   1599:   while( !dlrAtEnd(&left) ){
                   1600:     while( !dlrAtEnd(&right) && dlrDocid(&right)<dlrDocid(&left) ){
                   1601:       dlrStep(&right);
                   1602:     }
                   1603:     if( dlrAtEnd(&right) || dlrDocid(&left)<dlrDocid(&right) ){
                   1604:       dlwAdd(&writer, dlrDocid(&left));
                   1605:     }
                   1606:     dlrStep(&left);
                   1607:   }
                   1608: 
                   1609:   dlrDestroy(&left);
                   1610:   dlrDestroy(&right);
                   1611:   dlwDestroy(&writer);
                   1612: }
                   1613: 
                   1614: static char *string_dup_n(const char *s, int n){
                   1615:   char *str = sqlite3_malloc(n + 1);
                   1616:   memcpy(str, s, n);
                   1617:   str[n] = '\0';
                   1618:   return str;
                   1619: }
                   1620: 
                   1621: /* Duplicate a string; the caller must free() the returned string.
                   1622:  * (We don't use strdup() since it is not part of the standard C library and
                   1623:  * may not be available everywhere.) */
static char *string_dup(const char *s){
  /* Delegate to string_dup_n() using the full length of s. */
  return string_dup_n(s, strlen(s));
}
                   1627: 
                   1628: /* Format a string, replacing each occurrence of the % character with
                   1629:  * zDb.zName.  This may be more convenient than sqlite_mprintf()
                   1630:  * when one string is used repeatedly in a format string.
                   1631:  * The caller must free() the returned string. */
static char *string_format(const char *zFormat,
                           const char *zDb, const char *zName){
  const char *p;
  size_t len = 0;
  size_t nDb = strlen(zDb);
  size_t nName = strlen(zName);
  size_t nFullTableName = nDb+1+nName;   /* Length of "zDb.zName". */
  char *result;
  char *r;

  /* first compute length needed */
  for(p = zFormat ; *p ; ++p){
    len += (*p=='%' ? nFullTableName : 1);
  }
  len += 1;  /* for null terminator */

  /* NOTE(review): sqlite3_malloc() result is unchecked; on OOM the
  ** writes below dereference NULL.  Confirm whether callers can hit
  ** OOM here before adding a check, since NULL would then flow into
  ** sql_exec()/sql_prepare().
  */
  r = result = sqlite3_malloc(len);
  for(p = zFormat; *p; ++p){
    if( *p=='%' ){
      /* Expand '%' to the qualified name "zDb.zName". */
      memcpy(r, zDb, nDb);
      r += nDb;
      *r++ = '.';
      memcpy(r, zName, nName);
      r += nName;
    } else {
      *r++ = *p;
    }
  }
  *r++ = '\0';
  assert( r == result + len );
  return result;
}
                   1664: 
                   1665: static int sql_exec(sqlite3 *db, const char *zDb, const char *zName,
                   1666:                     const char *zFormat){
                   1667:   char *zCommand = string_format(zFormat, zDb, zName);
                   1668:   int rc;
                   1669:   TRACE(("FTS2 sql: %s\n", zCommand));
                   1670:   rc = sqlite3_exec(db, zCommand, NULL, 0, NULL);
                   1671:   sqlite3_free(zCommand);
                   1672:   return rc;
                   1673: }
                   1674: 
                   1675: static int sql_prepare(sqlite3 *db, const char *zDb, const char *zName,
                   1676:                        sqlite3_stmt **ppStmt, const char *zFormat){
                   1677:   char *zCommand = string_format(zFormat, zDb, zName);
                   1678:   int rc;
                   1679:   TRACE(("FTS2 prepare: %s\n", zCommand));
                   1680:   rc = sqlite3_prepare_v2(db, zCommand, -1, ppStmt, NULL);
                   1681:   sqlite3_free(zCommand);
                   1682:   return rc;
                   1683: }
                   1684: 
                   1685: /* end utility functions */
                   1686: 
/* Forward reference.  The full definition of fulltext_vtab appears below. */
typedef struct fulltext_vtab fulltext_vtab;
                   1689: 
/* A single term in a query is represented by an instance of
** the following structure.
*/
typedef struct QueryTerm {
  short int nPhrase; /* How many following terms are part of the same phrase */
  short int iPhrase; /* This is the i-th term of a phrase. */
  short int iColumn; /* Column of the index that must match this term */
  signed char isOr;  /* this term is preceded by "OR" */
  signed char isNot; /* this term is preceded by "-" */
  signed char isPrefix; /* this term is followed by "*" */
  char *pTerm;       /* text of the term.  '\000' terminated.  malloced */
  int nTerm;         /* Number of bytes in pTerm[], excluding terminator */
} QueryTerm;
                   1703: 
                   1704: 
/* A query string is parsed into a Query structure.
 *
 * We could, in theory, allow query strings to be complicated
 * nested expressions with precedence determined by parentheses.
 * But none of the major search engines do this.  (Perhaps the
 * feeling is that a parenthesized expression is too complex of
 * an idea for the average user to grasp.)  Taking our lead from
 * the major search engines, we will allow queries to be a list
 * of terms (with an implied AND operator) or phrases in double-quotes,
 * with a single optional "-" before each non-phrase term to designate
 * negation and an optional OR connector.
 *
 * OR binds more tightly than the implied AND, which is what the
 * major search engines seem to do.  So, for example:
 * 
 *    [one two OR three]     ==>    one AND (two OR three)
 *    [one OR two three]     ==>    (one OR two) AND three
 *
 * A "-" before a term matches all entries that lack that term.
 * The "-" must occur immediately before the term with no intervening
 * space.  This is how the search engines do it.
 *
 * A NOT term cannot be the right-hand operand of an OR.  If this
 * occurs in the query string, the NOT is ignored:
 *
 *    [one OR -two]          ==>    one OR two
 *
 */
typedef struct Query {
  fulltext_vtab *pFts;  /* The full text index */
  int nTerms;           /* Number of terms in the query */
  QueryTerm *pTerms;    /* Array of terms.  Space obtained from malloc() */
  int nextIsOr;         /* Set the isOr flag on the next inserted term */
  int nextColumn;       /* Next word parsed must be in this column */
  int dfltColumn;       /* The default column */
} Query;
                   1741: 
                   1742: 
/*
** An instance of the following structure keeps track of generated
** matching-word offset information and snippets.
*/
typedef struct Snippet {
  int nMatch;     /* Total number of matches */
  int nAlloc;     /* Space allocated for aMatch[] */
  struct snippetMatch { /* One entry for each matching term */
    char snStatus;       /* Status flag for use while constructing snippets */
    short int iCol;      /* The column that contains the match */
    short int iTerm;     /* The index in Query.pTerms[] of the matching term */
    short int nByte;     /* Number of bytes in the term */
    int iStart;          /* The offset to the first character of the term */
  } *aMatch;      /* Points to space obtained from malloc */
  char *zOffset;  /* Text rendering of aMatch[] */
  int nOffset;    /* strlen(zOffset) */
  char *zSnippet; /* Snippet text */
  int nSnippet;   /* strlen(zSnippet) */
} Snippet;
                   1762: 
                   1763: 
/* How a cursor was asked to scan the table (copy of idxNum from
** xBestIndex). */
typedef enum QueryType {
  QUERY_GENERIC,   /* table scan */
  QUERY_ROWID,     /* lookup by rowid */
  QUERY_FULLTEXT   /* QUERY_FULLTEXT + [i] is a full-text search for column i*/
} QueryType;
                   1769: 
/* Indices of the precompiled statements cached in
** fulltext_vtab.pFulltextStatements[].  Must stay in sync with the
** fulltext_zStatement[] table below. */
typedef enum fulltext_statement {
  CONTENT_INSERT_STMT,
  CONTENT_SELECT_STMT,
  CONTENT_UPDATE_STMT,
  CONTENT_DELETE_STMT,
  CONTENT_EXISTS_STMT,

  BLOCK_INSERT_STMT,
  BLOCK_SELECT_STMT,
  BLOCK_DELETE_STMT,
  BLOCK_DELETE_ALL_STMT,

  SEGDIR_MAX_INDEX_STMT,
  SEGDIR_SET_STMT,
  SEGDIR_SELECT_LEVEL_STMT,
  SEGDIR_SPAN_STMT,
  SEGDIR_DELETE_STMT,
  SEGDIR_SELECT_SEGMENT_STMT,
  SEGDIR_SELECT_ALL_STMT,
  SEGDIR_DELETE_ALL_STMT,
  SEGDIR_COUNT_STMT,

  MAX_STMT                     /* Always at end! */
} fulltext_statement;
                   1794: 
/* These must exactly match the enum above.  A '%' in each statement is
** expanded to "db.table" by string_format() at prepare time.  NULL
** entries are generated at runtime because they depend on the column
** list of the virtual table. */
/* TODO(shess): Is there some risk that a statement will be used in two
** cursors at once, e.g.  if a query joins a virtual table to itself?
** If so perhaps we should move some of these to the cursor object.
*/
static const char *const fulltext_zStatement[MAX_STMT] = {
  /* CONTENT_INSERT */ NULL,  /* generated in contentInsertStatement() */
  /* CONTENT_SELECT */ "select * from %_content where rowid = ?",
  /* CONTENT_UPDATE */ NULL,  /* generated in contentUpdateStatement() */
  /* CONTENT_DELETE */ "delete from %_content where rowid = ?",
  /* CONTENT_EXISTS */ "select rowid from %_content limit 1",

  /* BLOCK_INSERT */ "insert into %_segments values (?)",
  /* BLOCK_SELECT */ "select block from %_segments where rowid = ?",
  /* BLOCK_DELETE */ "delete from %_segments where rowid between ? and ?",
  /* BLOCK_DELETE_ALL */ "delete from %_segments",

  /* SEGDIR_MAX_INDEX */ "select max(idx) from %_segdir where level = ?",
  /* SEGDIR_SET */ "insert into %_segdir values (?, ?, ?, ?, ?, ?)",
  /* SEGDIR_SELECT_LEVEL */
  "select start_block, leaves_end_block, root from %_segdir "
  " where level = ? order by idx",
  /* SEGDIR_SPAN */
  "select min(start_block), max(end_block) from %_segdir "
  " where level = ? and start_block <> 0",
  /* SEGDIR_DELETE */ "delete from %_segdir where level = ?",

  /* NOTE(shess): The first three results of the following two
  ** statements must match.
  */
  /* SEGDIR_SELECT_SEGMENT */
  "select start_block, leaves_end_block, root from %_segdir "
  " where level = ? and idx = ?",
  /* SEGDIR_SELECT_ALL */
  "select start_block, leaves_end_block, root from %_segdir "
  " order by level desc, idx asc",
  /* SEGDIR_DELETE_ALL */ "delete from %_segdir",
  /* SEGDIR_COUNT */ "select count(*), ifnull(max(level),0) from %_segdir",
};
                   1834: 
/*
** A connection to a fulltext index is an instance of the following
** structure.  The xCreate and xConnect methods create an instance
** of this structure and xDestroy and xDisconnect free that instance.
** All other methods receive a pointer to the structure as one of their
** arguments.
*/
struct fulltext_vtab {
  sqlite3_vtab base;               /* Base class used by SQLite core */
  sqlite3 *db;                     /* The database connection */
  const char *zDb;                 /* logical database name */
  const char *zName;               /* virtual table name */
  int nColumn;                     /* number of columns in virtual table */
  char **azColumn;                 /* column names.  malloced */
  char **azContentColumn;          /* column names in content table; malloced */
  sqlite3_tokenizer *pTokenizer;   /* tokenizer for inserts and queries */

  /* Precompiled statements which we keep as long as the table is
  ** open, indexed by fulltext_statement.
  */
  sqlite3_stmt *pFulltextStatements[MAX_STMT];

  /* Precompiled statements used for segment merges.  We run a
  ** separate select across the leaf level of each tree being merged.
  */
  sqlite3_stmt *pLeafSelectStmts[MERGE_COUNT];
  /* The statement used to prepare pLeafSelectStmts. */
#define LEAF_SELECT \
  "select block from %_segments where rowid between ? and ? order by rowid"

  /* These buffer pending index updates during transactions.
  ** nPendingData estimates the memory size of the pending data.  It
  ** doesn't include the hash-bucket overhead, nor any malloc
  ** overhead.  When nPendingData exceeds kPendingThreshold, the
  ** buffer is flushed even before the transaction closes.
  ** pendingTerms stores the data, and is only valid when nPendingData
  ** is >=0 (nPendingData<0 means pendingTerms has not been
  ** initialized).  iPrevDocid is the last docid written, used to make
  ** certain we're inserting in sorted order.
  */
  int nPendingData;
#define kPendingThreshold (1*1024*1024)
  sqlite_int64 iPrevDocid;
  fts2Hash pendingTerms;
};
                   1880: 
/*
** When the core wants to do a query, it creates a cursor using a
** call to xOpen.  This structure is an instance of a cursor.  It
** is destroyed by xClose.
*/
typedef struct fulltext_cursor {
  sqlite3_vtab_cursor base;        /* Base class used by SQLite core */
  QueryType iCursorType;           /* Copy of sqlite3_index_info.idxNum */
  sqlite3_stmt *pStmt;             /* Prepared statement in use by the cursor */
  int eof;                         /* True if at End Of Results */
  Query q;                         /* Parsed query string */
  Snippet snippet;                 /* Cached snippet for the current row */
  int iColumn;                     /* Column being searched */
  DataBuffer result;               /* Doclist results from fulltextQuery */
  DLReader reader;                 /* Result reader if result not empty */
} fulltext_cursor;
                   1897: 
                   1898: static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){
                   1899:   return (fulltext_vtab *) c->base.pVtab;
                   1900: }
                   1901: 
                   1902: static const sqlite3_module fts2Module;   /* forward declaration */
                   1903: 
                   1904: /* Return a dynamically generated statement of the form
                   1905:  *   insert into %_content (rowid, ...) values (?, ...)
                   1906:  */
                   1907: static const char *contentInsertStatement(fulltext_vtab *v){
                   1908:   StringBuffer sb;
                   1909:   int i;
                   1910: 
                   1911:   initStringBuffer(&sb);
                   1912:   append(&sb, "insert into %_content (rowid, ");
                   1913:   appendList(&sb, v->nColumn, v->azContentColumn);
                   1914:   append(&sb, ") values (?");
                   1915:   for(i=0; i<v->nColumn; ++i)
                   1916:     append(&sb, ", ?");
                   1917:   append(&sb, ")");
                   1918:   return stringBufferData(&sb);
                   1919: }
                   1920: 
                   1921: /* Return a dynamically generated statement of the form
                   1922:  *   update %_content set [col_0] = ?, [col_1] = ?, ...
                   1923:  *                    where rowid = ?
                   1924:  */
                   1925: static const char *contentUpdateStatement(fulltext_vtab *v){
                   1926:   StringBuffer sb;
                   1927:   int i;
                   1928: 
                   1929:   initStringBuffer(&sb);
                   1930:   append(&sb, "update %_content set ");
                   1931:   for(i=0; i<v->nColumn; ++i) {
                   1932:     if( i>0 ){
                   1933:       append(&sb, ", ");
                   1934:     }
                   1935:     append(&sb, v->azContentColumn[i]);
                   1936:     append(&sb, " = ?");
                   1937:   }
                   1938:   append(&sb, " where rowid = ?");
                   1939:   return stringBufferData(&sb);
                   1940: }
                   1941: 
/* Puts a freshly-prepared statement determined by iStmt in *ppStmt.
** If the indicated statement has never been prepared, it is prepared
** and cached, otherwise the cached version is reset.
*/
static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt,
                             sqlite3_stmt **ppStmt){
  assert( iStmt<MAX_STMT );
  if( v->pFulltextStatements[iStmt]==NULL ){
    const char *zStmt;
    int rc;
    switch( iStmt ){
      /* These two statements depend on the table's column list and
      ** are generated (malloced) at runtime. */
      case CONTENT_INSERT_STMT:
        zStmt = contentInsertStatement(v); break;
      case CONTENT_UPDATE_STMT:
        zStmt = contentUpdateStatement(v); break;
      default:
        zStmt = fulltext_zStatement[iStmt];
    }
    rc = sql_prepare(v->db, v->zDb, v->zName, &v->pFulltextStatements[iStmt],
                         zStmt);
    /* A pointer that differs from the static table entry identifies a
    ** generated statement, which must be freed after preparation. */
    if( zStmt != fulltext_zStatement[iStmt]) sqlite3_free((void *) zStmt);
    if( rc!=SQLITE_OK ) return rc;
  } else {
    int rc = sqlite3_reset(v->pFulltextStatements[iStmt]);
    if( rc!=SQLITE_OK ) return rc;
  }

  *ppStmt = v->pFulltextStatements[iStmt];
  return SQLITE_OK;
}
                   1972: 
                   1973: /* Like sqlite3_step(), but convert SQLITE_DONE to SQLITE_OK and
                   1974: ** SQLITE_ROW to SQLITE_ERROR.  Useful for statements like UPDATE,
                   1975: ** where we expect no results.
                   1976: */
                   1977: static int sql_single_step(sqlite3_stmt *s){
                   1978:   int rc = sqlite3_step(s);
                   1979:   return (rc==SQLITE_DONE) ? SQLITE_OK : rc;
                   1980: }
                   1981: 
                   1982: /* Like sql_get_statement(), but for special replicated LEAF_SELECT
                   1983: ** statements.  idx -1 is a special case for an uncached version of
                   1984: ** the statement (used in the optimize implementation).
                   1985: */
                   1986: /* TODO(shess) Write version for generic statements and then share
                   1987: ** that between the cached-statement functions.
                   1988: */
                   1989: static int sql_get_leaf_statement(fulltext_vtab *v, int idx,
                   1990:                                   sqlite3_stmt **ppStmt){
                   1991:   assert( idx>=-1 && idx<MERGE_COUNT );
                   1992:   if( idx==-1 ){
                   1993:     return sql_prepare(v->db, v->zDb, v->zName, ppStmt, LEAF_SELECT);
                   1994:   }else if( v->pLeafSelectStmts[idx]==NULL ){
                   1995:     int rc = sql_prepare(v->db, v->zDb, v->zName, &v->pLeafSelectStmts[idx],
                   1996:                          LEAF_SELECT);
                   1997:     if( rc!=SQLITE_OK ) return rc;
                   1998:   }else{
                   1999:     int rc = sqlite3_reset(v->pLeafSelectStmts[idx]);
                   2000:     if( rc!=SQLITE_OK ) return rc;
                   2001:   }
                   2002: 
                   2003:   *ppStmt = v->pLeafSelectStmts[idx];
                   2004:   return SQLITE_OK;
                   2005: }
                   2006: 
                   2007: /* insert into %_content (rowid, ...) values ([rowid], [pValues]) */
                   2008: static int content_insert(fulltext_vtab *v, sqlite3_value *rowid,
                   2009:                           sqlite3_value **pValues){
                   2010:   sqlite3_stmt *s;
                   2011:   int i;
                   2012:   int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s);
                   2013:   if( rc!=SQLITE_OK ) return rc;
                   2014: 
                   2015:   rc = sqlite3_bind_value(s, 1, rowid);
                   2016:   if( rc!=SQLITE_OK ) return rc;
                   2017: 
                   2018:   for(i=0; i<v->nColumn; ++i){
                   2019:     rc = sqlite3_bind_value(s, 2+i, pValues[i]);
                   2020:     if( rc!=SQLITE_OK ) return rc;
                   2021:   }
                   2022: 
                   2023:   return sql_single_step(s);
                   2024: }
                   2025: 
                   2026: /* update %_content set col0 = pValues[0], col1 = pValues[1], ...
                   2027:  *                  where rowid = [iRowid] */
                   2028: static int content_update(fulltext_vtab *v, sqlite3_value **pValues,
                   2029:                           sqlite_int64 iRowid){
                   2030:   sqlite3_stmt *s;
                   2031:   int i;
                   2032:   int rc = sql_get_statement(v, CONTENT_UPDATE_STMT, &s);
                   2033:   if( rc!=SQLITE_OK ) return rc;
                   2034: 
                   2035:   for(i=0; i<v->nColumn; ++i){
                   2036:     rc = sqlite3_bind_value(s, 1+i, pValues[i]);
                   2037:     if( rc!=SQLITE_OK ) return rc;
                   2038:   }
                   2039: 
                   2040:   rc = sqlite3_bind_int64(s, 1+v->nColumn, iRowid);
                   2041:   if( rc!=SQLITE_OK ) return rc;
                   2042: 
                   2043:   return sql_single_step(s);
                   2044: }
                   2045: 
                   2046: static void freeStringArray(int nString, const char **pString){
                   2047:   int i;
                   2048: 
                   2049:   for (i=0 ; i < nString ; ++i) {
                   2050:     if( pString[i]!=NULL ) sqlite3_free((void *) pString[i]);
                   2051:   }
                   2052:   sqlite3_free((void *) pString);
                   2053: }
                   2054: 
                   2055: /* select * from %_content where rowid = [iRow]
                   2056:  * The caller must delete the returned array and all strings in it.
                   2057:  * null fields will be NULL in the returned array.
                   2058:  *
                   2059:  * TODO: Perhaps we should return pointer/length strings here for consistency
                   2060:  * with other code which uses pointer/length. */
                   2061: static int content_select(fulltext_vtab *v, sqlite_int64 iRow,
                   2062:                           const char ***pValues){
                   2063:   sqlite3_stmt *s;
                   2064:   const char **values;
                   2065:   int i;
                   2066:   int rc;
                   2067: 
                   2068:   *pValues = NULL;
                   2069: 
                   2070:   rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s);
                   2071:   if( rc!=SQLITE_OK ) return rc;
                   2072: 
                   2073:   rc = sqlite3_bind_int64(s, 1, iRow);
                   2074:   if( rc!=SQLITE_OK ) return rc;
                   2075: 
                   2076:   rc = sqlite3_step(s);
                   2077:   if( rc!=SQLITE_ROW ) return rc;
                   2078: 
                   2079:   values = (const char **) sqlite3_malloc(v->nColumn * sizeof(const char *));
                   2080:   for(i=0; i<v->nColumn; ++i){
                   2081:     if( sqlite3_column_type(s, i)==SQLITE_NULL ){
                   2082:       values[i] = NULL;
                   2083:     }else{
                   2084:       values[i] = string_dup((char*)sqlite3_column_text(s, i));
                   2085:     }
                   2086:   }
                   2087: 
                   2088:   /* We expect only one row.  We must execute another sqlite3_step()
                   2089:    * to complete the iteration; otherwise the table will remain locked. */
                   2090:   rc = sqlite3_step(s);
                   2091:   if( rc==SQLITE_DONE ){
                   2092:     *pValues = values;
                   2093:     return SQLITE_OK;
                   2094:   }
                   2095: 
                   2096:   freeStringArray(v->nColumn, values);
                   2097:   return rc;
                   2098: }
                   2099: 
                   2100: /* delete from %_content where rowid = [iRow ] */
                   2101: static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){
                   2102:   sqlite3_stmt *s;
                   2103:   int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s);
                   2104:   if( rc!=SQLITE_OK ) return rc;
                   2105: 
                   2106:   rc = sqlite3_bind_int64(s, 1, iRow);
                   2107:   if( rc!=SQLITE_OK ) return rc;
                   2108: 
                   2109:   return sql_single_step(s);
                   2110: }
                   2111: 
                   2112: /* Returns SQLITE_ROW if any rows exist in %_content, SQLITE_DONE if
                   2113: ** no rows exist, and any error in case of failure.
                   2114: */
                   2115: static int content_exists(fulltext_vtab *v){
                   2116:   sqlite3_stmt *s;
                   2117:   int rc = sql_get_statement(v, CONTENT_EXISTS_STMT, &s);
                   2118:   if( rc!=SQLITE_OK ) return rc;
                   2119: 
                   2120:   rc = sqlite3_step(s);
                   2121:   if( rc!=SQLITE_ROW ) return rc;
                   2122: 
                   2123:   /* We expect only one row.  We must execute another sqlite3_step()
                   2124:    * to complete the iteration; otherwise the table will remain locked. */
                   2125:   rc = sqlite3_step(s);
                   2126:   if( rc==SQLITE_DONE ) return SQLITE_ROW;
                   2127:   if( rc==SQLITE_ROW ) return SQLITE_ERROR;
                   2128:   return rc;
                   2129: }
                   2130: 
                   2131: /* insert into %_segments values ([pData])
                   2132: **   returns assigned rowid in *piBlockid
                   2133: */
                   2134: static int block_insert(fulltext_vtab *v, const char *pData, int nData,
                   2135:                         sqlite_int64 *piBlockid){
                   2136:   sqlite3_stmt *s;
                   2137:   int rc = sql_get_statement(v, BLOCK_INSERT_STMT, &s);
                   2138:   if( rc!=SQLITE_OK ) return rc;
                   2139: 
                   2140:   rc = sqlite3_bind_blob(s, 1, pData, nData, SQLITE_STATIC);
                   2141:   if( rc!=SQLITE_OK ) return rc;
                   2142: 
                   2143:   rc = sqlite3_step(s);
                   2144:   if( rc==SQLITE_ROW ) return SQLITE_ERROR;
                   2145:   if( rc!=SQLITE_DONE ) return rc;
                   2146: 
                   2147:   *piBlockid = sqlite3_last_insert_rowid(v->db);
                   2148:   return SQLITE_OK;
                   2149: }
                   2150: 
                   2151: /* delete from %_segments
                   2152: **   where rowid between [iStartBlockid] and [iEndBlockid]
                   2153: **
                   2154: ** Deletes the range of blocks, inclusive, used to delete the blocks
                   2155: ** which form a segment.
                   2156: */
                   2157: static int block_delete(fulltext_vtab *v,
                   2158:                         sqlite_int64 iStartBlockid, sqlite_int64 iEndBlockid){
                   2159:   sqlite3_stmt *s;
                   2160:   int rc = sql_get_statement(v, BLOCK_DELETE_STMT, &s);
                   2161:   if( rc!=SQLITE_OK ) return rc;
                   2162: 
                   2163:   rc = sqlite3_bind_int64(s, 1, iStartBlockid);
                   2164:   if( rc!=SQLITE_OK ) return rc;
                   2165: 
                   2166:   rc = sqlite3_bind_int64(s, 2, iEndBlockid);
                   2167:   if( rc!=SQLITE_OK ) return rc;
                   2168: 
                   2169:   return sql_single_step(s);
                   2170: }
                   2171: 
/* Returns SQLITE_ROW with *pidx set to the maximum segment idx found
** at iLevel.  Returns SQLITE_DONE if there are no segments at
** iLevel.  Otherwise returns an error.
*/
static int segdir_max_index(fulltext_vtab *v, int iLevel, int *pidx){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, SEGDIR_MAX_INDEX_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 1, iLevel);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  /* Should always get at least one row due to how max() works. */
  if( rc==SQLITE_DONE ) return SQLITE_DONE;
  if( rc!=SQLITE_ROW ) return rc;

  /* NULL means that there were no inputs to max(), i.e. no segments
  ** at this level.  Drain the statement and report SQLITE_DONE (a
  ** second row would be an internal error). */
  if( SQLITE_NULL==sqlite3_column_type(s, 0) ){
    rc = sqlite3_step(s);
    if( rc==SQLITE_ROW ) return SQLITE_ERROR;
    return rc;
  }

  *pidx = sqlite3_column_int(s, 0);

  /* We expect only one row.  We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain locked. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  if( rc!=SQLITE_DONE ) return rc;
  return SQLITE_ROW;
}
                   2205: 
                   2206: /* insert into %_segdir values (
                   2207: **   [iLevel], [idx],
                   2208: **   [iStartBlockid], [iLeavesEndBlockid], [iEndBlockid],
                   2209: **   [pRootData]
                   2210: ** )
                   2211: */
                   2212: static int segdir_set(fulltext_vtab *v, int iLevel, int idx,
                   2213:                       sqlite_int64 iStartBlockid,
                   2214:                       sqlite_int64 iLeavesEndBlockid,
                   2215:                       sqlite_int64 iEndBlockid,
                   2216:                       const char *pRootData, int nRootData){
                   2217:   sqlite3_stmt *s;
                   2218:   int rc = sql_get_statement(v, SEGDIR_SET_STMT, &s);
                   2219:   if( rc!=SQLITE_OK ) return rc;
                   2220: 
                   2221:   rc = sqlite3_bind_int(s, 1, iLevel);
                   2222:   if( rc!=SQLITE_OK ) return rc;
                   2223: 
                   2224:   rc = sqlite3_bind_int(s, 2, idx);
                   2225:   if( rc!=SQLITE_OK ) return rc;
                   2226: 
                   2227:   rc = sqlite3_bind_int64(s, 3, iStartBlockid);
                   2228:   if( rc!=SQLITE_OK ) return rc;
                   2229: 
                   2230:   rc = sqlite3_bind_int64(s, 4, iLeavesEndBlockid);
                   2231:   if( rc!=SQLITE_OK ) return rc;
                   2232: 
                   2233:   rc = sqlite3_bind_int64(s, 5, iEndBlockid);
                   2234:   if( rc!=SQLITE_OK ) return rc;
                   2235: 
                   2236:   rc = sqlite3_bind_blob(s, 6, pRootData, nRootData, SQLITE_STATIC);
                   2237:   if( rc!=SQLITE_OK ) return rc;
                   2238: 
                   2239:   return sql_single_step(s);
                   2240: }
                   2241: 
                   2242: /* Queries %_segdir for the block span of the segments in level
                   2243: ** iLevel.  Returns SQLITE_DONE if there are no blocks for iLevel,
                   2244: ** SQLITE_ROW if there are blocks, else an error.
                   2245: */
                   2246: static int segdir_span(fulltext_vtab *v, int iLevel,
                   2247:                        sqlite_int64 *piStartBlockid,
                   2248:                        sqlite_int64 *piEndBlockid){
                   2249:   sqlite3_stmt *s;
                   2250:   int rc = sql_get_statement(v, SEGDIR_SPAN_STMT, &s);
                   2251:   if( rc!=SQLITE_OK ) return rc;
                   2252: 
                   2253:   rc = sqlite3_bind_int(s, 1, iLevel);
                   2254:   if( rc!=SQLITE_OK ) return rc;
                   2255: 
                   2256:   rc = sqlite3_step(s);
                   2257:   if( rc==SQLITE_DONE ) return SQLITE_DONE;  /* Should never happen */
                   2258:   if( rc!=SQLITE_ROW ) return rc;
                   2259: 
                   2260:   /* This happens if all segments at this level are entirely inline. */
                   2261:   if( SQLITE_NULL==sqlite3_column_type(s, 0) ){
                   2262:     /* We expect only one row.  We must execute another sqlite3_step()
                   2263:      * to complete the iteration; otherwise the table will remain locked. */
                   2264:     int rc2 = sqlite3_step(s);
                   2265:     if( rc2==SQLITE_ROW ) return SQLITE_ERROR;
                   2266:     return rc2;
                   2267:   }
                   2268: 
                   2269:   *piStartBlockid = sqlite3_column_int64(s, 0);
                   2270:   *piEndBlockid = sqlite3_column_int64(s, 1);
                   2271: 
                   2272:   /* We expect only one row.  We must execute another sqlite3_step()
                   2273:    * to complete the iteration; otherwise the table will remain locked. */
                   2274:   rc = sqlite3_step(s);
                   2275:   if( rc==SQLITE_ROW ) return SQLITE_ERROR;
                   2276:   if( rc!=SQLITE_DONE ) return rc;
                   2277:   return SQLITE_ROW;
                   2278: }
                   2279: 
                   2280: /* Delete the segment blocks and segment directory records for all
                   2281: ** segments at iLevel.
                   2282: */
                   2283: static int segdir_delete(fulltext_vtab *v, int iLevel){
                   2284:   sqlite3_stmt *s;
                   2285:   sqlite_int64 iStartBlockid, iEndBlockid;
                   2286:   int rc = segdir_span(v, iLevel, &iStartBlockid, &iEndBlockid);
                   2287:   if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ) return rc;
                   2288: 
                   2289:   if( rc==SQLITE_ROW ){
                   2290:     rc = block_delete(v, iStartBlockid, iEndBlockid);
                   2291:     if( rc!=SQLITE_OK ) return rc;
                   2292:   }
                   2293: 
                   2294:   /* Delete the segment directory itself. */
                   2295:   rc = sql_get_statement(v, SEGDIR_DELETE_STMT, &s);
                   2296:   if( rc!=SQLITE_OK ) return rc;
                   2297: 
                   2298:   rc = sqlite3_bind_int64(s, 1, iLevel);
                   2299:   if( rc!=SQLITE_OK ) return rc;
                   2300: 
                   2301:   return sql_single_step(s);
                   2302: }
                   2303: 
                   2304: /* Delete entire fts index, SQLITE_OK on success, relevant error on
                   2305: ** failure.
                   2306: */
                   2307: static int segdir_delete_all(fulltext_vtab *v){
                   2308:   sqlite3_stmt *s;
                   2309:   int rc = sql_get_statement(v, SEGDIR_DELETE_ALL_STMT, &s);
                   2310:   if( rc!=SQLITE_OK ) return rc;
                   2311: 
                   2312:   rc = sql_single_step(s);
                   2313:   if( rc!=SQLITE_OK ) return rc;
                   2314: 
                   2315:   rc = sql_get_statement(v, BLOCK_DELETE_ALL_STMT, &s);
                   2316:   if( rc!=SQLITE_OK ) return rc;
                   2317: 
                   2318:   return sql_single_step(s);
                   2319: }
                   2320: 
                   2321: /* Returns SQLITE_OK with *pnSegments set to the number of entries in
                   2322: ** %_segdir and *piMaxLevel set to the highest level which has a
                   2323: ** segment.  Otherwise returns the SQLite error which caused failure.
                   2324: */
                   2325: static int segdir_count(fulltext_vtab *v, int *pnSegments, int *piMaxLevel){
                   2326:   sqlite3_stmt *s;
                   2327:   int rc = sql_get_statement(v, SEGDIR_COUNT_STMT, &s);
                   2328:   if( rc!=SQLITE_OK ) return rc;
                   2329: 
                   2330:   rc = sqlite3_step(s);
                   2331:   /* TODO(shess): This case should not be possible?  Should stronger
                   2332:   ** measures be taken if it happens?
                   2333:   */
                   2334:   if( rc==SQLITE_DONE ){
                   2335:     *pnSegments = 0;
                   2336:     *piMaxLevel = 0;
                   2337:     return SQLITE_OK;
                   2338:   }
                   2339:   if( rc!=SQLITE_ROW ) return rc;
                   2340: 
                   2341:   *pnSegments = sqlite3_column_int(s, 0);
                   2342:   *piMaxLevel = sqlite3_column_int(s, 1);
                   2343: 
                   2344:   /* We expect only one row.  We must execute another sqlite3_step()
                   2345:    * to complete the iteration; otherwise the table will remain locked. */
                   2346:   rc = sqlite3_step(s);
                   2347:   if( rc==SQLITE_DONE ) return SQLITE_OK;
                   2348:   if( rc==SQLITE_ROW ) return SQLITE_ERROR;
                   2349:   return rc;
                   2350: }
                   2351: 
                   2352: /* TODO(shess) clearPendingTerms() is far down the file because
                   2353: ** writeZeroSegment() is far down the file because LeafWriter is far
                   2354: ** down the file.  Consider refactoring the code to move the non-vtab
                   2355: ** code above the vtab code so that we don't need this forward
                   2356: ** reference.
                   2357: */
                   2358: static int clearPendingTerms(fulltext_vtab *v);
                   2359: 
                   2360: /*
                   2361: ** Free the memory used to contain a fulltext_vtab structure.
                   2362: */
                   2363: static void fulltext_vtab_destroy(fulltext_vtab *v){
                   2364:   int iStmt, i;
                   2365: 
                   2366:   TRACE(("FTS2 Destroy %p\n", v));
                   2367:   for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){
                   2368:     if( v->pFulltextStatements[iStmt]!=NULL ){
                   2369:       sqlite3_finalize(v->pFulltextStatements[iStmt]);
                   2370:       v->pFulltextStatements[iStmt] = NULL;
                   2371:     }
                   2372:   }
                   2373: 
                   2374:   for( i=0; i<MERGE_COUNT; i++ ){
                   2375:     if( v->pLeafSelectStmts[i]!=NULL ){
                   2376:       sqlite3_finalize(v->pLeafSelectStmts[i]);
                   2377:       v->pLeafSelectStmts[i] = NULL;
                   2378:     }
                   2379:   }
                   2380: 
                   2381:   if( v->pTokenizer!=NULL ){
                   2382:     v->pTokenizer->pModule->xDestroy(v->pTokenizer);
                   2383:     v->pTokenizer = NULL;
                   2384:   }
                   2385: 
                   2386:   clearPendingTerms(v);
                   2387: 
                   2388:   sqlite3_free(v->azColumn);
                   2389:   for(i = 0; i < v->nColumn; ++i) {
                   2390:     sqlite3_free(v->azContentColumn[i]);
                   2391:   }
                   2392:   sqlite3_free(v->azContentColumn);
                   2393:   sqlite3_free(v);
                   2394: }
                   2395: 
/*
** Token types for parsing the arguments to xConnect or xCreate.
*/
#define TOKEN_EOF         0    /* End of the input string */
#define TOKEN_SPACE       1    /* Any kind of whitespace */
#define TOKEN_ID          2    /* An identifier */
#define TOKEN_STRING      3    /* A string literal */
#define TOKEN_PUNCT       4    /* A single punctuation character */
                   2404: 
/*
** If X is a character that can be used in an identifier then
** IdChar(X) will be true.  Otherwise it is false.
**
** For ASCII, any character with the high-order bit set is
** allowed in an identifier.  For 7-bit characters, 
** sqlite3IsIdChar[X] must be 1.
**
** Ticket #1066.  the SQL standard does not allow '$' in the
** middle of identifiers.  But many SQL implementations do. 
** SQLite will allow '$' in identifiers for compatibility.
** But the feature is undocumented.
*/
/* Lookup table for characters 0x20..0x7f; index is (ch - 0x20). */
static const char isIdChar[] = {
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
    0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 2x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */
};
/* CAUTION: IdChar() assigns its argument to a variable named `c`, which
** must be declared as int in the calling scope (see getToken()).  Control
** characters (< 0x20) are never identifier characters. */
#define IdChar(C)  (((c=C)&0x80)!=0 || (c>0x1f && isIdChar[c-0x20]))
                   2428: 
                   2429: 
/*
** Return the length of the token that begins at z[0]. 
** Store the token type in *tokenType before returning.
**
** Note: the IdChar() macro writes to the local variable `c` as a side
** effect, which is why `c` is declared here.
*/
static int getToken(const char *z, int *tokenType){
  int i, c;
  switch( *z ){
    case 0: {
      /* End of input string. */
      *tokenType = TOKEN_EOF;
      return 0;
    }
    case ' ': case '\t': case '\n': case '\f': case '\r': {
      /* Collapse a run of whitespace into a single TOKEN_SPACE. */
      for(i=1; safe_isspace(z[i]); i++){}
      *tokenType = TOKEN_SPACE;
      return i;
    }
    case '`':
    case '\'':
    case '"': {
      /* Quoted string; a doubled delimiter inside is an escaped quote. */
      int delim = z[0];
      for(i=1; (c=z[i])!=0; i++){
        if( c==delim ){
          if( z[i+1]==delim ){
            i++;          /* skip over the escaped (doubled) quote */
          }else{
            break;        /* unescaped closing quote found */
          }
        }
      }
      *tokenType = TOKEN_STRING;
      /* Count the closing quote only if one was actually found (c!=0). */
      return i + (c!=0);
    }
    case '[': {
      /* MS SqlServer-style [identifier]; scan to ']' or end of input. */
      for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
      *tokenType = TOKEN_ID;
      return i;
    }
    default: {
      if( !IdChar(*z) ){
        break;            /* not an identifier char: fall through to PUNCT */
      }
      /* Ordinary identifier: consume all identifier characters. */
      for(i=1; IdChar(z[i]); i++){}
      *tokenType = TOKEN_ID;
      return i;
    }
  }
  /* Any other single character is punctuation. */
  *tokenType = TOKEN_PUNCT;
  return 1;
}
                   2479: 
/*
** A token extracted from a string is an instance of the following
** structure.  Used only transiently by tokenizeString(); z points into
** the caller's input string rather than owned storage.
*/
typedef struct Token {
  const char *z;       /* Pointer to token text.  Not '\000' terminated */
  short int n;         /* Length of the token text in bytes (max 32767) */
} Token;
                   2488: 
                   2489: /*
                   2490: ** Given a input string (which is really one of the argv[] parameters
                   2491: ** passed into xConnect or xCreate) split the string up into tokens.
                   2492: ** Return an array of pointers to '\000' terminated strings, one string
                   2493: ** for each non-whitespace token.
                   2494: **
                   2495: ** The returned array is terminated by a single NULL pointer.
                   2496: **
                   2497: ** Space to hold the returned array is obtained from a single
                   2498: ** malloc and should be freed by passing the return value to free().
                   2499: ** The individual strings within the token list are all a part of
                   2500: ** the single memory allocation and will all be freed at once.
                   2501: */
                   2502: static char **tokenizeString(const char *z, int *pnToken){
                   2503:   int nToken = 0;
                   2504:   Token *aToken = sqlite3_malloc( strlen(z) * sizeof(aToken[0]) );
                   2505:   int n = 1;
                   2506:   int e, i;
                   2507:   int totalSize = 0;
                   2508:   char **azToken;
                   2509:   char *zCopy;
                   2510:   while( n>0 ){
                   2511:     n = getToken(z, &e);
                   2512:     if( e!=TOKEN_SPACE ){
                   2513:       aToken[nToken].z = z;
                   2514:       aToken[nToken].n = n;
                   2515:       nToken++;
                   2516:       totalSize += n+1;
                   2517:     }
                   2518:     z += n;
                   2519:   }
                   2520:   azToken = (char**)sqlite3_malloc( nToken*sizeof(char*) + totalSize );
                   2521:   zCopy = (char*)&azToken[nToken];
                   2522:   nToken--;
                   2523:   for(i=0; i<nToken; i++){
                   2524:     azToken[i] = zCopy;
                   2525:     n = aToken[i].n;
                   2526:     memcpy(zCopy, aToken[i].z, n);
                   2527:     zCopy[n] = 0;
                   2528:     zCopy += n+1;
                   2529:   }
                   2530:   azToken[nToken] = 0;
                   2531:   sqlite3_free(aToken);
                   2532:   *pnToken = nToken;
                   2533:   return azToken;
                   2534: }
                   2535: 
                   2536: /*
                   2537: ** Convert an SQL-style quoted string into a normal string by removing
                   2538: ** the quote characters.  The conversion is done in-place.  If the
                   2539: ** input does not begin with a quote character, then this routine
                   2540: ** is a no-op.
                   2541: **
                   2542: ** Examples:
                   2543: **
                   2544: **     "abc"   becomes   abc
                   2545: **     'xyz'   becomes   xyz
                   2546: **     [pqr]   becomes   pqr
                   2547: **     `mno`   becomes   mno
                   2548: */
                   2549: static void dequoteString(char *z){
                   2550:   int quote;
                   2551:   int i, j;
                   2552:   if( z==0 ) return;
                   2553:   quote = z[0];
                   2554:   switch( quote ){
                   2555:     case '\'':  break;
                   2556:     case '"':   break;
                   2557:     case '`':   break;                /* For MySQL compatibility */
                   2558:     case '[':   quote = ']';  break;  /* For MS SqlServer compatibility */
                   2559:     default:    return;
                   2560:   }
                   2561:   for(i=1, j=0; z[i]; i++){
                   2562:     if( z[i]==quote ){
                   2563:       if( z[i+1]==quote ){
                   2564:         z[j++] = quote;
                   2565:         i++;
                   2566:       }else{
                   2567:         z[j++] = 0;
                   2568:         break;
                   2569:       }
                   2570:     }else{
                   2571:       z[j++] = z[i];
                   2572:     }
                   2573:   }
                   2574: }
                   2575: 
                   2576: /*
                   2577: ** The input azIn is a NULL-terminated list of tokens.  Remove the first
                   2578: ** token and all punctuation tokens.  Remove the quotes from
                   2579: ** around string literal tokens.
                   2580: **
                   2581: ** Example:
                   2582: **
                   2583: **     input:      tokenize chinese ( 'simplifed' , 'mixed' )
                   2584: **     output:     chinese simplifed mixed
                   2585: **
                   2586: ** Another example:
                   2587: **
                   2588: **     input:      delimiters ( '[' , ']' , '...' )
                   2589: **     output:     [ ] ...
                   2590: */
                   2591: static void tokenListToIdList(char **azIn){
                   2592:   int i, j;
                   2593:   if( azIn ){
                   2594:     for(i=0, j=-1; azIn[i]; i++){
                   2595:       if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){
                   2596:         dequoteString(azIn[i]);
                   2597:         if( j>=0 ){
                   2598:           azIn[j] = azIn[i];
                   2599:         }
                   2600:         j++;
                   2601:       }
                   2602:     }
                   2603:     azIn[j] = 0;
                   2604:   }
                   2605: }
                   2606: 
                   2607: 
                   2608: /*
                   2609: ** Find the first alphanumeric token in the string zIn.  Null-terminate
                   2610: ** this token.  Remove any quotation marks.  And return a pointer to
                   2611: ** the result.
                   2612: */
                   2613: static char *firstToken(char *zIn, char **pzTail){
                   2614:   int n, ttype;
                   2615:   while(1){
                   2616:     n = getToken(zIn, &ttype);
                   2617:     if( ttype==TOKEN_SPACE ){
                   2618:       zIn += n;
                   2619:     }else if( ttype==TOKEN_EOF ){
                   2620:       *pzTail = zIn;
                   2621:       return 0;
                   2622:     }else{
                   2623:       zIn[n] = 0;
                   2624:       *pzTail = &zIn[1];
                   2625:       dequoteString(zIn);
                   2626:       return zIn;
                   2627:     }
                   2628:   }
                   2629:   /*NOTREACHED*/
                   2630: }
                   2631: 
                   2632: /* Return true if...
                   2633: **
                   2634: **   *  s begins with the string t, ignoring case
                   2635: **   *  s is longer than t
                   2636: **   *  The first character of s beyond t is not a alphanumeric
                   2637: ** 
                   2638: ** Ignore leading space in *s.
                   2639: **
                   2640: ** To put it another way, return true if the first token of
                   2641: ** s[] is t[].
                   2642: */
                   2643: static int startsWith(const char *s, const char *t){
                   2644:   while( safe_isspace(*s) ){ s++; }
                   2645:   while( *t ){
                   2646:     if( safe_tolower(*s++)!=safe_tolower(*t++) ) return 0;
                   2647:   }
                   2648:   return *s!='_' && !safe_isalnum(*s);
                   2649: }
                   2650: 
/*
** An instance of this structure defines the "spec" of a
** full text index.  This structure is populated by parseSpec
** and use by fulltextConnect and fulltextCreate.
**
** Ownership: azColumn (and the zDb/zName strings, which point into it)
** is a single sqlite3_malloc() allocation made by parseSpec();
** azContentColumn[] entries are individually allocated with
** sqlite3_mprintf().  constructVtab() steals azColumn and
** azContentColumn; clearTableSpec() frees whatever remains.
*/
typedef struct TableSpec {
  const char *zDb;         /* Logical database name */
  const char *zName;       /* Name of the full-text index */
  int nColumn;             /* Number of columns to be indexed */
  char **azColumn;         /* Original names of columns to be indexed */
  char **azContentColumn;  /* Column names for %_content */
  char **azTokenizer;      /* Name of tokenizer and its arguments */
} TableSpec;
                   2664: 
/*
** Reclaim all of the memory used by a TableSpec
**
** Only the three arrays are freed here, not the individual
** azContentColumn[i] strings; those are normally handed off to the
** fulltext_vtab (see constructVtab) and freed later by
** fulltext_vtab_destroy().  NOTE(review): if parseSpec() fails after
** allocating some content-column strings, those strings appear to
** leak — confirm against the error paths in parseSpec().
*/
static void clearTableSpec(TableSpec *p) {
  sqlite3_free(p->azColumn);
  sqlite3_free(p->azContentColumn);
  sqlite3_free(p->azTokenizer);
}
                   2673: 
                   2674: /* Parse a CREATE VIRTUAL TABLE statement, which looks like this:
                   2675:  *
                   2676:  * CREATE VIRTUAL TABLE email
                   2677:  *        USING fts2(subject, body, tokenize mytokenizer(myarg))
                   2678:  *
                   2679:  * We return parsed information in a TableSpec structure.
                   2680:  * 
                   2681:  */
                   2682: static int parseSpec(TableSpec *pSpec, int argc, const char *const*argv,
                   2683:                      char**pzErr){
                   2684:   int i, n;
                   2685:   char *z, *zDummy;
                   2686:   char **azArg;
                   2687:   const char *zTokenizer = 0;    /* argv[] entry describing the tokenizer */
                   2688: 
                   2689:   assert( argc>=3 );
                   2690:   /* Current interface:
                   2691:   ** argv[0] - module name
                   2692:   ** argv[1] - database name
                   2693:   ** argv[2] - table name
                   2694:   ** argv[3..] - columns, optionally followed by tokenizer specification
                   2695:   **             and snippet delimiters specification.
                   2696:   */
                   2697: 
                   2698:   /* Make a copy of the complete argv[][] array in a single allocation.
                   2699:   ** The argv[][] array is read-only and transient.  We can write to the
                   2700:   ** copy in order to modify things and the copy is persistent.
                   2701:   */
                   2702:   CLEAR(pSpec);
                   2703:   for(i=n=0; i<argc; i++){
                   2704:     n += strlen(argv[i]) + 1;
                   2705:   }
                   2706:   azArg = sqlite3_malloc( sizeof(char*)*argc + n );
                   2707:   if( azArg==0 ){
                   2708:     return SQLITE_NOMEM;
                   2709:   }
                   2710:   z = (char*)&azArg[argc];
                   2711:   for(i=0; i<argc; i++){
                   2712:     azArg[i] = z;
                   2713:     strcpy(z, argv[i]);
                   2714:     z += strlen(z)+1;
                   2715:   }
                   2716: 
                   2717:   /* Identify the column names and the tokenizer and delimiter arguments
                   2718:   ** in the argv[][] array.
                   2719:   */
                   2720:   pSpec->zDb = azArg[1];
                   2721:   pSpec->zName = azArg[2];
                   2722:   pSpec->nColumn = 0;
                   2723:   pSpec->azColumn = azArg;
                   2724:   zTokenizer = "tokenize simple";
                   2725:   for(i=3; i<argc; ++i){
                   2726:     if( startsWith(azArg[i],"tokenize") ){
                   2727:       zTokenizer = azArg[i];
                   2728:     }else{
                   2729:       z = azArg[pSpec->nColumn] = firstToken(azArg[i], &zDummy);
                   2730:       pSpec->nColumn++;
                   2731:     }
                   2732:   }
                   2733:   if( pSpec->nColumn==0 ){
                   2734:     azArg[0] = "content";
                   2735:     pSpec->nColumn = 1;
                   2736:   }
                   2737: 
                   2738:   /*
                   2739:   ** Construct the list of content column names.
                   2740:   **
                   2741:   ** Each content column name will be of the form cNNAAAA
                   2742:   ** where NN is the column number and AAAA is the sanitized
                   2743:   ** column name.  "sanitized" means that special characters are
                   2744:   ** converted to "_".  The cNN prefix guarantees that all column
                   2745:   ** names are unique.
                   2746:   **
                   2747:   ** The AAAA suffix is not strictly necessary.  It is included
                   2748:   ** for the convenience of people who might examine the generated
                   2749:   ** %_content table and wonder what the columns are used for.
                   2750:   */
                   2751:   pSpec->azContentColumn = sqlite3_malloc( pSpec->nColumn * sizeof(char *) );
                   2752:   if( pSpec->azContentColumn==0 ){
                   2753:     clearTableSpec(pSpec);
                   2754:     return SQLITE_NOMEM;
                   2755:   }
                   2756:   for(i=0; i<pSpec->nColumn; i++){
                   2757:     char *p;
                   2758:     pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
                   2759:     for (p = pSpec->azContentColumn[i]; *p ; ++p) {
                   2760:       if( !safe_isalnum(*p) ) *p = '_';
                   2761:     }
                   2762:   }
                   2763: 
                   2764:   /*
                   2765:   ** Parse the tokenizer specification string.
                   2766:   */
                   2767:   pSpec->azTokenizer = tokenizeString(zTokenizer, &n);
                   2768:   tokenListToIdList(pSpec->azTokenizer);
                   2769: 
                   2770:   return SQLITE_OK;
                   2771: }
                   2772: 
                   2773: /*
                   2774: ** Generate a CREATE TABLE statement that describes the schema of
                   2775: ** the virtual table.  Return a pointer to this schema string.
                   2776: **
                   2777: ** Space is obtained from sqlite3_mprintf() and should be freed
                   2778: ** using sqlite3_free().
                   2779: */
                   2780: static char *fulltextSchema(
                   2781:   int nColumn,                  /* Number of columns */
                   2782:   const char *const* azColumn,  /* List of columns */
                   2783:   const char *zTableName        /* Name of the table */
                   2784: ){
                   2785:   int i;
                   2786:   char *zSchema, *zNext;
                   2787:   const char *zSep = "(";
                   2788:   zSchema = sqlite3_mprintf("CREATE TABLE x");
                   2789:   for(i=0; i<nColumn; i++){
                   2790:     zNext = sqlite3_mprintf("%s%s%Q", zSchema, zSep, azColumn[i]);
                   2791:     sqlite3_free(zSchema);
                   2792:     zSchema = zNext;
                   2793:     zSep = ",";
                   2794:   }
                   2795:   zNext = sqlite3_mprintf("%s,%Q)", zSchema, zTableName);
                   2796:   sqlite3_free(zSchema);
                   2797:   return zNext;
                   2798: }
                   2799: 
                   2800: /*
                   2801: ** Build a new sqlite3_vtab structure that will describe the
                   2802: ** fulltext index defined by spec.
                   2803: */
                   2804: static int constructVtab(
                   2805:   sqlite3 *db,              /* The SQLite database connection */
                   2806:   fts2Hash *pHash,          /* Hash table containing tokenizers */
                   2807:   TableSpec *spec,          /* Parsed spec information from parseSpec() */
                   2808:   sqlite3_vtab **ppVTab,    /* Write the resulting vtab structure here */
                   2809:   char **pzErr              /* Write any error message here */
                   2810: ){
                   2811:   int rc;
                   2812:   int n;
                   2813:   fulltext_vtab *v = 0;
                   2814:   const sqlite3_tokenizer_module *m = NULL;
                   2815:   char *schema;
                   2816: 
                   2817:   char const *zTok;         /* Name of tokenizer to use for this fts table */
                   2818:   int nTok;                 /* Length of zTok, including nul terminator */
                   2819: 
                   2820:   v = (fulltext_vtab *) sqlite3_malloc(sizeof(fulltext_vtab));
                   2821:   if( v==0 ) return SQLITE_NOMEM;
                   2822:   CLEAR(v);
                   2823:   /* sqlite will initialize v->base */
                   2824:   v->db = db;
                   2825:   v->zDb = spec->zDb;       /* Freed when azColumn is freed */
                   2826:   v->zName = spec->zName;   /* Freed when azColumn is freed */
                   2827:   v->nColumn = spec->nColumn;
                   2828:   v->azContentColumn = spec->azContentColumn;
                   2829:   spec->azContentColumn = 0;
                   2830:   v->azColumn = spec->azColumn;
                   2831:   spec->azColumn = 0;
                   2832: 
                   2833:   if( spec->azTokenizer==0 ){
                   2834:     return SQLITE_NOMEM;
                   2835:   }
                   2836: 
                   2837:   zTok = spec->azTokenizer[0]; 
                   2838:   if( !zTok ){
                   2839:     zTok = "simple";
                   2840:   }
                   2841:   nTok = strlen(zTok)+1;
                   2842: 
                   2843:   m = (sqlite3_tokenizer_module *)sqlite3Fts2HashFind(pHash, zTok, nTok);
                   2844:   if( !m ){
                   2845:     *pzErr = sqlite3_mprintf("unknown tokenizer: %s", spec->azTokenizer[0]);
                   2846:     rc = SQLITE_ERROR;
                   2847:     goto err;
                   2848:   }
                   2849: 
                   2850:   for(n=0; spec->azTokenizer[n]; n++){}
                   2851:   if( n ){
                   2852:     rc = m->xCreate(n-1, (const char*const*)&spec->azTokenizer[1],
                   2853:                     &v->pTokenizer);
                   2854:   }else{
                   2855:     rc = m->xCreate(0, 0, &v->pTokenizer);
                   2856:   }
                   2857:   if( rc!=SQLITE_OK ) goto err;
                   2858:   v->pTokenizer->pModule = m;
                   2859: 
                   2860:   /* TODO: verify the existence of backing tables foo_content, foo_term */
                   2861: 
                   2862:   schema = fulltextSchema(v->nColumn, (const char*const*)v->azColumn,
                   2863:                           spec->zName);
                   2864:   rc = sqlite3_declare_vtab(db, schema);
                   2865:   sqlite3_free(schema);
                   2866:   if( rc!=SQLITE_OK ) goto err;
                   2867: 
                   2868:   memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements));
                   2869: 
                   2870:   /* Indicate that the buffer is not live. */
                   2871:   v->nPendingData = -1;
                   2872: 
                   2873:   *ppVTab = &v->base;
                   2874:   TRACE(("FTS2 Connect %p\n", v));
                   2875: 
                   2876:   return rc;
                   2877: 
                   2878: err:
                   2879:   fulltext_vtab_destroy(v);
                   2880:   return rc;
                   2881: }
                   2882: 
                   2883: static int fulltextConnect(
                   2884:   sqlite3 *db,
                   2885:   void *pAux,
                   2886:   int argc, const char *const*argv,
                   2887:   sqlite3_vtab **ppVTab,
                   2888:   char **pzErr
                   2889: ){
                   2890:   TableSpec spec;
                   2891:   int rc = parseSpec(&spec, argc, argv, pzErr);
                   2892:   if( rc!=SQLITE_OK ) return rc;
                   2893: 
                   2894:   rc = constructVtab(db, (fts2Hash *)pAux, &spec, ppVTab, pzErr);
                   2895:   clearTableSpec(&spec);
                   2896:   return rc;
                   2897: }
                   2898: 
                   2899: /* The %_content table holds the text of each document, with
                   2900: ** the rowid used as the docid.
                   2901: */
                   2902: /* TODO(shess) This comment needs elaboration to match the updated
                   2903: ** code.  Work it into the top-of-file comment at that time.
                   2904: */
/* xCreate method: parse the table spec, create the three backing
** tables (%_content, %_segments, %_segdir), then construct the vtab.
** On any failure the partially-parsed spec is cleared before return.
*/
static int fulltextCreate(sqlite3 *db, void *pAux,
                          int argc, const char * const *argv,
                          sqlite3_vtab **ppVTab, char **pzErr){
  int rc;
  TableSpec spec;
  StringBuffer schema;
  TRACE(("FTS2 Create\n"));

  rc = parseSpec(&spec, argc, argv, pzErr);
  if( rc!=SQLITE_OK ) return rc;

  /* %_content holds one row per document; its rowid is the docid. */
  initStringBuffer(&schema);
  append(&schema, "CREATE TABLE %_content(");
  appendList(&schema, spec.nColumn, spec.azContentColumn);
  append(&schema, ")");
  rc = sql_exec(db, spec.zDb, spec.zName, stringBufferData(&schema));
  stringBufferDestroy(&schema);
  if( rc!=SQLITE_OK ) goto out;

  /* %_segments stores index blocks keyed by unaliased rowid (this is
  ** the VACUUM exposure described at the top of the file). */
  rc = sql_exec(db, spec.zDb, spec.zName,
                "create table %_segments(block blob);");
  if( rc!=SQLITE_OK ) goto out;

  /* %_segdir maps (level, idx) to the block range and root node of
  ** each index segment. */
  rc = sql_exec(db, spec.zDb, spec.zName,
                "create table %_segdir("
                "  level integer,"
                "  idx integer,"
                "  start_block integer,"
                "  leaves_end_block integer,"
                "  end_block integer,"
                "  root blob,"
                "  primary key(level, idx)"
                ");");
  if( rc!=SQLITE_OK ) goto out;

  rc = constructVtab(db, (fts2Hash *)pAux, &spec, ppVTab, pzErr);

out:
  clearTableSpec(&spec);
  return rc;
}
                   2946: 
                   2947: /* Decide how to handle an SQL query. */
                   2948: static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
                   2949:   int i;
                   2950:   TRACE(("FTS2 BestIndex\n"));
                   2951: 
                   2952:   for(i=0; i<pInfo->nConstraint; ++i){
                   2953:     const struct sqlite3_index_constraint *pConstraint;
                   2954:     pConstraint = &pInfo->aConstraint[i];
                   2955:     if( pConstraint->usable ) {
                   2956:       if( pConstraint->iColumn==-1 &&
                   2957:           pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ ){
                   2958:         pInfo->idxNum = QUERY_ROWID;      /* lookup by rowid */
                   2959:         TRACE(("FTS2 QUERY_ROWID\n"));
                   2960:       } else if( pConstraint->iColumn>=0 &&
                   2961:                  pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH ){
                   2962:         /* full-text search */
                   2963:         pInfo->idxNum = QUERY_FULLTEXT + pConstraint->iColumn;
                   2964:         TRACE(("FTS2 QUERY_FULLTEXT %d\n", pConstraint->iColumn));
                   2965:       } else continue;
                   2966: 
                   2967:       pInfo->aConstraintUsage[i].argvIndex = 1;
                   2968:       pInfo->aConstraintUsage[i].omit = 1;
                   2969: 
                   2970:       /* An arbitrary value for now.
                   2971:        * TODO: Perhaps rowid matches should be considered cheaper than
                   2972:        * full-text searches. */
                   2973:       pInfo->estimatedCost = 1.0;   
                   2974: 
                   2975:       return SQLITE_OK;
                   2976:     }
                   2977:   }
                   2978:   pInfo->idxNum = QUERY_GENERIC;
                   2979:   return SQLITE_OK;
                   2980: }
                   2981: 
                   2982: static int fulltextDisconnect(sqlite3_vtab *pVTab){
                   2983:   TRACE(("FTS2 Disconnect %p\n", pVTab));
                   2984:   fulltext_vtab_destroy((fulltext_vtab *)pVTab);
                   2985:   return SQLITE_OK;
                   2986: }
                   2987: 
                   2988: static int fulltextDestroy(sqlite3_vtab *pVTab){
                   2989:   fulltext_vtab *v = (fulltext_vtab *)pVTab;
                   2990:   int rc;
                   2991: 
                   2992:   TRACE(("FTS2 Destroy %p\n", pVTab));
                   2993:   rc = sql_exec(v->db, v->zDb, v->zName,
                   2994:                 "drop table if exists %_content;"
                   2995:                 "drop table if exists %_segments;"
                   2996:                 "drop table if exists %_segdir;"
                   2997:                 );
                   2998:   if( rc!=SQLITE_OK ) return rc;
                   2999: 
                   3000:   fulltext_vtab_destroy((fulltext_vtab *)pVTab);
                   3001:   return SQLITE_OK;
                   3002: }
                   3003: 
                   3004: static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
                   3005:   fulltext_cursor *c;
                   3006: 
                   3007:   c = (fulltext_cursor *) sqlite3_malloc(sizeof(fulltext_cursor));
                   3008:   if( c ){
                   3009:     memset(c, 0, sizeof(fulltext_cursor));
                   3010:     /* sqlite will initialize c->base */
                   3011:     *ppCursor = &c->base;
                   3012:     TRACE(("FTS2 Open %p: %p\n", pVTab, c));
                   3013:     return SQLITE_OK;
                   3014:   }else{
                   3015:     return SQLITE_NOMEM;
                   3016:   }
                   3017: }
                   3018: 
                   3019: 
                   3020: /* Free all of the dynamically allocated memory held by *q
                   3021: */
                   3022: static void queryClear(Query *q){
                   3023:   int i;
                   3024:   for(i = 0; i < q->nTerms; ++i){
                   3025:     sqlite3_free(q->pTerms[i].pTerm);
                   3026:   }
                   3027:   sqlite3_free(q->pTerms);
                   3028:   CLEAR(q);
                   3029: }
                   3030: 
                   3031: /* Free all of the dynamically allocated memory held by the
                   3032: ** Snippet
                   3033: */
                   3034: static void snippetClear(Snippet *p){
                   3035:   sqlite3_free(p->aMatch);
                   3036:   sqlite3_free(p->zOffset);
                   3037:   sqlite3_free(p->zSnippet);
                   3038:   CLEAR(p);
                   3039: }
                   3040: /*
                   3041: ** Append a single entry to the p->aMatch[] log.
                   3042: */
                   3043: static void snippetAppendMatch(
                   3044:   Snippet *p,               /* Append the entry to this snippet */
                   3045:   int iCol, int iTerm,      /* The column and query term */
                   3046:   int iStart, int nByte     /* Offset and size of the match */
                   3047: ){
                   3048:   int i;
                   3049:   struct snippetMatch *pMatch;
                   3050:   if( p->nMatch+1>=p->nAlloc ){
                   3051:     p->nAlloc = p->nAlloc*2 + 10;
                   3052:     p->aMatch = sqlite3_realloc(p->aMatch, p->nAlloc*sizeof(p->aMatch[0]) );
                   3053:     if( p->aMatch==0 ){
                   3054:       p->nMatch = 0;
                   3055:       p->nAlloc = 0;
                   3056:       return;
                   3057:     }
                   3058:   }
                   3059:   i = p->nMatch++;
                   3060:   pMatch = &p->aMatch[i];
                   3061:   pMatch->iCol = iCol;
                   3062:   pMatch->iTerm = iTerm;
                   3063:   pMatch->iStart = iStart;
                   3064:   pMatch->nByte = nByte;
                   3065: }
                   3066: 
                   3067: /*
                   3068: ** Sizing information for the circular buffer used in snippetOffsetsOfColumn()
                   3069: */
                   3070: #define FTS2_ROTOR_SZ   (32)
                   3071: #define FTS2_ROTOR_MASK (FTS2_ROTOR_SZ-1)
                   3072: 
                   3073: /*
                   3074: ** Add entries to pSnippet->aMatch[] for every match that occurs against
                   3075: ** document zDoc[0..nDoc-1] which is stored in column iColumn.
                   3076: */
static void snippetOffsetsOfColumn(
  Query *pQuery,
  Snippet *pSnippet,
  int iColumn,
  const char *zDoc,
  int nDoc
){
  const sqlite3_tokenizer_module *pTModule;  /* The tokenizer module */
  sqlite3_tokenizer *pTokenizer;             /* The specific tokenizer */
  sqlite3_tokenizer_cursor *pTCursor;        /* Tokenizer cursor */
  fulltext_vtab *pVtab;                /* The full text index */
  int nColumn;                         /* Number of columns in the index */
  const QueryTerm *aTerm;              /* Query string terms */
  int nTerm;                           /* Number of query string terms */  
  int i, j;                            /* Loop counters */
  int rc;                              /* Return code */
  unsigned int match, prevMatch;       /* Phrase search bitmasks */
  const char *zToken;                  /* Next token from the tokenizer */
  int nToken;                          /* Size of zToken */
  int iBegin, iEnd, iPos;              /* Offsets of beginning and end */

  /* The following variables keep a circular buffer of the last
  ** few tokens */
  unsigned int iRotor = 0;             /* Index of current token */
  int iRotorBegin[FTS2_ROTOR_SZ];      /* Beginning offset of token */
  int iRotorLen[FTS2_ROTOR_SZ];        /* Length of token */

  pVtab = pQuery->pFts;
  nColumn = pVtab->nColumn;
  pTokenizer = pVtab->pTokenizer;
  pTModule = pTokenizer->pModule;
  rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor);
  if( rc ) return;
  pTCursor->pTokenizer = pTokenizer;
  aTerm = pQuery->pTerms;
  nTerm = pQuery->nTerms;
  /* Clamp nTerm so the per-term bitmask (match/prevMatch) and the
  ** rotor lookback cannot overflow FTS2_ROTOR_SZ-bit bookkeeping. */
  if( nTerm>=FTS2_ROTOR_SZ ){
    nTerm = FTS2_ROTOR_SZ - 1;
  }
  prevMatch = 0;
  /* Tokenize the document; for every token record it in the rotor and
  ** test it against every query term. */
  while(1){
    rc = pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos);
    if( rc ) break;
    iRotorBegin[iRotor&FTS2_ROTOR_MASK] = iBegin;
    iRotorLen[iRotor&FTS2_ROTOR_MASK] = iEnd-iBegin;
    match = 0;
    for(i=0; i<nTerm; i++){
      int iCol;
      iCol = aTerm[i].iColumn;
      /* Skip terms constrained to a different column. */
      if( iCol>=0 && iCol<nColumn && iCol!=iColumn ) continue;
      if( aTerm[i].nTerm>nToken ) continue;
      /* A non-prefix term must match the token exactly in length. */
      if( !aTerm[i].isPrefix && aTerm[i].nTerm<nToken ) continue;
      assert( aTerm[i].nTerm<=nToken );
      if( memcmp(aTerm[i].pTerm, zToken, aTerm[i].nTerm) ) continue;
      /* For a phrase continuation (iPhrase>1), the previous term of
      ** the phrase must have matched the previous token; prevMatch
      ** carries last iteration's match bits shifted left by one. */
      if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue;
      match |= 1<<i;
      /* When the last term of a phrase matches, log every word of the
      ** phrase using positions remembered in the rotor. */
      if( i==nTerm-1 || aTerm[i+1].iPhrase==1 ){
        for(j=aTerm[i].iPhrase-1; j>=0; j--){
          int k = (iRotor-j) & FTS2_ROTOR_MASK;
          snippetAppendMatch(pSnippet, iColumn, i-j,
                iRotorBegin[k], iRotorLen[k]);
        }
      }
    }
    prevMatch = match<<1;
    iRotor++;
  }
  pTModule->xClose(pTCursor);  
}
                   3146: 
                   3147: 
                   3148: /*
                   3149: ** Compute all offsets for the current row of the query.  
                   3150: ** If the offsets have already been computed, this routine is a no-op.
                   3151: */
                   3152: static void snippetAllOffsets(fulltext_cursor *p){
                   3153:   int nColumn;
                   3154:   int iColumn, i;
                   3155:   int iFirst, iLast;
                   3156:   fulltext_vtab *pFts;
                   3157: 
                   3158:   if( p->snippet.nMatch ) return;
                   3159:   if( p->q.nTerms==0 ) return;
                   3160:   pFts = p->q.pFts;
                   3161:   nColumn = pFts->nColumn;
                   3162:   iColumn = (p->iCursorType - QUERY_FULLTEXT);
                   3163:   if( iColumn<0 || iColumn>=nColumn ){
                   3164:     iFirst = 0;
                   3165:     iLast = nColumn-1;
                   3166:   }else{
                   3167:     iFirst = iColumn;
                   3168:     iLast = iColumn;
                   3169:   }
                   3170:   for(i=iFirst; i<=iLast; i++){
                   3171:     const char *zDoc;
                   3172:     int nDoc;
                   3173:     zDoc = (const char*)sqlite3_column_text(p->pStmt, i+1);
                   3174:     nDoc = sqlite3_column_bytes(p->pStmt, i+1);
                   3175:     snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc);
                   3176:   }
                   3177: }
                   3178: 
                   3179: /*
                   3180: ** Convert the information in the aMatch[] array of the snippet
                   3181: ** into the string zOffset[0..nOffset-1].
                   3182: */
static void snippetOffsetText(Snippet *p){
  int i;
  int cnt = 0;
  StringBuffer sb;
  char zBuf[200];
  if( p->zOffset ) return;  /* already computed */
  initStringBuffer(&sb);
  for(i=0; i<p->nMatch; i++){
    struct snippetMatch *pMatch = &p->aMatch[i];
    /* zBuf[0] is a space separator; the first entry (cnt==0) formats
    ** starting at zBuf[0] and overwrites it, while later entries
    ** (cnt>0) format at zBuf[1], keeping the leading space. */
    zBuf[0] = ' ';
    sqlite3_snprintf(sizeof(zBuf)-1, &zBuf[cnt>0], "%d %d %d %d",
        pMatch->iCol, pMatch->iTerm, pMatch->iStart, pMatch->nByte);
    append(&sb, zBuf);
    cnt++;
  }
  p->zOffset = stringBufferData(&sb);
  p->nOffset = stringBufferLength(&sb);
}
                   3201: 
                   3202: /*
                   3203: ** zDoc[0..nDoc-1] is phrase of text.  aMatch[0..nMatch-1] are a set
                   3204: ** of matching words some of which might be in zDoc.  zDoc is column
                   3205: ** number iCol.
                   3206: **
                   3207: ** iBreak is suggested spot in zDoc where we could begin or end an
                   3208: ** excerpt.  Return a value similar to iBreak but possibly adjusted
                   3209: ** to be a little left or right so that the break point is better.
                   3210: */
static int wordBoundary(
  int iBreak,                   /* The suggested break point */
  const char *zDoc,             /* Document text */
  int nDoc,                     /* Number of bytes in zDoc[] */
  struct snippetMatch *aMatch,  /* Matching words */
  int nMatch,                   /* Number of entries in aMatch[] */
  int iCol                      /* The column number for zDoc[] */
){
  int i;
  /* Breaks near either end of the document snap to that end. */
  if( iBreak<=10 ){
    return 0;
  }
  if( iBreak>=nDoc-10 ){
    return nDoc;
  }
  /* Find the first match in this column that ends at or after iBreak. */
  for(i=0; i<nMatch && aMatch[i].iCol<iCol; i++){}
  while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; }
  if( i<nMatch ){
    /* Prefer to break at the start of a nearby match so the match is
    ** not split across the snippet boundary. */
    if( aMatch[i].iStart<iBreak+10 ){
      return aMatch[i].iStart;
    }
    if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
      return aMatch[i-1].iStart;
    }
  }
  /* Otherwise look up to 10 bytes either side for whitespace.  The
  ** guards above ensure iBreak-10 .. iBreak+10 stays inside zDoc. */
  for(i=1; i<=10; i++){
    if( safe_isspace(zDoc[iBreak-i]) ){
      return iBreak - i + 1;
    }
    if( safe_isspace(zDoc[iBreak+i]) ){
      return iBreak + i + 1;
    }
  }
  /* No better spot found; keep the suggested break. */
  return iBreak;
}
                   3246: 
                   3247: 
                   3248: 
                   3249: /*
                   3250: ** Allowed values for Snippet.aMatch[].snStatus
                   3251: */
                   3252: #define SNIPPET_IGNORE  0   /* It is ok to omit this match from the snippet */
                   3253: #define SNIPPET_DESIRED 1   /* We want to include this match in the snippet */
                   3254: 
                   3255: /*
                   3256: ** Generate the text of a snippet.
                   3257: */
static void snippetText(
  fulltext_cursor *pCursor,   /* The cursor we need the snippet for */
  const char *zStartMark,     /* Markup to appear before each match */
  const char *zEndMark,       /* Markup to appear after each match */
  const char *zEllipsis       /* Ellipsis mark */
){
  int i, j;
  struct snippetMatch *aMatch;
  int nMatch;
  int nDesired;               /* Count of matches still wanted in snippet */
  StringBuffer sb;
  int tailCol;                /* Column of previously emitted excerpt */
  int tailOffset;             /* End offset of previously emitted excerpt */
  int iCol;
  int nDoc;
  const char *zDoc;
  int iStart, iEnd;           /* Byte range of the current excerpt */
  int tailEllipsis = 0;       /* True if a trailing ellipsis is needed */
  int iMatch;
  

  /* Discard any previously generated snippet text. */
  sqlite3_free(pCursor->snippet.zSnippet);
  pCursor->snippet.zSnippet = 0;
  aMatch = pCursor->snippet.aMatch;
  nMatch = pCursor->snippet.nMatch;
  initStringBuffer(&sb);

  /* Mark one match per query term as DESIRED; the snippet tries to
  ** show at least one occurrence of every term. */
  for(i=0; i<nMatch; i++){
    aMatch[i].snStatus = SNIPPET_IGNORE;
  }
  nDesired = 0;
  for(i=0; i<pCursor->q.nTerms; i++){
    for(j=0; j<nMatch; j++){
      if( aMatch[j].iTerm==i ){
        aMatch[j].snStatus = SNIPPET_DESIRED;
        nDesired++;
        break;
      }
    }
  }

  iMatch = 0;
  tailCol = -1;
  tailOffset = 0;
  /* Emit one excerpt for each DESIRED match until all are covered. */
  for(i=0; i<nMatch && nDesired>0; i++){
    if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue;
    nDesired--;
    iCol = aMatch[i].iCol;
    zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1);
    nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1);
    /* Begin roughly 40 bytes before the match, snapped to a word
    ** boundary. */
    iStart = aMatch[i].iStart - 40;
    iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol);
    if( iStart<=10 ){
      iStart = 0;
    }
    /* Merge with the previous excerpt if they nearly touch. */
    if( iCol==tailCol && iStart<=tailOffset+20 ){
      iStart = tailOffset;
    }
    /* Emit an ellipsis between discontiguous excerpts. */
    if( (iCol!=tailCol && tailCol>=0) || iStart!=tailOffset ){
      trimWhiteSpace(&sb);
      appendWhiteSpace(&sb);
      append(&sb, zEllipsis);
      appendWhiteSpace(&sb);
    }
    /* End roughly 40 bytes after the match, snapped to a word
    ** boundary. */
    iEnd = aMatch[i].iStart + aMatch[i].nByte + 40;
    iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol);
    if( iEnd>=nDoc-10 ){
      iEnd = nDoc;
      tailEllipsis = 0;
    }else{
      tailEllipsis = 1;
    }
    /* Copy zDoc[iStart..iEnd-1] to the output, wrapping every match
    ** inside the range with zStartMark/zEndMark. */
    while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; }
    while( iStart<iEnd ){
      while( iMatch<nMatch && aMatch[iMatch].iStart<iStart
             && aMatch[iMatch].iCol<=iCol ){
        iMatch++;
      }
      if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd
             && aMatch[iMatch].iCol==iCol ){
        nappend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart);
        iStart = aMatch[iMatch].iStart;
        append(&sb, zStartMark);
        nappend(&sb, &zDoc[iStart], aMatch[iMatch].nByte);
        append(&sb, zEndMark);
        iStart += aMatch[iMatch].nByte;
        /* Other DESIRED matches for the same term are now redundant. */
        for(j=iMatch+1; j<nMatch; j++){
          if( aMatch[j].iTerm==aMatch[iMatch].iTerm
              && aMatch[j].snStatus==SNIPPET_DESIRED ){
            nDesired--;
            aMatch[j].snStatus = SNIPPET_IGNORE;
          }
        }
      }else{
        nappend(&sb, &zDoc[iStart], iEnd - iStart);
        iStart = iEnd;
      }
    }
    tailCol = iCol;
    tailOffset = iEnd;
  }
  trimWhiteSpace(&sb);
  if( tailEllipsis ){
    appendWhiteSpace(&sb);
    append(&sb, zEllipsis);
  }
  pCursor->snippet.zSnippet = stringBufferData(&sb);
  pCursor->snippet.nSnippet = stringBufferLength(&sb);
}
                   3367: 
                   3368: 
                   3369: /*
                   3370: ** Close the cursor.  For additional information see the documentation
                   3371: ** on the xClose method of the virtual table interface.
                   3372: */
                   3373: static int fulltextClose(sqlite3_vtab_cursor *pCursor){
                   3374:   fulltext_cursor *c = (fulltext_cursor *) pCursor;
                   3375:   TRACE(("FTS2 Close %p\n", c));
                   3376:   sqlite3_finalize(c->pStmt);
                   3377:   queryClear(&c->q);
                   3378:   snippetClear(&c->snippet);
                   3379:   if( c->result.nData!=0 ) dlrDestroy(&c->reader);
                   3380:   dataBufferDestroy(&c->result);
                   3381:   sqlite3_free(c);
                   3382:   return SQLITE_OK;
                   3383: }
                   3384: 
/* xNext method: advance the cursor to the next row.  For rowid and
** generic queries this just steps the prepared statement; for
** full-text queries it pulls the next docid from the doclist reader
** and re-runs the content lookup statement with it.
*/
static int fulltextNext(sqlite3_vtab_cursor *pCursor){
  fulltext_cursor *c = (fulltext_cursor *) pCursor;
  int rc;

  TRACE(("FTS2 Next %p\n", pCursor));
  /* The old snippet refers to the previous row; invalidate it. */
  snippetClear(&c->snippet);
  if( c->iCursorType < QUERY_FULLTEXT ){
    /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
    rc = sqlite3_step(c->pStmt);
    switch( rc ){
      case SQLITE_ROW:
        c->eof = 0;
        return SQLITE_OK;
      case SQLITE_DONE:
        c->eof = 1;
        return SQLITE_OK;
      default:
        c->eof = 1;
        return rc;
    }
  } else {  /* full-text query */
    rc = sqlite3_reset(c->pStmt);
    if( rc!=SQLITE_OK ) return rc;

    if( c->result.nData==0 || dlrAtEnd(&c->reader) ){
      c->eof = 1;
      return SQLITE_OK;
    }
    /* Bind the next matching docid, advance the reader, then check the
    ** bind result (the step must happen even before checking so the
    ** reader position stays consistent with the statement). */
    rc = sqlite3_bind_int64(c->pStmt, 1, dlrDocid(&c->reader));
    dlrStep(&c->reader);
    if( rc!=SQLITE_OK ) return rc;
    /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
    rc = sqlite3_step(c->pStmt);
    if( rc==SQLITE_ROW ){   /* the case we expect */
      c->eof = 0;
      return SQLITE_OK;
    }
    /* an error occurred; abort */
    /* SQLITE_DONE means the indexed docid has no %_content row, which
    ** indicates index/content inconsistency — report it as an error. */
    return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
  }
}
                   3426: 
                   3427: 
                   3428: /* TODO(shess) If we pushed LeafReader to the top of the file, or to
                   3429: ** another file, term_select() could be pushed above
                   3430: ** docListOfTerm().
                   3431: */
                   3432: static int termSelect(fulltext_vtab *v, int iColumn,
                   3433:                       const char *pTerm, int nTerm, int isPrefix,
                   3434:                       DocListType iType, DataBuffer *out);
                   3435: 
                   3436: /* Return a DocList corresponding to the query term *pTerm.  If *pTerm
                   3437: ** is the first term of a phrase query, go ahead and evaluate the phrase
                   3438: ** query and return the doclist for the entire phrase query.
                   3439: **
                   3440: ** The resulting DL_DOCIDS doclist is stored in pResult, which is
                   3441: ** overwritten.
                   3442: */
                   3443: static int docListOfTerm(
                   3444:   fulltext_vtab *v,   /* The full text index */
                   3445:   int iColumn,        /* column to restrict to.  No restriction if >=nColumn */
                   3446:   QueryTerm *pQTerm,  /* Term we are looking for, or 1st term of a phrase */
                   3447:   DataBuffer *pResult /* Write the result here */
                   3448: ){
                   3449:   DataBuffer left, right, new;
                   3450:   int i, rc;
                   3451: 
                   3452:   /* No phrase search if no position info. */
                   3453:   assert( pQTerm->nPhrase==0 || DL_DEFAULT!=DL_DOCIDS );
                   3454: 
                   3455:   /* This code should never be called with buffered updates. */
                   3456:   assert( v->nPendingData<0 );
                   3457: 
                   3458:   dataBufferInit(&left, 0);
                   3459:   rc = termSelect(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pQTerm->isPrefix,
                   3460:                   0<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS, &left);
                   3461:   if( rc ) return rc;
                   3462:   for(i=1; i<=pQTerm->nPhrase && left.nData>0; i++){
                   3463:     dataBufferInit(&right, 0);
                   3464:     rc = termSelect(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm,
                   3465:                     pQTerm[i].isPrefix, DL_POSITIONS, &right);
                   3466:     if( rc ){
                   3467:       dataBufferDestroy(&left);
                   3468:       return rc;
                   3469:     }
                   3470:     dataBufferInit(&new, 0);
                   3471:     docListPhraseMerge(left.pData, left.nData, right.pData, right.nData,
                   3472:                        i<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS, &new);
                   3473:     dataBufferDestroy(&left);
                   3474:     dataBufferDestroy(&right);
                   3475:     left = new;
                   3476:   }
                   3477:   *pResult = left;
                   3478:   return SQLITE_OK;
                   3479: }
                   3480: 
                   3481: /* Add a new term pTerm[0..nTerm-1] to the query *q.
                   3482: */
                   3483: static void queryAdd(Query *q, const char *pTerm, int nTerm){
                   3484:   QueryTerm *t;
                   3485:   ++q->nTerms;
                   3486:   q->pTerms = sqlite3_realloc(q->pTerms, q->nTerms * sizeof(q->pTerms[0]));
                   3487:   if( q->pTerms==0 ){
                   3488:     q->nTerms = 0;
                   3489:     return;
                   3490:   }
                   3491:   t = &q->pTerms[q->nTerms - 1];
                   3492:   CLEAR(t);
                   3493:   t->pTerm = sqlite3_malloc(nTerm+1);
                   3494:   memcpy(t->pTerm, pTerm, nTerm);
                   3495:   t->pTerm[nTerm] = 0;
                   3496:   t->nTerm = nTerm;
                   3497:   t->isOr = q->nextIsOr;
                   3498:   t->isPrefix = 0;
                   3499:   q->nextIsOr = 0;
                   3500:   t->iColumn = q->nextColumn;
                   3501:   q->nextColumn = q->dfltColumn;
                   3502: }
                   3503: 
                   3504: /*
                   3505: ** Check to see if the string zToken[0...nToken-1] matches any
                   3506: ** column name in the virtual table.   If it does,
                   3507: ** return the zero-indexed column number.  If not, return -1.
                   3508: */
                   3509: static int checkColumnSpecifier(
                   3510:   fulltext_vtab *pVtab,    /* The virtual table */
                   3511:   const char *zToken,      /* Text of the token */
                   3512:   int nToken               /* Number of characters in the token */
                   3513: ){
                   3514:   int i;
                   3515:   for(i=0; i<pVtab->nColumn; i++){
                   3516:     if( memcmp(pVtab->azColumn[i], zToken, nToken)==0
                   3517:         && pVtab->azColumn[i][nToken]==0 ){
                   3518:       return i;
                   3519:     }
                   3520:   }
                   3521:   return -1;
                   3522: }
                   3523: 
                   3524: /*
                   3525: ** Parse the text at pSegment[0..nSegment-1].  Add additional terms
                   3526: ** to the query being assemblied in pQuery.
                   3527: **
                   3528: ** inPhrase is true if pSegment[0..nSegment-1] is contained within
                   3529: ** double-quotes.  If inPhrase is true, then the first term
                   3530: ** is marked with the number of terms in the phrase less one and
                   3531: ** OR and "-" syntax is ignored.  If inPhrase is false, then every
                   3532: ** term found is marked with nPhrase=0 and OR and "-" syntax is significant.
                   3533: */
static int tokenizeSegment(
  sqlite3_tokenizer *pTokenizer,          /* The tokenizer to use */
  const char *pSegment, int nSegment,     /* Query expression being parsed */
  int inPhrase,                           /* True if within "..." */
  Query *pQuery                           /* Append results here */
){
  const sqlite3_tokenizer_module *pModule = pTokenizer->pModule;
  sqlite3_tokenizer_cursor *pCursor;
  int firstIndex = pQuery->nTerms;  /* index of first term added by this call */
  int iCol;
  int nTerm = 1;                    /* 1-based position within a phrase */
  
  int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor);
  if( rc!=SQLITE_OK ) return rc;
  pCursor->pTokenizer = pTokenizer;

  while( 1 ){
    const char *pToken;
    int nToken, iBegin, iEnd, iPos;

    /* NOTE(review): any xNext() failure (not just the natural end of
    ** input) exits this loop with its code discarded; the function
    ** returns xClose()'s result instead — confirm that is intended. */
    rc = pModule->xNext(pCursor,
                        &pToken, &nToken,
                        &iBegin, &iEnd, &iPos);
    if( rc!=SQLITE_OK ) break;
    /* "name:" where name is a column of the table restricts the next
    ** term to that column (only outside of a quoted phrase). */
    if( !inPhrase &&
        pSegment[iEnd]==':' &&
         (iCol = checkColumnSpecifier(pQuery->pFts, pToken, nToken))>=0 ){
      pQuery->nextColumn = iCol;
      continue;
    }
    /* A bare uppercase "OR" in the raw input marks the next term as
    ** OR-connected instead of the implicit AND. */
    if( !inPhrase && pQuery->nTerms>0 && nToken==2
         && pSegment[iBegin]=='O' && pSegment[iBegin+1]=='R' ){
      pQuery->nextIsOr = 1;
      continue;
    }
    queryAdd(pQuery, pToken, nToken);
    /* NOTE(review): if queryAdd() hits OOM it leaves pQuery->pTerms
    ** NULL with nTerms==0; the accesses below would then index
    ** pTerms[-1] — confirm OOM handling upstream. */
    /* '-' immediately before the token negates it (NOT term). */
    if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){
      pQuery->pTerms[pQuery->nTerms-1].isNot = 1;
    }
    /* '*' immediately after the token requests prefix matching. */
    if( iEnd<nSegment && pSegment[iEnd]=='*' ){
      pQuery->pTerms[pQuery->nTerms-1].isPrefix = 1;
    }
    pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm;
    if( inPhrase ){
      nTerm++;
    }
  }

  /* Record on the phrase's first term how many additional terms follow
  ** it (nPhrase == number of terms in the phrase minus one). */
  if( inPhrase && pQuery->nTerms>firstIndex ){
    pQuery->pTerms[firstIndex].nPhrase = pQuery->nTerms - firstIndex - 1;
  }

  return pModule->xClose(pCursor);
}
                   3588: 
                   3589: /* Parse a query string, yielding a Query object pQuery.
                   3590: **
                   3591: ** The calling function will need to queryClear() to clean up
                   3592: ** the dynamically allocated memory held by pQuery.
                   3593: */
                   3594: static int parseQuery(
                   3595:   fulltext_vtab *v,        /* The fulltext index */
                   3596:   const char *zInput,      /* Input text of the query string */
                   3597:   int nInput,              /* Size of the input text */
                   3598:   int dfltColumn,          /* Default column of the index to match against */
                   3599:   Query *pQuery            /* Write the parse results here. */
                   3600: ){
                   3601:   int iInput, inPhrase = 0;
                   3602: 
                   3603:   if( zInput==0 ) nInput = 0;
                   3604:   if( nInput<0 ) nInput = strlen(zInput);
                   3605:   pQuery->nTerms = 0;
                   3606:   pQuery->pTerms = NULL;
                   3607:   pQuery->nextIsOr = 0;
                   3608:   pQuery->nextColumn = dfltColumn;
                   3609:   pQuery->dfltColumn = dfltColumn;
                   3610:   pQuery->pFts = v;
                   3611: 
                   3612:   for(iInput=0; iInput<nInput; ++iInput){
                   3613:     int i;
                   3614:     for(i=iInput; i<nInput && zInput[i]!='"'; ++i){}
                   3615:     if( i>iInput ){
                   3616:       tokenizeSegment(v->pTokenizer, zInput+iInput, i-iInput, inPhrase,
                   3617:                        pQuery);
                   3618:     }
                   3619:     iInput = i;
                   3620:     if( i<nInput ){
                   3621:       assert( zInput[i]=='"' );
                   3622:       inPhrase = !inPhrase;
                   3623:     }
                   3624:   }
                   3625: 
                   3626:   if( inPhrase ){
                   3627:     /* unmatched quote */
                   3628:     queryClear(pQuery);
                   3629:     return SQLITE_ERROR;
                   3630:   }
                   3631:   return SQLITE_OK;
                   3632: }
                   3633: 
                   3634: /* TODO(shess) Refactor the code to remove this forward decl. */
                   3635: static int flushPendingTerms(fulltext_vtab *v);
                   3636: 
                   3637: /* Perform a full-text query using the search expression in
                   3638: ** zInput[0..nInput-1].  Return a list of matching documents
                   3639: ** in pResult.
                   3640: **
                   3641: ** Queries must match column iColumn.  Or if iColumn>=nColumn
                   3642: ** they are allowed to match against any column.
                   3643: */
static int fulltextQuery(
  fulltext_vtab *v,      /* The full text index */
  int iColumn,           /* Match against this column by default */
  const char *zInput,    /* The query string */
  int nInput,            /* Number of bytes in zInput[] */
  DataBuffer *pResult,   /* Write the result doclist here */
  Query *pQuery          /* Put parsed query string here */
){
  int i, iNext, rc;
  DataBuffer left, right, or, new;  /* working doclist buffers */
  int nNot = 0;                     /* count of NOT terms (skipped in pass 1) */
  QueryTerm *aTerm;

  /* TODO(shess) Instead of flushing pendingTerms, we could query for
  ** the relevant term and merge the doclist into what we receive from
  ** the database.  Wait and see if this is a common issue, first.
  **
  ** A good reason not to flush is to not generate update-related
  ** error codes from here.
  */

  /* Flush any buffered updates before executing the query. */
  rc = flushPendingTerms(v);
  if( rc!=SQLITE_OK ) return rc;

  /* TODO(shess) I think that the queryClear() calls below are not
  ** necessary, because fulltextClose() already clears the query.
  */
  rc = parseQuery(v, zInput, nInput, iColumn, pQuery);
  if( rc!=SQLITE_OK ) return rc;

  /* Empty or NULL queries return no results. */
  if( pQuery->nTerms==0 ){
    dataBufferInit(pResult, 0);
    return SQLITE_OK;
  }

  /* Merge AND terms. */
  /* TODO(shess) I think we can early-exit if( i>nNot && left.nData==0 ). */
  aTerm = pQuery->pTerms;
  /* Pass 1: intersect (AND) all non-NOT term groups.  Each group is a
  ** term or phrase, possibly extended by a run of OR-connected terms.
  ** iNext always advances past the whole phrase (nPhrase+1 terms).
  ** The invariant i==nNot detects "first group processed so far":
  ** only NOT terms were seen before index i. */
  for(i = 0; i<pQuery->nTerms; i=iNext){
    if( aTerm[i].isNot ){
      /* Handle all NOT terms in a separate pass */
      nNot++;
      iNext = i + aTerm[i].nPhrase+1;
      continue;
    }
    iNext = i + aTerm[i].nPhrase + 1;
    rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right);
    if( rc ){
      /* left only holds data once a first group has been processed. */
      if( i!=nNot ) dataBufferDestroy(&left);
      queryClear(pQuery);
      return rc;
    }
    /* Union (OR) any immediately following OR-connected groups into
    ** right before the AND merge. */
    while( iNext<pQuery->nTerms && aTerm[iNext].isOr ){
      rc = docListOfTerm(v, aTerm[iNext].iColumn, &aTerm[iNext], &or);
      iNext += aTerm[iNext].nPhrase + 1;
      if( rc ){
        if( i!=nNot ) dataBufferDestroy(&left);
        dataBufferDestroy(&right);
        queryClear(pQuery);
        return rc;
      }
      dataBufferInit(&new, 0);
      docListOrMerge(right.pData, right.nData, or.pData, or.nData, &new);
      dataBufferDestroy(&right);
      dataBufferDestroy(&or);
      right = new;
    }
    if( i==nNot ){           /* first term processed. */
      left = right;
    }else{
      dataBufferInit(&new, 0);
      docListAndMerge(left.pData, left.nData, right.pData, right.nData, &new);
      dataBufferDestroy(&right);
      dataBufferDestroy(&left);
      left = new;
    }
  }

  if( nNot==pQuery->nTerms ){
    /* We do not yet know how to handle a query of only NOT terms */
    return SQLITE_ERROR;
  }

  /* Do the EXCEPT terms */
  /* Pass 2: subtract each NOT term's doclist from the accumulated
  ** result in left. */
  for(i=0; i<pQuery->nTerms;  i += aTerm[i].nPhrase + 1){
    if( !aTerm[i].isNot ) continue;
    rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right);
    if( rc ){
      queryClear(pQuery);
      dataBufferDestroy(&left);
      return rc;
    }
    dataBufferInit(&new, 0);
    docListExceptMerge(left.pData, left.nData, right.pData, right.nData, &new);
    dataBufferDestroy(&right);
    dataBufferDestroy(&left);
    left = new;
  }

  /* Ownership of left's data transfers to the caller via pResult. */
  *pResult = left;
  return rc;
}
                   3748: 
                   3749: /*
                   3750: ** This is the xFilter interface for the virtual table.  See
                   3751: ** the virtual table xFilter method documentation for additional
                   3752: ** information.
                   3753: **
                   3754: ** If idxNum==QUERY_GENERIC then do a full table scan against
                   3755: ** the %_content table.
                   3756: **
                   3757: ** If idxNum==QUERY_ROWID then do a rowid lookup for a single entry
                   3758: ** in the %_content table.
                   3759: **
                   3760: ** If idxNum>=QUERY_FULLTEXT then use the full text index.  The
                   3761: ** column on the left-hand side of the MATCH operator is column
                   3762: ** number idxNum-QUERY_FULLTEXT, 0 indexed.  argv[0] is the right-hand
                   3763: ** side of the MATCH operator.
                   3764: */
                   3765: /* TODO(shess) Upgrade the cursor initialization and destruction to
                   3766: ** account for fulltextFilter() being called multiple times on the
                   3767: ** same cursor.  The current solution is very fragile.  Apply fix to
                   3768: ** fts2 as appropriate.
                   3769: */
static int fulltextFilter(
  sqlite3_vtab_cursor *pCursor,     /* The cursor used for this query */
  int idxNum, const char *idxStr,   /* Which indexing scheme to use */
  int argc, sqlite3_value **argv    /* Arguments for the indexing scheme */
){
  fulltext_cursor *c = (fulltext_cursor *) pCursor;
  fulltext_vtab *v = cursor_vtab(c);
  int rc;

  TRACE(("FTS2 Filter %p\n",pCursor));

  /* If the cursor has a statement that was not prepared according to
  ** idxNum, clear it.  I believe all calls to fulltextFilter with a
  ** given cursor will have the same idxNum, but in this case it's
  ** easy to be safe.
  */
  if( c->pStmt && c->iCursorType!=idxNum ){
    sqlite3_finalize(c->pStmt);
    c->pStmt = NULL;
  }

  /* Get a fresh statement appropriate to idxNum. */
  /* TODO(shess): Add a prepared-statement cache in the vt structure.
  ** The cache must handle multiple open cursors.  Easier to cache the
  ** statement variants at the vt to reduce malloc/realloc/free here.
  ** Or we could have a StringBuffer variant which allowed stack
  ** construction for small values.
  */
  if( !c->pStmt ){
    /* QUERY_ROWID and full-text cursors both fetch single rows by
    ** rowid; only QUERY_GENERIC scans the whole %_content table.
    ** NOTE(review): sqlite3_mprintf() can return NULL on OOM; zSql is
    ** not checked here before being passed on — confirm sql_prepare()
    ** tolerates a NULL statement string. */
    char *zSql = sqlite3_mprintf("select rowid, * from %%_content %s",
                                 idxNum==QUERY_GENERIC ? "" : "where rowid=?");
    rc = sql_prepare(v->db, v->zDb, v->zName, &c->pStmt, zSql);
    sqlite3_free(zSql);
    if( rc!=SQLITE_OK ) return rc;
    c->iCursorType = idxNum;
  }else{
    /* Reuse the cached statement; it must match the requested plan. */
    sqlite3_reset(c->pStmt);
    assert( c->iCursorType==idxNum );
  }

  switch( idxNum ){
    case QUERY_GENERIC:
      break;

    case QUERY_ROWID:
      /* argv[0] is the rowid from the "rowid = ?" constraint. */
      rc = sqlite3_bind_int64(c->pStmt, 1, sqlite3_value_int64(argv[0]));
      if( rc!=SQLITE_OK ) return rc;
      break;

    default:   /* full-text search */
    {
      const char *zQuery = (const char *)sqlite3_value_text(argv[0]);
      assert( idxNum<=QUERY_FULLTEXT+v->nColumn);
      assert( argc==1 );
      queryClear(&c->q);
      if( c->result.nData!=0 ){
        /* This case happens if the same cursor is used repeatedly. */
        dlrDestroy(&c->reader);
        dataBufferReset(&c->result);
      }else{
        dataBufferInit(&c->result, 0);
      }
      /* idxNum-QUERY_FULLTEXT encodes the column on the left of MATCH. */
      rc = fulltextQuery(v, idxNum-QUERY_FULLTEXT, zQuery, -1, &c->result, &c->q);
      if( rc!=SQLITE_OK ) return rc;
      if( c->result.nData!=0 ){
        dlrInit(&c->reader, DL_DOCIDS, c->result.pData, c->result.nData);
      }
      break;
    }
  }

  /* Position the cursor on the first row (or at EOF). */
  return fulltextNext(pCursor);
}
                   3843: 
                   3844: /* This is the xEof method of the virtual table.  The SQLite core
                   3845: ** calls this routine to find out if it has reached the end of
                   3846: ** a query's results set.
                   3847: */
                   3848: static int fulltextEof(sqlite3_vtab_cursor *pCursor){
                   3849:   fulltext_cursor *c = (fulltext_cursor *) pCursor;
                   3850:   return c->eof;
                   3851: }
                   3852: 
                   3853: /* This is the xColumn method of the virtual table.  The SQLite
                   3854: ** core calls this method during a query when it needs the value
                   3855: ** of a column from the virtual table.  This method needs to use
                   3856: ** one of the sqlite3_result_*() routines to store the requested
                   3857: ** value back in the pContext.
                   3858: */
                   3859: static int fulltextColumn(sqlite3_vtab_cursor *pCursor,
                   3860:                           sqlite3_context *pContext, int idxCol){
                   3861:   fulltext_cursor *c = (fulltext_cursor *) pCursor;
                   3862:   fulltext_vtab *v = cursor_vtab(c);
                   3863: 
                   3864:   if( idxCol<v->nColumn ){
                   3865:     sqlite3_value *pVal = sqlite3_column_value(c->pStmt, idxCol+1);
                   3866:     sqlite3_result_value(pContext, pVal);
                   3867:   }else if( idxCol==v->nColumn ){
                   3868:     /* The extra column whose name is the same as the table.
                   3869:     ** Return a blob which is a pointer to the cursor
                   3870:     */
                   3871:     sqlite3_result_blob(pContext, &c, sizeof(c), SQLITE_TRANSIENT);
                   3872:   }
                   3873:   return SQLITE_OK;
                   3874: }
                   3875: 
                   3876: /* This is the xRowid method.  The SQLite core calls this routine to
                   3877: ** retrieve the rowid for the current row of the result set.  The
                   3878: ** rowid should be written to *pRowid.
                   3879: */
                   3880: static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){
                   3881:   fulltext_cursor *c = (fulltext_cursor *) pCursor;
                   3882: 
                   3883:   *pRowid = sqlite3_column_int64(c->pStmt, 0);
                   3884:   return SQLITE_OK;
                   3885: }
                   3886: 
                   3887: /* Add all terms in [zText] to pendingTerms table.  If [iColumn] > 0,
                   3888: ** we also store positions and offsets in the hash table using that
                   3889: ** column number.
                   3890: */
/* Tokenize zText and add every token to the pendingTerms hash table
** under docid iDocid.  If iColumn>=0, position and offset data is
** recorded for each token; iColumn<0 yields empty doclists, which is
** how deletions are represented.
**
** v->nPendingData is kept in step with the bytes added so the caller
** can decide when to flush the buffered terms.
*/
static int buildTerms(fulltext_vtab *v, sqlite_int64 iDocid,
                      const char *zText, int iColumn){
  sqlite3_tokenizer *pTokenizer = v->pTokenizer;
  sqlite3_tokenizer_cursor *pCursor;
  const char *pToken;
  int nTokenBytes;
  int iStartOffset, iEndOffset, iPosition;
  int rc;

  rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor);
  if( rc!=SQLITE_OK ) return rc;

  pCursor->pTokenizer = pTokenizer;
  while( SQLITE_OK==(rc=pTokenizer->pModule->xNext(pCursor,
                                                   &pToken, &nTokenBytes,
                                                   &iStartOffset, &iEndOffset,
                                                   &iPosition)) ){
    DLCollector *p;
    int nData;                   /* Size of doclist before our update. */

    /* Positions can't be negative; we use -1 as a terminator
     * internally.  Token can't be NULL or empty. */
    if( iPosition<0 || pToken == NULL || nTokenBytes == 0 ){
      rc = SQLITE_ERROR;
      break;
    }

    /* Look up (or create) the collector for this term. */
    p = fts2HashFind(&v->pendingTerms, pToken, nTokenBytes);
    if( p==NULL ){
      nData = 0;
      p = dlcNew(iDocid, DL_DEFAULT);
      fts2HashInsert(&v->pendingTerms, pToken, nTokenBytes, p);

      /* Overhead for our hash table entry, the key, and the value. */
      v->nPendingData += sizeof(struct fts2HashElem)+sizeof(*p)+nTokenBytes;
    }else{
      nData = p->b.nData;
      /* Start a new docid entry unless we're already collecting for
      ** this docid (multiple tokens from the same document). */
      if( p->dlw.iPrevDocid!=iDocid ) dlcNext(p, iDocid);
    }
    if( iColumn>=0 ){
      dlcAddPos(p, iColumn, iPosition, iStartOffset, iEndOffset);
    }

    /* Accumulate data added by dlcNew or dlcNext, and dlcAddPos. */
    v->nPendingData += p->b.nData-nData;
  }

  /* TODO(shess) Check return?  Should this be able to cause errors at
  ** this point?  Actually, same question about sqlite3_finalize(),
  ** though one could argue that failure there means that the data is
  ** not durable.  *ponder*
  */
  pTokenizer->pModule->xClose(pCursor);
  /* SQLITE_DONE from xNext is the normal end-of-text indicator. */
  if( SQLITE_DONE == rc ) return SQLITE_OK;
  return rc;
}
                   3947: 
                   3948: /* Add doclists for all terms in [pValues] to pendingTerms table. */
                   3949: static int insertTerms(fulltext_vtab *v, sqlite_int64 iRowid,
                   3950:                        sqlite3_value **pValues){
                   3951:   int i;
                   3952:   for(i = 0; i < v->nColumn ; ++i){
                   3953:     char *zText = (char*)sqlite3_value_text(pValues[i]);
                   3954:     int rc = buildTerms(v, iRowid, zText, i);
                   3955:     if( rc!=SQLITE_OK ) return rc;
                   3956:   }
                   3957:   return SQLITE_OK;
                   3958: }
                   3959: 
                   3960: /* Add empty doclists for all terms in the given row's content to
                   3961: ** pendingTerms.
                   3962: */
                   3963: static int deleteTerms(fulltext_vtab *v, sqlite_int64 iRowid){
                   3964:   const char **pValues;
                   3965:   int i, rc;
                   3966: 
                   3967:   /* TODO(shess) Should we allow such tables at all? */
                   3968:   if( DL_DEFAULT==DL_DOCIDS ) return SQLITE_ERROR;
                   3969: 
                   3970:   rc = content_select(v, iRowid, &pValues);
                   3971:   if( rc!=SQLITE_OK ) return rc;
                   3972: 
                   3973:   for(i = 0 ; i < v->nColumn; ++i) {
                   3974:     rc = buildTerms(v, iRowid, pValues[i], -1);
                   3975:     if( rc!=SQLITE_OK ) break;
                   3976:   }
                   3977: 
                   3978:   freeStringArray(v->nColumn, pValues);
                   3979:   return SQLITE_OK;
                   3980: }
                   3981: 
                   3982: /* TODO(shess) Refactor the code to remove this forward decl. */
                   3983: static int initPendingTerms(fulltext_vtab *v, sqlite_int64 iDocid);
                   3984: 
                   3985: /* Insert a row into the %_content table; set *piRowid to be the ID of the
                   3986: ** new row.  Add doclists for terms to pendingTerms.
                   3987: */
                   3988: static int index_insert(fulltext_vtab *v, sqlite3_value *pRequestRowid,
                   3989:                         sqlite3_value **pValues, sqlite_int64 *piRowid){
                   3990:   int rc;
                   3991: 
                   3992:   rc = content_insert(v, pRequestRowid, pValues);  /* execute an SQL INSERT */
                   3993:   if( rc!=SQLITE_OK ) return rc;
                   3994: 
                   3995:   *piRowid = sqlite3_last_insert_rowid(v->db);
                   3996:   rc = initPendingTerms(v, *piRowid);
                   3997:   if( rc!=SQLITE_OK ) return rc;
                   3998: 
                   3999:   return insertTerms(v, *piRowid, pValues);
                   4000: }
                   4001: 
                   4002: /* Delete a row from the %_content table; add empty doclists for terms
                   4003: ** to pendingTerms.
                   4004: */
                   4005: static int index_delete(fulltext_vtab *v, sqlite_int64 iRow){
                   4006:   int rc = initPendingTerms(v, iRow);
                   4007:   if( rc!=SQLITE_OK ) return rc;
                   4008: 
                   4009:   rc = deleteTerms(v, iRow);
                   4010:   if( rc!=SQLITE_OK ) return rc;
                   4011: 
                   4012:   return content_delete(v, iRow);  /* execute an SQL DELETE */
                   4013: }
                   4014: 
                   4015: /* Update a row in the %_content table; add delete doclists to
                   4016: ** pendingTerms for old terms not in the new data, add insert doclists
                   4017: ** to pendingTerms for terms in the new data.
                   4018: */
                   4019: static int index_update(fulltext_vtab *v, sqlite_int64 iRow,
                   4020:                         sqlite3_value **pValues){
                   4021:   int rc = initPendingTerms(v, iRow);
                   4022:   if( rc!=SQLITE_OK ) return rc;
                   4023: 
                   4024:   /* Generate an empty doclist for each term that previously appeared in this
                   4025:    * row. */
                   4026:   rc = deleteTerms(v, iRow);
                   4027:   if( rc!=SQLITE_OK ) return rc;
                   4028: 
                   4029:   rc = content_update(v, pValues, iRow);  /* execute an SQL UPDATE */
                   4030:   if( rc!=SQLITE_OK ) return rc;
                   4031: 
                   4032:   /* Now add positions for terms which appear in the updated row. */
                   4033:   return insertTerms(v, iRow, pValues);
                   4034: }
                   4035: 
                   4036: /*******************************************************************/
                   4037: /* InteriorWriter is used to collect terms and block references into
                   4038: ** interior nodes in %_segments.  See commentary at top of file for
                   4039: ** format.
                   4040: */
                   4041: 
                   4042: /* How large interior nodes can grow. */
                   4043: #define INTERIOR_MAX 2048
                   4044: 
                   4045: /* Minimum number of terms per interior node (except the root). This
                   4046: ** prevents large terms from making the tree too skinny - must be >0
                   4047: ** so that the tree always makes progress.  Note that the min tree
                   4048: ** fanout will be INTERIOR_MIN_TERMS+1.
                   4049: */
                   4050: #define INTERIOR_MIN_TERMS 7
                   4051: #if INTERIOR_MIN_TERMS<1
                   4052: # error INTERIOR_MIN_TERMS must be greater than 0.
                   4053: #endif
                   4054: 
                   4055: /* ROOT_MAX controls how much data is stored inline in the segment
                   4056: ** directory.
                   4057: */
                   4058: /* TODO(shess) Push ROOT_MAX down to whoever is writing things.  It's
                   4059: ** only here so that interiorWriterRootInfo() and leafWriterRootInfo()
                   4060: ** can both see it, but if the caller passed it in, we wouldn't even
                   4061: ** need a define.
                   4062: */
                   4063: #define ROOT_MAX 1024
                   4064: #if ROOT_MAX<VARINT_MAX*2
                   4065: # error ROOT_MAX must have enough space for a header.
                   4066: #endif
                   4067: 
/* InteriorBlock stores a linked-list of interior blocks while a lower
** layer is being constructed.
*/
typedef struct InteriorBlock {
  DataBuffer term;           /* Leftmost term in block's subtree. */
  DataBuffer data;           /* Accumulated data for the block. */
  struct InteriorBlock *next;  /* Next (rightward) block at this height. */
} InteriorBlock;
                   4076: 
                   4077: static InteriorBlock *interiorBlockNew(int iHeight, sqlite_int64 iChildBlock,
                   4078:                                        const char *pTerm, int nTerm){
                   4079:   InteriorBlock *block = sqlite3_malloc(sizeof(InteriorBlock));
                   4080:   char c[VARINT_MAX+VARINT_MAX];
                   4081:   int n;
                   4082: 
                   4083:   if( block ){
                   4084:     memset(block, 0, sizeof(*block));
                   4085:     dataBufferInit(&block->term, 0);
                   4086:     dataBufferReplace(&block->term, pTerm, nTerm);
                   4087: 
                   4088:     n = putVarint(c, iHeight);
                   4089:     n += putVarint(c+n, iChildBlock);
                   4090:     dataBufferInit(&block->data, INTERIOR_MAX);
                   4091:     dataBufferReplace(&block->data, c, n);
                   4092:   }
                   4093:   return block;
                   4094: }
                   4095: 
                   4096: #ifndef NDEBUG
                   4097: /* Verify that the data is readable as an interior node. */
/* Walk pBlock->data asserting that it decodes as a well-formed
** interior node: height varint, leftmost-child blockid varint, then
** zero or more (delta-encoded) terms.  Debug builds only.
*/
static void interiorBlockValidate(InteriorBlock *pBlock){
  const char *pData = pBlock->data.pData;
  int nData = pBlock->data.nData;
  int n, iDummy;
  sqlite_int64 iBlockid;

  assert( nData>0 );
  assert( pData!=0 );
  assert( pData+nData>pData );

  /* Must lead with height of node as a varint(n), n>0 */
  n = getVarint32(pData, &iDummy);
  assert( n>0 );
  assert( iDummy>0 );
  assert( n<nData );
  pData += n;
  nData -= n;

  /* Must contain iBlockid. */
  n = getVarint(pData, &iBlockid);
  assert( n>0 );
  assert( n<=nData );
  pData += n;
  nData -= n;

  /* Zero or more terms of positive length */
  if( nData!=0 ){
    /* First term is not delta-encoded. */
    n = getVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>0 );
    assert( n+iDummy>0);
    assert( n+iDummy<=nData );
    pData += n+iDummy;
    nData -= n+iDummy;

    /* Following terms delta-encoded. */
    while( nData!=0 ){
      /* Length of shared prefix. */
      n = getVarint32(pData, &iDummy);
      assert( n>0 );
      assert( iDummy>=0 );
      assert( n<nData );
      pData += n;
      nData -= n;

      /* Length and data of distinct suffix. */
      n = getVarint32(pData, &iDummy);
      assert( n>0 );
      assert( iDummy>0 );
      assert( n+iDummy>0);
      assert( n+iDummy<=nData );
      pData += n+iDummy;
      nData -= n+iDummy;
    }
  }
}
                   4155: #define ASSERT_VALID_INTERIOR_BLOCK(x) interiorBlockValidate(x)
                   4156: #else
                   4157: #define ASSERT_VALID_INTERIOR_BLOCK(x) assert( 1 )
                   4158: #endif
                   4159: 
typedef struct InteriorWriter {
  int iHeight;                   /* from 0 at leaves. */
  InteriorBlock *first, *last;   /* list of blocks at this height. */
  struct InteriorWriter *parentWriter;  /* created lazily on overflow. */

  DataBuffer term;               /* Last term written to block "last". */
  sqlite_int64 iOpeningChildBlock; /* First child block in block "last". */
#ifndef NDEBUG
  sqlite_int64 iLastChildBlock;  /* for consistency checks. */
#endif
} InteriorWriter;
                   4171: 
                   4172: /* Initialize an interior node where pTerm[nTerm] marks the leftmost
                   4173: ** term in the tree.  iChildBlock is the leftmost child block at the
                   4174: ** next level down the tree.
                   4175: */
                   4176: static void interiorWriterInit(int iHeight, const char *pTerm, int nTerm,
                   4177:                                sqlite_int64 iChildBlock,
                   4178:                                InteriorWriter *pWriter){
                   4179:   InteriorBlock *block;
                   4180:   assert( iHeight>0 );
                   4181:   CLEAR(pWriter);
                   4182: 
                   4183:   pWriter->iHeight = iHeight;
                   4184:   pWriter->iOpeningChildBlock = iChildBlock;
                   4185: #ifndef NDEBUG
                   4186:   pWriter->iLastChildBlock = iChildBlock;
                   4187: #endif
                   4188:   block = interiorBlockNew(iHeight, iChildBlock, pTerm, nTerm);
                   4189:   pWriter->last = pWriter->first = block;
                   4190:   ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
                   4191:   dataBufferInit(&pWriter->term, 0);
                   4192: }
                   4193: 
/* Append the child node rooted at iChildBlock to the interior node,
** with pTerm[nTerm] as the leftmost term in iChildBlock's subtree.
** Children must be appended in consecutive blockid order.
*/
static void interiorWriterAppend(InteriorWriter *pWriter,
                                 const char *pTerm, int nTerm,
                                 sqlite_int64 iChildBlock){
  char c[VARINT_MAX+VARINT_MAX];
  int n, nPrefix = 0;

  ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);

  /* The first term written into an interior node is actually
  ** associated with the second child added (the first child was added
  ** in interiorWriterInit, or in the if clause at the bottom of this
  ** function).  That term gets encoded straight up, with nPrefix left
  ** at 0.
  */
  if( pWriter->term.nData==0 ){
    /* First term of the block: encode the full term length only. */
    n = putVarint(c, nTerm);
  }else{
    /* Delta-encode against the previous term: shared-prefix length
    ** followed by the length of the distinct suffix.
    */
    while( nPrefix<pWriter->term.nData &&
           pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
      nPrefix++;
    }

    n = putVarint(c, nPrefix);
    n += putVarint(c+n, nTerm-nPrefix);
  }

#ifndef NDEBUG
  pWriter->iLastChildBlock++;
#endif
  assert( pWriter->iLastChildBlock==iChildBlock );

  /* Overflow to a new block if the new term makes the current block
  ** too big, and the current block already has enough terms.
  */
  if( pWriter->last->data.nData+n+nTerm-nPrefix>INTERIOR_MAX &&
      iChildBlock-pWriter->iOpeningChildBlock>INTERIOR_MIN_TERMS ){
    /* The fresh block absorbs pTerm as its leftmost term, so the
    ** encoding built above is discarded; reset the delta base.
    */
    pWriter->last->next = interiorBlockNew(pWriter->iHeight, iChildBlock,
                                           pTerm, nTerm);
    pWriter->last = pWriter->last->next;
    pWriter->iOpeningChildBlock = iChildBlock;
    dataBufferReset(&pWriter->term);
  }else{
    dataBufferAppend2(&pWriter->last->data, c, n,
                      pTerm+nPrefix, nTerm-nPrefix);
    dataBufferReplace(&pWriter->term, pTerm, nTerm);
  }
  ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
}
                   4245: 
                   4246: /* Free the space used by pWriter, including the linked-list of
                   4247: ** InteriorBlocks, and parentWriter, if present.
                   4248: */
                   4249: static int interiorWriterDestroy(InteriorWriter *pWriter){
                   4250:   InteriorBlock *block = pWriter->first;
                   4251: 
                   4252:   while( block!=NULL ){
                   4253:     InteriorBlock *b = block;
                   4254:     block = block->next;
                   4255:     dataBufferDestroy(&b->term);
                   4256:     dataBufferDestroy(&b->data);
                   4257:     sqlite3_free(b);
                   4258:   }
                   4259:   if( pWriter->parentWriter!=NULL ){
                   4260:     interiorWriterDestroy(pWriter->parentWriter);
                   4261:     sqlite3_free(pWriter->parentWriter);
                   4262:   }
                   4263:   dataBufferDestroy(&pWriter->term);
                   4264:   SCRAMBLE(pWriter);
                   4265:   return SQLITE_OK;
                   4266: }
                   4267: 
                   4268: /* If pWriter can fit entirely in ROOT_MAX, return it as the root info
                   4269: ** directly, leaving *piEndBlockid unchanged.  Otherwise, flush
                   4270: ** pWriter to %_segments, building a new layer of interior nodes, and
                   4271: ** recursively ask for their root into.
                   4272: */
                   4273: static int interiorWriterRootInfo(fulltext_vtab *v, InteriorWriter *pWriter,
                   4274:                                   char **ppRootInfo, int *pnRootInfo,
                   4275:                                   sqlite_int64 *piEndBlockid){
                   4276:   InteriorBlock *block = pWriter->first;
                   4277:   sqlite_int64 iBlockid = 0;
                   4278:   int rc;
                   4279: 
                   4280:   /* If we can fit the segment inline */
                   4281:   if( block==pWriter->last && block->data.nData<ROOT_MAX ){
                   4282:     *ppRootInfo = block->data.pData;
                   4283:     *pnRootInfo = block->data.nData;
                   4284:     return SQLITE_OK;
                   4285:   }
                   4286: 
                   4287:   /* Flush the first block to %_segments, and create a new level of
                   4288:   ** interior node.
                   4289:   */
                   4290:   ASSERT_VALID_INTERIOR_BLOCK(block);
                   4291:   rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid);
                   4292:   if( rc!=SQLITE_OK ) return rc;
                   4293:   *piEndBlockid = iBlockid;
                   4294: 
                   4295:   pWriter->parentWriter = sqlite3_malloc(sizeof(*pWriter->parentWriter));
                   4296:   interiorWriterInit(pWriter->iHeight+1,
                   4297:                      block->term.pData, block->term.nData,
                   4298:                      iBlockid, pWriter->parentWriter);
                   4299: 
                   4300:   /* Flush additional blocks and append to the higher interior
                   4301:   ** node.
                   4302:   */
                   4303:   for(block=block->next; block!=NULL; block=block->next){
                   4304:     ASSERT_VALID_INTERIOR_BLOCK(block);
                   4305:     rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid);
                   4306:     if( rc!=SQLITE_OK ) return rc;
                   4307:     *piEndBlockid = iBlockid;
                   4308: 
                   4309:     interiorWriterAppend(pWriter->parentWriter,
                   4310:                          block->term.pData, block->term.nData, iBlockid);
                   4311:   }
                   4312: 
                   4313:   /* Parent node gets the chance to be the root. */
                   4314:   return interiorWriterRootInfo(v, pWriter->parentWriter,
                   4315:                                 ppRootInfo, pnRootInfo, piEndBlockid);
                   4316: }
                   4317: 
/****************************************************************/
/* InteriorReader is used to read off the data from an interior node
** (see comment at top of file for the format).
*/
typedef struct InteriorReader {
  const char *pData;        /* unread remainder of the node's data. */
  int nData;                /* bytes remaining at pData. */

  DataBuffer term;          /* previous term, for decoding term delta. */

  sqlite_int64 iBlockid;    /* child blockid at the current position. */
} InteriorReader;
                   4330: 
                   4331: static void interiorReaderDestroy(InteriorReader *pReader){
                   4332:   dataBufferDestroy(&pReader->term);
                   4333:   SCRAMBLE(pReader);
                   4334: }
                   4335: 
/* Initialize pReader to iterate the interior node stored in
** pData[nData].  Decodes the leading height byte and base blockid,
** then positions the cursor on the first term (if any).
**
** TODO(shess) The assertions are great, but what if we're in NDEBUG
** and the blob is empty or otherwise contains suspect data?
*/
static void interiorReaderInit(const char *pData, int nData,
                               InteriorReader *pReader){
  int n, nTerm;

  /* Require at least the leading flag byte */
  assert( nData>0 );
  /* Leading byte is the height varint; >0 for interior nodes (leaf
  ** nodes lead with a varint(0), see leafNodeValidate).
  */
  assert( pData[0]!='\0' );

  CLEAR(pReader);

  /* Decode the base blockid, and set the cursor to the first term. */
  n = getVarint(pData+1, &pReader->iBlockid);
  assert( 1+n<=nData );
  pReader->pData = pData+1+n;
  pReader->nData = nData-(1+n);

  /* A single-child interior node (such as when a leaf node was too
  ** large for the segment directory) won't have any terms.
  ** Otherwise, decode the first term.
  */
  if( pReader->nData==0 ){
    dataBufferInit(&pReader->term, 0);
  }else{
    /* First term is stored whole (no delta against a prior term). */
    n = getVarint32(pReader->pData, &nTerm);
    dataBufferInit(&pReader->term, nTerm);
    dataBufferReplace(&pReader->term, pReader->pData+n, nTerm);
    assert( n+nTerm<=pReader->nData );
    pReader->pData += n+nTerm;
    pReader->nData -= n+nTerm;
  }
}
                   4370: 
                   4371: static int interiorReaderAtEnd(InteriorReader *pReader){
                   4372:   return pReader->term.nData==0;
                   4373: }
                   4374: 
                   4375: static sqlite_int64 interiorReaderCurrentBlockid(InteriorReader *pReader){
                   4376:   return pReader->iBlockid;
                   4377: }
                   4378: 
/* Length in bytes of the reader's current term.  Only valid while not
** at end.
*/
static int interiorReaderTermBytes(InteriorReader *pReader){
  assert( !interiorReaderAtEnd(pReader) );
  return pReader->term.nData;
}
/* Pointer to the reader's current term; pair with
** interiorReaderTermBytes() for its length.
*/
static const char *interiorReaderTerm(InteriorReader *pReader){
  assert( !interiorReaderAtEnd(pReader) );
  return pReader->term.pData;
}
                   4387: 
/* Step forward to the next term in the node. */
static void interiorReaderStep(InteriorReader *pReader){
  assert( !interiorReaderAtEnd(pReader) );

  /* If the last term has been read, signal eof, else construct the
  ** next term.
  */
  if( pReader->nData==0 ){
    /* Emptying the term buffer is the at-end signal (see
    ** interiorReaderAtEnd).
    */
    dataBufferReset(&pReader->term);
  }else{
    int n, nPrefix, nSuffix;

    /* Terms after the first are delta-encoded: shared-prefix length,
    ** then suffix length and suffix bytes.
    */
    n = getVarint32(pReader->pData, &nPrefix);
    n += getVarint32(pReader->pData+n, &nSuffix);

    /* Truncate the current term and append suffix data. */
    pReader->term.nData = nPrefix;
    dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix);

    assert( n+nSuffix<=pReader->nData );
    pReader->pData += n+nSuffix;
    pReader->nData -= n+nSuffix;
  }
  /* Each term consumed maps to the next consecutive child blockid. */
  pReader->iBlockid++;
}
                   4413: 
                   4414: /* Compare the current term to pTerm[nTerm], returning strcmp-style
                   4415: ** results.  If isPrefix, equality means equal through nTerm bytes.
                   4416: */
                   4417: static int interiorReaderTermCmp(InteriorReader *pReader,
                   4418:                                  const char *pTerm, int nTerm, int isPrefix){
                   4419:   const char *pReaderTerm = interiorReaderTerm(pReader);
                   4420:   int nReaderTerm = interiorReaderTermBytes(pReader);
                   4421:   int c, n = nReaderTerm<nTerm ? nReaderTerm : nTerm;
                   4422: 
                   4423:   if( n==0 ){
                   4424:     if( nReaderTerm>0 ) return -1;
                   4425:     if( nTerm>0 ) return 1;
                   4426:     return 0;
                   4427:   }
                   4428: 
                   4429:   c = memcmp(pReaderTerm, pTerm, n);
                   4430:   if( c!=0 ) return c;
                   4431:   if( isPrefix && n==nTerm ) return 0;
                   4432:   return nReaderTerm - nTerm;
                   4433: }
                   4434: 
                   4435: /****************************************************************/
                   4436: /* LeafWriter is used to collect terms and associated doclist data
                   4437: ** into leaf blocks in %_segments (see top of file for format info).
                   4438: ** Expected usage is:
                   4439: **
                   4440: ** LeafWriter writer;
                   4441: ** leafWriterInit(0, 0, &writer);
                   4442: ** while( sorted_terms_left_to_process ){
                   4443: **   // data is doclist data for that term.
                   4444: **   rc = leafWriterStep(v, &writer, pTerm, nTerm, pData, nData);
                   4445: **   if( rc!=SQLITE_OK ) goto err;
                   4446: ** }
                   4447: ** rc = leafWriterFinalize(v, &writer);
                   4448: **err:
                   4449: ** leafWriterDestroy(&writer);
                   4450: ** return rc;
                   4451: **
                   4452: ** leafWriterStep() may write a collected leaf out to %_segments.
                   4453: ** leafWriterFinalize() finishes writing any buffered data and stores
                   4454: ** a root node in %_segdir.  leafWriterDestroy() frees all buffers and
                   4455: ** InteriorWriters allocated as part of writing this segment.
                   4456: **
                   4457: ** TODO(shess) Document leafWriterStepMerge().
                   4458: */
                   4459: 
                   4460: /* Put terms with data this big in their own block. */
                   4461: #define STANDALONE_MIN 1024
                   4462: 
                   4463: /* Keep leaf blocks below this size. */
                   4464: #define LEAF_MAX 2048
                   4465: 
                   4466: typedef struct LeafWriter {
                   4467:   int iLevel;
                   4468:   int idx;
                   4469:   sqlite_int64 iStartBlockid;     /* needed to create the root info */
                   4470:   sqlite_int64 iEndBlockid;       /* when we're done writing. */
                   4471: 
                   4472:   DataBuffer term;                /* previous encoded term */
                   4473:   DataBuffer data;                /* encoding buffer */
                   4474: 
                   4475:   /* bytes of first term in the current node which distinguishes that
                   4476:   ** term from the last term of the previous node.
                   4477:   */
                   4478:   int nTermDistinct;
                   4479: 
                   4480:   InteriorWriter parentWriter;    /* if we overflow */
                   4481:   int has_parent;
                   4482: } LeafWriter;
                   4483: 
                   4484: static void leafWriterInit(int iLevel, int idx, LeafWriter *pWriter){
                   4485:   CLEAR(pWriter);
                   4486:   pWriter->iLevel = iLevel;
                   4487:   pWriter->idx = idx;
                   4488: 
                   4489:   dataBufferInit(&pWriter->term, 32);
                   4490: 
                   4491:   /* Start out with a reasonably sized block, though it can grow. */
                   4492:   dataBufferInit(&pWriter->data, LEAF_MAX);
                   4493: }
                   4494: 
                   4495: #ifndef NDEBUG
                   4496: /* Verify that the data is readable as a leaf node. */
/* Walk pData[nData] asserting that it decodes as a well-formed leaf
** node: varint(0) marker, leading term+doclist, then zero or more
** delta-encoded term+doclist pairs.  Debug builds only.
*/
static void leafNodeValidate(const char *pData, int nData){
  int n, iDummy;

  /* An empty leaf is trivially valid. */
  if( nData==0 ) return;
  assert( nData>0 );
  assert( pData!=0 );
  assert( pData+nData>pData );

  /* Must lead with a varint(0) */
  n = getVarint32(pData, &iDummy);
  assert( iDummy==0 );
  assert( n>0 );
  assert( n<nData );
  pData += n;
  nData -= n;

  /* Leading term length and data must fit in buffer. */
  n = getVarint32(pData, &iDummy);
  assert( n>0 );
  assert( iDummy>0 );
  assert( n+iDummy>0 );
  assert( n+iDummy<nData );
  pData += n+iDummy;
  nData -= n+iDummy;

  /* Leading term's doclist length and data must fit. */
  n = getVarint32(pData, &iDummy);
  assert( n>0 );
  assert( iDummy>0 );
  assert( n+iDummy>0 );
  assert( n+iDummy<=nData );
  ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
  pData += n+iDummy;
  nData -= n+iDummy;

  /* Verify that trailing terms and doclists also are readable. */
  while( nData!=0 ){
    /* Shared-prefix length for the delta-encoded term. */
    n = getVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>=0 );
    assert( n<nData );
    pData += n;
    nData -= n;
    /* Distinct-suffix length and bytes. */
    n = getVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>0 );
    assert( n+iDummy>0 );
    assert( n+iDummy<nData );
    pData += n+iDummy;
    nData -= n+iDummy;

    /* This term's doclist length and data. */
    n = getVarint32(pData, &iDummy);
    assert( n>0 );
    assert( iDummy>0 );
    assert( n+iDummy>0 );
    assert( n+iDummy<=nData );
    ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
    pData += n+iDummy;
    nData -= n+iDummy;
  }
}
                   4558: #define ASSERT_VALID_LEAF_NODE(p, n) leafNodeValidate(p, n)
                   4559: #else
                   4560: #define ASSERT_VALID_LEAF_NODE(p, n) assert( 1 )
                   4561: #endif
                   4562: 
/* Flush the current leaf node to %_segments, adding the resulting
** blockid and the starting term to the interior node which will
** contain it.
**
** iData/nData select the byte range of pWriter->data to flush, which
** lets leafWriterStepMerge() flush a leading portion of the buffer
** while keeping the tail for the next node.  Returns SQLITE_OK or an
** error code from block_insert().
*/
static int leafWriterInternalFlush(fulltext_vtab *v, LeafWriter *pWriter,
                                   int iData, int nData){
  sqlite_int64 iBlockid = 0;
  const char *pStartingTerm;
  int nStartingTerm, rc, n;

  /* Must have the leading varint(0) flag, plus at least some
  ** valid-looking data.
  */
  assert( nData>2 );
  assert( iData>=0 );
  assert( iData+nData<=pWriter->data.nData );
  ASSERT_VALID_LEAF_NODE(pWriter->data.pData+iData, nData);

  rc = block_insert(v, pWriter->data.pData+iData, nData, &iBlockid);
  if( rc!=SQLITE_OK ) return rc;
  assert( iBlockid!=0 );

  /* Reconstruct the first term in the leaf for purposes of building
  ** the interior node.  The leaf layout is varint(0) varint(nTerm)
  ** term..., so the term length sits at offset iData+1.
  */
  n = getVarint32(pWriter->data.pData+iData+1, &nStartingTerm);
  pStartingTerm = pWriter->data.pData+iData+1+n;
  assert( pWriter->data.nData>iData+1+n+nStartingTerm );
  assert( pWriter->nTermDistinct>0 );
  assert( pWriter->nTermDistinct<=nStartingTerm );
  /* Only the prefix that distinguishes this term from the previous
  ** node's last term is needed in the interior node.
  */
  nStartingTerm = pWriter->nTermDistinct;

  if( pWriter->has_parent ){
    interiorWriterAppend(&pWriter->parentWriter,
                         pStartingTerm, nStartingTerm, iBlockid);
  }else{
    /* First leaf flushed for this segment: create the interior layer. */
    interiorWriterInit(1, pStartingTerm, nStartingTerm, iBlockid,
                       &pWriter->parentWriter);
    pWriter->has_parent = 1;
  }

  /* Track the span of this segment's leaf nodes.  NOTE(review): the
  ** assert relies on %_segments handing out consecutive rowids for
  ** successive inserts — the same rowid-stability assumption flagged
  ** at the top of this file.
  */
  if( pWriter->iEndBlockid==0 ){
    pWriter->iEndBlockid = pWriter->iStartBlockid = iBlockid;
  }else{
    pWriter->iEndBlockid++;
    assert( iBlockid==pWriter->iEndBlockid );
  }

  return SQLITE_OK;
}
                   4614: static int leafWriterFlush(fulltext_vtab *v, LeafWriter *pWriter){
                   4615:   int rc = leafWriterInternalFlush(v, pWriter, 0, pWriter->data.nData);
                   4616:   if( rc!=SQLITE_OK ) return rc;
                   4617: 
                   4618:   /* Re-initialize the output buffer. */
                   4619:   dataBufferReset(&pWriter->data);
                   4620: 
                   4621:   return SQLITE_OK;
                   4622: }
                   4623: 
                   4624: /* Fetch the root info for the segment.  If the entire leaf fits
                   4625: ** within ROOT_MAX, then it will be returned directly, otherwise it
                   4626: ** will be flushed and the root info will be returned from the
                   4627: ** interior node.  *piEndBlockid is set to the blockid of the last
                   4628: ** interior or leaf node written to disk (0 if none are written at
                   4629: ** all).
                   4630: */
                   4631: static int leafWriterRootInfo(fulltext_vtab *v, LeafWriter *pWriter,
                   4632:                               char **ppRootInfo, int *pnRootInfo,
                   4633:                               sqlite_int64 *piEndBlockid){
                   4634:   /* we can fit the segment entirely inline */
                   4635:   if( !pWriter->has_parent && pWriter->data.nData<ROOT_MAX ){
                   4636:     *ppRootInfo = pWriter->data.pData;
                   4637:     *pnRootInfo = pWriter->data.nData;
                   4638:     *piEndBlockid = 0;
                   4639:     return SQLITE_OK;
                   4640:   }
                   4641: 
                   4642:   /* Flush remaining leaf data. */
                   4643:   if( pWriter->data.nData>0 ){
                   4644:     int rc = leafWriterFlush(v, pWriter);
                   4645:     if( rc!=SQLITE_OK ) return rc;
                   4646:   }
                   4647: 
                   4648:   /* We must have flushed a leaf at some point. */
                   4649:   assert( pWriter->has_parent );
                   4650: 
                   4651:   /* Tenatively set the end leaf blockid as the end blockid.  If the
                   4652:   ** interior node can be returned inline, this will be the final
                   4653:   ** blockid, otherwise it will be overwritten by
                   4654:   ** interiorWriterRootInfo().
                   4655:   */
                   4656:   *piEndBlockid = pWriter->iEndBlockid;
                   4657: 
                   4658:   return interiorWriterRootInfo(v, &pWriter->parentWriter,
                   4659:                                 ppRootInfo, pnRootInfo, piEndBlockid);
                   4660: }
                   4661: 
                   4662: /* Collect the rootInfo data and store it into the segment directory.
                   4663: ** This has the effect of flushing the segment's leaf data to
                   4664: ** %_segments, and also flushing any interior nodes to %_segments.
                   4665: */
                   4666: static int leafWriterFinalize(fulltext_vtab *v, LeafWriter *pWriter){
                   4667:   sqlite_int64 iEndBlockid;
                   4668:   char *pRootInfo;
                   4669:   int rc, nRootInfo;
                   4670: 
                   4671:   rc = leafWriterRootInfo(v, pWriter, &pRootInfo, &nRootInfo, &iEndBlockid);
                   4672:   if( rc!=SQLITE_OK ) return rc;
                   4673: 
                   4674:   /* Don't bother storing an entirely empty segment. */
                   4675:   if( iEndBlockid==0 && nRootInfo==0 ) return SQLITE_OK;
                   4676: 
                   4677:   return segdir_set(v, pWriter->iLevel, pWriter->idx,
                   4678:                     pWriter->iStartBlockid, pWriter->iEndBlockid,
                   4679:                     iEndBlockid, pRootInfo, nRootInfo);
                   4680: }
                   4681: 
                   4682: static void leafWriterDestroy(LeafWriter *pWriter){
                   4683:   if( pWriter->has_parent ) interiorWriterDestroy(&pWriter->parentWriter);
                   4684:   dataBufferDestroy(&pWriter->term);
                   4685:   dataBufferDestroy(&pWriter->data);
                   4686: }
                   4687: 
                   4688: /* Encode a term into the leafWriter, delta-encoding as appropriate.
                   4689: ** Returns the length of the new term which distinguishes it from the
                   4690: ** previous term, which can be used to set nTermDistinct when a node
                   4691: ** boundary is crossed.
                   4692: */
                   4693: static int leafWriterEncodeTerm(LeafWriter *pWriter,
                   4694:                                 const char *pTerm, int nTerm){
                   4695:   char c[VARINT_MAX+VARINT_MAX];
                   4696:   int n, nPrefix = 0;
                   4697: 
                   4698:   assert( nTerm>0 );
                   4699:   while( nPrefix<pWriter->term.nData &&
                   4700:          pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
                   4701:     nPrefix++;
                   4702:     /* Failing this implies that the terms weren't in order. */
                   4703:     assert( nPrefix<nTerm );
                   4704:   }
                   4705: 
                   4706:   if( pWriter->data.nData==0 ){
                   4707:     /* Encode the node header and leading term as:
                   4708:     **  varint(0)
                   4709:     **  varint(nTerm)
                   4710:     **  char pTerm[nTerm]
                   4711:     */
                   4712:     n = putVarint(c, '\0');
                   4713:     n += putVarint(c+n, nTerm);
                   4714:     dataBufferAppend2(&pWriter->data, c, n, pTerm, nTerm);
                   4715:   }else{
                   4716:     /* Delta-encode the term as:
                   4717:     **  varint(nPrefix)
                   4718:     **  varint(nSuffix)
                   4719:     **  char pTermSuffix[nSuffix]
                   4720:     */
                   4721:     n = putVarint(c, nPrefix);
                   4722:     n += putVarint(c+n, nTerm-nPrefix);
                   4723:     dataBufferAppend2(&pWriter->data, c, n, pTerm+nPrefix, nTerm-nPrefix);
                   4724:   }
                   4725:   dataBufferReplace(&pWriter->term, pTerm, nTerm);
                   4726: 
                   4727:   return nPrefix+1;
                   4728: }
                   4729: 
                   4730: /* Used to avoid a memmove when a large amount of doclist data is in
                   4731: ** the buffer.  This constructs a node and term header before
                   4732: ** iDoclistData and flushes the resulting complete node using
                   4733: ** leafWriterInternalFlush().
                   4734: */
                   4735: static int leafWriterInlineFlush(fulltext_vtab *v, LeafWriter *pWriter,
                   4736:                                  const char *pTerm, int nTerm,
                   4737:                                  int iDoclistData){
                   4738:   char c[VARINT_MAX+VARINT_MAX];
                   4739:   int iData, n = putVarint(c, 0);
                   4740:   n += putVarint(c+n, nTerm);
                   4741: 
                   4742:   /* There should always be room for the header.  Even if pTerm shared
                   4743:   ** a substantial prefix with the previous term, the entire prefix
                   4744:   ** could be constructed from earlier data in the doclist, so there
                   4745:   ** should be room.
                   4746:   */
                   4747:   assert( iDoclistData>=n+nTerm );
                   4748: 
                   4749:   iData = iDoclistData-(n+nTerm);
                   4750:   memcpy(pWriter->data.pData+iData, c, n);
                   4751:   memcpy(pWriter->data.pData+iData+n, pTerm, nTerm);
                   4752: 
                   4753:   return leafWriterInternalFlush(v, pWriter, iData, pWriter->data.nData-iData);
                   4754: }
                   4755: 
/* Push pTerm[nTerm] along with the doclist data to the leaf layer of
** %_segments.  The doclist is produced by merging the nReaders
** doclists in pReaders[].  Flushes completed leaf nodes as the
** buffer fills, keeping pWriter->data a valid partial leaf node
** throughout.
*/
static int leafWriterStepMerge(fulltext_vtab *v, LeafWriter *pWriter,
                               const char *pTerm, int nTerm,
                               DLReader *pReaders, int nReaders){
  char c[VARINT_MAX+VARINT_MAX];
  int iTermData = pWriter->data.nData, iDoclistData;
  int i, nData, n, nActualData, nActual, rc, nTermDistinct;

  ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);
  nTermDistinct = leafWriterEncodeTerm(pWriter, pTerm, nTerm);

  /* Remember nTermDistinct if opening a new node. */
  if( iTermData==0 ) pWriter->nTermDistinct = nTermDistinct;

  iDoclistData = pWriter->data.nData;

  /* Estimate the length of the merged doclist so we can leave space
  ** to encode it.  The estimate is an upper bound (sum of the inputs'
  ** sizes); the encoded length is corrected after the merge.
  */
  for(i=0, nData=0; i<nReaders; i++){
    nData += dlrAllDataBytes(&pReaders[i]);
  }
  n = putVarint(c, nData);
  dataBufferAppend(&pWriter->data, c, n);

  /* Merge the doclists directly into the node buffer, after the
  ** (over-estimated) length varint.
  */
  docListMerge(&pWriter->data, pReaders, nReaders);
  ASSERT_VALID_DOCLIST(DL_DEFAULT,
                       pWriter->data.pData+iDoclistData+n,
                       pWriter->data.nData-iDoclistData-n, NULL);

  /* The actual amount of doclist data at this point could be smaller
  ** than the length we encoded.  Additionally, the space required to
  ** encode this length could be smaller.  For small doclists, this is
  ** not a big deal, we can just use memmove() to adjust things.
  */
  nActualData = pWriter->data.nData-(iDoclistData+n);
  nActual = putVarint(c, nActualData);
  assert( nActualData<=nData );
  assert( nActual<=n );

  /* If the new doclist is big enough to force a standalone leaf
  ** node, we can immediately flush it inline without doing the
  ** memmove().
  */
  /* TODO(shess) This test matches leafWriterStep(), which does this
  ** test before it knows the cost to varint-encode the term and
  ** doclist lengths.  At some point, change to
  ** pWriter->data.nData-iTermData>STANDALONE_MIN.
  */
  if( nTerm+nActualData>STANDALONE_MIN ){
    /* Push leaf node from before this term. */
    if( iTermData>0 ){
      rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
      if( rc!=SQLITE_OK ) return rc;

      pWriter->nTermDistinct = nTermDistinct;
    }

    /* Fix the encoded doclist length.  Sliding the start forward by
    ** n-nActual leaves exactly nActual bytes for the shorter varint.
    */
    iDoclistData += n - nActual;
    memcpy(pWriter->data.pData+iDoclistData, c, nActual);

    /* Push the standalone leaf node. */
    rc = leafWriterInlineFlush(v, pWriter, pTerm, nTerm, iDoclistData);
    if( rc!=SQLITE_OK ) return rc;

    /* Leave the node empty. */
    dataBufferReset(&pWriter->data);

    return rc;
  }

  /* At this point, we know that the doclist was small, so do the
  ** memmove if indicated.
  */
  if( nActual<n ){
    memmove(pWriter->data.pData+iDoclistData+nActual,
            pWriter->data.pData+iDoclistData+n,
            pWriter->data.nData-(iDoclistData+n));
    pWriter->data.nData -= n-nActual;
  }

  /* Replace written length with actual length. */
  memcpy(pWriter->data.pData+iDoclistData, c, nActual);

  /* If the node is too large, break things up. */
  /* TODO(shess) This test matches leafWriterStep(), which does this
  ** test before it knows the cost to varint-encode the term and
  ** doclist lengths.  At some point, change to
  ** pWriter->data.nData>LEAF_MAX.
  */
  if( iTermData+nTerm+nActualData>LEAF_MAX ){
    /* Flush out the leading data as a node */
    rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
    if( rc!=SQLITE_OK ) return rc;

    pWriter->nTermDistinct = nTermDistinct;

    /* Rebuild header using the current term, since the delta-encoded
    ** entry written above is no longer preceded by its prefix.
    */
    n = putVarint(pWriter->data.pData, 0);
    n += putVarint(pWriter->data.pData+n, nTerm);
    memcpy(pWriter->data.pData+n, pTerm, nTerm);
    n += nTerm;

    /* There should always be room, because the previous encoding
    ** included all data necessary to construct the term.
    */
    assert( n<iDoclistData );
    /* So long as STANDALONE_MIN is half or less of LEAF_MAX, the
    ** following memcpy() is safe (as opposed to needing a memmove).
    */
    assert( 2*STANDALONE_MIN<=LEAF_MAX );
    assert( n+pWriter->data.nData-iDoclistData<iDoclistData );
    memcpy(pWriter->data.pData+n,
           pWriter->data.pData+iDoclistData,
           pWriter->data.nData-iDoclistData);
    pWriter->data.nData -= iDoclistData-n;
  }
  ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);

  return SQLITE_OK;
}
                   4880: 
                   4881: /* Push pTerm[nTerm] along with the doclist data to the leaf layer of
                   4882: ** %_segments.
                   4883: */
                   4884: /* TODO(shess) Revise writeZeroSegment() so that doclists are
                   4885: ** constructed directly in pWriter->data.
                   4886: */
                   4887: static int leafWriterStep(fulltext_vtab *v, LeafWriter *pWriter,
                   4888:                           const char *pTerm, int nTerm,
                   4889:                           const char *pData, int nData){
                   4890:   int rc;
                   4891:   DLReader reader;
                   4892: 
                   4893:   dlrInit(&reader, DL_DEFAULT, pData, nData);
                   4894:   rc = leafWriterStepMerge(v, pWriter, pTerm, nTerm, &reader, 1);
                   4895:   dlrDestroy(&reader);
                   4896: 
                   4897:   return rc;
                   4898: }
                   4899: 
                   4900: 
                   4901: /****************************************************************/
                   4902: /* LeafReader is used to iterate over an individual leaf node. */
                   4903: typedef struct LeafReader {
                   4904:   DataBuffer term;          /* copy of current term. */
                   4905: 
                   4906:   const char *pData;        /* data for current term. */
                   4907:   int nData;
                   4908: } LeafReader;
                   4909: 
/* Release pReader's term buffer.  SCRAMBLE() poisons the struct
** afterwards so that accidental reuse is likely caught in debug
** builds.
*/
static void leafReaderDestroy(LeafReader *pReader){
  dataBufferDestroy(&pReader->term);
  SCRAMBLE(pReader);
}
                   4914: 
                   4915: static int leafReaderAtEnd(LeafReader *pReader){
                   4916:   return pReader->nData<=0;
                   4917: }
                   4918: 
/* Access the current term.  Callers must not be at end-of-node; an
** empty term buffer indicates there is no current term.
*/
static int leafReaderTermBytes(LeafReader *pReader){
  return pReader->term.nData;
}
static const char *leafReaderTerm(LeafReader *pReader){
  assert( pReader->term.nData>0 );
  return pReader->term.pData;
}
                   4927: 
                   4928: /* Access the doclist data for the current term. */
                   4929: static int leafReaderDataBytes(LeafReader *pReader){
                   4930:   int nData;
                   4931:   assert( pReader->term.nData>0 );
                   4932:   getVarint32(pReader->pData, &nData);
                   4933:   return nData;
                   4934: }
                   4935: static const char *leafReaderData(LeafReader *pReader){
                   4936:   int n, nData;
                   4937:   assert( pReader->term.nData>0 );
                   4938:   n = getVarint32(pReader->pData, &nData);
                   4939:   return pReader->pData+n;
                   4940: }
                   4941: 
                   4942: static void leafReaderInit(const char *pData, int nData,
                   4943:                            LeafReader *pReader){
                   4944:   int nTerm, n;
                   4945: 
                   4946:   assert( nData>0 );
                   4947:   assert( pData[0]=='\0' );
                   4948: 
                   4949:   CLEAR(pReader);
                   4950: 
                   4951:   /* Read the first term, skipping the header byte. */
                   4952:   n = getVarint32(pData+1, &nTerm);
                   4953:   dataBufferInit(&pReader->term, nTerm);
                   4954:   dataBufferReplace(&pReader->term, pData+1+n, nTerm);
                   4955: 
                   4956:   /* Position after the first term. */
                   4957:   assert( 1+n+nTerm<nData );
                   4958:   pReader->pData = pData+1+n+nTerm;
                   4959:   pReader->nData = nData-1-n-nTerm;
                   4960: }
                   4961: 
/* Step the reader forward to the next term, decoding the
** prefix/suffix delta against the current term.
*/
static void leafReaderStep(LeafReader *pReader){
  int n, nData, nPrefix, nSuffix;
  assert( !leafReaderAtEnd(pReader) );

  /* Skip previous entry's data block (varint length plus payload). */
  n = getVarint32(pReader->pData, &nData);
  assert( n+nData<=pReader->nData );
  pReader->pData += n+nData;
  pReader->nData -= n+nData;

  if( !leafReaderAtEnd(pReader) ){
    /* Construct the new term using a prefix from the old term plus a
    ** suffix from the leaf data.
    */
    n = getVarint32(pReader->pData, &nPrefix);
    n += getVarint32(pReader->pData+n, &nSuffix);
    assert( n+nSuffix<pReader->nData );
    /* Truncate the buffered term to the shared prefix, then append
    ** the suffix bytes in place.
    */
    pReader->term.nData = nPrefix;
    dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix);

    pReader->pData += n+nSuffix;
    pReader->nData -= n+nSuffix;
  }
}
                   4987: 
                   4988: /* strcmp-style comparison of pReader's current term against pTerm.
                   4989: ** If isPrefix, equality means equal through nTerm bytes.
                   4990: */
                   4991: static int leafReaderTermCmp(LeafReader *pReader,
                   4992:                              const char *pTerm, int nTerm, int isPrefix){
                   4993:   int c, n = pReader->term.nData<nTerm ? pReader->term.nData : nTerm;
                   4994:   if( n==0 ){
                   4995:     if( pReader->term.nData>0 ) return -1;
                   4996:     if(nTerm>0 ) return 1;
                   4997:     return 0;
                   4998:   }
                   4999: 
                   5000:   c = memcmp(pReader->term.pData, pTerm, n);
                   5001:   if( c!=0 ) return c;
                   5002:   if( isPrefix && n==nTerm ) return 0;
                   5003:   return pReader->term.nData - nTerm;
                   5004: }
                   5005: 
                   5006: 
                   5007: /****************************************************************/
                   5008: /* LeavesReader wraps LeafReader to allow iterating over the entire
                   5009: ** leaf layer of the tree.
                   5010: */
                   5011: typedef struct LeavesReader {
                   5012:   int idx;                  /* Index within the segment. */
                   5013: 
                   5014:   sqlite3_stmt *pStmt;      /* Statement we're streaming leaves from. */
                   5015:   int eof;                  /* we've seen SQLITE_DONE from pStmt. */
                   5016: 
                   5017:   LeafReader leafReader;    /* reader for the current leaf. */
                   5018:   DataBuffer rootData;      /* root data for inline. */
                   5019: } LeavesReader;
                   5020: 
/* Access the current term.  Only valid while not at end-of-segment;
** delegates to the embedded LeafReader.
*/
static int leavesReaderTermBytes(LeavesReader *pReader){
  assert( !pReader->eof );
  return leafReaderTermBytes(&pReader->leafReader);
}
static const char *leavesReaderTerm(LeavesReader *pReader){
  assert( !pReader->eof );
  return leafReaderTerm(&pReader->leafReader);
}
                   5030: 
/* Access the doclist data for the current term.  Only valid while not
** at end-of-segment; delegates to the embedded LeafReader.
*/
static int leavesReaderDataBytes(LeavesReader *pReader){
  assert( !pReader->eof );
  return leafReaderDataBytes(&pReader->leafReader);
}
static const char *leavesReaderData(LeavesReader *pReader){
  assert( !pReader->eof );
  return leafReaderData(&pReader->leafReader);
}
                   5040: 
/* True once the reader has consumed every leaf of the segment. */
static int leavesReaderAtEnd(LeavesReader *pReader){
  return pReader->eof;
}
                   5044: 
/* loadSegmentLeaves() may not read all the way to SQLITE_DONE, thus
** leaving the statement handle open, which locks the table.
*/
/* TODO(shess) This "solution" is not satisfactory.  Really, there
** should be a check-in function for all statement handles which
** arranges to call sqlite3_reset().  This most likely will require
** modification to control flow all over the place, though, so for now
** just punt.
**
** Note that the current system assumes that segment merges will run
** to completion, which is why this particular problem hasn't arisen
** in this case.  Probably a brittle assumption.
*/
static int leavesReaderReset(LeavesReader *pReader){
  return sqlite3_reset(pReader->pStmt);
}
                   5061: 
                   5062: static void leavesReaderDestroy(LeavesReader *pReader){
                   5063:   /* If idx is -1, that means we're using a non-cached statement
                   5064:   ** handle in the optimize() case, so we need to release it.
                   5065:   */
                   5066:   if( pReader->pStmt!=NULL && pReader->idx==-1 ){
                   5067:     sqlite3_finalize(pReader->pStmt);
                   5068:   }
                   5069:   leafReaderDestroy(&pReader->leafReader);
                   5070:   dataBufferDestroy(&pReader->rootData);
                   5071:   SCRAMBLE(pReader);
                   5072: }
                   5073: 
/* Initialize pReader with the given root data (if iStartBlockid==0
** the leaf data was entirely contained in the root), or from the
** stream of blocks between iStartBlockid and iEndBlockid, inclusive.
**
** Returns SQLITE_OK on success, else an SQLite error code.  If the
** block range turns out to be empty, pReader->eof is set and
** SQLITE_OK is returned.
**
** NOTE(review): the error returns after sql_get_leaf_statement()
** leave the statement un-reset; presumably the statement-caching
** layer resets handles on reacquisition -- confirm against
** sql_get_leaf_statement() before relying on it.
*/
static int leavesReaderInit(fulltext_vtab *v,
                            int idx,
                            sqlite_int64 iStartBlockid,
                            sqlite_int64 iEndBlockid,
                            const char *pRootData, int nRootData,
                            LeavesReader *pReader){
  CLEAR(pReader);
  pReader->idx = idx;

  dataBufferInit(&pReader->rootData, 0);
  if( iStartBlockid==0 ){
    /* Entire leaf level fit in root data. */
    dataBufferReplace(&pReader->rootData, pRootData, nRootData);
    leafReaderInit(pReader->rootData.pData, pReader->rootData.nData,
                   &pReader->leafReader);
  }else{
    sqlite3_stmt *s;
    int rc = sql_get_leaf_statement(v, idx, &s);
    if( rc!=SQLITE_OK ) return rc;

    /* Bind the inclusive blockid range [iStartBlockid, iEndBlockid]. */
    rc = sqlite3_bind_int64(s, 1, iStartBlockid);
    if( rc!=SQLITE_OK ) return rc;

    rc = sqlite3_bind_int64(s, 2, iEndBlockid);
    if( rc!=SQLITE_OK ) return rc;

    rc = sqlite3_step(s);
    if( rc==SQLITE_DONE ){
      /* No blocks in the range: the reader starts out at eof. */
      pReader->eof = 1;
      return SQLITE_OK;
    }
    if( rc!=SQLITE_ROW ) return rc;

    /* Keep the statement open, positioned on the first block; it is
    ** advanced by leavesReaderStep() and released via
    ** leavesReaderReset()/leavesReaderDestroy().
    */
    pReader->pStmt = s;
    leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0),
                   sqlite3_column_bytes(pReader->pStmt, 0),
                   &pReader->leafReader);
  }
  return SQLITE_OK;
}
                   5118: 
/* Step the current leaf forward to the next term.  If we reach the
** end of the current leaf, step forward to the next leaf block.
** Sets pReader->eof when the data is exhausted; returns SQLITE_OK or
** the error code from stepping the block statement.
*/
static int leavesReaderStep(fulltext_vtab *v, LeavesReader *pReader){
  assert( !leavesReaderAtEnd(pReader) );
  leafReaderStep(&pReader->leafReader);

  if( leafReaderAtEnd(&pReader->leafReader) ){
    int rc;
    if( pReader->rootData.pData ){
      /* Root-data readers have only the single (root) leaf. */
      pReader->eof = 1;
      return SQLITE_OK;
    }
    rc = sqlite3_step(pReader->pStmt);
    if( rc!=SQLITE_ROW ){
      /* DONE means a clean end of the block stream; anything else is
      ** a real error, but the reader is at eof either way.
      */
      pReader->eof = 1;
      return rc==SQLITE_DONE ? SQLITE_OK : rc;
    }
    /* Re-point the leaf reader at the next block's image. */
    leafReaderDestroy(&pReader->leafReader);
    leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0),
                   sqlite3_column_bytes(pReader->pStmt, 0),
                   &pReader->leafReader);
  }
  return SQLITE_OK;
}
                   5144: 
                   5145: /* Order LeavesReaders by their term, ignoring idx.  Readers at eof
                   5146: ** always sort to the end.
                   5147: */
                   5148: static int leavesReaderTermCmp(LeavesReader *lr1, LeavesReader *lr2){
                   5149:   if( leavesReaderAtEnd(lr1) ){
                   5150:     if( leavesReaderAtEnd(lr2) ) return 0;
                   5151:     return 1;
                   5152:   }
                   5153:   if( leavesReaderAtEnd(lr2) ) return -1;
                   5154: 
                   5155:   return leafReaderTermCmp(&lr1->leafReader,
                   5156:                            leavesReaderTerm(lr2), leavesReaderTermBytes(lr2),
                   5157:                            0);
                   5158: }
                   5159: 
                   5160: /* Similar to leavesReaderTermCmp(), with additional ordering by idx
                   5161: ** so that older segments sort before newer segments.
                   5162: */
                   5163: static int leavesReaderCmp(LeavesReader *lr1, LeavesReader *lr2){
                   5164:   int c = leavesReaderTermCmp(lr1, lr2);
                   5165:   if( c!=0 ) return c;
                   5166:   return lr1->idx-lr2->idx;
                   5167: }
                   5168: 
                   5169: /* Assume that pLr[1]..pLr[nLr] are sorted.  Bubble pLr[0] into its
                   5170: ** sorted position.
                   5171: */
                   5172: static void leavesReaderReorder(LeavesReader *pLr, int nLr){
                   5173:   while( nLr>1 && leavesReaderCmp(pLr, pLr+1)>0 ){
                   5174:     LeavesReader tmp = pLr[0];
                   5175:     pLr[0] = pLr[1];
                   5176:     pLr[1] = tmp;
                   5177:     nLr--;
                   5178:     pLr++;
                   5179:   }
                   5180: }
                   5181: 
/* Initializes pReaders with the segments from level iLevel, returning
** the number of segments in *piReaders.  Leaves pReaders in sorted
** order (by term, then by segment age/idx).  On error, any readers
** already initialized are destroyed and the error code is returned.
*/
static int leavesReadersInit(fulltext_vtab *v, int iLevel,
                             LeavesReader *pReaders, int *piReaders){
  sqlite3_stmt *s;
  int i, rc = sql_get_statement(v, SEGDIR_SELECT_LEVEL_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int(s, 1, iLevel);
  if( rc!=SQLITE_OK ) return rc;

  i = 0;
  while( (rc = sqlite3_step(s))==SQLITE_ROW ){
    /* Each segdir row carries the segment's leaf block range plus the
    ** image of its root node.
    */
    sqlite_int64 iStart = sqlite3_column_int64(s, 0);
    sqlite_int64 iEnd = sqlite3_column_int64(s, 1);
    const char *pRootData = sqlite3_column_blob(s, 2);
    int nRootData = sqlite3_column_bytes(s, 2);

    /* Caller provides at most MERGE_COUNT reader slots. */
    assert( i<MERGE_COUNT );
    rc = leavesReaderInit(v, i, iStart, iEnd, pRootData, nRootData,
                          &pReaders[i]);
    if( rc!=SQLITE_OK ) break;

    i++;
  }
  if( rc!=SQLITE_DONE ){
    /* Unwind the readers initialized before the failure. */
    while( i-->0 ){
      leavesReaderDestroy(&pReaders[i]);
    }
    return rc;
  }

  *piReaders = i;

  /* Leave our results sorted by term, then age.  This is an
  ** insertion sort from the tail: leavesReaderReorder() bubbles
  ** element i into the already-sorted suffix.
  */
  while( i-- ){
    leavesReaderReorder(pReaders+i, *piReaders-i);
  }
  return SQLITE_OK;
}
                   5224: 
                   5225: /* Merge doclists from pReaders[nReaders] into a single doclist, which
                   5226: ** is written to pWriter.  Assumes pReaders is ordered oldest to
                   5227: ** newest.
                   5228: */
                   5229: /* TODO(shess) Consider putting this inline in segmentMerge(). */
                   5230: static int leavesReadersMerge(fulltext_vtab *v,
                   5231:                               LeavesReader *pReaders, int nReaders,
                   5232:                               LeafWriter *pWriter){
                   5233:   DLReader dlReaders[MERGE_COUNT];
                   5234:   const char *pTerm = leavesReaderTerm(pReaders);
                   5235:   int i, nTerm = leavesReaderTermBytes(pReaders);
                   5236: 
                   5237:   assert( nReaders<=MERGE_COUNT );
                   5238: 
                   5239:   for(i=0; i<nReaders; i++){
                   5240:     dlrInit(&dlReaders[i], DL_DEFAULT,
                   5241:             leavesReaderData(pReaders+i),
                   5242:             leavesReaderDataBytes(pReaders+i));
                   5243:   }
                   5244: 
                   5245:   return leafWriterStepMerge(v, pWriter, pTerm, nTerm, dlReaders, nReaders);
                   5246: }
                   5247: 
/* Forward ref due to mutual recursion with segdirNextIndex():
** segdirNextIndex() may trigger a merge to free an index slot, and
** segmentMerge() calls segdirNextIndex() for the next level up.
*/
static int segmentMerge(fulltext_vtab *v, int iLevel);
                   5250: 
                   5251: /* Put the next available index at iLevel into *pidx.  If iLevel
                   5252: ** already has MERGE_COUNT segments, they are merged to a higher
                   5253: ** level to make room.
                   5254: */
                   5255: static int segdirNextIndex(fulltext_vtab *v, int iLevel, int *pidx){
                   5256:   int rc = segdir_max_index(v, iLevel, pidx);
                   5257:   if( rc==SQLITE_DONE ){              /* No segments at iLevel. */
                   5258:     *pidx = 0;
                   5259:   }else if( rc==SQLITE_ROW ){
                   5260:     if( *pidx==(MERGE_COUNT-1) ){
                   5261:       rc = segmentMerge(v, iLevel);
                   5262:       if( rc!=SQLITE_OK ) return rc;
                   5263:       *pidx = 0;
                   5264:     }else{
                   5265:       (*pidx)++;
                   5266:     }
                   5267:   }else{
                   5268:     return rc;
                   5269:   }
                   5270:   return SQLITE_OK;
                   5271: }
                   5272: 
/* Merge MERGE_COUNT segments at iLevel into a new segment at
** iLevel+1.  If iLevel+1 is already full of segments, those will be
** merged to make room.  Returns SQLITE_OK on success, else an SQLite
** error code; on error the readers and writer are destroyed before
** returning.
*/
static int segmentMerge(fulltext_vtab *v, int iLevel){
  LeafWriter writer;
  LeavesReader lrs[MERGE_COUNT];
  int i, rc, idx = 0;

  /* Determine the next available segment index at the next level,
  ** merging as necessary.
  */
  rc = segdirNextIndex(v, iLevel+1, &idx);
  if( rc!=SQLITE_OK ) return rc;

  /* TODO(shess) This assumes that we'll always see exactly
  ** MERGE_COUNT segments to merge at a given level.  That will be
  ** broken if we allow the developer to request preemptive or
  ** deferred merging.
  */
  memset(&lrs, '\0', sizeof(lrs));
  rc = leavesReadersInit(v, iLevel, lrs, &i);
  if( rc!=SQLITE_OK ) return rc;
  assert( i==MERGE_COUNT );

  leafWriterInit(iLevel+1, idx, &writer);

  /* Since leavesReaderReorder() pushes readers at eof to the end,
  ** when the first reader is empty, all will be empty.
  */
  while( !leavesReaderAtEnd(lrs) ){
    /* Figure out how many readers share their next term. */
    for(i=1; i<MERGE_COUNT && !leavesReaderAtEnd(lrs+i); i++){
      if( 0!=leavesReaderTermCmp(lrs, lrs+i) ) break;
    }

    /* Merge the doclists of the i readers that share the term.  They
    ** are ordered oldest first, as leavesReadersMerge() expects.
    */
    rc = leavesReadersMerge(v, lrs, i, &writer);
    if( rc!=SQLITE_OK ) goto err;

    /* Step forward those that were merged. */
    while( i-->0 ){
      rc = leavesReaderStep(v, lrs+i);
      if( rc!=SQLITE_OK ) goto err;

      /* Reorder by term, then by age. */
      leavesReaderReorder(lrs+i, MERGE_COUNT-i);
    }
  }

  for(i=0; i<MERGE_COUNT; i++){
    leavesReaderDestroy(&lrs[i]);
  }

  /* Flush the new output segment before dropping the inputs. */
  rc = leafWriterFinalize(v, &writer);
  leafWriterDestroy(&writer);
  if( rc!=SQLITE_OK ) return rc;

  /* Delete the merged segment data. */
  return segdir_delete(v, iLevel);

 err:
  for(i=0; i<MERGE_COUNT; i++){
    leavesReaderDestroy(&lrs[i]);
  }
  leafWriterDestroy(&writer);
  return rc;
}
                   5340: 
                   5341: /* Accumulate the union of *acc and *pData into *acc. */
                   5342: static void docListAccumulateUnion(DataBuffer *acc,
                   5343:                                    const char *pData, int nData) {
                   5344:   DataBuffer tmp = *acc;
                   5345:   dataBufferInit(acc, tmp.nData+nData);
                   5346:   docListUnion(tmp.pData, tmp.nData, pData, nData, acc);
                   5347:   dataBufferDestroy(&tmp);
                   5348: }
                   5349: 
                   5350: /* TODO(shess) It might be interesting to explore different merge
                   5351: ** strategies, here.  For instance, since this is a sorted merge, we
                   5352: ** could easily merge many doclists in parallel.  With some
                   5353: ** comprehension of the storage format, we could merge all of the
                   5354: ** doclists within a leaf node directly from the leaf node's storage.
                   5355: ** It may be worthwhile to merge smaller doclists before larger
                   5356: ** doclists, since they can be traversed more quickly - but the
                   5357: ** results may have less overlap, making them more expensive in a
                   5358: ** different way.
                   5359: */
                   5360: 
/* Scan pReader for pTerm/nTerm, and merge the term's doclist over
** *out (any doclists with duplicate docids overwrite those in *out).
** Internal function for loadSegmentLeaf().  Returns SQLITE_OK on
** success, SQLITE_NOMEM on allocation failure, or the error from
** stepping pReader.
*/
static int loadSegmentLeavesInt(fulltext_vtab *v, LeavesReader *pReader,
                                const char *pTerm, int nTerm, int isPrefix,
                                DataBuffer *out){
  /* doclist data is accumulated into pBuffers similar to how one does
  ** increment in binary arithmetic.  If index 0 is empty, the data is
  ** stored there.  If there is data there, it is merged and the
  ** results carried into position 1, with further merge-and-carry
  ** until an empty position is found.
  */
  DataBuffer *pBuffers = NULL;
  int nBuffers = 0, nMaxBuffers = 0, rc;

  assert( nTerm>0 );

  for(rc=SQLITE_OK; rc==SQLITE_OK && !leavesReaderAtEnd(pReader);
      rc=leavesReaderStep(v, pReader)){
    /* TODO(shess) Really want leavesReaderTermCmp(), but that name is
    ** already taken to compare the terms of two LeavesReaders.  Think
    ** on a better name.  [Meanwhile, break encapsulation rather than
    ** use a confusing name.]
    */
    int c = leafReaderTermCmp(&pReader->leafReader, pTerm, nTerm, isPrefix);
    if( c>0 ) break;      /* Past any possible matches. */
    if( c==0 ){
      const char *pData = leavesReaderData(pReader);
      int iBuffer, nData = leavesReaderDataBytes(pReader);

      /* Find the first empty buffer. */
      for(iBuffer=0; iBuffer<nBuffers; ++iBuffer){
        if( 0==pBuffers[iBuffer].nData ) break;
      }

      /* Out of buffers, add an empty one. */
      if( iBuffer==nBuffers ){
        if( nBuffers==nMaxBuffers ){
          DataBuffer *p;
          nMaxBuffers += 20;

          /* Manual realloc so we can handle NULL appropriately. */
          p = sqlite3_malloc(nMaxBuffers*sizeof(*pBuffers));
          if( p==NULL ){
            rc = SQLITE_NOMEM;
            break;
          }

          if( nBuffers>0 ){
            assert(pBuffers!=NULL);
            memcpy(p, pBuffers, nBuffers*sizeof(*pBuffers));
            sqlite3_free(pBuffers);
          }
          pBuffers = p;
        }
        dataBufferInit(&(pBuffers[nBuffers]), 0);
        nBuffers++;
      }

      /* At this point, must have an empty at iBuffer. */
      assert(iBuffer<nBuffers && pBuffers[iBuffer].nData==0);

      /* If empty was first buffer, no need for merge logic. */
      if( iBuffer==0 ){
        dataBufferReplace(&(pBuffers[0]), pData, nData);
      }else{
        /* pAcc is the empty buffer the merged data will end up in. */
        DataBuffer *pAcc = &(pBuffers[iBuffer]);
        DataBuffer *p = &(pBuffers[0]);

        /* Handle position 0 specially to avoid need to prime pAcc
        ** with pData/nData.
        */
        dataBufferSwap(p, pAcc);
        docListAccumulateUnion(pAcc, pData, nData);

        /* Accumulate remaining doclists into pAcc.  This is the
        ** "carry" of the binary-increment scheme: every buffer below
        ** iBuffer is merged in and emptied.
        */
        for(++p; p<pAcc; ++p){
          docListAccumulateUnion(pAcc, p->pData, p->nData);

          /* dataBufferReset() could allow a large doclist to blow up
          ** our memory requirements.
          */
          if( p->nCapacity<1024 ){
            dataBufferReset(p);
          }else{
            dataBufferDestroy(p);
            dataBufferInit(p, 0);
          }
        }
      }
    }
  }

  /* Union all the doclists together into *out. */
  /* TODO(shess) What if *out is big?  Sigh. */
  if( rc==SQLITE_OK && nBuffers>0 ){
    int iBuffer;
    for(iBuffer=0; iBuffer<nBuffers; ++iBuffer){
      if( pBuffers[iBuffer].nData>0 ){
        if( out->nData==0 ){
          /* First non-empty buffer can simply be adopted. */
          dataBufferSwap(out, &(pBuffers[iBuffer]));
        }else{
          docListAccumulateUnion(out, pBuffers[iBuffer].pData,
                                 pBuffers[iBuffer].nData);
        }
      }
    }
  }

  /* Release the accumulation buffers on both success and error. */
  while( nBuffers-- ){
    dataBufferDestroy(&(pBuffers[nBuffers]));
  }
  if( pBuffers!=NULL ) sqlite3_free(pBuffers);

  return rc;
}
                   5479: 
                   5480: /* Call loadSegmentLeavesInt() with pData/nData as input. */
                   5481: static int loadSegmentLeaf(fulltext_vtab *v, const char *pData, int nData,
                   5482:                            const char *pTerm, int nTerm, int isPrefix,
                   5483:                            DataBuffer *out){
                   5484:   LeavesReader reader;
                   5485:   int rc;
                   5486: 
                   5487:   assert( nData>1 );
                   5488:   assert( *pData=='\0' );
                   5489:   rc = leavesReaderInit(v, 0, 0, 0, pData, nData, &reader);
                   5490:   if( rc!=SQLITE_OK ) return rc;
                   5491: 
                   5492:   rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out);
                   5493:   leavesReaderReset(&reader);
                   5494:   leavesReaderDestroy(&reader);
                   5495:   return rc;
                   5496: }
                   5497: 
                   5498: /* Call loadSegmentLeavesInt() with the leaf nodes from iStartLeaf to
                   5499: ** iEndLeaf (inclusive) as input, and merge the resulting doclist into
                   5500: ** out.
                   5501: */
                   5502: static int loadSegmentLeaves(fulltext_vtab *v,
                   5503:                              sqlite_int64 iStartLeaf, sqlite_int64 iEndLeaf,
                   5504:                              const char *pTerm, int nTerm, int isPrefix,
                   5505:                              DataBuffer *out){
                   5506:   int rc;
                   5507:   LeavesReader reader;
                   5508: 
                   5509:   assert( iStartLeaf<=iEndLeaf );
                   5510:   rc = leavesReaderInit(v, 0, iStartLeaf, iEndLeaf, NULL, 0, &reader);
                   5511:   if( rc!=SQLITE_OK ) return rc;
                   5512: 
                   5513:   rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out);
                   5514:   leavesReaderReset(&reader);
                   5515:   leavesReaderDestroy(&reader);
                   5516:   return rc;
                   5517: }
                   5518: 
                   5519: /* Taking pData/nData as an interior node, find the sequence of child
                   5520: ** nodes which could include pTerm/nTerm/isPrefix.  Note that the
                   5521: ** interior node terms logically come between the blocks, so there is
                   5522: ** one more blockid than there are terms (that block contains terms >=
                   5523: ** the last interior-node term).
                   5524: */
                   5525: /* TODO(shess) The calling code may already know that the end child is
                   5526: ** not worth calculating, because the end may be in a later sibling
                   5527: ** node.  Consider whether breaking symmetry is worthwhile.  I suspect
                   5528: ** it is not worthwhile.
                   5529: */
                   5530: static void getChildrenContaining(const char *pData, int nData,
                   5531:                                   const char *pTerm, int nTerm, int isPrefix,
                   5532:                                   sqlite_int64 *piStartChild,
                   5533:                                   sqlite_int64 *piEndChild){
                   5534:   InteriorReader reader;
                   5535: 
                   5536:   assert( nData>1 );
                   5537:   assert( *pData!='\0' );
                   5538:   interiorReaderInit(pData, nData, &reader);
                   5539: 
                   5540:   /* Scan for the first child which could contain pTerm/nTerm. */
                   5541:   while( !interiorReaderAtEnd(&reader) ){
                   5542:     if( interiorReaderTermCmp(&reader, pTerm, nTerm, 0)>0 ) break;
                   5543:     interiorReaderStep(&reader);
                   5544:   }
                   5545:   *piStartChild = interiorReaderCurrentBlockid(&reader);
                   5546: 
                   5547:   /* Keep scanning to find a term greater than our term, using prefix
                   5548:   ** comparison if indicated.  If isPrefix is false, this will be the
                   5549:   ** same blockid as the starting block.
                   5550:   */
                   5551:   while( !interiorReaderAtEnd(&reader) ){
                   5552:     if( interiorReaderTermCmp(&reader, pTerm, nTerm, isPrefix)>0 ) break;
                   5553:     interiorReaderStep(&reader);
                   5554:   }
                   5555:   *piEndChild = interiorReaderCurrentBlockid(&reader);
                   5556: 
                   5557:   interiorReaderDestroy(&reader);
                   5558: 
                   5559:   /* Children must ascend, and if !prefix, both must be the same. */
                   5560:   assert( *piEndChild>=*piStartChild );
                   5561:   assert( isPrefix || *piStartChild==*piEndChild );
                   5562: }
                   5563: 
/* Read block at iBlockid and pass it with other params to
** getChildrenContaining().
**
** Returns SQLITE_OK on success; SQLITE_ERROR if the block is missing
** or the query unexpectedly yields more than one row; otherwise the
** error code from the statement.
**
** NOTE(review): the non-OK returns leave the cached statement
** un-reset, which may keep a read lock on the table -- presumably
** the statement-caching layer resets on reacquisition (see the
** leavesReaderReset() TODO earlier in this file); confirm before
** relying on it.
*/
static int loadAndGetChildrenContaining(
  fulltext_vtab *v,
  sqlite_int64 iBlockid,
  const char *pTerm, int nTerm, int isPrefix,
  sqlite_int64 *piStartChild, sqlite_int64 *piEndChild
){
  sqlite3_stmt *s = NULL;
  int rc;

  assert( iBlockid!=0 );
  assert( pTerm!=NULL );
  assert( nTerm!=0 );        /* TODO(shess) Why not allow this? */
  assert( piStartChild!=NULL );
  assert( piEndChild!=NULL );

  rc = sql_get_statement(v, BLOCK_SELECT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_bind_int64(s, 1, iBlockid);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  /* A missing block indicates index corruption. */
  if( rc==SQLITE_DONE ) return SQLITE_ERROR;
  if( rc!=SQLITE_ROW ) return rc;

  getChildrenContaining(sqlite3_column_blob(s, 0), sqlite3_column_bytes(s, 0),
                        pTerm, nTerm, isPrefix, piStartChild, piEndChild);

  /* We expect only one row.  We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain
   * locked. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  if( rc!=SQLITE_DONE ) return rc;

  return SQLITE_OK;
}
                   5604: 
/* Traverse the tree represented by pData[nData] looking for
** pTerm[nTerm], placing its doclist into *out.  This is internal to
** loadSegment() to make error-handling cleaner.
**
** iLeavesEnd is the last blockid of the segment's leaf level; any
** blockid above it refers to an interior node, so the descent loop
** runs until the child range falls within the leaf level.
*/
static int loadSegmentInt(fulltext_vtab *v, const char *pData, int nData,
                          sqlite_int64 iLeavesEnd,
                          const char *pTerm, int nTerm, int isPrefix,
                          DataBuffer *out){
  /* Special case where root is a leaf. */
  if( *pData=='\0' ){
    return loadSegmentLeaf(v, pData, nData, pTerm, nTerm, isPrefix, out);
  }else{
    int rc;
    sqlite_int64 iStartChild, iEndChild;

    /* Process pData as an interior node, then loop down the tree
    ** until we find the set of leaf nodes to scan for the term.
    */
    getChildrenContaining(pData, nData, pTerm, nTerm, isPrefix,
                          &iStartChild, &iEndChild);
    while( iStartChild>iLeavesEnd ){
      sqlite_int64 iNextStart, iNextEnd;
      rc = loadAndGetChildrenContaining(v, iStartChild, pTerm, nTerm, isPrefix,
                                        &iNextStart, &iNextEnd);
      if( rc!=SQLITE_OK ) return rc;

      /* If we've branched, follow the end branch, too.  Only its end
      ** child matters; the start comes from the start branch above.
      */
      if( iStartChild!=iEndChild ){
        sqlite_int64 iDummy;
        rc = loadAndGetChildrenContaining(v, iEndChild, pTerm, nTerm, isPrefix,
                                          &iDummy, &iNextEnd);
        if( rc!=SQLITE_OK ) return rc;
      }

      assert( iNextStart<=iNextEnd );
      iStartChild = iNextStart;
      iEndChild = iNextEnd;
    }
    assert( iStartChild<=iLeavesEnd );
    assert( iEndChild<=iLeavesEnd );

    /* Scan through the leaf segments for doclists. */
    return loadSegmentLeaves(v, iStartChild, iEndChild,
                             pTerm, nTerm, isPrefix, out);
  }
}
                   5651: 
                   5652: /* Call loadSegmentInt() to collect the doclist for pTerm/nTerm, then
                   5653: ** merge its doclist over *out (any duplicate doclists read from the
                   5654: ** segment rooted at pData will overwrite those in *out).
                   5655: */
                   5656: /* TODO(shess) Consider changing this to determine the depth of the
                   5657: ** leaves using either the first characters of interior nodes (when
                   5658: ** ==1, we're one level above the leaves), or the first character of
                   5659: ** the root (which will describe the height of the tree directly).
                   5660: ** Either feels somewhat tricky to me.
                   5661: */
                   5662: /* TODO(shess) The current merge is likely to be slow for large
                   5663: ** doclists (though it should process from newest/smallest to
                   5664: ** oldest/largest, so it may not be that bad).  It might be useful to
                   5665: ** modify things to allow for N-way merging.  This could either be
                   5666: ** within a segment, with pairwise merges across segments, or across
                   5667: ** all segments at once.
                   5668: */
                   5669: static int loadSegment(fulltext_vtab *v, const char *pData, int nData,
                   5670:                        sqlite_int64 iLeavesEnd,
                   5671:                        const char *pTerm, int nTerm, int isPrefix,
                   5672:                        DataBuffer *out){
                   5673:   DataBuffer result;
                   5674:   int rc;
                   5675: 
                   5676:   assert( nData>1 );
                   5677: 
                   5678:   /* This code should never be called with buffered updates. */
                   5679:   assert( v->nPendingData<0 );
                   5680: 
                   5681:   dataBufferInit(&result, 0);
                   5682:   rc = loadSegmentInt(v, pData, nData, iLeavesEnd,
                   5683:                       pTerm, nTerm, isPrefix, &result);
                   5684:   if( rc==SQLITE_OK && result.nData>0 ){
                   5685:     if( out->nData==0 ){
                   5686:       DataBuffer tmp = *out;
                   5687:       *out = result;
                   5688:       result = tmp;
                   5689:     }else{
                   5690:       DataBuffer merged;
                   5691:       DLReader readers[2];
                   5692: 
                   5693:       dlrInit(&readers[0], DL_DEFAULT, out->pData, out->nData);
                   5694:       dlrInit(&readers[1], DL_DEFAULT, result.pData, result.nData);
                   5695:       dataBufferInit(&merged, out->nData+result.nData);
                   5696:       docListMerge(&merged, readers, 2);
                   5697:       dataBufferDestroy(out);
                   5698:       *out = merged;
                   5699:       dlrDestroy(&readers[0]);
                   5700:       dlrDestroy(&readers[1]);
                   5701:     }
                   5702:   }
                   5703:   dataBufferDestroy(&result);
                   5704:   return rc;
                   5705: }
                   5706: 
                   5707: /* Scan the database and merge together the posting lists for the term
                   5708: ** into *out.
                   5709: */
                   5710: static int termSelect(fulltext_vtab *v, int iColumn,
                   5711:                       const char *pTerm, int nTerm, int isPrefix,
                   5712:                       DocListType iType, DataBuffer *out){
                   5713:   DataBuffer doclist;
                   5714:   sqlite3_stmt *s;
                   5715:   int rc = sql_get_statement(v, SEGDIR_SELECT_ALL_STMT, &s);
                   5716:   if( rc!=SQLITE_OK ) return rc;
                   5717: 
                   5718:   /* This code should never be called with buffered updates. */
                   5719:   assert( v->nPendingData<0 );
                   5720: 
                   5721:   dataBufferInit(&doclist, 0);
                   5722: 
                   5723:   /* Traverse the segments from oldest to newest so that newer doclist
                   5724:   ** elements for given docids overwrite older elements.
                   5725:   */
                   5726:   while( (rc = sqlite3_step(s))==SQLITE_ROW ){
                   5727:     const char *pData = sqlite3_column_blob(s, 2);
                   5728:     const int nData = sqlite3_column_bytes(s, 2);
                   5729:     const sqlite_int64 iLeavesEnd = sqlite3_column_int64(s, 1);
                   5730:     rc = loadSegment(v, pData, nData, iLeavesEnd, pTerm, nTerm, isPrefix,
                   5731:                      &doclist);
                   5732:     if( rc!=SQLITE_OK ) goto err;
                   5733:   }
                   5734:   if( rc==SQLITE_DONE ){
                   5735:     if( doclist.nData!=0 ){
                   5736:       /* TODO(shess) The old term_select_all() code applied the column
                   5737:       ** restrict as we merged segments, leading to smaller buffers.
                   5738:       ** This is probably worthwhile to bring back, once the new storage
                   5739:       ** system is checked in.
                   5740:       */
                   5741:       if( iColumn==v->nColumn) iColumn = -1;
                   5742:       docListTrim(DL_DEFAULT, doclist.pData, doclist.nData,
                   5743:                   iColumn, iType, out);
                   5744:     }
                   5745:     rc = SQLITE_OK;
                   5746:   }
                   5747: 
                   5748:  err:
                   5749:   dataBufferDestroy(&doclist);
                   5750:   return rc;
                   5751: }
                   5752: 
                   5753: /****************************************************************/
                   5754: /* Used to hold hashtable data for sorting. */
                   5755: typedef struct TermData {
                   5756:   const char *pTerm;
                   5757:   int nTerm;
                   5758:   DLCollector *pCollector;
                   5759: } TermData;
                   5760: 
                   5761: /* Orders TermData elements in strcmp fashion ( <0 for less-than, 0
                   5762: ** for equal, >0 for greater-than).
                   5763: */
                   5764: static int termDataCmp(const void *av, const void *bv){
                   5765:   const TermData *a = (const TermData *)av;
                   5766:   const TermData *b = (const TermData *)bv;
                   5767:   int n = a->nTerm<b->nTerm ? a->nTerm : b->nTerm;
                   5768:   int c = memcmp(a->pTerm, b->pTerm, n);
                   5769:   if( c!=0 ) return c;
                   5770:   return a->nTerm-b->nTerm;
                   5771: }
                   5772: 
                   5773: /* Order pTerms data by term, then write a new level 0 segment using
                   5774: ** LeafWriter.
                   5775: */
                   5776: static int writeZeroSegment(fulltext_vtab *v, fts2Hash *pTerms){
                   5777:   fts2HashElem *e;
                   5778:   int idx, rc, i, n;
                   5779:   TermData *pData;
                   5780:   LeafWriter writer;
                   5781:   DataBuffer dl;
                   5782: 
                   5783:   /* Determine the next index at level 0, merging as necessary. */
                   5784:   rc = segdirNextIndex(v, 0, &idx);
                   5785:   if( rc!=SQLITE_OK ) return rc;
                   5786: 
                   5787:   n = fts2HashCount(pTerms);
                   5788:   pData = sqlite3_malloc(n*sizeof(TermData));
                   5789: 
                   5790:   for(i = 0, e = fts2HashFirst(pTerms); e; i++, e = fts2HashNext(e)){
                   5791:     assert( i<n );
                   5792:     pData[i].pTerm = fts2HashKey(e);
                   5793:     pData[i].nTerm = fts2HashKeysize(e);
                   5794:     pData[i].pCollector = fts2HashData(e);
                   5795:   }
                   5796:   assert( i==n );
                   5797: 
                   5798:   /* TODO(shess) Should we allow user-defined collation sequences,
                   5799:   ** here?  I think we only need that once we support prefix searches.
                   5800:   */
                   5801:   if( n>1 ) qsort(pData, n, sizeof(*pData), termDataCmp);
                   5802: 
                   5803:   /* TODO(shess) Refactor so that we can write directly to the segment
                   5804:   ** DataBuffer, as happens for segment merges.
                   5805:   */
                   5806:   leafWriterInit(0, idx, &writer);
                   5807:   dataBufferInit(&dl, 0);
                   5808:   for(i=0; i<n; i++){
                   5809:     dataBufferReset(&dl);
                   5810:     dlcAddDoclist(pData[i].pCollector, &dl);
                   5811:     rc = leafWriterStep(v, &writer,
                   5812:                         pData[i].pTerm, pData[i].nTerm, dl.pData, dl.nData);
                   5813:     if( rc!=SQLITE_OK ) goto err;
                   5814:   }
                   5815:   rc = leafWriterFinalize(v, &writer);
                   5816: 
                   5817:  err:
                   5818:   dataBufferDestroy(&dl);
                   5819:   sqlite3_free(pData);
                   5820:   leafWriterDestroy(&writer);
                   5821:   return rc;
                   5822: }
                   5823: 
                   5824: /* If pendingTerms has data, free it. */
                   5825: static int clearPendingTerms(fulltext_vtab *v){
                   5826:   if( v->nPendingData>=0 ){
                   5827:     fts2HashElem *e;
                   5828:     for(e=fts2HashFirst(&v->pendingTerms); e; e=fts2HashNext(e)){
                   5829:       dlcDelete(fts2HashData(e));
                   5830:     }
                   5831:     fts2HashClear(&v->pendingTerms);
                   5832:     v->nPendingData = -1;
                   5833:   }
                   5834:   return SQLITE_OK;
                   5835: }
                   5836: 
                   5837: /* If pendingTerms has data, flush it to a level-zero segment, and
                   5838: ** free it.
                   5839: */
                   5840: static int flushPendingTerms(fulltext_vtab *v){
                   5841:   if( v->nPendingData>=0 ){
                   5842:     int rc = writeZeroSegment(v, &v->pendingTerms);
                   5843:     if( rc==SQLITE_OK ) clearPendingTerms(v);
                   5844:     return rc;
                   5845:   }
                   5846:   return SQLITE_OK;
                   5847: }
                   5848: 
                   5849: /* If pendingTerms is "too big", or docid is out of order, flush it.
                   5850: ** Regardless, be certain that pendingTerms is initialized for use.
                   5851: */
                   5852: static int initPendingTerms(fulltext_vtab *v, sqlite_int64 iDocid){
                   5853:   /* TODO(shess) Explore whether partially flushing the buffer on
                   5854:   ** forced-flush would provide better performance.  I suspect that if
                   5855:   ** we ordered the doclists by size and flushed the largest until the
                   5856:   ** buffer was half empty, that would let the less frequent terms
                   5857:   ** generate longer doclists.
                   5858:   */
                   5859:   if( iDocid<=v->iPrevDocid || v->nPendingData>kPendingThreshold ){
                   5860:     int rc = flushPendingTerms(v);
                   5861:     if( rc!=SQLITE_OK ) return rc;
                   5862:   }
                   5863:   if( v->nPendingData<0 ){
                   5864:     fts2HashInit(&v->pendingTerms, FTS2_HASH_STRING, 1);
                   5865:     v->nPendingData = 0;
                   5866:   }
                   5867:   v->iPrevDocid = iDocid;
                   5868:   return SQLITE_OK;
                   5869: }
                   5870: 
/* This function implements the xUpdate callback; it is the top-level entry
 * point for inserting, deleting or updating a row in a full-text table.
 *
 * Argument layout (per the sqlite3 virtual-table xUpdate convention):
 *   nArg<2                      => delete; ppArg[0] is the rowid.
 *   ppArg[0] not NULL otherwise => update of row ppArg[0].
 *   ppArg[0] NULL               => insert; ppArg[1] is the requested rowid.
 *
 * Returns SQLITE_OK on success, SQLITE_ERROR on an attempt to change a
 * row's rowid, or the error code from the underlying index operation.
 */
static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg,
                   sqlite_int64 *pRowid){
  fulltext_vtab *v = (fulltext_vtab *) pVtab;
  int rc;

  TRACE(("FTS2 Update %p\n", pVtab));

  if( nArg<2 ){
    /* A delete; ppArg[0] holds the rowid to remove. */
    rc = index_delete(v, sqlite3_value_int64(ppArg[0]));
    if( rc==SQLITE_OK ){
      /* If we just deleted the last row in the table, clear out the
      ** index data.
      */
      rc = content_exists(v);
      if( rc==SQLITE_ROW ){
        /* Rows remain; nothing more to do. */
        rc = SQLITE_OK;
      }else if( rc==SQLITE_DONE ){
        /* Clear the pending terms so we don't flush a useless level-0
        ** segment when the transaction closes.
        */
        rc = clearPendingTerms(v);
        if( rc==SQLITE_OK ){
          rc = segdir_delete_all(v);
        }
      }
    }
  } else if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){
    /* An update:
     * ppArg[0] = old rowid
     * ppArg[1] = new rowid
     * ppArg[2..2+v->nColumn-1] = values
     * ppArg[2+v->nColumn] = value for magic column (we ignore this)
     */
    sqlite_int64 rowid = sqlite3_value_int64(ppArg[0]);
    if( sqlite3_value_type(ppArg[1]) != SQLITE_INTEGER ||
      sqlite3_value_int64(ppArg[1]) != rowid ){
      rc = SQLITE_ERROR;  /* we don't allow changing the rowid */
    } else {
      assert( nArg==2+v->nColumn+1);
      rc = index_update(v, rowid, &ppArg[2]);
    }
  } else {
    /* An insert:
     * ppArg[1] = requested rowid
     * ppArg[2..2+v->nColumn-1] = values
     * ppArg[2+v->nColumn] = value for magic column (we ignore this)
     */
    assert( nArg==2+v->nColumn+1);
    rc = index_insert(v, ppArg[1], &ppArg[2], pRowid);
  }

  return rc;
}
                   5926: 
                   5927: static int fulltextSync(sqlite3_vtab *pVtab){
                   5928:   TRACE(("FTS2 xSync()\n"));
                   5929:   return flushPendingTerms((fulltext_vtab *)pVtab);
                   5930: }
                   5931: 
                   5932: static int fulltextBegin(sqlite3_vtab *pVtab){
                   5933:   fulltext_vtab *v = (fulltext_vtab *) pVtab;
                   5934:   TRACE(("FTS2 xBegin()\n"));
                   5935: 
                   5936:   /* Any buffered updates should have been cleared by the previous
                   5937:   ** transaction.
                   5938:   */
                   5939:   assert( v->nPendingData<0 );
                   5940:   return clearPendingTerms(v);
                   5941: }
                   5942: 
                   5943: static int fulltextCommit(sqlite3_vtab *pVtab){
                   5944:   fulltext_vtab *v = (fulltext_vtab *) pVtab;
                   5945:   TRACE(("FTS2 xCommit()\n"));
                   5946: 
                   5947:   /* Buffered updates should have been cleared by fulltextSync(). */
                   5948:   assert( v->nPendingData<0 );
                   5949:   return clearPendingTerms(v);
                   5950: }
                   5951: 
                   5952: static int fulltextRollback(sqlite3_vtab *pVtab){
                   5953:   TRACE(("FTS2 xRollback()\n"));
                   5954:   return clearPendingTerms((fulltext_vtab *)pVtab);
                   5955: }
                   5956: 
                   5957: /*
                   5958: ** Implementation of the snippet() function for FTS2
                   5959: */
                   5960: static void snippetFunc(
                   5961:   sqlite3_context *pContext,
                   5962:   int argc,
                   5963:   sqlite3_value **argv
                   5964: ){
                   5965:   fulltext_cursor *pCursor;
                   5966:   if( argc<1 ) return;
                   5967:   if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
                   5968:       sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
                   5969:     sqlite3_result_error(pContext, "illegal first argument to html_snippet",-1);
                   5970:   }else{
                   5971:     const char *zStart = "<b>";
                   5972:     const char *zEnd = "</b>";
                   5973:     const char *zEllipsis = "<b>...</b>";
                   5974:     memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
                   5975:     if( argc>=2 ){
                   5976:       zStart = (const char*)sqlite3_value_text(argv[1]);
                   5977:       if( argc>=3 ){
                   5978:         zEnd = (const char*)sqlite3_value_text(argv[2]);
                   5979:         if( argc>=4 ){
                   5980:           zEllipsis = (const char*)sqlite3_value_text(argv[3]);
                   5981:         }
                   5982:       }
                   5983:     }
                   5984:     snippetAllOffsets(pCursor);
                   5985:     snippetText(pCursor, zStart, zEnd, zEllipsis);
                   5986:     sqlite3_result_text(pContext, pCursor->snippet.zSnippet,
                   5987:                         pCursor->snippet.nSnippet, SQLITE_STATIC);
                   5988:   }
                   5989: }
                   5990: 
                   5991: /*
                   5992: ** Implementation of the offsets() function for FTS2
                   5993: */
                   5994: static void snippetOffsetsFunc(
                   5995:   sqlite3_context *pContext,
                   5996:   int argc,
                   5997:   sqlite3_value **argv
                   5998: ){
                   5999:   fulltext_cursor *pCursor;
                   6000:   if( argc<1 ) return;
                   6001:   if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
                   6002:       sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
                   6003:     sqlite3_result_error(pContext, "illegal first argument to offsets",-1);
                   6004:   }else{
                   6005:     memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
                   6006:     snippetAllOffsets(pCursor);
                   6007:     snippetOffsetText(&pCursor->snippet);
                   6008:     sqlite3_result_text(pContext,
                   6009:                         pCursor->snippet.zOffset, pCursor->snippet.nOffset,
                   6010:                         SQLITE_STATIC);
                   6011:   }
                   6012: }
                   6013: 
/* OptLeavesReader is nearly identical to LeavesReader, except that
** where LeavesReader is geared towards the merging of complete
** segment levels (with exactly MERGE_COUNT segments), OptLeavesReader
** is geared towards implementation of the optimize() function, and
** can merge all segments simultaneously.  This version may be
** somewhat less efficient than LeavesReader because it merges into an
** accumulator rather than doing an N-way merge, but since segment
** size grows exponentially (so segment count logrithmically) this is
** probably not an immediate problem.
*/
/* TODO(shess): Prove that assertion, or extend the merge code to
** merge tree fashion (like the prefix-searching code does).
*/
/* TODO(shess): OptLeavesReader and LeavesReader could probably be
** merged with little or no loss of performance for LeavesReader.  The
** merged code would need to handle >MERGE_COUNT segments, and would
** also need to be able to optionally optimize away deletes.
*/
typedef struct OptLeavesReader {
  /* Segment number, to order readers by age. */
  int segment;
  /* Underlying reader positioned within that segment's leaves. */
  LeavesReader reader;
} OptLeavesReader;
                   6037: 
/* Thin forwarding wrappers exposing the LeavesReader operations in
** terms of OptLeavesReader.
*/
static int optLeavesReaderAtEnd(OptLeavesReader *pReader){
  return leavesReaderAtEnd(&pReader->reader);
}
static int optLeavesReaderTermBytes(OptLeavesReader *pReader){
  return leavesReaderTermBytes(&pReader->reader);
}
static const char *optLeavesReaderData(OptLeavesReader *pReader){
  return leavesReaderData(&pReader->reader);
}
static int optLeavesReaderDataBytes(OptLeavesReader *pReader){
  return leavesReaderDataBytes(&pReader->reader);
}
static const char *optLeavesReaderTerm(OptLeavesReader *pReader){
  return leavesReaderTerm(&pReader->reader);
}
static int optLeavesReaderStep(fulltext_vtab *v, OptLeavesReader *pReader){
  return leavesReaderStep(v, &pReader->reader);
}
static int optLeavesReaderTermCmp(OptLeavesReader *lr1, OptLeavesReader *lr2){
  return leavesReaderTermCmp(&lr1->reader, &lr2->reader);
}
/* Order by term ascending, segment ascending (oldest to newest), with
** exhausted readers to the end.
*/
static int optLeavesReaderCmp(OptLeavesReader *lr1, OptLeavesReader *lr2){
  int c = optLeavesReaderTermCmp(lr1, lr2);
  if( c!=0 ) return c;
  return lr1->segment-lr2->segment;
}
                   6067: /* Bubble pLr[0] to appropriate place in pLr[1..nLr-1].  Assumes that
                   6068: ** pLr[1..nLr-1] is already sorted.
                   6069: */
                   6070: static void optLeavesReaderReorder(OptLeavesReader *pLr, int nLr){
                   6071:   while( nLr>1 && optLeavesReaderCmp(pLr, pLr+1)>0 ){
                   6072:     OptLeavesReader tmp = pLr[0];
                   6073:     pLr[0] = pLr[1];
                   6074:     pLr[1] = tmp;
                   6075:     nLr--;
                   6076:     pLr++;
                   6077:   }
                   6078: }
                   6079: 
                   6080: /* optimize() helper function.  Put the readers in order and iterate
                   6081: ** through them, merging doclists for matching terms into pWriter.
                   6082: ** Returns SQLITE_OK on success, or the SQLite error code which
                   6083: ** prevented success.
                   6084: */
                   6085: static int optimizeInternal(fulltext_vtab *v,
                   6086:                             OptLeavesReader *readers, int nReaders,
                   6087:                             LeafWriter *pWriter){
                   6088:   int i, rc = SQLITE_OK;
                   6089:   DataBuffer doclist, merged, tmp;
                   6090: 
                   6091:   /* Order the readers. */
                   6092:   i = nReaders;
                   6093:   while( i-- > 0 ){
                   6094:     optLeavesReaderReorder(&readers[i], nReaders-i);
                   6095:   }
                   6096: 
                   6097:   dataBufferInit(&doclist, LEAF_MAX);
                   6098:   dataBufferInit(&merged, LEAF_MAX);
                   6099: 
                   6100:   /* Exhausted readers bubble to the end, so when the first reader is
                   6101:   ** at eof, all are at eof.
                   6102:   */
                   6103:   while( !optLeavesReaderAtEnd(&readers[0]) ){
                   6104: 
                   6105:     /* Figure out how many readers share the next term. */
                   6106:     for(i=1; i<nReaders && !optLeavesReaderAtEnd(&readers[i]); i++){
                   6107:       if( 0!=optLeavesReaderTermCmp(&readers[0], &readers[i]) ) break;
                   6108:     }
                   6109: 
                   6110:     /* Special-case for no merge. */
                   6111:     if( i==1 ){
                   6112:       /* Trim deletions from the doclist. */
                   6113:       dataBufferReset(&merged);
                   6114:       docListTrim(DL_DEFAULT,
                   6115:                   optLeavesReaderData(&readers[0]),
                   6116:                   optLeavesReaderDataBytes(&readers[0]),
                   6117:                   -1, DL_DEFAULT, &merged);
                   6118:     }else{
                   6119:       DLReader dlReaders[MERGE_COUNT];
                   6120:       int iReader, nReaders;
                   6121: 
                   6122:       /* Prime the pipeline with the first reader's doclist.  After
                   6123:       ** one pass index 0 will reference the accumulated doclist.
                   6124:       */
                   6125:       dlrInit(&dlReaders[0], DL_DEFAULT,
                   6126:               optLeavesReaderData(&readers[0]),
                   6127:               optLeavesReaderDataBytes(&readers[0]));
                   6128:       iReader = 1;
                   6129: 
                   6130:       assert( iReader<i );  /* Must execute the loop at least once. */
                   6131:       while( iReader<i ){
                   6132:         /* Merge 16 inputs per pass. */
                   6133:         for( nReaders=1; iReader<i && nReaders<MERGE_COUNT;
                   6134:              iReader++, nReaders++ ){
                   6135:           dlrInit(&dlReaders[nReaders], DL_DEFAULT,
                   6136:                   optLeavesReaderData(&readers[iReader]),
                   6137:                   optLeavesReaderDataBytes(&readers[iReader]));
                   6138:         }
                   6139: 
                   6140:         /* Merge doclists and swap result into accumulator. */
                   6141:         dataBufferReset(&merged);
                   6142:         docListMerge(&merged, dlReaders, nReaders);
                   6143:         tmp = merged;
                   6144:         merged = doclist;
                   6145:         doclist = tmp;
                   6146: 
                   6147:         while( nReaders-- > 0 ){
                   6148:           dlrDestroy(&dlReaders[nReaders]);
                   6149:         }
                   6150: 
                   6151:         /* Accumulated doclist to reader 0 for next pass. */
                   6152:         dlrInit(&dlReaders[0], DL_DEFAULT, doclist.pData, doclist.nData);
                   6153:       }
                   6154: 
                   6155:       /* Destroy reader that was left in the pipeline. */
                   6156:       dlrDestroy(&dlReaders[0]);
                   6157: 
                   6158:       /* Trim deletions from the doclist. */
                   6159:       dataBufferReset(&merged);
                   6160:       docListTrim(DL_DEFAULT, doclist.pData, doclist.nData,
                   6161:                   -1, DL_DEFAULT, &merged);
                   6162:     }
                   6163: 
                   6164:     /* Only pass doclists with hits (skip if all hits deleted). */
                   6165:     if( merged.nData>0 ){
                   6166:       rc = leafWriterStep(v, pWriter,
                   6167:                           optLeavesReaderTerm(&readers[0]),
                   6168:                           optLeavesReaderTermBytes(&readers[0]),
                   6169:                           merged.pData, merged.nData);
                   6170:       if( rc!=SQLITE_OK ) goto err;
                   6171:     }
                   6172: 
                   6173:     /* Step merged readers to next term and reorder. */
                   6174:     while( i-- > 0 ){
                   6175:       rc = optLeavesReaderStep(v, &readers[i]);
                   6176:       if( rc!=SQLITE_OK ) goto err;
                   6177: 
                   6178:       optLeavesReaderReorder(&readers[i], nReaders-i);
                   6179:     }
                   6180:   }
                   6181: 
                   6182:  err:
                   6183:   dataBufferDestroy(&doclist);
                   6184:   dataBufferDestroy(&merged);
                   6185:   return rc;
                   6186: }
                   6187: 
                   6188: /* Implement optimize() function for FTS3.  optimize(t) merges all
                   6189: ** segments in the fts index into a single segment.  't' is the magic
                   6190: ** table-named column.
                   6191: */
                   6192: static void optimizeFunc(sqlite3_context *pContext,
                   6193:                          int argc, sqlite3_value **argv){
                   6194:   fulltext_cursor *pCursor;
                   6195:   if( argc>1 ){
                   6196:     sqlite3_result_error(pContext, "excess arguments to optimize()",-1);
                   6197:   }else if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
                   6198:             sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
                   6199:     sqlite3_result_error(pContext, "illegal first argument to optimize",-1);
                   6200:   }else{
                   6201:     fulltext_vtab *v;
                   6202:     int i, rc, iMaxLevel;
                   6203:     OptLeavesReader *readers;
                   6204:     int nReaders;
                   6205:     LeafWriter writer;
                   6206:     sqlite3_stmt *s;
                   6207: 
                   6208:     memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
                   6209:     v = cursor_vtab(pCursor);
                   6210: 
                   6211:     /* Flush any buffered updates before optimizing. */
                   6212:     rc = flushPendingTerms(v);
                   6213:     if( rc!=SQLITE_OK ) goto err;
                   6214: 
                   6215:     rc = segdir_count(v, &nReaders, &iMaxLevel);
                   6216:     if( rc!=SQLITE_OK ) goto err;
                   6217:     if( nReaders==0 || nReaders==1 ){
                   6218:       sqlite3_result_text(pContext, "Index already optimal", -1,
                   6219:                           SQLITE_STATIC);
                   6220:       return;
                   6221:     }
                   6222: 
                   6223:     rc = sql_get_statement(v, SEGDIR_SELECT_ALL_STMT, &s);
                   6224:     if( rc!=SQLITE_OK ) goto err;
                   6225: 
                   6226:     readers = sqlite3_malloc(nReaders*sizeof(readers[0]));
                   6227:     if( readers==NULL ) goto err;
                   6228: 
                   6229:     /* Note that there will already be a segment at this position
                   6230:     ** until we call segdir_delete() on iMaxLevel.
                   6231:     */
                   6232:     leafWriterInit(iMaxLevel, 0, &writer);
                   6233: 
                   6234:     i = 0;
                   6235:     while( (rc = sqlite3_step(s))==SQLITE_ROW ){
                   6236:       sqlite_int64 iStart = sqlite3_column_int64(s, 0);
                   6237:       sqlite_int64 iEnd = sqlite3_column_int64(s, 1);
                   6238:       const char *pRootData = sqlite3_column_blob(s, 2);
                   6239:       int nRootData = sqlite3_column_bytes(s, 2);
                   6240: 
                   6241:       assert( i<nReaders );
                   6242:       rc = leavesReaderInit(v, -1, iStart, iEnd, pRootData, nRootData,
                   6243:                             &readers[i].reader);
                   6244:       if( rc!=SQLITE_OK ) break;
                   6245: 
                   6246:       readers[i].segment = i;
                   6247:       i++;
                   6248:     }
                   6249: 
                   6250:     /* If we managed to successfully read them all, optimize them. */
                   6251:     if( rc==SQLITE_DONE ){
                   6252:       assert( i==nReaders );
                   6253:       rc = optimizeInternal(v, readers, nReaders, &writer);
                   6254:     }
                   6255: 
                   6256:     while( i-- > 0 ){
                   6257:       leavesReaderDestroy(&readers[i].reader);
                   6258:     }
                   6259:     sqlite3_free(readers);
                   6260: 
                   6261:     /* If we've successfully gotten to here, delete the old segments
                   6262:     ** and flush the interior structure of the new segment.
                   6263:     */
                   6264:     if( rc==SQLITE_OK ){
                   6265:       for( i=0; i<=iMaxLevel; i++ ){
                   6266:         rc = segdir_delete(v, i);
                   6267:         if( rc!=SQLITE_OK ) break;
                   6268:       }
                   6269: 
                   6270:       if( rc==SQLITE_OK ) rc = leafWriterFinalize(v, &writer);
                   6271:     }
                   6272: 
                   6273:     leafWriterDestroy(&writer);
                   6274: 
                   6275:     if( rc!=SQLITE_OK ) goto err;
                   6276: 
                   6277:     sqlite3_result_text(pContext, "Index optimized", -1, SQLITE_STATIC);
                   6278:     return;
                   6279: 
                   6280:     /* TODO(shess): Error-handling needs to be improved along the
                   6281:     ** lines of the dump_ functions.
                   6282:     */
                   6283:  err:
                   6284:     {
                   6285:       char buf[512];
                   6286:       sqlite3_snprintf(sizeof(buf), buf, "Error in optimize: %s",
                   6287:                        sqlite3_errmsg(sqlite3_context_db_handle(pContext)));
                   6288:       sqlite3_result_error(pContext, buf, -1);
                   6289:     }
                   6290:   }
                   6291: }
                   6292: 
                   6293: #ifdef SQLITE_TEST
                   6294: /* Generate an error of the form "<prefix>: <msg>".  If msg is NULL,
                   6295: ** pull the error from the context's db handle.
                   6296: */
                   6297: static void generateError(sqlite3_context *pContext,
                   6298:                           const char *prefix, const char *msg){
                   6299:   char buf[512];
                   6300:   if( msg==NULL ) msg = sqlite3_errmsg(sqlite3_context_db_handle(pContext));
                   6301:   sqlite3_snprintf(sizeof(buf), buf, "%s: %s", prefix, msg);
                   6302:   sqlite3_result_error(pContext, buf, -1);
                   6303: }
                   6304: 
                   6305: /* Helper function to collect the set of terms in the segment into
                   6306: ** pTerms.  The segment is defined by the leaf nodes between
                   6307: ** iStartBlockid and iEndBlockid, inclusive, or by the contents of
                   6308: ** pRootData if iStartBlockid is 0 (in which case the entire segment
                   6309: ** fit in a leaf).
                   6310: */
                   6311: static int collectSegmentTerms(fulltext_vtab *v, sqlite3_stmt *s,
                   6312:                                fts2Hash *pTerms){
                   6313:   const sqlite_int64 iStartBlockid = sqlite3_column_int64(s, 0);
                   6314:   const sqlite_int64 iEndBlockid = sqlite3_column_int64(s, 1);
                   6315:   const char *pRootData = sqlite3_column_blob(s, 2);
                   6316:   const int nRootData = sqlite3_column_bytes(s, 2);
                   6317:   LeavesReader reader;
                   6318:   int rc = leavesReaderInit(v, 0, iStartBlockid, iEndBlockid,
                   6319:                             pRootData, nRootData, &reader);
                   6320:   if( rc!=SQLITE_OK ) return rc;
                   6321: 
                   6322:   while( rc==SQLITE_OK && !leavesReaderAtEnd(&reader) ){
                   6323:     const char *pTerm = leavesReaderTerm(&reader);
                   6324:     const int nTerm = leavesReaderTermBytes(&reader);
                   6325:     void *oldValue = sqlite3Fts2HashFind(pTerms, pTerm, nTerm);
                   6326:     void *newValue = (void *)((char *)oldValue+1);
                   6327: 
                   6328:     /* From the comment before sqlite3Fts2HashInsert in fts2_hash.c,
                   6329:     ** the data value passed is returned in case of malloc failure.
                   6330:     */
                   6331:     if( newValue==sqlite3Fts2HashInsert(pTerms, pTerm, nTerm, newValue) ){
                   6332:       rc = SQLITE_NOMEM;
                   6333:     }else{
                   6334:       rc = leavesReaderStep(v, &reader);
                   6335:     }
                   6336:   }
                   6337: 
                   6338:   leavesReaderDestroy(&reader);
                   6339:   return rc;
                   6340: }
                   6341: 
                   6342: /* Helper function to build the result string for dump_terms(). */
                   6343: static int generateTermsResult(sqlite3_context *pContext, fts2Hash *pTerms){
                   6344:   int iTerm, nTerms, nResultBytes, iByte;
                   6345:   char *result;
                   6346:   TermData *pData;
                   6347:   fts2HashElem *e;
                   6348: 
                   6349:   /* Iterate pTerms to generate an array of terms in pData for
                   6350:   ** sorting.
                   6351:   */
                   6352:   nTerms = fts2HashCount(pTerms);
                   6353:   assert( nTerms>0 );
                   6354:   pData = sqlite3_malloc(nTerms*sizeof(TermData));
                   6355:   if( pData==NULL ) return SQLITE_NOMEM;
                   6356: 
                   6357:   nResultBytes = 0;
                   6358:   for(iTerm = 0, e = fts2HashFirst(pTerms); e; iTerm++, e = fts2HashNext(e)){
                   6359:     nResultBytes += fts2HashKeysize(e)+1;   /* Term plus trailing space */
                   6360:     assert( iTerm<nTerms );
                   6361:     pData[iTerm].pTerm = fts2HashKey(e);
                   6362:     pData[iTerm].nTerm = fts2HashKeysize(e);
                   6363:     pData[iTerm].pCollector = fts2HashData(e);  /* unused */
                   6364:   }
                   6365:   assert( iTerm==nTerms );
                   6366: 
                   6367:   assert( nResultBytes>0 );   /* nTerms>0, nResultsBytes must be, too. */
                   6368:   result = sqlite3_malloc(nResultBytes);
                   6369:   if( result==NULL ){
                   6370:     sqlite3_free(pData);
                   6371:     return SQLITE_NOMEM;
                   6372:   }
                   6373: 
                   6374:   if( nTerms>1 ) qsort(pData, nTerms, sizeof(*pData), termDataCmp);
                   6375: 
                   6376:   /* Read the terms in order to build the result. */
                   6377:   iByte = 0;
                   6378:   for(iTerm=0; iTerm<nTerms; ++iTerm){
                   6379:     memcpy(result+iByte, pData[iTerm].pTerm, pData[iTerm].nTerm);
                   6380:     iByte += pData[iTerm].nTerm;
                   6381:     result[iByte++] = ' ';
                   6382:   }
                   6383:   assert( iByte==nResultBytes );
                   6384:   assert( result[nResultBytes-1]==' ' );
                   6385:   result[nResultBytes-1] = '\0';
                   6386: 
                   6387:   /* Passes away ownership of result. */
                   6388:   sqlite3_result_text(pContext, result, nResultBytes-1, sqlite3_free);
                   6389:   sqlite3_free(pData);
                   6390:   return SQLITE_OK;
                   6391: }
                   6392: 
                   6393: /* Implements dump_terms() for use in inspecting the fts2 index from
                   6394: ** tests.  TEXT result containing the ordered list of terms joined by
                   6395: ** spaces.  dump_terms(t, level, idx) dumps the terms for the segment
                   6396: ** specified by level, idx (in %_segdir), while dump_terms(t) dumps
                   6397: ** all terms in the index.  In both cases t is the fts table's magic
                   6398: ** table-named column.
                   6399: */
                   6400: static void dumpTermsFunc(
                   6401:   sqlite3_context *pContext,
                   6402:   int argc, sqlite3_value **argv
                   6403: ){
                   6404:   fulltext_cursor *pCursor;
                   6405:   if( argc!=3 && argc!=1 ){
                   6406:     generateError(pContext, "dump_terms", "incorrect arguments");
                   6407:   }else if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
                   6408:             sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
                   6409:     generateError(pContext, "dump_terms", "illegal first argument");
                   6410:   }else{
                   6411:     fulltext_vtab *v;
                   6412:     fts2Hash terms;
                   6413:     sqlite3_stmt *s = NULL;
                   6414:     int rc;
                   6415: 
                   6416:     memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
                   6417:     v = cursor_vtab(pCursor);
                   6418: 
                   6419:     /* If passed only the cursor column, get all segments.  Otherwise
                   6420:     ** get the segment described by the following two arguments.
                   6421:     */
                   6422:     if( argc==1 ){
                   6423:       rc = sql_get_statement(v, SEGDIR_SELECT_ALL_STMT, &s);
                   6424:     }else{
                   6425:       rc = sql_get_statement(v, SEGDIR_SELECT_SEGMENT_STMT, &s);
                   6426:       if( rc==SQLITE_OK ){
                   6427:         rc = sqlite3_bind_int(s, 1, sqlite3_value_int(argv[1]));
                   6428:         if( rc==SQLITE_OK ){
                   6429:           rc = sqlite3_bind_int(s, 2, sqlite3_value_int(argv[2]));
                   6430:         }
                   6431:       }
                   6432:     }
                   6433: 
                   6434:     if( rc!=SQLITE_OK ){
                   6435:       generateError(pContext, "dump_terms", NULL);
                   6436:       return;
                   6437:     }
                   6438: 
                   6439:     /* Collect the terms for each segment. */
                   6440:     sqlite3Fts2HashInit(&terms, FTS2_HASH_STRING, 1);
                   6441:     while( (rc = sqlite3_step(s))==SQLITE_ROW ){
                   6442:       rc = collectSegmentTerms(v, s, &terms);
                   6443:       if( rc!=SQLITE_OK ) break;
                   6444:     }
                   6445: 
                   6446:     if( rc!=SQLITE_DONE ){
                   6447:       sqlite3_reset(s);
                   6448:       generateError(pContext, "dump_terms", NULL);
                   6449:     }else{
                   6450:       const int nTerms = fts2HashCount(&terms);
                   6451:       if( nTerms>0 ){
                   6452:         rc = generateTermsResult(pContext, &terms);
                   6453:         if( rc==SQLITE_NOMEM ){
                   6454:           generateError(pContext, "dump_terms", "out of memory");
                   6455:         }else{
                   6456:           assert( rc==SQLITE_OK );
                   6457:         }
                   6458:       }else if( argc==3 ){
                   6459:         /* The specific segment asked for could not be found. */
                   6460:         generateError(pContext, "dump_terms", "segment not found");
                   6461:       }else{
                   6462:         /* No segments found. */
                   6463:         /* TODO(shess): It should be impossible to reach this.  This
                   6464:         ** case can only happen for an empty table, in which case
                   6465:         ** SQLite has no rows to call this function on.
                   6466:         */
                   6467:         sqlite3_result_null(pContext);
                   6468:       }
                   6469:     }
                   6470:     sqlite3Fts2HashClear(&terms);
                   6471:   }
                   6472: }
                   6473: 
/* Expand the DL_DEFAULT doclist in pData into a human-readable TEXT
** result in pContext.  The output format depends on the compile-time
** DL_DEFAULT setting; see the dump_doclist() comment for examples.
** Ownership of the generated buffer passes to the SQLite core.
*/
static void createDoclistResult(sqlite3_context *pContext,
                                const char *pData, int nData){
  DataBuffer dump;
  DLReader dlReader;

  assert( pData!=NULL && nData>0 );

  dataBufferInit(&dump, 0);
  dlrInit(&dlReader, DL_DEFAULT, pData, nData);
  /* One "[...]" group per docid in the doclist. */
  for( ; !dlrAtEnd(&dlReader); dlrStep(&dlReader) ){
    char buf[256];
    PLReader plReader;

    plrInit(&plReader, &dlReader);
    if( DL_DEFAULT==DL_DOCIDS || plrAtEnd(&plReader) ){
      /* No position data for this docid: emit just "[docid] ". */
      sqlite3_snprintf(sizeof(buf), buf, "[%lld] ", dlrDocid(&dlReader));
      dataBufferAppend(&dump, buf, strlen(buf));
    }else{
      int iColumn = plrColumn(&plReader);

      /* Open the docid group and the first per-column sub-group. */
      sqlite3_snprintf(sizeof(buf), buf, "[%lld %d[",
                       dlrDocid(&dlReader), iColumn);
      dataBufferAppend(&dump, buf, strlen(buf));

      for( ; !plrAtEnd(&plReader); plrStep(&plReader) ){
        if( plrColumn(&plReader)!=iColumn ){
          /* Column changed: close the previous sub-group and open a
          ** new "%d[" group for the new column.
          */
          iColumn = plrColumn(&plReader);
          sqlite3_snprintf(sizeof(buf), buf, "] %d[", iColumn);
          assert( dump.nData>0 );
          dump.nData--;                     /* Overwrite trailing space. */
          assert( dump.pData[dump.nData]==' ');
          dataBufferAppend(&dump, buf, strlen(buf));
        }
        if( DL_DEFAULT==DL_POSITIONS_OFFSETS ){
          sqlite3_snprintf(sizeof(buf), buf, "%d,%d,%d ",
                           plrPosition(&plReader),
                           plrStartOffset(&plReader), plrEndOffset(&plReader));
        }else if( DL_DEFAULT==DL_POSITIONS ){
          sqlite3_snprintf(sizeof(buf), buf, "%d ", plrPosition(&plReader));
        }else{
          assert( NULL=="Unhandled DL_DEFAULT value");
        }
        dataBufferAppend(&dump, buf, strlen(buf));
      }
      plrDestroy(&plReader);

      assert( dump.nData>0 );
      dump.nData--;                     /* Overwrite trailing space. */
      assert( dump.pData[dump.nData]==' ');
      dataBufferAppend(&dump, "]] ", 3);
    }
  }
  dlrDestroy(&dlReader);

  /* Replace the final trailing space with a NUL terminator. */
  assert( dump.nData>0 );
  dump.nData--;                     /* Overwrite trailing space. */
  assert( dump.pData[dump.nData]==' ');
  dump.pData[dump.nData] = '\0';
  assert( dump.nData>0 );

  /* Passes ownership of dump's buffer to pContext; clear the fields
  ** so a later dataBufferDestroy() by the caller would be a no-op.
  */
  sqlite3_result_text(pContext, dump.pData, dump.nData, sqlite3_free);
  dump.pData = NULL;
  dump.nData = dump.nCapacity = 0;
}
                   6542: 
/* Implements dump_doclist() for use in inspecting the fts2 index from
** tests.  TEXT result containing a string representation of the
** doclist for the indicated term.  dump_doclist(t, term, level, idx)
** dumps the doclist for term from the segment specified by level, idx
** (in %_segdir), while dump_doclist(t, term) dumps the logical
** doclist for the term across all segments.  The per-segment doclist
** can contain deletions, while the full-index doclist will not
** (deletions are omitted).
**
** Result formats differ with the setting of DL_DEFAULT.  Examples:
**
** DL_DOCIDS: [1] [3] [7]
** DL_POSITIONS: [1 0[0 4] 1[17]] [3 1[5]]
** DL_POSITIONS_OFFSETS: [1 0[0,0,3 4,23,26] 1[17,102,105]] [3 1[5,20,23]]
**
** In each case the number after the outer '[' is the docid.  In the
** latter two cases, the number before the inner '[' is the column
** associated with the values within.  For DL_POSITIONS the numbers
** within are the positions, for DL_POSITIONS_OFFSETS they are the
** position, the start offset, and the end offset.
*/
static void dumpDoclistFunc(
  sqlite3_context *pContext,
  int argc, sqlite3_value **argv
){
  fulltext_cursor *pCursor;
  /* Validate arguments: (t, term) or (t, term, level, idx). */
  if( argc!=2 && argc!=4 ){
    generateError(pContext, "dump_doclist", "incorrect arguments");
  }else if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
            sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
    generateError(pContext, "dump_doclist", "illegal first argument");
  }else if( sqlite3_value_text(argv[1])==NULL ||
            sqlite3_value_text(argv[1])[0]=='\0' ){
    generateError(pContext, "dump_doclist", "empty second argument");
  }else{
    const char *pTerm = (const char *)sqlite3_value_text(argv[1]);
    const int nTerm = strlen(pTerm);
    fulltext_vtab *v;
    int rc;
    DataBuffer doclist;

    /* Recover the cursor pointer smuggled through the BLOB argument. */
    memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
    v = cursor_vtab(pCursor);

    dataBufferInit(&doclist, 0);

    /* termSelect() yields the same logical doclist that queries are
    ** run against.
    */
    if( argc==2 ){
      rc = termSelect(v, v->nColumn, pTerm, nTerm, 0, DL_DEFAULT, &doclist);
    }else{
      sqlite3_stmt *s = NULL;

      /* Get our specific segment's information. */
      rc = sql_get_statement(v, SEGDIR_SELECT_SEGMENT_STMT, &s);
      if( rc==SQLITE_OK ){
        rc = sqlite3_bind_int(s, 1, sqlite3_value_int(argv[2]));
        if( rc==SQLITE_OK ){
          rc = sqlite3_bind_int(s, 2, sqlite3_value_int(argv[3]));
        }
      }

      if( rc==SQLITE_OK ){
        rc = sqlite3_step(s);

        /* No row matched the requested (level, idx). */
        if( rc==SQLITE_DONE ){
          dataBufferDestroy(&doclist);
          generateError(pContext, "dump_doclist", "segment not found");
          return;
        }

        /* Found a segment, load it into doclist. */
        if( rc==SQLITE_ROW ){
          const sqlite_int64 iLeavesEnd = sqlite3_column_int64(s, 1);
          const char *pData = sqlite3_column_blob(s, 2);
          const int nData = sqlite3_column_bytes(s, 2);

          /* loadSegment() is used by termSelect() to load each
          ** segment's data.
          */
          rc = loadSegment(v, pData, nData, iLeavesEnd, pTerm, nTerm, 0,
                           &doclist);
          if( rc==SQLITE_OK ){
            rc = sqlite3_step(s);

            /* Should not have more than one matching segment. */
            if( rc!=SQLITE_DONE ){
              sqlite3_reset(s);
              dataBufferDestroy(&doclist);
              generateError(pContext, "dump_doclist", "invalid segdir");
              return;
            }
            rc = SQLITE_OK;
          }
        }
      }

      sqlite3_reset(s);
    }

    if( rc==SQLITE_OK ){
      if( doclist.nData>0 ){
        createDoclistResult(pContext, doclist.pData, doclist.nData);
      }else{
        /* TODO(shess): This can happen if the term is not present, or
        ** if all instances of the term have been deleted and this is
        ** an all-index dump.  It may be interesting to distinguish
        ** these cases.
        */
        sqlite3_result_text(pContext, "", 0, SQLITE_STATIC);
      }
    }else if( rc==SQLITE_NOMEM ){
      /* Handle out-of-memory cases specially because if they are
      ** generated in fts2 code they may not be reflected in the db
      ** handle.
      */
      /* TODO(shess): Handle this more comprehensively.
      ** sqlite3ErrStr() has what I need, but is internal.
      */
      generateError(pContext, "dump_doclist", "out of memory");
    }else{
      generateError(pContext, "dump_doclist", NULL);
    }

    dataBufferDestroy(&doclist);
  }
}
                   6671: #endif
                   6672: 
                   6673: /*
                   6674: ** This routine implements the xFindFunction method for the FTS2
                   6675: ** virtual table.
                   6676: */
                   6677: static int fulltextFindFunction(
                   6678:   sqlite3_vtab *pVtab,
                   6679:   int nArg,
                   6680:   const char *zName,
                   6681:   void (**pxFunc)(sqlite3_context*,int,sqlite3_value**),
                   6682:   void **ppArg
                   6683: ){
                   6684:   if( strcmp(zName,"snippet")==0 ){
                   6685:     *pxFunc = snippetFunc;
                   6686:     return 1;
                   6687:   }else if( strcmp(zName,"offsets")==0 ){
                   6688:     *pxFunc = snippetOffsetsFunc;
                   6689:     return 1;
                   6690:   }else if( strcmp(zName,"optimize")==0 ){
                   6691:     *pxFunc = optimizeFunc;
                   6692:     return 1;
                   6693: #ifdef SQLITE_TEST
                   6694:     /* NOTE(shess): These functions are present only for testing
                   6695:     ** purposes.  No particular effort is made to optimize their
                   6696:     ** execution or how they build their results.
                   6697:     */
                   6698:   }else if( strcmp(zName,"dump_terms")==0 ){
                   6699:     /* fprintf(stderr, "Found dump_terms\n"); */
                   6700:     *pxFunc = dumpTermsFunc;
                   6701:     return 1;
                   6702:   }else if( strcmp(zName,"dump_doclist")==0 ){
                   6703:     /* fprintf(stderr, "Found dump_doclist\n"); */
                   6704:     *pxFunc = dumpDoclistFunc;
                   6705:     return 1;
                   6706: #endif
                   6707:   }
                   6708:   return 0;
                   6709: }
                   6710: 
                   6711: /*
                   6712: ** Rename an fts2 table.
                   6713: */
                   6714: static int fulltextRename(
                   6715:   sqlite3_vtab *pVtab,
                   6716:   const char *zName
                   6717: ){
                   6718:   fulltext_vtab *p = (fulltext_vtab *)pVtab;
                   6719:   int rc = SQLITE_NOMEM;
                   6720:   char *zSql = sqlite3_mprintf(
                   6721:     "ALTER TABLE %Q.'%q_content'  RENAME TO '%q_content';"
                   6722:     "ALTER TABLE %Q.'%q_segments' RENAME TO '%q_segments';"
                   6723:     "ALTER TABLE %Q.'%q_segdir'   RENAME TO '%q_segdir';"
                   6724:     , p->zDb, p->zName, zName 
                   6725:     , p->zDb, p->zName, zName 
                   6726:     , p->zDb, p->zName, zName
                   6727:   );
                   6728:   if( zSql ){
                   6729:     rc = sqlite3_exec(p->db, zSql, 0, 0, 0);
                   6730:     sqlite3_free(zSql);
                   6731:   }
                   6732:   return rc;
                   6733: }
                   6734: 
/* Virtual table method dispatch table for fts2, passed to the SQLite
** core when the module is registered.  iVersion 0: only the methods
** listed below are present.
*/
static const sqlite3_module fts2Module = {
  /* iVersion      */ 0,
  /* xCreate       */ fulltextCreate,
  /* xConnect      */ fulltextConnect,
  /* xBestIndex    */ fulltextBestIndex,
  /* xDisconnect   */ fulltextDisconnect,
  /* xDestroy      */ fulltextDestroy,
  /* xOpen         */ fulltextOpen,
  /* xClose        */ fulltextClose,
  /* xFilter       */ fulltextFilter,
  /* xNext         */ fulltextNext,
  /* xEof          */ fulltextEof,
  /* xColumn       */ fulltextColumn,
  /* xRowid        */ fulltextRowid,
  /* xUpdate       */ fulltextUpdate,
  /* xBegin        */ fulltextBegin,
  /* xSync         */ fulltextSync,
  /* xCommit       */ fulltextCommit,
  /* xRollback     */ fulltextRollback,
  /* xFindFunction */ fulltextFindFunction,
  /* xRename */       fulltextRename,
};
                   6757: 
                   6758: static void hashDestroy(void *p){
                   6759:   fts2Hash *pHash = (fts2Hash *)p;
                   6760:   sqlite3Fts2HashClear(pHash);
                   6761:   sqlite3_free(pHash);
                   6762: }
                   6763: 
                   6764: /*
                   6765: ** The fts2 built-in tokenizers - "simple" and "porter" - are implemented
                   6766: ** in files fts2_tokenizer1.c and fts2_porter.c respectively. The following
                   6767: ** two forward declarations are for functions declared in these files
                   6768: ** used to retrieve the respective implementations.
                   6769: **
                   6770: ** Calling sqlite3Fts2SimpleTokenizerModule() sets the value pointed
** to by the argument to point at the "simple" tokenizer implementation.
                   6772: ** Function ...PorterTokenizerModule() sets *pModule to point to the
                   6773: ** porter tokenizer/stemmer implementation.
                   6774: */
                   6775: void sqlite3Fts2SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
                   6776: void sqlite3Fts2PorterTokenizerModule(sqlite3_tokenizer_module const**ppModule);
                   6777: void sqlite3Fts2IcuTokenizerModule(sqlite3_tokenizer_module const**ppModule);
                   6778: 
                   6779: int sqlite3Fts2InitHashTable(sqlite3 *, fts2Hash *, const char *);
                   6780: 
                   6781: /*
                   6782: ** Initialise the fts2 extension. If this extension is built as part
                   6783: ** of the sqlite library, then this function is called directly by
                   6784: ** SQLite. If fts2 is built as a dynamically loadable extension, this
                   6785: ** function is called by the sqlite3_extension_init() entry point.
                   6786: */
                   6787: int sqlite3Fts2Init(sqlite3 *db){
                   6788:   int rc = SQLITE_OK;
                   6789:   fts2Hash *pHash = 0;
                   6790:   const sqlite3_tokenizer_module *pSimple = 0;
                   6791:   const sqlite3_tokenizer_module *pPorter = 0;
                   6792:   const sqlite3_tokenizer_module *pIcu = 0;
                   6793: 
                   6794:   sqlite3Fts2SimpleTokenizerModule(&pSimple);
                   6795:   sqlite3Fts2PorterTokenizerModule(&pPorter);
                   6796: #ifdef SQLITE_ENABLE_ICU
                   6797:   sqlite3Fts2IcuTokenizerModule(&pIcu);
                   6798: #endif
                   6799: 
                   6800:   /* Allocate and initialise the hash-table used to store tokenizers. */
                   6801:   pHash = sqlite3_malloc(sizeof(fts2Hash));
                   6802:   if( !pHash ){
                   6803:     rc = SQLITE_NOMEM;
                   6804:   }else{
                   6805:     sqlite3Fts2HashInit(pHash, FTS2_HASH_STRING, 1);
                   6806:   }
                   6807: 
                   6808:   /* Load the built-in tokenizers into the hash table */
                   6809:   if( rc==SQLITE_OK ){
                   6810:     if( sqlite3Fts2HashInsert(pHash, "simple", 7, (void *)pSimple)
                   6811:      || sqlite3Fts2HashInsert(pHash, "porter", 7, (void *)pPorter) 
                   6812:      || (pIcu && sqlite3Fts2HashInsert(pHash, "icu", 4, (void *)pIcu))
                   6813:     ){
                   6814:       rc = SQLITE_NOMEM;
                   6815:     }
                   6816:   }
                   6817: 
                   6818:   /* Create the virtual table wrapper around the hash-table and overload 
                   6819:   ** the two scalar functions. If this is successful, register the
                   6820:   ** module with sqlite.
                   6821:   */
                   6822:   if( SQLITE_OK==rc 
                   6823:    && SQLITE_OK==(rc = sqlite3Fts2InitHashTable(db, pHash, "fts2_tokenizer"))
                   6824:    && SQLITE_OK==(rc = sqlite3_overload_function(db, "snippet", -1))
                   6825:    && SQLITE_OK==(rc = sqlite3_overload_function(db, "offsets", -1))
                   6826:    && SQLITE_OK==(rc = sqlite3_overload_function(db, "optimize", -1))
                   6827: #ifdef SQLITE_TEST
                   6828:    && SQLITE_OK==(rc = sqlite3_overload_function(db, "dump_terms", -1))
                   6829:    && SQLITE_OK==(rc = sqlite3_overload_function(db, "dump_doclist", -1))
                   6830: #endif
                   6831:   ){
                   6832:     return sqlite3_create_module_v2(
                   6833:         db, "fts2", &fts2Module, (void *)pHash, hashDestroy
                   6834:     );
                   6835:   }
                   6836: 
                   6837:   /* An error has occurred. Delete the hash table and return the error code. */
                   6838:   assert( rc!=SQLITE_OK );
                   6839:   if( pHash ){
                   6840:     sqlite3Fts2HashClear(pHash);
                   6841:     sqlite3_free(pHash);
                   6842:   }
                   6843:   return rc;
                   6844: }
                   6845: 
                   6846: #if !SQLITE_CORE
                   6847: int sqlite3_extension_init(
                   6848:   sqlite3 *db, 
                   6849:   char **pzErrMsg,
                   6850:   const sqlite3_api_routines *pApi
                   6851: ){
                   6852:   SQLITE_EXTENSION_INIT2(pApi)
                   6853:   return sqlite3Fts2Init(db);
                   6854: }
                   6855: #endif
                   6856: 
                   6857: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>