File: [ELWIX - Embedded LightWeight unIX] / embedaddon / sqlite3 / ext / fts2 / fts2.c
Revision 1.1.1.1 (vendor branch): Tue Feb 21 17:04:17 2012 UTC by misho
Branches: sqlite3, MAIN
CVS tags: v3_7_10, HEAD
Log message: sqlite3

    1: /* fts2 has a design flaw which can lead to database corruption (see
    2: ** below).  It is recommended not to use it any longer, instead use
    3: ** fts3 (or higher).  If you believe that your use of fts2 is safe,
    4: ** add -DSQLITE_ENABLE_BROKEN_FTS2=1 to your CFLAGS.
    5: */
    6: #if (!defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)) \
    7:         && !defined(SQLITE_ENABLE_BROKEN_FTS2)
    8: #error fts2 has a design flaw and has been deprecated.
    9: #endif
   10: /* The flaw is that fts2 uses the content table's unaliased rowid as
   11: ** the unique docid.  fts2 embeds the rowid in the index it builds,
   12: ** and expects the rowid to not change.  The SQLite VACUUM operation
   13: ** will renumber such rowids, thereby breaking fts2.  If you are using
   14: ** fts2 in a system which has disabled VACUUM, then you can continue
   15: ** to use it safely.  Note that PRAGMA auto_vacuum does NOT disable
   16: ** VACUUM, though systems using auto_vacuum are unlikely to invoke
   17: ** VACUUM.
   18: **
   19: ** Unlike fts1, which is safe across VACUUM if you never delete
   20: ** documents, fts2 has a second exposure to this flaw, in the segments
   21: ** table.  So fts2 should be considered unsafe across VACUUM in all
   22: ** cases.
   23: */
   24: 
   25: /*
   26: ** 2006 Oct 10
   27: **
   28: ** The author disclaims copyright to this source code.  In place of
   29: ** a legal notice, here is a blessing:
   30: **
   31: **    May you do good and not evil.
   32: **    May you find forgiveness for yourself and forgive others.
   33: **    May you share freely, never taking more than you give.
   34: **
   35: ******************************************************************************
   36: **
   37: ** This is an SQLite module implementing full-text search.
   38: */
   39: 
   40: /*
   41: ** The code in this file is only compiled if:
   42: **
   43: **     * The FTS2 module is being built as an extension
   44: **       (in which case SQLITE_CORE is not defined), or
   45: **
   46: **     * The FTS2 module is being built into the core of
   47: **       SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
   48: */
   49: 
   50: /* TODO(shess) Consider exporting this comment to an HTML file or the
   51: ** wiki.
   52: */
   53: /* The full-text index is stored in a series of b+tree (-like)
   54: ** structures called segments which map terms to doclists.  The
   55: ** structures are like b+trees in layout, but are constructed from the
   56: ** bottom up in optimal fashion and are not updatable.  Since trees
   57: ** are built from the bottom up, things will be described from the
   58: ** bottom up.
   59: **
   60: **
   61: **** Varints ****
   62: ** The basic unit of encoding is a variable-length integer called a
   63: ** varint.  We encode variable-length integers in little-endian order
   64: ** using seven bits per byte as follows:
   65: **
   66: ** KEY:
   67: **         A = 0xxxxxxx    7 bits of data and one flag bit
   68: **         B = 1xxxxxxx    7 bits of data and one flag bit
   69: **
   70: **  7 bits - A
   71: ** 14 bits - BA
   72: ** 21 bits - BBA
   73: ** and so on.
   74: **
   75: ** This is identical to how sqlite encodes varints (see util.c).
   76: **
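       ** As a worked example (not from the original sources): the value 300
       ** (binary 100101100) splits into the 7-bit groups 0101100 and 0000010.
       ** The low-order group is stored first, with the continuation bit set
       ** on every byte except the last:
       **
       **     300  ->  0xAC 0x02      (10101100 00000010)
       **
       ** Small values thus cost a single byte, and any 64-bit value fits in
       ** at most ten bytes (VARINT_MAX below).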
   77: **
   78: **** Document lists ****
   79: ** A doclist (document list) holds a docid-sorted list of hits for a
   80: ** given term.  Doclists hold docids, and can optionally associate
   81: ** token positions and offsets with docids.
   82: **
   83: ** A DL_POSITIONS_OFFSETS doclist is stored like this:
   84: **
   85: ** array {
   86: **   varint docid;
   87: **   array {                (position list for column 0)
   88: **     varint position;     (delta from previous position plus POS_BASE)
   89: **     varint startOffset;  (delta from previous startOffset)
   90: **     varint endOffset;    (delta from startOffset)
   91: **   }
   92: **   array {
   93: **     varint POS_COLUMN;   (marks start of position list for new column)
   94: **     varint column;       (index of new column)
   95: **     array {
   96: **       varint position;   (delta from previous position plus POS_BASE)
   97: **       varint startOffset;(delta from previous startOffset)
   98: **       varint endOffset;  (delta from startOffset)
   99: **     }
  100: **   }
  101: **   varint POS_END;        (marks end of positions for this document.)
  102: ** }
  103: **
  104: ** Here, array { X } means zero or more occurrences of X, adjacent in
  105: ** memory.  A "position" is an index of a token in the token stream
  106: ** generated by the tokenizer, while an "offset" is a byte offset,
  107: ** both based at 0.  Note that POS_END and POS_COLUMN occur in the
  108: ** same logical place as the position element, and act as sentinels
  109: ** ending a position list array.
  110: **
  111: ** A DL_POSITIONS doclist omits the startOffset and endOffset
  112: ** information.  A DL_DOCIDS doclist omits both the position and
  113: ** offset information, becoming an array of varint-encoded docids.
  114: **
  115: ** On-disk data is stored as type DL_DEFAULT, so we don't serialize
  116: ** the type.  Due to how deletion is implemented in the segmentation
  117: ** system, on-disk doclists MUST store at least positions.
  118: **
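       ** A hand-worked illustration (using the values POS_END=0, POS_COLUMN=1,
       ** POS_BASE=2 defined later in this file): a term that appears in docid
       ** 5 at positions 1 and 4 of column 0 and position 2 of column 1 yields
       ** this DL_POSITIONS doclist:
       **
       **     05        docid 5 (delta from docid 0)
       **     03        position 1 in column 0  (1-0+POS_BASE)
       **     05        position 4              (4-1+POS_BASE)
       **     01 01     POS_COLUMN, column 1    (positions restart at 0)
       **     04        position 2              (2-0+POS_BASE)
       **     00        POS_END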
  119: **
  120: **** Segment leaf nodes ****
  121: ** Segment leaf nodes store terms and doclists, ordered by term.  Leaf
  122: ** nodes are written using LeafWriter, and read using LeafReader (to
  123: ** iterate through a single leaf node's data) and LeavesReader (to
  124: ** iterate through a segment's entire leaf layer).  Leaf nodes have
  125: ** the format:
  126: **
  127: ** varint iHeight;             (height from leaf level, always 0)
  128: ** varint nTerm;               (length of first term)
  129: ** char pTerm[nTerm];          (content of first term)
  130: ** varint nDoclist;            (length of term's associated doclist)
  131: ** char pDoclist[nDoclist];    (content of doclist)
  132: ** array {
  133: **                             (further terms are delta-encoded)
  134: **   varint nPrefix;           (length of prefix shared with previous term)
  135: **   varint nSuffix;           (length of unshared suffix)
  136: **   char pTermSuffix[nSuffix];(unshared suffix of next term)
  137: **   varint nDoclist;          (length of term's associated doclist)
  138: **   char pDoclist[nDoclist];  (content of doclist)
  139: ** }
  140: **
  141: ** Here, array { X } means zero or more occurrences of X, adjacent in
  142: ** memory.
  143: **
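       ** As a small illustration (not from the original sources), if the
       ** first term is "apple" and the next term is "apply", the second term
       ** is stored as nPrefix=4, nSuffix=1, pTermSuffix="y".
       **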
  144: ** Leaf nodes are broken into blocks which are stored contiguously in
  145: ** the %_segments table in sorted order.  This means that when the end
  146: ** of a node is reached, the next term is in the node with the next
  147: ** greater node id.
  148: **
  149: ** New data is spilled to a new leaf node when the current node
  150: ** exceeds LEAF_MAX bytes (default 2048).  New data which itself is
  151: ** larger than STANDALONE_MIN (default 1024) is placed in a standalone
  152: ** node (a leaf node with a single term and doclist).  The goal of
  153: ** these settings is to pack together groups of small doclists while
  154: ** making it efficient to directly access large doclists.  The
  155: ** assumption is that large doclists represent terms which are more
  156: ** likely to be query targets.
  157: **
  158: ** TODO(shess) It may be useful for blocking decisions to be more
  159: ** dynamic.  For instance, it may make more sense to have a 2.5k leaf
  160: ** node rather than splitting into 2k and .5k nodes.  My intuition is
  161: ** that this might extend through 2x or 4x the pagesize.
  162: **
  163: **
  164: **** Segment interior nodes ****
  165: ** Segment interior nodes store blockids for subtree nodes and terms
  166: ** to describe what data is stored by each subtree.  Interior
  167: ** nodes are written using InteriorWriter, and read using
  168: ** InteriorReader.  InteriorWriters are created as needed when
  169: ** SegmentWriter creates new leaf nodes, or when an interior node
  170: ** itself grows too big and must be split.  The format of interior
  171: ** nodes:
  172: **
  173: ** varint iHeight;           (height from leaf level, always >0)
  174: ** varint iBlockid;          (block id of node's leftmost subtree)
  175: ** optional {
  176: **   varint nTerm;           (length of first term)
  177: **   char pTerm[nTerm];      (content of first term)
  178: **   array {
  179: **                                (further terms are delta-encoded)
  180: **     varint nPrefix;            (length of shared prefix with previous term)
  181: **     varint nSuffix;            (length of unshared suffix)
  182: **     char pTermSuffix[nSuffix]; (unshared suffix of next term)
  183: **   }
  184: ** }
  185: **
  186: ** Here, optional { X } means an optional element, while array { X }
  187: ** means zero or more occurrences of X, adjacent in memory.
  188: **
  189: ** An interior node encodes n terms separating n+1 subtrees.  The
  190: ** subtree blocks are contiguous, so only the first subtree's blockid
  191: ** is encoded.  The subtree at iBlockid will contain all terms less
  192: ** than the first term encoded (or all terms if no term is encoded).
  193: ** Otherwise, for terms greater than or equal to pTerm[i] but less
  194: ** than pTerm[i+1], the subtree for that term will be rooted at
  195: ** iBlockid+i.  Interior nodes only store enough term data to
  196: ** distinguish adjacent children (if the rightmost term of the left
  197: ** child is "something", and the leftmost term of the right child is
  198: ** "wicked", only "w" is stored).
  199: **
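       ** A sketch of the child selection this layout implies (illustrative
       ** only; aTerm[0..n-1] and zQuery are hypothetical names for the
       ** decoded terms and the term being sought):
       **
       **   int iChild = 0;
       **   while( iChild<n && strcmp(aTerm[iChild], zQuery)<=0 ) iChild++;
       **   // descend into the subtree rooted at blockid iBlockid+iChild
       **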
  200: ** New data is spilled to a new interior node at the same height when
  201: ** the current node exceeds INTERIOR_MAX bytes (default 2048).
  202: ** INTERIOR_MIN_TERMS (default 7) keeps large terms from monopolizing
  203: ** interior nodes and making the tree too skinny.  The interior nodes
  204: ** at a given height are naturally tracked by interior nodes at
  205: ** height+1, and so on.
  206: **
  207: **
  208: **** Segment directory ****
  209: ** The segment directory in table %_segdir stores meta-information for
  210: ** merging and deleting segments, and also the root node of the
  211: ** segment's tree.
  212: **
  213: ** The root node is the top node of the segment's tree after encoding
  214: ** the entire segment, restricted to ROOT_MAX bytes (default 1024).
  215: ** This could be either a leaf node or an interior node.  If the top
  216: ** node requires more than ROOT_MAX bytes, it is flushed to %_segments
  217: ** and a new root interior node is generated (which should always fit
  218: ** within ROOT_MAX because it only needs space for 2 varints, the
  219: ** height and the blockid of the previous root).
  220: **
  221: ** The meta-information in the segment directory is:
  222: **   level               - segment level (see below)
  223: **   idx                 - index within level
  224: **                       - (level,idx uniquely identify a segment)
  225: **   start_block         - first leaf node
  226: **   leaves_end_block    - last leaf node
  227: **   end_block           - last block (including interior nodes)
  228: **   root                - contents of root node
  229: **
  230: ** If the root node is a leaf node, then start_block,
  231: ** leaves_end_block, and end_block are all 0.
  232: **
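       ** The corresponding schema is essentially (paraphrasing the CREATE
       ** TABLE statement used elsewhere in this module):
       **
       **   CREATE TABLE %_segdir(
       **     level INTEGER,
       **     idx INTEGER,
       **     start_block INTEGER,
       **     leaves_end_block INTEGER,
       **     end_block INTEGER,
       **     root BLOB,
       **     PRIMARY KEY(level, idx)
       **   );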
  233: **
  234: **** Segment merging ****
  235: ** To amortize update costs, segments are grouped into levels and
  236: ** merged in batches.  Each increase in level represents exponentially
  237: ** more documents.
  238: **
  239: ** New documents (actually, document updates) are tokenized and
  240: ** written individually (using LeafWriter) to a level 0 segment, with
  241: ** incrementing idx.  When idx reaches MERGE_COUNT (default 16), all
  242: ** level 0 segments are merged into a single level 1 segment.  Level 1
  243: ** is populated like level 0, and eventually MERGE_COUNT level 1
  244: ** segments are merged to a single level 2 segment (representing
  245: ** MERGE_COUNT^2 updates), and so on.
  246: **
  247: ** A segment merge traverses all segments at a given level in
  248: ** parallel, performing a straightforward sorted merge.  Since segment
  249: ** leaf nodes are written into the %_segments table in order, this
  250: ** merge traverses the underlying sqlite disk structures efficiently.
  251: ** After the merge, all segment blocks from the merged level are
  252: ** deleted.
  253: **
  254: ** MERGE_COUNT controls how often we merge segments.  16 seems to be
  255: ** somewhat of a sweet spot for insertion performance.  32 and 64 show
  256: ** very similar performance numbers to 16 on insertion, though they're
  257: ** a tiny bit slower (perhaps due to more overhead in merge-time
  258: ** sorting).  8 is about 20% slower than 16, 4 about 50% slower than
  259: ** 16, 2 about 66% slower than 16.
  260: **
  261: ** At query time, high MERGE_COUNT increases the number of segments
  262: ** which need to be scanned and merged.  For instance, with 100k docs
  263: ** inserted:
  264: **
  265: **    MERGE_COUNT   segments
  266: **       16           25
  267: **        8           12
  268: **        4           10
  269: **        2            6
  270: **
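       ** These segment counts are simply the digit sums of 100,000 written
       ** in base MERGE_COUNT, since a level-n segment represents
       ** MERGE_COUNT^n updates.  For MERGE_COUNT 16:
       **
       **     100,000 = 1*16^4 + 8*16^3 + 6*16^2 + 10*16^1 + 0*16^0
       **
       ** giving 1+8+6+10 = 25 live segments.
       **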
  271: ** This appears to have only a moderate impact on queries for very
  272: ** frequent terms (which are somewhat dominated by segment merge
  273: ** costs), and infrequent and non-existent terms still seem to be fast
  274: ** even with many segments.
  275: **
  276: ** TODO(shess) That said, it would be nice to have a better query-side
  277: ** argument for MERGE_COUNT of 16.  Also, it is possible/likely that
  278: ** optimizations to things like doclist merging will swing the sweet
  279: ** spot around.
  280: **
  281: **
  282: **
  283: **** Handling of deletions and updates ****
  284: ** Since we're using a segmented structure, with no docid-oriented
  285: ** index into the term index, we clearly cannot simply update the term
  286: ** index when a document is deleted or updated.  For deletions, we
  287: ** write an empty doclist (varint(docid) varint(POS_END)), for updates
  288: ** we simply write the new doclist.  Segment merges overwrite older
  289: ** data for a particular docid with newer data, so deletes or updates
  290: ** will eventually overtake the earlier data and knock it out.  The
  291: ** query logic likewise merges doclists so that newer data knocks out
  292: ** older data.
  293: **
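       ** As a concrete illustration, deleting docid 42 (where 42 begins its
       ** doclist, so the delta equals the docid) appends the two bytes
       ** 0x2A 0x00, i.e. varint(42) varint(POS_END).
       **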
  294: ** TODO(shess) Provide a VACUUM type operation to clear out all
  295: ** deletions and duplications.  This would basically be a forced merge
  296: ** into a single segment.
  297: */
  298: 
  299: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
  300: 
  301: #if defined(SQLITE_ENABLE_FTS2) && !defined(SQLITE_CORE)
  302: # define SQLITE_CORE 1
  303: #endif
  304: 
  305: #include <assert.h>
  306: #include <stdlib.h>
  307: #include <stdio.h>
  308: #include <string.h>
  309: #include "fts2.h"
  310: #include "fts2_hash.h"
  311: #include "fts2_tokenizer.h"
  312: #include "sqlite3.h"
  313: #include "sqlite3ext.h"
  314: SQLITE_EXTENSION_INIT1
  315: 
  316: 
  317: /* TODO(shess) MAN, this thing needs some refactoring.  At minimum, it
  318: ** would be nice to order the file better, perhaps something along the
  319: ** lines of:
  320: **
  321: **  - utility functions
  322: **  - table setup functions
  323: **  - table update functions
  324: **  - table query functions
  325: **
  326: ** Put the query functions last because they're likely to reference
  327: ** typedefs or functions from the table update section.
  328: */
  329: 
  330: #if 0
  331: # define TRACE(A)  printf A; fflush(stdout)
  332: #else
  333: # define TRACE(A)
  334: #endif
  335: 
  336: /* It is not safe to call isspace(), tolower(), or isalnum() on
  337: ** hi-bit-set characters.  This is the same solution used in the
  338: ** tokenizer.
  339: */
  340: /* TODO(shess) The snippet-generation code should be using the
  341: ** tokenizer-generated tokens rather than doing its own local
  342: ** tokenization.
  343: */
  344: /* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
  345: static int safe_isspace(char c){
  346:   return c==' ' || c=='\t' || c=='\n' || c=='\r' || c=='\v' || c=='\f';
  347: }
  348: static int safe_tolower(char c){
  349:   return (c>='A' && c<='Z') ? (c - 'A' + 'a') : c;
  350: }
  351: static int safe_isalnum(char c){
  352:   return (c>='0' && c<='9') || (c>='A' && c<='Z') || (c>='a' && c<='z');
  353: }
  354: 
  355: typedef enum DocListType {
  356:   DL_DOCIDS,              /* docids only */
  357:   DL_POSITIONS,           /* docids + positions */
  358:   DL_POSITIONS_OFFSETS    /* docids + positions + offsets */
  359: } DocListType;
  360: 
  361: /*
  362: ** By default, only positions and not offsets are stored in the doclists.
  363: ** To change this so that offsets are stored too, compile with
  364: **
  365: **          -DDL_DEFAULT=DL_POSITIONS_OFFSETS
  366: **
  367: ** If DL_DEFAULT is set to DL_DOCIDS, your table can only be inserted
  368: ** into (no deletes or updates).
  369: */
  370: #ifndef DL_DEFAULT
  371: # define DL_DEFAULT DL_POSITIONS
  372: #endif
  373: 
  374: enum {
  375:   POS_END = 0,        /* end of this position list */
  376:   POS_COLUMN,         /* followed by new column number */
  377:   POS_BASE
  378: };
  379: 
  380: /* MERGE_COUNT controls how often we merge segments (see comment at
  381: ** top of file).
  382: */
  383: #define MERGE_COUNT 16
  384: 
  385: /* utility functions */
  386: 
  387: /* CLEAR() and SCRAMBLE() abstract memset() on a pointer to a single
  388: ** record to prevent errors of the form:
  389: **
  390: ** my_function(SomeType *b){
  391: **   memset(b, '\0', sizeof(b));  // sizeof(b)!=sizeof(*b)
  392: ** }
  393: */
  394: /* TODO(shess) Obvious candidates for a header file. */
  395: #define CLEAR(b) memset(b, '\0', sizeof(*(b)))
  396: 
  397: #ifndef NDEBUG
  398: #  define SCRAMBLE(b) memset(b, 0x55, sizeof(*(b)))
  399: #else
  400: #  define SCRAMBLE(b)
  401: #endif
  402: 
  403: /* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */
  404: #define VARINT_MAX 10
  405: 
  406: /* Write a 64-bit variable-length integer to memory starting at p[0].
  407:  * The length of data written will be between 1 and VARINT_MAX bytes.
  408:  * The number of bytes written is returned. */
  409: static int putVarint(char *p, sqlite_int64 v){
  410:   unsigned char *q = (unsigned char *) p;
  411:   sqlite_uint64 vu = v;
  412:   do{
  413:     *q++ = (unsigned char) ((vu & 0x7f) | 0x80);
  414:     vu >>= 7;
  415:   }while( vu!=0 );
  416:   q[-1] &= 0x7f;  /* turn off high bit in final byte */
  417:   assert( q - (unsigned char *)p <= VARINT_MAX );
  418:   return (int) (q - (unsigned char *)p);
  419: }
  420: 
  421: /* Read a 64-bit variable-length integer from memory starting at p[0].
  422:  * Return the number of bytes read, or 0 on error.
  423:  * The value is stored in *v. */
  424: static int getVarint(const char *p, sqlite_int64 *v){
  425:   const unsigned char *q = (const unsigned char *) p;
  426:   sqlite_uint64 x = 0, y = 1;
  427:   while( (*q & 0x80) == 0x80 ){
  428:     x += y * (*q++ & 0x7f);
  429:     y <<= 7;
  430:     if( q - (unsigned char *)p >= VARINT_MAX ){  /* bad data */
  431:       assert( 0 );
  432:       return 0;
  433:     }
  434:   }
  435:   x += y * (*q++);
  436:   *v = (sqlite_int64) x;
  437:   return (int) (q - (unsigned char *)p);
  438: }
  439: 
  440: static int getVarint32(const char *p, int *pi){
  441:  sqlite_int64 i;
  442:  int ret = getVarint(p, &i);
  443:  *pi = (int) i;
  444:  assert( *pi==i );
  445:  return ret;
  446: }
  447: 
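       /* Example round trip (illustrative only):
       **
       **   char buf[VARINT_MAX];
       **   sqlite_int64 v;
       **   int nWrote = putVarint(buf, 300);  // nWrote==2; buf holds 0xAC 0x02
       **   int nRead = getVarint(buf, &v);    // nRead==2; v==300
       **   assert( nWrote==nRead && v==300 );
       */
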
  448: /*******************************************************************/
  449: /* DataBuffer is used to collect data into a buffer in piecemeal
  450: ** fashion.  It implements the usual distinction between amount of
  451: ** data currently stored (nData) and buffer capacity (nCapacity).
  452: **
  453: ** dataBufferInit - create a buffer with given initial capacity.
  454: ** dataBufferReset - forget buffer's data, retaining capacity.
  455: ** dataBufferDestroy - free buffer's data.
  456: ** dataBufferSwap - swap contents of two buffers.
  457: ** dataBufferExpand - expand capacity without adding data.
  458: ** dataBufferAppend - append data.
  459: ** dataBufferAppend2 - append two pieces of data at once.
  460: ** dataBufferReplace - replace buffer's data.
  461: */
  462: typedef struct DataBuffer {
  463:   char *pData;          /* Pointer to malloc'ed buffer. */
  464:   int nCapacity;        /* Size of pData buffer. */
  465:   int nData;            /* End of data loaded into pData. */
  466: } DataBuffer;
  467: 
  468: static void dataBufferInit(DataBuffer *pBuffer, int nCapacity){
  469:   assert( nCapacity>=0 );
  470:   pBuffer->nData = 0;
  471:   pBuffer->nCapacity = nCapacity;
  472:   pBuffer->pData = nCapacity==0 ? NULL : sqlite3_malloc(nCapacity);
  473: }
  474: static void dataBufferReset(DataBuffer *pBuffer){
  475:   pBuffer->nData = 0;
  476: }
  477: static void dataBufferDestroy(DataBuffer *pBuffer){
  478:   if( pBuffer->pData!=NULL ) sqlite3_free(pBuffer->pData);
  479:   SCRAMBLE(pBuffer);
  480: }
  481: static void dataBufferSwap(DataBuffer *pBuffer1, DataBuffer *pBuffer2){
  482:   DataBuffer tmp = *pBuffer1;
  483:   *pBuffer1 = *pBuffer2;
  484:   *pBuffer2 = tmp;
  485: }
  486: static void dataBufferExpand(DataBuffer *pBuffer, int nAddCapacity){
  487:   assert( nAddCapacity>0 );
  488:   /* TODO(shess) Consider expanding more aggressively.  Note that the
  489:   ** underlying malloc implementation may take care of such things for
  490:   ** us already.
  491:   */
  492:   if( pBuffer->nData+nAddCapacity>pBuffer->nCapacity ){
  493:     pBuffer->nCapacity = pBuffer->nData+nAddCapacity;
  494:     pBuffer->pData = sqlite3_realloc(pBuffer->pData, pBuffer->nCapacity);
  495:   }
  496: }
  497: static void dataBufferAppend(DataBuffer *pBuffer,
  498:                              const char *pSource, int nSource){
  499:   assert( nSource>0 && pSource!=NULL );
  500:   dataBufferExpand(pBuffer, nSource);
  501:   memcpy(pBuffer->pData+pBuffer->nData, pSource, nSource);
  502:   pBuffer->nData += nSource;
  503: }
  504: static void dataBufferAppend2(DataBuffer *pBuffer,
  505:                               const char *pSource1, int nSource1,
  506:                               const char *pSource2, int nSource2){
  507:   assert( nSource1>0 && pSource1!=NULL );
  508:   assert( nSource2>0 && pSource2!=NULL );
  509:   dataBufferExpand(pBuffer, nSource1+nSource2);
  510:   memcpy(pBuffer->pData+pBuffer->nData, pSource1, nSource1);
  511:   memcpy(pBuffer->pData+pBuffer->nData+nSource1, pSource2, nSource2);
  512:   pBuffer->nData += nSource1+nSource2;
  513: }
  514: static void dataBufferReplace(DataBuffer *pBuffer,
  515:                               const char *pSource, int nSource){
  516:   dataBufferReset(pBuffer);
  517:   dataBufferAppend(pBuffer, pSource, nSource);
  518: }
  519: 
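       /* Typical usage (a sketch, not code from this file):
       **
       **   DataBuffer buf;
       **   dataBufferInit(&buf, 0);                   // start empty
       **   dataBufferAppend(&buf, "abc", 3);          // buf.nData==3
       **   dataBufferAppend2(&buf, "de", 2, "f", 1);  // buf.nData==6
       **   // ... use buf.pData / buf.nData ...
       **   dataBufferDestroy(&buf);
       */
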
  520: /* StringBuffer is a null-terminated version of DataBuffer. */
  521: typedef struct StringBuffer {
  522:   DataBuffer b;            /* Includes null terminator. */
  523: } StringBuffer;
  524: 
  525: static void initStringBuffer(StringBuffer *sb){
  526:   dataBufferInit(&sb->b, 100);
  527:   dataBufferReplace(&sb->b, "", 1);
  528: }
  529: static int stringBufferLength(StringBuffer *sb){
  530:   return sb->b.nData-1;
  531: }
  532: static char *stringBufferData(StringBuffer *sb){
  533:   return sb->b.pData;
  534: }
  535: static void stringBufferDestroy(StringBuffer *sb){
  536:   dataBufferDestroy(&sb->b);
  537: }
  538: 
  539: static void nappend(StringBuffer *sb, const char *zFrom, int nFrom){
  540:   assert( sb->b.nData>0 );
  541:   if( nFrom>0 ){
  542:     sb->b.nData--;
  543:     dataBufferAppend2(&sb->b, zFrom, nFrom, "", 1);
  544:   }
  545: }
  546: static void append(StringBuffer *sb, const char *zFrom){
  547:   nappend(sb, zFrom, strlen(zFrom));
  548: }
  549: 
  550: /* Append a list of strings separated by commas. */
  551: static void appendList(StringBuffer *sb, int nString, char **azString){
  552:   int i;
  553:   for(i=0; i<nString; ++i){
  554:     if( i>0 ) append(sb, ", ");
  555:     append(sb, azString[i]);
  556:   }
  557: }
  558: 
  559: static int endsInWhiteSpace(StringBuffer *p){
  560:   return stringBufferLength(p)>0 &&
  561:     safe_isspace(stringBufferData(p)[stringBufferLength(p)-1]);
  562: }
  563: 
  564: /* If the StringBuffer ends in something other than white space, add a
  565: ** single space character to the end.
  566: */
  567: static void appendWhiteSpace(StringBuffer *p){
  568:   if( stringBufferLength(p)==0 ) return;
  569:   if( !endsInWhiteSpace(p) ) append(p, " ");
  570: }
  571: 
  572: /* Remove white space from the end of the StringBuffer */
  573: static void trimWhiteSpace(StringBuffer *p){
  574:   while( endsInWhiteSpace(p) ){
  575:     p->b.pData[--p->b.nData-1] = '\0';
  576:   }
  577: }
  578: 
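       /* Example (illustrative; nColumn and azColumn are hypothetical
       ** inputs):
       **
       **   StringBuffer sb;
       **   initStringBuffer(&sb);
       **   append(&sb, "create table t(");
       **   appendList(&sb, nColumn, azColumn);   // "c0, c1, ..."
       **   append(&sb, ")");
       **   // ... use stringBufferData(&sb) / stringBufferLength(&sb) ...
       **   stringBufferDestroy(&sb);
       */
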
  579: /*******************************************************************/
  580: /* DLReader is used to read document elements from a doclist.  The
  581: ** current docid is cached, so dlrDocid() is fast.  DLReader does not
  582: ** own the doclist buffer.
  583: **
  584: ** dlrAtEnd - true if there's no more data to read.
  585: ** dlrDocid - docid of current document.
  586: ** dlrDocData - doclist data for current document (including docid).
  587: ** dlrDocDataBytes - length of same.
  588: ** dlrAllDataBytes - length of all remaining data.
  589: ** dlrPosData - position data for current document.
  590: ** dlrPosDataLen - length of pos data for current document (incl POS_END).
  591: ** dlrStep - step to current document.
  592: ** dlrInit - initialize for a doclist of given type against given data.
  593: ** dlrDestroy - clean up.
  594: **
  595: ** Expected usage is something like:
  596: **
  597: **   DLReader reader;
  598: **   dlrInit(&reader, DL_DEFAULT, pData, nData);
  599: **   while( !dlrAtEnd(&reader) ){
  600: **     // calls to dlrDocid() and kin.
  601: **     dlrStep(&reader);
  602: **   }
  603: **   dlrDestroy(&reader);
  604: */
  605: typedef struct DLReader {
  606:   DocListType iType;
  607:   const char *pData;
  608:   int nData;
  609: 
  610:   sqlite_int64 iDocid;
  611:   int nElement;
  612: } DLReader;
  613: 
  614: static int dlrAtEnd(DLReader *pReader){
  615:   assert( pReader->nData>=0 );
  616:   return pReader->nData==0;
  617: }
  618: static sqlite_int64 dlrDocid(DLReader *pReader){
  619:   assert( !dlrAtEnd(pReader) );
  620:   return pReader->iDocid;
  621: }
  622: static const char *dlrDocData(DLReader *pReader){
  623:   assert( !dlrAtEnd(pReader) );
  624:   return pReader->pData;
  625: }
  626: static int dlrDocDataBytes(DLReader *pReader){
  627:   assert( !dlrAtEnd(pReader) );
  628:   return pReader->nElement;
  629: }
  630: static int dlrAllDataBytes(DLReader *pReader){
  631:   assert( !dlrAtEnd(pReader) );
  632:   return pReader->nData;
  633: }
  634: /* TODO(shess) Consider adding a field to track iDocid varint length
  635: ** to make these two functions faster.  This might matter (a tiny bit)
  636: ** for queries.
  637: */
  638: static const char *dlrPosData(DLReader *pReader){
  639:   sqlite_int64 iDummy;
  640:   int n = getVarint(pReader->pData, &iDummy);
  641:   assert( !dlrAtEnd(pReader) );
  642:   return pReader->pData+n;
  643: }
  644: static int dlrPosDataLen(DLReader *pReader){
  645:   sqlite_int64 iDummy;
  646:   int n = getVarint(pReader->pData, &iDummy);
  647:   assert( !dlrAtEnd(pReader) );
  648:   return pReader->nElement-n;
  649: }
  650: static void dlrStep(DLReader *pReader){
  651:   assert( !dlrAtEnd(pReader) );
  652: 
  653:   /* Skip past current doclist element. */
  654:   assert( pReader->nElement<=pReader->nData );
  655:   pReader->pData += pReader->nElement;
  656:   pReader->nData -= pReader->nElement;
  657: 
  658:   /* If there is more data, read the next doclist element. */
  659:   if( pReader->nData!=0 ){
  660:     sqlite_int64 iDocidDelta;
  661:     int iDummy, n = getVarint(pReader->pData, &iDocidDelta);
  662:     pReader->iDocid += iDocidDelta;
  663:     if( pReader->iType>=DL_POSITIONS ){
  664:       assert( n<pReader->nData );
  665:       while( 1 ){
  666:         n += getVarint32(pReader->pData+n, &iDummy);
  667:         assert( n<=pReader->nData );
  668:         if( iDummy==POS_END ) break;
  669:         if( iDummy==POS_COLUMN ){
  670:           n += getVarint32(pReader->pData+n, &iDummy);
  671:           assert( n<pReader->nData );
  672:         }else if( pReader->iType==DL_POSITIONS_OFFSETS ){
  673:           n += getVarint32(pReader->pData+n, &iDummy);
  674:           n += getVarint32(pReader->pData+n, &iDummy);
  675:           assert( n<pReader->nData );
  676:         }
  677:       }
  678:     }
  679:     pReader->nElement = n;
  680:     assert( pReader->nElement<=pReader->nData );
  681:   }
  682: }
  683: static void dlrInit(DLReader *pReader, DocListType iType,
  684:                     const char *pData, int nData){
  685:   assert( pData!=NULL && nData!=0 );
  686:   pReader->iType = iType;
  687:   pReader->pData = pData;
  688:   pReader->nData = nData;
  689:   pReader->nElement = 0;
  690:   pReader->iDocid = 0;
  691: 
  692:   /* Load the first element's data.  There must be a first element. */
  693:   dlrStep(pReader);
  694: }
  695: static void dlrDestroy(DLReader *pReader){
  696:   SCRAMBLE(pReader);
  697: }
  698: 
  699: #ifndef NDEBUG
  700: /* Verify that the doclist can be validly decoded.  Also returns the
  701: ** last docid found because it is convenient in other assertions for
  702: ** DLWriter.
  703: */
  704: static void docListValidate(DocListType iType, const char *pData, int nData,
  705:                             sqlite_int64 *pLastDocid){
  706:   sqlite_int64 iPrevDocid = 0;
  707:   assert( nData>0 );
  708:   assert( pData!=0 );
  709:   assert( pData+nData>pData );
  710:   while( nData!=0 ){
  711:     sqlite_int64 iDocidDelta;
  712:     int n = getVarint(pData, &iDocidDelta);
  713:     iPrevDocid += iDocidDelta;
  714:     if( iType>DL_DOCIDS ){
  715:       int iDummy;
  716:       while( 1 ){
  717:         n += getVarint32(pData+n, &iDummy);
  718:         if( iDummy==POS_END ) break;
  719:         if( iDummy==POS_COLUMN ){
  720:           n += getVarint32(pData+n, &iDummy);
  721:         }else if( iType>DL_POSITIONS ){
  722:           n += getVarint32(pData+n, &iDummy);
  723:           n += getVarint32(pData+n, &iDummy);
  724:         }
  725:         assert( n<=nData );
  726:       }
  727:     }
  728:     assert( n<=nData );
  729:     pData += n;
  730:     nData -= n;
  731:   }
  732:   if( pLastDocid ) *pLastDocid = iPrevDocid;
  733: }
  734: #define ASSERT_VALID_DOCLIST(i, p, n, o) docListValidate(i, p, n, o)
  735: #else
  736: #define ASSERT_VALID_DOCLIST(i, p, n, o) assert( 1 )
  737: #endif
  738: 
  739: /*******************************************************************/
  740: /* DLWriter is used to write doclist data to a DataBuffer.  DLWriter
  741: ** always appends to the buffer and does not own it.
  742: **
  743: ** dlwInit - initialize to write a given type doclist to a buffer.
  744: ** dlwDestroy - clear the writer's memory.  Does not free buffer.
  745: ** dlwAppend - append raw doclist data to buffer.
  746: ** dlwCopy - copy next doclist from reader to writer.
  747: ** dlwAdd - construct doclist element and append to buffer.
  748: **    Only apply dlwAdd() to DL_DOCIDS doclists (else use PLWriter).
  749: */
  750: typedef struct DLWriter {
  751:   DocListType iType;
  752:   DataBuffer *b;
  753:   sqlite_int64 iPrevDocid;
  754: #ifndef NDEBUG
  755:   int has_iPrevDocid;
  756: #endif
  757: } DLWriter;
  758: 
  759: static void dlwInit(DLWriter *pWriter, DocListType iType, DataBuffer *b){
  760:   pWriter->b = b;
  761:   pWriter->iType = iType;
  762:   pWriter->iPrevDocid = 0;
  763: #ifndef NDEBUG
  764:   pWriter->has_iPrevDocid = 0;
  765: #endif
  766: }
  767: static void dlwDestroy(DLWriter *pWriter){
  768:   SCRAMBLE(pWriter);
  769: }
  770: /* iFirstDocid is the first docid in the doclist in pData.  It is
  771: ** needed because pData may point within a larger doclist, in which
  772: ** case the first item would be delta-encoded.
  773: **
  774: ** iLastDocid is the final docid in the doclist in pData.  It is
  775: ** needed to create the new iPrevDocid for future delta-encoding.  The
  776: ** code could decode the passed doclist to recreate iLastDocid, but
  777: ** the only current user (docListMerge) already has decoded this
  778: ** information.
  779: */
  780: /* TODO(shess) This has become just a helper for docListMerge.
  781: ** Consider a refactor to make this cleaner.
  782: */
  783: static void dlwAppend(DLWriter *pWriter,
  784:                       const char *pData, int nData,
  785:                       sqlite_int64 iFirstDocid, sqlite_int64 iLastDocid){
  786:   sqlite_int64 iDocid = 0;
  787:   char c[VARINT_MAX];
  788:   int nFirstOld, nFirstNew;     /* Old and new varint len of first docid. */
  789: #ifndef NDEBUG
  790:   sqlite_int64 iLastDocidDelta;
  791: #endif
  792: 
  793:   /* Recode the initial docid as delta from iPrevDocid. */
  794:   nFirstOld = getVarint(pData, &iDocid);
  795:   assert( nFirstOld<nData || (nFirstOld==nData && pWriter->iType==DL_DOCIDS) );
  796:   nFirstNew = putVarint(c, iFirstDocid-pWriter->iPrevDocid);
  797: 
  798:   /* Verify that the incoming doclist is valid AND that it ends with
  799:   ** the expected docid.  This is essential because we'll trust this
  800:   ** docid in future delta-encoding.
  801:   */
  802:   ASSERT_VALID_DOCLIST(pWriter->iType, pData, nData, &iLastDocidDelta);
  803:   assert( iLastDocid==iFirstDocid-iDocid+iLastDocidDelta );
  804: 
  805:   /* Append recoded initial docid and everything else.  Rest of docids
  806:   ** should have been delta-encoded from previous initial docid.
  807:   */
  808:   if( nFirstOld<nData ){
  809:     dataBufferAppend2(pWriter->b, c, nFirstNew,
  810:                       pData+nFirstOld, nData-nFirstOld);
  811:   }else{
  812:     dataBufferAppend(pWriter->b, c, nFirstNew);
  813:   }
  814:   pWriter->iPrevDocid = iLastDocid;
  815: }
  816: static void dlwCopy(DLWriter *pWriter, DLReader *pReader){
  817:   dlwAppend(pWriter, dlrDocData(pReader), dlrDocDataBytes(pReader),
  818:             dlrDocid(pReader), dlrDocid(pReader));
  819: }
  820: static void dlwAdd(DLWriter *pWriter, sqlite_int64 iDocid){
  821:   char c[VARINT_MAX];
  822:   int n = putVarint(c, iDocid-pWriter->iPrevDocid);
  823: 
  824:   /* Docids must ascend. */
  825:   assert( !pWriter->has_iPrevDocid || iDocid>pWriter->iPrevDocid );
  826:   assert( pWriter->iType==DL_DOCIDS );
  827: 
  828:   dataBufferAppend(pWriter->b, c, n);
  829:   pWriter->iPrevDocid = iDocid;
  830: #ifndef NDEBUG
  831:   pWriter->has_iPrevDocid = 1;
  832: #endif
  833: }
  834: 
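       /* Example (illustrative): writing a DL_DOCIDS doclist for docids 5
       ** and 9 into a DataBuffer:
       **
       **   DataBuffer buf;
       **   DLWriter writer;
       **   dataBufferInit(&buf, 0);
       **   dlwInit(&writer, DL_DOCIDS, &buf);
       **   dlwAdd(&writer, 5);      // appends varint(5)
       **   dlwAdd(&writer, 9);      // appends varint(4), the delta from 5
       **   dlwDestroy(&writer);
       **   // buf now holds the two bytes 0x05 0x04
       **   dataBufferDestroy(&buf);
       */
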
  835: /*******************************************************************/
  836: /* PLReader is used to read data from a document's position list.  As
  837: ** the caller steps through the list, data is cached so that varints
  838: ** only need to be decoded once.
  839: **
  840: ** plrInit, plrDestroy - create/destroy a reader.
  841: ** plrColumn, plrPosition, plrStartOffset, plrEndOffset - accessors
  842: ** plrAtEnd - at end of stream, only call plrDestroy once true.
  843: ** plrStep - step to the next element.
  844: */
  845: typedef struct PLReader {
  846:   /* These refer to the next position's data.  nData will reach 0 when
  847:   ** reading the last position, so plrStep() signals EOF by setting
  848:   ** pData to NULL.
  849:   */
  850:   const char *pData;
  851:   int nData;
  852: 
  853:   DocListType iType;
  854:   int iColumn;         /* the last column read */
  855:   int iPosition;       /* the last position read */
  856:   int iStartOffset;    /* the last start offset read */
  857:   int iEndOffset;      /* the last end offset read */
  858: } PLReader;
  859: 
  860: static int plrAtEnd(PLReader *pReader){
  861:   return pReader->pData==NULL;
  862: }
  863: static int plrColumn(PLReader *pReader){
  864:   assert( !plrAtEnd(pReader) );
  865:   return pReader->iColumn;
  866: }
  867: static int plrPosition(PLReader *pReader){
  868:   assert( !plrAtEnd(pReader) );
  869:   return pReader->iPosition;
  870: }
  871: static int plrStartOffset(PLReader *pReader){
  872:   assert( !plrAtEnd(pReader) );
  873:   return pReader->iStartOffset;
  874: }
  875: static int plrEndOffset(PLReader *pReader){
  876:   assert( !plrAtEnd(pReader) );
  877:   return pReader->iEndOffset;
  878: }
  879: static void plrStep(PLReader *pReader){
  880:   int i, n;
  881: 
  882:   assert( !plrAtEnd(pReader) );
  883: 
  884:   if( pReader->nData==0 ){
  885:     pReader->pData = NULL;
  886:     return;
  887:   }
  888: 
  889:   n = getVarint32(pReader->pData, &i);
  890:   if( i==POS_COLUMN ){
  891:     n += getVarint32(pReader->pData+n, &pReader->iColumn);
  892:     pReader->iPosition = 0;
  893:     pReader->iStartOffset = 0;
  894:     n += getVarint32(pReader->pData+n, &i);
  895:   }
  896:   /* Should never see adjacent column changes. */
  897:   assert( i!=POS_COLUMN );
  898: 
  899:   if( i==POS_END ){
  900:     pReader->nData = 0;
  901:     pReader->pData = NULL;
  902:     return;
  903:   }
  904: 
  905:   pReader->iPosition += i-POS_BASE;
  906:   if( pReader->iType==DL_POSITIONS_OFFSETS ){
  907:     n += getVarint32(pReader->pData+n, &i);
  908:     pReader->iStartOffset += i;
  909:     n += getVarint32(pReader->pData+n, &i);
  910:     pReader->iEndOffset = pReader->iStartOffset+i;
  911:   }
  912:   assert( n<=pReader->nData );
  913:   pReader->pData += n;
  914:   pReader->nData -= n;
  915: }
  916: 
  917: static void plrInit(PLReader *pReader, DLReader *pDLReader){
  918:   pReader->pData = dlrPosData(pDLReader);
  919:   pReader->nData = dlrPosDataLen(pDLReader);
  920:   pReader->iType = pDLReader->iType;
  921:   pReader->iColumn = 0;
  922:   pReader->iPosition = 0;
  923:   pReader->iStartOffset = 0;
  924:   pReader->iEndOffset = 0;
  925:   plrStep(pReader);
  926: }
  927: static void plrDestroy(PLReader *pReader){
  928:   SCRAMBLE(pReader);
  929: }
  930: 
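       /* Expected usage, mirroring the DLReader idiom above (illustrative):
       **
       **   PLReader plReader;
       **   plrInit(&plReader, pDLReader);
       **   while( !plrAtEnd(&plReader) ){
       **     // calls to plrColumn(), plrPosition(), and kin.
       **     plrStep(&plReader);
       **   }
       **   plrDestroy(&plReader);
       */
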
  931: /*******************************************************************/
  932: /* PLWriter is used in constructing a document's position list.  As a
  933: ** convenience, if iType is DL_DOCIDS, PLWriter becomes a no-op.
  934: ** PLWriter writes to the associated DLWriter's buffer.
  935: **
  936: ** plwInit - init for writing a document's poslist.
  937: ** plwDestroy - clear a writer.
  938: ** plwAdd - append position and offset information.
  939: ** plwCopy - copy next position's data from reader to writer.
  940: ** plwTerminate - add any necessary doclist terminator.
  941: **
  942: ** Calling plwAdd() after plwTerminate() may result in a corrupt
  943: ** doclist.
  944: */
  945: /* TODO(shess) Until we've written the second item, we can cache the
  946: ** first item's information.  Then we'd have three states:
  947: **
  948: ** - initialized with docid, no positions.
  949: ** - docid and one position.
  950: ** - docid and multiple positions.
  951: **
  952: ** Only the last state needs to actually write to dlw->b, which would
  953: ** be an improvement in the DLCollector case.
  954: */
  955: typedef struct PLWriter {
  956:   DLWriter *dlw;
  957: 
  958:   int iColumn;    /* the last column written */
  959:   int iPos;       /* the last position written */
  960:   int iOffset;    /* the last start offset written */
  961: } PLWriter;
  962: 
  963: /* TODO(shess) In the case where the parent is reading these values
  964: ** from a PLReader, we could optimize to a copy if that PLReader has
  965: ** the same type as pWriter.
  966: */
  967: static void plwAdd(PLWriter *pWriter, int iColumn, int iPos,
  968:                    int iStartOffset, int iEndOffset){
  969:   /* Worst-case space for POS_COLUMN, iColumn, iPosDelta,
  970:   ** iStartOffsetDelta, and iEndOffsetDelta.
  971:   */
  972:   char c[5*VARINT_MAX];
  973:   int n = 0;
  974: 
  975:   /* Ban plwAdd() after plwTerminate(). */
  976:   assert( pWriter->iPos!=-1 );
  977: 
  978:   if( pWriter->dlw->iType==DL_DOCIDS ) return;
  979: 
  980:   if( iColumn!=pWriter->iColumn ){
  981:     n += putVarint(c+n, POS_COLUMN);
  982:     n += putVarint(c+n, iColumn);
  983:     pWriter->iColumn = iColumn;
  984:     pWriter->iPos = 0;
  985:     pWriter->iOffset = 0;
  986:   }
  987:   assert( iPos>=pWriter->iPos );
  988:   n += putVarint(c+n, POS_BASE+(iPos-pWriter->iPos));
  989:   pWriter->iPos = iPos;
  990:   if( pWriter->dlw->iType==DL_POSITIONS_OFFSETS ){
  991:     assert( iStartOffset>=pWriter->iOffset );
  992:     n += putVarint(c+n, iStartOffset-pWriter->iOffset);
  993:     pWriter->iOffset = iStartOffset;
  994:     assert( iEndOffset>=iStartOffset );
  995:     n += putVarint(c+n, iEndOffset-iStartOffset);
  996:   }
  997:   dataBufferAppend(pWriter->dlw->b, c, n);
  998: }
  999: static void plwCopy(PLWriter *pWriter, PLReader *pReader){
 1000:   plwAdd(pWriter, plrColumn(pReader), plrPosition(pReader),
 1001:          plrStartOffset(pReader), plrEndOffset(pReader));
 1002: }
 1003: static void plwInit(PLWriter *pWriter, DLWriter *dlw, sqlite_int64 iDocid){
 1004:   char c[VARINT_MAX];
 1005:   int n;
 1006: 
 1007:   pWriter->dlw = dlw;
 1008: 
 1009:   /* Docids must ascend. */
 1010:   assert( !pWriter->dlw->has_iPrevDocid || iDocid>pWriter->dlw->iPrevDocid );
 1011:   n = putVarint(c, iDocid-pWriter->dlw->iPrevDocid);
 1012:   dataBufferAppend(pWriter->dlw->b, c, n);
 1013:   pWriter->dlw->iPrevDocid = iDocid;
 1014: #ifndef NDEBUG
 1015:   pWriter->dlw->has_iPrevDocid = 1;
 1016: #endif
 1017: 
 1018:   pWriter->iColumn = 0;
 1019:   pWriter->iPos = 0;
 1020:   pWriter->iOffset = 0;
 1021: }
 1022: /* TODO(shess) Should plwDestroy() also terminate the doclist?  But
 1023: ** then plwDestroy() would no longer be just a destructor, it would
 1024: ** also be doing work, which isn't consistent with the overall idiom.
 1025: ** Another option would be for plwAdd() to always append any necessary
 1026: ** terminator, so that the output is always correct.  But that would
 1027: ** add incremental work to the common case with the only benefit being
 1028: ** API elegance.  Punt for now.
 1029: */
 1030: static void plwTerminate(PLWriter *pWriter){
 1031:   if( pWriter->dlw->iType>DL_DOCIDS ){
 1032:     char c[VARINT_MAX];
 1033:     int n = putVarint(c, POS_END);
 1034:     dataBufferAppend(pWriter->dlw->b, c, n);
 1035:   }
 1036: #ifndef NDEBUG
 1037:   /* Mark as terminated for assert in plwAdd(). */
 1038:   pWriter->iPos = -1;
 1039: #endif
 1040: }
 1041: static void plwDestroy(PLWriter *pWriter){
 1042:   SCRAMBLE(pWriter);
 1043: }
 1044: 
 1045: /*******************************************************************/
 1046: /* DLCollector wraps PLWriter and DLWriter to provide a
 1047: ** dynamically-allocated doclist area to use during tokenization.
 1048: **
 1049: ** dlcNew - malloc up and initialize a collector.
 1050: ** dlcDelete - destroy a collector and all contained items.
 1051: ** dlcAddPos - append position and offset information.
 1052: ** dlcAddDoclist - add the collected doclist to the given buffer.
 1053: ** dlcNext - terminate the current document and open another.
 1054: */
 1055: typedef struct DLCollector {
 1056:   DataBuffer b;
 1057:   DLWriter dlw;
 1058:   PLWriter plw;
 1059: } DLCollector;
 1060: 
 1061: /* TODO(shess) This could also be done by calling plwTerminate() and
 1062: ** dataBufferAppend().  I tried that, expecting nominal performance
 1063: ** differences, but it seemed to pretty reliably be worth 1% to code
 1064: ** it this way.  I suspect it is the incremental malloc overhead (some
 1065: ** percentage of the plwTerminate() calls will cause a realloc), so
 1066: ** this might be worth revisiting if the DataBuffer implementation
 1067: ** changes.
 1068: */
 1069: static void dlcAddDoclist(DLCollector *pCollector, DataBuffer *b){
 1070:   if( pCollector->dlw.iType>DL_DOCIDS ){
 1071:     char c[VARINT_MAX];
 1072:     int n = putVarint(c, POS_END);
 1073:     dataBufferAppend2(b, pCollector->b.pData, pCollector->b.nData, c, n);
 1074:   }else{
 1075:     dataBufferAppend(b, pCollector->b.pData, pCollector->b.nData);
 1076:   }
 1077: }
 1078: static void dlcNext(DLCollector *pCollector, sqlite_int64 iDocid){
 1079:   plwTerminate(&pCollector->plw);
 1080:   plwDestroy(&pCollector->plw);
 1081:   plwInit(&pCollector->plw, &pCollector->dlw, iDocid);
 1082: }
 1083: static void dlcAddPos(DLCollector *pCollector, int iColumn, int iPos,
 1084:                       int iStartOffset, int iEndOffset){
 1085:   plwAdd(&pCollector->plw, iColumn, iPos, iStartOffset, iEndOffset);
 1086: }
 1087: 
 1088: static DLCollector *dlcNew(sqlite_int64 iDocid, DocListType iType){
 1089:   DLCollector *pCollector = sqlite3_malloc(sizeof(DLCollector));
 1090:   dataBufferInit(&pCollector->b, 0);
 1091:   dlwInit(&pCollector->dlw, iType, &pCollector->b);
 1092:   plwInit(&pCollector->plw, &pCollector->dlw, iDocid);
 1093:   return pCollector;
 1094: }
 1095: static void dlcDelete(DLCollector *pCollector){
 1096:   plwDestroy(&pCollector->plw);
 1097:   dlwDestroy(&pCollector->dlw);
 1098:   dataBufferDestroy(&pCollector->b);
 1099:   SCRAMBLE(pCollector);
 1100:   sqlite3_free(pCollector);
 1101: }
 1102: 
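       /* Example (illustrative): collecting one document's hits for a term
       ** during tokenization, then emitting the doclist into a DataBuffer
       ** buf that was initialized elsewhere:
       **
       **   DLCollector *pCollector = dlcNew(iDocid, DL_DEFAULT);
       **   // ... for each hit the tokenizer reports ...
       **   dlcAddPos(pCollector, iColumn, iPos, iStartOffset, iEndOffset);
       **   // ... once the document is fully tokenized ...
       **   dlcAddDoclist(pCollector, &buf);   // appends doclist plus POS_END
       **   dlcDelete(pCollector);
       */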
 1103: 
 1104: /* Copy the doclist data of iType in pData/nData into *out, trimming
 1105: ** unnecessary data as we go.  Only columns matching iColumn are
 1106: ** copied; all columns are copied if iColumn is -1.  Elements with no
 1107: ** matching columns are dropped.  The output is an iOutType doclist.
 1108: */
 1109: /* NOTE(shess) This code is only valid after all doclists are merged.
 1110: ** If this is run before merges, then doclist items which represent
 1111: ** deletion will be trimmed, and will thus not effect a deletion
 1112: ** during the merge.
 1113: */
 1114: static void docListTrim(DocListType iType, const char *pData, int nData,
 1115:                         int iColumn, DocListType iOutType, DataBuffer *out){
 1116:   DLReader dlReader;
 1117:   DLWriter dlWriter;
 1118: 
 1119:   assert( iOutType<=iType );
 1120: 
 1121:   dlrInit(&dlReader, iType, pData, nData);
 1122:   dlwInit(&dlWriter, iOutType, out);
 1123: 
 1124:   while( !dlrAtEnd(&dlReader) ){
 1125:     PLReader plReader;
 1126:     PLWriter plWriter;
 1127:     int match = 0;
 1128: 
 1129:     plrInit(&plReader, &dlReader);
 1130: 
 1131:     while( !plrAtEnd(&plReader) ){
 1132:       if( iColumn==-1 || plrColumn(&plReader)==iColumn ){
 1133:         if( !match ){
 1134:           plwInit(&plWriter, &dlWriter, dlrDocid(&dlReader));
 1135:           match = 1;
 1136:         }
 1137:         plwAdd(&plWriter, plrColumn(&plReader), plrPosition(&plReader),
 1138:                plrStartOffset(&plReader), plrEndOffset(&plReader));
 1139:       }
 1140:       plrStep(&plReader);
 1141:     }
 1142:     if( match ){
 1143:       plwTerminate(&plWriter);
 1144:       plwDestroy(&plWriter);
 1145:     }
 1146: 
 1147:     plrDestroy(&plReader);
 1148:     dlrStep(&dlReader);
 1149:   }
 1150:   dlwDestroy(&dlWriter);
 1151:   dlrDestroy(&dlReader);
 1152: }
 1153: 
 1154: /* Used by docListMerge() to keep doclists in ascending order by
 1155: ** docid, then ascending order by age (so the newest comes first).
 1156: */
 1157: typedef struct OrderedDLReader {
 1158:   DLReader *pReader;
 1159: 
 1160:   /* TODO(shess) If we assume that docListMerge pReaders is ordered by
 1161:   ** age (which we do), then we could use pReader comparisons to break
 1162:   ** ties.
 1163:   */
 1164:   int idx;
 1165: } OrderedDLReader;
 1166: 
 1167: /* Order eof to end, then by docid asc, idx desc. */
 1168: static int orderedDLReaderCmp(OrderedDLReader *r1, OrderedDLReader *r2){
 1169:   if( dlrAtEnd(r1->pReader) ){
 1170:     if( dlrAtEnd(r2->pReader) ) return 0;  /* Both atEnd(). */
 1171:     return 1;                              /* Only r1 atEnd(). */
 1172:   }
 1173:   if( dlrAtEnd(r2->pReader) ) return -1;   /* Only r2 atEnd(). */
 1174: 
 1175:   if( dlrDocid(r1->pReader)<dlrDocid(r2->pReader) ) return -1;
 1176:   if( dlrDocid(r1->pReader)>dlrDocid(r2->pReader) ) return 1;
 1177: 
 1178:   /* Descending on idx. */
 1179:   return r2->idx-r1->idx;
 1180: }
 1181: 
 1182: /* Bubble p[0] to appropriate place in p[1..n-1].  Assumes that
 1183: ** p[1..n-1] is already sorted.
 1184: */
 1185: /* TODO(shess) Is this frequent enough to warrant a binary search?
 1186: ** Before implementing that, instrument the code to check.  In most
 1187: ** current usage, I expect that p[0] will be less than p[1] a very
 1188: ** high proportion of the time.
 1189: */
 1190: static void orderedDLReaderReorder(OrderedDLReader *p, int n){
 1191:   while( n>1 && orderedDLReaderCmp(p, p+1)>0 ){
 1192:     OrderedDLReader tmp = p[0];
 1193:     p[0] = p[1];
 1194:     p[1] = tmp;
 1195:     n--;
 1196:     p++;
 1197:   }
 1198: }
 1199: 
 1200: /* Given an array of doclist readers, merge their doclist elements
 1201: ** into out in sorted order (by docid), dropping elements from older
 1202: ** readers when there is a duplicate docid.  pReaders is assumed to be
 1203: ** ordered by age, oldest first.
 1204: */
 1205: /* TODO(shess) nReaders must be <= MERGE_COUNT.  This should probably
 1206: ** be fixed.
 1207: */
 1208: static void docListMerge(DataBuffer *out,
 1209:                          DLReader *pReaders, int nReaders){
 1210:   OrderedDLReader readers[MERGE_COUNT];
 1211:   DLWriter writer;
 1212:   int i, n;
 1213:   const char *pStart = 0;
 1214:   int nStart = 0;
 1215:   sqlite_int64 iFirstDocid = 0, iLastDocid = 0;
 1216: 
 1217:   assert( nReaders>0 );
 1218:   if( nReaders==1 ){
 1219:     dataBufferAppend(out, dlrDocData(pReaders), dlrAllDataBytes(pReaders));
 1220:     return;
 1221:   }
 1222: 
 1223:   assert( nReaders<=MERGE_COUNT );
 1224:   n = 0;
 1225:   for(i=0; i<nReaders; i++){
 1226:     assert( pReaders[i].iType==pReaders[0].iType );
 1227:     readers[i].pReader = pReaders+i;
 1228:     readers[i].idx = i;
 1229:     n += dlrAllDataBytes(&pReaders[i]);
 1230:   }
 1231:   /* Conservatively size output to sum of inputs.  Output should end
 1232:   ** up strictly smaller than input.
 1233:   */
 1234:   dataBufferExpand(out, n);
 1235: 
 1236:   /* Get the readers into sorted order. */
 1237:   while( i-->0 ){
 1238:     orderedDLReaderReorder(readers+i, nReaders-i);
 1239:   }
 1240: 
 1241:   dlwInit(&writer, pReaders[0].iType, out);
 1242:   while( !dlrAtEnd(readers[0].pReader) ){
 1243:     sqlite_int64 iDocid = dlrDocid(readers[0].pReader);
 1244: 
 1245:     /* If this is a continuation of the current buffer to copy, extend
 1246:     ** that buffer.  memcpy() seems to be more efficient if it has
 1247:     ** lots of data to copy.
 1248:     */
 1249:     if( dlrDocData(readers[0].pReader)==pStart+nStart ){
 1250:       nStart += dlrDocDataBytes(readers[0].pReader);
 1251:     }else{
 1252:       if( pStart!=0 ){
 1253:         dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
 1254:       }
 1255:       pStart = dlrDocData(readers[0].pReader);
 1256:       nStart = dlrDocDataBytes(readers[0].pReader);
 1257:       iFirstDocid = iDocid;
 1258:     }
 1259:     iLastDocid = iDocid;
 1260:     dlrStep(readers[0].pReader);
 1261: 
 1262:     /* Drop all of the older elements with the same docid. */
 1263:     for(i=1; i<nReaders &&
 1264:              !dlrAtEnd(readers[i].pReader) &&
 1265:              dlrDocid(readers[i].pReader)==iDocid; i++){
 1266:       dlrStep(readers[i].pReader);
 1267:     }
 1268: 
 1269:     /* Get the readers back into order. */
 1270:     while( i-->0 ){
 1271:       orderedDLReaderReorder(readers+i, nReaders-i);
 1272:     }
 1273:   }
 1274: 
 1275:   /* Copy over any remaining elements. */
 1276:   if( nStart>0 ) dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
 1277:   dlwDestroy(&writer);
 1278: }
 1279: 
 1280: /* Helper function for posListUnion().  Compares the current position
 1281: ** between left and right, returning the standard C idiom of <0 if
 1282: ** left<right, >0 if left>right, and 0 if left==right.  "End" always
 1283: ** compares greater.
 1284: */
 1285: static int posListCmp(PLReader *pLeft, PLReader *pRight){
 1286:   assert( pLeft->iType==pRight->iType );
 1287:   if( pLeft->iType==DL_DOCIDS ) return 0;
 1288: 
 1289:   if( plrAtEnd(pLeft) ) return plrAtEnd(pRight) ? 0 : 1;
 1290:   if( plrAtEnd(pRight) ) return -1;
 1291: 
 1292:   if( plrColumn(pLeft)<plrColumn(pRight) ) return -1;
 1293:   if( plrColumn(pLeft)>plrColumn(pRight) ) return 1;
 1294: 
 1295:   if( plrPosition(pLeft)<plrPosition(pRight) ) return -1;
 1296:   if( plrPosition(pLeft)>plrPosition(pRight) ) return 1;
 1297:   if( pLeft->iType==DL_POSITIONS ) return 0;
 1298: 
 1299:   if( plrStartOffset(pLeft)<plrStartOffset(pRight) ) return -1;
 1300:   if( plrStartOffset(pLeft)>plrStartOffset(pRight) ) return 1;
 1301: 
 1302:   if( plrEndOffset(pLeft)<plrEndOffset(pRight) ) return -1;
 1303:   if( plrEndOffset(pLeft)>plrEndOffset(pRight) ) return 1;
 1304: 
 1305:   return 0;
 1306: }
 1307: 
 1308: /* Write the union of position lists in pLeft and pRight to pOut.
 1309: ** "Union" in this case meaning "All unique position tuples".  Should
 1310: ** work with any doclist type, though both inputs and the output
 1311: ** should be the same type.
 1312: */
 1313: static void posListUnion(DLReader *pLeft, DLReader *pRight, DLWriter *pOut){
 1314:   PLReader left, right;
 1315:   PLWriter writer;
 1316: 
 1317:   assert( dlrDocid(pLeft)==dlrDocid(pRight) );
 1318:   assert( pLeft->iType==pRight->iType );
 1319:   assert( pLeft->iType==pOut->iType );
 1320: 
 1321:   plrInit(&left, pLeft);
 1322:   plrInit(&right, pRight);
 1323:   plwInit(&writer, pOut, dlrDocid(pLeft));
 1324: 
 1325:   while( !plrAtEnd(&left) || !plrAtEnd(&right) ){
 1326:     int c = posListCmp(&left, &right);
 1327:     if( c<0 ){
 1328:       plwCopy(&writer, &left);
 1329:       plrStep(&left);
 1330:     }else if( c>0 ){
 1331:       plwCopy(&writer, &right);
 1332:       plrStep(&right);
 1333:     }else{
 1334:       plwCopy(&writer, &left);
 1335:       plrStep(&left);
 1336:       plrStep(&right);
 1337:     }
 1338:   }
 1339: 
 1340:   plwTerminate(&writer);
 1341:   plwDestroy(&writer);
 1342:   plrDestroy(&left);
 1343:   plrDestroy(&right);
 1344: }
 1345: 
 1346: /* Write the union of doclists in pLeft and pRight to pOut.  For
 1347: ** docids in common between the inputs, the union of the position
 1348: ** lists is written.  Inputs and outputs are always type DL_DEFAULT.
 1349: */
 1350: static void docListUnion(
 1351:   const char *pLeft, int nLeft,
 1352:   const char *pRight, int nRight,
 1353:   DataBuffer *pOut      /* Write the combined doclist here */
 1354: ){
 1355:   DLReader left, right;
 1356:   DLWriter writer;
 1357: 
 1358:   if( nLeft==0 ){
 1359:     if( nRight!=0) dataBufferAppend(pOut, pRight, nRight);
 1360:     return;
 1361:   }
 1362:   if( nRight==0 ){
 1363:     dataBufferAppend(pOut, pLeft, nLeft);
 1364:     return;
 1365:   }
 1366: 
 1367:   dlrInit(&left, DL_DEFAULT, pLeft, nLeft);
 1368:   dlrInit(&right, DL_DEFAULT, pRight, nRight);
 1369:   dlwInit(&writer, DL_DEFAULT, pOut);
 1370: 
 1371:   while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){
 1372:     if( dlrAtEnd(&right) ){
 1373:       dlwCopy(&writer, &left);
 1374:       dlrStep(&left);
 1375:     }else if( dlrAtEnd(&left) ){
 1376:       dlwCopy(&writer, &right);
 1377:       dlrStep(&right);
 1378:     }else if( dlrDocid(&left)<dlrDocid(&right) ){
 1379:       dlwCopy(&writer, &left);
 1380:       dlrStep(&left);
 1381:     }else if( dlrDocid(&left)>dlrDocid(&right) ){
 1382:       dlwCopy(&writer, &right);
 1383:       dlrStep(&right);
 1384:     }else{
 1385:       posListUnion(&left, &right, &writer);
 1386:       dlrStep(&left);
 1387:       dlrStep(&right);
 1388:     }
 1389:   }
 1390: 
 1391:   dlrDestroy(&left);
 1392:   dlrDestroy(&right);
 1393:   dlwDestroy(&writer);
 1394: }
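
/* Editor's note: an illustrative (hypothetical) caller sketch, not
** part of the original module.  Assuming two complete DL_DEFAULT
** doclists are already held in DataBuffers a and b, their union could
** be computed like so:
**
**   DataBuffer merged;
**   dataBufferInit(&merged, 0);
**   docListUnion(a.pData, a.nData, b.pData, b.nData, &merged);
**   ... use merged.pData / merged.nData ...
**   dataBufferDestroy(&merged);
*/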
 1395: 
 1396: /* pLeft and pRight are DLReaders positioned to the same docid.
 1397: **
 1398: ** If there are no instances in pLeft or pRight where the position
 1399: ** of pLeft is one less than the position of pRight, then this
 1400: ** routine adds nothing to pOut.
 1401: **
 1402: ** If there are one or more instances where positions from pLeft
 1403: ** are exactly one less than positions from pRight, then add a new
 1404: ** document record to pOut.  If pOut wants to hold positions, then
 1405: ** include the positions from pRight that are one more than a
 1406: ** position in pLeft.  In other words:  pRight.iPos==pLeft.iPos+1.
 1407: */
 1408: static void posListPhraseMerge(DLReader *pLeft, DLReader *pRight,
 1409:                                DLWriter *pOut){
 1410:   PLReader left, right;
 1411:   PLWriter writer;
 1412:   int match = 0;
 1413: 
 1414:   assert( dlrDocid(pLeft)==dlrDocid(pRight) );
 1415:   assert( pOut->iType!=DL_POSITIONS_OFFSETS );
 1416: 
 1417:   plrInit(&left, pLeft);
 1418:   plrInit(&right, pRight);
 1419: 
 1420:   while( !plrAtEnd(&left) && !plrAtEnd(&right) ){
 1421:     if( plrColumn(&left)<plrColumn(&right) ){
 1422:       plrStep(&left);
 1423:     }else if( plrColumn(&left)>plrColumn(&right) ){
 1424:       plrStep(&right);
 1425:     }else if( plrPosition(&left)+1<plrPosition(&right) ){
 1426:       plrStep(&left);
 1427:     }else if( plrPosition(&left)+1>plrPosition(&right) ){
 1428:       plrStep(&right);
 1429:     }else{
 1430:       if( !match ){
 1431:         plwInit(&writer, pOut, dlrDocid(pLeft));
 1432:         match = 1;
 1433:       }
 1434:       plwAdd(&writer, plrColumn(&right), plrPosition(&right), 0, 0);
 1435:       plrStep(&left);
 1436:       plrStep(&right);
 1437:     }
 1438:   }
 1439: 
 1440:   if( match ){
 1441:     plwTerminate(&writer);
 1442:     plwDestroy(&writer);
 1443:   }
 1444: 
 1445:   plrDestroy(&left);
 1446:   plrDestroy(&right);
 1447: }
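
/* For illustration (editor's example): if, within one docid, the left
** term occurs at (column 0, positions 3 and 7) and the right term at
** (column 0, positions 4 and 9), then only right position 4 satisfies
** pRight.iPos==pLeft.iPos+1, so the document is emitted with the
** single position tuple (column 0, position 4).
*/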
 1448: 
 1449: /* We have two doclists with positions:  pLeft and pRight.
 1450: ** Write the phrase intersection of these two doclists into pOut.
 1451: **
 1452: ** A phrase intersection means that two documents only match
 1453: ** if pLeft.iPos+1==pRight.iPos.
 1454: **
 1455: ** iType controls the type of data written to pOut.  If iType is
 1456: ** DL_POSITIONS, the positions are those from pRight.
 1457: */
 1458: static void docListPhraseMerge(
 1459:   const char *pLeft, int nLeft,
 1460:   const char *pRight, int nRight,
 1461:   DocListType iType,
 1462:   DataBuffer *pOut      /* Write the combined doclist here */
 1463: ){
 1464:   DLReader left, right;
 1465:   DLWriter writer;
 1466: 
 1467:   if( nLeft==0 || nRight==0 ) return;
 1468: 
 1469:   assert( iType!=DL_POSITIONS_OFFSETS );
 1470: 
 1471:   dlrInit(&left, DL_POSITIONS, pLeft, nLeft);
 1472:   dlrInit(&right, DL_POSITIONS, pRight, nRight);
 1473:   dlwInit(&writer, iType, pOut);
 1474: 
 1475:   while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){
 1476:     if( dlrDocid(&left)<dlrDocid(&right) ){
 1477:       dlrStep(&left);
 1478:     }else if( dlrDocid(&right)<dlrDocid(&left) ){
 1479:       dlrStep(&right);
 1480:     }else{
 1481:       posListPhraseMerge(&left, &right, &writer);
 1482:       dlrStep(&left);
 1483:       dlrStep(&right);
 1484:     }
 1485:   }
 1486: 
 1487:   dlrDestroy(&left);
 1488:   dlrDestroy(&right);
 1489:   dlwDestroy(&writer);
 1490: }
 1491: 
 1492: /* We have two DL_DOCIDS doclists:  pLeft and pRight.
 1493: ** Write the intersection of these two doclists into pOut as a
 1494: ** DL_DOCIDS doclist.
 1495: */
 1496: static void docListAndMerge(
 1497:   const char *pLeft, int nLeft,
 1498:   const char *pRight, int nRight,
 1499:   DataBuffer *pOut      /* Write the combined doclist here */
 1500: ){
 1501:   DLReader left, right;
 1502:   DLWriter writer;
 1503: 
 1504:   if( nLeft==0 || nRight==0 ) return;
 1505: 
 1506:   dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
 1507:   dlrInit(&right, DL_DOCIDS, pRight, nRight);
 1508:   dlwInit(&writer, DL_DOCIDS, pOut);
 1509: 
 1510:   while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){
 1511:     if( dlrDocid(&left)<dlrDocid(&right) ){
 1512:       dlrStep(&left);
 1513:     }else if( dlrDocid(&right)<dlrDocid(&left) ){
 1514:       dlrStep(&right);
 1515:     }else{
 1516:       dlwAdd(&writer, dlrDocid(&left));
 1517:       dlrStep(&left);
 1518:       dlrStep(&right);
 1519:     }
 1520:   }
 1521: 
 1522:   dlrDestroy(&left);
 1523:   dlrDestroy(&right);
 1524:   dlwDestroy(&writer);
 1525: }
 1526: 
 1527: /* We have two DL_DOCIDS doclists:  pLeft and pRight.
 1528: ** Write the union of these two doclists into pOut as a
 1529: ** DL_DOCIDS doclist.
 1530: */
 1531: static void docListOrMerge(
 1532:   const char *pLeft, int nLeft,
 1533:   const char *pRight, int nRight,
 1534:   DataBuffer *pOut      /* Write the combined doclist here */
 1535: ){
 1536:   DLReader left, right;
 1537:   DLWriter writer;
 1538: 
 1539:   if( nLeft==0 ){
 1540:     if( nRight!=0 ) dataBufferAppend(pOut, pRight, nRight);
 1541:     return;
 1542:   }
 1543:   if( nRight==0 ){
 1544:     dataBufferAppend(pOut, pLeft, nLeft);
 1545:     return;
 1546:   }
 1547: 
 1548:   dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
 1549:   dlrInit(&right, DL_DOCIDS, pRight, nRight);
 1550:   dlwInit(&writer, DL_DOCIDS, pOut);
 1551: 
 1552:   while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){
 1553:     if( dlrAtEnd(&right) ){
 1554:       dlwAdd(&writer, dlrDocid(&left));
 1555:       dlrStep(&left);
 1556:     }else if( dlrAtEnd(&left) ){
 1557:       dlwAdd(&writer, dlrDocid(&right));
 1558:       dlrStep(&right);
 1559:     }else if( dlrDocid(&left)<dlrDocid(&right) ){
 1560:       dlwAdd(&writer, dlrDocid(&left));
 1561:       dlrStep(&left);
 1562:     }else if( dlrDocid(&right)<dlrDocid(&left) ){
 1563:       dlwAdd(&writer, dlrDocid(&right));
 1564:       dlrStep(&right);
 1565:     }else{
 1566:       dlwAdd(&writer, dlrDocid(&left));
 1567:       dlrStep(&left);
 1568:       dlrStep(&right);
 1569:     }
 1570:   }
 1571: 
 1572:   dlrDestroy(&left);
 1573:   dlrDestroy(&right);
 1574:   dlwDestroy(&writer);
 1575: }
 1576: 
 1577: /* We have two DL_DOCIDS doclists:  pLeft and pRight.
 1578: ** Write into pOut a DL_DOCIDS doclist containing all documents that
 1579: ** occur in pLeft but not in pRight.
 1580: */
 1581: static void docListExceptMerge(
 1582:   const char *pLeft, int nLeft,
 1583:   const char *pRight, int nRight,
 1584:   DataBuffer *pOut      /* Write the combined doclist here */
 1585: ){
 1586:   DLReader left, right;
 1587:   DLWriter writer;
 1588: 
 1589:   if( nLeft==0 ) return;
 1590:   if( nRight==0 ){
 1591:     dataBufferAppend(pOut, pLeft, nLeft);
 1592:     return;
 1593:   }
 1594: 
 1595:   dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
 1596:   dlrInit(&right, DL_DOCIDS, pRight, nRight);
 1597:   dlwInit(&writer, DL_DOCIDS, pOut);
 1598: 
 1599:   while( !dlrAtEnd(&left) ){
 1600:     while( !dlrAtEnd(&right) && dlrDocid(&right)<dlrDocid(&left) ){
 1601:       dlrStep(&right);
 1602:     }
 1603:     if( dlrAtEnd(&right) || dlrDocid(&left)<dlrDocid(&right) ){
 1604:       dlwAdd(&writer, dlrDocid(&left));
 1605:     }
 1606:     dlrStep(&left);
 1607:   }
 1608: 
 1609:   dlrDestroy(&left);
 1610:   dlrDestroy(&right);
 1611:   dlwDestroy(&writer);
 1612: }
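
/* Editor's note: taken together, these merge primitives implement the
** query operators parsed later in this file: the implied AND between
** terms maps to docListAndMerge(), OR to docListOrMerge(), a leading
** "-" to docListExceptMerge(), and adjacent words of a quoted phrase
** to docListPhraseMerge().
*/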
 1613: 
 1614: static char *string_dup_n(const char *s, int n){
 1615:   char *str = sqlite3_malloc(n + 1);
 1616:   memcpy(str, s, n);
 1617:   str[n] = '\0';
 1618:   return str;
 1619: }
 1620: 
 1621: /* Duplicate a string; the caller must sqlite3_free() the returned string.
 1622:  * (We don't use strdup() since it is not part of the standard C library and
 1623:  * may not be available everywhere.) */
 1624: static char *string_dup(const char *s){
 1625:   return string_dup_n(s, strlen(s));
 1626: }
 1627: 
 1628: /* Format a string, replacing each occurrence of the % character with
 1629:  * zDb.zName.  This may be more convenient than sqlite3_mprintf()
 1630:  * when one string is used repeatedly in a format string.
 1631:  * The caller must sqlite3_free() the returned string. */
 1632: static char *string_format(const char *zFormat,
 1633:                            const char *zDb, const char *zName){
 1634:   const char *p;
 1635:   size_t len = 0;
 1636:   size_t nDb = strlen(zDb);
 1637:   size_t nName = strlen(zName);
 1638:   size_t nFullTableName = nDb+1+nName;
 1639:   char *result;
 1640:   char *r;
 1641: 
 1642:   /* first compute length needed */
 1643:   for(p = zFormat ; *p ; ++p){
 1644:     len += (*p=='%' ? nFullTableName : 1);
 1645:   }
 1646:   len += 1;  /* for null terminator */
 1647: 
 1648:   r = result = sqlite3_malloc(len);
 1649:   for(p = zFormat; *p; ++p){
 1650:     if( *p=='%' ){
 1651:       memcpy(r, zDb, nDb);
 1652:       r += nDb;
 1653:       *r++ = '.';
 1654:       memcpy(r, zName, nName);
 1655:       r += nName;
 1656:     } else {
 1657:       *r++ = *p;
 1658:     }
 1659:   }
 1660:   *r++ = '\0';
 1661:   assert( r == result + len );
 1662:   return result;
 1663: }
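
/* For example (editor's illustration), with zDb=="main" and
** zName=="t":
**
**   string_format("select * from %_content", "main", "t")
**
** returns the string "select * from main.t_content", which the caller
** must release with sqlite3_free().
*/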
 1664: 
 1665: static int sql_exec(sqlite3 *db, const char *zDb, const char *zName,
 1666:                     const char *zFormat){
 1667:   char *zCommand = string_format(zFormat, zDb, zName);
 1668:   int rc;
 1669:   TRACE(("FTS2 sql: %s\n", zCommand));
 1670:   rc = sqlite3_exec(db, zCommand, NULL, 0, NULL);
 1671:   sqlite3_free(zCommand);
 1672:   return rc;
 1673: }
 1674: 
 1675: static int sql_prepare(sqlite3 *db, const char *zDb, const char *zName,
 1676:                        sqlite3_stmt **ppStmt, const char *zFormat){
 1677:   char *zCommand = string_format(zFormat, zDb, zName);
 1678:   int rc;
 1679:   TRACE(("FTS2 prepare: %s\n", zCommand));
 1680:   rc = sqlite3_prepare_v2(db, zCommand, -1, ppStmt, NULL);
 1681:   sqlite3_free(zCommand);
 1682:   return rc;
 1683: }
 1684: 
 1685: /* end utility functions */
 1686: 
 1687: /* Forward reference */
 1688: typedef struct fulltext_vtab fulltext_vtab;
 1689: 
 1690: /* A single term in a query is represented by an instance of
 1691: ** the following structure.
 1692: */
 1693: typedef struct QueryTerm {
 1694:   short int nPhrase; /* How many following terms are part of the same phrase */
 1695:   short int iPhrase; /* This is the i-th term of a phrase. */
 1696:   short int iColumn; /* Column of the index that must match this term */
 1697:   signed char isOr;  /* this term is preceded by "OR" */
 1698:   signed char isNot; /* this term is preceded by "-" */
 1699:   signed char isPrefix; /* this term is followed by "*" */
 1700:   char *pTerm;       /* text of the term.  '\000' terminated.  malloced */
 1701:   int nTerm;         /* Number of bytes in pTerm[] */
 1702: } QueryTerm;
 1703: 
 1704: 
 1705: /* A query string is parsed into a Query structure.
 1706:  *
 1707:  * We could, in theory, allow query strings to be complicated
 1708:  * nested expressions with precedence determined by parentheses.
 1709:  * But none of the major search engines do this.  (Perhaps the
 1710:  * feeling is that a parenthesized expression is too complex an
 1711:  * idea for the average user to grasp.)  Taking our lead from
 1712:  * the major search engines, we will allow queries to be a list
 1713:  * of terms (with an implied AND operator) or phrases in double-quotes,
 1714:  * with a single optional "-" before each non-phrase term to designate
 1715:  * negation and an optional OR connector.
 1716:  *
 1717:  * OR binds more tightly than the implied AND, which is what the
 1718:  * major search engines seem to do.  So, for example:
 1719:  * 
 1720:  *    [one two OR three]     ==>    one AND (two OR three)
 1721:  *    [one OR two three]     ==>    (one OR two) AND three
 1722:  *
 1723:  * A "-" before a term matches all entries that lack that term.
 1724:  * The "-" must occur immediately before the term with in intervening
 1725:  * space.  This is how the search engines do it.
 1726:  *
 1727:  * A NOT term cannot be the right-hand operand of an OR.  If this
 1728:  * occurs in the query string, the NOT is ignored:
 1729:  *
 1730:  *    [one OR -two]          ==>    one OR two
 1731:  *
 1732:  */
 1733: typedef struct Query {
 1734:   fulltext_vtab *pFts;  /* The full text index */
 1735:   int nTerms;           /* Number of terms in the query */
 1736:   QueryTerm *pTerms;    /* Array of terms.  Space obtained from malloc() */
 1737:   int nextIsOr;         /* Set the isOr flag on the next inserted term */
 1738:   int nextColumn;       /* Next word parsed must be in this column */
 1739:   int dfltColumn;       /* The default column */
 1740: } Query;
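
/* Editor's illustration (a sketch of the scheme described above, not
** parser output): the query [one "two three" -four] yields nTerms==4
** with roughly:
**
**   pTerms[0]: pTerm="one",   nPhrase=0, iPhrase=0
**   pTerms[1]: pTerm="two",   nPhrase=1, iPhrase=0
**   pTerms[2]: pTerm="three", nPhrase=0, iPhrase=1
**   pTerms[3]: pTerm="four",  nPhrase=0, iPhrase=0, isNot=1
*/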
 1741: 
 1742: 
 1743: /*
 1744: ** An instance of the following structure keeps track of generated
 1745: ** matching-word offset information and snippets.
 1746: */
 1747: typedef struct Snippet {
 1748:   int nMatch;     /* Total number of matches */
 1749:   int nAlloc;     /* Space allocated for aMatch[] */
 1750:   struct snippetMatch { /* One entry for each matching term */
 1751:     char snStatus;       /* Status flag for use while constructing snippets */
 1752:     short int iCol;      /* The column that contains the match */
 1753:     short int iTerm;     /* The index in Query.pTerms[] of the matching term */
 1754:     short int nByte;     /* Number of bytes in the term */
 1755:     int iStart;          /* The offset to the first character of the term */
 1756:   } *aMatch;      /* Points to space obtained from malloc */
 1757:   char *zOffset;  /* Text rendering of aMatch[] */
 1758:   int nOffset;    /* strlen(zOffset) */
 1759:   char *zSnippet; /* Snippet text */
 1760:   int nSnippet;   /* strlen(zSnippet) */
 1761: } Snippet;
 1762: 
 1763: 
 1764: typedef enum QueryType {
 1765:   QUERY_GENERIC,   /* table scan */
 1766:   QUERY_ROWID,     /* lookup by rowid */
 1767:   QUERY_FULLTEXT   /* QUERY_FULLTEXT + [i] is a full-text search for column i */
 1768: } QueryType;
 1769: 
 1770: typedef enum fulltext_statement {
 1771:   CONTENT_INSERT_STMT,
 1772:   CONTENT_SELECT_STMT,
 1773:   CONTENT_UPDATE_STMT,
 1774:   CONTENT_DELETE_STMT,
 1775:   CONTENT_EXISTS_STMT,
 1776: 
 1777:   BLOCK_INSERT_STMT,
 1778:   BLOCK_SELECT_STMT,
 1779:   BLOCK_DELETE_STMT,
 1780:   BLOCK_DELETE_ALL_STMT,
 1781: 
 1782:   SEGDIR_MAX_INDEX_STMT,
 1783:   SEGDIR_SET_STMT,
 1784:   SEGDIR_SELECT_LEVEL_STMT,
 1785:   SEGDIR_SPAN_STMT,
 1786:   SEGDIR_DELETE_STMT,
 1787:   SEGDIR_SELECT_SEGMENT_STMT,
 1788:   SEGDIR_SELECT_ALL_STMT,
 1789:   SEGDIR_DELETE_ALL_STMT,
 1790:   SEGDIR_COUNT_STMT,
 1791: 
 1792:   MAX_STMT                     /* Always at end! */
 1793: } fulltext_statement;
 1794: 
 1795: /* These must exactly match the enum above. */
 1796: /* TODO(shess): Is there some risk that a statement will be used in two
 1797: ** cursors at once, e.g.  if a query joins a virtual table to itself?
 1798: ** If so, perhaps we should move some of these to the cursor object.
 1799: */
 1800: static const char *const fulltext_zStatement[MAX_STMT] = {
 1801:   /* CONTENT_INSERT */ NULL,  /* generated in contentInsertStatement() */
 1802:   /* CONTENT_SELECT */ "select * from %_content where rowid = ?",
 1803:   /* CONTENT_UPDATE */ NULL,  /* generated in contentUpdateStatement() */
 1804:   /* CONTENT_DELETE */ "delete from %_content where rowid = ?",
 1805:   /* CONTENT_EXISTS */ "select rowid from %_content limit 1",
 1806: 
 1807:   /* BLOCK_INSERT */ "insert into %_segments values (?)",
 1808:   /* BLOCK_SELECT */ "select block from %_segments where rowid = ?",
 1809:   /* BLOCK_DELETE */ "delete from %_segments where rowid between ? and ?",
 1810:   /* BLOCK_DELETE_ALL */ "delete from %_segments",
 1811: 
 1812:   /* SEGDIR_MAX_INDEX */ "select max(idx) from %_segdir where level = ?",
 1813:   /* SEGDIR_SET */ "insert into %_segdir values (?, ?, ?, ?, ?, ?)",
 1814:   /* SEGDIR_SELECT_LEVEL */
 1815:   "select start_block, leaves_end_block, root from %_segdir "
 1816:   " where level = ? order by idx",
 1817:   /* SEGDIR_SPAN */
 1818:   "select min(start_block), max(end_block) from %_segdir "
 1819:   " where level = ? and start_block <> 0",
 1820:   /* SEGDIR_DELETE */ "delete from %_segdir where level = ?",
 1821: 
 1822:   /* NOTE(shess): The first three results of the following two
 1823:   ** statements must match.
 1824:   */
 1825:   /* SEGDIR_SELECT_SEGMENT */
 1826:   "select start_block, leaves_end_block, root from %_segdir "
 1827:   " where level = ? and idx = ?",
 1828:   /* SEGDIR_SELECT_ALL */
 1829:   "select start_block, leaves_end_block, root from %_segdir "
 1830:   " order by level desc, idx asc",
 1831:   /* SEGDIR_DELETE_ALL */ "delete from %_segdir",
 1832:   /* SEGDIR_COUNT */ "select count(*), ifnull(max(level),0) from %_segdir",
 1833: };
 1834: 
 1835: /*
 1836: ** A connection to a fulltext index is an instance of the following
 1837: ** structure.  The xCreate and xConnect methods create an instance
 1838: ** of this structure and xDestroy and xDisconnect free that instance.
 1839: ** All other methods receive a pointer to the structure as one of their
 1840: ** arguments.
 1841: */
 1842: struct fulltext_vtab {
 1843:   sqlite3_vtab base;               /* Base class used by SQLite core */
 1844:   sqlite3 *db;                     /* The database connection */
 1845:   const char *zDb;                 /* logical database name */
 1846:   const char *zName;               /* virtual table name */
 1847:   int nColumn;                     /* number of columns in virtual table */
 1848:   char **azColumn;                 /* column names.  malloced */
 1849:   char **azContentColumn;          /* column names in content table; malloced */
 1850:   sqlite3_tokenizer *pTokenizer;   /* tokenizer for inserts and queries */
 1851: 
 1852:   /* Precompiled statements which we keep as long as the table is
 1853:   ** open.
 1854:   */
 1855:   sqlite3_stmt *pFulltextStatements[MAX_STMT];
 1856: 
 1857:   /* Precompiled statements used for segment merges.  We run a
 1858:   ** separate select across the leaf level of each tree being merged.
 1859:   */
 1860:   sqlite3_stmt *pLeafSelectStmts[MERGE_COUNT];
 1861:   /* The statement used to prepare pLeafSelectStmts. */
 1862: #define LEAF_SELECT \
 1863:   "select block from %_segments where rowid between ? and ? order by rowid"
 1864: 
 1865:   /* These buffer pending index updates during transactions.
 1866:   ** nPendingData estimates the memory size of the pending data.  It
 1867:   ** doesn't include the hash-bucket overhead, nor any malloc
 1868:   ** overhead.  When nPendingData exceeds kPendingThreshold, the
 1869:   ** buffer is flushed even before the transaction closes.
 1870:   ** pendingTerms stores the data, and is only valid when nPendingData
 1871:   ** is >=0 (nPendingData<0 means pendingTerms has not been
 1872:   ** initialized).  iPrevDocid is the last docid written, used to make
 1873:   ** certain we're inserting in sorted order.
 1874:   */
 1875:   int nPendingData;
 1876: #define kPendingThreshold (1*1024*1024)
 1877:   sqlite_int64 iPrevDocid;
 1878:   fts2Hash pendingTerms;
 1879: };
 1880: 
 1881: /*
 2882: ** When the core wants to do a query, it creates a cursor using a
 1883: ** call to xOpen.  This structure is an instance of a cursor.  It
 1884: ** is destroyed by xClose.
 1885: */
 1886: typedef struct fulltext_cursor {
 1887:   sqlite3_vtab_cursor base;        /* Base class used by SQLite core */
 1888:   QueryType iCursorType;           /* Copy of sqlite3_index_info.idxNum */
 1889:   sqlite3_stmt *pStmt;             /* Prepared statement in use by the cursor */
 1890:   int eof;                         /* True if at End Of Results */
 1891:   Query q;                         /* Parsed query string */
 1892:   Snippet snippet;                 /* Cached snippet for the current row */
 1893:   int iColumn;                     /* Column being searched */
 1894:   DataBuffer result;               /* Doclist results from fulltextQuery */
 1895:   DLReader reader;                 /* Result reader if result not empty */
 1896: } fulltext_cursor;
 1897: 
 1898: static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){
 1899:   return (fulltext_vtab *) c->base.pVtab;
 1900: }
 1901: 
 1902: static const sqlite3_module fts2Module;   /* forward declaration */
 1903: 
 1904: /* Return a dynamically generated statement of the form
 1905:  *   insert into %_content (rowid, ...) values (?, ...)
 1906:  */
 1907: static const char *contentInsertStatement(fulltext_vtab *v){
 1908:   StringBuffer sb;
 1909:   int i;
 1910: 
 1911:   initStringBuffer(&sb);
 1912:   append(&sb, "insert into %_content (rowid, ");
 1913:   appendList(&sb, v->nColumn, v->azContentColumn);
 1914:   append(&sb, ") values (?");
 1915:   for(i=0; i<v->nColumn; ++i)
 1916:     append(&sb, ", ?");
 1917:   append(&sb, ")");
 1918:   return stringBufferData(&sb);
 1919: }
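
/* For example (editor's illustration), a table created as
** fts2(subject, body) yields:
**   insert into %_content (rowid, c0subject, c1body) values (?, ?, ?)
** (see parseSpec() below for the cNNAAAA content-column naming).
*/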
 1920: 
 1921: /* Return a dynamically generated statement of the form
 1922:  *   update %_content set [col_0] = ?, [col_1] = ?, ...
 1923:  *                    where rowid = ?
 1924:  */
 1925: static const char *contentUpdateStatement(fulltext_vtab *v){
 1926:   StringBuffer sb;
 1927:   int i;
 1928: 
 1929:   initStringBuffer(&sb);
 1930:   append(&sb, "update %_content set ");
 1931:   for(i=0; i<v->nColumn; ++i) {
 1932:     if( i>0 ){
 1933:       append(&sb, ", ");
 1934:     }
 1935:     append(&sb, v->azContentColumn[i]);
 1936:     append(&sb, " = ?");
 1937:   }
 1938:   append(&sb, " where rowid = ?");
 1939:   return stringBufferData(&sb);
 1940: }
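
/* For the same fts2(subject, body) example, this generates
** (editor's illustration):
**   update %_content set c0subject = ?, c1body = ? where rowid = ?
*/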
 1941: 
 1942: /* Puts a freshly-prepared statement determined by iStmt in *ppStmt.
 1943: ** If the indicated statement has never been prepared, it is prepared
 1944: ** and cached, otherwise the cached version is reset.
 1945: */
 1946: static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt,
 1947:                              sqlite3_stmt **ppStmt){
 1948:   assert( iStmt<MAX_STMT );
 1949:   if( v->pFulltextStatements[iStmt]==NULL ){
 1950:     const char *zStmt;
 1951:     int rc;
 1952:     switch( iStmt ){
 1953:       case CONTENT_INSERT_STMT:
 1954:         zStmt = contentInsertStatement(v); break;
 1955:       case CONTENT_UPDATE_STMT:
 1956:         zStmt = contentUpdateStatement(v); break;
 1957:       default:
 1958:         zStmt = fulltext_zStatement[iStmt];
 1959:     }
 1960:     rc = sql_prepare(v->db, v->zDb, v->zName, &v->pFulltextStatements[iStmt],
 1961:                          zStmt);
 1962:     if( zStmt != fulltext_zStatement[iStmt]) sqlite3_free((void *) zStmt);
 1963:     if( rc!=SQLITE_OK ) return rc;
 1964:   } else {
 1965:     int rc = sqlite3_reset(v->pFulltextStatements[iStmt]);
 1966:     if( rc!=SQLITE_OK ) return rc;
 1967:   }
 1968: 
 1969:   *ppStmt = v->pFulltextStatements[iStmt];
 1970:   return SQLITE_OK;
 1971: }
 1972: 
 1973: /* Like sqlite3_step(), but convert SQLITE_DONE to SQLITE_OK.  Any
 1974: ** other code (including SQLITE_ROW) is returned unchanged and is an
 1975: ** error.  Useful for statements like UPDATE, where we expect no results.
 1976: */
 1977: static int sql_single_step(sqlite3_stmt *s){
 1978:   int rc = sqlite3_step(s);
 1979:   return (rc==SQLITE_DONE) ? SQLITE_OK : rc;
 1980: }
 1981: 
 1982: /* Like sql_get_statement(), but for special replicated LEAF_SELECT
 1983: ** statements.  idx -1 is a special case for an uncached version of
 1984: ** the statement (used in the optimize implementation).
 1985: */
 1986: /* TODO(shess) Write version for generic statements and then share
 1987: ** that between the cached-statement functions.
 1988: */
 1989: static int sql_get_leaf_statement(fulltext_vtab *v, int idx,
 1990:                                   sqlite3_stmt **ppStmt){
 1991:   assert( idx>=-1 && idx<MERGE_COUNT );
 1992:   if( idx==-1 ){
 1993:     return sql_prepare(v->db, v->zDb, v->zName, ppStmt, LEAF_SELECT);
 1994:   }else if( v->pLeafSelectStmts[idx]==NULL ){
 1995:     int rc = sql_prepare(v->db, v->zDb, v->zName, &v->pLeafSelectStmts[idx],
 1996:                          LEAF_SELECT);
 1997:     if( rc!=SQLITE_OK ) return rc;
 1998:   }else{
 1999:     int rc = sqlite3_reset(v->pLeafSelectStmts[idx]);
 2000:     if( rc!=SQLITE_OK ) return rc;
 2001:   }
 2002: 
 2003:   *ppStmt = v->pLeafSelectStmts[idx];
 2004:   return SQLITE_OK;
 2005: }
 2006: 
 2007: /* insert into %_content (rowid, ...) values ([rowid], [pValues]) */
 2008: static int content_insert(fulltext_vtab *v, sqlite3_value *rowid,
 2009:                           sqlite3_value **pValues){
 2010:   sqlite3_stmt *s;
 2011:   int i;
 2012:   int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s);
 2013:   if( rc!=SQLITE_OK ) return rc;
 2014: 
 2015:   rc = sqlite3_bind_value(s, 1, rowid);
 2016:   if( rc!=SQLITE_OK ) return rc;
 2017: 
 2018:   for(i=0; i<v->nColumn; ++i){
 2019:     rc = sqlite3_bind_value(s, 2+i, pValues[i]);
 2020:     if( rc!=SQLITE_OK ) return rc;
 2021:   }
 2022: 
 2023:   return sql_single_step(s);
 2024: }
 2025: 
 2026: /* update %_content set col0 = pValues[0], col1 = pValues[1], ...
 2027:  *                  where rowid = [iRowid] */
 2028: static int content_update(fulltext_vtab *v, sqlite3_value **pValues,
 2029:                           sqlite_int64 iRowid){
 2030:   sqlite3_stmt *s;
 2031:   int i;
 2032:   int rc = sql_get_statement(v, CONTENT_UPDATE_STMT, &s);
 2033:   if( rc!=SQLITE_OK ) return rc;
 2034: 
 2035:   for(i=0; i<v->nColumn; ++i){
 2036:     rc = sqlite3_bind_value(s, 1+i, pValues[i]);
 2037:     if( rc!=SQLITE_OK ) return rc;
 2038:   }
 2039: 
 2040:   rc = sqlite3_bind_int64(s, 1+v->nColumn, iRowid);
 2041:   if( rc!=SQLITE_OK ) return rc;
 2042: 
 2043:   return sql_single_step(s);
 2044: }
 2045: 
 2046: static void freeStringArray(int nString, const char **pString){
 2047:   int i;
 2048: 
 2049:   for (i=0 ; i < nString ; ++i) {
 2050:     if( pString[i]!=NULL ) sqlite3_free((void *) pString[i]);
 2051:   }
 2052:   sqlite3_free((void *) pString);
 2053: }
 2054: 
 2055: /* select * from %_content where rowid = [iRow]
 2056:  * The caller must free the returned array and all strings in it
 2057:  * (e.g. via freeStringArray()).  Null fields will be NULL in the array.
 2058:  *
 2059:  * TODO: Perhaps we should return pointer/length strings here for consistency
 2060:  * with other code which uses pointer/length. */
 2061: static int content_select(fulltext_vtab *v, sqlite_int64 iRow,
 2062:                           const char ***pValues){
 2063:   sqlite3_stmt *s;
 2064:   const char **values;
 2065:   int i;
 2066:   int rc;
 2067: 
 2068:   *pValues = NULL;
 2069: 
 2070:   rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s);
 2071:   if( rc!=SQLITE_OK ) return rc;
 2072: 
 2073:   rc = sqlite3_bind_int64(s, 1, iRow);
 2074:   if( rc!=SQLITE_OK ) return rc;
 2075: 
 2076:   rc = sqlite3_step(s);
 2077:   if( rc!=SQLITE_ROW ) return rc;
 2078: 
 2079:   values = (const char **) sqlite3_malloc(v->nColumn * sizeof(const char *));
 2080:   for(i=0; i<v->nColumn; ++i){
 2081:     if( sqlite3_column_type(s, i)==SQLITE_NULL ){
 2082:       values[i] = NULL;
 2083:     }else{
 2084:       values[i] = string_dup((char*)sqlite3_column_text(s, i));
 2085:     }
 2086:   }
 2087: 
 2088:   /* We expect only one row.  We must execute another sqlite3_step()
 2089:    * to complete the iteration; otherwise the table will remain locked. */
 2090:   rc = sqlite3_step(s);
 2091:   if( rc==SQLITE_DONE ){
 2092:     *pValues = values;
 2093:     return SQLITE_OK;
 2094:   }
 2095: 
 2096:   freeStringArray(v->nColumn, values);
 2097:   return rc;
 2098: }
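
/* Editor's note: an illustrative (hypothetical) caller sketch, not
** part of the original module:
**
**   const char **azValue;
**   int rc = content_select(v, iDocid, &azValue);
**   if( rc==SQLITE_OK ){
**     ... read azValue[0..v->nColumn-1]; entries may be NULL ...
**     freeStringArray(v->nColumn, azValue);
**   }
*/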
 2099: 
 2100: /* delete from %_content where rowid = [iRow] */
 2101: static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){
 2102:   sqlite3_stmt *s;
 2103:   int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s);
 2104:   if( rc!=SQLITE_OK ) return rc;
 2105: 
 2106:   rc = sqlite3_bind_int64(s, 1, iRow);
 2107:   if( rc!=SQLITE_OK ) return rc;
 2108: 
 2109:   return sql_single_step(s);
 2110: }
 2111: 
 2112: /* Returns SQLITE_ROW if any rows exist in %_content, SQLITE_DONE if
 2113: ** no rows exist, or an error code on failure.
 2114: */
 2115: static int content_exists(fulltext_vtab *v){
 2116:   sqlite3_stmt *s;
 2117:   int rc = sql_get_statement(v, CONTENT_EXISTS_STMT, &s);
 2118:   if( rc!=SQLITE_OK ) return rc;
 2119: 
 2120:   rc = sqlite3_step(s);
 2121:   if( rc!=SQLITE_ROW ) return rc;
 2122: 
 2123:   /* We expect only one row.  We must execute another sqlite3_step()
 2124:    * to complete the iteration; otherwise the table will remain locked. */
 2125:   rc = sqlite3_step(s);
 2126:   if( rc==SQLITE_DONE ) return SQLITE_ROW;
 2127:   if( rc==SQLITE_ROW ) return SQLITE_ERROR;
 2128:   return rc;
 2129: }
 2130: 
 2131: /* insert into %_segments values ([pData])
 2132: **   returns assigned rowid in *piBlockid
 2133: */
 2134: static int block_insert(fulltext_vtab *v, const char *pData, int nData,
 2135:                         sqlite_int64 *piBlockid){
 2136:   sqlite3_stmt *s;
 2137:   int rc = sql_get_statement(v, BLOCK_INSERT_STMT, &s);
 2138:   if( rc!=SQLITE_OK ) return rc;
 2139: 
 2140:   rc = sqlite3_bind_blob(s, 1, pData, nData, SQLITE_STATIC);
 2141:   if( rc!=SQLITE_OK ) return rc;
 2142: 
 2143:   rc = sqlite3_step(s);
 2144:   if( rc==SQLITE_ROW ) return SQLITE_ERROR;
 2145:   if( rc!=SQLITE_DONE ) return rc;
 2146: 
 2147:   *piBlockid = sqlite3_last_insert_rowid(v->db);
 2148:   return SQLITE_OK;
 2149: }
 2150: 
 2151: /* delete from %_segments
 2152: **   where rowid between [iStartBlockid] and [iEndBlockid]
 2153: **
 2154: ** Deletes the given range of blocks, inclusive.  This is used to
 2155: ** delete the blocks which form a segment.
 2156: */
 2157: static int block_delete(fulltext_vtab *v,
 2158:                         sqlite_int64 iStartBlockid, sqlite_int64 iEndBlockid){
 2159:   sqlite3_stmt *s;
 2160:   int rc = sql_get_statement(v, BLOCK_DELETE_STMT, &s);
 2161:   if( rc!=SQLITE_OK ) return rc;
 2162: 
 2163:   rc = sqlite3_bind_int64(s, 1, iStartBlockid);
 2164:   if( rc!=SQLITE_OK ) return rc;
 2165: 
 2166:   rc = sqlite3_bind_int64(s, 2, iEndBlockid);
 2167:   if( rc!=SQLITE_OK ) return rc;
 2168: 
 2169:   return sql_single_step(s);
 2170: }
 2171: 
 2172: /* Returns SQLITE_ROW with *pidx set to the maximum segment idx found
 2173: ** at iLevel.  Returns SQLITE_DONE if there are no segments at
 2174: ** iLevel.  Otherwise returns an error.
 2175: */
 2176: static int segdir_max_index(fulltext_vtab *v, int iLevel, int *pidx){
 2177:   sqlite3_stmt *s;
 2178:   int rc = sql_get_statement(v, SEGDIR_MAX_INDEX_STMT, &s);
 2179:   if( rc!=SQLITE_OK ) return rc;
 2180: 
 2181:   rc = sqlite3_bind_int(s, 1, iLevel);
 2182:   if( rc!=SQLITE_OK ) return rc;
 2183: 
 2184:   rc = sqlite3_step(s);
 2185:   /* Should always get at least one row due to how max() works. */
 2186:   if( rc==SQLITE_DONE ) return SQLITE_DONE;
 2187:   if( rc!=SQLITE_ROW ) return rc;
 2188: 
 2189:   /* NULL means that there were no inputs to max(). */
 2190:   if( SQLITE_NULL==sqlite3_column_type(s, 0) ){
 2191:     rc = sqlite3_step(s);
 2192:     if( rc==SQLITE_ROW ) return SQLITE_ERROR;
 2193:     return rc;
 2194:   }
 2195: 
 2196:   *pidx = sqlite3_column_int(s, 0);
 2197: 
 2198:   /* We expect only one row.  We must execute another sqlite3_step()
 2199:    * to complete the iteration; otherwise the table will remain locked. */
 2200:   rc = sqlite3_step(s);
 2201:   if( rc==SQLITE_ROW ) return SQLITE_ERROR;
 2202:   if( rc!=SQLITE_DONE ) return rc;
 2203:   return SQLITE_ROW;
 2204: }
 2205: 
 2206: /* insert into %_segdir values (
 2207: **   [iLevel], [idx],
 2208: **   [iStartBlockid], [iLeavesEndBlockid], [iEndBlockid],
 2209: **   [pRootData]
 2210: ** )
 2211: */
 2212: static int segdir_set(fulltext_vtab *v, int iLevel, int idx,
 2213:                       sqlite_int64 iStartBlockid,
 2214:                       sqlite_int64 iLeavesEndBlockid,
 2215:                       sqlite_int64 iEndBlockid,
 2216:                       const char *pRootData, int nRootData){
 2217:   sqlite3_stmt *s;
 2218:   int rc = sql_get_statement(v, SEGDIR_SET_STMT, &s);
 2219:   if( rc!=SQLITE_OK ) return rc;
 2220: 
 2221:   rc = sqlite3_bind_int(s, 1, iLevel);
 2222:   if( rc!=SQLITE_OK ) return rc;
 2223: 
 2224:   rc = sqlite3_bind_int(s, 2, idx);
 2225:   if( rc!=SQLITE_OK ) return rc;
 2226: 
 2227:   rc = sqlite3_bind_int64(s, 3, iStartBlockid);
 2228:   if( rc!=SQLITE_OK ) return rc;
 2229: 
 2230:   rc = sqlite3_bind_int64(s, 4, iLeavesEndBlockid);
 2231:   if( rc!=SQLITE_OK ) return rc;
 2232: 
 2233:   rc = sqlite3_bind_int64(s, 5, iEndBlockid);
 2234:   if( rc!=SQLITE_OK ) return rc;
 2235: 
 2236:   rc = sqlite3_bind_blob(s, 6, pRootData, nRootData, SQLITE_STATIC);
 2237:   if( rc!=SQLITE_OK ) return rc;
 2238: 
 2239:   return sql_single_step(s);
 2240: }
 2241: 
 2242: /* Queries %_segdir for the block span of the segments in level
 2243: ** iLevel.  Returns SQLITE_DONE if there are no blocks for iLevel,
 2244: ** SQLITE_ROW if there are blocks, else an error.
 2245: */
 2246: static int segdir_span(fulltext_vtab *v, int iLevel,
 2247:                        sqlite_int64 *piStartBlockid,
 2248:                        sqlite_int64 *piEndBlockid){
 2249:   sqlite3_stmt *s;
 2250:   int rc = sql_get_statement(v, SEGDIR_SPAN_STMT, &s);
 2251:   if( rc!=SQLITE_OK ) return rc;
 2252: 
 2253:   rc = sqlite3_bind_int(s, 1, iLevel);
 2254:   if( rc!=SQLITE_OK ) return rc;
 2255: 
 2256:   rc = sqlite3_step(s);
 2257:   if( rc==SQLITE_DONE ) return SQLITE_DONE;  /* Should never happen */
 2258:   if( rc!=SQLITE_ROW ) return rc;
 2259: 
 2260:   /* This happens if all segments at this level are entirely inline. */
 2261:   if( SQLITE_NULL==sqlite3_column_type(s, 0) ){
 2262:     /* We expect only one row.  We must execute another sqlite3_step()
 2263:      * to complete the iteration; otherwise the table will remain locked. */
 2264:     int rc2 = sqlite3_step(s);
 2265:     if( rc2==SQLITE_ROW ) return SQLITE_ERROR;
 2266:     return rc2;
 2267:   }
 2268: 
 2269:   *piStartBlockid = sqlite3_column_int64(s, 0);
 2270:   *piEndBlockid = sqlite3_column_int64(s, 1);
 2271: 
 2272:   /* We expect only one row.  We must execute another sqlite3_step()
 2273:    * to complete the iteration; otherwise the table will remain locked. */
 2274:   rc = sqlite3_step(s);
 2275:   if( rc==SQLITE_ROW ) return SQLITE_ERROR;
 2276:   if( rc!=SQLITE_DONE ) return rc;
 2277:   return SQLITE_ROW;
 2278: }
 2279: 
 2280: /* Delete the segment blocks and segment directory records for all
 2281: ** segments at iLevel.
 2282: */
 2283: static int segdir_delete(fulltext_vtab *v, int iLevel){
 2284:   sqlite3_stmt *s;
 2285:   sqlite_int64 iStartBlockid, iEndBlockid;
 2286:   int rc = segdir_span(v, iLevel, &iStartBlockid, &iEndBlockid);
 2287:   if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ) return rc;
 2288: 
 2289:   if( rc==SQLITE_ROW ){
 2290:     rc = block_delete(v, iStartBlockid, iEndBlockid);
 2291:     if( rc!=SQLITE_OK ) return rc;
 2292:   }
 2293: 
 2294:   /* Delete the segment directory itself. */
 2295:   rc = sql_get_statement(v, SEGDIR_DELETE_STMT, &s);
 2296:   if( rc!=SQLITE_OK ) return rc;
 2297: 
 2298:   rc = sqlite3_bind_int64(s, 1, iLevel);
 2299:   if( rc!=SQLITE_OK ) return rc;
 2300: 
 2301:   return sql_single_step(s);
 2302: }
 2303: 
 2304: /* Delete the entire fts index.  Returns SQLITE_OK on success, or the
 2305: ** relevant error code on failure.
 2306: */
 2307: static int segdir_delete_all(fulltext_vtab *v){
 2308:   sqlite3_stmt *s;
 2309:   int rc = sql_get_statement(v, SEGDIR_DELETE_ALL_STMT, &s);
 2310:   if( rc!=SQLITE_OK ) return rc;
 2311: 
 2312:   rc = sql_single_step(s);
 2313:   if( rc!=SQLITE_OK ) return rc;
 2314: 
 2315:   rc = sql_get_statement(v, BLOCK_DELETE_ALL_STMT, &s);
 2316:   if( rc!=SQLITE_OK ) return rc;
 2317: 
 2318:   return sql_single_step(s);
 2319: }
 2320: 
 2321: /* Returns SQLITE_OK with *pnSegments set to the number of entries in
 2322: ** %_segdir and *piMaxLevel set to the highest level which has a
 2323: ** segment.  Otherwise returns the SQLite error which caused failure.
 2324: */
 2325: static int segdir_count(fulltext_vtab *v, int *pnSegments, int *piMaxLevel){
 2326:   sqlite3_stmt *s;
 2327:   int rc = sql_get_statement(v, SEGDIR_COUNT_STMT, &s);
 2328:   if( rc!=SQLITE_OK ) return rc;
 2329: 
 2330:   rc = sqlite3_step(s);
 2331:   /* TODO(shess): This case should not be possible?  Should stronger
 2332:   ** measures be taken if it happens?
 2333:   */
 2334:   if( rc==SQLITE_DONE ){
 2335:     *pnSegments = 0;
 2336:     *piMaxLevel = 0;
 2337:     return SQLITE_OK;
 2338:   }
 2339:   if( rc!=SQLITE_ROW ) return rc;
 2340: 
 2341:   *pnSegments = sqlite3_column_int(s, 0);
 2342:   *piMaxLevel = sqlite3_column_int(s, 1);
 2343: 
 2344:   /* We expect only one row.  We must execute another sqlite3_step()
 2345:    * to complete the iteration; otherwise the table will remain locked. */
 2346:   rc = sqlite3_step(s);
 2347:   if( rc==SQLITE_DONE ) return SQLITE_OK;
 2348:   if( rc==SQLITE_ROW ) return SQLITE_ERROR;
 2349:   return rc;
 2350: }
 2351: 
 2352: /* TODO(shess) clearPendingTerms() is far down the file because
 2353: ** writeZeroSegment() is far down the file because LeafWriter is far
 2354: ** down the file.  Consider refactoring the code to move the non-vtab
 2355: ** code above the vtab code so that we don't need this forward
 2356: ** reference.
 2357: */
 2358: static int clearPendingTerms(fulltext_vtab *v);
 2359: 
 2360: /*
 2361: ** Free the memory used to contain a fulltext_vtab structure.
 2362: */
 2363: static void fulltext_vtab_destroy(fulltext_vtab *v){
 2364:   int iStmt, i;
 2365: 
 2366:   TRACE(("FTS2 Destroy %p\n", v));
 2367:   for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){
 2368:     if( v->pFulltextStatements[iStmt]!=NULL ){
 2369:       sqlite3_finalize(v->pFulltextStatements[iStmt]);
 2370:       v->pFulltextStatements[iStmt] = NULL;
 2371:     }
 2372:   }
 2373: 
 2374:   for( i=0; i<MERGE_COUNT; i++ ){
 2375:     if( v->pLeafSelectStmts[i]!=NULL ){
 2376:       sqlite3_finalize(v->pLeafSelectStmts[i]);
 2377:       v->pLeafSelectStmts[i] = NULL;
 2378:     }
 2379:   }
 2380: 
 2381:   if( v->pTokenizer!=NULL ){
 2382:     v->pTokenizer->pModule->xDestroy(v->pTokenizer);
 2383:     v->pTokenizer = NULL;
 2384:   }
 2385: 
 2386:   clearPendingTerms(v);
 2387: 
 2388:   sqlite3_free(v->azColumn);
 2389:   for(i = 0; i < v->nColumn; ++i) {
 2390:     sqlite3_free(v->azContentColumn[i]);
 2391:   }
 2392:   sqlite3_free(v->azContentColumn);
 2393:   sqlite3_free(v);
 2394: }
 2395: 
 2396: /*
 2397: ** Token types for parsing the arguments to xConnect or xCreate.
 2398: */
 2399: #define TOKEN_EOF         0    /* End of file */
 2400: #define TOKEN_SPACE       1    /* Any kind of whitespace */
 2401: #define TOKEN_ID          2    /* An identifier */
 2402: #define TOKEN_STRING      3    /* A string literal */
 2403: #define TOKEN_PUNCT       4    /* A single punctuation character */
 2404: 
 2405: /*
 2406: ** If X is a character that can be used in an identifier then
 2407: ** IdChar(X) will be true.  Otherwise it is false.
 2408: **
 2409: ** For ASCII, any character with the high-order bit set is
 2410: ** allowed in an identifier.  For 7-bit characters, the
 2411: ** corresponding entry in the isIdChar[] table below must be 1.
 2412: **
 2413: ** Ticket #1066.  The SQL standard does not allow '$' in the
 2414: ** middle of identifiers.  But many SQL implementations do.
 2415: ** SQLite will allow '$' in identifiers for compatibility.
 2416: ** But the feature is undocumented.
 2417: */
 2418: static const char isIdChar[] = {
 2419: /* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
 2420:     0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 2x */
 2421:     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */
 2422:     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */
 2423:     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */
 2424:     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */
 2425:     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */
 2426: };
 2427: #define IdChar(C)  (((c=C)&0x80)!=0 || (c>0x1f && isIdChar[c-0x20]))
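
/* For example (editor's illustration): IdChar('x'), IdChar('_'),
** IdChar('$'), and any byte with the high bit set (>=0x80) are all
** true, while IdChar('-') and IdChar(' ') are false.  (The macro
** requires an int variable named c in the enclosing scope.)
*/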
 2428: 
 2429: 
 2430: /*
 2431: ** Return the length of the token that begins at z[0]. 
 2432: ** Store the token type in *tokenType before returning.
 2433: */
 2434: static int getToken(const char *z, int *tokenType){
 2435:   int i, c;
 2436:   switch( *z ){
 2437:     case 0: {
 2438:       *tokenType = TOKEN_EOF;
 2439:       return 0;
 2440:     }
 2441:     case ' ': case '\t': case '\n': case '\f': case '\r': {
 2442:       for(i=1; safe_isspace(z[i]); i++){}
 2443:       *tokenType = TOKEN_SPACE;
 2444:       return i;
 2445:     }
 2446:     case '`':
 2447:     case '\'':
 2448:     case '"': {
 2449:       int delim = z[0];
 2450:       for(i=1; (c=z[i])!=0; i++){
 2451:         if( c==delim ){
 2452:           if( z[i+1]==delim ){
 2453:             i++;
 2454:           }else{
 2455:             break;
 2456:           }
 2457:         }
 2458:       }
 2459:       *tokenType = TOKEN_STRING;
 2460:       return i + (c!=0);
 2461:     }
 2462:     case '[': {
 2463:       for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
 2464:       *tokenType = TOKEN_ID;
 2465:       return i;
 2466:     }
 2467:     default: {
 2468:       if( !IdChar(*z) ){
 2469:         break;
 2470:       }
 2471:       for(i=1; IdChar(z[i]); i++){}
 2472:       *tokenType = TOKEN_ID;
 2473:       return i;
 2474:     }
 2475:   }
 2476:   *tokenType = TOKEN_PUNCT;
 2477:   return 1;
 2478: }
 2479: 
 2480: /*
 2481: ** A token extracted from a string is an instance of the following
 2482: ** structure.
 2483: */
 2484: typedef struct Token {
 2485:   const char *z;       /* Pointer to token text.  Not '\000' terminated */
 2486:   short int n;         /* Length of the token text in bytes. */
 2487: } Token;
 2488: 
 2489: /*
 2490: ** Given an input string (which is really one of the argv[] parameters
 2491: ** passed into xConnect or xCreate), split the string up into tokens.
 2492: ** Return an array of pointers to '\000' terminated strings, one string
 2493: ** for each non-whitespace token.
 2494: **
 2495: ** The returned array is terminated by a single NULL pointer.
 2496: **
 2497: ** Space to hold the returned array is obtained from a single
 2498: ** sqlite3_malloc() and should be freed via sqlite3_free().
 2499: ** The individual strings within the token list are all a part of
 2500: ** the single memory allocation and will all be freed at once.
 2501: */
 2502: static char **tokenizeString(const char *z, int *pnToken){
 2503:   int nToken = 0;
 2504:   Token *aToken = sqlite3_malloc( strlen(z) * sizeof(aToken[0]) );
 2505:   int n = 1;
 2506:   int e, i;
 2507:   int totalSize = 0;
 2508:   char **azToken;
 2509:   char *zCopy;
 2510:   while( n>0 ){
 2511:     n = getToken(z, &e);
 2512:     if( e!=TOKEN_SPACE ){
 2513:       aToken[nToken].z = z;
 2514:       aToken[nToken].n = n;
 2515:       nToken++;
 2516:       totalSize += n+1;
 2517:     }
 2518:     z += n;
 2519:   }
 2520:   azToken = (char**)sqlite3_malloc( nToken*sizeof(char*) + totalSize );
 2521:   zCopy = (char*)&azToken[nToken];
 2522:   nToken--;
 2523:   for(i=0; i<nToken; i++){
 2524:     azToken[i] = zCopy;
 2525:     n = aToken[i].n;
 2526:     memcpy(zCopy, aToken[i].z, n);
 2527:     zCopy[n] = 0;
 2528:     zCopy += n+1;
 2529:   }
 2530:   azToken[nToken] = 0;
 2531:   sqlite3_free(aToken);
 2532:   *pnToken = nToken;
 2533:   return azToken;
 2534: }
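
/* For example (editor's illustration):
**
**   tokenizeString("tokenize simple", &n)
**     returns {"tokenize", "simple", NULL} with n==2, while
**   tokenizeString("delimiters ( '[' )", &n)
**     returns {"delimiters", "(", "'['", ")", NULL} with n==4.
**
** Punctuation and still-quoted strings are retained here; they are
** stripped later by tokenListToIdList() and dequoteString().
*/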
 2535: 
 2536: /*
 2537: ** Convert an SQL-style quoted string into a normal string by removing
 2538: ** the quote characters.  The conversion is done in-place.  If the
 2539: ** input does not begin with a quote character, then this routine
 2540: ** is a no-op.
 2541: **
 2542: ** Examples:
 2543: **
 2544: **     "abc"   becomes   abc
 2545: **     'xyz'   becomes   xyz
 2546: **     [pqr]   becomes   pqr
 2547: **     `mno`   becomes   mno
 2548: */
 2549: static void dequoteString(char *z){
 2550:   int quote;
 2551:   int i, j;
 2552:   if( z==0 ) return;
 2553:   quote = z[0];
 2554:   switch( quote ){
 2555:     case '\'':  break;
 2556:     case '"':   break;
 2557:     case '`':   break;                /* For MySQL compatibility */
 2558:     case '[':   quote = ']';  break;  /* For MS SqlServer compatibility */
 2559:     default:    return;
 2560:   }
 2561:   for(i=1, j=0; z[i]; i++){
 2562:     if( z[i]==quote ){
 2563:       if( z[i+1]==quote ){
 2564:         z[j++] = quote;
 2565:         i++;
 2566:       }else{
 2567:         z[j++] = 0;
 2568:         break;
 2569:       }
 2570:     }else{
 2571:       z[j++] = z[i];
 2572:     }
 2573:   }
 2574: }
 2575: 
 2576: /*
 2577: ** The input azIn is a NULL-terminated list of tokens.  Remove the first
 2578: ** token and all punctuation tokens.  Remove the quotes from
 2579: ** around string literal tokens.
 2580: **
 2581: ** Example:
 2582: **
 2583: **     input:      tokenize chinese ( 'simplified' , 'mixed' )
 2584: **     output:     chinese simplified mixed
 2585: **
 2586: ** Another example:
 2587: **
 2588: **     input:      delimiters ( '[' , ']' , '...' )
 2589: **     output:     [ ] ...
 2590: */
 2591: static void tokenListToIdList(char **azIn){
 2592:   int i, j;
 2593:   if( azIn ){
 2594:     for(i=0, j=-1; azIn[i]; i++){
 2595:       if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){
 2596:         dequoteString(azIn[i]);
 2597:         if( j>=0 ){
 2598:           azIn[j] = azIn[i];
 2599:         }
 2600:         j++;
 2601:       }
 2602:     }
 2603:     azIn[j] = 0;
 2604:   }
 2605: }
 2606: 
 2607: 
 2608: /*
 2609: ** Find the first alphanumeric token in the string zIn.  Null-terminate
 2610: ** this token, remove any quotation marks, and return a pointer to
 2611: ** the result.
 2612: */
 2613: static char *firstToken(char *zIn, char **pzTail){
 2614:   int n, ttype;
 2615:   while(1){
 2616:     n = getToken(zIn, &ttype);
 2617:     if( ttype==TOKEN_SPACE ){
 2618:       zIn += n;
 2619:     }else if( ttype==TOKEN_EOF ){
 2620:       *pzTail = zIn;
 2621:       return 0;
 2622:     }else{
 2623:       zIn[n] = 0;
 2624:       *pzTail = &zIn[1];
 2625:       dequoteString(zIn);
 2626:       return zIn;
 2627:     }
 2628:   }
 2629:   /*NOTREACHED*/
 2630: }
 2631: 
 2632: /* Return true if...
 2633: **
 2634: **   *  s begins with the string t, ignoring case
 2635: **   *  s is longer than t
 2636: **   *  The first character of s beyond t is neither alphanumeric nor '_'
 2637: ** 
 2638: ** Ignore leading space in *s.
 2639: **
 2640: ** To put it another way, return true if the first token of
 2641: ** s[] is t[].
 2642: */
 2643: static int startsWith(const char *s, const char *t){
 2644:   while( safe_isspace(*s) ){ s++; }
 2645:   while( *t ){
 2646:     if( safe_tolower(*s++)!=safe_tolower(*t++) ) return 0;
 2647:   }
 2648:   return *s!='_' && !safe_isalnum(*s);
 2649: }
 2650: 
 2651: /*
 2652: ** An instance of this structure defines the "spec" of a
 2653: ** full text index.  This structure is populated by parseSpec
 2654: ** and used by fulltextConnect and fulltextCreate.
 2655: */
 2656: typedef struct TableSpec {
 2657:   const char *zDb;         /* Logical database name */
 2658:   const char *zName;       /* Name of the full-text index */
 2659:   int nColumn;             /* Number of columns to be indexed */
 2660:   char **azColumn;         /* Original names of columns to be indexed */
 2661:   char **azContentColumn;  /* Column names for %_content */
 2662:   char **azTokenizer;      /* Name of tokenizer and its arguments */
 2663: } TableSpec;
 2664: 
 2665: /*
 2666: ** Reclaim all of the memory used by a TableSpec
 2667: */
 2668: static void clearTableSpec(TableSpec *p) {
 2669:   sqlite3_free(p->azColumn);
 2670:   sqlite3_free(p->azContentColumn);
 2671:   sqlite3_free(p->azTokenizer);
 2672: }
 2673: 
 2674: /* Parse a CREATE VIRTUAL TABLE statement, which looks like this:
 2675:  *
 2676:  * CREATE VIRTUAL TABLE email
 2677:  *        USING fts2(subject, body, tokenize mytokenizer(myarg))
 2678:  *
 2679:  * We return parsed information in a TableSpec structure.
 2680:  * 
 2681:  */
 2682: static int parseSpec(TableSpec *pSpec, int argc, const char *const*argv,
 2683:                      char**pzErr){
 2684:   int i, n;
 2685:   char *z, *zDummy;
 2686:   char **azArg;
 2687:   const char *zTokenizer = 0;    /* argv[] entry describing the tokenizer */
 2688: 
 2689:   assert( argc>=3 );
 2690:   /* Current interface:
 2691:   ** argv[0] - module name
 2692:   ** argv[1] - database name
 2693:   ** argv[2] - table name
 2694:   ** argv[3..] - columns, optionally followed by tokenizer specification
 2695:   **             and snippet delimiters specification.
 2696:   */
 2697: 
 2698:   /* Make a copy of the complete argv[][] array in a single allocation.
 2699:   ** The argv[][] array is read-only and transient.  We can write to the
 2700:   ** copy in order to modify things and the copy is persistent.
 2701:   */
 2702:   CLEAR(pSpec);
 2703:   for(i=n=0; i<argc; i++){
 2704:     n += strlen(argv[i]) + 1;
 2705:   }
 2706:   azArg = sqlite3_malloc( sizeof(char*)*argc + n );
 2707:   if( azArg==0 ){
 2708:     return SQLITE_NOMEM;
 2709:   }
 2710:   z = (char*)&azArg[argc];
 2711:   for(i=0; i<argc; i++){
 2712:     azArg[i] = z;
 2713:     strcpy(z, argv[i]);
 2714:     z += strlen(z)+1;
 2715:   }
 2716: 
 2717:   /* Identify the column names and the tokenizer and delimiter arguments
 2718:   ** in the argv[][] array.
 2719:   */
 2720:   pSpec->zDb = azArg[1];
 2721:   pSpec->zName = azArg[2];
 2722:   pSpec->nColumn = 0;
 2723:   pSpec->azColumn = azArg;
 2724:   zTokenizer = "tokenize simple";
 2725:   for(i=3; i<argc; ++i){
 2726:     if( startsWith(azArg[i],"tokenize") ){
 2727:       zTokenizer = azArg[i];
 2728:     }else{
 2729:       z = azArg[pSpec->nColumn] = firstToken(azArg[i], &zDummy);
 2730:       pSpec->nColumn++;
 2731:     }
 2732:   }
 2733:   if( pSpec->nColumn==0 ){
 2734:     azArg[0] = "content";
 2735:     pSpec->nColumn = 1;
 2736:   }
 2737: 
 2738:   /*
 2739:   ** Construct the list of content column names.
 2740:   **
 2741:   ** Each content column name will be of the form cNNAAAA
 2742:   ** where NN is the column number and AAAA is the sanitized
 2743:   ** column name.  "sanitized" means that special characters are
 2744:   ** converted to "_".  The cNN prefix guarantees that all column
 2745:   ** names are unique.
 2746:   **
 2747:   ** The AAAA suffix is not strictly necessary.  It is included
 2748:   ** for the convenience of people who might examine the generated
 2749:   ** %_content table and wonder what the columns are used for.
 2750:   */
 2751:   pSpec->azContentColumn = sqlite3_malloc( pSpec->nColumn * sizeof(char *) );
 2752:   if( pSpec->azContentColumn==0 ){
 2753:     clearTableSpec(pSpec);
 2754:     return SQLITE_NOMEM;
 2755:   }
 2756:   for(i=0; i<pSpec->nColumn; i++){
 2757:     char *p;
 2758:     pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
 2759:     for (p = pSpec->azContentColumn[i]; *p ; ++p) {
 2760:       if( !safe_isalnum(*p) ) *p = '_';
 2761:     }
 2762:   }
 2763: 
 2764:   /*
 2765:   ** Parse the tokenizer specification string.
 2766:   */
 2767:   pSpec->azTokenizer = tokenizeString(zTokenizer, &n);
 2768:   tokenListToIdList(pSpec->azTokenizer);
 2769: 
 2770:   return SQLITE_OK;
 2771: }
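
/* For illustration (editor's example): given
**
**   CREATE VIRTUAL TABLE email USING fts2(subject, body)
**
** parseSpec() produces nColumn==2, azColumn=={"subject", "body"},
** azContentColumn=={"c0subject", "c1body"}, and (since no tokenize
** clause appears) azTokenizer=={"simple", NULL}.
*/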
 2772: 
 2773: /*
 2774: ** Generate a CREATE TABLE statement that describes the schema of
 2775: ** the virtual table.  Return a pointer to this schema string.
 2776: **
 2777: ** Space is obtained from sqlite3_mprintf() and should be freed
 2778: ** using sqlite3_free().
 2779: */
 2780: static char *fulltextSchema(
 2781:   int nColumn,                  /* Number of columns */
 2782:   const char *const* azColumn,  /* List of columns */
 2783:   const char *zTableName        /* Name of the table */
 2784: ){
 2785:   int i;
 2786:   char *zSchema, *zNext;
 2787:   const char *zSep = "(";
 2788:   zSchema = sqlite3_mprintf("CREATE TABLE x");
 2789:   for(i=0; i<nColumn; i++){
 2790:     zNext = sqlite3_mprintf("%s%s%Q", zSchema, zSep, azColumn[i]);
 2791:     sqlite3_free(zSchema);
 2792:     zSchema = zNext;
 2793:     zSep = ",";
 2794:   }
 2795:   zNext = sqlite3_mprintf("%s,%Q)", zSchema, zTableName);
 2796:   sqlite3_free(zSchema);
 2797:   return zNext;
 2798: }
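
/* For the fts2(subject, body) example, fulltextSchema() returns
** (editor's illustration):
**
**   CREATE TABLE x('subject','body','email')
**
** The final column, named after the table itself, is the extra
** column matched by table-wide "... WHERE email MATCH ..." queries.
*/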
 2799: 
 2800: /*
 2801: ** Build a new sqlite3_vtab structure that will describe the
 2802: ** fulltext index defined by spec.
 2803: */
 2804: static int constructVtab(
 2805:   sqlite3 *db,              /* The SQLite database connection */
 2806:   fts2Hash *pHash,          /* Hash table containing tokenizers */
 2807:   TableSpec *spec,          /* Parsed spec information from parseSpec() */
 2808:   sqlite3_vtab **ppVTab,    /* Write the resulting vtab structure here */
 2809:   char **pzErr              /* Write any error message here */
 2810: ){
 2811:   int rc;
 2812:   int n;
 2813:   fulltext_vtab *v = 0;
 2814:   const sqlite3_tokenizer_module *m = NULL;
 2815:   char *schema;
 2816: 
 2817:   char const *zTok;         /* Name of tokenizer to use for this fts table */
 2818:   int nTok;                 /* Length of zTok, including nul terminator */
 2819: 
 2820:   v = (fulltext_vtab *) sqlite3_malloc(sizeof(fulltext_vtab));
 2821:   if( v==0 ) return SQLITE_NOMEM;
 2822:   CLEAR(v);
 2823:   /* sqlite will initialize v->base */
 2824:   v->db = db;
 2825:   v->zDb = spec->zDb;       /* Freed when azColumn is freed */
 2826:   v->zName = spec->zName;   /* Freed when azColumn is freed */
 2827:   v->nColumn = spec->nColumn;
 2828:   v->azContentColumn = spec->azContentColumn;
 2829:   spec->azContentColumn = 0;
 2830:   v->azColumn = spec->azColumn;
 2831:   spec->azColumn = 0;
 2832: 
 2833:   if( spec->azTokenizer==0 ){
 2834:     return SQLITE_NOMEM;
 2835:   }
 2836: 
 2837:   zTok = spec->azTokenizer[0]; 
 2838:   if( !zTok ){
 2839:     zTok = "simple";
 2840:   }
 2841:   nTok = strlen(zTok)+1;
 2842: 
 2843:   m = (sqlite3_tokenizer_module *)sqlite3Fts2HashFind(pHash, zTok, nTok);
 2844:   if( !m ){
 2845:     *pzErr = sqlite3_mprintf("unknown tokenizer: %s", spec->azTokenizer[0]);
 2846:     rc = SQLITE_ERROR;
 2847:     goto err;
 2848:   }
 2849: 
 2850:   for(n=0; spec->azTokenizer[n]; n++){}
 2851:   if( n ){
 2852:     rc = m->xCreate(n-1, (const char*const*)&spec->azTokenizer[1],
 2853:                     &v->pTokenizer);
 2854:   }else{
 2855:     rc = m->xCreate(0, 0, &v->pTokenizer);
 2856:   }
 2857:   if( rc!=SQLITE_OK ) goto err;
 2858:   v->pTokenizer->pModule = m;
 2859: 
 2860:   /* TODO: verify the existence of backing tables foo_content, foo_segments, foo_segdir */
 2861: 
 2862:   schema = fulltextSchema(v->nColumn, (const char*const*)v->azColumn,
 2863:                           spec->zName);
 2864:   rc = sqlite3_declare_vtab(db, schema);
 2865:   sqlite3_free(schema);
 2866:   if( rc!=SQLITE_OK ) goto err;
 2867: 
 2868:   memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements));
 2869: 
 2870:   /* Indicate that the buffer is not live. */
 2871:   v->nPendingData = -1;
 2872: 
 2873:   *ppVTab = &v->base;
 2874:   TRACE(("FTS2 Connect %p\n", v));
 2875: 
 2876:   return rc;
 2877: 
 2878: err:
 2879:   fulltext_vtab_destroy(v);
 2880:   return rc;
 2881: }
 2882: 
 2883: static int fulltextConnect(
 2884:   sqlite3 *db,
 2885:   void *pAux,
 2886:   int argc, const char *const*argv,
 2887:   sqlite3_vtab **ppVTab,
 2888:   char **pzErr
 2889: ){
 2890:   TableSpec spec;
 2891:   int rc = parseSpec(&spec, argc, argv, pzErr);
 2892:   if( rc!=SQLITE_OK ) return rc;
 2893: 
 2894:   rc = constructVtab(db, (fts2Hash *)pAux, &spec, ppVTab, pzErr);
 2895:   clearTableSpec(&spec);
 2896:   return rc;
 2897: }
 2898: 
 2899: /* The %_content table holds the text of each document, with
 2900: ** the rowid used as the docid.
 2901: */
 2902: /* TODO(shess) This comment needs elaboration to match the updated
 2903: ** code.  Work it into the top-of-file comment at that time.
 2904: */
 2905: static int fulltextCreate(sqlite3 *db, void *pAux,
 2906:                           int argc, const char * const *argv,
 2907:                           sqlite3_vtab **ppVTab, char **pzErr){
 2908:   int rc;
 2909:   TableSpec spec;
 2910:   StringBuffer schema;
 2911:   TRACE(("FTS2 Create\n"));
 2912: 
 2913:   rc = parseSpec(&spec, argc, argv, pzErr);
 2914:   if( rc!=SQLITE_OK ) return rc;
 2915: 
 2916:   initStringBuffer(&schema);
 2917:   append(&schema, "CREATE TABLE %_content(");
 2918:   appendList(&schema, spec.nColumn, spec.azContentColumn);
 2919:   append(&schema, ")");
 2920:   rc = sql_exec(db, spec.zDb, spec.zName, stringBufferData(&schema));
 2921:   stringBufferDestroy(&schema);
 2922:   if( rc!=SQLITE_OK ) goto out;
 2923: 
 2924:   rc = sql_exec(db, spec.zDb, spec.zName,
 2925:                 "create table %_segments(block blob);");
 2926:   if( rc!=SQLITE_OK ) goto out;
 2927: 
 2928:   rc = sql_exec(db, spec.zDb, spec.zName,
 2929:                 "create table %_segdir("
 2930:                 "  level integer,"
 2931:                 "  idx integer,"
 2932:                 "  start_block integer,"
 2933:                 "  leaves_end_block integer,"
 2934:                 "  end_block integer,"
 2935:                 "  root blob,"
 2936:                 "  primary key(level, idx)"
 2937:                 ");");
 2938:   if( rc!=SQLITE_OK ) goto out;
 2939: 
 2940:   rc = constructVtab(db, (fts2Hash *)pAux, &spec, ppVTab, pzErr);
 2941: 
 2942: out:
 2943:   clearTableSpec(&spec);
 2944:   return rc;
 2945: }
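
/* For illustration, a hypothetical fts2 table and the backing tables that
** fulltextCreate() above would create for it (a sketch; the %_content
** column names come from parseSpec() and are elided here):
**
**   CREATE VIRTUAL TABLE t USING fts2(title, body);
**
**   -- backing tables, with % replaced by the table name "t":
**   CREATE TABLE t_content(...);            -- one column per user column
**   CREATE TABLE t_segments(block blob);
**   CREATE TABLE t_segdir(
**     level integer, idx integer,
**     start_block integer, leaves_end_block integer, end_block integer,
**     root blob,
**     primary key(level, idx)
**   );
*/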
 2946: 
 2947: /* Decide how to handle an SQL query. */
 2948: static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
 2949:   int i;
 2950:   TRACE(("FTS2 BestIndex\n"));
 2951: 
 2952:   for(i=0; i<pInfo->nConstraint; ++i){
 2953:     const struct sqlite3_index_constraint *pConstraint;
 2954:     pConstraint = &pInfo->aConstraint[i];
 2955:     if( pConstraint->usable ) {
 2956:       if( pConstraint->iColumn==-1 &&
 2957:           pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ ){
 2958:         pInfo->idxNum = QUERY_ROWID;      /* lookup by rowid */
 2959:         TRACE(("FTS2 QUERY_ROWID\n"));
 2960:       } else if( pConstraint->iColumn>=0 &&
 2961:                  pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH ){
 2962:         /* full-text search */
 2963:         pInfo->idxNum = QUERY_FULLTEXT + pConstraint->iColumn;
 2964:         TRACE(("FTS2 QUERY_FULLTEXT %d\n", pConstraint->iColumn));
 2965:       } else continue;
 2966: 
 2967:       pInfo->aConstraintUsage[i].argvIndex = 1;
 2968:       pInfo->aConstraintUsage[i].omit = 1;
 2969: 
 2970:       /* An arbitrary value for now.
 2971:        * TODO: Perhaps rowid matches should be considered cheaper than
 2972:        * full-text searches. */
 2973:       pInfo->estimatedCost = 1.0;   
 2974: 
 2975:       return SQLITE_OK;
 2976:     }
 2977:   }
 2978:   pInfo->idxNum = QUERY_GENERIC;
 2979:   return SQLITE_OK;
 2980: }
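
/* A sketch of how usable constraints map to idxNum, using the hypothetical
** table t(title, body) from above (the SQL is illustrative only):
**
**   SELECT * FROM t WHERE rowid = 5;        -- QUERY_ROWID
**   SELECT * FROM t WHERE body MATCH 'x';   -- QUERY_FULLTEXT + 1
**   SELECT * FROM t;                        -- QUERY_GENERIC (full scan)
**
** The offset added to QUERY_FULLTEXT is the 0-indexed number of the
** column on the left-hand side of MATCH; body is column 1 in this example.
*/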
 2981: 
 2982: static int fulltextDisconnect(sqlite3_vtab *pVTab){
 2983:   TRACE(("FTS2 Disconnect %p\n", pVTab));
 2984:   fulltext_vtab_destroy((fulltext_vtab *)pVTab);
 2985:   return SQLITE_OK;
 2986: }
 2987: 
 2988: static int fulltextDestroy(sqlite3_vtab *pVTab){
 2989:   fulltext_vtab *v = (fulltext_vtab *)pVTab;
 2990:   int rc;
 2991: 
 2992:   TRACE(("FTS2 Destroy %p\n", pVTab));
 2993:   rc = sql_exec(v->db, v->zDb, v->zName,
 2994:                 "drop table if exists %_content;"
 2995:                 "drop table if exists %_segments;"
 2996:                 "drop table if exists %_segdir;"
 2997:                 );
 2998:   if( rc!=SQLITE_OK ) return rc;
 2999: 
 3000:   fulltext_vtab_destroy((fulltext_vtab *)pVTab);
 3001:   return SQLITE_OK;
 3002: }
 3003: 
 3004: static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
 3005:   fulltext_cursor *c;
 3006: 
 3007:   c = (fulltext_cursor *) sqlite3_malloc(sizeof(fulltext_cursor));
 3008:   if( c ){
 3009:     memset(c, 0, sizeof(fulltext_cursor));
 3010:     /* sqlite will initialize c->base */
 3011:     *ppCursor = &c->base;
 3012:     TRACE(("FTS2 Open %p: %p\n", pVTab, c));
 3013:     return SQLITE_OK;
 3014:   }else{
 3015:     return SQLITE_NOMEM;
 3016:   }
 3017: }
 3018: 
 3019: 
 3020: /* Free all of the dynamically allocated memory held by *q
 3021: */
 3022: static void queryClear(Query *q){
 3023:   int i;
 3024:   for(i = 0; i < q->nTerms; ++i){
 3025:     sqlite3_free(q->pTerms[i].pTerm);
 3026:   }
 3027:   sqlite3_free(q->pTerms);
 3028:   CLEAR(q);
 3029: }
 3030: 
 3031: /* Free all of the dynamically allocated memory held by the
 3032: ** Snippet
 3033: */
 3034: static void snippetClear(Snippet *p){
 3035:   sqlite3_free(p->aMatch);
 3036:   sqlite3_free(p->zOffset);
 3037:   sqlite3_free(p->zSnippet);
 3038:   CLEAR(p);
 3039: }
 3040: /*
 3041: ** Append a single entry to the p->aMatch[] log.
 3042: */
 3043: static void snippetAppendMatch(
 3044:   Snippet *p,               /* Append the entry to this snippet */
 3045:   int iCol, int iTerm,      /* The column and query term */
 3046:   int iStart, int nByte     /* Offset and size of the match */
 3047: ){
 3048:   int i;
 3049:   struct snippetMatch *pMatch;
  if( p->nMatch+1>=p->nAlloc ){
    int nAlloc = p->nAlloc*2 + 10;
    struct snippetMatch *aNew =
        sqlite3_realloc(p->aMatch, nAlloc*sizeof(p->aMatch[0]));
    if( aNew==0 ){
      /* On OOM keep the matches collected so far rather than leaking
      ** the old array. */
      return;
    }
    p->aMatch = aNew;
    p->nAlloc = nAlloc;
  }
 3059:   i = p->nMatch++;
 3060:   pMatch = &p->aMatch[i];
 3061:   pMatch->iCol = iCol;
 3062:   pMatch->iTerm = iTerm;
 3063:   pMatch->iStart = iStart;
 3064:   pMatch->nByte = nByte;
 3065: }
 3066: 
 3067: /*
 3068: ** Sizing information for the circular buffer used in snippetOffsetsOfColumn()
 3069: */
 3070: #define FTS2_ROTOR_SZ   (32)
 3071: #define FTS2_ROTOR_MASK (FTS2_ROTOR_SZ-1)
 3072: 
 3073: /*
 3074: ** Add entries to pSnippet->aMatch[] for every match that occurs against
 3075: ** document zDoc[0..nDoc-1] which is stored in column iColumn.
 3076: */
 3077: static void snippetOffsetsOfColumn(
 3078:   Query *pQuery,
 3079:   Snippet *pSnippet,
 3080:   int iColumn,
 3081:   const char *zDoc,
 3082:   int nDoc
 3083: ){
 3084:   const sqlite3_tokenizer_module *pTModule;  /* The tokenizer module */
 3085:   sqlite3_tokenizer *pTokenizer;             /* The specific tokenizer */
 3086:   sqlite3_tokenizer_cursor *pTCursor;        /* Tokenizer cursor */
 3087:   fulltext_vtab *pVtab;                /* The full text index */
 3088:   int nColumn;                         /* Number of columns in the index */
 3089:   const QueryTerm *aTerm;              /* Query string terms */
 3090:   int nTerm;                           /* Number of query string terms */  
 3091:   int i, j;                            /* Loop counters */
 3092:   int rc;                              /* Return code */
 3093:   unsigned int match, prevMatch;       /* Phrase search bitmasks */
 3094:   const char *zToken;                  /* Next token from the tokenizer */
 3095:   int nToken;                          /* Size of zToken */
 3096:   int iBegin, iEnd, iPos;              /* Offsets of beginning and end */
 3097: 
 3098:   /* The following variables keep a circular buffer of the last
 3099:   ** few tokens */
 3100:   unsigned int iRotor = 0;             /* Index of current token */
 3101:   int iRotorBegin[FTS2_ROTOR_SZ];      /* Beginning offset of token */
 3102:   int iRotorLen[FTS2_ROTOR_SZ];        /* Length of token */
 3103: 
 3104:   pVtab = pQuery->pFts;
 3105:   nColumn = pVtab->nColumn;
 3106:   pTokenizer = pVtab->pTokenizer;
 3107:   pTModule = pTokenizer->pModule;
 3108:   rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor);
 3109:   if( rc ) return;
 3110:   pTCursor->pTokenizer = pTokenizer;
 3111:   aTerm = pQuery->pTerms;
 3112:   nTerm = pQuery->nTerms;
 3113:   if( nTerm>=FTS2_ROTOR_SZ ){
 3114:     nTerm = FTS2_ROTOR_SZ - 1;
 3115:   }
 3116:   prevMatch = 0;
 3117:   while(1){
 3118:     rc = pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos);
 3119:     if( rc ) break;
 3120:     iRotorBegin[iRotor&FTS2_ROTOR_MASK] = iBegin;
 3121:     iRotorLen[iRotor&FTS2_ROTOR_MASK] = iEnd-iBegin;
 3122:     match = 0;
 3123:     for(i=0; i<nTerm; i++){
 3124:       int iCol;
 3125:       iCol = aTerm[i].iColumn;
 3126:       if( iCol>=0 && iCol<nColumn && iCol!=iColumn ) continue;
 3127:       if( aTerm[i].nTerm>nToken ) continue;
 3128:       if( !aTerm[i].isPrefix && aTerm[i].nTerm<nToken ) continue;
 3129:       assert( aTerm[i].nTerm<=nToken );
 3130:       if( memcmp(aTerm[i].pTerm, zToken, aTerm[i].nTerm) ) continue;
 3131:       if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue;
 3132:       match |= 1<<i;
 3133:       if( i==nTerm-1 || aTerm[i+1].iPhrase==1 ){
 3134:         for(j=aTerm[i].iPhrase-1; j>=0; j--){
 3135:           int k = (iRotor-j) & FTS2_ROTOR_MASK;
 3136:           snippetAppendMatch(pSnippet, iColumn, i-j,
 3137:                 iRotorBegin[k], iRotorLen[k]);
 3138:         }
 3139:       }
 3140:     }
 3141:     prevMatch = match<<1;
 3142:     iRotor++;
 3143:   }
 3144:   pTModule->xClose(pTCursor);  
 3145: }
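
/* An illustrative walk-through of the phrase logic above: for the phrase
** query "cold war", aTerm[0] is "cold" (iPhrase=1) and aTerm[1] is "war"
** (iPhrase=2).  When a token matches "cold", bit 0 is set in match;
** prevMatch = match<<1 then carries that bit into position 1, so on the
** next token "war" is only accepted if it immediately follows "cold", at
** which point both tokens are pulled from the rotor buffer and logged via
** snippetAppendMatch().
*/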
 3146: 
 3147: 
 3148: /*
 3149: ** Compute all offsets for the current row of the query.  
 3150: ** If the offsets have already been computed, this routine is a no-op.
 3151: */
 3152: static void snippetAllOffsets(fulltext_cursor *p){
 3153:   int nColumn;
 3154:   int iColumn, i;
 3155:   int iFirst, iLast;
 3156:   fulltext_vtab *pFts;
 3157: 
 3158:   if( p->snippet.nMatch ) return;
 3159:   if( p->q.nTerms==0 ) return;
 3160:   pFts = p->q.pFts;
 3161:   nColumn = pFts->nColumn;
 3162:   iColumn = (p->iCursorType - QUERY_FULLTEXT);
 3163:   if( iColumn<0 || iColumn>=nColumn ){
 3164:     iFirst = 0;
 3165:     iLast = nColumn-1;
 3166:   }else{
 3167:     iFirst = iColumn;
 3168:     iLast = iColumn;
 3169:   }
 3170:   for(i=iFirst; i<=iLast; i++){
 3171:     const char *zDoc;
 3172:     int nDoc;
 3173:     zDoc = (const char*)sqlite3_column_text(p->pStmt, i+1);
 3174:     nDoc = sqlite3_column_bytes(p->pStmt, i+1);
 3175:     snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc);
 3176:   }
 3177: }
 3178: 
 3179: /*
 3180: ** Convert the information in the aMatch[] array of the snippet
 3181: ** into the string zOffset[0..nOffset-1].
 3182: */
 3183: static void snippetOffsetText(Snippet *p){
 3184:   int i;
 3185:   int cnt = 0;
 3186:   StringBuffer sb;
 3187:   char zBuf[200];
 3188:   if( p->zOffset ) return;
 3189:   initStringBuffer(&sb);
 3190:   for(i=0; i<p->nMatch; i++){
 3191:     struct snippetMatch *pMatch = &p->aMatch[i];
 3192:     zBuf[0] = ' ';
 3193:     sqlite3_snprintf(sizeof(zBuf)-1, &zBuf[cnt>0], "%d %d %d %d",
 3194:         pMatch->iCol, pMatch->iTerm, pMatch->iStart, pMatch->nByte);
 3195:     append(&sb, zBuf);
 3196:     cnt++;
 3197:   }
 3198:   p->zOffset = stringBufferData(&sb);
 3199:   p->nOffset = stringBufferLength(&sb);
 3200: }
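
/* The offsets string is a space-separated list of one quadruple per match:
**
**   <column> <term> <byte offset> <byte length>
**
** For example (hypothetical values) "1 0 12 4 1 1 17 3" records a match of
** term 0 at bytes 12..15 and of term 1 at bytes 17..19, both in column 1.
*/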
 3201: 
/*
** zDoc[0..nDoc-1] is a phrase of text.  aMatch[0..nMatch-1] are a set
** of matching words, some of which might be in zDoc.  zDoc is column
** number iCol.
**
** iBreak is the suggested spot in zDoc where we could begin or end an
** excerpt.  Return a value similar to iBreak but possibly adjusted
** to be a little left or right so that the break point is better.
*/
 3211: static int wordBoundary(
 3212:   int iBreak,                   /* The suggested break point */
 3213:   const char *zDoc,             /* Document text */
 3214:   int nDoc,                     /* Number of bytes in zDoc[] */
 3215:   struct snippetMatch *aMatch,  /* Matching words */
 3216:   int nMatch,                   /* Number of entries in aMatch[] */
 3217:   int iCol                      /* The column number for zDoc[] */
 3218: ){
 3219:   int i;
 3220:   if( iBreak<=10 ){
 3221:     return 0;
 3222:   }
 3223:   if( iBreak>=nDoc-10 ){
 3224:     return nDoc;
 3225:   }
 3226:   for(i=0; i<nMatch && aMatch[i].iCol<iCol; i++){}
 3227:   while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; }
 3228:   if( i<nMatch ){
 3229:     if( aMatch[i].iStart<iBreak+10 ){
 3230:       return aMatch[i].iStart;
 3231:     }
 3232:     if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
 3233:       return aMatch[i-1].iStart;
 3234:     }
 3235:   }
 3236:   for(i=1; i<=10; i++){
 3237:     if( safe_isspace(zDoc[iBreak-i]) ){
 3238:       return iBreak - i + 1;
 3239:     }
 3240:     if( safe_isspace(zDoc[iBreak+i]) ){
 3241:       return iBreak + i + 1;
 3242:     }
 3243:   }
 3244:   return iBreak;
 3245: }
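
/* Worked example with hypothetical numbers: given iBreak=100 in a
** 1000-byte document, a match spanning bytes 95..105 moves the break to
** byte 95 so the excerpt does not split a matched word.  Failing that,
** zDoc is scanned up to 10 bytes on either side of iBreak and the break
** lands just past the nearest whitespace; if none is found, iBreak is
** returned unchanged.
*/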
 3246: 
 3247: 
 3248: 
 3249: /*
 3250: ** Allowed values for Snippet.aMatch[].snStatus
 3251: */
 3252: #define SNIPPET_IGNORE  0   /* It is ok to omit this match from the snippet */
 3253: #define SNIPPET_DESIRED 1   /* We want to include this match in the snippet */
 3254: 
 3255: /*
 3256: ** Generate the text of a snippet.
 3257: */
 3258: static void snippetText(
 3259:   fulltext_cursor *pCursor,   /* The cursor we need the snippet for */
 3260:   const char *zStartMark,     /* Markup to appear before each match */
 3261:   const char *zEndMark,       /* Markup to appear after each match */
 3262:   const char *zEllipsis       /* Ellipsis mark */
 3263: ){
 3264:   int i, j;
 3265:   struct snippetMatch *aMatch;
 3266:   int nMatch;
 3267:   int nDesired;
 3268:   StringBuffer sb;
 3269:   int tailCol;
 3270:   int tailOffset;
 3271:   int iCol;
 3272:   int nDoc;
 3273:   const char *zDoc;
 3274:   int iStart, iEnd;
 3275:   int tailEllipsis = 0;
 3276:   int iMatch;
 3277:   
 3278: 
 3279:   sqlite3_free(pCursor->snippet.zSnippet);
 3280:   pCursor->snippet.zSnippet = 0;
 3281:   aMatch = pCursor->snippet.aMatch;
 3282:   nMatch = pCursor->snippet.nMatch;
 3283:   initStringBuffer(&sb);
 3284: 
 3285:   for(i=0; i<nMatch; i++){
 3286:     aMatch[i].snStatus = SNIPPET_IGNORE;
 3287:   }
 3288:   nDesired = 0;
 3289:   for(i=0; i<pCursor->q.nTerms; i++){
 3290:     for(j=0; j<nMatch; j++){
 3291:       if( aMatch[j].iTerm==i ){
 3292:         aMatch[j].snStatus = SNIPPET_DESIRED;
 3293:         nDesired++;
 3294:         break;
 3295:       }
 3296:     }
 3297:   }
 3298: 
 3299:   iMatch = 0;
 3300:   tailCol = -1;
 3301:   tailOffset = 0;
 3302:   for(i=0; i<nMatch && nDesired>0; i++){
 3303:     if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue;
 3304:     nDesired--;
 3305:     iCol = aMatch[i].iCol;
 3306:     zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1);
 3307:     nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1);
 3308:     iStart = aMatch[i].iStart - 40;
 3309:     iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol);
 3310:     if( iStart<=10 ){
 3311:       iStart = 0;
 3312:     }
 3313:     if( iCol==tailCol && iStart<=tailOffset+20 ){
 3314:       iStart = tailOffset;
 3315:     }
 3316:     if( (iCol!=tailCol && tailCol>=0) || iStart!=tailOffset ){
 3317:       trimWhiteSpace(&sb);
 3318:       appendWhiteSpace(&sb);
 3319:       append(&sb, zEllipsis);
 3320:       appendWhiteSpace(&sb);
 3321:     }
 3322:     iEnd = aMatch[i].iStart + aMatch[i].nByte + 40;
 3323:     iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol);
 3324:     if( iEnd>=nDoc-10 ){
 3325:       iEnd = nDoc;
 3326:       tailEllipsis = 0;
 3327:     }else{
 3328:       tailEllipsis = 1;
 3329:     }
 3330:     while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; }
 3331:     while( iStart<iEnd ){
 3332:       while( iMatch<nMatch && aMatch[iMatch].iStart<iStart
 3333:              && aMatch[iMatch].iCol<=iCol ){
 3334:         iMatch++;
 3335:       }
 3336:       if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd
 3337:              && aMatch[iMatch].iCol==iCol ){
 3338:         nappend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart);
 3339:         iStart = aMatch[iMatch].iStart;
 3340:         append(&sb, zStartMark);
 3341:         nappend(&sb, &zDoc[iStart], aMatch[iMatch].nByte);
 3342:         append(&sb, zEndMark);
 3343:         iStart += aMatch[iMatch].nByte;
 3344:         for(j=iMatch+1; j<nMatch; j++){
 3345:           if( aMatch[j].iTerm==aMatch[iMatch].iTerm
 3346:               && aMatch[j].snStatus==SNIPPET_DESIRED ){
 3347:             nDesired--;
 3348:             aMatch[j].snStatus = SNIPPET_IGNORE;
 3349:           }
 3350:         }
 3351:       }else{
 3352:         nappend(&sb, &zDoc[iStart], iEnd - iStart);
 3353:         iStart = iEnd;
 3354:       }
 3355:     }
 3356:     tailCol = iCol;
 3357:     tailOffset = iEnd;
 3358:   }
 3359:   trimWhiteSpace(&sb);
 3360:   if( tailEllipsis ){
 3361:     appendWhiteSpace(&sb);
 3362:     append(&sb, zEllipsis);
 3363:   }
 3364:   pCursor->snippet.zSnippet = stringBufferData(&sb);
 3365:   pCursor->snippet.nSnippet = stringBufferLength(&sb);
 3366: }
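
/* The finished snippet wraps each desired match in zStartMark/zEndMark and
** joins discontiguous regions with zEllipsis, so a result might look like
** (hypothetical markers "<b>", "</b>", "..."):
**
**   ...the <b>quick</b> brown fox ... over the lazy <b>dog</b>...
*/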
 3367: 
 3368: 
 3369: /*
 3370: ** Close the cursor.  For additional information see the documentation
 3371: ** on the xClose method of the virtual table interface.
 3372: */
 3373: static int fulltextClose(sqlite3_vtab_cursor *pCursor){
 3374:   fulltext_cursor *c = (fulltext_cursor *) pCursor;
 3375:   TRACE(("FTS2 Close %p\n", c));
 3376:   sqlite3_finalize(c->pStmt);
 3377:   queryClear(&c->q);
 3378:   snippetClear(&c->snippet);
 3379:   if( c->result.nData!=0 ) dlrDestroy(&c->reader);
 3380:   dataBufferDestroy(&c->result);
 3381:   sqlite3_free(c);
 3382:   return SQLITE_OK;
 3383: }
 3384: 
 3385: static int fulltextNext(sqlite3_vtab_cursor *pCursor){
 3386:   fulltext_cursor *c = (fulltext_cursor *) pCursor;
 3387:   int rc;
 3388: 
 3389:   TRACE(("FTS2 Next %p\n", pCursor));
 3390:   snippetClear(&c->snippet);
 3391:   if( c->iCursorType < QUERY_FULLTEXT ){
 3392:     /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
 3393:     rc = sqlite3_step(c->pStmt);
 3394:     switch( rc ){
 3395:       case SQLITE_ROW:
 3396:         c->eof = 0;
 3397:         return SQLITE_OK;
 3398:       case SQLITE_DONE:
 3399:         c->eof = 1;
 3400:         return SQLITE_OK;
 3401:       default:
 3402:         c->eof = 1;
 3403:         return rc;
 3404:     }
 3405:   } else {  /* full-text query */
 3406:     rc = sqlite3_reset(c->pStmt);
 3407:     if( rc!=SQLITE_OK ) return rc;
 3408: 
 3409:     if( c->result.nData==0 || dlrAtEnd(&c->reader) ){
 3410:       c->eof = 1;
 3411:       return SQLITE_OK;
 3412:     }
 3413:     rc = sqlite3_bind_int64(c->pStmt, 1, dlrDocid(&c->reader));
 3414:     dlrStep(&c->reader);
 3415:     if( rc!=SQLITE_OK ) return rc;
 3416:     /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
 3417:     rc = sqlite3_step(c->pStmt);
 3418:     if( rc==SQLITE_ROW ){   /* the case we expect */
 3419:       c->eof = 0;
 3420:       return SQLITE_OK;
 3421:     }
 3422:     /* an error occurred; abort */
 3423:     return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
 3424:   }
 3425: }
 3426: 
 3427: 
 3428: /* TODO(shess) If we pushed LeafReader to the top of the file, or to
 3429: ** another file, term_select() could be pushed above
 3430: ** docListOfTerm().
 3431: */
 3432: static int termSelect(fulltext_vtab *v, int iColumn,
 3433:                       const char *pTerm, int nTerm, int isPrefix,
 3434:                       DocListType iType, DataBuffer *out);
 3435: 
/* Return a DocList corresponding to the query term *pQTerm.  If *pQTerm
** is the first term of a phrase query, go ahead and evaluate the phrase
** query and return the doclist for the entire phrase query.
**
** The resulting DL_DOCIDS doclist is stored in pResult, which is
** overwritten.
*/
 3443: static int docListOfTerm(
 3444:   fulltext_vtab *v,   /* The full text index */
 3445:   int iColumn,        /* column to restrict to.  No restriction if >=nColumn */
 3446:   QueryTerm *pQTerm,  /* Term we are looking for, or 1st term of a phrase */
 3447:   DataBuffer *pResult /* Write the result here */
 3448: ){
 3449:   DataBuffer left, right, new;
 3450:   int i, rc;
 3451: 
 3452:   /* No phrase search if no position info. */
 3453:   assert( pQTerm->nPhrase==0 || DL_DEFAULT!=DL_DOCIDS );
 3454: 
 3455:   /* This code should never be called with buffered updates. */
 3456:   assert( v->nPendingData<0 );
 3457: 
 3458:   dataBufferInit(&left, 0);
 3459:   rc = termSelect(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pQTerm->isPrefix,
 3460:                   0<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS, &left);
 3461:   if( rc ) return rc;
 3462:   for(i=1; i<=pQTerm->nPhrase && left.nData>0; i++){
 3463:     dataBufferInit(&right, 0);
 3464:     rc = termSelect(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm,
 3465:                     pQTerm[i].isPrefix, DL_POSITIONS, &right);
 3466:     if( rc ){
 3467:       dataBufferDestroy(&left);
 3468:       return rc;
 3469:     }
 3470:     dataBufferInit(&new, 0);
 3471:     docListPhraseMerge(left.pData, left.nData, right.pData, right.nData,
 3472:                        i<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS, &new);
 3473:     dataBufferDestroy(&left);
 3474:     dataBufferDestroy(&right);
 3475:     left = new;
 3476:   }
 3477:   *pResult = left;
 3478:   return SQLITE_OK;
 3479: }
 3480: 
 3481: /* Add a new term pTerm[0..nTerm-1] to the query *q.
 3482: */
static void queryAdd(Query *q, const char *pTerm, int nTerm){
  QueryTerm *t, *pNew;
  pNew = sqlite3_realloc(q->pTerms, (q->nTerms+1) * sizeof(q->pTerms[0]));
  if( pNew==0 ) return;     /* on OOM keep the old terms; do not leak them */
  q->pTerms = pNew;
  ++q->nTerms;
  t = &q->pTerms[q->nTerms - 1];
  CLEAR(t);
  t->pTerm = sqlite3_malloc(nTerm+1);
  if( t->pTerm==0 ){        /* drop this term on OOM */
    q->nTerms--;
    return;
  }
  memcpy(t->pTerm, pTerm, nTerm);
  t->pTerm[nTerm] = 0;
  t->nTerm = nTerm;
  t->isOr = q->nextIsOr;
  t->isPrefix = 0;
  q->nextIsOr = 0;
  t->iColumn = q->nextColumn;
  q->nextColumn = q->dfltColumn;
}
 3503: 
 3504: /*
 3505: ** Check to see if the string zToken[0...nToken-1] matches any
 3506: ** column name in the virtual table.   If it does,
 3507: ** return the zero-indexed column number.  If not, return -1.
 3508: */
 3509: static int checkColumnSpecifier(
 3510:   fulltext_vtab *pVtab,    /* The virtual table */
 3511:   const char *zToken,      /* Text of the token */
 3512:   int nToken               /* Number of characters in the token */
 3513: ){
 3514:   int i;
 3515:   for(i=0; i<pVtab->nColumn; i++){
 3516:     if( memcmp(pVtab->azColumn[i], zToken, nToken)==0
 3517:         && pVtab->azColumn[i][nToken]==0 ){
 3518:       return i;
 3519:     }
 3520:   }
 3521:   return -1;
 3522: }
 3523: 
/*
** Parse the text at pSegment[0..nSegment-1].  Add additional terms
** to the query being assembled in pQuery.
**
** inPhrase is true if pSegment[0..nSegment-1] is contained within
** double-quotes.  If inPhrase is true, then the first term
** is marked with the number of terms in the phrase less one, and
** OR and "-" syntax is ignored.  If inPhrase is false, then every
** term found is marked with nPhrase=0, and OR and "-" syntax is significant.
*/
 3534: static int tokenizeSegment(
 3535:   sqlite3_tokenizer *pTokenizer,          /* The tokenizer to use */
 3536:   const char *pSegment, int nSegment,     /* Query expression being parsed */
 3537:   int inPhrase,                           /* True if within "..." */
 3538:   Query *pQuery                           /* Append results here */
 3539: ){
 3540:   const sqlite3_tokenizer_module *pModule = pTokenizer->pModule;
 3541:   sqlite3_tokenizer_cursor *pCursor;
 3542:   int firstIndex = pQuery->nTerms;
 3543:   int iCol;
 3544:   int nTerm = 1;
 3545:   
 3546:   int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor);
 3547:   if( rc!=SQLITE_OK ) return rc;
 3548:   pCursor->pTokenizer = pTokenizer;
 3549: 
 3550:   while( 1 ){
 3551:     const char *pToken;
 3552:     int nToken, iBegin, iEnd, iPos;
 3553: 
 3554:     rc = pModule->xNext(pCursor,
 3555:                         &pToken, &nToken,
 3556:                         &iBegin, &iEnd, &iPos);
 3557:     if( rc!=SQLITE_OK ) break;
 3558:     if( !inPhrase &&
 3559:         pSegment[iEnd]==':' &&
 3560:          (iCol = checkColumnSpecifier(pQuery->pFts, pToken, nToken))>=0 ){
 3561:       pQuery->nextColumn = iCol;
 3562:       continue;
 3563:     }
 3564:     if( !inPhrase && pQuery->nTerms>0 && nToken==2
 3565:          && pSegment[iBegin]=='O' && pSegment[iBegin+1]=='R' ){
 3566:       pQuery->nextIsOr = 1;
 3567:       continue;
 3568:     }
 3569:     queryAdd(pQuery, pToken, nToken);
 3570:     if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){
 3571:       pQuery->pTerms[pQuery->nTerms-1].isNot = 1;
 3572:     }
 3573:     if( iEnd<nSegment && pSegment[iEnd]=='*' ){
 3574:       pQuery->pTerms[pQuery->nTerms-1].isPrefix = 1;
 3575:     }
 3576:     pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm;
 3577:     if( inPhrase ){
 3578:       nTerm++;
 3579:     }
 3580:   }
 3581: 
 3582:   if( inPhrase && pQuery->nTerms>firstIndex ){
 3583:     pQuery->pTerms[firstIndex].nPhrase = pQuery->nTerms - firstIndex - 1;
 3584:   }
 3585: 
 3586:   return pModule->xClose(pCursor);
 3587: }
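
/* For illustration, the hypothetical query segment
**
**   title:sqlite OR fts2 -broken sqli*
**
** tokenizes into: "sqlite" restricted to the title column, "fts2" with
** isOr=1, "broken" with isNot=1, and "sqli" with isPrefix=1.  Inside
** double-quotes those operators are ignored and successive terms get
** iPhrase numbers 1, 2, 3, ...
*/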
 3588: 
/* Parse a query string, yielding a Query object pQuery.
**
** The calling function will need to call queryClear() to clean up
** the dynamically allocated memory held by pQuery.
*/
 3594: static int parseQuery(
 3595:   fulltext_vtab *v,        /* The fulltext index */
 3596:   const char *zInput,      /* Input text of the query string */
 3597:   int nInput,              /* Size of the input text */
 3598:   int dfltColumn,          /* Default column of the index to match against */
 3599:   Query *pQuery            /* Write the parse results here. */
 3600: ){
 3601:   int iInput, inPhrase = 0;
 3602: 
 3603:   if( zInput==0 ) nInput = 0;
 3604:   if( nInput<0 ) nInput = strlen(zInput);
 3605:   pQuery->nTerms = 0;
 3606:   pQuery->pTerms = NULL;
 3607:   pQuery->nextIsOr = 0;
 3608:   pQuery->nextColumn = dfltColumn;
 3609:   pQuery->dfltColumn = dfltColumn;
 3610:   pQuery->pFts = v;
 3611: 
 3612:   for(iInput=0; iInput<nInput; ++iInput){
 3613:     int i;
 3614:     for(i=iInput; i<nInput && zInput[i]!='"'; ++i){}
 3615:     if( i>iInput ){
 3616:       tokenizeSegment(v->pTokenizer, zInput+iInput, i-iInput, inPhrase,
 3617:                        pQuery);
 3618:     }
 3619:     iInput = i;
 3620:     if( i<nInput ){
 3621:       assert( zInput[i]=='"' );
 3622:       inPhrase = !inPhrase;
 3623:     }
 3624:   }
 3625: 
 3626:   if( inPhrase ){
 3627:     /* unmatched quote */
 3628:     queryClear(pQuery);
 3629:     return SQLITE_ERROR;
 3630:   }
 3631:   return SQLITE_OK;
 3632: }
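
/* parseQuery() splits the input on '"' characters and hands each piece to
** tokenizeSegment() with inPhrase toggled at every quote, so the
** hypothetical input
**
**   one "two three" four
**
** is processed as three segments: [one ] outside a phrase, [two three]
** inside one, and [ four] outside again.  An odd number of quotes leaves
** inPhrase set and fails with SQLITE_ERROR.
*/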
 3633: 
 3634: /* TODO(shess) Refactor the code to remove this forward decl. */
 3635: static int flushPendingTerms(fulltext_vtab *v);
 3636: 
/* Perform a full-text query using the search expression in
** zInput[0..nInput-1].  Return a list of matching documents
** in pResult.
**
** Queries must match column iColumn, or, if iColumn>=nColumn,
** they are allowed to match against any column.
*/
 3644: static int fulltextQuery(
 3645:   fulltext_vtab *v,      /* The full text index */
 3646:   int iColumn,           /* Match against this column by default */
 3647:   const char *zInput,    /* The query string */
 3648:   int nInput,            /* Number of bytes in zInput[] */
 3649:   DataBuffer *pResult,   /* Write the result doclist here */
 3650:   Query *pQuery          /* Put parsed query string here */
 3651: ){
 3652:   int i, iNext, rc;
 3653:   DataBuffer left, right, or, new;
 3654:   int nNot = 0;
 3655:   QueryTerm *aTerm;
 3656: 
 3657:   /* TODO(shess) Instead of flushing pendingTerms, we could query for
 3658:   ** the relevant term and merge the doclist into what we receive from
 3659:   ** the database.  Wait and see if this is a common issue, first.
 3660:   **
 3661:   ** A good reason not to flush is to not generate update-related
 3662:   ** error codes from here.
 3663:   */
 3664: 
 3665:   /* Flush any buffered updates before executing the query. */
 3666:   rc = flushPendingTerms(v);
 3667:   if( rc!=SQLITE_OK ) return rc;
 3668: 
 3669:   /* TODO(shess) I think that the queryClear() calls below are not
 3670:   ** necessary, because fulltextClose() already clears the query.
 3671:   */
 3672:   rc = parseQuery(v, zInput, nInput, iColumn, pQuery);
 3673:   if( rc!=SQLITE_OK ) return rc;
 3674: 
 3675:   /* Empty or NULL queries return no results. */
 3676:   if( pQuery->nTerms==0 ){
 3677:     dataBufferInit(pResult, 0);
 3678:     return SQLITE_OK;
 3679:   }
 3680: 
 3681:   /* Merge AND terms. */
 3682:   /* TODO(shess) I think we can early-exit if( i>nNot && left.nData==0 ). */
 3683:   aTerm = pQuery->pTerms;
 3684:   for(i = 0; i<pQuery->nTerms; i=iNext){
 3685:     if( aTerm[i].isNot ){
 3686:       /* Handle all NOT terms in a separate pass */
 3687:       nNot++;
 3688:       iNext = i + aTerm[i].nPhrase+1;
 3689:       continue;
 3690:     }
 3691:     iNext = i + aTerm[i].nPhrase + 1;
 3692:     rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right);
 3693:     if( rc ){
 3694:       if( i!=nNot ) dataBufferDestroy(&left);
 3695:       queryClear(pQuery);
 3696:       return rc;
 3697:     }
 3698:     while( iNext<pQuery->nTerms && aTerm[iNext].isOr ){
 3699:       rc = docListOfTerm(v, aTerm[iNext].iColumn, &aTerm[iNext], &or);
 3700:       iNext += aTerm[iNext].nPhrase + 1;
 3701:       if( rc ){
 3702:         if( i!=nNot ) dataBufferDestroy(&left);
 3703:         dataBufferDestroy(&right);
 3704:         queryClear(pQuery);
 3705:         return rc;
 3706:       }
 3707:       dataBufferInit(&new, 0);
 3708:       docListOrMerge(right.pData, right.nData, or.pData, or.nData, &new);
 3709:       dataBufferDestroy(&right);
 3710:       dataBufferDestroy(&or);
 3711:       right = new;
 3712:     }
 3713:     if( i==nNot ){           /* first term processed. */
 3714:       left = right;
 3715:     }else{
 3716:       dataBufferInit(&new, 0);
 3717:       docListAndMerge(left.pData, left.nData, right.pData, right.nData, &new);
 3718:       dataBufferDestroy(&right);
 3719:       dataBufferDestroy(&left);
 3720:       left = new;
 3721:     }
 3722:   }
 3723: 
 3724:   if( nNot==pQuery->nTerms ){
 3725:     /* We do not yet know how to handle a query of only NOT terms */
 3726:     return SQLITE_ERROR;
 3727:   }
 3728: 
 3729:   /* Do the EXCEPT terms */
 3730:   for(i=0; i<pQuery->nTerms;  i += aTerm[i].nPhrase + 1){
 3731:     if( !aTerm[i].isNot ) continue;
 3732:     rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right);
 3733:     if( rc ){
 3734:       queryClear(pQuery);
 3735:       dataBufferDestroy(&left);
 3736:       return rc;
 3737:     }
 3738:     dataBufferInit(&new, 0);
 3739:     docListExceptMerge(left.pData, left.nData, right.pData, right.nData, &new);
 3740:     dataBufferDestroy(&right);
 3741:     dataBufferDestroy(&left);
 3742:     left = new;
 3743:   }
 3744: 
 3745:   *pResult = left;
 3746:   return rc;
 3747: }
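
/* Evaluation order above, using the hypothetical query 'a b OR c -d':
** OR binds directly to the term it follows, so the doclists for "b" and
** "c" are OR-merged first, that result is AND-merged with the doclist for
** "a", and the NOT term "d" is subtracted last in the
** docListExceptMerge() pass.
*/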
 3748: 
 3749: /*
 3750: ** This is the xFilter interface for the virtual table.  See
 3751: ** the virtual table xFilter method documentation for additional
 3752: ** information.
 3753: **
 3754: ** If idxNum==QUERY_GENERIC then do a full table scan against
 3755: ** the %_content table.
 3756: **
 3757: ** If idxNum==QUERY_ROWID then do a rowid lookup for a single entry
 3758: ** in the %_content table.
 3759: **
 3760: ** If idxNum>=QUERY_FULLTEXT then use the full text index.  The
 3761: ** column on the left-hand side of the MATCH operator is column
 3762: ** number idxNum-QUERY_FULLTEXT, 0 indexed.  argv[0] is the right-hand
 3763: ** side of the MATCH operator.
 3764: */
 3765: /* TODO(shess) Upgrade the cursor initialization and destruction to
 3766: ** account for fulltextFilter() being called multiple times on the
 3767: ** same cursor.  The current solution is very fragile.  Apply fix to
 3768: ** fts2 as appropriate.
 3769: */
 3770: static int fulltextFilter(
 3771:   sqlite3_vtab_cursor *pCursor,     /* The cursor used for this query */
 3772:   int idxNum, const char *idxStr,   /* Which indexing scheme to use */
 3773:   int argc, sqlite3_value **argv    /* Arguments for the indexing scheme */
 3774: ){
 3775:   fulltext_cursor *c = (fulltext_cursor *) pCursor;
 3776:   fulltext_vtab *v = cursor_vtab(c);
 3777:   int rc;
 3778: 
 3779:   TRACE(("FTS2 Filter %p\n",pCursor));
 3780: 
  /* If the cursor has a statement that was not prepared according to
  ** idxNum, clear it.  I believe all calls to fulltextFilter with a
  ** given cursor will have the same idxNum, but in any case it is
  ** easy to be safe.
  */
 3786:   if( c->pStmt && c->iCursorType!=idxNum ){
 3787:     sqlite3_finalize(c->pStmt);
 3788:     c->pStmt = NULL;
 3789:   }
 3790: 
 3791:   /* Get a fresh statement appropriate to idxNum. */
 3792:   /* TODO(shess): Add a prepared-statement cache in the vt structure.
 3793:   ** The cache must handle multiple open cursors.  Easier to cache the
 3794:   ** statement variants at the vt to reduce malloc/realloc/free here.
 3795:   ** Or we could have a StringBuffer variant which allowed stack
 3796:   ** construction for small values.
 3797:   */
 3798:   if( !c->pStmt ){
 3799:     char *zSql = sqlite3_mprintf("select rowid, * from %%_content %s",
 3800:                                  idxNum==QUERY_GENERIC ? "" : "where rowid=?");
 3801:     rc = sql_prepare(v->db, v->zDb, v->zName, &c->pStmt, zSql);
 3802:     sqlite3_free(zSql);
 3803:     if( rc!=SQLITE_OK ) return rc;
 3804:     c->iCursorType = idxNum;
 3805:   }else{
 3806:     sqlite3_reset(c->pStmt);
 3807:     assert( c->iCursorType==idxNum );
 3808:   }
 3809: 
 3810:   switch( idxNum ){
 3811:     case QUERY_GENERIC:
 3812:       break;
 3813: 
 3814:     case QUERY_ROWID:
 3815:       rc = sqlite3_bind_int64(c->pStmt, 1, sqlite3_value_int64(argv[0]));
 3816:       if( rc!=SQLITE_OK ) return rc;
 3817:       break;
 3818: 
 3819:     default:   /* full-text search */
 3820:     {
 3821:       const char *zQuery = (const char *)sqlite3_value_text(argv[0]);
 3822:       assert( idxNum<=QUERY_FULLTEXT+v->nColumn);
 3823:       assert( argc==1 );
 3824:       queryClear(&c->q);
 3825:       if( c->result.nData!=0 ){
 3826:         /* This case happens if the same cursor is used repeatedly. */
 3827:         dlrDestroy(&c->reader);
 3828:         dataBufferReset(&c->result);
 3829:       }else{
 3830:         dataBufferInit(&c->result, 0);
 3831:       }
 3832:       rc = fulltextQuery(v, idxNum-QUERY_FULLTEXT, zQuery, -1, &c->result, &c->q);
 3833:       if( rc!=SQLITE_OK ) return rc;
 3834:       if( c->result.nData!=0 ){
 3835:         dlrInit(&c->reader, DL_DOCIDS, c->result.pData, c->result.nData);
 3836:       }
 3837:       break;
 3838:     }
 3839:   }
 3840: 
 3841:   return fulltextNext(pCursor);
 3842: }
 3843: 
/* This is the xEof method of the virtual table.  The SQLite core
** calls this routine to find out if it has reached the end of
** a query's result set.
*/
 3848: static int fulltextEof(sqlite3_vtab_cursor *pCursor){
 3849:   fulltext_cursor *c = (fulltext_cursor *) pCursor;
 3850:   return c->eof;
 3851: }
 3852: 
 3853: /* This is the xColumn method of the virtual table.  The SQLite
 3854: ** core calls this method during a query when it needs the value
 3855: ** of a column from the virtual table.  This method needs to use
 3856: ** one of the sqlite3_result_*() routines to store the requested
 3857: ** value back in the pContext.
 3858: */
 3859: static int fulltextColumn(sqlite3_vtab_cursor *pCursor,
 3860:                           sqlite3_context *pContext, int idxCol){
 3861:   fulltext_cursor *c = (fulltext_cursor *) pCursor;
 3862:   fulltext_vtab *v = cursor_vtab(c);
 3863: 
 3864:   if( idxCol<v->nColumn ){
 3865:     sqlite3_value *pVal = sqlite3_column_value(c->pStmt, idxCol+1);
 3866:     sqlite3_result_value(pContext, pVal);
 3867:   }else if( idxCol==v->nColumn ){
 3868:     /* The extra column whose name is the same as the table.
 3869:     ** Return a blob which is a pointer to the cursor
 3870:     */
 3871:     sqlite3_result_blob(pContext, &c, sizeof(c), SQLITE_TRANSIENT);
 3872:   }
 3873:   return SQLITE_OK;
 3874: }
 3875: 
/* This is the xRowid method.  The SQLite core calls this routine to
** retrieve the rowid for the current row of the result set.  The
** rowid should be written to *pRowid.
*/
 3880: static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){
 3881:   fulltext_cursor *c = (fulltext_cursor *) pCursor;
 3882: 
 3883:   *pRowid = sqlite3_column_int64(c->pStmt, 0);
 3884:   return SQLITE_OK;
 3885: }
 3886: 
/* Add all terms in [zText] to the pendingTerms table.  If [iColumn] >= 0,
** we also store positions and offsets in the hash table using that
** column number.
*/
 3891: static int buildTerms(fulltext_vtab *v, sqlite_int64 iDocid,
 3892:                       const char *zText, int iColumn){
 3893:   sqlite3_tokenizer *pTokenizer = v->pTokenizer;
 3894:   sqlite3_tokenizer_cursor *pCursor;
 3895:   const char *pToken;
 3896:   int nTokenBytes;
 3897:   int iStartOffset, iEndOffset, iPosition;
 3898:   int rc;
 3899: 
 3900:   rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor);
 3901:   if( rc!=SQLITE_OK ) return rc;
 3902: 
 3903:   pCursor->pTokenizer = pTokenizer;
 3904:   while( SQLITE_OK==(rc=pTokenizer->pModule->xNext(pCursor,
 3905:                                                    &pToken, &nTokenBytes,
 3906:                                                    &iStartOffset, &iEndOffset,
 3907:                                                    &iPosition)) ){
 3908:     DLCollector *p;
 3909:     int nData;                   /* Size of doclist before our update. */
 3910: 
 3911:     /* Positions can't be negative; we use -1 as a terminator
 3912:      * internally.  Token can't be NULL or empty. */
 3913:     if( iPosition<0 || pToken == NULL || nTokenBytes == 0 ){
 3914:       rc = SQLITE_ERROR;
 3915:       break;
 3916:     }
 3917: 
 3918:     p = fts2HashFind(&v->pendingTerms, pToken, nTokenBytes);
 3919:     if( p==NULL ){
 3920:       nData = 0;
 3921:       p = dlcNew(iDocid, DL_DEFAULT);
 3922:       fts2HashInsert(&v->pendingTerms, pToken, nTokenBytes, p);
 3923: 
 3924:       /* Overhead for our hash table entry, the key, and the value. */
 3925:       v->nPendingData += sizeof(struct fts2HashElem)+sizeof(*p)+nTokenBytes;
 3926:     }else{
 3927:       nData = p->b.nData;
 3928:       if( p->dlw.iPrevDocid!=iDocid ) dlcNext(p, iDocid);
 3929:     }
 3930:     if( iColumn>=0 ){
 3931:       dlcAddPos(p, iColumn, iPosition, iStartOffset, iEndOffset);
 3932:     }
 3933: 
 3934:     /* Accumulate data added by dlcNew or dlcNext, and dlcAddPos. */
 3935:     v->nPendingData += p->b.nData-nData;
 3936:   }
 3937: 
 3938:   /* TODO(shess) Check return?  Should this be able to cause errors at
 3939:   ** this point?  Actually, same question about sqlite3_finalize(),
 3940:   ** though one could argue that failure there means that the data is
 3941:   ** not durable.  *ponder*
 3942:   */
 3943:   pTokenizer->pModule->xClose(pCursor);
 3944:   if( SQLITE_DONE == rc ) return SQLITE_OK;
 3945:   return rc;
 3946: }
 3947: 
 3948: /* Add doclists for all terms in [pValues] to pendingTerms table. */
 3949: static int insertTerms(fulltext_vtab *v, sqlite_int64 iRowid,
 3950:                        sqlite3_value **pValues){
 3951:   int i;
 3952:   for(i = 0; i < v->nColumn ; ++i){
 3953:     char *zText = (char*)sqlite3_value_text(pValues[i]);
 3954:     int rc = buildTerms(v, iRowid, zText, i);
 3955:     if( rc!=SQLITE_OK ) return rc;
 3956:   }
 3957:   return SQLITE_OK;
 3958: }
 3959: 
 3960: /* Add empty doclists for all terms in the given row's content to
 3961: ** pendingTerms.
 3962: */
static int deleteTerms(fulltext_vtab *v, sqlite_int64 iRowid){
  const char **pValues;
  int i, rc;

  /* TODO(shess) Should we allow such tables at all? */
  if( DL_DEFAULT==DL_DOCIDS ) return SQLITE_ERROR;

  rc = content_select(v, iRowid, &pValues);
  if( rc!=SQLITE_OK ) return rc;

  for(i = 0 ; i < v->nColumn; ++i) {
    rc = buildTerms(v, iRowid, pValues[i], -1);
    if( rc!=SQLITE_OK ) break;
  }

  freeStringArray(v->nColumn, pValues);
  return rc;   /* propagate any error from buildTerms */
}
 3981: 
 3982: /* TODO(shess) Refactor the code to remove this forward decl. */
 3983: static int initPendingTerms(fulltext_vtab *v, sqlite_int64 iDocid);
 3984: 
 3985: /* Insert a row into the %_content table; set *piRowid to be the ID of the
 3986: ** new row.  Add doclists for terms to pendingTerms.
 3987: */
 3988: static int index_insert(fulltext_vtab *v, sqlite3_value *pRequestRowid,
 3989:                         sqlite3_value **pValues, sqlite_int64 *piRowid){
 3990:   int rc;
 3991: 
 3992:   rc = content_insert(v, pRequestRowid, pValues);  /* execute an SQL INSERT */
 3993:   if( rc!=SQLITE_OK ) return rc;
 3994: 
 3995:   *piRowid = sqlite3_last_insert_rowid(v->db);
 3996:   rc = initPendingTerms(v, *piRowid);
 3997:   if( rc!=SQLITE_OK ) return rc;
 3998: 
 3999:   return insertTerms(v, *piRowid, pValues);
 4000: }
 4001: 
 4002: /* Delete a row from the %_content table; add empty doclists for terms
 4003: ** to pendingTerms.
 4004: */
 4005: static int index_delete(fulltext_vtab *v, sqlite_int64 iRow){
 4006:   int rc = initPendingTerms(v, iRow);
 4007:   if( rc!=SQLITE_OK ) return rc;
 4008: 
 4009:   rc = deleteTerms(v, iRow);
 4010:   if( rc!=SQLITE_OK ) return rc;
 4011: 
 4012:   return content_delete(v, iRow);  /* execute an SQL DELETE */
 4013: }
 4014: 
/* Update a row in the %_content table; add an empty doclist to
** pendingTerms for each term in the old row, then add position
** doclists to pendingTerms for the terms in the new data.
*/
 4019: static int index_update(fulltext_vtab *v, sqlite_int64 iRow,
 4020:                         sqlite3_value **pValues){
 4021:   int rc = initPendingTerms(v, iRow);
 4022:   if( rc!=SQLITE_OK ) return rc;
 4023: 
 4024:   /* Generate an empty doclist for each term that previously appeared in this
 4025:    * row. */
 4026:   rc = deleteTerms(v, iRow);
 4027:   if( rc!=SQLITE_OK ) return rc;
 4028: 
 4029:   rc = content_update(v, pValues, iRow);  /* execute an SQL UPDATE */
 4030:   if( rc!=SQLITE_OK ) return rc;
 4031: 
 4032:   /* Now add positions for terms which appear in the updated row. */
 4033:   return insertTerms(v, iRow, pValues);
 4034: }
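
/* Illustrative walk-through: updating a row's only column from "old text"
** to "new text" first buffers a positionless doclist entry (a delete
** marker) for "old" and "text", then adds position data for "new" and
** "text".  For "text" the marker and the new positions share a single
** docid entry in pendingTerms, so the term remains searchable.
*/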
 4035: 
 4036: /*******************************************************************/
 4037: /* InteriorWriter is used to collect terms and block references into
 4038: ** interior nodes in %_segments.  See commentary at top of file for
 4039: ** format.
 4040: */
 4041: 
 4042: /* How large interior nodes can grow. */
 4043: #define INTERIOR_MAX 2048
 4044: 
 4045: /* Minimum number of terms per interior node (except the root). This
 4046: ** prevents large terms from making the tree too skinny - must be >0
 4047: ** so that the tree always makes progress.  Note that the min tree
 4048: ** fanout will be INTERIOR_MIN_TERMS+1.
 4049: */
 4050: #define INTERIOR_MIN_TERMS 7
 4051: #if INTERIOR_MIN_TERMS<1
 4052: # error INTERIOR_MIN_TERMS must be greater than 0.
 4053: #endif
 4054: 
 4055: /* ROOT_MAX controls how much data is stored inline in the segment
 4056: ** directory.
 4057: */
 4058: /* TODO(shess) Push ROOT_MAX down to whoever is writing things.  It's
 4059: ** only here so that interiorWriterRootInfo() and leafWriterRootInfo()
 4060: ** can both see it, but if the caller passed it in, we wouldn't even
 4061: ** need a define.
 4062: */
 4063: #define ROOT_MAX 1024
 4064: #if ROOT_MAX<VARINT_MAX*2
 4065: # error ROOT_MAX must have enough space for a header.
 4066: #endif
 4067: 
 4068: /* InteriorBlock stores a linked-list of interior blocks while a lower
 4069: ** layer is being constructed.
 4070: */
 4071: typedef struct InteriorBlock {
 4072:   DataBuffer term;           /* Leftmost term in block's subtree. */
 4073:   DataBuffer data;           /* Accumulated data for the block. */
 4074:   struct InteriorBlock *next;
 4075: } InteriorBlock;
 4076: 
 4077: static InteriorBlock *interiorBlockNew(int iHeight, sqlite_int64 iChildBlock,
 4078:                                        const char *pTerm, int nTerm){
 4079:   InteriorBlock *block = sqlite3_malloc(sizeof(InteriorBlock));
 4080:   char c[VARINT_MAX+VARINT_MAX];
 4081:   int n;
 4082: 
 4083:   if( block ){
 4084:     memset(block, 0, sizeof(*block));
 4085:     dataBufferInit(&block->term, 0);
 4086:     dataBufferReplace(&block->term, pTerm, nTerm);
 4087: 
 4088:     n = putVarint(c, iHeight);
 4089:     n += putVarint(c+n, iChildBlock);
 4090:     dataBufferInit(&block->data, INTERIOR_MAX);
 4091:     dataBufferReplace(&block->data, c, n);
 4092:   }
 4093:   return block;
 4094: }
 4095: 
 4096: #ifndef NDEBUG
 4097: /* Verify that the data is readable as an interior node. */
 4098: static void interiorBlockValidate(InteriorBlock *pBlock){
 4099:   const char *pData = pBlock->data.pData;
 4100:   int nData = pBlock->data.nData;
 4101:   int n, iDummy;
 4102:   sqlite_int64 iBlockid;
 4103: 
 4104:   assert( nData>0 );
 4105:   assert( pData!=0 );
 4106:   assert( pData+nData>pData );
 4107: 
 4108:   /* Must lead with height of node as a varint(n), n>0 */
 4109:   n = getVarint32(pData, &iDummy);
 4110:   assert( n>0 );
 4111:   assert( iDummy>0 );
 4112:   assert( n<nData );
 4113:   pData += n;
 4114:   nData -= n;
 4115: 
 4116:   /* Must contain iBlockid. */
 4117:   n = getVarint(pData, &iBlockid);
 4118:   assert( n>0 );
 4119:   assert( n<=nData );
 4120:   pData += n;
 4121:   nData -= n;
 4122: 
 4123:   /* Zero or more terms of positive length */
 4124:   if( nData!=0 ){
 4125:     /* First term is not delta-encoded. */
 4126:     n = getVarint32(pData, &iDummy);
 4127:     assert( n>0 );
 4128:     assert( iDummy>0 );
 4129:     assert( n+iDummy>0);
 4130:     assert( n+iDummy<=nData );
 4131:     pData += n+iDummy;
 4132:     nData -= n+iDummy;
 4133: 
 4134:     /* Following terms delta-encoded. */
 4135:     while( nData!=0 ){
 4136:       /* Length of shared prefix. */
 4137:       n = getVarint32(pData, &iDummy);
 4138:       assert( n>0 );
 4139:       assert( iDummy>=0 );
 4140:       assert( n<nData );
 4141:       pData += n;
 4142:       nData -= n;
 4143: 
 4144:       /* Length and data of distinct suffix. */
 4145:       n = getVarint32(pData, &iDummy);
 4146:       assert( n>0 );
 4147:       assert( iDummy>0 );
 4148:       assert( n+iDummy>0);
 4149:       assert( n+iDummy<=nData );
 4150:       pData += n+iDummy;
 4151:       nData -= n+iDummy;
 4152:     }
 4153:   }
 4154: }
 4155: #define ASSERT_VALID_INTERIOR_BLOCK(x) interiorBlockValidate(x)
 4156: #else
 4157: #define ASSERT_VALID_INTERIOR_BLOCK(x) assert( 1 )
 4158: #endif
 4159: 
 4160: typedef struct InteriorWriter {
 4161:   int iHeight;                   /* from 0 at leaves. */
 4162:   InteriorBlock *first, *last;
 4163:   struct InteriorWriter *parentWriter;
 4164: 
 4165:   DataBuffer term;               /* Last term written to block "last". */
 4166:   sqlite_int64 iOpeningChildBlock; /* First child block in block "last". */
 4167: #ifndef NDEBUG
 4168:   sqlite_int64 iLastChildBlock;  /* for consistency checks. */
 4169: #endif
 4170: } InteriorWriter;
 4171: 
 4172: /* Initialize an interior node where pTerm[nTerm] marks the leftmost
 4173: ** term in the tree.  iChildBlock is the leftmost child block at the
 4174: ** next level down the tree.
 4175: */
 4176: static void interiorWriterInit(int iHeight, const char *pTerm, int nTerm,
 4177:                                sqlite_int64 iChildBlock,
 4178:                                InteriorWriter *pWriter){
 4179:   InteriorBlock *block;
 4180:   assert( iHeight>0 );
 4181:   CLEAR(pWriter);
 4182: 
 4183:   pWriter->iHeight = iHeight;
 4184:   pWriter->iOpeningChildBlock = iChildBlock;
 4185: #ifndef NDEBUG
 4186:   pWriter->iLastChildBlock = iChildBlock;
 4187: #endif
 4188:   block = interiorBlockNew(iHeight, iChildBlock, pTerm, nTerm);
 4189:   pWriter->last = pWriter->first = block;
 4190:   ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
 4191:   dataBufferInit(&pWriter->term, 0);
 4192: }
 4193: 
 4194: /* Append the child node rooted at iChildBlock to the interior node,
 4195: ** with pTerm[nTerm] as the leftmost term in iChildBlock's subtree.
 4196: */
 4197: static void interiorWriterAppend(InteriorWriter *pWriter,
 4198:                                  const char *pTerm, int nTerm,
 4199:                                  sqlite_int64 iChildBlock){
 4200:   char c[VARINT_MAX+VARINT_MAX];
 4201:   int n, nPrefix = 0;
 4202: 
 4203:   ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
 4204: 
 4205:   /* The first term written into an interior node is actually
 4206:   ** associated with the second child added (the first child was added
 4207:   ** in interiorWriterInit, or in the if clause at the bottom of this
 4208:   ** function).  That term gets encoded straight up, with nPrefix left
 4209:   ** at 0.
 4210:   */
 4211:   if( pWriter->term.nData==0 ){
 4212:     n = putVarint(c, nTerm);
 4213:   }else{
 4214:     while( nPrefix<pWriter->term.nData &&
 4215:            pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
 4216:       nPrefix++;
 4217:     }
 4218: 
 4219:     n = putVarint(c, nPrefix);
 4220:     n += putVarint(c+n, nTerm-nPrefix);
 4221:   }
 4222: 
 4223: #ifndef NDEBUG
 4224:   pWriter->iLastChildBlock++;
 4225: #endif
 4226:   assert( pWriter->iLastChildBlock==iChildBlock );
 4227: 
 4228:   /* Overflow to a new block if the new term makes the current block
 4229:   ** too big, and the current block already has enough terms.
 4230:   */
 4231:   if( pWriter->last->data.nData+n+nTerm-nPrefix>INTERIOR_MAX &&
 4232:       iChildBlock-pWriter->iOpeningChildBlock>INTERIOR_MIN_TERMS ){
 4233:     pWriter->last->next = interiorBlockNew(pWriter->iHeight, iChildBlock,
 4234:                                            pTerm, nTerm);
 4235:     pWriter->last = pWriter->last->next;
 4236:     pWriter->iOpeningChildBlock = iChildBlock;
 4237:     dataBufferReset(&pWriter->term);
 4238:   }else{
 4239:     dataBufferAppend2(&pWriter->last->data, c, n,
 4240:                       pTerm+nPrefix, nTerm-nPrefix);
 4241:     dataBufferReplace(&pWriter->term, pTerm, nTerm);
 4242:   }
 4243:   ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
 4244: }
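
/* Prefix-encoding example (illustrative): if the previous term written
** was "apple" and the next is "applesauce", nPrefix works out to 5, and
** the block receives varint(5) varint(5) "sauce" -- the shared prefix
** length, the suffix length, and the distinct suffix bytes.
*/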
 4245: 
 4246: /* Free the space used by pWriter, including the linked-list of
 4247: ** InteriorBlocks, and parentWriter, if present.
 4248: */
 4249: static int interiorWriterDestroy(InteriorWriter *pWriter){
 4250:   InteriorBlock *block = pWriter->first;
 4251: 
 4252:   while( block!=NULL ){
 4253:     InteriorBlock *b = block;
 4254:     block = block->next;
 4255:     dataBufferDestroy(&b->term);
 4256:     dataBufferDestroy(&b->data);
 4257:     sqlite3_free(b);
 4258:   }
 4259:   if( pWriter->parentWriter!=NULL ){
 4260:     interiorWriterDestroy(pWriter->parentWriter);
 4261:     sqlite3_free(pWriter->parentWriter);
 4262:   }
 4263:   dataBufferDestroy(&pWriter->term);
 4264:   SCRAMBLE(pWriter);
 4265:   return SQLITE_OK;
 4266: }
 4267: 
/* If pWriter can fit entirely in ROOT_MAX, return it as the root info
** directly, leaving *piEndBlockid unchanged.  Otherwise, flush
** pWriter to %_segments, building a new layer of interior nodes, and
** recursively ask for their root info.
*/
 4273: static int interiorWriterRootInfo(fulltext_vtab *v, InteriorWriter *pWriter,
 4274:                                   char **ppRootInfo, int *pnRootInfo,
 4275:                                   sqlite_int64 *piEndBlockid){
 4276:   InteriorBlock *block = pWriter->first;
 4277:   sqlite_int64 iBlockid = 0;
 4278:   int rc;
 4279: 
 4280:   /* If we can fit the segment inline */
 4281:   if( block==pWriter->last && block->data.nData<ROOT_MAX ){
 4282:     *ppRootInfo = block->data.pData;
 4283:     *pnRootInfo = block->data.nData;
 4284:     return SQLITE_OK;
 4285:   }
 4286: 
 4287:   /* Flush the first block to %_segments, and create a new level of
 4288:   ** interior node.
 4289:   */
 4290:   ASSERT_VALID_INTERIOR_BLOCK(block);
 4291:   rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid);
 4292:   if( rc!=SQLITE_OK ) return rc;
 4293:   *piEndBlockid = iBlockid;
 4294: 
 4295:   pWriter->parentWriter = sqlite3_malloc(sizeof(*pWriter->parentWriter));
 4296:   interiorWriterInit(pWriter->iHeight+1,
 4297:                      block->term.pData, block->term.nData,
 4298:                      iBlockid, pWriter->parentWriter);
 4299: 
 4300:   /* Flush additional blocks and append to the higher interior
 4301:   ** node.
 4302:   */
 4303:   for(block=block->next; block!=NULL; block=block->next){
 4304:     ASSERT_VALID_INTERIOR_BLOCK(block);
 4305:     rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid);
 4306:     if( rc!=SQLITE_OK ) return rc;
 4307:     *piEndBlockid = iBlockid;
 4308: 
 4309:     interiorWriterAppend(pWriter->parentWriter,
 4310:                          block->term.pData, block->term.nData, iBlockid);
 4311:   }
 4312: 
 4313:   /* Parent node gets the chance to be the root. */
 4314:   return interiorWriterRootInfo(v, pWriter->parentWriter,
 4315:                                 ppRootInfo, pnRootInfo, piEndBlockid);
 4316: }
 4317: 
 4318: /****************************************************************/
 4319: /* InteriorReader is used to read off the data from an interior node
 4320: ** (see comment at top of file for the format).
 4321: */
 4322: typedef struct InteriorReader {
 4323:   const char *pData;
 4324:   int nData;
 4325: 
 4326:   DataBuffer term;          /* previous term, for decoding term delta. */
 4327: 
 4328:   sqlite_int64 iBlockid;
 4329: } InteriorReader;
 4330: 
 4331: static void interiorReaderDestroy(InteriorReader *pReader){
 4332:   dataBufferDestroy(&pReader->term);
 4333:   SCRAMBLE(pReader);
 4334: }
 4335: 
 4336: /* TODO(shess) The assertions are great, but what if we're in NDEBUG
 4337: ** and the blob is empty or otherwise contains suspect data?
 4338: */
 4339: static void interiorReaderInit(const char *pData, int nData,
 4340:                                InteriorReader *pReader){
 4341:   int n, nTerm;
 4342: 
 4343:   /* Require at least the leading flag byte */
 4344:   assert( nData>0 );
 4345:   assert( pData[0]!='\0' );
 4346: 
 4347:   CLEAR(pReader);
 4348: 
 4349:   /* Decode the base blockid, and set the cursor to the first term. */
 4350:   n = getVarint(pData+1, &pReader->iBlockid);
 4351:   assert( 1+n<=nData );
 4352:   pReader->pData = pData+1+n;
 4353:   pReader->nData = nData-(1+n);
 4354: 
 4355:   /* A single-child interior node (such as when a leaf node was too
 4356:   ** large for the segment directory) won't have any terms.
 4357:   ** Otherwise, decode the first term.
 4358:   */
 4359:   if( pReader->nData==0 ){
 4360:     dataBufferInit(&pReader->term, 0);
 4361:   }else{
 4362:     n = getVarint32(pReader->pData, &nTerm);
 4363:     dataBufferInit(&pReader->term, nTerm);
 4364:     dataBufferReplace(&pReader->term, pReader->pData+n, nTerm);
 4365:     assert( n+nTerm<=pReader->nData );
 4366:     pReader->pData += n+nTerm;
 4367:     pReader->nData -= n+nTerm;
 4368:   }
 4369: }
 4370: 
 4371: static int interiorReaderAtEnd(InteriorReader *pReader){
 4372:   return pReader->term.nData==0;
 4373: }
 4374: 
 4375: static sqlite_int64 interiorReaderCurrentBlockid(InteriorReader *pReader){
 4376:   return pReader->iBlockid;
 4377: }
 4378: 
 4379: static int interiorReaderTermBytes(InteriorReader *pReader){
 4380:   assert( !interiorReaderAtEnd(pReader) );
 4381:   return pReader->term.nData;
 4382: }
 4383: static const char *interiorReaderTerm(InteriorReader *pReader){
 4384:   assert( !interiorReaderAtEnd(pReader) );
 4385:   return pReader->term.pData;
 4386: }
 4387: 
 4388: /* Step forward to the next term in the node. */
 4389: static void interiorReaderStep(InteriorReader *pReader){
 4390:   assert( !interiorReaderAtEnd(pReader) );
 4391: 
 4392:   /* If the last term has been read, signal eof, else construct the
 4393:   ** next term.
 4394:   */
 4395:   if( pReader->nData==0 ){
 4396:     dataBufferReset(&pReader->term);
 4397:   }else{
 4398:     int n, nPrefix, nSuffix;
 4399: 
 4400:     n = getVarint32(pReader->pData, &nPrefix);
 4401:     n += getVarint32(pReader->pData+n, &nSuffix);
 4402: 
 4403:     /* Truncate the current term and append suffix data. */
 4404:     pReader->term.nData = nPrefix;
 4405:     dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix);
 4406: 
 4407:     assert( n+nSuffix<=pReader->nData );
 4408:     pReader->pData += n+nSuffix;
 4409:     pReader->nData -= n+nSuffix;
 4410:   }
 4411:   pReader->iBlockid++;
 4412: }
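/* Worked example of the delta decoding above (illustrative data): if
** the current term is "apple" and the node encodes nPrefix=2,
** nSuffix=5, suffix "ricot", the term buffer is truncated to "ap" and
** "ricot" is appended, yielding "apricot"; the implied child blockid
** advances by one.
*/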
 4413: 
 4414: /* Compare the current term to pTerm[nTerm], returning strcmp-style
 4415: ** results.  If isPrefix, equality means equal through nTerm bytes.
 4416: */
 4417: static int interiorReaderTermCmp(InteriorReader *pReader,
 4418:                                  const char *pTerm, int nTerm, int isPrefix){
 4419:   const char *pReaderTerm = interiorReaderTerm(pReader);
 4420:   int nReaderTerm = interiorReaderTermBytes(pReader);
 4421:   int c, n = nReaderTerm<nTerm ? nReaderTerm : nTerm;
 4422: 
 4423:   if( n==0 ){
 4424:     if( nReaderTerm>0 ) return -1;
 4425:     if( nTerm>0 ) return 1;
 4426:     return 0;
 4427:   }
 4428: 
 4429:   c = memcmp(pReaderTerm, pTerm, n);
 4430:   if( c!=0 ) return c;
 4431:   if( isPrefix && n==nTerm ) return 0;
 4432:   return nReaderTerm - nTerm;
 4433: }
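/* Example (illustrative data): with reader term "category",
** interiorReaderTermCmp(pReader, "cat", 3, 1) compares only the first
** 3 bytes, finds them equal, and returns 0 because isPrefix is set.
** With isPrefix==0 the same call returns 8-3==5 (greater-than).
*/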
 4434: 
 4435: /****************************************************************/
 4436: /* LeafWriter is used to collect terms and associated doclist data
 4437: ** into leaf blocks in %_segments (see top of file for format info).
 4438: ** Expected usage is:
 4439: **
 4440: ** LeafWriter writer;
 4441: ** leafWriterInit(0, 0, &writer);
 4442: ** while( sorted_terms_left_to_process ){
 4443: **   // data is doclist data for that term.
 4444: **   rc = leafWriterStep(v, &writer, pTerm, nTerm, pData, nData);
 4445: **   if( rc!=SQLITE_OK ) goto err;
 4446: ** }
 4447: ** rc = leafWriterFinalize(v, &writer);
 4448: **err:
 4449: ** leafWriterDestroy(&writer);
 4450: ** return rc;
 4451: **
 4452: ** leafWriterStep() may write a collected leaf out to %_segments.
 4453: ** leafWriterFinalize() finishes writing any buffered data and stores
 4454: ** a root node in %_segdir.  leafWriterDestroy() frees all buffers and
 4455: ** InteriorWriters allocated as part of writing this segment.
 4456: **
 4457: ** TODO(shess) Document leafWriterStepMerge().
 4458: */
 4459: 
 4460: /* Put terms with data this big in their own block. */
 4461: #define STANDALONE_MIN 1024
 4462: 
 4463: /* Keep leaf blocks below this size. */
 4464: #define LEAF_MAX 2048
 4465: 
 4466: typedef struct LeafWriter {
 4467:   int iLevel;
 4468:   int idx;
 4469:   sqlite_int64 iStartBlockid;     /* needed to create the root info */
 4470:   sqlite_int64 iEndBlockid;       /* when we're done writing. */
 4471: 
 4472:   DataBuffer term;                /* previous encoded term */
 4473:   DataBuffer data;                /* encoding buffer */
 4474: 
 4475:   /* Bytes of the first term in the current node which distinguish
 4476:   ** that term from the last term of the previous node.
 4477:   */
 4478:   int nTermDistinct;
 4479: 
 4480:   InteriorWriter parentWriter;    /* if we overflow */
 4481:   int has_parent;
 4482: } LeafWriter;
 4483: 
 4484: static void leafWriterInit(int iLevel, int idx, LeafWriter *pWriter){
 4485:   CLEAR(pWriter);
 4486:   pWriter->iLevel = iLevel;
 4487:   pWriter->idx = idx;
 4488: 
 4489:   dataBufferInit(&pWriter->term, 32);
 4490: 
 4491:   /* Start out with a reasonably sized block, though it can grow. */
 4492:   dataBufferInit(&pWriter->data, LEAF_MAX);
 4493: }
 4494: 
 4495: #ifndef NDEBUG
 4496: /* Verify that the data is readable as a leaf node. */
 4497: static void leafNodeValidate(const char *pData, int nData){
 4498:   int n, iDummy;
 4499: 
 4500:   if( nData==0 ) return;
 4501:   assert( nData>0 );
 4502:   assert( pData!=0 );
 4503:   assert( pData+nData>pData );
 4504: 
 4505:   /* Must lead with a varint(0) */
 4506:   n = getVarint32(pData, &iDummy);
 4507:   assert( iDummy==0 );
 4508:   assert( n>0 );
 4509:   assert( n<nData );
 4510:   pData += n;
 4511:   nData -= n;
 4512: 
 4513:   /* Leading term length and data must fit in buffer. */
 4514:   n = getVarint32(pData, &iDummy);
 4515:   assert( n>0 );
 4516:   assert( iDummy>0 );
 4517:   assert( n+iDummy>0 );
 4518:   assert( n+iDummy<nData );
 4519:   pData += n+iDummy;
 4520:   nData -= n+iDummy;
 4521: 
 4522:   /* Leading term's doclist length and data must fit. */
 4523:   n = getVarint32(pData, &iDummy);
 4524:   assert( n>0 );
 4525:   assert( iDummy>0 );
 4526:   assert( n+iDummy>0 );
 4527:   assert( n+iDummy<=nData );
 4528:   ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
 4529:   pData += n+iDummy;
 4530:   nData -= n+iDummy;
 4531: 
 4532:   /* Verify that trailing terms and doclists also are readable. */
 4533:   while( nData!=0 ){
 4534:     n = getVarint32(pData, &iDummy);
 4535:     assert( n>0 );
 4536:     assert( iDummy>=0 );
 4537:     assert( n<nData );
 4538:     pData += n;
 4539:     nData -= n;
 4540:     n = getVarint32(pData, &iDummy);
 4541:     assert( n>0 );
 4542:     assert( iDummy>0 );
 4543:     assert( n+iDummy>0 );
 4544:     assert( n+iDummy<nData );
 4545:     pData += n+iDummy;
 4546:     nData -= n+iDummy;
 4547: 
 4548:     n = getVarint32(pData, &iDummy);
 4549:     assert( n>0 );
 4550:     assert( iDummy>0 );
 4551:     assert( n+iDummy>0 );
 4552:     assert( n+iDummy<=nData );
 4553:     ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
 4554:     pData += n+iDummy;
 4555:     nData -= n+iDummy;
 4556:   }
 4557: }
 4558: #define ASSERT_VALID_LEAF_NODE(p, n) leafNodeValidate(p, n)
 4559: #else
 4560: #define ASSERT_VALID_LEAF_NODE(p, n) assert( 1 )
 4561: #endif
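/* A rough sketch of the leaf layout validated above (the format
** comment at the top of the file is authoritative):
**
**   varint  0                                  (flag byte for a leaf)
**   varint  nTerm, char pTerm[nTerm]                    (first term)
**   varint  nDoclist, char pDoclist[nDoclist]          (its doclist)
**
** then, repeated for each subsequent term:
**
**   varint  nPrefix, varint nSuffix, char pSuffix[nSuffix]
**   varint  nDoclist, char pDoclist[nDoclist]
*/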
 4562: 
 4563: /* Flush the current leaf node to %_segments, adding the resulting
 4564: ** blockid and the starting term to the interior node which will
 4565: ** contain it.
 4566: */
 4567: static int leafWriterInternalFlush(fulltext_vtab *v, LeafWriter *pWriter,
 4568:                                    int iData, int nData){
 4569:   sqlite_int64 iBlockid = 0;
 4570:   const char *pStartingTerm;
 4571:   int nStartingTerm, rc, n;
 4572: 
 4573:   /* Must have the leading varint(0) flag, plus at least some
 4574:   ** valid-looking data.
 4575:   */
 4576:   assert( nData>2 );
 4577:   assert( iData>=0 );
 4578:   assert( iData+nData<=pWriter->data.nData );
 4579:   ASSERT_VALID_LEAF_NODE(pWriter->data.pData+iData, nData);
 4580: 
 4581:   rc = block_insert(v, pWriter->data.pData+iData, nData, &iBlockid);
 4582:   if( rc!=SQLITE_OK ) return rc;
 4583:   assert( iBlockid!=0 );
 4584: 
 4585:   /* Reconstruct the first term in the leaf for purposes of building
 4586:   ** the interior node.
 4587:   */
 4588:   n = getVarint32(pWriter->data.pData+iData+1, &nStartingTerm);
 4589:   pStartingTerm = pWriter->data.pData+iData+1+n;
 4590:   assert( pWriter->data.nData>iData+1+n+nStartingTerm );
 4591:   assert( pWriter->nTermDistinct>0 );
 4592:   assert( pWriter->nTermDistinct<=nStartingTerm );
 4593:   nStartingTerm = pWriter->nTermDistinct;
 4594: 
 4595:   if( pWriter->has_parent ){
 4596:     interiorWriterAppend(&pWriter->parentWriter,
 4597:                          pStartingTerm, nStartingTerm, iBlockid);
 4598:   }else{
 4599:     interiorWriterInit(1, pStartingTerm, nStartingTerm, iBlockid,
 4600:                        &pWriter->parentWriter);
 4601:     pWriter->has_parent = 1;
 4602:   }
 4603: 
 4604:   /* Track the span of this segment's leaf nodes. */
 4605:   if( pWriter->iEndBlockid==0 ){
 4606:     pWriter->iEndBlockid = pWriter->iStartBlockid = iBlockid;
 4607:   }else{
 4608:     pWriter->iEndBlockid++;
 4609:     assert( iBlockid==pWriter->iEndBlockid );
 4610:   }
 4611: 
 4612:   return SQLITE_OK;
 4613: }
 4614: static int leafWriterFlush(fulltext_vtab *v, LeafWriter *pWriter){
 4615:   int rc = leafWriterInternalFlush(v, pWriter, 0, pWriter->data.nData);
 4616:   if( rc!=SQLITE_OK ) return rc;
 4617: 
 4618:   /* Re-initialize the output buffer. */
 4619:   dataBufferReset(&pWriter->data);
 4620: 
 4621:   return SQLITE_OK;
 4622: }
 4623: 
 4624: /* Fetch the root info for the segment.  If the entire leaf fits
 4625: ** within ROOT_MAX, then it will be returned directly, otherwise it
 4626: ** will be flushed and the root info will be returned from the
 4627: ** interior node.  *piEndBlockid is set to the blockid of the last
 4628: ** interior or leaf node written to disk (0 if none are written at
 4629: ** all).
 4630: */
 4631: static int leafWriterRootInfo(fulltext_vtab *v, LeafWriter *pWriter,
 4632:                               char **ppRootInfo, int *pnRootInfo,
 4633:                               sqlite_int64 *piEndBlockid){
 4634:   /* we can fit the segment entirely inline */
 4635:   if( !pWriter->has_parent && pWriter->data.nData<ROOT_MAX ){
 4636:     *ppRootInfo = pWriter->data.pData;
 4637:     *pnRootInfo = pWriter->data.nData;
 4638:     *piEndBlockid = 0;
 4639:     return SQLITE_OK;
 4640:   }
 4641: 
 4642:   /* Flush remaining leaf data. */
 4643:   if( pWriter->data.nData>0 ){
 4644:     int rc = leafWriterFlush(v, pWriter);
 4645:     if( rc!=SQLITE_OK ) return rc;
 4646:   }
 4647: 
 4648:   /* We must have flushed a leaf at some point. */
 4649:   assert( pWriter->has_parent );
 4650: 
 4651:   /* Tentatively set the end leaf blockid as the end blockid.  If the
 4652:   ** interior node can be returned inline, this will be the final
 4653:   ** blockid, otherwise it will be overwritten by
 4654:   ** interiorWriterRootInfo().
 4655:   */
 4656:   *piEndBlockid = pWriter->iEndBlockid;
 4657: 
 4658:   return interiorWriterRootInfo(v, &pWriter->parentWriter,
 4659:                                 ppRootInfo, pnRootInfo, piEndBlockid);
 4660: }
 4661: 
 4662: /* Collect the rootInfo data and store it into the segment directory.
 4663: ** This has the effect of flushing the segment's leaf data to
 4664: ** %_segments, and also flushing any interior nodes to %_segments.
 4665: */
 4666: static int leafWriterFinalize(fulltext_vtab *v, LeafWriter *pWriter){
 4667:   sqlite_int64 iEndBlockid;
 4668:   char *pRootInfo;
 4669:   int rc, nRootInfo;
 4670: 
 4671:   rc = leafWriterRootInfo(v, pWriter, &pRootInfo, &nRootInfo, &iEndBlockid);
 4672:   if( rc!=SQLITE_OK ) return rc;
 4673: 
 4674:   /* Don't bother storing an entirely empty segment. */
 4675:   if( iEndBlockid==0 && nRootInfo==0 ) return SQLITE_OK;
 4676: 
 4677:   return segdir_set(v, pWriter->iLevel, pWriter->idx,
 4678:                     pWriter->iStartBlockid, pWriter->iEndBlockid,
 4679:                     iEndBlockid, pRootInfo, nRootInfo);
 4680: }
 4681: 
 4682: static void leafWriterDestroy(LeafWriter *pWriter){
 4683:   if( pWriter->has_parent ) interiorWriterDestroy(&pWriter->parentWriter);
 4684:   dataBufferDestroy(&pWriter->term);
 4685:   dataBufferDestroy(&pWriter->data);
 4686: }
 4687: 
 4688: /* Encode a term into the leafWriter, delta-encoding as appropriate.
 4689: ** Returns the length of the new term which distinguishes it from the
 4690: ** previous term, which can be used to set nTermDistinct when a node
 4691: ** boundary is crossed.
 4692: */
 4693: static int leafWriterEncodeTerm(LeafWriter *pWriter,
 4694:                                 const char *pTerm, int nTerm){
 4695:   char c[VARINT_MAX+VARINT_MAX];
 4696:   int n, nPrefix = 0;
 4697: 
 4698:   assert( nTerm>0 );
 4699:   while( nPrefix<pWriter->term.nData &&
 4700:          pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
 4701:     nPrefix++;
 4702:     /* Failing this implies that the terms weren't in order. */
 4703:     assert( nPrefix<nTerm );
 4704:   }
 4705: 
 4706:   if( pWriter->data.nData==0 ){
 4707:     /* Encode the node header and leading term as:
 4708:     **  varint(0)
 4709:     **  varint(nTerm)
 4710:     **  char pTerm[nTerm]
 4711:     */
 4712:     n = putVarint(c, '\0');
 4713:     n += putVarint(c+n, nTerm);
 4714:     dataBufferAppend2(&pWriter->data, c, n, pTerm, nTerm);
 4715:   }else{
 4716:     /* Delta-encode the term as:
 4717:     **  varint(nPrefix)
 4718:     **  varint(nSuffix)
 4719:     **  char pTermSuffix[nSuffix]
 4720:     */
 4721:     n = putVarint(c, nPrefix);
 4722:     n += putVarint(c+n, nTerm-nPrefix);
 4723:     dataBufferAppend2(&pWriter->data, c, n, pTerm+nPrefix, nTerm-nPrefix);
 4724:   }
 4725:   dataBufferReplace(&pWriter->term, pTerm, nTerm);
 4726: 
 4727:   return nPrefix+1;
 4728: }
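/* Worked example (illustrative data): if the previous term was
** "linden" and pTerm is "linkage", the shared prefix is "lin"
** (nPrefix==3), so the encoding is varint(3) varint(4) "kage" and the
** return value is 4 -- the terms first differ at byte 3, so 4 bytes of
** "linkage" suffice to distinguish it from "linden".
*/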
 4729: 
 4730: /* Used to avoid a memmove when a large amount of doclist data is in
 4731: ** the buffer.  This constructs a node and term header before
 4732: ** iDoclistData and flushes the resulting complete node using
 4733: ** leafWriterInternalFlush().
 4734: */
 4735: static int leafWriterInlineFlush(fulltext_vtab *v, LeafWriter *pWriter,
 4736:                                  const char *pTerm, int nTerm,
 4737:                                  int iDoclistData){
 4738:   char c[VARINT_MAX+VARINT_MAX];
 4739:   int iData, n = putVarint(c, 0);
 4740:   n += putVarint(c+n, nTerm);
 4741: 
 4742:   /* There should always be room for the header.  Even if pTerm shared
 4743:   ** a substantial prefix with the previous term, the entire prefix
 4744:   ** could be constructed from earlier data in the doclist, so there
 4745:   ** should be room.
 4746:   */
 4747:   assert( iDoclistData>=n+nTerm );
 4748: 
 4749:   iData = iDoclistData-(n+nTerm);
 4750:   memcpy(pWriter->data.pData+iData, c, n);
 4751:   memcpy(pWriter->data.pData+iData+n, pTerm, nTerm);
 4752: 
 4753:   return leafWriterInternalFlush(v, pWriter, iData, pWriter->data.nData-iData);
 4754: }
 4755: 
 4756: /* Push pTerm[nTerm] along with the doclist data to the leaf layer of
 4757: ** %_segments.
 4758: */
 4759: static int leafWriterStepMerge(fulltext_vtab *v, LeafWriter *pWriter,
 4760:                                const char *pTerm, int nTerm,
 4761:                                DLReader *pReaders, int nReaders){
 4762:   char c[VARINT_MAX+VARINT_MAX];
 4763:   int iTermData = pWriter->data.nData, iDoclistData;
 4764:   int i, nData, n, nActualData, nActual, rc, nTermDistinct;
 4765: 
 4766:   ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);
 4767:   nTermDistinct = leafWriterEncodeTerm(pWriter, pTerm, nTerm);
 4768: 
 4769:   /* Remember nTermDistinct if opening a new node. */
 4770:   if( iTermData==0 ) pWriter->nTermDistinct = nTermDistinct;
 4771: 
 4772:   iDoclistData = pWriter->data.nData;
 4773: 
 4774:   /* Estimate the length of the merged doclist so we can leave space
 4775:   ** to encode it.
 4776:   */
 4777:   for(i=0, nData=0; i<nReaders; i++){
 4778:     nData += dlrAllDataBytes(&pReaders[i]);
 4779:   }
 4780:   n = putVarint(c, nData);
 4781:   dataBufferAppend(&pWriter->data, c, n);
 4782: 
 4783:   docListMerge(&pWriter->data, pReaders, nReaders);
 4784:   ASSERT_VALID_DOCLIST(DL_DEFAULT,
 4785:                        pWriter->data.pData+iDoclistData+n,
 4786:                        pWriter->data.nData-iDoclistData-n, NULL);
 4787: 
 4788:   /* The actual amount of doclist data at this point could be smaller
 4789:   ** than the length we encoded.  Additionally, the space required to
 4790:   ** encode this length could be smaller.  For small doclists, this is
 4791:   ** not a big deal, we can just use memmove() to adjust things.
 4792:   */
 4793:   nActualData = pWriter->data.nData-(iDoclistData+n);
 4794:   nActual = putVarint(c, nActualData);
 4795:   assert( nActualData<=nData );
 4796:   assert( nActual<=n );
 4797: 
 4798:   /* If the new doclist is big enough to force a standalone leaf
 4799:   ** node, we can immediately flush it inline without doing the
 4800:   ** memmove().
 4801:   */
 4802:   /* TODO(shess) This test matches leafWriterStep(), which does this
 4803:   ** test before it knows the cost to varint-encode the term and
 4804:   ** doclist lengths.  At some point, change to
 4805:   ** pWriter->data.nData-iTermData>STANDALONE_MIN.
 4806:   */
 4807:   if( nTerm+nActualData>STANDALONE_MIN ){
 4808:     /* Push leaf node from before this term. */
 4809:     if( iTermData>0 ){
 4810:       rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
 4811:       if( rc!=SQLITE_OK ) return rc;
 4812: 
 4813:       pWriter->nTermDistinct = nTermDistinct;
 4814:     }
 4815: 
 4816:     /* Fix the encoded doclist length. */
 4817:     iDoclistData += n - nActual;
 4818:     memcpy(pWriter->data.pData+iDoclistData, c, nActual);
 4819: 
 4820:     /* Push the standalone leaf node. */
 4821:     rc = leafWriterInlineFlush(v, pWriter, pTerm, nTerm, iDoclistData);
 4822:     if( rc!=SQLITE_OK ) return rc;
 4823: 
 4824:     /* Leave the node empty. */
 4825:     dataBufferReset(&pWriter->data);
 4826: 
 4827:     return rc;
 4828:   }
 4829: 
 4830:   /* At this point, we know that the doclist was small, so do the
 4831:   ** memmove if indicated.
 4832:   */
 4833:   if( nActual<n ){
 4834:     memmove(pWriter->data.pData+iDoclistData+nActual,
 4835:             pWriter->data.pData+iDoclistData+n,
 4836:             pWriter->data.nData-(iDoclistData+n));
 4837:     pWriter->data.nData -= n-nActual;
 4838:   }
 4839: 
 4840:   /* Replace written length with actual length. */
 4841:   memcpy(pWriter->data.pData+iDoclistData, c, nActual);
 4842: 
 4843:   /* If the node is too large, break things up. */
 4844:   /* TODO(shess) This test matches leafWriterStep(), which does this
 4845:   ** test before it knows the cost to varint-encode the term and
 4846:   ** doclist lengths.  At some point, change to
 4847:   ** pWriter->data.nData>LEAF_MAX.
 4848:   */
 4849:   if( iTermData+nTerm+nActualData>LEAF_MAX ){
 4850:     /* Flush out the leading data as a node */
 4851:     rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
 4852:     if( rc!=SQLITE_OK ) return rc;
 4853: 
 4854:     pWriter->nTermDistinct = nTermDistinct;
 4855: 
 4856:     /* Rebuild header using the current term */
 4857:     n = putVarint(pWriter->data.pData, 0);
 4858:     n += putVarint(pWriter->data.pData+n, nTerm);
 4859:     memcpy(pWriter->data.pData+n, pTerm, nTerm);
 4860:     n += nTerm;
 4861: 
 4862:     /* There should always be room, because the previous encoding
 4863:     ** included all data necessary to construct the term.
 4864:     */
 4865:     assert( n<iDoclistData );
 4866:     /* So long as STANDALONE_MIN is half or less of LEAF_MAX, the
 4867:     ** following memcpy() is safe (as opposed to needing a memmove).
 4868:     */
 4869:     assert( 2*STANDALONE_MIN<=LEAF_MAX );
 4870:     assert( n+pWriter->data.nData-iDoclistData<iDoclistData );
 4871:     memcpy(pWriter->data.pData+n,
 4872:            pWriter->data.pData+iDoclistData,
 4873:            pWriter->data.nData-iDoclistData);
 4874:     pWriter->data.nData -= iDoclistData-n;
 4875:   }
 4876:   ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);
 4877: 
 4878:   return SQLITE_OK;
 4879: }
 4880: 
 4881: /* Push pTerm[nTerm] along with the doclist data to the leaf layer of
 4882: ** %_segments.
 4883: */
 4884: /* TODO(shess) Revise writeZeroSegment() so that doclists are
 4885: ** constructed directly in pWriter->data.
 4886: */
 4887: static int leafWriterStep(fulltext_vtab *v, LeafWriter *pWriter,
 4888:                           const char *pTerm, int nTerm,
 4889:                           const char *pData, int nData){
 4890:   int rc;
 4891:   DLReader reader;
 4892: 
 4893:   dlrInit(&reader, DL_DEFAULT, pData, nData);
 4894:   rc = leafWriterStepMerge(v, pWriter, pTerm, nTerm, &reader, 1);
 4895:   dlrDestroy(&reader);
 4896: 
 4897:   return rc;
 4898: }
 4899: 
 4900: 
 4901: /****************************************************************/
 4902: /* LeafReader is used to iterate over an individual leaf node. */
 4903: typedef struct LeafReader {
 4904:   DataBuffer term;          /* copy of current term. */
 4905: 
 4906:   const char *pData;        /* data for current term. */
 4907:   int nData;
 4908: } LeafReader;
 4909: 
 4910: static void leafReaderDestroy(LeafReader *pReader){
 4911:   dataBufferDestroy(&pReader->term);
 4912:   SCRAMBLE(pReader);
 4913: }
 4914: 
 4915: static int leafReaderAtEnd(LeafReader *pReader){
 4916:   return pReader->nData<=0;
 4917: }
 4918: 
 4919: /* Access the current term. */
 4920: static int leafReaderTermBytes(LeafReader *pReader){
 4921:   return pReader->term.nData;
 4922: }
 4923: static const char *leafReaderTerm(LeafReader *pReader){
 4924:   assert( pReader->term.nData>0 );
 4925:   return pReader->term.pData;
 4926: }
 4927: 
 4928: /* Access the doclist data for the current term. */
 4929: static int leafReaderDataBytes(LeafReader *pReader){
 4930:   int nData;
 4931:   assert( pReader->term.nData>0 );
 4932:   getVarint32(pReader->pData, &nData);
 4933:   return nData;
 4934: }
 4935: static const char *leafReaderData(LeafReader *pReader){
 4936:   int n, nData;
 4937:   assert( pReader->term.nData>0 );
 4938:   n = getVarint32(pReader->pData, &nData);
 4939:   return pReader->pData+n;
 4940: }
 4941: 
 4942: static void leafReaderInit(const char *pData, int nData,
 4943:                            LeafReader *pReader){
 4944:   int nTerm, n;
 4945: 
 4946:   assert( nData>0 );
 4947:   assert( pData[0]=='\0' );
 4948: 
 4949:   CLEAR(pReader);
 4950: 
 4951:   /* Read the first term, skipping the header byte. */
 4952:   n = getVarint32(pData+1, &nTerm);
 4953:   dataBufferInit(&pReader->term, nTerm);
 4954:   dataBufferReplace(&pReader->term, pData+1+n, nTerm);
 4955: 
 4956:   /* Position after the first term. */
 4957:   assert( 1+n+nTerm<nData );
 4958:   pReader->pData = pData+1+n+nTerm;
 4959:   pReader->nData = nData-1-n-nTerm;
 4960: }
 4961: 
 4962: /* Step the reader forward to the next term. */
 4963: static void leafReaderStep(LeafReader *pReader){
 4964:   int n, nData, nPrefix, nSuffix;
 4965:   assert( !leafReaderAtEnd(pReader) );
 4966: 
 4967:   /* Skip previous entry's data block. */
 4968:   n = getVarint32(pReader->pData, &nData);
 4969:   assert( n+nData<=pReader->nData );
 4970:   pReader->pData += n+nData;
 4971:   pReader->nData -= n+nData;
 4972: 
 4973:   if( !leafReaderAtEnd(pReader) ){
 4974:     /* Construct the new term using a prefix from the old term plus a
 4975:     ** suffix from the leaf data.
 4976:     */
 4977:     n = getVarint32(pReader->pData, &nPrefix);
 4978:     n += getVarint32(pReader->pData+n, &nSuffix);
 4979:     assert( n+nSuffix<pReader->nData );
 4980:     pReader->term.nData = nPrefix;
 4981:     dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix);
 4982: 
 4983:     pReader->pData += n+nSuffix;
 4984:     pReader->nData -= n+nSuffix;
 4985:   }
 4986: }
 4987: 
 4988: /* strcmp-style comparison of pReader's current term against pTerm.
 4989: ** If isPrefix, equality means equal through nTerm bytes.
 4990: */
 4991: static int leafReaderTermCmp(LeafReader *pReader,
 4992:                              const char *pTerm, int nTerm, int isPrefix){
 4993:   int c, n = pReader->term.nData<nTerm ? pReader->term.nData : nTerm;
 4994:   if( n==0 ){
 4995:     if( pReader->term.nData>0 ) return -1;
 4996:     if( nTerm>0 ) return 1;
 4997:     return 0;
 4998:   }
 4999: 
 5000:   c = memcmp(pReader->term.pData, pTerm, n);
 5001:   if( c!=0 ) return c;
 5002:   if( isPrefix && n==nTerm ) return 0;
 5003:   return pReader->term.nData - nTerm;
 5004: }
 5005: 
 5006: 
 5007: /****************************************************************/
 5008: /* LeavesReader wraps LeafReader to allow iterating over the entire
 5009: ** leaf layer of the tree.
 5010: */
 5011: typedef struct LeavesReader {
 5012:   int idx;                  /* Index within the segment. */
 5013: 
 5014:   sqlite3_stmt *pStmt;      /* Statement we're streaming leaves from. */
 5015:   int eof;                  /* we've seen SQLITE_DONE from pStmt. */
 5016: 
 5017:   LeafReader leafReader;    /* reader for the current leaf. */
 5018:   DataBuffer rootData;      /* root data for inline. */
 5019: } LeavesReader;
 5020: 
 5021: /* Access the current term. */
 5022: static int leavesReaderTermBytes(LeavesReader *pReader){
 5023:   assert( !pReader->eof );
 5024:   return leafReaderTermBytes(&pReader->leafReader);
 5025: }
 5026: static const char *leavesReaderTerm(LeavesReader *pReader){
 5027:   assert( !pReader->eof );
 5028:   return leafReaderTerm(&pReader->leafReader);
 5029: }
 5030: 
 5031: /* Access the doclist data for the current term. */
 5032: static int leavesReaderDataBytes(LeavesReader *pReader){
 5033:   assert( !pReader->eof );
 5034:   return leafReaderDataBytes(&pReader->leafReader);
 5035: }
 5036: static const char *leavesReaderData(LeavesReader *pReader){
 5037:   assert( !pReader->eof );
 5038:   return leafReaderData(&pReader->leafReader);
 5039: }
 5040: 
 5041: static int leavesReaderAtEnd(LeavesReader *pReader){
 5042:   return pReader->eof;
 5043: }
 5044: 
 5045: /* loadSegmentLeaves() may not read all the way to SQLITE_DONE, thus
 5046: ** leaving the statement handle open, which locks the table.
 5047: */
 5048: /* TODO(shess) This "solution" is not satisfactory.  Really, there
 5049: ** should be a check-in function for all statement handles which
 5050: ** arranges to call sqlite3_reset().  This most likely will require
 5051: ** modification to control flow all over the place, though, so for now
 5052: ** just punt.
 5053: **
 5054: ** Note that the current system assumes that segment merges will run to
 5055: ** completion, which is why this particular problem hasn't arisen in
 5056: ** this case.  Probably a brittle assumption.
 5057: */
 5058: static int leavesReaderReset(LeavesReader *pReader){
 5059:   return sqlite3_reset(pReader->pStmt);
 5060: }
 5061: 
 5062: static void leavesReaderDestroy(LeavesReader *pReader){
 5063:   /* If idx is -1, that means we're using a non-cached statement
 5064:   ** handle in the optimize() case, so we need to release it.
 5065:   */
 5066:   if( pReader->pStmt!=NULL && pReader->idx==-1 ){
 5067:     sqlite3_finalize(pReader->pStmt);
 5068:   }
 5069:   leafReaderDestroy(&pReader->leafReader);
 5070:   dataBufferDestroy(&pReader->rootData);
 5071:   SCRAMBLE(pReader);
 5072: }
 5073: 
 5074: /* Initialize pReader with the given root data (if iStartBlockid==0
 5075: ** the leaf data was entirely contained in the root), or from the
 5076: ** stream of blocks between iStartBlockid and iEndBlockid, inclusive.
 5077: */
 5078: static int leavesReaderInit(fulltext_vtab *v,
 5079:                             int idx,
 5080:                             sqlite_int64 iStartBlockid,
 5081:                             sqlite_int64 iEndBlockid,
 5082:                             const char *pRootData, int nRootData,
 5083:                             LeavesReader *pReader){
 5084:   CLEAR(pReader);
 5085:   pReader->idx = idx;
 5086: 
 5087:   dataBufferInit(&pReader->rootData, 0);
 5088:   if( iStartBlockid==0 ){
 5089:     /* Entire leaf level fit in root data. */
 5090:     dataBufferReplace(&pReader->rootData, pRootData, nRootData);
 5091:     leafReaderInit(pReader->rootData.pData, pReader->rootData.nData,
 5092:                    &pReader->leafReader);
 5093:   }else{
 5094:     sqlite3_stmt *s;
 5095:     int rc = sql_get_leaf_statement(v, idx, &s);
 5096:     if( rc!=SQLITE_OK ) return rc;
 5097: 
 5098:     rc = sqlite3_bind_int64(s, 1, iStartBlockid);
 5099:     if( rc!=SQLITE_OK ) return rc;
 5100: 
 5101:     rc = sqlite3_bind_int64(s, 2, iEndBlockid);
 5102:     if( rc!=SQLITE_OK ) return rc;
 5103: 
 5104:     rc = sqlite3_step(s);
 5105:     if( rc==SQLITE_DONE ){
 5106:       pReader->eof = 1;
 5107:       return SQLITE_OK;
 5108:     }
 5109:     if( rc!=SQLITE_ROW ) return rc;
 5110: 
 5111:     pReader->pStmt = s;
 5112:     leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0),
 5113:                    sqlite3_column_bytes(pReader->pStmt, 0),
 5114:                    &pReader->leafReader);
 5115:   }
 5116:   return SQLITE_OK;
 5117: }
 5118: 
 5119: /* Step the current leaf forward to the next term.  If we reach the
 5120: ** end of the current leaf, step forward to the next leaf block.
 5121: */
 5122: static int leavesReaderStep(fulltext_vtab *v, LeavesReader *pReader){
 5123:   assert( !leavesReaderAtEnd(pReader) );
 5124:   leafReaderStep(&pReader->leafReader);
 5125: 
 5126:   if( leafReaderAtEnd(&pReader->leafReader) ){
 5127:     int rc;
 5128:     if( pReader->rootData.pData ){
 5129:       pReader->eof = 1;
 5130:       return SQLITE_OK;
 5131:     }
 5132:     rc = sqlite3_step(pReader->pStmt);
 5133:     if( rc!=SQLITE_ROW ){
 5134:       pReader->eof = 1;
 5135:       return rc==SQLITE_DONE ? SQLITE_OK : rc;
 5136:     }
 5137:     leafReaderDestroy(&pReader->leafReader);
 5138:     leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0),
 5139:                    sqlite3_column_bytes(pReader->pStmt, 0),
 5140:                    &pReader->leafReader);
 5141:   }
 5142:   return SQLITE_OK;
 5143: }
 5144: 
 5145: /* Order LeavesReaders by their term, ignoring idx.  Readers at eof
 5146: ** always sort to the end.
 5147: */
 5148: static int leavesReaderTermCmp(LeavesReader *lr1, LeavesReader *lr2){
 5149:   if( leavesReaderAtEnd(lr1) ){
 5150:     if( leavesReaderAtEnd(lr2) ) return 0;
 5151:     return 1;
 5152:   }
 5153:   if( leavesReaderAtEnd(lr2) ) return -1;
 5154: 
 5155:   return leafReaderTermCmp(&lr1->leafReader,
 5156:                            leavesReaderTerm(lr2), leavesReaderTermBytes(lr2),
 5157:                            0);
 5158: }
 5159: 
 5160: /* Similar to leavesReaderTermCmp(), with additional ordering by idx
 5161: ** so that older segments sort before newer segments.
 5162: */
 5163: static int leavesReaderCmp(LeavesReader *lr1, LeavesReader *lr2){
 5164:   int c = leavesReaderTermCmp(lr1, lr2);
 5165:   if( c!=0 ) return c;
 5166:   return lr1->idx-lr2->idx;
 5167: }
 5168: 
 5169: /* Assume that pLr[1]..pLr[nLr] are sorted.  Bubble pLr[0] into its
 5170: ** sorted position.
 5171: */
 5172: static void leavesReaderReorder(LeavesReader *pLr, int nLr){
 5173:   while( nLr>1 && leavesReaderCmp(pLr, pLr+1)>0 ){
 5174:     LeavesReader tmp = pLr[0];
 5175:     pLr[0] = pLr[1];
 5176:     pLr[1] = tmp;
 5177:     nLr--;
 5178:     pLr++;
 5179:   }
 5180: }
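/* Example (illustrative data): if the readers' current terms are
** ["dog", "ant", "bee", "cat"], with pLr[1..3] already sorted, the
** loop swaps "dog" rightward one slot at a time, ending with
** ["ant", "bee", "cat", "dog"] -- one insertion-sort pass.
*/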
 5181: 
 5182: /* Initializes pReaders with the segments from level iLevel, returning
 5183: ** the number of segments in *piReaders.  Leaves pReaders in sorted
 5184: ** order.
 5185: */
 5186: static int leavesReadersInit(fulltext_vtab *v, int iLevel,
 5187:                              LeavesReader *pReaders, int *piReaders){
 5188:   sqlite3_stmt *s;
 5189:   int i, rc = sql_get_statement(v, SEGDIR_SELECT_LEVEL_STMT, &s);
 5190:   if( rc!=SQLITE_OK ) return rc;
 5191: 
 5192:   rc = sqlite3_bind_int(s, 1, iLevel);
 5193:   if( rc!=SQLITE_OK ) return rc;
 5194: 
 5195:   i = 0;
 5196:   while( (rc = sqlite3_step(s))==SQLITE_ROW ){
 5197:     sqlite_int64 iStart = sqlite3_column_int64(s, 0);
 5198:     sqlite_int64 iEnd = sqlite3_column_int64(s, 1);
 5199:     const char *pRootData = sqlite3_column_blob(s, 2);
 5200:     int nRootData = sqlite3_column_bytes(s, 2);
 5201: 
 5202:     assert( i<MERGE_COUNT );
 5203:     rc = leavesReaderInit(v, i, iStart, iEnd, pRootData, nRootData,
 5204:                           &pReaders[i]);
 5205:     if( rc!=SQLITE_OK ) break;
 5206: 
 5207:     i++;
 5208:   }
 5209:   if( rc!=SQLITE_DONE ){
 5210:     while( i-->0 ){
 5211:       leavesReaderDestroy(&pReaders[i]);
 5212:     }
 5213:     return rc;
 5214:   }
 5215: 
 5216:   *piReaders = i;
 5217: 
 5218:   /* Leave our results sorted by term, then age. */
 5219:   while( i-- ){
 5220:     leavesReaderReorder(pReaders+i, *piReaders-i);
 5221:   }
 5222:   return SQLITE_OK;
 5223: }
 5224: 
 5225: /* Merge doclists from pReaders[nReaders] into a single doclist, which
 5226: ** is written to pWriter.  Assumes pReaders is ordered oldest to
 5227: ** newest.
 5228: */
 5229: /* TODO(shess) Consider putting this inline in segmentMerge(). */
 5230: static int leavesReadersMerge(fulltext_vtab *v,
 5231:                               LeavesReader *pReaders, int nReaders,
 5232:                               LeafWriter *pWriter){
 5233:   DLReader dlReaders[MERGE_COUNT];
 5234:   const char *pTerm = leavesReaderTerm(pReaders);
 5235:   int i, nTerm = leavesReaderTermBytes(pReaders);
 5236: 
 5237:   assert( nReaders<=MERGE_COUNT );
 5238: 
 5239:   for(i=0; i<nReaders; i++){
 5240:     dlrInit(&dlReaders[i], DL_DEFAULT,
 5241:             leavesReaderData(pReaders+i),
 5242:             leavesReaderDataBytes(pReaders+i));
 5243:   }
 5244: 
 5245:   return leafWriterStepMerge(v, pWriter, pTerm, nTerm, dlReaders, nReaders);
 5246: }
 5247: 
 5248: /* Forward ref due to mutual recursion with segdirNextIndex(). */
 5249: static int segmentMerge(fulltext_vtab *v, int iLevel);
 5250: 
 5251: /* Put the next available index at iLevel into *pidx.  If iLevel
 5252: ** already has MERGE_COUNT segments, they are merged to a higher
 5253: ** level to make room.
 5254: */
 5255: static int segdirNextIndex(fulltext_vtab *v, int iLevel, int *pidx){
 5256:   int rc = segdir_max_index(v, iLevel, pidx);
 5257:   if( rc==SQLITE_DONE ){              /* No segments at iLevel. */
 5258:     *pidx = 0;
 5259:   }else if( rc==SQLITE_ROW ){
 5260:     if( *pidx==(MERGE_COUNT-1) ){
 5261:       rc = segmentMerge(v, iLevel);
 5262:       if( rc!=SQLITE_OK ) return rc;
 5263:       *pidx = 0;
 5264:     }else{
 5265:       (*pidx)++;
 5266:     }
 5267:   }else{
 5268:     return rc;
 5269:   }
 5270:   return SQLITE_OK;
 5271: }
 5272: 
 5273: /* Merge MERGE_COUNT segments at iLevel into a new segment at
 5274: ** iLevel+1.  If iLevel+1 is already full of segments, those will be
 5275: ** merged to make room.
 5276: */
 5277: static int segmentMerge(fulltext_vtab *v, int iLevel){
 5278:   LeafWriter writer;
 5279:   LeavesReader lrs[MERGE_COUNT];
 5280:   int i, rc, idx = 0;
 5281: 
 5282:   /* Determine the next available segment index at the next level,
 5283:   ** merging as necessary.
 5284:   */
 5285:   rc = segdirNextIndex(v, iLevel+1, &idx);
 5286:   if( rc!=SQLITE_OK ) return rc;
 5287: 
 5288:   /* TODO(shess) This assumes that we'll always see exactly
 5289:   ** MERGE_COUNT segments to merge at a given level.  That will be
 5290:   ** broken if we allow the developer to request preemptive or
 5291:   ** deferred merging.
 5292:   */
 5293:   memset(&lrs, '\0', sizeof(lrs));
 5294:   rc = leavesReadersInit(v, iLevel, lrs, &i);
 5295:   if( rc!=SQLITE_OK ) return rc;
 5296:   assert( i==MERGE_COUNT );
 5297: 
 5298:   leafWriterInit(iLevel+1, idx, &writer);
 5299: 
 5300:   /* Since leavesReaderReorder() pushes readers at eof to the end,
 5301:   ** when the first reader is empty, all will be empty.
 5302:   */
 5303:   while( !leavesReaderAtEnd(lrs) ){
 5304:     /* Figure out how many readers share their next term. */
 5305:     for(i=1; i<MERGE_COUNT && !leavesReaderAtEnd(lrs+i); i++){
 5306:       if( 0!=leavesReaderTermCmp(lrs, lrs+i) ) break;
 5307:     }
 5308: 
 5309:     rc = leavesReadersMerge(v, lrs, i, &writer);
 5310:     if( rc!=SQLITE_OK ) goto err;
 5311: 
 5312:     /* Step forward those that were merged. */
 5313:     while( i-->0 ){
 5314:       rc = leavesReaderStep(v, lrs+i);
 5315:       if( rc!=SQLITE_OK ) goto err;
 5316: 
 5317:       /* Reorder by term, then by age. */
 5318:       leavesReaderReorder(lrs+i, MERGE_COUNT-i);
 5319:     }
 5320:   }
 5321: 
 5322:   for(i=0; i<MERGE_COUNT; i++){
 5323:     leavesReaderDestroy(&lrs[i]);
 5324:   }
 5325: 
 5326:   rc = leafWriterFinalize(v, &writer);
 5327:   leafWriterDestroy(&writer);
 5328:   if( rc!=SQLITE_OK ) return rc;
 5329: 
 5330:   /* Delete the merged segment data. */
 5331:   return segdir_delete(v, iLevel);
 5332: 
 5333:  err:
 5334:   for(i=0; i<MERGE_COUNT; i++){
 5335:     leavesReaderDestroy(&lrs[i]);
 5336:   }
 5337:   leafWriterDestroy(&writer);
 5338:   return rc;
 5339: }
 5340: 
 5341: /* Accumulate the union of *acc and *pData into *acc. */
 5342: static void docListAccumulateUnion(DataBuffer *acc,
 5343:                                    const char *pData, int nData) {
 5344:   DataBuffer tmp = *acc;
 5345:   dataBufferInit(acc, tmp.nData+nData);
 5346:   docListUnion(tmp.pData, tmp.nData, pData, nData, acc);
 5347:   dataBufferDestroy(&tmp);
 5348: }
 5349: 
 5350: /* TODO(shess) It might be interesting to explore different merge
 5351: ** strategies, here.  For instance, since this is a sorted merge, we
 5352: ** could easily merge many doclists in parallel.  With some
 5353: ** comprehension of the storage format, we could merge all of the
 5354: ** doclists within a leaf node directly from the leaf node's storage.
 5355: ** It may be worthwhile to merge smaller doclists before larger
 5356: ** doclists, since they can be traversed more quickly - but the
 5357: ** results may have less overlap, making them more expensive in a
 5358: ** different way.
 5359: */
 5360: 
 5361: /* Scan pReader for pTerm/nTerm, and merge the term's doclist over
 5362: ** *out (any doclists with duplicate docids overwrite those in *out).
 5363: ** Internal function for loadSegmentLeaf().
 5364: */
 5365: static int loadSegmentLeavesInt(fulltext_vtab *v, LeavesReader *pReader,
 5366:                                 const char *pTerm, int nTerm, int isPrefix,
 5367:                                 DataBuffer *out){
 5368:   /* doclist data is accumulated into pBuffers much as one increments
 5369:   ** a binary counter.  If index 0 is empty, the data is
 5370:   ** stored there.  If there is data there, it is merged and the
 5371:   ** results carried into position 1, with further merge-and-carry
 5372:   ** until an empty position is found.
 5373:   */
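  /* Illustrative trace of the merge-and-carry scheme (buffer i holds
  ** the union of up to 2^i doclists, like bit i of a binary counter):
  **
  **   doclist D1 arrives:  pBuffers[0] = D1
  **   doclist D2 arrives:  pBuffers[0] full, so merge and carry:
  **                        pBuffers[1] = D1+D2, pBuffers[0] emptied
  **   doclist D3 arrives:  pBuffers[0] = D3
  **   doclist D4 arrives:  pBuffers[0..1] full, pBuffers[2] empty:
  **                        pBuffers[2] = D1+D2+D3+D4, [0..1] emptied
  */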
 5374:   DataBuffer *pBuffers = NULL;
 5375:   int nBuffers = 0, nMaxBuffers = 0, rc;
 5376: 
 5377:   assert( nTerm>0 );
 5378: 
 5379:   for(rc=SQLITE_OK; rc==SQLITE_OK && !leavesReaderAtEnd(pReader);
 5380:       rc=leavesReaderStep(v, pReader)){
 5381:     /* TODO(shess) Really want leavesReaderTermCmp(), but that name is
 5382:     ** already taken to compare the terms of two LeavesReaders.  Think
 5383:     ** on a better name.  [Meanwhile, break encapsulation rather than
 5384:     ** use a confusing name.]
 5385:     */
 5386:     int c = leafReaderTermCmp(&pReader->leafReader, pTerm, nTerm, isPrefix);
 5387:     if( c>0 ) break;      /* Past any possible matches. */
 5388:     if( c==0 ){
 5389:       const char *pData = leavesReaderData(pReader);
 5390:       int iBuffer, nData = leavesReaderDataBytes(pReader);
 5391: 
 5392:       /* Find the first empty buffer. */
 5393:       for(iBuffer=0; iBuffer<nBuffers; ++iBuffer){
 5394:         if( 0==pBuffers[iBuffer].nData ) break;
 5395:       }
 5396: 
 5397:       /* Out of buffers, add an empty one. */
 5398:       if( iBuffer==nBuffers ){
 5399:         if( nBuffers==nMaxBuffers ){
 5400:           DataBuffer *p;
 5401:           nMaxBuffers += 20;
 5402: 
 5403:           /* Manual realloc so we can handle NULL appropriately. */
 5404:           p = sqlite3_malloc(nMaxBuffers*sizeof(*pBuffers));
 5405:           if( p==NULL ){
 5406:             rc = SQLITE_NOMEM;
 5407:             break;
 5408:           }
 5409: 
 5410:           if( nBuffers>0 ){
 5411:             assert(pBuffers!=NULL);
 5412:             memcpy(p, pBuffers, nBuffers*sizeof(*pBuffers));
 5413:             sqlite3_free(pBuffers);
 5414:           }
 5415:           pBuffers = p;
 5416:         }
 5417:         dataBufferInit(&(pBuffers[nBuffers]), 0);
 5418:         nBuffers++;
 5419:       }
 5420: 
 5421:       /* At this point, must have an empty at iBuffer. */
 5422:       assert(iBuffer<nBuffers && pBuffers[iBuffer].nData==0);
 5423: 
 5424:       /* If empty was first buffer, no need for merge logic. */
 5425:       if( iBuffer==0 ){
 5426:         dataBufferReplace(&(pBuffers[0]), pData, nData);
 5427:       }else{
 5428:         /* pAcc is the empty buffer the merged data will end up in. */
 5429:         DataBuffer *pAcc = &(pBuffers[iBuffer]);
 5430:         DataBuffer *p = &(pBuffers[0]);
 5431: 
 5432:         /* Handle position 0 specially to avoid need to prime pAcc
 5433:         ** with pData/nData.
 5434:         */
 5435:         dataBufferSwap(p, pAcc);
 5436:         docListAccumulateUnion(pAcc, pData, nData);
 5437: 
 5438:         /* Accumulate remaining doclists into pAcc. */
 5439:         for(++p; p<pAcc; ++p){
 5440:           docListAccumulateUnion(pAcc, p->pData, p->nData);
 5441: 
 5442:           /* An unconditional dataBufferReset() here could leave a large
 5443:           ** doclist's buffer allocated, blowing up our memory requirements.
 5444:           */
 5445:           if( p->nCapacity<1024 ){
 5446:             dataBufferReset(p);
 5447:           }else{
 5448:             dataBufferDestroy(p);
 5449:             dataBufferInit(p, 0);
 5450:           }
 5451:         }
 5452:       }
 5453:     }
 5454:   }
 5455: 
 5456:   /* Union all the doclists together into *out. */
 5457:   /* TODO(shess) What if *out is big?  Sigh. */
 5458:   if( rc==SQLITE_OK && nBuffers>0 ){
 5459:     int iBuffer;
 5460:     for(iBuffer=0; iBuffer<nBuffers; ++iBuffer){
 5461:       if( pBuffers[iBuffer].nData>0 ){
 5462:         if( out->nData==0 ){
 5463:           dataBufferSwap(out, &(pBuffers[iBuffer]));
 5464:         }else{
 5465:           docListAccumulateUnion(out, pBuffers[iBuffer].pData,
 5466:                                  pBuffers[iBuffer].nData);
 5467:         }
 5468:       }
 5469:     }
 5470:   }
 5471: 
 5472:   while( nBuffers-- ){
 5473:     dataBufferDestroy(&(pBuffers[nBuffers]));
 5474:   }
 5475:   if( pBuffers!=NULL ) sqlite3_free(pBuffers);
 5476: 
 5477:   return rc;
 5478: }
 5479: 
 5480: /* Call loadSegmentLeavesInt() with pData/nData as input. */
 5481: static int loadSegmentLeaf(fulltext_vtab *v, const char *pData, int nData,
 5482:                            const char *pTerm, int nTerm, int isPrefix,
 5483:                            DataBuffer *out){
 5484:   LeavesReader reader;
 5485:   int rc;
 5486: 
 5487:   assert( nData>1 );
 5488:   assert( *pData=='\0' );
 5489:   rc = leavesReaderInit(v, 0, 0, 0, pData, nData, &reader);
 5490:   if( rc!=SQLITE_OK ) return rc;
 5491: 
 5492:   rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out);
 5493:   leavesReaderReset(&reader);
 5494:   leavesReaderDestroy(&reader);
 5495:   return rc;
 5496: }
 5497: 
 5498: /* Call loadSegmentLeavesInt() with the leaf nodes from iStartLeaf to
 5499: ** iEndLeaf (inclusive) as input, and merge the resulting doclist into
 5500: ** out.
 5501: */
 5502: static int loadSegmentLeaves(fulltext_vtab *v,
 5503:                              sqlite_int64 iStartLeaf, sqlite_int64 iEndLeaf,
 5504:                              const char *pTerm, int nTerm, int isPrefix,
 5505:                              DataBuffer *out){
 5506:   int rc;
 5507:   LeavesReader reader;
 5508: 
 5509:   assert( iStartLeaf<=iEndLeaf );
 5510:   rc = leavesReaderInit(v, 0, iStartLeaf, iEndLeaf, NULL, 0, &reader);
 5511:   if( rc!=SQLITE_OK ) return rc;
 5512: 
 5513:   rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out);
 5514:   leavesReaderReset(&reader);
 5515:   leavesReaderDestroy(&reader);
 5516:   return rc;
 5517: }
 5518: 
 5519: /* Taking pData/nData as an interior node, find the sequence of child
 5520: ** nodes which could include pTerm/nTerm/isPrefix.  Note that the
 5521: ** interior node terms logically come between the blocks, so there is
 5522: ** one more blockid than there are terms (that block contains terms >=
 5523: ** the last interior-node term).
 5524: */
 5525: /* TODO(shess) The calling code may already know that the end child is
 5526: ** not worth calculating, because the end may be in a later sibling
 5527: ** node.  Consider whether breaking symmetry is worthwhile.  I suspect
 5528: ** it is not worthwhile.
 5529: */
 5530: static void getChildrenContaining(const char *pData, int nData,
 5531:                                   const char *pTerm, int nTerm, int isPrefix,
 5532:                                   sqlite_int64 *piStartChild,
 5533:                                   sqlite_int64 *piEndChild){
 5534:   InteriorReader reader;
 5535: 
 5536:   assert( nData>1 );
 5537:   assert( *pData!='\0' );
 5538:   interiorReaderInit(pData, nData, &reader);
 5539: 
 5540:   /* Scan for the first child which could contain pTerm/nTerm. */
 5541:   while( !interiorReaderAtEnd(&reader) ){
 5542:     if( interiorReaderTermCmp(&reader, pTerm, nTerm, 0)>0 ) break;
 5543:     interiorReaderStep(&reader);
 5544:   }
 5545:   *piStartChild = interiorReaderCurrentBlockid(&reader);
 5546: 
 5547:   /* Keep scanning to find a term greater than our term, using prefix
 5548:   ** comparison if indicated.  If isPrefix is false, this will be the
 5549:   ** same blockid as the starting block.
 5550:   */
 5551:   while( !interiorReaderAtEnd(&reader) ){
 5552:     if( interiorReaderTermCmp(&reader, pTerm, nTerm, isPrefix)>0 ) break;
 5553:     interiorReaderStep(&reader);
 5554:   }
 5555:   *piEndChild = interiorReaderCurrentBlockid(&reader);
 5556: 
 5557:   interiorReaderDestroy(&reader);
 5558: 
 5559:   /* Children must ascend, and if !prefix, both must be the same. */
 5560:   assert( *piEndChild>=*piStartChild );
 5561:   assert( isPrefix || *piStartChild==*piEndChild );
 5562: }
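/* Example (illustrative data): given an interior node with base
** blockid B and terms ["cat", "dog"], the candidate children are B,
** B+1, B+2.  Searching "ant" yields start==end==B; searching "cat"
** yields start==end==B+1 (terms >= "cat" but < "dog" live there).  A
** prefix search for "ca" yields start==B (the non-prefix compare puts
** "ca" before "cat") and end==B+1, so both children get scanned.
*/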
 5563: 
 5564: /* Read block at iBlockid and pass it with other params to
 5565: ** getChildrenContaining().
 5566: */
 5567: static int loadAndGetChildrenContaining(
 5568:   fulltext_vtab *v,
 5569:   sqlite_int64 iBlockid,
 5570:   const char *pTerm, int nTerm, int isPrefix,
 5571:   sqlite_int64 *piStartChild, sqlite_int64 *piEndChild
 5572: ){
 5573:   sqlite3_stmt *s = NULL;
 5574:   int rc;
 5575: 
 5576:   assert( iBlockid!=0 );
 5577:   assert( pTerm!=NULL );
 5578:   assert( nTerm!=0 );        /* TODO(shess) Why not allow this? */
 5579:   assert( piStartChild!=NULL );
 5580:   assert( piEndChild!=NULL );
 5581: 
 5582:   rc = sql_get_statement(v, BLOCK_SELECT_STMT, &s);
 5583:   if( rc!=SQLITE_OK ) return rc;
 5584: 
 5585:   rc = sqlite3_bind_int64(s, 1, iBlockid);
 5586:   if( rc!=SQLITE_OK ) return rc;
 5587: 
 5588:   rc = sqlite3_step(s);
 5589:   if( rc==SQLITE_DONE ) return SQLITE_ERROR;
 5590:   if( rc!=SQLITE_ROW ) return rc;
 5591: 
 5592:   getChildrenContaining(sqlite3_column_blob(s, 0), sqlite3_column_bytes(s, 0),
 5593:                         pTerm, nTerm, isPrefix, piStartChild, piEndChild);
 5594: 
 5595:   /* We expect only one row.  We must execute another sqlite3_step()
 5596:    * to complete the iteration; otherwise the table will remain
 5597:    * locked. */
 5598:   rc = sqlite3_step(s);
 5599:   if( rc==SQLITE_ROW ) return SQLITE_ERROR;
 5600:   if( rc!=SQLITE_DONE ) return rc;
 5601: 
 5602:   return SQLITE_OK;
 5603: }
 5604: 
 5605: /* Traverse the tree represented by pData[nData] looking for
 5606: ** pTerm[nTerm], placing its doclist into *out.  This is internal to
 5607: ** loadSegment() to make error-handling cleaner.
 5608: */
 5609: static int loadSegmentInt(fulltext_vtab *v, const char *pData, int nData,
 5610:                           sqlite_int64 iLeavesEnd,
 5611:                           const char *pTerm, int nTerm, int isPrefix,
 5612:                           DataBuffer *out){
 5613:   /* Special case where root is a leaf. */
 5614:   if( *pData=='\0' ){
 5615:     return loadSegmentLeaf(v, pData, nData, pTerm, nTerm, isPrefix, out);
 5616:   }else{
 5617:     int rc;
 5618:     sqlite_int64 iStartChild, iEndChild;
 5619: 
 5620:     /* Process pData as an interior node, then loop down the tree
 5621:     ** until we find the set of leaf nodes to scan for the term.
 5622:     */
 5623:     getChildrenContaining(pData, nData, pTerm, nTerm, isPrefix,
 5624:                           &iStartChild, &iEndChild);
 5625:     while( iStartChild>iLeavesEnd ){
 5626:       sqlite_int64 iNextStart, iNextEnd;
 5627:       rc = loadAndGetChildrenContaining(v, iStartChild, pTerm, nTerm, isPrefix,
 5628:                                         &iNextStart, &iNextEnd);
 5629:       if( rc!=SQLITE_OK ) return rc;
 5630: 
 5631:       /* If we've branched, follow the end branch, too. */
 5632:       if( iStartChild!=iEndChild ){
 5633:         sqlite_int64 iDummy;
 5634:         rc = loadAndGetChildrenContaining(v, iEndChild, pTerm, nTerm, isPrefix,
 5635:                                           &iDummy, &iNextEnd);
 5636:         if( rc!=SQLITE_OK ) return rc;
 5637:       }
 5638: 
 5639:       assert( iNextStart<=iNextEnd );
 5640:       iStartChild = iNextStart;
 5641:       iEndChild = iNextEnd;
 5642:     }
 5643:     assert( iStartChild<=iLeavesEnd );
 5644:     assert( iEndChild<=iLeavesEnd );
 5645: 
 5646:     /* Scan through the leaf segments for doclists. */
 5647:     return loadSegmentLeaves(v, iStartChild, iEndChild,
 5648:                              pTerm, nTerm, isPrefix, out);
 5649:   }
 5650: }
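/* Example (illustrative blockids): suppose a segment's leaves occupy
** blocks 1..5 (iLeavesEnd==5) and its interior nodes blocks 6..7, with
** the root held inline in %_segdir.  If the root maps the term to
** children 6..6, the loop loads block 6 (interior, since 6>5), which
** might in turn map the term to children 2..3; those are <=iLeavesEnd,
** so the loop exits and leaves 2..3 are scanned for the doclist.
*/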
 5651: 
 5652: /* Call loadSegmentInt() to collect the doclist for pTerm/nTerm, then
 5653: ** merge its doclist over *out (any duplicate doclists read from the
 5654: ** segment rooted at pData will overwrite those in *out).
 5655: */
 5656: /* TODO(shess) Consider changing this to determine the depth of the
 5657: ** leaves using either the first characters of interior nodes (when
 5658: ** ==1, we're one level above the leaves), or the first character of
 5659: ** the root (which will describe the height of the tree directly).
 5660: ** Either feels somewhat tricky to me.
 5661: */
 5662: /* TODO(shess) The current merge is likely to be slow for large
 5663: ** doclists (though it should process from newest/smallest to
 5664: ** oldest/largest, so it may not be that bad).  It might be useful to
 5665: ** modify things to allow for N-way merging.  This could either be
 5666: ** within a segment, with pairwise merges across segments, or across
 5667: ** all segments at once.
 5668: */
 5669: static int loadSegment(fulltext_vtab *v, const char *pData, int nData,
 5670:                        sqlite_int64 iLeavesEnd,
 5671:                        const char *pTerm, int nTerm, int isPrefix,
 5672:                        DataBuffer *out){
 5673:   DataBuffer result;
 5674:   int rc;
 5675: 
 5676:   assert( nData>1 );
 5677: 
 5678:   /* This code should never be called with buffered updates. */
 5679:   assert( v->nPendingData<0 );
 5680: 
 5681:   dataBufferInit(&result, 0);
 5682:   rc = loadSegmentInt(v, pData, nData, iLeavesEnd,
 5683:                       pTerm, nTerm, isPrefix, &result);
 5684:   if( rc==SQLITE_OK && result.nData>0 ){
 5685:     if( out->nData==0 ){
 5686:       DataBuffer tmp = *out;
 5687:       *out = result;
 5688:       result = tmp;
 5689:     }else{
 5690:       DataBuffer merged;
 5691:       DLReader readers[2];
 5692: 
 5693:       dlrInit(&readers[0], DL_DEFAULT, out->pData, out->nData);
 5694:       dlrInit(&readers[1], DL_DEFAULT, result.pData, result.nData);
 5695:       dataBufferInit(&merged, out->nData+result.nData);
 5696:       docListMerge(&merged, readers, 2);
 5697:       dataBufferDestroy(out);
 5698:       *out = merged;
 5699:       dlrDestroy(&readers[0]);
 5700:       dlrDestroy(&readers[1]);
 5701:     }
 5702:   }
 5703:   dataBufferDestroy(&result);
 5704:   return rc;
 5705: }
 5706: 
 5707: /* Scan the database and merge together the posting lists for the term
 5708: ** into *out.
 5709: */
 5710: static int termSelect(fulltext_vtab *v, int iColumn,
 5711:                       const char *pTerm, int nTerm, int isPrefix,
 5712:                       DocListType iType, DataBuffer *out){
 5713:   DataBuffer doclist;
 5714:   sqlite3_stmt *s;
 5715:   int rc = sql_get_statement(v, SEGDIR_SELECT_ALL_STMT, &s);
 5716:   if( rc!=SQLITE_OK ) return rc;
 5717: 
 5718:   /* This code should never be called with buffered updates. */
 5719:   assert( v->nPendingData<0 );
 5720: 
 5721:   dataBufferInit(&doclist, 0);
 5722: 
 5723:   /* Traverse the segments from oldest to newest so that newer doclist
 5724:   ** elements for given docids overwrite older elements.
 5725:   */
 5726:   while( (rc = sqlite3_step(s))==SQLITE_ROW ){
 5727:     const char *pData = sqlite3_column_blob(s, 2);
 5728:     const int nData = sqlite3_column_bytes(s, 2);
 5729:     const sqlite_int64 iLeavesEnd = sqlite3_column_int64(s, 1);
 5730:     rc = loadSegment(v, pData, nData, iLeavesEnd, pTerm, nTerm, isPrefix,
 5731:                      &doclist);
 5732:     if( rc!=SQLITE_OK ) goto err;
 5733:   }
 5734:   if( rc==SQLITE_DONE ){
 5735:     if( doclist.nData!=0 ){
 5736:       /* TODO(shess) The old term_select_all() code applied the column
 5737:       ** restrict as we merged segments, leading to smaller buffers.
 5738:       ** This is probably worthwhile to bring back, once the new storage
 5739:       ** system is checked in.
 5740:       */
 5741:       if( iColumn==v->nColumn ) iColumn = -1;
 5742:       docListTrim(DL_DEFAULT, doclist.pData, doclist.nData,
 5743:                   iColumn, iType, out);
 5744:     }
 5745:     rc = SQLITE_OK;
 5746:   }
 5747: 
 5748:  err:
 5749:   dataBufferDestroy(&doclist);
 5750:   return rc;
 5751: }
 5752: 
 5753: /****************************************************************/
 5754: /* Used to hold hashtable data for sorting. */
 5755: typedef struct TermData {
 5756:   const char *pTerm;
 5757:   int nTerm;
 5758:   DLCollector *pCollector;
 5759: } TermData;
 5760: 
 5761: /* Orders TermData elements in strcmp fashion (<0 for less-than, 0
 5762: ** for equal, >0 for greater-than).
 5763: */
 5764: static int termDataCmp(const void *av, const void *bv){
 5765:   const TermData *a = (const TermData *)av;
 5766:   const TermData *b = (const TermData *)bv;
 5767:   int n = a->nTerm<b->nTerm ? a->nTerm : b->nTerm;
 5768:   int c = memcmp(a->pTerm, b->pTerm, n);
 5769:   if( c!=0 ) return c;
 5770:   return a->nTerm-b->nTerm;
 5771: }
 5772: 
 5773: /* Order pTerms data by term, then write a new level 0 segment using
 5774: ** LeafWriter.
 5775: */
 5776: static int writeZeroSegment(fulltext_vtab *v, fts2Hash *pTerms){
 5777:   fts2HashElem *e;
 5778:   int idx, rc, i, n;
 5779:   TermData *pData;
 5780:   LeafWriter writer;
 5781:   DataBuffer dl;
 5782: 
 5783:   /* Determine the next index at level 0, merging as necessary. */
 5784:   rc = segdirNextIndex(v, 0, &idx);
 5785:   if( rc!=SQLITE_OK ) return rc;
 5786: 
 5787:   n = fts2HashCount(pTerms);
 5788:   pData = sqlite3_malloc(n*sizeof(TermData));
 5789:   if( n>0 && pData==NULL ) return SQLITE_NOMEM;  /* allocation failed */
 5790:   for(i = 0, e = fts2HashFirst(pTerms); e; i++, e = fts2HashNext(e)){
 5791:     assert( i<n );
 5792:     pData[i].pTerm = fts2HashKey(e);
 5793:     pData[i].nTerm = fts2HashKeysize(e);
 5794:     pData[i].pCollector = fts2HashData(e);
 5795:   }
 5796:   assert( i==n );
 5797: 
 5798:   /* TODO(shess) Should we allow user-defined collation sequences,
 5799:   ** here?  I think we only need that once we support prefix searches.
 5800:   */
 5801:   if( n>1 ) qsort(pData, n, sizeof(*pData), termDataCmp);
 5802: 
 5803:   /* TODO(shess) Refactor so that we can write directly to the segment
 5804:   ** DataBuffer, as happens for segment merges.
 5805:   */
 5806:   leafWriterInit(0, idx, &writer);
 5807:   dataBufferInit(&dl, 0);
 5808:   for(i=0; i<n; i++){
 5809:     dataBufferReset(&dl);
 5810:     dlcAddDoclist(pData[i].pCollector, &dl);
 5811:     rc = leafWriterStep(v, &writer,
 5812:                         pData[i].pTerm, pData[i].nTerm, dl.pData, dl.nData);
 5813:     if( rc!=SQLITE_OK ) goto err;
 5814:   }
 5815:   rc = leafWriterFinalize(v, &writer);
 5816: 
 5817:  err:
 5818:   dataBufferDestroy(&dl);
 5819:   sqlite3_free(pData);
 5820:   leafWriterDestroy(&writer);
 5821:   return rc;
 5822: }
 5823: 
 5824: /* If pendingTerms has data, free it. */
 5825: static int clearPendingTerms(fulltext_vtab *v){
 5826:   if( v->nPendingData>=0 ){
 5827:     fts2HashElem *e;
 5828:     for(e=fts2HashFirst(&v->pendingTerms); e; e=fts2HashNext(e)){
 5829:       dlcDelete(fts2HashData(e));
 5830:     }
 5831:     fts2HashClear(&v->pendingTerms);
 5832:     v->nPendingData = -1;
 5833:   }
 5834:   return SQLITE_OK;
 5835: }
 5836: 
 5837: /* If pendingTerms has data, flush it to a level-zero segment, and
 5838: ** free it.
 5839: */
 5840: static int flushPendingTerms(fulltext_vtab *v){
 5841:   if( v->nPendingData>=0 ){
 5842:     int rc = writeZeroSegment(v, &v->pendingTerms);
 5843:     if( rc==SQLITE_OK ) clearPendingTerms(v);
 5844:     return rc;
 5845:   }
 5846:   return SQLITE_OK;
 5847: }
 5848: 
 5849: /* If pendingTerms is "too big", or docid is out of order, flush it.
 5850: ** Regardless, be certain that pendingTerms is initialized for use.
 5851: */
 5852: static int initPendingTerms(fulltext_vtab *v, sqlite_int64 iDocid){
 5853:   /* TODO(shess) Explore whether partially flushing the buffer on
 5854:   ** forced-flush would provide better performance.  I suspect that if
 5855:   ** we ordered the doclists by size and flushed the largest until the
 5856:   ** buffer was half empty, that would let the less frequent terms
 5857:   ** generate longer doclists.
 5858:   */
 5859:   if( iDocid<=v->iPrevDocid || v->nPendingData>kPendingThreshold ){
 5860:     int rc = flushPendingTerms(v);
 5861:     if( rc!=SQLITE_OK ) return rc;
 5862:   }
 5863:   if( v->nPendingData<0 ){
 5864:     fts2HashInit(&v->pendingTerms, FTS2_HASH_STRING, 1);
 5865:     v->nPendingData = 0;
 5866:   }
 5867:   v->iPrevDocid = iDocid;
 5868:   return SQLITE_OK;
 5869: }
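
       /* A sketch of the pending-terms lifecycle, assuming the usual call
       ** pattern from the insert/update path:
       **
       **   initPendingTerms(v, iDocid);     may flush, then (re)initialize
       **   ...tokenize the row, accumulating DLCollector entries in
       **      v->pendingTerms and growing v->nPendingData...
       **   at xSync/commit time, flushPendingTerms() writes the hash out
       **   as a level-0 segment and clearPendingTerms() frees it.
       */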
 5870: 
 5871: /* This function implements the xUpdate callback; it is the top-level entry
 5872:  * point for inserting, deleting or updating a row in a full-text table. */
 5873: static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg,
 5874:                    sqlite_int64 *pRowid){
 5875:   fulltext_vtab *v = (fulltext_vtab *) pVtab;
 5876:   int rc;
 5877: 
 5878:   TRACE(("FTS2 Update %p\n", pVtab));
 5879: 
 5880:   if( nArg<2 ){
 5881:     rc = index_delete(v, sqlite3_value_int64(ppArg[0]));
 5882:     if( rc==SQLITE_OK ){
 5883:       /* If we just deleted the last row in the table, clear out the
 5884:       ** index data.
 5885:       */
 5886:       rc = content_exists(v);
 5887:       if( rc==SQLITE_ROW ){
 5888:         rc = SQLITE_OK;
 5889:       }else if( rc==SQLITE_DONE ){
 5890:         /* Clear the pending terms so we don't flush a useless level-0
 5891:         ** segment when the transaction closes.
 5892:         */
 5893:         rc = clearPendingTerms(v);
 5894:         if( rc==SQLITE_OK ){
 5895:           rc = segdir_delete_all(v);
 5896:         }
 5897:       }
 5898:     }
 5899:   } else if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){
 5900:     /* An update:
 5901:      * ppArg[0] = old rowid
 5902:      * ppArg[1] = new rowid
 5903:      * ppArg[2..2+v->nColumn-1] = values
 5904:      * ppArg[2+v->nColumn] = value for magic column (we ignore this)
 5905:      */
 5906:     sqlite_int64 rowid = sqlite3_value_int64(ppArg[0]);
 5907:     if( sqlite3_value_type(ppArg[1]) != SQLITE_INTEGER ||
 5908:       sqlite3_value_int64(ppArg[1]) != rowid ){
 5909:       rc = SQLITE_ERROR;  /* we don't allow changing the rowid */
 5910:     } else {
 5911:       assert( nArg==2+v->nColumn+1);
 5912:       rc = index_update(v, rowid, &ppArg[2]);
 5913:     }
 5914:   } else {
 5915:     /* An insert:
 5916:      * ppArg[1] = requested rowid
 5917:      * ppArg[2..2+v->nColumn-1] = values
 5918:      * ppArg[2+v->nColumn] = value for magic column (we ignore this)
 5919:      */
 5920:     assert( nArg==2+v->nColumn+1);
 5921:     rc = index_insert(v, ppArg[1], &ppArg[2], pRowid);
 5922:   }
 5923: 
 5924:   return rc;
 5925: }
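
       /* For reference, the SQL shapes that reach fulltextUpdate() above
       ** (table and column names are illustrative):
       **
       **   DELETE FROM t WHERE rowid=?;           -- nArg==1
       **   UPDATE t SET content=? WHERE rowid=?;  -- ppArg[0] = old rowid
       **   INSERT INTO t(content) VALUES(?);      -- ppArg[0] IS NULL
       */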
 5926: 
 5927: static int fulltextSync(sqlite3_vtab *pVtab){
 5928:   TRACE(("FTS2 xSync()\n"));
 5929:   return flushPendingTerms((fulltext_vtab *)pVtab);
 5930: }
 5931: 
 5932: static int fulltextBegin(sqlite3_vtab *pVtab){
 5933:   fulltext_vtab *v = (fulltext_vtab *) pVtab;
 5934:   TRACE(("FTS2 xBegin()\n"));
 5935: 
 5936:   /* Any buffered updates should have been cleared by the previous
 5937:   ** transaction.
 5938:   */
 5939:   assert( v->nPendingData<0 );
 5940:   return clearPendingTerms(v);
 5941: }
 5942: 
 5943: static int fulltextCommit(sqlite3_vtab *pVtab){
 5944:   fulltext_vtab *v = (fulltext_vtab *) pVtab;
 5945:   TRACE(("FTS2 xCommit()\n"));
 5946: 
 5947:   /* Buffered updates should have been cleared by fulltextSync(). */
 5948:   assert( v->nPendingData<0 );
 5949:   return clearPendingTerms(v);
 5950: }
 5951: 
 5952: static int fulltextRollback(sqlite3_vtab *pVtab){
 5953:   TRACE(("FTS2 xRollback()\n"));
 5954:   return clearPendingTerms((fulltext_vtab *)pVtab);
 5955: }
 5956: 
 5957: /*
 5958: ** Implementation of the snippet() function for FTS2
 5959: */
 5960: static void snippetFunc(
 5961:   sqlite3_context *pContext,
 5962:   int argc,
 5963:   sqlite3_value **argv
 5964: ){
 5965:   fulltext_cursor *pCursor;
 5966:   if( argc<1 ) return;
 5967:   if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
 5968:       sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
 5969:     sqlite3_result_error(pContext, "illegal first argument to snippet",-1);
 5970:   }else{
 5971:     const char *zStart = "<b>";
 5972:     const char *zEnd = "</b>";
 5973:     const char *zEllipsis = "<b>...</b>";
 5974:     memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
 5975:     if( argc>=2 ){
 5976:       zStart = (const char*)sqlite3_value_text(argv[1]);
 5977:       if( argc>=3 ){
 5978:         zEnd = (const char*)sqlite3_value_text(argv[2]);
 5979:         if( argc>=4 ){
 5980:           zEllipsis = (const char*)sqlite3_value_text(argv[3]);
 5981:         }
 5982:       }
 5983:     }
 5984:     snippetAllOffsets(pCursor);
 5985:     snippetText(pCursor, zStart, zEnd, zEllipsis);
 5986:     sqlite3_result_text(pContext, pCursor->snippet.zSnippet,
 5987:                         pCursor->snippet.nSnippet, SQLITE_STATIC);
 5988:   }
 5989: }
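
       /* Typical usage, assuming an fts2 table named t (whose magic
       ** table-named column is also t); markers default to <b>..</b>:
       **
       **   SELECT snippet(t) FROM t WHERE t MATCH 'sqlite';
       **   SELECT snippet(t, '[', ']', '...') FROM t WHERE t MATCH 'sqlite';
       */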
 5990: 
 5991: /*
 5992: ** Implementation of the offsets() function for FTS2
 5993: */
 5994: static void snippetOffsetsFunc(
 5995:   sqlite3_context *pContext,
 5996:   int argc,
 5997:   sqlite3_value **argv
 5998: ){
 5999:   fulltext_cursor *pCursor;
 6000:   if( argc<1 ) return;
 6001:   if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
 6002:       sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
 6003:     sqlite3_result_error(pContext, "illegal first argument to offsets",-1);
 6004:   }else{
 6005:     memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
 6006:     snippetAllOffsets(pCursor);
 6007:     snippetOffsetText(&pCursor->snippet);
 6008:     sqlite3_result_text(pContext,
 6009:                         pCursor->snippet.zOffset, pCursor->snippet.nOffset,
 6010:                         SQLITE_STATIC);
 6011:   }
 6012: }
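
       /* Typical usage, mirroring snippet() above:
       **
       **   SELECT offsets(t) FROM t WHERE t MATCH 'sqlite';
       */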
 6013: 
 6014: /* OptLeavesReader is nearly identical to LeavesReader, except that
 6015: ** where LeavesReader is geared towards the merging of complete
 6016: ** segment levels (with exactly MERGE_COUNT segments), OptLeavesReader
 6017: ** is geared towards implementation of the optimize() function, and
 6018: ** can merge all segments simultaneously.  This version may be
 6019: ** somewhat less efficient than LeavesReader because it merges into an
 6020: ** accumulator rather than doing an N-way merge, but since segment
 6021: ** size grows exponentially (so segment count logarithmically) this is
 6022: ** probably not an immediate problem.
 6023: */
 6024: /* TODO(shess): Prove that assertion, or extend the merge code to
 6025: ** merge in tree fashion (like the prefix-searching code does).
 6026: */
 6027: /* TODO(shess): OptLeavesReader and LeavesReader could probably be
 6028: ** merged with little or no loss of performance for LeavesReader.  The
 6029: ** merged code would need to handle >MERGE_COUNT segments, and would
 6030: ** also need to be able to optionally optimize away deletes.
 6031: */
 6032: typedef struct OptLeavesReader {
 6033:   /* Segment number, to order readers by age. */
 6034:   int segment;
 6035:   LeavesReader reader;
 6036: } OptLeavesReader;
 6037: 
 6038: static int optLeavesReaderAtEnd(OptLeavesReader *pReader){
 6039:   return leavesReaderAtEnd(&pReader->reader);
 6040: }
 6041: static int optLeavesReaderTermBytes(OptLeavesReader *pReader){
 6042:   return leavesReaderTermBytes(&pReader->reader);
 6043: }
 6044: static const char *optLeavesReaderData(OptLeavesReader *pReader){
 6045:   return leavesReaderData(&pReader->reader);
 6046: }
 6047: static int optLeavesReaderDataBytes(OptLeavesReader *pReader){
 6048:   return leavesReaderDataBytes(&pReader->reader);
 6049: }
 6050: static const char *optLeavesReaderTerm(OptLeavesReader *pReader){
 6051:   return leavesReaderTerm(&pReader->reader);
 6052: }
 6053: static int optLeavesReaderStep(fulltext_vtab *v, OptLeavesReader *pReader){
 6054:   return leavesReaderStep(v, &pReader->reader);
 6055: }
 6056: static int optLeavesReaderTermCmp(OptLeavesReader *lr1, OptLeavesReader *lr2){
 6057:   return leavesReaderTermCmp(&lr1->reader, &lr2->reader);
 6058: }
 6059: /* Order by term ascending, segment ascending (oldest to newest), with
 6060: ** exhausted readers to the end.
 6061: */
 6062: static int optLeavesReaderCmp(OptLeavesReader *lr1, OptLeavesReader *lr2){
 6063:   int c = optLeavesReaderTermCmp(lr1, lr2);
 6064:   if( c!=0 ) return c;
 6065:   return lr1->segment-lr2->segment;
 6066: }
 6067: /* Bubble pLr[0] to its appropriate place in pLr[1..nLr-1].  Assumes that
 6068: ** pLr[1..nLr-1] is already sorted.
 6069: */
 6070: static void optLeavesReaderReorder(OptLeavesReader *pLr, int nLr){
 6071:   while( nLr>1 && optLeavesReaderCmp(pLr, pLr+1)>0 ){
 6072:     OptLeavesReader tmp = pLr[0];
 6073:     pLr[0] = pLr[1];
 6074:     pLr[1] = tmp;
 6075:     nLr--;
 6076:     pLr++;
 6077:   }
 6078: }
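
       /* Illustration: with terms ("cat", "ant", "bee") in pLr[0..2] and
       ** pLr[1..2] already sorted, "cat" is swapped past "ant" and then
       ** past "bee", leaving ("ant", "bee", "cat").
       */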
 6079: 
 6080: /* optimize() helper function.  Put the readers in order and iterate
 6081: ** through them, merging doclists for matching terms into pWriter.
 6082: ** Returns SQLITE_OK on success, or the SQLite error code which
 6083: ** prevented success.
 6084: */
 6085: static int optimizeInternal(fulltext_vtab *v,
 6086:                             OptLeavesReader *readers, int nReaders,
 6087:                             LeafWriter *pWriter){
 6088:   int i, rc = SQLITE_OK;
 6089:   DataBuffer doclist, merged, tmp;
 6090: 
 6091:   /* Order the readers. */
 6092:   i = nReaders;
 6093:   while( i-- > 0 ){
 6094:     optLeavesReaderReorder(&readers[i], nReaders-i);
 6095:   }
 6096: 
 6097:   dataBufferInit(&doclist, LEAF_MAX);
 6098:   dataBufferInit(&merged, LEAF_MAX);
 6099: 
 6100:   /* Exhausted readers bubble to the end, so when the first reader is
 6101:   ** at eof, all are at eof.
 6102:   */
 6103:   while( !optLeavesReaderAtEnd(&readers[0]) ){
 6104: 
 6105:     /* Figure out how many readers share the next term. */
 6106:     for(i=1; i<nReaders && !optLeavesReaderAtEnd(&readers[i]); i++){
 6107:       if( 0!=optLeavesReaderTermCmp(&readers[0], &readers[i]) ) break;
 6108:     }
 6109: 
 6110:     /* Special-case for no merge. */
 6111:     if( i==1 ){
 6112:       /* Trim deletions from the doclist. */
 6113:       dataBufferReset(&merged);
 6114:       docListTrim(DL_DEFAULT,
 6115:                   optLeavesReaderData(&readers[0]),
 6116:                   optLeavesReaderDataBytes(&readers[0]),
 6117:                   -1, DL_DEFAULT, &merged);
 6118:     }else{
 6119:       DLReader dlReaders[MERGE_COUNT];
 6120:       int iReader, nReaders;    /* nReaders here shadows the parameter. */
 6121: 
 6122:       /* Prime the pipeline with the first reader's doclist.  After
 6123:       ** one pass index 0 will reference the accumulated doclist.
 6124:       */
 6125:       dlrInit(&dlReaders[0], DL_DEFAULT,
 6126:               optLeavesReaderData(&readers[0]),
 6127:               optLeavesReaderDataBytes(&readers[0]));
 6128:       iReader = 1;
 6129: 
 6130:       assert( iReader<i );  /* Must execute the loop at least once. */
 6131:       while( iReader<i ){
 6132:         /* Merge up to MERGE_COUNT (16) inputs per pass. */
 6133:         for( nReaders=1; iReader<i && nReaders<MERGE_COUNT;
 6134:              iReader++, nReaders++ ){
 6135:           dlrInit(&dlReaders[nReaders], DL_DEFAULT,
 6136:                   optLeavesReaderData(&readers[iReader]),
 6137:                   optLeavesReaderDataBytes(&readers[iReader]));
 6138:         }
 6139: 
 6140:         /* Merge doclists and swap result into accumulator. */
 6141:         dataBufferReset(&merged);
 6142:         docListMerge(&merged, dlReaders, nReaders);
 6143:         tmp = merged;
 6144:         merged = doclist;
 6145:         doclist = tmp;
 6146: 
 6147:         while( nReaders-- > 0 ){
 6148:           dlrDestroy(&dlReaders[nReaders]);
 6149:         }
 6150: 
 6151:         /* Accumulated doclist to reader 0 for next pass. */
 6152:         dlrInit(&dlReaders[0], DL_DEFAULT, doclist.pData, doclist.nData);
 6153:       }
 6154: 
 6155:       /* Destroy reader that was left in the pipeline. */
 6156:       dlrDestroy(&dlReaders[0]);
 6157: 
 6158:       /* Trim deletions from the doclist. */
 6159:       dataBufferReset(&merged);
 6160:       docListTrim(DL_DEFAULT, doclist.pData, doclist.nData,
 6161:                   -1, DL_DEFAULT, &merged);
 6162:     }
 6163: 
 6164:     /* Only pass doclists with hits (skip if all hits deleted). */
 6165:     if( merged.nData>0 ){
 6166:       rc = leafWriterStep(v, pWriter,
 6167:                           optLeavesReaderTerm(&readers[0]),
 6168:                           optLeavesReaderTermBytes(&readers[0]),
 6169:                           merged.pData, merged.nData);
 6170:       if( rc!=SQLITE_OK ) goto err;
 6171:     }
 6172: 
 6173:     /* Step merged readers to next term and reorder. */
 6174:     while( i-- > 0 ){
 6175:       rc = optLeavesReaderStep(v, &readers[i]);
 6176:       if( rc!=SQLITE_OK ) goto err;
 6177: 
 6178:       optLeavesReaderReorder(&readers[i], nReaders-i);
 6179:     }
 6180:   }
 6181: 
 6182:  err:
 6183:   dataBufferDestroy(&doclist);
 6184:   dataBufferDestroy(&merged);
 6185:   return rc;
 6186: }
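
       /* Illustration of the pipeline above: if i==40 readers share a term
       ** and MERGE_COUNT==16, the first pass merges readers 0..15, the
       ** second merges that accumulated doclist with readers 16..30, and
       ** the third merges it with readers 31..39.
       */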
 6187: 
 6188: /* Implement optimize() function for FTS2.  optimize(t) merges all
 6189: ** segments in the fts index into a single segment.  't' is the magic
 6190: ** table-named column.
 6191: */
 6192: static void optimizeFunc(sqlite3_context *pContext,
 6193:                          int argc, sqlite3_value **argv){
 6194:   fulltext_cursor *pCursor;
 6195:   if( argc>1 ){
 6196:     sqlite3_result_error(pContext, "excess arguments to optimize()",-1);
 6197:   }else if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
 6198:             sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
 6199:     sqlite3_result_error(pContext, "illegal first argument to optimize",-1);
 6200:   }else{
 6201:     fulltext_vtab *v;
 6202:     int i, rc, iMaxLevel;
 6203:     OptLeavesReader *readers;
 6204:     int nReaders;
 6205:     LeafWriter writer;
 6206:     sqlite3_stmt *s;
 6207: 
 6208:     memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
 6209:     v = cursor_vtab(pCursor);
 6210: 
 6211:     /* Flush any buffered updates before optimizing. */
 6212:     rc = flushPendingTerms(v);
 6213:     if( rc!=SQLITE_OK ) goto err;
 6214: 
 6215:     rc = segdir_count(v, &nReaders, &iMaxLevel);
 6216:     if( rc!=SQLITE_OK ) goto err;
 6217:     if( nReaders==0 || nReaders==1 ){
 6218:       sqlite3_result_text(pContext, "Index already optimal", -1,
 6219:                           SQLITE_STATIC);
 6220:       return;
 6221:     }
 6222: 
 6223:     rc = sql_get_statement(v, SEGDIR_SELECT_ALL_STMT, &s);
 6224:     if( rc!=SQLITE_OK ) goto err;
 6225: 
 6226:     readers = sqlite3_malloc(nReaders*sizeof(readers[0]));
 6227:     if( readers==NULL ) goto err;
 6228: 
 6229:     /* Note that there will already be a segment at this position
 6230:     ** until we call segdir_delete() on iMaxLevel.
 6231:     */
 6232:     leafWriterInit(iMaxLevel, 0, &writer);
 6233: 
 6234:     i = 0;
 6235:     while( (rc = sqlite3_step(s))==SQLITE_ROW ){
 6236:       sqlite_int64 iStart = sqlite3_column_int64(s, 0);
 6237:       sqlite_int64 iEnd = sqlite3_column_int64(s, 1);
 6238:       const char *pRootData = sqlite3_column_blob(s, 2);
 6239:       int nRootData = sqlite3_column_bytes(s, 2);
 6240: 
 6241:       assert( i<nReaders );
 6242:       rc = leavesReaderInit(v, -1, iStart, iEnd, pRootData, nRootData,
 6243:                             &readers[i].reader);
 6244:       if( rc!=SQLITE_OK ) break;
 6245: 
 6246:       readers[i].segment = i;
 6247:       i++;
 6248:     }
 6249: 
 6250:     /* If we managed to successfully read them all, optimize them. */
 6251:     if( rc==SQLITE_DONE ){
 6252:       assert( i==nReaders );
 6253:       rc = optimizeInternal(v, readers, nReaders, &writer);
 6254:     }
 6255: 
 6256:     while( i-- > 0 ){
 6257:       leavesReaderDestroy(&readers[i].reader);
 6258:     }
 6259:     sqlite3_free(readers);
 6260: 
 6261:     /* If we've successfully gotten to here, delete the old segments
 6262:     ** and flush the interior structure of the new segment.
 6263:     */
 6264:     if( rc==SQLITE_OK ){
 6265:       for( i=0; i<=iMaxLevel; i++ ){
 6266:         rc = segdir_delete(v, i);
 6267:         if( rc!=SQLITE_OK ) break;
 6268:       }
 6269: 
 6270:       if( rc==SQLITE_OK ) rc = leafWriterFinalize(v, &writer);
 6271:     }
 6272: 
 6273:     leafWriterDestroy(&writer);
 6274: 
 6275:     if( rc!=SQLITE_OK ) goto err;
 6276: 
 6277:     sqlite3_result_text(pContext, "Index optimized", -1, SQLITE_STATIC);
 6278:     return;
 6279: 
 6280:     /* TODO(shess): Error-handling needs to be improved along the
 6281:     ** lines of the dump_ functions.
 6282:     */
 6283:  err:
 6284:     {
 6285:       char buf[512];
 6286:       sqlite3_snprintf(sizeof(buf), buf, "Error in optimize: %s",
 6287:                        sqlite3_errmsg(sqlite3_context_db_handle(pContext)));
 6288:       sqlite3_result_error(pContext, buf, -1);
 6289:     }
 6290:   }
 6291: }
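
       /* Typical invocation (the LIMIT avoids re-optimizing once per row):
       **
       **   SELECT optimize(t) FROM t LIMIT 1;
       */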
 6292: 
 6293: #ifdef SQLITE_TEST
 6294: /* Generate an error of the form "<prefix>: <msg>".  If msg is NULL,
 6295: ** pull the error from the context's db handle.
 6296: */
 6297: static void generateError(sqlite3_context *pContext,
 6298:                           const char *prefix, const char *msg){
 6299:   char buf[512];
 6300:   if( msg==NULL ) msg = sqlite3_errmsg(sqlite3_context_db_handle(pContext));
 6301:   sqlite3_snprintf(sizeof(buf), buf, "%s: %s", prefix, msg);
 6302:   sqlite3_result_error(pContext, buf, -1);
 6303: }
 6304: 
 6305: /* Helper function to collect the set of terms in the segment into
 6306: ** pTerms.  The segment is defined by the leaf nodes between
 6307: ** iStartBlockid and iEndBlockid, inclusive, or by the contents of
 6308: ** pRootData if iStartBlockid is 0 (in which case the entire segment
 6309: ** fit in a leaf).
 6310: */
 6311: static int collectSegmentTerms(fulltext_vtab *v, sqlite3_stmt *s,
 6312:                                fts2Hash *pTerms){
 6313:   const sqlite_int64 iStartBlockid = sqlite3_column_int64(s, 0);
 6314:   const sqlite_int64 iEndBlockid = sqlite3_column_int64(s, 1);
 6315:   const char *pRootData = sqlite3_column_blob(s, 2);
 6316:   const int nRootData = sqlite3_column_bytes(s, 2);
 6317:   LeavesReader reader;
 6318:   int rc = leavesReaderInit(v, 0, iStartBlockid, iEndBlockid,
 6319:                             pRootData, nRootData, &reader);
 6320:   if( rc!=SQLITE_OK ) return rc;
 6321: 
 6322:   while( rc==SQLITE_OK && !leavesReaderAtEnd(&reader) ){
 6323:     const char *pTerm = leavesReaderTerm(&reader);
 6324:     const int nTerm = leavesReaderTermBytes(&reader);
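           /* The hash data is used only as a counter: a missing key looks
           ** up as NULL (0) and each occurrence stores the old value plus
           ** one; only the key set matters to the dump_terms() result.
           */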
 6325:     void *oldValue = sqlite3Fts2HashFind(pTerms, pTerm, nTerm);
 6326:     void *newValue = (void *)((char *)oldValue+1);
 6327: 
 6328:     /* From the comment before sqlite3Fts2HashInsert in fts2_hash.c,
 6329:     ** the data value passed is returned in case of malloc failure.
 6330:     */
 6331:     if( newValue==sqlite3Fts2HashInsert(pTerms, pTerm, nTerm, newValue) ){
 6332:       rc = SQLITE_NOMEM;
 6333:     }else{
 6334:       rc = leavesReaderStep(v, &reader);
 6335:     }
 6336:   }
 6337: 
 6338:   leavesReaderDestroy(&reader);
 6339:   return rc;
 6340: }
 6341: 
 6342: /* Helper function to build the result string for dump_terms(). */
 6343: static int generateTermsResult(sqlite3_context *pContext, fts2Hash *pTerms){
 6344:   int iTerm, nTerms, nResultBytes, iByte;
 6345:   char *result;
 6346:   TermData *pData;
 6347:   fts2HashElem *e;
 6348: 
 6349:   /* Iterate pTerms to generate an array of terms in pData for
 6350:   ** sorting.
 6351:   */
 6352:   nTerms = fts2HashCount(pTerms);
 6353:   assert( nTerms>0 );
 6354:   pData = sqlite3_malloc(nTerms*sizeof(TermData));
 6355:   if( pData==NULL ) return SQLITE_NOMEM;
 6356: 
 6357:   nResultBytes = 0;
 6358:   for(iTerm = 0, e = fts2HashFirst(pTerms); e; iTerm++, e = fts2HashNext(e)){
 6359:     nResultBytes += fts2HashKeysize(e)+1;   /* Term plus trailing space */
 6360:     assert( iTerm<nTerms );
 6361:     pData[iTerm].pTerm = fts2HashKey(e);
 6362:     pData[iTerm].nTerm = fts2HashKeysize(e);
 6363:     pData[iTerm].pCollector = fts2HashData(e);  /* unused */
 6364:   }
 6365:   assert( iTerm==nTerms );
 6366: 
 6367:   assert( nResultBytes>0 );   /* nTerms>0, nResultBytes must be, too. */
 6368:   result = sqlite3_malloc(nResultBytes);
 6369:   if( result==NULL ){
 6370:     sqlite3_free(pData);
 6371:     return SQLITE_NOMEM;
 6372:   }
 6373: 
 6374:   if( nTerms>1 ) qsort(pData, nTerms, sizeof(*pData), termDataCmp);
 6375: 
 6376:   /* Read the terms in order to build the result. */
 6377:   iByte = 0;
 6378:   for(iTerm=0; iTerm<nTerms; ++iTerm){
 6379:     memcpy(result+iByte, pData[iTerm].pTerm, pData[iTerm].nTerm);
 6380:     iByte += pData[iTerm].nTerm;
 6381:     result[iByte++] = ' ';
 6382:   }
 6383:   assert( iByte==nResultBytes );
 6384:   assert( result[nResultBytes-1]==' ' );
 6385:   result[nResultBytes-1] = '\0';
 6386: 
 6387:   /* Passes ownership of result to pContext. */
 6388:   sqlite3_result_text(pContext, result, nResultBytes-1, sqlite3_free);
 6389:   sqlite3_free(pData);
 6390:   return SQLITE_OK;
 6391: }
 6392: 
 6393: /* Implements dump_terms() for use in inspecting the fts2 index from
 6394: ** tests.  TEXT result containing the ordered list of terms joined by
 6395: ** spaces.  dump_terms(t, level, idx) dumps the terms for the segment
 6396: ** specified by level, idx (in %_segdir), while dump_terms(t) dumps
 6397: ** all terms in the index.  In both cases t is the fts table's magic
 6398: ** table-named column.
 6399: */
 6400: static void dumpTermsFunc(
 6401:   sqlite3_context *pContext,
 6402:   int argc, sqlite3_value **argv
 6403: ){
 6404:   fulltext_cursor *pCursor;
 6405:   if( argc!=3 && argc!=1 ){
 6406:     generateError(pContext, "dump_terms", "incorrect arguments");
 6407:   }else if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
 6408:             sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
 6409:     generateError(pContext, "dump_terms", "illegal first argument");
 6410:   }else{
 6411:     fulltext_vtab *v;
 6412:     fts2Hash terms;
 6413:     sqlite3_stmt *s = NULL;
 6414:     int rc;
 6415: 
 6416:     memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
 6417:     v = cursor_vtab(pCursor);
 6418: 
 6419:     /* If passed only the cursor column, get all segments.  Otherwise
 6420:     ** get the segment described by the following two arguments.
 6421:     */
 6422:     if( argc==1 ){
 6423:       rc = sql_get_statement(v, SEGDIR_SELECT_ALL_STMT, &s);
 6424:     }else{
 6425:       rc = sql_get_statement(v, SEGDIR_SELECT_SEGMENT_STMT, &s);
 6426:       if( rc==SQLITE_OK ){
 6427:         rc = sqlite3_bind_int(s, 1, sqlite3_value_int(argv[1]));
 6428:         if( rc==SQLITE_OK ){
 6429:           rc = sqlite3_bind_int(s, 2, sqlite3_value_int(argv[2]));
 6430:         }
 6431:       }
 6432:     }
 6433: 
 6434:     if( rc!=SQLITE_OK ){
 6435:       generateError(pContext, "dump_terms", NULL);
 6436:       return;
 6437:     }
 6438: 
 6439:     /* Collect the terms for each segment. */
 6440:     sqlite3Fts2HashInit(&terms, FTS2_HASH_STRING, 1);
 6441:     while( (rc = sqlite3_step(s))==SQLITE_ROW ){
 6442:       rc = collectSegmentTerms(v, s, &terms);
 6443:       if( rc!=SQLITE_OK ) break;
 6444:     }
 6445: 
 6446:     if( rc!=SQLITE_DONE ){
 6447:       sqlite3_reset(s);
 6448:       generateError(pContext, "dump_terms", NULL);
 6449:     }else{
 6450:       const int nTerms = fts2HashCount(&terms);
 6451:       if( nTerms>0 ){
 6452:         rc = generateTermsResult(pContext, &terms);
 6453:         if( rc==SQLITE_NOMEM ){
 6454:           generateError(pContext, "dump_terms", "out of memory");
 6455:         }else{
 6456:           assert( rc==SQLITE_OK );
 6457:         }
 6458:       }else if( argc==3 ){
 6459:         /* The specific segment asked for could not be found. */
 6460:         generateError(pContext, "dump_terms", "segment not found");
 6461:       }else{
 6462:         /* No segments found. */
 6463:         /* TODO(shess): It should be impossible to reach this.  This
 6464:         ** case can only happen for an empty table, in which case
 6465:         ** SQLite has no rows to call this function on.
 6466:         */
 6467:         sqlite3_result_null(pContext);
 6468:       }
 6469:     }
 6470:     sqlite3Fts2HashClear(&terms);
 6471:   }
 6472: }
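
       /* Example test usage (t is the fts table's magic column):
       **
       **   SELECT dump_terms(t) FROM t LIMIT 1;        -- whole index
       **   SELECT dump_terms(t, 0, 0) FROM t LIMIT 1;  -- level 0, idx 0
       */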
 6473: 
 6474: /* Expand the DL_DEFAULT doclist in pData into a text result in
 6475: ** pContext.
 6476: */
 6477: static void createDoclistResult(sqlite3_context *pContext,
 6478:                                 const char *pData, int nData){
 6479:   DataBuffer dump;
 6480:   DLReader dlReader;
 6481: 
 6482:   assert( pData!=NULL && nData>0 );
 6483: 
 6484:   dataBufferInit(&dump, 0);
 6485:   dlrInit(&dlReader, DL_DEFAULT, pData, nData);
 6486:   for( ; !dlrAtEnd(&dlReader); dlrStep(&dlReader) ){
 6487:     char buf[256];
 6488:     PLReader plReader;
 6489: 
 6490:     plrInit(&plReader, &dlReader);
 6491:     if( DL_DEFAULT==DL_DOCIDS || plrAtEnd(&plReader) ){
 6492:       sqlite3_snprintf(sizeof(buf), buf, "[%lld] ", dlrDocid(&dlReader));
 6493:       dataBufferAppend(&dump, buf, strlen(buf));
 6494:     }else{
 6495:       int iColumn = plrColumn(&plReader);
 6496: 
 6497:       sqlite3_snprintf(sizeof(buf), buf, "[%lld %d[",
 6498:                        dlrDocid(&dlReader), iColumn);
 6499:       dataBufferAppend(&dump, buf, strlen(buf));
 6500: 
 6501:       for( ; !plrAtEnd(&plReader); plrStep(&plReader) ){
 6502:         if( plrColumn(&plReader)!=iColumn ){
 6503:           iColumn = plrColumn(&plReader);
 6504:           sqlite3_snprintf(sizeof(buf), buf, "] %d[", iColumn);
 6505:           assert( dump.nData>0 );
 6506:           dump.nData--;                     /* Overwrite trailing space. */
 6507:           assert( dump.pData[dump.nData]==' ');
 6508:           dataBufferAppend(&dump, buf, strlen(buf));
 6509:         }
 6510:         if( DL_DEFAULT==DL_POSITIONS_OFFSETS ){
 6511:           sqlite3_snprintf(sizeof(buf), buf, "%d,%d,%d ",
 6512:                            plrPosition(&plReader),
 6513:                            plrStartOffset(&plReader), plrEndOffset(&plReader));
 6514:         }else if( DL_DEFAULT==DL_POSITIONS ){
 6515:           sqlite3_snprintf(sizeof(buf), buf, "%d ", plrPosition(&plReader));
 6516:         }else{
 6517:           assert( NULL=="Unhandled DL_DEFAULT value");
 6518:         }
 6519:         dataBufferAppend(&dump, buf, strlen(buf));
 6520:       }
 6521:       plrDestroy(&plReader);
 6522: 
 6523:       assert( dump.nData>0 );
 6524:       dump.nData--;                     /* Overwrite trailing space. */
 6525:       assert( dump.pData[dump.nData]==' ');
 6526:       dataBufferAppend(&dump, "]] ", 3);
 6527:     }
 6528:   }
 6529:   dlrDestroy(&dlReader);
 6530: 
 6531:   assert( dump.nData>0 );
 6532:   dump.nData--;                     /* Overwrite trailing space. */
 6533:   assert( dump.pData[dump.nData]==' ');
 6534:   dump.pData[dump.nData] = '\0';
 6535:   assert( dump.nData>0 );
 6536: 
 6537:   /* Passes ownership of dump's buffer to pContext. */
 6538:   sqlite3_result_text(pContext, dump.pData, dump.nData, sqlite3_free);
 6539:   dump.pData = NULL;
 6540:   dump.nData = dump.nCapacity = 0;
 6541: }
 6542: 
 6543: /* Implements dump_doclist() for use in inspecting the fts2 index from
 6544: ** tests.  TEXT result containing a string representation of the
 6545: ** doclist for the indicated term.  dump_doclist(t, term, level, idx)
 6546: ** dumps the doclist for term from the segment specified by level, idx
 6547: ** (in %_segdir), while dump_doclist(t, term) dumps the logical
 6548: ** doclist for the term across all segments.  The per-segment doclist
 6549: ** can contain deletions, while the full-index doclist will not
 6550: ** (deletions are omitted).
 6551: **
 6552: ** Result formats differ with the setting of DL_DEFAULT.  Examples:
 6553: **
 6554: ** DL_DOCIDS: [1] [3] [7]
 6555: ** DL_POSITIONS: [1 0[0 4] 1[17]] [3 1[5]]
 6556: ** DL_POSITIONS_OFFSETS: [1 0[0,0,3 4,23,26] 1[17,102,105]] [3 1[5,20,23]]
 6557: **
 6558: ** In each case the number after the outer '[' is the docid.  In the
 6559: ** latter two cases, the number before the inner '[' is the column
 6560: ** associated with the values within.  For DL_POSITIONS the numbers
 6561: ** within are the positions, for DL_POSITIONS_OFFSETS they are the
 6562: ** position, the start offset, and the end offset.
 6563: */
 6564: static void dumpDoclistFunc(
 6565:   sqlite3_context *pContext,
 6566:   int argc, sqlite3_value **argv
 6567: ){
 6568:   fulltext_cursor *pCursor;
 6569:   if( argc!=2 && argc!=4 ){
 6570:     generateError(pContext, "dump_doclist", "incorrect arguments");
 6571:   }else if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
 6572:             sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
 6573:     generateError(pContext, "dump_doclist", "illegal first argument");
 6574:   }else if( sqlite3_value_text(argv[1])==NULL ||
 6575:             sqlite3_value_text(argv[1])[0]=='\0' ){
 6576:     generateError(pContext, "dump_doclist", "empty second argument");
 6577:   }else{
 6578:     const char *pTerm = (const char *)sqlite3_value_text(argv[1]);
 6579:     const int nTerm = strlen(pTerm);
 6580:     fulltext_vtab *v;
 6581:     int rc;
 6582:     DataBuffer doclist;
 6583: 
 6584:     memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
 6585:     v = cursor_vtab(pCursor);
 6586: 
 6587:     dataBufferInit(&doclist, 0);
 6588: 
 6589:     /* termSelect() yields the same logical doclist that queries are
 6590:     ** run against.
 6591:     */
 6592:     if( argc==2 ){
 6593:       rc = termSelect(v, v->nColumn, pTerm, nTerm, 0, DL_DEFAULT, &doclist);
 6594:     }else{
 6595:       sqlite3_stmt *s = NULL;
 6596: 
 6597:       /* Get our specific segment's information. */
 6598:       rc = sql_get_statement(v, SEGDIR_SELECT_SEGMENT_STMT, &s);
 6599:       if( rc==SQLITE_OK ){
 6600:         rc = sqlite3_bind_int(s, 1, sqlite3_value_int(argv[2]));
 6601:         if( rc==SQLITE_OK ){
 6602:           rc = sqlite3_bind_int(s, 2, sqlite3_value_int(argv[3]));
 6603:         }
 6604:       }
 6605: 
 6606:       if( rc==SQLITE_OK ){
 6607:         rc = sqlite3_step(s);
 6608: 
 6609:         if( rc==SQLITE_DONE ){
 6610:           dataBufferDestroy(&doclist);
 6611:           generateError(pContext, "dump_doclist", "segment not found");
 6612:           return;
 6613:         }
 6614: 
 6615:         /* Found a segment, load it into doclist. */
 6616:         if( rc==SQLITE_ROW ){
 6617:           const sqlite_int64 iLeavesEnd = sqlite3_column_int64(s, 1);
 6618:           const char *pData = sqlite3_column_blob(s, 2);
 6619:           const int nData = sqlite3_column_bytes(s, 2);
 6620: 
 6621:           /* loadSegment() is used by termSelect() to load each
 6622:           ** segment's data.
 6623:           */
 6624:           rc = loadSegment(v, pData, nData, iLeavesEnd, pTerm, nTerm, 0,
 6625:                            &doclist);
 6626:           if( rc==SQLITE_OK ){
 6627:             rc = sqlite3_step(s);
 6628: 
 6629:             /* Should not have more than one matching segment. */
 6630:             if( rc!=SQLITE_DONE ){
 6631:               sqlite3_reset(s);
 6632:               dataBufferDestroy(&doclist);
 6633:               generateError(pContext, "dump_doclist", "invalid segdir");
 6634:               return;
 6635:             }
 6636:             rc = SQLITE_OK;
 6637:           }
 6638:         }
 6639:       }
 6640: 
 6641:       sqlite3_reset(s);
 6642:     }
 6643: 
 6644:     if( rc==SQLITE_OK ){
 6645:       if( doclist.nData>0 ){
 6646:         createDoclistResult(pContext, doclist.pData, doclist.nData);
 6647:       }else{
 6648:         /* TODO(shess): This can happen if the term is not present, or
 6649:         ** if all instances of the term have been deleted and this is
 6650:         ** an all-index dump.  It may be interesting to distinguish
 6651:         ** these cases.
 6652:         */
 6653:         sqlite3_result_text(pContext, "", 0, SQLITE_STATIC);
 6654:       }
 6655:     }else if( rc==SQLITE_NOMEM ){
 6656:       /* Handle out-of-memory cases specially because if they are
 6657:       ** generated in fts2 code they may not be reflected in the db
 6658:       ** handle.
 6659:       */
 6660:       /* TODO(shess): Handle this more comprehensively.
 6661:       ** sqlite3ErrStr() has what I need, but is internal.
 6662:       */
 6663:       generateError(pContext, "dump_doclist", "out of memory");
 6664:     }else{
 6665:       generateError(pContext, "dump_doclist", NULL);
 6666:     }
 6667: 
 6668:     dataBufferDestroy(&doclist);
 6669:   }
 6670: }
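
       /* Example test usage, mirroring dump_terms():
       **
       **   SELECT dump_doclist(t, 'term') FROM t LIMIT 1;
       **   SELECT dump_doclist(t, 'term', 0, 0) FROM t LIMIT 1;
       */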
 6671: #endif
 6672: 
 6673: /*
 6674: ** This routine implements the xFindFunction method for the FTS2
 6675: ** virtual table.
 6676: */
 6677: static int fulltextFindFunction(
 6678:   sqlite3_vtab *pVtab,
 6679:   int nArg,
 6680:   const char *zName,
 6681:   void (**pxFunc)(sqlite3_context*,int,sqlite3_value**),
 6682:   void **ppArg
 6683: ){
 6684:   if( strcmp(zName,"snippet")==0 ){
 6685:     *pxFunc = snippetFunc;
 6686:     return 1;
 6687:   }else if( strcmp(zName,"offsets")==0 ){
 6688:     *pxFunc = snippetOffsetsFunc;
 6689:     return 1;
 6690:   }else if( strcmp(zName,"optimize")==0 ){
 6691:     *pxFunc = optimizeFunc;
 6692:     return 1;
 6693: #ifdef SQLITE_TEST
 6694:     /* NOTE(shess): These functions are present only for testing
 6695:     ** purposes.  No particular effort is made to optimize their
 6696:     ** execution or how they build their results.
 6697:     */
 6698:   }else if( strcmp(zName,"dump_terms")==0 ){
 6699:     /* fprintf(stderr, "Found dump_terms\n"); */
 6700:     *pxFunc = dumpTermsFunc;
 6701:     return 1;
 6702:   }else if( strcmp(zName,"dump_doclist")==0 ){
 6703:     /* fprintf(stderr, "Found dump_doclist\n"); */
 6704:     *pxFunc = dumpDoclistFunc;
 6705:     return 1;
 6706: #endif
 6707:   }
 6708:   return 0;
 6709: }
 6710: 
 6711: /*
 6712: ** Rename an fts2 table.
 6713: */
 6714: static int fulltextRename(
 6715:   sqlite3_vtab *pVtab,
 6716:   const char *zName
 6717: ){
 6718:   fulltext_vtab *p = (fulltext_vtab *)pVtab;
 6719:   int rc = SQLITE_NOMEM;
 6720:   char *zSql = sqlite3_mprintf(
 6721:     "ALTER TABLE %Q.'%q_content'  RENAME TO '%q_content';"
 6722:     "ALTER TABLE %Q.'%q_segments' RENAME TO '%q_segments';"
 6723:     "ALTER TABLE %Q.'%q_segdir'   RENAME TO '%q_segdir';"
 6724:     , p->zDb, p->zName, zName 
 6725:     , p->zDb, p->zName, zName 
 6726:     , p->zDb, p->zName, zName
 6727:   );
 6728:   if( zSql ){
 6729:     rc = sqlite3_exec(p->db, zSql, 0, 0, 0);
 6730:     sqlite3_free(zSql);
 6731:   }
 6732:   return rc;
 6733: }
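
       /* For example, "ALTER TABLE main.t RENAME TO u" reaches this method
       ** and renames the shadow tables t_content, t_segments and t_segdir
       ** to u_content, u_segments and u_segdir.
       */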
 6734: 
 6735: static const sqlite3_module fts2Module = {
 6736:   /* iVersion      */ 0,
 6737:   /* xCreate       */ fulltextCreate,
 6738:   /* xConnect      */ fulltextConnect,
 6739:   /* xBestIndex    */ fulltextBestIndex,
 6740:   /* xDisconnect   */ fulltextDisconnect,
 6741:   /* xDestroy      */ fulltextDestroy,
 6742:   /* xOpen         */ fulltextOpen,
 6743:   /* xClose        */ fulltextClose,
 6744:   /* xFilter       */ fulltextFilter,
 6745:   /* xNext         */ fulltextNext,
 6746:   /* xEof          */ fulltextEof,
 6747:   /* xColumn       */ fulltextColumn,
 6748:   /* xRowid        */ fulltextRowid,
 6749:   /* xUpdate       */ fulltextUpdate,
 6750:   /* xBegin        */ fulltextBegin,
 6751:   /* xSync         */ fulltextSync,
 6752:   /* xCommit       */ fulltextCommit,
 6753:   /* xRollback     */ fulltextRollback,
 6754:   /* xFindFunction */ fulltextFindFunction,
 6755:   /* xRename       */ fulltextRename,
 6756: };
 6757: 
 6758: static void hashDestroy(void *p){
 6759:   fts2Hash *pHash = (fts2Hash *)p;
 6760:   sqlite3Fts2HashClear(pHash);
 6761:   sqlite3_free(pHash);
 6762: }
 6763: 
 6764: /*
 6765: ** The fts2 built-in tokenizers - "simple" and "porter" - are implemented
 6766: ** in files fts2_tokenizer1.c and fts2_porter.c respectively.  The
 6767: ** forward declarations below are for functions declared in these files,
 6768: ** used to retrieve the respective implementations.
 6769: **
 6770: ** Calling sqlite3Fts2SimpleTokenizerModule() sets the value pointed
 6771: ** to by the argument to point at the "simple" tokenizer implementation.
 6772: ** Function ...PorterTokenizerModule() sets *pModule to point to the
 6773: ** porter tokenizer/stemmer implementation.
 6774: */
 6775: void sqlite3Fts2SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
 6776: void sqlite3Fts2PorterTokenizerModule(sqlite3_tokenizer_module const**ppModule);
 6777: void sqlite3Fts2IcuTokenizerModule(sqlite3_tokenizer_module const**ppModule);
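
       /* A minimal sketch of fetching a built-in tokenizer module through
       ** these hooks (assumes the sqlite3_tokenizer_module interface from
       ** fts2_tokenizer.h):
       **
       **   const sqlite3_tokenizer_module *pModule = 0;
       **   sqlite3Fts2SimpleTokenizerModule(&pModule);
       **   ...pModule->xCreate/xOpen/xNext may now be called...
       */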
 6778: 
 6779: int sqlite3Fts2InitHashTable(sqlite3 *, fts2Hash *, const char *);
 6780: 
 6781: /*
 6782: ** Initialise the fts2 extension. If this extension is built as part
 6783: ** of the sqlite library, then this function is called directly by
 6784: ** SQLite. If fts2 is built as a dynamically loadable extension, this
 6785: ** function is called by the sqlite3_extension_init() entry point.
 6786: */
 6787: int sqlite3Fts2Init(sqlite3 *db){
 6788:   int rc = SQLITE_OK;
 6789:   fts2Hash *pHash = 0;
 6790:   const sqlite3_tokenizer_module *pSimple = 0;
 6791:   const sqlite3_tokenizer_module *pPorter = 0;
 6792:   const sqlite3_tokenizer_module *pIcu = 0;
 6793: 
 6794:   sqlite3Fts2SimpleTokenizerModule(&pSimple);
 6795:   sqlite3Fts2PorterTokenizerModule(&pPorter);
 6796: #ifdef SQLITE_ENABLE_ICU
 6797:   sqlite3Fts2IcuTokenizerModule(&pIcu);
 6798: #endif
 6799: 
 6800:   /* Allocate and initialise the hash-table used to store tokenizers. */
 6801:   pHash = sqlite3_malloc(sizeof(fts2Hash));
 6802:   if( !pHash ){
 6803:     rc = SQLITE_NOMEM;
 6804:   }else{
 6805:     sqlite3Fts2HashInit(pHash, FTS2_HASH_STRING, 1);
 6806:   }
 6807: 
 6808:   /* Load the built-in tokenizers into the hash table */
 6809:   if( rc==SQLITE_OK ){
 6810:     if( sqlite3Fts2HashInsert(pHash, "simple", 7, (void *)pSimple)
 6811:      || sqlite3Fts2HashInsert(pHash, "porter", 7, (void *)pPorter) 
 6812:      || (pIcu && sqlite3Fts2HashInsert(pHash, "icu", 4, (void *)pIcu))
 6813:     ){
 6814:       rc = SQLITE_NOMEM;
 6815:     }
 6816:   }
 6817: 
 6818:   /* Create the virtual table wrapper around the hash-table and overload
 6819:   ** the scalar functions. If this is successful, register the
 6820:   ** module with sqlite.
 6821:   */
 6822:   if( SQLITE_OK==rc 
 6823:    && SQLITE_OK==(rc = sqlite3Fts2InitHashTable(db, pHash, "fts2_tokenizer"))
 6824:    && SQLITE_OK==(rc = sqlite3_overload_function(db, "snippet", -1))
 6825:    && SQLITE_OK==(rc = sqlite3_overload_function(db, "offsets", -1))
 6826:    && SQLITE_OK==(rc = sqlite3_overload_function(db, "optimize", -1))
 6827: #ifdef SQLITE_TEST
 6828:    && SQLITE_OK==(rc = sqlite3_overload_function(db, "dump_terms", -1))
 6829:    && SQLITE_OK==(rc = sqlite3_overload_function(db, "dump_doclist", -1))
 6830: #endif
 6831:   ){
 6832:     return sqlite3_create_module_v2(
 6833:         db, "fts2", &fts2Module, (void *)pHash, hashDestroy
 6834:     );
 6835:   }
 6836: 
 6837:   /* An error has occurred. Delete the hash table and return the error code. */
 6838:   assert( rc!=SQLITE_OK );
 6839:   if( pHash ){
 6840:     sqlite3Fts2HashClear(pHash);
 6841:     sqlite3_free(pHash);
 6842:   }
 6843:   return rc;
 6844: }
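
       /* Loading fts2 as a run-time extension, sketched with the public
       ** loadable-extension API (the library name "./libfts2.so" is
       ** illustrative and platform-dependent):
       **
       **   char *zErrMsg = 0;
       **   sqlite3_enable_load_extension(db, 1);
       **   sqlite3_load_extension(db, "./libfts2.so", 0, &zErrMsg);
       **
       ** or, from SQL:  SELECT load_extension('./libfts2.so');
       */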
 6845: 
 6846: #ifndef SQLITE_CORE
 6847: int sqlite3_extension_init(
 6848:   sqlite3 *db, 
 6849:   char **pzErrMsg,
 6850:   const sqlite3_api_routines *pApi
 6851: ){
 6852:   SQLITE_EXTENSION_INIT2(pApi)
 6853:   return sqlite3Fts2Init(db);
 6854: }
 6855: #endif
 6856: 
 6857: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
