Annotation of embedaddon/sqlite3/ext/fts2/fts2.c, revision 1.1
1.1 ! misho 1: /* fts2 has a design flaw which can lead to database corruption (see
! 2: ** below). It is recommended not to use it any longer, instead use
! 3: ** fts3 (or higher). If you believe that your use of fts2 is safe,
! 4: ** add -DSQLITE_ENABLE_BROKEN_FTS2=1 to your CFLAGS.
! 5: */
/* Fail the build whenever fts2 would be compiled (as an extension, or into
** the core via SQLITE_ENABLE_FTS2) without the explicit
** SQLITE_ENABLE_BROKEN_FTS2 opt-in.
*/
#if (!defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)) \
    && !defined(SQLITE_ENABLE_BROKEN_FTS2)
#error fts2 has a design flaw and has been deprecated.
#endif
! 10: /* The flaw is that fts2 uses the content table's unaliased rowid as
! 11: ** the unique docid. fts2 embeds the rowid in the index it builds,
! 12: ** and expects the rowid to not change. The SQLite VACUUM operation
! 13: ** will renumber such rowids, thereby breaking fts2. If you are using
! 14: ** fts2 in a system which has disabled VACUUM, then you can continue
! 15: ** to use it safely. Note that PRAGMA auto_vacuum does NOT disable
! 16: ** VACUUM, though systems using auto_vacuum are unlikely to invoke
! 17: ** VACUUM.
! 18: **
! 19: ** Unlike fts1, which is safe across VACUUM if you never delete
! 20: ** documents, fts2 has a second exposure to this flaw, in the segments
! 21: ** table. So fts2 should be considered unsafe across VACUUM in all
! 22: ** cases.
! 23: */
! 24:
! 25: /*
! 26: ** 2006 Oct 10
! 27: **
! 28: ** The author disclaims copyright to this source code. In place of
! 29: ** a legal notice, here is a blessing:
! 30: **
! 31: ** May you do good and not evil.
! 32: ** May you find forgiveness for yourself and forgive others.
! 33: ** May you share freely, never taking more than you give.
! 34: **
! 35: ******************************************************************************
! 36: **
! 37: ** This is an SQLite module implementing full-text search.
! 38: */
! 39:
! 40: /*
! 41: ** The code in this file is only compiled if:
! 42: **
! 43: ** * The FTS2 module is being built as an extension
! 44: ** (in which case SQLITE_CORE is not defined), or
! 45: **
! 46: ** * The FTS2 module is being built into the core of
! 47: ** SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
! 48: */
! 49:
! 50: /* TODO(shess) Consider exporting this comment to an HTML file or the
! 51: ** wiki.
! 52: */
! 53: /* The full-text index is stored in a series of b+tree (-like)
! 54: ** structures called segments which map terms to doclists. The
! 55: ** structures are like b+trees in layout, but are constructed from the
! 56: ** bottom up in optimal fashion and are not updatable. Since trees
! 57: ** are built from the bottom up, things will be described from the
! 58: ** bottom up.
! 59: **
! 60: **
! 61: **** Varints ****
! 62: ** The basic unit of encoding is a variable-length integer called a
! 63: ** varint. We encode variable-length integers in little-endian order
! 64: ** using seven bits per byte as follows:
! 65: **
! 66: ** KEY:
! 67: ** A = 0xxxxxxx 7 bits of data and one flag bit
! 68: ** B = 1xxxxxxx 7 bits of data and one flag bit
! 69: **
! 70: ** 7 bits - A
! 71: ** 14 bits - BA
! 72: ** 21 bits - BBA
! 73: ** and so on.
! 74: **
! 75: ** This is identical to how sqlite encodes varints (see util.c).
! 76: **
! 77: **
! 78: **** Document lists ****
! 79: ** A doclist (document list) holds a docid-sorted list of hits for a
! 80: ** given term. Doclists hold docids, and can optionally associate
! 81: ** token positions and offsets with docids.
! 82: **
! 83: ** A DL_POSITIONS_OFFSETS doclist is stored like this:
! 84: **
! 85: ** array {
! 86: ** varint docid;
! 87: ** array { (position list for column 0)
! 88: ** varint position; (delta from previous position plus POS_BASE)
! 89: ** varint startOffset; (delta from previous startOffset)
! 90: ** varint endOffset; (delta from startOffset)
! 91: ** }
! 92: ** array {
! 93: ** varint POS_COLUMN; (marks start of position list for new column)
! 94: ** varint column; (index of new column)
! 95: ** array {
! 96: ** varint position; (delta from previous position plus POS_BASE)
! 97: ** varint startOffset;(delta from previous startOffset)
! 98: ** varint endOffset; (delta from startOffset)
! 99: ** }
! 100: ** }
! 101: ** varint POS_END; (marks end of positions for this document.)
! 102: ** }
! 103: **
! 104: ** Here, array { X } means zero or more occurrences of X, adjacent in
! 105: ** memory. A "position" is an index of a token in the token stream
! 106: ** generated by the tokenizer, while an "offset" is a byte offset,
! 107: ** both based at 0. Note that POS_END and POS_COLUMN occur in the
! 108: ** same logical place as the position element, and act as sentinels
! 109: ** ending a position list array.
! 110: **
! 111: ** A DL_POSITIONS doclist omits the startOffset and endOffset
! 112: ** information. A DL_DOCIDS doclist omits both the position and
! 113: ** offset information, becoming an array of varint-encoded docids.
! 114: **
! 115: ** On-disk data is stored as type DL_DEFAULT, so we don't serialize
! 116: ** the type. Due to how deletion is implemented in the segmentation
! 117: ** system, on-disk doclists MUST store at least positions.
! 118: **
! 119: **
! 120: **** Segment leaf nodes ****
! 121: ** Segment leaf nodes store terms and doclists, ordered by term. Leaf
! 122: ** nodes are written using LeafWriter, and read using LeafReader (to
! 123: ** iterate through a single leaf node's data) and LeavesReader (to
! 124: ** iterate through a segment's entire leaf layer). Leaf nodes have
! 125: ** the format:
! 126: **
! 127: ** varint iHeight; (height from leaf level, always 0)
! 128: ** varint nTerm; (length of first term)
! 129: ** char pTerm[nTerm]; (content of first term)
! 130: ** varint nDoclist; (length of term's associated doclist)
! 131: ** char pDoclist[nDoclist]; (content of doclist)
! 132: ** array {
! 133: ** (further terms are delta-encoded)
! 134: ** varint nPrefix; (length of prefix shared with previous term)
! 135: ** varint nSuffix; (length of unshared suffix)
! 136: ** char pTermSuffix[nSuffix];(unshared suffix of next term)
! 137: ** varint nDoclist; (length of term's associated doclist)
! 138: ** char pDoclist[nDoclist]; (content of doclist)
! 139: ** }
! 140: **
! 141: ** Here, array { X } means zero or more occurrences of X, adjacent in
! 142: ** memory.
! 143: **
! 144: ** Leaf nodes are broken into blocks which are stored contiguously in
! 145: ** the %_segments table in sorted order. This means that when the end
! 146: ** of a node is reached, the next term is in the node with the next
! 147: ** greater node id.
! 148: **
! 149: ** New data is spilled to a new leaf node when the current node
! 150: ** exceeds LEAF_MAX bytes (default 2048). New data which itself is
! 151: ** larger than STANDALONE_MIN (default 1024) is placed in a standalone
! 152: ** node (a leaf node with a single term and doclist). The goal of
! 153: ** these settings is to pack together groups of small doclists while
! 154: ** making it efficient to directly access large doclists. The
! 155: ** assumption is that large doclists represent terms which are more
! 156: ** likely to be query targets.
! 157: **
! 158: ** TODO(shess) It may be useful for blocking decisions to be more
! 159: ** dynamic. For instance, it may make more sense to have a 2.5k leaf
! 160: ** node rather than splitting into 2k and .5k nodes. My intuition is
! 161: ** that this might extend through 2x or 4x the pagesize.
! 162: **
! 163: **
! 164: **** Segment interior nodes ****
! 165: ** Segment interior nodes store blockids for subtree nodes and terms
! 166: ** to describe what data is stored by each subtree. Interior
! 167: ** nodes are written using InteriorWriter, and read using
! 168: ** InteriorReader. InteriorWriters are created as needed when
! 169: ** SegmentWriter creates new leaf nodes, or when an interior node
! 170: ** itself grows too big and must be split. The format of interior
! 171: ** nodes:
! 172: **
! 173: ** varint iHeight; (height from leaf level, always >0)
! 174: ** varint iBlockid; (block id of node's leftmost subtree)
! 175: ** optional {
! 176: ** varint nTerm; (length of first term)
! 177: ** char pTerm[nTerm]; (content of first term)
! 178: ** array {
! 179: ** (further terms are delta-encoded)
! 180: ** varint nPrefix; (length of shared prefix with previous term)
! 181: ** varint nSuffix; (length of unshared suffix)
! 182: ** char pTermSuffix[nSuffix]; (unshared suffix of next term)
! 183: ** }
! 184: ** }
! 185: **
! 186: ** Here, optional { X } means an optional element, while array { X }
! 187: ** means zero or more occurrences of X, adjacent in memory.
! 188: **
! 189: ** An interior node encodes n terms separating n+1 subtrees. The
! 190: ** subtree blocks are contiguous, so only the first subtree's blockid
! 191: ** is encoded. The subtree at iBlockid will contain all terms less
! 192: ** than the first term encoded (or all terms if no term is encoded).
! 193: ** Otherwise, for terms greater than or equal to pTerm[i] but less
! 194: ** than pTerm[i+1], the subtree for that term will be rooted at
! 195: ** iBlockid+i. Interior nodes only store enough term data to
! 196: ** distinguish adjacent children (if the rightmost term of the left
! 197: ** child is "something", and the leftmost term of the right child is
! 198: ** "wicked", only "w" is stored).
! 199: **
! 200: ** New data is spilled to a new interior node at the same height when
! 201: ** the current node exceeds INTERIOR_MAX bytes (default 2048).
! 202: ** INTERIOR_MIN_TERMS (default 7) keeps large terms from monopolizing
! 203: ** interior nodes and making the tree too skinny. The interior nodes
! 204: ** at a given height are naturally tracked by interior nodes at
! 205: ** height+1, and so on.
! 206: **
! 207: **
! 208: **** Segment directory ****
! 209: ** The segment directory in table %_segdir stores meta-information for
! 210: ** merging and deleting segments, and also the root node of the
! 211: ** segment's tree.
! 212: **
! 213: ** The root node is the top node of the segment's tree after encoding
! 214: ** the entire segment, restricted to ROOT_MAX bytes (default 1024).
! 215: ** This could be either a leaf node or an interior node. If the top
! 216: ** node requires more than ROOT_MAX bytes, it is flushed to %_segments
! 217: ** and a new root interior node is generated (which should always fit
! 218: ** within ROOT_MAX because it only needs space for 2 varints, the
! 219: ** height and the blockid of the previous root).
! 220: **
! 221: ** The meta-information in the segment directory is:
! 222: ** level - segment level (see below)
! 223: ** idx - index within level
! 224: ** - (level,idx uniquely identify a segment)
! 225: ** start_block - first leaf node
! 226: ** leaves_end_block - last leaf node
! 227: ** end_block - last block (including interior nodes)
! 228: ** root - contents of root node
! 229: **
! 230: ** If the root node is a leaf node, then start_block,
! 231: ** leaves_end_block, and end_block are all 0.
! 232: **
! 233: **
! 234: **** Segment merging ****
! 235: ** To amortize update costs, segments are grouped into levels and
! 236: ** merged in batches. Each increase in level represents exponentially
! 237: ** more documents.
! 238: **
! 239: ** New documents (actually, document updates) are tokenized and
! 240: ** written individually (using LeafWriter) to a level 0 segment, with
! 241: ** incrementing idx. When idx reaches MERGE_COUNT (default 16), all
! 242: ** level 0 segments are merged into a single level 1 segment. Level 1
! 243: ** is populated like level 0, and eventually MERGE_COUNT level 1
! 244: ** segments are merged to a single level 2 segment (representing
! 245: ** MERGE_COUNT^2 updates), and so on.
! 246: **
! 247: ** A segment merge traverses all segments at a given level in
! 248: ** parallel, performing a straightforward sorted merge. Since segment
! 249: ** leaf nodes are written in to the %_segments table in order, this
! 250: ** merge traverses the underlying sqlite disk structures efficiently.
! 251: ** After the merge, all segment blocks from the merged level are
! 252: ** deleted.
! 253: **
! 254: ** MERGE_COUNT controls how often we merge segments. 16 seems to be
! 255: ** somewhat of a sweet spot for insertion performance. 32 and 64 show
! 256: ** very similar performance numbers to 16 on insertion, though they're
! 257: ** a tiny bit slower (perhaps due to more overhead in merge-time
! 258: ** sorting). 8 is about 20% slower than 16, 4 about 50% slower than
! 259: ** 16, 2 about 66% slower than 16.
! 260: **
! 261: ** At query time, high MERGE_COUNT increases the number of segments
! 262: ** which need to be scanned and merged. For instance, with 100k docs
! 263: ** inserted:
! 264: **
! 265: ** MERGE_COUNT segments
! 266: ** 16 25
! 267: ** 8 12
! 268: ** 4 10
! 269: ** 2 6
! 270: **
! 271: ** This appears to have only a moderate impact on queries for very
! 272: ** frequent terms (which are somewhat dominated by segment merge
! 273: ** costs), and infrequent and non-existent terms still seem to be fast
! 274: ** even with many segments.
! 275: **
! 276: ** TODO(shess) That said, it would be nice to have a better query-side
! 277: ** argument for MERGE_COUNT of 16. Also, it is possible/likely that
! 278: ** optimizations to things like doclist merging will swing the sweet
! 279: ** spot around.
! 280: **
! 281: **
! 282: **
! 283: **** Handling of deletions and updates ****
! 284: ** Since we're using a segmented structure, with no docid-oriented
! 285: ** index into the term index, we clearly cannot simply update the term
! 286: ** index when a document is deleted or updated. For deletions, we
! 287: ** write an empty doclist (varint(docid) varint(POS_END)), for updates
! 288: ** we simply write the new doclist. Segment merges overwrite older
! 289: ** data for a particular docid with newer data, so deletes or updates
! 290: ** will eventually overtake the earlier data and knock it out. The
! 291: ** query logic likewise merges doclists so that newer data knocks out
! 292: ** older data.
! 293: **
! 294: ** TODO(shess) Provide a VACUUM type operation to clear out all
! 295: ** deletions and duplications. This would basically be a forced merge
! 296: ** into a single segment.
! 297: */
! 298:
! 299: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
! 300:
/* When building into the core (SQLITE_ENABLE_FTS2), make sure SQLITE_CORE
** is defined before the sqlite headers below are included.
*/
#if defined(SQLITE_ENABLE_FTS2) && !defined(SQLITE_CORE)
# define SQLITE_CORE 1
#endif
! 304:
! 305: #include <assert.h>
! 306: #include <stdlib.h>
! 307: #include <stdio.h>
! 308: #include <string.h>
! 309: #include "fts2.h"
! 310: #include "fts2_hash.h"
! 311: #include "fts2_tokenizer.h"
! 312: #include "sqlite3.h"
! 313: #include "sqlite3ext.h"
! 314: SQLITE_EXTENSION_INIT1
! 315:
! 316:
! 317: /* TODO(shess) MAN, this thing needs some refactoring. At minimum, it
! 318: ** would be nice to order the file better, perhaps something along the
! 319: ** lines of:
! 320: **
! 321: ** - utility functions
! 322: ** - table setup functions
! 323: ** - table update functions
! 324: ** - table query functions
! 325: **
! 326: ** Put the query functions last because they're likely to reference
! 327: ** typedefs or functions from the table update section.
! 328: */
! 329:
/* Debug tracing.  Flip the "#if 0" to "#if 1" to enable.  The argument is
** a fully parenthesized printf() argument list, e.g.
**
**   TRACE(("FTS2 Create %s\n", zName));
**
** The enabled form is wrapped in do{...}while(0) so that the printf/fflush
** pair behaves as a single statement (safe under an unbraced if/else).
*/
#if 0
# define TRACE(A)  do{ printf A; fflush(stdout); }while(0)
#else
# define TRACE(A)
#endif
! 335:
! 336: /* It is not safe to call isspace(), tolower(), or isalnum() on
! 337: ** hi-bit-set characters. This is the same solution used in the
! 338: ** tokenizer.
! 339: */
! 340: /* TODO(shess) The snippet-generation code should be using the
! 341: ** tokenizer-generated tokens rather than doing its own local
! 342: ** tokenization.
! 343: */
! 344: /* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
/* ASCII-only isspace(): returns 1 for space, tab, newline, carriage
** return, vertical tab and form feed, 0 for every other byte (including
** hi-bit-set bytes).
*/
static int safe_isspace(char c){
  switch( c ){
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    case '\v':
    case '\f':
      return 1;
    default:
      return 0;
  }
}
/* ASCII-only tolower(): maps 'A'..'Z' to 'a'..'z' and returns every other
** byte unchanged.
*/
static int safe_tolower(char c){
  if( c>='A' && c<='Z' ){
    return c - 'A' + 'a';
  }
  return c;
}
/* ASCII-only isalnum(): returns 1 for '0'..'9', 'A'..'Z' and 'a'..'z',
** 0 for anything else (including hi-bit-set bytes).
*/
static int safe_isalnum(char c){
  if( c>='0' && c<='9' ) return 1;
  if( c>='A' && c<='Z' ) return 1;
  return c>='a' && c<='z';
}
! 354:
/* How much per-document detail a doclist carries.  The values are ordered
** so that a larger value stores strictly more information; code compares
** them with < and >.
*/
typedef enum DocListType {
  DL_DOCIDS = 0,            /* docids only */
  DL_POSITIONS = 1,         /* docids + positions */
  DL_POSITIONS_OFFSETS = 2  /* docids + positions + offsets */
} DocListType;
! 360:
/*
** DL_DEFAULT is the doclist type used for data stored in the index.
** By default, only positions and not offsets are stored in the doclists.
** To change this so that offsets are stored too, compile with
**
**          -DDL_DEFAULT=DL_POSITIONS_OFFSETS
**
** If DL_DEFAULT is set to DL_DOCIDS, your table can only be inserted
** into (no deletes or updates).
*/
#ifndef DL_DEFAULT
# define DL_DEFAULT DL_POSITIONS
#endif
! 373:
/* Reserved values in the "position" slot of a position list.  Real token
** positions are stored offset by POS_BASE so they never collide with the
** two sentinel values.
*/
enum {
  POS_END = 0,     /* end of this position list */
  POS_COLUMN = 1,  /* followed by new column number */
  POS_BASE = 2     /* first value available for an actual position */
};
! 379:
/* MERGE_COUNT controls how often we merge segments (see comment at
** top of file): once this many segments exist at a level they are
** merged into a single segment at the next level.
*/
#define MERGE_COUNT 16
! 384:
! 385: /* utility functions */
! 386:
! 387: /* CLEAR() and SCRAMBLE() abstract memset() on a pointer to a single
! 388: ** record to prevent errors of the form:
! 389: **
! 390: ** my_function(SomeType *b){
! 391: ** memset(b, '\0', sizeof(b)); // sizeof(b)!=sizeof(*b)
! 392: ** }
! 393: */
! 394: /* TODO(shess) Obvious candidates for a header file. */
/* CLEAR(b) zero-fills the single record that b points at. */
#define CLEAR(b) memset((b), 0, sizeof *(b))

/* SCRAMBLE(b) fills the record that b points at with 0x55 bytes in debug
** builds; it expands to nothing when NDEBUG is defined.
*/
#ifdef NDEBUG
# define SCRAMBLE(b)
#else
# define SCRAMBLE(b) memset((b), 0x55, sizeof *(b))
#endif
! 402:
/* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer
** (seven payload bits per byte, so ten bytes cover all 64 bits).
*/
#define VARINT_MAX 10
! 405:
! 406: /* Write a 64-bit variable-length integer to memory starting at p[0].
! 407: * The length of data written will be between 1 and VARINT_MAX bytes.
! 408: * The number of bytes written is returned. */
! 409: static int putVarint(char *p, sqlite_int64 v){
! 410: unsigned char *q = (unsigned char *) p;
! 411: sqlite_uint64 vu = v;
! 412: do{
! 413: *q++ = (unsigned char) ((vu & 0x7f) | 0x80);
! 414: vu >>= 7;
! 415: }while( vu!=0 );
! 416: q[-1] &= 0x7f; /* turn off high bit in final byte */
! 417: assert( q - (unsigned char *)p <= VARINT_MAX );
! 418: return (int) (q - (unsigned char *)p);
! 419: }
! 420:
! 421: /* Read a 64-bit variable-length integer from memory starting at p[0].
! 422: * Return the number of bytes read, or 0 on error.
! 423: * The value is stored in *v. */
! 424: static int getVarint(const char *p, sqlite_int64 *v){
! 425: const unsigned char *q = (const unsigned char *) p;
! 426: sqlite_uint64 x = 0, y = 1;
! 427: while( (*q & 0x80) == 0x80 ){
! 428: x += y * (*q++ & 0x7f);
! 429: y <<= 7;
! 430: if( q - (unsigned char *)p >= VARINT_MAX ){ /* bad data */
! 431: assert( 0 );
! 432: return 0;
! 433: }
! 434: }
! 435: x += y * (*q++);
! 436: *v = (sqlite_int64) x;
! 437: return (int) (q - (unsigned char *)p);
! 438: }
! 439:
! 440: static int getVarint32(const char *p, int *pi){
! 441: sqlite_int64 i;
! 442: int ret = getVarint(p, &i);
! 443: *pi = (int) i;
! 444: assert( *pi==i );
! 445: return ret;
! 446: }
! 447:
! 448: /*******************************************************************/
! 449: /* DataBuffer is used to collect data into a buffer in piecemeal
! 450: ** fashion. It implements the usual distinction between amount of
! 451: ** data currently stored (nData) and buffer capacity (nCapacity).
! 452: **
! 453: ** dataBufferInit - create a buffer with given initial capacity.
! 454: ** dataBufferReset - forget buffer's data, retaining capacity.
! 455: ** dataBufferDestroy - free buffer's data.
! 456: ** dataBufferSwap - swap contents of two buffers.
! 457: ** dataBufferExpand - expand capacity without adding data.
! 458: ** dataBufferAppend - append data.
! 459: ** dataBufferAppend2 - append two pieces of data at once.
! 460: ** dataBufferReplace - replace buffer's data.
! 461: */
/* Growable byte buffer; see the function list above for the operations. */
typedef struct DataBuffer {
  char *pData;          /* Pointer to malloc'ed buffer (NULL if nCapacity==0
                        ** at init time). */
  int nCapacity;        /* Size of pData buffer, in bytes. */
  int nData;            /* End of data loaded into pData (bytes in use). */
} DataBuffer;
! 467:
! 468: static void dataBufferInit(DataBuffer *pBuffer, int nCapacity){
! 469: assert( nCapacity>=0 );
! 470: pBuffer->nData = 0;
! 471: pBuffer->nCapacity = nCapacity;
! 472: pBuffer->pData = nCapacity==0 ? NULL : sqlite3_malloc(nCapacity);
! 473: }
/* Forget the buffered data but keep the allocation for reuse. */
static void dataBufferReset(DataBuffer *pBuffer){
  pBuffer->nData = 0;
}
! 477: static void dataBufferDestroy(DataBuffer *pBuffer){
! 478: if( pBuffer->pData!=NULL ) sqlite3_free(pBuffer->pData);
! 479: SCRAMBLE(pBuffer);
! 480: }
! 481: static void dataBufferSwap(DataBuffer *pBuffer1, DataBuffer *pBuffer2){
! 482: DataBuffer tmp = *pBuffer1;
! 483: *pBuffer1 = *pBuffer2;
! 484: *pBuffer2 = tmp;
! 485: }
! 486: static void dataBufferExpand(DataBuffer *pBuffer, int nAddCapacity){
! 487: assert( nAddCapacity>0 );
! 488: /* TODO(shess) Consider expanding more aggressively. Note that the
! 489: ** underlying malloc implementation may take care of such things for
! 490: ** us already.
! 491: */
! 492: if( pBuffer->nData+nAddCapacity>pBuffer->nCapacity ){
! 493: pBuffer->nCapacity = pBuffer->nData+nAddCapacity;
! 494: pBuffer->pData = sqlite3_realloc(pBuffer->pData, pBuffer->nCapacity);
! 495: }
! 496: }
! 497: static void dataBufferAppend(DataBuffer *pBuffer,
! 498: const char *pSource, int nSource){
! 499: assert( nSource>0 && pSource!=NULL );
! 500: dataBufferExpand(pBuffer, nSource);
! 501: memcpy(pBuffer->pData+pBuffer->nData, pSource, nSource);
! 502: pBuffer->nData += nSource;
! 503: }
! 504: static void dataBufferAppend2(DataBuffer *pBuffer,
! 505: const char *pSource1, int nSource1,
! 506: const char *pSource2, int nSource2){
! 507: assert( nSource1>0 && pSource1!=NULL );
! 508: assert( nSource2>0 && pSource2!=NULL );
! 509: dataBufferExpand(pBuffer, nSource1+nSource2);
! 510: memcpy(pBuffer->pData+pBuffer->nData, pSource1, nSource1);
! 511: memcpy(pBuffer->pData+pBuffer->nData+nSource1, pSource2, nSource2);
! 512: pBuffer->nData += nSource1+nSource2;
! 513: }
! 514: static void dataBufferReplace(DataBuffer *pBuffer,
! 515: const char *pSource, int nSource){
! 516: dataBufferReset(pBuffer);
! 517: dataBufferAppend(pBuffer, pSource, nSource);
! 518: }
! 519:
/* StringBuffer is a null-terminated version of DataBuffer.  The terminator
** is counted in b.nData, so the string length is b.nData-1.
*/
typedef struct StringBuffer {
  DataBuffer b;            /* Includes null terminator. */
} StringBuffer;
! 524:
! 525: static void initStringBuffer(StringBuffer *sb){
! 526: dataBufferInit(&sb->b, 100);
! 527: dataBufferReplace(&sb->b, "", 1);
! 528: }
/* Length of the string in bytes, not counting the null terminator. */
static int stringBufferLength(StringBuffer *sb){
  return sb->b.nData-1;
}
/* Pointer to the null-terminated string.  The buffer remains owned by sb. */
static char *stringBufferData(StringBuffer *sb){
  return sb->b.pData;
}
/* Release the storage held by sb. */
static void stringBufferDestroy(StringBuffer *sb){
  dataBufferDestroy(&sb->b);
}
! 538:
! 539: static void nappend(StringBuffer *sb, const char *zFrom, int nFrom){
! 540: assert( sb->b.nData>0 );
! 541: if( nFrom>0 ){
! 542: sb->b.nData--;
! 543: dataBufferAppend2(&sb->b, zFrom, nFrom, "", 1);
! 544: }
! 545: }
! 546: static void append(StringBuffer *sb, const char *zFrom){
! 547: nappend(sb, zFrom, strlen(zFrom));
! 548: }
! 549:
! 550: /* Append a list of strings separated by commas. */
! 551: static void appendList(StringBuffer *sb, int nString, char **azString){
! 552: int i;
! 553: for(i=0; i<nString; ++i){
! 554: if( i>0 ) append(sb, ", ");
! 555: append(sb, azString[i]);
! 556: }
! 557: }
! 558:
! 559: static int endsInWhiteSpace(StringBuffer *p){
! 560: return stringBufferLength(p)>0 &&
! 561: safe_isspace(stringBufferData(p)[stringBufferLength(p)-1]);
! 562: }
! 563:
! 564: /* If the StringBuffer ends in something other than white space, add a
! 565: ** single space character to the end.
! 566: */
! 567: static void appendWhiteSpace(StringBuffer *p){
! 568: if( stringBufferLength(p)==0 ) return;
! 569: if( !endsInWhiteSpace(p) ) append(p, " ");
! 570: }
! 571:
! 572: /* Remove white space from the end of the StringBuffer */
! 573: static void trimWhiteSpace(StringBuffer *p){
! 574: while( endsInWhiteSpace(p) ){
! 575: p->b.pData[--p->b.nData-1] = '\0';
! 576: }
! 577: }
! 578:
! 579: /*******************************************************************/
! 580: /* DLReader is used to read document elements from a doclist. The
! 581: ** current docid is cached, so dlrDocid() is fast. DLReader does not
! 582: ** own the doclist buffer.
! 583: **
! 584: ** dlrAtEnd - true if there's no more data to read.
! 585: ** dlrDocid - docid of current document.
! 586: ** dlrDocData - doclist data for current document (including docid).
! 587: ** dlrDocDataBytes - length of same.
! 588: ** dlrAllDataBytes - length of all remaining data.
! 589: ** dlrPosData - position data for current document.
! 590: ** dlrPosDataLen - length of pos data for current document (incl POS_END).
! 591: ** dlrStep - step to next document.
! 592: ** dlrInit - initialize for doclist of given type against given data.
! 593: ** dlrDestroy - clean up.
! 594: **
! 595: ** Expected usage is something like:
! 596: **
! 597: ** DLReader reader;
! 598: ** dlrInit(&reader, iType, pData, nData);
! 599: ** while( !dlrAtEnd(&reader) ){
! 600: ** // calls to dlrDocid() and kin.
! 601: ** dlrStep(&reader);
! 602: ** }
! 603: ** dlrDestroy(&reader);
! 604: */
/* Cursor over a doclist.  The doclist memory is not owned by the reader. */
typedef struct DLReader {
  DocListType iType;     /* Type of doclist being read. */
  const char *pData;     /* Start of the current element. */
  int nData;             /* Bytes remaining from pData to end of doclist;
                         ** 0 means the reader is at end. */

  sqlite_int64 iDocid;   /* Docid of current element (deltas accumulated). */
  int nElement;          /* Size in bytes of the current element, including
                         ** its docid varint. */
} DLReader;
! 613:
/* True once every element of the doclist has been stepped past. */
static int dlrAtEnd(DLReader *pReader){
  assert( pReader->nData>=0 );
  return pReader->nData==0;
}
/* Docid of the current element (cached; no decoding is done here).
** Must not be called at end.
*/
static sqlite_int64 dlrDocid(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->iDocid;
}
/* Pointer to the current element's encoded data, starting at its docid
** varint.  Must not be called at end.
*/
static const char *dlrDocData(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->pData;
}
/* Length in bytes of the data returned by dlrDocData(). */
static int dlrDocDataBytes(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->nElement;
}
/* Length in bytes of everything from the current element to the end of
** the doclist.
*/
static int dlrAllDataBytes(DLReader *pReader){
  assert( !dlrAtEnd(pReader) );
  return pReader->nData;
}
! 634: /* TODO(shess) Consider adding a field to track iDocid varint length
! 635: ** to make these two functions faster. This might matter (a tiny bit)
! 636: ** for queries.
! 637: */
! 638: static const char *dlrPosData(DLReader *pReader){
! 639: sqlite_int64 iDummy;
! 640: int n = getVarint(pReader->pData, &iDummy);
! 641: assert( !dlrAtEnd(pReader) );
! 642: return pReader->pData+n;
! 643: }
! 644: static int dlrPosDataLen(DLReader *pReader){
! 645: sqlite_int64 iDummy;
! 646: int n = getVarint(pReader->pData, &iDummy);
! 647: assert( !dlrAtEnd(pReader) );
! 648: return pReader->nElement-n;
! 649: }
! 650: static void dlrStep(DLReader *pReader){
! 651: assert( !dlrAtEnd(pReader) );
! 652:
! 653: /* Skip past current doclist element. */
! 654: assert( pReader->nElement<=pReader->nData );
! 655: pReader->pData += pReader->nElement;
! 656: pReader->nData -= pReader->nElement;
! 657:
! 658: /* If there is more data, read the next doclist element. */
! 659: if( pReader->nData!=0 ){
! 660: sqlite_int64 iDocidDelta;
! 661: int iDummy, n = getVarint(pReader->pData, &iDocidDelta);
! 662: pReader->iDocid += iDocidDelta;
! 663: if( pReader->iType>=DL_POSITIONS ){
! 664: assert( n<pReader->nData );
! 665: while( 1 ){
! 666: n += getVarint32(pReader->pData+n, &iDummy);
! 667: assert( n<=pReader->nData );
! 668: if( iDummy==POS_END ) break;
! 669: if( iDummy==POS_COLUMN ){
! 670: n += getVarint32(pReader->pData+n, &iDummy);
! 671: assert( n<pReader->nData );
! 672: }else if( pReader->iType==DL_POSITIONS_OFFSETS ){
! 673: n += getVarint32(pReader->pData+n, &iDummy);
! 674: n += getVarint32(pReader->pData+n, &iDummy);
! 675: assert( n<pReader->nData );
! 676: }
! 677: }
! 678: }
! 679: pReader->nElement = n;
! 680: assert( pReader->nElement<=pReader->nData );
! 681: }
! 682: }
! 683: static void dlrInit(DLReader *pReader, DocListType iType,
! 684: const char *pData, int nData){
! 685: assert( pData!=NULL && nData!=0 );
! 686: pReader->iType = iType;
! 687: pReader->pData = pData;
! 688: pReader->nData = nData;
! 689: pReader->nElement = 0;
! 690: pReader->iDocid = 0;
! 691:
! 692: /* Load the first element's data. There must be a first element. */
! 693: dlrStep(pReader);
! 694: }
/* Retire the reader.  Nothing is freed (the reader does not own the
** doclist); debug builds scramble the record.
*/
static void dlrDestroy(DLReader *pReader){
  SCRAMBLE(pReader);
}
! 698:
! 699: #ifndef NDEBUG
! 700: /* Verify that the doclist can be validly decoded. Also returns the
! 701: ** last docid found because it is convenient in other assertions for
! 702: ** DLWriter.
! 703: */
! 704: static void docListValidate(DocListType iType, const char *pData, int nData,
! 705: sqlite_int64 *pLastDocid){
! 706: sqlite_int64 iPrevDocid = 0;
! 707: assert( nData>0 );
! 708: assert( pData!=0 );
! 709: assert( pData+nData>pData );
! 710: while( nData!=0 ){
! 711: sqlite_int64 iDocidDelta;
! 712: int n = getVarint(pData, &iDocidDelta);
! 713: iPrevDocid += iDocidDelta;
! 714: if( iType>DL_DOCIDS ){
! 715: int iDummy;
! 716: while( 1 ){
! 717: n += getVarint32(pData+n, &iDummy);
! 718: if( iDummy==POS_END ) break;
! 719: if( iDummy==POS_COLUMN ){
! 720: n += getVarint32(pData+n, &iDummy);
! 721: }else if( iType>DL_POSITIONS ){
! 722: n += getVarint32(pData+n, &iDummy);
! 723: n += getVarint32(pData+n, &iDummy);
! 724: }
! 725: assert( n<=nData );
! 726: }
! 727: }
! 728: assert( n<=nData );
! 729: pData += n;
! 730: nData -= n;
! 731: }
! 732: if( pLastDocid ) *pLastDocid = iPrevDocid;
! 733: }
! 734: #define ASSERT_VALID_DOCLIST(i, p, n, o) docListValidate(i, p, n, o)
! 735: #else
! 736: #define ASSERT_VALID_DOCLIST(i, p, n, o) assert( 1 )
! 737: #endif
! 738:
! 739: /*******************************************************************/
! 740: /* DLWriter is used to write doclist data to a DataBuffer. DLWriter
! 741: ** always appends to the buffer and does not own it.
! 742: **
! 743: ** dlwInit - initialize to write a given type doclist to a buffer.
! 744: ** dlwDestroy - clear the writer's memory. Does not free buffer.
! 745: ** dlwAppend - append raw doclist data to buffer.
! 746: ** dlwCopy - copy next doclist from reader to writer.
! 747: ** dlwAdd - construct doclist element and append to buffer.
! 748: ** Only apply dlwAdd() to DL_DOCIDS doclists (else use PLWriter).
! 749: */
/* State for appending doclist data to a DataBuffer the writer does not
** own.
*/
typedef struct DLWriter {
  DocListType iType;        /* Type of doclist being written. */
  DataBuffer *b;            /* Destination buffer (appended to). */
  sqlite_int64 iPrevDocid;  /* Last docid written; base for the next delta. */
#ifndef NDEBUG
  int has_iPrevDocid;       /* Debug only: set once dlwAdd() has written a
                            ** docid, enabling the ascending-order check. */
#endif
} DLWriter;
! 758:
! 759: static void dlwInit(DLWriter *pWriter, DocListType iType, DataBuffer *b){
! 760: pWriter->b = b;
! 761: pWriter->iType = iType;
! 762: pWriter->iPrevDocid = 0;
! 763: #ifndef NDEBUG
! 764: pWriter->has_iPrevDocid = 0;
! 765: #endif
! 766: }
/* Retire the writer.  The destination buffer is not freed; debug builds
** scramble the writer record.
*/
static void dlwDestroy(DLWriter *pWriter){
  SCRAMBLE(pWriter);
}
! 770: /* iFirstDocid is the first docid in the doclist in pData. It is
! 771: ** needed because pData may point within a larger doclist, in which
! 772: ** case the first item would be delta-encoded.
! 773: **
! 774: ** iLastDocid is the final docid in the doclist in pData. It is
! 775: ** needed to create the new iPrevDocid for future delta-encoding. The
! 776: ** code could decode the passed doclist to recreate iLastDocid, but
! 777: ** the only current user (docListMerge) already has decoded this
! 778: ** information.
! 779: */
! 780: /* TODO(shess) This has become just a helper for docListMerge.
! 781: ** Consider a refactor to make this cleaner.
! 782: */
! 783: static void dlwAppend(DLWriter *pWriter,
! 784: const char *pData, int nData,
! 785: sqlite_int64 iFirstDocid, sqlite_int64 iLastDocid){
! 786: sqlite_int64 iDocid = 0;
! 787: char c[VARINT_MAX];
! 788: int nFirstOld, nFirstNew; /* Old and new varint len of first docid. */
! 789: #ifndef NDEBUG
! 790: sqlite_int64 iLastDocidDelta;
! 791: #endif
! 792:
! 793: /* Recode the initial docid as delta from iPrevDocid. */
! 794: nFirstOld = getVarint(pData, &iDocid);
! 795: assert( nFirstOld<nData || (nFirstOld==nData && pWriter->iType==DL_DOCIDS) );
! 796: nFirstNew = putVarint(c, iFirstDocid-pWriter->iPrevDocid);
! 797:
! 798: /* Verify that the incoming doclist is valid AND that it ends with
! 799: ** the expected docid. This is essential because we'll trust this
! 800: ** docid in future delta-encoding.
! 801: */
! 802: ASSERT_VALID_DOCLIST(pWriter->iType, pData, nData, &iLastDocidDelta);
! 803: assert( iLastDocid==iFirstDocid-iDocid+iLastDocidDelta );
! 804:
! 805: /* Append recoded initial docid and everything else. Rest of docids
! 806: ** should have been delta-encoded from previous initial docid.
! 807: */
! 808: if( nFirstOld<nData ){
! 809: dataBufferAppend2(pWriter->b, c, nFirstNew,
! 810: pData+nFirstOld, nData-nFirstOld);
! 811: }else{
! 812: dataBufferAppend(pWriter->b, c, nFirstNew);
! 813: }
! 814: pWriter->iPrevDocid = iLastDocid;
! 815: }
! 816: static void dlwCopy(DLWriter *pWriter, DLReader *pReader){
! 817: dlwAppend(pWriter, dlrDocData(pReader), dlrDocDataBytes(pReader),
! 818: dlrDocid(pReader), dlrDocid(pReader));
! 819: }
! 820: static void dlwAdd(DLWriter *pWriter, sqlite_int64 iDocid){
! 821: char c[VARINT_MAX];
! 822: int n = putVarint(c, iDocid-pWriter->iPrevDocid);
! 823:
! 824: /* Docids must ascend. */
! 825: assert( !pWriter->has_iPrevDocid || iDocid>pWriter->iPrevDocid );
! 826: assert( pWriter->iType==DL_DOCIDS );
! 827:
! 828: dataBufferAppend(pWriter->b, c, n);
! 829: pWriter->iPrevDocid = iDocid;
! 830: #ifndef NDEBUG
! 831: pWriter->has_iPrevDocid = 1;
! 832: #endif
! 833: }
! 834:
! 835: /*******************************************************************/
! 836: /* PLReader is used to read data from a document's position list. As
! 837: ** the caller steps through the list, data is cached so that varints
! 838: ** only need to be decoded once.
! 839: **
! 840: ** plrInit, plrDestroy - create/destroy a reader.
! 841: ** plrColumn, plrPosition, plrStartOffset, plrEndOffset - accessors
! 842: ** plrAtEnd - at end of stream, only call plrDestroy once true.
! 843: ** plrStep - step to the next element.
! 844: */
! 845: typedef struct PLReader {
! 846: /* These refer to the next position's data. nData will reach 0 when
! 847: ** reading the last position, so plrStep() signals EOF by setting
! 848: ** pData to NULL.
! 849: */
! 850: const char *pData;
! 851: int nData;
! 852:
! 853: DocListType iType;
! 854: int iColumn; /* the last column read */
! 855: int iPosition; /* the last position read */
! 856: int iStartOffset; /* the last start offset read */
! 857: int iEndOffset; /* the last end offset read */
! 858: } PLReader;
! 859:
! 860: static int plrAtEnd(PLReader *pReader){
! 861: return pReader->pData==NULL;
! 862: }
! 863: static int plrColumn(PLReader *pReader){
! 864: assert( !plrAtEnd(pReader) );
! 865: return pReader->iColumn;
! 866: }
! 867: static int plrPosition(PLReader *pReader){
! 868: assert( !plrAtEnd(pReader) );
! 869: return pReader->iPosition;
! 870: }
! 871: static int plrStartOffset(PLReader *pReader){
! 872: assert( !plrAtEnd(pReader) );
! 873: return pReader->iStartOffset;
! 874: }
! 875: static int plrEndOffset(PLReader *pReader){
! 876: assert( !plrAtEnd(pReader) );
! 877: return pReader->iEndOffset;
! 878: }
! 879: static void plrStep(PLReader *pReader){
! 880: int i, n;
! 881:
! 882: assert( !plrAtEnd(pReader) );
! 883:
! 884: if( pReader->nData==0 ){
! 885: pReader->pData = NULL;
! 886: return;
! 887: }
! 888:
! 889: n = getVarint32(pReader->pData, &i);
! 890: if( i==POS_COLUMN ){
! 891: n += getVarint32(pReader->pData+n, &pReader->iColumn);
! 892: pReader->iPosition = 0;
! 893: pReader->iStartOffset = 0;
! 894: n += getVarint32(pReader->pData+n, &i);
! 895: }
! 896: /* Should never see adjacent column changes. */
! 897: assert( i!=POS_COLUMN );
! 898:
! 899: if( i==POS_END ){
! 900: pReader->nData = 0;
! 901: pReader->pData = NULL;
! 902: return;
! 903: }
! 904:
! 905: pReader->iPosition += i-POS_BASE;
! 906: if( pReader->iType==DL_POSITIONS_OFFSETS ){
! 907: n += getVarint32(pReader->pData+n, &i);
! 908: pReader->iStartOffset += i;
! 909: n += getVarint32(pReader->pData+n, &i);
! 910: pReader->iEndOffset = pReader->iStartOffset+i;
! 911: }
! 912: assert( n<=pReader->nData );
! 913: pReader->pData += n;
! 914: pReader->nData -= n;
! 915: }
! 916:
! 917: static void plrInit(PLReader *pReader, DLReader *pDLReader){
! 918: pReader->pData = dlrPosData(pDLReader);
! 919: pReader->nData = dlrPosDataLen(pDLReader);
! 920: pReader->iType = pDLReader->iType;
! 921: pReader->iColumn = 0;
! 922: pReader->iPosition = 0;
! 923: pReader->iStartOffset = 0;
! 924: pReader->iEndOffset = 0;
! 925: plrStep(pReader);
! 926: }
! 927: static void plrDestroy(PLReader *pReader){
! 928: SCRAMBLE(pReader);
! 929: }
! 930:
! 931: /*******************************************************************/
! 932: /* PLWriter is used in constructing a document's position list. As a
! 933: ** convenience, if iType is DL_DOCIDS, PLWriter becomes a no-op.
! 934: ** PLWriter writes to the associated DLWriter's buffer.
! 935: **
! 936: ** plwInit - init for writing a document's poslist.
! 937: ** plwDestroy - clear a writer.
! 938: ** plwAdd - append position and offset information.
! 939: ** plwCopy - copy next position's data from reader to writer.
! 940: ** plwTerminate - add any necessary doclist terminator.
! 941: **
! 942: ** Calling plwAdd() after plwTerminate() may result in a corrupt
! 943: ** doclist.
! 944: */
! 945: /* TODO(shess) Until we've written the second item, we can cache the
! 946: ** first item's information. Then we'd have three states:
! 947: **
! 948: ** - initialized with docid, no positions.
! 949: ** - docid and one position.
! 950: ** - docid and multiple positions.
! 951: **
! 952: ** Only the last state needs to actually write to dlw->b, which would
! 953: ** be an improvement in the DLCollector case.
! 954: */
! 955: typedef struct PLWriter {
! 956: DLWriter *dlw;
! 957:
! 958: int iColumn; /* the last column written */
! 959: int iPos; /* the last position written */
! 960: int iOffset; /* the last start offset written */
! 961: } PLWriter;
! 962:
! 963: /* TODO(shess) In the case where the parent is reading these values
! 964: ** from a PLReader, we could optimize to a copy if that PLReader has
! 965: ** the same type as pWriter.
! 966: */
! 967: static void plwAdd(PLWriter *pWriter, int iColumn, int iPos,
! 968: int iStartOffset, int iEndOffset){
! 969: /* Worst-case space for POS_COLUMN, iColumn, iPosDelta,
! 970: ** iStartOffsetDelta, and iEndOffsetDelta.
! 971: */
! 972: char c[5*VARINT_MAX];
! 973: int n = 0;
! 974:
! 975: /* Ban plwAdd() after plwTerminate(). */
! 976: assert( pWriter->iPos!=-1 );
! 977:
! 978: if( pWriter->dlw->iType==DL_DOCIDS ) return;
! 979:
! 980: if( iColumn!=pWriter->iColumn ){
! 981: n += putVarint(c+n, POS_COLUMN);
! 982: n += putVarint(c+n, iColumn);
! 983: pWriter->iColumn = iColumn;
! 984: pWriter->iPos = 0;
! 985: pWriter->iOffset = 0;
! 986: }
! 987: assert( iPos>=pWriter->iPos );
! 988: n += putVarint(c+n, POS_BASE+(iPos-pWriter->iPos));
! 989: pWriter->iPos = iPos;
! 990: if( pWriter->dlw->iType==DL_POSITIONS_OFFSETS ){
! 991: assert( iStartOffset>=pWriter->iOffset );
! 992: n += putVarint(c+n, iStartOffset-pWriter->iOffset);
! 993: pWriter->iOffset = iStartOffset;
! 994: assert( iEndOffset>=iStartOffset );
! 995: n += putVarint(c+n, iEndOffset-iStartOffset);
! 996: }
! 997: dataBufferAppend(pWriter->dlw->b, c, n);
! 998: }
! 999: static void plwCopy(PLWriter *pWriter, PLReader *pReader){
! 1000: plwAdd(pWriter, plrColumn(pReader), plrPosition(pReader),
! 1001: plrStartOffset(pReader), plrEndOffset(pReader));
! 1002: }
! 1003: static void plwInit(PLWriter *pWriter, DLWriter *dlw, sqlite_int64 iDocid){
! 1004: char c[VARINT_MAX];
! 1005: int n;
! 1006:
! 1007: pWriter->dlw = dlw;
! 1008:
! 1009: /* Docids must ascend. */
! 1010: assert( !pWriter->dlw->has_iPrevDocid || iDocid>pWriter->dlw->iPrevDocid );
! 1011: n = putVarint(c, iDocid-pWriter->dlw->iPrevDocid);
! 1012: dataBufferAppend(pWriter->dlw->b, c, n);
! 1013: pWriter->dlw->iPrevDocid = iDocid;
! 1014: #ifndef NDEBUG
! 1015: pWriter->dlw->has_iPrevDocid = 1;
! 1016: #endif
! 1017:
! 1018: pWriter->iColumn = 0;
! 1019: pWriter->iPos = 0;
! 1020: pWriter->iOffset = 0;
! 1021: }
! 1022: /* TODO(shess) Should plwDestroy() also terminate the doclist? But
! 1023: ** then plwDestroy() would no longer be just a destructor, it would
! 1024: ** also be doing work, which isn't consistent with the overall idiom.
! 1025: ** Another option would be for plwAdd() to always append any necessary
! 1026: ** terminator, so that the output is always correct. But that would
! 1027: ** add incremental work to the common case with the only benefit being
! 1028: ** API elegance. Punt for now.
! 1029: */
! 1030: static void plwTerminate(PLWriter *pWriter){
! 1031: if( pWriter->dlw->iType>DL_DOCIDS ){
! 1032: char c[VARINT_MAX];
! 1033: int n = putVarint(c, POS_END);
! 1034: dataBufferAppend(pWriter->dlw->b, c, n);
! 1035: }
! 1036: #ifndef NDEBUG
! 1037: /* Mark as terminated for assert in plwAdd(). */
! 1038: pWriter->iPos = -1;
! 1039: #endif
! 1040: }
! 1041: static void plwDestroy(PLWriter *pWriter){
! 1042: SCRAMBLE(pWriter);
! 1043: }
! 1044:
! 1045: /*******************************************************************/
! 1046: /* DLCollector wraps PLWriter and DLWriter to provide a
! 1047: ** dynamically-allocated doclist area to use during tokenization.
! 1048: **
! 1049: ** dlcNew - malloc up and initialize a collector.
! 1050: ** dlcDelete - destroy a collector and all contained items.
! 1051: ** dlcAddPos - append position and offset information.
! 1052: ** dlcAddDoclist - add the collected doclist to the given buffer.
! 1053: ** dlcNext - terminate the current document and open another.
! 1054: */
! 1055: typedef struct DLCollector {
! 1056: DataBuffer b;
! 1057: DLWriter dlw;
! 1058: PLWriter plw;
! 1059: } DLCollector;
! 1060:
! 1061: /* TODO(shess) This could also be done by calling plwTerminate() and
! 1062: ** dataBufferAppend(). I tried that, expecting nominal performance
! 1063: ** differences, but it seemed to pretty reliably be worth 1% to code
! 1064: ** it this way. I suspect it is the incremental malloc overhead (some
! 1065: ** percentage of the plwTerminate() calls will cause a realloc), so
! 1066: ** this might be worth revisiting if the DataBuffer implementation
! 1067: ** changes.
! 1068: */
! 1069: static void dlcAddDoclist(DLCollector *pCollector, DataBuffer *b){
! 1070: if( pCollector->dlw.iType>DL_DOCIDS ){
! 1071: char c[VARINT_MAX];
! 1072: int n = putVarint(c, POS_END);
! 1073: dataBufferAppend2(b, pCollector->b.pData, pCollector->b.nData, c, n);
! 1074: }else{
! 1075: dataBufferAppend(b, pCollector->b.pData, pCollector->b.nData);
! 1076: }
! 1077: }
! 1078: static void dlcNext(DLCollector *pCollector, sqlite_int64 iDocid){
! 1079: plwTerminate(&pCollector->plw);
! 1080: plwDestroy(&pCollector->plw);
! 1081: plwInit(&pCollector->plw, &pCollector->dlw, iDocid);
! 1082: }
! 1083: static void dlcAddPos(DLCollector *pCollector, int iColumn, int iPos,
! 1084: int iStartOffset, int iEndOffset){
! 1085: plwAdd(&pCollector->plw, iColumn, iPos, iStartOffset, iEndOffset);
! 1086: }
! 1087:
! 1088: static DLCollector *dlcNew(sqlite_int64 iDocid, DocListType iType){
! 1089: DLCollector *pCollector = sqlite3_malloc(sizeof(DLCollector));
! 1090: dataBufferInit(&pCollector->b, 0);
! 1091: dlwInit(&pCollector->dlw, iType, &pCollector->b);
! 1092: plwInit(&pCollector->plw, &pCollector->dlw, iDocid);
! 1093: return pCollector;
! 1094: }
! 1095: static void dlcDelete(DLCollector *pCollector){
! 1096: plwDestroy(&pCollector->plw);
! 1097: dlwDestroy(&pCollector->dlw);
! 1098: dataBufferDestroy(&pCollector->b);
! 1099: SCRAMBLE(pCollector);
! 1100: sqlite3_free(pCollector);
! 1101: }
! 1102:
! 1103:
! 1104: /* Copy the doclist data of iType in pData/nData into *out, trimming
! 1105: ** unnecessary data as we go. Only columns matching iColumn are
! 1106: ** copied, all columns copied if iColumn is -1. Elements with no
! 1107: ** matching columns are dropped. The output is an iOutType doclist.
! 1108: */
! 1109: /* NOTE(shess) This code is only valid after all doclists are merged.
! 1110: ** If this is run before merges, then doclist items which represent
! 1111: ** deletion will be trimmed, and will thus not effect a deletion
! 1112: ** during the merge.
! 1113: */
! 1114: static void docListTrim(DocListType iType, const char *pData, int nData,
! 1115: int iColumn, DocListType iOutType, DataBuffer *out){
! 1116: DLReader dlReader;
! 1117: DLWriter dlWriter;
! 1118:
! 1119: assert( iOutType<=iType );
! 1120:
! 1121: dlrInit(&dlReader, iType, pData, nData);
! 1122: dlwInit(&dlWriter, iOutType, out);
! 1123:
! 1124: while( !dlrAtEnd(&dlReader) ){
! 1125: PLReader plReader;
! 1126: PLWriter plWriter;
! 1127: int match = 0;
! 1128:
! 1129: plrInit(&plReader, &dlReader);
! 1130:
! 1131: while( !plrAtEnd(&plReader) ){
! 1132: if( iColumn==-1 || plrColumn(&plReader)==iColumn ){
! 1133: if( !match ){
! 1134: plwInit(&plWriter, &dlWriter, dlrDocid(&dlReader));
! 1135: match = 1;
! 1136: }
! 1137: plwAdd(&plWriter, plrColumn(&plReader), plrPosition(&plReader),
! 1138: plrStartOffset(&plReader), plrEndOffset(&plReader));
! 1139: }
! 1140: plrStep(&plReader);
! 1141: }
! 1142: if( match ){
! 1143: plwTerminate(&plWriter);
! 1144: plwDestroy(&plWriter);
! 1145: }
! 1146:
! 1147: plrDestroy(&plReader);
! 1148: dlrStep(&dlReader);
! 1149: }
! 1150: dlwDestroy(&dlWriter);
! 1151: dlrDestroy(&dlReader);
! 1152: }
! 1153:
! 1154: /* Used by docListMerge() to keep doclists in the ascending order by
! 1155: ** docid, then ascending order by age (so the newest comes first).
! 1156: */
! 1157: typedef struct OrderedDLReader {
! 1158: DLReader *pReader;
! 1159:
! 1160: /* TODO(shess) If we assume that docListMerge pReaders is ordered by
! 1161: ** age (which we do), then we could use pReader comparisons to break
! 1162: ** ties.
! 1163: */
! 1164: int idx;
! 1165: } OrderedDLReader;
! 1166:
! 1167: /* Order eof to end, then by docid asc, idx desc. */
! 1168: static int orderedDLReaderCmp(OrderedDLReader *r1, OrderedDLReader *r2){
! 1169: if( dlrAtEnd(r1->pReader) ){
! 1170: if( dlrAtEnd(r2->pReader) ) return 0; /* Both atEnd(). */
! 1171: return 1; /* Only r1 atEnd(). */
! 1172: }
! 1173: if( dlrAtEnd(r2->pReader) ) return -1; /* Only r2 atEnd(). */
! 1174:
! 1175: if( dlrDocid(r1->pReader)<dlrDocid(r2->pReader) ) return -1;
! 1176: if( dlrDocid(r1->pReader)>dlrDocid(r2->pReader) ) return 1;
! 1177:
! 1178: /* Descending on idx. */
! 1179: return r2->idx-r1->idx;
! 1180: }
! 1181:
! 1182: /* Bubble p[0] to appropriate place in p[1..n-1]. Assumes that
! 1183: ** p[1..n-1] is already sorted.
! 1184: */
! 1185: /* TODO(shess) Is this frequent enough to warrant a binary search?
! 1186: ** Before implementing that, instrument the code to check. In most
! 1187: ** current usage, I expect that p[0] will be less than p[1] a very
! 1188: ** high proportion of the time.
! 1189: */
! 1190: static void orderedDLReaderReorder(OrderedDLReader *p, int n){
! 1191: while( n>1 && orderedDLReaderCmp(p, p+1)>0 ){
! 1192: OrderedDLReader tmp = p[0];
! 1193: p[0] = p[1];
! 1194: p[1] = tmp;
! 1195: n--;
! 1196: p++;
! 1197: }
! 1198: }
! 1199:
! 1200: /* Given an array of doclist readers, merge their doclist elements
! 1201: ** into out in sorted order (by docid), dropping elements from older
! 1202: ** readers when there is a duplicate docid. pReaders is assumed to be
! 1203: ** ordered by age, oldest first.
! 1204: */
! 1205: /* TODO(shess) nReaders must be <= MERGE_COUNT. This should probably
! 1206: ** be fixed.
! 1207: */
! 1208: static void docListMerge(DataBuffer *out,
! 1209: DLReader *pReaders, int nReaders){
! 1210: OrderedDLReader readers[MERGE_COUNT];
! 1211: DLWriter writer;
! 1212: int i, n;
! 1213: const char *pStart = 0;
! 1214: int nStart = 0;
! 1215: sqlite_int64 iFirstDocid = 0, iLastDocid = 0;
! 1216:
! 1217: assert( nReaders>0 );
! 1218: if( nReaders==1 ){
! 1219: dataBufferAppend(out, dlrDocData(pReaders), dlrAllDataBytes(pReaders));
! 1220: return;
! 1221: }
! 1222:
! 1223: assert( nReaders<=MERGE_COUNT );
! 1224: n = 0;
! 1225: for(i=0; i<nReaders; i++){
! 1226: assert( pReaders[i].iType==pReaders[0].iType );
! 1227: readers[i].pReader = pReaders+i;
! 1228: readers[i].idx = i;
! 1229: n += dlrAllDataBytes(&pReaders[i]);
! 1230: }
! 1231: /* Conservatively size output to sum of inputs. Output should end
! 1232: ** up strictly smaller than input.
! 1233: */
! 1234: dataBufferExpand(out, n);
! 1235:
! 1236: /* Get the readers into sorted order. */
! 1237: while( i-->0 ){
! 1238: orderedDLReaderReorder(readers+i, nReaders-i);
! 1239: }
! 1240:
! 1241: dlwInit(&writer, pReaders[0].iType, out);
! 1242: while( !dlrAtEnd(readers[0].pReader) ){
! 1243: sqlite_int64 iDocid = dlrDocid(readers[0].pReader);
! 1244:
! 1245: /* If this is a continuation of the current buffer to copy, extend
! 1246: ** that buffer. memcpy() seems to be more efficient if it has a
! 1247: ** lots of data to copy.
! 1248: */
! 1249: if( dlrDocData(readers[0].pReader)==pStart+nStart ){
! 1250: nStart += dlrDocDataBytes(readers[0].pReader);
! 1251: }else{
! 1252: if( pStart!=0 ){
! 1253: dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
! 1254: }
! 1255: pStart = dlrDocData(readers[0].pReader);
! 1256: nStart = dlrDocDataBytes(readers[0].pReader);
! 1257: iFirstDocid = iDocid;
! 1258: }
! 1259: iLastDocid = iDocid;
! 1260: dlrStep(readers[0].pReader);
! 1261:
! 1262: /* Drop all of the older elements with the same docid. */
! 1263: for(i=1; i<nReaders &&
! 1264: !dlrAtEnd(readers[i].pReader) &&
! 1265: dlrDocid(readers[i].pReader)==iDocid; i++){
! 1266: dlrStep(readers[i].pReader);
! 1267: }
! 1268:
! 1269: /* Get the readers back into order. */
! 1270: while( i-->0 ){
! 1271: orderedDLReaderReorder(readers+i, nReaders-i);
! 1272: }
! 1273: }
! 1274:
! 1275: /* Copy over any remaining elements. */
! 1276: if( nStart>0 ) dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
! 1277: dlwDestroy(&writer);
! 1278: }
! 1279:
! 1280: /* Helper function for posListUnion(). Compares the current position
! 1281: ** between left and right, returning as standard C idiom of <0 if
! 1282: ** left<right, >0 if left>right, and 0 if left==right. "End" always
! 1283: ** compares greater.
! 1284: */
! 1285: static int posListCmp(PLReader *pLeft, PLReader *pRight){
! 1286: assert( pLeft->iType==pRight->iType );
! 1287: if( pLeft->iType==DL_DOCIDS ) return 0;
! 1288:
! 1289: if( plrAtEnd(pLeft) ) return plrAtEnd(pRight) ? 0 : 1;
! 1290: if( plrAtEnd(pRight) ) return -1;
! 1291:
! 1292: if( plrColumn(pLeft)<plrColumn(pRight) ) return -1;
! 1293: if( plrColumn(pLeft)>plrColumn(pRight) ) return 1;
! 1294:
! 1295: if( plrPosition(pLeft)<plrPosition(pRight) ) return -1;
! 1296: if( plrPosition(pLeft)>plrPosition(pRight) ) return 1;
! 1297: if( pLeft->iType==DL_POSITIONS ) return 0;
! 1298:
! 1299: if( plrStartOffset(pLeft)<plrStartOffset(pRight) ) return -1;
! 1300: if( plrStartOffset(pLeft)>plrStartOffset(pRight) ) return 1;
! 1301:
! 1302: if( plrEndOffset(pLeft)<plrEndOffset(pRight) ) return -1;
! 1303: if( plrEndOffset(pLeft)>plrEndOffset(pRight) ) return 1;
! 1304:
! 1305: return 0;
! 1306: }
! 1307:
! 1308: /* Write the union of position lists in pLeft and pRight to pOut.
! 1309: ** "Union" in this case meaning "All unique position tuples". Should
! 1310: ** work with any doclist type, though both inputs and the output
! 1311: ** should be the same type.
! 1312: */
! 1313: static void posListUnion(DLReader *pLeft, DLReader *pRight, DLWriter *pOut){
! 1314: PLReader left, right;
! 1315: PLWriter writer;
! 1316:
! 1317: assert( dlrDocid(pLeft)==dlrDocid(pRight) );
! 1318: assert( pLeft->iType==pRight->iType );
! 1319: assert( pLeft->iType==pOut->iType );
! 1320:
! 1321: plrInit(&left, pLeft);
! 1322: plrInit(&right, pRight);
! 1323: plwInit(&writer, pOut, dlrDocid(pLeft));
! 1324:
! 1325: while( !plrAtEnd(&left) || !plrAtEnd(&right) ){
! 1326: int c = posListCmp(&left, &right);
! 1327: if( c<0 ){
! 1328: plwCopy(&writer, &left);
! 1329: plrStep(&left);
! 1330: }else if( c>0 ){
! 1331: plwCopy(&writer, &right);
! 1332: plrStep(&right);
! 1333: }else{
! 1334: plwCopy(&writer, &left);
! 1335: plrStep(&left);
! 1336: plrStep(&right);
! 1337: }
! 1338: }
! 1339:
! 1340: plwTerminate(&writer);
! 1341: plwDestroy(&writer);
! 1342: plrDestroy(&left);
! 1343: plrDestroy(&right);
! 1344: }
! 1345:
! 1346: /* Write the union of doclists in pLeft and pRight to pOut. For
! 1347: ** docids in common between the inputs, the union of the position
! 1348: ** lists is written. Inputs and outputs are always type DL_DEFAULT.
! 1349: */
! 1350: static void docListUnion(
! 1351: const char *pLeft, int nLeft,
! 1352: const char *pRight, int nRight,
! 1353: DataBuffer *pOut /* Write the combined doclist here */
! 1354: ){
! 1355: DLReader left, right;
! 1356: DLWriter writer;
! 1357:
! 1358: if( nLeft==0 ){
! 1359: if( nRight!=0) dataBufferAppend(pOut, pRight, nRight);
! 1360: return;
! 1361: }
! 1362: if( nRight==0 ){
! 1363: dataBufferAppend(pOut, pLeft, nLeft);
! 1364: return;
! 1365: }
! 1366:
! 1367: dlrInit(&left, DL_DEFAULT, pLeft, nLeft);
! 1368: dlrInit(&right, DL_DEFAULT, pRight, nRight);
! 1369: dlwInit(&writer, DL_DEFAULT, pOut);
! 1370:
! 1371: while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){
! 1372: if( dlrAtEnd(&right) ){
! 1373: dlwCopy(&writer, &left);
! 1374: dlrStep(&left);
! 1375: }else if( dlrAtEnd(&left) ){
! 1376: dlwCopy(&writer, &right);
! 1377: dlrStep(&right);
! 1378: }else if( dlrDocid(&left)<dlrDocid(&right) ){
! 1379: dlwCopy(&writer, &left);
! 1380: dlrStep(&left);
! 1381: }else if( dlrDocid(&left)>dlrDocid(&right) ){
! 1382: dlwCopy(&writer, &right);
! 1383: dlrStep(&right);
! 1384: }else{
! 1385: posListUnion(&left, &right, &writer);
! 1386: dlrStep(&left);
! 1387: dlrStep(&right);
! 1388: }
! 1389: }
! 1390:
! 1391: dlrDestroy(&left);
! 1392: dlrDestroy(&right);
! 1393: dlwDestroy(&writer);
! 1394: }
! 1395:
! 1396: /* pLeft and pRight are DLReaders positioned to the same docid.
! 1397: **
! 1398: ** If there are no instances in pLeft or pRight where the position
! 1399: ** of pLeft is one less than the position of pRight, then this
! 1400: ** routine adds nothing to pOut.
! 1401: **
! 1402: ** If there are one or more instances where positions from pLeft
! 1403: ** are exactly one less than positions from pRight, then add a new
! 1404: ** document record to pOut. If pOut wants to hold positions, then
! 1405: ** include the positions from pRight that are one more than a
! 1406: ** position in pLeft. In other words: pRight.iPos==pLeft.iPos+1.
! 1407: */
! 1408: static void posListPhraseMerge(DLReader *pLeft, DLReader *pRight,
! 1409: DLWriter *pOut){
! 1410: PLReader left, right;
! 1411: PLWriter writer;
! 1412: int match = 0;
! 1413:
! 1414: assert( dlrDocid(pLeft)==dlrDocid(pRight) );
! 1415: assert( pOut->iType!=DL_POSITIONS_OFFSETS );
! 1416:
! 1417: plrInit(&left, pLeft);
! 1418: plrInit(&right, pRight);
! 1419:
! 1420: while( !plrAtEnd(&left) && !plrAtEnd(&right) ){
! 1421: if( plrColumn(&left)<plrColumn(&right) ){
! 1422: plrStep(&left);
! 1423: }else if( plrColumn(&left)>plrColumn(&right) ){
! 1424: plrStep(&right);
! 1425: }else if( plrPosition(&left)+1<plrPosition(&right) ){
! 1426: plrStep(&left);
! 1427: }else if( plrPosition(&left)+1>plrPosition(&right) ){
! 1428: plrStep(&right);
! 1429: }else{
! 1430: if( !match ){
! 1431: plwInit(&writer, pOut, dlrDocid(pLeft));
! 1432: match = 1;
! 1433: }
! 1434: plwAdd(&writer, plrColumn(&right), plrPosition(&right), 0, 0);
! 1435: plrStep(&left);
! 1436: plrStep(&right);
! 1437: }
! 1438: }
! 1439:
! 1440: if( match ){
! 1441: plwTerminate(&writer);
! 1442: plwDestroy(&writer);
! 1443: }
! 1444:
! 1445: plrDestroy(&left);
! 1446: plrDestroy(&right);
! 1447: }
! 1448:
! 1449: /* We have two doclists with positions: pLeft and pRight.
! 1450: ** Write the phrase intersection of these two doclists into pOut.
! 1451: **
! 1452: ** A phrase intersection means that two documents only match
! 1453: ** if pLeft.iPos+1==pRight.iPos.
! 1454: **
! 1455: ** iType controls the type of data written to pOut. If iType is
! 1456: ** DL_POSITIONS, the positions are those from pRight.
! 1457: */
! 1458: static void docListPhraseMerge(
! 1459: const char *pLeft, int nLeft,
! 1460: const char *pRight, int nRight,
! 1461: DocListType iType,
! 1462: DataBuffer *pOut /* Write the combined doclist here */
! 1463: ){
! 1464: DLReader left, right;
! 1465: DLWriter writer;
! 1466:
! 1467: if( nLeft==0 || nRight==0 ) return;
! 1468:
! 1469: assert( iType!=DL_POSITIONS_OFFSETS );
! 1470:
! 1471: dlrInit(&left, DL_POSITIONS, pLeft, nLeft);
! 1472: dlrInit(&right, DL_POSITIONS, pRight, nRight);
! 1473: dlwInit(&writer, iType, pOut);
! 1474:
! 1475: while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){
! 1476: if( dlrDocid(&left)<dlrDocid(&right) ){
! 1477: dlrStep(&left);
! 1478: }else if( dlrDocid(&right)<dlrDocid(&left) ){
! 1479: dlrStep(&right);
! 1480: }else{
! 1481: posListPhraseMerge(&left, &right, &writer);
! 1482: dlrStep(&left);
! 1483: dlrStep(&right);
! 1484: }
! 1485: }
! 1486:
! 1487: dlrDestroy(&left);
! 1488: dlrDestroy(&right);
! 1489: dlwDestroy(&writer);
! 1490: }
! 1491:
! 1492: /* We have two DL_DOCIDS doclists: pLeft and pRight.
! 1493: ** Write the intersection of these two doclists into pOut as a
! 1494: ** DL_DOCIDS doclist.
! 1495: */
! 1496: static void docListAndMerge(
! 1497: const char *pLeft, int nLeft,
! 1498: const char *pRight, int nRight,
! 1499: DataBuffer *pOut /* Write the combined doclist here */
! 1500: ){
! 1501: DLReader left, right;
! 1502: DLWriter writer;
! 1503:
! 1504: if( nLeft==0 || nRight==0 ) return;
! 1505:
! 1506: dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
! 1507: dlrInit(&right, DL_DOCIDS, pRight, nRight);
! 1508: dlwInit(&writer, DL_DOCIDS, pOut);
! 1509:
! 1510: while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){
! 1511: if( dlrDocid(&left)<dlrDocid(&right) ){
! 1512: dlrStep(&left);
! 1513: }else if( dlrDocid(&right)<dlrDocid(&left) ){
! 1514: dlrStep(&right);
! 1515: }else{
! 1516: dlwAdd(&writer, dlrDocid(&left));
! 1517: dlrStep(&left);
! 1518: dlrStep(&right);
! 1519: }
! 1520: }
! 1521:
! 1522: dlrDestroy(&left);
! 1523: dlrDestroy(&right);
! 1524: dlwDestroy(&writer);
! 1525: }
! 1526:
! 1527: /* We have two DL_DOCIDS doclists: pLeft and pRight.
! 1528: ** Write the union of these two doclists into pOut as a
! 1529: ** DL_DOCIDS doclist.
! 1530: */
! 1531: static void docListOrMerge(
! 1532: const char *pLeft, int nLeft,
! 1533: const char *pRight, int nRight,
! 1534: DataBuffer *pOut /* Write the combined doclist here */
! 1535: ){
! 1536: DLReader left, right;
! 1537: DLWriter writer;
! 1538:
! 1539: if( nLeft==0 ){
! 1540: if( nRight!=0 ) dataBufferAppend(pOut, pRight, nRight);
! 1541: return;
! 1542: }
! 1543: if( nRight==0 ){
! 1544: dataBufferAppend(pOut, pLeft, nLeft);
! 1545: return;
! 1546: }
! 1547:
! 1548: dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
! 1549: dlrInit(&right, DL_DOCIDS, pRight, nRight);
! 1550: dlwInit(&writer, DL_DOCIDS, pOut);
! 1551:
! 1552: while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){
! 1553: if( dlrAtEnd(&right) ){
! 1554: dlwAdd(&writer, dlrDocid(&left));
! 1555: dlrStep(&left);
! 1556: }else if( dlrAtEnd(&left) ){
! 1557: dlwAdd(&writer, dlrDocid(&right));
! 1558: dlrStep(&right);
! 1559: }else if( dlrDocid(&left)<dlrDocid(&right) ){
! 1560: dlwAdd(&writer, dlrDocid(&left));
! 1561: dlrStep(&left);
! 1562: }else if( dlrDocid(&right)<dlrDocid(&left) ){
! 1563: dlwAdd(&writer, dlrDocid(&right));
! 1564: dlrStep(&right);
! 1565: }else{
! 1566: dlwAdd(&writer, dlrDocid(&left));
! 1567: dlrStep(&left);
! 1568: dlrStep(&right);
! 1569: }
! 1570: }
! 1571:
! 1572: dlrDestroy(&left);
! 1573: dlrDestroy(&right);
! 1574: dlwDestroy(&writer);
! 1575: }
! 1576:
! 1577: /* We have two DL_DOCIDS doclists: pLeft and pRight.
! 1578: ** Write into pOut as DL_DOCIDS doclist containing all documents that
! 1579: ** occur in pLeft but not in pRight.
! 1580: */
! 1581: static void docListExceptMerge(
! 1582: const char *pLeft, int nLeft,
! 1583: const char *pRight, int nRight,
! 1584: DataBuffer *pOut /* Write the combined doclist here */
! 1585: ){
! 1586: DLReader left, right;
! 1587: DLWriter writer;
! 1588:
! 1589: if( nLeft==0 ) return;
! 1590: if( nRight==0 ){
! 1591: dataBufferAppend(pOut, pLeft, nLeft);
! 1592: return;
! 1593: }
! 1594:
! 1595: dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
! 1596: dlrInit(&right, DL_DOCIDS, pRight, nRight);
! 1597: dlwInit(&writer, DL_DOCIDS, pOut);
! 1598:
! 1599: while( !dlrAtEnd(&left) ){
! 1600: while( !dlrAtEnd(&right) && dlrDocid(&right)<dlrDocid(&left) ){
! 1601: dlrStep(&right);
! 1602: }
! 1603: if( dlrAtEnd(&right) || dlrDocid(&left)<dlrDocid(&right) ){
! 1604: dlwAdd(&writer, dlrDocid(&left));
! 1605: }
! 1606: dlrStep(&left);
! 1607: }
! 1608:
! 1609: dlrDestroy(&left);
! 1610: dlrDestroy(&right);
! 1611: dlwDestroy(&writer);
! 1612: }
! 1613:
/* Return a '\0'-terminated copy of the first n bytes of s.
**
** The copy is obtained from sqlite3_malloc(), so the caller must
** release it with sqlite3_free().  Returns NULL if the allocation
** fails; previously a failed allocation was passed straight to
** memcpy().
*/
static char *string_dup_n(const char *s, int n){
  char *str = sqlite3_malloc(n + 1);
  if( str==NULL ) return NULL;
  memcpy(str, s, n);
  str[n] = '\0';
  return str;
}
! 1620:
/* Duplicate a '\0'-terminated string via string_dup_n().
**
** The copy comes from sqlite3_malloc(), so the caller must release it
** with sqlite3_free(), not free().  (We don't use strdup() since it
** is not part of the standard C library and may not be available
** everywhere.)
*/
static char *string_dup(const char *s){
  return string_dup_n(s, (int)strlen(s));
}
! 1627:
/* Format a string, replacing each occurrence of the % character with
** zDb.zName.  This may be more convenient than sqlite_mprintf()
** when one string is used repeatedly in a format string.
**
** The result comes from sqlite3_malloc(), so the caller must release
** it with sqlite3_free(), not free().  Returns NULL if the allocation
** fails; previously a failed allocation was written through.
*/
static char *string_format(const char *zFormat,
                           const char *zDb, const char *zName){
  const char *p;
  size_t len = 0;
  size_t nDb = strlen(zDb);
  size_t nName = strlen(zName);
  size_t nFullTableName = nDb+1+nName;   /* "zDb" + "." + "zName" */
  char *result;
  char *r;

  /* First pass: compute the length needed. */
  for(p = zFormat; *p; ++p){
    len += (*p=='%' ? nFullTableName : 1);
  }
  len += 1;  /* for null terminator */

  r = result = sqlite3_malloc((int)len);
  if( result==NULL ) return NULL;

  /* Second pass: copy, expanding each '%' as it is met. */
  for(p = zFormat; *p; ++p){
    if( *p=='%' ){
      memcpy(r, zDb, nDb);
      r += nDb;
      *r++ = '.';
      memcpy(r, zName, nName);
      r += nName;
    }else{
      *r++ = *p;
    }
  }
  *r++ = '\0';
  assert( r == result + len );
  return result;
}
! 1664:
! 1665: static int sql_exec(sqlite3 *db, const char *zDb, const char *zName,
! 1666: const char *zFormat){
! 1667: char *zCommand = string_format(zFormat, zDb, zName);
! 1668: int rc;
! 1669: TRACE(("FTS2 sql: %s\n", zCommand));
! 1670: rc = sqlite3_exec(db, zCommand, NULL, 0, NULL);
! 1671: sqlite3_free(zCommand);
! 1672: return rc;
! 1673: }
! 1674:
! 1675: static int sql_prepare(sqlite3 *db, const char *zDb, const char *zName,
! 1676: sqlite3_stmt **ppStmt, const char *zFormat){
! 1677: char *zCommand = string_format(zFormat, zDb, zName);
! 1678: int rc;
! 1679: TRACE(("FTS2 prepare: %s\n", zCommand));
! 1680: rc = sqlite3_prepare_v2(db, zCommand, -1, ppStmt, NULL);
! 1681: sqlite3_free(zCommand);
! 1682: return rc;
! 1683: }
! 1684:
! 1685: /* end utility functions */
! 1686:
/* Forward reference: struct fulltext_vtab is defined later in the file. */
typedef struct fulltext_vtab fulltext_vtab;
! 1689:
/* A single term in a query is represented by an instance of
** the following structure.
*/
typedef struct QueryTerm {
  short int nPhrase; /* How many following terms are part of the same phrase */
  short int iPhrase; /* This is the i-th term of a phrase. */
  short int iColumn; /* Column of the index that must match this term */
  signed char isOr;  /* this term is preceded by "OR" */
  signed char isNot; /* this term is preceded by "-" */
  signed char isPrefix; /* this term is followed by "*" */
  char *pTerm;       /* text of the term.  '\000' terminated.  malloced */
  int nTerm;         /* Number of bytes in pTerm[] */
} QueryTerm;
! 1703:
! 1704:
! 1705: /* A query string is parsed into a Query structure.
! 1706: *
! 1707: * We could, in theory, allow query strings to be complicated
! 1708: * nested expressions with precedence determined by parentheses.
! 1709: * But none of the major search engines do this. (Perhaps the
! 1710: * feeling is that an parenthesized expression is two complex of
! 1711: * an idea for the average user to grasp.) Taking our lead from
! 1712: * the major search engines, we will allow queries to be a list
! 1713: * of terms (with an implied AND operator) or phrases in double-quotes,
! 1714: * with a single optional "-" before each non-phrase term to designate
! 1715: * negation and an optional OR connector.
! 1716: *
! 1717: * OR binds more tightly than the implied AND, which is what the
! 1718: * major search engines seem to do. So, for example:
! 1719: *
! 1720: * [one two OR three] ==> one AND (two OR three)
! 1721: * [one OR two three] ==> (one OR two) AND three
! 1722: *
! 1723: * A "-" before a term matches all entries that lack that term.
! 1724: * The "-" must occur immediately before the term with in intervening
! 1725: * space. This is how the search engines do it.
! 1726: *
! 1727: * A NOT term cannot be the right-hand operand of an OR. If this
! 1728: * occurs in the query string, the NOT is ignored:
! 1729: *
! 1730: * [one OR -two] ==> one OR two
! 1731: *
! 1732: */
! 1733: typedef struct Query {
! 1734: fulltext_vtab *pFts; /* The full text index */
! 1735: int nTerms; /* Number of terms in the query */
! 1736: QueryTerm *pTerms; /* Array of terms. Space obtained from malloc() */
! 1737: int nextIsOr; /* Set the isOr flag on the next inserted term */
! 1738: int nextColumn; /* Next word parsed must be in this column */
! 1739: int dfltColumn; /* The default column */
! 1740: } Query;
! 1741:
! 1742:
/*
** Bookkeeping for generated matching-word offset information and
** snippet text.  aMatch[] holds one entry per matching term; zOffset
** and zSnippet hold the rendered text forms.
*/
typedef struct Snippet {
  int nMatch;     /* Total number of matches */
  int nAlloc;     /* Space allocated for aMatch[] */
  struct snippetMatch { /* One entry for each matching term */
    char snStatus;   /* Status flag for use while constructing snippets */
    short iCol;      /* The column that contains the match */
    short iTerm;     /* The index in Query.pTerms[] of the matching term */
    short nByte;     /* Number of bytes in the term */
    int iStart;      /* The offset to the first character of the term */
  } *aMatch;      /* Points to space obtained from malloc */
  char *zOffset;  /* Text rendering of aMatch[] */
  int nOffset;    /* strlen(zOffset) */
  char *zSnippet; /* Snippet text */
  int nSnippet;   /* strlen(zSnippet) */
} Snippet;
! 1762:
! 1763:
/* The kinds of scan a cursor can perform. */
typedef enum QueryType {
  QUERY_GENERIC,   /* table scan */
  QUERY_ROWID,     /* lookup by rowid */
  QUERY_FULLTEXT   /* QUERY_FULLTEXT + [i] is a full-text search for column i*/
} QueryType;
! 1769:
/* Identifiers for the cached SQL statements.  The values index the
** fulltext_zStatement[] table and fulltext_vtab.pFulltextStatements[],
** so the order here must stay in step with fulltext_zStatement[].
*/
typedef enum fulltext_statement {
  /* Statements against %_content */
  CONTENT_INSERT_STMT,
  CONTENT_SELECT_STMT,
  CONTENT_UPDATE_STMT,
  CONTENT_DELETE_STMT,
  CONTENT_EXISTS_STMT,

  /* Statements against %_segments */
  BLOCK_INSERT_STMT,
  BLOCK_SELECT_STMT,
  BLOCK_DELETE_STMT,
  BLOCK_DELETE_ALL_STMT,

  /* Statements against %_segdir */
  SEGDIR_MAX_INDEX_STMT,
  SEGDIR_SET_STMT,
  SEGDIR_SELECT_LEVEL_STMT,
  SEGDIR_SPAN_STMT,
  SEGDIR_DELETE_STMT,
  SEGDIR_SELECT_SEGMENT_STMT,
  SEGDIR_SELECT_ALL_STMT,
  SEGDIR_DELETE_ALL_STMT,
  SEGDIR_COUNT_STMT,

  MAX_STMT                     /* Always at end! */
} fulltext_statement;
! 1794:
! 1795: /* These must exactly match the enum above. */
! 1796: /* TODO(shess): Is there some risk that a statement will be used in two
! 1797: ** cursors at once, e.g. if a query joins a virtual table to itself?
! 1798: ** If so perhaps we should move some of these to the cursor object.
! 1799: */
! 1800: static const char *const fulltext_zStatement[MAX_STMT] = {
! 1801: /* CONTENT_INSERT */ NULL, /* generated in contentInsertStatement() */
! 1802: /* CONTENT_SELECT */ "select * from %_content where rowid = ?",
! 1803: /* CONTENT_UPDATE */ NULL, /* generated in contentUpdateStatement() */
! 1804: /* CONTENT_DELETE */ "delete from %_content where rowid = ?",
! 1805: /* CONTENT_EXISTS */ "select rowid from %_content limit 1",
! 1806:
! 1807: /* BLOCK_INSERT */ "insert into %_segments values (?)",
! 1808: /* BLOCK_SELECT */ "select block from %_segments where rowid = ?",
! 1809: /* BLOCK_DELETE */ "delete from %_segments where rowid between ? and ?",
! 1810: /* BLOCK_DELETE_ALL */ "delete from %_segments",
! 1811:
! 1812: /* SEGDIR_MAX_INDEX */ "select max(idx) from %_segdir where level = ?",
! 1813: /* SEGDIR_SET */ "insert into %_segdir values (?, ?, ?, ?, ?, ?)",
! 1814: /* SEGDIR_SELECT_LEVEL */
! 1815: "select start_block, leaves_end_block, root from %_segdir "
! 1816: " where level = ? order by idx",
! 1817: /* SEGDIR_SPAN */
! 1818: "select min(start_block), max(end_block) from %_segdir "
! 1819: " where level = ? and start_block <> 0",
! 1820: /* SEGDIR_DELETE */ "delete from %_segdir where level = ?",
! 1821:
! 1822: /* NOTE(shess): The first three results of the following two
! 1823: ** statements must match.
! 1824: */
! 1825: /* SEGDIR_SELECT_SEGMENT */
! 1826: "select start_block, leaves_end_block, root from %_segdir "
! 1827: " where level = ? and idx = ?",
! 1828: /* SEGDIR_SELECT_ALL */
! 1829: "select start_block, leaves_end_block, root from %_segdir "
! 1830: " order by level desc, idx asc",
! 1831: /* SEGDIR_DELETE_ALL */ "delete from %_segdir",
! 1832: /* SEGDIR_COUNT */ "select count(*), ifnull(max(level),0) from %_segdir",
! 1833: };
! 1834:
! 1835: /*
! 1836: ** A connection to a fulltext index is an instance of the following
! 1837: ** structure. The xCreate and xConnect methods create an instance
! 1838: ** of this structure and xDestroy and xDisconnect free that instance.
! 1839: ** All other methods receive a pointer to the structure as one of their
! 1840: ** arguments.
! 1841: */
! 1842: struct fulltext_vtab {
! 1843: sqlite3_vtab base; /* Base class used by SQLite core */
! 1844: sqlite3 *db; /* The database connection */
! 1845: const char *zDb; /* logical database name */
! 1846: const char *zName; /* virtual table name */
! 1847: int nColumn; /* number of columns in virtual table */
! 1848: char **azColumn; /* column names. malloced */
! 1849: char **azContentColumn; /* column names in content table; malloced */
! 1850: sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */
! 1851:
! 1852: /* Precompiled statements which we keep as long as the table is
! 1853: ** open.
! 1854: */
! 1855: sqlite3_stmt *pFulltextStatements[MAX_STMT];
! 1856:
! 1857: /* Precompiled statements used for segment merges. We run a
! 1858: ** separate select across the leaf level of each tree being merged.
! 1859: */
! 1860: sqlite3_stmt *pLeafSelectStmts[MERGE_COUNT];
! 1861: /* The statement used to prepare pLeafSelectStmts. */
! 1862: #define LEAF_SELECT \
! 1863: "select block from %_segments where rowid between ? and ? order by rowid"
! 1864:
! 1865: /* These buffer pending index updates during transactions.
! 1866: ** nPendingData estimates the memory size of the pending data. It
! 1867: ** doesn't include the hash-bucket overhead, nor any malloc
! 1868: ** overhead. When nPendingData exceeds kPendingThreshold, the
! 1869: ** buffer is flushed even before the transaction closes.
! 1870: ** pendingTerms stores the data, and is only valid when nPendingData
! 1871: ** is >=0 (nPendingData<0 means pendingTerms has not been
! 1872: ** initialized). iPrevDocid is the last docid written, used to make
! 1873: ** certain we're inserting in sorted order.
! 1874: */
! 1875: int nPendingData;
! 1876: #define kPendingThreshold (1*1024*1024)
! 1877: sqlite_int64 iPrevDocid;
! 1878: fts2Hash pendingTerms;
! 1879: };
! 1880:
! 1881: /*
! 1882: ** When the core wants to do a query, it create a cursor using a
! 1883: ** call to xOpen. This structure is an instance of a cursor. It
! 1884: ** is destroyed by xClose.
! 1885: */
! 1886: typedef struct fulltext_cursor {
! 1887: sqlite3_vtab_cursor base; /* Base class used by SQLite core */
! 1888: QueryType iCursorType; /* Copy of sqlite3_index_info.idxNum */
! 1889: sqlite3_stmt *pStmt; /* Prepared statement in use by the cursor */
! 1890: int eof; /* True if at End Of Results */
! 1891: Query q; /* Parsed query string */
! 1892: Snippet snippet; /* Cached snippet for the current row */
! 1893: int iColumn; /* Column being searched */
! 1894: DataBuffer result; /* Doclist results from fulltextQuery */
! 1895: DLReader reader; /* Result reader if result not empty */
! 1896: } fulltext_cursor;
! 1897:
! 1898: static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){
! 1899: return (fulltext_vtab *) c->base.pVtab;
! 1900: }
! 1901:
! 1902: static const sqlite3_module fts2Module; /* forward declaration */
! 1903:
! 1904: /* Return a dynamically generated statement of the form
! 1905: * insert into %_content (rowid, ...) values (?, ...)
! 1906: */
! 1907: static const char *contentInsertStatement(fulltext_vtab *v){
! 1908: StringBuffer sb;
! 1909: int i;
! 1910:
! 1911: initStringBuffer(&sb);
! 1912: append(&sb, "insert into %_content (rowid, ");
! 1913: appendList(&sb, v->nColumn, v->azContentColumn);
! 1914: append(&sb, ") values (?");
! 1915: for(i=0; i<v->nColumn; ++i)
! 1916: append(&sb, ", ?");
! 1917: append(&sb, ")");
! 1918: return stringBufferData(&sb);
! 1919: }
! 1920:
! 1921: /* Return a dynamically generated statement of the form
! 1922: * update %_content set [col_0] = ?, [col_1] = ?, ...
! 1923: * where rowid = ?
! 1924: */
! 1925: static const char *contentUpdateStatement(fulltext_vtab *v){
! 1926: StringBuffer sb;
! 1927: int i;
! 1928:
! 1929: initStringBuffer(&sb);
! 1930: append(&sb, "update %_content set ");
! 1931: for(i=0; i<v->nColumn; ++i) {
! 1932: if( i>0 ){
! 1933: append(&sb, ", ");
! 1934: }
! 1935: append(&sb, v->azContentColumn[i]);
! 1936: append(&sb, " = ?");
! 1937: }
! 1938: append(&sb, " where rowid = ?");
! 1939: return stringBufferData(&sb);
! 1940: }
! 1941:
! 1942: /* Puts a freshly-prepared statement determined by iStmt in *ppStmt.
! 1943: ** If the indicated statement has never been prepared, it is prepared
! 1944: ** and cached, otherwise the cached version is reset.
! 1945: */
! 1946: static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt,
! 1947: sqlite3_stmt **ppStmt){
! 1948: assert( iStmt<MAX_STMT );
! 1949: if( v->pFulltextStatements[iStmt]==NULL ){
! 1950: const char *zStmt;
! 1951: int rc;
! 1952: switch( iStmt ){
! 1953: case CONTENT_INSERT_STMT:
! 1954: zStmt = contentInsertStatement(v); break;
! 1955: case CONTENT_UPDATE_STMT:
! 1956: zStmt = contentUpdateStatement(v); break;
! 1957: default:
! 1958: zStmt = fulltext_zStatement[iStmt];
! 1959: }
! 1960: rc = sql_prepare(v->db, v->zDb, v->zName, &v->pFulltextStatements[iStmt],
! 1961: zStmt);
! 1962: if( zStmt != fulltext_zStatement[iStmt]) sqlite3_free((void *) zStmt);
! 1963: if( rc!=SQLITE_OK ) return rc;
! 1964: } else {
! 1965: int rc = sqlite3_reset(v->pFulltextStatements[iStmt]);
! 1966: if( rc!=SQLITE_OK ) return rc;
! 1967: }
! 1968:
! 1969: *ppStmt = v->pFulltextStatements[iStmt];
! 1970: return SQLITE_OK;
! 1971: }
! 1972:
! 1973: /* Like sqlite3_step(), but convert SQLITE_DONE to SQLITE_OK and
! 1974: ** SQLITE_ROW to SQLITE_ERROR. Useful for statements like UPDATE,
! 1975: ** where we expect no results.
! 1976: */
! 1977: static int sql_single_step(sqlite3_stmt *s){
! 1978: int rc = sqlite3_step(s);
! 1979: return (rc==SQLITE_DONE) ? SQLITE_OK : rc;
! 1980: }
! 1981:
! 1982: /* Like sql_get_statement(), but for special replicated LEAF_SELECT
! 1983: ** statements. idx -1 is a special case for an uncached version of
! 1984: ** the statement (used in the optimize implementation).
! 1985: */
! 1986: /* TODO(shess) Write version for generic statements and then share
! 1987: ** that between the cached-statement functions.
! 1988: */
! 1989: static int sql_get_leaf_statement(fulltext_vtab *v, int idx,
! 1990: sqlite3_stmt **ppStmt){
! 1991: assert( idx>=-1 && idx<MERGE_COUNT );
! 1992: if( idx==-1 ){
! 1993: return sql_prepare(v->db, v->zDb, v->zName, ppStmt, LEAF_SELECT);
! 1994: }else if( v->pLeafSelectStmts[idx]==NULL ){
! 1995: int rc = sql_prepare(v->db, v->zDb, v->zName, &v->pLeafSelectStmts[idx],
! 1996: LEAF_SELECT);
! 1997: if( rc!=SQLITE_OK ) return rc;
! 1998: }else{
! 1999: int rc = sqlite3_reset(v->pLeafSelectStmts[idx]);
! 2000: if( rc!=SQLITE_OK ) return rc;
! 2001: }
! 2002:
! 2003: *ppStmt = v->pLeafSelectStmts[idx];
! 2004: return SQLITE_OK;
! 2005: }
! 2006:
! 2007: /* insert into %_content (rowid, ...) values ([rowid], [pValues]) */
! 2008: static int content_insert(fulltext_vtab *v, sqlite3_value *rowid,
! 2009: sqlite3_value **pValues){
! 2010: sqlite3_stmt *s;
! 2011: int i;
! 2012: int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s);
! 2013: if( rc!=SQLITE_OK ) return rc;
! 2014:
! 2015: rc = sqlite3_bind_value(s, 1, rowid);
! 2016: if( rc!=SQLITE_OK ) return rc;
! 2017:
! 2018: for(i=0; i<v->nColumn; ++i){
! 2019: rc = sqlite3_bind_value(s, 2+i, pValues[i]);
! 2020: if( rc!=SQLITE_OK ) return rc;
! 2021: }
! 2022:
! 2023: return sql_single_step(s);
! 2024: }
! 2025:
! 2026: /* update %_content set col0 = pValues[0], col1 = pValues[1], ...
! 2027: * where rowid = [iRowid] */
! 2028: static int content_update(fulltext_vtab *v, sqlite3_value **pValues,
! 2029: sqlite_int64 iRowid){
! 2030: sqlite3_stmt *s;
! 2031: int i;
! 2032: int rc = sql_get_statement(v, CONTENT_UPDATE_STMT, &s);
! 2033: if( rc!=SQLITE_OK ) return rc;
! 2034:
! 2035: for(i=0; i<v->nColumn; ++i){
! 2036: rc = sqlite3_bind_value(s, 1+i, pValues[i]);
! 2037: if( rc!=SQLITE_OK ) return rc;
! 2038: }
! 2039:
! 2040: rc = sqlite3_bind_int64(s, 1+v->nColumn, iRowid);
! 2041: if( rc!=SQLITE_OK ) return rc;
! 2042:
! 2043: return sql_single_step(s);
! 2044: }
! 2045:
/* Release the nString strings held in pString[] and then the array
** itself.  NULL entries are allowed.
*/
static void freeStringArray(int nString, const char **pString){
  int iString = 0;

  while( iString<nString ){
    /* sqlite3_free() is a no-op on NULL, so no per-entry test is needed */
    sqlite3_free((void *) pString[iString]);
    iString++;
  }
  sqlite3_free((void *) pString);
}
! 2054:
! 2055: /* select * from %_content where rowid = [iRow]
! 2056: * The caller must delete the returned array and all strings in it.
! 2057: * null fields will be NULL in the returned array.
! 2058: *
! 2059: * TODO: Perhaps we should return pointer/length strings here for consistency
! 2060: * with other code which uses pointer/length. */
! 2061: static int content_select(fulltext_vtab *v, sqlite_int64 iRow,
! 2062: const char ***pValues){
! 2063: sqlite3_stmt *s;
! 2064: const char **values;
! 2065: int i;
! 2066: int rc;
! 2067:
! 2068: *pValues = NULL;
! 2069:
! 2070: rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s);
! 2071: if( rc!=SQLITE_OK ) return rc;
! 2072:
! 2073: rc = sqlite3_bind_int64(s, 1, iRow);
! 2074: if( rc!=SQLITE_OK ) return rc;
! 2075:
! 2076: rc = sqlite3_step(s);
! 2077: if( rc!=SQLITE_ROW ) return rc;
! 2078:
! 2079: values = (const char **) sqlite3_malloc(v->nColumn * sizeof(const char *));
! 2080: for(i=0; i<v->nColumn; ++i){
! 2081: if( sqlite3_column_type(s, i)==SQLITE_NULL ){
! 2082: values[i] = NULL;
! 2083: }else{
! 2084: values[i] = string_dup((char*)sqlite3_column_text(s, i));
! 2085: }
! 2086: }
! 2087:
! 2088: /* We expect only one row. We must execute another sqlite3_step()
! 2089: * to complete the iteration; otherwise the table will remain locked. */
! 2090: rc = sqlite3_step(s);
! 2091: if( rc==SQLITE_DONE ){
! 2092: *pValues = values;
! 2093: return SQLITE_OK;
! 2094: }
! 2095:
! 2096: freeStringArray(v->nColumn, values);
! 2097: return rc;
! 2098: }
! 2099:
! 2100: /* delete from %_content where rowid = [iRow ] */
! 2101: static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){
! 2102: sqlite3_stmt *s;
! 2103: int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s);
! 2104: if( rc!=SQLITE_OK ) return rc;
! 2105:
! 2106: rc = sqlite3_bind_int64(s, 1, iRow);
! 2107: if( rc!=SQLITE_OK ) return rc;
! 2108:
! 2109: return sql_single_step(s);
! 2110: }
! 2111:
! 2112: /* Returns SQLITE_ROW if any rows exist in %_content, SQLITE_DONE if
! 2113: ** no rows exist, and any error in case of failure.
! 2114: */
! 2115: static int content_exists(fulltext_vtab *v){
! 2116: sqlite3_stmt *s;
! 2117: int rc = sql_get_statement(v, CONTENT_EXISTS_STMT, &s);
! 2118: if( rc!=SQLITE_OK ) return rc;
! 2119:
! 2120: rc = sqlite3_step(s);
! 2121: if( rc!=SQLITE_ROW ) return rc;
! 2122:
! 2123: /* We expect only one row. We must execute another sqlite3_step()
! 2124: * to complete the iteration; otherwise the table will remain locked. */
! 2125: rc = sqlite3_step(s);
! 2126: if( rc==SQLITE_DONE ) return SQLITE_ROW;
! 2127: if( rc==SQLITE_ROW ) return SQLITE_ERROR;
! 2128: return rc;
! 2129: }
! 2130:
! 2131: /* insert into %_segments values ([pData])
! 2132: ** returns assigned rowid in *piBlockid
! 2133: */
! 2134: static int block_insert(fulltext_vtab *v, const char *pData, int nData,
! 2135: sqlite_int64 *piBlockid){
! 2136: sqlite3_stmt *s;
! 2137: int rc = sql_get_statement(v, BLOCK_INSERT_STMT, &s);
! 2138: if( rc!=SQLITE_OK ) return rc;
! 2139:
! 2140: rc = sqlite3_bind_blob(s, 1, pData, nData, SQLITE_STATIC);
! 2141: if( rc!=SQLITE_OK ) return rc;
! 2142:
! 2143: rc = sqlite3_step(s);
! 2144: if( rc==SQLITE_ROW ) return SQLITE_ERROR;
! 2145: if( rc!=SQLITE_DONE ) return rc;
! 2146:
! 2147: *piBlockid = sqlite3_last_insert_rowid(v->db);
! 2148: return SQLITE_OK;
! 2149: }
! 2150:
! 2151: /* delete from %_segments
! 2152: ** where rowid between [iStartBlockid] and [iEndBlockid]
! 2153: **
! 2154: ** Deletes the range of blocks, inclusive, used to delete the blocks
! 2155: ** which form a segment.
! 2156: */
! 2157: static int block_delete(fulltext_vtab *v,
! 2158: sqlite_int64 iStartBlockid, sqlite_int64 iEndBlockid){
! 2159: sqlite3_stmt *s;
! 2160: int rc = sql_get_statement(v, BLOCK_DELETE_STMT, &s);
! 2161: if( rc!=SQLITE_OK ) return rc;
! 2162:
! 2163: rc = sqlite3_bind_int64(s, 1, iStartBlockid);
! 2164: if( rc!=SQLITE_OK ) return rc;
! 2165:
! 2166: rc = sqlite3_bind_int64(s, 2, iEndBlockid);
! 2167: if( rc!=SQLITE_OK ) return rc;
! 2168:
! 2169: return sql_single_step(s);
! 2170: }
! 2171:
! 2172: /* Returns SQLITE_ROW with *pidx set to the maximum segment idx found
! 2173: ** at iLevel. Returns SQLITE_DONE if there are no segments at
! 2174: ** iLevel. Otherwise returns an error.
! 2175: */
! 2176: static int segdir_max_index(fulltext_vtab *v, int iLevel, int *pidx){
! 2177: sqlite3_stmt *s;
! 2178: int rc = sql_get_statement(v, SEGDIR_MAX_INDEX_STMT, &s);
! 2179: if( rc!=SQLITE_OK ) return rc;
! 2180:
! 2181: rc = sqlite3_bind_int(s, 1, iLevel);
! 2182: if( rc!=SQLITE_OK ) return rc;
! 2183:
! 2184: rc = sqlite3_step(s);
! 2185: /* Should always get at least one row due to how max() works. */
! 2186: if( rc==SQLITE_DONE ) return SQLITE_DONE;
! 2187: if( rc!=SQLITE_ROW ) return rc;
! 2188:
! 2189: /* NULL means that there were no inputs to max(). */
! 2190: if( SQLITE_NULL==sqlite3_column_type(s, 0) ){
! 2191: rc = sqlite3_step(s);
! 2192: if( rc==SQLITE_ROW ) return SQLITE_ERROR;
! 2193: return rc;
! 2194: }
! 2195:
! 2196: *pidx = sqlite3_column_int(s, 0);
! 2197:
! 2198: /* We expect only one row. We must execute another sqlite3_step()
! 2199: * to complete the iteration; otherwise the table will remain locked. */
! 2200: rc = sqlite3_step(s);
! 2201: if( rc==SQLITE_ROW ) return SQLITE_ERROR;
! 2202: if( rc!=SQLITE_DONE ) return rc;
! 2203: return SQLITE_ROW;
! 2204: }
! 2205:
! 2206: /* insert into %_segdir values (
! 2207: ** [iLevel], [idx],
! 2208: ** [iStartBlockid], [iLeavesEndBlockid], [iEndBlockid],
! 2209: ** [pRootData]
! 2210: ** )
! 2211: */
! 2212: static int segdir_set(fulltext_vtab *v, int iLevel, int idx,
! 2213: sqlite_int64 iStartBlockid,
! 2214: sqlite_int64 iLeavesEndBlockid,
! 2215: sqlite_int64 iEndBlockid,
! 2216: const char *pRootData, int nRootData){
! 2217: sqlite3_stmt *s;
! 2218: int rc = sql_get_statement(v, SEGDIR_SET_STMT, &s);
! 2219: if( rc!=SQLITE_OK ) return rc;
! 2220:
! 2221: rc = sqlite3_bind_int(s, 1, iLevel);
! 2222: if( rc!=SQLITE_OK ) return rc;
! 2223:
! 2224: rc = sqlite3_bind_int(s, 2, idx);
! 2225: if( rc!=SQLITE_OK ) return rc;
! 2226:
! 2227: rc = sqlite3_bind_int64(s, 3, iStartBlockid);
! 2228: if( rc!=SQLITE_OK ) return rc;
! 2229:
! 2230: rc = sqlite3_bind_int64(s, 4, iLeavesEndBlockid);
! 2231: if( rc!=SQLITE_OK ) return rc;
! 2232:
! 2233: rc = sqlite3_bind_int64(s, 5, iEndBlockid);
! 2234: if( rc!=SQLITE_OK ) return rc;
! 2235:
! 2236: rc = sqlite3_bind_blob(s, 6, pRootData, nRootData, SQLITE_STATIC);
! 2237: if( rc!=SQLITE_OK ) return rc;
! 2238:
! 2239: return sql_single_step(s);
! 2240: }
! 2241:
! 2242: /* Queries %_segdir for the block span of the segments in level
! 2243: ** iLevel. Returns SQLITE_DONE if there are no blocks for iLevel,
! 2244: ** SQLITE_ROW if there are blocks, else an error.
! 2245: */
! 2246: static int segdir_span(fulltext_vtab *v, int iLevel,
! 2247: sqlite_int64 *piStartBlockid,
! 2248: sqlite_int64 *piEndBlockid){
! 2249: sqlite3_stmt *s;
! 2250: int rc = sql_get_statement(v, SEGDIR_SPAN_STMT, &s);
! 2251: if( rc!=SQLITE_OK ) return rc;
! 2252:
! 2253: rc = sqlite3_bind_int(s, 1, iLevel);
! 2254: if( rc!=SQLITE_OK ) return rc;
! 2255:
! 2256: rc = sqlite3_step(s);
! 2257: if( rc==SQLITE_DONE ) return SQLITE_DONE; /* Should never happen */
! 2258: if( rc!=SQLITE_ROW ) return rc;
! 2259:
! 2260: /* This happens if all segments at this level are entirely inline. */
! 2261: if( SQLITE_NULL==sqlite3_column_type(s, 0) ){
! 2262: /* We expect only one row. We must execute another sqlite3_step()
! 2263: * to complete the iteration; otherwise the table will remain locked. */
! 2264: int rc2 = sqlite3_step(s);
! 2265: if( rc2==SQLITE_ROW ) return SQLITE_ERROR;
! 2266: return rc2;
! 2267: }
! 2268:
! 2269: *piStartBlockid = sqlite3_column_int64(s, 0);
! 2270: *piEndBlockid = sqlite3_column_int64(s, 1);
! 2271:
! 2272: /* We expect only one row. We must execute another sqlite3_step()
! 2273: * to complete the iteration; otherwise the table will remain locked. */
! 2274: rc = sqlite3_step(s);
! 2275: if( rc==SQLITE_ROW ) return SQLITE_ERROR;
! 2276: if( rc!=SQLITE_DONE ) return rc;
! 2277: return SQLITE_ROW;
! 2278: }
! 2279:
! 2280: /* Delete the segment blocks and segment directory records for all
! 2281: ** segments at iLevel.
! 2282: */
! 2283: static int segdir_delete(fulltext_vtab *v, int iLevel){
! 2284: sqlite3_stmt *s;
! 2285: sqlite_int64 iStartBlockid, iEndBlockid;
! 2286: int rc = segdir_span(v, iLevel, &iStartBlockid, &iEndBlockid);
! 2287: if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ) return rc;
! 2288:
! 2289: if( rc==SQLITE_ROW ){
! 2290: rc = block_delete(v, iStartBlockid, iEndBlockid);
! 2291: if( rc!=SQLITE_OK ) return rc;
! 2292: }
! 2293:
! 2294: /* Delete the segment directory itself. */
! 2295: rc = sql_get_statement(v, SEGDIR_DELETE_STMT, &s);
! 2296: if( rc!=SQLITE_OK ) return rc;
! 2297:
! 2298: rc = sqlite3_bind_int64(s, 1, iLevel);
! 2299: if( rc!=SQLITE_OK ) return rc;
! 2300:
! 2301: return sql_single_step(s);
! 2302: }
! 2303:
! 2304: /* Delete entire fts index, SQLITE_OK on success, relevant error on
! 2305: ** failure.
! 2306: */
! 2307: static int segdir_delete_all(fulltext_vtab *v){
! 2308: sqlite3_stmt *s;
! 2309: int rc = sql_get_statement(v, SEGDIR_DELETE_ALL_STMT, &s);
! 2310: if( rc!=SQLITE_OK ) return rc;
! 2311:
! 2312: rc = sql_single_step(s);
! 2313: if( rc!=SQLITE_OK ) return rc;
! 2314:
! 2315: rc = sql_get_statement(v, BLOCK_DELETE_ALL_STMT, &s);
! 2316: if( rc!=SQLITE_OK ) return rc;
! 2317:
! 2318: return sql_single_step(s);
! 2319: }
! 2320:
! 2321: /* Returns SQLITE_OK with *pnSegments set to the number of entries in
! 2322: ** %_segdir and *piMaxLevel set to the highest level which has a
! 2323: ** segment. Otherwise returns the SQLite error which caused failure.
! 2324: */
! 2325: static int segdir_count(fulltext_vtab *v, int *pnSegments, int *piMaxLevel){
! 2326: sqlite3_stmt *s;
! 2327: int rc = sql_get_statement(v, SEGDIR_COUNT_STMT, &s);
! 2328: if( rc!=SQLITE_OK ) return rc;
! 2329:
! 2330: rc = sqlite3_step(s);
! 2331: /* TODO(shess): This case should not be possible? Should stronger
! 2332: ** measures be taken if it happens?
! 2333: */
! 2334: if( rc==SQLITE_DONE ){
! 2335: *pnSegments = 0;
! 2336: *piMaxLevel = 0;
! 2337: return SQLITE_OK;
! 2338: }
! 2339: if( rc!=SQLITE_ROW ) return rc;
! 2340:
! 2341: *pnSegments = sqlite3_column_int(s, 0);
! 2342: *piMaxLevel = sqlite3_column_int(s, 1);
! 2343:
! 2344: /* We expect only one row. We must execute another sqlite3_step()
! 2345: * to complete the iteration; otherwise the table will remain locked. */
! 2346: rc = sqlite3_step(s);
! 2347: if( rc==SQLITE_DONE ) return SQLITE_OK;
! 2348: if( rc==SQLITE_ROW ) return SQLITE_ERROR;
! 2349: return rc;
! 2350: }
! 2351:
! 2352: /* TODO(shess) clearPendingTerms() is far down the file because
! 2353: ** writeZeroSegment() is far down the file because LeafWriter is far
! 2354: ** down the file. Consider refactoring the code to move the non-vtab
! 2355: ** code above the vtab code so that we don't need this forward
! 2356: ** reference.
! 2357: */
! 2358: static int clearPendingTerms(fulltext_vtab *v);
! 2359:
! 2360: /*
! 2361: ** Free the memory used to contain a fulltext_vtab structure.
! 2362: */
! 2363: static void fulltext_vtab_destroy(fulltext_vtab *v){
! 2364: int iStmt, i;
! 2365:
! 2366: TRACE(("FTS2 Destroy %p\n", v));
! 2367: for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){
! 2368: if( v->pFulltextStatements[iStmt]!=NULL ){
! 2369: sqlite3_finalize(v->pFulltextStatements[iStmt]);
! 2370: v->pFulltextStatements[iStmt] = NULL;
! 2371: }
! 2372: }
! 2373:
! 2374: for( i=0; i<MERGE_COUNT; i++ ){
! 2375: if( v->pLeafSelectStmts[i]!=NULL ){
! 2376: sqlite3_finalize(v->pLeafSelectStmts[i]);
! 2377: v->pLeafSelectStmts[i] = NULL;
! 2378: }
! 2379: }
! 2380:
! 2381: if( v->pTokenizer!=NULL ){
! 2382: v->pTokenizer->pModule->xDestroy(v->pTokenizer);
! 2383: v->pTokenizer = NULL;
! 2384: }
! 2385:
! 2386: clearPendingTerms(v);
! 2387:
! 2388: sqlite3_free(v->azColumn);
! 2389: for(i = 0; i < v->nColumn; ++i) {
! 2390: sqlite3_free(v->azContentColumn[i]);
! 2391: }
! 2392: sqlite3_free(v->azContentColumn);
! 2393: sqlite3_free(v);
! 2394: }
! 2395:
/*
** Token types reported by getToken() while parsing the arguments to
** xConnect or xCreate.
*/
#define TOKEN_EOF         0    /* End of file */
#define TOKEN_SPACE       1    /* Any kind of whitespace */
#define TOKEN_ID          2    /* An identifier */
#define TOKEN_STRING      3    /* A string literal */
#define TOKEN_PUNCT       4    /* A single punctuation character */
! 2404:
/*
** IdChar(X) is true if X is a character that can be used in an
** identifier, and false otherwise.
**
** Any character with the high-order bit set is allowed in an
** identifier.  For 7-bit characters at or above 0x20, the isIdChar[]
** entry for X-0x20 must be 1; characters below 0x20 are never allowed.
**
** Ticket #1066.  The SQL standard does not allow '$' in the
** middle of identifiers.  But many SQL implementations do.
** SQLite will allow '$' in identifiers for compatibility.
** But the feature is undocumented.
**
** Note that the macro assigns its argument to a variable named "c",
** which must be in scope wherever IdChar() is used.
*/
static const char isIdChar[] = {
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
    0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 2x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */
};
#define IdChar(C)  (((c=C)&0x80)!=0 || (c>0x1f && isIdChar[c-0x20]))
! 2428:
! 2429:
! 2430: /*
! 2431: ** Return the length of the token that begins at z[0].
! 2432: ** Store the token type in *tokenType before returning.
! 2433: */
! 2434: static int getToken(const char *z, int *tokenType){
! 2435: int i, c;
! 2436: switch( *z ){
! 2437: case 0: {
! 2438: *tokenType = TOKEN_EOF;
! 2439: return 0;
! 2440: }
! 2441: case ' ': case '\t': case '\n': case '\f': case '\r': {
! 2442: for(i=1; safe_isspace(z[i]); i++){}
! 2443: *tokenType = TOKEN_SPACE;
! 2444: return i;
! 2445: }
! 2446: case '`':
! 2447: case '\'':
! 2448: case '"': {
! 2449: int delim = z[0];
! 2450: for(i=1; (c=z[i])!=0; i++){
! 2451: if( c==delim ){
! 2452: if( z[i+1]==delim ){
! 2453: i++;
! 2454: }else{
! 2455: break;
! 2456: }
! 2457: }
! 2458: }
! 2459: *tokenType = TOKEN_STRING;
! 2460: return i + (c!=0);
! 2461: }
! 2462: case '[': {
! 2463: for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
! 2464: *tokenType = TOKEN_ID;
! 2465: return i;
! 2466: }
! 2467: default: {
! 2468: if( !IdChar(*z) ){
! 2469: break;
! 2470: }
! 2471: for(i=1; IdChar(z[i]); i++){}
! 2472: *tokenType = TOKEN_ID;
! 2473: return i;
! 2474: }
! 2475: }
! 2476: *tokenType = TOKEN_PUNCT;
! 2477: return 1;
! 2478: }
! 2479:
/*
** A token extracted from a string is an instance of the following
** structure.
*/
typedef struct Token {
  const char *z;       /* Pointer to token text.  Not '\000' terminated */
  int n;               /* Length of the token text in bytes.  This is an
                       ** int, matching the int length returned by
                       ** getToken(), so that long tokens are not
                       ** truncated when stored here. */
} Token;
! 2488:
! 2489: /*
! 2490: ** Given a input string (which is really one of the argv[] parameters
! 2491: ** passed into xConnect or xCreate) split the string up into tokens.
! 2492: ** Return an array of pointers to '\000' terminated strings, one string
! 2493: ** for each non-whitespace token.
! 2494: **
! 2495: ** The returned array is terminated by a single NULL pointer.
! 2496: **
! 2497: ** Space to hold the returned array is obtained from a single
! 2498: ** malloc and should be freed by passing the return value to free().
! 2499: ** The individual strings within the token list are all a part of
! 2500: ** the single memory allocation and will all be freed at once.
! 2501: */
! 2502: static char **tokenizeString(const char *z, int *pnToken){
! 2503: int nToken = 0;
! 2504: Token *aToken = sqlite3_malloc( strlen(z) * sizeof(aToken[0]) );
! 2505: int n = 1;
! 2506: int e, i;
! 2507: int totalSize = 0;
! 2508: char **azToken;
! 2509: char *zCopy;
! 2510: while( n>0 ){
! 2511: n = getToken(z, &e);
! 2512: if( e!=TOKEN_SPACE ){
! 2513: aToken[nToken].z = z;
! 2514: aToken[nToken].n = n;
! 2515: nToken++;
! 2516: totalSize += n+1;
! 2517: }
! 2518: z += n;
! 2519: }
! 2520: azToken = (char**)sqlite3_malloc( nToken*sizeof(char*) + totalSize );
! 2521: zCopy = (char*)&azToken[nToken];
! 2522: nToken--;
! 2523: for(i=0; i<nToken; i++){
! 2524: azToken[i] = zCopy;
! 2525: n = aToken[i].n;
! 2526: memcpy(zCopy, aToken[i].z, n);
! 2527: zCopy[n] = 0;
! 2528: zCopy += n+1;
! 2529: }
! 2530: azToken[nToken] = 0;
! 2531: sqlite3_free(aToken);
! 2532: *pnToken = nToken;
! 2533: return azToken;
! 2534: }
! 2535:
/*
** Convert an SQL-style quoted string into a normal string by removing
** the quote characters.  The conversion is done in-place.  If the
** input does not begin with a quote character, then this routine
** is a no-op.  A NULL input is ignored.
**
** Inside the string, a doubled closing quote stands for one literal
** quote character.  If the closing quote is missing, everything after
** the opening quote is kept.
**
** Examples:
**
**     "abc"   becomes   abc
**     'xyz'   becomes   xyz
**     [pqr]   becomes   pqr
**     `mno`   becomes   mno
**     'it''s' becomes   it's
*/
static void dequoteString(char *z){
  int quote;
  int i, j;
  if( z==0 ) return;
  quote = z[0];
  switch( quote ){
    case '\'':  break;
    case '"':   break;
    case '`':   break;                /* For MySQL compatibility */
    case '[':   quote = ']';  break;  /* For MS SqlServer compatibility */
    default:    return;
  }
  for(i=1, j=0; z[i]; i++){
    if( z[i]==quote ){
      if( z[i+1]!=quote ) break;      /* closing quote: stop copying */
      i++;                            /* doubled quote: keep one copy */
    }
    z[j++] = z[i];
  }
  /* Terminate at the write cursor whether or not a closing quote was
  ** found; otherwise an unterminated input would keep stale trailing
  ** bytes. */
  z[j] = 0;
}
! 2575:
/*
** The input azIn is a NULL-terminated list of tokens.  Remove the first
** token and all punctuation tokens.  Remove the quotes from
** around string literal tokens.  The list is compacted in place and
** re-terminated with a NULL pointer.  A NULL azIn is ignored.
**
** A token is kept if it begins with an alphanumeric character or is
** more than one character long; the first such token is the one that
** is dropped.
**
** Example:
**
**     input:      tokenize chinese ( 'simplified' , 'mixed' )
**     output:     chinese simplified mixed
**
** Another example:
**
**     input:      delimiters ( '[' , ']' , '...' )
**     output:     [ ] ...
*/
static void tokenListToIdList(char **azIn){
  int i, j;
  if( azIn==0 ) return;
  /* j is the next output slot.  It starts at -1 so that the first
  ** qualifying token is dequoted but never stored. */
  for(i=0, j=-1; azIn[i]; i++){
    if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){
      dequoteString(azIn[i]);
      if( j>=0 ){
        azIn[j] = azIn[i];
      }
      j++;
    }
  }
  /* If no token qualified, j is still -1; writing the terminator there
  ** would land before the start of the array.  Produce an empty list. */
  if( j<0 ) j = 0;
  azIn[j] = 0;
}
! 2606:
! 2607:
! 2608: /*
! 2609: ** Find the first alphanumeric token in the string zIn. Null-terminate
! 2610: ** this token. Remove any quotation marks. And return a pointer to
! 2611: ** the result.
! 2612: */
! 2613: static char *firstToken(char *zIn, char **pzTail){
! 2614: int n, ttype;
! 2615: while(1){
! 2616: n = getToken(zIn, &ttype);
! 2617: if( ttype==TOKEN_SPACE ){
! 2618: zIn += n;
! 2619: }else if( ttype==TOKEN_EOF ){
! 2620: *pzTail = zIn;
! 2621: return 0;
! 2622: }else{
! 2623: zIn[n] = 0;
! 2624: *pzTail = &zIn[1];
! 2625: dequoteString(zIn);
! 2626: return zIn;
! 2627: }
! 2628: }
! 2629: /*NOTREACHED*/
! 2630: }
! 2631:
/* Return true if, after skipping leading whitespace in s:
**
**   *  s begins with the string t, ignoring case, and
**   *  the character of s that follows the match is neither '_' nor
**      an alphanumeric (the end of s qualifies).
**
** To put it another way, return true if the first token of
** s[] is t[].
*/
static int startsWith(const char *s, const char *t){
  while( safe_isspace(*s) ){
    s++;
  }
  for(; *t; s++, t++){
    if( safe_tolower(*s)!=safe_tolower(*t) ){
      return 0;
    }
  }
  /* s now points just past the matched prefix */
  if( *s=='_' ){
    return 0;
  }
  return !safe_isalnum(*s);
}
! 2650:
! 2651: /*
! 2652: ** An instance of this structure defines the "spec" of a
! 2653: ** full text index. This structure is populated by parseSpec
! 2654: ** and used by fulltextConnect and fulltextCreate. clearTableSpec releases the array allocations the structure still holds.
! 2655: */
! 2656: typedef struct TableSpec {
! 2657: const char *zDb; /* Logical database name; parseSpec points this into the azColumn allocation */
! 2658: const char *zName; /* Name of the full-text index; parseSpec points this into the azColumn allocation */
! 2659: int nColumn; /* Number of columns to be indexed; also the entry count of azColumn and azContentColumn */
! 2660: char **azColumn; /* Original names of columns to be indexed (one allocation made by parseSpec) */
! 2661: char **azContentColumn; /* Column names for %_content; parseSpec fills each entry from sqlite3_mprintf */
! 2662: char **azTokenizer; /* Name of tokenizer and its arguments, NULL-terminated; NULL if allocation failed */
! 2663: } TableSpec;
! 2664:
! 2665: /*
! 2666: ** Reclaim all of the memory used by a TableSpec
! 2667: */
! 2668: static void clearTableSpec(TableSpec *p) {
! 2669: sqlite3_free(p->azColumn);
! 2670: sqlite3_free(p->azContentColumn);
! 2671: sqlite3_free(p->azTokenizer);
! 2672: }
! 2673:
! 2674: /* Parse a CREATE VIRTUAL TABLE statement, which looks like this:
! 2675: *
! 2676: * CREATE VIRTUAL TABLE email
! 2677: * USING fts2(subject, body, tokenize mytokenizer(myarg))
! 2678: *
! 2679: * We return parsed information in a TableSpec structure.
! 2680: *
! 2681: */
! 2682: static int parseSpec(TableSpec *pSpec, int argc, const char *const*argv,
! 2683: char**pzErr){
! 2684: int i, n;
! 2685: char *z, *zDummy;
! 2686: char **azArg;
! 2687: const char *zTokenizer = 0; /* argv[] entry describing the tokenizer */
! 2688:
! 2689: assert( argc>=3 );
! 2690: /* Current interface:
! 2691: ** argv[0] - module name
! 2692: ** argv[1] - database name
! 2693: ** argv[2] - table name
! 2694: ** argv[3..] - columns, optionally followed by tokenizer specification
! 2695: ** and snippet delimiters specification.
! 2696: */
! 2697:
! 2698: /* Make a copy of the complete argv[][] array in a single allocation.
! 2699: ** The argv[][] array is read-only and transient. We can write to the
! 2700: ** copy in order to modify things and the copy is persistent.
! 2701: */
! 2702: CLEAR(pSpec);
! 2703: for(i=n=0; i<argc; i++){
! 2704: n += strlen(argv[i]) + 1;
! 2705: }
! 2706: azArg = sqlite3_malloc( sizeof(char*)*argc + n );
! 2707: if( azArg==0 ){
! 2708: return SQLITE_NOMEM;
! 2709: }
! 2710: z = (char*)&azArg[argc];
! 2711: for(i=0; i<argc; i++){
! 2712: azArg[i] = z;
! 2713: strcpy(z, argv[i]);
! 2714: z += strlen(z)+1;
! 2715: }
! 2716:
! 2717: /* Identify the column names and the tokenizer and delimiter arguments
! 2718: ** in the argv[][] array.
! 2719: */
! 2720: pSpec->zDb = azArg[1];
! 2721: pSpec->zName = azArg[2];
! 2722: pSpec->nColumn = 0;
! 2723: pSpec->azColumn = azArg;
! 2724: zTokenizer = "tokenize simple";
! 2725: for(i=3; i<argc; ++i){
! 2726: if( startsWith(azArg[i],"tokenize") ){
! 2727: zTokenizer = azArg[i];
! 2728: }else{
! 2729: z = azArg[pSpec->nColumn] = firstToken(azArg[i], &zDummy);
! 2730: pSpec->nColumn++;
! 2731: }
! 2732: }
! 2733: if( pSpec->nColumn==0 ){
! 2734: azArg[0] = "content";
! 2735: pSpec->nColumn = 1;
! 2736: }
! 2737:
! 2738: /*
! 2739: ** Construct the list of content column names.
! 2740: **
! 2741: ** Each content column name will be of the form cNNAAAA
! 2742: ** where NN is the column number and AAAA is the sanitized
! 2743: ** column name. "sanitized" means that special characters are
! 2744: ** converted to "_". The cNN prefix guarantees that all column
! 2745: ** names are unique.
! 2746: **
! 2747: ** The AAAA suffix is not strictly necessary. It is included
! 2748: ** for the convenience of people who might examine the generated
! 2749: ** %_content table and wonder what the columns are used for.
! 2750: */
! 2751: pSpec->azContentColumn = sqlite3_malloc( pSpec->nColumn * sizeof(char *) );
! 2752: if( pSpec->azContentColumn==0 ){
! 2753: clearTableSpec(pSpec);
! 2754: return SQLITE_NOMEM;
! 2755: }
! 2756: for(i=0; i<pSpec->nColumn; i++){
! 2757: char *p;
! 2758: pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
! 2759: for (p = pSpec->azContentColumn[i]; *p ; ++p) {
! 2760: if( !safe_isalnum(*p) ) *p = '_';
! 2761: }
! 2762: }
! 2763:
! 2764: /*
! 2765: ** Parse the tokenizer specification string.
! 2766: */
! 2767: pSpec->azTokenizer = tokenizeString(zTokenizer, &n);
! 2768: tokenListToIdList(pSpec->azTokenizer);
! 2769:
! 2770: return SQLITE_OK;
! 2771: }
! 2772:
/*
** Build the CREATE TABLE statement that declares the schema of the
** virtual table and return it.  The statement lists each of the
** nColumn entries of azColumn[] followed by one extra column named
** after the table itself.
**
** The string comes from sqlite3_mprintf(); the caller releases it
** with sqlite3_free().
*/
static char *fulltextSchema(
  int nColumn,                  /* Number of columns */
  const char *const* azColumn,  /* List of columns */
  const char *zTableName        /* Name of the table */
){
  char *zSql = sqlite3_mprintf("CREATE TABLE x");
  char *zMore;
  int iCol = 0;

  /* Grow the statement one column at a time: "(" opens the list and
  ** "," separates every later entry. */
  while( iCol<nColumn ){
    zMore = sqlite3_mprintf("%s%s%Q", zSql, iCol==0 ? "(" : ",",
                            azColumn[iCol]);
    sqlite3_free(zSql);
    zSql = zMore;
    iCol++;
  }

  zMore = sqlite3_mprintf("%s,%Q)", zSql, zTableName);
  sqlite3_free(zSql);
  return zMore;
}
! 2799:
! 2800: /*
! 2801: ** Build a new sqlite3_vtab structure that will describe the
! 2802: ** fulltext index defined by spec.
! 2803: */
! 2804: static int constructVtab(
! 2805: sqlite3 *db, /* The SQLite database connection */
! 2806: fts2Hash *pHash, /* Hash table containing tokenizers */
! 2807: TableSpec *spec, /* Parsed spec information from parseSpec() */
! 2808: sqlite3_vtab **ppVTab, /* Write the resulting vtab structure here */
! 2809: char **pzErr /* Write any error message here */
! 2810: ){
! 2811: int rc;
! 2812: int n;
! 2813: fulltext_vtab *v = 0;
! 2814: const sqlite3_tokenizer_module *m = NULL;
! 2815: char *schema;
! 2816:
! 2817: char const *zTok; /* Name of tokenizer to use for this fts table */
! 2818: int nTok; /* Length of zTok, including nul terminator */
! 2819:
! 2820: v = (fulltext_vtab *) sqlite3_malloc(sizeof(fulltext_vtab));
! 2821: if( v==0 ) return SQLITE_NOMEM;
! 2822: CLEAR(v);
! 2823: /* sqlite will initialize v->base */
! 2824: v->db = db;
! 2825: v->zDb = spec->zDb; /* Freed when azColumn is freed */
! 2826: v->zName = spec->zName; /* Freed when azColumn is freed */
! 2827: v->nColumn = spec->nColumn;
! 2828: v->azContentColumn = spec->azContentColumn;
! 2829: spec->azContentColumn = 0;
! 2830: v->azColumn = spec->azColumn;
! 2831: spec->azColumn = 0;
! 2832:
! 2833: if( spec->azTokenizer==0 ){
! 2834: return SQLITE_NOMEM;
! 2835: }
! 2836:
! 2837: zTok = spec->azTokenizer[0];
! 2838: if( !zTok ){
! 2839: zTok = "simple";
! 2840: }
! 2841: nTok = strlen(zTok)+1;
! 2842:
! 2843: m = (sqlite3_tokenizer_module *)sqlite3Fts2HashFind(pHash, zTok, nTok);
! 2844: if( !m ){
! 2845: *pzErr = sqlite3_mprintf("unknown tokenizer: %s", spec->azTokenizer[0]);
! 2846: rc = SQLITE_ERROR;
! 2847: goto err;
! 2848: }
! 2849:
! 2850: for(n=0; spec->azTokenizer[n]; n++){}
! 2851: if( n ){
! 2852: rc = m->xCreate(n-1, (const char*const*)&spec->azTokenizer[1],
! 2853: &v->pTokenizer);
! 2854: }else{
! 2855: rc = m->xCreate(0, 0, &v->pTokenizer);
! 2856: }
! 2857: if( rc!=SQLITE_OK ) goto err;
! 2858: v->pTokenizer->pModule = m;
! 2859:
! 2860: /* TODO: verify the existence of backing tables foo_content, foo_term */
! 2861:
! 2862: schema = fulltextSchema(v->nColumn, (const char*const*)v->azColumn,
! 2863: spec->zName);
! 2864: rc = sqlite3_declare_vtab(db, schema);
! 2865: sqlite3_free(schema);
! 2866: if( rc!=SQLITE_OK ) goto err;
! 2867:
! 2868: memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements));
! 2869:
! 2870: /* Indicate that the buffer is not live. */
! 2871: v->nPendingData = -1;
! 2872:
! 2873: *ppVTab = &v->base;
! 2874: TRACE(("FTS2 Connect %p\n", v));
! 2875:
! 2876: return rc;
! 2877:
! 2878: err:
! 2879: fulltext_vtab_destroy(v);
! 2880: return rc;
! 2881: }
! 2882:
! 2883: static int fulltextConnect(
! 2884: sqlite3 *db,
! 2885: void *pAux,
! 2886: int argc, const char *const*argv,
! 2887: sqlite3_vtab **ppVTab,
! 2888: char **pzErr
! 2889: ){
! 2890: TableSpec spec;
! 2891: int rc = parseSpec(&spec, argc, argv, pzErr);
! 2892: if( rc!=SQLITE_OK ) return rc;
! 2893:
! 2894: rc = constructVtab(db, (fts2Hash *)pAux, &spec, ppVTab, pzErr);
! 2895: clearTableSpec(&spec);
! 2896: return rc;
! 2897: }
! 2898:
! 2899: /* The %_content table holds the text of each document, with
! 2900: ** the rowid used as the docid.
! 2901: */
! 2902: /* TODO(shess) This comment needs elaboration to match the updated
! 2903: ** code. Work it into the top-of-file comment at that time.
! 2904: */
! 2905: static int fulltextCreate(sqlite3 *db, void *pAux,
! 2906: int argc, const char * const *argv,
! 2907: sqlite3_vtab **ppVTab, char **pzErr){
! 2908: int rc;
! 2909: TableSpec spec;
! 2910: StringBuffer schema;
! 2911: TRACE(("FTS2 Create\n"));
! 2912:
! 2913: rc = parseSpec(&spec, argc, argv, pzErr);
! 2914: if( rc!=SQLITE_OK ) return rc;
! 2915:
! 2916: initStringBuffer(&schema);
! 2917: append(&schema, "CREATE TABLE %_content(");
! 2918: appendList(&schema, spec.nColumn, spec.azContentColumn);
! 2919: append(&schema, ")");
! 2920: rc = sql_exec(db, spec.zDb, spec.zName, stringBufferData(&schema));
! 2921: stringBufferDestroy(&schema);
! 2922: if( rc!=SQLITE_OK ) goto out;
! 2923:
! 2924: rc = sql_exec(db, spec.zDb, spec.zName,
! 2925: "create table %_segments(block blob);");
! 2926: if( rc!=SQLITE_OK ) goto out;
! 2927:
! 2928: rc = sql_exec(db, spec.zDb, spec.zName,
! 2929: "create table %_segdir("
! 2930: " level integer,"
! 2931: " idx integer,"
! 2932: " start_block integer,"
! 2933: " leaves_end_block integer,"
! 2934: " end_block integer,"
! 2935: " root blob,"
! 2936: " primary key(level, idx)"
! 2937: ");");
! 2938: if( rc!=SQLITE_OK ) goto out;
! 2939:
! 2940: rc = constructVtab(db, (fts2Hash *)pAux, &spec, ppVTab, pzErr);
! 2941:
! 2942: out:
! 2943: clearTableSpec(&spec);
! 2944: return rc;
! 2945: }
! 2946:
! 2947: /* Decide how to handle an SQL query. */
! 2948: static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
! 2949: int i;
! 2950: TRACE(("FTS2 BestIndex\n"));
! 2951:
! 2952: for(i=0; i<pInfo->nConstraint; ++i){
! 2953: const struct sqlite3_index_constraint *pConstraint;
! 2954: pConstraint = &pInfo->aConstraint[i];
! 2955: if( pConstraint->usable ) {
! 2956: if( pConstraint->iColumn==-1 &&
! 2957: pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ ){
! 2958: pInfo->idxNum = QUERY_ROWID; /* lookup by rowid */
! 2959: TRACE(("FTS2 QUERY_ROWID\n"));
! 2960: } else if( pConstraint->iColumn>=0 &&
! 2961: pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH ){
! 2962: /* full-text search */
! 2963: pInfo->idxNum = QUERY_FULLTEXT + pConstraint->iColumn;
! 2964: TRACE(("FTS2 QUERY_FULLTEXT %d\n", pConstraint->iColumn));
! 2965: } else continue;
! 2966:
! 2967: pInfo->aConstraintUsage[i].argvIndex = 1;
! 2968: pInfo->aConstraintUsage[i].omit = 1;
! 2969:
! 2970: /* An arbitrary value for now.
! 2971: * TODO: Perhaps rowid matches should be considered cheaper than
! 2972: * full-text searches. */
! 2973: pInfo->estimatedCost = 1.0;
! 2974:
! 2975: return SQLITE_OK;
! 2976: }
! 2977: }
! 2978: pInfo->idxNum = QUERY_GENERIC;
! 2979: return SQLITE_OK;
! 2980: }
! 2981:
! 2982: static int fulltextDisconnect(sqlite3_vtab *pVTab){
! 2983: TRACE(("FTS2 Disconnect %p\n", pVTab));
! 2984: fulltext_vtab_destroy((fulltext_vtab *)pVTab);
! 2985: return SQLITE_OK;
! 2986: }
! 2987:
! 2988: static int fulltextDestroy(sqlite3_vtab *pVTab){
! 2989: fulltext_vtab *v = (fulltext_vtab *)pVTab;
! 2990: int rc;
! 2991:
! 2992: TRACE(("FTS2 Destroy %p\n", pVTab));
! 2993: rc = sql_exec(v->db, v->zDb, v->zName,
! 2994: "drop table if exists %_content;"
! 2995: "drop table if exists %_segments;"
! 2996: "drop table if exists %_segdir;"
! 2997: );
! 2998: if( rc!=SQLITE_OK ) return rc;
! 2999:
! 3000: fulltext_vtab_destroy((fulltext_vtab *)pVTab);
! 3001: return SQLITE_OK;
! 3002: }
! 3003:
! 3004: static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
! 3005: fulltext_cursor *c;
! 3006:
! 3007: c = (fulltext_cursor *) sqlite3_malloc(sizeof(fulltext_cursor));
! 3008: if( c ){
! 3009: memset(c, 0, sizeof(fulltext_cursor));
! 3010: /* sqlite will initialize c->base */
! 3011: *ppCursor = &c->base;
! 3012: TRACE(("FTS2 Open %p: %p\n", pVTab, c));
! 3013: return SQLITE_OK;
! 3014: }else{
! 3015: return SQLITE_NOMEM;
! 3016: }
! 3017: }
! 3018:
! 3019:
! 3020: /* Free all of the dynamically allocated memory held by *q
! 3021: */
! 3022: static void queryClear(Query *q){
! 3023: int i;
! 3024: for(i = 0; i < q->nTerms; ++i){
! 3025: sqlite3_free(q->pTerms[i].pTerm);
! 3026: }
! 3027: sqlite3_free(q->pTerms);
! 3028: CLEAR(q);
! 3029: }
! 3030:
! 3031: /* Free all of the dynamically allocated memory held by the
! 3032: ** Snippet
! 3033: */
! 3034: static void snippetClear(Snippet *p){
! 3035: sqlite3_free(p->aMatch);
! 3036: sqlite3_free(p->zOffset);
! 3037: sqlite3_free(p->zSnippet);
! 3038: CLEAR(p);
! 3039: }
! 3040: /*
! 3041: ** Append a single entry to the p->aMatch[] log.
! 3042: */
! 3043: static void snippetAppendMatch(
! 3044: Snippet *p, /* Append the entry to this snippet */
! 3045: int iCol, int iTerm, /* The column and query term */
! 3046: int iStart, int nByte /* Offset and size of the match */
! 3047: ){
! 3048: int i;
! 3049: struct snippetMatch *pMatch;
! 3050: if( p->nMatch+1>=p->nAlloc ){
! 3051: p->nAlloc = p->nAlloc*2 + 10;
! 3052: p->aMatch = sqlite3_realloc(p->aMatch, p->nAlloc*sizeof(p->aMatch[0]) );
! 3053: if( p->aMatch==0 ){
! 3054: p->nMatch = 0;
! 3055: p->nAlloc = 0;
! 3056: return;
! 3057: }
! 3058: }
! 3059: i = p->nMatch++;
! 3060: pMatch = &p->aMatch[i];
! 3061: pMatch->iCol = iCol;
! 3062: pMatch->iTerm = iTerm;
! 3063: pMatch->iStart = iStart;
! 3064: pMatch->nByte = nByte;
! 3065: }
! 3066:
! 3067: /*
! 3068: ** Sizing information for the circular buffer used in snippetOffsetsOfColumn()
! 3069: */
! 3070: #define FTS2_ROTOR_SZ (32)
! 3071: #define FTS2_ROTOR_MASK (FTS2_ROTOR_SZ-1)
! 3072:
! 3073: /*
! 3074: ** Add entries to pSnippet->aMatch[] for every match that occurs against
! 3075: ** document zDoc[0..nDoc-1] which is stored in column iColumn. The document is run through the table's tokenizer and every token is compared with the query terms. At most FTS2_ROTOR_SZ-1 query terms are examined. If the tokenizer cursor cannot be opened the routine returns without adding anything.
! 3076: */
! 3077: static void snippetOffsetsOfColumn(
! 3078: Query *pQuery,
! 3079: Snippet *pSnippet,
! 3080: int iColumn,
! 3081: const char *zDoc,
! 3082: int nDoc
! 3083: ){
! 3084: const sqlite3_tokenizer_module *pTModule; /* The tokenizer module */
! 3085: sqlite3_tokenizer *pTokenizer; /* The specific tokenizer */
! 3086: sqlite3_tokenizer_cursor *pTCursor; /* Tokenizer cursor */
! 3087: fulltext_vtab *pVtab; /* The full text index */
! 3088: int nColumn; /* Number of columns in the index */
! 3089: const QueryTerm *aTerm; /* Query string terms */
! 3090: int nTerm; /* Number of query string terms */
! 3091: int i, j; /* Loop counters */
! 3092: int rc; /* Return code */
! 3093: unsigned int match, prevMatch; /* Phrase search bitmasks: bit i of match means term i matched the current token; prevMatch is the previous token's mask shifted left by one */
! 3094: const char *zToken; /* Next token from the tokenizer */
! 3095: int nToken; /* Size of zToken */
! 3096: int iBegin, iEnd, iPos; /* Offsets of beginning and end */
! 3097:
! 3098: /* The following variables keep a circular buffer of the last
! 3099: ** few tokens */
! 3100: unsigned int iRotor = 0; /* Index of current token */
! 3101: int iRotorBegin[FTS2_ROTOR_SZ]; /* Beginning offset of token */
! 3102: int iRotorLen[FTS2_ROTOR_SZ]; /* Length of token */
! 3103:
! 3104: pVtab = pQuery->pFts;
! 3105: nColumn = pVtab->nColumn;
! 3106: pTokenizer = pVtab->pTokenizer;
! 3107: pTModule = pTokenizer->pModule;
! 3108: rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor);
! 3109: if( rc ) return; /* Tokenizer could not be opened: nothing to record */
! 3110: pTCursor->pTokenizer = pTokenizer;
! 3111: aTerm = pQuery->pTerms;
! 3112: nTerm = pQuery->nTerms;
! 3113: if( nTerm>=FTS2_ROTOR_SZ ){ /* Cap the number of terms examined; presumably so that 1<<i stays inside the bitmasks */
! 3114: nTerm = FTS2_ROTOR_SZ - 1;
! 3115: }
! 3116: prevMatch = 0;
! 3117: while(1){
! 3118: rc = pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos);
! 3119: if( rc ) break; /* Any non-zero result from xNext ends the scan */
! 3120: iRotorBegin[iRotor&FTS2_ROTOR_MASK] = iBegin; /* Remember where this token sits in the document */
! 3121: iRotorLen[iRotor&FTS2_ROTOR_MASK] = iEnd-iBegin;
! 3122: match = 0;
! 3123: for(i=0; i<nTerm; i++){
! 3124: int iCol;
! 3125: iCol = aTerm[i].iColumn;
! 3126: if( iCol>=0 && iCol<nColumn && iCol!=iColumn ) continue; /* Term is restricted to a different column */
! 3127: if( aTerm[i].nTerm>nToken ) continue; /* Token is too short to match */
! 3128: if( !aTerm[i].isPrefix && aTerm[i].nTerm<nToken ) continue; /* Non-prefix terms need an exact length match */
! 3129: assert( aTerm[i].nTerm<=nToken );
! 3130: if( memcmp(aTerm[i].pTerm, zToken, aTerm[i].nTerm) ) continue;
! 3131: if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue; /* Later phrase terms count only if the preceding term matched the previous token */
! 3132: match |= 1<<i;
! 3133: if( i==nTerm-1 || aTerm[i+1].iPhrase==1 ){ /* Term i is the last term of its phrase (or of the examined terms) */
! 3134: for(j=aTerm[i].iPhrase-1; j>=0; j--){ /* Log one match per term of the phrase, earliest token first */
! 3135: int k = (iRotor-j) & FTS2_ROTOR_MASK; /* Rotor slot of the token seen j tokens ago */
! 3136: snippetAppendMatch(pSnippet, iColumn, i-j,
! 3137: iRotorBegin[k], iRotorLen[k]);
! 3138: }
! 3139: }
! 3140: }
! 3141: prevMatch = match<<1; /* Bit i+1 set means term i matched this token */
! 3142: iRotor++;
! 3143: }
! 3144: pTModule->xClose(pTCursor);
! 3145: }
! 3146:
! 3147:
! 3148: /*
! 3149: ** Compute all offsets for the current row of the query.
! 3150: ** If the offsets have already been computed, this routine is a no-op.
! 3151: */
! 3152: static void snippetAllOffsets(fulltext_cursor *p){
! 3153: int nColumn;
! 3154: int iColumn, i;
! 3155: int iFirst, iLast;
! 3156: fulltext_vtab *pFts;
! 3157:
! 3158: if( p->snippet.nMatch ) return;
! 3159: if( p->q.nTerms==0 ) return;
! 3160: pFts = p->q.pFts;
! 3161: nColumn = pFts->nColumn;
! 3162: iColumn = (p->iCursorType - QUERY_FULLTEXT);
! 3163: if( iColumn<0 || iColumn>=nColumn ){
! 3164: iFirst = 0;
! 3165: iLast = nColumn-1;
! 3166: }else{
! 3167: iFirst = iColumn;
! 3168: iLast = iColumn;
! 3169: }
! 3170: for(i=iFirst; i<=iLast; i++){
! 3171: const char *zDoc;
! 3172: int nDoc;
! 3173: zDoc = (const char*)sqlite3_column_text(p->pStmt, i+1);
! 3174: nDoc = sqlite3_column_bytes(p->pStmt, i+1);
! 3175: snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc);
! 3176: }
! 3177: }
! 3178:
! 3179: /*
! 3180: ** Convert the information in the aMatch[] array of the snippet
! 3181: ** into the string zOffset[0..nOffset-1].
! 3182: */
! 3183: static void snippetOffsetText(Snippet *p){
! 3184: int i;
! 3185: int cnt = 0;
! 3186: StringBuffer sb;
! 3187: char zBuf[200];
! 3188: if( p->zOffset ) return;
! 3189: initStringBuffer(&sb);
! 3190: for(i=0; i<p->nMatch; i++){
! 3191: struct snippetMatch *pMatch = &p->aMatch[i];
! 3192: zBuf[0] = ' ';
! 3193: sqlite3_snprintf(sizeof(zBuf)-1, &zBuf[cnt>0], "%d %d %d %d",
! 3194: pMatch->iCol, pMatch->iTerm, pMatch->iStart, pMatch->nByte);
! 3195: append(&sb, zBuf);
! 3196: cnt++;
! 3197: }
! 3198: p->zOffset = stringBufferData(&sb);
! 3199: p->nOffset = stringBufferLength(&sb);
! 3200: }
! 3201:
! 3202: /*
! 3203: ** zDoc[0..nDoc-1] is phrase of text. aMatch[0..nMatch-1] are a set
! 3204: ** of matching words some of which might be in zDoc. zDoc is column
! 3205: ** number iCol.
! 3206: **
! 3207: ** iBreak is suggested spot in zDoc where we could begin or end an
! 3208: ** excerpt. Return a value similar to iBreak but possibly adjusted
! 3209: ** to be a little left or right so that the break point is better.
! 3210: */
! 3211: static int wordBoundary(
! 3212: int iBreak, /* The suggested break point */
! 3213: const char *zDoc, /* Document text */
! 3214: int nDoc, /* Number of bytes in zDoc[] */
! 3215: struct snippetMatch *aMatch, /* Matching words */
! 3216: int nMatch, /* Number of entries in aMatch[] */
! 3217: int iCol /* The column number for zDoc[] */
! 3218: ){
! 3219: int i;
! 3220: if( iBreak<=10 ){
! 3221: return 0;
! 3222: }
! 3223: if( iBreak>=nDoc-10 ){
! 3224: return nDoc;
! 3225: }
! 3226: for(i=0; i<nMatch && aMatch[i].iCol<iCol; i++){}
! 3227: while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; }
! 3228: if( i<nMatch ){
! 3229: if( aMatch[i].iStart<iBreak+10 ){
! 3230: return aMatch[i].iStart;
! 3231: }
! 3232: if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
! 3233: return aMatch[i-1].iStart;
! 3234: }
! 3235: }
! 3236: for(i=1; i<=10; i++){
! 3237: if( safe_isspace(zDoc[iBreak-i]) ){
! 3238: return iBreak - i + 1;
! 3239: }
! 3240: if( safe_isspace(zDoc[iBreak+i]) ){
! 3241: return iBreak + i + 1;
! 3242: }
! 3243: }
! 3244: return iBreak;
! 3245: }
! 3246:
! 3247:
! 3248:
! 3249: /*
! 3250: ** Allowed values for Snippet.aMatch[].snStatus
! 3251: */
! 3252: #define SNIPPET_IGNORE 0 /* It is ok to omit this match from the snippet */
! 3253: #define SNIPPET_DESIRED 1 /* We want to include this match in the snippet */
! 3254:
! 3255: /*
! 3256: ** Generate the text of a snippet. The result replaces pCursor->snippet.zSnippet (with its length in nSnippet). It is assembled from excerpts of roughly 40 bytes either side of selected matches in pCursor->snippet.aMatch[]; each match shown is wrapped in zStartMark/zEndMark and zEllipsis marks text that was left out.
! 3257: */
! 3258: static void snippetText(
! 3259: fulltext_cursor *pCursor, /* The cursor we need the snippet for */
! 3260: const char *zStartMark, /* Markup to appear before each match */
! 3261: const char *zEndMark, /* Markup to appear after each match */
! 3262: const char *zEllipsis /* Ellipsis mark */
! 3263: ){
! 3264: int i, j;
! 3265: struct snippetMatch *aMatch;
! 3266: int nMatch;
! 3267: int nDesired; /* Number of matches still marked SNIPPET_DESIRED */
! 3268: StringBuffer sb; /* The snippet text being assembled */
! 3269: int tailCol; /* Column of the previous excerpt, or -1 before the first */
! 3270: int tailOffset; /* Offset at which the previous excerpt ended */
! 3271: int iCol;
! 3272: int nDoc;
! 3273: const char *zDoc;
! 3274: int iStart, iEnd; /* Bounds of the excerpt currently being emitted */
! 3275: int tailEllipsis = 0; /* True if the last excerpt stopped short of the end of its document */
! 3276: int iMatch; /* Cursor into aMatch[] used while emitting excerpt text */
! 3277:
! 3278:
! 3279: sqlite3_free(pCursor->snippet.zSnippet);
! 3280: pCursor->snippet.zSnippet = 0;
! 3281: aMatch = pCursor->snippet.aMatch;
! 3282: nMatch = pCursor->snippet.nMatch;
! 3283: initStringBuffer(&sb);
! 3284:
! 3285: for(i=0; i<nMatch; i++){
! 3286: aMatch[i].snStatus = SNIPPET_IGNORE;
! 3287: }
! 3288: nDesired = 0;
! 3289: for(i=0; i<pCursor->q.nTerms; i++){ /* Mark the first match of every query term as desired */
! 3290: for(j=0; j<nMatch; j++){
! 3291: if( aMatch[j].iTerm==i ){
! 3292: aMatch[j].snStatus = SNIPPET_DESIRED;
! 3293: nDesired++;
! 3294: break;
! 3295: }
! 3296: }
! 3297: }
! 3298:
! 3299: iMatch = 0;
! 3300: tailCol = -1;
! 3301: tailOffset = 0;
! 3302: for(i=0; i<nMatch && nDesired>0; i++){ /* One excerpt per match that is still desired */
! 3303: if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue;
! 3304: nDesired--;
! 3305: iCol = aMatch[i].iCol;
! 3306: zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1);
! 3307: nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1);
! 3308: iStart = aMatch[i].iStart - 40; /* Aim to begin about 40 bytes before the match */
! 3309: iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol);
! 3310: if( iStart<=10 ){
! 3311: iStart = 0;
! 3312: }
! 3313: if( iCol==tailCol && iStart<=tailOffset+20 ){ /* Close to the previous excerpt in the same column: continue from where it ended */
! 3314: iStart = tailOffset;
! 3315: }
! 3316: if( (iCol!=tailCol && tailCol>=0) || iStart!=tailOffset ){ /* Text is being skipped: separate with an ellipsis */
! 3317: trimWhiteSpace(&sb);
! 3318: appendWhiteSpace(&sb);
! 3319: append(&sb, zEllipsis);
! 3320: appendWhiteSpace(&sb);
! 3321: }
! 3322: iEnd = aMatch[i].iStart + aMatch[i].nByte + 40; /* Aim to end about 40 bytes after the match */
! 3323: iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol);
! 3324: if( iEnd>=nDoc-10 ){
! 3325: iEnd = nDoc;
! 3326: tailEllipsis = 0;
! 3327: }else{
! 3328: tailEllipsis = 1;
! 3329: }
! 3330: while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; }
! 3331: while( iStart<iEnd ){ /* Copy zDoc[iStart..iEnd-1], wrapping each match found inside it */
! 3332: while( iMatch<nMatch && aMatch[iMatch].iStart<iStart
! 3333: && aMatch[iMatch].iCol<=iCol ){
! 3334: iMatch++;
! 3335: }
! 3336: if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd
! 3337: && aMatch[iMatch].iCol==iCol ){
! 3338: nappend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart); /* Plain text leading up to the match */
! 3339: iStart = aMatch[iMatch].iStart;
! 3340: append(&sb, zStartMark);
! 3341: nappend(&sb, &zDoc[iStart], aMatch[iMatch].nByte);
! 3342: append(&sb, zEndMark);
! 3343: iStart += aMatch[iMatch].nByte;
! 3344: for(j=iMatch+1; j<nMatch; j++){ /* This term has now been shown: later desired matches of the same term are no longer needed */
! 3345: if( aMatch[j].iTerm==aMatch[iMatch].iTerm
! 3346: && aMatch[j].snStatus==SNIPPET_DESIRED ){
! 3347: nDesired--;
! 3348: aMatch[j].snStatus = SNIPPET_IGNORE;
! 3349: }
! 3350: }
! 3351: }else{
! 3352: nappend(&sb, &zDoc[iStart], iEnd - iStart); /* No further match in range: copy the rest of the excerpt */
! 3353: iStart = iEnd;
! 3354: }
! 3355: }
! 3356: tailCol = iCol;
! 3357: tailOffset = iEnd;
! 3358: }
! 3359: trimWhiteSpace(&sb);
! 3360: if( tailEllipsis ){
! 3361: appendWhiteSpace(&sb);
! 3362: append(&sb, zEllipsis);
! 3363: }
! 3364: pCursor->snippet.zSnippet = stringBufferData(&sb);
! 3365: pCursor->snippet.nSnippet = stringBufferLength(&sb);
! 3366: }
! 3367:
! 3368:
! 3369: /*
! 3370: ** Close the cursor. For additional information see the documentation
! 3371: ** on the xClose method of the virtual table interface.
! 3372: */
! 3373: static int fulltextClose(sqlite3_vtab_cursor *pCursor){
! 3374: fulltext_cursor *c = (fulltext_cursor *) pCursor;
! 3375: TRACE(("FTS2 Close %p\n", c));
! 3376: sqlite3_finalize(c->pStmt);
! 3377: queryClear(&c->q);
! 3378: snippetClear(&c->snippet);
! 3379: if( c->result.nData!=0 ) dlrDestroy(&c->reader);
! 3380: dataBufferDestroy(&c->result);
! 3381: sqlite3_free(c);
! 3382: return SQLITE_OK;
! 3383: }
! 3384:
! 3385: static int fulltextNext(sqlite3_vtab_cursor *pCursor){
! 3386: fulltext_cursor *c = (fulltext_cursor *) pCursor;
! 3387: int rc;
! 3388:
! 3389: TRACE(("FTS2 Next %p\n", pCursor));
! 3390: snippetClear(&c->snippet);
! 3391: if( c->iCursorType < QUERY_FULLTEXT ){
! 3392: /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
! 3393: rc = sqlite3_step(c->pStmt);
! 3394: switch( rc ){
! 3395: case SQLITE_ROW:
! 3396: c->eof = 0;
! 3397: return SQLITE_OK;
! 3398: case SQLITE_DONE:
! 3399: c->eof = 1;
! 3400: return SQLITE_OK;
! 3401: default:
! 3402: c->eof = 1;
! 3403: return rc;
! 3404: }
! 3405: } else { /* full-text query */
! 3406: rc = sqlite3_reset(c->pStmt);
! 3407: if( rc!=SQLITE_OK ) return rc;
! 3408:
! 3409: if( c->result.nData==0 || dlrAtEnd(&c->reader) ){
! 3410: c->eof = 1;
! 3411: return SQLITE_OK;
! 3412: }
! 3413: rc = sqlite3_bind_int64(c->pStmt, 1, dlrDocid(&c->reader));
! 3414: dlrStep(&c->reader);
! 3415: if( rc!=SQLITE_OK ) return rc;
! 3416: /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
! 3417: rc = sqlite3_step(c->pStmt);
! 3418: if( rc==SQLITE_ROW ){ /* the case we expect */
! 3419: c->eof = 0;
! 3420: return SQLITE_OK;
! 3421: }
! 3422: /* an error occurred; abort */
! 3423: return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
! 3424: }
! 3425: }
! 3426:
! 3427:
! 3428: /* TODO(shess) If we pushed LeafReader to the top of the file, or to
! 3429: ** another file, term_select() could be pushed above
! 3430: ** docListOfTerm().
! 3431: */
! 3432: static int termSelect(fulltext_vtab *v, int iColumn,
! 3433: const char *pTerm, int nTerm, int isPrefix,
! 3434: DocListType iType, DataBuffer *out);
! 3435:
! 3436: /* Return a DocList corresponding to the query term *pTerm. If *pTerm
! 3437: ** is the first term of a phrase query, go ahead and evaluate the phrase
! 3438: ** query and return the doclist for the entire phrase query.
! 3439: **
! 3440: ** The resulting DL_DOCIDS doclist is stored in pResult, which is
! 3441: ** overwritten.
! 3442: */
! 3443: static int docListOfTerm(
! 3444: fulltext_vtab *v, /* The full text index */
! 3445: int iColumn, /* column to restrict to. No restriction if >=nColumn */
! 3446: QueryTerm *pQTerm, /* Term we are looking for, or 1st term of a phrase */
! 3447: DataBuffer *pResult /* Write the result here */
! 3448: ){
! 3449: DataBuffer left, right, new;
! 3450: int i, rc;
! 3451:
! 3452: /* No phrase search if no position info. */
! 3453: assert( pQTerm->nPhrase==0 || DL_DEFAULT!=DL_DOCIDS );
! 3454:
! 3455: /* This code should never be called with buffered updates. */
! 3456: assert( v->nPendingData<0 );
! 3457:
! 3458: dataBufferInit(&left, 0);
! 3459: rc = termSelect(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pQTerm->isPrefix,
! 3460: 0<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS, &left);
! 3461: if( rc ) return rc;
! 3462: for(i=1; i<=pQTerm->nPhrase && left.nData>0; i++){
! 3463: dataBufferInit(&right, 0);
! 3464: rc = termSelect(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm,
! 3465: pQTerm[i].isPrefix, DL_POSITIONS, &right);
! 3466: if( rc ){
! 3467: dataBufferDestroy(&left);
! 3468: return rc;
! 3469: }
! 3470: dataBufferInit(&new, 0);
! 3471: docListPhraseMerge(left.pData, left.nData, right.pData, right.nData,
! 3472: i<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS, &new);
! 3473: dataBufferDestroy(&left);
! 3474: dataBufferDestroy(&right);
! 3475: left = new;
! 3476: }
! 3477: *pResult = left;
! 3478: return SQLITE_OK;
! 3479: }
! 3480:
! 3481: /* Add a new term pTerm[0..nTerm-1] to the query *q.
! 3482: */
! 3483: static void queryAdd(Query *q, const char *pTerm, int nTerm){
! 3484: QueryTerm *t;
! 3485: ++q->nTerms;
! 3486: q->pTerms = sqlite3_realloc(q->pTerms, q->nTerms * sizeof(q->pTerms[0]));
! 3487: if( q->pTerms==0 ){
! 3488: q->nTerms = 0;
! 3489: return;
! 3490: }
! 3491: t = &q->pTerms[q->nTerms - 1];
! 3492: CLEAR(t);
! 3493: t->pTerm = sqlite3_malloc(nTerm+1);
! 3494: memcpy(t->pTerm, pTerm, nTerm);
! 3495: t->pTerm[nTerm] = 0;
! 3496: t->nTerm = nTerm;
! 3497: t->isOr = q->nextIsOr;
! 3498: t->isPrefix = 0;
! 3499: q->nextIsOr = 0;
! 3500: t->iColumn = q->nextColumn;
! 3501: q->nextColumn = q->dfltColumn;
! 3502: }
! 3503:
! 3504: /*
! 3505: ** Check to see if the string zToken[0...nToken-1] matches any
! 3506: ** column name in the virtual table. If it does,
! 3507: ** return the zero-indexed column number. If not, return -1.
! 3508: */
! 3509: static int checkColumnSpecifier(
! 3510: fulltext_vtab *pVtab, /* The virtual table */
! 3511: const char *zToken, /* Text of the token */
! 3512: int nToken /* Number of characters in the token */
! 3513: ){
! 3514: int i;
! 3515: for(i=0; i<pVtab->nColumn; i++){
! 3516: if( memcmp(pVtab->azColumn[i], zToken, nToken)==0
! 3517: && pVtab->azColumn[i][nToken]==0 ){
! 3518: return i;
! 3519: }
! 3520: }
! 3521: return -1;
! 3522: }
! 3523:
! 3524: /*
! 3525: ** Parse the text at pSegment[0..nSegment-1]. Add additional terms
! 3526: ** to the query being assemblied in pQuery.
! 3527: **
! 3528: ** inPhrase is true if pSegment[0..nSegement-1] is contained within
! 3529: ** double-quotes. If inPhrase is true, then the first term
! 3530: ** is marked with the number of terms in the phrase less one and
! 3531: ** OR and "-" syntax is ignored. If inPhrase is false, then every
! 3532: ** term found is marked with nPhrase=0 and OR and "-" syntax is significant.
! 3533: */
! 3534: static int tokenizeSegment(
! 3535: sqlite3_tokenizer *pTokenizer, /* The tokenizer to use */
! 3536: const char *pSegment, int nSegment, /* Query expression being parsed */
! 3537: int inPhrase, /* True if within "..." */
! 3538: Query *pQuery /* Append results here */
! 3539: ){
! 3540: const sqlite3_tokenizer_module *pModule = pTokenizer->pModule;
! 3541: sqlite3_tokenizer_cursor *pCursor;
! 3542: int firstIndex = pQuery->nTerms;
! 3543: int iCol;
! 3544: int nTerm = 1;
! 3545:
! 3546: int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor);
! 3547: if( rc!=SQLITE_OK ) return rc;
! 3548: pCursor->pTokenizer = pTokenizer;
! 3549:
! 3550: while( 1 ){
! 3551: const char *pToken;
! 3552: int nToken, iBegin, iEnd, iPos;
! 3553:
! 3554: rc = pModule->xNext(pCursor,
! 3555: &pToken, &nToken,
! 3556: &iBegin, &iEnd, &iPos);
! 3557: if( rc!=SQLITE_OK ) break;
! 3558: if( !inPhrase &&
! 3559: pSegment[iEnd]==':' &&
! 3560: (iCol = checkColumnSpecifier(pQuery->pFts, pToken, nToken))>=0 ){
! 3561: pQuery->nextColumn = iCol;
! 3562: continue;
! 3563: }
! 3564: if( !inPhrase && pQuery->nTerms>0 && nToken==2
! 3565: && pSegment[iBegin]=='O' && pSegment[iBegin+1]=='R' ){
! 3566: pQuery->nextIsOr = 1;
! 3567: continue;
! 3568: }
! 3569: queryAdd(pQuery, pToken, nToken);
! 3570: if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){
! 3571: pQuery->pTerms[pQuery->nTerms-1].isNot = 1;
! 3572: }
! 3573: if( iEnd<nSegment && pSegment[iEnd]=='*' ){
! 3574: pQuery->pTerms[pQuery->nTerms-1].isPrefix = 1;
! 3575: }
! 3576: pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm;
! 3577: if( inPhrase ){
! 3578: nTerm++;
! 3579: }
! 3580: }
! 3581:
! 3582: if( inPhrase && pQuery->nTerms>firstIndex ){
! 3583: pQuery->pTerms[firstIndex].nPhrase = pQuery->nTerms - firstIndex - 1;
! 3584: }
! 3585:
! 3586: return pModule->xClose(pCursor);
! 3587: }
! 3588:
! 3589: /* Parse a query string, yielding a Query object pQuery.
! 3590: **
! 3591: ** The calling function will need to queryClear() to clean up
! 3592: ** the dynamically allocated memory held by pQuery.
! 3593: */
! 3594: static int parseQuery(
! 3595: fulltext_vtab *v, /* The fulltext index */
! 3596: const char *zInput, /* Input text of the query string */
! 3597: int nInput, /* Size of the input text */
! 3598: int dfltColumn, /* Default column of the index to match against */
! 3599: Query *pQuery /* Write the parse results here. */
! 3600: ){
! 3601: int iInput, inPhrase = 0;
! 3602:
! 3603: if( zInput==0 ) nInput = 0;
! 3604: if( nInput<0 ) nInput = strlen(zInput);
! 3605: pQuery->nTerms = 0;
! 3606: pQuery->pTerms = NULL;
! 3607: pQuery->nextIsOr = 0;
! 3608: pQuery->nextColumn = dfltColumn;
! 3609: pQuery->dfltColumn = dfltColumn;
! 3610: pQuery->pFts = v;
! 3611:
! 3612: for(iInput=0; iInput<nInput; ++iInput){
! 3613: int i;
! 3614: for(i=iInput; i<nInput && zInput[i]!='"'; ++i){}
! 3615: if( i>iInput ){
! 3616: tokenizeSegment(v->pTokenizer, zInput+iInput, i-iInput, inPhrase,
! 3617: pQuery);
! 3618: }
! 3619: iInput = i;
! 3620: if( i<nInput ){
! 3621: assert( zInput[i]=='"' );
! 3622: inPhrase = !inPhrase;
! 3623: }
! 3624: }
! 3625:
! 3626: if( inPhrase ){
! 3627: /* unmatched quote */
! 3628: queryClear(pQuery);
! 3629: return SQLITE_ERROR;
! 3630: }
! 3631: return SQLITE_OK;
! 3632: }
! 3633:
! 3634: /* TODO(shess) Refactor the code to remove this forward decl. */
! 3635: static int flushPendingTerms(fulltext_vtab *v);
! 3636:
! 3637: /* Perform a full-text query using the search expression in
! 3638: ** zInput[0..nInput-1]. Return a list of matching documents
! 3639: ** in pResult.
! 3640: **
! 3641: ** Queries must match column iColumn. Or if iColumn>=nColumn
! 3642: ** they are allowed to match against any column.
! 3643: */
! 3644: static int fulltextQuery(
! 3645: fulltext_vtab *v, /* The full text index */
! 3646: int iColumn, /* Match against this column by default */
! 3647: const char *zInput, /* The query string */
! 3648: int nInput, /* Number of bytes in zInput[] */
! 3649: DataBuffer *pResult, /* Write the result doclist here */
! 3650: Query *pQuery /* Put parsed query string here */
! 3651: ){
! 3652: int i, iNext, rc;
! 3653: DataBuffer left, right, or, new;
! 3654: int nNot = 0;
! 3655: QueryTerm *aTerm;
! 3656:
! 3657: /* TODO(shess) Instead of flushing pendingTerms, we could query for
! 3658: ** the relevant term and merge the doclist into what we receive from
! 3659: ** the database. Wait and see if this is a common issue, first.
! 3660: **
! 3661: ** A good reason not to flush is to not generate update-related
! 3662: ** error codes from here.
! 3663: */
! 3664:
! 3665: /* Flush any buffered updates before executing the query. */
! 3666: rc = flushPendingTerms(v);
! 3667: if( rc!=SQLITE_OK ) return rc;
! 3668:
! 3669: /* TODO(shess) I think that the queryClear() calls below are not
! 3670: ** necessary, because fulltextClose() already clears the query.
! 3671: */
! 3672: rc = parseQuery(v, zInput, nInput, iColumn, pQuery);
! 3673: if( rc!=SQLITE_OK ) return rc;
! 3674:
! 3675: /* Empty or NULL queries return no results. */
! 3676: if( pQuery->nTerms==0 ){
! 3677: dataBufferInit(pResult, 0);
! 3678: return SQLITE_OK;
! 3679: }
! 3680:
! 3681: /* Merge AND terms. */
! 3682: /* TODO(shess) I think we can early-exit if( i>nNot && left.nData==0 ). */
! 3683: aTerm = pQuery->pTerms;
! 3684: for(i = 0; i<pQuery->nTerms; i=iNext){
! 3685: if( aTerm[i].isNot ){
! 3686: /* Handle all NOT terms in a separate pass */
! 3687: nNot++;
! 3688: iNext = i + aTerm[i].nPhrase+1;
! 3689: continue;
! 3690: }
! 3691: iNext = i + aTerm[i].nPhrase + 1;
! 3692: rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right);
! 3693: if( rc ){
! 3694: if( i!=nNot ) dataBufferDestroy(&left);
! 3695: queryClear(pQuery);
! 3696: return rc;
! 3697: }
! 3698: while( iNext<pQuery->nTerms && aTerm[iNext].isOr ){
! 3699: rc = docListOfTerm(v, aTerm[iNext].iColumn, &aTerm[iNext], &or);
! 3700: iNext += aTerm[iNext].nPhrase + 1;
! 3701: if( rc ){
! 3702: if( i!=nNot ) dataBufferDestroy(&left);
! 3703: dataBufferDestroy(&right);
! 3704: queryClear(pQuery);
! 3705: return rc;
! 3706: }
! 3707: dataBufferInit(&new, 0);
! 3708: docListOrMerge(right.pData, right.nData, or.pData, or.nData, &new);
! 3709: dataBufferDestroy(&right);
! 3710: dataBufferDestroy(&or);
! 3711: right = new;
! 3712: }
! 3713: if( i==nNot ){ /* first term processed. */
! 3714: left = right;
! 3715: }else{
! 3716: dataBufferInit(&new, 0);
! 3717: docListAndMerge(left.pData, left.nData, right.pData, right.nData, &new);
! 3718: dataBufferDestroy(&right);
! 3719: dataBufferDestroy(&left);
! 3720: left = new;
! 3721: }
! 3722: }
! 3723:
! 3724: if( nNot==pQuery->nTerms ){
! 3725: /* We do not yet know how to handle a query of only NOT terms */
! 3726: return SQLITE_ERROR;
! 3727: }
! 3728:
! 3729: /* Do the EXCEPT terms */
! 3730: for(i=0; i<pQuery->nTerms; i += aTerm[i].nPhrase + 1){
! 3731: if( !aTerm[i].isNot ) continue;
! 3732: rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right);
! 3733: if( rc ){
! 3734: queryClear(pQuery);
! 3735: dataBufferDestroy(&left);
! 3736: return rc;
! 3737: }
! 3738: dataBufferInit(&new, 0);
! 3739: docListExceptMerge(left.pData, left.nData, right.pData, right.nData, &new);
! 3740: dataBufferDestroy(&right);
! 3741: dataBufferDestroy(&left);
! 3742: left = new;
! 3743: }
! 3744:
! 3745: *pResult = left;
! 3746: return rc;
! 3747: }
! 3748:
! 3749: /*
! 3750: ** This is the xFilter interface for the virtual table. See
! 3751: ** the virtual table xFilter method documentation for additional
! 3752: ** information.
! 3753: **
! 3754: ** If idxNum==QUERY_GENERIC then do a full table scan against
! 3755: ** the %_content table.
! 3756: **
! 3757: ** If idxNum==QUERY_ROWID then do a rowid lookup for a single entry
! 3758: ** in the %_content table.
! 3759: **
! 3760: ** If idxNum>=QUERY_FULLTEXT then use the full text index. The
! 3761: ** column on the left-hand side of the MATCH operator is column
! 3762: ** number idxNum-QUERY_FULLTEXT, 0 indexed. argv[0] is the right-hand
! 3763: ** side of the MATCH operator.
! 3764: */
! 3765: /* TODO(shess) Upgrade the cursor initialization and destruction to
! 3766: ** account for fulltextFilter() being called multiple times on the
! 3767: ** same cursor. The current solution is very fragile. Apply fix to
! 3768: ** fts2 as appropriate.
! 3769: */
! 3770: static int fulltextFilter(
! 3771: sqlite3_vtab_cursor *pCursor, /* The cursor used for this query */
! 3772: int idxNum, const char *idxStr, /* Which indexing scheme to use */
! 3773: int argc, sqlite3_value **argv /* Arguments for the indexing scheme */
! 3774: ){
! 3775: fulltext_cursor *c = (fulltext_cursor *) pCursor;
! 3776: fulltext_vtab *v = cursor_vtab(c);
! 3777: int rc;
! 3778:
! 3779: TRACE(("FTS2 Filter %p\n",pCursor));
! 3780:
! 3781: /* If the cursor has a statement that was not prepared according to
! 3782: ** idxNum, clear it. I believe all calls to fulltextFilter with a
! 3783: ** given cursor will have the same idxNum , but in this case it's
! 3784: ** easy to be safe.
! 3785: */
! 3786: if( c->pStmt && c->iCursorType!=idxNum ){
! 3787: sqlite3_finalize(c->pStmt);
! 3788: c->pStmt = NULL;
! 3789: }
! 3790:
! 3791: /* Get a fresh statement appropriate to idxNum. */
! 3792: /* TODO(shess): Add a prepared-statement cache in the vt structure.
! 3793: ** The cache must handle multiple open cursors. Easier to cache the
! 3794: ** statement variants at the vt to reduce malloc/realloc/free here.
! 3795: ** Or we could have a StringBuffer variant which allowed stack
! 3796: ** construction for small values.
! 3797: */
! 3798: if( !c->pStmt ){
! 3799: char *zSql = sqlite3_mprintf("select rowid, * from %%_content %s",
! 3800: idxNum==QUERY_GENERIC ? "" : "where rowid=?");
! 3801: rc = sql_prepare(v->db, v->zDb, v->zName, &c->pStmt, zSql);
! 3802: sqlite3_free(zSql);
! 3803: if( rc!=SQLITE_OK ) return rc;
! 3804: c->iCursorType = idxNum;
! 3805: }else{
! 3806: sqlite3_reset(c->pStmt);
! 3807: assert( c->iCursorType==idxNum );
! 3808: }
! 3809:
! 3810: switch( idxNum ){
! 3811: case QUERY_GENERIC:
! 3812: break;
! 3813:
! 3814: case QUERY_ROWID:
! 3815: rc = sqlite3_bind_int64(c->pStmt, 1, sqlite3_value_int64(argv[0]));
! 3816: if( rc!=SQLITE_OK ) return rc;
! 3817: break;
! 3818:
! 3819: default: /* full-text search */
! 3820: {
! 3821: const char *zQuery = (const char *)sqlite3_value_text(argv[0]);
! 3822: assert( idxNum<=QUERY_FULLTEXT+v->nColumn);
! 3823: assert( argc==1 );
! 3824: queryClear(&c->q);
! 3825: if( c->result.nData!=0 ){
! 3826: /* This case happens if the same cursor is used repeatedly. */
! 3827: dlrDestroy(&c->reader);
! 3828: dataBufferReset(&c->result);
! 3829: }else{
! 3830: dataBufferInit(&c->result, 0);
! 3831: }
! 3832: rc = fulltextQuery(v, idxNum-QUERY_FULLTEXT, zQuery, -1, &c->result, &c->q);
! 3833: if( rc!=SQLITE_OK ) return rc;
! 3834: if( c->result.nData!=0 ){
! 3835: dlrInit(&c->reader, DL_DOCIDS, c->result.pData, c->result.nData);
! 3836: }
! 3837: break;
! 3838: }
! 3839: }
! 3840:
! 3841: return fulltextNext(pCursor);
! 3842: }
! 3843:
! 3844: /* This is the xEof method of the virtual table. The SQLite core
! 3845: ** calls this routine to find out if it has reached the end of
! 3846: ** a query's results set.
! 3847: */
! 3848: static int fulltextEof(sqlite3_vtab_cursor *pCursor){
! 3849: fulltext_cursor *c = (fulltext_cursor *) pCursor;
! 3850: return c->eof;
! 3851: }
! 3852:
! 3853: /* This is the xColumn method of the virtual table. The SQLite
! 3854: ** core calls this method during a query when it needs the value
! 3855: ** of a column from the virtual table. This method needs to use
! 3856: ** one of the sqlite3_result_*() routines to store the requested
! 3857: ** value back in the pContext.
! 3858: */
! 3859: static int fulltextColumn(sqlite3_vtab_cursor *pCursor,
! 3860: sqlite3_context *pContext, int idxCol){
! 3861: fulltext_cursor *c = (fulltext_cursor *) pCursor;
! 3862: fulltext_vtab *v = cursor_vtab(c);
! 3863:
! 3864: if( idxCol<v->nColumn ){
! 3865: sqlite3_value *pVal = sqlite3_column_value(c->pStmt, idxCol+1);
! 3866: sqlite3_result_value(pContext, pVal);
! 3867: }else if( idxCol==v->nColumn ){
! 3868: /* The extra column whose name is the same as the table.
! 3869: ** Return a blob which is a pointer to the cursor
! 3870: */
! 3871: sqlite3_result_blob(pContext, &c, sizeof(c), SQLITE_TRANSIENT);
! 3872: }
! 3873: return SQLITE_OK;
! 3874: }
! 3875:
! 3876: /* This is the xRowid method. The SQLite core calls this routine to
! 3877: ** retrive the rowid for the current row of the result set. The
! 3878: ** rowid should be written to *pRowid.
! 3879: */
! 3880: static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){
! 3881: fulltext_cursor *c = (fulltext_cursor *) pCursor;
! 3882:
! 3883: *pRowid = sqlite3_column_int64(c->pStmt, 0);
! 3884: return SQLITE_OK;
! 3885: }
! 3886:
! 3887: /* Add all terms in [zText] to pendingTerms table. If [iColumn] > 0,
! 3888: ** we also store positions and offsets in the hash table using that
! 3889: ** column number.
! 3890: */
! 3891: static int buildTerms(fulltext_vtab *v, sqlite_int64 iDocid,
! 3892: const char *zText, int iColumn){
! 3893: sqlite3_tokenizer *pTokenizer = v->pTokenizer;
! 3894: sqlite3_tokenizer_cursor *pCursor;
! 3895: const char *pToken;
! 3896: int nTokenBytes;
! 3897: int iStartOffset, iEndOffset, iPosition;
! 3898: int rc;
! 3899:
! 3900: rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor);
! 3901: if( rc!=SQLITE_OK ) return rc;
! 3902:
! 3903: pCursor->pTokenizer = pTokenizer;
! 3904: while( SQLITE_OK==(rc=pTokenizer->pModule->xNext(pCursor,
! 3905: &pToken, &nTokenBytes,
! 3906: &iStartOffset, &iEndOffset,
! 3907: &iPosition)) ){
! 3908: DLCollector *p;
! 3909: int nData; /* Size of doclist before our update. */
! 3910:
! 3911: /* Positions can't be negative; we use -1 as a terminator
! 3912: * internally. Token can't be NULL or empty. */
! 3913: if( iPosition<0 || pToken == NULL || nTokenBytes == 0 ){
! 3914: rc = SQLITE_ERROR;
! 3915: break;
! 3916: }
! 3917:
! 3918: p = fts2HashFind(&v->pendingTerms, pToken, nTokenBytes);
! 3919: if( p==NULL ){
! 3920: nData = 0;
! 3921: p = dlcNew(iDocid, DL_DEFAULT);
! 3922: fts2HashInsert(&v->pendingTerms, pToken, nTokenBytes, p);
! 3923:
! 3924: /* Overhead for our hash table entry, the key, and the value. */
! 3925: v->nPendingData += sizeof(struct fts2HashElem)+sizeof(*p)+nTokenBytes;
! 3926: }else{
! 3927: nData = p->b.nData;
! 3928: if( p->dlw.iPrevDocid!=iDocid ) dlcNext(p, iDocid);
! 3929: }
! 3930: if( iColumn>=0 ){
! 3931: dlcAddPos(p, iColumn, iPosition, iStartOffset, iEndOffset);
! 3932: }
! 3933:
! 3934: /* Accumulate data added by dlcNew or dlcNext, and dlcAddPos. */
! 3935: v->nPendingData += p->b.nData-nData;
! 3936: }
! 3937:
! 3938: /* TODO(shess) Check return? Should this be able to cause errors at
! 3939: ** this point? Actually, same question about sqlite3_finalize(),
! 3940: ** though one could argue that failure there means that the data is
! 3941: ** not durable. *ponder*
! 3942: */
! 3943: pTokenizer->pModule->xClose(pCursor);
! 3944: if( SQLITE_DONE == rc ) return SQLITE_OK;
! 3945: return rc;
! 3946: }
! 3947:
! 3948: /* Add doclists for all terms in [pValues] to pendingTerms table. */
! 3949: static int insertTerms(fulltext_vtab *v, sqlite_int64 iRowid,
! 3950: sqlite3_value **pValues){
! 3951: int i;
! 3952: for(i = 0; i < v->nColumn ; ++i){
! 3953: char *zText = (char*)sqlite3_value_text(pValues[i]);
! 3954: int rc = buildTerms(v, iRowid, zText, i);
! 3955: if( rc!=SQLITE_OK ) return rc;
! 3956: }
! 3957: return SQLITE_OK;
! 3958: }
! 3959:
! 3960: /* Add empty doclists for all terms in the given row's content to
! 3961: ** pendingTerms.
! 3962: */
! 3963: static int deleteTerms(fulltext_vtab *v, sqlite_int64 iRowid){
! 3964: const char **pValues;
! 3965: int i, rc;
! 3966:
! 3967: /* TODO(shess) Should we allow such tables at all? */
! 3968: if( DL_DEFAULT==DL_DOCIDS ) return SQLITE_ERROR;
! 3969:
! 3970: rc = content_select(v, iRowid, &pValues);
! 3971: if( rc!=SQLITE_OK ) return rc;
! 3972:
! 3973: for(i = 0 ; i < v->nColumn; ++i) {
! 3974: rc = buildTerms(v, iRowid, pValues[i], -1);
! 3975: if( rc!=SQLITE_OK ) break;
! 3976: }
! 3977:
! 3978: freeStringArray(v->nColumn, pValues);
! 3979: return SQLITE_OK;
! 3980: }
! 3981:
! 3982: /* TODO(shess) Refactor the code to remove this forward decl. */
! 3983: static int initPendingTerms(fulltext_vtab *v, sqlite_int64 iDocid);
! 3984:
! 3985: /* Insert a row into the %_content table; set *piRowid to be the ID of the
! 3986: ** new row. Add doclists for terms to pendingTerms.
! 3987: */
! 3988: static int index_insert(fulltext_vtab *v, sqlite3_value *pRequestRowid,
! 3989: sqlite3_value **pValues, sqlite_int64 *piRowid){
! 3990: int rc;
! 3991:
! 3992: rc = content_insert(v, pRequestRowid, pValues); /* execute an SQL INSERT */
! 3993: if( rc!=SQLITE_OK ) return rc;
! 3994:
! 3995: *piRowid = sqlite3_last_insert_rowid(v->db);
! 3996: rc = initPendingTerms(v, *piRowid);
! 3997: if( rc!=SQLITE_OK ) return rc;
! 3998:
! 3999: return insertTerms(v, *piRowid, pValues);
! 4000: }
! 4001:
! 4002: /* Delete a row from the %_content table; add empty doclists for terms
! 4003: ** to pendingTerms.
! 4004: */
! 4005: static int index_delete(fulltext_vtab *v, sqlite_int64 iRow){
! 4006: int rc = initPendingTerms(v, iRow);
! 4007: if( rc!=SQLITE_OK ) return rc;
! 4008:
! 4009: rc = deleteTerms(v, iRow);
! 4010: if( rc!=SQLITE_OK ) return rc;
! 4011:
! 4012: return content_delete(v, iRow); /* execute an SQL DELETE */
! 4013: }
! 4014:
! 4015: /* Update a row in the %_content table; add delete doclists to
! 4016: ** pendingTerms for old terms not in the new data, add insert doclists
! 4017: ** to pendingTerms for terms in the new data.
! 4018: */
! 4019: static int index_update(fulltext_vtab *v, sqlite_int64 iRow,
! 4020: sqlite3_value **pValues){
! 4021: int rc = initPendingTerms(v, iRow);
! 4022: if( rc!=SQLITE_OK ) return rc;
! 4023:
! 4024: /* Generate an empty doclist for each term that previously appeared in this
! 4025: * row. */
! 4026: rc = deleteTerms(v, iRow);
! 4027: if( rc!=SQLITE_OK ) return rc;
! 4028:
! 4029: rc = content_update(v, pValues, iRow); /* execute an SQL UPDATE */
! 4030: if( rc!=SQLITE_OK ) return rc;
! 4031:
! 4032: /* Now add positions for terms which appear in the updated row. */
! 4033: return insertTerms(v, iRow, pValues);
! 4034: }
! 4035:
! 4036: /*******************************************************************/
! 4037: /* InteriorWriter is used to collect terms and block references into
! 4038: ** interior nodes in %_segments. See commentary at top of file for
! 4039: ** format.
! 4040: */
! 4041:
/* Size limit, in bytes, that an interior node is allowed to reach
** before the writer overflows to a new one.
*/
#define INTERIOR_MAX 2048

/* Lower bound on the number of terms in an interior node (the root
** excepted).  Without it, very large terms could make the tree too
** skinny.  The value must be positive so that the tree always makes
** progress; the resulting minimum fanout is INTERIOR_MIN_TERMS+1.
*/
#define INTERIOR_MIN_TERMS 7
#if INTERIOR_MIN_TERMS<=0
# error INTERIOR_MIN_TERMS must be greater than 0.
#endif

/* ROOT_MAX bounds how much data is stored inline in the segment
** directory.
*/
/* TODO(shess) Push ROOT_MAX down to whoever is writing things.  It's
** only here so that interiorWriterRootInfo() and leafWriterRootInfo()
** can both see it, but if the caller passed it in, we wouldn't even
** need a define.
*/
#define ROOT_MAX 1024
#if (VARINT_MAX*2)>ROOT_MAX
# error ROOT_MAX must have enough space for a header.
#endif
! 4067:
! 4068: /* InteriorBlock stores a linked-list of interior blocks while a lower
! 4069: ** layer is being constructed.
! 4070: */
! 4071: typedef struct InteriorBlock {
! 4072: DataBuffer term; /* Leftmost term in block's subtree. */
! 4073: DataBuffer data; /* Accumulated data for the block. */
! 4074: struct InteriorBlock *next;
! 4075: } InteriorBlock;
! 4076:
! 4077: static InteriorBlock *interiorBlockNew(int iHeight, sqlite_int64 iChildBlock,
! 4078: const char *pTerm, int nTerm){
! 4079: InteriorBlock *block = sqlite3_malloc(sizeof(InteriorBlock));
! 4080: char c[VARINT_MAX+VARINT_MAX];
! 4081: int n;
! 4082:
! 4083: if( block ){
! 4084: memset(block, 0, sizeof(*block));
! 4085: dataBufferInit(&block->term, 0);
! 4086: dataBufferReplace(&block->term, pTerm, nTerm);
! 4087:
! 4088: n = putVarint(c, iHeight);
! 4089: n += putVarint(c+n, iChildBlock);
! 4090: dataBufferInit(&block->data, INTERIOR_MAX);
! 4091: dataBufferReplace(&block->data, c, n);
! 4092: }
! 4093: return block;
! 4094: }
! 4095:
! 4096: #ifndef NDEBUG
! 4097: /* Verify that the data is readable as an interior node. */
! 4098: static void interiorBlockValidate(InteriorBlock *pBlock){
! 4099: const char *pData = pBlock->data.pData;
! 4100: int nData = pBlock->data.nData;
! 4101: int n, iDummy;
! 4102: sqlite_int64 iBlockid;
! 4103:
! 4104: assert( nData>0 );
! 4105: assert( pData!=0 );
! 4106: assert( pData+nData>pData );
! 4107:
! 4108: /* Must lead with height of node as a varint(n), n>0 */
! 4109: n = getVarint32(pData, &iDummy);
! 4110: assert( n>0 );
! 4111: assert( iDummy>0 );
! 4112: assert( n<nData );
! 4113: pData += n;
! 4114: nData -= n;
! 4115:
! 4116: /* Must contain iBlockid. */
! 4117: n = getVarint(pData, &iBlockid);
! 4118: assert( n>0 );
! 4119: assert( n<=nData );
! 4120: pData += n;
! 4121: nData -= n;
! 4122:
! 4123: /* Zero or more terms of positive length */
! 4124: if( nData!=0 ){
! 4125: /* First term is not delta-encoded. */
! 4126: n = getVarint32(pData, &iDummy);
! 4127: assert( n>0 );
! 4128: assert( iDummy>0 );
! 4129: assert( n+iDummy>0);
! 4130: assert( n+iDummy<=nData );
! 4131: pData += n+iDummy;
! 4132: nData -= n+iDummy;
! 4133:
! 4134: /* Following terms delta-encoded. */
! 4135: while( nData!=0 ){
! 4136: /* Length of shared prefix. */
! 4137: n = getVarint32(pData, &iDummy);
! 4138: assert( n>0 );
! 4139: assert( iDummy>=0 );
! 4140: assert( n<nData );
! 4141: pData += n;
! 4142: nData -= n;
! 4143:
! 4144: /* Length and data of distinct suffix. */
! 4145: n = getVarint32(pData, &iDummy);
! 4146: assert( n>0 );
! 4147: assert( iDummy>0 );
! 4148: assert( n+iDummy>0);
! 4149: assert( n+iDummy<=nData );
! 4150: pData += n+iDummy;
! 4151: nData -= n+iDummy;
! 4152: }
! 4153: }
! 4154: }
! 4155: #define ASSERT_VALID_INTERIOR_BLOCK(x) interiorBlockValidate(x)
! 4156: #else
! 4157: #define ASSERT_VALID_INTERIOR_BLOCK(x) assert( 1 )
! 4158: #endif
! 4159:
! 4160: typedef struct InteriorWriter {
! 4161: int iHeight; /* from 0 at leaves. */
! 4162: InteriorBlock *first, *last;
! 4163: struct InteriorWriter *parentWriter;
! 4164:
! 4165: DataBuffer term; /* Last term written to block "last". */
! 4166: sqlite_int64 iOpeningChildBlock; /* First child block in block "last". */
! 4167: #ifndef NDEBUG
! 4168: sqlite_int64 iLastChildBlock; /* for consistency checks. */
! 4169: #endif
! 4170: } InteriorWriter;
! 4171:
! 4172: /* Initialize an interior node where pTerm[nTerm] marks the leftmost
! 4173: ** term in the tree. iChildBlock is the leftmost child block at the
! 4174: ** next level down the tree.
! 4175: */
! 4176: static void interiorWriterInit(int iHeight, const char *pTerm, int nTerm,
! 4177: sqlite_int64 iChildBlock,
! 4178: InteriorWriter *pWriter){
! 4179: InteriorBlock *block;
! 4180: assert( iHeight>0 );
! 4181: CLEAR(pWriter);
! 4182:
! 4183: pWriter->iHeight = iHeight;
! 4184: pWriter->iOpeningChildBlock = iChildBlock;
! 4185: #ifndef NDEBUG
! 4186: pWriter->iLastChildBlock = iChildBlock;
! 4187: #endif
! 4188: block = interiorBlockNew(iHeight, iChildBlock, pTerm, nTerm);
! 4189: pWriter->last = pWriter->first = block;
! 4190: ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
! 4191: dataBufferInit(&pWriter->term, 0);
! 4192: }
! 4193:
! 4194: /* Append the child node rooted at iChildBlock to the interior node,
! 4195: ** with pTerm[nTerm] as the leftmost term in iChildBlock's subtree.
! 4196: */
! 4197: static void interiorWriterAppend(InteriorWriter *pWriter,
! 4198: const char *pTerm, int nTerm,
! 4199: sqlite_int64 iChildBlock){
! 4200: char c[VARINT_MAX+VARINT_MAX];
! 4201: int n, nPrefix = 0;
! 4202:
! 4203: ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
! 4204:
! 4205: /* The first term written into an interior node is actually
! 4206: ** associated with the second child added (the first child was added
! 4207: ** in interiorWriterInit, or in the if clause at the bottom of this
! 4208: ** function). That term gets encoded straight up, with nPrefix left
! 4209: ** at 0.
! 4210: */
! 4211: if( pWriter->term.nData==0 ){
! 4212: n = putVarint(c, nTerm);
! 4213: }else{
! 4214: while( nPrefix<pWriter->term.nData &&
! 4215: pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
! 4216: nPrefix++;
! 4217: }
! 4218:
! 4219: n = putVarint(c, nPrefix);
! 4220: n += putVarint(c+n, nTerm-nPrefix);
! 4221: }
! 4222:
! 4223: #ifndef NDEBUG
! 4224: pWriter->iLastChildBlock++;
! 4225: #endif
! 4226: assert( pWriter->iLastChildBlock==iChildBlock );
! 4227:
! 4228: /* Overflow to a new block if the new term makes the current block
! 4229: ** too big, and the current block already has enough terms.
! 4230: */
! 4231: if( pWriter->last->data.nData+n+nTerm-nPrefix>INTERIOR_MAX &&
! 4232: iChildBlock-pWriter->iOpeningChildBlock>INTERIOR_MIN_TERMS ){
! 4233: pWriter->last->next = interiorBlockNew(pWriter->iHeight, iChildBlock,
! 4234: pTerm, nTerm);
! 4235: pWriter->last = pWriter->last->next;
! 4236: pWriter->iOpeningChildBlock = iChildBlock;
! 4237: dataBufferReset(&pWriter->term);
! 4238: }else{
! 4239: dataBufferAppend2(&pWriter->last->data, c, n,
! 4240: pTerm+nPrefix, nTerm-nPrefix);
! 4241: dataBufferReplace(&pWriter->term, pTerm, nTerm);
! 4242: }
! 4243: ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
! 4244: }
! 4245:
! 4246: /* Free the space used by pWriter, including the linked-list of
! 4247: ** InteriorBlocks, and parentWriter, if present.
! 4248: */
! 4249: static int interiorWriterDestroy(InteriorWriter *pWriter){
! 4250: InteriorBlock *block = pWriter->first;
! 4251:
! 4252: while( block!=NULL ){
! 4253: InteriorBlock *b = block;
! 4254: block = block->next;
! 4255: dataBufferDestroy(&b->term);
! 4256: dataBufferDestroy(&b->data);
! 4257: sqlite3_free(b);
! 4258: }
! 4259: if( pWriter->parentWriter!=NULL ){
! 4260: interiorWriterDestroy(pWriter->parentWriter);
! 4261: sqlite3_free(pWriter->parentWriter);
! 4262: }
! 4263: dataBufferDestroy(&pWriter->term);
! 4264: SCRAMBLE(pWriter);
! 4265: return SQLITE_OK;
! 4266: }
! 4267:
! 4268: /* If pWriter can fit entirely in ROOT_MAX, return it as the root info
! 4269: ** directly, leaving *piEndBlockid unchanged. Otherwise, flush
! 4270: ** pWriter to %_segments, building a new layer of interior nodes, and
! 4271: ** recursively ask for their root into.
! 4272: */
! 4273: static int interiorWriterRootInfo(fulltext_vtab *v, InteriorWriter *pWriter,
! 4274: char **ppRootInfo, int *pnRootInfo,
! 4275: sqlite_int64 *piEndBlockid){
! 4276: InteriorBlock *block = pWriter->first;
! 4277: sqlite_int64 iBlockid = 0;
! 4278: int rc;
! 4279:
! 4280: /* If we can fit the segment inline */
! 4281: if( block==pWriter->last && block->data.nData<ROOT_MAX ){
! 4282: *ppRootInfo = block->data.pData;
! 4283: *pnRootInfo = block->data.nData;
! 4284: return SQLITE_OK;
! 4285: }
! 4286:
! 4287: /* Flush the first block to %_segments, and create a new level of
! 4288: ** interior node.
! 4289: */
! 4290: ASSERT_VALID_INTERIOR_BLOCK(block);
! 4291: rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid);
! 4292: if( rc!=SQLITE_OK ) return rc;
! 4293: *piEndBlockid = iBlockid;
! 4294:
! 4295: pWriter->parentWriter = sqlite3_malloc(sizeof(*pWriter->parentWriter));
! 4296: interiorWriterInit(pWriter->iHeight+1,
! 4297: block->term.pData, block->term.nData,
! 4298: iBlockid, pWriter->parentWriter);
! 4299:
! 4300: /* Flush additional blocks and append to the higher interior
! 4301: ** node.
! 4302: */
! 4303: for(block=block->next; block!=NULL; block=block->next){
! 4304: ASSERT_VALID_INTERIOR_BLOCK(block);
! 4305: rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid);
! 4306: if( rc!=SQLITE_OK ) return rc;
! 4307: *piEndBlockid = iBlockid;
! 4308:
! 4309: interiorWriterAppend(pWriter->parentWriter,
! 4310: block->term.pData, block->term.nData, iBlockid);
! 4311: }
! 4312:
! 4313: /* Parent node gets the chance to be the root. */
! 4314: return interiorWriterRootInfo(v, pWriter->parentWriter,
! 4315: ppRootInfo, pnRootInfo, piEndBlockid);
! 4316: }
! 4317:
! 4318: /****************************************************************/
! 4319: /* InteriorReader is used to read off the data from an interior node
! 4320: ** (see comment at top of file for the format).
! 4321: */
! 4322: typedef struct InteriorReader {
! 4323: const char *pData;
! 4324: int nData;
! 4325:
! 4326: DataBuffer term; /* previous term, for decoding term delta. */
! 4327:
! 4328: sqlite_int64 iBlockid;
! 4329: } InteriorReader;
! 4330:
! 4331: static void interiorReaderDestroy(InteriorReader *pReader){
! 4332: dataBufferDestroy(&pReader->term);
! 4333: SCRAMBLE(pReader);
! 4334: }
! 4335:
! 4336: /* TODO(shess) The assertions are great, but what if we're in NDEBUG
! 4337: ** and the blob is empty or otherwise contains suspect data?
! 4338: */
! 4339: static void interiorReaderInit(const char *pData, int nData,
! 4340: InteriorReader *pReader){
! 4341: int n, nTerm;
! 4342:
! 4343: /* Require at least the leading flag byte */
! 4344: assert( nData>0 );
! 4345: assert( pData[0]!='\0' );
! 4346:
! 4347: CLEAR(pReader);
! 4348:
! 4349: /* Decode the base blockid, and set the cursor to the first term. */
! 4350: n = getVarint(pData+1, &pReader->iBlockid);
! 4351: assert( 1+n<=nData );
! 4352: pReader->pData = pData+1+n;
! 4353: pReader->nData = nData-(1+n);
! 4354:
! 4355: /* A single-child interior node (such as when a leaf node was too
! 4356: ** large for the segment directory) won't have any terms.
! 4357: ** Otherwise, decode the first term.
! 4358: */
! 4359: if( pReader->nData==0 ){
! 4360: dataBufferInit(&pReader->term, 0);
! 4361: }else{
! 4362: n = getVarint32(pReader->pData, &nTerm);
! 4363: dataBufferInit(&pReader->term, nTerm);
! 4364: dataBufferReplace(&pReader->term, pReader->pData+n, nTerm);
! 4365: assert( n+nTerm<=pReader->nData );
! 4366: pReader->pData += n+nTerm;
! 4367: pReader->nData -= n+nTerm;
! 4368: }
! 4369: }
! 4370:
! 4371: static int interiorReaderAtEnd(InteriorReader *pReader){
! 4372: return pReader->term.nData==0;
! 4373: }
! 4374:
! 4375: static sqlite_int64 interiorReaderCurrentBlockid(InteriorReader *pReader){
! 4376: return pReader->iBlockid;
! 4377: }
! 4378:
! 4379: static int interiorReaderTermBytes(InteriorReader *pReader){
! 4380: assert( !interiorReaderAtEnd(pReader) );
! 4381: return pReader->term.nData;
! 4382: }
! 4383: static const char *interiorReaderTerm(InteriorReader *pReader){
! 4384: assert( !interiorReaderAtEnd(pReader) );
! 4385: return pReader->term.pData;
! 4386: }
! 4387:
! 4388: /* Step forward to the next term in the node. */
! 4389: static void interiorReaderStep(InteriorReader *pReader){
! 4390: assert( !interiorReaderAtEnd(pReader) );
! 4391:
! 4392: /* If the last term has been read, signal eof, else construct the
! 4393: ** next term.
! 4394: */
! 4395: if( pReader->nData==0 ){
! 4396: dataBufferReset(&pReader->term);
! 4397: }else{
! 4398: int n, nPrefix, nSuffix;
! 4399:
! 4400: n = getVarint32(pReader->pData, &nPrefix);
! 4401: n += getVarint32(pReader->pData+n, &nSuffix);
! 4402:
! 4403: /* Truncate the current term and append suffix data. */
! 4404: pReader->term.nData = nPrefix;
! 4405: dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix);
! 4406:
! 4407: assert( n+nSuffix<=pReader->nData );
! 4408: pReader->pData += n+nSuffix;
! 4409: pReader->nData -= n+nSuffix;
! 4410: }
! 4411: pReader->iBlockid++;
! 4412: }
! 4413:
! 4414: /* Compare the current term to pTerm[nTerm], returning strcmp-style
! 4415: ** results. If isPrefix, equality means equal through nTerm bytes.
! 4416: */
! 4417: static int interiorReaderTermCmp(InteriorReader *pReader,
! 4418: const char *pTerm, int nTerm, int isPrefix){
! 4419: const char *pReaderTerm = interiorReaderTerm(pReader);
! 4420: int nReaderTerm = interiorReaderTermBytes(pReader);
! 4421: int c, n = nReaderTerm<nTerm ? nReaderTerm : nTerm;
! 4422:
! 4423: if( n==0 ){
! 4424: if( nReaderTerm>0 ) return -1;
! 4425: if( nTerm>0 ) return 1;
! 4426: return 0;
! 4427: }
! 4428:
! 4429: c = memcmp(pReaderTerm, pTerm, n);
! 4430: if( c!=0 ) return c;
! 4431: if( isPrefix && n==nTerm ) return 0;
! 4432: return nReaderTerm - nTerm;
! 4433: }
! 4434:
! 4435: /****************************************************************/
! 4436: /* LeafWriter is used to collect terms and associated doclist data
! 4437: ** into leaf blocks in %_segments (see top of file for format info).
! 4438: ** Expected usage is:
! 4439: **
! 4440: ** LeafWriter writer;
! 4441: ** leafWriterInit(0, 0, &writer);
! 4442: ** while( sorted_terms_left_to_process ){
! 4443: ** // data is doclist data for that term.
! 4444: ** rc = leafWriterStep(v, &writer, pTerm, nTerm, pData, nData);
! 4445: ** if( rc!=SQLITE_OK ) goto err;
! 4446: ** }
! 4447: ** rc = leafWriterFinalize(v, &writer);
! 4448: **err:
! 4449: ** leafWriterDestroy(&writer);
! 4450: ** return rc;
! 4451: **
! 4452: ** leafWriterStep() may write a collected leaf out to %_segments.
! 4453: ** leafWriterFinalize() finishes writing any buffered data and stores
! 4454: ** a root node in %_segdir. leafWriterDestroy() frees all buffers and
! 4455: ** InteriorWriters allocated as part of writing this segment.
! 4456: **
! 4457: ** TODO(shess) Document leafWriterStepMerge().
! 4458: */
! 4459:
/* A term whose term-plus-doclist size exceeds this is written to a
** leaf block of its own.
*/
#define STANDALONE_MIN 1024

/* Leaf blocks are split so as to stay under this many bytes. */
#define LEAF_MAX 2048
! 4465:
! 4466: typedef struct LeafWriter {
! 4467: int iLevel;
! 4468: int idx;
! 4469: sqlite_int64 iStartBlockid; /* needed to create the root info */
! 4470: sqlite_int64 iEndBlockid; /* when we're done writing. */
! 4471:
! 4472: DataBuffer term; /* previous encoded term */
! 4473: DataBuffer data; /* encoding buffer */
! 4474:
! 4475: /* bytes of first term in the current node which distinguishes that
! 4476: ** term from the last term of the previous node.
! 4477: */
! 4478: int nTermDistinct;
! 4479:
! 4480: InteriorWriter parentWriter; /* if we overflow */
! 4481: int has_parent;
! 4482: } LeafWriter;
! 4483:
! 4484: static void leafWriterInit(int iLevel, int idx, LeafWriter *pWriter){
! 4485: CLEAR(pWriter);
! 4486: pWriter->iLevel = iLevel;
! 4487: pWriter->idx = idx;
! 4488:
! 4489: dataBufferInit(&pWriter->term, 32);
! 4490:
! 4491: /* Start out with a reasonably sized block, though it can grow. */
! 4492: dataBufferInit(&pWriter->data, LEAF_MAX);
! 4493: }
! 4494:
! 4495: #ifndef NDEBUG
! 4496: /* Verify that the data is readable as a leaf node. */
! 4497: static void leafNodeValidate(const char *pData, int nData){
! 4498: int n, iDummy;
! 4499:
! 4500: if( nData==0 ) return;
! 4501: assert( nData>0 );
! 4502: assert( pData!=0 );
! 4503: assert( pData+nData>pData );
! 4504:
! 4505: /* Must lead with a varint(0) */
! 4506: n = getVarint32(pData, &iDummy);
! 4507: assert( iDummy==0 );
! 4508: assert( n>0 );
! 4509: assert( n<nData );
! 4510: pData += n;
! 4511: nData -= n;
! 4512:
! 4513: /* Leading term length and data must fit in buffer. */
! 4514: n = getVarint32(pData, &iDummy);
! 4515: assert( n>0 );
! 4516: assert( iDummy>0 );
! 4517: assert( n+iDummy>0 );
! 4518: assert( n+iDummy<nData );
! 4519: pData += n+iDummy;
! 4520: nData -= n+iDummy;
! 4521:
! 4522: /* Leading term's doclist length and data must fit. */
! 4523: n = getVarint32(pData, &iDummy);
! 4524: assert( n>0 );
! 4525: assert( iDummy>0 );
! 4526: assert( n+iDummy>0 );
! 4527: assert( n+iDummy<=nData );
! 4528: ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
! 4529: pData += n+iDummy;
! 4530: nData -= n+iDummy;
! 4531:
! 4532: /* Verify that trailing terms and doclists also are readable. */
! 4533: while( nData!=0 ){
! 4534: n = getVarint32(pData, &iDummy);
! 4535: assert( n>0 );
! 4536: assert( iDummy>=0 );
! 4537: assert( n<nData );
! 4538: pData += n;
! 4539: nData -= n;
! 4540: n = getVarint32(pData, &iDummy);
! 4541: assert( n>0 );
! 4542: assert( iDummy>0 );
! 4543: assert( n+iDummy>0 );
! 4544: assert( n+iDummy<nData );
! 4545: pData += n+iDummy;
! 4546: nData -= n+iDummy;
! 4547:
! 4548: n = getVarint32(pData, &iDummy);
! 4549: assert( n>0 );
! 4550: assert( iDummy>0 );
! 4551: assert( n+iDummy>0 );
! 4552: assert( n+iDummy<=nData );
! 4553: ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
! 4554: pData += n+iDummy;
! 4555: nData -= n+iDummy;
! 4556: }
! 4557: }
! 4558: #define ASSERT_VALID_LEAF_NODE(p, n) leafNodeValidate(p, n)
! 4559: #else
! 4560: #define ASSERT_VALID_LEAF_NODE(p, n) assert( 1 )
! 4561: #endif
! 4562:
! 4563: /* Flush the current leaf node to %_segments, and adding the resulting
! 4564: ** blockid and the starting term to the interior node which will
! 4565: ** contain it.
! 4566: */
! 4567: static int leafWriterInternalFlush(fulltext_vtab *v, LeafWriter *pWriter,
! 4568: int iData, int nData){
! 4569: sqlite_int64 iBlockid = 0;
! 4570: const char *pStartingTerm;
! 4571: int nStartingTerm, rc, n;
! 4572:
! 4573: /* Must have the leading varint(0) flag, plus at least some
! 4574: ** valid-looking data.
! 4575: */
! 4576: assert( nData>2 );
! 4577: assert( iData>=0 );
! 4578: assert( iData+nData<=pWriter->data.nData );
! 4579: ASSERT_VALID_LEAF_NODE(pWriter->data.pData+iData, nData);
! 4580:
! 4581: rc = block_insert(v, pWriter->data.pData+iData, nData, &iBlockid);
! 4582: if( rc!=SQLITE_OK ) return rc;
! 4583: assert( iBlockid!=0 );
! 4584:
! 4585: /* Reconstruct the first term in the leaf for purposes of building
! 4586: ** the interior node.
! 4587: */
! 4588: n = getVarint32(pWriter->data.pData+iData+1, &nStartingTerm);
! 4589: pStartingTerm = pWriter->data.pData+iData+1+n;
! 4590: assert( pWriter->data.nData>iData+1+n+nStartingTerm );
! 4591: assert( pWriter->nTermDistinct>0 );
! 4592: assert( pWriter->nTermDistinct<=nStartingTerm );
! 4593: nStartingTerm = pWriter->nTermDistinct;
! 4594:
! 4595: if( pWriter->has_parent ){
! 4596: interiorWriterAppend(&pWriter->parentWriter,
! 4597: pStartingTerm, nStartingTerm, iBlockid);
! 4598: }else{
! 4599: interiorWriterInit(1, pStartingTerm, nStartingTerm, iBlockid,
! 4600: &pWriter->parentWriter);
! 4601: pWriter->has_parent = 1;
! 4602: }
! 4603:
! 4604: /* Track the span of this segment's leaf nodes. */
! 4605: if( pWriter->iEndBlockid==0 ){
! 4606: pWriter->iEndBlockid = pWriter->iStartBlockid = iBlockid;
! 4607: }else{
! 4608: pWriter->iEndBlockid++;
! 4609: assert( iBlockid==pWriter->iEndBlockid );
! 4610: }
! 4611:
! 4612: return SQLITE_OK;
! 4613: }
! 4614: static int leafWriterFlush(fulltext_vtab *v, LeafWriter *pWriter){
! 4615: int rc = leafWriterInternalFlush(v, pWriter, 0, pWriter->data.nData);
! 4616: if( rc!=SQLITE_OK ) return rc;
! 4617:
! 4618: /* Re-initialize the output buffer. */
! 4619: dataBufferReset(&pWriter->data);
! 4620:
! 4621: return SQLITE_OK;
! 4622: }
! 4623:
! 4624: /* Fetch the root info for the segment. If the entire leaf fits
! 4625: ** within ROOT_MAX, then it will be returned directly, otherwise it
! 4626: ** will be flushed and the root info will be returned from the
! 4627: ** interior node. *piEndBlockid is set to the blockid of the last
! 4628: ** interior or leaf node written to disk (0 if none are written at
! 4629: ** all).
! 4630: */
! 4631: static int leafWriterRootInfo(fulltext_vtab *v, LeafWriter *pWriter,
! 4632: char **ppRootInfo, int *pnRootInfo,
! 4633: sqlite_int64 *piEndBlockid){
! 4634: /* we can fit the segment entirely inline */
! 4635: if( !pWriter->has_parent && pWriter->data.nData<ROOT_MAX ){
! 4636: *ppRootInfo = pWriter->data.pData;
! 4637: *pnRootInfo = pWriter->data.nData;
! 4638: *piEndBlockid = 0;
! 4639: return SQLITE_OK;
! 4640: }
! 4641:
! 4642: /* Flush remaining leaf data. */
! 4643: if( pWriter->data.nData>0 ){
! 4644: int rc = leafWriterFlush(v, pWriter);
! 4645: if( rc!=SQLITE_OK ) return rc;
! 4646: }
! 4647:
! 4648: /* We must have flushed a leaf at some point. */
! 4649: assert( pWriter->has_parent );
! 4650:
! 4651: /* Tenatively set the end leaf blockid as the end blockid. If the
! 4652: ** interior node can be returned inline, this will be the final
! 4653: ** blockid, otherwise it will be overwritten by
! 4654: ** interiorWriterRootInfo().
! 4655: */
! 4656: *piEndBlockid = pWriter->iEndBlockid;
! 4657:
! 4658: return interiorWriterRootInfo(v, &pWriter->parentWriter,
! 4659: ppRootInfo, pnRootInfo, piEndBlockid);
! 4660: }
! 4661:
! 4662: /* Collect the rootInfo data and store it into the segment directory.
! 4663: ** This has the effect of flushing the segment's leaf data to
! 4664: ** %_segments, and also flushing any interior nodes to %_segments.
! 4665: */
! 4666: static int leafWriterFinalize(fulltext_vtab *v, LeafWriter *pWriter){
! 4667: sqlite_int64 iEndBlockid;
! 4668: char *pRootInfo;
! 4669: int rc, nRootInfo;
! 4670:
! 4671: rc = leafWriterRootInfo(v, pWriter, &pRootInfo, &nRootInfo, &iEndBlockid);
! 4672: if( rc!=SQLITE_OK ) return rc;
! 4673:
! 4674: /* Don't bother storing an entirely empty segment. */
! 4675: if( iEndBlockid==0 && nRootInfo==0 ) return SQLITE_OK;
! 4676:
! 4677: return segdir_set(v, pWriter->iLevel, pWriter->idx,
! 4678: pWriter->iStartBlockid, pWriter->iEndBlockid,
! 4679: iEndBlockid, pRootInfo, nRootInfo);
! 4680: }
! 4681:
! 4682: static void leafWriterDestroy(LeafWriter *pWriter){
! 4683: if( pWriter->has_parent ) interiorWriterDestroy(&pWriter->parentWriter);
! 4684: dataBufferDestroy(&pWriter->term);
! 4685: dataBufferDestroy(&pWriter->data);
! 4686: }
! 4687:
! 4688: /* Encode a term into the leafWriter, delta-encoding as appropriate.
! 4689: ** Returns the length of the new term which distinguishes it from the
! 4690: ** previous term, which can be used to set nTermDistinct when a node
! 4691: ** boundary is crossed.
! 4692: */
! 4693: static int leafWriterEncodeTerm(LeafWriter *pWriter,
! 4694: const char *pTerm, int nTerm){
! 4695: char c[VARINT_MAX+VARINT_MAX];
! 4696: int n, nPrefix = 0;
! 4697:
! 4698: assert( nTerm>0 );
! 4699: while( nPrefix<pWriter->term.nData &&
! 4700: pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
! 4701: nPrefix++;
! 4702: /* Failing this implies that the terms weren't in order. */
! 4703: assert( nPrefix<nTerm );
! 4704: }
! 4705:
! 4706: if( pWriter->data.nData==0 ){
! 4707: /* Encode the node header and leading term as:
! 4708: ** varint(0)
! 4709: ** varint(nTerm)
! 4710: ** char pTerm[nTerm]
! 4711: */
! 4712: n = putVarint(c, '\0');
! 4713: n += putVarint(c+n, nTerm);
! 4714: dataBufferAppend2(&pWriter->data, c, n, pTerm, nTerm);
! 4715: }else{
! 4716: /* Delta-encode the term as:
! 4717: ** varint(nPrefix)
! 4718: ** varint(nSuffix)
! 4719: ** char pTermSuffix[nSuffix]
! 4720: */
! 4721: n = putVarint(c, nPrefix);
! 4722: n += putVarint(c+n, nTerm-nPrefix);
! 4723: dataBufferAppend2(&pWriter->data, c, n, pTerm+nPrefix, nTerm-nPrefix);
! 4724: }
! 4725: dataBufferReplace(&pWriter->term, pTerm, nTerm);
! 4726:
! 4727: return nPrefix+1;
! 4728: }
! 4729:
! 4730: /* Used to avoid a memmove when a large amount of doclist data is in
! 4731: ** the buffer. This constructs a node and term header before
! 4732: ** iDoclistData and flushes the resulting complete node using
! 4733: ** leafWriterInternalFlush().
! 4734: */
! 4735: static int leafWriterInlineFlush(fulltext_vtab *v, LeafWriter *pWriter,
! 4736: const char *pTerm, int nTerm,
! 4737: int iDoclistData){
! 4738: char c[VARINT_MAX+VARINT_MAX];
! 4739: int iData, n = putVarint(c, 0);
! 4740: n += putVarint(c+n, nTerm);
! 4741:
! 4742: /* There should always be room for the header. Even if pTerm shared
! 4743: ** a substantial prefix with the previous term, the entire prefix
! 4744: ** could be constructed from earlier data in the doclist, so there
! 4745: ** should be room.
! 4746: */
! 4747: assert( iDoclistData>=n+nTerm );
! 4748:
! 4749: iData = iDoclistData-(n+nTerm);
! 4750: memcpy(pWriter->data.pData+iData, c, n);
! 4751: memcpy(pWriter->data.pData+iData+n, pTerm, nTerm);
! 4752:
! 4753: return leafWriterInternalFlush(v, pWriter, iData, pWriter->data.nData-iData);
! 4754: }
! 4755:
! 4756: /* Push pTerm[nTerm] along with the doclist data to the leaf layer of
! 4757: ** %_segments.
! 4758: */
! 4759: static int leafWriterStepMerge(fulltext_vtab *v, LeafWriter *pWriter,
! 4760: const char *pTerm, int nTerm,
! 4761: DLReader *pReaders, int nReaders){
! 4762: char c[VARINT_MAX+VARINT_MAX];
! 4763: int iTermData = pWriter->data.nData, iDoclistData;
! 4764: int i, nData, n, nActualData, nActual, rc, nTermDistinct;
! 4765:
! 4766: ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);
! 4767: nTermDistinct = leafWriterEncodeTerm(pWriter, pTerm, nTerm);
! 4768:
! 4769: /* Remember nTermDistinct if opening a new node. */
! 4770: if( iTermData==0 ) pWriter->nTermDistinct = nTermDistinct;
! 4771:
! 4772: iDoclistData = pWriter->data.nData;
! 4773:
! 4774: /* Estimate the length of the merged doclist so we can leave space
! 4775: ** to encode it.
! 4776: */
! 4777: for(i=0, nData=0; i<nReaders; i++){
! 4778: nData += dlrAllDataBytes(&pReaders[i]);
! 4779: }
! 4780: n = putVarint(c, nData);
! 4781: dataBufferAppend(&pWriter->data, c, n);
! 4782:
! 4783: docListMerge(&pWriter->data, pReaders, nReaders);
! 4784: ASSERT_VALID_DOCLIST(DL_DEFAULT,
! 4785: pWriter->data.pData+iDoclistData+n,
! 4786: pWriter->data.nData-iDoclistData-n, NULL);
! 4787:
! 4788: /* The actual amount of doclist data at this point could be smaller
! 4789: ** than the length we encoded. Additionally, the space required to
! 4790: ** encode this length could be smaller. For small doclists, this is
! 4791: ** not a big deal, we can just use memmove() to adjust things.
! 4792: */
! 4793: nActualData = pWriter->data.nData-(iDoclistData+n);
! 4794: nActual = putVarint(c, nActualData);
! 4795: assert( nActualData<=nData );
! 4796: assert( nActual<=n );
! 4797:
! 4798: /* If the new doclist is big enough for force a standalone leaf
! 4799: ** node, we can immediately flush it inline without doing the
! 4800: ** memmove().
! 4801: */
! 4802: /* TODO(shess) This test matches leafWriterStep(), which does this
! 4803: ** test before it knows the cost to varint-encode the term and
! 4804: ** doclist lengths. At some point, change to
! 4805: ** pWriter->data.nData-iTermData>STANDALONE_MIN.
! 4806: */
! 4807: if( nTerm+nActualData>STANDALONE_MIN ){
! 4808: /* Push leaf node from before this term. */
! 4809: if( iTermData>0 ){
! 4810: rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
! 4811: if( rc!=SQLITE_OK ) return rc;
! 4812:
! 4813: pWriter->nTermDistinct = nTermDistinct;
! 4814: }
! 4815:
! 4816: /* Fix the encoded doclist length. */
! 4817: iDoclistData += n - nActual;
! 4818: memcpy(pWriter->data.pData+iDoclistData, c, nActual);
! 4819:
! 4820: /* Push the standalone leaf node. */
! 4821: rc = leafWriterInlineFlush(v, pWriter, pTerm, nTerm, iDoclistData);
! 4822: if( rc!=SQLITE_OK ) return rc;
! 4823:
! 4824: /* Leave the node empty. */
! 4825: dataBufferReset(&pWriter->data);
! 4826:
! 4827: return rc;
! 4828: }
! 4829:
! 4830: /* At this point, we know that the doclist was small, so do the
! 4831: ** memmove if indicated.
! 4832: */
! 4833: if( nActual<n ){
! 4834: memmove(pWriter->data.pData+iDoclistData+nActual,
! 4835: pWriter->data.pData+iDoclistData+n,
! 4836: pWriter->data.nData-(iDoclistData+n));
! 4837: pWriter->data.nData -= n-nActual;
! 4838: }
! 4839:
! 4840: /* Replace written length with actual length. */
! 4841: memcpy(pWriter->data.pData+iDoclistData, c, nActual);
! 4842:
! 4843: /* If the node is too large, break things up. */
! 4844: /* TODO(shess) This test matches leafWriterStep(), which does this
! 4845: ** test before it knows the cost to varint-encode the term and
! 4846: ** doclist lengths. At some point, change to
! 4847: ** pWriter->data.nData>LEAF_MAX.
! 4848: */
! 4849: if( iTermData+nTerm+nActualData>LEAF_MAX ){
! 4850: /* Flush out the leading data as a node */
! 4851: rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
! 4852: if( rc!=SQLITE_OK ) return rc;
! 4853:
! 4854: pWriter->nTermDistinct = nTermDistinct;
! 4855:
! 4856: /* Rebuild header using the current term */
! 4857: n = putVarint(pWriter->data.pData, 0);
! 4858: n += putVarint(pWriter->data.pData+n, nTerm);
! 4859: memcpy(pWriter->data.pData+n, pTerm, nTerm);
! 4860: n += nTerm;
! 4861:
! 4862: /* There should always be room, because the previous encoding
! 4863: ** included all data necessary to construct the term.
! 4864: */
! 4865: assert( n<iDoclistData );
! 4866: /* So long as STANDALONE_MIN is half or less of LEAF_MAX, the
! 4867: ** following memcpy() is safe (as opposed to needing a memmove).
! 4868: */
! 4869: assert( 2*STANDALONE_MIN<=LEAF_MAX );
! 4870: assert( n+pWriter->data.nData-iDoclistData<iDoclistData );
! 4871: memcpy(pWriter->data.pData+n,
! 4872: pWriter->data.pData+iDoclistData,
! 4873: pWriter->data.nData-iDoclistData);
! 4874: pWriter->data.nData -= iDoclistData-n;
! 4875: }
! 4876: ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);
! 4877:
! 4878: return SQLITE_OK;
! 4879: }
! 4880:
! 4881: /* Push pTerm[nTerm] along with the doclist data to the leaf layer of
! 4882: ** %_segments.
! 4883: */
! 4884: /* TODO(shess) Revise writeZeroSegment() so that doclists are
! 4885: ** constructed directly in pWriter->data.
! 4886: */
! 4887: static int leafWriterStep(fulltext_vtab *v, LeafWriter *pWriter,
! 4888: const char *pTerm, int nTerm,
! 4889: const char *pData, int nData){
! 4890: int rc;
! 4891: DLReader reader;
! 4892:
! 4893: dlrInit(&reader, DL_DEFAULT, pData, nData);
! 4894: rc = leafWriterStepMerge(v, pWriter, pTerm, nTerm, &reader, 1);
! 4895: dlrDestroy(&reader);
! 4896:
! 4897: return rc;
! 4898: }
! 4899:
! 4900:
! 4901: /****************************************************************/
! 4902: /* LeafReader is used to iterate over an individual leaf node. */
! 4903: typedef struct LeafReader {
! 4904: DataBuffer term; /* copy of current term. */
! 4905:
! 4906: const char *pData; /* data for current term. */
! 4907: int nData;
! 4908: } LeafReader;
! 4909:
! 4910: static void leafReaderDestroy(LeafReader *pReader){
! 4911: dataBufferDestroy(&pReader->term);
! 4912: SCRAMBLE(pReader);
! 4913: }
! 4914:
! 4915: static int leafReaderAtEnd(LeafReader *pReader){
! 4916: return pReader->nData<=0;
! 4917: }
! 4918:
! 4919: /* Access the current term. */
! 4920: static int leafReaderTermBytes(LeafReader *pReader){
! 4921: return pReader->term.nData;
! 4922: }
! 4923: static const char *leafReaderTerm(LeafReader *pReader){
! 4924: assert( pReader->term.nData>0 );
! 4925: return pReader->term.pData;
! 4926: }
! 4927:
! 4928: /* Access the doclist data for the current term. */
! 4929: static int leafReaderDataBytes(LeafReader *pReader){
! 4930: int nData;
! 4931: assert( pReader->term.nData>0 );
! 4932: getVarint32(pReader->pData, &nData);
! 4933: return nData;
! 4934: }
! 4935: static const char *leafReaderData(LeafReader *pReader){
! 4936: int n, nData;
! 4937: assert( pReader->term.nData>0 );
! 4938: n = getVarint32(pReader->pData, &nData);
! 4939: return pReader->pData+n;
! 4940: }
! 4941:
! 4942: static void leafReaderInit(const char *pData, int nData,
! 4943: LeafReader *pReader){
! 4944: int nTerm, n;
! 4945:
! 4946: assert( nData>0 );
! 4947: assert( pData[0]=='\0' );
! 4948:
! 4949: CLEAR(pReader);
! 4950:
! 4951: /* Read the first term, skipping the header byte. */
! 4952: n = getVarint32(pData+1, &nTerm);
! 4953: dataBufferInit(&pReader->term, nTerm);
! 4954: dataBufferReplace(&pReader->term, pData+1+n, nTerm);
! 4955:
! 4956: /* Position after the first term. */
! 4957: assert( 1+n+nTerm<nData );
! 4958: pReader->pData = pData+1+n+nTerm;
! 4959: pReader->nData = nData-1-n-nTerm;
! 4960: }
! 4961:
! 4962: /* Step the reader forward to the next term. */
! 4963: static void leafReaderStep(LeafReader *pReader){
! 4964: int n, nData, nPrefix, nSuffix;
! 4965: assert( !leafReaderAtEnd(pReader) );
! 4966:
! 4967: /* Skip previous entry's data block. */
! 4968: n = getVarint32(pReader->pData, &nData);
! 4969: assert( n+nData<=pReader->nData );
! 4970: pReader->pData += n+nData;
! 4971: pReader->nData -= n+nData;
! 4972:
! 4973: if( !leafReaderAtEnd(pReader) ){
! 4974: /* Construct the new term using a prefix from the old term plus a
! 4975: ** suffix from the leaf data.
! 4976: */
! 4977: n = getVarint32(pReader->pData, &nPrefix);
! 4978: n += getVarint32(pReader->pData+n, &nSuffix);
! 4979: assert( n+nSuffix<pReader->nData );
! 4980: pReader->term.nData = nPrefix;
! 4981: dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix);
! 4982:
! 4983: pReader->pData += n+nSuffix;
! 4984: pReader->nData -= n+nSuffix;
! 4985: }
! 4986: }
! 4987:
! 4988: /* strcmp-style comparison of pReader's current term against pTerm.
! 4989: ** If isPrefix, equality means equal through nTerm bytes.
! 4990: */
! 4991: static int leafReaderTermCmp(LeafReader *pReader,
! 4992: const char *pTerm, int nTerm, int isPrefix){
! 4993: int c, n = pReader->term.nData<nTerm ? pReader->term.nData : nTerm;
! 4994: if( n==0 ){
! 4995: if( pReader->term.nData>0 ) return -1;
! 4996: if(nTerm>0 ) return 1;
! 4997: return 0;
! 4998: }
! 4999:
! 5000: c = memcmp(pReader->term.pData, pTerm, n);
! 5001: if( c!=0 ) return c;
! 5002: if( isPrefix && n==nTerm ) return 0;
! 5003: return pReader->term.nData - nTerm;
! 5004: }
! 5005:
! 5006:
! 5007: /****************************************************************/
! 5008: /* LeavesReader wraps LeafReader to allow iterating over the entire
! 5009: ** leaf layer of the tree.
! 5010: */
! 5011: typedef struct LeavesReader {
! 5012: int idx; /* Index within the segment. */
! 5013:
! 5014: sqlite3_stmt *pStmt; /* Statement we're streaming leaves from. */
! 5015: int eof; /* we've seen SQLITE_DONE from pStmt. */
! 5016:
! 5017: LeafReader leafReader; /* reader for the current leaf. */
! 5018: DataBuffer rootData; /* root data for inline. */
! 5019: } LeavesReader;
! 5020:
! 5021: /* Access the current term. */
! 5022: static int leavesReaderTermBytes(LeavesReader *pReader){
! 5023: assert( !pReader->eof );
! 5024: return leafReaderTermBytes(&pReader->leafReader);
! 5025: }
! 5026: static const char *leavesReaderTerm(LeavesReader *pReader){
! 5027: assert( !pReader->eof );
! 5028: return leafReaderTerm(&pReader->leafReader);
! 5029: }
! 5030:
! 5031: /* Access the doclist data for the current term. */
! 5032: static int leavesReaderDataBytes(LeavesReader *pReader){
! 5033: assert( !pReader->eof );
! 5034: return leafReaderDataBytes(&pReader->leafReader);
! 5035: }
! 5036: static const char *leavesReaderData(LeavesReader *pReader){
! 5037: assert( !pReader->eof );
! 5038: return leafReaderData(&pReader->leafReader);
! 5039: }
! 5040:
! 5041: static int leavesReaderAtEnd(LeavesReader *pReader){
! 5042: return pReader->eof;
! 5043: }
! 5044:
! 5045: /* loadSegmentLeaves() may not read all the way to SQLITE_DONE, thus
! 5046: ** leaving the statement handle open, which locks the table.
! 5047: */
! 5048: /* TODO(shess) This "solution" is not satisfactory. Really, there
! 5049: ** should be check-in function for all statement handles which
! 5050: ** arranges to call sqlite3_reset(). This most likely will require
! 5051: ** modification to control flow all over the place, though, so for now
! 5052: ** just punt.
! 5053: **
! 5054: ** Note the the current system assumes that segment merges will run to
! 5055: ** completion, which is why this particular probably hasn't arisen in
! 5056: ** this case. Probably a brittle assumption.
! 5057: */
! 5058: static int leavesReaderReset(LeavesReader *pReader){
! 5059: return sqlite3_reset(pReader->pStmt);
! 5060: }
! 5061:
! 5062: static void leavesReaderDestroy(LeavesReader *pReader){
! 5063: /* If idx is -1, that means we're using a non-cached statement
! 5064: ** handle in the optimize() case, so we need to release it.
! 5065: */
! 5066: if( pReader->pStmt!=NULL && pReader->idx==-1 ){
! 5067: sqlite3_finalize(pReader->pStmt);
! 5068: }
! 5069: leafReaderDestroy(&pReader->leafReader);
! 5070: dataBufferDestroy(&pReader->rootData);
! 5071: SCRAMBLE(pReader);
! 5072: }
! 5073:
! 5074: /* Initialize pReader with the given root data (if iStartBlockid==0
! 5075: ** the leaf data was entirely contained in the root), or from the
! 5076: ** stream of blocks between iStartBlockid and iEndBlockid, inclusive.
! 5077: */
! 5078: static int leavesReaderInit(fulltext_vtab *v,
! 5079: int idx,
! 5080: sqlite_int64 iStartBlockid,
! 5081: sqlite_int64 iEndBlockid,
! 5082: const char *pRootData, int nRootData,
! 5083: LeavesReader *pReader){
! 5084: CLEAR(pReader);
! 5085: pReader->idx = idx;
! 5086:
! 5087: dataBufferInit(&pReader->rootData, 0);
! 5088: if( iStartBlockid==0 ){
! 5089: /* Entire leaf level fit in root data. */
! 5090: dataBufferReplace(&pReader->rootData, pRootData, nRootData);
! 5091: leafReaderInit(pReader->rootData.pData, pReader->rootData.nData,
! 5092: &pReader->leafReader);
! 5093: }else{
! 5094: sqlite3_stmt *s;
! 5095: int rc = sql_get_leaf_statement(v, idx, &s);
! 5096: if( rc!=SQLITE_OK ) return rc;
! 5097:
! 5098: rc = sqlite3_bind_int64(s, 1, iStartBlockid);
! 5099: if( rc!=SQLITE_OK ) return rc;
! 5100:
! 5101: rc = sqlite3_bind_int64(s, 2, iEndBlockid);
! 5102: if( rc!=SQLITE_OK ) return rc;
! 5103:
! 5104: rc = sqlite3_step(s);
! 5105: if( rc==SQLITE_DONE ){
! 5106: pReader->eof = 1;
! 5107: return SQLITE_OK;
! 5108: }
! 5109: if( rc!=SQLITE_ROW ) return rc;
! 5110:
! 5111: pReader->pStmt = s;
! 5112: leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0),
! 5113: sqlite3_column_bytes(pReader->pStmt, 0),
! 5114: &pReader->leafReader);
! 5115: }
! 5116: return SQLITE_OK;
! 5117: }
! 5118:
! 5119: /* Step the current leaf forward to the next term. If we reach the
! 5120: ** end of the current leaf, step forward to the next leaf block.
! 5121: */
! 5122: static int leavesReaderStep(fulltext_vtab *v, LeavesReader *pReader){
! 5123: assert( !leavesReaderAtEnd(pReader) );
! 5124: leafReaderStep(&pReader->leafReader);
! 5125:
! 5126: if( leafReaderAtEnd(&pReader->leafReader) ){
! 5127: int rc;
! 5128: if( pReader->rootData.pData ){
! 5129: pReader->eof = 1;
! 5130: return SQLITE_OK;
! 5131: }
! 5132: rc = sqlite3_step(pReader->pStmt);
! 5133: if( rc!=SQLITE_ROW ){
! 5134: pReader->eof = 1;
! 5135: return rc==SQLITE_DONE ? SQLITE_OK : rc;
! 5136: }
! 5137: leafReaderDestroy(&pReader->leafReader);
! 5138: leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0),
! 5139: sqlite3_column_bytes(pReader->pStmt, 0),
! 5140: &pReader->leafReader);
! 5141: }
! 5142: return SQLITE_OK;
! 5143: }
! 5144:
! 5145: /* Order LeavesReaders by their term, ignoring idx. Readers at eof
! 5146: ** always sort to the end.
! 5147: */
! 5148: static int leavesReaderTermCmp(LeavesReader *lr1, LeavesReader *lr2){
! 5149: if( leavesReaderAtEnd(lr1) ){
! 5150: if( leavesReaderAtEnd(lr2) ) return 0;
! 5151: return 1;
! 5152: }
! 5153: if( leavesReaderAtEnd(lr2) ) return -1;
! 5154:
! 5155: return leafReaderTermCmp(&lr1->leafReader,
! 5156: leavesReaderTerm(lr2), leavesReaderTermBytes(lr2),
! 5157: 0);
! 5158: }
! 5159:
! 5160: /* Similar to leavesReaderTermCmp(), with additional ordering by idx
! 5161: ** so that older segments sort before newer segments.
! 5162: */
! 5163: static int leavesReaderCmp(LeavesReader *lr1, LeavesReader *lr2){
! 5164: int c = leavesReaderTermCmp(lr1, lr2);
! 5165: if( c!=0 ) return c;
! 5166: return lr1->idx-lr2->idx;
! 5167: }
! 5168:
! 5169: /* Assume that pLr[1]..pLr[nLr] are sorted. Bubble pLr[0] into its
! 5170: ** sorted position.
! 5171: */
! 5172: static void leavesReaderReorder(LeavesReader *pLr, int nLr){
! 5173: while( nLr>1 && leavesReaderCmp(pLr, pLr+1)>0 ){
! 5174: LeavesReader tmp = pLr[0];
! 5175: pLr[0] = pLr[1];
! 5176: pLr[1] = tmp;
! 5177: nLr--;
! 5178: pLr++;
! 5179: }
! 5180: }
! 5181:
! 5182: /* Initializes pReaders with the segments from level iLevel, returning
! 5183: ** the number of segments in *piReaders. Leaves pReaders in sorted
! 5184: ** order.
! 5185: */
! 5186: static int leavesReadersInit(fulltext_vtab *v, int iLevel,
! 5187: LeavesReader *pReaders, int *piReaders){
! 5188: sqlite3_stmt *s;
! 5189: int i, rc = sql_get_statement(v, SEGDIR_SELECT_LEVEL_STMT, &s);
! 5190: if( rc!=SQLITE_OK ) return rc;
! 5191:
! 5192: rc = sqlite3_bind_int(s, 1, iLevel);
! 5193: if( rc!=SQLITE_OK ) return rc;
! 5194:
! 5195: i = 0;
! 5196: while( (rc = sqlite3_step(s))==SQLITE_ROW ){
! 5197: sqlite_int64 iStart = sqlite3_column_int64(s, 0);
! 5198: sqlite_int64 iEnd = sqlite3_column_int64(s, 1);
! 5199: const char *pRootData = sqlite3_column_blob(s, 2);
! 5200: int nRootData = sqlite3_column_bytes(s, 2);
! 5201:
! 5202: assert( i<MERGE_COUNT );
! 5203: rc = leavesReaderInit(v, i, iStart, iEnd, pRootData, nRootData,
! 5204: &pReaders[i]);
! 5205: if( rc!=SQLITE_OK ) break;
! 5206:
! 5207: i++;
! 5208: }
! 5209: if( rc!=SQLITE_DONE ){
! 5210: while( i-->0 ){
! 5211: leavesReaderDestroy(&pReaders[i]);
! 5212: }
! 5213: return rc;
! 5214: }
! 5215:
! 5216: *piReaders = i;
! 5217:
! 5218: /* Leave our results sorted by term, then age. */
! 5219: while( i-- ){
! 5220: leavesReaderReorder(pReaders+i, *piReaders-i);
! 5221: }
! 5222: return SQLITE_OK;
! 5223: }
! 5224:
! 5225: /* Merge doclists from pReaders[nReaders] into a single doclist, which
! 5226: ** is written to pWriter. Assumes pReaders is ordered oldest to
! 5227: ** newest.
! 5228: */
! 5229: /* TODO(shess) Consider putting this inline in segmentMerge(). */
! 5230: static int leavesReadersMerge(fulltext_vtab *v,
! 5231: LeavesReader *pReaders, int nReaders,
! 5232: LeafWriter *pWriter){
! 5233: DLReader dlReaders[MERGE_COUNT];
! 5234: const char *pTerm = leavesReaderTerm(pReaders);
! 5235: int i, nTerm = leavesReaderTermBytes(pReaders);
! 5236:
! 5237: assert( nReaders<=MERGE_COUNT );
! 5238:
! 5239: for(i=0; i<nReaders; i++){
! 5240: dlrInit(&dlReaders[i], DL_DEFAULT,
! 5241: leavesReaderData(pReaders+i),
! 5242: leavesReaderDataBytes(pReaders+i));
! 5243: }
! 5244:
! 5245: return leafWriterStepMerge(v, pWriter, pTerm, nTerm, dlReaders, nReaders);
! 5246: }
! 5247:
! 5248: /* Forward ref due to mutual recursion with segdirNextIndex(). */
! 5249: static int segmentMerge(fulltext_vtab *v, int iLevel);
! 5250:
! 5251: /* Put the next available index at iLevel into *pidx. If iLevel
! 5252: ** already has MERGE_COUNT segments, they are merged to a higher
! 5253: ** level to make room.
! 5254: */
! 5255: static int segdirNextIndex(fulltext_vtab *v, int iLevel, int *pidx){
! 5256: int rc = segdir_max_index(v, iLevel, pidx);
! 5257: if( rc==SQLITE_DONE ){ /* No segments at iLevel. */
! 5258: *pidx = 0;
! 5259: }else if( rc==SQLITE_ROW ){
! 5260: if( *pidx==(MERGE_COUNT-1) ){
! 5261: rc = segmentMerge(v, iLevel);
! 5262: if( rc!=SQLITE_OK ) return rc;
! 5263: *pidx = 0;
! 5264: }else{
! 5265: (*pidx)++;
! 5266: }
! 5267: }else{
! 5268: return rc;
! 5269: }
! 5270: return SQLITE_OK;
! 5271: }
! 5272:
! 5273: /* Merge MERGE_COUNT segments at iLevel into a new segment at
! 5274: ** iLevel+1. If iLevel+1 is already full of segments, those will be
! 5275: ** merged to make room.
! 5276: */
! 5277: static int segmentMerge(fulltext_vtab *v, int iLevel){
! 5278: LeafWriter writer;
! 5279: LeavesReader lrs[MERGE_COUNT];
! 5280: int i, rc, idx = 0;
! 5281:
! 5282: /* Determine the next available segment index at the next level,
! 5283: ** merging as necessary.
! 5284: */
! 5285: rc = segdirNextIndex(v, iLevel+1, &idx);
! 5286: if( rc!=SQLITE_OK ) return rc;
! 5287:
! 5288: /* TODO(shess) This assumes that we'll always see exactly
! 5289: ** MERGE_COUNT segments to merge at a given level. That will be
! 5290: ** broken if we allow the developer to request preemptive or
! 5291: ** deferred merging.
! 5292: */
! 5293: memset(&lrs, '\0', sizeof(lrs));
! 5294: rc = leavesReadersInit(v, iLevel, lrs, &i);
! 5295: if( rc!=SQLITE_OK ) return rc;
! 5296: assert( i==MERGE_COUNT );
! 5297:
! 5298: leafWriterInit(iLevel+1, idx, &writer);
! 5299:
! 5300: /* Since leavesReaderReorder() pushes readers at eof to the end,
! 5301: ** when the first reader is empty, all will be empty.
! 5302: */
! 5303: while( !leavesReaderAtEnd(lrs) ){
! 5304: /* Figure out how many readers share their next term. */
! 5305: for(i=1; i<MERGE_COUNT && !leavesReaderAtEnd(lrs+i); i++){
! 5306: if( 0!=leavesReaderTermCmp(lrs, lrs+i) ) break;
! 5307: }
! 5308:
! 5309: rc = leavesReadersMerge(v, lrs, i, &writer);
! 5310: if( rc!=SQLITE_OK ) goto err;
! 5311:
! 5312: /* Step forward those that were merged. */
! 5313: while( i-->0 ){
! 5314: rc = leavesReaderStep(v, lrs+i);
! 5315: if( rc!=SQLITE_OK ) goto err;
! 5316:
! 5317: /* Reorder by term, then by age. */
! 5318: leavesReaderReorder(lrs+i, MERGE_COUNT-i);
! 5319: }
! 5320: }
! 5321:
! 5322: for(i=0; i<MERGE_COUNT; i++){
! 5323: leavesReaderDestroy(&lrs[i]);
! 5324: }
! 5325:
! 5326: rc = leafWriterFinalize(v, &writer);
! 5327: leafWriterDestroy(&writer);
! 5328: if( rc!=SQLITE_OK ) return rc;
! 5329:
! 5330: /* Delete the merged segment data. */
! 5331: return segdir_delete(v, iLevel);
! 5332:
! 5333: err:
! 5334: for(i=0; i<MERGE_COUNT; i++){
! 5335: leavesReaderDestroy(&lrs[i]);
! 5336: }
! 5337: leafWriterDestroy(&writer);
! 5338: return rc;
! 5339: }
! 5340:
! 5341: /* Accumulate the union of *acc and *pData into *acc. */
! 5342: static void docListAccumulateUnion(DataBuffer *acc,
! 5343: const char *pData, int nData) {
! 5344: DataBuffer tmp = *acc;
! 5345: dataBufferInit(acc, tmp.nData+nData);
! 5346: docListUnion(tmp.pData, tmp.nData, pData, nData, acc);
! 5347: dataBufferDestroy(&tmp);
! 5348: }
! 5349:
! 5350: /* TODO(shess) It might be interesting to explore different merge
! 5351: ** strategies, here. For instance, since this is a sorted merge, we
! 5352: ** could easily merge many doclists in parallel. With some
! 5353: ** comprehension of the storage format, we could merge all of the
! 5354: ** doclists within a leaf node directly from the leaf node's storage.
! 5355: ** It may be worthwhile to merge smaller doclists before larger
! 5356: ** doclists, since they can be traversed more quickly - but the
! 5357: ** results may have less overlap, making them more expensive in a
! 5358: ** different way.
! 5359: */
! 5360:
! 5361: /* Scan pReader for pTerm/nTerm, and merge the term's doclist over
! 5362: ** *out (any doclists with duplicate docids overwrite those in *out).
! 5363: ** Internal function for loadSegmentLeaf().
! 5364: */
! 5365: static int loadSegmentLeavesInt(fulltext_vtab *v, LeavesReader *pReader,
! 5366: const char *pTerm, int nTerm, int isPrefix,
! 5367: DataBuffer *out){
! 5368: /* doclist data is accumulated into pBuffers similar to how one does
! 5369: ** increment in binary arithmetic. If index 0 is empty, the data is
! 5370: ** stored there. If there is data there, it is merged and the
! 5371: ** results carried into position 1, with further merge-and-carry
! 5372: ** until an empty position is found.
! 5373: */
! 5374: DataBuffer *pBuffers = NULL;
! 5375: int nBuffers = 0, nMaxBuffers = 0, rc;
! 5376:
! 5377: assert( nTerm>0 );
! 5378:
! 5379: for(rc=SQLITE_OK; rc==SQLITE_OK && !leavesReaderAtEnd(pReader);
! 5380: rc=leavesReaderStep(v, pReader)){
! 5381: /* TODO(shess) Really want leavesReaderTermCmp(), but that name is
! 5382: ** already taken to compare the terms of two LeavesReaders. Think
! 5383: ** on a better name. [Meanwhile, break encapsulation rather than
! 5384: ** use a confusing name.]
! 5385: */
! 5386: int c = leafReaderTermCmp(&pReader->leafReader, pTerm, nTerm, isPrefix);
! 5387: if( c>0 ) break; /* Past any possible matches. */
! 5388: if( c==0 ){
! 5389: const char *pData = leavesReaderData(pReader);
! 5390: int iBuffer, nData = leavesReaderDataBytes(pReader);
! 5391:
! 5392: /* Find the first empty buffer. */
! 5393: for(iBuffer=0; iBuffer<nBuffers; ++iBuffer){
! 5394: if( 0==pBuffers[iBuffer].nData ) break;
! 5395: }
! 5396:
! 5397: /* Out of buffers, add an empty one. */
! 5398: if( iBuffer==nBuffers ){
! 5399: if( nBuffers==nMaxBuffers ){
! 5400: DataBuffer *p;
! 5401: nMaxBuffers += 20;
! 5402:
! 5403: /* Manual realloc so we can handle NULL appropriately. */
! 5404: p = sqlite3_malloc(nMaxBuffers*sizeof(*pBuffers));
! 5405: if( p==NULL ){
! 5406: rc = SQLITE_NOMEM;
! 5407: break;
! 5408: }
! 5409:
! 5410: if( nBuffers>0 ){
! 5411: assert(pBuffers!=NULL);
! 5412: memcpy(p, pBuffers, nBuffers*sizeof(*pBuffers));
! 5413: sqlite3_free(pBuffers);
! 5414: }
! 5415: pBuffers = p;
! 5416: }
! 5417: dataBufferInit(&(pBuffers[nBuffers]), 0);
! 5418: nBuffers++;
! 5419: }
! 5420:
! 5421: /* At this point, must have an empty at iBuffer. */
! 5422: assert(iBuffer<nBuffers && pBuffers[iBuffer].nData==0);
! 5423:
! 5424: /* If empty was first buffer, no need for merge logic. */
! 5425: if( iBuffer==0 ){
! 5426: dataBufferReplace(&(pBuffers[0]), pData, nData);
! 5427: }else{
! 5428: /* pAcc is the empty buffer the merged data will end up in. */
! 5429: DataBuffer *pAcc = &(pBuffers[iBuffer]);
! 5430: DataBuffer *p = &(pBuffers[0]);
! 5431:
! 5432: /* Handle position 0 specially to avoid need to prime pAcc
! 5433: ** with pData/nData.
! 5434: */
! 5435: dataBufferSwap(p, pAcc);
! 5436: docListAccumulateUnion(pAcc, pData, nData);
! 5437:
! 5438: /* Accumulate remaining doclists into pAcc. */
! 5439: for(++p; p<pAcc; ++p){
! 5440: docListAccumulateUnion(pAcc, p->pData, p->nData);
! 5441:
! 5442: /* dataBufferReset() could allow a large doclist to blow up
! 5443: ** our memory requirements.
! 5444: */
! 5445: if( p->nCapacity<1024 ){
! 5446: dataBufferReset(p);
! 5447: }else{
! 5448: dataBufferDestroy(p);
! 5449: dataBufferInit(p, 0);
! 5450: }
! 5451: }
! 5452: }
! 5453: }
! 5454: }
! 5455:
! 5456: /* Union all the doclists together into *out. */
! 5457: /* TODO(shess) What if *out is big? Sigh. */
! 5458: if( rc==SQLITE_OK && nBuffers>0 ){
! 5459: int iBuffer;
! 5460: for(iBuffer=0; iBuffer<nBuffers; ++iBuffer){
! 5461: if( pBuffers[iBuffer].nData>0 ){
! 5462: if( out->nData==0 ){
! 5463: dataBufferSwap(out, &(pBuffers[iBuffer]));
! 5464: }else{
! 5465: docListAccumulateUnion(out, pBuffers[iBuffer].pData,
! 5466: pBuffers[iBuffer].nData);
! 5467: }
! 5468: }
! 5469: }
! 5470: }
! 5471:
! 5472: while( nBuffers-- ){
! 5473: dataBufferDestroy(&(pBuffers[nBuffers]));
! 5474: }
! 5475: if( pBuffers!=NULL ) sqlite3_free(pBuffers);
! 5476:
! 5477: return rc;
! 5478: }
! 5479:
! 5480: /* Call loadSegmentLeavesInt() with pData/nData as input. */
! 5481: static int loadSegmentLeaf(fulltext_vtab *v, const char *pData, int nData,
! 5482: const char *pTerm, int nTerm, int isPrefix,
! 5483: DataBuffer *out){
! 5484: LeavesReader reader;
! 5485: int rc;
! 5486:
! 5487: assert( nData>1 );
! 5488: assert( *pData=='\0' );
! 5489: rc = leavesReaderInit(v, 0, 0, 0, pData, nData, &reader);
! 5490: if( rc!=SQLITE_OK ) return rc;
! 5491:
! 5492: rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out);
! 5493: leavesReaderReset(&reader);
! 5494: leavesReaderDestroy(&reader);
! 5495: return rc;
! 5496: }
! 5497:
! 5498: /* Call loadSegmentLeavesInt() with the leaf nodes from iStartLeaf to
! 5499: ** iEndLeaf (inclusive) as input, and merge the resulting doclist into
! 5500: ** out.
! 5501: */
! 5502: static int loadSegmentLeaves(fulltext_vtab *v,
! 5503: sqlite_int64 iStartLeaf, sqlite_int64 iEndLeaf,
! 5504: const char *pTerm, int nTerm, int isPrefix,
! 5505: DataBuffer *out){
! 5506: int rc;
! 5507: LeavesReader reader;
! 5508:
! 5509: assert( iStartLeaf<=iEndLeaf );
! 5510: rc = leavesReaderInit(v, 0, iStartLeaf, iEndLeaf, NULL, 0, &reader);
! 5511: if( rc!=SQLITE_OK ) return rc;
! 5512:
! 5513: rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out);
! 5514: leavesReaderReset(&reader);
! 5515: leavesReaderDestroy(&reader);
! 5516: return rc;
! 5517: }
! 5518:
! 5519: /* Taking pData/nData as an interior node, find the sequence of child
! 5520: ** nodes which could include pTerm/nTerm/isPrefix. Note that the
! 5521: ** interior node terms logically come between the blocks, so there is
! 5522: ** one more blockid than there are terms (that block contains terms >=
! 5523: ** the last interior-node term).
! 5524: */
! 5525: /* TODO(shess) The calling code may already know that the end child is
! 5526: ** not worth calculating, because the end may be in a later sibling
! 5527: ** node. Consider whether breaking symmetry is worthwhile. I suspect
! 5528: ** it is not worthwhile.
! 5529: */
! 5530: static void getChildrenContaining(const char *pData, int nData,
! 5531: const char *pTerm, int nTerm, int isPrefix,
! 5532: sqlite_int64 *piStartChild,
! 5533: sqlite_int64 *piEndChild){
! 5534: InteriorReader reader;
! 5535:
! 5536: assert( nData>1 );
! 5537: assert( *pData!='\0' );
! 5538: interiorReaderInit(pData, nData, &reader);
! 5539:
! 5540: /* Scan for the first child which could contain pTerm/nTerm. */
! 5541: while( !interiorReaderAtEnd(&reader) ){
! 5542: if( interiorReaderTermCmp(&reader, pTerm, nTerm, 0)>0 ) break;
! 5543: interiorReaderStep(&reader);
! 5544: }
! 5545: *piStartChild = interiorReaderCurrentBlockid(&reader);
! 5546:
! 5547: /* Keep scanning to find a term greater than our term, using prefix
! 5548: ** comparison if indicated. If isPrefix is false, this will be the
! 5549: ** same blockid as the starting block.
! 5550: */
! 5551: while( !interiorReaderAtEnd(&reader) ){
! 5552: if( interiorReaderTermCmp(&reader, pTerm, nTerm, isPrefix)>0 ) break;
! 5553: interiorReaderStep(&reader);
! 5554: }
! 5555: *piEndChild = interiorReaderCurrentBlockid(&reader);
! 5556:
! 5557: interiorReaderDestroy(&reader);
! 5558:
! 5559: /* Children must ascend, and if !prefix, both must be the same. */
! 5560: assert( *piEndChild>=*piStartChild );
! 5561: assert( isPrefix || *piStartChild==*piEndChild );
! 5562: }
! 5563:
! 5564: /* Read block at iBlockid and pass it with other params to
! 5565: ** getChildrenContaining().
! 5566: */
! 5567: static int loadAndGetChildrenContaining(
! 5568: fulltext_vtab *v,
! 5569: sqlite_int64 iBlockid,
! 5570: const char *pTerm, int nTerm, int isPrefix,
! 5571: sqlite_int64 *piStartChild, sqlite_int64 *piEndChild
! 5572: ){
! 5573: sqlite3_stmt *s = NULL;
! 5574: int rc;
! 5575:
! 5576: assert( iBlockid!=0 );
! 5577: assert( pTerm!=NULL );
! 5578: assert( nTerm!=0 ); /* TODO(shess) Why not allow this? */
! 5579: assert( piStartChild!=NULL );
! 5580: assert( piEndChild!=NULL );
! 5581:
! 5582: rc = sql_get_statement(v, BLOCK_SELECT_STMT, &s);
! 5583: if( rc!=SQLITE_OK ) return rc;
! 5584:
! 5585: rc = sqlite3_bind_int64(s, 1, iBlockid);
! 5586: if( rc!=SQLITE_OK ) return rc;
! 5587:
! 5588: rc = sqlite3_step(s);
! 5589: if( rc==SQLITE_DONE ) return SQLITE_ERROR;
! 5590: if( rc!=SQLITE_ROW ) return rc;
! 5591:
! 5592: getChildrenContaining(sqlite3_column_blob(s, 0), sqlite3_column_bytes(s, 0),
! 5593: pTerm, nTerm, isPrefix, piStartChild, piEndChild);
! 5594:
! 5595: /* We expect only one row. We must execute another sqlite3_step()
! 5596: * to complete the iteration; otherwise the table will remain
! 5597: * locked. */
! 5598: rc = sqlite3_step(s);
! 5599: if( rc==SQLITE_ROW ) return SQLITE_ERROR;
! 5600: if( rc!=SQLITE_DONE ) return rc;
! 5601:
! 5602: return SQLITE_OK;
! 5603: }
! 5604:
! 5605: /* Traverse the tree represented by pData[nData] looking for
! 5606: ** pTerm[nTerm], placing its doclist into *out. This is internal to
! 5607: ** loadSegment() to make error-handling cleaner.
! 5608: */
! 5609: static int loadSegmentInt(fulltext_vtab *v, const char *pData, int nData,
! 5610: sqlite_int64 iLeavesEnd,
! 5611: const char *pTerm, int nTerm, int isPrefix,
! 5612: DataBuffer *out){
! 5613: /* Special case where root is a leaf. */
! 5614: if( *pData=='\0' ){
! 5615: return loadSegmentLeaf(v, pData, nData, pTerm, nTerm, isPrefix, out);
! 5616: }else{
! 5617: int rc;
! 5618: sqlite_int64 iStartChild, iEndChild;
! 5619:
! 5620: /* Process pData as an interior node, then loop down the tree
! 5621: ** until we find the set of leaf nodes to scan for the term.
! 5622: */
! 5623: getChildrenContaining(pData, nData, pTerm, nTerm, isPrefix,
! 5624: &iStartChild, &iEndChild);
! 5625: while( iStartChild>iLeavesEnd ){
! 5626: sqlite_int64 iNextStart, iNextEnd;
! 5627: rc = loadAndGetChildrenContaining(v, iStartChild, pTerm, nTerm, isPrefix,
! 5628: &iNextStart, &iNextEnd);
! 5629: if( rc!=SQLITE_OK ) return rc;
! 5630:
! 5631: /* If we've branched, follow the end branch, too. */
! 5632: if( iStartChild!=iEndChild ){
! 5633: sqlite_int64 iDummy;
! 5634: rc = loadAndGetChildrenContaining(v, iEndChild, pTerm, nTerm, isPrefix,
! 5635: &iDummy, &iNextEnd);
! 5636: if( rc!=SQLITE_OK ) return rc;
! 5637: }
! 5638:
! 5639: assert( iNextStart<=iNextEnd );
! 5640: iStartChild = iNextStart;
! 5641: iEndChild = iNextEnd;
! 5642: }
! 5643: assert( iStartChild<=iLeavesEnd );
! 5644: assert( iEndChild<=iLeavesEnd );
! 5645:
! 5646: /* Scan through the leaf segments for doclists. */
! 5647: return loadSegmentLeaves(v, iStartChild, iEndChild,
! 5648: pTerm, nTerm, isPrefix, out);
! 5649: }
! 5650: }
! 5651:
! 5652: /* Call loadSegmentInt() to collect the doclist for pTerm/nTerm, then
! 5653: ** merge its doclist over *out (any duplicate doclists read from the
! 5654: ** segment rooted at pData will overwrite those in *out).
! 5655: */
! 5656: /* TODO(shess) Consider changing this to determine the depth of the
! 5657: ** leaves using either the first characters of interior nodes (when
! 5658: ** ==1, we're one level above the leaves), or the first character of
! 5659: ** the root (which will describe the height of the tree directly).
! 5660: ** Either feels somewhat tricky to me.
! 5661: */
! 5662: /* TODO(shess) The current merge is likely to be slow for large
! 5663: ** doclists (though it should process from newest/smallest to
! 5664: ** oldest/largest, so it may not be that bad). It might be useful to
! 5665: ** modify things to allow for N-way merging. This could either be
! 5666: ** within a segment, with pairwise merges across segments, or across
! 5667: ** all segments at once.
! 5668: */
! 5669: static int loadSegment(fulltext_vtab *v, const char *pData, int nData,
! 5670: sqlite_int64 iLeavesEnd,
! 5671: const char *pTerm, int nTerm, int isPrefix,
! 5672: DataBuffer *out){
! 5673: DataBuffer result;
! 5674: int rc;
! 5675:
! 5676: assert( nData>1 );
! 5677:
! 5678: /* This code should never be called with buffered updates. */
! 5679: assert( v->nPendingData<0 );
! 5680:
! 5681: dataBufferInit(&result, 0);
! 5682: rc = loadSegmentInt(v, pData, nData, iLeavesEnd,
! 5683: pTerm, nTerm, isPrefix, &result);
! 5684: if( rc==SQLITE_OK && result.nData>0 ){
! 5685: if( out->nData==0 ){
! 5686: DataBuffer tmp = *out;
! 5687: *out = result;
! 5688: result = tmp;
! 5689: }else{
! 5690: DataBuffer merged;
! 5691: DLReader readers[2];
! 5692:
! 5693: dlrInit(&readers[0], DL_DEFAULT, out->pData, out->nData);
! 5694: dlrInit(&readers[1], DL_DEFAULT, result.pData, result.nData);
! 5695: dataBufferInit(&merged, out->nData+result.nData);
! 5696: docListMerge(&merged, readers, 2);
! 5697: dataBufferDestroy(out);
! 5698: *out = merged;
! 5699: dlrDestroy(&readers[0]);
! 5700: dlrDestroy(&readers[1]);
! 5701: }
! 5702: }
! 5703: dataBufferDestroy(&result);
! 5704: return rc;
! 5705: }
! 5706:
! 5707: /* Scan the database and merge together the posting lists for the term
! 5708: ** into *out.
! 5709: */
! 5710: static int termSelect(fulltext_vtab *v, int iColumn,
! 5711: const char *pTerm, int nTerm, int isPrefix,
! 5712: DocListType iType, DataBuffer *out){
! 5713: DataBuffer doclist;
! 5714: sqlite3_stmt *s;
! 5715: int rc = sql_get_statement(v, SEGDIR_SELECT_ALL_STMT, &s);
! 5716: if( rc!=SQLITE_OK ) return rc;
! 5717:
! 5718: /* This code should never be called with buffered updates. */
! 5719: assert( v->nPendingData<0 );
! 5720:
! 5721: dataBufferInit(&doclist, 0);
! 5722:
! 5723: /* Traverse the segments from oldest to newest so that newer doclist
! 5724: ** elements for given docids overwrite older elements.
! 5725: */
! 5726: while( (rc = sqlite3_step(s))==SQLITE_ROW ){
! 5727: const char *pData = sqlite3_column_blob(s, 2);
! 5728: const int nData = sqlite3_column_bytes(s, 2);
! 5729: const sqlite_int64 iLeavesEnd = sqlite3_column_int64(s, 1);
! 5730: rc = loadSegment(v, pData, nData, iLeavesEnd, pTerm, nTerm, isPrefix,
! 5731: &doclist);
! 5732: if( rc!=SQLITE_OK ) goto err;
! 5733: }
! 5734: if( rc==SQLITE_DONE ){
! 5735: if( doclist.nData!=0 ){
! 5736: /* TODO(shess) The old term_select_all() code applied the column
! 5737: ** restrict as we merged segments, leading to smaller buffers.
! 5738: ** This is probably worthwhile to bring back, once the new storage
! 5739: ** system is checked in.
! 5740: */
! 5741: if( iColumn==v->nColumn) iColumn = -1;
! 5742: docListTrim(DL_DEFAULT, doclist.pData, doclist.nData,
! 5743: iColumn, iType, out);
! 5744: }
! 5745: rc = SQLITE_OK;
! 5746: }
! 5747:
! 5748: err:
! 5749: dataBufferDestroy(&doclist);
! 5750: return rc;
! 5751: }
! 5752:
! 5753: /****************************************************************/
! 5754: /* Used to hold hashtable data for sorting. */
! 5755: typedef struct TermData {
! 5756: const char *pTerm;
! 5757: int nTerm;
! 5758: DLCollector *pCollector;
! 5759: } TermData;
! 5760:
! 5761: /* Orders TermData elements in strcmp fashion ( <0 for less-than, 0
! 5762: ** for equal, >0 for greater-than).
! 5763: */
! 5764: static int termDataCmp(const void *av, const void *bv){
! 5765: const TermData *a = (const TermData *)av;
! 5766: const TermData *b = (const TermData *)bv;
! 5767: int n = a->nTerm<b->nTerm ? a->nTerm : b->nTerm;
! 5768: int c = memcmp(a->pTerm, b->pTerm, n);
! 5769: if( c!=0 ) return c;
! 5770: return a->nTerm-b->nTerm;
! 5771: }
! 5772:
! 5773: /* Order pTerms data by term, then write a new level 0 segment using
! 5774: ** LeafWriter.
! 5775: */
! 5776: static int writeZeroSegment(fulltext_vtab *v, fts2Hash *pTerms){
! 5777: fts2HashElem *e;
! 5778: int idx, rc, i, n;
! 5779: TermData *pData;
! 5780: LeafWriter writer;
! 5781: DataBuffer dl;
! 5782:
! 5783: /* Determine the next index at level 0, merging as necessary. */
! 5784: rc = segdirNextIndex(v, 0, &idx);
! 5785: if( rc!=SQLITE_OK ) return rc;
! 5786:
! 5787: n = fts2HashCount(pTerms);
! 5788: pData = sqlite3_malloc(n*sizeof(TermData));
! 5789:
! 5790: for(i = 0, e = fts2HashFirst(pTerms); e; i++, e = fts2HashNext(e)){
! 5791: assert( i<n );
! 5792: pData[i].pTerm = fts2HashKey(e);
! 5793: pData[i].nTerm = fts2HashKeysize(e);
! 5794: pData[i].pCollector = fts2HashData(e);
! 5795: }
! 5796: assert( i==n );
! 5797:
! 5798: /* TODO(shess) Should we allow user-defined collation sequences,
! 5799: ** here? I think we only need that once we support prefix searches.
! 5800: */
! 5801: if( n>1 ) qsort(pData, n, sizeof(*pData), termDataCmp);
! 5802:
! 5803: /* TODO(shess) Refactor so that we can write directly to the segment
! 5804: ** DataBuffer, as happens for segment merges.
! 5805: */
! 5806: leafWriterInit(0, idx, &writer);
! 5807: dataBufferInit(&dl, 0);
! 5808: for(i=0; i<n; i++){
! 5809: dataBufferReset(&dl);
! 5810: dlcAddDoclist(pData[i].pCollector, &dl);
! 5811: rc = leafWriterStep(v, &writer,
! 5812: pData[i].pTerm, pData[i].nTerm, dl.pData, dl.nData);
! 5813: if( rc!=SQLITE_OK ) goto err;
! 5814: }
! 5815: rc = leafWriterFinalize(v, &writer);
! 5816:
! 5817: err:
! 5818: dataBufferDestroy(&dl);
! 5819: sqlite3_free(pData);
! 5820: leafWriterDestroy(&writer);
! 5821: return rc;
! 5822: }
! 5823:
! 5824: /* If pendingTerms has data, free it. */
! 5825: static int clearPendingTerms(fulltext_vtab *v){
! 5826: if( v->nPendingData>=0 ){
! 5827: fts2HashElem *e;
! 5828: for(e=fts2HashFirst(&v->pendingTerms); e; e=fts2HashNext(e)){
! 5829: dlcDelete(fts2HashData(e));
! 5830: }
! 5831: fts2HashClear(&v->pendingTerms);
! 5832: v->nPendingData = -1;
! 5833: }
! 5834: return SQLITE_OK;
! 5835: }
! 5836:
! 5837: /* If pendingTerms has data, flush it to a level-zero segment, and
! 5838: ** free it.
! 5839: */
! 5840: static int flushPendingTerms(fulltext_vtab *v){
! 5841: if( v->nPendingData>=0 ){
! 5842: int rc = writeZeroSegment(v, &v->pendingTerms);
! 5843: if( rc==SQLITE_OK ) clearPendingTerms(v);
! 5844: return rc;
! 5845: }
! 5846: return SQLITE_OK;
! 5847: }
! 5848:
! 5849: /* If pendingTerms is "too big", or docid is out of order, flush it.
! 5850: ** Regardless, be certain that pendingTerms is initialized for use.
! 5851: */
! 5852: static int initPendingTerms(fulltext_vtab *v, sqlite_int64 iDocid){
! 5853: /* TODO(shess) Explore whether partially flushing the buffer on
! 5854: ** forced-flush would provide better performance. I suspect that if
! 5855: ** we ordered the doclists by size and flushed the largest until the
! 5856: ** buffer was half empty, that would let the less frequent terms
! 5857: ** generate longer doclists.
! 5858: */
! 5859: if( iDocid<=v->iPrevDocid || v->nPendingData>kPendingThreshold ){
! 5860: int rc = flushPendingTerms(v);
! 5861: if( rc!=SQLITE_OK ) return rc;
! 5862: }
! 5863: if( v->nPendingData<0 ){
! 5864: fts2HashInit(&v->pendingTerms, FTS2_HASH_STRING, 1);
! 5865: v->nPendingData = 0;
! 5866: }
! 5867: v->iPrevDocid = iDocid;
! 5868: return SQLITE_OK;
! 5869: }
! 5870:
! 5871: /* This function implements the xUpdate callback; it is the top-level entry
! 5872: * point for inserting, deleting or updating a row in a full-text table. */
! 5873: static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg,
! 5874: sqlite_int64 *pRowid){
! 5875: fulltext_vtab *v = (fulltext_vtab *) pVtab;
! 5876: int rc;
! 5877:
! 5878: TRACE(("FTS2 Update %p\n", pVtab));
! 5879:
! 5880: if( nArg<2 ){
! 5881: rc = index_delete(v, sqlite3_value_int64(ppArg[0]));
! 5882: if( rc==SQLITE_OK ){
! 5883: /* If we just deleted the last row in the table, clear out the
! 5884: ** index data.
! 5885: */
! 5886: rc = content_exists(v);
! 5887: if( rc==SQLITE_ROW ){
! 5888: rc = SQLITE_OK;
! 5889: }else if( rc==SQLITE_DONE ){
! 5890: /* Clear the pending terms so we don't flush a useless level-0
! 5891: ** segment when the transaction closes.
! 5892: */
! 5893: rc = clearPendingTerms(v);
! 5894: if( rc==SQLITE_OK ){
! 5895: rc = segdir_delete_all(v);
! 5896: }
! 5897: }
! 5898: }
! 5899: } else if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){
! 5900: /* An update:
! 5901: * ppArg[0] = old rowid
! 5902: * ppArg[1] = new rowid
! 5903: * ppArg[2..2+v->nColumn-1] = values
! 5904: * ppArg[2+v->nColumn] = value for magic column (we ignore this)
! 5905: */
! 5906: sqlite_int64 rowid = sqlite3_value_int64(ppArg[0]);
! 5907: if( sqlite3_value_type(ppArg[1]) != SQLITE_INTEGER ||
! 5908: sqlite3_value_int64(ppArg[1]) != rowid ){
! 5909: rc = SQLITE_ERROR; /* we don't allow changing the rowid */
! 5910: } else {
! 5911: assert( nArg==2+v->nColumn+1);
! 5912: rc = index_update(v, rowid, &ppArg[2]);
! 5913: }
! 5914: } else {
! 5915: /* An insert:
! 5916: * ppArg[1] = requested rowid
! 5917: * ppArg[2..2+v->nColumn-1] = values
! 5918: * ppArg[2+v->nColumn] = value for magic column (we ignore this)
! 5919: */
! 5920: assert( nArg==2+v->nColumn+1);
! 5921: rc = index_insert(v, ppArg[1], &ppArg[2], pRowid);
! 5922: }
! 5923:
! 5924: return rc;
! 5925: }
! 5926:
! 5927: static int fulltextSync(sqlite3_vtab *pVtab){
! 5928: TRACE(("FTS2 xSync()\n"));
! 5929: return flushPendingTerms((fulltext_vtab *)pVtab);
! 5930: }
! 5931:
! 5932: static int fulltextBegin(sqlite3_vtab *pVtab){
! 5933: fulltext_vtab *v = (fulltext_vtab *) pVtab;
! 5934: TRACE(("FTS2 xBegin()\n"));
! 5935:
! 5936: /* Any buffered updates should have been cleared by the previous
! 5937: ** transaction.
! 5938: */
! 5939: assert( v->nPendingData<0 );
! 5940: return clearPendingTerms(v);
! 5941: }
! 5942:
! 5943: static int fulltextCommit(sqlite3_vtab *pVtab){
! 5944: fulltext_vtab *v = (fulltext_vtab *) pVtab;
! 5945: TRACE(("FTS2 xCommit()\n"));
! 5946:
! 5947: /* Buffered updates should have been cleared by fulltextSync(). */
! 5948: assert( v->nPendingData<0 );
! 5949: return clearPendingTerms(v);
! 5950: }
! 5951:
! 5952: static int fulltextRollback(sqlite3_vtab *pVtab){
! 5953: TRACE(("FTS2 xRollback()\n"));
! 5954: return clearPendingTerms((fulltext_vtab *)pVtab);
! 5955: }
! 5956:
! 5957: /*
! 5958: ** Implementation of the snippet() function for FTS2
! 5959: */
! 5960: static void snippetFunc(
! 5961: sqlite3_context *pContext,
! 5962: int argc,
! 5963: sqlite3_value **argv
! 5964: ){
! 5965: fulltext_cursor *pCursor;
! 5966: if( argc<1 ) return;
! 5967: if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
! 5968: sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
! 5969: sqlite3_result_error(pContext, "illegal first argument to html_snippet",-1);
! 5970: }else{
! 5971: const char *zStart = "<b>";
! 5972: const char *zEnd = "</b>";
! 5973: const char *zEllipsis = "<b>...</b>";
! 5974: memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
! 5975: if( argc>=2 ){
! 5976: zStart = (const char*)sqlite3_value_text(argv[1]);
! 5977: if( argc>=3 ){
! 5978: zEnd = (const char*)sqlite3_value_text(argv[2]);
! 5979: if( argc>=4 ){
! 5980: zEllipsis = (const char*)sqlite3_value_text(argv[3]);
! 5981: }
! 5982: }
! 5983: }
! 5984: snippetAllOffsets(pCursor);
! 5985: snippetText(pCursor, zStart, zEnd, zEllipsis);
! 5986: sqlite3_result_text(pContext, pCursor->snippet.zSnippet,
! 5987: pCursor->snippet.nSnippet, SQLITE_STATIC);
! 5988: }
! 5989: }
! 5990:
! 5991: /*
! 5992: ** Implementation of the offsets() function for FTS2
! 5993: */
! 5994: static void snippetOffsetsFunc(
! 5995: sqlite3_context *pContext,
! 5996: int argc,
! 5997: sqlite3_value **argv
! 5998: ){
! 5999: fulltext_cursor *pCursor;
! 6000: if( argc<1 ) return;
! 6001: if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
! 6002: sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
! 6003: sqlite3_result_error(pContext, "illegal first argument to offsets",-1);
! 6004: }else{
! 6005: memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
! 6006: snippetAllOffsets(pCursor);
! 6007: snippetOffsetText(&pCursor->snippet);
! 6008: sqlite3_result_text(pContext,
! 6009: pCursor->snippet.zOffset, pCursor->snippet.nOffset,
! 6010: SQLITE_STATIC);
! 6011: }
! 6012: }
! 6013:
! 6014: /* OptLeavesReader is nearly identical to LeavesReader, except that
! 6015: ** where LeavesReader is geared towards the merging of complete
! 6016: ** segment levels (with exactly MERGE_COUNT segments), OptLeavesReader
! 6017: ** is geared towards implementation of the optimize() function, and
! 6018: ** can merge all segments simultaneously. This version may be
! 6019: ** somewhat less efficient than LeavesReader because it merges into an
! 6020: ** accumulator rather than doing an N-way merge, but since segment
! 6021: ** size grows exponentially (so segment count logarithmically) this is
! 6022: ** probably not an immediate problem.
! 6023: */
! 6024: /* TODO(shess): Prove that assertion, or extend the merge code to
! 6025: ** merge tree fashion (like the prefix-searching code does).
! 6026: */
! 6027: /* TODO(shess): OptLeavesReader and LeavesReader could probably be
! 6028: ** merged with little or no loss of performance for LeavesReader. The
! 6029: ** merged code would need to handle >MERGE_COUNT segments, and would
! 6030: ** also need to be able to optionally optimize away deletes.
! 6031: */
! 6032: typedef struct OptLeavesReader {
! 6033: /* Segment number, to order readers by age. */
! 6034: int segment;
! 6035: LeavesReader reader;
! 6036: } OptLeavesReader;
! 6037:
! 6038: static int optLeavesReaderAtEnd(OptLeavesReader *pReader){
! 6039: return leavesReaderAtEnd(&pReader->reader);
! 6040: }
! 6041: static int optLeavesReaderTermBytes(OptLeavesReader *pReader){
! 6042: return leavesReaderTermBytes(&pReader->reader);
! 6043: }
! 6044: static const char *optLeavesReaderData(OptLeavesReader *pReader){
! 6045: return leavesReaderData(&pReader->reader);
! 6046: }
! 6047: static int optLeavesReaderDataBytes(OptLeavesReader *pReader){
! 6048: return leavesReaderDataBytes(&pReader->reader);
! 6049: }
! 6050: static const char *optLeavesReaderTerm(OptLeavesReader *pReader){
! 6051: return leavesReaderTerm(&pReader->reader);
! 6052: }
! 6053: static int optLeavesReaderStep(fulltext_vtab *v, OptLeavesReader *pReader){
! 6054: return leavesReaderStep(v, &pReader->reader);
! 6055: }
! 6056: static int optLeavesReaderTermCmp(OptLeavesReader *lr1, OptLeavesReader *lr2){
! 6057: return leavesReaderTermCmp(&lr1->reader, &lr2->reader);
! 6058: }
! 6059: /* Order by term ascending, segment ascending (oldest to newest), with
! 6060: ** exhausted readers to the end.
! 6061: */
! 6062: static int optLeavesReaderCmp(OptLeavesReader *lr1, OptLeavesReader *lr2){
! 6063: int c = optLeavesReaderTermCmp(lr1, lr2);
! 6064: if( c!=0 ) return c;
! 6065: return lr1->segment-lr2->segment;
! 6066: }
! 6067: /* Bubble pLr[0] to appropriate place in pLr[1..nLr-1]. Assumes that
! 6068: ** pLr[1..nLr-1] is already sorted.
! 6069: */
! 6070: static void optLeavesReaderReorder(OptLeavesReader *pLr, int nLr){
! 6071: while( nLr>1 && optLeavesReaderCmp(pLr, pLr+1)>0 ){
! 6072: OptLeavesReader tmp = pLr[0];
! 6073: pLr[0] = pLr[1];
! 6074: pLr[1] = tmp;
! 6075: nLr--;
! 6076: pLr++;
! 6077: }
! 6078: }
! 6079:
! 6080: /* optimize() helper function. Put the readers in order and iterate
! 6081: ** through them, merging doclists for matching terms into pWriter.
! 6082: ** Returns SQLITE_OK on success, or the SQLite error code which
! 6083: ** prevented success.
! 6084: */
! 6085: static int optimizeInternal(fulltext_vtab *v,
! 6086: OptLeavesReader *readers, int nReaders,
! 6087: LeafWriter *pWriter){
! 6088: int i, rc = SQLITE_OK;
! 6089: DataBuffer doclist, merged, tmp;
! 6090:
! 6091: /* Order the readers. */
! 6092: i = nReaders;
! 6093: while( i-- > 0 ){
! 6094: optLeavesReaderReorder(&readers[i], nReaders-i);
! 6095: }
! 6096:
! 6097: dataBufferInit(&doclist, LEAF_MAX);
! 6098: dataBufferInit(&merged, LEAF_MAX);
! 6099:
! 6100: /* Exhausted readers bubble to the end, so when the first reader is
! 6101: ** at eof, all are at eof.
! 6102: */
! 6103: while( !optLeavesReaderAtEnd(&readers[0]) ){
! 6104:
! 6105: /* Figure out how many readers share the next term. */
! 6106: for(i=1; i<nReaders && !optLeavesReaderAtEnd(&readers[i]); i++){
! 6107: if( 0!=optLeavesReaderTermCmp(&readers[0], &readers[i]) ) break;
! 6108: }
! 6109:
! 6110: /* Special-case for no merge. */
! 6111: if( i==1 ){
! 6112: /* Trim deletions from the doclist. */
! 6113: dataBufferReset(&merged);
! 6114: docListTrim(DL_DEFAULT,
! 6115: optLeavesReaderData(&readers[0]),
! 6116: optLeavesReaderDataBytes(&readers[0]),
! 6117: -1, DL_DEFAULT, &merged);
! 6118: }else{
! 6119: DLReader dlReaders[MERGE_COUNT];
! 6120: int iReader, nReaders;
! 6121:
! 6122: /* Prime the pipeline with the first reader's doclist. After
! 6123: ** one pass index 0 will reference the accumulated doclist.
! 6124: */
! 6125: dlrInit(&dlReaders[0], DL_DEFAULT,
! 6126: optLeavesReaderData(&readers[0]),
! 6127: optLeavesReaderDataBytes(&readers[0]));
! 6128: iReader = 1;
! 6129:
! 6130: assert( iReader<i ); /* Must execute the loop at least once. */
! 6131: while( iReader<i ){
! 6132: /* Merge 16 inputs per pass. */
! 6133: for( nReaders=1; iReader<i && nReaders<MERGE_COUNT;
! 6134: iReader++, nReaders++ ){
! 6135: dlrInit(&dlReaders[nReaders], DL_DEFAULT,
! 6136: optLeavesReaderData(&readers[iReader]),
! 6137: optLeavesReaderDataBytes(&readers[iReader]));
! 6138: }
! 6139:
! 6140: /* Merge doclists and swap result into accumulator. */
! 6141: dataBufferReset(&merged);
! 6142: docListMerge(&merged, dlReaders, nReaders);
! 6143: tmp = merged;
! 6144: merged = doclist;
! 6145: doclist = tmp;
! 6146:
! 6147: while( nReaders-- > 0 ){
! 6148: dlrDestroy(&dlReaders[nReaders]);
! 6149: }
! 6150:
! 6151: /* Accumulated doclist to reader 0 for next pass. */
! 6152: dlrInit(&dlReaders[0], DL_DEFAULT, doclist.pData, doclist.nData);
! 6153: }
! 6154:
! 6155: /* Destroy reader that was left in the pipeline. */
! 6156: dlrDestroy(&dlReaders[0]);
! 6157:
! 6158: /* Trim deletions from the doclist. */
! 6159: dataBufferReset(&merged);
! 6160: docListTrim(DL_DEFAULT, doclist.pData, doclist.nData,
! 6161: -1, DL_DEFAULT, &merged);
! 6162: }
! 6163:
! 6164: /* Only pass doclists with hits (skip if all hits deleted). */
! 6165: if( merged.nData>0 ){
! 6166: rc = leafWriterStep(v, pWriter,
! 6167: optLeavesReaderTerm(&readers[0]),
! 6168: optLeavesReaderTermBytes(&readers[0]),
! 6169: merged.pData, merged.nData);
! 6170: if( rc!=SQLITE_OK ) goto err;
! 6171: }
! 6172:
! 6173: /* Step merged readers to next term and reorder. */
! 6174: while( i-- > 0 ){
! 6175: rc = optLeavesReaderStep(v, &readers[i]);
! 6176: if( rc!=SQLITE_OK ) goto err;
! 6177:
! 6178: optLeavesReaderReorder(&readers[i], nReaders-i);
! 6179: }
! 6180: }
! 6181:
! 6182: err:
! 6183: dataBufferDestroy(&doclist);
! 6184: dataBufferDestroy(&merged);
! 6185: return rc;
! 6186: }
! 6187:
! 6188: /* Implement optimize() function for FTS3. optimize(t) merges all
! 6189: ** segments in the fts index into a single segment. 't' is the magic
! 6190: ** table-named column.
! 6191: */
! 6192: static void optimizeFunc(sqlite3_context *pContext,
! 6193: int argc, sqlite3_value **argv){
! 6194: fulltext_cursor *pCursor;
! 6195: if( argc>1 ){
! 6196: sqlite3_result_error(pContext, "excess arguments to optimize()",-1);
! 6197: }else if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
! 6198: sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
! 6199: sqlite3_result_error(pContext, "illegal first argument to optimize",-1);
! 6200: }else{
! 6201: fulltext_vtab *v;
! 6202: int i, rc, iMaxLevel;
! 6203: OptLeavesReader *readers;
! 6204: int nReaders;
! 6205: LeafWriter writer;
! 6206: sqlite3_stmt *s;
! 6207:
! 6208: memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
! 6209: v = cursor_vtab(pCursor);
! 6210:
! 6211: /* Flush any buffered updates before optimizing. */
! 6212: rc = flushPendingTerms(v);
! 6213: if( rc!=SQLITE_OK ) goto err;
! 6214:
! 6215: rc = segdir_count(v, &nReaders, &iMaxLevel);
! 6216: if( rc!=SQLITE_OK ) goto err;
! 6217: if( nReaders==0 || nReaders==1 ){
! 6218: sqlite3_result_text(pContext, "Index already optimal", -1,
! 6219: SQLITE_STATIC);
! 6220: return;
! 6221: }
! 6222:
! 6223: rc = sql_get_statement(v, SEGDIR_SELECT_ALL_STMT, &s);
! 6224: if( rc!=SQLITE_OK ) goto err;
! 6225:
! 6226: readers = sqlite3_malloc(nReaders*sizeof(readers[0]));
! 6227: if( readers==NULL ) goto err;
! 6228:
! 6229: /* Note that there will already be a segment at this position
! 6230: ** until we call segdir_delete() on iMaxLevel.
! 6231: */
! 6232: leafWriterInit(iMaxLevel, 0, &writer);
! 6233:
! 6234: i = 0;
! 6235: while( (rc = sqlite3_step(s))==SQLITE_ROW ){
! 6236: sqlite_int64 iStart = sqlite3_column_int64(s, 0);
! 6237: sqlite_int64 iEnd = sqlite3_column_int64(s, 1);
! 6238: const char *pRootData = sqlite3_column_blob(s, 2);
! 6239: int nRootData = sqlite3_column_bytes(s, 2);
! 6240:
! 6241: assert( i<nReaders );
! 6242: rc = leavesReaderInit(v, -1, iStart, iEnd, pRootData, nRootData,
! 6243: &readers[i].reader);
! 6244: if( rc!=SQLITE_OK ) break;
! 6245:
! 6246: readers[i].segment = i;
! 6247: i++;
! 6248: }
! 6249:
! 6250: /* If we managed to successfully read them all, optimize them. */
! 6251: if( rc==SQLITE_DONE ){
! 6252: assert( i==nReaders );
! 6253: rc = optimizeInternal(v, readers, nReaders, &writer);
! 6254: }
! 6255:
! 6256: while( i-- > 0 ){
! 6257: leavesReaderDestroy(&readers[i].reader);
! 6258: }
! 6259: sqlite3_free(readers);
! 6260:
! 6261: /* If we've successfully gotten to here, delete the old segments
! 6262: ** and flush the interior structure of the new segment.
! 6263: */
! 6264: if( rc==SQLITE_OK ){
! 6265: for( i=0; i<=iMaxLevel; i++ ){
! 6266: rc = segdir_delete(v, i);
! 6267: if( rc!=SQLITE_OK ) break;
! 6268: }
! 6269:
! 6270: if( rc==SQLITE_OK ) rc = leafWriterFinalize(v, &writer);
! 6271: }
! 6272:
! 6273: leafWriterDestroy(&writer);
! 6274:
! 6275: if( rc!=SQLITE_OK ) goto err;
! 6276:
! 6277: sqlite3_result_text(pContext, "Index optimized", -1, SQLITE_STATIC);
! 6278: return;
! 6279:
! 6280: /* TODO(shess): Error-handling needs to be improved along the
! 6281: ** lines of the dump_ functions.
! 6282: */
! 6283: err:
! 6284: {
! 6285: char buf[512];
! 6286: sqlite3_snprintf(sizeof(buf), buf, "Error in optimize: %s",
! 6287: sqlite3_errmsg(sqlite3_context_db_handle(pContext)));
! 6288: sqlite3_result_error(pContext, buf, -1);
! 6289: }
! 6290: }
! 6291: }
! 6292:
! 6293: #ifdef SQLITE_TEST
! 6294: /* Generate an error of the form "<prefix>: <msg>". If msg is NULL,
! 6295: ** pull the error from the context's db handle.
! 6296: */
! 6297: static void generateError(sqlite3_context *pContext,
! 6298: const char *prefix, const char *msg){
! 6299: char buf[512];
! 6300: if( msg==NULL ) msg = sqlite3_errmsg(sqlite3_context_db_handle(pContext));
! 6301: sqlite3_snprintf(sizeof(buf), buf, "%s: %s", prefix, msg);
! 6302: sqlite3_result_error(pContext, buf, -1);
! 6303: }
! 6304:
! 6305: /* Helper function to collect the set of terms in the segment into
! 6306: ** pTerms. The segment is defined by the leaf nodes between
! 6307: ** iStartBlockid and iEndBlockid, inclusive, or by the contents of
! 6308: ** pRootData if iStartBlockid is 0 (in which case the entire segment
! 6309: ** fit in a leaf).
! 6310: */
! 6311: static int collectSegmentTerms(fulltext_vtab *v, sqlite3_stmt *s,
! 6312: fts2Hash *pTerms){
! 6313: const sqlite_int64 iStartBlockid = sqlite3_column_int64(s, 0);
! 6314: const sqlite_int64 iEndBlockid = sqlite3_column_int64(s, 1);
! 6315: const char *pRootData = sqlite3_column_blob(s, 2);
! 6316: const int nRootData = sqlite3_column_bytes(s, 2);
! 6317: LeavesReader reader;
! 6318: int rc = leavesReaderInit(v, 0, iStartBlockid, iEndBlockid,
! 6319: pRootData, nRootData, &reader);
! 6320: if( rc!=SQLITE_OK ) return rc;
! 6321:
! 6322: while( rc==SQLITE_OK && !leavesReaderAtEnd(&reader) ){
! 6323: const char *pTerm = leavesReaderTerm(&reader);
! 6324: const int nTerm = leavesReaderTermBytes(&reader);
! 6325: void *oldValue = sqlite3Fts2HashFind(pTerms, pTerm, nTerm);
! 6326: void *newValue = (void *)((char *)oldValue+1);
! 6327:
! 6328: /* From the comment before sqlite3Fts2HashInsert in fts2_hash.c,
! 6329: ** the data value passed is returned in case of malloc failure.
! 6330: */
! 6331: if( newValue==sqlite3Fts2HashInsert(pTerms, pTerm, nTerm, newValue) ){
! 6332: rc = SQLITE_NOMEM;
! 6333: }else{
! 6334: rc = leavesReaderStep(v, &reader);
! 6335: }
! 6336: }
! 6337:
! 6338: leavesReaderDestroy(&reader);
! 6339: return rc;
! 6340: }
! 6341:
! 6342: /* Helper function to build the result string for dump_terms(). */
! 6343: static int generateTermsResult(sqlite3_context *pContext, fts2Hash *pTerms){
! 6344: int iTerm, nTerms, nResultBytes, iByte;
! 6345: char *result;
! 6346: TermData *pData;
! 6347: fts2HashElem *e;
! 6348:
! 6349: /* Iterate pTerms to generate an array of terms in pData for
! 6350: ** sorting.
! 6351: */
! 6352: nTerms = fts2HashCount(pTerms);
! 6353: assert( nTerms>0 );
! 6354: pData = sqlite3_malloc(nTerms*sizeof(TermData));
! 6355: if( pData==NULL ) return SQLITE_NOMEM;
! 6356:
! 6357: nResultBytes = 0;
! 6358: for(iTerm = 0, e = fts2HashFirst(pTerms); e; iTerm++, e = fts2HashNext(e)){
! 6359: nResultBytes += fts2HashKeysize(e)+1; /* Term plus trailing space */
! 6360: assert( iTerm<nTerms );
! 6361: pData[iTerm].pTerm = fts2HashKey(e);
! 6362: pData[iTerm].nTerm = fts2HashKeysize(e);
! 6363: pData[iTerm].pCollector = fts2HashData(e); /* unused */
! 6364: }
! 6365: assert( iTerm==nTerms );
! 6366:
! 6367: assert( nResultBytes>0 ); /* nTerms>0, nResultsBytes must be, too. */
! 6368: result = sqlite3_malloc(nResultBytes);
! 6369: if( result==NULL ){
! 6370: sqlite3_free(pData);
! 6371: return SQLITE_NOMEM;
! 6372: }
! 6373:
! 6374: if( nTerms>1 ) qsort(pData, nTerms, sizeof(*pData), termDataCmp);
! 6375:
! 6376: /* Read the terms in order to build the result. */
! 6377: iByte = 0;
! 6378: for(iTerm=0; iTerm<nTerms; ++iTerm){
! 6379: memcpy(result+iByte, pData[iTerm].pTerm, pData[iTerm].nTerm);
! 6380: iByte += pData[iTerm].nTerm;
! 6381: result[iByte++] = ' ';
! 6382: }
! 6383: assert( iByte==nResultBytes );
! 6384: assert( result[nResultBytes-1]==' ' );
! 6385: result[nResultBytes-1] = '\0';
! 6386:
! 6387: /* Passes away ownership of result. */
! 6388: sqlite3_result_text(pContext, result, nResultBytes-1, sqlite3_free);
! 6389: sqlite3_free(pData);
! 6390: return SQLITE_OK;
! 6391: }
! 6392:
! 6393: /* Implements dump_terms() for use in inspecting the fts2 index from
! 6394: ** tests. TEXT result containing the ordered list of terms joined by
! 6395: ** spaces. dump_terms(t, level, idx) dumps the terms for the segment
! 6396: ** specified by level, idx (in %_segdir), while dump_terms(t) dumps
! 6397: ** all terms in the index. In both cases t is the fts table's magic
! 6398: ** table-named column.
! 6399: */
! 6400: static void dumpTermsFunc(
! 6401: sqlite3_context *pContext,
! 6402: int argc, sqlite3_value **argv
! 6403: ){
! 6404: fulltext_cursor *pCursor;
! 6405: if( argc!=3 && argc!=1 ){
! 6406: generateError(pContext, "dump_terms", "incorrect arguments");
! 6407: }else if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
! 6408: sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
! 6409: generateError(pContext, "dump_terms", "illegal first argument");
! 6410: }else{
! 6411: fulltext_vtab *v;
! 6412: fts2Hash terms;
! 6413: sqlite3_stmt *s = NULL;
! 6414: int rc;
! 6415:
! 6416: memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
! 6417: v = cursor_vtab(pCursor);
! 6418:
! 6419: /* If passed only the cursor column, get all segments. Otherwise
! 6420: ** get the segment described by the following two arguments.
! 6421: */
! 6422: if( argc==1 ){
! 6423: rc = sql_get_statement(v, SEGDIR_SELECT_ALL_STMT, &s);
! 6424: }else{
! 6425: rc = sql_get_statement(v, SEGDIR_SELECT_SEGMENT_STMT, &s);
! 6426: if( rc==SQLITE_OK ){
! 6427: rc = sqlite3_bind_int(s, 1, sqlite3_value_int(argv[1]));
! 6428: if( rc==SQLITE_OK ){
! 6429: rc = sqlite3_bind_int(s, 2, sqlite3_value_int(argv[2]));
! 6430: }
! 6431: }
! 6432: }
! 6433:
! 6434: if( rc!=SQLITE_OK ){
! 6435: generateError(pContext, "dump_terms", NULL);
! 6436: return;
! 6437: }
! 6438:
! 6439: /* Collect the terms for each segment. */
! 6440: sqlite3Fts2HashInit(&terms, FTS2_HASH_STRING, 1);
! 6441: while( (rc = sqlite3_step(s))==SQLITE_ROW ){
! 6442: rc = collectSegmentTerms(v, s, &terms);
! 6443: if( rc!=SQLITE_OK ) break;
! 6444: }
! 6445:
! 6446: if( rc!=SQLITE_DONE ){
! 6447: sqlite3_reset(s);
! 6448: generateError(pContext, "dump_terms", NULL);
! 6449: }else{
! 6450: const int nTerms = fts2HashCount(&terms);
! 6451: if( nTerms>0 ){
! 6452: rc = generateTermsResult(pContext, &terms);
! 6453: if( rc==SQLITE_NOMEM ){
! 6454: generateError(pContext, "dump_terms", "out of memory");
! 6455: }else{
! 6456: assert( rc==SQLITE_OK );
! 6457: }
! 6458: }else if( argc==3 ){
! 6459: /* The specific segment asked for could not be found. */
! 6460: generateError(pContext, "dump_terms", "segment not found");
! 6461: }else{
! 6462: /* No segments found. */
! 6463: /* TODO(shess): It should be impossible to reach this. This
! 6464: ** case can only happen for an empty table, in which case
! 6465: ** SQLite has no rows to call this function on.
! 6466: */
! 6467: sqlite3_result_null(pContext);
! 6468: }
! 6469: }
! 6470: sqlite3Fts2HashClear(&terms);
! 6471: }
! 6472: }
! 6473:
! 6474: /* Expand the DL_DEFAULT doclist in pData into a text result in
! 6475: ** pContext.
! 6476: */
! 6477: static void createDoclistResult(sqlite3_context *pContext,
! 6478: const char *pData, int nData){
! 6479: DataBuffer dump;
! 6480: DLReader dlReader;
! 6481:
! 6482: assert( pData!=NULL && nData>0 );
! 6483:
! 6484: dataBufferInit(&dump, 0);
! 6485: dlrInit(&dlReader, DL_DEFAULT, pData, nData);
! 6486: for( ; !dlrAtEnd(&dlReader); dlrStep(&dlReader) ){
! 6487: char buf[256];
! 6488: PLReader plReader;
! 6489:
! 6490: plrInit(&plReader, &dlReader);
! 6491: if( DL_DEFAULT==DL_DOCIDS || plrAtEnd(&plReader) ){
! 6492: sqlite3_snprintf(sizeof(buf), buf, "[%lld] ", dlrDocid(&dlReader));
! 6493: dataBufferAppend(&dump, buf, strlen(buf));
! 6494: }else{
! 6495: int iColumn = plrColumn(&plReader);
! 6496:
! 6497: sqlite3_snprintf(sizeof(buf), buf, "[%lld %d[",
! 6498: dlrDocid(&dlReader), iColumn);
! 6499: dataBufferAppend(&dump, buf, strlen(buf));
! 6500:
! 6501: for( ; !plrAtEnd(&plReader); plrStep(&plReader) ){
! 6502: if( plrColumn(&plReader)!=iColumn ){
! 6503: iColumn = plrColumn(&plReader);
! 6504: sqlite3_snprintf(sizeof(buf), buf, "] %d[", iColumn);
! 6505: assert( dump.nData>0 );
! 6506: dump.nData--; /* Overwrite trailing space. */
! 6507: assert( dump.pData[dump.nData]==' ');
! 6508: dataBufferAppend(&dump, buf, strlen(buf));
! 6509: }
! 6510: if( DL_DEFAULT==DL_POSITIONS_OFFSETS ){
! 6511: sqlite3_snprintf(sizeof(buf), buf, "%d,%d,%d ",
! 6512: plrPosition(&plReader),
! 6513: plrStartOffset(&plReader), plrEndOffset(&plReader));
! 6514: }else if( DL_DEFAULT==DL_POSITIONS ){
! 6515: sqlite3_snprintf(sizeof(buf), buf, "%d ", plrPosition(&plReader));
! 6516: }else{
! 6517: assert( NULL=="Unhandled DL_DEFAULT value");
! 6518: }
! 6519: dataBufferAppend(&dump, buf, strlen(buf));
! 6520: }
! 6521: plrDestroy(&plReader);
! 6522:
! 6523: assert( dump.nData>0 );
! 6524: dump.nData--; /* Overwrite trailing space. */
! 6525: assert( dump.pData[dump.nData]==' ');
! 6526: dataBufferAppend(&dump, "]] ", 3);
! 6527: }
! 6528: }
! 6529: dlrDestroy(&dlReader);
! 6530:
! 6531: assert( dump.nData>0 );
! 6532: dump.nData--; /* Overwrite trailing space. */
! 6533: assert( dump.pData[dump.nData]==' ');
! 6534: dump.pData[dump.nData] = '\0';
! 6535: assert( dump.nData>0 );
! 6536:
! 6537: /* Passes ownership of dump's buffer to pContext. */
! 6538: sqlite3_result_text(pContext, dump.pData, dump.nData, sqlite3_free);
! 6539: dump.pData = NULL;
! 6540: dump.nData = dump.nCapacity = 0;
! 6541: }
! 6542:
! 6543: /* Implements dump_doclist() for use in inspecting the fts2 index from
! 6544: ** tests. TEXT result containing a string representation of the
! 6545: ** doclist for the indicated term. dump_doclist(t, term, level, idx)
! 6546: ** dumps the doclist for term from the segment specified by level, idx
! 6547: ** (in %_segdir), while dump_doclist(t, term) dumps the logical
! 6548: ** doclist for the term across all segments. The per-segment doclist
! 6549: ** can contain deletions, while the full-index doclist will not
! 6550: ** (deletions are omitted).
! 6551: **
! 6552: ** Result formats differ with the setting of DL_DEFAULTS. Examples:
! 6553: **
! 6554: ** DL_DOCIDS: [1] [3] [7]
! 6555: ** DL_POSITIONS: [1 0[0 4] 1[17]] [3 1[5]]
! 6556: ** DL_POSITIONS_OFFSETS: [1 0[0,0,3 4,23,26] 1[17,102,105]] [3 1[5,20,23]]
! 6557: **
! 6558: ** In each case the number after the outer '[' is the docid. In the
! 6559: ** latter two cases, the number before the inner '[' is the column
! 6560: ** associated with the values within. For DL_POSITIONS the numbers
! 6561: ** within are the positions, for DL_POSITIONS_OFFSETS they are the
! 6562: ** position, the start offset, and the end offset.
! 6563: */
! 6564: static void dumpDoclistFunc(
! 6565: sqlite3_context *pContext,
! 6566: int argc, sqlite3_value **argv
! 6567: ){
! 6568: fulltext_cursor *pCursor;
! 6569: if( argc!=2 && argc!=4 ){
! 6570: generateError(pContext, "dump_doclist", "incorrect arguments");
! 6571: }else if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
! 6572: sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
! 6573: generateError(pContext, "dump_doclist", "illegal first argument");
! 6574: }else if( sqlite3_value_text(argv[1])==NULL ||
! 6575: sqlite3_value_text(argv[1])[0]=='\0' ){
! 6576: generateError(pContext, "dump_doclist", "empty second argument");
! 6577: }else{
! 6578: const char *pTerm = (const char *)sqlite3_value_text(argv[1]);
! 6579: const int nTerm = strlen(pTerm);
! 6580: fulltext_vtab *v;
! 6581: int rc;
! 6582: DataBuffer doclist;
! 6583:
! 6584: memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
! 6585: v = cursor_vtab(pCursor);
! 6586:
! 6587: dataBufferInit(&doclist, 0);
! 6588:
! 6589: /* termSelect() yields the same logical doclist that queries are
! 6590: ** run against.
! 6591: */
! 6592: if( argc==2 ){
! 6593: rc = termSelect(v, v->nColumn, pTerm, nTerm, 0, DL_DEFAULT, &doclist);
! 6594: }else{
! 6595: sqlite3_stmt *s = NULL;
! 6596:
! 6597: /* Get our specific segment's information. */
! 6598: rc = sql_get_statement(v, SEGDIR_SELECT_SEGMENT_STMT, &s);
! 6599: if( rc==SQLITE_OK ){
! 6600: rc = sqlite3_bind_int(s, 1, sqlite3_value_int(argv[2]));
! 6601: if( rc==SQLITE_OK ){
! 6602: rc = sqlite3_bind_int(s, 2, sqlite3_value_int(argv[3]));
! 6603: }
! 6604: }
! 6605:
! 6606: if( rc==SQLITE_OK ){
! 6607: rc = sqlite3_step(s);
! 6608:
! 6609: if( rc==SQLITE_DONE ){
! 6610: dataBufferDestroy(&doclist);
! 6611: generateError(pContext, "dump_doclist", "segment not found");
! 6612: return;
! 6613: }
! 6614:
! 6615: /* Found a segment, load it into doclist. */
! 6616: if( rc==SQLITE_ROW ){
! 6617: const sqlite_int64 iLeavesEnd = sqlite3_column_int64(s, 1);
! 6618: const char *pData = sqlite3_column_blob(s, 2);
! 6619: const int nData = sqlite3_column_bytes(s, 2);
! 6620:
! 6621: /* loadSegment() is used by termSelect() to load each
! 6622: ** segment's data.
! 6623: */
! 6624: rc = loadSegment(v, pData, nData, iLeavesEnd, pTerm, nTerm, 0,
! 6625: &doclist);
! 6626: if( rc==SQLITE_OK ){
! 6627: rc = sqlite3_step(s);
! 6628:
! 6629: /* Should not have more than one matching segment. */
! 6630: if( rc!=SQLITE_DONE ){
! 6631: sqlite3_reset(s);
! 6632: dataBufferDestroy(&doclist);
! 6633: generateError(pContext, "dump_doclist", "invalid segdir");
! 6634: return;
! 6635: }
! 6636: rc = SQLITE_OK;
! 6637: }
! 6638: }
! 6639: }
! 6640:
! 6641: sqlite3_reset(s);
! 6642: }
! 6643:
! 6644: if( rc==SQLITE_OK ){
! 6645: if( doclist.nData>0 ){
! 6646: createDoclistResult(pContext, doclist.pData, doclist.nData);
! 6647: }else{
! 6648: /* TODO(shess): This can happen if the term is not present, or
! 6649: ** if all instances of the term have been deleted and this is
! 6650: ** an all-index dump. It may be interesting to distinguish
! 6651: ** these cases.
! 6652: */
! 6653: sqlite3_result_text(pContext, "", 0, SQLITE_STATIC);
! 6654: }
! 6655: }else if( rc==SQLITE_NOMEM ){
! 6656: /* Handle out-of-memory cases specially because if they are
! 6657: ** generated in fts2 code they may not be reflected in the db
! 6658: ** handle.
! 6659: */
! 6660: /* TODO(shess): Handle this more comprehensively.
! 6661: ** sqlite3ErrStr() has what I need, but is internal.
! 6662: */
! 6663: generateError(pContext, "dump_doclist", "out of memory");
! 6664: }else{
! 6665: generateError(pContext, "dump_doclist", NULL);
! 6666: }
! 6667:
! 6668: dataBufferDestroy(&doclist);
! 6669: }
! 6670: }
! 6671: #endif
! 6672:
! 6673: /*
! 6674: ** This routine implements the xFindFunction method for the FTS2
! 6675: ** virtual table.
! 6676: */
! 6677: static int fulltextFindFunction(
! 6678: sqlite3_vtab *pVtab,
! 6679: int nArg,
! 6680: const char *zName,
! 6681: void (**pxFunc)(sqlite3_context*,int,sqlite3_value**),
! 6682: void **ppArg
! 6683: ){
! 6684: if( strcmp(zName,"snippet")==0 ){
! 6685: *pxFunc = snippetFunc;
! 6686: return 1;
! 6687: }else if( strcmp(zName,"offsets")==0 ){
! 6688: *pxFunc = snippetOffsetsFunc;
! 6689: return 1;
! 6690: }else if( strcmp(zName,"optimize")==0 ){
! 6691: *pxFunc = optimizeFunc;
! 6692: return 1;
! 6693: #ifdef SQLITE_TEST
! 6694: /* NOTE(shess): These functions are present only for testing
! 6695: ** purposes. No particular effort is made to optimize their
! 6696: ** execution or how they build their results.
! 6697: */
! 6698: }else if( strcmp(zName,"dump_terms")==0 ){
! 6699: /* fprintf(stderr, "Found dump_terms\n"); */
! 6700: *pxFunc = dumpTermsFunc;
! 6701: return 1;
! 6702: }else if( strcmp(zName,"dump_doclist")==0 ){
! 6703: /* fprintf(stderr, "Found dump_doclist\n"); */
! 6704: *pxFunc = dumpDoclistFunc;
! 6705: return 1;
! 6706: #endif
! 6707: }
! 6708: return 0;
! 6709: }
! 6710:
! 6711: /*
! 6712: ** Rename an fts2 table.
! 6713: */
! 6714: static int fulltextRename(
! 6715: sqlite3_vtab *pVtab,
! 6716: const char *zName
! 6717: ){
! 6718: fulltext_vtab *p = (fulltext_vtab *)pVtab;
! 6719: int rc = SQLITE_NOMEM;
! 6720: char *zSql = sqlite3_mprintf(
! 6721: "ALTER TABLE %Q.'%q_content' RENAME TO '%q_content';"
! 6722: "ALTER TABLE %Q.'%q_segments' RENAME TO '%q_segments';"
! 6723: "ALTER TABLE %Q.'%q_segdir' RENAME TO '%q_segdir';"
! 6724: , p->zDb, p->zName, zName
! 6725: , p->zDb, p->zName, zName
! 6726: , p->zDb, p->zName, zName
! 6727: );
! 6728: if( zSql ){
! 6729: rc = sqlite3_exec(p->db, zSql, 0, 0, 0);
! 6730: sqlite3_free(zSql);
! 6731: }
! 6732: return rc;
! 6733: }
! 6734:
/* Method table for the fts2 virtual table module.  This is the
** structure passed to sqlite3_create_module_v2() under the name
** "fts2".  Each entry is labelled with the sqlite3_module slot it
** fills.
*/
static const sqlite3_module fts2Module = {
  /* iVersion */ 0,
  /* xCreate */ fulltextCreate,
  /* xConnect */ fulltextConnect,
  /* xBestIndex */ fulltextBestIndex,
  /* xDisconnect */ fulltextDisconnect,
  /* xDestroy */ fulltextDestroy,
  /* xOpen */ fulltextOpen,
  /* xClose */ fulltextClose,
  /* xFilter */ fulltextFilter,
  /* xNext */ fulltextNext,
  /* xEof */ fulltextEof,
  /* xColumn */ fulltextColumn,
  /* xRowid */ fulltextRowid,
  /* xUpdate */ fulltextUpdate,
  /* xBegin */ fulltextBegin,
  /* xSync */ fulltextSync,
  /* xCommit */ fulltextCommit,
  /* xRollback */ fulltextRollback,
  /* xFindFunction */ fulltextFindFunction,
  /* xRename */ fulltextRename,
};
! 6757:
! 6758: static void hashDestroy(void *p){
! 6759: fts2Hash *pHash = (fts2Hash *)p;
! 6760: sqlite3Fts2HashClear(pHash);
! 6761: sqlite3_free(pHash);
! 6762: }
! 6763:
/*
** The fts2 built-in tokenizers - "simple" and "porter" - are implemented
** in files fts2_tokenizer1.c and fts2_porter.c respectively. The following
** forward declarations are for the functions used to retrieve the
** respective implementations, together with the equivalent function for
** the optional "icu" tokenizer.
**
** Calling sqlite3Fts2SimpleTokenizerModule() sets the value pointed
** to by the argument to point at the "simple" tokenizer implementation.
** Function ...PorterTokenizerModule() sets *ppModule to point to the
** porter tokenizer/stemmer implementation.
*/
! 6775: void sqlite3Fts2SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
! 6776: void sqlite3Fts2PorterTokenizerModule(sqlite3_tokenizer_module const**ppModule);
! 6777: void sqlite3Fts2IcuTokenizerModule(sqlite3_tokenizer_module const**ppModule);
! 6778:
! 6779: int sqlite3Fts2InitHashTable(sqlite3 *, fts2Hash *, const char *);
! 6780:
! 6781: /*
! 6782: ** Initialise the fts2 extension. If this extension is built as part
! 6783: ** of the sqlite library, then this function is called directly by
! 6784: ** SQLite. If fts2 is built as a dynamically loadable extension, this
! 6785: ** function is called by the sqlite3_extension_init() entry point.
! 6786: */
! 6787: int sqlite3Fts2Init(sqlite3 *db){
! 6788: int rc = SQLITE_OK;
! 6789: fts2Hash *pHash = 0;
! 6790: const sqlite3_tokenizer_module *pSimple = 0;
! 6791: const sqlite3_tokenizer_module *pPorter = 0;
! 6792: const sqlite3_tokenizer_module *pIcu = 0;
! 6793:
! 6794: sqlite3Fts2SimpleTokenizerModule(&pSimple);
! 6795: sqlite3Fts2PorterTokenizerModule(&pPorter);
! 6796: #ifdef SQLITE_ENABLE_ICU
! 6797: sqlite3Fts2IcuTokenizerModule(&pIcu);
! 6798: #endif
! 6799:
! 6800: /* Allocate and initialise the hash-table used to store tokenizers. */
! 6801: pHash = sqlite3_malloc(sizeof(fts2Hash));
! 6802: if( !pHash ){
! 6803: rc = SQLITE_NOMEM;
! 6804: }else{
! 6805: sqlite3Fts2HashInit(pHash, FTS2_HASH_STRING, 1);
! 6806: }
! 6807:
! 6808: /* Load the built-in tokenizers into the hash table */
! 6809: if( rc==SQLITE_OK ){
! 6810: if( sqlite3Fts2HashInsert(pHash, "simple", 7, (void *)pSimple)
! 6811: || sqlite3Fts2HashInsert(pHash, "porter", 7, (void *)pPorter)
! 6812: || (pIcu && sqlite3Fts2HashInsert(pHash, "icu", 4, (void *)pIcu))
! 6813: ){
! 6814: rc = SQLITE_NOMEM;
! 6815: }
! 6816: }
! 6817:
! 6818: /* Create the virtual table wrapper around the hash-table and overload
! 6819: ** the two scalar functions. If this is successful, register the
! 6820: ** module with sqlite.
! 6821: */
! 6822: if( SQLITE_OK==rc
! 6823: && SQLITE_OK==(rc = sqlite3Fts2InitHashTable(db, pHash, "fts2_tokenizer"))
! 6824: && SQLITE_OK==(rc = sqlite3_overload_function(db, "snippet", -1))
! 6825: && SQLITE_OK==(rc = sqlite3_overload_function(db, "offsets", -1))
! 6826: && SQLITE_OK==(rc = sqlite3_overload_function(db, "optimize", -1))
! 6827: #ifdef SQLITE_TEST
! 6828: && SQLITE_OK==(rc = sqlite3_overload_function(db, "dump_terms", -1))
! 6829: && SQLITE_OK==(rc = sqlite3_overload_function(db, "dump_doclist", -1))
! 6830: #endif
! 6831: ){
! 6832: return sqlite3_create_module_v2(
! 6833: db, "fts2", &fts2Module, (void *)pHash, hashDestroy
! 6834: );
! 6835: }
! 6836:
! 6837: /* An error has occurred. Delete the hash table and return the error code. */
! 6838: assert( rc!=SQLITE_OK );
! 6839: if( pHash ){
! 6840: sqlite3Fts2HashClear(pHash);
! 6841: sqlite3_free(pHash);
! 6842: }
! 6843: return rc;
! 6844: }
! 6845:
! 6846: #if !SQLITE_CORE
! 6847: int sqlite3_extension_init(
! 6848: sqlite3 *db,
! 6849: char **pzErrMsg,
! 6850: const sqlite3_api_routines *pApi
! 6851: ){
! 6852: SQLITE_EXTENSION_INIT2(pApi)
! 6853: return sqlite3Fts2Init(db);
! 6854: }
! 6855: #endif
! 6856:
! 6857: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>