Annotation of embedaddon/sqlite3/ext/fts3/fts3.c, revision 1.1.1.1
1.1 misho 1: /*
2: ** 2006 Oct 10
3: **
4: ** The author disclaims copyright to this source code. In place of
5: ** a legal notice, here is a blessing:
6: **
7: ** May you do good and not evil.
8: ** May you find forgiveness for yourself and forgive others.
9: ** May you share freely, never taking more than you give.
10: **
11: ******************************************************************************
12: **
13: ** This is an SQLite module implementing full-text search.
14: */
15:
16: /*
17: ** The code in this file is only compiled if:
18: **
19: ** * The FTS3 module is being built as an extension
20: ** (in which case SQLITE_CORE is not defined), or
21: **
22: ** * The FTS3 module is being built into the core of
23: ** SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
24: */
25:
26: /* The full-text index is stored in a series of b+tree (-like)
27: ** structures called segments which map terms to doclists. The
28: ** structures are like b+trees in layout, but are constructed from the
29: ** bottom up in optimal fashion and are not updatable. Since trees
30: ** are built from the bottom up, things will be described from the
31: ** bottom up.
32: **
33: **
34: **** Varints ****
35: ** The basic unit of encoding is a variable-length integer called a
36: ** varint. We encode variable-length integers in little-endian order
37: ** using seven bits * per byte as follows:
38: **
39: ** KEY:
40: ** A = 0xxxxxxx 7 bits of data and one flag bit
41: ** B = 1xxxxxxx 7 bits of data and one flag bit
42: **
43: ** 7 bits - A
44: ** 14 bits - BA
45: ** 21 bits - BBA
46: ** and so on.
47: **
48: ** This is similar in concept to how sqlite encodes "varints" but
49: ** the encoding is not the same. SQLite varints are big-endian
50: ** are are limited to 9 bytes in length whereas FTS3 varints are
51: ** little-endian and can be up to 10 bytes in length (in theory).
52: **
53: ** Example encodings:
54: **
55: ** 1: 0x01
56: ** 127: 0x7f
57: ** 128: 0x81 0x00
58: **
59: **
60: **** Document lists ****
61: ** A doclist (document list) holds a docid-sorted list of hits for a
62: ** given term. Doclists hold docids and associated token positions.
63: ** A docid is the unique integer identifier for a single document.
64: ** A position is the index of a word within the document. The first
65: ** word of the document has a position of 0.
66: **
67: ** FTS3 used to optionally store character offsets using a compile-time
68: ** option. But that functionality is no longer supported.
69: **
70: ** A doclist is stored like this:
71: **
72: ** array {
73: ** varint docid;
74: ** array { (position list for column 0)
75: ** varint position; (2 more than the delta from previous position)
76: ** }
77: ** array {
78: ** varint POS_COLUMN; (marks start of position list for new column)
79: ** varint column; (index of new column)
80: ** array {
81: ** varint position; (2 more than the delta from previous position)
82: ** }
83: ** }
84: ** varint POS_END; (marks end of positions for this document.
85: ** }
86: **
87: ** Here, array { X } means zero or more occurrences of X, adjacent in
88: ** memory. A "position" is an index of a token in the token stream
89: ** generated by the tokenizer. Note that POS_END and POS_COLUMN occur
90: ** in the same logical place as the position element, and act as sentinals
91: ** ending a position list array. POS_END is 0. POS_COLUMN is 1.
92: ** The positions numbers are not stored literally but rather as two more
93: ** than the difference from the prior position, or the just the position plus
94: ** 2 for the first position. Example:
95: **
96: ** label: A B C D E F G H I J K
97: ** value: 123 5 9 1 1 14 35 0 234 72 0
98: **
99: ** The 123 value is the first docid. For column zero in this document
100: ** there are two matches at positions 3 and 10 (5-2 and 9-2+3). The 1
101: ** at D signals the start of a new column; the 1 at E indicates that the
102: ** new column is column number 1. There are two positions at 12 and 45
103: ** (14-2 and 35-2+12). The 0 at H indicate the end-of-document. The
104: ** 234 at I is the next docid. It has one position 72 (72-2) and then
105: ** terminates with the 0 at K.
106: **
107: ** A "position-list" is the list of positions for multiple columns for
108: ** a single docid. A "column-list" is the set of positions for a single
109: ** column. Hence, a position-list consists of one or more column-lists,
110: ** a document record consists of a docid followed by a position-list and
111: ** a doclist consists of one or more document records.
112: **
113: ** A bare doclist omits the position information, becoming an
114: ** array of varint-encoded docids.
115: **
116: **** Segment leaf nodes ****
117: ** Segment leaf nodes store terms and doclists, ordered by term. Leaf
118: ** nodes are written using LeafWriter, and read using LeafReader (to
119: ** iterate through a single leaf node's data) and LeavesReader (to
120: ** iterate through a segment's entire leaf layer). Leaf nodes have
121: ** the format:
122: **
123: ** varint iHeight; (height from leaf level, always 0)
124: ** varint nTerm; (length of first term)
125: ** char pTerm[nTerm]; (content of first term)
126: ** varint nDoclist; (length of term's associated doclist)
127: ** char pDoclist[nDoclist]; (content of doclist)
128: ** array {
129: ** (further terms are delta-encoded)
130: ** varint nPrefix; (length of prefix shared with previous term)
131: ** varint nSuffix; (length of unshared suffix)
132: ** char pTermSuffix[nSuffix];(unshared suffix of next term)
133: ** varint nDoclist; (length of term's associated doclist)
134: ** char pDoclist[nDoclist]; (content of doclist)
135: ** }
136: **
137: ** Here, array { X } means zero or more occurrences of X, adjacent in
138: ** memory.
139: **
140: ** Leaf nodes are broken into blocks which are stored contiguously in
141: ** the %_segments table in sorted order. This means that when the end
142: ** of a node is reached, the next term is in the node with the next
143: ** greater node id.
144: **
145: ** New data is spilled to a new leaf node when the current node
146: ** exceeds LEAF_MAX bytes (default 2048). New data which itself is
147: ** larger than STANDALONE_MIN (default 1024) is placed in a standalone
148: ** node (a leaf node with a single term and doclist). The goal of
149: ** these settings is to pack together groups of small doclists while
150: ** making it efficient to directly access large doclists. The
151: ** assumption is that large doclists represent terms which are more
152: ** likely to be query targets.
153: **
154: ** TODO(shess) It may be useful for blocking decisions to be more
155: ** dynamic. For instance, it may make more sense to have a 2.5k leaf
156: ** node rather than splitting into 2k and .5k nodes. My intuition is
157: ** that this might extend through 2x or 4x the pagesize.
158: **
159: **
160: **** Segment interior nodes ****
161: ** Segment interior nodes store blockids for subtree nodes and terms
162: ** to describe what data is stored by the each subtree. Interior
163: ** nodes are written using InteriorWriter, and read using
164: ** InteriorReader. InteriorWriters are created as needed when
165: ** SegmentWriter creates new leaf nodes, or when an interior node
166: ** itself grows too big and must be split. The format of interior
167: ** nodes:
168: **
169: ** varint iHeight; (height from leaf level, always >0)
170: ** varint iBlockid; (block id of node's leftmost subtree)
171: ** optional {
172: ** varint nTerm; (length of first term)
173: ** char pTerm[nTerm]; (content of first term)
174: ** array {
175: ** (further terms are delta-encoded)
176: ** varint nPrefix; (length of shared prefix with previous term)
177: ** varint nSuffix; (length of unshared suffix)
178: ** char pTermSuffix[nSuffix]; (unshared suffix of next term)
179: ** }
180: ** }
181: **
182: ** Here, optional { X } means an optional element, while array { X }
183: ** means zero or more occurrences of X, adjacent in memory.
184: **
185: ** An interior node encodes n terms separating n+1 subtrees. The
186: ** subtree blocks are contiguous, so only the first subtree's blockid
187: ** is encoded. The subtree at iBlockid will contain all terms less
188: ** than the first term encoded (or all terms if no term is encoded).
189: ** Otherwise, for terms greater than or equal to pTerm[i] but less
190: ** than pTerm[i+1], the subtree for that term will be rooted at
191: ** iBlockid+i. Interior nodes only store enough term data to
192: ** distinguish adjacent children (if the rightmost term of the left
193: ** child is "something", and the leftmost term of the right child is
194: ** "wicked", only "w" is stored).
195: **
196: ** New data is spilled to a new interior node at the same height when
197: ** the current node exceeds INTERIOR_MAX bytes (default 2048).
198: ** INTERIOR_MIN_TERMS (default 7) keeps large terms from monopolizing
199: ** interior nodes and making the tree too skinny. The interior nodes
200: ** at a given height are naturally tracked by interior nodes at
201: ** height+1, and so on.
202: **
203: **
204: **** Segment directory ****
205: ** The segment directory in table %_segdir stores meta-information for
206: ** merging and deleting segments, and also the root node of the
207: ** segment's tree.
208: **
209: ** The root node is the top node of the segment's tree after encoding
210: ** the entire segment, restricted to ROOT_MAX bytes (default 1024).
211: ** This could be either a leaf node or an interior node. If the top
212: ** node requires more than ROOT_MAX bytes, it is flushed to %_segments
213: ** and a new root interior node is generated (which should always fit
214: ** within ROOT_MAX because it only needs space for 2 varints, the
215: ** height and the blockid of the previous root).
216: **
217: ** The meta-information in the segment directory is:
218: ** level - segment level (see below)
219: ** idx - index within level
220: ** - (level,idx uniquely identify a segment)
221: ** start_block - first leaf node
222: ** leaves_end_block - last leaf node
223: ** end_block - last block (including interior nodes)
224: ** root - contents of root node
225: **
226: ** If the root node is a leaf node, then start_block,
227: ** leaves_end_block, and end_block are all 0.
228: **
229: **
230: **** Segment merging ****
231: ** To amortize update costs, segments are grouped into levels and
232: ** merged in batches. Each increase in level represents exponentially
233: ** more documents.
234: **
235: ** New documents (actually, document updates) are tokenized and
236: ** written individually (using LeafWriter) to a level 0 segment, with
237: ** incrementing idx. When idx reaches MERGE_COUNT (default 16), all
238: ** level 0 segments are merged into a single level 1 segment. Level 1
239: ** is populated like level 0, and eventually MERGE_COUNT level 1
240: ** segments are merged to a single level 2 segment (representing
241: ** MERGE_COUNT^2 updates), and so on.
242: **
243: ** A segment merge traverses all segments at a given level in
244: ** parallel, performing a straightforward sorted merge. Since segment
245: ** leaf nodes are written in to the %_segments table in order, this
246: ** merge traverses the underlying sqlite disk structures efficiently.
247: ** After the merge, all segment blocks from the merged level are
248: ** deleted.
249: **
250: ** MERGE_COUNT controls how often we merge segments. 16 seems to be
251: ** somewhat of a sweet spot for insertion performance. 32 and 64 show
252: ** very similar performance numbers to 16 on insertion, though they're
253: ** a tiny bit slower (perhaps due to more overhead in merge-time
254: ** sorting). 8 is about 20% slower than 16, 4 about 50% slower than
255: ** 16, 2 about 66% slower than 16.
256: **
257: ** At query time, high MERGE_COUNT increases the number of segments
258: ** which need to be scanned and merged. For instance, with 100k docs
259: ** inserted:
260: **
261: ** MERGE_COUNT segments
262: ** 16 25
263: ** 8 12
264: ** 4 10
265: ** 2 6
266: **
267: ** This appears to have only a moderate impact on queries for very
268: ** frequent terms (which are somewhat dominated by segment merge
269: ** costs), and infrequent and non-existent terms still seem to be fast
270: ** even with many segments.
271: **
272: ** TODO(shess) That said, it would be nice to have a better query-side
273: ** argument for MERGE_COUNT of 16. Also, it is possible/likely that
274: ** optimizations to things like doclist merging will swing the sweet
275: ** spot around.
276: **
277: **
278: **
279: **** Handling of deletions and updates ****
280: ** Since we're using a segmented structure, with no docid-oriented
281: ** index into the term index, we clearly cannot simply update the term
282: ** index when a document is deleted or updated. For deletions, we
283: ** write an empty doclist (varint(docid) varint(POS_END)), for updates
284: ** we simply write the new doclist. Segment merges overwrite older
285: ** data for a particular docid with newer data, so deletes or updates
286: ** will eventually overtake the earlier data and knock it out. The
287: ** query logic likewise merges doclists so that newer data knocks out
288: ** older data.
289: **
290: ** TODO(shess) Provide a VACUUM type operation to clear out all
291: ** deletions and duplications. This would basically be a forced merge
292: ** into a single segment.
293: */
294:
295: #include "fts3Int.h"
296: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
297:
298: #if defined(SQLITE_ENABLE_FTS3) && !defined(SQLITE_CORE)
299: # define SQLITE_CORE 1
300: #endif
301:
302: #include <assert.h>
303: #include <stdlib.h>
304: #include <stddef.h>
305: #include <stdio.h>
306: #include <string.h>
307: #include <stdarg.h>
308:
309: #include "fts3.h"
310: #ifndef SQLITE_CORE
311: # include "sqlite3ext.h"
312: SQLITE_EXTENSION_INIT1
313: #endif
314:
315: static int fts3EvalNext(Fts3Cursor *pCsr);
316: static int fts3EvalStart(Fts3Cursor *pCsr);
317: static int fts3TermSegReaderCursor(
318: Fts3Cursor *, const char *, int, int, Fts3MultiSegReader **);
319:
320: /*
321: ** Write a 64-bit variable-length integer to memory starting at p[0].
322: ** The length of data written will be between 1 and FTS3_VARINT_MAX bytes.
323: ** The number of bytes written is returned.
324: */
325: int sqlite3Fts3PutVarint(char *p, sqlite_int64 v){
326: unsigned char *q = (unsigned char *) p;
327: sqlite_uint64 vu = v;
328: do{
329: *q++ = (unsigned char) ((vu & 0x7f) | 0x80);
330: vu >>= 7;
331: }while( vu!=0 );
332: q[-1] &= 0x7f; /* turn off high bit in final byte */
333: assert( q - (unsigned char *)p <= FTS3_VARINT_MAX );
334: return (int) (q - (unsigned char *)p);
335: }
336:
337: /*
338: ** Read a 64-bit variable-length integer from memory starting at p[0].
339: ** Return the number of bytes read, or 0 on error.
340: ** The value is stored in *v.
341: */
342: int sqlite3Fts3GetVarint(const char *p, sqlite_int64 *v){
343: const unsigned char *q = (const unsigned char *) p;
344: sqlite_uint64 x = 0, y = 1;
345: while( (*q&0x80)==0x80 && q-(unsigned char *)p<FTS3_VARINT_MAX ){
346: x += y * (*q++ & 0x7f);
347: y <<= 7;
348: }
349: x += y * (*q++);
350: *v = (sqlite_int64) x;
351: return (int) (q - (unsigned char *)p);
352: }
353:
354: /*
355: ** Similar to sqlite3Fts3GetVarint(), except that the output is truncated to a
356: ** 32-bit integer before it is returned.
357: */
358: int sqlite3Fts3GetVarint32(const char *p, int *pi){
359: sqlite_int64 i;
360: int ret = sqlite3Fts3GetVarint(p, &i);
361: *pi = (int) i;
362: return ret;
363: }
364:
365: /*
366: ** Return the number of bytes required to encode v as a varint
367: */
368: int sqlite3Fts3VarintLen(sqlite3_uint64 v){
369: int i = 0;
370: do{
371: i++;
372: v >>= 7;
373: }while( v!=0 );
374: return i;
375: }
376:
377: /*
378: ** Convert an SQL-style quoted string into a normal string by removing
379: ** the quote characters. The conversion is done in-place. If the
380: ** input does not begin with a quote character, then this routine
381: ** is a no-op.
382: **
383: ** Examples:
384: **
385: ** "abc" becomes abc
386: ** 'xyz' becomes xyz
387: ** [pqr] becomes pqr
388: ** `mno` becomes mno
389: **
390: */
391: void sqlite3Fts3Dequote(char *z){
392: char quote; /* Quote character (if any ) */
393:
394: quote = z[0];
395: if( quote=='[' || quote=='\'' || quote=='"' || quote=='`' ){
396: int iIn = 1; /* Index of next byte to read from input */
397: int iOut = 0; /* Index of next byte to write to output */
398:
399: /* If the first byte was a '[', then the close-quote character is a ']' */
400: if( quote=='[' ) quote = ']';
401:
402: while( ALWAYS(z[iIn]) ){
403: if( z[iIn]==quote ){
404: if( z[iIn+1]!=quote ) break;
405: z[iOut++] = quote;
406: iIn += 2;
407: }else{
408: z[iOut++] = z[iIn++];
409: }
410: }
411: z[iOut] = '\0';
412: }
413: }
414:
415: /*
416: ** Read a single varint from the doclist at *pp and advance *pp to point
417: ** to the first byte past the end of the varint. Add the value of the varint
418: ** to *pVal.
419: */
420: static void fts3GetDeltaVarint(char **pp, sqlite3_int64 *pVal){
421: sqlite3_int64 iVal;
422: *pp += sqlite3Fts3GetVarint(*pp, &iVal);
423: *pVal += iVal;
424: }
425:
426: /*
427: ** When this function is called, *pp points to the first byte following a
428: ** varint that is part of a doclist (or position-list, or any other list
429: ** of varints). This function moves *pp to point to the start of that varint,
430: ** and sets *pVal by the varint value.
431: **
432: ** Argument pStart points to the first byte of the doclist that the
433: ** varint is part of.
434: */
435: static void fts3GetReverseVarint(
436: char **pp,
437: char *pStart,
438: sqlite3_int64 *pVal
439: ){
440: sqlite3_int64 iVal;
441: char *p;
442:
443: /* Pointer p now points at the first byte past the varint we are
444: ** interested in. So, unless the doclist is corrupt, the 0x80 bit is
445: ** clear on character p[-1]. */
446: for(p = (*pp)-2; p>=pStart && *p&0x80; p--);
447: p++;
448: *pp = p;
449:
450: sqlite3Fts3GetVarint(p, &iVal);
451: *pVal = iVal;
452: }
453:
454: /*
455: ** The xDisconnect() virtual table method.
456: */
457: static int fts3DisconnectMethod(sqlite3_vtab *pVtab){
458: Fts3Table *p = (Fts3Table *)pVtab;
459: int i;
460:
461: assert( p->nPendingData==0 );
462: assert( p->pSegments==0 );
463:
464: /* Free any prepared statements held */
465: for(i=0; i<SizeofArray(p->aStmt); i++){
466: sqlite3_finalize(p->aStmt[i]);
467: }
468: sqlite3_free(p->zSegmentsTbl);
469: sqlite3_free(p->zReadExprlist);
470: sqlite3_free(p->zWriteExprlist);
471: sqlite3_free(p->zContentTbl);
472:
473: /* Invoke the tokenizer destructor to free the tokenizer. */
474: p->pTokenizer->pModule->xDestroy(p->pTokenizer);
475:
476: sqlite3_free(p);
477: return SQLITE_OK;
478: }
479:
480: /*
481: ** Construct one or more SQL statements from the format string given
482: ** and then evaluate those statements. The success code is written
483: ** into *pRc.
484: **
485: ** If *pRc is initially non-zero then this routine is a no-op.
486: */
487: static void fts3DbExec(
488: int *pRc, /* Success code */
489: sqlite3 *db, /* Database in which to run SQL */
490: const char *zFormat, /* Format string for SQL */
491: ... /* Arguments to the format string */
492: ){
493: va_list ap;
494: char *zSql;
495: if( *pRc ) return;
496: va_start(ap, zFormat);
497: zSql = sqlite3_vmprintf(zFormat, ap);
498: va_end(ap);
499: if( zSql==0 ){
500: *pRc = SQLITE_NOMEM;
501: }else{
502: *pRc = sqlite3_exec(db, zSql, 0, 0, 0);
503: sqlite3_free(zSql);
504: }
505: }
506:
507: /*
508: ** The xDestroy() virtual table method.
509: */
510: static int fts3DestroyMethod(sqlite3_vtab *pVtab){
511: Fts3Table *p = (Fts3Table *)pVtab;
512: int rc = SQLITE_OK; /* Return code */
513: const char *zDb = p->zDb; /* Name of database (e.g. "main", "temp") */
514: sqlite3 *db = p->db; /* Database handle */
515:
516: /* Drop the shadow tables */
517: if( p->zContentTbl==0 ){
518: fts3DbExec(&rc, db, "DROP TABLE IF EXISTS %Q.'%q_content'", zDb, p->zName);
519: }
520: fts3DbExec(&rc, db, "DROP TABLE IF EXISTS %Q.'%q_segments'", zDb,p->zName);
521: fts3DbExec(&rc, db, "DROP TABLE IF EXISTS %Q.'%q_segdir'", zDb, p->zName);
522: fts3DbExec(&rc, db, "DROP TABLE IF EXISTS %Q.'%q_docsize'", zDb, p->zName);
523: fts3DbExec(&rc, db, "DROP TABLE IF EXISTS %Q.'%q_stat'", zDb, p->zName);
524:
525: /* If everything has worked, invoke fts3DisconnectMethod() to free the
526: ** memory associated with the Fts3Table structure and return SQLITE_OK.
527: ** Otherwise, return an SQLite error code.
528: */
529: return (rc==SQLITE_OK ? fts3DisconnectMethod(pVtab) : rc);
530: }
531:
532:
533: /*
534: ** Invoke sqlite3_declare_vtab() to declare the schema for the FTS3 table
535: ** passed as the first argument. This is done as part of the xConnect()
536: ** and xCreate() methods.
537: **
538: ** If *pRc is non-zero when this function is called, it is a no-op.
539: ** Otherwise, if an error occurs, an SQLite error code is stored in *pRc
540: ** before returning.
541: */
542: static void fts3DeclareVtab(int *pRc, Fts3Table *p){
543: if( *pRc==SQLITE_OK ){
544: int i; /* Iterator variable */
545: int rc; /* Return code */
546: char *zSql; /* SQL statement passed to declare_vtab() */
547: char *zCols; /* List of user defined columns */
548:
549: sqlite3_vtab_config(p->db, SQLITE_VTAB_CONSTRAINT_SUPPORT, 1);
550:
551: /* Create a list of user columns for the virtual table */
552: zCols = sqlite3_mprintf("%Q, ", p->azColumn[0]);
553: for(i=1; zCols && i<p->nColumn; i++){
554: zCols = sqlite3_mprintf("%z%Q, ", zCols, p->azColumn[i]);
555: }
556:
557: /* Create the whole "CREATE TABLE" statement to pass to SQLite */
558: zSql = sqlite3_mprintf(
559: "CREATE TABLE x(%s %Q HIDDEN, docid HIDDEN)", zCols, p->zName
560: );
561: if( !zCols || !zSql ){
562: rc = SQLITE_NOMEM;
563: }else{
564: rc = sqlite3_declare_vtab(p->db, zSql);
565: }
566:
567: sqlite3_free(zSql);
568: sqlite3_free(zCols);
569: *pRc = rc;
570: }
571: }
572:
573: /*
574: ** Create the backing store tables (%_content, %_segments and %_segdir)
575: ** required by the FTS3 table passed as the only argument. This is done
576: ** as part of the vtab xCreate() method.
577: **
578: ** If the p->bHasDocsize boolean is true (indicating that this is an
579: ** FTS4 table, not an FTS3 table) then also create the %_docsize and
580: ** %_stat tables required by FTS4.
581: */
582: static int fts3CreateTables(Fts3Table *p){
583: int rc = SQLITE_OK; /* Return code */
584: int i; /* Iterator variable */
585: sqlite3 *db = p->db; /* The database connection */
586:
587: if( p->zContentTbl==0 ){
588: char *zContentCols; /* Columns of %_content table */
589:
590: /* Create a list of user columns for the content table */
591: zContentCols = sqlite3_mprintf("docid INTEGER PRIMARY KEY");
592: for(i=0; zContentCols && i<p->nColumn; i++){
593: char *z = p->azColumn[i];
594: zContentCols = sqlite3_mprintf("%z, 'c%d%q'", zContentCols, i, z);
595: }
596: if( zContentCols==0 ) rc = SQLITE_NOMEM;
597:
598: /* Create the content table */
599: fts3DbExec(&rc, db,
600: "CREATE TABLE %Q.'%q_content'(%s)",
601: p->zDb, p->zName, zContentCols
602: );
603: sqlite3_free(zContentCols);
604: }
605:
606: /* Create other tables */
607: fts3DbExec(&rc, db,
608: "CREATE TABLE %Q.'%q_segments'(blockid INTEGER PRIMARY KEY, block BLOB);",
609: p->zDb, p->zName
610: );
611: fts3DbExec(&rc, db,
612: "CREATE TABLE %Q.'%q_segdir'("
613: "level INTEGER,"
614: "idx INTEGER,"
615: "start_block INTEGER,"
616: "leaves_end_block INTEGER,"
617: "end_block INTEGER,"
618: "root BLOB,"
619: "PRIMARY KEY(level, idx)"
620: ");",
621: p->zDb, p->zName
622: );
623: if( p->bHasDocsize ){
624: fts3DbExec(&rc, db,
625: "CREATE TABLE %Q.'%q_docsize'(docid INTEGER PRIMARY KEY, size BLOB);",
626: p->zDb, p->zName
627: );
628: }
629: if( p->bHasStat ){
630: fts3DbExec(&rc, db,
631: "CREATE TABLE %Q.'%q_stat'(id INTEGER PRIMARY KEY, value BLOB);",
632: p->zDb, p->zName
633: );
634: }
635: return rc;
636: }
637:
638: /*
639: ** Store the current database page-size in bytes in p->nPgsz.
640: **
641: ** If *pRc is non-zero when this function is called, it is a no-op.
642: ** Otherwise, if an error occurs, an SQLite error code is stored in *pRc
643: ** before returning.
644: */
645: static void fts3DatabasePageSize(int *pRc, Fts3Table *p){
646: if( *pRc==SQLITE_OK ){
647: int rc; /* Return code */
648: char *zSql; /* SQL text "PRAGMA %Q.page_size" */
649: sqlite3_stmt *pStmt; /* Compiled "PRAGMA %Q.page_size" statement */
650:
651: zSql = sqlite3_mprintf("PRAGMA %Q.page_size", p->zDb);
652: if( !zSql ){
653: rc = SQLITE_NOMEM;
654: }else{
655: rc = sqlite3_prepare(p->db, zSql, -1, &pStmt, 0);
656: if( rc==SQLITE_OK ){
657: sqlite3_step(pStmt);
658: p->nPgsz = sqlite3_column_int(pStmt, 0);
659: rc = sqlite3_finalize(pStmt);
660: }else if( rc==SQLITE_AUTH ){
661: p->nPgsz = 1024;
662: rc = SQLITE_OK;
663: }
664: }
665: assert( p->nPgsz>0 || rc!=SQLITE_OK );
666: sqlite3_free(zSql);
667: *pRc = rc;
668: }
669: }
670:
671: /*
672: ** "Special" FTS4 arguments are column specifications of the following form:
673: **
674: ** <key> = <value>
675: **
676: ** There may not be whitespace surrounding the "=" character. The <value>
677: ** term may be quoted, but the <key> may not.
678: */
679: static int fts3IsSpecialColumn(
680: const char *z,
681: int *pnKey,
682: char **pzValue
683: ){
684: char *zValue;
685: const char *zCsr = z;
686:
687: while( *zCsr!='=' ){
688: if( *zCsr=='\0' ) return 0;
689: zCsr++;
690: }
691:
692: *pnKey = (int)(zCsr-z);
693: zValue = sqlite3_mprintf("%s", &zCsr[1]);
694: if( zValue ){
695: sqlite3Fts3Dequote(zValue);
696: }
697: *pzValue = zValue;
698: return 1;
699: }
700:
701: /*
702: ** Append the output of a printf() style formatting to an existing string.
703: */
704: static void fts3Appendf(
705: int *pRc, /* IN/OUT: Error code */
706: char **pz, /* IN/OUT: Pointer to string buffer */
707: const char *zFormat, /* Printf format string to append */
708: ... /* Arguments for printf format string */
709: ){
710: if( *pRc==SQLITE_OK ){
711: va_list ap;
712: char *z;
713: va_start(ap, zFormat);
714: z = sqlite3_vmprintf(zFormat, ap);
715: va_end(ap);
716: if( z && *pz ){
717: char *z2 = sqlite3_mprintf("%s%s", *pz, z);
718: sqlite3_free(z);
719: z = z2;
720: }
721: if( z==0 ) *pRc = SQLITE_NOMEM;
722: sqlite3_free(*pz);
723: *pz = z;
724: }
725: }
726:
727: /*
728: ** Return a copy of input string zInput enclosed in double-quotes (") and
729: ** with all double quote characters escaped. For example:
730: **
731: ** fts3QuoteId("un \"zip\"") -> "un \"\"zip\"\""
732: **
733: ** The pointer returned points to memory obtained from sqlite3_malloc(). It
734: ** is the callers responsibility to call sqlite3_free() to release this
735: ** memory.
736: */
737: static char *fts3QuoteId(char const *zInput){
738: int nRet;
739: char *zRet;
740: nRet = 2 + strlen(zInput)*2 + 1;
741: zRet = sqlite3_malloc(nRet);
742: if( zRet ){
743: int i;
744: char *z = zRet;
745: *(z++) = '"';
746: for(i=0; zInput[i]; i++){
747: if( zInput[i]=='"' ) *(z++) = '"';
748: *(z++) = zInput[i];
749: }
750: *(z++) = '"';
751: *(z++) = '\0';
752: }
753: return zRet;
754: }
755:
756: /*
757: ** Return a list of comma separated SQL expressions and a FROM clause that
758: ** could be used in a SELECT statement such as the following:
759: **
760: ** SELECT <list of expressions> FROM %_content AS x ...
761: **
762: ** to return the docid, followed by each column of text data in order
763: ** from left to write. If parameter zFunc is not NULL, then instead of
764: ** being returned directly each column of text data is passed to an SQL
765: ** function named zFunc first. For example, if zFunc is "unzip" and the
766: ** table has the three user-defined columns "a", "b", and "c", the following
767: ** string is returned:
768: **
769: ** "docid, unzip(x.'a'), unzip(x.'b'), unzip(x.'c') FROM %_content AS x"
770: **
771: ** The pointer returned points to a buffer allocated by sqlite3_malloc(). It
772: ** is the responsibility of the caller to eventually free it.
773: **
774: ** If *pRc is not SQLITE_OK when this function is called, it is a no-op (and
775: ** a NULL pointer is returned). Otherwise, if an OOM error is encountered
776: ** by this function, NULL is returned and *pRc is set to SQLITE_NOMEM. If
777: ** no error occurs, *pRc is left unmodified.
778: */
779: static char *fts3ReadExprList(Fts3Table *p, const char *zFunc, int *pRc){
780: char *zRet = 0;
781: char *zFree = 0;
782: char *zFunction;
783: int i;
784:
785: if( p->zContentTbl==0 ){
786: if( !zFunc ){
787: zFunction = "";
788: }else{
789: zFree = zFunction = fts3QuoteId(zFunc);
790: }
791: fts3Appendf(pRc, &zRet, "docid");
792: for(i=0; i<p->nColumn; i++){
793: fts3Appendf(pRc, &zRet, ",%s(x.'c%d%q')", zFunction, i, p->azColumn[i]);
794: }
795: sqlite3_free(zFree);
796: }else{
797: fts3Appendf(pRc, &zRet, "rowid");
798: for(i=0; i<p->nColumn; i++){
799: fts3Appendf(pRc, &zRet, ", x.'%q'", p->azColumn[i]);
800: }
801: }
802: fts3Appendf(pRc, &zRet, "FROM '%q'.'%q%s' AS x",
803: p->zDb,
804: (p->zContentTbl ? p->zContentTbl : p->zName),
805: (p->zContentTbl ? "" : "_content")
806: );
807: return zRet;
808: }
809:
810: /*
811: ** Return a list of N comma separated question marks, where N is the number
812: ** of columns in the %_content table (one for the docid plus one for each
813: ** user-defined text column).
814: **
815: ** If argument zFunc is not NULL, then all but the first question mark
816: ** is preceded by zFunc and an open bracket, and followed by a closed
817: ** bracket. For example, if zFunc is "zip" and the FTS3 table has three
818: ** user-defined text columns, the following string is returned:
819: **
820: ** "?, zip(?), zip(?), zip(?)"
821: **
822: ** The pointer returned points to a buffer allocated by sqlite3_malloc(). It
823: ** is the responsibility of the caller to eventually free it.
824: **
825: ** If *pRc is not SQLITE_OK when this function is called, it is a no-op (and
826: ** a NULL pointer is returned). Otherwise, if an OOM error is encountered
827: ** by this function, NULL is returned and *pRc is set to SQLITE_NOMEM. If
828: ** no error occurs, *pRc is left unmodified.
829: */
830: static char *fts3WriteExprList(Fts3Table *p, const char *zFunc, int *pRc){
831: char *zRet = 0;
832: char *zFree = 0;
833: char *zFunction;
834: int i;
835:
836: if( !zFunc ){
837: zFunction = "";
838: }else{
839: zFree = zFunction = fts3QuoteId(zFunc);
840: }
841: fts3Appendf(pRc, &zRet, "?");
842: for(i=0; i<p->nColumn; i++){
843: fts3Appendf(pRc, &zRet, ",%s(?)", zFunction);
844: }
845: sqlite3_free(zFree);
846: return zRet;
847: }
848:
849: /*
850: ** This function interprets the string at (*pp) as a non-negative integer
851: ** value. It reads the integer and sets *pnOut to the value read, then
852: ** sets *pp to point to the byte immediately following the last byte of
853: ** the integer value.
854: **
855: ** Only decimal digits ('0'..'9') may be part of an integer value.
856: **
857: ** If *pp does not being with a decimal digit SQLITE_ERROR is returned and
858: ** the output value undefined. Otherwise SQLITE_OK is returned.
859: **
860: ** This function is used when parsing the "prefix=" FTS4 parameter.
861: */
862: static int fts3GobbleInt(const char **pp, int *pnOut){
863: const char *p; /* Iterator pointer */
864: int nInt = 0; /* Output value */
865:
866: for(p=*pp; p[0]>='0' && p[0]<='9'; p++){
867: nInt = nInt * 10 + (p[0] - '0');
868: }
869: if( p==*pp ) return SQLITE_ERROR;
870: *pnOut = nInt;
871: *pp = p;
872: return SQLITE_OK;
873: }
874:
875: /*
876: ** This function is called to allocate an array of Fts3Index structures
877: ** representing the indexes maintained by the current FTS table. FTS tables
878: ** always maintain the main "terms" index, but may also maintain one or
879: ** more "prefix" indexes, depending on the value of the "prefix=" parameter
880: ** (if any) specified as part of the CREATE VIRTUAL TABLE statement.
881: **
882: ** Argument zParam is passed the value of the "prefix=" option if one was
883: ** specified, or NULL otherwise.
884: **
885: ** If no error occurs, SQLITE_OK is returned and *apIndex set to point to
886: ** the allocated array. *pnIndex is set to the number of elements in the
887: ** array. If an error does occur, an SQLite error code is returned.
888: **
889: ** Regardless of whether or not an error is returned, it is the responsibility
890: ** of the caller to call sqlite3_free() on the output array to free it.
891: */
892: static int fts3PrefixParameter(
893: const char *zParam, /* ABC in prefix=ABC parameter to parse */
894: int *pnIndex, /* OUT: size of *apIndex[] array */
895: struct Fts3Index **apIndex /* OUT: Array of indexes for this table */
896: ){
897: struct Fts3Index *aIndex; /* Allocated array */
898: int nIndex = 1; /* Number of entries in array */
899:
900: if( zParam && zParam[0] ){
901: const char *p;
902: nIndex++;
903: for(p=zParam; *p; p++){
904: if( *p==',' ) nIndex++;
905: }
906: }
907:
908: aIndex = sqlite3_malloc(sizeof(struct Fts3Index) * nIndex);
909: *apIndex = aIndex;
910: *pnIndex = nIndex;
911: if( !aIndex ){
912: return SQLITE_NOMEM;
913: }
914:
915: memset(aIndex, 0, sizeof(struct Fts3Index) * nIndex);
916: if( zParam ){
917: const char *p = zParam;
918: int i;
919: for(i=1; i<nIndex; i++){
920: int nPrefix;
921: if( fts3GobbleInt(&p, &nPrefix) ) return SQLITE_ERROR;
922: aIndex[i].nPrefix = nPrefix;
923: p++;
924: }
925: }
926:
927: return SQLITE_OK;
928: }
929:
930: /*
931: ** This function is called when initializing an FTS4 table that uses the
932: ** content=xxx option. It determines the number of and names of the columns
933: ** of the new FTS4 table.
934: **
935: ** The third argument passed to this function is the value passed to the
936: ** config=xxx option (i.e. "xxx"). This function queries the database for
937: ** a table of that name. If found, the output variables are populated
938: ** as follows:
939: **
940: ** *pnCol: Set to the number of columns table xxx has,
941: **
942: ** *pnStr: Set to the total amount of space required to store a copy
943: ** of each columns name, including the nul-terminator.
944: **
945: ** *pazCol: Set to point to an array of *pnCol strings. Each string is
946: ** the name of the corresponding column in table xxx. The array
947: ** and its contents are allocated using a single allocation. It
948: ** is the responsibility of the caller to free this allocation
949: ** by eventually passing the *pazCol value to sqlite3_free().
950: **
951: ** If the table cannot be found, an error code is returned and the output
952: ** variables are undefined. Or, if an OOM is encountered, SQLITE_NOMEM is
953: ** returned (and the output variables are undefined).
954: */
955: static int fts3ContentColumns(
956: sqlite3 *db, /* Database handle */
957: const char *zDb, /* Name of db (i.e. "main", "temp" etc.) */
958: const char *zTbl, /* Name of content table */
959: const char ***pazCol, /* OUT: Malloc'd array of column names */
960: int *pnCol, /* OUT: Size of array *pazCol */
961: int *pnStr /* OUT: Bytes of string content */
962: ){
963: int rc = SQLITE_OK; /* Return code */
964: char *zSql; /* "SELECT *" statement on zTbl */
965: sqlite3_stmt *pStmt = 0; /* Compiled version of zSql */
966:
967: zSql = sqlite3_mprintf("SELECT * FROM %Q.%Q", zDb, zTbl);
968: if( !zSql ){
969: rc = SQLITE_NOMEM;
970: }else{
971: rc = sqlite3_prepare(db, zSql, -1, &pStmt, 0);
972: }
973: sqlite3_free(zSql);
974:
975: if( rc==SQLITE_OK ){
976: const char **azCol; /* Output array */
977: int nStr = 0; /* Size of all column names (incl. 0x00) */
978: int nCol; /* Number of table columns */
979: int i; /* Used to iterate through columns */
980:
981: /* Loop through the returned columns. Set nStr to the number of bytes of
982: ** space required to store a copy of each column name, including the
983: ** nul-terminator byte. */
984: nCol = sqlite3_column_count(pStmt);
985: for(i=0; i<nCol; i++){
986: const char *zCol = sqlite3_column_name(pStmt, i);
987: nStr += strlen(zCol) + 1;
988: }
989:
990: /* Allocate and populate the array to return. */
991: azCol = (const char **)sqlite3_malloc(sizeof(char *) * nCol + nStr);
992: if( azCol==0 ){
993: rc = SQLITE_NOMEM;
994: }else{
995: char *p = (char *)&azCol[nCol];
996: for(i=0; i<nCol; i++){
997: const char *zCol = sqlite3_column_name(pStmt, i);
998: int n = strlen(zCol)+1;
999: memcpy(p, zCol, n);
1000: azCol[i] = p;
1001: p += n;
1002: }
1003: }
1004: sqlite3_finalize(pStmt);
1005:
1006: /* Set the output variables. */
1007: *pnCol = nCol;
1008: *pnStr = nStr;
1009: *pazCol = azCol;
1010: }
1011:
1012: return rc;
1013: }
1014:
1015: /*
1016: ** This function is the implementation of both the xConnect and xCreate
1017: ** methods of the FTS3 virtual table.
1018: **
1019: ** The argv[] array contains the following:
1020: **
1021: ** argv[0] -> module name ("fts3" or "fts4")
1022: ** argv[1] -> database name
1023: ** argv[2] -> table name
1024: ** argv[...] -> "column name" and other module argument fields.
1025: */
1026: static int fts3InitVtab(
1027: int isCreate, /* True for xCreate, false for xConnect */
1028: sqlite3 *db, /* The SQLite database connection */
1029: void *pAux, /* Hash table containing tokenizers */
1030: int argc, /* Number of elements in argv array */
1031: const char * const *argv, /* xCreate/xConnect argument array */
1032: sqlite3_vtab **ppVTab, /* Write the resulting vtab structure here */
1033: char **pzErr /* Write any error message here */
1034: ){
1035: Fts3Hash *pHash = (Fts3Hash *)pAux;
1036: Fts3Table *p = 0; /* Pointer to allocated vtab */
1037: int rc = SQLITE_OK; /* Return code */
1038: int i; /* Iterator variable */
1039: int nByte; /* Size of allocation used for *p */
1040: int iCol; /* Column index */
1041: int nString = 0; /* Bytes required to hold all column names */
1042: int nCol = 0; /* Number of columns in the FTS table */
1043: char *zCsr; /* Space for holding column names */
1044: int nDb; /* Bytes required to hold database name */
1045: int nName; /* Bytes required to hold table name */
1046: int isFts4 = (argv[0][3]=='4'); /* True for FTS4, false for FTS3 */
1047: const char **aCol; /* Array of column names */
1048: sqlite3_tokenizer *pTokenizer = 0; /* Tokenizer for this table */
1049:
1050: int nIndex; /* Size of aIndex[] array */
1051: struct Fts3Index *aIndex = 0; /* Array of indexes for this table */
1052:
1053: /* The results of parsing supported FTS4 key=value options: */
1054: int bNoDocsize = 0; /* True to omit %_docsize table */
1055: int bDescIdx = 0; /* True to store descending indexes */
1056: char *zPrefix = 0; /* Prefix parameter value (or NULL) */
1057: char *zCompress = 0; /* compress=? parameter (or NULL) */
1058: char *zUncompress = 0; /* uncompress=? parameter (or NULL) */
1059: char *zContent = 0; /* content=? parameter (or NULL) */
1060:
1061: assert( strlen(argv[0])==4 );
1062: assert( (sqlite3_strnicmp(argv[0], "fts4", 4)==0 && isFts4)
1063: || (sqlite3_strnicmp(argv[0], "fts3", 4)==0 && !isFts4)
1064: );
1065:
1066: nDb = (int)strlen(argv[1]) + 1;
1067: nName = (int)strlen(argv[2]) + 1;
1068:
1069: aCol = (const char **)sqlite3_malloc(sizeof(const char *) * (argc-2) );
1070: if( !aCol ) return SQLITE_NOMEM;
1071: memset((void *)aCol, 0, sizeof(const char *) * (argc-2));
1072:
1073: /* Loop through all of the arguments passed by the user to the FTS3/4
1074: ** module (i.e. all the column names and special arguments). This loop
1075: ** does the following:
1076: **
1077: ** + Figures out the number of columns the FTSX table will have, and
1078: ** the number of bytes of space that must be allocated to store copies
1079: ** of the column names.
1080: **
1081: ** + If there is a tokenizer specification included in the arguments,
1082: ** initializes the tokenizer pTokenizer.
1083: */
1084: for(i=3; rc==SQLITE_OK && i<argc; i++){
1085: char const *z = argv[i];
1086: int nKey;
1087: char *zVal;
1088:
1089: /* Check if this is a tokenizer specification */
1090: if( !pTokenizer
1091: && strlen(z)>8
1092: && 0==sqlite3_strnicmp(z, "tokenize", 8)
1093: && 0==sqlite3Fts3IsIdChar(z[8])
1094: ){
1095: rc = sqlite3Fts3InitTokenizer(pHash, &z[9], &pTokenizer, pzErr);
1096: }
1097:
1098: /* Check if it is an FTS4 special argument. */
1099: else if( isFts4 && fts3IsSpecialColumn(z, &nKey, &zVal) ){
1100: struct Fts4Option {
1101: const char *zOpt;
1102: int nOpt;
1103: } aFts4Opt[] = {
1104: { "matchinfo", 9 }, /* 0 -> MATCHINFO */
1105: { "prefix", 6 }, /* 1 -> PREFIX */
1106: { "compress", 8 }, /* 2 -> COMPRESS */
1107: { "uncompress", 10 }, /* 3 -> UNCOMPRESS */
1108: { "order", 5 }, /* 4 -> ORDER */
1109: { "content", 7 } /* 5 -> CONTENT */
1110: };
1111:
1112: int iOpt;
1113: if( !zVal ){
1114: rc = SQLITE_NOMEM;
1115: }else{
1116: for(iOpt=0; iOpt<SizeofArray(aFts4Opt); iOpt++){
1117: struct Fts4Option *pOp = &aFts4Opt[iOpt];
1118: if( nKey==pOp->nOpt && !sqlite3_strnicmp(z, pOp->zOpt, pOp->nOpt) ){
1119: break;
1120: }
1121: }
1122: if( iOpt==SizeofArray(aFts4Opt) ){
1123: *pzErr = sqlite3_mprintf("unrecognized parameter: %s", z);
1124: rc = SQLITE_ERROR;
1125: }else{
1126: switch( iOpt ){
1127: case 0: /* MATCHINFO */
1128: if( strlen(zVal)!=4 || sqlite3_strnicmp(zVal, "fts3", 4) ){
1129: *pzErr = sqlite3_mprintf("unrecognized matchinfo: %s", zVal);
1130: rc = SQLITE_ERROR;
1131: }
1132: bNoDocsize = 1;
1133: break;
1134:
1135: case 1: /* PREFIX */
1136: sqlite3_free(zPrefix);
1137: zPrefix = zVal;
1138: zVal = 0;
1139: break;
1140:
1141: case 2: /* COMPRESS */
1142: sqlite3_free(zCompress);
1143: zCompress = zVal;
1144: zVal = 0;
1145: break;
1146:
1147: case 3: /* UNCOMPRESS */
1148: sqlite3_free(zUncompress);
1149: zUncompress = zVal;
1150: zVal = 0;
1151: break;
1152:
1153: case 4: /* ORDER */
1154: if( (strlen(zVal)!=3 || sqlite3_strnicmp(zVal, "asc", 3))
1155: && (strlen(zVal)!=4 || sqlite3_strnicmp(zVal, "desc", 4))
1156: ){
1157: *pzErr = sqlite3_mprintf("unrecognized order: %s", zVal);
1158: rc = SQLITE_ERROR;
1159: }
1160: bDescIdx = (zVal[0]=='d' || zVal[0]=='D');
1161: break;
1162:
1163: default: /* CONTENT */
1164: assert( iOpt==5 );
1165: sqlite3_free(zUncompress);
1166: zContent = zVal;
1167: zVal = 0;
1168: break;
1169: }
1170: }
1171: sqlite3_free(zVal);
1172: }
1173: }
1174:
1175: /* Otherwise, the argument is a column name. */
1176: else {
1177: nString += (int)(strlen(z) + 1);
1178: aCol[nCol++] = z;
1179: }
1180: }
1181:
1182: /* If a content=xxx option was specified, the following:
1183: **
1184: ** 1. Ignore any compress= and uncompress= options.
1185: **
1186: ** 2. If no column names were specified as part of the CREATE VIRTUAL
1187: ** TABLE statement, use all columns from the content table.
1188: */
1189: if( rc==SQLITE_OK && zContent ){
1190: sqlite3_free(zCompress);
1191: sqlite3_free(zUncompress);
1192: zCompress = 0;
1193: zUncompress = 0;
1194: if( nCol==0 ){
1195: sqlite3_free((void*)aCol);
1196: aCol = 0;
1197: rc = fts3ContentColumns(db, argv[1], zContent, &aCol, &nCol, &nString);
1198: }
1199: assert( rc!=SQLITE_OK || nCol>0 );
1200: }
1201: if( rc!=SQLITE_OK ) goto fts3_init_out;
1202:
1203: if( nCol==0 ){
1204: assert( nString==0 );
1205: aCol[0] = "content";
1206: nString = 8;
1207: nCol = 1;
1208: }
1209:
1210: if( pTokenizer==0 ){
1211: rc = sqlite3Fts3InitTokenizer(pHash, "simple", &pTokenizer, pzErr);
1212: if( rc!=SQLITE_OK ) goto fts3_init_out;
1213: }
1214: assert( pTokenizer );
1215:
1216: rc = fts3PrefixParameter(zPrefix, &nIndex, &aIndex);
1217: if( rc==SQLITE_ERROR ){
1218: assert( zPrefix );
1219: *pzErr = sqlite3_mprintf("error parsing prefix parameter: %s", zPrefix);
1220: }
1221: if( rc!=SQLITE_OK ) goto fts3_init_out;
1222:
1223: /* Allocate and populate the Fts3Table structure. */
1224: nByte = sizeof(Fts3Table) + /* Fts3Table */
1225: nCol * sizeof(char *) + /* azColumn */
1226: nIndex * sizeof(struct Fts3Index) + /* aIndex */
1227: nName + /* zName */
1228: nDb + /* zDb */
1229: nString; /* Space for azColumn strings */
1230: p = (Fts3Table*)sqlite3_malloc(nByte);
1231: if( p==0 ){
1232: rc = SQLITE_NOMEM;
1233: goto fts3_init_out;
1234: }
1235: memset(p, 0, nByte);
1236: p->db = db;
1237: p->nColumn = nCol;
1238: p->nPendingData = 0;
1239: p->azColumn = (char **)&p[1];
1240: p->pTokenizer = pTokenizer;
1241: p->nMaxPendingData = FTS3_MAX_PENDING_DATA;
1242: p->bHasDocsize = (isFts4 && bNoDocsize==0);
1243: p->bHasStat = isFts4;
1244: p->bDescIdx = bDescIdx;
1245: p->zContentTbl = zContent;
1246: zContent = 0;
1247: TESTONLY( p->inTransaction = -1 );
1248: TESTONLY( p->mxSavepoint = -1 );
1249:
1250: p->aIndex = (struct Fts3Index *)&p->azColumn[nCol];
1251: memcpy(p->aIndex, aIndex, sizeof(struct Fts3Index) * nIndex);
1252: p->nIndex = nIndex;
1253: for(i=0; i<nIndex; i++){
1254: fts3HashInit(&p->aIndex[i].hPending, FTS3_HASH_STRING, 1);
1255: }
1256:
1257: /* Fill in the zName and zDb fields of the vtab structure. */
1258: zCsr = (char *)&p->aIndex[nIndex];
1259: p->zName = zCsr;
1260: memcpy(zCsr, argv[2], nName);
1261: zCsr += nName;
1262: p->zDb = zCsr;
1263: memcpy(zCsr, argv[1], nDb);
1264: zCsr += nDb;
1265:
1266: /* Fill in the azColumn array */
1267: for(iCol=0; iCol<nCol; iCol++){
1268: char *z;
1269: int n = 0;
1270: z = (char *)sqlite3Fts3NextToken(aCol[iCol], &n);
1271: memcpy(zCsr, z, n);
1272: zCsr[n] = '\0';
1273: sqlite3Fts3Dequote(zCsr);
1274: p->azColumn[iCol] = zCsr;
1275: zCsr += n+1;
1276: assert( zCsr <= &((char *)p)[nByte] );
1277: }
1278:
1279: if( (zCompress==0)!=(zUncompress==0) ){
1280: char const *zMiss = (zCompress==0 ? "compress" : "uncompress");
1281: rc = SQLITE_ERROR;
1282: *pzErr = sqlite3_mprintf("missing %s parameter in fts4 constructor", zMiss);
1283: }
1284: p->zReadExprlist = fts3ReadExprList(p, zUncompress, &rc);
1285: p->zWriteExprlist = fts3WriteExprList(p, zCompress, &rc);
1286: if( rc!=SQLITE_OK ) goto fts3_init_out;
1287:
1288: /* If this is an xCreate call, create the underlying tables in the
1289: ** database. TODO: For xConnect(), it could verify that said tables exist.
1290: */
1291: if( isCreate ){
1292: rc = fts3CreateTables(p);
1293: }
1294:
1295: /* Figure out the page-size for the database. This is required in order to
1296: ** estimate the cost of loading large doclists from the database. */
1297: fts3DatabasePageSize(&rc, p);
1298: p->nNodeSize = p->nPgsz-35;
1299:
1300: /* Declare the table schema to SQLite. */
1301: fts3DeclareVtab(&rc, p);
1302:
1303: fts3_init_out:
1304: sqlite3_free(zPrefix);
1305: sqlite3_free(aIndex);
1306: sqlite3_free(zCompress);
1307: sqlite3_free(zUncompress);
1308: sqlite3_free(zContent);
1309: sqlite3_free((void *)aCol);
1310: if( rc!=SQLITE_OK ){
1311: if( p ){
1312: fts3DisconnectMethod((sqlite3_vtab *)p);
1313: }else if( pTokenizer ){
1314: pTokenizer->pModule->xDestroy(pTokenizer);
1315: }
1316: }else{
1317: assert( p->pSegments==0 );
1318: *ppVTab = &p->base;
1319: }
1320: return rc;
1321: }
1322:
1323: /*
1324: ** The xConnect() and xCreate() methods for the virtual table. All the
1325: ** work is done in function fts3InitVtab().
1326: */
1327: static int fts3ConnectMethod(
1328: sqlite3 *db, /* Database connection */
1329: void *pAux, /* Pointer to tokenizer hash table */
1330: int argc, /* Number of elements in argv array */
1331: const char * const *argv, /* xCreate/xConnect argument array */
1332: sqlite3_vtab **ppVtab, /* OUT: New sqlite3_vtab object */
1333: char **pzErr /* OUT: sqlite3_malloc'd error message */
1334: ){
1335: return fts3InitVtab(0, db, pAux, argc, argv, ppVtab, pzErr);
1336: }
1337: static int fts3CreateMethod(
1338: sqlite3 *db, /* Database connection */
1339: void *pAux, /* Pointer to tokenizer hash table */
1340: int argc, /* Number of elements in argv array */
1341: const char * const *argv, /* xCreate/xConnect argument array */
1342: sqlite3_vtab **ppVtab, /* OUT: New sqlite3_vtab object */
1343: char **pzErr /* OUT: sqlite3_malloc'd error message */
1344: ){
1345: return fts3InitVtab(1, db, pAux, argc, argv, ppVtab, pzErr);
1346: }
1347:
1348: /*
1349: ** Implementation of the xBestIndex method for FTS3 tables. There
1350: ** are three possible strategies, in order of preference:
1351: **
1352: ** 1. Direct lookup by rowid or docid.
1353: ** 2. Full-text search using a MATCH operator on a non-docid column.
1354: ** 3. Linear scan of %_content table.
1355: */
1356: static int fts3BestIndexMethod(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
1357: Fts3Table *p = (Fts3Table *)pVTab;
1358: int i; /* Iterator variable */
1359: int iCons = -1; /* Index of constraint to use */
1360:
1361: /* By default use a full table scan. This is an expensive option,
1362: ** so search through the constraints to see if a more efficient
1363: ** strategy is possible.
1364: */
1365: pInfo->idxNum = FTS3_FULLSCAN_SEARCH;
1366: pInfo->estimatedCost = 500000;
1367: for(i=0; i<pInfo->nConstraint; i++){
1368: struct sqlite3_index_constraint *pCons = &pInfo->aConstraint[i];
1369: if( pCons->usable==0 ) continue;
1370:
1371: /* A direct lookup on the rowid or docid column. Assign a cost of 1.0. */
1372: if( pCons->op==SQLITE_INDEX_CONSTRAINT_EQ
1373: && (pCons->iColumn<0 || pCons->iColumn==p->nColumn+1 )
1374: ){
1375: pInfo->idxNum = FTS3_DOCID_SEARCH;
1376: pInfo->estimatedCost = 1.0;
1377: iCons = i;
1378: }
1379:
1380: /* A MATCH constraint. Use a full-text search.
1381: **
1382: ** If there is more than one MATCH constraint available, use the first
1383: ** one encountered. If there is both a MATCH constraint and a direct
1384: ** rowid/docid lookup, prefer the MATCH strategy. This is done even
1385: ** though the rowid/docid lookup is faster than a MATCH query, selecting
1386: ** it would lead to an "unable to use function MATCH in the requested
1387: ** context" error.
1388: */
1389: if( pCons->op==SQLITE_INDEX_CONSTRAINT_MATCH
1390: && pCons->iColumn>=0 && pCons->iColumn<=p->nColumn
1391: ){
1392: pInfo->idxNum = FTS3_FULLTEXT_SEARCH + pCons->iColumn;
1393: pInfo->estimatedCost = 2.0;
1394: iCons = i;
1395: break;
1396: }
1397: }
1398:
1399: if( iCons>=0 ){
1400: pInfo->aConstraintUsage[iCons].argvIndex = 1;
1401: pInfo->aConstraintUsage[iCons].omit = 1;
1402: }
1403:
1404: /* Regardless of the strategy selected, FTS can deliver rows in rowid (or
1405: ** docid) order. Both ascending and descending are possible.
1406: */
1407: if( pInfo->nOrderBy==1 ){
1408: struct sqlite3_index_orderby *pOrder = &pInfo->aOrderBy[0];
1409: if( pOrder->iColumn<0 || pOrder->iColumn==p->nColumn+1 ){
1410: if( pOrder->desc ){
1411: pInfo->idxStr = "DESC";
1412: }else{
1413: pInfo->idxStr = "ASC";
1414: }
1415: pInfo->orderByConsumed = 1;
1416: }
1417: }
1418:
1419: assert( p->pSegments==0 );
1420: return SQLITE_OK;
1421: }
1422:
1423: /*
1424: ** Implementation of xOpen method.
1425: */
1426: static int fts3OpenMethod(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCsr){
1427: sqlite3_vtab_cursor *pCsr; /* Allocated cursor */
1428:
1429: UNUSED_PARAMETER(pVTab);
1430:
1431: /* Allocate a buffer large enough for an Fts3Cursor structure. If the
1432: ** allocation succeeds, zero it and return SQLITE_OK. Otherwise,
1433: ** if the allocation fails, return SQLITE_NOMEM.
1434: */
1435: *ppCsr = pCsr = (sqlite3_vtab_cursor *)sqlite3_malloc(sizeof(Fts3Cursor));
1436: if( !pCsr ){
1437: return SQLITE_NOMEM;
1438: }
1439: memset(pCsr, 0, sizeof(Fts3Cursor));
1440: return SQLITE_OK;
1441: }
1442:
1443: /*
1444: ** Close the cursor. For additional information see the documentation
1445: ** on the xClose method of the virtual table interface.
1446: */
1447: static int fts3CloseMethod(sqlite3_vtab_cursor *pCursor){
1448: Fts3Cursor *pCsr = (Fts3Cursor *)pCursor;
1449: assert( ((Fts3Table *)pCsr->base.pVtab)->pSegments==0 );
1450: sqlite3_finalize(pCsr->pStmt);
1451: sqlite3Fts3ExprFree(pCsr->pExpr);
1452: sqlite3Fts3FreeDeferredTokens(pCsr);
1453: sqlite3_free(pCsr->aDoclist);
1454: sqlite3_free(pCsr->aMatchinfo);
1455: assert( ((Fts3Table *)pCsr->base.pVtab)->pSegments==0 );
1456: sqlite3_free(pCsr);
1457: return SQLITE_OK;
1458: }
1459:
1460: /*
1461: ** If pCsr->pStmt has not been prepared (i.e. if pCsr->pStmt==0), then
1462: ** compose and prepare an SQL statement of the form:
1463: **
1464: ** "SELECT <columns> FROM %_content WHERE rowid = ?"
1465: **
1466: ** (or the equivalent for a content=xxx table) and set pCsr->pStmt to
1467: ** it. If an error occurs, return an SQLite error code.
1468: **
1469: ** Otherwise, set *ppStmt to point to pCsr->pStmt and return SQLITE_OK.
1470: */
1471: static int fts3CursorSeekStmt(Fts3Cursor *pCsr, sqlite3_stmt **ppStmt){
1472: int rc = SQLITE_OK;
1473: if( pCsr->pStmt==0 ){
1474: Fts3Table *p = (Fts3Table *)pCsr->base.pVtab;
1475: char *zSql;
1476: zSql = sqlite3_mprintf("SELECT %s WHERE rowid = ?", p->zReadExprlist);
1477: if( !zSql ) return SQLITE_NOMEM;
1478: rc = sqlite3_prepare_v2(p->db, zSql, -1, &pCsr->pStmt, 0);
1479: sqlite3_free(zSql);
1480: }
1481: *ppStmt = pCsr->pStmt;
1482: return rc;
1483: }
1484:
1485: /*
1486: ** Position the pCsr->pStmt statement so that it is on the row
1487: ** of the %_content table that contains the last match. Return
1488: ** SQLITE_OK on success.
1489: */
1490: static int fts3CursorSeek(sqlite3_context *pContext, Fts3Cursor *pCsr){
1491: int rc = SQLITE_OK;
1492: if( pCsr->isRequireSeek ){
1493: sqlite3_stmt *pStmt = 0;
1494:
1495: rc = fts3CursorSeekStmt(pCsr, &pStmt);
1496: if( rc==SQLITE_OK ){
1497: sqlite3_bind_int64(pCsr->pStmt, 1, pCsr->iPrevId);
1498: pCsr->isRequireSeek = 0;
1499: if( SQLITE_ROW==sqlite3_step(pCsr->pStmt) ){
1500: return SQLITE_OK;
1501: }else{
1502: rc = sqlite3_reset(pCsr->pStmt);
1503: if( rc==SQLITE_OK && ((Fts3Table *)pCsr->base.pVtab)->zContentTbl==0 ){
1504: /* If no row was found and no error has occured, then the %_content
1505: ** table is missing a row that is present in the full-text index.
1506: ** The data structures are corrupt. */
1507: rc = FTS_CORRUPT_VTAB;
1508: pCsr->isEof = 1;
1509: }
1510: }
1511: }
1512: }
1513:
1514: if( rc!=SQLITE_OK && pContext ){
1515: sqlite3_result_error_code(pContext, rc);
1516: }
1517: return rc;
1518: }
1519:
1520: /*
1521: ** This function is used to process a single interior node when searching
1522: ** a b-tree for a term or term prefix. The node data is passed to this
1523: ** function via the zNode/nNode parameters. The term to search for is
1524: ** passed in zTerm/nTerm.
1525: **
1526: ** If piFirst is not NULL, then this function sets *piFirst to the blockid
1527: ** of the child node that heads the sub-tree that may contain the term.
1528: **
1529: ** If piLast is not NULL, then *piLast is set to the right-most child node
1530: ** that heads a sub-tree that may contain a term for which zTerm/nTerm is
1531: ** a prefix.
1532: **
1533: ** If an OOM error occurs, SQLITE_NOMEM is returned. Otherwise, SQLITE_OK.
1534: */
1535: static int fts3ScanInteriorNode(
1536: const char *zTerm, /* Term to select leaves for */
1537: int nTerm, /* Size of term zTerm in bytes */
1538: const char *zNode, /* Buffer containing segment interior node */
1539: int nNode, /* Size of buffer at zNode */
1540: sqlite3_int64 *piFirst, /* OUT: Selected child node */
1541: sqlite3_int64 *piLast /* OUT: Selected child node */
1542: ){
1543: int rc = SQLITE_OK; /* Return code */
1544: const char *zCsr = zNode; /* Cursor to iterate through node */
1545: const char *zEnd = &zCsr[nNode];/* End of interior node buffer */
1546: char *zBuffer = 0; /* Buffer to load terms into */
1547: int nAlloc = 0; /* Size of allocated buffer */
1548: int isFirstTerm = 1; /* True when processing first term on page */
1549: sqlite3_int64 iChild; /* Block id of child node to descend to */
1550:
1551: /* Skip over the 'height' varint that occurs at the start of every
1552: ** interior node. Then load the blockid of the left-child of the b-tree
1553: ** node into variable iChild.
1554: **
1555: ** Even if the data structure on disk is corrupted, this (reading two
1556: ** varints from the buffer) does not risk an overread. If zNode is a
1557: ** root node, then the buffer comes from a SELECT statement. SQLite does
1558: ** not make this guarantee explicitly, but in practice there are always
1559: ** either more than 20 bytes of allocated space following the nNode bytes of
1560: ** contents, or two zero bytes. Or, if the node is read from the %_segments
1561: ** table, then there are always 20 bytes of zeroed padding following the
1562: ** nNode bytes of content (see sqlite3Fts3ReadBlock() for details).
1563: */
1564: zCsr += sqlite3Fts3GetVarint(zCsr, &iChild);
1565: zCsr += sqlite3Fts3GetVarint(zCsr, &iChild);
1566: if( zCsr>zEnd ){
1567: return FTS_CORRUPT_VTAB;
1568: }
1569:
1570: while( zCsr<zEnd && (piFirst || piLast) ){
1571: int cmp; /* memcmp() result */
1572: int nSuffix; /* Size of term suffix */
1573: int nPrefix = 0; /* Size of term prefix */
1574: int nBuffer; /* Total term size */
1575:
1576: /* Load the next term on the node into zBuffer. Use realloc() to expand
1577: ** the size of zBuffer if required. */
1578: if( !isFirstTerm ){
1579: zCsr += sqlite3Fts3GetVarint32(zCsr, &nPrefix);
1580: }
1581: isFirstTerm = 0;
1582: zCsr += sqlite3Fts3GetVarint32(zCsr, &nSuffix);
1583:
1584: if( nPrefix<0 || nSuffix<0 || &zCsr[nSuffix]>zEnd ){
1585: rc = FTS_CORRUPT_VTAB;
1586: goto finish_scan;
1587: }
1588: if( nPrefix+nSuffix>nAlloc ){
1589: char *zNew;
1590: nAlloc = (nPrefix+nSuffix) * 2;
1591: zNew = (char *)sqlite3_realloc(zBuffer, nAlloc);
1592: if( !zNew ){
1593: rc = SQLITE_NOMEM;
1594: goto finish_scan;
1595: }
1596: zBuffer = zNew;
1597: }
1598: assert( zBuffer );
1599: memcpy(&zBuffer[nPrefix], zCsr, nSuffix);
1600: nBuffer = nPrefix + nSuffix;
1601: zCsr += nSuffix;
1602:
1603: /* Compare the term we are searching for with the term just loaded from
1604: ** the interior node. If the specified term is greater than or equal
1605: ** to the term from the interior node, then all terms on the sub-tree
1606: ** headed by node iChild are smaller than zTerm. No need to search
1607: ** iChild.
1608: **
1609: ** If the interior node term is larger than the specified term, then
1610: ** the tree headed by iChild may contain the specified term.
1611: */
1612: cmp = memcmp(zTerm, zBuffer, (nBuffer>nTerm ? nTerm : nBuffer));
1613: if( piFirst && (cmp<0 || (cmp==0 && nBuffer>nTerm)) ){
1614: *piFirst = iChild;
1615: piFirst = 0;
1616: }
1617:
1618: if( piLast && cmp<0 ){
1619: *piLast = iChild;
1620: piLast = 0;
1621: }
1622:
1623: iChild++;
1624: };
1625:
1626: if( piFirst ) *piFirst = iChild;
1627: if( piLast ) *piLast = iChild;
1628:
1629: finish_scan:
1630: sqlite3_free(zBuffer);
1631: return rc;
1632: }
1633:
1634:
1635: /*
1636: ** The buffer pointed to by argument zNode (size nNode bytes) contains an
1637: ** interior node of a b-tree segment. The zTerm buffer (size nTerm bytes)
1638: ** contains a term. This function searches the sub-tree headed by the zNode
1639: ** node for the range of leaf nodes that may contain the specified term
1640: ** or terms for which the specified term is a prefix.
1641: **
1642: ** If piLeaf is not NULL, then *piLeaf is set to the blockid of the
1643: ** left-most leaf node in the tree that may contain the specified term.
1644: ** If piLeaf2 is not NULL, then *piLeaf2 is set to the blockid of the
1645: ** right-most leaf node that may contain a term for which the specified
1646: ** term is a prefix.
1647: **
1648: ** It is possible that the range of returned leaf nodes does not contain
1649: ** the specified term or any terms for which it is a prefix. However, if the
1650: ** segment does contain any such terms, they are stored within the identified
1651: ** range. Because this function only inspects interior segment nodes (and
1652: ** never loads leaf nodes into memory), it is not possible to be sure.
1653: **
1654: ** If an error occurs, an error code other than SQLITE_OK is returned.
1655: */
1656: static int fts3SelectLeaf(
1657: Fts3Table *p, /* Virtual table handle */
1658: const char *zTerm, /* Term to select leaves for */
1659: int nTerm, /* Size of term zTerm in bytes */
1660: const char *zNode, /* Buffer containing segment interior node */
1661: int nNode, /* Size of buffer at zNode */
1662: sqlite3_int64 *piLeaf, /* Selected leaf node */
1663: sqlite3_int64 *piLeaf2 /* Selected leaf node */
1664: ){
1665: int rc; /* Return code */
1666: int iHeight; /* Height of this node in tree */
1667:
1668: assert( piLeaf || piLeaf2 );
1669:
1670: sqlite3Fts3GetVarint32(zNode, &iHeight);
1671: rc = fts3ScanInteriorNode(zTerm, nTerm, zNode, nNode, piLeaf, piLeaf2);
1672: assert( !piLeaf2 || !piLeaf || rc!=SQLITE_OK || (*piLeaf<=*piLeaf2) );
1673:
1674: if( rc==SQLITE_OK && iHeight>1 ){
1675: char *zBlob = 0; /* Blob read from %_segments table */
1676: int nBlob; /* Size of zBlob in bytes */
1677:
1678: if( piLeaf && piLeaf2 && (*piLeaf!=*piLeaf2) ){
1679: rc = sqlite3Fts3ReadBlock(p, *piLeaf, &zBlob, &nBlob, 0);
1680: if( rc==SQLITE_OK ){
1681: rc = fts3SelectLeaf(p, zTerm, nTerm, zBlob, nBlob, piLeaf, 0);
1682: }
1683: sqlite3_free(zBlob);
1684: piLeaf = 0;
1685: zBlob = 0;
1686: }
1687:
1688: if( rc==SQLITE_OK ){
1689: rc = sqlite3Fts3ReadBlock(p, piLeaf?*piLeaf:*piLeaf2, &zBlob, &nBlob, 0);
1690: }
1691: if( rc==SQLITE_OK ){
1692: rc = fts3SelectLeaf(p, zTerm, nTerm, zBlob, nBlob, piLeaf, piLeaf2);
1693: }
1694: sqlite3_free(zBlob);
1695: }
1696:
1697: return rc;
1698: }
1699:
1700: /*
1701: ** This function is used to create delta-encoded serialized lists of FTS3
1702: ** varints. Each call to this function appends a single varint to a list.
1703: */
1704: static void fts3PutDeltaVarint(
1705: char **pp, /* IN/OUT: Output pointer */
1706: sqlite3_int64 *piPrev, /* IN/OUT: Previous value written to list */
1707: sqlite3_int64 iVal /* Write this value to the list */
1708: ){
1709: assert( iVal-*piPrev > 0 || (*piPrev==0 && iVal==0) );
1710: *pp += sqlite3Fts3PutVarint(*pp, iVal-*piPrev);
1711: *piPrev = iVal;
1712: }
1713:
1714: /*
1715: ** When this function is called, *ppPoslist is assumed to point to the
1716: ** start of a position-list. After it returns, *ppPoslist points to the
1717: ** first byte after the position-list.
1718: **
1719: ** A position list is list of positions (delta encoded) and columns for
1720: ** a single document record of a doclist. So, in other words, this
1721: ** routine advances *ppPoslist so that it points to the next docid in
1722: ** the doclist, or to the first byte past the end of the doclist.
1723: **
1724: ** If pp is not NULL, then the contents of the position list are copied
1725: ** to *pp. *pp is set to point to the first byte past the last byte copied
1726: ** before this function returns.
1727: */
1728: static void fts3PoslistCopy(char **pp, char **ppPoslist){
1729: char *pEnd = *ppPoslist;
1730: char c = 0;
1731:
1732: /* The end of a position list is marked by a zero encoded as an FTS3
1733: ** varint. A single POS_END (0) byte. Except, if the 0 byte is preceded by
1734: ** a byte with the 0x80 bit set, then it is not a varint 0, but the tail
1735: ** of some other, multi-byte, value.
1736: **
1737: ** The following while-loop moves pEnd to point to the first byte that is not
1738: ** immediately preceded by a byte with the 0x80 bit set. Then increments
1739: ** pEnd once more so that it points to the byte immediately following the
1740: ** last byte in the position-list.
1741: */
1742: while( *pEnd | c ){
1743: c = *pEnd++ & 0x80;
1744: testcase( c!=0 && (*pEnd)==0 );
1745: }
1746: pEnd++; /* Advance past the POS_END terminator byte */
1747:
1748: if( pp ){
1749: int n = (int)(pEnd - *ppPoslist);
1750: char *p = *pp;
1751: memcpy(p, *ppPoslist, n);
1752: p += n;
1753: *pp = p;
1754: }
1755: *ppPoslist = pEnd;
1756: }
1757:
1758: /*
1759: ** When this function is called, *ppPoslist is assumed to point to the
1760: ** start of a column-list. After it returns, *ppPoslist points to the
1761: ** to the terminator (POS_COLUMN or POS_END) byte of the column-list.
1762: **
1763: ** A column-list is list of delta-encoded positions for a single column
1764: ** within a single document within a doclist.
1765: **
1766: ** The column-list is terminated either by a POS_COLUMN varint (1) or
1767: ** a POS_END varint (0). This routine leaves *ppPoslist pointing to
1768: ** the POS_COLUMN or POS_END that terminates the column-list.
1769: **
1770: ** If pp is not NULL, then the contents of the column-list are copied
1771: ** to *pp. *pp is set to point to the first byte past the last byte copied
1772: ** before this function returns. The POS_COLUMN or POS_END terminator
1773: ** is not copied into *pp.
1774: */
1775: static void fts3ColumnlistCopy(char **pp, char **ppPoslist){
1776: char *pEnd = *ppPoslist;
1777: char c = 0;
1778:
1779: /* A column-list is terminated by either a 0x01 or 0x00 byte that is
1780: ** not part of a multi-byte varint.
1781: */
1782: while( 0xFE & (*pEnd | c) ){
1783: c = *pEnd++ & 0x80;
1784: testcase( c!=0 && ((*pEnd)&0xfe)==0 );
1785: }
1786: if( pp ){
1787: int n = (int)(pEnd - *ppPoslist);
1788: char *p = *pp;
1789: memcpy(p, *ppPoslist, n);
1790: p += n;
1791: *pp = p;
1792: }
1793: *ppPoslist = pEnd;
1794: }
1795:
1796: /*
1797: ** Value used to signify the end of an position-list. This is safe because
1798: ** it is not possible to have a document with 2^31 terms.
1799: */
1800: #define POSITION_LIST_END 0x7fffffff
1801:
1802: /*
1803: ** This function is used to help parse position-lists. When this function is
1804: ** called, *pp may point to the start of the next varint in the position-list
1805: ** being parsed, or it may point to 1 byte past the end of the position-list
1806: ** (in which case **pp will be a terminator bytes POS_END (0) or
1807: ** (1)).
1808: **
1809: ** If *pp points past the end of the current position-list, set *pi to
1810: ** POSITION_LIST_END and return. Otherwise, read the next varint from *pp,
1811: ** increment the current value of *pi by the value read, and set *pp to
1812: ** point to the next value before returning.
1813: **
1814: ** Before calling this routine *pi must be initialized to the value of
1815: ** the previous position, or zero if we are reading the first position
1816: ** in the position-list. Because positions are delta-encoded, the value
1817: ** of the previous position is needed in order to compute the value of
1818: ** the next position.
1819: */
1820: static void fts3ReadNextPos(
1821: char **pp, /* IN/OUT: Pointer into position-list buffer */
1822: sqlite3_int64 *pi /* IN/OUT: Value read from position-list */
1823: ){
1824: if( (**pp)&0xFE ){
1825: fts3GetDeltaVarint(pp, pi);
1826: *pi -= 2;
1827: }else{
1828: *pi = POSITION_LIST_END;
1829: }
1830: }
1831:
1832: /*
1833: ** If parameter iCol is not 0, write an POS_COLUMN (1) byte followed by
1834: ** the value of iCol encoded as a varint to *pp. This will start a new
1835: ** column list.
1836: **
1837: ** Set *pp to point to the byte just after the last byte written before
1838: ** returning (do not modify it if iCol==0). Return the total number of bytes
1839: ** written (0 if iCol==0).
1840: */
1841: static int fts3PutColNumber(char **pp, int iCol){
1842: int n = 0; /* Number of bytes written */
1843: if( iCol ){
1844: char *p = *pp; /* Output pointer */
1845: n = 1 + sqlite3Fts3PutVarint(&p[1], iCol);
1846: *p = 0x01;
1847: *pp = &p[n];
1848: }
1849: return n;
1850: }
1851:
1852: /*
1853: ** Compute the union of two position lists. The output written
1854: ** into *pp contains all positions of both *pp1 and *pp2 in sorted
1855: ** order and with any duplicates removed. All pointers are
1856: ** updated appropriately. The caller is responsible for insuring
1857: ** that there is enough space in *pp to hold the complete output.
1858: */
1859: static void fts3PoslistMerge(
1860: char **pp, /* Output buffer */
1861: char **pp1, /* Left input list */
1862: char **pp2 /* Right input list */
1863: ){
1864: char *p = *pp;
1865: char *p1 = *pp1;
1866: char *p2 = *pp2;
1867:
1868: while( *p1 || *p2 ){
1869: int iCol1; /* The current column index in pp1 */
1870: int iCol2; /* The current column index in pp2 */
1871:
1872: if( *p1==POS_COLUMN ) sqlite3Fts3GetVarint32(&p1[1], &iCol1);
1873: else if( *p1==POS_END ) iCol1 = POSITION_LIST_END;
1874: else iCol1 = 0;
1875:
1876: if( *p2==POS_COLUMN ) sqlite3Fts3GetVarint32(&p2[1], &iCol2);
1877: else if( *p2==POS_END ) iCol2 = POSITION_LIST_END;
1878: else iCol2 = 0;
1879:
1880: if( iCol1==iCol2 ){
1881: sqlite3_int64 i1 = 0; /* Last position from pp1 */
1882: sqlite3_int64 i2 = 0; /* Last position from pp2 */
1883: sqlite3_int64 iPrev = 0;
1884: int n = fts3PutColNumber(&p, iCol1);
1885: p1 += n;
1886: p2 += n;
1887:
1888: /* At this point, both p1 and p2 point to the start of column-lists
1889: ** for the same column (the column with index iCol1 and iCol2).
1890: ** A column-list is a list of non-negative delta-encoded varints, each
1891: ** incremented by 2 before being stored. Each list is terminated by a
1892: ** POS_END (0) or POS_COLUMN (1). The following block merges the two lists
1893: ** and writes the results to buffer p. p is left pointing to the byte
1894: ** after the list written. No terminator (POS_END or POS_COLUMN) is
1895: ** written to the output.
1896: */
1897: fts3GetDeltaVarint(&p1, &i1);
1898: fts3GetDeltaVarint(&p2, &i2);
1899: do {
1900: fts3PutDeltaVarint(&p, &iPrev, (i1<i2) ? i1 : i2);
1901: iPrev -= 2;
1902: if( i1==i2 ){
1903: fts3ReadNextPos(&p1, &i1);
1904: fts3ReadNextPos(&p2, &i2);
1905: }else if( i1<i2 ){
1906: fts3ReadNextPos(&p1, &i1);
1907: }else{
1908: fts3ReadNextPos(&p2, &i2);
1909: }
1910: }while( i1!=POSITION_LIST_END || i2!=POSITION_LIST_END );
1911: }else if( iCol1<iCol2 ){
1912: p1 += fts3PutColNumber(&p, iCol1);
1913: fts3ColumnlistCopy(&p, &p1);
1914: }else{
1915: p2 += fts3PutColNumber(&p, iCol2);
1916: fts3ColumnlistCopy(&p, &p2);
1917: }
1918: }
1919:
1920: *p++ = POS_END;
1921: *pp = p;
1922: *pp1 = p1 + 1;
1923: *pp2 = p2 + 1;
1924: }
1925:
1926: /*
1927: ** This function is used to merge two position lists into one. When it is
1928: ** called, *pp1 and *pp2 must both point to position lists. A position-list is
1929: ** the part of a doclist that follows each document id. For example, if a row
1930: ** contains:
1931: **
1932: ** 'a b c'|'x y z'|'a b b a'
1933: **
1934: ** Then the position list for this row for token 'b' would consist of:
1935: **
1936: ** 0x02 0x01 0x02 0x03 0x03 0x00
1937: **
1938: ** When this function returns, both *pp1 and *pp2 are left pointing to the
1939: ** byte following the 0x00 terminator of their respective position lists.
1940: **
1941: ** If isSaveLeft is 0, an entry is added to the output position list for
1942: ** each position in *pp2 for which there exists one or more positions in
1943: ** *pp1 so that (pos(*pp2)>pos(*pp1) && pos(*pp2)-pos(*pp1)<=nToken). i.e.
1944: ** when the *pp1 token appears before the *pp2 token, but not more than nToken
1945: ** slots before it.
1946: **
1947: ** e.g. nToken==1 searches for adjacent positions.
1948: */
1949: static int fts3PoslistPhraseMerge(
1950: char **pp, /* IN/OUT: Preallocated output buffer */
1951: int nToken, /* Maximum difference in token positions */
1952: int isSaveLeft, /* Save the left position */
1953: int isExact, /* If *pp1 is exactly nTokens before *pp2 */
1954: char **pp1, /* IN/OUT: Left input list */
1955: char **pp2 /* IN/OUT: Right input list */
1956: ){
1957: char *p = *pp;
1958: char *p1 = *pp1;
1959: char *p2 = *pp2;
1960: int iCol1 = 0;
1961: int iCol2 = 0;
1962:
1963: /* Never set both isSaveLeft and isExact for the same invocation. */
1964: assert( isSaveLeft==0 || isExact==0 );
1965:
1966: assert( p!=0 && *p1!=0 && *p2!=0 );
1967: if( *p1==POS_COLUMN ){
1968: p1++;
1969: p1 += sqlite3Fts3GetVarint32(p1, &iCol1);
1970: }
1971: if( *p2==POS_COLUMN ){
1972: p2++;
1973: p2 += sqlite3Fts3GetVarint32(p2, &iCol2);
1974: }
1975:
1976: while( 1 ){
1977: if( iCol1==iCol2 ){
1978: char *pSave = p;
1979: sqlite3_int64 iPrev = 0;
1980: sqlite3_int64 iPos1 = 0;
1981: sqlite3_int64 iPos2 = 0;
1982:
1983: if( iCol1 ){
1984: *p++ = POS_COLUMN;
1985: p += sqlite3Fts3PutVarint(p, iCol1);
1986: }
1987:
1988: assert( *p1!=POS_END && *p1!=POS_COLUMN );
1989: assert( *p2!=POS_END && *p2!=POS_COLUMN );
1990: fts3GetDeltaVarint(&p1, &iPos1); iPos1 -= 2;
1991: fts3GetDeltaVarint(&p2, &iPos2); iPos2 -= 2;
1992:
1993: while( 1 ){
1994: if( iPos2==iPos1+nToken
1995: || (isExact==0 && iPos2>iPos1 && iPos2<=iPos1+nToken)
1996: ){
1997: sqlite3_int64 iSave;
1998: iSave = isSaveLeft ? iPos1 : iPos2;
1999: fts3PutDeltaVarint(&p, &iPrev, iSave+2); iPrev -= 2;
2000: pSave = 0;
2001: assert( p );
2002: }
2003: if( (!isSaveLeft && iPos2<=(iPos1+nToken)) || iPos2<=iPos1 ){
2004: if( (*p2&0xFE)==0 ) break;
2005: fts3GetDeltaVarint(&p2, &iPos2); iPos2 -= 2;
2006: }else{
2007: if( (*p1&0xFE)==0 ) break;
2008: fts3GetDeltaVarint(&p1, &iPos1); iPos1 -= 2;
2009: }
2010: }
2011:
2012: if( pSave ){
2013: assert( pp && p );
2014: p = pSave;
2015: }
2016:
2017: fts3ColumnlistCopy(0, &p1);
2018: fts3ColumnlistCopy(0, &p2);
2019: assert( (*p1&0xFE)==0 && (*p2&0xFE)==0 );
2020: if( 0==*p1 || 0==*p2 ) break;
2021:
2022: p1++;
2023: p1 += sqlite3Fts3GetVarint32(p1, &iCol1);
2024: p2++;
2025: p2 += sqlite3Fts3GetVarint32(p2, &iCol2);
2026: }
2027:
2028: /* Advance pointer p1 or p2 (whichever corresponds to the smaller of
2029: ** iCol1 and iCol2) so that it points to either the 0x00 that marks the
2030: ** end of the position list, or the 0x01 that precedes the next
2031: ** column-number in the position list.
2032: */
2033: else if( iCol1<iCol2 ){
2034: fts3ColumnlistCopy(0, &p1);
2035: if( 0==*p1 ) break;
2036: p1++;
2037: p1 += sqlite3Fts3GetVarint32(p1, &iCol1);
2038: }else{
2039: fts3ColumnlistCopy(0, &p2);
2040: if( 0==*p2 ) break;
2041: p2++;
2042: p2 += sqlite3Fts3GetVarint32(p2, &iCol2);
2043: }
2044: }
2045:
2046: fts3PoslistCopy(0, &p2);
2047: fts3PoslistCopy(0, &p1);
2048: *pp1 = p1;
2049: *pp2 = p2;
2050: if( *pp==p ){
2051: return 0;
2052: }
2053: *p++ = 0x00;
2054: *pp = p;
2055: return 1;
2056: }
2057:
2058: /*
2059: ** Merge two position-lists as required by the NEAR operator. The argument
2060: ** position lists correspond to the left and right phrases of an expression
2061: ** like:
2062: **
2063: ** "phrase 1" NEAR "phrase number 2"
2064: **
2065: ** Position list *pp1 corresponds to the left-hand side of the NEAR
2066: ** expression and *pp2 to the right. As usual, the indexes in the position
2067: ** lists are the offsets of the last token in each phrase (tokens "1" and "2"
2068: ** in the example above).
2069: **
2070: ** The output position list - written to *pp - is a copy of *pp2 with those
2071: ** entries that are not sufficiently NEAR entries in *pp1 removed.
2072: */
2073: static int fts3PoslistNearMerge(
2074: char **pp, /* Output buffer */
2075: char *aTmp, /* Temporary buffer space */
2076: int nRight, /* Maximum difference in token positions */
2077: int nLeft, /* Maximum difference in token positions */
2078: char **pp1, /* IN/OUT: Left input list */
2079: char **pp2 /* IN/OUT: Right input list */
2080: ){
2081: char *p1 = *pp1;
2082: char *p2 = *pp2;
2083:
2084: char *pTmp1 = aTmp;
2085: char *pTmp2;
2086: char *aTmp2;
2087: int res = 1;
2088:
2089: fts3PoslistPhraseMerge(&pTmp1, nRight, 0, 0, pp1, pp2);
2090: aTmp2 = pTmp2 = pTmp1;
2091: *pp1 = p1;
2092: *pp2 = p2;
2093: fts3PoslistPhraseMerge(&pTmp2, nLeft, 1, 0, pp2, pp1);
2094: if( pTmp1!=aTmp && pTmp2!=aTmp2 ){
2095: fts3PoslistMerge(pp, &aTmp, &aTmp2);
2096: }else if( pTmp1!=aTmp ){
2097: fts3PoslistCopy(pp, &aTmp);
2098: }else if( pTmp2!=aTmp2 ){
2099: fts3PoslistCopy(pp, &aTmp2);
2100: }else{
2101: res = 0;
2102: }
2103:
2104: return res;
2105: }
2106:
2107: /*
2108: ** An instance of this function is used to merge together the (potentially
2109: ** large number of) doclists for each term that matches a prefix query.
2110: ** See function fts3TermSelectMerge() for details.
2111: */
2112: typedef struct TermSelect TermSelect;
2113: struct TermSelect {
2114: char *aaOutput[16]; /* Malloc'd output buffers */
2115: int anOutput[16]; /* Size each output buffer in bytes */
2116: };
2117:
2118: /*
2119: ** This function is used to read a single varint from a buffer. Parameter
2120: ** pEnd points 1 byte past the end of the buffer. When this function is
2121: ** called, if *pp points to pEnd or greater, then the end of the buffer
2122: ** has been reached. In this case *pp is set to 0 and the function returns.
2123: **
2124: ** If *pp does not point to or past pEnd, then a single varint is read
2125: ** from *pp. *pp is then set to point 1 byte past the end of the read varint.
2126: **
2127: ** If bDescIdx is false, the value read is added to *pVal before returning.
2128: ** If it is true, the value read is subtracted from *pVal before this
2129: ** function returns.
2130: */
2131: static void fts3GetDeltaVarint3(
2132: char **pp, /* IN/OUT: Point to read varint from */
2133: char *pEnd, /* End of buffer */
2134: int bDescIdx, /* True if docids are descending */
2135: sqlite3_int64 *pVal /* IN/OUT: Integer value */
2136: ){
2137: if( *pp>=pEnd ){
2138: *pp = 0;
2139: }else{
2140: sqlite3_int64 iVal;
2141: *pp += sqlite3Fts3GetVarint(*pp, &iVal);
2142: if( bDescIdx ){
2143: *pVal -= iVal;
2144: }else{
2145: *pVal += iVal;
2146: }
2147: }
2148: }
2149:
2150: /*
2151: ** This function is used to write a single varint to a buffer. The varint
2152: ** is written to *pp. Before returning, *pp is set to point 1 byte past the
2153: ** end of the value written.
2154: **
2155: ** If *pbFirst is zero when this function is called, the value written to
2156: ** the buffer is that of parameter iVal.
2157: **
2158: ** If *pbFirst is non-zero when this function is called, then the value
2159: ** written is either (iVal-*piPrev) (if bDescIdx is zero) or (*piPrev-iVal)
2160: ** (if bDescIdx is non-zero).
2161: **
2162: ** Before returning, this function always sets *pbFirst to 1 and *piPrev
2163: ** to the value of parameter iVal.
2164: */
2165: static void fts3PutDeltaVarint3(
2166: char **pp, /* IN/OUT: Output pointer */
2167: int bDescIdx, /* True for descending docids */
2168: sqlite3_int64 *piPrev, /* IN/OUT: Previous value written to list */
2169: int *pbFirst, /* IN/OUT: True after first int written */
2170: sqlite3_int64 iVal /* Write this value to the list */
2171: ){
2172: sqlite3_int64 iWrite;
2173: if( bDescIdx==0 || *pbFirst==0 ){
2174: iWrite = iVal - *piPrev;
2175: }else{
2176: iWrite = *piPrev - iVal;
2177: }
2178: assert( *pbFirst || *piPrev==0 );
2179: assert( *pbFirst==0 || iWrite>0 );
2180: *pp += sqlite3Fts3PutVarint(*pp, iWrite);
2181: *piPrev = iVal;
2182: *pbFirst = 1;
2183: }
2184:
2185:
2186: /*
2187: ** This macro is used by various functions that merge doclists. The two
2188: ** arguments are 64-bit docid values. If the value of the stack variable
2189: ** bDescDoclist is 0 when this macro is invoked, then it returns (i1-i2).
2190: ** Otherwise, (i2-i1).
2191: **
2192: ** Using this makes it easier to write code that can merge doclists that are
2193: ** sorted in either ascending or descending order.
2194: */
2195: #define DOCID_CMP(i1, i2) ((bDescDoclist?-1:1) * (i1-i2))
2196:
2197: /*
2198: ** This function does an "OR" merge of two doclists (output contains all
2199: ** positions contained in either argument doclist). If the docids in the
2200: ** input doclists are sorted in ascending order, parameter bDescDoclist
2201: ** should be false. If they are sorted in ascending order, it should be
2202: ** passed a non-zero value.
2203: **
2204: ** If no error occurs, *paOut is set to point at an sqlite3_malloc'd buffer
2205: ** containing the output doclist and SQLITE_OK is returned. In this case
2206: ** *pnOut is set to the number of bytes in the output doclist.
2207: **
2208: ** If an error occurs, an SQLite error code is returned. The output values
2209: ** are undefined in this case.
2210: */
2211: static int fts3DoclistOrMerge(
2212: int bDescDoclist, /* True if arguments are desc */
2213: char *a1, int n1, /* First doclist */
2214: char *a2, int n2, /* Second doclist */
2215: char **paOut, int *pnOut /* OUT: Malloc'd doclist */
2216: ){
2217: sqlite3_int64 i1 = 0;
2218: sqlite3_int64 i2 = 0;
2219: sqlite3_int64 iPrev = 0;
2220: char *pEnd1 = &a1[n1];
2221: char *pEnd2 = &a2[n2];
2222: char *p1 = a1;
2223: char *p2 = a2;
2224: char *p;
2225: char *aOut;
2226: int bFirstOut = 0;
2227:
2228: *paOut = 0;
2229: *pnOut = 0;
2230:
2231: /* Allocate space for the output. Both the input and output doclists
2232: ** are delta encoded. If they are in ascending order (bDescDoclist==0),
2233: ** then the first docid in each list is simply encoded as a varint. For
2234: ** each subsequent docid, the varint stored is the difference between the
2235: ** current and previous docid (a positive number - since the list is in
2236: ** ascending order).
2237: **
2238: ** The first docid written to the output is therefore encoded using the
2239: ** same number of bytes as it is in whichever of the input lists it is
2240: ** read from. And each subsequent docid read from the same input list
2241: ** consumes either the same or less bytes as it did in the input (since
2242: ** the difference between it and the previous value in the output must
2243: ** be a positive value less than or equal to the delta value read from
2244: ** the input list). The same argument applies to all but the first docid
2245: ** read from the 'other' list. And to the contents of all position lists
2246: ** that will be copied and merged from the input to the output.
2247: **
2248: ** However, if the first docid copied to the output is a negative number,
2249: ** then the encoding of the first docid from the 'other' input list may
2250: ** be larger in the output than it was in the input (since the delta value
2251: ** may be a larger positive integer than the actual docid).
2252: **
2253: ** The space required to store the output is therefore the sum of the
2254: ** sizes of the two inputs, plus enough space for exactly one of the input
2255: ** docids to grow.
2256: **
2257: ** A symetric argument may be made if the doclists are in descending
2258: ** order.
2259: */
2260: aOut = sqlite3_malloc(n1+n2+FTS3_VARINT_MAX-1);
2261: if( !aOut ) return SQLITE_NOMEM;
2262:
2263: p = aOut;
2264: fts3GetDeltaVarint3(&p1, pEnd1, 0, &i1);
2265: fts3GetDeltaVarint3(&p2, pEnd2, 0, &i2);
2266: while( p1 || p2 ){
2267: sqlite3_int64 iDiff = DOCID_CMP(i1, i2);
2268:
2269: if( p2 && p1 && iDiff==0 ){
2270: fts3PutDeltaVarint3(&p, bDescDoclist, &iPrev, &bFirstOut, i1);
2271: fts3PoslistMerge(&p, &p1, &p2);
2272: fts3GetDeltaVarint3(&p1, pEnd1, bDescDoclist, &i1);
2273: fts3GetDeltaVarint3(&p2, pEnd2, bDescDoclist, &i2);
2274: }else if( !p2 || (p1 && iDiff<0) ){
2275: fts3PutDeltaVarint3(&p, bDescDoclist, &iPrev, &bFirstOut, i1);
2276: fts3PoslistCopy(&p, &p1);
2277: fts3GetDeltaVarint3(&p1, pEnd1, bDescDoclist, &i1);
2278: }else{
2279: fts3PutDeltaVarint3(&p, bDescDoclist, &iPrev, &bFirstOut, i2);
2280: fts3PoslistCopy(&p, &p2);
2281: fts3GetDeltaVarint3(&p2, pEnd2, bDescDoclist, &i2);
2282: }
2283: }
2284:
2285: *paOut = aOut;
2286: *pnOut = (p-aOut);
2287: assert( *pnOut<=n1+n2+FTS3_VARINT_MAX-1 );
2288: return SQLITE_OK;
2289: }
2290:
2291: /*
2292: ** This function does a "phrase" merge of two doclists. In a phrase merge,
2293: ** the output contains a copy of each position from the right-hand input
2294: ** doclist for which there is a position in the left-hand input doclist
2295: ** exactly nDist tokens before it.
2296: **
2297: ** If the docids in the input doclists are sorted in ascending order,
2298: ** parameter bDescDoclist should be false. If they are sorted in ascending
2299: ** order, it should be passed a non-zero value.
2300: **
2301: ** The right-hand input doclist is overwritten by this function.
2302: */
2303: static void fts3DoclistPhraseMerge(
2304: int bDescDoclist, /* True if arguments are desc */
2305: int nDist, /* Distance from left to right (1=adjacent) */
2306: char *aLeft, int nLeft, /* Left doclist */
2307: char *aRight, int *pnRight /* IN/OUT: Right/output doclist */
2308: ){
2309: sqlite3_int64 i1 = 0;
2310: sqlite3_int64 i2 = 0;
2311: sqlite3_int64 iPrev = 0;
2312: char *pEnd1 = &aLeft[nLeft];
2313: char *pEnd2 = &aRight[*pnRight];
2314: char *p1 = aLeft;
2315: char *p2 = aRight;
2316: char *p;
2317: int bFirstOut = 0;
2318: char *aOut = aRight;
2319:
2320: assert( nDist>0 );
2321:
2322: p = aOut;
2323: fts3GetDeltaVarint3(&p1, pEnd1, 0, &i1);
2324: fts3GetDeltaVarint3(&p2, pEnd2, 0, &i2);
2325:
2326: while( p1 && p2 ){
2327: sqlite3_int64 iDiff = DOCID_CMP(i1, i2);
2328: if( iDiff==0 ){
2329: char *pSave = p;
2330: sqlite3_int64 iPrevSave = iPrev;
2331: int bFirstOutSave = bFirstOut;
2332:
2333: fts3PutDeltaVarint3(&p, bDescDoclist, &iPrev, &bFirstOut, i1);
2334: if( 0==fts3PoslistPhraseMerge(&p, nDist, 0, 1, &p1, &p2) ){
2335: p = pSave;
2336: iPrev = iPrevSave;
2337: bFirstOut = bFirstOutSave;
2338: }
2339: fts3GetDeltaVarint3(&p1, pEnd1, bDescDoclist, &i1);
2340: fts3GetDeltaVarint3(&p2, pEnd2, bDescDoclist, &i2);
2341: }else if( iDiff<0 ){
2342: fts3PoslistCopy(0, &p1);
2343: fts3GetDeltaVarint3(&p1, pEnd1, bDescDoclist, &i1);
2344: }else{
2345: fts3PoslistCopy(0, &p2);
2346: fts3GetDeltaVarint3(&p2, pEnd2, bDescDoclist, &i2);
2347: }
2348: }
2349:
2350: *pnRight = p - aOut;
2351: }
2352:
2353: /*
2354: ** Argument pList points to a position list nList bytes in size. This
2355: ** function checks to see if the position list contains any entries for
2356: ** a token in position 0 (of any column). If so, it writes argument iDelta
2357: ** to the output buffer pOut, followed by a position list consisting only
2358: ** of the entries from pList at position 0, and terminated by an 0x00 byte.
2359: ** The value returned is the number of bytes written to pOut (if any).
2360: */
2361: int sqlite3Fts3FirstFilter(
2362: sqlite3_int64 iDelta, /* Varint that may be written to pOut */
2363: char *pList, /* Position list (no 0x00 term) */
2364: int nList, /* Size of pList in bytes */
2365: char *pOut /* Write output here */
2366: ){
2367: int nOut = 0;
2368: int bWritten = 0; /* True once iDelta has been written */
2369: char *p = pList;
2370: char *pEnd = &pList[nList];
2371:
2372: if( *p!=0x01 ){
2373: if( *p==0x02 ){
2374: nOut += sqlite3Fts3PutVarint(&pOut[nOut], iDelta);
2375: pOut[nOut++] = 0x02;
2376: bWritten = 1;
2377: }
2378: fts3ColumnlistCopy(0, &p);
2379: }
2380:
2381: while( p<pEnd && *p==0x01 ){
2382: sqlite3_int64 iCol;
2383: p++;
2384: p += sqlite3Fts3GetVarint(p, &iCol);
2385: if( *p==0x02 ){
2386: if( bWritten==0 ){
2387: nOut += sqlite3Fts3PutVarint(&pOut[nOut], iDelta);
2388: bWritten = 1;
2389: }
2390: pOut[nOut++] = 0x01;
2391: nOut += sqlite3Fts3PutVarint(&pOut[nOut], iCol);
2392: pOut[nOut++] = 0x02;
2393: }
2394: fts3ColumnlistCopy(0, &p);
2395: }
2396: if( bWritten ){
2397: pOut[nOut++] = 0x00;
2398: }
2399:
2400: return nOut;
2401: }
2402:
2403:
2404: /*
2405: ** Merge all doclists in the TermSelect.aaOutput[] array into a single
2406: ** doclist stored in TermSelect.aaOutput[0]. If successful, delete all
2407: ** other doclists (except the aaOutput[0] one) and return SQLITE_OK.
2408: **
2409: ** If an OOM error occurs, return SQLITE_NOMEM. In this case it is
2410: ** the responsibility of the caller to free any doclists left in the
2411: ** TermSelect.aaOutput[] array.
2412: */
2413: static int fts3TermSelectFinishMerge(Fts3Table *p, TermSelect *pTS){
2414: char *aOut = 0;
2415: int nOut = 0;
2416: int i;
2417:
2418: /* Loop through the doclists in the aaOutput[] array. Merge them all
2419: ** into a single doclist.
2420: */
2421: for(i=0; i<SizeofArray(pTS->aaOutput); i++){
2422: if( pTS->aaOutput[i] ){
2423: if( !aOut ){
2424: aOut = pTS->aaOutput[i];
2425: nOut = pTS->anOutput[i];
2426: pTS->aaOutput[i] = 0;
2427: }else{
2428: int nNew;
2429: char *aNew;
2430:
2431: int rc = fts3DoclistOrMerge(p->bDescIdx,
2432: pTS->aaOutput[i], pTS->anOutput[i], aOut, nOut, &aNew, &nNew
2433: );
2434: if( rc!=SQLITE_OK ){
2435: sqlite3_free(aOut);
2436: return rc;
2437: }
2438:
2439: sqlite3_free(pTS->aaOutput[i]);
2440: sqlite3_free(aOut);
2441: pTS->aaOutput[i] = 0;
2442: aOut = aNew;
2443: nOut = nNew;
2444: }
2445: }
2446: }
2447:
2448: pTS->aaOutput[0] = aOut;
2449: pTS->anOutput[0] = nOut;
2450: return SQLITE_OK;
2451: }
2452:
2453: /*
2454: ** Merge the doclist aDoclist/nDoclist into the TermSelect object passed
2455: ** as the first argument. The merge is an "OR" merge (see function
2456: ** fts3DoclistOrMerge() for details).
2457: **
2458: ** This function is called with the doclist for each term that matches
2459: ** a queried prefix. It merges all these doclists into one, the doclist
2460: ** for the specified prefix. Since there can be a very large number of
2461: ** doclists to merge, the merging is done pair-wise using the TermSelect
2462: ** object.
2463: **
2464: ** This function returns SQLITE_OK if the merge is successful, or an
2465: ** SQLite error code (SQLITE_NOMEM) if an error occurs.
2466: */
2467: static int fts3TermSelectMerge(
2468: Fts3Table *p, /* FTS table handle */
2469: TermSelect *pTS, /* TermSelect object to merge into */
2470: char *aDoclist, /* Pointer to doclist */
2471: int nDoclist /* Size of aDoclist in bytes */
2472: ){
2473: if( pTS->aaOutput[0]==0 ){
2474: /* If this is the first term selected, copy the doclist to the output
2475: ** buffer using memcpy(). */
2476: pTS->aaOutput[0] = sqlite3_malloc(nDoclist);
2477: pTS->anOutput[0] = nDoclist;
2478: if( pTS->aaOutput[0] ){
2479: memcpy(pTS->aaOutput[0], aDoclist, nDoclist);
2480: }else{
2481: return SQLITE_NOMEM;
2482: }
2483: }else{
2484: char *aMerge = aDoclist;
2485: int nMerge = nDoclist;
2486: int iOut;
2487:
2488: for(iOut=0; iOut<SizeofArray(pTS->aaOutput); iOut++){
2489: if( pTS->aaOutput[iOut]==0 ){
2490: assert( iOut>0 );
2491: pTS->aaOutput[iOut] = aMerge;
2492: pTS->anOutput[iOut] = nMerge;
2493: break;
2494: }else{
2495: char *aNew;
2496: int nNew;
2497:
2498: int rc = fts3DoclistOrMerge(p->bDescIdx, aMerge, nMerge,
2499: pTS->aaOutput[iOut], pTS->anOutput[iOut], &aNew, &nNew
2500: );
2501: if( rc!=SQLITE_OK ){
2502: if( aMerge!=aDoclist ) sqlite3_free(aMerge);
2503: return rc;
2504: }
2505:
2506: if( aMerge!=aDoclist ) sqlite3_free(aMerge);
2507: sqlite3_free(pTS->aaOutput[iOut]);
2508: pTS->aaOutput[iOut] = 0;
2509:
2510: aMerge = aNew;
2511: nMerge = nNew;
2512: if( (iOut+1)==SizeofArray(pTS->aaOutput) ){
2513: pTS->aaOutput[iOut] = aMerge;
2514: pTS->anOutput[iOut] = nMerge;
2515: }
2516: }
2517: }
2518: }
2519: return SQLITE_OK;
2520: }
2521:
2522: /*
2523: ** Append SegReader object pNew to the end of the pCsr->apSegment[] array.
2524: */
2525: static int fts3SegReaderCursorAppend(
2526: Fts3MultiSegReader *pCsr,
2527: Fts3SegReader *pNew
2528: ){
2529: if( (pCsr->nSegment%16)==0 ){
2530: Fts3SegReader **apNew;
2531: int nByte = (pCsr->nSegment + 16)*sizeof(Fts3SegReader*);
2532: apNew = (Fts3SegReader **)sqlite3_realloc(pCsr->apSegment, nByte);
2533: if( !apNew ){
2534: sqlite3Fts3SegReaderFree(pNew);
2535: return SQLITE_NOMEM;
2536: }
2537: pCsr->apSegment = apNew;
2538: }
2539: pCsr->apSegment[pCsr->nSegment++] = pNew;
2540: return SQLITE_OK;
2541: }
2542:
2543: /*
2544: ** Add seg-reader objects to the Fts3MultiSegReader object passed as the
2545: ** 8th argument.
2546: **
2547: ** This function returns SQLITE_OK if successful, or an SQLite error code
2548: ** otherwise.
2549: */
2550: static int fts3SegReaderCursor(
2551: Fts3Table *p, /* FTS3 table handle */
2552: int iIndex, /* Index to search (from 0 to p->nIndex-1) */
2553: int iLevel, /* Level of segments to scan */
2554: const char *zTerm, /* Term to query for */
2555: int nTerm, /* Size of zTerm in bytes */
2556: int isPrefix, /* True for a prefix search */
2557: int isScan, /* True to scan from zTerm to EOF */
2558: Fts3MultiSegReader *pCsr /* Cursor object to populate */
2559: ){
2560: int rc = SQLITE_OK; /* Error code */
2561: sqlite3_stmt *pStmt = 0; /* Statement to iterate through segments */
2562: int rc2; /* Result of sqlite3_reset() */
2563:
2564: /* If iLevel is less than 0 and this is not a scan, include a seg-reader
2565: ** for the pending-terms. If this is a scan, then this call must be being
2566: ** made by an fts4aux module, not an FTS table. In this case calling
2567: ** Fts3SegReaderPending might segfault, as the data structures used by
2568: ** fts4aux are not completely populated. So it's easiest to filter these
2569: ** calls out here. */
2570: if( iLevel<0 && p->aIndex ){
2571: Fts3SegReader *pSeg = 0;
2572: rc = sqlite3Fts3SegReaderPending(p, iIndex, zTerm, nTerm, isPrefix, &pSeg);
2573: if( rc==SQLITE_OK && pSeg ){
2574: rc = fts3SegReaderCursorAppend(pCsr, pSeg);
2575: }
2576: }
2577:
2578: if( iLevel!=FTS3_SEGCURSOR_PENDING ){
2579: if( rc==SQLITE_OK ){
2580: rc = sqlite3Fts3AllSegdirs(p, iIndex, iLevel, &pStmt);
2581: }
2582:
2583: while( rc==SQLITE_OK && SQLITE_ROW==(rc = sqlite3_step(pStmt)) ){
2584: Fts3SegReader *pSeg = 0;
2585:
2586: /* Read the values returned by the SELECT into local variables. */
2587: sqlite3_int64 iStartBlock = sqlite3_column_int64(pStmt, 1);
2588: sqlite3_int64 iLeavesEndBlock = sqlite3_column_int64(pStmt, 2);
2589: sqlite3_int64 iEndBlock = sqlite3_column_int64(pStmt, 3);
2590: int nRoot = sqlite3_column_bytes(pStmt, 4);
2591: char const *zRoot = sqlite3_column_blob(pStmt, 4);
2592:
2593: /* If zTerm is not NULL, and this segment is not stored entirely on its
2594: ** root node, the range of leaves scanned can be reduced. Do this. */
2595: if( iStartBlock && zTerm ){
2596: sqlite3_int64 *pi = (isPrefix ? &iLeavesEndBlock : 0);
2597: rc = fts3SelectLeaf(p, zTerm, nTerm, zRoot, nRoot, &iStartBlock, pi);
2598: if( rc!=SQLITE_OK ) goto finished;
2599: if( isPrefix==0 && isScan==0 ) iLeavesEndBlock = iStartBlock;
2600: }
2601:
2602: rc = sqlite3Fts3SegReaderNew(pCsr->nSegment+1,
2603: iStartBlock, iLeavesEndBlock, iEndBlock, zRoot, nRoot, &pSeg
2604: );
2605: if( rc!=SQLITE_OK ) goto finished;
2606: rc = fts3SegReaderCursorAppend(pCsr, pSeg);
2607: }
2608: }
2609:
2610: finished:
2611: rc2 = sqlite3_reset(pStmt);
2612: if( rc==SQLITE_DONE ) rc = rc2;
2613:
2614: return rc;
2615: }
2616:
2617: /*
2618: ** Set up a cursor object for iterating through a full-text index or a
2619: ** single level therein.
2620: */
2621: int sqlite3Fts3SegReaderCursor(
2622: Fts3Table *p, /* FTS3 table handle */
2623: int iIndex, /* Index to search (from 0 to p->nIndex-1) */
2624: int iLevel, /* Level of segments to scan */
2625: const char *zTerm, /* Term to query for */
2626: int nTerm, /* Size of zTerm in bytes */
2627: int isPrefix, /* True for a prefix search */
2628: int isScan, /* True to scan from zTerm to EOF */
2629: Fts3MultiSegReader *pCsr /* Cursor object to populate */
2630: ){
2631: assert( iIndex>=0 && iIndex<p->nIndex );
2632: assert( iLevel==FTS3_SEGCURSOR_ALL
2633: || iLevel==FTS3_SEGCURSOR_PENDING
2634: || iLevel>=0
2635: );
2636: assert( iLevel<FTS3_SEGDIR_MAXLEVEL );
2637: assert( FTS3_SEGCURSOR_ALL<0 && FTS3_SEGCURSOR_PENDING<0 );
2638: assert( isPrefix==0 || isScan==0 );
2639:
2640: /* "isScan" is only set to true by the ft4aux module, an ordinary
2641: ** full-text tables. */
2642: assert( isScan==0 || p->aIndex==0 );
2643:
2644: memset(pCsr, 0, sizeof(Fts3MultiSegReader));
2645:
2646: return fts3SegReaderCursor(
2647: p, iIndex, iLevel, zTerm, nTerm, isPrefix, isScan, pCsr
2648: );
2649: }
2650:
2651: /*
2652: ** In addition to its current configuration, have the Fts3MultiSegReader
2653: ** passed as the 4th argument also scan the doclist for term zTerm/nTerm.
2654: **
2655: ** SQLITE_OK is returned if no error occurs, otherwise an SQLite error code.
2656: */
2657: static int fts3SegReaderCursorAddZero(
2658: Fts3Table *p, /* FTS virtual table handle */
2659: const char *zTerm, /* Term to scan doclist of */
2660: int nTerm, /* Number of bytes in zTerm */
2661: Fts3MultiSegReader *pCsr /* Fts3MultiSegReader to modify */
2662: ){
2663: return fts3SegReaderCursor(p, 0, FTS3_SEGCURSOR_ALL, zTerm, nTerm, 0, 0,pCsr);
2664: }
2665:
2666: /*
2667: ** Open an Fts3MultiSegReader to scan the doclist for term zTerm/nTerm. Or,
2668: ** if isPrefix is true, to scan the doclist for all terms for which
2669: ** zTerm/nTerm is a prefix. If successful, return SQLITE_OK and write
2670: ** a pointer to the new Fts3MultiSegReader to *ppSegcsr. Otherwise, return
2671: ** an SQLite error code.
2672: **
2673: ** It is the responsibility of the caller to free this object by eventually
2674: ** passing it to fts3SegReaderCursorFree()
2675: **
2676: ** SQLITE_OK is returned if no error occurs, otherwise an SQLite error code.
2677: ** Output parameter *ppSegcsr is set to 0 if an error occurs.
2678: */
2679: static int fts3TermSegReaderCursor(
2680: Fts3Cursor *pCsr, /* Virtual table cursor handle */
2681: const char *zTerm, /* Term to query for */
2682: int nTerm, /* Size of zTerm in bytes */
2683: int isPrefix, /* True for a prefix search */
2684: Fts3MultiSegReader **ppSegcsr /* OUT: Allocated seg-reader cursor */
2685: ){
2686: Fts3MultiSegReader *pSegcsr; /* Object to allocate and return */
2687: int rc = SQLITE_NOMEM; /* Return code */
2688:
2689: pSegcsr = sqlite3_malloc(sizeof(Fts3MultiSegReader));
2690: if( pSegcsr ){
2691: int i;
2692: int bFound = 0; /* True once an index has been found */
2693: Fts3Table *p = (Fts3Table *)pCsr->base.pVtab;
2694:
2695: if( isPrefix ){
2696: for(i=1; bFound==0 && i<p->nIndex; i++){
2697: if( p->aIndex[i].nPrefix==nTerm ){
2698: bFound = 1;
2699: rc = sqlite3Fts3SegReaderCursor(
2700: p, i, FTS3_SEGCURSOR_ALL, zTerm, nTerm, 0, 0, pSegcsr);
2701: pSegcsr->bLookup = 1;
2702: }
2703: }
2704:
2705: for(i=1; bFound==0 && i<p->nIndex; i++){
2706: if( p->aIndex[i].nPrefix==nTerm+1 ){
2707: bFound = 1;
2708: rc = sqlite3Fts3SegReaderCursor(
2709: p, i, FTS3_SEGCURSOR_ALL, zTerm, nTerm, 1, 0, pSegcsr
2710: );
2711: if( rc==SQLITE_OK ){
2712: rc = fts3SegReaderCursorAddZero(p, zTerm, nTerm, pSegcsr);
2713: }
2714: }
2715: }
2716: }
2717:
2718: if( bFound==0 ){
2719: rc = sqlite3Fts3SegReaderCursor(
2720: p, 0, FTS3_SEGCURSOR_ALL, zTerm, nTerm, isPrefix, 0, pSegcsr
2721: );
2722: pSegcsr->bLookup = !isPrefix;
2723: }
2724: }
2725:
2726: *ppSegcsr = pSegcsr;
2727: return rc;
2728: }
2729:
2730: /*
2731: ** Free an Fts3MultiSegReader allocated by fts3TermSegReaderCursor().
2732: */
2733: static void fts3SegReaderCursorFree(Fts3MultiSegReader *pSegcsr){
2734: sqlite3Fts3SegReaderFinish(pSegcsr);
2735: sqlite3_free(pSegcsr);
2736: }
2737:
2738: /*
2739: ** This function retreives the doclist for the specified term (or term
2740: ** prefix) from the database.
2741: */
2742: static int fts3TermSelect(
2743: Fts3Table *p, /* Virtual table handle */
2744: Fts3PhraseToken *pTok, /* Token to query for */
2745: int iColumn, /* Column to query (or -ve for all columns) */
2746: int *pnOut, /* OUT: Size of buffer at *ppOut */
2747: char **ppOut /* OUT: Malloced result buffer */
2748: ){
2749: int rc; /* Return code */
2750: Fts3MultiSegReader *pSegcsr; /* Seg-reader cursor for this term */
2751: TermSelect tsc; /* Object for pair-wise doclist merging */
2752: Fts3SegFilter filter; /* Segment term filter configuration */
2753:
2754: pSegcsr = pTok->pSegcsr;
2755: memset(&tsc, 0, sizeof(TermSelect));
2756:
2757: filter.flags = FTS3_SEGMENT_IGNORE_EMPTY | FTS3_SEGMENT_REQUIRE_POS
2758: | (pTok->isPrefix ? FTS3_SEGMENT_PREFIX : 0)
2759: | (pTok->bFirst ? FTS3_SEGMENT_FIRST : 0)
2760: | (iColumn<p->nColumn ? FTS3_SEGMENT_COLUMN_FILTER : 0);
2761: filter.iCol = iColumn;
2762: filter.zTerm = pTok->z;
2763: filter.nTerm = pTok->n;
2764:
2765: rc = sqlite3Fts3SegReaderStart(p, pSegcsr, &filter);
2766: while( SQLITE_OK==rc
2767: && SQLITE_ROW==(rc = sqlite3Fts3SegReaderStep(p, pSegcsr))
2768: ){
2769: rc = fts3TermSelectMerge(p, &tsc, pSegcsr->aDoclist, pSegcsr->nDoclist);
2770: }
2771:
2772: if( rc==SQLITE_OK ){
2773: rc = fts3TermSelectFinishMerge(p, &tsc);
2774: }
2775: if( rc==SQLITE_OK ){
2776: *ppOut = tsc.aaOutput[0];
2777: *pnOut = tsc.anOutput[0];
2778: }else{
2779: int i;
2780: for(i=0; i<SizeofArray(tsc.aaOutput); i++){
2781: sqlite3_free(tsc.aaOutput[i]);
2782: }
2783: }
2784:
2785: fts3SegReaderCursorFree(pSegcsr);
2786: pTok->pSegcsr = 0;
2787: return rc;
2788: }
2789:
2790: /*
2791: ** This function counts the total number of docids in the doclist stored
2792: ** in buffer aList[], size nList bytes.
2793: **
2794: ** If the isPoslist argument is true, then it is assumed that the doclist
2795: ** contains a position-list following each docid. Otherwise, it is assumed
2796: ** that the doclist is simply a list of docids stored as delta encoded
2797: ** varints.
2798: */
2799: static int fts3DoclistCountDocids(char *aList, int nList){
2800: int nDoc = 0; /* Return value */
2801: if( aList ){
2802: char *aEnd = &aList[nList]; /* Pointer to one byte after EOF */
2803: char *p = aList; /* Cursor */
2804: while( p<aEnd ){
2805: nDoc++;
2806: while( (*p++)&0x80 ); /* Skip docid varint */
2807: fts3PoslistCopy(0, &p); /* Skip over position list */
2808: }
2809: }
2810:
2811: return nDoc;
2812: }
2813:
2814: /*
2815: ** Advance the cursor to the next row in the %_content table that
2816: ** matches the search criteria. For a MATCH search, this will be
2817: ** the next row that matches. For a full-table scan, this will be
2818: ** simply the next row in the %_content table. For a docid lookup,
2819: ** this routine simply sets the EOF flag.
2820: **
2821: ** Return SQLITE_OK if nothing goes wrong. SQLITE_OK is returned
2822: ** even if we reach end-of-file. The fts3EofMethod() will be called
2823: ** subsequently to determine whether or not an EOF was hit.
2824: */
2825: static int fts3NextMethod(sqlite3_vtab_cursor *pCursor){
2826: int rc;
2827: Fts3Cursor *pCsr = (Fts3Cursor *)pCursor;
2828: if( pCsr->eSearch==FTS3_DOCID_SEARCH || pCsr->eSearch==FTS3_FULLSCAN_SEARCH ){
2829: if( SQLITE_ROW!=sqlite3_step(pCsr->pStmt) ){
2830: pCsr->isEof = 1;
2831: rc = sqlite3_reset(pCsr->pStmt);
2832: }else{
2833: pCsr->iPrevId = sqlite3_column_int64(pCsr->pStmt, 0);
2834: rc = SQLITE_OK;
2835: }
2836: }else{
2837: rc = fts3EvalNext((Fts3Cursor *)pCursor);
2838: }
2839: assert( ((Fts3Table *)pCsr->base.pVtab)->pSegments==0 );
2840: return rc;
2841: }
2842:
2843: /*
2844: ** This is the xFilter interface for the virtual table. See
2845: ** the virtual table xFilter method documentation for additional
2846: ** information.
2847: **
2848: ** If idxNum==FTS3_FULLSCAN_SEARCH then do a full table scan against
2849: ** the %_content table.
2850: **
2851: ** If idxNum==FTS3_DOCID_SEARCH then do a docid lookup for a single entry
2852: ** in the %_content table.
2853: **
2854: ** If idxNum>=FTS3_FULLTEXT_SEARCH then use the full text index. The
2855: ** column on the left-hand side of the MATCH operator is column
2856: ** number idxNum-FTS3_FULLTEXT_SEARCH, 0 indexed. argv[0] is the right-hand
2857: ** side of the MATCH operator.
2858: */
2859: static int fts3FilterMethod(
2860: sqlite3_vtab_cursor *pCursor, /* The cursor used for this query */
2861: int idxNum, /* Strategy index */
2862: const char *idxStr, /* Unused */
2863: int nVal, /* Number of elements in apVal */
2864: sqlite3_value **apVal /* Arguments for the indexing scheme */
2865: ){
2866: int rc;
2867: char *zSql; /* SQL statement used to access %_content */
2868: Fts3Table *p = (Fts3Table *)pCursor->pVtab;
2869: Fts3Cursor *pCsr = (Fts3Cursor *)pCursor;
2870:
2871: UNUSED_PARAMETER(idxStr);
2872: UNUSED_PARAMETER(nVal);
2873:
2874: assert( idxNum>=0 && idxNum<=(FTS3_FULLTEXT_SEARCH+p->nColumn) );
2875: assert( nVal==0 || nVal==1 );
2876: assert( (nVal==0)==(idxNum==FTS3_FULLSCAN_SEARCH) );
2877: assert( p->pSegments==0 );
2878:
2879: /* In case the cursor has been used before, clear it now. */
2880: sqlite3_finalize(pCsr->pStmt);
2881: sqlite3_free(pCsr->aDoclist);
2882: sqlite3Fts3ExprFree(pCsr->pExpr);
2883: memset(&pCursor[1], 0, sizeof(Fts3Cursor)-sizeof(sqlite3_vtab_cursor));
2884:
2885: if( idxStr ){
2886: pCsr->bDesc = (idxStr[0]=='D');
2887: }else{
2888: pCsr->bDesc = p->bDescIdx;
2889: }
2890: pCsr->eSearch = (i16)idxNum;
2891:
2892: if( idxNum!=FTS3_DOCID_SEARCH && idxNum!=FTS3_FULLSCAN_SEARCH ){
2893: int iCol = idxNum-FTS3_FULLTEXT_SEARCH;
2894: const char *zQuery = (const char *)sqlite3_value_text(apVal[0]);
2895:
2896: if( zQuery==0 && sqlite3_value_type(apVal[0])!=SQLITE_NULL ){
2897: return SQLITE_NOMEM;
2898: }
2899:
2900: rc = sqlite3Fts3ExprParse(p->pTokenizer, p->azColumn, p->bHasStat,
2901: p->nColumn, iCol, zQuery, -1, &pCsr->pExpr
2902: );
2903: if( rc!=SQLITE_OK ){
2904: if( rc==SQLITE_ERROR ){
2905: static const char *zErr = "malformed MATCH expression: [%s]";
2906: p->base.zErrMsg = sqlite3_mprintf(zErr, zQuery);
2907: }
2908: return rc;
2909: }
2910:
2911: rc = sqlite3Fts3ReadLock(p);
2912: if( rc!=SQLITE_OK ) return rc;
2913:
2914: rc = fts3EvalStart(pCsr);
2915:
2916: sqlite3Fts3SegmentsClose(p);
2917: if( rc!=SQLITE_OK ) return rc;
2918: pCsr->pNextId = pCsr->aDoclist;
2919: pCsr->iPrevId = 0;
2920: }
2921:
2922: /* Compile a SELECT statement for this cursor. For a full-table-scan, the
2923: ** statement loops through all rows of the %_content table. For a
2924: ** full-text query or docid lookup, the statement retrieves a single
2925: ** row by docid.
2926: */
2927: if( idxNum==FTS3_FULLSCAN_SEARCH ){
2928: zSql = sqlite3_mprintf(
2929: "SELECT %s ORDER BY rowid %s",
2930: p->zReadExprlist, (pCsr->bDesc ? "DESC" : "ASC")
2931: );
2932: if( zSql ){
2933: rc = sqlite3_prepare_v2(p->db, zSql, -1, &pCsr->pStmt, 0);
2934: sqlite3_free(zSql);
2935: }else{
2936: rc = SQLITE_NOMEM;
2937: }
2938: }else if( idxNum==FTS3_DOCID_SEARCH ){
2939: rc = fts3CursorSeekStmt(pCsr, &pCsr->pStmt);
2940: if( rc==SQLITE_OK ){
2941: rc = sqlite3_bind_value(pCsr->pStmt, 1, apVal[0]);
2942: }
2943: }
2944: if( rc!=SQLITE_OK ) return rc;
2945:
2946: return fts3NextMethod(pCursor);
2947: }
2948:
2949: /*
2950: ** This is the xEof method of the virtual table. SQLite calls this
2951: ** routine to find out if it has reached the end of a result set.
2952: */
2953: static int fts3EofMethod(sqlite3_vtab_cursor *pCursor){
2954: return ((Fts3Cursor *)pCursor)->isEof;
2955: }
2956:
2957: /*
2958: ** This is the xRowid method. The SQLite core calls this routine to
2959: ** retrieve the rowid for the current row of the result set. fts3
2960: ** exposes %_content.docid as the rowid for the virtual table. The
2961: ** rowid should be written to *pRowid.
2962: */
2963: static int fts3RowidMethod(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){
2964: Fts3Cursor *pCsr = (Fts3Cursor *) pCursor;
2965: *pRowid = pCsr->iPrevId;
2966: return SQLITE_OK;
2967: }
2968:
2969: /*
2970: ** This is the xColumn method, called by SQLite to request a value from
2971: ** the row that the supplied cursor currently points to.
2972: */
2973: static int fts3ColumnMethod(
2974: sqlite3_vtab_cursor *pCursor, /* Cursor to retrieve value from */
2975: sqlite3_context *pContext, /* Context for sqlite3_result_xxx() calls */
2976: int iCol /* Index of column to read value from */
2977: ){
2978: int rc = SQLITE_OK; /* Return Code */
2979: Fts3Cursor *pCsr = (Fts3Cursor *) pCursor;
2980: Fts3Table *p = (Fts3Table *)pCursor->pVtab;
2981:
2982: /* The column value supplied by SQLite must be in range. */
2983: assert( iCol>=0 && iCol<=p->nColumn+1 );
2984:
2985: if( iCol==p->nColumn+1 ){
2986: /* This call is a request for the "docid" column. Since "docid" is an
2987: ** alias for "rowid", use the xRowid() method to obtain the value.
2988: */
2989: sqlite3_result_int64(pContext, pCsr->iPrevId);
2990: }else if( iCol==p->nColumn ){
2991: /* The extra column whose name is the same as the table.
2992: ** Return a blob which is a pointer to the cursor.
2993: */
2994: sqlite3_result_blob(pContext, &pCsr, sizeof(pCsr), SQLITE_TRANSIENT);
2995: }else{
2996: rc = fts3CursorSeek(0, pCsr);
2997: if( rc==SQLITE_OK && sqlite3_data_count(pCsr->pStmt)>(iCol+1) ){
2998: sqlite3_result_value(pContext, sqlite3_column_value(pCsr->pStmt, iCol+1));
2999: }
3000: }
3001:
3002: assert( ((Fts3Table *)pCsr->base.pVtab)->pSegments==0 );
3003: return rc;
3004: }
3005:
3006: /*
3007: ** This function is the implementation of the xUpdate callback used by
3008: ** FTS3 virtual tables. It is invoked by SQLite each time a row is to be
3009: ** inserted, updated or deleted.
3010: */
3011: static int fts3UpdateMethod(
3012: sqlite3_vtab *pVtab, /* Virtual table handle */
3013: int nArg, /* Size of argument array */
3014: sqlite3_value **apVal, /* Array of arguments */
3015: sqlite_int64 *pRowid /* OUT: The affected (or effected) rowid */
3016: ){
3017: return sqlite3Fts3UpdateMethod(pVtab, nArg, apVal, pRowid);
3018: }
3019:
3020: /*
3021: ** Implementation of xSync() method. Flush the contents of the pending-terms
3022: ** hash-table to the database.
3023: */
3024: static int fts3SyncMethod(sqlite3_vtab *pVtab){
3025: int rc = sqlite3Fts3PendingTermsFlush((Fts3Table *)pVtab);
3026: sqlite3Fts3SegmentsClose((Fts3Table *)pVtab);
3027: return rc;
3028: }
3029:
3030: /*
3031: ** Implementation of xBegin() method. This is a no-op.
3032: */
3033: static int fts3BeginMethod(sqlite3_vtab *pVtab){
3034: TESTONLY( Fts3Table *p = (Fts3Table*)pVtab );
3035: UNUSED_PARAMETER(pVtab);
3036: assert( p->pSegments==0 );
3037: assert( p->nPendingData==0 );
3038: assert( p->inTransaction!=1 );
3039: TESTONLY( p->inTransaction = 1 );
3040: TESTONLY( p->mxSavepoint = -1; );
3041: return SQLITE_OK;
3042: }
3043:
3044: /*
3045: ** Implementation of xCommit() method. This is a no-op. The contents of
3046: ** the pending-terms hash-table have already been flushed into the database
3047: ** by fts3SyncMethod().
3048: */
3049: static int fts3CommitMethod(sqlite3_vtab *pVtab){
3050: TESTONLY( Fts3Table *p = (Fts3Table*)pVtab );
3051: UNUSED_PARAMETER(pVtab);
3052: assert( p->nPendingData==0 );
3053: assert( p->inTransaction!=0 );
3054: assert( p->pSegments==0 );
3055: TESTONLY( p->inTransaction = 0 );
3056: TESTONLY( p->mxSavepoint = -1; );
3057: return SQLITE_OK;
3058: }
3059:
3060: /*
3061: ** Implementation of xRollback(). Discard the contents of the pending-terms
3062: ** hash-table. Any changes made to the database are reverted by SQLite.
3063: */
3064: static int fts3RollbackMethod(sqlite3_vtab *pVtab){
3065: Fts3Table *p = (Fts3Table*)pVtab;
3066: sqlite3Fts3PendingTermsClear(p);
3067: assert( p->inTransaction!=0 );
3068: TESTONLY( p->inTransaction = 0 );
3069: TESTONLY( p->mxSavepoint = -1; );
3070: return SQLITE_OK;
3071: }
3072:
3073: /*
3074: ** When called, *ppPoslist must point to the byte immediately following the
3075: ** end of a position-list. i.e. ( (*ppPoslist)[-1]==POS_END ). This function
3076: ** moves *ppPoslist so that it instead points to the first byte of the
3077: ** same position list.
3078: */
3079: static void fts3ReversePoslist(char *pStart, char **ppPoslist){
3080: char *p = &(*ppPoslist)[-2];
3081: char c = 0;
3082:
3083: while( p>pStart && (c=*p--)==0 );
3084: while( p>pStart && (*p & 0x80) | c ){
3085: c = *p--;
3086: }
3087: if( p>pStart ){ p = &p[2]; }
3088: while( *p++&0x80 );
3089: *ppPoslist = p;
3090: }
3091:
3092: /*
3093: ** Helper function used by the implementation of the overloaded snippet(),
3094: ** offsets() and optimize() SQL functions.
3095: **
3096: ** If the value passed as the third argument is a blob of size
3097: ** sizeof(Fts3Cursor*), then the blob contents are copied to the
3098: ** output variable *ppCsr and SQLITE_OK is returned. Otherwise, an error
3099: ** message is written to context pContext and SQLITE_ERROR returned. The
3100: ** string passed via zFunc is used as part of the error message.
3101: */
3102: static int fts3FunctionArg(
3103: sqlite3_context *pContext, /* SQL function call context */
3104: const char *zFunc, /* Function name */
3105: sqlite3_value *pVal, /* argv[0] passed to function */
3106: Fts3Cursor **ppCsr /* OUT: Store cursor handle here */
3107: ){
3108: Fts3Cursor *pRet;
3109: if( sqlite3_value_type(pVal)!=SQLITE_BLOB
3110: || sqlite3_value_bytes(pVal)!=sizeof(Fts3Cursor *)
3111: ){
3112: char *zErr = sqlite3_mprintf("illegal first argument to %s", zFunc);
3113: sqlite3_result_error(pContext, zErr, -1);
3114: sqlite3_free(zErr);
3115: return SQLITE_ERROR;
3116: }
3117: memcpy(&pRet, sqlite3_value_blob(pVal), sizeof(Fts3Cursor *));
3118: *ppCsr = pRet;
3119: return SQLITE_OK;
3120: }
3121:
3122: /*
3123: ** Implementation of the snippet() function for FTS3
3124: */
3125: static void fts3SnippetFunc(
3126: sqlite3_context *pContext, /* SQLite function call context */
3127: int nVal, /* Size of apVal[] array */
3128: sqlite3_value **apVal /* Array of arguments */
3129: ){
3130: Fts3Cursor *pCsr; /* Cursor handle passed through apVal[0] */
3131: const char *zStart = "<b>";
3132: const char *zEnd = "</b>";
3133: const char *zEllipsis = "<b>...</b>";
3134: int iCol = -1;
3135: int nToken = 15; /* Default number of tokens in snippet */
3136:
3137: /* There must be at least one argument passed to this function (otherwise
3138: ** the non-overloaded version would have been called instead of this one).
3139: */
3140: assert( nVal>=1 );
3141:
3142: if( nVal>6 ){
3143: sqlite3_result_error(pContext,
3144: "wrong number of arguments to function snippet()", -1);
3145: return;
3146: }
3147: if( fts3FunctionArg(pContext, "snippet", apVal[0], &pCsr) ) return;
3148:
3149: switch( nVal ){
3150: case 6: nToken = sqlite3_value_int(apVal[5]);
3151: case 5: iCol = sqlite3_value_int(apVal[4]);
3152: case 4: zEllipsis = (const char*)sqlite3_value_text(apVal[3]);
3153: case 3: zEnd = (const char*)sqlite3_value_text(apVal[2]);
3154: case 2: zStart = (const char*)sqlite3_value_text(apVal[1]);
3155: }
3156: if( !zEllipsis || !zEnd || !zStart ){
3157: sqlite3_result_error_nomem(pContext);
3158: }else if( SQLITE_OK==fts3CursorSeek(pContext, pCsr) ){
3159: sqlite3Fts3Snippet(pContext, pCsr, zStart, zEnd, zEllipsis, iCol, nToken);
3160: }
3161: }
3162:
3163: /*
3164: ** Implementation of the offsets() function for FTS3
3165: */
3166: static void fts3OffsetsFunc(
3167: sqlite3_context *pContext, /* SQLite function call context */
3168: int nVal, /* Size of argument array */
3169: sqlite3_value **apVal /* Array of arguments */
3170: ){
3171: Fts3Cursor *pCsr; /* Cursor handle passed through apVal[0] */
3172:
3173: UNUSED_PARAMETER(nVal);
3174:
3175: assert( nVal==1 );
3176: if( fts3FunctionArg(pContext, "offsets", apVal[0], &pCsr) ) return;
3177: assert( pCsr );
3178: if( SQLITE_OK==fts3CursorSeek(pContext, pCsr) ){
3179: sqlite3Fts3Offsets(pContext, pCsr);
3180: }
3181: }
3182:
3183: /*
3184: ** Implementation of the special optimize() function for FTS3. This
3185: ** function merges all segments in the database to a single segment.
3186: ** Example usage is:
3187: **
3188: ** SELECT optimize(t) FROM t LIMIT 1;
3189: **
3190: ** where 't' is the name of an FTS3 table.
3191: */
3192: static void fts3OptimizeFunc(
3193: sqlite3_context *pContext, /* SQLite function call context */
3194: int nVal, /* Size of argument array */
3195: sqlite3_value **apVal /* Array of arguments */
3196: ){
3197: int rc; /* Return code */
3198: Fts3Table *p; /* Virtual table handle */
3199: Fts3Cursor *pCursor; /* Cursor handle passed through apVal[0] */
3200:
3201: UNUSED_PARAMETER(nVal);
3202:
3203: assert( nVal==1 );
3204: if( fts3FunctionArg(pContext, "optimize", apVal[0], &pCursor) ) return;
3205: p = (Fts3Table *)pCursor->base.pVtab;
3206: assert( p );
3207:
3208: rc = sqlite3Fts3Optimize(p);
3209:
3210: switch( rc ){
3211: case SQLITE_OK:
3212: sqlite3_result_text(pContext, "Index optimized", -1, SQLITE_STATIC);
3213: break;
3214: case SQLITE_DONE:
3215: sqlite3_result_text(pContext, "Index already optimal", -1, SQLITE_STATIC);
3216: break;
3217: default:
3218: sqlite3_result_error_code(pContext, rc);
3219: break;
3220: }
3221: }
3222:
3223: /*
3224: ** Implementation of the matchinfo() function for FTS3
3225: */
3226: static void fts3MatchinfoFunc(
3227: sqlite3_context *pContext, /* SQLite function call context */
3228: int nVal, /* Size of argument array */
3229: sqlite3_value **apVal /* Array of arguments */
3230: ){
3231: Fts3Cursor *pCsr; /* Cursor handle passed through apVal[0] */
3232: assert( nVal==1 || nVal==2 );
3233: if( SQLITE_OK==fts3FunctionArg(pContext, "matchinfo", apVal[0], &pCsr) ){
3234: const char *zArg = 0;
3235: if( nVal>1 ){
3236: zArg = (const char *)sqlite3_value_text(apVal[1]);
3237: }
3238: sqlite3Fts3Matchinfo(pContext, pCsr, zArg);
3239: }
3240: }
3241:
3242: /*
3243: ** This routine implements the xFindFunction method for the FTS3
3244: ** virtual table.
3245: */
3246: static int fts3FindFunctionMethod(
3247: sqlite3_vtab *pVtab, /* Virtual table handle */
3248: int nArg, /* Number of SQL function arguments */
3249: const char *zName, /* Name of SQL function */
3250: void (**pxFunc)(sqlite3_context*,int,sqlite3_value**), /* OUT: Result */
3251: void **ppArg /* Unused */
3252: ){
3253: struct Overloaded {
3254: const char *zName;
3255: void (*xFunc)(sqlite3_context*,int,sqlite3_value**);
3256: } aOverload[] = {
3257: { "snippet", fts3SnippetFunc },
3258: { "offsets", fts3OffsetsFunc },
3259: { "optimize", fts3OptimizeFunc },
3260: { "matchinfo", fts3MatchinfoFunc },
3261: };
3262: int i; /* Iterator variable */
3263:
3264: UNUSED_PARAMETER(pVtab);
3265: UNUSED_PARAMETER(nArg);
3266: UNUSED_PARAMETER(ppArg);
3267:
3268: for(i=0; i<SizeofArray(aOverload); i++){
3269: if( strcmp(zName, aOverload[i].zName)==0 ){
3270: *pxFunc = aOverload[i].xFunc;
3271: return 1;
3272: }
3273: }
3274:
3275: /* No function of the specified name was found. Return 0. */
3276: return 0;
3277: }
3278:
3279: /*
3280: ** Implementation of FTS3 xRename method. Rename an fts3 table.
3281: */
3282: static int fts3RenameMethod(
3283: sqlite3_vtab *pVtab, /* Virtual table handle */
3284: const char *zName /* New name of table */
3285: ){
3286: Fts3Table *p = (Fts3Table *)pVtab;
3287: sqlite3 *db = p->db; /* Database connection */
3288: int rc; /* Return Code */
3289:
3290: /* As it happens, the pending terms table is always empty here. This is
3291: ** because an "ALTER TABLE RENAME TABLE" statement inside a transaction
3292: ** always opens a savepoint transaction. And the xSavepoint() method
3293: ** flushes the pending terms table. But leave the (no-op) call to
3294: ** PendingTermsFlush() in in case that changes.
3295: */
3296: assert( p->nPendingData==0 );
3297: rc = sqlite3Fts3PendingTermsFlush(p);
3298:
3299: if( p->zContentTbl==0 ){
3300: fts3DbExec(&rc, db,
3301: "ALTER TABLE %Q.'%q_content' RENAME TO '%q_content';",
3302: p->zDb, p->zName, zName
3303: );
3304: }
3305:
3306: if( p->bHasDocsize ){
3307: fts3DbExec(&rc, db,
3308: "ALTER TABLE %Q.'%q_docsize' RENAME TO '%q_docsize';",
3309: p->zDb, p->zName, zName
3310: );
3311: }
3312: if( p->bHasStat ){
3313: fts3DbExec(&rc, db,
3314: "ALTER TABLE %Q.'%q_stat' RENAME TO '%q_stat';",
3315: p->zDb, p->zName, zName
3316: );
3317: }
3318: fts3DbExec(&rc, db,
3319: "ALTER TABLE %Q.'%q_segments' RENAME TO '%q_segments';",
3320: p->zDb, p->zName, zName
3321: );
3322: fts3DbExec(&rc, db,
3323: "ALTER TABLE %Q.'%q_segdir' RENAME TO '%q_segdir';",
3324: p->zDb, p->zName, zName
3325: );
3326: return rc;
3327: }
3328:
3329: /*
3330: ** The xSavepoint() method.
3331: **
3332: ** Flush the contents of the pending-terms table to disk.
3333: */
3334: static int fts3SavepointMethod(sqlite3_vtab *pVtab, int iSavepoint){
3335: UNUSED_PARAMETER(iSavepoint);
3336: assert( ((Fts3Table *)pVtab)->inTransaction );
3337: assert( ((Fts3Table *)pVtab)->mxSavepoint < iSavepoint );
3338: TESTONLY( ((Fts3Table *)pVtab)->mxSavepoint = iSavepoint );
3339: return fts3SyncMethod(pVtab);
3340: }
3341:
3342: /*
3343: ** The xRelease() method.
3344: **
3345: ** This is a no-op.
3346: */
3347: static int fts3ReleaseMethod(sqlite3_vtab *pVtab, int iSavepoint){
3348: TESTONLY( Fts3Table *p = (Fts3Table*)pVtab );
3349: UNUSED_PARAMETER(iSavepoint);
3350: UNUSED_PARAMETER(pVtab);
3351: assert( p->inTransaction );
3352: assert( p->mxSavepoint >= iSavepoint );
3353: TESTONLY( p->mxSavepoint = iSavepoint-1 );
3354: return SQLITE_OK;
3355: }
3356:
3357: /*
3358: ** The xRollbackTo() method.
3359: **
3360: ** Discard the contents of the pending terms table.
3361: */
3362: static int fts3RollbackToMethod(sqlite3_vtab *pVtab, int iSavepoint){
3363: Fts3Table *p = (Fts3Table*)pVtab;
3364: UNUSED_PARAMETER(iSavepoint);
3365: assert( p->inTransaction );
3366: assert( p->mxSavepoint >= iSavepoint );
3367: TESTONLY( p->mxSavepoint = iSavepoint );
3368: sqlite3Fts3PendingTermsClear(p);
3369: return SQLITE_OK;
3370: }
3371:
3372: static const sqlite3_module fts3Module = {
3373: /* iVersion */ 2,
3374: /* xCreate */ fts3CreateMethod,
3375: /* xConnect */ fts3ConnectMethod,
3376: /* xBestIndex */ fts3BestIndexMethod,
3377: /* xDisconnect */ fts3DisconnectMethod,
3378: /* xDestroy */ fts3DestroyMethod,
3379: /* xOpen */ fts3OpenMethod,
3380: /* xClose */ fts3CloseMethod,
3381: /* xFilter */ fts3FilterMethod,
3382: /* xNext */ fts3NextMethod,
3383: /* xEof */ fts3EofMethod,
3384: /* xColumn */ fts3ColumnMethod,
3385: /* xRowid */ fts3RowidMethod,
3386: /* xUpdate */ fts3UpdateMethod,
3387: /* xBegin */ fts3BeginMethod,
3388: /* xSync */ fts3SyncMethod,
3389: /* xCommit */ fts3CommitMethod,
3390: /* xRollback */ fts3RollbackMethod,
3391: /* xFindFunction */ fts3FindFunctionMethod,
3392: /* xRename */ fts3RenameMethod,
3393: /* xSavepoint */ fts3SavepointMethod,
3394: /* xRelease */ fts3ReleaseMethod,
3395: /* xRollbackTo */ fts3RollbackToMethod,
3396: };
3397:
3398: /*
3399: ** This function is registered as the module destructor (called when an
3400: ** FTS3 enabled database connection is closed). It frees the memory
3401: ** allocated for the tokenizer hash table.
3402: */
3403: static void hashDestroy(void *p){
3404: Fts3Hash *pHash = (Fts3Hash *)p;
3405: sqlite3Fts3HashClear(pHash);
3406: sqlite3_free(pHash);
3407: }
3408:
3409: /*
3410: ** The fts3 built-in tokenizers - "simple", "porter" and "icu"- are
3411: ** implemented in files fts3_tokenizer1.c, fts3_porter.c and fts3_icu.c
3412: ** respectively. The following three forward declarations are for functions
3413: ** declared in these files used to retrieve the respective implementations.
3414: **
3415: ** Calling sqlite3Fts3SimpleTokenizerModule() sets the value pointed
3416: ** to by the argument to point to the "simple" tokenizer implementation.
3417: ** And so on.
3418: */
3419: void sqlite3Fts3SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
3420: void sqlite3Fts3PorterTokenizerModule(sqlite3_tokenizer_module const**ppModule);
3421: #ifdef SQLITE_ENABLE_ICU
3422: void sqlite3Fts3IcuTokenizerModule(sqlite3_tokenizer_module const**ppModule);
3423: #endif
3424:
3425: /*
3426: ** Initialise the fts3 extension. If this extension is built as part
3427: ** of the sqlite library, then this function is called directly by
3428: ** SQLite. If fts3 is built as a dynamically loadable extension, this
3429: ** function is called by the sqlite3_extension_init() entry point.
3430: */
3431: int sqlite3Fts3Init(sqlite3 *db){
3432: int rc = SQLITE_OK;
3433: Fts3Hash *pHash = 0;
3434: const sqlite3_tokenizer_module *pSimple = 0;
3435: const sqlite3_tokenizer_module *pPorter = 0;
3436:
3437: #ifdef SQLITE_ENABLE_ICU
3438: const sqlite3_tokenizer_module *pIcu = 0;
3439: sqlite3Fts3IcuTokenizerModule(&pIcu);
3440: #endif
3441:
3442: #ifdef SQLITE_TEST
3443: rc = sqlite3Fts3InitTerm(db);
3444: if( rc!=SQLITE_OK ) return rc;
3445: #endif
3446:
3447: rc = sqlite3Fts3InitAux(db);
3448: if( rc!=SQLITE_OK ) return rc;
3449:
3450: sqlite3Fts3SimpleTokenizerModule(&pSimple);
3451: sqlite3Fts3PorterTokenizerModule(&pPorter);
3452:
3453: /* Allocate and initialise the hash-table used to store tokenizers. */
3454: pHash = sqlite3_malloc(sizeof(Fts3Hash));
3455: if( !pHash ){
3456: rc = SQLITE_NOMEM;
3457: }else{
3458: sqlite3Fts3HashInit(pHash, FTS3_HASH_STRING, 1);
3459: }
3460:
3461: /* Load the built-in tokenizers into the hash table */
3462: if( rc==SQLITE_OK ){
3463: if( sqlite3Fts3HashInsert(pHash, "simple", 7, (void *)pSimple)
3464: || sqlite3Fts3HashInsert(pHash, "porter", 7, (void *)pPorter)
3465: #ifdef SQLITE_ENABLE_ICU
3466: || (pIcu && sqlite3Fts3HashInsert(pHash, "icu", 4, (void *)pIcu))
3467: #endif
3468: ){
3469: rc = SQLITE_NOMEM;
3470: }
3471: }
3472:
3473: #ifdef SQLITE_TEST
3474: if( rc==SQLITE_OK ){
3475: rc = sqlite3Fts3ExprInitTestInterface(db);
3476: }
3477: #endif
3478:
3479: /* Create the virtual table wrapper around the hash-table and overload
3480: ** the two scalar functions. If this is successful, register the
3481: ** module with sqlite.
3482: */
3483: if( SQLITE_OK==rc
3484: && SQLITE_OK==(rc = sqlite3Fts3InitHashTable(db, pHash, "fts3_tokenizer"))
3485: && SQLITE_OK==(rc = sqlite3_overload_function(db, "snippet", -1))
3486: && SQLITE_OK==(rc = sqlite3_overload_function(db, "offsets", 1))
3487: && SQLITE_OK==(rc = sqlite3_overload_function(db, "matchinfo", 1))
3488: && SQLITE_OK==(rc = sqlite3_overload_function(db, "matchinfo", 2))
3489: && SQLITE_OK==(rc = sqlite3_overload_function(db, "optimize", 1))
3490: ){
3491: rc = sqlite3_create_module_v2(
3492: db, "fts3", &fts3Module, (void *)pHash, hashDestroy
3493: );
3494: if( rc==SQLITE_OK ){
3495: rc = sqlite3_create_module_v2(
3496: db, "fts4", &fts3Module, (void *)pHash, 0
3497: );
3498: }
3499: return rc;
3500: }
3501:
3502: /* An error has occurred. Delete the hash table and return the error code. */
3503: assert( rc!=SQLITE_OK );
3504: if( pHash ){
3505: sqlite3Fts3HashClear(pHash);
3506: sqlite3_free(pHash);
3507: }
3508: return rc;
3509: }
3510:
3511: /*
3512: ** Allocate an Fts3MultiSegReader for each token in the expression headed
3513: ** by pExpr.
3514: **
3515: ** An Fts3SegReader object is a cursor that can seek or scan a range of
3516: ** entries within a single segment b-tree. An Fts3MultiSegReader uses multiple
3517: ** Fts3SegReader objects internally to provide an interface to seek or scan
3518: ** within the union of all segments of a b-tree. Hence the name.
3519: **
3520: ** If the allocated Fts3MultiSegReader just seeks to a single entry in a
3521: ** segment b-tree (if the term is not a prefix or it is a prefix for which
3522: ** there exists prefix b-tree of the right length) then it may be traversed
3523: ** and merged incrementally. Otherwise, it has to be merged into an in-memory
3524: ** doclist and then traversed.
3525: */
3526: static void fts3EvalAllocateReaders(
3527: Fts3Cursor *pCsr, /* FTS cursor handle */
3528: Fts3Expr *pExpr, /* Allocate readers for this expression */
3529: int *pnToken, /* OUT: Total number of tokens in phrase. */
3530: int *pnOr, /* OUT: Total number of OR nodes in expr. */
3531: int *pRc /* IN/OUT: Error code */
3532: ){
3533: if( pExpr && SQLITE_OK==*pRc ){
3534: if( pExpr->eType==FTSQUERY_PHRASE ){
3535: int i;
3536: int nToken = pExpr->pPhrase->nToken;
3537: *pnToken += nToken;
3538: for(i=0; i<nToken; i++){
3539: Fts3PhraseToken *pToken = &pExpr->pPhrase->aToken[i];
3540: int rc = fts3TermSegReaderCursor(pCsr,
3541: pToken->z, pToken->n, pToken->isPrefix, &pToken->pSegcsr
3542: );
3543: if( rc!=SQLITE_OK ){
3544: *pRc = rc;
3545: return;
3546: }
3547: }
3548: assert( pExpr->pPhrase->iDoclistToken==0 );
3549: pExpr->pPhrase->iDoclistToken = -1;
3550: }else{
3551: *pnOr += (pExpr->eType==FTSQUERY_OR);
3552: fts3EvalAllocateReaders(pCsr, pExpr->pLeft, pnToken, pnOr, pRc);
3553: fts3EvalAllocateReaders(pCsr, pExpr->pRight, pnToken, pnOr, pRc);
3554: }
3555: }
3556: }
3557:
3558: /*
3559: ** Arguments pList/nList contain the doclist for token iToken of phrase p.
3560: ** It is merged into the main doclist stored in p->doclist.aAll/nAll.
3561: **
3562: ** This function assumes that pList points to a buffer allocated using
3563: ** sqlite3_malloc(). This function takes responsibility for eventually
3564: ** freeing the buffer.
3565: */
3566: static void fts3EvalPhraseMergeToken(
3567: Fts3Table *pTab, /* FTS Table pointer */
3568: Fts3Phrase *p, /* Phrase to merge pList/nList into */
3569: int iToken, /* Token pList/nList corresponds to */
3570: char *pList, /* Pointer to doclist */
3571: int nList /* Number of bytes in pList */
3572: ){
3573: assert( iToken!=p->iDoclistToken );
3574:
3575: if( pList==0 ){
3576: sqlite3_free(p->doclist.aAll);
3577: p->doclist.aAll = 0;
3578: p->doclist.nAll = 0;
3579: }
3580:
3581: else if( p->iDoclistToken<0 ){
3582: p->doclist.aAll = pList;
3583: p->doclist.nAll = nList;
3584: }
3585:
3586: else if( p->doclist.aAll==0 ){
3587: sqlite3_free(pList);
3588: }
3589:
3590: else {
3591: char *pLeft;
3592: char *pRight;
3593: int nLeft;
3594: int nRight;
3595: int nDiff;
3596:
3597: if( p->iDoclistToken<iToken ){
3598: pLeft = p->doclist.aAll;
3599: nLeft = p->doclist.nAll;
3600: pRight = pList;
3601: nRight = nList;
3602: nDiff = iToken - p->iDoclistToken;
3603: }else{
3604: pRight = p->doclist.aAll;
3605: nRight = p->doclist.nAll;
3606: pLeft = pList;
3607: nLeft = nList;
3608: nDiff = p->iDoclistToken - iToken;
3609: }
3610:
3611: fts3DoclistPhraseMerge(pTab->bDescIdx, nDiff, pLeft, nLeft, pRight,&nRight);
3612: sqlite3_free(pLeft);
3613: p->doclist.aAll = pRight;
3614: p->doclist.nAll = nRight;
3615: }
3616:
3617: if( iToken>p->iDoclistToken ) p->iDoclistToken = iToken;
3618: }
3619:
3620: /*
3621: ** Load the doclist for phrase p into p->doclist.aAll/nAll. The loaded doclist
3622: ** does not take deferred tokens into account.
3623: **
3624: ** SQLITE_OK is returned if no error occurs, otherwise an SQLite error code.
3625: */
3626: static int fts3EvalPhraseLoad(
3627: Fts3Cursor *pCsr, /* FTS Cursor handle */
3628: Fts3Phrase *p /* Phrase object */
3629: ){
3630: Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
3631: int iToken;
3632: int rc = SQLITE_OK;
3633:
3634: for(iToken=0; rc==SQLITE_OK && iToken<p->nToken; iToken++){
3635: Fts3PhraseToken *pToken = &p->aToken[iToken];
3636: assert( pToken->pDeferred==0 || pToken->pSegcsr==0 );
3637:
3638: if( pToken->pSegcsr ){
3639: int nThis = 0;
3640: char *pThis = 0;
3641: rc = fts3TermSelect(pTab, pToken, p->iColumn, &nThis, &pThis);
3642: if( rc==SQLITE_OK ){
3643: fts3EvalPhraseMergeToken(pTab, p, iToken, pThis, nThis);
3644: }
3645: }
3646: assert( pToken->pSegcsr==0 );
3647: }
3648:
3649: return rc;
3650: }
3651:
3652: /*
3653: ** This function is called on each phrase after the position lists for
3654: ** any deferred tokens have been loaded into memory. It updates the phrases
3655: ** current position list to include only those positions that are really
3656: ** instances of the phrase (after considering deferred tokens). If this
3657: ** means that the phrase does not appear in the current row, doclist.pList
3658: ** and doclist.nList are both zeroed.
3659: **
3660: ** SQLITE_OK is returned if no error occurs, otherwise an SQLite error code.
3661: */
3662: static int fts3EvalDeferredPhrase(Fts3Cursor *pCsr, Fts3Phrase *pPhrase){
3663: int iToken; /* Used to iterate through phrase tokens */
3664: char *aPoslist = 0; /* Position list for deferred tokens */
3665: int nPoslist = 0; /* Number of bytes in aPoslist */
3666: int iPrev = -1; /* Token number of previous deferred token */
3667:
3668: assert( pPhrase->doclist.bFreeList==0 );
3669:
3670: for(iToken=0; iToken<pPhrase->nToken; iToken++){
3671: Fts3PhraseToken *pToken = &pPhrase->aToken[iToken];
3672: Fts3DeferredToken *pDeferred = pToken->pDeferred;
3673:
3674: if( pDeferred ){
3675: char *pList;
3676: int nList;
3677: int rc = sqlite3Fts3DeferredTokenList(pDeferred, &pList, &nList);
3678: if( rc!=SQLITE_OK ) return rc;
3679:
3680: if( pList==0 ){
3681: sqlite3_free(aPoslist);
3682: pPhrase->doclist.pList = 0;
3683: pPhrase->doclist.nList = 0;
3684: return SQLITE_OK;
3685:
3686: }else if( aPoslist==0 ){
3687: aPoslist = pList;
3688: nPoslist = nList;
3689:
3690: }else{
3691: char *aOut = pList;
3692: char *p1 = aPoslist;
3693: char *p2 = aOut;
3694:
3695: assert( iPrev>=0 );
3696: fts3PoslistPhraseMerge(&aOut, iToken-iPrev, 0, 1, &p1, &p2);
3697: sqlite3_free(aPoslist);
3698: aPoslist = pList;
3699: nPoslist = aOut - aPoslist;
3700: if( nPoslist==0 ){
3701: sqlite3_free(aPoslist);
3702: pPhrase->doclist.pList = 0;
3703: pPhrase->doclist.nList = 0;
3704: return SQLITE_OK;
3705: }
3706: }
3707: iPrev = iToken;
3708: }
3709: }
3710:
3711: if( iPrev>=0 ){
3712: int nMaxUndeferred = pPhrase->iDoclistToken;
3713: if( nMaxUndeferred<0 ){
3714: pPhrase->doclist.pList = aPoslist;
3715: pPhrase->doclist.nList = nPoslist;
3716: pPhrase->doclist.iDocid = pCsr->iPrevId;
3717: pPhrase->doclist.bFreeList = 1;
3718: }else{
3719: int nDistance;
3720: char *p1;
3721: char *p2;
3722: char *aOut;
3723:
3724: if( nMaxUndeferred>iPrev ){
3725: p1 = aPoslist;
3726: p2 = pPhrase->doclist.pList;
3727: nDistance = nMaxUndeferred - iPrev;
3728: }else{
3729: p1 = pPhrase->doclist.pList;
3730: p2 = aPoslist;
3731: nDistance = iPrev - nMaxUndeferred;
3732: }
3733:
3734: aOut = (char *)sqlite3_malloc(nPoslist+8);
3735: if( !aOut ){
3736: sqlite3_free(aPoslist);
3737: return SQLITE_NOMEM;
3738: }
3739:
3740: pPhrase->doclist.pList = aOut;
3741: if( fts3PoslistPhraseMerge(&aOut, nDistance, 0, 1, &p1, &p2) ){
3742: pPhrase->doclist.bFreeList = 1;
3743: pPhrase->doclist.nList = (aOut - pPhrase->doclist.pList);
3744: }else{
3745: sqlite3_free(aOut);
3746: pPhrase->doclist.pList = 0;
3747: pPhrase->doclist.nList = 0;
3748: }
3749: sqlite3_free(aPoslist);
3750: }
3751: }
3752:
3753: return SQLITE_OK;
3754: }
3755:
3756: /*
3757: ** This function is called for each Fts3Phrase in a full-text query
3758: ** expression to initialize the mechanism for returning rows. Once this
3759: ** function has been called successfully on an Fts3Phrase, it may be
3760: ** used with fts3EvalPhraseNext() to iterate through the matching docids.
3761: **
3762: ** If parameter bOptOk is true, then the phrase may (or may not) use the
3763: ** incremental loading strategy. Otherwise, the entire doclist is loaded into
3764: ** memory within this call.
3765: **
3766: ** SQLITE_OK is returned if no error occurs, otherwise an SQLite error code.
3767: */
3768: static int fts3EvalPhraseStart(Fts3Cursor *pCsr, int bOptOk, Fts3Phrase *p){
3769: int rc; /* Error code */
3770: Fts3PhraseToken *pFirst = &p->aToken[0];
3771: Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
3772:
3773: if( pCsr->bDesc==pTab->bDescIdx
3774: && bOptOk==1
3775: && p->nToken==1
3776: && pFirst->pSegcsr
3777: && pFirst->pSegcsr->bLookup
3778: && pFirst->bFirst==0
3779: ){
3780: /* Use the incremental approach. */
3781: int iCol = (p->iColumn >= pTab->nColumn ? -1 : p->iColumn);
3782: rc = sqlite3Fts3MsrIncrStart(
3783: pTab, pFirst->pSegcsr, iCol, pFirst->z, pFirst->n);
3784: p->bIncr = 1;
3785:
3786: }else{
3787: /* Load the full doclist for the phrase into memory. */
3788: rc = fts3EvalPhraseLoad(pCsr, p);
3789: p->bIncr = 0;
3790: }
3791:
3792: assert( rc!=SQLITE_OK || p->nToken<1 || p->aToken[0].pSegcsr==0 || p->bIncr );
3793: return rc;
3794: }
3795:
3796: /*
3797: ** This function is used to iterate backwards (from the end to start)
3798: ** through doclists. It is used by this module to iterate through phrase
3799: ** doclists in reverse and by the fts3_write.c module to iterate through
3800: ** pending-terms lists when writing to databases with "order=desc".
3801: **
3802: ** The doclist may be sorted in ascending (parameter bDescIdx==0) or
3803: ** descending (parameter bDescIdx==1) order of docid. Regardless, this
3804: ** function iterates from the end of the doclist to the beginning.
3805: */
3806: void sqlite3Fts3DoclistPrev(
3807: int bDescIdx, /* True if the doclist is desc */
3808: char *aDoclist, /* Pointer to entire doclist */
3809: int nDoclist, /* Length of aDoclist in bytes */
3810: char **ppIter, /* IN/OUT: Iterator pointer */
3811: sqlite3_int64 *piDocid, /* IN/OUT: Docid pointer */
3812: int *pnList, /* IN/OUT: List length pointer */
3813: u8 *pbEof /* OUT: End-of-file flag */
3814: ){
3815: char *p = *ppIter;
3816:
3817: assert( nDoclist>0 );
3818: assert( *pbEof==0 );
3819: assert( p || *piDocid==0 );
3820: assert( !p || (p>aDoclist && p<&aDoclist[nDoclist]) );
3821:
3822: if( p==0 ){
3823: sqlite3_int64 iDocid = 0;
3824: char *pNext = 0;
3825: char *pDocid = aDoclist;
3826: char *pEnd = &aDoclist[nDoclist];
3827: int iMul = 1;
3828:
3829: while( pDocid<pEnd ){
3830: sqlite3_int64 iDelta;
3831: pDocid += sqlite3Fts3GetVarint(pDocid, &iDelta);
3832: iDocid += (iMul * iDelta);
3833: pNext = pDocid;
3834: fts3PoslistCopy(0, &pDocid);
3835: while( pDocid<pEnd && *pDocid==0 ) pDocid++;
3836: iMul = (bDescIdx ? -1 : 1);
3837: }
3838:
3839: *pnList = pEnd - pNext;
3840: *ppIter = pNext;
3841: *piDocid = iDocid;
3842: }else{
3843: int iMul = (bDescIdx ? -1 : 1);
3844: sqlite3_int64 iDelta;
3845: fts3GetReverseVarint(&p, aDoclist, &iDelta);
3846: *piDocid -= (iMul * iDelta);
3847:
3848: if( p==aDoclist ){
3849: *pbEof = 1;
3850: }else{
3851: char *pSave = p;
3852: fts3ReversePoslist(aDoclist, &p);
3853: *pnList = (pSave - p);
3854: }
3855: *ppIter = p;
3856: }
3857: }
3858:
3859: /*
3860: ** Attempt to move the phrase iterator to point to the next matching docid.
3861: ** If an error occurs, return an SQLite error code. Otherwise, return
3862: ** SQLITE_OK.
3863: **
3864: ** If there is no "next" entry and no error occurs, then *pbEof is set to
3865: ** 1 before returning. Otherwise, if no error occurs and the iterator is
3866: ** successfully advanced, *pbEof is set to 0.
3867: */
3868: static int fts3EvalPhraseNext(
3869: Fts3Cursor *pCsr, /* FTS Cursor handle */
3870: Fts3Phrase *p, /* Phrase object to advance to next docid */
3871: u8 *pbEof /* OUT: Set to 1 if EOF */
3872: ){
3873: int rc = SQLITE_OK;
3874: Fts3Doclist *pDL = &p->doclist;
3875: Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
3876:
3877: if( p->bIncr ){
3878: assert( p->nToken==1 );
3879: assert( pDL->pNextDocid==0 );
3880: rc = sqlite3Fts3MsrIncrNext(pTab, p->aToken[0].pSegcsr,
3881: &pDL->iDocid, &pDL->pList, &pDL->nList
3882: );
3883: if( rc==SQLITE_OK && !pDL->pList ){
3884: *pbEof = 1;
3885: }
3886: }else if( pCsr->bDesc!=pTab->bDescIdx && pDL->nAll ){
3887: sqlite3Fts3DoclistPrev(pTab->bDescIdx, pDL->aAll, pDL->nAll,
3888: &pDL->pNextDocid, &pDL->iDocid, &pDL->nList, pbEof
3889: );
3890: pDL->pList = pDL->pNextDocid;
3891: }else{
3892: char *pIter; /* Used to iterate through aAll */
3893: char *pEnd = &pDL->aAll[pDL->nAll]; /* 1 byte past end of aAll */
3894: if( pDL->pNextDocid ){
3895: pIter = pDL->pNextDocid;
3896: }else{
3897: pIter = pDL->aAll;
3898: }
3899:
3900: if( pIter>=pEnd ){
3901: /* We have already reached the end of this doclist. EOF. */
3902: *pbEof = 1;
3903: }else{
3904: sqlite3_int64 iDelta;
3905: pIter += sqlite3Fts3GetVarint(pIter, &iDelta);
3906: if( pTab->bDescIdx==0 || pDL->pNextDocid==0 ){
3907: pDL->iDocid += iDelta;
3908: }else{
3909: pDL->iDocid -= iDelta;
3910: }
3911: pDL->pList = pIter;
3912: fts3PoslistCopy(0, &pIter);
3913: pDL->nList = (pIter - pDL->pList);
3914:
3915: /* pIter now points just past the 0x00 that terminates the position-
3916: ** list for document pDL->iDocid. However, if this position-list was
3917: ** edited in place by fts3EvalNearTrim(), then pIter may not actually
3918: ** point to the start of the next docid value. The following line deals
3919: ** with this case by advancing pIter past the zero-padding added by
3920: ** fts3EvalNearTrim(). */
3921: while( pIter<pEnd && *pIter==0 ) pIter++;
3922:
3923: pDL->pNextDocid = pIter;
3924: assert( pIter>=&pDL->aAll[pDL->nAll] || *pIter );
3925: *pbEof = 0;
3926: }
3927: }
3928:
3929: return rc;
3930: }
3931:
3932: /*
3933: **
3934: ** If *pRc is not SQLITE_OK when this function is called, it is a no-op.
3935: ** Otherwise, fts3EvalPhraseStart() is called on all phrases within the
3936: ** expression. Also the Fts3Expr.bDeferred variable is set to true for any
3937: ** expressions for which all descendent tokens are deferred.
3938: **
3939: ** If parameter bOptOk is zero, then it is guaranteed that the
3940: ** Fts3Phrase.doclist.aAll/nAll variables contain the entire doclist for
3941: ** each phrase in the expression (subject to deferred token processing).
3942: ** Or, if bOptOk is non-zero, then one or more tokens within the expression
3943: ** may be loaded incrementally, meaning doclist.aAll/nAll is not available.
3944: **
3945: ** If an error occurs within this function, *pRc is set to an SQLite error
3946: ** code before returning.
3947: */
3948: static void fts3EvalStartReaders(
3949: Fts3Cursor *pCsr, /* FTS Cursor handle */
3950: Fts3Expr *pExpr, /* Expression to initialize phrases in */
3951: int bOptOk, /* True to enable incremental loading */
3952: int *pRc /* IN/OUT: Error code */
3953: ){
3954: if( pExpr && SQLITE_OK==*pRc ){
3955: if( pExpr->eType==FTSQUERY_PHRASE ){
3956: int i;
3957: int nToken = pExpr->pPhrase->nToken;
3958: for(i=0; i<nToken; i++){
3959: if( pExpr->pPhrase->aToken[i].pDeferred==0 ) break;
3960: }
3961: pExpr->bDeferred = (i==nToken);
3962: *pRc = fts3EvalPhraseStart(pCsr, bOptOk, pExpr->pPhrase);
3963: }else{
3964: fts3EvalStartReaders(pCsr, pExpr->pLeft, bOptOk, pRc);
3965: fts3EvalStartReaders(pCsr, pExpr->pRight, bOptOk, pRc);
3966: pExpr->bDeferred = (pExpr->pLeft->bDeferred && pExpr->pRight->bDeferred);
3967: }
3968: }
3969: }
3970:
3971: /*
3972: ** An array of the following structures is assembled as part of the process
3973: ** of selecting tokens to defer before the query starts executing (as part
3974: ** of the xFilter() method). There is one element in the array for each
3975: ** token in the FTS expression.
3976: **
3977: ** Tokens are divided into AND/NEAR clusters. All tokens in a cluster belong
3978: ** to phrases that are connected only by AND and NEAR operators (not OR or
3979: ** NOT). When determining tokens to defer, each AND/NEAR cluster is considered
3980: ** separately. The root of a tokens AND/NEAR cluster is stored in
3981: ** Fts3TokenAndCost.pRoot.
3982: */
3983: typedef struct Fts3TokenAndCost Fts3TokenAndCost;
3984: struct Fts3TokenAndCost {
3985: Fts3Phrase *pPhrase; /* The phrase the token belongs to */
3986: int iToken; /* Position of token in phrase */
3987: Fts3PhraseToken *pToken; /* The token itself */
3988: Fts3Expr *pRoot; /* Root of NEAR/AND cluster */
3989: int nOvfl; /* Number of overflow pages to load doclist */
3990: int iCol; /* The column the token must match */
3991: };
3992:
3993: /*
3994: ** This function is used to populate an allocated Fts3TokenAndCost array.
3995: **
3996: ** If *pRc is not SQLITE_OK when this function is called, it is a no-op.
3997: ** Otherwise, if an error occurs during execution, *pRc is set to an
3998: ** SQLite error code.
3999: */
4000: static void fts3EvalTokenCosts(
4001: Fts3Cursor *pCsr, /* FTS Cursor handle */
4002: Fts3Expr *pRoot, /* Root of current AND/NEAR cluster */
4003: Fts3Expr *pExpr, /* Expression to consider */
4004: Fts3TokenAndCost **ppTC, /* Write new entries to *(*ppTC)++ */
4005: Fts3Expr ***ppOr, /* Write new OR root to *(*ppOr)++ */
4006: int *pRc /* IN/OUT: Error code */
4007: ){
4008: if( *pRc==SQLITE_OK ){
4009: if( pExpr->eType==FTSQUERY_PHRASE ){
4010: Fts3Phrase *pPhrase = pExpr->pPhrase;
4011: int i;
4012: for(i=0; *pRc==SQLITE_OK && i<pPhrase->nToken; i++){
4013: Fts3TokenAndCost *pTC = (*ppTC)++;
4014: pTC->pPhrase = pPhrase;
4015: pTC->iToken = i;
4016: pTC->pRoot = pRoot;
4017: pTC->pToken = &pPhrase->aToken[i];
4018: pTC->iCol = pPhrase->iColumn;
4019: *pRc = sqlite3Fts3MsrOvfl(pCsr, pTC->pToken->pSegcsr, &pTC->nOvfl);
4020: }
4021: }else if( pExpr->eType!=FTSQUERY_NOT ){
4022: assert( pExpr->eType==FTSQUERY_OR
4023: || pExpr->eType==FTSQUERY_AND
4024: || pExpr->eType==FTSQUERY_NEAR
4025: );
4026: assert( pExpr->pLeft && pExpr->pRight );
4027: if( pExpr->eType==FTSQUERY_OR ){
4028: pRoot = pExpr->pLeft;
4029: **ppOr = pRoot;
4030: (*ppOr)++;
4031: }
4032: fts3EvalTokenCosts(pCsr, pRoot, pExpr->pLeft, ppTC, ppOr, pRc);
4033: if( pExpr->eType==FTSQUERY_OR ){
4034: pRoot = pExpr->pRight;
4035: **ppOr = pRoot;
4036: (*ppOr)++;
4037: }
4038: fts3EvalTokenCosts(pCsr, pRoot, pExpr->pRight, ppTC, ppOr, pRc);
4039: }
4040: }
4041: }
4042:
4043: /*
4044: ** Determine the average document (row) size in pages. If successful,
4045: ** write this value to *pnPage and return SQLITE_OK. Otherwise, return
4046: ** an SQLite error code.
4047: **
4048: ** The average document size in pages is calculated by first calculating
4049: ** determining the average size in bytes, B. If B is less than the amount
4050: ** of data that will fit on a single leaf page of an intkey table in
4051: ** this database, then the average docsize is 1. Otherwise, it is 1 plus
4052: ** the number of overflow pages consumed by a record B bytes in size.
4053: */
4054: static int fts3EvalAverageDocsize(Fts3Cursor *pCsr, int *pnPage){
4055: if( pCsr->nRowAvg==0 ){
4056: /* The average document size, which is required to calculate the cost
4057: ** of each doclist, has not yet been determined. Read the required
4058: ** data from the %_stat table to calculate it.
4059: **
4060: ** Entry 0 of the %_stat table is a blob containing (nCol+1) FTS3
4061: ** varints, where nCol is the number of columns in the FTS3 table.
4062: ** The first varint is the number of documents currently stored in
4063: ** the table. The following nCol varints contain the total amount of
4064: ** data stored in all rows of each column of the table, from left
4065: ** to right.
4066: */
4067: int rc;
4068: Fts3Table *p = (Fts3Table*)pCsr->base.pVtab;
4069: sqlite3_stmt *pStmt;
4070: sqlite3_int64 nDoc = 0;
4071: sqlite3_int64 nByte = 0;
4072: const char *pEnd;
4073: const char *a;
4074:
4075: rc = sqlite3Fts3SelectDoctotal(p, &pStmt);
4076: if( rc!=SQLITE_OK ) return rc;
4077: a = sqlite3_column_blob(pStmt, 0);
4078: assert( a );
4079:
4080: pEnd = &a[sqlite3_column_bytes(pStmt, 0)];
4081: a += sqlite3Fts3GetVarint(a, &nDoc);
4082: while( a<pEnd ){
4083: a += sqlite3Fts3GetVarint(a, &nByte);
4084: }
4085: if( nDoc==0 || nByte==0 ){
4086: sqlite3_reset(pStmt);
4087: return FTS_CORRUPT_VTAB;
4088: }
4089:
4090: pCsr->nDoc = nDoc;
4091: pCsr->nRowAvg = (int)(((nByte / nDoc) + p->nPgsz) / p->nPgsz);
4092: assert( pCsr->nRowAvg>0 );
4093: rc = sqlite3_reset(pStmt);
4094: if( rc!=SQLITE_OK ) return rc;
4095: }
4096:
4097: *pnPage = pCsr->nRowAvg;
4098: return SQLITE_OK;
4099: }
4100:
4101: /*
4102: ** This function is called to select the tokens (if any) that will be
4103: ** deferred. The array aTC[] has already been populated when this is
4104: ** called.
4105: **
4106: ** This function is called once for each AND/NEAR cluster in the
4107: ** expression. Each invocation determines which tokens to defer within
4108: ** the cluster with root node pRoot. See comments above the definition
4109: ** of struct Fts3TokenAndCost for more details.
4110: **
4111: ** If no error occurs, SQLITE_OK is returned and sqlite3Fts3DeferToken()
4112: ** called on each token to defer. Otherwise, an SQLite error code is
4113: ** returned.
4114: */
4115: static int fts3EvalSelectDeferred(
4116: Fts3Cursor *pCsr, /* FTS Cursor handle */
4117: Fts3Expr *pRoot, /* Consider tokens with this root node */
4118: Fts3TokenAndCost *aTC, /* Array of expression tokens and costs */
4119: int nTC /* Number of entries in aTC[] */
4120: ){
4121: Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
4122: int nDocSize = 0; /* Number of pages per doc loaded */
4123: int rc = SQLITE_OK; /* Return code */
4124: int ii; /* Iterator variable for various purposes */
4125: int nOvfl = 0; /* Total overflow pages used by doclists */
4126: int nToken = 0; /* Total number of tokens in cluster */
4127:
4128: int nMinEst = 0; /* The minimum count for any phrase so far. */
4129: int nLoad4 = 1; /* (Phrases that will be loaded)^4. */
4130:
4131: /* Tokens are never deferred for FTS tables created using the content=xxx
4132: ** option. The reason being that it is not guaranteed that the content
4133: ** table actually contains the same data as the index. To prevent this from
4134: ** causing any problems, the deferred token optimization is completely
4135: ** disabled for content=xxx tables. */
4136: if( pTab->zContentTbl ){
4137: return SQLITE_OK;
4138: }
4139:
4140: /* Count the tokens in this AND/NEAR cluster. If none of the doclists
4141: ** associated with the tokens spill onto overflow pages, or if there is
4142: ** only 1 token, exit early. No tokens to defer in this case. */
4143: for(ii=0; ii<nTC; ii++){
4144: if( aTC[ii].pRoot==pRoot ){
4145: nOvfl += aTC[ii].nOvfl;
4146: nToken++;
4147: }
4148: }
4149: if( nOvfl==0 || nToken<2 ) return SQLITE_OK;
4150:
4151: /* Obtain the average docsize (in pages). */
4152: rc = fts3EvalAverageDocsize(pCsr, &nDocSize);
4153: assert( rc!=SQLITE_OK || nDocSize>0 );
4154:
4155:
4156: /* Iterate through all tokens in this AND/NEAR cluster, in ascending order
4157: ** of the number of overflow pages that will be loaded by the pager layer
4158: ** to retrieve the entire doclist for the token from the full-text index.
4159: ** Load the doclists for tokens that are either:
4160: **
4161: ** a. The cheapest token in the entire query (i.e. the one visited by the
4162: ** first iteration of this loop), or
4163: **
4164: ** b. Part of a multi-token phrase.
4165: **
4166: ** After each token doclist is loaded, merge it with the others from the
4167: ** same phrase and count the number of documents that the merged doclist
4168: ** contains. Set variable "nMinEst" to the smallest number of documents in
4169: ** any phrase doclist for which 1 or more token doclists have been loaded.
4170: ** Let nOther be the number of other phrases for which it is certain that
4171: ** one or more tokens will not be deferred.
4172: **
4173: ** Then, for each token, defer it if loading the doclist would result in
4174: ** loading N or more overflow pages into memory, where N is computed as:
4175: **
4176: ** (nMinEst + 4^nOther - 1) / (4^nOther)
4177: */
4178: for(ii=0; ii<nToken && rc==SQLITE_OK; ii++){
4179: int iTC; /* Used to iterate through aTC[] array. */
4180: Fts3TokenAndCost *pTC = 0; /* Set to cheapest remaining token. */
4181:
4182: /* Set pTC to point to the cheapest remaining token. */
4183: for(iTC=0; iTC<nTC; iTC++){
4184: if( aTC[iTC].pToken && aTC[iTC].pRoot==pRoot
4185: && (!pTC || aTC[iTC].nOvfl<pTC->nOvfl)
4186: ){
4187: pTC = &aTC[iTC];
4188: }
4189: }
4190: assert( pTC );
4191:
4192: if( ii && pTC->nOvfl>=((nMinEst+(nLoad4/4)-1)/(nLoad4/4))*nDocSize ){
4193: /* The number of overflow pages to load for this (and therefore all
4194: ** subsequent) tokens is greater than the estimated number of pages
4195: ** that will be loaded if all subsequent tokens are deferred.
4196: */
4197: Fts3PhraseToken *pToken = pTC->pToken;
4198: rc = sqlite3Fts3DeferToken(pCsr, pToken, pTC->iCol);
4199: fts3SegReaderCursorFree(pToken->pSegcsr);
4200: pToken->pSegcsr = 0;
4201: }else{
4202: /* Set nLoad4 to the value of (4^nOther) for the next iteration of the
4203: ** for-loop. Except, limit the value to 2^24 to prevent it from
4204: ** overflowing the 32-bit integer it is stored in. */
4205: if( ii<12 ) nLoad4 = nLoad4*4;
4206:
4207: if( ii==0 || pTC->pPhrase->nToken>1 ){
4208: /* Either this is the cheapest token in the entire query, or it is
4209: ** part of a multi-token phrase. Either way, the entire doclist will
4210: ** (eventually) be loaded into memory. It may as well be now. */
4211: Fts3PhraseToken *pToken = pTC->pToken;
4212: int nList = 0;
4213: char *pList = 0;
4214: rc = fts3TermSelect(pTab, pToken, pTC->iCol, &nList, &pList);
4215: assert( rc==SQLITE_OK || pList==0 );
4216: if( rc==SQLITE_OK ){
4217: int nCount;
4218: fts3EvalPhraseMergeToken(pTab, pTC->pPhrase, pTC->iToken,pList,nList);
4219: nCount = fts3DoclistCountDocids(
4220: pTC->pPhrase->doclist.aAll, pTC->pPhrase->doclist.nAll
4221: );
4222: if( ii==0 || nCount<nMinEst ) nMinEst = nCount;
4223: }
4224: }
4225: }
4226: pTC->pToken = 0;
4227: }
4228:
4229: return rc;
4230: }
4231:
4232: /*
4233: ** This function is called from within the xFilter method. It initializes
4234: ** the full-text query currently stored in pCsr->pExpr. To iterate through
4235: ** the results of a query, the caller does:
4236: **
4237: ** fts3EvalStart(pCsr);
4238: ** while( 1 ){
4239: ** fts3EvalNext(pCsr);
4240: ** if( pCsr->bEof ) break;
4241: ** ... return row pCsr->iPrevId to the caller ...
4242: ** }
4243: */
4244: static int fts3EvalStart(Fts3Cursor *pCsr){
4245: Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
4246: int rc = SQLITE_OK;
4247: int nToken = 0;
4248: int nOr = 0;
4249:
4250: /* Allocate a MultiSegReader for each token in the expression. */
4251: fts3EvalAllocateReaders(pCsr, pCsr->pExpr, &nToken, &nOr, &rc);
4252:
4253: /* Determine which, if any, tokens in the expression should be deferred. */
4254: if( rc==SQLITE_OK && nToken>1 && pTab->bHasStat ){
4255: Fts3TokenAndCost *aTC;
4256: Fts3Expr **apOr;
4257: aTC = (Fts3TokenAndCost *)sqlite3_malloc(
4258: sizeof(Fts3TokenAndCost) * nToken
4259: + sizeof(Fts3Expr *) * nOr * 2
4260: );
4261: apOr = (Fts3Expr **)&aTC[nToken];
4262:
4263: if( !aTC ){
4264: rc = SQLITE_NOMEM;
4265: }else{
4266: int ii;
4267: Fts3TokenAndCost *pTC = aTC;
4268: Fts3Expr **ppOr = apOr;
4269:
4270: fts3EvalTokenCosts(pCsr, 0, pCsr->pExpr, &pTC, &ppOr, &rc);
4271: nToken = pTC-aTC;
4272: nOr = ppOr-apOr;
4273:
4274: if( rc==SQLITE_OK ){
4275: rc = fts3EvalSelectDeferred(pCsr, 0, aTC, nToken);
4276: for(ii=0; rc==SQLITE_OK && ii<nOr; ii++){
4277: rc = fts3EvalSelectDeferred(pCsr, apOr[ii], aTC, nToken);
4278: }
4279: }
4280:
4281: sqlite3_free(aTC);
4282: }
4283: }
4284:
4285: fts3EvalStartReaders(pCsr, pCsr->pExpr, 1, &rc);
4286: return rc;
4287: }
4288:
4289: /*
4290: ** Invalidate the current position list for phrase pPhrase.
4291: */
4292: static void fts3EvalInvalidatePoslist(Fts3Phrase *pPhrase){
4293: if( pPhrase->doclist.bFreeList ){
4294: sqlite3_free(pPhrase->doclist.pList);
4295: }
4296: pPhrase->doclist.pList = 0;
4297: pPhrase->doclist.nList = 0;
4298: pPhrase->doclist.bFreeList = 0;
4299: }
4300:
4301: /*
4302: ** This function is called to edit the position list associated with
4303: ** the phrase object passed as the fifth argument according to a NEAR
4304: ** condition. For example:
4305: **
4306: ** abc NEAR/5 "def ghi"
4307: **
4308: ** Parameter nNear is passed the NEAR distance of the expression (5 in
4309: ** the example above). When this function is called, *paPoslist points to
4310: ** the position list, and *pnToken is the number of phrase tokens in, the
4311: ** phrase on the other side of the NEAR operator to pPhrase. For example,
4312: ** if pPhrase refers to the "def ghi" phrase, then *paPoslist points to
4313: ** the position list associated with phrase "abc".
4314: **
4315: ** All positions in the pPhrase position list that are not sufficiently
4316: ** close to a position in the *paPoslist position list are removed. If this
4317: ** leaves 0 positions, zero is returned. Otherwise, non-zero.
4318: **
4319: ** Before returning, *paPoslist is set to point to the position lsit
4320: ** associated with pPhrase. And *pnToken is set to the number of tokens in
4321: ** pPhrase.
4322: */
4323: static int fts3EvalNearTrim(
4324: int nNear, /* NEAR distance. As in "NEAR/nNear". */
4325: char *aTmp, /* Temporary space to use */
4326: char **paPoslist, /* IN/OUT: Position list */
4327: int *pnToken, /* IN/OUT: Tokens in phrase of *paPoslist */
4328: Fts3Phrase *pPhrase /* The phrase object to trim the doclist of */
4329: ){
4330: int nParam1 = nNear + pPhrase->nToken;
4331: int nParam2 = nNear + *pnToken;
4332: int nNew;
4333: char *p2;
4334: char *pOut;
4335: int res;
4336:
4337: assert( pPhrase->doclist.pList );
4338:
4339: p2 = pOut = pPhrase->doclist.pList;
4340: res = fts3PoslistNearMerge(
4341: &pOut, aTmp, nParam1, nParam2, paPoslist, &p2
4342: );
4343: if( res ){
4344: nNew = (pOut - pPhrase->doclist.pList) - 1;
4345: assert( pPhrase->doclist.pList[nNew]=='\0' );
4346: assert( nNew<=pPhrase->doclist.nList && nNew>0 );
4347: memset(&pPhrase->doclist.pList[nNew], 0, pPhrase->doclist.nList - nNew);
4348: pPhrase->doclist.nList = nNew;
4349: *paPoslist = pPhrase->doclist.pList;
4350: *pnToken = pPhrase->nToken;
4351: }
4352:
4353: return res;
4354: }
4355:
4356: /*
4357: ** This function is a no-op if *pRc is other than SQLITE_OK when it is called.
4358: ** Otherwise, it advances the expression passed as the second argument to
4359: ** point to the next matching row in the database. Expressions iterate through
4360: ** matching rows in docid order. Ascending order if Fts3Cursor.bDesc is zero,
4361: ** or descending if it is non-zero.
4362: **
4363: ** If an error occurs, *pRc is set to an SQLite error code. Otherwise, if
4364: ** successful, the following variables in pExpr are set:
4365: **
4366: ** Fts3Expr.bEof (non-zero if EOF - there is no next row)
4367: ** Fts3Expr.iDocid (valid if bEof==0. The docid of the next row)
4368: **
4369: ** If the expression is of type FTSQUERY_PHRASE, and the expression is not
4370: ** at EOF, then the following variables are populated with the position list
4371: ** for the phrase for the visited row:
4372: **
4373: ** FTs3Expr.pPhrase->doclist.nList (length of pList in bytes)
4374: ** FTs3Expr.pPhrase->doclist.pList (pointer to position list)
4375: **
4376: ** It says above that this function advances the expression to the next
4377: ** matching row. This is usually true, but there are the following exceptions:
4378: **
4379: ** 1. Deferred tokens are not taken into account. If a phrase consists
4380: ** entirely of deferred tokens, it is assumed to match every row in
4381: ** the db. In this case the position-list is not populated at all.
4382: **
4383: ** Or, if a phrase contains one or more deferred tokens and one or
4384: ** more non-deferred tokens, then the expression is advanced to the
4385: ** next possible match, considering only non-deferred tokens. In other
4386: ** words, if the phrase is "A B C", and "B" is deferred, the expression
4387: ** is advanced to the next row that contains an instance of "A * C",
4388: ** where "*" may match any single token. The position list in this case
4389: ** is populated as for "A * C" before returning.
4390: **
4391: ** 2. NEAR is treated as AND. If the expression is "x NEAR y", it is
4392: ** advanced to point to the next row that matches "x AND y".
4393: **
4394: ** See fts3EvalTestDeferredAndNear() for details on testing if a row is
4395: ** really a match, taking into account deferred tokens and NEAR operators.
4396: */
4397: static void fts3EvalNextRow(
4398: Fts3Cursor *pCsr, /* FTS Cursor handle */
4399: Fts3Expr *pExpr, /* Expr. to advance to next matching row */
4400: int *pRc /* IN/OUT: Error code */
4401: ){
4402: if( *pRc==SQLITE_OK ){
4403: int bDescDoclist = pCsr->bDesc; /* Used by DOCID_CMP() macro */
4404: assert( pExpr->bEof==0 );
4405: pExpr->bStart = 1;
4406:
4407: switch( pExpr->eType ){
4408: case FTSQUERY_NEAR:
4409: case FTSQUERY_AND: {
4410: Fts3Expr *pLeft = pExpr->pLeft;
4411: Fts3Expr *pRight = pExpr->pRight;
4412: assert( !pLeft->bDeferred || !pRight->bDeferred );
4413:
4414: if( pLeft->bDeferred ){
4415: /* LHS is entirely deferred. So we assume it matches every row.
4416: ** Advance the RHS iterator to find the next row visited. */
4417: fts3EvalNextRow(pCsr, pRight, pRc);
4418: pExpr->iDocid = pRight->iDocid;
4419: pExpr->bEof = pRight->bEof;
4420: }else if( pRight->bDeferred ){
4421: /* RHS is entirely deferred. So we assume it matches every row.
4422: ** Advance the LHS iterator to find the next row visited. */
4423: fts3EvalNextRow(pCsr, pLeft, pRc);
4424: pExpr->iDocid = pLeft->iDocid;
4425: pExpr->bEof = pLeft->bEof;
4426: }else{
4427: /* Neither the RHS or LHS are deferred. */
4428: fts3EvalNextRow(pCsr, pLeft, pRc);
4429: fts3EvalNextRow(pCsr, pRight, pRc);
4430: while( !pLeft->bEof && !pRight->bEof && *pRc==SQLITE_OK ){
4431: sqlite3_int64 iDiff = DOCID_CMP(pLeft->iDocid, pRight->iDocid);
4432: if( iDiff==0 ) break;
4433: if( iDiff<0 ){
4434: fts3EvalNextRow(pCsr, pLeft, pRc);
4435: }else{
4436: fts3EvalNextRow(pCsr, pRight, pRc);
4437: }
4438: }
4439: pExpr->iDocid = pLeft->iDocid;
4440: pExpr->bEof = (pLeft->bEof || pRight->bEof);
4441: }
4442: break;
4443: }
4444:
4445: case FTSQUERY_OR: {
4446: Fts3Expr *pLeft = pExpr->pLeft;
4447: Fts3Expr *pRight = pExpr->pRight;
4448: sqlite3_int64 iCmp = DOCID_CMP(pLeft->iDocid, pRight->iDocid);
4449:
4450: assert( pLeft->bStart || pLeft->iDocid==pRight->iDocid );
4451: assert( pRight->bStart || pLeft->iDocid==pRight->iDocid );
4452:
4453: if( pRight->bEof || (pLeft->bEof==0 && iCmp<0) ){
4454: fts3EvalNextRow(pCsr, pLeft, pRc);
4455: }else if( pLeft->bEof || (pRight->bEof==0 && iCmp>0) ){
4456: fts3EvalNextRow(pCsr, pRight, pRc);
4457: }else{
4458: fts3EvalNextRow(pCsr, pLeft, pRc);
4459: fts3EvalNextRow(pCsr, pRight, pRc);
4460: }
4461:
4462: pExpr->bEof = (pLeft->bEof && pRight->bEof);
4463: iCmp = DOCID_CMP(pLeft->iDocid, pRight->iDocid);
4464: if( pRight->bEof || (pLeft->bEof==0 && iCmp<0) ){
4465: pExpr->iDocid = pLeft->iDocid;
4466: }else{
4467: pExpr->iDocid = pRight->iDocid;
4468: }
4469:
4470: break;
4471: }
4472:
4473: case FTSQUERY_NOT: {
4474: Fts3Expr *pLeft = pExpr->pLeft;
4475: Fts3Expr *pRight = pExpr->pRight;
4476:
4477: if( pRight->bStart==0 ){
4478: fts3EvalNextRow(pCsr, pRight, pRc);
4479: assert( *pRc!=SQLITE_OK || pRight->bStart );
4480: }
4481:
4482: fts3EvalNextRow(pCsr, pLeft, pRc);
4483: if( pLeft->bEof==0 ){
4484: while( !*pRc
4485: && !pRight->bEof
4486: && DOCID_CMP(pLeft->iDocid, pRight->iDocid)>0
4487: ){
4488: fts3EvalNextRow(pCsr, pRight, pRc);
4489: }
4490: }
4491: pExpr->iDocid = pLeft->iDocid;
4492: pExpr->bEof = pLeft->bEof;
4493: break;
4494: }
4495:
4496: default: {
4497: Fts3Phrase *pPhrase = pExpr->pPhrase;
4498: fts3EvalInvalidatePoslist(pPhrase);
4499: *pRc = fts3EvalPhraseNext(pCsr, pPhrase, &pExpr->bEof);
4500: pExpr->iDocid = pPhrase->doclist.iDocid;
4501: break;
4502: }
4503: }
4504: }
4505: }
4506:
4507: /*
4508: ** If *pRc is not SQLITE_OK, or if pExpr is not the root node of a NEAR
4509: ** cluster, then this function returns 1 immediately.
4510: **
4511: ** Otherwise, it checks if the current row really does match the NEAR
4512: ** expression, using the data currently stored in the position lists
4513: ** (Fts3Expr->pPhrase.doclist.pList/nList) for each phrase in the expression.
4514: **
4515: ** If the current row is a match, the position list associated with each
4516: ** phrase in the NEAR expression is edited in place to contain only those
4517: ** phrase instances sufficiently close to their peers to satisfy all NEAR
4518: ** constraints. In this case it returns 1. If the NEAR expression does not
4519: ** match the current row, 0 is returned. The position lists may or may not
4520: ** be edited if 0 is returned.
4521: */
4522: static int fts3EvalNearTest(Fts3Expr *pExpr, int *pRc){
4523: int res = 1;
4524:
4525: /* The following block runs if pExpr is the root of a NEAR query.
4526: ** For example, the query:
4527: **
4528: ** "w" NEAR "x" NEAR "y" NEAR "z"
4529: **
4530: ** which is represented in tree form as:
4531: **
4532: ** |
4533: ** +--NEAR--+ <-- root of NEAR query
4534: ** | |
4535: ** +--NEAR--+ "z"
4536: ** | |
4537: ** +--NEAR--+ "y"
4538: ** | |
4539: ** "w" "x"
4540: **
4541: ** The right-hand child of a NEAR node is always a phrase. The
4542: ** left-hand child may be either a phrase or a NEAR node. There are
4543: ** no exceptions to this - it's the way the parser in fts3_expr.c works.
4544: */
4545: if( *pRc==SQLITE_OK
4546: && pExpr->eType==FTSQUERY_NEAR
4547: && pExpr->bEof==0
4548: && (pExpr->pParent==0 || pExpr->pParent->eType!=FTSQUERY_NEAR)
4549: ){
4550: Fts3Expr *p;
4551: int nTmp = 0; /* Bytes of temp space */
4552: char *aTmp; /* Temp space for PoslistNearMerge() */
4553:
4554: /* Allocate temporary working space. */
4555: for(p=pExpr; p->pLeft; p=p->pLeft){
4556: nTmp += p->pRight->pPhrase->doclist.nList;
4557: }
4558: nTmp += p->pPhrase->doclist.nList;
4559: aTmp = sqlite3_malloc(nTmp*2);
4560: if( !aTmp ){
4561: *pRc = SQLITE_NOMEM;
4562: res = 0;
4563: }else{
4564: char *aPoslist = p->pPhrase->doclist.pList;
4565: int nToken = p->pPhrase->nToken;
4566:
4567: for(p=p->pParent;res && p && p->eType==FTSQUERY_NEAR; p=p->pParent){
4568: Fts3Phrase *pPhrase = p->pRight->pPhrase;
4569: int nNear = p->nNear;
4570: res = fts3EvalNearTrim(nNear, aTmp, &aPoslist, &nToken, pPhrase);
4571: }
4572:
4573: aPoslist = pExpr->pRight->pPhrase->doclist.pList;
4574: nToken = pExpr->pRight->pPhrase->nToken;
4575: for(p=pExpr->pLeft; p && res; p=p->pLeft){
4576: int nNear;
4577: Fts3Phrase *pPhrase;
4578: assert( p->pParent && p->pParent->pLeft==p );
4579: nNear = p->pParent->nNear;
4580: pPhrase = (
4581: p->eType==FTSQUERY_NEAR ? p->pRight->pPhrase : p->pPhrase
4582: );
4583: res = fts3EvalNearTrim(nNear, aTmp, &aPoslist, &nToken, pPhrase);
4584: }
4585: }
4586:
4587: sqlite3_free(aTmp);
4588: }
4589:
4590: return res;
4591: }
4592:
4593: /*
4594: ** This function is a helper function for fts3EvalTestDeferredAndNear().
4595: ** Assuming no error occurs or has occurred, It returns non-zero if the
4596: ** expression passed as the second argument matches the row that pCsr
4597: ** currently points to, or zero if it does not.
4598: **
4599: ** If *pRc is not SQLITE_OK when this function is called, it is a no-op.
4600: ** If an error occurs during execution of this function, *pRc is set to
4601: ** the appropriate SQLite error code. In this case the returned value is
4602: ** undefined.
4603: */
4604: static int fts3EvalTestExpr(
4605: Fts3Cursor *pCsr, /* FTS cursor handle */
4606: Fts3Expr *pExpr, /* Expr to test. May or may not be root. */
4607: int *pRc /* IN/OUT: Error code */
4608: ){
4609: int bHit = 1; /* Return value */
4610: if( *pRc==SQLITE_OK ){
4611: switch( pExpr->eType ){
4612: case FTSQUERY_NEAR:
4613: case FTSQUERY_AND:
4614: bHit = (
4615: fts3EvalTestExpr(pCsr, pExpr->pLeft, pRc)
4616: && fts3EvalTestExpr(pCsr, pExpr->pRight, pRc)
4617: && fts3EvalNearTest(pExpr, pRc)
4618: );
4619:
4620: /* If the NEAR expression does not match any rows, zero the doclist for
4621: ** all phrases involved in the NEAR. This is because the snippet(),
4622: ** offsets() and matchinfo() functions are not supposed to recognize
4623: ** any instances of phrases that are part of unmatched NEAR queries.
4624: ** For example if this expression:
4625: **
4626: ** ... MATCH 'a OR (b NEAR c)'
4627: **
4628: ** is matched against a row containing:
4629: **
4630: ** 'a b d e'
4631: **
4632: ** then any snippet() should ony highlight the "a" term, not the "b"
4633: ** (as "b" is part of a non-matching NEAR clause).
4634: */
4635: if( bHit==0
4636: && pExpr->eType==FTSQUERY_NEAR
4637: && (pExpr->pParent==0 || pExpr->pParent->eType!=FTSQUERY_NEAR)
4638: ){
4639: Fts3Expr *p;
4640: for(p=pExpr; p->pPhrase==0; p=p->pLeft){
4641: if( p->pRight->iDocid==pCsr->iPrevId ){
4642: fts3EvalInvalidatePoslist(p->pRight->pPhrase);
4643: }
4644: }
4645: if( p->iDocid==pCsr->iPrevId ){
4646: fts3EvalInvalidatePoslist(p->pPhrase);
4647: }
4648: }
4649:
4650: break;
4651:
4652: case FTSQUERY_OR: {
4653: int bHit1 = fts3EvalTestExpr(pCsr, pExpr->pLeft, pRc);
4654: int bHit2 = fts3EvalTestExpr(pCsr, pExpr->pRight, pRc);
4655: bHit = bHit1 || bHit2;
4656: break;
4657: }
4658:
4659: case FTSQUERY_NOT:
4660: bHit = (
4661: fts3EvalTestExpr(pCsr, pExpr->pLeft, pRc)
4662: && !fts3EvalTestExpr(pCsr, pExpr->pRight, pRc)
4663: );
4664: break;
4665:
4666: default: {
4667: if( pCsr->pDeferred
4668: && (pExpr->iDocid==pCsr->iPrevId || pExpr->bDeferred)
4669: ){
4670: Fts3Phrase *pPhrase = pExpr->pPhrase;
4671: assert( pExpr->bDeferred || pPhrase->doclist.bFreeList==0 );
4672: if( pExpr->bDeferred ){
4673: fts3EvalInvalidatePoslist(pPhrase);
4674: }
4675: *pRc = fts3EvalDeferredPhrase(pCsr, pPhrase);
4676: bHit = (pPhrase->doclist.pList!=0);
4677: pExpr->iDocid = pCsr->iPrevId;
4678: }else{
4679: bHit = (pExpr->bEof==0 && pExpr->iDocid==pCsr->iPrevId);
4680: }
4681: break;
4682: }
4683: }
4684: }
4685: return bHit;
4686: }
4687:
4688: /*
4689: ** This function is called as the second part of each xNext operation when
4690: ** iterating through the results of a full-text query. At this point the
4691: ** cursor points to a row that matches the query expression, with the
4692: ** following caveats:
4693: **
4694: ** * Up until this point, "NEAR" operators in the expression have been
4695: ** treated as "AND".
4696: **
4697: ** * Deferred tokens have not yet been considered.
4698: **
4699: ** If *pRc is not SQLITE_OK when this function is called, it immediately
4700: ** returns 0. Otherwise, it tests whether or not after considering NEAR
4701: ** operators and deferred tokens the current row is still a match for the
4702: ** expression. It returns 1 if both of the following are true:
4703: **
4704: ** 1. *pRc is SQLITE_OK when this function returns, and
4705: **
4706: ** 2. After scanning the current FTS table row for the deferred tokens,
4707: ** it is determined that the row does *not* match the query.
4708: **
4709: ** Or, if no error occurs and it seems the current row does match the FTS
4710: ** query, return 0.
4711: */
4712: static int fts3EvalTestDeferredAndNear(Fts3Cursor *pCsr, int *pRc){
4713: int rc = *pRc;
4714: int bMiss = 0;
4715: if( rc==SQLITE_OK ){
4716:
4717: /* If there are one or more deferred tokens, load the current row into
4718: ** memory and scan it to determine the position list for each deferred
4719: ** token. Then, see if this row is really a match, considering deferred
4720: ** tokens and NEAR operators (neither of which were taken into account
4721: ** earlier, by fts3EvalNextRow()).
4722: */
4723: if( pCsr->pDeferred ){
4724: rc = fts3CursorSeek(0, pCsr);
4725: if( rc==SQLITE_OK ){
4726: rc = sqlite3Fts3CacheDeferredDoclists(pCsr);
4727: }
4728: }
4729: bMiss = (0==fts3EvalTestExpr(pCsr, pCsr->pExpr, &rc));
4730:
4731: /* Free the position-lists accumulated for each deferred token above. */
4732: sqlite3Fts3FreeDeferredDoclists(pCsr);
4733: *pRc = rc;
4734: }
4735: return (rc==SQLITE_OK && bMiss);
4736: }
4737:
4738: /*
4739: ** Advance to the next document that matches the FTS expression in
4740: ** Fts3Cursor.pExpr.
4741: */
4742: static int fts3EvalNext(Fts3Cursor *pCsr){
4743: int rc = SQLITE_OK; /* Return Code */
4744: Fts3Expr *pExpr = pCsr->pExpr;
4745: assert( pCsr->isEof==0 );
4746: if( pExpr==0 ){
4747: pCsr->isEof = 1;
4748: }else{
4749: do {
4750: if( pCsr->isRequireSeek==0 ){
4751: sqlite3_reset(pCsr->pStmt);
4752: }
4753: assert( sqlite3_data_count(pCsr->pStmt)==0 );
4754: fts3EvalNextRow(pCsr, pExpr, &rc);
4755: pCsr->isEof = pExpr->bEof;
4756: pCsr->isRequireSeek = 1;
4757: pCsr->isMatchinfoNeeded = 1;
4758: pCsr->iPrevId = pExpr->iDocid;
4759: }while( pCsr->isEof==0 && fts3EvalTestDeferredAndNear(pCsr, &rc) );
4760: }
4761: return rc;
4762: }
4763:
4764: /*
4765: ** Restart interation for expression pExpr so that the next call to
4766: ** fts3EvalNext() visits the first row. Do not allow incremental
4767: ** loading or merging of phrase doclists for this iteration.
4768: **
4769: ** If *pRc is other than SQLITE_OK when this function is called, it is
4770: ** a no-op. If an error occurs within this function, *pRc is set to an
4771: ** SQLite error code before returning.
4772: */
4773: static void fts3EvalRestart(
4774: Fts3Cursor *pCsr,
4775: Fts3Expr *pExpr,
4776: int *pRc
4777: ){
4778: if( pExpr && *pRc==SQLITE_OK ){
4779: Fts3Phrase *pPhrase = pExpr->pPhrase;
4780:
4781: if( pPhrase ){
4782: fts3EvalInvalidatePoslist(pPhrase);
4783: if( pPhrase->bIncr ){
4784: assert( pPhrase->nToken==1 );
4785: assert( pPhrase->aToken[0].pSegcsr );
4786: sqlite3Fts3MsrIncrRestart(pPhrase->aToken[0].pSegcsr);
4787: *pRc = fts3EvalPhraseStart(pCsr, 0, pPhrase);
4788: }
4789:
4790: pPhrase->doclist.pNextDocid = 0;
4791: pPhrase->doclist.iDocid = 0;
4792: }
4793:
4794: pExpr->iDocid = 0;
4795: pExpr->bEof = 0;
4796: pExpr->bStart = 0;
4797:
4798: fts3EvalRestart(pCsr, pExpr->pLeft, pRc);
4799: fts3EvalRestart(pCsr, pExpr->pRight, pRc);
4800: }
4801: }
4802:
4803: /*
4804: ** After allocating the Fts3Expr.aMI[] array for each phrase in the
4805: ** expression rooted at pExpr, the cursor iterates through all rows matched
4806: ** by pExpr, calling this function for each row. This function increments
4807: ** the values in Fts3Expr.aMI[] according to the position-list currently
4808: ** found in Fts3Expr.pPhrase->doclist.pList for each of the phrase
4809: ** expression nodes.
4810: */
4811: static void fts3EvalUpdateCounts(Fts3Expr *pExpr){
4812: if( pExpr ){
4813: Fts3Phrase *pPhrase = pExpr->pPhrase;
4814: if( pPhrase && pPhrase->doclist.pList ){
4815: int iCol = 0;
4816: char *p = pPhrase->doclist.pList;
4817:
4818: assert( *p );
4819: while( 1 ){
4820: u8 c = 0;
4821: int iCnt = 0;
4822: while( 0xFE & (*p | c) ){
4823: if( (c&0x80)==0 ) iCnt++;
4824: c = *p++ & 0x80;
4825: }
4826:
4827: /* aMI[iCol*3 + 1] = Number of occurrences
4828: ** aMI[iCol*3 + 2] = Number of rows containing at least one instance
4829: */
4830: pExpr->aMI[iCol*3 + 1] += iCnt;
4831: pExpr->aMI[iCol*3 + 2] += (iCnt>0);
4832: if( *p==0x00 ) break;
4833: p++;
4834: p += sqlite3Fts3GetVarint32(p, &iCol);
4835: }
4836: }
4837:
4838: fts3EvalUpdateCounts(pExpr->pLeft);
4839: fts3EvalUpdateCounts(pExpr->pRight);
4840: }
4841: }
4842:
4843: /*
4844: ** Expression pExpr must be of type FTSQUERY_PHRASE.
4845: **
4846: ** If it is not already allocated and populated, this function allocates and
4847: ** populates the Fts3Expr.aMI[] array for expression pExpr. If pExpr is part
4848: ** of a NEAR expression, then it also allocates and populates the same array
4849: ** for all other phrases that are part of the NEAR expression.
4850: **
4851: ** SQLITE_OK is returned if the aMI[] array is successfully allocated and
4852: ** populated. Otherwise, if an error occurs, an SQLite error code is returned.
4853: */
4854: static int fts3EvalGatherStats(
4855: Fts3Cursor *pCsr, /* Cursor object */
4856: Fts3Expr *pExpr /* FTSQUERY_PHRASE expression */
4857: ){
4858: int rc = SQLITE_OK; /* Return code */
4859:
4860: assert( pExpr->eType==FTSQUERY_PHRASE );
4861: if( pExpr->aMI==0 ){
4862: Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
4863: Fts3Expr *pRoot; /* Root of NEAR expression */
4864: Fts3Expr *p; /* Iterator used for several purposes */
4865:
4866: sqlite3_int64 iPrevId = pCsr->iPrevId;
4867: sqlite3_int64 iDocid;
4868: u8 bEof;
4869:
4870: /* Find the root of the NEAR expression */
4871: pRoot = pExpr;
4872: while( pRoot->pParent && pRoot->pParent->eType==FTSQUERY_NEAR ){
4873: pRoot = pRoot->pParent;
4874: }
4875: iDocid = pRoot->iDocid;
4876: bEof = pRoot->bEof;
4877: assert( pRoot->bStart );
4878:
4879: /* Allocate space for the aMSI[] array of each FTSQUERY_PHRASE node */
4880: for(p=pRoot; p; p=p->pLeft){
4881: Fts3Expr *pE = (p->eType==FTSQUERY_PHRASE?p:p->pRight);
4882: assert( pE->aMI==0 );
4883: pE->aMI = (u32 *)sqlite3_malloc(pTab->nColumn * 3 * sizeof(u32));
4884: if( !pE->aMI ) return SQLITE_NOMEM;
4885: memset(pE->aMI, 0, pTab->nColumn * 3 * sizeof(u32));
4886: }
4887:
4888: fts3EvalRestart(pCsr, pRoot, &rc);
4889:
4890: while( pCsr->isEof==0 && rc==SQLITE_OK ){
4891:
4892: do {
4893: /* Ensure the %_content statement is reset. */
4894: if( pCsr->isRequireSeek==0 ) sqlite3_reset(pCsr->pStmt);
4895: assert( sqlite3_data_count(pCsr->pStmt)==0 );
4896:
4897: /* Advance to the next document */
4898: fts3EvalNextRow(pCsr, pRoot, &rc);
4899: pCsr->isEof = pRoot->bEof;
4900: pCsr->isRequireSeek = 1;
4901: pCsr->isMatchinfoNeeded = 1;
4902: pCsr->iPrevId = pRoot->iDocid;
4903: }while( pCsr->isEof==0
4904: && pRoot->eType==FTSQUERY_NEAR
4905: && fts3EvalTestDeferredAndNear(pCsr, &rc)
4906: );
4907:
4908: if( rc==SQLITE_OK && pCsr->isEof==0 ){
4909: fts3EvalUpdateCounts(pRoot);
4910: }
4911: }
4912:
4913: pCsr->isEof = 0;
4914: pCsr->iPrevId = iPrevId;
4915:
4916: if( bEof ){
4917: pRoot->bEof = bEof;
4918: }else{
4919: /* Caution: pRoot may iterate through docids in ascending or descending
4920: ** order. For this reason, even though it seems more defensive, the
4921: ** do loop can not be written:
4922: **
4923: ** do {...} while( pRoot->iDocid<iDocid && rc==SQLITE_OK );
4924: */
4925: fts3EvalRestart(pCsr, pRoot, &rc);
4926: do {
4927: fts3EvalNextRow(pCsr, pRoot, &rc);
4928: assert( pRoot->bEof==0 );
4929: }while( pRoot->iDocid!=iDocid && rc==SQLITE_OK );
4930: fts3EvalTestDeferredAndNear(pCsr, &rc);
4931: }
4932: }
4933: return rc;
4934: }
4935:
4936: /*
4937: ** This function is used by the matchinfo() module to query a phrase
4938: ** expression node for the following information:
4939: **
4940: ** 1. The total number of occurrences of the phrase in each column of
4941: ** the FTS table (considering all rows), and
4942: **
4943: ** 2. For each column, the number of rows in the table for which the
4944: ** column contains at least one instance of the phrase.
4945: **
4946: ** If no error occurs, SQLITE_OK is returned and the values for each column
4947: ** written into the array aiOut as follows:
4948: **
4949: ** aiOut[iCol*3 + 1] = Number of occurrences
4950: ** aiOut[iCol*3 + 2] = Number of rows containing at least one instance
4951: **
4952: ** Caveats:
4953: **
4954: ** * If a phrase consists entirely of deferred tokens, then all output
4955: ** values are set to the number of documents in the table. In other
4956: ** words we assume that very common tokens occur exactly once in each
4957: ** column of each row of the table.
4958: **
4959: ** * If a phrase contains some deferred tokens (and some non-deferred
4960: ** tokens), count the potential occurrence identified by considering
4961: ** the non-deferred tokens instead of actual phrase occurrences.
4962: **
4963: ** * If the phrase is part of a NEAR expression, then only phrase instances
4964: ** that meet the NEAR constraint are included in the counts.
4965: */
4966: int sqlite3Fts3EvalPhraseStats(
4967: Fts3Cursor *pCsr, /* FTS cursor handle */
4968: Fts3Expr *pExpr, /* Phrase expression */
4969: u32 *aiOut /* Array to write results into (see above) */
4970: ){
4971: Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
4972: int rc = SQLITE_OK;
4973: int iCol;
4974:
4975: if( pExpr->bDeferred && pExpr->pParent->eType!=FTSQUERY_NEAR ){
4976: assert( pCsr->nDoc>0 );
4977: for(iCol=0; iCol<pTab->nColumn; iCol++){
4978: aiOut[iCol*3 + 1] = (u32)pCsr->nDoc;
4979: aiOut[iCol*3 + 2] = (u32)pCsr->nDoc;
4980: }
4981: }else{
4982: rc = fts3EvalGatherStats(pCsr, pExpr);
4983: if( rc==SQLITE_OK ){
4984: assert( pExpr->aMI );
4985: for(iCol=0; iCol<pTab->nColumn; iCol++){
4986: aiOut[iCol*3 + 1] = pExpr->aMI[iCol*3 + 1];
4987: aiOut[iCol*3 + 2] = pExpr->aMI[iCol*3 + 2];
4988: }
4989: }
4990: }
4991:
4992: return rc;
4993: }
4994:
4995: /*
4996: ** The expression pExpr passed as the second argument to this function
4997: ** must be of type FTSQUERY_PHRASE.
4998: **
4999: ** The returned value is either NULL or a pointer to a buffer containing
5000: ** a position-list indicating the occurrences of the phrase in column iCol
5001: ** of the current row.
5002: **
5003: ** More specifically, the returned buffer contains 1 varint for each
5004: ** occurence of the phrase in the column, stored using the normal (delta+2)
5005: ** compression and is terminated by either an 0x01 or 0x00 byte. For example,
5006: ** if the requested column contains "a b X c d X X" and the position-list
5007: ** for 'X' is requested, the buffer returned may contain:
5008: **
5009: ** 0x04 0x05 0x03 0x01 or 0x04 0x05 0x03 0x00
5010: **
5011: ** This function works regardless of whether or not the phrase is deferred,
5012: ** incremental, or neither.
5013: */
5014: char *sqlite3Fts3EvalPhrasePoslist(
5015: Fts3Cursor *pCsr, /* FTS3 cursor object */
5016: Fts3Expr *pExpr, /* Phrase to return doclist for */
5017: int iCol /* Column to return position list for */
5018: ){
5019: Fts3Phrase *pPhrase = pExpr->pPhrase;
5020: Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
5021: char *pIter = pPhrase->doclist.pList;
5022: int iThis;
5023:
5024: assert( iCol>=0 && iCol<pTab->nColumn );
5025: if( !pIter
5026: || pExpr->bEof
5027: || pExpr->iDocid!=pCsr->iPrevId
5028: || (pPhrase->iColumn<pTab->nColumn && pPhrase->iColumn!=iCol)
5029: ){
5030: return 0;
5031: }
5032:
5033: assert( pPhrase->doclist.nList>0 );
5034: if( *pIter==0x01 ){
5035: pIter++;
5036: pIter += sqlite3Fts3GetVarint32(pIter, &iThis);
5037: }else{
5038: iThis = 0;
5039: }
5040: while( iThis<iCol ){
5041: fts3ColumnlistCopy(0, &pIter);
5042: if( *pIter==0x00 ) return 0;
5043: pIter++;
5044: pIter += sqlite3Fts3GetVarint32(pIter, &iThis);
5045: }
5046:
5047: return ((iCol==iThis)?pIter:0);
5048: }
5049:
5050: /*
5051: ** Free all components of the Fts3Phrase structure that were allocated by
5052: ** the eval module. Specifically, this means to free:
5053: **
5054: ** * the contents of pPhrase->doclist, and
5055: ** * any Fts3MultiSegReader objects held by phrase tokens.
5056: */
5057: void sqlite3Fts3EvalPhraseCleanup(Fts3Phrase *pPhrase){
5058: if( pPhrase ){
5059: int i;
5060: sqlite3_free(pPhrase->doclist.aAll);
5061: fts3EvalInvalidatePoslist(pPhrase);
5062: memset(&pPhrase->doclist, 0, sizeof(Fts3Doclist));
5063: for(i=0; i<pPhrase->nToken; i++){
5064: fts3SegReaderCursorFree(pPhrase->aToken[i].pSegcsr);
5065: pPhrase->aToken[i].pSegcsr = 0;
5066: }
5067: }
5068: }
5069:
5070: /*
5071: ** Return SQLITE_CORRUPT_VTAB.
5072: */
5073: #ifdef SQLITE_DEBUG
5074: int sqlite3Fts3Corrupt(){
5075: return SQLITE_CORRUPT_VTAB;
5076: }
5077: #endif
5078:
5079: #if !SQLITE_CORE
5080: /*
5081: ** Initialize API pointer table, if required.
5082: */
5083: int sqlite3_extension_init(
5084: sqlite3 *db,
5085: char **pzErrMsg,
5086: const sqlite3_api_routines *pApi
5087: ){
5088: SQLITE_EXTENSION_INIT2(pApi)
5089: return sqlite3Fts3Init(db);
5090: }
5091: #endif
5092:
5093: #endif
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>