Annotation of embedaddon/sqlite3/ext/fts3/fts3Int.h, revision 1.1.1.1
1.1 misho 1: /*
2: ** 2009 Nov 12
3: **
4: ** The author disclaims copyright to this source code. In place of
5: ** a legal notice, here is a blessing:
6: **
7: ** May you do good and not evil.
8: ** May you find forgiveness for yourself and forgive others.
9: ** May you share freely, never taking more than you give.
10: **
11: ******************************************************************************
12: **
13: */
14: #ifndef _FTSINT_H
15: #define _FTSINT_H
16:
17: #if !defined(NDEBUG) && !defined(SQLITE_DEBUG)
18: # define NDEBUG 1
19: #endif
20:
21: /*
22: ** FTS4 is really an extension for FTS3. It is enabled using the
23: ** SQLITE_ENABLE_FTS3 macro. But to avoid confusion we also all
24: ** the SQLITE_ENABLE_FTS4 macro to serve as an alisse for SQLITE_ENABLE_FTS3.
25: */
26: #if defined(SQLITE_ENABLE_FTS4) && !defined(SQLITE_ENABLE_FTS3)
27: # define SQLITE_ENABLE_FTS3
28: #endif
29:
30: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
31:
32: /* If not building as part of the core, include sqlite3ext.h. */
33: #ifndef SQLITE_CORE
34: # include "sqlite3ext.h"
35: extern const sqlite3_api_routines *sqlite3_api;
36: #endif
37:
38: #include "sqlite3.h"
39: #include "fts3_tokenizer.h"
40: #include "fts3_hash.h"
41:
42: /*
43: ** This constant controls how often segments are merged. Once there are
44: ** FTS3_MERGE_COUNT segments of level N, they are merged into a single
45: ** segment of level N+1.
46: */
47: #define FTS3_MERGE_COUNT 16
48:
49: /*
50: ** This is the maximum amount of data (in bytes) to store in the
51: ** Fts3Table.pendingTerms hash table. Normally, the hash table is
52: ** populated as documents are inserted/updated/deleted in a transaction
53: ** and used to create a new segment when the transaction is committed.
54: ** However if this limit is reached midway through a transaction, a new
55: ** segment is created and the hash table cleared immediately.
56: */
57: #define FTS3_MAX_PENDING_DATA (1*1024*1024)
58:
59: /*
60: ** Macro to return the number of elements in an array. SQLite has a
61: ** similar macro called ArraySize(). Use a different name to avoid
62: ** a collision when building an amalgamation with built-in FTS3.
63: */
64: #define SizeofArray(X) ((int)(sizeof(X)/sizeof(X[0])))
65:
66:
67: #ifndef MIN
68: # define MIN(x,y) ((x)<(y)?(x):(y))
69: #endif
70:
71: /*
72: ** Maximum length of a varint encoded integer. The varint format is different
73: ** from that used by SQLite, so the maximum length is 10, not 9.
74: */
75: #define FTS3_VARINT_MAX 10
76:
77: /*
78: ** FTS4 virtual tables may maintain multiple indexes - one index of all terms
79: ** in the document set and zero or more prefix indexes. All indexes are stored
80: ** as one or more b+-trees in the %_segments and %_segdir tables.
81: **
82: ** It is possible to determine which index a b+-tree belongs to based on the
83: ** value stored in the "%_segdir.level" column. Given this value L, the index
84: ** that the b+-tree belongs to is (L<<10). In other words, all b+-trees with
85: ** level values between 0 and 1023 (inclusive) belong to index 0, all levels
86: ** between 1024 and 2047 to index 1, and so on.
87: **
88: ** It is considered impossible for an index to use more than 1024 levels. In
89: ** theory though this may happen, but only after at least
90: ** (FTS3_MERGE_COUNT^1024) separate flushes of the pending-terms tables.
91: */
92: #define FTS3_SEGDIR_MAXLEVEL 1024
93: #define FTS3_SEGDIR_MAXLEVEL_STR "1024"
94:
95: /*
96: ** The testcase() macro is only used by the amalgamation. If undefined,
97: ** make it a no-op.
98: */
99: #ifndef testcase
100: # define testcase(X)
101: #endif
102:
103: /*
104: ** Terminator values for position-lists and column-lists.
105: */
106: #define POS_COLUMN (1) /* Column-list terminator */
107: #define POS_END (0) /* Position-list terminator */
108:
109: /*
110: ** This section provides definitions to allow the
111: ** FTS3 extension to be compiled outside of the
112: ** amalgamation.
113: */
114: #ifndef SQLITE_AMALGAMATION
115: /*
116: ** Macros indicating that conditional expressions are always true or
117: ** false.
118: */
119: #ifdef SQLITE_COVERAGE_TEST
120: # define ALWAYS(x) (1)
121: # define NEVER(X) (0)
122: #else
123: # define ALWAYS(x) (x)
124: # define NEVER(X) (x)
125: #endif
126:
127: /*
128: ** Internal types used by SQLite.
129: */
130: typedef unsigned char u8; /* 1-byte (or larger) unsigned integer */
131: typedef short int i16; /* 2-byte (or larger) signed integer */
132: typedef unsigned int u32; /* 4-byte unsigned integer */
133: typedef sqlite3_uint64 u64; /* 8-byte unsigned integer */
134:
135: /*
136: ** Macro used to suppress compiler warnings for unused parameters.
137: */
138: #define UNUSED_PARAMETER(x) (void)(x)
139:
140: /*
141: ** Activate assert() only if SQLITE_TEST is enabled.
142: */
143: #if !defined(NDEBUG) && !defined(SQLITE_DEBUG)
144: # define NDEBUG 1
145: #endif
146:
147: /*
148: ** The TESTONLY macro is used to enclose variable declarations or
149: ** other bits of code that are needed to support the arguments
150: ** within testcase() and assert() macros.
151: */
152: #if defined(SQLITE_DEBUG) || defined(SQLITE_COVERAGE_TEST)
153: # define TESTONLY(X) X
154: #else
155: # define TESTONLY(X)
156: #endif
157:
158: #endif /* SQLITE_AMALGAMATION */
159:
160: #ifdef SQLITE_DEBUG
161: int sqlite3Fts3Corrupt(void);
162: # define FTS_CORRUPT_VTAB sqlite3Fts3Corrupt()
163: #else
164: # define FTS_CORRUPT_VTAB SQLITE_CORRUPT_VTAB
165: #endif
166:
167: typedef struct Fts3Table Fts3Table;
168: typedef struct Fts3Cursor Fts3Cursor;
169: typedef struct Fts3Expr Fts3Expr;
170: typedef struct Fts3Phrase Fts3Phrase;
171: typedef struct Fts3PhraseToken Fts3PhraseToken;
172:
173: typedef struct Fts3Doclist Fts3Doclist;
174: typedef struct Fts3SegFilter Fts3SegFilter;
175: typedef struct Fts3DeferredToken Fts3DeferredToken;
176: typedef struct Fts3SegReader Fts3SegReader;
177: typedef struct Fts3MultiSegReader Fts3MultiSegReader;
178:
179: /*
180: ** A connection to a fulltext index is an instance of the following
181: ** structure. The xCreate and xConnect methods create an instance
182: ** of this structure and xDestroy and xDisconnect free that instance.
183: ** All other methods receive a pointer to the structure as one of their
184: ** arguments.
185: */
186: struct Fts3Table {
187: sqlite3_vtab base; /* Base class used by SQLite core */
188: sqlite3 *db; /* The database connection */
189: const char *zDb; /* logical database name */
190: const char *zName; /* virtual table name */
191: int nColumn; /* number of named columns in virtual table */
192: char **azColumn; /* column names. malloced */
193: sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */
194: char *zContentTbl; /* content=xxx option, or NULL */
195:
196: /* Precompiled statements used by the implementation. Each of these
197: ** statements is run and reset within a single virtual table API call.
198: */
199: sqlite3_stmt *aStmt[27];
200:
201: char *zReadExprlist;
202: char *zWriteExprlist;
203:
204: int nNodeSize; /* Soft limit for node size */
205: u8 bHasStat; /* True if %_stat table exists */
206: u8 bHasDocsize; /* True if %_docsize table exists */
207: u8 bDescIdx; /* True if doclists are in reverse order */
208: int nPgsz; /* Page size for host database */
209: char *zSegmentsTbl; /* Name of %_segments table */
210: sqlite3_blob *pSegments; /* Blob handle open on %_segments table */
211:
212: /* TODO: Fix the first paragraph of this comment.
213: **
214: ** The following hash table is used to buffer pending index updates during
215: ** transactions. Variable nPendingData estimates the memory size of the
216: ** pending data, including hash table overhead, but not malloc overhead.
217: ** When nPendingData exceeds nMaxPendingData, the buffer is flushed
218: ** automatically. Variable iPrevDocid is the docid of the most recently
219: ** inserted record.
220: **
221: ** A single FTS4 table may have multiple full-text indexes. For each index
222: ** there is an entry in the aIndex[] array. Index 0 is an index of all the
223: ** terms that appear in the document set. Each subsequent index in aIndex[]
224: ** is an index of prefixes of a specific length.
225: */
226: int nIndex; /* Size of aIndex[] */
227: struct Fts3Index {
228: int nPrefix; /* Prefix length (0 for main terms index) */
229: Fts3Hash hPending; /* Pending terms table for this index */
230: } *aIndex;
231: int nMaxPendingData; /* Max pending data before flush to disk */
232: int nPendingData; /* Current bytes of pending data */
233: sqlite_int64 iPrevDocid; /* Docid of most recently inserted document */
234:
235: #if defined(SQLITE_DEBUG) || defined(SQLITE_COVERAGE_TEST)
236: /* State variables used for validating that the transaction control
237: ** methods of the virtual table are called at appropriate times. These
238: ** values do not contribution to the FTS computation; they are used for
239: ** verifying the SQLite core.
240: */
241: int inTransaction; /* True after xBegin but before xCommit/xRollback */
242: int mxSavepoint; /* Largest valid xSavepoint integer */
243: #endif
244: };
245:
246: /*
247: ** When the core wants to read from the virtual table, it creates a
248: ** virtual table cursor (an instance of the following structure) using
249: ** the xOpen method. Cursors are destroyed using the xClose method.
250: */
251: struct Fts3Cursor {
252: sqlite3_vtab_cursor base; /* Base class used by SQLite core */
253: i16 eSearch; /* Search strategy (see below) */
254: u8 isEof; /* True if at End Of Results */
255: u8 isRequireSeek; /* True if must seek pStmt to %_content row */
256: sqlite3_stmt *pStmt; /* Prepared statement in use by the cursor */
257: Fts3Expr *pExpr; /* Parsed MATCH query string */
258: int nPhrase; /* Number of matchable phrases in query */
259: Fts3DeferredToken *pDeferred; /* Deferred search tokens, if any */
260: sqlite3_int64 iPrevId; /* Previous id read from aDoclist */
261: char *pNextId; /* Pointer into the body of aDoclist */
262: char *aDoclist; /* List of docids for full-text queries */
263: int nDoclist; /* Size of buffer at aDoclist */
264: u8 bDesc; /* True to sort in descending order */
265: int eEvalmode; /* An FTS3_EVAL_XX constant */
266: int nRowAvg; /* Average size of database rows, in pages */
267: sqlite3_int64 nDoc; /* Documents in table */
268:
269: int isMatchinfoNeeded; /* True when aMatchinfo[] needs filling in */
270: u32 *aMatchinfo; /* Information about most recent match */
271: int nMatchinfo; /* Number of elements in aMatchinfo[] */
272: char *zMatchinfo; /* Matchinfo specification */
273: };
274:
275: #define FTS3_EVAL_FILTER 0
276: #define FTS3_EVAL_NEXT 1
277: #define FTS3_EVAL_MATCHINFO 2
278:
279: /*
280: ** The Fts3Cursor.eSearch member is always set to one of the following.
281: ** Actualy, Fts3Cursor.eSearch can be greater than or equal to
282: ** FTS3_FULLTEXT_SEARCH. If so, then Fts3Cursor.eSearch - 2 is the index
283: ** of the column to be searched. For example, in
284: **
285: ** CREATE VIRTUAL TABLE ex1 USING fts3(a,b,c,d);
286: ** SELECT docid FROM ex1 WHERE b MATCH 'one two three';
287: **
288: ** Because the LHS of the MATCH operator is 2nd column "b",
289: ** Fts3Cursor.eSearch will be set to FTS3_FULLTEXT_SEARCH+1. (+0 for a,
290: ** +1 for b, +2 for c, +3 for d.) If the LHS of MATCH were "ex1"
291: ** indicating that all columns should be searched,
292: ** then eSearch would be set to FTS3_FULLTEXT_SEARCH+4.
293: */
294: #define FTS3_FULLSCAN_SEARCH 0 /* Linear scan of %_content table */
295: #define FTS3_DOCID_SEARCH 1 /* Lookup by rowid on %_content table */
296: #define FTS3_FULLTEXT_SEARCH 2 /* Full-text index search */
297:
298:
299: struct Fts3Doclist {
300: char *aAll; /* Array containing doclist (or NULL) */
301: int nAll; /* Size of a[] in bytes */
302: char *pNextDocid; /* Pointer to next docid */
303:
304: sqlite3_int64 iDocid; /* Current docid (if pList!=0) */
305: int bFreeList; /* True if pList should be sqlite3_free()d */
306: char *pList; /* Pointer to position list following iDocid */
307: int nList; /* Length of position list */
308: };
309:
310: /*
311: ** A "phrase" is a sequence of one or more tokens that must match in
312: ** sequence. A single token is the base case and the most common case.
313: ** For a sequence of tokens contained in double-quotes (i.e. "one two three")
314: ** nToken will be the number of tokens in the string.
315: */
316: struct Fts3PhraseToken {
317: char *z; /* Text of the token */
318: int n; /* Number of bytes in buffer z */
319: int isPrefix; /* True if token ends with a "*" character */
320: int bFirst; /* True if token must appear at position 0 */
321:
322: /* Variables above this point are populated when the expression is
323: ** parsed (by code in fts3_expr.c). Below this point the variables are
324: ** used when evaluating the expression. */
325: Fts3DeferredToken *pDeferred; /* Deferred token object for this token */
326: Fts3MultiSegReader *pSegcsr; /* Segment-reader for this token */
327: };
328:
329: struct Fts3Phrase {
330: /* Cache of doclist for this phrase. */
331: Fts3Doclist doclist;
332: int bIncr; /* True if doclist is loaded incrementally */
333: int iDoclistToken;
334:
335: /* Variables below this point are populated by fts3_expr.c when parsing
336: ** a MATCH expression. Everything above is part of the evaluation phase.
337: */
338: int nToken; /* Number of tokens in the phrase */
339: int iColumn; /* Index of column this phrase must match */
340: Fts3PhraseToken aToken[1]; /* One entry for each token in the phrase */
341: };
342:
343: /*
344: ** A tree of these objects forms the RHS of a MATCH operator.
345: **
346: ** If Fts3Expr.eType is FTSQUERY_PHRASE and isLoaded is true, then aDoclist
347: ** points to a malloced buffer, size nDoclist bytes, containing the results
348: ** of this phrase query in FTS3 doclist format. As usual, the initial
349: ** "Length" field found in doclists stored on disk is omitted from this
350: ** buffer.
351: **
352: ** Variable aMI is used only for FTSQUERY_NEAR nodes to store the global
353: ** matchinfo data. If it is not NULL, it points to an array of size nCol*3,
354: ** where nCol is the number of columns in the queried FTS table. The array
355: ** is populated as follows:
356: **
357: ** aMI[iCol*3 + 0] = Undefined
358: ** aMI[iCol*3 + 1] = Number of occurrences
359: ** aMI[iCol*3 + 2] = Number of rows containing at least one instance
360: **
361: ** The aMI array is allocated using sqlite3_malloc(). It should be freed
362: ** when the expression node is.
363: */
364: struct Fts3Expr {
365: int eType; /* One of the FTSQUERY_XXX values defined below */
366: int nNear; /* Valid if eType==FTSQUERY_NEAR */
367: Fts3Expr *pParent; /* pParent->pLeft==this or pParent->pRight==this */
368: Fts3Expr *pLeft; /* Left operand */
369: Fts3Expr *pRight; /* Right operand */
370: Fts3Phrase *pPhrase; /* Valid if eType==FTSQUERY_PHRASE */
371:
372: /* The following are used by the fts3_eval.c module. */
373: sqlite3_int64 iDocid; /* Current docid */
374: u8 bEof; /* True this expression is at EOF already */
375: u8 bStart; /* True if iDocid is valid */
376: u8 bDeferred; /* True if this expression is entirely deferred */
377:
378: u32 *aMI;
379: };
380:
381: /*
382: ** Candidate values for Fts3Query.eType. Note that the order of the first
383: ** four values is in order of precedence when parsing expressions. For
384: ** example, the following:
385: **
386: ** "a OR b AND c NOT d NEAR e"
387: **
388: ** is equivalent to:
389: **
390: ** "a OR (b AND (c NOT (d NEAR e)))"
391: */
392: #define FTSQUERY_NEAR 1
393: #define FTSQUERY_NOT 2
394: #define FTSQUERY_AND 3
395: #define FTSQUERY_OR 4
396: #define FTSQUERY_PHRASE 5
397:
398:
399: /* fts3_write.c */
400: int sqlite3Fts3UpdateMethod(sqlite3_vtab*,int,sqlite3_value**,sqlite3_int64*);
401: int sqlite3Fts3PendingTermsFlush(Fts3Table *);
402: void sqlite3Fts3PendingTermsClear(Fts3Table *);
403: int sqlite3Fts3Optimize(Fts3Table *);
404: int sqlite3Fts3SegReaderNew(int, sqlite3_int64,
405: sqlite3_int64, sqlite3_int64, const char *, int, Fts3SegReader**);
406: int sqlite3Fts3SegReaderPending(
407: Fts3Table*,int,const char*,int,int,Fts3SegReader**);
408: void sqlite3Fts3SegReaderFree(Fts3SegReader *);
409: int sqlite3Fts3AllSegdirs(Fts3Table*, int, int, sqlite3_stmt **);
410: int sqlite3Fts3ReadLock(Fts3Table *);
411: int sqlite3Fts3ReadBlock(Fts3Table*, sqlite3_int64, char **, int*, int*);
412:
413: int sqlite3Fts3SelectDoctotal(Fts3Table *, sqlite3_stmt **);
414: int sqlite3Fts3SelectDocsize(Fts3Table *, sqlite3_int64, sqlite3_stmt **);
415:
416: void sqlite3Fts3FreeDeferredTokens(Fts3Cursor *);
417: int sqlite3Fts3DeferToken(Fts3Cursor *, Fts3PhraseToken *, int);
418: int sqlite3Fts3CacheDeferredDoclists(Fts3Cursor *);
419: void sqlite3Fts3FreeDeferredDoclists(Fts3Cursor *);
420: void sqlite3Fts3SegmentsClose(Fts3Table *);
421:
422: /* Special values interpreted by sqlite3SegReaderCursor() */
423: #define FTS3_SEGCURSOR_PENDING -1
424: #define FTS3_SEGCURSOR_ALL -2
425:
426: int sqlite3Fts3SegReaderStart(Fts3Table*, Fts3MultiSegReader*, Fts3SegFilter*);
427: int sqlite3Fts3SegReaderStep(Fts3Table *, Fts3MultiSegReader *);
428: void sqlite3Fts3SegReaderFinish(Fts3MultiSegReader *);
429:
430: int sqlite3Fts3SegReaderCursor(
431: Fts3Table *, int, int, const char *, int, int, int, Fts3MultiSegReader *);
432:
433: /* Flags allowed as part of the 4th argument to SegmentReaderIterate() */
434: #define FTS3_SEGMENT_REQUIRE_POS 0x00000001
435: #define FTS3_SEGMENT_IGNORE_EMPTY 0x00000002
436: #define FTS3_SEGMENT_COLUMN_FILTER 0x00000004
437: #define FTS3_SEGMENT_PREFIX 0x00000008
438: #define FTS3_SEGMENT_SCAN 0x00000010
439: #define FTS3_SEGMENT_FIRST 0x00000020
440:
441: /* Type passed as 4th argument to SegmentReaderIterate() */
442: struct Fts3SegFilter {
443: const char *zTerm;
444: int nTerm;
445: int iCol;
446: int flags;
447: };
448:
449: struct Fts3MultiSegReader {
450: /* Used internally by sqlite3Fts3SegReaderXXX() calls */
451: Fts3SegReader **apSegment; /* Array of Fts3SegReader objects */
452: int nSegment; /* Size of apSegment array */
453: int nAdvance; /* How many seg-readers to advance */
454: Fts3SegFilter *pFilter; /* Pointer to filter object */
455: char *aBuffer; /* Buffer to merge doclists in */
456: int nBuffer; /* Allocated size of aBuffer[] in bytes */
457:
458: int iColFilter; /* If >=0, filter for this column */
459: int bRestart;
460:
461: /* Used by fts3.c only. */
462: int nCost; /* Cost of running iterator */
463: int bLookup; /* True if a lookup of a single entry. */
464:
465: /* Output values. Valid only after Fts3SegReaderStep() returns SQLITE_ROW. */
466: char *zTerm; /* Pointer to term buffer */
467: int nTerm; /* Size of zTerm in bytes */
468: char *aDoclist; /* Pointer to doclist buffer */
469: int nDoclist; /* Size of aDoclist[] in bytes */
470: };
471:
472: /* fts3.c */
473: int sqlite3Fts3PutVarint(char *, sqlite3_int64);
474: int sqlite3Fts3GetVarint(const char *, sqlite_int64 *);
475: int sqlite3Fts3GetVarint32(const char *, int *);
476: int sqlite3Fts3VarintLen(sqlite3_uint64);
477: void sqlite3Fts3Dequote(char *);
478: void sqlite3Fts3DoclistPrev(int,char*,int,char**,sqlite3_int64*,int*,u8*);
479: int sqlite3Fts3EvalPhraseStats(Fts3Cursor *, Fts3Expr *, u32 *);
480: int sqlite3Fts3FirstFilter(sqlite3_int64, char *, int, char *);
481:
482: /* fts3_tokenizer.c */
483: const char *sqlite3Fts3NextToken(const char *, int *);
484: int sqlite3Fts3InitHashTable(sqlite3 *, Fts3Hash *, const char *);
485: int sqlite3Fts3InitTokenizer(Fts3Hash *pHash, const char *,
486: sqlite3_tokenizer **, char **
487: );
488: int sqlite3Fts3IsIdChar(char);
489:
490: /* fts3_snippet.c */
491: void sqlite3Fts3Offsets(sqlite3_context*, Fts3Cursor*);
492: void sqlite3Fts3Snippet(sqlite3_context *, Fts3Cursor *, const char *,
493: const char *, const char *, int, int
494: );
495: void sqlite3Fts3Matchinfo(sqlite3_context *, Fts3Cursor *, const char *);
496:
497: /* fts3_expr.c */
498: int sqlite3Fts3ExprParse(sqlite3_tokenizer *,
499: char **, int, int, int, const char *, int, Fts3Expr **
500: );
501: void sqlite3Fts3ExprFree(Fts3Expr *);
502: #ifdef SQLITE_TEST
503: int sqlite3Fts3ExprInitTestInterface(sqlite3 *db);
504: int sqlite3Fts3InitTerm(sqlite3 *db);
505: #endif
506:
507: /* fts3_aux.c */
508: int sqlite3Fts3InitAux(sqlite3 *db);
509:
510: void sqlite3Fts3EvalPhraseCleanup(Fts3Phrase *);
511:
512: int sqlite3Fts3MsrIncrStart(
513: Fts3Table*, Fts3MultiSegReader*, int, const char*, int);
514: int sqlite3Fts3MsrIncrNext(
515: Fts3Table *, Fts3MultiSegReader *, sqlite3_int64 *, char **, int *);
516: char *sqlite3Fts3EvalPhrasePoslist(Fts3Cursor *, Fts3Expr *, int iCol);
517: int sqlite3Fts3MsrOvfl(Fts3Cursor *, Fts3MultiSegReader *, int *);
518: int sqlite3Fts3MsrIncrRestart(Fts3MultiSegReader *pCsr);
519:
520: int sqlite3Fts3DeferredTokenList(Fts3DeferredToken *, char **, int *);
521:
522: #endif /* !SQLITE_CORE || SQLITE_ENABLE_FTS3 */
523: #endif /* _FTSINT_H */
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>