Annotation of embedaddon/sqlite3/ext/fts3/fts3_snippet.c, revision 1.1.1.1
1.1 misho 1: /*
2: ** 2009 Oct 23
3: **
4: ** The author disclaims copyright to this source code. In place of
5: ** a legal notice, here is a blessing:
6: **
7: ** May you do good and not evil.
8: ** May you find forgiveness for yourself and forgive others.
9: ** May you share freely, never taking more than you give.
10: **
11: ******************************************************************************
12: */
13:
14: #include "fts3Int.h"
15: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
16:
17: #include <string.h>
18: #include <assert.h>
19:
/*
** Request characters recognised in the format string that is passed as
** the second argument to matchinfo(). Each character selects one group
** of 32-bit values in the output; the trailing comment on each line gives
** the number of values in that group.
*/
#define FTS3_MATCHINFO_NPHRASE   'p'        /* 1 value */
#define FTS3_MATCHINFO_NCOL      'c'        /* 1 value */
#define FTS3_MATCHINFO_NDOC      'n'        /* 1 value */
#define FTS3_MATCHINFO_AVGLENGTH 'a'        /* nCol values */
#define FTS3_MATCHINFO_LENGTH    'l'        /* nCol values */
#define FTS3_MATCHINFO_LCS       's'        /* nCol values */
#define FTS3_MATCHINFO_HITS      'x'        /* 3*nCol*nPhrase values */

/*
** Format string used by matchinfo() when the caller does not supply
** a second argument.
*/
#define FTS3_MATCHINFO_DEFAULT   "pcx"
35:
36:
37: /*
38: ** Used as an fts3ExprIterate() context when loading phrase doclists to
39: ** Fts3Expr.aDoclist[]/nDoclist.
40: */
41: typedef struct LoadDoclistCtx LoadDoclistCtx;
42: struct LoadDoclistCtx {
43: Fts3Cursor *pCsr; /* FTS3 Cursor */
44: int nPhrase; /* Number of phrases seen so far */
45: int nToken; /* Number of tokens seen so far */
46: };
47:
48: /*
49: ** The following types are used as part of the implementation of the
50: ** fts3BestSnippet() routine.
51: */
52: typedef struct SnippetIter SnippetIter;
53: typedef struct SnippetPhrase SnippetPhrase;
54: typedef struct SnippetFragment SnippetFragment;
55:
56: struct SnippetIter {
57: Fts3Cursor *pCsr; /* Cursor snippet is being generated from */
58: int iCol; /* Extract snippet from this column */
59: int nSnippet; /* Requested snippet length (in tokens) */
60: int nPhrase; /* Number of phrases in query */
61: SnippetPhrase *aPhrase; /* Array of size nPhrase */
62: int iCurrent; /* First token of current snippet */
63: };
64:
65: struct SnippetPhrase {
66: int nToken; /* Number of tokens in phrase */
67: char *pList; /* Pointer to start of phrase position list */
68: int iHead; /* Next value in position list */
69: char *pHead; /* Position list data following iHead */
70: int iTail; /* Next value in trailing position list */
71: char *pTail; /* Position list data following iTail */
72: };
73:
74: struct SnippetFragment {
75: int iCol; /* Column snippet is extracted from */
76: int iPos; /* Index of first token in snippet */
77: u64 covered; /* Mask of query phrases covered */
78: u64 hlmask; /* Mask of snippet terms to highlight */
79: };
80:
81: /*
82: ** This type is used as an fts3ExprIterate() context object while
83: ** accumulating the data returned by the matchinfo() function.
84: */
85: typedef struct MatchInfo MatchInfo;
86: struct MatchInfo {
87: Fts3Cursor *pCursor; /* FTS3 Cursor */
88: int nCol; /* Number of columns in table */
89: int nPhrase; /* Number of matchable phrases in query */
90: sqlite3_int64 nDoc; /* Number of docs in database */
91: u32 *aMatchinfo; /* Pre-allocated buffer */
92: };
93:
94:
95:
/*
** Growable text buffer. snippet() and offsets() build their text results
** in one of these; fts3StringAppend() is the routine that adds to it.
*/
typedef struct StrBuffer {
  char *z;                        /* Buffer holding the string */
  int n;                          /* Bytes used in z, excluding the nul */
  int nAlloc;                     /* Bytes allocated at z */
} StrBuffer;
107:
108:
/*
** Step a position-list cursor forward by one entry.
**
** A position list holds unique integers in ascending order. Each entry is
** stored as an FTS3 varint whose value is (current - previous + 2). The
** list
**
**     4 9 113
**
** is therefore stored as the varints
**
**     6 7 106
**
** On entry *pp points at the varint of an entry and *piPos holds the value
** of the entry before it. On return *piPos holds the value of the entry
** just decoded and *pp points at the varint that follows it.
*/
static void fts3GetDeltaPosition(char **pp, int *piPos){
  int iDelta;                     /* Decoded varint: delta plus two */
  int nByte = sqlite3Fts3GetVarint32(*pp, &iDelta);
  *pp += nByte;
  *piPos += iDelta - 2;
}
134:
135: /*
136: ** Helper function for fts3ExprIterate() (see below).
137: */
138: static int fts3ExprIterate2(
139: Fts3Expr *pExpr, /* Expression to iterate phrases of */
140: int *piPhrase, /* Pointer to phrase counter */
141: int (*x)(Fts3Expr*,int,void*), /* Callback function to invoke for phrases */
142: void *pCtx /* Second argument to pass to callback */
143: ){
144: int rc; /* Return code */
145: int eType = pExpr->eType; /* Type of expression node pExpr */
146:
147: if( eType!=FTSQUERY_PHRASE ){
148: assert( pExpr->pLeft && pExpr->pRight );
149: rc = fts3ExprIterate2(pExpr->pLeft, piPhrase, x, pCtx);
150: if( rc==SQLITE_OK && eType!=FTSQUERY_NOT ){
151: rc = fts3ExprIterate2(pExpr->pRight, piPhrase, x, pCtx);
152: }
153: }else{
154: rc = x(pExpr, *piPhrase, pCtx);
155: (*piPhrase)++;
156: }
157: return rc;
158: }
159:
160: /*
161: ** Iterate through all phrase nodes in an FTS3 query, except those that
162: ** are part of a sub-tree that is the right-hand-side of a NOT operator.
163: ** For each phrase node found, the supplied callback function is invoked.
164: **
165: ** If the callback function returns anything other than SQLITE_OK,
166: ** the iteration is abandoned and the error code returned immediately.
167: ** Otherwise, SQLITE_OK is returned after a callback has been made for
168: ** all eligible phrase nodes.
169: */
170: static int fts3ExprIterate(
171: Fts3Expr *pExpr, /* Expression to iterate phrases of */
172: int (*x)(Fts3Expr*,int,void*), /* Callback function to invoke for phrases */
173: void *pCtx /* Second argument to pass to callback */
174: ){
175: int iPhrase = 0; /* Variable used as the phrase counter */
176: return fts3ExprIterate2(pExpr, &iPhrase, x, pCtx);
177: }
178:
179: /*
180: ** This is an fts3ExprIterate() callback used while loading the doclists
181: ** for each phrase into Fts3Expr.aDoclist[]/nDoclist. See also
182: ** fts3ExprLoadDoclists().
183: */
184: static int fts3ExprLoadDoclistsCb(Fts3Expr *pExpr, int iPhrase, void *ctx){
185: int rc = SQLITE_OK;
186: Fts3Phrase *pPhrase = pExpr->pPhrase;
187: LoadDoclistCtx *p = (LoadDoclistCtx *)ctx;
188:
189: UNUSED_PARAMETER(iPhrase);
190:
191: p->nPhrase++;
192: p->nToken += pPhrase->nToken;
193:
194: return rc;
195: }
196:
197: /*
198: ** Load the doclists for each phrase in the query associated with FTS3 cursor
199: ** pCsr.
200: **
201: ** If pnPhrase is not NULL, then *pnPhrase is set to the number of matchable
202: ** phrases in the expression (all phrases except those directly or
203: ** indirectly descended from the right-hand-side of a NOT operator). If
204: ** pnToken is not NULL, then it is set to the number of tokens in all
205: ** matchable phrases of the expression.
206: */
207: static int fts3ExprLoadDoclists(
208: Fts3Cursor *pCsr, /* Fts3 cursor for current query */
209: int *pnPhrase, /* OUT: Number of phrases in query */
210: int *pnToken /* OUT: Number of tokens in query */
211: ){
212: int rc; /* Return Code */
213: LoadDoclistCtx sCtx = {0,0,0}; /* Context for fts3ExprIterate() */
214: sCtx.pCsr = pCsr;
215: rc = fts3ExprIterate(pCsr->pExpr, fts3ExprLoadDoclistsCb, (void *)&sCtx);
216: if( pnPhrase ) *pnPhrase = sCtx.nPhrase;
217: if( pnToken ) *pnToken = sCtx.nToken;
218: return rc;
219: }
220:
221: static int fts3ExprPhraseCountCb(Fts3Expr *pExpr, int iPhrase, void *ctx){
222: (*(int *)ctx)++;
223: UNUSED_PARAMETER(pExpr);
224: UNUSED_PARAMETER(iPhrase);
225: return SQLITE_OK;
226: }
227: static int fts3ExprPhraseCount(Fts3Expr *pExpr){
228: int nPhrase = 0;
229: (void)fts3ExprIterate(pExpr, fts3ExprPhraseCountCb, (void *)&nPhrase);
230: return nPhrase;
231: }
232:
/*
** Move the position-list iterator (*ppIter, *piIter) forward until its
** current value is greater than or equal to iNext.
**
** If the end of the list is reached first, *ppIter is set to NULL and
** *piIter to -1. An iterator whose *ppIter is already NULL is left
** unchanged.
*/
static void fts3SnippetAdvance(char **ppIter, int *piIter, int iNext){
  char *pList = *ppIter;          /* Read pointer into the position list */
  int iPos;                       /* Current position value */

  if( pList==0 ) return;

  iPos = *piIter;
  while( iPos<iNext ){
    if( (*pList & 0xFE)==0 ){
      /* A 0x00 or 0x01 byte terminates the list for this column. */
      pList = 0;
      iPos = -1;
      break;
    }
    fts3GetDeltaPosition(&pList, &iPos);
  }

  *ppIter = pList;
  *piIter = iPos;
}
256:
257: /*
258: ** Advance the snippet iterator to the next candidate snippet.
259: */
260: static int fts3SnippetNextCandidate(SnippetIter *pIter){
261: int i; /* Loop counter */
262:
263: if( pIter->iCurrent<0 ){
264: /* The SnippetIter object has just been initialized. The first snippet
265: ** candidate always starts at offset 0 (even if this candidate has a
266: ** score of 0.0).
267: */
268: pIter->iCurrent = 0;
269:
270: /* Advance the 'head' iterator of each phrase to the first offset that
271: ** is greater than or equal to (iNext+nSnippet).
272: */
273: for(i=0; i<pIter->nPhrase; i++){
274: SnippetPhrase *pPhrase = &pIter->aPhrase[i];
275: fts3SnippetAdvance(&pPhrase->pHead, &pPhrase->iHead, pIter->nSnippet);
276: }
277: }else{
278: int iStart;
279: int iEnd = 0x7FFFFFFF;
280:
281: for(i=0; i<pIter->nPhrase; i++){
282: SnippetPhrase *pPhrase = &pIter->aPhrase[i];
283: if( pPhrase->pHead && pPhrase->iHead<iEnd ){
284: iEnd = pPhrase->iHead;
285: }
286: }
287: if( iEnd==0x7FFFFFFF ){
288: return 1;
289: }
290:
291: pIter->iCurrent = iStart = iEnd - pIter->nSnippet + 1;
292: for(i=0; i<pIter->nPhrase; i++){
293: SnippetPhrase *pPhrase = &pIter->aPhrase[i];
294: fts3SnippetAdvance(&pPhrase->pHead, &pPhrase->iHead, iEnd+1);
295: fts3SnippetAdvance(&pPhrase->pTail, &pPhrase->iTail, iStart);
296: }
297: }
298:
299: return 0;
300: }
301:
302: /*
303: ** Retrieve information about the current candidate snippet of snippet
304: ** iterator pIter.
305: */
306: static void fts3SnippetDetails(
307: SnippetIter *pIter, /* Snippet iterator */
308: u64 mCovered, /* Bitmask of phrases already covered */
309: int *piToken, /* OUT: First token of proposed snippet */
310: int *piScore, /* OUT: "Score" for this snippet */
311: u64 *pmCover, /* OUT: Bitmask of phrases covered */
312: u64 *pmHighlight /* OUT: Bitmask of terms to highlight */
313: ){
314: int iStart = pIter->iCurrent; /* First token of snippet */
315: int iScore = 0; /* Score of this snippet */
316: int i; /* Loop counter */
317: u64 mCover = 0; /* Mask of phrases covered by this snippet */
318: u64 mHighlight = 0; /* Mask of tokens to highlight in snippet */
319:
320: for(i=0; i<pIter->nPhrase; i++){
321: SnippetPhrase *pPhrase = &pIter->aPhrase[i];
322: if( pPhrase->pTail ){
323: char *pCsr = pPhrase->pTail;
324: int iCsr = pPhrase->iTail;
325:
326: while( iCsr<(iStart+pIter->nSnippet) ){
327: int j;
328: u64 mPhrase = (u64)1 << i;
329: u64 mPos = (u64)1 << (iCsr - iStart);
330: assert( iCsr>=iStart );
331: if( (mCover|mCovered)&mPhrase ){
332: iScore++;
333: }else{
334: iScore += 1000;
335: }
336: mCover |= mPhrase;
337:
338: for(j=0; j<pPhrase->nToken; j++){
339: mHighlight |= (mPos>>j);
340: }
341:
342: if( 0==(*pCsr & 0x0FE) ) break;
343: fts3GetDeltaPosition(&pCsr, &iCsr);
344: }
345: }
346: }
347:
348: /* Set the output variables before returning. */
349: *piToken = iStart;
350: *piScore = iScore;
351: *pmCover = mCover;
352: *pmHighlight = mHighlight;
353: }
354:
355: /*
356: ** This function is an fts3ExprIterate() callback used by fts3BestSnippet().
357: ** Each invocation populates an element of the SnippetIter.aPhrase[] array.
358: */
359: static int fts3SnippetFindPositions(Fts3Expr *pExpr, int iPhrase, void *ctx){
360: SnippetIter *p = (SnippetIter *)ctx;
361: SnippetPhrase *pPhrase = &p->aPhrase[iPhrase];
362: char *pCsr;
363:
364: pPhrase->nToken = pExpr->pPhrase->nToken;
365:
366: pCsr = sqlite3Fts3EvalPhrasePoslist(p->pCsr, pExpr, p->iCol);
367: if( pCsr ){
368: int iFirst = 0;
369: pPhrase->pList = pCsr;
370: fts3GetDeltaPosition(&pCsr, &iFirst);
371: assert( iFirst>=0 );
372: pPhrase->pHead = pCsr;
373: pPhrase->pTail = pCsr;
374: pPhrase->iHead = iFirst;
375: pPhrase->iTail = iFirst;
376: }else{
377: assert( pPhrase->pList==0 && pPhrase->pHead==0 && pPhrase->pTail==0 );
378: }
379:
380: return SQLITE_OK;
381: }
382:
383: /*
384: ** Select the fragment of text consisting of nFragment contiguous tokens
385: ** from column iCol that represent the "best" snippet. The best snippet
386: ** is the snippet with the highest score, where scores are calculated
387: ** by adding:
388: **
389: ** (a) +1 point for each occurence of a matchable phrase in the snippet.
390: **
391: ** (b) +1000 points for the first occurence of each matchable phrase in
392: ** the snippet for which the corresponding mCovered bit is not set.
393: **
394: ** The selected snippet parameters are stored in structure *pFragment before
395: ** returning. The score of the selected snippet is stored in *piScore
396: ** before returning.
397: */
398: static int fts3BestSnippet(
399: int nSnippet, /* Desired snippet length */
400: Fts3Cursor *pCsr, /* Cursor to create snippet for */
401: int iCol, /* Index of column to create snippet from */
402: u64 mCovered, /* Mask of phrases already covered */
403: u64 *pmSeen, /* IN/OUT: Mask of phrases seen */
404: SnippetFragment *pFragment, /* OUT: Best snippet found */
405: int *piScore /* OUT: Score of snippet pFragment */
406: ){
407: int rc; /* Return Code */
408: int nList; /* Number of phrases in expression */
409: SnippetIter sIter; /* Iterates through snippet candidates */
410: int nByte; /* Number of bytes of space to allocate */
411: int iBestScore = -1; /* Best snippet score found so far */
412: int i; /* Loop counter */
413:
414: memset(&sIter, 0, sizeof(sIter));
415:
416: /* Iterate through the phrases in the expression to count them. The same
417: ** callback makes sure the doclists are loaded for each phrase.
418: */
419: rc = fts3ExprLoadDoclists(pCsr, &nList, 0);
420: if( rc!=SQLITE_OK ){
421: return rc;
422: }
423:
424: /* Now that it is known how many phrases there are, allocate and zero
425: ** the required space using malloc().
426: */
427: nByte = sizeof(SnippetPhrase) * nList;
428: sIter.aPhrase = (SnippetPhrase *)sqlite3_malloc(nByte);
429: if( !sIter.aPhrase ){
430: return SQLITE_NOMEM;
431: }
432: memset(sIter.aPhrase, 0, nByte);
433:
434: /* Initialize the contents of the SnippetIter object. Then iterate through
435: ** the set of phrases in the expression to populate the aPhrase[] array.
436: */
437: sIter.pCsr = pCsr;
438: sIter.iCol = iCol;
439: sIter.nSnippet = nSnippet;
440: sIter.nPhrase = nList;
441: sIter.iCurrent = -1;
442: (void)fts3ExprIterate(pCsr->pExpr, fts3SnippetFindPositions, (void *)&sIter);
443:
444: /* Set the *pmSeen output variable. */
445: for(i=0; i<nList; i++){
446: if( sIter.aPhrase[i].pHead ){
447: *pmSeen |= (u64)1 << i;
448: }
449: }
450:
451: /* Loop through all candidate snippets. Store the best snippet in
452: ** *pFragment. Store its associated 'score' in iBestScore.
453: */
454: pFragment->iCol = iCol;
455: while( !fts3SnippetNextCandidate(&sIter) ){
456: int iPos;
457: int iScore;
458: u64 mCover;
459: u64 mHighlight;
460: fts3SnippetDetails(&sIter, mCovered, &iPos, &iScore, &mCover, &mHighlight);
461: assert( iScore>=0 );
462: if( iScore>iBestScore ){
463: pFragment->iPos = iPos;
464: pFragment->hlmask = mHighlight;
465: pFragment->covered = mCover;
466: iBestScore = iScore;
467: }
468: }
469:
470: sqlite3_free(sIter.aPhrase);
471: *piScore = iBestScore;
472: return SQLITE_OK;
473: }
474:
475:
476: /*
477: ** Append a string to the string-buffer passed as the first argument.
478: **
479: ** If nAppend is negative, then the length of the string zAppend is
480: ** determined using strlen().
481: */
482: static int fts3StringAppend(
483: StrBuffer *pStr, /* Buffer to append to */
484: const char *zAppend, /* Pointer to data to append to buffer */
485: int nAppend /* Size of zAppend in bytes (or -1) */
486: ){
487: if( nAppend<0 ){
488: nAppend = (int)strlen(zAppend);
489: }
490:
491: /* If there is insufficient space allocated at StrBuffer.z, use realloc()
492: ** to grow the buffer until so that it is big enough to accomadate the
493: ** appended data.
494: */
495: if( pStr->n+nAppend+1>=pStr->nAlloc ){
496: int nAlloc = pStr->nAlloc+nAppend+100;
497: char *zNew = sqlite3_realloc(pStr->z, nAlloc);
498: if( !zNew ){
499: return SQLITE_NOMEM;
500: }
501: pStr->z = zNew;
502: pStr->nAlloc = nAlloc;
503: }
504:
505: /* Append the data to the string buffer. */
506: memcpy(&pStr->z[pStr->n], zAppend, nAppend);
507: pStr->n += nAppend;
508: pStr->z[pStr->n] = '\0';
509:
510: return SQLITE_OK;
511: }
512:
513: /*
514: ** The fts3BestSnippet() function often selects snippets that end with a
515: ** query term. That is, the final term of the snippet is always a term
516: ** that requires highlighting. For example, if 'X' is a highlighted term
517: ** and '.' is a non-highlighted term, BestSnippet() may select:
518: **
519: ** ........X.....X
520: **
521: ** This function "shifts" the beginning of the snippet forward in the
522: ** document so that there are approximately the same number of
523: ** non-highlighted terms to the right of the final highlighted term as there
524: ** are to the left of the first highlighted term. For example, to this:
525: **
526: ** ....X.....X....
527: **
528: ** This is done as part of extracting the snippet text, not when selecting
529: ** the snippet. Snippet selection is done based on doclists only, so there
530: ** is no way for fts3BestSnippet() to know whether or not the document
531: ** actually contains terms that follow the final highlighted term.
532: */
533: static int fts3SnippetShift(
534: Fts3Table *pTab, /* FTS3 table snippet comes from */
535: int nSnippet, /* Number of tokens desired for snippet */
536: const char *zDoc, /* Document text to extract snippet from */
537: int nDoc, /* Size of buffer zDoc in bytes */
538: int *piPos, /* IN/OUT: First token of snippet */
539: u64 *pHlmask /* IN/OUT: Mask of tokens to highlight */
540: ){
541: u64 hlmask = *pHlmask; /* Local copy of initial highlight-mask */
542:
543: if( hlmask ){
544: int nLeft; /* Tokens to the left of first highlight */
545: int nRight; /* Tokens to the right of last highlight */
546: int nDesired; /* Ideal number of tokens to shift forward */
547:
548: for(nLeft=0; !(hlmask & ((u64)1 << nLeft)); nLeft++);
549: for(nRight=0; !(hlmask & ((u64)1 << (nSnippet-1-nRight))); nRight++);
550: nDesired = (nLeft-nRight)/2;
551:
552: /* Ideally, the start of the snippet should be pushed forward in the
553: ** document nDesired tokens. This block checks if there are actually
554: ** nDesired tokens to the right of the snippet. If so, *piPos and
555: ** *pHlMask are updated to shift the snippet nDesired tokens to the
556: ** right. Otherwise, the snippet is shifted by the number of tokens
557: ** available.
558: */
559: if( nDesired>0 ){
560: int nShift; /* Number of tokens to shift snippet by */
561: int iCurrent = 0; /* Token counter */
562: int rc; /* Return Code */
563: sqlite3_tokenizer_module *pMod;
564: sqlite3_tokenizer_cursor *pC;
565: pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule;
566:
567: /* Open a cursor on zDoc/nDoc. Check if there are (nSnippet+nDesired)
568: ** or more tokens in zDoc/nDoc.
569: */
570: rc = pMod->xOpen(pTab->pTokenizer, zDoc, nDoc, &pC);
571: if( rc!=SQLITE_OK ){
572: return rc;
573: }
574: pC->pTokenizer = pTab->pTokenizer;
575: while( rc==SQLITE_OK && iCurrent<(nSnippet+nDesired) ){
576: const char *ZDUMMY; int DUMMY1, DUMMY2, DUMMY3;
577: rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &DUMMY2, &DUMMY3, &iCurrent);
578: }
579: pMod->xClose(pC);
580: if( rc!=SQLITE_OK && rc!=SQLITE_DONE ){ return rc; }
581:
582: nShift = (rc==SQLITE_DONE)+iCurrent-nSnippet;
583: assert( nShift<=nDesired );
584: if( nShift>0 ){
585: *piPos += nShift;
586: *pHlmask = hlmask >> nShift;
587: }
588: }
589: }
590: return SQLITE_OK;
591: }
592:
593: /*
594: ** Extract the snippet text for fragment pFragment from cursor pCsr and
595: ** append it to string buffer pOut.
596: */
597: static int fts3SnippetText(
598: Fts3Cursor *pCsr, /* FTS3 Cursor */
599: SnippetFragment *pFragment, /* Snippet to extract */
600: int iFragment, /* Fragment number */
601: int isLast, /* True for final fragment in snippet */
602: int nSnippet, /* Number of tokens in extracted snippet */
603: const char *zOpen, /* String inserted before highlighted term */
604: const char *zClose, /* String inserted after highlighted term */
605: const char *zEllipsis, /* String inserted between snippets */
606: StrBuffer *pOut /* Write output here */
607: ){
608: Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
609: int rc; /* Return code */
610: const char *zDoc; /* Document text to extract snippet from */
611: int nDoc; /* Size of zDoc in bytes */
612: int iCurrent = 0; /* Current token number of document */
613: int iEnd = 0; /* Byte offset of end of current token */
614: int isShiftDone = 0; /* True after snippet is shifted */
615: int iPos = pFragment->iPos; /* First token of snippet */
616: u64 hlmask = pFragment->hlmask; /* Highlight-mask for snippet */
617: int iCol = pFragment->iCol+1; /* Query column to extract text from */
618: sqlite3_tokenizer_module *pMod; /* Tokenizer module methods object */
619: sqlite3_tokenizer_cursor *pC; /* Tokenizer cursor open on zDoc/nDoc */
620: const char *ZDUMMY; /* Dummy argument used with tokenizer */
621: int DUMMY1; /* Dummy argument used with tokenizer */
622:
623: zDoc = (const char *)sqlite3_column_text(pCsr->pStmt, iCol);
624: if( zDoc==0 ){
625: if( sqlite3_column_type(pCsr->pStmt, iCol)!=SQLITE_NULL ){
626: return SQLITE_NOMEM;
627: }
628: return SQLITE_OK;
629: }
630: nDoc = sqlite3_column_bytes(pCsr->pStmt, iCol);
631:
632: /* Open a token cursor on the document. */
633: pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule;
634: rc = pMod->xOpen(pTab->pTokenizer, zDoc, nDoc, &pC);
635: if( rc!=SQLITE_OK ){
636: return rc;
637: }
638: pC->pTokenizer = pTab->pTokenizer;
639:
640: while( rc==SQLITE_OK ){
641: int iBegin; /* Offset in zDoc of start of token */
642: int iFin; /* Offset in zDoc of end of token */
643: int isHighlight; /* True for highlighted terms */
644:
645: rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &iBegin, &iFin, &iCurrent);
646: if( rc!=SQLITE_OK ){
647: if( rc==SQLITE_DONE ){
648: /* Special case - the last token of the snippet is also the last token
649: ** of the column. Append any punctuation that occurred between the end
650: ** of the previous token and the end of the document to the output.
651: ** Then break out of the loop. */
652: rc = fts3StringAppend(pOut, &zDoc[iEnd], -1);
653: }
654: break;
655: }
656: if( iCurrent<iPos ){ continue; }
657:
658: if( !isShiftDone ){
659: int n = nDoc - iBegin;
660: rc = fts3SnippetShift(pTab, nSnippet, &zDoc[iBegin], n, &iPos, &hlmask);
661: isShiftDone = 1;
662:
663: /* Now that the shift has been done, check if the initial "..." are
664: ** required. They are required if (a) this is not the first fragment,
665: ** or (b) this fragment does not begin at position 0 of its column.
666: */
667: if( rc==SQLITE_OK && (iPos>0 || iFragment>0) ){
668: rc = fts3StringAppend(pOut, zEllipsis, -1);
669: }
670: if( rc!=SQLITE_OK || iCurrent<iPos ) continue;
671: }
672:
673: if( iCurrent>=(iPos+nSnippet) ){
674: if( isLast ){
675: rc = fts3StringAppend(pOut, zEllipsis, -1);
676: }
677: break;
678: }
679:
680: /* Set isHighlight to true if this term should be highlighted. */
681: isHighlight = (hlmask & ((u64)1 << (iCurrent-iPos)))!=0;
682:
683: if( iCurrent>iPos ) rc = fts3StringAppend(pOut, &zDoc[iEnd], iBegin-iEnd);
684: if( rc==SQLITE_OK && isHighlight ) rc = fts3StringAppend(pOut, zOpen, -1);
685: if( rc==SQLITE_OK ) rc = fts3StringAppend(pOut, &zDoc[iBegin], iFin-iBegin);
686: if( rc==SQLITE_OK && isHighlight ) rc = fts3StringAppend(pOut, zClose, -1);
687:
688: iEnd = iFin;
689: }
690:
691: pMod->xClose(pC);
692: return rc;
693: }
694:
695:
/*
** Count the entries of a column-list, which is the delta-encoded list of
** term offsets for one column of one row.
**
** On entry *ppCollist must point at the first varint of the column-list.
** On return it points at the byte that ends the list: either the 0x00
** that terminates the whole position-list or the 0x01 that introduces
** the next column. The number of varints stepped over is returned.
*/
static int fts3ColumnlistCount(char **ppCollist){
  char *p = *ppCollist;           /* Read cursor */
  char bCont = 0;                 /* Non-zero if previous byte had 0x80 set */
  int nCount = 0;                 /* Entries counted so far */

  /* A 0x00 or 0x01 byte ends the list, unless it is the final byte of
  ** a multi-byte varint (bCont set). */
  for(;;){
    if( (0xFE & (*p | bCont))==0 ) break;
    bCont = *p & 0x80;
    p++;
    if( !bCont ) nCount++;
  }

  *ppCollist = p;
  return nCount;
}
723:
724: /*
725: ** fts3ExprIterate() callback used to collect the "global" matchinfo stats
726: ** for a single query.
727: **
728: ** fts3ExprIterate() callback to load the 'global' elements of a
729: ** FTS3_MATCHINFO_HITS matchinfo array. The global stats are those elements
730: ** of the matchinfo array that are constant for all rows returned by the
731: ** current query.
732: **
733: ** Argument pCtx is actually a pointer to a struct of type MatchInfo. This
734: ** function populates Matchinfo.aMatchinfo[] as follows:
735: **
736: ** for(iCol=0; iCol<nCol; iCol++){
737: ** aMatchinfo[3*iPhrase*nCol + 3*iCol + 1] = X;
738: ** aMatchinfo[3*iPhrase*nCol + 3*iCol + 2] = Y;
739: ** }
740: **
741: ** where X is the number of matches for phrase iPhrase is column iCol of all
742: ** rows of the table. Y is the number of rows for which column iCol contains
743: ** at least one instance of phrase iPhrase.
744: **
745: ** If the phrase pExpr consists entirely of deferred tokens, then all X and
746: ** Y values are set to nDoc, where nDoc is the number of documents in the
747: ** file system. This is done because the full-text index doclist is required
748: ** to calculate these values properly, and the full-text index doclist is
749: ** not available for deferred tokens.
750: */
751: static int fts3ExprGlobalHitsCb(
752: Fts3Expr *pExpr, /* Phrase expression node */
753: int iPhrase, /* Phrase number (numbered from zero) */
754: void *pCtx /* Pointer to MatchInfo structure */
755: ){
756: MatchInfo *p = (MatchInfo *)pCtx;
757: return sqlite3Fts3EvalPhraseStats(
758: p->pCursor, pExpr, &p->aMatchinfo[3*iPhrase*p->nCol]
759: );
760: }
761:
762: /*
763: ** fts3ExprIterate() callback used to collect the "local" part of the
764: ** FTS3_MATCHINFO_HITS array. The local stats are those elements of the
765: ** array that are different for each row returned by the query.
766: */
767: static int fts3ExprLocalHitsCb(
768: Fts3Expr *pExpr, /* Phrase expression node */
769: int iPhrase, /* Phrase number */
770: void *pCtx /* Pointer to MatchInfo structure */
771: ){
772: MatchInfo *p = (MatchInfo *)pCtx;
773: int iStart = iPhrase * p->nCol * 3;
774: int i;
775:
776: for(i=0; i<p->nCol; i++){
777: char *pCsr;
778: pCsr = sqlite3Fts3EvalPhrasePoslist(p->pCursor, pExpr, i);
779: if( pCsr ){
780: p->aMatchinfo[iStart+i*3] = fts3ColumnlistCount(&pCsr);
781: }else{
782: p->aMatchinfo[iStart+i*3] = 0;
783: }
784: }
785:
786: return SQLITE_OK;
787: }
788:
789: static int fts3MatchinfoCheck(
790: Fts3Table *pTab,
791: char cArg,
792: char **pzErr
793: ){
794: if( (cArg==FTS3_MATCHINFO_NPHRASE)
795: || (cArg==FTS3_MATCHINFO_NCOL)
796: || (cArg==FTS3_MATCHINFO_NDOC && pTab->bHasStat)
797: || (cArg==FTS3_MATCHINFO_AVGLENGTH && pTab->bHasStat)
798: || (cArg==FTS3_MATCHINFO_LENGTH && pTab->bHasDocsize)
799: || (cArg==FTS3_MATCHINFO_LCS)
800: || (cArg==FTS3_MATCHINFO_HITS)
801: ){
802: return SQLITE_OK;
803: }
804: *pzErr = sqlite3_mprintf("unrecognized matchinfo request: %c", cArg);
805: return SQLITE_ERROR;
806: }
807:
808: static int fts3MatchinfoSize(MatchInfo *pInfo, char cArg){
809: int nVal; /* Number of integers output by cArg */
810:
811: switch( cArg ){
812: case FTS3_MATCHINFO_NDOC:
813: case FTS3_MATCHINFO_NPHRASE:
814: case FTS3_MATCHINFO_NCOL:
815: nVal = 1;
816: break;
817:
818: case FTS3_MATCHINFO_AVGLENGTH:
819: case FTS3_MATCHINFO_LENGTH:
820: case FTS3_MATCHINFO_LCS:
821: nVal = pInfo->nCol;
822: break;
823:
824: default:
825: assert( cArg==FTS3_MATCHINFO_HITS );
826: nVal = pInfo->nCol * pInfo->nPhrase * 3;
827: break;
828: }
829:
830: return nVal;
831: }
832:
833: static int fts3MatchinfoSelectDoctotal(
834: Fts3Table *pTab,
835: sqlite3_stmt **ppStmt,
836: sqlite3_int64 *pnDoc,
837: const char **paLen
838: ){
839: sqlite3_stmt *pStmt;
840: const char *a;
841: sqlite3_int64 nDoc;
842:
843: if( !*ppStmt ){
844: int rc = sqlite3Fts3SelectDoctotal(pTab, ppStmt);
845: if( rc!=SQLITE_OK ) return rc;
846: }
847: pStmt = *ppStmt;
848: assert( sqlite3_data_count(pStmt)==1 );
849:
850: a = sqlite3_column_blob(pStmt, 0);
851: a += sqlite3Fts3GetVarint(a, &nDoc);
852: if( nDoc==0 ) return FTS_CORRUPT_VTAB;
853: *pnDoc = (u32)nDoc;
854:
855: if( paLen ) *paLen = a;
856: return SQLITE_OK;
857: }
858:
859: /*
860: ** An instance of the following structure is used to store state while
861: ** iterating through a multi-column position-list corresponding to the
862: ** hits for a single phrase on a single row in order to calculate the
863: ** values for a matchinfo() FTS3_MATCHINFO_LCS request.
864: */
865: typedef struct LcsIterator LcsIterator;
866: struct LcsIterator {
867: Fts3Expr *pExpr; /* Pointer to phrase expression */
868: int iPosOffset; /* Tokens count up to end of this phrase */
869: char *pRead; /* Cursor used to iterate through aDoclist */
870: int iPos; /* Current position */
871: };
872:
/*
** Sentinel value meaning that an LcsIterator has finished iterating
** through all offsets for all columns.
**
** NOTE(review): the macro is not referenced in the portion of this file
** that was reviewed, and the LcsIterator structure shown there has no
** iCol field to hold it - confirm whether it is still needed.
**
** The definition deliberately has no trailing semicolon so that the macro
** can be used inside expressions.
*/
#define LCS_ITERATOR_FINISHED 0x7FFFFFFF
878:
879: static int fts3MatchinfoLcsCb(
880: Fts3Expr *pExpr, /* Phrase expression node */
881: int iPhrase, /* Phrase number (numbered from zero) */
882: void *pCtx /* Pointer to MatchInfo structure */
883: ){
884: LcsIterator *aIter = (LcsIterator *)pCtx;
885: aIter[iPhrase].pExpr = pExpr;
886: return SQLITE_OK;
887: }
888:
889: /*
890: ** Advance the iterator passed as an argument to the next position. Return
891: ** 1 if the iterator is at EOF or if it now points to the start of the
892: ** position list for the next column.
893: */
894: static int fts3LcsIteratorAdvance(LcsIterator *pIter){
895: char *pRead = pIter->pRead;
896: sqlite3_int64 iRead;
897: int rc = 0;
898:
899: pRead += sqlite3Fts3GetVarint(pRead, &iRead);
900: if( iRead==0 || iRead==1 ){
901: pRead = 0;
902: rc = 1;
903: }else{
904: pIter->iPos += (int)(iRead-2);
905: }
906:
907: pIter->pRead = pRead;
908: return rc;
909: }
910:
911: /*
912: ** This function implements the FTS3_MATCHINFO_LCS matchinfo() flag.
913: **
914: ** If the call is successful, the longest-common-substring lengths for each
915: ** column are written into the first nCol elements of the pInfo->aMatchinfo[]
916: ** array before returning. SQLITE_OK is returned in this case.
917: **
918: ** Otherwise, if an error occurs, an SQLite error code is returned and the
919: ** data written to the first nCol elements of pInfo->aMatchinfo[] is
920: ** undefined.
921: */
922: static int fts3MatchinfoLcs(Fts3Cursor *pCsr, MatchInfo *pInfo){
923: LcsIterator *aIter;
924: int i;
925: int iCol;
926: int nToken = 0;
927:
928: /* Allocate and populate the array of LcsIterator objects. The array
929: ** contains one element for each matchable phrase in the query.
930: **/
931: aIter = sqlite3_malloc(sizeof(LcsIterator) * pCsr->nPhrase);
932: if( !aIter ) return SQLITE_NOMEM;
933: memset(aIter, 0, sizeof(LcsIterator) * pCsr->nPhrase);
934: (void)fts3ExprIterate(pCsr->pExpr, fts3MatchinfoLcsCb, (void*)aIter);
935:
936: for(i=0; i<pInfo->nPhrase; i++){
937: LcsIterator *pIter = &aIter[i];
938: nToken -= pIter->pExpr->pPhrase->nToken;
939: pIter->iPosOffset = nToken;
940: }
941:
942: for(iCol=0; iCol<pInfo->nCol; iCol++){
943: int nLcs = 0; /* LCS value for this column */
944: int nLive = 0; /* Number of iterators in aIter not at EOF */
945:
946: for(i=0; i<pInfo->nPhrase; i++){
947: LcsIterator *pIt = &aIter[i];
948: pIt->pRead = sqlite3Fts3EvalPhrasePoslist(pCsr, pIt->pExpr, iCol);
949: if( pIt->pRead ){
950: pIt->iPos = pIt->iPosOffset;
951: fts3LcsIteratorAdvance(&aIter[i]);
952: nLive++;
953: }
954: }
955:
956: while( nLive>0 ){
957: LcsIterator *pAdv = 0; /* The iterator to advance by one position */
958: int nThisLcs = 0; /* LCS for the current iterator positions */
959:
960: for(i=0; i<pInfo->nPhrase; i++){
961: LcsIterator *pIter = &aIter[i];
962: if( pIter->pRead==0 ){
963: /* This iterator is already at EOF for this column. */
964: nThisLcs = 0;
965: }else{
966: if( pAdv==0 || pIter->iPos<pAdv->iPos ){
967: pAdv = pIter;
968: }
969: if( nThisLcs==0 || pIter->iPos==pIter[-1].iPos ){
970: nThisLcs++;
971: }else{
972: nThisLcs = 1;
973: }
974: if( nThisLcs>nLcs ) nLcs = nThisLcs;
975: }
976: }
977: if( fts3LcsIteratorAdvance(pAdv) ) nLive--;
978: }
979:
980: pInfo->aMatchinfo[iCol] = nLcs;
981: }
982:
983: sqlite3_free(aIter);
984: return SQLITE_OK;
985: }
986:
987: /*
988: ** Populate the buffer pInfo->aMatchinfo[] with an array of integers to
989: ** be returned by the matchinfo() function. Argument zArg contains the
990: ** format string passed as the second argument to matchinfo (or the
991: ** default value "pcx" if no second argument was specified). The format
992: ** string has already been validated and the pInfo->aMatchinfo[] array
993: ** is guaranteed to be large enough for the output.
994: **
995: ** If bGlobal is true, then populate all fields of the matchinfo() output.
996: ** If it is false, then assume that those fields that do not change between
997: ** rows (i.e. FTS3_MATCHINFO_NPHRASE, NCOL, NDOC, AVGLENGTH and part of HITS)
998: ** have already been populated.
999: **
1000: ** Return SQLITE_OK if successful, or an SQLite error code if an error
1001: ** occurs. If a value other than SQLITE_OK is returned, the state the
1002: ** pInfo->aMatchinfo[] buffer is left in is undefined.
1003: */
1004: static int fts3MatchinfoValues(
1005: Fts3Cursor *pCsr, /* FTS3 cursor object */
1006: int bGlobal, /* True to grab the global stats */
1007: MatchInfo *pInfo, /* Matchinfo context object */
1008: const char *zArg /* Matchinfo format string */
1009: ){
1010: int rc = SQLITE_OK;
1011: int i;
1012: Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
1013: sqlite3_stmt *pSelect = 0;
1014:
1015: for(i=0; rc==SQLITE_OK && zArg[i]; i++){
1016:
1017: switch( zArg[i] ){
1018: case FTS3_MATCHINFO_NPHRASE:
1019: if( bGlobal ) pInfo->aMatchinfo[0] = pInfo->nPhrase;
1020: break;
1021:
1022: case FTS3_MATCHINFO_NCOL:
1023: if( bGlobal ) pInfo->aMatchinfo[0] = pInfo->nCol;
1024: break;
1025:
1026: case FTS3_MATCHINFO_NDOC:
1027: if( bGlobal ){
1028: sqlite3_int64 nDoc = 0;
1029: rc = fts3MatchinfoSelectDoctotal(pTab, &pSelect, &nDoc, 0);
1030: pInfo->aMatchinfo[0] = (u32)nDoc;
1031: }
1032: break;
1033:
1034: case FTS3_MATCHINFO_AVGLENGTH:
1035: if( bGlobal ){
1036: sqlite3_int64 nDoc; /* Number of rows in table */
1037: const char *a; /* Aggregate column length array */
1038:
1039: rc = fts3MatchinfoSelectDoctotal(pTab, &pSelect, &nDoc, &a);
1040: if( rc==SQLITE_OK ){
1041: int iCol;
1042: for(iCol=0; iCol<pInfo->nCol; iCol++){
1043: u32 iVal;
1044: sqlite3_int64 nToken;
1045: a += sqlite3Fts3GetVarint(a, &nToken);
1046: iVal = (u32)(((u32)(nToken&0xffffffff)+nDoc/2)/nDoc);
1047: pInfo->aMatchinfo[iCol] = iVal;
1048: }
1049: }
1050: }
1051: break;
1052:
1053: case FTS3_MATCHINFO_LENGTH: {
1054: sqlite3_stmt *pSelectDocsize = 0;
1055: rc = sqlite3Fts3SelectDocsize(pTab, pCsr->iPrevId, &pSelectDocsize);
1056: if( rc==SQLITE_OK ){
1057: int iCol;
1058: const char *a = sqlite3_column_blob(pSelectDocsize, 0);
1059: for(iCol=0; iCol<pInfo->nCol; iCol++){
1060: sqlite3_int64 nToken;
1061: a += sqlite3Fts3GetVarint(a, &nToken);
1062: pInfo->aMatchinfo[iCol] = (u32)nToken;
1063: }
1064: }
1065: sqlite3_reset(pSelectDocsize);
1066: break;
1067: }
1068:
1069: case FTS3_MATCHINFO_LCS:
1070: rc = fts3ExprLoadDoclists(pCsr, 0, 0);
1071: if( rc==SQLITE_OK ){
1072: rc = fts3MatchinfoLcs(pCsr, pInfo);
1073: }
1074: break;
1075:
1076: default: {
1077: Fts3Expr *pExpr;
1078: assert( zArg[i]==FTS3_MATCHINFO_HITS );
1079: pExpr = pCsr->pExpr;
1080: rc = fts3ExprLoadDoclists(pCsr, 0, 0);
1081: if( rc!=SQLITE_OK ) break;
1082: if( bGlobal ){
1083: if( pCsr->pDeferred ){
1084: rc = fts3MatchinfoSelectDoctotal(pTab, &pSelect, &pInfo->nDoc, 0);
1085: if( rc!=SQLITE_OK ) break;
1086: }
1087: rc = fts3ExprIterate(pExpr, fts3ExprGlobalHitsCb,(void*)pInfo);
1088: if( rc!=SQLITE_OK ) break;
1089: }
1090: (void)fts3ExprIterate(pExpr, fts3ExprLocalHitsCb,(void*)pInfo);
1091: break;
1092: }
1093: }
1094:
1095: pInfo->aMatchinfo += fts3MatchinfoSize(pInfo, zArg[i]);
1096: }
1097:
1098: sqlite3_reset(pSelect);
1099: return rc;
1100: }
1101:
1102:
1103: /*
1104: ** Populate pCsr->aMatchinfo[] with data for the current row. The
1105: ** 'matchinfo' data is an array of 32-bit unsigned integers (C type u32).
1106: */
1107: static int fts3GetMatchinfo(
1108: Fts3Cursor *pCsr, /* FTS3 Cursor object */
1109: const char *zArg /* Second argument to matchinfo() function */
1110: ){
1111: MatchInfo sInfo;
1112: Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
1113: int rc = SQLITE_OK;
1114: int bGlobal = 0; /* Collect 'global' stats as well as local */
1115:
1116: memset(&sInfo, 0, sizeof(MatchInfo));
1117: sInfo.pCursor = pCsr;
1118: sInfo.nCol = pTab->nColumn;
1119:
1120: /* If there is cached matchinfo() data, but the format string for the
1121: ** cache does not match the format string for this request, discard
1122: ** the cached data. */
1123: if( pCsr->zMatchinfo && strcmp(pCsr->zMatchinfo, zArg) ){
1124: assert( pCsr->aMatchinfo );
1125: sqlite3_free(pCsr->aMatchinfo);
1126: pCsr->zMatchinfo = 0;
1127: pCsr->aMatchinfo = 0;
1128: }
1129:
1130: /* If Fts3Cursor.aMatchinfo[] is NULL, then this is the first time the
1131: ** matchinfo function has been called for this query. In this case
1132: ** allocate the array used to accumulate the matchinfo data and
1133: ** initialize those elements that are constant for every row.
1134: */
1135: if( pCsr->aMatchinfo==0 ){
1136: int nMatchinfo = 0; /* Number of u32 elements in match-info */
1137: int nArg; /* Bytes in zArg */
1138: int i; /* Used to iterate through zArg */
1139:
1140: /* Determine the number of phrases in the query */
1141: pCsr->nPhrase = fts3ExprPhraseCount(pCsr->pExpr);
1142: sInfo.nPhrase = pCsr->nPhrase;
1143:
1144: /* Determine the number of integers in the buffer returned by this call. */
1145: for(i=0; zArg[i]; i++){
1146: nMatchinfo += fts3MatchinfoSize(&sInfo, zArg[i]);
1147: }
1148:
1149: /* Allocate space for Fts3Cursor.aMatchinfo[] and Fts3Cursor.zMatchinfo. */
1150: nArg = (int)strlen(zArg);
1151: pCsr->aMatchinfo = (u32 *)sqlite3_malloc(sizeof(u32)*nMatchinfo + nArg + 1);
1152: if( !pCsr->aMatchinfo ) return SQLITE_NOMEM;
1153:
1154: pCsr->zMatchinfo = (char *)&pCsr->aMatchinfo[nMatchinfo];
1155: pCsr->nMatchinfo = nMatchinfo;
1156: memcpy(pCsr->zMatchinfo, zArg, nArg+1);
1157: memset(pCsr->aMatchinfo, 0, sizeof(u32)*nMatchinfo);
1158: pCsr->isMatchinfoNeeded = 1;
1159: bGlobal = 1;
1160: }
1161:
1162: sInfo.aMatchinfo = pCsr->aMatchinfo;
1163: sInfo.nPhrase = pCsr->nPhrase;
1164: if( pCsr->isMatchinfoNeeded ){
1165: rc = fts3MatchinfoValues(pCsr, bGlobal, &sInfo, zArg);
1166: pCsr->isMatchinfoNeeded = 0;
1167: }
1168:
1169: return rc;
1170: }
1171:
1172: /*
1173: ** Implementation of snippet() function.
1174: */
1175: void sqlite3Fts3Snippet(
1176: sqlite3_context *pCtx, /* SQLite function call context */
1177: Fts3Cursor *pCsr, /* Cursor object */
1178: const char *zStart, /* Snippet start text - "<b>" */
1179: const char *zEnd, /* Snippet end text - "</b>" */
1180: const char *zEllipsis, /* Snippet ellipsis text - "<b>...</b>" */
1181: int iCol, /* Extract snippet from this column */
1182: int nToken /* Approximate number of tokens in snippet */
1183: ){
1184: Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
1185: int rc = SQLITE_OK;
1186: int i;
1187: StrBuffer res = {0, 0, 0};
1188:
1189: /* The returned text includes up to four fragments of text extracted from
1190: ** the data in the current row. The first iteration of the for(...) loop
1191: ** below attempts to locate a single fragment of text nToken tokens in
1192: ** size that contains at least one instance of all phrases in the query
1193: ** expression that appear in the current row. If such a fragment of text
1194: ** cannot be found, the second iteration of the loop attempts to locate
1195: ** a pair of fragments, and so on.
1196: */
1197: int nSnippet = 0; /* Number of fragments in this snippet */
1198: SnippetFragment aSnippet[4]; /* Maximum of 4 fragments per snippet */
1199: int nFToken = -1; /* Number of tokens in each fragment */
1200:
1201: if( !pCsr->pExpr ){
1202: sqlite3_result_text(pCtx, "", 0, SQLITE_STATIC);
1203: return;
1204: }
1205:
1206: for(nSnippet=1; 1; nSnippet++){
1207:
1208: int iSnip; /* Loop counter 0..nSnippet-1 */
1209: u64 mCovered = 0; /* Bitmask of phrases covered by snippet */
1210: u64 mSeen = 0; /* Bitmask of phrases seen by BestSnippet() */
1211:
1212: if( nToken>=0 ){
1213: nFToken = (nToken+nSnippet-1) / nSnippet;
1214: }else{
1215: nFToken = -1 * nToken;
1216: }
1217:
1218: for(iSnip=0; iSnip<nSnippet; iSnip++){
1219: int iBestScore = -1; /* Best score of columns checked so far */
1220: int iRead; /* Used to iterate through columns */
1221: SnippetFragment *pFragment = &aSnippet[iSnip];
1222:
1223: memset(pFragment, 0, sizeof(*pFragment));
1224:
1225: /* Loop through all columns of the table being considered for snippets.
1226: ** If the iCol argument to this function was negative, this means all
1227: ** columns of the FTS3 table. Otherwise, only column iCol is considered.
1228: */
1229: for(iRead=0; iRead<pTab->nColumn; iRead++){
1230: SnippetFragment sF = {0, 0, 0, 0};
1231: int iS;
1232: if( iCol>=0 && iRead!=iCol ) continue;
1233:
1234: /* Find the best snippet of nFToken tokens in column iRead. */
1235: rc = fts3BestSnippet(nFToken, pCsr, iRead, mCovered, &mSeen, &sF, &iS);
1236: if( rc!=SQLITE_OK ){
1237: goto snippet_out;
1238: }
1239: if( iS>iBestScore ){
1240: *pFragment = sF;
1241: iBestScore = iS;
1242: }
1243: }
1244:
1245: mCovered |= pFragment->covered;
1246: }
1247:
1248: /* If all query phrases seen by fts3BestSnippet() are present in at least
1249: ** one of the nSnippet snippet fragments, break out of the loop.
1250: */
1251: assert( (mCovered&mSeen)==mCovered );
1252: if( mSeen==mCovered || nSnippet==SizeofArray(aSnippet) ) break;
1253: }
1254:
1255: assert( nFToken>0 );
1256:
1257: for(i=0; i<nSnippet && rc==SQLITE_OK; i++){
1258: rc = fts3SnippetText(pCsr, &aSnippet[i],
1259: i, (i==nSnippet-1), nFToken, zStart, zEnd, zEllipsis, &res
1260: );
1261: }
1262:
1263: snippet_out:
1264: sqlite3Fts3SegmentsClose(pTab);
1265: if( rc!=SQLITE_OK ){
1266: sqlite3_result_error_code(pCtx, rc);
1267: sqlite3_free(res.z);
1268: }else{
1269: sqlite3_result_text(pCtx, res.z, -1, sqlite3_free);
1270: }
1271: }
1272:
1273:
1274: typedef struct TermOffset TermOffset;
1275: typedef struct TermOffsetCtx TermOffsetCtx;
1276:
1277: struct TermOffset {
1278: char *pList; /* Position-list */
1279: int iPos; /* Position just read from pList */
1280: int iOff; /* Offset of this term from read positions */
1281: };
1282:
1283: struct TermOffsetCtx {
1284: Fts3Cursor *pCsr;
1285: int iCol; /* Column of table to populate aTerm for */
1286: int iTerm;
1287: sqlite3_int64 iDocid;
1288: TermOffset *aTerm;
1289: };
1290:
1291: /*
1292: ** This function is an fts3ExprIterate() callback used by sqlite3Fts3Offsets().
1293: */
1294: static int fts3ExprTermOffsetInit(Fts3Expr *pExpr, int iPhrase, void *ctx){
1295: TermOffsetCtx *p = (TermOffsetCtx *)ctx;
1296: int nTerm; /* Number of tokens in phrase */
1297: int iTerm; /* For looping through nTerm phrase terms */
1298: char *pList; /* Pointer to position list for phrase */
1299: int iPos = 0; /* First position in position-list */
1300:
1301: UNUSED_PARAMETER(iPhrase);
1302: pList = sqlite3Fts3EvalPhrasePoslist(p->pCsr, pExpr, p->iCol);
1303: nTerm = pExpr->pPhrase->nToken;
1304: if( pList ){
1305: fts3GetDeltaPosition(&pList, &iPos);
1306: assert( iPos>=0 );
1307: }
1308:
1309: for(iTerm=0; iTerm<nTerm; iTerm++){
1310: TermOffset *pT = &p->aTerm[p->iTerm++];
1311: pT->iOff = nTerm-iTerm-1;
1312: pT->pList = pList;
1313: pT->iPos = iPos;
1314: }
1315:
1316: return SQLITE_OK;
1317: }
1318:
1319: /*
1320: ** Implementation of offsets() function.
1321: */
1322: void sqlite3Fts3Offsets(
1323: sqlite3_context *pCtx, /* SQLite function call context */
1324: Fts3Cursor *pCsr /* Cursor object */
1325: ){
1326: Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
1327: sqlite3_tokenizer_module const *pMod = pTab->pTokenizer->pModule;
1328: const char *ZDUMMY; /* Dummy argument used with xNext() */
1329: int NDUMMY; /* Dummy argument used with xNext() */
1330: int rc; /* Return Code */
1331: int nToken; /* Number of tokens in query */
1332: int iCol; /* Column currently being processed */
1333: StrBuffer res = {0, 0, 0}; /* Result string */
1334: TermOffsetCtx sCtx; /* Context for fts3ExprTermOffsetInit() */
1335:
1336: if( !pCsr->pExpr ){
1337: sqlite3_result_text(pCtx, "", 0, SQLITE_STATIC);
1338: return;
1339: }
1340:
1341: memset(&sCtx, 0, sizeof(sCtx));
1342: assert( pCsr->isRequireSeek==0 );
1343:
1344: /* Count the number of terms in the query */
1345: rc = fts3ExprLoadDoclists(pCsr, 0, &nToken);
1346: if( rc!=SQLITE_OK ) goto offsets_out;
1347:
1348: /* Allocate the array of TermOffset iterators. */
1349: sCtx.aTerm = (TermOffset *)sqlite3_malloc(sizeof(TermOffset)*nToken);
1350: if( 0==sCtx.aTerm ){
1351: rc = SQLITE_NOMEM;
1352: goto offsets_out;
1353: }
1354: sCtx.iDocid = pCsr->iPrevId;
1355: sCtx.pCsr = pCsr;
1356:
1357: /* Loop through the table columns, appending offset information to
1358: ** string-buffer res for each column.
1359: */
1360: for(iCol=0; iCol<pTab->nColumn; iCol++){
1361: sqlite3_tokenizer_cursor *pC; /* Tokenizer cursor */
1362: int iStart;
1363: int iEnd;
1364: int iCurrent;
1365: const char *zDoc;
1366: int nDoc;
1367:
1368: /* Initialize the contents of sCtx.aTerm[] for column iCol. There is
1369: ** no way that this operation can fail, so the return code from
1370: ** fts3ExprIterate() can be discarded.
1371: */
1372: sCtx.iCol = iCol;
1373: sCtx.iTerm = 0;
1374: (void)fts3ExprIterate(pCsr->pExpr, fts3ExprTermOffsetInit, (void *)&sCtx);
1375:
1376: /* Retreive the text stored in column iCol. If an SQL NULL is stored
1377: ** in column iCol, jump immediately to the next iteration of the loop.
1378: ** If an OOM occurs while retrieving the data (this can happen if SQLite
1379: ** needs to transform the data from utf-16 to utf-8), return SQLITE_NOMEM
1380: ** to the caller.
1381: */
1382: zDoc = (const char *)sqlite3_column_text(pCsr->pStmt, iCol+1);
1383: nDoc = sqlite3_column_bytes(pCsr->pStmt, iCol+1);
1384: if( zDoc==0 ){
1385: if( sqlite3_column_type(pCsr->pStmt, iCol+1)==SQLITE_NULL ){
1386: continue;
1387: }
1388: rc = SQLITE_NOMEM;
1389: goto offsets_out;
1390: }
1391:
1392: /* Initialize a tokenizer iterator to iterate through column iCol. */
1393: rc = pMod->xOpen(pTab->pTokenizer, zDoc, nDoc, &pC);
1394: if( rc!=SQLITE_OK ) goto offsets_out;
1395: pC->pTokenizer = pTab->pTokenizer;
1396:
1397: rc = pMod->xNext(pC, &ZDUMMY, &NDUMMY, &iStart, &iEnd, &iCurrent);
1398: while( rc==SQLITE_OK ){
1399: int i; /* Used to loop through terms */
1400: int iMinPos = 0x7FFFFFFF; /* Position of next token */
1401: TermOffset *pTerm = 0; /* TermOffset associated with next token */
1402:
1403: for(i=0; i<nToken; i++){
1404: TermOffset *pT = &sCtx.aTerm[i];
1405: if( pT->pList && (pT->iPos-pT->iOff)<iMinPos ){
1406: iMinPos = pT->iPos-pT->iOff;
1407: pTerm = pT;
1408: }
1409: }
1410:
1411: if( !pTerm ){
1412: /* All offsets for this column have been gathered. */
1413: rc = SQLITE_DONE;
1414: }else{
1415: assert( iCurrent<=iMinPos );
1416: if( 0==(0xFE&*pTerm->pList) ){
1417: pTerm->pList = 0;
1418: }else{
1419: fts3GetDeltaPosition(&pTerm->pList, &pTerm->iPos);
1420: }
1421: while( rc==SQLITE_OK && iCurrent<iMinPos ){
1422: rc = pMod->xNext(pC, &ZDUMMY, &NDUMMY, &iStart, &iEnd, &iCurrent);
1423: }
1424: if( rc==SQLITE_OK ){
1425: char aBuffer[64];
1426: sqlite3_snprintf(sizeof(aBuffer), aBuffer,
1427: "%d %d %d %d ", iCol, pTerm-sCtx.aTerm, iStart, iEnd-iStart
1428: );
1429: rc = fts3StringAppend(&res, aBuffer, -1);
1430: }else if( rc==SQLITE_DONE && pTab->zContentTbl==0 ){
1431: rc = FTS_CORRUPT_VTAB;
1432: }
1433: }
1434: }
1435: if( rc==SQLITE_DONE ){
1436: rc = SQLITE_OK;
1437: }
1438:
1439: pMod->xClose(pC);
1440: if( rc!=SQLITE_OK ) goto offsets_out;
1441: }
1442:
1443: offsets_out:
1444: sqlite3_free(sCtx.aTerm);
1445: assert( rc!=SQLITE_DONE );
1446: sqlite3Fts3SegmentsClose(pTab);
1447: if( rc!=SQLITE_OK ){
1448: sqlite3_result_error_code(pCtx, rc);
1449: sqlite3_free(res.z);
1450: }else{
1451: sqlite3_result_text(pCtx, res.z, res.n-1, sqlite3_free);
1452: }
1453: return;
1454: }
1455:
1456: /*
1457: ** Implementation of matchinfo() function.
1458: */
1459: void sqlite3Fts3Matchinfo(
1460: sqlite3_context *pContext, /* Function call context */
1461: Fts3Cursor *pCsr, /* FTS3 table cursor */
1462: const char *zArg /* Second arg to matchinfo() function */
1463: ){
1464: Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
1465: int rc;
1466: int i;
1467: const char *zFormat;
1468:
1469: if( zArg ){
1470: for(i=0; zArg[i]; i++){
1471: char *zErr = 0;
1472: if( fts3MatchinfoCheck(pTab, zArg[i], &zErr) ){
1473: sqlite3_result_error(pContext, zErr, -1);
1474: sqlite3_free(zErr);
1475: return;
1476: }
1477: }
1478: zFormat = zArg;
1479: }else{
1480: zFormat = FTS3_MATCHINFO_DEFAULT;
1481: }
1482:
1483: if( !pCsr->pExpr ){
1484: sqlite3_result_blob(pContext, "", 0, SQLITE_STATIC);
1485: return;
1486: }
1487:
1488: /* Retrieve matchinfo() data. */
1489: rc = fts3GetMatchinfo(pCsr, zFormat);
1490: sqlite3Fts3SegmentsClose(pTab);
1491:
1492: if( rc!=SQLITE_OK ){
1493: sqlite3_result_error_code(pContext, rc);
1494: }else{
1495: int n = pCsr->nMatchinfo * sizeof(u32);
1496: sqlite3_result_blob(pContext, pCsr->aMatchinfo, n, SQLITE_TRANSIENT);
1497: }
1498: }
1499:
1500: #endif
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>