Return to fulltext.c CVS log | Up to [ELWIX - Embedded LightWeight unIX -] / embedaddon / sqlite3 / ext / fts1 |
1.1 ! misho 1: /* The author disclaims copyright to this source code. ! 2: * ! 3: * This is an SQLite module implementing full-text search. ! 4: */ ! 5: ! 6: #include <assert.h> ! 7: #if !defined(__APPLE__) ! 8: #include <malloc.h> ! 9: #else ! 10: #include <stdlib.h> ! 11: #endif ! 12: #include <stdio.h> ! 13: #include <string.h> ! 14: #include <ctype.h> ! 15: ! 16: #include "fulltext.h" ! 17: #include "ft_hash.h" ! 18: #include "tokenizer.h" ! 19: #include "sqlite3.h" ! 20: #include "sqlite3ext.h" ! 21: SQLITE_EXTENSION_INIT1 ! 22: ! 23: /* utility functions */ ! 24: ! 25: /* We encode variable-length integers in little-endian order using seven bits ! 26: * per byte as follows: ! 27: ** ! 28: ** KEY: ! 29: ** A = 0xxxxxxx 7 bits of data and one flag bit ! 30: ** B = 1xxxxxxx 7 bits of data and one flag bit ! 31: ** ! 32: ** 7 bits - A ! 33: ** 14 bits - BA ! 34: ** 21 bits - BBA ! 35: ** and so on. ! 36: */ ! 37: ! 38: /* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */ ! 39: #define VARINT_MAX 10 ! 40: ! 41: /* Write a 64-bit variable-length integer to memory starting at p[0]. ! 42: * The length of data written will be between 1 and VARINT_MAX bytes. ! 43: * The number of bytes written is returned. */ ! 44: static int putVarint(char *p, sqlite_int64 v){ ! 45: unsigned char *q = (unsigned char *) p; ! 46: sqlite_uint64 vu = v; ! 47: do{ ! 48: *q++ = (unsigned char) ((vu & 0x7f) | 0x80); ! 49: vu >>= 7; ! 50: }while( vu!=0 ); ! 51: q[-1] &= 0x7f; /* turn off high bit in final byte */ ! 52: assert( q - (unsigned char *)p <= VARINT_MAX ); ! 53: return (int) (q - (unsigned char *)p); ! 54: } ! 55: ! 56: /* Read a 64-bit variable-length integer from memory starting at p[0]. ! 57: * Return the number of bytes read, or 0 on error. ! 58: * The value is stored in *v. */ ! 59: static int getVarint(const char *p, sqlite_int64 *v){ ! 60: const unsigned char *q = (const unsigned char *) p; ! 61: sqlite_uint64 x = 0, y = 1; ! 62: while( (*q & 0x80) == 0x80 ){ ! 63: x += y * (*q++ & 0x7f); ! 64: y <<= 7; ! 65: if( q - (unsigned char *)p >= VARINT_MAX ){ /* bad data */ ! 66: assert( 0 ); ! 67: return 0; ! 68: } ! 69: } ! 70: x += y * (*q++); ! 71: *v = (sqlite_int64) x; ! 72: return (int) (q - (unsigned char *)p); ! 73: } ! 74: ! 75: static int getVarint32(const char *p, int *pi){ ! 76: sqlite_int64 i; ! 77: int ret = getVarint(p, &i); ! 78: *pi = (int) i; ! 79: assert( *pi==i ); ! 80: return ret; ! 81: } ! 82: ! 83: /*** Document lists *** ! 84: * ! 85: * A document list holds a sorted list of varint-encoded document IDs. ! 86: * ! 87: * A doclist with type DL_POSITIONS_OFFSETS is stored like this: ! 88: * ! 89: * array { ! 90: * varint docid; ! 91: * array { ! 92: * varint position; (delta from previous position plus 1, or 0 for end) ! 93: * varint startOffset; (delta from previous startOffset) ! 94: * varint endOffset; (delta from startOffset) ! 95: * } ! 96: * } ! 97: * ! 98: * Here, array { X } means zero or more occurrences of X, adjacent in memory. ! 99: * ! 100: * A doclist with type DL_POSITIONS is like the above, but holds only docids ! 101: * and positions without offset information. ! 102: * ! 103: * A doclist with type DL_DOCIDS is like the above, but holds only docids ! 104: * without positions or offset information. ! 105: * ! 106: * On disk, every document list has positions and offsets, so we don't bother ! 107: * to serialize a doclist's type. ! 108: * ! 109: * We don't yet delta-encode document IDs; doing so will probably be a ! 110: * modest win. ! 111: * ! 112: * NOTE(shess) I've thought of a slightly (1%) better offset encoding. ! 113: * After the first offset, estimate the next offset by using the ! 114: * current token position and the previous token position and offset, ! 115: * offset to handle some variance. So the estimate would be ! 116: * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded ! 117: * as normal. Offsets more than 64 chars from the estimate are ! 118: * encoded as the delta to the previous start offset + 128. An ! 119: * additional tiny increment can be gained by using the end offset of ! 120: * the previous token to make the estimate a tiny bit more precise. ! 121: */ ! 122: ! 123: typedef enum DocListType { ! 124: DL_DOCIDS, /* docids only */ ! 125: DL_POSITIONS, /* docids + positions */ ! 126: DL_POSITIONS_OFFSETS /* docids + positions + offsets */ ! 127: } DocListType; ! 128: ! 129: typedef struct DocList { ! 130: char *pData; ! 131: int nData; ! 132: DocListType iType; ! 133: int iLastPos; /* the last position written */ ! 134: int iLastOffset; /* the last start offset written */ ! 135: } DocList; ! 136: ! 137: /* Initialize a new DocList to hold the given data. */ ! 138: static void docListInit(DocList *d, DocListType iType, ! 139: const char *pData, int nData){ ! 140: d->nData = nData; ! 141: if( nData>0 ){ ! 142: d->pData = malloc(nData); ! 143: memcpy(d->pData, pData, nData); ! 144: } else { ! 145: d->pData = NULL; ! 146: } ! 147: d->iType = iType; ! 148: d->iLastPos = 0; ! 149: d->iLastOffset = 0; ! 150: } ! 151: ! 152: /* Create a new dynamically-allocated DocList. */ ! 153: static DocList *docListNew(DocListType iType){ ! 154: DocList *d = (DocList *) malloc(sizeof(DocList)); ! 155: docListInit(d, iType, 0, 0); ! 156: return d; ! 157: } ! 158: ! 159: static void docListDestroy(DocList *d){ ! 160: free(d->pData); ! 161: #ifndef NDEBUG ! 162: memset(d, 0x55, sizeof(*d)); ! 163: #endif ! 164: } ! 165: ! 166: static void docListDelete(DocList *d){ ! 167: docListDestroy(d); ! 168: free(d); ! 169: } ! 170: ! 171: static char *docListEnd(DocList *d){ ! 172: return d->pData + d->nData; ! 173: } ! 174: ! 175: /* Append a varint to a DocList's data. */ ! 176: static void appendVarint(DocList *d, sqlite_int64 i){ ! 177: char c[VARINT_MAX]; ! 178: int n = putVarint(c, i); ! 179: d->pData = realloc(d->pData, d->nData + n); ! 180: memcpy(d->pData + d->nData, c, n); ! 181: d->nData += n; ! 182: } ! 183: ! 184: static void docListAddDocid(DocList *d, sqlite_int64 iDocid){ ! 185: appendVarint(d, iDocid); ! 186: d->iLastPos = 0; ! 187: } ! 188: ! 189: /* Add a position to the last position list in a doclist. */ ! 190: static void docListAddPos(DocList *d, int iPos){ ! 191: assert( d->iType>=DL_POSITIONS ); ! 192: appendVarint(d, iPos-d->iLastPos+1); ! 193: d->iLastPos = iPos; ! 194: } ! 195: ! 196: static void docListAddPosOffset(DocList *d, int iPos, ! 197: int iStartOffset, int iEndOffset){ ! 198: assert( d->iType==DL_POSITIONS_OFFSETS ); ! 199: docListAddPos(d, iPos); ! 200: appendVarint(d, iStartOffset-d->iLastOffset); ! 201: d->iLastOffset = iStartOffset; ! 202: appendVarint(d, iEndOffset-iStartOffset); ! 203: } ! 204: ! 205: /* Terminate the last position list in the given doclist. */ ! 206: static void docListAddEndPos(DocList *d){ ! 207: appendVarint(d, 0); ! 208: } ! 209: ! 210: typedef struct DocListReader { ! 211: DocList *pDoclist; ! 212: char *p; ! 213: int iLastPos; /* the last position read */ ! 214: } DocListReader; ! 215: ! 216: static void readerInit(DocListReader *r, DocList *pDoclist){ ! 217: r->pDoclist = pDoclist; ! 218: if( pDoclist!=NULL ){ ! 219: r->p = pDoclist->pData; ! 220: } ! 221: r->iLastPos = 0; ! 222: } ! 223: ! 224: static int readerAtEnd(DocListReader *pReader){ ! 225: return pReader->p >= docListEnd(pReader->pDoclist); ! 226: } ! 227: ! 228: /* Peek at the next docid without advancing the read pointer. */ ! 229: static sqlite_int64 peekDocid(DocListReader *pReader){ ! 230: sqlite_int64 ret; ! 231: assert( !readerAtEnd(pReader) ); ! 232: getVarint(pReader->p, &ret); ! 233: return ret; ! 234: } ! 235: ! 236: /* Read the next docid. */ ! 237: static sqlite_int64 readDocid(DocListReader *pReader){ ! 238: sqlite_int64 ret; ! 239: assert( !readerAtEnd(pReader) ); ! 240: pReader->p += getVarint(pReader->p, &ret); ! 241: pReader->iLastPos = 0; ! 242: return ret; ! 243: } ! 244: ! 245: /* Read the next position from a position list. ! 246: * Returns the position, or -1 at the end of the list. */ ! 247: static int readPosition(DocListReader *pReader){ ! 248: int i; ! 249: int iType = pReader->pDoclist->iType; ! 250: assert( iType>=DL_POSITIONS ); ! 251: assert( !readerAtEnd(pReader) ); ! 252: ! 253: pReader->p += getVarint32(pReader->p, &i); ! 254: if( i==0 ){ ! 255: pReader->iLastPos = -1; ! 256: return -1; ! 257: } ! 258: pReader->iLastPos += ((int) i)-1; ! 259: if( iType>=DL_POSITIONS_OFFSETS ){ ! 260: /* Skip over offsets, ignoring them for now. */ ! 261: int iStart, iEnd; ! 262: pReader->p += getVarint32(pReader->p, &iStart); ! 263: pReader->p += getVarint32(pReader->p, &iEnd); ! 264: } ! 265: return pReader->iLastPos; ! 266: } ! 267: ! 268: /* Skip past the end of a position list. */ ! 269: static void skipPositionList(DocListReader *pReader){ ! 270: while( readPosition(pReader)!=-1 ) ! 271: ; ! 272: } ! 273: ! 274: /* Skip over a docid, including its position list if the doclist has ! 275: * positions. */ ! 276: static void skipDocument(DocListReader *pReader){ ! 277: readDocid(pReader); ! 278: if( pReader->pDoclist->iType >= DL_POSITIONS ){ ! 279: skipPositionList(pReader); ! 280: } ! 281: } ! 282: ! 283: static sqlite_int64 firstDocid(DocList *d){ ! 284: DocListReader r; ! 285: readerInit(&r, d); ! 286: return readDocid(&r); ! 287: } ! 288: ! 289: /* Doclist multi-tool. Pass pUpdate==NULL to delete the indicated docid; ! 290: * otherwise pUpdate, which must contain only the single docid [iDocid], is ! 291: * inserted (if not present) or updated (if already present). */ ! 292: static int docListUpdate(DocList *d, sqlite_int64 iDocid, DocList *pUpdate){ ! 293: int modified = 0; ! 294: DocListReader reader; ! 295: char *p; ! 296: ! 297: if( pUpdate!=NULL ){ ! 298: assert( d->iType==pUpdate->iType); ! 299: assert( iDocid==firstDocid(pUpdate) ); ! 300: } ! 301: ! 302: readerInit(&reader, d); ! 303: while( !readerAtEnd(&reader) && peekDocid(&reader)<iDocid ){ ! 304: skipDocument(&reader); ! 305: } ! 306: ! 307: p = reader.p; ! 308: /* Delete if there is a matching element. */ ! 309: if( !readerAtEnd(&reader) && iDocid==peekDocid(&reader) ){ ! 310: skipDocument(&reader); ! 311: memmove(p, reader.p, docListEnd(d) - reader.p); ! 312: d->nData -= (reader.p - p); ! 313: modified = 1; ! 314: } ! 315: ! 316: /* Insert if indicated. */ ! 317: if( pUpdate!=NULL ){ ! 318: int iDoclist = p-d->pData; ! 319: docListAddEndPos(pUpdate); ! 320: ! 321: d->pData = realloc(d->pData, d->nData+pUpdate->nData); ! 322: p = d->pData + iDoclist; ! 323: ! 324: memmove(p+pUpdate->nData, p, docListEnd(d) - p); ! 325: memcpy(p, pUpdate->pData, pUpdate->nData); ! 326: d->nData += pUpdate->nData; ! 327: modified = 1; ! 328: } ! 329: ! 330: return modified; ! 331: } ! 332: ! 333: /* Split the second half of doclist d into a separate doclist d2. Returns 1 ! 334: * if successful, or 0 if d contains a single document and hence can't be ! 335: * split. */ ! 336: static int docListSplit(DocList *d, DocList *d2){ ! 337: const char *pSplitPoint = d->pData + d->nData / 2; ! 338: DocListReader reader; ! 339: ! 340: readerInit(&reader, d); ! 341: while( reader.p<pSplitPoint ){ ! 342: skipDocument(&reader); ! 343: } ! 344: if( readerAtEnd(&reader) ) return 0; ! 345: docListInit(d2, d->iType, reader.p, docListEnd(d) - reader.p); ! 346: d->nData = reader.p - d->pData; ! 347: d->pData = realloc(d->pData, d->nData); ! 348: return 1; ! 349: } ! 350: ! 351: /* A DocListMerge computes the AND of an in-memory DocList [in] and a chunked ! 352: * on-disk doclist, resulting in another in-memory DocList [out]. [in] ! 353: * and [out] may or may not store position information according to the ! 354: * caller's wishes. The on-disk doclist always comes with positions. ! 355: * ! 356: * The caller must read each chunk of the on-disk doclist in succession and ! 357: * pass it to mergeBlock(). ! 358: * ! 359: * If [in] has positions, then the merge output contains only documents with ! 360: * matching positions in the two input doclists. If [in] does not have ! 361: * positions, then the merge output contains all documents common to the two ! 362: * input doclists. ! 363: * ! 364: * If [in] is NULL, then the on-disk doclist is copied to [out] directly. ! 365: * ! 366: * A merge is performed using an integer [iOffset] provided by the caller. ! 367: * [iOffset] is subtracted from each position in the on-disk doclist for the ! 368: * purpose of position comparison; this is helpful in implementing phrase ! 369: * searches. ! 370: * ! 371: * A DocListMerge is not yet able to propagate offsets through query ! 372: * processing; we should add that capability soon. ! 373: */ ! 374: typedef struct DocListMerge { ! 375: DocListReader in; ! 376: DocList *pOut; ! 377: int iOffset; ! 378: } DocListMerge; ! 379: ! 380: static void mergeInit(DocListMerge *m, ! 381: DocList *pIn, int iOffset, DocList *pOut){ ! 382: readerInit(&m->in, pIn); ! 383: m->pOut = pOut; ! 384: m->iOffset = iOffset; ! 385: ! 386: /* can't handle offsets yet */ ! 387: assert( pIn==NULL || pIn->iType <= DL_POSITIONS ); ! 388: assert( pOut->iType <= DL_POSITIONS ); ! 389: } ! 390: ! 391: /* A helper function for mergeBlock(), below. Merge the position lists ! 392: * pointed to by m->in and pBlockReader. ! 393: * If the merge matches, write [iDocid] to m->pOut; if m->pOut ! 394: * has positions then write all matching positions as well. */ ! 395: static void mergePosList(DocListMerge *m, sqlite_int64 iDocid, ! 396: DocListReader *pBlockReader){ ! 397: int block_pos = readPosition(pBlockReader); ! 398: int in_pos = readPosition(&m->in); ! 399: int match = 0; ! 400: while( block_pos!=-1 || in_pos!=-1 ){ ! 401: if( block_pos-m->iOffset==in_pos ){ ! 402: if( !match ){ ! 403: docListAddDocid(m->pOut, iDocid); ! 404: match = 1; ! 405: } ! 406: if( m->pOut->iType >= DL_POSITIONS ){ ! 407: docListAddPos(m->pOut, in_pos); ! 408: } ! 409: block_pos = readPosition(pBlockReader); ! 410: in_pos = readPosition(&m->in); ! 411: } else if( in_pos==-1 || (block_pos!=-1 && block_pos-m->iOffset<in_pos) ){ ! 412: block_pos = readPosition(pBlockReader); ! 413: } else { ! 414: in_pos = readPosition(&m->in); ! 415: } ! 416: } ! 417: if( m->pOut->iType >= DL_POSITIONS && match ){ ! 418: docListAddEndPos(m->pOut); ! 419: } ! 420: } ! 421: ! 422: /* Merge one block of an on-disk doclist into a DocListMerge. */ ! 423: static void mergeBlock(DocListMerge *m, DocList *pBlock){ ! 424: DocListReader blockReader; ! 425: assert( pBlock->iType >= DL_POSITIONS ); ! 426: readerInit(&blockReader, pBlock); ! 427: while( !readerAtEnd(&blockReader) ){ ! 428: sqlite_int64 iDocid = readDocid(&blockReader); ! 429: if( m->in.pDoclist!=NULL ){ ! 430: while( 1 ){ ! 431: if( readerAtEnd(&m->in) ) return; /* nothing more to merge */ ! 432: if( peekDocid(&m->in)>=iDocid ) break; ! 433: skipDocument(&m->in); ! 434: } ! 435: if( peekDocid(&m->in)>iDocid ){ /* [pIn] has no match with iDocid */ ! 436: skipPositionList(&blockReader); /* skip this docid in the block */ ! 437: continue; ! 438: } ! 439: readDocid(&m->in); ! 440: } ! 441: /* We have a document match. */ ! 442: if( m->in.pDoclist==NULL || m->in.pDoclist->iType < DL_POSITIONS ){ ! 443: /* We don't need to do a poslist merge. */ ! 444: docListAddDocid(m->pOut, iDocid); ! 445: if( m->pOut->iType >= DL_POSITIONS ){ ! 446: /* Copy all positions to the output doclist. */ ! 447: while( 1 ){ ! 448: int pos = readPosition(&blockReader); ! 449: if( pos==-1 ) break; ! 450: docListAddPos(m->pOut, pos); ! 451: } ! 452: docListAddEndPos(m->pOut); ! 453: } else skipPositionList(&blockReader); ! 454: continue; ! 455: } ! 456: mergePosList(m, iDocid, &blockReader); ! 457: } ! 458: } ! 459: ! 460: static char *string_dup_n(const char *s, int n){ ! 461: char *str = malloc(n + 1); ! 462: memcpy(str, s, n); ! 463: str[n] = '\0'; ! 464: return str; ! 465: } ! 466: ! 467: /* Duplicate a string; the caller must free() the returned string. ! 468: * (We don't use strdup() since it's not part of the standard C library and ! 469: * may not be available everywhere.) */ ! 470: static char *string_dup(const char *s){ ! 471: return string_dup_n(s, strlen(s)); ! 472: } ! 473: ! 474: /* Format a string, replacing each occurrence of the % character with ! 475: * zName. This may be more convenient than sqlite_mprintf() ! 476: * when one string is used repeatedly in a format string. ! 477: * The caller must free() the returned string. */ ! 478: static char *string_format(const char *zFormat, const char *zName){ ! 479: const char *p; ! 480: size_t len = 0; ! 481: size_t nName = strlen(zName); ! 482: char *result; ! 483: char *r; ! 484: ! 485: /* first compute length needed */ ! 486: for(p = zFormat ; *p ; ++p){ ! 487: len += (*p=='%' ? nName : 1); ! 488: } ! 489: len += 1; /* for null terminator */ ! 490: ! 491: r = result = malloc(len); ! 492: for(p = zFormat; *p; ++p){ ! 493: if( *p=='%' ){ ! 494: memcpy(r, zName, nName); ! 495: r += nName; ! 496: } else { ! 497: *r++ = *p; ! 498: } ! 499: } ! 500: *r++ = '\0'; ! 501: assert( r == result + len ); ! 502: return result; ! 503: } ! 504: ! 505: static int sql_exec(sqlite3 *db, const char *zName, const char *zFormat){ ! 506: char *zCommand = string_format(zFormat, zName); ! 507: int rc = sqlite3_exec(db, zCommand, NULL, 0, NULL); ! 508: free(zCommand); ! 509: return rc; ! 510: } ! 511: ! 512: static int sql_prepare(sqlite3 *db, const char *zName, sqlite3_stmt **ppStmt, ! 513: const char *zFormat){ ! 514: char *zCommand = string_format(zFormat, zName); ! 515: int rc = sqlite3_prepare(db, zCommand, -1, ppStmt, NULL); ! 516: free(zCommand); ! 517: return rc; ! 518: } ! 519: ! 520: /* end utility functions */ ! 521: ! 522: #define QUERY_GENERIC 0 ! 523: #define QUERY_FULLTEXT 1 ! 524: ! 525: #define CHUNK_MAX 1024 ! 526: ! 527: typedef enum fulltext_statement { ! 528: CONTENT_INSERT_STMT, ! 529: CONTENT_SELECT_STMT, ! 530: CONTENT_DELETE_STMT, ! 531: ! 532: TERM_SELECT_STMT, ! 533: TERM_CHUNK_SELECT_STMT, ! 534: TERM_INSERT_STMT, ! 535: TERM_UPDATE_STMT, ! 536: TERM_DELETE_STMT, ! 537: ! 538: MAX_STMT /* Always at end! */ ! 539: } fulltext_statement; ! 540: ! 541: /* These must exactly match the enum above. */ ! 542: /* TODO(adam): Is there some risk that a statement (in particular, ! 543: ** pTermSelectStmt) will be used in two cursors at once, e.g. if a ! 544: ** query joins a virtual table to itself? If so perhaps we should ! 545: ** move some of these to the cursor object. ! 546: */ ! 547: static const char *fulltext_zStatement[MAX_STMT] = { ! 548: /* CONTENT_INSERT */ "insert into %_content (rowid, content) values (?, ?)", ! 549: /* CONTENT_SELECT */ "select content from %_content where rowid = ?", ! 550: /* CONTENT_DELETE */ "delete from %_content where rowid = ?", ! 551: ! 552: /* TERM_SELECT */ ! 553: "select rowid, doclist from %_term where term = ? and first = ?", ! 554: /* TERM_CHUNK_SELECT */ ! 555: "select max(first) from %_term where term = ? and first <= ?", ! 556: /* TERM_INSERT */ ! 557: "insert into %_term (term, first, doclist) values (?, ?, ?)", ! 558: /* TERM_UPDATE */ "update %_term set doclist = ? where rowid = ?", ! 559: /* TERM_DELETE */ "delete from %_term where rowid = ?", ! 560: }; ! 561: ! 562: typedef struct fulltext_vtab { ! 563: sqlite3_vtab base; ! 564: sqlite3 *db; ! 565: const char *zName; /* virtual table name */ ! 566: sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */ ! 567: ! 568: /* Precompiled statements which we keep as long as the table is ! 569: ** open. ! 570: */ ! 571: sqlite3_stmt *pFulltextStatements[MAX_STMT]; ! 572: } fulltext_vtab; ! 573: ! 574: typedef struct fulltext_cursor { ! 575: sqlite3_vtab_cursor base; ! 576: int iCursorType; /* QUERY_GENERIC or QUERY_FULLTEXT */ ! 577: ! 578: sqlite3_stmt *pStmt; ! 579: ! 580: int eof; ! 581: ! 582: /* The following is used only when iCursorType == QUERY_FULLTEXT. */ ! 583: DocListReader result; ! 584: } fulltext_cursor; ! 585: ! 586: static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){ ! 587: return (fulltext_vtab *) c->base.pVtab; ! 588: } ! 589: ! 590: static sqlite3_module fulltextModule; /* forward declaration */ ! 591: ! 592: /* Puts a freshly-prepared statement determined by iStmt in *ppStmt. ! 593: ** If the indicated statement has never been prepared, it is prepared ! 594: ** and cached, otherwise the cached version is reset. ! 595: */ ! 596: static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt, ! 597: sqlite3_stmt **ppStmt){ ! 598: assert( iStmt<MAX_STMT ); ! 599: if( v->pFulltextStatements[iStmt]==NULL ){ ! 600: int rc = sql_prepare(v->db, v->zName, &v->pFulltextStatements[iStmt], ! 601: fulltext_zStatement[iStmt]); ! 602: if( rc!=SQLITE_OK ) return rc; ! 603: } else { ! 604: int rc = sqlite3_reset(v->pFulltextStatements[iStmt]); ! 605: if( rc!=SQLITE_OK ) return rc; ! 606: } ! 607: ! 608: *ppStmt = v->pFulltextStatements[iStmt]; ! 609: return SQLITE_OK; ! 610: } ! 611: ! 612: /* Step the indicated statement, handling errors SQLITE_BUSY (by ! 613: ** retrying) and SQLITE_SCHEMA (by re-preparing and transferring ! 614: ** bindings to the new statement). ! 615: ** TODO(adam): We should extend this function so that it can work with ! 616: ** statements declared locally, not only globally cached statements. ! 617: */ ! 618: static int sql_step_statement(fulltext_vtab *v, fulltext_statement iStmt, ! 619: sqlite3_stmt **ppStmt){ ! 620: int rc; ! 621: sqlite3_stmt *s = *ppStmt; ! 622: assert( iStmt<MAX_STMT ); ! 623: assert( s==v->pFulltextStatements[iStmt] ); ! 624: ! 625: while( (rc=sqlite3_step(s))!=SQLITE_DONE && rc!=SQLITE_ROW ){ ! 626: sqlite3_stmt *pNewStmt; ! 627: ! 628: if( rc==SQLITE_BUSY ) continue; ! 629: if( rc!=SQLITE_ERROR ) return rc; ! 630: ! 631: rc = sqlite3_reset(s); ! 632: if( rc!=SQLITE_SCHEMA ) return SQLITE_ERROR; ! 633: ! 634: v->pFulltextStatements[iStmt] = NULL; /* Still in s */ ! 635: rc = sql_get_statement(v, iStmt, &pNewStmt); ! 636: if( rc!=SQLITE_OK ) goto err; ! 637: *ppStmt = pNewStmt; ! 638: ! 639: rc = sqlite3_transfer_bindings(s, pNewStmt); ! 640: if( rc!=SQLITE_OK ) goto err; ! 641: ! 642: rc = sqlite3_finalize(s); ! 643: if( rc!=SQLITE_OK ) return rc; ! 644: s = pNewStmt; ! 645: } ! 646: return rc; ! 647: ! 648: err: ! 649: sqlite3_finalize(s); ! 650: return rc; ! 651: } ! 652: ! 653: /* Like sql_step_statement(), but convert SQLITE_DONE to SQLITE_OK. ! 654: ** Useful for statements like UPDATE, where we expect no results. ! 655: */ ! 656: static int sql_single_step_statement(fulltext_vtab *v, ! 657: fulltext_statement iStmt, ! 658: sqlite3_stmt **ppStmt){ ! 659: int rc = sql_step_statement(v, iStmt, ppStmt); ! 660: return (rc==SQLITE_DONE) ? SQLITE_OK : rc; ! 661: } ! 662: ! 663: /* insert into %_content (rowid, content) values ([rowid], [zContent]) */ ! 664: static int content_insert(fulltext_vtab *v, sqlite3_value *rowid, ! 665: const char *zContent, int nContent){ ! 666: sqlite3_stmt *s; ! 667: int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s); ! 668: if( rc!=SQLITE_OK ) return rc; ! 669: ! 670: rc = sqlite3_bind_value(s, 1, rowid); ! 671: if( rc!=SQLITE_OK ) return rc; ! 672: ! 673: rc = sqlite3_bind_text(s, 2, zContent, nContent, SQLITE_STATIC); ! 674: if( rc!=SQLITE_OK ) return rc; ! 675: ! 676: return sql_single_step_statement(v, CONTENT_INSERT_STMT, &s); ! 677: } ! 678: ! 679: /* select content from %_content where rowid = [iRow] ! 680: * The caller must delete the returned string. */ ! 681: static int content_select(fulltext_vtab *v, sqlite_int64 iRow, ! 682: char **pzContent){ ! 683: sqlite3_stmt *s; ! 684: int rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s); ! 685: if( rc!=SQLITE_OK ) return rc; ! 686: ! 687: rc = sqlite3_bind_int64(s, 1, iRow); ! 688: if( rc!=SQLITE_OK ) return rc; ! 689: ! 690: rc = sql_step_statement(v, CONTENT_SELECT_STMT, &s); ! 691: if( rc!=SQLITE_ROW ) return rc; ! 692: ! 693: *pzContent = string_dup((const char *)sqlite3_column_text(s, 0)); ! 694: ! 695: /* We expect only one row. We must execute another sqlite3_step() ! 696: * to complete the iteration; otherwise the table will remain locked. */ ! 697: rc = sqlite3_step(s); ! 698: if( rc==SQLITE_DONE ) return SQLITE_OK; ! 699: ! 700: free(*pzContent); ! 701: return rc; ! 702: } ! 703: ! 704: /* delete from %_content where rowid = [iRow ] */ ! 705: static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){ ! 706: sqlite3_stmt *s; ! 707: int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s); ! 708: if( rc!=SQLITE_OK ) return rc; ! 709: ! 710: rc = sqlite3_bind_int64(s, 1, iRow); ! 711: if( rc!=SQLITE_OK ) return rc; ! 712: ! 713: return sql_single_step_statement(v, CONTENT_DELETE_STMT, &s); ! 714: } ! 715: ! 716: /* select rowid, doclist from %_term where term = [zTerm] and first = [iFirst] ! 717: * If found, returns SQLITE_OK; the caller must free the returned doclist. ! 718: * If no rows found, returns SQLITE_ERROR. */ ! 719: static int term_select(fulltext_vtab *v, const char *zTerm, int nTerm, ! 720: sqlite_int64 iFirst, ! 721: sqlite_int64 *rowid, ! 722: DocList *out){ ! 723: sqlite3_stmt *s; ! 724: int rc = sql_get_statement(v, TERM_SELECT_STMT, &s); ! 725: if( rc!=SQLITE_OK ) return rc; ! 726: ! 727: rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_TRANSIENT); ! 728: if( rc!=SQLITE_OK ) return rc; ! 729: ! 730: rc = sqlite3_bind_int64(s, 2, iFirst); ! 731: if( rc!=SQLITE_OK ) return rc; ! 732: ! 733: rc = sql_step_statement(v, TERM_SELECT_STMT, &s); ! 734: if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc; ! 735: ! 736: *rowid = sqlite3_column_int64(s, 0); ! 737: docListInit(out, DL_POSITIONS_OFFSETS, ! 738: sqlite3_column_blob(s, 1), sqlite3_column_bytes(s, 1)); ! 739: ! 740: /* We expect only one row. We must execute another sqlite3_step() ! 741: * to complete the iteration; otherwise the table will remain locked. */ ! 742: rc = sqlite3_step(s); ! 743: return rc==SQLITE_DONE ? SQLITE_OK : rc; ! 744: } ! 745: ! 746: /* select max(first) from %_term where term = [zTerm] and first <= [iFirst] ! 747: * If found, returns SQLITE_ROW and result in *piResult; if the query returns ! 748: * NULL (meaning no row found) returns SQLITE_DONE. ! 749: */ ! 750: static int term_chunk_select(fulltext_vtab *v, const char *zTerm, int nTerm, ! 751: sqlite_int64 iFirst, sqlite_int64 *piResult){ ! 752: sqlite3_stmt *s; ! 753: int rc = sql_get_statement(v, TERM_CHUNK_SELECT_STMT, &s); ! 754: if( rc!=SQLITE_OK ) return rc; ! 755: ! 756: rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC); ! 757: if( rc!=SQLITE_OK ) return rc; ! 758: ! 759: rc = sqlite3_bind_int64(s, 2, iFirst); ! 760: if( rc!=SQLITE_OK ) return rc; ! 761: ! 762: rc = sql_step_statement(v, TERM_CHUNK_SELECT_STMT, &s); ! 763: if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc; ! 764: ! 765: switch( sqlite3_column_type(s, 0) ){ ! 766: case SQLITE_NULL: ! 767: rc = SQLITE_DONE; ! 768: break; ! 769: case SQLITE_INTEGER: ! 770: *piResult = sqlite3_column_int64(s, 0); ! 771: break; ! 772: default: ! 773: return SQLITE_ERROR; ! 774: } ! 775: /* We expect only one row. We must execute another sqlite3_step() ! 776: * to complete the iteration; otherwise the table will remain locked. */ ! 777: if( sqlite3_step(s) != SQLITE_DONE ) return SQLITE_ERROR; ! 778: return rc; ! 779: } ! 780: ! 781: /* insert into %_term (term, first, doclist) ! 782: values ([zTerm], [iFirst], [doclist]) */ ! 783: static int term_insert(fulltext_vtab *v, const char *zTerm, int nTerm, ! 784: sqlite_int64 iFirst, DocList *doclist){ ! 785: sqlite3_stmt *s; ! 786: int rc = sql_get_statement(v, TERM_INSERT_STMT, &s); ! 787: if( rc!=SQLITE_OK ) return rc; ! 788: ! 789: rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC); ! 790: if( rc!=SQLITE_OK ) return rc; ! 791: ! 792: rc = sqlite3_bind_int64(s, 2, iFirst); ! 793: if( rc!=SQLITE_OK ) return rc; ! 794: ! 795: rc = sqlite3_bind_blob(s, 3, doclist->pData, doclist->nData, SQLITE_STATIC); ! 796: if( rc!=SQLITE_OK ) return rc; ! 797: ! 798: return sql_single_step_statement(v, TERM_INSERT_STMT, &s); ! 799: } ! 800: ! 801: /* update %_term set doclist = [doclist] where rowid = [rowid] */ ! 802: static int term_update(fulltext_vtab *v, sqlite_int64 rowid, ! 803: DocList *doclist){ ! 804: sqlite3_stmt *s; ! 805: int rc = sql_get_statement(v, TERM_UPDATE_STMT, &s); ! 806: if( rc!=SQLITE_OK ) return rc; ! 807: ! 808: rc = sqlite3_bind_blob(s, 1, doclist->pData, doclist->nData, ! 809: SQLITE_STATIC); ! 810: if( rc!=SQLITE_OK ) return rc; ! 811: ! 812: rc = sqlite3_bind_int64(s, 2, rowid); ! 813: if( rc!=SQLITE_OK ) return rc; ! 814: ! 815: return sql_single_step_statement(v, TERM_UPDATE_STMT, &s); ! 816: } ! 817: ! 818: static int term_delete(fulltext_vtab *v, sqlite_int64 rowid){ ! 819: sqlite3_stmt *s; ! 820: int rc = sql_get_statement(v, TERM_DELETE_STMT, &s); ! 821: if( rc!=SQLITE_OK ) return rc; ! 822: ! 823: rc = sqlite3_bind_int64(s, 1, rowid); ! 824: if( rc!=SQLITE_OK ) return rc; ! 825: ! 826: return sql_single_step_statement(v, TERM_DELETE_STMT, &s); ! 827: } ! 828: ! 829: static void fulltext_vtab_destroy(fulltext_vtab *v){ ! 830: int iStmt; ! 831: ! 832: for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){ ! 833: if( v->pFulltextStatements[iStmt]!=NULL ){ ! 834: sqlite3_finalize(v->pFulltextStatements[iStmt]); ! 835: v->pFulltextStatements[iStmt] = NULL; ! 836: } ! 837: } ! 838: ! 839: if( v->pTokenizer!=NULL ){ ! 840: v->pTokenizer->pModule->xDestroy(v->pTokenizer); ! 841: v->pTokenizer = NULL; ! 842: } ! 843: ! 844: free((void *) v->zName); ! 845: free(v); ! 846: } ! 847: ! 848: /* Current interface: ! 849: ** argv[0] - module name ! 850: ** argv[1] - database name ! 851: ** argv[2] - table name ! 852: ** argv[3] - tokenizer name (optional, a sensible default is provided) ! 853: ** argv[4..] - passed to tokenizer (optional based on tokenizer) ! 854: **/ ! 855: static int fulltextConnect(sqlite3 *db, void *pAux, int argc, char **argv, ! 856: sqlite3_vtab **ppVTab){ ! 857: int rc; ! 858: fulltext_vtab *v; ! 859: sqlite3_tokenizer_module *m = NULL; ! 860: ! 861: assert( argc>=3 ); ! 862: v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab)); ! 863: /* sqlite will initialize v->base */ ! 864: v->db = db; ! 865: v->zName = string_dup(argv[2]); ! 866: v->pTokenizer = NULL; ! 867: ! 868: if( argc==3 ){ ! 869: get_simple_tokenizer_module(&m); ! 870: } else { ! 871: /* TODO(shess) For now, add new tokenizers as else if clauses. */ ! 872: if( !strcmp(argv[3], "simple") ){ ! 873: get_simple_tokenizer_module(&m); ! 874: } else { ! 875: assert( "unrecognized tokenizer"==NULL ); ! 876: } ! 877: } ! 878: ! 879: /* TODO(shess) Since tokenization impacts the index, the parameters ! 880: ** to the tokenizer need to be identical when a persistent virtual ! 881: ** table is re-created. One solution would be a meta-table to track ! 882: ** such information in the database. Then we could verify that the ! 883: ** information is identical on subsequent creates. ! 884: */ ! 885: /* TODO(shess) Why isn't argv already (const char **)? */ ! 886: rc = m->xCreate(argc-3, (const char **) (argv+3), &v->pTokenizer); ! 887: if( rc!=SQLITE_OK ) return rc; ! 888: v->pTokenizer->pModule = m; ! 889: ! 890: /* TODO: verify the existence of backing tables foo_content, foo_term */ ! 891: ! 892: rc = sqlite3_declare_vtab(db, "create table x(content text)"); ! 893: if( rc!=SQLITE_OK ) return rc; ! 894: ! 895: memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements)); ! 896: ! 897: *ppVTab = &v->base; ! 898: return SQLITE_OK; ! 899: } ! 900: ! 901: static int fulltextCreate(sqlite3 *db, void *pAux, int argc, char **argv, ! 902: sqlite3_vtab **ppVTab){ ! 903: int rc; ! 904: assert( argc>=3 ); ! 905: ! 906: /* The %_content table holds the text of each full-text item, with ! 907: ** the rowid used as the docid. ! 908: ** ! 909: ** The %_term table maps each term to a document list blob ! 910: ** containing elements sorted by ascending docid, each element ! 911: ** encoded as: ! 912: ** ! 913: ** docid varint-encoded ! 914: ** token count varint-encoded ! 915: ** "count" token elements (poslist): ! 916: ** position varint-encoded as delta from previous position ! 917: ** start offset varint-encoded as delta from previous start offset ! 918: ** end offset varint-encoded as delta from start offset ! 919: ** ! 920: ** Additionally, doclist blobs can be chunked into multiple rows, ! 921: ** using "first" to order the blobs. "first" is simply the first ! 922: ** docid in the blob. ! 923: */ ! 924: /* ! 925: ** NOTE(shess) That last sentence is incorrect in the face of ! 926: ** deletion, which can leave a doclist that doesn't contain the ! 927: ** first from that row. I _believe_ this does not matter to the ! 928: ** operation of the system, but it might be reasonable to update ! 929: ** appropriately in case this assumption becomes more important. ! 930: */ ! 931: rc = sql_exec(db, argv[2], ! 932: "create table %_content(content text);" ! 933: "create table %_term(term text, first integer, doclist blob);" ! 934: "create index %_index on %_term(term, first)"); ! 935: if( rc!=SQLITE_OK ) return rc; ! 936: ! 937: return fulltextConnect(db, pAux, argc, argv, ppVTab); ! 938: } ! 939: ! 940: /* Decide how to handle an SQL query. ! 941: * At the moment, MATCH queries can include implicit boolean ANDs; we ! 942: * haven't implemented phrase searches or OR yet. */ ! 943: static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){ ! 944: int i; ! 945: ! 946: for(i=0; i<pInfo->nConstraint; ++i){ ! 947: const struct sqlite3_index_constraint *pConstraint; ! 948: pConstraint = &pInfo->aConstraint[i]; ! 949: if( pConstraint->iColumn==0 && ! 950: pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH && ! 951: pConstraint->usable ){ /* a full-text search */ ! 952: pInfo->aConstraintUsage[i].argvIndex = 1; ! 953: pInfo->aConstraintUsage[i].omit = 1; ! 954: pInfo->idxNum = QUERY_FULLTEXT; ! 955: pInfo->estimatedCost = 1.0; /* an arbitrary value for now */ ! 956: return SQLITE_OK; ! 957: } ! 958: } ! 959: pInfo->idxNum = QUERY_GENERIC; ! 960: return SQLITE_OK; ! 961: } ! 962: ! 963: static int fulltextDisconnect(sqlite3_vtab *pVTab){ ! 964: fulltext_vtab_destroy((fulltext_vtab *)pVTab); ! 965: return SQLITE_OK; ! 966: } ! 967: ! 968: static int fulltextDestroy(sqlite3_vtab *pVTab){ ! 969: fulltext_vtab *v = (fulltext_vtab *)pVTab; ! 970: ! 971: int rc = sql_exec(v->db, v->zName, ! 972: "drop table %_content; drop table %_term"); ! 973: if( rc!=SQLITE_OK ) return rc; ! 974: ! 975: fulltext_vtab_destroy((fulltext_vtab *)pVTab); ! 976: return SQLITE_OK; ! 977: } ! 978: ! 979: static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){ ! 980: fulltext_cursor *c; ! 981: ! 982: c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1); ! 983: /* sqlite will initialize c->base */ ! 984: *ppCursor = &c->base; ! 985: ! 986: return SQLITE_OK; ! 987: } ! 988: ! 989: static int fulltextClose(sqlite3_vtab_cursor *pCursor){ ! 990: fulltext_cursor *c = (fulltext_cursor *) pCursor; ! 991: sqlite3_finalize(c->pStmt); ! 992: if( c->result.pDoclist!=NULL ){ ! 993: docListDelete(c->result.pDoclist); ! 994: } ! 995: free(c); ! 996: return SQLITE_OK; ! 997: } ! 998: ! 999: static int fulltextNext(sqlite3_vtab_cursor *pCursor){ ! 1000: fulltext_cursor *c = (fulltext_cursor *) pCursor; ! 1001: sqlite_int64 iDocid; ! 1002: int rc; ! 1003: ! 1004: switch( c->iCursorType ){ ! 1005: case QUERY_GENERIC: ! 1006: /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ ! 1007: rc = sqlite3_step(c->pStmt); ! 1008: switch( rc ){ ! 1009: case SQLITE_ROW: ! 1010: c->eof = 0; ! 1011: return SQLITE_OK; ! 1012: case SQLITE_DONE: ! 1013: c->eof = 1; ! 1014: return SQLITE_OK; ! 1015: default: ! 1016: c->eof = 1; ! 1017: return rc; ! 1018: } ! 1019: case QUERY_FULLTEXT: ! 1020: rc = sqlite3_reset(c->pStmt); ! 1021: if( rc!=SQLITE_OK ) return rc; ! 1022: ! 1023: if( readerAtEnd(&c->result)){ ! 1024: c->eof = 1; ! 1025: return SQLITE_OK; ! 1026: } ! 1027: iDocid = readDocid(&c->result); ! 1028: rc = sqlite3_bind_int64(c->pStmt, 1, iDocid); ! 1029: if( rc!=SQLITE_OK ) return rc; ! 1030: /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ ! 1031: rc = sqlite3_step(c->pStmt); ! 1032: if( rc==SQLITE_ROW ){ /* the case we expect */ ! 1033: c->eof = 0; ! 1034: return SQLITE_OK; ! 1035: } ! 1036: /* an error occurred; abort */ ! 1037: return rc==SQLITE_DONE ? SQLITE_ERROR : rc; ! 1038: default: ! 1039: assert( 0 ); ! 1040: return SQLITE_ERROR; /* not reached */ ! 1041: } ! 1042: } ! 1043: ! 1044: static int term_select_doclist(fulltext_vtab *v, const char *pTerm, int nTerm, ! 1045: sqlite3_stmt **ppStmt){ ! 1046: int rc; ! 1047: if( *ppStmt ){ ! 1048: rc = sqlite3_reset(*ppStmt); ! 1049: } else { ! 1050: rc = sql_prepare(v->db, v->zName, ppStmt, ! 1051: "select doclist from %_term where term = ? order by first"); ! 1052: } ! 1053: if( rc!=SQLITE_OK ) return rc; ! 1054: ! 1055: rc = sqlite3_bind_text(*ppStmt, 1, pTerm, nTerm, SQLITE_TRANSIENT); ! 1056: if( rc!=SQLITE_OK ) return rc; ! 1057: ! 1058: return sqlite3_step(*ppStmt); /* TODO(adamd): handle schema error */ ! 1059: } ! 1060: ! 1061: /* Read the posting list for [zTerm]; AND it with the doclist [in] to ! 1062: * produce the doclist [out], using the given offset [iOffset] for phrase ! 1063: * matching. ! 1064: * (*pSelect) is used to hold an SQLite statement used inside this function; ! 1065: * the caller should initialize *pSelect to NULL before the first call. ! 1066: */ ! 1067: static int query_merge(fulltext_vtab *v, sqlite3_stmt **pSelect, ! 1068: const char *zTerm, ! 1069: DocList *pIn, int iOffset, DocList *out){ ! 1070: int rc; ! 1071: DocListMerge merge; ! 1072: ! 1073: if( pIn!=NULL && !pIn->nData ){ ! 1074: /* If [pIn] is already empty, there's no point in reading the ! 1075: * posting list to AND it in; return immediately. */ ! 1076: return SQLITE_OK; ! 1077: } ! 1078: ! 1079: rc = term_select_doclist(v, zTerm, -1, pSelect); ! 1080: if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ) return rc; ! 1081: ! 1082: mergeInit(&merge, pIn, iOffset, out); ! 1083: while( rc==SQLITE_ROW ){ ! 1084: DocList block; ! 1085: docListInit(&block, DL_POSITIONS_OFFSETS, ! 1086: sqlite3_column_blob(*pSelect, 0), ! 1087: sqlite3_column_bytes(*pSelect, 0)); ! 1088: mergeBlock(&merge, &block); ! 1089: docListDestroy(&block); ! 1090: ! 1091: rc = sqlite3_step(*pSelect); ! 1092: if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ){ ! 1093: return rc; ! 1094: } ! 1095: } ! 1096: ! 1097: return SQLITE_OK; ! 1098: } ! 1099: ! 1100: typedef struct QueryTerm { ! 1101: int is_phrase; /* true if this term begins a new phrase */ ! 1102: const char *zTerm; ! 1103: } QueryTerm; ! 1104: ! 1105: /* A parsed query. ! 1106: * ! 1107: * As an example, parsing the query ["four score" years "new nation"] will ! 1108: * yield a Query with 5 terms: ! 1109: * "four", is_phrase = 1 ! 1110: * "score", is_phrase = 0 ! 1111: * "years", is_phrase = 1 ! 1112: * "new", is_phrase = 1 ! 1113: * "nation", is_phrase = 0 ! 1114: */ ! 1115: typedef struct Query { ! 1116: int nTerms; ! 1117: QueryTerm *pTerm; ! 1118: } Query; ! 1119: ! 1120: static void query_add(Query *q, int is_phrase, const char *zTerm){ ! 1121: QueryTerm *t; ! 1122: ++q->nTerms; ! 1123: q->pTerm = realloc(q->pTerm, q->nTerms * sizeof(q->pTerm[0])); ! 1124: t = &q->pTerm[q->nTerms - 1]; ! 1125: t->is_phrase = is_phrase; ! 1126: t->zTerm = zTerm; ! 1127: } ! 1128: ! 1129: static void query_free(Query *q){ ! 1130: int i; ! 1131: for(i = 0; i < q->nTerms; ++i){ ! 1132: free((void *) q->pTerm[i].zTerm); ! 1133: } ! 1134: free(q->pTerm); ! 1135: } ! 1136: ! 1137: static int tokenize_segment(sqlite3_tokenizer *pTokenizer, ! 1138: const char *zQuery, int in_phrase, ! 1139: Query *pQuery){ ! 1140: sqlite3_tokenizer_module *pModule = pTokenizer->pModule; ! 1141: sqlite3_tokenizer_cursor *pCursor; ! 1142: int is_first = 1; ! 1143: ! 1144: int rc = pModule->xOpen(pTokenizer, zQuery, -1, &pCursor); ! 1145: if( rc!=SQLITE_OK ) return rc; ! 1146: pCursor->pTokenizer = pTokenizer; ! 1147: ! 1148: while( 1 ){ ! 1149: const char *zToken; ! 1150: int nToken, iStartOffset, iEndOffset, dummy_pos; ! 1151: ! 1152: rc = pModule->xNext(pCursor, ! 1153: &zToken, &nToken, ! 1154: &iStartOffset, &iEndOffset, ! 1155: &dummy_pos); ! 1156: if( rc!=SQLITE_OK ) break; ! 1157: query_add(pQuery, !in_phrase || is_first, string_dup_n(zToken, nToken)); ! 1158: is_first = 0; ! 1159: } ! 1160: ! 1161: return pModule->xClose(pCursor); ! 1162: } ! 1163: ! 1164: /* Parse a query string, yielding a Query object. */ ! 1165: static int parse_query(fulltext_vtab *v, const char *zQuery, Query *pQuery){ ! 1166: char *zQuery1 = string_dup(zQuery); ! 1167: int in_phrase = 0; ! 1168: char *s = zQuery1; ! 1169: pQuery->nTerms = 0; ! 1170: pQuery->pTerm = NULL; ! 1171: ! 1172: while( *s ){ ! 1173: char *t = s; ! 1174: while( *t ){ ! 1175: if( *t=='"' ){ ! 1176: *t++ = '\0'; ! 1177: break; ! 1178: } ! 1179: ++t; ! 1180: } ! 1181: if( *s ){ ! 1182: tokenize_segment(v->pTokenizer, s, in_phrase, pQuery); ! 1183: } ! 1184: s = t; ! 1185: in_phrase = !in_phrase; ! 1186: } ! 1187: ! 1188: free(zQuery1); ! 1189: return SQLITE_OK; ! 1190: } ! 1191: ! 1192: /* Perform a full-text query; return a list of documents in [pResult]. */ ! 1193: static int fulltext_query(fulltext_vtab *v, const char *zQuery, ! 1194: DocList **pResult){ ! 1195: Query q; ! 1196: int phrase_start = -1; ! 1197: int i; ! 1198: sqlite3_stmt *pSelect = NULL; ! 1199: DocList *d = NULL; ! 1200: ! 1201: int rc = parse_query(v, zQuery, &q); ! 1202: if( rc!=SQLITE_OK ) return rc; ! 1203: ! 1204: /* Merge terms. */ ! 1205: for(i = 0 ; i < q.nTerms ; ++i){ ! 1206: /* In each merge step, we need to generate positions whenever we're ! 1207: * processing a phrase which hasn't ended yet. */ ! 1208: int need_positions = i<q.nTerms-1 && !q.pTerm[i+1].is_phrase; ! 1209: DocList *next = docListNew(need_positions ? DL_POSITIONS : DL_DOCIDS); ! 1210: if( q.pTerm[i].is_phrase ){ ! 1211: phrase_start = i; ! 1212: } ! 1213: rc = query_merge(v, &pSelect, q.pTerm[i].zTerm, d, i - phrase_start, next); ! 1214: if( rc!=SQLITE_OK ) break; ! 1215: if( d!=NULL ){ ! 1216: docListDelete(d); ! 1217: } ! 1218: d = next; ! 1219: } ! 1220: ! 1221: sqlite3_finalize(pSelect); ! 1222: query_free(&q); ! 1223: *pResult = d; ! 1224: return rc; ! 1225: } ! 1226: ! 1227: static int fulltextFilter(sqlite3_vtab_cursor *pCursor, ! 1228: int idxNum, const char *idxStr, ! 1229: int argc, sqlite3_value **argv){ ! 1230: fulltext_cursor *c = (fulltext_cursor *) pCursor; ! 1231: fulltext_vtab *v = cursor_vtab(c); ! 1232: int rc; ! 1233: const char *zStatement; ! 1234: ! 1235: c->iCursorType = idxNum; ! 1236: switch( idxNum ){ ! 1237: case QUERY_GENERIC: ! 1238: zStatement = "select rowid, content from %_content"; ! 1239: break; ! 1240: ! 1241: case QUERY_FULLTEXT: /* full-text search */ ! 1242: { ! 1243: const char *zQuery = (const char *)sqlite3_value_text(argv[0]); ! 1244: DocList *pResult; ! 1245: assert( argc==1 ); ! 1246: rc = fulltext_query(v, zQuery, &pResult); ! 1247: if( rc!=SQLITE_OK ) return rc; ! 1248: readerInit(&c->result, pResult); ! 1249: zStatement = "select rowid, content from %_content where rowid = ?"; ! 1250: break; ! 1251: } ! 1252: ! 1253: default: ! 1254: assert( 0 ); ! 1255: } ! 1256: ! 1257: rc = sql_prepare(v->db, v->zName, &c->pStmt, zStatement); ! 1258: if( rc!=SQLITE_OK ) return rc; ! 1259: ! 1260: return fulltextNext(pCursor); ! 1261: } ! 1262: ! 1263: static int fulltextEof(sqlite3_vtab_cursor *pCursor){ ! 1264: fulltext_cursor *c = (fulltext_cursor *) pCursor; ! 1265: return c->eof; ! 1266: } ! 1267: ! 1268: static int fulltextColumn(sqlite3_vtab_cursor *pCursor, ! 1269: sqlite3_context *pContext, int idxCol){ ! 1270: fulltext_cursor *c = (fulltext_cursor *) pCursor; ! 1271: const char *s; ! 1272: ! 1273: assert( idxCol==0 ); ! 1274: s = (const char *) sqlite3_column_text(c->pStmt, 1); ! 1275: sqlite3_result_text(pContext, s, -1, SQLITE_TRANSIENT); ! 1276: ! 1277: return SQLITE_OK; ! 1278: } ! 1279: ! 1280: static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){ ! 1281: fulltext_cursor *c = (fulltext_cursor *) pCursor; ! 1282: ! 1283: *pRowid = sqlite3_column_int64(c->pStmt, 0); ! 1284: return SQLITE_OK; ! 1285: } ! 1286: ! 1287: /* Build a hash table containing all terms in zText. */ ! 1288: static int build_terms(Hash *terms, sqlite3_tokenizer *pTokenizer, ! 1289: const char *zText, sqlite_int64 iDocid){ ! 1290: sqlite3_tokenizer_cursor *pCursor; ! 1291: const char *pToken; ! 1292: int nTokenBytes; ! 1293: int iStartOffset, iEndOffset, iPosition; ! 1294: ! 1295: int rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor); ! 1296: if( rc!=SQLITE_OK ) return rc; ! 1297: ! 1298: pCursor->pTokenizer = pTokenizer; ! 1299: HashInit(terms, HASH_STRING, 1); ! 1300: while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor, ! 1301: &pToken, &nTokenBytes, ! 1302: &iStartOffset, &iEndOffset, ! 1303: &iPosition) ){ ! 1304: DocList *p; ! 1305: ! 1306: /* Positions can't be negative; we use -1 as a terminator internally. */ ! 1307: if( iPosition<0 ) { ! 1308: rc = SQLITE_ERROR; ! 1309: goto err; ! 1310: } ! 1311: ! 1312: p = HashFind(terms, pToken, nTokenBytes); ! 1313: if( p==NULL ){ ! 1314: p = docListNew(DL_POSITIONS_OFFSETS); ! 1315: docListAddDocid(p, iDocid); ! 1316: HashInsert(terms, pToken, nTokenBytes, p); ! 1317: } ! 1318: docListAddPosOffset(p, iPosition, iStartOffset, iEndOffset); ! 1319: } ! 1320: ! 1321: err: ! 1322: /* TODO(shess) Check return? Should this be able to cause errors at ! 1323: ** this point? Actually, same question about sqlite3_finalize(), ! 1324: ** though one could argue that failure there means that the data is ! 1325: ** not durable. *ponder* ! 1326: */ ! 1327: pTokenizer->pModule->xClose(pCursor); ! 1328: return rc; ! 1329: } ! 1330: /* Update the %_terms table to map the term [zTerm] to the given rowid. */ ! 1331: static int index_insert_term(fulltext_vtab *v, const char *zTerm, int nTerm, ! 1332: sqlite_int64 iDocid, DocList *p){ ! 1333: sqlite_int64 iFirst; ! 1334: sqlite_int64 iIndexRow; ! 1335: DocList doclist; ! 1336: ! 1337: int rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iFirst); ! 1338: if( rc==SQLITE_DONE ){ ! 1339: docListInit(&doclist, DL_POSITIONS_OFFSETS, 0, 0); ! 1340: if( docListUpdate(&doclist, iDocid, p) ){ ! 1341: rc = term_insert(v, zTerm, nTerm, iDocid, &doclist); ! 1342: docListDestroy(&doclist); ! 1343: return rc; ! 1344: } ! 1345: return SQLITE_OK; ! 1346: } ! 1347: if( rc!=SQLITE_ROW ) return SQLITE_ERROR; ! 1348: ! 1349: /* This word is in the index; add this document ID to its blob. */ ! 1350: ! 1351: rc = term_select(v, zTerm, nTerm, iFirst, &iIndexRow, &doclist); ! 1352: if( rc!=SQLITE_OK ) return rc; ! 1353: ! 1354: if( docListUpdate(&doclist, iDocid, p) ){ ! 1355: /* If the blob is too big, split it in half. */ ! 1356: if( doclist.nData>CHUNK_MAX ){ ! 1357: DocList half; ! 1358: if( docListSplit(&doclist, &half) ){ ! 1359: rc = term_insert(v, zTerm, nTerm, firstDocid(&half), &half); ! 1360: docListDestroy(&half); ! 1361: if( rc!=SQLITE_OK ) goto err; ! 1362: } ! 1363: } ! 1364: rc = term_update(v, iIndexRow, &doclist); ! 1365: } ! 1366: ! 1367: err: ! 1368: docListDestroy(&doclist); ! 1369: return rc; ! 1370: } ! 1371: ! 1372: /* Insert a row into the full-text index; set *piRowid to be the ID of the ! 1373: * new row. */ ! 1374: static int index_insert(fulltext_vtab *v, ! 1375: sqlite3_value *pRequestRowid, const char *zText, ! 1376: sqlite_int64 *piRowid){ ! 1377: Hash terms; /* maps term string -> PosList */ ! 1378: HashElem *e; ! 1379: ! 1380: int rc = content_insert(v, pRequestRowid, zText, -1); ! 1381: if( rc!=SQLITE_OK ) return rc; ! 1382: *piRowid = sqlite3_last_insert_rowid(v->db); ! 1383: ! 1384: if( !zText ) return SQLITE_OK; /* nothing to index */ ! 1385: ! 1386: rc = build_terms(&terms, v->pTokenizer, zText, *piRowid); ! 1387: if( rc!=SQLITE_OK ) return rc; ! 1388: ! 1389: for(e=HashFirst(&terms); e; e=HashNext(e)){ ! 1390: DocList *p = HashData(e); ! 1391: rc = index_insert_term(v, HashKey(e), HashKeysize(e), *piRowid, p); ! 1392: if( rc!=SQLITE_OK ) break; ! 1393: } ! 1394: ! 1395: for(e=HashFirst(&terms); e; e=HashNext(e)){ ! 1396: DocList *p = HashData(e); ! 1397: docListDelete(p); ! 1398: } ! 1399: HashClear(&terms); ! 1400: return rc; ! 1401: } ! 1402: ! 1403: static int index_delete_term(fulltext_vtab *v, const char *zTerm, int nTerm, ! 1404: sqlite_int64 iDocid){ ! 1405: sqlite_int64 iFirst; ! 1406: sqlite_int64 iIndexRow; ! 1407: DocList doclist; ! 1408: ! 1409: int rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iFirst); ! 1410: if( rc!=SQLITE_ROW ) return SQLITE_ERROR; ! 1411: ! 1412: rc = term_select(v, zTerm, nTerm, iFirst, &iIndexRow, &doclist); ! 1413: if( rc!=SQLITE_OK ) return rc; ! 1414: ! 1415: if( docListUpdate(&doclist, iDocid, NULL) ){ ! 1416: if( doclist.nData>0 ){ ! 1417: rc = term_update(v, iIndexRow, &doclist); ! 1418: } else { /* empty posting list */ ! 1419: rc = term_delete(v, iIndexRow); ! 1420: } ! 1421: } ! 1422: docListDestroy(&doclist); ! 1423: return rc; ! 1424: } ! 1425: ! 1426: /* Delete a row from the full-text index. */ ! 1427: static int index_delete(fulltext_vtab *v, sqlite_int64 iRow){ ! 1428: char *zText; ! 1429: Hash terms; ! 1430: HashElem *e; ! 1431: ! 1432: int rc = content_select(v, iRow, &zText); ! 1433: if( rc!=SQLITE_OK ) return rc; ! 1434: ! 1435: rc = build_terms(&terms, v->pTokenizer, zText, iRow); ! 1436: free(zText); ! 1437: if( rc!=SQLITE_OK ) return rc; ! 1438: ! 1439: for(e=HashFirst(&terms); e; e=HashNext(e)){ ! 1440: rc = index_delete_term(v, HashKey(e), HashKeysize(e), iRow); ! 1441: if( rc!=SQLITE_OK ) break; ! 1442: } ! 1443: for(e=HashFirst(&terms); e; e=HashNext(e)){ ! 1444: DocList *p = HashData(e); ! 1445: docListDelete(p); ! 1446: } ! 1447: HashClear(&terms); ! 1448: ! 1449: return content_delete(v, iRow); ! 1450: } ! 1451: ! 1452: static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg, ! 1453: sqlite_int64 *pRowid){ ! 1454: fulltext_vtab *v = (fulltext_vtab *) pVtab; ! 1455: ! 1456: if( nArg<2 ){ ! 1457: return index_delete(v, sqlite3_value_int64(ppArg[0])); ! 1458: } ! 1459: ! 1460: if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){ ! 1461: return SQLITE_ERROR; /* an update; not yet supported */ ! 1462: } ! 1463: ! 1464: assert( nArg==3 ); /* ppArg[1] = rowid, ppArg[2] = content */ ! 1465: return index_insert(v, ppArg[1], ! 1466: (const char *)sqlite3_value_text(ppArg[2]), pRowid); ! 1467: } ! 1468: ! 1469: static sqlite3_module fulltextModule = { ! 1470: 0, ! 1471: fulltextCreate, ! 1472: fulltextConnect, ! 1473: fulltextBestIndex, ! 1474: fulltextDisconnect, ! 1475: fulltextDestroy, ! 1476: fulltextOpen, ! 1477: fulltextClose, ! 1478: fulltextFilter, ! 1479: fulltextNext, ! 1480: fulltextEof, ! 1481: fulltextColumn, ! 1482: fulltextRowid, ! 1483: fulltextUpdate ! 1484: }; ! 1485: ! 1486: int fulltext_init(sqlite3 *db){ ! 1487: return sqlite3_create_module(db, "fulltext", &fulltextModule, 0); ! 1488: } ! 1489: ! 1490: #if !SQLITE_CORE ! 1491: int sqlite3_extension_init(sqlite3 *db, char **pzErrMsg, ! 1492: const sqlite3_api_routines *pApi){ ! 1493: SQLITE_EXTENSION_INIT2(pApi) ! 1494: return fulltext_init(db); ! 1495: } ! 1496: #endif