Annotation of embedaddon/sqlite3/ext/fts2/README.tokenizers, revision 1.1.1.1

1.1       misho       1: 
                      2: 1. FTS2 Tokenizers
                      3: 
                      4:   When creating a new full-text table, FTS2 allows the user to select
                      5:   the text tokenizer implementation to be used when indexing text
                      6:   by specifying a "tokenizer" clause as part of the CREATE VIRTUAL TABLE
                      7:   statement:
                      8: 
                      9:     CREATE VIRTUAL TABLE <table-name> USING fts2(
                     10:       <columns ...> [, tokenizer <tokenizer-name> [<tokenizer-args>]]
                     11:     );
                     12: 
                     13:   The built-in tokenizers (valid values to pass as <tokenizer-name>) are
                     14:   "simple" and "porter".
                     15: 
                     16:   <tokenizer-args> should consist of zero or more white-space separated
                     17:   arguments to pass to the selected tokenizer implementation. The 
                     18:   interpretation of the arguments, if any, depends on the individual 
                     19:   tokenizer.
                     20: 
                     21: 2. Custom Tokenizers
                     22: 
                     23:   FTS2 allows users to provide custom tokenizer implementations. The 
                     24:   interface used to create a new tokenizer is defined and described in 
                     25:   the fts2_tokenizer.h source file.
                     26: 
                     27:   Registering a new FTS2 tokenizer is similar to registering a new 
                     28:   virtual table module with SQLite. The user passes a pointer to a
                     29:   structure containing pointers to various callback functions that
                     30:   make up the implementation of the new tokenizer type. For tokenizers,
                     31:   the structure (defined in fts2_tokenizer.h) is called
                     32:   "sqlite3_tokenizer_module".
                     33: 
                     34:   FTS2 does not expose a C function that users call to register new
                     35:   tokenizer types with a database handle. Instead, the pointer must
                     36:   be encoded as an SQL blob value and passed to FTS2 through the SQL
                     37:   engine by evaluating a special scalar function, "fts2_tokenizer()".
                     38:   The fts2_tokenizer() function may be called with one or two arguments,
                     39:   as follows:
                     40: 
                     41:     SELECT fts2_tokenizer(<tokenizer-name>);
                     42:     SELECT fts2_tokenizer(<tokenizer-name>, <sqlite3_tokenizer_module ptr>);
                     43:   
                     44:   Where <tokenizer-name> is a string identifying the tokenizer and
                     45:   <sqlite3_tokenizer_module ptr> is a pointer to an sqlite3_tokenizer_module
                     46:   structure encoded as an SQL blob. If the second argument is present,
                     47:   it is registered as tokenizer <tokenizer-name> and a copy of it
                     48:   is returned. If only one argument is passed, a pointer to the tokenizer
                     49:   implementation currently registered as <tokenizer-name> is returned,
                     50:   encoded as a blob. Or, if no such tokenizer exists, an SQL exception
                     51:   (error) is raised.
                     52: 
                     53:   SECURITY: If the fts2 extension is used in an environment where potentially
                     54:     malicious users may execute arbitrary SQL (e.g. gears), they should be
                     55:     prevented from invoking the fts2_tokenizer() function, possibly using the
                     56:     authorizer callback (see sqlite3_set_authorizer()).
                     57: 
                     58:   See "Sample code" below for an example of calling the fts2_tokenizer()
                     59:   function from C code.
                     60: 
                     61: 3. ICU Library Tokenizers
                     62: 
                     63:   If this extension is compiled with the SQLITE_ENABLE_ICU pre-processor 
                     64:   symbol defined, then there exists a built-in tokenizer named "icu" 
                     65:   implemented using the ICU library. The first argument passed to the
                     66:   xCreate() method (see fts2_tokenizer.h) of this tokenizer may be
                     67:   an ICU locale identifier, such as "tr_TR" for Turkish as used
                     68:   in Turkey or "en_AU" for English as used in Australia. For example:
                     69: 
                     70:     "CREATE VIRTUAL TABLE thai_text USING fts2(text, tokenizer icu th_TH)"
                     71: 
                     72:   The ICU tokenizer implementation is very simple. It splits the input
                     73:   text according to the ICU rules for finding word boundaries and discards
                     74:   any tokens that consist entirely of white-space. This may be suitable
                     75:   for some applications in some locales, but not all. If more complex
                     76:   processing is required, for example to implement stemming or 
                     77:   discard punctuation, this can be done by creating a tokenizer 
                     78:   implementation that uses the ICU tokenizer as part of its implementation.
                     79: 
                     80:   When using the ICU tokenizer this way, it is safe to overwrite the
                     81:   contents of the strings returned by the xNext() method (see
                     82:   fts2_tokenizer.h).
                     83: 
                     84: 4. Sample code.
                     85: 
                     86:   The following two code samples illustrate the way C code should invoke
                     87:   the fts2_tokenizer() scalar function:
                     88: 
                     89:       int registerTokenizer(
                     90:         sqlite3 *db, 
                     91:         char *zName, 
                     92:         const sqlite3_tokenizer_module *p
                     93:       ){
                     94:         int rc;
                     95:         sqlite3_stmt *pStmt;
                     96:         const char zSql[] = "SELECT fts2_tokenizer(?, ?)";
                     97:       
                     98:         rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
                     99:         if( rc!=SQLITE_OK ){
                    100:           return rc;
                    101:         }
                    102:       
                    103:         sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
                    104:         sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
                    105:         sqlite3_step(pStmt);
                    106:       
                    107:         return sqlite3_finalize(pStmt);
                    108:       }
                    109:       
                    110:       int queryTokenizer(
                    111:         sqlite3 *db, 
                    112:         char *zName,  
                    113:         const sqlite3_tokenizer_module **pp
                    114:       ){
                    115:         int rc;
                    116:         sqlite3_stmt *pStmt;
                    117:         const char zSql[] = "SELECT fts2_tokenizer(?)";
                    118:       
                    119:         *pp = 0;
                    120:         rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
                    121:         if( rc!=SQLITE_OK ){
                    122:           return rc;
                    123:         }
                    124:       
                    125:         sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
                    126:         if( SQLITE_ROW==sqlite3_step(pStmt) ){
                    127:           if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
                    128:             memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
                    129:           }
                    130:         }
                    131:       
                    132:         return sqlite3_finalize(pStmt);
                    133:       }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>