Annotation of embedaddon/sqlite3/ext/fts2/README.tokenizers, revision 1.1

1.1     ! misho       1: 
        !             2: 1. FTS2 Tokenizers
        !             3: 
        !             4:   When creating a new full-text table, FTS2 allows the user to select
        !             5:   the text tokenizer implementation to be used when indexing text
        !             6:   by specifying a "tokenizer" clause as part of the CREATE VIRTUAL TABLE
        !             7:   statement:
        !             8: 
        !             9:     CREATE VIRTUAL TABLE <table-name> USING fts2(
        !            10:       <columns ...> [, tokenizer <tokenizer-name> [<tokenizer-args>]]
        !            11:     );
        !            12: 
        !            13:   The built-in tokenizers (valid values to pass as <tokenizer-name>) are
        !            14:   "simple" and "porter".
        !            15: 
        !            16:   <tokenizer-args> should consist of zero or more white-space separated
        !            17:   arguments to pass to the selected tokenizer implementation. The 
        !            18:   interpretation of the arguments, if any, depends on the individual 
        !            19:   tokenizer.
        !            20: 
        !            21: 2. Custom Tokenizers
        !            22: 
        !            23:   FTS2 allows users to provide custom tokenizer implementations. The 
        !            24:   interface used to create a new tokenizer is defined and described in 
        !            25:   the fts2_tokenizer.h source file.
        !            26: 
        !            27:   Registering a new FTS2 tokenizer is similar to registering a new 
        !            28:   virtual table module with SQLite. The user passes a pointer to a
        !            29:   structure containing pointers to various callback functions that
        !            30:   make up the implementation of the new tokenizer type. For tokenizers,
        !            31:   the structure (defined in fts2_tokenizer.h) is called
        !            32:   "sqlite3_tokenizer_module".
        !            33: 
        !            34:   FTS2 does not expose a C function that users call to register new
        !            35:   tokenizer types with a database handle. Instead, the pointer must
        !            36:   be encoded as an SQL blob value and passed to FTS2 through the SQL
        !            37:   engine by evaluating a special scalar function, "fts2_tokenizer()".
        !            38:   The fts2_tokenizer() function may be called with one or two arguments,
        !            39:   as follows:
        !            40: 
        !            41:     SELECT fts2_tokenizer(<tokenizer-name>);
        !            42:     SELECT fts2_tokenizer(<tokenizer-name>, <sqlite3_tokenizer_module ptr>);
        !            43:   
        !            44:   Where <tokenizer-name> is a string identifying the tokenizer and
        !            45:   <sqlite3_tokenizer_module ptr> is a pointer to an sqlite3_tokenizer_module
        !            46:   structure encoded as an SQL blob. If the second argument is present,
        !            47:   it is registered as tokenizer <tokenizer-name> and a copy of it
        !            48:   is returned. If only one argument is passed, a pointer to the tokenizer
        !            49:   implementation currently registered as <tokenizer-name> is returned,
        !            50:   encoded as a blob. Or, if no such tokenizer exists, an SQL exception
        !            51:   (error) is raised.
        !            52: 
        !            53:   SECURITY: If the fts2 extension is used in an environment where potentially
        !            54:     malicious users may execute arbitrary SQL (e.g. Gears), they should be
        !            55:     prevented from invoking the fts2_tokenizer() function, possibly using the
        !            56:     authorizer callback (see sqlite3_set_authorizer()).
        !            57: 
        !            58:   See "Sample code" below for an example of calling the fts2_tokenizer()
        !            59:   function from C code.
        !            60: 
        !            61: 3. ICU Library Tokenizers
        !            62: 
        !            63:   If this extension is compiled with the SQLITE_ENABLE_ICU pre-processor 
        !            64:   symbol defined, then there exists a built-in tokenizer named "icu" 
        !            65:   implemented using the ICU library. The first argument passed to the
        !            66:   xCreate() method (see fts2_tokenizer.h) of this tokenizer may be
        !            67:   an ICU locale identifier, such as "tr_TR" for Turkish as used
        !            68:   in Turkey, or "en_AU" for English as used in Australia. For example:
        !            69: 
        !            70:     "CREATE VIRTUAL TABLE thai_text USING fts2(text, tokenizer icu th_TH)"
        !            71: 
        !            72:   The ICU tokenizer implementation is very simple. It splits the input
        !            73:   text according to the ICU rules for finding word boundaries and discards
        !            74:   any tokens that consist entirely of white-space. This may be suitable
        !            75:   for some applications in some locales, but not all. If more complex
        !            76:   processing is required, for example to implement stemming or 
        !            77:   discard punctuation, this can be done by creating a tokenizer 
        !            78:   implementation that uses the ICU tokenizer as part of its implementation.
        !            79: 
        !            80:   When using the ICU tokenizer this way, it is safe to overwrite the
        !            81:   contents of the strings returned by the xNext() method (see
        !            82:   fts2_tokenizer.h).
        !            83: 
        !            84: 4. Sample code.
        !            85: 
        !            86:   The following two code samples illustrate the way C code should invoke
        !            87:   the fts2_tokenizer() scalar function:
        !            88: 
        !            89:       int registerTokenizer(
        !            90:         sqlite3 *db, 
        !            91:         char *zName, 
        !            92:         const sqlite3_tokenizer_module *p
        !            93:       ){
        !            94:         int rc;
        !            95:         sqlite3_stmt *pStmt;
        !            96:         const char zSql[] = "SELECT fts2_tokenizer(?, ?)";
        !            97:       
        !            98:         rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
        !            99:         if( rc!=SQLITE_OK ){
        !           100:           return rc;
        !           101:         }
        !           102:       
        !           103:         sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
        !           104:         sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
        !           105:         sqlite3_step(pStmt);
        !           106:       
        !           107:         return sqlite3_finalize(pStmt);
        !           108:       }
        !           109:       
        !           110:       int queryTokenizer(
        !           111:         sqlite3 *db, 
        !           112:         char *zName,  
        !           113:         const sqlite3_tokenizer_module **pp
        !           114:       ){
        !           115:         int rc;
        !           116:         sqlite3_stmt *pStmt;
        !           117:         const char zSql[] = "SELECT fts2_tokenizer(?)";
        !           118:       
        !           119:         *pp = 0;
        !           120:         rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
        !           121:         if( rc!=SQLITE_OK ){
        !           122:           return rc;
        !           123:         }
        !           124:       
        !           125:         sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
        !           126:         if( SQLITE_ROW==sqlite3_step(pStmt) ){
        !           127:           if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
        !           128:             memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
        !           129:           }
        !           130:         }
        !           131:       
        !           132:         return sqlite3_finalize(pStmt);
        !           133:       }

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>