Annotation of embedaddon/sqlite3/test/fts3rnd.test, revision 1.1.1.1

1.1       misho       1: # 2009 December 03
                      2: #
                      3: #    May you do good and not evil.
                      4: #    May you find forgiveness for yourself and forgive others.
                      5: #    May you share freely, never taking more than you give.
                      6: #
                      7: #***********************************************************************
                      8: #
                      9: # Brute force (random data) tests for FTS3.
                     10: #
                     11: 
                     12: #-------------------------------------------------------------------------
                     13: #
                     14: # The FTS3 tests implemented in this file focus on testing that FTS3
                     15: # returns the correct set of documents for various types of full-text
                     16: # query. This is done using pseudo-randomly generated data and queries.
                     17: # The expected result of each query is calculated using Tcl code.
                     18: #
                     19: #   1. The database is initialized to contain a single table with three
                     20: #      columns. 100 rows are inserted into the table. Each of the three
                     21: #      values in each row is a document consisting of between 0 and 100
                     22: #      terms. Terms are selected from a vocabulary of $G(nVocab) terms.
                     23: #
                     24: #   2. The following is performed 100 times:
                     25: #
                     26: #      a. A row is inserted into the database. The row contents are 
                     27: #         generated as in step 1. The docid is a pseudo-randomly selected
                     28: #         value between 0 and 1000000.
                     29: # 
                     30: #      b. A psuedo-randomly selected row is updated. One of its columns is
                     31: #         set to contain a new document generated in the same way as the
                     32: #         documents in step 1.
                     33: # 
                     34: #      c. A psuedo-randomly selected row is deleted.
                     35: # 
                     36: #      d. For each of several types of fts3 queries, 10 SELECT queries
                     37: #         of the form:
                     38: # 
                     39: #           SELECT docid FROM <tbl> WHERE <tbl> MATCH '<query>'
                     40: # 
                     41: #         are evaluated. The results are compared to those calculated by
                     42: #         Tcl code in this file. The patterns used for the different query
                     43: #         types are:
                     44: # 
                     45: #           1.  query = <term>
                     46: #           2.  query = <prefix>
                     47: #           3.  query = "<term> <term>"
                     48: #           4.  query = "<term> <term> <term>"
                     49: #           5.  query = "<prefix> <prefix> <prefix>"
                     50: #           6.  query = <term> NEAR <term>
                     51: #           7.  query = <term> NEAR/11 <term> NEAR/11 <term>
                     52: #           8.  query = <term> OR <term>
                     53: #           9.  query = <term> NOT <term>
                     54: #           10. query = <term> AND <term>
                     55: #           11. query = <term> NEAR <term> OR <term> NEAR <term>
                     56: #           12. query = <term> NEAR <term> NOT <term> NEAR <term>
                     57: #           13. query = <term> NEAR <term> AND <term> NEAR <term>
                     58: # 
                     59: #         where <term> is a term psuedo-randomly selected from the vocabulary
                     60: #         and prefix is the first 2 characters of such a term followed by
                     61: #         a "*" character.
                     62: #     
                     63: #      Every second iteration, steps (a) through (d) above are performed
                     64: #      within a single transaction. This forces the queries in (d) to
                     65: #      read data from both the database and the in-memory hash table
                     66: #      that caches the full-text index entries created by steps (a), (b)
                     67: #      and (c) until the transaction is committed.
                     68: #
                     69: # The procedure above is run 5 times, using advisory fts3 node sizes of 50,
                     70: # 500, 1000 and 2000 bytes.
                     71: #
                     72: # After the test using an advisory node-size of 50, an OOM test is run using
                     73: # the database. This test is similar to step (d) above, except that it tests
                     74: # the effects of transient and persistent OOM conditions encountered while
                     75: # executing each query.
                     76: #
                     77: 
                     78: set testdir [file dirname $argv0]
                     79: source $testdir/tester.tcl
                     80: 
                     81: # If this build does not include FTS3, skip the tests in this file.
                     82: #
                     83: ifcapable !fts3 { finish_test ; return }
                     84: source $testdir/fts3_common.tcl
                     85: source $testdir/malloc_common.tcl
                     86: 
                     87: set G(nVocab) 100
                     88: 
                     89: set nVocab 100
                     90: set lVocab [list]
                     91: 
                     92: expr srand(0)
                     93: 
                     94: # Generate a vocabulary of nVocab words. Each word is 3 characters long.
                     95: #
                     96: set lChar {a b c d e f g h i j k l m n o p q r s t u v w x y z}
                     97: for {set i 0} {$i < $nVocab} {incr i} {
                     98:   set len [expr int(rand()*3)+2]
                     99:   set    word [lindex $lChar [expr int(rand()*26)]]
                    100:   append word [lindex $lChar [expr int(rand()*26)]]
                    101:   if {$len>2} { append word [lindex $lChar [expr int(rand()*26)]] }
                    102:   if {$len>3} { append word [lindex $lChar [expr int(rand()*26)]] }
                    103:   lappend lVocab $word
                    104: }
                    105: 
                    106: proc random_term {} {
                    107:   lindex $::lVocab [expr {int(rand()*$::nVocab)}]
                    108: }
                    109: 
                    110: # Return a document consisting of $nWord arbitrarily selected terms
                    111: # from the $::lVocab list.
                    112: #
                    113: proc generate_doc {nWord} {
                    114:   set doc [list]
                    115:   for {set i 0} {$i < $nWord} {incr i} {
                    116:     lappend doc [random_term]
                    117:   }
                    118:   return $doc
                    119: }
                    120: 
                    121: 
                    122: 
                    123: # Primitives to update the table.
                    124: #
                    125: unset -nocomplain t1
                    126: proc insert_row {rowid} {
                    127:   set a [generate_doc [expr int((rand()*100))]]
                    128:   set b [generate_doc [expr int((rand()*100))]]
                    129:   set c [generate_doc [expr int((rand()*100))]]
                    130:   execsql { INSERT INTO t1(docid, a, b, c) VALUES($rowid, $a, $b, $c) }
                    131:   set ::t1($rowid) [list $a $b $c]
                    132: }
                    133: proc delete_row {rowid} {
                    134:   execsql { DELETE FROM t1 WHERE rowid = $rowid }
                    135:   catch {unset ::t1($rowid)}
                    136: }
                    137: proc update_row {rowid} {
                    138:   set cols {a b c}
                    139:   set iCol [expr int(rand()*3)]
                    140:   set doc  [generate_doc [expr int((rand()*100))]]
                    141:   lset ::t1($rowid) $iCol $doc
                    142:   execsql "UPDATE t1 SET [lindex $cols $iCol] = \$doc WHERE rowid = \$rowid"
                    143: }
                    144: 
                    145: proc simple_phrase {zPrefix} {
                    146:   set ret [list]
                    147: 
                    148:   set reg [string map {* {[^ ]*}} $zPrefix]
                    149:   set reg " $reg "
                    150: 
                    151:   foreach key [lsort -integer [array names ::t1]] {
                    152:     set value $::t1($key)
                    153:     set cnt [list]
                    154:     foreach col $value {
                    155:       if {[regexp $reg " $col "]} { lappend ret $key ; break }
                    156:     }
                    157:   }
                    158: 
                    159:   #lsort -uniq -integer $ret
                    160:   set ret
                    161: }
                    162: 
                    163: # This [proc] is used to test the FTS3 matchinfo() function.
                    164: # 
                    165: proc simple_token_matchinfo {zToken bDesc} {
                    166: 
                    167:   set nDoc(0) 0
                    168:   set nDoc(1) 0
                    169:   set nDoc(2) 0
                    170:   set nHit(0) 0
                    171:   set nHit(1) 0
                    172:   set nHit(2) 0
                    173: 
                    174:   set dir -inc
                    175:   if {$bDesc} { set dir -dec }
                    176: 
                    177:   foreach key [array names ::t1] {
                    178:     set value $::t1($key)
                    179:     set a($key) [list]
                    180:     foreach i {0 1 2} col $value {
                    181:       set hit [llength [lsearch -all $col $zToken]]
                    182:       lappend a($key) $hit
                    183:       incr nHit($i) $hit
                    184:       if {$hit>0} { incr nDoc($i) }
                    185:     }
                    186:   }
                    187: 
                    188:   set ret [list]
                    189:   foreach docid [lsort -integer $dir [array names a]] {
                    190:     if { [lindex [lsort -integer $a($docid)] end] } {
                    191:       set matchinfo [list 1 3]
                    192:       foreach i {0 1 2} hit $a($docid) {
                    193:         lappend matchinfo $hit $nHit($i) $nDoc($i)
                    194:       }
                    195:       lappend ret $docid $matchinfo
                    196:     }
                    197:   }
                    198: 
                    199:   set ret
                    200: } 
                    201: 
                    202: proc simple_near {termlist nNear} {
                    203:   set ret [list]
                    204: 
                    205:   foreach {key value} [array get ::t1] {
                    206:     foreach v $value {
                    207: 
                    208:       set l [lsearch -exact -all $v [lindex $termlist 0]]
                    209:       foreach T [lrange $termlist 1 end] {
                    210:         set l2 [list]
                    211:         foreach i $l {
                    212:           set iStart [expr $i - $nNear - 1]
                    213:           set iEnd [expr $i + $nNear + 1]
                    214:           if {$iStart < 0} {set iStart 0}
                    215:           foreach i2 [lsearch -exact -all [lrange $v $iStart $iEnd] $T] {
                    216:             incr i2 $iStart
                    217:             if {$i2 != $i} { lappend l2 $i2 } 
                    218:           }
                    219:         }
                    220:         set l [lsort -uniq -integer $l2]
                    221:       }
                    222: 
                    223:       if {[llength $l]} {
                    224: #puts "MATCH($key): $v"
                    225:         lappend ret $key
                    226:       } 
                    227:     }
                    228:   }
                    229: 
                    230:   lsort -unique -integer $ret
                    231: }
                    232: 
                    233: # The following three procs:
                    234: # 
                    235: #   setup_not A B
                    236: #   setup_or  A B
                    237: #   setup_and A B
                    238: #
                    239: # each take two arguments. Both arguments must be lists of integer values
                    240: # sorted by value. The return value is the list produced by evaluating
                    241: # the equivalent of "A op B", where op is the FTS3 operator NOT, OR or
                    242: # AND.
                    243: #
                    244: proc setop_not {A B} {
                    245:   foreach b $B { set n($b) {} }
                    246:   set ret [list]
                    247:   foreach a $A { if {![info exists n($a)]} {lappend ret $a} }
                    248:   return $ret
                    249: }
                    250: proc setop_or {A B} {
                    251:   lsort -integer -uniq [concat $A $B]
                    252: }
                    253: proc setop_and {A B} {
                    254:   foreach b $B { set n($b) {} }
                    255:   set ret [list]
                    256:   foreach a $A { if {[info exists n($a)]} {lappend ret $a} }
                    257:   return $ret
                    258: }
                    259: 
                    260: proc mit {blob} {
                    261:   set scan(littleEndian) i*
                    262:   set scan(bigEndian) I*
                    263:   binary scan $blob $scan($::tcl_platform(byteOrder)) r
                    264:   return $r
                    265: }
                    266: db func mit mit
                    267: set sqlite_fts3_enable_parentheses 1
                    268: 
                    269: proc do_orderbydocid_test {tn sql res} {
                    270:   uplevel [list do_select_test $tn.asc "$sql ORDER BY docid ASC" $res]
                    271:   uplevel [list do_select_test $tn.desc "$sql ORDER BY docid DESC" \
                    272:     [lsort -int -dec $res]
                    273:   ]
                    274: }
                    275: 
                    276: set NUM_TRIALS 100
                    277: 
                    278: foreach {nodesize order} {
                    279:   50    DESC
                    280:   50    ASC
                    281:   500   ASC
                    282:   1000  DESC
                    283:   2000  ASC
                    284: } {
                    285:   catch { array unset ::t1 }
                    286:   set testname "$nodesize/$order"
                    287: 
                    288:   # Create the FTS3 table. Populate it (and the Tcl array) with 100 rows.
                    289:   #
                    290:   db transaction {
                    291:     catchsql { DROP TABLE t1 }
                    292:     execsql "CREATE VIRTUAL TABLE t1 USING fts4(a, b, c, order=$order)"
                    293:     execsql "INSERT INTO t1(t1) VALUES('nodesize=$nodesize')"
                    294:     for {set i 0} {$i < 100} {incr i} { insert_row $i }
                    295:   }
                    296:   
                    297:   for {set iTest 1} {$iTest <= $NUM_TRIALS} {incr iTest} {
                    298:     catchsql COMMIT
                    299: 
                    300:     set DO_MALLOC_TEST 0
                    301:     set nRep 10
                    302:     if {$iTest==100 && $nodesize==50} { 
                    303:       set DO_MALLOC_TEST 1 
                    304:       set nRep 2
                    305:     }
                    306: 
                    307:     set ::testprefix fts3rnd-1.$testname.$iTest
                    308:   
                    309:     # Delete one row, update one row and insert one row.
                    310:     #
                    311:     set rows [array names ::t1]
                    312:     set nRow [llength $rows]
                    313:     set iUpdate [lindex $rows [expr {int(rand()*$nRow)}]]
                    314:     set iDelete $iUpdate
                    315:     while {$iDelete == $iUpdate} {
                    316:       set iDelete [lindex $rows [expr {int(rand()*$nRow)}]]
                    317:     }
                    318:     set iInsert $iUpdate
                    319:     while {[info exists ::t1($iInsert)]} {
                    320:       set iInsert [expr {int(rand()*1000000)}]
                    321:     }
                    322:     execsql BEGIN
                    323:       insert_row $iInsert
                    324:       update_row $iUpdate
                    325:       delete_row $iDelete
                    326:     if {0==($iTest%2)} { execsql COMMIT }
                    327: 
                    328:     if {0==($iTest%2)} { 
                    329:       #do_test 0 { fts3_integrity_check t1 } ok 
                    330:     }
                    331: 
                    332:     # Pick 10 terms from the vocabulary. Check that the results of querying
                    333:     # the database for the set of documents containing each of these terms
                    334:     # is the same as the result obtained by scanning the contents of the Tcl 
                    335:     # array for each term.
                    336:     #
                    337:     for {set i 0} {$i < 10} {incr i} {
                    338:       set term [random_term]
                    339:       do_select_test 1.$i.asc {
                    340:         SELECT docid, mit(matchinfo(t1)) FROM t1 WHERE t1 MATCH $term
                    341:         ORDER BY docid ASC
                    342:       } [simple_token_matchinfo $term 0]
                    343:       do_select_test 1.$i.desc {
                    344:         SELECT docid, mit(matchinfo(t1)) FROM t1 WHERE t1 MATCH $term
                    345:         ORDER BY docid DESC
                    346:       } [simple_token_matchinfo $term 1]
                    347:     }
                    348: 
                    349:     # This time, use the first two characters of each term as a term prefix
                    350:     # to query for. Test that querying the Tcl array produces the same results
                    351:     # as querying the FTS3 table for the prefix.
                    352:     #
                    353:     for {set i 0} {$i < $nRep} {incr i} {
                    354:       set prefix [string range [random_term] 0 end-1]
                    355:       set match "${prefix}*"
                    356:       do_orderbydocid_test 2.$i {
                    357:         SELECT docid FROM t1 WHERE t1 MATCH $match
                    358:       } [simple_phrase $match]
                    359:     }
                    360: 
                    361:     # Similar to the above, except for phrase queries.
                    362:     #
                    363:     for {set i 0} {$i < $nRep} {incr i} {
                    364:       set term [list [random_term] [random_term]]
                    365:       set match "\"$term\""
                    366:       do_orderbydocid_test 3.$i {
                    367:         SELECT docid FROM t1 WHERE t1 MATCH $match
                    368:       } [simple_phrase $term]
                    369:     }
                    370: 
                    371:     # Three word phrases.
                    372:     #
                    373:     for {set i 0} {$i < $nRep} {incr i} {
                    374:       set term [list [random_term] [random_term] [random_term]]
                    375:       set match "\"$term\""
                    376:       do_orderbydocid_test 4.$i {
                    377:         SELECT docid FROM t1 WHERE t1 MATCH $match
                    378:       } [simple_phrase $term]
                    379:     }
                    380: 
                    381:     # Three word phrases made up of term-prefixes.
                    382:     #
                    383:     for {set i 0} {$i < $nRep} {incr i} {
                    384:       set    query "[string range [random_term] 0 end-1]* "
                    385:       append query "[string range [random_term] 0 end-1]* "
                    386:       append query "[string range [random_term] 0 end-1]*"
                    387: 
                    388:       set match "\"$query\""
                    389:       do_orderbydocid_test 5.$i {
                    390:         SELECT docid FROM t1 WHERE t1 MATCH $match
                    391:       } [simple_phrase $query]
                    392:     }
                    393: 
                    394:     # A NEAR query with terms as the arguments:
                    395:     #
                    396:     #     ... MATCH '$term1 NEAR $term2' ...
                    397:     #
                    398:     for {set i 0} {$i < $nRep} {incr i} {
                    399:       set terms [list [random_term] [random_term]]
                    400:       set match [join $terms " NEAR "]
                    401:       do_orderbydocid_test 6.$i {
                    402:         SELECT docid FROM t1 WHERE t1 MATCH $match 
                    403:       } [simple_near $terms 10]
                    404:     }
                    405: 
                    406:     # A 3-way NEAR query with terms as the arguments.
                    407:     #
                    408:     for {set i 0} {$i < $nRep} {incr i} {
                    409:       set terms [list [random_term] [random_term] [random_term]]
                    410:       set nNear 11
                    411:       set match [join $terms " NEAR/$nNear "]
                    412:       do_orderbydocid_test 7.$i {
                    413:         SELECT docid FROM t1 WHERE t1 MATCH $match
                    414:       } [simple_near $terms $nNear]
                    415:     }
                    416:     
                    417:     # Set operations on simple term queries.
                    418:     #
                    419:     foreach {tn op proc} {
                    420:       8  OR  setop_or
                    421:       9  NOT setop_not
                    422:       10 AND setop_and
                    423:     } {
                    424:       for {set i 0} {$i < $nRep} {incr i} {
                    425:         set term1 [random_term]
                    426:         set term2 [random_term]
                    427:         set match "$term1 $op $term2"
                    428:         do_orderbydocid_test $tn.$i {
                    429:           SELECT docid FROM t1 WHERE t1 MATCH $match
                    430:         } [$proc [simple_phrase $term1] [simple_phrase $term2]]
                    431:       }
                    432:     }
                    433:  
                    434:     # Set operations on NEAR queries.
                    435:     #
                    436:     foreach {tn op proc} {
                    437:       11 OR  setop_or
                    438:       12 NOT setop_not
                    439:       13 AND setop_and
                    440:     } {
                    441:       for {set i 0} {$i < $nRep} {incr i} {
                    442:         set term1 [random_term]
                    443:         set term2 [random_term]
                    444:         set term3 [random_term]
                    445:         set term4 [random_term]
                    446:         set match "$term1 NEAR $term2 $op $term3 NEAR $term4"
                    447:         do_orderbydocid_test $tn.$i {
                    448:           SELECT docid FROM t1 WHERE t1 MATCH $match
                    449:         } [$proc                                  \
                    450:             [simple_near [list $term1 $term2] 10] \
                    451:             [simple_near [list $term3 $term4] 10]
                    452:           ]
                    453:       }
                    454:     }
                    455: 
                    456:     catchsql COMMIT
                    457:   }
                    458: }
                    459: 
                    460: finish_test

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>