Annotation of embedaddon/sqlite3/test/fts3rnd.test, revision 1.1.1.1
1.1 misho 1: # 2009 December 03
2: #
3: # May you do good and not evil.
4: # May you find forgiveness for yourself and forgive others.
5: # May you share freely, never taking more than you give.
6: #
7: #***********************************************************************
8: #
9: # Brute force (random data) tests for FTS3.
10: #
11:
12: #-------------------------------------------------------------------------
13: #
14: # The FTS3 tests implemented in this file focus on testing that FTS3
15: # returns the correct set of documents for various types of full-text
16: # query. This is done using pseudo-randomly generated data and queries.
17: # The expected result of each query is calculated using Tcl code.
18: #
19: # 1. The database is initialized to contain a single table with three
20: # columns. 100 rows are inserted into the table. Each of the three
21: # values in each row is a document consisting of between 0 and 100
22: # terms. Terms are selected from a vocabulary of $G(nVocab) terms.
23: #
24: # 2. The following is performed 100 times:
25: #
26: # a. A row is inserted into the database. The row contents are
27: # generated as in step 1. The docid is a pseudo-randomly selected
28: # value between 0 and 1000000.
29: #
30: # b. A psuedo-randomly selected row is updated. One of its columns is
31: # set to contain a new document generated in the same way as the
32: # documents in step 1.
33: #
34: # c. A psuedo-randomly selected row is deleted.
35: #
36: # d. For each of several types of fts3 queries, 10 SELECT queries
37: # of the form:
38: #
39: # SELECT docid FROM <tbl> WHERE <tbl> MATCH '<query>'
40: #
41: # are evaluated. The results are compared to those calculated by
42: # Tcl code in this file. The patterns used for the different query
43: # types are:
44: #
45: # 1. query = <term>
46: # 2. query = <prefix>
47: # 3. query = "<term> <term>"
48: # 4. query = "<term> <term> <term>"
49: # 5. query = "<prefix> <prefix> <prefix>"
50: # 6. query = <term> NEAR <term>
51: # 7. query = <term> NEAR/11 <term> NEAR/11 <term>
52: # 8. query = <term> OR <term>
53: # 9. query = <term> NOT <term>
54: # 10. query = <term> AND <term>
55: # 11. query = <term> NEAR <term> OR <term> NEAR <term>
56: # 12. query = <term> NEAR <term> NOT <term> NEAR <term>
57: # 13. query = <term> NEAR <term> AND <term> NEAR <term>
58: #
59: # where <term> is a term psuedo-randomly selected from the vocabulary
60: # and prefix is the first 2 characters of such a term followed by
61: # a "*" character.
62: #
63: # Every second iteration, steps (a) through (d) above are performed
64: # within a single transaction. This forces the queries in (d) to
65: # read data from both the database and the in-memory hash table
66: # that caches the full-text index entries created by steps (a), (b)
67: # and (c) until the transaction is committed.
68: #
69: # The procedure above is run 5 times, using advisory fts3 node sizes of 50,
70: # 500, 1000 and 2000 bytes.
71: #
72: # After the test using an advisory node-size of 50, an OOM test is run using
73: # the database. This test is similar to step (d) above, except that it tests
74: # the effects of transient and persistent OOM conditions encountered while
75: # executing each query.
76: #
77:
78: set testdir [file dirname $argv0]
79: source $testdir/tester.tcl
80:
81: # If this build does not include FTS3, skip the tests in this file.
82: #
83: ifcapable !fts3 { finish_test ; return }
84: source $testdir/fts3_common.tcl
85: source $testdir/malloc_common.tcl
86:
87: set G(nVocab) 100
88:
89: set nVocab 100
90: set lVocab [list]
91:
92: expr srand(0)
93:
94: # Generate a vocabulary of nVocab words. Each word is 3 characters long.
95: #
96: set lChar {a b c d e f g h i j k l m n o p q r s t u v w x y z}
97: for {set i 0} {$i < $nVocab} {incr i} {
98: set len [expr int(rand()*3)+2]
99: set word [lindex $lChar [expr int(rand()*26)]]
100: append word [lindex $lChar [expr int(rand()*26)]]
101: if {$len>2} { append word [lindex $lChar [expr int(rand()*26)]] }
102: if {$len>3} { append word [lindex $lChar [expr int(rand()*26)]] }
103: lappend lVocab $word
104: }
105:
106: proc random_term {} {
107: lindex $::lVocab [expr {int(rand()*$::nVocab)}]
108: }
109:
110: # Return a document consisting of $nWord arbitrarily selected terms
111: # from the $::lVocab list.
112: #
113: proc generate_doc {nWord} {
114: set doc [list]
115: for {set i 0} {$i < $nWord} {incr i} {
116: lappend doc [random_term]
117: }
118: return $doc
119: }
120:
121:
122:
123: # Primitives to update the table.
124: #
125: unset -nocomplain t1
126: proc insert_row {rowid} {
127: set a [generate_doc [expr int((rand()*100))]]
128: set b [generate_doc [expr int((rand()*100))]]
129: set c [generate_doc [expr int((rand()*100))]]
130: execsql { INSERT INTO t1(docid, a, b, c) VALUES($rowid, $a, $b, $c) }
131: set ::t1($rowid) [list $a $b $c]
132: }
133: proc delete_row {rowid} {
134: execsql { DELETE FROM t1 WHERE rowid = $rowid }
135: catch {unset ::t1($rowid)}
136: }
137: proc update_row {rowid} {
138: set cols {a b c}
139: set iCol [expr int(rand()*3)]
140: set doc [generate_doc [expr int((rand()*100))]]
141: lset ::t1($rowid) $iCol $doc
142: execsql "UPDATE t1 SET [lindex $cols $iCol] = \$doc WHERE rowid = \$rowid"
143: }
144:
145: proc simple_phrase {zPrefix} {
146: set ret [list]
147:
148: set reg [string map {* {[^ ]*}} $zPrefix]
149: set reg " $reg "
150:
151: foreach key [lsort -integer [array names ::t1]] {
152: set value $::t1($key)
153: set cnt [list]
154: foreach col $value {
155: if {[regexp $reg " $col "]} { lappend ret $key ; break }
156: }
157: }
158:
159: #lsort -uniq -integer $ret
160: set ret
161: }
162:
163: # This [proc] is used to test the FTS3 matchinfo() function.
164: #
165: proc simple_token_matchinfo {zToken bDesc} {
166:
167: set nDoc(0) 0
168: set nDoc(1) 0
169: set nDoc(2) 0
170: set nHit(0) 0
171: set nHit(1) 0
172: set nHit(2) 0
173:
174: set dir -inc
175: if {$bDesc} { set dir -dec }
176:
177: foreach key [array names ::t1] {
178: set value $::t1($key)
179: set a($key) [list]
180: foreach i {0 1 2} col $value {
181: set hit [llength [lsearch -all $col $zToken]]
182: lappend a($key) $hit
183: incr nHit($i) $hit
184: if {$hit>0} { incr nDoc($i) }
185: }
186: }
187:
188: set ret [list]
189: foreach docid [lsort -integer $dir [array names a]] {
190: if { [lindex [lsort -integer $a($docid)] end] } {
191: set matchinfo [list 1 3]
192: foreach i {0 1 2} hit $a($docid) {
193: lappend matchinfo $hit $nHit($i) $nDoc($i)
194: }
195: lappend ret $docid $matchinfo
196: }
197: }
198:
199: set ret
200: }
201:
202: proc simple_near {termlist nNear} {
203: set ret [list]
204:
205: foreach {key value} [array get ::t1] {
206: foreach v $value {
207:
208: set l [lsearch -exact -all $v [lindex $termlist 0]]
209: foreach T [lrange $termlist 1 end] {
210: set l2 [list]
211: foreach i $l {
212: set iStart [expr $i - $nNear - 1]
213: set iEnd [expr $i + $nNear + 1]
214: if {$iStart < 0} {set iStart 0}
215: foreach i2 [lsearch -exact -all [lrange $v $iStart $iEnd] $T] {
216: incr i2 $iStart
217: if {$i2 != $i} { lappend l2 $i2 }
218: }
219: }
220: set l [lsort -uniq -integer $l2]
221: }
222:
223: if {[llength $l]} {
224: #puts "MATCH($key): $v"
225: lappend ret $key
226: }
227: }
228: }
229:
230: lsort -unique -integer $ret
231: }
232:
233: # The following three procs:
234: #
235: # setup_not A B
236: # setup_or A B
237: # setup_and A B
238: #
239: # each take two arguments. Both arguments must be lists of integer values
240: # sorted by value. The return value is the list produced by evaluating
241: # the equivalent of "A op B", where op is the FTS3 operator NOT, OR or
242: # AND.
243: #
244: proc setop_not {A B} {
245: foreach b $B { set n($b) {} }
246: set ret [list]
247: foreach a $A { if {![info exists n($a)]} {lappend ret $a} }
248: return $ret
249: }
250: proc setop_or {A B} {
251: lsort -integer -uniq [concat $A $B]
252: }
253: proc setop_and {A B} {
254: foreach b $B { set n($b) {} }
255: set ret [list]
256: foreach a $A { if {[info exists n($a)]} {lappend ret $a} }
257: return $ret
258: }
259:
260: proc mit {blob} {
261: set scan(littleEndian) i*
262: set scan(bigEndian) I*
263: binary scan $blob $scan($::tcl_platform(byteOrder)) r
264: return $r
265: }
266: db func mit mit
267: set sqlite_fts3_enable_parentheses 1
268:
269: proc do_orderbydocid_test {tn sql res} {
270: uplevel [list do_select_test $tn.asc "$sql ORDER BY docid ASC" $res]
271: uplevel [list do_select_test $tn.desc "$sql ORDER BY docid DESC" \
272: [lsort -int -dec $res]
273: ]
274: }
275:
276: set NUM_TRIALS 100
277:
278: foreach {nodesize order} {
279: 50 DESC
280: 50 ASC
281: 500 ASC
282: 1000 DESC
283: 2000 ASC
284: } {
285: catch { array unset ::t1 }
286: set testname "$nodesize/$order"
287:
288: # Create the FTS3 table. Populate it (and the Tcl array) with 100 rows.
289: #
290: db transaction {
291: catchsql { DROP TABLE t1 }
292: execsql "CREATE VIRTUAL TABLE t1 USING fts4(a, b, c, order=$order)"
293: execsql "INSERT INTO t1(t1) VALUES('nodesize=$nodesize')"
294: for {set i 0} {$i < 100} {incr i} { insert_row $i }
295: }
296:
297: for {set iTest 1} {$iTest <= $NUM_TRIALS} {incr iTest} {
298: catchsql COMMIT
299:
300: set DO_MALLOC_TEST 0
301: set nRep 10
302: if {$iTest==100 && $nodesize==50} {
303: set DO_MALLOC_TEST 1
304: set nRep 2
305: }
306:
307: set ::testprefix fts3rnd-1.$testname.$iTest
308:
309: # Delete one row, update one row and insert one row.
310: #
311: set rows [array names ::t1]
312: set nRow [llength $rows]
313: set iUpdate [lindex $rows [expr {int(rand()*$nRow)}]]
314: set iDelete $iUpdate
315: while {$iDelete == $iUpdate} {
316: set iDelete [lindex $rows [expr {int(rand()*$nRow)}]]
317: }
318: set iInsert $iUpdate
319: while {[info exists ::t1($iInsert)]} {
320: set iInsert [expr {int(rand()*1000000)}]
321: }
322: execsql BEGIN
323: insert_row $iInsert
324: update_row $iUpdate
325: delete_row $iDelete
326: if {0==($iTest%2)} { execsql COMMIT }
327:
328: if {0==($iTest%2)} {
329: #do_test 0 { fts3_integrity_check t1 } ok
330: }
331:
332: # Pick 10 terms from the vocabulary. Check that the results of querying
333: # the database for the set of documents containing each of these terms
334: # is the same as the result obtained by scanning the contents of the Tcl
335: # array for each term.
336: #
337: for {set i 0} {$i < 10} {incr i} {
338: set term [random_term]
339: do_select_test 1.$i.asc {
340: SELECT docid, mit(matchinfo(t1)) FROM t1 WHERE t1 MATCH $term
341: ORDER BY docid ASC
342: } [simple_token_matchinfo $term 0]
343: do_select_test 1.$i.desc {
344: SELECT docid, mit(matchinfo(t1)) FROM t1 WHERE t1 MATCH $term
345: ORDER BY docid DESC
346: } [simple_token_matchinfo $term 1]
347: }
348:
349: # This time, use the first two characters of each term as a term prefix
350: # to query for. Test that querying the Tcl array produces the same results
351: # as querying the FTS3 table for the prefix.
352: #
353: for {set i 0} {$i < $nRep} {incr i} {
354: set prefix [string range [random_term] 0 end-1]
355: set match "${prefix}*"
356: do_orderbydocid_test 2.$i {
357: SELECT docid FROM t1 WHERE t1 MATCH $match
358: } [simple_phrase $match]
359: }
360:
361: # Similar to the above, except for phrase queries.
362: #
363: for {set i 0} {$i < $nRep} {incr i} {
364: set term [list [random_term] [random_term]]
365: set match "\"$term\""
366: do_orderbydocid_test 3.$i {
367: SELECT docid FROM t1 WHERE t1 MATCH $match
368: } [simple_phrase $term]
369: }
370:
371: # Three word phrases.
372: #
373: for {set i 0} {$i < $nRep} {incr i} {
374: set term [list [random_term] [random_term] [random_term]]
375: set match "\"$term\""
376: do_orderbydocid_test 4.$i {
377: SELECT docid FROM t1 WHERE t1 MATCH $match
378: } [simple_phrase $term]
379: }
380:
381: # Three word phrases made up of term-prefixes.
382: #
383: for {set i 0} {$i < $nRep} {incr i} {
384: set query "[string range [random_term] 0 end-1]* "
385: append query "[string range [random_term] 0 end-1]* "
386: append query "[string range [random_term] 0 end-1]*"
387:
388: set match "\"$query\""
389: do_orderbydocid_test 5.$i {
390: SELECT docid FROM t1 WHERE t1 MATCH $match
391: } [simple_phrase $query]
392: }
393:
394: # A NEAR query with terms as the arguments:
395: #
396: # ... MATCH '$term1 NEAR $term2' ...
397: #
398: for {set i 0} {$i < $nRep} {incr i} {
399: set terms [list [random_term] [random_term]]
400: set match [join $terms " NEAR "]
401: do_orderbydocid_test 6.$i {
402: SELECT docid FROM t1 WHERE t1 MATCH $match
403: } [simple_near $terms 10]
404: }
405:
406: # A 3-way NEAR query with terms as the arguments.
407: #
408: for {set i 0} {$i < $nRep} {incr i} {
409: set terms [list [random_term] [random_term] [random_term]]
410: set nNear 11
411: set match [join $terms " NEAR/$nNear "]
412: do_orderbydocid_test 7.$i {
413: SELECT docid FROM t1 WHERE t1 MATCH $match
414: } [simple_near $terms $nNear]
415: }
416:
417: # Set operations on simple term queries.
418: #
419: foreach {tn op proc} {
420: 8 OR setop_or
421: 9 NOT setop_not
422: 10 AND setop_and
423: } {
424: for {set i 0} {$i < $nRep} {incr i} {
425: set term1 [random_term]
426: set term2 [random_term]
427: set match "$term1 $op $term2"
428: do_orderbydocid_test $tn.$i {
429: SELECT docid FROM t1 WHERE t1 MATCH $match
430: } [$proc [simple_phrase $term1] [simple_phrase $term2]]
431: }
432: }
433:
434: # Set operations on NEAR queries.
435: #
436: foreach {tn op proc} {
437: 11 OR setop_or
438: 12 NOT setop_not
439: 13 AND setop_and
440: } {
441: for {set i 0} {$i < $nRep} {incr i} {
442: set term1 [random_term]
443: set term2 [random_term]
444: set term3 [random_term]
445: set term4 [random_term]
446: set match "$term1 NEAR $term2 $op $term3 NEAR $term4"
447: do_orderbydocid_test $tn.$i {
448: SELECT docid FROM t1 WHERE t1 MATCH $match
449: } [$proc \
450: [simple_near [list $term1 $term2] 10] \
451: [simple_near [list $term3 $term4] 10]
452: ]
453: }
454: }
455:
456: catchsql COMMIT
457: }
458: }
459:
460: finish_test
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>