diff --git a/lib/DBD/SQLite.pm b/lib/DBD/SQLite.pm index 7382e57..2dadb5d 100644 --- a/lib/DBD/SQLite.pm +++ b/lib/DBD/SQLite.pm @@ -2083,20 +2083,25 @@ need to call the L method directly. =head1 FULLTEXT SEARCH -The FTS3 extension module within SQLite allows users to create special -tables with a built-in full-text index (hereafter "FTS3 tables"). The +The FTS extension module within SQLite allows users to create special +tables with a built-in full-text index (hereafter "FTS tables"). The full-text index allows the user to efficiently query the database for all rows that contain one or more instances of a specified word (hereafter a "token"), even if the table contains many large documents. +=head2 Short introduction to FTS -=head2 Short introduction to FTS3 +The first full-text search modules for SQLite were called C<FTS1> and C<FTS2> +and are now obsolete. The latest recommended module is C<FTS4>; however +the former module C<FTS3> is still supported. +Detailed documentation for both C<FTS4> and C<FTS3> can be found +at L<http://www.sqlite.org/fts3.html>, including explanations about the +differences between these two versions. -The detailed documentation for FTS3 can be found -at L. Here is a very short example : +Here is a very short example of using FTS : $dbh->do(<<"") or die DBI::errstr; - CREATE VIRTUAL TABLE fts_example USING fts3(content) + CREATE VIRTUAL TABLE fts_example USING fts4(content) my $sth = $dbh->prepare("INSERT INTO fts_example(content) VALUES (?))"); $sth->execute($_) foreach @docs_to_insert; @@ -2111,14 +2116,14 @@ The key points in this example are : =item * -The syntax for creating FTS3 tables is +The syntax for creating FTS tables is - CREATE VIRTUAL TABLE USING fts3() + CREATE VIRTUAL TABLE USING fts4() where C<< >> is a list of column names. Columns may be typed, but the type information is ignored. If no columns are specified, the default is a single column named C. 
-In addition, FTS3 tables have an implicit column called C +In addition, FTS tables have an implicit column called C (or also C) for numbering the stored documents. =item * @@ -2131,7 +2136,7 @@ use the same syntax as for regular SQLite tables. Full-text searches are specified with the C operator, and an operand which may be a single word, a word prefix ending with '*', a list of words, a "phrase query" in double quotes, or a boolean combination -of the above. +of the above. =item * @@ -2141,7 +2146,7 @@ document text, where the words pertaining to the query are highlighted. =back There are many more details to building and searching -FTS3 tables, so we strongly invite you to read +FTS tables, so we strongly invite you to read the full documentation at at L. B : @@ -2162,14 +2167,16 @@ in a separate distribution. =head2 Tokenizers The behaviour of full-text indexes strongly depends on how -documents are split into I; therefore FTS3 table +documents are split into I; therefore FTS table declarations can explicitly specify how to perform tokenization: - CREATE ... USING fts3(, tokenize=) + CREATE ... USING fts4(, tokenize=) where C<< >> is a sequence of space-separated -words that triggers a specific tokenizer, as explained below. +words that triggers a specific tokenizer. Tokenizers can +be SQLite builtins, written in C code, or Perl tokenizers. +Both kinds are explained below. =head3 SQLite builtin tokenizers @@ -2207,7 +2214,7 @@ ICU locale identifier as argument (such as "tr_TR" for Turkish as used in Turkey, or "en_AU" for English as used in Australia). For example: - CREATE VIRTUAL TABLE thai_text USING fts3(text, tokenize=icu th_TH) + CREATE VIRTUAL TABLE thai_text USING fts4(text, tokenize=icu th_TH) The ICU tokenizer implementation is very simple. 
It splits the input text according to the ICU rules for finding word boundaries and @@ -2224,14 +2231,14 @@ In addition to the builtin SQLite tokenizers, C implements a I tokenizer, that can hook to any tokenizing algorithm written in Perl. This is specified as follows : - CREATE ... USING fts3(, tokenize=perl '') + CREATE ... USING fts4(, tokenize=perl '') where C<< >> is a fully qualified Perl function name (i.e. prefixed by the name of the package in which that function is declared). So for example if the function is C in the main program, write - CREATE ... USING fts3(, tokenize=perl 'main::my_func') + CREATE ... USING fts4(, tokenize=perl 'main::my_func') That function should return a code reference that takes a string as single argument, and returns an iterator (another function), which @@ -2264,7 +2271,7 @@ because : =item * -the external, named sub is called whenever accessing a FTS3 table +the external, named sub is called whenever accessing an FTS table with that tokenizer =item * @@ -2281,32 +2288,33 @@ all terms within that string. =back Instead of writing tokenizers by hand, you can grab one of those -already implemented in the L module : +already implemented in the L<Search::Tokenizer> module. For example, +if you want to ignore differences between accented characters, you can +write : use Search::Tokenizer; $dbh->do(<<"") or die DBI::errstr; - CREATE ... USING fts3(, + CREATE ... USING fts4(, tokenize=perl 'Search::Tokenizer::unaccent') -or you can use L to build +Alternatively, you can use L to build your own tokenizer. =head2 Incomplete handling of utf8 characters -The current FTS3 implementation in SQLite is far from complete with +The current FTS implementation in SQLite is far from complete with respect to utf8 handling : in particular, variable-length characters are not treated correctly by the builtin functions C and C. 
-=head2 Database space for FTS3 +=head2 Database space for FTS -FTS3 stores a complete copy of the indexed documents, together with +By default, FTS stores a complete copy of the indexed documents, together with the fulltext index. On a large collection of documents, this can -consume quite a lot of disk space. If copies of documents are also -available as external resources (for example files on the filesystem), -that space can sometimes be spared --- see the tip in the -L. +consume quite a lot of disk space. However, FTS has some options +for compressing the documents, or even for not storing them at all +-- see L. =head1 R* TREE SUPPORT diff --git a/lib/DBD/SQLite/Cookbook.pod b/lib/DBD/SQLite/Cookbook.pod index 8b40c5b..f51083b 100644 --- a/lib/DBD/SQLite/Cookbook.pod +++ b/lib/DBD/SQLite/Cookbook.pod @@ -135,34 +135,37 @@ The function can then be used as: FROM results GROUP BY group_name; -=head1 FTS3 fulltext indexing +=head1 FTS fulltext indexing =head2 Sparing database disk space -As explained in L, each -FTS3 table C> is stored internally within three regular tables -C_content>, C_segments> and C_segdir>. The last two -tables contain the fulltext index. The first table C_content> -stores the complete documents being indexed ... but if copies of the -same documents are already stored somewhere else, or can be computed -from external resources (for example as HTML or MsWord files in the -filesystem), then this is quite a waste of space. SQLite itself only -needs the C_content> table for implementing the C and -C functions, which are not always usable anyway (in particular -when using utf8 characters greater than 255). +As explained in L, +several options are available to specify how SQLite should store +indexed documents. 
-So an alternative strategy is to use SQLite only for the fulltext -index and metadata, and to keep the full documents outside of SQLite : -to do so, after each insert or update in the FTS3 table, do an update -in the C_content> table, setting the content column(s) to -NULL. Of course your application will need an algorithm for finding +One strategy is to use SQLite only for the fulltext index and +metadata, and keep the full documents outside of SQLite; to do so, use +the C<content=""> option. For example, the following SQL creates +an FTS4 table with three columns - "a", "b", and "c": + + CREATE VIRTUAL TABLE t1 USING fts4(content="", a, b, c); + +Data can be inserted into such an FTS4 table using INSERT +statements. However, unlike ordinary FTS4 tables, the user must supply +an explicit integer docid value. For example: + + -- This statement is Ok: + INSERT INTO t1(docid, a, b, c) VALUES(1, 'a b c', 'd e f', 'g h i'); + + -- This statement causes an error, as no docid value has been provided: + INSERT INTO t1(a, b, c) VALUES('j k l', 'm n o', 'p q r'); + +Of course your application will need an algorithm for finding the external resource corresponding to any I stored within SQLite. Furthermore, SQLite C and C functions cannot be used, so if such functionality is needed, it has to be directly programmed within the Perl application. -In short, this strategy is really a hack, because FTS3 was not originally -programmed with that behaviour in mind; however it is workable -and has a strong impact on the size of the database file. + =head1 SUPPORT @@ -172,10 +175,18 @@ L =head1 TO DO -* Add more and varied cookbook recipes, until we have enough to +=over + +=item * + +Add more and varied cookbook recipes, until we have enough to turn them into a separate CPAN distribution. -* Create a series of tests scripts that validate the cookbook recipies. +=item * + +Create a series of test scripts that validate the cookbook recipes. 
+ +=back =head1 AUTHOR diff --git a/t/43_fts3.t b/t/43_fts3.t index 5a0e5f9..7ba0b84 100644 --- a/t/43_fts3.t +++ b/t/43_fts3.t @@ -1,5 +1,4 @@ #!/usr/bin/perl - use strict; BEGIN { $| = 1; @@ -37,7 +36,9 @@ BEGIN { } use Test::NoWarnings; -plan tests => 2 * (1 + @tests) + 1; +plan tests => 4 * @tests # each test with unicode y/n and with fts3/fts4 + + 2 # connect_ok with unicode y/n + + 1; # Test::NoWarnings BEGIN { # Sadly perl for windows (and probably sqlite, too) may hang @@ -78,36 +79,37 @@ for my $use_unicode (0, 1) { # connect my $dbh = connect_ok( RaiseError => 1, sqlite_unicode => $use_unicode ); - # create fts3 table - $dbh->do(<<"") or die DBI::errstr; - CREATE VIRTUAL TABLE try_fts3 - USING fts3(content, tokenize=perl 'main::locale_tokenizer') + for my $fts (qw/fts3 fts4/) { + # create fts table + $dbh->do(<<"") or die DBI::errstr; + CREATE VIRTUAL TABLE try_$fts + USING $fts(content, tokenize=perl 'main::locale_tokenizer') - # populate it - my $insert_sth = $dbh->prepare(<<"") or die DBI::errstr; - INSERT INTO try_fts3(content) VALUES(?) + # populate it + my $insert_sth = $dbh->prepare(<<"") or die DBI::errstr; + INSERT INTO try_$fts(content) VALUES(?) - my @doc_ids; - for (my $i = 0; $i < @texts; $i++) { - $insert_sth->execute($texts[$i]); - $doc_ids[$i] = $dbh->last_insert_id("", "", "", ""); + my @doc_ids; + for (my $i = 0; $i < @texts; $i++) { + $insert_sth->execute($texts[$i]); + $doc_ids[$i] = $dbh->last_insert_id("", "", "", ""); + } + + # queries + SKIP: { + skip "These tests require SQLite compiled with " + . 
"ENABLE_FTS3_PARENTHESIS option", scalar @tests + unless DBD::SQLite->can('compile_options') && + grep /ENABLE_FTS3_PARENTHESIS/, DBD::SQLite::compile_options(); + my $sql = "SELECT docid FROM try_$fts WHERE content MATCH ?"; + for my $t (@tests) { + my ($query, @expected) = @$t; + @expected = map {$doc_ids[$_]} @expected; + my $results = $dbh->selectcol_arrayref($sql, undef, $query); + is_deeply($results, \@expected, "$query ($fts, unicode=$use_unicode)"); + } + } } - - # queries -SKIP: { - skip "These tests require SQLite compiled with ENABLE_FTS3_PARENTHESIS option", scalar @tests - unless DBD::SQLite->can('compile_options') && - grep /ENABLE_FTS3_PARENTHESIS/, DBD::SQLite::compile_options(); - my $sql = "SELECT docid FROM try_fts3 WHERE content MATCH ?"; - for my $t (@tests) { - my ($query, @expected) = @$t; - @expected = map {$doc_ids[$_]} @expected; - my $results = $dbh->selectcol_arrayref($sql, undef, $query); - is_deeply($results, \@expected, "$query (unicode is $use_unicode)"); - } - -} - }