Mirror of https://github.com/DBD-SQLite/DBD-SQLite (synced 2025-06-07 22:28:47 -04:00)

Commit 6835c13898 (parent c1e945b0a6)
updated doc and tests for FTS4 (but no change in code was required)

3 changed files with 100 additions and 79 deletions
@@ -2083,20 +2083,25 @@ need to call the L</create_collation> method directly.
 
 =head1 FULLTEXT SEARCH
 
-The FTS3 extension module within SQLite allows users to create special
-tables with a built-in full-text index (hereafter "FTS3 tables"). The
+The FTS extension module within SQLite allows users to create special
+tables with a built-in full-text index (hereafter "FTS tables"). The
 full-text index allows the user to efficiently query the database for
 all rows that contain one or more instances of a specified word (hereafter
 a "token"), even if the table contains many large documents.
 
-=head2 Short introduction to FTS3
+=head2 Short introduction to FTS
 
-The detailed documentation for FTS3 can be found
-at L<http://www.sqlite.org/fts3.html>. Here is a very short example :
+The first full-text search modules for SQLite were called C<FTS1> and C<FTS2>
+and are now obsolete. The latest recommended module is C<FTS4>; however
+the former module C<FTS3> is still supported.
+Detailed documentation for both C<FTS4> and C<FTS3> can be found
+at L<http://www.sqlite.org/fts3.html>, including explanations about the
+differences between these two versions.
+
+Here is a very short example of using FTS :
 
   $dbh->do(<<"") or die DBI::errstr;
-  CREATE VIRTUAL TABLE fts_example USING fts3(content)
+  CREATE VIRTUAL TABLE fts_example USING fts4(content)
 
   my $sth = $dbh->prepare("INSERT INTO fts_example(content) VALUES (?)");
   $sth->execute($_) foreach @docs_to_insert;
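For context, the workflow shown in this hunk can be exercised end to end with a short script. The sketch below is not part of the commit; the in-memory database, the sample documents and the query term are assumptions made for illustration, and an SQLite build with the FTS4 extension enabled is presumed:

    use strict;
    use warnings;
    use DBI;

    # connect to a throw-away in-memory database
    my $dbh = DBI->connect("dbi:SQLite:dbname=:memory:", "", "",
                           { RaiseError => 1 });

    # create the FTS4 table from the example above
    $dbh->do("CREATE VIRTUAL TABLE fts_example USING fts4(content)");

    # index a few sample documents
    my @docs_to_insert = ("yellow dog", "lazy brown fox", "yellow submarine");
    my $sth = $dbh->prepare("INSERT INTO fts_example(content) VALUES (?)");
    $sth->execute($_) foreach @docs_to_insert;

    # full-text query: all documents containing the token 'yellow'
    my $rows = $dbh->selectcol_arrayref(
        "SELECT content FROM fts_example WHERE content MATCH ?",
        undef, "yellow");
    print "$_\n" for @$rows;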
@@ -2111,14 +2116,14 @@ The key points in this example are :
 
 =item *
 
-The syntax for creating FTS3 tables is
+The syntax for creating FTS tables is
 
-  CREATE VIRTUAL TABLE <table_name> USING fts3(<columns>)
+  CREATE VIRTUAL TABLE <table_name> USING fts4(<columns>)
 
 where C<< <columns> >> is a list of column names. Columns may be
 typed, but the type information is ignored. If no columns
 are specified, the default is a single column named C<content>.
-In addition, FTS3 tables have an implicit column called C<docid>
+In addition, FTS tables have an implicit column called C<docid>
 (or also C<rowid>) for numbering the stored documents.
 
 =item *
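The implicit C<docid> column mentioned in this hunk can be selected like any ordinary column. A small sketch, not part of the diff, reusing the hypothetical fts_example table and query term from the previous example:

    # fetch document numbers together with their content
    my $pairs = $dbh->selectall_arrayref(
        "SELECT docid, content FROM fts_example WHERE content MATCH ?",
        undef, "yellow");
    printf "doc %d: %s\n", @$_ for @$pairs;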
@@ -2141,7 +2146,7 @@ document text, where the words pertaining to the query are highlighted.
 =back
 
 There are many more details to building and searching
-FTS3 tables, so we strongly invite you to read
+FTS tables, so we strongly invite you to read
 the full documentation at L<http://www.sqlite.org/fts3.html>.
 
 B<Incompatible change> :
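The hunk above sits in the section describing the C<snippet()> and C<offsets()> helpers (the hunk's own context line mentions highlighting of matching document text). As a hedged illustration, not part of the commit, again using the hypothetical fts_example table:

    # snippet() returns the matching text with query terms highlighted
    # (by default wrapped in <b>...</b>)
    my $snips = $dbh->selectcol_arrayref(
        "SELECT snippet(fts_example) FROM fts_example WHERE content MATCH ?",
        undef, "yellow");
    print "$_\n" for @$snips;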
@@ -2162,14 +2167,16 @@ in a separate distribution.
 =head2 Tokenizers
 
 The behaviour of full-text indexes strongly depends on how
-documents are split into I<tokens>; therefore FTS3 table
+documents are split into I<tokens>; therefore FTS table
 declarations can explicitly specify how to perform
 tokenization:
 
-  CREATE ... USING fts3(<columns>, tokenize=<tokenizer>)
+  CREATE ... USING fts4(<columns>, tokenize=<tokenizer>)
 
 where C<< <tokenizer> >> is a sequence of space-separated
-words that triggers a specific tokenizer, as explained below.
+words that triggers a specific tokenizer. Tokenizers can
+be SQLite builtins, written in C code, or Perl tokenizers.
+Both are explained below.
 
 =head3 SQLite builtin tokenizers
 
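As an aside (not part of the diff), the builtin tokenizers introduced under this heading are selected by name; for instance the C<porter> stemmer, with a table name invented for the example:

    # 'porter' stems English words, so a MATCH for 'connection'
    # also finds 'connected', 'connecting', ...
    $dbh->do("CREATE VIRTUAL TABLE stemmed_docs USING fts4(content, tokenize=porter)");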
@@ -2207,7 +2214,7 @@ ICU locale identifier as argument (such as "tr_TR" for
 Turkish as used in Turkey, or "en_AU" for English as used in
 Australia). For example:
 
-  CREATE VIRTUAL TABLE thai_text USING fts3(text, tokenize=icu th_TH)
+  CREATE VIRTUAL TABLE thai_text USING fts4(text, tokenize=icu th_TH)
 
 The ICU tokenizer implementation is very simple. It splits the input
 text according to the ICU rules for finding word boundaries and
@@ -2224,14 +2231,14 @@ In addition to the builtin SQLite tokenizers, C<DBD::SQLite>
 implements a I<perl> tokenizer, that can hook to any tokenizing
 algorithm written in Perl. This is specified as follows :
 
-  CREATE ... USING fts3(<columns>, tokenize=perl '<perl_function>')
+  CREATE ... USING fts4(<columns>, tokenize=perl '<perl_function>')
 
 where C<< <perl_function> >> is a fully qualified Perl function name
 (i.e. prefixed by the name of the package in which that function is
 declared). So for example if the function is C<my_func> in the main
 program, write
 
-  CREATE ... USING fts3(<columns>, tokenize=perl 'main::my_func')
+  CREATE ... USING fts4(<columns>, tokenize=perl 'main::my_func')
 
 That function should return a code reference that takes a string as
 single argument, and returns an iterator (another function), which
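The sentence above continues outside the hunk; the interface it describes can be sketched roughly as follows (not part of the commit). The assumption, drawn from the surrounding documentation, is that the iterator returns the term, its length, its start and end offsets, and a running token index, and returns an empty list once no tokens remain:

    sub my_func {              # referenced as tokenize=perl 'main::my_func'
      return sub {             # called with the string to tokenize
        my $string = shift;
        my $index  = 0;
        return sub {           # iterator: one token per call
          $string =~ /\w+/g or return;          # no more tokens
          my ($start, $end) = ($-[0], $+[0]);
          my $term = substr($string, $start, $end - $start);
          return ($term, $end - $start, $start, $end, $index++);
        };
      };
    }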
@@ -2264,7 +2271,7 @@ because :
 
 =item *
 
-the external, named sub is called whenever accessing a FTS3 table
+the external, named sub is called whenever accessing a FTS table
 with that tokenizer
 
 =item *
@@ -2281,32 +2288,33 @@ all terms within that string.
 =back
 
 Instead of writing tokenizers by hand, you can grab one of those
-already implemented in the L<Search::Tokenizer> module :
+already implemented in the L<Search::Tokenizer> module. For example,
+if you want to ignore differences between accented characters, you can
+write :
 
   use Search::Tokenizer;
   $dbh->do(<<"") or die DBI::errstr;
-  CREATE ... USING fts3(<columns>,
+  CREATE ... USING fts4(<columns>,
                tokenize=perl 'Search::Tokenizer::unaccent')
 
-or you can use L<Search::Tokenizer/new> to build
+Alternatively, you can use L<Search::Tokenizer/new> to build
 your own tokenizer.
 
 
 =head2 Incomplete handling of utf8 characters
 
-The current FTS3 implementation in SQLite is far from complete with
+The current FTS implementation in SQLite is far from complete with
 respect to utf8 handling : in particular, variable-length characters
 are not treated correctly by the builtin functions
 C<offsets()> and C<snippet()>.
 
-=head2 Database space for FTS3
+=head2 Database space for FTS
 
-FTS3 stores a complete copy of the indexed documents, together with
+By default, FTS stores a complete copy of the indexed documents, together with
 the fulltext index. On a large collection of documents, this can
-consume quite a lot of disk space. If copies of documents are also
-available as external resources (for example files on the filesystem),
-that space can sometimes be spared --- see the tip in the
-L<Cookbook|DBD::SQLite::Cookbook/"Sparing database disk space">.
+consume quite a lot of disk space. However, FTS has some options
+for compressing the documents, or even for not storing them at all
+-- see L<http://www.sqlite.org/fts3.html#fts4_options>.
 
 =head1 R* TREE SUPPORT
 
@@ -135,34 +135,37 @@ The function can then be used as:
   FROM results
   GROUP BY group_name;
 
-=head1 FTS3 fulltext indexing
+=head1 FTS fulltext indexing
 
 =head2 Sparing database disk space
 
-As explained in L<http://www.sqlite.org/fts3.html#section_6>, each
-FTS3 table C<I<t>> is stored internally within three regular tables
-C<I<t>_content>, C<I<t>_segments> and C<I<t>_segdir>. The last two
-tables contain the fulltext index. The first table C<I<t>_content>
-stores the complete documents being indexed ... but if copies of the
-same documents are already stored somewhere else, or can be computed
-from external resources (for example as HTML or MsWord files in the
-filesystem), then this is quite a waste of space. SQLite itself only
-needs the C<I<t>_content> table for implementing the C<offsets()> and
-C<snippet()> functions, which are not always usable anyway (in particular
-when using utf8 characters greater than 255).
+As explained in L<http://www.sqlite.org/fts3.html#fts4_options>,
+several options are available to specify how SQLite should store
+indexed documents.
 
-So an alternative strategy is to use SQLite only for the fulltext
-index and metadata, and to keep the full documents outside of SQLite :
-to do so, after each insert or update in the FTS3 table, do an update
-in the C<I<t>_content> table, setting the content column(s) to
-NULL. Of course your application will need an algorithm for finding
+One strategy is to use SQLite only for the fulltext index and
+metadata, and keep the full documents outside of SQLite; to do so, use
+the C<content=""> option. For example, the following SQL creates
+an FTS4 table with three columns - "a", "b", and "c":
+
+  CREATE VIRTUAL TABLE t1 USING fts4(content="", a, b, c);
+
+Data can be inserted into such an FTS4 table using INSERT
+statements. However, unlike ordinary FTS4 tables, the user must supply
+an explicit integer docid value. For example:
+
+  -- This statement is Ok:
+  INSERT INTO t1(docid, a, b, c) VALUES(1, 'a b c', 'd e f', 'g h i');
+
+  -- This statement causes an error, as no docid value has been provided:
+  INSERT INTO t1(a, b, c) VALUES('j k l', 'm n o', 'p q r');
+
+Of course your application will need an algorithm for finding
 the external resource corresponding to any I<docid> stored within
 SQLite. Furthermore, SQLite C<offsets()> and C<snippet()> functions
 cannot be used, so if such functionality is needed, it has to be
 directly programmed within the Perl application.
-In short, this strategy is really a hack, because FTS3 was not originally
-programmed with that behaviour in mind; however it is workable
-and has a strong impact on the size of the database file.
 
 =head1 SUPPORT
 
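A possible Perl wrapper around this recipe (a sketch, not part of the commit; the docs/ directory, the %doc_path map and the query term are invented for illustration) keeps the documents on the filesystem and only their index in SQLite:

    # contentless FTS4 table: index only, no stored documents
    $dbh->do(q{CREATE VIRTUAL TABLE doc_index USING fts4(content="", body)});

    my %doc_path;          # docid => path of the external document
    my $docid = 0;
    my $ins = $dbh->prepare(q{INSERT INTO doc_index(docid, body) VALUES (?, ?)});
    for my $path (glob "docs/*.txt") {
      open my $fh, "<", $path or die "$path: $!";
      local $/;            # slurp mode
      $ins->execute(++$docid, scalar <$fh>);
      $doc_path{$docid} = $path;
    }

    # a MATCH query yields docids, which the application maps back to files
    my $ids = $dbh->selectcol_arrayref(
        q{SELECT docid FROM doc_index WHERE body MATCH ?}, undef, "sqlite");
    print "$doc_path{$_}\n" for @$ids;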
@@ -172,10 +175,18 @@ L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=DBD-SQLite>
 
 =head1 TO DO
 
-* Add more and varied cookbook recipes, until we have enough to
+=over
+
+=item *
+
+Add more and varied cookbook recipes, until we have enough to
 turn them into a separate CPAN distribution.
 
-* Create a series of tests scripts that validate the cookbook recipies.
+=item *
+
+Create a series of test scripts that validate the cookbook recipes.
+
+=back
 
 =head1 AUTHOR
 
t/43_fts3.t

@@ -1,5 +1,4 @@
 #!/usr/bin/perl
-
 use strict;
 BEGIN {
 $| = 1;
@@ -37,7 +36,9 @@ BEGIN {
 }
 use Test::NoWarnings;
 
-plan tests => 2 * (1 + @tests) + 1;
+plan tests => 4 * @tests  # each test with unicode y/n and with fts3/fts4
+            + 2           # connect_ok with unicode y/n
+            + 1;          # Test::NoWarnings
 
 BEGIN {
 # Sadly perl for windows (and probably sqlite, too) may hang
@@ -78,14 +79,15 @@ for my $use_unicode (0, 1) {
 # connect
 my $dbh = connect_ok( RaiseError => 1, sqlite_unicode => $use_unicode );
 
-# create fts3 table
+for my $fts (qw/fts3 fts4/) {
+# create fts table
 $dbh->do(<<"") or die DBI::errstr;
-CREATE VIRTUAL TABLE try_fts3
-USING fts3(content, tokenize=perl 'main::locale_tokenizer')
+CREATE VIRTUAL TABLE try_$fts
+USING $fts(content, tokenize=perl 'main::locale_tokenizer')
 
 # populate it
 my $insert_sth = $dbh->prepare(<<"") or die DBI::errstr;
-INSERT INTO try_fts3(content) VALUES(?)
+INSERT INTO try_$fts(content) VALUES(?)
 
 my @doc_ids;
 for (my $i = 0; $i < @texts; $i++) {
@@ -94,20 +96,20 @@ for my $use_unicode (0, 1) {
 }
 
 # queries
 SKIP: {
-skip "These tests require SQLite compiled with ENABLE_FTS3_PARENTHESIS option", scalar @tests
+skip "These tests require SQLite compiled with "
+   . "ENABLE_FTS3_PARENTHESIS option", scalar @tests
 unless DBD::SQLite->can('compile_options') &&
 grep /ENABLE_FTS3_PARENTHESIS/, DBD::SQLite::compile_options();
-my $sql = "SELECT docid FROM try_fts3 WHERE content MATCH ?";
+my $sql = "SELECT docid FROM try_$fts WHERE content MATCH ?";
 for my $t (@tests) {
 my ($query, @expected) = @$t;
 @expected = map {$doc_ids[$_]} @expected;
 my $results = $dbh->selectcol_arrayref($sql, undef, $query);
-is_deeply($results, \@expected, "$query (unicode is $use_unicode)");
+is_deeply($results, \@expected, "$query ($fts, unicode=$use_unicode)");
+}
+}
 }
 
-}
 
 }
 