
Added support for FTS3 fulltext searches: Perl tokenizers, documentation and tests

Laurent Dami 2010-07-13 06:28:21 +00:00
parent 413bd0ac9d
commit d43cf63ad0
12 changed files with 988 additions and 3 deletions


@@ -1,6 +1,11 @@
Changes for Perl extension DBD-SQLite
1.30_04 to be released
- Added support for FTS3 tokenizers written in Perl. Added tests
and documentation on how to use FTS3. Changed compilation flag
to use the recommended -DSQLITE_ENABLE_FTS3_PARENTHESIS
*** MAY POSSIBLY BREAK OLD APPLICATIONS THAT ALREADY USED FTS3 ***
(DAMI)
- Fixed various backward compatibility issues back to SQLite 3.6.1
(ISHIGAKI)
- Resolved #58332: Documentation error for preventing fsync


@@ -212,8 +212,14 @@ if ( $sqlite_inc ) {
my @CC_DEFINE = (
# '-DSQLITE_CORE',
'-DSQLITE_ENABLE_FTS3',
# Disabled until we have a test for this
# '-DSQLITE_ENABLE_FTS3_PARENTHESIS', # for sqlite >= 3.6.10
# L. Dami 10.07.2010: now enabling the new FTS3 syntax, because
# that's the recommendation from SQLite for new applications
# (used to be "Disabled until we have a test for this").
# This change MAY POSSIBLY BREAK OLD APPLICATIONS THAT ALREADY
# USED FTS3 ... but sooner or later that change had to be done!
'-DSQLITE_ENABLE_FTS3_PARENTHESIS', # for sqlite >= 3.6.10
'-DSQLITE_ENABLE_COLUMN_METADATA',
'-DNDEBUG=1',
);
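Because the parenthesis support is a compile-time switch, an application can check at runtime whether its DBD::SQLite build has it. Below is a minimal sketch (an illustration, not part of this commit), assuming SQLite 3.6.23 or newer, where C<PRAGMA compile_options> is available:

  use strict;
  use warnings;
  use DBI;

  # open an in-memory database just to query the build options
  my $dbh  = DBI->connect("dbi:SQLite:dbname=:memory:", "", "",
                          { RaiseError => 1 });
  my $opts = $dbh->selectcol_arrayref("PRAGMA compile_options");

  # compile_options reports flags without the SQLITE_ prefix
  print "enhanced FTS3 query syntax available\n"
    if grep { $_ eq 'ENABLE_FTS3_PARENTHESIS' } @$opts;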


@@ -196,6 +196,22 @@ backup_to_file(dbh, filename)
OUTPUT:
RETVAL
static int
register_fts3_perl_tokenizer(dbh)
SV *dbh
ALIAS:
DBD::SQLite::db::sqlite_register_fts3_perl_tokenizer = 1
CODE:
RETVAL = sqlite_db_register_fts3_perl_tokenizer(aTHX_ dbh);
OUTPUT:
RETVAL
MODULE = DBD::SQLite PACKAGE = DBD::SQLite::st
PROTOTYPES: DISABLE


@@ -19,5 +19,6 @@
#include <dbd_xsh.h>
#include "sqlite3.h"
#include "fts3_tokenizer.h"
#endif

dbdimp.c

@@ -20,6 +20,14 @@ DBISTATE_DECLARE;
#define croak_if_stmt_is_null()
#endif
/*-----------------------------------------------------*
* Globals
*-----------------------------------------------------*/
imp_dbh_t *last_executed_dbh; /* needed by perl_tokenizer
to know if unicode is on/off */
/*-----------------------------------------------------*
* Helper Methods
*-----------------------------------------------------*/
@@ -487,6 +495,298 @@ sqlite_db_last_insert_id(SV *dbh, imp_dbh_t *imp_dbh, SV *catalog, SV *schema, S
return newSViv((IV)sqlite3_last_insert_rowid(imp_dbh->db));
}
/* ======================================================================
* EXPERIMENTAL bindings for FTS3 TOKENIZERS
* ====================================================================== */
typedef struct perl_tokenizer {
sqlite3_tokenizer base;
SV *coderef; /* the perl tokenizer is a coderef that takes
a string and returns a cursor coderef */
} perl_tokenizer;
typedef struct perl_tokenizer_cursor {
sqlite3_tokenizer_cursor base;
SV *coderef; /* ref to the closure that returns terms */
char *pToken; /* storage for a copy of the last token */
int nTokenAllocated; /* space allocated to pToken buffer */
/* members below are only used if the input string is in utf8 */
const char *pInput; /* input we are tokenizing */
const char *lastByteOffset; /* offset into pInput */
int lastCharOffset; /* char offset corresponding to lastByteOffset */
} perl_tokenizer_cursor;
/*
** Create a new tokenizer instance.
** Will be called whenever a FTS3 table is created with
** CREATE .. USING fts3( ... , tokenize=perl qualified::function::name)
** where qualified::function::name is a fully qualified perl function
*/
static int perl_tokenizer_Create(
int argc, const char * const *argv,
sqlite3_tokenizer **ppTokenizer
){
perl_tokenizer *t;
t = (perl_tokenizer *) sqlite3_malloc(sizeof(*t));
if( t==NULL ) return SQLITE_NOMEM;
memset(t, 0, sizeof(*t));
dTHX;
dSP;
ENTER;
SAVETMPS;
/* call the qualified::function::name */
PUSHMARK(SP);
PUTBACK;
int n_retval = call_pv(argv[0], G_SCALAR);
SPAGAIN;
/* store a copy of the returned coderef into the tokenizer structure */
if (n_retval != 1) {
warn("tokenizer_Create returned %d arguments", n_retval);
}
SV *retval = POPs;
t->coderef = newSVsv(retval);
*ppTokenizer = &t->base;
PUTBACK;
FREETMPS;
LEAVE;
return SQLITE_OK;
}
/*
** Destroy a tokenizer
*/
static int perl_tokenizer_Destroy(sqlite3_tokenizer *pTokenizer){
dTHX;
perl_tokenizer *t = (perl_tokenizer *) pTokenizer;
sv_free(t->coderef);
sqlite3_free(t);
return SQLITE_OK;
}
/*
** Prepare to begin tokenizing a particular string. The input
** string to be tokenized is supposed to be pInput[0..nBytes-1] ...
** except that the nBytes passed by fts3 is -1 (don't know why)!
** This is passed to the tokenizer instance, which then returns a
** closure implementing the cursor (so the cursor is again a coderef).
*/
static int perl_tokenizer_Open(
sqlite3_tokenizer *pTokenizer, /* Tokenizer object */
const char *pInput, int nBytes, /* Input buffer */
sqlite3_tokenizer_cursor **ppCursor /* OUT: Created tokenizer cursor */
){
perl_tokenizer *t = (perl_tokenizer *)pTokenizer;
/* allocate and initialize the cursor struct */
perl_tokenizer_cursor *c;
c = (perl_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
if( c==NULL ) return SQLITE_NOMEM;
memset(c, 0, sizeof(*c));
*ppCursor = &c->base;
/* flags for creating the Perl SV containing the input string */
U32 flags = SVs_TEMP; /* will call sv_2mortal */
/* special handling if working with utf8 strings */
if (last_executed_dbh->unicode) { /* global var ... no better way! */
/* data to keep track of byte offsets */
c->lastByteOffset = c->pInput = pInput;
c->lastCharOffset = 0;
/* string passed to Perl needs to be flagged as utf8 */
flags |= SVf_UTF8;
}
dTHX;
dSP;
ENTER;
SAVETMPS;
/* build a Perl copy of the input string */
if (nBytes < 0) { /* we get -1 from fts3; don't know why! */
nBytes = strlen(pInput);
}
SV *perl_string = newSVpvn_flags(pInput, nBytes, flags);
/* call the tokenizer coderef */
PUSHMARK(SP);
XPUSHs(perl_string);
PUTBACK;
int n_retval = call_sv(t->coderef, G_SCALAR);
SPAGAIN;
/* store the cursor coderef returned by the tokenizer */
if (n_retval != 1) {
warn("tokenizer returned %d arguments", n_retval);
}
c->coderef = newSVsv(POPs);
PUTBACK;
FREETMPS;
LEAVE;
return SQLITE_OK;
}
/*
** Close a tokenization cursor previously opened by a call to
** perl_tokenizer_Open() above.
*/
static int perl_tokenizer_Close(sqlite3_tokenizer_cursor *pCursor){
perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor;
dTHX;
sv_free(c->coderef);
sqlite3_free(c);
return SQLITE_OK;
}
/*
** Extract the next token from a tokenization cursor. The cursor must
** have been opened by a prior call to perl_tokenizer_Open().
*/
static int perl_tokenizer_Next(
sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by perl_tokenizer_Open */
const char **ppToken, /* OUT: *ppToken is the token text */
int *pnBytes, /* OUT: Number of bytes in token */
int *piStartOffset, /* OUT: Starting offset of token */
int *piEndOffset, /* OUT: Ending offset of token */
int *piPosition /* OUT: Position integer of token */
){
perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor;
int result;
dTHX;
dSP;
ENTER;
SAVETMPS;
/* call the cursor */
PUSHMARK(SP);
PUTBACK;
int n_retval = call_sv(c->coderef, G_ARRAY);
SPAGAIN;
/* if we get back an empty list, there is no more token */
if (n_retval == 0) {
result = SQLITE_DONE;
}
/* otherwise, get token details from the return list */
else {
if (n_retval != 5) {
warn("tokenizer cursor returned %d arguments", n_retval);
}
*piPosition = POPi;
*piEndOffset = POPi;
*piStartOffset = POPi;
*pnBytes = POPi;
char *token = POPpx;
if (c->pInput) { /* if working with utf8 data */
/* recompute *pnBytes in bytes, not in chars */
*pnBytes = strlen(token);
/* recompute start/end offsets in bytes, not in chars */
I32 hop = *piStartOffset - c->lastCharOffset;
char *byteOffset = utf8_hop(c->lastByteOffset, hop);
hop = *piEndOffset - *piStartOffset;
*piStartOffset = byteOffset - c->pInput;
byteOffset = utf8_hop(byteOffset, hop);
*piEndOffset = byteOffset - c->pInput;
/* remember where we are for the next round */
c->lastCharOffset = *piEndOffset;
c->lastByteOffset = byteOffset;
}
/* make sure we have enough storage for copying the token */
if (*pnBytes > c->nTokenAllocated ){
char *pNew;
c->nTokenAllocated = *pnBytes + 20;
pNew = sqlite3_realloc(c->pToken, c->nTokenAllocated);
if( !pNew ) return SQLITE_NOMEM;
c->pToken = pNew;
}
/* need to copy the token into the C cursor before perl frees that
memory */
memcpy(c->pToken, token, *pnBytes);
*ppToken = c->pToken;
result = SQLITE_OK;
}
PUTBACK;
FREETMPS;
LEAVE;
return result;
}
/*
** The set of routines that implement the perl tokenizer
*/
sqlite3_tokenizer_module perl_tokenizer_Module = {
0,
perl_tokenizer_Create,
perl_tokenizer_Destroy,
perl_tokenizer_Open,
perl_tokenizer_Close,
perl_tokenizer_Next
};
/*
** Register the perl tokenizer with FTS3
*/
int sqlite_db_register_fts3_perl_tokenizer(pTHX_ SV *dbh)
{
D_imp_dbh(dbh);
int rc;
sqlite3_stmt *pStmt;
const char zSql[] = "SELECT fts3_tokenizer(?, ?)";
sqlite3_tokenizer_module *p = &perl_tokenizer_Module;
rc = sqlite3_prepare_v2(imp_dbh->db, zSql, -1, &pStmt, 0);
if( rc!=SQLITE_OK ){
return rc;
}
sqlite3_bind_text(pStmt, 1, "perl", -1, SQLITE_STATIC);
sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
sqlite3_step(pStmt);
return sqlite3_finalize(pStmt);
}
/* ======================================================================
* END OF EXPERIMENTAL bindings for FTS3 TOKENIZERS
* ====================================================================== */
int
sqlite_st_prepare(SV *sth, imp_sth_t *imp_sth, char *statement, SV *attribs)
{
@@ -566,6 +866,8 @@ sqlite_st_execute(SV *sth, imp_sth_t *imp_sth)
croak_if_db_is_null();
croak_if_stmt_is_null();
last_executed_dbh = imp_dbh;
/* COMPAT: sqlite3_sql is only available for 3006000 or newer */
sqlite_trace(sth, imp_sth, 3, form("executing %s", sqlite3_sql(imp_sth->stmt)));
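From the Perl side, the unicode flag that this global exposes to the tokenizer is simply the C<sqlite_unicode> attribute given at connect time. A minimal sketch (the database name is a placeholder):

  use DBI;
  my $dbh = DBI->connect("dbi:SQLite:dbname=demo.db", "", "",
                         { RaiseError => 1, sqlite_unicode => 1 });
  # strings handed to perl tokenizers on this handle now arrive as
  # Perl character strings (UTF-8 flag set), not raw bytes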


@@ -100,6 +100,8 @@ SV* sqlite_db_update_hook( pTHX_ SV *dbh, SV *hook );
int sqlite_db_set_authorizer( pTHX_ SV *dbh, SV *authorizer );
AV* sqlite_compile_options();
int sqlite_db_register_fts3_perl_tokenizer(pTHX_ SV *dbh);
#ifdef SvUTF8_on
static SV *

fts3_tokenizer.h (new file)

@@ -0,0 +1,154 @@
/************** Begin file fts3_tokenizer.h **********************************/
/*
** 2006 July 10
**
** The author disclaims copyright to this source code.
**
*************************************************************************
** Defines the interface to tokenizers used by fulltext-search. There
** are three basic components:
**
** sqlite3_tokenizer_module is a singleton defining the tokenizer
** interface functions. This is essentially the class structure for
** tokenizers.
**
** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
** including customization information defined at creation time.
**
** sqlite3_tokenizer_cursor is generated by a tokenizer to generate
** tokens from a particular input.
*/
#ifndef _FTS3_TOKENIZER_H_
#define _FTS3_TOKENIZER_H_
/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
** If tokenizers are to be allowed to call sqlite3_*() functions, then
** we will need a way to register the API consistently.
*/
/*
** Structures used by the tokenizer interface. When a new tokenizer
** implementation is registered, the caller provides a pointer to
** an sqlite3_tokenizer_module containing pointers to the callback
** functions that make up an implementation.
**
** When an fts3 table is created, it passes any arguments passed to
** the tokenizer clause of the CREATE VIRTUAL TABLE statement to the
** sqlite3_tokenizer_module.xCreate() function of the requested tokenizer
** implementation. The xCreate() function in turn returns an
** sqlite3_tokenizer structure representing the specific tokenizer to
** be used for the fts3 table (customized by the tokenizer clause arguments).
**
** To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen()
** method is called. It returns an sqlite3_tokenizer_cursor object
** that may be used to tokenize a specific input buffer based on
** the tokenization rules supplied by a specific sqlite3_tokenizer
** object.
*/
typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
typedef struct sqlite3_tokenizer sqlite3_tokenizer;
typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;
struct sqlite3_tokenizer_module {
/*
** Structure version. Should always be set to 0.
*/
int iVersion;
/*
** Create a new tokenizer. The values in the argv[] array are the
** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL
** TABLE statement that created the fts3 table. For example, if
** the following SQL is executed:
**
** CREATE .. USING fts3( ... , tokenizer <tokenizer-name> arg1 arg2)
**
** then argc is set to 2, and the argv[] array contains pointers
** to the strings "arg1" and "arg2".
**
** This method should return either SQLITE_OK (0), or an SQLite error
** code. If SQLITE_OK is returned, then *ppTokenizer should be set
** to point at the newly created tokenizer structure. The generic
** sqlite3_tokenizer.pModule variable should not be initialised by
** this callback. The caller will do so.
*/
int (*xCreate)(
int argc, /* Size of argv array */
const char *const*argv, /* Tokenizer argument strings */
sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */
);
/*
** Destroy an existing tokenizer. The fts3 module calls this method
** exactly once for each successful call to xCreate().
*/
int (*xDestroy)(sqlite3_tokenizer *pTokenizer);
/*
** Create a tokenizer cursor to tokenize an input buffer. The caller
** is responsible for ensuring that the input buffer remains valid
** until the cursor is closed (using the xClose() method).
*/
int (*xOpen)(
sqlite3_tokenizer *pTokenizer, /* Tokenizer object */
const char *pInput, int nBytes, /* Input buffer */
sqlite3_tokenizer_cursor **ppCursor /* OUT: Created tokenizer cursor */
);
/*
** Destroy an existing tokenizer cursor. The fts3 module calls this
** method exactly once for each successful call to xOpen().
*/
int (*xClose)(sqlite3_tokenizer_cursor *pCursor);
/*
** Retrieve the next token from the tokenizer cursor pCursor. This
** method should either return SQLITE_OK and set the values of the
** "OUT" variables identified below, or SQLITE_DONE to indicate that
** the end of the buffer has been reached, or an SQLite error code.
**
** *ppToken should be set to point at a buffer containing the
** normalized version of the token (i.e. after any case-folding and/or
** stemming has been performed). *pnBytes should be set to the length
** of this buffer in bytes. The input text that generated the token is
** identified by the byte offsets returned in *piStartOffset and
** *piEndOffset. *piStartOffset should be set to the index of the first
** byte of the token in the input buffer. *piEndOffset should be set
** to the index of the first byte just past the end of the token in
** the input buffer.
**
** The buffer *ppToken is set to point at is managed by the tokenizer
** implementation. It is only required to be valid until the next call
** to xNext() or xClose().
*/
/* TODO(shess) current implementation requires pInput to be
** nul-terminated. This should either be fixed, or pInput/nBytes
** should be converted to zInput.
*/
int (*xNext)(
sqlite3_tokenizer_cursor *pCursor, /* Tokenizer cursor */
const char **ppToken, int *pnBytes, /* OUT: Normalized text for token */
int *piStartOffset, /* OUT: Byte offset of token in input buffer */
int *piEndOffset, /* OUT: Byte offset of end of token in input buffer */
int *piPosition /* OUT: Number of tokens returned before this one */
);
};
struct sqlite3_tokenizer {
const sqlite3_tokenizer_module *pModule; /* The module for this tokenizer */
/* Tokenizer implementations will typically add additional fields */
};
struct sqlite3_tokenizer_cursor {
sqlite3_tokenizer *pTokenizer; /* Tokenizer for this cursor. */
/* Tokenizer implementations will typically add additional fields */
};
int fts3_global_term_cnt(int iTerm, int iCol);
int fts3_term_cnt(int iTerm, int iCol);
#endif /* _FTS3_TOKENIZER_H_ */
/************** End of fts3_tokenizer.h **************************************/


@@ -55,6 +55,7 @@ sub driver {
DBD::SQLite::db->install_method('sqlite_backup_from_file');
DBD::SQLite::db->install_method('sqlite_backup_to_file');
DBD::SQLite::db->install_method('sqlite_enable_load_extension');
DBD::SQLite::db->install_method('sqlite_register_fts3_perl_tokenizer');
$methods_are_installed++;
}
@@ -71,6 +72,7 @@ sub CLONE {
undef $drh;
}
package DBD::SQLite::dr;
sub connect {
@@ -120,13 +122,16 @@ sub connect {
# Hand off to the actual login function
DBD::SQLite::db::_login($dbh, $real, $user, $auth, $attr) or return undef;
# Register the on-demand collation installer and REGEXP function
# Register the on-demand collation installer, REGEXP function and
# perl tokenizer
if ( DBD::SQLite::NEWAPI ) {
$dbh->sqlite_collation_needed( \&install_collation );
$dbh->sqlite_create_function( "REGEXP", 2, \&regexp );
$dbh->sqlite_register_fts3_perl_tokenizer();
} else {
$dbh->func( \&install_collation, "collation_needed" );
$dbh->func( "REGEXP", 2, \&regexp, "create_function" );
$dbh->func( "register_fts3_perl_tokenizer" );
}
# HACK: Since PrintWarn = 0 doesn't seem to actually prevent warnings
@@ -1645,6 +1650,234 @@ I<requests> for collations. In other words, if you want to change
the behaviour of a collation within an existing C<$dbh>, you
need to call the L</create_collation> method directly.
=head1 FULLTEXT SEARCH
The FTS3 extension module within SQLite allows users to create special
tables with a built-in full-text index (hereafter "FTS3 tables"). The
full-text index allows the user to efficiently query the database for
all rows that contain one or more instances of a specified word (hereafter
a "token"), even if the table contains many large documents.
=head2 Short introduction to FTS3
The detailed documentation for FTS3 can be found
at L<http://www.sqlite.org/fts3.html>. Here is a very short example:
$dbh->do(<<"") or die DBI::errstr;
CREATE VIRTUAL TABLE fts_example USING fts3(content)
my $sth = $dbh->prepare("INSERT INTO fts_example(content) VALUES (?)");
$sth->execute($_) foreach @docs_to_insert;
my $results = $dbh->selectall_arrayref(<<"");
SELECT docid, snippet(content) FROM fts_example WHERE content MATCH 'foo'
The key points in this example are:
=over
=item *
The syntax for creating FTS3 tables is
CREATE VIRTUAL TABLE <table_name> USING fts3(<columns>)
where C<< <columns> >> is a list of column names. Columns may be
typed, but the type information is ignored. If no columns
are specified, the default is a single column named C<content>.
In addition, FTS3 tables have an implicit column called C<docid>
(also known as C<rowid>) for numbering the stored documents.
=item *
Statements for inserting, updating or deleting records
use the same syntax as for regular SQLite tables.
=item *
Full-text searches are specified with the C<MATCH> operator, and an
operand which may be a single word, a word prefix ending with '*', a
list of words, a "phrase query" in double quotes, or a boolean combination
of the above (an example of each form is shown just after this list).
=item *
The builtin function C<snippet(...)> builds a formatted excerpt of the
document text, where the words pertaining to the query are highlighted.
=back
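For illustration, here is one MATCH operand of each form, run against the C<fts_example> table above (the terms themselves are placeholders):

  my $sql = "SELECT docid FROM fts_example WHERE content MATCH ?";
  for my $query ('foo',                     # single word
                 'foo*',                    # word prefix
                 '"foo bar"',               # phrase query
                 'foo NOT bar',             # boolean combination
                 '(foo OR bar) AND baz*') { # nested boolean expression
    my $docids = $dbh->selectcol_arrayref($sql, undef, $query);
  }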
There are many more details to building and searching
FTS3 tables, so we strongly invite you to read
the full documentation at L<http://www.sqlite.org/fts3.html>.
B<Incompatible change>:
starting from version 1.31, C<DBD::SQLite> uses the new, recommended
"Enhanced Query Syntax" for binary set operators (AND, OR, NOT, possibly
nested with parenthesis). Previous versions of C<DBD::SQLite> used the
"Standard Query Syntax" (see L<http://www.sqlite.org/fts3.html#section_3_2>).
Unfortunately this is a compilation switch, so it cannot be tuned
at runtime; however, since FTS3 was never advertised in versions prior
to 1.31, the change should be invisible to the vast majority of
C<DBD::SQLite> users. If, however, an application
was nevertheless built using the "Standard Query" syntax,
it has to be migrated; the conversion
function provided in L<DBD::SQLite::FTS3Transitional>
is there to help.
=head2 Tokenizers
The behaviour of full-text indexes strongly depends on how
documents are split into I<tokens>; therefore FTS3 table
declarations can explicitly specify how to perform
tokenization:
CREATE ... USING fts3(<columns>, tokenize=<tokenizer>)
where C<< <tokenizer> >> is a sequence of space-separated
words that triggers a specific tokenizer, as explained below.
=head3 SQLite builtin tokenizers
SQLite comes with three builtin tokenizers:
=over
=item simple
Under the I<simple> tokenizer, a term is a contiguous sequence of
eligible characters, where eligible characters are all alphanumeric
characters, the "_" character, and all characters with UTF codepoints
greater than or equal to 128. All other characters are discarded when
splitting a document into terms. They serve only to separate adjacent
terms.
All uppercase characters within the ASCII range (UTF codepoints less
than 128), are transformed to their lowercase equivalents as part of
the tokenization process. Thus, full-text queries are case-insensitive
when using the simple tokenizer.
=item porter
The I<porter> tokenizer uses the same rules to separate the input
document into terms, but as well as folding all terms to lower case it
uses the Porter Stemming algorithm to reduce related English language
words to a common root (an example follows this list).
=item icu
If SQLite is compiled with the SQLITE_ENABLE_ICU
pre-processor symbol defined, then there exists a built-in tokenizer
named "icu" implemented using the ICU library, and taking an
ICU locale identifier as argument (such as "tr_TR" for
Turkish as used in Turkey, or "en_AU" for English as used in
Australia). For example:
CREATE VIRTUAL TABLE thai_text USING fts3(text, tokenize=icu th_TH)
The ICU tokenizer implementation is very simple. It splits the input
text according to the ICU rules for finding word boundaries and
discards any tokens that consist entirely of white-space. This may be
suitable for some applications in some locales, but not all. If more
complex processing is required, for example to implement stemming or
discard punctuation, use the perl tokenizer as explained below.
=back
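For example, here is how tables could explicitly select the I<simple> or I<porter> tokenizer (a sketch; table and column names are illustrative):

  $dbh->do(<<"") or die DBI::errstr;
  CREATE VIRTUAL TABLE docs_simple USING fts3(content, tokenize=simple)

  $dbh->do(<<"") or die DBI::errstr;
  CREATE VIRTUAL TABLE docs_porter USING fts3(content, tokenize=porter)

  # with the porter tokenizer, a MATCH for 'run' also finds
  # documents containing 'runs' or 'running'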
=head3 Perl tokenizers
In addition to the builtin SQLite tokenizers, C<DBD::SQLite>
implements a I<perl> tokenizer which can hook into any tokenizing
algorithm written in Perl. This is specified as follows:
CREATE ... USING fts3(<columns>, tokenize=perl '<perl_function>')
where C<< <perl_function> >> is a fully qualified Perl function name
(i.e. prefixed by the name of the package in which that function is
declared). So for example if the function is C<my_func> in the main
program, write
CREATE ... USING fts3(<columns>, tokenize=perl 'main::my_func')
That function should return a code reference that takes a string as
single argument, and returns an iterator (another function), which
returns a tuple C<< ($term, $len, $start, $end, $index) >> for each
term. Here is a simple example that tokenizes on words according to
the current perl locale:
sub locale_tokenizer {
return sub {
my $string = shift;
use locale;
my $regex = qr/\w+/;
my $term_index = 0;
return sub { # closure
$string =~ /$regex/g or return; # either match, or no more token
my ($start, $end) = ($-[0], $+[0]);
my $len = $end-$start;
my $term = substr($string, $start, $len);
return ($term, $len, $start, $end, $term_index++);
}
};
}
There must be three levels of subs, in a kind of "Russian dolls"
structure (a complete usage example is shown after the list below), because:
=over
=item *
the external, named sub is called whenever accessing a FTS3 table
with that tokenizer
=item *
the inner, anonymous sub is called whenever a new string
needs to be tokenized (either for inserting new text into the table,
or for analyzing a query).
=item *
the innermost, anonymous sub is called repeatedly for retrieving
all terms within that string.
=back
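Putting the pieces together, the C<locale_tokenizer> above could be used like this (a condensed sketch along the lines of the test file F<t/43_fts3.t>; C<@documents> and C<$word> are placeholders):

  $dbh->do(<<"") or die DBI::errstr;
  CREATE VIRTUAL TABLE my_docs
         USING fts3(content, tokenize=perl 'main::locale_tokenizer')

  my $sth = $dbh->prepare("INSERT INTO my_docs(content) VALUES (?)");
  $sth->execute($_) foreach @documents;

  my $docids = $dbh->selectcol_arrayref(
    "SELECT docid FROM my_docs WHERE content MATCH ?", undef, $word);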
Instead of writing tokenizers by hand, you can grab one of the tokenizers
already implemented in the L<Search::Tokenizer> module:
use Search::Tokenizer;
$dbh->do(<<"") or die DBI::errstr;
CREATE ... USING fts3(<columns>,
tokenize=perl 'Search::Tokenizer::unaccent')
or you can use L<Search::Tokenizer/new> to build
your own tokenizer.
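Alternatively, a hand-rolled variant is not much work. The sketch below (an illustration, not part of this distribution) folds case and strips combining accents with the core L<Unicode::Normalize> module, so that a query for "bergere" would also match "bergère"; it assumes the handle was opened with C<< sqlite_unicode => 1 >>, so that the input is a character string:

  use Unicode::Normalize qw/NFKD/;

  sub unaccent_tokenizer { # illustrative only
    return sub {
      my $string = shift;
      my $term_index = 0;
      return sub {
        $string =~ /\w+/g or return;  # either match, or no more token
        my ($start, $end) = ($-[0], $+[0]);
        my $term = lc substr($string, $start, $end - $start);
        $term = NFKD($term);          # decompose characters ...
        $term =~ s/\pM//g;            # ... then drop combining marks
        return ($term, length($term), $start, $end, $term_index++);
      };
    };
  }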
=head2 Incomplete handling of utf8 characters
The current FTS3 implementation in SQLite is far from complete with
respect to utf8 handling: in particular, variable-length characters
are not treated correctly by the builtin functions
C<offsets()> and C<snippet()>.
=head2 Database space for FTS3
FTS3 stores a complete copy of the indexed documents, together with
the fulltext index. On a large collection of documents, this can
consume quite a lot of disk space. If copies of documents are also
available as external resources (for example files on the filesystem),
that space can sometimes be spared --- see the tip in the
L<Cookbook|DBD::SQLite::Cookbook/"Sparing database disk space">.
=head1 FOR DBD::SQLITE EXTENSION AUTHORS
Since 1.30_01, you can retrieve the bundled sqlite C source and/or


@@ -9,6 +9,8 @@ This is the L<DBD::SQLite> cookbook.
It is intended to provide a place to keep a variety of functions and
formulas for use in callback APIs in L<DBD::SQLite>.
=head1 AGGREGATE FUNCTIONS
=head2 Variance
This is a simple aggregate function which returns a variance. It is
@@ -140,6 +142,35 @@ The function can then be used as:
FROM results
GROUP BY group_name;
=head1 FTS3 fulltext indexing
=head2 Sparing database disk space
As explained in L<http://www.sqlite.org/fts3.html#section_6>, each
FTS3 table C<I<t>> is stored internally within three regular tables
C<I<t>_content>, C<I<t>_segments> and C<I<t>_segdir>. The last two
tables contain the fulltext index. The first table C<I<t>_content>
stores the complete documents being indexed ... but if copies of the
same documents are already stored somewhere else, or can be computed
from external resources (for example as HTML or MS Word files in the
filesystem), then this is quite a waste of space. SQLite itself only
needs the C<I<t>_content> table for implementing the C<offsets()> and
C<snippet()> functions, which are not always usable anyway (in particular
with characters whose code points are greater than 255).
So an alternative strategy is to use SQLite only for the fulltext
index and metadata, and to keep the full documents outside of SQLite:
to do so, after each insert or update in the FTS3 table, do an update
in the C<I<t>_content> table, setting the content column(s) to
NULL. Of course your application will need an algorithm for finding
the external resource corresponding to any I<docid> stored within
SQLite. Furthermore, the SQLite C<offsets()> and C<snippet()> functions
can then no longer be used, so if such functionality is needed, it has
to be programmed directly within the Perl application.
In short, this strategy is really a hack, because FTS3 was not originally
designed with that behaviour in mind; however it is workable and can
dramatically reduce the size of the database file. A short sketch follows.
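The sketch below assumes a table created as C<CREATE VIRTUAL TABLE t USING fts3(content)>, whose internal content table is then C<t_content> with a column named C<c0content> (check the FTS3 internals documentation for the exact naming scheme):

  my $insert = $dbh->prepare("INSERT INTO t(content) VALUES (?)");
  my $blank  = $dbh->prepare(
    "UPDATE t_content SET c0content = NULL WHERE docid = ?");
  foreach my $doc (@external_documents) {  # placeholder list
    $insert->execute($doc);                # index the document ...
    $blank->execute($dbh->last_insert_id("", "", "", ""));
                                           # ... then drop the stored copy
  }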
=head1 SUPPORT
Bugs should be reported via the CPAN bug tracker at
@@ -157,6 +188,8 @@ turn them into a separate CPAN distribution.
Adam Kennedy E<lt>adamk@cpan.orgE<gt>
Laurent Dami E<lt>dami@cpan.orgE<gt>
=head1 COPYRIGHT
Copyright 2009 Adam Kennedy.


@@ -0,0 +1,96 @@
package DBD::SQLite::FTS3Transitional;
use strict;
use warnings;
no warnings 'uninitialized';
use Exporter 'import';
our @EXPORT_OK = qw/fts3_convert/;
sub fts3_convert {
my $in = shift;
my $out = "";
# decompose input string into tokens
my @tokens = $in =~ / - # minus sign
| \bOR\b # OR keyword
| ".*?" # phrase query
| \S+ # term
/xg;
# build the output string
while (@tokens) {
# -a => (NOT a)
if ($tokens[0] eq '-') {
my (undef, $right) = splice(@tokens, 0, 2);
$out .= " (NOT $right)";
}
# a OR b => (a OR b)
elsif (@tokens >= 2 && $tokens[1] eq 'OR') {
my ($left, undef, $right) = splice(@tokens, 0, 3);
if ($right eq '-') {
$right = "NOT " . shift @tokens;
}
$out .= " ($left OR $right)";
}
# plain term
else {
$out .= " " . shift @tokens;
}
}
return $out;
}
1;
__END__
=head1 NAME
DBD::SQLite::FTS3Transitional - helper function for migrating FTS3 applications
=head1 SYNOPSIS
use DBD::SQLite::FTS3Transitional qw/fts3_convert/;
my $new_match_syntax = fts3_convert($old_match_syntax);
my $sql = "SELECT ... FROM ... WHERE col MATCH $new_match_syntax";
=head1 DESCRIPTION
Starting from version 1.31, C<DBD::SQLite> uses the new, recommended
"Enhanced Query Syntax" for binary set operators in fulltext FTS3 queries
(AND, OR, NOT, possibly nested with parentheses).
Previous versions of C<DBD::SQLite> used the
"Standard Query Syntax" (see L<http://www.sqlite.org/fts3.html#section_3_2>).
This module helps convert SQLite applications built with the old
"Standard" query syntax to the new "Enhanced" syntax.
=head1 FUNCTIONS
=head2 fts3_convert
Takes as input a string for the MATCH clause in a FTS3 fulltext search;
returns the same clause rewritten in the new, "Enhanced" syntax.
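A couple of conversions, borrowed from the test file F<t/44_fts3_transitional.t>, illustrate the rewriting (the result may carry leading whitespace, trimmed here for readability):

  use DBD::SQLite::FTS3Transitional qw/fts3_convert/;

  fts3_convert('foo -bar');           # gives 'foo (NOT bar)'
  fts3_convert('foo bar OR bie buz'); # gives 'foo (bar OR bie) buz'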
=head1 AUTHOR
Laurent Dami E<lt>dami@cpan.orgE<gt>
=head1 COPYRIGHT
Copyright 2010 Laurent Dami.
This program is free software; you can redistribute
it and/or modify it under the same terms as Perl itself.
The full text of the license can be found in the
LICENSE file included with this module.
=cut

t/43_fts3.t (new file)

@@ -0,0 +1,103 @@
#!/usr/bin/perl
use strict;
BEGIN {
$| = 1;
$^W = 1;
}
use t::lib::Test qw/connect_ok/;
use Test::More;
my @texts = ("il était une bergère",
"qui gardait ses moutons",
"elle fit un fromage",
"du lait de ses moutons");
my @tests = (
# query => expected results
["bergère" => 0 ],
["berg*" => 0 ],
["foobar" ],
["moutons" => 1, 3 ],
['"qui gardait"' => 1 ],
["moutons NOT lait" => 1 ],
["il était" => 0 ],
["(il OR elle) AND un*" => 0, 2 ],
);
BEGIN {
if ($] < 5.008005) {
plan skip_all => 'Unicode is not supported before 5.8.5';
}
}
use Test::NoWarnings;
plan tests => 2 * (1 + @tests) + 1;
BEGIN {
# Sadly perl for windows (and probably sqlite, too) may hang
# if the system locale doesn't support european languages.
# en-us should be a safe default. if it doesn't work, use 'C'.
if ( $^O eq 'MSWin32') {
use POSIX 'locale_h';
setlocale(LC_COLLATE, 'en-us');
}
}
use locale;
sub locale_tokenizer { # see also: Search::Tokenizer
return sub {
my $string = shift;
my $regex = qr/\w+/;
my $term_index = 0;
return sub {
$string =~ /$regex/g or return; # either match, or no more token
my ($start, $end) = ($-[0], $+[0]);
my $term = substr($string, $start, my $len = $end-$start);
return ($term, $len, $start, $end, $term_index++);
};
};
}
use DBD::SQLite;
for my $use_unicode (0, 1) {
# connect
my $dbh = connect_ok( RaiseError => 1, sqlite_unicode => $use_unicode );
# create fts3 table
use Search::Tokenizer;
$dbh->do(<<"") or die DBI::errstr;
CREATE VIRTUAL TABLE try_fts3
USING fts3(content, tokenize=perl 'main::locale_tokenizer')
# populate it
my $insert_sth = $dbh->prepare(<<"") or die DBI::errstr;
INSERT INTO try_fts3(content) VALUES(?)
my @doc_ids;
for (my $i = 0; $i < @texts; $i++) {
$insert_sth->execute($texts[$i]);
$doc_ids[$i] = $dbh->last_insert_id("", "", "", "");
}
# queries
my $sql = "SELECT docid FROM try_fts3 WHERE content MATCH ?";
for my $t (@tests) {
my ($query, @expected) = @$t;
@expected = map {$doc_ids[$_]} @expected;
my $results = $dbh->selectcol_arrayref($sql, undef, $query);
is_deeply($results, \@expected, "$query (unicode is $use_unicode)");
}
}

t/44_fts3_transitional.t (new file)

@@ -0,0 +1,34 @@
#!/usr/bin/perl
use strict;
BEGIN {
$| = 1;
$^W = 1;
}
use Test::More;
use Test::NoWarnings;
my @tests = (
['foo bar' => 'foo bar' ],
['foo -bar' => 'foo (NOT bar)' ],
['foo* -bar*' => 'foo* (NOT bar*)' ],
['foo bar OR bie buz' => 'foo (bar OR bie) buz' ],
['-foo bar OR -bie buz' => '(NOT foo) (bar OR NOT bie) buz'],
['"kyrie eleison" OR "christe eleison"'
=> '("kyrie eleison" OR "christe eleison")'],
);
plan tests => 1 + @tests;
use DBD::SQLite::FTS3Transitional qw/fts3_convert/;
foreach my $t (@tests) {
my ($old_syntax, $expected_new) = @$t;
my $new = fts3_convert($old_syntax);
$new =~ s/^\s+//;
is($new, $expected_new, $old_syntax);
}