From d43cf63ad08b1a98c5e50d21dae1411bb7d8a237 Mon Sep 17 00:00:00 2001
From: Laurent Dami <dami@cpan.org>
Date: Tue, 13 Jul 2010 06:28:21 +0000
Subject: [PATCH] Added support for FTS3 fulltext searches: Perl tokenizers,
 documentation and tests

---
 Changes                            |    5 +
 Makefile.PL                        |   10 +-
 SQLite.xs                          |   16 ++
 SQLiteXS.h                         |    1 +
 dbdimp.c                           |  302 +++++++++++++++++++++++++++++
 dbdimp.h                           |    2 +
 fts3_tokenizer.h                   |  154 +++++++++++++++
 lib/DBD/SQLite.pm                  |  235 +++++++++++++++++++++-
 lib/DBD/SQLite/Cookbook.pod        |   33 ++++
 lib/DBD/SQLite/FTS3Transitional.pm |   96 +++++++++
 t/43_fts3.t                        |  103 ++++++++++
 t/44_fts3_transitional.t           |   34 ++++
 12 files changed, 988 insertions(+), 3 deletions(-)
 create mode 100644 fts3_tokenizer.h
 create mode 100644 lib/DBD/SQLite/FTS3Transitional.pm
 create mode 100644 t/43_fts3.t
 create mode 100644 t/44_fts3_transitional.t

diff --git a/Changes b/Changes
index 1eecc0f..c7d5e04 100644
--- a/Changes
+++ b/Changes
@@ -1,6 +1,11 @@
 Changes for Perl extension DBD-SQLite

 1.30_04 to be released
+        - Added support for FTS3 tokenizers written in Perl. Added tests
+          and documentation on how to use FTS3. Changed the compilation flag
+          to use the recommended -DSQLITE_ENABLE_FTS3_PARENTHESIS
+          *** MAY POSSIBLY BREAK OLD APPLICATIONS THAT ALREADY USED FTS3 ***
+          (DAMI)
         - Fixed various backward compatibility issues back to SQLite 3.6.1
           (ISHIGAKI)
         - Resolved #58332: Documentation error for preventing fsync

diff --git a/Makefile.PL b/Makefile.PL
index f7ec149..7049333 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -212,8 +212,14 @@ if ( $sqlite_inc ) {
 my @CC_DEFINE = (
 	# '-DSQLITE_CORE',
 	'-DSQLITE_ENABLE_FTS3',
-	# Disabled until we have a test for this
-	# '-DSQLITE_ENABLE_FTS3_PARENTHESIS', # for sqlite >= 3.6.10
+
+	# L. Dami 10.07.2010: now enabling the new FTS3 query syntax, because
+	# that is the recommendation from SQLite for new applications
+	# (this used to be "Disabled until we have a test for this").
+	# This change MAY BREAK OLD APPLICATIONS THAT ALREADY USED FTS3,
+	# but sooner or later the change had to be made.
+ '-DSQLITE_ENABLE_FTS3_PARENTHESIS', # for sqlite >= 3.6.10 + '-DSQLITE_ENABLE_COLUMN_METADATA', '-DNDEBUG=1', ); diff --git a/SQLite.xs b/SQLite.xs index 665849c..8e30484 100644 --- a/SQLite.xs +++ b/SQLite.xs @@ -196,6 +196,22 @@ backup_to_file(dbh, filename) OUTPUT: RETVAL + + + + +static int +register_fts3_perl_tokenizer(dbh) + SV *dbh + ALIAS: + DBD::SQLite::db::sqlite_register_fts3_perl_tokenizer = 1 + CODE: + RETVAL = sqlite_db_register_fts3_perl_tokenizer(aTHX_ dbh); + OUTPUT: + RETVAL + + + MODULE = DBD::SQLite PACKAGE = DBD::SQLite::st PROTOTYPES: DISABLE diff --git a/SQLiteXS.h b/SQLiteXS.h index 6b6a4c5..584fb61 100644 --- a/SQLiteXS.h +++ b/SQLiteXS.h @@ -19,5 +19,6 @@ #include #include "sqlite3.h" +#include "fts3_tokenizer.h" #endif diff --git a/dbdimp.c b/dbdimp.c index c63892a..a2ac8d3 100644 --- a/dbdimp.c +++ b/dbdimp.c @@ -20,6 +20,14 @@ DBISTATE_DECLARE; #define croak_if_stmt_is_null() #endif + +/*-----------------------------------------------------* + * Globals + *-----------------------------------------------------*/ +imp_dbh_t *last_executed_dbh; /* needed by perl_tokenizer + to know if unicode is on/off */ + + /*-----------------------------------------------------* * Helper Methods *-----------------------------------------------------*/ @@ -487,6 +495,298 @@ sqlite_db_last_insert_id(SV *dbh, imp_dbh_t *imp_dbh, SV *catalog, SV *schema, S return newSViv((IV)sqlite3_last_insert_rowid(imp_dbh->db)); } +/* ====================================================================== + * EXPERIMENTAL bindings for FTS3 TOKENIZERS + * ====================================================================== */ + +typedef struct perl_tokenizer { + sqlite3_tokenizer base; + SV *coderef; /* the perl tokenizer is a coderef that takes + a string and returns a cursor coderef */ +} perl_tokenizer; + +typedef struct perl_tokenizer_cursor { + sqlite3_tokenizer_cursor base; + SV *coderef; /* ref to the closure that returns terms */ + char *pToken; /* storage for a copy of the last token */ + int nTokenAllocated; /* space allocated to pToken buffer */ + + /* members below are only used if the input string is in utf8 */ + const char *pInput; /* input we are tokenizing */ + const char *lastByteOffset; /* offset into pInput */ + int lastCharOffset; /* char offset corresponding to lastByteOffset */ + +} perl_tokenizer_cursor; + + +/* +** Create a new tokenizer instance. +** Will be called whenever a FTS3 table is created with +** CREATE .. USING fts3( ... 
, tokenize=perl 'qualified::function::name')
+** where qualified::function::name is a fully qualified Perl function name
+*/
+static int perl_tokenizer_Create(
+    int argc, const char * const *argv,
+    sqlite3_tokenizer **ppTokenizer
+){
+    dTHX;
+    dSP;
+    int n_retval;
+    SV *retval;
+    perl_tokenizer *t;
+
+    if( argc==0 ) return SQLITE_ERROR; /* need the perl function name */
+
+    t = (perl_tokenizer *) sqlite3_malloc(sizeof(*t));
+    if( t==NULL ) return SQLITE_NOMEM;
+    memset(t, 0, sizeof(*t));
+
+    ENTER;
+    SAVETMPS;
+
+    /* call the qualified::function::name */
+    PUSHMARK(SP);
+    PUTBACK;
+    n_retval = call_pv(argv[0], G_SCALAR);
+    SPAGAIN;
+
+    /* store a copy of the returned coderef into the tokenizer structure */
+    if (n_retval != 1) {
+        warn("tokenizer_Create returned %d arguments", n_retval);
+    }
+    retval = POPs;
+    t->coderef   = newSVsv(retval);
+    *ppTokenizer = &t->base;
+
+    PUTBACK;
+    FREETMPS;
+    LEAVE;
+
+    return SQLITE_OK;
+}
+
+
+/*
+** Destroy a tokenizer
+*/
+static int perl_tokenizer_Destroy(sqlite3_tokenizer *pTokenizer){
+    dTHX;
+    perl_tokenizer *t = (perl_tokenizer *) pTokenizer;
+    sv_free(t->coderef);
+    sqlite3_free(t);
+    return SQLITE_OK;
+}
+
+
+/*
+** Prepare to begin tokenizing a particular string. The input string
+** to be tokenized is pInput[0..nBytes-1] ... except that fts3
+** actually passes nBytes == -1, in which case we fall back to
+** strlen(pInput). The string is passed to the tokenizer instance,
+** which returns a closure implementing the cursor (so the cursor is
+** again a coderef).
+*/
+static int perl_tokenizer_Open(
+    sqlite3_tokenizer *pTokenizer,       /* Tokenizer object */
+    const char *pInput, int nBytes,      /* Input buffer */
+    sqlite3_tokenizer_cursor **ppCursor  /* OUT: Created tokenizer cursor */
+){
+    dTHX;
+    dSP;
+    int n_retval;
+    SV *perl_string;
+    U32 flags = SVs_TEMP;  /* flags for the Perl SV holding the input
+                              string; SVs_TEMP will call sv_2mortal */
+    perl_tokenizer *t = (perl_tokenizer *)pTokenizer;
+    perl_tokenizer_cursor *c;
+
+    /* allocate and initialize the cursor struct */
+    c = (perl_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
+    if( c==NULL ) return SQLITE_NOMEM;
+    memset(c, 0, sizeof(*c));
+    *ppCursor = &c->base;
+
+    /* special handling if working with utf8 strings */
+    if (last_executed_dbh->unicode) { /* global variable; there is no
+                                         cleaner way to get at this */
+
+        /* data to keep track of byte offsets */
+        c->lastByteOffset = c->pInput = pInput;
+        c->lastCharOffset = 0;
+
+        /* the string passed to Perl must be flagged as utf8 */
+        flags |= SVf_UTF8;
+    }
+
+    ENTER;
+    SAVETMPS;
+
+    /* build a Perl copy of the input string */
+    if (nBytes < 0) { /* fts3 passes -1; fall back to strlen() */
+        nBytes = strlen(pInput);
+    }
+    perl_string = newSVpvn_flags(pInput, nBytes, flags);
+
+    /* call the tokenizer coderef */
+    PUSHMARK(SP);
+    XPUSHs(perl_string);
+    PUTBACK;
+    n_retval = call_sv(t->coderef, G_SCALAR);
+    SPAGAIN;
+
+    /* store the cursor coderef returned by the tokenizer */
+    if (n_retval != 1) {
+        warn("tokenizer returned %d arguments", n_retval);
+    }
+    c->coderef = newSVsv(POPs);
+
+    PUTBACK;
+    FREETMPS;
+    LEAVE;
+    return SQLITE_OK;
+}
+
+/*
+** Close a tokenization cursor previously opened by a call to
+** perl_tokenizer_Open() above.
+*/
+static int perl_tokenizer_Close(sqlite3_tokenizer_cursor *pCursor){
+    perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor;
+
+    dTHX;
+    sv_free(c->coderef);
+    sqlite3_free(c->pToken); /* free the token buffer as well
+                                (sqlite3_free is a no-op on NULL) */
+    sqlite3_free(c);
+    return SQLITE_OK;
+}
+
+/*
+** Extract the next token from a tokenization cursor. The cursor must
+** have been opened by a prior call to perl_tokenizer_Open().
+*/
+static int perl_tokenizer_Next(
+    sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by perl_tokenizer_Open */
+    const char **ppToken,               /* OUT: *ppToken is the token text */
+    int *pnBytes,                       /* OUT: Number of bytes in token */
+    int *piStartOffset,                 /* OUT: Starting offset of token */
+    int *piEndOffset,                   /* OUT: Ending offset of token */
+    int *piPosition                     /* OUT: Position integer of token */
+){
+    perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor;
+    int result;
+    int n_retval;
+    char *token;
+
+    dTHX;
+    dSP;
+
+    ENTER;
+    SAVETMPS;
+
+    /* call the cursor */
+    PUSHMARK(SP);
+    PUTBACK;
+    n_retval = call_sv(c->coderef, G_ARRAY);
+    SPAGAIN;
+
+    /* an empty list means there are no more tokens */
+    if (n_retval == 0) {
+        result = SQLITE_DONE;
+    }
+    /* otherwise, get the token details from the returned list */
+    else {
+        if (n_retval != 5) {
+            warn("tokenizer cursor returned %d arguments", n_retval);
+        }
+        *piPosition    = POPi;
+        *piEndOffset   = POPi;
+        *piStartOffset = POPi;
+        *pnBytes       = POPi;
+        token          = POPpx;
+
+        if (c->pInput) { /* if working with utf8 data */
+            I32 hop;
+            char *byteOffset;
+
+            /* recompute *pnBytes in bytes, not in chars */
+            *pnBytes = strlen(token);
+
+            /* recompute start/end offsets in bytes, not in chars */
+            hop            = *piStartOffset - c->lastCharOffset;
+            byteOffset     = (char *)utf8_hop((U8 *)c->lastByteOffset, hop);
+            hop            = *piEndOffset - *piStartOffset;
+            *piStartOffset = byteOffset - c->pInput;
+            byteOffset     = (char *)utf8_hop((U8 *)byteOffset, hop);
+            *piEndOffset   = byteOffset - c->pInput;
+
+            /* remember where we are for the next round */
+            c->lastCharOffset = *piEndOffset;
+            c->lastByteOffset = byteOffset;
+        }
+
+        /* make sure we have enough storage for copying the token */
+        if (*pnBytes > c->nTokenAllocated) {
+            char *pNew;
+            c->nTokenAllocated = *pnBytes + 20;
+            pNew = sqlite3_realloc(c->pToken, c->nTokenAllocated);
+            if( !pNew ) return SQLITE_NOMEM;
+            c->pToken = pNew;
+        }
+
+        /* the token must be copied into the C cursor before perl
+           frees that memory */
+        memcpy(c->pToken, token, *pnBytes);
+        *ppToken = c->pToken;
+
+        result = SQLITE_OK;
+    }
+
+    PUTBACK;
+    FREETMPS;
+    LEAVE;
+
+    return result;
+}
+
+
+/*
+** The set of routines that implement the perl tokenizer
+*/
+sqlite3_tokenizer_module perl_tokenizer_Module = {
+    0,
+    perl_tokenizer_Create,
+    perl_tokenizer_Destroy,
+    perl_tokenizer_Open,
+    perl_tokenizer_Close,
+    perl_tokenizer_Next
+};
+
+
+/*
+** Register the perl tokenizer with FTS3
+*/
+int sqlite_db_register_fts3_perl_tokenizer(pTHX_ SV *dbh)
+{
+    D_imp_dbh(dbh);
+
+    int rc;
+    sqlite3_stmt *pStmt;
+    const char zSql[] = "SELECT fts3_tokenizer(?, ?)";
+    sqlite3_tokenizer_module *p = &perl_tokenizer_Module;
+
+    rc = sqlite3_prepare_v2(imp_dbh->db, zSql, -1, &pStmt, 0);
+    if( rc!=SQLITE_OK ){
+        return rc;
+    }
+
+    sqlite3_bind_text(pStmt, 1, "perl", -1, SQLITE_STATIC);
+    sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
+    sqlite3_step(pStmt);
+
+    return sqlite3_finalize(pStmt);
+}
+
+
+/* ======================================================================
+ * END OF EXPERIMENTAL bindings for FTS3 TOKENIZERS
+ * ====================================================================== */
+
+
 int
 sqlite_st_prepare(SV *sth, imp_sth_t *imp_sth, char *statement, SV *attribs)
 {
@@ -566,6 +866,8 @@ sqlite_st_execute(SV *sth, imp_sth_t *imp_sth)
     croak_if_db_is_null();
     croak_if_stmt_is_null();
 
+    last_executed_dbh = imp_dbh;
+
     /* COMPAT: sqlite3_sql is only available for 3006000 or newer */
     sqlite_trace(sth, imp_sth, 3, form("executing %s", sqlite3_sql(imp_sth->stmt)));
 
diff --git a/dbdimp.h b/dbdimp.h
index 63f896c..16e20b7 100644 --- a/dbdimp.h +++ b/dbdimp.h @@ -100,6 +100,8 @@ SV* sqlite_db_update_hook( pTHX_ SV *dbh, SV *hook ); int sqlite_db_set_authorizer( pTHX_ SV *dbh, SV *authorizer ); AV* sqlite_compile_options(); +int sqlite_db_register_fts3_perl_tokenizer(pTHX_ SV *dbh); + #ifdef SvUTF8_on static SV * diff --git a/fts3_tokenizer.h b/fts3_tokenizer.h new file mode 100644 index 0000000..36191f1 --- /dev/null +++ b/fts3_tokenizer.h @@ -0,0 +1,154 @@ +/************** Begin file fts3_tokenizer.h **********************************/ +/* +** 2006 July 10 +** +** The author disclaims copyright to this source code. +** +************************************************************************* +** Defines the interface to tokenizers used by fulltext-search. There +** are three basic components: +** +** sqlite3_tokenizer_module is a singleton defining the tokenizer +** interface functions. This is essentially the class structure for +** tokenizers. +** +** sqlite3_tokenizer is used to define a particular tokenizer, perhaps +** including customization information defined at creation time. +** +** sqlite3_tokenizer_cursor is generated by a tokenizer to generate +** tokens from a particular input. +*/ +#ifndef _FTS3_TOKENIZER_H_ +#define _FTS3_TOKENIZER_H_ + +/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time. +** If tokenizers are to be allowed to call sqlite3_*() functions, then +** we will need a way to register the API consistently. +*/ + +/* +** Structures used by the tokenizer interface. When a new tokenizer +** implementation is registered, the caller provides a pointer to +** an sqlite3_tokenizer_module containing pointers to the callback +** functions that make up an implementation. +** +** When an fts3 table is created, it passes any arguments passed to +** the tokenizer clause of the CREATE VIRTUAL TABLE statement to the +** sqlite3_tokenizer_module.xCreate() function of the requested tokenizer +** implementation. The xCreate() function in turn returns an +** sqlite3_tokenizer structure representing the specific tokenizer to +** be used for the fts3 table (customized by the tokenizer clause arguments). +** +** To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen() +** method is called. It returns an sqlite3_tokenizer_cursor object +** that may be used to tokenize a specific input buffer based on +** the tokenization rules supplied by a specific sqlite3_tokenizer +** object. +*/ +typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module; +typedef struct sqlite3_tokenizer sqlite3_tokenizer; +typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor; + +struct sqlite3_tokenizer_module { + + /* + ** Structure version. Should always be set to 0. + */ + int iVersion; + + /* + ** Create a new tokenizer. The values in the argv[] array are the + ** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL + ** TABLE statement that created the fts3 table. For example, if + ** the following SQL is executed: + ** + ** CREATE .. USING fts3( ... , tokenizer arg1 arg2) + ** + ** then argc is set to 2, and the argv[] array contains pointers + ** to the strings "arg1" and "arg2". + ** + ** This method should return either SQLITE_OK (0), or an SQLite error + ** code. If SQLITE_OK is returned, then *ppTokenizer should be set + ** to point at the newly created tokenizer structure. The generic + ** sqlite3_tokenizer.pModule variable should not be initialised by + ** this callback. The caller will do so. 
+ */ + int (*xCreate)( + int argc, /* Size of argv array */ + const char *const*argv, /* Tokenizer argument strings */ + sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */ + ); + + /* + ** Destroy an existing tokenizer. The fts3 module calls this method + ** exactly once for each successful call to xCreate(). + */ + int (*xDestroy)(sqlite3_tokenizer *pTokenizer); + + /* + ** Create a tokenizer cursor to tokenize an input buffer. The caller + ** is responsible for ensuring that the input buffer remains valid + ** until the cursor is closed (using the xClose() method). + */ + int (*xOpen)( + sqlite3_tokenizer *pTokenizer, /* Tokenizer object */ + const char *pInput, int nBytes, /* Input buffer */ + sqlite3_tokenizer_cursor **ppCursor /* OUT: Created tokenizer cursor */ + ); + + /* + ** Destroy an existing tokenizer cursor. The fts3 module calls this + ** method exactly once for each successful call to xOpen(). + */ + int (*xClose)(sqlite3_tokenizer_cursor *pCursor); + + /* + ** Retrieve the next token from the tokenizer cursor pCursor. This + ** method should either return SQLITE_OK and set the values of the + ** "OUT" variables identified below, or SQLITE_DONE to indicate that + ** the end of the buffer has been reached, or an SQLite error code. + ** + ** *ppToken should be set to point at a buffer containing the + ** normalized version of the token (i.e. after any case-folding and/or + ** stemming has been performed). *pnBytes should be set to the length + ** of this buffer in bytes. The input text that generated the token is + ** identified by the byte offsets returned in *piStartOffset and + ** *piEndOffset. *piStartOffset should be set to the index of the first + ** byte of the token in the input buffer. *piEndOffset should be set + ** to the index of the first byte just past the end of the token in + ** the input buffer. + ** + ** The buffer *ppToken is set to point at is managed by the tokenizer + ** implementation. It is only required to be valid until the next call + ** to xNext() or xClose(). + */ + /* TODO(shess) current implementation requires pInput to be + ** nul-terminated. This should either be fixed, or pInput/nBytes + ** should be converted to zInput. + */ + int (*xNext)( + sqlite3_tokenizer_cursor *pCursor, /* Tokenizer cursor */ + const char **ppToken, int *pnBytes, /* OUT: Normalized text for token */ + int *piStartOffset, /* OUT: Byte offset of token in input buffer */ + int *piEndOffset, /* OUT: Byte offset of end of token in input buffer */ + int *piPosition /* OUT: Number of tokens returned before this one */ + ); +}; + +struct sqlite3_tokenizer { + const sqlite3_tokenizer_module *pModule; /* The module for this tokenizer */ + /* Tokenizer implementations will typically add additional fields */ +}; + +struct sqlite3_tokenizer_cursor { + sqlite3_tokenizer *pTokenizer; /* Tokenizer for this cursor. 
*/
+  /* Tokenizer implementations will typically add additional fields */
+};
+
+int fts3_global_term_cnt(int iTerm, int iCol);
+int fts3_term_cnt(int iTerm, int iCol);
+
+
+#endif /* _FTS3_TOKENIZER_H_ */
+
+/************** End of fts3_tokenizer.h **************************************/
diff --git a/lib/DBD/SQLite.pm b/lib/DBD/SQLite.pm
index ee729a7..6c0e9e4 100644
--- a/lib/DBD/SQLite.pm
+++ b/lib/DBD/SQLite.pm
@@ -55,6 +55,7 @@ sub driver {
         DBD::SQLite::db->install_method('sqlite_backup_from_file');
         DBD::SQLite::db->install_method('sqlite_backup_to_file');
         DBD::SQLite::db->install_method('sqlite_enable_load_extension');
+        DBD::SQLite::db->install_method('sqlite_register_fts3_perl_tokenizer');
         $methods_are_installed++;
     }
@@ -71,6 +72,7 @@ sub CLONE {
     undef $drh;
 }
 
+
 package DBD::SQLite::dr;
 
 sub connect {
@@ -120,13 +122,16 @@ sub connect {
     # Hand off to the actual login function
     DBD::SQLite::db::_login($dbh, $real, $user, $auth, $attr) or return undef;
 
-    # Register the on-demand collation installer and REGEXP function
+    # Register the on-demand collation installer, REGEXP function and
+    # perl tokenizer
     if ( DBD::SQLite::NEWAPI ) {
         $dbh->sqlite_collation_needed( \&install_collation );
         $dbh->sqlite_create_function( "REGEXP", 2, \&regexp );
+        $dbh->sqlite_register_fts3_perl_tokenizer();
     } else {
         $dbh->func( \&install_collation, "collation_needed" );
         $dbh->func( "REGEXP", 2, \&regexp, "create_function" );
+        $dbh->func( "register_fts3_perl_tokenizer" );
     }
 
     # HACK: Since PrintWarn = 0 doesn't seem to actually prevent warnings
@@ -1645,6 +1650,234 @@ I for collations. In other words, if you want to change the
 behaviour of a collation within an existing C<$dbh>, you need to
 call the L</create_collation> method directly.
 
+=head1 FULLTEXT SEARCH
+
+The FTS3 extension module within SQLite allows users to create special
+tables with a built-in full-text index (hereafter "FTS3 tables"). The
+full-text index allows the user to efficiently query the database for
+all rows that contain one or more instances of a specified word (hereafter
+a "token"), even if the table contains many large documents.
+
+
+=head2 Short introduction to FTS3
+
+The detailed documentation for FTS3 can be found
+at L<http://www.sqlite.org/fts3.html>. Here is a very short example:
+
+  $dbh->do(<<"") or die DBI::errstr;
+  CREATE VIRTUAL TABLE fts_example USING fts3(content)
+
+  my $sth = $dbh->prepare("INSERT INTO fts_example(content) VALUES (?)");
+  $sth->execute($_) foreach @docs_to_insert;
+
+  my $results = $dbh->selectall_arrayref(<<"");
+  SELECT docid, snippet(fts_example) FROM fts_example WHERE content MATCH 'foo'
+
+
+The key points in this example are:
+
+=over
+
+=item *
+
+The syntax for creating FTS3 tables is
+
+  CREATE VIRTUAL TABLE <table_name> USING fts3(<columns>)
+
+where C<< <columns> >> is a list of column names. Columns may be
+typed, but the type information is ignored. If no columns
+are specified, the default is a single column named C<content>.
+In addition, FTS3 tables have an implicit column called C<docid>
+(or also C<rowid>) for numbering the stored documents.
+
+=item *
+
+Statements for inserting, updating or deleting records
+use the same syntax as for regular SQLite tables.
+
+=item *
+
+Full-text searches are specified with the C<MATCH> operator and an
+operand, which may be a single word, a word prefix ending with '*', a
+list of words, a "phrase query" in double quotes, or a boolean combination
+of the above.
+
+=item *
+
+The builtin function C<snippet(...)> builds a formatted excerpt of the
+document text, where the words pertaining to the query are highlighted.
+
+=back
+
+There are many more details to building and searching
+FTS3 tables, so we strongly invite you to read
+the full documentation at L<http://www.sqlite.org/fts3.html>.
+
+B<Incompatible change>:
+starting from version 1.31, C<DBD::SQLite> uses the new, recommended
+"Enhanced Query Syntax" for binary set operators (AND, OR, NOT, possibly
+nested with parentheses). Previous versions of C<DBD::SQLite> used the
+"Standard Query Syntax" (see L<http://www.sqlite.org/fts3.html>).
+Unfortunately this is a compile-time switch, so it cannot be changed
+at runtime; however, since FTS3 was never advertised in versions prior
+to 1.31, the change should be invisible to the vast majority of
+C<DBD::SQLite> users. Applications that nevertheless were built using
+the "Standard Query" syntax have to be migrated; the conversion function
+provided in L<DBD::SQLite::FTS3Transitional> is there to help.
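+
+For a quick feel of the difference, here is how a few queries written
+in the old "Standard" syntax come out after conversion (a sketch; these
+pairs mirror the test cases in F<t/44_fts3_transitional.t> below, and
+the returned strings carry a leading space):
+
+  use DBD::SQLite::FTS3Transitional qw/fts3_convert/;
+  fts3_convert('foo -bar');           # ' foo (NOT bar)'
+  fts3_convert('foo bar OR bie buz'); # ' foo (bar OR bie) buz'
+  fts3_convert('"kyrie eleison" OR "christe eleison"');
+                          # ' ("kyrie eleison" OR "christe eleison")'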
+
+
+=head2 Tokenizers
+
+The behaviour of full-text indexes strongly depends on how
+documents are split into I<tokens>; therefore FTS3 table
+declarations can explicitly specify how to perform
+tokenization:
+
+  CREATE ... USING fts3(<columns>, tokenize=<tokenizer>)
+
+where C<< <tokenizer> >> is a sequence of space-separated
+words that triggers a specific tokenizer, as explained below.
+
+=head3 SQLite builtin tokenizers
+
+SQLite comes with three builtin tokenizers:
+
+=over
+
+=item simple
+
+Under the I<simple> tokenizer, a term is a contiguous sequence of
+eligible characters, where eligible characters are all alphanumeric
+characters, the "_" character, and all characters with UTF codepoints
+greater than or equal to 128. All other characters are discarded when
+splitting a document into terms. They serve only to separate adjacent
+terms.
+
+All uppercase characters within the ASCII range (UTF codepoints less
+than 128) are transformed to their lowercase equivalents as part of
+the tokenization process. Thus, full-text queries are case-insensitive
+when using the simple tokenizer.
+
+=item porter
+
+The I<porter> tokenizer uses the same rules to separate the input
+document into terms, but as well as folding all terms to lower case it
+uses the Porter Stemming algorithm to reduce related English language
+words to a common root.
+
+=item icu
+
+If SQLite is compiled with the SQLITE_ENABLE_ICU
+pre-processor symbol defined, then there exists a built-in tokenizer
+named "icu" implemented using the ICU library, and taking an
+ICU locale identifier as argument (such as "tr_TR" for
+Turkish as used in Turkey, or "en_AU" for English as used in
+Australia). For example:
+
+  CREATE VIRTUAL TABLE thai_text USING fts3(text, tokenize=icu th_TH)
+
+The ICU tokenizer implementation is very simple. It splits the input
+text according to the ICU rules for finding word boundaries and
+discards any tokens that consist entirely of white-space. This may be
+suitable for some applications in some locales, but not all. If more
+complex processing is required, for example to implement stemming or
+discard punctuation, use the perl tokenizer as explained below.
+
+=back
+
+=head3 Perl tokenizers
+
+In addition to the builtin SQLite tokenizers, C<DBD::SQLite>
+implements a I<perl> tokenizer that can hook into any tokenizing
+algorithm written in Perl. This is specified as follows:
+
+  CREATE ... USING fts3(<columns>, tokenize=perl '<perl_function>')
+
+where C<< <perl_function> >> is a fully qualified Perl function name
+(i.e. prefixed by the name of the package in which that function is
+declared). So for example if the function is C<my_func> in the main
+program, write
+
+  CREATE ... USING fts3(<columns>, tokenize=perl 'main::my_func')
+
+That function should return a code reference that takes a string as
+single argument, and returns an iterator (another function), which
+returns a tuple C<< ($term, $len, $start, $end, $index) >> for each
+term. Here is a simple example that tokenizes on words according to
+the current Perl locale:
+
+  sub locale_tokenizer {
+    return sub {
+      my $string = shift;
+
+      use locale;
+      my $regex      = qr/\w+/;
+      my $term_index = 0;
+
+      return sub { # closure
+        $string =~ /$regex/g or return; # either match, or no more token
+        my ($start, $end) = ($-[0], $+[0]);
+        my $len  = $end - $start;
+        my $term = substr($string, $start, $len);
+        return ($term, $len, $start, $end, $term_index++);
+      }
+    };
+  }
+
+There must be three levels of subs, in a kind of "Russian dolls" structure,
+because:
+
+=over
+
+=item *
+
+the external, named sub is called whenever accessing an FTS3 table
+with that tokenizer;
+
+=item *
+
+the inner, anonymous sub is called whenever a new string
+needs to be tokenized (either for inserting new text into the table,
+or for analyzing a query);
+
+=item *
+
+the innermost, anonymous sub is called repeatedly for retrieving
+all terms within that string. An example of driving these three
+levels by hand is shown below.
+
+=back
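+
+To make this protocol concrete, here is a short sketch (not part of the
+module itself; it assumes the C<locale_tokenizer> example above) that
+drives the three levels by hand, outside of any database:
+
+  my $tokenizer = locale_tokenizer();           # level 1: the named sub
+  my $cursor    = $tokenizer->("Hello world");  # level 2: one string
+  while ( my ($term, $len, $start, $end, $pos) = $cursor->() ) {
+    # level 3: one ($term, $len, $start, $end, $index) tuple per term
+    print "$pos: $term ($start..$end)\n";
+  }
+  # prints "0: Hello (0..5)" then "1: world (6..11)"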
+
+Instead of writing tokenizers by hand, you can grab one of those
+already implemented in the L<Search::Tokenizer> module:
+
+  use Search::Tokenizer;
+  $dbh->do(<<"") or die DBI::errstr;
+  CREATE ... USING fts3(<columns>,
+                        tokenize=perl 'Search::Tokenizer::unaccent')
+
+or you can use L<Search::Tokenizer/new> to build
+your own tokenizer.
+
+
+=head2 Incomplete handling of utf8 characters
+
+The current FTS3 implementation in SQLite is far from complete with
+respect to utf8 handling: in particular, variable-length characters
+are not treated correctly by the builtin functions
+C<offsets()> and C<snippet()>.
+
+=head2 Database space for FTS3
+
+FTS3 stores a complete copy of the indexed documents, together with
+the fulltext index. On a large collection of documents, this can
+consume quite a lot of disk space. If copies of the documents are also
+available as external resources (for example files on the filesystem),
+that space can sometimes be spared; see the tip in the
+L<DBD::SQLite::Cookbook>.
+
+
 =head1 FOR DBD::SQLITE EXTENSION AUTHORS
 
 Since 1.30_01, you can retrieve the bundled sqlite C source and/or
diff --git a/lib/DBD/SQLite/Cookbook.pod b/lib/DBD/SQLite/Cookbook.pod
index b5f9f34..9d107a2 100644
--- a/lib/DBD/SQLite/Cookbook.pod
+++ b/lib/DBD/SQLite/Cookbook.pod
@@ -9,6 +9,8 @@
 This is the L<DBD::SQLite> cookbook. It is intended to provide a
 place to keep a variety of functions and formulas for use in callback
 APIs in L<DBD::SQLite>.
 
+=head1 AGGREGATE FUNCTIONS
+
 =head2 Variance
 
 This is a simple aggregate function which returns a variance. It is
@@ -140,6 +142,35 @@ The function can then be used as:
   FROM results
   GROUP BY group_name;
 
+=head1 FTS3 fulltext indexing
+
+=head2 Sparing database disk space
+
+As explained in L<DBD::SQLite/FULLTEXT SEARCH>, each
+FTS3 table C<I<t>> is stored internally within three regular tables
+C<I<t>_content>, C<I<t>_segments> and C<I<t>_segdir>. The last two
+tables contain the fulltext index. The first table C<I<t>_content>
+stores the complete documents being indexed ... but if copies of the
+same documents are already stored somewhere else, or can be computed
+from external resources (for example as HTML or MS Word files in the
+filesystem), then this is quite a waste of space. SQLite itself only
+needs the C<I<t>_content> table for implementing the C<snippet()> and
+C<offsets()> functions, which are not always usable anyway (in particular
+when using utf8 characters greater than 255).
+
+So an alternative strategy is to use SQLite only for the fulltext
+index and metadata, and to keep the full documents outside of SQLite:
+to do so, after each insert or update in the FTS3 table, perform an
+update in the C<I<t>_content> table, setting the content column(s) to
+NULL. Of course your application will then need an algorithm for finding
+the external resource corresponding to any I<docid> stored within
+SQLite. Furthermore, the SQLite C<snippet()> and C<offsets()> functions
+cannot be used, so if such functionality is needed, it has to be
+programmed directly within the Perl application.
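+
+Here is a minimal sketch of the idea (with a hypothetical FTS3 table
+named C<docs> holding a single C<content> column; the shadow table
+layout is an SQLite implementation detail, so check
+C<PRAGMA table_info(docs_content)> before relying on the column names):
+
+  # index the document, then discard the stored copy
+  $dbh->do("INSERT INTO docs(content) VALUES(?)", undef, $document_text);
+  my $docid = $dbh->last_insert_id("", "", "", "");
+  $dbh->do("UPDATE docs_content SET c0content = NULL WHERE docid = ?",
+           undef, $docid);
+  # ... later, map $docid back to the external file to show the document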
+
+In short, this strategy is really a hack, because FTS3 was not originally
+designed with that behaviour in mind; however it is workable
+and can dramatically reduce the size of the database file.
+
 =head1 SUPPORT
 
 Bugs should be reported via the CPAN bug tracker at
@@ -157,6 +188,8 @@ turn them into a separate CPAN distribution.
 
 Adam Kennedy E<lt>adamk@cpan.orgE<gt>
 
+Laurent Dami E<lt>dami@cpan.orgE<gt>
+
 =head1 COPYRIGHT
 
 Copyright 2009 Adam Kennedy.
diff --git a/lib/DBD/SQLite/FTS3Transitional.pm b/lib/DBD/SQLite/FTS3Transitional.pm
new file mode 100644
index 0000000..aeb0d5e
--- /dev/null
+++ b/lib/DBD/SQLite/FTS3Transitional.pm
@@ -0,0 +1,96 @@
+package DBD::SQLite::FTS3Transitional;
+use strict;
+use warnings;
+no warnings 'uninitialized';
+
+use Exporter 'import';
+our @EXPORT_OK = qw/fts3_convert/;
+
+
+sub fts3_convert {
+  my $in  = shift;
+  my $out = "";
+
+  # decompose the input string into tokens
+  my @tokens = $in =~ /  -        # minus sign
+                       | \bOR\b   # OR keyword
+                       | ".*?"    # phrase query
+                       | \S+      # term
+                       /xg;
+
+  # build the output string
+  while (@tokens) {
+
+    # -a  =>  (NOT a)
+    if ($tokens[0] eq '-') {
+      my (undef, $right) = splice(@tokens, 0, 2);
+      $out .= " (NOT $right)";
+    }
+
+    # a OR b  =>  (a OR b)
+    elsif (@tokens >= 2 && $tokens[1] eq 'OR') {
+      my ($left, undef, $right) = splice(@tokens, 0, 3);
+      if ($right eq '-') {
+        $right = "NOT " . shift @tokens;
+      }
+      $out .= " ($left OR $right)";
+    }
+
+    # plain term
+    else {
+      $out .= " " . shift @tokens;
+    }
+  }
+
+  return $out;
+}
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+DBD::SQLite::FTS3Transitional - helper function for migrating FTS3 applications
+
+=head1 SYNOPSIS
+
+  use DBD::SQLite::FTS3Transitional qw/fts3_convert/;
+  my $new_match_syntax = fts3_convert($old_match_syntax);
+  my $sql = "SELECT ... FROM ... WHERE col MATCH $new_match_syntax";
+
+=head1 DESCRIPTION
+
+Starting from version 1.31, C<DBD::SQLite> uses the new, recommended
+"Enhanced Query Syntax" for binary set operators in fulltext FTS3 queries
+(AND, OR, NOT, possibly nested with parentheses).
+
+Previous versions of C<DBD::SQLite> used the
+"Standard Query Syntax" (see L<http://www.sqlite.org/fts3.html>).
+
+This module helps to convert SQLite applications built with the old
+"Standard" query syntax to the new "Enhanced" syntax.
+
+=head1 FUNCTIONS
+
+=head2 fts3_convert
+
+Takes as input a string for the MATCH clause in an FTS3 fulltext search;
+returns the same clause rewritten in the new "Enhanced" syntax.
+
+=head1 AUTHOR
+
+Laurent Dami E<lt>dami@cpan.orgE<gt>
+
+=head1 COPYRIGHT
+
+Copyright 2010 Laurent Dami.
+
+This program is free software; you can redistribute
+it and/or modify it under the same terms as Perl itself.
+
+The full text of the license can be found in the
+LICENSE file included with this module.
+ +=cut diff --git a/t/43_fts3.t b/t/43_fts3.t new file mode 100644 index 0000000..8c13af2 --- /dev/null +++ b/t/43_fts3.t @@ -0,0 +1,103 @@ +#!/usr/bin/perl + +use strict; +BEGIN { + $| = 1; + $^W = 1; +} + +use t::lib::Test qw/connect_ok/; +use Test::More; + +my @texts = ("il était une bergère", + "qui gardait ses moutons", + "elle fit un fromage", + "du lait de ses moutons"); + +my @tests = ( +# query => expected results + ["bergère" => 0 ], + ["berg*" => 0 ], + ["foobar" ], + ["moutons" => 1, 3 ], + ['"qui gardait"' => 1 ], + ["moutons NOT lait" => 1 ], + ["il était" => 0 ], + ["(il OR elle) AND un*" => 0, 2 ], +); + +BEGIN { + if ($] < 5.008005) { + plan skip_all => 'Unicode is not supported before 5.8.5'; + } +} +use Test::NoWarnings; + +plan tests => 2 * (1 + @tests) + 1; + +BEGIN { + # Sadly perl for windows (and probably sqlite, too) may hang + # if the system locale doesn't support european languages. + # en-us should be a safe default. if it doesn't work, use 'C'. + if ( $^O eq 'MSWin32') { + use POSIX 'locale_h'; + setlocale(LC_COLLATE, 'en-us'); + } +} +use locale; + + +sub locale_tokenizer { # see also: Search::Tokenizer + return sub { + my $string = shift; + + my $regex = qr/\w+/; + my $term_index = 0; + + return sub { + $string =~ /$regex/g or return; # either match, or no more token + my ($start, $end) = ($-[0], $+[0]); + my $term = substr($string, $start, my $len = $end-$start); + return ($term, $len, $start, $end, $term_index++); + }; + }; +} + + + +use DBD::SQLite; + + + +for my $use_unicode (0, 1) { + + # connect + my $dbh = connect_ok( RaiseError => 1, sqlite_unicode => $use_unicode ); + + # create fts3 table + use Search::Tokenizer; + $dbh->do(<<"") or die DBI::errstr; + CREATE VIRTUAL TABLE try_fts3 + USING fts3(content, tokenize=perl 'main::locale_tokenizer') + + # populate it + my $insert_sth = $dbh->prepare(<<"") or die DBI::errstr; + INSERT INTO try_fts3(content) VALUES(?) + + my @doc_ids; + for (my $i = 0; $i < @texts; $i++) { + $insert_sth->execute($texts[$i]); + $doc_ids[$i] = $dbh->last_insert_id("", "", "", ""); + } + + # queries + my $sql = "SELECT docid FROM try_fts3 WHERE content MATCH ?"; + for my $t (@tests) { + my ($query, @expected) = @$t; + @expected = map {$doc_ids[$_]} @expected; + my $results = $dbh->selectcol_arrayref($sql, undef, $query); + is_deeply($results, \@expected, "$query (unicode is $use_unicode)"); + } +} + + diff --git a/t/44_fts3_transitional.t b/t/44_fts3_transitional.t new file mode 100644 index 0000000..c0c8e8d --- /dev/null +++ b/t/44_fts3_transitional.t @@ -0,0 +1,34 @@ +#!/usr/bin/perl + +use strict; +BEGIN { + $| = 1; + $^W = 1; +} + +use Test::More; +use Test::NoWarnings; + +my @tests = ( + ['foo bar' => 'foo bar' ], + ['foo -bar' => 'foo (NOT bar)' ], + ['foo* -bar*' => 'foo* (NOT bar*)' ], + ['foo bar OR bie buz' => 'foo (bar OR bie) buz' ], + ['-foo bar OR -bie buz' => '(NOT foo) (bar OR NOT bie) buz'], + ['"kyrie eleison" OR "christe eleison"' + => '("kyrie eleison" OR "christe eleison")'], + ); + + +plan tests => 1 + @tests; + +use DBD::SQLite::FTS3Transitional qw/fts3_convert/; + +foreach my $t (@tests) { + my ($old_syntax, $expected_new) = @$t; + my $new = fts3_convert($old_syntax); + $new =~ s/^\s+//; + is($new, $expected_new, $old_syntax); +} + +