
Added support for FTS3 fulltext searches: Perl tokenizers, documentation and tests

Laurent Dami 2010-07-13 06:28:21 +00:00
parent 413bd0ac9d
commit d43cf63ad0
12 changed files with 988 additions and 3 deletions


@@ -1,6 +1,11 @@
Changes for Perl extension DBD-SQLite
1.30_04 to be released
- Added support for FTS3 tokenizers written in Perl. Added tests
and documentation on how to use FTS3. Changed compilation flag
to use the recommended -DSQLITE_ENABLE_FTS3_PARENTHESIS
*** MAY POSSIBLY BREAK OLD APPLICATIONS THAT ALREADY USED FTS3 ***
(DAMI)
- Fixed various backward compatibility issues back to SQLite 3.6.1
(ISHIGAKI)
- Resolved #58332: Documentation error for preventing fsync


@@ -212,8 +212,14 @@ if ( $sqlite_inc ) {
my @CC_DEFINE = (
# '-DSQLITE_CORE',
'-DSQLITE_ENABLE_FTS3',
# Disabled until we have a test for this
# '-DSQLITE_ENABLE_FTS3_PARENTHESIS', # for sqlite >= 3.6.10
# L. Dami 10.07.2010: now enabling the new FTS3 syntax, because
# that's the recommendation from SQLite for new applications
# (used to be "Disabled until we have a test for this").
# This change MAY POSSIBLY BREAK OLD APPLICATIONS THAT ALREADY
# USED FTS3 ... but sooner or later that change had to be done!
'-DSQLITE_ENABLE_FTS3_PARENTHESIS', # for sqlite >= 3.6.10
'-DSQLITE_ENABLE_COLUMN_METADATA',
'-DNDEBUG=1',
);
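Because the parenthesis support is a compile-time switch, an application can check at runtime whether its DBD::SQLite build has it. Below is a minimal sketch (an illustration, not part of this commit), assuming SQLite 3.6.23 or newer, where C<PRAGMA compile_options> is available:

  use strict;
  use warnings;
  use DBI;

  # open an in-memory database just to query the build options
  my $dbh  = DBI->connect("dbi:SQLite:dbname=:memory:", "", "",
                          { RaiseError => 1 });
  my $opts = $dbh->selectcol_arrayref("PRAGMA compile_options");

  # compile_options reports flags without the SQLITE_ prefix
  print "enhanced FTS3 query syntax available\n"
    if grep { $_ eq 'ENABLE_FTS3_PARENTHESIS' } @$opts;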


@@ -196,6 +196,22 @@ backup_to_file(dbh, filename)
OUTPUT:
RETVAL
static int
register_fts3_perl_tokenizer(dbh)
SV *dbh
ALIAS:
DBD::SQLite::db::sqlite_register_fts3_perl_tokenizer = 1
CODE:
RETVAL = sqlite_db_register_fts3_perl_tokenizer(aTHX_ dbh);
OUTPUT:
RETVAL
MODULE = DBD::SQLite PACKAGE = DBD::SQLite::st
PROTOTYPES: DISABLE


@@ -19,5 +19,6 @@
#include <dbd_xsh.h>
#include "sqlite3.h"
#include "fts3_tokenizer.h"
#endif

dbdimp.c

@@ -20,6 +20,14 @@ DBISTATE_DECLARE;
#define croak_if_stmt_is_null()
#endif
/*-----------------------------------------------------*
* Globals
*-----------------------------------------------------*/
imp_dbh_t *last_executed_dbh; /* needed by perl_tokenizer
to know if unicode is on/off */
/*-----------------------------------------------------*
* Helper Methods
*-----------------------------------------------------*/
@@ -487,6 +495,298 @@ sqlite_db_last_insert_id(SV *dbh, imp_dbh_t *imp_dbh, SV *catalog, SV *schema, S
return newSViv((IV)sqlite3_last_insert_rowid(imp_dbh->db));
}
/* ======================================================================
* EXPERIMENTAL bindings for FTS3 TOKENIZERS
* ====================================================================== */
typedef struct perl_tokenizer {
sqlite3_tokenizer base;
SV *coderef; /* the perl tokenizer is a coderef that takes
a string and returns a cursor coderef */
} perl_tokenizer;
typedef struct perl_tokenizer_cursor {
sqlite3_tokenizer_cursor base;
SV *coderef; /* ref to the closure that returns terms */
char *pToken; /* storage for a copy of the last token */
int nTokenAllocated; /* space allocated to pToken buffer */
/* members below are only used if the input string is in utf8 */
const char *pInput; /* input we are tokenizing */
const char *lastByteOffset; /* offset into pInput */
int lastCharOffset; /* char offset corresponding to lastByteOffset */
} perl_tokenizer_cursor;
/*
** Create a new tokenizer instance.
** Will be called whenever a FTS3 table is created with
** CREATE .. USING fts3( ... , tokenize=perl qualified::function::name)
** where qualified::function::name is a fully qualified perl function
*/
static int perl_tokenizer_Create(
int argc, const char * const *argv,
sqlite3_tokenizer **ppTokenizer
){
perl_tokenizer *t;
t = (perl_tokenizer *) sqlite3_malloc(sizeof(*t));
if( t==NULL ) return SQLITE_NOMEM;
memset(t, 0, sizeof(*t));
dTHX;
dSP;
ENTER;
SAVETMPS;
/* call the qualified::function::name */
PUSHMARK(SP);
PUTBACK;
int n_retval = call_pv(argv[0], G_SCALAR);
SPAGAIN;
/* store a copy of the returned coderef into the tokenizer structure */
if (n_retval != 1) {
warn("tokenizer_Create returned %d arguments", n_retval);
}
SV *retval = POPs;
t->coderef = newSVsv(retval);
*ppTokenizer = &t->base;
PUTBACK;
FREETMPS;
LEAVE;
return SQLITE_OK;
}
/*
** Destroy a tokenizer
*/
static int perl_tokenizer_Destroy(sqlite3_tokenizer *pTokenizer){
dTHX;
perl_tokenizer *t = (perl_tokenizer *) pTokenizer;
sv_free(t->coderef);
sqlite3_free(t);
return SQLITE_OK;
}
/*
** Prepare to begin tokenizing a particular string. The input
** string to be tokenized is supposed to be pInput[0..nBytes-1] ...
** except that the nBytes passed by fts3 is -1 (don't know why)!
** This is passed to the tokenizer instance, which then returns a
** closure implementing the cursor (so the cursor is again a coderef).
*/
static int perl_tokenizer_Open(
sqlite3_tokenizer *pTokenizer, /* Tokenizer object */
const char *pInput, int nBytes, /* Input buffer */
sqlite3_tokenizer_cursor **ppCursor /* OUT: Created tokenizer cursor */
){
perl_tokenizer *t = (perl_tokenizer *)pTokenizer;
/* allocate and initialize the cursor struct */
perl_tokenizer_cursor *c;
c = (perl_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
if( c==NULL ) return SQLITE_NOMEM;
memset(c, 0, sizeof(*c));
*ppCursor = &c->base;
/* flags for creating the Perl SV containing the input string */
U32 flags = SVs_TEMP; /* will call sv_2mortal */
/* special handling if working with utf8 strings */
if (last_executed_dbh->unicode) { /* global var ... no better way! */
/* data to keep track of byte offsets */
c->lastByteOffset = c->pInput = pInput;
c->lastCharOffset = 0;
/* string passed to Perl needs to be flagged as utf8 */
flags |= SVf_UTF8;
}
dTHX;
dSP;
ENTER;
SAVETMPS;
/* build a Perl copy of the input string */
if (nBytes < 0) { /* we get -1 from fts3; don't know why! */
nBytes = strlen(pInput);
}
SV *perl_string = newSVpvn_flags(pInput, nBytes, flags);
/* call the tokenizer coderef */
PUSHMARK(SP);
XPUSHs(perl_string);
PUTBACK;
int n_retval = call_sv(t->coderef, G_SCALAR);
SPAGAIN;
/* store the cursor coderef returned by the tokenizer */
if (n_retval != 1) {
warn("tokenizer returned %d arguments", n_retval);
}
c->coderef = newSVsv(POPs);
PUTBACK;
FREETMPS;
LEAVE;
return SQLITE_OK;
}
/*
** Close a tokenization cursor previously opened by a call to
** perl_tokenizer_Open() above.
*/
static int perl_tokenizer_Close(sqlite3_tokenizer_cursor *pCursor){
perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor;
dTHX;
sv_free(c->coderef);
sqlite3_free(c);
return SQLITE_OK;
}
/*
** Extract the next token from a tokenization cursor. The cursor must
** have been opened by a prior call to perl_tokenizer_Open().
*/
static int perl_tokenizer_Next(
sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by perl_tokenizer_Open */
const char **ppToken, /* OUT: *ppToken is the token text */
int *pnBytes, /* OUT: Number of bytes in token */
int *piStartOffset, /* OUT: Starting offset of token */
int *piEndOffset, /* OUT: Ending offset of token */
int *piPosition /* OUT: Position integer of token */
){
perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor;
int result;
dTHX;
dSP;
ENTER;
SAVETMPS;
/* call the cursor */
PUSHMARK(SP);
PUTBACK;
int n_retval = call_sv(c->coderef, G_ARRAY);
SPAGAIN;
/* if we get back an empty list, there is no more token */
if (n_retval == 0) {
result = SQLITE_DONE;
}
/* otherwise, get token details from the return list */
else {
if (n_retval != 5) {
warn("tokenizer cursor returned %d arguments", n_retval);
}
*piPosition = POPi;
*piEndOffset = POPi;
*piStartOffset = POPi;
*pnBytes = POPi;
char *token = POPpx;
if (c->pInput) { /* if working with utf8 data */
/* recompute *pnBytes in bytes, not in chars */
*pnBytes = strlen(token);
/* recompute start/end offsets in bytes, not in chars */
I32 hop = *piStartOffset - c->lastCharOffset;
char *byteOffset = utf8_hop(c->lastByteOffset, hop);
hop = *piEndOffset - *piStartOffset;
*piStartOffset = byteOffset - c->pInput;
byteOffset = utf8_hop(byteOffset, hop);
*piEndOffset = byteOffset - c->pInput;
/* remember where we are for the next round */
c->lastCharOffset = *piEndOffset;
c->lastByteOffset = byteOffset;
}
/* make sure we have enough storage for copying the token */
if (*pnBytes > c->nTokenAllocated ){
char *pNew;
c->nTokenAllocated = *pnBytes + 20;
pNew = sqlite3_realloc(c->pToken, c->nTokenAllocated);
if( !pNew ) return SQLITE_NOMEM;
c->pToken = pNew;
}
/* need to copy the token into the C cursor before perl frees that
memory */
memcpy(c->pToken, token, *pnBytes);
*ppToken = c->pToken;
result = SQLITE_OK;
}
PUTBACK;
FREETMPS;
LEAVE;
return result;
}
/*
** The set of routines that implement the perl tokenizer
*/
sqlite3_tokenizer_module perl_tokenizer_Module = {
0,
perl_tokenizer_Create,
perl_tokenizer_Destroy,
perl_tokenizer_Open,
perl_tokenizer_Close,
perl_tokenizer_Next
};
/*
** Register the perl tokenizer with FTS3
*/
int sqlite_db_register_fts3_perl_tokenizer(pTHX_ SV *dbh)
{
D_imp_dbh(dbh);
int rc;
sqlite3_stmt *pStmt;
const char zSql[] = "SELECT fts3_tokenizer(?, ?)";
sqlite3_tokenizer_module *p = &perl_tokenizer_Module;
rc = sqlite3_prepare_v2(imp_dbh->db, zSql, -1, &pStmt, 0);
if( rc!=SQLITE_OK ){
return rc;
}
sqlite3_bind_text(pStmt, 1, "perl", -1, SQLITE_STATIC);
sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
sqlite3_step(pStmt);
return sqlite3_finalize(pStmt);
}
/* ======================================================================
* END OF EXPERIMENTAL bindings for FTS3 TOKENIZERS
* ====================================================================== */
int
sqlite_st_prepare(SV *sth, imp_sth_t *imp_sth, char *statement, SV *attribs)
{
@@ -566,6 +866,8 @@ sqlite_st_execute(SV *sth, imp_sth_t *imp_sth)
croak_if_db_is_null();
croak_if_stmt_is_null();
last_executed_dbh = imp_dbh;
/* COMPAT: sqlite3_sql is only available for 3006000 or newer */
sqlite_trace(sth, imp_sth, 3, form("executing %s", sqlite3_sql(imp_sth->stmt)));
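From the Perl side, the unicode flag that this global exposes to the tokenizer is simply the C<sqlite_unicode> attribute given at connect time. A minimal sketch (the database name is a placeholder):

  use DBI;
  my $dbh = DBI->connect("dbi:SQLite:dbname=demo.db", "", "",
                         { RaiseError => 1, sqlite_unicode => 1 });
  # strings handed to perl tokenizers on this handle now arrive as
  # Perl character strings (UTF-8 flag set), not raw bytes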


@@ -100,6 +100,8 @@ SV* sqlite_db_update_hook( pTHX_ SV *dbh, SV *hook );
int sqlite_db_set_authorizer( pTHX_ SV *dbh, SV *authorizer );
AV* sqlite_compile_options();
int sqlite_db_register_fts3_perl_tokenizer(pTHX_ SV *dbh);
#ifdef SvUTF8_on
static SV *

fts3_tokenizer.h (new file)

@@ -0,0 +1,154 @@
/************** Begin file fts3_tokenizer.h **********************************/
/*
** 2006 July 10
**
** The author disclaims copyright to this source code.
**
*************************************************************************
** Defines the interface to tokenizers used by fulltext-search. There
** are three basic components:
**
** sqlite3_tokenizer_module is a singleton defining the tokenizer
** interface functions. This is essentially the class structure for
** tokenizers.
**
** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
** including customization information defined at creation time.
**
** sqlite3_tokenizer_cursor is generated by a tokenizer to generate
** tokens from a particular input.
*/
#ifndef _FTS3_TOKENIZER_H_
#define _FTS3_TOKENIZER_H_
/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
** If tokenizers are to be allowed to call sqlite3_*() functions, then
** we will need a way to register the API consistently.
*/
/*
** Structures used by the tokenizer interface. When a new tokenizer
** implementation is registered, the caller provides a pointer to
** an sqlite3_tokenizer_module containing pointers to the callback
** functions that make up an implementation.
**
** When an fts3 table is created, it passes any arguments passed to
** the tokenizer clause of the CREATE VIRTUAL TABLE statement to the
** sqlite3_tokenizer_module.xCreate() function of the requested tokenizer
** implementation. The xCreate() function in turn returns an
** sqlite3_tokenizer structure representing the specific tokenizer to
** be used for the fts3 table (customized by the tokenizer clause arguments).
**
** To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen()
** method is called. It returns an sqlite3_tokenizer_cursor object
** that may be used to tokenize a specific input buffer based on
** the tokenization rules supplied by a specific sqlite3_tokenizer
** object.
*/
typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
typedef struct sqlite3_tokenizer sqlite3_tokenizer;
typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;
struct sqlite3_tokenizer_module {
/*
** Structure version. Should always be set to 0.
*/
int iVersion;
/*
** Create a new tokenizer. The values in the argv[] array are the
** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL
** TABLE statement that created the fts3 table. For example, if
** the following SQL is executed:
**
** CREATE .. USING fts3( ... , tokenizer <tokenizer-name> arg1 arg2)
**
** then argc is set to 2, and the argv[] array contains pointers
** to the strings "arg1" and "arg2".
**
** This method should return either SQLITE_OK (0), or an SQLite error
** code. If SQLITE_OK is returned, then *ppTokenizer should be set
** to point at the newly created tokenizer structure. The generic
** sqlite3_tokenizer.pModule variable should not be initialised by
** this callback. The caller will do so.
*/
int (*xCreate)(
int argc, /* Size of argv array */
const char *const*argv, /* Tokenizer argument strings */
sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */
);
/*
** Destroy an existing tokenizer. The fts3 module calls this method
** exactly once for each successful call to xCreate().
*/
int (*xDestroy)(sqlite3_tokenizer *pTokenizer);
/*
** Create a tokenizer cursor to tokenize an input buffer. The caller
** is responsible for ensuring that the input buffer remains valid
** until the cursor is closed (using the xClose() method).
*/
int (*xOpen)(
sqlite3_tokenizer *pTokenizer, /* Tokenizer object */
const char *pInput, int nBytes, /* Input buffer */
sqlite3_tokenizer_cursor **ppCursor /* OUT: Created tokenizer cursor */
);
/*
** Destroy an existing tokenizer cursor. The fts3 module calls this
** method exactly once for each successful call to xOpen().
*/
int (*xClose)(sqlite3_tokenizer_cursor *pCursor);
/*
** Retrieve the next token from the tokenizer cursor pCursor. This
** method should either return SQLITE_OK and set the values of the
** "OUT" variables identified below, or SQLITE_DONE to indicate that
** the end of the buffer has been reached, or an SQLite error code.
**
** *ppToken should be set to point at a buffer containing the
** normalized version of the token (i.e. after any case-folding and/or
** stemming has been performed). *pnBytes should be set to the length
** of this buffer in bytes. The input text that generated the token is
** identified by the byte offsets returned in *piStartOffset and
** *piEndOffset. *piStartOffset should be set to the index of the first
** byte of the token in the input buffer. *piEndOffset should be set
** to the index of the first byte just past the end of the token in
** the input buffer.
**
** The buffer *ppToken is set to point at is managed by the tokenizer
** implementation. It is only required to be valid until the next call
** to xNext() or xClose().
*/
/* TODO(shess) current implementation requires pInput to be
** nul-terminated. This should either be fixed, or pInput/nBytes
** should be converted to zInput.
*/
int (*xNext)(
sqlite3_tokenizer_cursor *pCursor, /* Tokenizer cursor */
const char **ppToken, int *pnBytes, /* OUT: Normalized text for token */
int *piStartOffset, /* OUT: Byte offset of token in input buffer */
int *piEndOffset, /* OUT: Byte offset of end of token in input buffer */
int *piPosition /* OUT: Number of tokens returned before this one */
);
};
struct sqlite3_tokenizer {
const sqlite3_tokenizer_module *pModule; /* The module for this tokenizer */
/* Tokenizer implementations will typically add additional fields */
};
struct sqlite3_tokenizer_cursor {
sqlite3_tokenizer *pTokenizer; /* Tokenizer for this cursor. */
/* Tokenizer implementations will typically add additional fields */
};
int fts3_global_term_cnt(int iTerm, int iCol);
int fts3_term_cnt(int iTerm, int iCol);
#endif /* _FTS3_TOKENIZER_H_ */
/************** End of fts3_tokenizer.h **************************************/


@@ -55,6 +55,7 @@ sub driver {
DBD::SQLite::db->install_method('sqlite_backup_from_file');
DBD::SQLite::db->install_method('sqlite_backup_to_file');
DBD::SQLite::db->install_method('sqlite_enable_load_extension');
DBD::SQLite::db->install_method('sqlite_register_fts3_perl_tokenizer');
$methods_are_installed++;
}
@@ -71,6 +72,7 @@ sub CLONE {
undef $drh;
}
package DBD::SQLite::dr;
sub connect {
@@ -120,13 +122,16 @@ sub connect {
# Hand off to the actual login function
DBD::SQLite::db::_login($dbh, $real, $user, $auth, $attr) or return undef;
# Register the on-demand collation installer and REGEXP function
# Register the on-demand collation installer, REGEXP function and
# perl tokenizer
if ( DBD::SQLite::NEWAPI ) {
$dbh->sqlite_collation_needed( \&install_collation );
$dbh->sqlite_create_function( "REGEXP", 2, \&regexp );
$dbh->sqlite_register_fts3_perl_tokenizer();
} else {
$dbh->func( \&install_collation, "collation_needed" );
$dbh->func( "REGEXP", 2, \&regexp, "create_function" );
$dbh->func( "register_fts3_perl_tokenizer" );
}
# HACK: Since PrintWarn = 0 doesn't seem to actually prevent warnings
@@ -1645,6 +1650,234 @@ I<requests> for collations. In other words, if you want to change
the behaviour of a collation within an existing C<$dbh>, you
need to call the L</create_collation> method directly.
=head1 FULLTEXT SEARCH
The FTS3 extension module within SQLite allows users to create special
tables with a built-in full-text index (hereafter "FTS3 tables"). The
full-text index allows the user to efficiently query the database for
all rows that contain one or more instances of a specified word (hereafter
a "token"), even if the table contains many large documents.
=head2 Short introduction to FTS3
The detailed documentation for FTS3 can be found
at L<http://www.sqlite.org/fts3.html>. Here is a very short example:
$dbh->do(<<"") or die DBI::errstr;
CREATE VIRTUAL TABLE fts_example USING fts3(content)
my $sth = $dbh->prepare("INSERT INTO fts_example(content) VALUES (?)");
$sth->execute($_) foreach @docs_to_insert;
my $results = $dbh->selectall_arrayref(<<"");
SELECT docid, snippet(content) FROM fts_example WHERE content MATCH 'foo'
The key points in this example are:
=over
=item *
The syntax for creating FTS3 tables is
CREATE VIRTUAL TABLE <table_name> USING fts3(<columns>)
where C<< <columns> >> is a list of column names. Columns may be
typed, but the type information is ignored. If no columns
are specified, the default is a single column named C<content>.
In addition, FTS3 tables have an implicit column called C<docid>
(also known as C<rowid>) for numbering the stored documents.
=item *
Statements for inserting, updating or deleting records
use the same syntax as for regular SQLite tables.
=item *
Full-text searches are specified with the C<MATCH> operator, and an
operand which may be a single word, a word prefix ending with '*', a
list of words, a "phrase query" in double quotes, or a boolean combination
of the above (an example of each form is shown just after this list).
=item *
The builtin function C<snippet(...)> builds a formatted excerpt of the
document text, where the words pertaining to the query are highlighted.
=back
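For illustration, here is one MATCH operand of each form, run against the C<fts_example> table above (the terms themselves are placeholders):

  my $sql = "SELECT docid FROM fts_example WHERE content MATCH ?";
  for my $query ('foo',                     # single word
                 'foo*',                    # word prefix
                 '"foo bar"',               # phrase query
                 'foo NOT bar',             # boolean combination
                 '(foo OR bar) AND baz*') { # nested boolean expression
    my $docids = $dbh->selectcol_arrayref($sql, undef, $query);
  }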
There are many more details to building and searching
FTS3 tables, so we strongly invite you to read
the full documentation at L<http://www.sqlite.org/fts3.html>.
B<Incompatible change>:
starting from version 1.31, C<DBD::SQLite> uses the new, recommended
"Enhanced Query Syntax" for binary set operators (AND, OR, NOT, possibly
nested with parenthesis). Previous versions of C<DBD::SQLite> used the
"Standard Query Syntax" (see L<http://www.sqlite.org/fts3.html#section_3_2>).
Unfortunately this is a compilation switch, so it cannot be tuned
at runtime; however, since FTS3 was never advertised in versions prior
to 1.31, the change should be invisible to the vast majority of
C<DBD::SQLite> users. If, however, an application
was nevertheless built using the "Standard Query" syntax,
it has to be migrated; the conversion
function provided in L<DBD::SQLite::FTS3Transitional>
is there to help.
=head2 Tokenizers
The behaviour of full-text indexes strongly depends on how
documents are split into I<tokens>; therefore FTS3 table
declarations can explicitly specify how to perform
tokenization:
CREATE ... USING fts3(<columns>, tokenize=<tokenizer>)
where C<< <tokenizer> >> is a sequence of space-separated
words that triggers a specific tokenizer, as explained below.
=head3 SQLite builtin tokenizers
SQLite comes with three builtin tokenizers:
=over
=item simple
Under the I<simple> tokenizer, a term is a contiguous sequence of
eligible characters, where eligible characters are all alphanumeric
characters, the "_" character, and all characters with UTF codepoints
greater than or equal to 128. All other characters are discarded when
splitting a document into terms. They serve only to separate adjacent
terms.
All uppercase characters within the ASCII range (UTF codepoints less
than 128), are transformed to their lowercase equivalents as part of
the tokenization process. Thus, full-text queries are case-insensitive
when using the simple tokenizer.
=item porter
The I<porter> tokenizer uses the same rules to separate the input
document into terms, but as well as folding all terms to lower case it
uses the Porter Stemming algorithm to reduce related English language
words to a common root (an example follows this list).
=item icu
If SQLite is compiled with the SQLITE_ENABLE_ICU
pre-processor symbol defined, then there exists a built-in tokenizer
named "icu" implemented using the ICU library, and taking an
ICU locale identifier as argument (such as "tr_TR" for
Turkish as used in Turkey, or "en_AU" for English as used in
Australia). For example:
CREATE VIRTUAL TABLE thai_text USING fts3(text, tokenize=icu th_TH)
The ICU tokenizer implementation is very simple. It splits the input
text according to the ICU rules for finding word boundaries and
discards any tokens that consist entirely of white-space. This may be
suitable for some applications in some locales, but not all. If more
complex processing is required, for example to implement stemming or
discard punctuation, use the perl tokenizer as explained below.
=back
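For example, here is how tables could explicitly select the I<simple> or I<porter> tokenizer (a sketch; table and column names are illustrative):

  $dbh->do(<<"") or die DBI::errstr;
  CREATE VIRTUAL TABLE docs_simple USING fts3(content, tokenize=simple)

  $dbh->do(<<"") or die DBI::errstr;
  CREATE VIRTUAL TABLE docs_porter USING fts3(content, tokenize=porter)

  # with the porter tokenizer, a MATCH for 'run' also finds
  # documents containing 'runs' or 'running'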
=head3 Perl tokenizers
In addition to the builtin SQLite tokenizers, C<DBD::SQLite>
implements a I<perl> tokenizer which can hook into any tokenizing
algorithm written in Perl. This is specified as follows:
CREATE ... USING fts3(<columns>, tokenize=perl '<perl_function>')
where C<< <perl_function> >> is a fully qualified Perl function name
(i.e. prefixed by the name of the package in which that function is
declared). So for example if the function is C<my_func> in the main
program, write
CREATE ... USING fts3(<columns>, tokenize=perl 'main::my_func')
That function should return a code reference that takes a string as
single argument, and returns an iterator (another function), which
returns a tuple C<< ($term, $len, $start, $end, $index) >> for each
term. Here is a simple example that tokenizes on words according to
the current perl locale:
sub locale_tokenizer {
return sub {
my $string = shift;
use locale;
my $regex = qr/\w+/;
my $term_index = 0;
return sub { # closure
$string =~ /$regex/g or return; # either match, or no more token
my ($start, $end) = ($-[0], $+[0]);
my $len = $end-$start;
my $term = substr($string, $start, $len);
return ($term, $len, $start, $end, $term_index++);
}
};
}
There must be three levels of subs, in a kind of "Russian dolls"
structure (a complete usage example is shown after the list below), because:
=over
=item *
the external, named sub is called whenever accessing a FTS3 table
with that tokenizer
=item *
the inner, anonymous sub is called whenever a new string
needs to be tokenized (either for inserting new text into the table,
or for analyzing a query).
=item *
the innermost, anonymous sub is called repeatedly for retrieving
all terms within that string.
=back
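Putting the pieces together, the C<locale_tokenizer> above could be used like this (a condensed sketch along the lines of the test file F<t/43_fts3.t>; C<@documents> and C<$word> are placeholders):

  $dbh->do(<<"") or die DBI::errstr;
  CREATE VIRTUAL TABLE my_docs
         USING fts3(content, tokenize=perl 'main::locale_tokenizer')

  my $sth = $dbh->prepare("INSERT INTO my_docs(content) VALUES (?)");
  $sth->execute($_) foreach @documents;

  my $docids = $dbh->selectcol_arrayref(
    "SELECT docid FROM my_docs WHERE content MATCH ?", undef, $word);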
Instead of writing tokenizers by hand, you can grab one of the tokenizers
already implemented in the L<Search::Tokenizer> module:
use Search::Tokenizer;
$dbh->do(<<"") or die DBI::errstr;
CREATE ... USING fts3(<columns>,
tokenize=perl 'Search::Tokenizer::unaccent')
or you can use L<Search::Tokenizer/new> to build
your own tokenizer.
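Alternatively, a hand-rolled variant is not much work. The sketch below (an illustration, not part of this distribution) folds case and strips combining accents with the core L<Unicode::Normalize> module, so that a query for "bergere" would also match "bergère"; it assumes the handle was opened with C<< sqlite_unicode => 1 >>, so that the input is a character string:

  use Unicode::Normalize qw/NFKD/;

  sub unaccent_tokenizer { # illustrative only
    return sub {
      my $string = shift;
      my $term_index = 0;
      return sub {
        $string =~ /\w+/g or return;  # either match, or no more token
        my ($start, $end) = ($-[0], $+[0]);
        my $term = lc substr($string, $start, $end - $start);
        $term = NFKD($term);          # decompose characters ...
        $term =~ s/\pM//g;            # ... then drop combining marks
        return ($term, length($term), $start, $end, $term_index++);
      };
    };
  }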
=head2 Incomplete handling of utf8 characters
The current FTS3 implementation in SQLite is far from complete with
respect to utf8 handling: in particular, variable-length characters
are not treated correctly by the builtin functions
C<offsets()> and C<snippet()>.
=head2 Database space for FTS3
FTS3 stores a complete copy of the indexed documents, together with
the fulltext index. On a large collection of documents, this can
consume quite a lot of disk space. If copies of documents are also
available as external resources (for example files on the filesystem),
that space can sometimes be spared --- see the tip in the
L<Cookbook|DBD::SQLite::Cookbook/"Sparing database disk space">.
=head1 FOR DBD::SQLITE EXTENSION AUTHORS
Since 1.30_01, you can retrieve the bundled sqlite C source and/or


@@ -9,6 +9,8 @@ This is the L<DBD::SQLite> cookbook.
It is intended to provide a place to keep a variety of functions and
formulas for use in callback APIs in L<DBD::SQLite>.
=head1 AGGREGATE FUNCTIONS
=head2 Variance
This is a simple aggregate function which returns a variance. It is
@@ -140,6 +142,35 @@ The function can then be used as:
FROM results
GROUP BY group_name;
=head1 FTS3 fulltext indexing
=head2 Sparing database disk space
As explained in L<http://www.sqlite.org/fts3.html#section_6>, each
FTS3 table C<I<t>> is stored internally within three regular tables
C<I<t>_content>, C<I<t>_segments> and C<I<t>_segdir>. The last two
tables contain the fulltext index. The first table C<I<t>_content>
stores the complete documents being indexed ... but if copies of the
same documents are already stored somewhere else, or can be computed
from external resources (for example as HTML or MS Word files in the
filesystem), then this is quite a waste of space. SQLite itself only
needs the C<I<t>_content> table for implementing the C<offsets()> and
C<snippet()> functions, which are not always usable anyway (in particular
with characters whose code points are greater than 255).
So an alternative strategy is to use SQLite only for the fulltext
index and metadata, and to keep the full documents outside of SQLite:
to do so, after each insert or update in the FTS3 table, do an update
in the C<I<t>_content> table, setting the content column(s) to
NULL. Of course your application will need an algorithm for finding
the external resource corresponding to any I<docid> stored within
SQLite. Furthermore, the SQLite C<offsets()> and C<snippet()> functions
can then no longer be used, so if such functionality is needed, it has
to be programmed directly within the Perl application.
In short, this strategy is really a hack, because FTS3 was not originally
designed with that behaviour in mind; however it is workable and can
dramatically reduce the size of the database file. A short sketch follows.
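The sketch below assumes a table created as C<CREATE VIRTUAL TABLE t USING fts3(content)>, whose internal content table is then C<t_content> with a column named C<c0content> (check the FTS3 internals documentation for the exact naming scheme):

  my $insert = $dbh->prepare("INSERT INTO t(content) VALUES (?)");
  my $blank  = $dbh->prepare(
    "UPDATE t_content SET c0content = NULL WHERE docid = ?");
  foreach my $doc (@external_documents) {  # placeholder list
    $insert->execute($doc);                # index the document ...
    $blank->execute($dbh->last_insert_id("", "", "", ""));
                                           # ... then drop the stored copy
  }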
=head1 SUPPORT
Bugs should be reported via the CPAN bug tracker at
@@ -157,6 +188,8 @@ turn them into a separate CPAN distribution.
Adam Kennedy E<lt>adamk@cpan.orgE<gt>
Laurent Dami E<lt>dami@cpan.orgE<gt>
=head1 COPYRIGHT
Copyright 2009 Adam Kennedy.


@@ -0,0 +1,96 @@
package DBD::SQLite::FTS3Transitional;
use strict;
use warnings;
no warnings 'uninitialized';
use Exporter 'import';
our @EXPORT_OK = qw/fts3_convert/;
sub fts3_convert {
my $in = shift;
my $out = "";
# decompose input string into tokens
my @tokens = $in =~ / - # minus sign
| \bOR\b # OR keyword
| ".*?" # phrase query
| \S+ # term
/xg;
# build the output string
while (@tokens) {
# -a => (NOT a)
if ($tokens[0] eq '-') {
my (undef, $right) = splice(@tokens, 0, 2);
$out .= " (NOT $right)";
}
# a OR b => (a OR b)
elsif (@tokens >= 2 && $tokens[1] eq 'OR') {
my ($left, undef, $right) = splice(@tokens, 0, 3);
if ($right eq '-') {
$right = "NOT " . shift @tokens;
}
$out .= " ($left OR $right)";
}
# plain term
else {
$out .= " " . shift @tokens;
}
}
return $out;
}
1;
__END__
=head1 NAME
DBD::SQLite::FTS3Transitional - helper function for migrating FTS3 applications
=head1 SYNOPSIS
use DBD::SQLite::FTS3Transitional qw/fts3_convert/;
my $new_match_syntax = fts3_convert($old_match_syntax);
my $sql = "SELECT ... FROM ... WHERE col MATCH $new_match_syntax";
=head1 DESCRIPTION
Starting from version 1.31, C<DBD::SQLite> uses the new, recommended
"Enhanced Query Syntax" for binary set operators in fulltext FTS3 queries
(AND, OR, NOT, possibly nested with parentheses).
Previous versions of C<DBD::SQLite> used the
"Standard Query Syntax" (see L<http://www.sqlite.org/fts3.html#section_3_2>).
This module helps convert SQLite applications built with the old
"Standard" query syntax to the new "Enhanced" syntax.
=head1 FUNCTIONS
=head2 fts3_convert
Takes as input a string for the MATCH clause in a FTS3 fulltext search;
returns the same clause rewritten in the new, "Enhanced" syntax.
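A couple of conversions, borrowed from the test file F<t/44_fts3_transitional.t>, illustrate the rewriting (the result may carry leading whitespace, trimmed here for readability):

  use DBD::SQLite::FTS3Transitional qw/fts3_convert/;

  fts3_convert('foo -bar');           # gives 'foo (NOT bar)'
  fts3_convert('foo bar OR bie buz'); # gives 'foo (bar OR bie) buz'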
=head1 AUTHOR
Laurent Dami E<lt>dami@cpan.orgE<gt>
=head1 COPYRIGHT
Copyright 2010 Laurent Dami.
This program is free software; you can redistribute
it and/or modify it under the same terms as Perl itself.
The full text of the license can be found in the
LICENSE file included with this module.
=cut

t/43_fts3.t (new file)

@@ -0,0 +1,103 @@
#!/usr/bin/perl
use strict;
BEGIN {
$| = 1;
$^W = 1;
}
use t::lib::Test qw/connect_ok/;
use Test::More;
my @texts = ("il était une bergère",
"qui gardait ses moutons",
"elle fit un fromage",
"du lait de ses moutons");
my @tests = (
# query => expected results
["bergère" => 0 ],
["berg*" => 0 ],
["foobar" ],
["moutons" => 1, 3 ],
['"qui gardait"' => 1 ],
["moutons NOT lait" => 1 ],
["il était" => 0 ],
["(il OR elle) AND un*" => 0, 2 ],
);
BEGIN {
if ($] < 5.008005) {
plan skip_all => 'Unicode is not supported before 5.8.5';
}
}
use Test::NoWarnings;
plan tests => 2 * (1 + @tests) + 1;
BEGIN {
# Sadly perl for windows (and probably sqlite, too) may hang
# if the system locale doesn't support european languages.
# en-us should be a safe default. if it doesn't work, use 'C'.
if ( $^O eq 'MSWin32') {
use POSIX 'locale_h';
setlocale(LC_COLLATE, 'en-us');
}
}
use locale;
sub locale_tokenizer { # see also: Search::Tokenizer
return sub {
my $string = shift;
my $regex = qr/\w+/;
my $term_index = 0;
return sub {
$string =~ /$regex/g or return; # either match, or no more token
my ($start, $end) = ($-[0], $+[0]);
my $term = substr($string, $start, my $len = $end-$start);
return ($term, $len, $start, $end, $term_index++);
};
};
}
use DBD::SQLite;
for my $use_unicode (0, 1) {
# connect
my $dbh = connect_ok( RaiseError => 1, sqlite_unicode => $use_unicode );
# create fts3 table
use Search::Tokenizer;
$dbh->do(<<"") or die DBI::errstr;
CREATE VIRTUAL TABLE try_fts3
USING fts3(content, tokenize=perl 'main::locale_tokenizer')
# populate it
my $insert_sth = $dbh->prepare(<<"") or die DBI::errstr;
INSERT INTO try_fts3(content) VALUES(?)
my @doc_ids;
for (my $i = 0; $i < @texts; $i++) {
$insert_sth->execute($texts[$i]);
$doc_ids[$i] = $dbh->last_insert_id("", "", "", "");
}
# queries
my $sql = "SELECT docid FROM try_fts3 WHERE content MATCH ?";
for my $t (@tests) {
my ($query, @expected) = @$t;
@expected = map {$doc_ids[$_]} @expected;
my $results = $dbh->selectcol_arrayref($sql, undef, $query);
is_deeply($results, \@expected, "$query (unicode is $use_unicode)");
}
}

t/44_fts3_transitional.t (new file)

@@ -0,0 +1,34 @@
#!/usr/bin/perl
use strict;
BEGIN {
$| = 1;
$^W = 1;
}
use Test::More;
use Test::NoWarnings;
my @tests = (
['foo bar' => 'foo bar' ],
['foo -bar' => 'foo (NOT bar)' ],
['foo* -bar*' => 'foo* (NOT bar*)' ],
['foo bar OR bie buz' => 'foo (bar OR bie) buz' ],
['-foo bar OR -bie buz' => '(NOT foo) (bar OR NOT bie) buz'],
['"kyrie eleison" OR "christe eleison"'
=> '("kyrie eleison" OR "christe eleison")'],
);
plan tests => 1 + @tests;
use DBD::SQLite::FTS3Transitional qw/fts3_convert/;
foreach my $t (@tests) {
my ($old_syntax, $expected_new) = @$t;
my $new = fts3_convert($old_syntax);
$new =~ s/^\s+//;
is($new, $expected_new, $old_syntax);
}