mirror of https://github.com/DBD-SQLite/DBD-SQLite
synced 2025-06-07 14:19:10 -04:00

Added support for FTS3 fulltext searches: perl tokenizers, documentation and tests

parent 413bd0ac9d
commit d43cf63ad0

12 changed files with 988 additions and 3 deletions
Changes (5 changed lines)

@@ -1,6 +1,11 @@
Changes for Perl extension DBD-SQLite

1.30_04 to be released
        - Added support for FTS3 tokenizers written in Perl. Added tests
          and documentation on how to use FTS3. Changed the compilation flag
          to use the recommended -DSQLITE_ENABLE_FTS3_PARENTHESIS
          *** MAY POSSIBLY BREAK OLD APPLICATIONS THAT ALREADY USED FTS3 ***
          (DAMI)
        - Fixed various backward compatibility issues back to SQLite 3.6.1
          (ISHIGAKI)
        - Resolved #58332: Documentation error for preventing fsync
Makefile.PL (10 changed lines)

@@ -212,8 +212,14 @@ if ( $sqlite_inc ) {
my @CC_DEFINE = (
        # '-DSQLITE_CORE',
        '-DSQLITE_ENABLE_FTS3',
        # L. Dami 10.07.2010 : now enabling new FTS3 syntax, because
        # that's the recommendation from SQLite for new applications
        # (used to be "Disabled until we have a test for this").
        # This change MAY POSSIBLY BREAK OLD APPLICATIONS THAT ALREADY
        # USED FTS3 ... but sooner or later that change had to be done!
        '-DSQLITE_ENABLE_FTS3_PARENTHESIS', # for sqlite >= 3.6.10
        '-DSQLITE_ENABLE_COLUMN_METADATA',
        '-DNDEBUG=1',
);
SQLite.xs (16 changed lines)

@@ -196,6 +196,22 @@ backup_to_file(dbh, filename)
    OUTPUT:
        RETVAL

static int
register_fts3_perl_tokenizer(dbh)
    SV *dbh
    ALIAS:
        DBD::SQLite::db::sqlite_register_fts3_perl_tokenizer = 1
    CODE:
        RETVAL = sqlite_db_register_fts3_perl_tokenizer(aTHX_ dbh);
    OUTPUT:
        RETVAL

MODULE = DBD::SQLite          PACKAGE = DBD::SQLite::st

PROTOTYPES: DISABLE

@@ -19,5 +19,6 @@
#include <dbd_xsh.h>

#include "sqlite3.h"
#include "fts3_tokenizer.h"

#endif
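The new XS function is exposed as a private C<$dbh> method. A minimal sketch of
invoking it explicitly -- normally unnecessary, since C<connect()> registers the
tokenizer itself, as the lib/DBD/SQLite.pm hunks further below show:

  # assuming an already-connected $dbh
  $dbh->sqlite_register_fts3_perl_tokenizer();   # NEWAPI style
  # or, through the old-style func() interface:
  $dbh->func( "register_fts3_perl_tokenizer" );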
dbdimp.c (302 changed lines)

@@ -20,6 +20,14 @@ DBISTATE_DECLARE;
#define croak_if_stmt_is_null()
#endif

/*-----------------------------------------------------*
 * Globals
 *-----------------------------------------------------*/
imp_dbh_t *last_executed_dbh;   /* needed by perl_tokenizer
                                   to know if unicode is on/off */

/*-----------------------------------------------------*
 * Helper Methods
 *-----------------------------------------------------*/

@@ -487,6 +495,298 @@ sqlite_db_last_insert_id(SV *dbh, imp_dbh_t *imp_dbh, SV *catalog, SV *schema, S
    return newSViv((IV)sqlite3_last_insert_rowid(imp_dbh->db));
}

/* ======================================================================
 * EXPERIMENTAL bindings for FTS3 TOKENIZERS
 * ====================================================================== */

typedef struct perl_tokenizer {
    sqlite3_tokenizer base;
    SV *coderef;                 /* the perl tokenizer is a coderef that takes
                                    a string and returns a cursor coderef */
} perl_tokenizer;

typedef struct perl_tokenizer_cursor {
    sqlite3_tokenizer_cursor base;
    SV *coderef;                 /* ref to the closure that returns terms */
    char *pToken;                /* storage for a copy of the last token */
    int nTokenAllocated;         /* space allocated to pToken buffer */

    /* members below are only used if the input string is in utf8 */
    const char *pInput;          /* input we are tokenizing */
    const char *lastByteOffset;  /* offset into pInput */
    int lastCharOffset;          /* char offset corresponding to lastByteOffset */
} perl_tokenizer_cursor;

/*
** Create a new tokenizer instance.
** Will be called whenever a FTS3 table is created with
**   CREATE .. USING fts3( ... , tokenize=perl qualified::function::name)
** where qualified::function::name is a fully qualified perl function
*/
static int perl_tokenizer_Create(
    int argc, const char * const *argv,
    sqlite3_tokenizer **ppTokenizer
){
    perl_tokenizer *t;
    t = (perl_tokenizer *) sqlite3_malloc(sizeof(*t));
    if( t==NULL ) return SQLITE_NOMEM;
    memset(t, 0, sizeof(*t));

    dTHX;
    dSP;

    ENTER;
    SAVETMPS;

    /* call the qualified::function::name */
    PUSHMARK(SP);
    PUTBACK;
    int n_retval = call_pv(argv[0], G_SCALAR);
    SPAGAIN;

    /* store a copy of the returned coderef into the tokenizer structure */
    if (n_retval != 1) {
        warn("tokenizer_Create returned %d arguments", n_retval);
    }
    SV *retval   = POPs;
    t->coderef   = newSVsv(retval);
    *ppTokenizer = &t->base;

    PUTBACK;
    FREETMPS;
    LEAVE;

    return SQLITE_OK;
}

/*
** Destroy a tokenizer
*/
static int perl_tokenizer_Destroy(sqlite3_tokenizer *pTokenizer){
    dTHX;
    perl_tokenizer *t = (perl_tokenizer *) pTokenizer;
    sv_free(t->coderef);
    sqlite3_free(t);
    return SQLITE_OK;
}

/*
** Prepare to begin tokenizing a particular string.  The input
** string to be tokenized is supposed to be pInput[0..nBytes-1] ..
** except that nBytes passed by fts3 is -1 (don't know why) !
** This is passed to the tokenizer instance, which then returns a
** closure implementing the cursor (so the cursor is again a coderef).
*/
static int perl_tokenizer_Open(
    sqlite3_tokenizer *pTokenizer,       /* Tokenizer object */
    const char *pInput, int nBytes,      /* Input buffer */
    sqlite3_tokenizer_cursor **ppCursor  /* OUT: Created tokenizer cursor */
){
    perl_tokenizer *t = (perl_tokenizer *)pTokenizer;

    /* allocate and initialize the cursor struct */
    perl_tokenizer_cursor *c;
    c = (perl_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
    memset(c, 0, sizeof(*c));
    *ppCursor = &c->base;

    /* flags for creating the Perl SV containing the input string */
    U32 flags = SVs_TEMP; /* will call sv_2mortal */

    /* special handling if working with utf8 strings */
    if (last_executed_dbh->unicode) { /* global var ... no better way ! */

        /* data to keep track of byte offsets */
        c->lastByteOffset = c->pInput = pInput;
        c->lastCharOffset = 0;

        /* string passed to Perl needs to be flagged as utf8 */
        flags |= SVf_UTF8;
    }

    dTHX;
    dSP;
    ENTER;
    SAVETMPS;

    /* build a Perl copy of the input string */
    if (nBytes < 0) { /* we get -1 from fts3. Don't know why ! */
        nBytes = strlen(pInput);
    }
    SV *perl_string = newSVpvn_flags(pInput, nBytes, flags);

    /* call the tokenizer coderef */
    PUSHMARK(SP);
    XPUSHs(perl_string);
    PUTBACK;
    int n_retval = call_sv(t->coderef, G_SCALAR);
    SPAGAIN;

    /* store the cursor coderef returned by the tokenizer */
    if (n_retval != 1) {
        warn("tokenizer returned %d arguments", n_retval);
    }
    c->coderef = newSVsv(POPs);

    PUTBACK;
    FREETMPS;
    LEAVE;
    return SQLITE_OK;
}

/*
** Close a tokenization cursor previously opened by a call to
** perl_tokenizer_Open() above.
*/
static int perl_tokenizer_Close(sqlite3_tokenizer_cursor *pCursor){
    perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor;

    dTHX;
    sv_free(c->coderef);
    sqlite3_free(c);
    return SQLITE_OK;
}

/*
** Extract the next token from a tokenization cursor.  The cursor must
** have been opened by a prior call to perl_tokenizer_Open().
*/
static int perl_tokenizer_Next(
    sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by perl_tokenizer_Open */
    const char **ppToken,               /* OUT: *ppToken is the token text */
    int *pnBytes,                       /* OUT: Number of bytes in token */
    int *piStartOffset,                 /* OUT: Starting offset of token */
    int *piEndOffset,                   /* OUT: Ending offset of token */
    int *piPosition                     /* OUT: Position integer of token */
){
    perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor;
    int result;

    dTHX;
    dSP;

    ENTER;
    SAVETMPS;

    /* call the cursor */
    PUSHMARK(SP);
    PUTBACK;
    int n_retval = call_sv(c->coderef, G_ARRAY);
    SPAGAIN;

    /* if we get back an empty list, there is no more token */
    if (n_retval == 0) {
        result = SQLITE_DONE;
    }
    /* otherwise, get token details from the return list */
    else {
        if (n_retval != 5) {
            warn("tokenizer cursor returned %d arguments", n_retval);
        }
        *piPosition    = POPi;
        *piEndOffset   = POPi;
        *piStartOffset = POPi;
        *pnBytes       = POPi;
        char *token    = POPpx;

        if (c->pInput) { /* if working with utf8 data */

            /* recompute *pnBytes in bytes, not in chars */
            *pnBytes = strlen(token);

            /* recompute start/end offsets in bytes, not in chars */
            I32 hop          = *piStartOffset - c->lastCharOffset;
            char *byteOffset = utf8_hop(c->lastByteOffset, hop);
            hop              = *piEndOffset - *piStartOffset;
            *piStartOffset   = byteOffset - c->pInput;
            byteOffset       = utf8_hop(byteOffset, hop);
            *piEndOffset     = byteOffset - c->pInput;

            /* remember where we are for next round */
            c->lastCharOffset = *piEndOffset;
            c->lastByteOffset = byteOffset;
        }

        /* make sure we have enough storage for copying the token */
        if (*pnBytes > c->nTokenAllocated ){
            char *pNew;
            c->nTokenAllocated = *pnBytes + 20;
            pNew = sqlite3_realloc(c->pToken, c->nTokenAllocated);
            if( !pNew ) return SQLITE_NOMEM;
            c->pToken = pNew;
        }

        /* need to copy the token into the C cursor before perl frees that
           memory */
        memcpy(c->pToken, token, *pnBytes);
        *ppToken = c->pToken;

        result = SQLITE_OK;
    }

    PUTBACK;
    FREETMPS;
    LEAVE;

    return result;
}

/*
** The set of routines that implement the perl tokenizer
*/
sqlite3_tokenizer_module perl_tokenizer_Module = {
    0,
    perl_tokenizer_Create,
    perl_tokenizer_Destroy,
    perl_tokenizer_Open,
    perl_tokenizer_Close,
    perl_tokenizer_Next
};

/*
** Register the perl tokenizer with FTS3
*/
int sqlite_db_register_fts3_perl_tokenizer(pTHX_ SV *dbh)
{
    D_imp_dbh(dbh);

    int rc;
    sqlite3_stmt *pStmt;
    const char zSql[] = "SELECT fts3_tokenizer(?, ?)";
    sqlite3_tokenizer_module *p = &perl_tokenizer_Module;

    rc = sqlite3_prepare_v2(imp_dbh->db, zSql, -1, &pStmt, 0);
    if( rc!=SQLITE_OK ){
        return rc;
    }

    sqlite3_bind_text(pStmt, 1, "perl", -1, SQLITE_STATIC);
    sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
    sqlite3_step(pStmt);

    return sqlite3_finalize(pStmt);
}

/* ======================================================================
 * END # EXPERIMENTAL bindings for FTS3 TOKENIZERS
 * ====================================================================== */

int
sqlite_st_prepare(SV *sth, imp_sth_t *imp_sth, char *statement, SV *attribs)
{

@@ -566,6 +866,8 @@ sqlite_st_execute(SV *sth, imp_sth_t *imp_sth)
    croak_if_db_is_null();
    croak_if_stmt_is_null();

    last_executed_dbh = imp_dbh;

    /* COMPAT: sqlite3_sql is only available for 3006000 or newer */
    sqlite_trace(sth, imp_sth, 3, form("executing %s", sqlite3_sql(imp_sth->stmt)));
dbdimp.h (2 changed lines)

@@ -100,6 +100,8 @@ SV* sqlite_db_update_hook( pTHX_ SV *dbh, SV *hook );
int sqlite_db_set_authorizer( pTHX_ SV *dbh, SV *authorizer );
AV* sqlite_compile_options();

int sqlite_db_register_fts3_perl_tokenizer(pTHX_ SV *dbh);

#ifdef SvUTF8_on

static SV *
fts3_tokenizer.h (new file, 154 lines)

/************** Begin file fts3_tokenizer.h **********************************/
/*
** 2006 July 10
**
** The author disclaims copyright to this source code.
**
*************************************************************************
** Defines the interface to tokenizers used by fulltext-search.  There
** are three basic components:
**
** sqlite3_tokenizer_module is a singleton defining the tokenizer
** interface functions.  This is essentially the class structure for
** tokenizers.
**
** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
** including customization information defined at creation time.
**
** sqlite3_tokenizer_cursor is generated by a tokenizer to generate
** tokens from a particular input.
*/
#ifndef _FTS3_TOKENIZER_H_
#define _FTS3_TOKENIZER_H_

/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
** If tokenizers are to be allowed to call sqlite3_*() functions, then
** we will need a way to register the API consistently.
*/

/*
** Structures used by the tokenizer interface. When a new tokenizer
** implementation is registered, the caller provides a pointer to
** an sqlite3_tokenizer_module containing pointers to the callback
** functions that make up an implementation.
**
** When an fts3 table is created, it passes any arguments passed to
** the tokenizer clause of the CREATE VIRTUAL TABLE statement to the
** sqlite3_tokenizer_module.xCreate() function of the requested tokenizer
** implementation. The xCreate() function in turn returns an
** sqlite3_tokenizer structure representing the specific tokenizer to
** be used for the fts3 table (customized by the tokenizer clause arguments).
**
** To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen()
** method is called. It returns an sqlite3_tokenizer_cursor object
** that may be used to tokenize a specific input buffer based on
** the tokenization rules supplied by a specific sqlite3_tokenizer
** object.
*/
typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
typedef struct sqlite3_tokenizer sqlite3_tokenizer;
typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;

struct sqlite3_tokenizer_module {

  /*
  ** Structure version. Should always be set to 0.
  */
  int iVersion;

  /*
  ** Create a new tokenizer. The values in the argv[] array are the
  ** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL
  ** TABLE statement that created the fts3 table. For example, if
  ** the following SQL is executed:
  **
  **   CREATE .. USING fts3( ... , tokenizer <tokenizer-name> arg1 arg2)
  **
  ** then argc is set to 2, and the argv[] array contains pointers
  ** to the strings "arg1" and "arg2".
  **
  ** This method should return either SQLITE_OK (0), or an SQLite error
  ** code. If SQLITE_OK is returned, then *ppTokenizer should be set
  ** to point at the newly created tokenizer structure. The generic
  ** sqlite3_tokenizer.pModule variable should not be initialised by
  ** this callback. The caller will do so.
  */
  int (*xCreate)(
    int argc,                           /* Size of argv array */
    const char *const*argv,             /* Tokenizer argument strings */
    sqlite3_tokenizer **ppTokenizer     /* OUT: Created tokenizer */
  );

  /*
  ** Destroy an existing tokenizer. The fts3 module calls this method
  ** exactly once for each successful call to xCreate().
  */
  int (*xDestroy)(sqlite3_tokenizer *pTokenizer);

  /*
  ** Create a tokenizer cursor to tokenize an input buffer. The caller
  ** is responsible for ensuring that the input buffer remains valid
  ** until the cursor is closed (using the xClose() method).
  */
  int (*xOpen)(
    sqlite3_tokenizer *pTokenizer,       /* Tokenizer object */
    const char *pInput, int nBytes,      /* Input buffer */
    sqlite3_tokenizer_cursor **ppCursor  /* OUT: Created tokenizer cursor */
  );

  /*
  ** Destroy an existing tokenizer cursor. The fts3 module calls this
  ** method exactly once for each successful call to xOpen().
  */
  int (*xClose)(sqlite3_tokenizer_cursor *pCursor);

  /*
  ** Retrieve the next token from the tokenizer cursor pCursor. This
  ** method should either return SQLITE_OK and set the values of the
  ** "OUT" variables identified below, or SQLITE_DONE to indicate that
  ** the end of the buffer has been reached, or an SQLite error code.
  **
  ** *ppToken should be set to point at a buffer containing the
  ** normalized version of the token (i.e. after any case-folding and/or
  ** stemming has been performed). *pnBytes should be set to the length
  ** of this buffer in bytes. The input text that generated the token is
  ** identified by the byte offsets returned in *piStartOffset and
  ** *piEndOffset. *piStartOffset should be set to the index of the first
  ** byte of the token in the input buffer. *piEndOffset should be set
  ** to the index of the first byte just past the end of the token in
  ** the input buffer.
  **
  ** The buffer *ppToken is set to point at is managed by the tokenizer
  ** implementation. It is only required to be valid until the next call
  ** to xNext() or xClose().
  */
  /* TODO(shess) current implementation requires pInput to be
  ** nul-terminated.  This should either be fixed, or pInput/nBytes
  ** should be converted to zInput.
  */
  int (*xNext)(
    sqlite3_tokenizer_cursor *pCursor,   /* Tokenizer cursor */
    const char **ppToken, int *pnBytes,  /* OUT: Normalized text for token */
    int *piStartOffset,  /* OUT: Byte offset of token in input buffer */
    int *piEndOffset,    /* OUT: Byte offset of end of token in input buffer */
    int *piPosition      /* OUT: Number of tokens returned before this one */
  );
};

struct sqlite3_tokenizer {
  const sqlite3_tokenizer_module *pModule;  /* The module for this tokenizer */
  /* Tokenizer implementations will typically add additional fields */
};

struct sqlite3_tokenizer_cursor {
  sqlite3_tokenizer *pTokenizer;       /* Tokenizer for this cursor. */
  /* Tokenizer implementations will typically add additional fields */
};

int fts3_global_term_cnt(int iTerm, int iCol);
int fts3_term_cnt(int iTerm, int iCol);

#endif /* _FTS3_TOKENIZER_H_ */

/************** End of fts3_tokenizer.h **************************************/
lib/DBD/SQLite.pm

@@ -55,6 +55,7 @@ sub driver {
        DBD::SQLite::db->install_method('sqlite_backup_from_file');
        DBD::SQLite::db->install_method('sqlite_backup_to_file');
        DBD::SQLite::db->install_method('sqlite_enable_load_extension');
        DBD::SQLite::db->install_method('sqlite_register_fts3_perl_tokenizer');
        $methods_are_installed++;
    }

@@ -71,6 +72,7 @@ sub CLONE {
    undef $drh;
}


package DBD::SQLite::dr;

sub connect {

@@ -120,13 +122,16 @@ sub connect {
    # Hand off to the actual login function
    DBD::SQLite::db::_login($dbh, $real, $user, $auth, $attr) or return undef;

    # Register the on-demand collation installer, REGEXP function and
    # perl tokenizer
    if ( DBD::SQLite::NEWAPI ) {
        $dbh->sqlite_collation_needed( \&install_collation );
        $dbh->sqlite_create_function( "REGEXP", 2, \&regexp );
        $dbh->sqlite_register_fts3_perl_tokenizer();
    } else {
        $dbh->func( \&install_collation, "collation_needed"  );
        $dbh->func( "REGEXP", 2, \&regexp, "create_function" );
        $dbh->func( "register_fts3_perl_tokenizer" );
    }

    # HACK: Since PrintWarn = 0 doesn't seem to actually prevent warnings
@@ -1645,6 +1650,234 @@ I<requests> for collations. In other words, if you want to change
the behaviour of a collation within an existing C<$dbh>, you
need to call the L</create_collation> method directly.

=head1 FULLTEXT SEARCH

The FTS3 extension module within SQLite allows users to create special
tables with a built-in full-text index (hereafter "FTS3 tables"). The
full-text index allows the user to efficiently query the database for
all rows that contain one or more instances of a specified word (hereafter
a "token"), even if the table contains many large documents.

=head2 Short introduction to FTS3

The detailed documentation for FTS3 can be found
at L<http://www.sqlite.org/fts3.html>. Here is a very short example:

  $dbh->do(<<"") or die DBI::errstr;
     CREATE VIRTUAL TABLE fts_example USING fts3(content)

  my $sth = $dbh->prepare("INSERT INTO fts_example(content) VALUES (?)");
  $sth->execute($_) foreach @docs_to_insert;

  my $results = $dbh->selectall_arrayref(<<"");
    SELECT docid, snippet(content) FROM fts_example WHERE content MATCH 'foo'

The key points in this example are:

=over

=item *

The syntax for creating FTS3 tables is

  CREATE VIRTUAL TABLE <table_name> USING fts3(<columns>)

where C<< <columns> >> is a list of column names. Columns may be
typed, but the type information is ignored. If no columns
are specified, the default is a single column named C<content>.
In addition, FTS3 tables have an implicit column called C<docid>
(also accessible as C<rowid>) for numbering the stored documents.

=item *

Statements for inserting, updating or deleting records
use the same syntax as for regular SQLite tables.

=item *

Full-text searches are specified with the C<MATCH> operator, and an
operand which may be a single word, a word prefix ending with '*', a
list of words, a "phrase query" in double quotes, or a boolean combination
of the above (a few sample queries are shown just after this list).

=item *

The builtin function C<snippet(...)> builds a formatted excerpt of the
document text, where the words pertaining to the query are highlighted.

=back
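For instance, assuming the C<fts_example> table above, the following
queries illustrate the various operand forms (a quick sketch, not an
exhaustive list):

  SELECT docid FROM fts_example WHERE content MATCH 'sqlite'
  SELECT docid FROM fts_example WHERE content MATCH 'toke*'
  SELECT docid FROM fts_example WHERE content MATCH '"full text"'
  SELECT docid FROM fts_example WHERE content MATCH '(perl OR sqlite) NOT java'

The last query uses the parenthesized form that the Enhanced Query
Syntax (enabled in this release) makes possible.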
There are many more details to building and searching
FTS3 tables, so we strongly invite you to read
the full documentation at L<http://www.sqlite.org/fts3.html>.

B<Incompatible change>:
starting from version 1.31, C<DBD::SQLite> uses the new, recommended
"Enhanced Query Syntax" for binary set operators (AND, OR, NOT, possibly
nested with parentheses). Previous versions of C<DBD::SQLite> used the
"Standard Query Syntax" (see L<http://www.sqlite.org/fts3.html#section_3_2>).
Unfortunately this is a compilation switch, so it cannot be changed
at runtime; however, since FTS3 was never advertised in versions prior
to 1.31, the change should be invisible to the vast majority of
C<DBD::SQLite> users. Any applications that were nevertheless built
using the "Standard Query" syntax have to be migrated; the conversion
function provided in L<DBD::SQLite::FTS3Transitional>
is there to help.

=head2 Tokenizers

The behaviour of full-text indexes strongly depends on how
documents are split into I<tokens>; therefore FTS3 table
declarations can explicitly specify how to perform
tokenization:

  CREATE ... USING fts3(<columns>, tokenize=<tokenizer>)

where C<< <tokenizer> >> is a sequence of space-separated
words that triggers a specific tokenizer, as explained below.

=head3 SQLite builtin tokenizers

SQLite comes with three builtin tokenizers (sample declarations are
shown after this list):

=over

=item simple

Under the I<simple> tokenizer, a term is a contiguous sequence of
eligible characters, where eligible characters are all alphanumeric
characters, the "_" character, and all characters with UTF codepoints
greater than or equal to 128. All other characters are discarded when
splitting a document into terms. They serve only to separate adjacent
terms.

All uppercase characters within the ASCII range (UTF codepoints less
than 128) are transformed to their lowercase equivalents as part of
the tokenization process. Thus, full-text queries are case-insensitive
when using the simple tokenizer.

=item porter

The I<porter> tokenizer uses the same rules to separate the input
document into terms, but as well as folding all terms to lower case it
uses the Porter Stemming algorithm to reduce related English language
words to a common root.

=item icu

If SQLite is compiled with the SQLITE_ENABLE_ICU
pre-processor symbol defined, then there exists a built-in tokenizer
named "icu" implemented using the ICU library, and taking an
ICU locale identifier as argument (such as "tr_TR" for
Turkish as used in Turkey, or "en_AU" for English as used in
Australia). For example:

  CREATE VIRTUAL TABLE thai_text USING fts3(text, tokenize=icu th_TH)

The ICU tokenizer implementation is very simple. It splits the input
text according to the ICU rules for finding word boundaries and
discards any tokens that consist entirely of white-space. This may be
suitable for some applications in some locales, but not all. If more
complex processing is required, for example to implement stemming or
discard punctuation, use the perl tokenizer as explained below.

=back
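To make the choice concrete, here is how one and the same table could be
declared with each builtin tokenizer (table names are illustrative; the
I<icu> variant additionally assumes an ICU-enabled SQLite build):

  CREATE VIRTUAL TABLE t_simple USING fts3(content, tokenize=simple)
  CREATE VIRTUAL TABLE t_porter USING fts3(content, tokenize=porter)
  CREATE VIRTUAL TABLE t_icu    USING fts3(content, tokenize=icu en_AU)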
=head3 Perl tokenizers

In addition to the builtin SQLite tokenizers, C<DBD::SQLite>
implements a I<perl> tokenizer, which can hook into any tokenizing
algorithm written in Perl. This is specified as follows:

  CREATE ... USING fts3(<columns>, tokenize=perl '<perl_function>')

where C<< <perl_function> >> is a fully qualified Perl function name
(i.e. prefixed by the name of the package in which that function is
declared). So for example if the function is C<my_func> in the main
program, write

  CREATE ... USING fts3(<columns>, tokenize=perl 'main::my_func')

That function should return a code reference that takes a string as
single argument, and returns an iterator (another function), which
returns a tuple C<< ($term, $len, $start, $end, $index) >> for each
term. Here is a simple example that tokenizes on words according to
the current perl locale:

  sub locale_tokenizer {
    return sub {
      my $string = shift;

      use locale;
      my $regex      = qr/\w+/;
      my $term_index = 0;

      return sub { # closure
        $string =~ /$regex/g or return; # either match, or no more token
        my ($start, $end) = ($-[0], $+[0]);
        my $len           = $end-$start;
        my $term          = substr($string, $start, $len);
        return ($term, $len, $start, $end, $term_index++);
      }
    };
  }

There must be three levels of subs, in a kind of "Russian dolls" structure
(the sketch after this list drives the three levels by hand), because:

=over

=item *

the external, named sub is called whenever accessing a FTS3 table
with that tokenizer

=item *

the inner, anonymous sub is called whenever a new string
needs to be tokenized (either for inserting new text into the table,
or for analyzing a query).

=item *

the innermost, anonymous sub is called repeatedly for retrieving
all terms within that string.

=back
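The three levels can also be exercised outside of the database, which
helps when debugging a tokenizer. A minimal sketch using the
C<locale_tokenizer> above (no FTS3 involved; the output format is
illustrative):

  my $make_cursor = locale_tokenizer();        # level 1: done by FTS3 at table access
  my $cursor = $make_cursor->("some text to tokenize"); # level 2: once per string
  while ( my ($term, $len, $start, $end, $index) = $cursor->() ) { # level 3: per term
    printf "term %d: '%s', %d chars at offsets [%d,%d)\n",
           $index, $term, $len, $start, $end;
  }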
Instead of writing tokenizers by hand, you can grab one of those
already implemented in the L<Search::Tokenizer> module:

  use Search::Tokenizer;
  $dbh->do(<<"") or die DBI::errstr;
    CREATE ... USING fts3(<columns>,
                          tokenize=perl 'Search::Tokenizer::unaccent')

or you can use L<Search::Tokenizer/new> to build
your own tokenizer.

=head2 Incomplete handling of utf8 characters

The current FTS3 implementation in SQLite is far from complete with
respect to utf8 handling: in particular, variable-length characters
are not treated correctly by the builtin functions
C<offsets()> and C<snippet()>.

=head2 Database space for FTS3

FTS3 stores a complete copy of the indexed documents, together with
the fulltext index. On a large collection of documents, this can
consume quite a lot of disk space. If copies of documents are also
available as external resources (for example files on the filesystem),
that space can sometimes be spared; see the tip in the
L<Cookbook|DBD::SQLite::Cookbook/"Sparing database disk space">.

=head1 FOR DBD::SQLITE EXTENSION AUTHORS

Since 1.30_01, you can retrieve the bundled sqlite C source and/or
lib/DBD/SQLite/Cookbook.pm

@@ -9,6 +9,8 @@ This is the L<DBD::SQLite> cookbook.
It is intended to provide a place to keep a variety of functions and
formals for use in callback APIs in L<DBD::SQLite>.

=head1 AGGREGATE FUNCTIONS

=head2 Variance

This is a simple aggregate function which returns a variance. It is

@@ -140,6 +142,35 @@ The function can then be used as:
    FROM results
    GROUP BY group_name;

=head1 FTS3 fulltext indexing

=head2 Sparing database disk space

As explained in L<http://www.sqlite.org/fts3.html#section_6>, each
FTS3 table C<I<t>> is stored internally within three regular tables
C<I<t>_content>, C<I<t>_segments> and C<I<t>_segdir>. The last two
tables contain the fulltext index. The first table C<I<t>_content>
stores the complete documents being indexed ... but if copies of the
same documents are already stored somewhere else, or can be computed
from external resources (for example as HTML or MsWord files in the
filesystem), then this is quite a waste of space. SQLite itself only
needs the C<I<t>_content> table for implementing the C<offsets()> and
C<snippet()> functions, which are not always usable anyway (in particular
when using utf8 characters greater than 255).

So an alternative strategy is to use SQLite only for the fulltext
index and metadata, and to keep the full documents outside of SQLite:
to do so, after each insert or update in the FTS3 table, do an update
in the C<I<t>_content> table, setting the content column(s) to
NULL. Of course your application will need an algorithm for finding
the external resource corresponding to any I<docid> stored within
SQLite. Furthermore, the SQLite C<offsets()> and C<snippet()> functions
cannot be used, so if such functionality is needed, it has to be
programmed directly within the Perl application.
In short, this strategy is really a hack, because FTS3 was not originally
designed with that behaviour in mind; however it is workable,
and can dramatically reduce the size of the database file.
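A minimal sketch of that strategy (assuming an FTS3 table named
C<books> with a single C<content> column; the C<books_content> shadow
table and its C<c0content> column name follow FTS3's internal naming
scheme, so check them against your own schema before relying on this):

  my $insert = $dbh->prepare("INSERT INTO books(content) VALUES(?)");
  my $forget = $dbh->prepare(
      "UPDATE books_content SET c0content = NULL WHERE docid = ?");

  $insert->execute($document_text);              # index the document
  my $docid = $dbh->last_insert_id("", "", "", "");
  $forget->execute($docid);                      # drop the stored copy, keep the index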
=head1 SUPPORT

Bugs should be reported via the CPAN bug tracker at

@@ -157,6 +188,8 @@ turn them into a separate CPAN distribution.

Adam Kennedy E<lt>adamk@cpan.orgE<gt>

Laurent Dami E<lt>dami@cpan.orgE<gt>

=head1 COPYRIGHT

Copyright 2009 Adam Kennedy.
lib/DBD/SQLite/FTS3Transitional.pm (new file, 96 lines)

package DBD::SQLite::FTS3Transitional;
use strict;
use warnings;
no warnings 'uninitialized';

use Exporter 'import';
our @EXPORT_OK = qw/fts3_convert/;


sub fts3_convert {
  my $in  = shift;
  my $out = "";

  # decompose input string into tokens
  my @tokens = $in =~ / -           # minus sign
                      | \bOR\b      # OR keyword
                      | ".*?"       # phrase query
                      | \S+         # term
                      /xg;

  # build the output string
  while (@tokens) {

    # -a => (NOT a)
    if ($tokens[0] eq '-') {
      my (undef, $right) = splice(@tokens, 0, 2);
      $out .= " (NOT $right)";
    }

    # a OR b => (a OR b)
    elsif (@tokens >= 2 && $tokens[1] eq 'OR') {
      my ($left, undef, $right) = splice(@tokens, 0, 3);
      if ($right eq '-') {
        $right = "NOT " . shift @tokens;
      }
      $out .= " ($left OR $right)";
    }

    # plain term
    else {
      $out .= " " . shift @tokens;
    }
  }

  return $out;
}

1;

__END__

=head1 NAME

DBD::SQLite::FTS3Transitional - helper function for migrating FTS3 applications

=head1 SYNOPSIS

  use DBD::SQLite::FTS3Transitional qw/fts3_convert/;
  my $new_match_syntax = fts3_convert($old_match_syntax);
  my $sql = "SELECT ... FROM ... WHERE col MATCH $new_match_syntax";

=head1 DESCRIPTION

Starting from version 1.31, C<DBD::SQLite> uses the new, recommended
"Enhanced Query Syntax" for binary set operators in fulltext FTS3 queries
(AND, OR, NOT, possibly nested with parentheses).

Previous versions of C<DBD::SQLite> used the
"Standard Query Syntax" (see L<http://www.sqlite.org/fts3.html#section_3_2>).

This module helps convert SQLite applications built with the old
"Standard" query syntax to the new "Enhanced" syntax.

=head1 FUNCTIONS

=head2 fts3_convert

Takes as input a string for the MATCH clause in a FTS3 fulltext search;
returns the same clause rewritten in the new "Enhanced" syntax.
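For instance (input/output pairs taken from the test suite; note that
the returned string carries a leading space, which can simply be
stripped):

  my $new = fts3_convert('foo -bar');            # yields ' foo (NOT bar)'
  $new    = fts3_convert('foo bar OR bie buz');  # yields ' foo (bar OR bie) buz'
  $new =~ s/^\s+//;                              # trim the leading space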
=head1 AUTHOR

Laurent Dami E<lt>dami@cpan.orgE<gt>

=head1 COPYRIGHT

Copyright 2010 Laurent Dami.

This program is free software; you can redistribute
it and/or modify it under the same terms as Perl itself.

The full text of the license can be found in the
LICENSE file included with this module.

=cut
t/43_fts3.t (new file, 103 lines)

#!/usr/bin/perl

use strict;
BEGIN {
    $|  = 1;
    $^W = 1;
}

use t::lib::Test qw/connect_ok/;
use Test::More;

my @texts = ("il était une bergère",
             "qui gardait ses moutons",
             "elle fit un fromage",
             "du lait de ses moutons");

my @tests = (
  # query => expected results
  ["bergère"              => 0    ],
  ["berg*"                => 0    ],
  ["foobar"                       ],
  ["moutons"              => 1, 3 ],
  ['"qui gardait"'        => 1    ],
  ["moutons NOT lait"     => 1    ],
  ["il était"             => 0    ],
  ["(il OR elle) AND un*" => 0, 2 ],
);

BEGIN {
    if ($] < 5.008005) {
        plan skip_all => 'Unicode is not supported before 5.8.5';
    }
}
use Test::NoWarnings;

plan tests => 2 * (1 + @tests) + 1;

BEGIN {
    # Sadly perl for windows (and probably sqlite, too) may hang
    # if the system locale doesn't support european languages.
    # en-us should be a safe default. if it doesn't work, use 'C'.
    if ( $^O eq 'MSWin32') {
        use POSIX 'locale_h';
        setlocale(LC_COLLATE, 'en-us');
    }
}
use locale;


sub locale_tokenizer { # see also: Search::Tokenizer
    return sub {
        my $string = shift;

        my $regex      = qr/\w+/;
        my $term_index = 0;

        return sub {
            $string =~ /$regex/g or return; # either match, or no more token
            my ($start, $end) = ($-[0], $+[0]);
            my $term = substr($string, $start, my $len = $end-$start);
            return ($term, $len, $start, $end, $term_index++);
        };
    };
}

use DBD::SQLite;

for my $use_unicode (0, 1) {

    # connect
    my $dbh = connect_ok( RaiseError => 1, sqlite_unicode => $use_unicode );

    # create fts3 table
    $dbh->do(<<"") or die DBI::errstr;
        CREATE VIRTUAL TABLE try_fts3
              USING fts3(content, tokenize=perl 'main::locale_tokenizer')

    # populate it
    my $insert_sth = $dbh->prepare(<<"") or die DBI::errstr;
        INSERT INTO try_fts3(content) VALUES(?)

    my @doc_ids;
    for (my $i = 0; $i < @texts; $i++) {
        $insert_sth->execute($texts[$i]);
        $doc_ids[$i] = $dbh->last_insert_id("", "", "", "");
    }

    # queries
    my $sql = "SELECT docid FROM try_fts3 WHERE content MATCH ?";
    for my $t (@tests) {
        my ($query, @expected) = @$t;
        @expected = map {$doc_ids[$_]} @expected;
        my $results = $dbh->selectcol_arrayref($sql, undef, $query);
        is_deeply($results, \@expected, "$query (unicode is $use_unicode)");
    }
}
t/44_fts3_transitional.t (new file, 34 lines)

#!/usr/bin/perl

use strict;
BEGIN {
    $|  = 1;
    $^W = 1;
}

use Test::More;
use Test::NoWarnings;

my @tests = (
  ['foo bar'              => 'foo bar'                  ],
  ['foo -bar'             => 'foo (NOT bar)'            ],
  ['foo* -bar*'           => 'foo* (NOT bar*)'          ],
  ['foo bar OR bie buz'   => 'foo (bar OR bie) buz'     ],
  ['-foo bar OR -bie buz' => '(NOT foo) (bar OR NOT bie) buz'],
  ['"kyrie eleison" OR "christe eleison"'
                          => '("kyrie eleison" OR "christe eleison")'],
);


plan tests => 1 + @tests;

use DBD::SQLite::FTS3Transitional qw/fts3_convert/;

foreach my $t (@tests) {
    my ($old_syntax, $expected_new) = @$t;
    my $new = fts3_convert($old_syntax);
    $new =~ s/^\s+//;
    is($new, $expected_new, $old_syntax);
}