mirror of https://github.com/DBD-SQLite/DBD-SQLite
synced 2025-06-07 14:19:10 -04:00

Added support for FTS3 fulltext searches: perl tokenizers, documentation and tests

parent 413bd0ac9d
commit d43cf63ad0

12 changed files with 988 additions and 3 deletions
Changes (5 changed lines)

@@ -1,6 +1,11 @@
Changes for Perl extension DBD-SQLite

1.30_04 to be released
        - Added support for FTS3 tokenizers written in Perl. Added tests
          and documentation on how to use FTS3. Changed the compilation flag
          to use the recommended -DSQLITE_ENABLE_FTS3_PARENTHESIS
          *** MAY POSSIBLY BREAK OLD APPLICATIONS THAT ALREADY USED FTS3 ***
          (DAMI)
        - Fixed various backward compatibility issues back to SQLite 3.6.1
          (ISHIGAKI)
        - Resolved #58332: Documentation error for preventing fsync
Makefile.PL (10 changed lines)

@@ -212,8 +212,14 @@ if ( $sqlite_inc ) {
my @CC_DEFINE = (
        # '-DSQLITE_CORE',
        '-DSQLITE_ENABLE_FTS3',
        # L. Dami 10.07.2010 : now enabling new FTS3 syntax, because
        # that's the recommendation from SQLite for new applications
        # (used to be "Disabled until we have a test for this").
        # This change MAY POSSIBLY BREAK OLD APPLICATIONS THAT ALREADY
        # USED FTS3 ... but sooner or later that change had to be done!
        '-DSQLITE_ENABLE_FTS3_PARENTHESIS', # for sqlite >= 3.6.10
        '-DSQLITE_ENABLE_COLUMN_METADATA',
        '-DNDEBUG=1',
);
SQLite.xs (16 changed lines)

@@ -196,6 +196,22 @@ backup_to_file(dbh, filename)
    OUTPUT:
        RETVAL

static int
register_fts3_perl_tokenizer(dbh)
    SV *dbh
    ALIAS:
        DBD::SQLite::db::sqlite_register_fts3_perl_tokenizer = 1
    CODE:
        RETVAL = sqlite_db_register_fts3_perl_tokenizer(aTHX_ dbh);
    OUTPUT:
        RETVAL

MODULE = DBD::SQLite          PACKAGE = DBD::SQLite::st

PROTOTYPES: DISABLE

@@ -19,5 +19,6 @@
#include <dbd_xsh.h>

#include "sqlite3.h"
#include "fts3_tokenizer.h"

#endif
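The new XS function is exposed as a private C<$dbh> method. A minimal sketch of
invoking it explicitly -- normally unnecessary, since C<connect()> registers the
tokenizer itself, as the lib/DBD/SQLite.pm hunks further below show:

  # assuming an already-connected $dbh
  $dbh->sqlite_register_fts3_perl_tokenizer();   # NEWAPI style
  # or, through the old-style func() interface:
  $dbh->func( "register_fts3_perl_tokenizer" );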
dbdimp.c (302 changed lines)

@@ -20,6 +20,14 @@ DBISTATE_DECLARE;
#define croak_if_stmt_is_null()
#endif

/*-----------------------------------------------------*
 * Globals
 *-----------------------------------------------------*/
imp_dbh_t *last_executed_dbh;   /* needed by perl_tokenizer
                                   to know if unicode is on/off */

/*-----------------------------------------------------*
 * Helper Methods
 *-----------------------------------------------------*/

@@ -487,6 +495,298 @@ sqlite_db_last_insert_id(SV *dbh, imp_dbh_t *imp_dbh, SV *catalog, SV *schema, S
    return newSViv((IV)sqlite3_last_insert_rowid(imp_dbh->db));
}

/* ======================================================================
 * EXPERIMENTAL bindings for FTS3 TOKENIZERS
 * ====================================================================== */

typedef struct perl_tokenizer {
    sqlite3_tokenizer base;
    SV *coderef;                 /* the perl tokenizer is a coderef that takes
                                    a string and returns a cursor coderef */
} perl_tokenizer;

typedef struct perl_tokenizer_cursor {
    sqlite3_tokenizer_cursor base;
    SV *coderef;                 /* ref to the closure that returns terms */
    char *pToken;                /* storage for a copy of the last token */
    int nTokenAllocated;         /* space allocated to pToken buffer */

    /* members below are only used if the input string is in utf8 */
    const char *pInput;          /* input we are tokenizing */
    const char *lastByteOffset;  /* offset into pInput */
    int lastCharOffset;          /* char offset corresponding to lastByteOffset */
} perl_tokenizer_cursor;

/*
** Create a new tokenizer instance.
** Will be called whenever a FTS3 table is created with
**   CREATE .. USING fts3( ... , tokenize=perl qualified::function::name)
** where qualified::function::name is a fully qualified perl function
*/
static int perl_tokenizer_Create(
    int argc, const char * const *argv,
    sqlite3_tokenizer **ppTokenizer
){
    perl_tokenizer *t;
    t = (perl_tokenizer *) sqlite3_malloc(sizeof(*t));
    if( t==NULL ) return SQLITE_NOMEM;
    memset(t, 0, sizeof(*t));

    dTHX;
    dSP;

    ENTER;
    SAVETMPS;

    /* call the qualified::function::name */
    PUSHMARK(SP);
    PUTBACK;
    int n_retval = call_pv(argv[0], G_SCALAR);
    SPAGAIN;

    /* store a copy of the returned coderef into the tokenizer structure */
    if (n_retval != 1) {
        warn("tokenizer_Create returned %d arguments", n_retval);
    }
    SV *retval   = POPs;
    t->coderef   = newSVsv(retval);
    *ppTokenizer = &t->base;

    PUTBACK;
    FREETMPS;
    LEAVE;

    return SQLITE_OK;
}

/*
** Destroy a tokenizer
*/
static int perl_tokenizer_Destroy(sqlite3_tokenizer *pTokenizer){
    dTHX;
    perl_tokenizer *t = (perl_tokenizer *) pTokenizer;
    sv_free(t->coderef);
    sqlite3_free(t);
    return SQLITE_OK;
}

/*
** Prepare to begin tokenizing a particular string.  The input
** string to be tokenized is supposed to be pInput[0..nBytes-1] ..
** except that nBytes passed by fts3 is -1 (don't know why) !
** This is passed to the tokenizer instance, which then returns a
** closure implementing the cursor (so the cursor is again a coderef).
*/
static int perl_tokenizer_Open(
    sqlite3_tokenizer *pTokenizer,       /* Tokenizer object */
    const char *pInput, int nBytes,      /* Input buffer */
    sqlite3_tokenizer_cursor **ppCursor  /* OUT: Created tokenizer cursor */
){
    perl_tokenizer *t = (perl_tokenizer *)pTokenizer;

    /* allocate and initialize the cursor struct */
    perl_tokenizer_cursor *c;
    c = (perl_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
    memset(c, 0, sizeof(*c));
    *ppCursor = &c->base;

    /* flags for creating the Perl SV containing the input string */
    U32 flags = SVs_TEMP; /* will call sv_2mortal */

    /* special handling if working with utf8 strings */
    if (last_executed_dbh->unicode) { /* global var ... no better way ! */

        /* data to keep track of byte offsets */
        c->lastByteOffset = c->pInput = pInput;
        c->lastCharOffset = 0;

        /* string passed to Perl needs to be flagged as utf8 */
        flags |= SVf_UTF8;
    }

    dTHX;
    dSP;
    ENTER;
    SAVETMPS;

    /* build a Perl copy of the input string */
    if (nBytes < 0) { /* we get -1 from fts3. Don't know why ! */
        nBytes = strlen(pInput);
    }
    SV *perl_string = newSVpvn_flags(pInput, nBytes, flags);

    /* call the tokenizer coderef */
    PUSHMARK(SP);
    XPUSHs(perl_string);
    PUTBACK;
    int n_retval = call_sv(t->coderef, G_SCALAR);
    SPAGAIN;

    /* store the cursor coderef returned by the tokenizer */
    if (n_retval != 1) {
        warn("tokenizer returned %d arguments", n_retval);
    }
    c->coderef = newSVsv(POPs);

    PUTBACK;
    FREETMPS;
    LEAVE;
    return SQLITE_OK;
}

/*
** Close a tokenization cursor previously opened by a call to
** perl_tokenizer_Open() above.
*/
static int perl_tokenizer_Close(sqlite3_tokenizer_cursor *pCursor){
    perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor;

    dTHX;
    sv_free(c->coderef);
    sqlite3_free(c);
    return SQLITE_OK;
}

/*
** Extract the next token from a tokenization cursor.  The cursor must
** have been opened by a prior call to perl_tokenizer_Open().
*/
static int perl_tokenizer_Next(
    sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by perl_tokenizer_Open */
    const char **ppToken,               /* OUT: *ppToken is the token text */
    int *pnBytes,                       /* OUT: Number of bytes in token */
    int *piStartOffset,                 /* OUT: Starting offset of token */
    int *piEndOffset,                   /* OUT: Ending offset of token */
    int *piPosition                     /* OUT: Position integer of token */
){
    perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor;
    int result;

    dTHX;
    dSP;

    ENTER;
    SAVETMPS;

    /* call the cursor */
    PUSHMARK(SP);
    PUTBACK;
    int n_retval = call_sv(c->coderef, G_ARRAY);
    SPAGAIN;

    /* if we get back an empty list, there is no more token */
    if (n_retval == 0) {
        result = SQLITE_DONE;
    }
    /* otherwise, get token details from the return list */
    else {
        if (n_retval != 5) {
            warn("tokenizer cursor returned %d arguments", n_retval);
        }
        *piPosition    = POPi;
        *piEndOffset   = POPi;
        *piStartOffset = POPi;
        *pnBytes       = POPi;
        char *token    = POPpx;

        if (c->pInput) { /* if working with utf8 data */

            /* recompute *pnBytes in bytes, not in chars */
            *pnBytes = strlen(token);

            /* recompute start/end offsets in bytes, not in chars */
            I32 hop          = *piStartOffset - c->lastCharOffset;
            char *byteOffset = utf8_hop(c->lastByteOffset, hop);
            hop              = *piEndOffset - *piStartOffset;
            *piStartOffset   = byteOffset - c->pInput;
            byteOffset       = utf8_hop(byteOffset, hop);
            *piEndOffset     = byteOffset - c->pInput;

            /* remember where we are for next round */
            c->lastCharOffset = *piEndOffset;
            c->lastByteOffset = byteOffset;
        }

        /* make sure we have enough storage for copying the token */
        if (*pnBytes > c->nTokenAllocated ){
            char *pNew;
            c->nTokenAllocated = *pnBytes + 20;
            pNew = sqlite3_realloc(c->pToken, c->nTokenAllocated);
            if( !pNew ) return SQLITE_NOMEM;
            c->pToken = pNew;
        }

        /* need to copy the token into the C cursor before perl frees that
           memory */
        memcpy(c->pToken, token, *pnBytes);
        *ppToken = c->pToken;

        result = SQLITE_OK;
    }

    PUTBACK;
    FREETMPS;
    LEAVE;

    return result;
}

/*
** The set of routines that implement the perl tokenizer
*/
sqlite3_tokenizer_module perl_tokenizer_Module = {
    0,
    perl_tokenizer_Create,
    perl_tokenizer_Destroy,
    perl_tokenizer_Open,
    perl_tokenizer_Close,
    perl_tokenizer_Next
};

/*
** Register the perl tokenizer with FTS3
*/
int sqlite_db_register_fts3_perl_tokenizer(pTHX_ SV *dbh)
{
    D_imp_dbh(dbh);

    int rc;
    sqlite3_stmt *pStmt;
    const char zSql[] = "SELECT fts3_tokenizer(?, ?)";
    sqlite3_tokenizer_module *p = &perl_tokenizer_Module;

    rc = sqlite3_prepare_v2(imp_dbh->db, zSql, -1, &pStmt, 0);
    if( rc!=SQLITE_OK ){
        return rc;
    }

    sqlite3_bind_text(pStmt, 1, "perl", -1, SQLITE_STATIC);
    sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
    sqlite3_step(pStmt);

    return sqlite3_finalize(pStmt);
}

/* ======================================================================
 * END # EXPERIMENTAL bindings for FTS3 TOKENIZERS
 * ====================================================================== */

int
sqlite_st_prepare(SV *sth, imp_sth_t *imp_sth, char *statement, SV *attribs)
{

@@ -566,6 +866,8 @@ sqlite_st_execute(SV *sth, imp_sth_t *imp_sth)
    croak_if_db_is_null();
    croak_if_stmt_is_null();

    last_executed_dbh = imp_dbh;

    /* COMPAT: sqlite3_sql is only available for 3006000 or newer */
    sqlite_trace(sth, imp_sth, 3, form("executing %s", sqlite3_sql(imp_sth->stmt)));
dbdimp.h (2 changed lines)

@@ -100,6 +100,8 @@ SV* sqlite_db_update_hook( pTHX_ SV *dbh, SV *hook );
int sqlite_db_set_authorizer( pTHX_ SV *dbh, SV *authorizer );
AV* sqlite_compile_options();

int sqlite_db_register_fts3_perl_tokenizer(pTHX_ SV *dbh);

#ifdef SvUTF8_on

static SV *
fts3_tokenizer.h (new file, 154 lines)

/************** Begin file fts3_tokenizer.h **********************************/
/*
** 2006 July 10
**
** The author disclaims copyright to this source code.
**
*************************************************************************
** Defines the interface to tokenizers used by fulltext-search.  There
** are three basic components:
**
** sqlite3_tokenizer_module is a singleton defining the tokenizer
** interface functions.  This is essentially the class structure for
** tokenizers.
**
** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
** including customization information defined at creation time.
**
** sqlite3_tokenizer_cursor is generated by a tokenizer to generate
** tokens from a particular input.
*/
#ifndef _FTS3_TOKENIZER_H_
#define _FTS3_TOKENIZER_H_

/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
** If tokenizers are to be allowed to call sqlite3_*() functions, then
** we will need a way to register the API consistently.
*/

/*
** Structures used by the tokenizer interface. When a new tokenizer
** implementation is registered, the caller provides a pointer to
** an sqlite3_tokenizer_module containing pointers to the callback
** functions that make up an implementation.
**
** When an fts3 table is created, it passes any arguments passed to
** the tokenizer clause of the CREATE VIRTUAL TABLE statement to the
** sqlite3_tokenizer_module.xCreate() function of the requested tokenizer
** implementation. The xCreate() function in turn returns an
** sqlite3_tokenizer structure representing the specific tokenizer to
** be used for the fts3 table (customized by the tokenizer clause arguments).
**
** To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen()
** method is called. It returns an sqlite3_tokenizer_cursor object
** that may be used to tokenize a specific input buffer based on
** the tokenization rules supplied by a specific sqlite3_tokenizer
** object.
*/
typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
typedef struct sqlite3_tokenizer sqlite3_tokenizer;
typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;

struct sqlite3_tokenizer_module {

  /*
  ** Structure version. Should always be set to 0.
  */
  int iVersion;

  /*
  ** Create a new tokenizer. The values in the argv[] array are the
  ** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL
  ** TABLE statement that created the fts3 table. For example, if
  ** the following SQL is executed:
  **
  **   CREATE .. USING fts3( ... , tokenizer <tokenizer-name> arg1 arg2)
  **
  ** then argc is set to 2, and the argv[] array contains pointers
  ** to the strings "arg1" and "arg2".
  **
  ** This method should return either SQLITE_OK (0), or an SQLite error
  ** code. If SQLITE_OK is returned, then *ppTokenizer should be set
  ** to point at the newly created tokenizer structure. The generic
  ** sqlite3_tokenizer.pModule variable should not be initialised by
  ** this callback. The caller will do so.
  */
  int (*xCreate)(
    int argc,                           /* Size of argv array */
    const char *const*argv,             /* Tokenizer argument strings */
    sqlite3_tokenizer **ppTokenizer     /* OUT: Created tokenizer */
  );

  /*
  ** Destroy an existing tokenizer. The fts3 module calls this method
  ** exactly once for each successful call to xCreate().
  */
  int (*xDestroy)(sqlite3_tokenizer *pTokenizer);

  /*
  ** Create a tokenizer cursor to tokenize an input buffer. The caller
  ** is responsible for ensuring that the input buffer remains valid
  ** until the cursor is closed (using the xClose() method).
  */
  int (*xOpen)(
    sqlite3_tokenizer *pTokenizer,       /* Tokenizer object */
    const char *pInput, int nBytes,      /* Input buffer */
    sqlite3_tokenizer_cursor **ppCursor  /* OUT: Created tokenizer cursor */
  );

  /*
  ** Destroy an existing tokenizer cursor. The fts3 module calls this
  ** method exactly once for each successful call to xOpen().
  */
  int (*xClose)(sqlite3_tokenizer_cursor *pCursor);

  /*
  ** Retrieve the next token from the tokenizer cursor pCursor. This
  ** method should either return SQLITE_OK and set the values of the
  ** "OUT" variables identified below, or SQLITE_DONE to indicate that
  ** the end of the buffer has been reached, or an SQLite error code.
  **
  ** *ppToken should be set to point at a buffer containing the
  ** normalized version of the token (i.e. after any case-folding and/or
  ** stemming has been performed). *pnBytes should be set to the length
  ** of this buffer in bytes. The input text that generated the token is
  ** identified by the byte offsets returned in *piStartOffset and
  ** *piEndOffset. *piStartOffset should be set to the index of the first
  ** byte of the token in the input buffer. *piEndOffset should be set
  ** to the index of the first byte just past the end of the token in
  ** the input buffer.
  **
  ** The buffer *ppToken is set to point at is managed by the tokenizer
  ** implementation. It is only required to be valid until the next call
  ** to xNext() or xClose().
  */
  /* TODO(shess) current implementation requires pInput to be
  ** nul-terminated.  This should either be fixed, or pInput/nBytes
  ** should be converted to zInput.
  */
  int (*xNext)(
    sqlite3_tokenizer_cursor *pCursor,   /* Tokenizer cursor */
    const char **ppToken, int *pnBytes,  /* OUT: Normalized text for token */
    int *piStartOffset,  /* OUT: Byte offset of token in input buffer */
    int *piEndOffset,    /* OUT: Byte offset of end of token in input buffer */
    int *piPosition      /* OUT: Number of tokens returned before this one */
  );
};

struct sqlite3_tokenizer {
  const sqlite3_tokenizer_module *pModule;  /* The module for this tokenizer */
  /* Tokenizer implementations will typically add additional fields */
};

struct sqlite3_tokenizer_cursor {
  sqlite3_tokenizer *pTokenizer;       /* Tokenizer for this cursor. */
  /* Tokenizer implementations will typically add additional fields */
};

int fts3_global_term_cnt(int iTerm, int iCol);
int fts3_term_cnt(int iTerm, int iCol);

#endif /* _FTS3_TOKENIZER_H_ */

/************** End of fts3_tokenizer.h **************************************/
lib/DBD/SQLite.pm

@@ -55,6 +55,7 @@ sub driver {
        DBD::SQLite::db->install_method('sqlite_backup_from_file');
        DBD::SQLite::db->install_method('sqlite_backup_to_file');
        DBD::SQLite::db->install_method('sqlite_enable_load_extension');
        DBD::SQLite::db->install_method('sqlite_register_fts3_perl_tokenizer');
        $methods_are_installed++;
    }

@@ -71,6 +72,7 @@ sub CLONE {
    undef $drh;
}


package DBD::SQLite::dr;

sub connect {

@@ -120,13 +122,16 @@ sub connect {
    # Hand off to the actual login function
    DBD::SQLite::db::_login($dbh, $real, $user, $auth, $attr) or return undef;

    # Register the on-demand collation installer, REGEXP function and
    # perl tokenizer
    if ( DBD::SQLite::NEWAPI ) {
        $dbh->sqlite_collation_needed( \&install_collation );
        $dbh->sqlite_create_function( "REGEXP", 2, \&regexp );
        $dbh->sqlite_register_fts3_perl_tokenizer();
    } else {
        $dbh->func( \&install_collation, "collation_needed"  );
        $dbh->func( "REGEXP", 2, \&regexp, "create_function" );
        $dbh->func( "register_fts3_perl_tokenizer" );
    }

    # HACK: Since PrintWarn = 0 doesn't seem to actually prevent warnings
@@ -1645,6 +1650,234 @@ I<requests> for collations. In other words, if you want to change
the behaviour of a collation within an existing C<$dbh>, you
need to call the L</create_collation> method directly.

=head1 FULLTEXT SEARCH

The FTS3 extension module within SQLite allows users to create special
tables with a built-in full-text index (hereafter "FTS3 tables"). The
full-text index allows the user to efficiently query the database for
all rows that contain one or more instances of a specified word (hereafter
a "token"), even if the table contains many large documents.

=head2 Short introduction to FTS3

The detailed documentation for FTS3 can be found
at L<http://www.sqlite.org/fts3.html>. Here is a very short example:

  $dbh->do(<<"") or die DBI::errstr;
     CREATE VIRTUAL TABLE fts_example USING fts3(content)

  my $sth = $dbh->prepare("INSERT INTO fts_example(content) VALUES (?)");
  $sth->execute($_) foreach @docs_to_insert;

  my $results = $dbh->selectall_arrayref(<<"");
    SELECT docid, snippet(content) FROM fts_example WHERE content MATCH 'foo'

The key points in this example are:

=over

=item *

The syntax for creating FTS3 tables is

  CREATE VIRTUAL TABLE <table_name> USING fts3(<columns>)

where C<< <columns> >> is a list of column names. Columns may be
typed, but the type information is ignored. If no columns
are specified, the default is a single column named C<content>.
In addition, FTS3 tables have an implicit column called C<docid>
(also accessible as C<rowid>) for numbering the stored documents.

=item *

Statements for inserting, updating or deleting records
use the same syntax as for regular SQLite tables.

=item *

Full-text searches are specified with the C<MATCH> operator, and an
operand which may be a single word, a word prefix ending with '*', a
list of words, a "phrase query" in double quotes, or a boolean combination
of the above (a few sample queries are shown just after this list).

=item *

The builtin function C<snippet(...)> builds a formatted excerpt of the
document text, where the words pertaining to the query are highlighted.

=back
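For instance, assuming the C<fts_example> table above, the following
queries illustrate the various operand forms (a quick sketch, not an
exhaustive list):

  SELECT docid FROM fts_example WHERE content MATCH 'sqlite'
  SELECT docid FROM fts_example WHERE content MATCH 'toke*'
  SELECT docid FROM fts_example WHERE content MATCH '"full text"'
  SELECT docid FROM fts_example WHERE content MATCH '(perl OR sqlite) NOT java'

The last query uses the parenthesized form that the Enhanced Query
Syntax (enabled in this release) makes possible.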
There are many more details to building and searching
FTS3 tables, so we strongly invite you to read
the full documentation at L<http://www.sqlite.org/fts3.html>.

B<Incompatible change>:
starting from version 1.31, C<DBD::SQLite> uses the new, recommended
"Enhanced Query Syntax" for binary set operators (AND, OR, NOT, possibly
nested with parentheses). Previous versions of C<DBD::SQLite> used the
"Standard Query Syntax" (see L<http://www.sqlite.org/fts3.html#section_3_2>).
Unfortunately this is a compilation switch, so it cannot be changed
at runtime; however, since FTS3 was never advertised in versions prior
to 1.31, the change should be invisible to the vast majority of
C<DBD::SQLite> users. Any applications that were nevertheless built
using the "Standard Query" syntax have to be migrated; the conversion
function provided in L<DBD::SQLite::FTS3Transitional>
is there to help.

=head2 Tokenizers

The behaviour of full-text indexes strongly depends on how
documents are split into I<tokens>; therefore FTS3 table
declarations can explicitly specify how to perform
tokenization:

  CREATE ... USING fts3(<columns>, tokenize=<tokenizer>)

where C<< <tokenizer> >> is a sequence of space-separated
words that triggers a specific tokenizer, as explained below.

=head3 SQLite builtin tokenizers

SQLite comes with three builtin tokenizers (sample declarations are
shown after this list):

=over

=item simple

Under the I<simple> tokenizer, a term is a contiguous sequence of
eligible characters, where eligible characters are all alphanumeric
characters, the "_" character, and all characters with UTF codepoints
greater than or equal to 128. All other characters are discarded when
splitting a document into terms. They serve only to separate adjacent
terms.

All uppercase characters within the ASCII range (UTF codepoints less
than 128) are transformed to their lowercase equivalents as part of
the tokenization process. Thus, full-text queries are case-insensitive
when using the simple tokenizer.

=item porter

The I<porter> tokenizer uses the same rules to separate the input
document into terms, but as well as folding all terms to lower case it
uses the Porter Stemming algorithm to reduce related English language
words to a common root.

=item icu

If SQLite is compiled with the SQLITE_ENABLE_ICU
pre-processor symbol defined, then there exists a built-in tokenizer
named "icu" implemented using the ICU library, and taking an
ICU locale identifier as argument (such as "tr_TR" for
Turkish as used in Turkey, or "en_AU" for English as used in
Australia). For example:

  CREATE VIRTUAL TABLE thai_text USING fts3(text, tokenize=icu th_TH)

The ICU tokenizer implementation is very simple. It splits the input
text according to the ICU rules for finding word boundaries and
discards any tokens that consist entirely of white-space. This may be
suitable for some applications in some locales, but not all. If more
complex processing is required, for example to implement stemming or
discard punctuation, use the perl tokenizer as explained below.

=back
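To make the choice concrete, here is how one and the same table could be
declared with each builtin tokenizer (table names are illustrative; the
I<icu> variant additionally assumes an ICU-enabled SQLite build):

  CREATE VIRTUAL TABLE t_simple USING fts3(content, tokenize=simple)
  CREATE VIRTUAL TABLE t_porter USING fts3(content, tokenize=porter)
  CREATE VIRTUAL TABLE t_icu    USING fts3(content, tokenize=icu en_AU)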
=head3 Perl tokenizers

In addition to the builtin SQLite tokenizers, C<DBD::SQLite>
implements a I<perl> tokenizer, which can hook into any tokenizing
algorithm written in Perl. This is specified as follows:

  CREATE ... USING fts3(<columns>, tokenize=perl '<perl_function>')

where C<< <perl_function> >> is a fully qualified Perl function name
(i.e. prefixed by the name of the package in which that function is
declared). So for example if the function is C<my_func> in the main
program, write

  CREATE ... USING fts3(<columns>, tokenize=perl 'main::my_func')

That function should return a code reference that takes a string as
single argument, and returns an iterator (another function), which
returns a tuple C<< ($term, $len, $start, $end, $index) >> for each
term. Here is a simple example that tokenizes on words according to
the current perl locale:

  sub locale_tokenizer {
    return sub {
      my $string = shift;

      use locale;
      my $regex      = qr/\w+/;
      my $term_index = 0;

      return sub { # closure
        $string =~ /$regex/g or return; # either match, or no more token
        my ($start, $end) = ($-[0], $+[0]);
        my $len           = $end-$start;
        my $term          = substr($string, $start, $len);
        return ($term, $len, $start, $end, $term_index++);
      }
    };
  }

There must be three levels of subs, in a kind of "Russian dolls" structure
(the sketch after this list drives the three levels by hand), because:

=over

=item *

the external, named sub is called whenever accessing a FTS3 table
with that tokenizer

=item *

the inner, anonymous sub is called whenever a new string
needs to be tokenized (either for inserting new text into the table,
or for analyzing a query).

=item *

the innermost, anonymous sub is called repeatedly for retrieving
all terms within that string.

=back
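The three levels can also be exercised outside of the database, which
helps when debugging a tokenizer. A minimal sketch using the
C<locale_tokenizer> above (no FTS3 involved; the output format is
illustrative):

  my $make_cursor = locale_tokenizer();        # level 1: done by FTS3 at table access
  my $cursor = $make_cursor->("some text to tokenize"); # level 2: once per string
  while ( my ($term, $len, $start, $end, $index) = $cursor->() ) { # level 3: per term
    printf "term %d: '%s', %d chars at offsets [%d,%d)\n",
           $index, $term, $len, $start, $end;
  }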
Instead of writing tokenizers by hand, you can grab one of those
already implemented in the L<Search::Tokenizer> module:

  use Search::Tokenizer;
  $dbh->do(<<"") or die DBI::errstr;
    CREATE ... USING fts3(<columns>,
                          tokenize=perl 'Search::Tokenizer::unaccent')

or you can use L<Search::Tokenizer/new> to build
your own tokenizer.

=head2 Incomplete handling of utf8 characters

The current FTS3 implementation in SQLite is far from complete with
respect to utf8 handling: in particular, variable-length characters
are not treated correctly by the builtin functions
C<offsets()> and C<snippet()>.

=head2 Database space for FTS3

FTS3 stores a complete copy of the indexed documents, together with
the fulltext index. On a large collection of documents, this can
consume quite a lot of disk space. If copies of documents are also
available as external resources (for example files on the filesystem),
that space can sometimes be spared; see the tip in the
L<Cookbook|DBD::SQLite::Cookbook/"Sparing database disk space">.

=head1 FOR DBD::SQLITE EXTENSION AUTHORS

Since 1.30_01, you can retrieve the bundled sqlite C source and/or
lib/DBD/SQLite/Cookbook.pm

@@ -9,6 +9,8 @@ This is the L<DBD::SQLite> cookbook.
It is intended to provide a place to keep a variety of functions and
formals for use in callback APIs in L<DBD::SQLite>.

=head1 AGGREGATE FUNCTIONS

=head2 Variance

This is a simple aggregate function which returns a variance. It is

@@ -140,6 +142,35 @@ The function can then be used as:
    FROM results
    GROUP BY group_name;

=head1 FTS3 fulltext indexing

=head2 Sparing database disk space

As explained in L<http://www.sqlite.org/fts3.html#section_6>, each
FTS3 table C<I<t>> is stored internally within three regular tables
C<I<t>_content>, C<I<t>_segments> and C<I<t>_segdir>. The last two
tables contain the fulltext index. The first table C<I<t>_content>
stores the complete documents being indexed ... but if copies of the
same documents are already stored somewhere else, or can be computed
from external resources (for example as HTML or MsWord files in the
filesystem), then this is quite a waste of space. SQLite itself only
needs the C<I<t>_content> table for implementing the C<offsets()> and
C<snippet()> functions, which are not always usable anyway (in particular
when using utf8 characters greater than 255).

So an alternative strategy is to use SQLite only for the fulltext
index and metadata, and to keep the full documents outside of SQLite:
to do so, after each insert or update in the FTS3 table, do an update
in the C<I<t>_content> table, setting the content column(s) to
NULL. Of course your application will need an algorithm for finding
the external resource corresponding to any I<docid> stored within
SQLite. Furthermore, the SQLite C<offsets()> and C<snippet()> functions
cannot be used, so if such functionality is needed, it has to be
programmed directly within the Perl application.
In short, this strategy is really a hack, because FTS3 was not originally
designed with that behaviour in mind; however it is workable,
and can dramatically reduce the size of the database file.
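A minimal sketch of that strategy (assuming an FTS3 table named
C<books> with a single C<content> column; the C<books_content> shadow
table and its C<c0content> column name follow FTS3's internal naming
scheme, so check them against your own schema before relying on this):

  my $insert = $dbh->prepare("INSERT INTO books(content) VALUES(?)");
  my $forget = $dbh->prepare(
      "UPDATE books_content SET c0content = NULL WHERE docid = ?");

  $insert->execute($document_text);              # index the document
  my $docid = $dbh->last_insert_id("", "", "", "");
  $forget->execute($docid);                      # drop the stored copy, keep the index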
=head1 SUPPORT

Bugs should be reported via the CPAN bug tracker at

@@ -157,6 +188,8 @@ turn them into a separate CPAN distribution.

Adam Kennedy E<lt>adamk@cpan.orgE<gt>

Laurent Dami E<lt>dami@cpan.orgE<gt>

=head1 COPYRIGHT

Copyright 2009 Adam Kennedy.
lib/DBD/SQLite/FTS3Transitional.pm (new file, 96 lines)

package DBD::SQLite::FTS3Transitional;
use strict;
use warnings;
no warnings 'uninitialized';

use Exporter 'import';
our @EXPORT_OK = qw/fts3_convert/;


sub fts3_convert {
  my $in  = shift;
  my $out = "";

  # decompose input string into tokens
  my @tokens = $in =~ / -           # minus sign
                      | \bOR\b      # OR keyword
                      | ".*?"       # phrase query
                      | \S+         # term
                      /xg;

  # build the output string
  while (@tokens) {

    # -a => (NOT a)
    if ($tokens[0] eq '-') {
      my (undef, $right) = splice(@tokens, 0, 2);
      $out .= " (NOT $right)";
    }

    # a OR b => (a OR b)
    elsif (@tokens >= 2 && $tokens[1] eq 'OR') {
      my ($left, undef, $right) = splice(@tokens, 0, 3);
      if ($right eq '-') {
        $right = "NOT " . shift @tokens;
      }
      $out .= " ($left OR $right)";
    }

    # plain term
    else {
      $out .= " " . shift @tokens;
    }
  }

  return $out;
}

1;

__END__

=head1 NAME

DBD::SQLite::FTS3Transitional - helper function for migrating FTS3 applications

=head1 SYNOPSIS

  use DBD::SQLite::FTS3Transitional qw/fts3_convert/;
  my $new_match_syntax = fts3_convert($old_match_syntax);
  my $sql = "SELECT ... FROM ... WHERE col MATCH $new_match_syntax";

=head1 DESCRIPTION

Starting from version 1.31, C<DBD::SQLite> uses the new, recommended
"Enhanced Query Syntax" for binary set operators in fulltext FTS3 queries
(AND, OR, NOT, possibly nested with parentheses).

Previous versions of C<DBD::SQLite> used the
"Standard Query Syntax" (see L<http://www.sqlite.org/fts3.html#section_3_2>).

This module helps convert SQLite applications built with the old
"Standard" query syntax to the new "Enhanced" syntax.

=head1 FUNCTIONS

=head2 fts3_convert

Takes as input a string for the MATCH clause in a FTS3 fulltext search;
returns the same clause rewritten in the new "Enhanced" syntax.
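For instance (input/output pairs taken from the test suite; note that
the returned string carries a leading space, which can simply be
stripped):

  my $new = fts3_convert('foo -bar');            # yields ' foo (NOT bar)'
  $new    = fts3_convert('foo bar OR bie buz');  # yields ' foo (bar OR bie) buz'
  $new =~ s/^\s+//;                              # trim the leading space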
=head1 AUTHOR

Laurent Dami E<lt>dami@cpan.orgE<gt>

=head1 COPYRIGHT

Copyright 2010 Laurent Dami.

This program is free software; you can redistribute
it and/or modify it under the same terms as Perl itself.

The full text of the license can be found in the
LICENSE file included with this module.

=cut
t/43_fts3.t (new file, 103 lines)

#!/usr/bin/perl

use strict;
BEGIN {
    $|  = 1;
    $^W = 1;
}

use t::lib::Test qw/connect_ok/;
use Test::More;

my @texts = ("il était une bergère",
             "qui gardait ses moutons",
             "elle fit un fromage",
             "du lait de ses moutons");

my @tests = (
  # query => expected results
  ["bergère"              => 0    ],
  ["berg*"                => 0    ],
  ["foobar"                       ],
  ["moutons"              => 1, 3 ],
  ['"qui gardait"'        => 1    ],
  ["moutons NOT lait"     => 1    ],
  ["il était"             => 0    ],
  ["(il OR elle) AND un*" => 0, 2 ],
);

BEGIN {
    if ($] < 5.008005) {
        plan skip_all => 'Unicode is not supported before 5.8.5';
    }
}
use Test::NoWarnings;

plan tests => 2 * (1 + @tests) + 1;

BEGIN {
    # Sadly perl for windows (and probably sqlite, too) may hang
    # if the system locale doesn't support european languages.
    # en-us should be a safe default. if it doesn't work, use 'C'.
    if ( $^O eq 'MSWin32') {
        use POSIX 'locale_h';
        setlocale(LC_COLLATE, 'en-us');
    }
}
use locale;


sub locale_tokenizer { # see also: Search::Tokenizer
    return sub {
        my $string = shift;

        my $regex      = qr/\w+/;
        my $term_index = 0;

        return sub {
            $string =~ /$regex/g or return; # either match, or no more token
            my ($start, $end) = ($-[0], $+[0]);
            my $term = substr($string, $start, my $len = $end-$start);
            return ($term, $len, $start, $end, $term_index++);
        };
    };
}

use DBD::SQLite;

for my $use_unicode (0, 1) {

    # connect
    my $dbh = connect_ok( RaiseError => 1, sqlite_unicode => $use_unicode );

    # create fts3 table
    $dbh->do(<<"") or die DBI::errstr;
        CREATE VIRTUAL TABLE try_fts3
              USING fts3(content, tokenize=perl 'main::locale_tokenizer')

    # populate it
    my $insert_sth = $dbh->prepare(<<"") or die DBI::errstr;
        INSERT INTO try_fts3(content) VALUES(?)

    my @doc_ids;
    for (my $i = 0; $i < @texts; $i++) {
        $insert_sth->execute($texts[$i]);
        $doc_ids[$i] = $dbh->last_insert_id("", "", "", "");
    }

    # queries
    my $sql = "SELECT docid FROM try_fts3 WHERE content MATCH ?";
    for my $t (@tests) {
        my ($query, @expected) = @$t;
        @expected = map {$doc_ids[$_]} @expected;
        my $results = $dbh->selectcol_arrayref($sql, undef, $query);
        is_deeply($results, \@expected, "$query (unicode is $use_unicode)");
    }
}
t/44_fts3_transitional.t (new file, 34 lines)

#!/usr/bin/perl

use strict;
BEGIN {
    $|  = 1;
    $^W = 1;
}

use Test::More;
use Test::NoWarnings;

my @tests = (
  ['foo bar'              => 'foo bar'                  ],
  ['foo -bar'             => 'foo (NOT bar)'            ],
  ['foo* -bar*'           => 'foo* (NOT bar*)'          ],
  ['foo bar OR bie buz'   => 'foo (bar OR bie) buz'     ],
  ['-foo bar OR -bie buz' => '(NOT foo) (bar OR NOT bie) buz'],
  ['"kyrie eleison" OR "christe eleison"'
                          => '("kyrie eleison" OR "christe eleison")'],
);


plan tests => 1 + @tests;

use DBD::SQLite::FTS3Transitional qw/fts3_convert/;

foreach my $t (@tests) {
    my ($old_syntax, $expected_new) = @$t;
    my $new = fts3_convert($old_syntax);
    $new =~ s/^\s+//;
    is($new, $expected_new, $old_syntax);
}