diff --git a/SQLite.xs b/SQLite.xs index 0a3a052..f1676a1 100644 --- a/SQLite.xs +++ b/SQLite.xs @@ -314,6 +314,31 @@ register_fts3_perl_tokenizer(dbh) OUTPUT: RETVAL +static int +register_fts5_perl_tokenizer(dbh) + SV *dbh + ALIAS: + DBD::SQLite::db::sqlite_register_fts5_perl_tokenizer = 1 + CODE: + RETVAL = sqlite_db_register_fts5_perl_tokenizer(aTHX_ dbh); + OUTPUT: + RETVAL + +static int +fts5_xToken(pCtx,tflags,svToken,iStart,iEnd) + SV *pCtx + int tflags + SV *svToken + STRLEN iStart + STRLEN iEnd + ALIAS: + DBD::SQLite::db::fts5_xToken = 1 + CODE: + dTHX; + RETVAL = perl_fts5_xToken(aTHX_ pCtx,tflags,svToken,iStart,iEnd); + OUTPUT: + RETVAL + HV* db_status(dbh, reset = 0) SV* dbh diff --git a/dbdimp.h b/dbdimp.h index b357e1f..03d5a7f 100644 --- a/dbdimp.h +++ b/dbdimp.h @@ -182,6 +182,8 @@ HV* sqlite_db_table_column_metadata(pTHX_ SV *dbh, SV *dbname, SV *tablename, SV HV* _sqlite_db_status(pTHX_ SV *dbh, int reset); SV* sqlite_db_filename(pTHX_ SV *dbh); int sqlite_db_register_fts3_perl_tokenizer(pTHX_ SV *dbh); +int sqlite_db_register_fts5_perl_tokenizer(pTHX_ SV *dbh); +int perl_fts5_xToken(pTHX_ SV* pCtx, int tflags, SV* svToken, int iStart, int iEnd ); HV* _sqlite_status(int reset); HV* _sqlite_st_status(pTHX_ SV *sth, int reset); int sqlite_db_create_module(pTHX_ SV *dbh, const char *name, const char *perl_class); diff --git a/dbdimp_tokenizer.inc b/dbdimp_tokenizer.inc index a6ee4bd..c91a8d5 100644 --- a/dbdimp_tokenizer.inc +++ b/dbdimp_tokenizer.inc @@ -17,7 +17,7 @@ typedef struct perl_fts3_tokenizer_cursor { } perl_fts3_tokenizer_cursor; typedef struct perl_Fts5Tokenizer { - Fts5Tokenizer base; + /* Fts5Tokenizer base; /* this is an empty struct, so we omit it entirely */ SV *coderef; /* the perl tokenizer is a coderef that takes ** a string and and some parameters and ** in turn calls the xToken() function @@ -25,6 +25,23 @@ typedef struct perl_Fts5Tokenizer { */ } perl_Fts5Tokenizer; +/* This is the structure where we store the information between calls + * from Perl and callbacks to SQLite. We could instead pass these values + * as opaque arguments to Perl and back, but this reduces the number of + * opaque values handled by Perl to a single such value. + */ +typedef struct perl_cb_ctx { + void * Ctx; + int (*xToken)( + void *pCtx, /* Copy of 2nd argument to xTokenize() */ + int tflags, /* Mask of FTS5_TOKEN_* flags */ + const char *pToken, /* Pointer to buffer containing token */ + int nToken, /* Size of token in bytes */ + int iStart, /* Byte offset of token within input text */ + int iEnd /* Byte offset of end of token within input text */ + ); +} perl_cb_ctx; + /* ** Create a new tokenizer instance. ** Will be called whenever a FTS3 table is created with @@ -338,7 +355,6 @@ static int perl_fts5_tokenizer_Create( int n_retval; SV *retval; perl_Fts5Tokenizer *t; - if (!nArg) { return SQLITE_ERROR; } @@ -346,7 +362,6 @@ static int perl_fts5_tokenizer_Create( t = (perl_Fts5Tokenizer *) sqlite3_malloc(sizeof(*t)); if( t==NULL ) return SQLITE_NOMEM; memset(t, 0, sizeof(*t)); - ENTER; SAVETMPS; @@ -358,11 +373,12 @@ static int perl_fts5_tokenizer_Create( /* store a copy of the returned coderef into the tokenizer structure */ if (n_retval != 1) { - warn("tokenizer_Create returned %d arguments", n_retval); + warn("tokenizer_Create returned %d arguments, expected a single coderef", n_retval); } retval = POPs; t->coderef = newSVsv(retval); - *ppOut = &t->base; + /* *ppOut = &t->base; */ /* Fts5Tokenizer is empty and gcc complains about that */ + *ppOut = (Fts5Tokenizer *) t; PUTBACK; FREETMPS; @@ -405,18 +421,38 @@ static int perl_fts5_tokenizer_Tokenize( int n_retval; char *token; char *byteOffset; - STRLEN n_a; /* this is required for older perls < 5.8.8 */ - I32 hop; - dTHX; dSP; + /* The implicit assumption here is that our callback will only be + * invoked from a stack frame below this frame! + */ + perl_cb_ctx ctx; + SV* ctxP; + SV* text; + + STRLEN n_a; /* this is required for older perls < 5.8.8 */ + I32 hop; + ENTER; SAVETMPS; /* call the Perl tokenizer, and pass it our token callback */ PUSHMARK(SP); - PUTBACK; + + ctx.Ctx = pCtx; + ctx.xToken = xToken; + ctxP = newSVpvn((const char *const)&ctx, sizeof(ctx)); + + text = newSVpvn(pText, nText); + + // We pass four arguments + //EXTEND(SP, 2); + XPUSHs(sv_2mortal(ctxP)); + XPUSHs(sv_2mortal(text)); + XPUSHs(sv_2mortal(newSViv(flags))); + // We need to properly wrap this so it is callable from Perl... + // ... without needing actual local storage or a global variable... // XXX Wrap the "found token" callback, and pass it to the user // Then, restructure the data if it is UTF-8 @@ -471,11 +507,10 @@ static int perl_fts5_tokenizer_Tokenize( // // result = SQLITE_OK; // - + PUTBACK; n_retval = call_sv(c->coderef, G_ARRAY); SPAGAIN; - PUTBACK; FREETMPS; LEAVE; @@ -483,6 +518,20 @@ static int perl_fts5_tokenizer_Tokenize( return result; } +int perl_fts5_xToken(pTHX_ + SV* pCtx, + int tflags, /* Mask of FTS5_TOKEN_* flags */ + SV* svToken, /* Pointer to buffer containing token */ + int iStart, /* Byte offset of token within input text */ + int iEnd /* Byte offset of end of token within input text */ +) { + const char* chrToken = SvPV_nolen(svToken); + STRLEN nToken = strlen(chrToken); + perl_cb_ctx * p = (perl_cb_ctx *)SvPV_nolen( pCtx ); + return p->xToken(p->Ctx,tflags,chrToken,nToken,iStart,iEnd); +} + + /* ** The set of routines that implement the perl FTS5 tokenizer */ @@ -533,8 +582,7 @@ int sqlite_db_register_fts5_perl_tokenizer(pTHX_ SV *dbh) fts5_api *pFts5Api = sqlite_fetch_fts5_api(aTHX_ dbh); fts5_tokenizer *p = &perl_fts5_tokenizer_Module; - // pFts5Api->xCreateTokenizer(pFts5Api,...); + rc = pFts5Api->xCreateTokenizer(pFts5Api, "perl", 0, p, 0); - - return 0; + return rc; } diff --git a/lib/DBD/SQLite.pm b/lib/DBD/SQLite.pm index 634b4ec..19d0df9 100644 --- a/lib/DBD/SQLite.pm +++ b/lib/DBD/SQLite.pm @@ -143,10 +143,12 @@ sub connect { $dbh->sqlite_collation_needed( \&install_collation ); $dbh->sqlite_create_function( "REGEXP", 2, \®exp ); $dbh->sqlite_register_fts3_perl_tokenizer(); + $dbh->sqlite_register_fts5_perl_tokenizer(); } else { $dbh->func( \&install_collation, "collation_needed" ); $dbh->func( "REGEXP", 2, \®exp, "create_function" ); $dbh->func( "register_fts3_perl_tokenizer" ); + $dbh->func( "register_fts5_perl_tokenizer" ); } # HACK: Since PrintWarn = 0 doesn't seem to actually prevent warnings @@ -1223,7 +1225,7 @@ store natively as a BLOB use the following code: use DBI qw(:sql_types); my $dbh = DBI->connect("dbi:SQLite:dbfile","",""); - + my $blob = `cat foo.jpg`; my $sth = $dbh->prepare("INSERT INTO mytable VALUES (1, ?)"); $sth->bind_param(1, $blob, SQL_BLOB); @@ -1235,7 +1237,7 @@ And then retrieval just works: $sth->execute(); my $row = $sth->fetch; my $blobo = $row->[1]; - + # now $blobo == $blob =head2 Functions And Bind Parameters @@ -1264,7 +1266,7 @@ As shown above in the C section, you can always use C to tell the type of a bind value. use DBI qw(:sql_types); # Don't forget this - + my $sth = $dbh->prepare(q{ SELECT bar FROM foo GROUP BY bar HAVING count(*) > ?; }); @@ -1454,13 +1456,13 @@ statement. To end it, call C methods, or issue the corresponding statements. $dbh->{AutoCommit} = 1; - + $dbh->begin_work; # or $dbh->do('BEGIN TRANSACTION'); - + # $dbh->{AutoCommit} is turned off temporarily during a transaction; - + $dbh->commit; # or $dbh->do('COMMIT'); - + # $dbh->{AutoCommit} is turned on again; =item When the AutoCommit flag is off @@ -1474,15 +1476,15 @@ You can commit or roll it back freely. Another transaction will automatically begin if you execute another statement. $dbh->{AutoCommit} = 0; - + # $dbh->do('BEGIN TRANSACTION') is not necessary, but possible - + ... - + $dbh->commit; # or $dbh->do('COMMIT'); - + # $dbh->{AutoCommit} stays intact; - + $dbh->{AutoCommit} = 1; # ends the transactional mode =back @@ -2090,38 +2092,38 @@ Here is a simple aggregate function which returns the variance (example adapted from pysqlite): package variance; - + sub new { bless [], shift; } - + sub step { my ( $self, $value ) = @_; - + push @$self, $value; } - + sub finalize { my $self = $_[0]; - + my $n = @$self; - + # Variance is NULL unless there is more than one row return undef unless $n || $n == 1; - + my $mu = 0; foreach my $v ( @$self ) { $mu += $v; } $mu /= $n; - + my $sigma = 0; foreach my $v ( @$self ) { $sigma += ($v - $mu)**2; } $sigma = $sigma / ($n - 1); - + return $sigma; } - + $dbh->sqlite_create_aggregate( "variance", 1, 'variance' ); The aggregate function can then be used as: @@ -2390,13 +2392,13 @@ You may also pass 0 as an argument to reset the status. You can change how the connected database should behave like this: use DBD::SQLite::Constants qw/:database_connection_configuration_options/; - + my $dbh = DBI->connect('dbi:SQLite::memory:'); # This disables language features that allow ordinary SQL # to deliberately corrupt the database file $dbh->sqlite_db_config( SQLITE_DBCONFIG_DEFENSIVE, 1 ); - + # This disables two-arg version of fts3_tokenizer. $dbh->sqlite_db_config( SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER, 0 ); @@ -2693,16 +2695,16 @@ then query which buildings overlap or are contained within a specified region: SELECT id FROM city_buildings WHERE minLong >= ? AND maxLong <= ? AND minLat >= ? AND maxLat <= ? - + # ... and those that overlap query coordinates my $overlap_sql = <<""; SELECT id FROM city_buildings WHERE maxLong >= ? AND minLong <= ? AND maxLat >= ? AND minLat <= ? - + my $contained = $dbh->selectcol_arrayref($contained_sql,undef, $minLong, $maxLong, $minLat, $maxLat); - + my $overlapping = $dbh->selectcol_arrayref($overlap_sql,undef, $minLong, $maxLong, $minLat, $maxLat); @@ -2750,10 +2752,10 @@ header like this: use File::ShareDir 'dist_dir'; use File::Spec::Functions 'catfile'; - + # the whole sqlite3.h header my $sqlite3_h = catfile(dist_dir('DBD-SQLite'), 'sqlite3.h'); - + # or only a particular header, amalgamated in sqlite3.c my $what_i_want = 'parse.h'; my $sqlite3_c = catfile(dist_dir('DBD-SQLite'), 'sqlite3.c'); diff --git a/t/67_fts5.t b/t/67_fts5.t new file mode 100644 index 0000000..bd5e37c --- /dev/null +++ b/t/67_fts5.t @@ -0,0 +1,106 @@ +use strict; +use warnings; +no if $] >= 5.022, "warnings", "locale"; +use lib "t/lib"; +use SQLiteTest; +use Test::More; +#use if -d ".git", "Test::FailWarnings"; +use DBD::SQLite; + +my @texts = ("il était une bergère", + "qui gardait ses moutons", + "elle fit un fromage", + "du lait de ses moutons"); + +my @tests = ( +# query => expected results + ["bergère" => 0 ], + ["berg*" => 0 ], + ["foobar" ], + ["moutons" => 1, 3 ], + ['"qui gardait"' => 1 ], + ["moutons NOT lait" => 1 ], + ["il était" => 0 ], + ["(il OR elle) AND un*" => 0, 2 ], +); + +BEGIN { + requires_unicode_support(); + + if (!has_fts()) { + plan skip_all => 'FTS is disabled for this DBD::SQLite'; + } + if ($DBD::SQLite::sqlite_version_number >= 3011000 and $DBD::SQLite::sqlite_version_number < 3012000 and !has_compile_option('ENABLE_FTS5_TOKENIZER')) { + plan skip_all => 'FTS5 tokenizer is disabled for this DBD::SQLite'; + } +} + +# Perl may spit a warning on locale +# use Test::NoWarnings; + +BEGIN { + # Sadly perl for windows (and probably sqlite, too) may hang + # if the system locale doesn't support european languages. + # en-us should be a safe default. if it doesn't work, use 'C'. + if ( $^O eq 'MSWin32') { + use POSIX 'locale_h'; + setlocale(LC_COLLATE, 'en-us'); + } +} + +use locale; + +sub locale_tokenizer { # see also: Search::Tokenizer + return sub { + my( $ctx, $string, $tokenizer_context_flags ) = @_; + my $regex = qr/\w+/; + #my $term_index = 0; + # + while( $string =~ /$regex/g) { + my ($start, $end) = ($-[0], $+[0]); + my $term = substr($string, $start, my $len = $end-$start); + my $flags = 0; # SQLITE_FTS5_TOKEN; + DBD::SQLite::db::fts5_xToken($ctx,$flags,$term,$start,$end); + }; + }; +} + +use DBD::SQLite; + +for my $use_unicode (0, 1) { + + # connect + my $dbh = connect_ok( RaiseError => 1, sqlite_unicode => $use_unicode ); + + for my $fts (qw/fts5/) { + + # create fts table + $dbh->do(<<"") or die DBI::errstr; + CREATE VIRTUAL TABLE try_$fts + USING $fts(content, tokenize="perl 'main::locale_tokenizer'") + + # populate it + my $insert_sth = $dbh->prepare(<<"") or die DBI::errstr; + INSERT INTO try_$fts(content) VALUES(?) + + my @doc_ids; + for (my $i = 0; $i < @texts; $i++) { + $insert_sth->execute($texts[$i]); + $doc_ids[$i] = $dbh->last_insert_id("", "", "", ""); + } + + # queries + SKIP: { + my $sql = "SELECT rowid FROM try_$fts WHERE content MATCH ?"; + + for my $t (@tests) { + my ($query, @expected) = @$t; + @expected = map {$doc_ids[$_]} @expected; + my $results = $dbh->selectcol_arrayref($sql, undef, $query); + is_deeply($results, \@expected, "$query ($fts, unicode=$use_unicode)"); + } + } + } +} + +done_testing;