diff --git a/Makefile.PL b/Makefile.PL
index 4f28a08..b2297a5 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -394,6 +394,9 @@ WriteMakefile(
? '$(O_FILES)'
: 'SQLite.o dbdimp.o'
),
+ depend => {
+ 'dbdimp.o' => 'dbdimp_tokenizer.inc dbdimp_virtual_table.inc',
+ },
clean => {
FILES => 'SQLite.xsi config.h tv.log *.old',
},
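
Note on the Makefile.PL hunk: dbdimp.c #includes dbdimp_tokenizer.inc and dbdimp_virtual_table.inc, so without an explicit dependency a change to either .inc file would not trigger a rebuild of dbdimp.o. ExtUtils::MakeMaker turns the `depend` attribute into an extra rule in the generated Makefile, roughly of the form `dbdimp.o : dbdimp_tokenizer.inc dbdimp_virtual_table.inc` (illustrative rendering; the exact generated text depends on the MakeMaker version).
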
diff --git a/dbdimp_tokenizer.inc b/dbdimp_tokenizer.inc
index d48409a..83ab40a 100644
--- a/dbdimp_tokenizer.inc
+++ b/dbdimp_tokenizer.inc
@@ -12,8 +12,8 @@ typedef struct perl_tokenizer_cursor {
/* members below are only used if the input string is in utf8 */
const char *pInput; /* input we are tokenizing */
- const char *lastByteOffset; /* offset into pInput */
- int lastCharOffset; /* char offset corresponding to lastByteOffset */
+ const char *currentByte; /* pointer into pInput */
+ int currentChar; /* char position corresponding to currentByte */
} perl_tokenizer_cursor;
/*
@@ -108,9 +108,9 @@ static int perl_tokenizer_Open(
/* special handling if working with utf8 strings */
if (MY_CXT.last_dbh_is_unicode) {
- /* data to keep track of byte offsets */
- c->lastByteOffset = c->pInput = pInput;
- c->lastCharOffset = 0;
+ /* data to keep track of byte positions */
+ c->currentByte = c->pInput = pInput;
+ c->currentChar = 0;
/* string passed to Perl needs to be flagged as utf8 */
flags |= SVf_UTF8;
@@ -134,7 +134,7 @@ static int perl_tokenizer_Open(
/* store the cursor coderef returned by the tokenizer */
if (n_retval != 1) {
- warn("tokenizer returned %d arguments", n_retval);
+ warn("tokenizer returned %d arguments, expected 1", n_retval);
}
c->coderef = newSVsv(POPs);
@@ -164,17 +164,17 @@ static int perl_tokenizer_Close(sqlite3_tokenizer_cursor *pCursor){
*/
static int perl_tokenizer_Next(
sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by perl_tokenizer_Open */
- const char **ppToken, /* OUT: *ppToken is the token text */
- int *pnBytes, /* OUT: Number of bytes in token */
- int *piStartOffset, /* OUT: Starting offset of token */
- int *piEndOffset, /* OUT: Ending offset of token */
- int *piPosition /* OUT: Position integer of token */
+ const char **ppToken, /* OUT: Normalized text for token */
+ int *pnBytes, /* OUT: Number of bytes in normalized text */
+ int *piStartOffset, /* Starting offset of token. IN : char offset; OUT : byte offset */
+ int *piEndOffset, /* Ending offset of token. IN : char offset; OUT : byte offset */
+ int *piPosition /* OUT: Number of tokens returned before this one */
){
perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor;
int result;
int n_retval;
char *token;
- char *byteOffset;
+ char *nextByte;
STRLEN n_a; /* this is required for older perls < 5.8.8 */
I32 hop;
@@ -197,7 +197,7 @@ static int perl_tokenizer_Next(
/* otherwise, get token details from the return list */
else {
if (n_retval != 5) {
- warn("tokenizer cursor returned %d arguments", n_retval);
+ warn("tokenizer cursor returned %d arguments, expected 5", n_retval);
}
*piPosition = POPi;
*piEndOffset = POPi;
@@ -206,21 +206,30 @@ static int perl_tokenizer_Next(
token = POPpx;
if (c->pInput) { /* if working with utf8 data */
+ /* compute first hop: number of chars from the last position to the start of the token */
+ hop = *piStartOffset - c->currentChar;
- /* recompute *pnBytes in bytes, not in chars */
- *pnBytes = strlen(token);
+ /* hop: advance to the first byte in token */
+ nextByte = (char*)utf8_hop((U8*)c->currentByte, hop);
- /* recompute start/end offsets in bytes, not in chars */
- hop = *piStartOffset - c->lastCharOffset;
- byteOffset = (char*)utf8_hop((U8*)c->lastByteOffset, hop);
- hop = *piEndOffset - *piStartOffset;
- *piStartOffset = byteOffset - c->pInput;
- byteOffset = (char*)utf8_hop((U8*)byteOffset, hop);
- *piEndOffset = byteOffset - c->pInput;
+ /* compute 2nd hop: number of chars from the start of the token to its end */
+ hop = *piEndOffset - *piStartOffset;
- /* remember where we are for next round */
- c->lastCharOffset = *piEndOffset,
- c->lastByteOffset = byteOffset;
+ /* now recompute the start offset in bytes, not in chars */
+ *piStartOffset = nextByte - c->pInput;
+
+ /* 2nd hop: advance past the last byte of the token */
+ nextByte = (char*)utf8_hop((U8*)nextByte, hop);
+
+ /* remember current position (useful for the next invocation) */
+ c->currentChar = *piEndOffset;
+ c->currentByte = nextByte;
+
+ /* now recompute the end offset in bytes, not in chars */
+ *piEndOffset = nextByte - c->pInput;
+
+ /* compute the size of the normalized token in bytes, not in chars */
+ *pnBytes = strlen(token);
}
/* make sure we have enough storage for copying the token */
@@ -232,8 +241,7 @@ static int perl_tokenizer_Next(
c->pToken = pNew;
}
- /* need to copy the token into the C cursor before perl frees that
- memory */
+ /* need to copy the token into the C cursor before perl frees that memory */
memcpy(c->pToken, token, *pnBytes);
*ppToken = c->pToken;
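
The renamed cursor fields make the invariant explicit: between calls, currentByte/currentChar always point just past the previously returned token, so each perl_tokenizer_Next call only hops forward by the character delta instead of re-scanning from the start of the string. The perl-level tokenizer reports *character* offsets, while SQLite's FTS layer expects *byte* offsets into the UTF-8 input; utf8_hop() bridges the two. A pure-Perl sketch of the same conversion (hypothetical helper, not part of the patch):

    use strict;
    use warnings;
    use utf8;

    # Map a token's (start, end) *character* offsets to *byte* offsets
    # within the UTF-8 encoding of $string -- the conversion that
    # perl_tokenizer_Next performs with utf8_hop(). Illustrative only.
    sub char_to_byte_offsets {
        my ($string, $char_start, $char_end) = @_;
        my $prefix = substr($string, 0, $char_start);
        my $token  = substr($string, $char_start, $char_end - $char_start);
        utf8::encode($_) for $prefix, $token;   # re-measure in bytes
        return (length($prefix), length($prefix) + length($token));
    }

    # "une" spans chars 9..12 but bytes 10..13: the preceding "é" is 2 bytes
    my ($byte_start, $byte_end) =
        char_to_byte_offsets("il était une bergère", 9, 12);
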
diff --git a/t/43_fts3.t b/t/43_fts3.t
index 934c8de..c6170b9 100644
--- a/t/43_fts3.t
+++ b/t/43_fts3.t
@@ -1,12 +1,13 @@
use strict;
use warnings;
-no if $] >= 5.022, "warnings", "locale";
use lib "t/lib";
+use Time::HiRes qw/time/;
use SQLiteTest;
use Test::More;
use if -d ".git", "Test::FailWarnings";
use DBD::SQLite;
+
my @texts = ("il était une bergère",
"qui gardait ses moutons",
"elle fit un fromage",
@@ -24,6 +25,9 @@ my @tests = (
["(il OR elle) AND un*" => 0, 2 ],
);
+my $ix_une_native = index($texts[0], "une");
+my $ix_une_utf8 = do {use bytes; utf8::upgrade(my $bergere_utf8 = $texts[0]); index($bergere_utf8, "une");};
+
BEGIN {
requires_unicode_support();
@@ -35,31 +39,19 @@ BEGIN {
}
}
-# Perl may spit a warning on locale
-# use Test::NoWarnings;
-BEGIN {
- # Sadly perl for windows (and probably sqlite, too) may hang
- # if the system locale doesn't support european languages.
- # en-us should be a safe default. if it doesn't work, use 'C'.
- if ( $^O eq 'MSWin32') {
- use POSIX 'locale_h';
- setlocale(LC_COLLATE, 'en-us');
- }
-}
-
-use locale;
-
-sub locale_tokenizer { # see also: Search::Tokenizer
+sub Unicode_Word_tokenizer { # see also: Search::Tokenizer
return sub {
- my $string = shift;
- my $regex = qr/\w+/;
+ my $string = shift;
+ my $regex = qr/\p{Word}+/;
my $term_index = 0;
return sub {
$string =~ /$regex/g or return; # either match, or no more token
- my ($start, $end) = ($-[0], $+[0]);
- my $term = substr($string, $start, my $len = $end-$start);
+ my $term = $&;
+ my $end = pos $string; # $+[0] is much slower
+ my $len = length($term);
+ my $start = $end - $len;
return ($term, $len, $start, $end, $term_index++);
};
};
@@ -78,7 +70,7 @@ for my $use_unicode (0, 1) {
# create fts table
$dbh->do(<<"") or die DBI::errstr;
CREATE VIRTUAL TABLE try_$fts
- USING $fts(content, tokenize=perl 'main::locale_tokenizer')
+ USING $fts(content, tokenize=perl 'main::Unicode_Word_tokenizer')
# populate it
my $insert_sth = $dbh->prepare(<<"") or die DBI::errstr;
@@ -105,6 +97,40 @@ for my $use_unicode (0, 1) {
is_deeply($results, \@expected, "$query ($fts, unicode=$use_unicode)");
}
}
+
+ # the 'snippet' function should highlight the words in the MATCH query
+ my $sql_snip = "SELECT snippet(try_$fts) FROM try_$fts WHERE content MATCH ?";
+ my $result = $dbh->selectcol_arrayref($sql_snip, undef, 'une');
+ is_deeply($result, ['il était <b>une</b> bergère'], "snippet ($fts, unicode=$use_unicode)");
+
+ # the 'offsets' function should return integer offsets for the words in the MATCH query
+ my $sql_offsets = "SELECT offsets(try_$fts) FROM try_$fts WHERE content MATCH ?";
+ $result = $dbh->selectcol_arrayref($sql_offsets, undef, 'une');
+ my $offset_une = $use_unicode ? $ix_une_utf8 : $ix_une_native;
+ my $expected_offsets = "0 0 $offset_une 3";
+ is_deeply($result, [$expected_offsets], "offsets ($fts, unicode=$use_unicode)");
+
+ # test snippet() on a longer sentence
+ $insert_sth->execute(join " ", @texts);
+ $result = $dbh->selectcol_arrayref($sql_snip, undef, '"bergère qui gardait"');
+ like($result->[0],
+ qr[une <b>bergère</b> <b>qui</b> <b>gardait</b> ses],
+ "longer snippet ($fts, unicode=$use_unicode)");
+
+ # simulated large document
+ open my $fh, "<", $INC{'DBD/SQLite.pm'} or die $!;
+ my $source_code = do {local $/; <$fh>};
+ my $long_doc = $source_code x 5;
+
+ my $t0 = time;
+ $insert_sth->execute($long_doc);
+ my $t1 = time;
+ $result = $dbh->selectcol_arrayref($sql_snip, undef, '"package DBD::SQLite"');
+ my $t2 = time;
+
+ note sprintf("long doc (%d chars): insert in %.4f secs, select in %.4f secs",
+ length($long_doc), $t1-$t0, $t2-$t1);
+ like($result->[0], qr[^<b>package</b> <b>DBD</b>::<b>SQLite</b>;], 'snippet "package DBD::SQLite"');
}
}
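
For reference, this is how the fix surfaces at the DBI level: with sqlite_unicode enabled, offsets() and snippet() now line up with the byte positions SQLite expects, even when the text contains multi-byte characters. A minimal usage sketch, assuming the Unicode_Word_tokenizer sub from t/43_fts3.t above is in scope in package main (table name and query are illustrative):

    use strict;
    use warnings;
    use utf8;
    use DBI;

    my $dbh = DBI->connect("dbi:SQLite:dbname=:memory:", "", "",
                           { RaiseError => 1, sqlite_unicode => 1 });

    $dbh->do(q{
        CREATE VIRTUAL TABLE articles
            USING fts4(content, tokenize=perl 'main::Unicode_Word_tokenizer')
    });
    $dbh->do("INSERT INTO articles(content) VALUES (?)", undef,
             "il était une bergère");

    # offsets() reports byte offsets: "une" starts at byte 10, not char 9
    my $offsets = $dbh->selectcol_arrayref(
        "SELECT offsets(articles) FROM articles WHERE content MATCH ?",
        undef, 'une');
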