Mirror of https://github.com/DBD-SQLite/DBD-SQLite, synced 2025-06-07 06:08:38 -04:00
Merge pull request #76 from DBD-SQLite/#75_fix_fts3_offsets

#75: fix fts3 offsets

Commit 57d2dd7fb5. 3 changed files with 85 additions and 48 deletions.
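Background for the fix: SQLite's FTS3/FTS4 functions offsets() and snippet() work with byte offsets into the stored document, while a tokenizer written in Perl naturally reports character offsets; the two diverge as soon as the text contains multi-byte UTF-8 characters. A minimal sketch of the discrepancy, using the same test sentence as t/43_fts3.t (variable names are illustrative):

    use utf8;    # the literal below is a character string
    my $text = "il était une bergère";

    # character offset, as a Perl tokenizer reports it
    my $char_off = index($text, "une");           # 9

    # byte offset in the UTF-8 encoding, as fts3 offsets() expects it
    my $byte_off = do {
        use bytes;                                # make index() count bytes
        utf8::upgrade(my $copy = $text);          # force UTF-8 internal form
        index($copy, "une");                      # 10 ("é" takes two bytes)
    };

    printf "char offset %d, byte offset %d\n", $char_off, $byte_off;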
Makefile.PL

@@ -394,6 +394,9 @@ WriteMakefile(
             ? '$(O_FILES)'
             : 'SQLite.o dbdimp.o'
     ),
+    depend => {
+        'dbdimp.o' => 'dbdimp_tokenizer.inc dbdimp_virtual_table.inc',
+    },
     clean => {
         FILES => 'SQLite.xsi config.h tv.log *.old',
     },
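The new depend entry tells ExtUtils::MakeMaker that dbdimp.o must be rebuilt whenever either .inc file changes, presumably because dbdimp.c pulls both in via #include; without it, an incremental build could link a stale dbdimp.o.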
dbdimp_tokenizer.inc

@@ -12,8 +12,8 @@ typedef struct perl_tokenizer_cursor {

   /* members below are only used if the input string is in utf8 */
   const char *pInput;         /* input we are tokenizing */
-  const char *lastByteOffset; /* offset into pInput */
-  int lastCharOffset;         /* char offset corresponding to lastByteOffset */
+  const char *currentByte;    /* pointer into pInput */
+  int currentChar;            /* char position corresponding to currentByte */
 } perl_tokenizer_cursor;

 /*
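The renaming makes the cursor's bookkeeping explicit: currentByte and currentChar track the same input position in two coordinate systems, a byte pointer into pInput and a character count, so that each call to perl_tokenizer_Next can convert the character offsets coming back from Perl into the byte offsets SQLite expects.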
@@ -108,9 +108,9 @@ static int perl_tokenizer_Open(
   /* special handling if working with utf8 strings */
   if (MY_CXT.last_dbh_is_unicode) {

-    /* data to keep track of byte offsets */
-    c->lastByteOffset = c->pInput = pInput;
-    c->lastCharOffset = 0;
+    /* data to keep track of byte positions */
+    c->currentByte = c->pInput = pInput;
+    c->currentChar = 0;

     /* string passed to Perl needs to be flagged as utf8 */
     flags |= SVf_UTF8;
@@ -134,7 +134,7 @@ static int perl_tokenizer_Open(

     /* store the cursor coderef returned by the tokenizer */
     if (n_retval != 1) {
-      warn("tokenizer returned %d arguments", n_retval);
+      warn("tokenizer returned %d arguments, expected 1", n_retval);
     }
     c->coderef = newSVsv(POPs);

@@ -164,17 +164,17 @@ static int perl_tokenizer_Close(sqlite3_tokenizer_cursor *pCursor){
 */
 static int perl_tokenizer_Next(
   sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by perl_tokenizer_Open */
-  const char **ppToken,               /* OUT: *ppToken is the token text */
-  int *pnBytes,                       /* OUT: Number of bytes in token */
-  int *piStartOffset,                 /* OUT: Starting offset of token */
-  int *piEndOffset,                   /* OUT: Ending offset of token */
-  int *piPosition                     /* OUT: Position integer of token */
+  const char **ppToken,               /* OUT: Normalized text for token */
+  int *pnBytes,                       /* OUT: Number of bytes in normalized text */
+  int *piStartOffset,                 /* Starting offset of token. IN: char offset; OUT: byte offset */
+  int *piEndOffset,                   /* Ending offset of token. IN: char offset; OUT: byte offset */
+  int *piPosition                     /* OUT: Number of tokens returned before this one */
 ){
   perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor;
   int result;
   int n_retval;
   char *token;
-  char *byteOffset;
+  char *nextByte;
   STRLEN n_a; /* this is required for older perls < 5.8.8 */
   I32 hop;
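Note the revised contract in the parameter comments: piStartOffset and piEndOffset are now documented as IN/OUT. They arrive holding the character offsets produced by the Perl cursor and leave holding byte offsets into the original input, which is what the FTS3 layer stores and what offsets() later reports.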
@@ -197,7 +197,7 @@ static int perl_tokenizer_Next(
   /* otherwise, get token details from the return list */
   else {
     if (n_retval != 5) {
-      warn("tokenizer cursor returned %d arguments", n_retval);
+      warn("tokenizer cursor returned %d arguments, expected 5", n_retval);
     }
     *piPosition  = POPi;
     *piEndOffset = POPi;
@@ -206,21 +206,30 @@ static int perl_tokenizer_Next(
     token = POPpx;

     if (c->pInput) { /* if working with utf8 data */

-      /* recompute *pnBytes in bytes, not in chars */
-      *pnBytes = strlen(token);
-
-      /* recompute start/end offsets in bytes, not in chars */
-      hop        = *piStartOffset - c->lastCharOffset;
-      byteOffset = (char*)utf8_hop((U8*)c->lastByteOffset, hop);
-      hop        = *piEndOffset - *piStartOffset;
-      *piStartOffset = byteOffset - c->pInput;
-      byteOffset = (char*)utf8_hop((U8*)byteOffset, hop);
-      *piEndOffset = byteOffset - c->pInput;
-
-      /* remember where we are for next round */
-      c->lastCharOffset = *piEndOffset,
-      c->lastByteOffset = byteOffset;
+      /* first hop: nb of chars from the last position to the start of the token */
+      hop = *piStartOffset - c->currentChar;
+
+      /* advance to the first byte of the token */
+      nextByte = (char*)utf8_hop((U8*)c->currentByte, hop);
+
+      /* second hop: nb of chars from the start of the token to its end */
+      hop = *piEndOffset - *piStartOffset;
+
+      /* now recompute the start offset in bytes, not in chars */
+      *piStartOffset = nextByte - c->pInput;
+
+      /* second hop: advance past the last byte of the token */
+      nextByte = (char*)utf8_hop((U8*)nextByte, hop);
+
+      /* remember the current position (useful for the next invocation) */
+      c->currentChar = *piEndOffset;
+      c->currentByte = nextByte;
+
+      /* now recompute the end offset in bytes, not in chars */
+      *piEndOffset = nextByte - c->pInput;
+
+      /* compute the size of the normalized token in bytes, not in chars */
+      *pnBytes = strlen(token);
     }

     /* make sure we have enough storage for copying the token */
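This hunk carries the actual #75 fix. In the removed code, c->lastCharOffset was updated from *piEndOffset after that variable had already been rewritten as a byte offset (note also the stray comma operator in the removed assignment), so each subsequent hop was measured from a byte count treated as a character count. Concretely, for "il était une bergère" the token "était" ends at character 8 but at byte 9; the old code remembered 9 as a character offset, so the next token "une" (character 9, byte 10) got a hop of 0 and a reported start of byte 9 instead of 10. The rewrite saves c->currentChar while *piEndOffset still holds the character offset, and only afterwards converts *piEndOffset to bytes.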
@@ -232,8 +241,7 @@ static int perl_tokenizer_Next(
     c->pToken = pNew;
   }

-  /* need to copy the token into the C cursor before perl frees that
-     memory */
+  /* need to copy the token into the C cursor before perl frees that memory */
   memcpy(c->pToken, token, *pnBytes);
   *ppToken  = c->pToken;


t/43_fts3.t (68 changes)
@@ -1,12 +1,13 @@
 use strict;
 use warnings;
-no if $] >= 5.022, "warnings", "locale";
 use lib "t/lib";
+use Time::HiRes qw/time/;
 use SQLiteTest;
 use Test::More;
 use if -d ".git", "Test::FailWarnings";
 use DBD::SQLite;

 my @texts = ("il était une bergère",
              "qui gardait ses moutons",
              "elle fit un fromage",
@@ -24,6 +25,9 @@ my @tests = (
  ["(il OR elle) AND un*" => 0, 2 ],
 );

+my $ix_une_native = index($texts[0], "une");
+my $ix_une_utf8   = do {use bytes; utf8::upgrade(my $bergere_utf8 = $texts[0]); index($bergere_utf8, "une");};
+
 BEGIN {
   requires_unicode_support();

@@ -35,31 +39,19 @@ BEGIN {
 }
 }

-# Perl may spit a warning on locale
-# use Test::NoWarnings;
-
-BEGIN {
-  # Sadly perl for windows (and probably sqlite, too) may hang
-  # if the system locale doesn't support european languages.
-  # en-us should be a safe default. if it doesn't work, use 'C'.
-  if ( $^O eq 'MSWin32') {
-    use POSIX 'locale_h';
-    setlocale(LC_COLLATE, 'en-us');
-  }
-}
-
-use locale;
-
-sub locale_tokenizer { # see also: Search::Tokenizer
+sub Unicode_Word_tokenizer { # see also: Search::Tokenizer
   return sub {
     my $string = shift;
-    my $regex      = qr/\w+/;
+    my $regex      = qr/\p{Word}+/;
     my $term_index = 0;

     return sub {
       $string =~ /$regex/g or return; # either match, or no more token
-      my ($start, $end) = ($-[0], $+[0]);
-      my $term = substr($string, $start, my $len = $end-$start);
+      my $term  = $&;
+      my $end   = pos $string;  # $+[0] is much slower
+      my $len   = length($term);
+      my $start = $end - $len;
       return ($term, $len, $start, $end, $term_index++);
     };
  };
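The replacement tokenizer drops all locale dependence: \p{Word} matches with Unicode semantics regardless of the process locale, which is why the MSWin32 setlocale workaround and the locale-warning suppression in the preamble could also be removed, and pos() replaces the slower @+ array. A quick, hypothetical driver for the closure above, showing the five-value contract (term, length, start, end, index) with offsets still counted in characters:

    use utf8;    # character-string literal below
    my $tokenize = Unicode_Word_tokenizer();          # sub defined above
    my $next     = $tokenize->("il était une bergère");
    while (my ($term, $len, $start, $end, $pos) = $next->()) {
        printf "%-8s len=%d chars %d..%d pos=%d\n",
               $term, $len, $start, $end, $pos;
    }
    # e.g. prints: une  len=3 chars 9..12 pos=2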
@@ -78,7 +70,7 @@ for my $use_unicode (0, 1) {
   # create fts table
   $dbh->do(<<"") or die DBI::errstr;
     CREATE VIRTUAL TABLE try_$fts
-         USING $fts(content, tokenize=perl 'main::locale_tokenizer')
+         USING $fts(content, tokenize=perl 'main::Unicode_Word_tokenizer')

   # populate it
   my $insert_sth = $dbh->prepare(<<"") or die DBI::errstr;
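For context, a condensed, hypothetical end-to-end run of the mechanism this test exercises. It assumes the Unicode_Word_tokenizer sub above and a DBD::SQLite/SQLite build with fts4 and the Perl tokenizer enabled; the table name is illustrative:

    use strict;
    use warnings;
    use utf8;
    use DBI;

    my $dbh = DBI->connect("dbi:SQLite:dbname=:memory:", "", "",
                           { RaiseError => 1, sqlite_unicode => 1 });

    # the fully qualified sub name is resolved by DBD::SQLite at tokenize time
    $dbh->do(q{CREATE VIRTUAL TABLE docs
               USING fts4(content, tokenize=perl 'main::Unicode_Word_tokenizer')});
    $dbh->do("INSERT INTO docs(content) VALUES (?)",
             undef, "il était une bergère");

    # offsets() yields "<column> <term> <byteStart> <byteLength>" per match;
    # with sqlite_unicode the stored text is UTF-8, so "une" starts at byte 10
    my ($offsets) = $dbh->selectrow_array(
        "SELECT offsets(docs) FROM docs WHERE content MATCH 'une'");
    print "$offsets\n";   # expected: 0 0 10 3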
@@ -105,6 +97,40 @@ for my $use_unicode (0, 1) {
       is_deeply($results, \@expected, "$query ($fts, unicode=$use_unicode)");
     }
   }
+
+  # the 'snippet' function should highlight the words in the MATCH query
+  my $sql_snip = "SELECT snippet(try_$fts) FROM try_$fts WHERE content MATCH ?";
+  my $result = $dbh->selectcol_arrayref($sql_snip, undef, 'une');
+  is_deeply($result, ['il était <b>une</b> bergère'], "snippet ($fts, unicode=$use_unicode)");
+
+  # the 'offsets' function should return integer offsets for the words in the MATCH query
+  my $sql_offsets = "SELECT offsets(try_$fts) FROM try_$fts WHERE content MATCH ?";
+  $result = $dbh->selectcol_arrayref($sql_offsets, undef, 'une');
+  my $offset_une = $use_unicode ? $ix_une_utf8 : $ix_une_native;
+  my $expected_offsets = "0 0 $offset_une 3";
+  is_deeply($result, [$expected_offsets], "offsets ($fts, unicode=$use_unicode)");
+
+  # test snippet() on a longer sentence
+  $insert_sth->execute(join " ", @texts);
+  $result = $dbh->selectcol_arrayref($sql_snip, undef, '"bergère qui gardait"');
+  like($result->[0],
+       qr[une <b>bergère</b> <b>qui</b> <b>gardait</b> ses],
+       "longer snippet ($fts, unicode=$use_unicode)");
+
+  # simulated large document
+  open my $fh, "<", $INC{'DBD/SQLite.pm'} or die $!;
+  my $source_code = do {local $/; <$fh>};
+  my $long_doc = $source_code x 5;
+
+  my $t0 = time;
+  $insert_sth->execute($long_doc);
+  my $t1 = time;
+  $result = $dbh->selectcol_arrayref($sql_snip, undef, '"package DBD::SQLite"');
+  my $t2 = time;
+
+  note sprintf("long doc (%d chars): insert in %.4f secs, select in %.4f secs",
+               length($long_doc), $t1-$t0, $t2-$t1);
+  like($result->[0], qr[^<b>package</b> <b>DBD</b>::<b>SQLite</b>;], 'snippet "package SQLite"');
 }
 }