From be9f64b2c9879cb57865e8e7591eb568283c55b5 Mon Sep 17 00:00:00 2001
From: dami
Date: Sun, 9 May 2021 19:25:27 +0200
Subject: [PATCH 1/7] tests for functions snippet() and offsets()

---
 t/43_fts3.t | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/t/43_fts3.t b/t/43_fts3.t
index 934c8de..d8dbec2 100644
--- a/t/43_fts3.t
+++ b/t/43_fts3.t
@@ -24,6 +24,13 @@ my @tests = (
  ["(il OR elle) AND un*" => 0, 2 ],
 );
 
+
+
+my $ix_une_native = index($texts[0], "une");
+my $ix_une_utf8   = do {use bytes; utf8::upgrade(my $bergere_utf8 = $texts[0]); index($bergere_utf8, "une");};
+
+
+
 BEGIN {
 	requires_unicode_support();
 
@@ -105,6 +112,19 @@ for my $use_unicode (0, 1) {
       is_deeply($results, \@expected, "$query ($fts, unicode=$use_unicode)");
     }
   }
+
+
+  # the 'snippet' function should highlight the words in the MATCH query
+  my $sql_snip = "SELECT snippet(try_$fts) FROM try_$fts WHERE content MATCH ?";
+  my $result = $dbh->selectcol_arrayref($sql_snip, undef, 'une');
+  is_deeply($result, ['il était <b>une</b> bergère'], "snippet ($fts, unicode=$use_unicode)");
+
+  # the 'offsets' function should return integer offstes for the words in the MATCH query
+  my $sql_offsets = "SELECT offsets(try_$fts) FROM try_$fts WHERE content MATCH ?";
+  $result = $dbh->selectcol_arrayref($sql_offsets, undef, 'une');
+  my $offset_une = $use_unicode ? $ix_une_utf8 : $ix_une_native;
+  my $expected_offsets = "0 0 $offset_une 3";
+  is_deeply($result, [$expected_offsets], "offsets ($fts, unicode=$use_unicode)");
 }
 }
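
Note on the offsets() expectations in PATCH 1: for each match, FTS3's offsets()
emits four space-separated integers -- the column number, the index of the
matching term in the MATCH expression, the byte offset of the match, and its
size in bytes. Offsets are in bytes, which is why the test prepares both
$ix_une_native and $ix_une_utf8. A minimal standalone sketch (illustrative,
not part of the patch) of why the two indexes differ:

    use strict;
    use warnings;
    use utf8;

    my $text = "il était une bergère";

    # character offset of "une": 9 ("é" counts as one char)
    my $char_offset = index($text, "une");

    # byte offset of "une" in the UTF-8 representation: 10 ("é" takes 2 bytes)
    my $byte_offset = do {
        utf8::upgrade(my $copy = $text);   # force the UTF-8 representation
        use bytes;                         # make index() count bytes
        index($copy, "une");
    };

    printf "char offset: %d, byte offset: %d\n", $char_offset, $byte_offset;
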
From b5c3f9d5281e8230c580f378965312300e1e9438 Mon Sep 17 00:00:00 2001
From: dami
Date: Sun, 9 May 2021 22:06:28 +0200
Subject: [PATCH 2/7] fixed #75 -- lastCharOffset must copy the OLD value of
 piEndOffset, BEFORE it is recomputed

Also fixed 43_fts3.t so that it uses \p{Word} instead of \w, because the
en-us locale did not handle accented characters.
---
 dbdimp_tokenizer.inc | 27 +++++++++++++++++++--------
 t/43_fts3.t          | 22 ++++------------------
 2 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/dbdimp_tokenizer.inc b/dbdimp_tokenizer.inc
index d48409a..286be37 100644
--- a/dbdimp_tokenizer.inc
+++ b/dbdimp_tokenizer.inc
@@ -205,22 +205,33 @@ static int perl_tokenizer_Next(
         *pnBytes       = POPi;
         token          = POPpx;
 
         if (c->pInput) { /* if working with utf8 data */
+
+#ifdef DEBUG_OFFSETS
+          warn("INI: token: %s, start=%d, end=%d, nBytes=%d\n", token, *piStartOffset, *piEndOffset, *pnBytes);
+#endif
+
+          /* recompute *pnBytes in bytes, not in chars */
           *pnBytes = strlen(token);
 
-          /* recompute start/end offsets in bytes, not in chars */
-          hop        = *piStartOffset - c->lastCharOffset;
-          byteOffset = (char*)utf8_hop((U8*)c->lastByteOffset, hop);
-          hop        = *piEndOffset - *piStartOffset;
-          *piStartOffset = byteOffset - c->pInput;
-          byteOffset = (char*)utf8_hop((U8*)byteOffset, hop);
-          *piEndOffset = byteOffset - c->pInput;
+          /* recompute start offset in bytes, not in chars */
+          hop        = *piStartOffset - c->lastCharOffset;
+          byteOffset = (char*)utf8_hop((U8*)c->lastByteOffset, hop);
+          hop        = *piEndOffset - *piStartOffset;
+          *piStartOffset = byteOffset - c->pInput;
+          byteOffset = (char*)utf8_hop((U8*)byteOffset, hop);
 
           /* remember where we are for next round */
-          c->lastCharOffset = *piEndOffset,
+          c->lastCharOffset = *piEndOffset;
           c->lastByteOffset = byteOffset;
+
+          /* recompute end offset in bytes, not in chars */
+          *piEndOffset = byteOffset - c->pInput;
+
+#ifdef DEBUG_OFFSETS
+          warn("FIX: token: %s, start=%d, end=%d, nBytes=%d\n", token, *piStartOffset, *piEndOffset, *pnBytes);
+#endif
         }
 
         /* make sure we have enough storage
           for copying the token */

diff --git a/t/43_fts3.t b/t/43_fts3.t
index d8dbec2..65ff0b4 100644
--- a/t/43_fts3.t
+++ b/t/43_fts3.t
@@ -42,25 +42,11 @@ BEGIN {
   }
 }
 
-# Perl may spit a warning on locale
-# use Test::NoWarnings;
-BEGIN {
-    # Sadly perl for windows (and probably sqlite, too) may hang
-    # if the system locale doesn't support european languages.
-    # en-us should be a safe default. if it doesn't work, use 'C'.
-    if ( $^O eq 'MSWin32') {
-        use POSIX 'locale_h';
-        setlocale(LC_COLLATE, 'en-us');
-    }
-}
-
-use locale;
-
-sub locale_tokenizer { # see also: Search::Tokenizer
+sub Unicode_Word_tokenizer { # see also: Search::Tokenizer
   return sub {
-    my $string = shift;
-    my $regex  = qr/\w+/;
+    my $string     = shift;
+    my $regex      = qr/\p{Word}+/;
     my $term_index = 0;
 
     return sub {
@@ -85,7 +71,7 @@ for my $use_unicode (0, 1) {
   # create fts table
   $dbh->do(<<"") or die DBI::errstr;
     CREATE VIRTUAL TABLE try_$fts
-           USING $fts(content, tokenize=perl 'main::locale_tokenizer')
+           USING $fts(content, tokenize=perl 'main::Unicode_Word_tokenizer')
 
   # populate it
   my $insert_sth = $dbh->prepare(<<"") or die DBI::errstr;
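
Note on the fix in PATCH 2: the Perl tokenizer reports character offsets, but
SQLite wants byte offsets, so perl_tokenizer_Next() converts them with
utf8_hop(), hopping from the position remembered in the previous call. Bug #75
was an ordering problem: lastCharOffset was assigned from *piEndOffset after
that variable had already been overwritten with a byte offset, so every
subsequent hop started from the wrong place. A rough Perl model of the intended
incremental conversion (an illustrative sketch, not the module's API):

    use strict;
    use warnings;
    use utf8;
    use Encode qw(encode_utf8);

    # returns a closure converting non-decreasing char offsets to byte
    # offsets, remembering the last (char, byte) pair between calls --
    # the role played by lastCharOffset/lastByteOffset in the C code
    sub make_char2byte {
        my ($string) = @_;
        my ($last_char, $last_byte) = (0, 0);
        return sub {
            my ($char_offset) = @_;
            my $hopped = substr($string, $last_char, $char_offset - $last_char);
            $last_byte += length(encode_utf8($hopped));  # like utf8_hop()
            $last_char  = $char_offset;
            return $last_byte;
        };
    }

    my $char2byte = make_char2byte("il était une bergère");
    printf "start=%d end=%d\n", $char2byte->(9), $char2byte->(12);  # 10 and 13

The closure must always be fed character offsets; feeding it a value that has
already been converted to bytes is exactly the mistake the patch removes.
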
From 8d5a28cf5c9fb77cfd95553a5a0442b64ba794ec Mon Sep 17 00:00:00 2001
From: dami
Date: Sun, 9 May 2021 22:22:39 +0200
Subject: [PATCH 3/7] additional test for snippet()

---
 t/43_fts3.t | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/t/43_fts3.t b/t/43_fts3.t
index 65ff0b4..7082e6c 100644
--- a/t/43_fts3.t
+++ b/t/43_fts3.t
@@ -99,18 +99,24 @@ for my $use_unicode (0, 1) {
     }
   }
 
-
   # the 'snippet' function should highlight the words in the MATCH query
   my $sql_snip = "SELECT snippet(try_$fts) FROM try_$fts WHERE content MATCH ?";
   my $result = $dbh->selectcol_arrayref($sql_snip, undef, 'une');
   is_deeply($result, ['il était <b>une</b> bergère'], "snippet ($fts, unicode=$use_unicode)");
 
-  # the 'offsets' function should return integer offstes for the words in the MATCH query
+  # the 'offsets' function should return integer offsets for the words in the MATCH query
   my $sql_offsets = "SELECT offsets(try_$fts) FROM try_$fts WHERE content MATCH ?";
   $result = $dbh->selectcol_arrayref($sql_offsets, undef, 'une');
   my $offset_une = $use_unicode ? $ix_une_utf8 : $ix_une_native;
   my $expected_offsets = "0 0 $offset_une 3";
   is_deeply($result, [$expected_offsets], "offsets ($fts, unicode=$use_unicode)");
+
+  # test a longer sentence
+  $dbh->do("INSERT INTO try_$fts(content) VALUES(?)", {}, join(" ", @texts));
+  $result = $dbh->selectcol_arrayref($sql_snip, undef, '"bergère qui gardait"');
+  like($result->[0],
+       qr[une <b>bergère</b> <b>qui</b> <b>gardait</b> ses],
+       "longer snippet ($fts, unicode=$use_unicode)");
 }
 }
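
Note on the snippet() tests in PATCHES 1 and 3: the <b>...</b> markers are
snippet()'s defaults. The function also accepts optional arguments -- start
marker, end marker, ellipsis text, column number, and approximate token count
-- so the highlighting can be customized. A hedged usage fragment (assumes a
$dbh and the try_fts3 table built by this test):

    # plain-text highlighting, ~10 tokens of context per snippet
    my $sql = "SELECT snippet(try_fts3, '[', ']', '...', -1, 10) "
            . "FROM try_fts3 WHERE content MATCH ?";
    my $snippets = $dbh->selectcol_arrayref($sql, undef, 'bergère');
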
From f68f79415f083d62398497885eb7ec955a884255 Mon Sep 17 00:00:00 2001
From: dami
Date: Tue, 11 May 2021 09:01:59 +0200
Subject: [PATCH 4/7] test to show that the perl tokenizer with unicode is
 extremely slow

---
 t/43_fts3.t | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/t/43_fts3.t b/t/43_fts3.t
index 7082e6c..f5fb0b0 100644
--- a/t/43_fts3.t
+++ b/t/43_fts3.t
@@ -2,11 +2,19 @@
 use strict;
 use warnings;
 no if $] >= 5.022, "warnings", "locale";
 use lib "t/lib";
+
+# TMP for running tests from Emacs
+use lib "lib";
+use lib "../blib/lib";
+use lib "../blib/arch";
+
+use Time::HiRes qw/time/;
 use SQLiteTest;
 use Test::More;
 use if -d ".git", "Test::FailWarnings";
 use DBD::SQLite;
+
 my @texts = ("il était une bergère",
              "qui gardait ses moutons",
              "elle fit un fromage",
@@ -111,12 +119,27 @@ for my $use_unicode (0, 1) {
   my $expected_offsets = "0 0 $offset_une 3";
   is_deeply($result, [$expected_offsets], "offsets ($fts, unicode=$use_unicode)");
 
-  # test a longer sentence
-  $dbh->do("INSERT INTO try_$fts(content) VALUES(?)", {}, join(" ", @texts));
+  # test snippet() on a longer sentence
+  $insert_sth->execute(join " ", @texts);
   $result = $dbh->selectcol_arrayref($sql_snip, undef, '"bergère qui gardait"');
   like($result->[0],
        qr[une <b>bergère</b> <b>qui</b> <b>gardait</b> ses],
        "longer snippet ($fts, unicode=$use_unicode)");
+
+  # simulated large document
+  open my $fh, "<", $INC{'DBD/SQLite.pm'} or die $!;
+  my $source_code = do {local $/; <$fh>};
+  my $long_doc = $source_code x 1;
+
+  my $t0 = time;
+  $insert_sth->execute($long_doc);
+  my $t1 = time;
+  $result = $dbh->selectcol_arrayref($sql_snip, undef, '"package DBD::SQLite"');
+  my $t2 = time;
+
+  note sprintf("long doc (%d chars): insert in %.4f secs, select in %.4f secs",
+               length($long_doc), $t1-$t0, $t2-$t1);
+  like($result->[0], qr[^<b>package</b> <b>DBD</b>::<b>SQLite</b>;], 'snippet "package SQLite"');
 }
 }
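
Note on PATCH 4: the benchmark inserts DBD::SQLite's own source as one big
document, and PATCH 6 attributes part of the slowness to reading match offsets
through the @-/@+ capture-offset arrays on a long string. A self-contained
sketch of that comparison (illustrative; exact timings will vary by perl
version and machine):

    use strict;
    use warnings;
    use Time::HiRes qw(time);

    my $doc = "word " x 200_000;   # simulated large document

    my $t0 = time;
    my $count = 0;
    while ($doc =~ /\w+/g) { my $end = $+[0]; $count++ }   # capture-offset array
    my $t1 = time;
    while ($doc =~ /\w+/g) { my $end = pos $doc }           # pos() instead
    my $t2 = time;

    printf "%d tokens: \$+[0] %.3fs, pos() %.3fs\n", $count, $t1-$t0, $t2-$t1;
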
From 80c7c91ae6531c9466bc1a63abae0ac06fd1e60f Mon Sep 17 00:00:00 2001
From: dami
Date: Tue, 11 May 2021 09:40:24 +0200
Subject: [PATCH 5/7] better variable names and comments

---
 dbdimp_tokenizer.inc | 36 ++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/dbdimp_tokenizer.inc b/dbdimp_tokenizer.inc
index 286be37..49159f2 100644
--- a/dbdimp_tokenizer.inc
+++ b/dbdimp_tokenizer.inc
@@ -12,8 +12,8 @@ typedef struct perl_tokenizer_cursor {
 
   /* members below are only used if the input string is in utf8 */
   const char *pInput;          /* input we are tokenizing */
-  const char *lastByteOffset;  /* offset into pInput */
-  int         lastCharOffset;  /* char offset corresponding to lastByteOffset */
+  const char *currentByte;     /* pointer into pInput */
+  int         currentChar;     /* char corresponding to currentByte */
 } perl_tokenizer_cursor;
 
 /*
@@ -108,9 +108,9 @@ static int perl_tokenizer_Open(
 
   /* special handling if working with utf8 strings */
   if (MY_CXT.last_dbh_is_unicode) {
-    /* data to keep track of byte offsets */
-    c->lastByteOffset = c->pInput = pInput;
-    c->lastCharOffset = 0;
+    /* data to keep track of byte positions */
+    c->currentByte = c->pInput = pInput;
+    c->currentChar = 0;
 
     /* string passed to Perl needs to be flagged as utf8 */
     flags |= SVf_UTF8;
@@ -174,7 +174,7 @@ static int perl_tokenizer_Next(
   int result;
   int n_retval;
   char *token;
-  char *byteOffset;
+  char *nextByte;
   STRLEN n_a; /* this is required for older perls < 5.8.8 */
   I32 hop;
 
@@ -215,19 +215,27 @@ static int perl_tokenizer_Next(
           /* recompute *pnBytes in bytes, not in chars */
           *pnBytes = strlen(token);
 
-          /* recompute start offset in bytes, not in chars */
-          hop        = *piStartOffset - c->lastCharOffset;
-          byteOffset = (char*)utf8_hop((U8*)c->lastByteOffset, hop);
+          /* nb of chars from last position to the start of the token */
+          hop = *piStartOffset - c->currentChar;
+
+          /* advance to the first byte in token */
+          nextByte = (char*)utf8_hop((U8*)c->currentByte, hop);
+
+          /* nb of chars in token */
           hop = *piEndOffset - *piStartOffset;
-          *piStartOffset = byteOffset - c->pInput;
-          byteOffset = (char*)utf8_hop((U8*)byteOffset, hop);
+
+          /* recompute start offset in bytes, not in chars */
+          *piStartOffset = nextByte - c->pInput;
+
+          /* advance past the last byte in token */
+          nextByte = (char*)utf8_hop((U8*)nextByte, hop);
 
           /* remember where we are for next round */
-          c->lastCharOffset = *piEndOffset;
-          c->lastByteOffset = byteOffset;
+          c->currentChar = *piEndOffset;
+          c->currentByte = nextByte;
 
           /* recompute end offset in bytes, not in chars */
-          *piEndOffset = byteOffset - c->pInput;
+          *piEndOffset = nextByte - c->pInput;
 
 #ifdef DEBUG_OFFSETS
           warn("FIX: token: %s, start=%d, end=%d, nBytes=%d\n", token, *piStartOffset, *piEndOffset, *pnBytes);
#endif
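
Note on PATCH 5's two-hop scheme: each call converts the tokenizer's character
offsets into byte offsets with two relative hops from the remembered position,
instead of rescanning from the start of the string. Worked numbers for the
token "bergère" in "il était une bergère" (an illustrative trace, assuming the
previous token was "une"):

    # after "une": currentChar = 12, currentByte = 13 ("é" already cost 1 extra byte)
    # the tokenizer reports "bergère" as chars 13..20
    #
    # hop 1 = 13 - 12 = 1 char   -> advance 1 byte       => *piStartOffset = 14
    # hop 2 = 20 - 13 = 7 chars  -> "è" takes 2 bytes    => *piEndOffset   = 22
    # *pnBytes = strlen("bergère") = 8   (and 14 + 8 == 22)
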
From 6669dbc332a3a7544b6a696a351964d72e224343 Mon Sep 17 00:00:00 2001
From: dami
Date: Thu, 13 May 2021 12:27:57 +0200
Subject: [PATCH 6/7] cleanup tokenizer code and test

---
 dbdimp_tokenizer.inc | 47 +++++++++++++++++---------------------------
 t/43_fts3.t          | 19 +++++-------------
 2 files changed, 23 insertions(+), 43 deletions(-)

diff --git a/dbdimp_tokenizer.inc b/dbdimp_tokenizer.inc
index 49159f2..83ab40a 100644
--- a/dbdimp_tokenizer.inc
+++ b/dbdimp_tokenizer.inc
@@ -13,7 +13,7 @@ typedef struct perl_tokenizer_cursor {
   /* members below are only used if the input string is in utf8 */
   const char *pInput;          /* input we are tokenizing */
   const char *currentByte;     /* pointer into pInput */
-  int         currentChar;     /* char corresponding to currentByte */
+  int         currentChar;     /* char position corresponding to currentByte */
 } perl_tokenizer_cursor;
 
 /*
@@ -134,7 +134,7 @@ static int perl_tokenizer_Open(
 
   /* store the cursor coderef returned by the tokenizer */
   if (n_retval != 1) {
-    warn("tokenizer returned %d arguments", n_retval);
+    warn("tokenizer returned %d arguments, expected 1", n_retval);
   }
   c->coderef = newSVsv(POPs);
 
@@ -164,11 +164,11 @@ static int perl_tokenizer_Close(sqlite3_tokenizer_cursor *pCursor){
  */
 static int perl_tokenizer_Next(
   sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by perl_tokenizer_Open */
-  const char **ppToken,               /* OUT: *ppToken is the token text */
-  int *pnBytes,                       /* OUT: Number of bytes in token */
-  int *piStartOffset,                 /* OUT: Starting offset of token */
-  int *piEndOffset,                   /* OUT: Ending offset of token */
-  int *piPosition                     /* OUT: Position integer of token */
+  const char **ppToken,               /* OUT: Normalized text for token */
+  int *pnBytes,                       /* OUT: Number of bytes in normalized text */
+  int *piStartOffset,                 /* Starting offset of token. IN: char offset; OUT: byte offset */
+  int *piEndOffset,                   /* Ending offset of token.   IN: char offset; OUT: byte offset */
+  int *piPosition                     /* OUT: Number of tokens returned before this one */
 ){
   perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor;
   int result;
@@ -197,7 +197,7 @@ static int perl_tokenizer_Next(
   /* otherwise, get token details from the return list */
   else {
     if (n_retval != 5) {
-      warn("tokenizer cursor returned %d arguments", n_retval);
+      warn("tokenizer cursor returned %d arguments, expected 5", n_retval);
     }
     *piPosition    = POPi;
     *piEndOffset   = POPi;
@@ -205,41 +205,31 @@ static int perl_tokenizer_Next(
     *pnBytes       = POPi;
     token          = POPpx;
 
     if (c->pInput) { /* if working with utf8 data */
-
-#ifdef DEBUG_OFFSETS
-      warn("INI: token: %s, start=%d, end=%d, nBytes=%d\n", token, *piStartOffset, *piEndOffset, *pnBytes);
-#endif
-
-      /* recompute *pnBytes in bytes, not in chars */
-      *pnBytes = strlen(token);
-
-      /* nb of chars from last position to the start of the token */
+      /* compute first hop: nb of chars from last position to the start of the token */
       hop = *piStartOffset - c->currentChar;
 
-      /* advance to the first byte in token */
+      /* first hop: advance to the first byte in token */
       nextByte = (char*)utf8_hop((U8*)c->currentByte, hop);
 
-      /* nb of chars in token */
+      /* compute 2nd hop: nb of chars from start of the token to end of token */
       hop = *piEndOffset - *piStartOffset;
 
-      /* recompute start offset in bytes, not in chars */
+      /* now recompute the start offset in bytes, not in chars */
       *piStartOffset = nextByte - c->pInput;
 
-      /* advance past the last byte in token */
+      /* 2nd hop: advance past the last byte in token */
      nextByte = (char*)utf8_hop((U8*)nextByte, hop);
 
-      /* remember where we are for next round */
+      /* remember current position (useful for the next invocation) */
       c->currentChar = *piEndOffset;
       c->currentByte = nextByte;
 
-      /* recompute end offset in bytes, not in chars */
+      /* now recompute the end offset in bytes, not in chars */
       *piEndOffset = nextByte - c->pInput;
 
-#ifdef DEBUG_OFFSETS
-      warn("FIX: token: %s, start=%d, end=%d, nBytes=%d\n", token, *piStartOffset, *piEndOffset, *pnBytes);
-#endif
+      /* compute the size of the normalized token in bytes, not in chars */
+      *pnBytes = strlen(token);
     }
 
     /* make sure we have enough storage for copying the token */
@@ -251,8 +241,7 @@ static int perl_tokenizer_Next(
     c->pToken = pNew;
   }
 
-  /* need to copy the token into the C cursor before perl frees that
-     memory */
+  /* need to copy the token into the C cursor before perl frees that memory */
   memcpy(c->pToken, token, *pnBytes);
   *ppToken  = c->pToken;
 
diff --git a/t/43_fts3.t b/t/43_fts3.t
index f5fb0b0..c6170b9 100644
--- a/t/43_fts3.t
+++ b/t/43_fts3.t
@@ -1,13 +1,6 @@
 use strict;
 use warnings;
-no if $] >= 5.022, "warnings", "locale";
 use lib "t/lib";
-
-# TMP for running tests from Emacs
-use lib "lib";
-use lib "../blib/lib";
-use lib "../blib/arch";
-
 use Time::HiRes qw/time/;
 use SQLiteTest;
 use Test::More;
@@ -32,13 +25,9 @@ my @tests = (
  ["(il OR elle) AND un*" => 0, 2 ],
 );
 
-
-
 my $ix_une_native = index($texts[0], "une");
 my $ix_une_utf8   = do {use bytes; utf8::upgrade(my $bergere_utf8 = $texts[0]); index($bergere_utf8, "une");};
 
-
-
 BEGIN {
   requires_unicode_support();
 
@@ -59,8 +48,10 @@ sub Unicode_Word_tokenizer { # see also: Search::Tokenizer
     return sub {
       $string =~ /$regex/g or return; # either match, or no more token
 
-      my ($start, $end) = ($-[0], $+[0]);
-      my $term = substr($string, $start, my $len = $end - $start);
+      my $term  = $&;
+      my $end   = pos $string;   # $+[0] is much slower
+      my $len   = length($term);
+      my $start = $end - $len;
 
       return ($term, $len, $start, $end, $term_index++);
     };
@@ -129,7 +120,7 @@ for my $use_unicode (0, 1) {
   # simulated large document
   open my $fh, "<", $INC{'DBD/SQLite.pm'} or die $!;
   my $source_code = do {local $/; <$fh>};
-  my $long_doc = $source_code x 1;
+  my $long_doc = $source_code x 5;
 
   my $t0 = time;
   $insert_sth->execute($long_doc);

From a512146d4749eb65c8e5cb66118aef0f2d629b1a Mon Sep 17 00:00:00 2001
From: dami
Date: Thu, 13 May 2021 19:35:52 +0200
Subject: [PATCH 7/7] fix #74 -- add a dependency from dbdimp.o to the *.inc
 files included into dbdimp.c

---
 Makefile.PL | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Makefile.PL b/Makefile.PL
index 4f28a08..b2297a5 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -394,6 +394,9 @@ WriteMakefile(
         ? '$(O_FILES)'
         : 'SQLite.o dbdimp.o'
     ),
+    depend => {
+        'dbdimp.o' => 'dbdimp_tokenizer.inc dbdimp_virtual_table.inc',
+    },
     clean => {
         FILES => 'SQLite.xsi config.h tv.log *.old',
     },
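
Note on PATCH 7: depend is a documented ExtUtils::MakeMaker attribute; each
key/value pair becomes an extra rule in the generated Makefile, here
"dbdimp.o : dbdimp_tokenizer.inc dbdimp_virtual_table.inc", so touching either
included .inc file forces dbdimp.o to be rebuilt even though only dbdimp.c
appears in the source list. A minimal sketch of the same idea in an unrelated,
hypothetical build:

    use ExtUtils::MakeMaker;
    WriteMakefile(
        NAME   => 'My::XSModule',          # hypothetical module
        depend => {
            # object file => space-separated extra prerequisites
            'MyModule.o' => 'helpers.inc tables.inc',
        },
    );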