Merge pull request #76 from DBD-SQLite/#75_fix_fts3_offsets

#75 fix fts3 offsets
2025-06-07 14:19:10 -04:00 · 2021-05-30 20:44:46 +09:00 · 2021-05-30 20:44:46 +09:00 · 57d2dd7fb5
commit 57d2dd7fb5
parent aa90eee3a0 a512146d47
3 changed files with 85 additions and 48 deletions
--- a/Makefile.PL
+++ b/Makefile.PL
@ -394,6 +394,9 @@ WriteMakefile(
 		? '$(O_FILES)'
 		: 'SQLite.o dbdimp.o'
 	),
 	depend           => {
        	'dbdimp.o' => 'dbdimp_tokenizer.inc dbdimp_virtual_table.inc',
 	},
 	clean            => {
 		FILES => 'SQLite.xsi config.h tv.log *.old',
 	},
--- a/dbdimp_tokenizer.inc
+++ b/dbdimp_tokenizer.inc
@ -12,8 +12,8 @@ typedef struct perl_tokenizer_cursor {
    /* members below are only used if the input string is in utf8 */
    const char *pInput;          /* input we are tokenizing */
-    const char *lastByteOffset;  /* offset into pInput */
+    const char *currentByte;     /* pointer into pInput */
-    int lastCharOffset;          /* char offset corresponding to lastByteOffset */
+    int         currentChar;     /* char position corresponding to currentByte */
 } perl_tokenizer_cursor;
 /*
@ -108,9 +108,9 @@ static int perl_tokenizer_Open(
    /* special handling if working with utf8 strings */
    if (MY_CXT.last_dbh_is_unicode) {
-        /* data to keep track of byte offsets */
+        /* data to keep track of byte positions */
-        c->lastByteOffset = c->pInput = pInput;
+        c->currentByte = c->pInput = pInput;
-        c->lastCharOffset = 0;
+        c->currentChar = 0;
        /* string passed to Perl needs to be flagged as utf8 */
        flags |= SVf_UTF8;
@ -134,7 +134,7 @@ static int perl_tokenizer_Open(
    /* store the cursor coderef returned by the tokenizer */
    if (n_retval != 1) {
-        warn("tokenizer returned %d arguments", n_retval);
+        warn("tokenizer returned %d arguments, expected 1", n_retval);
    }
    c->coderef = newSVsv(POPs);
@ -164,17 +164,17 @@ static int perl_tokenizer_Close(sqlite3_tokenizer_cursor *pCursor){
 */
 static int perl_tokenizer_Next(
    sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by perl_tokenizer_Open */
-    const char **ppToken,               /* OUT: *ppToken is the token text */
+    const char **ppToken,               /* OUT: Normalized text for token */
-    int *pnBytes,                       /* OUT: Number of bytes in token */
+    int *pnBytes,                       /* OUT: Number of bytes in normalized text */
-    int *piStartOffset,                 /* OUT: Starting offset of token */
+    int *piStartOffset,                 /* Starting offset of token. IN : char offset; OUT : byte offset */
-    int *piEndOffset,                   /* OUT: Ending offset of token */
+    int *piEndOffset,                   /* Ending offset of token.   IN : char offset; OUT : byte offset */
-    int *piPosition                     /* OUT: Position integer of token */
+    int *piPosition                     /* OUT: Number of tokens returned before this one */
 ){
    perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor;
    int result;
    int n_retval;
    char *token;
-    char *byteOffset;
+    char *nextByte;
    STRLEN n_a; /* this is required for older perls < 5.8.8 */
    I32 hop;
@ -197,7 +197,7 @@ static int perl_tokenizer_Next(
    /* otherwise, get token details from the return list */
    else {
        if (n_retval != 5) {
-            warn("tokenizer cursor returned %d arguments", n_retval);
+            warn("tokenizer cursor returned %d arguments, expected 5", n_retval);
        }
        *piPosition    = POPi;
        *piEndOffset   = POPi;
@ -206,21 +206,30 @@ static int perl_tokenizer_Next(
        token          = POPpx;
        if (c->pInput) { /* if working with utf8 data */
            /* compute first hop : nb of chars from last position to the start of the token */
            hop               = *piStartOffset - c->currentChar;
-            /* recompute *pnBytes in bytes, not in chars */
+            /* hop: advance to the first byte in token */
-            *pnBytes = strlen(token);
+            nextByte          = (char*)utf8_hop((U8*)c->currentByte, hop);
-            /* recompute start/end offsets in bytes, not in chars */
+            /* compute 2nd hop : nb of chars from start of the token to end of token */
-            hop            = *piStartOffset - c->lastCharOffset;
+            hop               = *piEndOffset - *piStartOffset;
            byteOffset     = (char*)utf8_hop((U8*)c->lastByteOffset, hop);
            hop            = *piEndOffset - *piStartOffset;
            *piStartOffset = byteOffset - c->pInput;
            byteOffset     = (char*)utf8_hop((U8*)byteOffset, hop);
            *piEndOffset   = byteOffset - c->pInput;
-            /* remember where we are for next round */
+            /* now recompute the start offset in bytes, not in chars */
-            c->lastCharOffset = *piEndOffset,
+            *piStartOffset    = nextByte - c->pInput;
-            c->lastByteOffset = byteOffset;
+
            /* 2nd hop: advance past to the last byte in token */
            nextByte          = (char*)utf8_hop((U8*)nextByte, hop);
            /* remember current position (useful for the next invocation) */
            c->currentChar    = *piEndOffset;
            c->currentByte    = nextByte;
            /* now recompute the end offset in bytes, not in chars */
            *piEndOffset      = nextByte - c->pInput;
            /* compute the size of the normalized token in bytes, not in chars */
            *pnBytes          = strlen(token);
        }
        /* make sure we have enough storage for copying the token */
@ -232,8 +241,7 @@ static int perl_tokenizer_Next(
            c->pToken = pNew;
        }
-        /* need to copy the token into the C cursor before perl frees that
+        /* need to copy the token into the C cursor before perl frees that memory */
           memory */
        memcpy(c->pToken, token, *pnBytes);
        *ppToken  = c->pToken;
--- a/t/43_fts3.t
+++ b/t/43_fts3.t
@ -1,12 +1,13 @@
 use strict;
 use warnings;
 no if $] >= 5.022, "warnings", "locale";
 use lib "t/lib";
 use Time::HiRes qw/time/;
 use SQLiteTest;
 use Test::More;
 use if -d ".git", "Test::FailWarnings";
 use DBD::SQLite;
 my @texts = ("il était une bergère",
             "qui gardait ses moutons",
             "elle fit un fromage",
@ -24,6 +25,9 @@ my @tests = (
  ["(il OR elle) AND un*" => 0, 2    ],
 );
 my $ix_une_native = index($texts[0], "une");
 my $ix_une_utf8   = do {use bytes; utf8::upgrade(my $bergere_utf8 = $texts[0]); index($bergere_utf8, "une");};
 BEGIN {
 	requires_unicode_support();
@ -35,31 +39,19 @@ BEGIN {
 	}
 }
 # Perl may spit a warning on locale
 # use Test::NoWarnings;
-BEGIN {
+sub Unicode_Word_tokenizer { # see also: Search::Tokenizer
 	# Sadly perl for windows (and probably sqlite, too) may hang
 	# if the system locale doesn't support european languages.
 	# en-us should be a safe default. if it doesn't work, use 'C'.
 	if ( $^O eq 'MSWin32') {
 		use POSIX 'locale_h';
 		setlocale(LC_COLLATE, 'en-us');
 	}
 }
 use locale;
 sub locale_tokenizer { # see also: Search::Tokenizer
  return sub {
-    my $string = shift;
+    my $string     = shift;
-    my $regex      = qr/\w+/;
+    my $regex      = qr/\p{Word}+/;
    my $term_index = 0;
    return sub {
      $string =~ /$regex/g or return; # either match, or no more token
-      my ($start, $end) = ($-[0], $+[0]);
+      my $term  = $&;
-      my $term = substr($string, $start, my $len = $end-$start);
+      my $end   = pos $string; # $+[0] is much slower
      my $len   = length($term);
      my $start = $end - $len;
      return ($term, $len, $start, $end, $term_index++);
    };
  };
@ -78,7 +70,7 @@ for my $use_unicode (0, 1) {
    # create fts table
    $dbh->do(<<"") or die DBI::errstr;
      CREATE VIRTUAL TABLE try_$fts
-            USING $fts(content, tokenize=perl 'main::locale_tokenizer')
+            USING $fts(content, tokenize=perl 'main::Unicode_Word_tokenizer')
    # populate it
    my $insert_sth = $dbh->prepare(<<"") or die DBI::errstr;
@ -105,6 +97,40 @@ for my $use_unicode (0, 1) {
        is_deeply($results, \@expected, "$query ($fts, unicode=$use_unicode)");
      }
    }
    # the 'snippet' function should highlight the words in the MATCH query
    my $sql_snip = "SELECT snippet(try_$fts) FROM try_$fts WHERE content MATCH ?";
    my $result = $dbh->selectcol_arrayref($sql_snip, undef, 'une');
    is_deeply($result, ['il était <b>une</b> bergère'], "snippet ($fts, unicode=$use_unicode)");
    # the 'offsets' function should return integer offsets for the words in the MATCH query
    my $sql_offsets = "SELECT offsets(try_$fts) FROM try_$fts WHERE content MATCH ?";
    $result = $dbh->selectcol_arrayref($sql_offsets, undef, 'une');
    my $offset_une = $use_unicode ? $ix_une_utf8 : $ix_une_native;
    my $expected_offsets = "0 0 $offset_une 3";
    is_deeply($result, [$expected_offsets], "offsets ($fts, unicode=$use_unicode)");
    # test snippet() on a longer sentence
    $insert_sth->execute(join " ", @texts);
    $result = $dbh->selectcol_arrayref($sql_snip, undef, '"bergère qui gardait"');
    like($result->[0],
         qr[une <b>bergère</b> <b>qui</b> <b>gardait</b> ses],
         "longer snippet ($fts, unicode=$use_unicode)");
    # simulated large document
    open my $fh, "<", $INC{'DBD/SQLite.pm'} or die $!;
    my $source_code = do {local $/; <$fh>};
    my $long_doc    = $source_code x 5;
    my $t0 = time;
    $insert_sth->execute($long_doc);
    my $t1 = time;
    $result = $dbh->selectcol_arrayref($sql_snip, undef, '"package DBD::SQLite"');
    my $t2 = time;
    note sprintf("long doc (%d chars): insert in %.4f secs, select in %.4f secs",
                 length($long_doc), $t1-$t0, $t2-$t1);
    like($result->[0], qr[^<b>package</b> <b>DBD</b>::<b>SQLite</b>;], 'snippet "package SQLite"');
  }
 }