Mirror of https://github.com/DBD-SQLite/DBD-SQLite (synced 2025-06-07 14:19:10 -04:00)
fixed #75 -- lastCharOffset must copy the OLD value of piEndOffset, BEFORE it is recomputed.
Also fixed 43_fts3.t so that it uses \p{Word} instead of \w, because the en-us locale did not handle accented characters.
parent be9f64b2c9
commit b5c3f9d528

2 changed files with 23 additions and 26 deletions
@@ -205,22 +205,33 @@ static int perl_tokenizer_Next(
   *pnBytes = POPi;
   token = POPpx;
 
   if (c->pInput) { /* if working with utf8 data */
 
+#ifdef DEBUG_OFFSETS
+    warn("INI: token: %s, start=%d, end=%d, nBytes=%d\n", token, *piStartOffset, *piEndOffset, *pnBytes);
+#endif
+
     /* recompute *pnBytes in bytes, not in chars */
     *pnBytes = strlen(token);
 
-    /* recompute start/end offsets in bytes, not in chars */
+    /* recompute start offset in bytes, not in chars */
     hop        = *piStartOffset - c->lastCharOffset;
     byteOffset = (char*)utf8_hop((U8*)c->lastByteOffset, hop);
     hop        = *piEndOffset - *piStartOffset;
     *piStartOffset = byteOffset - c->pInput;
     byteOffset = (char*)utf8_hop((U8*)byteOffset, hop);
-    *piEndOffset = byteOffset - c->pInput;
 
     /* remember where we are for next round */
-    c->lastCharOffset = *piEndOffset,
+    c->lastCharOffset = *piEndOffset;
     c->lastByteOffset = byteOffset;
+
+    /* recompute end offset in bytes, not in chars */
+    *piEndOffset = byteOffset - c->pInput;
+
+#ifdef DEBUG_OFFSETS
+    warn("FIX: token: %s, start=%d, end=%d, nBytes=%d\n", token, *piStartOffset, *piEndOffset, *pnBytes);
+#endif
   }
 
   /* make sure we have enough storage for copying the token */
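The ordering matters because SQLite expects the offsets returned by the tokenizer in bytes, while the Perl callback reports them in characters; each call converts by hopping forward from the previous token's end, remembered as a character offset (lastCharOffset) paired with its byte position (lastByteOffset). Before the fix, lastCharOffset was assigned from *piEndOffset after it had already been rewritten as a byte offset, so the next hop mixed characters and bytes on any input containing multi-byte characters. A minimal plain-Perl sketch of the same bookkeeping (not the XS code; the sample string and variable names are made up):

    # Plain-Perl sketch of the char-offset / byte-offset bookkeeping done in
    # perl_tokenizer_Next; variable names and the sample string are hypothetical.
    use strict;
    use warnings;
    use utf8;
    use open qw(:std :encoding(UTF-8));
    use Encode qw(encode_utf8);

    my $text = "héllo wörld";     # decoded character string

    my $last_char_off = 0;        # plays the role of c->lastCharOffset
    my $last_byte_off = 0;        # plays the role of c->lastByteOffset

    while ($text =~ /\p{Word}+/g) {
        my ($char_start, $char_end) = ($-[0], $+[0]);   # offsets in characters

        # hop forward from the previous end; the hop distance is measured in
        # characters, so $last_char_off must really hold a character offset
        my $byte_start = $last_byte_off
            + length(encode_utf8(substr($text, $last_char_off,
                                         $char_start - $last_char_off)));
        my $byte_end   = $byte_start
            + length(encode_utf8(substr($text, $char_start,
                                         $char_end - $char_start)));

        # remember where we are for the next round BEFORE the character
        # offsets are overwritten -- the ordering this commit restores
        ($last_char_off, $last_byte_off) = ($char_end, $byte_end);

        printf "token=%s chars=%d..%d bytes=%d..%d\n",
               substr($text, $char_start, $char_end - $char_start),
               $char_start, $char_end, $byte_start, $byte_end;
    }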
t/43_fts3.t (22 changed lines)
@@ -42,25 +42,11 @@ BEGIN {
 }
 }
 
-# Perl may spit a warning on locale
-# use Test::NoWarnings;
-
-BEGIN {
-  # Sadly perl for windows (and probably sqlite, too) may hang
-  # if the system locale doesn't support european languages.
-  # en-us should be a safe default. if it doesn't work, use 'C'.
-  if ( $^O eq 'MSWin32') {
-    use POSIX 'locale_h';
-    setlocale(LC_COLLATE, 'en-us');
-  }
-}
-
-use locale;
-
-sub locale_tokenizer { # see also: Search::Tokenizer
+sub Unicode_Word_tokenizer { # see also: Search::Tokenizer
   return sub {
     my $string = shift;
-    my $regex = qr/\w+/;
+    my $regex = qr/\p{Word}+/;
     my $term_index = 0;
 
     return sub {
@@ -85,7 +71,7 @@ for my $use_unicode (0, 1) {
   # create fts table
   $dbh->do(<<"") or die DBI::errstr;
     CREATE VIRTUAL TABLE try_$fts
-       USING $fts(content, tokenize=perl 'main::locale_tokenizer')
+       USING $fts(content, tokenize=perl 'main::Unicode_Word_tokenizer')
 
   # populate it
   my $insert_sth = $dbh->prepare(<<"") or die DBI::errstr;
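On the test side, what qr/\w+/ matches under "use locale" depends on the active locale, and the en-us locale previously set for Windows did not treat accented letters as word characters, so tokens were split in the middle; \p{Word} applies Unicode rules regardless of locale. A small illustrative snippet (not part of the test file; the sample text is made up):

    # Illustration of \p{Word} vs locale-dependent \w; the sample text is made up.
    use strict;
    use warnings;
    use utf8;
    use open qw(:std :encoding(UTF-8));

    my $string = "Ça va très bien";

    # \p{Word} uses Unicode word-character semantics regardless of the system
    # locale, so accented letters stay inside their tokens:
    my @tokens = $string =~ /\p{Word}+/g;
    print join('|', @tokens), "\n";    # Ça|va|très|bien

    # Under "use locale" with qr/\w+/ and a locale that knows nothing about
    # 'Ç' or 'è' (like the en-us locale the old test set on Windows), the
    # same text comes out fragmented, roughly: a|va|tr|s|bien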