From be9f64b2c9879cb57865e8e7591eb568283c55b5 Mon Sep 17 00:00:00 2001
From: dami
Date: Sun, 9 May 2021 19:25:27 +0200
Subject: [PATCH 1/7] tests for functions snippet() and offsets()

---
 t/43_fts3.t | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/t/43_fts3.t b/t/43_fts3.t
index 934c8de..d8dbec2 100644
--- a/t/43_fts3.t
+++ b/t/43_fts3.t
@@ -24,6 +24,13 @@ my @tests = (
  ["(il OR elle) AND un*" => 0, 2 ],
 );
 
+
+
+my $ix_une_native = index($texts[0], "une");
+my $ix_une_utf8   = do {use bytes; utf8::upgrade(my $bergere_utf8 = $texts[0]); index($bergere_utf8, "une");};
+
+
+
 BEGIN {
 	requires_unicode_support();
 
@@ -105,6 +112,19 @@ for my $use_unicode (0, 1) {
       is_deeply($results, \@expected, "$query ($fts, unicode=$use_unicode)");
     }
   }
+
+
+  # the 'snippet' function should highlight the words in the MATCH query
+  my $sql_snip = "SELECT snippet(try_$fts) FROM try_$fts WHERE content MATCH ?";
+  my $result = $dbh->selectcol_arrayref($sql_snip, undef, 'une');
+  is_deeply($result, ['il était <b>une</b> bergère'], "snippet ($fts, unicode=$use_unicode)");
+
+  # the 'offsets' function should return integer offstes for the words in the MATCH query
+  my $sql_offsets = "SELECT offsets(try_$fts) FROM try_$fts WHERE content MATCH ?";
+  $result = $dbh->selectcol_arrayref($sql_offsets, undef, 'une');
+  my $offset_une = $use_unicode ? $ix_une_utf8 : $ix_une_native;
+  my $expected_offsets = "0 0 $offset_une 3";
+  is_deeply($result, [$expected_offsets], "offsets ($fts, unicode=$use_unicode)");
 }
 }
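
Note on the offsets() expectations in PATCH 1: for each match, FTS3's offsets()
emits four space-separated integers -- the column number, the index of the
matching term in the MATCH expression, the byte offset of the match, and its
size in bytes. Offsets are in bytes, which is why the test prepares both
$ix_une_native and $ix_une_utf8. A minimal standalone sketch (illustrative,
not part of the patch) of why the two indexes differ:

    use strict;
    use warnings;
    use utf8;

    my $text = "il était une bergère";

    # character offset of "une": 9 ("é" counts as one char)
    my $char_offset = index($text, "une");

    # byte offset of "une" in the UTF-8 representation: 10 ("é" takes 2 bytes)
    my $byte_offset = do {
        utf8::upgrade(my $copy = $text);   # force the UTF-8 representation
        use bytes;                         # make index() count bytes
        index($copy, "une");
    };

    printf "char offset: %d, byte offset: %d\n", $char_offset, $byte_offset;
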
From b5c3f9d5281e8230c580f378965312300e1e9438 Mon Sep 17 00:00:00 2001
From: dami
Date: Sun, 9 May 2021 22:06:28 +0200
Subject: [PATCH 2/7] fixed #75 -- lastCharOffset must copy the OLD value of
 piEndOffset, BEFORE it is recomputed

Also fixed 43_fts3.t so that it uses \p{Word} instead of \w, because the
en-us locale did not handle accented characters.
---
 dbdimp_tokenizer.inc | 27 +++++++++++++++++++--------
 t/43_fts3.t          | 22 ++++------------------
 2 files changed, 23 insertions(+), 26 deletions(-)

diff --git a/dbdimp_tokenizer.inc b/dbdimp_tokenizer.inc
index d48409a..286be37 100644
--- a/dbdimp_tokenizer.inc
+++ b/dbdimp_tokenizer.inc
@@ -205,22 +205,33 @@ static int perl_tokenizer_Next(
         *pnBytes       = POPi;
         token          = POPpx;
 
         if (c->pInput) { /* if working with utf8 data */
+
+#ifdef DEBUG_OFFSETS
+          warn("INI: token: %s, start=%d, end=%d, nBytes=%d\n", token, *piStartOffset, *piEndOffset, *pnBytes);
+#endif
+
+          /* recompute *pnBytes in bytes, not in chars */
           *pnBytes = strlen(token);
 
-          /* recompute start/end offsets in bytes, not in chars */
-          hop        = *piStartOffset - c->lastCharOffset;
-          byteOffset = (char*)utf8_hop((U8*)c->lastByteOffset, hop);
-          hop        = *piEndOffset - *piStartOffset;
-          *piStartOffset = byteOffset - c->pInput;
-          byteOffset = (char*)utf8_hop((U8*)byteOffset, hop);
-          *piEndOffset = byteOffset - c->pInput;
+          /* recompute start offset in bytes, not in chars */
+          hop        = *piStartOffset - c->lastCharOffset;
+          byteOffset = (char*)utf8_hop((U8*)c->lastByteOffset, hop);
+          hop        = *piEndOffset - *piStartOffset;
+          *piStartOffset = byteOffset - c->pInput;
+          byteOffset = (char*)utf8_hop((U8*)byteOffset, hop);
 
           /* remember where we are for next round */
-          c->lastCharOffset = *piEndOffset,
+          c->lastCharOffset = *piEndOffset;
           c->lastByteOffset = byteOffset;
+
+          /* recompute end offset in bytes, not in chars */
+          *piEndOffset = byteOffset - c->pInput;
+
+#ifdef DEBUG_OFFSETS
+          warn("FIX: token: %s, start=%d, end=%d, nBytes=%d\n", token, *piStartOffset, *piEndOffset, *pnBytes);
+#endif
         }
 
         /* make sure we have enough storage
           for copying the token */

diff --git a/t/43_fts3.t b/t/43_fts3.t
index d8dbec2..65ff0b4 100644
--- a/t/43_fts3.t
+++ b/t/43_fts3.t
@@ -42,25 +42,11 @@ BEGIN {
   }
 }
 
-# Perl may spit a warning on locale
-# use Test::NoWarnings;
-BEGIN {
-    # Sadly perl for windows (and probably sqlite, too) may hang
-    # if the system locale doesn't support european languages.
-    # en-us should be a safe default. if it doesn't work, use 'C'.
-    if ( $^O eq 'MSWin32') {
-        use POSIX 'locale_h';
-        setlocale(LC_COLLATE, 'en-us');
-    }
-}
-
-use locale;
-
-sub locale_tokenizer { # see also: Search::Tokenizer
+sub Unicode_Word_tokenizer { # see also: Search::Tokenizer
   return sub {
-    my $string = shift;
-    my $regex  = qr/\w+/;
+    my $string     = shift;
+    my $regex      = qr/\p{Word}+/;
     my $term_index = 0;
 
     return sub {
@@ -85,7 +71,7 @@ for my $use_unicode (0, 1) {
   # create fts table
   $dbh->do(<<"") or die DBI::errstr;
     CREATE VIRTUAL TABLE try_$fts
-           USING $fts(content, tokenize=perl 'main::locale_tokenizer')
+           USING $fts(content, tokenize=perl 'main::Unicode_Word_tokenizer')
 
   # populate it
   my $insert_sth = $dbh->prepare(<<"") or die DBI::errstr;
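
Note on the fix in PATCH 2: the Perl tokenizer reports character offsets, but
SQLite wants byte offsets, so perl_tokenizer_Next() converts them with
utf8_hop(), hopping from the position remembered in the previous call. Bug #75
was an ordering problem: lastCharOffset was assigned from *piEndOffset after
that variable had already been overwritten with a byte offset, so every
subsequent hop started from the wrong place. A rough Perl model of the intended
incremental conversion (an illustrative sketch, not the module's API):

    use strict;
    use warnings;
    use utf8;
    use Encode qw(encode_utf8);

    # returns a closure converting non-decreasing char offsets to byte
    # offsets, remembering the last (char, byte) pair between calls --
    # the role played by lastCharOffset/lastByteOffset in the C code
    sub make_char2byte {
        my ($string) = @_;
        my ($last_char, $last_byte) = (0, 0);
        return sub {
            my ($char_offset) = @_;
            my $hopped = substr($string, $last_char, $char_offset - $last_char);
            $last_byte += length(encode_utf8($hopped));  # like utf8_hop()
            $last_char  = $char_offset;
            return $last_byte;
        };
    }

    my $char2byte = make_char2byte("il était une bergère");
    printf "start=%d end=%d\n", $char2byte->(9), $char2byte->(12);  # 10 and 13

The closure must always be fed character offsets; feeding it a value that has
already been converted to bytes is exactly the mistake the patch removes.
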
From 8d5a28cf5c9fb77cfd95553a5a0442b64ba794ec Mon Sep 17 00:00:00 2001
From: dami
Date: Sun, 9 May 2021 22:22:39 +0200
Subject: [PATCH 3/7] additional test for snippet()

---
 t/43_fts3.t | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/t/43_fts3.t b/t/43_fts3.t
index 65ff0b4..7082e6c 100644
--- a/t/43_fts3.t
+++ b/t/43_fts3.t
@@ -99,18 +99,24 @@ for my $use_unicode (0, 1) {
     }
   }
 
-
   # the 'snippet' function should highlight the words in the MATCH query
   my $sql_snip = "SELECT snippet(try_$fts) FROM try_$fts WHERE content MATCH ?";
   my $result = $dbh->selectcol_arrayref($sql_snip, undef, 'une');
   is_deeply($result, ['il était <b>une</b> bergère'], "snippet ($fts, unicode=$use_unicode)");
 
-  # the 'offsets' function should return integer offstes for the words in the MATCH query
+  # the 'offsets' function should return integer offsets for the words in the MATCH query
   my $sql_offsets = "SELECT offsets(try_$fts) FROM try_$fts WHERE content MATCH ?";
   $result = $dbh->selectcol_arrayref($sql_offsets, undef, 'une');
   my $offset_une = $use_unicode ? $ix_une_utf8 : $ix_une_native;
   my $expected_offsets = "0 0 $offset_une 3";
   is_deeply($result, [$expected_offsets], "offsets ($fts, unicode=$use_unicode)");
+
+  # test a longer sentence
+  $dbh->do("INSERT INTO try_$fts(content) VALUES(?)", {}, join(" ", @texts));
+  $result = $dbh->selectcol_arrayref($sql_snip, undef, '"bergère qui gardait"');
+  like($result->[0],
+       qr[une <b>bergère</b> <b>qui</b> <b>gardait</b> ses],
+       "longer snippet ($fts, unicode=$use_unicode)");
 }
 }
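
Note on the snippet() tests in PATCHES 1 and 3: the <b>...</b> markers are
snippet()'s defaults. The function also accepts optional arguments -- start
marker, end marker, ellipsis text, column number, and approximate token count
-- so the highlighting can be customized. A hedged usage fragment (assumes a
$dbh and the try_fts3 table built by this test):

    # plain-text highlighting, ~10 tokens of context per snippet
    my $sql = "SELECT snippet(try_fts3, '[', ']', '...', -1, 10) "
            . "FROM try_fts3 WHERE content MATCH ?";
    my $snippets = $dbh->selectcol_arrayref($sql, undef, 'bergère');
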
From f68f79415f083d62398497885eb7ec955a884255 Mon Sep 17 00:00:00 2001
From: dami
Date: Tue, 11 May 2021 09:01:59 +0200
Subject: [PATCH 4/7] test to show that the perl tokenizer with unicode is
 extremely slow

---
 t/43_fts3.t | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/t/43_fts3.t b/t/43_fts3.t
index 7082e6c..f5fb0b0 100644
--- a/t/43_fts3.t
+++ b/t/43_fts3.t
@@ -2,11 +2,19 @@
 use strict;
 use warnings;
 no if $] >= 5.022, "warnings", "locale";
 use lib "t/lib";
+
+# TMP for running tests from Emacs
+use lib "lib";
+use lib "../blib/lib";
+use lib "../blib/arch";
+
+use Time::HiRes qw/time/;
 use SQLiteTest;
 use Test::More;
 use if -d ".git", "Test::FailWarnings";
 use DBD::SQLite;
+
 my @texts = ("il était une bergère",
              "qui gardait ses moutons",
              "elle fit un fromage",
@@ -111,12 +119,27 @@ for my $use_unicode (0, 1) {
   my $expected_offsets = "0 0 $offset_une 3";
   is_deeply($result, [$expected_offsets], "offsets ($fts, unicode=$use_unicode)");
 
-  # test a longer sentence
-  $dbh->do("INSERT INTO try_$fts(content) VALUES(?)", {}, join(" ", @texts));
+  # test snippet() on a longer sentence
+  $insert_sth->execute(join " ", @texts);
   $result = $dbh->selectcol_arrayref($sql_snip, undef, '"bergère qui gardait"');
   like($result->[0],
        qr[une <b>bergère</b> <b>qui</b> <b>gardait</b> ses],
        "longer snippet ($fts, unicode=$use_unicode)");
+
+  # simulated large document
+  open my $fh, "<", $INC{'DBD/SQLite.pm'} or die $!;
+  my $source_code = do {local $/; <$fh>};
+  my $long_doc = $source_code x 1;
+
+  my $t0 = time;
+  $insert_sth->execute($long_doc);
+  my $t1 = time;
+  $result = $dbh->selectcol_arrayref($sql_snip, undef, '"package DBD::SQLite"');
+  my $t2 = time;
+
+  note sprintf("long doc (%d chars): insert in %.4f secs, select in %.4f secs",
+               length($long_doc), $t1-$t0, $t2-$t1);
+  like($result->[0], qr[^<b>package</b> <b>DBD</b>::<b>SQLite</b>;], 'snippet "package SQLite"');
 }
 }
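
Note on PATCH 4: the benchmark inserts DBD::SQLite's own source as one big
document, and PATCH 6 attributes part of the slowness to reading match offsets
through the @-/@+ capture-offset arrays on a long string. A self-contained
sketch of that comparison (illustrative; exact timings will vary by perl
version and machine):

    use strict;
    use warnings;
    use Time::HiRes qw(time);

    my $doc = "word " x 200_000;   # simulated large document

    my $t0 = time;
    my $count = 0;
    while ($doc =~ /\w+/g) { my $end = $+[0]; $count++ }   # capture-offset array
    my $t1 = time;
    while ($doc =~ /\w+/g) { my $end = pos $doc }           # pos() instead
    my $t2 = time;

    printf "%d tokens: \$+[0] %.3fs, pos() %.3fs\n", $count, $t1-$t0, $t2-$t1;
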
From 80c7c91ae6531c9466bc1a63abae0ac06fd1e60f Mon Sep 17 00:00:00 2001
From: dami
Date: Tue, 11 May 2021 09:40:24 +0200
Subject: [PATCH 5/7] better variable names and comments

---
 dbdimp_tokenizer.inc | 36 ++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/dbdimp_tokenizer.inc b/dbdimp_tokenizer.inc
index 286be37..49159f2 100644
--- a/dbdimp_tokenizer.inc
+++ b/dbdimp_tokenizer.inc
@@ -12,8 +12,8 @@ typedef struct perl_tokenizer_cursor {
 
   /* members below are only used if the input string is in utf8 */
   const char *pInput;          /* input we are tokenizing */
-  const char *lastByteOffset;  /* offset into pInput */
-  int         lastCharOffset;  /* char offset corresponding to lastByteOffset */
+  const char *currentByte;     /* pointer into pInput */
+  int         currentChar;     /* char corresponding to currentByte */
 } perl_tokenizer_cursor;
 
 /*
@@ -108,9 +108,9 @@ static int perl_tokenizer_Open(
 
   /* special handling if working with utf8 strings */
   if (MY_CXT.last_dbh_is_unicode) {
-    /* data to keep track of byte offsets */
-    c->lastByteOffset = c->pInput = pInput;
-    c->lastCharOffset = 0;
+    /* data to keep track of byte positions */
+    c->currentByte = c->pInput = pInput;
+    c->currentChar = 0;
 
     /* string passed to Perl needs to be flagged as utf8 */
     flags |= SVf_UTF8;
@@ -174,7 +174,7 @@ static int perl_tokenizer_Next(
   int result;
   int n_retval;
   char *token;
-  char *byteOffset;
+  char *nextByte;
   STRLEN n_a; /* this is required for older perls < 5.8.8 */
   I32 hop;
 
@@ -215,19 +215,27 @@ static int perl_tokenizer_Next(
           /* recompute *pnBytes in bytes, not in chars */
           *pnBytes = strlen(token);
 
-          /* recompute start offset in bytes, not in chars */
-          hop        = *piStartOffset - c->lastCharOffset;
-          byteOffset = (char*)utf8_hop((U8*)c->lastByteOffset, hop);
+          /* nb of chars from last position to the start of the token */
+          hop = *piStartOffset - c->currentChar;
+
+          /* advance to the first byte in token */
+          nextByte = (char*)utf8_hop((U8*)c->currentByte, hop);
+
+          /* nb of chars in token */
           hop = *piEndOffset - *piStartOffset;
-          *piStartOffset = byteOffset - c->pInput;
-          byteOffset = (char*)utf8_hop((U8*)byteOffset, hop);
+
+          /* recompute start offset in bytes, not in chars */
+          *piStartOffset = nextByte - c->pInput;
+
+          /* advance past the last byte in token */
+          nextByte = (char*)utf8_hop((U8*)nextByte, hop);
 
           /* remember where we are for next round */
-          c->lastCharOffset = *piEndOffset;
-          c->lastByteOffset = byteOffset;
+          c->currentChar = *piEndOffset;
+          c->currentByte = nextByte;
 
           /* recompute end offset in bytes, not in chars */
-          *piEndOffset = byteOffset - c->pInput;
+          *piEndOffset = nextByte - c->pInput;
 
 #ifdef DEBUG_OFFSETS
           warn("FIX: token: %s, start=%d, end=%d, nBytes=%d\n", token, *piStartOffset, *piEndOffset, *pnBytes);
#endif
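
Note on PATCH 5's two-hop scheme: each call converts the tokenizer's character
offsets into byte offsets with two relative hops from the remembered position,
instead of rescanning from the start of the string. Worked numbers for the
token "bergère" in "il était une bergère" (an illustrative trace, assuming the
previous token was "une"):

    # after "une": currentChar = 12, currentByte = 13 ("é" already cost 1 extra byte)
    # the tokenizer reports "bergère" as chars 13..20
    #
    # hop 1 = 13 - 12 = 1 char   -> advance 1 byte       => *piStartOffset = 14
    # hop 2 = 20 - 13 = 7 chars  -> "è" takes 2 bytes    => *piEndOffset   = 22
    # *pnBytes = strlen("bergère") = 8   (and 14 + 8 == 22)
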
From 6669dbc332a3a7544b6a696a351964d72e224343 Mon Sep 17 00:00:00 2001
From: dami
Date: Thu, 13 May 2021 12:27:57 +0200
Subject: [PATCH 6/7] cleanup tokenizer code and test

---
 dbdimp_tokenizer.inc | 47 +++++++++++++++++---------------------------
 t/43_fts3.t          | 19 +++++-------------
 2 files changed, 23 insertions(+), 43 deletions(-)

diff --git a/dbdimp_tokenizer.inc b/dbdimp_tokenizer.inc
index 49159f2..83ab40a 100644
--- a/dbdimp_tokenizer.inc
+++ b/dbdimp_tokenizer.inc
@@ -13,7 +13,7 @@ typedef struct perl_tokenizer_cursor {
   /* members below are only used if the input string is in utf8 */
   const char *pInput;          /* input we are tokenizing */
   const char *currentByte;     /* pointer into pInput */
-  int         currentChar;     /* char corresponding to currentByte */
+  int         currentChar;     /* char position corresponding to currentByte */
 } perl_tokenizer_cursor;
 
 /*
@@ -134,7 +134,7 @@ static int perl_tokenizer_Open(
 
   /* store the cursor coderef returned by the tokenizer */
   if (n_retval != 1) {
-    warn("tokenizer returned %d arguments", n_retval);
+    warn("tokenizer returned %d arguments, expected 1", n_retval);
   }
   c->coderef = newSVsv(POPs);
 
@@ -164,11 +164,11 @@ static int perl_tokenizer_Close(sqlite3_tokenizer_cursor *pCursor){
  */
 static int perl_tokenizer_Next(
   sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by perl_tokenizer_Open */
-  const char **ppToken,               /* OUT: *ppToken is the token text */
-  int *pnBytes,                       /* OUT: Number of bytes in token */
-  int *piStartOffset,                 /* OUT: Starting offset of token */
-  int *piEndOffset,                   /* OUT: Ending offset of token */
-  int *piPosition                     /* OUT: Position integer of token */
+  const char **ppToken,               /* OUT: Normalized text for token */
+  int *pnBytes,                       /* OUT: Number of bytes in normalized text */
+  int *piStartOffset,                 /* Starting offset of token. IN: char offset; OUT: byte offset */
+  int *piEndOffset,                   /* Ending offset of token.   IN: char offset; OUT: byte offset */
+  int *piPosition                     /* OUT: Number of tokens returned before this one */
 ){
   perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor;
   int result;
@@ -197,7 +197,7 @@ static int perl_tokenizer_Next(
   /* otherwise, get token details from the return list */
   else {
     if (n_retval != 5) {
-      warn("tokenizer cursor returned %d arguments", n_retval);
+      warn("tokenizer cursor returned %d arguments, expected 5", n_retval);
     }
     *piPosition    = POPi;
     *piEndOffset   = POPi;
@@ -205,41 +205,31 @@ static int perl_tokenizer_Next(
     *pnBytes       = POPi;
     token          = POPpx;
 
     if (c->pInput) { /* if working with utf8 data */
-
-#ifdef DEBUG_OFFSETS
-      warn("INI: token: %s, start=%d, end=%d, nBytes=%d\n", token, *piStartOffset, *piEndOffset, *pnBytes);
-#endif
-
-      /* recompute *pnBytes in bytes, not in chars */
-      *pnBytes = strlen(token);
-
-      /* nb of chars from last position to the start of the token */
+      /* compute first hop: nb of chars from last position to the start of the token */
       hop = *piStartOffset - c->currentChar;
 
-      /* advance to the first byte in token */
+      /* first hop: advance to the first byte in token */
       nextByte = (char*)utf8_hop((U8*)c->currentByte, hop);
 
-      /* nb of chars in token */
+      /* compute 2nd hop: nb of chars from start of the token to end of token */
       hop = *piEndOffset - *piStartOffset;
 
-      /* recompute start offset in bytes, not in chars */
+      /* now recompute the start offset in bytes, not in chars */
       *piStartOffset = nextByte - c->pInput;
 
-      /* advance past the last byte in token */
+      /* 2nd hop: advance past the last byte in token */
      nextByte = (char*)utf8_hop((U8*)nextByte, hop);
 
-      /* remember where we are for next round */
+      /* remember current position (useful for the next invocation) */
       c->currentChar = *piEndOffset;
       c->currentByte = nextByte;
 
-      /* recompute end offset in bytes, not in chars */
+      /* now recompute the end offset in bytes, not in chars */
       *piEndOffset = nextByte - c->pInput;
 
-#ifdef DEBUG_OFFSETS
-      warn("FIX: token: %s, start=%d, end=%d, nBytes=%d\n", token, *piStartOffset, *piEndOffset, *pnBytes);
-#endif
+      /* compute the size of the normalized token in bytes, not in chars */
+      *pnBytes = strlen(token);
     }
 
     /* make sure we have enough storage for copying the token */
@@ -251,8 +241,7 @@ static int perl_tokenizer_Next(
     c->pToken = pNew;
   }
 
-  /* need to copy the token into the C cursor before perl frees that
-     memory */
+  /* need to copy the token into the C cursor before perl frees that memory */
   memcpy(c->pToken, token, *pnBytes);
   *ppToken  = c->pToken;
 
diff --git a/t/43_fts3.t b/t/43_fts3.t
index f5fb0b0..c6170b9 100644
--- a/t/43_fts3.t
+++ b/t/43_fts3.t
@@ -1,13 +1,6 @@
 use strict;
 use warnings;
-no if $] >= 5.022, "warnings", "locale";
 use lib "t/lib";
-
-# TMP for running tests from Emacs
-use lib "lib";
-use lib "../blib/lib";
-use lib "../blib/arch";
-
 use Time::HiRes qw/time/;
 use SQLiteTest;
 use Test::More;
@@ -32,13 +25,9 @@ my @tests = (
  ["(il OR elle) AND un*" => 0, 2 ],
 );
 
-
-
 my $ix_une_native = index($texts[0], "une");
 my $ix_une_utf8   = do {use bytes; utf8::upgrade(my $bergere_utf8 = $texts[0]); index($bergere_utf8, "une");};
 
-
-
 BEGIN {
   requires_unicode_support();
 
@@ -59,8 +48,10 @@ sub Unicode_Word_tokenizer { # see also: Search::Tokenizer
     return sub {
       $string =~ /$regex/g or return; # either match, or no more token
 
-      my ($start, $end) = ($-[0], $+[0]);
-      my $term = substr($string, $start, my $len = $end - $start);
+      my $term  = $&;
+      my $end   = pos $string;   # $+[0] is much slower
+      my $len   = length($term);
+      my $start = $end - $len;
 
       return ($term, $len, $start, $end, $term_index++);
     };
@@ -129,7 +120,7 @@ for my $use_unicode (0, 1) {
   # simulated large document
   open my $fh, "<", $INC{'DBD/SQLite.pm'} or die $!;
   my $source_code = do {local $/; <$fh>};
-  my $long_doc = $source_code x 1;
+  my $long_doc = $source_code x 5;
 
   my $t0 = time;
   $insert_sth->execute($long_doc);

From a512146d4749eb65c8e5cb66118aef0f2d629b1a Mon Sep 17 00:00:00 2001
From: dami
Date: Thu, 13 May 2021 19:35:52 +0200
Subject: [PATCH 7/7] fix #74 -- add a dependency from dbdimp.o to the *.inc
 files included into dbdimp.c

---
 Makefile.PL | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Makefile.PL b/Makefile.PL
index 4f28a08..b2297a5 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -394,6 +394,9 @@ WriteMakefile(
         ? '$(O_FILES)'
         : 'SQLite.o dbdimp.o'
     ),
+    depend => {
+        'dbdimp.o' => 'dbdimp_tokenizer.inc dbdimp_virtual_table.inc',
+    },
     clean => {
         FILES => 'SQLite.xsi config.h tv.log *.old',
     },
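
Note on PATCH 7: depend is a documented ExtUtils::MakeMaker attribute; each
key/value pair becomes an extra rule in the generated Makefile, here
"dbdimp.o : dbdimp_tokenizer.inc dbdimp_virtual_table.inc", so touching either
included .inc file forces dbdimp.o to be rebuilt even though only dbdimp.c
appears in the source list. A minimal sketch of the same idea in an unrelated,
hypothetical build:

    use ExtUtils::MakeMaker;
    WriteMakefile(
        NAME   => 'My::XSModule',          # hypothetical module
        depend => {
            # object file => space-separated extra prerequisites
            'MyModule.o' => 'helpers.inc tables.inc',
        },
    );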