Mirror of https://github.com/DBD-SQLite/DBD-SQLite (synced 2025-06-07 14:19:10 -04:00)
fixed #75 -- lastCharOffset must copy the OLD value of piEndOffset, BEFORE it is recomputed.
Also fixed 43_fts3.t so that it uses \p{Word} instead of \w, because the en-us locale did not handle accented characters.
parent be9f64b2c9
commit b5c3f9d528

2 changed files with 23 additions and 26 deletions
@@ -205,22 +205,33 @@ static int perl_tokenizer_Next(
   *pnBytes = POPi;
   token = POPpx;
 
   if (c->pInput) { /* if working with utf8 data */
 
+#ifdef DEBUG_OFFSETS
+    warn("INI: token: %s, start=%d, end=%d, nBytes=%d\n", token, *piStartOffset, *piEndOffset, *pnBytes);
+#endif
+
     /* recompute *pnBytes in bytes, not in chars */
     *pnBytes = strlen(token);
 
-    /* recompute start/end offsets in bytes, not in chars */
+    /* recompute start offset in bytes, not in chars */
     hop        = *piStartOffset - c->lastCharOffset;
     byteOffset = (char*)utf8_hop((U8*)c->lastByteOffset, hop);
     hop        = *piEndOffset - *piStartOffset;
     *piStartOffset = byteOffset - c->pInput;
     byteOffset = (char*)utf8_hop((U8*)byteOffset, hop);
-    *piEndOffset = byteOffset - c->pInput;
 
     /* remember where we are for next round */
-    c->lastCharOffset = *piEndOffset,
+    c->lastCharOffset = *piEndOffset;
     c->lastByteOffset = byteOffset;
+
+    /* recompute end offset in bytes, not in chars */
+    *piEndOffset = byteOffset - c->pInput;
+
+#ifdef DEBUG_OFFSETS
+    warn("FIX: token: %s, start=%d, end=%d, nBytes=%d\n", token, *piStartOffset, *piEndOffset, *pnBytes);
+#endif
   }
 
   /* make sure we have enough storage for copying the token */
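The ordering matters because SQLite expects the offsets returned by the tokenizer in bytes, while the Perl callback reports them in characters; each call converts by hopping forward from the previous token's end, remembered as a character offset (lastCharOffset) paired with its byte position (lastByteOffset). Before the fix, lastCharOffset was assigned from *piEndOffset after it had already been rewritten as a byte offset, so the next hop mixed characters and bytes on any input containing multi-byte characters. A minimal plain-Perl sketch of the same bookkeeping (not the XS code; the sample string and variable names are made up):

    # Plain-Perl sketch of the char-offset / byte-offset bookkeeping done in
    # perl_tokenizer_Next; variable names and the sample string are hypothetical.
    use strict;
    use warnings;
    use utf8;
    use open qw(:std :encoding(UTF-8));
    use Encode qw(encode_utf8);

    my $text = "héllo wörld";     # decoded character string

    my $last_char_off = 0;        # plays the role of c->lastCharOffset
    my $last_byte_off = 0;        # plays the role of c->lastByteOffset

    while ($text =~ /\p{Word}+/g) {
        my ($char_start, $char_end) = ($-[0], $+[0]);   # offsets in characters

        # hop forward from the previous end; the hop distance is measured in
        # characters, so $last_char_off must really hold a character offset
        my $byte_start = $last_byte_off
            + length(encode_utf8(substr($text, $last_char_off,
                                         $char_start - $last_char_off)));
        my $byte_end   = $byte_start
            + length(encode_utf8(substr($text, $char_start,
                                         $char_end - $char_start)));

        # remember where we are for the next round BEFORE the character
        # offsets are overwritten -- the ordering this commit restores
        ($last_char_off, $last_byte_off) = ($char_end, $byte_end);

        printf "token=%s chars=%d..%d bytes=%d..%d\n",
               substr($text, $char_start, $char_end - $char_start),
               $char_start, $char_end, $byte_start, $byte_end;
    }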
t/43_fts3.t (22 changed lines)
@@ -42,25 +42,11 @@ BEGIN {
 }
 }
 
-# Perl may spit a warning on locale
-# use Test::NoWarnings;
-
-BEGIN {
-  # Sadly perl for windows (and probably sqlite, too) may hang
-  # if the system locale doesn't support european languages.
-  # en-us should be a safe default. if it doesn't work, use 'C'.
-  if ( $^O eq 'MSWin32') {
-    use POSIX 'locale_h';
-    setlocale(LC_COLLATE, 'en-us');
-  }
-}
-
-use locale;
-
-sub locale_tokenizer { # see also: Search::Tokenizer
+sub Unicode_Word_tokenizer { # see also: Search::Tokenizer
   return sub {
     my $string = shift;
-    my $regex = qr/\w+/;
+    my $regex = qr/\p{Word}+/;
     my $term_index = 0;
 
     return sub {
@@ -85,7 +71,7 @@ for my $use_unicode (0, 1) {
   # create fts table
   $dbh->do(<<"") or die DBI::errstr;
     CREATE VIRTUAL TABLE try_$fts
-       USING $fts(content, tokenize=perl 'main::locale_tokenizer')
+       USING $fts(content, tokenize=perl 'main::Unicode_Word_tokenizer')
 
   # populate it
   my $insert_sth = $dbh->prepare(<<"") or die DBI::errstr;
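On the test side, what qr/\w+/ matches under "use locale" depends on the active locale, and the en-us locale previously set for Windows did not treat accented letters as word characters, so tokens were split in the middle; \p{Word} applies Unicode rules regardless of locale. A small illustrative snippet (not part of the test file; the sample text is made up):

    # Illustration of \p{Word} vs locale-dependent \w; the sample text is made up.
    use strict;
    use warnings;
    use utf8;
    use open qw(:std :encoding(UTF-8));

    my $string = "Ça va très bien";

    # \p{Word} uses Unicode word-character semantics regardless of the system
    # locale, so accented letters stay inside their tokens:
    my @tokens = $string =~ /\p{Word}+/g;
    print join('|', @tokens), "\n";    # Ça|va|très|bien

    # Under "use locale" with qr/\w+/ and a locale that knows nothing about
    # 'Ç' or 'è' (like the en-us locale the old test set on Windows), the
    # same text comes out fragmented, roughly: a|va|tr|s|bien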