diff --git a/dbdimp_tokenizer.inc b/dbdimp_tokenizer.inc index d48409a..286be37 100644 --- a/dbdimp_tokenizer.inc +++ b/dbdimp_tokenizer.inc @@ -205,22 +205,33 @@ static int perl_tokenizer_Next( *pnBytes = POPi; token = POPpx; + if (c->pInput) { /* if working with utf8 data */ +#ifdef DEBUG_OFFSETS + warn("INI: token: %s, start=%d, end=%d, nBytes=%d\n", token, *piStartOffset, *piEndOffset, *pnBytes); +#endif + /* recompute *pnBytes in bytes, not in chars */ *pnBytes = strlen(token); - /* recompute start/end offsets in bytes, not in chars */ - hop = *piStartOffset - c->lastCharOffset; - byteOffset = (char*)utf8_hop((U8*)c->lastByteOffset, hop); - hop = *piEndOffset - *piStartOffset; - *piStartOffset = byteOffset - c->pInput; - byteOffset = (char*)utf8_hop((U8*)byteOffset, hop); - *piEndOffset = byteOffset - c->pInput; + /* recompute start offset in bytes, not in chars */ + hop = *piStartOffset - c->lastCharOffset; + byteOffset = (char*)utf8_hop((U8*)c->lastByteOffset, hop); + hop = *piEndOffset - *piStartOffset; + *piStartOffset = byteOffset - c->pInput; + byteOffset = (char*)utf8_hop((U8*)byteOffset, hop); /* remember where we are for next round */ - c->lastCharOffset = *piEndOffset, + c->lastCharOffset = *piEndOffset; c->lastByteOffset = byteOffset; + + /* recompute end offset in bytes, not in chars */ + *piEndOffset = byteOffset - c->pInput; + +#ifdef DEBUG_OFFSETS + warn("FIX: token: %s, start=%d, end=%d, nBytes=%d\n", token, *piStartOffset, *piEndOffset, *pnBytes); +#endif } /* make sure we have enough storage for copying the token */ diff --git a/t/43_fts3.t b/t/43_fts3.t index d8dbec2..65ff0b4 100644 --- a/t/43_fts3.t +++ b/t/43_fts3.t @@ -42,25 +42,11 @@ BEGIN { } } -# Perl may spit a warning on locale -# use Test::NoWarnings; -BEGIN { - # Sadly perl for windows (and probably sqlite, too) may hang - # if the system locale doesn't support european languages. - # en-us should be a safe default. if it doesn't work, use 'C'. - if ( $^O eq 'MSWin32') { - use POSIX 'locale_h'; - setlocale(LC_COLLATE, 'en-us'); - } -} - -use locale; - -sub locale_tokenizer { # see also: Search::Tokenizer +sub Unicode_Word_tokenizer { # see also: Search::Tokenizer return sub { - my $string = shift; - my $regex = qr/\w+/; + my $string = shift; + my $regex = qr/\p{Word}+/; my $term_index = 0; return sub { @@ -85,7 +71,7 @@ for my $use_unicode (0, 1) { # create fts table $dbh->do(<<"") or die DBI::errstr; CREATE VIRTUAL TABLE try_$fts - USING $fts(content, tokenize=perl 'main::locale_tokenizer') + USING $fts(content, tokenize=perl 'main::Unicode_Word_tokenizer') # populate it my $insert_sth = $dbh->prepare(<<"") or die DBI::errstr;