diff --git a/dbdimp_tokenizer.inc b/dbdimp_tokenizer.inc index 286be37..49159f2 100644 --- a/dbdimp_tokenizer.inc +++ b/dbdimp_tokenizer.inc @@ -12,8 +12,8 @@ typedef struct perl_tokenizer_cursor { /* members below are only used if the input string is in utf8 */ const char *pInput; /* input we are tokenizing */ - const char *lastByteOffset; /* offset into pInput */ - int lastCharOffset; /* char offset corresponding to lastByteOffset */ + const char *currentByte; /* pointer into pInput */ + int currentChar; /* char offset corresponding to currentByte */ } perl_tokenizer_cursor; /* @@ -108,9 +108,9 @@ static int perl_tokenizer_Open( /* special handling if working with utf8 strings */ if (MY_CXT.last_dbh_is_unicode) { - /* data to keep track of byte offsets */ - c->lastByteOffset = c->pInput = pInput; - c->lastCharOffset = 0; + /* data to keep track of byte positions */ + c->currentByte = c->pInput = pInput; + c->currentChar = 0; /* string passed to Perl needs to be flagged as utf8 */ flags |= SVf_UTF8; @@ -174,7 +174,7 @@ static int perl_tokenizer_Next( int result; int n_retval; char *token; - char *byteOffset; + char *nextByte; STRLEN n_a; /* this is required for older perls < 5.8.8 */ I32 hop; @@ -215,19 +215,27 @@ static int perl_tokenizer_Next( /* recompute *pnBytes in bytes, not in chars */ *pnBytes = strlen(token); - /* recompute start offset in bytes, not in chars */ - hop = *piStartOffset - c->lastCharOffset; - byteOffset = (char*)utf8_hop((U8*)c->lastByteOffset, hop); + /* nb of chars from last position to the start of the token */ + hop = *piStartOffset - c->currentChar; + + /* advance to the first byte in token */ + nextByte = (char*)utf8_hop((U8*)c->currentByte, hop); + + /* nb of chars in token */ hop = *piEndOffset - *piStartOffset; - *piStartOffset = byteOffset - c->pInput; - byteOffset = (char*)utf8_hop((U8*)byteOffset, hop); + + /* recompute start offset in bytes, not in chars */ + *piStartOffset = nextByte - c->pInput; + + /* advance past the last byte 
in token */ + nextByte = (char*)utf8_hop((U8*)nextByte, hop); /* remember where we are for next round */ - c->lastCharOffset = *piEndOffset; - c->lastByteOffset = byteOffset; + c->currentChar = *piEndOffset; + c->currentByte = nextByte; /* recompute end offset in bytes, not in chars */ - *piEndOffset = byteOffset - c->pInput; + *piEndOffset = nextByte - c->pInput; #ifdef DEBUG_OFFSETS warn("FIX: token: %s, start=%d, end=%d, nBytes=%d\n", token, *piStartOffset, *piEndOffset, *pnBytes);