better variable names and comments

2025-06-07 14:19:10 -04:00 · 2021-05-11 09:40:24 +02:00 · 2021-05-11 09:40:24 +02:00 · 80c7c91ae6
commit 80c7c91ae6
parent f68f79415f
1 changed files with 22 additions and 14 deletions
--- a/dbdimp_tokenizer.inc
+++ b/dbdimp_tokenizer.inc
@@ -12,8 +12,8 @@ typedef struct perl_tokenizer_cursor {

    /* members below are only used if the input string is in utf8 */
    const char *pInput;          /* input we are tokenizing */
-    const char *lastByteOffset;  /* offset into pInput */
-    int lastCharOffset;          /* char offset corresponding to lastByteOffset */
+    const char *currentByte;     /* pointer into pInput */
+    int currentChar;             /* char offset corresponding to currentByte */
 } perl_tokenizer_cursor;

 /*
@@ -108,9 +108,9 @@ static int perl_tokenizer_Open(
    /* special handling if working with utf8 strings */
    if (MY_CXT.last_dbh_is_unicode) {

-        /* data to keep track of byte offsets */
-        c->lastByteOffset = c->pInput = pInput;
-        c->lastCharOffset = 0;
+        /* data to keep track of byte positions */
+        c->currentByte = c->pInput = pInput;
+        c->currentChar = 0;

        /* string passed to Perl needs to be flagged as utf8 */
        flags |= SVf_UTF8;
@@ -174,7 +174,7 @@ static int perl_tokenizer_Next(
    int result;
    int n_retval;
    char *token;
-    char *byteOffset;
+    char *nextByte;
    STRLEN n_a; /* this is required for older perls < 5.8.8 */
    I32 hop;

@@ -215,19 +215,27 @@ static int perl_tokenizer_Next(
            /* recompute *pnBytes in bytes, not in chars */
            *pnBytes = strlen(token);

-            /* recompute start offset in bytes, not in chars */
-            hop               = *piStartOffset - c->lastCharOffset;
-            byteOffset        = (char*)utf8_hop((U8*)c->lastByteOffset, hop);
+            /* number of chars from the last position to the start of the token */
+            hop               = *piStartOffset - c->currentChar;
+
+            /* advance to the first byte in token */
+            nextByte          = (char*)utf8_hop((U8*)c->currentByte, hop);
+
+            /* number of chars in the token */
            hop               = *piEndOffset - *piStartOffset;
-            *piStartOffset    = byteOffset - c->pInput;
-            byteOffset        = (char*)utf8_hop((U8*)byteOffset, hop);
+
+            /* recompute start offset in bytes, not in chars */
+            *piStartOffset    = nextByte - c->pInput;
+
+            /* advance past the last byte in token */
+            nextByte          = (char*)utf8_hop((U8*)nextByte, hop);

            /* remember where we are for next round */
-            c->lastCharOffset = *piEndOffset;
-            c->lastByteOffset = byteOffset;
+            c->currentChar    = *piEndOffset;
+            c->currentByte    = nextByte;

            /* recompute end offset in bytes, not in chars */
-            *piEndOffset      = byteOffset - c->pInput;
+            *piEndOffset      = nextByte - c->pInput;

 #ifdef DEBUG_OFFSETS
            warn("FIX: token: %s, start=%d, end=%d, nBytes=%d\n", token, *piStartOffset, *piEndOffset, *pnBytes);