typedef struct perl_tokenizer {
    sqlite3_tokenizer base;
    SV *coderef;             /* the perl tokenizer is a coderef that takes
                                a string and returns a cursor coderef */
} perl_tokenizer;

typedef struct perl_tokenizer_cursor {
    sqlite3_tokenizer_cursor base;
    SV *coderef;             /* ref to the closure that returns terms */
    char *pToken;            /* storage for a copy of the last token */
    int nTokenAllocated;     /* space allocated to pToken buffer */

    /* members below are only used if the input string is in utf8 */
    const char *pInput;      /* input we are tokenizing */
    const char *currentByte; /* pointer into pInput */
    int currentChar;         /* char position corresponding to currentByte */
} perl_tokenizer_cursor;
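/*
** For reference, a sketch of the Perl-side protocol implied by the structs
** above (the function name and regex below are hypothetical, not part of
** this file): the tokenizer is a coderef that receives the string to
** tokenize and returns a cursor closure; each call to that closure yields
** one term, and an empty return signals the end of the token stream.
**
**   sub my_tokenizer {
**       my $string = shift;
**       my $index  = 0;
**       return sub {                      # the cursor closure
**           $string =~ /\w+/g or return;  # no more matches => no more tokens
**           my ($start, $end) = ($-[0], $+[0]);
**           my $term = substr($string, $start, $end - $start);
**           return ($term, length($term), $start, $end, $index++);
**       };
**   }
*/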
/*
** Create a new tokenizer instance.
** Called whenever an FTS3 table is created with
**   CREATE .. USING fts3( ... , tokenize=perl qualified::function::name)
** where qualified::function::name is a fully qualified Perl function name.
*/
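/*
** For example (the table, columns and function name here are hypothetical,
** not part of this file), the following DDL would trigger
** perl_tokenizer_Create with argv[0] set to the qualified function name:
**
**   CREATE VIRTUAL TABLE books USING fts3(
**       title, author,
**       tokenize=perl 'main::my_tokenizer'
**   );
*/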
static int perl_tokenizer_Create(
    int argc, const char * const *argv,
    sqlite3_tokenizer **ppTokenizer
){
    dTHX;
    dSP;
    int n_retval;
    SV *retval;
    perl_tokenizer *t;

    if (!argc) {
        return SQLITE_ERROR;
    }

    t = (perl_tokenizer *) sqlite3_malloc(sizeof(*t));
    if( t==NULL ) return SQLITE_NOMEM;
    memset(t, 0, sizeof(*t));

    ENTER;
    SAVETMPS;

    /* call the qualified::function::name */
    PUSHMARK(SP);
    PUTBACK;
    n_retval = call_pv(argv[0], G_SCALAR);
    SPAGAIN;

    /* store a copy of the returned coderef into the tokenizer structure */
    if (n_retval != 1) {
        warn("tokenizer_Create returned %d arguments, expected 1", n_retval);
    }
    retval = POPs;
    t->coderef   = newSVsv(retval);
    *ppTokenizer = &t->base;

    PUTBACK;
    FREETMPS;
    LEAVE;
    return SQLITE_OK;
}
/*
** Destroy a tokenizer
*/
static int perl_tokenizer_Destroy(sqlite3_tokenizer *pTokenizer){
    dTHX;
    perl_tokenizer *t = (perl_tokenizer *) pTokenizer;
    sv_free(t->coderef);
    sqlite3_free(t);
    return SQLITE_OK;
}
/*
** Prepare to begin tokenizing a particular string. The input string to be
** tokenized is pInput[0..nBytes-1]; in practice fts3 passes nBytes == -1,
** so we fall back on strlen(). The string is passed to the tokenizer
** instance, which returns a closure implementing the cursor (so the cursor
** is again a coderef).
*/
static int perl_tokenizer_Open(
    sqlite3_tokenizer *pTokenizer,       /* Tokenizer object */
    const char *pInput, int nBytes,      /* Input buffer */
    sqlite3_tokenizer_cursor **ppCursor  /* OUT: Created tokenizer cursor */
){
    dTHX;
    dSP;
    dMY_CXT;
    SV *perl_string;
    int n_retval;
    perl_tokenizer *t = (perl_tokenizer *) pTokenizer;
    perl_tokenizer_cursor *c;

    /* build a Perl copy of the input string */
    if (nBytes < 0) { /* we get -1 from fts3; fall back on strlen() */
        nBytes = strlen(pInput);
    }

    /* SVs_TEMP will call sv_2mortal */
    perl_string = newSVpvn_flags(pInput, nBytes, SVs_TEMP);

    /* decode the string according to the string mode of the last dbh */
    switch (MY_CXT.last_dbh_string_mode) {
        case DBD_SQLITE_STRING_MODE_UNICODE_NAIVE:
            DBD_SQLITE_UTF8_DECODE_NAIVE(perl_string);
            break;
        case DBD_SQLITE_STRING_MODE_UNICODE_FALLBACK:
        case DBD_SQLITE_STRING_MODE_UNICODE_STRICT:
            DBD_SQLITE_UTF8_DECODE_WITH_FALLBACK(perl_string);
            break;
        default:
            break;
    }

    /* allocate and initialize the cursor struct */
    c = (perl_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
    if( c==NULL ) return SQLITE_NOMEM;
    memset(c, 0, sizeof(*c));
    *ppCursor = &c->base;

    /* special handling if working with utf8 strings */
    if (MY_CXT.last_dbh_string_mode & DBD_SQLITE_STRING_MODE_UNICODE_ANY) {
        /* data to keep track of byte positions */
        c->currentByte = c->pInput = pInput;
        c->currentChar = 0;
    }

    ENTER;
    SAVETMPS;

    /* call the tokenizer coderef */
    PUSHMARK(SP);
    XPUSHs(perl_string);
    PUTBACK;
    n_retval = call_sv(t->coderef, G_SCALAR);
    SPAGAIN;

    /* store the cursor coderef returned by the tokenizer */
    if (n_retval != 1) {
        warn("tokenizer returned %d arguments, expected 1", n_retval);
    }
    c->coderef = newSVsv(POPs);

    PUTBACK;
    FREETMPS;
    LEAVE;
    return SQLITE_OK;
}
/*
** Close a tokenization cursor previously opened by a call to
** perl_tokenizer_Open() above.
*/
static int perl_tokenizer_Close(sqlite3_tokenizer_cursor *pCursor){
    perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor;
    dTHX;

    sv_free(c->coderef);
    if (c->pToken) sqlite3_free(c->pToken);
    sqlite3_free(c);
    return SQLITE_OK;
}
/*
** Extract the next token from a tokenization cursor. The cursor must
** have been opened by a prior call to perl_tokenizer_Open().
*/
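/*
** The cursor closure is expected to return a 5-element list,
**   ($term, $termLength, $startOffset, $endOffset, $tokenIndex),
** which is popped below in reverse order. Under the unicode string modes
** the offsets received from Perl are character offsets; they are converted
** to byte offsets before being handed back to fts3.
*/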
static int perl_tokenizer_Next(
    sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by perl_tokenizer_Open */
    const char **ppToken,               /* OUT: Normalized text for token */
    int *pnBytes,                       /* OUT: Number of bytes in normalized text */
    int *piStartOffset,                 /* Starting offset of token. IN: char offset; OUT: byte offset */
    int *piEndOffset,                   /* Ending offset of token. IN: char offset; OUT: byte offset */
    int *piPosition                     /* OUT: Number of tokens returned before this one */
){
    perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor;
    int result;
    int n_retval;
    char *token;
    char *nextByte;
    STRLEN n_a; /* required by POPpx on older perls (< 5.8.8) */
    I32 hop;

    dTHX;
    dSP;

    ENTER;
    SAVETMPS;

    /* call the cursor */
    PUSHMARK(SP);
    PUTBACK;
    n_retval = call_sv(c->coderef, G_ARRAY);
    SPAGAIN;

    /* if we get back an empty list, there are no more tokens */
    if (n_retval == 0) {
        result = SQLITE_DONE;
    }
    /* otherwise, get token details from the returned list */
    else {
        if (n_retval != 5) {
            warn("tokenizer cursor returned %d arguments, expected 5", n_retval);
        }
        *piPosition    = POPi;
        *piEndOffset   = POPi;
        *piStartOffset = POPi;
        *pnBytes       = POPi;
        token          = POPpx;

        if (c->pInput) { /* if working with utf8 data */
            /* 1st hop: number of chars from the last position to the start of the token */
            hop      = *piStartOffset - c->currentChar;

            /* advance to the first byte of the token */
            nextByte = (char*)utf8_hop((U8*)c->currentByte, hop);

            /* 2nd hop: number of chars from the start of the token to its end */
            hop      = *piEndOffset - *piStartOffset;

            /* recompute the start offset in bytes, not in chars */
            *piStartOffset = nextByte - c->pInput;

            /* advance past the last byte of the token */
            nextByte = (char*)utf8_hop((U8*)nextByte, hop);

            /* remember the current position (useful for the next invocation) */
            c->currentChar = *piEndOffset;
            c->currentByte = nextByte;

            /* recompute the end offset in bytes, not in chars */
            *piEndOffset = nextByte - c->pInput;

            /* size of the normalized token in bytes, not in chars */
            *pnBytes = strlen(token);
        }

        /* make sure we have enough storage for copying the token */
        if (*pnBytes > c->nTokenAllocated) {
            char *pNew;
            c->nTokenAllocated = *pnBytes + 20;
            pNew = sqlite3_realloc(c->pToken, c->nTokenAllocated);
            if( !pNew ) return SQLITE_NOMEM;
            c->pToken = pNew;
        }

        /* copy the token into the C cursor before perl frees that memory */
        memcpy(c->pToken, token, *pnBytes);
        *ppToken = c->pToken;
        result = SQLITE_OK;
    }

    PUTBACK;
    FREETMPS;
    LEAVE;
    return result;
}
/*
** The set of routines that implement the perl tokenizer
*/
sqlite3_tokenizer_module perl_tokenizer_Module = {
    0,
    perl_tokenizer_Create,
    perl_tokenizer_Destroy,
    perl_tokenizer_Open,
    perl_tokenizer_Close,
    perl_tokenizer_Next
};
/*
** Register the perl tokenizer with FTS3
*/
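/*
** FTS3 looks up custom tokenizers through the two-argument SQL function
** fts3_tokenizer(<name>, <pointer>): the second argument is the address of
** the sqlite3_tokenizer_module struct, passed as a blob. That is what the
** prepared statement below does for the "perl" tokenizer module.
*/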
int sqlite_db_register_fts3_perl_tokenizer(pTHX_ SV *dbh)
{
    D_imp_dbh(dbh);
    int rc;
    sqlite3_stmt *pStmt;
    const char zSql[] = "SELECT fts3_tokenizer(?, ?)";
    sqlite3_tokenizer_module *p = &perl_tokenizer_Module;

    if (!DBIc_ACTIVE(imp_dbh)) {
        sqlite_error(dbh, -2, "attempt to register fts3 tokenizer on inactive database handle");
        return FALSE;
    }
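
    /* Recent SQLite versions disable the two-argument form of fts3_tokenizer()
    ** by default (registering a tokenizer through a raw pointer is considered
    ** unsafe); SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER, available since SQLite
    ** 3.12.0, re-enables it for this connection before we register ours. */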
#if SQLITE_VERSION_NUMBER >= 3012000
    rc = sqlite3_db_config(imp_dbh->db, SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER, 1, 0);
    if( rc!=SQLITE_OK ){
        return rc;
    }
#endif

    rc = sqlite3_prepare_v2(imp_dbh->db, zSql, -1, &pStmt, 0);
    if( rc!=SQLITE_OK ){
        return rc;
    }

    sqlite3_bind_text(pStmt, 1, "perl", -1, SQLITE_STATIC);
    sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
    sqlite3_step(pStmt);
    return sqlite3_finalize(pStmt);
}