typedef struct perl_tokenizer {
    sqlite3_tokenizer base;
    SV *coderef;             /* the perl tokenizer is a coderef that takes
                                a string and returns a cursor coderef */
} perl_tokenizer;

typedef struct perl_tokenizer_cursor {
    sqlite3_tokenizer_cursor base;
    SV *coderef;             /* ref to the closure that returns terms */
    char *pToken;            /* storage for a copy of the last token */
    int nTokenAllocated;     /* space allocated to pToken buffer */

    /* members below are only used if the input string is in utf8 */
    const char *pInput;      /* input we are tokenizing */
    const char *currentByte; /* pointer into pInput */
    int currentChar;         /* char position corresponding to currentByte */
} perl_tokenizer_cursor;
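/*
** For reference, a sketch of the Perl-side protocol implied by the structs
** above (the function name and regex below are hypothetical, not part of
** this file): the tokenizer is a coderef that receives the string to
** tokenize and returns a cursor closure; each call to that closure yields
** one term, and an empty return signals the end of the token stream.
**
**   sub my_tokenizer {
**       my $string = shift;
**       my $index  = 0;
**       return sub {                      # the cursor closure
**           $string =~ /\w+/g or return;  # no more matches => no more tokens
**           my ($start, $end) = ($-[0], $+[0]);
**           my $term = substr($string, $start, $end - $start);
**           return ($term, length($term), $start, $end, $index++);
**       };
**   }
*/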
/*
** Create a new tokenizer instance.
** Called whenever an FTS3 table is created with
**   CREATE .. USING fts3( ... , tokenize=perl qualified::function::name)
** where qualified::function::name is a fully qualified Perl function name.
*/
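/*
** For example (the table, columns and function name here are hypothetical,
** not part of this file), the following DDL would trigger
** perl_tokenizer_Create with argv[0] set to the qualified function name:
**
**   CREATE VIRTUAL TABLE books USING fts3(
**       title, author,
**       tokenize=perl 'main::my_tokenizer'
**   );
*/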
static int perl_tokenizer_Create(
    int argc, const char * const *argv,
    sqlite3_tokenizer **ppTokenizer
){
    dTHX;
    dSP;
    int n_retval;
    SV *retval;
    perl_tokenizer *t;

    if (!argc) {
        return SQLITE_ERROR;
    }

    t = (perl_tokenizer *) sqlite3_malloc(sizeof(*t));
    if( t==NULL ) return SQLITE_NOMEM;
    memset(t, 0, sizeof(*t));

    ENTER;
    SAVETMPS;

    /* call the qualified::function::name */
    PUSHMARK(SP);
    PUTBACK;
    n_retval = call_pv(argv[0], G_SCALAR);
    SPAGAIN;

    /* store a copy of the returned coderef into the tokenizer structure */
    if (n_retval != 1) {
        warn("tokenizer_Create returned %d arguments, expected 1", n_retval);
    }
    retval = POPs;
    t->coderef   = newSVsv(retval);
    *ppTokenizer = &t->base;

    PUTBACK;
    FREETMPS;
    LEAVE;
    return SQLITE_OK;
}
/*
** Destroy a tokenizer
*/
static int perl_tokenizer_Destroy(sqlite3_tokenizer *pTokenizer){
    dTHX;
    perl_tokenizer *t = (perl_tokenizer *) pTokenizer;
    sv_free(t->coderef);
    sqlite3_free(t);
    return SQLITE_OK;
}
/*
** Prepare to begin tokenizing a particular string. The input string to be
** tokenized is pInput[0..nBytes-1]; in practice fts3 passes nBytes == -1,
** so we fall back on strlen(). The string is passed to the tokenizer
** instance, which returns a closure implementing the cursor (so the cursor
** is again a coderef).
*/
static int perl_tokenizer_Open(
    sqlite3_tokenizer *pTokenizer,       /* Tokenizer object */
    const char *pInput, int nBytes,      /* Input buffer */
    sqlite3_tokenizer_cursor **ppCursor  /* OUT: Created tokenizer cursor */
){
    dTHX;
    dSP;
    dMY_CXT;
    SV *perl_string;
    int n_retval;
    perl_tokenizer *t = (perl_tokenizer *) pTokenizer;
    perl_tokenizer_cursor *c;

    /* build a Perl copy of the input string */
    if (nBytes < 0) { /* we get -1 from fts3; fall back on strlen() */
        nBytes = strlen(pInput);
    }

    /* SVs_TEMP will call sv_2mortal */
    perl_string = newSVpvn_flags(pInput, nBytes, SVs_TEMP);

    /* decode the string according to the string mode of the last dbh */
    switch (MY_CXT.last_dbh_string_mode) {
        case DBD_SQLITE_STRING_MODE_UNICODE_NAIVE:
            DBD_SQLITE_UTF8_DECODE_NAIVE(perl_string);
            break;
        case DBD_SQLITE_STRING_MODE_UNICODE_FALLBACK:
        case DBD_SQLITE_STRING_MODE_UNICODE_STRICT:
            DBD_SQLITE_UTF8_DECODE_WITH_FALLBACK(perl_string);
            break;
        default:
            break;
    }

    /* allocate and initialize the cursor struct */
    c = (perl_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
    if( c==NULL ) return SQLITE_NOMEM;
    memset(c, 0, sizeof(*c));
    *ppCursor = &c->base;

    /* special handling if working with utf8 strings */
    if (MY_CXT.last_dbh_string_mode & DBD_SQLITE_STRING_MODE_UNICODE_ANY) {
        /* data to keep track of byte positions */
        c->currentByte = c->pInput = pInput;
        c->currentChar = 0;
    }

    ENTER;
    SAVETMPS;

    /* call the tokenizer coderef */
    PUSHMARK(SP);
    XPUSHs(perl_string);
    PUTBACK;
    n_retval = call_sv(t->coderef, G_SCALAR);
    SPAGAIN;

    /* store the cursor coderef returned by the tokenizer */
    if (n_retval != 1) {
        warn("tokenizer returned %d arguments, expected 1", n_retval);
    }
    c->coderef = newSVsv(POPs);

    PUTBACK;
    FREETMPS;
    LEAVE;
    return SQLITE_OK;
}
/*
** Close a tokenization cursor previously opened by a call to
** perl_tokenizer_Open() above.
*/
static int perl_tokenizer_Close(sqlite3_tokenizer_cursor *pCursor){
    perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor;
    dTHX;

    sv_free(c->coderef);
    if (c->pToken) sqlite3_free(c->pToken);
    sqlite3_free(c);
    return SQLITE_OK;
}
/*
** Extract the next token from a tokenization cursor. The cursor must
** have been opened by a prior call to perl_tokenizer_Open().
*/
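/*
** The cursor closure is expected to return a 5-element list,
**   ($term, $termLength, $startOffset, $endOffset, $tokenIndex),
** which is popped below in reverse order. Under the unicode string modes
** the offsets received from Perl are character offsets; they are converted
** to byte offsets before being handed back to fts3.
*/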
static int perl_tokenizer_Next(
    sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by perl_tokenizer_Open */
    const char **ppToken,               /* OUT: Normalized text for token */
    int *pnBytes,                       /* OUT: Number of bytes in normalized text */
    int *piStartOffset,                 /* Starting offset of token. IN: char offset; OUT: byte offset */
    int *piEndOffset,                   /* Ending offset of token. IN: char offset; OUT: byte offset */
    int *piPosition                     /* OUT: Number of tokens returned before this one */
){
    perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor;
    int result;
    int n_retval;
    char *token;
    char *nextByte;
    STRLEN n_a; /* required by POPpx on older perls (< 5.8.8) */
    I32 hop;

    dTHX;
    dSP;

    ENTER;
    SAVETMPS;

    /* call the cursor */
    PUSHMARK(SP);
    PUTBACK;
    n_retval = call_sv(c->coderef, G_ARRAY);
    SPAGAIN;

    /* if we get back an empty list, there are no more tokens */
    if (n_retval == 0) {
        result = SQLITE_DONE;
    }
    /* otherwise, get token details from the returned list */
    else {
        if (n_retval != 5) {
            warn("tokenizer cursor returned %d arguments, expected 5", n_retval);
        }
        *piPosition    = POPi;
        *piEndOffset   = POPi;
        *piStartOffset = POPi;
        *pnBytes       = POPi;
        token          = POPpx;

        if (c->pInput) { /* if working with utf8 data */
            /* 1st hop: number of chars from the last position to the start of the token */
            hop      = *piStartOffset - c->currentChar;

            /* advance to the first byte of the token */
            nextByte = (char*)utf8_hop((U8*)c->currentByte, hop);

            /* 2nd hop: number of chars from the start of the token to its end */
            hop      = *piEndOffset - *piStartOffset;

            /* recompute the start offset in bytes, not in chars */
            *piStartOffset = nextByte - c->pInput;

            /* advance past the last byte of the token */
            nextByte = (char*)utf8_hop((U8*)nextByte, hop);

            /* remember the current position (useful for the next invocation) */
            c->currentChar = *piEndOffset;
            c->currentByte = nextByte;

            /* recompute the end offset in bytes, not in chars */
            *piEndOffset = nextByte - c->pInput;

            /* size of the normalized token in bytes, not in chars */
            *pnBytes = strlen(token);
        }

        /* make sure we have enough storage for copying the token */
        if (*pnBytes > c->nTokenAllocated) {
            char *pNew;
            c->nTokenAllocated = *pnBytes + 20;
            pNew = sqlite3_realloc(c->pToken, c->nTokenAllocated);
            if( !pNew ) return SQLITE_NOMEM;
            c->pToken = pNew;
        }

        /* copy the token into the C cursor before perl frees that memory */
        memcpy(c->pToken, token, *pnBytes);
        *ppToken = c->pToken;
        result = SQLITE_OK;
    }

    PUTBACK;
    FREETMPS;
    LEAVE;
    return result;
}
/*
** The set of routines that implement the perl tokenizer
*/
sqlite3_tokenizer_module perl_tokenizer_Module = {
    0,
    perl_tokenizer_Create,
    perl_tokenizer_Destroy,
    perl_tokenizer_Open,
    perl_tokenizer_Close,
    perl_tokenizer_Next
};
/*
** Register the perl tokenizer with FTS3
*/
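/*
** FTS3 looks up custom tokenizers through the two-argument SQL function
** fts3_tokenizer(<name>, <pointer>): the second argument is the address of
** the sqlite3_tokenizer_module struct, passed as a blob. That is what the
** prepared statement below does for the "perl" tokenizer module.
*/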
int sqlite_db_register_fts3_perl_tokenizer(pTHX_ SV *dbh)
{
    D_imp_dbh(dbh);
    int rc;
    sqlite3_stmt *pStmt;
    const char zSql[] = "SELECT fts3_tokenizer(?, ?)";
    sqlite3_tokenizer_module *p = &perl_tokenizer_Module;

    if (!DBIc_ACTIVE(imp_dbh)) {
        sqlite_error(dbh, -2, "attempt to register fts3 tokenizer on inactive database handle");
        return FALSE;
    }
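
    /* Recent SQLite versions disable the two-argument form of fts3_tokenizer()
    ** by default (registering a tokenizer through a raw pointer is considered
    ** unsafe); SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER, available since SQLite
    ** 3.12.0, re-enables it for this connection before we register ours. */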
#if SQLITE_VERSION_NUMBER >= 3012000
    rc = sqlite3_db_config(imp_dbh->db, SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER, 1, 0);
    if( rc!=SQLITE_OK ){
        return rc;
    }
#endif

    rc = sqlite3_prepare_v2(imp_dbh->db, zSql, -1, &pStmt, 0);
    if( rc!=SQLITE_OK ){
        return rc;
    }

    sqlite3_bind_text(pStmt, 1, "perl", -1, SQLITE_STATIC);
    sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
    sqlite3_step(pStmt);
    return sqlite3_finalize(pStmt);
}