From 30e4ee67e9558101df54d2bc065c7d024ed8ee32 Mon Sep 17 00:00:00 2001 From: Max Maischein Date: Sat, 23 Apr 2022 11:08:18 +0200 Subject: [PATCH 01/14] Rename "perl_tokenizer" to "perl_fts3_tokenizer" This is because we will soon have two styles of tokenizers, one for the FTS3 / FTS4 API and one for the FTS5 API # Conflicts: # dbdimp_tokenizer.inc --- dbdimp_tokenizer.inc | 52 ++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/dbdimp_tokenizer.inc b/dbdimp_tokenizer.inc index 2258f0b..8f84494 100644 --- a/dbdimp_tokenizer.inc +++ b/dbdimp_tokenizer.inc @@ -1,10 +1,10 @@ -typedef struct perl_tokenizer { +typedef struct perl_fts3_tokenizer { sqlite3_tokenizer base; SV *coderef; /* the perl tokenizer is a coderef that takes a string and returns a cursor coderef */ -} perl_tokenizer; +} perl_fts3_tokenizer; -typedef struct perl_tokenizer_cursor { +typedef struct perl_fts3_tokenizer_cursor { sqlite3_tokenizer_cursor base; SV *coderef; /* ref to the closure that returns terms */ char *pToken; /* storage for a copy of the last token */ @@ -14,7 +14,7 @@ typedef struct perl_tokenizer_cursor { const char *pInput; /* input we are tokenizing */ const char *currentByte; /* pointer into pInput */ int currentChar; /* char position corresponding to currentByte */ -} perl_tokenizer_cursor; +} perl_fts3_tokenizer_cursor; /* ** Create a new tokenizer instance. @@ -22,7 +22,7 @@ typedef struct perl_tokenizer_cursor { ** CREATE .. USING fts3( ... 
, tokenize=perl qualified::function::name) ** where qualified::function::name is a fully qualified perl function */ -static int perl_tokenizer_Create( +static int perl_fts3_tokenizer_Create( int argc, const char * const *argv, sqlite3_tokenizer **ppTokenizer ){ @@ -30,13 +30,13 @@ static int perl_tokenizer_Create( dSP; int n_retval; SV *retval; - perl_tokenizer *t; + perl_fts3_tokenizer *t; if (!argc) { return SQLITE_ERROR; } - t = (perl_tokenizer *) sqlite3_malloc(sizeof(*t)); + t = (perl_fts3_tokenizer *) sqlite3_malloc(sizeof(*t)); if( t==NULL ) return SQLITE_NOMEM; memset(t, 0, sizeof(*t)); @@ -67,9 +67,9 @@ static int perl_tokenizer_Create( /* ** Destroy a tokenizer */ -static int perl_tokenizer_Destroy(sqlite3_tokenizer *pTokenizer){ +static int perl_fts3_tokenizer_Destroy(sqlite3_tokenizer *pTokenizer){ dTHX; - perl_tokenizer *t = (perl_tokenizer *) pTokenizer; + perl_fts3_tokenizer *t = (perl_fts3_tokenizer *) pTokenizer; sv_free(t->coderef); sqlite3_free(t); return SQLITE_OK; @@ -82,7 +82,7 @@ static int perl_tokenizer_Destroy(sqlite3_tokenizer *pTokenizer){ ** This is passed to the tokenizer instance, which then returns a ** closure implementing the cursor (so the cursor is again a coderef). 
*/ -static int perl_tokenizer_Open( +static int perl_fts3_tokenizer_Open( sqlite3_tokenizer *pTokenizer, /* Tokenizer object */ const char *pInput, int nBytes, /* Input buffer */ sqlite3_tokenizer_cursor **ppCursor /* OUT: Created tokenizer cursor */ @@ -118,11 +118,11 @@ static int perl_tokenizer_Open( DBD_SQLITE_UTF8_DECODE_IF_NEEDED(perl_string, MY_CXT.last_dbh_string_mode); - perl_tokenizer *t = (perl_tokenizer *)pTokenizer; + perl_fts3_tokenizer *t = (perl_fts3_tokenizer *)pTokenizer; /* allocate and initialize the cursor struct */ - perl_tokenizer_cursor *c; - c = (perl_tokenizer_cursor *) sqlite3_malloc(sizeof(*c)); + perl_fts3_tokenizer_cursor *c; + c = (perl_fts3_tokenizer_cursor *) sqlite3_malloc(sizeof(*c)); memset(c, 0, sizeof(*c)); *ppCursor = &c->base; @@ -158,10 +158,10 @@ static int perl_tokenizer_Open( /* ** Close a tokenization cursor previously opened by a call to -** perl_tokenizer_Open() above. +** perl_fts3_tokenizer_Open() above. */ -static int perl_tokenizer_Close(sqlite3_tokenizer_cursor *pCursor){ - perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor; +static int perl_fts3_tokenizer_Close(sqlite3_tokenizer_cursor *pCursor){ + perl_fts3_tokenizer_cursor *c = (perl_fts3_tokenizer_cursor *) pCursor; dTHX; sv_free(c->coderef); @@ -172,9 +172,9 @@ static int perl_tokenizer_Close(sqlite3_tokenizer_cursor *pCursor){ /* ** Extract the next token from a tokenization cursor. The cursor must -** have been opened by a prior call to perl_tokenizer_Open(). +** have been opened by a prior call to perl_fts3_tokenizer_Open(). */ -static int perl_tokenizer_Next( +static int perl_fts3_tokenizer_Next( sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by perl_tokenizer_Open */ const char **ppToken, /* OUT: Normalized text for token */ int *pnBytes, /* OUT: Number of bytes in normalized text */ @@ -182,7 +182,7 @@ static int perl_tokenizer_Next( int *piEndOffset, /* Ending offset of token. 
IN : char offset; OUT : byte offset */ int *piPosition /* OUT: Number of tokens returned before this one */ ){ - perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor; + perl_fts3_tokenizer_cursor *c = (perl_fts3_tokenizer_cursor *) pCursor; int result; int n_retval; char *token; @@ -270,13 +270,13 @@ static int perl_tokenizer_Next( /* ** The set of routines that implement the perl tokenizer */ -sqlite3_tokenizer_module perl_tokenizer_Module = { +sqlite3_tokenizer_module perl_fts3_tokenizer_Module = { 0, - perl_tokenizer_Create, - perl_tokenizer_Destroy, - perl_tokenizer_Open, - perl_tokenizer_Close, - perl_tokenizer_Next + perl_fts3_tokenizer_Create, + perl_fts3_tokenizer_Destroy, + perl_fts3_tokenizer_Open, + perl_fts3_tokenizer_Close, + perl_fts3_tokenizer_Next }; /* @@ -289,7 +289,7 @@ int sqlite_db_register_fts3_perl_tokenizer(pTHX_ SV *dbh) int rc; sqlite3_stmt *pStmt; const char zSql[] = "SELECT fts3_tokenizer(?, ?)"; - sqlite3_tokenizer_module *p = &perl_tokenizer_Module; + sqlite3_tokenizer_module *p = &perl_fts3_tokenizer_Module; if (!DBIc_ACTIVE(imp_dbh)) { sqlite_error(dbh, -2, "attempt to register fts3 tokenizer on inactive database handle"); From 8c076159d2a7032d8c52ab61f3cbb4ed49f79f9e Mon Sep 17 00:00:00 2001 From: Max Maischein Date: Fri, 28 Aug 2020 13:57:10 +0200 Subject: [PATCH 02/14] WIP: Stuff needed for FTS5 --- SQLiteXS.h | 1 + dbdimp_tokenizer.inc | 223 +++++++++++++++++ fts5.h | 575 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 799 insertions(+) create mode 100644 fts5.h diff --git a/SQLiteXS.h b/SQLiteXS.h index 584fb61..53c7588 100644 --- a/SQLiteXS.h +++ b/SQLiteXS.h @@ -20,5 +20,6 @@ #include "sqlite3.h" #include "fts3_tokenizer.h" +#include "fts5.h" #endif diff --git a/dbdimp_tokenizer.inc b/dbdimp_tokenizer.inc index 8f84494..6020ec5 100644 --- a/dbdimp_tokenizer.inc +++ b/dbdimp_tokenizer.inc @@ -16,6 +16,24 @@ typedef struct perl_fts3_tokenizer_cursor { int currentChar; /* char position corresponding to 
currentByte */ } perl_fts3_tokenizer_cursor; +typedef struct perl_fts5_tokenizer { + fts5_tokenizer base; + SV *coderef; /* the perl tokenizer is a coderef that takes + a string and returns a cursor coderef */ +} perl_fts5_tokenizer; + +typedef struct perl_fts5_tokenizer_cursor { + // fts5_tokenizer_cursor base; + SV *coderef; /* ref to the closure that returns terms */ + char *pToken; /* storage for a copy of the last token */ + int nTokenAllocated; /* space allocated to pToken buffer */ + + /* members below are only used if the input string is in utf8 */ + const char *pInput; /* input we are tokenizing */ + const char *lastByteOffset; /* offset into pInput */ + int lastCharOffset; /* char offset corresponding to lastByteOffset */ +} perl_fts5_tokenizer_cursor; + /* ** Create a new tokenizer instance. ** Will be called whenever a FTS3 table is created with @@ -314,3 +332,208 @@ int sqlite_db_register_fts3_perl_tokenizer(pTHX_ SV *dbh) return sqlite3_finalize(pStmt); } + +/* +** Create a new tokenizer instance. +** Will be called whenever a FTS5 table is created with +** CREATE .. USING fts5( ... 
, tokenize=perl qualified::function::name) +** where qualified::function::name is a fully qualified perl function +*/ +static int perl_fts5_tokenizer_Create( + int argc, const char * const *argv, + sqlite3_tokenizer **ppTokenizer +){ + dTHX; + dSP; + int n_retval; + SV *retval; + perl_fts3_tokenizer *t; + + if (!argc) { + return SQLITE_ERROR; + } + + t = (perl_fts3_tokenizer *) sqlite3_malloc(sizeof(*t)); + if( t==NULL ) return SQLITE_NOMEM; + memset(t, 0, sizeof(*t)); + + ENTER; + SAVETMPS; + + /* call the qualified::function::name */ + PUSHMARK(SP); + PUTBACK; + n_retval = call_pv(argv[0], G_SCALAR); + SPAGAIN; + + /* store a copy of the returned coderef into the tokenizer structure */ + if (n_retval != 1) { + warn("tokenizer_Create returned %d arguments", n_retval); + } + retval = POPs; + t->coderef = newSVsv(retval); + *ppTokenizer = &t->base; + + PUTBACK; + FREETMPS; + LEAVE; + + return SQLITE_OK; +} + +/* +** Destroy a tokenizer +*/ +static int perl_fts5_tokenizer_Delete(Fts5Tokenizer *pTokenizer){ + dTHX; + perl_fts5_tokenizer *t = (perl_fts5_tokenizer *) pTokenizer; + sv_free(t->coderef); + sqlite3_free(t); + return SQLITE_OK; +} + +/* +** Extract the next token from a tokenization cursor. The cursor must +** have been opened by a prior call to perl_fts3_tokenizer_Open(). 
+*/ +static int perl_fts5_tokenizer_Tokenize( + sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by perl_fts3_tokenizer_Open */ + const char **ppToken, /* OUT: *ppToken is the token text */ + int *pnBytes, /* OUT: Number of bytes in token */ + int *piStartOffset, /* OUT: Starting offset of token */ + int *piEndOffset, /* OUT: Ending offset of token */ + int *piPosition /* OUT: Position integer of token */ +){ + perl_fts3_tokenizer_cursor *c = (perl_fts3_tokenizer_cursor *) pCursor; + int result; + int n_retval; + char *token; + char *byteOffset; + STRLEN n_a; /* this is required for older perls < 5.8.8 */ + I32 hop; + + dTHX; + dSP; + + ENTER; + SAVETMPS; + + /* call the cursor */ + PUSHMARK(SP); + PUTBACK; + n_retval = call_sv(c->coderef, G_ARRAY); + SPAGAIN; + + /* if we get back an empty list, there is no more token */ + if (n_retval == 0) { + result = SQLITE_DONE; + } + /* otherwise, get token details from the return list */ + else { + if (n_retval != 5) { + warn("tokenizer cursor returned %d arguments", n_retval); + } + *piPosition = POPi; + *piEndOffset = POPi; + *piStartOffset = POPi; + *pnBytes = POPi; + token = POPpx; + + if (c->pInput) { /* if working with utf8 data */ + + /* recompute *pnBytes in bytes, not in chars */ + *pnBytes = strlen(token); + + /* recompute start/end offsets in bytes, not in chars */ + hop = *piStartOffset - c->lastCharOffset; + byteOffset = (char*)utf8_hop((U8*)c->lastByteOffset, hop); + hop = *piEndOffset - *piStartOffset; + *piStartOffset = byteOffset - c->pInput; + byteOffset = (char*)utf8_hop((U8*)byteOffset, hop); + *piEndOffset = byteOffset - c->pInput; + + /* remember where we are for next round */ + c->lastCharOffset = *piEndOffset, + c->lastByteOffset = byteOffset; + } + + /* make sure we have enough storage for copying the token */ + if (*pnBytes > c->nTokenAllocated ){ + char *pNew; + c->nTokenAllocated = *pnBytes + 20; + pNew = sqlite3_realloc(c->pToken, c->nTokenAllocated); + if( !pNew ) return SQLITE_NOMEM; + 
c->pToken = pNew; + } + + /* need to copy the token into the C cursor before perl frees that + memory */ + memcpy(c->pToken, token, *pnBytes); + *ppToken = c->pToken; + + result = SQLITE_OK; + } + + PUTBACK; + FREETMPS; + LEAVE; + + return result; +} + +/* +** The set of routines that implement the perl FTS5 tokenizer +*/ +fts5_tokenizer perl_fts5_tokenizer_Module = { + 0, + perl_fts5_tokenizer_Create, + perl_fts5_tokenizer_Delete, + perl_fts5_tokenizer_Tokenize +}; + +/* +** Fetch the FTS5 API pointers +*/ + +fts5_api* sqlite_fetch_fts5_api(pTHX_ SV *dbh) +{ + D_imp_dbh(dbh); + + int rc; + sqlite3_stmt *pStmt; + const char zSql[] = "SELECT fts5(?)"; + fts5_api *pFts5Api = 0; + + if (!DBIc_ACTIVE(imp_dbh)) { + sqlite_error(dbh, -2, "attempt to register fts5 tokenizer on inactive database handle"); + return FALSE; + } + + rc = sqlite3_prepare_v2(imp_dbh->db, zSql, -1, &pStmt, 0); + if( rc!=SQLITE_OK ){ + return 0; + } + + sqlite3_bind_pointer(pStmt, 1, (void*)&pFts5Api, "fts5_api_ptr", NULL); + sqlite3_step(pStmt); + sqlite3_finalize(pStmt); + + return pFts5Api; +} + +/* +** Register the perl tokenizer with FTS5 +*/ +int sqlite_db_register_fts5_perl_tokenizer(pTHX_ SV *dbh) +{ + D_imp_dbh(dbh); + + int rc; + fts5_api *pFts5Api = sqlite_fetch_fts5_api(aTHX_ dbh); + sqlite3_tokenizer_module *p = &perl_fts5_tokenizer_Module; + + // pFts5Api->xCreateTokenizer(pFts5Api,...); + + + return 0; +} diff --git a/fts5.h b/fts5.h new file mode 100644 index 0000000..081e534 --- /dev/null +++ b/fts5.h @@ -0,0 +1,575 @@ +/* +** 2014 May 31 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. +** May you share freely, never taking more than you give. +** +****************************************************************************** +** +** Interfaces to extend FTS5. 
Using the interfaces defined in this file, +** FTS5 may be extended with: +** +** * custom tokenizers, and +** * custom auxiliary functions. +*/ + + +#ifndef _FTS5_H +#define _FTS5_H + +#include "sqlite3.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/************************************************************************* +** CUSTOM AUXILIARY FUNCTIONS +** +** Virtual table implementations may overload SQL functions by implementing +** the sqlite3_module.xFindFunction() method. +*/ + +typedef struct Fts5ExtensionApi Fts5ExtensionApi; +typedef struct Fts5Context Fts5Context; +typedef struct Fts5PhraseIter Fts5PhraseIter; + +typedef void (*fts5_extension_function)( + const Fts5ExtensionApi *pApi, /* API offered by current FTS version */ + Fts5Context *pFts, /* First arg to pass to pApi functions */ + sqlite3_context *pCtx, /* Context for returning result/error */ + int nVal, /* Number of values in apVal[] array */ + sqlite3_value **apVal /* Array of trailing arguments */ +); + +struct Fts5PhraseIter { + const unsigned char *a; + const unsigned char *b; +}; + +/* +** EXTENSION API FUNCTIONS +** +** xUserData(pFts): +** Return a copy of the context pointer the extension function was +** registered with. +** +** xColumnTotalSize(pFts, iCol, pnToken): +** If parameter iCol is less than zero, set output variable *pnToken +** to the total number of tokens in the FTS5 table. Or, if iCol is +** non-negative but less than the number of columns in the table, return +** the total number of tokens in column iCol, considering all rows in +** the FTS5 table. +** +** If parameter iCol is greater than or equal to the number of columns +** in the table, SQLITE_RANGE is returned. Or, if an error occurs (e.g. +** an OOM condition or IO error), an appropriate SQLite error code is +** returned. +** +** xColumnCount(pFts): +** Return the number of columns in the table. 
+** +** xColumnSize(pFts, iCol, pnToken): +** If parameter iCol is less than zero, set output variable *pnToken +** to the total number of tokens in the current row. Or, if iCol is +** non-negative but less than the number of columns in the table, set +** *pnToken to the number of tokens in column iCol of the current row. +** +** If parameter iCol is greater than or equal to the number of columns +** in the table, SQLITE_RANGE is returned. Or, if an error occurs (e.g. +** an OOM condition or IO error), an appropriate SQLite error code is +** returned. +** +** This function may be quite inefficient if used with an FTS5 table +** created with the "columnsize=0" option. +** +** xColumnText: +** This function attempts to retrieve the text of column iCol of the +** current document. If successful, (*pz) is set to point to a buffer +** containing the text in utf-8 encoding, (*pn) is set to the size in bytes +** (not characters) of the buffer and SQLITE_OK is returned. Otherwise, +** if an error occurs, an SQLite error code is returned and the final values +** of (*pz) and (*pn) are undefined. +** +** xPhraseCount: +** Returns the number of phrases in the current query expression. +** +** xPhraseSize: +** Returns the number of tokens in phrase iPhrase of the query. Phrases +** are numbered starting from zero. +** +** xInstCount: +** Set *pnInst to the total number of occurrences of all phrases within +** the query within the current row. Return SQLITE_OK if successful, or +** an error code (i.e. SQLITE_NOMEM) if an error occurs. +** +** This API can be quite slow if used with an FTS5 table created with the +** "detail=none" or "detail=column" option. If the FTS5 table is created +** with either "detail=none" or "detail=column" and "content=" option +** (i.e. if it is a contentless table), then this API always returns 0. +** +** xInst: +** Query for the details of phrase match iIdx within the current row. 
+** Phrase matches are numbered starting from zero, so the iIdx argument +** should be greater than or equal to zero and smaller than the value +** output by xInstCount(). +** +** Usually, output parameter *piPhrase is set to the phrase number, *piCol +** to the column in which it occurs and *piOff the token offset of the +** first token of the phrase. Returns SQLITE_OK if successful, or an error +** code (i.e. SQLITE_NOMEM) if an error occurs. +** +** This API can be quite slow if used with an FTS5 table created with the +** "detail=none" or "detail=column" option. +** +** xRowid: +** Returns the rowid of the current row. +** +** xTokenize: +** Tokenize text using the tokenizer belonging to the FTS5 table. +** +** xQueryPhrase(pFts5, iPhrase, pUserData, xCallback): +** This API function is used to query the FTS table for phrase iPhrase +** of the current query. Specifically, a query equivalent to: +** +** ... FROM ftstable WHERE ftstable MATCH $p ORDER BY rowid +** +** with $p set to a phrase equivalent to the phrase iPhrase of the +** current query is executed. Any column filter that applies to +** phrase iPhrase of the current query is included in $p. For each +** row visited, the callback function passed as the fourth argument +** is invoked. The context and API objects passed to the callback +** function may be used to access the properties of each matched row. +** Invoking Api.xUserData() returns a copy of the pointer passed as +** the third argument to pUserData. +** +** If the callback function returns any value other than SQLITE_OK, the +** query is abandoned and the xQueryPhrase function returns immediately. +** If the returned value is SQLITE_DONE, xQueryPhrase returns SQLITE_OK. +** Otherwise, the error code is propagated upwards. +** +** If the query runs to completion without incident, SQLITE_OK is returned. +** Or, if some error occurs before the query completes or is aborted by +** the callback, an SQLite error code is returned. 
+** +** +** xSetAuxdata(pFts5, pAux, xDelete) +** +** Save the pointer passed as the second argument as the extension function's +** "auxiliary data". The pointer may then be retrieved by the current or any +** future invocation of the same fts5 extension function made as part of +** the same MATCH query using the xGetAuxdata() API. +** +** Each extension function is allocated a single auxiliary data slot for +** each FTS query (MATCH expression). If the extension function is invoked +** more than once for a single FTS query, then all invocations share a +** single auxiliary data context. +** +** If there is already an auxiliary data pointer when this function is +** invoked, then it is replaced by the new pointer. If an xDelete callback +** was specified along with the original pointer, it is invoked at this +** point. +** +** The xDelete callback, if one is specified, is also invoked on the +** auxiliary data pointer after the FTS5 query has finished. +** +** If an error (e.g. an OOM condition) occurs within this function, +** the auxiliary data is set to NULL and an error code returned. If the +** xDelete parameter was not NULL, it is invoked on the auxiliary data +** pointer before returning. +** +** +** xGetAuxdata(pFts5, bClear) +** +** Returns the current auxiliary data pointer for the fts5 extension +** function. See the xSetAuxdata() method for details. +** +** If the bClear argument is non-zero, then the auxiliary data is cleared +** (set to NULL) before this function returns. In this case the xDelete, +** if any, is not invoked. +** +** +** xRowCount(pFts5, pnRow) +** +** This function is used to retrieve the total number of rows in the table. +** In other words, the same value that would be returned by: +** +** SELECT count(*) FROM ftstable; +** +** xPhraseFirst() +** This function is used, along with type Fts5PhraseIter and the xPhraseNext +** method, to iterate through all instances of a single query phrase within +** the current row. 
This is the same information as is accessible via the +** xInstCount/xInst APIs. While the xInstCount/xInst APIs are more convenient +** to use, this API may be faster under some circumstances. To iterate +** through instances of phrase iPhrase, use the following code: +** +** Fts5PhraseIter iter; +** int iCol, iOff; +** for(pApi->xPhraseFirst(pFts, iPhrase, &iter, &iCol, &iOff); +** iCol>=0; +** pApi->xPhraseNext(pFts, &iter, &iCol, &iOff) +** ){ +** // An instance of phrase iPhrase at offset iOff of column iCol +** } +** +** The Fts5PhraseIter structure is defined above. Applications should not +** modify this structure directly - it should only be used as shown above +** with the xPhraseFirst() and xPhraseNext() API methods (and by +** xPhraseFirstColumn() and xPhraseNextColumn() as illustrated below). +** +** This API can be quite slow if used with an FTS5 table created with the +** "detail=none" or "detail=column" option. If the FTS5 table is created +** with either "detail=none" or "detail=column" and "content=" option +** (i.e. if it is a contentless table), then this API always iterates +** through an empty set (all calls to xPhraseFirst() set iCol to -1). +** +** xPhraseNext() +** See xPhraseFirst above. +** +** xPhraseFirstColumn() +** This function and xPhraseNextColumn() are similar to the xPhraseFirst() +** and xPhraseNext() APIs described above. The difference is that instead +** of iterating through all instances of a phrase in the current row, these +** APIs are used to iterate through the set of columns in the current row +** that contain one or more instances of a specified phrase. For example: +** +** Fts5PhraseIter iter; +** int iCol; +** for(pApi->xPhraseFirstColumn(pFts, iPhrase, &iter, &iCol); +** iCol>=0; +** pApi->xPhraseNextColumn(pFts, &iter, &iCol) +** ){ +** // Column iCol contains at least one instance of phrase iPhrase +** } +** +** This API can be quite slow if used with an FTS5 table created with the +** "detail=none" option. 
If the FTS5 table is created with either +** "detail=none" "content=" option (i.e. if it is a contentless table), +** then this API always iterates through an empty set (all calls to +** xPhraseFirstColumn() set iCol to -1). +** +** The information accessed using this API and its companion +** xPhraseFirstColumn() may also be obtained using xPhraseFirst/xPhraseNext +** (or xInst/xInstCount). The chief advantage of this API is that it is +** significantly more efficient than those alternatives when used with +** "detail=column" tables. +** +** xPhraseNextColumn() +** See xPhraseFirstColumn above. +*/ +struct Fts5ExtensionApi { + int iVersion; /* Currently always set to 3 */ + + void *(*xUserData)(Fts5Context*); + + int (*xColumnCount)(Fts5Context*); + int (*xRowCount)(Fts5Context*, sqlite3_int64 *pnRow); + int (*xColumnTotalSize)(Fts5Context*, int iCol, sqlite3_int64 *pnToken); + + int (*xTokenize)(Fts5Context*, + const char *pText, int nText, /* Text to tokenize */ + void *pCtx, /* Context passed to xToken() */ + int (*xToken)(void*, int, const char*, int, int, int) /* Callback */ + ); + + int (*xPhraseCount)(Fts5Context*); + int (*xPhraseSize)(Fts5Context*, int iPhrase); + + int (*xInstCount)(Fts5Context*, int *pnInst); + int (*xInst)(Fts5Context*, int iIdx, int *piPhrase, int *piCol, int *piOff); + + sqlite3_int64 (*xRowid)(Fts5Context*); + int (*xColumnText)(Fts5Context*, int iCol, const char **pz, int *pn); + int (*xColumnSize)(Fts5Context*, int iCol, int *pnToken); + + int (*xQueryPhrase)(Fts5Context*, int iPhrase, void *pUserData, + int(*)(const Fts5ExtensionApi*,Fts5Context*,void*) + ); + int (*xSetAuxdata)(Fts5Context*, void *pAux, void(*xDelete)(void*)); + void *(*xGetAuxdata)(Fts5Context*, int bClear); + + int (*xPhraseFirst)(Fts5Context*, int iPhrase, Fts5PhraseIter*, int*, int*); + void (*xPhraseNext)(Fts5Context*, Fts5PhraseIter*, int *piCol, int *piOff); + + int (*xPhraseFirstColumn)(Fts5Context*, int iPhrase, Fts5PhraseIter*, int*); + void 
(*xPhraseNextColumn)(Fts5Context*, Fts5PhraseIter*, int *piCol); +}; + +/* +** CUSTOM AUXILIARY FUNCTIONS +*************************************************************************/ + +/************************************************************************* +** CUSTOM TOKENIZERS +** +** Applications may also register custom tokenizer types. A tokenizer +** is registered by providing fts5 with a populated instance of the +** following structure. All structure methods must be defined, setting +** any member of the fts5_tokenizer struct to NULL leads to undefined +** behaviour. The structure methods are expected to function as follows: +** +** xCreate: +** This function is used to allocate and initialize a tokenizer instance. +** A tokenizer instance is required to actually tokenize text. +** +** The first argument passed to this function is a copy of the (void*) +** pointer provided by the application when the fts5_tokenizer object +** was registered with FTS5 (the third argument to xCreateTokenizer()). +** The second and third arguments are an array of nul-terminated strings +** containing the tokenizer arguments, if any, specified following the +** tokenizer name as part of the CREATE VIRTUAL TABLE statement used +** to create the FTS5 table. +** +** The final argument is an output variable. If successful, (*ppOut) +** should be set to point to the new tokenizer handle and SQLITE_OK +** returned. If an error occurs, some value other than SQLITE_OK should +** be returned. In this case, fts5 assumes that the final value of *ppOut +** is undefined. +** +** xDelete: +** This function is invoked to delete a tokenizer handle previously +** allocated using xCreate(). Fts5 guarantees that this function will +** be invoked exactly once for each successful call to xCreate(). +** +** xTokenize: +** This function is expected to tokenize the nText byte string indicated +** by argument pText. pText may or may not be nul-terminated. 
The first +** argument passed to this function is a pointer to an Fts5Tokenizer object +** returned by an earlier call to xCreate(). +** +** The second argument indicates the reason that FTS5 is requesting +** tokenization of the supplied text. This is always one of the following +** four values: +** +** +** +** For each token in the input string, the supplied callback xToken() must +** be invoked. The first argument to it should be a copy of the pointer +** passed as the second argument to xTokenize(). The third and fourth +** arguments are a pointer to a buffer containing the token text, and the +** size of the token in bytes. The 4th and 5th arguments are the byte offsets +** of the first byte of and first byte immediately following the text from +** which the token is derived within the input. +** +** The second argument passed to the xToken() callback ("tflags") should +** normally be set to 0. The exception is if the tokenizer supports +** synonyms. In this case see the discussion below for details. +** +** FTS5 assumes the xToken() callback is invoked for each token in the +** order that they occur within the input text. +** +** If an xToken() callback returns any value other than SQLITE_OK, then +** the tokenization should be abandoned and the xTokenize() method should +** immediately return a copy of the xToken() return value. Or, if the +** input buffer is exhausted, xTokenize() should return SQLITE_OK. Finally, +** if an error occurs with the xTokenize() implementation itself, it +** may abandon the tokenization and return any error code other than +** SQLITE_OK or SQLITE_DONE. +** +** SYNONYM SUPPORT +** +** Custom tokenizers may also support synonyms. Consider a case in which a +** user wishes to query for a phrase such as "first place". Using the +** built-in tokenizers, the FTS5 query 'first + place' will match instances +** of "first place" within the document set, but not alternative forms +** such as "1st place". 
In some applications, it would be better to match +** all instances of "first place" or "1st place" regardless of which form +** the user specified in the MATCH query text. +** +** There are several ways to approach this in FTS5: +** +**
  1. By mapping all synonyms to a single token. In this case, using +** the above example, this means that the tokenizer returns the +** same token for inputs "first" and "1st". Say that token is in +** fact "first", so that when the user inserts the document "I won +** 1st place" entries are added to the index for tokens "i", "won", +** "first" and "place". If the user then queries for '1st + place', +** the tokenizer substitutes "first" for "1st" and the query works +** as expected. +** +**
  2. By querying the index for all synonyms of each query term +** separately. In this case, when tokenizing query text, the +** tokenizer may provide multiple synonyms for a single term +** within the document. FTS5 then queries the index for each +** synonym individually. For example, faced with the query: +** +** +** ... MATCH 'first place' +** +** the tokenizer offers both "1st" and "first" as synonyms for the +** first token in the MATCH query and FTS5 effectively runs a query +** similar to: +** +** +** ... MATCH '(first OR 1st) place' +** +** except that, for the purposes of auxiliary functions, the query +** still appears to contain just two phrases - "(first OR 1st)" +** being treated as a single phrase. +** +**
  3. By adding multiple synonyms for a single term to the FTS index. +** Using this method, when tokenizing document text, the tokenizer +** provides multiple synonyms for each token. So that when a +** document such as "I won first place" is tokenized, entries are +** added to the FTS index for "i", "won", "first", "1st" and +** "place". +** +** This way, even if the tokenizer does not provide synonyms +** when tokenizing query text (it should not - to do so would be +** inefficient), it doesn't matter if the user queries for +** 'first + place' or '1st + place', as there are entries in the +** FTS index corresponding to both forms of the first token. +**
+** +** Whether it is parsing document or query text, any call to xToken that +** specifies a tflags argument with the FTS5_TOKEN_COLOCATED bit +** is considered to supply a synonym for the previous token. For example, +** when parsing the document "I won first place", a tokenizer that supports +** synonyms would call xToken() 5 times, as follows: +** +** +** xToken(pCtx, 0, "i", 1, 0, 1); +** xToken(pCtx, 0, "won", 3, 2, 5); +** xToken(pCtx, 0, "first", 5, 6, 11); +** xToken(pCtx, FTS5_TOKEN_COLOCATED, "1st", 3, 6, 11); +** xToken(pCtx, 0, "place", 5, 12, 17); +** +** +** It is an error to specify the FTS5_TOKEN_COLOCATED flag the first time +** xToken() is called. Multiple synonyms may be specified for a single token +** by making multiple calls to xToken(FTS5_TOKEN_COLOCATED) in sequence. +** There is no limit to the number of synonyms that may be provided for a +** single token. +** +** In many cases, method (1) above is the best approach. It does not add +** extra data to the FTS index or require FTS5 to query for multiple terms, +** so it is efficient in terms of disk space and query speed. However, it +** does not support prefix queries very well. If, as suggested above, the +** token "first" is substituted for "1st" by the tokenizer, then the query: +** +** +** ... MATCH '1s*' +** +** will not match documents that contain the token "1st" (as the tokenizer +** will probably not map "1s" to any prefix of "first"). +** +** For full prefix support, method (3) may be preferred. In this case, +** because the index contains entries for both "first" and "1st", prefix +** queries such as 'fi*' or '1s*' will match correctly. However, because +** extra entries are added to the FTS index, this method uses more space +** within the database. +** +** Method (2) offers a midpoint between (1) and (3). 
Using this method, +** a query such as '1s*' will match documents that contain the literal +** token "1st", but not "first" (assuming the tokenizer is not able to +** provide synonyms for prefixes). However, a non-prefix query like '1st' +** will match against "1st" and "first". This method does not require +** extra disk space, as no extra entries are added to the FTS index. +** On the other hand, it may require more CPU cycles to run MATCH queries, +** as separate queries of the FTS index are required for each synonym. +** +** When using methods (2) or (3), it is important that the tokenizer only +** provide synonyms when tokenizing document text (method (2)) or query +** text (method (3)), not both. Doing so will not cause any errors, but is +** inefficient. +*/ +typedef struct Fts5Tokenizer Fts5Tokenizer; +typedef struct fts5_tokenizer fts5_tokenizer; +struct fts5_tokenizer { + int (*xCreate)(void*, const char **azArg, int nArg, Fts5Tokenizer **ppOut); + void (*xDelete)(Fts5Tokenizer*); + int (*xTokenize)(Fts5Tokenizer*, + void *pCtx, + int flags, /* Mask of FTS5_TOKENIZE_* flags */ + const char *pText, int nText, + int (*xToken)( + void *pCtx, /* Copy of 2nd argument to xTokenize() */ + int tflags, /* Mask of FTS5_TOKEN_* flags */ + const char *pToken, /* Pointer to buffer containing token */ + int nToken, /* Size of token in bytes */ + int iStart, /* Byte offset of token within input text */ + int iEnd /* Byte offset of end of token within input text */ + ) + ); +}; + +/* Flags that may be passed as the third argument to xTokenize() */ +#define FTS5_TOKENIZE_QUERY 0x0001 +#define FTS5_TOKENIZE_PREFIX 0x0002 +#define FTS5_TOKENIZE_DOCUMENT 0x0004 +#define FTS5_TOKENIZE_AUX 0x0008 + +/* Flags that may be passed by the tokenizer implementation back to FTS5 +** as the third argument to the supplied xToken callback. */ +#define FTS5_TOKEN_COLOCATED 0x0001 /* Same position as prev. 
token */ + +/* +** END OF CUSTOM TOKENIZERS +*************************************************************************/ + +/************************************************************************* +** FTS5 EXTENSION REGISTRATION API +*/ +typedef struct fts5_api fts5_api; +struct fts5_api { + int iVersion; /* Currently always set to 2 */ + + /* Create a new tokenizer */ + int (*xCreateTokenizer)( + fts5_api *pApi, + const char *zName, + void *pContext, + fts5_tokenizer *pTokenizer, + void (*xDestroy)(void*) + ); + + /* Find an existing tokenizer */ + int (*xFindTokenizer)( + fts5_api *pApi, + const char *zName, + void **ppContext, + fts5_tokenizer *pTokenizer + ); + + /* Create a new auxiliary function */ + int (*xCreateFunction)( + fts5_api *pApi, + const char *zName, + void *pContext, + fts5_extension_function xFunction, + void (*xDestroy)(void*) + ); +}; + +/* +** END OF REGISTRATION API +*************************************************************************/ + +#ifdef __cplusplus +} /* end of the 'extern "C"' block */ +#endif + +#endif /* _FTS5_H */ From 8c0209e4f471ca4c4f1b383e604f14dc87fb61c8 Mon Sep 17 00:00:00 2001 From: Max Maischein Date: Fri, 28 Aug 2020 13:58:28 +0200 Subject: [PATCH 03/14] Remove trailing whitespace --- lib/DBD/SQLite.pm | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/lib/DBD/SQLite.pm b/lib/DBD/SQLite.pm index 1032e1d..fd14348 100644 --- a/lib/DBD/SQLite.pm +++ b/lib/DBD/SQLite.pm @@ -551,7 +551,7 @@ my @FOREIGN_KEY_INFO_ODBC = ( # Maybe we could add an option so that the user can choose which field # names will be returned; the DBI spec is not very clear about ODBC vs. CLI. my @FOREIGN_KEY_INFO_SQL_CLI = qw( - UK_TABLE_CAT + UK_TABLE_CAT UK_TABLE_SCHEM UK_TABLE_NAME UK_COLUMN_NAME @@ -765,7 +765,7 @@ sub statistics_info { TABLE_CAT => undef, TABLE_SCHEM => $db->{name}, TABLE_NAME => $tbname, - NON_UNIQUE => $row->{unique} ? 0 : 1, + NON_UNIQUE => $row->{unique} ? 
0 : 1, INDEX_QUALIFIER => undef, INDEX_NAME => $row->{name}, TYPE => 'btree', # see https://www.sqlite.org/version3.html esp. "Traditional B-trees are still used for indices" @@ -1334,7 +1334,7 @@ bind values with no explicit type. SQLite supports several placeholder expressions, including C and C<:AAAA>. Consult the L and SQLite documentation for -details. +details. L @@ -1345,7 +1345,7 @@ named) placeholders to avoid confusion. my $sth = $dbh->prepare( 'update TABLE set a=?1 where b=?2 and a IS NOT ?1' ); - $sth->execute(1, 2); + $sth->execute(1, 2); =head2 Pragma @@ -1520,7 +1520,7 @@ As the L doc says, you almost certainly do B need to call L method if you fetch all rows (probably in a loop). However, there are several exceptions to this rule, and rolling-back of an unfinished C statements in a transaction (See L for @@ -1550,7 +1550,7 @@ statements (a C) to a statement handle (via C or C), L only processes the first statement, and discards the rest. -If you need to process multiple statements at a time, set +If you need to process multiple statements at a time, set a C attribute of a database handle to true when you connect to a database, and C method takes care of the rest (since 1.30_01, and without creating DBI's statement @@ -1784,7 +1784,7 @@ keys of temporary tables). undef, $fk_schema, $fk_table); Returns information about foreign key constraints, as specified in -L, but with some limitations : +L, but with some limitations : =over @@ -1849,7 +1849,7 @@ a C command; see L earlier in this manual. $unique_only, $quick); Returns information about a table and it's indexes, as specified in -L, but with some limitations : +L, but with some limitations : =over @@ -2417,7 +2417,7 @@ Virtual tables are explained in L. Sets a new run-time limit for the category, and returns the current limit. If the new value is a negative number (or omitted), the limit is unchanged and just returns the current limit. 
Category ids (SQLITE_LIMIT_LENGTH, -SQLITE_LIMIT_VARIABLE_NUMBER, etc) can be imported from DBD::SQLite::Constants. +SQLITE_LIMIT_VARIABLE_NUMBER, etc) can be imported from DBD::SQLite::Constants. =head2 $dbh->sqlite_get_autocommit() @@ -2703,7 +2703,7 @@ then query which buildings overlap or are contained within a specified region: $minLong, $maxLong, $minLat, $maxLat); my $overlapping = $dbh->selectcol_arrayref($overlap_sql,undef, - $minLong, $maxLong, $minLat, $maxLat); + $minLong, $maxLong, $minLat, $maxLat); For more detail, please see the SQLite R-Tree page (L). Note that custom R-Tree @@ -2723,7 +2723,7 @@ virtual tables. These can have many interesting uses for joining regular DBMS data with some other kind of data within your Perl programs. Bundled with the present distribution are : -=over +=over =item * From 1ad93cacc0a596c9784d5baafa403af473de9d70 Mon Sep 17 00:00:00 2001 From: Max Maischein Date: Sun, 30 Aug 2020 07:59:53 +0200 Subject: [PATCH 04/14] Fix all type errors --- dbdimp_tokenizer.inc | 175 ++++++++++++++++++++++--------------------- lib/DBD/SQLite.pm | 1 + 2 files changed, 89 insertions(+), 87 deletions(-) diff --git a/dbdimp_tokenizer.inc b/dbdimp_tokenizer.inc index 6020ec5..a6ee4bd 100644 --- a/dbdimp_tokenizer.inc +++ b/dbdimp_tokenizer.inc @@ -16,23 +16,14 @@ typedef struct perl_fts3_tokenizer_cursor { int currentChar; /* char position corresponding to currentByte */ } perl_fts3_tokenizer_cursor; -typedef struct perl_fts5_tokenizer { - fts5_tokenizer base; +typedef struct perl_Fts5Tokenizer { + Fts5Tokenizer base; SV *coderef; /* the perl tokenizer is a coderef that takes - a string and returns a cursor coderef */ -} perl_fts5_tokenizer; - -typedef struct perl_fts5_tokenizer_cursor { - // fts5_tokenizer_cursor base; - SV *coderef; /* ref to the closure that returns terms */ - char *pToken; /* storage for a copy of the last token */ - int nTokenAllocated; /* space allocated to pToken buffer */ - - /* members below are only used if the 
input string is in utf8 */ - const char *pInput; /* input we are tokenizing */ - const char *lastByteOffset; /* offset into pInput */ - int lastCharOffset; /* char offset corresponding to lastByteOffset */ -} perl_fts5_tokenizer_cursor; + ** a string and and some parameters and + ** in turn calls the xToken() function + ** passed to it + */ +} perl_Fts5Tokenizer; /* ** Create a new tokenizer instance. @@ -340,20 +331,19 @@ int sqlite_db_register_fts3_perl_tokenizer(pTHX_ SV *dbh) ** where qualified::function::name is a fully qualified perl function */ static int perl_fts5_tokenizer_Create( - int argc, const char * const *argv, - sqlite3_tokenizer **ppTokenizer + void* pCtx, const char **azArg, int nArg, Fts5Tokenizer **ppOut ){ dTHX; dSP; int n_retval; SV *retval; - perl_fts3_tokenizer *t; + perl_Fts5Tokenizer *t; - if (!argc) { + if (!nArg) { return SQLITE_ERROR; } - t = (perl_fts3_tokenizer *) sqlite3_malloc(sizeof(*t)); + t = (perl_Fts5Tokenizer *) sqlite3_malloc(sizeof(*t)); if( t==NULL ) return SQLITE_NOMEM; memset(t, 0, sizeof(*t)); @@ -363,7 +353,7 @@ static int perl_fts5_tokenizer_Create( /* call the qualified::function::name */ PUSHMARK(SP); PUTBACK; - n_retval = call_pv(argv[0], G_SCALAR); + n_retval = call_pv(azArg[0], G_SCALAR); SPAGAIN; /* store a copy of the returned coderef into the tokenizer structure */ @@ -372,7 +362,7 @@ static int perl_fts5_tokenizer_Create( } retval = POPs; t->coderef = newSVsv(retval); - *ppTokenizer = &t->base; + *ppOut = &t->base; PUTBACK; FREETMPS; @@ -384,27 +374,33 @@ static int perl_fts5_tokenizer_Create( /* ** Destroy a tokenizer */ -static int perl_fts5_tokenizer_Delete(Fts5Tokenizer *pTokenizer){ +static void perl_fts5_tokenizer_Delete(Fts5Tokenizer *pTokenizer){ dTHX; - perl_fts5_tokenizer *t = (perl_fts5_tokenizer *) pTokenizer; + perl_Fts5Tokenizer *t = (perl_Fts5Tokenizer *) pTokenizer; sv_free(t->coderef); sqlite3_free(t); - return SQLITE_OK; + return; } /* -** Extract the next token from a tokenization cursor. 
The cursor must -** have been opened by a prior call to perl_fts3_tokenizer_Open(). +** This does a tokenizing run over the string. Found tokens (and synonyms) +** are stored by calling xToken() */ static int perl_fts5_tokenizer_Tokenize( - sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by perl_fts3_tokenizer_Open */ - const char **ppToken, /* OUT: *ppToken is the token text */ - int *pnBytes, /* OUT: Number of bytes in token */ - int *piStartOffset, /* OUT: Starting offset of token */ - int *piEndOffset, /* OUT: Ending offset of token */ - int *piPosition /* OUT: Position integer of token */ + Fts5Tokenizer* tokenizer, + void *pCtx, + int flags, /* Mask of FTS5_TOKENIZE_* flags */ + const char *pText, int nText, + int (*xToken)( + void *pCtx, /* Copy of 2nd argument to xTokenize() */ + int tflags, /* Mask of FTS5_TOKEN_* flags */ + const char *pToken, /* Pointer to buffer containing token */ + int nToken, /* Size of token in bytes */ + int iStart, /* Byte offset of token within input text */ + int iEnd /* Byte offset of end of token within input text */ + ) ){ - perl_fts3_tokenizer_cursor *c = (perl_fts3_tokenizer_cursor *) pCursor; + perl_Fts5Tokenizer *c = (perl_Fts5Tokenizer *) tokenizer; int result; int n_retval; char *token; @@ -418,61 +414,67 @@ static int perl_fts5_tokenizer_Tokenize( ENTER; SAVETMPS; - /* call the cursor */ + /* call the Perl tokenizer, and pass it our token callback */ PUSHMARK(SP); PUTBACK; + + // XXX Wrap the "found token" callback, and pass it to the user + // Then, restructure the data if it is UTF-8 + // First, do all of this in Perl so it is easier to debug + + ///* if we get back an empty list, there is no more token */ + //if (n_retval == 0) { + // result = SQLITE_DONE; + //} + ///* otherwise, get token details from the return list */ + //else { + // if (n_retval != 5) { + // warn("tokenizer cursor returned %d arguments", n_retval); + // } + // *piPosition = POPi; + // *piEndOffset = POPi; + // *piStartOffset = POPi; + // 
*pnBytes = POPi; + // token = POPpx; + // + // if (c->pInput) { /* if working with utf8 data */ + // + // /* recompute *pnBytes in bytes, not in chars */ + // *pnBytes = strlen(token); + // + // /* recompute start/end offsets in bytes, not in chars */ + // hop = *piStartOffset - c->lastCharOffset; + // byteOffset = (char*)utf8_hop((U8*)c->lastByteOffset, hop); + // hop = *piEndOffset - *piStartOffset; + // *piStartOffset = byteOffset - c->pInput; + // byteOffset = (char*)utf8_hop((U8*)byteOffset, hop); + // *piEndOffset = byteOffset - c->pInput; + // + // /* remember where we are for next round */ + // c->lastCharOffset = *piEndOffset, + // c->lastByteOffset = byteOffset; + // } + // + // /* make sure we have enough storage for copying the token */ + // if (*pnBytes > c->nTokenAllocated ){ + // char *pNew; + // c->nTokenAllocated = *pnBytes + 20; + // pNew = sqlite3_realloc(c->pToken, c->nTokenAllocated); + // if( !pNew ) return SQLITE_NOMEM; + // c->pToken = pNew; + // } + // + // /* need to copy the token into the C cursor before perl frees that + // memory */ + // memcpy(c->pToken, token, *pnBytes); + // *ppToken = c->pToken; + // + // result = SQLITE_OK; + // + n_retval = call_sv(c->coderef, G_ARRAY); SPAGAIN; - /* if we get back an empty list, there is no more token */ - if (n_retval == 0) { - result = SQLITE_DONE; - } - /* otherwise, get token details from the return list */ - else { - if (n_retval != 5) { - warn("tokenizer cursor returned %d arguments", n_retval); - } - *piPosition = POPi; - *piEndOffset = POPi; - *piStartOffset = POPi; - *pnBytes = POPi; - token = POPpx; - - if (c->pInput) { /* if working with utf8 data */ - - /* recompute *pnBytes in bytes, not in chars */ - *pnBytes = strlen(token); - - /* recompute start/end offsets in bytes, not in chars */ - hop = *piStartOffset - c->lastCharOffset; - byteOffset = (char*)utf8_hop((U8*)c->lastByteOffset, hop); - hop = *piEndOffset - *piStartOffset; - *piStartOffset = byteOffset - c->pInput; - byteOffset 
= (char*)utf8_hop((U8*)byteOffset, hop); - *piEndOffset = byteOffset - c->pInput; - - /* remember where we are for next round */ - c->lastCharOffset = *piEndOffset, - c->lastByteOffset = byteOffset; - } - - /* make sure we have enough storage for copying the token */ - if (*pnBytes > c->nTokenAllocated ){ - char *pNew; - c->nTokenAllocated = *pnBytes + 20; - pNew = sqlite3_realloc(c->pToken, c->nTokenAllocated); - if( !pNew ) return SQLITE_NOMEM; - c->pToken = pNew; - } - - /* need to copy the token into the C cursor before perl frees that - memory */ - memcpy(c->pToken, token, *pnBytes); - *ppToken = c->pToken; - - result = SQLITE_OK; - } PUTBACK; FREETMPS; @@ -485,7 +487,6 @@ static int perl_fts5_tokenizer_Tokenize( ** The set of routines that implement the perl FTS5 tokenizer */ fts5_tokenizer perl_fts5_tokenizer_Module = { - 0, perl_fts5_tokenizer_Create, perl_fts5_tokenizer_Delete, perl_fts5_tokenizer_Tokenize @@ -530,7 +531,7 @@ int sqlite_db_register_fts5_perl_tokenizer(pTHX_ SV *dbh) int rc; fts5_api *pFts5Api = sqlite_fetch_fts5_api(aTHX_ dbh); - sqlite3_tokenizer_module *p = &perl_fts5_tokenizer_Module; + fts5_tokenizer *p = &perl_fts5_tokenizer_Module; // pFts5Api->xCreateTokenizer(pFts5Api,...); diff --git a/lib/DBD/SQLite.pm b/lib/DBD/SQLite.pm index fd14348..634b4ec 100644 --- a/lib/DBD/SQLite.pm +++ b/lib/DBD/SQLite.pm @@ -51,6 +51,7 @@ sub driver { DBD::SQLite::db->install_method('sqlite_enable_load_extension'); DBD::SQLite::db->install_method('sqlite_load_extension'); DBD::SQLite::db->install_method('sqlite_register_fts3_perl_tokenizer'); + DBD::SQLite::db->install_method('sqlite_register_fts5_perl_tokenizer'); DBD::SQLite::db->install_method('sqlite_trace', { O => 0x0004 }); DBD::SQLite::db->install_method('sqlite_profile', { O => 0x0004 }); DBD::SQLite::db->install_method('sqlite_table_column_metadata', { O => 0x0004 }); From e521f92bf66d57d80d4f4d3e0ce4e3d0011ebaf7 Mon Sep 17 00:00:00 2001 From: Max Maischein Date: Fri, 4 Sep 2020 15:52:29 +0200 
Subject: [PATCH 05/14] The test suite now passes without segfaults or anything --- SQLite.xs | 25 ++++++++++ dbdimp.h | 2 + dbdimp_tokenizer.inc | 76 +++++++++++++++++++++++++------ lib/DBD/SQLite.pm | 60 ++++++++++++------------ t/67_fts5.t | 106 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 226 insertions(+), 43 deletions(-) create mode 100644 t/67_fts5.t diff --git a/SQLite.xs b/SQLite.xs index 0a3a052..f1676a1 100644 --- a/SQLite.xs +++ b/SQLite.xs @@ -314,6 +314,31 @@ register_fts3_perl_tokenizer(dbh) OUTPUT: RETVAL +static int +register_fts5_perl_tokenizer(dbh) + SV *dbh + ALIAS: + DBD::SQLite::db::sqlite_register_fts5_perl_tokenizer = 1 + CODE: + RETVAL = sqlite_db_register_fts5_perl_tokenizer(aTHX_ dbh); + OUTPUT: + RETVAL + +static int +fts5_xToken(pCtx,tflags,svToken,iStart,iEnd) + SV *pCtx + int tflags + SV *svToken + STRLEN iStart + STRLEN iEnd + ALIAS: + DBD::SQLite::db::fts5_xToken = 1 + CODE: + dTHX; + RETVAL = perl_fts5_xToken(aTHX_ pCtx,tflags,svToken,iStart,iEnd); + OUTPUT: + RETVAL + HV* db_status(dbh, reset = 0) SV* dbh diff --git a/dbdimp.h b/dbdimp.h index b357e1f..03d5a7f 100644 --- a/dbdimp.h +++ b/dbdimp.h @@ -182,6 +182,8 @@ HV* sqlite_db_table_column_metadata(pTHX_ SV *dbh, SV *dbname, SV *tablename, SV HV* _sqlite_db_status(pTHX_ SV *dbh, int reset); SV* sqlite_db_filename(pTHX_ SV *dbh); int sqlite_db_register_fts3_perl_tokenizer(pTHX_ SV *dbh); +int sqlite_db_register_fts5_perl_tokenizer(pTHX_ SV *dbh); +int perl_fts5_xToken(pTHX_ SV* pCtx, int tflags, SV* svToken, int iStart, int iEnd ); HV* _sqlite_status(int reset); HV* _sqlite_st_status(pTHX_ SV *sth, int reset); int sqlite_db_create_module(pTHX_ SV *dbh, const char *name, const char *perl_class); diff --git a/dbdimp_tokenizer.inc b/dbdimp_tokenizer.inc index a6ee4bd..c91a8d5 100644 --- a/dbdimp_tokenizer.inc +++ b/dbdimp_tokenizer.inc @@ -17,7 +17,7 @@ typedef struct perl_fts3_tokenizer_cursor { } perl_fts3_tokenizer_cursor; typedef struct perl_Fts5Tokenizer { - 
Fts5Tokenizer base; + /* Fts5Tokenizer base; - this is an empty struct, so we omit it entirely */ SV *coderef; /* the perl tokenizer is a coderef that takes ** a string and and some parameters and ** in turn calls the xToken() function @@ -25,6 +25,23 @@ typedef struct perl_Fts5Tokenizer { */ } perl_Fts5Tokenizer; +/* This is the structure where we store the information between calls + * from Perl and callbacks to SQLite. We could instead pass these values + * as opaque arguments to Perl and back, but this reduces the number of + * opaque values handled by Perl to a single such value. + */ +typedef struct perl_cb_ctx { + void * Ctx; + int (*xToken)( + void *pCtx, /* Copy of 2nd argument to xTokenize() */ + int tflags, /* Mask of FTS5_TOKEN_* flags */ + const char *pToken, /* Pointer to buffer containing token */ + int nToken, /* Size of token in bytes */ + int iStart, /* Byte offset of token within input text */ + int iEnd /* Byte offset of end of token within input text */ + ); +} perl_cb_ctx; + /* ** Create a new tokenizer instance.
** Will be called whenever a FTS3 table is created with @@ -338,7 +355,6 @@ static int perl_fts5_tokenizer_Create( int n_retval; SV *retval; perl_Fts5Tokenizer *t; - if (!nArg) { return SQLITE_ERROR; } @@ -346,7 +362,6 @@ static int perl_fts5_tokenizer_Create( t = (perl_Fts5Tokenizer *) sqlite3_malloc(sizeof(*t)); if( t==NULL ) return SQLITE_NOMEM; memset(t, 0, sizeof(*t)); - ENTER; SAVETMPS; @@ -358,11 +373,12 @@ static int perl_fts5_tokenizer_Create( /* store a copy of the returned coderef into the tokenizer structure */ if (n_retval != 1) { - warn("tokenizer_Create returned %d arguments", n_retval); + warn("tokenizer_Create returned %d arguments, expected a single coderef", n_retval); } retval = POPs; t->coderef = newSVsv(retval); - *ppOut = &t->base; + /* *ppOut = &t->base; */ /* Fts5Tokenizer is empty and gcc complains about that */ + *ppOut = (Fts5Tokenizer *) t; PUTBACK; FREETMPS; @@ -405,18 +421,38 @@ static int perl_fts5_tokenizer_Tokenize( int n_retval; char *token; char *byteOffset; - STRLEN n_a; /* this is required for older perls < 5.8.8 */ - I32 hop; - dTHX; dSP; + /* The implicit assumption here is that our callback will only be + * invoked from a stack frame below this frame! + */ + perl_cb_ctx ctx; + SV* ctxP; + SV* text; + + STRLEN n_a; /* this is required for older perls < 5.8.8 */ + I32 hop; + ENTER; SAVETMPS; /* call the Perl tokenizer, and pass it our token callback */ PUSHMARK(SP); - PUTBACK; + + ctx.Ctx = pCtx; + ctx.xToken = xToken; + ctxP = newSVpvn((const char *const)&ctx, sizeof(ctx)); + + text = newSVpvn(pText, nText); + + // We pass four arguments + //EXTEND(SP, 2); + XPUSHs(sv_2mortal(ctxP)); + XPUSHs(sv_2mortal(text)); + XPUSHs(sv_2mortal(newSViv(flags))); + // We need to properly wrap this so it is callable from Perl... + // ... without needing actual local storage or a global variable... 
// XXX Wrap the "found token" callback, and pass it to the user // Then, restructure the data if it is UTF-8 @@ -471,11 +507,10 @@ static int perl_fts5_tokenizer_Tokenize( // // result = SQLITE_OK; // - + PUTBACK; n_retval = call_sv(c->coderef, G_ARRAY); SPAGAIN; - PUTBACK; FREETMPS; LEAVE; @@ -483,6 +518,20 @@ static int perl_fts5_tokenizer_Tokenize( return result; } +int perl_fts5_xToken(pTHX_ + SV* pCtx, + int tflags, /* Mask of FTS5_TOKEN_* flags */ + SV* svToken, /* Pointer to buffer containing token */ + int iStart, /* Byte offset of token within input text */ + int iEnd /* Byte offset of end of token within input text */ +) { + const char* chrToken = SvPV_nolen(svToken); + STRLEN nToken = strlen(chrToken); + perl_cb_ctx * p = (perl_cb_ctx *)SvPV_nolen( pCtx ); + return p->xToken(p->Ctx,tflags,chrToken,nToken,iStart,iEnd); +} + + /* ** The set of routines that implement the perl FTS5 tokenizer */ @@ -533,8 +582,7 @@ int sqlite_db_register_fts5_perl_tokenizer(pTHX_ SV *dbh) fts5_api *pFts5Api = sqlite_fetch_fts5_api(aTHX_ dbh); fts5_tokenizer *p = &perl_fts5_tokenizer_Module; - // pFts5Api->xCreateTokenizer(pFts5Api,...); + rc = pFts5Api->xCreateTokenizer(pFts5Api, "perl", 0, p, 0); - - return 0; + return rc; } diff --git a/lib/DBD/SQLite.pm b/lib/DBD/SQLite.pm index 634b4ec..19d0df9 100644 --- a/lib/DBD/SQLite.pm +++ b/lib/DBD/SQLite.pm @@ -143,10 +143,12 @@ sub connect { $dbh->sqlite_collation_needed( \&install_collation ); $dbh->sqlite_create_function( "REGEXP", 2, \®exp ); $dbh->sqlite_register_fts3_perl_tokenizer(); + $dbh->sqlite_register_fts5_perl_tokenizer(); } else { $dbh->func( \&install_collation, "collation_needed" ); $dbh->func( "REGEXP", 2, \®exp, "create_function" ); $dbh->func( "register_fts3_perl_tokenizer" ); + $dbh->func( "register_fts5_perl_tokenizer" ); } # HACK: Since PrintWarn = 0 doesn't seem to actually prevent warnings @@ -1223,7 +1225,7 @@ store natively as a BLOB use the following code: use DBI qw(:sql_types); my $dbh = 
DBI->connect("dbi:SQLite:dbfile","",""); - + my $blob = `cat foo.jpg`; my $sth = $dbh->prepare("INSERT INTO mytable VALUES (1, ?)"); $sth->bind_param(1, $blob, SQL_BLOB); @@ -1235,7 +1237,7 @@ And then retrieval just works: $sth->execute(); my $row = $sth->fetch; my $blobo = $row->[1]; - + # now $blobo == $blob =head2 Functions And Bind Parameters @@ -1264,7 +1266,7 @@ As shown above in the C section, you can always use C to tell the type of a bind value. use DBI qw(:sql_types); # Don't forget this - + my $sth = $dbh->prepare(q{ SELECT bar FROM foo GROUP BY bar HAVING count(*) > ?; }); @@ -1454,13 +1456,13 @@ statement. To end it, call C methods, or issue the corresponding statements. $dbh->{AutoCommit} = 1; - + $dbh->begin_work; # or $dbh->do('BEGIN TRANSACTION'); - + # $dbh->{AutoCommit} is turned off temporarily during a transaction; - + $dbh->commit; # or $dbh->do('COMMIT'); - + # $dbh->{AutoCommit} is turned on again; =item When the AutoCommit flag is off @@ -1474,15 +1476,15 @@ You can commit or roll it back freely. Another transaction will automatically begin if you execute another statement. $dbh->{AutoCommit} = 0; - + # $dbh->do('BEGIN TRANSACTION') is not necessary, but possible - + ... 
- + $dbh->commit; # or $dbh->do('COMMIT'); - + # $dbh->{AutoCommit} stays intact; - + $dbh->{AutoCommit} = 1; # ends the transactional mode =back @@ -2090,38 +2092,38 @@ Here is a simple aggregate function which returns the variance (example adapted from pysqlite): package variance; - + sub new { bless [], shift; } - + sub step { my ( $self, $value ) = @_; - + push @$self, $value; } - + sub finalize { my $self = $_[0]; - + my $n = @$self; - + # Variance is NULL unless there is more than one row return undef unless $n || $n == 1; - + my $mu = 0; foreach my $v ( @$self ) { $mu += $v; } $mu /= $n; - + my $sigma = 0; foreach my $v ( @$self ) { $sigma += ($v - $mu)**2; } $sigma = $sigma / ($n - 1); - + return $sigma; } - + $dbh->sqlite_create_aggregate( "variance", 1, 'variance' ); The aggregate function can then be used as: @@ -2390,13 +2392,13 @@ You may also pass 0 as an argument to reset the status. You can change how the connected database should behave like this: use DBD::SQLite::Constants qw/:database_connection_configuration_options/; - + my $dbh = DBI->connect('dbi:SQLite::memory:'); # This disables language features that allow ordinary SQL # to deliberately corrupt the database file $dbh->sqlite_db_config( SQLITE_DBCONFIG_DEFENSIVE, 1 ); - + # This disables two-arg version of fts3_tokenizer. $dbh->sqlite_db_config( SQLITE_DBCONFIG_ENABLE_FTS3_TOKENIZER, 0 ); @@ -2693,16 +2695,16 @@ then query which buildings overlap or are contained within a specified region: SELECT id FROM city_buildings WHERE minLong >= ? AND maxLong <= ? AND minLat >= ? AND maxLat <= ? - + # ... and those that overlap query coordinates my $overlap_sql = <<""; SELECT id FROM city_buildings WHERE maxLong >= ? AND minLong <= ? AND maxLat >= ? AND minLat <= ? 
- + my $contained = $dbh->selectcol_arrayref($contained_sql,undef, $minLong, $maxLong, $minLat, $maxLat); - + my $overlapping = $dbh->selectcol_arrayref($overlap_sql,undef, $minLong, $maxLong, $minLat, $maxLat); @@ -2750,10 +2752,10 @@ header like this: use File::ShareDir 'dist_dir'; use File::Spec::Functions 'catfile'; - + # the whole sqlite3.h header my $sqlite3_h = catfile(dist_dir('DBD-SQLite'), 'sqlite3.h'); - + # or only a particular header, amalgamated in sqlite3.c my $what_i_want = 'parse.h'; my $sqlite3_c = catfile(dist_dir('DBD-SQLite'), 'sqlite3.c'); diff --git a/t/67_fts5.t b/t/67_fts5.t new file mode 100644 index 0000000..bd5e37c --- /dev/null +++ b/t/67_fts5.t @@ -0,0 +1,106 @@ +use strict; +use warnings; +no if $] >= 5.022, "warnings", "locale"; +use lib "t/lib"; +use SQLiteTest; +use Test::More; +#use if -d ".git", "Test::FailWarnings"; +use DBD::SQLite; + +my @texts = ("il était une bergère", + "qui gardait ses moutons", + "elle fit un fromage", + "du lait de ses moutons"); + +my @tests = ( +# query => expected results + ["bergère" => 0 ], + ["berg*" => 0 ], + ["foobar" ], + ["moutons" => 1, 3 ], + ['"qui gardait"' => 1 ], + ["moutons NOT lait" => 1 ], + ["il était" => 0 ], + ["(il OR elle) AND un*" => 0, 2 ], +); + +BEGIN { + requires_unicode_support(); + + if (!has_fts()) { + plan skip_all => 'FTS is disabled for this DBD::SQLite'; + } + if ($DBD::SQLite::sqlite_version_number >= 3011000 and $DBD::SQLite::sqlite_version_number < 3012000 and !has_compile_option('ENABLE_FTS5_TOKENIZER')) { + plan skip_all => 'FTS5 tokenizer is disabled for this DBD::SQLite'; + } +} + +# Perl may spit a warning on locale +# use Test::NoWarnings; + +BEGIN { + # Sadly perl for windows (and probably sqlite, too) may hang + # if the system locale doesn't support european languages. + # en-us should be a safe default. if it doesn't work, use 'C'. 
+ if ( $^O eq 'MSWin32') { + use POSIX 'locale_h'; + setlocale(LC_COLLATE, 'en-us'); + } +} + +use locale; + +sub locale_tokenizer { # see also: Search::Tokenizer + return sub { + my( $ctx, $string, $tokenizer_context_flags ) = @_; + my $regex = qr/\w+/; + #my $term_index = 0; + # + while( $string =~ /$regex/g) { + my ($start, $end) = ($-[0], $+[0]); + my $term = substr($string, $start, my $len = $end-$start); + my $flags = 0; # SQLITE_FTS5_TOKEN; + DBD::SQLite::db::fts5_xToken($ctx,$flags,$term,$start,$end); + }; + }; +} + +use DBD::SQLite; + +for my $use_unicode (0, 1) { + + # connect + my $dbh = connect_ok( RaiseError => 1, sqlite_unicode => $use_unicode ); + + for my $fts (qw/fts5/) { + + # create fts table + $dbh->do(<<"") or die DBI::errstr; + CREATE VIRTUAL TABLE try_$fts + USING $fts(content, tokenize="perl 'main::locale_tokenizer'") + + # populate it + my $insert_sth = $dbh->prepare(<<"") or die DBI::errstr; + INSERT INTO try_$fts(content) VALUES(?) + + my @doc_ids; + for (my $i = 0; $i < @texts; $i++) { + $insert_sth->execute($texts[$i]); + $doc_ids[$i] = $dbh->last_insert_id("", "", "", ""); + } + + # queries + SKIP: { + my $sql = "SELECT rowid FROM try_$fts WHERE content MATCH ?"; + + for my $t (@tests) { + my ($query, @expected) = @$t; + @expected = map {$doc_ids[$_]} @expected; + my $results = $dbh->selectcol_arrayref($sql, undef, $query); + is_deeply($results, \@expected, "$query ($fts, unicode=$use_unicode)"); + } + } + } +} + +done_testing; From a83a495af1b1eee9c0d3987266f98a6a86d89ad2 Mon Sep 17 00:00:00 2001 From: Max Maischein Date: Sat, 12 Sep 2020 22:35:49 +0200 Subject: [PATCH 06/14] Note an optimization, for later --- dbdimp_tokenizer.inc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbdimp_tokenizer.inc b/dbdimp_tokenizer.inc index c91a8d5..712c50c 100644 --- a/dbdimp_tokenizer.inc +++ b/dbdimp_tokenizer.inc @@ -424,8 +424,8 @@ static int perl_fts5_tokenizer_Tokenize( dTHX; dSP; - /* The implicit assumption here is 
that our callback will only be - * invoked from a stack frame below this frame! + /* newSVpvn() will create a copy of this buffer, but ideally we would + * directly write into the PV part of that copied buffer instead */ perl_cb_ctx ctx; SV* ctxP; From c23579d93abf96672aab0328d33354a39c7f509d Mon Sep 17 00:00:00 2001 From: Max Maischein Date: Sat, 23 Apr 2022 11:17:57 +0200 Subject: [PATCH 07/14] Properly export the FTS5_ constants on demand # Conflicts: # constants.inc # lib/DBD/SQLite/Constants.pm # util/constants.pl --- constants.inc | 15 +++++++++++++ lib/DBD/SQLite/Constants.pm | 42 ++++++++++++++++++++++++++++++++++++- t/67_fts5.t | 7 ++++--- util/SQLiteUtil.pm | 18 +++++++++++++++- util/constants.pl | 27 ++++++++++++++++++++---- 5 files changed, 100 insertions(+), 9 deletions(-) diff --git a/constants.inc b/constants.inc index ed2e753..c19a284 100644 --- a/constants.inc +++ b/constants.inc @@ -1293,6 +1293,7 @@ _const_flags_for_file_open_operations() SQLITE_OPEN_READONLY = SQLITE_OPEN_READONLY SQLITE_OPEN_READWRITE = SQLITE_OPEN_READWRITE SQLITE_OPEN_CREATE = SQLITE_OPEN_CREATE + SQLITE_OPEN_SUPER_JOURNAL = SQLITE_OPEN_SUPER_JOURNAL SQLITE_OPEN_NOMUTEX = SQLITE_OPEN_NOMUTEX CODE: RETVAL = ix; @@ -1471,6 +1472,19 @@ _const_flags_for_file_open_operations_3037000_zero() #if SQLITE_VERSION_NUMBER >= 3008003 +IV +_const_fts5_tokenizer() + ALIAS: + FTS5_TOKENIZE_QUERY = FTS5_TOKENIZE_QUERY + FTS5_TOKENIZE_PREFIX = FTS5_TOKENIZE_PREFIX + FTS5_TOKENIZE_DOCUMENT = FTS5_TOKENIZE_DOCUMENT + FTS5_TOKENIZE_AUX = FTS5_TOKENIZE_AUX + FTS5_TOKEN_COLOCATED = FTS5_TOKEN_COLOCATED + CODE: + RETVAL = ix; + OUTPUT: + RETVAL + IV _const_function_flags_3008003() ALIAS: @@ -1820,6 +1834,7 @@ _const__flags_for_file_open_operations() OPEN_READONLY = SQLITE_OPEN_READONLY OPEN_READWRITE = SQLITE_OPEN_READWRITE OPEN_CREATE = SQLITE_OPEN_CREATE + OPEN_SUPER_JOURNAL = SQLITE_OPEN_SUPER_JOURNAL OPEN_NOMUTEX = SQLITE_OPEN_NOMUTEX CODE: RETVAL = ix; diff --git a/lib/DBD/SQLite/Constants.pm 
b/lib/DBD/SQLite/Constants.pm index 5be8f0a..16c26e0 100644 --- a/lib/DBD/SQLite/Constants.pm +++ b/lib/DBD/SQLite/Constants.pm @@ -190,6 +190,15 @@ our @EXPORT_OK = ( SQLITE_OPEN_URI /, + # fts5_tokenizer + qw/ + FTS5_TOKENIZE_AUX + FTS5_TOKENIZE_DOCUMENT + FTS5_TOKENIZE_PREFIX + FTS5_TOKENIZE_QUERY + FTS5_TOKEN_COLOCATED + /, + # function_flags qw/ SQLITE_DETERMINISTIC @@ -357,6 +366,11 @@ our %EXPORT_TAGS = ( SQLITE_ERROR_SNAPSHOT SQLITE_FLOAT SQLITE_FORMAT + FTS5_TOKENIZE_AUX + FTS5_TOKENIZE_DOCUMENT + FTS5_TOKENIZE_PREFIX + FTS5_TOKENIZE_QUERY + FTS5_TOKEN_COLOCATED SQLITE_FULL SQLITE_FUNCTION SQLITE_IGNORE @@ -650,6 +664,14 @@ our %EXPORT_TAGS = ( SQLITE_OPEN_URI /], + fts5_tokenizer => [qw/ + FTS5_TOKENIZE_AUX + FTS5_TOKENIZE_DOCUMENT + FTS5_TOKENIZE_PREFIX + FTS5_TOKENIZE_QUERY + FTS5_TOKEN_COLOCATED + /], + function_flags => [qw/ SQLITE_DETERMINISTIC SQLITE_DIRECTONLY @@ -736,7 +758,7 @@ DBD::SQLite::Constants - common SQLite constants =head1 DESCRIPTION -You can import necessary SQLite constants from this module. Available tags are C, C, C, C, C (C), C, C, C, C (C), C, C (C), C, C. See L for the complete list of constants. +You can import necessary SQLite constants from this module. Available tags are C, C, C, C, C (C), C, C, C, C (C), C, C, C (C), C, C. See L for the complete list of constants. This module does not export anything by default. @@ -1078,6 +1100,8 @@ This module does not export anything by default. =item SQLITE_OPEN_CREATE +=item SQLITE_OPEN_SUPER_JOURNAL + =item SQLITE_OPEN_NOMUTEX =item SQLITE_OPEN_FULLMUTEX @@ -1098,6 +1122,22 @@ This module does not export anything by default. 
=back +=head2 fts5_tokenizer + +=over 4 + +=item SQLITE_FTS5_TOKENIZE_QUERY + +=item SQLITE_FTS5_TOKENIZE_PREFIX + +=item SQLITE_FTS5_TOKENIZE_DOCUMENT + +=item SQLITE_FTS5_TOKENIZE_AUX + +=item SQLITE_FTS5_TOKEN_COLOCATED + +=back + =head2 function_flags =over 4 diff --git a/t/67_fts5.t b/t/67_fts5.t index bd5e37c..2919c82 100644 --- a/t/67_fts5.t +++ b/t/67_fts5.t @@ -48,6 +48,8 @@ BEGIN { } } +use DBD::SQLite::Constants ':fts5_tokenizer'; + use locale; sub locale_tokenizer { # see also: Search::Tokenizer @@ -59,14 +61,13 @@ sub locale_tokenizer { # see also: Search::Tokenizer while( $string =~ /$regex/g) { my ($start, $end) = ($-[0], $+[0]); my $term = substr($string, $start, my $len = $end-$start); - my $flags = 0; # SQLITE_FTS5_TOKEN; + my $flags = 0; + #my $flags = FTS5_TOKEN_COLOCATED; DBD::SQLite::db::fts5_xToken($ctx,$flags,$term,$start,$end); }; }; } -use DBD::SQLite; - for my $use_unicode (0, 1) { # connect diff --git a/util/SQLiteUtil.pm b/util/SQLiteUtil.pm index cfe8375..b88373b 100644 --- a/util/SQLiteUtil.pm +++ b/util/SQLiteUtil.pm @@ -143,6 +143,11 @@ my %since = ( STMTSTATUS_RUN => '3020000', STMTSTATUS_MEMUSED => '3020000', DBCONFIG_ENABLE_QPSG => '3020000', + SQLITE_FTS5_TOKEN => '3020000', + FTS5_TOKENIZE_QUERY => '3020000', + FTS5_TOKENIZE_PREFIX => '3020000', + FTS5_TOKENIZE_DOCUMENT => '3020000', + FTS5_TOKENIZE_AUX => '3020000', IOERR_BEGIN_ATOMIC => '3021000', IOERR_COMMIT_ATOMIC => '3021000', IOERR_ROLLBACK_ATOMIC => '3021000', @@ -313,6 +318,17 @@ sub extract_constants { } unshift @{$constants{_authorizer_return_codes}}, 'OK'; + # Fudge in the FTS5 constants, as these don't follow the common pattern + $constants{fts5_tokenizer} ||= []; + push @{$constants{fts5_tokenizer}}, + 'FTS5_TOKENIZE_QUERY', + 'FTS5_TOKENIZE_PREFIX', + 'FTS5_TOKENIZE_DOCUMENT', + 'FTS5_TOKENIZE_AUX', + 'FTS5_TOKEN_COLOCATED' + ; + + %constants; } @@ -335,7 +351,7 @@ sub srcdir { sub download_url { my $version = shift; my $year = $version->year; - join '', + join 
'', "http://www.sqlite.org/", ($version->year ? $version->year."/" : ""), "sqlite-".($version->archive_type)."-$version".$version->extension; diff --git a/util/constants.pl b/util/constants.pl index ea2d2a5..daf5437 100644 --- a/util/constants.pl +++ b/util/constants.pl @@ -26,6 +26,7 @@ my @dbd_sqlite_constants = ( ); my %constants = extract_constants(); + write_inc(%constants); write_pm(%constants); @@ -86,9 +87,18 @@ _const_$tag() END for my $name (@$list) { - my $prefix = $tag =~ /^_/ ? "" : "SQLITE_"; + my $prefix; + my $prefix2 = "SQLITE_"; + if( $tag =~ /^_/ ) { + $prefix = ""; + } elsif( $tag =~ /^fts5_/ ) { + $prefix = ""; + $prefix2 = ""; + } else { + $prefix = "SQLITE_"; + }; print $fh <<"END"; - $prefix$name = SQLITE_$name + $prefix$name = $prefix2$name END } @@ -111,7 +121,16 @@ END my $ix = 1; for my $name (@{$constants{$tag}}) { - my $prefix = $tag =~ /^_/ ? "" : "SQLITE_"; + my $prefix; + my $prefix2 = "SQLITE_"; + if( $tag =~ /^_/ ) { + $prefix = ""; + } elsif( $tag =~ /^fts5_/ ) { + $prefix = ""; + $prefix2 = ""; + } else { + $prefix = "SQLITE_"; + }; print $fh <<"END"; $prefix$name = $ix END @@ -166,7 +185,7 @@ END print $fh <<"END"; # $tag qw/ -@{[join "\n", map {" SQLITE_$_"} sort @{$constants{$tag}}]} +@{[join "\n", map {/^FTS5_/ ? 
" $_" : " SQLITE_$_"} sort @{$constants{$tag}}]} /, END From 12d982ffec2d88e3ddcd1cbf53c935cfd8efef9f Mon Sep 17 00:00:00 2001 From: Max Maischein Date: Sun, 13 Sep 2020 08:56:12 +0200 Subject: [PATCH 08/14] Silence a compilation warning on Mac/clang Gcc doesn't warn on this (at my settings) but Github automation found this one --- dbdimp_tokenizer.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbdimp_tokenizer.inc b/dbdimp_tokenizer.inc index 712c50c..0e1abf2 100644 --- a/dbdimp_tokenizer.inc +++ b/dbdimp_tokenizer.inc @@ -17,7 +17,7 @@ typedef struct perl_fts3_tokenizer_cursor { } perl_fts3_tokenizer_cursor; typedef struct perl_Fts5Tokenizer { - /* Fts5Tokenizer base; /* this is an empty struct, so we omit it entirely */ + /* Fts5Tokenizer base; */ /* this is an empty struct, so we omit it entirely */ SV *coderef; /* the perl tokenizer is a coderef that takes ** a string and and some parameters and ** in turn calls the xToken() function From 740ba861f9da0e12ef3a7bd896b4ebf102181059 Mon Sep 17 00:00:00 2001 From: Max Maischein Date: Sun, 20 Sep 2020 20:01:51 +0200 Subject: [PATCH 09/14] Eliminate unused (but returned) variable --- dbdimp_tokenizer.inc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/dbdimp_tokenizer.inc b/dbdimp_tokenizer.inc index 0e1abf2..c0ac4d2 100644 --- a/dbdimp_tokenizer.inc +++ b/dbdimp_tokenizer.inc @@ -417,8 +417,6 @@ static int perl_fts5_tokenizer_Tokenize( ) ){ perl_Fts5Tokenizer *c = (perl_Fts5Tokenizer *) tokenizer; - int result; - int n_retval; char *token; char *byteOffset; dTHX; @@ -508,14 +506,16 @@ static int perl_fts5_tokenizer_Tokenize( // result = SQLITE_OK; // PUTBACK; - n_retval = call_sv(c->coderef, G_ARRAY); + call_sv(c->coderef, G_VOID); + + printf("Returned from tokenization CB, returning to SQLite\n"); SPAGAIN; PUTBACK; FREETMPS; LEAVE; - return result; + return SQLITE_OK; } int perl_fts5_xToken(pTHX_ From 2fb0329bbd5dfa9dd4a471af95690a19d04fe920 Mon Sep 17 00:00:00 2001 
From: Max Maischein Date: Sun, 20 Sep 2020 20:13:24 +0200 Subject: [PATCH 10/14] Eliminate call to strlen() in favour of SvPV() --- dbdimp_tokenizer.inc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/dbdimp_tokenizer.inc b/dbdimp_tokenizer.inc index c0ac4d2..c6759b4 100644 --- a/dbdimp_tokenizer.inc +++ b/dbdimp_tokenizer.inc @@ -508,7 +508,6 @@ static int perl_fts5_tokenizer_Tokenize( PUTBACK; call_sv(c->coderef, G_VOID); - printf("Returned from tokenization CB, returning to SQLite\n"); SPAGAIN; PUTBACK; @@ -525,8 +524,8 @@ int perl_fts5_xToken(pTHX_ int iStart, /* Byte offset of token within input text */ int iEnd /* Byte offset of end of token within input text */ ) { - const char* chrToken = SvPV_nolen(svToken); - STRLEN nToken = strlen(chrToken); + STRLEN nToken; + const char* chrToken = SvPV(svToken, nToken); perl_cb_ctx * p = (perl_cb_ctx *)SvPV_nolen( pCtx ); return p->xToken(p->Ctx,tflags,chrToken,nToken,iStart,iEnd); } From f89e52f1ea15370518abb932625f69d6eaca6657 Mon Sep 17 00:00:00 2001 From: Max Maischein Date: Sun, 4 Sep 2022 08:23:59 +0200 Subject: [PATCH 11/14] Split up dbdimp_tokenizer.inc into FTS3 and FTS5 --- MANIFEST | 3 +- Makefile.PL | 4 +- dbdimp.c | 3 +- ...tokenizer.inc => dbdimp_fts3_tokenizer.inc | 254 ------------------ dbdimp_fts5_tokenizer.inc | 253 +++++++++++++++++ 5 files changed, 259 insertions(+), 258 deletions(-) rename dbdimp_tokenizer.inc => dbdimp_fts3_tokenizer.inc (57%) create mode 100644 dbdimp_fts5_tokenizer.inc diff --git a/MANIFEST b/MANIFEST index dd95132..6429727 100644 --- a/MANIFEST +++ b/MANIFEST @@ -3,7 +3,8 @@ Changes constants.inc dbdimp.c dbdimp.h -dbdimp_tokenizer.inc +dbdimp_fts3_tokenizer.inc +dbdimp_fts5_tokenizer.inc dbdimp_virtual_table.inc fts3_tokenizer.h inc/Test/FailWarnings.pm diff --git a/Makefile.PL b/Makefile.PL index b0043c0..162e14d 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -402,10 +402,10 @@ WriteMakefile( ), OBJECT => ( $sqlite_local ? 
'$(O_FILES)' - : 'SQLite.o dbdimp.o' + : 'SQLite.o dbdimp_fts3.o dbdimp_fts5.o' ), depend => { - 'dbdimp.o' => 'dbdimp_tokenizer.inc dbdimp_virtual_table.inc', + 'dbdimp.o' => 'dbdimp_fts3_tokenizer.inc dbdimp_fts5_tokenizer.inc dbdimp_virtual_table.inc', }, clean => { FILES => 'SQLite.xsi config.h tv.log *.old', diff --git a/dbdimp.c b/dbdimp.c index 0028e14..01b5089 100644 --- a/dbdimp.c +++ b/dbdimp.c @@ -2990,7 +2990,8 @@ sqlite_db_txn_state(pTHX_ SV *dbh, SV *schema) #endif } -#include "dbdimp_tokenizer.inc" +#include "dbdimp_fts3_tokenizer.inc" +#include "dbdimp_fts5_tokenizer.inc" #include "dbdimp_virtual_table.inc" /* end */ diff --git a/dbdimp_tokenizer.inc b/dbdimp_fts3_tokenizer.inc similarity index 57% rename from dbdimp_tokenizer.inc rename to dbdimp_fts3_tokenizer.inc index c6759b4..17089f3 100644 --- a/dbdimp_tokenizer.inc +++ b/dbdimp_fts3_tokenizer.inc @@ -16,15 +16,6 @@ typedef struct perl_fts3_tokenizer_cursor { int currentChar; /* char position corresponding to currentByte */ } perl_fts3_tokenizer_cursor; -typedef struct perl_Fts5Tokenizer { - /* Fts5Tokenizer base; */ /* this is an empty struct, so we omit it entirely */ - SV *coderef; /* the perl tokenizer is a coderef that takes - ** a string and and some parameters and - ** in turn calls the xToken() function - ** passed to it - */ -} perl_Fts5Tokenizer; - /* This is the structure where we store the information between calls * from Perl and callbacks to SQLite. We could instead pass these values * as opaque arguments to Perl and back, but this reduces the number of @@ -340,248 +331,3 @@ int sqlite_db_register_fts3_perl_tokenizer(pTHX_ SV *dbh) return sqlite3_finalize(pStmt); } - -/* -** Create a new tokenizer instance. -** Will be called whenever a FTS5 table is created with -** CREATE .. USING fts5( ... 
, tokenize=perl qualified::function::name) -** where qualified::function::name is a fully qualified perl function -*/ -static int perl_fts5_tokenizer_Create( - void* pCtx, const char **azArg, int nArg, Fts5Tokenizer **ppOut -){ - dTHX; - dSP; - int n_retval; - SV *retval; - perl_Fts5Tokenizer *t; - if (!nArg) { - return SQLITE_ERROR; - } - - t = (perl_Fts5Tokenizer *) sqlite3_malloc(sizeof(*t)); - if( t==NULL ) return SQLITE_NOMEM; - memset(t, 0, sizeof(*t)); - ENTER; - SAVETMPS; - - /* call the qualified::function::name */ - PUSHMARK(SP); - PUTBACK; - n_retval = call_pv(azArg[0], G_SCALAR); - SPAGAIN; - - /* store a copy of the returned coderef into the tokenizer structure */ - if (n_retval != 1) { - warn("tokenizer_Create returned %d arguments, expected a single coderef", n_retval); - } - retval = POPs; - t->coderef = newSVsv(retval); - /* *ppOut = &t->base; */ /* Fts5Tokenizer is empty and gcc complains about that */ - *ppOut = (Fts5Tokenizer *) t; - - PUTBACK; - FREETMPS; - LEAVE; - - return SQLITE_OK; -} - -/* -** Destroy a tokenizer -*/ -static void perl_fts5_tokenizer_Delete(Fts5Tokenizer *pTokenizer){ - dTHX; - perl_Fts5Tokenizer *t = (perl_Fts5Tokenizer *) pTokenizer; - sv_free(t->coderef); - sqlite3_free(t); - return; -} - -/* -** This does a tokenizing run over the string. 
Found tokens (and synonyms) -** are stored by calling xToken() -*/ -static int perl_fts5_tokenizer_Tokenize( - Fts5Tokenizer* tokenizer, - void *pCtx, - int flags, /* Mask of FTS5_TOKENIZE_* flags */ - const char *pText, int nText, - int (*xToken)( - void *pCtx, /* Copy of 2nd argument to xTokenize() */ - int tflags, /* Mask of FTS5_TOKEN_* flags */ - const char *pToken, /* Pointer to buffer containing token */ - int nToken, /* Size of token in bytes */ - int iStart, /* Byte offset of token within input text */ - int iEnd /* Byte offset of end of token within input text */ - ) -){ - perl_Fts5Tokenizer *c = (perl_Fts5Tokenizer *) tokenizer; - char *token; - char *byteOffset; - dTHX; - dSP; - - /* newSVpvn() will create a copy of this buffer, but ideally we would - * directly write into the PV part of that copied buffer instead - */ - perl_cb_ctx ctx; - SV* ctxP; - SV* text; - - STRLEN n_a; /* this is required for older perls < 5.8.8 */ - I32 hop; - - ENTER; - SAVETMPS; - - /* call the Perl tokenizer, and pass it our token callback */ - PUSHMARK(SP); - - ctx.Ctx = pCtx; - ctx.xToken = xToken; - ctxP = newSVpvn((const char *const)&ctx, sizeof(ctx)); - - text = newSVpvn(pText, nText); - - // We pass four arguments - //EXTEND(SP, 2); - XPUSHs(sv_2mortal(ctxP)); - XPUSHs(sv_2mortal(text)); - XPUSHs(sv_2mortal(newSViv(flags))); - // We need to properly wrap this so it is callable from Perl... - // ... without needing actual local storage or a global variable... 
- - // XXX Wrap the "found token" callback, and pass it to the user - // Then, restructure the data if it is UTF-8 - // First, do all of this in Perl so it is easier to debug - - ///* if we get back an empty list, there is no more token */ - //if (n_retval == 0) { - // result = SQLITE_DONE; - //} - ///* otherwise, get token details from the return list */ - //else { - // if (n_retval != 5) { - // warn("tokenizer cursor returned %d arguments", n_retval); - // } - // *piPosition = POPi; - // *piEndOffset = POPi; - // *piStartOffset = POPi; - // *pnBytes = POPi; - // token = POPpx; - // - // if (c->pInput) { /* if working with utf8 data */ - // - // /* recompute *pnBytes in bytes, not in chars */ - // *pnBytes = strlen(token); - // - // /* recompute start/end offsets in bytes, not in chars */ - // hop = *piStartOffset - c->lastCharOffset; - // byteOffset = (char*)utf8_hop((U8*)c->lastByteOffset, hop); - // hop = *piEndOffset - *piStartOffset; - // *piStartOffset = byteOffset - c->pInput; - // byteOffset = (char*)utf8_hop((U8*)byteOffset, hop); - // *piEndOffset = byteOffset - c->pInput; - // - // /* remember where we are for next round */ - // c->lastCharOffset = *piEndOffset, - // c->lastByteOffset = byteOffset; - // } - // - // /* make sure we have enough storage for copying the token */ - // if (*pnBytes > c->nTokenAllocated ){ - // char *pNew; - // c->nTokenAllocated = *pnBytes + 20; - // pNew = sqlite3_realloc(c->pToken, c->nTokenAllocated); - // if( !pNew ) return SQLITE_NOMEM; - // c->pToken = pNew; - // } - // - // /* need to copy the token into the C cursor before perl frees that - // memory */ - // memcpy(c->pToken, token, *pnBytes); - // *ppToken = c->pToken; - // - // result = SQLITE_OK; - // - PUTBACK; - call_sv(c->coderef, G_VOID); - - SPAGAIN; - - PUTBACK; - FREETMPS; - LEAVE; - - return SQLITE_OK; -} - -int perl_fts5_xToken(pTHX_ - SV* pCtx, - int tflags, /* Mask of FTS5_TOKEN_* flags */ - SV* svToken, /* Pointer to buffer containing token */ - int 
iStart, /* Byte offset of token within input text */ - int iEnd /* Byte offset of end of token within input text */ -) { - STRLEN nToken; - const char* chrToken = SvPV(svToken, nToken); - perl_cb_ctx * p = (perl_cb_ctx *)SvPV_nolen( pCtx ); - return p->xToken(p->Ctx,tflags,chrToken,nToken,iStart,iEnd); -} - - -/* -** The set of routines that implement the perl FTS5 tokenizer -*/ -fts5_tokenizer perl_fts5_tokenizer_Module = { - perl_fts5_tokenizer_Create, - perl_fts5_tokenizer_Delete, - perl_fts5_tokenizer_Tokenize -}; - -/* -** Fetch the FTS5 API pointers -*/ - -fts5_api* sqlite_fetch_fts5_api(pTHX_ SV *dbh) -{ - D_imp_dbh(dbh); - - int rc; - sqlite3_stmt *pStmt; - const char zSql[] = "SELECT fts5(?)"; - fts5_api *pFts5Api = 0; - - if (!DBIc_ACTIVE(imp_dbh)) { - sqlite_error(dbh, -2, "attempt to register fts5 tokenizer on inactive database handle"); - return FALSE; - } - - rc = sqlite3_prepare_v2(imp_dbh->db, zSql, -1, &pStmt, 0); - if( rc!=SQLITE_OK ){ - return 0; - } - - sqlite3_bind_pointer(pStmt, 1, (void*)&pFts5Api, "fts5_api_ptr", NULL); - sqlite3_step(pStmt); - sqlite3_finalize(pStmt); - - return pFts5Api; -} - -/* -** Register the perl tokenizer with FTS5 -*/ -int sqlite_db_register_fts5_perl_tokenizer(pTHX_ SV *dbh) -{ - D_imp_dbh(dbh); - - int rc; - fts5_api *pFts5Api = sqlite_fetch_fts5_api(aTHX_ dbh); - fts5_tokenizer *p = &perl_fts5_tokenizer_Module; - - rc = pFts5Api->xCreateTokenizer(pFts5Api, "perl", 0, p, 0); - - return rc; -} diff --git a/dbdimp_fts5_tokenizer.inc b/dbdimp_fts5_tokenizer.inc new file mode 100644 index 0000000..522c7e7 --- /dev/null +++ b/dbdimp_fts5_tokenizer.inc @@ -0,0 +1,253 @@ +typedef struct perl_Fts5Tokenizer { + /* Fts5Tokenizer base; */ /* this is an empty struct, so we omit it entirely */ + SV *coderef; /* the perl tokenizer is a coderef that takes + ** a string and and some parameters and + ** in turn calls the xToken() function + ** passed to it + */ +} perl_Fts5Tokenizer; + +/* +** Create a new tokenizer instance. 
+** Will be called whenever a FTS5 table is created with +** CREATE .. USING fts5( ... , tokenize=perl qualified::function::name) +** where qualified::function::name is a fully qualified perl function +*/ +static int perl_fts5_tokenizer_Create( + void* pCtx, const char **azArg, int nArg, Fts5Tokenizer **ppOut +){ + dTHX; + dSP; + int n_retval; + SV *retval; + perl_Fts5Tokenizer *t; + if (!nArg) { + return SQLITE_ERROR; + } + + t = (perl_Fts5Tokenizer *) sqlite3_malloc(sizeof(*t)); + if( t==NULL ) return SQLITE_NOMEM; + memset(t, 0, sizeof(*t)); + ENTER; + SAVETMPS; + + /* call the qualified::function::name */ + PUSHMARK(SP); + PUTBACK; + n_retval = call_pv(azArg[0], G_SCALAR); + SPAGAIN; + + /* store a copy of the returned coderef into the tokenizer structure */ + if (n_retval != 1) { + warn("tokenizer_Create returned %d arguments, expected a single coderef", n_retval); + } + retval = POPs; + t->coderef = newSVsv(retval); + /* *ppOut = &t->base; */ /* Fts5Tokenizer is empty and gcc complains about that */ + *ppOut = (Fts5Tokenizer *) t; + + PUTBACK; + FREETMPS; + LEAVE; + + return SQLITE_OK; +} + +/* +** Destroy a tokenizer +*/ +static void perl_fts5_tokenizer_Delete(Fts5Tokenizer *pTokenizer){ + dTHX; + perl_Fts5Tokenizer *t = (perl_Fts5Tokenizer *) pTokenizer; + sv_free(t->coderef); + sqlite3_free(t); + return; +} + +/* +** This does a tokenizing run over the string. 
Found tokens (and synonyms) +** are stored by calling xToken() +*/ +static int perl_fts5_tokenizer_Tokenize( + Fts5Tokenizer* tokenizer, + void *pCtx, + int flags, /* Mask of FTS5_TOKENIZE_* flags */ + const char *pText, int nText, + int (*xToken)( + void *pCtx, /* Copy of 2nd argument to xTokenize() */ + int tflags, /* Mask of FTS5_TOKEN_* flags */ + const char *pToken, /* Pointer to buffer containing token */ + int nToken, /* Size of token in bytes */ + int iStart, /* Byte offset of token within input text */ + int iEnd /* Byte offset of end of token within input text */ + ) +){ + perl_Fts5Tokenizer *c = (perl_Fts5Tokenizer *) tokenizer; + char *token; + char *byteOffset; + dTHX; + dSP; + + /* newSVpvn() will create a copy of this buffer, but ideally we would + * directly write into the PV part of that copied buffer instead + */ + perl_cb_ctx ctx; + SV* ctxP; + SV* text; + + STRLEN n_a; /* this is required for older perls < 5.8.8 */ + I32 hop; + + ENTER; + SAVETMPS; + + /* call the Perl tokenizer, and pass it our token callback */ + PUSHMARK(SP); + + ctx.Ctx = pCtx; + ctx.xToken = xToken; + ctxP = newSVpvn((const char *const)&ctx, sizeof(ctx)); + + text = newSVpvn(pText, nText); + + // We pass four arguments + //EXTEND(SP, 2); + XPUSHs(sv_2mortal(ctxP)); + XPUSHs(sv_2mortal(text)); + XPUSHs(sv_2mortal(newSViv(flags))); + // We need to properly wrap this so it is callable from Perl... + // ... without needing actual local storage or a global variable... 
+ + // XXX Wrap the "found token" callback, and pass it to the user + // Then, restructure the data if it is UTF-8 + // First, do all of this in Perl so it is easier to debug + + ///* if we get back an empty list, there is no more token */ + //if (n_retval == 0) { + // result = SQLITE_DONE; + //} + ///* otherwise, get token details from the return list */ + //else { + // if (n_retval != 5) { + // warn("tokenizer cursor returned %d arguments", n_retval); + // } + // *piPosition = POPi; + // *piEndOffset = POPi; + // *piStartOffset = POPi; + // *pnBytes = POPi; + // token = POPpx; + // + // if (c->pInput) { /* if working with utf8 data */ + // + // /* recompute *pnBytes in bytes, not in chars */ + // *pnBytes = strlen(token); + // + // /* recompute start/end offsets in bytes, not in chars */ + // hop = *piStartOffset - c->lastCharOffset; + // byteOffset = (char*)utf8_hop((U8*)c->lastByteOffset, hop); + // hop = *piEndOffset - *piStartOffset; + // *piStartOffset = byteOffset - c->pInput; + // byteOffset = (char*)utf8_hop((U8*)byteOffset, hop); + // *piEndOffset = byteOffset - c->pInput; + // + // /* remember where we are for next round */ + // c->lastCharOffset = *piEndOffset, + // c->lastByteOffset = byteOffset; + // } + // + // /* make sure we have enough storage for copying the token */ + // if (*pnBytes > c->nTokenAllocated ){ + // char *pNew; + // c->nTokenAllocated = *pnBytes + 20; + // pNew = sqlite3_realloc(c->pToken, c->nTokenAllocated); + // if( !pNew ) return SQLITE_NOMEM; + // c->pToken = pNew; + // } + // + // /* need to copy the token into the C cursor before perl frees that + // memory */ + // memcpy(c->pToken, token, *pnBytes); + // *ppToken = c->pToken; + // + // result = SQLITE_OK; + // + PUTBACK; + call_sv(c->coderef, G_VOID); + + SPAGAIN; + + PUTBACK; + FREETMPS; + LEAVE; + + return SQLITE_OK; +} + +int perl_fts5_xToken(pTHX_ + SV* pCtx, + int tflags, /* Mask of FTS5_TOKEN_* flags */ + SV* svToken, /* Pointer to buffer containing token */ + int 
iStart, /* Byte offset of token within input text */ + int iEnd /* Byte offset of end of token within input text */ +) { + STRLEN nToken; + const char* chrToken = SvPV(svToken, nToken); + perl_cb_ctx * p = (perl_cb_ctx *)SvPV_nolen( pCtx ); + return p->xToken(p->Ctx,tflags,chrToken,nToken,iStart,iEnd); +} + + +/* +** The set of routines that implement the perl FTS5 tokenizer +*/ +fts5_tokenizer perl_fts5_tokenizer_Module = { + perl_fts5_tokenizer_Create, + perl_fts5_tokenizer_Delete, + perl_fts5_tokenizer_Tokenize +}; + +/* +** Fetch the FTS5 API pointers +*/ + +fts5_api* sqlite_fetch_fts5_api(pTHX_ SV *dbh) +{ + D_imp_dbh(dbh); + + int rc; + sqlite3_stmt *pStmt; + const char zSql[] = "SELECT fts5(?)"; + fts5_api *pFts5Api = 0; + + if (!DBIc_ACTIVE(imp_dbh)) { + sqlite_error(dbh, -2, "attempt to register fts5 tokenizer on inactive database handle"); + return FALSE; + } + + rc = sqlite3_prepare_v2(imp_dbh->db, zSql, -1, &pStmt, 0); + if( rc!=SQLITE_OK ){ + return 0; + } + + sqlite3_bind_pointer(pStmt, 1, (void*)&pFts5Api, "fts5_api_ptr", NULL); + sqlite3_step(pStmt); + sqlite3_finalize(pStmt); + + return pFts5Api; +} + +/* +** Register the perl tokenizer with FTS5 +*/ +int sqlite_db_register_fts5_perl_tokenizer(pTHX_ SV *dbh) +{ + D_imp_dbh(dbh); + + int rc; + fts5_api *pFts5Api = sqlite_fetch_fts5_api(aTHX_ dbh); + fts5_tokenizer *p = &perl_fts5_tokenizer_Module; + + rc = pFts5Api->xCreateTokenizer(pFts5Api, "perl", 0, p, 0); + + return rc; +} From 5dbb6deac04c6ac76e215fdf7e783936fedfa695 Mon Sep 17 00:00:00 2001 From: Max Maischein Date: Sun, 4 Sep 2022 08:24:14 +0200 Subject: [PATCH 12/14] Convert test file to UTF8 source code --- t/67_fts5.t | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/t/67_fts5.t b/t/67_fts5.t index 2919c82..e43b00b 100644 --- a/t/67_fts5.t +++ b/t/67_fts5.t @@ -6,22 +6,25 @@ use SQLiteTest; use Test::More; #use if -d ".git", "Test::FailWarnings"; use DBD::SQLite; +use utf8; # our source code is UTF-8 encoded 
-my @texts = ("il était une bergère", +my @texts = ("il était une bergère", "qui gardait ses moutons", "elle fit un fromage", - "du lait de ses moutons"); + "du lait de ses moutons", + "anrechenbare quellensteuer hier"); my @tests = ( # query => expected results - ["bergère" => 0 ], + ["bergère" => 0 ], ["berg*" => 0 ], ["foobar" ], ["moutons" => 1, 3 ], ['"qui gardait"' => 1 ], ["moutons NOT lait" => 1 ], - ["il était" => 0 ], + ["il était" => 0 ], ["(il OR elle) AND un*" => 0, 2 ], + ["anrechenbare" => 4 ], ); BEGIN { From 4c6e49ec58a72f0b71215883d9c8fe152db0274e Mon Sep 17 00:00:00 2001 From: Max Maischein Date: Sun, 11 Sep 2022 19:37:36 +0200 Subject: [PATCH 13/14] Also distribute fts5.h --- MANIFEST | 1 + 1 file changed, 1 insertion(+) diff --git a/MANIFEST b/MANIFEST index 6429727..0b3620d 100644 --- a/MANIFEST +++ b/MANIFEST @@ -7,6 +7,7 @@ dbdimp_fts3_tokenizer.inc dbdimp_fts5_tokenizer.inc dbdimp_virtual_table.inc fts3_tokenizer.h +fts5.h inc/Test/FailWarnings.pm lib/DBD/SQLite.pm lib/DBD/SQLite/Constants.pm From 79d98dc1d3ac621ce9209463a4b79db461e6d978 Mon Sep 17 00:00:00 2001 From: Max Maischein Date: Sun, 29 Oct 2023 16:58:43 +0100 Subject: [PATCH 14/14] Strip whitespace --- fts5.h | 107 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 56 insertions(+), 51 deletions(-) diff --git a/fts5.h b/fts5.h index 081e534..9bf1479 100644 --- a/fts5.h +++ b/fts5.h @@ -10,7 +10,7 @@ ** ****************************************************************************** ** -** Interfaces to extend FTS5. Using the interfaces defined in this file, +** Interfaces to extend FTS5. Using the interfaces defined in this file, ** FTS5 may be extended with: ** ** * custom tokenizers, and @@ -38,6 +38,11 @@ typedef struct Fts5ExtensionApi Fts5ExtensionApi; typedef struct Fts5Context Fts5Context; typedef struct Fts5PhraseIter Fts5PhraseIter; +/* + * Wrap fts5_xToken in a callback that takes an array of arrayrefs (?) + * ... 
instead of the user calling fts5_xToken themselves + * / + typedef void (*fts5_extension_function)( const Fts5ExtensionApi *pApi, /* API offered by current FTS version */ Fts5Context *pFts, /* First arg to pass to pApi functions */ @@ -55,19 +60,19 @@ struct Fts5PhraseIter { ** EXTENSION API FUNCTIONS ** ** xUserData(pFts): -** Return a copy of the context pointer the extension function was +** Return a copy of the context pointer the extension function was ** registered with. ** ** xColumnTotalSize(pFts, iCol, pnToken): ** If parameter iCol is less than zero, set output variable *pnToken ** to the total number of tokens in the FTS5 table. Or, if iCol is ** non-negative but less than the number of columns in the table, return -** the total number of tokens in column iCol, considering all rows in +** the total number of tokens in column iCol, considering all rows in ** the FTS5 table. ** ** If parameter iCol is greater than or equal to the number of columns ** in the table, SQLITE_RANGE is returned. Or, if an error occurs (e.g. -** an OOM condition or IO error), an appropriate SQLite error code is +** an OOM condition or IO error), an appropriate SQLite error code is ** returned. ** ** xColumnCount(pFts): @@ -81,7 +86,7 @@ struct Fts5PhraseIter { ** ** If parameter iCol is greater than or equal to the number of columns ** in the table, SQLITE_RANGE is returned. Or, if an error occurs (e.g. -** an OOM condition or IO error), an appropriate SQLite error code is +** an OOM condition or IO error), an appropriate SQLite error code is ** returned. ** ** This function may be quite inefficient if used with an FTS5 table @@ -108,8 +113,8 @@ struct Fts5PhraseIter { ** an error code (i.e. SQLITE_NOMEM) if an error occurs. ** ** This API can be quite slow if used with an FTS5 table created with the -** "detail=none" or "detail=column" option. 
If the FTS5 table is created -** with either "detail=none" or "detail=column" and "content=" option +** "detail=none" or "detail=column" option. If the FTS5 table is created +** with either "detail=none" or "detail=column" and "content=" option ** (i.e. if it is a contentless table), then this API always returns 0. ** ** xInst: @@ -124,7 +129,7 @@ struct Fts5PhraseIter { ** code (i.e. SQLITE_NOMEM) if an error occurs. ** ** This API can be quite slow if used with an FTS5 table created with the -** "detail=none" or "detail=column" option. +** "detail=none" or "detail=column" option. ** ** xRowid: ** Returns the rowid of the current row. @@ -140,11 +145,11 @@ struct Fts5PhraseIter { ** ** with $p set to a phrase equivalent to the phrase iPhrase of the ** current query is executed. Any column filter that applies to -** phrase iPhrase of the current query is included in $p. For each -** row visited, the callback function passed as the fourth argument -** is invoked. The context and API objects passed to the callback +** phrase iPhrase of the current query is included in $p. For each +** row visited, the callback function passed as the fourth argument +** is invoked. The context and API objects passed to the callback ** function may be used to access the properties of each matched row. -** Invoking Api.xUserData() returns a copy of the pointer passed as +** Invoking Api.xUserData() returns a copy of the pointer passed as ** the third argument to pUserData. ** ** If the callback function returns any value other than SQLITE_OK, the @@ -159,14 +164,14 @@ struct Fts5PhraseIter { ** ** xSetAuxdata(pFts5, pAux, xDelete) ** -** Save the pointer passed as the second argument as the extension function's +** Save the pointer passed as the second argument as the extension function's ** "auxiliary data". 
The pointer may then be retrieved by the current or any ** future invocation of the same fts5 extension function made as part of ** the same MATCH query using the xGetAuxdata() API. ** ** Each extension function is allocated a single auxiliary data slot for -** each FTS query (MATCH expression). If the extension function is invoked -** more than once for a single FTS query, then all invocations share a +** each FTS query (MATCH expression). If the extension function is invoked +** more than once for a single FTS query, then all invocations share a ** single auxiliary data context. ** ** If there is already an auxiliary data pointer when this function is @@ -185,7 +190,7 @@ struct Fts5PhraseIter { ** ** xGetAuxdata(pFts5, bClear) ** -** Returns the current auxiliary data pointer for the fts5 extension +** Returns the current auxiliary data pointer for the fts5 extension ** function. See the xSetAuxdata() method for details. ** ** If the bClear argument is non-zero, then the auxiliary data is cleared @@ -205,7 +210,7 @@ struct Fts5PhraseIter { ** method, to iterate through all instances of a single query phrase within ** the current row. This is the same information as is accessible via the ** xInstCount/xInst APIs. While the xInstCount/xInst APIs are more convenient -** to use, this API may be faster under some circumstances. To iterate +** to use, this API may be faster under some circumstances. To iterate ** through instances of phrase iPhrase, use the following code: ** ** Fts5PhraseIter iter; @@ -223,8 +228,8 @@ struct Fts5PhraseIter { ** xPhraseFirstColumn() and xPhraseNextColumn() as illustrated below). ** ** This API can be quite slow if used with an FTS5 table created with the -** "detail=none" or "detail=column" option. If the FTS5 table is created -** with either "detail=none" or "detail=column" and "content=" option +** "detail=none" or "detail=column" option. 
If the FTS5 table is created +** with either "detail=none" or "detail=column" and "content=" option ** (i.e. if it is a contentless table), then this API always iterates ** through an empty set (all calls to xPhraseFirst() set iCol to -1). ** @@ -248,16 +253,16 @@ struct Fts5PhraseIter { ** } ** ** This API can be quite slow if used with an FTS5 table created with the -** "detail=none" option. If the FTS5 table is created with either -** "detail=none" "content=" option (i.e. if it is a contentless table), -** then this API always iterates through an empty set (all calls to +** "detail=none" option. If the FTS5 table is created with either +** "detail=none" "content=" option (i.e. if it is a contentless table), +** then this API always iterates through an empty set (all calls to ** xPhraseFirstColumn() set iCol to -1). ** ** The information accessed using this API and its companion ** xPhraseFirstColumn() may also be obtained using xPhraseFirst/xPhraseNext ** (or xInst/xInstCount). The chief advantage of this API is that it is ** significantly more efficient than those alternatives when used with -** "detail=column" tables. +** "detail=column" tables. ** ** xPhraseNextColumn() ** See xPhraseFirstColumn above. 
@@ -271,7 +276,7 @@ struct Fts5ExtensionApi { int (*xRowCount)(Fts5Context*, sqlite3_int64 *pnRow); int (*xColumnTotalSize)(Fts5Context*, int iCol, sqlite3_int64 *pnToken); - int (*xTokenize)(Fts5Context*, + int (*xTokenize)(Fts5Context*, const char *pText, int nText, /* Text to tokenize */ void *pCtx, /* Context passed to xToken() */ int (*xToken)(void*, int, const char*, int, int, int) /* Callback */ @@ -300,15 +305,15 @@ struct Fts5ExtensionApi { void (*xPhraseNextColumn)(Fts5Context*, Fts5PhraseIter*, int *piCol); }; -/* +/* ** CUSTOM AUXILIARY FUNCTIONS *************************************************************************/ /************************************************************************* ** CUSTOM TOKENIZERS ** -** Applications may also register custom tokenizer types. A tokenizer -** is registered by providing fts5 with a populated instance of the +** Applications may also register custom tokenizer types. A tokenizer +** is registered by providing fts5 with a populated instance of the ** following structure. All structure methods must be defined, setting ** any member of the fts5_tokenizer struct to NULL leads to undefined ** behaviour. The structure methods are expected to function as follows: @@ -319,16 +324,16 @@ struct Fts5ExtensionApi { ** ** The first argument passed to this function is a copy of the (void*) ** pointer provided by the application when the fts5_tokenizer object -** was registered with FTS5 (the third argument to xCreateTokenizer()). +** was registered with FTS5 (the third argument to xCreateTokenizer()). ** The second and third arguments are an array of nul-terminated strings ** containing the tokenizer arguments, if any, specified following the ** tokenizer name as part of the CREATE VIRTUAL TABLE statement used ** to create the FTS5 table. ** -** The final argument is an output variable. If successful, (*ppOut) +** The final argument is an output variable. 
If successful, (*ppOut) ** should be set to point to the new tokenizer handle and SQLITE_OK ** returned. If an error occurs, some value other than SQLITE_OK should -** be returned. In this case, fts5 assumes that the final value of *ppOut +** be returned. In this case, fts5 assumes that the final value of *ppOut ** is undefined. ** ** xDelete: @@ -337,7 +342,7 @@ struct Fts5ExtensionApi { ** be invoked exactly once for each successful call to xCreate(). ** ** xTokenize: -** This function is expected to tokenize the nText byte string indicated +** This function is expected to tokenize the nText byte string indicated ** by argument pText. pText may or may not be nul-terminated. The first ** argument passed to this function is a pointer to an Fts5Tokenizer object ** returned by an earlier call to xCreate(). @@ -351,8 +356,8 @@ struct Fts5ExtensionApi { ** determine the set of tokens to add to (or delete from) the ** FTS index. ** -**
  • FTS5_TOKENIZE_QUERY - A MATCH query is being executed -** against the FTS index. The tokenizer is being called to tokenize +**
  • FTS5_TOKENIZE_QUERY - A MATCH query is being executed +** against the FTS index. The tokenizer is being called to tokenize ** a bareword or quoted string specified as part of the query. ** **
  • (FTS5_TOKENIZE_QUERY | FTS5_TOKENIZE_PREFIX) - Same as @@ -360,10 +365,10 @@ struct Fts5ExtensionApi { ** followed by a "*" character, indicating that the last token ** returned by the tokenizer will be treated as a token prefix. ** -**
  • FTS5_TOKENIZE_AUX - The tokenizer is being invoked to +**
  • FTS5_TOKENIZE_AUX - The tokenizer is being invoked to ** satisfy an fts5_api.xTokenize() request made by an auxiliary ** function. Or an fts5_api.xColumnSize() request made by the same -** on a columnsize=0 database. +** on a columnsize=0 database. ** ** ** For each token in the input string, the supplied callback xToken() must @@ -375,10 +380,10 @@ struct Fts5ExtensionApi { ** which the token is derived within the input. ** ** The second argument passed to the xToken() callback ("tflags") should -** normally be set to 0. The exception is if the tokenizer supports +** normally be set to 0. The exception is if the tokenizer supports ** synonyms. In this case see the discussion below for details. ** -** FTS5 assumes the xToken() callback is invoked for each token in the +** FTS5 assumes the xToken() callback is invoked for each token in the ** order that they occur within the input text. ** ** If an xToken() callback returns any value other than SQLITE_OK, then @@ -392,7 +397,7 @@ struct Fts5ExtensionApi { ** SYNONYM SUPPORT ** ** Custom tokenizers may also support synonyms. Consider a case in which a -** user wishes to query for a phrase such as "first place". Using the +** user wishes to query for a phrase such as "first place". Using the ** built-in tokenizers, the FTS5 query 'first + place' will match instances ** of "first place" within the document set, but not alternative forms ** such as "1st place". In some applications, it would be better to match @@ -412,34 +417,34 @@ struct Fts5ExtensionApi { ** **
  • By querying the index for all synonyms of each query term ** separately. In this case, when tokenizing query text, the -** tokenizer may provide multiple synonyms for a single term -** within the document. FTS5 then queries the index for each +** tokenizer may provide multiple synonyms for a single term +** within the document. FTS5 then queries the index for each ** synonym individually. For example, faced with the query: ** ** ** ... MATCH 'first place' ** ** the tokenizer offers both "1st" and "first" as synonyms for the -** first token in the MATCH query and FTS5 effectively runs a query +** first token in the MATCH query and FTS5 effectively runs a query ** similar to: ** ** ** ... MATCH '(first OR 1st) place' ** ** except that, for the purposes of auxiliary functions, the query -** still appears to contain just two phrases - "(first OR 1st)" +** still appears to contain just two phrases - "(first OR 1st)" ** being treated as a single phrase. ** **
  • By adding multiple synonyms for a single term to the FTS index. ** Using this method, when tokenizing document text, the tokenizer -** provides multiple synonyms for each token. So that when a +** provides multiple synonyms for each token. So that when a ** document such as "I won first place" is tokenized, entries are ** added to the FTS index for "i", "won", "first", "1st" and ** "place". ** ** This way, even if the tokenizer does not provide synonyms ** when tokenizing query text (it should not - to do so would be -** inefficient), it doesn't matter if the user queries for +** inefficient), it doesn't matter if the user queries for ** 'first + place' or '1st + place', as there are entries in the ** FTS index corresponding to both forms of the first token. ** @@ -460,11 +465,11 @@ struct Fts5ExtensionApi { ** ** It is an error to specify the FTS5_TOKEN_COLOCATED flag the first time ** xToken() is called. Multiple synonyms may be specified for a single token -** by making multiple calls to xToken(FTS5_TOKEN_COLOCATED) in sequence. +** by making multiple calls to xToken(FTS5_TOKEN_COLOCATED) in sequence. ** There is no limit to the number of synonyms that may be provided for a ** single token. ** -** In many cases, method (1) above is the best approach. It does not add +** In many cases, method (1) above is the best approach. It does not add ** extra data to the FTS index or require FTS5 to query for multiple terms, ** so it is efficient in terms of disk space and query speed. However, it ** does not support prefix queries very well. If, as suggested above, the @@ -476,18 +481,18 @@ struct Fts5ExtensionApi { ** will not match documents that contain the token "1st" (as the tokenizer ** will probably not map "1s" to any prefix of "first"). ** -** For full prefix support, method (3) may be preferred. In this case, +** For full prefix support, method (3) may be preferred. 
In this case, ** because the index contains entries for both "first" and "1st", prefix ** queries such as 'fi*' or '1s*' will match correctly. However, because ** extra entries are added to the FTS index, this method uses more space ** within the database. ** ** Method (2) offers a midpoint between (1) and (3). Using this method, -** a query such as '1s*' will match documents that contain the literal +** a query such as '1s*' will match documents that contain the literal ** token "1st", but not "first" (assuming the tokenizer is not able to ** provide synonyms for prefixes). However, a non-prefix query like '1st' ** will match against "1st" and "first". This method does not require -** extra disk space, as no extra entries are added to the FTS index. +** extra disk space, as no extra entries are added to the FTS index. ** On the other hand, it may require more CPU cycles to run MATCH queries, ** as separate queries of the FTS index are required for each synonym. ** @@ -501,10 +506,10 @@ typedef struct fts5_tokenizer fts5_tokenizer; struct fts5_tokenizer { int (*xCreate)(void*, const char **azArg, int nArg, Fts5Tokenizer **ppOut); void (*xDelete)(Fts5Tokenizer*); - int (*xTokenize)(Fts5Tokenizer*, + int (*xTokenize)(Fts5Tokenizer*, void *pCtx, int flags, /* Mask of FTS5_TOKENIZE_* flags */ - const char *pText, int nText, + const char *pText, int nText, int (*xToken)( void *pCtx, /* Copy of 2nd argument to xTokenize() */ int tflags, /* Mask of FTS5_TOKEN_* flags */