diff --git a/MANIFEST b/MANIFEST index dd95132..0b3620d 100644 --- a/MANIFEST +++ b/MANIFEST @@ -3,9 +3,11 @@ Changes constants.inc dbdimp.c dbdimp.h -dbdimp_tokenizer.inc +dbdimp_fts3_tokenizer.inc +dbdimp_fts5_tokenizer.inc dbdimp_virtual_table.inc fts3_tokenizer.h +fts5.h inc/Test/FailWarnings.pm lib/DBD/SQLite.pm lib/DBD/SQLite/Constants.pm diff --git a/Makefile.PL b/Makefile.PL index b0043c0..162e14d 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -402,10 +402,10 @@ WriteMakefile( ), OBJECT => ( $sqlite_local ? '$(O_FILES)' : 'SQLite.o dbdimp.o' ), depend => { - 'dbdimp.o' => 'dbdimp_tokenizer.inc dbdimp_virtual_table.inc', + 'dbdimp.o' => 'dbdimp_fts3_tokenizer.inc dbdimp_fts5_tokenizer.inc dbdimp_virtual_table.inc', }, clean => { FILES => 'SQLite.xsi config.h tv.log *.old', diff --git a/SQLite.xs b/SQLite.xs index 0a3a052..f1676a1 100644 --- a/SQLite.xs +++ b/SQLite.xs @@ -314,6 +314,31 @@ register_fts3_perl_tokenizer(dbh) OUTPUT: RETVAL +static int +register_fts5_perl_tokenizer(dbh) + SV *dbh + ALIAS: + DBD::SQLite::db::sqlite_register_fts5_perl_tokenizer = 1 + CODE: + RETVAL = sqlite_db_register_fts5_perl_tokenizer(aTHX_ dbh); + OUTPUT: + RETVAL + +static int +fts5_xToken(pCtx,tflags,svToken,iStart,iEnd) + SV *pCtx + int tflags + SV *svToken + STRLEN iStart + STRLEN iEnd + ALIAS: + DBD::SQLite::db::fts5_xToken = 1 + CODE: + dTHX; + RETVAL = perl_fts5_xToken(aTHX_ pCtx,tflags,svToken,iStart,iEnd); + OUTPUT: + RETVAL + HV* db_status(dbh, reset = 0) SV* dbh diff --git a/SQLiteXS.h b/SQLiteXS.h index 584fb61..53c7588 100644 --- a/SQLiteXS.h +++ b/SQLiteXS.h @@ -20,5 +20,6 @@ #include "sqlite3.h" #include "fts3_tokenizer.h" +#include "fts5.h" #endif diff --git a/constants.inc b/constants.inc index ed2e753..c19a284 100644 --- a/constants.inc +++ b/constants.inc @@ -1293,6 +1293,7 @@ _const_flags_for_file_open_operations() SQLITE_OPEN_READONLY = SQLITE_OPEN_READONLY SQLITE_OPEN_READWRITE = SQLITE_OPEN_READWRITE 
SQLITE_OPEN_CREATE = SQLITE_OPEN_CREATE + SQLITE_OPEN_SUPER_JOURNAL = SQLITE_OPEN_SUPER_JOURNAL SQLITE_OPEN_NOMUTEX = SQLITE_OPEN_NOMUTEX CODE: RETVAL = ix; @@ -1471,6 +1472,19 @@ _const_flags_for_file_open_operations_3037000_zero() #if SQLITE_VERSION_NUMBER >= 3008003 +IV +_const_fts5_tokenizer() + ALIAS: + FTS5_TOKENIZE_QUERY = FTS5_TOKENIZE_QUERY + FTS5_TOKENIZE_PREFIX = FTS5_TOKENIZE_PREFIX + FTS5_TOKENIZE_DOCUMENT = FTS5_TOKENIZE_DOCUMENT + FTS5_TOKENIZE_AUX = FTS5_TOKENIZE_AUX + FTS5_TOKEN_COLOCATED = FTS5_TOKEN_COLOCATED + CODE: + RETVAL = ix; + OUTPUT: + RETVAL + IV _const_function_flags_3008003() ALIAS: @@ -1820,6 +1834,7 @@ _const__flags_for_file_open_operations() OPEN_READONLY = SQLITE_OPEN_READONLY OPEN_READWRITE = SQLITE_OPEN_READWRITE OPEN_CREATE = SQLITE_OPEN_CREATE + OPEN_SUPER_JOURNAL = SQLITE_OPEN_SUPER_JOURNAL OPEN_NOMUTEX = SQLITE_OPEN_NOMUTEX CODE: RETVAL = ix; diff --git a/dbdimp.c b/dbdimp.c index 0028e14..01b5089 100644 --- a/dbdimp.c +++ b/dbdimp.c @@ -2990,7 +2990,8 @@ sqlite_db_txn_state(pTHX_ SV *dbh, SV *schema) #endif } -#include "dbdimp_tokenizer.inc" +#include "dbdimp_fts3_tokenizer.inc" +#include "dbdimp_fts5_tokenizer.inc" #include "dbdimp_virtual_table.inc" /* end */ diff --git a/dbdimp.h b/dbdimp.h index b357e1f..03d5a7f 100644 --- a/dbdimp.h +++ b/dbdimp.h @@ -182,6 +182,8 @@ HV* sqlite_db_table_column_metadata(pTHX_ SV *dbh, SV *dbname, SV *tablename, SV HV* _sqlite_db_status(pTHX_ SV *dbh, int reset); SV* sqlite_db_filename(pTHX_ SV *dbh); int sqlite_db_register_fts3_perl_tokenizer(pTHX_ SV *dbh); +int sqlite_db_register_fts5_perl_tokenizer(pTHX_ SV *dbh); +int perl_fts5_xToken(pTHX_ SV* pCtx, int tflags, SV* svToken, int iStart, int iEnd ); HV* _sqlite_status(int reset); HV* _sqlite_st_status(pTHX_ SV *sth, int reset); int sqlite_db_create_module(pTHX_ SV *dbh, const char *name, const char *perl_class); diff --git a/dbdimp_tokenizer.inc b/dbdimp_fts3_tokenizer.inc similarity index 80% rename from dbdimp_tokenizer.inc rename 
to dbdimp_fts3_tokenizer.inc index 2258f0b..17089f3 100644 --- a/dbdimp_tokenizer.inc +++ b/dbdimp_fts3_tokenizer.inc @@ -1,10 +1,10 @@ -typedef struct perl_tokenizer { +typedef struct perl_fts3_tokenizer { sqlite3_tokenizer base; SV *coderef; /* the perl tokenizer is a coderef that takes a string and returns a cursor coderef */ -} perl_tokenizer; +} perl_fts3_tokenizer; -typedef struct perl_tokenizer_cursor { +typedef struct perl_fts3_tokenizer_cursor { sqlite3_tokenizer_cursor base; SV *coderef; /* ref to the closure that returns terms */ char *pToken; /* storage for a copy of the last token */ @@ -14,7 +14,24 @@ typedef struct perl_tokenizer_cursor { const char *pInput; /* input we are tokenizing */ const char *currentByte; /* pointer into pInput */ int currentChar; /* char position corresponding to currentByte */ -} perl_tokenizer_cursor; +} perl_fts3_tokenizer_cursor; + +/* This is the structure where we store the information between calls + * from Perl and callbacks to SQLite. We could instead pass these values + * as opaque arguments to Perl and back, but this reduces the number of + * opaque values handled by Perl to a single such value. + */ +typedef struct perl_cb_ctx { + void * Ctx; + int (*xToken)( + void *pCtx, /* Copy of 2nd argument to xTokenize() */ + int tflags, /* Mask of FTS5_TOKEN_* flags */ + const char *pToken, /* Pointer to buffer containing token */ + int nToken, /* Size of token in bytes */ + int iStart, /* Byte offset of token within input text */ + int iEnd /* Byte offset of end of token within input text */ + ); +} perl_cb_ctx; /* ** Create a new tokenizer instance. @@ -22,7 +39,7 @@ typedef struct perl_tokenizer_cursor { ** CREATE .. USING fts3( ... 
, tokenize=perl qualified::function::name) ** where qualified::function::name is a fully qualified perl function */ -static int perl_tokenizer_Create( +static int perl_fts3_tokenizer_Create( int argc, const char * const *argv, sqlite3_tokenizer **ppTokenizer ){ @@ -30,13 +47,13 @@ static int perl_tokenizer_Create( dSP; int n_retval; SV *retval; - perl_tokenizer *t; + perl_fts3_tokenizer *t; if (!argc) { return SQLITE_ERROR; } - t = (perl_tokenizer *) sqlite3_malloc(sizeof(*t)); + t = (perl_fts3_tokenizer *) sqlite3_malloc(sizeof(*t)); if( t==NULL ) return SQLITE_NOMEM; memset(t, 0, sizeof(*t)); @@ -67,9 +84,9 @@ static int perl_tokenizer_Create( /* ** Destroy a tokenizer */ -static int perl_tokenizer_Destroy(sqlite3_tokenizer *pTokenizer){ +static int perl_fts3_tokenizer_Destroy(sqlite3_tokenizer *pTokenizer){ dTHX; - perl_tokenizer *t = (perl_tokenizer *) pTokenizer; + perl_fts3_tokenizer *t = (perl_fts3_tokenizer *) pTokenizer; sv_free(t->coderef); sqlite3_free(t); return SQLITE_OK; @@ -82,7 +99,7 @@ static int perl_tokenizer_Destroy(sqlite3_tokenizer *pTokenizer){ ** This is passed to the tokenizer instance, which then returns a ** closure implementing the cursor (so the cursor is again a coderef). 
*/ -static int perl_tokenizer_Open( +static int perl_fts3_tokenizer_Open( sqlite3_tokenizer *pTokenizer, /* Tokenizer object */ const char *pInput, int nBytes, /* Input buffer */ sqlite3_tokenizer_cursor **ppCursor /* OUT: Created tokenizer cursor */ @@ -118,11 +135,11 @@ static int perl_tokenizer_Open( DBD_SQLITE_UTF8_DECODE_IF_NEEDED(perl_string, MY_CXT.last_dbh_string_mode); - perl_tokenizer *t = (perl_tokenizer *)pTokenizer; + perl_fts3_tokenizer *t = (perl_fts3_tokenizer *)pTokenizer; /* allocate and initialize the cursor struct */ - perl_tokenizer_cursor *c; - c = (perl_tokenizer_cursor *) sqlite3_malloc(sizeof(*c)); + perl_fts3_tokenizer_cursor *c; + c = (perl_fts3_tokenizer_cursor *) sqlite3_malloc(sizeof(*c)); memset(c, 0, sizeof(*c)); *ppCursor = &c->base; @@ -158,10 +175,10 @@ static int perl_tokenizer_Open( /* ** Close a tokenization cursor previously opened by a call to -** perl_tokenizer_Open() above. +** perl_fts3_tokenizer_Open() above. */ -static int perl_tokenizer_Close(sqlite3_tokenizer_cursor *pCursor){ - perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor; +static int perl_fts3_tokenizer_Close(sqlite3_tokenizer_cursor *pCursor){ + perl_fts3_tokenizer_cursor *c = (perl_fts3_tokenizer_cursor *) pCursor; dTHX; sv_free(c->coderef); @@ -172,9 +189,9 @@ static int perl_tokenizer_Close(sqlite3_tokenizer_cursor *pCursor){ /* ** Extract the next token from a tokenization cursor. The cursor must -** have been opened by a prior call to perl_tokenizer_Open(). +** have been opened by a prior call to perl_fts3_tokenizer_Open(). */ -static int perl_tokenizer_Next( +static int perl_fts3_tokenizer_Next( sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by perl_tokenizer_Open */ const char **ppToken, /* OUT: Normalized text for token */ int *pnBytes, /* OUT: Number of bytes in normalized text */ @@ -182,7 +199,7 @@ static int perl_tokenizer_Next( int *piEndOffset, /* Ending offset of token. 
IN : char offset; OUT : byte offset */ int *piPosition /* OUT: Number of tokens returned before this one */ ){ - perl_tokenizer_cursor *c = (perl_tokenizer_cursor *) pCursor; + perl_fts3_tokenizer_cursor *c = (perl_fts3_tokenizer_cursor *) pCursor; int result; int n_retval; char *token; @@ -270,13 +287,13 @@ static int perl_tokenizer_Next( /* ** The set of routines that implement the perl tokenizer */ -sqlite3_tokenizer_module perl_tokenizer_Module = { +sqlite3_tokenizer_module perl_fts3_tokenizer_Module = { 0, - perl_tokenizer_Create, - perl_tokenizer_Destroy, - perl_tokenizer_Open, - perl_tokenizer_Close, - perl_tokenizer_Next + perl_fts3_tokenizer_Create, + perl_fts3_tokenizer_Destroy, + perl_fts3_tokenizer_Open, + perl_fts3_tokenizer_Close, + perl_fts3_tokenizer_Next }; /* @@ -289,7 +306,7 @@ int sqlite_db_register_fts3_perl_tokenizer(pTHX_ SV *dbh) int rc; sqlite3_stmt *pStmt; const char zSql[] = "SELECT fts3_tokenizer(?, ?)"; - sqlite3_tokenizer_module *p = &perl_tokenizer_Module; + sqlite3_tokenizer_module *p = &perl_fts3_tokenizer_Module; if (!DBIc_ACTIVE(imp_dbh)) { sqlite_error(dbh, -2, "attempt to register fts3 tokenizer on inactive database handle"); diff --git a/dbdimp_fts5_tokenizer.inc b/dbdimp_fts5_tokenizer.inc new file mode 100644 index 0000000..522c7e7 --- /dev/null +++ b/dbdimp_fts5_tokenizer.inc @@ -0,0 +1,253 @@ +typedef struct perl_Fts5Tokenizer { + /* Fts5Tokenizer base; */ /* this is an empty struct, so we omit it entirely */ + SV *coderef; /* the perl tokenizer is a coderef that takes + ** a string and some parameters and + ** in turn calls the xToken() function + ** passed to it + */ +} perl_Fts5Tokenizer; + +/* +** Create a new tokenizer instance. +** Will be called whenever an FTS5 table is created with +** CREATE .. USING fts5( ... 
, tokenize=perl qualified::function::name) +** where qualified::function::name is a fully qualified perl function +*/ +static int perl_fts5_tokenizer_Create( + void* pCtx, const char **azArg, int nArg, Fts5Tokenizer **ppOut +){ + dTHX; + dSP; + int n_retval; + SV *retval; + perl_Fts5Tokenizer *t; + if (!nArg) { + return SQLITE_ERROR; + } + + t = (perl_Fts5Tokenizer *) sqlite3_malloc(sizeof(*t)); + if( t==NULL ) return SQLITE_NOMEM; + memset(t, 0, sizeof(*t)); + ENTER; + SAVETMPS; + + /* call the qualified::function::name */ + PUSHMARK(SP); + PUTBACK; + n_retval = call_pv(azArg[0], G_SCALAR); + SPAGAIN; + + /* store a copy of the returned coderef into the tokenizer structure */ + if (n_retval != 1) { + warn("tokenizer_Create returned %d arguments, expected a single coderef", n_retval); + } + retval = POPs; + t->coderef = newSVsv(retval); + /* *ppOut = &t->base; */ /* Fts5Tokenizer is empty and gcc complains about that */ + *ppOut = (Fts5Tokenizer *) t; + + PUTBACK; + FREETMPS; + LEAVE; + + return SQLITE_OK; +} + +/* +** Destroy a tokenizer +*/ +static void perl_fts5_tokenizer_Delete(Fts5Tokenizer *pTokenizer){ + dTHX; + perl_Fts5Tokenizer *t = (perl_Fts5Tokenizer *) pTokenizer; + sv_free(t->coderef); + sqlite3_free(t); + return; +} + +/* +** This does a tokenizing run over the string. 
Found tokens (and synonyms) +** are stored by calling xToken() +*/ +static int perl_fts5_tokenizer_Tokenize( + Fts5Tokenizer* tokenizer, + void *pCtx, + int flags, /* Mask of FTS5_TOKENIZE_* flags */ + const char *pText, int nText, + int (*xToken)( + void *pCtx, /* Copy of 2nd argument to xTokenize() */ + int tflags, /* Mask of FTS5_TOKEN_* flags */ + const char *pToken, /* Pointer to buffer containing token */ + int nToken, /* Size of token in bytes */ + int iStart, /* Byte offset of token within input text */ + int iEnd /* Byte offset of end of token within input text */ + ) +){ + perl_Fts5Tokenizer *c = (perl_Fts5Tokenizer *) tokenizer; + char *token; + char *byteOffset; + dTHX; + dSP; + + /* newSVpvn() will create a copy of this buffer, but ideally we would + * directly write into the PV part of that copied buffer instead + */ + perl_cb_ctx ctx; + SV* ctxP; + SV* text; + + STRLEN n_a; /* this is required for older perls < 5.8.8 */ + I32 hop; + + ENTER; + SAVETMPS; + + /* call the Perl tokenizer, and pass it our token callback */ + PUSHMARK(SP); + + ctx.Ctx = pCtx; + ctx.xToken = xToken; + ctxP = newSVpvn((const char *const)&ctx, sizeof(ctx)); + + text = newSVpvn(pText, nText); + + // We pass three arguments: the packed callback context, the text and the flags + //EXTEND(SP, 2); + XPUSHs(sv_2mortal(ctxP)); + XPUSHs(sv_2mortal(text)); + XPUSHs(sv_2mortal(newSViv(flags))); + // We need to properly wrap this so it is callable from Perl... + // ... without needing actual local storage or a global variable... 
+ + // XXX Wrap the "found token" callback, and pass it to the user + // Then, restructure the data if it is UTF-8 + // First, do all of this in Perl so it is easier to debug + + ///* if we get back an empty list, there is no more token */ + //if (n_retval == 0) { + // result = SQLITE_DONE; + //} + ///* otherwise, get token details from the return list */ + //else { + // if (n_retval != 5) { + // warn("tokenizer cursor returned %d arguments", n_retval); + // } + // *piPosition = POPi; + // *piEndOffset = POPi; + // *piStartOffset = POPi; + // *pnBytes = POPi; + // token = POPpx; + // + // if (c->pInput) { /* if working with utf8 data */ + // + // /* recompute *pnBytes in bytes, not in chars */ + // *pnBytes = strlen(token); + // + // /* recompute start/end offsets in bytes, not in chars */ + // hop = *piStartOffset - c->lastCharOffset; + // byteOffset = (char*)utf8_hop((U8*)c->lastByteOffset, hop); + // hop = *piEndOffset - *piStartOffset; + // *piStartOffset = byteOffset - c->pInput; + // byteOffset = (char*)utf8_hop((U8*)byteOffset, hop); + // *piEndOffset = byteOffset - c->pInput; + // + // /* remember where we are for next round */ + // c->lastCharOffset = *piEndOffset, + // c->lastByteOffset = byteOffset; + // } + // + // /* make sure we have enough storage for copying the token */ + // if (*pnBytes > c->nTokenAllocated ){ + // char *pNew; + // c->nTokenAllocated = *pnBytes + 20; + // pNew = sqlite3_realloc(c->pToken, c->nTokenAllocated); + // if( !pNew ) return SQLITE_NOMEM; + // c->pToken = pNew; + // } + // + // /* need to copy the token into the C cursor before perl frees that + // memory */ + // memcpy(c->pToken, token, *pnBytes); + // *ppToken = c->pToken; + // + // result = SQLITE_OK; + // + PUTBACK; + call_sv(c->coderef, G_VOID); + + SPAGAIN; + + PUTBACK; + FREETMPS; + LEAVE; + + return SQLITE_OK; +} + +int perl_fts5_xToken(pTHX_ + SV* pCtx, + int tflags, /* Mask of FTS5_TOKEN_* flags */ + SV* svToken, /* Pointer to buffer containing token */ + int 
iStart, /* Byte offset of token within input text */ + int iEnd /* Byte offset of end of token within input text */ +) { + STRLEN nToken; + const char* chrToken = SvPV(svToken, nToken); + perl_cb_ctx * p = (SvPOK(pCtx) && SvCUR(pCtx) == sizeof(perl_cb_ctx)) ? (perl_cb_ctx *)SvPVX(pCtx) : NULL; /* reject scalars that are not a packed perl_cb_ctx */ + return (p && p->xToken) ? p->xToken(p->Ctx,tflags,chrToken,(int)nToken,iStart,iEnd) : SQLITE_ERROR; +} + + +/* +** The set of routines that implement the perl FTS5 tokenizer +*/ +fts5_tokenizer perl_fts5_tokenizer_Module = { + perl_fts5_tokenizer_Create, + perl_fts5_tokenizer_Delete, + perl_fts5_tokenizer_Tokenize +}; + +/* +** Fetch the FTS5 API pointer; returns NULL if the handle is inactive or the fts5() SQL function cannot be prepared +*/ + +fts5_api* sqlite_fetch_fts5_api(pTHX_ SV *dbh) +{ + D_imp_dbh(dbh); + + int rc; + sqlite3_stmt *pStmt; + const char zSql[] = "SELECT fts5(?)"; + fts5_api *pFts5Api = 0; + + if (!DBIc_ACTIVE(imp_dbh)) { + sqlite_error(dbh, -2, "attempt to register fts5 tokenizer on inactive database handle"); + return NULL; + } + + rc = sqlite3_prepare_v2(imp_dbh->db, zSql, -1, &pStmt, 0); + if( rc!=SQLITE_OK ){ + return 0; + } + + sqlite3_bind_pointer(pStmt, 1, (void*)&pFts5Api, "fts5_api_ptr", NULL); + sqlite3_step(pStmt); + sqlite3_finalize(pStmt); + + return pFts5Api; +} + +/* +** Register the perl tokenizer with FTS5; returns SQLITE_ERROR when the FTS5 API pointer could not be fetched +*/ +int sqlite_db_register_fts5_perl_tokenizer(pTHX_ SV *dbh) +{ + D_imp_dbh(dbh); + + int rc; + fts5_api *pFts5Api = sqlite_fetch_fts5_api(aTHX_ dbh); + fts5_tokenizer *p = &perl_fts5_tokenizer_Module; + + rc = pFts5Api ? pFts5Api->xCreateTokenizer(pFts5Api, "perl", 0, p, 0) : SQLITE_ERROR; /* pFts5Api is NULL when the fetch failed */ + + return rc; +} diff --git a/fts5.h b/fts5.h new file mode 100644 index 0000000..9bf1479 --- /dev/null +++ b/fts5.h @@ -0,0 +1,580 @@ +/* +** 2014 May 31 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. +** May you share freely, never taking more than you give. +** +****************************************************************************** +** +** Interfaces to extend FTS5. 
Using the interfaces defined in this file, +** FTS5 may be extended with: +** +** * custom tokenizers, and +** * custom auxiliary functions. +*/ + + +#ifndef _FTS5_H +#define _FTS5_H + +#include "sqlite3.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/************************************************************************* +** CUSTOM AUXILIARY FUNCTIONS +** +** Virtual table implementations may overload SQL functions by implementing +** the sqlite3_module.xFindFunction() method. +*/ + +typedef struct Fts5ExtensionApi Fts5ExtensionApi; +typedef struct Fts5Context Fts5Context; +typedef struct Fts5PhraseIter Fts5PhraseIter; + +/* + * Wrap fts5_xToken in a callback that takes an array of arrayrefs (?) + * ... instead of the user calling fts5_xToken themselves + * / + +typedef void (*fts5_extension_function)( + const Fts5ExtensionApi *pApi, /* API offered by current FTS version */ + Fts5Context *pFts, /* First arg to pass to pApi functions */ + sqlite3_context *pCtx, /* Context for returning result/error */ + int nVal, /* Number of values in apVal[] array */ + sqlite3_value **apVal /* Array of trailing arguments */ +); + +struct Fts5PhraseIter { + const unsigned char *a; + const unsigned char *b; +}; + +/* +** EXTENSION API FUNCTIONS +** +** xUserData(pFts): +** Return a copy of the context pointer the extension function was +** registered with. +** +** xColumnTotalSize(pFts, iCol, pnToken): +** If parameter iCol is less than zero, set output variable *pnToken +** to the total number of tokens in the FTS5 table. Or, if iCol is +** non-negative but less than the number of columns in the table, return +** the total number of tokens in column iCol, considering all rows in +** the FTS5 table. +** +** If parameter iCol is greater than or equal to the number of columns +** in the table, SQLITE_RANGE is returned. Or, if an error occurs (e.g. +** an OOM condition or IO error), an appropriate SQLite error code is +** returned. 
+** +** xColumnCount(pFts): +** Return the number of columns in the table. +** +** xColumnSize(pFts, iCol, pnToken): +** If parameter iCol is less than zero, set output variable *pnToken +** to the total number of tokens in the current row. Or, if iCol is +** non-negative but less than the number of columns in the table, set +** *pnToken to the number of tokens in column iCol of the current row. +** +** If parameter iCol is greater than or equal to the number of columns +** in the table, SQLITE_RANGE is returned. Or, if an error occurs (e.g. +** an OOM condition or IO error), an appropriate SQLite error code is +** returned. +** +** This function may be quite inefficient if used with an FTS5 table +** created with the "columnsize=0" option. +** +** xColumnText: +** This function attempts to retrieve the text of column iCol of the +** current document. If successful, (*pz) is set to point to a buffer +** containing the text in utf-8 encoding, (*pn) is set to the size in bytes +** (not characters) of the buffer and SQLITE_OK is returned. Otherwise, +** if an error occurs, an SQLite error code is returned and the final values +** of (*pz) and (*pn) are undefined. +** +** xPhraseCount: +** Returns the number of phrases in the current query expression. +** +** xPhraseSize: +** Returns the number of tokens in phrase iPhrase of the query. Phrases +** are numbered starting from zero. +** +** xInstCount: +** Set *pnInst to the total number of occurrences of all phrases within +** the query within the current row. Return SQLITE_OK if successful, or +** an error code (i.e. SQLITE_NOMEM) if an error occurs. +** +** This API can be quite slow if used with an FTS5 table created with the +** "detail=none" or "detail=column" option. If the FTS5 table is created +** with either "detail=none" or "detail=column" and "content=" option +** (i.e. if it is a contentless table), then this API always returns 0. 
+** +** xInst: +** Query for the details of phrase match iIdx within the current row. +** Phrase matches are numbered starting from zero, so the iIdx argument +** should be greater than or equal to zero and smaller than the value +** output by xInstCount(). +** +** Usually, output parameter *piPhrase is set to the phrase number, *piCol +** to the column in which it occurs and *piOff the token offset of the +** first token of the phrase. Returns SQLITE_OK if successful, or an error +** code (i.e. SQLITE_NOMEM) if an error occurs. +** +** This API can be quite slow if used with an FTS5 table created with the +** "detail=none" or "detail=column" option. +** +** xRowid: +** Returns the rowid of the current row. +** +** xTokenize: +** Tokenize text using the tokenizer belonging to the FTS5 table. +** +** xQueryPhrase(pFts5, iPhrase, pUserData, xCallback): +** This API function is used to query the FTS table for phrase iPhrase +** of the current query. Specifically, a query equivalent to: +** +** ... FROM ftstable WHERE ftstable MATCH $p ORDER BY rowid +** +** with $p set to a phrase equivalent to the phrase iPhrase of the +** current query is executed. Any column filter that applies to +** phrase iPhrase of the current query is included in $p. For each +** row visited, the callback function passed as the fourth argument +** is invoked. The context and API objects passed to the callback +** function may be used to access the properties of each matched row. +** Invoking Api.xUserData() returns a copy of the pointer passed as +** the third argument to pUserData. +** +** If the callback function returns any value other than SQLITE_OK, the +** query is abandoned and the xQueryPhrase function returns immediately. +** If the returned value is SQLITE_DONE, xQueryPhrase returns SQLITE_OK. +** Otherwise, the error code is propagated upwards. +** +** If the query runs to completion without incident, SQLITE_OK is returned. 
+** Or, if some error occurs before the query completes or is aborted by +** the callback, an SQLite error code is returned. +** +** +** xSetAuxdata(pFts5, pAux, xDelete) +** +** Save the pointer passed as the second argument as the extension function's +** "auxiliary data". The pointer may then be retrieved by the current or any +** future invocation of the same fts5 extension function made as part of +** the same MATCH query using the xGetAuxdata() API. +** +** Each extension function is allocated a single auxiliary data slot for +** each FTS query (MATCH expression). If the extension function is invoked +** more than once for a single FTS query, then all invocations share a +** single auxiliary data context. +** +** If there is already an auxiliary data pointer when this function is +** invoked, then it is replaced by the new pointer. If an xDelete callback +** was specified along with the original pointer, it is invoked at this +** point. +** +** The xDelete callback, if one is specified, is also invoked on the +** auxiliary data pointer after the FTS5 query has finished. +** +** If an error (e.g. an OOM condition) occurs within this function, +** the auxiliary data is set to NULL and an error code returned. If the +** xDelete parameter was not NULL, it is invoked on the auxiliary data +** pointer before returning. +** +** +** xGetAuxdata(pFts5, bClear) +** +** Returns the current auxiliary data pointer for the fts5 extension +** function. See the xSetAuxdata() method for details. +** +** If the bClear argument is non-zero, then the auxiliary data is cleared +** (set to NULL) before this function returns. In this case the xDelete, +** if any, is not invoked. +** +** +** xRowCount(pFts5, pnRow) +** +** This function is used to retrieve the total number of rows in the table. 
+** In other words, the same value that would be returned by: +** +** SELECT count(*) FROM ftstable; +** +** xPhraseFirst() +** This function is used, along with type Fts5PhraseIter and the xPhraseNext +** method, to iterate through all instances of a single query phrase within +** the current row. This is the same information as is accessible via the +** xInstCount/xInst APIs. While the xInstCount/xInst APIs are more convenient +** to use, this API may be faster under some circumstances. To iterate +** through instances of phrase iPhrase, use the following code: +** +** Fts5PhraseIter iter; +** int iCol, iOff; +** for(pApi->xPhraseFirst(pFts, iPhrase, &iter, &iCol, &iOff); +** iCol>=0; +** pApi->xPhraseNext(pFts, &iter, &iCol, &iOff) +** ){ +** // An instance of phrase iPhrase at offset iOff of column iCol +** } +** +** The Fts5PhraseIter structure is defined above. Applications should not +** modify this structure directly - it should only be used as shown above +** with the xPhraseFirst() and xPhraseNext() API methods (and by +** xPhraseFirstColumn() and xPhraseNextColumn() as illustrated below). +** +** This API can be quite slow if used with an FTS5 table created with the +** "detail=none" or "detail=column" option. If the FTS5 table is created +** with either "detail=none" or "detail=column" and "content=" option +** (i.e. if it is a contentless table), then this API always iterates +** through an empty set (all calls to xPhraseFirst() set iCol to -1). +** +** xPhraseNext() +** See xPhraseFirst above. +** +** xPhraseFirstColumn() +** This function and xPhraseNextColumn() are similar to the xPhraseFirst() +** and xPhraseNext() APIs described above. The difference is that instead +** of iterating through all instances of a phrase in the current row, these +** APIs are used to iterate through the set of columns in the current row +** that contain one or more instances of a specified phrase. 
For example: +** +** Fts5PhraseIter iter; +** int iCol; +** for(pApi->xPhraseFirstColumn(pFts, iPhrase, &iter, &iCol); +** iCol>=0; +** pApi->xPhraseNextColumn(pFts, &iter, &iCol) +** ){ +** // Column iCol contains at least one instance of phrase iPhrase +** } +** +** This API can be quite slow if used with an FTS5 table created with the +** "detail=none" option. If the FTS5 table is created with either +** "detail=none" "content=" option (i.e. if it is a contentless table), +** then this API always iterates through an empty set (all calls to +** xPhraseFirstColumn() set iCol to -1). +** +** The information accessed using this API and its companion +** xPhraseFirstColumn() may also be obtained using xPhraseFirst/xPhraseNext +** (or xInst/xInstCount). The chief advantage of this API is that it is +** significantly more efficient than those alternatives when used with +** "detail=column" tables. +** +** xPhraseNextColumn() +** See xPhraseFirstColumn above. +*/ +struct Fts5ExtensionApi { + int iVersion; /* Currently always set to 3 */ + + void *(*xUserData)(Fts5Context*); + + int (*xColumnCount)(Fts5Context*); + int (*xRowCount)(Fts5Context*, sqlite3_int64 *pnRow); + int (*xColumnTotalSize)(Fts5Context*, int iCol, sqlite3_int64 *pnToken); + + int (*xTokenize)(Fts5Context*, + const char *pText, int nText, /* Text to tokenize */ + void *pCtx, /* Context passed to xToken() */ + int (*xToken)(void*, int, const char*, int, int, int) /* Callback */ + ); + + int (*xPhraseCount)(Fts5Context*); + int (*xPhraseSize)(Fts5Context*, int iPhrase); + + int (*xInstCount)(Fts5Context*, int *pnInst); + int (*xInst)(Fts5Context*, int iIdx, int *piPhrase, int *piCol, int *piOff); + + sqlite3_int64 (*xRowid)(Fts5Context*); + int (*xColumnText)(Fts5Context*, int iCol, const char **pz, int *pn); + int (*xColumnSize)(Fts5Context*, int iCol, int *pnToken); + + int (*xQueryPhrase)(Fts5Context*, int iPhrase, void *pUserData, + int(*)(const Fts5ExtensionApi*,Fts5Context*,void*) + ); + int 
(*xSetAuxdata)(Fts5Context*, void *pAux, void(*xDelete)(void*)); + void *(*xGetAuxdata)(Fts5Context*, int bClear); + + int (*xPhraseFirst)(Fts5Context*, int iPhrase, Fts5PhraseIter*, int*, int*); + void (*xPhraseNext)(Fts5Context*, Fts5PhraseIter*, int *piCol, int *piOff); + + int (*xPhraseFirstColumn)(Fts5Context*, int iPhrase, Fts5PhraseIter*, int*); + void (*xPhraseNextColumn)(Fts5Context*, Fts5PhraseIter*, int *piCol); +}; + +/* +** CUSTOM AUXILIARY FUNCTIONS +*************************************************************************/ + +/************************************************************************* +** CUSTOM TOKENIZERS +** +** Applications may also register custom tokenizer types. A tokenizer +** is registered by providing fts5 with a populated instance of the +** following structure. All structure methods must be defined, setting +** any member of the fts5_tokenizer struct to NULL leads to undefined +** behaviour. The structure methods are expected to function as follows: +** +** xCreate: +** This function is used to allocate and initialize a tokenizer instance. +** A tokenizer instance is required to actually tokenize text. +** +** The first argument passed to this function is a copy of the (void*) +** pointer provided by the application when the fts5_tokenizer object +** was registered with FTS5 (the third argument to xCreateTokenizer()). +** The second and third arguments are an array of nul-terminated strings +** containing the tokenizer arguments, if any, specified following the +** tokenizer name as part of the CREATE VIRTUAL TABLE statement used +** to create the FTS5 table. +** +** The final argument is an output variable. If successful, (*ppOut) +** should be set to point to the new tokenizer handle and SQLITE_OK +** returned. If an error occurs, some value other than SQLITE_OK should +** be returned. In this case, fts5 assumes that the final value of *ppOut +** is undefined. 
+** +** xDelete: +** This function is invoked to delete a tokenizer handle previously +** allocated using xCreate(). Fts5 guarantees that this function will +** be invoked exactly once for each successful call to xCreate(). +** +** xTokenize: +** This function is expected to tokenize the nText byte string indicated +** by argument pText. pText may or may not be nul-terminated. The first +** argument passed to this function is a pointer to an Fts5Tokenizer object +** returned by an earlier call to xCreate(). +** +** The second argument indicates the reason that FTS5 is requesting +** tokenization of the supplied text. This is always one of the following +** four values: +** +**