From: drh Date: Thu, 31 Aug 2006 15:07:14 +0000 (+0000) Subject: Refactor the FTS1 module so that its name is "fts1" instead of "fulltext", X-Git-Tag: version-3.6.10~2790 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=5db455e7b5512613a4bafd39acfb20cfff727708;p=thirdparty%2Fsqlite.git Refactor the FTS1 module so that its name is "fts1" instead of "fulltext", so that all symbols with external linkage begin with "sqlite3Fts1", and so that all filenames begin with "fts1". (CVS 3377) FossilOrigin-Name: e1891f0dc58e5498a8845d8b9b5b092d7f9c7003 --- diff --git a/ext/fts1/fts1.c b/ext/fts1/fts1.c new file mode 100644 index 0000000000..d4a9350621 --- /dev/null +++ b/ext/fts1/fts1.c @@ -0,0 +1,1509 @@ +/* The author disclaims copyright to this source code. + * + * This is an SQLite module implementing full-text search. + */ + +/* +** The code in this file is only compiled if: +** +** * The FTS1 module is being built as an extension +** (in which case SQLITE_CORE is not defined), or +** +** * The FTS1 module is being built into the core of +** SQLite (in which case SQLITE_ENABLE_FTS1 is defined). +*/ +#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) + +#include +#if !defined(__APPLE__) +#include +#else +#include +#endif +#include +#include +#include + +#include "fts1.h" +#include "fts1_hash.h" +#include "fts1_tokenizer.h" +#include "sqlite3.h" +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT1 + +/* utility functions */ + +/* We encode variable-length integers in little-endian order using seven bits + * per byte as follows: +** +** KEY: +** A = 0xxxxxxx 7 bits of data and one flag bit +** B = 1xxxxxxx 7 bits of data and one flag bit +** +** 7 bits - A +** 14 bits - BA +** 21 bits - BBA +** and so on. +*/ + +/* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */ +#define VARINT_MAX 10 + +/* Write a 64-bit variable-length integer to memory starting at p[0]. + * The length of data written will be between 1 and VARINT_MAX bytes. + * The number of bytes written is returned. */ +static int putVarint(char *p, sqlite_int64 v){ + unsigned char *q = (unsigned char *) p; + sqlite_uint64 vu = v; + do{ + *q++ = (unsigned char) ((vu & 0x7f) | 0x80); + vu >>= 7; + }while( vu!=0 ); + q[-1] &= 0x7f; /* turn off high bit in final byte */ + assert( q - (unsigned char *)p <= VARINT_MAX ); + return (int) (q - (unsigned char *)p); +} + +/* Read a 64-bit variable-length integer from memory starting at p[0]. + * Return the number of bytes read, or 0 on error. + * The value is stored in *v. */ +static int getVarint(const char *p, sqlite_int64 *v){ + const unsigned char *q = (const unsigned char *) p; + sqlite_uint64 x = 0, y = 1; + while( (*q & 0x80) == 0x80 ){ + x += y * (*q++ & 0x7f); + y <<= 7; + if( q - (unsigned char *)p >= VARINT_MAX ){ /* bad data */ + assert( 0 ); + return 0; + } + } + x += y * (*q++); + *v = (sqlite_int64) x; + return (int) (q - (unsigned char *)p); +} + +static int getVarint32(const char *p, int *pi){ + sqlite_int64 i; + int ret = getVarint(p, &i); + *pi = (int) i; + assert( *pi==i ); + return ret; +} + +/*** Document lists *** + * + * A document list holds a sorted list of varint-encoded document IDs. + * + * A doclist with type DL_POSITIONS_OFFSETS is stored like this: + * + * array { + * varint docid; + * array { + * varint position; (delta from previous position plus 1, or 0 for end) + * varint startOffset; (delta from previous startOffset) + * varint endOffset; (delta from startOffset) + * } + * } + * + * Here, array { X } means zero or more occurrences of X, adjacent in memory. + * + * A doclist with type DL_POSITIONS is like the above, but holds only docids + * and positions without offset information. + * + * A doclist with type DL_DOCIDS is like the above, but holds only docids + * without positions or offset information. + * + * On disk, every document list has positions and offsets, so we don't bother + * to serialize a doclist's type. + * + * We don't yet delta-encode document IDs; doing so will probably be a + * modest win. + * + * NOTE(shess) I've thought of a slightly (1%) better offset encoding. + * After the first offset, estimate the next offset by using the + * current token position and the previous token position and offset, + * offset to handle some variance. So the estimate would be + * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded + * as normal. Offsets more than 64 chars from the estimate are + * encoded as the delta to the previous start offset + 128. An + * additional tiny increment can be gained by using the end offset of + * the previous token to make the estimate a tiny bit more precise. +*/ + +typedef enum DocListType { + DL_DOCIDS, /* docids only */ + DL_POSITIONS, /* docids + positions */ + DL_POSITIONS_OFFSETS /* docids + positions + offsets */ +} DocListType; + +typedef struct DocList { + char *pData; + int nData; + DocListType iType; + int iLastPos; /* the last position written */ + int iLastOffset; /* the last start offset written */ +} DocList; + +/* Initialize a new DocList to hold the given data. */ +static void docListInit(DocList *d, DocListType iType, + const char *pData, int nData){ + d->nData = nData; + if( nData>0 ){ + d->pData = malloc(nData); + memcpy(d->pData, pData, nData); + } else { + d->pData = NULL; + } + d->iType = iType; + d->iLastPos = 0; + d->iLastOffset = 0; +} + +/* Create a new dynamically-allocated DocList. */ +static DocList *docListNew(DocListType iType){ + DocList *d = (DocList *) malloc(sizeof(DocList)); + docListInit(d, iType, 0, 0); + return d; +} + +static void docListDestroy(DocList *d){ + free(d->pData); +#ifndef NDEBUG + memset(d, 0x55, sizeof(*d)); +#endif +} + +static void docListDelete(DocList *d){ + docListDestroy(d); + free(d); +} + +static char *docListEnd(DocList *d){ + return d->pData + d->nData; +} + +/* Append a varint to a DocList's data. */ +static void appendVarint(DocList *d, sqlite_int64 i){ + char c[VARINT_MAX]; + int n = putVarint(c, i); + d->pData = realloc(d->pData, d->nData + n); + memcpy(d->pData + d->nData, c, n); + d->nData += n; +} + +static void docListAddDocid(DocList *d, sqlite_int64 iDocid){ + appendVarint(d, iDocid); + d->iLastPos = 0; +} + +/* Add a position to the last position list in a doclist. */ +static void docListAddPos(DocList *d, int iPos){ + assert( d->iType>=DL_POSITIONS ); + appendVarint(d, iPos-d->iLastPos+1); + d->iLastPos = iPos; +} + +static void docListAddPosOffset(DocList *d, int iPos, + int iStartOffset, int iEndOffset){ + assert( d->iType==DL_POSITIONS_OFFSETS ); + docListAddPos(d, iPos); + appendVarint(d, iStartOffset-d->iLastOffset); + d->iLastOffset = iStartOffset; + appendVarint(d, iEndOffset-iStartOffset); +} + +/* Terminate the last position list in the given doclist. */ +static void docListAddEndPos(DocList *d){ + appendVarint(d, 0); +} + +typedef struct DocListReader { + DocList *pDoclist; + char *p; + int iLastPos; /* the last position read */ +} DocListReader; + +static void readerInit(DocListReader *r, DocList *pDoclist){ + r->pDoclist = pDoclist; + if( pDoclist!=NULL ){ + r->p = pDoclist->pData; + } + r->iLastPos = 0; +} + +static int readerAtEnd(DocListReader *pReader){ + return pReader->p >= docListEnd(pReader->pDoclist); +} + +/* Peek at the next docid without advancing the read pointer. */ +static sqlite_int64 peekDocid(DocListReader *pReader){ + sqlite_int64 ret; + assert( !readerAtEnd(pReader) ); + getVarint(pReader->p, &ret); + return ret; +} + +/* Read the next docid. */ +static sqlite_int64 readDocid(DocListReader *pReader){ + sqlite_int64 ret; + assert( !readerAtEnd(pReader) ); + pReader->p += getVarint(pReader->p, &ret); + pReader->iLastPos = 0; + return ret; +} + +/* Read the next position from a position list. + * Returns the position, or -1 at the end of the list. */ +static int readPosition(DocListReader *pReader){ + int i; + int iType = pReader->pDoclist->iType; + assert( iType>=DL_POSITIONS ); + assert( !readerAtEnd(pReader) ); + + pReader->p += getVarint32(pReader->p, &i); + if( i==0 ){ + pReader->iLastPos = -1; + return -1; + } + pReader->iLastPos += ((int) i)-1; + if( iType>=DL_POSITIONS_OFFSETS ){ + /* Skip over offsets, ignoring them for now. */ + int iStart, iEnd; + pReader->p += getVarint32(pReader->p, &iStart); + pReader->p += getVarint32(pReader->p, &iEnd); + } + return pReader->iLastPos; +} + +/* Skip past the end of a position list. */ +static void skipPositionList(DocListReader *pReader){ + while( readPosition(pReader)!=-1 ) + ; +} + +/* Skip over a docid, including its position list if the doclist has + * positions. */ +static void skipDocument(DocListReader *pReader){ + readDocid(pReader); + if( pReader->pDoclist->iType >= DL_POSITIONS ){ + skipPositionList(pReader); + } +} + +static sqlite_int64 firstDocid(DocList *d){ + DocListReader r; + readerInit(&r, d); + return readDocid(&r); +} + +/* Doclist multi-tool. Pass pUpdate==NULL to delete the indicated docid; + * otherwise pUpdate, which must contain only the single docid [iDocid], is + * inserted (if not present) or updated (if already present). */ +static int docListUpdate(DocList *d, sqlite_int64 iDocid, DocList *pUpdate){ + int modified = 0; + DocListReader reader; + char *p; + + if( pUpdate!=NULL ){ + assert( d->iType==pUpdate->iType); + assert( iDocid==firstDocid(pUpdate) ); + } + + readerInit(&reader, d); + while( !readerAtEnd(&reader) && peekDocid(&reader)nData -= (reader.p - p); + modified = 1; + } + + /* Insert if indicated. */ + if( pUpdate!=NULL ){ + int iDoclist = p-d->pData; + docListAddEndPos(pUpdate); + + d->pData = realloc(d->pData, d->nData+pUpdate->nData); + p = d->pData + iDoclist; + + memmove(p+pUpdate->nData, p, docListEnd(d) - p); + memcpy(p, pUpdate->pData, pUpdate->nData); + d->nData += pUpdate->nData; + modified = 1; + } + + return modified; +} + +/* Split the second half of doclist d into a separate doclist d2. Returns 1 + * if successful, or 0 if d contains a single document and hence can't be + * split. */ +static int docListSplit(DocList *d, DocList *d2){ + const char *pSplitPoint = d->pData + d->nData / 2; + DocListReader reader; + + readerInit(&reader, d); + while( reader.piType, reader.p, docListEnd(d) - reader.p); + d->nData = reader.p - d->pData; + d->pData = realloc(d->pData, d->nData); + return 1; +} + +/* A DocListMerge computes the AND of an in-memory DocList [in] and a chunked + * on-disk doclist, resulting in another in-memory DocList [out]. [in] + * and [out] may or may not store position information according to the + * caller's wishes. The on-disk doclist always comes with positions. + * + * The caller must read each chunk of the on-disk doclist in succession and + * pass it to mergeBlock(). + * + * If [in] has positions, then the merge output contains only documents with + * matching positions in the two input doclists. If [in] does not have + * positions, then the merge output contains all documents common to the two + * input doclists. + * + * If [in] is NULL, then the on-disk doclist is copied to [out] directly. + * + * A merge is performed using an integer [iOffset] provided by the caller. + * [iOffset] is subtracted from each position in the on-disk doclist for the + * purpose of position comparison; this is helpful in implementing phrase + * searches. + * + * A DocListMerge is not yet able to propagate offsets through query + * processing; we should add that capability soon. +*/ +typedef struct DocListMerge { + DocListReader in; + DocList *pOut; + int iOffset; +} DocListMerge; + +static void mergeInit(DocListMerge *m, + DocList *pIn, int iOffset, DocList *pOut){ + readerInit(&m->in, pIn); + m->pOut = pOut; + m->iOffset = iOffset; + + /* can't handle offsets yet */ + assert( pIn==NULL || pIn->iType <= DL_POSITIONS ); + assert( pOut->iType <= DL_POSITIONS ); +} + +/* A helper function for mergeBlock(), below. Merge the position lists + * pointed to by m->in and pBlockReader. + * If the merge matches, write [iDocid] to m->pOut; if m->pOut + * has positions then write all matching positions as well. */ +static void mergePosList(DocListMerge *m, sqlite_int64 iDocid, + DocListReader *pBlockReader){ + int block_pos = readPosition(pBlockReader); + int in_pos = readPosition(&m->in); + int match = 0; + while( block_pos!=-1 || in_pos!=-1 ){ + if( block_pos-m->iOffset==in_pos ){ + if( !match ){ + docListAddDocid(m->pOut, iDocid); + match = 1; + } + if( m->pOut->iType >= DL_POSITIONS ){ + docListAddPos(m->pOut, in_pos); + } + block_pos = readPosition(pBlockReader); + in_pos = readPosition(&m->in); + } else if( in_pos==-1 || (block_pos!=-1 && block_pos-m->iOffsetin); + } + } + if( m->pOut->iType >= DL_POSITIONS && match ){ + docListAddEndPos(m->pOut); + } +} + +/* Merge one block of an on-disk doclist into a DocListMerge. */ +static void mergeBlock(DocListMerge *m, DocList *pBlock){ + DocListReader blockReader; + assert( pBlock->iType >= DL_POSITIONS ); + readerInit(&blockReader, pBlock); + while( !readerAtEnd(&blockReader) ){ + sqlite_int64 iDocid = readDocid(&blockReader); + if( m->in.pDoclist!=NULL ){ + while( 1 ){ + if( readerAtEnd(&m->in) ) return; /* nothing more to merge */ + if( peekDocid(&m->in)>=iDocid ) break; + skipDocument(&m->in); + } + if( peekDocid(&m->in)>iDocid ){ /* [pIn] has no match with iDocid */ + skipPositionList(&blockReader); /* skip this docid in the block */ + continue; + } + readDocid(&m->in); + } + /* We have a document match. */ + if( m->in.pDoclist==NULL || m->in.pDoclist->iType < DL_POSITIONS ){ + /* We don't need to do a poslist merge. */ + docListAddDocid(m->pOut, iDocid); + if( m->pOut->iType >= DL_POSITIONS ){ + /* Copy all positions to the output doclist. */ + while( 1 ){ + int pos = readPosition(&blockReader); + if( pos==-1 ) break; + docListAddPos(m->pOut, pos); + } + docListAddEndPos(m->pOut); + } else skipPositionList(&blockReader); + continue; + } + mergePosList(m, iDocid, &blockReader); + } +} + +static char *string_dup_n(const char *s, int n){ + char *str = malloc(n + 1); + memcpy(str, s, n); + str[n] = '\0'; + return str; +} + +/* Duplicate a string; the caller must free() the returned string. + * (We don't use strdup() since it's not part of the standard C library and + * may not be available everywhere.) */ +static char *string_dup(const char *s){ + return string_dup_n(s, strlen(s)); +} + +/* Format a string, replacing each occurrence of the % character with + * zName. This may be more convenient than sqlite_mprintf() + * when one string is used repeatedly in a format string. + * The caller must free() the returned string. */ +static char *string_format(const char *zFormat, const char *zName){ + const char *p; + size_t len = 0; + size_t nName = strlen(zName); + char *result; + char *r; + + /* first compute length needed */ + for(p = zFormat ; *p ; ++p){ + len += (*p=='%' ? nName : 1); + } + len += 1; /* for null terminator */ + + r = result = malloc(len); + for(p = zFormat; *p; ++p){ + if( *p=='%' ){ + memcpy(r, zName, nName); + r += nName; + } else { + *r++ = *p; + } + } + *r++ = '\0'; + assert( r == result + len ); + return result; +} + +static int sql_exec(sqlite3 *db, const char *zName, const char *zFormat){ + char *zCommand = string_format(zFormat, zName); + int rc = sqlite3_exec(db, zCommand, NULL, 0, NULL); + free(zCommand); + return rc; +} + +static int sql_prepare(sqlite3 *db, const char *zName, sqlite3_stmt **ppStmt, + const char *zFormat){ + char *zCommand = string_format(zFormat, zName); + int rc = sqlite3_prepare(db, zCommand, -1, ppStmt, NULL); + free(zCommand); + return rc; +} + +/* end utility functions */ + +#define QUERY_GENERIC 0 +#define QUERY_FULLTEXT 1 + +#define CHUNK_MAX 1024 + +typedef enum fulltext_statement { + CONTENT_INSERT_STMT, + CONTENT_SELECT_STMT, + CONTENT_DELETE_STMT, + + TERM_SELECT_STMT, + TERM_CHUNK_SELECT_STMT, + TERM_INSERT_STMT, + TERM_UPDATE_STMT, + TERM_DELETE_STMT, + + MAX_STMT /* Always at end! */ +} fulltext_statement; + +/* These must exactly match the enum above. */ +/* TODO(adam): Is there some risk that a statement (in particular, +** pTermSelectStmt) will be used in two cursors at once, e.g. if a +** query joins a virtual table to itself? If so perhaps we should +** move some of these to the cursor object. +*/ +static const char *fulltext_zStatement[MAX_STMT] = { + /* CONTENT_INSERT */ "insert into %_content (rowid, content) values (?, ?)", + /* CONTENT_SELECT */ "select content from %_content where rowid = ?", + /* CONTENT_DELETE */ "delete from %_content where rowid = ?", + + /* TERM_SELECT */ + "select rowid, doclist from %_term where term = ? and first = ?", + /* TERM_CHUNK_SELECT */ + "select max(first) from %_term where term = ? and first <= ?", + /* TERM_INSERT */ + "insert into %_term (term, first, doclist) values (?, ?, ?)", + /* TERM_UPDATE */ "update %_term set doclist = ? where rowid = ?", + /* TERM_DELETE */ "delete from %_term where rowid = ?", +}; + +typedef struct fulltext_vtab { + sqlite3_vtab base; + sqlite3 *db; + const char *zName; /* virtual table name */ + sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */ + + /* Precompiled statements which we keep as long as the table is + ** open. + */ + sqlite3_stmt *pFulltextStatements[MAX_STMT]; +} fulltext_vtab; + +typedef struct fulltext_cursor { + sqlite3_vtab_cursor base; + int iCursorType; /* QUERY_GENERIC or QUERY_FULLTEXT */ + + sqlite3_stmt *pStmt; + + int eof; + + /* The following is used only when iCursorType == QUERY_FULLTEXT. */ + DocListReader result; +} fulltext_cursor; + +static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){ + return (fulltext_vtab *) c->base.pVtab; +} + +static sqlite3_module fulltextModule; /* forward declaration */ + +/* Puts a freshly-prepared statement determined by iStmt in *ppStmt. +** If the indicated statement has never been prepared, it is prepared +** and cached, otherwise the cached version is reset. +*/ +static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt, + sqlite3_stmt **ppStmt){ + assert( iStmtpFulltextStatements[iStmt]==NULL ){ + int rc = sql_prepare(v->db, v->zName, &v->pFulltextStatements[iStmt], + fulltext_zStatement[iStmt]); + if( rc!=SQLITE_OK ) return rc; + } else { + int rc = sqlite3_reset(v->pFulltextStatements[iStmt]); + if( rc!=SQLITE_OK ) return rc; + } + + *ppStmt = v->pFulltextStatements[iStmt]; + return SQLITE_OK; +} + +/* Step the indicated statement, handling errors SQLITE_BUSY (by +** retrying) and SQLITE_SCHEMA (by re-preparing and transferring +** bindings to the new statement). +** TODO(adam): We should extend this function so that it can work with +** statements declared locally, not only globally cached statements. +*/ +static int sql_step_statement(fulltext_vtab *v, fulltext_statement iStmt, + sqlite3_stmt **ppStmt){ + int rc; + sqlite3_stmt *s = *ppStmt; + assert( iStmtpFulltextStatements[iStmt] ); + + while( (rc=sqlite3_step(s))!=SQLITE_DONE && rc!=SQLITE_ROW ){ + sqlite3_stmt *pNewStmt; + + if( rc==SQLITE_BUSY ) continue; + if( rc!=SQLITE_ERROR ) return rc; + + rc = sqlite3_reset(s); + if( rc!=SQLITE_SCHEMA ) return SQLITE_ERROR; + + v->pFulltextStatements[iStmt] = NULL; /* Still in s */ + rc = sql_get_statement(v, iStmt, &pNewStmt); + if( rc!=SQLITE_OK ) goto err; + *ppStmt = pNewStmt; + + rc = sqlite3_transfer_bindings(s, pNewStmt); + if( rc!=SQLITE_OK ) goto err; + + rc = sqlite3_finalize(s); + if( rc!=SQLITE_OK ) return rc; + s = pNewStmt; + } + return rc; + + err: + sqlite3_finalize(s); + return rc; +} + +/* Like sql_step_statement(), but convert SQLITE_DONE to SQLITE_OK. +** Useful for statements like UPDATE, where we expect no results. +*/ +static int sql_single_step_statement(fulltext_vtab *v, + fulltext_statement iStmt, + sqlite3_stmt **ppStmt){ + int rc = sql_step_statement(v, iStmt, ppStmt); + return (rc==SQLITE_DONE) ? SQLITE_OK : rc; +} + +/* insert into %_content (rowid, content) values ([rowid], [zContent]) */ +static int content_insert(fulltext_vtab *v, sqlite3_value *rowid, + const char *zContent, int nContent){ + sqlite3_stmt *s; + int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s); + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_value(s, 1, rowid); + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_text(s, 2, zContent, nContent, SQLITE_STATIC); + if( rc!=SQLITE_OK ) return rc; + + return sql_single_step_statement(v, CONTENT_INSERT_STMT, &s); +} + +/* select content from %_content where rowid = [iRow] + * The caller must delete the returned string. */ +static int content_select(fulltext_vtab *v, sqlite_int64 iRow, + char **pzContent){ + sqlite3_stmt *s; + int rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s); + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_int64(s, 1, iRow); + if( rc!=SQLITE_OK ) return rc; + + rc = sql_step_statement(v, CONTENT_SELECT_STMT, &s); + if( rc!=SQLITE_ROW ) return rc; + + *pzContent = string_dup((const char *)sqlite3_column_text(s, 0)); + + /* We expect only one row. We must execute another sqlite3_step() + * to complete the iteration; otherwise the table will remain locked. */ + rc = sqlite3_step(s); + if( rc==SQLITE_DONE ) return SQLITE_OK; + + free(*pzContent); + return rc; +} + +/* delete from %_content where rowid = [iRow ] */ +static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){ + sqlite3_stmt *s; + int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s); + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_int64(s, 1, iRow); + if( rc!=SQLITE_OK ) return rc; + + return sql_single_step_statement(v, CONTENT_DELETE_STMT, &s); +} + +/* select rowid, doclist from %_term where term = [zTerm] and first = [iFirst] + * If found, returns SQLITE_OK; the caller must free the returned doclist. + * If no rows found, returns SQLITE_ERROR. */ +static int term_select(fulltext_vtab *v, const char *zTerm, int nTerm, + sqlite_int64 iFirst, + sqlite_int64 *rowid, + DocList *out){ + sqlite3_stmt *s; + int rc = sql_get_statement(v, TERM_SELECT_STMT, &s); + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_TRANSIENT); + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_int64(s, 2, iFirst); + if( rc!=SQLITE_OK ) return rc; + + rc = sql_step_statement(v, TERM_SELECT_STMT, &s); + if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc; + + *rowid = sqlite3_column_int64(s, 0); + docListInit(out, DL_POSITIONS_OFFSETS, + sqlite3_column_blob(s, 1), sqlite3_column_bytes(s, 1)); + + /* We expect only one row. We must execute another sqlite3_step() + * to complete the iteration; otherwise the table will remain locked. */ + rc = sqlite3_step(s); + return rc==SQLITE_DONE ? SQLITE_OK : rc; +} + +/* select max(first) from %_term where term = [zTerm] and first <= [iFirst] + * If found, returns SQLITE_ROW and result in *piResult; if the query returns + * NULL (meaning no row found) returns SQLITE_DONE. + */ +static int term_chunk_select(fulltext_vtab *v, const char *zTerm, int nTerm, + sqlite_int64 iFirst, sqlite_int64 *piResult){ + sqlite3_stmt *s; + int rc = sql_get_statement(v, TERM_CHUNK_SELECT_STMT, &s); + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC); + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_int64(s, 2, iFirst); + if( rc!=SQLITE_OK ) return rc; + + rc = sql_step_statement(v, TERM_CHUNK_SELECT_STMT, &s); + if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc; + + switch( sqlite3_column_type(s, 0) ){ + case SQLITE_NULL: + rc = SQLITE_DONE; + break; + case SQLITE_INTEGER: + *piResult = sqlite3_column_int64(s, 0); + break; + default: + return SQLITE_ERROR; + } + /* We expect only one row. We must execute another sqlite3_step() + * to complete the iteration; otherwise the table will remain locked. */ + if( sqlite3_step(s) != SQLITE_DONE ) return SQLITE_ERROR; + return rc; +} + +/* insert into %_term (term, first, doclist) + values ([zTerm], [iFirst], [doclist]) */ +static int term_insert(fulltext_vtab *v, const char *zTerm, int nTerm, + sqlite_int64 iFirst, DocList *doclist){ + sqlite3_stmt *s; + int rc = sql_get_statement(v, TERM_INSERT_STMT, &s); + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC); + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_int64(s, 2, iFirst); + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_blob(s, 3, doclist->pData, doclist->nData, SQLITE_STATIC); + if( rc!=SQLITE_OK ) return rc; + + return sql_single_step_statement(v, TERM_INSERT_STMT, &s); +} + +/* update %_term set doclist = [doclist] where rowid = [rowid] */ +static int term_update(fulltext_vtab *v, sqlite_int64 rowid, + DocList *doclist){ + sqlite3_stmt *s; + int rc = sql_get_statement(v, TERM_UPDATE_STMT, &s); + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_blob(s, 1, doclist->pData, doclist->nData, + SQLITE_STATIC); + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_int64(s, 2, rowid); + if( rc!=SQLITE_OK ) return rc; + + return sql_single_step_statement(v, TERM_UPDATE_STMT, &s); +} + +static int term_delete(fulltext_vtab *v, sqlite_int64 rowid){ + sqlite3_stmt *s; + int rc = sql_get_statement(v, TERM_DELETE_STMT, &s); + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_int64(s, 1, rowid); + if( rc!=SQLITE_OK ) return rc; + + return sql_single_step_statement(v, TERM_DELETE_STMT, &s); +} + +static void fulltext_vtab_destroy(fulltext_vtab *v){ + int iStmt; + + for( iStmt=0; iStmtpFulltextStatements[iStmt]!=NULL ){ + sqlite3_finalize(v->pFulltextStatements[iStmt]); + v->pFulltextStatements[iStmt] = NULL; + } + } + + if( v->pTokenizer!=NULL ){ + v->pTokenizer->pModule->xDestroy(v->pTokenizer); + v->pTokenizer = NULL; + } + + free((void *) v->zName); + free(v); +} + +/* Current interface: +** argv[0] - module name +** argv[1] - database name +** argv[2] - table name +** argv[3] - tokenizer name (optional, a sensible default is provided) +** argv[4..] - passed to tokenizer (optional based on tokenizer) +**/ +static int fulltextConnect(sqlite3 *db, void *pAux, int argc, char **argv, + sqlite3_vtab **ppVTab){ + int rc; + fulltext_vtab *v; + sqlite3_tokenizer_module *m = NULL; + + assert( argc>=3 ); + v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab)); + /* sqlite will initialize v->base */ + v->db = db; + v->zName = string_dup(argv[2]); + v->pTokenizer = NULL; + + if( argc==3 ){ + sqlite3Fts1SimpleTokenizerModule(&m); + } else { + /* TODO(shess) For now, add new tokenizers as else if clauses. */ + if( !strcmp(argv[3], "simple") ){ + sqlite3Fts1SimpleTokenizerModule(&m); + } else { + assert( "unrecognized tokenizer"==NULL ); + } + } + + /* TODO(shess) Since tokenization impacts the index, the parameters + ** to the tokenizer need to be identical when a persistent virtual + ** table is re-created. One solution would be a meta-table to track + ** such information in the database. Then we could verify that the + ** information is identical on subsequent creates. + */ + /* TODO(shess) Why isn't argv already (const char **)? */ + rc = m->xCreate(argc-3, (const char **) (argv+3), &v->pTokenizer); + if( rc!=SQLITE_OK ) return rc; + v->pTokenizer->pModule = m; + + /* TODO: verify the existence of backing tables foo_content, foo_term */ + + rc = sqlite3_declare_vtab(db, "create table x(content text)"); + if( rc!=SQLITE_OK ) return rc; + + memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements)); + + *ppVTab = &v->base; + return SQLITE_OK; +} + +static int fulltextCreate(sqlite3 *db, void *pAux, int argc, char **argv, + sqlite3_vtab **ppVTab){ + int rc; + assert( argc>=3 ); + + /* The %_content table holds the text of each full-text item, with + ** the rowid used as the docid. + ** + ** The %_term table maps each term to a document list blob + ** containing elements sorted by ascending docid, each element + ** encoded as: + ** + ** docid varint-encoded + ** token count varint-encoded + ** "count" token elements (poslist): + ** position varint-encoded as delta from previous position + ** start offset varint-encoded as delta from previous start offset + ** end offset varint-encoded as delta from start offset + ** + ** Additionally, doclist blobs can be chunked into multiple rows, + ** using "first" to order the blobs. "first" is simply the first + ** docid in the blob. + */ + /* + ** NOTE(shess) That last sentence is incorrect in the face of + ** deletion, which can leave a doclist that doesn't contain the + ** first from that row. I _believe_ this does not matter to the + ** operation of the system, but it might be reasonable to update + ** appropriately in case this assumption becomes more important. + */ + rc = sql_exec(db, argv[2], + "create table %_content(content text);" + "create table %_term(term text, first integer, doclist blob);" + "create index %_index on %_term(term, first)"); + if( rc!=SQLITE_OK ) return rc; + + return fulltextConnect(db, pAux, argc, argv, ppVTab); +} + +/* Decide how to handle an SQL query. + * At the moment, MATCH queries can include implicit boolean ANDs; we + * haven't implemented phrase searches or OR yet. */ +static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){ + int i; + + for(i=0; inConstraint; ++i){ + const struct sqlite3_index_constraint *pConstraint; + pConstraint = &pInfo->aConstraint[i]; + if( pConstraint->iColumn==0 && + pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH && + pConstraint->usable ){ /* a full-text search */ + pInfo->aConstraintUsage[i].argvIndex = 1; + pInfo->aConstraintUsage[i].omit = 1; + pInfo->idxNum = QUERY_FULLTEXT; + pInfo->estimatedCost = 1.0; /* an arbitrary value for now */ + return SQLITE_OK; + } + } + pInfo->idxNum = QUERY_GENERIC; + return SQLITE_OK; +} + +static int fulltextDisconnect(sqlite3_vtab *pVTab){ + fulltext_vtab_destroy((fulltext_vtab *)pVTab); + return SQLITE_OK; +} + +static int fulltextDestroy(sqlite3_vtab *pVTab){ + fulltext_vtab *v = (fulltext_vtab *)pVTab; + + int rc = sql_exec(v->db, v->zName, + "drop table %_content; drop table %_term"); + if( rc!=SQLITE_OK ) return rc; + + fulltext_vtab_destroy((fulltext_vtab *)pVTab); + return SQLITE_OK; +} + +static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){ + fulltext_cursor *c; + + c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1); + /* sqlite will initialize c->base */ + *ppCursor = &c->base; + + return SQLITE_OK; +} + +static int fulltextClose(sqlite3_vtab_cursor *pCursor){ + fulltext_cursor *c = (fulltext_cursor *) pCursor; + sqlite3_finalize(c->pStmt); + if( c->result.pDoclist!=NULL ){ + docListDelete(c->result.pDoclist); + } + free(c); + return SQLITE_OK; +} + +static int fulltextNext(sqlite3_vtab_cursor *pCursor){ + fulltext_cursor *c = (fulltext_cursor *) pCursor; + sqlite_int64 iDocid; + int rc; + + switch( c->iCursorType ){ + case QUERY_GENERIC: + /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ + rc = sqlite3_step(c->pStmt); + switch( rc ){ + case SQLITE_ROW: + c->eof = 0; + return SQLITE_OK; + case SQLITE_DONE: + c->eof = 1; + return SQLITE_OK; + default: + c->eof = 1; + return rc; + } + case QUERY_FULLTEXT: + rc = sqlite3_reset(c->pStmt); + if( rc!=SQLITE_OK ) return rc; + + if( readerAtEnd(&c->result)){ + c->eof = 1; + return SQLITE_OK; + } + iDocid = readDocid(&c->result); + rc = sqlite3_bind_int64(c->pStmt, 1, iDocid); + if( rc!=SQLITE_OK ) return rc; + /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ + rc = sqlite3_step(c->pStmt); + if( rc==SQLITE_ROW ){ /* the case we expect */ + c->eof = 0; + return SQLITE_OK; + } + /* an error occurred; abort */ + return rc==SQLITE_DONE ? SQLITE_ERROR : rc; + default: + assert( 0 ); + return SQLITE_ERROR; /* not reached */ + } +} + +static int term_select_doclist(fulltext_vtab *v, const char *pTerm, int nTerm, + sqlite3_stmt **ppStmt){ + int rc; + if( *ppStmt ){ + rc = sqlite3_reset(*ppStmt); + } else { + rc = sql_prepare(v->db, v->zName, ppStmt, + "select doclist from %_term where term = ? order by first"); + } + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_text(*ppStmt, 1, pTerm, nTerm, SQLITE_TRANSIENT); + if( rc!=SQLITE_OK ) return rc; + + return sqlite3_step(*ppStmt); /* TODO(adamd): handle schema error */ +} + +/* Read the posting list for [zTerm]; AND it with the doclist [in] to + * produce the doclist [out], using the given offset [iOffset] for phrase + * matching. + * (*pSelect) is used to hold an SQLite statement used inside this function; + * the caller should initialize *pSelect to NULL before the first call. + */ +static int query_merge(fulltext_vtab *v, sqlite3_stmt **pSelect, + const char *zTerm, + DocList *pIn, int iOffset, DocList *out){ + int rc; + DocListMerge merge; + + if( pIn!=NULL && !pIn->nData ){ + /* If [pIn] is already empty, there's no point in reading the + * posting list to AND it in; return immediately. */ + return SQLITE_OK; + } + + rc = term_select_doclist(v, zTerm, -1, pSelect); + if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ) return rc; + + mergeInit(&merge, pIn, iOffset, out); + while( rc==SQLITE_ROW ){ + DocList block; + docListInit(&block, DL_POSITIONS_OFFSETS, + sqlite3_column_blob(*pSelect, 0), + sqlite3_column_bytes(*pSelect, 0)); + mergeBlock(&merge, &block); + docListDestroy(&block); + + rc = sqlite3_step(*pSelect); + if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ){ + return rc; + } + } + + return SQLITE_OK; +} + +typedef struct QueryTerm { + int is_phrase; /* true if this term begins a new phrase */ + const char *zTerm; +} QueryTerm; + +/* A parsed query. + * + * As an example, parsing the query ["four score" years "new nation"] will + * yield a Query with 5 terms: + * "four", is_phrase = 1 + * "score", is_phrase = 0 + * "years", is_phrase = 1 + * "new", is_phrase = 1 + * "nation", is_phrase = 0 + */ +typedef struct Query { + int nTerms; + QueryTerm *pTerm; +} Query; + +static void query_add(Query *q, int is_phrase, const char *zTerm){ + QueryTerm *t; + ++q->nTerms; + q->pTerm = realloc(q->pTerm, q->nTerms * sizeof(q->pTerm[0])); + t = &q->pTerm[q->nTerms - 1]; + t->is_phrase = is_phrase; + t->zTerm = zTerm; +} + +static void query_free(Query *q){ + int i; + for(i = 0; i < q->nTerms; ++i){ + free((void *) q->pTerm[i].zTerm); + } + free(q->pTerm); +} + +static int tokenize_segment(sqlite3_tokenizer *pTokenizer, + const char *zQuery, int in_phrase, + Query *pQuery){ + sqlite3_tokenizer_module *pModule = pTokenizer->pModule; + sqlite3_tokenizer_cursor *pCursor; + int is_first = 1; + + int rc = pModule->xOpen(pTokenizer, zQuery, -1, &pCursor); + if( rc!=SQLITE_OK ) return rc; + pCursor->pTokenizer = pTokenizer; + + while( 1 ){ + const char *zToken; + int nToken, iStartOffset, iEndOffset, dummy_pos; + + rc = pModule->xNext(pCursor, + &zToken, &nToken, + &iStartOffset, &iEndOffset, + &dummy_pos); + if( rc!=SQLITE_OK ) break; + query_add(pQuery, !in_phrase || is_first, string_dup_n(zToken, nToken)); + is_first = 0; + } + + return pModule->xClose(pCursor); +} + +/* Parse a query string, yielding a Query object. */ +static int parse_query(fulltext_vtab *v, const char *zQuery, Query *pQuery){ + char *zQuery1 = string_dup(zQuery); + int in_phrase = 0; + char *s = zQuery1; + pQuery->nTerms = 0; + pQuery->pTerm = NULL; + + while( *s ){ + char *t = s; + while( *t ){ + if( *t=='"' ){ + *t++ = '\0'; + break; + } + ++t; + } + if( *s ){ + tokenize_segment(v->pTokenizer, s, in_phrase, pQuery); + } + s = t; + in_phrase = !in_phrase; + } + + free(zQuery1); + return SQLITE_OK; +} + +/* Perform a full-text query; return a list of documents in [pResult]. */ +static int fulltext_query(fulltext_vtab *v, const char *zQuery, + DocList **pResult){ + Query q; + int phrase_start = -1; + int i; + sqlite3_stmt *pSelect = NULL; + DocList *d = NULL; + + int rc = parse_query(v, zQuery, &q); + if( rc!=SQLITE_OK ) return rc; + + /* Merge terms. */ + for(i = 0 ; i < q.nTerms ; ++i){ + /* In each merge step, we need to generate positions whenever we're + * processing a phrase which hasn't ended yet. */ + int need_positions = iiCursorType = idxNum; + switch( idxNum ){ + case QUERY_GENERIC: + zStatement = "select rowid, content from %_content"; + break; + + case QUERY_FULLTEXT: /* full-text search */ + { + const char *zQuery = (const char *)sqlite3_value_text(argv[0]); + DocList *pResult; + assert( argc==1 ); + rc = fulltext_query(v, zQuery, &pResult); + if( rc!=SQLITE_OK ) return rc; + readerInit(&c->result, pResult); + zStatement = "select rowid, content from %_content where rowid = ?"; + break; + } + + default: + assert( 0 ); + } + + rc = sql_prepare(v->db, v->zName, &c->pStmt, zStatement); + if( rc!=SQLITE_OK ) return rc; + + return fulltextNext(pCursor); +} + +static int fulltextEof(sqlite3_vtab_cursor *pCursor){ + fulltext_cursor *c = (fulltext_cursor *) pCursor; + return c->eof; +} + +static int fulltextColumn(sqlite3_vtab_cursor *pCursor, + sqlite3_context *pContext, int idxCol){ + fulltext_cursor *c = (fulltext_cursor *) pCursor; + const char *s; + + assert( idxCol==0 ); + s = (const char *) sqlite3_column_text(c->pStmt, 1); + sqlite3_result_text(pContext, s, -1, SQLITE_TRANSIENT); + + return SQLITE_OK; +} + +static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){ + fulltext_cursor *c = (fulltext_cursor *) pCursor; + + *pRowid = sqlite3_column_int64(c->pStmt, 0); + return SQLITE_OK; +} + +/* Build a hash table containing all terms in zText. */ +static int build_terms(fts1Hash *terms, sqlite3_tokenizer *pTokenizer, + const char *zText, sqlite_int64 iDocid){ + sqlite3_tokenizer_cursor *pCursor; + const char *pToken; + int nTokenBytes; + int iStartOffset, iEndOffset, iPosition; + + int rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor); + if( rc!=SQLITE_OK ) return rc; + + pCursor->pTokenizer = pTokenizer; + fts1HashInit(terms, FTS1_HASH_STRING, 1); + while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor, + &pToken, &nTokenBytes, + &iStartOffset, &iEndOffset, + &iPosition) ){ + DocList *p; + + /* Positions can't be negative; we use -1 as a terminator internally. */ + if( iPosition<0 ) { + rc = SQLITE_ERROR; + goto err; + } + + p = fts1HashFind(terms, pToken, nTokenBytes); + if( p==NULL ){ + p = docListNew(DL_POSITIONS_OFFSETS); + docListAddDocid(p, iDocid); + fts1HashInsert(terms, pToken, nTokenBytes, p); + } + docListAddPosOffset(p, iPosition, iStartOffset, iEndOffset); + } + +err: + /* TODO(shess) Check return? Should this be able to cause errors at + ** this point? Actually, same question about sqlite3_finalize(), + ** though one could argue that failure there means that the data is + ** not durable. *ponder* + */ + pTokenizer->pModule->xClose(pCursor); + return rc; +} +/* Update the %_terms table to map the term [zTerm] to the given rowid. */ +static int index_insert_term(fulltext_vtab *v, const char *zTerm, int nTerm, + sqlite_int64 iDocid, DocList *p){ + sqlite_int64 iFirst; + sqlite_int64 iIndexRow; + DocList doclist; + + int rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iFirst); + if( rc==SQLITE_DONE ){ + docListInit(&doclist, DL_POSITIONS_OFFSETS, 0, 0); + if( docListUpdate(&doclist, iDocid, p) ){ + rc = term_insert(v, zTerm, nTerm, iDocid, &doclist); + docListDestroy(&doclist); + return rc; + } + return SQLITE_OK; + } + if( rc!=SQLITE_ROW ) return SQLITE_ERROR; + + /* This word is in the index; add this document ID to its blob. */ + + rc = term_select(v, zTerm, nTerm, iFirst, &iIndexRow, &doclist); + if( rc!=SQLITE_OK ) return rc; + + if( docListUpdate(&doclist, iDocid, p) ){ + /* If the blob is too big, split it in half. */ + if( doclist.nData>CHUNK_MAX ){ + DocList half; + if( docListSplit(&doclist, &half) ){ + rc = term_insert(v, zTerm, nTerm, firstDocid(&half), &half); + docListDestroy(&half); + if( rc!=SQLITE_OK ) goto err; + } + } + rc = term_update(v, iIndexRow, &doclist); + } + +err: + docListDestroy(&doclist); + return rc; +} + +/* Insert a row into the full-text index; set *piRowid to be the ID of the + * new row. */ +static int index_insert(fulltext_vtab *v, + sqlite3_value *pRequestRowid, const char *zText, + sqlite_int64 *piRowid){ + fts1Hash terms; /* maps term string -> PosList */ + fts1HashElem *e; + + int rc = content_insert(v, pRequestRowid, zText, -1); + if( rc!=SQLITE_OK ) return rc; + *piRowid = sqlite3_last_insert_rowid(v->db); + + if( !zText ) return SQLITE_OK; /* nothing to index */ + + rc = build_terms(&terms, v->pTokenizer, zText, *piRowid); + if( rc!=SQLITE_OK ) return rc; + + for(e=fts1HashFirst(&terms); e; e=fts1HashNext(e)){ + DocList *p = fts1HashData(e); + rc = index_insert_term(v, fts1HashKey(e), fts1HashKeysize(e), *piRowid, p); + if( rc!=SQLITE_OK ) break; + } + + for(e=fts1HashFirst(&terms); e; e=fts1HashNext(e)){ + DocList *p = fts1HashData(e); + docListDelete(p); + } + fts1HashClear(&terms); + return rc; +} + +static int index_delete_term(fulltext_vtab *v, const char *zTerm, int nTerm, + sqlite_int64 iDocid){ + sqlite_int64 iFirst; + sqlite_int64 iIndexRow; + DocList doclist; + + int rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iFirst); + if( rc!=SQLITE_ROW ) return SQLITE_ERROR; + + rc = term_select(v, zTerm, nTerm, iFirst, &iIndexRow, &doclist); + if( rc!=SQLITE_OK ) return rc; + + if( docListUpdate(&doclist, iDocid, NULL) ){ + if( doclist.nData>0 ){ + rc = term_update(v, iIndexRow, &doclist); + } else { /* empty posting list */ + rc = term_delete(v, iIndexRow); + } + } + docListDestroy(&doclist); + return rc; +} + +/* Delete a row from the full-text index. */ +static int index_delete(fulltext_vtab *v, sqlite_int64 iRow){ + char *zText; + fts1Hash terms; + fts1HashElem *e; + + int rc = content_select(v, iRow, &zText); + if( rc!=SQLITE_OK ) return rc; + + rc = build_terms(&terms, v->pTokenizer, zText, iRow); + free(zText); + if( rc!=SQLITE_OK ) return rc; + + for(e=fts1HashFirst(&terms); e; e=fts1HashNext(e)){ + rc = index_delete_term(v, fts1HashKey(e), fts1HashKeysize(e), iRow); + if( rc!=SQLITE_OK ) break; + } + for(e=fts1HashFirst(&terms); e; e=fts1HashNext(e)){ + DocList *p = fts1HashData(e); + docListDelete(p); + } + fts1HashClear(&terms); + + return content_delete(v, iRow); +} + +static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg, + sqlite_int64 *pRowid){ + fulltext_vtab *v = (fulltext_vtab *) pVtab; + + if( nArg<2 ){ + return index_delete(v, sqlite3_value_int64(ppArg[0])); + } + + if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){ + return SQLITE_ERROR; /* an update; not yet supported */ + } + + assert( nArg==3 ); /* ppArg[1] = rowid, ppArg[2] = content */ + return index_insert(v, ppArg[1], + (const char *)sqlite3_value_text(ppArg[2]), pRowid); +} + +static sqlite3_module fulltextModule = { + 0, + fulltextCreate, + fulltextConnect, + fulltextBestIndex, + fulltextDisconnect, + fulltextDestroy, + fulltextOpen, + fulltextClose, + fulltextFilter, + fulltextNext, + fulltextEof, + fulltextColumn, + fulltextRowid, + fulltextUpdate +}; + +int sqlite3Fts1Init(sqlite3 *db){ + return sqlite3_create_module(db, "fts1", &fulltextModule, 0); +} + +#if !SQLITE_CORE +int sqlite3_extension_init(sqlite3 *db, char **pzErrMsg, + const sqlite3_api_routines *pApi){ + SQLITE_EXTENSION_INIT2(pApi) + return sqlite3Fts1Init(db); +} +#endif + +#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */ diff --git a/ext/fts1/fts1.h b/ext/fts1/fts1.h new file mode 100644 index 0000000000..a0dfb27622 --- /dev/null +++ b/ext/fts1/fts1.h @@ -0,0 +1,11 @@ +#include "sqlite3.h" + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +int fts1_init(sqlite3 *db); + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ diff --git a/ext/fts1/fts1_hash.c b/ext/fts1/fts1_hash.c new file mode 100644 index 0000000000..463a52b645 --- /dev/null +++ b/ext/fts1/fts1_hash.c @@ -0,0 +1,369 @@ +/* +** 2001 September 22 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. +** May you share freely, never taking more than you give. +** +************************************************************************* +** This is the implementation of generic hash-tables used in SQLite. +** We've modified it slightly to serve as a standalone hash table +** implementation for the full-text indexing module. +*/ +#include +#include +#include + +/* +** The code in this file is only compiled if: +** +** * The FTS1 module is being built as an extension +** (in which case SQLITE_CORE is not defined), or +** +** * The FTS1 module is being built into the core of +** SQLite (in which case SQLITE_ENABLE_FTS1 is defined). +*/ +#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) + + +#include "fts1_hash.h" + +static void *malloc_and_zero(int n){ + void *p = malloc(n); + if( p ){ + memset(p, 0, n); + } + return p; +} + +/* Turn bulk memory into a hash table object by initializing the +** fields of the Hash structure. +** +** "pNew" is a pointer to the hash table that is to be initialized. +** keyClass is one of the constants +** FTS1_HASH_BINARY or FTS1_HASH_STRING. The value of keyClass +** determines what kind of key the hash table will use. "copyKey" is +** true if the hash table should make its own private copy of keys and +** false if it should just use the supplied pointer. +*/ +void sqlite3Fts1HashInit(fts1Hash *pNew, int keyClass, int copyKey){ + assert( pNew!=0 ); + assert( keyClass>=FTS1_HASH_STRING && keyClass<=FTS1_HASH_BINARY ); + pNew->keyClass = keyClass; + pNew->copyKey = copyKey; + pNew->first = 0; + pNew->count = 0; + pNew->htsize = 0; + pNew->ht = 0; + pNew->xMalloc = malloc_and_zero; + pNew->xFree = free; +} + +/* Remove all entries from a hash table. Reclaim all memory. +** Call this routine to delete a hash table or to reset a hash table +** to the empty state. +*/ +void sqlite3Fts1HashClear(fts1Hash *pH){ + fts1HashElem *elem; /* For looping over all elements of the table */ + + assert( pH!=0 ); + elem = pH->first; + pH->first = 0; + if( pH->ht ) pH->xFree(pH->ht); + pH->ht = 0; + pH->htsize = 0; + while( elem ){ + fts1HashElem *next_elem = elem->next; + if( pH->copyKey && elem->pKey ){ + pH->xFree(elem->pKey); + } + pH->xFree(elem); + elem = next_elem; + } + pH->count = 0; +} + +/* +** Hash and comparison functions when the mode is FTS1_HASH_STRING +*/ +static int strHash(const void *pKey, int nKey){ + const char *z = (const char *)pKey; + int h = 0; + if( nKey<=0 ) nKey = (int) strlen(z); + while( nKey > 0 ){ + h = (h<<3) ^ h ^ *z++; + nKey--; + } + return h & 0x7fffffff; +} +static int strCompare(const void *pKey1, int n1, const void *pKey2, int n2){ + if( n1!=n2 ) return 1; + return strncmp((const char*)pKey1,(const char*)pKey2,n1); +} + +/* +** Hash and comparison functions when the mode is FTS1_HASH_BINARY +*/ +static int binHash(const void *pKey, int nKey){ + int h = 0; + const char *z = (const char *)pKey; + while( nKey-- > 0 ){ + h = (h<<3) ^ h ^ *(z++); + } + return h & 0x7fffffff; +} +static int binCompare(const void *pKey1, int n1, const void *pKey2, int n2){ + if( n1!=n2 ) return 1; + return memcmp(pKey1,pKey2,n1); +} + +/* +** Return a pointer to the appropriate hash function given the key class. +** +** The C syntax in this function definition may be unfamilar to some +** programmers, so we provide the following additional explanation: +** +** The name of the function is "hashFunction". The function takes a +** single parameter "keyClass". The return value of hashFunction() +** is a pointer to another function. Specifically, the return value +** of hashFunction() is a pointer to a function that takes two parameters +** with types "const void*" and "int" and returns an "int". +*/ +static int (*hashFunction(int keyClass))(const void*,int){ + if( keyClass==FTS1_HASH_STRING ){ + return &strHash; + }else{ + assert( keyClass==FTS1_HASH_BINARY ); + return &binHash; + } +} + +/* +** Return a pointer to the appropriate hash function given the key class. +** +** For help in interpreted the obscure C code in the function definition, +** see the header comment on the previous function. +*/ +static int (*compareFunction(int keyClass))(const void*,int,const void*,int){ + if( keyClass==FTS1_HASH_STRING ){ + return &strCompare; + }else{ + assert( keyClass==FTS1_HASH_BINARY ); + return &binCompare; + } +} + +/* Link an element into the hash table +*/ +static void insertElement( + fts1Hash *pH, /* The complete hash table */ + struct _fts1ht *pEntry, /* The entry into which pNew is inserted */ + fts1HashElem *pNew /* The element to be inserted */ +){ + fts1HashElem *pHead; /* First element already in pEntry */ + pHead = pEntry->chain; + if( pHead ){ + pNew->next = pHead; + pNew->prev = pHead->prev; + if( pHead->prev ){ pHead->prev->next = pNew; } + else { pH->first = pNew; } + pHead->prev = pNew; + }else{ + pNew->next = pH->first; + if( pH->first ){ pH->first->prev = pNew; } + pNew->prev = 0; + pH->first = pNew; + } + pEntry->count++; + pEntry->chain = pNew; +} + + +/* Resize the hash table so that it cantains "new_size" buckets. +** "new_size" must be a power of 2. The hash table might fail +** to resize if sqliteMalloc() fails. +*/ +static void rehash(fts1Hash *pH, int new_size){ + struct _fts1ht *new_ht; /* The new hash table */ + fts1HashElem *elem, *next_elem; /* For looping over existing elements */ + int (*xHash)(const void*,int); /* The hash function */ + + assert( (new_size & (new_size-1))==0 ); + new_ht = (struct _fts1ht *)pH->xMalloc( new_size*sizeof(struct _fts1ht) ); + if( new_ht==0 ) return; + if( pH->ht ) pH->xFree(pH->ht); + pH->ht = new_ht; + pH->htsize = new_size; + xHash = hashFunction(pH->keyClass); + for(elem=pH->first, pH->first=0; elem; elem = next_elem){ + int h = (*xHash)(elem->pKey, elem->nKey) & (new_size-1); + next_elem = elem->next; + insertElement(pH, &new_ht[h], elem); + } +} + +/* This function (for internal use only) locates an element in an +** hash table that matches the given key. The hash for this key has +** already been computed and is passed as the 4th parameter. +*/ +static fts1HashElem *findElementGivenHash( + const fts1Hash *pH, /* The pH to be searched */ + const void *pKey, /* The key we are searching for */ + int nKey, + int h /* The hash for this key. */ +){ + fts1HashElem *elem; /* Used to loop thru the element list */ + int count; /* Number of elements left to test */ + int (*xCompare)(const void*,int,const void*,int); /* comparison function */ + + if( pH->ht ){ + struct _fts1ht *pEntry = &pH->ht[h]; + elem = pEntry->chain; + count = pEntry->count; + xCompare = compareFunction(pH->keyClass); + while( count-- && elem ){ + if( (*xCompare)(elem->pKey,elem->nKey,pKey,nKey)==0 ){ + return elem; + } + elem = elem->next; + } + } + return 0; +} + +/* Remove a single entry from the hash table given a pointer to that +** element and a hash on the element's key. +*/ +static void removeElementGivenHash( + fts1Hash *pH, /* The pH containing "elem" */ + fts1HashElem* elem, /* The element to be removed from the pH */ + int h /* Hash value for the element */ +){ + struct _fts1ht *pEntry; + if( elem->prev ){ + elem->prev->next = elem->next; + }else{ + pH->first = elem->next; + } + if( elem->next ){ + elem->next->prev = elem->prev; + } + pEntry = &pH->ht[h]; + if( pEntry->chain==elem ){ + pEntry->chain = elem->next; + } + pEntry->count--; + if( pEntry->count<=0 ){ + pEntry->chain = 0; + } + if( pH->copyKey && elem->pKey ){ + pH->xFree(elem->pKey); + } + pH->xFree( elem ); + pH->count--; + if( pH->count<=0 ){ + assert( pH->first==0 ); + assert( pH->count==0 ); + fts1HashClear(pH); + } +} + +/* Attempt to locate an element of the hash table pH with a key +** that matches pKey,nKey. Return the data for this element if it is +** found, or NULL if there is no match. +*/ +void *sqlite3Fts1HashFind(const fts1Hash *pH, const void *pKey, int nKey){ + int h; /* A hash on key */ + fts1HashElem *elem; /* The element that matches key */ + int (*xHash)(const void*,int); /* The hash function */ + + if( pH==0 || pH->ht==0 ) return 0; + xHash = hashFunction(pH->keyClass); + assert( xHash!=0 ); + h = (*xHash)(pKey,nKey); + assert( (pH->htsize & (pH->htsize-1))==0 ); + elem = findElementGivenHash(pH,pKey,nKey, h & (pH->htsize-1)); + return elem ? elem->data : 0; +} + +/* Insert an element into the hash table pH. The key is pKey,nKey +** and the data is "data". +** +** If no element exists with a matching key, then a new +** element is created. A copy of the key is made if the copyKey +** flag is set. NULL is returned. +** +** If another element already exists with the same key, then the +** new data replaces the old data and the old data is returned. +** The key is not copied in this instance. If a malloc fails, then +** the new data is returned and the hash table is unchanged. +** +** If the "data" parameter to this function is NULL, then the +** element corresponding to "key" is removed from the hash table. +*/ +void *sqlite3Fts1HashInsert( + fts1Hash *pH, /* The hash table to insert into */ + const void *pKey, /* The key */ + int nKey, /* Number of bytes in the key */ + void *data /* The data */ +){ + int hraw; /* Raw hash value of the key */ + int h; /* the hash of the key modulo hash table size */ + fts1HashElem *elem; /* Used to loop thru the element list */ + fts1HashElem *new_elem; /* New element added to the pH */ + int (*xHash)(const void*,int); /* The hash function */ + + assert( pH!=0 ); + xHash = hashFunction(pH->keyClass); + assert( xHash!=0 ); + hraw = (*xHash)(pKey, nKey); + assert( (pH->htsize & (pH->htsize-1))==0 ); + h = hraw & (pH->htsize-1); + elem = findElementGivenHash(pH,pKey,nKey,h); + if( elem ){ + void *old_data = elem->data; + if( data==0 ){ + removeElementGivenHash(pH,elem,h); + }else{ + elem->data = data; + } + return old_data; + } + if( data==0 ) return 0; + new_elem = (fts1HashElem*)pH->xMalloc( sizeof(fts1HashElem) ); + if( new_elem==0 ) return data; + if( pH->copyKey && pKey!=0 ){ + new_elem->pKey = pH->xMalloc( nKey ); + if( new_elem->pKey==0 ){ + pH->xFree(new_elem); + return data; + } + memcpy((void*)new_elem->pKey, pKey, nKey); + }else{ + new_elem->pKey = (void*)pKey; + } + new_elem->nKey = nKey; + pH->count++; + if( pH->htsize==0 ){ + rehash(pH,8); + if( pH->htsize==0 ){ + pH->count = 0; + pH->xFree(new_elem); + return data; + } + } + if( pH->count > pH->htsize ){ + rehash(pH,pH->htsize*2); + } + assert( pH->htsize>0 ); + assert( (pH->htsize & (pH->htsize-1))==0 ); + h = hraw & (pH->htsize-1); + insertElement(pH, &pH->ht[h], new_elem); + new_elem->data = data; + return 0; +} + +#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */ diff --git a/ext/fts1/fts1_hash.h b/ext/fts1/fts1_hash.h new file mode 100644 index 0000000000..c31c430f7c --- /dev/null +++ b/ext/fts1/fts1_hash.h @@ -0,0 +1,112 @@ +/* +** 2001 September 22 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. +** May you share freely, never taking more than you give. +** +************************************************************************* +** This is the header file for the generic hash-table implemenation +** used in SQLite. We've modified it slightly to serve as a standalone +** hash table implementation for the full-text indexing module. +** +*/ +#ifndef _FTS1_HASH_H_ +#define _FTS1_HASH_H_ + +/* Forward declarations of structures. */ +typedef struct fts1Hash fts1Hash; +typedef struct fts1HashElem fts1HashElem; + +/* A complete hash table is an instance of the following structure. +** The internals of this structure are intended to be opaque -- client +** code should not attempt to access or modify the fields of this structure +** directly. Change this structure only by using the routines below. +** However, many of the "procedures" and "functions" for modifying and +** accessing this structure are really macros, so we can't really make +** this structure opaque. +*/ +struct fts1Hash { + char keyClass; /* HASH_INT, _POINTER, _STRING, _BINARY */ + char copyKey; /* True if copy of key made on insert */ + int count; /* Number of entries in this table */ + fts1HashElem *first; /* The first element of the array */ + void *(*xMalloc)(int); /* malloc() function to use */ + void (*xFree)(void *); /* free() function to use */ + int htsize; /* Number of buckets in the hash table */ + struct _fts1ht { /* the hash table */ + int count; /* Number of entries with this hash */ + fts1HashElem *chain; /* Pointer to first entry with this hash */ + } *ht; +}; + +/* Each element in the hash table is an instance of the following +** structure. All elements are stored on a single doubly-linked list. +** +** Again, this structure is intended to be opaque, but it can't really +** be opaque because it is used by macros. +*/ +struct fts1HashElem { + fts1HashElem *next, *prev; /* Next and previous elements in the table */ + void *data; /* Data associated with this element */ + void *pKey; int nKey; /* Key associated with this element */ +}; + +/* +** There are 2 different modes of operation for a hash table: +** +** FTS1_HASH_STRING pKey points to a string that is nKey bytes long +** (including the null-terminator, if any). Case +** is respected in comparisons. +** +** FTS1_HASH_BINARY pKey points to binary data nKey bytes long. +** memcmp() is used to compare keys. +** +** A copy of the key is made if the copyKey parameter to fts1HashInit is 1. +*/ +#define FTS1_HASH_STRING 1 +#define FTS1_HASH_BINARY 2 + +/* +** Access routines. To delete, insert a NULL pointer. +*/ +void sqlite3Fts1HashInit(fts1Hash*, int keytype, int copyKey); +void *sqlite3Fts1HashInsert(fts1Hash*, const void *pKey, int nKey, void *pData); +void *sqlite3Fts1HashFind(const fts1Hash*, const void *pKey, int nKey); +void sqlite3Fts1HashClear(fts1Hash*); + +/* +** Shorthand for the functions above +*/ +#define fts1HashInit sqlite3Fts1HashInit +#define fts1HashInsert sqlite3Fts1HashInsert +#define fts1HashFind sqlite3Fts1HashFind +#define fts1HashClear sqlite3Fts1HashClear + +/* +** Macros for looping over all elements of a hash table. The idiom is +** like this: +** +** fts1Hash h; +** fts1HashElem *p; +** ... +** for(p=fts1HashFirst(&h); p; p=fts1HashNext(p)){ +** SomeStructure *pData = fts1HashData(p); +** // do something with pData +** } +*/ +#define fts1HashFirst(H) ((H)->first) +#define fts1HashNext(E) ((E)->next) +#define fts1HashData(E) ((E)->data) +#define fts1HashKey(E) ((E)->pKey) +#define fts1HashKeysize(E) ((E)->nKey) + +/* +** Number of entries in a hash table +*/ +#define fts1HashCount(H) ((H)->count) + +#endif /* _FTS1_HASH_H_ */ diff --git a/ext/fts1/fts1_tokenizer.h b/ext/fts1/fts1_tokenizer.h new file mode 100644 index 0000000000..5e5010f759 --- /dev/null +++ b/ext/fts1/fts1_tokenizer.h @@ -0,0 +1,89 @@ +/* +** 2006 July 10 +** +** The author disclaims copyright to this source code. +** +************************************************************************* +** Defines the interface to tokenizers used by fulltext-search. There +** are three basic components: +** +** sqlite3_tokenizer_module is a singleton defining the tokenizer +** interface functions. This is essentially the class structure for +** tokenizers. +** +** sqlite3_tokenizer is used to define a particular tokenizer, perhaps +** including customization information defined at creation time. +** +** sqlite3_tokenizer_cursor is generated by a tokenizer to generate +** tokens from a particular input. +*/ +#ifndef _FTS1_TOKENIZER_H_ +#define _FTS1_TOKENIZER_H_ + +/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time. +** If tokenizers are to be allowed to call sqlite3_*() functions, then +** we will need a way to register the API consistently. +*/ +#include "sqlite3.h" + +/* +** Structures used by the tokenizer interface. +*/ +typedef struct sqlite3_tokenizer sqlite3_tokenizer; +typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor; +typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module; + +struct sqlite3_tokenizer_module { + int iVersion; /* currently 0 */ + + /* + ** Create and destroy a tokenizer. argc/argv are passed down from + ** the fulltext virtual table creation to allow customization. + */ + int (*xCreate)(int argc, const char **argv, + sqlite3_tokenizer **ppTokenizer); + int (*xDestroy)(sqlite3_tokenizer *pTokenizer); + + /* + ** Tokenize a particular input. Call xOpen() to prepare to + ** tokenize, xNext() repeatedly until it returns SQLITE_DONE, then + ** xClose() to free any internal state. The pInput passed to + ** xOpen() must exist until the cursor is closed. The ppToken + ** result from xNext() is only valid until the next call to xNext() + ** or until xClose() is called. + */ + /* TODO(shess) current implementation requires pInput to be + ** nul-terminated. This should either be fixed, or pInput/nBytes + ** should be converted to zInput. + */ + int (*xOpen)(sqlite3_tokenizer *pTokenizer, + const char *pInput, int nBytes, + sqlite3_tokenizer_cursor **ppCursor); + int (*xClose)(sqlite3_tokenizer_cursor *pCursor); + int (*xNext)(sqlite3_tokenizer_cursor *pCursor, + const char **ppToken, int *pnBytes, + int *piStartOffset, int *piEndOffset, int *piPosition); +}; + +struct sqlite3_tokenizer { + sqlite3_tokenizer_module *pModule; /* The module for this tokenizer */ + /* Tokenizer implementations will typically add additional fields */ +}; + +struct sqlite3_tokenizer_cursor { + sqlite3_tokenizer *pTokenizer; /* Tokenizer for this cursor. */ + /* Tokenizer implementations will typically add additional fields */ +}; + +/* +** Get the module for a tokenizer which generates tokens based on a +** set of non-token characters. The default is to break tokens at any +** non-alnum character, though the set of delimiters can also be +** specified by the first argv argument to xCreate(). +*/ +/* TODO(shess) This doesn't belong here. Need some sort of +** registration process. +*/ +void sqlite3Fts1SimpleTokenizerModule(sqlite3_tokenizer_module **ppModule); + +#endif /* _FTS1_TOKENIZER_H_ */ diff --git a/ext/fts1/fts1_tokenizer1.c b/ext/fts1/fts1_tokenizer1.c new file mode 100644 index 0000000000..f3e35b1e46 --- /dev/null +++ b/ext/fts1/fts1_tokenizer1.c @@ -0,0 +1,188 @@ +/* +** The author disclaims copyright to this source code. +** +************************************************************************* +** Implementation of the "simple" full-text-search tokenizer. +*/ + +/* +** The code in this file is only compiled if: +** +** * The FTS1 module is being built as an extension +** (in which case SQLITE_CORE is not defined), or +** +** * The FTS1 module is being built into the core of +** SQLite (in which case SQLITE_ENABLE_FTS1 is defined). +*/ +#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) + + +#include +#if !defined(__APPLE__) +#include +#else +#include +#endif +#include +#include +#include + +#include "fts1_tokenizer.h" + +/* Duplicate a string; the caller must free() the returned string. + * (We don't use strdup() since it's not part of the standard C library and + * may not be available everywhere.) */ +/* TODO(shess) Copied from fulltext.c, consider util.c for such +** things. */ +static char *string_dup(const char *s){ + char *str = malloc(strlen(s) + 1); + strcpy(str, s); + return str; +} + +typedef struct simple_tokenizer { + sqlite3_tokenizer base; + const char *zDelim; /* token delimiters */ +} simple_tokenizer; + +typedef struct simple_tokenizer_cursor { + sqlite3_tokenizer_cursor base; + const char *pInput; /* input we are tokenizing */ + int nBytes; /* size of the input */ + const char *pCurrent; /* current position in pInput */ + int iToken; /* index of next token to be returned */ + char *zToken; /* storage for current token */ + int nTokenBytes; /* actual size of current token */ + int nTokenAllocated; /* space allocated to zToken buffer */ +} simple_tokenizer_cursor; + +static sqlite3_tokenizer_module simpleTokenizerModule;/* forward declaration */ + +static int simpleCreate( + int argc, const char **argv, + sqlite3_tokenizer **ppTokenizer +){ + simple_tokenizer *t; + + t = (simple_tokenizer *) malloc(sizeof(simple_tokenizer)); + /* TODO(shess) Delimiters need to remain the same from run to run, + ** else we need to reindex. One solution would be a meta-table to + ** track such information in the database, then we'd only want this + ** information on the initial create. + */ + if( argc>1 ){ + t->zDelim = string_dup(argv[1]); + } else { + /* Build a string excluding alphanumeric ASCII characters */ + char zDelim[0x80]; /* nul-terminated, so nul not a member */ + int i, j; + for(i=1, j=0; i<0x80; i++){ + if( !isalnum(i) ){ + zDelim[j++] = i; + } + } + zDelim[j++] = '\0'; + assert( j<=sizeof(zDelim) ); + t->zDelim = string_dup(zDelim); + } + + *ppTokenizer = &t->base; + return SQLITE_OK; +} + +static int simpleDestroy(sqlite3_tokenizer *pTokenizer){ + simple_tokenizer *t = (simple_tokenizer *) pTokenizer; + + free((void *) t->zDelim); + free(t); + + return SQLITE_OK; +} + +static int simpleOpen( + sqlite3_tokenizer *pTokenizer, + const char *pInput, int nBytes, + sqlite3_tokenizer_cursor **ppCursor +){ + simple_tokenizer_cursor *c; + + c = (simple_tokenizer_cursor *) malloc(sizeof(simple_tokenizer_cursor)); + c->pInput = pInput; + c->nBytes = nBytes<0 ? (int) strlen(pInput) : nBytes; + c->pCurrent = c->pInput; /* start tokenizing at the beginning */ + c->iToken = 0; + c->zToken = NULL; /* no space allocated, yet. */ + c->nTokenBytes = 0; + c->nTokenAllocated = 0; + + *ppCursor = &c->base; + return SQLITE_OK; +} + +static int simpleClose(sqlite3_tokenizer_cursor *pCursor){ + simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor; + + if( NULL!=c->zToken ){ + free(c->zToken); + } + free(c); + + return SQLITE_OK; +} + +static int simpleNext( + sqlite3_tokenizer_cursor *pCursor, + const char **ppToken, int *pnBytes, + int *piStartOffset, int *piEndOffset, int *piPosition +){ + simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor; + simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer; + int ii; + + while( c->pCurrent-c->pInputnBytes ){ + int n = (int) strcspn(c->pCurrent, t->zDelim); + if( n>0 ){ + if( n+1>c->nTokenAllocated ){ + c->zToken = realloc(c->zToken, n+1); + } + for(ii=0; iipCurrent[ii]; + c->zToken[ii] = (unsigned char)ch<0x80 ? tolower(ch) : ch; + } + c->zToken[n] = '\0'; + *ppToken = c->zToken; + *pnBytes = n; + *piStartOffset = (int) (c->pCurrent-c->pInput); + *piEndOffset = *piStartOffset+n; + *piPosition = c->iToken++; + c->pCurrent += n + 1; + + return SQLITE_OK; + } + c->pCurrent += n + 1; + /* TODO(shess) could strspn() to skip delimiters en masse. Needs + ** to happen in two places, though, which is annoying. + */ + } + return SQLITE_DONE; +} + +static sqlite3_tokenizer_module simpleTokenizerModule = { + 0, + simpleCreate, + simpleDestroy, + simpleOpen, + simpleClose, + simpleNext, +}; + +void sqlite3Fts1SimpleTokenizerModule( + sqlite3_tokenizer_module **ppModule +){ + *ppModule = &simpleTokenizerModule; +} + +#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */ diff --git a/manifest b/manifest index 7ae5f04eca..2a47e37ca9 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Just\sdon't\srun\stolower()\son\shi-bit\scharacters.\s\sThis\sshouldn't\scause\nus\sto\sbreak\sany\sUTF-8\scode\spoints,\sunless\sthey\swere\salready\sbroken\sin\nthe\sinput.\s(CVS\s3376) -D 2006-08-30T21:40:30 +C Refactor\sthe\sFTS1\smodule\sso\sthat\sits\sname\sis\s"fts1"\sinstead\sof\s"fulltext",\nso\sthat\sall\ssymbols\swith\sexternal\slinkage\sbegin\swith\s"sqlite3Fts1",\sand\nso\sthat\sall\sfilenames\sbegin\swith\s"fts1".\s(CVS\s3377) +D 2006-08-31T15:07:14 F Makefile.in 8e7f9ecebab2c6e0f3db20ff129a8f9405ab64f8 F Makefile.linux-gcc 2d8574d1ba75f129aba2019f0b959db380a90935 F README 9c4e2d6706bdcc3efdd773ce752a8cdab4f90028 @@ -21,6 +21,12 @@ F ext/README.txt 913a7bd3f4837ab14d7e063304181787658b14e1 F ext/fts1/README.txt 20ac73b006a70bcfd80069bdaf59214b6cf1db5e F ext/fts1/ft_hash.c 3927bd880e65329bdc6f506555b228b28924921b F ext/fts1/ft_hash.h 1a35e654a235c2c662d3ca0dfc3138ad60b8b7d5 +F ext/fts1/fts1.c 8c6e1f62954c1cf7171167e55fa5fcfa1ddf4f2b +F ext/fts1/fts1.h fe8e8f38dd6d2d2645b9b0d6972e80985249575f +F ext/fts1/fts1_hash.c 3196cee866edbebb1c0521e21672e6d599965114 +F ext/fts1/fts1_hash.h 957d378355ed29f672cd5add012ce8b088a5e089 +F ext/fts1/fts1_tokenizer.h 887580709d32e44ca4ee3c7107c06ee444eb44c2 +F ext/fts1/fts1_tokenizer1.c 13789776e1afc4a4db04f2700eba24f9c5e77a90 F ext/fts1/fulltext.c d935e600d87bc86b7d64f55c7520ea41d6034c5c F ext/fts1/fulltext.h 08525a47852d1d62a0be81d3fc3fe2d23b094efd F ext/fts1/simple_tokenizer.c 1844d72f7194c3fd3d7e4173053911bf0661b70d @@ -84,7 +90,7 @@ F src/sqlite.h.in 84ac26ca94a84dd603fb57a27d862f51bfd9f687 F src/sqlite3ext.h 11a046b3519c4b9b7709e6d6a95c3a36366f684a F src/sqliteInt.h 325a2d45be2b22c1e21ad649e0a898c74eaec7de F src/table.c d8817f43a6c6bf139487db161760b9e1e02da3f1 -F src/tclsqlite.c a29488f25daecd397850d78525fe01422024963b +F src/tclsqlite.c fae9746f1da8ab888364a5ba02e691922e0a184e F src/test1.c 535294d7f21a4127082c4f7a57f225482df9cc36 F src/test2.c ca74a1d8aeb7d9606e8f6b762c5daf85c1a3f92b F src/test3.c 85135c09560c48bdb0a23c9b890ab405486b8ec9 @@ -389,7 +395,7 @@ F www/tclsqlite.tcl bb0d1357328a42b1993d78573e587c6dcbc964b9 F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0 F www/version3.tcl 890248cf7b70e60c383b0e84d77d5132b3ead42b F www/whentouse.tcl 97e2b5cd296f7d8057e11f44427dea8a4c2db513 -P 7cdc41e748c2d8f9e3d85c07143b8bc343bf2426 -R a3d0c16407fadd186adf26ca579a098c -U shess -Z 3d482d971ce8627e2fdb8650b1cf61df +P 6c77c2d5e15e9d3efed3e274bc93cd5a4868f574 +R 3fbd41e2a6fe983095c0be06e9b96477 +U drh +Z 4687716ffdce69db90a5a92c651e7fec diff --git a/manifest.uuid b/manifest.uuid index 8ce82542a7..949092b33f 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -6c77c2d5e15e9d3efed3e274bc93cd5a4868f574 \ No newline at end of file +e1891f0dc58e5498a8845d8b9b5b092d7f9c7003 \ No newline at end of file diff --git a/src/tclsqlite.c b/src/tclsqlite.c index 0de5a6fd32..7a88e96341 100644 --- a/src/tclsqlite.c +++ b/src/tclsqlite.c @@ -11,7 +11,7 @@ ************************************************************************* ** A TCL Interface to SQLite ** -** $Id: tclsqlite.c,v 1.171 2006/08/29 12:04:19 drh Exp $ +** $Id: tclsqlite.c,v 1.172 2006/08/31 15:07:15 drh Exp $ */ #ifndef NO_TCL /* Omit this whole file if TCL is unavailable */ @@ -2075,8 +2075,8 @@ static int DbMain(void *cd, Tcl_Interp *interp, int objc,Tcl_Obj *const*objv){ #ifdef SQLITE_ENABLE_FTS1 { - extern int fulltext_init(sqlite3*); - fulltext_init(p->db); + extern int sqlite3Fts1Init(sqlite3*); + sqlite3Fts1Init(p->db); } #endif