From: adamd Date: Wed, 23 Aug 2006 23:58:50 +0000 (+0000) Subject: A first implementation of a full-text search module for SQLite. (CVS 3363) X-Git-Tag: version-3.6.10~2804 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=1717edd15785dca12eccd57d0bcb28402843fd1e;p=thirdparty%2Fsqlite.git A first implementation of a full-text search module for SQLite. (CVS 3363) FossilOrigin-Name: b0d8e0d314d6f77b7d4b5dd00c694a1323f7a8e4 --- diff --git a/ext/fts1/ft_hash.c b/ext/fts1/ft_hash.c new file mode 100644 index 0000000000..8b3a7064ee --- /dev/null +++ b/ext/fts1/ft_hash.c @@ -0,0 +1,404 @@ +/* +** 2001 September 22 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. +** May you share freely, never taking more than you give. +** +************************************************************************* +** This is the implementation of generic hash-tables used in SQLite. +** We've modified it slightly to serve as a standalone hash table +** implementation for the full-text indexing module. +*/ +#include +#include +#include + +#include "ft_hash.h" + +void *malloc_and_zero(int n){ + void *p = malloc(n); + if( p ){ + memset(p, 0, n); + } + return p; +} + +/* Turn bulk memory into a hash table object by initializing the +** fields of the Hash structure. +** +** "pNew" is a pointer to the hash table that is to be initialized. +** keyClass is one of the constants HASH_INT, HASH_POINTER, +** HASH_BINARY, or HASH_STRING. The value of keyClass +** determines what kind of key the hash table will use. "copyKey" is +** true if the hash table should make its own private copy of keys and +** false if it should just use the supplied pointer. CopyKey only makes +** sense for HASH_STRING and HASH_BINARY and is ignored +** for other key classes. +*/ +void HashInit(Hash *pNew, int keyClass, int copyKey){ + assert( pNew!=0 ); + assert( keyClass>=HASH_STRING && keyClass<=HASH_BINARY ); + pNew->keyClass = keyClass; +#if 0 + if( keyClass==HASH_POINTER || keyClass==HASH_INT ) copyKey = 0; +#endif + pNew->copyKey = copyKey; + pNew->first = 0; + pNew->count = 0; + pNew->htsize = 0; + pNew->ht = 0; + pNew->xMalloc = malloc_and_zero; + pNew->xFree = free; +} + +/* Remove all entries from a hash table. Reclaim all memory. +** Call this routine to delete a hash table or to reset a hash table +** to the empty state. +*/ +void HashClear(Hash *pH){ + HashElem *elem; /* For looping over all elements of the table */ + + assert( pH!=0 ); + elem = pH->first; + pH->first = 0; + if( pH->ht ) pH->xFree(pH->ht); + pH->ht = 0; + pH->htsize = 0; + while( elem ){ + HashElem *next_elem = elem->next; + if( pH->copyKey && elem->pKey ){ + pH->xFree(elem->pKey); + } + pH->xFree(elem); + elem = next_elem; + } + pH->count = 0; +} + +#if 0 /* NOT USED */ +/* +** Hash and comparison functions when the mode is HASH_INT +*/ +static int intHash(const void *pKey, int nKey){ + return nKey ^ (nKey<<8) ^ (nKey>>8); +} +static int intCompare(const void *pKey1, int n1, const void *pKey2, int n2){ + return n2 - n1; +} +#endif + +#if 0 /* NOT USED */ +/* +** Hash and comparison functions when the mode is HASH_POINTER +*/ +static int ptrHash(const void *pKey, int nKey){ + uptr x = Addr(pKey); + return x ^ (x<<8) ^ (x>>8); +} +static int ptrCompare(const void *pKey1, int n1, const void *pKey2, int n2){ + if( pKey1==pKey2 ) return 0; + if( pKey1 0 ){ + h = (h<<3) ^ h ^ *z++; + nKey--; + } + return h & 0x7fffffff; +} +static int strCompare(const void *pKey1, int n1, const void *pKey2, int n2){ + if( n1!=n2 ) return 1; + return strncmp((const char*)pKey1,(const char*)pKey2,n1); +} + +/* +** Hash and comparison functions when the mode is HASH_BINARY +*/ +static int binHash(const void *pKey, int nKey){ + int h = 0; + const char *z = (const char *)pKey; + while( nKey-- > 0 ){ + h = (h<<3) ^ h ^ *(z++); + } + return h & 0x7fffffff; +} +static int binCompare(const void *pKey1, int n1, const void *pKey2, int n2){ + if( n1!=n2 ) return 1; + return memcmp(pKey1,pKey2,n1); +} + +/* +** Return a pointer to the appropriate hash function given the key class. +** +** The C syntax in this function definition may be unfamilar to some +** programmers, so we provide the following additional explanation: +** +** The name of the function is "hashFunction". The function takes a +** single parameter "keyClass". The return value of hashFunction() +** is a pointer to another function. Specifically, the return value +** of hashFunction() is a pointer to a function that takes two parameters +** with types "const void*" and "int" and returns an "int". +*/ +static int (*hashFunction(int keyClass))(const void*,int){ +#if 0 /* HASH_INT and HASH_POINTER are never used */ + switch( keyClass ){ + case HASH_INT: return &intHash; + case HASH_POINTER: return &ptrHash; + case HASH_STRING: return &strHash; + case HASH_BINARY: return &binHash;; + default: break; + } + return 0; +#else + if( keyClass==HASH_STRING ){ + return &strHash; + }else{ + assert( keyClass==HASH_BINARY ); + return &binHash; + } +#endif +} + +/* +** Return a pointer to the appropriate hash function given the key class. +** +** For help in interpreted the obscure C code in the function definition, +** see the header comment on the previous function. +*/ +static int (*compareFunction(int keyClass))(const void*,int,const void*,int){ +#if 0 /* HASH_INT and HASH_POINTER are never used */ + switch( keyClass ){ + case HASH_INT: return &intCompare; + case HASH_POINTER: return &ptrCompare; + case HASH_STRING: return &strCompare; + case HASH_BINARY: return &binCompare; + default: break; + } + return 0; +#else + if( keyClass==HASH_STRING ){ + return &strCompare; + }else{ + assert( keyClass==HASH_BINARY ); + return &binCompare; + } +#endif +} + +/* Link an element into the hash table +*/ +static void insertElement( + Hash *pH, /* The complete hash table */ + struct _ht *pEntry, /* The entry into which pNew is inserted */ + HashElem *pNew /* The element to be inserted */ +){ + HashElem *pHead; /* First element already in pEntry */ + pHead = pEntry->chain; + if( pHead ){ + pNew->next = pHead; + pNew->prev = pHead->prev; + if( pHead->prev ){ pHead->prev->next = pNew; } + else { pH->first = pNew; } + pHead->prev = pNew; + }else{ + pNew->next = pH->first; + if( pH->first ){ pH->first->prev = pNew; } + pNew->prev = 0; + pH->first = pNew; + } + pEntry->count++; + pEntry->chain = pNew; +} + + +/* Resize the hash table so that it cantains "new_size" buckets. +** "new_size" must be a power of 2. The hash table might fail +** to resize if sqliteMalloc() fails. +*/ +static void rehash(Hash *pH, int new_size){ + struct _ht *new_ht; /* The new hash table */ + HashElem *elem, *next_elem; /* For looping over existing elements */ + int (*xHash)(const void*,int); /* The hash function */ + + assert( (new_size & (new_size-1))==0 ); + new_ht = (struct _ht *)pH->xMalloc( new_size*sizeof(struct _ht) ); + if( new_ht==0 ) return; + if( pH->ht ) pH->xFree(pH->ht); + pH->ht = new_ht; + pH->htsize = new_size; + xHash = hashFunction(pH->keyClass); + for(elem=pH->first, pH->first=0; elem; elem = next_elem){ + int h = (*xHash)(elem->pKey, elem->nKey) & (new_size-1); + next_elem = elem->next; + insertElement(pH, &new_ht[h], elem); + } +} + +/* This function (for internal use only) locates an element in an +** hash table that matches the given key. The hash for this key has +** already been computed and is passed as the 4th parameter. +*/ +static HashElem *findElementGivenHash( + const Hash *pH, /* The pH to be searched */ + const void *pKey, /* The key we are searching for */ + int nKey, + int h /* The hash for this key. */ +){ + HashElem *elem; /* Used to loop thru the element list */ + int count; /* Number of elements left to test */ + int (*xCompare)(const void*,int,const void*,int); /* comparison function */ + + if( pH->ht ){ + struct _ht *pEntry = &pH->ht[h]; + elem = pEntry->chain; + count = pEntry->count; + xCompare = compareFunction(pH->keyClass); + while( count-- && elem ){ + if( (*xCompare)(elem->pKey,elem->nKey,pKey,nKey)==0 ){ + return elem; + } + elem = elem->next; + } + } + return 0; +} + +/* Remove a single entry from the hash table given a pointer to that +** element and a hash on the element's key. +*/ +static void removeElementGivenHash( + Hash *pH, /* The pH containing "elem" */ + HashElem* elem, /* The element to be removed from the pH */ + int h /* Hash value for the element */ +){ + struct _ht *pEntry; + if( elem->prev ){ + elem->prev->next = elem->next; + }else{ + pH->first = elem->next; + } + if( elem->next ){ + elem->next->prev = elem->prev; + } + pEntry = &pH->ht[h]; + if( pEntry->chain==elem ){ + pEntry->chain = elem->next; + } + pEntry->count--; + if( pEntry->count<=0 ){ + pEntry->chain = 0; + } + if( pH->copyKey && elem->pKey ){ + pH->xFree(elem->pKey); + } + pH->xFree( elem ); + pH->count--; + if( pH->count<=0 ){ + assert( pH->first==0 ); + assert( pH->count==0 ); + HashClear(pH); + } +} + +/* Attempt to locate an element of the hash table pH with a key +** that matches pKey,nKey. Return the data for this element if it is +** found, or NULL if there is no match. +*/ +void *HashFind(const Hash *pH, const void *pKey, int nKey){ + int h; /* A hash on key */ + HashElem *elem; /* The element that matches key */ + int (*xHash)(const void*,int); /* The hash function */ + + if( pH==0 || pH->ht==0 ) return 0; + xHash = hashFunction(pH->keyClass); + assert( xHash!=0 ); + h = (*xHash)(pKey,nKey); + assert( (pH->htsize & (pH->htsize-1))==0 ); + elem = findElementGivenHash(pH,pKey,nKey, h & (pH->htsize-1)); + return elem ? elem->data : 0; +} + +/* Insert an element into the hash table pH. The key is pKey,nKey +** and the data is "data". +** +** If no element exists with a matching key, then a new +** element is created. A copy of the key is made if the copyKey +** flag is set. NULL is returned. +** +** If another element already exists with the same key, then the +** new data replaces the old data and the old data is returned. +** The key is not copied in this instance. If a malloc fails, then +** the new data is returned and the hash table is unchanged. +** +** If the "data" parameter to this function is NULL, then the +** element corresponding to "key" is removed from the hash table. +*/ +void *HashInsert(Hash *pH, const void *pKey, int nKey, void *data){ + int hraw; /* Raw hash value of the key */ + int h; /* the hash of the key modulo hash table size */ + HashElem *elem; /* Used to loop thru the element list */ + HashElem *new_elem; /* New element added to the pH */ + int (*xHash)(const void*,int); /* The hash function */ + + assert( pH!=0 ); + xHash = hashFunction(pH->keyClass); + assert( xHash!=0 ); + hraw = (*xHash)(pKey, nKey); + assert( (pH->htsize & (pH->htsize-1))==0 ); + h = hraw & (pH->htsize-1); + elem = findElementGivenHash(pH,pKey,nKey,h); + if( elem ){ + void *old_data = elem->data; + if( data==0 ){ + removeElementGivenHash(pH,elem,h); + }else{ + elem->data = data; + } + return old_data; + } + if( data==0 ) return 0; + new_elem = (HashElem*)pH->xMalloc( sizeof(HashElem) ); + if( new_elem==0 ) return data; + if( pH->copyKey && pKey!=0 ){ + new_elem->pKey = pH->xMalloc( nKey ); + if( new_elem->pKey==0 ){ + pH->xFree(new_elem); + return data; + } + memcpy((void*)new_elem->pKey, pKey, nKey); + }else{ + new_elem->pKey = (void*)pKey; + } + new_elem->nKey = nKey; + pH->count++; + if( pH->htsize==0 ){ + rehash(pH,8); + if( pH->htsize==0 ){ + pH->count = 0; + pH->xFree(new_elem); + return data; + } + } + if( pH->count > pH->htsize ){ + rehash(pH,pH->htsize*2); + } + assert( pH->htsize>0 ); + assert( (pH->htsize & (pH->htsize-1))==0 ); + h = hraw & (pH->htsize-1); + insertElement(pH, &pH->ht[h], new_elem); + new_elem->data = data; + return 0; +} diff --git a/ext/fts1/ft_hash.h b/ext/fts1/ft_hash.h new file mode 100644 index 0000000000..93b6dcfb48 --- /dev/null +++ b/ext/fts1/ft_hash.h @@ -0,0 +1,111 @@ +/* +** 2001 September 22 +** +** The author disclaims copyright to this source code. In place of +** a legal notice, here is a blessing: +** +** May you do good and not evil. +** May you find forgiveness for yourself and forgive others. +** May you share freely, never taking more than you give. +** +************************************************************************* +** This is the header file for the generic hash-table implemenation +** used in SQLite. We've modified it slightly to serve as a standalone +** hash table implementation for the full-text indexing module. +** +*/ +#ifndef _HASH_H_ +#define _HASH_H_ + +/* Forward declarations of structures. */ +typedef struct Hash Hash; +typedef struct HashElem HashElem; + +/* A complete hash table is an instance of the following structure. +** The internals of this structure are intended to be opaque -- client +** code should not attempt to access or modify the fields of this structure +** directly. Change this structure only by using the routines below. +** However, many of the "procedures" and "functions" for modifying and +** accessing this structure are really macros, so we can't really make +** this structure opaque. +*/ +struct Hash { + char keyClass; /* HASH_INT, _POINTER, _STRING, _BINARY */ + char copyKey; /* True if copy of key made on insert */ + int count; /* Number of entries in this table */ + HashElem *first; /* The first element of the array */ + void *(*xMalloc)(int); /* malloc() function to use */ + void (*xFree)(void *); /* free() function to use */ + int htsize; /* Number of buckets in the hash table */ + struct _ht { /* the hash table */ + int count; /* Number of entries with this hash */ + HashElem *chain; /* Pointer to first entry with this hash */ + } *ht; +}; + +/* Each element in the hash table is an instance of the following +** structure. All elements are stored on a single doubly-linked list. +** +** Again, this structure is intended to be opaque, but it can't really +** be opaque because it is used by macros. +*/ +struct HashElem { + HashElem *next, *prev; /* Next and previous elements in the table */ + void *data; /* Data associated with this element */ + void *pKey; int nKey; /* Key associated with this element */ +}; + +/* +** There are 4 different modes of operation for a hash table: +** +** HASH_INT nKey is used as the key and pKey is ignored. +** +** HASH_POINTER pKey is used as the key and nKey is ignored. +** +** HASH_STRING pKey points to a string that is nKey bytes long +** (including the null-terminator, if any). Case +** is respected in comparisons. +** +** HASH_BINARY pKey points to binary data nKey bytes long. +** memcmp() is used to compare keys. +** +** A copy of the key is made for HASH_STRING and HASH_BINARY +** if the copyKey parameter to HashInit is 1. +*/ +/* #define HASH_INT 1 // NOT USED */ +/* #define HASH_POINTER 2 // NOT USED */ +#define HASH_STRING 3 +#define HASH_BINARY 4 + +/* +** Access routines. To delete, insert a NULL pointer. +*/ +void HashInit(Hash*, int keytype, int copyKey); +void *HashInsert(Hash*, const void *pKey, int nKey, void *pData); +void *HashFind(const Hash*, const void *pKey, int nKey); +void HashClear(Hash*); + +/* +** Macros for looping over all elements of a hash table. The idiom is +** like this: +** +** Hash h; +** HashElem *p; +** ... +** for(p=HashFirst(&h); p; p=HashNext(p)){ +** SomeStructure *pData = HashData(p); +** // do something with pData +** } +*/ +#define HashFirst(H) ((H)->first) +#define HashNext(E) ((E)->next) +#define HashData(E) ((E)->data) +#define HashKey(E) ((E)->pKey) +#define HashKeysize(E) ((E)->nKey) + +/* +** Number of entries in a hash table +*/ +#define HashCount(H) ((H)->count) + +#endif /* _HASH_H_ */ diff --git a/ext/fts1/fulltext.c b/ext/fts1/fulltext.c new file mode 100644 index 0000000000..b7844cbf21 --- /dev/null +++ b/ext/fts1/fulltext.c @@ -0,0 +1,1503 @@ +/* The author disclaims copyright to this source code. + * + * This is an SQLite module implementing full-text search. + */ + +#include +#if !defined(__APPLE__) +#include +#else +#include +#endif +#include +#include +#include + +#include "fulltext.h" +#include "ft_hash.h" +#include "tokenizer.h" +#include "sqlite3.h" +#include "sqlite3ext.h" +SQLITE_EXTENSION_INIT1 + +/* utility functions */ + +/* We encode variable-length integers in little-endian order using seven bits + * per byte as follows: +** +** KEY: +** A = 0xxxxxxx 7 bits of data and one flag bit +** B = 1xxxxxxx 7 bits of data and one flag bit +** +** 7 bits - A +** 14 bits - BA +** 21 bits - BBA +** and so on. +*/ + +/* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */ +#define VARINT_MAX 10 + +/* Write a 64-bit variable-length integer to memory starting at p[0]. + * The length of data written will be between 1 and VARINT_MAX bytes. + * The number of bytes written is returned. */ +int putVarint(char *p, sqlite_int64 v){ + unsigned char *q = (unsigned char *) p; + sqlite_uint64 vu = v; + do{ + *q++ = (unsigned char) ((vu & 0x7f) | 0x80); + vu >>= 7; + }while( vu!=0 ); + q[-1] &= 0x7f; /* turn off high bit in final byte */ + assert( q - (unsigned char *)p <= VARINT_MAX ); + return (int) (q - (unsigned char *)p); +} + +/* Read a 64-bit variable-length integer from memory starting at p[0]. + * Return the number of bytes read, or 0 on error. + * The value is stored in *v. */ +int getVarint(const char *p, sqlite_int64 *v){ + const unsigned char *q = (const unsigned char *) p; + sqlite_uint64 x = 0, y = 1; + while( (*q & 0x80) == 0x80 ){ + x += y * (*q++ & 0x7f); + y <<= 7; + if( q - (unsigned char *)p >= VARINT_MAX ){ /* bad data */ + assert( 0 ); + return 0; + } + } + x += y * (*q++); + *v = (sqlite_int64) x; + return (int) (q - (unsigned char *)p); +} + +int getVarint32(const char *p, int *pi){ + sqlite_int64 i; + int ret = getVarint(p, &i); + *pi = (int) i; + assert( *pi==i ); + return ret; +} + +/*** Document lists *** + * + * A document list holds a sorted list of varint-encoded document IDs. + * + * A doclist with type DL_POSITIONS_OFFSETS is stored like this: + * + * array { + * varint docid; + * array { + * varint position; (delta from previous position plus 1, or 0 for end) + * varint startOffset; (delta from previous startOffset) + * varint endOffset; (delta from startOffset) + * } + * } + * + * Here, array { X } means zero or more occurrences of X, adjacent in memory. + * + * A doclist with type DL_POSITIONS is like the above, but holds only docids + * and positions without offset information. + * + * A doclist with type DL_DOCIDS is like the above, but holds only docids + * without positions or offset information. + * + * On disk, every document list has positions and offsets, so we don't bother + * to serialize a doclist's type. + * + * We don't yet delta-encode document IDs; doing so will probably be a + * modest win. + * + * NOTE(shess) I've thought of a slightly (1%) better offset encoding. + * After the first offset, estimate the next offset by using the + * current token position and the previous token position and offset, + * offset to handle some variance. So the estimate would be + * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded + * as normal. Offsets more than 64 chars from the estimate are + * encoded as the delta to the previous start offset + 128. An + * additional tiny increment can be gained by using the end offset of + * the previous token to make the estimate a tiny bit more precise. +*/ + +typedef enum DocListType { + DL_DOCIDS, /* docids only */ + DL_POSITIONS, /* docids + positions */ + DL_POSITIONS_OFFSETS /* docids + positions + offsets */ +} DocListType; + +typedef struct DocList { + char *pData; + int nData; + DocListType iType; + int iLastPos; /* the last position written */ + int iLastOffset; /* the last start offset written */ +} DocList; + +/* Initialize a new DocList to hold the given data. */ +static void docListInit(DocList *d, DocListType iType, + const char *pData, int nData){ + d->nData = nData; + if( nData>0 ){ + d->pData = malloc(nData); + memcpy(d->pData, pData, nData); + } else { + d->pData = NULL; + } + d->iType = iType; + d->iLastPos = 0; + d->iLastOffset = 0; +} + +/* Create a new dynamically-allocated DocList. */ +static DocList *docListNew(DocListType iType){ + DocList *d = (DocList *) malloc(sizeof(DocList)); + docListInit(d, iType, 0, 0); + return d; +} + +static void docListDestroy(DocList *d){ + free(d->pData); +#ifndef NDEBUG + memset(d, 0x55, sizeof(*d)); +#endif +} + +static void docListDelete(DocList *d){ + docListDestroy(d); + free(d); +} + +static char *docListEnd(DocList *d){ + return d->pData + d->nData; +} + +/* Append a varint to a DocList's data. */ +static void appendVarint(DocList *d, sqlite_int64 i){ + char c[VARINT_MAX]; + int n = putVarint(c, i); + d->pData = realloc(d->pData, d->nData + n); + memcpy(d->pData + d->nData, c, n); + d->nData += n; +} + +static void docListAddDocid(DocList *d, sqlite_int64 iDocid){ + appendVarint(d, iDocid); + d->iLastPos = 0; +} + +/* Add a position to the last position list in a doclist. */ +static void docListAddPos(DocList *d, int iPos){ + assert( d->iType>=DL_POSITIONS ); + appendVarint(d, iPos-d->iLastPos+1); + d->iLastPos = iPos; +} + +static void docListAddPosOffset(DocList *d, int iPos, + int iStartOffset, int iEndOffset){ + assert( d->iType==DL_POSITIONS_OFFSETS ); + docListAddPos(d, iPos); + appendVarint(d, iStartOffset-d->iLastOffset); + d->iLastOffset = iStartOffset; + appendVarint(d, iEndOffset-iStartOffset); +} + +/* Terminate the last position list in the given doclist. */ +static void docListAddEndPos(DocList *d){ + appendVarint(d, 0); +} + +typedef struct DocListReader { + DocList *pDoclist; + char *p; + int iLastPos; /* the last position read */ +} DocListReader; + +static void readerInit(DocListReader *r, DocList *pDoclist){ + r->pDoclist = pDoclist; + if( pDoclist!=NULL ){ + r->p = pDoclist->pData; + } + r->iLastPos = 0; +} + +static int readerAtEnd(DocListReader *pReader){ + return pReader->p >= docListEnd(pReader->pDoclist); +} + +/* Peek at the next docid without advancing the read pointer. */ +static sqlite_int64 peekDocid(DocListReader *pReader){ + sqlite_int64 ret; + assert( !readerAtEnd(pReader) ); + getVarint(pReader->p, &ret); + return ret; +} + +/* Read the next docid. */ +static sqlite_int64 readDocid(DocListReader *pReader){ + sqlite_int64 ret; + assert( !readerAtEnd(pReader) ); + pReader->p += getVarint(pReader->p, &ret); + pReader->iLastPos = 0; + return ret; +} + +/* Read the next position from a position list. + * Returns the position, or -1 at the end of the list. */ +static int readPosition(DocListReader *pReader){ + int i; + int iType = pReader->pDoclist->iType; + assert( iType>=DL_POSITIONS ); + assert( !readerAtEnd(pReader) ); + + pReader->p += getVarint32(pReader->p, &i); + if( i==0 ){ + pReader->iLastPos = -1; + return -1; + } + pReader->iLastPos += ((int) i)-1; + if( iType>=DL_POSITIONS_OFFSETS ){ + /* Skip over offsets, ignoring them for now. */ + int iStart, iEnd; + pReader->p += getVarint32(pReader->p, &iStart); + pReader->p += getVarint32(pReader->p, &iEnd); + } + return pReader->iLastPos; +} + +/* Skip past the end of a position list. */ +static void skipPositionList(DocListReader *pReader){ + while( readPosition(pReader)!=-1 ) + ; +} + +/* Skip over a docid, including its position list if the doclist has + * positions. */ +static void skipDocument(DocListReader *pReader){ + readDocid(pReader); + if( pReader->pDoclist->iType >= DL_POSITIONS ){ + skipPositionList(pReader); + } +} + +static sqlite_int64 firstDocid(DocList *d){ + DocListReader r; + readerInit(&r, d); + return readDocid(&r); +} + +/* Doclist multi-tool. Pass pUpdate==NULL to delete the indicated docid; + * otherwise pUpdate, which must contain only the single docid [iDocid], is + * inserted (if not present) or updated (if already present). */ +static int docListUpdate(DocList *d, sqlite_int64 iDocid, DocList *pUpdate){ + int modified = 0; + DocListReader reader; + char *p; + + assert( d->iType==pUpdate->iType); + if( pUpdate!=NULL ){ + assert( iDocid==firstDocid(pUpdate) ); + } + + readerInit(&reader, d); + while( !readerAtEnd(&reader) ){ + if( peekDocid(&reader) >= iDocid ) break; + skipDocument(&reader); + } + + p = reader.p; + /* Delete if there is a matching element. */ + if( !readerAtEnd(&reader) && iDocid==peekDocid(&reader) ){ + skipDocument(&reader); + memmove(p, reader.p, docListEnd(d) - reader.p); + d->nData -= (reader.p - p); + modified = 1; + } + + /* Insert if indicated. */ + if( pUpdate!=NULL ){ + int iDoclist; + int nNeed; + docListAddEndPos(pUpdate); + nNeed = pUpdate->nData; + + iDoclist = p - d->pData; + d->pData = realloc(d->pData, d->nData+nNeed); + p = d->pData + iDoclist; + + memmove(p+nNeed, p, docListEnd(d) - p); + memcpy(p, pUpdate->pData, nNeed); + p += nNeed; + d->nData += nNeed; + modified = 1; + } + + return modified; +} + +/* Split the second half of doclist d into a separate doclist d2. Returns 1 + * if successful, or 0 if d contains a single document and hence can't be + * split. */ +static int docListSplit(DocList *d, DocList *d2){ + const char *pSplitPoint = d->pData + d->nData / 2; + DocListReader reader; + + readerInit(&reader, d); + while( reader.piType, reader.p, docListEnd(d) - reader.p); + d->nData = reader.p - d->pData; + d->pData = realloc(d->pData, d->nData); + return 1; +} + +/* A DocListMerge computes the AND of an in-memory DocList [in] and a chunked + * on-disk doclist, resulting in another in-memory DocList [out]. [in] + * and [out] may or may not store position information according to the + * caller's wishes. The on-disk doclist always comes with positions. + * + * The caller must read each chunk of the on-disk doclist in succession and + * pass it to mergeBlock(). + * + * If [in] has positions, then the merge output contains only documents with + * matching positions in the two input doclists. If [in] does not have + * positions, then the merge output contains all documents common to the two + * input doclists. + * + * If [in] is NULL, then the on-disk doclist is copied to [out] directly. + * + * A merge is performed using an integer [iOffset] provided by the caller. + * [iOffset] is subtracted from each position in the on-disk doclist for the + * purpose of position comparison; this is helpful in implementing phrase + * searches. + * + * A DocListMerge is not yet able to propagate offsets through query + * processing; we should add that capability soon. +*/ +typedef struct DocListMerge { + DocListReader in; + DocList *pOut; + int iOffset; +} DocListMerge; + +static void mergeInit(DocListMerge *m, + DocList *pIn, int iOffset, DocList *pOut){ + readerInit(&m->in, pIn); + m->pOut = pOut; + m->iOffset = iOffset; + + /* can't handle offsets yet */ + assert( pIn==NULL || pIn->iType <= DL_POSITIONS ); + assert( pOut->iType <= DL_POSITIONS ); +} + +/* A helper function for mergeBlock(), below. Merge the position lists + * pointed to by m->in and pBlockReader. + * If the merge matches, write [iDocid] to m->pOut; if m->pOut + * has positions then write all matching positions as well. */ +static void mergePosList(DocListMerge *m, sqlite_int64 iDocid, + DocListReader *pBlockReader){ + int block_pos = readPosition(pBlockReader); + int in_pos = readPosition(&m->in); + int match = 0; + while( block_pos!=-1 || in_pos!=-1 ){ + if( block_pos-m->iOffset==in_pos ){ + if( !match ){ + docListAddDocid(m->pOut, iDocid); + match = 1; + } + if( m->pOut->iType >= DL_POSITIONS ){ + docListAddPos(m->pOut, in_pos); + } + block_pos = readPosition(pBlockReader); + in_pos = readPosition(&m->in); + } else if( in_pos==-1 || block_pos!=-1 && block_pos-m->iOffsetin); + } + } + if( m->pOut->iType >= DL_POSITIONS && match ){ + docListAddEndPos(m->pOut); + } +} + +/* Merge one block of an on-disk doclist into a DocListMerge. */ +static void mergeBlock(DocListMerge *m, DocList *pBlock){ + DocListReader blockReader; + assert( pBlock->iType >= DL_POSITIONS ); + readerInit(&blockReader, pBlock); + while( !readerAtEnd(&blockReader) ){ + sqlite_int64 iDocid = readDocid(&blockReader); + if( m->in.pDoclist!=NULL ){ + while( 1 ){ + if( readerAtEnd(&m->in) ) return; /* nothing more to merge */ + if( peekDocid(&m->in)>=iDocid ) break; + skipDocument(&m->in); + } + if( peekDocid(&m->in)>iDocid ){ /* [pIn] has no match with iDocid */ + skipPositionList(&blockReader); /* skip this docid in the block */ + continue; + } + readDocid(&m->in); + } + /* We have a document match. */ + if( m->in.pDoclist==NULL || m->in.pDoclist->iType < DL_POSITIONS ){ + /* We don't need to do a poslist merge. */ + docListAddDocid(m->pOut, iDocid); + if( m->pOut->iType >= DL_POSITIONS ){ + /* Copy all positions to the output doclist. */ + while( 1 ){ + int pos = readPosition(&blockReader); + if( pos==-1 ) break; + docListAddPos(m->pOut, pos); + } + docListAddEndPos(m->pOut); + } else skipPositionList(&blockReader); + continue; + } + mergePosList(m, iDocid, &blockReader); + } +} + +static char *string_dup_n(const char *s, int n){ + char *str = malloc(n + 1); + memcpy(str, s, n); + str[n] = '\0'; + return str; +} + +/* Duplicate a string; the caller must free() the returned string. + * (We don't use strdup() since it's not part of the standard C library and + * may not be available everywhere.) */ +static char *string_dup(const char *s){ + return string_dup_n(s, strlen(s)); +} + +/* Format a string, replacing each occurrence of the % character with + * zName. This may be more convenient than sqlite_mprintf() + * when one string is used repeatedly in a format string. + * The caller must free() the returned string. */ +static char *string_format(const char *zFormat, const char *zName){ + const char *p; + size_t len = 0; + size_t nName = strlen(zName); + char *result; + char *r; + + /* first compute length needed */ + for(p = zFormat ; *p ; ++p){ + len += (*p=='%' ? nName : 1); + } + len += 1; /* for null terminator */ + + r = result = malloc(len); + for(p = zFormat; *p; ++p){ + if( *p=='%' ){ + memcpy(r, zName, nName); + r += nName; + } else { + *r++ = *p; + } + } + *r++ = '\0'; + assert( r == result + len ); + return result; +} + +static int sql_exec(sqlite3 *db, const char *zName, const char *zFormat){ + char *zCommand = string_format(zFormat, zName); + int rc = sqlite3_exec(db, zCommand, NULL, 0, NULL); + free(zCommand); + return rc; +} + +static int sql_prepare(sqlite3 *db, const char *zName, sqlite3_stmt **ppStmt, + const char *zFormat){ + char *zCommand = string_format(zFormat, zName); + int rc = sqlite3_prepare(db, zCommand, -1, ppStmt, NULL); + free(zCommand); + return rc; +} + +/* end utility functions */ + +#define QUERY_GENERIC 0 +#define QUERY_FULLTEXT 1 + +#define CHUNK_MAX 1024 + +typedef enum fulltext_statement { + CONTENT_INSERT_STMT, + CONTENT_SELECT_STMT, + CONTENT_DELETE_STMT, + + TERM_SELECT_STMT, + TERM_CHUNK_SELECT_STMT, + TERM_INSERT_STMT, + TERM_UPDATE_STMT, + TERM_DELETE_STMT, + + MAX_STMT /* Always at end! */ +} fulltext_statement; + +/* These must exactly match the enum above. */ +/* TODO(adam): Is there some risk that a statement (in particular, +** pTermSelectStmt) will be used in two cursors at once, e.g. if a +** query joins a virtual table to itself? If so perhaps we should +** move some of these to the cursor object. +*/ +const char *fulltext_zStatement[MAX_STMT] = { + /* CONTENT_INSERT */ "insert into %_content (rowid, content) values (?, ?)", + /* CONTENT_SELECT */ "select content from %_content where rowid = ?", + /* CONTENT_DELETE */ "delete from %_content where rowid = ?", + + /* TERM_SELECT */ + "select rowid, doclist from %_term where term = ? and first = ?", + /* TERM_CHUNK_SELECT */ + "select max(first) from %_term where term = ? and first <= ?", + /* TERM_INSERT */ + "insert into %_term (term, first, doclist) values (?, ?, ?)", + /* TERM_UPDATE */ "update %_term set doclist = ? where rowid = ?", + /* TERM_DELETE */ "delete from %_term where rowid = ?", +}; + +typedef struct fulltext_vtab { + sqlite3_vtab base; + sqlite3 *db; + const char *zName; /* virtual table name */ + sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */ + + /* Precompiled statements which we keep as long as the table is + ** open. + */ + sqlite3_stmt *pFulltextStatements[MAX_STMT]; +} fulltext_vtab; + +typedef struct fulltext_cursor { + sqlite3_vtab_cursor base; + int iCursorType; /* QUERY_GENERIC or QUERY_FULLTEXT */ + + sqlite3_stmt *pStmt; + + int eof; + + /* The following is used only when iCursorType == QUERY_FULLTEXT. */ + DocListReader result; +} fulltext_cursor; + +static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){ + return (fulltext_vtab *) c->base.pVtab; +} + +static sqlite3_module fulltextModule; /* forward declaration */ + +/* Puts a freshly-prepared statement determined by iStmt in *ppStmt. +** If the indicated statement has never been prepared, it is prepared +** and cached, otherwise the cached version is reset. +*/ +static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt, + sqlite3_stmt **ppStmt){ + assert( iStmtpFulltextStatements[iStmt]==NULL ){ + int rc = sql_prepare(v->db, v->zName, &v->pFulltextStatements[iStmt], + fulltext_zStatement[iStmt]); + if( rc!=SQLITE_OK ) return rc; + } else { + int rc = sqlite3_reset(v->pFulltextStatements[iStmt]); + if( rc!=SQLITE_OK ) return rc; + } + + *ppStmt = v->pFulltextStatements[iStmt]; + return SQLITE_OK; +} + +/* Step the indicated statement, handling errors SQLITE_BUSY (by +** retrying) and SQLITE_SCHEMA (by re-preparing and transferring +** bindings to the new statement). +** TODO(adam): We should extend this function so that it can work with +** statements declared locally, not only globally cached statements. +*/ +static int sql_step_statement(fulltext_vtab *v, fulltext_statement iStmt, + sqlite3_stmt **ppStmt){ + int rc; + sqlite3_stmt *s = *ppStmt; + assert( iStmtpFulltextStatements[iStmt] ); + + while( (rc=sqlite3_step(s))!=SQLITE_DONE && rc!=SQLITE_ROW ){ + sqlite3_stmt *pNewStmt; + + if( rc==SQLITE_BUSY ) continue; + if( rc!=SQLITE_ERROR ) return rc; + + rc = sqlite3_reset(s); + if( rc!=SQLITE_SCHEMA ) return SQLITE_ERROR; + + v->pFulltextStatements[iStmt] = NULL; /* Still in s */ + rc = sql_get_statement(v, iStmt, &pNewStmt); + if( rc!=SQLITE_OK ) goto err; + *ppStmt = pNewStmt; + + rc = sqlite3_transfer_bindings(s, pNewStmt); + if( rc!=SQLITE_OK ) goto err; + + rc = sqlite3_finalize(s); + if( rc!=SQLITE_OK ) return rc; + s = pNewStmt; + } + return rc; + + err: + sqlite3_finalize(s); + return rc; +} + +/* Like sql_step_statement(), but convert SQLITE_DONE to SQLITE_OK. +** Useful for statements like UPDATE, where we expect no results. +*/ +static int sql_single_step_statement(fulltext_vtab *v, + fulltext_statement iStmt, + sqlite3_stmt **ppStmt){ + int rc = sql_step_statement(v, iStmt, ppStmt); + return (rc==SQLITE_DONE) ? SQLITE_OK : rc; +} + +/* insert into %_content (rowid, content) values ([rowid], [zContent]) */ +static int content_insert(fulltext_vtab *v, sqlite3_value *rowid, + const char *zContent, int nContent){ + sqlite3_stmt *s; + int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s); + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_value(s, 1, rowid); + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_text(s, 2, zContent, nContent, SQLITE_STATIC); + if( rc!=SQLITE_OK ) return rc; + + return sql_single_step_statement(v, CONTENT_INSERT_STMT, &s); +} + +/* select content from %_content where rowid = [iRow] + * The caller must delete the returned string. */ +static int content_select(fulltext_vtab *v, sqlite_int64 iRow, + char **pzContent){ + sqlite3_stmt *s; + int rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s); + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_int64(s, 1, iRow); + if( rc!=SQLITE_OK ) return rc; + + rc = sql_step_statement(v, CONTENT_SELECT_STMT, &s); + if( rc!=SQLITE_ROW ) return rc; + + *pzContent = string_dup((const char *)sqlite3_column_text(s, 0)); + + /* We expect only one row. We must execute another sqlite3_step() + * to complete the iteration; otherwise the table will remain locked. */ + rc = sqlite3_step(s); + if( rc==SQLITE_DONE ) return SQLITE_OK; + + free(*pzContent); + return rc; +} + +/* delete from %_content where rowid = [iRow ] */ +static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){ + sqlite3_stmt *s; + int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s); + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_int64(s, 1, iRow); + if( rc!=SQLITE_OK ) return rc; + + return sql_single_step_statement(v, CONTENT_DELETE_STMT, &s); +} + +/* select rowid, doclist from %_term where term = [zTerm] and first = [iFirst] + * If found, returns SQLITE_OK; the caller must free the returned doclist. + * If no rows found, returns SQLITE_ERROR. */ +static int term_select(fulltext_vtab *v, const char *zTerm, int nTerm, + sqlite_int64 iFirst, + sqlite_int64 *rowid, + DocList *out){ + sqlite3_stmt *s; + int rc = sql_get_statement(v, TERM_SELECT_STMT, &s); + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_TRANSIENT); + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_int64(s, 2, iFirst); + if( rc!=SQLITE_OK ) return rc; + + rc = sql_step_statement(v, TERM_SELECT_STMT, &s); + if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc; + + *rowid = sqlite3_column_int64(s, 0); + docListInit(out, DL_POSITIONS_OFFSETS, + sqlite3_column_blob(s, 1), sqlite3_column_bytes(s, 1)); + + /* We expect only one row. We must execute another sqlite3_step() + * to complete the iteration; otherwise the table will remain locked. */ + rc = sqlite3_step(s); + return rc==SQLITE_DONE ? SQLITE_OK : rc; +} + +/* select max(first) from %_term where term = [zTerm] and first <= [iFirst] + * If found, returns SQLITE_ROW and result in *piResult; if the query returns + * NULL (meaning no row found) returns SQLITE_DONE. + */ +static int term_chunk_select(fulltext_vtab *v, const char *zTerm, int nTerm, + sqlite_int64 iFirst, sqlite_int64 *piResult){ + sqlite3_stmt *s; + int rc = sql_get_statement(v, TERM_CHUNK_SELECT_STMT, &s); + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC); + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_int64(s, 2, iFirst); + if( rc!=SQLITE_OK ) return rc; + + rc = sql_step_statement(v, TERM_CHUNK_SELECT_STMT, &s); + if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc; + + switch( sqlite3_column_type(s, 0) ){ + case SQLITE_NULL: + rc = SQLITE_DONE; + break; + case SQLITE_INTEGER: + *piResult = sqlite3_column_int64(s, 0); + break; + default: + return SQLITE_ERROR; + } + /* We expect only one row. We must execute another sqlite3_step() + * to complete the iteration; otherwise the table will remain locked. */ + if( sqlite3_step(s) != SQLITE_DONE ) return SQLITE_ERROR; + return rc; +} + +/* insert into %_term (term, first, doclist) + values ([zTerm], [iFirst], [doclist]) */ +static int term_insert(fulltext_vtab *v, const char *zTerm, int nTerm, + sqlite_int64 iFirst, DocList *doclist){ + sqlite3_stmt *s; + int rc = sql_get_statement(v, TERM_INSERT_STMT, &s); + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC); + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_int64(s, 2, iFirst); + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_blob(s, 3, doclist->pData, doclist->nData, SQLITE_STATIC); + if( rc!=SQLITE_OK ) return rc; + + return sql_single_step_statement(v, TERM_INSERT_STMT, &s); +} + +/* update %_term set doclist = [doclist] where rowid = [rowid] */ +static int term_update(fulltext_vtab *v, sqlite_int64 rowid, + DocList *doclist){ + sqlite3_stmt *s; + int rc = sql_get_statement(v, TERM_UPDATE_STMT, &s); + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_blob(s, 1, doclist->pData, doclist->nData, + SQLITE_STATIC); + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_int64(s, 2, rowid); + if( rc!=SQLITE_OK ) return rc; + + return sql_single_step_statement(v, TERM_UPDATE_STMT, &s); +} + +static int term_delete(fulltext_vtab *v, sqlite_int64 rowid){ + sqlite3_stmt *s; + int rc = sql_get_statement(v, TERM_DELETE_STMT, &s); + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_int64(s, 1, rowid); + if( rc!=SQLITE_OK ) return rc; + + return sql_single_step_statement(v, TERM_DELETE_STMT, &s); +} + +static void fulltext_vtab_destroy(fulltext_vtab *v){ + int iStmt; + + for( iStmt=0; iStmtpFulltextStatements[iStmt]!=NULL ){ + sqlite3_finalize(v->pFulltextStatements[iStmt]); + v->pFulltextStatements[iStmt] = NULL; + } + } + + if( v->pTokenizer!=NULL ){ + v->pTokenizer->pModule->xDestroy(v->pTokenizer); + v->pTokenizer = NULL; + } + + free((void *) v->zName); + free(v); +} + +/* Current interface: +** argv[0] - module name +** argv[1] - database name +** argv[2] - table name +** argv[3] - tokenizer name (optional, a sensible default is provided) +** argv[4..] - passed to tokenizer (optional based on tokenizer) +**/ +static int fulltextConnect(sqlite3 *db, void *pAux, int argc, char **argv, + sqlite3_vtab **ppVTab){ + int rc; + fulltext_vtab *v; + sqlite3_tokenizer_module *m = NULL; + + assert( argc>=3 ); + v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab)); + /* sqlite will initialize v->base */ + v->db = db; + v->zName = string_dup(argv[2]); + v->pTokenizer = NULL; + + if( argc==3 ){ + get_simple_tokenizer_module(&m); + } else { + /* TODO(shess) For now, add new tokenizers as else if clauses. */ + if( !strcmp(argv[3], "simple") ){ + get_simple_tokenizer_module(&m); + } else { + assert( "unrecognized tokenizer"==NULL ); + } + } + + /* TODO(shess) Since tokenization impacts the index, the parameters + ** to the tokenizer need to be identical when a persistent virtual + ** table is re-created. One solution would be a meta-table to track + ** such information in the database. Then we could verify that the + ** information is identical on subsequent creates. + */ + /* TODO(shess) Why isn't argv already (const char **)? */ + rc = m->xCreate(argc-3, (const char **) (argv+3), &v->pTokenizer); + if( rc!=SQLITE_OK ) return rc; + v->pTokenizer->pModule = m; + + /* TODO: verify the existence of backing tables foo_content, foo_term */ + + rc = sqlite3_declare_vtab(db, "create table x(content text)"); + if( rc!=SQLITE_OK ) return rc; + + memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements)); + + *ppVTab = &v->base; + return SQLITE_OK; +} + +static int fulltextCreate(sqlite3 *db, void *pAux, int argc, char **argv, + sqlite3_vtab **ppVTab){ + int rc; + assert( argc>=3 ); + + /* The %_content table holds the text of each full-text item, with + ** the rowid used as the docid. + ** + ** The %_term table maps each term to a document list blob + ** containing elements sorted by ascending docid, each element + ** encoded as: + ** + ** docid varint-encoded + ** token count varint-encoded + ** "count" token elements (poslist): + ** position varint-encoded as delta from previous position + ** start offset varint-encoded as delta from previous start offset + ** end offset varint-encoded as delta from start offset + ** + ** Additionally, doclist blobs can be chunked into multiple rows, + ** using "first" to order the blobs. "first" is simply the first + ** docid in the blob. + */ + /* + ** NOTE(shess) That last sentence is incorrect in the face of + ** deletion, which can leave a doclist that doesn't contain the + ** first from that row. I _believe_ this does not matter to the + ** operation of the system, but it might be reasonable to update + ** appropriately in case this assumption becomes more important. + */ + rc = sql_exec(db, argv[2], + "create table %_content(content text);" + "create table %_term(term text, first integer, doclist blob);" + "create index %_index on %_term(term, first)"); + if( rc!=SQLITE_OK ) return rc; + + return fulltextConnect(db, pAux, argc, argv, ppVTab); +} + +/* Decide how to handle an SQL query. + * At the moment, MATCH queries can include implicit boolean ANDs; we + * haven't implemented phrase searches or OR yet. */ +static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){ + int i; + + for(i=0; inConstraint; ++i){ + const struct sqlite3_index_constraint *pConstraint; + pConstraint = &pInfo->aConstraint[i]; + if( pConstraint->iColumn==0 && + pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH && + pConstraint->usable ){ /* a full-text search */ + pInfo->aConstraintUsage[i].argvIndex = 1; + pInfo->aConstraintUsage[i].omit = 1; + pInfo->idxNum = QUERY_FULLTEXT; + pInfo->estimatedCost = 1.0; /* an arbitrary value for now */ + return SQLITE_OK; + } + } + pInfo->idxNum = QUERY_GENERIC; + return SQLITE_OK; +} + +static int fulltextDisconnect(sqlite3_vtab *pVTab){ + fulltext_vtab_destroy((fulltext_vtab *)pVTab); + return SQLITE_OK; +} + +static int fulltextDestroy(sqlite3_vtab *pVTab){ + fulltext_vtab *v = (fulltext_vtab *)pVTab; + + int rc = sql_exec(v->db, v->zName, + "drop table %_content; drop table %_term"); + if( rc!=SQLITE_OK ) return rc; + + fulltext_vtab_destroy((fulltext_vtab *)pVTab); + return SQLITE_OK; +} + +static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){ + fulltext_cursor *c; + + c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1); + /* sqlite will initialize c->base */ + *ppCursor = &c->base; + + return SQLITE_OK; +} + +static int fulltextClose(sqlite3_vtab_cursor *pCursor){ + fulltext_cursor *c = (fulltext_cursor *) pCursor; + sqlite3_finalize(c->pStmt); + if( c->result.pDoclist!=NULL ){ + docListDelete(c->result.pDoclist); + } + free(c); + return SQLITE_OK; +} + +static int fulltextNext(sqlite3_vtab_cursor *pCursor){ + fulltext_cursor *c = (fulltext_cursor *) pCursor; + sqlite_int64 iDocid; + int rc; + + switch( c->iCursorType ){ + case QUERY_GENERIC: + /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ + rc = sqlite3_step(c->pStmt); + switch( rc ){ + case SQLITE_ROW: + c->eof = 0; + return SQLITE_OK; + case SQLITE_DONE: + c->eof = 1; + return SQLITE_OK; + default: + c->eof = 1; + return rc; + } + case QUERY_FULLTEXT: + rc = sqlite3_reset(c->pStmt); + if( rc!=SQLITE_OK ) return rc; + + if( readerAtEnd(&c->result)){ + c->eof = 1; + return SQLITE_OK; + } + iDocid = readDocid(&c->result); + rc = sqlite3_bind_int64(c->pStmt, 1, iDocid); + if( rc!=SQLITE_OK ) return rc; + /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ + rc = sqlite3_step(c->pStmt); + if( rc==SQLITE_ROW ){ /* the case we expect */ + c->eof = 0; + return SQLITE_OK; + } + /* an error occurred; abort */ + return rc==SQLITE_DONE ? SQLITE_ERROR : rc; + default: + assert( 0 ); + return SQLITE_ERROR; /* not reached */ + } +} + +static int term_select_doclist(fulltext_vtab *v, const char *pTerm, int nTerm, + sqlite3_stmt **ppStmt){ + int rc; + if( *ppStmt ){ + rc = sqlite3_reset(*ppStmt); + } else { + rc = sql_prepare(v->db, v->zName, ppStmt, + "select doclist from %_term where term = ? order by first"); + } + if( rc!=SQLITE_OK ) return rc; + + rc = sqlite3_bind_text(*ppStmt, 1, pTerm, nTerm, SQLITE_TRANSIENT); + if( rc!=SQLITE_OK ) return rc; + + return sqlite3_step(*ppStmt); /* TODO(adamd): handle schema error */ +} + +/* Read the posting list for [zTerm]; AND it with the doclist [in] to + * produce the doclist [out], using the given offset [iOffset] for phrase + * matching. + * (*pSelect) is used to hold an SQLite statement used inside this function; + * the caller should initialize *pSelect to NULL before the first call. + */ +static int query_merge(fulltext_vtab *v, sqlite3_stmt **pSelect, + const char *zTerm, + DocList *pIn, int iOffset, DocList *out){ + int rc; + DocListMerge merge; + + if( pIn!=NULL && !pIn->nData ){ + /* If [pIn] is already empty, there's no point in reading the + * posting list to AND it in; return immediately. */ + return SQLITE_OK; + } + + rc = term_select_doclist(v, zTerm, -1, pSelect); + if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ) return rc; + + mergeInit(&merge, pIn, iOffset, out); + while( rc==SQLITE_ROW ){ + DocList block; + docListInit(&block, DL_POSITIONS_OFFSETS, + sqlite3_column_blob(*pSelect, 0), + sqlite3_column_bytes(*pSelect, 0)); + mergeBlock(&merge, &block); + docListDestroy(&block); + + rc = sqlite3_step(*pSelect); + if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ){ + return rc; + } + } + + return SQLITE_OK; +} + +typedef struct QueryTerm { + int is_phrase; /* true if this term begins a new phrase */ + const char *zTerm; +} QueryTerm; + +/* A parsed query. + * + * As an example, parsing the query ["four score" years "new nation"] will + * yield a Query with 5 terms: + * "four", is_phrase = 1 + * "score", is_phrase = 0 + * "years", is_phrase = 1 + * "new", is_phrase = 1 + * "nation", is_phrase = 0 + */ +typedef struct Query { + int nTerms; + QueryTerm *pTerm; +} Query; + +void query_add(Query *q, int is_phrase, const char *zTerm){ + QueryTerm *t; + ++q->nTerms; + q->pTerm = realloc(q->pTerm, q->nTerms * sizeof(q->pTerm[0])); + t = &q->pTerm[q->nTerms - 1]; + t->is_phrase = is_phrase; + t->zTerm = zTerm; +} + +void query_free(Query *q){ + int i; + for(i = 0; i < q->nTerms; ++i){ + free((void *) q->pTerm[i].zTerm); + } + free(q->pTerm); +} + +int tokenize_segment(sqlite3_tokenizer *pTokenizer, + const char *zQuery, int in_phrase, + Query *pQuery){ + sqlite3_tokenizer_module *pModule = pTokenizer->pModule; + sqlite3_tokenizer_cursor *pCursor; + int is_first = 1; + + int rc = pModule->xOpen(pTokenizer, zQuery, -1, &pCursor); + if( rc!=SQLITE_OK ) return rc; + pCursor->pTokenizer = pTokenizer; + + while( 1 ){ + const char *zToken; + int nToken, iStartOffset, iEndOffset, dummy_pos; + + rc = pModule->xNext(pCursor, + &zToken, &nToken, + &iStartOffset, &iEndOffset, + &dummy_pos); + if( rc!=SQLITE_OK ) break; + query_add(pQuery, !in_phrase || is_first, string_dup_n(zToken, nToken)); + is_first = 0; + } + + return pModule->xClose(pCursor); +} + +/* Parse a query string, yielding a Query object. */ +int parse_query(fulltext_vtab *v, const char *zQuery, Query *pQuery){ + char *zQuery1 = string_dup(zQuery); + int in_phrase = 0; + char *s = zQuery1; + pQuery->nTerms = 0; + pQuery->pTerm = NULL; + + while( *s ){ + char *t = s; + while( *t ){ + if( *t=='"' ){ + *t++ = '\0'; + break; + } + ++t; + } + if( *s ){ + tokenize_segment(v->pTokenizer, s, in_phrase, pQuery); + } + s = t; + in_phrase = !in_phrase; + } + + free(zQuery1); + return SQLITE_OK; +} + +/* Perform a full-text query; return a list of documents in [pResult]. */ +static int fulltext_query(fulltext_vtab *v, const char *zQuery, + DocList **pResult){ + Query q; + int phrase_start = -1; + int i; + sqlite3_stmt *pSelect = NULL; + DocList *d = NULL; + + int rc = parse_query(v, zQuery, &q); + if( rc!=SQLITE_OK ) return rc; + + /* Merge terms. */ + for(i = 0 ; i < q.nTerms ; ++i){ + /* In each merge step, we need to generate positions whenever we're + * processing a phrase which hasn't ended yet. */ + int need_positions = iiCursorType = idxNum; + switch( idxNum ){ + case QUERY_GENERIC: + zStatement = "select rowid, content from %_content"; + break; + + case QUERY_FULLTEXT: /* full-text search */ + { + const char *zQuery = (const char *)sqlite3_value_text(argv[0]); + DocList *pResult; + assert( argc==1 ); + rc = fulltext_query(v, zQuery, &pResult); + if( rc!=SQLITE_OK ) return rc; + readerInit(&c->result, pResult); + zStatement = "select rowid, content from %_content where rowid = ?"; + break; + } + + default: + assert( 0 ); + } + + rc = sql_prepare(v->db, v->zName, &c->pStmt, zStatement); + if( rc!=SQLITE_OK ) return rc; + + return fulltextNext(pCursor); +} + +static int fulltextEof(sqlite3_vtab_cursor *pCursor){ + fulltext_cursor *c = (fulltext_cursor *) pCursor; + return c->eof; +} + +static int fulltextColumn(sqlite3_vtab_cursor *pCursor, + sqlite3_context *pContext, int idxCol){ + fulltext_cursor *c = (fulltext_cursor *) pCursor; + const char *s; + + assert( idxCol==0 ); + s = (const char *) sqlite3_column_text(c->pStmt, 1); + sqlite3_result_text(pContext, s, -1, SQLITE_TRANSIENT); + + return SQLITE_OK; +} + +static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){ + fulltext_cursor *c = (fulltext_cursor *) pCursor; + + *pRowid = sqlite3_column_int64(c->pStmt, 0); + return SQLITE_OK; +} + +/* Build a hash table containing all terms in zText. */ +static int build_terms(Hash *terms, sqlite3_tokenizer *pTokenizer, + const char *zText, sqlite_int64 iDocid){ + sqlite3_tokenizer_cursor *pCursor; + const char *pToken; + int nTokenBytes; + int iStartOffset, iEndOffset, iPosition; + + int rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor); + if( rc!=SQLITE_OK ) return rc; + + pCursor->pTokenizer = pTokenizer; + HashInit(terms, HASH_STRING, 1); + while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor, + &pToken, &nTokenBytes, + &iStartOffset, &iEndOffset, + &iPosition) ){ + DocList *p; + + /* Positions can't be negative; we use -1 as a terminator internally. */ + if( iPosition<0 ) { + rc = SQLITE_ERROR; + goto err; + } + + p = HashFind(terms, pToken, nTokenBytes); + if( p==NULL ){ + p = docListNew(DL_POSITIONS_OFFSETS); + docListAddDocid(p, iDocid); + HashInsert(terms, pToken, nTokenBytes, p); + } + docListAddPosOffset(p, iPosition, iStartOffset, iEndOffset); + } + +err: + /* TODO(shess) Check return? Should this be able to cause errors at + ** this point? Actually, same question about sqlite3_finalize(), + ** though one could argue that failure there means that the data is + ** not durable. *ponder* + */ + pTokenizer->pModule->xClose(pCursor); + return rc; +} +/* Update the %_terms table to map the term [zTerm] to the given rowid. */ +static int index_insert_term(fulltext_vtab *v, const char *zTerm, int nTerm, + sqlite_int64 iDocid, DocList *p){ + sqlite_int64 iFirst; + sqlite_int64 iIndexRow; + DocList doclist; + char *pBlob = NULL; + int nBlob = 0; + + int rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iFirst); + if( rc==SQLITE_DONE ){ + docListInit(&doclist, DL_POSITIONS_OFFSETS, 0, 0); + if( docListUpdate(&doclist, iDocid, p) ){ + rc = term_insert(v, zTerm, nTerm, iDocid, &doclist); + docListDestroy(&doclist); + return rc; + } + return SQLITE_OK; + } + if( rc!=SQLITE_ROW ) return SQLITE_ERROR; + + /* This word is in the index; add this document ID to its blob. */ + + rc = term_select(v, zTerm, nTerm, iFirst, &iIndexRow, &doclist); + if( rc!=SQLITE_OK ) return rc; + + if( docListUpdate(&doclist, iDocid, p) ){ + /* If the blob is too big, split it in half. */ + if( doclist.nData>CHUNK_MAX ){ + DocList half; + if( docListSplit(&doclist, &half) ){ + rc = term_insert(v, zTerm, nTerm, firstDocid(&half), &half); + docListDestroy(&half); + if( rc!=SQLITE_OK ) goto err; + } + } + rc = term_update(v, iIndexRow, &doclist); + } + +err: + docListDestroy(&doclist); + return rc; +} + +/* Insert a row into the full-text index; set *piRowid to be the ID of the + * new row. */ +static int index_insert(fulltext_vtab *v, + sqlite3_value *pRequestRowid, const char *zText, + sqlite_int64 *piRowid){ + Hash terms; /* maps term string -> PosList */ + HashElem *e; + + int rc = content_insert(v, pRequestRowid, zText, -1); + if( rc!=SQLITE_OK ) return rc; + *piRowid = sqlite3_last_insert_rowid(v->db); + + if( !zText ) return SQLITE_OK; /* nothing to index */ + + rc = build_terms(&terms, v->pTokenizer, zText, *piRowid); + if( rc!=SQLITE_OK ) return rc; + + for(e=HashFirst(&terms); e; e=HashNext(e)){ + DocList *p = HashData(e); + rc = index_insert_term(v, HashKey(e), HashKeysize(e), *piRowid, p); + if( rc!=SQLITE_OK ) break; + } + + for(e=HashFirst(&terms); e; e=HashNext(e)){ + DocList *p = HashData(e); + docListDelete(p); + } + HashClear(&terms); + return rc; +} + +static int index_delete_term(fulltext_vtab *v, const char *zTerm, int nTerm, + sqlite_int64 iDocid){ + sqlite_int64 iFirst; + sqlite_int64 iIndexRow; + DocList doclist; + + int rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iFirst); + if( rc!=SQLITE_ROW ) return SQLITE_ERROR; + + rc = term_select(v, zTerm, nTerm, iFirst, &iIndexRow, &doclist); + if( rc!=SQLITE_OK ) return rc; + + if( docListUpdate(&doclist, iDocid, NULL) ){ + if( doclist.nData>0 ){ + rc = term_update(v, iIndexRow, &doclist); + } else { /* empty posting list */ + rc = term_delete(v, iIndexRow); + } + } + docListDestroy(&doclist); + return rc; +} + +/* Delete a row from the full-text index. */ +static int index_delete(fulltext_vtab *v, sqlite_int64 iRow){ + char *zText; + Hash terms; + HashElem *e; + + int rc = content_select(v, iRow, &zText); + if( rc!=SQLITE_OK ) return rc; + + rc = build_terms(&terms, v->pTokenizer, zText, iRow); + free(zText); + if( rc!=SQLITE_OK ) return rc; + + for(e=HashFirst(&terms); e; e=HashNext(e)){ + rc = index_delete_term(v, HashKey(e), HashKeysize(e), iRow); + if( rc!=SQLITE_OK ) break; + } + for(e=HashFirst(&terms); e; e=HashNext(e)){ + DocList *p = HashData(e); + docListDelete(p); + } + HashClear(&terms); + + return content_delete(v, iRow); +} + +static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg, + sqlite_int64 *pRowid){ + fulltext_vtab *v = (fulltext_vtab *) pVtab; + + if( nArg<2 ){ + return index_delete(v, sqlite3_value_int64(ppArg[0])); + } + + if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){ + return SQLITE_ERROR; /* an update; not yet supported */ + } + + assert( nArg==3 ); /* ppArg[1] = rowid, ppArg[2] = content */ + return index_insert(v, ppArg[1], + (const char *)sqlite3_value_text(ppArg[2]), pRowid); +} + +static sqlite3_module fulltextModule = { + 0, + fulltextCreate, + fulltextConnect, + fulltextBestIndex, + fulltextDisconnect, + fulltextDestroy, + fulltextOpen, + fulltextClose, + fulltextFilter, + fulltextNext, + fulltextEof, + fulltextColumn, + fulltextRowid, + fulltextUpdate +}; + +int fulltext_init(sqlite3 *db){ + return sqlite3_create_module(db, "fulltext", &fulltextModule, 0); +} + +#if !SQLITE_CORE +int sqlite3_extension_init(sqlite3 *db, char **pzErrMsg, + const sqlite3_api_routines *pApi){ + SQLITE_EXTENSION_INIT2(pApi) + return fulltext_init(db); +} +#endif diff --git a/ext/fts1/fulltext.h b/ext/fts1/fulltext.h new file mode 100644 index 0000000000..477dcab2ad --- /dev/null +++ b/ext/fts1/fulltext.h @@ -0,0 +1,11 @@ +#include "sqlite3.h" + +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + +int fulltext_init(sqlite3 *db); + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ diff --git a/ext/fts1/simple_tokenizer.c b/ext/fts1/simple_tokenizer.c new file mode 100644 index 0000000000..f3095c4b67 --- /dev/null +++ b/ext/fts1/simple_tokenizer.c @@ -0,0 +1,170 @@ +/* +** The author disclaims copyright to this source code. +** +************************************************************************* +** Implementation of the "simple" full-text-search tokenizer. +*/ + +#include +#if !defined(__APPLE__) +#include +#else +#include +#endif +#include +#include +#include + +#include "tokenizer.h" + +/* Duplicate a string; the caller must free() the returned string. + * (We don't use strdup() since it's not part of the standard C library and + * may not be available everywhere.) */ +/* TODO(shess) Copied from fulltext.c, consider util.c for such +** things. */ +static char *string_dup(const char *s){ + char *str = malloc(strlen(s) + 1); + strcpy(str, s); + return str; +} + +typedef struct simple_tokenizer { + sqlite3_tokenizer base; + const char *zDelim; /* token delimiters */ +} simple_tokenizer; + +typedef struct simple_tokenizer_cursor { + sqlite3_tokenizer_cursor base; + const char *pInput; /* input we are tokenizing */ + int nBytes; /* size of the input */ + const char *pCurrent; /* current position in pInput */ + int iToken; /* index of next token to be returned */ + char *zToken; /* storage for current token */ + int nTokenBytes; /* actual size of current token */ + int nTokenAllocated; /* space allocated to zToken buffer */ +} simple_tokenizer_cursor; + +static sqlite3_tokenizer_module simpleTokenizerModule;/* forward declaration */ + +static int simpleCreate( + int argc, const char **argv, + sqlite3_tokenizer **ppTokenizer +){ + simple_tokenizer *t; + + t = (simple_tokenizer *) malloc(sizeof(simple_tokenizer)); + /* TODO(shess) Delimiters need to remain the same from run to run, + ** else we need to reindex. One solution would be a meta-table to + ** track such information in the database, then we'd only want this + ** information on the initial create. + */ + if( argc>1 ){ + t->zDelim = string_dup(argv[1]); + } else { + /* Build a string of non-alphanumeric ASCII characters */ + char zDelim[128]; /* nul-terminated, so nul not a member */ + int i, j; + for(i=1, j=0; i<0x80; i++){ + if( !isalnum(i) ){ + zDelim[j++] = i; + } + } + zDelim[j++] = '\0'; + assert( j<=sizeof(zDelim) ); + t->zDelim = string_dup(zDelim); + } + + *ppTokenizer = &t->base; + return SQLITE_OK; +} + +static int simpleDestroy(sqlite3_tokenizer *pTokenizer){ + simple_tokenizer *t = (simple_tokenizer *) pTokenizer; + + free((void *) t->zDelim); + free(t); + + return SQLITE_OK; +} + +static int simpleOpen( + sqlite3_tokenizer *pTokenizer, + const char *pInput, int nBytes, + sqlite3_tokenizer_cursor **ppCursor +){ + simple_tokenizer_cursor *c; + + c = (simple_tokenizer_cursor *) malloc(sizeof(simple_tokenizer_cursor)); + c->pInput = pInput; + c->nBytes = nBytes<0 ? (int) strlen(pInput) : nBytes; + c->pCurrent = c->pInput; /* start tokenizing at the beginning */ + c->iToken = 0; + c->zToken = NULL; /* no space allocated, yet. */ + c->nTokenBytes = 0; + c->nTokenAllocated = 0; + + *ppCursor = &c->base; + return SQLITE_OK; +} + +static int simpleClose(sqlite3_tokenizer_cursor *pCursor){ + simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor; + + if( NULL!=c->zToken ){ + free(c->zToken); + } + free(c); + + return SQLITE_OK; +} + +static int simpleNext( + sqlite3_tokenizer_cursor *pCursor, + const char **ppToken, int *pnBytes, + int *piStartOffset, int *piEndOffset, int *piPosition +){ + simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor; + simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer; + int ii; + + while( c->pCurrent-c->pInputnBytes ){ + int n = (int) strcspn(c->pCurrent, t->zDelim); + if( n>0 ){ + if( n+1>c->nTokenAllocated ){ + c->zToken = realloc(c->zToken, n+1); + } + for(ii=0; iizToken[ii] = tolower(c->pCurrent[ii]); + } + c->zToken[n] = '\0'; + *ppToken = c->zToken; + *pnBytes = n; + *piStartOffset = (int) (c->pCurrent-c->pInput); + *piEndOffset = *piStartOffset+n; + *piPosition = c->iToken++; + c->pCurrent += n + 1; + + return SQLITE_OK; + } + c->pCurrent += n + 1; + /* TODO(shess) could strspn() to skip delimiters en masse. Needs + ** to happen in two places, though, which is annoying. + */ + } + return SQLITE_DONE; +} + +static sqlite3_tokenizer_module simpleTokenizerModule = { + 0, + simpleCreate, + simpleDestroy, + simpleOpen, + simpleClose, + simpleNext, +}; + +void get_simple_tokenizer_module( + sqlite3_tokenizer_module **ppModule +){ + *ppModule = &simpleTokenizerModule; +} diff --git a/ext/fts1/tokenizer.h b/ext/fts1/tokenizer.h new file mode 100644 index 0000000000..1d7bd1f670 --- /dev/null +++ b/ext/fts1/tokenizer.h @@ -0,0 +1,89 @@ +/* +** 2006 July 10 +** +** The author disclaims copyright to this source code. +** +************************************************************************* +** Defines the interface to tokenizers used by fulltext-search. There +** are three basic components: +** +** sqlite3_tokenizer_module is a singleton defining the tokenizer +** interface functions. This is essentially the class structure for +** tokenizers. +** +** sqlite3_tokenizer is used to define a particular tokenizer, perhaps +** including customization information defined at creation time. +** +** sqlite3_tokenizer_cursor is generated by a tokenizer to generate +** tokens from a particular input. +*/ +#ifndef _TOKENIZER_H_ +#define _TOKENIZER_H_ + +/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time. +** If tokenizers are to be allowed to call sqlite3_*() functions, then +** we will need a way to register the API consistently. +*/ +#include "sqlite3.h" + +/* +** Structures used by the tokenizer interface. +*/ +typedef struct sqlite3_tokenizer sqlite3_tokenizer; +typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor; +typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module; + +struct sqlite3_tokenizer_module { + int iVersion; /* currently 0 */ + + /* + ** Create and destroy a tokenizer. argc/argv are passed down from + ** the fulltext virtual table creation to allow customization. + */ + int (*xCreate)(int argc, const char **argv, + sqlite3_tokenizer **ppTokenizer); + int (*xDestroy)(sqlite3_tokenizer *pTokenizer); + + /* + ** Tokenize a particular input. Call xOpen() to prepare to + ** tokenize, xNext() repeatedly until it returns SQLITE_DONE, then + ** xClose() to free any internal state. The pInput passed to + ** xOpen() must exist until the cursor is closed. The ppToken + ** result from xNext() is only valid until the next call to xNext() + ** or until xClose() is called. + */ + /* TODO(shess) current implementation requires pInput to be + ** nul-terminated. This should either be fixed, or pInput/nBytes + ** should be converted to zInput. + */ + int (*xOpen)(sqlite3_tokenizer *pTokenizer, + const char *pInput, int nBytes, + sqlite3_tokenizer_cursor **ppCursor); + int (*xClose)(sqlite3_tokenizer_cursor *pCursor); + int (*xNext)(sqlite3_tokenizer_cursor *pCursor, + const char **ppToken, int *pnBytes, + int *piStartOffset, int *piEndOffset, int *piPosition); +}; + +struct sqlite3_tokenizer { + sqlite3_tokenizer_module *pModule; /* The module for this tokenizer */ + /* Tokenizer implementations will typically add additional fields */ +}; + +struct sqlite3_tokenizer_cursor { + sqlite3_tokenizer *pTokenizer; /* Tokenizer for this cursor. */ + /* Tokenizer implementations will typically add additional fields */ +}; + +/* +** Get the module for a tokenizer which generates tokens based on a +** set of non-token characters. The default is to break tokens at any +** non-alnum character, though the set of delimiters can also be +** specified by the first argv argument to xCreate(). +*/ +/* TODO(shess) This doesn't belong here. Need some sort of +** registration process. +*/ +void get_simple_tokenizer_module(sqlite3_tokenizer_module **ppModule); + +#endif /* _TOKENIZER_H_ */ diff --git a/manifest b/manifest index 9d1f1d1ffa..9e591912d8 100644 --- a/manifest +++ b/manifest @@ -1,5 +1,5 @@ -C Add\sthe\snew\sexperimental\ssqlite3_auto_extension()\sAPI.\s(CVS\s3362) -D 2006-08-23T20:07:21 +C A\sfirst\simplementation\sof\sa\sfull-text\ssearch\smodule\sfor\sSQLite.\s(CVS\s3363) +D 2006-08-23T23:58:50 F Makefile.in 8e7f9ecebab2c6e0f3db20ff129a8f9405ab64f8 F Makefile.linux-gcc 2d8574d1ba75f129aba2019f0b959db380a90935 F README 9c4e2d6706bdcc3efdd773ce752a8cdab4f90028 @@ -19,6 +19,12 @@ F doc/lemon.html f0f682f50210928c07e562621c3b7e8ab912a538 F doc/report1.txt a031aaf37b185e4fa540223cb516d3bccec7eeac F ext/README.txt 913a7bd3f4837ab14d7e063304181787658b14e1 F ext/fts1/README.txt 20ac73b006a70bcfd80069bdaf59214b6cf1db5e +F ext/fts1/ft_hash.c 3927bd880e65329bdc6f506555b228b28924921b +F ext/fts1/ft_hash.h 1a35e654a235c2c662d3ca0dfc3138ad60b8b7d5 +F ext/fts1/fulltext.c 373ee2d9d79eb46f09e95ca01d88b6ce62f85821 +F ext/fts1/fulltext.h 08525a47852d1d62a0be81d3fc3fe2d23b094efd +F ext/fts1/simple_tokenizer.c 289b7f35bf692e85834a7b599fd0a136e415a9eb +F ext/fts1/tokenizer.h 0c53421b832366d20d720d21ea3e1f6e66a36ef9 F install-sh 9d4de14ab9fb0facae2f48780b874848cbf2f895 F ltmain.sh f6b283068efa69f06eb8aa1fe4bddfdbdeb35826 F main.mk 22a0c92f24ffc377c41ac9d7d63ae3cbb813e532 @@ -381,7 +387,7 @@ F www/tclsqlite.tcl bb0d1357328a42b1993d78573e587c6dcbc964b9 F www/vdbe.tcl 87a31ace769f20d3627a64fa1fade7fed47b90d0 F www/version3.tcl 890248cf7b70e60c383b0e84d77d5132b3ead42b F www/whentouse.tcl 97e2b5cd296f7d8057e11f44427dea8a4c2db513 -P 533154099c9fe1238705eea03aba388dd71dc35e -R 6e117482355c04ea4130650a5df55269 -U drh -Z 4fd4520cd0f576a9a4485f55b2ce4752 +P a85fc877eb8c92bbb79ac9b7fa91fb362f37cdf7 +R 04b40b5b3a8b4af8436a461b6264fde6 +U adamd +Z e3809b70bf4ea26198337b68886699ce diff --git a/manifest.uuid b/manifest.uuid index 994196aae6..149816c807 100644 --- a/manifest.uuid +++ b/manifest.uuid @@ -1 +1 @@ -a85fc877eb8c92bbb79ac9b7fa91fb362f37cdf7 \ No newline at end of file +b0d8e0d314d6f77b7d4b5dd00c694a1323f7a8e4 \ No newline at end of file