** * The FTS2 module is being built into the core of
** SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
*/
+
+/* TODO(shess) Consider exporting this comment to an HTML file or the
+** wiki.
+*/
+/* The full-text index is stored in a series of b+tree (-like)
+** structures called segments which map terms to doclists. The
+** structures are like b+trees in layout, but are constructed from the
+** bottom up in optimal fashion and are not updatable. Since trees
+** are built from the bottom up, things will be described from the
+** bottom up.
+**
+**
+**** Varints ****
+** The basic unit of encoding is a variable-length integer called a
+** varint. We encode variable-length integers in little-endian order
+** using seven bits per byte as follows:
+**
+** KEY:
+**  A = 0xxxxxxx    7 bits of data and one flag bit
+**  B = 1xxxxxxx    7 bits of data and one flag bit
+**
+**   7 bits - A
+**  14 bits - BA
+**  21 bits - BBA
+** and so on.
+**
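+** As a worked example, 300 (binary 100101100) encodes as two bytes,
+** low-order bits first: 0xAC (10101100: flag bit set, seven data
+** bits 0101100 = 44), then 0x02 (00000010: flag clear, data 2),
+** since 44 + 2*128 = 300.
+**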
+** This is similar to how sqlite encodes varints (see util.c), except
+** that sqlite's encoding is big-endian (most-significant bits first)
+** while this encoding is little-endian.
+**
+**
+**** Document lists ****
+** A doclist (document list) holds a docid-sorted list of hits for a
+** given term. Doclists hold docids, and can optionally associate
+** token positions and offsets with docids.
+**
+** A DL_POSITIONS_OFFSETS doclist is stored like this:
+**
+** array {
+**   varint docid;
+**   array {                  (position list for column 0)
+**     varint position;       (delta from previous position plus POS_BASE)
+**     varint startOffset;    (delta from previous startOffset)
+**     varint endOffset;      (delta from startOffset)
+**   }
+**   array {
+**     varint POS_COLUMN;     (marks start of position list for new column)
+**     varint column;         (index of new column)
+**     array {
+**       varint position;     (delta from previous position plus POS_BASE)
+**       varint startOffset;  (delta from previous startOffset)
+**       varint endOffset;    (delta from startOffset)
+**     }
+**   }
+**   varint POS_END;          (marks end of positions for this document)
+** }
+**
+** Here, array { X } means zero or more occurrences of X, adjacent in
+** memory. A "position" is an index of a token in the token stream
+** generated by the tokenizer, while an "offset" is a byte offset,
+** both based at 0. Note that POS_END and POS_COLUMN occur in the
+** same logical place as the position element, and act as sentinels
+** ending a position list array.
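+**
+** For example, a DL_POSITIONS entry for docid 42 with hits at token
+** positions 1 and 3 of column 0 is encoded as:
+**
+**   varint(42) varint(1+POS_BASE) varint(3-1+POS_BASE) varint(POS_END)
+**
+** with each position stored as a delta from its predecessor.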
+**
+** A DL_POSITIONS doclist omits the startOffset and endOffset
+** information. A DL_DOCIDS doclist omits both the position and
+** offset information, becoming an array of varint-encoded docids.
+**
+** On-disk data is stored as type DL_DEFAULT, so we don't serialize
+** the type. Due to how deletion is implemented in the segmentation
+** system, on-disk doclists MUST store at least positions: a deletion
+** is written as a docid with an empty position list, which a
+** DL_DOCIDS doclist has no way to represent.
+**
+** TODO(shess) Delta-encode docids. This provides a 10% win versus
+** DL_POSITIONS_OFFSETS on the first 100,000 documents of the Enron
+** corpus, greater versus DL_POSITIONS.
+**
+**
+**** Segment leaf nodes ****
+** Segment leaf nodes store terms and doclists, ordered by term. Leaf
+** nodes are written using LeafWriter, and read using LeafReader (to
+** iterate through a single leaf node's data) and LeavesReader (to
+** iterate through a segment's entire leaf layer). Leaf nodes have
+** the format:
+**
+** varint iHeight;             (height from leaf level, always 0)
+** varint nTerm;               (length of first term)
+** char pTerm[nTerm];          (content of first term)
+** varint nDoclist;            (length of term's associated doclist)
+** char pDoclist[nDoclist];    (content of doclist)
+** array {
+**                             (further terms are delta-encoded)
+**   varint nPrefix;           (length of prefix shared with previous term)
+**   varint nSuffix;           (length of unshared suffix)
+**   char pTermSuffix[nSuffix];(unshared suffix of next term)
+**   varint nDoclist;          (length of term's associated doclist)
+**   char pDoclist[nDoclist];  (content of doclist)
+** }
+**
+** Here, array { X } means zero or more occurrences of X, adjacent in
+** memory.
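+**
+** For example, if "perform" is followed by "performance", the second
+** term is encoded as nPrefix=7, nSuffix=4, pTermSuffix="ance".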
+**
+** Leaf nodes are broken into blocks which are stored contiguously in
+** the %_segments table in sorted order. This means that when the end
+** of a node is reached, the next term is in the node with the next
+** greater node id.
+**
+** New data is spilled to a new leaf node when the current node
+** exceeds LEAF_MAX bytes (default 2048). New data which itself is
+** larger than STANDALONE_MIN (default 1024) is placed in a standalone
+** node (a leaf node with a single term and doclist). The goal of
+** these settings is to pack together groups of small doclists while
+** making it efficient to directly access large doclists. The
+** assumption is that large doclists represent terms which are more
+** likely to be query targets.
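+**
+** For example, with the defaults a term carrying a 1200-byte doclist
+** is written to its own standalone leaf node, while terms carrying
+** 100-byte doclists are packed on the order of twenty to a node.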
+**
+** TODO(shess) It may be useful for blocking decisions to be more
+** dynamic. For instance, it may make more sense to have a 2.5k leaf
+** node rather than splitting into 2k and .5k nodes. My intuition is
+** that this might extend through 2x or 4x the pagesize.
+**
+**
+**** Segment interior nodes ****
+** Segment interior nodes store blockids for subtree nodes and terms
+** to describe what data is stored by each subtree. Interior
+** nodes are written using InteriorWriter, and read using
+** InteriorReader. InteriorWriters are created as needed when
+** SegmentWriter creates new leaf nodes, or when an interior node
+** itself grows too big and must be split. The format of interior
+** nodes:
+**
+** varint iHeight;           (height from leaf level, always >0)
+** varint iBlockid;          (block id of node's leftmost subtree)
+** array {
+**   varint nTerm;           (length of term)
+**   char pTerm[nTerm];      (content of term)
+** }
+**
+** Here, array { X } means zero or more occurrences of X, adjacent in
+** memory.
+**
+** An interior node encodes n terms separating n+1 subtrees. The
+** subtree blocks are contiguous, so only the first subtree's blockid
+** is encoded. The subtree at iBlockid will contain all terms less
+** than the first term encoded (or all terms if no term is encoded).
+** Otherwise, for terms greater than or equal to pTerm[i] but less
+** than pTerm[i+1], the subtree for that term will be rooted at
+** iBlockid+i.
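+**
+** For example, an interior node at iBlockid 10 encoding the terms
+** "m" and "t" describes three subtrees: block 10 holds terms less
+** than "m", block 11 holds ["m", "t"), and block 12 holds terms
+** greater than or equal to "t".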
+**
+** New data is spilled to a new interior node at the same height when
+** the current node exceeds INTERIOR_MAX bytes (default 2048). The
+** interior nodes at a given height are naturally tracked by interior
+** nodes at height+1, and so on.
+**
+**
+**** Segment directory ****
+** The segment directory in table %_segdir stores meta-information for
+** merging and deleting segments, and also the root node of the
+** segment's tree.
+**
+** The root node is the top node of the segment's tree after encoding
+** the entire segment, restricted to ROOT_MAX bytes (default 1024).
+** This could be either a leaf node or an interior node. If the top
+** node requires more than ROOT_MAX bytes, it is flushed to %_segments
+** and a new root interior node is generated (which should always fit
+** within ROOT_MAX because it only needs space for 2 varints, the
+** height and the blockid of the previous root).
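+**
+** For example, if a root of height 2 is flushed to blockid 123, the
+** replacement root is just varint(3) varint(123): an interior node
+** encoding no terms and therefore describing a single subtree.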
+**
+** The meta-information in the segment directory is:
+**   level            - segment level (see below)
+**   idx              - index within level
+**                    - (level,idx uniquely identify a segment)
+**   start_block      - first leaf node
+**   leaves_end_block - last leaf node
+**   end_block        - last block (including interior nodes)
+**   root             - contents of root node
+**
+** If the root node is a leaf node, then start_block,
+** leaves_end_block, and end_block are all 0.
+**
+**
+**** Segment merging ****
+** To amortize update costs, segments are grouped into levels and
+** merged in batches. Each increase in level represents exponentially
+** more documents.
+**
+** New documents (actually, document updates) are tokenized and
+** written individually (using LeafWriter) to a level 0 segment, with
+** incrementing idx. When idx reaches MERGE_COUNT (default 16), all
+** level 0 segments are merged into a single level 1 segment. Level 1
+** is populated like level 0, and eventually MERGE_COUNT level 1
+** segments are merged to a single level 2 segment (representing
+** MERGE_COUNT^2 updates), and so on.
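+** With the default MERGE_COUNT of 16, for example, a level 3 segment
+** represents 16^3 = 4096 document updates.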
+**
+** A segment merge traverses all segments at a given level in
+** parallel, performing a straightforward sorted merge. Since segment
+** leaf nodes are written into the %_segments table in order, this
+** merge traverses the underlying sqlite disk structures efficiently.
+** After the merge, all segment blocks from the merged level are
+** deleted.
+**
+** MERGE_COUNT controls how often we merge segments. 16 seems to be
+** somewhat of a sweet spot for insertion performance. 32 and 64 show
+** very similar performance numbers to 16 on insertion, though they're
+** a tiny bit slower (perhaps due to more overhead in merge-time
+** sorting). 8 is about 20% slower than 16, 4 about 50% slower than
+** 16, 2 about 66% slower than 16.
+**
+** At query time, high MERGE_COUNT increases the number of segments
+** which need to be scanned and merged. For instance, with 100k docs
+** inserted:
+**
+**    MERGE_COUNT   segments
+**        16           25
+**         8           12
+**         4           10
+**         2            6
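+**
+** (The segment count here is the sum of the digits of 100k written
+** in base MERGE_COUNT: a level-N segment covers MERGE_COUNT^N
+** updates, so each base-MERGE_COUNT digit of the update count is the
+** number of live segments at that level.)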
+**
+** This appears to have only a moderate impact on queries for very
+** frequent terms (which are somewhat dominated by segment merge
+** costs), and infrequent and non-existent terms still seem to be fast
+** even with many segments.
+**
+** TODO(shess) That said, it would be nice to have a better query-side
+** argument for MERGE_COUNT of 16. Also, it's possible/likely that
+** optimizations to things like doclist merging will swing the sweet
+** spot around.
+**
+**
+**
+**** Handling of deletions and updates ****
+** Since we're using a segmented structure, with no docid-oriented
+** index into the term index, we clearly cannot simply update the term
+** index when a document is deleted or updated. For deletions, we
+** write an empty doclist (varint(docid) varint(POS_END)), for updates
+** we simply write the new doclist. Segment merges overwrite older
+** data for a particular docid with newer data, so deletes or updates
+** will eventually overtake the earlier data and knock it out. The
+** query logic likewise merges doclists so that newer data knocks out
+** older data.
+**
+** TODO(shess) Provide a VACUUM type operation to clear out all
+** deletions and duplications. This would basically be a forced merge
+** into a single segment.
+*/
+
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
#if defined(SQLITE_ENABLE_FTS2) && !defined(SQLITE_CORE)
#include <assert.h>
#if !defined(__APPLE__)
#include <malloc.h>
-#else
-#include <stdlib.h>
#endif
+#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
SQLITE_EXTENSION_INIT1
+/* TODO(shess) MAN, this thing needs some refactoring. At minimum, it
+** would be nice to order the file better, perhaps something along the
+** lines of:
+**
+** - utility functions
+** - table setup functions
+** - table update functions
+** - table query functions
+**
+** Put the query functions last because they're likely to reference
+** typedefs or functions from the table update section.
+*/
+
#if 0
# define TRACE(A) printf A; fflush(stdout)
#else
nappend(sb, zFrom, strlen(zFrom));
}
-/* We encode variable-length integers in little-endian order using seven bits
- * per byte as follows:
-**
-** KEY:
-** A = 0xxxxxxx 7 bits of data and one flag bit
-** B = 1xxxxxxx 7 bits of data and one flag bit
+/* Helper functions for certain common memory-allocation idioms:
**
-** 7 bits - A
-** 14 bits - BA
-** 21 bits - BBA
-** and so on.
+** data_dup() - malloc+memcpy to duplicate a buffer
+** data_replace() - realloc+memcpy to dup a buffer over an existing buffer
+** data_append() - realloc+memcpy to append data to an existing buffer
+** data_append2() - shorthand for calling data_append() twice.
*/
+/* TODO(shess) There is a "block of binary data on the heap" construct
+** in here which could be shared with (at least) the StringBuffer and
+** DocList constructs.
+*/
+static void data_replace(char **ppTarget, int *pnTarget,
+ const char *pSource, int nSource){
+ *ppTarget = realloc(*ppTarget, nSource);
+ memcpy(*ppTarget, pSource, nSource);
+ *pnTarget = nSource;
+}
+
+static void data_dup(char **ppTarget, int *pnTarget,
+ const char *pSource, int nSource){
+ *ppTarget = malloc(nSource);
+ memcpy(*ppTarget, pSource, nSource);
+ *pnTarget = nSource;
+}
+
+static void data_append(char **ppTarget, int *pnTarget,
+ const char *pSource, int nSource){
+ *ppTarget = realloc(*ppTarget, *pnTarget+nSource);
+ memcpy(*ppTarget+*pnTarget, pSource, nSource);
+ *pnTarget += nSource;
+}
+
+static void data_append2(char **ppTarget, int *pnTarget,
+ const char *pSource1, int nSource1,
+ const char *pSource2, int nSource2){
+ *ppTarget = realloc(*ppTarget, *pnTarget+nSource1+nSource2);
+ memcpy(*ppTarget+*pnTarget, pSource1, nSource1);
+ memcpy(*ppTarget+*pnTarget+nSource1, pSource2, nSource2);
+ *pnTarget += nSource1+nSource2;
+}
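+
+/* Usage sketch for the helpers above (illustration only, not called
+** anywhere): build the buffer "abcde" on the heap.
+**
+**   char *p = NULL;
+**   int n = 0;
+**   data_dup(&p, &n, "abc", 3);       p/n now hold "abc"
+**   data_append(&p, &n, "de", 2);     p/n now hold "abcde"
+**   free(p);
+**
+** Note that these helpers assume the allocations succeed;
+** malloc()/realloc() results are not checked.
+*/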
/* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */
#define VARINT_MAX 10
return ret;
}
-/*** Document lists ***
- *
- * A document list holds a sorted list of varint-encoded document IDs.
- *
- * A doclist with type DL_POSITIONS_OFFSETS is stored like this:
- *
- * array {
- * varint docid;
- * array {
- * varint position; (delta from previous position plus POS_BASE)
- * varint startOffset; (delta from previous startOffset)
- * varint endOffset; (delta from startOffset)
- * }
- * }
- *
- * Here, array { X } means zero or more occurrences of X, adjacent in memory.
- *
- * A position list may hold positions for text in multiple columns. A position
- * POS_COLUMN is followed by a varint containing the index of the column for
- * following positions in the list. Any positions appearing before any
- * occurrences of POS_COLUMN are for column 0.
- *
- * A doclist with type DL_POSITIONS is like the above, but holds only docids
- * and positions without offset information.
- *
- * A doclist with type DL_DOCIDS is like the above, but holds only docids
- * without positions or offset information.
- *
- * On disk, every document list has positions and offsets, so we don't bother
- * to serialize a doclist's type.
- *
- * We don't yet delta-encode document IDs; doing so will probably be a
- * modest win.
- *
- * NOTE(shess) I've thought of a slightly (1%) better offset encoding.
- * After the first offset, estimate the next offset by using the
- * current token position and the previous token position and offset,
- * offset to handle some variance. So the estimate would be
- * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded
- * as normal. Offsets more than 64 chars from the estimate are
- * encoded as the delta to the previous start offset + 128. An
- * additional tiny increment can be gained by using the end offset of
- * the previous token to make the estimate a tiny bit more precise.
-*/
-
typedef enum DocListType {
DL_DOCIDS, /* docids only */
DL_POSITIONS, /* docids + positions */
**
** -DDL_DEFAULT=DL_POSITIONS_OFFSETS
**
+** If DL_DEFAULT is set to DL_DOCIDS, your table can only be inserted
+** into (no deletes or updates).
*/
#ifndef DL_DEFAULT
# define DL_DEFAULT DL_POSITIONS
POS_BASE
};
-/* Initialize a new DocList to hold the given data. */
-static void docListInit(DocList *d, DocListType iType,
- const char *pData, int nData){
+/* TODO(shess) I think it might be time to refactor the doclist
+** manipulation. Broadly put, there are four fairly discrete clients,
+** tokenization, insert-time segment merging, query-time segment
+** merging and query-time analysis. The breakdown I think might be
+** reasonable would be:
+**
+** DocListReader - Wraps const char *pData, int nData.
+** Used to traverse doclists
+** DocListWriter - Starts empty, can add complete doclist elements.
+** Used in merging doclists.
+** DocBuilder - Used when tokenizing documents.
+*/
+
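+/* Shared initialization: point the DocList at pData[nData] and reset
+** the column/position/offset bookkeeping. Callers decide whether
+** pData is owned by the DocList (docListInit) or merely referenced
+** (docListStaticInit).
+*/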
+static void docListCoreInit(DocList *d, DocListType iType,
+ char *pData, int nData){
d->nData = nData;
- if( nData>0 ){
- d->pData = malloc(nData);
- memcpy(d->pData, pData, nData);
- } else {
- d->pData = NULL;
- }
+ d->pData = pData;
d->iType = iType;
d->iLastColumn = 0;
d->iLastPos = d->iLastOffset = 0;
}
+/* Initialize a new DocList pointing to static data. Don't call
+** docListDestroy() to release, just free(d) (if you originally
+** malloced d).
+*/
+static void docListStaticInit(DocList *d, DocListType iType,
+ const char *pData, int nData){
+ docListCoreInit(d, iType, (char *)pData, nData);
+}
+
+/* Initialize a new DocList to hold a copy of the given data. */
+static void docListInit(DocList *d, DocListType iType,
+ const char *pData, int nData){
+ char *p = 0;
+ if( nData>0 ){
+ p = malloc(nData);
+ memcpy(p, pData, nData);
+ }
+ docListCoreInit(d, iType, p, nData);
+}
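+
+/* Usage sketch (illustration only): docListStaticInit() wraps a
+** buffer which the caller keeps alive, such as a blob column while
+** its row is current, avoiding a copy:
+**
+**   DocList dl;
+**   docListStaticInit(&dl, DL_DEFAULT,
+**                     sqlite3_column_blob(s, 0),
+**                     sqlite3_column_bytes(s, 0));
+**
+** docListInit() instead takes a private copy which must be released
+** with docListDestroy().
+*/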
+
/* Create a new dynamically-allocated DocList. */
static DocList *docListNew(DocListType iType){
DocList *d = (DocList *) malloc(sizeof(DocList));
return !atEnd(pReader) && d==iDocid;
}
-/* Return the first document in a document list.
-*/
-static sqlite_int64 firstDocid(DocList *d){
- DocListReader r;
- readerInit(&r, d);
- return readDocid(&r);
-}
-
#ifdef SQLITE_DEBUG
/*
** This routine is used for debugging purpose only.
*in = out;
}
-/* Helper function for docListUpdate() and docListAccumulate().
-** Splices a doclist element into the doclist represented by r,
-** leaving r pointing after the newly spliced element.
+/* Efficiently merge left and right into out, with duplicated docids
+** from right overwriting those in left (left is effectively older
+** than right). The previous code had a memmove() which introduced
+** O(N^2) behavior into merges, while this code should be O(N).
*/
-static void docListSpliceElement(DocListReader *r, sqlite_int64 iDocid,
- const char *pSource, int nSource){
- DocList *d = r->pDoclist;
- char *pTarget;
- int nTarget, found;
+static void docListMerge(DocList *out, DocList *left, DocList *right){
+ DocListReader leftReader, rightReader;
+ int iData = 0;
+#ifndef NDEBUG
+ /* Track these to make certain that every byte is processed. */
+ int nLeftProcessed = 0, nRightProcessed = 0;
+#endif
- found = skipToDocid(r, iDocid);
+ assert( left->iType==right->iType );
- /* Describe slice in d to place pSource/nSource. */
- pTarget = r->p;
- if( found ){
- skipDocument(r);
- nTarget = r->p-pTarget;
- }else{
- nTarget = 0;
+ /* Handle edge cases. */
+ /* TODO(shess) Consider simply forbidding edge cases, in the
+ ** interests of saving copies.
+ */
+ if( left->nData==0 ){
+ docListInit(out, right->iType, right->pData, right->nData);
+ return;
+ }else if( right->nData==0 ){
+ docListInit(out, left->iType, left->pData, left->nData);
+ return;
}
+ docListInit(out, left->iType, 0, 0);
- /* The sense of the following is that there are three possibilities.
- ** If nTarget==nSource, we should not move any memory nor realloc.
- ** If nTarget>nSource, trim target and realloc.
- ** If nTarget<nSource, realloc then expand target.
+ /* At this time, the sum of the space of the inputs is an upper
+ ** bound. *out can end up smaller if elements of *right overwrite
+ ** elements of *left, but never larger.
*/
- if( nTarget>nSource ){
- memmove(pTarget+nSource, pTarget+nTarget, docListEnd(d)-(pTarget+nTarget));
- }
- if( nTarget!=nSource ){
- int iDoclist = pTarget-d->pData;
- d->pData = realloc(d->pData, d->nData+nSource-nTarget);
- pTarget = d->pData+iDoclist;
- }
- if( nTarget<nSource ){
- memmove(pTarget+nSource, pTarget+nTarget, docListEnd(d)-(pTarget+nTarget));
+ out->nData = left->nData+right->nData;
+ out->pData = malloc(out->nData);
+
+ readerInit(&leftReader, left);
+ readerInit(&rightReader, right);
+
+ while( !atEnd(&leftReader) && !atEnd(&rightReader) ){
+ sqlite_int64 iLeftDocid = peekDocid(&leftReader);
+ sqlite_int64 iRightDocid = peekDocid(&rightReader);
+ const char *pStart, *pEnd;
+
+ if( iLeftDocid<iRightDocid ){
+ /* Copy from *left where less than iRightDocid. */
+ pStart = leftReader.p;
+ skipToDocid(&leftReader, iRightDocid);
+ pEnd = leftReader.p;
+#ifndef NDEBUG
+ nLeftProcessed += pEnd-pStart;
+#endif
+ }else{
+ /* Copy from *right where less than iLeftDocid, plus the element
+ ** matching iLeftDocid, if present. Also drop the matching
+ ** element from *left.
+ */
+ pStart = rightReader.p;
+ if( skipToDocid(&rightReader, iLeftDocid) ){
+#ifndef NDEBUG
+ const char *pLeftStart = leftReader.p;
+#endif
+ skipDocument(&leftReader);
+ skipDocument(&rightReader);
+#ifndef NDEBUG
+ nLeftProcessed += leftReader.p-pLeftStart;
+#endif
+ }
+ pEnd = rightReader.p;
+#ifndef NDEBUG
+ nRightProcessed += pEnd-pStart;
+#endif
+ }
+ assert( pEnd>pStart );
+ assert( iData+pEnd-pStart<=out->nData );
+ memcpy(out->pData+iData, pStart, pEnd-pStart);
+ iData += pEnd-pStart;
}
- memcpy(pTarget, pSource, nSource);
- d->nData += nSource-nTarget;
- r->p = pTarget+nSource;
-}
-
-/* Insert/update pUpdate into the doclist. */
-static void docListUpdate(DocList *d, DocList *pUpdate){
- DocListReader reader;
-
- assert( d!=NULL && pUpdate!=NULL );
- assert( d->iType==pUpdate->iType);
-
- readerInit(&reader, d);
- docListSpliceElement(&reader, firstDocid(pUpdate),
- pUpdate->pData, pUpdate->nData);
-}
-
-/* Propagate elements from pUpdate to pAcc, overwriting elements with
-** matching docids.
-*/
-static void docListAccumulate(DocList *pAcc, DocList *pUpdate){
- DocListReader accReader, updateReader;
-
- /* Handle edge cases where one doclist is empty. */
- assert( pAcc!=NULL );
- if( pUpdate==NULL || pUpdate->nData==0 ) return;
- if( pAcc->nData==0 ){
- pAcc->pData = malloc(pUpdate->nData);
- memcpy(pAcc->pData, pUpdate->pData, pUpdate->nData);
- pAcc->nData = pUpdate->nData;
- return;
+ if( !atEnd(&leftReader) ){
+ int n = left->nData-(leftReader.p-left->pData);
+ assert( atEnd(&rightReader) );
+ memcpy(out->pData+iData, leftReader.p, n);
+ iData += n;
+#ifndef NDEBUG
+ nLeftProcessed += n;
+#endif
+ }else if( !atEnd(&rightReader) ){
+ int n = right->nData-(rightReader.p-right->pData);
+ memcpy(out->pData+iData, rightReader.p, n);
+ iData += n;
+#ifndef NDEBUG
+ nRightProcessed += n;
+#endif
}
+ out->nData = iData;
+ out->pData = realloc(out->pData, out->nData);
- readerInit(&accReader, pAcc);
- readerInit(&updateReader, pUpdate);
-
- while( !atEnd(&updateReader) ){
- char *pSource = updateReader.p;
- sqlite_int64 iDocid = readDocid(&updateReader);
- skipPositionList(&updateReader);
- docListSpliceElement(&accReader, iDocid, pSource, updateReader.p-pSource);
- }
+ assert( nLeftProcessed==left->nData );
+ assert( nRightProcessed==right->nData );
}
/*
QUERY_FULLTEXT /* QUERY_FULLTEXT + [i] is a full-text search for column i*/
} QueryType;
-/* TODO(shess) CHUNK_MAX controls how much data we allow in segment 0
-** before we start aggregating into larger segments. Lower CHUNK_MAX
-** means that for a given input we have more individual segments per
-** term, which means more rows in the table and a bigger index (due to
-** both more rows and bigger rowids). But it also reduces the average
-** cost of adding new elements to the segment 0 doclist, and it seems
-** to reduce the number of pages read and written during inserts. 256
-** was chosen by measuring insertion times for a certain input (first
-** 10k documents of Enron corpus), though including query performance
-** in the decision may argue for a larger value.
-*/
-#define CHUNK_MAX 256
-
typedef enum fulltext_statement {
CONTENT_INSERT_STMT,
CONTENT_SELECT_STMT,
CONTENT_UPDATE_STMT,
CONTENT_DELETE_STMT,
- TERM_SELECT_STMT,
- TERM_SELECT_ALL_STMT,
- TERM_INSERT_STMT,
- TERM_UPDATE_STMT,
- TERM_DELETE_STMT,
+ BLOCK_INSERT_STMT,
+ BLOCK_SELECT_STMT,
+ BLOCK_DELETE_STMT,
+
+ SEGDIR_MAX_INDEX_STMT,
+ SEGDIR_SET_STMT,
+ SEGDIR_SELECT_STMT,
+ SEGDIR_SPAN_STMT,
+ SEGDIR_DELETE_STMT,
+ SEGDIR_SELECT_ALL_STMT,
MAX_STMT /* Always at end! */
} fulltext_statement;
/* These must exactly match the enum above. */
-/* TODO(adam): Is there some risk that a statement (in particular,
-** pTermSelectStmt) will be used in two cursors at once, e.g. if a
-** query joins a virtual table to itself? If so perhaps we should
-** move some of these to the cursor object.
+/* TODO(shess): Is there some risk that a statement will be used in two
+** cursors at once, e.g. if a query joins a virtual table to itself?
+** If so perhaps we should move some of these to the cursor object.
*/
static const char *const fulltext_zStatement[MAX_STMT] = {
/* CONTENT_INSERT */ NULL, /* generated in contentInsertStatement() */
/* CONTENT_UPDATE */ NULL, /* generated in contentUpdateStatement() */
/* CONTENT_DELETE */ "delete from %_content where rowid = ?",
- /* TERM_SELECT */
- "select rowid, doclist from %_term where term = ? and segment = ?",
- /* TERM_SELECT_ALL */
- "select doclist from %_term where term = ? order by segment",
- /* TERM_INSERT */
- "insert into %_term (rowid, term, segment, doclist) values (?, ?, ?, ?)",
- /* TERM_UPDATE */ "update %_term set doclist = ? where rowid = ?",
- /* TERM_DELETE */ "delete from %_term where rowid = ?",
+ /* BLOCK_INSERT */ "insert into %_segments values (?)",
+ /* BLOCK_SELECT */ "select block from %_segments where rowid = ?",
+ /* BLOCK_DELETE */ "delete from %_segments where rowid between ? and ?",
+
+ /* SEGDIR_MAX_INDEX */ "select max(idx) from %_segdir where level = ?",
+ /* SEGDIR_SET */ "insert into %_segdir values (?, ?, ?, ?, ?, ?)",
+ /* SEGDIR_SELECT */
+ "select start_block, leaves_end_block, root from %_segdir "
+ " where level = ? order by idx",
+ /* SEGDIR_SPAN */
+ "select min(start_block), max(end_block) from %_segdir "
+ " where level = ? and start_block <> 0",
+ /* SEGDIR_DELETE */ "delete from %_segdir where level = ?",
+ /* SEGDIR_SELECT_ALL */ "select root from %_segdir order by level desc, idx",
};
+/* MERGE_COUNT controls how often we merge segments (see comment at
+** top of file).
+*/
+#define MERGE_COUNT 16
+
/*
** A connection to a fulltext index is an instance of the following
** structure. The xCreate and xConnect methods create an instance
** open.
*/
sqlite3_stmt *pFulltextStatements[MAX_STMT];
+
+ /* Precompiled statements used for segment merges. We run a
+ ** separate select across the leaf level of each tree being merged.
+ */
+ sqlite3_stmt *pLeafSelectStmts[MERGE_COUNT];
+ /* The statement used to prepare pLeafSelectStmts. */
+#define LEAF_SELECT \
+ "select block from %_segments where rowid between ? and ? order by rowid"
};
/*
return (rc==SQLITE_DONE) ? SQLITE_OK : rc;
}
+/* Like sql_get_statement(), but for special replicated LEAF_SELECT
+** statements.
+*/
+/* TODO(shess) Write version for generic statements and then share
+** that between the cached-statement functions.
+*/
+static int sql_get_leaf_statement(fulltext_vtab *v, int idx,
+ sqlite3_stmt **ppStmt){
+ assert( idx>=0 && idx<MERGE_COUNT );
+ if( v->pLeafSelectStmts[idx]==NULL ){
+ int rc = sql_prepare(v->db, v->zName, &v->pLeafSelectStmts[idx],
+ LEAF_SELECT);
+ if( rc!=SQLITE_OK ) return rc;
+ }else{
+ int rc = sqlite3_reset(v->pLeafSelectStmts[idx]);
+ if( rc!=SQLITE_OK ) return rc;
+ }
+
+ *ppStmt = v->pLeafSelectStmts[idx];
+ return SQLITE_OK;
+}
+
+/* Like sql_step_statement(), but for special replicated LEAF_SELECT
+** statements.
+*/
+/* TODO(shess) Write version for generic statements and then share
+** that between the cached-statement functions.
+*/
+static int sql_step_leaf_statement(fulltext_vtab *v, int idx,
+ sqlite3_stmt **ppStmt){
+ int rc;
+ sqlite3_stmt *s = *ppStmt;
+
+ while( (rc=sqlite3_step(s))!=SQLITE_DONE && rc!=SQLITE_ROW ){
+ sqlite3_stmt *pNewStmt;
+
+ if( rc==SQLITE_BUSY ) continue;
+ if( rc!=SQLITE_ERROR ) return rc;
+
+ rc = sqlite3_reset(s);
+ if( rc!=SQLITE_SCHEMA ) return SQLITE_ERROR;
+
+ v->pLeafSelectStmts[idx] = NULL; /* Still in s */
+ rc = sql_get_leaf_statement(v, idx, &pNewStmt);
+ if( rc!=SQLITE_OK ) goto err;
+ *ppStmt = pNewStmt;
+
+ rc = sqlite3_transfer_bindings(s, pNewStmt);
+ if( rc!=SQLITE_OK ) goto err;
+
+ rc = sqlite3_finalize(s);
+ if( rc!=SQLITE_OK ) return rc;
+ s = pNewStmt;
+ }
+ return rc;
+
+ err:
+ sqlite3_finalize(s);
+ return rc;
+}
+
/* insert into %_content (rowid, ...) values ([rowid], [pValues]) */
static int content_insert(fulltext_vtab *v, sqlite3_value *rowid,
sqlite3_value **pValues){
return sql_single_step_statement(v, CONTENT_DELETE_STMT, &s);
}
-/* select rowid, doclist from %_term
- * where term = [pTerm] and segment = [iSegment]
- * If found, returns SQLITE_ROW; the caller must free the
- * returned doclist. If no rows found, returns SQLITE_DONE. */
-static int term_select(fulltext_vtab *v, const char *pTerm, int nTerm,
- int iSegment,
- sqlite_int64 *rowid, DocList *out){
+/* insert into %_segments values ([pData])
+** returns assigned rowid in *piBlockid
+*/
+static int block_insert(fulltext_vtab *v, const char *pData, int nData,
+ sqlite_int64 *piBlockid){
sqlite3_stmt *s;
- int rc = sql_get_statement(v, TERM_SELECT_STMT, &s);
- if( rc!=SQLITE_OK ) return rc;
-
- rc = sqlite3_bind_text(s, 1, pTerm, nTerm, SQLITE_STATIC);
+ int rc = sql_get_statement(v, BLOCK_INSERT_STMT, &s);
if( rc!=SQLITE_OK ) return rc;
- rc = sqlite3_bind_int(s, 2, iSegment);
+ rc = sqlite3_bind_blob(s, 1, pData, nData, SQLITE_STATIC);
if( rc!=SQLITE_OK ) return rc;
- rc = sql_step_statement(v, TERM_SELECT_STMT, &s);
- if( rc!=SQLITE_ROW ) return rc;
+ rc = sql_step_statement(v, BLOCK_INSERT_STMT, &s);
+ if( rc==SQLITE_ROW ) return SQLITE_ERROR;
+ if( rc!=SQLITE_DONE ) return rc;
- *rowid = sqlite3_column_int64(s, 0);
- docListInit(out, DL_DEFAULT,
- sqlite3_column_blob(s, 1), sqlite3_column_bytes(s, 1));
+ *piBlockid = sqlite3_last_insert_rowid(v->db);
+ return SQLITE_OK;
+}
- /* We expect only one row. We must execute another sqlite3_step()
- * to complete the iteration; otherwise the table will remain locked. */
- rc = sqlite3_step(s);
- return rc==SQLITE_DONE ? SQLITE_ROW : rc;
-}
-
-/* Load the segment doclists for term pTerm and merge them in
-** appropriate order into out. Returns SQLITE_OK if successful. If
-** there are no segments for pTerm, successfully returns an empty
-** doclist in out.
-**
-** Each document consists of 1 or more "columns". The number of
-** columns is v->nColumn. If iColumn==v->nColumn, then return
-** position information about all columns. If iColumn<v->nColumn,
-** then only return position information about the iColumn-th column
-** (where the first column is 0).
-*/
-static int term_select_all(
- fulltext_vtab *v, /* The fulltext index we are querying against */
- int iColumn, /* If <nColumn, only look at the iColumn-th column */
- const char *pTerm, /* The term whose posting lists we want */
- int nTerm, /* Number of bytes in pTerm */
- DocList *out /* Write the resulting doclist here */
-){
- DocList doclist;
+/* delete from %_segments
+** where rowid between [iStartBlockid] and [iEndBlockid]
+**
+** Deletes the range of blocks, inclusive. Used to delete all of the
+** blocks which form a segment.
+*/
+static int block_delete(fulltext_vtab *v,
+ sqlite_int64 iStartBlockid, sqlite_int64 iEndBlockid){
sqlite3_stmt *s;
- int rc = sql_get_statement(v, TERM_SELECT_ALL_STMT, &s);
+ int rc = sql_get_statement(v, BLOCK_DELETE_STMT, &s);
if( rc!=SQLITE_OK ) return rc;
- rc = sqlite3_bind_text(s, 1, pTerm, nTerm, SQLITE_STATIC);
+ rc = sqlite3_bind_int64(s, 1, iStartBlockid);
if( rc!=SQLITE_OK ) return rc;
- docListInit(&doclist, DL_DEFAULT, 0, 0);
+ rc = sqlite3_bind_int64(s, 2, iEndBlockid);
+ if( rc!=SQLITE_OK ) return rc;
- /* TODO(shess) Handle schema and busy errors. */
- while( (rc=sql_step_statement(v, TERM_SELECT_ALL_STMT, &s))==SQLITE_ROW ){
- DocList old;
+ return sql_single_step_statement(v, BLOCK_DELETE_STMT, &s);
+}
- /* TODO(shess) If we processed doclists from oldest to newest, we
- ** could skip the malloc() involved with the following call. For
- ** now, I'd rather keep this logic similar to index_insert_term().
- ** We could additionally drop elements when we see deletes, but
- ** that would require a distinct version of docListAccumulate().
- */
- docListInit(&old, DL_DEFAULT,
- sqlite3_column_blob(s, 0), sqlite3_column_bytes(s, 0));
+/* Returns SQLITE_ROW with *pidx set to the maximum segment idx found
+** at iLevel. Returns SQLITE_DONE if there are no segments at
+** iLevel. Otherwise returns an error.
+*/
+static int segdir_max_index(fulltext_vtab *v, int iLevel, int *pidx){
+ sqlite3_stmt *s;
+ int rc = sql_get_statement(v, SEGDIR_MAX_INDEX_STMT, &s);
+ if( rc!=SQLITE_OK ) return rc;
- if( iColumn<v->nColumn ){ /* querying a single column */
- docListRestrictColumn(&old, iColumn);
- }
+ rc = sqlite3_bind_int(s, 1, iLevel);
+ if( rc!=SQLITE_OK ) return rc;
- /* doclist contains the newer data, so write it over old. Then
- ** steal accumulated result for doclist.
- */
- docListAccumulate(&old, &doclist);
- docListDestroy(&doclist);
- doclist = old;
- }
- if( rc!=SQLITE_DONE ){
- docListDestroy(&doclist);
+ rc = sql_step_statement(v, SEGDIR_MAX_INDEX_STMT, &s);
+ /* Should always get at least one row due to how max() works. */
+ if( rc==SQLITE_DONE ) return SQLITE_DONE;
+ if( rc!=SQLITE_ROW ) return rc;
+
+ /* NULL means that there were no inputs to max(). */
+ if( SQLITE_NULL==sqlite3_column_type(s, 0) ){
+ rc = sqlite3_step(s);
+ if( rc==SQLITE_ROW ) return SQLITE_ERROR;
return rc;
}
- docListDiscardEmpty(&doclist);
- *out = doclist;
- return SQLITE_OK;
+ *pidx = sqlite3_column_int(s, 0);
+
+ /* We expect only one row. We must execute another sqlite3_step()
+ * to complete the iteration; otherwise the table will remain locked. */
+ rc = sqlite3_step(s);
+ if( rc==SQLITE_ROW ) return SQLITE_ERROR;
+ if( rc!=SQLITE_DONE ) return rc;
+ return SQLITE_ROW;
}
-/* insert into %_term (rowid, term, segment, doclist)
- values ([piRowid], [pTerm], [iSegment], [doclist])
-** Lets sqlite select rowid if piRowid is NULL, else uses *piRowid.
-**
-** NOTE(shess) piRowid is IN, with values of "space of int64" plus
-** null, it is not used to pass data back to the caller.
+/* insert into %_segdir values (
+** [iLevel], [idx],
+** [iStartBlockid], [iLeavesEndBlockid], [iEndBlockid],
+** [pRootData]
+** )
*/
-static int term_insert(fulltext_vtab *v, sqlite_int64 *piRowid,
- const char *pTerm, int nTerm,
- int iSegment, DocList *doclist){
+static int segdir_set(fulltext_vtab *v, int iLevel, int idx,
+ sqlite_int64 iStartBlockid,
+ sqlite_int64 iLeavesEndBlockid,
+ sqlite_int64 iEndBlockid,
+ const char *pRootData, int nRootData){
sqlite3_stmt *s;
- int rc = sql_get_statement(v, TERM_INSERT_STMT, &s);
+ int rc = sql_get_statement(v, SEGDIR_SET_STMT, &s);
if( rc!=SQLITE_OK ) return rc;
- if( piRowid==NULL ){
- rc = sqlite3_bind_null(s, 1);
- }else{
- rc = sqlite3_bind_int64(s, 1, *piRowid);
- }
+ rc = sqlite3_bind_int(s, 1, iLevel);
+ if( rc!=SQLITE_OK ) return rc;
+
+ rc = sqlite3_bind_int(s, 2, idx);
+ if( rc!=SQLITE_OK ) return rc;
+
+ rc = sqlite3_bind_int64(s, 3, iStartBlockid);
if( rc!=SQLITE_OK ) return rc;
- rc = sqlite3_bind_text(s, 2, pTerm, nTerm, SQLITE_STATIC);
+ rc = sqlite3_bind_int64(s, 4, iLeavesEndBlockid);
if( rc!=SQLITE_OK ) return rc;
- rc = sqlite3_bind_int(s, 3, iSegment);
+ rc = sqlite3_bind_int64(s, 5, iEndBlockid);
if( rc!=SQLITE_OK ) return rc;
- rc = sqlite3_bind_blob(s, 4, doclist->pData, doclist->nData, SQLITE_STATIC);
+ rc = sqlite3_bind_blob(s, 6, pRootData, nRootData, SQLITE_STATIC);
if( rc!=SQLITE_OK ) return rc;
- return sql_single_step_statement(v, TERM_INSERT_STMT, &s);
+ return sql_single_step_statement(v, SEGDIR_SET_STMT, &s);
}
-/* update %_term set doclist = [doclist] where rowid = [rowid] */
-static int term_update(fulltext_vtab *v, sqlite_int64 rowid,
- DocList *doclist){
+/* Queries %_segdir for the block span of the segments in level
+** iLevel. Returns SQLITE_DONE if there are no blocks for iLevel,
+** SQLITE_ROW if there are blocks, else an error.
+*/
+static int segdir_span(fulltext_vtab *v, int iLevel,
+ sqlite_int64 *piStartBlockid,
+ sqlite_int64 *piEndBlockid){
sqlite3_stmt *s;
- int rc = sql_get_statement(v, TERM_UPDATE_STMT, &s);
+ int rc = sql_get_statement(v, SEGDIR_SPAN_STMT, &s);
if( rc!=SQLITE_OK ) return rc;
- rc = sqlite3_bind_blob(s, 1, doclist->pData, doclist->nData, SQLITE_STATIC);
+ rc = sqlite3_bind_int(s, 1, iLevel);
if( rc!=SQLITE_OK ) return rc;
- rc = sqlite3_bind_int64(s, 2, rowid);
- if( rc!=SQLITE_OK ) return rc;
+ rc = sql_step_statement(v, SEGDIR_SPAN_STMT, &s);
+ if( rc==SQLITE_DONE ) return SQLITE_DONE; /* Should never happen */
+ if( rc!=SQLITE_ROW ) return rc;
- return sql_single_step_statement(v, TERM_UPDATE_STMT, &s);
+ /* This happens if all segments at this level are entirely inline. */
+ if( SQLITE_NULL==sqlite3_column_type(s, 0) ){
+ /* We expect only one row. We must execute another sqlite3_step()
+ * to complete the iteration; otherwise the table will remain locked. */
+ int rc2 = sqlite3_step(s);
+ if( rc2==SQLITE_ROW ) return SQLITE_ERROR;
+ return rc2;
+ }
+
+ *piStartBlockid = sqlite3_column_int64(s, 0);
+ *piEndBlockid = sqlite3_column_int64(s, 1);
+
+ /* We expect only one row. We must execute another sqlite3_step()
+ * to complete the iteration; otherwise the table will remain locked. */
+ rc = sqlite3_step(s);
+ if( rc==SQLITE_ROW ) return SQLITE_ERROR;
+ if( rc!=SQLITE_DONE ) return rc;
+ return SQLITE_ROW;
}
-static int term_delete(fulltext_vtab *v, sqlite_int64 rowid){
+/* Delete the segment blocks and segment directory records for all
+** segments at iLevel.
+*/
+static int segdir_delete(fulltext_vtab *v, int iLevel){
sqlite3_stmt *s;
- int rc = sql_get_statement(v, TERM_DELETE_STMT, &s);
+ sqlite_int64 iStartBlockid, iEndBlockid;
+ int rc = segdir_span(v, iLevel, &iStartBlockid, &iEndBlockid);
+ if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ) return rc;
+
+ if( rc==SQLITE_ROW ){
+ rc = block_delete(v, iStartBlockid, iEndBlockid);
+ if( rc!=SQLITE_OK ) return rc;
+ }
+
+ /* Delete the segment directory itself. */
+ rc = sql_get_statement(v, SEGDIR_DELETE_STMT, &s);
if( rc!=SQLITE_OK ) return rc;
- rc = sqlite3_bind_int64(s, 1, rowid);
+ rc = sqlite3_bind_int64(s, 1, iLevel);
if( rc!=SQLITE_OK ) return rc;
- return sql_single_step_statement(v, TERM_DELETE_STMT, &s);
+ return sql_single_step_statement(v, SEGDIR_DELETE_STMT, &s);
}
/*
}
}
+ for( i=0; i<MERGE_COUNT; i++ ){
+ if( v->pLeafSelectStmts[i]!=NULL ){
+ sqlite3_finalize(v->pLeafSelectStmts[i]);
+ v->pLeafSelectStmts[i] = NULL;
+ }
+ }
+
if( v->pTokenizer!=NULL ){
v->pTokenizer->pModule->xDestroy(v->pTokenizer);
v->pTokenizer = NULL;
return rc;
}
- /* The %_content table holds the text of each document, with
- ** the rowid used as the docid.
- **
- ** The %_term table maps each term to a document list blob
- ** containing elements sorted by ascending docid, each element
- ** encoded as:
- **
- ** docid varint-encoded
- ** token elements:
- ** position+1 varint-encoded as delta from previous position
- ** start offset varint-encoded as delta from previous start offset
- ** end offset varint-encoded as delta from start offset
- **
- ** The sentinel position of 0 indicates the end of the token list.
- **
- ** Additionally, doclist blobs are chunked into multiple segments,
- ** using segment to order the segments. New elements are added to
- ** the segment at segment 0, until it exceeds CHUNK_MAX. Then
- ** segment 0 is deleted, and the doclist is inserted at segment 1.
- ** If there is already a doclist at segment 1, the segment 0 doclist
- ** is merged with it, the segment 1 doclist is deleted, and the
- ** merged doclist is inserted at segment 2, repeating those
- ** operations until an insert succeeds.
- **
- ** Since this structure doesn't allow us to update elements in place
- ** in case of deletion or update, these are simply written to
- ** segment 0 (with an empty token list in case of deletion), with
- ** docListAccumulate() taking care to retain lower-segment
- ** information in preference to higher-segment information.
- */
- /* TODO(shess) Provide a VACUUM type operation which both removes
- ** deleted elements which are no longer necessary, and duplicated
- ** elements. I suspect this will probably not be necessary in
- ** practice, though.
- */
+/* The %_content table holds the text of each document, with
+** the rowid used as the docid.
+*/
+/* TODO(shess) This comment needs elaboration to match the updated
+** code. Work it into the top-of-file comment at that time.
+*/
static int fulltextCreate(sqlite3 *db, void *pAux,
int argc, const char * const *argv,
sqlite3_vtab **ppVTab, char **pzErr){
free(schema.s);
if( rc!=SQLITE_OK ) goto out;
+ rc = sql_exec(db, spec.zName, "create table %_segments(block blob);");
+ if( rc!=SQLITE_OK ) goto out;
+
rc = sql_exec(db, spec.zName,
- "create table %_term(term text, segment integer, doclist blob, "
- "primary key(term, segment));");
+ "create table %_segdir("
+ " level integer,"
+ " idx integer,"
+ " start_block integer,"
+ " leaves_end_block integer,"
+ " end_block integer,"
+ " root blob,"
+ " primary key(level, idx)"
+ ");");
if( rc!=SQLITE_OK ) goto out;
rc = constructVtab(db, &spec, ppVTab, pzErr);
TRACE(("FTS2 Destroy %p\n", pVTab));
rc = sql_exec(v->db, v->zName,
- "drop table %_content; drop table %_term");
+ "drop table %_content;"
+ "drop table %_segments;"
+ "drop table %_segdir;"
+ );
if( rc!=SQLITE_OK ) return rc;
fulltext_vtab_destroy((fulltext_vtab *)pVTab);
}
+/* TODO(shess) If we pushed LeafReader to the top of the file, or to
+** another file, termSelect() could be pushed above
+** docListOfTerm().
+*/
+static int termSelect(fulltext_vtab *v, int iColumn,
+ const char *pTerm, int nTerm, DocList *out);
+
/* Return a DocList corresponding to the query term *pTerm. If *pTerm
** is the first term of a phrase query, go ahead and evaluate the phrase
** query and return the doclist for the entire phrase query.
int i, rc;
pLeft = docListNew(DL_POSITIONS);
- rc = term_select_all(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pLeft);
+ rc = termSelect(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pLeft);
if( rc ) return rc;
for(i=1; i<=pQTerm->nPhrase; i++){
pRight = docListNew(DL_POSITIONS);
- rc = term_select_all(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm, pRight);
+ rc = termSelect(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm, pRight);
if( rc ){
docListDelete(pLeft);
return rc;
return rc;
}
-/* Update the %_terms table to map the term [pTerm] to the given rowid. */
-static int index_insert_term(fulltext_vtab *v, const char *pTerm, int nTerm,
- DocList *d){
- sqlite_int64 iIndexRow;
- DocList doclist;
- int iSegment = 0, rc;
-
- rc = term_select(v, pTerm, nTerm, iSegment, &iIndexRow, &doclist);
- if( rc==SQLITE_DONE ){
- docListInit(&doclist, DL_DEFAULT, 0, 0);
- docListUpdate(&doclist, d);
- /* TODO(shess) Consider length(doclist)>CHUNK_MAX? */
- rc = term_insert(v, NULL, pTerm, nTerm, iSegment, &doclist);
- goto err;
- }
- if( rc!=SQLITE_ROW ) return SQLITE_ERROR;
-
- docListUpdate(&doclist, d);
- if( doclist.nData<=CHUNK_MAX ){
- rc = term_update(v, iIndexRow, &doclist);
- goto err;
- }
-
- /* Doclist doesn't fit, delete what's there, and accumulate
- ** forward.
- */
- rc = term_delete(v, iIndexRow);
- if( rc!=SQLITE_OK ) goto err;
-
- /* Try to insert the doclist into a higher segment bucket. On
- ** failure, accumulate existing doclist with the doclist from that
- ** bucket, and put results in the next bucket.
- */
- iSegment++;
- while( (rc=term_insert(v, &iIndexRow, pTerm, nTerm, iSegment,
- &doclist))!=SQLITE_OK ){
- sqlite_int64 iSegmentRow;
- DocList old;
- int rc2;
-
- /* Retain old error in case the term_insert() error was really an
- ** error rather than a bounced insert.
- */
- rc2 = term_select(v, pTerm, nTerm, iSegment, &iSegmentRow, &old);
- if( rc2!=SQLITE_ROW ) goto err;
-
- rc = term_delete(v, iSegmentRow);
- if( rc!=SQLITE_OK ) goto err;
-
- /* Reusing lowest-number deleted row keeps the index smaller. */
- if( iSegmentRow<iIndexRow ) iIndexRow = iSegmentRow;
-
- /* doclist contains the newer data, so accumulate it over old.
- ** Then steal accumulated data for doclist.
- */
- docListAccumulate(&old, &doclist);
- docListDestroy(&doclist);
- doclist = old;
-
- iSegment++;
- }
-
- err:
- docListDestroy(&doclist);
- return rc;
-}
-
/* Add doclists for all terms in [pValues] to the hash table [terms]. */
static int insertTerms(fulltext_vtab *v, fts2Hash *terms, sqlite_int64 iRowid,
sqlite3_value **pValues){
* table [pTerms]. */
static int deleteTerms(fulltext_vtab *v, fts2Hash *pTerms, sqlite_int64 iRowid){
const char **pValues;
- int i;
+ int i, rc;
- int rc = content_select(v, iRowid, &pValues);
+ /* TODO(shess) Should we allow such tables at all? */
+ if( DL_DEFAULT==DL_DOCIDS ) return SQLITE_ERROR;
+
+ rc = content_select(v, iRowid, &pValues);
if( rc!=SQLITE_OK ) return rc;
for(i = 0 ; i < v->nColumn; ++i) {
return content_update(v, pValues, iRow); /* execute an SQL UPDATE */
}
+/*******************************************************************/
+/* InteriorWriter is used to collect terms and block references into
+** interior nodes in %_segments. See commentary at top of file for
+** format.
+*/
+
+/* How large interior nodes can grow. */
+#define INTERIOR_MAX 2048
+
+/* ROOT_MAX controls how much data is stored inline in the segment
+** directory.
+*/
+/* TODO(shess) Push ROOT_MAX down to whoever is writing things. It's
+** only here so that interiorWriterRootInfo() and leafWriterRootInfo()
+** can both see it, but if the caller passed it in, we wouldn't even
+** need a define.
+*/
+#define ROOT_MAX 1024
+#if ROOT_MAX<VARINT_MAX*2
+# error ROOT_MAX must have enough space for a header.
+#endif
+
+/* InteriorBlock stores a linked-list of interior blocks while a lower
+** layer is being constructed.
+*/
+typedef struct InteriorBlock {
+ char *pTerm; /* Leftmost term in block's subtree. */
+ int nTerm;
+
+ char *pData;
+ int nData;
+
+ struct InteriorBlock *next;
+} InteriorBlock;
+
+static InteriorBlock *interiorBlockNew(int iHeight, sqlite_int64 iChildBlock,
+ const char *pTerm, int nTerm){
+ InteriorBlock *block = calloc(1, sizeof(InteriorBlock));
+ char c[VARINT_MAX+VARINT_MAX];
+ int n;
+
+ data_dup(&block->pTerm, &block->nTerm, pTerm, nTerm);
+
+ n = putVarint(c, iHeight);
+ n += putVarint(c+n, iChildBlock);
+ data_dup(&block->pData, &block->nData, c, n);
+
+ return block;
+}
+
+typedef struct InteriorWriter {
+ int iHeight; /* from 0 at leaves. */
+ InteriorBlock *first, *last;
+ struct InteriorWriter *parentWriter;
+
+#ifndef NDEBUG
+ sqlite_int64 iLastChildBlock; /* for consistency checks. */
+#endif
+} InteriorWriter;
+
+/* Initialize an interior node where pTerm[nTerm] marks the leftmost
+** term in the tree. iChildBlock is the leftmost child block at the
+** next level down the tree.
+*/
+static void interiorWriterInit(int iHeight, const char *pTerm, int nTerm,
+ sqlite_int64 iChildBlock,
+ InteriorWriter *pWriter){
+ InteriorBlock *block;
+ assert( iHeight>0 );
+ memset(pWriter, 0, sizeof(*pWriter));
+
+ pWriter->iHeight = iHeight;
+#ifndef NDEBUG
+ pWriter->iLastChildBlock = iChildBlock;
+#endif
+ block = interiorBlockNew(iHeight, iChildBlock, pTerm, nTerm);
+ pWriter->last = pWriter->first = block;
+}
+
+/* Append the child node rooted at iChildBlock to the interior node,
+** with pTerm[nTerm] as the leftmost term in iChildBlock's subtree.
+*/
+static void interiorWriterAppend(InteriorWriter *pWriter,
+ const char *pTerm, int nTerm,
+ sqlite_int64 iChildBlock){
+ char c[VARINT_MAX+VARINT_MAX];
+ int n = putVarint(c, nTerm);
+
+#ifndef NDEBUG
+ pWriter->iLastChildBlock++;
+#endif
+ assert( pWriter->iLastChildBlock==iChildBlock );
+
+ if( pWriter->last->nData+n+nTerm>INTERIOR_MAX ){
+ /* Overflow to a new block. */
+ pWriter->last->next = interiorBlockNew(pWriter->iHeight, iChildBlock,
+ pTerm, nTerm);
+ pWriter->last = pWriter->last->next;
+ }else{
+ InteriorBlock *last = pWriter->last;
+ data_append2(&last->pData, &last->nData, c, n, pTerm, nTerm);
+ }
+}
+
+/* Free the space used by pWriter, including the linked-list of
+** InteriorBlocks.
+*/
+static int interiorWriterDestroy(InteriorWriter *pWriter){
+ InteriorBlock *block = pWriter->first;
+
+ while( block!=NULL ){
+ InteriorBlock *b = block;
+ block = block->next;
+ free(b->pData);
+ free(b->pTerm);
+ free(b);
+ }
+#ifndef NDEBUG
+ memset(pWriter, 0x55, sizeof(*pWriter));
+#endif
+ return SQLITE_OK;
+}
+
+/* If pWriter can fit entirely in ROOT_MAX, return it as the root info
+** directly, leaving *piEndBlockid unchanged. Otherwise, flush
+** pWriter to %_segments, building a new layer of interior nodes, and
+** recursively ask for their root into.
+*/
+static int interiorWriterRootInfo(fulltext_vtab *v, InteriorWriter *pWriter,
+ char **ppRootInfo, int *pnRootInfo,
+ sqlite_int64 *piEndBlockid){
+ InteriorBlock *block = pWriter->first;
+ sqlite_int64 iBlockid = 0;
+ int rc;
+
+ /* If we can fit the segment inline */
+ if( block==pWriter->last && block->nData<ROOT_MAX ){
+ *ppRootInfo = block->pData;
+ *pnRootInfo = block->nData;
+ return SQLITE_OK;
+ }
+
+ /* Flush the first block to %_segments, and create a new level of
+ ** interior node.
+ */
+ rc = block_insert(v, block->pData, block->nData, &iBlockid);
+ if( rc!=SQLITE_OK ) return rc;
+ *piEndBlockid = iBlockid;
+
+ pWriter->parentWriter = malloc(sizeof(*pWriter->parentWriter));
+ interiorWriterInit(pWriter->iHeight+1,
+ block->pTerm, block->nTerm,
+ iBlockid, pWriter->parentWriter);
+
+ /* Flush additional blocks and append to the higher interior
+ ** node.
+ */
+ for(block=block->next; block!=NULL; block=block->next){
+ rc = block_insert(v, block->pData, block->nData, &iBlockid);
+ if( rc!=SQLITE_OK ) return rc;
+ *piEndBlockid = iBlockid;
+
+ interiorWriterAppend(pWriter->parentWriter,
+ block->pTerm, block->nTerm, iBlockid);
+ }
+
+ /* Parent node gets the chance to be the root. */
+ return interiorWriterRootInfo(v, pWriter->parentWriter,
+ ppRootInfo, pnRootInfo, piEndBlockid);
+}
+
+/****************************************************************/
+/* InteriorReader is used to read off the data from an interior node
+** (see comment at top of file for the format). InteriorReader does
+** not own its data, so interiorReaderDestroy() is a formality.
+*/
+typedef struct InteriorReader {
+ const char *pData;
+ int nData;
+
+ sqlite_int64 iBlockid;
+} InteriorReader;
+
+static void interiorReaderDestroy(InteriorReader *pReader){
+#ifndef NDEBUG
+ memset(pReader, 0x55, sizeof(*pReader));
+#endif
+}
+
+static void interiorReaderInit(const char *pData, int nData,
+ InteriorReader *pReader){
+ int n;
+
+ /* Require at least the leading flag byte */
+ assert( nData>0 );
+ assert( pData[0]!='\0' );
+
+ memset(pReader, '\0', sizeof(*pReader));
+
+ /* Decode the base blockid, and set the cursor to the first term. */
+ n = getVarint(pData+1, &pReader->iBlockid);
+ assert( 1+n<=nData );
+ pReader->pData = pData+1+n;
+ pReader->nData = nData-(1+n);
+}
+
+static int interiorReaderAtEnd(InteriorReader *pReader){
+ return pReader->nData<=0;
+}
+
+static sqlite_int64 interiorReaderCurrentBlockid(InteriorReader *pReader){
+ return pReader->iBlockid;
+}
+
+static int interiorReaderTermBytes(InteriorReader *pReader){
+ int nTerm;
+ assert( !interiorReaderAtEnd(pReader) );
+ getVarint32(pReader->pData, &nTerm);
+ return nTerm;
+}
+static const char *interiorReaderTerm(InteriorReader *pReader){
+ int n, nTerm;
+ assert( !interiorReaderAtEnd(pReader) );
+ n = getVarint32(pReader->pData, &nTerm);
+ return pReader->pData+n;
+}
+
+/* Step forward to the next term in the node. */
+static void interiorReaderStep(InteriorReader *pReader){
+ int n, nTerm;
+ assert( !interiorReaderAtEnd(pReader) );
+ n = getVarint32(pReader->pData, &nTerm);
+ assert( n+nTerm<=pReader->nData );
+ pReader->pData += n+nTerm;
+ pReader->nData -= n+nTerm;
+ pReader->iBlockid++;
+}
+
+/* Compare the current term to pTerm[nTerm], returning strcmp-style
+** results.
+*/
+static int interiorReaderTermCmp(InteriorReader *pReader,
+ const char *pTerm, int nTerm){
+ const char *pReaderTerm = interiorReaderTerm(pReader);
+ int nReaderTerm = interiorReaderTermBytes(pReader);
+ int c, n = nReaderTerm<nTerm ? nReaderTerm : nTerm;
+
+ if( n==0 ){
+ if( nReaderTerm>0 ) return -1;
+ if( nTerm>0 ) return 1;
+ return 0;
+ }
+
+ c = memcmp(pReaderTerm, pTerm, n);
+ if( c!=0 ) return c;
+ return nReaderTerm - nTerm;
+}
+
+/****************************************************************/
+/* LeafWriter is used to collect terms and associated doclist data
+** into leaf blocks in %_segments (see top of file for format info).
+*/
+
+/* Put terms with data this big in their own block. */
+#define STANDALONE_MIN 1024
+
+/* Keep leaf blocks below this size. */
+#define LEAF_MAX 2048
+
+typedef struct LeafWriter {
+ int iLevel;
+ int idx;
+ sqlite_int64 iStartBlockid; /* needed to create the root info */
+ sqlite_int64 iEndBlockid; /* when we're done writing. */
+
+ char *pTerm; /* previous encoded term */
+ int nTerm;
+
+ char *pData; /* encoding buffer */
+ int nData;
+
+ InteriorWriter parentWriter; /* if we overflow */
+ int has_parent;
+} LeafWriter;
+
+static void leafWriterInit(int iLevel, int idx, LeafWriter *pWriter){
+ memset(pWriter, 0, sizeof(*pWriter));
+ pWriter->iLevel = iLevel;
+ pWriter->idx = idx;
+
+ /* Start out with a reasonably sized block, though it can grow. */
+ pWriter->pData = malloc(LEAF_MAX);
+ pWriter->nData = putVarint(pWriter->pData, 0);
+}
+
+/* Flush the current leaf node to %_segments, adding the resulting
+** blockid and the starting term to the interior node which will
+** contain it.
+*/
+static int leafWriterInternalFlush(fulltext_vtab *v, LeafWriter *pWriter){
+ sqlite_int64 iBlockid = 0;
+ const char *pStartingTerm;
+ int nStartingTerm, rc, n;
+
+ /* Must have the leading varint(0) flag, plus at least some data. */
+ assert( pWriter->nData>2 );
+
+ rc = block_insert(v, pWriter->pData, pWriter->nData, &iBlockid);
+ if( rc!=SQLITE_OK ) return rc;
+ assert( iBlockid!=0 );
+
+ /* Reconstruct the first term in the leaf for purposes of building
+ ** the interior node.
+ */
+ n = getVarint32(pWriter->pData+1, &nStartingTerm);
+ pStartingTerm = pWriter->pData+1+n;
+ assert( pWriter->nData>1+n+nStartingTerm );
+
+ if( pWriter->has_parent ){
+ interiorWriterAppend(&pWriter->parentWriter,
+ pStartingTerm, nStartingTerm, iBlockid);
+ }else{
+ interiorWriterInit(1, pStartingTerm, nStartingTerm, iBlockid,
+ &pWriter->parentWriter);
+ pWriter->has_parent = 1;
+ }
+
+ /* Track the span of this segment's leaf nodes. */
+ if( pWriter->iEndBlockid==0 ){
+ pWriter->iEndBlockid = pWriter->iStartBlockid = iBlockid;
+ }else{
+ pWriter->iEndBlockid++;
+ assert( iBlockid==pWriter->iEndBlockid );
+ }
+
+ /* Re-initialize the output buffer. */
+ pWriter->nData = putVarint(pWriter->pData, 0);
+ pWriter->nTerm = 0;
+
+ return SQLITE_OK;
+}
+
+/* Fetch the root info for the segment. If the entire leaf fits
+** within ROOT_MAX, then it will be returned directly, otherwise it
+** will be flushed and the root info will be returned from the
+** interior node. *piEndBlockid is set to the blockid of the last
+** interior or leaf node written to disk (0 if none are written at
+** all).
+*/
+static int leafWriterRootInfo(fulltext_vtab *v, LeafWriter *pWriter,
+ char **ppRootInfo, int *pnRootInfo,
+ sqlite_int64 *piEndBlockid){
+ /* we can fit the segment entirely inline */
+ if( !pWriter->has_parent && pWriter->nData<ROOT_MAX ){
+ *ppRootInfo = pWriter->pData;
+ *pnRootInfo = pWriter->nData;
+ *piEndBlockid = 0;
+ return SQLITE_OK;
+ }
+
+ /* Flush remaining leaf data. */
+ if( pWriter->nData>1 ){
+ int rc = leafWriterInternalFlush(v, pWriter);
+ if( rc!=SQLITE_OK ) return rc;
+ }
+
+ /* We must have flushed a leaf at some point. */
+ assert( pWriter->has_parent );
+
+ /* Tentatively set the end leaf blockid as the end blockid. If the
+ ** interior node can be returned inline, this will be the final
+ ** blockid, otherwise it will be overwritten by
+ ** interiorWriterRootInfo().
+ */
+ *piEndBlockid = pWriter->iEndBlockid;
+
+ return interiorWriterRootInfo(v, &pWriter->parentWriter,
+ ppRootInfo, pnRootInfo, piEndBlockid);
+}
+
+/* Collect the rootInfo data and store it into the segment directory.
+** This has the effect of flushing the segment's leaf data to
+** %_segments, and also flushing any interior nodes to %_segments.
+*/
+static int leafWriterFlush(fulltext_vtab *v, LeafWriter *pWriter){
+ sqlite_int64 iEndBlockid;
+ char *pRootInfo;
+ int rc, nRootInfo;
+
+ rc = leafWriterRootInfo(v, pWriter, &pRootInfo, &nRootInfo, &iEndBlockid);
+ if( rc!=SQLITE_OK ) return rc;
+
+ return segdir_set(v, pWriter->iLevel, pWriter->idx,
+ pWriter->iStartBlockid, pWriter->iEndBlockid,
+ iEndBlockid, pRootInfo, nRootInfo);
+}
+
+static void leafWriterDestroy(LeafWriter *pWriter){
+ if( pWriter->has_parent ) interiorWriterDestroy(&pWriter->parentWriter);
+ free(pWriter->pTerm);
+ free(pWriter->pData);
+}
+
+/* Push pTerm[nTerm] along with the doclist data to the leaf layer of
+** %_segments.
+*/
+static int leafWriterStep(fulltext_vtab *v, LeafWriter *pWriter,
+ const char *pTerm, int nTerm, DocList *doclist){
+ char c[VARINT_MAX+VARINT_MAX];
+ int rc, n;
+
+ /* Flush existing data if this item won't fit well. */
+ if( pWriter->nData>1 &&
+ (doclist->nData+nTerm>STANDALONE_MIN ||
+ pWriter->nData+doclist->nData+nTerm>LEAF_MAX) ){
+ rc = leafWriterInternalFlush(v, pWriter);
+ if( rc!=SQLITE_OK ) return rc;
+ }
+
+ if( pWriter->nTerm==0 ){
+ /* Encode the entire leading term as:
+ ** varint(nTerm)
+ ** char pTerm[nTerm]
+ */
+ n = putVarint(c, nTerm);
+ assert( pWriter->nData==1 );
+ data_append2(&pWriter->pData, &pWriter->nData,
+ c, n, pTerm, nTerm);
+ }else{
+ /* Delta-encode the term as:
+ ** varint(nPrefix)
+ ** varint(nSuffix)
+ ** char pTermSuffix[nSuffix]
+ */
+ int nPrefix = 0;
+
+ while( nPrefix<nTerm && nPrefix<pWriter->nTerm &&
+ pTerm[nPrefix]==pWriter->pTerm[nPrefix] ){
+ nPrefix++;
+ }
+
+ n = putVarint(c, nPrefix);
+ n += putVarint(c+n, nTerm-nPrefix);
+
+ data_append2(&pWriter->pData, &pWriter->nData,
+ c, n, pTerm+nPrefix, nTerm-nPrefix);
+ }
+ data_replace(&pWriter->pTerm, &pWriter->nTerm, pTerm, nTerm);
+
+ /* Encode the doclist as:
+ ** varint(nDoclist)
+ ** char pDoclist[nDoclist]
+ */
+ n = putVarint(c, doclist->nData);
+ data_append2(&pWriter->pData, &pWriter->nData,
+ c, n, doclist->pData, doclist->nData);
+
+ /* Flush standalone blocks right out */
+ if( doclist->nData+nTerm>STANDALONE_MIN ){
+ rc = leafWriterInternalFlush(v, pWriter);
+ if( rc!=SQLITE_OK ) return rc;
+ }
+
+ return SQLITE_OK;
+}
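+
+/* As a concrete illustration of the encoding above (a sketch; the
+** doclist bytes are elided): stepping a fresh writer with "apple"
+** and then "applet" leaves pData holding
+**
+** varint(0) (leaf-height header)
+** varint(5) "apple" varint(nDoclist) <doclist>
+** varint(5) varint(1) "t" varint(nDoclist) <doclist>
+**
+** where the second entry's varint(5)/varint(1) are nPrefix/nSuffix
+** against the previous term "apple".
+*/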
+
+
+/****************************************************************/
+/* LeafReader is used to iterate over an individual leaf node. */
+typedef struct LeafReader {
+ char *pTerm; /* copy of current term. */
+ int nTerm;
+
+ const char *pData; /* data for current term. */
+ int nData;
+} LeafReader;
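+
+/* A LeafReader is typically driven by a loop of this shape (a sketch
+** only; the real callers below interleave their own logic):
+**
+** LeafReader reader;
+** leafReaderInit(pData, nData, &reader);
+** while( !leafReaderAtEnd(&reader) ){
+** ... leafReaderTerm(&reader), leafReaderData(&reader) ...
+** leafReaderStep(&reader);
+** }
+** leafReaderDestroy(&reader);
+*/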
+
+static void leafReaderDestroy(LeafReader *pReader){
+ free(pReader->pTerm);
+#ifndef NDEBUG
+ memset(pReader, 0x55, sizeof(*pReader));
+#endif
+}
+
+static int leafReaderAtEnd(LeafReader *pReader){
+ return pReader->nData<=0;
+}
+
+/* Access the current term. */
+static int leafReaderTermBytes(LeafReader *pReader){
+ return pReader->nTerm;
+}
+static const char *leafReaderTerm(LeafReader *pReader){
+ assert( pReader->nTerm>0 );
+ return pReader->pTerm;
+}
+
+/* Access the doclist data for the current term. */
+static int leafReaderDataBytes(LeafReader *pReader){
+ int nData;
+ assert( pReader->nTerm>0 );
+ getVarint32(pReader->pData, &nData);
+ return nData;
+}
+static const char *leafReaderData(LeafReader *pReader){
+ int n, nData;
+ assert( pReader->nTerm>0 );
+ n = getVarint32(pReader->pData, &nData);
+ return pReader->pData+n;
+}
+
+static void leafReaderInit(const char *pData, int nData,
+ LeafReader *pReader){
+ int nTerm, n;
+
+ assert( nData>0 );
+ assert( pData[0]=='\0' );
+
+ memset(pReader, '\0', sizeof(*pReader));
+
+ /* Read the first term, skipping the header byte. */
+ n = getVarint32(pData+1, &nTerm);
+ data_dup(&pReader->pTerm, &pReader->nTerm, pData+1+n, nTerm);
+
+ /* Position after the first term. */
+ assert( 1+n+nTerm<nData );
+ pReader->pData = pData+1+n+nTerm;
+ pReader->nData = nData-1-n-nTerm;
+}
+
+/* Step the reader forward to the next term. */
+static void leafReaderStep(LeafReader *pReader){
+ int n, nData, nPrefix, nSuffix;
+ assert( !leafReaderAtEnd(pReader) );
+
+ /* Skip previous entry's data block. */
+ n = getVarint32(pReader->pData, &nData);
+ assert( n+nData<=pReader->nData );
+ pReader->pData += n+nData;
+ pReader->nData -= n+nData;
+
+ if( !leafReaderAtEnd(pReader) ){
+ /* Construct the new term using a prefix from the old term plus a
+ ** suffix from the leaf data.
+ */
+ n = getVarint32(pReader->pData, &nPrefix);
+ n += getVarint32(pReader->pData+n, &nSuffix);
+ assert( n+nSuffix<pReader->nData );
+ pReader->nTerm = nPrefix;
+ data_append(&pReader->pTerm, &pReader->nTerm, pReader->pData+n, nSuffix);
+
+ pReader->pData += n+nSuffix;
+ pReader->nData -= n+nSuffix;
+ }
+}
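+
+/* Continuing the writer-side example: if the previous term was
+** "apple" and the entry just reached encodes nPrefix=5, nSuffix=1,
+** suffix "t", leafReaderStep() truncates the held term to its first
+** 5 bytes and appends "t", reconstructing "applet".
+*/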
+
+/* strcmp-style comparison of pReader's current term against pTerm. */
+static int leafReaderTermCmp(LeafReader *pReader,
+ const char *pTerm, int nTerm){
+ int c, n = pReader->nTerm<nTerm ? pReader->nTerm : nTerm;
+ if( n==0 ){
+ if( pReader->nTerm>0 ) return -1;
+ if( nTerm>0 ) return 1;
+ return 0;
+ }
+
+ c = memcmp(pReader->pTerm, pTerm, n);
+ if( c!=0 ) return c;
+ return pReader->nTerm - nTerm;
+}
+
+
+/****************************************************************/
+/* LeavesReader wraps LeafReader to allow iterating over the entire
+** leaf layer of the tree.
+*/
+typedef struct LeavesReader {
+ int idx; /* Index within the segment. */
+
+ sqlite3_stmt *pStmt; /* Statement we're streaming leaves from. */
+ int eof; /* we've seen SQLITE_DONE from pStmt. */
+
+ LeafReader leafReader; /* reader for the current leaf. */
+ char *pRootData; /* root data for inline. */
+} LeavesReader;
+
+/* Access the current term. */
+static int leavesReaderTermBytes(LeavesReader *pReader){
+ assert( !pReader->eof );
+ return leafReaderTermBytes(&pReader->leafReader);
+}
+static const char *leavesReaderTerm(LeavesReader *pReader){
+ assert( !pReader->eof );
+ return leafReaderTerm(&pReader->leafReader);
+}
+
+/* Access the doclist data for the current term. */
+static int leavesReaderDataBytes(LeavesReader *pReader){
+ assert( !pReader->eof );
+ return leafReaderDataBytes(&pReader->leafReader);
+}
+static const char *leavesReaderData(LeavesReader *pReader){
+ assert( !pReader->eof );
+ return leafReaderData(&pReader->leafReader);
+}
+
+static int leavesReaderAtEnd(LeavesReader *pReader){
+ return pReader->eof;
+}
+
+static void leavesReaderDestroy(LeavesReader *pReader){
+ leafReaderDestroy(&pReader->leafReader);
+ if( pReader->pRootData!=0 ) free(pReader->pRootData);
+#ifndef NDEBUG
+ memset(pReader, 0x55, sizeof(*pReader));
+#endif
+}
+
+/* Initialize pReader with the given root data (if iStartBlockid==0
+** the leaf data was entirely contained in the root), or from the
+** stream of blocks between iStartBlockid and iEndBlockid, inclusive.
+*/
+static int leavesReaderInit(fulltext_vtab *v,
+ int idx,
+ sqlite_int64 iStartBlockid,
+ sqlite_int64 iEndBlockid,
+ const char *pRootData, int nRootData,
+ LeavesReader *pReader){
+ memset(pReader, 0, sizeof(*pReader));
+ pReader->idx = idx;
+
+ if( iStartBlockid==0 ){
+ /* Entire leaf level fit in root data. */
+ int n;
+ data_dup(&pReader->pRootData, &n, pRootData, nRootData);
+ leafReaderInit(pReader->pRootData, nRootData, &pReader->leafReader);
+ }else{
+ sqlite3_stmt *s;
+ int rc = sql_get_leaf_statement(v, idx, &s);
+ if( rc!=SQLITE_OK ) return rc;
+
+ rc = sqlite3_bind_int64(s, 1, iStartBlockid);
+ if( rc!=SQLITE_OK ) return rc;
+
+ rc = sqlite3_bind_int64(s, 2, iEndBlockid);
+ if( rc!=SQLITE_OK ) return rc;
+
+ rc = sql_step_leaf_statement(v, idx, &s);
+ if( rc==SQLITE_DONE ){
+ pReader->eof = 1;
+ return SQLITE_OK;
+ }
+ if( rc!=SQLITE_ROW ) return rc;
+
+ pReader->pStmt = s;
+ leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0),
+ sqlite3_column_bytes(pReader->pStmt, 0),
+ &pReader->leafReader);
+ }
+ return SQLITE_OK;
+}
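+
+/* Either way, a successful init leaves the reader positioned on the
+** segment's first term (or at eof for an empty block range), so
+** callers need not care whether the data came from the inline root
+** or from %_segments.
+*/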
+
+/* Step the current leaf forward to the next term. If we reach the
+** end of the current leaf, step forward to the next leaf block.
+*/
+static int leavesReaderStep(fulltext_vtab *v, LeavesReader *pReader){
+ assert( !leavesReaderAtEnd(pReader) );
+ leafReaderStep(&pReader->leafReader);
+
+ if( leafReaderAtEnd(&pReader->leafReader) ){
+ int rc;
+ if( pReader->pRootData ){
+ pReader->eof = 1;
+ return SQLITE_OK;
+ }
+ rc = sql_step_leaf_statement(v, pReader->idx, &pReader->pStmt);
+ if( rc!=SQLITE_ROW ){
+ pReader->eof = 1;
+ return rc==SQLITE_DONE ? SQLITE_OK : rc;
+ }
+ leafReaderDestroy(&pReader->leafReader);
+ leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0),
+ sqlite3_column_bytes(pReader->pStmt, 0),
+ &pReader->leafReader);
+ }
+ return SQLITE_OK;
+}
+
+/* Order LeavesReaders by their term, ignoring idx. Readers at eof
+** always sort to the end.
+*/
+static int leavesReaderTermCmp(LeavesReader *lr1, LeavesReader *lr2){
+ if( leavesReaderAtEnd(lr1) ){
+ if( leavesReaderAtEnd(lr2) ) return 0;
+ return 1;
+ }
+ if( leavesReaderAtEnd(lr2) ) return -1;
+
+ return leafReaderTermCmp(&lr1->leafReader,
+ leavesReaderTerm(lr2), leavesReaderTermBytes(lr2));
+}
+
+/* Similar to leavesReaderTermCmp(), with additional ordering by idx
+** so that older segments sort before newer segments.
+*/
+static int leavesReaderCmp(LeavesReader *lr1, LeavesReader *lr2){
+ int c = leavesReaderTermCmp(lr1, lr2);
+ if( c!=0 ) return c;
+ return lr1->idx-lr2->idx;
+}
+
+/* Assume that pLr[1]..pLr[nLr-1] are sorted. Bubble pLr[0] into its
+** sorted position.
+*/
+static void leavesReaderReorder(LeavesReader *pLr, int nLr){
+ while( nLr>1 && leavesReaderCmp(pLr, pLr+1)>0 ){
+ LeavesReader tmp = pLr[0];
+ pLr[0] = pLr[1];
+ pLr[1] = tmp;
+ nLr--;
+ pLr++;
+ }
+}
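+
+/* Example (sketch): if the readers' current terms are
+** { "cat", "ant", "bee" }, with elements 1..2 already sorted, "cat"
+** bubbles past "ant" and "bee", leaving { "ant", "bee", "cat" }.
+*/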
+
+/* Initializes pReaders with the segments from level iLevel, returning
+** the number of segments in *piReaders. Leaves pReaders in sorted
+** order.
+*/
+static int leavesReadersInit(fulltext_vtab *v, int iLevel,
+ LeavesReader *pReaders, int *piReaders){
+ sqlite3_stmt *s;
+ int i, rc = sql_get_statement(v, SEGDIR_SELECT_STMT, &s);
+ if( rc!=SQLITE_OK ) return rc;
+
+ rc = sqlite3_bind_int(s, 1, iLevel);
+ if( rc!=SQLITE_OK ) return rc;
+
+ i = 0;
+ while( (rc = sql_step_statement(v, SEGDIR_SELECT_STMT, &s))==SQLITE_ROW ){
+ sqlite_int64 iStart = sqlite3_column_int64(s, 0);
+ sqlite_int64 iEnd = sqlite3_column_int64(s, 1);
+ const char *pRootData = sqlite3_column_blob(s, 2);
+ int nRootData = sqlite3_column_bytes(s, 2);
+
+ assert( i<MERGE_COUNT );
+ rc = leavesReaderInit(v, i, iStart, iEnd, pRootData, nRootData,
+ &pReaders[i]);
+ if( rc!=SQLITE_OK ) break;
+
+ i++;
+ }
+ if( rc!=SQLITE_DONE ){
+ while( i-->0 ){
+ leavesReaderDestroy(&pReaders[i]);
+ }
+ return rc;
+ }
+
+ *piReaders = i;
+
+ /* Leave our results sorted by term, then age. */
+ while( i-- ){
+ leavesReaderReorder(pReaders+i, *piReaders-i);
+ }
+ return SQLITE_OK;
+}
+
+/* Merge doclists from pReaders[nReaders] into a single doclist, which
+** is written to pWriter. Assumes pReaders is ordered oldest to
+** newest.
+*/
+/* TODO(shess) I have a version of this that merges the doclists
+** pairwise, and is thus much faster, but is also more intricate. So
+** I'll throw that in as a standalone change. N-way merge would be
+** even faster.
+*/
+static int leavesReadersMerge(fulltext_vtab *v,
+ LeavesReader *pReaders, int nReaders,
+ LeafWriter *pWriter){
+ const char *pTerm = leavesReaderTerm(pReaders);
+ int i, rc, nTerm = leavesReaderTermBytes(pReaders);
+ DocList doclist;
+
+ /* No need to merge, insert directly. */
+ if( nReaders==1 ){
+ docListStaticInit(&doclist, DL_DEFAULT,
+ leavesReaderData(pReaders),
+ leavesReaderDataBytes(pReaders));
+ }else{
+ docListInit(&doclist, DL_DEFAULT,
+ leavesReaderData(pReaders),
+ leavesReaderDataBytes(pReaders));
+
+ for(i=1; i<nReaders; i++){
+ DocList new, merged;
+ docListStaticInit(&new, DL_DEFAULT,
+ leavesReaderData(pReaders+i),
+ leavesReaderDataBytes(pReaders+i));
+ docListMerge(&merged, &doclist, &new);
+ docListDestroy(&doclist);
+ doclist = merged;
+ }
+ }
+
+ /* Insert the new doclist */
+ rc = leafWriterStep(v, pWriter, pTerm, nTerm, &doclist);
+ if( nReaders>1 ) docListDestroy(&doclist);
+ return rc;
+}
+
+/* Forward ref due to mutual recursion with segdirNextIndex(). */
+static int segmentMerge(fulltext_vtab *v, int iLevel);
+
+/* Put the next available index at iLevel into *pidx. If iLevel
+** already has MERGE_COUNT segments, they are merged to a higher
+** level to make room.
+*/
+static int segdirNextIndex(fulltext_vtab *v, int iLevel, int *pidx){
+ int rc = segdir_max_index(v, iLevel, pidx);
+ if( rc==SQLITE_DONE ){ /* No segments at iLevel. */
+ *pidx = 0;
+ }else if( rc==SQLITE_ROW ){
+ if( *pidx==(MERGE_COUNT-1) ){
+ rc = segmentMerge(v, iLevel);
+ if( rc!=SQLITE_OK ) return rc;
+ *pidx = 0;
+ }else{
+ (*pidx)++;
+ }
+ }else{
+ return rc;
+ }
+ return SQLITE_OK;
+}
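+
+/* For example: with segments at indexes 0..2 of iLevel,
+** segdir_max_index() yields SQLITE_ROW with *pidx==2, so the next
+** index is 3 (assuming MERGE_COUNT>3). Once index MERGE_COUNT-1 is
+** occupied, the level is merged upward and indexing restarts at 0.
+*/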
+
+/* Merge MERGE_COUNT segments at iLevel into a new segment at
+** iLevel+1. If iLevel+1 is already full of segments, those will be
+** merged to make room.
+*/
+static int segmentMerge(fulltext_vtab *v, int iLevel){
+ LeafWriter writer;
+ LeavesReader lrs[MERGE_COUNT];
+ int i, rc, idx = 0;
+
+ /* Determine the next available segment index at the next level,
+ ** merging as necessary.
+ */
+ rc = segdirNextIndex(v, iLevel+1, &idx);
+ if( rc!=SQLITE_OK ) return rc;
+
+ /* TODO(shess) This assumes that we'll always see exactly
+ ** MERGE_COUNT segments to merge at a given level. That will be
+ ** broken if we allow the developer to request preemptive or
+ ** deferred merging.
+ */
+ memset(&lrs, '\0', sizeof(lrs));
+ rc = leavesReadersInit(v, iLevel, lrs, &i);
+ if( rc!=SQLITE_OK ) return rc;
+ assert( i==MERGE_COUNT );
+
+ leafWriterInit(iLevel+1, idx, &writer);
+
+ /* Since leavesReaderReorder() pushes readers at eof to the end,
+ ** when the first reader is empty, all will be empty.
+ */
+ while( !leavesReaderAtEnd(lrs) ){
+ /* Figure out how many readers share their next term. */
+ for(i=1; i<MERGE_COUNT && !leavesReaderAtEnd(lrs+i); i++){
+ if( 0!=leavesReaderTermCmp(lrs, lrs+i) ) break;
+ }
+
+ rc = leavesReadersMerge(v, lrs, i, &writer);
+ if( rc!=SQLITE_OK ) goto err;
+
+ /* Step forward those that were merged. */
+ while( i-->0 ){
+ rc = leavesReaderStep(v, lrs+i);
+ if( rc!=SQLITE_OK ) goto err;
+
+ /* Reorder by term, then by age. */
+ leavesReaderReorder(lrs+i, MERGE_COUNT-i);
+ }
+ }
+
+ for(i=0; i<MERGE_COUNT; i++){
+ leavesReaderDestroy(&lrs[i]);
+ }
+
+ rc = leafWriterFlush(v, &writer);
+ leafWriterDestroy(&writer);
+ if( rc!=SQLITE_OK ) return rc;
+
+ /* Delete the merged segment data. */
+ return segdir_delete(v, iLevel);
+
+ err:
+ for(i=0; i<MERGE_COUNT; i++){
+ leavesReaderDestroy(&lrs[i]);
+ }
+ leafWriterDestroy(&writer);
+ return rc;
+}
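+
+/* The net effect is a logarithmic merge hierarchy: each level-(n+1)
+** segment is built from MERGE_COUNT level-n segments, so a level-n
+** segment represents roughly MERGE_COUNT^n level-0 flushes, and each
+** level holds at most MERGE_COUNT segments at any time.
+*/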
+
+/* Read pData[nData] as a leaf node, and if the doclist for
+** pTerm[nTerm] is present, merge it over *out (entries for docids
+** already in *out are overwritten by those read from pData).
+*/
+static int loadSegmentLeaf(fulltext_vtab *v, const char *pData, int nData,
+ const char *pTerm, int nTerm, DocList *out){
+ LeafReader reader;
+ assert( nData>1 );
+ assert( *pData=='\0' );
+
+ leafReaderInit(pData, nData, &reader);
+ while( !leafReaderAtEnd(&reader) ){
+ int c = leafReaderTermCmp(&reader, pTerm, nTerm);
+ if( c==0 ){
+ DocList new, doclist;
+ docListStaticInit(&new, DL_DEFAULT,
+ leafReaderData(&reader), leafReaderDataBytes(&reader));
+ docListMerge(&doclist, out, &new);
+ docListDestroy(out);
+ *out = doclist;
+ }
+ if( c>=0 ) break;
+ leafReaderStep(&reader);
+ }
+ leafReaderDestroy(&reader);
+ return SQLITE_OK;
+}
+
+/* Traverse the tree represented by pData[nData] looking for
+** pTerm[nTerm], merging its doclist over *out if found (entries for
+** docids already in *out are overwritten by those read from the
+** segment rooted at pData).
+*/
+static int loadSegment(fulltext_vtab *v, const char *pData, int nData,
+ const char *pTerm, int nTerm, DocList *out){
+ int rc;
+ sqlite3_stmt *s = NULL;
+
+ assert( nData>1 );
+
+ /* Process data as an interior node until we reach a leaf. */
+ while( *pData!='\0' ){
+ sqlite_int64 iBlockid;
+ InteriorReader reader;
+
+ /* Scan the node data until we find a term greater than our term.
+ ** Our target child will be in the blockid under that term, or in
+ ** the last blockid in the node if we never find such a term.
+ */
+ interiorReaderInit(pData, nData, &reader);
+ while( !interiorReaderAtEnd(&reader) ){
+ if( interiorReaderTermCmp(&reader, pTerm, nTerm)>0 ) break;
+ interiorReaderStep(&reader);
+ }
+
+ /* Grab the child blockid before calling sql_get_statement(),
+ ** because sql_get_statement() may reset our data out from under
+ ** us.
+ */
+ iBlockid = interiorReaderCurrentBlockid(&reader);
+ interiorReaderDestroy(&reader);
+
+ rc = sql_get_statement(v, BLOCK_SELECT_STMT, &s);
+ if( rc!=SQLITE_OK ) return rc;
+
+ rc = sqlite3_bind_int64(s, 1, iBlockid);
+ if( rc!=SQLITE_OK ) return rc;
+
+ rc = sql_step_statement(v, BLOCK_SELECT_STMT, &s);
+ if( rc==SQLITE_DONE ) return SQLITE_ERROR;
+ if( rc!=SQLITE_ROW ) return rc;
+
+ pData = sqlite3_column_blob(s, 0);
+ nData = sqlite3_column_bytes(s, 0);
+ }
+
+ rc = loadSegmentLeaf(v, pData, nData, pTerm, nTerm, out);
+ if( rc!=SQLITE_OK ) return rc;
+
+ /* If we selected a child node, we need to finish that select. */
+ if( s!=NULL ){
+ /* We expect only one row. We must execute another sqlite3_step()
+ ** to complete the iteration; otherwise the table will remain
+ ** locked.
+ */
+ rc = sqlite3_step(s);
+ if( rc==SQLITE_ROW ) return SQLITE_ERROR;
+ if( rc!=SQLITE_DONE ) return rc;
+ }
+ return SQLITE_OK;
+}
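+
+/* Sketch of the descent: in an interior node carrying terms
+** ("dog", "pig"), a search for "horse" steps past "dog", stops at
+** "pig" (the first term greater than "horse"), and descends into the
+** child blockid current at that point; a search for "zebra" exhausts
+** the scan and follows the node's last blockid.
+*/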
+
+/* Scan the database and merge together the posting lists for the term
+** into *out.
+*/
+static int termSelect(fulltext_vtab *v, int iColumn,
+ const char *pTerm, int nTerm, DocList *out){
+ DocList doclist;
+ sqlite3_stmt *s;
+ int rc = sql_get_statement(v, SEGDIR_SELECT_ALL_STMT, &s);
+ if( rc!=SQLITE_OK ) return rc;
+
+ docListInit(&doclist, DL_DEFAULT, 0, 0);
+
+ /* Traverse the segments from oldest to newest so that newer doclist
+ ** elements for given docids overwrite older elements.
+ */
+ while( (rc=sql_step_statement(v, SEGDIR_SELECT_ALL_STMT, &s))==SQLITE_ROW ){
+ rc = loadSegment(v, sqlite3_column_blob(s, 0), sqlite3_column_bytes(s, 0),
+ pTerm, nTerm, &doclist);
+ if( rc!=SQLITE_OK ) goto err;
+ }
+ if( rc==SQLITE_DONE ){
+ *out = doclist;
+
+ /* TODO(shess) The old term_select_all() code applied the column
+ ** restrict as we merged segments, leading to smaller buffers.
+ ** This is probably worthwhile to bring back, once the new storage
+ ** system is checked in.
+ */
+ if( iColumn<v->nColumn ){ /* querying a single column */
+ docListRestrictColumn(out, iColumn);
+ }
+ docListDiscardEmpty(out);
+ return SQLITE_OK;
+ }
+
+ err:
+ docListDestroy(&doclist);
+ return rc;
+}
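+
+/* Note the interplay with deletion: a delete is recorded as a newer
+** doclist entry whose position list is empty, so merging
+** newest-over-oldest masks the older hits, and docListDiscardEmpty()
+** then strips the masked docids from the result. This is why on-disk
+** doclists must carry at least positions.
+*/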
+
+/****************************************************************/
+/* Used to hold hashtable data for sorting. */
+typedef struct TermData {
+ const char *pTerm;
+ int nTerm;
+ DocList *pDoclist;
+} TermData;
+
+/* Orders TermData elements in strcmp fashion (<0 for less-than, 0
+** for equal, >0 for greater-than).
+*/
+static int termDataCmp(const void *av, const void *bv){
+ const TermData *a = (const TermData *)av;
+ const TermData *b = (const TermData *)bv;
+ int n = a->nTerm<b->nTerm ? a->nTerm : b->nTerm;
+ int c = memcmp(a->pTerm, b->pTerm, n);
+ if( c!=0 ) return c;
+ return a->nTerm-b->nTerm;
+}
+
+/* Order pTerms data by term, then write a new level 0 segment using
+** LeafWriter.
+*/
+static int writeZeroSegment(fulltext_vtab *v, fts2Hash *pTerms){
+ fts2HashElem *e;
+ int idx, rc, i, n;
+ TermData *pData;
+ LeafWriter writer;
+
+ /* Determine the next index at level 0, merging as necessary. */
+ rc = segdirNextIndex(v, 0, &idx);
+ if( rc!=SQLITE_OK ) return rc;
+
+ n = fts2HashCount(pTerms);
+ pData = malloc(n*sizeof(TermData));
+ if( pData==NULL ) return SQLITE_NOMEM;
+
+ for(i = 0, e = fts2HashFirst(pTerms); e; i++, e = fts2HashNext(e)){
+ assert( i<n );
+ pData[i].pTerm = fts2HashKey(e);
+ pData[i].nTerm = fts2HashKeysize(e);
+ pData[i].pDoclist = fts2HashData(e);
+ }
+ assert( i==n );
+
+ /* TODO(shess) Should we allow user-defined collation sequences
+ ** here? I think we only need that once we support prefix searches.
+ */
+ if( n>1 ) qsort(pData, n, sizeof(*pData), termDataCmp);
+
+ leafWriterInit(0, idx, &writer);
+ for(i=0; i<n; i++){
+ rc = leafWriterStep(v, &writer,
+ pData[i].pTerm, pData[i].nTerm, pData[i].pDoclist);
+ if( rc!=SQLITE_OK ) goto err;
+ }
+ rc = leafWriterFlush(v, &writer);
+
+ err:
+ free(pData);
+ leafWriterDestroy(&writer);
+ return rc;
+}
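+
+/* Each call from fulltextUpdate() thus turns the update's
+** accumulated in-memory term hash into exactly one new level-0
+** segment (see the fulltextUpdate() changes below), replacing the
+** old per-term index updates.
+*/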
+
/* This function implements the xUpdate callback; it's the top-level entry
* point for inserting, deleting or updating a row in a full-text table. */
static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg,
rc = index_insert(v, ppArg[1], &ppArg[2], pRowid, &terms);
}
- if( rc==SQLITE_OK ){
- /* Write updated doclists to disk. */
- for(e=fts2HashFirst(&terms); e; e=fts2HashNext(e)){
- DocList *p = fts2HashData(e);
- rc = index_insert_term(v, fts2HashKey(e), fts2HashKeysize(e), p);
- if( rc!=SQLITE_OK ) break;
- }
- }
+ if( rc==SQLITE_OK ) rc = writeZeroSegment(v, &terms);
/* clean up */
for(e=fts2HashFirst(&terms); e; e=fts2HashNext(e)){