]> git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
lib-fts: fts-tokenizer - Add comments explaning how it works
authorTimo Sirainen <timo.sirainen@open-xchange.com>
Tue, 26 Oct 2021 09:33:50 +0000 (12:33 +0300)
committeraki.tuomi <aki.tuomi@open-xchange.com>
Mon, 8 Nov 2021 10:31:23 +0000 (10:31 +0000)
src/lib-fts/fts-tokenizer.c

index 1f872b39d6354e7e8765eed4e816ea3916b506cd..b1bb8941b7088f7c388d6183fd023c3f279636bd 100644 (file)
@@ -139,10 +139,11 @@ fts_tokenizer_next_self(struct fts_tokenizer *tok,
                 (data == tok->prev_data && size == tok->prev_size));
 
        if (tok->prev_reply_finished) {
-               /* whole new data */
+               /* whole new data: get the first token */
                ret = tok->v->next(tok, data, size, &skip, token_r, error_r);
        } else {
-               /* continuing previous data */
+               /* continuing previous data: skip over the tokens that were
+                  already returned from it and get the next token. */
                i_assert(tok->prev_skip <= size);
 
                const unsigned char *data_next;
@@ -158,13 +159,17 @@ fts_tokenizer_next_self(struct fts_tokenizer *tok,
        }
 
        if (ret > 0) {
+               /* A token was successfully returned. There could be more
+                  tokens left within the provided data, so remember what part
+                  of the data we used so far. */
                i_assert(skip <= size - tok->prev_skip);
                tok->prev_data = data;
                tok->prev_size = size;
                tok->prev_skip = tok->prev_skip + skip;
                tok->prev_reply_finished = FALSE;
        } else if (ret == 0) {
-               /* we need a new data block */
+               /* Need more data to get the next token. The next call will
+                  provide a whole new data block, so reset the prev_* state. */
                fts_tokenizer_self_reset(tok);
        }
        return ret;
@@ -184,14 +189,30 @@ int fts_tokenizer_next(struct fts_tokenizer *tok,
 
        switch (tok->parent_state) {
        case FTS_TOKENIZER_PARENT_STATE_ADD_DATA:
+               /* Try to get the next token using this tokenizer */
                ret = fts_tokenizer_next_self(tok, data, size, token_r, error_r);
-               if (ret <= 0 || tok->parent == NULL || tok->skip_parents)
+               if (ret <= 0) {
+                       /* error / more data needed */
                        break;
+               }
+
+               /* Feed the returned token to the parent tokenizer, if it
+                  exists. The parent tokenizer may further split it into
+                  smaller pieces. */
+               if (tok->parent == NULL)
+                       break;
+               if (tok->skip_parents) {
+                       /* Parent tokenizer exists, but it's skipped for now.
+                          This can be used by child tokenizers to return a
+                          token directly, bypassing the parent tokenizer. */
+                       break;
+               }
                buffer_set_used_size(tok->parent_input, 0);
                buffer_append(tok->parent_input, *token_r, strlen(*token_r));
                tok->parent_state++;
                /* fall through */
        case FTS_TOKENIZER_PARENT_STATE_NEXT_OUTPUT:
+               /* Return the next token from parent tokenizer */
                ret = fts_tokenizer_next(tok->parent, tok->parent_input->data,
                                         tok->parent_input->used, token_r, error_r);
                if (ret != 0)
@@ -199,11 +220,14 @@ int fts_tokenizer_next(struct fts_tokenizer *tok,
                tok->parent_state++;
                /* fall through */
        case FTS_TOKENIZER_PARENT_STATE_FINALIZE:
+               /* No more input is coming from the child tokenizer. Return the
+                  final token(s) from the parent tokenizer. */
                ret = fts_tokenizer_next(tok->parent, NULL, 0, token_r, error_r);
                if (ret != 0)
                        break;
-               /* we're finished sending this token to parent tokenizer.
-                  see if our own tokenizer has more tokens available */
+               /* We're finished handling the previous child token. See if
+                  there are more child tokens available with this same data
+                  input. */
                tok->parent_state = FTS_TOKENIZER_PARENT_STATE_ADD_DATA;
                return fts_tokenizer_next(tok, data, size, token_r, error_r);
        default: