]> git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
lib-fts: Implement support for parent tokenizer "streaming"
author Timo Sirainen <timo.sirainen@open-xchange.com>
Tue, 26 Oct 2021 13:34:25 +0000 (16:34 +0300)
committer aki.tuomi <aki.tuomi@open-xchange.com>
Mon, 8 Nov 2021 10:31:23 +0000 (10:31 +0000)
By default, a parent tokenizer further tokenizes the token strings returned
by the child tokenizer. When streaming is enabled, the parent tokenizers
instead tokenize a stream of data sent by the child tokenizer. This
effectively makes the parent tokenizer return the same tokens as if the
child tokenizer didn't exist (assuming the child tokenizer feeds the parent
all the same input).

Arguably this should be the only way tokenizers work, but at least for now
let's keep both ways.

src/lib-fts/fts-tokenizer-private.h
src/lib-fts/fts-tokenizer.c

index 59fdeca49456079894961ac9107e6badbe8b8749..b7615b17be364aae14fcc1b215615589c4682b9c 100644 (file)
@@ -36,6 +36,14 @@ struct fts_tokenizer {
        size_t prev_skip;
        bool prev_reply_finished;
        bool skip_parents; /* Return token as is, do not hand to parents. */
+       /* Instead of handing child tokens separately to parent tokenizer,
+          treat the returned tokens as a continuous stream. The final token
+          isn't returned until the child tokenizer also sees 0-sized data. */
+       bool stream_to_parents;
+       /* Parent stream still needs to be finalized, so any final pending
+          tokens will be returned. This is used only with
+          stream_to_parents=TRUE. */
+       bool finalize_parent_pending;
 };
 
 void fts_tokenizer_register(const struct fts_tokenizer *tok_class);
index b1bb8941b7088f7c388d6183fd023c3f279636bd..2ae5a17f1fcf49a1649763adf81b28ec5ccdce22 100644 (file)
@@ -193,6 +193,16 @@ int fts_tokenizer_next(struct fts_tokenizer *tok,
                ret = fts_tokenizer_next_self(tok, data, size, token_r, error_r);
                if (ret <= 0) {
                        /* error / more data needed */
+                       if (ret == 0 && size == 0 &&
+                           tok->finalize_parent_pending) {
+                               /* Tokenizer input is being finalized. The
+                                  child tokenizer is done now, but the parent
+                                  tokenizer still needs to be finalized. */
+                               tok->finalize_parent_pending = FALSE;
+                               tok->parent_state =
+                                       FTS_TOKENIZER_PARENT_STATE_FINALIZE;
+                               return fts_tokenizer_next(tok, NULL, 0, token_r, error_r);
+                       }
                        break;
                }
 
@@ -222,9 +232,14 @@ int fts_tokenizer_next(struct fts_tokenizer *tok,
        case FTS_TOKENIZER_PARENT_STATE_FINALIZE:
                /* No more input is coming from the child tokenizer. Return the
                   final token(s) from the parent tokenizer. */
-               ret = fts_tokenizer_next(tok->parent, NULL, 0, token_r, error_r);
-               if (ret != 0)
-                       break;
+               if (!tok->stream_to_parents || size == 0) {
+                       ret = fts_tokenizer_next(tok->parent, NULL, 0,
+                                                token_r, error_r);
+                       if (ret != 0)
+                               break;
+               } else {
+                       tok->finalize_parent_pending = TRUE;
+               }
                /* We're finished handling the previous child token. See if
                   there are more child tokens available with this same data
                   input. */