From: Timo Sirainen Date: Tue, 26 Oct 2021 13:34:25 +0000 (+0300) Subject: lib-fts: Implement support for parent tokenizer "streaming" X-Git-Tag: 2.3.18~151 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=2af8437d1d19f1fba76a835c05878f19d64e9b72;p=thirdparty%2Fdovecot%2Fcore.git lib-fts: Implement support for parent tokenizer "streaming" By default parent tokenizer is further tokenizing the token strings returned by child tokenizer. When streaming is enabled, the parent tokenizers are instead tokenizing a stream of data sent by the child tokenizer. This effectively makes the parent tokenizer return the same tokens as if the child tokenizer didn't exist (assuming child tokenizer feeds the parent all the same input). Arguably this should be the only way tokenizers work, but at least for now let's keep both ways. --- diff --git a/src/lib-fts/fts-tokenizer-private.h b/src/lib-fts/fts-tokenizer-private.h index 59fdeca494..b7615b17be 100644 --- a/src/lib-fts/fts-tokenizer-private.h +++ b/src/lib-fts/fts-tokenizer-private.h @@ -36,6 +36,14 @@ struct fts_tokenizer { size_t prev_skip; bool prev_reply_finished; bool skip_parents; /* Return token as is, do not hand to parents. */ + /* Instead of handing child tokens separately to parent tokenizer, + treat the returned tokens as a continuous stream. The final token + isn't returned until the child tokenizer also sees 0-sized data. */ + bool stream_to_parents; + /* Parent stream still needs to be finalized, so any final pending + tokens will be returned. This is used only with + stream_to_parents=TRUE. 
*/ + bool finalize_parent_pending; }; void fts_tokenizer_register(const struct fts_tokenizer *tok_class); diff --git a/src/lib-fts/fts-tokenizer.c b/src/lib-fts/fts-tokenizer.c index b1bb8941b7..2ae5a17f1f 100644 --- a/src/lib-fts/fts-tokenizer.c +++ b/src/lib-fts/fts-tokenizer.c @@ -193,6 +193,16 @@ int fts_tokenizer_next(struct fts_tokenizer *tok, ret = fts_tokenizer_next_self(tok, data, size, token_r, error_r); if (ret <= 0) { /* error / more data needed */ + if (ret == 0 && size == 0 && + tok->finalize_parent_pending) { + /* Tokenizer input is being finalized. The + child tokenizer is done now, but the parent + tokenizer still needs to be finalized. */ + tok->finalize_parent_pending = FALSE; + tok->parent_state = + FTS_TOKENIZER_PARENT_STATE_FINALIZE; + return fts_tokenizer_next(tok, NULL, 0, token_r, error_r); + } break; } @@ -222,9 +232,14 @@ int fts_tokenizer_next(struct fts_tokenizer *tok, case FTS_TOKENIZER_PARENT_STATE_FINALIZE: /* No more input is coming from the child tokenizer. Return the final token(s) from the parent tokenizer. */ - ret = fts_tokenizer_next(tok->parent, NULL, 0, token_r, error_r); - if (ret != 0) - break; + if (!tok->stream_to_parents || size == 0) { + ret = fts_tokenizer_next(tok->parent, NULL, 0, + token_r, error_r); + if (ret != 0) + break; + } else { + tok->finalize_parent_pending = TRUE; + } /* We're finished handling the previous child token. See if there are more child tokens available with this same data input. */