From 0cb2ec40a40a0fa1097fb37a8b0ea262fc0ff61d Mon Sep 17 00:00:00 2001 From: Aki Tuomi Date: Mon, 17 May 2021 12:13:16 +0300 Subject: [PATCH] fts: Do not consider arbitrary headers with 8-bit data as language-specific If we do stemming etc. processing for arbitrary headers, they might become impossible to find in subsequent searches due to stemming. This practically breaks e.g. From/To header searches with non-ascii names. --- src/plugins/fts/fts-build-mail.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/src/plugins/fts/fts-build-mail.c b/src/plugins/fts/fts-build-mail.c index 340fb7d962..d0f1c560ac 100644 --- a/src/plugins/fts/fts-build-mail.c +++ b/src/plugins/fts/fts-build-mail.c @@ -105,17 +105,6 @@ fts_build_unstructured_header(struct fts_mail_build_context *ctx, return ret; } -static bool data_has_8bit(const unsigned char *data, size_t size) -{ - size_t i; - - for (i = 0; i < size; i++) { - if ((data[i] & 0x80) != 0) - return TRUE; - } - return FALSE; -} - static void fts_mail_build_ctx_set_lang(struct fts_mail_build_context *ctx, struct fts_user_language *user_lang) { @@ -135,11 +124,9 @@ fts_build_tokenized_hdr_update_lang(struct fts_mail_build_context *ctx, /* Headers that don't contain any human language will only be translated to lowercase - no stemming or other filtering. There's unfortunately no pefect way of detecting which headers contain - human languages, so we have a list of some hardcoded header names - and we'll also assume that if there's any 8bit content it's a human - language. */ - if (fts_header_has_language(hdr->name) || - data_has_8bit(hdr->full_value, hdr->full_value_len)) + human languages, so we check with fts_header_has_language if the + header is something that's supposed to contain human text. */ + if (fts_header_has_language(hdr->name)) ctx->cur_user_lang = NULL; else { fts_mail_build_ctx_set_lang(ctx, -- 2.47.3