From: Aki Tuomi Date: Mon, 17 May 2021 09:13:16 +0000 (+0300) Subject: fts: Do not consider arbitrary headers with 8-bit data as language-specific X-Git-Tag: 2.3.16~67 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=0cb2ec40a40a0fa1097fb37a8b0ea262fc0ff61d;p=thirdparty%2Fdovecot%2Fcore.git fts: Do not consider arbitrary headers with 8-bit data as language-specific If we apply stemming and other language-specific processing to arbitrary headers, their values might become impossible to find in subsequent searches. In practice this breaks e.g. From/To header searches with non-ASCII names. --- diff --git a/src/plugins/fts/fts-build-mail.c b/src/plugins/fts/fts-build-mail.c index 340fb7d962..d0f1c560ac 100644 --- a/src/plugins/fts/fts-build-mail.c +++ b/src/plugins/fts/fts-build-mail.c @@ -105,17 +105,6 @@ fts_build_unstructured_header(struct fts_mail_build_context *ctx, return ret; } -static bool data_has_8bit(const unsigned char *data, size_t size) -{ - size_t i; - - for (i = 0; i < size; i++) { - if ((data[i] & 0x80) != 0) - return TRUE; - } - return FALSE; -} - static void fts_mail_build_ctx_set_lang(struct fts_mail_build_context *ctx, struct fts_user_language *user_lang) { @@ -135,11 +124,9 @@ fts_build_tokenized_hdr_update_lang(struct fts_mail_build_context *ctx, /* Headers that don't contain any human language will only be translated to lowercase - no stemming or other filtering. There's unfortunately no pefect way of detecting which headers contain - human languages, so we have a list of some hardcoded header names - and we'll also assume that if there's any 8bit content it's a human - language. */ - if (fts_header_has_language(hdr->name) || - data_has_8bit(hdr->full_value, hdr->full_value_len)) + human languages, so we check with fts_header_has_language if the + header is something that's supposed to containing human text. */ + if (fts_header_has_language(hdr->name)) ctx->cur_user_lang = NULL; else { fts_mail_build_ctx_set_lang(ctx,