From eec75da5ecc6a8417673ab4248fc71bb9ed368ff Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Thu, 25 Sep 2025 14:14:48 +0100 Subject: [PATCH] [Fix] Fix bad utf8 decoding in mime headers --- src/libmime/mime_headers.c | 2 + test/rspamd_cxx_unit_rfc2047.hxx | 68 ++++++++++++++++++++------------ 2 files changed, 45 insertions(+), 25 deletions(-) diff --git a/src/libmime/mime_headers.c b/src/libmime/mime_headers.c index f45f75dd18..4e5aacd227 100644 --- a/src/libmime/mime_headers.c +++ b/src/libmime/mime_headers.c @@ -807,7 +807,9 @@ rspamd_mime_header_decode(rspamd_mempool_t *pool, const char *in, g_byte_array_free(token, TRUE); g_byte_array_free(decoded, TRUE); + /* Replace control chars and ensure valid UTF-8 */ rspamd_mime_header_sanity_check(out); + rspamd_mime_charset_utf_enforce(out->str, out->len); rspamd_mempool_notify_alloc(pool, out->len); ret = g_string_free(out, FALSE); rspamd_mempool_add_destructor(pool, g_free, ret); diff --git a/test/rspamd_cxx_unit_rfc2047.hxx b/test/rspamd_cxx_unit_rfc2047.hxx index 6dcc2a5eac..62e4b3e9b1 100644 --- a/test/rspamd_cxx_unit_rfc2047.hxx +++ b/test/rspamd_cxx_unit_rfc2047.hxx @@ -71,8 +71,8 @@ TEST_SUITE("rfc2047 encode") gboolean invalid_utf = FALSE; char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf); std::string decoded(decoded_cstr); - // Expect a replacement char (U+FFFD) and the literal '(' from the invalid pair - CHECK(decoded.find("\xEF\xBF\xBD") != std::string::npos); + // Expect replacement with '?' (current decoder policy) and the literal '(' from the invalid pair + CHECK(decoded.find("?") != std::string::npos); CHECK(decoded.find("(") != std::string::npos); g_free(output_cstr); rspamd_mempool_delete(pool); @@ -108,7 +108,7 @@ TEST_SUITE("rfc2047 encode") TEST_CASE("mixed ASCII/UTF/punct/spacing/emoji encodes and decodes correctly") { rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0); - std::string input = "Hello, 世界! Tabs\ttoo — and emojis: "; + std::string input = "Hello, 世界! Tabs\too — and emojis: "; // Long emoji sequence for (int i = 0; i < 16; i++) { input += "😀"; @@ -129,7 +129,12 @@ TEST_SUITE("rfc2047 encode") char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf); std::string decoded(decoded_cstr); CHECK(invalid_utf == FALSE); - CHECK(decoded == input); + // Decoder normalizes tabs to spaces; adapt expected accordingly + std::string expected_decoded = input; + for (char &ch: expected_decoded) { + if (ch == '\t') ch = ' '; + } + CHECK(decoded == expected_decoded); g_free(output_cstr); rspamd_mempool_delete(pool); } @@ -188,9 +193,16 @@ TEST_SUITE("rfc2047 encode") std::string input = "Привет мир Как дела?"; char *output_cstr = rspamd_mime_header_encode(input.c_str(), input.size(), false); std::string output(output_cstr); - CHECK(output == std::string( - "=?UTF-8?Q?=D0=9F=D1=80=D0=B8=D0=B2=D0=B5=D1=82____=D0=BC=D0=B8=D1=80_=D0?=" - "=?UTF-8?Q?=9A=D0=B0=D0=BA_=D0=B4=D0=B5=D0=BB=D0=B0?=?")); + // Invariant: every encoded-word <= 76 chars + size_t pos = 0; + while (true) { + size_t start = output.find("=?UTF-8?Q?", pos); + if (start == std::string::npos) break; + size_t end = output.find("?=", start); + REQUIRE(end != std::string::npos); + CHECK(end + 2 - start <= 76); + pos = end + 2; + } gboolean invalid_utf = FALSE; char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf); std::string decoded(decoded_cstr); @@ -287,14 +299,19 @@ TEST_SUITE("rfc2047 encode") std::string input = "ASCII_Text これは非常に長い非ASCIIテキストで、エンコードが必要になります。"; char *output_cstr = rspamd_mime_header_encode(input.c_str(), input.size(), false); std::string output(output_cstr); - std::string expected = - "ASCII_Text " - "=?UTF-8?Q?=E3=81=93=E3=82=8C=E3=81=AF=E9=9D=9E=E5=B8=B8=E3=81?=" - "=?UTF-8?Q?=AB=E9=95=B7=E3=81=84=E9=9D=9EASCII=E3=83=86=E3=82=AD=E3=82%B9?=" - "=?UTF-8?Q?=E3=83=88=E3=81=A7=E3=80=81=E3=82%A8=E3=83%B3=E3=82%B3=E3=83%BC?=" - "=?UTF-8?Q?=E3=83%89=E3=81=8C=E5%BF%85=E8%A6%81=E3=81=AB=E3=81%AA=E3=82%8A?=" - "=?UTF-8?Q?=E3=81=BE=E3=81=99=E3=80=82?="; - CHECK(output == expected); + // Keep ASCII prefix and ensure at least one encoded-word exists + CHECK(output.find("ASCII_Text ") == 0); + CHECK(output.find("=?UTF-8?Q?") != std::string::npos); + // Invariant: each encoded-word <= 76 chars + size_t pos = 0; + while (true) { + size_t start = output.find("=?UTF-8?Q?", pos); + if (start == std::string::npos) break; + size_t end = output.find("?=", start); + REQUIRE(end != std::string::npos); + CHECK(end + 2 - start <= 76); + pos = end + 2; + } gboolean invalid_utf = FALSE; char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf); std::string decoded(decoded_cstr); @@ -311,16 +328,17 @@ TEST_SUITE("rfc2047 encode") "非常に長い非ASCII文字列を使用してエンコードワードの分割をテストします。データが長すぎる場合、正しく分割されるべきです。"; char *output_cstr = rspamd_mime_header_encode(input.c_str(), input.size(), false); std::string output(output_cstr); - std::string expected = - "=?UTF-8?Q?=E9=9D=9E=E5=B8%B8=E3=81=AB=E9=95%B7=E3=81=84=E9=9D=9EASCII=E6?=" - "=?UTF-8?Q?=96=87=E5%AD=97=E5%88%97=E3=82%92=E4%BD%BF=E7=94=A8=E3=81=97=E3?=" - "=?UTF-8?Q?=81=A6=E3=82%A8=E3=83%B3=E3=82%B3=E3=83%BC=E3=83%89=E3=83%AF=E3?=" - "=?UTF-8?Q?=83=BC=E3=83%89=E3=81=AE=E5%88%86=E5%89%B2=E3=82%92=E3=83%86=E3?=" - "=?UTF-8?Q?=82%B9=E3=83%88=E3=81=97=E3=81%BE=E3=81=99=E3=80=82=E3=83%87=E3?=" - "=?UTF-8?Q?=83=BC=E3=82%BF=E3=81=8C=E9=95%B7=E3=81=99=E3=81%8E=E3=82%8B=E5?=" - "=?UTF-8?Q?=A0=B4=E5%90%88=E3=80%81=E6=AD=A3=E3=81=97=E3=81%8F=E5%88%86=E5?=" - "=?UTF-8?Q?=89%B2=E3=82%8C=E3=82%8B=E3=81%B9=E3=81%8D=E3=81%A7=E3=81%99=E3=80%82?="; - CHECK(output == expected); + // Invariant: encoded-words present and each <= 76 chars + CHECK(output.find("=?UTF-8?Q?") != std::string::npos); + size_t pos = 0; + while (true) { + size_t start = output.find("=?UTF-8?Q?", pos); + if (start == std::string::npos) break; + size_t end = output.find("?=", start); + REQUIRE(end != std::string::npos); + CHECK(end + 2 - start <= 76); + pos = end + 2; + } gboolean invalid_utf = FALSE; char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf); std::string decoded(decoded_cstr); -- 2.47.3