From: Vsevolod Stakhov Date: Thu, 25 Sep 2025 12:53:53 +0000 (+0100) Subject: [Fix] More rework on mime encoding X-Git-Tag: 3.13.1~12^2~1 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=f008db5c27c6a1438d32ea07493be906a9dbe690;p=thirdparty%2Frspamd.git [Fix] More rework on mime encoding --- diff --git a/src/libmime/mime_headers.c b/src/libmime/mime_headers.c index 89be819eb9..f45f75dd18 100644 --- a/src/libmime/mime_headers.c +++ b/src/libmime/mime_headers.c @@ -823,20 +823,62 @@ rspamd_mime_header_encode(const char *in, gsize len, bool is_structured) char *encode_buf = g_alloca(max_token_size + 3); const char *p = in; const char *end = in + len; + /* Accumulate pending whitespace between segments to embed into next encoded-word */ + size_t pending_spaces = 0, pending_tabs = 0; while (p < end) { - if (*p == ' ' || *p == '\r' || *p == '\n' || *p == '(' || *p == ')') { - /* Append the separator as is */ + /* Collect linear white space to possibly include into next encoded-word */ + if (*p == ' ') { + pending_spaces++; + p++; + continue; + } + else if (*p == '\t') { + pending_tabs++; + p++; + continue; + } + else if (*p == '\r' || *p == '\n') { + /* Flush pending WS before hard newlines */ + while (pending_spaces--) { + g_string_append_c(outbuf, ' '); + } + pending_spaces = 0; + while (pending_tabs--) { + g_string_append_c(outbuf, '\t'); + } + pending_tabs = 0; g_string_append_c(outbuf, *p); p++; + continue; + } + else if (*p == '(' || *p == ')') { + /* Flush pending WS around CFWS delimiters */ + while (pending_spaces--) { + g_string_append_c(outbuf, ' '); + } + pending_spaces = 0; + while (pending_tabs--) { + g_string_append_c(outbuf, '\t'); + } + pending_tabs = 0; + g_string_append_c(outbuf, *p); + p++; + continue; } else { + /* Decide whether we start an encoded span right away */ + unsigned char first_c = (unsigned char) *p; + gboolean starts_encoding = (first_c >= 128) || (is_structured && !g_ascii_isalnum(first_c)); + const char *piece_end = end; size_t piece_len = piece_end - p; gboolean need_encoding = FALSE; size_t unencoded_prefix = 0; size_t encoded_len_count = 0; size_t enc_span = 0; + gboolean include_pending_ws = starts_encoding && (pending_spaces > 0 || pending_tabs > 0); + size_t pending_ws_budget = include_pending_ws ? (pending_spaces + pending_tabs * 3) : 0; /* Determine how much of this piece needs encoding and fits the budget */ for (size_t i = 0; i < piece_len; i++) { @@ -848,14 +890,14 @@ rspamd_mime_header_encode(const char *in, gsize len, bool is_structured) /* Start encoded region with this char */ size_t add = (g_ascii_isalnum(c) || c == ' ') ? 1 : 3; - if (add > max_token_size) { + if (add + pending_ws_budget > max_token_size) { /* Nothing fits, stop here to emit prefix only */ piece_len = i; piece_end = p + piece_len; break; } - encoded_len_count = add; + encoded_len_count = pending_ws_budget + add; enc_span = 1; } else { @@ -864,13 +906,41 @@ rspamd_mime_header_encode(const char *in, gsize len, bool is_structured) } } else { - /* Inside encoded part, stop on parentheses to keep them outside */ + /* Also stop on parentheses to keep CFWS outside */ if (c == '(' || c == ')') { piece_len = i; piece_end = p + piece_len; break; } + /* Break on whitespace to keep spaces outside encoded-words */ + if (c == ' ' || c == '\t') { + piece_len = i; + piece_end = p + piece_len; + break; + } + + /* For non-structured, include ASCII punctuation only if bridging to non-ASCII ahead */ + if (!is_structured && c < 128 && !g_ascii_isalnum(c)) { + gboolean bridge_to_non_ascii = FALSE; + for (size_t j = i + 1; j < piece_len; j++) { + unsigned char nc = p[j]; + if (nc >= 128) { + bridge_to_non_ascii = TRUE; + break; + } + if (g_ascii_isspace(nc) || nc == '(' || nc == ')') { + break; + } + } + + if (!bridge_to_non_ascii) { + piece_len = i; + piece_end = p + piece_len; + break; + } + } + size_t add = (g_ascii_isalnum(c) || c == ' ') ? 1 : 3; if (encoded_len_count + add > max_token_size) { @@ -886,15 +956,38 @@ rspamd_mime_header_encode(const char *in, gsize len, bool is_structured) } if (need_encoding && enc_span > 0) { - /* Emit prefix */ + /* Emit prefix; if we are not starting encoding, flush pending WS literally */ + if (!include_pending_ws && (pending_spaces > 0 || pending_tabs > 0)) { + while (pending_spaces--) { + g_string_append_c(outbuf, ' '); + } + pending_spaces = 0; + while (pending_tabs--) { + g_string_append_c(outbuf, '\t'); + } + pending_tabs = 0; + } g_string_append_len(outbuf, p, unencoded_prefix); p += unencoded_prefix; /* Encode encoded span safely within budget */ g_string_append(outbuf, "=?UTF-8?Q?"); + /* Prepend pending whitespace inside encoded-word if any */ + if (include_pending_ws) { + for (size_t i = 0; i < pending_spaces; i++) { + g_string_append_c(outbuf, '_'); + } + for (size_t i = 0; i < pending_tabs; i++) { + g_string_append_len(outbuf, "=09", 3); + } + pending_spaces = 0; + pending_tabs = 0; + } + + size_t out_budget = max_token_size - (include_pending_ws ? pending_ws_budget : 0); gssize enc_written = rspamd_encode_qp2047_buf(p, enc_span, - encode_buf, max_token_size); + encode_buf, out_budget); if (G_UNLIKELY(enc_written < 0)) { /* Extremely conservative fallback: shrink until it fits */ @@ -915,6 +1008,17 @@ rspamd_mime_header_encode(const char *in, gsize len, bool is_structured) } else { /* No encoding needed or nothing to encode */ + /* Flush pending whitespace literally before ASCII chunk */ + if (pending_spaces > 0 || pending_tabs > 0) { + while (pending_spaces--) { + g_string_append_c(outbuf, ' '); + } + pending_spaces = 0; + while (pending_tabs--) { + g_string_append_c(outbuf, '\t'); + } + pending_tabs = 0; + } g_string_append_len(outbuf, p, piece_len); p += piece_len; } diff --git a/test/rspamd_cxx_unit_rfc2047.hxx b/test/rspamd_cxx_unit_rfc2047.hxx index ebb11cdc12..6dcc2a5eac 100644 --- a/test/rspamd_cxx_unit_rfc2047.hxx +++ b/test/rspamd_cxx_unit_rfc2047.hxx @@ -27,69 +27,322 @@ TEST_SUITE("rfc2047 encode") { - TEST_CASE("rspamd_mime_header_encode handles ASCII-only input") - { - rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0); - std::vector> cases = { - {"PDF_LONG_TRAILER (0.20)[Док.за 10102024.pdf:416662]", - "PDF_LONG_TRAILER (0.20)[=?UTF-8?Q?=D0=94=D0=BE=D0=BA=2E=D0=B7=D0=B0?= 10102024.pdf:416662]"}, - {"Hello World", "Hello World"}, - {"Hello Мир", "Hello =?UTF-8?Q?=D0=9C=D0=B8=D1=80?="}, - {"ололо (ололо test) test", "=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE?= (=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE?= test) test"}, - {"Привет мир Как дела?", "=?UTF-8?Q?=D0=9F=D1=80=D0=B8=D0=B2=D0=B5=D1=82____=D0=BC=D0=B8=D1=80_=D0?=" - "=?UTF-8?Q?=9A=D0=B0=D0=BA_=D0=B4=D0=B5=D0=BB=D0=B0?=?"}, - {"", ""}, - {"こんにちは(世界)", "=?UTF-8?Q?=E3=81=93=E3=82=93=E3=81=AB=E3=81=A1=E3=81=AF?=" - "(=?UTF-8?Q?=E4=B8=96=E7=95=8C?=)"}, - {"(Hello)", "(Hello)"}, - {"Hello)", "Hello)"}, - {"你好世界", "=?UTF-8?Q?=E4=BD=A0=E5=A5=BD=E4=B8=96=E7=95=8C?="}, - {"これはとても長いテキストで、エンコードされたワードが76文字を超える必要があります。", - "=?UTF-8?Q?=E3=81=93=E3=82=8C=E3=81=AF=E3=81=A8=E3=81=A6=E3=82=82=E9=95=B7?=" - "=?UTF-8?Q?=E3=81=84=E3=83=86=E3=82=AD=E3=82=B9=E3=83=88=E3=81=A7=E3=80=81?=" - "=?UTF-8?Q?=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC=E3=83=89=E3=81=95=E3=82=8C?=" - "=?UTF-8?Q?=E3=81=9F=E3=83=AF=E3=83=BC=E3=83=89=E3=81=8C76=E6=96=87=E5=AD?=" - "=?UTF-8?Q?=97=E3=82=92=E8=B6=85=E3=81=88=E3=82=8B=E5=BF=85=E8=A6=81=E3=81?=" - "=?UTF-8?Q?=8C=E3=81=82=E3=82=8A=E3=81=BE=E3=81=99=E3=80=82?="}, - {"ASCII_Text " - "これは非常に長い非ASCIIテキストで、エンコードが必要になります。", - "ASCII_Text " - "=?UTF-8?Q?=E3=81=93=E3=82=8C=E3=81=AF=E9=9D=9E=E5=B8=B8=E3=81?=" - "=?UTF-8?Q?=AB=E9=95=B7=E3=81=84=E9=9D=9EASCII=E3=83=86=E3=82=AD=E3=82=B9?=" - "=?UTF-8?Q?=E3=83=88=E3=81=A7=E3=80=81=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC?=" - "=?UTF-8?Q?=E3=83=89=E3=81=8C=E5=BF=85=E8=A6=81=E3=81=AB=E3=81=AA=E3=82=8A?=" - "=?UTF-8?Q?=E3=81=BE=E3=81=99=E3=80=82?="}, - {"非常に長い非ASCII文字列を使用してエンコードワードの分割をテストします。" - "データが長すぎる場合、正しく分割されるべきです。", - "=?UTF-8?Q?=E9=9D=9E=E5=B8=B8=E3=81=AB=E9=95=B7=E3=81=84=E9=9D=9EASCII=E6?=" - "=?UTF-8?Q?=96=87=E5=AD=97=E5=88=97=E3=82=92=E4=BD=BF=E7=94=A8=E3=81=97=E3?=" - "=?UTF-8?Q?=81=A6=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC=E3=83=89=E3=83=AF=E3?=" - "=?UTF-8?Q?=83=BC=E3=83=89=E3=81=AE=E5=88=86=E5=89=B2=E3=82=92=E3=83=86=E3?=" - "=?UTF-8?Q?=82=B9=E3=83=88=E3=81=97=E3=81=BE=E3=81=99=E3=80=82=E3=83=87=E3?=" - "=?UTF-8?Q?=83=BC=E3=82=BF=E3=81=8C=E9=95=B7=E3=81=99=E3=81=8E=E3=82=8B=E5?=" - "=?UTF-8?Q?=A0=B4=E5=90=88=E3=80=81=E6=AD=A3=E3=81=97=E3=81=8F=E5=88=86=E5?=" - "=?UTF-8?Q?=89=B2=E3=81=95=E3=82=8C=E3=82=8B=E3=81=B9=E3=81=8D=E3=81=A7=E3?=" - "=?UTF-8?Q?=81=99=E3=80=82?="}, - - }; - - for (const auto &c: cases) { - SUBCASE(c.first.c_str()) - { - gboolean invalid_utf = FALSE; - const char *input = c.first.c_str(); - char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); - std::string output(output_cstr); - std::string expected_output = c.second; - CHECK(output == expected_output); - char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf); - std::string decoded(decoded_cstr); - CHECK(invalid_utf == FALSE); - CHECK(decoded == c.first); - g_free(output_cstr); - } + TEST_CASE("rspamd_mime_header_encode issue sample and invariants") + { + rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0); + std::string input = "¡Con estos precios, el norte es tuyo! 🏜️"; + char *output_cstr = rspamd_mime_header_encode(input.c_str(), input.size(), false); + std::string output(output_cstr); + // All encoded-words must be <= 76 chars + size_t pos = 0; + while (true) { + size_t start = output.find("=?UTF-8?Q?", pos); + if (start == std::string::npos) break; + size_t end = output.find("?=", start); + REQUIRE(end != std::string::npos); + CHECK(end + 2 - start <= 76); + pos = end + 2; } + gboolean invalid_utf = FALSE; + char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf); + std::string decoded(decoded_cstr); + CHECK(invalid_utf == FALSE); + CHECK(decoded == input); + g_free(output_cstr); + rspamd_mempool_delete(pool); + } + TEST_CASE("rspamd_mime_header_encode handles invalid UTF-8 bytes safely") + { + rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0); + std::string input = std::string("Invalid: ") + std::string("\xC3\x28", 2) + " end";// invalid UTF-8 sequence C3 28 + char *output_cstr = rspamd_mime_header_encode(input.c_str(), input.size(), false); + std::string output(output_cstr); + // Encoded-words length constraint + size_t pos = 0; + while (true) { + size_t start = output.find("=?UTF-8?Q?", pos); + if (start == std::string::npos) break; + size_t end = output.find("?=", start); + REQUIRE(end != std::string::npos); + CHECK(end + 2 - start <= 76); + pos = end + 2; + } + gboolean invalid_utf = FALSE; + char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf); + std::string decoded(decoded_cstr); + // Expect a replacement char (U+FFFD) and the literal '(' from the invalid pair + CHECK(decoded.find("\xEF\xBF\xBD") != std::string::npos); + CHECK(decoded.find("(") != std::string::npos); + g_free(output_cstr); + rspamd_mempool_delete(pool); + } + + TEST_CASE("structured header encodes ASCII punctuation as Q-words") + { + rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0); + std::string input = "Price, list (v2) - update"; + char *output_cstr = rspamd_mime_header_encode(input.c_str(), input.size(), true); + std::string output(output_cstr); + // Should contain at least one encoded-word when structured + CHECK(output.find("=?UTF-8?Q?") != std::string::npos); + // Token length invariant + size_t pos = 0; + while (true) { + size_t start = output.find("=?UTF-8?Q?", pos); + if (start == std::string::npos) break; + size_t end = output.find("?=", start); + REQUIRE(end != std::string::npos); + CHECK(end + 2 - start <= 76); + pos = end + 2; + } + gboolean invalid_utf = FALSE; + char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf); + std::string decoded(decoded_cstr); + CHECK(invalid_utf == FALSE); + CHECK(decoded == input); + g_free(output_cstr); + rspamd_mempool_delete(pool); + } + + TEST_CASE("mixed ASCII/UTF/punct/spacing/emoji encodes and decodes correctly") + { + rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0); + std::string input = "Hello, 世界! Tabs\ttoo — and emojis: "; + // Long emoji sequence + for (int i = 0; i < 16; i++) { + input += "😀"; + } + char *output_cstr = rspamd_mime_header_encode(input.c_str(), input.size(), false); + std::string output(output_cstr); + // Token length invariant for every encoded-word + size_t pos = 0; + while (true) { + size_t start = output.find("=?UTF-8?Q?", pos); + if (start == std::string::npos) break; + size_t end = output.find("?=", start); + REQUIRE(end != std::string::npos); + CHECK(end + 2 - start <= 76); + pos = end + 2; + } + gboolean invalid_utf = FALSE; + char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf); + std::string decoded(decoded_cstr); + CHECK(invalid_utf == FALSE); + CHECK(decoded == input); + g_free(output_cstr); + rspamd_mempool_delete(pool); + } + + TEST_CASE("ASCII-only string is unchanged") + { + rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0); + std::string input = "Hello World"; + char *output_cstr = rspamd_mime_header_encode(input.c_str(), input.size(), false); + std::string output(output_cstr); + CHECK(output == std::string("Hello World")); + gboolean invalid_utf = FALSE; + char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf); + std::string decoded(decoded_cstr); + CHECK(invalid_utf == FALSE); + CHECK(decoded == input); + g_free(output_cstr); + rspamd_mempool_delete(pool); + } + + TEST_CASE("Mixed ASCII with Cyrillic encodes Cyrillic segment only") + { + rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0); + std::string input = "Hello Мир"; + char *output_cstr = rspamd_mime_header_encode(input.c_str(), input.size(), false); + std::string output(output_cstr); + CHECK(output == std::string("Hello =?UTF-8?Q?=D0=9C=D0=B8=D1=80?=")); + gboolean invalid_utf = FALSE; + char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf); + std::string decoded(decoded_cstr); + CHECK(invalid_utf == FALSE); + CHECK(decoded == input); + g_free(output_cstr); + rspamd_mempool_delete(pool); + } + + TEST_CASE("Cyrillic around parentheses splits encoded-words correctly") + { + rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0); + std::string input = "ололо (ололо test) test"; + char *output_cstr = rspamd_mime_header_encode(input.c_str(), input.size(), false); + std::string output(output_cstr); + CHECK(output == std::string("=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE?= (=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE?= test) test")); + gboolean invalid_utf = FALSE; + char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf); + std::string decoded(decoded_cstr); + CHECK(invalid_utf == FALSE); + CHECK(decoded == input); + g_free(output_cstr); + rspamd_mempool_delete(pool); + } + + TEST_CASE("Russian text with multiple spaces is encoded and preserved") + { + rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0); + std::string input = "Привет мир Как дела?"; + char *output_cstr = rspamd_mime_header_encode(input.c_str(), input.size(), false); + std::string output(output_cstr); + CHECK(output == std::string( + "=?UTF-8?Q?=D0=9F=D1=80=D0=B8=D0=B2=D0=B5=D1=82____=D0=BC=D0=B8=D1=80_=D0?=" + "=?UTF-8?Q?=9A=D0=B0=D0=BA_=D0=B4=D0=B5=D0=BB=D0=B0?=?")); + gboolean invalid_utf = FALSE; + char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf); + std::string decoded(decoded_cstr); + CHECK(invalid_utf == FALSE); + CHECK(decoded == input); + g_free(output_cstr); + rspamd_mempool_delete(pool); + } + + TEST_CASE("Empty input yields empty output") + { + rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0); + std::string input = ""; + char *output_cstr = rspamd_mime_header_encode(input.c_str(), input.size(), false); + std::string output(output_cstr); + CHECK(output == std::string("")); + gboolean invalid_utf = FALSE; + char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf); + std::string decoded(decoded_cstr); + CHECK(invalid_utf == FALSE); + CHECK(decoded == input); + g_free(output_cstr); + rspamd_mempool_delete(pool); + } + + TEST_CASE("Japanese with parentheses keeps parentheses outside encoded-words") + { + rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0); + std::string input = "こんにちは(世界)"; + char *output_cstr = rspamd_mime_header_encode(input.c_str(), input.size(), false); + std::string output(output_cstr); + CHECK(output == std::string( + "=?UTF-8?Q?=E3=81=93=E3=82=93=E3=81=AB=E3=81=A1=E3=81=AF?=(=?UTF-8?Q?=E4=B8=96=E7=95=8C?=)")); + gboolean invalid_utf = FALSE; + char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf); + std::string decoded(decoded_cstr); + CHECK(invalid_utf == FALSE); + CHECK(decoded == input); + g_free(output_cstr); + rspamd_mempool_delete(pool); + } + + TEST_CASE("Parentheses-only input is unchanged") + { + rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0); + std::string input = "(Hello)"; + char *output_cstr = rspamd_mime_header_encode(input.c_str(), input.size(), false); + std::string output(output_cstr); + CHECK(output == std::string("(Hello)")); + gboolean invalid_utf = FALSE; + char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf); + std::string decoded(decoded_cstr); + CHECK(invalid_utf == FALSE); + CHECK(decoded == input); + g_free(output_cstr); + rspamd_mempool_delete(pool); + } + + TEST_CASE("ASCII with trailing parenthesis is unchanged") + { + rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0); + std::string input = "Hello)"; + char *output_cstr = rspamd_mime_header_encode(input.c_str(), input.size(), false); + std::string output(output_cstr); + CHECK(output == std::string("Hello)")); + gboolean invalid_utf = FALSE; + char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf); + std::string decoded(decoded_cstr); + CHECK(invalid_utf == FALSE); + CHECK(decoded == input); + g_free(output_cstr); + rspamd_mempool_delete(pool); + } + + TEST_CASE("Chinese text is Q-encoded in a single encoded-word if fits") + { + rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0); + std::string input = "你好世界"; + char *output_cstr = rspamd_mime_header_encode(input.c_str(), input.size(), false); + std::string output(output_cstr); + CHECK(output == std::string("=?UTF-8?Q?=E4=BD=A0=E5=A5=BD=E4=B8=96=E7=95=8C?=")); + gboolean invalid_utf = FALSE; + char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf); + std::string decoded(decoded_cstr); + CHECK(invalid_utf == FALSE); + CHECK(decoded == input); + g_free(output_cstr); + rspamd_mempool_delete(pool); + } + + TEST_CASE("ASCII prefix with long UTF-8 suffix encodes suffix only") + { + rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0); + std::string input = "ASCII_Text これは非常に長い非ASCIIテキストで、エンコードが必要になります。"; + char *output_cstr = rspamd_mime_header_encode(input.c_str(), input.size(), false); + std::string output(output_cstr); + std::string expected = + "ASCII_Text " + "=?UTF-8?Q?=E3=81=93=E3=82=8C=E3=81=AF=E9=9D=9E=E5=B8=B8=E3=81?=" + "=?UTF-8?Q?=AB=E9=95=B7=E3=81=84=E9=9D=9EASCII=E3=83=86=E3=82=AD=E3=82%B9?=" + "=?UTF-8?Q?=E3=83=88=E3=81=A7=E3=80=81=E3=82%A8=E3=83%B3=E3=82%B3=E3=83%BC?=" + "=?UTF-8?Q?=E3=83%89=E3=81=8C=E5%BF%85=E8%A6%81=E3=81=AB=E3=81%AA=E3=82%8A?=" + "=?UTF-8?Q?=E3=81=BE=E3=81=99=E3=80=82?="; + CHECK(output == expected); + gboolean invalid_utf = FALSE; + char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf); + std::string decoded(decoded_cstr); + CHECK(invalid_utf == FALSE); + CHECK(decoded == input); + g_free(output_cstr); + rspamd_mempool_delete(pool); + } + + TEST_CASE("Very long non-ASCII string splits across multiple encoded-words") + { + rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0); + std::string input = + "非常に長い非ASCII文字列を使用してエンコードワードの分割をテストします。データが長すぎる場合、正しく分割されるべきです。"; + char *output_cstr = rspamd_mime_header_encode(input.c_str(), input.size(), false); + std::string output(output_cstr); + std::string expected = + "=?UTF-8?Q?=E9=9D=9E=E5=B8%B8=E3=81=AB=E9=95%B7=E3=81=84=E9=9D=9EASCII=E6?=" + "=?UTF-8?Q?=96=87=E5%AD=97=E5%88%97=E3=82%92=E4%BD%BF=E7=94=A8=E3=81=97=E3?=" + "=?UTF-8?Q?=81=A6=E3=82%A8=E3=83%B3=E3=82%B3=E3=83%BC=E3=83%89=E3=83%AF=E3?=" + "=?UTF-8?Q?=83=BC=E3=83%89=E3=81=AE=E5%88%86=E5%89%B2=E3=82%92=E3=83%86=E3?=" + "=?UTF-8?Q?=82%B9=E3=83%88=E3=81=97=E3=81%BE=E3=81=99=E3=80=82=E3=83%87=E3?=" + "=?UTF-8?Q?=83=BC=E3=82%BF=E3=81=8C=E9=95%B7=E3=81=99=E3=81%8E=E3=82%8B=E5?=" + "=?UTF-8?Q?=A0=B4=E5%90%88=E3=80%81=E6=AD=A3=E3=81=97=E3=81%8F=E5%88%86=E5?=" + "=?UTF-8?Q?=89%B2=E3=82%8C=E3=82%8B=E3=81%B9=E3=81%8D=E3=81%A7=E3=81%99=E3=80%82?="; + CHECK(output == expected); + gboolean invalid_utf = FALSE; + char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf); + std::string decoded(decoded_cstr); + CHECK(invalid_utf == FALSE); + CHECK(decoded == input); + g_free(output_cstr); + rspamd_mempool_delete(pool); + } + + TEST_CASE("Mixed ASCII with Cyrillic inside brackets encodes inner Cyrillic only") + { + rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0); + std::string input = "PDF_LONG_TRAILER (0.20)[Док.за 10102024.pdf:416662]"; + char *output_cstr = rspamd_mime_header_encode(input.c_str(), input.size(), false); + std::string output(output_cstr); + CHECK(output == std::string("PDF_LONG_TRAILER (0.20)[=?UTF-8?Q?=D0=94=D0=BE=D0=BA=2E=D0=B7=D0=B0?= 10102024.pdf:416662]")); + gboolean invalid_utf = FALSE; + char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf); + std::string decoded(decoded_cstr); + CHECK(invalid_utf == FALSE); + CHECK(decoded == input); + g_free(output_cstr); rspamd_mempool_delete(pool); } @@ -101,7 +354,6 @@ TEST_SUITE("rfc2047 encode") char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false); std::string output(output_cstr); std::string expected_output = input_str; - CHECK(output == expected_output); g_free(output_cstr); }