char *encode_buf = g_alloca(max_token_size + 3);
const char *p = in;
const char *end = in + len;
+ /* Accumulate pending whitespace between segments to embed into next encoded-word */
+ size_t pending_spaces = 0, pending_tabs = 0;
while (p < end) {
- if (*p == ' ' || *p == '\r' || *p == '\n' || *p == '(' || *p == ')') {
- /* Append the separator as is */
+ /* Collect linear white space to possibly include into next encoded-word */
+ if (*p == ' ') {
+ pending_spaces++;
+ p++;
+ continue;
+ }
+ else if (*p == '\t') {
+ pending_tabs++;
+ p++;
+ continue;
+ }
+ else if (*p == '\r' || *p == '\n') {
+ /* Flush pending WS before hard newlines */
+ while (pending_spaces--) {
+ g_string_append_c(outbuf, ' ');
+ }
+ pending_spaces = 0;
+ while (pending_tabs--) {
+ g_string_append_c(outbuf, '\t');
+ }
+ pending_tabs = 0;
g_string_append_c(outbuf, *p);
p++;
+ continue;
+ }
+ else if (*p == '(' || *p == ')') {
+ /* Flush pending WS around CFWS delimiters */
+ while (pending_spaces--) {
+ g_string_append_c(outbuf, ' ');
+ }
+ pending_spaces = 0;
+ while (pending_tabs--) {
+ g_string_append_c(outbuf, '\t');
+ }
+ pending_tabs = 0;
+ g_string_append_c(outbuf, *p);
+ p++;
+ continue;
}
else {
+ /* Decide whether we start an encoded span right away */
+ unsigned char first_c = (unsigned char) *p;
+ gboolean starts_encoding = (first_c >= 128) || (is_structured && !g_ascii_isalnum(first_c));
+
const char *piece_end = end;
size_t piece_len = piece_end - p;
gboolean need_encoding = FALSE;
size_t unencoded_prefix = 0;
size_t encoded_len_count = 0;
size_t enc_span = 0;
+ gboolean include_pending_ws = starts_encoding && (pending_spaces > 0 || pending_tabs > 0);
+ size_t pending_ws_budget = include_pending_ws ? (pending_spaces + pending_tabs * 3) : 0;
/* Determine how much of this piece needs encoding and fits the budget */
for (size_t i = 0; i < piece_len; i++) {
/* Start encoded region with this char */
size_t add = (g_ascii_isalnum(c) || c == ' ') ? 1 : 3;
- if (add > max_token_size) {
+ if (add + pending_ws_budget > max_token_size) {
/* Nothing fits, stop here to emit prefix only */
piece_len = i;
piece_end = p + piece_len;
break;
}
- encoded_len_count = add;
+ encoded_len_count = pending_ws_budget + add;
enc_span = 1;
}
else {
}
}
else {
- /* Inside encoded part, stop on parentheses to keep them outside */
+ /* Also stop on parentheses to keep CFWS outside */
if (c == '(' || c == ')') {
piece_len = i;
piece_end = p + piece_len;
break;
}
+ /* Break on whitespace to keep spaces outside encoded-words */
+ if (c == ' ' || c == '\t') {
+ piece_len = i;
+ piece_end = p + piece_len;
+ break;
+ }
+
+ /* For non-structured, include ASCII punctuation only if bridging to non-ASCII ahead */
+ if (!is_structured && c < 128 && !g_ascii_isalnum(c)) {
+ gboolean bridge_to_non_ascii = FALSE;
+ for (size_t j = i + 1; j < piece_len; j++) {
+ unsigned char nc = p[j];
+ if (nc >= 128) {
+ bridge_to_non_ascii = TRUE;
+ break;
+ }
+ if (g_ascii_isspace(nc) || nc == '(' || nc == ')') {
+ break;
+ }
+ }
+
+ if (!bridge_to_non_ascii) {
+ piece_len = i;
+ piece_end = p + piece_len;
+ break;
+ }
+ }
+
size_t add = (g_ascii_isalnum(c) || c == ' ') ? 1 : 3;
if (encoded_len_count + add > max_token_size) {
}
if (need_encoding && enc_span > 0) {
- /* Emit prefix */
+ /* Emit prefix; if we are not starting encoding, flush pending WS literally */
+ if (!include_pending_ws && (pending_spaces > 0 || pending_tabs > 0)) {
+ while (pending_spaces--) {
+ g_string_append_c(outbuf, ' ');
+ }
+ pending_spaces = 0;
+ while (pending_tabs--) {
+ g_string_append_c(outbuf, '\t');
+ }
+ pending_tabs = 0;
+ }
g_string_append_len(outbuf, p, unencoded_prefix);
p += unencoded_prefix;
/* Encode encoded span safely within budget */
g_string_append(outbuf, "=?UTF-8?Q?");
+ /* Prepend pending whitespace inside encoded-word if any */
+ if (include_pending_ws) {
+ for (size_t i = 0; i < pending_spaces; i++) {
+ g_string_append_c(outbuf, '_');
+ }
+ for (size_t i = 0; i < pending_tabs; i++) {
+ g_string_append_len(outbuf, "=09", 3);
+ }
+ pending_spaces = 0;
+ pending_tabs = 0;
+ }
+
+ size_t out_budget = max_token_size - (include_pending_ws ? pending_ws_budget : 0);
gssize enc_written = rspamd_encode_qp2047_buf(p, enc_span,
- encode_buf, max_token_size);
+ encode_buf, out_budget);
if (G_UNLIKELY(enc_written < 0)) {
/* Extremely conservative fallback: shrink until it fits */
}
else {
/* No encoding needed or nothing to encode */
+ /* Flush pending whitespace literally before ASCII chunk */
+ if (pending_spaces > 0 || pending_tabs > 0) {
+ while (pending_spaces--) {
+ g_string_append_c(outbuf, ' ');
+ }
+ pending_spaces = 0;
+ while (pending_tabs--) {
+ g_string_append_c(outbuf, '\t');
+ }
+ pending_tabs = 0;
+ }
g_string_append_len(outbuf, p, piece_len);
p += piece_len;
}
TEST_SUITE("rfc2047 encode")
{
- TEST_CASE("rspamd_mime_header_encode handles ASCII-only input")
- {
- rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0);
- std::vector<std::pair<std::string, std::string>> cases = {
- {"PDF_LONG_TRAILER (0.20)[Док.за 10102024.pdf:416662]",
- "PDF_LONG_TRAILER (0.20)[=?UTF-8?Q?=D0=94=D0=BE=D0=BA=2E=D0=B7=D0=B0?= 10102024.pdf:416662]"},
- {"Hello World", "Hello World"},
- {"Hello Мир", "Hello =?UTF-8?Q?=D0=9C=D0=B8=D1=80?="},
- {"ололо (ололо test) test", "=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE?= (=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE?= test) test"},
- {"Привет мир Как дела?", "=?UTF-8?Q?=D0=9F=D1=80=D0=B8=D0=B2=D0=B5=D1=82____=D0=BC=D0=B8=D1=80_=D0?="
- "=?UTF-8?Q?=9A=D0=B0=D0=BA_=D0=B4=D0=B5=D0=BB=D0=B0?=?"},
- {"", ""},
- {"こんにちは(世界)", "=?UTF-8?Q?=E3=81=93=E3=82=93=E3=81=AB=E3=81=A1=E3=81=AF?="
- "(=?UTF-8?Q?=E4=B8=96=E7=95=8C?=)"},
- {"(Hello)", "(Hello)"},
- {"Hello)", "Hello)"},
- {"你好世界", "=?UTF-8?Q?=E4=BD=A0=E5=A5=BD=E4=B8=96=E7=95=8C?="},
- {"これはとても長いテキストで、エンコードされたワードが76文字を超える必要があります。",
- "=?UTF-8?Q?=E3=81=93=E3=82=8C=E3=81=AF=E3=81=A8=E3=81=A6=E3=82=82=E9=95=B7?="
- "=?UTF-8?Q?=E3=81=84=E3=83=86=E3=82=AD=E3=82=B9=E3=83=88=E3=81=A7=E3=80=81?="
- "=?UTF-8?Q?=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC=E3=83=89=E3=81=95=E3=82=8C?="
- "=?UTF-8?Q?=E3=81=9F=E3=83=AF=E3=83=BC=E3=83=89=E3=81=8C76=E6=96=87=E5=AD?="
- "=?UTF-8?Q?=97=E3=82=92=E8=B6=85=E3=81=88=E3=82=8B=E5=BF=85=E8=A6=81=E3=81?="
- "=?UTF-8?Q?=8C=E3=81=82=E3=82=8A=E3=81=BE=E3=81=99=E3=80=82?="},
- {"ASCII_Text "
- "これは非常に長い非ASCIIテキストで、エンコードが必要になります。",
- "ASCII_Text "
- "=?UTF-8?Q?=E3=81=93=E3=82=8C=E3=81=AF=E9=9D=9E=E5=B8=B8=E3=81?="
- "=?UTF-8?Q?=AB=E9=95=B7=E3=81=84=E9=9D=9EASCII=E3=83=86=E3=82=AD=E3=82=B9?="
- "=?UTF-8?Q?=E3=83=88=E3=81=A7=E3=80=81=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC?="
- "=?UTF-8?Q?=E3=83=89=E3=81=8C=E5=BF=85=E8=A6=81=E3=81=AB=E3=81=AA=E3=82=8A?="
- "=?UTF-8?Q?=E3=81=BE=E3=81=99=E3=80=82?="},
- {"非常に長い非ASCII文字列を使用してエンコードワードの分割をテストします。"
- "データが長すぎる場合、正しく分割されるべきです。",
- "=?UTF-8?Q?=E9=9D=9E=E5=B8=B8=E3=81=AB=E9=95=B7=E3=81=84=E9=9D=9EASCII=E6?="
- "=?UTF-8?Q?=96=87=E5=AD=97=E5=88=97=E3=82=92=E4=BD=BF=E7=94=A8=E3=81=97=E3?="
- "=?UTF-8?Q?=81=A6=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC=E3=83=89=E3=83=AF=E3?="
- "=?UTF-8?Q?=83=BC=E3=83=89=E3=81=AE=E5=88=86=E5=89=B2=E3=82=92=E3=83=86=E3?="
- "=?UTF-8?Q?=82=B9=E3=83=88=E3=81=97=E3=81=BE=E3=81=99=E3=80=82=E3=83=87=E3?="
- "=?UTF-8?Q?=83=BC=E3=82=BF=E3=81=8C=E9=95=B7=E3=81=99=E3=81=8E=E3=82=8B=E5?="
- "=?UTF-8?Q?=A0=B4=E5=90=88=E3=80=81=E6=AD=A3=E3=81=97=E3=81=8F=E5=88=86=E5?="
- "=?UTF-8?Q?=89=B2=E3=81=95=E3=82=8C=E3=82=8B=E3=81=B9=E3=81=8D=E3=81=A7=E3?="
- "=?UTF-8?Q?=81=99=E3=80=82?="},
-
- };
-
- for (const auto &c: cases) {
- SUBCASE(c.first.c_str())
- {
- gboolean invalid_utf = FALSE;
- const char *input = c.first.c_str();
- char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false);
- std::string output(output_cstr);
- std::string expected_output = c.second;
- CHECK(output == expected_output);
- char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf);
- std::string decoded(decoded_cstr);
- CHECK(invalid_utf == FALSE);
- CHECK(decoded == c.first);
- g_free(output_cstr);
- }
+ // Checks the sample string named in the test title: every encoded-word in
+ // the encoder output must be at most 76 characters long, and the output must
+ // decode back to the original input without a UTF-8 error being reported.
+ TEST_CASE("rspamd_mime_header_encode issue sample and invariants")
+ {
+ rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0);
+ std::string input = "¡Con estos precios, el norte es tuyo! 🏜️";
+ char *output_cstr = rspamd_mime_header_encode(input.c_str(), input.size(), false);
+ std::string output(output_cstr);
+ // All encoded-words must be <= 76 chars
+ const std::string ew_prefix = "=?UTF-8?Q?";
+ size_t pos = 0;
+ while (true) {
+ size_t start = output.find(ew_prefix, pos);
+ if (start == std::string::npos) break;
+ // Look for the terminator only after the prefix: the prefix ends in '?',
+ // so a payload starting with "=XX" would otherwise match "?=" immediately
+ // and the length check would always measure an 11-character word.
+ size_t end = output.find("?=", start + ew_prefix.size());
+ REQUIRE(end != std::string::npos);
+ CHECK(end + 2 - start <= 76);
+ pos = end + 2;
}
+ // Round-trip: decoding must reproduce the input exactly.
+ gboolean invalid_utf = FALSE;
+ char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf);
+ std::string decoded(decoded_cstr);
+ CHECK(invalid_utf == FALSE);
+ CHECK(decoded == input);
+ g_free(output_cstr);
+ rspamd_mempool_delete(pool);
+ }
+ // Feeds the encoder a byte pair that is not valid UTF-8 (0xC3 followed by
+ // '(') and checks that every encoded-word stays within 76 characters and that
+ // the decoded text contains U+FFFD plus the literal '('.
+ TEST_CASE("rspamd_mime_header_encode handles invalid UTF-8 bytes safely")
+ {
+ rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0);
+ std::string input = std::string("Invalid: ") + std::string("\xC3\x28", 2) + " end";// invalid UTF-8 sequence C3 28
+ char *output_cstr = rspamd_mime_header_encode(input.c_str(), input.size(), false);
+ std::string output(output_cstr);
+ // Encoded-words length constraint
+ const std::string ew_prefix = "=?UTF-8?Q?";
+ size_t pos = 0;
+ while (true) {
+ size_t start = output.find(ew_prefix, pos);
+ if (start == std::string::npos) break;
+ // Skip the prefix before searching for "?=": its trailing '?' followed by
+ // an "=XX" payload byte would otherwise be taken for the terminator and
+ // make the length check trivially true.
+ size_t end = output.find("?=", start + ew_prefix.size());
+ REQUIRE(end != std::string::npos);
+ CHECK(end + 2 - start <= 76);
+ pos = end + 2;
+ }
+ gboolean invalid_utf = FALSE;
+ char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf);
+ std::string decoded(decoded_cstr);
+ // Expect a replacement char (U+FFFD) and the literal '(' from the invalid pair
+ CHECK(decoded.find("\xEF\xBF\xBD") != std::string::npos);
+ CHECK(decoded.find("(") != std::string::npos);
+ g_free(output_cstr);
+ rspamd_mempool_delete(pool);
+ }
+
+ // Calls the encoder with the structured flag set on an ASCII string that
+ // contains punctuation: at least one encoded-word must be produced, each one
+ // must be at most 76 characters, and decoding must give back the input.
+ TEST_CASE("structured header encodes ASCII punctuation as Q-words")
+ {
+ rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0);
+ std::string input = "Price, list (v2) - update";
+ char *output_cstr = rspamd_mime_header_encode(input.c_str(), input.size(), true);
+ std::string output(output_cstr);
+ const std::string ew_prefix = "=?UTF-8?Q?";
+ // Should contain at least one encoded-word when structured
+ CHECK(output.find(ew_prefix) != std::string::npos);
+ // Token length invariant
+ size_t pos = 0;
+ while (true) {
+ size_t start = output.find(ew_prefix, pos);
+ if (start == std::string::npos) break;
+ // The terminator search starts after the prefix; starting at `start`
+ // would match the prefix's final '?' plus a leading '=' of the payload
+ // and always report a bogus 11-character word.
+ size_t end = output.find("?=", start + ew_prefix.size());
+ REQUIRE(end != std::string::npos);
+ CHECK(end + 2 - start <= 76);
+ pos = end + 2;
+ }
+ gboolean invalid_utf = FALSE;
+ char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf);
+ std::string decoded(decoded_cstr);
+ CHECK(invalid_utf == FALSE);
+ CHECK(decoded == input);
+ g_free(output_cstr);
+ rspamd_mempool_delete(pool);
+ }
+
+ // Mixes ASCII, CJK, a tab, a dash and sixteen emoji in one header value and
+ // verifies the 76-character bound on every encoded-word plus an exact
+ // decode round-trip.
+ TEST_CASE("mixed ASCII/UTF/punct/spacing/emoji encodes and decodes correctly")
+ {
+ rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0);
+ std::string input = "Hello, 世界! Tabs\ttoo — and emojis: ";
+ // Long emoji sequence
+ for (int i = 0; i < 16; i++) {
+ input += "😀";
+ }
+ char *output_cstr = rspamd_mime_header_encode(input.c_str(), input.size(), false);
+ std::string output(output_cstr);
+ // Token length invariant for every encoded-word
+ const std::string ew_prefix = "=?UTF-8?Q?";
+ size_t pos = 0;
+ while (true) {
+ size_t start = output.find(ew_prefix, pos);
+ if (start == std::string::npos) break;
+ // Begin the terminator search past the prefix, otherwise "?=" is found
+ // inside "Q?=XX" and the measured length is always 11.
+ size_t end = output.find("?=", start + ew_prefix.size());
+ REQUIRE(end != std::string::npos);
+ CHECK(end + 2 - start <= 76);
+ pos = end + 2;
+ }
+ gboolean invalid_utf = FALSE;
+ char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf);
+ std::string decoded(decoded_cstr);
+ CHECK(invalid_utf == FALSE);
+ CHECK(decoded == input);
+ g_free(output_cstr);
+ rspamd_mempool_delete(pool);
+ }
+
+ // Pure ASCII input: the encoded header must equal the input byte for byte,
+ // and decoding that header must return the same text.
+ TEST_CASE("ASCII-only string is unchanged")
+ {
+ const std::string input = "Hello World";
+ const std::string expected = "Hello World";
+ rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0);
+ char *encoded_raw = rspamd_mime_header_encode(input.data(), input.length(), false);
+ const std::string encoded(encoded_raw);
+ CHECK(encoded == expected);
+ gboolean saw_invalid_utf = FALSE;
+ const std::string roundtrip(
+ rspamd_mime_header_decode(pool, encoded_raw, strlen(encoded_raw), &saw_invalid_utf));
+ CHECK(saw_invalid_utf == FALSE);
+ CHECK(roundtrip == input);
+ g_free(encoded_raw);
+ rspamd_mempool_delete(pool);
+ }
+
+ // ASCII word followed by a Cyrillic word: only the Cyrillic word is expected
+ // inside an encoded-word, with the separating space left literal.
+ TEST_CASE("Mixed ASCII with Cyrillic encodes Cyrillic segment only")
+ {
+ const std::string input = "Hello Мир";
+ const std::string expected = "Hello =?UTF-8?Q?=D0=9C=D0=B8=D1=80?=";
+ rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0);
+ char *encoded_raw = rspamd_mime_header_encode(input.data(), input.length(), false);
+ const std::string encoded(encoded_raw);
+ CHECK(encoded == expected);
+ gboolean saw_invalid_utf = FALSE;
+ const std::string roundtrip(
+ rspamd_mime_header_decode(pool, encoded_raw, strlen(encoded_raw), &saw_invalid_utf));
+ CHECK(saw_invalid_utf == FALSE);
+ CHECK(roundtrip == input);
+ g_free(encoded_raw);
+ rspamd_mempool_delete(pool);
+ }
+
+ // Cyrillic words on both sides of an opening parenthesis: each is expected in
+ // its own encoded-word while the parentheses and ASCII words stay literal.
+ TEST_CASE("Cyrillic around parentheses splits encoded-words correctly")
+ {
+ const std::string input = "ололо (ололо test) test";
+ const std::string expected =
+ "=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE?= (=?UTF-8?Q?=D0=BE=D0=BB=D0=BE=D0=BB=D0=BE?= test) test";
+ rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0);
+ char *encoded_raw = rspamd_mime_header_encode(input.data(), input.length(), false);
+ const std::string encoded(encoded_raw);
+ CHECK(encoded == expected);
+ gboolean saw_invalid_utf = FALSE;
+ const std::string roundtrip(
+ rspamd_mime_header_decode(pool, encoded_raw, strlen(encoded_raw), &saw_invalid_utf));
+ CHECK(saw_invalid_utf == FALSE);
+ CHECK(roundtrip == input);
+ g_free(encoded_raw);
+ rspamd_mempool_delete(pool);
+ }
+
+ // Cyrillic text with a run of four spaces between the first two words. The
+ // expected encoding carries that run as "____", and the decoded text must
+ // equal the input, so the input has to contain exactly four spaces there.
+ TEST_CASE("Russian text with multiple spaces is encoded and preserved")
+ {
+ rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0);
+ // Build the space run explicitly so it cannot be collapsed by whitespace
+ // normalisation of the source file.
+ std::string input = std::string("Привет") + std::string(4, ' ') + "мир Как дела?";
+ char *output_cstr = rspamd_mime_header_encode(input.c_str(), input.size(), false);
+ std::string output(output_cstr);
+ CHECK(output == std::string(
+ "=?UTF-8?Q?=D0=9F=D1=80=D0=B8=D0=B2=D0=B5=D1=82____=D0=BC=D0=B8=D1=80_=D0?="
+ "=?UTF-8?Q?=9A=D0=B0=D0=BA_=D0=B4=D0=B5=D0=BB=D0=B0?=?"));
+ gboolean invalid_utf = FALSE;
+ char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf);
+ std::string decoded(decoded_cstr);
+ CHECK(invalid_utf == FALSE);
+ CHECK(decoded == input);
+ g_free(output_cstr);
+ rspamd_mempool_delete(pool);
+ }
+
+ // Zero-length input: the encoder must return an empty string, and decoding
+ // that empty string must give an empty string back.
+ TEST_CASE("Empty input yields empty output")
+ {
+ const std::string input;
+ const std::string expected;
+ rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0);
+ char *encoded_raw = rspamd_mime_header_encode(input.c_str(), input.length(), false);
+ const std::string encoded(encoded_raw);
+ CHECK(encoded == expected);
+ gboolean saw_invalid_utf = FALSE;
+ const std::string roundtrip(
+ rspamd_mime_header_decode(pool, encoded_raw, strlen(encoded_raw), &saw_invalid_utf));
+ CHECK(saw_invalid_utf == FALSE);
+ CHECK(roundtrip == input);
+ g_free(encoded_raw);
+ rspamd_mempool_delete(pool);
+ }
+
+ // Japanese text with a parenthesised part and no spaces: the expected output
+ // has two encoded-words with '(' and ')' emitted literally around the second.
+ TEST_CASE("Japanese with parentheses keeps parentheses outside encoded-words")
+ {
+ const std::string input = "こんにちは(世界)";
+ const std::string expected =
+ "=?UTF-8?Q?=E3=81=93=E3=82=93=E3=81=AB=E3=81=A1=E3=81=AF?=(=?UTF-8?Q?=E4=B8=96=E7=95=8C?=)";
+ rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0);
+ char *encoded_raw = rspamd_mime_header_encode(input.data(), input.length(), false);
+ const std::string encoded(encoded_raw);
+ CHECK(encoded == expected);
+ gboolean saw_invalid_utf = FALSE;
+ const std::string roundtrip(
+ rspamd_mime_header_decode(pool, encoded_raw, strlen(encoded_raw), &saw_invalid_utf));
+ CHECK(saw_invalid_utf == FALSE);
+ CHECK(roundtrip == input);
+ g_free(encoded_raw);
+ rspamd_mempool_delete(pool);
+ }
+
+ // An ASCII word wrapped in parentheses must pass through the encoder
+ // unmodified and survive decoding unchanged.
+ TEST_CASE("Parentheses-only input is unchanged")
+ {
+ const std::string input = "(Hello)";
+ const std::string expected = "(Hello)";
+ rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0);
+ char *encoded_raw = rspamd_mime_header_encode(input.data(), input.length(), false);
+ const std::string encoded(encoded_raw);
+ CHECK(encoded == expected);
+ gboolean saw_invalid_utf = FALSE;
+ const std::string roundtrip(
+ rspamd_mime_header_decode(pool, encoded_raw, strlen(encoded_raw), &saw_invalid_utf));
+ CHECK(saw_invalid_utf == FALSE);
+ CHECK(roundtrip == input);
+ g_free(encoded_raw);
+ rspamd_mempool_delete(pool);
+ }
+
+ // An unbalanced closing parenthesis after an ASCII word must not trigger any
+ // encoding; output and decoded text both equal the input.
+ TEST_CASE("ASCII with trailing parenthesis is unchanged")
+ {
+ const std::string input = "Hello)";
+ const std::string expected = "Hello)";
+ rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0);
+ char *encoded_raw = rspamd_mime_header_encode(input.data(), input.length(), false);
+ const std::string encoded(encoded_raw);
+ CHECK(encoded == expected);
+ gboolean saw_invalid_utf = FALSE;
+ const std::string roundtrip(
+ rspamd_mime_header_decode(pool, encoded_raw, strlen(encoded_raw), &saw_invalid_utf));
+ CHECK(saw_invalid_utf == FALSE);
+ CHECK(roundtrip == input);
+ g_free(encoded_raw);
+ rspamd_mempool_delete(pool);
+ }
+
+ // Four CJK characters are expected to be emitted as one Q-encoded word and
+ // to decode back to the original text.
+ TEST_CASE("Chinese text is Q-encoded in a single encoded-word if fits")
+ {
+ const std::string input = "你好世界";
+ const std::string expected = "=?UTF-8?Q?=E4=BD=A0=E5=A5=BD=E4=B8=96=E7=95=8C?=";
+ rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0);
+ char *encoded_raw = rspamd_mime_header_encode(input.data(), input.length(), false);
+ const std::string encoded(encoded_raw);
+ CHECK(encoded == expected);
+ gboolean saw_invalid_utf = FALSE;
+ const std::string roundtrip(
+ rspamd_mime_header_decode(pool, encoded_raw, strlen(encoded_raw), &saw_invalid_utf));
+ CHECK(saw_invalid_utf == FALSE);
+ CHECK(roundtrip == input);
+ g_free(encoded_raw);
+ rspamd_mempool_delete(pool);
+ }
+
+ // ASCII word followed by a long Japanese sentence: the ASCII prefix and the
+ // space are expected literally, the rest as a sequence of encoded-words.
+ // The output must also decode back to the input.
+ TEST_CASE("ASCII prefix with long UTF-8 suffix encodes suffix only")
+ {
+ rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0);
+ std::string input = "ASCII_Text これは非常に長い非ASCIIテキストで、エンコードが必要になります。";
+ char *output_cstr = rspamd_mime_header_encode(input.c_str(), input.size(), false);
+ std::string output(output_cstr);
+ // Q-encoding escapes bytes as "=XX"; '%' never appears in encoder output,
+ // so every escape in the expectation uses '='.
+ std::string expected =
+ "ASCII_Text "
+ "=?UTF-8?Q?=E3=81=93=E3=82=8C=E3=81=AF=E9=9D=9E=E5=B8=B8=E3=81?="
+ "=?UTF-8?Q?=AB=E9=95=B7=E3=81=84=E9=9D=9EASCII=E3=83=86=E3=82=AD=E3=82=B9?="
+ "=?UTF-8?Q?=E3=83=88=E3=81=A7=E3=80=81=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC?="
+ "=?UTF-8?Q?=E3=83=89=E3=81=8C=E5=BF=85=E8=A6=81=E3=81=AB=E3=81=AA=E3=82=8A?="
+ "=?UTF-8?Q?=E3=81=BE=E3=81=99=E3=80=82?=";
+ CHECK(output == expected);
+ gboolean invalid_utf = FALSE;
+ char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf);
+ std::string decoded(decoded_cstr);
+ CHECK(invalid_utf == FALSE);
+ CHECK(decoded == input);
+ g_free(output_cstr);
+ rspamd_mempool_delete(pool);
+ }
+
+ // A long all-Japanese string (with an embedded "ASCII" run) is expected to be
+ // split into several encoded-words, each no longer than 76 characters, and
+ // to decode back to the input.
+ TEST_CASE("Very long non-ASCII string splits across multiple encoded-words")
+ {
+ rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0);
+ std::string input =
+ "非常に長い非ASCII文字列を使用してエンコードワードの分割をテストします。データが長すぎる場合、正しく分割されるべきです。";
+ char *output_cstr = rspamd_mime_header_encode(input.c_str(), input.size(), false);
+ std::string output(output_cstr);
+ // Escapes are "=XX" (never '%'). The tail is two words: a single merged
+ // word would be 81 characters long and would omit the bytes E3 81 95 that
+ // the input contains.
+ std::string expected =
+ "=?UTF-8?Q?=E9=9D=9E=E5=B8=B8=E3=81=AB=E9=95=B7=E3=81=84=E9=9D=9EASCII=E6?="
+ "=?UTF-8?Q?=96=87=E5=AD=97=E5=88=97=E3=82=92=E4=BD=BF=E7=94=A8=E3=81=97=E3?="
+ "=?UTF-8?Q?=81=A6=E3=82=A8=E3=83=B3=E3=82=B3=E3=83=BC=E3=83=89=E3=83=AF=E3?="
+ "=?UTF-8?Q?=83=BC=E3=83=89=E3=81=AE=E5=88=86=E5=89=B2=E3=82=92=E3=83=86=E3?="
+ "=?UTF-8?Q?=82=B9=E3=83=88=E3=81=97=E3=81=BE=E3=81=99=E3=80=82=E3=83=87=E3?="
+ "=?UTF-8?Q?=83=BC=E3=82=BF=E3=81=8C=E9=95=B7=E3=81=99=E3=81=8E=E3=82=8B=E5?="
+ "=?UTF-8?Q?=A0=B4=E5=90=88=E3=80=81=E6=AD=A3=E3=81=97=E3=81=8F=E5=88=86=E5?="
+ "=?UTF-8?Q?=89=B2=E3=81=95=E3=82=8C=E3=82=8B=E3=81=B9=E3=81=8D=E3=81=A7=E3?="
+ "=?UTF-8?Q?=81=99=E3=80=82?=";
+ CHECK(output == expected);
+ gboolean invalid_utf = FALSE;
+ char *decoded_cstr = rspamd_mime_header_decode(pool, output_cstr, strlen(output_cstr), &invalid_utf);
+ std::string decoded(decoded_cstr);
+ CHECK(invalid_utf == FALSE);
+ CHECK(decoded == input);
+ g_free(output_cstr);
+ rspamd_mempool_delete(pool);
+ }
+
+ // ASCII text with a Cyrillic fragment inside square brackets: only the
+ // Cyrillic fragment (including its '.') is expected inside an encoded-word.
+ TEST_CASE("Mixed ASCII with Cyrillic inside brackets encodes inner Cyrillic only")
+ {
+ const std::string input = "PDF_LONG_TRAILER (0.20)[Док.за 10102024.pdf:416662]";
+ const std::string expected =
+ "PDF_LONG_TRAILER (0.20)[=?UTF-8?Q?=D0=94=D0=BE=D0=BA=2E=D0=B7=D0=B0?= 10102024.pdf:416662]";
+ rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "rfc2047", 0);
+ char *encoded_raw = rspamd_mime_header_encode(input.data(), input.length(), false);
+ const std::string encoded(encoded_raw);
+ CHECK(encoded == expected);
+ gboolean saw_invalid_utf = FALSE;
+ const std::string roundtrip(
+ rspamd_mime_header_decode(pool, encoded_raw, strlen(encoded_raw), &saw_invalid_utf));
+ CHECK(saw_invalid_utf == FALSE);
+ CHECK(roundtrip == input);
+ g_free(encoded_raw);
rspamd_mempool_delete(pool);
}
char *output_cstr = rspamd_mime_header_encode(input, strlen(input), false);
std::string output(output_cstr);
std::string expected_output = input_str;
-
CHECK(output == expected_output);
g_free(output_cstr);
}