From: Vsevolod Stakhov
Date: Mon, 16 Feb 2026 15:00:03 +0000 (+0000)
Subject: [Feature] Add rspamd_text:normalize_newlines() for CRLF/LF conversion
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=566790047fc0462dba9d8e1c0c5d38d9702b6597;p=thirdparty%2Frspamd.git

[Feature] Add rspamd_text:normalize_newlines() for CRLF/LF conversion

Add fast C API and Lua binding to normalize line endings in rspamd_text:
- Two-pass algorithm using rspamd_memcspn for efficient scanning
- LF to CRLF conversion (for SMTP compliance, fixes issue #5888)
- CRLF to LF conversion (for Unix compatibility)
- Supports mempool for memory allocation
- Proper ownership handling: frees old memory if owned, sets OWN flag if using g_malloc

Update lua_smtp.lua to normalize messages to CRLF before SMTP DATA,
ensuring compatibility with strict servers like Exchange.

Includes comprehensive unit tests covering:
- No-op cases (already normalized, no newlines)
- Basic conversions in both directions
- Mixed/inconsistent line endings
- Weird line endings (lone CR, multiple CR)
- Edge cases (empty, single char, large text, null bytes)
- Mode parameter variations (crlf/windows/lf/unix, case insensitive)
---

diff --git a/lualib/lua_smtp.lua b/lualib/lua_smtp.lua
index 3c403497c6..f7a5e3dc55 100644
--- a/lualib/lua_smtp.lua
+++ b/lualib/lua_smtp.lua
@@ -15,6 +15,7 @@ limitations under the License.
 ]]--
 
 local rspamd_tcp = require "rspamd_tcp"
+local rspamd_text = require "rspamd_text"
 local lua_util = require "lua_util"
 
 local exports = {}
@@ -107,9 +108,34 @@ local function sendmail(opts, message, callback)
   -- DATA stage
   local function data_done_cb(merr, mdata)
     if no_error_read(merr, mdata, '3') then
+      -- Normalize line endings to CRLF for SMTP compliance.
+      -- A string is normalized via a temporary rspamd_text and returned as a
+      -- new string; an rspamd_text is normalized in place (the caller's
+      -- object is modified); any other value is returned unchanged.
+      local function normalize_to_crlf(msg)
+        if type(msg) == 'userdata' then
+          -- rspamd_text object
+          return msg:normalize_newlines("crlf")
+        elseif type(msg) == 'string' then
+          -- Convert string to text, normalize, back to string
+          local txt = rspamd_text.fromstring(msg)
+          txt:normalize_newlines("crlf")
+          return txt:str()
+        end
+        return msg
+      end
+
       if type(message) == 'string' or type(message) == 'userdata' then
+        message = normalize_to_crlf(message)
         conn:add_write(pre_quit_cb, { message, CRLF .. '.' .. CRLF })
       else
+        -- Array of chunks.
+        -- NOTE(review): each chunk is normalized on its own, so a CRLF pair
+        -- split across two chunks ("...\r" followed by "\n...") ends up as
+        -- "\r\r\n"; callers should not split chunks inside a CRLF.
+        for i = 1, #message do
+          message[i] = normalize_to_crlf(message[i])
+        end
         table.insert(message, CRLF .. '.' .. CRLF)
         conn:add_write(pre_quit_cb, message)
       end
diff --git a/src/lua/lua_common.h b/src/lua/lua_common.h
index 6f41b32f71..6113890b3c 100644
--- a/src/lua/lua_common.h
+++ b/src/lua/lua_common.h
@@ -112,6 +112,14 @@ struct rspamd_lua_ip {
 #define RSPAMD_TEXT_FLAG_SYSMALLOC (1u << 3u)
 #define RSPAMD_TEXT_FLAG_FAKE (1u << 4u)
 #define RSPAMD_TEXT_FLAG_BINARY (1u << 5u)
+
+/**
+ * Line ending normalization modes for rspamd_lua_text
+ */
+enum rspamd_text_newline_mode {
+	RSPAMD_TEXT_NEWLINES_LF = 0,   /* Normalize to LF only */
+	RSPAMD_TEXT_NEWLINES_CRLF = 1, /* Normalize to CRLF */
+};
 struct rspamd_lua_text {
 	const char *start;
 	unsigned int len;
@@ -299,6 +307,26 @@ struct rspamd_lua_text *lua_new_text_task(lua_State *L, struct rspamd_task *task
  */
 bool lua_is_text_binary(struct rspamd_lua_text *t);
 
+/**
+ * Normalize line endings in a text.
+ *
+ * The text structure is always updated in place and the same pointer is
+ * returned. When at least one line ending has to be converted, a new buffer
+ * is allocated (from `pool` if given, otherwise with g_malloc), the previous
+ * buffer is released if the text owned it, and `t->start`/`t->len` are
+ * switched to the new buffer. When nothing has to be converted the text is
+ * left untouched.
+ *
+ * @param t text to normalize (modified in place)
+ * @param pool optional mempool for allocation (NULL = g_malloc, text becomes owner)
+ * @param mode target newline mode (LF or CRLF)
+ * @return `t` itself (NULL if `t` is NULL)
+ */
+struct rspamd_lua_text *rspamd_lua_text_normalize_newlines(
+	struct rspamd_lua_text *t,
+	rspamd_mempool_t *pool,
+	enum rspamd_text_newline_mode mode);
+
 struct rspamd_lua_regexp *lua_check_regexp(lua_State *L, int pos);
 
 struct rspamd_lua_upstream *lua_check_upstream(lua_State *L, int pos);
diff --git a/src/lua/lua_text.c b/src/lua/lua_text.c
index 4a62bdfe98..6a390d7796 100644
--- a/src/lua/lua_text.c
+++ b/src/lua/lua_text.c
@@ -202,6 +202,22 @@ LUA_FUNCTION_DEF(text, memcspn);
  * @return {rspamd_text} modified or copied text
  */
 LUA_FUNCTION_DEF(text, oneline);
+/***
+ * @method rspamd_text:normalize_newlines([mode, [pool]])
+ * Normalizes line endings in text to the specified format.
+ * - If mode is "lf" or "unix": converts CRLF to LF
+ * - If mode is "crlf" or "windows" (default): converts bare LF to CRLF
+ * Lone CR characters are left untouched in both modes.
+ *
+ * The text object itself is updated and returned (no new object is created).
+ * If a mempool is provided, the new data is allocated from it.
+ * Otherwise, g_malloc is used and OWN flag is set.
+ *
+ * @param {string} mode target newline mode: "lf" or "crlf" (default: "crlf"), case insensitive
+ * @param {mempool} pool optional mempool for allocation
+ * @return {rspamd_text} the same text object, normalized
+ */
+LUA_FUNCTION_DEF(text, normalize_newlines);
 /***
  * @method rspamd_text:base32([b32type])
  * Returns a text encoded in base32 (new rspamd_text is allocated)
@@ -276,6 +292,7 @@ static const struct luaL_reg textlib_m[] = {
 	LUA_INTERFACE_DEF(text, exclude_chars),
 	LUA_INTERFACE_DEF(text, memcspn),
 	LUA_INTERFACE_DEF(text, oneline),
+	LUA_INTERFACE_DEF(text, normalize_newlines),
 	LUA_INTERFACE_DEF(text, base32),
 	LUA_INTERFACE_DEF(text, base64),
 	LUA_INTERFACE_DEF(text, hex),
@@ -420,6 +437,135 @@ bool lua_is_text_binary(struct rspamd_lua_text *t)
 	return false;
 }
 
+/*
+ * Installs `new_start`/`new_len` as the content of `t`.
+ *
+ * The previous buffer is released only if the text owns it, using the
+ * release call that matches its storage flags (munmap, free or g_free);
+ * it is wiped first when the WIPE flag is set. Storage related flags are
+ * then recomputed for the new buffer, all other flags (e.g. BINARY) are kept.
+ */
+static void
+rspamd_lua_text_replace_data(struct rspamd_lua_text *t,
+							 rspamd_mempool_t *pool,
+							 char *new_start,
+							 size_t new_len)
+{
+	if (t->flags & RSPAMD_TEXT_FLAG_OWN) {
+		if (t->flags & RSPAMD_TEXT_FLAG_WIPE) {
+			rspamd_explicit_memzero((unsigned char *) t->start, t->len);
+		}
+
+		if (t->flags & RSPAMD_TEXT_FLAG_MMAPED) {
+			munmap((void *) t->start, t->len);
+		}
+		else if (t->flags & RSPAMD_TEXT_FLAG_SYSMALLOC) {
+			free((void *) t->start);
+		}
+		else {
+			g_free((void *) t->start);
+		}
+	}
+
+	t->start = new_start;
+	t->len = new_len;
+	/* The new buffer is neither mmaped nor obtained from the system malloc */
+	t->flags &= ~(RSPAMD_TEXT_FLAG_OWN | RSPAMD_TEXT_FLAG_MMAPED |
+				  RSPAMD_TEXT_FLAG_SYSMALLOC);
+
+	if (pool == NULL) {
+		/* g_malloc'ed data has to be released together with the text */
+		t->flags |= RSPAMD_TEXT_FLAG_OWN;
+	}
+}
+
+struct rspamd_lua_text *
+rspamd_lua_text_normalize_newlines(struct rspamd_lua_text *t,
+								   rspamd_mempool_t *pool,
+								   enum rspamd_text_newline_mode mode)
+{
+	if (t == NULL || t->len == 0) {
+		return t;
+	}
+
+	const gboolean to_crlf = (mode == RSPAMD_TEXT_NEWLINES_CRLF);
+	/* CRLF mode searches for LF (to prepend CR), LF mode for CR (to drop it) */
+	const char *needle = to_crlf ? "\n" : "\r";
+	const char *p = t->start, *end = t->start + t->len;
+	size_t count = 0;
+
+	/* First pass: count CRs to insert (CRLF mode) or to remove (LF mode) */
+	while (p < end) {
+		p += rspamd_memcspn(p, end - p, needle, 1);
+
+		if (p < end) {
+			if (to_crlf) {
+				/* LF is bare if it is the first byte or is not preceded by CR */
+				if (p == t->start || *(p - 1) != '\r') {
+					count++;
+				}
+			}
+			else if (p + 1 < end && *(p + 1) == '\n') {
+				/* CR immediately followed by LF */
+				count++;
+			}
+
+			p++;
+		}
+	}
+
+	if (count == 0) {
+		return t; /* Already normalized */
+	}
+
+	if (to_crlf && count > (size_t) (G_MAXUINT - t->len)) {
+		/* The result would not fit into t->len: leave the text untouched */
+		return t;
+	}
+
+	size_t new_len = to_crlf ? t->len + count : t->len - count;
+	char *new_start;
+
+	if (pool) {
+		new_start = rspamd_mempool_alloc(pool, new_len);
+	}
+	else {
+		new_start = g_malloc(new_len);
+	}
+
+	/* Second pass: copy the data, fixing line endings on the way */
+	char *out = new_start;
+	p = t->start;
+
+	while (p < end) {
+		size_t span = rspamd_memcspn(p, end - p, needle, 1);
+		memcpy(out, p, span);
+		out += span;
+		p += span;
+
+		if (p < end) {
+			if (to_crlf) {
+				if (p == t->start || *(p - 1) != '\r') {
+					*out++ = '\r'; /* Insert CR before a bare LF */
+				}
+
+				*out++ = *p++; /* Copy LF */
+			}
+			else if (p + 1 < end && *(p + 1) == '\n') {
+				/* Skip the CR, LF is copied by the next iteration */
+				p++;
+			}
+			else {
+				*out++ = *p++; /* Lone CR is preserved */
+			}
+		}
+	}
+
+	rspamd_lua_text_replace_data(t, pool, new_start, new_len);
+
+	return t;
+}
+
 
 static int
 lua_text_fromstring(lua_State *L)
@@ -1750,6 +1896,49 @@ lua_text_oneline(lua_State *L)
 	return 1;
 }
 
+static int
+lua_text_normalize_newlines(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_text *t = lua_check_text(L, 1);
+	rspamd_mempool_t *pool = NULL;
+	enum rspamd_text_newline_mode mode = RSPAMD_TEXT_NEWLINES_CRLF;
+
+	if (t == NULL) {
+		return luaL_error(L, "invalid arguments");
+	}
+
+	/* Check for mode argument */
+	if (lua_type(L, 2) == LUA_TSTRING) {
+		const char *mode_str = lua_tostring(L, 2);
+
+		if (g_ascii_strcasecmp(mode_str, "lf") == 0 ||
+			g_ascii_strcasecmp(mode_str, "unix") == 0) {
+			mode = RSPAMD_TEXT_NEWLINES_LF;
+		}
+		else if (g_ascii_strcasecmp(mode_str, "crlf") == 0 ||
+				 g_ascii_strcasecmp(mode_str, "windows") == 0) {
+			mode = RSPAMD_TEXT_NEWLINES_CRLF;
+		}
+		else {
+			return luaL_error(L, "invalid mode: %s (expected 'lf' or 'crlf')", mode_str);
+		}
+	}
+
+	/* Check for pool argument */
+	if (lua_type(L, 3) == LUA_TUSERDATA) {
+		pool = rspamd_lua_check_mempool(L, 3);
+	}
+
+	/* Normalize the text in place (the structure itself is updated) */
+	rspamd_lua_text_normalize_newlines(t, pool, mode);
+
+	/* Return the (possibly modified) text */
+	lua_pushvalue(L, 1);
+
+	return 1;
+}
+
 static int
 lua_text_lower(lua_State *L)
 {
diff --git a/test/lua/unit/rspamd_text.lua b/test/lua/unit/rspamd_text.lua
index d643d9e634..f8106dd348 100644
--- a/test/lua/unit/rspamd_text.lua
+++ b/test/lua/unit/rspamd_text.lua
@@ -77,3 +77,223 @@ context("Rspamd_text:find() test", function()
     end)
   end
 end)
+
+context("Rspamd_text:normalize_newlines() test", function()
+  local rspamd_text = require "rspamd_text"
+
+  -- No normalization needed
+  test("already CRLF - no change", function()
+    local t = rspamd_text.fromstring("line1\r\nline2\r\nline3\r\n")
+    local orig_ptr = t:ptr()
+    t:normalize_newlines("crlf")
+    assert_equal(t:ptr(), orig_ptr, "should return same text when no change needed")
+    assert_equal(t:str(), "line1\r\nline2\r\nline3\r\n")
+  end)
+
+  test("already LF only - no change for LF mode", function()
+    local t = rspamd_text.fromstring("line1\nline2\nline3\n")
+    local orig_ptr = t:ptr()
+    t:normalize_newlines("lf")
+    assert_equal(t:ptr(), orig_ptr, "should return same text when no change needed")
+    assert_equal(t:str(), "line1\nline2\nline3\n")
+  end)
+
+  test("no newlines at all - no change", function()
+    local t = rspamd_text.fromstring("just some text without newlines")
+    local orig_ptr = t:ptr()
+    t:normalize_newlines("crlf")
+    assert_equal(t:ptr(), orig_ptr, "should return same text when no newlines")
+    assert_equal(t:str(), "just some text without newlines")
+  end)
+
+  -- LF to CRLF conversion
+  test("LF to CRLF: simple", function()
+    local t = rspamd_text.fromstring("line1\nline2\n")
+    t:normalize_newlines("crlf")
+    assert_equal(t:str(), "line1\r\nline2\r\n")
+  end)
+
+  test("LF to CRLF: bare LF after CRLF stays CRLF", function()
+    local t = rspamd_text.fromstring("line1\r\nline2\n")
+    t:normalize_newlines("crlf")
+    assert_equal(t:str(), "line1\r\nline2\r\n")
+  end)
+
+  test("LF to CRLF: multiple bare LFs", function()
+    local t = rspamd_text.fromstring("a\nb\nc\n")
+    t:normalize_newlines("crlf")
+    assert_equal(t:str(), "a\r\nb\r\nc\r\n")
+  end)
+
+  test("LF to CRLF: text starting with LF", function()
+    local t = rspamd_text.fromstring("\nfirst line\n")
+    t:normalize_newlines("crlf")
+    assert_equal(t:str(), "\r\nfirst line\r\n")
+  end)
+
+  test("LF to CRLF: text ending with LF", function()
+    local t = rspamd_text.fromstring("last line\n")
+    t:normalize_newlines("crlf")
+    assert_equal(t:str(), "last line\r\n")
+  end)
+
+  test("LF to CRLF: consecutive LFs", function()
+    local t = rspamd_text.fromstring("line1\n\nline3")
+    t:normalize_newlines("crlf")
+    assert_equal(t:str(), "line1\r\n\r\nline3")
+  end)
+
+  -- CRLF to LF conversion
+  test("CRLF to LF: simple", function()
+    local t = rspamd_text.fromstring("line1\r\nline2\r\n")
+    t:normalize_newlines("lf")
+    assert_equal(t:str(), "line1\nline2\n")
+  end)
+
+  test("CRLF to LF: mixed CRLF and LF", function()
+    local t = rspamd_text.fromstring("line1\r\nline2\nline3\r\n")
+    t:normalize_newlines("lf")
+    assert_equal(t:str(), "line1\nline2\nline3\n")
+  end)
+
+  test("CRLF to LF: multiple consecutive CRLF", function()
+    local t = rspamd_text.fromstring("a\r\n\r\nb")
+    t:normalize_newlines("lf")
+    assert_equal(t:str(), "a\n\nb")
+  end)
+
+  -- Weird line endings
+  test("CR only (not followed by LF) preserved in CRLF mode", function()
+    local t = rspamd_text.fromstring("line1\rline2\n")
+    t:normalize_newlines("crlf")
+    assert_equal(t:str(), "line1\rline2\r\n")
+  end)
+
+  test("CR only preserved in LF mode", function()
+    local t = rspamd_text.fromstring("line1\rline2\r\n")
+    t:normalize_newlines("lf")
+    assert_equal(t:str(), "line1\rline2\n")
+  end)
+
+  test("multiple CRs before LF", function()
+    local t = rspamd_text.fromstring("line\r\r\nline2")
+    t:normalize_newlines("lf")
+    assert_equal(t:str(), "line\r\nline2")
+  end)
+
+  -- Inconsistent line endings
+  test("mixed CRLF and LF to CRLF", function()
+    local t = rspamd_text.fromstring("line1\r\nline2\nline3\r\nline4\n")
+    t:normalize_newlines("crlf")
+    assert_equal(t:str(), "line1\r\nline2\r\nline3\r\nline4\r\n")
+  end)
+
+  test("mixed CRLF and LF to LF", function()
+    local t = rspamd_text.fromstring("line1\r\nline2\nline3\r\nline4\n")
+    t:normalize_newlines("lf")
+    assert_equal(t:str(), "line1\nline2\nline3\nline4\n")
+  end)
+
+  -- Only line endings
+  test("single LF to CRLF", function()
+    local t = rspamd_text.fromstring("\n")
+    t:normalize_newlines("crlf")
+    assert_equal(t:str(), "\r\n")
+  end)
+
+  test("single CRLF to LF", function()
+    local t = rspamd_text.fromstring("\r\n")
+    t:normalize_newlines("lf")
+    assert_equal(t:str(), "\n")
+  end)
+
+  test("multiple LF only to CRLF", function()
+    local t = rspamd_text.fromstring("\n\n\n")
+    t:normalize_newlines("crlf")
+    assert_equal(t:str(), "\r\n\r\n\r\n")
+  end)
+
+  test("multiple CRLF only to LF", function()
+    local t = rspamd_text.fromstring("\r\n\r\n\r\n")
+    t:normalize_newlines("lf")
+    assert_equal(t:str(), "\n\n\n")
+  end)
+
+  -- Edge cases
+  test("empty string - no change", function()
+    local t = rspamd_text.fromstring("")
+    local orig_ptr = t:ptr()
+    t:normalize_newlines("crlf")
+    assert_equal(t:ptr(), orig_ptr)
+    assert_equal(t:str(), "")
+  end)
+
+  test("single character without newline", function()
+    local t = rspamd_text.fromstring("x")
+    local orig_ptr = t:ptr()
+    t:normalize_newlines("crlf")
+    assert_equal(t:ptr(), orig_ptr)
+    assert_equal(t:str(), "x")
+  end)
+
+  test("single character with newline", function()
+    local t = rspamd_text.fromstring("x\n")
+    t:normalize_newlines("crlf")
+    assert_equal(t:str(), "x\r\n")
+  end)
+
+  test("large text with many newlines", function()
+    local lines = {}
+    for i = 1, 1000 do
+      lines[i] = "line" .. i
+    end
+    local input = table.concat(lines, "\n")
+    local expected = table.concat(lines, "\r\n")
+    local t = rspamd_text.fromstring(input)
+    t:normalize_newlines("crlf")
+    assert_equal(t:str(), expected)
+  end)
+
+  test("text with null bytes", function()
+    local t = rspamd_text.fromstring("line1\n\x00line2\n")
+    t:normalize_newlines("crlf")
+    assert_equal(t:str(), "line1\r\n\x00line2\r\n")
+  end)
+
+  -- Mode parameter variations
+  test("mode 'crlf' works", function()
+    local t = rspamd_text.fromstring("a\n")
+    t:normalize_newlines("crlf")
+    assert_equal(t:str(), "a\r\n")
+  end)
+
+  test("mode 'windows' works (alias for crlf)", function()
+    local t = rspamd_text.fromstring("a\n")
+    t:normalize_newlines("windows")
+    assert_equal(t:str(), "a\r\n")
+  end)
+
+  test("mode 'lf' works", function()
+    local t = rspamd_text.fromstring("a\r\n")
+    t:normalize_newlines("lf")
+    assert_equal(t:str(), "a\n")
+  end)
+
+  test("mode 'unix' works (alias for lf)", function()
+    local t = rspamd_text.fromstring("a\r\n")
+    t:normalize_newlines("unix")
+    assert_equal(t:str(), "a\n")
+  end)
+
+  test("default mode is crlf", function()
+    local t = rspamd_text.fromstring("a\n")
+    t:normalize_newlines()
+    assert_equal(t:str(), "a\r\n")
+  end)
+
+  test("case insensitive mode", function()
+    local t = rspamd_text.fromstring("a\n")
+    t:normalize_newlines("CRLF")
+    assert_equal(t:str(), "a\r\n")
+  end)
+end)