git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Feature] Add rspamd_text:normalize_newlines() for CRLF/LF conversion
author Vsevolod Stakhov <vsevolod@rspamd.com>
Mon, 16 Feb 2026 15:00:03 +0000 (15:00 +0000)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Mon, 16 Feb 2026 15:00:03 +0000 (15:00 +0000)
Add fast C API and Lua binding to normalize line endings in rspamd_text:
- Two-pass algorithm using rspamd_memcspn for efficient scanning
- LF to CRLF conversion (for SMTP compliance, fixes issue #5888)
- CRLF to LF conversion (for Unix compatibility)
- Supports mempool for memory allocation
- Proper ownership handling: frees old memory if owned, sets OWN flag if using g_malloc

Update lua_smtp.lua to normalize messages to CRLF before SMTP DATA,
ensuring compatibility with strict servers like Exchange.

Includes comprehensive unit tests covering:
- No-op cases (already normalized, no newlines)
- Basic conversions in both directions
- Mixed/inconsistent line endings
- Weird line endings (lone CR, multiple CR)
- Edge cases (empty, single char, large text, null bytes)
- Mode parameter variations (crlf/windows/lf/unix, case insensitive)

lualib/lua_smtp.lua
src/lua/lua_common.h
src/lua/lua_text.c
test/lua/unit/rspamd_text.lua

index 3c403497c6263777433e3281fdb880aa042a9542..f7a5e3dc55f444a435a60807bd9f7b27a56bd84e 100644 (file)
@@ -15,6 +15,7 @@ limitations under the License.
 ]]--
 
 local rspamd_tcp = require "rspamd_tcp"
+local rspamd_text = require "rspamd_text"
 local lua_util = require "lua_util"
 
 local exports = {}
@@ -107,9 +108,28 @@ local function sendmail(opts, message, callback)
     -- DATA stage
     local function data_done_cb(merr, mdata)
       if no_error_read(merr, mdata, '3') then
+        -- SMTP DATA expects CRLF line endings, so bring one payload value into
+        -- that form:
+        --   * userdata (expected to be an rspamd_text) is normalized in place and
+        --     the result of normalize_newlines() is handed back;
+        --   * a Lua string is round-tripped through a temporary rspamd_text;
+        --   * any other value is returned untouched.
+        -- Each value is processed on its own: an LF at the very start of a value
+        -- counts as bare, whatever the previously written chunk ended with.
+        local function normalize_to_crlf(msg)
+          local kind = type(msg)
+
+          if kind == 'string' then
+            local as_text = rspamd_text.fromstring(msg)
+            as_text:normalize_newlines("crlf")
+            return as_text:str()
+          end
+
+          if kind == 'userdata' then
+            -- No further type check is done here; assumes an rspamd_text
+            return msg:normalize_newlines("crlf")
+          end
+
+          return msg
+        end
+
         if type(message) == 'string' or type(message) == 'userdata' then
+          message = normalize_to_crlf(message)
           conn:add_write(pre_quit_cb, { message, CRLF .. '.' .. CRLF })
         else
+          -- Array of chunks
+          for i = 1, #message do
+            message[i] = normalize_to_crlf(message[i])
+          end
           table.insert(message, CRLF .. '.' .. CRLF)
           conn:add_write(pre_quit_cb, message)
         end
index 6f41b32f71081028d41e971f1ea01b16798f6464..6113890b3cc23ae140d16fc0dad1683c38f3846a 100644 (file)
@@ -112,6 +112,14 @@ struct rspamd_lua_ip {
 #define RSPAMD_TEXT_FLAG_SYSMALLOC (1u << 3u)
 #define RSPAMD_TEXT_FLAG_FAKE (1u << 4u)
 #define RSPAMD_TEXT_FLAG_BINARY (1u << 5u)
+
+/**
+ * Line ending normalization modes for rspamd_lua_text.
+ *
+ * Only CRLF pairs and bare LFs are ever rewritten: a CR that is not directly
+ * followed by LF is left as is in both modes.
+ */
+enum rspamd_text_newline_mode {
+       RSPAMD_TEXT_NEWLINES_LF = 0,   /* Drop the CR of every CRLF pair */
+       RSPAMD_TEXT_NEWLINES_CRLF = 1, /* Insert CR before every LF not preceded by CR */
+};
 struct rspamd_lua_text {
        const char *start;
        unsigned int len;
@@ -299,6 +307,22 @@ struct rspamd_lua_text *lua_new_text_task(lua_State *L, struct rspamd_task *task
  */
 bool lua_is_text_binary(struct rspamd_lua_text *t);
 
+/**
+ * Normalize line endings in a text.
+ *
+ * When at least one line ending has to change, a new buffer is always
+ * allocated (from `pool` if given, otherwise with g_malloc), the previous
+ * buffer is released if the text owns it, and `t` itself is updated to
+ * reference the new buffer; the OWN flag ends up set only when the new buffer
+ * comes from g_malloc. When nothing has to change, or `t` is NULL or empty,
+ * `t` is left untouched.
+ *
+ * @param t      text to normalize (the structure is updated in place)
+ * @param pool   optional mempool for allocation (NULL = g_malloc)
+ * @param mode   target newline mode (LF or CRLF)
+ * @return       `t` itself in every case
+ */
+struct rspamd_lua_text *rspamd_lua_text_normalize_newlines(
+       struct rspamd_lua_text *t,
+       rspamd_mempool_t *pool,
+       enum rspamd_text_newline_mode mode);
+
 struct rspamd_lua_regexp *lua_check_regexp(lua_State *L, int pos);
 
 struct rspamd_lua_upstream *lua_check_upstream(lua_State *L, int pos);
index 4a62bdfe9874fe90986e33fa4cd17d755adb04ea..6a390d7796d93c3ce59234570d75bc89c5ecd7fb 100644 (file)
@@ -202,6 +202,21 @@ LUA_FUNCTION_DEF(text, memcspn);
  * @return {rspamd_text} modified or copied text
  */
 LUA_FUNCTION_DEF(text, oneline);
+/***
+ * @method rspamd_text:normalize_newlines([mode, [pool]])
+ * Normalizes line endings in text to the specified format.
+ * - If mode is "lf" or "unix": converts CRLF to LF
+ * - If mode is "crlf" or "windows" (default): converts bare LF to CRLF
+ * Mode names are matched case-insensitively; any other string raises an error.
+ * A CR that is not directly followed by LF is never touched.
+ *
+ * The text object itself is updated: no new rspamd_text is created and the
+ * method returns the object it was called on.
+ * When a change is needed, the new data is allocated from the mempool if one
+ * is provided; otherwise g_malloc is used and OWN flag is set.
+ *
+ * @param {string} mode target newline mode: "lf" or "crlf" (default: "crlf")
+ * @param {mempool} pool optional mempool for allocation
+ * @return {rspamd_text} the same text object, normalized
+ */
+LUA_FUNCTION_DEF(text, normalize_newlines);
 /***
  * @method rspamd_text:base32([b32type])
  * Returns a text encoded in base32 (new rspamd_text is allocated)
@@ -276,6 +291,7 @@ static const struct luaL_reg textlib_m[] = {
        LUA_INTERFACE_DEF(text, exclude_chars),
        LUA_INTERFACE_DEF(text, memcspn),
        LUA_INTERFACE_DEF(text, oneline),
+       LUA_INTERFACE_DEF(text, normalize_newlines),
        LUA_INTERFACE_DEF(text, base32),
        LUA_INTERFACE_DEF(text, base64),
        LUA_INTERFACE_DEF(text, hex),
@@ -420,6 +436,143 @@ bool lua_is_text_binary(struct rspamd_lua_text *t)
        return false;
 }
 
+/*
+ * Returns storage for a normalized copy of `len` bytes: taken from `pool`
+ * when one is given, otherwise obtained with g_malloc.
+ */
+static char *
+rspamd_lua_text_newlines_alloc(rspamd_mempool_t *pool, size_t len)
+{
+       if (pool) {
+               return rspamd_mempool_alloc(pool, len);
+       }
+
+       return g_malloc(len);
+}
+
+/*
+ * Makes `t` reference the freshly built buffer, releasing the previous one
+ * if `t` owns it.
+ */
+static void
+rspamd_lua_text_newlines_commit(struct rspamd_lua_text *t,
+                                char *new_start, size_t new_len,
+                                rspamd_mempool_t *pool)
+{
+       if (t->flags & RSPAMD_TEXT_FLAG_OWN) {
+               /*
+                * RSPAMD_TEXT_FLAG_SYSMALLOC presumably marks buffers obtained from
+                * the system malloc(), which have to be paired with free() rather
+                * than g_free() - confirm against the text destructor.
+                * NOTE(review): if owned texts can also be backed by other kinds of
+                * storage (e.g. a file mapping), they need their own release path
+                * here as well - confirm.
+                */
+               if (t->flags & RSPAMD_TEXT_FLAG_SYSMALLOC) {
+                       free((void *) t->start);
+               }
+               else {
+                       g_free((void *) t->start);
+               }
+       }
+
+       t->start = new_start;
+       t->len = new_len;
+       /*
+        * Storage related flags describe the old buffer and are dropped; OWN is
+        * set only when the new buffer comes from g_malloc. BINARY describes the
+        * content, which is still the same data, so it is carried over.
+        */
+       t->flags = (t->flags & RSPAMD_TEXT_FLAG_BINARY) |
+                          (pool ? 0u : RSPAMD_TEXT_FLAG_OWN);
+}
+
+/*
+ * Normalizes the line endings of `t` using two passes over the data: the
+ * first one counts the line endings that have to change, the second one
+ * copies the data into a new buffer of the exact final size.
+ *
+ * - RSPAMD_TEXT_NEWLINES_CRLF: a CR is inserted before every LF that is not
+ *   already preceded by CR (an LF at the very start counts as bare)
+ * - any other mode (RSPAMD_TEXT_NEWLINES_LF): the CR of every CRLF pair is
+ *   dropped; a CR not directly followed by LF is kept
+ *
+ * `t` is returned in every case. It is left untouched when it is NULL or
+ * empty, when no line ending has to change, or when the result would not fit
+ * into the `unsigned int` length of the text.
+ */
+struct rspamd_lua_text *
+rspamd_lua_text_normalize_newlines(struct rspamd_lua_text *t,
+                                   rspamd_mempool_t *pool,
+                                   enum rspamd_text_newline_mode mode)
+{
+       if (t == NULL || t->len == 0) {
+               return t;
+       }
+
+       const char *p = t->start;
+       const char *end = t->start + t->len;
+       size_t count = 0, new_len;
+       char *new_start, *out;
+
+       if (mode == RSPAMD_TEXT_NEWLINES_CRLF) {
+               /* Pass 1: count bare LFs (not preceded by CR) */
+               while (p < end) {
+                       p += rspamd_memcspn(p, end - p, "\n", 1);
+
+                       if (p < end) {
+                               if (p == t->start || *(p - 1) != '\r') {
+                                       count++;
+                               }
+                               p++;
+                       }
+               }
+
+               if (count == 0) {
+                       return t; /* Already normalized */
+               }
+
+               if (count > (size_t) G_MAXUINT - t->len) {
+                       /* The grown text could not be described by t->len */
+                       return t;
+               }
+
+               /* Pass 2: copy, inserting 'count' CR characters */
+               new_len = t->len + count;
+               new_start = rspamd_lua_text_newlines_alloc(pool, new_len);
+               out = new_start;
+               p = t->start;
+
+               while (p < end) {
+                       size_t span = rspamd_memcspn(p, end - p, "\n", 1);
+
+                       memcpy(out, p, span);
+                       out += span;
+                       p += span;
+
+                       if (p < end) {
+                               if (p == t->start || *(p - 1) != '\r') {
+                                       *out++ = '\r'; /* Insert CR */
+                               }
+                               *out++ = *p++; /* Copy LF */
+                       }
+               }
+       }
+       else {
+               /* Pass 1: count CRs directly followed by LF */
+               while (p < end) {
+                       p += rspamd_memcspn(p, end - p, "\r", 1);
+
+                       if (p < end) {
+                               if (p + 1 < end && *(p + 1) == '\n') {
+                                       count++;
+                               }
+                               p++;
+                       }
+               }
+
+               if (count == 0) {
+                       return t; /* Already normalized */
+               }
+
+               /* Pass 2: copy, dropping 'count' CR characters */
+               new_len = t->len - count;
+               new_start = rspamd_lua_text_newlines_alloc(pool, new_len);
+               out = new_start;
+               p = t->start;
+
+               while (p < end) {
+                       size_t span = rspamd_memcspn(p, end - p, "\r", 1);
+
+                       memcpy(out, p, span);
+                       out += span;
+                       p += span;
+
+                       if (p < end) {
+                               if (p + 1 < end && *(p + 1) == '\n') {
+                                       /* Skip the CR, the LF is copied on the next iteration */
+                                       p++;
+                               }
+                               else {
+                                       *out++ = *p++; /* Copy CR (not followed by LF) */
+                               }
+                       }
+               }
+       }
+
+       rspamd_lua_text_newlines_commit(t, new_start, new_len, pool);
+
+       return t;
+}
+
 
 static int
 lua_text_fromstring(lua_State *L)
@@ -1750,6 +1903,49 @@ lua_text_oneline(lua_State *L)
        return 1;
 }
 
+/*
+ * Lua binding for rspamd_text:normalize_newlines([mode, [pool]]).
+ * Argument 1 is the text, argument 2 an optional mode name (used only when it
+ * is a string), argument 3 an optional mempool (looked at only when it is a
+ * userdata). The text passed as argument 1 is what gets returned to Lua.
+ */
+static int
+lua_text_normalize_newlines(lua_State *L)
+{
+       LUA_TRACE_POINT;
+       /* Accepted mode names (compared case-insensitively) and what they select */
+       static const struct {
+               const char *name;
+               enum rspamd_text_newline_mode mode;
+       } mode_names[] = {
+               {"lf", RSPAMD_TEXT_NEWLINES_LF},
+               {"unix", RSPAMD_TEXT_NEWLINES_LF},
+               {"crlf", RSPAMD_TEXT_NEWLINES_CRLF},
+               {"windows", RSPAMD_TEXT_NEWLINES_CRLF},
+       };
+       struct rspamd_lua_text *t = lua_check_text(L, 1);
+       enum rspamd_text_newline_mode mode = RSPAMD_TEXT_NEWLINES_CRLF;
+       rspamd_mempool_t *pool = NULL;
+
+       if (t == NULL) {
+               return luaL_error(L, "invalid arguments");
+       }
+
+       if (lua_type(L, 2) == LUA_TSTRING) {
+               const char *mode_str = lua_tostring(L, 2);
+               bool known = false;
+
+               for (size_t i = 0; i < sizeof(mode_names) / sizeof(mode_names[0]); i++) {
+                       if (g_ascii_strcasecmp(mode_str, mode_names[i].name) == 0) {
+                               mode = mode_names[i].mode;
+                               known = true;
+                               break;
+                       }
+               }
+
+               if (!known) {
+                       return luaL_error(L, "invalid mode: %s (expected 'lf' or 'crlf')", mode_str);
+               }
+       }
+
+       if (lua_type(L, 3) == LUA_TUSERDATA) {
+               pool = rspamd_lua_check_mempool(L, 3);
+       }
+
+       /* The text is updated through the pointer; the return value is not needed */
+       rspamd_lua_text_normalize_newlines(t, pool, mode);
+
+       /* Hand the very same Lua object back to the caller */
+       lua_pushvalue(L, 1);
+
+       return 1;
+}
+
+
 static int
 lua_text_lower(lua_State *L)
 {
index d643d9e63411c0d4274cb3fc376361f915cd4b32..f8106dd348362dfc84bf03ad20ace2f908ad1091 100644 (file)
@@ -77,3 +77,223 @@ context("Rspamd_text:find() test", function()
     end)
   end
 end)
+
+context("Rspamd_text:normalize_newlines() test", function()
+  local rspamd_text = require "rspamd_text"
+
+  -- Source lines for the "large text" case
+  local many_lines = {}
+  for i = 1, 1000 do
+    many_lines[i] = "line" .. i
+  end
+
+  -- One entry per test:
+  --   name      test title
+  --   input     string the text is created from
+  --   mode      argument for normalize_newlines(); when absent the method is
+  --             called without arguments
+  --   expected  what t:str() must give afterwards
+  --   same_ptr  when set, t:ptr() must not change across the call
+  --   ptr_msg   optional message for that pointer assertion
+  local cases = {
+    -- No normalization needed
+    { name = "already CRLF - no change",
+      input = "line1\r\nline2\r\nline3\r\n", mode = "crlf",
+      expected = "line1\r\nline2\r\nline3\r\n",
+      same_ptr = true, ptr_msg = "should return same text when no change needed" },
+    { name = "already LF only - no change for LF mode",
+      input = "line1\nline2\nline3\n", mode = "lf",
+      expected = "line1\nline2\nline3\n",
+      same_ptr = true, ptr_msg = "should return same text when no change needed" },
+    { name = "no newlines at all - no change",
+      input = "just some text without newlines", mode = "crlf",
+      expected = "just some text without newlines",
+      same_ptr = true, ptr_msg = "should return same text when no newlines" },
+
+    -- LF to CRLF conversion
+    { name = "LF to CRLF: simple",
+      input = "line1\nline2\n", mode = "crlf",
+      expected = "line1\r\nline2\r\n" },
+    { name = "LF to CRLF: bare LF after CRLF stays CRLF",
+      input = "line1\r\nline2\n", mode = "crlf",
+      expected = "line1\r\nline2\r\n" },
+    { name = "LF to CRLF: multiple bare LFs",
+      input = "a\nb\nc\n", mode = "crlf",
+      expected = "a\r\nb\r\nc\r\n" },
+    { name = "LF to CRLF: text starting with LF",
+      input = "\nfirst line\n", mode = "crlf",
+      expected = "\r\nfirst line\r\n" },
+    { name = "LF to CRLF: text ending with LF",
+      input = "last line\n", mode = "crlf",
+      expected = "last line\r\n" },
+    { name = "LF to CRLF: consecutive LFs",
+      input = "line1\n\nline3", mode = "crlf",
+      expected = "line1\r\n\r\nline3" },
+
+    -- CRLF to LF conversion
+    { name = "CRLF to LF: simple",
+      input = "line1\r\nline2\r\n", mode = "lf",
+      expected = "line1\nline2\n" },
+    { name = "CRLF to LF: mixed CRLF and LF",
+      input = "line1\r\nline2\nline3\r\n", mode = "lf",
+      expected = "line1\nline2\nline3\n" },
+    { name = "CRLF to LF: multiple consecutive CRLF",
+      input = "a\r\n\r\nb", mode = "lf",
+      expected = "a\n\nb" },
+
+    -- Weird line endings
+    { name = "CR only (not followed by LF) preserved in CRLF mode",
+      input = "line1\rline2\n", mode = "crlf",
+      expected = "line1\rline2\r\n" },
+    { name = "CR only preserved in LF mode",
+      input = "line1\rline2\r\n", mode = "lf",
+      expected = "line1\rline2\n" },
+    { name = "multiple CRs before LF",
+      input = "line\r\r\nline2", mode = "lf",
+      expected = "line\r\nline2" },
+
+    -- Inconsistent line endings
+    { name = "mixed CRLF and LF to CRLF",
+      input = "line1\r\nline2\nline3\r\nline4\n", mode = "crlf",
+      expected = "line1\r\nline2\r\nline3\r\nline4\r\n" },
+    { name = "mixed CRLF and LF to LF",
+      input = "line1\r\nline2\nline3\r\nline4\n", mode = "lf",
+      expected = "line1\nline2\nline3\nline4\n" },
+
+    -- Only line endings
+    { name = "single LF to CRLF",
+      input = "\n", mode = "crlf",
+      expected = "\r\n" },
+    { name = "single CRLF to LF",
+      input = "\r\n", mode = "lf",
+      expected = "\n" },
+    { name = "multiple LF only to CRLF",
+      input = "\n\n\n", mode = "crlf",
+      expected = "\r\n\r\n\r\n" },
+    { name = "multiple CRLF only to LF",
+      input = "\r\n\r\n\r\n", mode = "lf",
+      expected = "\n\n\n" },
+
+    -- Edge cases
+    { name = "empty string - no change",
+      input = "", mode = "crlf",
+      expected = "",
+      same_ptr = true },
+    { name = "single character without newline",
+      input = "x", mode = "crlf",
+      expected = "x",
+      same_ptr = true },
+    { name = "single character with newline",
+      input = "x\n", mode = "crlf",
+      expected = "x\r\n" },
+    { name = "large text with many newlines",
+      input = table.concat(many_lines, "\n"), mode = "crlf",
+      expected = table.concat(many_lines, "\r\n") },
+    { name = "text with null bytes",
+      input = "line1\n\x00line2\n", mode = "crlf",
+      expected = "line1\r\n\x00line2\r\n" },
+
+    -- Mode parameter variations
+    { name = "mode 'crlf' works",
+      input = "a\n", mode = "crlf",
+      expected = "a\r\n" },
+    { name = "mode 'windows' works (alias for crlf)",
+      input = "a\n", mode = "windows",
+      expected = "a\r\n" },
+    { name = "mode 'lf' works",
+      input = "a\r\n", mode = "lf",
+      expected = "a\n" },
+    { name = "mode 'unix' works (alias for lf)",
+      input = "a\r\n", mode = "unix",
+      expected = "a\n" },
+    { name = "default mode is crlf",
+      input = "a\n",
+      expected = "a\r\n" },
+    { name = "case insensitive mode",
+      input = "a\n", mode = "CRLF",
+      expected = "a\r\n" },
+  }
+
+  for _, case in ipairs(cases) do
+    test(case.name, function()
+      local t = rspamd_text.fromstring(case.input)
+      local orig_ptr
+
+      if case.same_ptr then
+        orig_ptr = t:ptr()
+      end
+
+      if case.mode then
+        t:normalize_newlines(case.mode)
+      else
+        t:normalize_newlines()
+      end
+
+      if case.same_ptr then
+        if case.ptr_msg then
+          assert_equal(t:ptr(), orig_ptr, case.ptr_msg)
+        else
+          assert_equal(t:ptr(), orig_ptr)
+        end
+      end
+
+      assert_equal(t:str(), case.expected)
+    end)
+  end
+end)