* @return {rspamd_text} modified or copied text
*/
LUA_FUNCTION_DEF(text, oneline);
+/***
+ * @method rspamd_text:normalize_newlines([mode, [pool]])
+ * Normalizes line endings in text to the specified format.
+ * - If mode is "lf" or "unix": converts CRLF to LF (a CR that is not followed by LF is kept)
+ * - If mode is "crlf" or "windows" (default): converts bare LF to CRLF
+ *
+ * The mode string is matched case-insensitively; an unrecognised mode raises an error.
+ *
+ * The text object itself is updated, but its data is never rewritten inside the
+ * existing buffer: when at least one line ending has to change, a new buffer is
+ * allocated and the previous one is released if the text owned it.
+ * If a mempool is provided, new memory is allocated from it.
+ * Otherwise, g_malloc is used and OWN flag is set.
+ *
+ * @param {string} mode target newline mode: "lf" or "crlf" (default: "crlf")
+ * @param {mempool} pool optional mempool for allocation
+ * @return {rspamd_text} the same text object (its data is left untouched if no changes were needed)
+ */
+LUA_FUNCTION_DEF(text, normalize_newlines);
/***
* @method rspamd_text:base32([b32type])
* Returns a text encoded in base32 (new rspamd_text is allocated)
LUA_INTERFACE_DEF(text, exclude_chars),
LUA_INTERFACE_DEF(text, memcspn),
LUA_INTERFACE_DEF(text, oneline),
+ LUA_INTERFACE_DEF(text, normalize_newlines),
LUA_INTERFACE_DEF(text, base32),
LUA_INTERFACE_DEF(text, base64),
LUA_INTERFACE_DEF(text, hex),
return false;
}
+/*
+ * Returns a writable buffer of `len` bytes taken from `pool`, or from the GLib
+ * heap when no pool is given.
+ */
+static char *
+rspamd_lua_text_newlines_alloc(rspamd_mempool_t *pool, size_t len)
+{
+	if (pool) {
+		return rspamd_mempool_alloc(pool, len);
+	}
+
+	return g_malloc(len);
+}
+
+/*
+ * Makes `t` refer to the buffer `data` of `len` bytes, releasing the previous
+ * buffer first when the text owned it. The text is marked as owning the new
+ * buffer only when that buffer did not come from a pool.
+ *
+ * NOTE(review): the previous buffer is always released with g_free() and every
+ * flag other than OWN is dropped - confirm that no owned text reaching this
+ * code uses a different allocator or carries flags that have to survive.
+ */
+static void
+rspamd_lua_text_newlines_adopt(struct rspamd_lua_text *t,
+		rspamd_mempool_t *pool,
+		char *data,
+		size_t len)
+{
+	if (t->flags & RSPAMD_TEXT_FLAG_OWN) {
+		g_free((void *) t->start);
+	}
+
+	t->start = data;
+	t->len = len;
+	t->flags = pool ? 0 : RSPAMD_TEXT_FLAG_OWN;
+}
+
+/*
+ * Rewrites the line endings of `t` according to `mode`.
+ *
+ * RSPAMD_TEXT_NEWLINES_CRLF: a CR is inserted before every LF that is not
+ * already preceded by one. Any other mode: the CR of every CR LF pair is
+ * dropped. A CR that is not directly followed by LF is kept in both modes.
+ *
+ * The data is scanned twice: the first pass counts the bytes to insert or
+ * drop, so that the exact output size is known before anything is allocated,
+ * the second pass copies the data into the new buffer.
+ *
+ * @param t text to normalize; NULL and empty texts are returned untouched
+ * @param pool if not NULL the new buffer is allocated from this pool,
+ *             otherwise g_malloc() is used and the text owns the result
+ * @param mode target newline convention
+ * @return `t` itself; its start/len/flags are replaced only when at least one
+ *         line ending had to be changed
+ */
+struct rspamd_lua_text *
+rspamd_lua_text_normalize_newlines(struct rspamd_lua_text *t,
+		rspamd_mempool_t *pool,
+		enum rspamd_text_newline_mode mode)
+{
+	if (t == NULL || t->len == 0) {
+		return t;
+	}
+
+	const char *begin = t->start;
+	const char *end = begin + t->len;
+	const char *p;
+	size_t changes = 0;
+
+	if (mode == RSPAMD_TEXT_NEWLINES_CRLF) {
+		/* Pass 1: count LFs that have no CR in front of them */
+		p = begin;
+
+		while (p < end) {
+			p += rspamd_memcspn(p, end - p, "\n", 1);
+
+			if (p == end) {
+				break;
+			}
+
+			if (p == begin || p[-1] != '\r') {
+				changes++;
+			}
+
+			p++;
+		}
+
+		if (changes == 0) {
+			/* Every LF already has its CR */
+			return t;
+		}
+
+		/* Pass 2: copy the data, adding one CR per bare LF */
+		size_t new_len = t->len + changes;
+		char *buf = rspamd_lua_text_newlines_alloc(pool, new_len);
+		char *out = buf;
+
+		p = begin;
+
+		while (p < end) {
+			size_t span = rspamd_memcspn(p, end - p, "\n", 1);
+
+			memcpy(out, p, span);
+			out += span;
+			p += span;
+
+			if (p == end) {
+				break;
+			}
+
+			if (p == begin || p[-1] != '\r') {
+				*out++ = '\r';
+			}
+
+			/* The LF itself */
+			*out++ = *p++;
+		}
+
+		rspamd_lua_text_newlines_adopt(t, pool, buf, new_len);
+
+		return t;
+	}
+
+	/* Pass 1: count CRs that are directly followed by LF */
+	p = begin;
+
+	while (p < end) {
+		p += rspamd_memcspn(p, end - p, "\r", 1);
+
+		if (p == end) {
+			break;
+		}
+
+		if (p + 1 < end && p[1] == '\n') {
+			changes++;
+		}
+
+		p++;
+	}
+
+	if (changes == 0) {
+		/* No CR LF pair anywhere */
+		return t;
+	}
+
+	/* Pass 2: copy the data, leaving out the CR of every CR LF pair */
+	size_t new_len = t->len - changes;
+	char *buf = rspamd_lua_text_newlines_alloc(pool, new_len);
+	char *out = buf;
+
+	p = begin;
+
+	while (p < end) {
+		size_t span = rspamd_memcspn(p, end - p, "\r", 1);
+
+		memcpy(out, p, span);
+		out += span;
+		p += span;
+
+		if (p == end) {
+			break;
+		}
+
+		if (p + 1 < end && p[1] == '\n') {
+			/* Drop this CR; the LF is copied with the next span */
+			p++;
+		}
+		else {
+			/* Lone CR is kept as is */
+			*out++ = *p++;
+		}
+	}
+
+	rspamd_lua_text_newlines_adopt(t, pool, buf, new_len);
+
+	return t;
+}
+
+
static int
lua_text_fromstring(lua_State *L)
return 1;
}
+/*
+ * Lua binding for rspamd_text:normalize_newlines([mode, [pool]]).
+ *
+ * Argument 1 must be a text. Argument 2 is inspected only when it is a string:
+ * "lf"/"unix" or "crlf"/"windows", compared case-insensitively, anything else
+ * raises a Lua error. Argument 3 is inspected only when it is a userdata and is
+ * then checked as a mempool. Without a mode string CRLF is used.
+ *
+ * Pushes argument 1 back as the single result.
+ */
+static int
+lua_text_normalize_newlines(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct rspamd_lua_text *t = lua_check_text(L, 1);
+
+	if (t == NULL) {
+		return luaL_error(L, "invalid arguments");
+	}
+
+	/* CRLF unless the caller explicitly asks for LF */
+	enum rspamd_text_newline_mode mode = RSPAMD_TEXT_NEWLINES_CRLF;
+
+	if (lua_type(L, 2) == LUA_TSTRING) {
+		const char *mode_str = lua_tostring(L, 2);
+
+		if (g_ascii_strcasecmp(mode_str, "lf") == 0 ||
+			g_ascii_strcasecmp(mode_str, "unix") == 0) {
+			mode = RSPAMD_TEXT_NEWLINES_LF;
+		}
+		else if (g_ascii_strcasecmp(mode_str, "crlf") != 0 &&
+				 g_ascii_strcasecmp(mode_str, "windows") != 0) {
+			return luaL_error(L, "invalid mode: %s (expected 'lf' or 'crlf')", mode_str);
+		}
+		/* "crlf" and "windows" are the default that is already set */
+	}
+
+	/* Optional pool the new buffer is taken from */
+	rspamd_mempool_t *pool = NULL;
+
+	if (lua_type(L, 3) == LUA_TUSERDATA) {
+		pool = rspamd_lua_check_mempool(L, 3);
+	}
+
+	/* The text is updated through the pointer, so the result can be ignored */
+	rspamd_lua_text_normalize_newlines(t, pool, mode);
+
+	/* Hand the very same userdata back to the caller */
+	lua_pushvalue(L, 1);
+
+	return 1;
+}
+
+
static int
lua_text_lower(lua_State *L)
{
end)
end
end)
+
+-- Unit tests for rspamd_text:normalize_newlines([mode, [pool]]).
+--
+-- Most scenarios only differ in input, mode and expected output, so they are
+-- kept in a single table and turned into tests by one loop. A case with a
+-- `same_ptr` message additionally requires that the data pointer of the text
+-- is not replaced, i.e. that nothing was reallocated.
+context("Rspamd_text:normalize_newlines() test", function()
+  local rspamd_text = require "rspamd_text"
+
+  -- Input and expected output of the large text scenario
+  local lines = {}
+  for i = 1, 1000 do
+    lines[i] = "line" .. i
+  end
+  local large_lf = table.concat(lines, "\n")
+  local large_crlf = table.concat(lines, "\r\n")
+
+  local cases = {
+    -- No normalization needed
+    { name = "already CRLF - no change", mode = "crlf",
+      input = "line1\r\nline2\r\nline3\r\n", expected = "line1\r\nline2\r\nline3\r\n",
+      same_ptr = "should return same text when no change needed" },
+    { name = "already LF only - no change for LF mode", mode = "lf",
+      input = "line1\nline2\nline3\n", expected = "line1\nline2\nline3\n",
+      same_ptr = "should return same text when no change needed" },
+    { name = "no newlines at all - no change", mode = "crlf",
+      input = "just some text without newlines", expected = "just some text without newlines",
+      same_ptr = "should return same text when no newlines" },
+
+    -- LF to CRLF conversion
+    { name = "LF to CRLF: simple", mode = "crlf",
+      input = "line1\nline2\n", expected = "line1\r\nline2\r\n" },
+    { name = "LF to CRLF: bare LF after CRLF stays CRLF", mode = "crlf",
+      input = "line1\r\nline2\n", expected = "line1\r\nline2\r\n" },
+    { name = "LF to CRLF: multiple bare LFs", mode = "crlf",
+      input = "a\nb\nc\n", expected = "a\r\nb\r\nc\r\n" },
+    { name = "LF to CRLF: text starting with LF", mode = "crlf",
+      input = "\nfirst line\n", expected = "\r\nfirst line\r\n" },
+    { name = "LF to CRLF: text ending with LF", mode = "crlf",
+      input = "last line\n", expected = "last line\r\n" },
+    { name = "LF to CRLF: consecutive LFs", mode = "crlf",
+      input = "line1\n\nline3", expected = "line1\r\n\r\nline3" },
+
+    -- CRLF to LF conversion
+    { name = "CRLF to LF: simple", mode = "lf",
+      input = "line1\r\nline2\r\n", expected = "line1\nline2\n" },
+    { name = "CRLF to LF: mixed CRLF and LF", mode = "lf",
+      input = "line1\r\nline2\nline3\r\n", expected = "line1\nline2\nline3\n" },
+    { name = "CRLF to LF: multiple consecutive CRLF", mode = "lf",
+      input = "a\r\n\r\nb", expected = "a\n\nb" },
+
+    -- Weird line endings
+    { name = "CR only (not followed by LF) preserved in CRLF mode", mode = "crlf",
+      input = "line1\rline2\n", expected = "line1\rline2\r\n" },
+    { name = "CR only preserved in LF mode", mode = "lf",
+      input = "line1\rline2\r\n", expected = "line1\rline2\n" },
+    { name = "multiple CRs before LF", mode = "lf",
+      input = "line\r\r\nline2", expected = "line\r\nline2" },
+
+    -- Inconsistent line endings
+    { name = "mixed CRLF and LF to CRLF", mode = "crlf",
+      input = "line1\r\nline2\nline3\r\nline4\n", expected = "line1\r\nline2\r\nline3\r\nline4\r\n" },
+    { name = "mixed CRLF and LF to LF", mode = "lf",
+      input = "line1\r\nline2\nline3\r\nline4\n", expected = "line1\nline2\nline3\nline4\n" },
+
+    -- Only line endings
+    { name = "single LF to CRLF", mode = "crlf",
+      input = "\n", expected = "\r\n" },
+    { name = "single CRLF to LF", mode = "lf",
+      input = "\r\n", expected = "\n" },
+    { name = "multiple LF only to CRLF", mode = "crlf",
+      input = "\n\n\n", expected = "\r\n\r\n\r\n" },
+    { name = "multiple CRLF only to LF", mode = "lf",
+      input = "\r\n\r\n\r\n", expected = "\n\n\n" },
+
+    -- Edge cases
+    { name = "empty string - no change", mode = "crlf",
+      input = "", expected = "",
+      same_ptr = "should return same text when no change needed" },
+    { name = "single character without newline", mode = "crlf",
+      input = "x", expected = "x",
+      same_ptr = "should return same text when no change needed" },
+    { name = "single character with newline", mode = "crlf",
+      input = "x\n", expected = "x\r\n" },
+    { name = "large text with many newlines", mode = "crlf",
+      input = large_lf, expected = large_crlf },
+    -- "\0" is used instead of "\x00": hex escapes are not part of Lua 5.1
+    { name = "text with null bytes", mode = "crlf",
+      input = "line1\n\0line2\n", expected = "line1\r\n\0line2\r\n" },
+
+    -- Mode parameter variations
+    { name = "mode 'crlf' works", mode = "crlf",
+      input = "a\n", expected = "a\r\n" },
+    { name = "mode 'windows' works (alias for crlf)", mode = "windows",
+      input = "a\n", expected = "a\r\n" },
+    { name = "mode 'lf' works", mode = "lf",
+      input = "a\r\n", expected = "a\n" },
+    { name = "mode 'unix' works (alias for lf)", mode = "unix",
+      input = "a\r\n", expected = "a\n" },
+    { name = "case insensitive mode", mode = "CRLF",
+      input = "a\n", expected = "a\r\n" },
+  }
+
+  for _, c in ipairs(cases) do
+    test(c.name, function()
+      local t = rspamd_text.fromstring(c.input)
+      local orig_ptr = t:ptr()
+      t:normalize_newlines(c.mode)
+      if c.same_ptr then
+        assert_equal(t:ptr(), orig_ptr, c.same_ptr)
+      end
+      assert_equal(t:str(), c.expected)
+    end)
+  end
+
+  -- Called without any argument on purpose, so it cannot be part of the table
+  test("default mode is crlf", function()
+    local t = rspamd_text.fromstring("a\n")
+    t:normalize_newlines()
+    assert_equal(t:str(), "a\r\n")
+  end)
+
+  test("returns the normalized text", function()
+    local t = rspamd_text.fromstring("a\n")
+    local res = t:normalize_newlines("crlf")
+    assert_equal(res:str(), "a\r\n")
+  end)
+
+  test("invalid mode raises an error and keeps the text", function()
+    local t = rspamd_text.fromstring("a\n")
+    local ok = pcall(function()
+      t:normalize_newlines("bogus")
+    end)
+    assert_equal(ok, false)
+    assert_equal(t:str(), "a\n")
+  end)
+end)