From: Vsevolod Stakhov Date: Mon, 15 Sep 2025 18:44:37 +0000 (+0100) Subject: [Rework] MIME detection via Lua Magic; enforce cfg in Lua task API X-Git-Tag: 3.13.0~7^2~1 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=9be7a2598efa3f3d138ae07b29213d1c976078a8;p=thirdparty%2Frspamd.git [Rework] MIME detection via Lua Magic; enforce cfg in Lua task API - Add rspamd_mime_parser_config on cfg; remove global state and lazy init - Initialize parser config once per cfg; preload lua_magic.detect_mime_part - Always run detection after normal part parse; promote .eml/message parts - Preserve detected_ext/detected_ct/detected_type and NO_TEXT flag - Remove duplicate detection from message.c; add debug logs - Restore CTE parsing API and fix call sites - Enforce cfg requirement in rspamd_task.load_from_string/load_from_file/create - Fix unit tests to pass rspamd_config to load_from_string --- diff --git a/lualib/lua_magic/heuristics.lua b/lualib/lua_magic/heuristics.lua index b8a1b41886..8258ff2493 100644 --- a/lualib/lua_magic/heuristics.lua +++ b/lualib/lua_magic/heuristics.lua @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-]]-- +]] -- --[[[ -- @module lua_magic/heuristics @@ -63,16 +63,16 @@ local zip_patterns = { local txt_trie local txt_patterns = { html = { - { [=[(?i)]]=], 32 }, - { [[(?i)]]=], 32 }, + { [[(?i)]], 31 }, -- Another spammy pattern - { [[(?i)<\!DOCTYPE HTML\b]], 33 }, - { [[(?i) tlen lua_util.debugm(N, log_obj, "text part check: %s printable, %s non-printable, %s total", - tlen - non_printable, non_printable, tlen) + tlen - non_printable, non_printable, tlen) if non_printable / tlen > 0.0078125 then return false end @@ -509,7 +521,7 @@ exports.text_part_heuristic = function(part, log_obj, _) if ext then res[ext] = (res[ext] or 0) + weight * #positions lua_util.debugm(N, log_obj, "found txt pattern for %s: %s, total: %s; %s/%s announced", - ext, weight * #positions, res[ext], mtype, msubtype) + ext, weight * #positions, res[ext], mtype, msubtype) end end diff --git a/lualib/lua_magic/patterns.lua b/lualib/lua_magic/patterns.lua index 4a5abd8ce9..b51deab7f6 100644 --- a/lualib/lua_magic/patterns.lua +++ b/lualib/lua_magic/patterns.lua @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-]]-- +]] -- --[[[ -- @module lua_magic/patterns @@ -255,6 +255,47 @@ local patterns = { }, } }, + zip = { + matches = { + { + hex = [[504b0304]], -- PK\x03\x04 + relative_position = 0, + weight = 60, + }, + } + }, + rar = { + matches = { + { + hex = [[526172211a0700]], -- RAR4 + relative_position = 0, + weight = 60, + }, + { + hex = [[526172211a070100]], -- RAR5 + relative_position = 0, + weight = 60, + }, + } + }, + ['7z'] = { + matches = { + { + hex = [[377abcaf271c]], -- 7z signature + relative_position = 0, + weight = 60, + }, + } + }, + gz = { + matches = { + { + string = [[^\x{1f}\x{8b}\x{08}]], -- gzip with deflate method + position = 3, + weight = 60, + }, + } + }, xar = { matches = { { @@ -392,6 +433,32 @@ local patterns = { }, } }, + webp = { + matches = { + { + -- RIFF....WEBP + string = [[^RIFF....WEBP]], + position = 12, + weight = 60, + }, + } + }, + svg = { + matches = { + { + -- Case-insensitive in the first chunk + string = [[(?i)<svg\b]], + position = { '>=', 0 }, + weight = 40, + }, + { + -- XML prolog hints + string = [[<\?xml\b]], + position = { '>=', 0 }, + weight = 20, + }, + } + }, -- Other pgp = { matches = { diff --git a/lualib/lua_magic/types.lua b/lualib/lua_magic/types.lua index ad4ae4349e..a005247b59 100644 --- a/lualib/lua_magic/types.lua +++ b/lualib/lua_magic/types.lua @@ -12,7 +12,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
-]]-- +]] -- --[[[ -- @module lua_magic/patterns @@ -284,6 +284,17 @@ local types = { ct = 'image/heic', av_check = false, }, + webp = { + type = 'image', + ct = 'image/webp', + av_check = false, + }, + svg = { + type = 'image', + ct = 'image/svg+xml', + av_check = false, + no_text = true, + }, dwg = { type = 'image', ct = 'image/vnd.dwg', diff --git a/src/libmime/archives.c b/src/libmime/archives.c index c40c0e88a1..b02a659e2e 100644 --- a/src/libmime/archives.c +++ b/src/libmime/archives.c @@ -1777,8 +1777,7 @@ rspamd_archive_process_7zip(struct rspamd_task *task, return; } - while ((p = rspamd_7zip_read_next_section(task, p, end, arch, part)) != NULL) - ; + while ((p = rspamd_7zip_read_next_section(task, p, end, arch, part)) != NULL); part->part_type = RSPAMD_MIME_PART_ARCHIVE; part->specific.arch = arch; @@ -2026,44 +2025,37 @@ void rspamd_archives_process(struct rspamd_task *task) { unsigned int i; struct rspamd_mime_part *part; - const unsigned char rar_magic[] = {0x52, 0x61, 0x72, 0x21, 0x1A, 0x07}; - const unsigned char zip_magic[] = {0x50, 0x4b, 0x03, 0x04}; - const unsigned char sz_magic[] = {'7', 'z', 0xBC, 0xAF, 0x27, 0x1C}; - const unsigned char gz_magic[] = {0x1F, 0x8B, 0x08}; PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, part) { - if (part->part_type == RSPAMD_MIME_PART_UNDEFINED) { - if (part->parsed_data.len > 0) { - if (rspamd_archive_cheat_detect(part, "zip", - zip_magic, sizeof(zip_magic))) { + if (part->parsed_data.len > 0 && part->part_type != RSPAMD_MIME_PART_ARCHIVE) { + const char *ext = part->detected_ext; + if (ext) { + if (g_ascii_strcasecmp(ext, "zip") == 0) { rspamd_archive_process_zip(task, part); } - else if (rspamd_archive_cheat_detect(part, "rar", - rar_magic, sizeof(rar_magic))) { + else if (g_ascii_strcasecmp(ext, "rar") == 0) { rspamd_archive_process_rar(task, part); } - else if (rspamd_archive_cheat_detect(part, "7z", - sz_magic, sizeof(sz_magic))) { + else if (g_ascii_strcasecmp(ext, "7z") == 0) { 
rspamd_archive_process_7zip(task, part); } - else if (rspamd_archive_cheat_detect(part, "gz", - gz_magic, sizeof(gz_magic))) { + else if (g_ascii_strcasecmp(ext, "gz") == 0) { rspamd_archive_process_gzip(task, part); } + } - if (part->ct && (part->ct->flags & RSPAMD_CONTENT_TYPE_TEXT) && - part->part_type == RSPAMD_MIME_PART_ARCHIVE && - part->specific.arch) { - struct rspamd_archive *arch = part->specific.arch; + if (part->ct && (part->ct->flags & RSPAMD_CONTENT_TYPE_TEXT) && + part->part_type == RSPAMD_MIME_PART_ARCHIVE && + part->specific.arch) { + struct rspamd_archive *arch = part->specific.arch; - msg_info_task("found %s archive with incorrect content-type: %T/%T", - rspamd_archive_type_str(arch->type), - &part->ct->type, &part->ct->subtype); + msg_info_task("found %s archive with incorrect content-type: %T/%T", + rspamd_archive_type_str(arch->type), + &part->ct->type, &part->ct->subtype); - if (!(part->ct->flags & RSPAMD_CONTENT_TYPE_MISSING)) { - part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN; - } + if (!(part->ct->flags & RSPAMD_CONTENT_TYPE_MISSING)) { + part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN; } } } diff --git a/src/libmime/images.c b/src/libmime/images.c index 1078baba68..d137311b10 100644 --- a/src/libmime/images.c +++ b/src/libmime/images.c @@ -600,27 +600,47 @@ static bool process_image(struct rspamd_task *task, struct rspamd_mime_part *part) { struct rspamd_image *img; + const char *ext = part->detected_ext; - img = rspamd_maybe_process_image(task->task_pool, &part->parsed_data); - - if (img != NULL) { - msg_debug_images("detected %s image of size %ud x %ud", - rspamd_image_type_str(img->type), - img->width, img->height); - - if (part->cd) { - img->filename = &part->cd->filename; + if (ext != NULL && part->parsed_data.len > 0) { + /* Prefer Lua Magic decision; do not re-detect by magic */ + if (g_ascii_strcasecmp(ext, "png") == 0) { + img = process_png_image(task->task_pool, &part->parsed_data); } + else if (g_ascii_strcasecmp(ext, "jpg") == 0 
|| g_ascii_strcasecmp(ext, "jpeg") == 0) { + img = process_jpg_image(task->task_pool, &part->parsed_data); + } + else if (g_ascii_strcasecmp(ext, "gif") == 0) { + img = process_gif_image(task->task_pool, &part->parsed_data); + } + else if (g_ascii_strcasecmp(ext, "bmp") == 0) { + img = process_bmp_image(task->task_pool, &part->parsed_data); + } + else { + /* Unsupported image subtype for structural parsing; skip without re-magic */ + return false; + } + } + else { + /* Fallback for legacy/unknown cases */ + img = rspamd_maybe_process_image(task->task_pool, &part->parsed_data); + } - img->parent = part; - - part->part_type = RSPAMD_MIME_PART_IMAGE; - part->specific.img = img; + if (img == NULL) { + return false; + } - return true; + img->parent = part; + if (part->cd) { + img->filename = &part->cd->filename; } - return false; + part->specific.img = img; + part->part_type = RSPAMD_MIME_PART_IMAGE; + if (part->cd == NULL) { + part->cd = rspamd_mempool_alloc0(task->task_pool, sizeof(*part->cd)); + } + return true; } const char * @@ -715,4 +735,4 @@ void rspamd_images_link(struct rspamd_task *task) rspamd_image_process_part(task, part); } } -} \ No newline at end of file +} diff --git a/src/libmime/message.c b/src/libmime/message.c index 61f675d075..21b54e7ec9 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -1447,6 +1447,73 @@ void rspamd_message_process(struct rspamd_task *task) rspamd_archives_process(task); + /* Second pass: fill detected_* for parts not decided during parsing */ + if (L && task->cfg->mime_parser_cfg && + rspamd_mime_parser_get_lua_magic_cbref(task->cfg->mime_parser_cfg) != -1) { + unsigned int j; + struct rspamd_mime_part *pp; + PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), j, pp) + { + if (pp->parsed_data.len > 0 && + (/* no detection yet */ (pp->detected_type == NULL && pp->detected_ext == NULL) || + /* refine generic archives */ + (pp->detected_ext && (g_ascii_strcasecmp(pp->detected_ext, "zip") == 0 || + 
g_ascii_strcasecmp(pp->detected_ext, "rar") == 0 || + g_ascii_strcasecmp(pp->detected_ext, "7z") == 0 || + g_ascii_strcasecmp(pp->detected_ext, "gz") == 0)))) { + struct rspamd_mime_part **pmime; + struct rspamd_task **ptask; + lua_pushcfunction(L, &rspamd_lua_traceback); + int err_idx2 = lua_gettop(L); + lua_rawgeti(L, LUA_REGISTRYINDEX, rspamd_mime_parser_get_lua_magic_cbref(task->cfg->mime_parser_cfg)); + pmime = lua_newuserdata(L, sizeof(struct rspamd_mime_part *)); + rspamd_lua_setclass(L, rspamd_mimepart_classname, -1); + *pmime = pp; + ptask = lua_newuserdata(L, sizeof(struct rspamd_task *)); + rspamd_lua_setclass(L, rspamd_task_classname, -1); + *ptask = task; + + if (lua_pcall(L, 2, 2, err_idx2) == 0) { + if (lua_istable(L, -1)) { + const char *mb; + if (lua_isstring(L, -2)) { + pp->detected_ext = rspamd_mempool_strdup(task->task_pool, lua_tostring(L, -2)); + } + lua_pushstring(L, "ct"); + lua_gettable(L, -2); + if (lua_isstring(L, -1)) { + mb = lua_tostring(L, -1); + if (mb) { + rspamd_ftok_t srch; + srch.begin = mb; + srch.len = strlen(mb); + pp->detected_ct = rspamd_content_type_parse(srch.begin, srch.len, task->task_pool); + } + } + lua_pop(L, 1); + lua_pushstring(L, "type"); + lua_gettable(L, -2); + if (lua_isstring(L, -1)) { + pp->detected_type = rspamd_mempool_strdup(task->task_pool, lua_tostring(L, -1)); + } + lua_pop(L, 1); + lua_pushstring(L, "no_text"); + lua_gettable(L, -2); + if (lua_isboolean(L, -1) && lua_toboolean(L, -1)) { + pp->flags |= RSPAMD_MIME_PART_NO_TEXT_EXTRACTION; + } + lua_pop(L, 1); + } + } + else { + msg_err_task("second-pass detect type: %s", lua_tostring(L, -1)); + } + /* restore stack */ + lua_settop(L, 0); + } + } + } + if (L) { old_top = lua_gettop(L); } diff --git a/src/libmime/mime_parser.c b/src/libmime/mime_parser.c index d66731dd6b..075fec347d 100644 --- a/src/libmime/mime_parser.c +++ b/src/libmime/mime_parser.c @@ -63,16 +63,43 @@ rspamd_mime_parser_init_shared(struct rspamd_config *cfg) if 
(rspamd_lua_require_function(cfg->mime_parser_cfg->L, "lua_magic", "detect_mime_part")) { cfg->mime_parser_cfg->lua_magic_detect_cbref = luaL_ref(cfg->mime_parser_cfg->L, LUA_REGISTRYINDEX); } + else { + msg_err("fatal error: cannot load lua_magic.detect_mime_part (see previous errors)"); + lua_settop(cfg->mime_parser_cfg->L, old_top); + g_abort(); + } lua_settop(cfg->mime_parser_cfg->L, old_top); } + else if (!cfg->mime_parser_cfg->L) { + msg_err("fatal error: lua state is not initialised for mime parser"); + g_abort(); + } } return cfg->mime_parser_cfg; } -void rspamd_mime_parser_free_shared(struct rspamd_mime_parser_config *unused) +void rspamd_mime_parser_free_shared(struct rspamd_mime_parser_config *cfg) { - /* noop: lifetime tied to process */ + if (cfg == NULL) { + return; + } + + /* Unref Lua callback if registered */ + if (cfg->L && cfg->lua_magic_detect_cbref != -1) { + int old_top = lua_gettop(cfg->L); + luaL_unref(cfg->L, LUA_REGISTRYINDEX, cfg->lua_magic_detect_cbref); + cfg->lua_magic_detect_cbref = -1; + lua_settop(cfg->L, old_top); + } + + /* Destroy multipattern */ + if (cfg->mp_boundary) { + rspamd_multipattern_destroy(cfg->mp_boundary); + cfg->mp_boundary = NULL; + } + + g_free(cfg); } int rspamd_mime_parser_get_lua_magic_cbref(const struct rspamd_mime_parser_config *cfg) @@ -918,6 +945,7 @@ rspamd_mime_maybe_detect_type(struct rspamd_task *task, if (L && task->cfg->mime_parser_cfg && rspamd_mime_parser_get_lua_magic_cbref(task->cfg->mime_parser_cfg) != -1) { + msg_debug_mime("will call lua_magic.detect_mime_part for part #%ud", npart->part_number); old_top = lua_gettop(L); lua_pushcfunction(L, &rspamd_lua_traceback); err_idx = lua_gettop(L); @@ -1007,6 +1035,14 @@ rspamd_mime_maybe_detect_type(struct rspamd_task *task, lua_settop(L, old_top); } + else { + int cbref = -1; + if (task->cfg && task->cfg->mime_parser_cfg) { + cbref = rspamd_mime_parser_get_lua_magic_cbref(task->cfg->mime_parser_cfg); + } + msg_debug_mime("skip lua_magic for part 
#%ud: L=%p, cbref=%d", + npart->part_number, (void *) L, cbref); + } /* Fallback: if nothing detected but declared CT is text, set detected_type to text */ if (npart->detected_type == NULL && npart->ct && diff --git a/src/libmime/mime_parser.h b/src/libmime/mime_parser.h index 6ed175dc8b..38175256cc 100644 --- a/src/libmime/mime_parser.h +++ b/src/libmime/mime_parser.h @@ -22,6 +22,10 @@ struct rspamd_config; struct rspamd_mime_parser_config; +#ifdef __cplusplus +extern "C" { +#endif + /* Initialize shared mime parser config (stores Lua refs, precompiled data) */ struct rspamd_mime_parser_config *rspamd_mime_parser_init_shared(struct rspamd_config *cfg); void rspamd_mime_parser_free_shared(struct rspamd_mime_parser_config *cfg); @@ -29,10 +33,6 @@ void rspamd_mime_parser_free_shared(struct rspamd_mime_parser_config *cfg); /* Accessors */ int rspamd_mime_parser_get_lua_magic_cbref(const struct rspamd_mime_parser_config *cfg); -#ifdef __cplusplus -extern "C" { -#endif - struct rspamd_task; struct rspamd_mime_part; diff --git a/src/libserver/cfg_utils.cxx b/src/libserver/cfg_utils.cxx index c22a9b877b..1e96c320af 100644 --- a/src/libserver/cfg_utils.cxx +++ b/src/libserver/cfg_utils.cxx @@ -21,6 +21,7 @@ #include "cfg_file.h" #include "rspamd.h" #include "cfg_file_private.h" +#include "libmime/mime_parser.h" #include "maps/map.h" #include "maps/map_helpers.h" @@ -383,6 +384,12 @@ void rspamd_config_free(struct rspamd_config *cfg) luaL_unref(RSPAMD_LUA_CFG_STATE(cfg), LUA_REGISTRYINDEX, sc->cbref); } + /* Free mime parser shared config if created */ + if (cfg->mime_parser_cfg) { + rspamd_mime_parser_free_shared(cfg->mime_parser_cfg); + cfg->mime_parser_cfg = nullptr; + } + DL_FOREACH_SAFE(cfg->setting_ids, set, stmp) { REF_RELEASE(set);