WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-]]--
+]] --
--[[[
-- @module lua_magic/heuristics
local txt_trie
local txt_patterns = {
html = {
- { [=[(?i)<html[\s>]]=], 32 },
- { [[(?i)<script\b]], 20 }, -- Commonly used by spammers
+ { [=[(?i)<html[\s>]]=], 32 },
+ { [[(?i)<script\b]], 20 }, -- Commonly used by spammers
{ [[<script\s+type="text\/javascript">]], 31 }, -- Another spammy pattern
- { [[(?i)<\!DOCTYPE HTML\b]], 33 },
- { [[(?i)<body\b]], 20 },
- { [[(?i)<table\b]], 20 },
- { [[(?i)<a\s]], 10 },
- { [[(?i)<p\b]], 10 },
- { [[(?i)<div\b]], 10 },
- { [[(?i)<span\b]], 10 },
+ { [[(?i)<\!DOCTYPE HTML\b]], 33 },
+ { [[(?i)<body\b]], 20 },
+ { [[(?i)<table\b]], 20 },
+ { [[(?i)<a\s]], 10 },
+ { [[(?i)<p\b]], 10 },
+ { [[(?i)<div\b]], 10 },
+ { [[(?i)<span\b]], 10 },
},
csv = {
{ [[(?:[-a-zA-Z0-9_]+\s*,){2,}(?:[-a-zA-Z0-9_]+,?[ ]*[\r\n])]], 20 }
local function compile_tries()
local default_compile_flags = bit.bor(rspamd_trie.flags.re,
- rspamd_trie.flags.dot_all,
- rspamd_trie.flags.single_match,
- rspamd_trie.flags.no_start)
+ rspamd_trie.flags.dot_all,
+ rspamd_trie.flags.single_match,
+ rspamd_trie.flags.no_start)
local function compile_pats(patterns, indexes, transform_func, compile_flags)
local strs = {}
for ext, pats in pairs(patterns) do
-- Builds an anchored pattern from `pat`: every element yielded by
-- fun.iter(pat) gets a literal `\x{00}` appended, the pieces are joined with
-- table.concat and the result is prefixed with '^'.
-- NOTE(review): presumably this produces the UTF-16LE form of an ASCII name
-- as stored in OLE directory entries - confirm.
local function msoffice_pattern_transform(pat)
return '^' ..
table.concat(
- fun.totable(
- fun.map(function(c)
- return c .. [[\x{00}]]
- end,
- fun.iter(pat))))
+ fun.totable(
+ fun.map(function(c)
+ return c .. [[\x{00}]]
+ end,
+ fun.iter(pat))))
end
local function msoffice_clsid_transform(pat)
local hex_table = {}
end
-- Directory entries
msoffice_trie = compile_pats(msoffice_patterns, msoffice_patterns_indexes,
- msoffice_pattern_transform)
+ msoffice_pattern_transform)
-- Clsids
msoffice_trie_clsid = compile_pats(msoffice_clsids, msoffice_clsid_indexes,
- msoffice_clsid_transform)
+ msoffice_clsid_transform)
-- Misc zip patterns at the initial fragment
zip_trie = compile_pats(zip_patterns, zip_patterns_indexes,
- function(pat)
- return pat
- end)
+ function(pat)
+ return pat
+ end)
-- Text patterns at the initial fragment
txt_trie = compile_pats(txt_patterns, txt_patterns_indexes,
- function(pat_tbl)
- return pat_tbl[1]
- end,
- bit.bor(rspamd_trie.flags.re,
- rspamd_trie.flags.dot_all,
- rspamd_trie.flags.no_start))
+ function(pat_tbl)
+ return pat_tbl[1]
+ end,
+ bit.bor(rspamd_trie.flags.re,
+ rspamd_trie.flags.dot_all,
+ rspamd_trie.flags.no_start))
end
end
for n, _ in pairs(matches) do
if msoffice_clsid_indexes[n] then
lua_util.debugm(N, log_obj, "found valid clsid for %s",
- msoffice_clsid_indexes[n][1])
+ msoffice_clsid_indexes[n][1])
return true, msoffice_clsid_indexes[n][1]
end
end
apk = 0,
} -- ext + confidence pairs
+ -- Tells whether a file name contains ASCII control bytes or common
+ -- zero-width Unicode characters (matched as raw UTF-8 byte sequences).
+ -- @param fname string file name to inspect
+ -- @return boolean true if any such byte sequence is present
+ local function has_control_or_zw(fname)
+   -- ASCII control bytes: NUL (written as %z so the pattern carries no
+   -- embedded zero), 0x01..0x1F and DEL (0x7F)
+   if fname:find("[%z\1-\31\127]") then
+     return true
+   end
+   -- U+200B..U+200D encode as E2 80 8B .. E2 80 8D
+   if fname:find("\226\128[\139-\141]") then
+     return true
+   end
+   -- U+FEFF (zero-width no-break space / BOM) encodes as EF BB BF
+   if fname:find("\239\187\191") then
+     return true
+   end
+   return false
+ end
+
-- General msoffice patterns
local function add_msoffice_confidence(incr)
res.docx = res.docx + incr
-- Find specific files/folders in zip file
local files = arch:get_files(100) or {}
for _, file in ipairs(files) do
+ if has_control_or_zw(file) then
+ lua_util.debugm(N, log_obj, "archive filename has control/zw chars: %s", file)
+ end
if file == '[Content_Types].xml' then
add_msoffice_confidence(10)
elseif file:sub(1, 3) == 'xl/' then
for n, _ in pairs(matches) do
if zip_patterns_indexes[n] then
lua_util.debugm(N, log_obj, "found zip pattern for %s",
- zip_patterns_indexes[n][1])
+ zip_patterns_indexes[n][1])
return zip_patterns_indexes[n][1], 40
end
end
csv_grammar = lpeg.Cf(lpeg.Cc(0) * field * lpeg.P((lpeg.P(',') +
lpeg.P('\t')) * field) ^ 1 * (lpeg.S '\r\n' + -1),
- function(acc)
- return acc + 1
- end)
+ function(acc)
+ return acc + 1
+ end)
end
return csv_grammar
if not ncommas then
lua_util.debugm(N, log_obj, "not a csv line at line number %s",
- matched_lines)
+ matched_lines)
return false
end
if expected_commas and ncommas ~= expected_commas then
-- Mismatched commas
lua_util.debugm(N, log_obj, "missmatched commas on line %s: %s != %s",
- matched_lines, ncommas, expected_commas)
+ matched_lines, ncommas, expected_commas)
return false
elseif not expected_commas then
if ncommas == 0 then
end
lua_util.debugm(N, log_obj, "csv content is sane: %s fields; %s lines checked",
- expected_commas, matched_lines)
+ expected_commas, matched_lines)
return true
end
until i > tlen
lua_util.debugm(N, log_obj, "text part check: %s printable, %s non-printable, %s total",
- tlen - non_printable, non_printable, tlen)
+ tlen - non_printable, non_printable, tlen)
if non_printable / tlen > 0.0078125 then
return false
end
if ext then
res[ext] = (res[ext] or 0) + weight * #positions
lua_util.debugm(N, log_obj, "found txt pattern for %s: %s, total: %s; %s/%s announced",
- ext, weight * #positions, res[ext], mtype, msubtype)
+ ext, weight * #positions, res[ext], mtype, msubtype)
end
end
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-]]--
+]] --
--[[[
-- @module lua_magic/patterns
},
}
},
+ -- Archive containers recognised by their leading signature bytes
+ zip = {
+ matches = {
+ {
+ hex = [[504b0304]], -- PK\x03\x04
+ relative_position = 0,
+ weight = 60,
+ },
+ }
+ },
+ -- Two signatures: the 7-byte RAR4 marker and the 8-byte RAR5 marker
+ rar = {
+ matches = {
+ {
+ hex = [[526172211a0700]], -- RAR4
+ relative_position = 0,
+ weight = 60,
+ },
+ {
+ hex = [[526172211a070100]], -- RAR5
+ relative_position = 0,
+ weight = 60,
+ },
+ }
+ },
+ ['7z'] = {
+ matches = {
+ {
+ hex = [[377abcaf271c]], -- 7z signature
+ relative_position = 0,
+ weight = 60,
+ },
+ }
+ },
+ -- NOTE(review): unlike the hex entries above, this one uses an anchored
+ -- string pattern with an absolute `position`; presumably 3 is the offset at
+ -- which the 3-byte match ends - confirm against the matcher.
+ gz = {
+ matches = {
+ {
+ string = [[^\x{1f}\x{8b}\x{08}]], -- gzip with deflate method
+ position = 3,
+ weight = 60,
+ },
+ }
+ },
xar = {
matches = {
{
},
}
},
+ -- WebP: "RIFF", four arbitrary bytes, then "WEBP" (12 bytes in total)
+ -- NOTE(review): the four '.' presumably rely on the pattern being compiled
+ -- so that '.' matches any byte, including newlines - confirm.
+ webp = {
+ matches = {
+ {
+ -- RIFF....WEBP
+ string = [[^RIFF....WEBP]],
+ position = 12,
+ weight = 60,
+ },
+ }
+ },
+ -- SVG: textual hints that may occur at any offset (position >= 0)
+ svg = {
+ matches = {
+ {
+ -- Case-insensitive <svg ...> in the first chunk
+ string = [[(?i)<svg\b]],
+ position = { '>=', 0 },
+ weight = 40,
+ },
+ {
+ -- XML prolog hints
+ -- NOTE(review): this adds svg weight for any document that has an XML
+ -- prolog, not only SVG - confirm this is intended.
+ string = [[<\?xml\b]],
+ position = { '>=', 0 },
+ weight = 20,
+ },
+ }
+ },
-- Other
pgp = {
matches = {
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-]]--
+]] --
--[[[
-- @module lua_magic/patterns
ct = 'image/heic',
av_check = false,
},
+ -- WebP image
+ webp = {
+ type = 'image',
+ ct = 'image/webp',
+ av_check = false,
+ },
+ -- SVG image
+ -- NOTE(review): `no_text` is presumably the flag read by the C caller to
+ -- disable text extraction for the part - confirm.
+ svg = {
+ type = 'image',
+ ct = 'image/svg+xml',
+ av_check = false,
+ no_text = true,
+ },
dwg = {
type = 'image',
ct = 'image/vnd.dwg',
return;
}
- while ((p = rspamd_7zip_read_next_section(task, p, end, arch, part)) != NULL)
- ;
+ while ((p = rspamd_7zip_read_next_section(task, p, end, arch, part)) != NULL);
part->part_type = RSPAMD_MIME_PART_ARCHIVE;
part->specific.arch = arch;
{
unsigned int i;
struct rspamd_mime_part *part;
- const unsigned char rar_magic[] = {0x52, 0x61, 0x72, 0x21, 0x1A, 0x07};
- const unsigned char zip_magic[] = {0x50, 0x4b, 0x03, 0x04};
- const unsigned char sz_magic[] = {'7', 'z', 0xBC, 0xAF, 0x27, 0x1C};
- const unsigned char gz_magic[] = {0x1F, 0x8B, 0x08};
PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), i, part)
{
- if (part->part_type == RSPAMD_MIME_PART_UNDEFINED) {
- if (part->parsed_data.len > 0) {
- if (rspamd_archive_cheat_detect(part, "zip",
- zip_magic, sizeof(zip_magic))) {
+ if (part->parsed_data.len > 0 && part->part_type != RSPAMD_MIME_PART_ARCHIVE) {
+ const char *ext = part->detected_ext;
+ if (ext) {
+ if (g_ascii_strcasecmp(ext, "zip") == 0) {
rspamd_archive_process_zip(task, part);
}
- else if (rspamd_archive_cheat_detect(part, "rar",
- rar_magic, sizeof(rar_magic))) {
+ else if (g_ascii_strcasecmp(ext, "rar") == 0) {
rspamd_archive_process_rar(task, part);
}
- else if (rspamd_archive_cheat_detect(part, "7z",
- sz_magic, sizeof(sz_magic))) {
+ else if (g_ascii_strcasecmp(ext, "7z") == 0) {
rspamd_archive_process_7zip(task, part);
}
- else if (rspamd_archive_cheat_detect(part, "gz",
- gz_magic, sizeof(gz_magic))) {
+ else if (g_ascii_strcasecmp(ext, "gz") == 0) {
rspamd_archive_process_gzip(task, part);
}
+ }
- if (part->ct && (part->ct->flags & RSPAMD_CONTENT_TYPE_TEXT) &&
- part->part_type == RSPAMD_MIME_PART_ARCHIVE &&
- part->specific.arch) {
- struct rspamd_archive *arch = part->specific.arch;
+ if (part->ct && (part->ct->flags & RSPAMD_CONTENT_TYPE_TEXT) &&
+ part->part_type == RSPAMD_MIME_PART_ARCHIVE &&
+ part->specific.arch) {
+ struct rspamd_archive *arch = part->specific.arch;
- msg_info_task("found %s archive with incorrect content-type: %T/%T",
- rspamd_archive_type_str(arch->type),
- &part->ct->type, &part->ct->subtype);
+ msg_info_task("found %s archive with incorrect content-type: %T/%T",
+ rspamd_archive_type_str(arch->type),
+ &part->ct->type, &part->ct->subtype);
- if (!(part->ct->flags & RSPAMD_CONTENT_TYPE_MISSING)) {
- part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
- }
+ if (!(part->ct->flags & RSPAMD_CONTENT_TYPE_MISSING)) {
+ part->ct->flags |= RSPAMD_CONTENT_TYPE_BROKEN;
}
}
}
process_image(struct rspamd_task *task, struct rspamd_mime_part *part)
{
+ /*
+  * Tries to parse the part as an image. When part->detected_ext is set and
+  * the part has data, only the png/jpg/jpeg/gif/bmp parsers are tried and any
+  * other extension returns false; otherwise rspamd_maybe_process_image() is
+  * used. On success the image is linked to the part, part_type becomes
+  * RSPAMD_MIME_PART_IMAGE and true is returned.
+  */
struct rspamd_image *img;
+ const char *ext = part->detected_ext;
- img = rspamd_maybe_process_image(task->task_pool, &part->parsed_data);
-
- if (img != NULL) {
- msg_debug_images("detected %s image of size %ud x %ud",
- rspamd_image_type_str(img->type),
- img->width, img->height);
-
- if (part->cd) {
- img->filename = &part->cd->filename;
+ if (ext != NULL && part->parsed_data.len > 0) {
+ /* Prefer Lua Magic decision; do not re-detect by magic */
+ if (g_ascii_strcasecmp(ext, "png") == 0) {
+ img = process_png_image(task->task_pool, &part->parsed_data);
}
+ else if (g_ascii_strcasecmp(ext, "jpg") == 0 || g_ascii_strcasecmp(ext, "jpeg") == 0) {
+ img = process_jpg_image(task->task_pool, &part->parsed_data);
+ }
+ else if (g_ascii_strcasecmp(ext, "gif") == 0) {
+ img = process_gif_image(task->task_pool, &part->parsed_data);
+ }
+ else if (g_ascii_strcasecmp(ext, "bmp") == 0) {
+ img = process_bmp_image(task->task_pool, &part->parsed_data);
+ }
+ else {
+ /* Unsupported image subtype for structural parsing; skip without re-magic */
+ return false;
+ }
+ }
+ else {
+ /* Fallback for legacy/unknown cases */
+ img = rspamd_maybe_process_image(task->task_pool, &part->parsed_data);
+ }
- img->parent = part;
-
- part->part_type = RSPAMD_MIME_PART_IMAGE;
- part->specific.img = img;
+ if (img == NULL) {
+ return false;
+ }
- return true;
+ /* NOTE(review): the removed msg_debug_images() log of type and size has no replacement here - confirm it was dropped on purpose */
+ img->parent = part;
+ if (part->cd) {
+ img->filename = &part->cd->filename;
}
- return false;
+ part->specific.img = img;
+ part->part_type = RSPAMD_MIME_PART_IMAGE;
+ /* NOTE(review): img->filename is only set above when part->cd already existed; the zero-filled cd allocated below is not attached to img - confirm this is intended */
+ if (part->cd == NULL) {
+ part->cd = rspamd_mempool_alloc0(task->task_pool, sizeof(*part->cd));
+ }
+ return true;
}
const char *
rspamd_image_process_part(task, part);
}
}
-}
\ No newline at end of file
+}
rspamd_archives_process(task);
+ /*
+  * Second pass: fill detected_* for parts not decided during parsing.
+  * Runs after rspamd_archives_process(): the lua_magic callback is invoked
+  * for every part that has data and either has no detection result yet or
+  * was detected as a generic archive (zip/rar/7z/gz), to refine the result.
+  * The callback gets (mime_part, task) and is expected to return an extension
+  * string and a table whose "ct", "type" and "no_text" fields are read below.
+  */
+ if (L && task->cfg->mime_parser_cfg &&
+     rspamd_mime_parser_get_lua_magic_cbref(task->cfg->mime_parser_cfg) != -1) {
+     const int magic_cbref = rspamd_mime_parser_get_lua_magic_cbref(task->cfg->mime_parser_cfg);
+     unsigned int j;
+     struct rspamd_mime_part *pp;
+
+     PTR_ARRAY_FOREACH(MESSAGE_FIELD(task, parts), j, pp)
+     {
+         /* no detection yet */
+         gboolean undetected = (pp->detected_type == NULL && pp->detected_ext == NULL);
+         /* refine generic archives */
+         gboolean generic_archive = pp->detected_ext != NULL &&
+             (g_ascii_strcasecmp(pp->detected_ext, "zip") == 0 ||
+              g_ascii_strcasecmp(pp->detected_ext, "rar") == 0 ||
+              g_ascii_strcasecmp(pp->detected_ext, "7z") == 0 ||
+              g_ascii_strcasecmp(pp->detected_ext, "gz") == 0);
+
+         if (pp->parsed_data.len > 0 && (undetected || generic_archive)) {
+             struct rspamd_mime_part **pmime;
+             struct rspamd_task **ptask;
+             /* Remember the stack level so only our own values are dropped */
+             int saved_top = lua_gettop(L);
+             int err_idx2;
+
+             lua_pushcfunction(L, &rspamd_lua_traceback);
+             err_idx2 = lua_gettop(L);
+             lua_rawgeti(L, LUA_REGISTRYINDEX, magic_cbref);
+             pmime = lua_newuserdata(L, sizeof(struct rspamd_mime_part *));
+             rspamd_lua_setclass(L, rspamd_mimepart_classname, -1);
+             *pmime = pp;
+             ptask = lua_newuserdata(L, sizeof(struct rspamd_task *));
+             rspamd_lua_setclass(L, rspamd_task_classname, -1);
+             *ptask = task;
+
+             if (lua_pcall(L, 2, 2, err_idx2) == 0) {
+                 /* Results: extension at -2, description table at -1 */
+                 if (lua_istable(L, -1)) {
+                     const char *mb;
+                     if (lua_isstring(L, -2)) {
+                         pp->detected_ext = rspamd_mempool_strdup(task->task_pool, lua_tostring(L, -2));
+                     }
+                     lua_pushstring(L, "ct");
+                     lua_gettable(L, -2);
+                     if (lua_isstring(L, -1)) {
+                         mb = lua_tostring(L, -1);
+                         if (mb) {
+                             rspamd_ftok_t srch;
+                             srch.begin = mb;
+                             srch.len = strlen(mb);
+                             pp->detected_ct = rspamd_content_type_parse(srch.begin, srch.len, task->task_pool);
+                         }
+                     }
+                     lua_pop(L, 1);
+                     lua_pushstring(L, "type");
+                     lua_gettable(L, -2);
+                     if (lua_isstring(L, -1)) {
+                         pp->detected_type = rspamd_mempool_strdup(task->task_pool, lua_tostring(L, -1));
+                     }
+                     lua_pop(L, 1);
+                     lua_pushstring(L, "no_text");
+                     lua_gettable(L, -2);
+                     if (lua_isboolean(L, -1) && lua_toboolean(L, -1)) {
+                         pp->flags |= RSPAMD_MIME_PART_NO_TEXT_EXTRACTION;
+                     }
+                     lua_pop(L, 1);
+                 }
+             }
+             else {
+                 msg_err_task("second-pass detect type: %s", lua_tostring(L, -1));
+             }
+             /*
+              * Restore the stack to the level seen before the traceback handler
+              * was pushed; resetting to 0 would also discard values that were
+              * already on the stack before this block ran.
+              */
+             lua_settop(L, saved_top);
+         }
+     }
+ }
+
if (L) {
old_top = lua_gettop(L);
}
if (rspamd_lua_require_function(cfg->mime_parser_cfg->L, "lua_magic", "detect_mime_part")) {
cfg->mime_parser_cfg->lua_magic_detect_cbref = luaL_ref(cfg->mime_parser_cfg->L, LUA_REGISTRYINDEX);
}
+ else {
+ msg_err("fatal error: cannot load lua_magic.detect_mime_part (see previous errors)");
+ lua_settop(cfg->mime_parser_cfg->L, old_top);
+ g_abort();
+ }
lua_settop(cfg->mime_parser_cfg->L, old_top);
}
+ else if (!cfg->mime_parser_cfg->L) {
+ msg_err("fatal error: lua state is not initialised for mime parser");
+ g_abort();
+ }
}
return cfg->mime_parser_cfg;
}
-void rspamd_mime_parser_free_shared(struct rspamd_mime_parser_config *unused)
+/**
+ * Releases a shared mime parser config: unreferences the registered
+ * lua_magic callback (if any), destroys the boundary multipattern (if any)
+ * and frees the structure itself. A NULL argument is accepted and ignored.
+ * NOTE(review): cfg->L must still be a live Lua state when this runs, and
+ * g_free() assumes cfg was allocated with the GLib allocator - confirm both
+ * against the init function and the teardown order.
+ * @param cfg config to release, may be NULL
+ */
+void rspamd_mime_parser_free_shared(struct rspamd_mime_parser_config *cfg)
{
- /* noop: lifetime tied to process */
+ if (cfg == NULL) {
+ return;
+ }
+
+ /* Unref Lua callback if registered */
+ if (cfg->L && cfg->lua_magic_detect_cbref != -1) {
+ int old_top = lua_gettop(cfg->L);
+ luaL_unref(cfg->L, LUA_REGISTRYINDEX, cfg->lua_magic_detect_cbref);
+ /* Reset to -1 so a repeated check sees the ref as unregistered */
+ cfg->lua_magic_detect_cbref = -1;
+ lua_settop(cfg->L, old_top);
+ }
+
+ /* Destroy multipattern */
+ if (cfg->mp_boundary) {
+ rspamd_multipattern_destroy(cfg->mp_boundary);
+ cfg->mp_boundary = NULL;
+ }
+
+ g_free(cfg);
}
int rspamd_mime_parser_get_lua_magic_cbref(const struct rspamd_mime_parser_config *cfg)
if (L && task->cfg->mime_parser_cfg &&
rspamd_mime_parser_get_lua_magic_cbref(task->cfg->mime_parser_cfg) != -1) {
+ msg_debug_mime("will call lua_magic.detect_mime_part for part #%ud", npart->part_number);
old_top = lua_gettop(L);
lua_pushcfunction(L, &rspamd_lua_traceback);
err_idx = lua_gettop(L);
lua_settop(L, old_top);
}
+ else {
+ int cbref = -1;
+ if (task->cfg && task->cfg->mime_parser_cfg) {
+ cbref = rspamd_mime_parser_get_lua_magic_cbref(task->cfg->mime_parser_cfg);
+ }
+ msg_debug_mime("skip lua_magic for part #%ud: L=%p, cbref=%d",
+ npart->part_number, (void *) L, cbref);
+ }
/* Fallback: if nothing detected but declared CT is text, set detected_type to text */
if (npart->detected_type == NULL && npart->ct &&
struct rspamd_mime_parser_config;
+#ifdef __cplusplus
+extern "C" {
+#endif
+
/* Initialize shared mime parser config (stores Lua refs, precompiled data) */
struct rspamd_mime_parser_config *rspamd_mime_parser_init_shared(struct rspamd_config *cfg);
void rspamd_mime_parser_free_shared(struct rspamd_mime_parser_config *cfg);
/* Accessors */
int rspamd_mime_parser_get_lua_magic_cbref(const struct rspamd_mime_parser_config *cfg);
-#ifdef __cplusplus
-extern "C" {
-#endif
-
struct rspamd_task;
struct rspamd_mime_part;
#include "cfg_file.h"
#include "rspamd.h"
#include "cfg_file_private.h"
+#include "libmime/mime_parser.h"
#include "maps/map.h"
#include "maps/map_helpers.h"
luaL_unref(RSPAMD_LUA_CFG_STATE(cfg), LUA_REGISTRYINDEX, sc->cbref);
}
+ /* Free mime parser shared config if created */
+ if (cfg->mime_parser_cfg) {
+ rspamd_mime_parser_free_shared(cfg->mime_parser_cfg);
+ cfg->mime_parser_cfg = nullptr;
+ }
+
DL_FOREACH_SAFE(cfg->setting_ids, set, stmp)
{
REF_RELEASE(set);