From: Vsevolod Stakhov Date: Thu, 9 Oct 2025 07:31:51 +0000 (+0100) Subject: Merge branch 'master' into vstakhov-fuzzy-tcp-rework X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=27b416a536e22dab25febba3c8fa3ccbfdbfbde0;p=thirdparty%2Frspamd.git Merge branch 'master' into vstakhov-fuzzy-tcp-rework Resolved conflict in src/plugins/fuzzy_check.c by including both: - HTML shingles configuration parsing from master - TCP connection initialization from feature branch Fixed trailing whitespace in config files from master. --- 27b416a536e22dab25febba3c8fa3ccbfdbfbde0 diff --cc conf/modules.d/fuzzy_check_html.conf index 0000000000,face9c916d..ed4631e9ad mode 000000,100644..100644 --- a/conf/modules.d/fuzzy_check_html.conf +++ b/conf/modules.d/fuzzy_check_html.conf @@@ -1,0 -1,114 +1,114 @@@ + # HTML Fuzzy Hashing Configuration Example + # + # This configuration demonstrates how to use HTML fuzzy hashing for: + # 1. Detecting spam campaigns with similar HTML structure + # 2. Phishing detection (similar structure, different CTA domains) + # 3. Brand protection (legitimate templates vs. fake emails) + + fuzzy_check { + # Example rule for HTML structure matching + rule "HTML_FUZZY" { + # Standard fuzzy storage configuration + servers = "localhost:11335"; - ++ + # Encryption (optional, recommended for production) + # encryption_key = "your_base32_encoded_public_key"; + # fuzzy_key = "your_hashing_key"; + # fuzzy_shingles_key = "your_shingles_key"; - ++ + # Algorithm for shingles (mumhash recommended for HTML) + algorithm = "mumhash"; - ++ + # Enable HTML fuzzy hashing + html_shingles = true; - ++ + # Minimum number of HTML tags to generate hash + # (prevents hashing of trivial HTML snippets) + min_html_tags = 15; - ++ + # Weight multiplier for HTML fuzzy matches + # Can be < 1.0 to reduce impact, or > 1.0 to increase + html_weight = 1.0; - ++ + # Regular fuzzy check settings + symbol = "FUZZY_HTML"; + max_score = 20.0; - ++ + # Fuzzy flag mappings + fuzzy_map = { + # Whitelist: known legitimate HTML structures + "FUZZY_HTML_WHITELIST" { + flag = 1; + max_score = 20.0; + } + # Blacklist: known spam/phishing HTML structures + "FUZZY_HTML_BLACKLIST" { + flag = 2; + max_score = 20.0; + } + } - ++ + # Optional: skip specific hashes + # skip_hashes = "${LOCAL_CONFDIR}/local.d/fuzzy_skip_html.map"; + } - ++ + # Example: Combined text + HTML rule + rule "COMBINED_FUZZY" { + servers = "localhost:11335"; + algorithm = "mumhash"; - ++ + # Enable both text and HTML fuzzy hashing + html_shingles = true; + min_html_tags = 10; - ++ + # This rule will generate: + # - Text fuzzy hashes (from content) + # - HTML fuzzy hashes (from structure) + # Both sent to same storage with same flag - ++ + symbol = "FUZZY_COMBINED"; + max_score = 30.0; - ++ + fuzzy_map = { + "FUZZY_COMBINED_WHITE" { + flag = 10; + max_score = 30.0; + } + "FUZZY_COMBINED_SPAM" { + flag = 11; + max_score = 30.0; + } + } + } - ++ + # Example: Phishing detection rule (higher weight for HTML) + rule "PHISHING_DETECTION" { + servers = "localhost:11335"; + algorithm = "mumhash"; - ++ + html_shingles = true; + min_html_tags = 20; - ++ + # Higher weight for HTML matches = prioritize structure over content + html_weight = 1.5; - ++ + symbol = "FUZZY_PHISHING"; + max_score = 25.0; - ++ + fuzzy_map = { + # Known phishing HTML templates + "FUZZY_PHISHING_HTML" { + flag = 20; + max_score = 25.0; + } + # Known legitimate brands (for comparison) + "FUZZY_LEGIT_BRANDS" { + flag = 21; + max_score = -25.0; # Negative score = whitelist + } + } + } + } + + # Additional configuration for phishing detection rules + # See rules/fuzzy_html_phishing.lua for Lua-based detection logic diff --cc src/libserver/protocol.c index b085c69d75,368e6145f4..0709d0ad2e --- a/src/libserver/protocol.c +++ b/src/libserver/protocol.c @@@ -663,98 -663,176 +663,176 @@@ rspamd_protocol_handle_headers(struct r * We must ignore User header in case of spamc, as SA has * different meaning of this header */ - msg_debug_protocol("read user header, value: %T", hv_tok); - if (!RSPAMD_TASK_IS_SPAMC(task)) { - task->auth_user = rspamd_mempool_ftokdup(task->task_pool, - hv_tok); - } - else { - msg_info_protocol("ignore user header: legacy SA protocol"); + msg_debug_protocol("read user header, value: %T", hv_tok); + if (!RSPAMD_TASK_IS_SPAMC(task)) { + task->auth_user = rspamd_mempool_ftokdup(task->task_pool, + hv_tok); + } + else { + msg_info_protocol("ignore user header: legacy SA protocol"); + } } - } - IF_HEADER(URLS_HEADER) - { - msg_debug_protocol("read urls header, value: %T", hv_tok); - - srch.begin = "extended"; - srch.len = 8; + IF_HEADER(URLS_HEADER) + { + msg_debug_protocol("read urls header, value: %T", hv_tok); - if (rspamd_ftok_casecmp(hv_tok, &srch) == 0) { - task->protocol_flags |= RSPAMD_TASK_PROTOCOL_FLAG_EXT_URLS; - msg_debug_protocol("extended urls information"); - } + srch.begin = "extended"; + srch.len = 8; - /* TODO: add more formats there */ - } - IF_HEADER(USER_AGENT_HEADER) - { - msg_debug_protocol("read user-agent header, value: %T", hv_tok); + if (rspamd_ftok_casecmp(hv_tok, &srch) == 0) { + task->protocol_flags |= RSPAMD_TASK_PROTOCOL_FLAG_EXT_URLS; + msg_debug_protocol("extended urls information"); + } - if (hv_tok->len == 6 && - rspamd_lc_cmp(hv_tok->begin, "rspamc", 6) == 0) { - task->protocol_flags |= RSPAMD_TASK_PROTOCOL_FLAG_LOCAL_CLIENT; + /* TODO: add more formats there */ } - } - break; - case 'l': - case 'L': - IF_HEADER(NO_LOG_HEADER) - { - msg_debug_protocol("read log header, value: %T", hv_tok); - srch.begin = "no"; - srch.len = 2; + IF_HEADER(USER_AGENT_HEADER) + { + msg_debug_protocol("read user-agent header, value: %T", hv_tok); - if (rspamd_ftok_casecmp(hv_tok, &srch) == 0) { - task->flags |= RSPAMD_TASK_FLAG_NO_LOG; + if (hv_tok->len == 6 && + rspamd_lc_cmp(hv_tok->begin, "rspamc", 6) == 0) { + task->protocol_flags |= RSPAMD_TASK_PROTOCOL_FLAG_LOCAL_CLIENT; + } } - } - IF_HEADER(LOG_TAG_HEADER) - { - msg_debug_protocol("read log-tag header, value: %T", hv_tok); - /* Ensure that a tag is valid */ - if (rspamd_fast_utf8_validate(hv_tok->begin, hv_tok->len) == 0) { - memcpy(task->task_pool->tag.uid, hv_tok->begin, - MIN(hv_tok->len, sizeof(task->task_pool->tag.uid))); + break; + case 'l': + case 'L': + IF_HEADER(NO_LOG_HEADER) + { + msg_debug_protocol("read log header, value: %T", hv_tok); + srch.begin = "no"; + srch.len = 2; + + if (rspamd_ftok_casecmp(hv_tok, &srch) == 0) { + task->flags |= RSPAMD_TASK_FLAG_NO_LOG; + } } - } - break; - case 'm': - case 'M': - IF_HEADER(MTA_TAG_HEADER) - { - char *mta_tag; - mta_tag = rspamd_mempool_ftokdup(task->task_pool, hv_tok); - rspamd_mempool_set_variable(task->task_pool, - RSPAMD_MEMPOOL_MTA_TAG, - mta_tag, NULL); - msg_debug_protocol("read MTA-Tag header, value: %s", mta_tag); - } - IF_HEADER(MTA_NAME_HEADER) - { - char *mta_name; - mta_name = rspamd_mempool_ftokdup(task->task_pool, hv_tok); - rspamd_mempool_set_variable(task->task_pool, - RSPAMD_MEMPOOL_MTA_NAME, - mta_name, NULL); - msg_debug_protocol("read MTA-Name header, value: %s", mta_name); - } - IF_HEADER(MILTER_HEADER) - { - task->protocol_flags |= RSPAMD_TASK_PROTOCOL_FLAG_MILTER; - msg_debug_protocol("read Milter header, value: %T", hv_tok); - } - break; - case 't': - case 'T': - IF_HEADER(TLS_CIPHER_HEADER) - { - task->flags |= RSPAMD_TASK_FLAG_SSL; - msg_debug_protocol("read TLS cipher header, value: %T", hv_tok); - } - break; - case 'x': - case 'X': - IF_HEADER("X-Rspamd-Mail-Esmtp-Args") - { - /* Parse MAIL ESMTP arguments from HTTP header */ - if (!task->mail_esmtp_args) { - task->mail_esmtp_args = g_hash_table_new_full( - rspamd_ftok_icase_hash, - rspamd_ftok_icase_equal, - rspamd_fstring_mapped_ftok_free, - rspamd_fstring_mapped_ftok_free); - } - - /* Parse KEY=VALUE format */ - const char *p = hv_tok->begin; - const char *end = hv_tok->begin + hv_tok->len; - const char *eq = memchr(p, '=', hv_tok->len); - - if (eq && eq > p) { - rspamd_fstring_t *key = rspamd_fstring_new_init(p, eq - p); - rspamd_fstring_t *value = rspamd_fstring_new_init(eq + 1, end - eq - 1); - rspamd_ftok_t *key_tok = rspamd_ftok_map(key); - rspamd_ftok_t *value_tok = rspamd_ftok_map(value); - - g_hash_table_replace(task->mail_esmtp_args, key_tok, value_tok); - msg_debug_protocol("parsed mail ESMTP arg: %T=%T", key_tok, value_tok); + IF_HEADER(LOG_TAG_HEADER) + { + msg_debug_protocol("read log-tag header, value: %T", hv_tok); + /* Ensure that a tag is valid */ + if (rspamd_fast_utf8_validate(hv_tok->begin, hv_tok->len) == 0) { + memcpy(task->task_pool->tag.uid, hv_tok->begin, + MIN(hv_tok->len, sizeof(task->task_pool->tag.uid))); + } } - } - IF_HEADER("X-Rspamd-Rcpt-Esmtp-Args") - { - /* Parse RCPT ESMTP arguments from HTTP header */ - if (!task->rcpt_esmtp_args) { - task->rcpt_esmtp_args = g_ptr_array_new(); - } - - /* Parse IDX:KEY=VALUE format */ - const char *p = hv_tok->begin; - const char *end = hv_tok->begin + hv_tok->len; - const char *colon = memchr(p, ':', hv_tok->len); - - if (colon && colon > p) { - char *endptr; - int rcpt_idx = strtol(p, &endptr, 10); - - if (endptr == colon) { - /* Ensure we have enough entries in the array */ - while (task->rcpt_esmtp_args->len <= rcpt_idx) { - g_ptr_array_add(task->rcpt_esmtp_args, NULL); - } + break; + case 'm': + case 'M': + IF_HEADER(MTA_TAG_HEADER) + { + char *mta_tag; + mta_tag = rspamd_mempool_ftokdup(task->task_pool, hv_tok); + rspamd_mempool_set_variable(task->task_pool, + RSPAMD_MEMPOOL_MTA_TAG, + mta_tag, NULL); + msg_debug_protocol("read MTA-Tag header, value: %s", mta_tag); + } + IF_HEADER(MTA_NAME_HEADER) + { + char *mta_name; + mta_name = rspamd_mempool_ftokdup(task->task_pool, hv_tok); + rspamd_mempool_set_variable(task->task_pool, + RSPAMD_MEMPOOL_MTA_NAME, + mta_name, NULL); + msg_debug_protocol("read MTA-Name header, value: %s", mta_name); + } + IF_HEADER(MILTER_HEADER) + { + task->protocol_flags |= RSPAMD_TASK_PROTOCOL_FLAG_MILTER; + msg_debug_protocol("read Milter header, value: %T", hv_tok); + } + break; + case 't': + case 'T': + IF_HEADER(TLS_CIPHER_HEADER) + { + task->flags |= RSPAMD_TASK_FLAG_SSL; + msg_debug_protocol("read TLS cipher header, value: %T", hv_tok); + } + break; ++ case 'x': ++ case 'X': ++ IF_HEADER("X-Rspamd-Mail-Esmtp-Args") ++ { ++ /* Parse MAIL ESMTP arguments from HTTP header */ ++ if (!task->mail_esmtp_args) { ++ task->mail_esmtp_args = g_hash_table_new_full( ++ rspamd_ftok_icase_hash, ++ rspamd_ftok_icase_equal, ++ rspamd_fstring_mapped_ftok_free, ++ rspamd_fstring_mapped_ftok_free); ++ } + - /* Get or create hash table for this recipient */ - GHashTable *rcpt_args = g_ptr_array_index(task->rcpt_esmtp_args, rcpt_idx); - if (!rcpt_args) { - rcpt_args = g_hash_table_new_full( - rspamd_ftok_icase_hash, - rspamd_ftok_icase_equal, - rspamd_fstring_mapped_ftok_free, - rspamd_fstring_mapped_ftok_free); - g_ptr_array_index(task->rcpt_esmtp_args, rcpt_idx) = rcpt_args; - } ++ /* Parse KEY=VALUE format */ ++ const char *p = hv_tok->begin; ++ const char *end = hv_tok->begin + hv_tok->len; ++ const char *eq = memchr(p, '=', hv_tok->len); + - /* Parse KEY=VALUE */ - p = colon + 1; - const char *eq = memchr(p, '=', end - p); ++ if (eq && eq > p) { ++ rspamd_fstring_t *key = rspamd_fstring_new_init(p, eq - p); ++ rspamd_fstring_t *value = rspamd_fstring_new_init(eq + 1, end - eq - 1); ++ rspamd_ftok_t *key_tok = rspamd_ftok_map(key); ++ rspamd_ftok_t *value_tok = rspamd_ftok_map(value); + - if (eq && eq > p) { - rspamd_fstring_t *key = rspamd_fstring_new_init(p, eq - p); - rspamd_fstring_t *value = rspamd_fstring_new_init(eq + 1, end - eq - 1); - rspamd_ftok_t *key_tok = rspamd_ftok_map(key); - rspamd_ftok_t *value_tok = rspamd_ftok_map(value); ++ g_hash_table_replace(task->mail_esmtp_args, key_tok, value_tok); ++ msg_debug_protocol("parsed mail ESMTP arg: %T=%T", key_tok, value_tok); ++ } ++ } ++ IF_HEADER("X-Rspamd-Rcpt-Esmtp-Args") ++ { ++ /* Parse RCPT ESMTP arguments from HTTP header */ ++ if (!task->rcpt_esmtp_args) { ++ task->rcpt_esmtp_args = g_ptr_array_new(); ++ } + - g_hash_table_replace(rcpt_args, key_tok, value_tok); - msg_debug_protocol("parsed rcpt ESMTP arg for idx %d: %T=%T", rcpt_idx, key_tok, value_tok); ++ /* Parse IDX:KEY=VALUE format */ ++ const char *p = hv_tok->begin; ++ const char *end = hv_tok->begin + hv_tok->len; ++ const char *colon = memchr(p, ':', hv_tok->len); ++ ++ if (colon && colon > p) { ++ char *endptr; ++ int rcpt_idx = strtol(p, &endptr, 10); ++ ++ if (endptr == colon) { ++ /* Ensure we have enough entries in the array */ ++ while (task->rcpt_esmtp_args->len <= rcpt_idx) { ++ g_ptr_array_add(task->rcpt_esmtp_args, NULL); ++ } ++ ++ /* Get or create hash table for this recipient */ ++ GHashTable *rcpt_args = g_ptr_array_index(task->rcpt_esmtp_args, rcpt_idx); ++ if (!rcpt_args) { ++ rcpt_args = g_hash_table_new_full( ++ rspamd_ftok_icase_hash, ++ rspamd_ftok_icase_equal, ++ rspamd_fstring_mapped_ftok_free, ++ rspamd_fstring_mapped_ftok_free); ++ g_ptr_array_index(task->rcpt_esmtp_args, rcpt_idx) = rcpt_args; ++ } ++ ++ /* Parse KEY=VALUE */ ++ p = colon + 1; ++ const char *eq = memchr(p, '=', end - p); ++ ++ if (eq && eq > p) { ++ rspamd_fstring_t *key = rspamd_fstring_new_init(p, eq - p); ++ rspamd_fstring_t *value = rspamd_fstring_new_init(eq + 1, end - eq - 1); ++ rspamd_ftok_t *key_tok = rspamd_ftok_map(key); ++ rspamd_ftok_t *value_tok = rspamd_ftok_map(value); ++ ++ g_hash_table_replace(rcpt_args, key_tok, value_tok); ++ msg_debug_protocol("parsed rcpt ESMTP arg for idx %d: %T=%T", rcpt_idx, key_tok, value_tok); ++ } + } + } + } - } - break; - default: - msg_debug_protocol("generic header: %T", hn_tok); - break; ++ break; + default: + msg_debug_protocol("generic header: %T", hn_tok); + break; } rspamd_task_add_request_header (task, hn_tok, hv_tok); diff --cc src/plugins/fuzzy_check.c index d11339feb9,467fb8de38..c0d4faef14 --- a/src/plugins/fuzzy_check.c +++ b/src/plugins/fuzzy_check.c @@@ -1962,17 -804,18 +1972,29 @@@ fuzzy_parse_rule(struct rspamd_config * rule->weight_threshold = ucl_object_todouble(value); } + if ((value = ucl_object_lookup(obj, "html_shingles")) != NULL) { + rule->html_shingles = ucl_object_toboolean(value); + } + + if ((value = ucl_object_lookup(obj, "min_html_tags")) != NULL) { + rule->min_html_tags = ucl_object_toint(value); + } + + if ((value = ucl_object_lookup(obj, "html_weight")) != NULL) { + rule->html_weight = ucl_object_todouble(value); + } + + /* Initialize rate tracker */ + rule->rate_tracker.requests_count = 0; + rule->rate_tracker.window_start = 0; + + /* Initialize TCP connection pool - array of connections with proper free function */ + rule->tcp_connections = g_ptr_array_new_with_free_func(fuzzy_tcp_connection_unref); + + /* Initialize global pending requests pool - keyed by tag */ + rule->pending_requests = g_hash_table_new_full(g_direct_hash, g_direct_equal, + NULL, g_free); + /* * Process rule in Lua */ diff --cc test/functional/configs/fuzzy_html_test.conf index 0000000000,4166e97b12..aaeefd4ada mode 000000,100644..100644 --- a/test/functional/configs/fuzzy_html_test.conf +++ b/test/functional/configs/fuzzy_html_test.conf @@@ -1,0 -1,53 +1,53 @@@ + # Test configuration for HTML fuzzy hashing + + .include(duplicate=append,priority=0) "{= env.TESTDIR =}/configs/plugins.conf" + .include(duplicate=merge,priority=0) "{= env.TESTDIR =}/configs/statistic.conf" + + fuzzy_check { + # Test rule for HTML fuzzy hashing + rule "TEST_HTML_FUZZY" { + servers = "localhost:11335"; + algorithm = "mumhash"; - ++ + # Enable HTML fuzzy hashing + html_shingles = true; + min_html_tags = 5; # Low threshold for testing + html_weight = 1.0; - ++ + symbol = "FUZZY_HTML_TEST"; + max_score = 10.0; - ++ + # Skip encryption for testing + # encryption_key = ""; - ++ + fuzzy_map = { + "FUZZY_HTML_WHITELIST" { + flag = 1; + max_score = 10.0; + } + "FUZZY_HTML_SPAM" { + flag = 2; + max_score = 10.0; + } + } + } - ++ + # Rule with both text and HTML enabled + rule "TEST_COMBINED" { + servers = "localhost:11335"; + algorithm = "mumhash"; - ++ + html_shingles = true; + min_html_tags = 3; - ++ + symbol = "FUZZY_COMBINED_TEST"; + max_score = 15.0; - ++ + fuzzy_map = { + "FUZZY_COMBINED_MATCH" { + flag = 10; + max_score = 15.0; + } + } + } + } diff --cc test/functional/messages/html_phishing.eml index 0000000000,e328c218e4..92cd3cddc5 mode 000000,100644..100644 --- a/test/functional/messages/html_phishing.eml +++ b/test/functional/messages/html_phishing.eml @@@ -1,0 -1,28 +1,28 @@@ + From: notification@example.com + To: user@test.com -Subject: Your weekly newsletter ++Subject: Your weekly newsletter + Content-Type: text/html; charset=utf-8 + + + + + Newsletter + + +
+ Logo +
+
+

Weekly Newsletter

+

URGENT: Verify your account now!

+
+

Security Alert

+

Your account has been compromised click here immediately

+ Verify Now +
+
+ + +