return nullptr;
}
+ /* For multiple space-separated classes, use only the first class */
+ gsize first_class_len = len;
+ for (gsize i = 0; i < len; i++) {
+ if (g_ascii_isspace(cls[i])) {
+ first_class_len = i;
+ break;
+ }
+ }
+
/* Skip if mostly digits */
unsigned int digit_count = 0;
- for (gsize i = 0; i < len; i++) {
+ for (gsize i = 0; i < first_class_len; i++) {
if (g_ascii_isdigit(cls[i])) {
digit_count++;
}
}
- if (digit_count > len / 2) {
+ if (digit_count > first_class_len / 2) {
return nullptr;
}
- auto *result = static_cast<char *>(rspamd_mempool_alloc(pool, len + 1));
+ auto *result = static_cast<char *>(rspamd_mempool_alloc(pool, first_class_len + 1));
gsize out_len = 0;
- for (gsize i = 0; i < len && out_len < 32; i++) {
+ for (gsize i = 0; i < first_class_len && out_len < 32; i++) {
char c = cls[i];
if (g_ascii_isalnum(c) || c == '-' || c == '_') {
result[out_len++] = g_ascii_tolower(c);
auto *hc_ptr = html_content::from_ptr(html_content);
- if (!hc_ptr || hc_ptr->all_tags.empty()) {
+ if (!hc_ptr) {
+ return nullptr;
+ }
+
+ if (hc_ptr->all_tags.empty()) {
+ /* Empty HTML - no tags */
return nullptr;
}
html_extract_structural_tokens(hc_ptr, pool, tokens, cta_domains, all_domains);
if (tokens.empty()) {
- /* Empty HTML structure */
+ /* Empty HTML structure after filtering */
return nullptr;
}
unsigned int additional_length;
unsigned char *additional_data;
- msg_debug_fuzzy_check("fuzzy_cmd_from_html_part called for rule %s", rule->name);
-
/* Check if HTML shingles are enabled for this rule */
if (!rule->html_shingles) {
- msg_debug_fuzzy_check("HTML shingles disabled for rule %s", rule->name);
return NULL;
}
/* Check if this is an HTML part */
if (!IS_TEXT_PART_HTML(part) || part->html == NULL) {
- msg_debug_fuzzy_check("Part is not HTML or html is NULL");
return NULL;
}
return NULL;
}
- msg_debug_fuzzy_check("Proceeding to generate HTML fuzzy hash, tags_count=%d",
- part->html_features ? part->html_features->tags_count : 0);
+ /*
+ * HTML fuzzy uses a separate cache key to avoid conflicts with text fuzzy.
+ * A text part can have both a text hash (short text, no shingles) and an HTML hash.
+ */
+ char html_cache_key[64];
+ int key_part;
+ struct rspamd_cached_shingles **html_cached_ptr;
- cached = fuzzy_cmd_get_cached(rule, task, mp);
+ memcpy(&key_part, rule->shingles_key->str, sizeof(key_part));
+ rspamd_snprintf(html_cache_key, sizeof(html_cache_key), "%s%d_html",
+ rule->algorithm_str, key_part);
- if (cached) {
- /* Copy from cache */
+ html_cached_ptr = (struct rspamd_cached_shingles **) rspamd_mempool_get_variable(
+ task->task_pool, html_cache_key);
+
+ if (html_cached_ptr && html_cached_ptr[mp->part_number]) {
+ cached = html_cached_ptr[mp->part_number];
+ /* Copy from HTML-specific cache */
additional_length = cached->additional_length;
additional_data = cached->additional_data;
sizeof(*encshcmd) + additional_length);
shcmd = &encshcmd->cmd;
- msg_debug_fuzzy_check("generating HTML shingles for part with %d tags",
- part->html_features ? part->html_features->tags_count : 0);
-
html_sh = rspamd_shingles_from_html(part->html,
- rule->shingles_key->str, task->task_pool,
+ (const unsigned char *) rule->shingles_key->str, task->task_pool,
rspamd_shingles_default_filter, NULL,
rule->alg);
if (rule->html_shingles && !(flags & FUZZY_CHECK_FLAG_NOHTML)) {
struct fuzzy_cmd_io *html_io;
- msg_debug_fuzzy_check("Attempting HTML fuzzy hash for rule %s", rule->name);
html_io = fuzzy_cmd_from_html_part(task, rule, c, flag, value,
part, mime_part);
if (html_io) {
/* Add HTML hash as separate command */
- msg_debug_fuzzy_check("HTML fuzzy hash generated and added to commands");
g_ptr_array_add(res, html_io);
}
- else {
- msg_debug_fuzzy_check("HTML fuzzy hash generation returned NULL");
- }
- }
- else {
- msg_debug_fuzzy_check("HTML fuzzy skipped: html_shingles=%d, NOHTML flag=%d",
- rule->html_shingles, !!(flags & FUZZY_CHECK_FLAG_NOHTML));
}
}
else if (mime_part->part_type == RSPAMD_MIME_PART_IMAGE &&
*** Variables ***
${HTML_TEMPLATE_1} ${RSPAMD_TESTDIR}/messages/html_template_1.eml
${HTML_TEMPLATE_1_VAR} ${RSPAMD_TESTDIR}/messages/html_template_1_variation.eml
+${HTML_TEMPLATE_1_FUZZY} ${RSPAMD_TESTDIR}/messages/html_template_1_fuzzy.eml
${HTML_PHISHING} ${RSPAMD_TESTDIR}/messages/html_phishing.eml
*** Keywords ***
Scan File ${HTML_TEMPLATE_1}
Expect Symbol ${FLAG1_SYMBOL}
-HTML Fuzzy Variation Test
- [Documentation] Check variation of same template (different text, same HTML structure)
+HTML Fuzzy Exact Match Variation Test
+ [Documentation] Check exact match with different text but identical HTML structure
IF ${RSPAMD_FUZZY_HTML_ADD} == 0
Fail "HTML Fuzzy Add was not run"
END
Scan File ${HTML_TEMPLATE_1_VAR}
- # Should match via HTML shingles despite different text
+ # Should match exactly - same HTML structure, only text differs
+ Expect Symbol ${FLAG1_SYMBOL}
+
+HTML Fuzzy Similarity Test
+ [Documentation] Check fuzzy (similarity) match with slightly different HTML structure
+ IF ${RSPAMD_FUZZY_HTML_ADD} == 0
+ Fail "HTML Fuzzy Add was not run"
+ END
+ Scan File ${HTML_TEMPLATE_1_FUZZY}
+ # Should match via shingles - similar but not identical HTML structure
+ # (added spacer div, extra paragraph, second article)
Expect Symbol ${FLAG1_SYMBOL}
HTML Fuzzy Phishing Test
HTML Fuzzy Exact Match
HTML Fuzzy Check Test
-HTML Fuzzy Template Variation
- HTML Fuzzy Variation Test
+HTML Fuzzy Exact Match With Text Variation
+ HTML Fuzzy Exact Match Variation Test
+
+HTML Fuzzy Similarity Match
+ HTML Fuzzy Similarity Test
HTML Fuzzy Phishing Detection
HTML Fuzzy Phishing Test