]> git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Minor] Use khash instead of GHashTable in fuzzy_check.c
authorVsevolod Stakhov <vsevolod@rspamd.com>
Sat, 1 Nov 2025 11:01:10 +0000 (11:01 +0000)
committerVsevolod Stakhov <vsevolod@rspamd.com>
Sat, 1 Nov 2025 11:01:10 +0000 (11:01 +0000)
src/plugins/fuzzy_check.c

index bd10f52ffa4a5c5929e00fe37ba22ebcd1feb746..a7709aaf0723b9fe5cceefb2bb4a2db5dabea1d0 100644 (file)
@@ -39,6 +39,7 @@
 #include "libserver/worker_util.h"
 #include "libserver/mempool_vars_internal.h"
 #include "libserver/html/html_features.h"
+#include "khash.h"
 #include "fuzzy_wire.h"
 #include "utlist.h"
 #include "ottery.h"
@@ -83,6 +84,9 @@ enum fuzzy_rule_mode {
        fuzzy_rule_read_write
 };
 
+struct fuzzy_tcp_pending_command; /* Forward declaration */
+KHASH_MAP_INIT_INT(fuzzy_pending_hash, struct fuzzy_tcp_pending_command *);
+
 struct fuzzy_rule {
        struct upstream_list *read_servers;  /* Servers for read operations */
        struct upstream_list *write_servers; /* Servers for write operations */
@@ -111,7 +115,7 @@ struct fuzzy_rule {
        gboolean no_share;
        gboolean no_subject;
        gboolean html_shingles;     /* Enable HTML fuzzy hashing */
-       gboolean text_hashes;        /* Enable/disable generation of text hashes */
+       gboolean text_hashes;       /* Enable/disable generation of text hashes */
        unsigned int min_html_tags; /* Minimum tags for HTML hash */
        int learn_condition_cb;
        uint32_t retransmits;
@@ -135,8 +139,8 @@ struct fuzzy_rule {
        } rate_tracker;
 
        /* TCP connection pool - array of connections, one per upstream */
-       GPtrArray *tcp_connections;   /* Array of fuzzy_tcp_connection* */
-       GHashTable *pending_requests; /* Global: tag -> fuzzy_tcp_pending_command */
+       GPtrArray *tcp_connections;                     /* Array of fuzzy_tcp_connection* */
+       khash_t(fuzzy_pending_hash) * pending_requests; /* Global: tag -> fuzzy_tcp_pending_command */
 };
 
 struct fuzzy_ctx {
@@ -568,7 +572,12 @@ fuzzy_free_rule(gpointer r)
 
        /* Clean up pending requests pool */
        if (rule->pending_requests) {
-               g_hash_table_destroy(rule->pending_requests);
+               struct fuzzy_tcp_pending_command *pending;
+
+               kh_foreach_value(rule->pending_requests, pending, {
+                       g_free(pending);
+               });
+               kh_destroy(fuzzy_pending_hash, rule->pending_requests);
        }
 }
 
@@ -914,15 +923,16 @@ fuzzy_tcp_connection_close(struct fuzzy_tcp_connection *conn)
 static void
 fuzzy_tcp_connection_cleanup(struct fuzzy_tcp_connection *conn, const char *reason)
 {
-       GHashTableIter iter;
-       gpointer key, value;
+       khash_t(fuzzy_pending_hash) * pending_ht;
        struct fuzzy_tcp_pending_command *pending;
-       GPtrArray *to_remove;
        GHashTable *sessions_to_check;
-       unsigned int i;
+       GHashTableIter session_iter;
+       struct fuzzy_client_session *session;
        struct rspamd_task *task = NULL;
+       int sessions_checked = 0;
+       unsigned int removed = 0;
 
-       if (!conn || !conn->rule || !conn->rule->pending_requests) {
+       if (!conn || !conn->rule || !(pending_ht = conn->rule->pending_requests)) {
                return;
        }
 
@@ -931,49 +941,39 @@ fuzzy_tcp_connection_cleanup(struct fuzzy_tcp_connection *conn, const char *reas
                         conn->rule->name,
                         reason ? reason : "unknown");
 
-       /* Collect commands to remove and sessions to check */
-       to_remove = g_ptr_array_new();
        sessions_to_check = g_hash_table_new(g_direct_hash, g_direct_equal);
 
-       g_hash_table_iter_init(&iter, conn->rule->pending_requests);
-       while (g_hash_table_iter_next(&iter, &key, &value)) {
-               pending = (struct fuzzy_tcp_pending_command *) value;
+       for (khiter_t k = kh_begin(pending_ht); k != kh_end(pending_ht); ++k) {
+               if (!kh_exist(pending_ht, k)) {
+                       continue;
+               }
+
+               pending = kh_val(pending_ht, k);
 
                if (pending->connection == conn) {
-                       /* Mark command as replied (failed) */
                        if (!(pending->io->flags & FUZZY_CMD_FLAG_REPLIED)) {
                                pending->io->flags |= FUZZY_CMD_FLAG_REPLIED;
                        }
 
-                       /* Collect session for completion check */
                        if (pending->session) {
                                g_hash_table_add(sessions_to_check, pending->session);
                        }
 
-                       g_ptr_array_add(to_remove, key);
-
-                       /* Get task for logging */
                        if (!task && pending->task) {
                                task = pending->task;
                        }
-               }
-       }
 
-       /* Remove pending commands from hash table */
-       for (i = 0; i < to_remove->len; i++) {
-               g_hash_table_remove(conn->rule->pending_requests,
-                                                       g_ptr_array_index(to_remove, i));
+                       g_free(pending);
+                       kh_del(fuzzy_pending_hash, pending_ht, k);
+                       removed++;
+               }
        }
 
-       if (to_remove->len > 0 && task) {
-               msg_warn_task("fuzzy_tcp: cleaned up %d pending commands due to connection failure: %s",
-                                         (int) to_remove->len, reason ? reason : "unknown");
+       if (removed > 0 && task) {
+               msg_warn_task("fuzzy_tcp: cleaned up %u pending commands due to connection failure: %s",
+                                         removed, reason ? reason : "unknown");
        }
 
-       /* Check session completion for all affected sessions */
-       GHashTableIter session_iter;
-       struct fuzzy_client_session *session;
-       int sessions_checked = 0;
        g_hash_table_iter_init(&session_iter, sessions_to_check);
        while (g_hash_table_iter_next(&session_iter, (gpointer *) &session, NULL)) {
                sessions_checked++;
@@ -985,7 +985,6 @@ fuzzy_tcp_connection_cleanup(struct fuzzy_tcp_connection *conn, const char *reas
                                           sessions_checked);
        }
 
-       g_ptr_array_free(to_remove, TRUE);
        g_hash_table_unref(sessions_to_check);
 }
 
@@ -996,43 +995,39 @@ fuzzy_tcp_connection_cleanup(struct fuzzy_tcp_connection *conn, const char *reas
 static void
 fuzzy_tcp_check_pending_timeouts(struct fuzzy_rule *rule, ev_tstamp now)
 {
-       GHashTableIter iter;
-       gpointer key, value;
+       khash_t(fuzzy_pending_hash) * pending_ht;
        struct fuzzy_tcp_pending_command *pending;
-       GPtrArray *to_remove;
        GHashTable *sessions_to_check;
-       unsigned int i;
+       GHashTableIter session_iter;
+       struct fuzzy_client_session *session;
        ev_tstamp timeout;
 
-       if (!rule || !rule->pending_requests) {
+       if (!rule || !(pending_ht = rule->pending_requests)) {
                return;
        }
 
        timeout = rule->io_timeout;
-       to_remove = g_ptr_array_new();
        sessions_to_check = g_hash_table_new(g_direct_hash, g_direct_equal);
 
-       g_hash_table_iter_init(&iter, rule->pending_requests);
-       while (g_hash_table_iter_next(&iter, &key, &value)) {
-               pending = (struct fuzzy_tcp_pending_command *) value;
+       for (khiter_t k = kh_begin(pending_ht); k != kh_end(pending_ht); ++k) {
+               if (!kh_exist(pending_ht, k)) {
+                       continue;
+               }
+
+               pending = kh_val(pending_ht, k);
 
-               /* Check if request timed out */
                if ((now - pending->send_time) > timeout) {
-                       /* Mark command as replied (timed out) */
                        if (!(pending->io->flags & FUZZY_CMD_FLAG_REPLIED)) {
                                pending->io->flags |= FUZZY_CMD_FLAG_REPLIED;
                        }
 
-                       /* Collect session for completion check */
                        if (pending->session) {
                                g_hash_table_add(sessions_to_check, pending->session);
                        }
 
-                       g_ptr_array_add(to_remove, key);
-
                        if (pending->task) {
                                struct rspamd_task *task = pending->task;
-                               /* Log timeout at info for auto mode, debug otherwise */
+
                                if (rule->tcp_auto) {
                                        msg_info_task("fuzzy_tcp: request timeout after %.2fs for tag %ud to %s",
                                                                  now - pending->send_time,
@@ -1046,24 +1041,17 @@ fuzzy_tcp_check_pending_timeouts(struct fuzzy_rule *rule, ev_tstamp now)
                                                                   rspamd_upstream_name(pending->connection->server));
                                }
                        }
-               }
-       }
 
-       /* Remove timed out commands */
-       for (i = 0; i < to_remove->len; i++) {
-               g_hash_table_remove(rule->pending_requests,
-                                                       g_ptr_array_index(to_remove, i));
+                       g_free(pending);
+                       kh_del(fuzzy_pending_hash, pending_ht, k);
+               }
        }
 
-       /* Check session completion for all affected sessions */
-       GHashTableIter session_iter;
-       struct fuzzy_client_session *session;
        g_hash_table_iter_init(&session_iter, sessions_to_check);
        while (g_hash_table_iter_next(&session_iter, (gpointer *) &session, NULL)) {
                fuzzy_check_session_is_completed(session);
        }
 
-       g_ptr_array_free(to_remove, TRUE);
        g_hash_table_unref(sessions_to_check);
 }
 
@@ -1316,8 +1304,25 @@ fuzzy_tcp_send_command(struct fuzzy_tcp_connection *conn,
                pending->connection = conn;
                pending->send_time = rspamd_get_calendar_ticks();
 
-               g_hash_table_insert(conn->rule->pending_requests,
-                                                       GINT_TO_POINTER(io->tag), pending);
+               khiter_t k;
+               int kh_ret;
+
+               k = kh_put(fuzzy_pending_hash, conn->rule->pending_requests, io->tag, &kh_ret);
+
+               if (kh_ret < 0) {
+                       msg_err_task("fuzzy_tcp: cannot register pending command for tag %ud", io->tag);
+                       g_free(pending);
+               }
+               else {
+                       if (kh_ret == 0) {
+                               struct fuzzy_tcp_pending_command *old = kh_val(conn->rule->pending_requests, k);
+                               if (old) {
+                                       g_free(old);
+                               }
+                       }
+
+                       kh_val(conn->rule->pending_requests, k) = pending;
+               }
 
                /* Mark as sent */
                io->flags |= FUZZY_CMD_FLAG_SENT;
@@ -1410,14 +1415,16 @@ fuzzy_tcp_process_reply(struct fuzzy_tcp_connection *conn,
 
        /* Extract tag and lookup pending command */
        tag = rep->v1.tag;
-       pending = g_hash_table_lookup(rule->pending_requests, GINT_TO_POINTER(tag));
+       khiter_t k = kh_get(fuzzy_pending_hash, rule->pending_requests, tag);
 
-       if (!pending) {
+       if (k == kh_end(rule->pending_requests)) {
                msg_debug("fuzzy_tcp: unexpected tag %ud from %s",
                                  tag, rspamd_upstream_name(conn->server));
                return;
        }
 
+       pending = kh_val(rule->pending_requests, k);
+
        /* Get task for debug logging */
        struct rspamd_task *task = pending->task;
 
@@ -1494,7 +1501,8 @@ fuzzy_tcp_process_reply(struct fuzzy_tcp_connection *conn,
        struct fuzzy_client_session *session_to_check = pending->session;
 
        /* Remove from pending requests */
-       g_hash_table_remove(rule->pending_requests, GINT_TO_POINTER(tag));
+       kh_del(fuzzy_pending_hash, rule->pending_requests, k);
+       g_free(pending);
 
        /* Check if session is completed */
        fuzzy_check_session_is_completed(session_to_check);
@@ -2145,8 +2153,7 @@ fuzzy_parse_rule(struct rspamd_config *cfg, const ucl_object_t *obj,
        rule->tcp_connections = g_ptr_array_new_with_free_func(fuzzy_tcp_connection_unref);
 
        /* Initialize global pending requests pool - keyed by tag */
-       rule->pending_requests = g_hash_table_new_full(g_direct_hash, g_direct_equal,
-                                                                                                  NULL, g_free);
+       rule->pending_requests = kh_init(fuzzy_pending_hash);
 
        /*
         * Process rule in Lua
@@ -2490,50 +2497,50 @@ int fuzzy_check_module_init(struct rspamd_config *cfg, struct module_ctx **ctx)
                                                           "true",
                                                           0);
        rspamd_rcl_add_doc_by_path(cfg,
-                                  "fuzzy_check.rule",
-                                  "Enable hashing of text content (set to false to disable text hashes)",
-                                  "text_hashes",
-                                  UCL_BOOLEAN,
-                                  NULL,
-                                  0,
-                                  "true",
-                                  0);
+                                                          "fuzzy_check.rule",
+                                                          "Enable hashing of text content (set to false to disable text hashes)",
+                                                          "text_hashes",
+                                                          UCL_BOOLEAN,
+                                                          NULL,
+                                                          0,
+                                                          "true",
+                                                          0);
        rspamd_rcl_add_doc_by_path(cfg,
-                                  "fuzzy_check.rule",
-                                  "Enable HTML structure hashing for this rule",
-                                  "html_shingles",
-                                  UCL_BOOLEAN,
-                                  NULL,
-                                  0,
-                                  "false",
-                                  0);
+                                                          "fuzzy_check.rule",
+                                                          "Enable HTML structure hashing for this rule",
+                                                          "html_shingles",
+                                                          UCL_BOOLEAN,
+                                                          NULL,
+                                                          0,
+                                                          "false",
+                                                          0);
        rspamd_rcl_add_doc_by_path(cfg,
-                                  "fuzzy_check.rule",
-                                  "Minimum number of HTML tags required to generate HTML hashes",
-                                  "min_html_tags",
-                                  UCL_INT,
-                                  NULL,
-                                  0,
-                                  NULL,
-                                  0);
+                                                          "fuzzy_check.rule",
+                                                          "Minimum number of HTML tags required to generate HTML hashes",
+                                                          "min_html_tags",
+                                                          UCL_INT,
+                                                          NULL,
+                                                          0,
+                                                          NULL,
+                                                          0);
        rspamd_rcl_add_doc_by_path(cfg,
-                                  "fuzzy_check.rule",
-                                  "Multiplier applied to HTML fuzzy matches",
-                                  "html_weight",
-                                  UCL_FLOAT,
-                                  NULL,
-                                  0,
-                                  NULL,
-                                  0);
+                                                          "fuzzy_check.rule",
+                                                          "Multiplier applied to HTML fuzzy matches",
+                                                          "html_weight",
+                                                          UCL_FLOAT,
+                                                          NULL,
+                                                          0,
+                                                          NULL,
+                                                          0);
        rspamd_rcl_add_doc_by_path(cfg,
-                                  "fuzzy_check.rule",
-                                  "Content hashing checks configuration object (e.g. { text = { enabled = true; }, html = { enabled = true; } })",
-                                  "checks",
-                                  UCL_OBJECT,
-                                  NULL,
-                                  0,
-                                  NULL,
-                                  0);
+                                                          "fuzzy_check.rule",
+                                                          "Content hashing checks configuration object (e.g. { text = { enabled = true; }, html = { enabled = true; } })",
+                                                          "checks",
+                                                          UCL_OBJECT,
+                                                          NULL,
+                                                          0,
+                                                          NULL,
+                                                          0);
        rspamd_rcl_add_doc_by_path(cfg,
                                                           "fuzzy_check.rule",
                                                           "Override module default min bytes for this rule",
@@ -2911,46 +2918,37 @@ int fuzzy_check_module_reconfig(struct rspamd_config *cfg)
 static void
 fuzzy_tcp_session_cleanup(struct fuzzy_client_session *session)
 {
-       GHashTableIter iter;
-       gpointer key, value;
+       khash_t(fuzzy_pending_hash) * pending_ht;
        struct fuzzy_tcp_pending_command *pending;
-       GPtrArray *to_remove;
-       unsigned int i;
+       unsigned int removed = 0;
+       struct rspamd_task *task = session->task;
 
-       if (!session || !session->rule || !session->rule->pending_requests) {
+       if (!session || !session->rule || !(pending_ht = session->rule->pending_requests)) {
                return;
        }
 
-       /* Collect tags to remove (can't modify hash table during iteration) */
-       to_remove = g_ptr_array_new();
+       for (khiter_t k = kh_begin(pending_ht); k != kh_end(pending_ht); ++k) {
+               if (!kh_exist(pending_ht, k)) {
+                       continue;
+               }
 
-       g_hash_table_iter_init(&iter, session->rule->pending_requests);
-       while (g_hash_table_iter_next(&iter, &key, &value)) {
-               pending = (struct fuzzy_tcp_pending_command *) value;
+               pending = kh_val(pending_ht, k);
 
                if (pending->session == session) {
-                       /* Mark command as replied (session finished) */
                        if (!(pending->io->flags & FUZZY_CMD_FLAG_REPLIED)) {
                                pending->io->flags |= FUZZY_CMD_FLAG_REPLIED;
                        }
 
-                       g_ptr_array_add(to_remove, key);
+                       g_free(pending);
+                       kh_del(fuzzy_pending_hash, pending_ht, k);
+                       removed++;
                }
        }
 
-       /* Remove pending commands */
-       for (i = 0; i < to_remove->len; i++) {
-               g_hash_table_remove(session->rule->pending_requests,
-                                                       g_ptr_array_index(to_remove, i));
+       if (removed > 0 && session->task) {
+               msg_debug_fuzzy_check("fuzzy_tcp: cleaned up %u pending commands for finished session",
+                                                         removed);
        }
-
-       if (to_remove->len > 0 && session->task) {
-               struct rspamd_task *task = session->task;
-               msg_debug_fuzzy_check("fuzzy_tcp: cleaned up %d pending commands for finished session",
-                                                         (int) to_remove->len);
-       }
-
-       g_ptr_array_free(to_remove, TRUE);
 }
 
 /* Finalize IO */
@@ -3696,6 +3694,7 @@ fuzzy_cmd_from_html_part(struct rspamd_task *task,
        unsigned int additional_length;
        unsigned char *additional_data;
 
+
        /* Check if HTML shingles are enabled for this rule */
        if (!rule->html_shingles) {
                return NULL;
@@ -5256,24 +5255,24 @@ fuzzy_generate_commands(struct rspamd_task *task, struct fuzzy_rule *rule,
                                        !(flags & FUZZY_CHECK_FLAG_NOTEXT)) {
                                        part = mime_part->specific.txt;
                                        gboolean allow_html = rule->html_shingles &&
-                                               !(flags & FUZZY_CHECK_FLAG_NOHTML) &&
-                                               (check_part || !rule->text_hashes);
+                                                                                 !(flags & FUZZY_CHECK_FLAG_NOHTML) &&
+                                                                                 (check_part || !rule->text_hashes);
 
                                        if (check_part && rule->text_hashes) {
                                                io = fuzzy_cmd_from_text_part(task, rule,
-                                                                                       c,
-                                                                                       flag,
-                                                                                       value,
-                                                                                       !fuzzy_check,
-                                                                                       part,
-                                                                                       mime_part);
+                                                                                                         c,
+                                                                                                         flag,
+                                                                                                         value,
+                                                                                                         !fuzzy_check,
+                                                                                                         part,
+                                                                                                         mime_part);
                                        }
 
                                        if (allow_html && part != NULL) {
                                                struct fuzzy_cmd_io *html_io;
 
                                                html_io = fuzzy_cmd_from_html_part(task, rule, c, flag, value,
-                                                                                       part, mime_part);
+                                                                                                                  part, mime_part);
 
                                                if (html_io) {
                                                        /* Add HTML hash as separate command */
@@ -5282,13 +5281,13 @@ fuzzy_generate_commands(struct rspamd_task *task, struct fuzzy_rule *rule,
                                        }
                                }
                                else if (check_part && mime_part->part_type == RSPAMD_MIME_PART_IMAGE &&
-                                        !(flags & FUZZY_CHECK_FLAG_NOIMAGES)) {
+                                                !(flags & FUZZY_CHECK_FLAG_NOIMAGES)) {
                                        image = mime_part->specific.img;
 
                                        io = fuzzy_cmd_from_data_part(rule, c, flag, value,
-                                                                                         task,
-                                                                                         image->parent->digest,
-                                                                                         mime_part);
+                                                                                                 task,
+                                                                                                 image->parent->digest,
+                                                                                                 mime_part);
                                        io->flags |= FUZZY_CMD_FLAG_IMAGE;
                                }
                                else if (check_part && mime_part->part_type == RSPAMD_MIME_PART_CUSTOM_LUA) {
@@ -5330,10 +5329,10 @@ fuzzy_generate_commands(struct rspamd_task *task, struct fuzzy_rule *rule,
 
                                                                if (hlen == rspamd_cryptobox_HASHBYTES) {
                                                                        io = fuzzy_cmd_from_data_part(rule, c,
-                                                                                             flag, value,
-                                                                                             task,
-                                                                                             (unsigned char *) h,
-                                                                                             mime_part);
+                                                                                                                                 flag, value,
+                                                                                                                                 task,
+                                                                                                                                 (unsigned char *) h,
+                                                                                                                                 mime_part);
 
                                                                        if (io) {
                                                                                io->flags |= FUZZY_CMD_FLAG_CONTENT;
@@ -5349,16 +5348,16 @@ fuzzy_generate_commands(struct rspamd_task *task, struct fuzzy_rule *rule,
                                                 * Add part itself as well
                                                 */
                                                io = fuzzy_cmd_from_data_part(rule, c,
-                                                                                         flag, value,
-                                                                                         task,
-                                                                                         mime_part->digest,
-                                                                                         mime_part);
+                                                                                                         flag, value,
+                                                                                                         task,
+                                                                                                         mime_part->digest,
+                                                                                                         mime_part);
                                        }
                                }
                                else if (check_part) {
                                        io = fuzzy_cmd_from_data_part(rule, c, flag, value,
-                                                                                         task,
-                                                                                         mime_part->digest, mime_part);
+                                                                                                 task,
+                                                                                                 mime_part->digest, mime_part);
                                }
 
                                if (io) {
@@ -5367,7 +5366,7 @@ fuzzy_generate_commands(struct rspamd_task *task, struct fuzzy_rule *rule,
                                        PTR_ARRAY_FOREACH(res, j, cur)
                                        {
                                                if (memcmp(cur->cmd.digest, io->cmd.digest,
-                                                          sizeof(io->cmd.digest)) == 0) {
+                                                                  sizeof(io->cmd.digest)) == 0) {
                                                        skip_existing = TRUE;
                                                        break;
                                                }