From: Philippe Antoine
Date: Tue, 18 Mar 2025 09:55:39 +0000 (+0100)
Subject: detect: add configurable limits for datasets
X-Git-Tag: suricata-8.0.0-beta1~262
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a7713db709b8a0be5fc5e5809ab58e9b14a16e85;p=thirdparty%2Fsuricata.git

detect: add configurable limits for datasets

Ticket: 7615

Avoids signatures setting extreme hash sizes, which would lead to very
high memory use.

Defaults to allowing:
- 65536 per dataset
- 16777216 total

To override these built-in defaults:

```yaml
datasets:
  # Limits for per rule dataset instances to avoid rules using too many
  # resources.
  limits:
    # Max value for per dataset `hashsize` setting
    #single-hashsize: 65536
    # Max combined hashsize values for all datasets.
    #total-hashsizes: 16777216
```
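As an illustration (hypothetical rule and dataset name, not part of this
change), a signature asking for an oversized per-dataset hash table would now
fail to load under the default limits, logging the 'single-hashsize' error
added below:

```
alert dns any any -> any any (msg:"dataset hashsize above limit"; dns.query; dataset:isset,big-set, type string, hashsize 1048576; sid:1; rev:1;)
```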
SCLogDebug("Set %s at %p hidden successfully", set->name, set); set = set->next; } @@ -560,6 +605,27 @@ int DatasetsInit(void) uint32_t default_hashsize = 0; GetDefaultMemcap(&default_memcap, &default_hashsize); if (datasets != NULL) { + const char *str = NULL; + if (ConfGet("datasets.limits.total-hashsizes", &str) == 1) { + if (ParseSizeStringU32(str, &dataset_max_total_hashsize) < 0) { + FatalError("failed to parse datasets.limits.total-hashsizes value: %s", str); + } + } + if (ConfGet("datasets.limits.single-hashsize", &str) == 1) { + if (ParseSizeStringU32(str, &dataset_max_one_hashsize) < 0) { + FatalError("failed to parse datasets.limits.single-hashsize value: %s", str); + } + } + if (dataset_max_total_hashsize > 0 && + dataset_max_total_hashsize < dataset_max_one_hashsize) { + FatalError("total-hashsizes (%u) cannot be smaller than single-hashsize (%u)", + dataset_max_total_hashsize, dataset_max_one_hashsize); + } + if (dataset_max_total_hashsize > 0 && dataset_max_one_hashsize == 0) { + // the total limit also applies for single limit + dataset_max_one_hashsize = dataset_max_total_hashsize; + } + int list_pos = 0; ConfNode *iter = NULL; TAILQ_FOREACH(iter, &datasets->head, next) { diff --git a/src/tests/fuzz/confyaml.c b/src/tests/fuzz/confyaml.c index 1945528599..05995ea56f 100644 --- a/src/tests/fuzz/confyaml.c +++ b/src/tests/fuzz/confyaml.c @@ -112,4 +112,8 @@ app-layer:\n\ enabled: yes\n\ detect:\n\ inspection-recursion-limit: 0\n\ +datasets:\n\ + maximums:\n\ + single_hashsize: 65536\n\ + total_hashsizes: 16777216\n\ "; diff --git a/src/util-thash.c b/src/util-thash.c index d840ae26d2..a511049e07 100644 --- a/src/util-thash.c +++ b/src/util-thash.c @@ -324,16 +324,11 @@ THashTableContext *THashInit(const char *cnf_prefix, uint32_t data_size, ctx->config.hash_size = hashsize > 0 ? hashsize : THASH_DEFAULT_HASHSIZE; /* Reset memcap in case of loading from file to the highest possible value unless defined by the rule keyword */ -#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION - // limit memcap size to default when fuzzing - SC_ATOMIC_SET(ctx->config.memcap, THASH_DEFAULT_MEMCAP); -#else if (memcap > 0) { SC_ATOMIC_SET(ctx->config.memcap, memcap); } else { SC_ATOMIC_SET(ctx->config.memcap, reset_memcap ? UINT64_MAX : THASH_DEFAULT_MEMCAP); } -#endif ctx->config.prealloc = THASH_DEFAULT_PREALLOC; SC_ATOMIC_INIT(ctx->counter); diff --git a/suricata.yaml.in b/suricata.yaml.in index 5117b09928..a30a57aaec 100644 --- a/suricata.yaml.in +++ b/suricata.yaml.in @@ -1235,6 +1235,14 @@ datasets: #memcap: 100 MiB #hashsize: 2048 + # Limits for per rule dataset instances to avoid rules using too many + # resources. + limits: + # Max value for per dataset `hashsize` setting + #single-hashsize: 65536 + # Max combined hashsize values for all datasets. + #total-hashsizes: 16777216 + rules: # Set to true to allow absolute filenames and filenames that use # ".." components to reference parent directories in rules that specify