]> git.ipfire.org Git - thirdparty/rspamd.git/commitdiff
[Feature] Unified hyperscan cache format for multipattern
author Vsevolod Stakhov <vsevolod@rspamd.com>
Sun, 4 Jan 2026 09:15:12 +0000 (09:15 +0000)
committer Vsevolod Stakhov <vsevolod@rspamd.com>
Sun, 4 Jan 2026 09:15:12 +0000 (09:15 +0000)
Add C helper functions for serializing/deserializing hyperscan databases
with the unified format (magic, platform, CRC). Migrate multipattern from
raw .hsmp files to the unified .hs format compatible with re_cache.

- Add rspamd_hyperscan_serialize_with_header(), rspamd_hyperscan_validate_header()
  and rspamd_hyperscan_load_from_header()
- Update multipattern to use unified format with platform validation
- Fix CRC calculation in Lua bindings to match re_cache format

src/libserver/hyperscan_tools.cxx
src/libserver/hyperscan_tools.h
src/libutil/multipattern.c
src/lua/lua_hyperscan.cxx

index 27094c01d6c0cd3760a52b363ecfcda2fa6ddcd6..a814e1cfab998faa7aecd1fdaffa78b94017e9fc 100644 (file)
@@ -928,4 +928,224 @@ gboolean rspamd_hyperscan_create_shared_unser(const char *serialized_data,
        return TRUE;
 }
 
+/*
+ * Unified hyperscan format magic: the 8 bytes "rshsre11" that open every
+ * blob in this format. The array has no NUL terminator, so
+ * RSPAMD_HS_MAGIC_LEN (its sizeof) is exactly the number of magic bytes.
+ */
+static const unsigned char rspamd_hs_magic[] = {'r', 's', 'h', 's', 'r', 'e', '1', '1'};
+#define RSPAMD_HS_MAGIC_LEN (sizeof(rspamd_hs_magic))
+
+/**
+ * Serialize a hyperscan database into the unified cache format:
+ *   [magic 8][hs_platform_info_t][count n][ids n][flags n][crc64][hs_blob]
+ * The CRC is a fast hash (seed 0xdeadbabe) over ids, flags and the hs blob;
+ * magic, platform and count are not covered by it.
+ *
+ * @param db database to serialize (must not be NULL)
+ * @param ids pattern IDs, n elements (may be NULL)
+ * @param flags pattern flags, n elements (may be NULL)
+ * @param n number of patterns; if n > 0 and ids or flags is NULL, both
+ *          arrays are written zero-filled
+ * @param[out] out_data g_malloc'ed buffer, caller must g_free (set only on success)
+ * @param[out] out_len size of out_data in bytes (set only on success)
+ * @return TRUE on success, FALSE on bad arguments, hyperscan failure or
+ *         a total size that does not fit into size_t
+ */
+gboolean rspamd_hyperscan_serialize_with_header(hs_database_t *db,
+                                                const unsigned int *ids,
+                                                const unsigned int *flags,
+                                                unsigned int n,
+                                                char **out_data,
+                                                gsize *out_len)
+{
+       if (!db || !out_data || !out_len) {
+               return FALSE;
+       }
+
+       /*
+        * Query the platform before serializing so this failure path owns no
+        * buffer. The struct is zeroed first because it is copied verbatim
+        * into the output: any padding bytes must not be indeterminate.
+        */
+       hs_platform_info_t plt;
+       memset(&plt, 0, sizeof(plt));
+       if (hs_populate_platform(&plt) != HS_SUCCESS) {
+               msg_err_hyperscan("failed to get platform info");
+               return FALSE;
+       }
+
+       /*
+        * Serialize the database. The buffer comes from hyperscan's own
+        * allocator (malloc() by default), not from GLib, so it is released
+        * with free() below rather than g_free().
+        */
+       char *ser_bytes = nullptr;
+       std::size_t ser_size = 0;
+       if (hs_serialize_database(db, &ser_bytes, &ser_size) != HS_SUCCESS) {
+               msg_err_hyperscan("failed to serialize database");
+               return FALSE;
+       }
+
+       /*
+        * Size of everything except the two arrays and the blob. The limit
+        * check divides instead of multiplying so that a huge n cannot wrap
+        * the total when size_t is 32 bits wide.
+        */
+       const std::size_t fixed_size = RSPAMD_HS_MAGIC_LEN +
+                                      sizeof(plt) +
+                                      sizeof(n) +
+                                      sizeof(uint64_t); /* CRC */
+       const std::size_t max_size = static_cast<std::size_t>(-1);
+
+       if (ser_size > max_size - fixed_size ||
+               n > (max_size - fixed_size - ser_size) / (2 * sizeof(unsigned int))) {
+               free(ser_bytes);
+               msg_err_hyperscan("serialized database is too large");
+               return FALSE;
+       }
+
+       const std::size_t arr_size = sizeof(unsigned int) * n;
+       const std::size_t total_size = fixed_size + 2 * arr_size + ser_size;
+
+       /* Allocate buffer */
+       char *buf = static_cast<char *>(g_malloc(total_size));
+       char *p = buf;
+
+       /* Magic */
+       memcpy(p, rspamd_hs_magic, RSPAMD_HS_MAGIC_LEN);
+       p += RSPAMD_HS_MAGIC_LEN;
+
+       /* Platform */
+       memcpy(p, &plt, sizeof(plt));
+       p += sizeof(plt);
+
+       /* Count */
+       memcpy(p, &n, sizeof(n));
+       p += sizeof(n);
+
+       /* IDs and flags - remember the position, the CRC is computed from it */
+       char *ids_start = p;
+       if (n > 0) {
+               if (ids && flags) {
+                       memcpy(p, ids, arr_size);
+                       memcpy(p + arr_size, flags, arr_size);
+               }
+               else {
+                       /* Caller supplied a count but not both arrays */
+                       memset(p, 0, 2 * arr_size);
+               }
+               p += 2 * arr_size;
+       }
+
+       /* CRC over IDs + flags + HS blob (compatible with re_cache.c format) */
+       rspamd_cryptobox_fast_hash_state_t crc_st;
+       rspamd_cryptobox_fast_hash_init(&crc_st, 0xdeadbabe);
+       if (n > 0) {
+               /* IDs */
+               rspamd_cryptobox_fast_hash_update(&crc_st, ids_start, arr_size);
+               /* Flags */
+               rspamd_cryptobox_fast_hash_update(&crc_st, ids_start + arr_size, arr_size);
+       }
+       /* HS database */
+       rspamd_cryptobox_fast_hash_update(&crc_st, ser_bytes, ser_size);
+       uint64_t crc = rspamd_cryptobox_fast_hash_final(&crc_st);
+
+       memcpy(p, &crc, sizeof(crc));
+       p += sizeof(crc);
+
+       /* Copy serialized database, then drop hyperscan's buffer */
+       memcpy(p, ser_bytes, ser_size);
+       free(ser_bytes);
+
+       *out_data = buf;
+       *out_len = total_size;
+
+       return TRUE;
+}
+
+/* GError domain ("hyperscan") used by the unified-format helpers below */
+static GQuark rspamd_hyperscan_quark(void)
+{
+       const GQuark domain = g_quark_from_static_string("hyperscan");
+
+       return domain;
+}
+
+/**
+ * Validate a unified-format hyperscan blob without deserializing it.
+ *
+ * Checks, in order: the magic; that the stored platform `tune` equals the
+ * current one; that the id/flag arrays and the CRC fit into the buffer;
+ * that the stored CRC equals a fast hash (seed 0xdeadbabe) of
+ * ids + flags + hs blob; and that the hs blob is not empty.
+ *
+ * @param data blob to check (NULL is rejected)
+ * @param len size of data in bytes
+ * @param[out] err receives an EINVAL error from rspamd_hyperscan_quark()
+ *                 describing the first failed check (may be NULL)
+ * @return TRUE if every check passes, FALSE otherwise
+ */
+gboolean rspamd_hyperscan_validate_header(const char *data,
+                                          gsize len,
+                                          GError **err)
+{
+       if (data == nullptr || len < RSPAMD_HS_MAGIC_LEN) {
+               g_set_error(err, rspamd_hyperscan_quark(), EINVAL, "data too small");
+               return FALSE;
+       }
+
+       /* Check magic */
+       if (memcmp(data, rspamd_hs_magic, RSPAMD_HS_MAGIC_LEN) != 0) {
+               g_set_error(err, rspamd_hyperscan_quark(), EINVAL, "invalid magic");
+               return FALSE;
+       }
+
+       const char *p = data + RSPAMD_HS_MAGIC_LEN;
+       const char *end = data + len;
+
+       /* Check platform */
+       if (static_cast<std::size_t>(end - p) < sizeof(hs_platform_info_t)) {
+               g_set_error(err, rspamd_hyperscan_quark(), EINVAL, "truncated platform info");
+               return FALSE;
+       }
+
+       hs_platform_info_t stored_plt;
+       memcpy(&stored_plt, p, sizeof(stored_plt));
+       p += sizeof(stored_plt);
+
+       hs_platform_info_t cur_plt;
+       if (hs_populate_platform(&cur_plt) != HS_SUCCESS) {
+               g_set_error(err, rspamd_hyperscan_quark(), EINVAL, "cannot get current platform");
+               return FALSE;
+       }
+
+       /*
+        * Only the `tune` field is compared here.
+        * NOTE(review): cpu_features is not checked; presumably
+        * hs_deserialize_database() rejects incompatible databases - confirm.
+        */
+       if (stored_plt.tune != cur_plt.tune) {
+               g_set_error(err, rspamd_hyperscan_quark(), EINVAL, "platform mismatch");
+               return FALSE;
+       }
+
+       /* Read count */
+       if (static_cast<std::size_t>(end - p) < sizeof(unsigned int)) {
+               g_set_error(err, rspamd_hyperscan_quark(), EINVAL, "truncated count");
+               return FALSE;
+       }
+
+       unsigned int n;
+       memcpy(&n, p, sizeof(n));
+       p += sizeof(n);
+
+       /*
+        * The rest must hold 2 * n unsigned ints followed by the CRC. n comes
+        * from the file, so bound it by division: multiplying first could wrap
+        * a 32-bit size_t and let an oversized n slip past the check.
+        */
+       const std::size_t remain = static_cast<std::size_t>(end - p);
+       if (remain < sizeof(uint64_t) ||
+               n > (remain - sizeof(uint64_t)) / (2 * sizeof(unsigned int))) {
+               g_set_error(err, rspamd_hyperscan_quark(), EINVAL, "truncated arrays or CRC");
+               return FALSE;
+       }
+
+       /* Remember start of IDs for CRC calculation */
+       const char *ids_start = p;
+       const std::size_t arr_size = sizeof(unsigned int) * n;
+       p += 2 * arr_size;
+
+       /* Verify CRC (over IDs + flags + HS blob, compatible with re_cache.c) */
+       uint64_t stored_crc;
+       memcpy(&stored_crc, p, sizeof(stored_crc));
+       p += sizeof(stored_crc);
+
+       const char *hs_blob = p;
+       const std::size_t hs_len = static_cast<std::size_t>(end - p);
+
+       rspamd_cryptobox_fast_hash_state_t crc_st;
+       rspamd_cryptobox_fast_hash_init(&crc_st, 0xdeadbabe);
+       if (n > 0) {
+               /* IDs */
+               rspamd_cryptobox_fast_hash_update(&crc_st, ids_start, arr_size);
+               /* Flags */
+               rspamd_cryptobox_fast_hash_update(&crc_st, ids_start + arr_size, arr_size);
+       }
+       /* HS database */
+       rspamd_cryptobox_fast_hash_update(&crc_st, hs_blob, hs_len);
+       uint64_t calc_crc = rspamd_cryptobox_fast_hash_final(&crc_st);
+
+       if (stored_crc != calc_crc) {
+               g_set_error(err, rspamd_hyperscan_quark(), EINVAL, "CRC mismatch");
+               return FALSE;
+       }
+
+       /* A header with no database behind it is not a usable blob */
+       if (hs_len == 0) {
+               g_set_error(err, rspamd_hyperscan_quark(), EINVAL, "empty hyperscan database");
+               return FALSE;
+       }
+
+       return TRUE;
+}
+
+/**
+ * Validate a unified-format blob and deserialize the hyperscan database
+ * stored after its header.
+ *
+ * @param data blob starting with the unified header
+ * @param len size of data in bytes
+ * @param[out] err set when validation or deserialization fails (may be NULL)
+ * @return newly allocated database wrapper, or nullptr on any failure
+ */
+rspamd_hyperscan_t *rspamd_hyperscan_load_from_header(const char *data,
+                                                      gsize len,
+                                                      GError **err)
+{
+       if (!rspamd_hyperscan_validate_header(data, len, err)) {
+               return nullptr;
+       }
+
+       /* The pattern count sits right after the magic and the platform info */
+       const std::size_t count_off = RSPAMD_HS_MAGIC_LEN + sizeof(hs_platform_info_t);
+       unsigned int npatterns;
+       memcpy(&npatterns, data + count_off, sizeof(npatterns));
+
+       /* The blob follows the count, the IDs/flags arrays (if any) and the CRC */
+       std::size_t blob_off = count_off + sizeof(npatterns);
+       if (npatterns > 0) {
+               blob_off += sizeof(unsigned int) * npatterns * 2;
+       }
+       blob_off += sizeof(uint64_t);
+
+       const char *blob = data + blob_off;
+       const std::size_t blob_len = len - blob_off;
+
+       hs_database_t *raw_db = nullptr;
+       if (hs_deserialize_database(blob, blob_len, &raw_db) != HS_SUCCESS) {
+               g_set_error(err, rspamd_hyperscan_quark(), EINVAL, "deserialize failed");
+               return nullptr;
+       }
+
+       /* Wrap the raw handle; the second member is left null as before */
+       auto *wrapper = new rspamd::util::hs_shared_database{raw_db, nullptr};
+       return C_DB_FROM_CXX(wrapper);
+}
+
 #endif// WITH_HYPERSCAN
\ No newline at end of file
index aac9897f0d96f7df4ff8e61424870de2860ceac3..e9233231614b78605fa49e21b449fc0a177b9e6c 100644 (file)
@@ -104,6 +104,47 @@ gboolean rspamd_hyperscan_create_shared_unser(const char *serialized_data,
                                                                                          int *out_fd,
                                                                                          gsize *out_size);
 
+/**
+ * Serialize a hyperscan database with unified header format.
+ * Format: [magic 8][platform][count][ids][flags][crc64][hs_blob]
+ * The crc64 field covers the ids, flags and hs_blob sections only.
+ * @param db hyperscan database to serialize
+ * @param ids array of n pattern IDs (can be NULL)
+ * @param flags array of n pattern flags (can be NULL)
+ * @param n number of patterns (0 if ids/flags not provided); if n > 0 and
+ *          either array is NULL, both arrays are stored zero-filled
+ * @param[out] out_data pointer to allocated data (caller must g_free)
+ * @param[out] out_len size of serialized data
+ * @return TRUE on success, FALSE on invalid arguments or serialization failure
+ */
+gboolean rspamd_hyperscan_serialize_with_header(hs_database_t *db,
+                                                                                               const unsigned int *ids,
+                                                                                               const unsigned int *flags,
+                                                                                               unsigned int n,
+                                                                                               char **out_data,
+                                                                                               gsize *out_len);
+
+/**
+ * Load a hyperscan database from unified format blob.
+ * Validates magic, platform, and CRC before deserializing.
+ * @param data serialized data with header
+ * @param len size of data
+ * @param[out] err error set if validation or deserialization fails (can be NULL)
+ * @return database wrapper or NULL on error
+ */
+rspamd_hyperscan_t *rspamd_hyperscan_load_from_header(const char *data,
+                                                                                                         gsize len,
+                                                                                                         GError **err);
+
+/**
+ * Validate a unified format blob without deserializing.
+ * Checks the magic, the stored platform tune value and the CRC.
+ * @param data serialized data with header
+ * @param len size of data
+ * @param[out] err error describing the first failed check (can be NULL)
+ * @return TRUE if valid
+ */
+gboolean rspamd_hyperscan_validate_header(const char *data,
+                                                                                 gsize len,
+                                                                                 GError **err);
+
 G_END_DECLS
 
 #endif
index 9ae798bb9da6d045f1b0977c9dd3a170f16b9790..9b72931aa95b7af1f075939823b0fa9af9d3ee02 100644 (file)
@@ -396,16 +396,37 @@ rspamd_multipattern_try_load_hs(struct rspamd_multipattern *mp,
                                                                const unsigned char *hash)
 {
        char fp[PATH_MAX];
+       gchar *data;
+       gsize len;
+       GError *err = NULL;
 
        if (hs_cache_dir == NULL) {
                return FALSE;
        }
 
-       rspamd_snprintf(fp, sizeof(fp), "%s/%*xs.hsmp", hs_cache_dir,
+       rspamd_snprintf(fp, sizeof(fp), "%s/%*xs.hs", hs_cache_dir,
                                        (int) rspamd_cryptobox_HASHBYTES / 2, hash);
-       mp->hs_db = rspamd_hyperscan_maybe_load(fp, 0);
 
-       return mp->hs_db != NULL;
+       if (!g_file_get_contents(fp, &data, &len, &err)) {
+               if (err) {
+                       msg_debug("cannot read hyperscan cache %s: %s", fp, err->message);
+                       g_error_free(err);
+               }
+               return FALSE;
+       }
+
+       mp->hs_db = rspamd_hyperscan_load_from_header(data, len, &err);
+       g_free(data);
+
+       if (mp->hs_db == NULL) {
+               if (err) {
+                       msg_debug("cannot load hyperscan cache %s: %s", fp, err->message);
+                       g_error_free(err);
+               }
+               return FALSE;
+       }
+
+       return TRUE;
 }
 
 static void
@@ -421,23 +442,26 @@ rspamd_multipattern_try_save_hs(struct rspamd_multipattern *mp,
                return;
        }
 
-       rspamd_snprintf(fp, sizeof(fp), "%s%shsmp-XXXXXXXXXXXXX", G_DIR_SEPARATOR_S,
+       rspamd_snprintf(fp, sizeof(fp), "%s%shs-XXXXXXXXXXXXX", G_DIR_SEPARATOR_S,
                                        hs_cache_dir);
 
        if ((fd = g_mkstemp_full(fp, O_CREAT | O_EXCL | O_WRONLY, 00644)) != -1) {
-               int ret;
-               if ((ret = hs_serialize_database(rspamd_hyperscan_get_database(mp->hs_db), &bytes, &len)) == HS_SUCCESS) {
+               /* Serialize with unified header format (magic, platform, CRC) */
+               if (rspamd_hyperscan_serialize_with_header(
+                               rspamd_hyperscan_get_database(mp->hs_db),
+                               NULL, NULL, 0, /* No IDs/flags needed for multipattern */
+                               &bytes, &len)) {
                        if (write(fd, bytes, len) == -1) {
                                msg_warn("cannot write hyperscan cache to %s: %s",
                                                 fp, strerror(errno));
                                unlink(fp);
-                               free(bytes);
+                               g_free(bytes);
                        }
                        else {
-                               free(bytes);
+                               g_free(bytes);
                                fsync(fd);
 
-                               rspamd_snprintf(np, sizeof(np), "%s/%*xs.hsmp", hs_cache_dir,
+                               rspamd_snprintf(np, sizeof(np), "%s/%*xs.hs", hs_cache_dir,
                                                                (int) rspamd_cryptobox_HASHBYTES / 2, hash);
 
                                if (rename(fp, np) == -1) {
@@ -451,12 +475,10 @@ rspamd_multipattern_try_save_hs(struct rspamd_multipattern *mp,
                        }
                }
                else {
-                       msg_warn("cannot serialize hyperscan cache to %s: error code %d",
-                                        fp, ret);
+                       msg_warn("cannot serialize hyperscan cache to %s", fp);
                        unlink(fp);
                }
 
-
                close(fd);
        }
        else {
index e649c9ebf097e9f1fe35e6c70a29df917ac313c2..d85d2d358e5439e7d60a320b2d74ac5a75bde802 100644 (file)
@@ -316,7 +316,8 @@ lua_hyperscan_serialize(lua_State *L)
        memcpy(p, &n, sizeof(n));
        p += sizeof(n);
 
-       /* IDs */
+       /* IDs - remember position for CRC */
+       char *ids_start = p;
        if (n > 0) {
                memcpy(p, ids.data(), sizeof(unsigned int) * n);
                p += sizeof(unsigned int) * n;
@@ -331,8 +332,17 @@ lua_hyperscan_serialize(lua_State *L)
                p += sizeof(unsigned int) * n;
        }
 
-       /* Calculate CRC over header (excluding CRC field itself) */
-       uint64_t crc = rspamd_cryptobox_fast_hash(buf, p - buf, 0xdeadbabe);
+       /* Calculate CRC over IDs + flags + HS blob (compatible with re_cache.c) */
+       rspamd_cryptobox_fast_hash_state_t crc_st;
+       rspamd_cryptobox_fast_hash_init(&crc_st, 0xdeadbabe);
+       if (n > 0) {
+               rspamd_cryptobox_fast_hash_update(&crc_st, ids_start, sizeof(unsigned int) * n);
+               rspamd_cryptobox_fast_hash_update(&crc_st, ids_start + sizeof(unsigned int) * n,
+                                                                                 sizeof(unsigned int) * n);
+       }
+       rspamd_cryptobox_fast_hash_update(&crc_st, ser_bytes, ser_size);
+       uint64_t crc = rspamd_cryptobox_fast_hash_final(&crc_st);
+
        memcpy(p, &crc, sizeof(crc));
        p += sizeof(crc);
 
@@ -428,7 +438,8 @@ lua_hyperscan_validate(lua_State *L)
                return 2;
        }
 
-       /* Skip IDs and flags */
+       /* Remember start of IDs for CRC calculation */
+       const char *ids_start = p;
        size_t arrays_size = (n > 0) ? sizeof(unsigned int) * n * 2 : 0;
        if ((size_t) (end - p) < arrays_size + sizeof(uint64_t)) {
                lua_pushboolean(L, false);
@@ -438,12 +449,24 @@ lua_hyperscan_validate(lua_State *L)
 
        p += arrays_size;
 
-       /* Verify CRC */
+       /* Verify CRC (over IDs + flags + HS blob, compatible with re_cache.c) */
        uint64_t stored_crc;
        memcpy(&stored_crc, p, sizeof(stored_crc));
        p += sizeof(stored_crc);
 
-       uint64_t calc_crc = rspamd_cryptobox_fast_hash(data, p - data - sizeof(uint64_t), 0xdeadbabe);
+       const char *hs_blob = p;
+       size_t hs_len = end - p;
+
+       rspamd_cryptobox_fast_hash_state_t crc_st;
+       rspamd_cryptobox_fast_hash_init(&crc_st, 0xdeadbabe);
+       if (n > 0) {
+               rspamd_cryptobox_fast_hash_update(&crc_st, ids_start, sizeof(unsigned int) * n);
+               rspamd_cryptobox_fast_hash_update(&crc_st, ids_start + sizeof(unsigned int) * n,
+                                                                                 sizeof(unsigned int) * n);
+       }
+       rspamd_cryptobox_fast_hash_update(&crc_st, hs_blob, hs_len);
+       uint64_t calc_crc = rspamd_cryptobox_fast_hash_final(&crc_st);
+
        if (stored_crc != calc_crc) {
                lua_pushboolean(L, false);
                lua_pushstring(L, "CRC mismatch");
@@ -451,7 +474,6 @@ lua_hyperscan_validate(lua_State *L)
        }
 
        /* Validate hyperscan portion */
-       size_t hs_len = end - p;
        if (hs_len == 0) {
                lua_pushboolean(L, false);
                lua_pushstring(L, "empty hyperscan database");