From: Vsevolod Stakhov
Date: Sun, 4 Jan 2026 09:15:12 +0000 (+0000)
Subject: [Feature] Unified hyperscan cache format for multipattern
X-Git-Tag: 4.0.0~208^2~28
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=663a5f080c2aff454379a5e3eb46f097bb9b0873;p=thirdparty%2Frspamd.git

[Feature] Unified hyperscan cache format for multipattern

Add C helper functions for serializing/deserializing hyperscan databases
with the unified format (magic, platform, CRC). Migrate multipattern from
raw .hsmp files to the unified .hs format compatible with re_cache.

- Add rspamd_hyperscan_serialize_with_header() and load_from_header()
- Update multipattern to use unified format with platform validation
- Fix CRC calculation in Lua bindings to match re_cache format
---
Review notes (commentary only, not part of the commit message):

* This copy was recovered from a rendering in which all newlines and
  indentation had been collapsed. Line breaks were rebuilt so that every
  hunk matches the line counts in its "@@" header. Indentation (tabs) and
  the position of blank context lines are reconstructed, not original;
  whitespace-tolerant application (e.g. git apply --ignore-whitespace) may
  be needed.

* The template arguments of four static_cast expressions in
  hyperscan_tools.cxx had been stripped. They are restored as
  static_cast<char *> for the g_malloc() result (the only type that fits
  the declaration of `buf`) and static_cast<std::size_t> for the three
  `end - p` comparisons. The latter spelling is inferred from the
  surrounding std::size_t locals - TODO confirm against upstream.

* Two deliberate changes relative to the upstream commit. The "index"
  blob ids of the affected files therefore describe the upstream
  post-image, not this one.

  1. hyperscan_tools.cxx, rspamd_hyperscan_serialize_with_header(): the
     buffer returned by hs_serialize_database() is released with free()
     instead of g_free(). The lines this patch removes from multipattern.c
     released the same kind of buffer with free(); g_free() is kept only
     for memory obtained from g_malloc().

  2. multipattern.c, rspamd_multipattern_try_save_hs(): the temporary file
     template "%s%shs-XXXXXXXXXXXXX" was given its arguments as
     (G_DIR_SEPARATOR_S, hs_cache_dir), which expands to
     "/<dir>hs-XXXXXXXXXXXXX". The arguments are swapped so the temporary
     file is created as "<dir>/hs-XXXXXXXXXXXXX", i.e. in the same
     directory that the later rename() to "<dir>/<hash>.hs" targets.

* Not changed, flagged for follow-up: rspamd_hyperscan_validate_header()
  dereferences `data` without a NULL check, and the write() call in
  rspamd_multipattern_try_save_hs() treats only -1 as failure, so a short
  write would still be renamed into place.

diff --git a/src/libserver/hyperscan_tools.cxx b/src/libserver/hyperscan_tools.cxx
index 27094c01d6..a814e1cfab 100644
--- a/src/libserver/hyperscan_tools.cxx
+++ b/src/libserver/hyperscan_tools.cxx
@@ -928,4 +928,224 @@ gboolean rspamd_hyperscan_create_shared_unser(const char *serialized_data,
 	return TRUE;
 }
 
+/* Unified hyperscan format magic */
+static const unsigned char rspamd_hs_magic[] = {'r', 's', 'h', 's', 'r', 'e', '1', '1'};
+#define RSPAMD_HS_MAGIC_LEN (sizeof(rspamd_hs_magic))
+
+gboolean rspamd_hyperscan_serialize_with_header(hs_database_t *db,
+												const unsigned int *ids,
+												const unsigned int *flags,
+												unsigned int n,
+												char **out_data,
+												gsize *out_len)
+{
+	if (!db || !out_data || !out_len) {
+		return FALSE;
+	}
+
+	/* Serialize the database - hyperscan allocates the buffer */
+	char *ser_bytes = nullptr;
+	std::size_t ser_size = 0;
+	if (hs_serialize_database(db, &ser_bytes, &ser_size) != HS_SUCCESS) {
+		msg_err_hyperscan("failed to serialize database");
+		return FALSE;
+	}
+
+	/* Get platform info */
+	hs_platform_info_t plt;
+	if (hs_populate_platform(&plt) != HS_SUCCESS) {
+		free(ser_bytes);
+		msg_err_hyperscan("failed to get platform info");
+		return FALSE;
+	}
+
+	/* Calculate header size */
+	std::size_t header_size = RSPAMD_HS_MAGIC_LEN +
+							  sizeof(plt) +
+							  sizeof(n) +
+							  (n > 0 ? sizeof(unsigned int) * n * 2 : 0) +
+							  sizeof(uint64_t); /* CRC */
+
+	std::size_t total_size = header_size + ser_size;
+
+	/* Allocate buffer */
+	char *buf = static_cast<char *>(g_malloc(total_size));
+	char *p = buf;
+
+	/* Magic */
+	memcpy(p, rspamd_hs_magic, RSPAMD_HS_MAGIC_LEN);
+	p += RSPAMD_HS_MAGIC_LEN;
+
+	/* Platform */
+	memcpy(p, &plt, sizeof(plt));
+	p += sizeof(plt);
+
+	/* Count */
+	memcpy(p, &n, sizeof(n));
+	p += sizeof(n);
+
+	/* IDs and flags - remember positions for CRC calculation */
+	char *ids_start = p;
+	if (n > 0 && ids && flags) {
+		memcpy(p, ids, sizeof(unsigned int) * n);
+		p += sizeof(unsigned int) * n;
+		memcpy(p, flags, sizeof(unsigned int) * n);
+		p += sizeof(unsigned int) * n;
+	}
+	else if (n > 0) {
+		memset(p, 0, sizeof(unsigned int) * n * 2);
+		p += sizeof(unsigned int) * n * 2;
+	}
+
+	/* CRC over IDs + flags + HS blob (compatible with re_cache.c format) */
+	rspamd_cryptobox_fast_hash_state_t crc_st;
+	rspamd_cryptobox_fast_hash_init(&crc_st, 0xdeadbabe);
+	if (n > 0) {
+		/* IDs */
+		rspamd_cryptobox_fast_hash_update(&crc_st, ids_start, sizeof(unsigned int) * n);
+		/* Flags */
+		rspamd_cryptobox_fast_hash_update(&crc_st, ids_start + sizeof(unsigned int) * n,
+										  sizeof(unsigned int) * n);
+	}
+	/* HS database */
+	rspamd_cryptobox_fast_hash_update(&crc_st, ser_bytes, ser_size);
+	uint64_t crc = rspamd_cryptobox_fast_hash_final(&crc_st);
+
+	memcpy(p, &crc, sizeof(crc));
+	p += sizeof(crc);
+
+	/* Copy serialized database */
+	memcpy(p, ser_bytes, ser_size);
+	free(ser_bytes);
+
+	*out_data = buf;
+	*out_len = total_size;
+
+	return TRUE;
+}
+
+static GQuark rspamd_hyperscan_quark(void)
+{
+	return g_quark_from_static_string("hyperscan");
+}
+
+gboolean rspamd_hyperscan_validate_header(const char *data,
+										  gsize len,
+										  GError **err)
+{
+	if (len < RSPAMD_HS_MAGIC_LEN) {
+		g_set_error(err, rspamd_hyperscan_quark(), EINVAL, "data too small");
+		return FALSE;
+	}
+
+	/* Check magic */
+	if (memcmp(data, rspamd_hs_magic, RSPAMD_HS_MAGIC_LEN) != 0) {
+		g_set_error(err, rspamd_hyperscan_quark(), EINVAL, "invalid magic");
+		return FALSE;
+	}
+
+	const char *p = data + RSPAMD_HS_MAGIC_LEN;
+	const char *end = data + len;
+
+	/* Check platform */
+	if (static_cast<std::size_t>(end - p) < sizeof(hs_platform_info_t)) {
+		g_set_error(err, rspamd_hyperscan_quark(), EINVAL, "truncated platform info");
+		return FALSE;
+	}
+
+	hs_platform_info_t stored_plt;
+	memcpy(&stored_plt, p, sizeof(stored_plt));
+	p += sizeof(stored_plt);
+
+	hs_platform_info_t cur_plt;
+	if (hs_populate_platform(&cur_plt) != HS_SUCCESS) {
+		g_set_error(err, rspamd_hyperscan_quark(), EINVAL, "cannot get current platform");
+		return FALSE;
+	}
+
+	if (stored_plt.tune != cur_plt.tune) {
+		g_set_error(err, rspamd_hyperscan_quark(), EINVAL, "platform mismatch");
+		return FALSE;
+	}
+
+	/* Read count */
+	if (static_cast<std::size_t>(end - p) < sizeof(unsigned int)) {
+		g_set_error(err, rspamd_hyperscan_quark(), EINVAL, "truncated count");
+		return FALSE;
+	}
+
+	unsigned int n;
+	memcpy(&n, p, sizeof(n));
+	p += sizeof(n);
+
+	/* Remember start of IDs for CRC calculation */
+	const char *ids_start = p;
+	std::size_t arrays_size = (n > 0) ? sizeof(unsigned int) * n * 2 : 0;
+	if (static_cast<std::size_t>(end - p) < arrays_size + sizeof(uint64_t)) {
+		g_set_error(err, rspamd_hyperscan_quark(), EINVAL, "truncated arrays or CRC");
+		return FALSE;
+	}
+
+	p += arrays_size;
+
+	/* Verify CRC (over IDs + flags + HS blob, compatible with re_cache.c) */
+	uint64_t stored_crc;
+	memcpy(&stored_crc, p, sizeof(stored_crc));
+	p += sizeof(stored_crc);
+
+	const char *hs_blob = p;
+	std::size_t hs_len = end - p;
+
+	rspamd_cryptobox_fast_hash_state_t crc_st;
+	rspamd_cryptobox_fast_hash_init(&crc_st, 0xdeadbabe);
+	if (n > 0) {
+		/* IDs */
+		rspamd_cryptobox_fast_hash_update(&crc_st, ids_start, sizeof(unsigned int) * n);
+		/* Flags */
+		rspamd_cryptobox_fast_hash_update(&crc_st, ids_start + sizeof(unsigned int) * n,
+										  sizeof(unsigned int) * n);
+	}
+	/* HS database */
+	rspamd_cryptobox_fast_hash_update(&crc_st, hs_blob, hs_len);
+	uint64_t calc_crc = rspamd_cryptobox_fast_hash_final(&crc_st);
+
+	if (stored_crc != calc_crc) {
+		g_set_error(err, rspamd_hyperscan_quark(), EINVAL, "CRC mismatch");
+		return FALSE;
+	}
+
+	return TRUE;
+}
+
+rspamd_hyperscan_t *rspamd_hyperscan_load_from_header(const char *data,
+													  gsize len,
+													  GError **err)
+{
+	if (!rspamd_hyperscan_validate_header(data, len, err)) {
+		return nullptr;
+	}
+
+	/* Skip to HS blob */
+	const char *p = data + RSPAMD_HS_MAGIC_LEN + sizeof(hs_platform_info_t);
+	unsigned int n;
+	memcpy(&n, p, sizeof(n));
+	p += sizeof(n);
+
+	/* Skip IDs and flags */
+	p += (n > 0) ? sizeof(unsigned int) * n * 2 : 0;
+	/* Skip CRC */
+	p += sizeof(uint64_t);
+
+	std::size_t hs_len = len - (p - data);
+
+	hs_database_t *db = nullptr;
+	if (hs_deserialize_database(p, hs_len, &db) != HS_SUCCESS) {
+		g_set_error(err, rspamd_hyperscan_quark(), EINVAL, "deserialize failed");
+		return nullptr;
+	}
+
+	auto *ndb = new rspamd::util::hs_shared_database{db, nullptr};
+	return C_DB_FROM_CXX(ndb);
+}
+
 #endif// WITH_HYPERSCAN
\ No newline at end of file
diff --git a/src/libserver/hyperscan_tools.h b/src/libserver/hyperscan_tools.h
index aac9897f0d..e923323161 100644
--- a/src/libserver/hyperscan_tools.h
+++ b/src/libserver/hyperscan_tools.h
@@ -104,6 +104,47 @@ gboolean rspamd_hyperscan_create_shared_unser(const char *serialized_data,
 											  int *out_fd,
 											  gsize *out_size);
 
+/**
+ * Serialize a hyperscan database with unified header format.
+ * Format: [magic 8][platform][count][ids][flags][crc64][hs_blob]
+ * @param db hyperscan database to serialize
+ * @param ids array of pattern IDs (can be NULL)
+ * @param flags array of pattern flags (can be NULL)
+ * @param n number of patterns (0 if ids/flags not provided)
+ * @param[out] out_data pointer to allocated data (caller must g_free)
+ * @param[out] out_len size of serialized data
+ * @return TRUE on success
+ */
+gboolean rspamd_hyperscan_serialize_with_header(hs_database_t *db,
+												const unsigned int *ids,
+												const unsigned int *flags,
+												unsigned int n,
+												char **out_data,
+												gsize *out_len);
+
+/**
+ * Load a hyperscan database from unified format blob.
+ * Validates magic, platform, and CRC before deserializing.
+ * @param data serialized data with header
+ * @param len size of data
+ * @param[out] err error message if validation fails (can be NULL)
+ * @return database wrapper or NULL on error
+ */
+rspamd_hyperscan_t *rspamd_hyperscan_load_from_header(const char *data,
+													  gsize len,
+													  GError **err);
+
+/**
+ * Validate a unified format blob without deserializing.
+ * @param data serialized data with header
+ * @param len size of data
+ * @param[out] err error message if validation fails (can be NULL)
+ * @return TRUE if valid
+ */
+gboolean rspamd_hyperscan_validate_header(const char *data,
+										  gsize len,
+										  GError **err);
+
 G_END_DECLS
 
 #endif
diff --git a/src/libutil/multipattern.c b/src/libutil/multipattern.c
index 9ae798bb9d..9b72931aa9 100644
--- a/src/libutil/multipattern.c
+++ b/src/libutil/multipattern.c
@@ -396,16 +396,37 @@ rspamd_multipattern_try_load_hs(struct rspamd_multipattern *mp,
 								const unsigned char *hash)
 {
 	char fp[PATH_MAX];
+	gchar *data;
+	gsize len;
+	GError *err = NULL;
 
 	if (hs_cache_dir == NULL) {
 		return FALSE;
 	}
 
-	rspamd_snprintf(fp, sizeof(fp), "%s/%*xs.hsmp", hs_cache_dir,
+	rspamd_snprintf(fp, sizeof(fp), "%s/%*xs.hs", hs_cache_dir,
 					(int) rspamd_cryptobox_HASHBYTES / 2, hash);
-	mp->hs_db = rspamd_hyperscan_maybe_load(fp, 0);
 
-	return mp->hs_db != NULL;
+	if (!g_file_get_contents(fp, &data, &len, &err)) {
+		if (err) {
+			msg_debug("cannot read hyperscan cache %s: %s", fp, err->message);
+			g_error_free(err);
+		}
+		return FALSE;
+	}
+
+	mp->hs_db = rspamd_hyperscan_load_from_header(data, len, &err);
+	g_free(data);
+
+	if (mp->hs_db == NULL) {
+		if (err) {
+			msg_debug("cannot load hyperscan cache %s: %s", fp, err->message);
+			g_error_free(err);
+		}
+		return FALSE;
+	}
+
+	return TRUE;
 }
 
 static void
@@ -421,23 +442,26 @@ rspamd_multipattern_try_save_hs(struct rspamd_multipattern *mp,
 		return;
 	}
 
-	rspamd_snprintf(fp, sizeof(fp), "%s%shsmp-XXXXXXXXXXXXX", G_DIR_SEPARATOR_S,
-					hs_cache_dir);
+	rspamd_snprintf(fp, sizeof(fp), "%s%shs-XXXXXXXXXXXXX", hs_cache_dir,
+					G_DIR_SEPARATOR_S);
 
 	if ((fd = g_mkstemp_full(fp, O_CREAT | O_EXCL | O_WRONLY, 00644)) != -1) {
-		int ret;
-		if ((ret = hs_serialize_database(rspamd_hyperscan_get_database(mp->hs_db), &bytes, &len)) == HS_SUCCESS) {
+		/* Serialize with unified header format (magic, platform, CRC) */
+		if (rspamd_hyperscan_serialize_with_header(
+				rspamd_hyperscan_get_database(mp->hs_db),
+				NULL, NULL, 0, /* No IDs/flags needed for multipattern */
+				&bytes, &len)) {
 			if (write(fd, bytes, len) == -1) {
 				msg_warn("cannot write hyperscan cache to %s: %s",
 						 fp, strerror(errno));
 				unlink(fp);
-				free(bytes);
+				g_free(bytes);
 			}
 			else {
-				free(bytes);
+				g_free(bytes);
 				fsync(fd);
 
-				rspamd_snprintf(np, sizeof(np), "%s/%*xs.hsmp", hs_cache_dir,
+				rspamd_snprintf(np, sizeof(np), "%s/%*xs.hs", hs_cache_dir,
 								(int) rspamd_cryptobox_HASHBYTES / 2, hash);
 
 				if (rename(fp, np) == -1) {
@@ -451,12 +475,10 @@ rspamd_multipattern_try_save_hs(struct rspamd_multipattern *mp,
 			}
 		}
 		else {
-			msg_warn("cannot serialize hyperscan cache to %s: error code %d",
-					 fp, ret);
+			msg_warn("cannot serialize hyperscan cache to %s", fp);
 			unlink(fp);
 		}
 
-
 		close(fd);
 	}
 	else {
diff --git a/src/lua/lua_hyperscan.cxx b/src/lua/lua_hyperscan.cxx
index e649c9ebf0..d85d2d358e 100644
--- a/src/lua/lua_hyperscan.cxx
+++ b/src/lua/lua_hyperscan.cxx
@@ -316,7 +316,8 @@ lua_hyperscan_serialize(lua_State *L)
 	memcpy(p, &n, sizeof(n));
 	p += sizeof(n);
 
-	/* IDs */
+	/* IDs - remember position for CRC */
+	char *ids_start = p;
 	if (n > 0) {
 		memcpy(p, ids.data(), sizeof(unsigned int) * n);
 		p += sizeof(unsigned int) * n;
@@ -331,8 +332,17 @@ lua_hyperscan_serialize(lua_State *L)
 		p += sizeof(unsigned int) * n;
 	}
 
-	/* Calculate CRC over header (excluding CRC field itself) */
-	uint64_t crc = rspamd_cryptobox_fast_hash(buf, p - buf, 0xdeadbabe);
+	/* Calculate CRC over IDs + flags + HS blob (compatible with re_cache.c) */
+	rspamd_cryptobox_fast_hash_state_t crc_st;
+	rspamd_cryptobox_fast_hash_init(&crc_st, 0xdeadbabe);
+	if (n > 0) {
+		rspamd_cryptobox_fast_hash_update(&crc_st, ids_start, sizeof(unsigned int) * n);
+		rspamd_cryptobox_fast_hash_update(&crc_st, ids_start + sizeof(unsigned int) * n,
+										  sizeof(unsigned int) * n);
+	}
+	rspamd_cryptobox_fast_hash_update(&crc_st, ser_bytes, ser_size);
+	uint64_t crc = rspamd_cryptobox_fast_hash_final(&crc_st);
+
 	memcpy(p, &crc, sizeof(crc));
 	p += sizeof(crc);
 
@@ -428,7 +438,8 @@ lua_hyperscan_validate(lua_State *L)
 		return 2;
 	}
 
-	/* Skip IDs and flags */
+	/* Remember start of IDs for CRC calculation */
+	const char *ids_start = p;
 	size_t arrays_size = (n > 0) ? sizeof(unsigned int) * n * 2 : 0;
 	if ((size_t) (end - p) < arrays_size + sizeof(uint64_t)) {
 		lua_pushboolean(L, false);
@@ -438,12 +449,24 @@ lua_hyperscan_validate(lua_State *L)
 
 	p += arrays_size;
 
-	/* Verify CRC */
+	/* Verify CRC (over IDs + flags + HS blob, compatible with re_cache.c) */
 	uint64_t stored_crc;
 	memcpy(&stored_crc, p, sizeof(stored_crc));
 	p += sizeof(stored_crc);
 
-	uint64_t calc_crc = rspamd_cryptobox_fast_hash(data, p - data - sizeof(uint64_t), 0xdeadbabe);
+	const char *hs_blob = p;
+	size_t hs_len = end - p;
+
+	rspamd_cryptobox_fast_hash_state_t crc_st;
+	rspamd_cryptobox_fast_hash_init(&crc_st, 0xdeadbabe);
+	if (n > 0) {
+		rspamd_cryptobox_fast_hash_update(&crc_st, ids_start, sizeof(unsigned int) * n);
+		rspamd_cryptobox_fast_hash_update(&crc_st, ids_start + sizeof(unsigned int) * n,
+										  sizeof(unsigned int) * n);
+	}
+	rspamd_cryptobox_fast_hash_update(&crc_st, hs_blob, hs_len);
+	uint64_t calc_crc = rspamd_cryptobox_fast_hash_final(&crc_st);
+
 	if (stored_crc != calc_crc) {
 		lua_pushboolean(L, false);
 		lua_pushstring(L, "CRC mismatch");
@@ -451,7 +474,6 @@ lua_hyperscan_validate(lua_State *L)
 	}
 
 	/* Validate hyperscan portion */
-	size_t hs_len = end - p;
 	if (hs_len == 0) {
 		lua_pushboolean(L, false);
 		lua_pushstring(L, "empty hyperscan database");