From: Vsevolod Stakhov Date: Wed, 8 Oct 2025 14:58:58 +0000 (+0100) Subject: [Fix] Fix memory leaks in HTML shingles generation X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=fff8bc26b976dd9201ccf58dc921df809a8de61f;p=thirdparty%2Frspamd.git [Fix] Fix memory leaks in HTML shingles generation - Require mempool parameter (cannot be NULL) for consistent memory management - Change helper function to fill shingle structure in-place instead of allocating - Eliminate unnecessary allocation, memcpy, and potential memory leaks - All allocations now use rspamd_mempool_alloc0 consistently --- diff --git a/src/libutil/shingles.h b/src/libutil/shingles.h index 206e88bbcc..29e097c318 100644 --- a/src/libutil/shingles.h +++ b/src/libutil/shingles.h @@ -106,11 +106,11 @@ struct rspamd_shingle *rspamd_shingles_from_image(unsigned char *dct, * * @param html_content parsed HTML structure (void* to html_content from html.hxx) * @param key secret key (16 bytes) for shingle generation - * @param pool memory pool for allocation + * @param pool memory pool for allocation (required, cannot be NULL) * @param filter shingles filter function (typically rspamd_shingles_default_filter) * @param filterd opaque data for filter * @param alg hashing algorithm (RSPAMD_SHINGLES_MUMHASH recommended) - * @return HTML shingle structure or NULL on error + * @return HTML shingle structure or NULL on error/invalid input */ struct rspamd_html_shingle *rspamd_shingles_from_html(void *html_content, const unsigned char key[16], diff --git a/src/libutil/shingles_html.cxx b/src/libutil/shingles_html.cxx index 33dc44ca91..fbfd379fb2 100644 --- a/src/libutil/shingles_html.cxx +++ b/src/libutil/shingles_html.cxx @@ -325,18 +325,17 @@ hash_html_features(html_content *hc, const unsigned char key[16]) } /* Helper: generate shingles from string tokens (like text shingles but for tokens) */ -static struct rspamd_shingle * -generate_shingles_from_string_tokens(const std::vector &tokens, +static void +generate_shingles_from_string_tokens(struct rspamd_shingle *out, + const std::vector &tokens, const unsigned char key[16], rspamd_shingles_filter filter, gpointer filterd, enum rspamd_shingle_alg alg) { - if (tokens.empty()) { - return nullptr; + if (tokens.empty() || !out) { + return; } - - auto res = new rspamd_shingle; gsize ilen = tokens.size(); gsize hlen = ilen > SHINGLES_WINDOW ? (ilen - SHINGLES_WINDOW + 1) : 1; @@ -393,10 +392,8 @@ generate_shingles_from_string_tokens(const std::vector &tokens, /* Apply filter to get final shingles */ for (unsigned int i = 0; i < RSPAMD_SHINGLE_SIZE; i++) { - res->hashes[i] = filter(hashes[i].data(), hlen, i, key, filterd); + out->hashes[i] = filter(hashes[i].data(), hlen, i, key, filterd); } - - return res; } /* Main function: generate HTML shingles */ @@ -433,15 +430,15 @@ rspamd_shingles_from_html(void *html_content, return nullptr; } - /* Allocate result after all early exit checks to avoid leaks */ - struct rspamd_html_shingle *res; - if (pool) { - res = static_cast(rspamd_mempool_alloc0(pool, sizeof(*res))); - } - else { - res = new rspamd_html_shingle{}; + /* Pool is required for consistent memory management - avoid mixing new/delete with mempool */ + if (!pool) { + return nullptr; } + /* Allocate result after all early exit checks (zeroed) */ + struct rspamd_html_shingle *res = static_cast( + rspamd_mempool_alloc0(pool, sizeof(*res))); + /* 2. Generate direct hash of ALL tokens (for exact matching, like text parts) */ rspamd_cryptobox_hash_state_t st; rspamd_cryptobox_hash_init(&st, key, 16); @@ -452,17 +449,8 @@ rspamd_shingles_from_html(void *html_content, rspamd_cryptobox_hash_final(&st, res->direct_hash); - /* 3. Generate structure shingles from tokens (for fuzzy matching) */ - auto *struct_sgl = generate_shingles_from_string_tokens(tokens, key, filter, filterd, alg); - - if (struct_sgl) { - std::memcpy(&res->structure_shingles, struct_sgl, sizeof(struct rspamd_shingle)); - /* Always delete struct_sgl after copying, regardless of pool */ - delete struct_sgl; - } - else { - std::memset(&res->structure_shingles, 0, sizeof(struct rspamd_shingle)); - } + /* 3. Generate structure shingles from tokens (for fuzzy matching) - fill in-place */ + generate_shingles_from_string_tokens(&res->structure_shingles, tokens, key, filter, filterd, alg); /* 4. Generate CTA domains hash (critical for phishing detection) */ res->cta_domains_hash = hash_domain_list(cta_domains, key);