From: Vsevolod Stakhov Date: Fri, 7 Nov 2025 15:46:43 +0000 (+0000) Subject: [Feature] Add url:get_hash() method for efficient URL deduplication X-Git-Tag: 3.14.0~12^2~3 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=24e4b2a4040f46463b7b0c908eed405a7b24e920;p=thirdparty%2Frspamd.git [Feature] Add url:get_hash() method for efficient URL deduplication Expose rspamd_cryptobox_fast_hash via Lua API to allow hash-based URL deduplication without string conversion overhead. This is critical for handling messages with large numbers of URLs (100k+) where tostring() would fill the Lua string interning table and cause memory issues. Returns 64-bit hash as Lua number, using the same hash function as internal URL storage for consistency. --- diff --git a/src/lua/lua_url.c b/src/lua/lua_url.c index 1d163dce0b..73118f4781 100644 --- a/src/lua/lua_url.c +++ b/src/lua/lua_url.c @@ -67,6 +67,7 @@ LUA_FUNCTION_DEF(url, get_phished); LUA_FUNCTION_DEF(url, set_redirected); LUA_FUNCTION_DEF(url, get_count); LUA_FUNCTION_DEF(url, get_visible); +LUA_FUNCTION_DEF(url, get_hash); LUA_FUNCTION_DEF(url, create); LUA_FUNCTION_DEF(url, init); LUA_FUNCTION_DEF(url, all); @@ -98,6 +99,7 @@ static const struct luaL_reg urllib_m[] = { LUA_INTERFACE_DEF(url, get_visible), LUA_INTERFACE_DEF(url, get_count), + LUA_INTERFACE_DEF(url, get_hash), LUA_INTERFACE_DEF(url, get_flags), LUA_INTERFACE_DEF(url, get_flags_num), LUA_INTERFACE_DEF(url, get_order), @@ -747,6 +749,32 @@ lua_url_get_visible(lua_State *L) return 1; } +/*** + * @method url:get_hash() + * Get fast hash of the url for deduplication purposes. Uses the same hash function + * as internal URL storage (rspamd_cryptobox_fast_hash). This is much more efficient + * than converting URLs to strings for deduplication, especially with large numbers of URLs. 
+ * @return {number} 64-bit hash as a Lua number + */ +static int +lua_url_get_hash(lua_State *L) +{ + LUA_TRACE_POINT; + struct rspamd_lua_url *url = lua_check_url(L, 1); + + if (url != NULL && url->url != NULL && url->url->urllen > 0) { /* hash only when the wrapper, the inner URL and a non-zero length are all present */ + uint64_t hash = rspamd_cryptobox_fast_hash(url->url->string, + url->url->urllen, + rspamd_hash_seed()); + lua_pushnumber(L, (lua_Number) hash); /* NOTE(review): the uint64_t is cast to lua_Number; if lua_Number is a double (the usual Lua default) values above 2^53 are rounded, so not all 64 bits survive - confirm this is acceptable for deduplication */ + } + else { + lua_pushnil(L); /* no usable URL: the caller gets nil instead of a hash */ + } + + return 1; /* exactly one value (number or nil) is left on the Lua stack */ +} + /*** * @method url:to_table() * Return url as a table with the following fields: