From: Vsevolod Stakhov Date: Fri, 8 Dec 2023 09:33:57 +0000 (+0000) Subject: [Rework] Use strings for int64_t X-Git-Tag: 3.8.0~40^2~3 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=33cf745fb1c772c57f45e14de15dc706ed5284d5;p=thirdparty%2Frspamd.git [Rework] Use strings for int64_t It seems there is no easy way to use int64 in Redis Lua; hence, we have to use strings. It's much more expensive, but it still has some advantage over the previous schema. --- diff --git a/lualib/redis_scripts/bayes_classify.lua b/lualib/redis_scripts/bayes_classify.lua index c999609e5d..9bef96f145 100644 --- a/lualib/redis_scripts/bayes_classify.lua +++ b/lualib/redis_scripts/bayes_classify.lua @@ -1,10 +1,9 @@ -- Lua script to perform bayes classification -- This script accepts the following parameters: -- key1 - prefix for bayes tokens (e.g. for per-user classification) --- key2 - set of tokens encoded in messagepack array of int64_t +-- key2 - set of tokens encoded in messagepack array of strings local prefix = KEYS[1] -local input_tokens = cmsgpack.unpack(KEYS[2]) local output_spam = {} local output_ham = {} @@ -17,8 +16,9 @@ local prefix_underscore = prefix .. '_' -- This optimisation will save a lot of space for sparse tokens, and in Bayes that assumption is normally held if learned_ham > 0 and learned_spam > 0 then + local input_tokens = cmsgpack.unpack(KEYS[2]) for i, token in ipairs(input_tokens) do - local token_data = redis.call('HMGET', prefix_underscore .. tostring(token), 'H', 'S') + local token_data = redis.call('HMGET', prefix_underscore .. 
token, 'H', 'S') if token_data then local ham_count = token_data[1] diff --git a/lualib/redis_scripts/bayes_learn.lua b/lualib/redis_scripts/bayes_learn.lua index 6382547067..7536f68085 100644 --- a/lualib/redis_scripts/bayes_learn.lua +++ b/lualib/redis_scripts/bayes_learn.lua @@ -4,7 +4,7 @@ -- key2 - boolean is_spam -- key3 - string symbol -- key4 - boolean is_unlearn --- key5 - set of tokens encoded in messagepack array of int64_t +-- key5 - set of tokens encoded in messagepack array of strings local prefix = KEYS[1] local is_spam = KEYS[2] == 'true' and true or false @@ -21,5 +21,5 @@ redis.call('HSET', prefix, 'version', '2') -- new schema redis.call('HINCRBY', prefix, learned_key, is_unlearn and -1 or 1) -- increase or decrease learned count for _, token in ipairs(input_tokens) do - redis.call('HINCRBY', prefix_underscore .. tostring(token), hash_key, 1) + redis.call('HINCRBY', prefix_underscore .. token, hash_key, 1) end \ No newline at end of file diff --git a/lualib/redis_scripts/bayes_stat.lua b/lualib/redis_scripts/bayes_stat.lua new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/libstat/backends/redis_backend.cxx b/src/libstat/backends/redis_backend.cxx index 342fa02739..0eddf26cbe 100644 --- a/src/libstat/backends/redis_backend.cxx +++ b/src/libstat/backends/redis_backend.cxx @@ -657,13 +657,13 @@ void rspamd_redis_close(gpointer p) static char * rspamd_redis_serialize_tokens(struct rspamd_task *task, GPtrArray *tokens, gsize *ser_len) { - /* Each token is int64_t that requires 9 bytes + 4 bytes array len + 1 byte array magic */ - gsize req_len = tokens->len * 9 + 5, i; - gchar *buf, *p; + /* Each token is int64_t that requires 10 bytes (2 int32_t) + 4 bytes array len + 1 byte array magic */ + char max_int64_str[] = "18446744073709551615"; + auto req_len = tokens->len * sizeof(max_int64_str) + 5; rspamd_token_t *tok; - buf = (gchar *) rspamd_mempool_alloc(task->task_pool, req_len); - p = buf; + auto *buf = (gchar *) 
rspamd_mempool_alloc(task->task_pool, req_len); + auto *p = buf; /* Array */ *p++ = (gchar) 0xdd; @@ -673,13 +673,15 @@ rspamd_redis_serialize_tokens(struct rspamd_task *task, GPtrArray *tokens, gsize *p++ = (gchar) ((tokens->len >> 8) & 0xff); *p++ = (gchar) (tokens->len & 0xff); + int i; PTR_ARRAY_FOREACH(tokens, i, tok) { - *p++ = (gchar) 0xd3; + char numbuf[sizeof(max_int64_str)]; + auto r = rspamd_snprintf(numbuf, sizeof(numbuf), "%uL", tok->data); + *p++ = (gchar) ((r & 0xff) | 0xa0); - guint64 val = GUINT64_TO_BE(tok->data); - memcpy(p, &val, sizeof(val)); - p += sizeof(val); + memcpy(p, &numbuf, r); + p += r; } *ser_len = p - buf;