From: Vsevolod Stakhov Date: Thu, 18 Jun 2026 14:12:09 +0000 (+0100) Subject: [Feature] lua_text: byte-distribution statistics methods X-Git-Tag: 4.1.1~4^2~1 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=ce36f6302400af89405bf9f8dd1d5a8a0b16dfaa;p=thirdparty%2Frspamd.git [Feature] lua_text: byte-distribution statistics methods Add byte-distribution statistics as methods on the rspamd_text class, implemented in C++20 under src/lua (lua_text_stats.{hxx,cxx}); lua_text.c is left untouched and the rspamd{text} metatable is augmented at load. Methods (each takes an optional 0-based (off, len) range, defaulting to the whole buffer): - text:entropy([off[, len]]) Shannon entropy, bits/byte - text:byte_mean([off[, len]]) mean of unsigned byte values - text:byte_deviation(mean[, off[, len]]) mean abs deviation from mean - text:serial_correlation([off[, len]]) ENT serial correlation - text:monte_carlo_pi([off[, len]]) ENT Monte-Carlo Pi deviation The core is header-only, allocation-free and O(n) (a single histogram pass shared by entropy/mean/deviation) and produces deterministic, bit-reproducible results. Offsets are byte offsets, 0-based; the range is clamped to the buffer and an out-of-range or empty range yields 0. Add C++ doctest golden-vector tests (analytically-derived exact values) and Lua unit tests covering empty/single-byte/uniform/two-symbol buffers, overlapping groups, slicing and edge cases. 
--- diff --git a/src/lua/CMakeLists.txt b/src/lua/CMakeLists.txt index d2e1ebc505..6d08eeafd8 100644 --- a/src/lua/CMakeLists.txt +++ b/src/lua/CMakeLists.txt @@ -30,6 +30,7 @@ SET(LUASRC ${CMAKE_CURRENT_SOURCE_DIR}/lua_common.c ${CMAKE_CURRENT_SOURCE_DIR}/lua_dns.c ${CMAKE_CURRENT_SOURCE_DIR}/lua_udp.c ${CMAKE_CURRENT_SOURCE_DIR}/lua_text.c + ${CMAKE_CURRENT_SOURCE_DIR}/lua_text_stats.cxx ${CMAKE_CURRENT_SOURCE_DIR}/lua_worker.c ${CMAKE_CURRENT_SOURCE_DIR}/lua_kann.c ${CMAKE_CURRENT_SOURCE_DIR}/lua_spf.c diff --git a/src/lua/lua_common.c b/src/lua/lua_common.c index 86a8b0a487..0b0b8b11d1 100644 --- a/src/lua/lua_common.c +++ b/src/lua/lua_common.c @@ -977,6 +977,7 @@ rspamd_lua_init(bool wipe_mem) luaopen_ip(L); luaopen_expression(L); luaopen_text(L); + rspamd_lua_text_stats_init(L); luaopen_util(L); luaopen_tcp(L); luaopen_html(L); diff --git a/src/lua/lua_common.h b/src/lua/lua_common.h index 265d42e1b3..6d399e02be 100644 --- a/src/lua/lua_common.h +++ b/src/lua/lua_common.h @@ -440,6 +440,13 @@ void luaopen_logger(lua_State *L); void luaopen_text(lua_State *L); +/* + * Augments the rspamd{text} metatable with byte-statistics methods + * (entropy/byte_mean/...). Must be called after luaopen_text. Defined in + * lua_text_stats.cxx. + */ +void rspamd_lua_text_stats_init(lua_State *L); + void luaopen_util(lua_State *L); void luaopen_tcp(lua_State *L); diff --git a/src/lua/lua_text_stats.cxx b/src/lua/lua_text_stats.cxx new file mode 100644 index 0000000000..76aa03917f --- /dev/null +++ b/src/lua/lua_text_stats.cxx @@ -0,0 +1,206 @@ +/* + * Copyright 2026 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "lua_common.h" +#include "lua_text_stats.hxx" + +#include +#include +#include + +/*** + * @module rspamd_text + * The following byte-distribution statistics methods are added to the + * `rspamd_text` class. They produce deterministic, bit-reproducible results so + * callers can compare them against fixed thresholds. Each takes an optional + * byte range: + * + * - `off` is a 0-based byte offset, defaulting to 0; + * - `len` is a byte count, defaulting to the rest of the buffer after `off`. + * + * The range is clamped to the buffer (`off` in `[0, #text)`, `len` truncated to + * the bytes available after `off`); an out-of-range or empty range yields 0 for + * every metric. + */ + +using namespace rspamd::text_stats; + +/* + * Validate the optional (off, len) range of a text and return it as a byte + * span: off must lie in [0, size) and len is truncated to the bytes available + * after off. An out-of-range or empty request yields an empty span. 
+ */ +static std::span +lua_text_stats_slice(lua_State *L, const struct rspamd_lua_text *t, + int off_idx, int len_idx) +{ + const auto n = static_cast(t->len); + const auto *base = reinterpret_cast(t->start); + + lua_Integer off = luaL_optinteger(L, off_idx, 0); + + if (off < 0 || static_cast(off) >= n) { + return {}; + } + + const std::size_t avail = n - static_cast(off); + lua_Integer len = luaL_optinteger(L, len_idx, static_cast(avail)); + + if (len <= 0) { + return {}; + } + + const std::size_t take = std::min(static_cast(len), avail); + + return {base + off, take}; +} + +/*** + * @method text:entropy([off[, len]]) + * Shannon entropy of the byte range in bits/byte, in [0, 8]. + * @param {number} off optional 0-based byte offset (default 0) + * @param {number} len optional byte count (default: to end of text) + * @return {number} entropy in bits per byte + */ +static int +lua_text_entropy(lua_State *L) +{ + LUA_TRACE_POINT; + auto *t = lua_check_text(L, 1); + + if (t == nullptr) { + return luaL_error(L, "invalid arguments"); + } + + lua_pushnumber(L, entropy(lua_text_stats_slice(L, t, 2, 3))); + + return 1; +} + +/*** + * @method text:byte_mean([off[, len]]) + * Arithmetic mean of the (unsigned) byte values in the range. + * @param {number} off optional 0-based byte offset (default 0) + * @param {number} len optional byte count (default: to end of text) + * @return {number} mean byte value + */ +static int +lua_text_byte_mean(lua_State *L) +{ + LUA_TRACE_POINT; + auto *t = lua_check_text(L, 1); + + if (t == nullptr) { + return luaL_error(L, "invalid arguments"); + } + + lua_pushnumber(L, byte_mean(lua_text_stats_slice(L, t, 2, 3))); + + return 1; +} + +/*** + * @method text:byte_deviation(mean[, off[, len]]) + * Mean absolute deviation of the byte values from `mean` (typically + * `byte_mean` of the same range). 
+ * @param {number} mean reference mean value + * @param {number} off optional 0-based byte offset (default 0) + * @param {number} len optional byte count (default: to end of text) + * @return {number} mean absolute deviation + */ +static int +lua_text_byte_deviation(lua_State *L) +{ + LUA_TRACE_POINT; + auto *t = lua_check_text(L, 1); + + if (t == nullptr) { + return luaL_error(L, "invalid arguments"); + } + + double mean = static_cast(luaL_checknumber(L, 2)); + + lua_pushnumber(L, byte_deviation(lua_text_stats_slice(L, t, 3, 4), mean)); + + return 1; +} + +/*** + * @method text:serial_correlation([off[, len]]) + * Serial correlation coefficient (ENT) of the byte range. + * @param {number} off optional 0-based byte offset (default 0) + * @param {number} len optional byte count (default: to end of text) + * @return {number} serial correlation coefficient + */ +static int +lua_text_serial_correlation(lua_State *L) +{ + LUA_TRACE_POINT; + auto *t = lua_check_text(L, 1); + + if (t == nullptr) { + return luaL_error(L, "invalid arguments"); + } + + lua_pushnumber(L, serial_correlation(lua_text_stats_slice(L, t, 2, 3))); + + return 1; +} + +/*** + * @method text:monte_carlo_pi([off[, len]]) + * Monte-Carlo Pi metric (ENT) of the byte range: the normalized deviation from + * Pi, `fabs((4*inmont/groups - PI) / PI)` (note: this is the deviation from Pi, + * not Pi itself). 
+ * @param {number} off optional 0-based byte offset (default 0) + * @param {number} len optional byte count (default: to end of text) + * @return {number} normalized deviation from Pi + */ +static int +lua_text_monte_carlo_pi(lua_State *L) +{ + LUA_TRACE_POINT; + auto *t = lua_check_text(L, 1); + + if (t == nullptr) { + return luaL_error(L, "invalid arguments"); + } + + lua_pushnumber(L, monte_carlo_pi(lua_text_stats_slice(L, t, 2, 3))); + + return 1; +} + +static const struct luaL_reg text_stats_m[] = { + {"entropy", lua_text_entropy}, + {"byte_mean", lua_text_byte_mean}, + {"byte_deviation", lua_text_byte_deviation}, + {"serial_correlation", lua_text_serial_correlation}, + {"monte_carlo_pi", lua_text_monte_carlo_pi}, + {nullptr, nullptr}, +}; + +void rspamd_lua_text_stats_init(lua_State *L) +{ + /* + * Augment the existing rspamd{text} metatable (created by luaopen_text) + * with the statistics methods, so lua_text.c stays untouched. Methods live + * directly on the metatable (rspamd_lua_new_class sets __index = metatable). + */ + rspamd_lua_class_metatable(L, rspamd_text_classname); + luaL_register(L, nullptr, text_stats_m); + lua_pop(L, 1); +} diff --git a/src/lua/lua_text_stats.hxx b/src/lua/lua_text_stats.hxx new file mode 100644 index 0000000000..fe3b5759c4 --- /dev/null +++ b/src/lua/lua_text_stats.hxx @@ -0,0 +1,255 @@ +/* + * Copyright 2026 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef RSPAMD_LUA_TEXT_STATS_HXX +#define RSPAMD_LUA_TEXT_STATS_HXX +#pragma once + +/* + * Byte-distribution statistics over a buffer: Shannon entropy, byte mean and + * mean absolute deviation, and the serial-correlation and Monte-Carlo-Pi + * randomness metrics (the classic ENT test-suite formulas, John Walker, + * fourmilab.ch/random). They are specified to produce deterministic, + * portable, bit-reproducible results so callers can compare them against + * fixed thresholds. + * + * These are pure, header-only, allocation-free C++; the Lua bindings in + * lua_text_stats.cxx and the C++ unit tests both call them directly. + * + * Two accumulation strategies are used, and the order is part of the contract + * (floating-point addition is not associative, so it affects the exact bits): + * - entropy / mean / deviation build a 256-bin histogram of the byte values + * and accumulate over the bins 0..255 - the order is fixed by the bin + * index (byte value), NOT by buffer position. + * - serial_correlation / monte_carlo_pi walk the buffer in order with no + * histogram. + * + * std::log2 is used directly for the entropy term. + */ + +#include +#include +#include +#include +#include + +namespace rspamd::text_stats { + +/* 256-bin histogram of byte values plus the total number of bytes. */ +struct byte_distribution { + std::array bins{}; + std::uint64_t total = 0; +}; + +/* Single O(n) pass; shared by entropy/mean/deviation (no allocation). */ +inline byte_distribution +make_distribution(std::span data) noexcept +{ + byte_distribution dist{}; + + for (auto b: data) { + dist.bins[std::to_integer(b)]++; + } + + dist.total = data.size(); + + return dist; +} + +/* + * Shannon entropy in bits/byte over [0, 8]: + * total = Σ bins; for each non-empty bin: x = bin/total; entropy -= x*log2(x) + * Empty input has no non-empty bins, so entropy is 0 with no division. 
+ */ +inline double +entropy(const byte_distribution &dist) noexcept +{ + double entropy = 0.0; + + if (dist.total == 0) { + return 0.0; + } + + for (std::size_t i = 0; i < 256; i++) { + if (dist.bins[i] != 0) { + double x = (double) dist.bins[i] / (double) dist.total; + entropy -= x * std::log2(x); + } + } + + return entropy; +} + +inline double +entropy(std::span data) noexcept +{ + return entropy(make_distribution(data)); +} + +/* + * Arithmetic mean of byte values: + * sum = Σ_{i=0..255} (double) i * bins[i]; return sum / total + * Uses the unsigned byte value i (0..255). + */ +inline double +byte_mean(const byte_distribution &dist) noexcept +{ + double sum = 0.0; + + if (dist.total == 0) { + return 0.0; + } + + for (std::size_t i = 0; i < 256; i++) { + sum += (double) i * (double) dist.bins[i]; + } + + return sum / (double) dist.total; +} + +inline double +byte_mean(std::span data) noexcept +{ + return byte_mean(make_distribution(data)); +} + +/* + * Mean absolute deviation from `mean`: + * sum = Σ_{i=0..255} fabs((double) i - mean) * bins[i]; return sum / total + * The reference mean is supplied by the caller (typically byte_mean of the + * same range). + */ +inline double +byte_deviation(const byte_distribution &dist, double mean) noexcept +{ + double sum = 0.0; + + if (dist.total == 0) { + return 0.0; + } + + for (std::size_t i = 0; i < 256; i++) { + sum += std::fabs((double) i - mean) * (double) dist.bins[i]; + } + + return sum / (double) dist.total; +} + +inline double +byte_deviation(std::span data, double mean) noexcept +{ + return byte_deviation(make_distribution(data), mean); +} + +/* + * Serial correlation coefficient (ENT algorithm), walking the buffer in order: + * for each byte u: scct1 += last*u; scct2 += u; scct3 += u*u; last = u + * (the first byte is remembered as `first`) + * scct1 += last*first; scct2 *= scct2 + * scc = n*scct3 - scct2 + * scc = (scc == 0) ? 
-100000 : (n*scct1 - scct2) / scc + * A single byte therefore yields the -100000 sentinel; an empty buffer + * yields 0. + */ +inline double +serial_correlation(std::span data) noexcept +{ + const std::size_t n = data.size(); + + if (n == 0) { + return 0.0; + } + + double sccun = 0, sccfirst = 0, scclast = 0; + double scct1 = 0, scct2 = 0, scct3 = 0; + + for (std::size_t i = 0; i < n; i++) { + sccun = (double) std::to_integer(data[i]); + + if (i == 0) { + sccfirst = sccun; + } + + scct1 += scclast * sccun; + scct2 += sccun; + scct3 += sccun * sccun; + scclast = sccun; + } + + scct1 += scclast * sccfirst; + scct2 *= scct2; + + double scc = (double) n * scct3 - scct2; + + if (scc == 0.0) { + return -100000.0; + } + + return ((double) n * scct1 - scct2) / scc; +} + +/* + * Monte-Carlo Pi metric (ENT algorithm), consuming the buffer in groups of 6 + * bytes: the first 3 bytes form a 24-bit x coordinate and the next 3 a y + * coordinate; a group counts as "in circle" when x^2 + y^2 <= (256^3-1)^2. + * + * IMPORTANT: this does NOT return the Pi estimate. It returns the normalized + * deviation from Pi, fabs((mpi - PI) / PI), where mpi = 4 * inmont / groups and + * PI = 3.141592653589793. Fewer than 6 bytes complete no group, which yields 0. 
+ */ +inline double +monte_carlo_pi(std::span data) noexcept +{ + constexpr double pi = 3.141592653589793; + const double incirc = std::pow(std::pow(256.0, 3.0) - 1, 2.0); + + unsigned int monte[6]; + int mcount = 0; + int inmont = 0; + const std::size_t n = data.size(); + + for (std::size_t i = 0; i < n; i++) { + monte[i % 6] = (unsigned int) std::to_integer(data[i]); + + if (i % 6 == 5) { + double mx = 0; + double my = 0; + + mcount++; + + for (int j = 0; j < 3; j++) { + mx = (mx * 256.0) + monte[j]; + my = (my * 256.0) + monte[j + 3]; + } + + if ((mx * mx + my * my) <= incirc) { + inmont++; + } + } + } + + if (mcount == 0) { + return 0.0; + } + + double mpi = 4.0 * ((double) inmont / mcount); + + return std::fabs((mpi - pi) / pi); +} + +}// namespace rspamd::text_stats + +#endif diff --git a/test/lua/unit/text_stats.lua b/test/lua/unit/text_stats.lua new file mode 100644 index 0000000000..c9319133af --- /dev/null +++ b/test/lua/unit/text_stats.lua @@ -0,0 +1,119 @@ +-- Byte-distribution statistics methods on rspamd_text: +-- entropy / byte_mean / byte_deviation / serial_correlation / monte_carlo_pi. +-- Expected values are derived analytically from the statistic definitions over +-- buffers whose statistics are determined exactly. 
+ +context("Text byte-statistics", function() + local rspamd_text = require "rspamd_text" + + local EPS = 1e-9 + local function approx(actual, expected) + return math.abs(actual - expected) < EPS + end + + local function T(s) + return rspamd_text.fromstring(s) + end + + test("entropy: empty buffer is 0", function() + assert_equal(0.0, T(""):entropy()) + end) + + test("entropy: single symbol is 0", function() + assert_equal(0.0, T(string.rep("\0", 256)):entropy()) + assert_equal(0.0, T(string.rep("A", 100)):entropy()) + end) + + test("entropy: two equal symbols is 1 bit/byte", function() + assert_equal(1.0, T("aaaabbbb"):entropy()) + end) + + test("entropy: every byte value once is 8 bits/byte", function() + local bytes = {} + for i = 0, 255 do + bytes[#bytes + 1] = string.char(i) + end + assert_equal(8.0, T(table.concat(bytes)):entropy()) + end) + + test("byte_mean: unsigned byte values", function() + assert_equal(0.0, T(string.rep("\0", 16)):byte_mean()) + assert_equal(97.5, T("aaaabbbb"):byte_mean()) -- 'a'=97 'b'=98 + -- 0x00 and 0xFF equal counts -> 127.5 (unsigned, not -0.5) + assert_equal(127.5, T(string.char(0, 255, 0, 255)):byte_mean()) + end) + + test("byte_mean: mean of 0..255 is 127.5", function() + local bytes = {} + for i = 0, 255 do + bytes[#bytes + 1] = string.char(i) + end + assert_equal(127.5, T(table.concat(bytes)):byte_mean()) + end) + + test("byte_deviation: mean absolute deviation", function() + -- |97-97.5|*4 + |98-97.5|*4 = 4, /8 = 0.5 + assert_equal(0.5, T("aaaabbbb"):byte_deviation(97.5)) + assert_equal(0.0, T(string.rep("A", 10)):byte_deviation(65.0)) + end) + + test("byte_deviation: deviation of 0..255 about 127.5 is 64", function() + local bytes = {} + for i = 0, 255 do + bytes[#bytes + 1] = string.char(i) + end + assert_equal(64.0, T(table.concat(bytes)):byte_deviation(127.5)) + end) + + test("serial_correlation: hand-computed ramp", function() + -- {0,1,2,3} -> -0.2 + assert_true(approx(T(string.char(0, 1, 2, 3)):serial_correlation(), 
-0.2)) + end) + + test("serial_correlation: identical bytes hit the sentinel", function() + assert_equal(-100000.0, T(string.rep("\0", 64)):serial_correlation()) + -- single byte also -> sentinel + assert_equal(-100000.0, T("A"):serial_correlation()) + end) + + test("serial_correlation: empty buffer is 0", function() + assert_equal(0.0, T(""):serial_correlation()) + end) + + test("monte_carlo_pi: point outside the circle", function() + -- 6x0xFF -> outside -> mpi 0 -> |0 - PI|/PI == 1 + assert_true(approx(T(string.rep(string.char(255), 6)):monte_carlo_pi(), 1.0)) + end) + + test("monte_carlo_pi: all-in-circle deviation", function() + -- 12 zero bytes -> 2 groups in circle -> mpi 4 -> |4 - PI|/PI + local expected = math.abs((4.0 - 3.141592653589793) / 3.141592653589793) + assert_true(approx(T(string.rep("\0", 12)):monte_carlo_pi(), expected)) + end) + + test("monte_carlo_pi: fewer than 6 bytes is defined 0", function() + assert_equal(0.0, T("abc"):monte_carlo_pi()) + end) + + test("offset/length slicing (0-based offset)", function() + local t = T("AAAABBBB") -- 'A'=65 [0..3], 'B'=66 [4..7] + -- whole buffer: two equal symbols + assert_equal(1.0, t:entropy()) + assert_equal(65.5, t:byte_mean()) + -- slice off=4, len=4 (bytes 4..7) -> "BBBB" + assert_equal(0.0, t:entropy(4, 4)) + assert_equal(66.0, t:byte_mean(4, 4)) + -- slice from offset to end + assert_equal(0.0, t:entropy(4)) + assert_equal(66.0, t:byte_mean(4)) + -- length clamped to available bytes + assert_equal(66.0, t:byte_mean(4, 1000)) + end) + + test("out-of-range slice is defined 0", function() + local t = T("AAAABBBB") + assert_equal(0.0, t:entropy(100)) + assert_equal(0.0, t:byte_mean(100)) + assert_equal(0.0, t:entropy(0, 0)) + end) +end) diff --git a/test/rspamd_cxx_unit.cxx b/test/rspamd_cxx_unit.cxx index af5a205374..9ae8358253 100644 --- a/test/rspamd_cxx_unit.cxx +++ b/test/rspamd_cxx_unit.cxx @@ -40,6 +40,7 @@ #include "rspamd_cxx_unit_settings_merge.hxx" #include "rspamd_cxx_unit_fpconv.hxx" #include 
"rspamd_cxx_unit_symcache_timeout.hxx" +#include "rspamd_cxx_unit_text_stats.hxx" static gboolean verbose = false; static const GOptionEntry entries[] = diff --git a/test/rspamd_cxx_unit_text_stats.hxx b/test/rspamd_cxx_unit_text_stats.hxx new file mode 100644 index 0000000000..15a50afb08 --- /dev/null +++ b/test/rspamd_cxx_unit_text_stats.hxx @@ -0,0 +1,185 @@ +/* + * Copyright 2026 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef RSPAMD_RSPAMD_CXX_UNIT_TEXT_STATS_HXX +#define RSPAMD_RSPAMD_CXX_UNIT_TEXT_STATS_HXX + +#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL +#include "doctest/doctest.h" + +#include "lua/lua_text_stats.hxx" + +#include +#include +#include +#include +#include + +/* + * Golden-vector tests for the byte-statistics primitives. The expected values + * are derived analytically from the statistic definitions over buffers whose + * statistics are determined exactly: + * + * - all-zeros / all-same-byte: entropy 0; mean = that byte; deviation 0; + * serial_correlation -100000 (the n*scct3 - scct2 == 0 sentinel). + * - every byte 0..255 once: entropy 8; mean 127.5; deviation(127.5) 64. + * - "aaaabbbb" (two values, equal counts): entropy 1; mean 97.5. + * - {0,1,2,3}: serial_correlation -0.2 (hand-computed). + * - 0xFF*6 / 0x00*6: monte_carlo_pi returns |4*in/groups - PI|/PI. 
+ * + * entropy/mean/deviation are asserted exactly (the values above are exactly + * representable doubles); the float metrics use a small epsilon. + */ +TEST_SUITE("text stats") +{ + static constexpr double EPS = 1e-9; + static constexpr double REF_PI = 3.141592653589793; + + static std::span as_bytes(const std::vector &v) + { + return {reinterpret_cast(v.data()), v.size()}; + } + + static std::span as_bytes(const std::string &s) + { + return {reinterpret_cast(s.data()), s.size()}; + } + + using namespace rspamd::text_stats; + + TEST_CASE("empty buffer is defined and division-free") + { + std::vector empty; + auto sp = as_bytes(empty); + + CHECK(entropy(sp) == 0.0); + CHECK(byte_mean(sp) == 0.0); + CHECK(byte_deviation(sp, 0.0) == 0.0); + CHECK(serial_correlation(sp) == 0.0); + CHECK(monte_carlo_pi(sp) == 0.0); + } + + TEST_CASE("all-zeros buffer") + { + std::vector z(256, 0); + auto sp = as_bytes(z); + + CHECK(entropy(sp) == 0.0); + CHECK(byte_mean(sp) == 0.0); + CHECK(byte_deviation(sp, 0.0) == 0.0); + /* all bytes identical -> n*scct3 - scct2 == 0 -> sentinel */ + CHECK(serial_correlation(sp) == -100000.0); + /* 42 full groups, every point at origin -> in circle -> mpi == 4 */ + CHECK(monte_carlo_pi(sp) == + doctest::Approx(std::fabs((4.0 - REF_PI) / REF_PI)).epsilon(EPS)); + } + + TEST_CASE("uniform distribution: every byte value once") + { + std::vector u(256); + for (int i = 0; i < 256; i++) { + u[i] = (unsigned char) i; + } + auto sp = as_bytes(u); + + /* 256 bins each with p = 1/256 = 2^-8 -> entropy exactly 8 bits/byte */ + CHECK(entropy(sp) == 8.0); + /* mean of 0..255 == 127.5 */ + CHECK(byte_mean(sp) == 127.5); + /* mean abs deviation about 127.5 == 64 */ + CHECK(byte_deviation(sp, 127.5) == 64.0); + } + + TEST_CASE("two-symbol ASCII: aaaabbbb") + { + std::string s = "aaaabbbb"; /* 4x 'a'(97), 4x 'b'(98) */ + auto sp = as_bytes(s); + + /* two equally-likely symbols -> 1 bit/byte */ + CHECK(entropy(sp) == 1.0); + CHECK(byte_mean(sp) == 97.5); + /* 
|97-97.5|*4 + |98-97.5|*4 = 4, /8 = 0.5 */ + CHECK(byte_deviation(sp, 97.5) == 0.5); + } + + TEST_CASE("byte_mean uses unsigned byte values (high bytes)") + { + /* 0xFF and 0x00, equal counts: unsigned mean is 127.5, not -0.5 */ + std::vector v{0x00, 0xFF, 0x00, 0xFF}; + auto sp = as_bytes(v); + + CHECK(byte_mean(sp) == 127.5); + CHECK(entropy(sp) == 1.0); + } + + TEST_CASE("serial_correlation hand-computed and edge cases") + { + /* {0,1,2,3}: scct2=6, scct3=14, scct1=8 (+last*first=0); + * scc = 4*14 - 36 = 20; (4*8 - 36)/20 = -4/20 = -0.2 */ + std::vector ramp{0, 1, 2, 3}; + CHECK(serial_correlation(as_bytes(ramp)) == + doctest::Approx(-0.2).epsilon(EPS)); + + /* single byte -> n*scct3 - scct2 == b^2 - b^2 == 0 -> sentinel */ + std::vector one{0x41}; + CHECK(serial_correlation(as_bytes(one)) == -100000.0); + } + + TEST_CASE("monte_carlo_pi point outside the circle") + { + /* 6x0xFF: x = y = 256^3-1, x^2+y^2 = 2*INCIRC > INCIRC -> outside + * -> inmont 0, groups 1 -> mpi 0 -> |0 - PI|/PI == 1 */ + std::vector ff(6, 0xFF); + CHECK(monte_carlo_pi(as_bytes(ff)) == doctest::Approx(1.0).epsilon(EPS)); + + /* 6x0x00 in circle, 6x0xFF outside -> mpi = 4*1/2 = 2 */ + std::vector mix(6, 0x00); + mix.insert(mix.end(), 6, 0xFF); + CHECK(monte_carlo_pi(as_bytes(mix)) == + doctest::Approx(std::fabs((2.0 - REF_PI) / REF_PI)).epsilon(EPS)); + + /* fewer than 6 bytes -> no complete group -> defined 0 */ + std::vector tiny{1, 2, 3}; + CHECK(monte_carlo_pi(as_bytes(tiny)) == 0.0); + } + + TEST_CASE("entropy is bounded in [0, 8]") + { + std::vector v(1024); + for (std::size_t i = 0; i < v.size(); i++) { + v[i] = (unsigned char) ((i * 37 + 11) & 0xff); + } + double e = entropy(as_bytes(v)); + CHECK(e >= 0.0); + CHECK(e <= 8.0); + } + + TEST_CASE("shared distribution matches per-metric span overloads") + { + std::string s = "The quick brown fox jumps over the lazy dog."; + auto sp = as_bytes(s); + + /* one histogram pass, reused by entropy/mean/deviation */ + auto dist = 
make_distribution(sp); + double m = byte_mean(dist); + + CHECK(entropy(dist) == entropy(sp)); + CHECK(m == byte_mean(sp)); + CHECK(byte_deviation(dist, m) == byte_deviation(sp, m)); + } +} + +#endif