${CMAKE_CURRENT_SOURCE_DIR}/lua_dns.c
${CMAKE_CURRENT_SOURCE_DIR}/lua_udp.c
${CMAKE_CURRENT_SOURCE_DIR}/lua_text.c
+ ${CMAKE_CURRENT_SOURCE_DIR}/lua_text_stats.cxx
${CMAKE_CURRENT_SOURCE_DIR}/lua_worker.c
${CMAKE_CURRENT_SOURCE_DIR}/lua_kann.c
${CMAKE_CURRENT_SOURCE_DIR}/lua_spf.c
luaopen_ip(L);
luaopen_expression(L);
luaopen_text(L);
+ rspamd_lua_text_stats_init(L);
luaopen_util(L);
luaopen_tcp(L);
luaopen_html(L);
void luaopen_text(lua_State *L);
+/*
+ * Augments the rspamd{text} metatable with byte-statistics methods
+ * (entropy/byte_mean/...). Must be called after luaopen_text. Defined in
+ * lua_text_stats.cxx.
+ */
+void rspamd_lua_text_stats_init(lua_State *L);
+
void luaopen_util(lua_State *L);
void luaopen_tcp(lua_State *L);
--- /dev/null
+/*
+ * Copyright 2026 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "lua_common.h"
+#include "lua_text_stats.hxx"
+
+#include <span>
+#include <cstddef>
+#include <algorithm>
+
+/***
+ * @module rspamd_text
+ * The following byte-distribution statistics methods are added to the
+ * `rspamd_text` class. They produce deterministic, bit-reproducible results so
+ * callers can compare them against fixed thresholds. Each takes an optional
+ * byte range:
+ *
+ * - `off` is a 0-based byte offset, defaulting to 0;
+ * - `len` is a byte count, defaulting to the rest of the buffer after `off`.
+ *
+ * `off` must lie in `[0, #text)` and `len` is truncated to the bytes available
+ * after `off`; an out-of-range `off`, a non-positive `len` or an empty text
+ * gives an empty range, which yields 0 for every metric.
+ */
+
+using namespace rspamd::text_stats;
+
+/*
+ * Resolve the optional (off, len) Lua arguments found at stack slots off_idx
+ * and len_idx into a view over the bytes of `t`.
+ *
+ * off defaults to 0 and is accepted only inside [0, t->len); len defaults to
+ * everything after off and is capped at that amount. A rejected offset or a
+ * non-positive length produces an empty span.
+ */
+static std::span<const std::byte>
+lua_text_stats_slice(lua_State *L, const struct rspamd_lua_text *t,
+					 int off_idx, int len_idx)
+{
+	const auto total = static_cast<std::size_t>(t->len);
+	const lua_Integer start = luaL_optinteger(L, off_idx, 0);
+
+	/* The sign is tested first so the unsigned conversion never sees a negative */
+	const bool start_ok = start >= 0 && static_cast<std::size_t>(start) < total;
+
+	if (!start_ok) {
+		return {};
+	}
+
+	const std::size_t remaining = total - static_cast<std::size_t>(start);
+	const lua_Integer wanted = luaL_optinteger(L, len_idx, static_cast<lua_Integer>(remaining));
+
+	if (wanted < 1) {
+		return {};
+	}
+
+	const auto *first = reinterpret_cast<const std::byte *>(t->start) + start;
+	const std::size_t count = std::min(static_cast<std::size_t>(wanted), remaining);
+
+	return {first, count};
+}
+
+/***
+ * @method text:entropy([off[, len]])
+ * Shannon entropy of the selected bytes, expressed in bits per byte (0 to 8).
+ * @param {number} off optional 0-based byte offset (default 0)
+ * @param {number} len optional byte count (default: to end of text)
+ * @return {number} entropy in bits per byte
+ */
+static int
+lua_text_entropy(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	auto *text = lua_check_text(L, 1);
+
+	if (!text) {
+		return luaL_error(L, "invalid arguments");
+	}
+
+	/* The optional off/len pair is read from stack slots 2 and 3 */
+	const auto range = lua_text_stats_slice(L, text, 2, 3);
+	const double bits = entropy(range);
+
+	lua_pushnumber(L, bits);
+
+	return 1;
+}
+
+/***
+ * @method text:byte_mean([off[, len]])
+ * Average of the byte values in the range, each byte taken as unsigned.
+ * @param {number} off optional 0-based byte offset (default 0)
+ * @param {number} len optional byte count (default: to end of text)
+ * @return {number} mean byte value
+ */
+static int
+lua_text_byte_mean(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	auto *text = lua_check_text(L, 1);
+
+	if (!text) {
+		return luaL_error(L, "invalid arguments");
+	}
+
+	/* The optional off/len pair is read from stack slots 2 and 3 */
+	const auto range = lua_text_stats_slice(L, text, 2, 3);
+	const double average = byte_mean(range);
+
+	lua_pushnumber(L, average);
+
+	return 1;
+}
+
+/***
+ * @method text:byte_deviation(mean[, off[, len]])
+ * Mean absolute deviation of the byte values around `mean`, which the caller
+ * supplies (typically the `byte_mean` of the same range).
+ * @param {number} mean reference mean value
+ * @param {number} off optional 0-based byte offset (default 0)
+ * @param {number} len optional byte count (default: to end of text)
+ * @return {number} mean absolute deviation
+ */
+static int
+lua_text_byte_deviation(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	auto *text = lua_check_text(L, 1);
+
+	if (!text) {
+		return luaL_error(L, "invalid arguments");
+	}
+
+	/* The mandatory mean sits in slot 2, so off/len move to slots 3 and 4 */
+	const double reference = static_cast<double>(luaL_checknumber(L, 2));
+	const auto range = lua_text_stats_slice(L, text, 3, 4);
+	const double spread = byte_deviation(range, reference);
+
+	lua_pushnumber(L, spread);
+
+	return 1;
+}
+
+/***
+ * @method text:serial_correlation([off[, len]])
+ * Serial correlation coefficient of the byte range, computed the ENT way.
+ * @param {number} off optional 0-based byte offset (default 0)
+ * @param {number} len optional byte count (default: to end of text)
+ * @return {number} serial correlation coefficient
+ */
+static int
+lua_text_serial_correlation(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	auto *text = lua_check_text(L, 1);
+
+	if (!text) {
+		return luaL_error(L, "invalid arguments");
+	}
+
+	/* The optional off/len pair is read from stack slots 2 and 3 */
+	const auto range = lua_text_stats_slice(L, text, 2, 3);
+	const double coefficient = serial_correlation(range);
+
+	lua_pushnumber(L, coefficient);
+
+	return 1;
+}
+
+/***
+ * @method text:monte_carlo_pi([off[, len]])
+ * Monte-Carlo Pi metric (ENT) of the byte range. The result is the normalized
+ * deviation from Pi, `fabs((4*inmont/groups - PI) / PI)`, and not the Pi
+ * estimate itself.
+ * @param {number} off optional 0-based byte offset (default 0)
+ * @param {number} len optional byte count (default: to end of text)
+ * @return {number} normalized deviation from Pi
+ */
+static int
+lua_text_monte_carlo_pi(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	auto *text = lua_check_text(L, 1);
+
+	if (!text) {
+		return luaL_error(L, "invalid arguments");
+	}
+
+	/* The optional off/len pair is read from stack slots 2 and 3 */
+	const auto range = lua_text_stats_slice(L, text, 2, 3);
+	const double deviation = monte_carlo_pi(range);
+
+	lua_pushnumber(L, deviation);
+
+	return 1;
+}
+
+/*
+ * Statistics methods that rspamd_lua_text_stats_init registers on the
+ * rspamd{text} metatable: Lua method name paired with its C entry point.
+ * The {nullptr, nullptr} entry ends the list.
+ */
+static const struct luaL_reg text_stats_m[] = {
+ {"entropy", lua_text_entropy},
+ {"byte_mean", lua_text_byte_mean},
+ {"byte_deviation", lua_text_byte_deviation},
+ {"serial_correlation", lua_text_serial_correlation},
+ {"monte_carlo_pi", lua_text_monte_carlo_pi},
+ {nullptr, nullptr},
+};
+
+/*
+ * Install the byte-statistics methods listed in text_stats_m on the
+ * rspamd{text} class. Its declaration requires that it is called after
+ * luaopen_text.
+ */
+void rspamd_lua_text_stats_init(lua_State *L)
+{
+ /*
+ * Augment the existing rspamd{text} metatable (created by luaopen_text)
+ * with the statistics methods, so lua_text.c stays untouched. Methods live
+ * directly on the metatable (rspamd_lua_new_class sets __index = metatable).
+ */
+ rspamd_lua_class_metatable(L, rspamd_text_classname);
+ /* A null library name makes luaL_register fill the table on top of the stack */
+ luaL_register(L, nullptr, text_stats_m);
+ /* NOTE(review): presumably balances the single push done by rspamd_lua_class_metatable */
+ lua_pop(L, 1);
+}
--- /dev/null
+/*
+ * Copyright 2026 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_LUA_TEXT_STATS_HXX
+#define RSPAMD_LUA_TEXT_STATS_HXX
+#pragma once
+
+/*
+ * Byte-distribution statistics over a buffer: Shannon entropy, byte mean and
+ * mean absolute deviation, and the serial-correlation and Monte-Carlo-Pi
+ * randomness metrics (the classic ENT test-suite formulas, John Walker,
+ * fourmilab.ch/random). They are specified to produce deterministic,
+ * portable, bit-reproducible results so callers can compare them against
+ * fixed thresholds.
+ *
+ * These are pure, header-only, allocation-free C++; the Lua bindings in
+ * lua_text_stats.cxx and the C++ unit tests both call them directly.
+ *
+ * Two accumulation strategies are used, and the order is part of the contract
+ * (floating-point addition is not associative, so it affects the exact bits):
+ * - entropy / mean / deviation build a 256-bin histogram of the byte values
+ * and accumulate over the bins 0..255 - the order is fixed by the bin
+ * index (byte value), NOT by buffer position.
+ * - serial_correlation / monte_carlo_pi walk the buffer in order with no
+ * histogram.
+ *
+ * std::log2 is used directly for the entropy term.
+ */
+
+#include <array>
+#include <span>
+#include <cstdint>
+#include <cstddef>
+#include <cmath>
+
+namespace rspamd::text_stats {
+
+/* 256-bin histogram of byte values plus the total number of bytes. */
+struct byte_distribution {
+ /* bins[v] counts the bytes whose unsigned value is v; starts zeroed */
+ std::array<std::uint64_t, 256> bins{};
+ /* Number of bytes counted; make_distribution sets it to the input size */
+ std::uint64_t total = 0;
+};
+
+/*
+ * Count how often each byte value occurs in `data`. Runs once over the input
+ * and allocates nothing; entropy, byte_mean and byte_deviation all consume the
+ * resulting histogram.
+ */
+inline byte_distribution
+make_distribution(std::span<const std::byte> data) noexcept
+{
+	byte_distribution hist{};
+
+	hist.total = data.size();
+
+	for (std::size_t pos = 0; pos < data.size(); ++pos) {
+		const auto value = std::to_integer<std::uint8_t>(data[pos]);
+
+		++hist.bins[value];
+	}
+
+	return hist;
+}
+
+/*
+ * Shannon entropy in bits/byte, a value in [0, 8].
+ * Every non-empty bin contributes -p*log2(p) with p = bin/total, and the bins
+ * are visited in byte-value order 0..255. A distribution with total == 0
+ * returns 0 before any division takes place.
+ */
+inline double
+entropy(const byte_distribution &dist) noexcept
+{
+	if (dist.total == 0) {
+		return 0.0;
+	}
+
+	double bits = 0.0;
+
+	for (const auto count: dist.bins) {
+		if (count == 0) {
+			/* Empty bins contribute nothing and would feed log2 a zero */
+			continue;
+		}
+
+		const double p = static_cast<double>(count) / static_cast<double>(dist.total);
+
+		bits -= p * std::log2(p);
+	}
+
+	return bits;
+}
+
+/* Convenience overload: histogram the bytes, then take the entropy. */
+inline double
+entropy(std::span<const std::byte> data) noexcept
+{
+	const auto dist = make_distribution(data);
+
+	return entropy(dist);
+}
+
+/*
+ * Arithmetic mean of the byte values, each taken as unsigned (0..255):
+ *   (sum over v = 0..255 of v * bins[v]) / total
+ * The sum is accumulated in byte-value order. A distribution with
+ * total == 0 returns 0.
+ */
+inline double
+byte_mean(const byte_distribution &dist) noexcept
+{
+	if (dist.total == 0) {
+		return 0.0;
+	}
+
+	double weighted = 0.0;
+	std::size_t value = 0;
+
+	for (const auto count: dist.bins) {
+		weighted += static_cast<double>(value) * static_cast<double>(count);
+		++value;
+	}
+
+	return weighted / static_cast<double>(dist.total);
+}
+
+/* Convenience overload: histogram the bytes, then take the mean. */
+inline double
+byte_mean(std::span<const std::byte> data) noexcept
+{
+	const auto dist = make_distribution(data);
+
+	return byte_mean(dist);
+}
+
+/*
+ * Mean absolute deviation of the byte values from `mean`:
+ *   (sum over v = 0..255 of fabs(v - mean) * bins[v]) / total
+ * `mean` comes from the caller (typically byte_mean of the same range); the
+ * sum is accumulated in byte-value order. A distribution with total == 0
+ * returns 0.
+ */
+inline double
+byte_deviation(const byte_distribution &dist, double mean) noexcept
+{
+	if (dist.total == 0) {
+		return 0.0;
+	}
+
+	double weighted = 0.0;
+	std::size_t value = 0;
+
+	for (const auto count: dist.bins) {
+		const double distance = std::fabs(static_cast<double>(value) - mean);
+
+		weighted += distance * static_cast<double>(count);
+		++value;
+	}
+
+	return weighted / static_cast<double>(dist.total);
+}
+
+/* Convenience overload: histogram the bytes, then take the deviation. */
+inline double
+byte_deviation(std::span<const std::byte> data, double mean) noexcept
+{
+	const auto dist = make_distribution(data);
+
+	return byte_deviation(dist, mean);
+}
+
+/*
+ * Serial correlation coefficient (ENT algorithm). The buffer is walked in
+ * order while three running sums are kept:
+ *   lagged  (ENT scct1): previous byte times current byte
+ *   total   (ENT scct2): the byte values
+ *   squares (ENT scct3): the squared byte values
+ * After the walk the sequence is closed into a ring by adding last*first to
+ * `lagged`, and `total` is squared. With n bytes the result is
+ *   (n*lagged - total^2) / (n*squares - total^2)
+ * and the sentinel -100000 when that denominator is zero, which is the case
+ * for a single byte. An empty buffer yields 0.
+ */
+inline double
+serial_correlation(std::span<const std::byte> data) noexcept
+{
+	if (data.empty()) {
+		return 0.0;
+	}
+
+	const double count = static_cast<double>(data.size());
+	const double first = static_cast<double>(std::to_integer<std::uint8_t>(data.front()));
+
+	double prev = 0.0;
+	double lagged = 0.0;
+	double total = 0.0;
+	double squares = 0.0;
+
+	for (const auto b: data) {
+		const double cur = static_cast<double>(std::to_integer<std::uint8_t>(b));
+
+		/* prev is still 0 for the first byte, so that step adds nothing here */
+		lagged += prev * cur;
+		total += cur;
+		squares += cur * cur;
+		prev = cur;
+	}
+
+	/* Wrap around: pair the last byte with the first one */
+	lagged += prev * first;
+	total *= total;
+
+	const double denom = count * squares - total;
+
+	if (denom == 0.0) {
+		return -100000.0;
+	}
+
+	return (count * lagged - total) / denom;
+}
+
+/*
+ * Monte-Carlo Pi metric (ENT algorithm), consuming the buffer in groups of 6
+ * bytes: the first 3 bytes form a 24-bit x coordinate and the next 3 a y
+ * coordinate (big-endian); a group counts as "in circle" when
+ * x^2 + y^2 <= (256^3-1)^2. Trailing bytes that do not fill a group are
+ * ignored.
+ *
+ * IMPORTANT: this does NOT return the Pi estimate. It returns the normalized
+ * deviation from Pi, fabs((mpi - PI) / PI), where mpi = 4 * inmont / groups and
+ * PI = 3.141592653589793. Fewer than 6 bytes complete no group, which yields 0.
+ *
+ * The circle bound is a compile-time constant rather than a std::pow result:
+ * (256^3-1)^2 = 281474943156225 is below 2^53 and therefore exact as a double,
+ * so the comparison does not depend on the libm in use. The counters are
+ * std::size_t so they cannot overflow for any span size.
+ */
+inline double
+monte_carlo_pi(std::span<const std::byte> data) noexcept
+{
+	constexpr double pi = 3.141592653589793;
+	constexpr double radius = 16777215.0; /* 256^3 - 1 */
+	constexpr double incirc = radius * radius;
+	constexpr std::size_t group_len = 6;
+
+	const std::size_t groups = data.size() / group_len;
+
+	if (groups == 0) {
+		return 0.0;
+	}
+
+	std::size_t inmont = 0;
+
+	for (std::size_t g = 0; g < groups; g++) {
+		const std::size_t base = g * group_len;
+		double mx = 0.0;
+		double my = 0.0;
+
+		for (std::size_t j = 0; j < 3; j++) {
+			mx = (mx * 256.0) + std::to_integer<std::uint8_t>(data[base + j]);
+			my = (my * 256.0) + std::to_integer<std::uint8_t>(data[base + j + 3]);
+		}
+
+		/* All operands stay below 2^53, so this comparison is exact */
+		if ((mx * mx + my * my) <= incirc) {
+			inmont++;
+		}
+	}
+
+	const double mpi = 4.0 * (static_cast<double>(inmont) / static_cast<double>(groups));
+
+	return std::fabs((mpi - pi) / pi);
+}
+
+}// namespace rspamd::text_stats
+
+#endif
--- /dev/null
+-- Byte-distribution statistics methods on rspamd_text:
+-- entropy / byte_mean / byte_deviation / serial_correlation / monte_carlo_pi.
+-- Expected values are derived analytically from the statistic definitions over
+-- buffers whose statistics are determined exactly.
+
+context("Text byte-statistics", function()
+ local rspamd_text = require "rspamd_text"
+
+ local EPS = 1e-9
+ local function approx(actual, expected)
+ return math.abs(actual - expected) < EPS
+ end
+
+ local function T(s)
+ return rspamd_text.fromstring(s)
+ end
+
+ test("entropy: empty buffer is 0", function()
+ assert_equal(0.0, T(""):entropy())
+ end)
+
+ test("entropy: single symbol is 0", function()
+ assert_equal(0.0, T(string.rep("\0", 256)):entropy())
+ assert_equal(0.0, T(string.rep("A", 100)):entropy())
+ end)
+
+ test("entropy: two equal symbols is 1 bit/byte", function()
+ assert_equal(1.0, T("aaaabbbb"):entropy())
+ end)
+
+ test("entropy: every byte value once is 8 bits/byte", function()
+ local bytes = {}
+ for i = 0, 255 do
+ bytes[#bytes + 1] = string.char(i)
+ end
+ assert_equal(8.0, T(table.concat(bytes)):entropy())
+ end)
+
+ test("byte_mean: unsigned byte values", function()
+ assert_equal(0.0, T(string.rep("\0", 16)):byte_mean())
+ assert_equal(97.5, T("aaaabbbb"):byte_mean()) -- 'a'=97 'b'=98
+ -- 0x00 and 0xFF equal counts -> 127.5 (unsigned, not -0.5)
+ assert_equal(127.5, T(string.char(0, 255, 0, 255)):byte_mean())
+ end)
+
+ test("byte_mean: mean of 0..255 is 127.5", function()
+ local bytes = {}
+ for i = 0, 255 do
+ bytes[#bytes + 1] = string.char(i)
+ end
+ assert_equal(127.5, T(table.concat(bytes)):byte_mean())
+ end)
+
+ test("byte_deviation: mean absolute deviation", function()
+ -- |97-97.5|*4 + |98-97.5|*4 = 4, /8 = 0.5
+ assert_equal(0.5, T("aaaabbbb"):byte_deviation(97.5))
+ assert_equal(0.0, T(string.rep("A", 10)):byte_deviation(65.0))
+ end)
+
+ test("byte_deviation: deviation of 0..255 about 127.5 is 64", function()
+ local bytes = {}
+ for i = 0, 255 do
+ bytes[#bytes + 1] = string.char(i)
+ end
+ assert_equal(64.0, T(table.concat(bytes)):byte_deviation(127.5))
+ end)
+
+ test("serial_correlation: hand-computed ramp", function()
+ -- {0,1,2,3} -> -0.2
+ assert_true(approx(T(string.char(0, 1, 2, 3)):serial_correlation(), -0.2))
+ end)
+
+ test("serial_correlation: identical bytes hit the sentinel", function()
+ assert_equal(-100000.0, T(string.rep("\0", 64)):serial_correlation())
+ -- single byte also -> sentinel
+ assert_equal(-100000.0, T("A"):serial_correlation())
+ end)
+
+ test("serial_correlation: empty buffer is 0", function()
+ assert_equal(0.0, T(""):serial_correlation())
+ end)
+
+ test("monte_carlo_pi: point outside the circle", function()
+ -- 6x0xFF -> outside -> mpi 0 -> |0 - PI|/PI == 1
+ assert_true(approx(T(string.rep(string.char(255), 6)):monte_carlo_pi(), 1.0))
+ end)
+
+ test("monte_carlo_pi: all-in-circle deviation", function()
+ -- 12 zero bytes -> 2 groups in circle -> mpi 4 -> |4 - PI|/PI
+ local expected = math.abs((4.0 - 3.141592653589793) / 3.141592653589793)
+ assert_true(approx(T(string.rep("\0", 12)):monte_carlo_pi(), expected))
+ end)
+
+ test("monte_carlo_pi: fewer than 6 bytes is defined 0", function()
+ assert_equal(0.0, T("abc"):monte_carlo_pi())
+ end)
+
+ test("offset/length slicing (0-based offset)", function()
+ local t = T("AAAABBBB") -- 'A'=65 [0..3], 'B'=66 [4..7]
+ -- whole buffer: two equal symbols
+ assert_equal(1.0, t:entropy())
+ assert_equal(65.5, t:byte_mean())
+ -- slice off=4, len=4 (bytes [4, 8)) -> "BBBB"
+ assert_equal(0.0, t:entropy(4, 4))
+ assert_equal(66.0, t:byte_mean(4, 4))
+ -- slice from offset to end
+ assert_equal(0.0, t:entropy(4))
+ assert_equal(66.0, t:byte_mean(4))
+ -- length clamped to available bytes
+ assert_equal(66.0, t:byte_mean(4, 1000))
+ end)
+
+ test("out-of-range slice is defined 0", function()
+ local t = T("AAAABBBB")
+ assert_equal(0.0, t:entropy(100))
+ assert_equal(0.0, t:byte_mean(100))
+ assert_equal(0.0, t:entropy(0, 0))
+ end)
+end)
#include "rspamd_cxx_unit_settings_merge.hxx"
#include "rspamd_cxx_unit_fpconv.hxx"
#include "rspamd_cxx_unit_symcache_timeout.hxx"
+#include "rspamd_cxx_unit_text_stats.hxx"
static gboolean verbose = false;
static const GOptionEntry entries[] =
--- /dev/null
+/*
+ * Copyright 2026 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_RSPAMD_CXX_UNIT_TEXT_STATS_HXX
+#define RSPAMD_RSPAMD_CXX_UNIT_TEXT_STATS_HXX
+
+#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL
+#include "doctest/doctest.h"
+
+#include "lua/lua_text_stats.hxx"
+
+#include <vector>
+#include <string>
+#include <span>
+#include <cstddef>
+#include <cmath>
+
+/*
+ * Golden-vector tests for the byte-statistics primitives. The expected values
+ * are derived analytically from the statistic definitions over buffers whose
+ * statistics are determined exactly:
+ *
+ * - all-zeros / all-same-byte: entropy 0; mean = that byte; deviation 0;
+ * serial_correlation -100000 (the n*scct3 - scct2 == 0 sentinel).
+ * - every byte 0..255 once: entropy 8; mean 127.5; deviation(127.5) 64.
+ * - "aaaabbbb" (two values, equal counts): entropy 1; mean 97.5.
+ * - {0,1,2,3}: serial_correlation -0.2 (hand-computed).
+ * - 0xFF*6 / 0x00*6: monte_carlo_pi returns |4*in/groups - PI|/PI.
+ *
+ * entropy/mean/deviation are asserted exactly (the values above are exactly
+ * representable doubles); the float metrics use a small epsilon.
+ */
+TEST_SUITE("text stats")
+{
+ static constexpr double EPS = 1e-9;
+ static constexpr double REF_PI = 3.141592653589793;
+
+ static std::span<const std::byte> as_bytes(const std::vector<unsigned char> &v)
+ {
+ return {reinterpret_cast<const std::byte *>(v.data()), v.size()};
+ }
+
+ static std::span<const std::byte> as_bytes(const std::string &s)
+ {
+ return {reinterpret_cast<const std::byte *>(s.data()), s.size()};
+ }
+
+ using namespace rspamd::text_stats;
+
+ TEST_CASE("empty buffer is defined and division-free")
+ {
+ std::vector<unsigned char> empty;
+ auto sp = as_bytes(empty);
+
+ CHECK(entropy(sp) == 0.0);
+ CHECK(byte_mean(sp) == 0.0);
+ CHECK(byte_deviation(sp, 0.0) == 0.0);
+ CHECK(serial_correlation(sp) == 0.0);
+ CHECK(monte_carlo_pi(sp) == 0.0);
+ }
+
+ TEST_CASE("all-zeros buffer")
+ {
+ std::vector<unsigned char> z(256, 0);
+ auto sp = as_bytes(z);
+
+ CHECK(entropy(sp) == 0.0);
+ CHECK(byte_mean(sp) == 0.0);
+ CHECK(byte_deviation(sp, 0.0) == 0.0);
+ /* all bytes identical -> n*scct3 - scct2 == 0 -> sentinel */
+ CHECK(serial_correlation(sp) == -100000.0);
+ /* 42 full groups, every point at origin -> in circle -> mpi == 4 */
+ CHECK(monte_carlo_pi(sp) ==
+ doctest::Approx(std::fabs((4.0 - REF_PI) / REF_PI)).epsilon(EPS));
+ }
+
+ TEST_CASE("uniform distribution: every byte value once")
+ {
+ std::vector<unsigned char> u(256);
+ for (int i = 0; i < 256; i++) {
+ u[i] = (unsigned char) i;
+ }
+ auto sp = as_bytes(u);
+
+ /* 256 bins each with p = 1/256 = 2^-8 -> entropy exactly 8 bits/byte */
+ CHECK(entropy(sp) == 8.0);
+ /* mean of 0..255 == 127.5 */
+ CHECK(byte_mean(sp) == 127.5);
+ /* mean abs deviation about 127.5 == 64 */
+ CHECK(byte_deviation(sp, 127.5) == 64.0);
+ }
+
+ TEST_CASE("two-symbol ASCII: aaaabbbb")
+ {
+ std::string s = "aaaabbbb"; /* 4x 'a'(97), 4x 'b'(98) */
+ auto sp = as_bytes(s);
+
+ /* two equally-likely symbols -> 1 bit/byte */
+ CHECK(entropy(sp) == 1.0);
+ CHECK(byte_mean(sp) == 97.5);
+ /* |97-97.5|*4 + |98-97.5|*4 = 4, /8 = 0.5 */
+ CHECK(byte_deviation(sp, 97.5) == 0.5);
+ }
+
+ TEST_CASE("byte_mean uses unsigned byte values (high bytes)")
+ {
+ /* 0xFF and 0x00, equal counts: unsigned mean is 127.5, not -0.5 */
+ std::vector<unsigned char> v{0x00, 0xFF, 0x00, 0xFF};
+ auto sp = as_bytes(v);
+
+ CHECK(byte_mean(sp) == 127.5);
+ CHECK(entropy(sp) == 1.0);
+ }
+
+ TEST_CASE("serial_correlation hand-computed and edge cases")
+ {
+ /* {0,1,2,3}: scct2=6, scct3=14, scct1=8 (+last*first=0);
+ * scc = 4*14 - 36 = 20; (4*8 - 36)/20 = -4/20 = -0.2 */
+ std::vector<unsigned char> ramp{0, 1, 2, 3};
+ CHECK(serial_correlation(as_bytes(ramp)) ==
+ doctest::Approx(-0.2).epsilon(EPS));
+
+ /* single byte -> n*scct3 - scct2 == b^2 - b^2 == 0 -> sentinel */
+ std::vector<unsigned char> one{0x41};
+ CHECK(serial_correlation(as_bytes(one)) == -100000.0);
+ }
+
+ TEST_CASE("monte_carlo_pi point outside the circle")
+ {
+ /* 6x0xFF: x = y = 256^3-1, x^2+y^2 = 2*INCIRC > INCIRC -> outside
+ * -> inmont 0, groups 1 -> mpi 0 -> |0 - PI|/PI == 1 */
+ std::vector<unsigned char> ff(6, 0xFF);
+ CHECK(monte_carlo_pi(as_bytes(ff)) == doctest::Approx(1.0).epsilon(EPS));
+
+ /* 6x0x00 in circle, 6x0xFF outside -> mpi = 4*1/2 = 2 */
+ std::vector<unsigned char> mix(6, 0x00);
+ mix.insert(mix.end(), 6, 0xFF);
+ CHECK(monte_carlo_pi(as_bytes(mix)) ==
+ doctest::Approx(std::fabs((2.0 - REF_PI) / REF_PI)).epsilon(EPS));
+
+ /* fewer than 6 bytes -> no complete group -> defined 0 */
+ std::vector<unsigned char> tiny{1, 2, 3};
+ CHECK(monte_carlo_pi(as_bytes(tiny)) == 0.0);
+ }
+
+ TEST_CASE("entropy is bounded in [0, 8]")
+ {
+ std::vector<unsigned char> v(1024);
+ for (std::size_t i = 0; i < v.size(); i++) {
+ v[i] = (unsigned char) ((i * 37 + 11) & 0xff);
+ }
+ double e = entropy(as_bytes(v));
+ CHECK(e >= 0.0);
+ CHECK(e <= 8.0);
+ }
+
+ TEST_CASE("shared distribution matches per-metric span overloads")
+ {
+ std::string s = "The quick brown fox jumps over the lazy dog.";
+ auto sp = as_bytes(s);
+
+ /* one histogram pass, reused by entropy/mean/deviation */
+ auto dist = make_distribution(sp);
+ double m = byte_mean(dist);
+
+ CHECK(entropy(dist) == entropy(sp));
+ CHECK(m == byte_mean(sp));
+ CHECK(byte_deviation(dist, m) == byte_deviation(sp, m));
+ }
+}
+
+#endif