From ec8a6fdfba650b19143bd4643ebe73167b3e17de Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 1 Jun 2026 19:00:42 +0100 Subject: [PATCH] [Fix] functional: move test server ports below the ephemeral range The real root cause of the 440_ssl_server flake (and the family of intermittent "bind 98 / Address already in use" failures): the test server ports sat INSIDE Linux's default ephemeral range (net.ipv4.ip_local_port_range = 32768..60999). Bases were 56379 (redis), 56380 (nginx) and 567xx (rspamd normal/controller/proxy/fuzzy + the two TLS listeners), all squarely in that window. So any outbound client socket in the test environment -- a redis client, monitored URIBL DNS lookups, an upstream connection, a dummy-helper connection -- could be handed one of those numbers by the kernel as its EPHEMERAL SOURCE PORT on connect(). When rspamd later tried to bind() a LISTENER on that exact port it got EADDRINUSE. rspamd sets SO_REUSEADDR, which does nothing against a live socket already bound by another process. The controller's SSL socket is the LAST of its five ports to bind -- by then the controller has already opened many client sockets -- so it lost this race most often and surfaced as "SSL controller never came up" -> HTTPS connection-refused for the whole retry budget. It was probabilistic (depends which ephemeral ports were in use at bind time), hence flaky and distro-dependent. Move the whole rspamd/redis/nginx block down by 31000 (e.g. normal 56789 -> 25789, controller-SSL 56796 -> 25796, redis 56379 -> 25379, nginx 56380 -> 25380). This preserves every relative offset, so the carefully spaced, collision-free per-worker layout (base + slot*100) is unchanged: across 64 worker slots the dummy_* helpers stay <= 24383, this block spans 25379..32097, and the ephemeral floor 32768 is never reached. Verified by importing vars.py for slots 0 and 63 (max port 32097 < 32768, zero cross-family collisions) and a serial 001_merged run (all six 440_ssl_server tests pass on the relocated ports). Also bump the two cosmetic fallbacks that mirrored the old bases: test_redis_client.lua's getenv default and a port_is_free docstring. --- test/functional/lib/rspamd.py | 2 +- test/functional/lib/vars.py | 39 +++++++++++++------ .../lua/rspamadm/test_redis_client.lua | 2 +- 3 files changed, 29 insertions(+), 14 deletions(-) diff --git a/test/functional/lib/rspamd.py b/test/functional/lib/rspamd.py index 8a41634abe..2886b9e467 100644 --- a/test/functional/lib/rspamd.py +++ b/test/functional/lib/rspamd.py @@ -553,7 +553,7 @@ def port_is_free(addr, port): Keyword Succeeds retries; connection refused means the port is free. Example: - | Wait Until Keyword Succeeds | 10s | 0.2s | Port Is Free | 127.0.0.1 | 56790 | + | Wait Until Keyword Succeeds | 10s | 0.2s | Port Is Free | 127.0.0.1 | 25790 | """ s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.settimeout(0.5) diff --git a/test/functional/lib/vars.py b/test/functional/lib/vars.py index ceaaff2ed9..1e3cba655e 100644 --- a/test/functional/lib/vars.py +++ b/test/functional/lib/vars.py @@ -93,7 +93,7 @@ def _release_slot(slot, owner_pid): _WORKER_INDEX = _worker_index() # 100 ports per worker. We currently use ~14 distinct ports; 100 leaves # headroom for future services and keeps each worker's ports humanly -# distinguishable in logs (worker 3 -> 56789 + 300 = 57089). +# distinguishable in logs (worker 3 -> 25789 + 300 = 26089). _PORT_OFFSET = _WORKER_INDEX * 100 # Per-worker prefix for unix sockets and pid files that have historically @@ -119,15 +119,30 @@ RSPAMD_KEY_PUB2 = 'mbggdnw3tdx7r3ruakjecpf5hcqr4cb4nmdp1fxynx3drbyujb3y' RSPAMD_KEY_PUB3 = 'zhypei8sartqrtow84dddgp5exh3gsr65kbw88wj7ppot1bwmuiy' RSPAMD_LOCAL_ADDR = '127.0.0.1' RSPAMD_MAP_WATCH_INTERVAL = '1min' -RSPAMD_PORT_CONTROLLER = 56790 + _PORT_OFFSET -RSPAMD_PORT_CONTROLLER_SLAVE = 56793 + _PORT_OFFSET -RSPAMD_PORT_FUZZY = 56791 + _PORT_OFFSET -RSPAMD_PORT_FUZZY_SLAVE = 56792 + _PORT_OFFSET -RSPAMD_PORT_NORMAL = 56789 + _PORT_OFFSET -RSPAMD_PORT_NORMAL_SLAVE = 56794 + _PORT_OFFSET -RSPAMD_PORT_PROXY = 56795 + _PORT_OFFSET -RSPAMD_PORT_CONTROLLER_SSL = 56796 + _PORT_OFFSET -RSPAMD_PORT_NORMAL_SSL = 56797 + _PORT_OFFSET +# All listening ports below MUST stay under Linux's default ephemeral +# range (net.ipv4.ip_local_port_range = 32768..60999). The historical +# bases sat at 56379/56380/567xx, squarely inside it, so an outbound +# client socket (redis, monitored DNS, an upstream, a dummy-helper +# connection) could transiently occupy a server port as its source port; +# rspamd's later bind() of a listener on that port then failed with +# EADDRINUSE (98). Because the controller's SSL socket is the LAST of its +# five ports to bind -- after the controller has already opened many +# client sockets -- it lost this race most often, surfacing as the flaky +# 440_ssl_server "SSL controller never came up". Moving the whole +# rspamd/redis/nginx block down by 31000 keeps it below the ephemeral +# floor while preserving every relative offset (so the carefully spaced, +# collision-free per-worker layout is unchanged). Layout across 64 worker +# slots (100 ports each): dummy_* helpers occupy <= 24383, this block +# 25379..32097, ephemeral 32768+. Do NOT move these back above 32768. +RSPAMD_PORT_CONTROLLER = 25790 + _PORT_OFFSET +RSPAMD_PORT_CONTROLLER_SLAVE = 25793 + _PORT_OFFSET +RSPAMD_PORT_FUZZY = 25791 + _PORT_OFFSET +RSPAMD_PORT_FUZZY_SLAVE = 25792 + _PORT_OFFSET +RSPAMD_PORT_NORMAL = 25789 + _PORT_OFFSET +RSPAMD_PORT_NORMAL_SLAVE = 25794 + _PORT_OFFSET +RSPAMD_PORT_PROXY = 25795 + _PORT_OFFSET +RSPAMD_PORT_CONTROLLER_SSL = 25796 + _PORT_OFFSET +RSPAMD_PORT_NORMAL_SSL = 25797 + _PORT_OFFSET RSPAMD_PORT_CLAM = 2100 + _PORT_OFFSET RSPAMD_PORT_FPROT = 2101 + _PORT_OFFSET RSPAMD_PORT_FPROT2_DUPLICATE = 2102 + _PORT_OFFSET @@ -139,9 +154,9 @@ RSPAMD_PORT_DUMMY_UDP = 5005 + _PORT_OFFSET RSPAMD_PORT_DUMMY_SSL = 14433 + _PORT_OFFSET RSPAMD_P0F_SOCKET = '{}/p0f.sock'.format(RSPAMD_TMP_PREFIX) RSPAMD_REDIS_ADDR = '127.0.0.1' -RSPAMD_REDIS_PORT = 56379 + _PORT_OFFSET +RSPAMD_REDIS_PORT = 25379 + _PORT_OFFSET RSPAMD_NGINX_ADDR = '127.0.0.1' -RSPAMD_NGINX_PORT = 56380 + _PORT_OFFSET +RSPAMD_NGINX_PORT = 25380 + _PORT_OFFSET RSPAMD_GROUP = 'nogroup' RSPAMD_USER = 'nobody' SOCK_DGRAM = socket.SOCK_DGRAM diff --git a/test/functional/lua/rspamadm/test_redis_client.lua b/test/functional/lua/rspamadm/test_redis_client.lua index b5ba621715..3e6016cecb 100644 --- a/test/functional/lua/rspamadm/test_redis_client.lua +++ b/test/functional/lua/rspamadm/test_redis_client.lua @@ -7,7 +7,7 @@ local upstream_list = require "rspamd_upstream_list" -- (unlike rspamd) does NOT populate the rspamd_env global, so read -- the RSPAMD_ env var directly. Fall back to the historical literal -- for standalone invocations. -local redis_port = tonumber(os.getenv("RSPAMD_REDIS_PORT")) or 56379 +local redis_port = tonumber(os.getenv("RSPAMD_REDIS_PORT")) or 25379 local upstreams_write = upstream_list.create('127.0.0.1', redis_port) local upstreams_read = upstream_list.create('127.0.0.1', redis_port) -- 2.47.3