From: Arran Cudbard-Bell Date: Wed, 22 Apr 2026 16:10:07 +0000 (-0400) Subject: kafka: Basic unreachable test, and cancellation race X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a9ade6928a9d9341d46eaeb926a8a9f4cf3faec1;p=thirdparty%2Ffreeradius-server.git kafka: Basic unreachable test, and cancellation race --- diff --git a/src/tests/modules/kafka/module.conf b/src/tests/modules/kafka/module.conf index 98b13c3ca2d..9f1ee6d98f6 100644 --- a/src/tests/modules/kafka/module.conf +++ b/src/tests/modules/kafka/module.conf @@ -85,3 +85,39 @@ kafka { flush_timeout = 5s } + +# +# A second kafka instance pointed at a dead address so the unreachable +# test can exercise the delivery-report failure path without +# interfering with the broker the other tests rely on. Port 1 is +# reserved (tcpmux) and virtually never bound, so librdkafka sees +# immediate ECONNREFUSED on every connection attempt. The per-topic +# `message.timeout.ms` caps how long librdkafka will retry before the +# delivery report (DR) arrives as `_MSG_TIMED_OUT`; keep it short so the +# test finishes quickly but long enough to cover a handful of reconnect cycles. +# +kafka kafka_unreachable { + server = "127.0.0.1:1" + + topic { + freeradius-test-unreachable { + properties { + message.timeout.ms = "1000" + } + } + + # + # Shorter timeout used by the race test, so the unlang + # `timeout` and librdkafka's `message.timeout.ms` expire + # close enough to each other that dr_msg_cb and + # kafka_xlat_produce_signal interleave unpredictably. 
+ # + freeradius-test-race { + properties { + message.timeout.ms = "300" + } + } + } + + flush_timeout = 1s +} diff --git a/src/tests/modules/kafka/race.attrs b/src/tests/modules/kafka/race.attrs new file mode 100644 index 00000000000..a4ee6fb4198 --- /dev/null +++ b/src/tests/modules/kafka/race.attrs @@ -0,0 +1,10 @@ +# +# Input packet +# +Packet-Type = Access-Request +User-Name = 'test' + +# +# Expected answer +# +Packet-Type == Access-Accept diff --git a/src/tests/modules/kafka/race.unlang b/src/tests/modules/kafka/race.unlang new file mode 100644 index 00000000000..8693b01a762 --- /dev/null +++ b/src/tests/modules/kafka/race.unlang @@ -0,0 +1,37 @@ +# +# Race dr_msg_cb against kafka_xlat_produce_signal. +# +# The `freeradius-test-race` topic is configured with +# `message.timeout.ms = 300`. We wrap a batch of concurrent produces +# in `timeout 300ms`, so the unlang cancel signal and librdkafka's +# DR-timeout fire at approximately the same wall-clock moment. Both +# paths run on the same worker thread's event loop, but the ordering +# between them - "DR first, then cancel" vs "cancel first, then DR" +# vs "DR marks runnable, cancel fires before resume gets dispatched" +# - is determined entirely by epoll/kqueue timing. +# +# Running several concurrent produces here makes it likely that each +# interleaving is hit at least once. Any use-after-free, double-free, +# or lost pctx should then surface under ASAN/TSAN. The only behavioural +# assertion is that we reach test_pass without the worker wedging or +# tripping an invariant check in the interpreter. 
+# +redundant { + timeout 300ms { + parallel { + group { %kafka_unreachable.produce('freeradius-test-race', "race 1") } + group { %kafka_unreachable.produce('freeradius-test-race', "race 2") } + group { %kafka_unreachable.produce('freeradius-test-race', "race 3") } + group { %kafka_unreachable.produce('freeradius-test-race', "race 4") } + group { %kafka_unreachable.produce('freeradius-test-race', "race 5") } + group { %kafka_unreachable.produce('freeradius-test-race', "race 6") } + group { %kafka_unreachable.produce('freeradius-test-race', "race 7") } + group { %kafka_unreachable.produce('freeradius-test-race', "race 8") } + } + } + group { + ok + } +} + +test_pass diff --git a/src/tests/modules/kafka/unreachable.attrs b/src/tests/modules/kafka/unreachable.attrs new file mode 100644 index 00000000000..a4ee6fb4198 --- /dev/null +++ b/src/tests/modules/kafka/unreachable.attrs @@ -0,0 +1,10 @@ +# +# Input packet +# +Packet-Type = Access-Request +User-Name = 'test' + +# +# Expected answer +# +Packet-Type == Access-Accept diff --git a/src/tests/modules/kafka/unreachable.unlang b/src/tests/modules/kafka/unreachable.unlang new file mode 100644 index 00000000000..caf0af249e6 --- /dev/null +++ b/src/tests/modules/kafka/unreachable.unlang @@ -0,0 +1,68 @@ +# +# Exercise the two error paths a produce can take when the broker is +# dead: cancellation mid-flight, and delivery-report timeout. +# +# The `kafka_unreachable` module instance in module.conf points at +# 127.0.0.1:1 and its topic sets `message.timeout.ms` to 1000 (1s), +# so every produce here is guaranteed to fail - the only question is +# whether it fails via the signal path or via a natural DR. +# + +# +# Cancellation path. +# +# `parallel` starts two branches concurrently. Branch 1 calls +# `%kafka_unreachable.produce()` which yields waiting for a DR that +# won't arrive inside the timeout. 
Branch 2 immediately calls +# `%cancel(0)` to terminate itself; combined with the wrapping +# `timeout 200ms`, branch 1 gets an FR_SIGNAL_CANCEL delivered to +# kafka_xlat_produce_signal long before the 1s `message.timeout.ms` fires. +# +# The signal handler must detach ctx->request without freeing the +# ctx - librdkafka still owns the opaque and will call dr_msg_cb +# when the message eventually times out. At that point dr_msg_cb +# sees ctx->request == NULL and silently frees the ctx. No crash, no resume +# into a freed request. +# +# If the worker survives this cleanly, the subsequent produces in +# this test file will run on a healthy event loop - that's the real +# assertion here, not anything about the rcode of the cancelled call. +# +redundant { + timeout 200ms { + parallel { + redundant { + %kafka_unreachable.produce('freeradius-test-unreachable', "cancelled mid-flight") + } + group { + %cancel(0) + } + } + } + group { + ok + } +} + +# +# Delivery-report failure path. +# +# No timeout wrapper this time: let the produce run to completion so +# the DR actually fires. librdkafka can't reach the broker, retries +# until message.timeout.ms (1s), then delivers a DR with +# RD_KAFKA_RESP_ERR__MSG_TIMED_OUT. kafka_produce_resume translates +# that to a false return from the xlat. +# +if (%kafka_unreachable.produce('freeradius-test-unreachable', "doomed")) { + test_fail +} + +# +# And one more, to prove the worker's produce path is still healthy +# after both the cancel and the natural failure above. +# +if (%kafka_unreachable.produce('freeradius-test-unreachable', "also doomed")) { + test_fail +} + +test_pass