From: Luca Boccassi Date: Tue, 26 May 2026 00:06:40 +0000 (+0100) Subject: test: skip TEST-55-OOMD entirely if stress-ng is broken on this host X-Git-Tag: v261-rc2~16 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=4840da2fdf044d78e6212c48c098c3cc6b17be7c;p=thirdparty%2Fsystemd.git test: skip TEST-55-OOMD entirely if stress-ng is broken on this host This reverts commit a17efef137 ("test: try to detect SIGILL in stress-ng and skip TEST-55-OOMD gracefully") and replaces it with a single check to skip the test cases. The previous check was not reliable as stress-ng can catch SIGILL itself and exit with an error: stress-ng[1068]: stress-ng: debug: [1068] caught SIGILL, address \ 0x00005632f8330140 (ILL_ILLOPN) stress-ng[1068]: stress-ng: debug: [1068] stress-ng: info: \ 0x00005632f8330140:<62>71 fd 48 6f 2d 36 14 1c 00 c5 d1 ef ed 49 29 ... stress-ng[1053]: stress-ng: error: [1053] vm: [1061] terminated \ with an error, exit status=2 (stressor failed) ... systemd[1]: TEST-55-OOMD-slowrule.service: Main process exited, \ code=exited, status=2/INVALIDARGUMENT systemd[1]: TEST-55-OOMD-slowrule.service: Failed with result \ 'exit-code'. Try to detect at the beginning of the test and skip the test case if it happens. --- diff --git a/test/units/TEST-55-OOMD.sh b/test/units/TEST-55-OOMD.sh index 9993e989377..972594cf15d 100755 --- a/test/units/TEST-55-OOMD.sh +++ b/test/units/TEST-55-OOMD.sh @@ -19,6 +19,20 @@ if [[ -s /skipped ]]; then exit 77 fi +# stress-ng can fail with SIGILL because GCC's target_clones / ifunc resolver +# picks an AVX-512 variant of a stressor function based on CPUID, even when +# the actual CPU (e.g. in some VMs) does not implement AVX-512 +STRESS_NG_BROKEN=0 +stress_ng_preflight_out=$(mktemp) +if ! 
timeout --kill-after=5s 10s stress-ng --timeout 2s --vm 4 --vm-bytes 10M --vm-keep \ + >"$stress_ng_preflight_out" 2>&1; then + if grep -E "caught SIG(ILL|SEGV|BUS|FPE)" "$stress_ng_preflight_out" >/dev/null; then + STRESS_NG_BROKEN=1 + fi +fi +rm -f "$stress_ng_preflight_out" +unset stress_ng_preflight_out + # Activate swap file if we are in a VM if systemd-detect-virt --vm --quiet; then swapoff --all @@ -94,23 +108,12 @@ else systemd-run -t -p MemoryMax=10M -p MemorySwapMax=0 -p MemoryZSwapMax=0 true fi -# stress-ng can fail with SIGILL due to trying to use AVX-512 on older CPUs, try to detect and avoid failing -stress_ng_sigilled() { - local result status sigill - local unit="${1:?}" - shift - - result=$(systemctl "$@" show "$unit" -P Result) - status=$(systemctl "$@" show "$unit" -P ExecMainStatus) - sigill=$(kill -l ILL) - - [[ "$status" == "$sigill" && ( "$result" == "signal" || "$result" == "core-dump" ) ]] -} - test_basic() { local cgroup_path="${1:?}" shift + [[ "$STRESS_NG_BROKEN" == "1" ]] && { echo "stress-ng is broken on this host, skipping ${FUNCNAME[0]}"; return 0; } + systemctl "$@" start TEST-55-OOMD-testchill.service systemctl "$@" status TEST-55-OOMD-testchill.service systemctl "$@" status TEST-55-OOMD-workload.slice @@ -136,11 +139,7 @@ test_basic() { if systemctl "$@" status TEST-55-OOMD-testbloat.service; then exit 42; fi if ! 
systemctl "$@" status TEST-55-OOMD-testchill.service; then exit 24; fi - if stress_ng_sigilled TEST-55-OOMD-testbloat.service "$@"; then - echo "stress-ng died with SIGILL, skipping ManagedOOMKills assertion" - else - assert_eq "$(systemctl "$@" show TEST-55-OOMD-testbloat.service -P ManagedOOMKills)" "1" - fi + assert_eq "$(systemctl "$@" show TEST-55-OOMD-testbloat.service -P ManagedOOMKills)" "1" systemctl "$@" kill --signal=KILL TEST-55-OOMD-testbloat.service || : systemctl "$@" stop TEST-55-OOMD-testbloat.service @@ -169,6 +168,8 @@ testcase_preference_avoid() { return 0 fi + [[ "$STRESS_NG_BROKEN" == "1" ]] && { echo "stress-ng is broken on this host, skipping ${FUNCNAME[0]}"; return 0; } + mkdir -p /run/systemd/system/TEST-55-OOMD-testbloat.service.d/ cat >/run/systemd/system/TEST-55-OOMD-testbloat.service.d/99-managed-oom-preference.conf </run/systemd/system/TEST-55-OOMD-testmunch.service.d/99-duration-test.conf </run/systemd/oomd/rules.d/testrule.oomrule <<'EOF' @@ -467,6 +464,8 @@ testcase_oom_rulesets_lasting_sec() { # Baseline proof: with the same workload but LastingSec=0 (testcase_oom_rulesets # above) oomd kills the unit within a couple of seconds, so an active unit after # ~6 s demonstrates LastingSec is being respected. + [[ "$STRESS_NG_BROKEN" == "1" ]] && { echo "stress-ng is broken on this host, skipping ${FUNCNAME[0]}"; return 0; } + mkdir -p /run/systemd/oomd/rules.d/ cat >/run/systemd/oomd/rules.d/slowrule.oomrule <<'EOF' [Rule] @@ -488,13 +487,9 @@ EOF # many times. With LastingSec=1h the kill must not fire. sleep 6 - if stress_ng_sigilled TEST-55-OOMD-slowrule.service; then - echo "stress-ng died with SIGILL, skipping testcase_oom_rulesets_lasting_sec assertions" - else - # Unit must still be active. If it were killed, Result= would be oom-kill. 
- assert_eq "$(systemctl show TEST-55-OOMD-slowrule.service -P ActiveState)" "active" - assert_eq "$(systemctl show TEST-55-OOMD-slowrule.service -P Result)" "success" - fi + # Unit must still be active — if it were killed, Result= would be oom-kill. + assert_eq "$(systemctl show TEST-55-OOMD-slowrule.service -P ActiveState)" "active" + assert_eq "$(systemctl show TEST-55-OOMD-slowrule.service -P Result)" "success" systemctl stop TEST-55-OOMD-slowrule.service 2>/dev/null || true @@ -504,6 +499,8 @@ EOF } testcase_prekill_hook() { + [[ "$STRESS_NG_BROKEN" == "1" ]] && { echo "stress-ng is broken on this host, skipping ${FUNCNAME[0]}"; return 0; } + cat >/run/systemd/oomd.conf.d/99-oomd-prekill-test.conf <<'EOF' [OOM] PrekillHookTimeoutSec=3s @@ -513,11 +510,6 @@ EOF systemctl reload systemd-oomd.service ! systemctl start --wait TEST-55-OOMD-testbloat.service || exit 1 - if stress_ng_sigilled TEST-55-OOMD-testbloat.service; then - echo "stress-ng died with SIGILL, skipping testcase_prekill_hook" - return 0 - fi - # one hook mkdir -p /run/systemd/oomd.prekill.hook/ ncat --recv-only -kUl /run/systemd/oomd.prekill.hook/althook >/tmp/oomd_event.json &