git.ipfire.org Git - thirdparty/linux.git/commitdiff
torture: Make kvm-remote.sh give up on unresponsive system
author: Paul E. McKenney <paulmck@kernel.org>
Wed, 9 Oct 2024 18:56:28 +0000 (11:56 -0700)
committer: Uladzislau Rezki (Sony) <urezki@gmail.com>
Sat, 14 Dec 2024 15:16:58 +0000 (16:16 +0100)
Currently, a system that stops responding at the wrong time will hang
kvm-remote.sh.  This can happen when the system in question is forced
offline for maintenance, and there is currently no way for the user
to kick this script into moving ahead.  This commit therefore causes
kvm-remote.sh to wait at most 15 minutes for a non-responsive system,
that is, a system for which ssh gives an exit code of 255.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
tools/testing/selftests/rcutorture/bin/kvm-remote.sh

index 134cdef5a6e087f4b0d55529e946357d0b3fb8c2..48a8052d5dae3754f9497f487b4a744aba8211dd 100755 (executable)
@@ -181,10 +181,11 @@ done
 
 # Function to check for presence of a file on the specified system.
 # Complain if the system cannot be reached, and retry after a wait.
-# Currently just waits forever if a machine disappears.
+# Currently just waits 15 minutes if a machine disappears.
 #
 # Usage: checkremotefile system pathname
 checkremotefile () {
+       local nsshfails=0
        local ret
        local sleeptime=60
 
@@ -195,6 +196,11 @@ checkremotefile () {
                if test "$ret" -eq 255
                then
                        echo " ---" ssh failure to $1 checking for file $2, retry after $sleeptime seconds. `date` | tee -a "$oldrun/remote-log"
+                       nsshfails=$((nsshfails+1))
+                       if ((nsshfails > 15))
+                       then
+                               return 255
+                       fi
                elif test "$ret" -eq 0
                then
                        return 0
@@ -268,12 +274,23 @@ echo All batches started. `date` | tee -a "$oldrun/remote-log"
 for i in $systems
 do
        echo " ---" Waiting for $i `date` | tee -a "$oldrun/remote-log"
-       while checkremotefile "$i" "$resdir/$ds/remote.run"
+       while :
        do
+               checkremotefile "$i" "$resdir/$ds/remote.run"
+               ret=$?
+               if test "$ret" -eq 1
+               then
+                       echo " ---" Collecting results from $i `date` | tee -a "$oldrun/remote-log"
+                       ( cd "$oldrun"; ssh -o BatchMode=yes $i "cd $rundir; tar -czf - kvm-remote-*.sh.out */console.log */kvm-test-1-run*.sh.out */qemu[_-]pid */qemu-retval */qemu-affinity; rm -rf $T > /dev/null 2>&1" | tar -xzf - )
+                       break;
+               fi
+               if test "$ret" -eq 255
+               then
+                       echo System $i persistent ssh failure, lost results `date` | tee -a "$oldrun/remote-log"
+                       break;
+               fi
                sleep 30
        done
-       echo " ---" Collecting results from $i `date` | tee -a "$oldrun/remote-log"
-       ( cd "$oldrun"; ssh -o BatchMode=yes $i "cd $rundir; tar -czf - kvm-remote-*.sh.out */console.log */kvm-test-1-run*.sh.out */qemu[_-]pid */qemu-retval */qemu-affinity; rm -rf $T > /dev/null 2>&1" | tar -xzf - )
 done
 
 ( kvm-end-run-stats.sh "$oldrun" "$starttime"; echo $? > $T/exitcode ) | tee -a "$oldrun/remote-log"