runtests: better detect and handle pipe errors in the controller

author Dan Fandrich <dan@coneharvesters.com>

Fri, 16 Jun 2023 03:10:24 +0000 (20:10 -0700)

committer Dan Fandrich <dan@coneharvesters.com>

Tue, 20 Jun 2023 00:14:27 +0000 (17:14 -0700)
author Dan Fandrich <dan@coneharvesters.com>
Fri, 16 Jun 2023 03:10:24 +0000 (20:10 -0700)
committer Dan Fandrich <dan@coneharvesters.com>
Tue, 20 Jun 2023 00:14:27 +0000 (17:14 -0700)
diff --git a/tests/runner.pm b/tests/runner.pm

index 65dbd6f9b96797756d164b6b572cd4213511a914..813c66989070f6f64b76da152b50b80f8d7f89a2 100644 (file)
--- a/tests/runner.pm
+++ b/tests/runner.pm
@@ -1171,7 +1171,7 @@ sub runner_test_run {
  # Async call runner_clearlocks
  # Called by controller
  sub runnerac_clearlocks {
-    controlleripccall(\&runner_clearlocks, @_);
+    return controlleripccall(\&runner_clearlocks, @_);
  }
  
  # Async call runner_shutdown
@@ -1180,36 +1180,38 @@ sub runnerac_clearlocks {
  # Called by controller
  sub runnerac_shutdown {
      my ($runnerid)=$_[0];
-    controlleripccall(\&runner_shutdown, @_);
+    my $err = controlleripccall(\&runner_shutdown, @_);
  
      # These have no more use
      close($controllerw{$runnerid});
      undef $controllerw{$runnerid};
      close($controllerr{$runnerid});
      undef $controllerr{$runnerid};
+    return $err;
  }
  
  # Async call of runner_stopservers
  # Called by controller
  sub runnerac_stopservers {
-    controlleripccall(\&runner_stopservers, @_);
+    return controlleripccall(\&runner_stopservers, @_);
  }
  
  # Async call of runner_test_preprocess
  # Called by controller
  sub runnerac_test_preprocess {
-    controlleripccall(\&runner_test_preprocess, @_);
+    return controlleripccall(\&runner_test_preprocess, @_);
  }
  
  # Async call of runner_test_run
  # Called by controller
  sub runnerac_test_run {
-    controlleripccall(\&runner_test_run, @_);
+    return controlleripccall(\&runner_test_run, @_);
  }
  
  ###################################################################
  # Call an arbitrary function via IPC
  # The first argument is the function reference, the second is the runner ID
+# Returns 0 on success, -1 on error writing to runner
  # Called by controller (indirectly, via a more specific function)
  sub controlleripccall {
      my $funcref = shift @_;
@@ -1223,30 +1225,44 @@ sub controlleripccall {
      my $margs = freeze \@_;
  
      # Send IPC call via pipe
-    syswrite($controllerw{$runnerid}, (pack "L", length($margs)) . $margs);
+    my $err;
+    while(! defined ($err = syswrite($controllerw{$runnerid}, (pack "L", length($margs)) . $margs)) || $err <= 0) {
+        if((!defined $err && ! $!{EINTR}) || (defined $err && $err == 0)) {
+            # Runner has likely died
+            return -1;
+        }
+        # system call was interrupted, probably by ^C; restart it so we stay in sync
+    }
  
      if(!$multiprocess) {
          # Call the remote function here in single process mode
          ipcrecv();
       }
+     return 0;
  }
  
  ###################################################################
  # Receive async response of a previous call via IPC
-# The first return value is the runner ID
+# The first return value is the runner ID or undef on error
  # Called by controller
  sub runnerar {
      my ($runnerid) = @_;
      my $err;
      my $datalen;
      while(! defined ($err = sysread($controllerr{$runnerid}, $datalen, 4)) || $err <= 0) {
-        $!{EINTR} || die "error in runnerar: $!\n";
+        if((!defined $err && ! $!{EINTR}) || (defined $err && $err == 0)) {
+            # Runner is likely dead and closed the pipe
+            return undef;
+        }
          # system call was interrupted, probably by ^C; restart it so we stay in sync
      }
      my $len=unpack("L", $datalen);
      my $buf;
      while(! defined ($err = sysread($controllerr{$runnerid}, $buf, $len)) || $err <= 0) {
-        $!{EINTR} || die "error in runnerar: $!\n";
+        if((!defined $err && ! $!{EINTR}) || (defined $err && $err == 0)) {
+            # Runner is likely dead and closed the pipe
+            return undef;
+        }
          # system call was interrupted, probably by ^C; restart it so we stay in sync
      }
  
@@ -1260,7 +1276,9 @@ sub runnerar {
  }
  
  ###################################################################
-# Returns runner ID if a response from an async call is ready
+# Returns runner ID if a response from an async call is ready or error
+# First value is ready, second is error, however an error case shows up
+# as ready in Linux, so you can't trust it.
  # argument is 0 for nonblocking, undef for blocking, anything else for timeout
  # Called by controller
  sub runnerar_ready {
@@ -1279,19 +1297,26 @@ sub runnerar_ready {
      $maxfileno || die "Internal error: no runners are available to wait on\n";
  
      # Wait for any pipe from any runner to be ready
+    # This may be interrupted and return EINTR, but this is ignored and the
+    # caller will need to later call this function again.
      # TODO: this is relatively slow with hundreds of fds
-    # TODO: handle errors
-    if(select(my $rout=$rin, undef, undef, $blocking)) {
+    my $ein = $rin;
+    if(select(my $rout=$rin, undef, my $eout=$ein, $blocking) >= 1) {
          for my $fd (0..$maxfileno) {
+            # Return an error condition first in case it's both
+            if(vec($eout, $fd, 1)) {
+                return (undef, $idbyfileno{$fd});
+            }
              if(vec($rout, $fd, 1)) {
-                return $idbyfileno{$fd};
+                return ($idbyfileno{$fd}, undef);
              }
          }
          die "Internal pipe readiness inconsistency\n";
      }
-    return undef;
+    return (undef, undef);
  }
  
+
  ###################################################################
  # Cleanly abort and exit the runner
  # This uses print since there is no longer any controller to write logs.
diff --git a/tests/runtests.pl b/tests/runtests.pl

index 2c10c127795773a67350a251afbe102a66d502b5..d7a6843c52205ca65023eefe1af28e38d61d5958 100755 (executable)
--- a/tests/runtests.pl
+++ b/tests/runtests.pl
@@ -272,6 +272,7 @@ sub catch_usr1 {
  }
  
  $SIG{USR1} = \&catch_usr1;
+$SIG{PIPE} = 'IGNORE';  # these errors are captured in the read/write calls
  
  ##########################################################################
  # Clear all possible '*_proxy' environment variables for various protocols
@@ -1505,12 +1506,20 @@ sub singletest_check {
              if(!$filename) {
                  logmsg "ERROR: section verify=>file$partsuffix ".
                         "has no name attribute\n";
-                runnerac_stopservers($runnerid);
-                # TODO: this is a blocking call that will stall the controller,
-                # but this error condition should never happen except during
-                # development.
-                my ($rid, $unexpected, $logs) = runnerar($runnerid);
-                logmsg $logs;
+                if (runnerac_stopservers($runnerid)) {
+                    logmsg "ERROR: runner $runnerid seems to have died\n";
+                } else {
+
+                    # TODO: this is a blocking call that will stall the controller,
+                    # but this error condition should never happen except during
+                    # development.
+                    my ($rid, $unexpected, $logs) = runnerar($runnerid);
+                    if(!$rid) {
+                        logmsg "ERROR: runner $runnerid seems to have died\n";
+                    } else {
+                        logmsg $logs;
+                    }
+                }
                  # timestamp test result verification end
                  $timevrfyend{$testnum} = Time::HiRes::time();
                  return -1;
@@ -1740,7 +1749,11 @@ sub singletest {
              && $clearlocks) {
              # On Windows, lock files can't be deleted when the process still
              # has them open, so kill those processes first
-            runnerac_clearlocks($runnerid, "$logdir/$LOCKDIR");
+            if(runnerac_clearlocks($runnerid, "$logdir/$LOCKDIR")) {
+                logmsg "ERROR: runner $runnerid seems to have died\n";
+                $singletest_state{$runnerid} = ST_INIT;
+                return (-1, 0);
+            }
              $singletest_state{$runnerid} = ST_CLEARLOCKS;
          } else {
              $singletest_state{$runnerid} = ST_INITED;
@@ -1751,6 +1764,11 @@ sub singletest {
  
      } elsif($singletest_state{$runnerid} == ST_CLEARLOCKS) {
          my ($rid, $logs) = runnerar($runnerid);
+        if(!$rid) {
+            logmsg "ERROR: runner $runnerid seems to have died\n";
+            $singletest_state{$runnerid} = ST_INIT;
+            return (-1, 0);
+        }
          logmsg $logs;
          my $logdir = getrunnerlogdir($runnerid);
          cleardir($logdir);
@@ -1776,11 +1794,20 @@ sub singletest {
          # Register the test case with the CI environment
          citest_starttest($testnum);
  
-        runnerac_test_preprocess($runnerid, $testnum);
+        if(runnerac_test_preprocess($runnerid, $testnum)) {
+            logmsg "ERROR: runner $runnerid seems to have died\n";
+            $singletest_state{$runnerid} = ST_INIT;
+            return (-1, 0);
+        }
          $singletest_state{$runnerid} = ST_PREPROCESS;
  
      } elsif($singletest_state{$runnerid} == ST_PREPROCESS) {
          my ($rid, $why, $error, $logs, $testtimings) = runnerar($runnerid);
+        if(!$rid) {
+            logmsg "ERROR: runner $runnerid seems to have died\n";
+            $singletest_state{$runnerid} = ST_INIT;
+            return (-1, 0);
+        }
          logmsg $logs;
          updatetesttimings($testnum, %$testtimings);
          if($error == -2) {
@@ -1813,11 +1840,20 @@ sub singletest {
          my $CURLOUT;
          my $tool;
          my $usedvalgrind;
-        runnerac_test_run($runnerid, $testnum);
+        if(runnerac_test_run($runnerid, $testnum)) {
+            logmsg "ERROR: runner $runnerid seems to have died\n";
+            $singletest_state{$runnerid} = ST_INIT;
+            return (-1, 0);
+        }
          $singletest_state{$runnerid} = ST_RUN;
  
      } elsif($singletest_state{$runnerid} == ST_RUN) {
          my ($rid, $error, $logs, $testtimings, $cmdres, $CURLOUT, $tool, $usedvalgrind) = runnerar($runnerid);
+        if(!$rid) {
+            logmsg "ERROR: runner $runnerid seems to have died\n";
+            $singletest_state{$runnerid} = ST_INIT;
+            return (-1, 0);
+        }
          logmsg $logs;
          updatetesttimings($testnum, %$testtimings);
          if($error == -1) {
@@ -2782,8 +2818,13 @@ while () {
              # this runner is busy running a test
              $runnersrunning{$runnerid} = $testnum;
          } else {
-            # We make this assumption to avoid having to handle $error here
-            die "Internal error: test must not complete on first call";
+            runnerready($runnerid);
+            if($error >= 0) {
+                # We make this simplifying assumption to avoid having to handle
+                # $error properly here, but we must handle the case of runner
+                # death without abending here.
+                die "Internal error: test must not complete on first call";
+            }
          }
      }
  
@@ -2799,7 +2840,15 @@ while () {
      # one immediately. If all runners are busy, wait a fraction of a second
      # for one to finish so we can still loop around to check the abort flag.
      my $runnerwait = scalar(@runnersidle) && scalar(@runtests) ? 0 : 0.5;
-    my $ridready = runnerar_ready($runnerwait);
+    my ($ridready, $riderror) = runnerar_ready($runnerwait);
+    if($ridready && ! defined $runnersrunning{$ridready}) {
+        # On Linux, a closed pipe still shows up as ready instead of error.
+        # Detect this here by seeing if we are expecting it to be ready and
+        # treat it as an error if not.
+        logmsg "ERROR: Runner $ridready is unexpectedly ready; is probably actually dead\n";
+        $riderror = $ridready;
+        undef $ridready;
+    }
      if($ridready) {
          # This runner is ready to be serviced
          my $testnum = $runnersrunning{$ridready};
@@ -2812,7 +2861,6 @@ while () {
          } else {
              # Test is complete
              runnerready($ridready);
-print "COMPLETED $testnum \n" if($verbose); #. join(",", keys(%runnersrunning)) . "\n";
  
              if($error < 0) {
                  # not a test we can run
@@ -2847,6 +2895,11 @@ print "COMPLETED $testnum \n" if($verbose); #. join(",", keys(%runnersrunning))
              }
          }
      }
+    if($riderror) {
+        logmsg "ERROR: runner $riderror is dead! aborting test run\n";
+        delete $runnersrunning{$riderror} if(defined $runnersrunning{$riderror});
+        $globalabort = 1;
+    }
  }
  
  my $sofar = time() - $start;
author	Dan Fandrich <dan@coneharvesters.com>
	Fri, 16 Jun 2023 03:10:24 +0000 (20:10 -0700)
committer	Dan Fandrich <dan@coneharvesters.com>
	Tue, 20 Jun 2023 00:14:27 +0000 (17:14 -0700)
tests/runner.pm		patch \| blob \| blame \| history
tests/runtests.pl		patch \| blob \| blame \| history