From 0839e6d8d9371b7301133c6a2799bab1c906d2e1 Mon Sep 17 00:00:00 2001
From: =?utf8?q?P=C3=A1draig=20Brady?=
Date: Sun, 19 Mar 2017 17:22:34 -0700
Subject: [PATCH] split: process more efficiently when filters exit early

* src/split.c (bytes_split): Don't write to an existing filter
if it has exited.  When filters exit early, skip input data if
possible.  Refactor out 2 redundant variables.
* tests/split/filter.sh: Improve test coverage given the new
more efficient processing.  Also use a 10TB file to expand
the file systems tested on.
---
 src/split.c           | 37 ++++++++++++++++++++++---------------
 tests/split/filter.sh | 33 +++++++++++++++++++++++----------
 2 files changed, 45 insertions(+), 25 deletions(-)

diff --git a/src/split.c b/src/split.c
index 85bc052a86..01f97afadc 100644
--- a/src/split.c
+++ b/src/split.c
@@ -623,6 +623,7 @@ bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize, size_t initial_read,
 {
   size_t n_read;
   bool new_file_flag = true;
+  bool filter_ok = true;
   uintmax_t to_write = n_bytes;
   uintmax_t opened = 0;
   bool eof;
@@ -637,42 +638,48 @@
         }
       else
         {
+          if (! filter_ok
+              && lseek (STDIN_FILENO, to_write, SEEK_CUR) != -1)
+            {
+              to_write = n_bytes;
+              new_file_flag = true;
+            }
+
           n_read = safe_read (STDIN_FILENO, buf, bufsize);
           if (n_read == SAFE_READ_ERROR)
             die (EXIT_FAILURE, errno, "%s", quotef (infile));
           eof = n_read == 0;
         }
       char *bp_out = buf;
-      size_t to_read = n_read;
-      while (to_write <= to_read)
+      while (to_write <= n_read)
         {
-          size_t w = to_write;
-          bool cwrite_ok = cwrite (new_file_flag, bp_out, w);
+          if (filter_ok || new_file_flag)
+            filter_ok = cwrite (new_file_flag, bp_out, to_write);
           opened += new_file_flag;
           new_file_flag = !max_files || (opened < max_files);
-          if (!new_file_flag && !cwrite_ok)
+          if (! filter_ok && ! new_file_flag)
             {
-              /* If filter no longer accepting input, stop reading.  */
-              n_read = to_read = 0;
+              /* If filters no longer accepting input, stop reading.  */
+              n_read = 0;
               eof = true;
               break;
             }
-          bp_out += w;
-          to_read -= w;
+          bp_out += to_write;
+          n_read -= to_write;
           to_write = n_bytes;
         }
-      if (to_read != 0)
+      if (n_read != 0)
         {
-          bool cwrite_ok = cwrite (new_file_flag, bp_out, to_read);
+          if (filter_ok || new_file_flag)
+            filter_ok = cwrite (new_file_flag, bp_out, n_read);
           opened += new_file_flag;
-          to_write -= to_read;
           new_file_flag = false;
-          if (!cwrite_ok && opened == max_files)
+          if (! filter_ok && opened == max_files)
             {
-              /* If filter no longer accepting input, stop reading.  */
-              n_read = 0;
+              /* If filters no longer accepting input, stop reading.  */
               break;
             }
+          to_write -= n_read;
         }
     }
   while (! eof);
diff --git a/tests/split/filter.sh b/tests/split/filter.sh
index a85093c764..a703b3bd04 100755
--- a/tests/split/filter.sh
+++ b/tests/split/filter.sh
@@ -18,8 +18,7 @@
 
 . "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
 print_ver_ split
-require_sparse_support_ # for 'truncate --size=$OFF_T_MAX'
-eval $(getlimits) # for OFF_T limits
+require_sparse_support_ # for 'truncate --size=$LARGE'
 xz --version || skip_ "xz (better than gzip/bzip2) required"
 
 for total_n_lines in 5 3000 20000; do
@@ -52,15 +51,29 @@ returns_ 1 split -n 1/2 --filter='true' /dev/null 2>&1 || fail=1
 # where they would result in a non zero exit from split.
 yes | head -n200K | split -b1G --filter='head -c1 >/dev/null' || fail=1
 
-# Do not use a size of OFF_T_MAX, since split.c applies a GNU/Hurd
-# /dev/zero workaround for files of that size.  Use one less:
-N=$(expr $OFF_T_MAX - 1)
-
 # Ensure that "endless" input is ignored when all filters finish
-timeout 10 sh -c 'yes | split --filter="head -c1 >/dev/null" -n r/1' || fail=1
-if truncate -s$N zero.in; then
-  timeout 10 sh -c 'split --filter="head -c1 >/dev/null" -n 1 zero.in' || fail=1
-fi
+for mode in '' 'r/'; do
+  FILE='-'
+  if test "$mode" = ''; then
+    FILE='zero.in'
+    truncate -s10T "$FILE" || continue
+  fi
+  for N in 1 2; do
+    rm -f x??.n || framework_failure_
+    timeout 10 sh -c \
+      "yes | split --filter='head -c1 >\$FILE.n' -n $mode$N $FILE" || fail=1
+    # Also ensure we get appropriate output from each filter
+    seq 1 $N | tr '0-9' 1 > stat.exp
+    stat -c%s x??.n > stat.out || framework_failure_
+    compare stat.exp stat.out || fail=1
+  done
+done
+
+# Ensure that "endless" input _is_ processed for unbounded number of filters
+for buf in 1000 1000000; do
+  returns_ 124 timeout .5 sh -c \
+    "yes | split --filter='head -c1 >/dev/null' -b $buf" || fail=1
+done
 
 # Ensure that "endless" input _is_ processed for unbounded number of filters
 for buf in 1000 1000000; do
-- 
2.47.2