split: split more evenly with -n

author Paul Eggert <eggert@cs.ucla.edu>

Sat, 4 Mar 2023 19:41:03 +0000 (11:41 -0800)

committer Paul Eggert <eggert@cs.ucla.edu>

Sat, 4 Mar 2023 22:49:45 +0000 (14:49 -0800)
author Paul Eggert <eggert@cs.ucla.edu>
Sat, 4 Mar 2023 19:41:03 +0000 (11:41 -0800)
committer Paul Eggert <eggert@cs.ucla.edu>
Sat, 4 Mar 2023 22:49:45 +0000 (14:49 -0800)
diff --git a/NEWS b/NEWS

index 2694cf3050e21475d05304299b90cf9dc73897c5..f7a95e7fb7fa4f9eba8da2ae2275dea1399f59df 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -106,6 +106,11 @@ GNU coreutils NEWS                                    -*- outline -*-
    internal errors it would exit with status 1, which was less distinguishable
    from errors from the invoked command.
  
+  'split -n N' now splits more evenly when the input size is not a
+  multiple of N, by creating N output files whose sizes differ by at
+  most 1 byte.  Formerly, it did this only when the input size was
+  less than N.
+
    'stat -c %s' now prints sizes as unsigned, consistent with 'ls'.
  
  ** New Features
diff --git a/doc/coreutils.texi b/doc/coreutils.texi

index 7ea910ba82662d6407b43aef099af66530c943f0..b07a330ebbebf4464adb45510cd0bfeddef19bca 100644 (file)
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -3393,8 +3393,8 @@ r/@var{n}    like @samp{l} but use round robin distribution
  r/@var{k}/@var{n}  likewise but output only @var{k}th of @var{n} to stdout
  @end example
  
-Any excess bytes remaining after dividing the @var{input}
-into @var{n} chunks, are assigned to the last chunk.
+If the input size is not a multiple of @var{n}, early output files are
+one byte longer than later output files, to make up the difference.
  Any excess bytes appearing after the initial calculation are discarded
  (except when using @samp{r} mode).
  
@@ -3402,8 +3402,8 @@ All @var{n} files are created even if there are fewer than @var{n} lines,
  or the @var{input} is truncated.
  
  For @samp{l} mode, chunks are approximately @var{input} size / @var{n}.
-The @var{input} is partitioned into @var{n} equal sized portions, with
-the last assigned any excess.  If a line @emph{starts} within a partition
+The @var{input} is still partitioned as described above into @var{n} partitions
+of approximately equal size, but if a line @emph{starts} within a partition
  it is written completely to the corresponding file.  Since lines or records
  are not split even if they overlap a partition, the files written
  can be larger or smaller than the partition size, and even empty
diff --git a/src/split.c b/src/split.c

index 574250d272a39643a9013fa86677df0990ba00b7..c66bc69a261fee58abb6a9cc09a702a798505c7b 100644 (file)
--- a/src/split.c
+++ b/src/split.c
@@ -619,21 +619,23 @@ cwrite (bool new_file_flag, char const *bp, size_t bytes)
  }
  
  /* Split into pieces of exactly N_BYTES bytes.
+   However, the first REM_BYTES pieces should be 1 byte longer.
     Use buffer BUF, whose size is BUFSIZE.
     BUF contains the first INITIAL_READ input bytes.  */
  
  static void
-bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize, size_t initial_read,
+bytes_split (uintmax_t n_bytes, uintmax_t rem_bytes,
+             char *buf, size_t bufsize, size_t initial_read,
               uintmax_t max_files)
  {
    size_t n_read;
    bool new_file_flag = true;
    bool filter_ok = true;
-  uintmax_t to_write = n_bytes;
    uintmax_t opened = 0;
-  bool eof;
+  uintmax_t to_write = n_bytes + (0 < rem_bytes);
+  bool eof = ! to_write;
  
-  do
+  while (! eof)
      {
        if (initial_read != SIZE_MAX)
          {
@@ -646,7 +648,7 @@ bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize, size_t initial_read,
            if (! filter_ok
                && lseek (STDIN_FILENO, to_write, SEEK_CUR) != -1)
              {
-              to_write = n_bytes;
+              to_write = n_bytes + (opened + 1 < rem_bytes);
                new_file_flag = true;
              }
  
@@ -656,7 +658,7 @@ bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize, size_t initial_read,
            eof = n_read == 0;
          }
        char *bp_out = buf;
-      while (to_write <= n_read)
+      while (0 < to_write && to_write <= n_read)
          {
            if (filter_ok || new_file_flag)
              filter_ok = cwrite (new_file_flag, bp_out, to_write);
@@ -671,7 +673,7 @@ bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize, size_t initial_read,
              }
            bp_out += to_write;
            n_read -= to_write;
-          to_write = n_bytes;
+          to_write = n_bytes + (opened < rem_bytes);
          }
        if (n_read != 0)
          {
@@ -687,7 +689,6 @@ bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize, size_t initial_read,
            to_write -= n_read;
          }
      }
-  while (! eof);
  
    /* Ensure NUMBER files are created, which truncates
       any existing files or notifies any consumers on fifos.
@@ -864,19 +865,20 @@ static void
  lines_chunk_split (uintmax_t k, uintmax_t n, char *buf, size_t bufsize,
                     size_t initial_read, off_t file_size)
  {
-  assert (n && k <= n && n <= file_size);
+  assert (n && k <= n);
  
-  const off_t chunk_size = file_size / n;
+  uintmax_t rem_bytes = file_size % n;
+  off_t chunk_size = file_size / n;
    uintmax_t chunk_no = 1;
-  off_t chunk_end = chunk_size;
+  off_t chunk_end = chunk_size + (0 < rem_bytes);
    off_t n_written = 0;
    bool new_file_flag = true;
    bool chunk_truncated = false;
  
-  if (k > 1)
+  if (k > 1 && 0 < file_size)
      {
        /* Start reading 1 byte before kth chunk of file.  */
-      off_t start = (k - 1) * chunk_size - 1;
+      off_t start = (k - 1) * chunk_size + MIN (k - 1, rem_bytes) - 1;
        if (start < initial_read)
          {
            memmove (buf, buf + start, initial_read - start);
@@ -890,7 +892,7 @@ lines_chunk_split (uintmax_t k, uintmax_t n, char *buf, size_t bufsize,
          }
        n_written = start;
        chunk_no = k - 1;
-      chunk_end = chunk_no * chunk_size;
+      chunk_end = start + 1;
      }
  
    while (n_written < file_size)
@@ -904,13 +906,13 @@ lines_chunk_split (uintmax_t k, uintmax_t n, char *buf, size_t bufsize,
          }
        else
          {
-          n_read = safe_read (STDIN_FILENO, buf, bufsize);
+          n_read = safe_read (STDIN_FILENO, buf,
+                              MIN (bufsize, file_size - n_written));
            if (n_read == SAFE_READ_ERROR)
              die (EXIT_FAILURE, errno, "%s", quotef (infile));
          }
        if (n_read == 0)
          break; /* eof.  */
-      n_read = MIN (n_read, file_size - n_written);
        chunk_truncated = false;
        eob = buf + n_read;
  
@@ -956,13 +958,10 @@ lines_chunk_split (uintmax_t k, uintmax_t n, char *buf, size_t bufsize,
                    chunk_truncated = true;
                    break;
                  }
-              chunk_no++;
-              if (k && chunk_no > k)
+              if (k == chunk_no)
                  return;
-              if (chunk_no == n)
-                chunk_end = file_size; /* >= chunk_size.  */
-              else
-                chunk_end += chunk_size;
+              chunk_end += chunk_size + (chunk_no < rem_bytes);
+              chunk_no++;
                if (chunk_end <= n_written)
                  {
                    if (! k)
@@ -994,10 +993,10 @@ bytes_chunk_extract (uintmax_t k, uintmax_t n, char *buf, size_t bufsize,
    off_t start;
    off_t end;
  
-  assert (k && n && k <= n && n <= file_size);
+  assert (0 < k && k <= n);
  
-  start = (k - 1) * (file_size / n);
-  end = (k == n) ? file_size : k * (file_size / n);
+  start = (k - 1) * (file_size / n) + MIN (k - 1, file_size % n);
+  end = k == n ? file_size : k * (file_size / n) + MIN (k, file_size % n);
  
    if (start < initial_read)
      {
@@ -1607,9 +1606,6 @@ main (int argc, char **argv)
                 _("invalid number of chunks"),
                 quote (umaxtostr (n_units, buffer)));
          }
-      /* increase file_size to n_units here, so that we still process
-         any input data, and create empty files for the rest.  */
-      file_size = MAX (file_size, n_units);
      }
  
    /* When filtering, closure of one pipe must not terminate the process,
@@ -1632,7 +1628,7 @@ main (int argc, char **argv)
        break;
  
      case type_bytes:
-      bytes_split (n_units, buf, in_blk_size, SIZE_MAX, 0);
+      bytes_split (n_units, 0, buf, in_blk_size, SIZE_MAX, 0);
        break;
  
      case type_byteslines:
@@ -1641,8 +1637,8 @@ main (int argc, char **argv)
  
      case type_chunk_bytes:
        if (k_units == 0)
-        bytes_split (file_size / n_units, buf, in_blk_size, initial_read,
-                     n_units);
+        bytes_split (file_size / n_units, file_size % n_units,
+                     buf, in_blk_size, initial_read, n_units);
        else
          bytes_chunk_extract (k_units, n_units, buf, in_blk_size, initial_read,
                               file_size);
diff --git a/tests/split/b-chunk.sh b/tests/split/b-chunk.sh

index b3195000fdc90aa851c29cfade03ec743437eff0..1e9a6f6e9a5d1ec59a27ad1da3631d9a19113b2c 100755 (executable)
--- a/tests/split/b-chunk.sh
+++ b/tests/split/b-chunk.sh
@@ -25,6 +25,24 @@ split -n 10 /dev/null || fail=1
  test "$(stat -c %s x* | uniq -c | sed 's/^ *//; s/ /x/')" = "10x0" || fail=1
  rm -f x??
  
+printf 'abc' > abc || framework_failure_
+printf 'a' > exp-a || framework_failure_
+printf 'b' > exp-b || framework_failure_
+printf 'c' > exp-c || framework_failure_
+printf 'ab' > exp-ab || framework_failure_
+split -n 4 abc || fail=1
+compare exp-a xaa || fail=1
+compare exp-b xab || fail=1
+compare exp-c xac || fail=1
+compare /dev/null xad || fail=1
+test ! -f xae || fail=1
+rm -f x??
+split -n 2 abc || fail=1
+compare exp-ab xaa || fail=1
+compare exp-c xab || fail=1
+test ! -f xac || fail=1
+rm -f x??
+
  # When extracting K of N where N > file size
  # no data is extracted, and no files are written
  split -n 2/3 /dev/null || fail=1
@@ -35,9 +53,9 @@ split -e -n 10 /dev/null || fail=1
  returns_ 1 stat x?? 2>/dev/null || fail=1
  
  printf '1\n2\n3\n4\n5\n' > input || framework_failure_
-printf '1\n2' > exp-1 || framework_failure_
-printf '\n3\n' > exp-2 || framework_failure_
-printf '4\n5\n' > exp-3 || framework_failure_
+printf '1\n2\n' > exp-1 || framework_failure_
+printf '3\n4' > exp-2 || framework_failure_
+printf '\n5\n' > exp-3 || framework_failure_
  
  for file in input /proc/version /sys/kernel/profiling; do
    test -f $file || continue
diff --git a/tests/split/l-chunk.sh b/tests/split/l-chunk.sh

index 73cd729a4e28fd259f0e8e3f03cc6d0f09a56448..cdb201746d520e79cda53a64107ae99ac6127c3c 100755 (executable)
--- a/tests/split/l-chunk.sh
+++ b/tests/split/l-chunk.sh
@@ -59,11 +59,11 @@ sed "s/': .*/'/" < err.t > err || framework_failure_
  compare exp err || fail=1
  
  printf '%s' "\
-14 16 09 15 16 10
+14 16 16 08 16 10
  14 08 08 10 14 08 08 10
-06 08 08 02 06 08 08 02 06 08 08 10
-06 08 02 06 08 00 08 02 06 08 02 06 08 00 10
-06 00 08 00 02 06 00 02 06 00 08 00 01 07 00 02 06 00 08 00 02 16
+08 06 08 08 08 08 08 02 06 08 08 02
+06 08 08 02 06 08 02 06 08 02 06 08 00 08 02
+06 02 06 02 06 02 06 02 06 02 06 02 06 02 06 00 08 00 02 06 00 02
  " > exp || framework_failure_
  
  sed 's/00 *//g' exp > exp.elide_empty || framework_failure_
@@ -120,17 +120,13 @@ test "$DEBUGGING" && test "$VERBOSE" && set -x
  
  
  # Check extraction of particular chunks
-> out
-printf '1\n12345\n' > exp
-split -n l/13/15 in > out
+split -n l/13/15 in > out &&
+compare /dev/null out || fail=1
+printf '1\n12345\n' > exp || framework_failure_
+split -n l/14/15 in > out &&
  compare exp out || fail=1
-> out
-printf '' > exp
-split -n l/14/15 in > out
-compare exp out || fail=1
-> out
-printf '1\n12345\n1\n' > exp
-split -n l/15/15 in > out
+printf '1\n' > exp || framework_failure_
+split -n l/15/15 in > out &&
  compare exp out || fail=1
  
  # test input with no \n at end
author	Paul Eggert <eggert@cs.ucla.edu>
	Sat, 4 Mar 2023 19:41:03 +0000 (11:41 -0800)
committer	Paul Eggert <eggert@cs.ucla.edu>
	Sat, 4 Mar 2023 22:49:45 +0000 (14:49 -0800)
NEWS		patch | blob | blame | history
doc/coreutils.texi		patch | blob | blame | history
src/split.c		patch | blob | blame | history
tests/split/b-chunk.sh		patch | blob | blame | history
tests/split/l-chunk.sh		patch | blob | blame | history