split: new -t option to select record separator

author Assaf Gordon <assafgordon@gmail.com>

Wed, 7 Jan 2015 23:30:28 +0000 (18:30 -0500)

committer Pádraig Brady <P@draigBrady.com>

Mon, 19 Jan 2015 23:22:37 +0000 (23:22 +0000)
author Assaf Gordon <assafgordon@gmail.com>
Wed, 7 Jan 2015 23:30:28 +0000 (18:30 -0500)
committer Pádraig Brady <P@draigBrady.com>
Mon, 19 Jan 2015 23:22:37 +0000 (23:22 +0000)
diff --git a/NEWS b/NEWS

index f59bfc1d6339e825efd6a0cf99e4b2c49ebaf33d..e0a2893cb1ee52d7209cd7ba42736386354a42e9 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -45,6 +45,9 @@ GNU coreutils NEWS                                    -*- outline -*-
    dd accepts a new status=progress level to print data transfer statistics
    on stderr approximately every second.
  
+  split accepts a new --separator option to select a record separator character
+  other than the default newline character.
+
  ** Changes in behavior
  
    df no longer suppresses separate exports of the same remote device, as
diff --git a/doc/coreutils.texi b/doc/coreutils.texi

index 1cc65329ce0dc7c65bc94af11224699a51066b90..5a3c31a15cd398681c97648b45b313a9401a1c52 100644 (file)
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -3395,6 +3395,8 @@ The program accepts the following options.  Also see @ref{Common options}.
  @opindex -l
  @opindex --lines
  Put @var{lines} lines of @var{input} into each output file.
+If @option{--separator} is specified, then @var{lines} determines
+the number of records.
  
  For compatibility @command{split} also supports an obsolete
  option syntax @option{-@var{lines}}.  New scripts should use
@@ -3412,9 +3414,11 @@ Put @var{size} bytes of @var{input} into each output file.
  @opindex -C
  @opindex --line-bytes
  Put into each output file as many complete lines of @var{input} as
-possible without exceeding @var{size} bytes.  Individual lines longer than
-@var{size} bytes are broken into multiple files.
+possible without exceeding @var{size} bytes.  Individual lines or records
+longer than @var{size} bytes are broken into multiple files.
  @var{size} has the same format as for the @option{--bytes} option.
+If @option{--separator} is specified, then @var{lines} determines
+the number of records.
  
  @item --filter=@var{command}
  @opindex --filter
@@ -3445,7 +3449,7 @@ Split @var{input} to @var{chunks} output files where @var{chunks} may be:
  @example
  @var{n}      generate @var{n} files based on current size of @var{input}
  @var{k}/@var{n}    only output @var{k}th of @var{n} to stdout
-l/@var{n}    generate @var{n} files without splitting lines
+l/@var{n}    generate @var{n} files without splitting lines or records
  l/@var{k}/@var{n}  likewise but only output @var{k}th of @var{n} to stdout
  r/@var{n}    like @samp{l} but use round robin distribution
  r/@var{k}/@var{n}  likewise but only output @var{k}th of @var{n} to stdout
@@ -3462,10 +3466,10 @@ or the @var{input} is truncated.
  For @samp{l} mode, chunks are approximately @var{input} size / @var{n}.
  The @var{input} is partitioned into @var{n} equal sized portions, with
  the last assigned any excess.  If a line @emph{starts} within a partition
-it is written completely to the corresponding file.  Since lines
+it is written completely to the corresponding file.  Since lines or records
  are not split even if they overlap a partition, the files written
  can be larger or smaller than the partition size, and even empty
-if a line is so long as to completely overlap the partition.
+if a line/record is so long as to completely overlap the partition.
  
  For @samp{r} mode, the size of @var{input} is irrelevant,
  and so can be a pipe for example.
@@ -3505,6 +3509,17 @@ than the number requested, or if a line is so long as to completely
  span a chunk.  The output file sequence numbers, always run consecutively
  even when this option is specified.
  
+@item -t @var{separator}
+@itemx --separator=@var{separator}
+@opindex -t
+@opindex --separator
+@cindex line separator character
+@cindex record separator character
+Use character @var{separator} as the record separator instead of the default
+newline character (ASCII LF).
+To specify ASCII NUL as the separator, use the two-character string @samp{\0},
+e.g., @samp{split -t '\0'}.
+
  @item -u
  @itemx --unbuffered
  @opindex -u
diff --git a/src/split.c b/src/split.c

index ef672f4fdf45200439977f6b9fe23254befc1ee1..d17616c177631a2cd0baf91984d0f56f83007c7c 100644 (file)
--- a/src/split.c
+++ b/src/split.c
@@ -16,10 +16,9 @@
  \f
  /* By tege@sics.se, with rms.
  
-   To do:
-   * Implement -t CHAR or -t REGEX to specify break characters other
-     than newline. */
-
+   TODO:
+   * support -p REGEX as in BSD's split.
+   * support --suppress-matched as in csplit.  */
  #include <config.h>
  
  #include <assert.h>
@@ -108,6 +107,9 @@ static bool elide_empty_files;
     input to output, which is much slower, so disabled by default.  */
  static bool unbuffered;
  
+/* The character marking end of line.  Defaults to \n below.  */
+static int eolchar = -1;
+
  /* The split mode to use.  */
  enum Split_type
  {
@@ -139,6 +141,7 @@ static struct option const longopts[] =
    {"numeric-suffixes", optional_argument, NULL, 'd'},
    {"filter", required_argument, NULL, FILTER_OPTION},
    {"verbose", no_argument, NULL, VERBOSE_OPTION},
+  {"separator", required_argument, NULL, 't'},
    {"-io-blksize", required_argument, NULL,
     IO_BLKSIZE_OPTION}, /* do not document */
    {GETOPT_HELP_OPTION_DECL},
@@ -216,13 +219,15 @@ is -, read standard input.\n\
    -a, --suffix-length=N   generate suffixes of length N (default %d)\n\
        --additional-suffix=SUFFIX  append an additional SUFFIX to file names\n\
    -b, --bytes=SIZE        put SIZE bytes per output file\n\
-  -C, --line-bytes=SIZE   put at most SIZE bytes of lines per output file\n\
+  -C, --line-bytes=SIZE   put at most SIZE bytes of records per output file\n\
    -d, --numeric-suffixes[=FROM]  use numeric suffixes instead of alphabetic;\n\
                                     FROM changes the start value (default 0)\n\
    -e, --elide-empty-files  do not generate empty output files with '-n'\n\
        --filter=COMMAND    write to shell COMMAND; file name is $FILE\n\
-  -l, --lines=NUMBER      put NUMBER lines per output file\n\
+  -l, --lines=NUMBER      put NUMBER lines/records per output file\n\
    -n, --number=CHUNKS     generate CHUNKS output files; see explanation below\n\
+  -t, --separator=SEP     use SEP instead of newline as the record separator;\n\
+                            '\\0' (zero) specifies the NUL character\n\
    -u, --unbuffered        immediately copy input to output with '-n r/...'\n\
  "), DEFAULT_SUFFIX_LENGTH);
        fputs (_("\
@@ -236,8 +241,8 @@ is -, read standard input.\n\
  CHUNKS may be:\n\
    N       split into N files based on size of input\n\
    K/N     output Kth of N to stdout\n\
-  l/N     split into N files without splitting lines\n\
-  l/K/N   output Kth of N to stdout without splitting lines\n\
+  l/N     split into N files without splitting lines/records\n\
+  l/K/N   output Kth of N to stdout without splitting lines/records\n\
    r/N     like 'l' but use round robin distribution\n\
    r/K/N   likewise but only output Kth of N to stdout\n\
  "), stdout);
@@ -630,10 +635,10 @@ lines_split (uintmax_t n_lines, char *buf, size_t bufsize)
          error (EXIT_FAILURE, errno, "%s", infile);
        bp = bp_out = buf;
        eob = bp + n_read;
-      *eob = '\n';
+      *eob = eolchar;
        while (true)
          {
-          bp = memchr (bp, '\n', eob - bp + 1);
+          bp = memchr (bp, eolchar, eob - bp + 1);
            if (bp == eob)
              {
                if (eob != bp_out) /* do not write 0 bytes! */
@@ -692,10 +697,10 @@ line_bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize)
                /* Have enough for split.  */
                split_rest = n_bytes - n_out - n_hold;
                eoc = sob + split_rest - 1;
-              eol = memrchr (sob, '\n', split_rest);
+              eol = memrchr (sob, eolchar, split_rest);
              }
            else
-            eol = memrchr (sob, '\n', n_left);
+            eol = memrchr (sob, eolchar, n_left);
  
            /* Output hold space if possible.  */
            if (n_hold && !(!eol && n_out))
@@ -833,7 +838,7 @@ lines_chunk_split (uintmax_t k, uintmax_t n, char *buf, size_t bufsize,
  
            /* Begin looking for '\n' at last byte of chunk.  */
            off_t skip = MIN (n_read, MAX (0, chunk_end - n_written));
-          char *bp_out = memchr (bp + skip, '\n', n_read - skip);
+          char *bp_out = memchr (bp + skip, eolchar, n_read - skip);
            if (bp_out++)
              next = true;
            else
@@ -1080,7 +1085,7 @@ lines_rr (uintmax_t k, uintmax_t n, char *buf, size_t bufsize)
            bool next = false;
  
            /* Find end of line. */
-          char *bp_out = memchr (bp, '\n', eob - bp);
+          char *bp_out = memchr (bp, eolchar, eob - bp);
            if (bp_out)
              {
                bp_out++;
@@ -1224,7 +1229,7 @@ main (int argc, char **argv)
        int this_optind = optind ? optind : 1;
        char *slash;
  
-      c = getopt_long (argc, argv, "0123456789C:a:b:del:n:u",
+      c = getopt_long (argc, argv, "0123456789C:a:b:del:n:t:u",
                         longopts, NULL);
        if (c == -1)
          break;
@@ -1303,6 +1308,36 @@ main (int argc, char **argv)
            unbuffered = true;
            break;
  
+        case 't':
+          {
+            char neweol = optarg[0];
+            if (! neweol)
+              error (EXIT_FAILURE, 0, _("empty record separator"));
+            if (optarg[1])
+              {
+                if (STREQ (optarg, "\\0"))
+                  neweol = '\0';
+                else
+                  {
+                    /* Provoke with 'split -txx'.  Complain about
+                       "multi-character tab" instead of "multibyte tab", so
+                       that the diagnostic's wording does not need to be
+                       changed once multibyte characters are supported.  */
+                    error (EXIT_FAILURE, 0, _("multi-character separator %s"),
+                           quote (optarg));
+                  }
+              }
+            /* Make it explicit we don't support multiple separators.  */
+            if (0 <= eolchar && neweol != eolchar)
+              {
+                error (EXIT_FAILURE, 0,
+                       _("multiple separator characters specified"));
+              }
+
+            eolchar = neweol;
+          }
+          break;
+
          case '0':
          case '1':
          case '2':
@@ -1398,6 +1433,9 @@ main (int argc, char **argv)
        usage (EXIT_FAILURE);
      }
  
+  if (eolchar < 0)
+    eolchar = '\n';
+
    set_suffix_length (n_units, split_type);
  
    /* Get out the filename arguments.  */
diff --git a/tests/local.mk b/tests/local.mk

index 6fc8599611c9cfaf6662c9857de0c911563dd3c7..5dcbd5595c750d135aae4694624408dd5526516e 100644 (file)
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -358,6 +358,7 @@ all_tests =                                 \
    tests/split/line-bytes.sh                    \
    tests/split/l-chunk.sh                       \
    tests/split/r-chunk.sh                       \
+  tests/split/record-sep.sh                    \
    tests/split/numeric.sh                       \
    tests/split/guard-input.sh                   \
    tests/misc/stat-birthtime.sh                 \
@@ -402,7 +403,7 @@ all_tests =                                 \
    tests/misc/xattr.sh                          \
    tests/tail-2/wait.sh                         \
    tests/tail-2/retry.sh                                \
-  tests/tail-2/symlink.sh                              \
+  tests/tail-2/symlink.sh                      \
    tests/tail-2/tail-c.sh                       \
    tests/chmod/c-option.sh                      \
    tests/chmod/equal-x.sh                       \
@@ -483,7 +484,7 @@ all_tests =                                 \
    tests/dd/ascii.sh                            \
    tests/dd/direct.sh                           \
    tests/dd/misc.sh                             \
-  tests/dd/no-allocate.sh                              \
+  tests/dd/no-allocate.sh                      \
    tests/dd/nocache.sh                          \
    tests/dd/not-rewound.sh                      \
    tests/dd/reblock.sh                          \
diff --git a/tests/split/record-sep.sh b/tests/split/record-sep.sh

new file mode 100755 (executable)

index 0000000..f41215a
--- /dev/null
+++ b/tests/split/record-sep.sh
@@ -0,0 +1,78 @@
+#!/bin/sh
+# test split with custom record separators
+
+# Copyright (C) 2015 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ split
+
+NL='
+'
+
+for sep in "$NL" '\0' ':'; do
+
+  test "$sep" = "$NL" && tr='\n' || tr="$sep"
+
+  for mode in '--lines=2' '--line-bytes=4' '--number=l/3' '--number=r/3'; do
+
+    # Generate in default mode for comparison
+    printf '1\n2\n3\n4\n5\n' > in || framework_failure_
+    split $mode in || fail=1
+    tr '\n' "$tr" < xaa > exp1
+    tr '\n' "$tr" < xab > exp2
+    tr '\n' "$tr" < xac > exp3
+
+    rm -f x??
+
+    # Generate output with specified --separator
+    printf '1\n2\n3\n4\n5\n' | tr '\n' "$tr" > in || framework_failure_
+    split $mode -t "$sep" in || fail=1
+
+    compare exp1 xaa || fail=1
+    compare exp2 xab || fail=1
+    compare exp3 xac || fail=1
+    test -f xad && fail=1
+  done
+
+done
+
+
+#
+# Test usage edge cases
+#
+
+# Should fail: '-t' requires an argument
+{ split -t </dev/null >/dev/null 2>/dev/null || test $? -ne 1; } &&
+  { warn_ "-t without argument did not trigger an error" ; fail=1 ; }
+
+# should fail: multi-character separator
+{ split -txx </dev/null >/dev/null 2>&1 || test $? -ne 1; } &&
+  { warn_ "-txx did not trigger an error" ; fail=1 ; }
+
+# should fail: different separators used
+{ split -ta -tb </dev/null >/dev/null 2>&1 || test $? -ne 1; } &&
+  { warn_ "-ta -tb did not trigger an error" ; fail=1 ; }
+
+# should fail: different separators used, including default
+{ split -t"$NL" -tb </dev/null >/dev/null 2>&1 || test $? -ne 1; } &&
+  { warn_ "-t\$NL -tb did not trigger an error" ; fail=1 ; }
+
+# should not fail: same separator used multiple times
+split -t: -t: </dev/null >/dev/null 2>&1 ||
+  { warn_ "-t: -t: triggered an error" ; fail=1 ; }
+
+
+Exit $fail
author	Assaf Gordon <assafgordon@gmail.com>
	Wed, 7 Jan 2015 23:30:28 +0000 (18:30 -0500)
committer	Pádraig Brady <P@draigBrady.com>
	Mon, 19 Jan 2015 23:22:37 +0000 (23:22 +0000)
NEWS		patch \| blob \| blame \| history
doc/coreutils.texi		patch \| blob \| blame \| history
src/split.c		patch \| blob \| blame \| history
tests/local.mk		patch \| blob \| blame \| history
tests/split/record-sep.sh	[new file with mode: 0755]	patch \| blob