@opindex -l
@opindex --lines
Put @var{lines} lines of @var{input} into each output file.
+If @option{--separator} is specified, then @var{lines} determines
+the number of records.
For compatibility @command{split} also supports an obsolete
option syntax @option{-@var{lines}}. New scripts should use
@opindex -C
@opindex --line-bytes
Put into each output file as many complete lines of @var{input} as
-possible without exceeding @var{size} bytes. Individual lines longer than
-@var{size} bytes are broken into multiple files.
+possible without exceeding @var{size} bytes. Individual lines or records
+longer than @var{size} bytes are broken into multiple files.
@var{size} has the same format as for the @option{--bytes} option.
+If @option{--separator} is specified, then @var{size} limits the number
+of bytes of complete records put into each output file.
@item --filter=@var{command}
@opindex --filter
@example
@var{n} generate @var{n} files based on current size of @var{input}
@var{k}/@var{n} only output @var{k}th of @var{n} to stdout
-l/@var{n} generate @var{n} files without splitting lines
+l/@var{n} generate @var{n} files without splitting lines or records
l/@var{k}/@var{n} likewise but only output @var{k}th of @var{n} to stdout
r/@var{n} like @samp{l} but use round robin distribution
r/@var{k}/@var{n} likewise but only output @var{k}th of @var{n} to stdout
For @samp{l} mode, chunks are approximately @var{input} size / @var{n}.
The @var{input} is partitioned into @var{n} equal sized portions, with
the last assigned any excess. If a line @emph{starts} within a partition
-it is written completely to the corresponding file. Since lines
+it is written completely to the corresponding file. Since lines or records
are not split even if they overlap a partition, the files written
can be larger or smaller than the partition size, and even empty
-if a line is so long as to completely overlap the partition.
+if a line/record is so long as to completely overlap the partition.
For @samp{r} mode, the size of @var{input} is irrelevant,
and so can be a pipe for example.
span a chunk. The output file sequence numbers always run consecutively
even when this option is specified.
+@item -t @var{separator}
+@itemx --separator=@var{separator}
+@opindex -t
+@opindex --separator
+@cindex line separator character
+@cindex record separator character
+Use character @var{separator} as the record separator instead of the default
+newline character (ASCII LF).
+To specify ASCII NUL as the separator, use the two-character string @samp{\0},
+e.g., @samp{split -t '\0'}.
+
@item -u
@itemx --unbuffered
@opindex -u
\f
/* By tege@sics.se, with rms.
- To do:
- * Implement -t CHAR or -t REGEX to specify break characters other
- than newline. */
-
+ TODO:
+ * support -p REGEX as in BSD's split.
+ * support --suppress-matched as in csplit. */
#include <config.h>
#include <assert.h>
input to output, which is much slower, so disabled by default. */
static bool unbuffered;
+/* The character marking end of line. Defaults to \n below. */
+static int eolchar = -1;
+
/* The split mode to use. */
enum Split_type
{
{"numeric-suffixes", optional_argument, NULL, 'd'},
{"filter", required_argument, NULL, FILTER_OPTION},
{"verbose", no_argument, NULL, VERBOSE_OPTION},
+ {"separator", required_argument, NULL, 't'},
{"-io-blksize", required_argument, NULL,
IO_BLKSIZE_OPTION}, /* do not document */
{GETOPT_HELP_OPTION_DECL},
-a, --suffix-length=N generate suffixes of length N (default %d)\n\
--additional-suffix=SUFFIX append an additional SUFFIX to file names\n\
-b, --bytes=SIZE put SIZE bytes per output file\n\
- -C, --line-bytes=SIZE put at most SIZE bytes of lines per output file\n\
+ -C, --line-bytes=SIZE put at most SIZE bytes of records per output file\n\
-d, --numeric-suffixes[=FROM] use numeric suffixes instead of alphabetic;\n\
FROM changes the start value (default 0)\n\
-e, --elide-empty-files do not generate empty output files with '-n'\n\
--filter=COMMAND write to shell COMMAND; file name is $FILE\n\
- -l, --lines=NUMBER put NUMBER lines per output file\n\
+ -l, --lines=NUMBER put NUMBER lines/records per output file\n\
-n, --number=CHUNKS generate CHUNKS output files; see explanation below\n\
+ -t, --separator=SEP use SEP instead of newline as the record separator;\n\
+ '\\0' (zero) specifies the NUL character\n\
-u, --unbuffered immediately copy input to output with '-n r/...'\n\
"), DEFAULT_SUFFIX_LENGTH);
fputs (_("\
CHUNKS may be:\n\
N split into N files based on size of input\n\
K/N output Kth of N to stdout\n\
- l/N split into N files without splitting lines\n\
- l/K/N output Kth of N to stdout without splitting lines\n\
+ l/N split into N files without splitting lines/records\n\
+ l/K/N output Kth of N to stdout without splitting lines/records\n\
r/N like 'l' but use round robin distribution\n\
r/K/N likewise but only output Kth of N to stdout\n\
"), stdout);
error (EXIT_FAILURE, errno, "%s", infile);
bp = bp_out = buf;
eob = bp + n_read;
- *eob = '\n';
+ *eob = eolchar;
while (true)
{
- bp = memchr (bp, '\n', eob - bp + 1);
+ bp = memchr (bp, eolchar, eob - bp + 1);
if (bp == eob)
{
if (eob != bp_out) /* do not write 0 bytes! */
/* Have enough for split. */
split_rest = n_bytes - n_out - n_hold;
eoc = sob + split_rest - 1;
- eol = memrchr (sob, '\n', split_rest);
+ eol = memrchr (sob, eolchar, split_rest);
}
else
- eol = memrchr (sob, '\n', n_left);
+ eol = memrchr (sob, eolchar, n_left);
/* Output hold space if possible. */
if (n_hold && !(!eol && n_out))
/* Begin looking for the record separator at last byte of chunk. */
off_t skip = MIN (n_read, MAX (0, chunk_end - n_written));
- char *bp_out = memchr (bp + skip, '\n', n_read - skip);
+ char *bp_out = memchr (bp + skip, eolchar, n_read - skip);
if (bp_out++)
next = true;
else
bool next = false;
/* Find end of line. */
- char *bp_out = memchr (bp, '\n', eob - bp);
+ char *bp_out = memchr (bp, eolchar, eob - bp);
if (bp_out)
{
bp_out++;
int this_optind = optind ? optind : 1;
char *slash;
- c = getopt_long (argc, argv, "0123456789C:a:b:del:n:u",
+ c = getopt_long (argc, argv, "0123456789C:a:b:del:n:t:u",
longopts, NULL);
if (c == -1)
break;
unbuffered = true;
break;
+        case 't':
+          {
+            /* Parse the -t/--separator argument: exactly one byte, or the
+               two-character string "\0" meaning ASCII NUL.
+               Use unsigned char so that a separator byte >= 0x80 is not
+               sign-extended to a negative int; a negative eolchar means
+               "not yet specified" and would later be replaced by '\n'.  */
+            unsigned char neweol = optarg[0];
+            if (! neweol)
+              error (EXIT_FAILURE, 0, _("empty record separator"));
+            if (optarg[1])
+              {
+                if (STREQ (optarg, "\\0"))
+                  neweol = '\0';
+                else
+                  {
+                    /* Provoke with 'split -txx'.  Complain about
+                       "multi-character separator" instead of "multibyte
+                       separator", so that the diagnostic's wording does
+                       not need to be changed once multibyte characters
+                       are supported.  */
+                    error (EXIT_FAILURE, 0, _("multi-character separator %s"),
+                           quote (optarg));
+                  }
+              }
+            /* Make it explicit we don't support multiple separators.
+               Repeating the same separator is accepted.  */
+            if (0 <= eolchar && neweol != eolchar)
+              {
+                error (EXIT_FAILURE, 0,
+                       _("multiple separator characters specified"));
+              }
+
+            eolchar = neweol;
+          }
+          break;
+
case '0':
case '1':
case '2':
usage (EXIT_FAILURE);
}
+ if (eolchar < 0)
+ eolchar = '\n';
+
set_suffix_length (n_units, split_type);
/* Get out the filename arguments. */
--- /dev/null
+#!/bin/sh
+# test split with custom record separators
+
+# Copyright (C) 2015 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ split
+
+NL='
+'
+
+for sep in "$NL" '\0' ':'; do
+
+ test "$sep" = "$NL" && tr='\n' || tr="$sep"
+
+ for mode in '--lines=2' '--line-bytes=4' '--number=l/3' '--number=r/3'; do
+
+ # Generate in default mode for comparison
+ printf '1\n2\n3\n4\n5\n' > in || framework_failure_
+ split $mode in || fail=1
+ tr '\n' "$tr" < xaa > exp1
+ tr '\n' "$tr" < xab > exp2
+ tr '\n' "$tr" < xac > exp3
+
+ rm -f x??
+
+ # Generate output with specified --separator
+ printf '1\n2\n3\n4\n5\n' | tr '\n' "$tr" > in || framework_failure_
+ split $mode -t "$sep" in || fail=1
+
+ compare exp1 xaa || fail=1
+ compare exp2 xab || fail=1
+ compare exp3 xac || fail=1
+ test -f xad && fail=1
+ done
+
+done
+
+
+#
+# Test usage edge cases
+#
+
+# Should fail: '-t' requires an argument
+{ split -t </dev/null >/dev/null 2>/dev/null || test $? -ne 1; } &&
+ { warn_ "-t without argument did not trigger an error" ; fail=1 ; }
+
+# should fail: multi-character separator
+{ split -txx </dev/null >/dev/null 2>&1 || test $? -ne 1; } &&
+ { warn_ "-txx did not trigger an error" ; fail=1 ; }
+
+# should fail: different separators used
+{ split -ta -tb </dev/null >/dev/null 2>&1 || test $? -ne 1; } &&
+ { warn_ "-ta -tb did not trigger an error" ; fail=1 ; }
+
+# should fail: different separators used, including default
+{ split -t"$NL" -tb </dev/null >/dev/null 2>&1 || test $? -ne 1; } &&
+ { warn_ "-t\$NL -tb did not trigger an error" ; fail=1 ; }
+
+# should not fail: same separator used multiple times
+split -t: -t: </dev/null >/dev/null 2>&1 ||
+ { warn_ "-t: -t: triggered an error" ; fail=1 ; }
+
+
+Exit $fail