* NEWS: Mention the change in behavior.
* doc/coreutils.texi (newlineFieldSeparator): A new description,
referenced from ({join,sort,uniq} invocation).
* src/system.h (field_sep): A new inline function to determine
if a character is a field separator.
* src/join.c (usage): s/whitespace/blank/ to be more accurate
wrt which characters are field separators.
(xfields): s/isblank/field_sep/.
* src/sort.c (inittables): Likewise.
* src/uniq.c (find_field): Likewise.
* tests/misc/join.pl: Adjust -z test, and add a test/example
for processing the whole record without field processing.
* tests/misc/sort.pl: Add -z test cases, including case with '\n'.
* tests/misc/uniq.pl: Add -z -f test case with \n.
ls now quotes file names unambiguously and appropriate for use in a shell,
when outputting to a terminal.
+ join, sort, uniq now treat '\n' as a field delimiter with --zero-terminated.
+
** Improvements
All utilities now quote user supplied arguments in error strings,
uniq} inspects the entire line. @xref{uniq invocation}.
@optZeroTerminated
+@macro newlineFieldSeparator
+Note that with @option{-z}, the newline character is treated as a field separator.
+@end macro
@end table
compared.
@optZeroTerminated
+@newlineFieldSeparator
@end table
(either @samp{1} or @samp{2}), instead of the normal output.
@optZeroTerminated
+@newlineFieldSeparator
@end table
program_name);
fputs (_("\
For each pair of input lines with identical join fields, write a line to\n\
-standard output. The default join field is the first, delimited by whitespace.\
+standard output. The default join field is the first, delimited by blanks.\
\n\
"), stdout);
fputs (_("\
else if (tab < 0)
{
/* Skip leading blanks before the first field. */
- while (isblank (to_uchar (*ptr)))
+ while (field_sep (*ptr))
if (++ptr == lim)
return;
do
{
char *sep;
- for (sep = ptr + 1; sep != lim && ! isblank (to_uchar (*sep)); sep++)
+ for (sep = ptr + 1; sep != lim && ! field_sep (*sep); sep++)
continue;
extract_field (line, ptr, sep - ptr);
if (sep == lim)
return;
- for (ptr = sep + 1; ptr != lim && isblank (to_uchar (*ptr)); ptr++)
+ for (ptr = sep + 1; ptr != lim && field_sep (*ptr); ptr++)
continue;
}
while (ptr != lim);
for (i = 0; i < UCHAR_LIM; ++i)
{
- blanks[i] = !! isblank (i);
+ blanks[i] = field_sep (i);
nonprinting[i] = ! isprint (i);
- nondictionary[i] = ! isalnum (i) && ! isblank (i);
+ nondictionary[i] = ! isalnum (i) && ! field_sep (i);
fold_toupper[i] = toupper (i);
}
errors that the cast doesn't. */
static inline unsigned char to_uchar (char ch) { return ch; }
+/* Return true if CH separates fields: either a blank, as reported by
+   isblank, or a newline.  '\n' is considered a field separator with
+   --zero-terminated.  CH is declared unsigned char, so callers may pass
+   a plain char and have it converted before it reaches isblank.  */
+static inline bool
+field_sep (unsigned char ch)
+{
+ return isblank (ch) || ch == '\n';
+}
+
#include <locale.h>
/* Take care of NLS matters. */
for (count = 0; count < skip_fields && i < size; count++)
{
- while (i < size && isblank (to_uchar (lp[i])))
+ while (i < size && field_sep (lp[i]))
i++;
- while (i < size && !isblank (to_uchar (lp[i])))
+ while (i < size && !field_sep (lp[i]))
i++;
}
# missing last NUL at the end of the last line (=end of file)
['z4', '-z',
["a\0c\0e", "a\0b\0c"], "a\0c\0", 0],
-# edge-case: the embedded newlines should treated as
-# part of the nul-terminated line
+# With -z, embedded newlines are treated as field separators.
+# Note that each '\n' is converted to ' ' in the output in this case.
['z5', '-z -a1 -a2',
- ["a\n1\0c 3\0","b\n8\0c 9\0"], "a\n1\0b\n8\0c 3 9\0"],
+ ["a\n\n1\0c 3\0", "a 2\0b\n8\0c 9\0"], "a 1 2\0b 8\0c 3 9\0"],
+# One can avoid field processing like:
+['z6', '-z -t ""',
+ ["a\n1\n\0", "a\n1\n\0"], "a\n1\n\0"],
);
["output-is-input-3", '-m -o f', {OUT=>''},
{IN=> {g=> "a\n"}}, {IN=> {h=> "b\n"}}, {IN=> {f=> "c\n"}},
{CMP=> ["a\nb\nc\n", {'f'=> undef}]} ],
+
+# --zero-terminated
+['zero-1', '-z', {IN=>"2\0001\000"}, {OUT=>"1\0002\000"}],
+['zero-2', '-z -k2,2', {IN=>"1\n2\0002\n1\000"}, {OUT=>"2\n1\0001\n2\000"}],
+['zero-3', '-zb -k2,2', {IN=>"1\n\n2\0002\n1\0"}, {OUT=>"2\n1\0001\n\n2\0"}],
);
# Add _POSIX2_VERSION=199209 to the environment of each test
['3z', '-z', {IN=>"a\na"}, {OUT=>"a\na\0"}],
['4z', '-z', {IN=>"a\nb"}, {OUT=>"a\nb\0"}],
['5z', '-z', {IN=>"a\na\nb"}, {OUT=>"a\na\nb\0"}],
+ ['10z', '-z -f1', {IN=>"a\nb\n\0c\nb\n\0"}, {OUT=>"a\nb\n\0"}],
['20z', '-dz', {IN=>"a\na\n"}, {OUT=>""}],
# Make sure that eight bit characters work