join,sort,uniq: with -z, treat '\n' as a field separator

author Pádraig Brady <P@draigBrady.com>

Tue, 12 Jan 2016 16:29:32 +0000 (16:29 +0000)

committer Pádraig Brady <P@draigBrady.com>

Wed, 13 Jan 2016 10:59:18 +0000 (10:59 +0000)
author Pádraig Brady <P@draigBrady.com>
Tue, 12 Jan 2016 16:29:32 +0000 (16:29 +0000)
committer Pádraig Brady <P@draigBrady.com>
Wed, 13 Jan 2016 10:59:18 +0000 (10:59 +0000)
diff --git a/NEWS b/NEWS

index 6e48a5365397a1772c643d846c2659337d53fe87..192d8fa9e56d8a03bc4fe32855415529f195166a 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -64,6 +64,8 @@ GNU coreutils NEWS                                    -*- outline -*-
    ls now quotes file names unambiguously and appropriate for use in a shell,
    when outputting to a terminal.
  
+  join, sort, and uniq now treat '\n' as a field delimiter with --zero-terminated.
+
  ** Improvements
  
    All utilities now quote user supplied arguments in error strings,
diff --git a/doc/coreutils.texi b/doc/coreutils.texi

index 25380628d7e35c2b9c4001cc466725f86800d33f..80e9a032dadf55c3038b4afc9d48104e5c1014be 100644 (file)
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -4493,6 +4493,9 @@ numeric string when checking for uniqueness, whereas @code{sort -n |
  uniq} inspects the entire line.  @xref{uniq invocation}.
  
  @optZeroTerminated
+@macro newlineFieldSeparator
+Note that with @option{-z}, the newline character is treated as a field separator.
+@end macro
  
  @end table
  
@@ -5034,6 +5037,7 @@ fields and characters).  By default the entire rest of the lines are
  compared.
  
  @optZeroTerminated
+@newlineFieldSeparator
  
  @end table
  
@@ -6157,6 +6161,7 @@ Print a line for each unpairable line in file @var{file-number}
  (either @samp{1} or @samp{2}), instead of the normal output.
  
  @optZeroTerminated
+@newlineFieldSeparator
  
  @end table
  
diff --git a/src/join.c b/src/join.c

index 8686428fb4c6c188c32720d1cee351aa41eb1d83..9b25da66765bf4f195e4b77e4fc0866e641d7e73 100644 (file)
--- a/src/join.c
+++ b/src/join.c
@@ -194,7 +194,7 @@ Usage: %s [OPTION]... FILE1 FILE2\n\
                program_name);
        fputs (_("\
  For each pair of input lines with identical join fields, write a line to\n\
-standard output.  The default join field is the first, delimited by whitespace.\
+standard output.  The default join field is the first, delimited by blanks.\
  \n\
  "), stdout);
        fputs (_("\
@@ -284,19 +284,19 @@ xfields (struct line *line)
    else if (tab < 0)
      {
        /* Skip leading blanks before the first field.  */
-      while (isblank (to_uchar (*ptr)))
+      while (field_sep (*ptr))
          if (++ptr == lim)
            return;
  
        do
          {
            char *sep;
-          for (sep = ptr + 1; sep != lim && ! isblank (to_uchar (*sep)); sep++)
+          for (sep = ptr + 1; sep != lim && ! field_sep (*sep); sep++)
              continue;
            extract_field (line, ptr, sep - ptr);
            if (sep == lim)
              return;
-          for (ptr = sep + 1; ptr != lim && isblank (to_uchar (*ptr)); ptr++)
+          for (ptr = sep + 1; ptr != lim && field_sep (*ptr); ptr++)
              continue;
          }
        while (ptr != lim);
diff --git a/src/sort.c b/src/sort.c

index aca3b423119678396e7126357283a185539a4713..575877d22ff05bc84203fbe4258eb875f6b6aa21 100644 (file)
--- a/src/sort.c
+++ b/src/sort.c
@@ -1275,9 +1275,9 @@ inittables (void)
  
    for (i = 0; i < UCHAR_LIM; ++i)
      {
-      blanks[i] = !! isblank (i);
+      blanks[i] = field_sep (i);
        nonprinting[i] = ! isprint (i);
-      nondictionary[i] = ! isalnum (i) && ! isblank (i);
+      nondictionary[i] = ! isalnum (i) && ! field_sep (i);
        fold_toupper[i] = toupper (i);
      }
  
diff --git a/src/system.h b/src/system.h

index c1c4a18a329aabd614ca89b6d9b577e877cda252..9898bc79c483cf1c3f300fac47ad96b9211fe01a 100644 (file)
--- a/src/system.h
+++ b/src/system.h
@@ -155,6 +155,13 @@ enum
     errors that the cast doesn't.  */
  static inline unsigned char to_uchar (char ch) { return ch; }
  
+/* '\n' is considered a field separator with --zero-terminated.  */
+static inline bool
+field_sep (unsigned char ch)
+{
+  return isblank (ch) || ch == '\n';
+}
+
  #include <locale.h>
  
  /* Take care of NLS matters.  */
diff --git a/src/uniq.c b/src/uniq.c

index 6f8cd4a7050919d90ce5dc51ad0e2e168f224b52..0e118da9d1cf902129ad3d3a197c4f749b946dad 100644 (file)
--- a/src/uniq.c
+++ b/src/uniq.c
@@ -261,9 +261,9 @@ find_field (struct linebuffer const *line)
  
    for (count = 0; count < skip_fields && i < size; count++)
      {
-      while (i < size && isblank (to_uchar (lp[i])))
+      while (i < size && field_sep (lp[i]))
          i++;
-      while (i < size && !isblank (to_uchar (lp[i])))
+      while (i < size && !field_sep (lp[i]))
          i++;
      }
  
diff --git a/tests/misc/join.pl b/tests/misc/join.pl

index 2a40f00953197615db910ab6fb3503428102ccd4..4d399d8aea18490c6a25d67337f745689b766045 100755 (executable)
--- a/tests/misc/join.pl
+++ b/tests/misc/join.pl
@@ -290,10 +290,13 @@ my @tv = (
  # missing last NUL at the end of the last line (=end of file)
  ['z4', '-z',
   ["a\0c\0e", "a\0b\0c"], "a\0c\0", 0],
-# edge-case: the embedded newlines should treated as
-# part of the nul-terminated line
+# With -z, embedded newlines are treated as field separators.
+# Note that '\n' characters are converted to ' ' in this case.
  ['z5', '-z -a1 -a2',
- ["a\n1\0c 3\0","b\n8\0c 9\0"], "a\n1\0b\n8\0c 3 9\0"],
+ ["a\n\n1\0c 3\0", "a 2\0b\n8\0c 9\0"], "a 1 2\0b 8\0c 3 9\0"],
+# One can avoid field processing like:
+['z6', '-z -t ""',
+ ["a\n1\n\0", "a\n1\n\0"], "a\n1\n\0"],
  
  );
  
diff --git a/tests/misc/sort.pl b/tests/misc/sort.pl

index c9bcce1945437cba9e5f00c40590cc734a71736e..c3e7f8e48957791f451f14172003aaeed8d80fa7 100755 (executable)
--- a/tests/misc/sort.pl
+++ b/tests/misc/sort.pl
@@ -406,6 +406,11 @@ my @Tests =
  ["output-is-input-3", '-m -o f', {OUT=>''},
   {IN=> {g=> "a\n"}}, {IN=> {h=> "b\n"}}, {IN=> {f=> "c\n"}},
   {CMP=> ["a\nb\nc\n", {'f'=> undef}]} ],
+
+# --zero-terminated
+['zero-1', '-z', {IN=>"2\0001\000"}, {OUT=>"1\0002\000"}],
+['zero-2', '-z -k2,2', {IN=>"1\n2\0002\n1\000"}, {OUT=>"2\n1\0001\n2\000"}],
+['zero-3', '-zb -k2,2', {IN=>"1\n\n2\0002\n1\0"}, {OUT=>"2\n1\0001\n\n2\0"}],
  );
  
  # Add _POSIX2_VERSION=199209 to the environment of each test
diff --git a/tests/misc/uniq.pl b/tests/misc/uniq.pl

index 2bc06b9d695e477e154d7f8c20749fdc3cd46345..f028036bed16935b30b623b2da19fff0570d7153 100755 (executable)
--- a/tests/misc/uniq.pl
+++ b/tests/misc/uniq.pl
@@ -95,6 +95,7 @@ my @Tests =
   ['3z', '-z', {IN=>"a\na"}, {OUT=>"a\na\0"}],
   ['4z', '-z', {IN=>"a\nb"}, {OUT=>"a\nb\0"}],
   ['5z', '-z', {IN=>"a\na\nb"}, {OUT=>"a\na\nb\0"}],
+ ['10z', '-z -f1', {IN=>"a\nb\n\0c\nb\n\0"}, {OUT=>"a\nb\n\0"}],
   ['20z', '-dz', {IN=>"a\na\n"}, {OUT=>""}],
  
   # Make sure that eight bit characters work
author	Pádraig Brady <P@draigBrady.com>
	Tue, 12 Jan 2016 16:29:32 +0000 (16:29 +0000)
committer	Pádraig Brady <P@draigBrady.com>
	Wed, 13 Jan 2016 10:59:18 +0000 (10:59 +0000)
NEWS		patch \| blob \| blame \| history
doc/coreutils.texi		patch \| blob \| blame \| history
src/join.c		patch \| blob \| blame \| history
src/sort.c		patch \| blob \| blame \| history
src/system.h		patch \| blob \| blame \| history
src/uniq.c		patch \| blob \| blame \| history
tests/misc/join.pl		patch \| blob \| blame \| history
tests/misc/sort.pl		patch \| blob \| blame \| history
tests/misc/uniq.pl		patch \| blob \| blame \| history