git.ipfire.org Git - thirdparty/coreutils.git/commitdiff
cut: implement -w,--whitespace-delimited
author Pádraig Brady <P@draigBrady.com>
Wed, 11 Mar 2026 22:42:45 +0000 (22:42 +0000)
committer Pádraig Brady <P@draigBrady.com>
Sun, 5 Apr 2026 12:15:56 +0000 (13:15 +0100)
* src/cut.c (cut_fields_ws): A new function handling both
uni-byte and multi-byte cases.
* tests/cut/cut.pl: Add test cases.

src/cut.c
tests/cut/cut.pl

index 82e9065b692d68d127d15eef7e38e1e065bf52e4..65fc664277ef0a3b60bd2b6effac1802f01b1c5d 100644 (file)
--- a/src/cut.c
+++ b/src/cut.c
@@ -109,6 +109,9 @@ static char output_delimiter_default[MB_LEN_MAX];
 /* True if we have ever read standard input.  */
 static bool have_read_stdin;
 
+/* If true, interpret each run of whitespace as one field delimiter.  */
+static bool whitespace_delimited;
+
 /* Whether to cut bytes, characters, or fields.  */
 static enum
 {
@@ -132,6 +135,7 @@ static struct option const longopts[] =
   {"characters", required_argument, NULL, 'c'},
   {"fields", required_argument, NULL, 'f'},
   {"delimiter", required_argument, NULL, 'd'},
+  {"whitespace-delimited", no_argument, NULL, 'w'},
   {"only-delimited", no_argument, NULL, 's'},
   {"output-delimiter", required_argument, NULL, OUTPUT_DELIMITER_OPTION},
   {"complement", no_argument, NULL, COMPLEMENT_OPTION},
@@ -192,6 +196,10 @@ Print selected parts of lines from each FILE to standard output.\n\
       --output-delimiter=STRING\n\
          use STRING as the output delimiter;\n\
          the default is to use the input delimiter\n\
+"));
+      oputs (_("\
+  -w, --whitespace-delimited\n\
+         use runs of whitespace as the field delimiter\n\
 "));
       oputs (_("\
   -z, --zero-terminated\n\
@@ -259,6 +267,43 @@ field_delim_eq (mcel_t g)
   return delim_mcel.err ? g.err == delim_mcel.err : mcel_eq (g, delim_mcel);
 }
 
+enum field_terminator
+{
+  FIELD_DELIMITER,              /* Field ended at a field delimiter.  */
+  FIELD_LINE_DELIMITER,         /* Field ended at the line delimiter.  */
+  FIELD_EOF                     /* Field ended at end of input.  */
+};
+
+static inline mcel_t
+mbbuf_get_saved_char (mbbuf_t *mbbuf, bool *have_saved, mcel_t *saved_g)
+{
+  if (*have_saved)
+    {
+      *have_saved = false;      /* The saved character is returned once.  */
+      return *saved_g;
+    }
+  return mbbuf_get_char (mbbuf);  /* Nothing saved: read from MBBUF.  */
+}
+
+static inline enum field_terminator
+skip_whitespace_delim (mbbuf_t *mbuf, bool *have_saved, mcel_t *saved_g,
+                       bool *have_pending_line)
+{
+  mcel_t g;
+
+  do                            /* Consume the rest of the separator run.  */
+    {
+      g = mbbuf_get_char (mbuf);
+      if (g.ch != MBBUF_EOF)
+        *have_pending_line = true;
+    }
+  while (g.ch != MBBUF_EOF && g.ch != line_delim && c32issep (g.ch));
+
+  *saved_g = g;                 /* Save the character that ended the run.  */
+  *have_saved = true;
+  return FIELD_DELIMITER;       /* Always: the whole run is one delimiter.  */
+}
+
 static void
 write_bytes (char const *buf, size_t n_bytes)
 {
@@ -392,13 +437,6 @@ cut_characters (FILE *stream)
 static void
 cut_fields_mb (FILE *stream)
 {
-  enum field_terminator
-  {
-    FIELD_DELIMITER,
-    FIELD_LINE_DELIMITER,
-    FIELD_EOF
-  };
-
   static char line_in[IO_BUFSIZE];
   mbbuf_t mbbuf;
   uintmax_t field_idx = 1;
@@ -538,6 +576,197 @@ cut_fields_mb (FILE *stream)
     }
 }
 
+/* Read from STREAM, printing to standard output any selected fields,
+   using runs of whitespace as the field delimiter.  */
+
+static void
+cut_fields_ws (FILE *stream)
+{
+  static char line_in[IO_BUFSIZE];
+  mbbuf_t mbbuf;
+  uintmax_t field_idx = 1;              /* Field currently being read.  */
+  bool found_any_selected_field = false;  /* Reset at each line end.  */
+  bool buffer_first_field;
+  bool have_pending_line = false;       /* Input read since last line end.  */
+  bool have_saved = false;              /* SAVED_G holds a lookahead char.  */
+  mcel_t saved_g = { .ch = MBBUF_EOF };
+
+  current_rp = frp;
+  mbbuf_init (&mbbuf, line_in, sizeof line_in, stream);
+
+  buffer_first_field = (suppress_non_delimited ^ !print_kth (1));
+
+  while (true)
+    {
+      if (field_idx == 1 && buffer_first_field)
+        {
+          size_t n_bytes = 0;           /* Bytes stored in field_1_buffer.  */
+          enum field_terminator terminator;
+
+          while (true)
+            {
+              mcel_t g = mbbuf_get_saved_char (&mbbuf, &have_saved, &saved_g);
+
+              if (g.ch == MBBUF_EOF)
+                {
+                  if (n_bytes == 0)
+                    return;             /* Nothing buffered at EOF.  */
+                  terminator = FIELD_EOF;
+                  break;
+                }
+
+              have_pending_line = true;
+
+              if (g.ch == line_delim)
+                {
+                  if (field_1_bufsize - n_bytes < g.len)
+                    field_1_buffer = xpalloc (field_1_buffer, &field_1_bufsize,
+                                              g.len, -1,
+                                              sizeof *field_1_buffer);
+                  memcpy (field_1_buffer + n_bytes,
+                          mbbuf_char_offset (&mbbuf, g), g.len);
+                  n_bytes += g.len;     /* Line delimiter is buffered too.  */
+                  terminator = FIELD_LINE_DELIMITER;
+                  break;
+                }
+
+              if (c32issep (g.ch))
+                {
+                  terminator = skip_whitespace_delim (&mbbuf, &have_saved,
+                                                      &saved_g,
+                                                      &have_pending_line);
+                  break;
+                }
+
+              if (field_1_bufsize - n_bytes < g.len)
+                field_1_buffer = xpalloc (field_1_buffer, &field_1_bufsize,
+                                          g.len, -1,
+                                          sizeof *field_1_buffer);
+              memcpy (field_1_buffer + n_bytes, mbbuf_char_offset (&mbbuf, g),
+                      g.len);
+              n_bytes += g.len;
+            }
+
+          if (terminator != FIELD_DELIMITER)  /* Line had no delimiter.  */
+            {
+              if (!suppress_non_delimited)
+                {
+                  write_bytes (field_1_buffer, n_bytes);
+                  if (terminator == FIELD_EOF)  /* None was buffered.  */
+                    {
+                      if (putchar (line_delim) < 0)
+                        write_error ();
+                    }
+                }
+
+              if (terminator == FIELD_EOF)
+                break;
+
+              field_idx = 1;            /* Reset state for the next line.  */
+              current_rp = frp;
+              found_any_selected_field = false;
+              have_pending_line = false;
+              continue;
+            }
+
+          if (print_kth (1))
+            {
+              write_bytes (field_1_buffer, n_bytes);
+              found_any_selected_field = true;
+            }
+          next_item (&field_idx);
+        }
+
+      enum field_terminator terminator;
+
+      if (print_kth (field_idx))        /* Selected: copy field to output.  */
+        {
+          if (found_any_selected_field)  /* Separate from previous output.  */
+            write_bytes (output_delimiter_string, output_delimiter_length);
+          found_any_selected_field = true;
+
+          while (true)
+            {
+              mcel_t g = mbbuf_get_saved_char (&mbbuf, &have_saved, &saved_g);
+
+              if (g.ch == MBBUF_EOF)
+                {
+                  terminator = FIELD_EOF;
+                  break;
+                }
+
+              have_pending_line = true;
+
+              if (g.ch == line_delim)
+                {
+                  terminator = FIELD_LINE_DELIMITER;
+                  break;
+                }
+
+              if (c32issep (g.ch))
+                {
+                  terminator = skip_whitespace_delim (&mbbuf, &have_saved,
+                                                      &saved_g,
+                                                      &have_pending_line);
+                  break;
+                }
+
+              write_bytes (mbbuf_char_offset (&mbbuf, g), g.len);
+            }
+        }
+      else                              /* Not selected: read and discard.  */
+        {
+          while (true)
+            {
+              mcel_t g = mbbuf_get_saved_char (&mbbuf, &have_saved, &saved_g);
+
+              if (g.ch == MBBUF_EOF)
+                {
+                  terminator = FIELD_EOF;
+                  break;
+                }
+
+              have_pending_line = true;
+
+              if (g.ch == line_delim)
+                {
+                  terminator = FIELD_LINE_DELIMITER;
+                  break;
+                }
+
+              if (c32issep (g.ch))
+                {
+                  terminator = skip_whitespace_delim (&mbbuf, &have_saved,
+                                                      &saved_g,
+                                                      &have_pending_line);
+                  break;
+                }
+            }
+        }
+
+      if (terminator == FIELD_DELIMITER)
+        next_item (&field_idx);
+      else
+        {
+          if (terminator == FIELD_EOF && !have_pending_line)
+            break;                      /* EOF with no unfinished line.  */
+          if (found_any_selected_field
+              || !(suppress_non_delimited && field_idx == 1))
+            {
+              if (putchar (line_delim) < 0)
+                write_error ();
+            }
+          if (terminator == FIELD_EOF)
+            break;
+
+          field_idx = 1;                /* Reset state for the next line.  */
+          current_rp = frp;
+          found_any_selected_field = false;
+          have_pending_line = false;
+        }
+    }
+}
+
 /* Read from stream STREAM, printing to standard output any selected fields.  */
 
 static void
@@ -749,7 +978,7 @@ main (int argc, char **argv)
 
   atexit (close_stdout);
 
-  while ((optc = getopt_long (argc, argv, "b:c:d:f:nsz", longopts, NULL))
+  while ((optc = getopt_long (argc, argv, "b:c:d:f:nszw", longopts, NULL))
          != -1)
     {
       switch (optc)
@@ -800,6 +1029,10 @@ main (int argc, char **argv)
           delim_specified = true;
           break;
 
+        case 'w':
+          whitespace_delimited = true;
+          break;
+
         case OUTPUT_DELIMITER_OPTION:
           /* Interpret --output-delimiter='' to mean
              'use the NUL byte as the delimiter.'  */
@@ -835,7 +1068,7 @@ main (int argc, char **argv)
 
   if (cut_mode == CUT_MODE_BYTES || cut_mode == CUT_MODE_CHARACTERS)
     {
-      if (delim_specified)
+      if (delim_specified || whitespace_delimited)
         FATAL_ERROR (_("an input delimiter may be specified only\
  when operating on fields"));
 
@@ -844,6 +1077,9 @@ main (int argc, char **argv)
 \tonly when operating on fields"));
     }
 
+  if (delim_specified && whitespace_delimited)
+    FATAL_ERROR (_("-d and -w are mutually exclusive"));
+
   set_fields (spec_list_string,
               (((cut_mode == CUT_MODE_BYTES
                  || cut_mode == CUT_MODE_CHARACTERS)
@@ -880,7 +1116,9 @@ main (int argc, char **argv)
       break;
 
     case CUT_MODE_FIELDS:
-      cut_stream = single_byte_field_delim_ok () ? cut_fields : cut_fields_mb;
+      cut_stream = whitespace_delimited ? cut_fields_ws
+                   : single_byte_field_delim_ok () ? cut_fields
+                   : cut_fields_mb;
       break;
     }
   affirm (cut_stream);
index 3906b4fcfd3e0aee9fad5e13c36261f52b5fad90..0e378ed2470ac45d6d426bed133ad693e94d47cf 100755 (executable)
--- a/tests/cut/cut.pl
+++ b/tests/cut/cut.pl
@@ -36,6 +36,7 @@ my $inval_pos = "$prog: invalid byte or character range\n$try";
 my $no_endpoint = "$prog: invalid range with no endpoint: -\n$try";
 my $nofield = "$prog: an input delimiter may be specified only when " .
               "operating on fields\n$try";
+my $mutual_dw = "$prog: -d and -w are mutually exclusive\n$try";
 
 my @Tests =
  (
@@ -134,6 +135,13 @@ my @Tests =
   ['8bit-delim', '-d', "\255", '--out=_', '-f2,3', {IN=>"a\255b\255c\n"},
    {OUT=>"b_c\n"}],
 
+  ['w-delim-1', '-w', '-f2,3', {IN=>"a\tb  c\n"}, {OUT=>"b\tc\n"}],
+  ['w-delim-2', '-w', '-f1,2', {IN=>"  a b\n"}, {OUT=>"\ta\n"}],
+  ['w-delim-3', '-s', '-w', '-f2', {IN=>"abc\n"}, {OUT=>""}],
+  ['w-delim-4', '-s', '-w', '-f1', {IN=>"a b c\n"}, {OUT=>"a\n"}],
+  ['w-delim-5', '-w', '-d:', '-f1', {EXIT=>1}, {ERR=>$mutual_dw}],
+  ['w-delim-6', '-w', '-f1,2', {IN=>"a  \n"}, {OUT=>"a\t\n"}],
+
   # newline processing for fields
   ['newline-1', '-f1-', {IN=>"a\nb"}, {OUT=>"a\nb\n"}],
   ['newline-2', '-f1-', {IN=>""}, {OUT=>""}],
@@ -266,6 +274,10 @@ if ($mb_locale ne 'C')
        {ENV => "LC_ALL=$mb_locale"}],
       ['mb-delim-3', '-s', '-d', "\xc3\xa9", '-f2',
        {IN=>"abc\n"}, {OUT=>""},
+       {ENV => "LC_ALL=$mb_locale"}],
+      ['mb-w-delim-1', '-w', '-f2', {IN=>"a\xe2\x80\x83b\n"}, {OUT=>"b\n"},
+       {ENV => "LC_ALL=$mb_locale"}],
+      ['mb-w-delim-2', '-sw', '-f2', {IN=>"a\xc2\xa0b\n"}, {OUT=>""},
        {ENV => "LC_ALL=$mb_locale"}];
   }