git.ipfire.org Git - thirdparty/coreutils.git/commitdiff
cut: refactor multi-byte updates
author Pádraig Brady <P@draigBrady.com>
Thu, 12 Mar 2026 18:58:46 +0000 (18:58 +0000)
committer Pádraig Brady <P@draigBrady.com>
Sun, 5 Apr 2026 12:15:56 +0000 (13:15 +0100)
* src/cut.c: 160 fewer lines

Helpers extracted (replacing repeated inline patterns):
- write_line_delim(), write_pending_line_delim(), reset_item_line()
  - line boundary code used by cut_bytes{,_no_split}, cut_characters
- write_selected_item()
  - output-delimiter + write logic used by all three byte/char functions
- reset_field_line()
  - field line reset used by cut_fields_mb_any

Field functions unified via cut_fields_mb_any(stream, whitespace_mode):
- struct mbfield_parser encapsulates the whitespace vs.
  fixed-delimiter state (saved char, mode flag)
- mbfield_get_char() - dispatches to saved-char or direct read
- mbfield_terminator()
  - returns FIELD_{DATA,DELIMITER,LINE_DELIMITER} based on mode
- read_mb_field_to_buffer()
  - replaces the two duplicated first-field buffering loops
- scan_mb_field(mbbuf, parser, pending, write_field)
  - replaces the four duplicated field scan loops
  (print+skip × two modes) with a single function and a write_field bool
- cut_fields_mb and cut_fields_ws are now trivial wrappers

src/cut.c

index 613e2a9ea193527823a33b6b851a9e88fdb18f5e..507d1ba0747a9b2e71d63370ba14ef6370304a9a 100644 (file)
--- a/src/cut.c
+++ b/src/cut.c
@@ -273,11 +273,19 @@ field_delim_eq (mcel_t g)
 
 enum field_terminator
 {
+  FIELD_DATA,
   FIELD_DELIMITER,
   FIELD_LINE_DELIMITER,
   FIELD_EOF
 };
 
+struct mbfield_parser
+{
+  bool whitespace_delimited;
+  bool have_saved;
+  mcel_t saved_g;
+};
+
 static inline mcel_t
 mbbuf_get_saved_char (mbbuf_t *mbbuf, bool *have_saved, mcel_t *saved_g)
 {
@@ -315,6 +323,133 @@ write_bytes (char const *buf, size_t n_bytes)
     write_error ();
 }
 
+static inline void
+write_line_delim (void)
+{
+  if (putchar (line_delim) < 0)
+    write_error ();
+}
+
+static inline void
+reset_item_line (uintmax_t *item_idx, bool *print_delimiter)
+{
+  write_line_delim ();
+  *item_idx = 0;
+  *print_delimiter = false;
+  current_rp = frp;
+}
+
+static inline void
+write_pending_line_delim (uintmax_t item_idx)
+{
+  if (item_idx > 0)
+    write_line_delim ();
+}
+
+static inline void
+write_selected_item (bool *print_delimiter, bool range_start,
+                     char const *buf, size_t n_bytes)
+{
+  if (output_delimiter_string != output_delimiter_default)
+    {
+      if (*print_delimiter && range_start)
+        write_bytes (output_delimiter_string, output_delimiter_length);
+      *print_delimiter = true;
+    }
+
+  write_bytes (buf, n_bytes);
+}
+
+static inline mcel_t
+mbfield_get_char (mbbuf_t *mbbuf, struct mbfield_parser *parser)
+{
+  return (parser->whitespace_delimited
+          ? mbbuf_get_saved_char (mbbuf, &parser->have_saved, &parser->saved_g)
+          : mbbuf_get_char (mbbuf));
+}
+
+static inline enum field_terminator
+mbfield_terminator (mbbuf_t *mbbuf, struct mbfield_parser *parser, mcel_t g,
+                    bool *have_pending_line)
+{
+  if (g.ch == line_delim)
+    return FIELD_LINE_DELIMITER;
+
+  if (parser->whitespace_delimited)
+    return (c32issep (g.ch)
+            ? skip_whitespace_delim (mbbuf, &parser->have_saved,
+                                     &parser->saved_g, have_pending_line)
+            : FIELD_DATA);
+
+  return field_delim_eq (g) ? FIELD_DELIMITER : FIELD_DATA;
+}
+
+static inline void
+append_field_1_bytes (mbbuf_t *mbbuf, mcel_t g, size_t *n_bytes)
+{
+  if (field_1_bufsize - *n_bytes < g.len)
+    {
+      field_1_buffer = xpalloc (field_1_buffer, &field_1_bufsize,
+                                g.len, -1, sizeof *field_1_buffer);
+    }
+
+  memcpy (field_1_buffer + *n_bytes, mbbuf_char_offset (mbbuf, g), g.len);
+  *n_bytes += g.len;
+}
+
+static enum field_terminator
+read_mb_field_to_buffer (mbbuf_t *mbbuf, struct mbfield_parser *parser,
+                         bool *have_pending_line, size_t *n_bytes)
+{
+  while (true)
+    {
+      mcel_t g = mbfield_get_char (mbbuf, parser);
+      if (g.ch == MBBUF_EOF)
+        return FIELD_EOF;
+
+      *have_pending_line = true;
+
+      enum field_terminator terminator
+        = mbfield_terminator (mbbuf, parser, g, have_pending_line);
+      if (terminator != FIELD_DATA)
+        return terminator;
+
+      append_field_1_bytes (mbbuf, g, n_bytes);
+    }
+}
+
+static enum field_terminator
+scan_mb_field (mbbuf_t *mbbuf, struct mbfield_parser *parser,
+               bool *have_pending_line, bool write_field)
+{
+  while (true)
+    {
+      mcel_t g = mbfield_get_char (mbbuf, parser);
+      if (g.ch == MBBUF_EOF)
+        return FIELD_EOF;
+
+      *have_pending_line = true;
+
+      enum field_terminator terminator
+        = mbfield_terminator (mbbuf, parser, g, have_pending_line);
+      if (terminator != FIELD_DATA)
+        return terminator;
+
+      if (write_field)
+        write_bytes (mbbuf_char_offset (mbbuf, g), g.len);
+    }
+}
+
+static inline void
+reset_field_line (uintmax_t *field_idx, bool *found_any_selected_field,
+                  bool *have_pending_line)
+{
+  *field_idx = 1;
+  current_rp = frp;
+  *found_any_selected_field = false;
+  *have_pending_line = false;
+}
+
 /* Read from stream STREAM, printing to standard output any selected bytes.  */
 
 static void
@@ -335,20 +470,10 @@ cut_bytes (FILE *stream)
       c = getc (stream);
 
       if (c == line_delim)
-        {
-          if (putchar (c) < 0)
-            write_error ();
-          byte_idx = 0;
-          print_delimiter = false;
-          current_rp = frp;
-        }
+        reset_item_line (&byte_idx, &print_delimiter);
       else if (c == EOF)
         {
-          if (byte_idx > 0)
-          {
-            if (putchar (line_delim) < 0)
-              write_error ();
-          }
+          write_pending_line_delim (byte_idx);
           break;
         }
       else
@@ -356,20 +481,9 @@ cut_bytes (FILE *stream)
           next_item (&byte_idx);
           if (print_kth (byte_idx))
             {
-              if (output_delimiter_string != output_delimiter_default)
-                {
-                  if (print_delimiter && is_range_start_index (byte_idx))
-                    {
-                      if (fwrite (output_delimiter_string, sizeof (char),
-                                  output_delimiter_length, stdout)
-                          != output_delimiter_length)
-                        write_error ();
-                    }
-                  print_delimiter = true;
-                }
-
-              if (putchar (c) < 0)
-                write_error ();
+              char ch = c;
+              write_selected_item (&print_delimiter,
+                                   is_range_start_index (byte_idx), &ch, 1);
             }
         }
     }
@@ -394,20 +508,10 @@ cut_bytes_no_split (FILE *stream)
       mcel_t g = mbbuf_get_char (&mbbuf);
 
       if (g.ch == line_delim)
-        {
-          if (putchar (line_delim) < 0)
-            write_error ();
-          byte_idx = 0;
-          print_delimiter = false;
-          current_rp = frp;
-        }
+        reset_item_line (&byte_idx, &print_delimiter);
       else if (g.ch == MBBUF_EOF)
         {
-          if (byte_idx > 0)
-            {
-              if (putchar (line_delim) < 0)
-                write_error ();
-            }
+          write_pending_line_delim (byte_idx);
           break;
         }
       else
@@ -433,17 +537,8 @@ cut_bytes_no_split (FILE *stream)
             }
 
           if (seen_selected && suffix_selected)
-            {
-              if (output_delimiter_string != output_delimiter_default)
-                {
-                  if (print_delimiter && first_selected_is_range_start)
-                    write_bytes (output_delimiter_string,
-                                 output_delimiter_length);
-                  print_delimiter = true;
-                }
-
-              write_bytes (mbbuf_char_offset (&mbbuf, g), g.len);
-            }
+            write_selected_item (&print_delimiter,first_selected_is_range_start,
+                                 mbbuf_char_offset (&mbbuf, g), g.len);
         }
     }
 }
@@ -466,207 +561,40 @@ cut_characters (FILE *stream)
       mcel_t g = mbbuf_get_char (&mbbuf);
 
       if (g.ch == line_delim)
-        {
-          if (putchar (line_delim) < 0)
-            write_error ();
-          char_idx = 0;
-          print_delimiter = false;
-          current_rp = frp;
-        }
+        reset_item_line (&char_idx, &print_delimiter);
       else if (g.ch == MBBUF_EOF)
         {
-          if (char_idx > 0)
-            {
-              if (putchar (line_delim) < 0)
-                write_error ();
-            }
+          write_pending_line_delim (char_idx);
           break;
         }
       else
         {
           next_item (&char_idx);
           if (print_kth (char_idx))
-            {
-              if (output_delimiter_string != output_delimiter_default)
-                {
-                  if (print_delimiter && is_range_start_index (char_idx))
-                    {
-                      if (fwrite (output_delimiter_string, sizeof (char),
-                                  output_delimiter_length, stdout)
-                          != output_delimiter_length)
-                        write_error ();
-                    }
-                  print_delimiter = true;
-                }
-
-              if (fwrite (mbbuf_char_offset (&mbbuf, g), sizeof (char), g.len,
-                          stdout)
-                  != g.len)
-                write_error ();
-            }
+            write_selected_item (&print_delimiter,
+                                 is_range_start_index (char_idx),
+                                 mbbuf_char_offset (&mbbuf, g), g.len);
         }
     }
 }
 
 /* Read from STREAM, printing to standard output any selected fields,
-   using a multibyte field delimiter.  */
+   using a multibyte-aware field delimiter parser.  */
 
 static void
-cut_fields_mb (FILE *stream)
+cut_fields_mb_any (FILE *stream, bool whitespace_mode)
 {
   static char line_in[IO_BUFSIZE];
   mbbuf_t mbbuf;
-  uintmax_t field_idx = 1;
-  bool found_any_selected_field = false;
-  bool buffer_first_field;
-  bool have_pending_line = false;
-
-  current_rp = frp;
-  mbbuf_init (&mbbuf, line_in, sizeof line_in, stream);
-
-  buffer_first_field = (suppress_non_delimited ^ !print_kth (1));
-
-  while (true)
+  struct mbfield_parser parser =
     {
-      if (field_idx == 1 && buffer_first_field)
-        {
-          size_t n_bytes = 0;
-          enum field_terminator terminator;
-
-          while (true)
-            {
-              mcel_t g = mbbuf_get_char (&mbbuf);
-
-              if (g.ch == MBBUF_EOF)
-                {
-                  if (n_bytes == 0)
-                    return;
-                  terminator = FIELD_EOF;
-                  break;
-                }
-
-              if (field_1_bufsize - n_bytes < g.len)
-                {
-                  field_1_buffer = xpalloc (field_1_buffer, &field_1_bufsize,
-                                            g.len, -1,
-                                            sizeof *field_1_buffer);
-                }
-
-              memcpy (field_1_buffer + n_bytes, mbbuf_char_offset (&mbbuf, g),
-                      g.len);
-              n_bytes += g.len;
-              have_pending_line = true;
-
-              if (g.ch == line_delim)
-                {
-                  terminator = FIELD_LINE_DELIMITER;
-                  break;
-                }
-
-              if (field_delim_eq (g))
-                {
-                  terminator = FIELD_DELIMITER;
-                  break;
-                }
-            }
-
-          if (terminator != FIELD_DELIMITER)
-            {
-              if (!suppress_non_delimited)
-                {
-                  write_bytes (field_1_buffer, n_bytes);
-                  if (terminator == FIELD_EOF)
-                    {
-                      if (putchar (line_delim) < 0)
-                        write_error ();
-                    }
-                }
-
-              if (terminator == FIELD_EOF)
-                break;
-
-              field_idx = 1;
-              current_rp = frp;
-              found_any_selected_field = false;
-              have_pending_line = false;
-              continue;
-            }
-
-          if (print_kth (1))
-            {
-              write_bytes (field_1_buffer, n_bytes - delim_length);
-              found_any_selected_field = true;
-            }
-          next_item (&field_idx);
-        }
-
-      mcel_t g;
-
-      if (print_kth (field_idx))
-        {
-          if (found_any_selected_field)
-            write_bytes (output_delimiter_string, output_delimiter_length);
-          found_any_selected_field = true;
-
-          while (true)
-            {
-              g = mbbuf_get_char (&mbbuf);
-              if (g.ch != MBBUF_EOF)
-                have_pending_line = true;
-              if (g.ch == MBBUF_EOF || g.ch == line_delim || field_delim_eq (g))
-                break;
-              write_bytes (mbbuf_char_offset (&mbbuf, g), g.len);
-            }
-        }
-      else
-        {
-          while (true)
-            {
-              g = mbbuf_get_char (&mbbuf);
-              if (g.ch != MBBUF_EOF)
-                have_pending_line = true;
-              if (g.ch == MBBUF_EOF || g.ch == line_delim || field_delim_eq (g))
-                break;
-            }
-        }
-
-      if (field_delim_eq (g))
-        next_item (&field_idx);
-      else if (g.ch == line_delim || g.ch == MBBUF_EOF)
-        {
-          if (g.ch == MBBUF_EOF && !have_pending_line)
-            break;
-          if (found_any_selected_field
-              || !(suppress_non_delimited && field_idx == 1))
-            {
-              if (putchar (line_delim) < 0)
-                write_error ();
-            }
-          if (g.ch == MBBUF_EOF)
-            break;
-
-          field_idx = 1;
-          current_rp = frp;
-          found_any_selected_field = false;
-          have_pending_line = false;
-        }
-    }
-}
-
-/* Read from STREAM, printing to standard output any selected fields,
-   using runs of whitespace as the field delimiter.  */
-
-static void
-cut_fields_ws (FILE *stream)
-{
-  static char line_in[IO_BUFSIZE];
-  mbbuf_t mbbuf;
+      .whitespace_delimited = whitespace_mode,
+      .saved_g = { .ch = MBBUF_EOF }
+    };
   uintmax_t field_idx = 1;
   bool found_any_selected_field = false;
   bool buffer_first_field;
   bool have_pending_line = false;
-  bool have_saved = false;
-  mcel_t saved_g = { .ch = MBBUF_EOF };
 
   current_rp = frp;
   mbbuf_init (&mbbuf, line_in, sizeof line_in, stream);
@@ -678,71 +606,25 @@ cut_fields_ws (FILE *stream)
       if (field_idx == 1 && buffer_first_field)
         {
           size_t n_bytes = 0;
-          enum field_terminator terminator;
-
-          while (true)
-            {
-              mcel_t g = mbbuf_get_saved_char (&mbbuf, &have_saved, &saved_g);
-
-              if (g.ch == MBBUF_EOF)
-                {
-                  if (n_bytes == 0)
-                    return;
-                  terminator = FIELD_EOF;
-                  break;
-                }
-
-              have_pending_line = true;
-
-              if (g.ch == line_delim)
-                {
-                  if (field_1_bufsize - n_bytes < g.len)
-                    field_1_buffer = xpalloc (field_1_buffer, &field_1_bufsize,
-                                              g.len, -1,
-                                              sizeof *field_1_buffer);
-                  memcpy (field_1_buffer + n_bytes,
-                          mbbuf_char_offset (&mbbuf, g), g.len);
-                  n_bytes += g.len;
-                  terminator = FIELD_LINE_DELIMITER;
-                  break;
-                }
-
-              if (c32issep (g.ch))
-                {
-                  terminator = skip_whitespace_delim (&mbbuf, &have_saved,
-                                                      &saved_g,
-                                                      &have_pending_line);
-                  break;
-                }
-
-              if (field_1_bufsize - n_bytes < g.len)
-                field_1_buffer = xpalloc (field_1_buffer, &field_1_bufsize,
-                                          g.len, -1,
-                                          sizeof *field_1_buffer);
-              memcpy (field_1_buffer + n_bytes, mbbuf_char_offset (&mbbuf, g),
-                      g.len);
-              n_bytes += g.len;
-            }
+          enum field_terminator terminator
+            = read_mb_field_to_buffer (&mbbuf, &parser, &have_pending_line,
+                                       &n_bytes);
+          if (terminator == FIELD_EOF && n_bytes == 0)
+            return;
 
           if (terminator != FIELD_DELIMITER)
             {
               if (!suppress_non_delimited)
                 {
                   write_bytes (field_1_buffer, n_bytes);
-                  if (terminator == FIELD_EOF)
-                    {
-                      if (putchar (line_delim) < 0)
-                        write_error ();
-                    }
+                  write_line_delim ();
                 }
 
               if (terminator == FIELD_EOF)
                 break;
 
-              field_idx = 1;
-              current_rp = frp;
-              found_any_selected_field = false;
-              have_pending_line = false;
+              reset_field_line (&field_idx, &found_any_selected_field,
+                                &have_pending_line);
               continue;
             }
 
@@ -755,71 +637,17 @@ cut_fields_ws (FILE *stream)
         }
 
       enum field_terminator terminator;
+      bool write_field = print_kth (field_idx);
 
-      if (print_kth (field_idx))
+      if (write_field)
         {
           if (found_any_selected_field)
             write_bytes (output_delimiter_string, output_delimiter_length);
           found_any_selected_field = true;
-
-          while (true)
-            {
-              mcel_t g = mbbuf_get_saved_char (&mbbuf, &have_saved, &saved_g);
-
-              if (g.ch == MBBUF_EOF)
-                {
-                  terminator = FIELD_EOF;
-                  break;
-                }
-
-              have_pending_line = true;
-
-              if (g.ch == line_delim)
-                {
-                  terminator = FIELD_LINE_DELIMITER;
-                  break;
-                }
-
-              if (c32issep (g.ch))
-                {
-                  terminator = skip_whitespace_delim (&mbbuf, &have_saved,
-                                                      &saved_g,
-                                                      &have_pending_line);
-                  break;
-                }
-
-              write_bytes (mbbuf_char_offset (&mbbuf, g), g.len);
-            }
         }
-      else
-        {
-          while (true)
-            {
-              mcel_t g = mbbuf_get_saved_char (&mbbuf, &have_saved, &saved_g);
 
-              if (g.ch == MBBUF_EOF)
-                {
-                  terminator = FIELD_EOF;
-                  break;
-                }
-
-              have_pending_line = true;
-
-              if (g.ch == line_delim)
-                {
-                  terminator = FIELD_LINE_DELIMITER;
-                  break;
-                }
-
-              if (c32issep (g.ch))
-                {
-                  terminator = skip_whitespace_delim (&mbbuf, &have_saved,
-                                                      &saved_g,
-                                                      &have_pending_line);
-                  break;
-                }
-            }
-        }
+      terminator = scan_mb_field (&mbbuf, &parser, &have_pending_line,
+                                  write_field);
 
       if (terminator == FIELD_DELIMITER)
         next_item (&field_idx);
@@ -829,21 +657,34 @@ cut_fields_ws (FILE *stream)
             break;
           if (found_any_selected_field
               || !(suppress_non_delimited && field_idx == 1))
-            {
-              if (putchar (line_delim) < 0)
-                write_error ();
-            }
+            write_line_delim ();
           if (terminator == FIELD_EOF)
             break;
 
-          field_idx = 1;
-          current_rp = frp;
-          found_any_selected_field = false;
-          have_pending_line = false;
+          reset_field_line (&field_idx, &found_any_selected_field,
+                            &have_pending_line);
         }
     }
 }
 
+/* Read from STREAM, printing to standard output any selected fields,
+   using a multibyte field delimiter.  */
+
+static void
+cut_fields_mb (FILE *stream)
+{
+  cut_fields_mb_any (stream, false);
+}
+
+/* Read from STREAM, printing to standard output any selected fields,
+   using runs of whitespace as the field delimiter.  */
+
+static void
+cut_fields_ws (FILE *stream)
+{
+  cut_fields_mb_any (stream, true);
+}
+
 /* Read from stream STREAM, printing to standard output any selected fields.  */
 
 static void