git.ipfire.org Git - thirdparty/coreutils.git/commitdiff
cut: optimize -b by avoiding per byte iteration
author: Pádraig Brady <P@draigBrady.com>
Sun, 22 Mar 2026 12:20:04 +0000 (12:20 +0000)
committer: Pádraig Brady <P@draigBrady.com>
Sun, 5 Apr 2026 12:15:56 +0000 (13:15 +0100)
Always call memchr() for line_delim, which is fast and allows:

- skipping whole segments when the next selected byte is beyond them
- skipping unselected prefixes in bulk
- writing contiguous selected spans in bulk

This wins for lines >= 4 characters,
but is slower for lines <= 3 characters, especially if selecting bytes 1-3.
Such short lines are unusual though.

src/cut.c

index 208fa6e4a62901acf89e62527deb08107596a06e..9f73eefd5c31a7a6d5e32d91ea60527005f06155 100644 (file)
--- a/src/cut.c
+++ b/src/cut.c
@@ -658,6 +658,13 @@ field_selection_exhausted (uintmax_t field_idx)
   return !print_kth (field_idx) && current_rp->lo == UINTMAX_MAX;
 }
 
+static inline void
+sync_byte_selection (uintmax_t byte_idx)
+{
+  while (current_rp->hi <= byte_idx)
+    current_rp++;
+}
+
 static inline void
 reset_field_line (uintmax_t *field_idx, bool *found_any_selected_field,
                   bool *have_pending_line, struct mbfield_parser *parser)
@@ -675,35 +682,58 @@ reset_field_line (uintmax_t *field_idx, bool *found_any_selected_field,
 static void
 cut_bytes (FILE *stream)
 {
-  uintmax_t byte_idx;  /* Number of bytes in the line so far.  */
-  /* Whether to begin printing delimiters between ranges for the current line.
-     Set after we've begun printing data corresponding to the first range.  */
-  bool print_delimiter;
+  uintmax_t byte_idx = 0;
+  bool print_delimiter = false;
+  static char line_in[IO_BUFSIZE];
 
-  byte_idx = 0;
-  print_delimiter = false;
   current_rp = frp;
+
   while (true)
     {
-      int c;           /* Each character from the file.  */
-
-      c = getc (stream);
-
-      if (c == line_delim)
-        reset_item_line (&byte_idx, &print_delimiter);
-      else if (c == EOF)
+      idx_t available = fread (line_in, sizeof *line_in, sizeof line_in,
+                               stream);
+      if (available == 0)
         {
           write_pending_line_delim (byte_idx);
           break;
         }
-      else
+
+      idx_t processed = 0;
+
+      while (processed < available)
         {
-          next_item (&byte_idx);
-          if (print_kth (byte_idx))
+          char *line = line_in + processed;
+          char *line_end = memchr ((void *) line, line_delim,
+                                   available - processed);
+          char *end = line + (line_end ? line_end - line : available - processed);
+          char *p = line;
+
+          while (p < end)
             {
-              char ch = c;
-              write_selected_item (&print_delimiter,
-                                   is_range_start_index (byte_idx), &ch, 1);
+              sync_byte_selection (byte_idx);
+
+              if (byte_idx + 1 < current_rp->lo)
+                {
+                  idx_t skip = MIN (end - p, current_rp->lo - (byte_idx + 1));
+                  p += skip;
+                  byte_idx += skip;
+                }
+              else
+                {
+                  idx_t n = MIN (end - p, current_rp->hi - byte_idx);
+                  write_selected_item (&print_delimiter,
+                                       is_range_start_index (byte_idx + 1),
+                                       p, n);
+                  p += n;
+                  byte_idx += n;
+                }
+            }
+
+          processed += end - line;
+          if (line_end)
+            {
+              processed++;
+              reset_item_line (&byte_idx, &print_delimiter);
             }
         }
     }