git.ipfire.org Git - thirdparty/coreutils.git/commitdiff
cut: optimize when no delimiter in input
author Pádraig Brady <P@draigBrady.com>
Sat, 21 Mar 2026 14:15:48 +0000 (14:15 +0000)
committer Pádraig Brady <P@draigBrady.com>
Sun, 5 Apr 2026 12:15:56 +0000 (13:15 +0100)
This is about 20x faster.
Note we only do the delimiter search once per chunk,
and it's usually quick as delimiters wouldn't be too far
into a chunk if present, so we don't bother
to cache the found delimiter.

src/cut.c
tests/cut/cut.pl

index 15f2c0fa9ded71e114d374cb67cbfb71b66cd556..208fa6e4a62901acf89e62527deb08107596a06e 100644 (file)
--- a/src/cut.c
+++ b/src/cut.c
@@ -936,6 +936,31 @@ cut_fields_bytesearch (FILE *stream)
           .blank_delimited = whitespace_delimited
         };
 
+      /* Shortcut the case where there is no delimiter in input,
+         as directly outputting without parsing is 20x faster.  */
+      if (field_idx == 1
+          && !suppress_non_delimited && !whitespace_delimited
+          && !field_delim_is_line_delim ()
+          && !have_pending_line
+          && field_1_n_bytes == 0
+          && !skip_line_remainder
+          && !find_bytesearch_field_delim (chunk, safe))
+        {
+          char *last_line_delim = feof (mbbuf.fp) ? chunk + safe - 1
+                                  : memrchr ((void *) chunk, line_delim, safe);
+          if (last_line_delim)
+            {
+              idx_t n = last_line_delim - chunk + 1;
+              write_bytes (chunk, n);
+              if (feof (mbbuf.fp) && chunk[n - 1] != line_delim)
+                write_line_delim ();
+              mbbuf_advance (&mbbuf, n);
+              if (feof (mbbuf.fp))
+                return;
+              continue;
+            }
+        }
+
       while (processed < safe)
         {
           char *terminator = NULL;
index 8c3c06653ef4754840df8ed68994a3d0310627c2..bfccdbe85f05e52c7dcc7f07ccd8051ed6c31f5a 100755 (executable)
@@ -170,6 +170,7 @@ my @Tests =
   ['newline-4', '-d:', '-f1', {IN=>"a:1\nb:2"}, {OUT=>"a\nb\n"}],
   ['newline-5', '-d:', '-f2', {IN=>"a:1\nb:2\n"}, {OUT=>"1\n2\n"}],
   ['newline-6', '-d:', '-f2', {IN=>"a:1\nb:2"}, {OUT=>"1\n2\n"}],
+  ['newline-6a', '-d:', '-f2', {IN=>"a\nb"}, {OUT=>"a\nb\n"}],
   ['newline-7', '-s', '-d:', '-f1', {IN=>"a:1\nb:2"}, {OUT=>"a\nb\n"}],
   ['newline-8', '-s', '-d:', '-f1', {IN=>"a:1\nb:2\n"}, {OUT=>"a\nb\n"}],
   ['newline-9', '-s', '-d:', '-f1', {IN=>"a1\nb2"}, {OUT=>""}],