git.ipfire.org Git - thirdparty/coreutils.git/commitdiff
cut: -f: fix handling of multi-byte delimiters that span buffers
author: Pádraig Brady <P@draigBrady.com>
Thu, 2 Apr 2026 20:56:23 +0000 (21:56 +0100)
committer: Pádraig Brady <P@draigBrady.com>
Mon, 6 Apr 2026 14:52:58 +0000 (15:52 +0100)
* src/cut.c (cut_fields_bytesearch): Ensure up to delim_length - 1
bytes are left for the next refill.
* tests/cut/cut.pl: Add a test case.

src/cut.c
tests/cut/cut.pl

index 80e34cc0951a9a1577f003bb4e7bb593b22ec2ae..b11a8c4e5579dd9112fe94de7653234a6ebc1f68 100644 (file)
--- a/src/cut.c
+++ b/src/cut.c
@@ -628,6 +628,25 @@ find_field_delim (char *buf, size_t len)
 #endif
 }
 
+/* Return the length of the longest suffix of the LEN bytes at BUF that is
+   also a proper prefix of delim_bytes (a possibly split delimiter), or 0.  */
+
+ATTRIBUTE_PURE
+static idx_t
+field_delim_overlap (char const *buf, idx_t len)
+{
+  idx_t overlap = MIN (len, delim_length - 1);  /* Longest candidate.  */
+
+  while (0 < overlap)
+    {
+      if (memcmp (buf + len - overlap, delim_bytes, overlap) == 0)
+        return overlap;
+      overlap--;  /* Try the next shorter suffix.  */
+    }
+
+  return 0;  /* No suffix of BUF begins the delimiter.  */
+}
+
 /* Byte search for line end or delimiter in BUF,
    returning results in CTX.  */
 
@@ -1142,6 +1161,12 @@ cut_fields_bytesearch (FILE *stream)
           idx_t field_len = terminator ? terminator - (chunk + processed)
                                        : n_avail - processed;
 
+          if (terminator_kind == FIELD_DATA
+              && !search.at_eof
+              && !whitespace_delimited
+              && !field_delim_is_line_delim ())
+            field_len -= field_delim_overlap (chunk + processed, field_len);
+
           if (field_len || terminator)
             have_pending_line = true;
 
index 24fb3d9a5c36e35c5097de74325e47ec9f06107a..bf522b0082b88c05994b801173d555c9237c1b4c 100755 (executable)
--- a/tests/cut/cut.pl
+++ b/tests/cut/cut.pl
@@ -345,6 +345,9 @@ if ($mb_locale ne 'C')
       ['mb-delim-8', '-d', "\xff", '-f2',  # Note 0xF5-0xFF is efficient
        {IN=>"a\xffb\n"}, {OUT=>"b\n"},
        {ENV => "LC_ALL=$mb_locale"}],
+      ['mb-delim-9', '-d', "\xc3\xa9", '-f2',
+       {IN=>('a' x ($IO_BUFSIZE - 1)) . "\xc3\xa9b\n"}, {OUT=>"b\n"},
+       {ENV => "LC_ALL=$mb_locale"}],
       ['mb-w-delim-1', '-w', '-f2', {IN=>"a\xe2\x80\x83b\n"}, {OUT=>"b\n"},
        {ENV => "LC_ALL=$mb_locale"}],
       ['mb-w-delim-2', '-sw', '-f2', {IN=>"a\xc2\xa0b\n"}, {OUT=>""},