From: Pádraig Brady
Date: Thu, 2 Apr 2026 20:56:23 +0000 (+0100)
Subject: cut: -f: fix handling of multi-byte delimiters that span buffers
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=1a44a25808f9bda727ffbda753ff2eeab3bf79cc;p=thirdparty%2Fcoreutils.git

cut: -f: fix handling of multi-byte delimiters that span buffers

* src/cut.c (cut_fields_bytesearch): Ensure up to
delim_bytes -1 is left for the next refill.
* tests/cut/cut.pl: Add a test case.
---

diff --git a/src/cut.c b/src/cut.c
index 80e34cc095..b11a8c4e55 100644
--- a/src/cut.c
+++ b/src/cut.c
@@ -628,6 +628,25 @@ find_field_delim (char *buf, size_t len)
 #endif
 }
 
+/* Return the number of trailing bytes in BUF that could be the initial
+   bytes of a delimiter split across buffers.  */
+
+ATTRIBUTE_PURE
+static idx_t
+field_delim_overlap (char const *buf, idx_t len)
+{
+  idx_t overlap = MIN (len, delim_length - 1);
+
+  while (0 < overlap)
+    {
+      if (memcmp (buf + len - overlap, delim_bytes, overlap) == 0)
+        return overlap;
+      overlap--;
+    }
+
+  return 0;
+}
+
 /* Byte search for line end or delimiter in BUF,
    returning results in CTX.  */
 
@@ -1142,6 +1161,12 @@ cut_fields_bytesearch (FILE *stream)
           idx_t field_len = terminator ? terminator - (chunk + processed)
                                        : n_avail - processed;
 
+          if (terminator_kind == FIELD_DATA
+              && !search.at_eof
+              && !whitespace_delimited
+              && !field_delim_is_line_delim ())
+            field_len -= field_delim_overlap (chunk + processed, field_len);
+
           if (field_len || terminator)
             have_pending_line = true;
 
diff --git a/tests/cut/cut.pl b/tests/cut/cut.pl
index 24fb3d9a5c..bf522b0082 100755
--- a/tests/cut/cut.pl
+++ b/tests/cut/cut.pl
@@ -345,6 +345,9 @@ if ($mb_locale ne 'C')
   ['mb-delim-8', '-d', "\xff", '-f2', # Note 0xF5-0xFF is efficient
    {IN=>"a\xffb\n"}, {OUT=>"b\n"},
    {ENV => "LC_ALL=$mb_locale"}],
+  ['mb-delim-9', '-d', "\xc3\xa9", '-f2',
+   {IN=>('a' x ($IO_BUFSIZE - 1)) . "\xc3\xa9b\n"}, {OUT=>"b\n"},
+   {ENV => "LC_ALL=$mb_locale"}],
   ['mb-w-delim-1', '-w', '-f2', {IN=>"a\xe2\x80\x83b\n"}, {OUT=>"b\n"},
    {ENV => "LC_ALL=$mb_locale"}],
   ['mb-w-delim-2', '-sw', '-f2', {IN=>"a\xc2\xa0b\n"}, {OUT=>""},