From: Pádraig Brady Date: Thu, 12 Mar 2026 17:27:00 +0000 (+0000) Subject: cut: implement -n to avoid outputting partial characters X-Git-Tag: v9.11~117 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=57110d8bae0637baab050466fad2946b645b51a0;p=thirdparty%2Fcoreutils.git cut: implement -n to avoid outputting partial characters Both the i18n patch and FreeBSD/macOS support this option. They do differ in behavior somewhat as the i18n patch may output more bytes than requested. $ printf '\xc3\xa9b\n' | i18n-cut -n -b1 é There is also a bug in the i18n patch with multi-byte at the start of a line: $ printf '\xc3\xa9b\n' | i18n-cut -n -b1-2 éb We follow the FreeBSD behavior since it seems more useful to have -b be a hard limit, rather than a soft limit. This also reduces the possibility of duplicate character output with separate cut invocations with non overlapping byte ranges. * src/cut.c (cut_bytes_no_split): A new function similar to cut_characters, to handle multi-byte characters with byte limit semantics. * tests/cut/cut.pl: Add test cases. --- diff --git a/doc/coreutils.texi b/doc/coreutils.texi index 042173a720..fd0b7edeae 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -6215,7 +6215,10 @@ With @option{-f}, use the first byte of @var{input_delim_byte} as the input fields separator (default is TAB). @optItem{cut,-n,} -Do not split multi-byte characters (no-op for now). +@optItemx{cut,--no-partial,} +With @option{--bytes}, do not split multi-byte characters. +A byte range must encompass the end of a multi-byte character +for it to be selected. @optItem{cut,-s,} @optItemx{cut,--only-delimited,} diff --git a/src/cut.c b/src/cut.c index 7381395d67..613e2a9ea1 100644 --- a/src/cut.c +++ b/src/cut.c @@ -109,6 +109,9 @@ static char output_delimiter_default[MB_LEN_MAX]; /* True if we have ever read standard input. */ static bool have_read_stdin; +/* If true, don't split multibyte characters in byte mode. 
*/ +static bool no_split; + /* If true, interpret each run of whitespace as one field delimiter. */ static bool whitespace_delimited; @@ -135,6 +138,7 @@ static struct option const longopts[] = {"characters", required_argument, NULL, 'c'}, {"fields", required_argument, NULL, 'f'}, {"delimiter", required_argument, NULL, 'd'}, + {"no-partial", no_argument, NULL, 'n'}, {"whitespace-delimited", no_argument, NULL, 'w'}, {"only-delimited", no_argument, NULL, 's'}, {"output-delimiter", required_argument, NULL, OUTPUT_DELIMITER_OPTION}, @@ -185,8 +189,8 @@ Print selected parts of lines from each FILE to standard output.\n\ no delimiter character, unless the -s option is specified\n\ ")); oputs (_("\ - -n\n\ - (ignored)\n\ + -n, --no-partial\n\ + with -b, don't output partial multi-byte characters\n\ ")); oputs (_("\ -s, --only-delimited\n\ @@ -371,6 +375,79 @@ cut_bytes (FILE *stream) } } +/* Read from STREAM, printing selected bytes without splitting + multibyte characters. */ + +static void +cut_bytes_no_split (FILE *stream) +{ + uintmax_t byte_idx = 0; + bool print_delimiter = false; + static char line_in[IO_BUFSIZE]; + mbbuf_t mbbuf; + + current_rp = frp; + mbbuf_init (&mbbuf, line_in, sizeof line_in, stream); + + while (true) + { + mcel_t g = mbbuf_get_char (&mbbuf); + + if (g.ch == line_delim) + { + if (putchar (line_delim) < 0) + write_error (); + byte_idx = 0; + print_delimiter = false; + current_rp = frp; + } + else if (g.ch == MBBUF_EOF) + { + if (byte_idx > 0) + { + if (putchar (line_delim) < 0) + write_error (); + } + break; + } + else + { + bool first_selected_is_range_start = false; + bool seen_selected = false; + bool suffix_selected = true; + + for (idx_t i = 0; i < g.len; i++) + { + next_item (&byte_idx); + if (print_kth (byte_idx)) + { + if (!seen_selected) + { + seen_selected = true; + first_selected_is_range_start + = is_range_start_index (byte_idx); + } + } + else if (seen_selected) + suffix_selected = false; + } + + if (seen_selected && 
suffix_selected) + { + if (output_delimiter_string != output_delimiter_default) + { + if (print_delimiter && first_selected_is_range_start) + write_bytes (output_delimiter_string, + output_delimiter_length); + print_delimiter = true; + } + + write_bytes (mbbuf_char_offset (&mbbuf, g), g.len); + } + } + } +} + /* Read from STREAM, printing to standard output any selected characters. */ static void @@ -1042,6 +1119,7 @@ main (int argc, char **argv) break; case 'n': + no_split = true; break; case 's': @@ -1108,7 +1186,8 @@ main (int argc, char **argv) unreachable (); case CUT_MODE_BYTES: - cut_stream = cut_bytes; + cut_stream = MB_CUR_MAX <= 1 || !no_split + ? cut_bytes : cut_bytes_no_split; break; case CUT_MODE_CHARACTERS: diff --git a/tests/cut/cut.pl b/tests/cut/cut.pl index c768124305..905e84c548 100755 --- a/tests/cut/cut.pl +++ b/tests/cut/cut.pl @@ -269,6 +269,29 @@ if ($mb_locale ne 'C') {ENV => "LC_ALL=$mb_locale"}], ['mb-char-5', '-c1-2', {IN=>"\xc3x\n"}, {OUT=>"\xc3x\n"}, {ENV => "LC_ALL=$mb_locale"}], + # Note mb-byte-n-1 and mb-byte-n-4 differ from coreutils-i18n patch, + # which outputs a character if any byte is selected. + # I.e., the i18n patch may output more bytes than the requested range. + # Also mb-byte-n-3 differs from coreutils-i18n patch, + # but that looks like a bug in that patch rather than a design choice. 
+ ['mb-byte-n-1', qw(-b1 -n), {IN=>"\xc3\xa9x\n"}, {OUT=>"\n"}, + {ENV => "LC_ALL=$mb_locale"}], + ['mb-byte-n-2', qw(-b2 -n), {IN=>"\xc3\xa9x\n"}, {OUT=>"\xc3\xa9\n"}, + {ENV => "LC_ALL=$mb_locale"}], + ['mb-byte-n-3', qw(-b1-2 -n), {IN=>"\xc3\xa9x\n"}, {OUT=>"\xc3\xa9\n"}, + {ENV => "LC_ALL=$mb_locale"}], + ['mb-byte-n-4', qw(-b1,3 -n), {IN=>"\xc3\xa9x\n"}, {OUT=>"x\n"}, + {ENV => "LC_ALL=$mb_locale"}], + ['mb-byte-n-5', qw(-b2-3 -n), {IN=>"\xc3\xa9x\n"}, {OUT=>"\xc3\xa9x\n"}, + {ENV => "LC_ALL=$mb_locale"}], + ['mb-byte-n-6', qw(-b2 -n), {IN=>"\xe2\x82\xacx\n"}, {OUT=>"\n"}, + {ENV => "LC_ALL=$mb_locale"}], + ['mb-byte-n-7', qw(-b3 -n), {IN=>"\xe2\x82\xacx\n"}, + {OUT=>"\xe2\x82\xac\n"}, + {ENV => "LC_ALL=$mb_locale"}], + ['mb-byte-n-8', qw(-b2-3 -n), {IN=>"\xe2\x82\xacx\n"}, + {OUT=>"\xe2\x82\xac\n"}, + {ENV => "LC_ALL=$mb_locale"}], ['mb-delim-1', '-d', "\xc3\xa9", '-f2', {IN=>"a\xc3\xa9b\xc3\xa9c\n"}, {OUT=>"b\n"}, {ENV => "LC_ALL=$mb_locale"}],