From: Pádraig Brady Date: Wed, 11 Mar 2026 20:50:23 +0000 (+0000) Subject: cut: support multi-byte input with -c X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=97703386e6905709ba6ab276a0f930e1eabfcd34;p=thirdparty%2Fcoreutils.git cut: support multi-byte input with -c * src/cut.c * tests/cut/cut.pl --- diff --git a/doc/coreutils.texi b/doc/coreutils.texi index b860792dab..042173a720 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -6168,7 +6168,7 @@ internationalization will change that. Tabs and backspaces are treated like any other character; they take up 1 character. If an output delimiter is specified, (see the description of @option{--output-delimiter}), then output that string between ranges -of selected bytes. +of selected characters. @optItem{cut,--complement,} This option is a GNU extension. diff --git a/src/cut.c b/src/cut.c index 702ca55b9f..204fc0dfa2 100644 --- a/src/cut.c +++ b/src/cut.c @@ -32,6 +32,8 @@ #include "assure.h" #include "fadvise.h" #include "getndelim2.h" +#include "ioblksize.h" +#include "mbbuf.h" #include "set-fields.h" @@ -52,9 +54,9 @@ while (0) -/* Pointer inside RP. When checking if a byte or field is selected +/* Pointer inside RP. When checking if a -b,-c,-f is selected by a finite range, we check if it is between CURRENT_RP.LO - and CURRENT_RP.HI. If the byte or field index is greater than + and CURRENT_RP.HI. If the index is greater than CURRENT_RP.HI then we make CURRENT_RP to point to the next range pair. */ static struct field_range_pair *current_rp; @@ -98,6 +100,15 @@ static char output_delimiter_default[1]; /* True if we have ever read standard input. */ static bool have_read_stdin; +/* Whether to cut bytes, characters, or fields. */ +static enum +{ + CUT_MODE_NONE, + CUT_MODE_BYTES, + CUT_MODE_CHARACTERS, + CUT_MODE_FIELDS +} cut_mode = CUT_MODE_NONE; + /* For long options that have no equivalent short option, use a non-character as a pseudo short option, starting with CHAR_MAX + 1. */ enum @@ -293,6 +304,66 @@ cut_bytes (FILE *stream) } } +/* Read from STREAM, printing to standard output any selected characters. */ + +static void +cut_characters (FILE *stream) +{ + uintmax_t char_idx = 0; + bool print_delimiter = false; + static char line_in[IO_BUFSIZE]; + mbbuf_t mbbuf; + + current_rp = frp; + mbbuf_init (&mbbuf, line_in, sizeof line_in, stream); + + while (true) + { + mcel_t g = mbbuf_get_char (&mbbuf); + + if (g.ch == line_delim) + { + if (putchar (line_delim) < 0) + write_error (); + char_idx = 0; + print_delimiter = false; + current_rp = frp; + } + else if (g.ch == MBBUF_EOF) + { + if (char_idx > 0) + { + if (putchar (line_delim) < 0) + write_error (); + } + break; + } + else + { + next_item (&char_idx); + if (print_kth (char_idx)) + { + if (output_delimiter_string != output_delimiter_default) + { + if (print_delimiter && is_range_start_index (char_idx)) + { + if (fwrite (output_delimiter_string, sizeof (char), + output_delimiter_length, stdout) + != output_delimiter_length) + write_error (); + } + print_delimiter = true; + } + + if (fwrite (mbbuf_char_offset (&mbbuf, g), sizeof (char), g.len, + stdout) + != g.len) + write_error (); + } + } + } +} + /* Read from stream STREAM, printing to standard output any selected fields. */ static void @@ -491,7 +562,6 @@ main (int argc, char **argv) int optc; bool ok; bool delim_specified = false; - bool byte_mode = false; char *spec_list_string = NULL; initialize_main (&argc, &argv); @@ -508,12 +578,15 @@ main (int argc, char **argv) switch (optc) { case 'b': + cut_mode = CUT_MODE_BYTES; + FALLTHROUGH; case 'c': - /* Build the byte list. */ - byte_mode = true; + if (optc == 'c') + cut_mode = CUT_MODE_CHARACTERS; FALLTHROUGH; case 'f': - /* Build the field list. */ + if (optc == 'f') + cut_mode = CUT_MODE_FIELDS; if (spec_list_string) FATAL_ERROR (_("only one list may be specified")); spec_list_string = optarg; @@ -561,7 +634,7 @@ main (int argc, char **argv) if (!spec_list_string) FATAL_ERROR (_("you must specify a list of bytes, characters, or fields")); - if (byte_mode) + if (cut_mode == CUT_MODE_BYTES || cut_mode == CUT_MODE_CHARACTERS) { if (delim_specified) FATAL_ERROR (_("an input delimiter may be specified only\ @@ -573,7 +646,9 @@ main (int argc, char **argv) } set_fields (spec_list_string, - ((byte_mode ? SETFLD_ERRMSG_USE_POS : 0) + (((cut_mode == CUT_MODE_BYTES + || cut_mode == CUT_MODE_CHARACTERS) + ? SETFLD_ERRMSG_USE_POS : 0) | (complement ? SETFLD_COMPLEMENT : 0))); if (!delim_specified) @@ -586,7 +661,25 @@ main (int argc, char **argv) output_delimiter_length = 1; } - void (*cut_stream) (FILE *) = byte_mode ? cut_bytes : cut_fields; + void (*cut_stream) (FILE *) = NULL; + switch (cut_mode) + { + case CUT_MODE_NONE: + unreachable (); + + case CUT_MODE_BYTES: + cut_stream = cut_bytes; + break; + + case CUT_MODE_CHARACTERS: + cut_stream = MB_CUR_MAX <= 1 ? cut_bytes : cut_characters; + break; + + case CUT_MODE_FIELDS: + cut_stream = cut_fields; + break; + } + affirm (cut_stream); if (optind == argc) ok = cut_file ("-", cut_stream); else diff --git a/tests/cut/cut.pl b/tests/cut/cut.pl index aa44542198..28a0ae95a2 100755 --- a/tests/cut/cut.pl +++ b/tests/cut/cut.pl @@ -247,6 +247,17 @@ if ($mb_locale ne 'C') push @new, ["$test_name-mb", @new_t, {ENV => "LC_ALL=$mb_locale"}]; } push @Tests, @new; + + push @Tests, + ['mb-char-1', '-c1', {IN=>"\xc3\xa9x\n"}, {OUT=>"\xc3\xa9\n"}, + {ENV => "LC_ALL=$mb_locale"}], + ['mb-char-2', '-c2', {IN=>"\xc3\xa9x\n"}, {OUT=>"x\n"}, + {ENV => "LC_ALL=$mb_locale"}], + ['mb-char-3', '-c1,3', '--output-d=:', + {IN=>"\xc3\xa9a\xe2\x82\xacb\n"}, {OUT=>"\xc3\xa9:\xe2\x82\xac\n"}, + {ENV => "LC_ALL=$mb_locale"}], + ['mb-char-4', '-c1-2', {IN=>"\xc3x\n"}, {OUT=>"\xc3x\n"}, + {ENV => "LC_ALL=$mb_locale"}]; }