From: Pádraig Brady Date: Wed, 11 Mar 2026 22:42:45 +0000 (+0000) Subject: cut: implement -w,--whitespace-delimited X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=19aa72b4eab2b616e8759642b119b2098a239e27;p=thirdparty%2Fcoreutils.git cut: implement -w,--whitespace-delimited * src/cut.c (cut_fields_ws): A new function handling both uni-byte and multi-byte cases. * tests/cut/cut.pl: Add test cases. --- diff --git a/src/cut.c b/src/cut.c index 82e9065b69..65fc664277 100644 --- a/src/cut.c +++ b/src/cut.c @@ -109,6 +109,9 @@ static char output_delimiter_default[MB_LEN_MAX]; /* True if we have ever read standard input. */ static bool have_read_stdin; +/* If true, interpret each run of whitespace as one field delimiter. */ +static bool whitespace_delimited; + /* Whether to cut bytes, characters, or fields. */ static enum { @@ -132,6 +135,7 @@ static struct option const longopts[] = {"characters", required_argument, NULL, 'c'}, {"fields", required_argument, NULL, 'f'}, {"delimiter", required_argument, NULL, 'd'}, + {"whitespace-delimited", no_argument, NULL, 'w'}, {"only-delimited", no_argument, NULL, 's'}, {"output-delimiter", required_argument, NULL, OUTPUT_DELIMITER_OPTION}, {"complement", no_argument, NULL, COMPLEMENT_OPTION}, @@ -192,6 +196,10 @@ Print selected parts of lines from each FILE to standard output.\n\ --output-delimiter=STRING\n\ use STRING as the output delimiter;\n\ the default is to use the input delimiter\n\ +")); + oputs (_("\ + -w, --whitespace-delimited\n\ + use runs of whitespace as the field delimiter\n\ ")); oputs (_("\ -z, --zero-terminated\n\ @@ -259,6 +267,43 @@ field_delim_eq (mcel_t g) return delim_mcel.err ? 
g.err == delim_mcel.err : mcel_eq (g, delim_mcel); } +enum field_terminator +{ + FIELD_DELIMITER, + FIELD_LINE_DELIMITER, + FIELD_EOF +}; + +static inline mcel_t +mbbuf_get_saved_char (mbbuf_t *mbbuf, bool *have_saved, mcel_t *saved_g) +{ + if (*have_saved) + { + *have_saved = false; + return *saved_g; + } + return mbbuf_get_char (mbbuf); +} + +static inline enum field_terminator +skip_whitespace_delim (mbbuf_t *mbuf, bool *have_saved, mcel_t *saved_g, + bool *have_pending_line) +{ + mcel_t g; + + do + { + g = mbbuf_get_char (mbuf); + if (g.ch != MBBUF_EOF) + *have_pending_line = true; + } + while (g.ch != MBBUF_EOF && g.ch != line_delim && c32issep (g.ch)); + + *saved_g = g; + *have_saved = true; + return FIELD_DELIMITER; +} + static void write_bytes (char const *buf, size_t n_bytes) { @@ -392,13 +437,6 @@ cut_characters (FILE *stream) static void cut_fields_mb (FILE *stream) { - enum field_terminator - { - FIELD_DELIMITER, - FIELD_LINE_DELIMITER, - FIELD_EOF - }; - static char line_in[IO_BUFSIZE]; mbbuf_t mbbuf; uintmax_t field_idx = 1; @@ -538,6 +576,197 @@ cut_fields_mb (FILE *stream) } } +/* Read from STREAM, printing to standard output any selected fields, + using runs of whitespace as the field delimiter. 
*/ + +static void +cut_fields_ws (FILE *stream) +{ + static char line_in[IO_BUFSIZE]; + mbbuf_t mbbuf; + uintmax_t field_idx = 1; + bool found_any_selected_field = false; + bool buffer_first_field; + bool have_pending_line = false; + bool have_saved = false; + mcel_t saved_g = { .ch = MBBUF_EOF }; + + current_rp = frp; + mbbuf_init (&mbbuf, line_in, sizeof line_in, stream); + + buffer_first_field = (suppress_non_delimited ^ !print_kth (1)); + + while (true) + { + if (field_idx == 1 && buffer_first_field) + { + size_t n_bytes = 0; + enum field_terminator terminator; + + while (true) + { + mcel_t g = mbbuf_get_saved_char (&mbbuf, &have_saved, &saved_g); + + if (g.ch == MBBUF_EOF) + { + if (n_bytes == 0) + return; + terminator = FIELD_EOF; + break; + } + + have_pending_line = true; + + if (g.ch == line_delim) + { + if (field_1_bufsize - n_bytes < g.len) + field_1_buffer = xpalloc (field_1_buffer, &field_1_bufsize, + g.len, -1, + sizeof *field_1_buffer); + memcpy (field_1_buffer + n_bytes, + mbbuf_char_offset (&mbbuf, g), g.len); + n_bytes += g.len; + terminator = FIELD_LINE_DELIMITER; + break; + } + + if (c32issep (g.ch)) + { + terminator = skip_whitespace_delim (&mbbuf, &have_saved, + &saved_g, + &have_pending_line); + break; + } + + if (field_1_bufsize - n_bytes < g.len) + field_1_buffer = xpalloc (field_1_buffer, &field_1_bufsize, + g.len, -1, + sizeof *field_1_buffer); + memcpy (field_1_buffer + n_bytes, mbbuf_char_offset (&mbbuf, g), + g.len); + n_bytes += g.len; + } + + if (terminator != FIELD_DELIMITER) + { + if (!suppress_non_delimited) + { + write_bytes (field_1_buffer, n_bytes); + if (terminator == FIELD_EOF) + { + if (putchar (line_delim) < 0) + write_error (); + } + } + + if (terminator == FIELD_EOF) + break; + + field_idx = 1; + current_rp = frp; + found_any_selected_field = false; + have_pending_line = false; + continue; + } + + if (print_kth (1)) + { + write_bytes (field_1_buffer, n_bytes); + found_any_selected_field = true; + } + next_item 
(&field_idx); + } + + enum field_terminator terminator; + + if (print_kth (field_idx)) + { + if (found_any_selected_field) + write_bytes (output_delimiter_string, output_delimiter_length); + found_any_selected_field = true; + + while (true) + { + mcel_t g = mbbuf_get_saved_char (&mbbuf, &have_saved, &saved_g); + + if (g.ch == MBBUF_EOF) + { + terminator = FIELD_EOF; + break; + } + + have_pending_line = true; + + if (g.ch == line_delim) + { + terminator = FIELD_LINE_DELIMITER; + break; + } + + if (c32issep (g.ch)) + { + terminator = skip_whitespace_delim (&mbbuf, &have_saved, + &saved_g, + &have_pending_line); + break; + } + + write_bytes (mbbuf_char_offset (&mbbuf, g), g.len); + } + } + else + { + while (true) + { + mcel_t g = mbbuf_get_saved_char (&mbbuf, &have_saved, &saved_g); + + if (g.ch == MBBUF_EOF) + { + terminator = FIELD_EOF; + break; + } + + have_pending_line = true; + + if (g.ch == line_delim) + { + terminator = FIELD_LINE_DELIMITER; + break; + } + + if (c32issep (g.ch)) + { + terminator = skip_whitespace_delim (&mbbuf, &have_saved, + &saved_g, + &have_pending_line); + break; + } + } + } + + if (terminator == FIELD_DELIMITER) + next_item (&field_idx); + else + { + if (terminator == FIELD_EOF && !have_pending_line) + break; + if (found_any_selected_field + || !(suppress_non_delimited && field_idx == 1)) + { + if (putchar (line_delim) < 0) + write_error (); + } + if (terminator == FIELD_EOF) + break; + + field_idx = 1; + current_rp = frp; + found_any_selected_field = false; + have_pending_line = false; + } + } +} + /* Read from stream STREAM, printing to standard output any selected fields. 
*/ static void @@ -749,7 +978,7 @@ main (int argc, char **argv) atexit (close_stdout); - while ((optc = getopt_long (argc, argv, "b:c:d:f:nsz", longopts, NULL)) + while ((optc = getopt_long (argc, argv, "b:c:d:f:nszw", longopts, NULL)) != -1) { switch (optc) @@ -800,6 +1029,10 @@ main (int argc, char **argv) delim_specified = true; break; + case 'w': + whitespace_delimited = true; + break; + case OUTPUT_DELIMITER_OPTION: /* Interpret --output-delimiter='' to mean 'use the NUL byte as the delimiter.' */ @@ -835,7 +1068,7 @@ main (int argc, char **argv) if (cut_mode == CUT_MODE_BYTES || cut_mode == CUT_MODE_CHARACTERS) { - if (delim_specified) + if (delim_specified || whitespace_delimited) FATAL_ERROR (_("an input delimiter may be specified only\ when operating on fields")); @@ -844,6 +1077,9 @@ main (int argc, char **argv) \tonly when operating on fields")); } + if (delim_specified && whitespace_delimited) + FATAL_ERROR (_("-d and -w are mutually exclusive")); + set_fields (spec_list_string, (((cut_mode == CUT_MODE_BYTES || cut_mode == CUT_MODE_CHARACTERS) @@ -880,7 +1116,9 @@ main (int argc, char **argv) break; case CUT_MODE_FIELDS: - cut_stream = single_byte_field_delim_ok () ? cut_fields : cut_fields_mb; + cut_stream = whitespace_delimited ? cut_fields_ws + : single_byte_field_delim_ok () ? cut_fields + : cut_fields_mb; break; } affirm (cut_stream); diff --git a/tests/cut/cut.pl b/tests/cut/cut.pl index 3906b4fcfd..0e378ed247 100755 --- a/tests/cut/cut.pl +++ b/tests/cut/cut.pl @@ -36,6 +36,7 @@ my $inval_pos = "$prog: invalid byte or character range\n$try"; my $no_endpoint = "$prog: invalid range with no endpoint: -\n$try"; my $nofield = "$prog: an input delimiter may be specified only when " . 
"operating on fields\n$try"; +my $mutual_dw = "$prog: -d and -w are mutually exclusive\n$try"; my @Tests = ( @@ -134,6 +135,13 @@ my @Tests = ['8bit-delim', '-d', "\255", '--out=_', '-f2,3', {IN=>"a\255b\255c\n"}, {OUT=>"b_c\n"}], + ['w-delim-1', '-w', '-f2,3', {IN=>"a\tb c\n"}, {OUT=>"b\tc\n"}], + ['w-delim-2', '-w', '-f1,2', {IN=>" a b\n"}, {OUT=>"\ta\n"}], + ['w-delim-3', '-s', '-w', '-f2', {IN=>"abc\n"}, {OUT=>""}], + ['w-delim-4', '-s', '-w', '-f1', {IN=>"a b c\n"}, {OUT=>"a\n"}], + ['w-delim-5', '-w', '-d:', '-f1', {EXIT=>1}, {ERR=>$mutual_dw}], + ['w-delim-6', '-w', '-f1,2', {IN=>"a \n"}, {OUT=>"a\t\n"}], + # newline processing for fields ['newline-1', '-f1-', {IN=>"a\nb"}, {OUT=>"a\nb\n"}], ['newline-2', '-f1-', {IN=>""}, {OUT=>""}], @@ -266,6 +274,10 @@ if ($mb_locale ne 'C') {ENV => "LC_ALL=$mb_locale"}], ['mb-delim-3', '-s', '-d', "\xc3\xa9", '-f2', {IN=>"abc\n"}, {OUT=>""}, + {ENV => "LC_ALL=$mb_locale"}], + ['mb-w-delim-1', '-w', '-f2', {IN=>"a\xe2\x80\x83b\n"}, {OUT=>"b\n"}, + {ENV => "LC_ALL=$mb_locale"}], + ['mb-w-delim-2', '-sw', '-f2', {IN=>"a\xc2\xa0b\n"}, {OUT=>""}, {ENV => "LC_ALL=$mb_locale"}]; }