From: Pádraig Brady Date: Fri, 13 Mar 2026 14:57:42 +0000 (+0000) Subject: cut: support --whitespace-delimited=trimmed X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=eb1f0577464b22dc13735db666e99aeb0005f989;p=thirdparty%2Fcoreutils.git cut: support --whitespace-delimited=trimmed Support ignoring leading and trailing whitespace. E.g. this matches awk's default field splitting mode. * src/cut.c * tests/cut/cut.pl: Add test cases. --- diff --git a/src/cut.c b/src/cut.c index bfa8305f52..9faf8a872e 100644 --- a/src/cut.c +++ b/src/cut.c @@ -29,6 +29,7 @@ #include #include "system.h" +#include "argmatch.h" #include "assure.h" #include "fadvise.h" #include "getndelim2.h" @@ -115,6 +116,26 @@ static bool no_split; /* If true, interpret each run of whitespace as one field delimiter. */ static bool whitespace_delimited; +/* If true, ignore leading and trailing whitespace in -w mode. */ +static bool trim_outer_whitespace; + +enum whitespace_option +{ + WHITESPACE_OPTION_TRIMMED +}; + +static char const *const whitespace_option_args[] = +{ + "trimmed", NULL +}; + +static enum whitespace_option const whitespace_option_types[] = +{ + WHITESPACE_OPTION_TRIMMED +}; + +ARGMATCH_VERIFY (whitespace_option_args, whitespace_option_types); + /* Whether to cut bytes, characters, or fields. */ static enum { @@ -138,7 +159,7 @@ static struct option const longopts[] = {"fields", required_argument, NULL, 'f'}, {"delimiter", required_argument, NULL, 'd'}, {"no-partial", no_argument, NULL, 'n'}, - {"whitespace-delimited", no_argument, NULL, 'w'}, + {"whitespace-delimited", optional_argument, NULL, 'w'}, {"only-delimited", no_argument, NULL, 's'}, {"output-delimiter", required_argument, NULL, 'O'}, {"complement", no_argument, NULL, COMPLEMENT_OPTION}, @@ -201,8 +222,9 @@ Print selected parts of lines from each FILE to standard output.\n\ do not print lines not containing delimiters\n\ ")); oputs (_("\ - -w, --whitespace-delimited\n\ - use runs of whitespace as the field delimiter\n\ + -w, --whitespace-delimited[=trimmed]\n\ + use runs of whitespace as the field delimiter;\n\ + with 'trimmed', ignore leading and trailing whitespace\n\ ")); oputs (_("\ -z, --zero-terminated\n\ @@ -281,6 +303,8 @@ enum field_terminator struct mbfield_parser { bool whitespace_delimited; + bool trim_outer_whitespace; + bool at_line_start; bool have_saved; mcel_t saved_g; }; @@ -297,8 +321,9 @@ mbbuf_get_saved_char (mbbuf_t *mbbuf, bool *have_saved, mcel_t *saved_g) } static inline enum field_terminator -skip_whitespace_delim (mbbuf_t *mbuf, bool *have_saved, mcel_t *saved_g, - bool *have_pending_line) +skip_whitespace_run (mbbuf_t *mbuf, struct mbfield_parser *parser, + bool *have_pending_line, + bool have_initial_whitespace) { mcel_t g; @@ -310,9 +335,17 @@ skip_whitespace_delim (mbbuf_t *mbuf, bool *have_saved, mcel_t *saved_g, } while (g.ch != MBBUF_EOF && g.ch != line_delim && c32issep (g.ch)); - *saved_g = g; - *have_saved = true; - return FIELD_DELIMITER; + bool trim_start = parser->trim_outer_whitespace && parser->at_line_start; + + if (parser->trim_outer_whitespace + && (g.ch == MBBUF_EOF || g.ch == line_delim)) + { + return g.ch == MBBUF_EOF ? FIELD_EOF : FIELD_LINE_DELIMITER; + } + + parser->saved_g = g; + parser->have_saved = true; + return trim_start && !have_initial_whitespace ? FIELD_DATA : FIELD_DELIMITER; } static void @@ -376,8 +409,7 @@ mbfield_terminator (mbbuf_t *mbbuf, struct mbfield_parser *parser, mcel_t g, if (parser->whitespace_delimited) return (c32issep (g.ch) - ? skip_whitespace_delim (mbbuf, &parser->have_saved, - &parser->saved_g, have_pending_line) + ? skip_whitespace_run (mbbuf, parser, have_pending_line, true) : FIELD_DATA); return field_delim_eq (g) ? FIELD_DELIMITER : FIELD_DATA; @@ -400,6 +432,18 @@ static enum field_terminator read_mb_field_to_buffer (mbbuf_t *mbbuf, struct mbfield_parser *parser, bool *have_pending_line, size_t *n_bytes) { + if (parser->whitespace_delimited + && parser->trim_outer_whitespace + && parser->at_line_start) + { + enum field_terminator terminator + = skip_whitespace_run (mbbuf, parser, have_pending_line, false); + if (terminator != FIELD_DATA) + return terminator; + } + + parser->at_line_start = false; + while (true) { mcel_t g = mbfield_get_char (mbbuf, parser); @@ -421,6 +465,18 @@ static enum field_terminator scan_mb_field (mbbuf_t *mbbuf, struct mbfield_parser *parser, bool *have_pending_line, bool write_field) { + if (parser->whitespace_delimited + && parser->trim_outer_whitespace + && parser->at_line_start) + { + enum field_terminator terminator + = skip_whitespace_run (mbbuf, parser, have_pending_line, false); + if (terminator != FIELD_DATA) + return terminator; + } + + parser->at_line_start = false; + while (true) { mcel_t g = mbfield_get_char (mbbuf, parser); @@ -441,12 +497,14 @@ scan_mb_field (mbbuf_t *mbbuf, struct mbfield_parser *parser, static inline void reset_field_line (uintmax_t *field_idx, bool *found_any_selected_field, - bool *have_pending_line) + bool *have_pending_line, struct mbfield_parser *parser) { *field_idx = 1; current_rp = frp; *found_any_selected_field = false; *have_pending_line = false; + parser->have_saved = false; + parser->at_line_start = true; } /* Read from stream STREAM, printing to standard output any selected bytes. */ @@ -588,6 +646,8 @@ cut_fields_mb_any (FILE *stream, bool whitespace_mode) struct mbfield_parser parser = { .whitespace_delimited = whitespace_mode, + .trim_outer_whitespace = trim_outer_whitespace, + .at_line_start = true, .saved_g = { .ch = MBBUF_EOF } }; uintmax_t field_idx = 1; @@ -623,7 +683,7 @@ cut_fields_mb_any (FILE *stream, bool whitespace_mode) break; reset_field_line (&field_idx, &found_any_selected_field, - &have_pending_line); + &have_pending_line, &parser); continue; } @@ -661,7 +721,7 @@ cut_fields_mb_any (FILE *stream, bool whitespace_mode) break; reset_field_line (&field_idx, &found_any_selected_field, - &have_pending_line); + &have_pending_line, &parser); } } } @@ -746,12 +806,12 @@ cut_fields (FILE *stream) { /* Empty. */ } - else - { - write_bytes (field_1_buffer, n_bytes); - /* Make sure the output line is newline terminated. */ - if (field_1_buffer[n_bytes - 1] != line_delim) - { + else + { + write_bytes (field_1_buffer, n_bytes); + /* Make sure the output line is newline terminated. */ + if (field_1_buffer[n_bytes - 1] != line_delim) + { if (putchar (line_delim) < 0) write_error (); } @@ -760,13 +820,13 @@ cut_fields (FILE *stream) continue; } - if (print_kth (1)) - { - /* Print the field, but not the trailing delimiter. */ - write_bytes (field_1_buffer, n_bytes - 1); + if (print_kth (1)) + { + /* Print the field, but not the trailing delimiter. */ + write_bytes (field_1_buffer, n_bytes - 1); - /* With -d$'\n' don't treat the last '\n' as a delimiter. */ - if (delim == line_delim) + /* With -d$'\n' don't treat the last '\n' as a delim. */ + if (delim == line_delim) { int last_c = getc (stream); if (last_c != EOF) @@ -948,6 +1008,12 @@ main (int argc, char **argv) case 'w': whitespace_delimited = true; + trim_outer_whitespace + = (optarg + && XARGMATCH ("--whitespace-delimited", optarg, + whitespace_option_args, + whitespace_option_types) + == WHITESPACE_OPTION_TRIMMED); break; case 'O': diff --git a/tests/cut/cut.pl b/tests/cut/cut.pl index 893386a7fe..6ce8bc46ee 100755 --- a/tests/cut/cut.pl +++ b/tests/cut/cut.pl @@ -140,6 +140,12 @@ my @Tests = ['w-delim-4', '-s', '-w', '-f1', {IN=>"a b c\n"}, {OUT=>"a\n"}], ['w-delim-5', '-w', '-d:', '-f1', {EXIT=>1}, {ERR=>$mutual_dw}], ['w-delim-6', '-w', '-f1,2', {IN=>"a \n"}, {OUT=>"a\t\n"}], + ['w-delim-7', '--whitespace-delimited', '-f1,2', + {IN=>" a b\n"}, {OUT=>"\ta\n"}], + ['w-trim-1', '--whitespace-delimited=trimmed', '-f1,2', + {IN=>" a b \n"}, {OUT=>"a\tb\n"}], + ['w-trim-2', '-s', '--whitespace-delimited=trimmed', '-f1', + {IN=>" a \n"}, {OUT=>""}], # newline processing for fields ['newline-1', '-f1-', {IN=>"a\nb"}, {OUT=>"a\nb\n"}],