From: Pádraig Brady
Date: Thu, 12 Mar 2026 18:58:46 +0000 (+0000) Subject: cut: refactor multi-byte updates X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f644b4ca53d875e6ff62a50b8d02dc2b4e421b76;p=thirdparty%2Fcoreutils.git cut: refactor multi-byte updates * src/cut.c: 160 fewer lines Helpers extracted (replacing repeated inline patterns): - write_line_delim(), write_pending_line_delim(), reset_item_line() - line boundary code used by cut_bytes{,_no_split}, cut_characters - write_selected_item() - output-delimiter + write logic used by all three byte/char functions - reset_field_line() - field line reset used by cut_fields_mb_any Field functions unified via cut_fields_mb_any(stream, whitespace_mode): - struct mbfield_parser encapsulates the whitespace vs. fixed-delimiter state (saved char, mode flag) - mbfield_get_char() - dispatches to saved-char or direct read - mbfield_terminator() - returns FIELD_{DATA,DELIMITER,LINE_DELIMITER} based on mode - read_mb_field_to_buffer() - replaces the two duplicated first-field buffering loops - scan_mb_field(mbbuf, parser, pending, write_field) - replaces the four duplicated field scan loops (print+skip × two modes) with a single function and a write_field bool - cut_fields_mb and cut_fields_ws are now trivial wrappers --- diff --git a/src/cut.c b/src/cut.c index 613e2a9ea1..507d1ba074 100644 --- a/src/cut.c +++ b/src/cut.c @@ -273,11 +273,19 @@ field_delim_eq (mcel_t g) enum field_terminator { + FIELD_DATA, FIELD_DELIMITER, FIELD_LINE_DELIMITER, FIELD_EOF }; +struct mbfield_parser +{ + bool whitespace_delimited; + bool have_saved; + mcel_t saved_g; +}; + static inline mcel_t mbbuf_get_saved_char (mbbuf_t *mbbuf, bool *have_saved, mcel_t *saved_g) { @@ -315,6 +323,133 @@ write_bytes (char const *buf, size_t n_bytes) write_error (); } +static inline void +write_line_delim (void) +{ + if (putchar (line_delim) < 0) + write_error (); +} + +static inline void +reset_item_line (uintmax_t *item_idx, bool *print_delimiter) +{ + 
write_line_delim (); + *item_idx = 0; + *print_delimiter = false; + current_rp = frp; +} + +static inline void +write_pending_line_delim (uintmax_t item_idx) +{ + if (item_idx > 0) + write_line_delim (); +} + +static inline void +write_selected_item (bool *print_delimiter, bool range_start, + char const *buf, size_t n_bytes) +{ + if (output_delimiter_string != output_delimiter_default) + { + if (*print_delimiter && range_start) + write_bytes (output_delimiter_string, output_delimiter_length); + *print_delimiter = true; + } + + write_bytes (buf, n_bytes); +} + +static inline mcel_t +mbfield_get_char (mbbuf_t *mbbuf, struct mbfield_parser *parser) +{ + return (parser->whitespace_delimited + ? mbbuf_get_saved_char (mbbuf, &parser->have_saved, &parser->saved_g) + : mbbuf_get_char (mbbuf)); +} + +static inline enum field_terminator +mbfield_terminator (mbbuf_t *mbbuf, struct mbfield_parser *parser, mcel_t g, + bool *have_pending_line) +{ + if (g.ch == line_delim) + return FIELD_LINE_DELIMITER; + + if (parser->whitespace_delimited) + return (c32issep (g.ch) + ? skip_whitespace_delim (mbbuf, &parser->have_saved, + &parser->saved_g, have_pending_line) + : FIELD_DATA); + + return field_delim_eq (g) ? 
FIELD_DELIMITER : FIELD_DATA; +} + +static inline void +append_field_1_bytes (mbbuf_t *mbbuf, mcel_t g, size_t *n_bytes) +{ + if (field_1_bufsize - *n_bytes < g.len) + { + field_1_buffer = xpalloc (field_1_buffer, &field_1_bufsize, + g.len, -1, sizeof *field_1_buffer); + } + + memcpy (field_1_buffer + *n_bytes, mbbuf_char_offset (mbbuf, g), g.len); + *n_bytes += g.len; +} + +static enum field_terminator +read_mb_field_to_buffer (mbbuf_t *mbbuf, struct mbfield_parser *parser, + bool *have_pending_line, size_t *n_bytes) +{ + while (true) + { + mcel_t g = mbfield_get_char (mbbuf, parser); + if (g.ch == MBBUF_EOF) + return FIELD_EOF; + + *have_pending_line = true; + + enum field_terminator terminator + = mbfield_terminator (mbbuf, parser, g, have_pending_line); + if (terminator != FIELD_DATA) + return terminator; + + append_field_1_bytes (mbbuf, g, n_bytes); + } +} + +static enum field_terminator +scan_mb_field (mbbuf_t *mbbuf, struct mbfield_parser *parser, + bool *have_pending_line, bool write_field) +{ + while (true) + { + mcel_t g = mbfield_get_char (mbbuf, parser); + if (g.ch == MBBUF_EOF) + return FIELD_EOF; + + *have_pending_line = true; + + enum field_terminator terminator + = mbfield_terminator (mbbuf, parser, g, have_pending_line); + if (terminator != FIELD_DATA) + return terminator; + + if (write_field) + write_bytes (mbbuf_char_offset (mbbuf, g), g.len); + } +} + +static inline void +reset_field_line (uintmax_t *field_idx, bool *found_any_selected_field, + bool *have_pending_line) +{ + *field_idx = 1; + current_rp = frp; + *found_any_selected_field = false; + *have_pending_line = false; +} + /* Read from stream STREAM, printing to standard output any selected bytes. 
*/ static void @@ -335,20 +470,10 @@ cut_bytes (FILE *stream) c = getc (stream); if (c == line_delim) - { - if (putchar (c) < 0) - write_error (); - byte_idx = 0; - print_delimiter = false; - current_rp = frp; - } + reset_item_line (&byte_idx, &print_delimiter); else if (c == EOF) { - if (byte_idx > 0) - { - if (putchar (line_delim) < 0) - write_error (); - } + write_pending_line_delim (byte_idx); break; } else @@ -356,20 +481,9 @@ cut_bytes (FILE *stream) next_item (&byte_idx); if (print_kth (byte_idx)) { - if (output_delimiter_string != output_delimiter_default) - { - if (print_delimiter && is_range_start_index (byte_idx)) - { - if (fwrite (output_delimiter_string, sizeof (char), - output_delimiter_length, stdout) - != output_delimiter_length) - write_error (); - } - print_delimiter = true; - } - - if (putchar (c) < 0) - write_error (); + char ch = c; + write_selected_item (&print_delimiter, + is_range_start_index (byte_idx), &ch, 1); } } } @@ -394,20 +508,10 @@ cut_bytes_no_split (FILE *stream) mcel_t g = mbbuf_get_char (&mbbuf); if (g.ch == line_delim) - { - if (putchar (line_delim) < 0) - write_error (); - byte_idx = 0; - print_delimiter = false; - current_rp = frp; - } + reset_item_line (&byte_idx, &print_delimiter); else if (g.ch == MBBUF_EOF) { - if (byte_idx > 0) - { - if (putchar (line_delim) < 0) - write_error (); - } + write_pending_line_delim (byte_idx); break; } else @@ -433,17 +537,8 @@ cut_bytes_no_split (FILE *stream) } if (seen_selected && suffix_selected) - { - if (output_delimiter_string != output_delimiter_default) - { - if (print_delimiter && first_selected_is_range_start) - write_bytes (output_delimiter_string, - output_delimiter_length); - print_delimiter = true; - } - - write_bytes (mbbuf_char_offset (&mbbuf, g), g.len); - } + write_selected_item (&print_delimiter,first_selected_is_range_start, + mbbuf_char_offset (&mbbuf, g), g.len); } } } @@ -466,207 +561,40 @@ cut_characters (FILE *stream) mcel_t g = mbbuf_get_char (&mbbuf); if (g.ch == 
line_delim) - { - if (putchar (line_delim) < 0) - write_error (); - char_idx = 0; - print_delimiter = false; - current_rp = frp; - } + reset_item_line (&char_idx, &print_delimiter); else if (g.ch == MBBUF_EOF) { - if (char_idx > 0) - { - if (putchar (line_delim) < 0) - write_error (); - } + write_pending_line_delim (char_idx); break; } else { next_item (&char_idx); if (print_kth (char_idx)) - { - if (output_delimiter_string != output_delimiter_default) - { - if (print_delimiter && is_range_start_index (char_idx)) - { - if (fwrite (output_delimiter_string, sizeof (char), - output_delimiter_length, stdout) - != output_delimiter_length) - write_error (); - } - print_delimiter = true; - } - - if (fwrite (mbbuf_char_offset (&mbbuf, g), sizeof (char), g.len, - stdout) - != g.len) - write_error (); - } + write_selected_item (&print_delimiter, + is_range_start_index (char_idx), + mbbuf_char_offset (&mbbuf, g), g.len); } } } /* Read from STREAM, printing to standard output any selected fields, - using a multibyte field delimiter. */ + using a multibyte-aware field delimiter parser. 
*/ static void -cut_fields_mb (FILE *stream) +cut_fields_mb_any (FILE *stream, bool whitespace_mode) { static char line_in[IO_BUFSIZE]; mbbuf_t mbbuf; - uintmax_t field_idx = 1; - bool found_any_selected_field = false; - bool buffer_first_field; - bool have_pending_line = false; - - current_rp = frp; - mbbuf_init (&mbbuf, line_in, sizeof line_in, stream); - - buffer_first_field = (suppress_non_delimited ^ !print_kth (1)); - - while (true) + struct mbfield_parser parser = { - if (field_idx == 1 && buffer_first_field) - { - size_t n_bytes = 0; - enum field_terminator terminator; - - while (true) - { - mcel_t g = mbbuf_get_char (&mbbuf); - - if (g.ch == MBBUF_EOF) - { - if (n_bytes == 0) - return; - terminator = FIELD_EOF; - break; - } - - if (field_1_bufsize - n_bytes < g.len) - { - field_1_buffer = xpalloc (field_1_buffer, &field_1_bufsize, - g.len, -1, - sizeof *field_1_buffer); - } - - memcpy (field_1_buffer + n_bytes, mbbuf_char_offset (&mbbuf, g), - g.len); - n_bytes += g.len; - have_pending_line = true; - - if (g.ch == line_delim) - { - terminator = FIELD_LINE_DELIMITER; - break; - } - - if (field_delim_eq (g)) - { - terminator = FIELD_DELIMITER; - break; - } - } - - if (terminator != FIELD_DELIMITER) - { - if (!suppress_non_delimited) - { - write_bytes (field_1_buffer, n_bytes); - if (terminator == FIELD_EOF) - { - if (putchar (line_delim) < 0) - write_error (); - } - } - - if (terminator == FIELD_EOF) - break; - - field_idx = 1; - current_rp = frp; - found_any_selected_field = false; - have_pending_line = false; - continue; - } - - if (print_kth (1)) - { - write_bytes (field_1_buffer, n_bytes - delim_length); - found_any_selected_field = true; - } - next_item (&field_idx); - } - - mcel_t g; - - if (print_kth (field_idx)) - { - if (found_any_selected_field) - write_bytes (output_delimiter_string, output_delimiter_length); - found_any_selected_field = true; - - while (true) - { - g = mbbuf_get_char (&mbbuf); - if (g.ch != MBBUF_EOF) - have_pending_line = true; 
- if (g.ch == MBBUF_EOF || g.ch == line_delim || field_delim_eq (g)) - break; - write_bytes (mbbuf_char_offset (&mbbuf, g), g.len); - } - } - else - { - while (true) - { - g = mbbuf_get_char (&mbbuf); - if (g.ch != MBBUF_EOF) - have_pending_line = true; - if (g.ch == MBBUF_EOF || g.ch == line_delim || field_delim_eq (g)) - break; - } - } - - if (field_delim_eq (g)) - next_item (&field_idx); - else if (g.ch == line_delim || g.ch == MBBUF_EOF) - { - if (g.ch == MBBUF_EOF && !have_pending_line) - break; - if (found_any_selected_field - || !(suppress_non_delimited && field_idx == 1)) - { - if (putchar (line_delim) < 0) - write_error (); - } - if (g.ch == MBBUF_EOF) - break; - - field_idx = 1; - current_rp = frp; - found_any_selected_field = false; - have_pending_line = false; - } - } -} - -/* Read from STREAM, printing to standard output any selected fields, - using runs of whitespace as the field delimiter. */ - -static void -cut_fields_ws (FILE *stream) -{ - static char line_in[IO_BUFSIZE]; - mbbuf_t mbbuf; + .whitespace_delimited = whitespace_mode, + .saved_g = { .ch = MBBUF_EOF } + }; uintmax_t field_idx = 1; bool found_any_selected_field = false; bool buffer_first_field; bool have_pending_line = false; - bool have_saved = false; - mcel_t saved_g = { .ch = MBBUF_EOF }; current_rp = frp; mbbuf_init (&mbbuf, line_in, sizeof line_in, stream); @@ -678,71 +606,25 @@ cut_fields_ws (FILE *stream) if (field_idx == 1 && buffer_first_field) { size_t n_bytes = 0; - enum field_terminator terminator; - - while (true) - { - mcel_t g = mbbuf_get_saved_char (&mbbuf, &have_saved, &saved_g); - - if (g.ch == MBBUF_EOF) - { - if (n_bytes == 0) - return; - terminator = FIELD_EOF; - break; - } - - have_pending_line = true; - - if (g.ch == line_delim) - { - if (field_1_bufsize - n_bytes < g.len) - field_1_buffer = xpalloc (field_1_buffer, &field_1_bufsize, - g.len, -1, - sizeof *field_1_buffer); - memcpy (field_1_buffer + n_bytes, - mbbuf_char_offset (&mbbuf, g), g.len); - n_bytes += 
g.len; - terminator = FIELD_LINE_DELIMITER; - break; - } - - if (c32issep (g.ch)) - { - terminator = skip_whitespace_delim (&mbbuf, &have_saved, - &saved_g, - &have_pending_line); - break; - } - - if (field_1_bufsize - n_bytes < g.len) - field_1_buffer = xpalloc (field_1_buffer, &field_1_bufsize, - g.len, -1, - sizeof *field_1_buffer); - memcpy (field_1_buffer + n_bytes, mbbuf_char_offset (&mbbuf, g), - g.len); - n_bytes += g.len; - } + enum field_terminator terminator + = read_mb_field_to_buffer (&mbbuf, &parser, &have_pending_line, + &n_bytes); + if (terminator == FIELD_EOF && n_bytes == 0) + return; if (terminator != FIELD_DELIMITER) { if (!suppress_non_delimited) { write_bytes (field_1_buffer, n_bytes); - if (terminator == FIELD_EOF) - { - if (putchar (line_delim) < 0) - write_error (); - } + write_line_delim (); } if (terminator == FIELD_EOF) break; - field_idx = 1; - current_rp = frp; - found_any_selected_field = false; - have_pending_line = false; + reset_field_line (&field_idx, &found_any_selected_field, + &have_pending_line); continue; } @@ -755,71 +637,17 @@ cut_fields_ws (FILE *stream) } enum field_terminator terminator; + bool write_field = print_kth (field_idx); - if (print_kth (field_idx)) + if (write_field) { if (found_any_selected_field) write_bytes (output_delimiter_string, output_delimiter_length); found_any_selected_field = true; - - while (true) - { - mcel_t g = mbbuf_get_saved_char (&mbbuf, &have_saved, &saved_g); - - if (g.ch == MBBUF_EOF) - { - terminator = FIELD_EOF; - break; - } - - have_pending_line = true; - - if (g.ch == line_delim) - { - terminator = FIELD_LINE_DELIMITER; - break; - } - - if (c32issep (g.ch)) - { - terminator = skip_whitespace_delim (&mbbuf, &have_saved, - &saved_g, - &have_pending_line); - break; - } - - write_bytes (mbbuf_char_offset (&mbbuf, g), g.len); - } } - else - { - while (true) - { - mcel_t g = mbbuf_get_saved_char (&mbbuf, &have_saved, &saved_g); - if (g.ch == MBBUF_EOF) - { - terminator = FIELD_EOF; - break; 
- } - - have_pending_line = true; - - if (g.ch == line_delim) - { - terminator = FIELD_LINE_DELIMITER; - break; - } - - if (c32issep (g.ch)) - { - terminator = skip_whitespace_delim (&mbbuf, &have_saved, - &saved_g, - &have_pending_line); - break; - } - } - } + terminator = scan_mb_field (&mbbuf, &parser, &have_pending_line, + write_field); if (terminator == FIELD_DELIMITER) next_item (&field_idx); @@ -829,21 +657,34 @@ cut_fields_ws (FILE *stream) break; if (found_any_selected_field || !(suppress_non_delimited && field_idx == 1)) - { - if (putchar (line_delim) < 0) - write_error (); - } + write_line_delim (); if (terminator == FIELD_EOF) break; - field_idx = 1; - current_rp = frp; - found_any_selected_field = false; - have_pending_line = false; + reset_field_line (&field_idx, &found_any_selected_field, + &have_pending_line); } } } +/* Read from STREAM, printing to standard output any selected fields, + using a multibyte field delimiter. */ + +static void +cut_fields_mb (FILE *stream) +{ + cut_fields_mb_any (stream, false); +} + +/* Read from STREAM, printing to standard output any selected fields, + using runs of whitespace as the field delimiter. */ + +static void +cut_fields_ws (FILE *stream) +{ + cut_fields_mb_any (stream, true); +} + /* Read from stream STREAM, printing to standard output any selected fields. */ static void