From: Pádraig Brady
Date: Mon, 16 Mar 2026 11:11:59 +0000 (+0000)
Subject: cut: faster utf8 processing
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=801686242edcea3ebe607cebe49a964fe34e032a;p=thirdparty%2Fcoreutils.git

cut: faster utf8 processing

TODO: improve to use bounded memory where possible
---

diff --git a/src/cut.c b/src/cut.c
index 80b4aa84b3..93ccad6a72 100644
--- a/src/cut.c
+++ b/src/cut.c
@@ -122,6 +122,12 @@ static bool trim_outer_whitespace;
 /* If true, default the output delimiter to a single space.  */
 static bool space_output_delimiter_default;
 
+/* Record buffer for UTF-8 field splitting; filled via getndelim2.  */
+static char *line_buffer;
+
+/* The number of bytes allocated for LINE_BUFFER.  */
+static idx_t line_bufsize;
+
 enum whitespace_option
 {
   WHITESPACE_OPTION_TRIMMED
@@ -293,6 +299,26 @@ single_byte_field_delim_ok (void)
          && (MB_CUR_MAX <= 1 || to_uchar (delim_bytes[0]) < 0x30);
 }
 
+/* Return true if the current charset is UTF-8; probed once, then cached.  */
+static bool
+is_utf8_charset (void)
+{
+  static int is_utf8 = -1;  /* -1: not yet probed; otherwise cached 0 or 1.  */
+  if (is_utf8 == -1)
+    {  /* Probe by decoding the 3-byte UTF-8 sequence for U+27F8.  */
+      char32_t w;
+      mbstate_t mbs = {0};
+      is_utf8 = mbrtoc32 (&w, "\xe2\x9f\xb8", 3, &mbs) == 3 && w == 0x27F8;
+    }
+  return is_utf8;
+}
+
+static inline bool
+utf8_field_delim_ok (void)
+{  /* True when delim_mcel has no error and the charset is UTF-8.  */
+  return ! delim_mcel.err && is_utf8_charset ();
+}
+
 static inline bool
 field_delim_eq (mcel_t g)
 {
@@ -471,6 +497,44 @@ scan_mb_field (mbbuf_t *mbbuf, struct mbfield_parser *parser,
     }
 }
 
+/* Return a pointer to the first field delimiter within the LEN bytes of the
+   UTF-8 record at BUF, or NULL if there is none.  DELIM_BYTES must be a valid
+   UTF-8 character; the strstr fallback also needs BUF to be NUL terminated.
+   Like mbsmbchr() in numfmt but handles embedded NUL bytes.  */
+ATTRIBUTE_PURE
+static char *
+find_utf8_field_delim (char const *buf, size_t len)
+{
+  unsigned char delim_0 = delim_bytes[0];  /* Lead byte of the delimiter.  */
+  if (delim_0 < 0x80)  /* Single-byte delimiter: plain byte search.  */
+    return memchr ((void *) buf, delim_0, len);
+
+#if MEMMEM_IS_FASTER /* Surprisingly not on glibc-2.42 */
+  return memmem (buf, len, delim_bytes, delim_length);
+#else /* Search each NUL-separated segment with strstr.  */
+  char const *p = buf;
+  char const *end = buf + len;
+
+  while (p < end)
+    {
+      char const *nul = memchr (p, '\0', end - p);  /* Next embedded NUL.  */
+      if (!nul)  /* No NUL before END: strstr can scan the whole tail.  */
+        return (char *) strstr (p, delim_bytes);
+
+      if (p < nul)  /* Non-empty segment precedes the NUL.  */
+        {
+          char *match = strstr (p, delim_bytes);
+          if (match)
+            return match;
+        }
+
+      p = nul + 1;  /* Resume just past the NUL.  */
+    }
+
+  return NULL;
+#endif
+}
+
 static inline void
 reset_field_line (uintmax_t *field_idx, bool *found_any_selected_field,
                   bool *have_pending_line, struct mbfield_parser *parser)
@@ -711,6 +775,72 @@ cut_fields_mb (FILE *stream)
   cut_fields_mb_any (stream, false);
 }
 
+/* Read whole records from STREAM into LINE_BUFFER, printing any selected
+   fields to stdout; delimiters are found by UTF-8-aware byte search.  */
+
+static void
+cut_fields_mb_utf8 (FILE *stream)
+{
+  while (true)
+    {
+      size_t line_bufsize_s = line_bufsize;  /* size_t copy for getndelim2.  */
+      ssize_t len = getndelim2 (&line_buffer, &line_bufsize_s, 0,
+                                GETNLINE_NO_LIMIT, line_delim, EOF, stream);
+      line_bufsize = line_bufsize_s;
+
+      if (len < 0)  /* Negative result: no further record to process.  */
+        return;
+
+      size_t n_bytes = len;
+      bool have_line_delim = 0 < n_bytes && line_buffer[n_bytes - 1] == line_delim;
+      size_t data_len = n_bytes - have_line_delim;  /* Sans LINE_DELIM.  */
+      char *field_start = line_buffer;
+      char *field_end = find_utf8_field_delim (field_start, data_len);
+
+      if (!field_end)  /* No delimiter anywhere in this record.  */
+        {
+          if (!suppress_non_delimited)
+            {
+              write_bytes (line_buffer, data_len);
+              write_line_delim ();
+            }
+          continue;
+        }
+
+      uintmax_t field_idx = 1;
+      bool found_any_selected_field = false;
+      current_rp = frp;
+
+      while (true)
+        {  /* FIELD_END is NULL for the final field, which runs to DATA_LEN.  */
+          size_t field_len = (field_end
+                              ? field_end - field_start
+                              : line_buffer + data_len - field_start);
+
+          if (print_kth (field_idx))
+            {
+              if (found_any_selected_field)  /* Separate from prior output.  */
+                write_bytes (output_delimiter_string, output_delimiter_length);
+              write_bytes (field_start, field_len);
+              found_any_selected_field = true;
+            }
+
+          if (!field_end)  /* That was the last field of the record.  */
+            break;
+
+          next_item (&field_idx);
+          field_start = field_end + delim_length;  /* Skip delimiter.  */
+          field_end = find_utf8_field_delim (field_start,
+                                             line_buffer + data_len
+                                             - field_start);
+        }
+
+      if (found_any_selected_field
+          || !(suppress_non_delimited && field_idx == 1))
+        write_line_delim ();
+    }
+}
+
 /* Read from STREAM, printing to standard output any selected fields,
    using runs of whitespace as the field delimiter.  */
 
@@ -1098,6 +1228,7 @@ main (int argc, char **argv)
     case CUT_MODE_FIELDS:
       cut_stream = whitespace_delimited ? cut_fields_ws
                    : single_byte_field_delim_ok () ? cut_fields
+                   : utf8_field_delim_ok () ? cut_fields_mb_utf8
                    : cut_fields_mb;
       break;
     }
diff --git a/tests/cut/cut.pl b/tests/cut/cut.pl
index f550f8f307..4767da5f09 100755
--- a/tests/cut/cut.pl
+++ b/tests/cut/cut.pl
@@ -315,6 +315,12 @@ if ($mb_locale ne 'C')
       ['mb-delim-5', '-d', "\xa9", '-f2',  # Different from coreutils-i18n
        {IN=>"A\xc3\xa9B\xa9C\n"}, {OUT=>"C\n"}, # (we don't split valid chars)
        {ENV => "LC_ALL=$mb_locale"}],
+      ['mb-delim-6', '-d', "\xc3\xa9", '-f1,3',
+       {IN=>"a\xc3\xa9b\xc3\xa9c"}, {OUT=>"a\xc3\xa9c\n"},
+       {ENV => "LC_ALL=$mb_locale"}],
+      ['mb-delim-7', '-d', "\xc3\xa9", '-f2',
+       {IN=>"a\0b\xc3\xa9c\n"}, {OUT=>"c\n"},
+       {ENV => "LC_ALL=$mb_locale"}],
       ['mb-w-delim-1', '-w', '-f2', {IN=>"a\xe2\x80\x83b\n"}, {OUT=>"b\n"},
        {ENV => "LC_ALL=$mb_locale"}],
       ['mb-w-delim-2', '-sw', '-f2', {IN=>"a\xc2\xa0b\n"}, {OUT=>""},