From: Pádraig Brady Date: Fri, 27 Mar 2026 18:59:01 +0000 (+0000) Subject: cut: optimize UTF-8 input with 0xF5-0xFF delimiters X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=24571c41f32cc76df996e063a2b5b82561fc9374;p=thirdparty%2Fcoreutils.git cut: optimize UTF-8 input with 0xF5-0xFF delimiters * src/cut.c (bytesearch_field_delim_ok): Expand the range of bytes that can be simply searched for. 0xF5-0xFF can't appear in valid UTF-8 characters, and so may be used as delimiters in UTF-8 input, so it's worth optimizing for. * tests/cut/cut.pl: Add a test case (mainly as documentation). --- diff --git a/src/cut.c b/src/cut.c index 2d7797b1b1..3a7b7ce0a7 100644 --- a/src/cut.c +++ b/src/cut.c @@ -311,6 +311,9 @@ mcel_isblank (mcel_t g) return (g.len == 1 && c_isblank (g.ch)) || (g.len > 1 && c32issep (g.ch)); } +/* Return TRUE if it's valid to do a simple byte search + for the delimiter bytes. */ + static inline bool bytesearch_field_delim_ok (void) { @@ -318,7 +321,8 @@ bytesearch_field_delim_ok (void) return (delim_length == 1 ? (MB_CUR_MAX <= 1 - || (is_utf8_charset () ? delim_0 < 0x80 : delim_0 < 0x30)) + || (is_utf8_charset () + ? (delim_0 < 0x80 || delim_0 > 0xF4) : delim_0 < 0x30)) : utf8_field_delim_ok ()); } diff --git a/tests/cut/cut.pl b/tests/cut/cut.pl index bfccdbe85f..a34c9d345b 100755 --- a/tests/cut/cut.pl +++ b/tests/cut/cut.pl @@ -337,6 +337,9 @@ if ($mb_locale ne 'C') ['mb-delim-7', '-d', "\xc3\xa9", '-f2', {IN=>"a\0b\xc3\xa9c\n"}, {OUT=>"c\n"}, {ENV => "LC_ALL=$mb_locale"}], + ['mb-delim-8', '-d', "\xff", '-f2', # Note 0xF5-0xFF is efficient + {IN=>"a\xffb\n"}, {OUT=>"b\n"}, + {ENV => "LC_ALL=$mb_locale"}], ['mb-w-delim-1', '-w', '-f2', {IN=>"a\xe2\x80\x83b\n"}, {OUT=>"b\n"}, {ENV => "LC_ALL=$mb_locale"}], ['mb-w-delim-2', '-sw', '-f2', {IN=>"a\xc2\xa0b\n"}, {OUT=>""},