From: Pádraig Brady
Date: Thu, 26 Mar 2026 16:52:56 +0000 (+0000) Subject: maint: cut: simplify mbbuf_fill X-Git-Tag: v9.11~82 X-Git-Url: http://git.ipfire.org/gitweb/?a=commitdiff_plain;h=7ceb4c348b8b859525ca857298a4149bf4b4f0ae;p=thirdparty%2Fcoreutils.git maint: cut: simplify mbbuf_fill We can only byte search with uni-byte or utf-8. utf-8 implicitly can't false match a delimiter at buffer boundary. So don't worry about finding the exact utf8 boundary at the end of the buffer; rather, just ensure the buffer always starts with a valid character (by ensuring MCEL_LEN_MAX-1 bytes are moved to the start of the buffer on each refill). --- diff --git a/gl/lib/mbbuf.h b/gl/lib/mbbuf.h index 060825a634..84e1fdecf5 100644 --- a/gl/lib/mbbuf.h +++ b/gl/lib/mbbuf.h @@ -70,17 +70,14 @@ mbbuf_init (mbbuf_t *mbbuf, char *buffer, idx_t size, FILE *fp) mbbuf->offset = 0; } -/* Fill the input buffer with at least MIN_AVAILABLE bytes if possible. +/* Fill the input buffer with at least MCEL_LEN_MAX bytes if possible. Return the number of bytes available from the current offset. */ MBBUF_INLINE idx_t -mbbuf_fill (mbbuf_t *mbbuf, idx_t min_available) +mbbuf_fill (mbbuf_t *mbbuf) { idx_t available = mbbuf_avail (mbbuf); - if (mbbuf->size < min_available) - min_available = mbbuf->size; - - if (available < min_available && ! feof (mbbuf->fp)) + if (available < MCEL_LEN_MAX && ! feof (mbbuf->fp)) { idx_t start; if (!(0 < available)) @@ -108,48 +105,13 @@ mbbuf_advance (mbbuf_t *mbbuf, idx_t n) mbbuf->offset += n; } -/* Return the largest prefix of the current contents that is safe to process - with byte searches, while leaving at least OVERLAP bytes unprocessed unless - EOF has been seen. The returned prefix never ends in the middle of a UTF-8 - sequence, but it may include invalid bytes. 
*/ -MBBUF_INLINE idx_t -mbbuf_utf8_safe_prefix (mbbuf_t *mbbuf, idx_t overlap) -{ - idx_t available = mbbuf_fill (mbbuf, overlap + 4); - if (available == 0) - return 0; - - if (feof (mbbuf->fp)) - return available; - - if (available <= overlap) - return 0; - - idx_t end = available - overlap; - char const *buf = mbbuf->buffer + mbbuf->offset; - idx_t start = end - 1; - - while (0 < start - && ((unsigned char) buf[start] & 0xC0) == 0x80) - start--; - - unsigned char lead = buf[start]; - idx_t len = (lead < 0x80 ? 1 - : (lead & 0xE0) == 0xC0 ? 2 - : (lead & 0xF0) == 0xE0 ? 3 - : (lead & 0xF8) == 0xF0 ? 4 - : 1); - - return start + len <= end ? end : start; -} - /* Get the next character in the buffer, filling it from FP if necessary. If an invalid multi-byte character is seen, we assume the program wants to fall back to the read byte. */ MBBUF_INLINE mcel_t mbbuf_get_char (mbbuf_t *mbbuf) { - idx_t available = mbbuf_fill (mbbuf, MCEL_LEN_MAX); + idx_t available = mbbuf_fill (mbbuf); if (available <= 0) return (mcel_t) { .ch = MBBUF_EOF }; mcel_t g = mcel_scan (mbbuf->buffer + mbbuf->offset, diff --git a/src/cut.c b/src/cut.c index a6d2cb1d63..2d7797b1b1 100644 --- a/src/cut.c +++ b/src/cut.c @@ -685,19 +685,6 @@ begin_field_output (uintmax_t field_idx, bool buffer_first_field, return write_field; } -static inline idx_t -bytesearch_safe_prefix (mbbuf_t *mbbuf, idx_t overlap) -{ - idx_t available = mbbuf_fill (mbbuf, overlap + 1); - if (available == 0) - return 0; - - if (feof (mbbuf->fp)) - return available; - - return overlap < available ? available - overlap : 0; -} - static inline bool field_selection_exhausted (uintmax_t field_idx) { @@ -1024,7 +1011,6 @@ cut_fields_bytesearch (FILE *stream) bool skip_blank_run = false; bool write_field; idx_t field_1_n_bytes = 0; - idx_t overlap = whitespace_delimited ? 
0 : delim_length - 1; current_rp = frp; bool buffer_first_field = suppress_non_delimited ^ !print_kth (1); @@ -1034,13 +1020,9 @@ cut_fields_bytesearch (FILE *stream) while (true) { - idx_t safe = bytesearch_safe_prefix (&mbbuf, overlap); + idx_t safe = mbbuf_fill (&mbbuf); if (safe == 0) - { - if (mbbuf_avail (&mbbuf) == 0) - break; - continue; - } + break; char *chunk = mbbuf.buffer + mbbuf.offset; idx_t processed = 0;