From: Pádraig Brady Date: Mon, 16 Mar 2026 12:45:59 +0000 (+0000) Subject: cut: add utf8 helper to mbbuf X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=ba7c1fbadb2d4aaf0dd211d545eca0b220b36259;p=thirdparty%2Fcoreutils.git cut: add utf8 helper to mbbuf * gl/lib/mbbuf.h: To safely search a bounded buffer, without needed to have unbounded memory with getndelim2. TODO: rename mbbuf_utf8_safe_prefix to mbbuf_fill_utf8 --- diff --git a/gl/lib/mbbuf.h b/gl/lib/mbbuf.h index ea4b7e7087..060825a634 100644 --- a/gl/lib/mbbuf.h +++ b/gl/lib/mbbuf.h @@ -49,6 +49,12 @@ typedef struct idx_t offset; /* Current position in BUFFER. */ } mbbuf_t; +MBBUF_INLINE idx_t +mbbuf_avail (mbbuf_t const *mbbuf) +{ + return mbbuf->length - mbbuf->offset; +} + /* Initialize MBBUF with an allocated BUFFER of SIZE bytes and a file stream FP open for reading. SIZE must be greater than or equal to MCEL_LEN_MAX. */ @@ -64,15 +70,17 @@ mbbuf_init (mbbuf_t *mbbuf, char *buffer, idx_t size, FILE *fp) mbbuf->offset = 0; } -/* Get the next character in the buffer, filling it from FP if necessary. - If an invalid multi-byte character is seen, we assume the program wants to - fall back to the read byte. */ -MBBUF_INLINE mcel_t -mbbuf_get_char (mbbuf_t *mbbuf) +/* Fill the input buffer with at least MIN_AVAILABLE bytes if possible. + Return the number of bytes available from the current offset. */ +MBBUF_INLINE idx_t +mbbuf_fill (mbbuf_t *mbbuf, idx_t min_available) { - idx_t available = mbbuf->length - mbbuf->offset; - /* Check if we need to fill the input buffer. */ - if (available < MCEL_LEN_MAX && ! feof (mbbuf->fp)) + idx_t available = mbbuf_avail (mbbuf); + + if (mbbuf->size < min_available) + min_available = mbbuf->size; + + if (available < min_available && ! feof (mbbuf->fp)) { idx_t start; if (!(0 < available)) @@ -85,8 +93,63 @@ mbbuf_get_char (mbbuf_t *mbbuf) mbbuf->length = fread (mbbuf->buffer + start, 1, mbbuf->size - start, mbbuf->fp) + start; mbbuf->offset = 0; - available = mbbuf->length - mbbuf->offset; + available = mbbuf_avail (mbbuf); } + + return available; +} + +/* Consume N bytes from the current buffer. */ +MBBUF_INLINE void +mbbuf_advance (mbbuf_t *mbbuf, idx_t n) +{ + if (mbbuf_avail (mbbuf) < n) + unreachable (); + mbbuf->offset += n; +} + +/* Return the largest prefix of the current contents that is safe to process + with byte searches, while leaving at least OVERLAP bytes unprocessed unless + EOF has been seen. The returned prefix never ends in the middle of a UTF-8 + sequence, but it may include invalid bytes. */ +MBBUF_INLINE idx_t +mbbuf_utf8_safe_prefix (mbbuf_t *mbbuf, idx_t overlap) +{ + idx_t available = mbbuf_fill (mbbuf, overlap + 4); + if (available == 0) + return 0; + + if (feof (mbbuf->fp)) + return available; + + if (available <= overlap) + return 0; + + idx_t end = available - overlap; + char const *buf = mbbuf->buffer + mbbuf->offset; + idx_t start = end - 1; + + while (0 < start + && ((unsigned char) buf[start] & 0xC0) == 0x80) + start--; + + unsigned char lead = buf[start]; + idx_t len = (lead < 0x80 ? 1 + : (lead & 0xE0) == 0xC0 ? 2 + : (lead & 0xF0) == 0xE0 ? 3 + : (lead & 0xF8) == 0xF0 ? 4 + : 1); + + return start + len <= end ? end : start; +} + +/* Get the next character in the buffer, filling it from FP if necessary. + If an invalid multi-byte character is seen, we assume the program wants to + fall back to the read byte. */ +MBBUF_INLINE mcel_t +mbbuf_get_char (mbbuf_t *mbbuf) +{ + idx_t available = mbbuf_fill (mbbuf, MCEL_LEN_MAX); if (available <= 0) return (mcel_t) { .ch = MBBUF_EOF }; mcel_t g = mcel_scan (mbbuf->buffer + mbbuf->offset,