From ae89cd646a7dfd0cf655e8c0d0d56b93288abb71 Mon Sep 17 00:00:00 2001 From: Collin Funk Date: Mon, 25 Aug 2025 23:15:21 -0700 Subject: [PATCH] fold: don't truncate multibyte characters at the end of the buffer * src/fold.c (fold_file): Replace invalid characters with the original byte read. Copy multibyte sequences that may not yet be read to the start of the buffer before reading more bytes. * tests/fold/fold-characters.sh: Add a test case. --- src/fold.c | 24 ++++++++++++++++++++++-- tests/fold/fold-characters.sh | 19 +++++++++++++++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/src/fold.c b/src/fold.c index 7bf30cd0bb..208b004d62 100644 --- a/src/fold.c +++ b/src/fold.c @@ -139,6 +139,7 @@ fold_file (char const *filename, size_t width) idx_t offset_out = 0; /* Index in 'line_out' for next char. */ static char line_out[IO_BUFSIZE]; static char line_in[IO_BUFSIZE]; + static size_t offset_in = 0; static size_t length_in = 0; int saved_errno; @@ -158,14 +159,30 @@ fold_file (char const *filename, size_t width) fadvise (istream, FADVISE_SEQUENTIAL); - while (0 < (length_in = fread (line_in, 1, sizeof line_in, istream))) + while (0 < (length_in = fread (line_in + offset_in, 1, + sizeof line_in - offset_in, istream))) { char *p = line_in; - char *lim = p + length_in; + char *lim = p + length_in + offset_in; mcel_t g; for (; p < lim; p += g.len) { g = mcel_scan (p, lim); + if (g.err) + { + /* Replace the character with the byte if it cannot be a + truncated multibyte sequence. */ + if (!(lim - p <= MCEL_LEN_MAX)) + g.ch = p[0]; + else + { + /* It may be a truncated multibyte sequence. Move it to the + front of the input buffer. */ + memmove (line_in, p, lim - p); + offset_in = lim - p; + goto next_line; + } + } if (g.ch == '\n') { memcpy (line_out + offset_out, p, g.len); @@ -241,6 +258,9 @@ fold_file (char const *filename, size_t width) } if (feof (istream)) break; + /* We read a full buffer of complete characters. */ + offset_in = 0; + next_line: } saved_errno = errno; diff --git a/tests/fold/fold-characters.sh b/tests/fold/fold-characters.sh index 159f6ddac8..be17d80be1 100755 --- a/tests/fold/fold-characters.sh +++ b/tests/fold/fold-characters.sh @@ -58,6 +58,25 @@ compare column-exp2 column-out2 || fail=1 fold --characters -w 10 input2 > character-out2 || fail=1 compare character-exp2 character-out2 || fail=1 +# Test a Unicode character on the edge of the input buffer. +# Keep in sync with IO_BUFSIZE - 1. +yes a | head -n 262143 | tr -d '\n' > input3 || framework_failure_ +env printf '\uB250' >> input3 || framework_failure_ +yes a | head -n 100 | tr -d '\n' >> input3 || framework_failure_ +env printf '\n' >> input3 || framework_failure_ + +yes a | head -n 80 | tr -d '\n' > exp3 || framework_failure_ +env printf '\n' >> exp3 || framework_failure_ +yes a | head -n 63 | tr -d '\n' >> exp3 || framework_failure_ +env printf '\uB250' >> exp3 || framework_failure_ +yes a | head -n 16 | tr -d '\n' >> exp3 || framework_failure_ +env printf '\n' >> exp3 || framework_failure_ +yes a | head -n 80 | tr -d '\n' >> exp3 || framework_failure_ +env printf '\naaaa\n' >> exp3 || framework_failure_ + +fold --characters input3 | tail -n 4 > out3 || fail=1 +compare exp3 out3 || fail=1 + # Ensure bounded memory operation vm=$(get_min_ulimit_v_ fold /dev/null) && { yes | tr -d '\n' | (ulimit -v $(($vm+8000)) && fold 2>err) | head || fail=1 -- 2.47.3