From: Collin Funk Date: Mon, 29 Sep 2025 03:16:26 +0000 (-0700) Subject: fold: move multi-byte character reading to a module X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=16a67363898443b03ef62848c30f15bfcc682ca0;p=thirdparty%2Fcoreutils.git fold: move multi-byte character reading to a module * gl/modules/mbbuf: New file. * gl/lib/mbbuf.c: Likewise. * gl/lib/mbbuf.h: Likewise. * gl/local.mk (EXTRA_DIST): Add the new files. * bootstrap.conf (gnulib_modules): Add mbbuf. * src/fold.c: Include mbbuf.h. (fold_file): Use the mbbuf functions instead of calling fread and handling the input buffer ourselves. * cfg.mk (exclude_file_name_regexp--sc_preprocessor_indentation) (exclude_file_name_regexp--sc_GPL_version): Match gl/lib/mbbuf.c and gl/lib/mbbuf.h. --- diff --git a/bootstrap.conf b/bootstrap.conf index adf09910de..f470aa48b3 100644 --- a/bootstrap.conf +++ b/bootstrap.conf @@ -169,6 +169,7 @@ gnulib_modules=" maintainer-makefile malloc-gnu manywarnings + mbbuf mbrlen mbrtoc32 mbrtowc diff --git a/cfg.mk b/cfg.mk index 3335a0e5d9..03fc61560c 100644 --- a/cfg.mk +++ b/cfg.mk @@ -938,7 +938,7 @@ exclude_file_name_regexp--sc_prohibit_tab_based_indentation = \ $(tbi_1)|$(tbi_2)|$(tbi_3) exclude_file_name_regexp--sc_preprocessor_indentation = \ - ^(gl/lib/rand-isaac\.[ch]|gl/tests/test-rand-isaac\.c)$$|$(_ll) + ^(gl/lib/(rand-isaac|mbbuf)\.[ch]|gl/tests/test-rand-isaac\.c)$$|$(_ll) exclude_file_name_regexp--sc_prohibit_stat_st_blocks = \ ^(src/system\.h|tests/du/2g\.sh)$$ @@ -999,3 +999,4 @@ csiwl_2 = kno,ois,afile,whats,hda,indx,ot,nam,ist codespell_ignore_words_list = $(csiwl_1),$(csiwl_2) exclude_file_name_regexp--sc_codespell = \ ^(THANKS\.in|tests/pr/.*(F|tn?|l(o|m|i)|bl))$$ +exclude_file_name_regexp--sc_GPL_version = ^(gl/lib/mbbuf\.[hc])$$ diff --git a/gl/lib/mbbuf.c b/gl/lib/mbbuf.c new file mode 100644 index 0000000000..551a9ced38 --- /dev/null +++ b/gl/lib/mbbuf.c @@ -0,0 +1,22 @@ +/* Buffering for multi-byte characters. 
+ Copyright (C) 2025 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. */ + +/* Written by Collin Funk. */ + +#include <config.h> + +#define MBBUF_INLINE _GL_EXTERN_INLINE +#include "mbbuf.h" diff --git a/gl/lib/mbbuf.h b/gl/lib/mbbuf.h new file mode 100644 index 0000000000..292b131c6c --- /dev/null +++ b/gl/lib/mbbuf.h @@ -0,0 +1,117 @@ +/* Buffering for multi-byte characters. + Copyright (C) 2025 Free Software Foundation, Inc. + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as + published by the Free Software Foundation; either version 2.1 of the + License, or (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. */ + +/* Written by Collin Funk. */ + +#ifndef _MBBUF_H +#define _MBBUF_H 1 + +#ifndef _GL_INLINE_HEADER_BEGIN +# error "Please include config.h first." +#endif + +#include +#include + +#include "mcel.h" +#include "idx.h" + +_GL_INLINE_HEADER_BEGIN +#ifndef MBBUF_INLINE +# define MBBUF_INLINE _GL_INLINE +#endif + +/* End of file. 
*/
+#define MBBUF_EOF UINT32_MAX
+
+/* MBBUF_EOF should not be a valid character.  */
+static_assert (MCEL_CHAR_MAX < MBBUF_EOF);
+
+typedef struct
+{
+  char *buffer;    /* Input buffer.  */
+  FILE *fp;        /* Input file stream.  */
+  idx_t size;      /* Number of bytes allocated for BUFFER.  */
+  idx_t length;    /* Number of bytes with data in BUFFER.  */
+  idx_t offset;    /* Current position in BUFFER.  */
+} mbbuf_t;
+
+/* Initialize MBBUF with an allocated BUFFER of SIZE bytes and a file stream
+   FP open for reading.  SIZE must be greater than or equal to MCEL_LEN_MAX;
+   a smaller SIZE is marked unreachable, so passing one is undefined.  */
+MBBUF_INLINE void
+mbbuf_init (mbbuf_t *mbbuf, char *buffer, idx_t size, FILE *fp)
+{
+  if (size < MCEL_LEN_MAX)
+    unreachable ();
+  mbbuf->buffer = buffer;
+  mbbuf->fp = fp;
+  mbbuf->size = size;
+  mbbuf->length = 0;
+  mbbuf->offset = 0;
+}
+
+/* Return the next character from MBBUF, refilling BUFFER from FP when fewer
+   than MCEL_LEN_MAX bytes remain.  At end of input .ch is MBBUF_EOF.  For an
+   invalid byte, .err stays set, .len is 1 and .ch is its unsigned value.  */
+MBBUF_INLINE mcel_t
+mbbuf_get_char (mbbuf_t *mbbuf)
+{
+  idx_t available = mbbuf->length - mbbuf->offset;
+  /* Refill unless EOF was already seen; unread bytes move to the front.  */
+  if (available < MCEL_LEN_MAX && ! feof (mbbuf->fp))
+    {
+      idx_t start;
+      if (!(0 < available))
+        start = 0;
+      else
+        {
+          memmove (mbbuf->buffer, mbbuf->buffer + mbbuf->offset, available);
+          start = available;
+        }
+      mbbuf->length = fread (mbbuf->buffer + start, 1, mbbuf->size - start,
+                             mbbuf->fp) + start;
+      mbbuf->offset = 0;
+      available = mbbuf->length - mbbuf->offset;
+    }
+  if (available <= 0)
+    return (mcel_t) { .ch = MBBUF_EOF };
+  mcel_t g = mcel_scan (mbbuf->buffer + mbbuf->offset,
+                        mbbuf->buffer + mbbuf->length);
+  if (! g.err)
+    mbbuf->offset += g.len;
+  else
+    {
+      /* Use the unsigned byte value so that 0xFF cannot equal MBBUF_EOF.  */
+      g.ch = (unsigned char) mbbuf->buffer[mbbuf->offset++];
+      g.len = 1;
+    }
+  return g;
+}
+
+/* Return a pointer to the first byte of G, the character last returned by
+   mbbuf_get_char.  The next mbbuf_get_char may refill and invalidate it.  */
+MBBUF_INLINE char *
+mbbuf_char_offset (mbbuf_t *mbbuf, mcel_t g)
+{
+  if (mbbuf->offset < g.len)
+    unreachable ();
+  return mbbuf->buffer + (mbbuf->offset - g.len);
+}
+
+_GL_INLINE_HEADER_END
+
+#endif
diff --git a/gl/local.mk b/gl/local.mk
index cf13de3d8d..7dfa80d40e 100644
--- a/gl/local.mk
+++ b/gl/local.mk
@@ -30,6 +30,8 @@ gl/lib/fd-reopen.c \
 gl/lib/fd-reopen.h \
 gl/lib/heap.c \
 gl/lib/heap.h \
+gl/lib/mbbuf.c \
+gl/lib/mbbuf.h \
 gl/lib/rand-isaac.c \
 gl/lib/rand-isaac.h \
 gl/lib/randint.c \
@@ -65,6 +67,7 @@ gl/modules/fadvise-tests \
 gl/modules/fd-reopen \
 gl/modules/heap \
 gl/modules/link-tests.diff \
+gl/modules/mbbuf \
 gl/modules/randint \
 gl/modules/randperm \
 gl/modules/randread \
diff --git a/gl/modules/mbbuf b/gl/modules/mbbuf
new file mode 100644
index 0000000000..187298b8bb
--- /dev/null
+++ b/gl/modules/mbbuf
@@ -0,0 +1,27 @@
+Description:
+Buffering for multi-byte characters.
+
+Files:
+lib/mbbuf.c
+lib/mbbuf.h
+
+Depends-on:
+c99
+extern-inline
+idx
+mcel
+stddef-h
+
+configure.ac:
+
+Makefile.am:
+lib_SOURCES += mbbuf.c mbbuf.h
+
+Include:
+"mbbuf.h"
+
+License:
+LGPLv2+
+
+Maintainer:
+all
diff --git a/src/fold.c b/src/fold.c
index e908371432..47ed427dbf 100644
--- a/src/fold.c
+++ b/src/fold.c
@@ -27,6 +27,7 @@
 #include "fadvise.h"
 #include "ioblksize.h"
 #include "mcel.h"
+#include "mbbuf.h"
 #include "xdectoint.h"
 
 #define TAB_WIDTH 8
@@ -153,8 +154,7 @@ fold_file (char const *filename, size_t width)
   idx_t offset_out = 0; /* Index in 'line_out' for next char. 
*/ static char line_out[IO_BUFSIZE]; static char line_in[IO_BUFSIZE]; - static size_t offset_in = 0; - static size_t length_in = 0; + mbbuf_t mbbuf; int saved_errno; if (streq (filename, "-")) @@ -172,116 +172,87 @@ fold_file (char const *filename, size_t width) } fadvise (istream, FADVISE_SEQUENTIAL); + mbbuf_init (&mbbuf, line_in, sizeof line_in, istream); - while (0 < (length_in = fread (line_in + offset_in, 1, - sizeof line_in - offset_in, istream)) - || 0 < offset_in) + mcel_t g; + while ((g = mbbuf_get_char (&mbbuf)).ch != MBBUF_EOF) { - char *p = line_in; - char *lim = p + length_in + offset_in; - mcel_t g; - for (; p < lim; p += g.len) + if (g.ch == '\n') { - g = mcel_scan (p, lim); - if (g.err) - { - /* Replace the character with the byte if it cannot be a - truncated multibyte sequence. */ - if (!(lim - p <= MCEL_LEN_MAX) || length_in == 0) - g.ch = p[0]; - else - { - /* It may be a truncated multibyte sequence. Move it to the - front of the input buffer. */ - memmove (line_in, p, lim - p); - offset_in = lim - p; - goto next_line; - } - } - if (g.ch == '\n') - { - write_out (line_out, offset_out, /*newline=*/ true); - column = offset_out = 0; - continue; - } - rescan: - column = adjust_column (column, g); + write_out (line_out, offset_out, /*newline=*/ true); + column = offset_out = 0; + continue; + } + rescan: + column = adjust_column (column, g); - if (column > width) + if (column > width) + { + /* This character would make the line too long. + Print the line plus a newline, and make this character + start the next line. */ + if (break_spaces) { - /* This character would make the line too long. - Print the line plus a newline, and make this character - start the next line. 
*/ - if (break_spaces) - { - int space_length = 0; - idx_t logical_end = offset_out; - char *logical_p = line_out; - char *logical_lim = logical_p + logical_end; + int space_length = 0; + idx_t logical_end = offset_out; + char *logical_p = line_out; + char *logical_lim = logical_p + logical_end; - for (mcel_t g2; logical_p < logical_lim; logical_p += g2.len) - { - g2 = mcel_scan (logical_p, logical_lim); - if (c32isblank (g2.ch) && ! c32isnbspace (g2.ch)) - { - space_length = g2.len; - logical_end = logical_p - line_out; - } - } - - if (space_length) + for (mcel_t g2; logical_p < logical_lim; logical_p += g2.len) + { + g2 = mcel_scan (logical_p, logical_lim); + if (c32isblank (g2.ch) && ! c32isnbspace (g2.ch)) { - logical_end += space_length; - /* Found a blank. Don't output the part after it. */ - write_out (line_out, logical_end, /*newline=*/ true); - /* Move the remainder to the beginning of the next line. - The areas being copied here might overlap. */ - memmove (line_out, line_out + logical_end, - offset_out - logical_end); - offset_out -= logical_end; - column = 0; - char *printed_p = line_out; - char *printed_lim = printed_p + offset_out; - for (mcel_t g2; printed_p < printed_lim; - printed_p += g2.len) - { - g2 = mcel_scan (printed_p, printed_lim); - column = adjust_column (column, g2); - } - goto rescan; + space_length = g2.len; + logical_end = logical_p - line_out; } } - if (offset_out == 0) + if (space_length) { - memcpy (line_out, p, g.len); - offset_out += g.len; - continue; + logical_end += space_length; + /* Found a blank. Don't output the part after it. */ + write_out (line_out, logical_end, /*newline=*/ true); + /* Move the remainder to the beginning of the next line. + The areas being copied here might overlap. 
*/ + memmove (line_out, line_out + logical_end, + offset_out - logical_end); + offset_out -= logical_end; + column = 0; + char *printed_p = line_out; + char *printed_lim = printed_p + offset_out; + for (mcel_t g2; printed_p < printed_lim; + printed_p += g2.len) + { + g2 = mcel_scan (printed_p, printed_lim); + column = adjust_column (column, g2); + } + goto rescan; } - - write_out (line_out, offset_out, /*newline=*/ true); - column = offset_out = 0; - goto rescan; } - /* This can occur if we have read characters with a width of - zero. */ - if (sizeof line_out <= offset_out + g.len) + if (offset_out == 0) { - write_out (line_out, offset_out, /*newline=*/ false); - offset_out = 0; + memcpy (line_out, mbbuf_char_offset (&mbbuf, g), g.len); + offset_out += g.len; + continue; } - memcpy (line_out + offset_out, p, g.len); - offset_out += g.len; + write_out (line_out, offset_out, /*newline=*/ true); + column = offset_out = 0; + goto rescan; } - if (feof (istream)) - break; - /* We read a full buffer of complete characters. */ - offset_in = 0; + /* This can occur if we have read characters with a width of + zero. */ + if (sizeof line_out <= offset_out + g.len) + { + write_out (line_out, offset_out, /*newline=*/ false); + offset_out = 0; + } - next_line:; + memcpy (line_out + offset_out, mbbuf_char_offset (&mbbuf, g), g.len); + offset_out += g.len; } saved_errno = errno;