]> git.ipfire.org Git - thirdparty/coreutils.git/commitdiff
fold: move multi-byte character reading to a module
author Collin Funk <collin.funk1@gmail.com>
Mon, 29 Sep 2025 03:16:26 +0000 (20:16 -0700)
committer Collin Funk <collin.funk1@gmail.com>
Wed, 1 Oct 2025 02:45:44 +0000 (19:45 -0700)
* gl/modules/mbbuf: New file.
* gl/lib/mbbuf.c: Likewise.
* gl/lib/mbbuf.h: Likewise.
* gl/local.mk (EXTRA_DIST): Add the new files.
* bootstrap.conf (gnulib_modules): Add mbbuf.
* src/fold.c: Include mbbuf.h.
(fold_file): Use the mbbuf functions instead of calling fread and
handling the input buffer ourselves.
* cfg.mk (exclude_file_name_regexp--sc_preprocessor_indentation)
(exclude_file_name_regexp--sc_GPL_version): Match gl/lib/mbbuf.c and
gl/lib/mbbuf.h.

bootstrap.conf
cfg.mk
gl/lib/mbbuf.c [new file with mode: 0644]
gl/lib/mbbuf.h [new file with mode: 0644]
gl/local.mk
gl/modules/mbbuf [new file with mode: 0644]
src/fold.c

diff --git a/bootstrap.conf b/bootstrap.conf
index adf09910de8083788bb7868f6d014bdfba28f84c..f470aa48b31eff69cd396cc85b4cdea78e755fff 100644 (file)
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -169,6 +169,7 @@ gnulib_modules="
   maintainer-makefile
   malloc-gnu
   manywarnings
+  mbbuf
   mbrlen
   mbrtoc32
   mbrtowc
diff --git a/cfg.mk b/cfg.mk
index 3335a0e5d9576e905aac65924a66ff7d32be77f1..03fc61560cf8be8f3d2c1a1a76b61fbcc514fe48 100644 (file)
--- a/cfg.mk
+++ b/cfg.mk
@@ -938,7 +938,7 @@ exclude_file_name_regexp--sc_prohibit_tab_based_indentation = \
   $(tbi_1)|$(tbi_2)|$(tbi_3)
 
 exclude_file_name_regexp--sc_preprocessor_indentation = \
-  ^(gl/lib/rand-isaac\.[ch]|gl/tests/test-rand-isaac\.c)$$|$(_ll)
+  ^(gl/lib/(rand-isaac|mbbuf)\.[ch]|gl/tests/test-rand-isaac\.c)$$|$(_ll)
 exclude_file_name_regexp--sc_prohibit_stat_st_blocks = \
   ^(src/system\.h|tests/du/2g\.sh)$$
 
@@ -999,3 +999,4 @@ csiwl_2 = kno,ois,afile,whats,hda,indx,ot,nam,ist
 codespell_ignore_words_list = $(csiwl_1),$(csiwl_2)
 exclude_file_name_regexp--sc_codespell = \
   ^(THANKS\.in|tests/pr/.*(F|tn?|l(o|m|i)|bl))$$
+exclude_file_name_regexp--sc_GPL_version = ^(gl/lib/mbbuf\.[hc])$$
diff --git a/gl/lib/mbbuf.c b/gl/lib/mbbuf.c
new file mode 100644 (file)
index 0000000..551a9ce
--- /dev/null
@@ -0,0 +1,22 @@
+/* Buffering for multi-byte characters.
+   Copyright (C) 2025 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+/* Written by Collin Funk.  */
+
+#include <config.h>
+
+#define MBBUF_INLINE _GL_EXTERN_INLINE
+#include "mbbuf.h"
diff --git a/gl/lib/mbbuf.h b/gl/lib/mbbuf.h
new file mode 100644 (file)
index 0000000..292b131
--- /dev/null
@@ -0,0 +1,117 @@
+/* Buffering for multi-byte characters.
+   Copyright (C) 2025 Free Software Foundation, Inc.
+
+   This file is free software: you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as
+   published by the Free Software Foundation; either version 2.1 of the
+   License, or (at your option) any later version.
+
+   This file is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+/* Written by Collin Funk.  */
+
+#ifndef _MBBUF_H
+#define _MBBUF_H 1
+
+#ifndef _GL_INLINE_HEADER_BEGIN
+# error "Please include config.h first."
+#endif
+
+#include <stdio.h>
+#include <stddef.h>
+
+#include "mcel.h"
+#include "idx.h"
+
+_GL_INLINE_HEADER_BEGIN
+#ifndef MBBUF_INLINE
+# define MBBUF_INLINE _GL_INLINE
+#endif
+
+/* End of file.  */
+#define MBBUF_EOF UINT32_MAX
+
+/* MBBUF_EOF should not be a valid character.  */
+static_assert (MCEL_CHAR_MAX < MBBUF_EOF);
+
+typedef struct
+{
+  char *buffer;    /* Input buffer.  */
+  FILE *fp;        /* Input file stream.  */
+  idx_t size;      /* Number of bytes allocated for BUFFER.  */
+  idx_t length;    /* Number of bytes with data in BUFFER.  */
+  idx_t offset;    /* Index in BUFFER of the next unread byte.  */
+} mbbuf_t;
+
+/* Initialize MBBUF to read from the stream FP, open for reading, through the
+   caller-supplied BUFFER of SIZE bytes.  SIZE must be greater than or equal
+   to MCEL_LEN_MAX; the function treats a smaller SIZE as unreachable.  */
+MBBUF_INLINE void
+mbbuf_init (mbbuf_t *mbbuf, char *buffer, idx_t size, FILE *fp)
+{
+  if (size < MCEL_LEN_MAX)
+    unreachable ();
+  mbbuf->buffer = buffer;
+  mbbuf->fp = fp;
+  mbbuf->size = size;
+  mbbuf->length = 0;
+  mbbuf->offset = 0;
+}
+
+/* Get the next character in the buffer, filling it from FP if necessary.
+   Return .ch == MBBUF_EOF at end of input.  An invalid byte is returned as
+   itself (0..UCHAR_MAX) with .err set, so callers can emit it unchanged.  */
+MBBUF_INLINE mcel_t
+mbbuf_get_char (mbbuf_t *mbbuf)
+{
+  idx_t available = mbbuf->length - mbbuf->offset;
+  /* Check if we need to fill the input buffer.  */
+  if (available < MCEL_LEN_MAX && ! feof (mbbuf->fp))
+    {
+      idx_t start;
+      if (!(0 < available))
+        start = 0;
+      else
+        {
+          memmove (mbbuf->buffer, mbbuf->buffer + mbbuf->offset, available);
+          start = available;
+        }
+      mbbuf->length = fread (mbbuf->buffer + start, 1, mbbuf->size - start,
+                             mbbuf->fp) + start;
+      mbbuf->offset = 0;
+      available = mbbuf->length - mbbuf->offset;
+    }
+  if (available <= 0)
+    return (mcel_t) { .ch = MBBUF_EOF };
+  mcel_t g = mcel_scan (mbbuf->buffer + mbbuf->offset,
+                        mbbuf->buffer + mbbuf->length);
+  if (! g.err)
+    mbbuf->offset += g.len;
+  else
+    {
+      /* Use the byte's unsigned value so it never equals MBBUF_EOF.  */
+      g.ch = (unsigned char) mbbuf->buffer[mbbuf->offset++];
+      g.len = 1;
+    }
+  return g;
+}
+
+/* Return a pointer into MBBUF's buffer at the first byte of G, which must be
+   the character most recently returned by mbbuf_get_char (MBBUF).  */
+MBBUF_INLINE char *
+mbbuf_char_offset (mbbuf_t *mbbuf, mcel_t g)
+{
+  if (mbbuf->offset < g.len)
+    unreachable ();
+  return mbbuf->buffer + (mbbuf->offset - g.len);
+}
+
+_GL_INLINE_HEADER_END
+
+#endif
diff --git a/gl/local.mk b/gl/local.mk
index cf13de3d8d93c6ca8c466353c2fb7ed48a0c8930..7dfa80d40e480719fae6b51536f3eee9ee29229b 100644 (file)
--- a/gl/local.mk
+++ b/gl/local.mk
@@ -30,6 +30,8 @@ gl/lib/fd-reopen.c \
 gl/lib/fd-reopen.h \
 gl/lib/heap.c \
 gl/lib/heap.h \
+gl/lib/mbbuf.c \
+gl/lib/mbbuf.h \
 gl/lib/rand-isaac.c \
 gl/lib/rand-isaac.h \
 gl/lib/randint.c \
@@ -65,6 +67,7 @@ gl/modules/fadvise-tests \
 gl/modules/fd-reopen \
 gl/modules/heap \
 gl/modules/link-tests.diff \
+gl/modules/mbbuf \
 gl/modules/randint \
 gl/modules/randperm \
 gl/modules/randread \
diff --git a/gl/modules/mbbuf b/gl/modules/mbbuf
new file mode 100644 (file)
index 0000000..187298b
--- /dev/null
@@ -0,0 +1,27 @@
+Description:
+Buffering for multi-byte characters.
+
+Files:
+lib/mbbuf.c
+lib/mbbuf.h
+
+Depends-on:
+c99
+extern-inline
+idx
+mcel
+stddef-h
+
+configure.ac:
+
+Makefile.am:
+lib_SOURCES += mbbuf.c mbbuf.h
+
+Include:
+"mbbuf.h"
+
+License:
+LGPLv2+
+
+Maintainer:
+all
diff --git a/src/fold.c b/src/fold.c
index e9083714327f54d65a038ad271f4789eac05a19f..47ed427dbfdf54e6cfd88fb68872e5c270c218a3 100644 (file)
--- a/src/fold.c
+++ b/src/fold.c
@@ -27,6 +27,7 @@
 #include "fadvise.h"
 #include "ioblksize.h"
 #include "mcel.h"
+#include "mbbuf.h"
 #include "xdectoint.h"
 
 #define TAB_WIDTH 8
@@ -153,8 +154,7 @@ fold_file (char const *filename, size_t width)
   idx_t offset_out = 0;                /* Index in 'line_out' for next char. */
   static char line_out[IO_BUFSIZE];
   static char line_in[IO_BUFSIZE];
-  static size_t offset_in = 0;
-  static size_t length_in = 0;
+  mbbuf_t mbbuf;
   int saved_errno;
 
   if (streq (filename, "-"))
@@ -172,116 +172,87 @@ fold_file (char const *filename, size_t width)
     }
 
   fadvise (istream, FADVISE_SEQUENTIAL);
+  mbbuf_init (&mbbuf, line_in, sizeof line_in, istream);
 
-  while (0 < (length_in = fread (line_in + offset_in, 1,
-                                 sizeof line_in - offset_in, istream))
-         || 0 < offset_in)
+  mcel_t g;
+  while ((g = mbbuf_get_char (&mbbuf)).ch != MBBUF_EOF)
     {
-      char *p = line_in;
-      char *lim = p + length_in + offset_in;
-      mcel_t g;
-      for (; p < lim; p += g.len)
+      if (g.ch == '\n')
         {
-          g = mcel_scan (p, lim);
-          if (g.err)
-            {
-              /* Replace the character with the byte if it cannot be a
-                 truncated multibyte sequence.  */
-              if (!(lim - p <= MCEL_LEN_MAX) || length_in == 0)
-                g.ch = p[0];
-              else
-                {
-                  /* It may be a truncated multibyte sequence.  Move it to the
-                     front of the input buffer.  */
-                  memmove (line_in, p, lim - p);
-                  offset_in = lim - p;
-                  goto next_line;
-                }
-            }
-          if (g.ch == '\n')
-            {
-              write_out (line_out, offset_out, /*newline=*/ true);
-              column = offset_out = 0;
-              continue;
-            }
-        rescan:
-          column = adjust_column (column, g);
+          write_out (line_out, offset_out, /*newline=*/ true);
+          column = offset_out = 0;
+          continue;
+        }
+    rescan:
+      column = adjust_column (column, g);
 
-          if (column > width)
+      if (column > width)
+        {
+          /* This character would make the line too long.
+             Print the line plus a newline, and make this character
+             start the next line. */
+          if (break_spaces)
             {
-              /* This character would make the line too long.
-                 Print the line plus a newline, and make this character
-                 start the next line. */
-              if (break_spaces)
-                {
-                  int space_length = 0;
-                  idx_t logical_end = offset_out;
-                  char *logical_p = line_out;
-                  char *logical_lim = logical_p + logical_end;
+              int space_length = 0;
+              idx_t logical_end = offset_out;
+              char *logical_p = line_out;
+              char *logical_lim = logical_p + logical_end;
 
-                  for (mcel_t g2; logical_p < logical_lim; logical_p += g2.len)
-                    {
-                      g2 = mcel_scan (logical_p, logical_lim);
-                      if (c32isblank (g2.ch) && ! c32isnbspace (g2.ch))
-                        {
-                          space_length = g2.len;
-                          logical_end = logical_p - line_out;
-                        }
-                    }
-
-                  if (space_length)
+              for (mcel_t g2; logical_p < logical_lim; logical_p += g2.len)
+                {
+                  g2 = mcel_scan (logical_p, logical_lim);
+                  if (c32isblank (g2.ch) && ! c32isnbspace (g2.ch))
                     {
-                      logical_end += space_length;
-                      /* Found a blank.  Don't output the part after it. */
-                      write_out (line_out, logical_end, /*newline=*/ true);
-                      /* Move the remainder to the beginning of the next line.
-                         The areas being copied here might overlap. */
-                      memmove (line_out, line_out + logical_end,
-                               offset_out - logical_end);
-                      offset_out -= logical_end;
-                      column = 0;
-                      char *printed_p = line_out;
-                      char *printed_lim = printed_p + offset_out;
-                      for (mcel_t g2; printed_p < printed_lim;
-                           printed_p += g2.len)
-                        {
-                          g2 = mcel_scan (printed_p, printed_lim);
-                          column = adjust_column (column, g2);
-                        }
-                      goto rescan;
+                      space_length = g2.len;
+                      logical_end = logical_p - line_out;
                     }
                 }
 
-              if (offset_out == 0)
+              if (space_length)
                 {
-                  memcpy (line_out, p, g.len);
-                  offset_out += g.len;
-                  continue;
+                  logical_end += space_length;
+                  /* Found a blank.  Don't output the part after it. */
+                  write_out (line_out, logical_end, /*newline=*/ true);
+                  /* Move the remainder to the beginning of the next line.
+                     The areas being copied here might overlap. */
+                  memmove (line_out, line_out + logical_end,
+                           offset_out - logical_end);
+                  offset_out -= logical_end;
+                  column = 0;
+                  char *printed_p = line_out;
+                  char *printed_lim = printed_p + offset_out;
+                  for (mcel_t g2; printed_p < printed_lim;
+                       printed_p += g2.len)
+                    {
+                      g2 = mcel_scan (printed_p, printed_lim);
+                      column = adjust_column (column, g2);
+                    }
+                  goto rescan;
                 }
-
-              write_out (line_out, offset_out, /*newline=*/ true);
-              column = offset_out = 0;
-              goto rescan;
             }
 
-          /* This can occur if we have read characters with a width of
-             zero.  */
-          if (sizeof line_out <= offset_out + g.len)
+          if (offset_out == 0)
             {
-              write_out (line_out, offset_out, /*newline=*/ false);
-              offset_out = 0;
+              memcpy (line_out, mbbuf_char_offset (&mbbuf, g), g.len);
+              offset_out += g.len;
+              continue;
             }
 
-          memcpy (line_out + offset_out, p, g.len);
-          offset_out += g.len;
+          write_out (line_out, offset_out, /*newline=*/ true);
+          column = offset_out = 0;
+          goto rescan;
         }
-      if (feof (istream))
-        break;
 
-      /* We read a full buffer of complete characters.  */
-      offset_in = 0;
+      /* This can occur if we have read characters with a width of
+         zero.  */
+      if (sizeof line_out <= offset_out + g.len)
+        {
+          write_out (line_out, offset_out, /*newline=*/ false);
+          offset_out = 0;
+        }
 
-    next_line:;
+      memcpy (line_out + offset_out, mbbuf_char_offset (&mbbuf, g), g.len);
+      offset_out += g.len;
     }
 
   saved_errno = errno;