--- /dev/null
+/* Buffering for multi-byte characters.
+ Copyright (C) 2025 Free Software Foundation, Inc.
+
+ This file is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Lesser General Public License as
+ published by the Free Software Foundation; either version 2.1 of the
+ License, or (at your option) any later version.
+
+ This file is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
+
+/* Written by Collin Funk. */
+
+#ifndef _MBBUF_H
+#define _MBBUF_H 1
+
+#ifndef _GL_INLINE_HEADER_BEGIN
+# error "Please include config.h first."
+#endif
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "mcel.h"
+#include "idx.h"
+
+_GL_INLINE_HEADER_BEGIN
+/* MBBUF_INLINE qualifies every function defined in this header.  It
+   defaults to _GL_INLINE unless it was already defined before this
+   point.  */
+#ifndef MBBUF_INLINE
+# define MBBUF_INLINE _GL_INLINE
+#endif
+
+/* Sentinel stored in the 'ch' member of the value returned by
+   mbbuf_get_char when no more bytes are available.  */
+#define MBBUF_EOF UINT32_MAX
+
+/* MBBUF_EOF must lie above MCEL_CHAR_MAX so that no valid character
+   compares equal to it.  */
+static_assert (MCEL_CHAR_MAX < MBBUF_EOF);
+
+/* State for reading multi-byte characters from a stdio stream through a
+   caller-supplied buffer.  Set up with mbbuf_init and consumed with
+   mbbuf_get_char.  */
+typedef struct
+{
+ char *buffer; /* Input buffer, as passed to mbbuf_init. */
+ FILE *fp; /* Input file stream used to refill BUFFER. */
+ idx_t size; /* Number of bytes allocated for BUFFER. */
+ idx_t length; /* Number of bytes with data in BUFFER. */
+ idx_t offset; /* Index in BUFFER of the next unread byte. */
+} mbbuf_t;
+
+/* Set up MBBUF to read from the stream FP, which must be open for reading,
+   using BUFFER as its working storage.  BUFFER must have room for SIZE
+   bytes, and SIZE must be at least MCEL_LEN_MAX.  The buffer starts out
+   empty; nothing is read from FP until a character is requested.  */
+MBBUF_INLINE void
+mbbuf_init (mbbuf_t *mbbuf, char *buffer, idx_t size, FILE *fp)
+{
+  /* A smaller buffer violates the documented precondition.  */
+  if (! (MCEL_LEN_MAX <= size))
+    unreachable ();
+
+  *mbbuf = (mbbuf_t) {
+    .buffer = buffer,
+    .fp = fp,
+    .size = size,
+    .length = 0,
+    .offset = 0,
+  };
+}
+
+/* Return the next character from MBBUF.
+
+   When fewer than MCEL_LEN_MAX unread bytes remain and end of file has
+   not been seen on the stream, first move the unread bytes to the front
+   of the buffer and refill the rest from the stream, so that a character
+   is not scanned in truncated form merely because it straddles two reads.
+
+   When no bytes are left, return a value whose 'ch' member is MBBUF_EOF.
+
+   If the bytes at the current position are not a valid multi-byte
+   character, assume the program wants to fall back to the byte that was
+   read: consume exactly one byte and return it in 'ch' as a value in the
+   range 0..UCHAR_MAX, with 'len' set to 1 and 'err' left nonzero so the
+   caller can still tell that it was an encoding error.  */
+MBBUF_INLINE mcel_t
+mbbuf_get_char (mbbuf_t *mbbuf)
+{
+  idx_t available = mbbuf->length - mbbuf->offset;
+  /* Check if we need to fill the input buffer.  */
+  if (available < MCEL_LEN_MAX && ! feof (mbbuf->fp))
+    {
+      idx_t start;
+      if (!(0 < available))
+        start = 0;
+      else
+        {
+          /* Preserve the unread tail; the regions may overlap.  */
+          memmove (mbbuf->buffer, mbbuf->buffer + mbbuf->offset, available);
+          start = available;
+        }
+      mbbuf->length = fread (mbbuf->buffer + start, 1, mbbuf->size - start,
+                             mbbuf->fp) + start;
+      mbbuf->offset = 0;
+      available = mbbuf->length - mbbuf->offset;
+    }
+  if (available <= 0)
+    return (mcel_t) { .ch = MBBUF_EOF };
+  mcel_t g = mcel_scan (mbbuf->buffer + mbbuf->offset,
+                        mbbuf->buffer + mbbuf->length);
+  if (! g.err)
+    mbbuf->offset += g.len;
+  else
+    {
+      /* Assume the program will emit the byte, but keep the error flag.
+         Convert through unsigned char: where plain char is signed, the
+         byte 0xFF would otherwise sign-extend to UINT32_MAX, which is
+         MBBUF_EOF, and the caller would stop reading prematurely.  */
+      g.ch = (unsigned char) mbbuf->buffer[mbbuf->offset++];
+      g.len = 1;
+    }
+  return g;
+}
+
+/* Return the address, within MBBUF's buffer, of the first byte of G.
+   G must be the character most recently returned by mbbuf_get_char for
+   MBBUF.  The next call to mbbuf_get_char may refill the buffer and move
+   its contents, so use the result before reading another character.  */
+MBBUF_INLINE char *
+mbbuf_char_offset (mbbuf_t *mbbuf, mcel_t g)
+{
+  /* G was just consumed, so it ends at the current read position.  */
+  idx_t start = mbbuf->offset - g.len;
+  if (start < 0)
+    unreachable ();
+  return &mbbuf->buffer[start];
+}
+
+_GL_INLINE_HEADER_END
+
+#endif
#include "fadvise.h"
#include "ioblksize.h"
#include "mcel.h"
+#include "mbbuf.h"
#include "xdectoint.h"
#define TAB_WIDTH 8
idx_t offset_out = 0; /* Index in 'line_out' for next char. */
static char line_out[IO_BUFSIZE];
static char line_in[IO_BUFSIZE];
- static size_t offset_in = 0;
- static size_t length_in = 0;
+ mbbuf_t mbbuf;
int saved_errno;
if (streq (filename, "-"))
}
fadvise (istream, FADVISE_SEQUENTIAL);
+ mbbuf_init (&mbbuf, line_in, sizeof line_in, istream);
- while (0 < (length_in = fread (line_in + offset_in, 1,
- sizeof line_in - offset_in, istream))
- || 0 < offset_in)
+ mcel_t g;
+ while ((g = mbbuf_get_char (&mbbuf)).ch != MBBUF_EOF)
{
- char *p = line_in;
- char *lim = p + length_in + offset_in;
- mcel_t g;
- for (; p < lim; p += g.len)
+ if (g.ch == '\n')
{
- g = mcel_scan (p, lim);
- if (g.err)
- {
- /* Replace the character with the byte if it cannot be a
- truncated multibyte sequence. */
- if (!(lim - p <= MCEL_LEN_MAX) || length_in == 0)
- g.ch = p[0];
- else
- {
- /* It may be a truncated multibyte sequence. Move it to the
- front of the input buffer. */
- memmove (line_in, p, lim - p);
- offset_in = lim - p;
- goto next_line;
- }
- }
- if (g.ch == '\n')
- {
- write_out (line_out, offset_out, /*newline=*/ true);
- column = offset_out = 0;
- continue;
- }
- rescan:
- column = adjust_column (column, g);
+ write_out (line_out, offset_out, /*newline=*/ true);
+ column = offset_out = 0;
+ continue;
+ }
+ rescan:
+ column = adjust_column (column, g);
- if (column > width)
+ if (column > width)
+ {
+ /* This character would make the line too long.
+ Print the line plus a newline, and make this character
+ start the next line. */
+ if (break_spaces)
{
- /* This character would make the line too long.
- Print the line plus a newline, and make this character
- start the next line. */
- if (break_spaces)
- {
- int space_length = 0;
- idx_t logical_end = offset_out;
- char *logical_p = line_out;
- char *logical_lim = logical_p + logical_end;
+ int space_length = 0;
+ idx_t logical_end = offset_out;
+ char *logical_p = line_out;
+ char *logical_lim = logical_p + logical_end;
- for (mcel_t g2; logical_p < logical_lim; logical_p += g2.len)
- {
- g2 = mcel_scan (logical_p, logical_lim);
- if (c32isblank (g2.ch) && ! c32isnbspace (g2.ch))
- {
- space_length = g2.len;
- logical_end = logical_p - line_out;
- }
- }
-
- if (space_length)
+ for (mcel_t g2; logical_p < logical_lim; logical_p += g2.len)
+ {
+ g2 = mcel_scan (logical_p, logical_lim);
+ if (c32isblank (g2.ch) && ! c32isnbspace (g2.ch))
{
- logical_end += space_length;
- /* Found a blank. Don't output the part after it. */
- write_out (line_out, logical_end, /*newline=*/ true);
- /* Move the remainder to the beginning of the next line.
- The areas being copied here might overlap. */
- memmove (line_out, line_out + logical_end,
- offset_out - logical_end);
- offset_out -= logical_end;
- column = 0;
- char *printed_p = line_out;
- char *printed_lim = printed_p + offset_out;
- for (mcel_t g2; printed_p < printed_lim;
- printed_p += g2.len)
- {
- g2 = mcel_scan (printed_p, printed_lim);
- column = adjust_column (column, g2);
- }
- goto rescan;
+ space_length = g2.len;
+ logical_end = logical_p - line_out;
}
}
- if (offset_out == 0)
+ if (space_length)
{
- memcpy (line_out, p, g.len);
- offset_out += g.len;
- continue;
+ logical_end += space_length;
+ /* Found a blank. Don't output the part after it. */
+ write_out (line_out, logical_end, /*newline=*/ true);
+ /* Move the remainder to the beginning of the next line.
+ The areas being copied here might overlap. */
+ memmove (line_out, line_out + logical_end,
+ offset_out - logical_end);
+ offset_out -= logical_end;
+ column = 0;
+ char *printed_p = line_out;
+ char *printed_lim = printed_p + offset_out;
+ for (mcel_t g2; printed_p < printed_lim;
+ printed_p += g2.len)
+ {
+ g2 = mcel_scan (printed_p, printed_lim);
+ column = adjust_column (column, g2);
+ }
+ goto rescan;
}
-
- write_out (line_out, offset_out, /*newline=*/ true);
- column = offset_out = 0;
- goto rescan;
}
- /* This can occur if we have read characters with a width of
- zero. */
- if (sizeof line_out <= offset_out + g.len)
+ if (offset_out == 0)
{
- write_out (line_out, offset_out, /*newline=*/ false);
- offset_out = 0;
+ memcpy (line_out, mbbuf_char_offset (&mbbuf, g), g.len);
+ offset_out += g.len;
+ continue;
}
- memcpy (line_out + offset_out, p, g.len);
- offset_out += g.len;
+ write_out (line_out, offset_out, /*newline=*/ true);
+ column = offset_out = 0;
+ goto rescan;
}
- if (feof (istream))
- break;
- /* We read a full buffer of complete characters. */
- offset_in = 0;
+ /* This can occur if we have read characters with a width of
+ zero. */
+ if (sizeof line_out <= offset_out + g.len)
+ {
+ write_out (line_out, offset_out, /*newline=*/ false);
+ offset_out = 0;
+ }
- next_line:;
+ memcpy (line_out + offset_out, mbbuf_char_offset (&mbbuf, g), g.len);
+ offset_out += g.len;
}
saved_errno = errno;