#include <sys/types.h>
#include "system.h"
#include "fadvise.h"
+#include "mcel.h"
/* The official name of this program (e.g., no 'g' prefix). */
#define PROGRAM_NAME "paste"
proper_name ("David M. Ihnat"), \
proper_name ("David MacKenzie")
-/* Indicates that no delimiter should be added in the current position. */
-#define EMPTY_DELIM '\0'
-
/* If nonzero, we have read standard input at some point. */
static bool have_read_stdin;
corresponding lines from each file in parallel. */
static bool serial_merge;
-/* The delimiters between lines of input files (used cyclically). */
+/* The delimiters between lines of input files (used cyclically).
+   This stores the raw bytes of all delimiters, concatenated in order
+   with nothing between them; delim_lens gives the length of each.  */
static char *delims;
-/* A pointer to the character after the end of 'delims'. */
-static char const *delim_end;
+/* Length of each delimiter in bytes (supports multi-byte characters).
+ A length of 0 indicates no delimiter at this position (from \0 escape). */
+static size_t *delim_lens;
+
+/* Number of delimiters. */
+static idx_t num_delims;
static unsigned char line_delim = '\n';
{nullptr, 0, nullptr, 0}
};
-/* Set globals delims and delim_end. Copy STRPTR to DELIMS, converting
- backslash representations of special characters in STRPTR to their actual
- values. The set of possible backslash characters has been expanded beyond
- that recognized by the Unix version.
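+/* Set globals delims, delim_lens, and num_delims from STRPTR.
+   Each character or backslash escape in STRPTR becomes one delimiter;
+   a character may span several bytes.  Backslash representations of
+   special characters are converted to their actual values, and \0
+   yields an empty delimiter.  The set of possible backslash characters
+   has been expanded beyond that recognized by the Unix version.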
Return 0 upon success.
If the string ends in an odd number of backslashes, ignore the
final backslash and return nonzero. */
bool backslash_at_end = false;
delims = strout;
+ delim_lens = xnmalloc (MAX (1, strlen (strptr)), sizeof *delim_lens);
+
+ char const *s = strptr;
+ idx_t idx = 0;
- while (*strptr)
+ while (*s)
{
- if (*strptr != '\\') /* Is it an escape character? */
- *strout++ = *strptr++; /* No, just transfer it. */
- else
+ if (*s == '\\')
{
- switch (*++strptr)
+ s++;
+ if (*s == '\0')
{
- case '0':
- *strout++ = EMPTY_DELIM;
- break;
-
- case 'b':
- *strout++ = '\b';
- break;
-
- case 'f':
- *strout++ = '\f';
- break;
-
- case 'n':
- *strout++ = '\n';
- break;
-
- case 'r':
- *strout++ = '\r';
+ backslash_at_end = true;
break;
+ }
+ else if (*s == '0')
+ {
+              /* \0 escape: an empty delimiter at this position, so record
+                 length 0 and store no bytes in delims.  */
+ s++;
+ delim_lens[idx++] = 0;
+ }
+ else
+ {
+ switch (*s)
+ {
+ case 'b': *strout++ = '\b'; break;
+ case 'f': *strout++ = '\f'; break;
+ case 'n': *strout++ = '\n'; break;
+ case 'r': *strout++ = '\r'; break;
+ case 't': *strout++ = '\t'; break;
+ case 'v': *strout++ = '\v'; break;
+ case '\\': *strout++ = '\\'; break;
+ default: goto copy_character;
+ }
- case 't':
- *strout++ = '\t';
- break;
+ s++;
+ delim_lens[idx++] = 1;
+ }
- case 'v':
- *strout++ = '\v';
- break;
+ continue;
+ }
- case '\\':
- *strout++ = '\\';
- break;
+ copy_character:
+ mcel_t g = mcel_scanz (s);
+ strout = mempcpy (strout, s, g.len);
+ s += g.len;
+ delim_lens[idx++] = g.len;
+ }
- case '\0':
- backslash_at_end = true;
- goto done;
+ *strout = '\0';
- default:
- *strout++ = *strptr;
- break;
- }
- strptr++;
- }
+ if (idx == 0)
+ {
+ delim_lens[0] = 0;
+ idx = 1;
}
- done:
+ num_delims = idx;
- delim_end = strout;
return backslash_at_end ? 1 : 0;
}
write_error ();
}
+/* Write the LEN bytes of the delimiter starting at DELIMPTR to stdout.
+   A LEN of 0 denotes the empty delimiter (the \0 escape), for which
+   nothing is written.  A short write is handed to write_error.  */
+
+static inline void
+output_delim (char const *delimptr, size_t len)
+{
+  /* Empty delimiter: skip the fwrite call entirely.  */
+  if (len == 0)
+    return;
+
+  size_t written = fwrite (delimptr, 1, len, stdout);
+  if (written != len)
+    write_error ();
+}
+
/* Perform column paste on the NFILES files named in FNAMPTR.
Return true if successful, false if one or more files could not be
opened or read. */
bool ok = true;
/* If all files are just ready to be closed, or will be on this
round, the string of delimiters must be preserved.
- delbuf[0] through delbuf[nfiles]
- store the delimiters for closed files. */
- char *delbuf = xmalloc (nfiles + 2);
+     delbuf stores the delimiter bytes for closed files.
+     It has room for (nfiles - 1) delimiters of up to MB_CUR_MAX bytes
+     each, plus one byte.  */
+ char *delbuf = xmalloc ((nfiles - 1) * MB_CUR_MAX + 1);
/* Streams open to the files to process; null if the corresponding
stream is closed. */
{
/* Set up for the next line. */
bool somedone = false;
- char const *delimptr = delims;
- size_t delims_saved = 0; /* Number of delims saved in 'delbuf'. */
+ idx_t delimidx = 0; /* Current delimiter index. */
+ idx_t delimoff = 0; /* Current offset into delims. */
+ idx_t delims_saved = 0; /* Bytes saved in 'delbuf'. */
for (size_t i = 0; i < nfiles && files_open; i++)
{
else
{
/* Closed file; add delimiter to 'delbuf'. */
- if (*delimptr != EMPTY_DELIM)
- delbuf[delims_saved++] = *delimptr;
- if (++delimptr == delim_end)
- delimptr = delims;
+ size_t len = delim_lens[delimidx];
+ if (len > 0)
+ {
+ memcpy (delbuf + delims_saved, delims + delimoff, len);
+ delims_saved += len;
+ }
+ delimoff += len;
+ if (++delimidx == num_delims)
+ {
+ delimidx = 0;
+ delimoff = 0;
+ }
}
}
else
{
if (chr != line_delim && chr != EOF)
xputchar (chr);
- if (*delimptr != EMPTY_DELIM)
- xputchar (*delimptr);
- if (++delimptr == delim_end)
- delimptr = delims;
+ output_delim (delims + delimoff, delim_lens[delimidx]);
+ delimoff += delim_lens[delimidx];
+ if (++delimidx == num_delims)
+ {
+ delimidx = 0;
+ delimoff = 0;
+ }
}
else
{
{
bool ok = true; /* false if open or read errors occur. */
int charnew, charold; /* Current and previous char read. */
- char const *delimptr; /* Current delimiter char. */
FILE *fileptr; /* Open for reading current file. */
for (; nfiles; nfiles--, fnamptr++)
fadvise (fileptr, FADVISE_SEQUENTIAL);
}
- delimptr = delims; /* Set up for delimiter string. */
+ idx_t delimidx = 0; /* Current delimiter index. */
+ idx_t delimoff = 0; /* Current offset into delims. */
charold = getc (fileptr);
saved_errno = errno;
/* Process the old character. */
if (charold == line_delim)
{
- if (*delimptr != EMPTY_DELIM)
- xputchar (*delimptr);
-
- if (++delimptr == delim_end)
- delimptr = delims;
+ output_delim (delims + delimoff, delim_lens[delimidx]);
+ delimoff += delim_lens[delimidx];
+ if (++delimidx == num_delims)
+ {
+ delimidx = 0;
+ delimoff = 0;
+ }
}
else
xputchar (charold);
(nfiles, &argv[optind]));
free (delims);
+ free (delim_lens);
if (have_read_stdin && fclose (stdin) == EOF)
error (EXIT_FAILURE, errno, "-");
--- /dev/null
+#!/bin/sh
+# Test multi-byte delimiter handling in paste
+
+# Copyright (C) 2026 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ paste printf
+
+# The UTF-8 cases below run under $LOCALE_FR_UTF8; skip when it is 'none'.
+test "$LOCALE_FR_UTF8" != none || skip_ 'French UTF-8 locale not available'
+
+LC_ALL=$LOCALE_FR_UTF8
+export LC_ALL
+
+# Delimiters of increasing encoded length.
+# 2 bytes in UTF-8: U+00A2 (cent sign)
+delim_cent=$(env printf '\xc2\xa2')
+# 3 bytes in UTF-8: U+20AC (euro sign)
+delim_euro=$(env printf '\xe2\x82\xac')
+# 4 bytes in UTF-8: U+1F600 (an emoji)
+delim_emoji=$(env printf '\xf0\x9f\x98\x80')
+
+# Two-line inputs used by the parallel-mode (non -s) tests.
+printf '1\n2\n' > f1 || framework_failure_
+printf 'a\nb\n' > f2 || framework_failure_
+
+# Test parallel mode with multi-byte delimiters.
+# The delimiter is handed to printf as an argument rather than spliced
+# into the format, so its bytes are never parsed as format directives.
+for delim in "$delim_cent" "$delim_euro" "$delim_emoji"; do
+  paste -d "$delim" f1 f2 > out || fail=1
+  printf '1%sa\n2%sb\n' "$delim" "$delim" > exp || framework_failure_
+  compare exp out || fail=1
+done
+
+# Test serial mode with multi-byte delimiters, covering the 2-, 3- and
+# 4-byte delimiters.  The delimiter is handed to printf as an argument
+# rather than spliced into the format string.
+printf '1\n2\n3\n' > f3 || framework_failure_
+for delim in "$delim_cent" "$delim_euro" "$delim_emoji"; do
+  paste -s -d "$delim" f3 > out || fail=1
+  printf '1%s2%s3\n' "$delim" "$delim" > exp || framework_failure_
+  compare exp out || fail=1
+done
+
+# Test multiple multi-byte delimiters cycling
+printf 'a\nb\nc\n' > f4 || framework_failure_
+printf '1\n2\n3\n' > f5 || framework_failure_
+printf 'x\ny\nz\n' > f6 || framework_failure_
+paste -d "${delim_cent}${delim_euro}" f4 f5 f6 > out || fail=1
+# A single printf call: the '%s\n' format is reused for each expected
+# line, and the delimiter bytes stay out of the format string.
+printf '%s\n' \
+  "a${delim_cent}1${delim_euro}x" \
+  "b${delim_cent}2${delim_euro}y" \
+  "c${delim_cent}3${delim_euro}z" > exp || framework_failure_
+compare exp out || fail=1
+
+# Test multi-byte delimiters mixed with empty delimiter (\0):
+# the euro sign joins lines 1 and 2, nothing joins lines 2 and 3.
+paste -s -d "${delim_euro}\\0" f3 > out || fail=1
+printf '1%s23\n' "$delim_euro" > exp || framework_failure_
+compare exp out || fail=1
+
+# Test invalid UTF-8 sequences are still passed through.
+# Only the first byte of the invalid string is expected as the delimiter.
+# It is passed to printf as an argument, since invalid bytes do not
+# belong in a format string.
+delims_invalid=$(bad_unicode)
+delim_invalid=$(env printf '%s' "$delims_invalid" | cut -b1)
+paste -d "$delims_invalid" f1 f2 > out || fail=1
+printf '1%sa\n2%sb\n' "$delim_invalid" "$delim_invalid" > exp \
+  || framework_failure_
+compare exp out || fail=1
+
+# Test that \<multi-byte char> is treated like <multi-byte char>
+# (unknown escapes pass through the escaped character).
+# The expected output is written out literally, so this check does not
+# depend on a second run of paste being correct.
+paste -d "\\${delim_euro}" f1 f2 > out || fail=1
+printf '1%sa\n2%sb\n' "$delim_euro" "$delim_euro" > exp || framework_failure_
+compare exp out || fail=1
+
+
+# Test GB18030 encoding if available.
+# The section runs only when 'locale charmap' reports GB18030 after the
+# switch.  Delimiters are passed to printf as arguments so that their
+# bytes, including the invalid 0xFF, never appear in a format string.
+export LC_ALL=zh_CN.gb18030
+
+if test "$(locale charmap 2>/dev/null | sed 's/gb/GB/')" = GB18030; then
+  # GB18030 2-byte character (e.g., 0xA2 0xE3 is a valid GB18030 char)
+  delim_gb18030=$(env printf '\xa2\xe3')
+
+  paste -d "$delim_gb18030" f1 f2 > out || fail=1
+  printf '1%sa\n2%sb\n' "$delim_gb18030" "$delim_gb18030" > exp \
+    || framework_failure_
+  compare exp out || fail=1
+
+  paste -s -d "$delim_gb18030" f3 > out || fail=1
+  printf '1%s2%s3\n' "$delim_gb18030" "$delim_gb18030" > exp \
+    || framework_failure_
+  compare exp out || fail=1
+
+  # Note 0xFF is invalid in GB18030, but we support all single byte delimiters
+  delim_ff=$(env printf '\xff')
+  paste -d "$delim_ff" f1 f2 > out || fail=1
+  printf '1%sa\n2%sb\n' "$delim_ff" "$delim_ff" > exp || framework_failure_
+  compare exp out || fail=1
+fi
+
+Exit $fail