From: Pádraig Brady Date: Mon, 12 Jan 2026 23:41:29 +0000 (+0000) Subject: paste: support multi-byte delimiters X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=06d228043f4bbf056d1d33f7fd2b4a39259ab5ae;p=thirdparty%2Fcoreutils.git paste: support multi-byte delimiters * src/paste.c (collapse_escapes): This is the central --delimiters parsing function, so adjust to handle multi-byte chars with mcel_scanz(). Populate a delimiters length array to support characters of differing lengths. (paste_serial): Use the delimiters length array to output the appropriate delimiter. (paste_parallel): Likewise. * tests/paste/multi-byte.sh: A new test. * tests/local.mk: Reference the new test. * NEWS: Mention the improvement. --- diff --git a/NEWS b/NEWS index 1ccc524264..b4031caa00 100644 --- a/NEWS +++ b/NEWS @@ -81,6 +81,8 @@ GNU coreutils NEWS -*- outline -*- 'du' now processes directories with 10,000 or more entries up to 9 times faster on the Lustre file system. + 'paste' now supports multi-byte --delimiters characters. + 'pinky' will now exit immediately upon receiving a write error, which is significant when reading large plan or project files. diff --git a/src/paste.c b/src/paste.c index f48f57f6be..01ed596e23 100644 --- a/src/paste.c +++ b/src/paste.c @@ -42,6 +42,7 @@ #include #include "system.h" #include "fadvise.h" +#include "mcel.h" /* The official name of this program (e.g., no 'g' prefix). */ #define PROGRAM_NAME "paste" @@ -50,9 +51,6 @@ proper_name ("David M. Ihnat"), \ proper_name ("David MacKenzie") -/* Indicates that no delimiter should be added in the current position. */ -#define EMPTY_DELIM '\0' - /* If nonzero, we have read standard input at some point. */ static bool have_read_stdin; @@ -60,11 +58,16 @@ static bool have_read_stdin; corresponding lines from each file in parallel. */ static bool serial_merge; -/* The delimiters between lines of input files (used cyclically). 
*/ +/* The delimiters between lines of input files (used cyclically). + This stores the raw bytes of all delimiters concatenated. */ static char *delims; -/* A pointer to the character after the end of 'delims'. */ -static char const *delim_end; +/* Length of each delimiter in bytes (supports multi-byte characters). + A length of 0 indicates no delimiter at this position (from \0 escape). */ +static size_t *delim_lens; + +/* Number of delimiters. */ +static idx_t num_delims; static unsigned char line_delim = '\n'; @@ -78,10 +81,10 @@ static struct option const longopts[] = {nullptr, 0, nullptr, 0} }; -/* Set globals delims and delim_end. Copy STRPTR to DELIMS, converting - backslash representations of special characters in STRPTR to their actual - values. The set of possible backslash characters has been expanded beyond - that recognized by the Unix version. +/* Set globals delims, delim_lens, and num_delims. + Process STRPTR converting backslash representations of special characters + to their actual values. The set of possible backslash characters has been + expanded beyond that recognized by the Unix version. Return 0 upon success. If the string ends in an odd number of backslashes, ignore the final backslash and return nonzero. */ @@ -93,62 +96,65 @@ collapse_escapes (char const *strptr) bool backslash_at_end = false; delims = strout; + delim_lens = xnmalloc (MAX (1, strlen (strptr)), sizeof *delim_lens); + + char const *s = strptr; + idx_t idx = 0; - while (*strptr) + while (*s) { - if (*strptr != '\\') /* Is it an escape character? */ - *strout++ = *strptr++; /* No, just transfer it. 
*/ - else + if (*s == '\\') { - switch (*++strptr) + s++; + if (*s == '\0') { - case '0': - *strout++ = EMPTY_DELIM; - break; - - case 'b': - *strout++ = '\b'; - break; - - case 'f': - *strout++ = '\f'; - break; - - case 'n': - *strout++ = '\n'; - break; - - case 'r': - *strout++ = '\r'; + backslash_at_end = true; break; + } + else if (*s == '0') + { + /* Empty delimiter at this position. */ + s++; + delim_lens[idx++] = 0; + } + else + { + switch (*s) + { + case 'b': *strout++ = '\b'; break; + case 'f': *strout++ = '\f'; break; + case 'n': *strout++ = '\n'; break; + case 'r': *strout++ = '\r'; break; + case 't': *strout++ = '\t'; break; + case 'v': *strout++ = '\v'; break; + case '\\': *strout++ = '\\'; break; + default: goto copy_character; + } - case 't': - *strout++ = '\t'; - break; + s++; + delim_lens[idx++] = 1; + } - case 'v': - *strout++ = '\v'; - break; + continue; + } - case '\\': - *strout++ = '\\'; - break; + copy_character: + mcel_t g = mcel_scanz (s); + strout = mempcpy (strout, s, g.len); + s += g.len; + delim_lens[idx++] = g.len; + } - case '\0': - backslash_at_end = true; - goto done; + *strout = '\0'; - default: - *strout++ = *strptr; - break; - } - strptr++; - } + if (idx == 0) + { + delim_lens[0] = 0; + idx = 1; } - done: + num_delims = idx; - delim_end = strout; return backslash_at_end ? 1 : 0; } @@ -161,6 +167,16 @@ xputchar (char c) write_error (); } +/* Output the delimiter at DELIMPTR with length LEN. + If LEN is 0, nothing is output (empty delimiter from \0 escape). */ + +static inline void +output_delim (char const *delimptr, size_t len) +{ + if (len > 0 && fwrite (delimptr, 1, len, stdout) != len) + write_error (); +} + /* Perform column paste on the NFILES files named in FNAMPTR. Return true if successful, false if one or more files could not be opened or read. 
*/ @@ -171,9 +187,9 @@ paste_parallel (size_t nfiles, char **fnamptr) bool ok = true; /* If all files are just ready to be closed, or will be on this round, the string of delimiters must be preserved. - delbuf[0] through delbuf[nfiles] - store the delimiters for closed files. */ - char *delbuf = xmalloc (nfiles + 2); + delbuf stores the delimiter bytes for closed files. + Size it to hold up to (nfiles - 1) delimiters. */ + char *delbuf = xmalloc ((nfiles - 1) * MB_CUR_MAX + 1); /* Streams open to the files to process; null if the corresponding stream is closed. */ @@ -218,8 +234,9 @@ paste_parallel (size_t nfiles, char **fnamptr) { /* Set up for the next line. */ bool somedone = false; - char const *delimptr = delims; - size_t delims_saved = 0; /* Number of delims saved in 'delbuf'. */ + idx_t delimidx = 0; /* Current delimiter index. */ + idx_t delimoff = 0; /* Current offset into delims. */ + idx_t delims_saved = 0; /* Bytes saved in 'delbuf'. */ for (size_t i = 0; i < nfiles && files_open; i++) { @@ -292,10 +309,18 @@ paste_parallel (size_t nfiles, char **fnamptr) else { /* Closed file; add delimiter to 'delbuf'. 
*/ - if (*delimptr != EMPTY_DELIM) - delbuf[delims_saved++] = *delimptr; - if (++delimptr == delim_end) - delimptr = delims; + size_t len = delim_lens[delimidx]; + if (len > 0) + { + memcpy (delbuf + delims_saved, delims + delimoff, len); + delims_saved += len; + } + delimoff += len; + if (++delimidx == num_delims) + { + delimidx = 0; + delimoff = 0; + } } } else @@ -308,10 +333,13 @@ paste_parallel (size_t nfiles, char **fnamptr) { if (chr != line_delim && chr != EOF) xputchar (chr); - if (*delimptr != EMPTY_DELIM) - xputchar (*delimptr); - if (++delimptr == delim_end) - delimptr = delims; + output_delim (delims + delimoff, delim_lens[delimidx]); + delimoff += delim_lens[delimidx]; + if (++delimidx == num_delims) + { + delimidx = 0; + delimoff = 0; + } } else { @@ -337,7 +365,6 @@ paste_serial (size_t nfiles, char **fnamptr) { bool ok = true; /* false if open or read errors occur. */ int charnew, charold; /* Current and previous char read. */ - char const *delimptr; /* Current delimiter char. */ FILE *fileptr; /* Open for reading current file. */ for (; nfiles; nfiles--, fnamptr++) @@ -361,7 +388,8 @@ paste_serial (size_t nfiles, char **fnamptr) fadvise (fileptr, FADVISE_SEQUENTIAL); } - delimptr = delims; /* Set up for delimiter string. */ + idx_t delimidx = 0; /* Current delimiter index. */ + idx_t delimoff = 0; /* Current offset into delims. */ charold = getc (fileptr); saved_errno = errno; @@ -378,11 +406,13 @@ paste_serial (size_t nfiles, char **fnamptr) /* Process the old character. 
*/ if (charold == line_delim) { - if (*delimptr != EMPTY_DELIM) - xputchar (*delimptr); - - if (++delimptr == delim_end) - delimptr = delims; + output_delim (delims + delimoff, delim_lens[delimidx]); + delimoff += delim_lens[delimidx]; + if (++delimidx == num_delims) + { + delimidx = 0; + delimoff = 0; + } } else xputchar (charold); @@ -520,6 +550,7 @@ main (int argc, char **argv) (nfiles, &argv[optind])); free (delims); + free (delim_lens); if (have_read_stdin && fclose (stdin) == EOF) error (EXIT_FAILURE, errno, "-"); diff --git a/tests/local.mk b/tests/local.mk index 01db536782..4bae33c6ec 100644 --- a/tests/local.mk +++ b/tests/local.mk @@ -377,9 +377,10 @@ all_tests = \ tests/od/od-j.sh \ tests/od/od-multiple-t.sh \ tests/od/od-x8.sh \ - tests/misc/paste.pl \ tests/misc/pathchk.sh \ tests/misc/printenv.sh \ + tests/paste/paste.pl \ + tests/paste/multi-byte.sh \ tests/printf/printf.sh \ tests/printf/printf-cov.pl \ tests/printf/printf-hex.sh \ diff --git a/tests/paste/multi-byte.sh b/tests/paste/multi-byte.sh new file mode 100755 index 0000000000..d0749d47d6 --- /dev/null +++ b/tests/paste/multi-byte.sh @@ -0,0 +1,103 @@ +#!/bin/sh +# Test multi-byte delimiter handling in paste + +# Copyright (C) 2026 Free Software Foundation, Inc. + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. + +. 
"${srcdir=.}/tests/init.sh"; path_prepend_ ./src +print_ver_ paste printf + +test "$LOCALE_FR_UTF8" != none || skip_ 'French UTF-8 locale not available' + +LC_ALL=$LOCALE_FR_UTF8 +export LC_ALL + +# UTF-8 test: 2-byte character (e.g., cent sign) +delim_cent=$(env printf '\xc2\xa2') +# UTF-8 test: 3-byte character (e.g., euro sign) +delim_euro=$(env printf '\xe2\x82\xac') +# UTF-8 test: 4-byte character (e.g., emoji: U+1F600) +delim_emoji=$(env printf '\xf0\x9f\x98\x80') + +printf '1\n2\n' > f1 || framework_failure_ +printf 'a\nb\n' > f2 || framework_failure_ + +# Test parallel mode with multi-byte delimiters +for delim in "$delim_cent" "$delim_euro" "$delim_emoji"; do + paste -d "$delim" f1 f2 > out || fail=1 + printf "1${delim}a\n2${delim}b\n" > exp || framework_failure_ + compare exp out || fail=1 +done + +# Test serial mode with multi-byte delimiters +printf '1\n2\n3\n' > f3 || framework_failure_ +for delim in "$delim_cent" "$delim_euro"; do + paste -s -d "$delim" f3 > out || fail=1 + printf "1${delim}2${delim}3\n" > exp || framework_failure_ + compare exp out || fail=1 +done + +# Test multiple multi-byte delimiters cycling +printf 'a\nb\nc\n' > f4 || framework_failure_ +printf '1\n2\n3\n' > f5 || framework_failure_ +printf 'x\ny\nz\n' > f6 || framework_failure_ +paste -d "${delim_cent}${delim_euro}" f4 f5 f6 > out || fail=1 +printf "a${delim_cent}1${delim_euro}x\n" > exp || framework_failure_ +printf "b${delim_cent}2${delim_euro}y\n" >> exp || framework_failure_ +printf "c${delim_cent}3${delim_euro}z\n" >> exp || framework_failure_ +compare exp out || fail=1 + +# Test multi-byte delimiters mixed with empty delimiter (\0) +paste -s -d "${delim_euro}\\0" f3 > out || fail=1 +printf "1${delim_euro}23\n" > exp || framework_failure_ +compare exp out || fail=1 + +# Test invalid UTF-8 sequences are still passed through +delims_invalid=$(bad_unicode) +delim_invalid=$(env printf '%s' "$delims_invalid" | cut -b1) +paste -d "$delims_invalid" f1 f2 > out || fail=1 +printf 
"1${delim_invalid}a\n2${delim_invalid}b\n" > exp || framework_failure_ +compare exp out || fail=1 + +# Test that \<delim> is treated like <delim> +# (unknown escapes pass through the escaped character) +paste -d "\\${delim_euro}" f1 f2 > out || fail=1 +paste -d "$delim_euro" f1 f2 > exp || fail=1 +compare exp out || fail=1 + + +# Test GB18030 encoding if available +export LC_ALL=zh_CN.gb18030 + +if test "$(locale charmap 2>/dev/null | sed 's/gb/GB/')" = GB18030; then + # GB18030 2-byte character (e.g., 0xA2 0xE3 is a valid GB18030 char) + delim_gb18030=$(env printf '\xa2\xe3') + + paste -d "$delim_gb18030" f1 f2 > out || fail=1 + printf "1${delim_gb18030}a\n2${delim_gb18030}b\n" > exp || framework_failure_ + compare exp out || fail=1 + + paste -s -d "$delim_gb18030" f3 > out || fail=1 + printf "1${delim_gb18030}2${delim_gb18030}3\n" > exp || framework_failure_ + compare exp out || fail=1 + + # Note 0xFF is invalid in GB18030, but we support all single byte delimiters + delim_ff=$(env printf '\xff') + paste -d "$delim_ff" f1 f2 > out || fail=1 + printf "1${delim_ff}a\n2${delim_ff}b\n" > exp || framework_failure_ + compare exp out || fail=1 +fi + +Exit $fail diff --git a/tests/misc/paste.pl b/tests/paste/paste.pl similarity index 100% rename from tests/misc/paste.pl rename to tests/paste/paste.pl