From: Pádraig Brady Date: Sat, 7 Mar 2026 14:23:38 +0000 (+0000) Subject: yes: use a zero-copy implementation via (vm)splice X-Git-Tag: v9.11~177 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=2b1c059e6a06eebbb721d010b1221ec54200cc33;p=thirdparty%2Fcoreutils.git yes: use a zero-copy implementation via (vm)splice A good reference for the concepts used here is: https://mazzo.li/posts/fast-pipes.html We don't consider huge pages or busy loops here, but use vmsplice() and splice() to get significant speedups: i7-5600U-laptop $ taskset 1 yes | taskset 2 pv > /dev/null ... [4.98GiB/s] i7-5600U-laptop $ taskset 1 src/yes | taskset 2 pv > /dev/null ... [34.1GiB/s] IBM,9043-MRX $ taskset 1 yes | taskset 2 pv > /dev/null ... [11.6GiB/s] IBM,9043-MRX $ taskset 1 src/yes | taskset 2 pv > /dev/null ... [175GiB/s] Also throughput to file (on BTRFS) was seen to increase significantly, with a Fedora 43 laptop improving from 690MiB/s to 1.1GiB/s. * bootstrap.conf: Ensure sys/uio.h is present. This was an existing transitive dependency. * m4/jm-macros.m4: Define HAVE_SPLICE appropriately. We assume vmsplice() is available if splice() is, as they were introduced at the same time to Linux and glibc. * src/yes.c (repeat_pattern): A new function to efficiently duplicate a pattern in a buffer with memcpy calls that double in size. This also makes the setup for the existing write() path more efficient. (pipe_splice_size): A new function to increase the kernel pipe buffer if possible, and use an appropriately sized buffer based on that (25%). (splice_write): A new function to call vmsplice() when outputting to a pipe, and also splice() if outputting to a non-pipe. (main): Adjust to always calling write on the minimal buffer first, then trying vmsplice(), then falling back to write from bigger buffer. * tests/misc/yes.sh: Verify the non-pipe output case, and the vmsplice() fallback to write() case. * NEWS: Mention the improvement. 
--- diff --git a/NEWS b/NEWS index 666968c86e..a42670f8c6 100644 --- a/NEWS +++ b/NEWS @@ -54,6 +54,9 @@ GNU coreutils NEWS -*- outline -*- 'wc -l' now operates up to three times faster on hosts that support Neon instructions. + 'yes' now uses zero-copy I/O on Linux to significantly increase throughput. + E.g., increases from 12GiB/s to 175GiB/s were seen on some systems. + ** Build-related ./configure --enable-single-binary=hardlinks is now supported on systems diff --git a/bootstrap.conf b/bootstrap.conf index 331382841c..5f15a82f90 100644 --- a/bootstrap.conf +++ b/bootstrap.conf @@ -287,6 +287,7 @@ gnulib_modules=" sys_resource-h sys_stat-h sys_types-h + sys_uio-h sys_wait-h targetdir tempname diff --git a/m4/jm-macros.m4 b/m4/jm-macros.m4 index 53be2e3427..fbf0827039 100644 --- a/m4/jm-macros.m4 +++ b/m4/jm-macros.m4 @@ -70,6 +70,7 @@ AC_DEFUN([coreutils_MACROS], setgroups sethostname siginterrupt + splice sync sysinfo tcgetpgrp diff --git a/src/yes.c b/src/yes.c index 66b6243db4..91e540eba9 100644 --- a/src/yes.c +++ b/src/yes.c @@ -19,10 +19,13 @@ #include #include #include +#include #include "system.h" +#include "alignalloc.h" #include "full-write.h" +#include "isapipe.h" #include "long-options.h" /* The official name of this program (e.g., no 'g' prefix). */ @@ -54,6 +57,145 @@ Repeatedly output a line with all specified STRING(s), or 'y'.\n\ exit (status); } +/* Fill DEST[0..BUFSIZE-1] with repeated copies of SRC[0..SRCSIZE-1], + doubling the copy size each iteration. DEST may equal SRC. */ + +static void +repeat_pattern (char *dest, char const *src, idx_t srcsize, idx_t bufsize) +{ + if (dest != src) + memcpy (dest, src, srcsize); + for (idx_t filled = srcsize; filled < bufsize; ) + { + idx_t chunk = MIN (filled, bufsize - filled); + memcpy (dest + filled, dest, chunk); + filled += chunk; + } +} + +#if HAVE_SPLICE + +/* Empirically determined pipe size for best throughput. 
+ Needs to be <= /proc/sys/fs/pipe-max-size */ +enum { SPLICE_PIPE_SIZE = 512 * 1024 }; + +/* Enlarge a pipe towards SPLICE_PIPE_SIZE and return the actual + capacity as a quarter of the pipe size (the empirical sweet spot + for vmsplice throughput), rounded down to a multiple of COPYSIZE. + Return 0 if the result would be smaller than COPYSIZE. */ + +static idx_t +pipe_splice_size (int fd, idx_t copysize) +{ + int pipe_cap = 0; +# if defined F_SETPIPE_SZ && defined F_GETPIPE_SZ + if ((pipe_cap = fcntl (fd, F_SETPIPE_SZ, SPLICE_PIPE_SIZE)) < 0) + pipe_cap = fcntl (fd, F_GETPIPE_SZ); +# endif + if (pipe_cap <= 0) + pipe_cap = 64 * 1024; + + size_t buf_cap = pipe_cap / 4; + return buf_cap / copysize * copysize; +} + +#endif + +/* Repeatedly write the COPYSIZE-byte pattern in BUF to standard output + using vmsplice/splice zero-copy I/O. Since the data never varies, + SPLICE_F_GIFT tells the kernel the pages will not be modified. + + Return TRUE if splice I/O was used (caller should check errno and + report any error). Return FALSE if splice could not be used. */ + +static bool +splice_write (MAYBE_UNUSED char const *buf, MAYBE_UNUSED idx_t copysize) +{ + bool output_started = false; +#if HAVE_SPLICE + idx_t page_size = getpagesize (); + + bool stdout_is_pipe = isapipe (STDOUT_FILENO) > 0; + + /* Determine buffer size: enlarge the target pipe, + then use 1/4 of actual capacity as the transfer size. */ + int pipefd[2] = { -1, -1 }; + idx_t splice_bufsize; + char *splice_buf = NULL; + + if (stdout_is_pipe) + splice_bufsize = pipe_splice_size (STDOUT_FILENO, copysize); + else + { + if (pipe2 (pipefd, 0) < 0) + return false; + splice_bufsize = pipe_splice_size (pipefd[0], copysize); + } + + if (splice_bufsize == 0) + goto done; + + /* Allocate page-aligned buffer for vmsplice. + Needed with SPLICE_F_GIFT, but generally good for performance. */ + if (! 
(splice_buf = alignalloc (page_size, splice_bufsize))) + goto done; + + repeat_pattern (splice_buf, buf, copysize, splice_bufsize); + + /* For the pipe case, vmsplice directly to stdout. + For the non-pipe case, vmsplice into the intermediate pipe + and then splice from it to stdout. */ + int vmsplice_fd = stdout_is_pipe ? STDOUT_FILENO : pipefd[1]; + + for (;;) + { + struct iovec iov = { .iov_base = splice_buf, + .iov_len = splice_bufsize }; + + while (iov.iov_len > 0) + { + /* Use SPLICE_F_{GIFT,MOVE} to allow the kernel to take references + to the pages. I.e., we're indicating we won't make changes. + SPLICE_F_GIFT is only appropriate for full pages. */ + unsigned int flags = iov.iov_len % page_size ? 0 : SPLICE_F_GIFT; + ssize_t n = vmsplice (vmsplice_fd, &iov, 1, flags); + if (n <= 0) + goto done; + if (stdout_is_pipe) + output_started = true; + iov.iov_base = (char *) iov.iov_base + n; + iov.iov_len -= n; + } + + /* For non-pipe stdout, drain intermediate pipe to stdout. */ + if (! stdout_is_pipe) + { + idx_t remaining = splice_bufsize; + while (remaining > 0) + { + ssize_t s = splice (pipefd[0], NULL, STDOUT_FILENO, NULL, + remaining, SPLICE_F_MOVE); + if (s <= 0) + goto done; + output_started = true; + remaining -= s; + } + } + } + +done: + if (pipefd[0] >= 0) + { + int saved_errno = errno; + close (pipefd[0]); + close (pipefd[1]); + errno = saved_errno; + } + alignfree (splice_buf); +#endif + return output_started; +} + int main (int argc, char **argv) { @@ -117,18 +259,22 @@ main (int argc, char **argv) while (++operandp < operand_lim); buf[bufused - 1] = '\n'; - /* If a larger buffer was allocated, fill it by repeating the buffer - contents. */ - size_t copysize = bufused; - for (size_t copies = bufalloc / copysize; --copies; ) + idx_t copysize = bufused; + + /* Repeatedly output the buffer until there is a write error; then fail. + Do a minimal write first to check output with minimal set up cost. + If successful then set up for efficient repetition. 
*/ + if (full_write (STDOUT_FILENO, buf, copysize) == copysize + && splice_write (buf, copysize) == 0) { - memcpy (buf + bufused, buf, copysize); - bufused += copysize; + /* If a larger buffer was allocated, fill it by repeated copies. */ + bufused = bufalloc / copysize * copysize; + if (bufused > copysize) + repeat_pattern (buf, buf, copysize, bufused); + while (full_write (STDOUT_FILENO, buf, bufused) == bufused) + continue; } - /* Repeatedly output the buffer until there is a write error; then fail. */ - while (full_write (STDOUT_FILENO, buf, bufused) == bufused) - continue; error (0, errno, _("standard output")); main_exit (EXIT_FAILURE); } diff --git a/tests/misc/yes.sh b/tests/misc/yes.sh index ba340c9fab..56d89a36ad 100755 --- a/tests/misc/yes.sh +++ b/tests/misc/yes.sh @@ -19,6 +19,7 @@ . "${srcdir=.}/tests/init.sh"; path_prepend_ ./src print_ver_ yes getlimits_ +uses_strace_ # Check basic operation test "$(yes | head -n1)" = 'y' || fail=1 @@ -56,4 +57,23 @@ if test -w /dev/full && test -c /dev/full; then done fi +# Check the non pipe output case, since that is different with splice +if timeout 10 true; then + timeout .1 yes >/dev/null + test $? = 124 || fail=1 +fi + +# Ensure we fallback to write() if there is an issue with vmsplice +no_vmsplice() { strace -f -o /dev/null -e inject=vmsplice:error=ENOSYS "$@"; } +if no_vmsplice true; then + test "$(no_vmsplice yes | head -n2 | paste -s -d '')" = 'yy' || fail=1 +fi +# Ensure we fallback to write() if there is an issue with pipe2() +# For example if we don't have enough file descriptors available. +no_pipe() { strace -f -o /dev/null -e inject=pipe,pipe2:error=EMFILE "$@"; } +if no_pipe true; then + no_pipe timeout .1 yes >/dev/null + test $? = 124 || fail=1 +fi + Exit $fail