git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
perf bench: Add --write-size option to sched pipe
author Breno Leitao <leitao@debian.org>
Wed, 3 Jun 2026 10:35:07 +0000 (03:35 -0700)
committer Arnaldo Carvalho de Melo <acme@redhat.com>
Thu, 4 Jun 2026 13:11:04 +0000 (10:11 -0300)
The default ping-pong uses sizeof(int) (4 bytes) per iteration, which
exercises only the pipe-buffer merge path and keeps allocation entirely
out of the picture. That makes the bench a useful scheduler / context-
switch latency probe but unable to surface anything from the pipe
page-allocation hot path.

Add a -s/--write-size option that sets the bytes written and read per
ping-pong iteration. The buffer is allocated for each side via struct
thread_data and replaces the on-stack int previously used. The default
remains sizeof(int) so existing invocations are unchanged.

With --write-size set above PAGE_SIZE the bench drives anon_pipe_write()
through alloc_page() (or the bulk pre-alloc, if the relevant patch is
applied), which is what we want when measuring pipe locking and page
allocation work.

The bench is a ping-pong: both sides call write() before read(), so a
single write_size payload must fit entirely in the pipe buffer or both
sides deadlock waiting for the other to drain.

Resize the pipe via F_SETPIPE_SZ to match write_size (skipped at the
sizeof(int) default), and error out cleanly when the request exceeds
/proc/sys/fs/pipe-max-size.

Committer testing:

  ⬢ [acme@toolbx perf-tools-next]$ perf bench sched pipe
  # Running 'sched/pipe' benchmark:
  # Executed 1000000 pipe operations between two processes

     Total time: 0.915 [sec]

       0.915493 usecs/op
        1092307 ops/sec
  ⬢ [acme@toolbx perf-tools-next]$ perf bench sched pipe --write-size 1024
  # Running 'sched/pipe' benchmark:
  # Executed 1000000 pipe operations between two processes

     Total time: 0.891 [sec]

       0.891915 usecs/op
        1121183 ops/sec
  ⬢ [acme@toolbx perf-tools-next]$ perf bench sched pipe --write-size 4096
  # Running 'sched/pipe' benchmark:
  # Executed 1000000 pipe operations between two processes

     Total time: 1.366 [sec]

       1.366073 usecs/op
         732025 ops/sec
  ⬢ [acme@toolbx perf-tools-next]$ strace -e fcntl perf bench sched pipe --write-size 4096
  # Running 'sched/pipe' benchmark:
  fcntl(4, F_SETPIPE_SZ, 4096)            = 4096
  fcntl(6, F_SETPIPE_SZ, 4096)            = 4096
  ^Cstrace: Process 17840 detached

  ⬢ [acme@toolbx perf-tools-next]$ strace -e fcntl perf bench sched pipe --write-size 1024
  # Running 'sched/pipe' benchmark:
  fcntl(4, F_SETPIPE_SZ, 1024)            = 4096
  fcntl(6, F_SETPIPE_SZ, 1024)            = 4096
  ^Cstrace: Process 17845 detached

  ⬢ [acme@toolbx perf-tools-next]$ strace -e fcntl perf bench sched pipe
  # Running 'sched/pipe' benchmark:
  ^Cstrace: Process 17851 detached

  ⬢ [acme@toolbx perf-tools-next]$
  ⬢ [acme@toolbx perf-tools-next]$ perf bench sched pipe --write-size 1048577
  # Running 'sched/pipe' benchmark:
  --write-size 1048577 exceeds /proc/sys/fs/pipe-max-size
  ⬢ [acme@toolbx perf-tools-next]$ cat /proc/sys/fs/pipe-max-size
  1048576
  ⬢ [acme@toolbx perf-tools-next]$
  acme@number:~/git/perf-tools-next$

Signed-off-by: Breno Leitao <leitao@debian.org>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Clark <james.clark@linaro.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
tools/perf/bench/sched-pipe.c

index 70139036d68f09b5b52bd03baac8393d64d5b760..eb20c6d73d06044394e664061a4176bd57c67a18 100644 (file)
@@ -22,7 +22,9 @@
 #include <string.h>
 #include <errno.h>
 #include <fcntl.h>
+#include <limits.h>
 #include <assert.h>
+#include <poll.h>
 #include <sys/epoll.h>
 #include <sys/time.h>
 #include <sys/types.h>
@@ -39,6 +41,7 @@ struct thread_data {
        int                     epoll_fd;
        bool                    cgroup_failed;
        pthread_t               pthread;
+       char                    *buf;
 };
 
 #define LOOPS_DEFAULT 1000000
@@ -48,6 +51,7 @@ static        int                     loops = LOOPS_DEFAULT;
 static bool                    threaded;
 
 static bool                    nonblocking;
+static unsigned int            write_size = sizeof(int);
 static char                    *cgrp_names[2];
 static struct cgroup           *cgrps[2];
 
@@ -88,6 +92,8 @@ static const struct option options[] = {
        OPT_BOOLEAN('n', "nonblocking", &nonblocking,   "Use non-blocking operations"),
        OPT_INTEGER('l', "loop",        &loops,         "Specify number of loops"),
        OPT_BOOLEAN('T', "threaded",    &threaded,      "Specify threads/process based task setup"),
+       OPT_UINTEGER('s', "write-size", &write_size,
+                    "Bytes per ping-pong write (default 4-bytes). Use larger values to exercise the pipe page-allocation path."),
        OPT_CALLBACK('G', "cgroups", NULL, "SEND,RECV",
                     "Put sender and receivers in given cgroups",
                     parse_two_cgroups),
@@ -170,25 +176,77 @@ static void exit_cgroup(int nr)
        free(cgrp_names[nr]);
 }
 
+/* Sleep until @fd is writable, so we don't busy-spin on EWOULDBLOCK. */
+static inline void wait_writable(int fd)
+{
+       struct pollfd pfd = {
+               .fd     = fd,
+               .events = POLLOUT,
+       };
+
+       poll(&pfd, 1, -1);
+}
+
+/*
+ * Loop on short read()/write(): the kernel may return fewer bytes than
+ * requested, retry on EINTR, and in non-blocking mode wait via poll()
+ * when the writer transiently hits EWOULDBLOCK while the peer is still
+ * draining a full pipe (capacity is sized to write_size).
+ */
+static inline int write_pipe(struct thread_data *td)
+{
+       unsigned int done = 0;
+       int ret;
+
+       while (done < write_size) {
+               ret = write(td->pipe_write, td->buf + done, write_size - done);
+               if (ret < 0) {
+                       if (errno == EINTR)
+                               continue;
+                       if (nonblocking && errno == EWOULDBLOCK) {
+                               wait_writable(td->pipe_write);
+                               continue;
+                       }
+                       return ret;
+               }
+               done += ret;
+       }
+       return done;
+}
+
 static inline int read_pipe(struct thread_data *td)
 {
-       int ret, m;
-retry:
-       if (nonblocking) {
-               ret = epoll_wait(td->epoll_fd, &td->epoll_ev, 1, -1);
-               if (ret < 0)
+       unsigned int done = 0;
+       int ret;
+
+       while (done < write_size) {
+               if (nonblocking) {
+                       ret = epoll_wait(td->epoll_fd, &td->epoll_ev, 1, -1);
+                       if (ret < 0) {
+                               if (errno == EINTR)
+                                       continue;
+                               return ret;
+                       }
+               }
+               ret = read(td->pipe_read, td->buf + done, write_size - done);
+               if (ret < 0) {
+                       if (errno == EINTR)
+                               continue;
+                       if (nonblocking && errno == EWOULDBLOCK)
+                               continue;
                        return ret;
+               }
+               if (ret == 0)
+                       return done;
+               done += ret;
        }
-       ret = read(td->pipe_read, &m, sizeof(int));
-       if (nonblocking && ret < 0 && errno == EWOULDBLOCK)
-               goto retry;
-       return ret;
+       return done;
 }
 
 static void *worker_thread(void *__tdata)
 {
        struct thread_data *td = __tdata;
-       int i, ret, m = 0;
+       int i, ret;
 
        ret = enter_cgroup(td->nr);
        if (ret < 0) {
@@ -204,15 +262,38 @@ static void *worker_thread(void *__tdata)
        }
 
        for (i = 0; i < loops; i++) {
-               ret = write(td->pipe_write, &m, sizeof(int));
-               BUG_ON(ret != sizeof(int));
+               ret = write_pipe(td);
+               BUG_ON(ret != (int)write_size);
                ret = read_pipe(td);
-               BUG_ON(ret != sizeof(int));
+               BUG_ON(ret != (int)write_size);
        }
 
        return NULL;
 }
 
+/*
+ * On a custom write_size, resize the pipes so a single payload fits.
+ */
+static int resize_pipes(int wfd1, int wfd2)
+{
+       int r1, r2;
+
+       if (write_size <= sizeof(int))
+               return 0;
+
+       r1 = fcntl(wfd1, F_SETPIPE_SZ, write_size);
+       r2 = fcntl(wfd2, F_SETPIPE_SZ, write_size);
+       if (r1 < 0 || r2 < 0 ||
+           (unsigned int)r1 < write_size ||
+           (unsigned int)r2 < write_size) {
+               fprintf(stderr,
+                       "--write-size %u exceeds /proc/sys/fs/pipe-max-size\n",
+                       write_size);
+               return -1;
+       }
+       return 0;
+}
+
 int bench_sched_pipe(int argc, const char **argv)
 {
        struct thread_data threads[2] = {};
@@ -233,12 +314,31 @@ int bench_sched_pipe(int argc, const char **argv)
 
        argc = parse_options(argc, argv, options, bench_sched_pipe_usage, 0);
 
+       /*
+        * The error paths below return early without closing the pipes or
+        * freeing the cgroup state. That is fine: bench_sched_pipe() runs
+        * once and the process exits right after it returns, so these are
+        * not real leaks.
+        */
+       if (write_size == 0 || write_size > INT_MAX) {
+               fprintf(stderr, "--write-size must be in 1..%d\n", INT_MAX);
+               return -1;
+       }
+
        if (nonblocking)
                flags |= O_NONBLOCK;
 
        BUG_ON(pipe2(pipe_1, flags));
        BUG_ON(pipe2(pipe_2, flags));
 
+       if (resize_pipes(pipe_1[1], pipe_2[1]) < 0)
+               return -1;
+
+       for (t = 0; t < nr_threads; t++) {
+               threads[t].buf = calloc(1, write_size);
+               BUG_ON(!threads[t].buf);
+       }
+
        gettimeofday(&start, NULL);
 
        for (t = 0; t < nr_threads; t++) {
@@ -287,6 +387,9 @@ int bench_sched_pipe(int argc, const char **argv)
        gettimeofday(&stop, NULL);
        timersub(&stop, &start, &diff);
 
+       for (t = 0; t < nr_threads; t++)
+               free(threads[t].buf);
+
        exit_cgroup(0);
        exit_cgroup(1);