From: Breno Leitao Date: Wed, 3 Jun 2026 10:35:07 +0000 (-0700) Subject: perf bench: Add --write-size option to sched pipe X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=b52ba22c7078e1987ce0cc0a8385654cb36296e3;p=thirdparty%2Fkernel%2Flinux.git perf bench: Add --write-size option to sched pipe The default ping-pong uses sizeof(int) (4 bytes) per iteration, which exercises only the pipe-buffer merge path and keeps allocation entirely out of the picture. That makes the bench a useful scheduler / context- switch latency probe but unable to surface anything from the pipe page-allocation hot path. Add a -s/--write-size option that sets the bytes written and read per ping-pong iteration. The buffer is allocated for each side via struct thread_data and replaces the on-stack int previously used. The default remains sizeof(int) so existing invocations are unchanged. With --write-size set above PAGE_SIZE the bench drives anon_pipe_write() through alloc_page() (or the bulk pre-alloc, if the relevant patch is applied), which is what we want when measuring pipe locking and page allocation work. The bench is a ping-pong: both sides call write() before read(), so a single write_size payload must fit entirely in the pipe buffer or both sides deadlock waiting for the other to drain. Resize the pipe via F_SETPIPE_SZ to match write_size (skipped at the sizeof(int) default), and error out cleanly when the request exceeds /proc/sys/fs/pipe-max-size. Committer testing: ⬢ [acme@toolbx perf-tools-next]$ perf bench sched pipe # Running 'sched/pipe' benchmark: # Executed 1000000 pipe operations between two processes Total time: 0.915 [sec] 0.915493 usecs/op 1092307 ops/sec ⬢ [acme@toolbx perf-tools-next]$ perf bench sched pipe --write-size 1024 # Running 'sched/pipe' benchmark: # Executed 1000000 pipe operations between two processes Total time: 0.891 [sec] 0.891915 usecs/op 1121183 ops/sec ⬢ [acme@toolbx perf-tools-next]$ perf bench sched pipe --write-size 4096 # Running 'sched/pipe' benchmark: # Executed 1000000 pipe operations between two processes Total time: 1.366 [sec] 1.366073 usecs/op 732025 ops/sec ⬢ [acme@toolbx perf-tools-next]$ strace -e fcntl perf bench sched pipe --write-size 4096 # Running 'sched/pipe' benchmark: fcntl(4, F_SETPIPE_SZ, 4096) = 4096 fcntl(6, F_SETPIPE_SZ, 4096) = 4096 ^Cstrace: Process 17840 detached ⬢ [acme@toolbx perf-tools-next]$ strace -e fcntl perf bench sched pipe --write-size 1024 # Running 'sched/pipe' benchmark: fcntl(4, F_SETPIPE_SZ, 1024) = 4096 fcntl(6, F_SETPIPE_SZ, 1024) = 4096 ^Cstrace: Process 17845 detached ⬢ [acme@toolbx perf-tools-next]$ strace -e fcntl perf bench sched pipe # Running 'sched/pipe' benchmark: ^Cstrace: Process 17851 detached ⬢ [acme@toolbx perf-tools-next]$ ⬢ [acme@toolbx perf-tools-next]$ perf bench sched pipe --write-size 1048577 # Running 'sched/pipe' benchmark: --write-size 1048577 exceeds /proc/sys/fs/pipe-max-size ⬢ [acme@toolbx perf-tools-next]$ cat /proc/sys/fs/pipe-max-size 1048576 ⬢ [acme@toolbx perf-tools-next]$ acme@number:~/git/perf-tools-next$ Signed-off-by: Breno Leitao Acked-by: Namhyung Kim Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: Mark Rutland Cc: Peter Zijlstra Signed-off-by: Arnaldo Carvalho de Melo --- diff --git a/tools/perf/bench/sched-pipe.c b/tools/perf/bench/sched-pipe.c index 70139036d68f0..eb20c6d73d060 100644 --- a/tools/perf/bench/sched-pipe.c +++ b/tools/perf/bench/sched-pipe.c @@ -22,7 +22,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -39,6 +41,7 @@ struct thread_data { int epoll_fd; bool cgroup_failed; pthread_t pthread; + char *buf; }; #define LOOPS_DEFAULT 1000000 @@ -48,6 +51,7 @@ static int loops = LOOPS_DEFAULT; static bool threaded; static bool nonblocking; +static unsigned int write_size = sizeof(int); static char *cgrp_names[2]; static struct cgroup *cgrps[2]; @@ -88,6 +92,8 @@ static const struct option options[] = { OPT_BOOLEAN('n', "nonblocking", &nonblocking, "Use non-blocking operations"), OPT_INTEGER('l', "loop", &loops, "Specify number of loops"), OPT_BOOLEAN('T', "threaded", &threaded, "Specify threads/process based task setup"), + OPT_UINTEGER('s', "write-size", &write_size, + "Bytes per ping-pong write (default 4-bytes). Use larger values to exercise the pipe page-allocation path."), OPT_CALLBACK('G', "cgroups", NULL, "SEND,RECV", "Put sender and receivers in given cgroups", parse_two_cgroups), @@ -170,25 +176,77 @@ static void exit_cgroup(int nr) free(cgrp_names[nr]); } +/* Sleep until @fd is writable, so we don't busy-spin on EWOULDBLOCK. */ +static inline void wait_writable(int fd) +{ + struct pollfd pfd = { + .fd = fd, + .events = POLLOUT, + }; + + poll(&pfd, 1, -1); +} + +/* + * Loop on short read()/write(): the kernel may return fewer bytes than + * requested, retry on EINTR, and in non-blocking mode wait via poll() + * when the writer transiently hits EWOULDBLOCK while the peer is still + * draining a full pipe (capacity is sized to write_size). + */ +static inline int write_pipe(struct thread_data *td) +{ + unsigned int done = 0; + int ret; + + while (done < write_size) { + ret = write(td->pipe_write, td->buf + done, write_size - done); + if (ret < 0) { + if (errno == EINTR) + continue; + if (nonblocking && errno == EWOULDBLOCK) { + wait_writable(td->pipe_write); + continue; + } + return ret; + } + done += ret; + } + return done; +} + static inline int read_pipe(struct thread_data *td) { - int ret, m; -retry: - if (nonblocking) { - ret = epoll_wait(td->epoll_fd, &td->epoll_ev, 1, -1); - if (ret < 0) + unsigned int done = 0; + int ret; + + while (done < write_size) { + if (nonblocking) { + ret = epoll_wait(td->epoll_fd, &td->epoll_ev, 1, -1); + if (ret < 0) { + if (errno == EINTR) + continue; + return ret; + } + } + ret = read(td->pipe_read, td->buf + done, write_size - done); + if (ret < 0) { + if (errno == EINTR) + continue; + if (nonblocking && errno == EWOULDBLOCK) + continue; return ret; + } + if (ret == 0) + return done; + done += ret; } - ret = read(td->pipe_read, &m, sizeof(int)); - if (nonblocking && ret < 0 && errno == EWOULDBLOCK) - goto retry; - return ret; + return done; } static void *worker_thread(void *__tdata) { struct thread_data *td = __tdata; - int i, ret, m = 0; + int i, ret; ret = enter_cgroup(td->nr); if (ret < 0) { @@ -204,15 +262,38 @@ static void *worker_thread(void *__tdata) } for (i = 0; i < loops; i++) { - ret = write(td->pipe_write, &m, sizeof(int)); - BUG_ON(ret != sizeof(int)); + ret = write_pipe(td); + BUG_ON(ret != (int)write_size); ret = read_pipe(td); - BUG_ON(ret != sizeof(int)); + BUG_ON(ret != (int)write_size); } return NULL; } +/* + * On a custom write_size, resize the pipes so a single payload fits. + */ +static int resize_pipes(int wfd1, int wfd2) +{ + int r1, r2; + + if (write_size <= sizeof(int)) + return 0; + + r1 = fcntl(wfd1, F_SETPIPE_SZ, write_size); + r2 = fcntl(wfd2, F_SETPIPE_SZ, write_size); + if (r1 < 0 || r2 < 0 || + (unsigned int)r1 < write_size || + (unsigned int)r2 < write_size) { + fprintf(stderr, + "--write-size %u exceeds /proc/sys/fs/pipe-max-size\n", + write_size); + return -1; + } + return 0; +} + int bench_sched_pipe(int argc, const char **argv) { struct thread_data threads[2] = {}; @@ -233,12 +314,31 @@ int bench_sched_pipe(int argc, const char **argv) argc = parse_options(argc, argv, options, bench_sched_pipe_usage, 0); + /* + * The error paths below return early without closing the pipes or + * freeing the cgroup state. That is fine: bench_sched_pipe() runs + * once and the process exits right after it returns, so these are + * not real leaks. + */ + if (write_size == 0 || write_size > INT_MAX) { + fprintf(stderr, "--write-size must be in 1..%d\n", INT_MAX); + return -1; + } + if (nonblocking) flags |= O_NONBLOCK; BUG_ON(pipe2(pipe_1, flags)); BUG_ON(pipe2(pipe_2, flags)); + if (resize_pipes(pipe_1[1], pipe_2[1]) < 0) + return -1; + + for (t = 0; t < nr_threads; t++) { + threads[t].buf = calloc(1, write_size); + BUG_ON(!threads[t].buf); + } + gettimeofday(&start, NULL); for (t = 0; t < nr_threads; t++) { @@ -287,6 +387,9 @@ int bench_sched_pipe(int argc, const char **argv) gettimeofday(&stop, NULL); timersub(&stop, &start, &diff); + for (t = 0; t < nr_threads; t++) + free(threads[t].buf); + exit_cgroup(0); exit_cgroup(1);