From: Bart Van Assche Date: Mon, 1 Jul 2019 22:09:04 +0000 (-0700) Subject: Add support for the Linux io_uring system calls X-Git-Tag: VALGRIND_3_16_0~256 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=97fa86915e717dc68123f05b668b73adfe8885a9;p=thirdparty%2Fvalgrind.git Add support for the Linux io_uring system calls Man pages and test code are available in the following git repository: http://git.kernel.dk/cgit/liburing/ --- diff --git a/NEWS b/NEWS index 770a231788..5e6c49b599 100644 --- a/NEWS +++ b/NEWS @@ -57,6 +57,7 @@ where XXXXXX is the bug number as listed below. 404406 s390x: z14 miscellaneous instructions not implemented n-i-bz Fix minor one time leaks in dhat. n-i-bz Add --run-cxx-freeres=no in outer args to avoid inner crashes. +n-i-bz Add support for the Linux io_uring system calls Release 3.15.0 (12 April 2019) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/coregrind/m_syswrap/priv_syswrap-linux.h b/coregrind/m_syswrap/priv_syswrap-linux.h index 2471524609..be2f9bdde1 100644 --- a/coregrind/m_syswrap/priv_syswrap-linux.h +++ b/coregrind/m_syswrap/priv_syswrap-linux.h @@ -304,6 +304,13 @@ DECL_TEMPLATE(linux, sys_pkey_alloc); DECL_TEMPLATE(linux, sys_pkey_free); DECL_TEMPLATE(linux, sys_pkey_mprotect); +// Linux io_uring system calls. See also commit 2b188cc1bb85 ("Add io_uring IO +// interface") # v5.1. See also commit edafccee56ff ("io_uring: add support +// for pre-mapped user IO buffers") # v5.1. +DECL_TEMPLATE(linux, sys_io_uring_setup); +DECL_TEMPLATE(linux, sys_io_uring_enter); +DECL_TEMPLATE(linux, sys_io_uring_register); + /* --------------------------------------------------------------------- Wrappers for sockets and ipc-ery. 
These are split into standalone procedures because x86-linux hides them inside multiplexors diff --git a/coregrind/m_syswrap/syswrap-amd64-linux.c b/coregrind/m_syswrap/syswrap-amd64-linux.c index 2d6b95f598..382dc65cff 100644 --- a/coregrind/m_syswrap/syswrap-amd64-linux.c +++ b/coregrind/m_syswrap/syswrap-amd64-linux.c @@ -867,6 +867,10 @@ static SyscallTableEntry syscall_table[] = { LINXY(__NR_pkey_mprotect, sys_pkey_mprotect), // 329 LINX_(__NR_pkey_alloc, sys_pkey_alloc), // 330 LINX_(__NR_pkey_free, sys_pkey_free), // 331 + + LINXY(__NR_io_uring_setup, sys_io_uring_setup), // 425 + LINXY(__NR_io_uring_enter, sys_io_uring_enter), // 426 + LINXY(__NR_io_uring_register, sys_io_uring_register), // 427 }; SyscallTableEntry* ML_(get_linux_syscall_entry) ( UInt sysno ) diff --git a/coregrind/m_syswrap/syswrap-linux.c b/coregrind/m_syswrap/syswrap-linux.c index 5452b8d9c1..36d09d6e09 100644 --- a/coregrind/m_syswrap/syswrap-linux.c +++ b/coregrind/m_syswrap/syswrap-linux.c @@ -12190,6 +12190,86 @@ POST(sys_pkey_mprotect) ML_(notify_core_and_tool_of_mprotect)(addr, len, prot); } +/* io_uring_setup(entries, p): the wrapper checks that the part of *p that + precedes sq_off is readable before the call. */ +PRE(sys_io_uring_setup) +{ + PRINT("sys_io_uring_setup ( %" FMT_REGWORD "u, %#" FMT_REGWORD "x )", + ARG1, ARG2); + PRE_REG_READ2(long, "io_uring_setup", unsigned int, entries, + struct vki_io_uring_params *, p); + if (ARG2) + PRE_MEM_READ("io_uring_setup(p)", ARG2, + offsetof(struct vki_io_uring_params, sq_off)); +} + +POST(sys_io_uring_setup) +{ + vg_assert(SUCCESS); + if (!ML_(fd_allowed)(RES, "io_uring_setup", tid, True)) { + VG_(close)(RES); + SET_STATUS_Failure( VKI_EMFILE ); + } else { + /* ARG1 is the number of entries, not a path name, so the new fd is + recorded without a name. */ + if (VG_(clo_track_fds)) + ML_(record_fd_open_nameless)(tid, RES); + POST_MEM_WRITE(ARG2 + offsetof(struct vki_io_uring_params, sq_off), + sizeof(struct vki_io_sqring_offsets) + + sizeof(struct vki_io_cqring_offsets)); + } +} + +/* io_uring_enter: sig, when non-NULL, must be readable for sigsz bytes. */ +PRE(sys_io_uring_enter) +{ + PRINT("sys_io_uring_enter ( %#" FMT_REGWORD "x, %" FMT_REGWORD "u, %" + FMT_REGWORD "u, %" FMT_REGWORD "u, %#" FMT_REGWORD 
"x, %" + FMT_REGWORD "u )", + ARG1, ARG2, ARG3, ARG4, ARG5, ARG6); + PRE_REG_READ6(long, "io_uring_enter", + unsigned int, fd, unsigned int, to_submit, + unsigned int, min_complete, unsigned int, flags, + const void *, sig, unsigned long, sigsz); + if (ARG5) + PRE_MEM_READ("io_uring_enter(sig)", ARG5, ARG6); +} + +POST(sys_io_uring_enter) +{ +} + +/* io_uring_register(fd, opcode, arg, nr_args): the memory read through arg + depends on the opcode; the UNREGISTER opcodes get no memory check. */ +PRE(sys_io_uring_register) +{ + PRINT("sys_io_uring_register ( %#" FMT_REGWORD "x, %" FMT_REGWORD "u, %#" + FMT_REGWORD "x, %" FMT_REGWORD "u )", ARG1, ARG2, ARG3, ARG4); + PRE_REG_READ4(long, "io_uring_register", + unsigned int, fd, unsigned int, opcode, + void *, arg, unsigned int, nr_args); + switch (ARG2) { + case VKI_IORING_REGISTER_BUFFERS: + PRE_MEM_READ("io_uring_register(arg)", ARG3, ARG4 * sizeof(struct vki_iovec)); + break; + case VKI_IORING_UNREGISTER_BUFFERS: + break; + case VKI_IORING_REGISTER_FILES: + PRE_MEM_READ("io_uring_register(arg)", ARG3, ARG4 * sizeof(__vki_s32)); + break; + case VKI_IORING_UNREGISTER_FILES: + break; + case VKI_IORING_REGISTER_EVENTFD: + PRE_MEM_READ("io_uring_register(arg)", ARG3, sizeof(__vki_s32)); + break; + case VKI_IORING_UNREGISTER_EVENTFD: + break; + } +} + +POST(sys_io_uring_register) +{ +} #undef PRE #undef POST diff --git a/coregrind/m_syswrap/syswrap-x86-linux.c b/coregrind/m_syswrap/syswrap-x86-linux.c index 3829fa4156..9ff53a92a4 100644 --- a/coregrind/m_syswrap/syswrap-x86-linux.c +++ b/coregrind/m_syswrap/syswrap-x86-linux.c @@ -1614,7 +1614,11 @@ static SyscallTableEntry syscall_table[] = { LINXY(__NR_statx, sys_statx), // 383 /* Explicitly not supported on i386 yet. 
*/ - GENX_(__NR_arch_prctl, sys_ni_syscall) // 384 + GENX_(__NR_arch_prctl, sys_ni_syscall), // 384 + + LINXY(__NR_io_uring_setup, sys_io_uring_setup), // 425 + LINXY(__NR_io_uring_enter, sys_io_uring_enter), // 426 + LINXY(__NR_io_uring_register, sys_io_uring_register),// 427 }; SyscallTableEntry* ML_(get_linux_syscall_entry) ( UInt sysno ) diff --git a/include/pub_tool_vki.h b/include/pub_tool_vki.h index 560800d9a6..4aa36f388e 100644 --- a/include/pub_tool_vki.h +++ b/include/pub_tool_vki.h @@ -46,6 +46,7 @@ #if defined(VGO_linux) # include "vki/vki-linux.h" # include "vki/vki-linux-drm.h" +# include "vki/vki-linux-io_uring.h" #elif defined(VGO_darwin) # include "vki/vki-darwin.h" #elif defined(VGO_solaris) diff --git a/include/vki/vki-linux-io_uring.h b/include/vki/vki-linux-io_uring.h new file mode 100644 index 0000000000..5059c3010a --- /dev/null +++ b/include/vki/vki-linux-io_uring.h @@ -0,0 +1,159 @@ +/* + This file is part of Valgrind, a dynamic binary instrumentation framework. + + Copyright (C) 2019 Bart Van Assche + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, see <http://www.gnu.org/licenses/>. + + The GNU General Public License is contained in the file COPYING. 
+*/ + +#ifndef _VKI_IO_URING_H_ +#define _VKI_IO_URING_H_ + +/* Derived from linux-5.2/include/uapi/linux/io_uring.h */ + +/* + * IO submission data structure (Submission Queue Entry) + */ +struct vki_io_uring_sqe { + __vki_u8 opcode; /* type of operation for this sqe */ + __vki_u8 flags; /* IOSQE_ flags */ + __vki_u16 ioprio; /* ioprio for the request */ + __vki_s32 fd; /* file descriptor to do IO on */ + __vki_u64 off; /* offset into file */ + __vki_u64 addr; /* pointer to buffer or iovecs */ + __vki_u32 len; /* buffer size or number of iovecs */ + union { + int rw_flags; + __vki_u32 fsync_flags; + __vki_u16 poll_events; + __vki_u32 sync_range_flags; + __vki_u32 msg_flags; + }; + __vki_u64 user_data; /* data to be passed back at completion time */ + union { + __vki_u16 buf_index; /* index into fixed buffers, if used */ + __vki_u64 __pad2[3]; + }; +}; + +/* + * sqe->flags + */ +#define VKI_IOSQE_FIXED_FILE (1U << 0) /* use fixed fileset */ +#define VKI_IOSQE_IO_DRAIN (1U << 1) /* issue after inflight IO */ +#define VKI_IOSQE_IO_LINK (1U << 2) /* links next sqe */ + +/* + * io_uring_setup() flags + */ +#define VKI_IORING_SETUP_IOPOLL (1U << 0) /* io_context is polled */ +#define VKI_IORING_SETUP_SQPOLL (1U << 1) /* SQ poll thread */ +#define VKI_IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */ + +#define VKI_IORING_OP_NOP 0 +#define VKI_IORING_OP_READV 1 +#define VKI_IORING_OP_WRITEV 2 +#define VKI_IORING_OP_FSYNC 3 +#define VKI_IORING_OP_READ_FIXED 4 +#define VKI_IORING_OP_WRITE_FIXED 5 +#define VKI_IORING_OP_POLL_ADD 6 +#define VKI_IORING_OP_POLL_REMOVE 7 +#define VKI_IORING_OP_SYNC_FILE_RANGE 8 +#define VKI_IORING_OP_SENDMSG 9 +#define VKI_IORING_OP_RECVMSG 10 + +/* + * sqe->fsync_flags + */ +#define VKI_IORING_FSYNC_DATASYNC (1U << 0) + +/* + * IO completion data structure (Completion Queue Entry) + */ +struct vki_io_uring_cqe { + __vki_u64 user_data; /* sqe->data submission passed back */ + __vki_s32 res; /* result code for this event */ + __vki_u32 
flags; +}; + +/* + * Magic offsets for the application to mmap the data it needs + */ +#define VKI_IORING_OFF_SQ_RING 0ULL +#define VKI_IORING_OFF_CQ_RING 0x8000000ULL +#define VKI_IORING_OFF_SQES 0x10000000ULL + +/* + * Filled with the offset for mmap(2) + */ +struct vki_io_sqring_offsets { + __vki_u32 head; + __vki_u32 tail; + __vki_u32 ring_mask; + __vki_u32 ring_entries; + __vki_u32 flags; + __vki_u32 dropped; + __vki_u32 array; + __vki_u32 resv1; + __vki_u64 resv2; +}; + +/* + * sq_ring->flags + */ +#define VKI_IORING_SQ_NEED_WAKEUP (1U << 0) /* needs io_uring_enter wakeup */ + +struct vki_io_cqring_offsets { + __vki_u32 head; + __vki_u32 tail; + __vki_u32 ring_mask; + __vki_u32 ring_entries; + __vki_u32 overflow; + __vki_u32 cqes; + __vki_u64 resv[2]; +}; + +/* + * io_uring_enter(2) flags + */ +#define VKI_IORING_ENTER_GETEVENTS (1U << 0) +#define VKI_IORING_ENTER_SQ_WAKEUP (1U << 1) + +/* + * Passed in for io_uring_setup(2). Copied back with updated info on success + */ +struct vki_io_uring_params { + __vki_u32 sq_entries; + __vki_u32 cq_entries; + __vki_u32 flags; + __vki_u32 sq_thread_cpu; + __vki_u32 sq_thread_idle; + __vki_u32 resv[5]; + struct vki_io_sqring_offsets sq_off; + struct vki_io_cqring_offsets cq_off; +}; + +/* + * io_uring_register(2) opcodes and arguments + */ +#define VKI_IORING_REGISTER_BUFFERS 0 +#define VKI_IORING_UNREGISTER_BUFFERS 1 +#define VKI_IORING_REGISTER_FILES 2 +#define VKI_IORING_UNREGISTER_FILES 3 +#define VKI_IORING_REGISTER_EVENTFD 4 +#define VKI_IORING_UNREGISTER_EVENTFD 5 + +#endif /* _VKI_IO_URING_H_ */