From: Quentin Monnet Date: Wed, 4 Apr 2018 23:40:49 +0000 (+0100) Subject: Add support for bpf system call X-Git-Tag: VALGRIND_3_14_0~57 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=1d933b5a4afdbc2eff2fc27027847998b84393c5;p=thirdparty%2Fvalgrind.git Add support for bpf system call Fixes: 388786 - Support bpf syscall in amd64 Linux Add support for bpf() Linux-specific system call on amd64 platform. The bpf() syscall is used to handle eBPF objects (programs and maps), and can be used for a number of operations. It takes three arguments: - "cmd" is an integer encoding a subcommand to run. Available subcommands include loading a new program, creating a map or updating its entries, retrieving information about an eBPF object, and many others. - "attr" is a pointer to an object of type union bpf_attr. This object corresponds to a struct related to the selected subcommand, and embeds the various parameters used with this subcommand. Some of those parameters are read by the kernel (example for an eBPF map lookup: the key of the entry to look up), others are written into (the value retrieved from the map lookup). - "attr_size" is the size of the object pointed to by "attr". Since the action performed by the kernel, and the way "attr" attributes are processed, depend on the subcommand in use, the PRE() and POST() wrappers need to make the distinction as well. For each subcommand, mark the attributes that are read or written. For some map operations, the only way to infer the size of the memory areas used for read or write operations seems to involve reading from /proc/<pid>/fdinfo/<fd> in order to retrieve the size of keys and values for this map. The definitions of union bpf_attr and of other eBPF-related elements required for adequately performing the checks were added to the Linux header file. Processing related to file descriptors is added in a follow-up patch. 
--- diff --git a/NEWS b/NEWS index e1bb4adf04..dcbfce621f 100644 --- a/NEWS +++ b/NEWS @@ -121,6 +121,7 @@ where XXXXXX is the bug number as listed below. 387766 asm shifts cause false positive "Conditional jump or move depends on uninitialised value" 387773 .gnu_debugaltlink paths resolve relative to .debug file, not symlink +388786 Support bpf syscall in amd64 Linux 388862 Add replacements for wmemchr and wcsnlen on Linux 389065 valgrind meets gcc flag -Wlogical-op 389373 exp-sgcheck the 'impossible' happened as Ist_LoadG is not instrumented diff --git a/coregrind/m_syswrap/priv_syswrap-linux.h b/coregrind/m_syswrap/priv_syswrap-linux.h index 296ef6599d..f76191adaf 100644 --- a/coregrind/m_syswrap/priv_syswrap-linux.h +++ b/coregrind/m_syswrap/priv_syswrap-linux.h @@ -295,6 +295,9 @@ DECL_TEMPLATE(linux, sys_syncfs); DECL_TEMPLATE(linux, sys_membarrier); +// Linux-specific (new in Linux 3.18) +DECL_TEMPLATE(linux, sys_bpf); + // Linux-specific (new in Linux 4.11) DECL_TEMPLATE(linux, sys_statx); diff --git a/coregrind/m_syswrap/syswrap-amd64-linux.c b/coregrind/m_syswrap/syswrap-amd64-linux.c index 9255e7bad8..60fb5ffc08 100644 --- a/coregrind/m_syswrap/syswrap-amd64-linux.c +++ b/coregrind/m_syswrap/syswrap-amd64-linux.c @@ -846,7 +846,7 @@ static SyscallTableEntry syscall_table[] = { LINXY(__NR_memfd_create, sys_memfd_create), // 319 // LIN__(__NR_kexec_file_load, sys_ni_syscall), // 320 -// LIN__(__NR_bpf, sys_ni_syscall), // 321 + LINXY(__NR_bpf, sys_bpf), // 321 LINXY(__NR_statx, sys_statx), // 332 diff --git a/coregrind/m_syswrap/syswrap-linux.c b/coregrind/m_syswrap/syswrap-linux.c index b0d65414a2..1e6f5fc4ac 100644 --- a/coregrind/m_syswrap/syswrap-linux.c +++ b/coregrind/m_syswrap/syswrap-linux.c @@ -11580,6 +11580,297 @@ PRE(sys_kcmp) } } +/* --------------------------------------------------------------------- + bpf wrappers + ------------------------------------------------------------------ */ + +static Bool bpf_map_get_sizes(Int fd, UInt *key_size, 
UInt *value_size) +{ + HChar path[32], buf[1024]; /* large enough */ + SysRes sres; + HChar *comp; + Int proc_fd; + + *key_size = 0; + *value_size = 0; + + VG_(sprintf)(path, "/proc/%d/fdinfo/%d", VG_(getpid)(), fd); + sres = VG_(open)(path, VKI_O_RDONLY, 0); + if (sr_isError(sres)) + return False; + proc_fd = sr_Res(sres); + + if (VG_(read)(proc_fd, buf, sizeof(buf)) <= 0) + return False; + VG_(close)(proc_fd); + + comp = VG_(strstr)(buf, "key_size:"); + if (comp) + *key_size = VG_(strtoull10)(comp + sizeof("key_size:"), NULL); + + comp = VG_(strstr)(buf, "value_size:"); + if (comp) + *value_size = VG_(strtoull10)(comp + sizeof("value_size:"), NULL); + + return (*key_size && *value_size); +} + +/* + * From a file descriptor for an eBPF object, try to determine the size of the + * struct that will be written, i.e. determine if object is a map or a program. + * There is no direct way to do this, so parse /proc//fdinfo/ and + * search for strings "prog_type" or "map_type". + */ +static UInt bpf_obj_get_info_size(Int fd) +{ + HChar path[32], buf[1024]; /* large enough */ + SysRes sres; + Int proc_fd; + + VG_(sprintf)(path, "/proc/%d/fdinfo/%d", VG_(getpid)(), fd); + sres = VG_(open)(path, VKI_O_RDONLY, 0); + if (sr_isError(sres)) + return 0; + proc_fd = sr_Res(sres); + + if (VG_(read)(proc_fd, buf, sizeof(buf)) <= 0) + return 0; + VG_(close)(proc_fd); + + if (VG_(strstr)(buf, "prog_type:")) + return sizeof(struct vki_bpf_prog_info); + + if (VG_(strstr)(buf, "map_type:")) + return sizeof(struct vki_bpf_map_info); + + return 0; +} + +PRE(sys_bpf) +{ + union vki_bpf_attr *attr = (union vki_bpf_attr *)(Addr)ARG2; + UInt res, key_size, value_size; + + PRE_REG_READ3(long, "bpf", + int, cmd, union vki_bpf_attr *, attr, unsigned int, size); + PRINT("bpf ( %ld, %" FMT_REGWORD "u, %" FMT_REGWORD "u )", + ARG1, ARG2, ARG3); + switch (ARG1) { + case VKI_BPF_MAP_CREATE: + case VKI_BPF_PROG_ATTACH: + case VKI_BPF_PROG_DETACH: + case VKI_BPF_PROG_GET_NEXT_ID: + case 
VKI_BPF_MAP_GET_NEXT_ID: + case VKI_BPF_PROG_GET_FD_BY_ID: + case VKI_BPF_MAP_GET_FD_BY_ID: + case VKI_BPF_BTF_GET_FD_BY_ID: + break; + case VKI_BPF_MAP_LOOKUP_ELEM: + /* Perform a lookup on an eBPF map. Read key, write value. */ + if (ML_(safe_to_deref)(attr, ARG3) && + attr->key != 0 && attr->value != 0) { + /* Get size of key and value for this map. */ + if (bpf_map_get_sizes(attr->map_fd, &key_size, &value_size)) { + PRE_MEM_READ("bpf(attr->key)", attr->key, key_size); + PRE_MEM_WRITE("bpf(attr->value)", attr->value, value_size); + } + } + break; + case VKI_BPF_MAP_UPDATE_ELEM: + /* Add or update a map element in kernel. Read key, read value. */ + if (ML_(safe_to_deref)(attr, ARG3) && + attr->key != 0 && attr->value != 0) { + /* Get size of key and value for this map. */ + if (bpf_map_get_sizes(attr->map_fd, &key_size, &value_size)) { + PRE_MEM_READ("bpf(attr->key)", attr->key, key_size); + PRE_MEM_READ("bpf(attr->value)", attr->value, value_size); + } + } + break; + case VKI_BPF_MAP_DELETE_ELEM: + /* Delete a map element in kernel. Read key from user space. */ + if (ML_(safe_to_deref)(attr, ARG3) && attr->key != 0) { + /* Get size of key for this map. */ + if (bpf_map_get_sizes(attr->map_fd, &key_size, &value_size)) + PRE_MEM_READ("bpf(attr->key)", attr->key, key_size); + } + break; + case VKI_BPF_MAP_GET_NEXT_KEY: + /* From a key, get next key for the map. Read key, write next key. */ + if (ML_(safe_to_deref)(attr, ARG3) && + attr->key != 0 && attr->next_key != 0) { + /* Get size of key for this map. */ + if (bpf_map_get_sizes(attr->map_fd, &key_size, &value_size)) { + PRE_MEM_READ("bpf(attr->key)", attr->key, key_size); + PRE_MEM_WRITE("bpf(attr->next_key)", attr->next_key, key_size); + } + } + break; + case VKI_BPF_PROG_LOAD: + /* Load a program into the kernel from an array of instructions. */ + if (ML_(safe_to_deref)(attr, ARG3)) { + /* Read instructions, license, program name. 
*/ + PRE_MEM_READ("bpf(attr->insns)", attr->insns, + attr->insn_cnt * sizeof(struct vki_bpf_insn)); + /* License is limited to 128 characters in kernel/bpf/syscall.c. */ + pre_asciiz_str(tid, attr->license, 128, "bpf(attr->license)"); + pre_asciiz_str(tid, (unsigned long int)attr->prog_name, + VKI_BPF_OBJ_NAME_LEN, "bpf(attr->prog_name)"); + /* Possibly write up to log_len into user space log buffer. */ + if (attr->log_level && attr->log_size > 128 && attr->log_buf != 0) + PRE_MEM_WRITE("bpf(attr->log_buf)", + attr->log_buf, attr->log_size); + } + break; + case VKI_BPF_OBJ_PIN: + /* Pin eBPF program or map to given location under /sys/fs/bpf/. */ + /* fall through */ + case VKI_BPF_OBJ_GET: + /* Get pinned eBPF program or map. Read path name. */ + if (ML_(safe_to_deref)(attr, ARG3)) + pre_asciiz_str(tid, attr->pathname, VKI_BPF_OBJ_NAME_LEN, + "bpf(attr->pathname)"); + break; + case VKI_BPF_PROG_TEST_RUN: + /* Test prog. Read data_in, write up to data_size_out to data_out. */ + if (ML_(safe_to_deref)(attr, ARG3) && + attr->test.data_in != 0 && attr->test.data_out != 0) { + PRE_MEM_READ("bpf(attr->test.data_in)", + attr->test.data_in, attr->test.data_size_in); + /* + * TODO: Kernel writes to data_out but we do not know the size yet. + * PRE_MEM_WRITE("bpf(attr->test.data_out)", + * attr->test.data_out, ??); + */ + } + break; + case VKI_BPF_OBJ_GET_INFO_BY_FD: + /* Get info for eBPF map or program. Write info. */ + if (ML_(safe_to_deref)(attr, ARG3) && attr->info.info != 0) { + /* Get size of struct to write: is object a program or a map? */ + res = bpf_obj_get_info_size(attr->info.bpf_fd); + if (res) + PRE_MEM_WRITE("bpf(attr->info.info)", attr->info.info, + VG_MIN(attr->info.info_len, res)); + else + PRE_MEM_WRITE("bpf(attr->info.info)", attr->info.info, + VG_MIN(attr->info.info_len, + VG_MAX(sizeof(struct vki_bpf_prog_info), + sizeof(struct vki_bpf_map_info)))); + } + break; + case VKI_BPF_PROG_QUERY: + /* + * Query list of eBPF program attached to cgroup. 
+ * Write array of ids (up to attr->query.prog_cnt u32-long ids). + */ + if (ML_(safe_to_deref)(attr, ARG3) && attr->query.prog_ids != 0) + PRE_MEM_WRITE("bpf(attr->query.prog_ids)", attr->query.prog_ids, + attr->query.prog_cnt * sizeof(__vki_u32)); + break; + case VKI_BPF_RAW_TRACEPOINT_OPEN: + /* Open raw tracepoint. Read tracepoint name. */ + if (ML_(safe_to_deref)(attr, ARG3)) { + /* Name is limited to 128 characters in kernel/bpf/syscall.c. */ + pre_asciiz_str(tid, attr->raw_tracepoint.name, 128, + "bpf(attr->raw_tracepoint.name)"); + } + break; + case VKI_BPF_BTF_LOAD: + /* Load BTF information about a program into the kernel. */ + if (ML_(safe_to_deref)(attr, ARG3)) { + /* Read BTF data. */ + PRE_MEM_READ("bpf(attr->btf)", attr->btf, attr->btf_size); + /* Possibly write up to btf_log_len into user space log buffer. */ + if (attr->btf_log_level && attr->btf_log_size > 128 && + attr->btf_log_buf != 0) + PRE_MEM_WRITE("bpf(attr->btf_log_buf)", + attr->btf_log_buf, attr->btf_log_size); + } + case VKI_BPF_TASK_FD_QUERY: + /* Get info about the task. Write collected info. */ + if (ML_(safe_to_deref)(attr, ARG3)) { + if (attr->task_fd_query.buf_len > 0) { + /* Write task or perf event name. */ + PRE_MEM_WRITE("bpf(attr->task_fd_query.buf)", + attr->task_fd_query.buf, + attr->task_fd_query.buf_len); + } + } + break; + default: + VG_(message)(Vg_DebugMsg, + "FATAL: unhandled eBPF command %lu\n", ARG1); + VG_(core_panic)("... 
bye!\n"); + break; + } +} + +POST(sys_bpf) +{ + union vki_bpf_attr *attr = (union vki_bpf_attr *)(Addr)ARG2; + UInt key_size, value_size; + + vg_assert(SUCCESS); + + switch (ARG1) { + case VKI_BPF_MAP_CREATE: + case VKI_BPF_MAP_UPDATE_ELEM: + case VKI_BPF_MAP_DELETE_ELEM: + case VKI_BPF_OBJ_PIN: + case VKI_BPF_OBJ_GET: + case VKI_BPF_PROG_ATTACH: + case VKI_BPF_PROG_DETACH: + case VKI_BPF_PROG_GET_NEXT_ID: + case VKI_BPF_MAP_GET_NEXT_ID: + case VKI_BPF_PROG_GET_FD_BY_ID: + case VKI_BPF_MAP_GET_FD_BY_ID: + case VKI_BPF_BTF_GET_FD_BY_ID: + case VKI_BPF_RAW_TRACEPOINT_OPEN: + break; + /* + * TODO: Is there a way to pass information between PRE and POST hooks? + * To avoid querying again for the size of keys and values. + */ + case VKI_BPF_MAP_LOOKUP_ELEM: + if (bpf_map_get_sizes(attr->map_fd, &key_size, &value_size)) + POST_MEM_WRITE(attr->value, value_size); + break; + case VKI_BPF_MAP_GET_NEXT_KEY: + if (bpf_map_get_sizes(attr->map_fd, &key_size, &value_size)) + POST_MEM_WRITE(attr->next_key, key_size); + break; + case VKI_BPF_PROG_LOAD: + if (attr->log_level) + POST_MEM_WRITE(attr->log_buf, attr->log_size); + break; + case VKI_BPF_PROG_TEST_RUN: + POST_MEM_WRITE(attr->test.data_out, attr->test.data_size_out); + break; + case VKI_BPF_OBJ_GET_INFO_BY_FD: + POST_MEM_WRITE(attr->info.info, attr->info.info_len); + break; + case VKI_BPF_PROG_QUERY: + if (attr->query.prog_ids) + POST_MEM_WRITE(attr->query.prog_ids, + attr->query.prog_cnt * sizeof(__vki_u32)); + break; + case VKI_BPF_BTF_LOAD: + /* Return a file descriptor for BTF data, write into btf_log_buf. */ + if (attr->btf_log_level) + POST_MEM_WRITE(attr->btf_log_buf, attr->btf_log_size); + break; + case VKI_BPF_TASK_FD_QUERY: + POST_MEM_WRITE(attr->task_fd_query.buf, attr->task_fd_query.buf_len); + break; + default: + VG_(message)(Vg_DebugMsg, + "FATAL: unhandled eBPF command %lu\n", ARG1); + VG_(core_panic)("... 
bye!\n"); + break; + } +} + #undef PRE #undef POST diff --git a/include/vki/vki-linux.h b/include/vki/vki-linux.h index 1beeebba11..44b683ee8d 100644 --- a/include/vki/vki-linux.h +++ b/include/vki/vki-linux.h @@ -4829,6 +4829,207 @@ struct vki_blk_zone_range { #define VKI_BLKREPORTZONE _VKI_IOWR(0x12, 130, struct vki_blk_zone_report) #define VKI_BLKRESETZONE _VKI_IOW(0x12, 131, struct vki_blk_zone_range) +//---------------------------------------------------------------------- +// From linux-4.18/include/uapi/linux/bpf.h +//---------------------------------------------------------------------- + +struct vki_bpf_insn { + __vki_u8 code; /* opcode */ + __vki_u8 dst_reg:4; /* dest register */ + __vki_u8 src_reg:4; /* source register */ + __vki_s16 off; /* signed offset */ + __vki_s32 imm; /* signed immediate constant */ +}; + +enum vki_bpf_cmd { + VKI_BPF_MAP_CREATE, + VKI_BPF_MAP_LOOKUP_ELEM, + VKI_BPF_MAP_UPDATE_ELEM, + VKI_BPF_MAP_DELETE_ELEM, + VKI_BPF_MAP_GET_NEXT_KEY, + VKI_BPF_PROG_LOAD, + VKI_BPF_OBJ_PIN, + VKI_BPF_OBJ_GET, + VKI_BPF_PROG_ATTACH, + VKI_BPF_PROG_DETACH, + VKI_BPF_PROG_TEST_RUN, + VKI_BPF_PROG_GET_NEXT_ID, + VKI_BPF_MAP_GET_NEXT_ID, + VKI_BPF_PROG_GET_FD_BY_ID, + VKI_BPF_MAP_GET_FD_BY_ID, + VKI_BPF_OBJ_GET_INFO_BY_FD, + VKI_BPF_PROG_QUERY, + VKI_BPF_RAW_TRACEPOINT_OPEN, + VKI_BPF_BTF_LOAD, + VKI_BPF_BTF_GET_FD_BY_ID, + VKI_BPF_TASK_FD_QUERY, +}; + +#define VKI_BPF_OBJ_NAME_LEN 16U + +#define __vki_aligned_u64 __vki_u64 __attribute__((aligned(8))) +union vki_bpf_attr { + struct { /* anonymous struct used by BPF_MAP_CREATE command */ + __vki_u32 map_type; /* one of enum bpf_map_type */ + __vki_u32 key_size; /* size of key in bytes */ + __vki_u32 value_size; /* size of value in bytes */ + __vki_u32 max_entries; /* max number of entries in a map */ + __vki_u32 map_flags; /* BPF_MAP_CREATE related + * flags defined above. 
+ */ + __vki_u32 inner_map_fd; /* fd pointing to the inner map */ + __vki_u32 numa_node; /* numa node (effective only if + * BPF_F_NUMA_NODE is set). + */ + char map_name[VKI_BPF_OBJ_NAME_LEN]; + __vki_u32 map_ifindex; /* ifindex of netdev to create on */ + __vki_u32 btf_fd; /* fd pointing to a BTF type data */ + __vki_u32 btf_key_type_id; /* BTF type_id of the key */ + __vki_u32 btf_value_type_id; /* BTF type_id of the value */ + }; + + struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ + __vki_u32 map_fd; + __vki_aligned_u64 key; + union { + __vki_aligned_u64 value; + __vki_aligned_u64 next_key; + }; + __vki_u64 flags; + }; + + struct { /* anonymous struct used by BPF_PROG_LOAD command */ + __vki_u32 prog_type; /* one of enum bpf_prog_type */ + __vki_u32 insn_cnt; + __vki_aligned_u64 insns; + __vki_aligned_u64 license; + __vki_u32 log_level; /* verbosity level of verifier */ + __vki_u32 log_size; /* size of user buffer */ + __vki_aligned_u64 log_buf; /* user supplied buffer */ + __vki_u32 kern_version; /* checked when prog_type=kprobe */ + __vki_u32 prog_flags; + char prog_name[VKI_BPF_OBJ_NAME_LEN]; + __vki_u32 prog_ifindex; /* ifindex of netdev to prep for */ + /* For some prog types expected attach type must be known at + * load time to verify attach type specific parts of prog + * (context accesses, allowed helpers, etc). 
+ */ + __vki_u32 expected_attach_type; + }; + + struct { /* anonymous struct used by BPF_OBJ_* commands */ + __vki_aligned_u64 pathname; + __vki_u32 bpf_fd; + __vki_u32 file_flags; + }; + + struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */ + __vki_u32 target_fd; /* container object to attach to */ + __vki_u32 attach_bpf_fd; /* eBPF program to attach */ + __vki_u32 attach_type; + __vki_u32 attach_flags; + }; + + struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ + __vki_u32 prog_fd; + __vki_u32 retval; + __vki_u32 data_size_in; + __vki_u32 data_size_out; + __vki_aligned_u64 data_in; + __vki_aligned_u64 data_out; + __vki_u32 repeat; + __vki_u32 duration; + } test; + + struct { /* anonymous struct used by BPF_*_GET_*_ID */ + union { + __vki_u32 start_id; + __vki_u32 prog_id; + __vki_u32 map_id; + __vki_u32 btf_id; + }; + __vki_u32 next_id; + __vki_u32 open_flags; + }; + + struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */ + __vki_u32 bpf_fd; + __vki_u32 info_len; + __vki_aligned_u64 info; + } info; + + struct { /* anonymous struct used by BPF_PROG_QUERY command */ + __vki_u32 target_fd; /* container object to query */ + __vki_u32 attach_type; + __vki_u32 query_flags; + __vki_u32 attach_flags; + __vki_aligned_u64 prog_ids; + __vki_u32 prog_cnt; + } query; + + struct { + __vki_u64 name; + __vki_u32 prog_fd; + } raw_tracepoint; + + struct { /* anonymous struct for BPF_BTF_LOAD */ + __vki_aligned_u64 btf; + __vki_aligned_u64 btf_log_buf; + __vki_u32 btf_size; + __vki_u32 btf_log_size; + __vki_u32 btf_log_level; + }; + + struct { + __vki_u32 pid; /* input: pid */ + __vki_u32 fd; /* input: fd */ + __vki_u32 flags; /* input: flags */ + __vki_u32 buf_len; /* input/output: buf len */ + __vki_aligned_u64 buf; /* input/output: + * tp_name for tracepoint + * symbol for kprobe + * filename for uprobe + */ + __vki_u32 prog_id; /* output: prod_id */ + __vki_u32 fd_type; /* output: BPF_FD_TYPE_* */ + __vki_u64 probe_offset; /* output: 
probe_offset */ + __vki_u64 probe_addr; /* output: probe_addr */ + } task_fd_query; +} __attribute__((aligned(8))); + +#define VKI_BPF_TAG_SIZE 8 + +struct vki_bpf_prog_info { + __vki_u32 type; + __vki_u32 id; + __vki_u8 tag[VKI_BPF_TAG_SIZE]; + __vki_u32 jited_prog_len; + __vki_u32 xlated_prog_len; + __vki_aligned_u64 jited_prog_insns; + __vki_aligned_u64 xlated_prog_insns; + __vki_u64 load_time; /* ns since boottime */ + __vki_u32 created_by_uid; + __vki_u32 nr_map_ids; + __vki_aligned_u64 map_ids; + char name[VKI_BPF_OBJ_NAME_LEN]; + __vki_u32 ifindex; + __vki_u64 netns_dev; + __vki_u64 netns_ino; +} __attribute__((aligned(8))); + +struct vki_bpf_map_info { + __vki_u32 type; + __vki_u32 id; + __vki_u32 key_size; + __vki_u32 value_size; + __vki_u32 max_entries; + __vki_u32 map_flags; + char name[VKI_BPF_OBJ_NAME_LEN]; + __vki_u32 ifindex; + __vki_u64 netns_dev; + __vki_u64 netns_ino; +} __attribute__((aligned(8))); + /*--------------------------------------------------------------------*/ /*--- end ---*/ /*--------------------------------------------------------------------*/