[people/ms/linux.git] / kernel / bpf / syscall.c

/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/license.h>
#include <linux/filter.h>
#include <linux/version.h>

/* list of registered map implementations: entries are added by
 * bpf_register_map_type() and searched by find_and_alloc_map()
 */
static LIST_HEAD(bpf_map_types);

/* Look up the map implementation whose type matches attr->map_type and
 * let it allocate a new map.
 *
 * Returns the new map with ->ops and ->map_type filled in, the ERR_PTR
 * value produced by the implementation's map_alloc(), or
 * ERR_PTR(-EINVAL) when no registered implementation has that type.
 */
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
	struct bpf_map_type_list *entry;
	struct bpf_map *map;

	list_for_each_entry(entry, &bpf_map_types, list_node) {
		if (entry->type != attr->map_type)
			continue;

		map = entry->ops->map_alloc(attr);
		if (!IS_ERR(map)) {
			map->ops = entry->ops;
			map->map_type = attr->map_type;
		}
		return map;
	}

	return ERR_PTR(-EINVAL);
}

/* boot time registration of different map implementations:
 * links 'tl' into the bpf_map_types list searched by find_and_alloc_map()
 */
void bpf_register_map_type(struct bpf_map_type_list *tl)
{
	list_add(&tl->list_node, &bpf_map_types);
}

/* called from workqueue: handler of the work item queued by bpf_map_put() */
static void bpf_map_free_deferred(struct work_struct *work)
{
	/* recover the map that embeds this work item */
	struct bpf_map *map = container_of(work, struct bpf_map, work);

	/* implementation dependent freeing */
	map->ops->map_free(map);
}

/* Drop one reference on 'map'. Once the refcnt reaches zero the map is
 * scheduled for freeing via workqueue, because the underlying map
 * implementation's ops->map_free() might sleep.
 */
void bpf_map_put(struct bpf_map *map)
{
	/* somebody else still holds a reference */
	if (!atomic_dec_and_test(&map->refcnt))
		return;

	INIT_WORK(&map->work, bpf_map_free_deferred);
	schedule_work(&map->work);
}

/* ->release() handler of the map fd: drops the reference on the map that
 * is stored in the file's private_data. Always returns 0.
 */
static int bpf_map_release(struct inode *inode, struct file *filp)
{
	struct bpf_map *map = filp->private_data;

	/* prog_array stores refcnt-ed bpf_prog pointers
	 * release them all when user space closes prog_array_fd
	 */
	if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY)
		bpf_fd_array_map_clear(map);

	bpf_map_put(map);

	return 0;
}

/* file operations of the anonymous inode behind a map fd; bpf_map_get()
 * compares f_op against this table to recognize map fds
 */
static const struct file_operations bpf_map_fops = {
	.release = bpf_map_release,
};

/* helper macro to check that unused fields of 'union bpf_attr' are zero.
 *
 * Expects a 'union bpf_attr *attr' in the caller's scope and a
 * CMD##_LAST_FIELD define naming the last field the command uses.
 * Evaluates to true when memchr_inv() finds a non-zero byte between the
 * end of that field and the end of the union.
 *
 * The whole expansion is parenthesized so that it stays a single operand
 * in whatever expression it is used in, e.g. !CHECK_ATTR(CMD).
 */
#define CHECK_ATTR(CMD) \
	(memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
		    sizeof(attr->CMD##_LAST_FIELD), 0, \
		    sizeof(*attr) - \
		    offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
		    sizeof(attr->CMD##_LAST_FIELD)) != NULL)

#define BPF_MAP_CREATE_LAST_FIELD max_entries
/* BPF_MAP_CREATE command, called via syscall.
 *
 * Returns the new map fd on success or a negative error: -EINVAL when
 * attr fields past max_entries are non-zero, the error reported by
 * find_and_alloc_map(), or the error reported by anon_inode_getfd().
 */
static int map_create(union bpf_attr *attr)
{
	struct bpf_map *map;
	int fd;

	if (CHECK_ATTR(BPF_MAP_CREATE))
		return -EINVAL;

	/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
	map = find_and_alloc_map(attr);
	if (IS_ERR(map))
		return PTR_ERR(map);

	/* initial reference, dropped by bpf_map_release() on fd close */
	atomic_set(&map->refcnt, 1);

	fd = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
	if (fd < 0) {
		/* failed to allocate fd */
		map->ops->map_free(map);
	}

	return fd;
}

/* Translate the result of fdget() into the bpf_map behind it.
 *
 * if error is returned, fd is released.
 * On success caller should complete fd access with matching fdput()
 *
 * Returns ERR_PTR(-EBADF) when 'f' holds no file and ERR_PTR(-EINVAL)
 * when the file is not a bpf map.
 */
struct bpf_map *bpf_map_get(struct fd f)
{
	if (!f.file)
		return ERR_PTR(-EBADF);

	if (f.file->f_op == &bpf_map_fops)
		return f.file->private_data;

	/* some other kind of file: give the fd back */
	fdput(f);
	return ERR_PTR(-EINVAL);
}

/* helper to convert user pointers passed inside __aligned_u64 fields:
 * the 64-bit value is first narrowed to unsigned long and then cast to
 * a user pointer
 */
static void __user *u64_to_ptr(__u64 val)
{
	unsigned long addr = (unsigned long) val;

	return (void __user *) addr;
}

/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value

/* BPF_MAP_LOOKUP_ELEM command: copy the value stored under the user
 * supplied key to the user buffer at attr->value.
 *
 * Returns 0 on success, -EINVAL for non-zero unused attr fields, the
 * error from bpf_map_get() for an unusable fd, -ENOMEM when a temporary
 * buffer cannot be allocated, -EFAULT when user memory cannot be copied,
 * or -ENOENT when the map has no element for the key.
 */
static int map_lookup_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	void __user *uvalue = u64_to_ptr(attr->value);
	struct bpf_map *map;
	void *key, *value, *elem;
	struct fd f;
	int err;

	if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
		return -EINVAL;

	f = fdget(attr->map_fd);
	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	key = kmalloc(map->key_size, GFP_USER);
	if (!key) {
		err = -ENOMEM;
		goto err_put;
	}

	if (copy_from_user(key, ukey, map->key_size) != 0) {
		err = -EFAULT;
		goto free_key;
	}

	value = kmalloc(map->value_size, GFP_USER);
	if (!value) {
		err = -ENOMEM;
		goto free_key;
	}

	/* copy the element out while still inside the RCU read section */
	rcu_read_lock();
	elem = map->ops->map_lookup_elem(map, key);
	if (elem)
		memcpy(value, elem, map->value_size);
	rcu_read_unlock();

	if (!elem)
		err = -ENOENT;
	else if (copy_to_user(uvalue, value, map->value_size) != 0)
		err = -EFAULT;
	else
		err = 0;

	kfree(value);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}

#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags

/* BPF_MAP_UPDATE_ELEM command: copy key and value from user space and
 * pass them, together with attr->flags, to the map's map_update_elem().
 *
 * Returns the result of map_update_elem(), or -EINVAL for non-zero
 * unused attr fields, the error from bpf_map_get() for an unusable fd,
 * -ENOMEM when a temporary buffer cannot be allocated, or -EFAULT when
 * user memory cannot be copied.
 */
static int map_update_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	void __user *uvalue = u64_to_ptr(attr->value);
	struct bpf_map *map;
	void *key, *value;
	struct fd f;
	int err;

	if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
		return -EINVAL;

	f = fdget(attr->map_fd);
	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	key = kmalloc(map->key_size, GFP_USER);
	if (!key) {
		err = -ENOMEM;
		goto err_put;
	}

	if (copy_from_user(key, ukey, map->key_size) != 0) {
		err = -EFAULT;
		goto free_key;
	}

	value = kmalloc(map->value_size, GFP_USER);
	if (!value) {
		err = -ENOMEM;
		goto free_key;
	}

	if (copy_from_user(value, uvalue, map->value_size) != 0) {
		err = -EFAULT;
		goto free_value;
	}

	/* eBPF program that use maps are running under rcu_read_lock(),
	 * therefore all map accessors rely on this fact, so do the same here
	 */
	rcu_read_lock();
	err = map->ops->map_update_elem(map, key, value, attr->flags);
	rcu_read_unlock();

free_value:
	kfree(value);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}

#define BPF_MAP_DELETE_ELEM_LAST_FIELD key

/* BPF_MAP_DELETE_ELEM command: copy the key from user space and pass it
 * to the map's map_delete_elem().
 *
 * Returns the result of map_delete_elem(), or -EINVAL for non-zero
 * unused attr fields, the error from bpf_map_get() for an unusable fd,
 * -ENOMEM when the key buffer cannot be allocated, or -EFAULT when the
 * key cannot be copied from user space.
 */
static int map_delete_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	struct bpf_map *map;
	struct fd f;
	void *key;
	int err;

	if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
		return -EINVAL;

	f = fdget(attr->map_fd);
	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	key = kmalloc(map->key_size, GFP_USER);
	if (!key) {
		err = -ENOMEM;
		goto err_put;
	}

	if (copy_from_user(key, ukey, map->key_size) != 0) {
		err = -EFAULT;
		goto free_key;
	}

	/* the map accessor is called under rcu_read_lock() */
	rcu_read_lock();
	err = map->ops->map_delete_elem(map, key);
	rcu_read_unlock();

free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}

/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key

/* BPF_MAP_GET_NEXT_KEY command: copy the key from user space, ask the
 * map's map_get_next_key() for its successor and copy that to the user
 * buffer at attr->next_key.
 *
 * Returns 0 on success, the non-zero result of map_get_next_key(),
 * -EINVAL for non-zero unused attr fields, the error from bpf_map_get()
 * for an unusable fd, -ENOMEM when a temporary buffer cannot be
 * allocated, or -EFAULT when user memory cannot be copied.
 */
static int map_get_next_key(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	void __user *unext_key = u64_to_ptr(attr->next_key);
	struct bpf_map *map;
	void *key, *next_key;
	struct fd f;
	int err;

	if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
		return -EINVAL;

	f = fdget(attr->map_fd);
	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	key = kmalloc(map->key_size, GFP_USER);
	if (!key) {
		err = -ENOMEM;
		goto err_put;
	}

	if (copy_from_user(key, ukey, map->key_size) != 0) {
		err = -EFAULT;
		goto free_key;
	}

	next_key = kmalloc(map->key_size, GFP_USER);
	if (!next_key) {
		err = -ENOMEM;
		goto free_key;
	}

	rcu_read_lock();
	err = map->ops->map_get_next_key(map, key, next_key);
	rcu_read_unlock();

	/* only hand the next key back when the map produced one */
	if (!err && copy_to_user(unext_key, next_key, map->key_size) != 0)
		err = -EFAULT;

	kfree(next_key);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}

/* list of registered program types: entries are added by
 * bpf_register_prog_type() and searched by find_prog_type()
 */
static LIST_HEAD(bpf_prog_types);

/* Bind 'prog' to the registered program type 'type': on a match the
 * type's ops are stored in prog->aux->ops and prog->type is set.
 *
 * Returns 0 on success or -EINVAL when no such type is registered.
 */
static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
{
	struct bpf_prog_type_list *entry;

	list_for_each_entry(entry, &bpf_prog_types, list_node) {
		if (entry->type != type)
			continue;

		prog->aux->ops = entry->ops;
		prog->type = type;
		return 0;
	}

	return -EINVAL;
}

/* registration of a program type: links 'tl' into the bpf_prog_types list
 * searched by find_prog_type()
 */
void bpf_register_prog_type(struct bpf_prog_type_list *tl)
{
	list_add(&tl->list_node, &bpf_prog_types);
}

/* fixup insn->imm field of bpf_call instructions:
 * if (insn->imm == BPF_FUNC_map_lookup_elem)
 *      insn->imm = bpf_map_lookup_elem - __bpf_call_base;
 * else if (insn->imm == BPF_FUNC_map_update_elem)
 *      insn->imm = bpf_map_update_elem - __bpf_call_base;
 * else ...
 *
 * Calls to BPF_FUNC_tail_call are not resolved to a function; they are
 * rewritten into a distinct opcode instead (see below).
 *
 * this function is called after eBPF program passed verification
 */
static void fixup_bpf_calls(struct bpf_prog *prog)
{
	const struct bpf_func_proto *proto;
	struct bpf_insn *insn;
	int idx;

	for (idx = 0; idx < prog->len; idx++) {
		insn = &prog->insnsi[idx];

		/* only bpf_call instructions need fixing up */
		if (insn->code != (BPF_JMP | BPF_CALL))
			continue;

		/* we reach here when program has bpf_call instructions
		 * and it passed bpf_check(), means that
		 * ops->get_func_proto must have been supplied, check it
		 */
		BUG_ON(!prog->aux->ops->get_func_proto);

		if (insn->imm == BPF_FUNC_get_route_realm)
			prog->dst_needed = 1;

		if (insn->imm == BPF_FUNC_get_prandom_u32)
			bpf_user_rnd_init_once();

		if (insn->imm == BPF_FUNC_tail_call) {
			/* mark bpf_tail_call as different opcode
			 * to avoid conditional branch in
			 * interpreter for every normal call
			 * and to prevent accidental JITing by
			 * JIT compiler that doesn't support
			 * bpf_tail_call yet
			 */
			insn->imm = 0;
			insn->code |= BPF_X;
			continue;
		}

		proto = prog->aux->ops->get_func_proto(insn->imm);
		/* all functions that have prototype and verifier allowed
		 * programs to call them, must be real in-kernel functions
		 */
		BUG_ON(!proto->func);
		insn->imm = proto->func - __bpf_call_base;
	}
}

/* drop refcnt on maps used by eBPF program and free auxiliary data:
 * every entry of aux->used_maps is put, then the array itself is freed
 */
static void free_used_maps(struct bpf_prog_aux *aux)
{
	int idx = 0;

	while (idx < aux->used_map_cnt)
		bpf_map_put(aux->used_maps[idx++]);

	kfree(aux->used_maps);
}

/* RCU callback queued by bpf_prog_put_rcu(): releases the maps used by the
 * program and frees the program itself
 */
static void __prog_put_rcu(struct rcu_head *rcu)
{
	/* recover the aux data that embeds this rcu_head */
	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);

	free_used_maps(aux);
	bpf_prog_free(aux->prog);
}

/* version of bpf_prog_put() that is called after a grace period */
void bpf_prog_put_rcu(struct bpf_prog *prog)
{
	if (atomic_dec_and_test(&prog->aux->refcnt)) {
		prog->aux->prog = prog;
		call_rcu(&prog->aux->rcu, __prog_put_rcu);
	}
}

/* Drop one reference on 'prog'. When it was the last one, the maps used
 * by the program are released and the program is freed right away.
 */
void bpf_prog_put(struct bpf_prog *prog)
{
	if (!atomic_dec_and_test(&prog->aux->refcnt))
		return;

	free_used_maps(prog->aux);
	bpf_prog_free(prog);
}
EXPORT_SYMBOL_GPL(bpf_prog_put);

/* ->release() handler of the program fd: drops the reference on the
 * program stored in the file's private_data. Always returns 0.
 */
static int bpf_prog_release(struct inode *inode, struct file *filp)
{
	struct bpf_prog *prog = filp->private_data;

	bpf_prog_put_rcu(prog);
	return 0;
}

/* file operations of the anonymous inode behind a program fd; get_prog()
 * compares f_op against this table to recognize program fds
 */
static const struct file_operations bpf_prog_fops = {
	.release = bpf_prog_release,
};

/* Translate the result of fdget() into the bpf_prog behind it.
 *
 * Returns ERR_PTR(-EBADF) when 'f' holds no file, and ERR_PTR(-EINVAL)
 * after releasing the fd when the file is not a bpf program. On success
 * the fd is still held and the caller has to fdput() it.
 */
static struct bpf_prog *get_prog(struct fd f)
{
	if (!f.file)
		return ERR_PTR(-EBADF);

	if (f.file->f_op == &bpf_prog_fops)
		return f.file->private_data;

	/* some other kind of file: give the fd back */
	fdput(f);
	return ERR_PTR(-EINVAL);
}

/* called by sockets/tracing/seccomp before attaching program to an event
 * pairs with bpf_prog_put()
 *
 * Returns the program behind 'ufd' with its refcnt incremented, or the
 * ERR_PTR value reported by get_prog().
 */
struct bpf_prog *bpf_prog_get(u32 ufd)
{
	struct fd f = fdget(ufd);
	struct bpf_prog *prog = get_prog(f);

	if (!IS_ERR(prog)) {
		/* take the caller's reference before letting go of the fd */
		atomic_inc(&prog->aux->refcnt);
		fdput(f);
	}

	return prog;
}
EXPORT_SYMBOL_GPL(bpf_prog_get);

/* last field in 'union bpf_attr' used by this command */
#define	BPF_PROG_LOAD_LAST_FIELD kern_version

/* BPF_PROG_LOAD command: copy an eBPF program from user space, run the
 * verifier on it, prepare it for execution and create an fd for it.
 *
 * Returns the new program fd on success or a negative error:
 *   -EINVAL  non-zero unused attr fields, more than BPF_MAXINSNS
 *            instructions, kern_version mismatch for a kprobe program,
 *            or an unregistered program type
 *   -EFAULT  the license string or the instructions cannot be copied
 *            from user space
 *   -ENOMEM  the program cannot be allocated
 * or the error reported by bpf_check(), bpf_prog_select_runtime() or
 * anon_inode_getfd().
 */
static int bpf_prog_load(union bpf_attr *attr)
{
	enum bpf_prog_type type = attr->prog_type;
	struct bpf_prog *prog;
	int err;
	char license[128];
	bool is_gpl;

	if (CHECK_ATTR(BPF_PROG_LOAD))
		return -EINVAL;

	/* copy eBPF program license from user space */
	if (strncpy_from_user(license, u64_to_ptr(attr->license),
			      sizeof(license) - 1) < 0)
		return -EFAULT;
	/* terminate explicitly in case the user string filled the buffer */
	license[sizeof(license) - 1] = 0;

	/* eBPF programs must be GPL compatible to use GPL-ed functions */
	is_gpl = license_is_gpl_compatible(license);

	/* BPF_MAXINSNS is the largest allowed program size, so a program
	 * of exactly that many instructions must be accepted; only reject
	 * counts above the limit
	 */
	if (attr->insn_cnt > BPF_MAXINSNS)
		return -EINVAL;

	/* kprobe programs must be built for the running kernel version */
	if (type == BPF_PROG_TYPE_KPROBE &&
	    attr->kern_version != LINUX_VERSION_CODE)
		return -EINVAL;

	/* plain bpf_prog allocation */
	prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
	if (!prog)
		return -ENOMEM;

	prog->len = attr->insn_cnt;

	err = -EFAULT;
	if (copy_from_user(prog->insns, u64_to_ptr(attr->insns),
			   prog->len * sizeof(struct bpf_insn)) != 0)
		goto free_prog;

	prog->orig_prog = NULL;
	prog->jited = 0;

	/* initial reference, dropped by bpf_prog_release() on fd close */
	atomic_set(&prog->aux->refcnt, 1);
	prog->gpl_compatible = is_gpl ? 1 : 0;

	/* find program type: socket_filter vs tracing_filter */
	err = find_prog_type(type, prog);
	if (err < 0)
		goto free_prog;

	/* run eBPF verifier */
	err = bpf_check(&prog, attr);
	if (err < 0)
		goto free_used_maps;

	/* fixup BPF_CALL->imm field */
	fixup_bpf_calls(prog);

	/* eBPF program is ready to be JITed */
	err = bpf_prog_select_runtime(prog);
	if (err < 0)
		goto free_used_maps;

	err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
	if (err < 0)
		/* failed to allocate fd */
		goto free_used_maps;

	return err;

free_used_maps:
	free_used_maps(prog->aux);
free_prog:
	bpf_prog_free(prog);
	return err;
}

/* bpf() system call entry point: validates and copies 'size' bytes of
 * attributes from 'uattr' and dispatches on 'cmd'.
 *
 * Returns the result of the selected command, or -EPERM without
 * CAP_SYS_ADMIN, -EFAULT when uattr cannot be read, -E2BIG when size
 * exceeds PAGE_SIZE or bytes beyond the known union are non-zero, and
 * -EINVAL for an unknown command.
 */
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
	union bpf_attr attr = {};
	int err;

	/* the syscall is limited to root temporarily. This restriction will be
	 * lifted when security audit is clean. Note that eBPF+tracing must have
	 * this restriction, since it may pass kernel data to user space
	 */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!access_ok(VERIFY_READ, uattr, 1))
		return -EFAULT;

	if (size > PAGE_SIZE)	/* silly large */
		return -E2BIG;

	/* If we're handed a bigger struct than we know of,
	 * ensure all the unknown bits are 0 - i.e. new
	 * user-space does not rely on any kernel feature
	 * extensions we don't know about yet.
	 */
	if (size > sizeof(attr)) {
		unsigned char __user *cur = (void __user *)uattr + sizeof(attr);
		unsigned char __user *end = (void __user *)uattr + size;
		unsigned char byte;

		while (cur < end) {
			err = get_user(byte, cur);
			if (err)
				return err;
			if (byte)
				return -E2BIG;
			cur++;
		}

		/* the tail is all zero: only the known part gets copied */
		size = sizeof(attr);
	}

	/* copy attributes from user space, may be less than sizeof(bpf_attr) */
	if (copy_from_user(&attr, uattr, size) != 0)
		return -EFAULT;

	switch (cmd) {
	case BPF_MAP_CREATE:
		return map_create(&attr);
	case BPF_MAP_LOOKUP_ELEM:
		return map_lookup_elem(&attr);
	case BPF_MAP_UPDATE_ELEM:
		return map_update_elem(&attr);
	case BPF_MAP_DELETE_ELEM:
		return map_delete_elem(&attr);
	case BPF_MAP_GET_NEXT_KEY:
		return map_get_next_key(&attr);
	case BPF_PROG_LOAD:
		return bpf_prog_load(&attr);
	default:
		return -EINVAL;
	}
}
Commit	Line	Data
99c55f7d AS	1	/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
	2	*
	3	* This program is free software; you can redistribute it and/or
	4	* modify it under the terms of version 2 of the GNU General Public
	5	* License as published by the Free Software Foundation.
	6	*
	7	* This program is distributed in the hope that it will be useful, but
	8	* WITHOUT ANY WARRANTY; without even the implied warranty of
	9	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	10	* General Public License for more details.
	11	*/
	12	#include <linux/bpf.h>
	13	#include <linux/syscalls.h>
	14	#include <linux/slab.h>
	15	#include <linux/anon_inodes.h>
db20fd2b	16	#include <linux/file.h>
09756af4 AS	17	#include <linux/license.h>
09756af4 AS	18	#include <linux/filter.h>
2541517c	19	#include <linux/version.h>
99c55f7d AS	20
	21	static LIST_HEAD(bpf_map_types);
	22
	23	static struct bpf_map find_and_alloc_map(union bpf_attr attr)
	24	{
	25	struct bpf_map_type_list *tl;
	26	struct bpf_map *map;
	27
	28	list_for_each_entry(tl, &bpf_map_types, list_node) {
	29	if (tl->type == attr->map_type) {
	30	map = tl->ops->map_alloc(attr);
	31	if (IS_ERR(map))
	32	return map;
	33	map->ops = tl->ops;
	34	map->map_type = attr->map_type;
	35	return map;
	36	}
	37	}
	38	return ERR_PTR(-EINVAL);
	39	}
	40
	41	/* boot time registration of different map implementations */
	42	void bpf_register_map_type(struct bpf_map_type_list *tl)
	43	{
	44	list_add(&tl->list_node, &bpf_map_types);
	45	}
	46
	47	/* called from workqueue */
	48	static void bpf_map_free_deferred(struct work_struct *work)
	49	{
	50	struct bpf_map *map = container_of(work, struct bpf_map, work);
	51
	52	/* implementation dependent freeing */
	53	map->ops->map_free(map);
	54	}
	55
	56	/* decrement map refcnt and schedule it for freeing via workqueue
	57	* (unrelying map implementation ops->map_free() might sleep)
	58	*/
	59	void bpf_map_put(struct bpf_map *map)
	60	{
	61	if (atomic_dec_and_test(&map->refcnt)) {
	62	INIT_WORK(&map->work, bpf_map_free_deferred);
	63	schedule_work(&map->work);
	64	}
	65	}
	66
	67	static int bpf_map_release(struct inode inode, struct file filp)
	68	{
	69	struct bpf_map *map = filp->private_data;
	70
04fd61ab AS	71	if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY)
	72	/* prog_array stores refcnt-ed bpf_prog pointers
	73	* release them all when user space closes prog_array_fd
	74	*/
2a36f0b9	75	bpf_fd_array_map_clear(map);
04fd61ab	76
99c55f7d AS	77	bpf_map_put(map);
	78	return 0;
	79	}
	80
	81	static const struct file_operations bpf_map_fops = {
	82	.release = bpf_map_release,
	83	};
	84
	85	/* helper macro to check that unused fields 'union bpf_attr' are zero */
	86	#define CHECK_ATTR(CMD) \
	87	memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
	88	sizeof(attr->CMD##_LAST_FIELD), 0, \
	89	sizeof(*attr) - \
	90	offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
	91	sizeof(attr->CMD##_LAST_FIELD)) != NULL
	92
	93	#define BPF_MAP_CREATE_LAST_FIELD max_entries
	94	/* called via syscall */
	95	static int map_create(union bpf_attr *attr)
	96	{
	97	struct bpf_map *map;
	98	int err;
	99
	100	err = CHECK_ATTR(BPF_MAP_CREATE);
	101	if (err)
	102	return -EINVAL;
	103
	104	/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
	105	map = find_and_alloc_map(attr);
	106	if (IS_ERR(map))
	107	return PTR_ERR(map);
	108
	109	atomic_set(&map->refcnt, 1);
	110
	111	err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR \| O_CLOEXEC);
	112
	113	if (err < 0)
	114	/* failed to allocate fd */
	115	goto free_map;
	116
	117	return err;
	118
	119	free_map:
	120	map->ops->map_free(map);
	121	return err;
	122	}
	123
db20fd2b AS	124	/* if error is returned, fd is released.
	125	* On success caller should complete fd access with matching fdput()
	126	*/
	127	struct bpf_map *bpf_map_get(struct fd f)
	128	{
	129	struct bpf_map *map;
	130
	131	if (!f.file)
	132	return ERR_PTR(-EBADF);
	133
	134	if (f.file->f_op != &bpf_map_fops) {
	135	fdput(f);
	136	return ERR_PTR(-EINVAL);
	137	}
	138
	139	map = f.file->private_data;
	140
	141	return map;
	142	}
	143
	144	/* helper to convert user pointers passed inside __aligned_u64 fields */
	145	static void __user *u64_to_ptr(__u64 val)
	146	{
	147	return (void __user *) (unsigned long) val;
	148	}
	149
	150	/* last field in 'union bpf_attr' used by this command */
	151	#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
	152
	153	static int map_lookup_elem(union bpf_attr *attr)
	154	{
	155	void __user *ukey = u64_to_ptr(attr->key);
	156	void __user *uvalue = u64_to_ptr(attr->value);
	157	int ufd = attr->map_fd;
db20fd2b	158	struct bpf_map *map;
8ebe667c	159	void key, value, *ptr;
592867bf	160	struct fd f;
db20fd2b AS	161	int err;
	162
	163	if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
	164	return -EINVAL;
	165
592867bf	166	f = fdget(ufd);
db20fd2b AS	167	map = bpf_map_get(f);
	168	if (IS_ERR(map))
	169	return PTR_ERR(map);
	170
	171	err = -ENOMEM;
	172	key = kmalloc(map->key_size, GFP_USER);
	173	if (!key)
	174	goto err_put;
	175
	176	err = -EFAULT;
	177	if (copy_from_user(key, ukey, map->key_size) != 0)
	178	goto free_key;
	179
8ebe667c AS	180	err = -ENOMEM;
8ebe667c AS	181	value = kmalloc(map->value_size, GFP_USER);
db20fd2b	182	if (!value)
8ebe667c AS	183	goto free_key;
	184
	185	rcu_read_lock();
	186	ptr = map->ops->map_lookup_elem(map, key);
	187	if (ptr)
	188	memcpy(value, ptr, map->value_size);
	189	rcu_read_unlock();
	190
	191	err = -ENOENT;
	192	if (!ptr)
	193	goto free_value;
db20fd2b AS	194
	195	err = -EFAULT;
	196	if (copy_to_user(uvalue, value, map->value_size) != 0)
8ebe667c	197	goto free_value;
db20fd2b AS	198
	199	err = 0;
	200
8ebe667c AS	201	free_value:
8ebe667c AS	202	kfree(value);
db20fd2b AS	203	free_key:
	204	kfree(key);
	205	err_put:
	206	fdput(f);
	207	return err;
	208	}
	209
3274f520	210	#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
db20fd2b AS	211
	212	static int map_update_elem(union bpf_attr *attr)
	213	{
	214	void __user *ukey = u64_to_ptr(attr->key);
	215	void __user *uvalue = u64_to_ptr(attr->value);
	216	int ufd = attr->map_fd;
db20fd2b AS	217	struct bpf_map *map;
db20fd2b AS	218	void key, value;
592867bf	219	struct fd f;
db20fd2b AS	220	int err;
	221
	222	if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
	223	return -EINVAL;
	224
592867bf	225	f = fdget(ufd);
db20fd2b AS	226	map = bpf_map_get(f);
	227	if (IS_ERR(map))
	228	return PTR_ERR(map);
	229
	230	err = -ENOMEM;
	231	key = kmalloc(map->key_size, GFP_USER);
	232	if (!key)
	233	goto err_put;
	234
	235	err = -EFAULT;
	236	if (copy_from_user(key, ukey, map->key_size) != 0)
	237	goto free_key;
	238
	239	err = -ENOMEM;
	240	value = kmalloc(map->value_size, GFP_USER);
	241	if (!value)
	242	goto free_key;
	243
	244	err = -EFAULT;
	245	if (copy_from_user(value, uvalue, map->value_size) != 0)
	246	goto free_value;
	247
	248	/* eBPF program that use maps are running under rcu_read_lock(),
	249	* therefore all map accessors rely on this fact, so do the same here
	250	*/
	251	rcu_read_lock();
3274f520	252	err = map->ops->map_update_elem(map, key, value, attr->flags);
db20fd2b AS	253	rcu_read_unlock();
	254
	255	free_value:
	256	kfree(value);
	257	free_key:
	258	kfree(key);
	259	err_put:
	260	fdput(f);
	261	return err;
	262	}
	263
	264	#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
	265
	266	static int map_delete_elem(union bpf_attr *attr)
	267	{
	268	void __user *ukey = u64_to_ptr(attr->key);
	269	int ufd = attr->map_fd;
db20fd2b	270	struct bpf_map *map;
592867bf	271	struct fd f;
db20fd2b AS	272	void *key;
	273	int err;
	274
	275	if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
	276	return -EINVAL;
	277
592867bf	278	f = fdget(ufd);
db20fd2b AS	279	map = bpf_map_get(f);
	280	if (IS_ERR(map))
	281	return PTR_ERR(map);
	282
	283	err = -ENOMEM;
	284	key = kmalloc(map->key_size, GFP_USER);
	285	if (!key)
	286	goto err_put;
	287
	288	err = -EFAULT;
	289	if (copy_from_user(key, ukey, map->key_size) != 0)
	290	goto free_key;
	291
	292	rcu_read_lock();
	293	err = map->ops->map_delete_elem(map, key);
	294	rcu_read_unlock();
	295
	296	free_key:
	297	kfree(key);
	298	err_put:
	299	fdput(f);
	300	return err;
	301	}
	302
	303	/* last field in 'union bpf_attr' used by this command */
	304	#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
	305
	306	static int map_get_next_key(union bpf_attr *attr)
	307	{
	308	void __user *ukey = u64_to_ptr(attr->key);
	309	void __user *unext_key = u64_to_ptr(attr->next_key);
	310	int ufd = attr->map_fd;
db20fd2b AS	311	struct bpf_map *map;
db20fd2b AS	312	void key, next_key;
592867bf	313	struct fd f;
db20fd2b AS	314	int err;
	315
	316	if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
	317	return -EINVAL;
	318
592867bf	319	f = fdget(ufd);
db20fd2b AS	320	map = bpf_map_get(f);
	321	if (IS_ERR(map))
	322	return PTR_ERR(map);
	323
	324	err = -ENOMEM;
	325	key = kmalloc(map->key_size, GFP_USER);
	326	if (!key)
	327	goto err_put;
	328
	329	err = -EFAULT;
	330	if (copy_from_user(key, ukey, map->key_size) != 0)
	331	goto free_key;
	332
	333	err = -ENOMEM;
	334	next_key = kmalloc(map->key_size, GFP_USER);
	335	if (!next_key)
	336	goto free_key;
	337
	338	rcu_read_lock();
	339	err = map->ops->map_get_next_key(map, key, next_key);
	340	rcu_read_unlock();
	341	if (err)
	342	goto free_next_key;
	343
	344	err = -EFAULT;
	345	if (copy_to_user(unext_key, next_key, map->key_size) != 0)
	346	goto free_next_key;
	347
	348	err = 0;
	349
	350	free_next_key:
	351	kfree(next_key);
	352	free_key:
	353	kfree(key);
	354	err_put:
	355	fdput(f);
	356	return err;
	357	}
	358
09756af4 AS	359	static LIST_HEAD(bpf_prog_types);
	360
	361	static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
	362	{
	363	struct bpf_prog_type_list *tl;
	364
	365	list_for_each_entry(tl, &bpf_prog_types, list_node) {
	366	if (tl->type == type) {
	367	prog->aux->ops = tl->ops;
24701ece	368	prog->type = type;
09756af4 AS	369	return 0;
	370	}
	371	}
24701ece	372
09756af4 AS	373	return -EINVAL;
	374	}
	375
	376	void bpf_register_prog_type(struct bpf_prog_type_list *tl)
	377	{
	378	list_add(&tl->list_node, &bpf_prog_types);
	379	}
	380
0a542a86 AS	381	/* fixup insn->imm field of bpf_call instructions:
	382	* if (insn->imm == BPF_FUNC_map_lookup_elem)
	383	* insn->imm = bpf_map_lookup_elem - __bpf_call_base;
	384	* else if (insn->imm == BPF_FUNC_map_update_elem)
	385	* insn->imm = bpf_map_update_elem - __bpf_call_base;
	386	* else ...
	387	*
	388	* this function is called after eBPF program passed verification
	389	*/
	390	static void fixup_bpf_calls(struct bpf_prog *prog)
	391	{
	392	const struct bpf_func_proto *fn;
	393	int i;
	394
	395	for (i = 0; i < prog->len; i++) {
	396	struct bpf_insn *insn = &prog->insnsi[i];
	397
	398	if (insn->code == (BPF_JMP \| BPF_CALL)) {
	399	/* we reach here when program has bpf_call instructions
	400	* and it passed bpf_check(), means that
	401	* ops->get_func_proto must have been supplied, check it
	402	*/
	403	BUG_ON(!prog->aux->ops->get_func_proto);
	404
c46646d0 DB	405	if (insn->imm == BPF_FUNC_get_route_realm)
c46646d0 DB	406	prog->dst_needed = 1;
3ad00405 DB	407	if (insn->imm == BPF_FUNC_get_prandom_u32)
3ad00405 DB	408	bpf_user_rnd_init_once();
04fd61ab AS	409	if (insn->imm == BPF_FUNC_tail_call) {
	410	/* mark bpf_tail_call as different opcode
	411	* to avoid conditional branch in
	412	* interpeter for every normal call
	413	* and to prevent accidental JITing by
	414	* JIT compiler that doesn't support
	415	* bpf_tail_call yet
	416	*/
	417	insn->imm = 0;
	418	insn->code \|= BPF_X;
	419	continue;
	420	}
	421
0a542a86 AS	422	fn = prog->aux->ops->get_func_proto(insn->imm);
	423	/* all functions that have prototype and verifier allowed
	424	* programs to call them, must be real in-kernel functions
	425	*/
	426	BUG_ON(!fn->func);
	427	insn->imm = fn->func - __bpf_call_base;
	428	}
	429	}
	430	}
	431
09756af4 AS	432	/* drop refcnt on maps used by eBPF program and free auxilary data */
	433	static void free_used_maps(struct bpf_prog_aux *aux)
	434	{
	435	int i;
	436
	437	for (i = 0; i < aux->used_map_cnt; i++)
	438	bpf_map_put(aux->used_maps[i]);
	439
	440	kfree(aux->used_maps);
	441	}
	442
abf2e7d6 AS	443	static void __prog_put_rcu(struct rcu_head *rcu)
	444	{
	445	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
	446
	447	free_used_maps(aux);
	448	bpf_prog_free(aux->prog);
	449	}
	450
	451	/* version of bpf_prog_put() that is called after a grace period */
	452	void bpf_prog_put_rcu(struct bpf_prog *prog)
	453	{
	454	if (atomic_dec_and_test(&prog->aux->refcnt)) {
	455	prog->aux->prog = prog;
	456	call_rcu(&prog->aux->rcu, __prog_put_rcu);
	457	}
	458	}
	459
09756af4 AS	460	void bpf_prog_put(struct bpf_prog *prog)
	461	{
	462	if (atomic_dec_and_test(&prog->aux->refcnt)) {
	463	free_used_maps(prog->aux);
	464	bpf_prog_free(prog);
	465	}
	466	}
e2e9b654	467	EXPORT_SYMBOL_GPL(bpf_prog_put);
09756af4 AS	468
	469	static int bpf_prog_release(struct inode inode, struct file filp)
	470	{
	471	struct bpf_prog *prog = filp->private_data;
	472
abf2e7d6	473	bpf_prog_put_rcu(prog);
09756af4 AS	474	return 0;
	475	}
	476
	477	static const struct file_operations bpf_prog_fops = {
	478	.release = bpf_prog_release,
	479	};
	480
	481	static struct bpf_prog *get_prog(struct fd f)
	482	{
	483	struct bpf_prog *prog;
	484
	485	if (!f.file)
	486	return ERR_PTR(-EBADF);
	487
	488	if (f.file->f_op != &bpf_prog_fops) {
	489	fdput(f);
	490	return ERR_PTR(-EINVAL);
	491	}
	492
	493	prog = f.file->private_data;
	494
	495	return prog;
	496	}
	497
	498	/* called by sockets/tracing/seccomp before attaching program to an event
	499	* pairs with bpf_prog_put()
	500	*/
	501	struct bpf_prog *bpf_prog_get(u32 ufd)
	502	{
	503	struct fd f = fdget(ufd);
	504	struct bpf_prog *prog;
	505
	506	prog = get_prog(f);
	507
	508	if (IS_ERR(prog))
	509	return prog;
	510
	511	atomic_inc(&prog->aux->refcnt);
	512	fdput(f);
	513	return prog;
	514	}
e2e9b654	515	EXPORT_SYMBOL_GPL(bpf_prog_get);
09756af4 AS	516
09756af4 AS	517	/* last field in 'union bpf_attr' used by this command */
2541517c	518	#define BPF_PROG_LOAD_LAST_FIELD kern_version
09756af4 AS	519
	520	static int bpf_prog_load(union bpf_attr *attr)
	521	{
	522	enum bpf_prog_type type = attr->prog_type;
	523	struct bpf_prog *prog;
	524	int err;
	525	char license[128];
	526	bool is_gpl;
	527
	528	if (CHECK_ATTR(BPF_PROG_LOAD))
	529	return -EINVAL;
	530
	531	/* copy eBPF program license from user space */
	532	if (strncpy_from_user(license, u64_to_ptr(attr->license),
	533	sizeof(license) - 1) < 0)
	534	return -EFAULT;
	535	license[sizeof(license) - 1] = 0;
	536
	537	/* eBPF programs must be GPL compatible to use GPL-ed functions */
	538	is_gpl = license_is_gpl_compatible(license);
	539
	540	if (attr->insn_cnt >= BPF_MAXINSNS)
	541	return -EINVAL;
	542
2541517c AS	543	if (type == BPF_PROG_TYPE_KPROBE &&
	544	attr->kern_version != LINUX_VERSION_CODE)
	545	return -EINVAL;
	546
09756af4 AS	547	/* plain bpf_prog allocation */
	548	prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
	549	if (!prog)
	550	return -ENOMEM;
	551
	552	prog->len = attr->insn_cnt;
	553
	554	err = -EFAULT;
	555	if (copy_from_user(prog->insns, u64_to_ptr(attr->insns),
	556	prog->len * sizeof(struct bpf_insn)) != 0)
	557	goto free_prog;
	558
	559	prog->orig_prog = NULL;
a91263d5	560	prog->jited = 0;
09756af4 AS	561
09756af4 AS	562	atomic_set(&prog->aux->refcnt, 1);
a91263d5	563	prog->gpl_compatible = is_gpl ? 1 : 0;
09756af4 AS	564
	565	/* find program type: socket_filter vs tracing_filter */
	566	err = find_prog_type(type, prog);
	567	if (err < 0)
	568	goto free_prog;
	569
	570	/* run eBPF verifier */
9bac3d6d	571	err = bpf_check(&prog, attr);
09756af4 AS	572	if (err < 0)
	573	goto free_used_maps;
	574
0a542a86 AS	575	/* fixup BPF_CALL->imm field */
	576	fixup_bpf_calls(prog);
	577
09756af4	578	/* eBPF program is ready to be JITed */
04fd61ab AS	579	err = bpf_prog_select_runtime(prog);
	580	if (err < 0)
	581	goto free_used_maps;
09756af4 AS	582
09756af4 AS	583	err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR \| O_CLOEXEC);
09756af4 AS	584	if (err < 0)
	585	/* failed to allocate fd */
	586	goto free_used_maps;
	587
	588	return err;
	589
	590	free_used_maps:
	591	free_used_maps(prog->aux);
	592	free_prog:
	593	bpf_prog_free(prog);
	594	return err;
	595	}
	596
99c55f7d AS	597	SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
	598	{
	599	union bpf_attr attr = {};
	600	int err;
	601
	602	/* the syscall is limited to root temporarily. This restriction will be
	603	* lifted when security audit is clean. Note that eBPF+tracing must have
	604	* this restriction, since it may pass kernel data to user space
	605	*/
	606	if (!capable(CAP_SYS_ADMIN))
	607	return -EPERM;
	608
	609	if (!access_ok(VERIFY_READ, uattr, 1))
	610	return -EFAULT;
	611
	612	if (size > PAGE_SIZE) /* silly large */
	613	return -E2BIG;
	614
	615	/* If we're handed a bigger struct than we know of,
	616	* ensure all the unknown bits are 0 - i.e. new
	617	* user-space does not rely on any kernel feature
	618	* extensions we dont know about yet.
	619	*/
	620	if (size > sizeof(attr)) {
	621	unsigned char __user *addr;
	622	unsigned char __user *end;
	623	unsigned char val;
	624
	625	addr = (void __user *)uattr + sizeof(attr);
	626	end = (void __user *)uattr + size;
	627
	628	for (; addr < end; addr++) {
	629	err = get_user(val, addr);
	630	if (err)
	631	return err;
	632	if (val)
	633	return -E2BIG;
	634	}
	635	size = sizeof(attr);
	636	}
	637
	638	/* copy attributes from user space, may be less than sizeof(bpf_attr) */
	639	if (copy_from_user(&attr, uattr, size) != 0)
	640	return -EFAULT;
	641
	642	switch (cmd) {
	643	case BPF_MAP_CREATE:
	644	err = map_create(&attr);
	645	break;
db20fd2b AS	646	case BPF_MAP_LOOKUP_ELEM:
	647	err = map_lookup_elem(&attr);
	648	break;
	649	case BPF_MAP_UPDATE_ELEM:
	650	err = map_update_elem(&attr);
	651	break;
	652	case BPF_MAP_DELETE_ELEM:
	653	err = map_delete_elem(&attr);
	654	break;
	655	case BPF_MAP_GET_NEXT_KEY:
	656	err = map_get_next_key(&attr);
	657	break;
09756af4 AS	658	case BPF_PROG_LOAD:
	659	err = bpf_prog_load(&attr);
	660	break;
99c55f7d AS	661	default:
	662	err = -EINVAL;
	663	break;
	664	}
	665
	666	return err;
	667	}