--- /dev/null
+From b2157399cc9898260d6031c5bfe45fe137c1fbe7 Mon Sep 17 00:00:00 2001
+From: Alexei Starovoitov <ast@kernel.org>
+Date: Sun, 7 Jan 2018 17:33:02 -0800
+Subject: bpf: prevent out-of-bounds speculation
+
+From: Alexei Starovoitov <ast@kernel.org>
+
+commit b2157399cc9898260d6031c5bfe45fe137c1fbe7 upstream.
+
+Under speculation, CPUs may mis-predict branches in bounds checks. Thus,
+memory accesses under a bounds check may be speculated even if the
+bounds check fails, providing a primitive for building a side channel.
+
+To avoid leaking kernel data, round up array-based maps and mask the index
+after the bounds check, so that a speculated load with an out-of-bounds
+index will load either a valid value from the array or zero from the
+padded area.
+
+Unconditionally mask the index for all array types, even when max_entries
+is not rounded up to a power of 2 (as is the case for the root user).
+When the map is created by an unpriv user, generate a sequence of bpf insns
+that includes an AND operation to make sure that the JITed code includes
+the same 'index & index_mask' operation.
+
+If a prog_array map is created by an unpriv user, replace
+ bpf_tail_call(ctx, map, index);
+with
+ if (index < max_entries) {
+ index &= map->index_mask;
+ bpf_tail_call(ctx, map, index);
+ }
+(along with rounding up to a power of 2) to prevent out-of-bounds speculation.
+There is a secondary, redundant 'if (index >= max_entries)' check in the
+interpreter and in all JITs, but it can be optimized away later if necessary.
+
+Other array-like maps (cpumap, devmap, sockmap, perf_event_array, cgroup_array)
+cannot be used by unpriv users, so no changes are needed there.
+
+This fixes the bpf side of "Variant 1: bounds check bypass (CVE-2017-5753)" on
+all architectures with and without JIT.
+
+v2->v3:
+Daniel noticed that an attack can potentially be crafted via syscall commands
+without loading a program, so add masking to those paths as well.
+
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Acked-by: John Fastabend <john.fastabend@gmail.com>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Cc: Jiri Slaby <jslaby@suse.cz>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/bpf.h | 2 ++
+ kernel/bpf/arraymap.c | 47 ++++++++++++++++++++++++++++++++++++-----------
+ kernel/bpf/verifier.c | 36 ++++++++++++++++++++++++++++++++++++
+ 3 files changed, 74 insertions(+), 11 deletions(-)
+
+--- a/include/linux/bpf.h
++++ b/include/linux/bpf.h
+@@ -51,6 +51,7 @@ struct bpf_map {
+ u32 pages;
+ u32 id;
+ int numa_node;
++ bool unpriv_array;
+ struct user_struct *user;
+ const struct bpf_map_ops *ops;
+ struct work_struct work;
+@@ -195,6 +196,7 @@ struct bpf_prog_aux {
+ struct bpf_array {
+ struct bpf_map map;
+ u32 elem_size;
++ u32 index_mask;
+ /* 'ownership' of prog_array is claimed by the first program that
+ * is going to use this map or by the first program which FD is stored
+ * in the map to make sure that all callers and callees have the same
+--- a/kernel/bpf/arraymap.c
++++ b/kernel/bpf/arraymap.c
+@@ -50,9 +50,10 @@ static struct bpf_map *array_map_alloc(u
+ {
+ bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
+ int numa_node = bpf_map_attr_numa_node(attr);
++ u32 elem_size, index_mask, max_entries;
++ bool unpriv = !capable(CAP_SYS_ADMIN);
+ struct bpf_array *array;
+ u64 array_size;
+- u32 elem_size;
+
+ /* check sanity of attributes */
+ if (attr->max_entries == 0 || attr->key_size != 4 ||
+@@ -68,11 +69,20 @@ static struct bpf_map *array_map_alloc(u
+
+ elem_size = round_up(attr->value_size, 8);
+
++ max_entries = attr->max_entries;
++ index_mask = roundup_pow_of_two(max_entries) - 1;
++
++ if (unpriv)
++ /* round up array size to nearest power of 2,
++ * since cpu will speculate within index_mask limits
++ */
++ max_entries = index_mask + 1;
++
+ array_size = sizeof(*array);
+ if (percpu)
+- array_size += (u64) attr->max_entries * sizeof(void *);
++ array_size += (u64) max_entries * sizeof(void *);
+ else
+- array_size += (u64) attr->max_entries * elem_size;
++ array_size += (u64) max_entries * elem_size;
+
+ /* make sure there is no u32 overflow later in round_up() */
+ if (array_size >= U32_MAX - PAGE_SIZE)
+@@ -82,6 +92,8 @@ static struct bpf_map *array_map_alloc(u
+ array = bpf_map_area_alloc(array_size, numa_node);
+ if (!array)
+ return ERR_PTR(-ENOMEM);
++ array->index_mask = index_mask;
++ array->map.unpriv_array = unpriv;
+
+ /* copy mandatory map attributes */
+ array->map.map_type = attr->map_type;
+@@ -117,12 +129,13 @@ static void *array_map_lookup_elem(struc
+ if (unlikely(index >= array->map.max_entries))
+ return NULL;
+
+- return array->value + array->elem_size * index;
++ return array->value + array->elem_size * (index & array->index_mask);
+ }
+
+ /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
+ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
+ {
++ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct bpf_insn *insn = insn_buf;
+ u32 elem_size = round_up(map->value_size, 8);
+ const int ret = BPF_REG_0;
+@@ -131,7 +144,12 @@ static u32 array_map_gen_lookup(struct b
+
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
+ *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
+- *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
++ if (map->unpriv_array) {
++ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4);
++ *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
++ } else {
++ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3);
++ }
+
+ if (is_power_of_2(elem_size)) {
+ *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
+@@ -153,7 +171,7 @@ static void *percpu_array_map_lookup_ele
+ if (unlikely(index >= array->map.max_entries))
+ return NULL;
+
+- return this_cpu_ptr(array->pptrs[index]);
++ return this_cpu_ptr(array->pptrs[index & array->index_mask]);
+ }
+
+ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
+@@ -173,7 +191,7 @@ int bpf_percpu_array_copy(struct bpf_map
+ */
+ size = round_up(map->value_size, 8);
+ rcu_read_lock();
+- pptr = array->pptrs[index];
++ pptr = array->pptrs[index & array->index_mask];
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
+ off += size;
+@@ -221,10 +239,11 @@ static int array_map_update_elem(struct
+ return -EEXIST;
+
+ if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+- memcpy(this_cpu_ptr(array->pptrs[index]),
++ memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
+ value, map->value_size);
+ else
+- memcpy(array->value + array->elem_size * index,
++ memcpy(array->value +
++ array->elem_size * (index & array->index_mask),
+ value, map->value_size);
+ return 0;
+ }
+@@ -258,7 +277,7 @@ int bpf_percpu_array_update(struct bpf_m
+ */
+ size = round_up(map->value_size, 8);
+ rcu_read_lock();
+- pptr = array->pptrs[index];
++ pptr = array->pptrs[index & array->index_mask];
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
+ off += size;
+@@ -609,6 +628,7 @@ static void *array_of_map_lookup_elem(st
+ static u32 array_of_map_gen_lookup(struct bpf_map *map,
+ struct bpf_insn *insn_buf)
+ {
++ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ u32 elem_size = round_up(map->value_size, 8);
+ struct bpf_insn *insn = insn_buf;
+ const int ret = BPF_REG_0;
+@@ -617,7 +637,12 @@ static u32 array_of_map_gen_lookup(struc
+
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value));
+ *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0);
+- *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
++ if (map->unpriv_array) {
++ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6);
++ *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask);
++ } else {
++ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5);
++ }
+ if (is_power_of_2(elem_size))
+ *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size));
+ else
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -1701,6 +1701,13 @@ static int check_call(struct bpf_verifie
+ err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta);
+ if (err)
+ return err;
++ if (func_id == BPF_FUNC_tail_call) {
++ if (meta.map_ptr == NULL) {
++ verbose(env, "verifier bug\n");
++ return -EINVAL;
++ }
++ env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr;
++ }
+ err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta);
+ if (err)
+ return err;
+@@ -4315,6 +4322,35 @@ static int fixup_bpf_calls(struct bpf_ve
+ */
+ insn->imm = 0;
+ insn->code = BPF_JMP | BPF_TAIL_CALL;
++
++ /* instead of changing every JIT dealing with tail_call
++ * emit two extra insns:
++ * if (index >= max_entries) goto out;
++ * index &= array->index_mask;
++ * to avoid out-of-bounds cpu speculation
++ */
++ map_ptr = env->insn_aux_data[i + delta].map_ptr;
++ if (map_ptr == BPF_MAP_PTR_POISON) {
++ verbose(env, "tail_call obusing map_ptr\n");
++ return -EINVAL;
++ }
++ if (!map_ptr->unpriv_array)
++ continue;
++ insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3,
++ map_ptr->max_entries, 2);
++ insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3,
++ container_of(map_ptr,
++ struct bpf_array,
++ map)->index_mask);
++ insn_buf[2] = *insn;
++ cnt = 3;
++ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
++ if (!new_prog)
++ return -ENOMEM;
++
++ delta += cnt - 1;
++ env->prog = prog = new_prog;
++ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+