From bb06ab20fd12e14d8d6be33698c59951a6b28bf7 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 2 Jul 2019 08:16:32 +0200 Subject: [PATCH] 4.19-stable patches added patches: arm64-futex-avoid-copying-out-uninitialised-stack-in-failed-cmpxchg.patch bpf-arm64-use-more-scalable-stadd-over-ldxr-stxr-loop-in-xadd.patch bpf-fix-nested-bpf-tracepoints-with-per-cpu-data.patch bpf-fix-unconnected-udp-hooks.patch bpf-lpm_trie-check-left-child-of-last-leftmost-node-for-null.patch bpf-simplify-definition-of-bpf_fib_lookup-related-flags.patch bpf-udp-avoid-calling-reuseport-s-bpf_prog-from-udp_gro.patch bpf-udp-ipv6-avoid-running-reuseport-s-bpf_prog-from-__udp6_lib_err.patch futex-update-comments-and-docs-about-return-values-of-arch-futex-code.patch --- ...ninitialised-stack-in-failed-cmpxchg.patch | 35 ++ ...le-stadd-over-ldxr-stxr-loop-in-xadd.patch | 165 ++++++++++ ...ed-bpf-tracepoints-with-per-cpu-data.patch | 191 +++++++++++ .../bpf-fix-unconnected-udp-hooks.patch | 308 ++++++++++++++++++ ...child-of-last-leftmost-node-for-null.patch | 127 ++++++++ ...tion-of-bpf_fib_lookup-related-flags.patch | 40 +++ ...ng-reuseport-s-bpf_prog-from-udp_gro.patch | 55 ++++ ...eport-s-bpf_prog-from-__udp6_lib_err.patch | 50 +++ ...out-return-values-of-arch-futex-code.patch | 57 ++++ queue-4.19/series | 9 + 10 files changed, 1037 insertions(+) create mode 100644 queue-4.19/arm64-futex-avoid-copying-out-uninitialised-stack-in-failed-cmpxchg.patch create mode 100644 queue-4.19/bpf-arm64-use-more-scalable-stadd-over-ldxr-stxr-loop-in-xadd.patch create mode 100644 queue-4.19/bpf-fix-nested-bpf-tracepoints-with-per-cpu-data.patch create mode 100644 queue-4.19/bpf-fix-unconnected-udp-hooks.patch create mode 100644 queue-4.19/bpf-lpm_trie-check-left-child-of-last-leftmost-node-for-null.patch create mode 100644 queue-4.19/bpf-simplify-definition-of-bpf_fib_lookup-related-flags.patch create mode 100644 queue-4.19/bpf-udp-avoid-calling-reuseport-s-bpf_prog-from-udp_gro.patch create mode 100644 queue-4.19/bpf-udp-ipv6-avoid-running-reuseport-s-bpf_prog-from-__udp6_lib_err.patch create mode 100644 queue-4.19/futex-update-comments-and-docs-about-return-values-of-arch-futex-code.patch diff --git a/queue-4.19/arm64-futex-avoid-copying-out-uninitialised-stack-in-failed-cmpxchg.patch b/queue-4.19/arm64-futex-avoid-copying-out-uninitialised-stack-in-failed-cmpxchg.patch new file mode 100644 index 00000000000..f6be711819a --- /dev/null +++ b/queue-4.19/arm64-futex-avoid-copying-out-uninitialised-stack-in-failed-cmpxchg.patch @@ -0,0 +1,35 @@ +From 8e4e0ac02b449297b86498ac24db5786ddd9f647 Mon Sep 17 00:00:00 2001 +From: Will Deacon +Date: Wed, 10 Apr 2019 11:49:11 +0100 +Subject: arm64: futex: Avoid copying out uninitialised stack in failed cmpxchg() + +From: Will Deacon + +commit 8e4e0ac02b449297b86498ac24db5786ddd9f647 upstream. + +Returning an error code from futex_atomic_cmpxchg_inatomic() indicates +that the caller should not make any use of *uval, and should instead act +upon the value of the error code. Although this is implemented +correctly in our futex code, we needlessly copy uninitialised stack to +*uval in the error case, which can easily be avoided.
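+
+A minimal usage sketch (hypothetical caller, not from this patch) of the
+resulting contract:
+
+	u32 cur;
+	int ret = futex_atomic_cmpxchg_inatomic(&cur, uaddr, oldval, newval);
+
+	if (ret)	/* -EFAULT or -EAGAIN: cur is not valid */
+		return ret;
+	/* only on success may cur be compared against oldval */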
+ +Signed-off-by: Will Deacon +Signed-off-by: Greg Kroah-Hartman + +--- + arch/arm64/include/asm/futex.h | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/arch/arm64/include/asm/futex.h ++++ b/arch/arm64/include/asm/futex.h +@@ -134,7 +134,9 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, + : "memory"); + uaccess_disable(); + +- *uval = val; ++ if (!ret) ++ *uval = val; ++ + return ret; + } + diff --git a/queue-4.19/bpf-arm64-use-more-scalable-stadd-over-ldxr-stxr-loop-in-xadd.patch b/queue-4.19/bpf-arm64-use-more-scalable-stadd-over-ldxr-stxr-loop-in-xadd.patch new file mode 100644 index 00000000000..5ce84c4c549 --- /dev/null +++ b/queue-4.19/bpf-arm64-use-more-scalable-stadd-over-ldxr-stxr-loop-in-xadd.patch @@ -0,0 +1,165 @@ +From 34b8ab091f9ef57a2bb3c8c8359a0a03a8abf2f9 Mon Sep 17 00:00:00 2001 +From: Daniel Borkmann +Date: Fri, 26 Apr 2019 21:48:22 +0200 +Subject: bpf, arm64: use more scalable stadd over ldxr / stxr loop in xadd + +From: Daniel Borkmann + +commit 34b8ab091f9ef57a2bb3c8c8359a0a03a8abf2f9 upstream. + +Since the ARMv8.1 supplement introduced LSE atomic instructions back in +2016, let's add support for STADD and use it in favor of the LDXR / STXR +loop for the XADD mapping if available. STADD is encoded as an alias for +LDADD with XZR as the destination register, therefore add LDADD to the +instruction encoder along with STADD as a special case and use it in the +JIT for CPUs that advertise LSE atomics in the CPUID register. If the +immediate offset in the BPF XADD insn is 0, then use the dst register +directly instead of a temporary one. + +Signed-off-by: Daniel Borkmann +Acked-by: Jean-Philippe Brucker +Acked-by: Will Deacon +Signed-off-by: Alexei Starovoitov +Signed-off-by: Greg Kroah-Hartman + +--- + arch/arm64/include/asm/insn.h | 8 ++++++++ + arch/arm64/kernel/insn.c | 40 ++++++++++++++++++++++++++++++++++++++++ + arch/arm64/net/bpf_jit.h | 4 ++++ + arch/arm64/net/bpf_jit_comp.c | 28 +++++++++++++++++++--------- + 4 files changed, 71 insertions(+), 9 deletions(-) + +--- a/arch/arm64/include/asm/insn.h ++++ b/arch/arm64/include/asm/insn.h +@@ -272,6 +272,7 @@ __AARCH64_INSN_FUNCS(adrp, 0x9F000000, 0 + __AARCH64_INSN_FUNCS(prfm, 0x3FC00000, 0x39800000) + __AARCH64_INSN_FUNCS(prfm_lit, 0xFF000000, 0xD8000000) + __AARCH64_INSN_FUNCS(str_reg, 0x3FE0EC00, 0x38206800) ++__AARCH64_INSN_FUNCS(ldadd, 0x3F20FC00, 0xB8200000) + __AARCH64_INSN_FUNCS(ldr_reg, 0x3FE0EC00, 0x38606800) + __AARCH64_INSN_FUNCS(ldr_lit, 0xBF000000, 0x18000000) + __AARCH64_INSN_FUNCS(ldrsw_lit, 0xFF000000, 0x98000000) +@@ -389,6 +390,13 @@ u32 aarch64_insn_gen_load_store_ex(enum + enum aarch64_insn_register state, + enum aarch64_insn_size_type size, + enum aarch64_insn_ldst_type type); ++u32 aarch64_insn_gen_ldadd(enum aarch64_insn_register result, ++ enum aarch64_insn_register address, ++ enum aarch64_insn_register value, ++ enum aarch64_insn_size_type size); ++u32 aarch64_insn_gen_stadd(enum aarch64_insn_register address, ++ enum aarch64_insn_register value, ++ enum aarch64_insn_size_type size); + u32 aarch64_insn_gen_add_sub_imm(enum aarch64_insn_register dst, + enum aarch64_insn_register src, + int imm, enum aarch64_insn_variant variant, +--- a/arch/arm64/kernel/insn.c ++++ b/arch/arm64/kernel/insn.c +@@ -734,6 +734,46 @@ u32 aarch64_insn_gen_load_store_ex(enum + state); + } + ++u32 aarch64_insn_gen_ldadd(enum aarch64_insn_register result, ++ enum aarch64_insn_register address, ++ enum aarch64_insn_register value, ++ enum aarch64_insn_size_type size) ++{ ++ u32 insn = aarch64_insn_get_ldadd_value(); ++ ++
switch (size) { ++ case AARCH64_INSN_SIZE_32: ++ case AARCH64_INSN_SIZE_64: ++ break; ++ default: ++ pr_err("%s: unimplemented size encoding %d\n", __func__, size); ++ return AARCH64_BREAK_FAULT; ++ } ++ ++ insn = aarch64_insn_encode_ldst_size(size, insn); ++ ++ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, ++ result); ++ ++ insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, ++ address); ++ ++ return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RS, insn, ++ value); ++} ++ ++u32 aarch64_insn_gen_stadd(enum aarch64_insn_register address, ++ enum aarch64_insn_register value, ++ enum aarch64_insn_size_type size) ++{ ++ /* ++ * STADD is simply encoded as an alias for LDADD with XZR as ++ * the destination register. ++ */ ++ return aarch64_insn_gen_ldadd(AARCH64_INSN_REG_ZR, address, ++ value, size); ++} ++ + static u32 aarch64_insn_encode_prfm_imm(enum aarch64_insn_prfm_type type, + enum aarch64_insn_prfm_target target, + enum aarch64_insn_prfm_policy policy, +--- a/arch/arm64/net/bpf_jit.h ++++ b/arch/arm64/net/bpf_jit.h +@@ -100,6 +100,10 @@ + #define A64_STXR(sf, Rt, Rn, Rs) \ + A64_LSX(sf, Rt, Rn, Rs, STORE_EX) + ++/* LSE atomics */ ++#define A64_STADD(sf, Rn, Rs) \ ++ aarch64_insn_gen_stadd(Rn, Rs, A64_SIZE(sf)) ++ + /* Add/subtract (immediate) */ + #define A64_ADDSUB_IMM(sf, Rd, Rn, imm12, type) \ + aarch64_insn_gen_add_sub_imm(Rd, Rn, imm12, \ +--- a/arch/arm64/net/bpf_jit_comp.c ++++ b/arch/arm64/net/bpf_jit_comp.c +@@ -364,7 +364,7 @@ static int build_insn(const struct bpf_i + const int i = insn - ctx->prog->insnsi; + const bool is64 = BPF_CLASS(code) == BPF_ALU64; + const bool isdw = BPF_SIZE(code) == BPF_DW; +- u8 jmp_cond; ++ u8 jmp_cond, reg; + s32 jmp_offset; + + #define check_imm(bits, imm) do { \ +@@ -730,18 +730,28 @@ emit_cond_jmp: + break; + } + break; ++ + /* STX XADD: lock *(u32 *)(dst + off) += src */ + case BPF_STX | BPF_XADD | BPF_W: + /* STX XADD: lock *(u64 *)(dst + off) += src */ + case BPF_STX | BPF_XADD | BPF_DW: +- emit_a64_mov_i(1, tmp, off, ctx); +- emit(A64_ADD(1, tmp, tmp, dst), ctx); +- emit(A64_LDXR(isdw, tmp2, tmp), ctx); +- emit(A64_ADD(isdw, tmp2, tmp2, src), ctx); +- emit(A64_STXR(isdw, tmp2, tmp, tmp3), ctx); +- jmp_offset = -3; +- check_imm19(jmp_offset); +- emit(A64_CBNZ(0, tmp3, jmp_offset), ctx); ++ if (!off) { ++ reg = dst; ++ } else { ++ emit_a64_mov_i(1, tmp, off, ctx); ++ emit(A64_ADD(1, tmp, tmp, dst), ctx); ++ reg = tmp; ++ } ++ if (cpus_have_cap(ARM64_HAS_LSE_ATOMICS)) { ++ emit(A64_STADD(isdw, reg, src), ctx); ++ } else { ++ emit(A64_LDXR(isdw, tmp2, reg), ctx); ++ emit(A64_ADD(isdw, tmp2, tmp2, src), ctx); ++ emit(A64_STXR(isdw, tmp2, reg, tmp3), ctx); ++ jmp_offset = -3; ++ check_imm19(jmp_offset); ++ emit(A64_CBNZ(0, tmp3, jmp_offset), ctx); ++ } + break; + + default: diff --git a/queue-4.19/bpf-fix-nested-bpf-tracepoints-with-per-cpu-data.patch b/queue-4.19/bpf-fix-nested-bpf-tracepoints-with-per-cpu-data.patch new file mode 100644 index 00000000000..36aacf5be9b --- /dev/null +++ b/queue-4.19/bpf-fix-nested-bpf-tracepoints-with-per-cpu-data.patch @@ -0,0 +1,191 @@ +From 9594dc3c7e71b9f52bee1d7852eb3d4e3aea9e99 Mon Sep 17 00:00:00 2001 +From: Matt Mullins +Date: Tue, 11 Jun 2019 14:53:04 -0700 +Subject: bpf: fix nested bpf tracepoints with per-cpu data + +From: Matt Mullins + +commit 9594dc3c7e71b9f52bee1d7852eb3d4e3aea9e99 upstream. + +BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as +they do not increment bpf_prog_active while executing. 
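+
+The fix below pairs each per-CPU scratch area with a per-CPU nesting
+counter and hands out one slot per nesting level; in pattern form
+(identifiers abbreviated from the diff that follows):
+
+	int level = this_cpu_inc_return(nest_level);	/* 1, 2 or 3 */
+
+	if (WARN_ON_ONCE(level > ARRAY_SIZE(sds->sds))) {
+		err = -EBUSY;		/* nested too deeply */
+		goto out;
+	}
+	sd = &sds->sds[level - 1];	/* slot private to this level */
+	...
+out:
+	this_cpu_dec(nest_level);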
+ +This enables three levels of nesting, to support + - a kprobe or raw tp or perf event, + - another one of the above that irq context happens to call, and + - another one in nmi context +(at most one of which may be a kprobe or perf event). + +Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data") +Signed-off-by: Matt Mullins +Acked-by: Andrii Nakryiko +Acked-by: Daniel Borkmann +Signed-off-by: Alexei Starovoitov +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/trace/bpf_trace.c | 100 +++++++++++++++++++++++++++++++++++++++-------- + 1 file changed, 84 insertions(+), 16 deletions(-) + +--- a/kernel/trace/bpf_trace.c ++++ b/kernel/trace/bpf_trace.c +@@ -365,8 +365,6 @@ static const struct bpf_func_proto bpf_p + .arg4_type = ARG_CONST_SIZE, + }; + +-static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd); +- + static __always_inline u64 + __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, + u64 flags, struct perf_sample_data *sd) +@@ -398,24 +396,50 @@ __bpf_perf_event_output(struct pt_regs * + return 0; + } + ++/* ++ * Support executing tracepoints in normal, irq, and nmi context that each call ++ * bpf_perf_event_output ++ */ ++struct bpf_trace_sample_data { ++ struct perf_sample_data sds[3]; ++}; ++ ++static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds); ++static DEFINE_PER_CPU(int, bpf_trace_nest_level); + BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, + u64, flags, void *, data, u64, size) + { +- struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd); ++ struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds); ++ int nest_level = this_cpu_inc_return(bpf_trace_nest_level); + struct perf_raw_record raw = { + .frag = { + .size = size, + .data = data, + }, + }; ++ struct perf_sample_data *sd; ++ int err; + +- if (unlikely(flags & ~(BPF_F_INDEX_MASK))) +- return -EINVAL; ++ if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) { ++ err = -EBUSY; ++ goto out; ++ } ++ ++ sd = &sds->sds[nest_level - 1]; ++ ++ if (unlikely(flags & ~(BPF_F_INDEX_MASK))) { ++ err = -EINVAL; ++ goto out; ++ } + + perf_sample_data_init(sd, 0, 0); + sd->raw = &raw; + +- return __bpf_perf_event_output(regs, map, flags, sd); ++ err = __bpf_perf_event_output(regs, map, flags, sd); ++ ++out: ++ this_cpu_dec(bpf_trace_nest_level); ++ return err; + } + + static const struct bpf_func_proto bpf_perf_event_output_proto = { +@@ -772,16 +796,48 @@ pe_prog_func_proto(enum bpf_func_id func + /* + * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp + * to avoid potential recursive reuse issue when/if tracepoints are added +- * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack ++ * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack. ++ * ++ * Since raw tracepoints run despite bpf_prog_active, support concurrent usage ++ * in normal, irq, and nmi context. 
+ */ +-static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs); ++struct bpf_raw_tp_regs { ++ struct pt_regs regs[3]; ++}; ++static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs); ++static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level); ++static struct pt_regs *get_bpf_raw_tp_regs(void) ++{ ++ struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs); ++ int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level); ++ ++ if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) { ++ this_cpu_dec(bpf_raw_tp_nest_level); ++ return ERR_PTR(-EBUSY); ++ } ++ ++ return &tp_regs->regs[nest_level - 1]; ++} ++ ++static void put_bpf_raw_tp_regs(void) ++{ ++ this_cpu_dec(bpf_raw_tp_nest_level); ++} ++ + BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, + struct bpf_map *, map, u64, flags, void *, data, u64, size) + { +- struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); ++ struct pt_regs *regs = get_bpf_raw_tp_regs(); ++ int ret; ++ ++ if (IS_ERR(regs)) ++ return PTR_ERR(regs); + + perf_fetch_caller_regs(regs); +- return ____bpf_perf_event_output(regs, map, flags, data, size); ++ ret = ____bpf_perf_event_output(regs, map, flags, data, size); ++ ++ put_bpf_raw_tp_regs(); ++ return ret; + } + + static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { +@@ -798,12 +854,18 @@ static const struct bpf_func_proto bpf_p + BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, + struct bpf_map *, map, u64, flags) + { +- struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); ++ struct pt_regs *regs = get_bpf_raw_tp_regs(); ++ int ret; ++ ++ if (IS_ERR(regs)) ++ return PTR_ERR(regs); + + perf_fetch_caller_regs(regs); + /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */ +- return bpf_get_stackid((unsigned long) regs, (unsigned long) map, +- flags, 0, 0); ++ ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map, ++ flags, 0, 0); ++ put_bpf_raw_tp_regs(); ++ return ret; + } + + static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { +@@ -818,11 +880,17 @@ static const struct bpf_func_proto bpf_g + BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args, + void *, buf, u32, size, u64, flags) + { +- struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); ++ struct pt_regs *regs = get_bpf_raw_tp_regs(); ++ int ret; ++ ++ if (IS_ERR(regs)) ++ return PTR_ERR(regs); + + perf_fetch_caller_regs(regs); +- return bpf_get_stack((unsigned long) regs, (unsigned long) buf, +- (unsigned long) size, flags, 0); ++ ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf, ++ (unsigned long) size, flags, 0); ++ put_bpf_raw_tp_regs(); ++ return ret; + } + + static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { diff --git a/queue-4.19/bpf-fix-unconnected-udp-hooks.patch b/queue-4.19/bpf-fix-unconnected-udp-hooks.patch new file mode 100644 index 00000000000..7b98746a70c --- /dev/null +++ b/queue-4.19/bpf-fix-unconnected-udp-hooks.patch @@ -0,0 +1,308 @@ +From 983695fa676568fc0fe5ddd995c7267aabc24632 Mon Sep 17 00:00:00 2001 +From: Daniel Borkmann +Date: Fri, 7 Jun 2019 01:48:57 +0200 +Subject: bpf: fix unconnected udp hooks + +From: Daniel Borkmann + +commit 983695fa676568fc0fe5ddd995c7267aabc24632 upstream. + +Intention of cgroup bind/connect/sendmsg BPF hooks is to act transparently +to applications as also stated in original motivation in 7828f20e3779 ("Merge +branch 'bpf-cgroup-bind-connect'"). 
When recently integrating the latter +two hooks into Cilium to enable host based load-balancing with Kubernetes, +I ran into the issue that pods couldn't start up as DNS got broken. Kubernetes +typically sets up DNS as a service and is thus subject to load-balancing. + +Upon further debugging, it turns out that the cgroupv2 sendmsg BPF hooks API +is currently insufficient and thus not usable as-is for standard applications +shipped with most distros. To break down the issue we ran into with a simple +example: + + # cat /etc/resolv.conf + nameserver 147.75.207.207 + nameserver 147.75.207.208 + +For the purpose of a simple test, we set up above IPs as service IPs and +transparently redirect traffic to a different DNS backend server for that +node: + + # cilium service list + ID Frontend Backend + 1 147.75.207.207:53 1 => 8.8.8.8:53 + 2 147.75.207.208:53 1 => 8.8.8.8:53 + +The attached BPF program is basically selecting one of the backends if the +service IP/port matches on the cgroup hook. DNS breaks here, because the +hooks are not transparent enough to applications which have built-in msg_name +address checks: + + # nslookup 1.1.1.1 + ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53 + ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.208#53 + ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53 + [...] + ;; connection timed out; no servers could be reached + + # dig 1.1.1.1 + ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53 + ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.208#53 + ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53 + [...] + + ; <<>> DiG 9.11.3-1ubuntu1.7-Ubuntu <<>> 1.1.1.1 + ;; global options: +cmd + ;; connection timed out; no servers could be reached + +For comparison, if none of the service IPs is used, and we tell nslookup +to use 8.8.8.8 directly it works just fine, of course: + + # nslookup 1.1.1.1 8.8.8.8 + 1.1.1.1.in-addr.arpa name = one.one.one.one. + +In order to fix this and thus act more transparent to the application, +this needs reverse translation on recvmsg() side. A minimal fix for this +API is to add similar recvmsg() hooks behind the BPF cgroups static key +such that the program can track state and replace the current sockaddr_in{,6} +with the original service IP. From BPF side, this basically tracks the +service tuple plus socket cookie in an LRU map where the reverse NAT can +then be retrieved via map value as one example. Side-note: the BPF cgroups +static key should be converted to a per-hook static key in future. + +Same example after this fix: + + # cilium service list + ID Frontend Backend + 1 147.75.207.207:53 1 => 8.8.8.8:53 + 2 147.75.207.208:53 1 => 8.8.8.8:53 + +Lookups work fine now: + + # nslookup 1.1.1.1 + 1.1.1.1.in-addr.arpa name = one.one.one.one. + + Authoritative answers can be found from: + + # dig 1.1.1.1 + + ; <<>> DiG 9.11.3-1ubuntu1.7-Ubuntu <<>> 1.1.1.1 + ;; global options: +cmd + ;; Got answer: + ;; ->>HEADER<<- opcode: QUERY, status: NXDOMAIN, id: 51550 + ;; flags: qr rd ra ad; QUERY: 1, ANSWER: 0, AUTHORITY: 1, ADDITIONAL: 1 + + ;; OPT PSEUDOSECTION: + ; EDNS: version: 0, flags:; udp: 512 + ;; QUESTION SECTION: + ;1.1.1.1. IN A + + ;; AUTHORITY SECTION: + . 23426 IN SOA a.root-servers.net. nstld.verisign-grs.com. 
2019052001 1800 900 604800 86400 + + ;; Query time: 17 msec + ;; SERVER: 147.75.207.207#53(147.75.207.207) + ;; WHEN: Tue May 21 12:59:38 UTC 2019 + ;; MSG SIZE rcvd: 111 + +And from an actual packet level it shows that we're using the back-end +server when talking via the 147.75.207.20{7,8} front end: + + # tcpdump -i any udp + [...] + 12:59:52.698732 IP foo.42011 > google-public-dns-a.google.com.domain: 18803+ PTR? 1.1.1.1.in-addr.arpa. (38) + 12:59:52.698735 IP foo.42011 > google-public-dns-a.google.com.domain: 18803+ PTR? 1.1.1.1.in-addr.arpa. (38) + 12:59:52.701208 IP google-public-dns-a.google.com.domain > foo.42011: 18803 1/0/0 PTR one.one.one.one. (67) + 12:59:52.701208 IP google-public-dns-a.google.com.domain > foo.42011: 18803 1/0/0 PTR one.one.one.one. (67) + [...] + +In order to be flexible and to have the same semantics as in sendmsg BPF +programs, we only allow return codes in the [1,1] range. In the sendmsg +case the program is called if msg->msg_name is present, which can be the +case in both connected and unconnected UDP. + +The former only relies on the sockaddr_in{,6} passed via connect(2) if +the passed msg->msg_name was NULL. Therefore, on the recvmsg side, we act +in a similar way and call into the BPF program whenever a non-NULL +msg->msg_name was passed, independent of sk->sk_state being +TCP_ESTABLISHED or not. Note that for the TCP case, the msg->msg_name is +ignored in the regular recvmsg path and therefore not relevant. + +For the case of the ip{,v6}_recv_error() paths, picked up via +MSG_ERRQUEUE, the hook is not called. This is intentional as it aligns +with the same semantics as the TCP cgroup BPF hooks have right now. This +might be better addressed in the future through a different +bpf_attach_type such that this case can be distinguished from the regular +recvmsg paths, for example.
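+
+As an illustration only (the map layout, struct and program names below
+are invented, not part of this patch), such a recvmsg4 program might
+restore the original service address from an LRU map keyed by the
+socket cookie:
+
+	struct rev_nat {
+		__u32 ip4;	/* original service IP, network byte order */
+		__u32 port;	/* original service port, network byte order */
+	};
+
+	struct bpf_map_def SEC("maps") rev_nat_map = {
+		.type		= BPF_MAP_TYPE_LRU_HASH,
+		.key_size	= sizeof(__u64),	/* socket cookie */
+		.value_size	= sizeof(struct rev_nat),
+		.max_entries	= 65536,
+	};
+
+	SEC("cgroup/recvmsg4")
+	int restore_service_addr(struct bpf_sock_addr *ctx)
+	{
+		__u64 cookie = bpf_get_socket_cookie(ctx);
+		struct rev_nat *nat;
+
+		nat = bpf_map_lookup_elem(&rev_nat_map, &cookie);
+		if (nat) {
+			/* rewrite msg_name back to the service tuple */
+			ctx->user_ip4 = nat->ip4;
+			ctx->user_port = nat->port;
+		}
+		return 1;	/* recvmsg hooks may only return 1 */
+	}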
+ +Fixes: 1cedee13d25a ("bpf: Hooks for sys_sendmsg") +Signed-off-by: Daniel Borkmann +Acked-by: Andrey Ignatov +Acked-by: Martin KaFai Lau +Acked-by: Martynas Pumputis +Signed-off-by: Alexei Starovoitov +Signed-off-by: Daniel Borkmann +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/bpf-cgroup.h | 8 ++++++++ + include/uapi/linux/bpf.h | 2 ++ + kernel/bpf/syscall.c | 8 ++++++++ + kernel/bpf/verifier.c | 12 ++++++++---- + net/core/filter.c | 2 ++ + net/ipv4/udp.c | 4 ++++ + net/ipv6/udp.c | 4 ++++ + 7 files changed, 36 insertions(+), 4 deletions(-) + +--- a/include/linux/bpf-cgroup.h ++++ b/include/linux/bpf-cgroup.h +@@ -210,6 +210,12 @@ void bpf_cgroup_storage_release(struct b + #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) \ + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_SENDMSG, t_ctx) + ++#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) \ ++ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_RECVMSG, NULL) ++ ++#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) \ ++ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_RECVMSG, NULL) ++ + #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ + ({ \ + int __ret = 0; \ +@@ -290,6 +296,8 @@ static inline void bpf_cgroup_storage_fr + #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; }) + #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; }) + #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; }) ++#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) ({ 0; }) ++#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; }) + #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) + #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) + +--- a/include/uapi/linux/bpf.h ++++ b/include/uapi/linux/bpf.h +@@ -172,6 +172,8 @@ enum bpf_attach_type { + BPF_CGROUP_UDP4_SENDMSG, + BPF_CGROUP_UDP6_SENDMSG, + BPF_LIRC_MODE2, ++ BPF_CGROUP_UDP4_RECVMSG = 19, ++ BPF_CGROUP_UDP6_RECVMSG, + __MAX_BPF_ATTACH_TYPE + }; + +--- a/kernel/bpf/syscall.c ++++ b/kernel/bpf/syscall.c +@@ -1342,6 +1342,8 @@ bpf_prog_load_check_attach_type(enum bpf + case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP6_SENDMSG: ++ case BPF_CGROUP_UDP4_RECVMSG: ++ case BPF_CGROUP_UDP6_RECVMSG: + return 0; + default: + return -EINVAL; +@@ -1622,6 +1624,8 @@ static int bpf_prog_attach(const union b + case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP6_SENDMSG: ++ case BPF_CGROUP_UDP4_RECVMSG: ++ case BPF_CGROUP_UDP6_RECVMSG: + ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; + break; + case BPF_CGROUP_SOCK_OPS: +@@ -1698,6 +1702,8 @@ static int bpf_prog_detach(const union b + case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP6_SENDMSG: ++ case BPF_CGROUP_UDP4_RECVMSG: ++ case BPF_CGROUP_UDP6_RECVMSG: + ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; + break; + case BPF_CGROUP_SOCK_OPS: +@@ -1744,6 +1750,8 @@ static int bpf_prog_query(const union bp + case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UDP4_SENDMSG: + case BPF_CGROUP_UDP6_SENDMSG: ++ case BPF_CGROUP_UDP4_RECVMSG: ++ case BPF_CGROUP_UDP6_RECVMSG: + case BPF_CGROUP_SOCK_OPS: + case BPF_CGROUP_DEVICE: + break; +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -4342,9 +4342,12 @@ static int check_return_code(struct bpf_ + struct tnum range = tnum_range(0, 1); + + switch (env->prog->type) { ++ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: ++ if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || ++ 
env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG) ++ range = tnum_range(1, 1); + case BPF_PROG_TYPE_CGROUP_SKB: + case BPF_PROG_TYPE_CGROUP_SOCK: +- case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_CGROUP_DEVICE: + break; +@@ -4360,16 +4363,17 @@ static int check_return_code(struct bpf_ + } + + if (!tnum_in(range, reg->var_off)) { ++ char tn_buf[48]; ++ + verbose(env, "At program exit the register R0 "); + if (!tnum_is_unknown(reg->var_off)) { +- char tn_buf[48]; +- + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "has value %s", tn_buf); + } else { + verbose(env, "has unknown scalar value"); + } +- verbose(env, " should have been 0 or 1\n"); ++ tnum_strn(tn_buf, sizeof(tn_buf), range); ++ verbose(env, " should have been in %s\n", tn_buf); + return -EINVAL; + } + return 0; +--- a/net/core/filter.c ++++ b/net/core/filter.c +@@ -5558,6 +5558,7 @@ static bool sock_addr_is_valid_access(in + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_UDP4_SENDMSG: ++ case BPF_CGROUP_UDP4_RECVMSG: + break; + default: + return false; +@@ -5568,6 +5569,7 @@ static bool sock_addr_is_valid_access(in + case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_UDP6_SENDMSG: ++ case BPF_CGROUP_UDP6_RECVMSG: + break; + default: + return false; +--- a/net/ipv4/udp.c ++++ b/net/ipv4/udp.c +@@ -1720,6 +1720,10 @@ try_again: + sin->sin_addr.s_addr = ip_hdr(skb)->saddr; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + *addr_len = sizeof(*sin); ++ ++ if (cgroup_bpf_enabled) ++ BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, ++ (struct sockaddr *)sin); + } + if (inet->cmsg_flags) + ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off); +--- a/net/ipv6/udp.c ++++ b/net/ipv6/udp.c +@@ -419,6 +419,10 @@ try_again: + inet6_iif(skb)); + } + *addr_len = sizeof(*sin6); ++ ++ if (cgroup_bpf_enabled) ++ BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, ++ (struct sockaddr *)sin6); + } + + if (np->rxopt.all) diff --git a/queue-4.19/bpf-lpm_trie-check-left-child-of-last-leftmost-node-for-null.patch b/queue-4.19/bpf-lpm_trie-check-left-child-of-last-leftmost-node-for-null.patch new file mode 100644 index 00000000000..1dd60827f4b --- /dev/null +++ b/queue-4.19/bpf-lpm_trie-check-left-child-of-last-leftmost-node-for-null.patch @@ -0,0 +1,127 @@ +From da2577fdd0932ea4eefe73903f1130ee366767d2 Mon Sep 17 00:00:00 2001 +From: Jonathan Lemon +Date: Sat, 8 Jun 2019 12:54:19 -0700 +Subject: bpf: lpm_trie: check left child of last leftmost node for NULL + +From: Jonathan Lemon + +commit da2577fdd0932ea4eefe73903f1130ee366767d2 upstream. + +If the leftmost parent node of the tree does not have a child +on the left side, then trie_get_next_key (and bpftool map dump) will +not look at the child on the right. This leads to the traversal +missing elements. + +Lookup is not affected. + +Update selftest to handle this case. + +Reproducer: + + bpftool map create /sys/fs/bpf/lpm type lpm_trie key 6 \ + value 1 entries 256 name test_lpm flags 1 + bpftool map update pinned /sys/fs/bpf/lpm key 8 0 0 0 0 0 value 1 + bpftool map update pinned /sys/fs/bpf/lpm key 16 0 0 0 0 128 value 2 + bpftool map dump pinned /sys/fs/bpf/lpm + +Returns only 1 element.
(2 expected) + +Fixes: b471f2f1de8b ("bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE") +Signed-off-by: Jonathan Lemon +Acked-by: Martin KaFai Lau +Signed-off-by: Daniel Borkmann +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/bpf/lpm_trie.c | 9 ++++-- + tools/testing/selftests/bpf/test_lpm_map.c | 41 ++++++++++++++++++++++++++--- + 2 files changed, 45 insertions(+), 5 deletions(-) + +--- a/kernel/bpf/lpm_trie.c ++++ b/kernel/bpf/lpm_trie.c +@@ -676,9 +676,14 @@ find_leftmost: + * have exact two children, so this function will never return NULL. + */ + for (node = search_root; node;) { +- if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) ++ if (node->flags & LPM_TREE_NODE_FLAG_IM) { ++ node = rcu_dereference(node->child[0]); ++ } else { + next_node = node; +- node = rcu_dereference(node->child[0]); ++ node = rcu_dereference(node->child[0]); ++ if (!node) ++ node = rcu_dereference(next_node->child[1]); ++ } + } + do_copy: + next_key->prefixlen = next_node->prefixlen; +--- a/tools/testing/selftests/bpf/test_lpm_map.c ++++ b/tools/testing/selftests/bpf/test_lpm_map.c +@@ -573,13 +573,13 @@ static void test_lpm_get_next_key(void) + + /* add one more element (total two) */ + key_p->prefixlen = 24; +- inet_pton(AF_INET, "192.168.0.0", key_p->data); ++ inet_pton(AF_INET, "192.168.128.0", key_p->data); + assert(bpf_map_update_elem(map_fd, key_p, &value, 0) == 0); + + memset(key_p, 0, key_size); + assert(bpf_map_get_next_key(map_fd, NULL, key_p) == 0); + assert(key_p->prefixlen == 24 && key_p->data[0] == 192 && +- key_p->data[1] == 168 && key_p->data[2] == 0); ++ key_p->data[1] == 168 && key_p->data[2] == 128); + + memset(next_key_p, 0, key_size); + assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0); +@@ -592,7 +592,7 @@ static void test_lpm_get_next_key(void) + + /* Add one more element (total three) */ + key_p->prefixlen = 24; +- inet_pton(AF_INET, "192.168.128.0", key_p->data); ++ inet_pton(AF_INET, "192.168.0.0", key_p->data); + assert(bpf_map_update_elem(map_fd, key_p, &value, 0) == 0); + + memset(key_p, 0, key_size); +@@ -628,6 +628,41 @@ static void test_lpm_get_next_key(void) + assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0); + assert(next_key_p->prefixlen == 24 && next_key_p->data[0] == 192 && + next_key_p->data[1] == 168 && next_key_p->data[2] == 1); ++ ++ memcpy(key_p, next_key_p, key_size); ++ assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0); ++ assert(next_key_p->prefixlen == 24 && next_key_p->data[0] == 192 && ++ next_key_p->data[1] == 168 && next_key_p->data[2] == 128); ++ ++ memcpy(key_p, next_key_p, key_size); ++ assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0); ++ assert(next_key_p->prefixlen == 16 && next_key_p->data[0] == 192 && ++ next_key_p->data[1] == 168); ++ ++ memcpy(key_p, next_key_p, key_size); ++ assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == -1 && ++ errno == ENOENT); ++ ++ /* Add one more element (total five) */ ++ key_p->prefixlen = 28; ++ inet_pton(AF_INET, "192.168.1.128", key_p->data); ++ assert(bpf_map_update_elem(map_fd, key_p, &value, 0) == 0); ++ ++ memset(key_p, 0, key_size); ++ assert(bpf_map_get_next_key(map_fd, NULL, key_p) == 0); ++ assert(key_p->prefixlen == 24 && key_p->data[0] == 192 && ++ key_p->data[1] == 168 && key_p->data[2] == 0); ++ ++ memset(next_key_p, 0, key_size); ++ assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0); ++ assert(next_key_p->prefixlen == 28 && next_key_p->data[0] == 192 && ++ next_key_p->data[1] == 168 && next_key_p->data[2] == 1 && ++ next_key_p->data[3] == 
128); ++ ++ memcpy(key_p, next_key_p, key_size); ++ assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0); ++ assert(next_key_p->prefixlen == 24 && next_key_p->data[0] == 192 && ++ next_key_p->data[1] == 168 && next_key_p->data[2] == 1); + + memcpy(key_p, next_key_p, key_size); + assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0); diff --git a/queue-4.19/bpf-simplify-definition-of-bpf_fib_lookup-related-flags.patch b/queue-4.19/bpf-simplify-definition-of-bpf_fib_lookup-related-flags.patch new file mode 100644 index 00000000000..f01d590ba65 --- /dev/null +++ b/queue-4.19/bpf-simplify-definition-of-bpf_fib_lookup-related-flags.patch @@ -0,0 +1,40 @@ +From b1d6c15b9d824a58c5415673f374fac19e8eccdf Mon Sep 17 00:00:00 2001 +From: Martynas Pumputis +Date: Wed, 12 Jun 2019 18:05:40 +0200 +Subject: bpf: simplify definition of BPF_FIB_LOOKUP related flags + +From: Martynas Pumputis + +commit b1d6c15b9d824a58c5415673f374fac19e8eccdf upstream. + +Previously, the BPF_FIB_LOOKUP_{DIRECT,OUTPUT} flags in the BPF UAPI +were defined with the help of the BIT() macro. This had the following issues: + +- In order to use any of the flags, a user was required to depend + on <linux/bits.h>. +- No other flag in bpf.h uses the macro, so it seems that an unwritten + convention is to use (1 << (nr)) to define BPF-related flags. + +Fixes: 87f5fc7e48dd ("bpf: Provide helper to do forwarding lookups in kernel FIB table") +Signed-off-by: Martynas Pumputis +Acked-by: Andrii Nakryiko +Signed-off-by: Daniel Borkmann +Signed-off-by: Greg Kroah-Hartman + +--- + include/uapi/linux/bpf.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/include/uapi/linux/bpf.h ++++ b/include/uapi/linux/bpf.h +@@ -2705,8 +2705,8 @@ struct bpf_raw_tracepoint_args { + /* DIRECT: Skip the FIB rules and go to FIB table associated with device + * OUTPUT: Do lookup from egress perspective; default is ingress + */ +-#define BPF_FIB_LOOKUP_DIRECT BIT(0) +-#define BPF_FIB_LOOKUP_OUTPUT BIT(1) ++#define BPF_FIB_LOOKUP_DIRECT (1U << 0) ++#define BPF_FIB_LOOKUP_OUTPUT (1U << 1) + + enum { + BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */ diff --git a/queue-4.19/bpf-udp-avoid-calling-reuseport-s-bpf_prog-from-udp_gro.patch b/queue-4.19/bpf-udp-avoid-calling-reuseport-s-bpf_prog-from-udp_gro.patch new file mode 100644 index 00000000000..b4fa6614a2e --- /dev/null +++ b/queue-4.19/bpf-udp-avoid-calling-reuseport-s-bpf_prog-from-udp_gro.patch @@ -0,0 +1,55 @@ +From 257a525fe2e49584842c504a92c27097407f778f Mon Sep 17 00:00:00 2001 +From: Martin KaFai Lau +Date: Fri, 31 May 2019 15:29:13 -0700 +Subject: bpf: udp: Avoid calling reuseport's bpf_prog from udp_gro + +From: Martin KaFai Lau + +commit 257a525fe2e49584842c504a92c27097407f778f upstream. + +When the commit a6024562ffd7 ("udp: Add GRO functions to UDP socket") +added udp[46]_lib_lookup_skb to the udp_gro code path, it broke +the reuseport_select_sock() assumption that skb->data is pointing +to the transport header. + +This patch follows an earlier __udp6_lib_err() fix by +passing a NULL skb to avoid calling the reuseport's bpf_prog.
+ +Fixes: a6024562ffd7 ("udp: Add GRO functions to UDP socket") +Cc: Tom Herbert +Signed-off-by: Martin KaFai Lau +Acked-by: Song Liu +Signed-off-by: Alexei Starovoitov +Signed-off-by: Greg Kroah-Hartman + +--- + net/ipv4/udp.c | 6 +++++- + net/ipv6/udp.c | 2 +- + 2 files changed, 6 insertions(+), 2 deletions(-) + +--- a/net/ipv4/udp.c ++++ b/net/ipv4/udp.c +@@ -542,7 +542,11 @@ static inline struct sock *__udp4_lib_lo + struct sock *udp4_lib_lookup_skb(struct sk_buff *skb, + __be16 sport, __be16 dport) + { +- return __udp4_lib_lookup_skb(skb, sport, dport, &udp_table); ++ const struct iphdr *iph = ip_hdr(skb); ++ ++ return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport, ++ iph->daddr, dport, inet_iif(skb), ++ inet_sdif(skb), &udp_table, NULL); + } + EXPORT_SYMBOL_GPL(udp4_lib_lookup_skb); + +--- a/net/ipv6/udp.c ++++ b/net/ipv6/udp.c +@@ -282,7 +282,7 @@ struct sock *udp6_lib_lookup_skb(struct + + return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport, + &iph->daddr, dport, inet6_iif(skb), +- inet6_sdif(skb), &udp_table, skb); ++ inet6_sdif(skb), &udp_table, NULL); + } + EXPORT_SYMBOL_GPL(udp6_lib_lookup_skb); + diff --git a/queue-4.19/bpf-udp-ipv6-avoid-running-reuseport-s-bpf_prog-from-__udp6_lib_err.patch b/queue-4.19/bpf-udp-ipv6-avoid-running-reuseport-s-bpf_prog-from-__udp6_lib_err.patch new file mode 100644 index 00000000000..a554d4fa209 --- /dev/null +++ b/queue-4.19/bpf-udp-ipv6-avoid-running-reuseport-s-bpf_prog-from-__udp6_lib_err.patch @@ -0,0 +1,50 @@ +From 4ac30c4b3659efac031818c418beb51e630d512d Mon Sep 17 00:00:00 2001 +From: Martin KaFai Lau +Date: Fri, 31 May 2019 15:29:11 -0700 +Subject: bpf: udp: ipv6: Avoid running reuseport's bpf_prog from __udp6_lib_err + +From: Martin KaFai Lau + +commit 4ac30c4b3659efac031818c418beb51e630d512d upstream. + +__udp6_lib_err() may be called when handling icmpv6 message. For example, +the icmpv6 toobig(type=2). __udp6_lib_lookup() is then called +which may call reuseport_select_sock(). reuseport_select_sock() will +call into a bpf_prog (if there is one). + +reuseport_select_sock() is expecting the skb->data pointing to the +transport header (udphdr in this case). For example, run_bpf_filter() +is pulling the transport header. + +However, in the __udp6_lib_err() path, the skb->data is pointing to the +ipv6hdr instead of the udphdr. + +One option is to pull and push the ipv6hdr in __udp6_lib_err(). +Instead of doing this, this patch follows how the original +commit 538950a1b752 ("soreuseport: setsockopt SO_ATTACH_REUSEPORT_[CE]BPF") +was done in IPv4, which has passed a NULL skb pointer to +reuseport_select_sock(). 
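+
+For reference, the relevant check in reuseport_select_sock() (abridged
+from net/core/sock_reuseport.c; the exact shape varies by version) only
+consults the program when an skb is supplied, so a NULL skb falls
+through to the plain hash-based selection:
+
+	if (!prog || !skb)
+		goto select_by_hash;
+	...
+select_by_hash:
+	/* no bpf or invalid bpf result: fall back to hash usage */
+	if (!sk2)
+		sk2 = reuse->socks[reciprocal_scale(hash, socks)];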
+ +Fixes: 538950a1b752 ("soreuseport: setsockopt SO_ATTACH_REUSEPORT_[CE]BPF") +Cc: Craig Gallek +Signed-off-by: Martin KaFai Lau +Acked-by: Song Liu +Acked-by: Craig Gallek +Signed-off-by: Alexei Starovoitov +Signed-off-by: Daniel Borkmann +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/udp.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/ipv6/udp.c ++++ b/net/ipv6/udp.c +@@ -482,7 +482,7 @@ void __udp6_lib_err(struct sk_buff *skb, + struct net *net = dev_net(skb->dev); + + sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source, +- inet6_iif(skb), 0, udptable, skb); ++ inet6_iif(skb), 0, udptable, NULL); + if (!sk) { + __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), + ICMP6_MIB_INERRORS); diff --git a/queue-4.19/futex-update-comments-and-docs-about-return-values-of-arch-futex-code.patch b/queue-4.19/futex-update-comments-and-docs-about-return-values-of-arch-futex-code.patch new file mode 100644 index 00000000000..d40da9b8503 --- /dev/null +++ b/queue-4.19/futex-update-comments-and-docs-about-return-values-of-arch-futex-code.patch @@ -0,0 +1,57 @@ +From 427503519739e779c0db8afe876c1b33f3ac60ae Mon Sep 17 00:00:00 2001 +From: Will Deacon +Date: Wed, 10 Apr 2019 11:51:54 +0100 +Subject: futex: Update comments and docs about return values of arch futex code + +From: Will Deacon + +commit 427503519739e779c0db8afe876c1b33f3ac60ae upstream. + +The architecture implementations of 'arch_futex_atomic_op_inuser()' and +'futex_atomic_cmpxchg_inatomic()' are permitted to return only -EFAULT, +-EAGAIN or -ENOSYS in the case of failure. + +Update the comments in the asm-generic/ implementation and also a stray +reference in the robust futex documentation. + +Signed-off-by: Will Deacon +Signed-off-by: Greg Kroah-Hartman + +--- + Documentation/robust-futexes.txt | 3 +-- + include/asm-generic/futex.h | 8 ++++++-- + 2 files changed, 7 insertions(+), 4 deletions(-) + +--- a/Documentation/robust-futexes.txt ++++ b/Documentation/robust-futexes.txt +@@ -218,5 +218,4 @@ All other architectures should build jus + the new syscalls yet. + + Architectures need to implement the new futex_atomic_cmpxchg_inatomic() +-inline function before writing up the syscalls (that function returns +--ENOSYS right now). ++inline function before writing up the syscalls. 
+--- a/include/asm-generic/futex.h ++++ b/include/asm-generic/futex.h +@@ -23,7 +23,9 @@ + * + * Return: + * 0 - On success +- * <0 - On error ++ * -EFAULT - User access resulted in a page fault ++ * -EAGAIN - Atomic operation was unable to complete due to contention ++ * -ENOSYS - Operation not supported + */ + static inline int + arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, u32 __user *uaddr) +@@ -85,7 +87,9 @@ out_pagefault_enable: + * + * Return: + * 0 - On success +- * <0 - On error ++ * -EFAULT - User access resulted in a page fault ++ * -EAGAIN - Atomic operation was unable to complete due to contention ++ * -ENOSYS - Function not implemented (only if !HAVE_FUTEX_CMPXCHG) + */ + static inline int + futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, diff --git a/queue-4.19/series b/queue-4.19/series index c3fee71f563..da14211a7da 100644 --- a/queue-4.19/series +++ b/queue-4.19/series @@ -59,3 +59,12 @@ team-always-enable-vlan-tx-offload.patch tipc-change-to-use-register_pernet_device.patch tipc-check-msg-req-data-len-in-tipc_nl_compat_bearer_disable.patch tun-wake-up-waitqueues-after-iff_up-is-set.patch +bpf-simplify-definition-of-bpf_fib_lookup-related-flags.patch +bpf-lpm_trie-check-left-child-of-last-leftmost-node-for-null.patch +bpf-fix-nested-bpf-tracepoints-with-per-cpu-data.patch +bpf-fix-unconnected-udp-hooks.patch +bpf-udp-avoid-calling-reuseport-s-bpf_prog-from-udp_gro.patch +bpf-udp-ipv6-avoid-running-reuseport-s-bpf_prog-from-__udp6_lib_err.patch +arm64-futex-avoid-copying-out-uninitialised-stack-in-failed-cmpxchg.patch +bpf-arm64-use-more-scalable-stadd-over-ldxr-stxr-loop-in-xadd.patch +futex-update-comments-and-docs-about-return-values-of-arch-futex-code.patch -- 2.47.2