git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.19-stable patches
author     Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 2 Jul 2019 06:16:32 +0000 (08:16 +0200)
committer  Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 2 Jul 2019 06:16:32 +0000 (08:16 +0200)
added patches:
arm64-futex-avoid-copying-out-uninitialised-stack-in-failed-cmpxchg.patch
bpf-arm64-use-more-scalable-stadd-over-ldxr-stxr-loop-in-xadd.patch
bpf-fix-nested-bpf-tracepoints-with-per-cpu-data.patch
bpf-fix-unconnected-udp-hooks.patch
bpf-lpm_trie-check-left-child-of-last-leftmost-node-for-null.patch
bpf-simplify-definition-of-bpf_fib_lookup-related-flags.patch
bpf-udp-avoid-calling-reuseport-s-bpf_prog-from-udp_gro.patch
bpf-udp-ipv6-avoid-running-reuseport-s-bpf_prog-from-__udp6_lib_err.patch
futex-update-comments-and-docs-about-return-values-of-arch-futex-code.patch

queue-4.19/arm64-futex-avoid-copying-out-uninitialised-stack-in-failed-cmpxchg.patch [new file with mode: 0644]
queue-4.19/bpf-arm64-use-more-scalable-stadd-over-ldxr-stxr-loop-in-xadd.patch [new file with mode: 0644]
queue-4.19/bpf-fix-nested-bpf-tracepoints-with-per-cpu-data.patch [new file with mode: 0644]
queue-4.19/bpf-fix-unconnected-udp-hooks.patch [new file with mode: 0644]
queue-4.19/bpf-lpm_trie-check-left-child-of-last-leftmost-node-for-null.patch [new file with mode: 0644]
queue-4.19/bpf-simplify-definition-of-bpf_fib_lookup-related-flags.patch [new file with mode: 0644]
queue-4.19/bpf-udp-avoid-calling-reuseport-s-bpf_prog-from-udp_gro.patch [new file with mode: 0644]
queue-4.19/bpf-udp-ipv6-avoid-running-reuseport-s-bpf_prog-from-__udp6_lib_err.patch [new file with mode: 0644]
queue-4.19/futex-update-comments-and-docs-about-return-values-of-arch-futex-code.patch [new file with mode: 0644]
queue-4.19/series

diff --git a/queue-4.19/arm64-futex-avoid-copying-out-uninitialised-stack-in-failed-cmpxchg.patch b/queue-4.19/arm64-futex-avoid-copying-out-uninitialised-stack-in-failed-cmpxchg.patch
new file mode 100644 (file)
index 0000000..f6be711
--- /dev/null
@@ -0,0 +1,35 @@
+From 8e4e0ac02b449297b86498ac24db5786ddd9f647 Mon Sep 17 00:00:00 2001
+From: Will Deacon <will.deacon@arm.com>
+Date: Wed, 10 Apr 2019 11:49:11 +0100
+Subject: arm64: futex: Avoid copying out uninitialised stack in failed cmpxchg()
+
+From: Will Deacon <will.deacon@arm.com>
+
+commit 8e4e0ac02b449297b86498ac24db5786ddd9f647 upstream.
+
+Returning an error code from futex_atomic_cmpxchg_inatomic() indicates
+that the caller should not make any use of *uval, and should instead act
+upon the value of the error code. Although this is implemented
+correctly in our futex code, we needlessly copy uninitialised stack to
+*uval in the error case, which can easily be avoided.
+
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm64/include/asm/futex.h |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/arch/arm64/include/asm/futex.h
++++ b/arch/arm64/include/asm/futex.h
+@@ -134,7 +134,9 @@ futex_atomic_cmpxchg_inatomic(u32 *uval,
+       : "memory");
+       uaccess_disable();
+-      *uval = val;
++      if (!ret)
++              *uval = val;
++
+       return ret;
+ }
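
The calling contract spelled out above can be sketched from the caller's side
roughly as follows (illustrative only; the wrapper function below is
hypothetical and not part of the patch):

  /* Only read the old value when the arch helper reports success. */
  static int cmpxchg_futex_word(u32 __user *uaddr, u32 oldval, u32 newval)
  {
          u32 cur;
          int ret = futex_atomic_cmpxchg_inatomic(&cur, uaddr, oldval, newval);

          if (ret)                /* -EFAULT/-EAGAIN: cur was never written */
                  return ret;
          if (cur != oldval)      /* lost the race against another task */
                  return -EAGAIN;
          return 0;
  }
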
diff --git a/queue-4.19/bpf-arm64-use-more-scalable-stadd-over-ldxr-stxr-loop-in-xadd.patch b/queue-4.19/bpf-arm64-use-more-scalable-stadd-over-ldxr-stxr-loop-in-xadd.patch
new file mode 100644 (file)
index 0000000..5ce84c4
--- /dev/null
@@ -0,0 +1,165 @@
+From 34b8ab091f9ef57a2bb3c8c8359a0a03a8abf2f9 Mon Sep 17 00:00:00 2001
+From: Daniel Borkmann <daniel@iogearbox.net>
+Date: Fri, 26 Apr 2019 21:48:22 +0200
+Subject: bpf, arm64: use more scalable stadd over ldxr / stxr loop in xadd
+
+From: Daniel Borkmann <daniel@iogearbox.net>
+
+commit 34b8ab091f9ef57a2bb3c8c8359a0a03a8abf2f9 upstream.
+
+Since the ARMv8.1 supplement introduced LSE atomic instructions back in 2016,
+let's add support for STADD and use that in favor of the LDXR / STXR loop for
+the XADD mapping if available. STADD is encoded as an alias for LDADD with
+XZR as the destination register, therefore add LDADD to the instruction
+encoder along with STADD as special case and use it in the JIT for CPUs
+that advertise LSE atomics in CPUID register. If immediate offset in the
+BPF XADD insn is 0, then use dst register directly instead of temporary
+one.
+
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Acked-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
+Acked-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm64/include/asm/insn.h |    8 ++++++++
+ arch/arm64/kernel/insn.c      |   40 ++++++++++++++++++++++++++++++++++++++++
+ arch/arm64/net/bpf_jit.h      |    4 ++++
+ arch/arm64/net/bpf_jit_comp.c |   28 +++++++++++++++++++---------
+ 4 files changed, 71 insertions(+), 9 deletions(-)
+
+--- a/arch/arm64/include/asm/insn.h
++++ b/arch/arm64/include/asm/insn.h
+@@ -272,6 +272,7 @@ __AARCH64_INSN_FUNCS(adrp, 0x9F000000, 0
+ __AARCH64_INSN_FUNCS(prfm,    0x3FC00000, 0x39800000)
+ __AARCH64_INSN_FUNCS(prfm_lit,        0xFF000000, 0xD8000000)
+ __AARCH64_INSN_FUNCS(str_reg, 0x3FE0EC00, 0x38206800)
++__AARCH64_INSN_FUNCS(ldadd,   0x3F20FC00, 0xB8200000)
+ __AARCH64_INSN_FUNCS(ldr_reg, 0x3FE0EC00, 0x38606800)
+ __AARCH64_INSN_FUNCS(ldr_lit, 0xBF000000, 0x18000000)
+ __AARCH64_INSN_FUNCS(ldrsw_lit,       0xFF000000, 0x98000000)
+@@ -389,6 +390,13 @@ u32 aarch64_insn_gen_load_store_ex(enum
+                                  enum aarch64_insn_register state,
+                                  enum aarch64_insn_size_type size,
+                                  enum aarch64_insn_ldst_type type);
++u32 aarch64_insn_gen_ldadd(enum aarch64_insn_register result,
++                         enum aarch64_insn_register address,
++                         enum aarch64_insn_register value,
++                         enum aarch64_insn_size_type size);
++u32 aarch64_insn_gen_stadd(enum aarch64_insn_register address,
++                         enum aarch64_insn_register value,
++                         enum aarch64_insn_size_type size);
+ u32 aarch64_insn_gen_add_sub_imm(enum aarch64_insn_register dst,
+                                enum aarch64_insn_register src,
+                                int imm, enum aarch64_insn_variant variant,
+--- a/arch/arm64/kernel/insn.c
++++ b/arch/arm64/kernel/insn.c
+@@ -734,6 +734,46 @@ u32 aarch64_insn_gen_load_store_ex(enum
+                                           state);
+ }
++u32 aarch64_insn_gen_ldadd(enum aarch64_insn_register result,
++                         enum aarch64_insn_register address,
++                         enum aarch64_insn_register value,
++                         enum aarch64_insn_size_type size)
++{
++      u32 insn = aarch64_insn_get_ldadd_value();
++
++      switch (size) {
++      case AARCH64_INSN_SIZE_32:
++      case AARCH64_INSN_SIZE_64:
++              break;
++      default:
++              pr_err("%s: unimplemented size encoding %d\n", __func__, size);
++              return AARCH64_BREAK_FAULT;
++      }
++
++      insn = aarch64_insn_encode_ldst_size(size, insn);
++
++      insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn,
++                                          result);
++
++      insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn,
++                                          address);
++
++      return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RS, insn,
++                                          value);
++}
++
++u32 aarch64_insn_gen_stadd(enum aarch64_insn_register address,
++                         enum aarch64_insn_register value,
++                         enum aarch64_insn_size_type size)
++{
++      /*
++       * STADD is simply encoded as an alias for LDADD with XZR as
++       * the destination register.
++       */
++      return aarch64_insn_gen_ldadd(AARCH64_INSN_REG_ZR, address,
++                                    value, size);
++}
++
+ static u32 aarch64_insn_encode_prfm_imm(enum aarch64_insn_prfm_type type,
+                                       enum aarch64_insn_prfm_target target,
+                                       enum aarch64_insn_prfm_policy policy,
+--- a/arch/arm64/net/bpf_jit.h
++++ b/arch/arm64/net/bpf_jit.h
+@@ -100,6 +100,10 @@
+ #define A64_STXR(sf, Rt, Rn, Rs) \
+       A64_LSX(sf, Rt, Rn, Rs, STORE_EX)
++/* LSE atomics */
++#define A64_STADD(sf, Rn, Rs) \
++      aarch64_insn_gen_stadd(Rn, Rs, A64_SIZE(sf))
++
+ /* Add/subtract (immediate) */
+ #define A64_ADDSUB_IMM(sf, Rd, Rn, imm12, type) \
+       aarch64_insn_gen_add_sub_imm(Rd, Rn, imm12, \
+--- a/arch/arm64/net/bpf_jit_comp.c
++++ b/arch/arm64/net/bpf_jit_comp.c
+@@ -364,7 +364,7 @@ static int build_insn(const struct bpf_i
+       const int i = insn - ctx->prog->insnsi;
+       const bool is64 = BPF_CLASS(code) == BPF_ALU64;
+       const bool isdw = BPF_SIZE(code) == BPF_DW;
+-      u8 jmp_cond;
++      u8 jmp_cond, reg;
+       s32 jmp_offset;
+ #define check_imm(bits, imm) do {                             \
+@@ -730,18 +730,28 @@ emit_cond_jmp:
+                       break;
+               }
+               break;
++
+       /* STX XADD: lock *(u32 *)(dst + off) += src */
+       case BPF_STX | BPF_XADD | BPF_W:
+       /* STX XADD: lock *(u64 *)(dst + off) += src */
+       case BPF_STX | BPF_XADD | BPF_DW:
+-              emit_a64_mov_i(1, tmp, off, ctx);
+-              emit(A64_ADD(1, tmp, tmp, dst), ctx);
+-              emit(A64_LDXR(isdw, tmp2, tmp), ctx);
+-              emit(A64_ADD(isdw, tmp2, tmp2, src), ctx);
+-              emit(A64_STXR(isdw, tmp2, tmp, tmp3), ctx);
+-              jmp_offset = -3;
+-              check_imm19(jmp_offset);
+-              emit(A64_CBNZ(0, tmp3, jmp_offset), ctx);
++              if (!off) {
++                      reg = dst;
++              } else {
++                      emit_a64_mov_i(1, tmp, off, ctx);
++                      emit(A64_ADD(1, tmp, tmp, dst), ctx);
++                      reg = tmp;
++              }
++              if (cpus_have_cap(ARM64_HAS_LSE_ATOMICS)) {
++                      emit(A64_STADD(isdw, reg, src), ctx);
++              } else {
++                      emit(A64_LDXR(isdw, tmp2, reg), ctx);
++                      emit(A64_ADD(isdw, tmp2, tmp2, src), ctx);
++                      emit(A64_STXR(isdw, tmp2, reg, tmp3), ctx);
++                      jmp_offset = -3;
++                      check_imm19(jmp_offset);
++                      emit(A64_CBNZ(0, tmp3, jmp_offset), ctx);
++              }
+               break;
+       default:
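
The effect of the new instruction selection can be reproduced with plain
compiler atomics; a small user-space sketch (illustrative only; the exact
output depends on compiler version and outline-atomics settings):

  #include <stdint.h>

  /* Built with -march=armv8-a this lowers to an LDXR/STXR retry loop; with
   * -march=armv8.1-a (LSE) and the result discarded it lowers to a single
   * LSE atomic add, i.e. STADD (LDADD with XZR as the destination register),
   * which is the same trade-off the JIT now makes for BPF_XADD. */
  void xadd64(uint64_t *p, uint64_t v)
  {
          __atomic_fetch_add(p, v, __ATOMIC_RELAXED);
  }
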
diff --git a/queue-4.19/bpf-fix-nested-bpf-tracepoints-with-per-cpu-data.patch b/queue-4.19/bpf-fix-nested-bpf-tracepoints-with-per-cpu-data.patch
new file mode 100644 (file)
index 0000000..36aacf5
--- /dev/null
@@ -0,0 +1,191 @@
+From 9594dc3c7e71b9f52bee1d7852eb3d4e3aea9e99 Mon Sep 17 00:00:00 2001
+From: Matt Mullins <mmullins@fb.com>
+Date: Tue, 11 Jun 2019 14:53:04 -0700
+Subject: bpf: fix nested bpf tracepoints with per-cpu data
+
+From: Matt Mullins <mmullins@fb.com>
+
+commit 9594dc3c7e71b9f52bee1d7852eb3d4e3aea9e99 upstream.
+
+BPF_PROG_TYPE_RAW_TRACEPOINTs can be executed nested on the same CPU, as
+they do not increment bpf_prog_active while executing.
+
+This enables three levels of nesting, to support
+  - a kprobe or raw tp or perf event,
+  - another one of the above that irq context happens to call, and
+  - another one in nmi context
+(at most one of which may be a kprobe or perf event).
+
+Fixes: 20b9d7ac4852 ("bpf: avoid excessive stack usage for perf_sample_data")
+Signed-off-by: Matt Mullins <mmullins@fb.com>
+Acked-by: Andrii Nakryiko <andriin@fb.com>
+Acked-by: Daniel Borkmann <daniel@iogearbox.net>
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/trace/bpf_trace.c |  100 +++++++++++++++++++++++++++++++++++++++--------
+ 1 file changed, 84 insertions(+), 16 deletions(-)
+
+--- a/kernel/trace/bpf_trace.c
++++ b/kernel/trace/bpf_trace.c
+@@ -365,8 +365,6 @@ static const struct bpf_func_proto bpf_p
+       .arg4_type      = ARG_CONST_SIZE,
+ };
+-static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd);
+-
+ static __always_inline u64
+ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
+                       u64 flags, struct perf_sample_data *sd)
+@@ -398,24 +396,50 @@ __bpf_perf_event_output(struct pt_regs *
+       return 0;
+ }
++/*
++ * Support executing tracepoints in normal, irq, and nmi context that each call
++ * bpf_perf_event_output
++ */
++struct bpf_trace_sample_data {
++      struct perf_sample_data sds[3];
++};
++
++static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds);
++static DEFINE_PER_CPU(int, bpf_trace_nest_level);
+ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
+          u64, flags, void *, data, u64, size)
+ {
+-      struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd);
++      struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds);
++      int nest_level = this_cpu_inc_return(bpf_trace_nest_level);
+       struct perf_raw_record raw = {
+               .frag = {
+                       .size = size,
+                       .data = data,
+               },
+       };
++      struct perf_sample_data *sd;
++      int err;
+-      if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
+-              return -EINVAL;
++      if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) {
++              err = -EBUSY;
++              goto out;
++      }
++
++      sd = &sds->sds[nest_level - 1];
++
++      if (unlikely(flags & ~(BPF_F_INDEX_MASK))) {
++              err = -EINVAL;
++              goto out;
++      }
+       perf_sample_data_init(sd, 0, 0);
+       sd->raw = &raw;
+-      return __bpf_perf_event_output(regs, map, flags, sd);
++      err = __bpf_perf_event_output(regs, map, flags, sd);
++
++out:
++      this_cpu_dec(bpf_trace_nest_level);
++      return err;
+ }
+ static const struct bpf_func_proto bpf_perf_event_output_proto = {
+@@ -772,16 +796,48 @@ pe_prog_func_proto(enum bpf_func_id func
+ /*
+  * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
+  * to avoid potential recursive reuse issue when/if tracepoints are added
+- * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack
++ * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack.
++ *
++ * Since raw tracepoints run despite bpf_prog_active, support concurrent usage
++ * in normal, irq, and nmi context.
+  */
+-static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs);
++struct bpf_raw_tp_regs {
++      struct pt_regs regs[3];
++};
++static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs);
++static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level);
++static struct pt_regs *get_bpf_raw_tp_regs(void)
++{
++      struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs);
++      int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level);
++
++      if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) {
++              this_cpu_dec(bpf_raw_tp_nest_level);
++              return ERR_PTR(-EBUSY);
++      }
++
++      return &tp_regs->regs[nest_level - 1];
++}
++
++static void put_bpf_raw_tp_regs(void)
++{
++      this_cpu_dec(bpf_raw_tp_nest_level);
++}
++
+ BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
+          struct bpf_map *, map, u64, flags, void *, data, u64, size)
+ {
+-      struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
++      struct pt_regs *regs = get_bpf_raw_tp_regs();
++      int ret;
++
++      if (IS_ERR(regs))
++              return PTR_ERR(regs);
+       perf_fetch_caller_regs(regs);
+-      return ____bpf_perf_event_output(regs, map, flags, data, size);
++      ret = ____bpf_perf_event_output(regs, map, flags, data, size);
++
++      put_bpf_raw_tp_regs();
++      return ret;
+ }
+ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
+@@ -798,12 +854,18 @@ static const struct bpf_func_proto bpf_p
+ BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
+          struct bpf_map *, map, u64, flags)
+ {
+-      struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
++      struct pt_regs *regs = get_bpf_raw_tp_regs();
++      int ret;
++
++      if (IS_ERR(regs))
++              return PTR_ERR(regs);
+       perf_fetch_caller_regs(regs);
+       /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */
+-      return bpf_get_stackid((unsigned long) regs, (unsigned long) map,
+-                             flags, 0, 0);
++      ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map,
++                            flags, 0, 0);
++      put_bpf_raw_tp_regs();
++      return ret;
+ }
+ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
+@@ -818,11 +880,17 @@ static const struct bpf_func_proto bpf_g
+ BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
+          void *, buf, u32, size, u64, flags)
+ {
+-      struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs);
++      struct pt_regs *regs = get_bpf_raw_tp_regs();
++      int ret;
++
++      if (IS_ERR(regs))
++              return PTR_ERR(regs);
+       perf_fetch_caller_regs(regs);
+-      return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
+-                           (unsigned long) size, flags, 0);
++      ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf,
++                          (unsigned long) size, flags, 0);
++      put_bpf_raw_tp_regs();
++      return ret;
+ }
+ static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
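
Condensed into one place, the per-CPU nesting scheme that the patch applies to
both the sample-data and pt_regs buffers follows this pattern (sketch distilled
from the hunks above; names shortened and error handling abridged):

  struct bufs { struct perf_sample_data sds[3]; };   /* task, irq, nmi */
  static DEFINE_PER_CPU(struct bufs, bufs);
  static DEFINE_PER_CPU(int, nest_level);

  /* per call: */
  struct bufs *b = this_cpu_ptr(&bufs);
  int level = this_cpu_inc_return(nest_level);

  if (WARN_ON_ONCE(level > ARRAY_SIZE(b->sds))) {
          err = -EBUSY;                      /* nested deeper than supported */
          goto out;
  }
  sd = &b->sds[level - 1];                   /* private slot for this level */
  /* ... fill and emit sd ... */
  out:
          this_cpu_dec(nest_level);          /* always undo the increment */
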
diff --git a/queue-4.19/bpf-fix-unconnected-udp-hooks.patch b/queue-4.19/bpf-fix-unconnected-udp-hooks.patch
new file mode 100644 (file)
index 0000000..7b98746
--- /dev/null
@@ -0,0 +1,308 @@
+From 983695fa676568fc0fe5ddd995c7267aabc24632 Mon Sep 17 00:00:00 2001
+From: Daniel Borkmann <daniel@iogearbox.net>
+Date: Fri, 7 Jun 2019 01:48:57 +0200
+Subject: bpf: fix unconnected udp hooks
+
+From: Daniel Borkmann <daniel@iogearbox.net>
+
+commit 983695fa676568fc0fe5ddd995c7267aabc24632 upstream.
+
+Intention of cgroup bind/connect/sendmsg BPF hooks is to act transparently
+to applications as also stated in original motivation in 7828f20e3779 ("Merge
+branch 'bpf-cgroup-bind-connect'"). When recently integrating the latter
+two hooks into Cilium to enable host based load-balancing with Kubernetes,
+I ran into the issue that pods couldn't start up as DNS got broken. Kubernetes
+typically sets up DNS as a service and is thus subject to load-balancing.
+
+Upon further debugging, it turns out that the cgroupv2 sendmsg BPF hooks API
+is currently insufficient and thus not usable as-is for standard applications
+shipped with most distros. To break down the issue we ran into with a simple
+example:
+
+  # cat /etc/resolv.conf
+  nameserver 147.75.207.207
+  nameserver 147.75.207.208
+
+For the purpose of a simple test, we set up above IPs as service IPs and
+transparently redirect traffic to a different DNS backend server for that
+node:
+
+  # cilium service list
+  ID   Frontend            Backend
+  1    147.75.207.207:53   1 => 8.8.8.8:53
+  2    147.75.207.208:53   1 => 8.8.8.8:53
+
+The attached BPF program is basically selecting one of the backends if the
+service IP/port matches on the cgroup hook. DNS breaks here, because the
+hooks are not transparent enough to applications which have built-in msg_name
+address checks:
+
+  # nslookup 1.1.1.1
+  ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53
+  ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.208#53
+  ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53
+  [...]
+  ;; connection timed out; no servers could be reached
+
+  # dig 1.1.1.1
+  ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53
+  ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.208#53
+  ;; reply from unexpected source: 8.8.8.8#53, expected 147.75.207.207#53
+  [...]
+
+  ; <<>> DiG 9.11.3-1ubuntu1.7-Ubuntu <<>> 1.1.1.1
+  ;; global options: +cmd
+  ;; connection timed out; no servers could be reached
+
+For comparison, if none of the service IPs is used, and we tell nslookup
+to use 8.8.8.8 directly it works just fine, of course:
+
+  # nslookup 1.1.1.1 8.8.8.8
+  1.1.1.1.in-addr.arpa name = one.one.one.one.
+
+In order to fix this and thus act more transparent to the application,
+this needs reverse translation on recvmsg() side. A minimal fix for this
+API is to add similar recvmsg() hooks behind the BPF cgroups static key
+such that the program can track state and replace the current sockaddr_in{,6}
+with the original service IP. From BPF side, this basically tracks the
+service tuple plus socket cookie in an LRU map where the reverse NAT can
+then be retrieved via map value as one example. Side-note: the BPF cgroups
+static key should be converted to a per-hook static key in future.
+
+Same example after this fix:
+
+  # cilium service list
+  ID   Frontend            Backend
+  1    147.75.207.207:53   1 => 8.8.8.8:53
+  2    147.75.207.208:53   1 => 8.8.8.8:53
+
+Lookups work fine now:
+
+  # nslookup 1.1.1.1
+  1.1.1.1.in-addr.arpa    name = one.one.one.one.
+
+  Authoritative answers can be found from:
+
+  # dig 1.1.1.1
+
+  ; <<>> DiG 9.11.3-1ubuntu1.7-Ubuntu <<>> 1.1.1.1
+  ;; global options: +cmd
+  ;; Got answer:
+  ;; ->>HEADER<<- opcode: QUERY, status: NXDOMAIN, id: 51550
+  ;; flags: qr rd ra ad; QUERY: 1, ANSWER: 0, AUTHORITY: 1, ADDITIONAL: 1
+
+  ;; OPT PSEUDOSECTION:
+  ; EDNS: version: 0, flags:; udp: 512
+  ;; QUESTION SECTION:
+  ;1.1.1.1.                       IN      A
+
+  ;; AUTHORITY SECTION:
+  .                       23426   IN      SOA     a.root-servers.net. nstld.verisign-grs.com. 2019052001 1800 900 604800 86400
+
+  ;; Query time: 17 msec
+  ;; SERVER: 147.75.207.207#53(147.75.207.207)
+  ;; WHEN: Tue May 21 12:59:38 UTC 2019
+  ;; MSG SIZE  rcvd: 111
+
+And from an actual packet level it shows that we're using the back end
+server when talking via 147.75.207.20{7,8} front end:
+
+  # tcpdump -i any udp
+  [...]
+  12:59:52.698732 IP foo.42011 > google-public-dns-a.google.com.domain: 18803+ PTR? 1.1.1.1.in-addr.arpa. (38)
+  12:59:52.698735 IP foo.42011 > google-public-dns-a.google.com.domain: 18803+ PTR? 1.1.1.1.in-addr.arpa. (38)
+  12:59:52.701208 IP google-public-dns-a.google.com.domain > foo.42011: 18803 1/0/0 PTR one.one.one.one. (67)
+  12:59:52.701208 IP google-public-dns-a.google.com.domain > foo.42011: 18803 1/0/0 PTR one.one.one.one. (67)
+  [...]
+
+In order to be flexible and to have same semantics as in sendmsg BPF
+programs, we only allow return codes in [1,1] range. In the sendmsg case
+the program is called if msg->msg_name is present which can be the case
+in both, connected and unconnected UDP.
+
+The former only relies on the sockaddr_in{,6} passed via connect(2) if
+the passed msg->msg_name was NULL. Therefore, on the recvmsg side, we act in a
+similar way and call into the BPF program whenever a non-NULL msg->msg_name was
+passed independent of sk->sk_state being TCP_ESTABLISHED or not. Note
+that for TCP case, the msg->msg_name is ignored in the regular recvmsg
+path and therefore not relevant.
+
+For the case of ip{,v6}_recv_error() paths, picked up via MSG_ERRQUEUE,
+the hook is not called. This is intentional as it aligns with the same
+semantics as in case of TCP cgroup BPF hooks right now. This might be
+better addressed in future through a different bpf_attach_type such
+that this case can be distinguished from the regular recvmsg paths,
+for example.
+
+Fixes: 1cedee13d25a ("bpf: Hooks for sys_sendmsg")
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Acked-by: Andrey Ignatov <rdna@fb.com>
+Acked-by: Martin KaFai Lau <kafai@fb.com>
+Acked-by: Martynas Pumputis <m@lambda.lt>
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/bpf-cgroup.h |    8 ++++++++
+ include/uapi/linux/bpf.h   |    2 ++
+ kernel/bpf/syscall.c       |    8 ++++++++
+ kernel/bpf/verifier.c      |   12 ++++++++----
+ net/core/filter.c          |    2 ++
+ net/ipv4/udp.c             |    4 ++++
+ net/ipv6/udp.c             |    4 ++++
+ 7 files changed, 36 insertions(+), 4 deletions(-)
+
+--- a/include/linux/bpf-cgroup.h
++++ b/include/linux/bpf-cgroup.h
+@@ -210,6 +210,12 @@ void bpf_cgroup_storage_release(struct b
+ #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx)                      \
+       BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_SENDMSG, t_ctx)
++#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr)                      \
++      BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_RECVMSG, NULL)
++
++#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr)                      \
++      BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_RECVMSG, NULL)
++
+ #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops)                                       \
+ ({                                                                           \
+       int __ret = 0;                                                         \
+@@ -290,6 +296,8 @@ static inline void bpf_cgroup_storage_fr
+ #define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; })
+ #define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; })
+ #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; })
++#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) ({ 0; })
++#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; })
+ #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
+ #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
+--- a/include/uapi/linux/bpf.h
++++ b/include/uapi/linux/bpf.h
+@@ -172,6 +172,8 @@ enum bpf_attach_type {
+       BPF_CGROUP_UDP4_SENDMSG,
+       BPF_CGROUP_UDP6_SENDMSG,
+       BPF_LIRC_MODE2,
++      BPF_CGROUP_UDP4_RECVMSG = 19,
++      BPF_CGROUP_UDP6_RECVMSG,
+       __MAX_BPF_ATTACH_TYPE
+ };
+--- a/kernel/bpf/syscall.c
++++ b/kernel/bpf/syscall.c
+@@ -1342,6 +1342,8 @@ bpf_prog_load_check_attach_type(enum bpf
+               case BPF_CGROUP_INET6_CONNECT:
+               case BPF_CGROUP_UDP4_SENDMSG:
+               case BPF_CGROUP_UDP6_SENDMSG:
++              case BPF_CGROUP_UDP4_RECVMSG:
++              case BPF_CGROUP_UDP6_RECVMSG:
+                       return 0;
+               default:
+                       return -EINVAL;
+@@ -1622,6 +1624,8 @@ static int bpf_prog_attach(const union b
+       case BPF_CGROUP_INET6_CONNECT:
+       case BPF_CGROUP_UDP4_SENDMSG:
+       case BPF_CGROUP_UDP6_SENDMSG:
++      case BPF_CGROUP_UDP4_RECVMSG:
++      case BPF_CGROUP_UDP6_RECVMSG:
+               ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
+               break;
+       case BPF_CGROUP_SOCK_OPS:
+@@ -1698,6 +1702,8 @@ static int bpf_prog_detach(const union b
+       case BPF_CGROUP_INET6_CONNECT:
+       case BPF_CGROUP_UDP4_SENDMSG:
+       case BPF_CGROUP_UDP6_SENDMSG:
++      case BPF_CGROUP_UDP4_RECVMSG:
++      case BPF_CGROUP_UDP6_RECVMSG:
+               ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
+               break;
+       case BPF_CGROUP_SOCK_OPS:
+@@ -1744,6 +1750,8 @@ static int bpf_prog_query(const union bp
+       case BPF_CGROUP_INET6_CONNECT:
+       case BPF_CGROUP_UDP4_SENDMSG:
+       case BPF_CGROUP_UDP6_SENDMSG:
++      case BPF_CGROUP_UDP4_RECVMSG:
++      case BPF_CGROUP_UDP6_RECVMSG:
+       case BPF_CGROUP_SOCK_OPS:
+       case BPF_CGROUP_DEVICE:
+               break;
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -4342,9 +4342,12 @@ static int check_return_code(struct bpf_
+       struct tnum range = tnum_range(0, 1);
+       switch (env->prog->type) {
++      case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
++              if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG ||
++                  env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG)
++                      range = tnum_range(1, 1);
+       case BPF_PROG_TYPE_CGROUP_SKB:
+       case BPF_PROG_TYPE_CGROUP_SOCK:
+-      case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+       case BPF_PROG_TYPE_SOCK_OPS:
+       case BPF_PROG_TYPE_CGROUP_DEVICE:
+               break;
+@@ -4360,16 +4363,17 @@ static int check_return_code(struct bpf_
+       }
+       if (!tnum_in(range, reg->var_off)) {
++              char tn_buf[48];
++
+               verbose(env, "At program exit the register R0 ");
+               if (!tnum_is_unknown(reg->var_off)) {
+-                      char tn_buf[48];
+-
+                       tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+                       verbose(env, "has value %s", tn_buf);
+               } else {
+                       verbose(env, "has unknown scalar value");
+               }
+-              verbose(env, " should have been 0 or 1\n");
++              tnum_strn(tn_buf, sizeof(tn_buf), range);
++              verbose(env, " should have been in %s\n", tn_buf);
+               return -EINVAL;
+       }
+       return 0;
+--- a/net/core/filter.c
++++ b/net/core/filter.c
+@@ -5558,6 +5558,7 @@ static bool sock_addr_is_valid_access(in
+               case BPF_CGROUP_INET4_BIND:
+               case BPF_CGROUP_INET4_CONNECT:
+               case BPF_CGROUP_UDP4_SENDMSG:
++              case BPF_CGROUP_UDP4_RECVMSG:
+                       break;
+               default:
+                       return false;
+@@ -5568,6 +5569,7 @@ static bool sock_addr_is_valid_access(in
+               case BPF_CGROUP_INET6_BIND:
+               case BPF_CGROUP_INET6_CONNECT:
+               case BPF_CGROUP_UDP6_SENDMSG:
++              case BPF_CGROUP_UDP6_RECVMSG:
+                       break;
+               default:
+                       return false;
+--- a/net/ipv4/udp.c
++++ b/net/ipv4/udp.c
+@@ -1720,6 +1720,10 @@ try_again:
+               sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
+               memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+               *addr_len = sizeof(*sin);
++
++              if (cgroup_bpf_enabled)
++                      BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk,
++                                                      (struct sockaddr *)sin);
+       }
+       if (inet->cmsg_flags)
+               ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off);
+--- a/net/ipv6/udp.c
++++ b/net/ipv6/udp.c
+@@ -419,6 +419,10 @@ try_again:
+                                                   inet6_iif(skb));
+               }
+               *addr_len = sizeof(*sin6);
++
++              if (cgroup_bpf_enabled)
++                      BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk,
++                                              (struct sockaddr *)sin6);
+       }
+       if (np->rxopt.all)
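
A minimal cgroup recvmsg4 program doing the reverse translation described in
the message above might look like this (sketch only: the map layout and all
identifiers are illustrative, and real implementations of the scheme described
above also key on the socket cookie):

  #include <linux/bpf.h>
  #include "bpf_helpers.h"   /* SEC(), bpf_map_def, helpers; selftests copy */

  struct svc_addr {
          __u32 ip;          /* network byte order, as in ctx->user_ip4  */
          __u32 port;        /* network byte order, as in ctx->user_port */
  };

  /* backend address seen on the wire -> original service (frontend) address */
  struct bpf_map_def SEC("maps") revnat_map = {
          .type        = BPF_MAP_TYPE_LRU_HASH,
          .key_size    = sizeof(struct svc_addr),
          .value_size  = sizeof(struct svc_addr),
          .max_entries = 65536,
  };

  SEC("cgroup/recvmsg4")
  int revnat_recvmsg4(struct bpf_sock_addr *ctx)
  {
          struct svc_addr key = {
                  .ip   = ctx->user_ip4,
                  .port = ctx->user_port,
          };
          struct svc_addr *svc = bpf_map_lookup_elem(&revnat_map, &key);

          if (svc) {
                  /* hand the application back the service address it used */
                  ctx->user_ip4  = svc->ip;
                  ctx->user_port = svc->port;
          }
          return 1;          /* recvmsg hooks must return 1 (range [1,1]) */
  }

  char _license[] SEC("license") = "GPL";
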
diff --git a/queue-4.19/bpf-lpm_trie-check-left-child-of-last-leftmost-node-for-null.patch b/queue-4.19/bpf-lpm_trie-check-left-child-of-last-leftmost-node-for-null.patch
new file mode 100644 (file)
index 0000000..1dd6082
--- /dev/null
@@ -0,0 +1,127 @@
+From da2577fdd0932ea4eefe73903f1130ee366767d2 Mon Sep 17 00:00:00 2001
+From: Jonathan Lemon <jonathan.lemon@gmail.com>
+Date: Sat, 8 Jun 2019 12:54:19 -0700
+Subject: bpf: lpm_trie: check left child of last leftmost node for NULL
+
+From: Jonathan Lemon <jonathan.lemon@gmail.com>
+
+commit da2577fdd0932ea4eefe73903f1130ee366767d2 upstream.
+
+If the leftmost parent node of the tree does not have a child
+on the left side, then trie_get_next_key (and bpftool map dump) will
+not look at the child on the right.  This leads to the traversal
+missing elements.
+
+Lookup is not affected.
+
+Update selftest to handle this case.
+
+Reproducer:
+
+ bpftool map create /sys/fs/bpf/lpm type lpm_trie key 6 \
+     value 1 entries 256 name test_lpm flags 1
+ bpftool map update pinned /sys/fs/bpf/lpm key  8 0 0 0  0   0 value 1
+ bpftool map update pinned /sys/fs/bpf/lpm key 16 0 0 0  0 128 value 2
+ bpftool map dump   pinned /sys/fs/bpf/lpm
+
+Returns only 1 element. (2 expected)
+
+Fixes: b471f2f1de8b ("bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE")
+Signed-off-by: Jonathan Lemon <jonathan.lemon@gmail.com>
+Acked-by: Martin KaFai Lau <kafai@fb.com>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/bpf/lpm_trie.c                      |    9 ++++--
+ tools/testing/selftests/bpf/test_lpm_map.c |   41 ++++++++++++++++++++++++++---
+ 2 files changed, 45 insertions(+), 5 deletions(-)
+
+--- a/kernel/bpf/lpm_trie.c
++++ b/kernel/bpf/lpm_trie.c
+@@ -676,9 +676,14 @@ find_leftmost:
+        * have exact two children, so this function will never return NULL.
+        */
+       for (node = search_root; node;) {
+-              if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
++              if (node->flags & LPM_TREE_NODE_FLAG_IM) {
++                      node = rcu_dereference(node->child[0]);
++              } else {
+                       next_node = node;
+-              node = rcu_dereference(node->child[0]);
++                      node = rcu_dereference(node->child[0]);
++                      if (!node)
++                              node = rcu_dereference(next_node->child[1]);
++              }
+       }
+ do_copy:
+       next_key->prefixlen = next_node->prefixlen;
+--- a/tools/testing/selftests/bpf/test_lpm_map.c
++++ b/tools/testing/selftests/bpf/test_lpm_map.c
+@@ -573,13 +573,13 @@ static void test_lpm_get_next_key(void)
+       /* add one more element (total two) */
+       key_p->prefixlen = 24;
+-      inet_pton(AF_INET, "192.168.0.0", key_p->data);
++      inet_pton(AF_INET, "192.168.128.0", key_p->data);
+       assert(bpf_map_update_elem(map_fd, key_p, &value, 0) == 0);
+       memset(key_p, 0, key_size);
+       assert(bpf_map_get_next_key(map_fd, NULL, key_p) == 0);
+       assert(key_p->prefixlen == 24 && key_p->data[0] == 192 &&
+-             key_p->data[1] == 168 && key_p->data[2] == 0);
++             key_p->data[1] == 168 && key_p->data[2] == 128);
+       memset(next_key_p, 0, key_size);
+       assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0);
+@@ -592,7 +592,7 @@ static void test_lpm_get_next_key(void)
+       /* Add one more element (total three) */
+       key_p->prefixlen = 24;
+-      inet_pton(AF_INET, "192.168.128.0", key_p->data);
++      inet_pton(AF_INET, "192.168.0.0", key_p->data);
+       assert(bpf_map_update_elem(map_fd, key_p, &value, 0) == 0);
+       memset(key_p, 0, key_size);
+@@ -628,6 +628,41 @@ static void test_lpm_get_next_key(void)
+       assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0);
+       assert(next_key_p->prefixlen == 24 && next_key_p->data[0] == 192 &&
+              next_key_p->data[1] == 168 && next_key_p->data[2] == 1);
++
++      memcpy(key_p, next_key_p, key_size);
++      assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0);
++      assert(next_key_p->prefixlen == 24 && next_key_p->data[0] == 192 &&
++             next_key_p->data[1] == 168 && next_key_p->data[2] == 128);
++
++      memcpy(key_p, next_key_p, key_size);
++      assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0);
++      assert(next_key_p->prefixlen == 16 && next_key_p->data[0] == 192 &&
++             next_key_p->data[1] == 168);
++
++      memcpy(key_p, next_key_p, key_size);
++      assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == -1 &&
++             errno == ENOENT);
++
++      /* Add one more element (total five) */
++      key_p->prefixlen = 28;
++      inet_pton(AF_INET, "192.168.1.128", key_p->data);
++      assert(bpf_map_update_elem(map_fd, key_p, &value, 0) == 0);
++
++      memset(key_p, 0, key_size);
++      assert(bpf_map_get_next_key(map_fd, NULL, key_p) == 0);
++      assert(key_p->prefixlen == 24 && key_p->data[0] == 192 &&
++             key_p->data[1] == 168 && key_p->data[2] == 0);
++
++      memset(next_key_p, 0, key_size);
++      assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0);
++      assert(next_key_p->prefixlen == 28 && next_key_p->data[0] == 192 &&
++             next_key_p->data[1] == 168 && next_key_p->data[2] == 1 &&
++             next_key_p->data[3] == 128);
++
++      memcpy(key_p, next_key_p, key_size);
++      assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0);
++      assert(next_key_p->prefixlen == 24 && next_key_p->data[0] == 192 &&
++             next_key_p->data[1] == 168 && next_key_p->data[2] == 1);
+       memcpy(key_p, next_key_p, key_size);
+       assert(bpf_map_get_next_key(map_fd, key_p, next_key_p) == 0);
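
The walk that was skipping entries is the one driven by BPF_MAP_GET_NEXT_KEY,
which is what bpftool map dump uses. From user space it looks roughly like
this (sketch; the key layout assumes an IPv4 trie as in the selftest, and the
identifiers are illustrative):

  #include <errno.h>
  #include <linux/types.h>
  #include <bpf/bpf.h>                 /* bpf_map_get_next_key() from libbpf */

  struct lpm_key {
          __u32 prefixlen;
          __u8  data[4];               /* IPv4 address bytes */
  };

  static void dump_lpm(int map_fd)
  {
          struct lpm_key cur, next;
          /* A NULL key asks for the first (leftmost) entry of the trie. */
          int err = bpf_map_get_next_key(map_fd, NULL, &next);

          while (!err) {
                  /* ... print next.prefixlen and next.data ... */
                  cur = next;
                  err = bpf_map_get_next_key(map_fd, &cur, &next);
          }
          /* err with errno == ENOENT marks the end of the walk; before this
           * fix the walk could stop early and miss entries. */
  }
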
diff --git a/queue-4.19/bpf-simplify-definition-of-bpf_fib_lookup-related-flags.patch b/queue-4.19/bpf-simplify-definition-of-bpf_fib_lookup-related-flags.patch
new file mode 100644 (file)
index 0000000..f01d590
--- /dev/null
@@ -0,0 +1,40 @@
+From b1d6c15b9d824a58c5415673f374fac19e8eccdf Mon Sep 17 00:00:00 2001
+From: Martynas Pumputis <m@lambda.lt>
+Date: Wed, 12 Jun 2019 18:05:40 +0200
+Subject: bpf: simplify definition of BPF_FIB_LOOKUP related flags
+
+From: Martynas Pumputis <m@lambda.lt>
+
+commit b1d6c15b9d824a58c5415673f374fac19e8eccdf upstream.
+
+Previously, the BPF_FIB_LOOKUP_{DIRECT,OUTPUT} flags in the BPF UAPI
+were defined with the help of BIT macro. This had the following issues:
+
+- In order to use any of the flags, a user was required to depend
+  on <linux/bits.h>.
+- No other flag in bpf.h uses the macro, so it seems that an unwritten
+  convention is to use (1 << (nr)) to define BPF-related flags.
+
+Fixes: 87f5fc7e48dd ("bpf: Provide helper to do forwarding lookups in kernel FIB table")
+Signed-off-by: Martynas Pumputis <m@lambda.lt>
+Acked-by: Andrii Nakryiko <andriin@fb.com>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/uapi/linux/bpf.h |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/include/uapi/linux/bpf.h
++++ b/include/uapi/linux/bpf.h
+@@ -2705,8 +2705,8 @@ struct bpf_raw_tracepoint_args {
+ /* DIRECT:  Skip the FIB rules and go to FIB table associated with device
+  * OUTPUT:  Do lookup from egress perspective; default is ingress
+  */
+-#define BPF_FIB_LOOKUP_DIRECT  BIT(0)
+-#define BPF_FIB_LOOKUP_OUTPUT  BIT(1)
++#define BPF_FIB_LOOKUP_DIRECT  (1U << 0)
++#define BPF_FIB_LOOKUP_OUTPUT  (1U << 1)
+ enum {
+       BPF_FIB_LKUP_RET_SUCCESS,      /* lookup successful */
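
For reference, these flags are consumed by the bpf_fib_lookup() helper; a
typical use from an XDP program looks roughly like this (sketch; field setup
is abridged and AF_INET would need a local define in BPF C):

  struct bpf_fib_lookup fib = {};

  fib.family  = AF_INET;
  fib.ifindex = ctx->ingress_ifindex;
  /* ... fill ipv4_src, ipv4_dst, tot_len, etc. from the parsed headers ... */

  /* The flags are plain bit values, so no <linux/bits.h> dependency. */
  int rc = bpf_fib_lookup(ctx, &fib, sizeof(fib),
                          BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_OUTPUT);
  if (rc == BPF_FIB_LKUP_RET_SUCCESS) {
          /* forward using fib.dmac, fib.smac and fib.ifindex */
  }
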
diff --git a/queue-4.19/bpf-udp-avoid-calling-reuseport-s-bpf_prog-from-udp_gro.patch b/queue-4.19/bpf-udp-avoid-calling-reuseport-s-bpf_prog-from-udp_gro.patch
new file mode 100644 (file)
index 0000000..b4fa661
--- /dev/null
@@ -0,0 +1,55 @@
+From 257a525fe2e49584842c504a92c27097407f778f Mon Sep 17 00:00:00 2001
+From: Martin KaFai Lau <kafai@fb.com>
+Date: Fri, 31 May 2019 15:29:13 -0700
+Subject: bpf: udp: Avoid calling reuseport's bpf_prog from udp_gro
+
+From: Martin KaFai Lau <kafai@fb.com>
+
+commit 257a525fe2e49584842c504a92c27097407f778f upstream.
+
+When the commit a6024562ffd7 ("udp: Add GRO functions to UDP socket")
+added udp[46]_lib_lookup_skb to the udp_gro code path, it broke
+the reuseport_select_sock() assumption that skb->data is pointing
+to the transport header.
+
+This patch follows an earlier __udp6_lib_err() fix by
+passing a NULL skb to avoid calling the reuseport's bpf_prog.
+
+Fixes: a6024562ffd7 ("udp: Add GRO functions to UDP socket")
+Cc: Tom Herbert <tom@herbertland.com>
+Signed-off-by: Martin KaFai Lau <kafai@fb.com>
+Acked-by: Song Liu <songliubraving@fb.com>
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/ipv4/udp.c |    6 +++++-
+ net/ipv6/udp.c |    2 +-
+ 2 files changed, 6 insertions(+), 2 deletions(-)
+
+--- a/net/ipv4/udp.c
++++ b/net/ipv4/udp.c
+@@ -542,7 +542,11 @@ static inline struct sock *__udp4_lib_lo
+ struct sock *udp4_lib_lookup_skb(struct sk_buff *skb,
+                                __be16 sport, __be16 dport)
+ {
+-      return __udp4_lib_lookup_skb(skb, sport, dport, &udp_table);
++      const struct iphdr *iph = ip_hdr(skb);
++
++      return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport,
++                               iph->daddr, dport, inet_iif(skb),
++                               inet_sdif(skb), &udp_table, NULL);
+ }
+ EXPORT_SYMBOL_GPL(udp4_lib_lookup_skb);
+--- a/net/ipv6/udp.c
++++ b/net/ipv6/udp.c
+@@ -282,7 +282,7 @@ struct sock *udp6_lib_lookup_skb(struct
+       return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport,
+                                &iph->daddr, dport, inet6_iif(skb),
+-                               inet6_sdif(skb), &udp_table, skb);
++                               inet6_sdif(skb), &udp_table, NULL);
+ }
+ EXPORT_SYMBOL_GPL(udp6_lib_lookup_skb);
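
The assumption matters because SO_ATTACH_REUSEPORT_EBPF programs compute their
offsets relative to skb->data; a minimal selector in the style of the kernel's
reuseport selftests (sketch; identifiers are illustrative):

  #include <linux/bpf.h>
  #include "bpf_helpers.h"

  SEC("socket")
  int select_sock(struct __sk_buff *skb)
  {
          __u32 idx = 0;

          /* run_bpf_filter() pulls the UDP header before running the program,
           * so offset 0 is the first byte of UDP payload.  That only holds
           * when the caller's skb->data sat at the transport header, which is
           * why the patch passes a NULL skb (falling back to hash selection)
           * on the GRO path instead of running the program. */
          if (bpf_skb_load_bytes(skb, 0, &idx, sizeof(idx)) < 0)
                  return 0;

          return idx;                  /* socket index in the reuseport group */
  }

  char _license[] SEC("license") = "GPL";
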
diff --git a/queue-4.19/bpf-udp-ipv6-avoid-running-reuseport-s-bpf_prog-from-__udp6_lib_err.patch b/queue-4.19/bpf-udp-ipv6-avoid-running-reuseport-s-bpf_prog-from-__udp6_lib_err.patch
new file mode 100644 (file)
index 0000000..a554d4f
--- /dev/null
@@ -0,0 +1,50 @@
+From 4ac30c4b3659efac031818c418beb51e630d512d Mon Sep 17 00:00:00 2001
+From: Martin KaFai Lau <kafai@fb.com>
+Date: Fri, 31 May 2019 15:29:11 -0700
+Subject: bpf: udp: ipv6: Avoid running reuseport's bpf_prog from __udp6_lib_err
+
+From: Martin KaFai Lau <kafai@fb.com>
+
+commit 4ac30c4b3659efac031818c418beb51e630d512d upstream.
+
+__udp6_lib_err() may be called when handling an icmpv6 message, for example
+the icmpv6 toobig (type=2).  __udp6_lib_lookup() is then called
+which may call reuseport_select_sock().  reuseport_select_sock() will
+call into a bpf_prog (if there is one).
+
+reuseport_select_sock() is expecting the skb->data pointing to the
+transport header (udphdr in this case).  For example, run_bpf_filter()
+is pulling the transport header.
+
+However, in the __udp6_lib_err() path, the skb->data is pointing to the
+ipv6hdr instead of the udphdr.
+
+One option is to pull and push the ipv6hdr in __udp6_lib_err().
+Instead of doing this, this patch follows how the original
+commit 538950a1b752 ("soreuseport: setsockopt SO_ATTACH_REUSEPORT_[CE]BPF")
+was done in IPv4, which has passed a NULL skb pointer to
+reuseport_select_sock().
+
+Fixes: 538950a1b752 ("soreuseport: setsockopt SO_ATTACH_REUSEPORT_[CE]BPF")
+Cc: Craig Gallek <kraig@google.com>
+Signed-off-by: Martin KaFai Lau <kafai@fb.com>
+Acked-by: Song Liu <songliubraving@fb.com>
+Acked-by: Craig Gallek <kraig@google.com>
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv6/udp.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/ipv6/udp.c
++++ b/net/ipv6/udp.c
+@@ -482,7 +482,7 @@ void __udp6_lib_err(struct sk_buff *skb,
+       struct net *net = dev_net(skb->dev);
+       sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source,
+-                             inet6_iif(skb), 0, udptable, skb);
++                             inet6_iif(skb), 0, udptable, NULL);
+       if (!sk) {
+               __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
+                                 ICMP6_MIB_INERRORS);
diff --git a/queue-4.19/futex-update-comments-and-docs-about-return-values-of-arch-futex-code.patch b/queue-4.19/futex-update-comments-and-docs-about-return-values-of-arch-futex-code.patch
new file mode 100644 (file)
index 0000000..d40da9b
--- /dev/null
@@ -0,0 +1,57 @@
+From 427503519739e779c0db8afe876c1b33f3ac60ae Mon Sep 17 00:00:00 2001
+From: Will Deacon <will.deacon@arm.com>
+Date: Wed, 10 Apr 2019 11:51:54 +0100
+Subject: futex: Update comments and docs about return values of arch futex code
+
+From: Will Deacon <will.deacon@arm.com>
+
+commit 427503519739e779c0db8afe876c1b33f3ac60ae upstream.
+
+The architecture implementations of 'arch_futex_atomic_op_inuser()' and
+'futex_atomic_cmpxchg_inatomic()' are permitted to return only -EFAULT,
+-EAGAIN or -ENOSYS in the case of failure.
+
+Update the comments in the asm-generic/ implementation and also a stray
+reference in the robust futex documentation.
+
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ Documentation/robust-futexes.txt |    3 +--
+ include/asm-generic/futex.h      |    8 ++++++--
+ 2 files changed, 7 insertions(+), 4 deletions(-)
+
+--- a/Documentation/robust-futexes.txt
++++ b/Documentation/robust-futexes.txt
+@@ -218,5 +218,4 @@ All other architectures should build jus
+ the new syscalls yet.
+ Architectures need to implement the new futex_atomic_cmpxchg_inatomic()
+-inline function before writing up the syscalls (that function returns
+--ENOSYS right now).
++inline function before writing up the syscalls.
+--- a/include/asm-generic/futex.h
++++ b/include/asm-generic/futex.h
+@@ -23,7 +23,9 @@
+  *
+  * Return:
+  * 0 - On success
+- * <0 - On error
++ * -EFAULT - User access resulted in a page fault
++ * -EAGAIN - Atomic operation was unable to complete due to contention
++ * -ENOSYS - Operation not supported
+  */
+ static inline int
+ arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, u32 __user *uaddr)
+@@ -85,7 +87,9 @@ out_pagefault_enable:
+  *
+  * Return:
+  * 0 - On success
+- * <0 - On error
++ * -EFAULT - User access resulted in a page fault
++ * -EAGAIN - Atomic operation was unable to complete due to contention
++ * -ENOSYS - Function not implemented (only if !HAVE_FUTEX_CMPXCHG)
+  */
+ static inline int
+ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
diff --git a/queue-4.19/series b/queue-4.19/series
index c3fee71f5637278068c758dec582dd2dbc7b7008..da14211a7da67a5bcdc292aefd42c0ddbec6059c 100644 (file)
@@ -59,3 +59,12 @@ team-always-enable-vlan-tx-offload.patch
 tipc-change-to-use-register_pernet_device.patch
 tipc-check-msg-req-data-len-in-tipc_nl_compat_bearer_disable.patch
 tun-wake-up-waitqueues-after-iff_up-is-set.patch
+bpf-simplify-definition-of-bpf_fib_lookup-related-flags.patch
+bpf-lpm_trie-check-left-child-of-last-leftmost-node-for-null.patch
+bpf-fix-nested-bpf-tracepoints-with-per-cpu-data.patch
+bpf-fix-unconnected-udp-hooks.patch
+bpf-udp-avoid-calling-reuseport-s-bpf_prog-from-udp_gro.patch
+bpf-udp-ipv6-avoid-running-reuseport-s-bpf_prog-from-__udp6_lib_err.patch
+arm64-futex-avoid-copying-out-uninitialised-stack-in-failed-cmpxchg.patch
+bpf-arm64-use-more-scalable-stadd-over-ldxr-stxr-loop-in-xadd.patch
+futex-update-comments-and-docs-about-return-values-of-arch-futex-code.patch