4.9-stable patches

author Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Sat, 18 Mar 2017 14:06:49 +0000 (22:06 +0800)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Sat, 18 Mar 2017 14:06:49 +0000 (22:06 +0800)
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sat, 18 Mar 2017 14:06:49 +0000 (22:06 +0800)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sat, 18 Mar 2017 14:06:49 +0000 (22:06 +0800)
diff --git a/queue-4.9/act_connmark-avoid-crashing-on-malformed-nlattrs-with-null-parms.patch b/queue-4.9/act_connmark-avoid-crashing-on-malformed-nlattrs-with-null-parms.patch

new file mode 100644 (file)

index 0000000..c6a90f0
--- /dev/null
+++ b/queue-4.9/act_connmark-avoid-crashing-on-malformed-nlattrs-with-null-parms.patch
@@ -0,0 +1,57 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Etienne Noss <etienne.noss@wifirst.fr>
+Date: Fri, 10 Mar 2017 16:55:32 +0100
+Subject: act_connmark: avoid crashing on malformed nlattrs with null parms
+
+From: Etienne Noss <etienne.noss@wifirst.fr>
+
+
+[ Upstream commit 52491c7607c5527138095edf44c53169dc1ddb82 ]
+
+tcf_connmark_init does not check in its configuration if TCA_CONNMARK_PARMS
+is set, resulting in a null pointer dereference when trying to access it.
+
+[501099.043007] BUG: unable to handle kernel NULL pointer dereference at 0000000000000004
+[501099.043039] IP: [<ffffffffc10c60fb>] tcf_connmark_init+0x8b/0x180 [act_connmark]
+...
+[501099.044334] Call Trace:
+[501099.044345]  [<ffffffffa47270e8>] ? tcf_action_init_1+0x198/0x1b0
+[501099.044363]  [<ffffffffa47271b0>] ? tcf_action_init+0xb0/0x120
+[501099.044380]  [<ffffffffa47250a4>] ? tcf_exts_validate+0xc4/0x110
+[501099.044398]  [<ffffffffc0f5fa97>] ? u32_set_parms+0xa7/0x270 [cls_u32]
+[501099.044417]  [<ffffffffc0f60bf0>] ? u32_change+0x680/0x87b [cls_u32]
+[501099.044436]  [<ffffffffa4725d1d>] ? tc_ctl_tfilter+0x4dd/0x8a0
+[501099.044454]  [<ffffffffa44a23a1>] ? security_capable+0x41/0x60
+[501099.044471]  [<ffffffffa470ca01>] ? rtnetlink_rcv_msg+0xe1/0x220
+[501099.044490]  [<ffffffffa470c920>] ? rtnl_newlink+0x870/0x870
+[501099.044507]  [<ffffffffa472cc61>] ? netlink_rcv_skb+0xa1/0xc0
+[501099.044524]  [<ffffffffa47073f4>] ? rtnetlink_rcv+0x24/0x30
+[501099.044541]  [<ffffffffa472c634>] ? netlink_unicast+0x184/0x230
+[501099.044558]  [<ffffffffa472c9d8>] ? netlink_sendmsg+0x2f8/0x3b0
+[501099.044576]  [<ffffffffa46d8880>] ? sock_sendmsg+0x30/0x40
+[501099.044592]  [<ffffffffa46d8e03>] ? SYSC_sendto+0xd3/0x150
+[501099.044608]  [<ffffffffa425fda1>] ? __do_page_fault+0x2d1/0x510
+[501099.044626]  [<ffffffffa47fbd7b>] ? system_call_fast_compare_end+0xc/0x9b
+
+Fixes: 22a5dc0e5e3e ("net: sched: Introduce connmark action")
+Signed-off-by: Étienne Noss <etienne.noss@wifirst.fr>
+Signed-off-by: Victorien Molle <victorien.molle@wifirst.fr>
+Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/sched/act_connmark.c |    3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/net/sched/act_connmark.c
++++ b/net/sched/act_connmark.c
+@@ -113,6 +113,9 @@ static int tcf_connmark_init(struct net
+       if (ret < 0)
+               return ret;
+ 
++      if (!tb[TCA_CONNMARK_PARMS])
++              return -EINVAL;
++
+       parm = nla_data(tb[TCA_CONNMARK_PARMS]);
+ 
+       if (!tcf_hash_check(tn, parm->index, a, bind)) {
diff --git a/queue-4.9/bpf-detect-identical-ptr_to_map_value_or_null-registers.patch b/queue-4.9/bpf-detect-identical-ptr_to_map_value_or_null-registers.patch

new file mode 100644 (file)

index 0000000..d888eb8
--- /dev/null
+++ b/queue-4.9/bpf-detect-identical-ptr_to_map_value_or_null-registers.patch
@@ -0,0 +1,173 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Thomas Graf <tgraf@suug.ch>
+Date: Tue, 18 Oct 2016 19:51:19 +0200
+Subject: bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers
+
+From: Thomas Graf <tgraf@suug.ch>
+
+
+[ Upstream commit 57a09bf0a416700676e77102c28f9cfcb48267e0 ]
+
+A BPF program is required to check the return register of a
+map_elem_lookup() call before accessing memory. The verifier keeps
+track of this by converting the type of the result register from
+PTR_TO_MAP_VALUE_OR_NULL to PTR_TO_MAP_VALUE after a conditional
+jump ensures safety. This check is currently exclusively performed
+for the result register 0.
+
+In the event the compiler reorders instructions, BPF_MOV64_REG
+instructions may be moved before the conditional jump which causes
+them to keep their type PTR_TO_MAP_VALUE_OR_NULL to which the
+verifier objects when the register is accessed:
+
+0: (b7) r1 = 10
+1: (7b) *(u64 *)(r10 -8) = r1
+2: (bf) r2 = r10
+3: (07) r2 += -8
+4: (18) r1 = 0x59c00000
+6: (85) call 1
+7: (bf) r4 = r0
+8: (15) if r0 == 0x0 goto pc+1
+ R0=map_value(ks=8,vs=8) R4=map_value_or_null(ks=8,vs=8) R10=fp
+9: (7a) *(u64 *)(r4 +0) = 0
+R4 invalid mem access 'map_value_or_null'
+
+This commit extends the verifier to keep track of all identical
+PTR_TO_MAP_VALUE_OR_NULL registers after a map_elem_lookup() by
+assigning them an ID and then marking them all when the conditional
+jump is observed.
+
+Signed-off-by: Thomas Graf <tgraf@suug.ch>
+Reviewed-by: Josef Bacik <jbacik@fb.com>
+Acked-by: Daniel Borkmann <daniel@iogearbox.net>
+Acked-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/bpf_verifier.h |    2 -
+ kernel/bpf/verifier.c        |   61 +++++++++++++++++++++++++++++++------------
+ 2 files changed, 46 insertions(+), 17 deletions(-)
+
+--- a/include/linux/bpf_verifier.h
++++ b/include/linux/bpf_verifier.h
+@@ -24,13 +24,13 @@ struct bpf_reg_state {
+        */
+       s64 min_value;
+       u64 max_value;
++      u32 id;
+       union {
+               /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */
+               s64 imm;
+ 
+               /* valid when type == PTR_TO_PACKET* */
+               struct {
+-                      u32 id;
+                       u16 off;
+                       u16 range;
+               };
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -212,9 +212,10 @@ static void print_verifier_state(struct
+               else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
+                        t == PTR_TO_MAP_VALUE_OR_NULL ||
+                        t == PTR_TO_MAP_VALUE_ADJ)
+-                      verbose("(ks=%d,vs=%d)",
++                      verbose("(ks=%d,vs=%d,id=%u)",
+                               reg->map_ptr->key_size,
+-                              reg->map_ptr->value_size);
++                              reg->map_ptr->value_size,
++                              reg->id);
+               if (reg->min_value != BPF_REGISTER_MIN_RANGE)
+                       verbose(",min_value=%lld",
+                               (long long)reg->min_value);
+@@ -447,6 +448,7 @@ static void mark_reg_unknown_value(struc
+ {
+       BUG_ON(regno >= MAX_BPF_REG);
+       regs[regno].type = UNKNOWN_VALUE;
++      regs[regno].id = 0;
+       regs[regno].imm = 0;
+ }
+ 
+@@ -1252,6 +1254,7 @@ static int check_call(struct bpf_verifie
+                       return -EINVAL;
+               }
+               regs[BPF_REG_0].map_ptr = meta.map_ptr;
++              regs[BPF_REG_0].id = ++env->id_gen;
+       } else {
+               verbose("unknown return type %d of func %d\n",
+                       fn->ret_type, func_id);
+@@ -1668,8 +1671,7 @@ static int check_alu_op(struct bpf_verif
+                                               insn->src_reg);
+                                       return -EACCES;
+                               }
+-                              regs[insn->dst_reg].type = UNKNOWN_VALUE;
+-                              regs[insn->dst_reg].map_ptr = NULL;
++                              mark_reg_unknown_value(regs, insn->dst_reg);
+                       }
+               } else {
+                       /* case: R = imm
+@@ -1931,6 +1933,38 @@ static void reg_set_min_max_inv(struct b
+       check_reg_overflow(true_reg);
+ }
+ 
++static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
++                       enum bpf_reg_type type)
++{
++      struct bpf_reg_state *reg = &regs[regno];
++
++      if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) {
++              reg->type = type;
++              if (type == UNKNOWN_VALUE)
++                      mark_reg_unknown_value(regs, regno);
++      }
++}
++
++/* The logic is similar to find_good_pkt_pointers(), both could eventually
++ * be folded together at some point.
++ */
++static void mark_map_regs(struct bpf_verifier_state *state, u32 regno,
++                        enum bpf_reg_type type)
++{
++      struct bpf_reg_state *regs = state->regs;
++      int i;
++
++      for (i = 0; i < MAX_BPF_REG; i++)
++              mark_map_reg(regs, i, regs[regno].id, type);
++
++      for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
++              if (state->stack_slot_type[i] != STACK_SPILL)
++                      continue;
++              mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE,
++                           regs[regno].id, type);
++      }
++}
++
+ static int check_cond_jmp_op(struct bpf_verifier_env *env,
+                            struct bpf_insn *insn, int *insn_idx)
+ {
+@@ -2018,18 +2052,13 @@ static int check_cond_jmp_op(struct bpf_
+       if (BPF_SRC(insn->code) == BPF_K &&
+           insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
+           dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
+-              if (opcode == BPF_JEQ) {
+-                      /* next fallthrough insn can access memory via
+-                       * this register
+-                       */
+-                      regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
+-                      /* branch targer cannot access it, since reg == 0 */
+-                      mark_reg_unknown_value(other_branch->regs,
+-                                             insn->dst_reg);
+-              } else {
+-                      other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
+-                      mark_reg_unknown_value(regs, insn->dst_reg);
+-              }
++              /* Mark all identical map registers in each branch as either
++               * safe or unknown depending R == 0 or R != 0 conditional.
++               */
++              mark_map_regs(this_branch, insn->dst_reg,
++                            opcode == BPF_JEQ ? PTR_TO_MAP_VALUE : UNKNOWN_VALUE);
++              mark_map_regs(other_branch, insn->dst_reg,
++                            opcode == BPF_JEQ ? UNKNOWN_VALUE : PTR_TO_MAP_VALUE);
+       } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
+                  dst_reg->type == PTR_TO_PACKET &&
+                  regs[insn->src_reg].type == PTR_TO_PACKET_END) {
diff --git a/queue-4.9/bpf-fix-mark_reg_unknown_value-for-spilled-regs-on-map-value-marking.patch b/queue-4.9/bpf-fix-mark_reg_unknown_value-for-spilled-regs-on-map-value-marking.patch

new file mode 100644 (file)

index 0000000..c2db16d
--- /dev/null
+++ b/queue-4.9/bpf-fix-mark_reg_unknown_value-for-spilled-regs-on-map-value-marking.patch
@@ -0,0 +1,103 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Daniel Borkmann <daniel@iogearbox.net>
+Date: Sun, 18 Dec 2016 01:52:59 +0100
+Subject: bpf: fix mark_reg_unknown_value for spilled regs on map value marking
+
+From: Daniel Borkmann <daniel@iogearbox.net>
+
+
+[ Upstream commit 6760bf2ddde8ad64f8205a651223a93de3a35494 ]
+
+Martin reported a verifier issue that hit the BUG_ON() for his
+test case in the mark_reg_unknown_value() function:
+
+  [  202.861380] kernel BUG at kernel/bpf/verifier.c:467!
+  [...]
+  [  203.291109] Call Trace:
+  [  203.296501]  [<ffffffff811364d5>] mark_map_reg+0x45/0x50
+  [  203.308225]  [<ffffffff81136558>] mark_map_regs+0x78/0x90
+  [  203.320140]  [<ffffffff8113938d>] do_check+0x226d/0x2c90
+  [  203.331865]  [<ffffffff8113a6ab>] bpf_check+0x48b/0x780
+  [  203.343403]  [<ffffffff81134c8e>] bpf_prog_load+0x27e/0x440
+  [  203.355705]  [<ffffffff8118a38f>] ? handle_mm_fault+0x11af/0x1230
+  [  203.369158]  [<ffffffff812d8188>] ? security_capable+0x48/0x60
+  [  203.382035]  [<ffffffff811351a4>] SyS_bpf+0x124/0x960
+  [  203.393185]  [<ffffffff810515f6>] ? __do_page_fault+0x276/0x490
+  [  203.406258]  [<ffffffff816db320>] entry_SYSCALL_64_fastpath+0x13/0x94
+
+This issue got uncovered after the fix in a08dd0da5307 ("bpf: fix
+regression on verifier pruning wrt map lookups"). The reason why it
+wasn't noticed before was that, as mentioned in a08dd0da5307,
+mark_map_regs() was doing the id matching incorrectly based on the
+uncached regs[regno].id. So, in the first loop, we walked all regs
+and as soon as we found regno == i, then this reg's id was cleared
+when calling mark_reg_unknown_value() so that every subsequent
+register was probed against id of 0 (which, in combination with the
+PTR_TO_MAP_VALUE_OR_NULL type is an invalid condition that no other
+register state can hold), and therefore wasn't type transitioned such
+as in the spilled register case for the second loop.
+
+Now since that got fixed, it turned out that 57a09bf0a416 ("bpf:
+Detect identical PTR_TO_MAP_VALUE_OR_NULL registers") used
+mark_reg_unknown_value() incorrectly for the spilled regs, and thus
+hitting the BUG_ON() in some cases due to regno >= MAX_BPF_REG.
+
+Although spilled regs have the same type as the non-spilled regs
+for the verifier state, that is, struct bpf_reg_state, they are
+semantically different from the non-spilled regs. In other words,
+there can be up to 64 (MAX_BPF_STACK / BPF_REG_SIZE) spilled regs
+in the stack, for example, register R<x> could have been spilled by
+the program to stack location X, Y, Z, and in mark_map_regs() we
+need to scan these stack slots of type STACK_SPILL for potential
+registers that we have to transition from PTR_TO_MAP_VALUE_OR_NULL.
+Therefore, depending on the location, the spilled_regs regno can
+be a lot higher than just MAX_BPF_REG's value since we operate on
+stack instead. The reset in mark_reg_unknown_value() itself is
+just fine, only that the BUG_ON() was inappropriate for this. Fix
+it by making a __mark_reg_unknown_value() version that can be
+called from mark_map_reg() generically; we know for the non-spilled
+case that the regno is always < MAX_BPF_REG anyway.
+
+Fixes: 57a09bf0a416 ("bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers")
+Reported-by: Martin KaFai Lau <kafai@fb.com>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Acked-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/bpf/verifier.c |   11 ++++++++---
+ 1 file changed, 8 insertions(+), 3 deletions(-)
+
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -444,14 +444,19 @@ static void init_reg_state(struct bpf_re
+       regs[BPF_REG_1].type = PTR_TO_CTX;
+ }
+ 
+-static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno)
++static void __mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno)
+ {
+-      BUG_ON(regno >= MAX_BPF_REG);
+       regs[regno].type = UNKNOWN_VALUE;
+       regs[regno].id = 0;
+       regs[regno].imm = 0;
+ }
+ 
++static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno)
++{
++      BUG_ON(regno >= MAX_BPF_REG);
++      __mark_reg_unknown_value(regs, regno);
++}
++
+ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno)
+ {
+       regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
+@@ -1946,7 +1951,7 @@ static void mark_map_reg(struct bpf_reg_
+                */
+               reg->id = 0;
+               if (type == UNKNOWN_VALUE)
+-                      mark_reg_unknown_value(regs, regno);
++                      __mark_reg_unknown_value(regs, regno);
+       }
+ }
+ 
diff --git a/queue-4.9/bpf-fix-regression-on-verifier-pruning-wrt-map-lookups.patch b/queue-4.9/bpf-fix-regression-on-verifier-pruning-wrt-map-lookups.patch

new file mode 100644 (file)

index 0000000..81aa8a0
--- /dev/null
+++ b/queue-4.9/bpf-fix-regression-on-verifier-pruning-wrt-map-lookups.patch
@@ -0,0 +1,107 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Daniel Borkmann <daniel@iogearbox.net>
+Date: Thu, 15 Dec 2016 01:30:06 +0100
+Subject: bpf: fix regression on verifier pruning wrt map lookups
+
+From: Daniel Borkmann <daniel@iogearbox.net>
+
+
+[ Upstream commit a08dd0da5307ba01295c8383923e51e7997c3576 ]
+
+Commit 57a09bf0a416 ("bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL
+registers") introduced a regression where existing programs stopped
+loading due to reaching the verifier's maximum complexity limit,
+whereas prior to this commit they were loading just fine; the affected
+program has roughly 2k instructions.
+
+What was found is that state pruning couldn't be performed effectively
+anymore due to mismatches of the verifier's register state, in particular
+in the id tracking. It doesn't mean that 57a09bf0a416 is incorrect per
+se, but rather that verifier needs to perform a lot more work for the
+same program with regards to involved map lookups.
+
+Since commit 57a09bf0a416 is only about tracking registers with type
+PTR_TO_MAP_VALUE_OR_NULL, the id is only needed to follow registers
+until they are promoted through pattern matching with a NULL check to
+either PTR_TO_MAP_VALUE or UNKNOWN_VALUE type. After that point, the
+id becomes irrelevant for the transitioned types.
+
+For UNKNOWN_VALUE, id is already reset to 0 via mark_reg_unknown_value(),
+but not so for PTR_TO_MAP_VALUE where id is becoming stale. It's even
+transferred further into other types that don't make use of it. Among
+others, one example is where UNKNOWN_VALUE is set on function call
+return with RET_INTEGER return type.
+
+states_equal() will then fall through the memcmp() on register state;
+note that the second memcmp() uses offsetofend(), so the id is part of
+that since d2a4dd37f6b4 ("bpf: fix state equivalence"). But the bisect
+pointed already to 57a09bf0a416, where we really reach beyond complexity
+limit. What I found was that states_equal() often failed in this
+case due to id mismatches in spilled regs with registers in type
+PTR_TO_MAP_VALUE. Unlike non-spilled regs, spilled regs just perform
+a memcmp() on their reg state and don't have any other optimizations
+in place, therefore also id was relevant in this case for making a
+pruning decision.
+
+We can safely reset id to 0 as well when converting to PTR_TO_MAP_VALUE.
+For the affected program, it resulted in a ~17 fold reduction of
+complexity and let the program load fine again. Selftest suite also
+runs fine. The only other place where env->id_gen is used currently is
+through direct packet access, but for these cases id is long living, thus
+a different scenario.
+
+Also, the current logic in mark_map_regs() is not fully correct when
+marking NULL branch with UNKNOWN_VALUE. We need to cache the destination
+reg's id in any case. Otherwise, once we marked that reg as UNKNOWN_VALUE,
+its id is reset and any subsequent registers that hold the original id
+and are of type PTR_TO_MAP_VALUE_OR_NULL won't be marked UNKNOWN_VALUE
+anymore, since mark_map_reg() reuses the uncached regs[regno].id that
+was just overridden. Note, we don't need to cache it outside of
+mark_map_regs(), since it's called once on this_branch and the other
+time on other_branch, which are both two independent verifier states.
+A test case for this is added here, too.
+
+Fixes: 57a09bf0a416 ("bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers")
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Acked-by: Thomas Graf <tgraf@suug.ch>
+Acked-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/bpf/verifier.c |   11 ++++++++---
+ 1 file changed, 8 insertions(+), 3 deletions(-)
+
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -1940,6 +1940,11 @@ static void mark_map_reg(struct bpf_reg_
+ 
+       if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) {
+               reg->type = type;
++              /* We don't need id from this point onwards anymore, thus we
++               * should better reset it, so that state pruning has chances
++               * to take effect.
++               */
++              reg->id = 0;
+               if (type == UNKNOWN_VALUE)
+                       mark_reg_unknown_value(regs, regno);
+       }
+@@ -1952,16 +1957,16 @@ static void mark_map_regs(struct bpf_ver
+                         enum bpf_reg_type type)
+ {
+       struct bpf_reg_state *regs = state->regs;
++      u32 id = regs[regno].id;
+       int i;
+ 
+       for (i = 0; i < MAX_BPF_REG; i++)
+-              mark_map_reg(regs, i, regs[regno].id, type);
++              mark_map_reg(regs, i, id, type);
+ 
+       for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
+               if (state->stack_slot_type[i] != STACK_SPILL)
+                       continue;
+-              mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE,
+-                           regs[regno].id, type);
++              mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, type);
+       }
+ }
+ 
diff --git a/queue-4.9/bpf-fix-state-equivalence.patch b/queue-4.9/bpf-fix-state-equivalence.patch

new file mode 100644 (file)

index 0000000..4e3adfd
--- /dev/null
+++ b/queue-4.9/bpf-fix-state-equivalence.patch
@@ -0,0 +1,68 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Alexei Starovoitov <ast@fb.com>
+Date: Wed, 7 Dec 2016 10:57:59 -0800
+Subject: bpf: fix state equivalence
+
+From: Alexei Starovoitov <ast@fb.com>
+
+
+[ Upstream commit d2a4dd37f6b41fbcad76efbf63124eb3126c66fe ]
+
+Commits 57a09bf0a416 ("bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers")
+and 484611357c19 ("bpf: allow access into map value arrays") by themselves
+are correct, but in combination they make state equivalence ignore 'id' field
+of the register state which can lead to accepting invalid program.
+
+Fixes: 57a09bf0a416 ("bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers")
+Fixes: 484611357c19 ("bpf: allow access into map value arrays")
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Acked-by: Daniel Borkmann <daniel@iogearbox.net>
+Acked-by: Thomas Graf <tgraf@suug.ch>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/bpf_verifier.h |   14 +++++++-------
+ kernel/bpf/verifier.c        |    2 +-
+ 2 files changed, 8 insertions(+), 8 deletions(-)
+
+--- a/include/linux/bpf_verifier.h
++++ b/include/linux/bpf_verifier.h
+@@ -18,13 +18,6 @@
+ 
+ struct bpf_reg_state {
+       enum bpf_reg_type type;
+-      /*
+-       * Used to determine if any memory access using this register will
+-       * result in a bad access.
+-       */
+-      s64 min_value;
+-      u64 max_value;
+-      u32 id;
+       union {
+               /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */
+               s64 imm;
+@@ -40,6 +33,13 @@ struct bpf_reg_state {
+                */
+               struct bpf_map *map_ptr;
+       };
++      u32 id;
++      /* Used to determine if any memory access using this register will
++       * result in a bad access. These two fields must be last.
++       * See states_equal()
++       */
++      s64 min_value;
++      u64 max_value;
+ };
+ 
+ enum bpf_stack_slot_type {
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -2498,7 +2498,7 @@ static bool states_equal(struct bpf_veri
+                * we didn't do a variable access into a map then we are a-ok.
+                */
+               if (!varlen_map_access &&
+-                  rold->type == rcur->type && rold->imm == rcur->imm)
++                  memcmp(rold, rcur, offsetofend(struct bpf_reg_state, id)) == 0)
+                       continue;
+ 
+               /* If we didn't map access then again we don't care about the
diff --git a/queue-4.9/bridge-drop-netfilter-fake-rtable-unconditionally.patch b/queue-4.9/bridge-drop-netfilter-fake-rtable-unconditionally.patch

new file mode 100644 (file)

index 0000000..244c03c
--- /dev/null
+++ b/queue-4.9/bridge-drop-netfilter-fake-rtable-unconditionally.patch
@@ -0,0 +1,83 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Florian Westphal <fw@strlen.de>
+Date: Mon, 13 Mar 2017 17:38:17 +0100
+Subject: bridge: drop netfilter fake rtable unconditionally
+
+From: Florian Westphal <fw@strlen.de>
+
+
+[ Upstream commit a13b2082ece95247779b9995c4e91b4246bed023 ]
+
+Andreas reports kernel oops during rmmod of the br_netfilter module.
+Hannes debugged the oops down to a NULL rt6info->rt6i_indev.
+
+Problem is that br_netfilter has the nasty concept of adding a fake
+rtable to skb->dst; this happens in a br_netfilter prerouting hook.
+
+A second hook (in bridge LOCAL_IN) is supposed to remove these again
+before the skb is handed up the stack.
+
+However, on module unload hooks get unregistered which means an
+skb could traverse the prerouting hook that attaches the fake_rtable,
+while the 'fake rtable remove' hook gets removed from the hooklist
+immediately after.
+
+Fixes: 34666d467cbf1e2e3c7 ("netfilter: bridge: move br_netfilter out of the core")
+Reported-by: Andreas Karis <akaris@redhat.com>
+Debugged-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Acked-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/bridge/br_input.c           |    1 +
+ net/bridge/br_netfilter_hooks.c |   21 ---------------------
+ 2 files changed, 1 insertion(+), 21 deletions(-)
+
+--- a/net/bridge/br_input.c
++++ b/net/bridge/br_input.c
+@@ -29,6 +29,7 @@ EXPORT_SYMBOL(br_should_route_hook);
+ static int
+ br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb)
+ {
++      br_drop_fake_rtable(skb);
+       return netif_receive_skb(skb);
+ }
+ 
+--- a/net/bridge/br_netfilter_hooks.c
++++ b/net/bridge/br_netfilter_hooks.c
+@@ -521,21 +521,6 @@ static unsigned int br_nf_pre_routing(vo
+ }
+ 
+ 
+-/* PF_BRIDGE/LOCAL_IN ************************************************/
+-/* The packet is locally destined, which requires a real
+- * dst_entry, so detach the fake one.  On the way up, the
+- * packet would pass through PRE_ROUTING again (which already
+- * took place when the packet entered the bridge), but we
+- * register an IPv4 PRE_ROUTING 'sabotage' hook that will
+- * prevent this from happening. */
+-static unsigned int br_nf_local_in(void *priv,
+-                                 struct sk_buff *skb,
+-                                 const struct nf_hook_state *state)
+-{
+-      br_drop_fake_rtable(skb);
+-      return NF_ACCEPT;
+-}
+-
+ /* PF_BRIDGE/FORWARD *************************************************/
+ static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+ {
+@@ -906,12 +891,6 @@ static struct nf_hook_ops br_nf_ops[] __
+               .priority = NF_BR_PRI_BRNF,
+       },
+       {
+-              .hook = br_nf_local_in,
+-              .pf = NFPROTO_BRIDGE,
+-              .hooknum = NF_BR_LOCAL_IN,
+-              .priority = NF_BR_PRI_BRNF,
+-      },
+-      {
+               .hook = br_nf_forward_ip,
+               .pf = NFPROTO_BRIDGE,
+               .hooknum = NF_BR_FORWARD,
diff --git a/queue-4.9/dccp-fix-memory-leak-during-tear-down-of-unsuccessful-connection-request.patch b/queue-4.9/dccp-fix-memory-leak-during-tear-down-of-unsuccessful-connection-request.patch

new file mode 100644 (file)

index 0000000..9fda3c4
--- /dev/null
+++ b/queue-4.9/dccp-fix-memory-leak-during-tear-down-of-unsuccessful-connection-request.patch
@@ -0,0 +1,33 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Hannes Frederic Sowa <hannes@stressinduktion.org>
+Date: Mon, 13 Mar 2017 00:01:30 +0100
+Subject: dccp: fix memory leak during tear-down of unsuccessful connection request
+
+From: Hannes Frederic Sowa <hannes@stressinduktion.org>
+
+
+[ Upstream commit 72ef9c4125c7b257e3a714d62d778ab46583d6a3 ]
+
+This patch fixes a memory leak, which happens if the connection request
+is not fulfilled between parsing the DCCP options and handling the SYN
+(because e.g. the backlog is full), because we forgot to free the
+list of ack vectors.
+
+Reported-by: Jianwen Ji <jiji@redhat.com>
+Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/dccp/ccids/ccid2.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/net/dccp/ccids/ccid2.c
++++ b/net/dccp/ccids/ccid2.c
+@@ -749,6 +749,7 @@ static void ccid2_hc_tx_exit(struct sock
+       for (i = 0; i < hc->tx_seqbufc; i++)
+               kfree(hc->tx_seqbuf[i]);
+       hc->tx_seqbufc = 0;
++      dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks);
+ }
+ 
+ static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb)
diff --git a/queue-4.9/dccp-fix-use-after-free-in-dccp_feat_activate_values.patch b/queue-4.9/dccp-fix-use-after-free-in-dccp_feat_activate_values.patch

new file mode 100644 (file)

index 0000000..159e444
--- /dev/null
+++ b/queue-4.9/dccp-fix-use-after-free-in-dccp_feat_activate_values.patch
@@ -0,0 +1,237 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Eric Dumazet <edumazet@google.com>
+Date: Sun, 5 Mar 2017 10:52:16 -0800
+Subject: dccp: fix use-after-free in dccp_feat_activate_values
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit 62f8f4d9066c1c6f2474845d1ca7e2891f2ae3fd ]
+
+Dmitry reported crashes in DCCP stack [1]
+
+Problem here is that when I got rid of listener spinlock, I missed the
+fact that DCCP stores a complex state in struct dccp_request_sock,
+while TCP does not.
+
+Since multiple cpus could access it at the same time, we need to add
+protection.
+
+[1]
+BUG: KASAN: use-after-free in dccp_feat_activate_values+0x967/0xab0
+net/dccp/feat.c:1541 at addr ffff88003713be68
+Read of size 8 by task syz-executor2/8457
+CPU: 2 PID: 8457 Comm: syz-executor2 Not tainted 4.10.0-rc7+ #127
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
+Call Trace:
+ <IRQ>
+ __dump_stack lib/dump_stack.c:15 [inline]
+ dump_stack+0x292/0x398 lib/dump_stack.c:51
+ kasan_object_err+0x1c/0x70 mm/kasan/report.c:162
+ print_address_description mm/kasan/report.c:200 [inline]
+ kasan_report_error mm/kasan/report.c:289 [inline]
+ kasan_report.part.1+0x20e/0x4e0 mm/kasan/report.c:311
+ kasan_report mm/kasan/report.c:332 [inline]
+ __asan_report_load8_noabort+0x29/0x30 mm/kasan/report.c:332
+ dccp_feat_activate_values+0x967/0xab0 net/dccp/feat.c:1541
+ dccp_create_openreq_child+0x464/0x610 net/dccp/minisocks.c:121
+ dccp_v6_request_recv_sock+0x1f6/0x1960 net/dccp/ipv6.c:457
+ dccp_check_req+0x335/0x5a0 net/dccp/minisocks.c:186
+ dccp_v6_rcv+0x69e/0x1d00 net/dccp/ipv6.c:711
+ ip6_input_finish+0x46d/0x17a0 net/ipv6/ip6_input.c:279
+ NF_HOOK include/linux/netfilter.h:257 [inline]
+ ip6_input+0xdb/0x590 net/ipv6/ip6_input.c:322
+ dst_input include/net/dst.h:507 [inline]
+ ip6_rcv_finish+0x289/0x890 net/ipv6/ip6_input.c:69
+ NF_HOOK include/linux/netfilter.h:257 [inline]
+ ipv6_rcv+0x12ec/0x23d0 net/ipv6/ip6_input.c:203
+ __netif_receive_skb_core+0x1ae5/0x3400 net/core/dev.c:4190
+ __netif_receive_skb+0x2a/0x170 net/core/dev.c:4228
+ process_backlog+0xe5/0x6c0 net/core/dev.c:4839
+ napi_poll net/core/dev.c:5202 [inline]
+ net_rx_action+0xe70/0x1900 net/core/dev.c:5267
+ __do_softirq+0x2fb/0xb7d kernel/softirq.c:284
+ do_softirq_own_stack+0x1c/0x30 arch/x86/entry/entry_64.S:902
+ </IRQ>
+ do_softirq.part.17+0x1e8/0x230 kernel/softirq.c:328
+ do_softirq kernel/softirq.c:176 [inline]
+ __local_bh_enable_ip+0x1f2/0x200 kernel/softirq.c:181
+ local_bh_enable include/linux/bottom_half.h:31 [inline]
+ rcu_read_unlock_bh include/linux/rcupdate.h:971 [inline]
+ ip6_finish_output2+0xbb0/0x23d0 net/ipv6/ip6_output.c:123
+ ip6_finish_output+0x302/0x960 net/ipv6/ip6_output.c:148
+ NF_HOOK_COND include/linux/netfilter.h:246 [inline]
+ ip6_output+0x1cb/0x8d0 net/ipv6/ip6_output.c:162
+ ip6_xmit+0xcdf/0x20d0 include/net/dst.h:501
+ inet6_csk_xmit+0x320/0x5f0 net/ipv6/inet6_connection_sock.c:179
+ dccp_transmit_skb+0xb09/0x1120 net/dccp/output.c:141
+ dccp_xmit_packet+0x215/0x760 net/dccp/output.c:280
+ dccp_write_xmit+0x168/0x1d0 net/dccp/output.c:362
+ dccp_sendmsg+0x79c/0xb10 net/dccp/proto.c:796
+ inet_sendmsg+0x164/0x5b0 net/ipv4/af_inet.c:744
+ sock_sendmsg_nosec net/socket.c:635 [inline]
+ sock_sendmsg+0xca/0x110 net/socket.c:645
+ SYSC_sendto+0x660/0x810 net/socket.c:1687
+ SyS_sendto+0x40/0x50 net/socket.c:1655
+ entry_SYSCALL_64_fastpath+0x1f/0xc2
+RIP: 0033:0x4458b9
+RSP: 002b:00007f8ceb77bb58 EFLAGS: 00000282 ORIG_RAX: 000000000000002c
+RAX: ffffffffffffffda RBX: 0000000000000017 RCX: 00000000004458b9
+RDX: 0000000000000023 RSI: 0000000020e60000 RDI: 0000000000000017
+RBP: 00000000006e1b90 R08: 00000000200f9fe1 R09: 0000000000000020
+R10: 0000000000008010 R11: 0000000000000282 R12: 00000000007080a8
+R13: 0000000000000000 R14: 00007f8ceb77c9c0 R15: 00007f8ceb77c700
+Object at ffff88003713be50, in cache kmalloc-64 size: 64
+Allocated:
+PID = 8446
+ save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:57
+ save_stack+0x43/0xd0 mm/kasan/kasan.c:502
+ set_track mm/kasan/kasan.c:514 [inline]
+ kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:605
+ kmem_cache_alloc_trace+0x82/0x270 mm/slub.c:2738
+ kmalloc include/linux/slab.h:490 [inline]
+ dccp_feat_entry_new+0x214/0x410 net/dccp/feat.c:467
+ dccp_feat_push_change+0x38/0x220 net/dccp/feat.c:487
+ __feat_register_sp+0x223/0x2f0 net/dccp/feat.c:741
+ dccp_feat_propagate_ccid+0x22b/0x2b0 net/dccp/feat.c:949
+ dccp_feat_server_ccid_dependencies+0x1b3/0x250 net/dccp/feat.c:1012
+ dccp_make_response+0x1f1/0xc90 net/dccp/output.c:423
+ dccp_v6_send_response+0x4ec/0xc20 net/dccp/ipv6.c:217
+ dccp_v6_conn_request+0xaba/0x11b0 net/dccp/ipv6.c:377
+ dccp_rcv_state_process+0x51e/0x1650 net/dccp/input.c:606
+ dccp_v6_do_rcv+0x213/0x350 net/dccp/ipv6.c:632
+ sk_backlog_rcv include/net/sock.h:893 [inline]
+ __sk_receive_skb+0x36f/0xcc0 net/core/sock.c:479
+ dccp_v6_rcv+0xba5/0x1d00 net/dccp/ipv6.c:742
+ ip6_input_finish+0x46d/0x17a0 net/ipv6/ip6_input.c:279
+ NF_HOOK include/linux/netfilter.h:257 [inline]
+ ip6_input+0xdb/0x590 net/ipv6/ip6_input.c:322
+ dst_input include/net/dst.h:507 [inline]
+ ip6_rcv_finish+0x289/0x890 net/ipv6/ip6_input.c:69
+ NF_HOOK include/linux/netfilter.h:257 [inline]
+ ipv6_rcv+0x12ec/0x23d0 net/ipv6/ip6_input.c:203
+ __netif_receive_skb_core+0x1ae5/0x3400 net/core/dev.c:4190
+ __netif_receive_skb+0x2a/0x170 net/core/dev.c:4228
+ process_backlog+0xe5/0x6c0 net/core/dev.c:4839
+ napi_poll net/core/dev.c:5202 [inline]
+ net_rx_action+0xe70/0x1900 net/core/dev.c:5267
+ __do_softirq+0x2fb/0xb7d kernel/softirq.c:284
+Freed:
+PID = 15
+ save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:57
+ save_stack+0x43/0xd0 mm/kasan/kasan.c:502
+ set_track mm/kasan/kasan.c:514 [inline]
+ kasan_slab_free+0x73/0xc0 mm/kasan/kasan.c:578
+ slab_free_hook mm/slub.c:1355 [inline]
+ slab_free_freelist_hook mm/slub.c:1377 [inline]
+ slab_free mm/slub.c:2954 [inline]
+ kfree+0xe8/0x2b0 mm/slub.c:3874
+ dccp_feat_entry_destructor.part.4+0x48/0x60 net/dccp/feat.c:418
+ dccp_feat_entry_destructor net/dccp/feat.c:416 [inline]
+ dccp_feat_list_pop net/dccp/feat.c:541 [inline]
+ dccp_feat_activate_values+0x57f/0xab0 net/dccp/feat.c:1543
+ dccp_create_openreq_child+0x464/0x610 net/dccp/minisocks.c:121
+ dccp_v6_request_recv_sock+0x1f6/0x1960 net/dccp/ipv6.c:457
+ dccp_check_req+0x335/0x5a0 net/dccp/minisocks.c:186
+ dccp_v6_rcv+0x69e/0x1d00 net/dccp/ipv6.c:711
+ ip6_input_finish+0x46d/0x17a0 net/ipv6/ip6_input.c:279
+ NF_HOOK include/linux/netfilter.h:257 [inline]
+ ip6_input+0xdb/0x590 net/ipv6/ip6_input.c:322
+ dst_input include/net/dst.h:507 [inline]
+ ip6_rcv_finish+0x289/0x890 net/ipv6/ip6_input.c:69
+ NF_HOOK include/linux/netfilter.h:257 [inline]
+ ipv6_rcv+0x12ec/0x23d0 net/ipv6/ip6_input.c:203
+ __netif_receive_skb_core+0x1ae5/0x3400 net/core/dev.c:4190
+ __netif_receive_skb+0x2a/0x170 net/core/dev.c:4228
+ process_backlog+0xe5/0x6c0 net/core/dev.c:4839
+ napi_poll net/core/dev.c:5202 [inline]
+ net_rx_action+0xe70/0x1900 net/core/dev.c:5267
+ __do_softirq+0x2fb/0xb7d kernel/softirq.c:284
+Memory state around the buggy address:
+ ffff88003713bd00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ ffff88003713bd80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+>ffff88003713be00: fc fc fc fc fc fc fc fc fc fc fb fb fb fb fb fb
+                                                          ^
+
+Fixes: 079096f103fa ("tcp/dccp: install syn_recv requests into ehash table")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: Dmitry Vyukov <dvyukov@google.com>
+Tested-by: Dmitry Vyukov <dvyukov@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/dccp.h |    1 +
+ net/dccp/minisocks.c |   24 ++++++++++++++++--------
+ 2 files changed, 17 insertions(+), 8 deletions(-)
+
+--- a/include/linux/dccp.h
++++ b/include/linux/dccp.h
+@@ -163,6 +163,7 @@ struct dccp_request_sock {
+       __u64                    dreq_isr;
+       __u64                    dreq_gsr;
+       __be32                   dreq_service;
++      spinlock_t               dreq_lock;
+       struct list_head         dreq_featneg;
+       __u32                    dreq_timestamp_echo;
+       __u32                    dreq_timestamp_time;
+--- a/net/dccp/minisocks.c
++++ b/net/dccp/minisocks.c
+@@ -146,6 +146,13 @@ struct sock *dccp_check_req(struct sock
+       struct dccp_request_sock *dreq = dccp_rsk(req);
+       bool own_req;
+ 
++      /* TCP/DCCP listeners became lockless.
++       * DCCP stores complex state in its request_sock, so we need
++       * a protection for them, now this code runs without being protected
++       * by the parent (listener) lock.
++       */
++      spin_lock_bh(&dreq->dreq_lock);
++
+       /* Check for retransmitted REQUEST */
+       if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) {
+ 
+@@ -160,7 +167,7 @@ struct sock *dccp_check_req(struct sock
+                       inet_rtx_syn_ack(sk, req);
+               }
+               /* Network Duplicate, discard packet */
+-              return NULL;
++              goto out;
+       }
+ 
+       DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR;
+@@ -186,20 +193,20 @@ struct sock *dccp_check_req(struct sock
+ 
+       child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
+                                                        req, &own_req);
+-      if (!child)
+-              goto listen_overflow;
+-
+-      return inet_csk_complete_hashdance(sk, child, req, own_req);
++      if (child) {
++              child = inet_csk_complete_hashdance(sk, child, req, own_req);
++              goto out;
++      }
+ 
+-listen_overflow:
+-      dccp_pr_debug("listen_overflow!\n");
+       DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
+ drop:
+       if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET)
+               req->rsk_ops->send_reset(sk, skb);
+ 
+       inet_csk_reqsk_queue_drop(sk, req);
+-      return NULL;
++out:
++      spin_unlock_bh(&dreq->dreq_lock);
++      return child;
+ }
+ 
+ EXPORT_SYMBOL_GPL(dccp_check_req);
+@@ -250,6 +257,7 @@ int dccp_reqsk_init(struct request_sock
+ {
+       struct dccp_request_sock *dreq = dccp_rsk(req);
+ 
++      spin_lock_init(&dreq->dreq_lock);
+       inet_rsk(req)->ir_rmt_port = dccp_hdr(skb)->dccph_sport;
+       inet_rsk(req)->ir_num      = ntohs(dccp_hdr(skb)->dccph_dport);
+       inet_rsk(req)->acked       = 0;
diff --git a/queue-4.9/dccp-tcp-fix-routing-redirect-race.patch b/queue-4.9/dccp-tcp-fix-routing-redirect-race.patch

new file mode 100644 (file)

index 0000000..4b0ce16
--- /dev/null
+++ b/queue-4.9/dccp-tcp-fix-routing-redirect-race.patch
@@ -0,0 +1,160 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Jon Maxwell <jmaxwell37@gmail.com>
+Date: Fri, 10 Mar 2017 16:40:33 +1100
+Subject: dccp/tcp: fix routing redirect race
+
+From: Jon Maxwell <jmaxwell37@gmail.com>
+
+
+[ Upstream commit 45caeaa5ac0b4b11784ac6f932c0ad4c6b67cda0 ]
+
+As Eric Dumazet pointed out this also needs to be fixed in IPv6.
+v2: Contains the IPv6 tcp/IPv6 dccp patches as well.
+
+We have seen a few incidents lately where a dst_entry has been freed
+with a dangling TCP socket reference (sk->sk_dst_cache) pointing to that
+dst_entry. If the conditions/timings are right a crash then ensues when the
+freed dst_entry is referenced later on. A common crashing backtrace is:
+
+ #8 [] page_fault at ffffffff8163e648
+    [exception RIP: __tcp_ack_snd_check+74]
+.
+.
+ #9 [] tcp_rcv_established at ffffffff81580b64
+#10 [] tcp_v4_do_rcv at ffffffff8158b54a
+#11 [] tcp_v4_rcv at ffffffff8158cd02
+#12 [] ip_local_deliver_finish at ffffffff815668f4
+#13 [] ip_local_deliver at ffffffff81566bd9
+#14 [] ip_rcv_finish at ffffffff8156656d
+#15 [] ip_rcv at ffffffff81566f06
+#16 [] __netif_receive_skb_core at ffffffff8152b3a2
+#17 [] __netif_receive_skb at ffffffff8152b608
+#18 [] netif_receive_skb at ffffffff8152b690
+#19 [] vmxnet3_rq_rx_complete at ffffffffa015eeaf [vmxnet3]
+#20 [] vmxnet3_poll_rx_only at ffffffffa015f32a [vmxnet3]
+#21 [] net_rx_action at ffffffff8152bac2
+#22 [] __do_softirq at ffffffff81084b4f
+#23 [] call_softirq at ffffffff8164845c
+#24 [] do_softirq at ffffffff81016fc5
+#25 [] irq_exit at ffffffff81084ee5
+#26 [] do_IRQ at ffffffff81648ff8
+
+Of course it may happen with other NIC drivers as well.
+
+It's found the freed dst_entry here:
+
+ 224 static bool tcp_in_quickack_mode(struct sock *sk)
+ 225 {
+ 226         const struct inet_connection_sock *icsk = inet_csk(sk);
+ 227         const struct dst_entry *dst = __sk_dst_get(sk);
+ 228
+ 229         return (dst && dst_metric(dst, RTAX_QUICKACK)) ||
+ 230                 (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong);
+ 231 }
+
+But there are other backtraces attributed to the same freed dst_entry in
+netfilter code as well.
+
+All the vmcores showed 2 significant clues:
+
+- Remote hosts behind the default gateway had always been redirected to a
+different gateway. A rtable/dst_entry will be added for that host. Making
+more dst_entrys with lower reference counts. Making this more probable.
+
+- All vmcores showed a positive LockDroppedIcmps value, e.g.:
+
+LockDroppedIcmps                  267
+
+A closer look at the tcp_v4_err() handler revealed that do_redirect() will run
+regardless of whether user space has the socket locked. This can result in a
+race condition where the same dst_entry cached in sk->sk_dst_cache can be
+decremented twice for the same socket via:
+
+do_redirect()->__sk_dst_check()-> dst_release().
+
+Which leads to the dst_entry being prematurely freed with another socket
+pointing to it via sk->sk_dst_cache and a subsequent crash.
+
+To fix this skip do_redirect() if user space has the socket locked. Instead let
+the redirect take place later when user space does not have the socket
+locked.
+
+The dccp/IPv6 code is very similar in this respect, so fixing it there too.
+
+As Eric Garver pointed out the following commit now invalidates routes. Which
+can set the dst->obsolete flag so that ipv4_dst_check() returns null and
+triggers the dst_release().
+
+Fixes: ceb3320610d6 ("ipv4: Kill routes during PMTU/redirect updates.")
+Cc: Eric Garver <egarver@redhat.com>
+Cc: Hannes Sowa <hsowa@redhat.com>
+Signed-off-by: Jon Maxwell <jmaxwell37@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/dccp/ipv4.c     |    3 ++-
+ net/dccp/ipv6.c     |    8 +++++---
+ net/ipv4/tcp_ipv4.c |    3 ++-
+ net/ipv6/tcp_ipv6.c |    8 +++++---
+ 4 files changed, 14 insertions(+), 8 deletions(-)
+
+--- a/net/dccp/ipv4.c
++++ b/net/dccp/ipv4.c
+@@ -289,7 +289,8 @@ static void dccp_v4_err(struct sk_buff *
+ 
+       switch (type) {
+       case ICMP_REDIRECT:
+-              dccp_do_redirect(skb, sk);
++              if (!sock_owned_by_user(sk))
++                      dccp_do_redirect(skb, sk);
+               goto out;
+       case ICMP_SOURCE_QUENCH:
+               /* Just silently ignore these. */
+--- a/net/dccp/ipv6.c
++++ b/net/dccp/ipv6.c
+@@ -122,10 +122,12 @@ static void dccp_v6_err(struct sk_buff *
+       np = inet6_sk(sk);
+ 
+       if (type == NDISC_REDIRECT) {
+-              struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
++              if (!sock_owned_by_user(sk)) {
++                      struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
+ 
+-              if (dst)
+-                      dst->ops->redirect(dst, sk, skb);
++                      if (dst)
++                              dst->ops->redirect(dst, sk, skb);
++              }
+               goto out;
+       }
+ 
+--- a/net/ipv4/tcp_ipv4.c
++++ b/net/ipv4/tcp_ipv4.c
+@@ -421,7 +421,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb
+ 
+       switch (type) {
+       case ICMP_REDIRECT:
+-              do_redirect(icmp_skb, sk);
++              if (!sock_owned_by_user(sk))
++                      do_redirect(icmp_skb, sk);
+               goto out;
+       case ICMP_SOURCE_QUENCH:
+               /* Just silently ignore these. */
+--- a/net/ipv6/tcp_ipv6.c
++++ b/net/ipv6/tcp_ipv6.c
+@@ -375,10 +375,12 @@ static void tcp_v6_err(struct sk_buff *s
+       np = inet6_sk(sk);
+ 
+       if (type == NDISC_REDIRECT) {
+-              struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
++              if (!sock_owned_by_user(sk)) {
++                      struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie);
+ 
+-              if (dst)
+-                      dst->ops->redirect(dst, sk, skb);
++                      if (dst)
++                              dst->ops->redirect(dst, sk, skb);
++              }
+               goto out;
+       }
+ 
diff --git a/queue-4.9/dccp-unlock-sock-before-calling-sk_free.patch b/queue-4.9/dccp-unlock-sock-before-calling-sk_free.patch

new file mode 100644 (file)

index 0000000..9153f58
--- /dev/null
+++ b/queue-4.9/dccp-unlock-sock-before-calling-sk_free.patch
@@ -0,0 +1,81 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Arnaldo Carvalho de Melo <acme@redhat.com>
+Date: Wed, 1 Mar 2017 16:35:07 -0300
+Subject: dccp: Unlock sock before calling sk_free()
+
+From: Arnaldo Carvalho de Melo <acme@redhat.com>
+
+
+[ Upstream commit d5afb6f9b6bb2c57bd0c05e76e12489dc0d037d9 ]
+
+The code where sk_clone() came from created a new socket and locked it,
+but then, on the error path didn't unlock it.
+
+This problem stayed there for a long while, till b0691c8ee7c2 ("net:
+Unlock sock before calling sk_free()") fixed it, but unfortunately the
+callers of sk_clone() (now sk_clone_locked()) were not audited and the
+one in dccp_create_openreq_child() remained.
+
+Now in the age of the syzkaller fuzzer, this was finally uncovered, as
+reported by Dmitry:
+
+ ---- 8< ----
+
+I've got the following report while running syzkaller fuzzer on
+86292b33d4b7 ("Merge branch 'akpm' (patches from Andrew)")
+
+  [ BUG: held lock freed! ]
+  4.10.0+ #234 Not tainted
+  -------------------------
+  syz-executor6/6898 is freeing memory
+  ffff88006286cac0-ffff88006286d3b7, with a lock still held there!
+   (slock-AF_INET6){+.-...}, at: [<ffffffff8362c2c9>] spin_lock
+  include/linux/spinlock.h:299 [inline]
+   (slock-AF_INET6){+.-...}, at: [<ffffffff8362c2c9>]
+  sk_clone_lock+0x3d9/0x12c0 net/core/sock.c:1504
+  5 locks held by syz-executor6/6898:
+   #0:  (sk_lock-AF_INET6){+.+.+.}, at: [<ffffffff839a34b4>] lock_sock
+  include/net/sock.h:1460 [inline]
+   #0:  (sk_lock-AF_INET6){+.+.+.}, at: [<ffffffff839a34b4>]
+  inet_stream_connect+0x44/0xa0 net/ipv4/af_inet.c:681
+   #1:  (rcu_read_lock){......}, at: [<ffffffff83bc1c2a>]
+  inet6_csk_xmit+0x12a/0x5d0 net/ipv6/inet6_connection_sock.c:126
+   #2:  (rcu_read_lock){......}, at: [<ffffffff8369b424>] __skb_unlink
+  include/linux/skbuff.h:1767 [inline]
+   #2:  (rcu_read_lock){......}, at: [<ffffffff8369b424>] __skb_dequeue
+  include/linux/skbuff.h:1783 [inline]
+   #2:  (rcu_read_lock){......}, at: [<ffffffff8369b424>]
+  process_backlog+0x264/0x730 net/core/dev.c:4835
+   #3:  (rcu_read_lock){......}, at: [<ffffffff83aeb5c0>]
+  ip6_input_finish+0x0/0x1700 net/ipv6/ip6_input.c:59
+   #4:  (slock-AF_INET6){+.-...}, at: [<ffffffff8362c2c9>] spin_lock
+  include/linux/spinlock.h:299 [inline]
+   #4:  (slock-AF_INET6){+.-...}, at: [<ffffffff8362c2c9>]
+  sk_clone_lock+0x3d9/0x12c0 net/core/sock.c:1504
+
+Fix it just like was done by b0691c8ee7c2 ("net: Unlock sock before calling
+sk_free()").
+
+Reported-by: Dmitry Vyukov <dvyukov@google.com>
+Cc: Cong Wang <xiyou.wangcong@gmail.com>
+Cc: Eric Dumazet <edumazet@google.com>
+Cc: Gerrit Renker <gerrit@erg.abdn.ac.uk>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/20170301153510.GE15145@kernel.org
+Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/dccp/minisocks.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/net/dccp/minisocks.c
++++ b/net/dccp/minisocks.c
+@@ -122,6 +122,7 @@ struct sock *dccp_create_openreq_child(c
+                       /* It is still raw copy of parent, so invalidate
+                        * destructor and make plain sk_free() */
+                       newsk->sk_destruct = NULL;
++                      bh_unlock_sock(newsk);
+                       sk_free(newsk);
+                       return NULL;
+               }
diff --git a/queue-4.9/geneve-lock-rcu-on-tx-path.patch b/queue-4.9/geneve-lock-rcu-on-tx-path.patch

new file mode 100644 (file)

index 0000000..2cd0330
--- /dev/null
+++ b/queue-4.9/geneve-lock-rcu-on-tx-path.patch
@@ -0,0 +1,48 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Jakub Kicinski <jakub.kicinski@netronome.com>
+Date: Fri, 24 Feb 2017 11:43:37 -0800
+Subject: geneve: lock RCU on TX path
+
+From: Jakub Kicinski <jakub.kicinski@netronome.com>
+
+
+[ Upstream commit a717e3f740803cc88bd5c9a70c93504f6a368663 ]
+
+There are no guarantees that callers of the TX path will hold
+the RCU lock.  Grab it explicitly.
+
+Fixes: fceb9c3e3825 ("geneve: avoid using stale geneve socket.")
+Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/geneve.c |   10 ++++++++--
+ 1 file changed, 8 insertions(+), 2 deletions(-)
+
+--- a/drivers/net/geneve.c
++++ b/drivers/net/geneve.c
+@@ -1039,16 +1039,22 @@ static netdev_tx_t geneve_xmit(struct sk
+ {
+       struct geneve_dev *geneve = netdev_priv(dev);
+       struct ip_tunnel_info *info = NULL;
++      int err;
+ 
+       if (geneve->collect_md)
+               info = skb_tunnel_info(skb);
+ 
++      rcu_read_lock();
+ #if IS_ENABLED(CONFIG_IPV6)
+       if ((info && ip_tunnel_info_af(info) == AF_INET6) ||
+           (!info && geneve->remote.sa.sa_family == AF_INET6))
+-              return geneve6_xmit_skb(skb, dev, info);
++              err = geneve6_xmit_skb(skb, dev, info);
++      else
+ #endif
+-      return geneve_xmit_skb(skb, dev, info);
++              err = geneve_xmit_skb(skb, dev, info);
++      rcu_read_unlock();
++
++      return err;
+ }
+ 
+ static int __geneve_change_mtu(struct net_device *dev, int new_mtu, bool strict)
diff --git a/queue-4.9/ipv4-mask-tos-for-input-route.patch b/queue-4.9/ipv4-mask-tos-for-input-route.patch

new file mode 100644 (file)

index 0000000..b3946a5
--- /dev/null
+++ b/queue-4.9/ipv4-mask-tos-for-input-route.patch
@@ -0,0 +1,35 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Julian Anastasov <ja@ssi.bg>
+Date: Sun, 26 Feb 2017 17:14:35 +0200
+Subject: ipv4: mask tos for input route
+
+From: Julian Anastasov <ja@ssi.bg>
+
+
+[ Upstream commit 6e28099d38c0e50d62c1afc054e37e573adf3d21 ]
+
+Restore the lost masking of TOS in input route code to
+allow ip rules to match it properly.
+
+Problem [1] noticed by Shmulik Ladkani <shmulik.ladkani@gmail.com>
+
+[1] http://marc.info/?t=137331755300040&r=1&w=2
+
+Fixes: 89aef8921bfb ("ipv4: Delete routing cache.")
+Signed-off-by: Julian Anastasov <ja@ssi.bg>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/route.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/net/ipv4/route.c
++++ b/net/ipv4/route.c
+@@ -1968,6 +1968,7 @@ int ip_route_input_noref(struct sk_buff
+ {
+       int res;
+ 
++      tos &= IPTOS_RT_MASK;
+       rcu_read_lock();
+ 
+       /* Multicast recognition logic is moved from route cache to here.
diff --git a/queue-4.9/ipv6-avoid-write-to-a-possibly-cloned-skb.patch b/queue-4.9/ipv6-avoid-write-to-a-possibly-cloned-skb.patch

new file mode 100644 (file)

index 0000000..ceafd11
--- /dev/null
+++ b/queue-4.9/ipv6-avoid-write-to-a-possibly-cloned-skb.patch
@@ -0,0 +1,65 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Florian Westphal <fw@strlen.de>
+Date: Mon, 13 Mar 2017 16:24:28 +0100
+Subject: ipv6: avoid write to a possibly cloned skb
+
+From: Florian Westphal <fw@strlen.de>
+
+
+[ Upstream commit 79e49503efe53a8c51d8b695bedc8a346c5e4a87 ]
+
+ip6_fragment, in case skb has a fraglist, checks if the
+skb is cloned.  If it is, it will move to the 'slow path' and allocates
+new skbs for each fragment.
+
+However, right before entering the slowpath loop, it updates the
+nexthdr value of the last ipv6 extension header to NEXTHDR_FRAGMENT,
+to account for the fragment header that will be inserted in the new
+ipv6-fragment skbs.
+
+In case original skb is cloned this munges nexthdr value of another
+skb.  Avoid this by doing the nexthdr update for each of the new fragment
+skbs separately.
+
+This was observed with tcpdump on a bridge device where netfilter ipv6
+reassembly is active:  tcpdump shows malformed fragment headers as
+the l4 header (icmpv6, tcp, etc.) is decoded as a fragment header.
+
+Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
+Reported-by: Andreas Karis <akaris@redhat.com>
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv6/ip6_output.c |    7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+--- a/net/ipv6/ip6_output.c
++++ b/net/ipv6/ip6_output.c
+@@ -757,13 +757,14 @@ slow_path:
+        *      Fragment the datagram.
+        */
+ 
+-      *prevhdr = NEXTHDR_FRAGMENT;
+       troom = rt->dst.dev->needed_tailroom;
+ 
+       /*
+        *      Keep copying data until we run out.
+        */
+       while (left > 0)        {
++              u8 *fragnexthdr_offset;
++
+               len = left;
+               /* IF: it doesn't fit, use 'mtu' - the data space left */
+               if (len > mtu)
+@@ -808,6 +809,10 @@ slow_path:
+                */
+               skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
+ 
++              fragnexthdr_offset = skb_network_header(frag);
++              fragnexthdr_offset += prevhdr - skb_network_header(skb);
++              *fragnexthdr_offset = NEXTHDR_FRAGMENT;
++
+               /*
+                *      Build fragment header.
+                */
diff --git a/queue-4.9/ipv6-make-ecmp-route-replacement-less-greedy.patch b/queue-4.9/ipv6-make-ecmp-route-replacement-less-greedy.patch

new file mode 100644 (file)

index 0000000..f6ff821
--- /dev/null
+++ b/queue-4.9/ipv6-make-ecmp-route-replacement-less-greedy.patch
@@ -0,0 +1,71 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Sabrina Dubroca <sd@queasysnail.net>
+Date: Mon, 13 Mar 2017 13:28:09 +0100
+Subject: ipv6: make ECMP route replacement less greedy
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+
+[ Upstream commit 67e194007be08d071294456274dd53e0a04fdf90 ]
+
+Commit 27596472473a ("ipv6: fix ECMP route replacement") introduced a
+loop that removes all siblings of an ECMP route that is being
+replaced. However, this loop doesn't stop when it has replaced
+siblings, and keeps removing other routes with a higher metric.
+We also end up triggering the WARN_ON after the loop, because after
+this nsiblings < 0.
+
+Instead, stop the loop when we have taken care of all routes with the
+same metric as the route being replaced.
+
+  Reproducer:
+  ===========
+    #!/bin/sh
+
+    ip netns add ns1
+    ip netns add ns2
+    ip -net ns1 link set lo up
+
+    for x in 0 1 2 ; do
+        ip link add veth$x netns ns2 type veth peer name eth$x netns ns1
+        ip -net ns1 link set eth$x up
+        ip -net ns2 link set veth$x up
+    done
+
+    ip -net ns1 -6 r a 2000::/64 nexthop via fe80::0 dev eth0 \
+            nexthop via fe80::1 dev eth1 nexthop via fe80::2 dev eth2
+    ip -net ns1 -6 r a 2000::/64 via fe80::42 dev eth0 metric 256
+    ip -net ns1 -6 r a 2000::/64 via fe80::43 dev eth0 metric 2048
+
+    echo "before replace, 3 routes"
+    ip -net ns1 -6 r | grep -v '^fe80\|^ff00'
+    echo
+
+    ip -net ns1 -6 r c 2000::/64 nexthop via fe80::4 dev eth0 \
+            nexthop via fe80::5 dev eth1 nexthop via fe80::6 dev eth2
+
+    echo "after replace, only 2 routes, metric 2048 is gone"
+    ip -net ns1 -6 r | grep -v '^fe80\|^ff00'
+
+Fixes: 27596472473a ("ipv6: fix ECMP route replacement")
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
+Reviewed-by: Xin Long <lucien.xin@gmail.com>
+Reviewed-by: Michal Kubecek <mkubecek@suse.cz>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv6/ip6_fib.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/net/ipv6/ip6_fib.c
++++ b/net/ipv6/ip6_fib.c
+@@ -908,6 +908,8 @@ add:
+                       ins = &rt->dst.rt6_next;
+                       iter = *ins;
+                       while (iter) {
++                              if (iter->rt6i_metric > rt->rt6i_metric)
++                                      break;
+                               if (rt6_qualify_for_ecmp(iter)) {
+                                       *ins = iter->dst.rt6_next;
+                                       fib6_purge_rt(iter, fn, info->nl_net);
diff --git a/queue-4.9/ipv6-orphan-skbs-in-reassembly-unit.patch b/queue-4.9/ipv6-orphan-skbs-in-reassembly-unit.patch

new file mode 100644 (file)

index 0000000..6a88a73
--- /dev/null
+++ b/queue-4.9/ipv6-orphan-skbs-in-reassembly-unit.patch
@@ -0,0 +1,172 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 1 Mar 2017 14:45:06 -0800
+Subject: ipv6: orphan skbs in reassembly unit
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit 48cac18ecf1de82f76259a54402c3adb7839ad01 ]
+
+Andrey reported a use-after-free in IPv6 stack.
+
+The issue here is that we free the socket while it still has skbs
+in the TX path and in some queues.
+
+It happens here because the IPv6 reassembly unit messes with skb->truesize,
+breaking skb_set_owner_w() badly.
+
+We fixed a similar issue for IPV4 in commit 8282f27449bf ("inet: frag:
+Always orphan skbs inside ip_defrag()")
+Acked-by: Joe Stringer <joe@ovn.org>
+
+==================================================================
+BUG: KASAN: use-after-free in sock_wfree+0x118/0x120
+Read of size 8 at addr ffff880062da0060 by task a.out/4140
+
+page:ffffea00018b6800 count:1 mapcount:0 mapping:          (null)
+index:0x0 compound_mapcount: 0
+flags: 0x100000000008100(slab|head)
+raw: 0100000000008100 0000000000000000 0000000000000000 0000000180130013
+raw: dead000000000100 dead000000000200 ffff88006741f140 0000000000000000
+page dumped because: kasan: bad access detected
+
+CPU: 0 PID: 4140 Comm: a.out Not tainted 4.10.0-rc3+ #59
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
+Call Trace:
+ __dump_stack lib/dump_stack.c:15
+ dump_stack+0x292/0x398 lib/dump_stack.c:51
+ describe_address mm/kasan/report.c:262
+ kasan_report_error+0x121/0x560 mm/kasan/report.c:370
+ kasan_report mm/kasan/report.c:392
+ __asan_report_load8_noabort+0x3e/0x40 mm/kasan/report.c:413
+ sock_flag ./arch/x86/include/asm/bitops.h:324
+ sock_wfree+0x118/0x120 net/core/sock.c:1631
+ skb_release_head_state+0xfc/0x250 net/core/skbuff.c:655
+ skb_release_all+0x15/0x60 net/core/skbuff.c:668
+ __kfree_skb+0x15/0x20 net/core/skbuff.c:684
+ kfree_skb+0x16e/0x4e0 net/core/skbuff.c:705
+ inet_frag_destroy+0x121/0x290 net/ipv4/inet_fragment.c:304
+ inet_frag_put ./include/net/inet_frag.h:133
+ nf_ct_frag6_gather+0x1125/0x38b0 net/ipv6/netfilter/nf_conntrack_reasm.c:617
+ ipv6_defrag+0x21b/0x350 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c:68
+ nf_hook_entry_hookfn ./include/linux/netfilter.h:102
+ nf_hook_slow+0xc3/0x290 net/netfilter/core.c:310
+ nf_hook ./include/linux/netfilter.h:212
+ __ip6_local_out+0x52c/0xaf0 net/ipv6/output_core.c:160
+ ip6_local_out+0x2d/0x170 net/ipv6/output_core.c:170
+ ip6_send_skb+0xa1/0x340 net/ipv6/ip6_output.c:1722
+ ip6_push_pending_frames+0xb3/0xe0 net/ipv6/ip6_output.c:1742
+ rawv6_push_pending_frames net/ipv6/raw.c:613
+ rawv6_sendmsg+0x2cff/0x4130 net/ipv6/raw.c:927
+ inet_sendmsg+0x164/0x5b0 net/ipv4/af_inet.c:744
+ sock_sendmsg_nosec net/socket.c:635
+ sock_sendmsg+0xca/0x110 net/socket.c:645
+ sock_write_iter+0x326/0x620 net/socket.c:848
+ new_sync_write fs/read_write.c:499
+ __vfs_write+0x483/0x760 fs/read_write.c:512
+ vfs_write+0x187/0x530 fs/read_write.c:560
+ SYSC_write fs/read_write.c:607
+ SyS_write+0xfb/0x230 fs/read_write.c:599
+ entry_SYSCALL_64_fastpath+0x1f/0xc2 arch/x86/entry/entry_64.S:203
+RIP: 0033:0x7ff26e6f5b79
+RSP: 002b:00007ff268e0ed98 EFLAGS: 00000206 ORIG_RAX: 0000000000000001
+RAX: ffffffffffffffda RBX: 00007ff268e0f9c0 RCX: 00007ff26e6f5b79
+RDX: 0000000000000010 RSI: 0000000020f50fe1 RDI: 0000000000000003
+RBP: 00007ff26ebc1220 R08: 0000000000000000 R09: 0000000000000000
+R10: 0000000000000000 R11: 0000000000000206 R12: 0000000000000000
+R13: 00007ff268e0f9c0 R14: 00007ff26efec040 R15: 0000000000000003
+
+The buggy address belongs to the object at ffff880062da0000
+ which belongs to the cache RAWv6 of size 1504
+The buggy address ffff880062da0060 is located 96 bytes inside
+ of 1504-byte region [ffff880062da0000, ffff880062da05e0)
+
+Freed by task 4113:
+ save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:57
+ save_stack+0x43/0xd0 mm/kasan/kasan.c:502
+ set_track mm/kasan/kasan.c:514
+ kasan_slab_free+0x73/0xc0 mm/kasan/kasan.c:578
+ slab_free_hook mm/slub.c:1352
+ slab_free_freelist_hook mm/slub.c:1374
+ slab_free mm/slub.c:2951
+ kmem_cache_free+0xb2/0x2c0 mm/slub.c:2973
+ sk_prot_free net/core/sock.c:1377
+ __sk_destruct+0x49c/0x6e0 net/core/sock.c:1452
+ sk_destruct+0x47/0x80 net/core/sock.c:1460
+ __sk_free+0x57/0x230 net/core/sock.c:1468
+ sk_free+0x23/0x30 net/core/sock.c:1479
+ sock_put ./include/net/sock.h:1638
+ sk_common_release+0x31e/0x4e0 net/core/sock.c:2782
+ rawv6_close+0x54/0x80 net/ipv6/raw.c:1214
+ inet_release+0xed/0x1c0 net/ipv4/af_inet.c:425
+ inet6_release+0x50/0x70 net/ipv6/af_inet6.c:431
+ sock_release+0x8d/0x1e0 net/socket.c:599
+ sock_close+0x16/0x20 net/socket.c:1063
+ __fput+0x332/0x7f0 fs/file_table.c:208
+ ____fput+0x15/0x20 fs/file_table.c:244
+ task_work_run+0x19b/0x270 kernel/task_work.c:116
+ exit_task_work ./include/linux/task_work.h:21
+ do_exit+0x186b/0x2800 kernel/exit.c:839
+ do_group_exit+0x149/0x420 kernel/exit.c:943
+ SYSC_exit_group kernel/exit.c:954
+ SyS_exit_group+0x1d/0x20 kernel/exit.c:952
+ entry_SYSCALL_64_fastpath+0x1f/0xc2 arch/x86/entry/entry_64.S:203
+
+Allocated by task 4115:
+ save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:57
+ save_stack+0x43/0xd0 mm/kasan/kasan.c:502
+ set_track mm/kasan/kasan.c:514
+ kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:605
+ kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:544
+ slab_post_alloc_hook mm/slab.h:432
+ slab_alloc_node mm/slub.c:2708
+ slab_alloc mm/slub.c:2716
+ kmem_cache_alloc+0x1af/0x250 mm/slub.c:2721
+ sk_prot_alloc+0x65/0x2a0 net/core/sock.c:1334
+ sk_alloc+0x105/0x1010 net/core/sock.c:1396
+ inet6_create+0x44d/0x1150 net/ipv6/af_inet6.c:183
+ __sock_create+0x4f6/0x880 net/socket.c:1199
+ sock_create net/socket.c:1239
+ SYSC_socket net/socket.c:1269
+ SyS_socket+0xf9/0x230 net/socket.c:1249
+ entry_SYSCALL_64_fastpath+0x1f/0xc2 arch/x86/entry/entry_64.S:203
+
+Memory state around the buggy address:
+ ffff880062d9ff00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ ffff880062d9ff80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+>ffff880062da0000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+                                                       ^
+ ffff880062da0080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+ ffff880062da0100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+==================================================================
+
+Reported-by: Andrey Konovalov <andreyknvl@google.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv6/netfilter/nf_conntrack_reasm.c |    1 +
+ net/openvswitch/conntrack.c             |    1 -
+ 2 files changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
++++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
+@@ -589,6 +589,7 @@ int nf_ct_frag6_gather(struct net *net,
+       hdr = ipv6_hdr(skb);
+       fhdr = (struct frag_hdr *)skb_transport_header(skb);
+ 
++      skb_orphan(skb);
+       fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr,
+                    skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
+       if (fq == NULL) {
+--- a/net/openvswitch/conntrack.c
++++ b/net/openvswitch/conntrack.c
+@@ -367,7 +367,6 @@ static int handle_fragments(struct net *
+       } else if (key->eth.type == htons(ETH_P_IPV6)) {
+               enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
+ 
+-              skb_orphan(skb);
+               memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
+               err = nf_ct_frag6_gather(net, skb, user);
+               if (err) {
diff --git a/queue-4.9/l2tp-avoid-use-after-free-caused-by-l2tp_ip_backlog_recv.patch b/queue-4.9/l2tp-avoid-use-after-free-caused-by-l2tp_ip_backlog_recv.patch

new file mode 100644 (file)

index 0000000..f7af697
--- /dev/null
+++ b/queue-4.9/l2tp-avoid-use-after-free-caused-by-l2tp_ip_backlog_recv.patch
@@ -0,0 +1,32 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Paul Hüber <phueber@kernsp.in>
+Date: Sun, 26 Feb 2017 17:58:19 +0100
+Subject: l2tp: avoid use-after-free caused by l2tp_ip_backlog_recv
+
+From: Paul Hüber <phueber@kernsp.in>
+
+
+[ Upstream commit 51fb60eb162ab84c5edf2ae9c63cf0b878e5547e ]
+
+l2tp_ip_backlog_recv must not return -1 if the packet gets dropped.
+The return value is passed up to ip_local_deliver_finish, which treats
+negative values as an IP protocol number for resubmission.
+
+Signed-off-by: Paul Hüber <phueber@kernsp.in>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/l2tp/l2tp_ip.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/l2tp/l2tp_ip.c
++++ b/net/l2tp/l2tp_ip.c
+@@ -388,7 +388,7 @@ static int l2tp_ip_backlog_recv(struct s
+ drop:
+       IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS);
+       kfree_skb(skb);
+-      return -1;
++      return 0;
+ }
+ 
+ /* Userspace will call sendmsg() on the tunnel socket to send L2TP
diff --git a/queue-4.9/mlxsw-spectrum_router-avoid-potential-packets-loss.patch b/queue-4.9/mlxsw-spectrum_router-avoid-potential-packets-loss.patch

new file mode 100644 (file)

index 0000000..90b49a8
--- /dev/null
+++ b/queue-4.9/mlxsw-spectrum_router-avoid-potential-packets-loss.patch
@@ -0,0 +1,78 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Ido Schimmel <idosch@mellanox.com>
+Date: Tue, 28 Feb 2017 08:55:40 +0100
+Subject: mlxsw: spectrum_router: Avoid potential packets loss
+
+From: Ido Schimmel <idosch@mellanox.com>
+
+
+[ Upstream commit f7df4923fa986247e93ec2cdff5ca168fff14dcf ]
+
+When the structure of the LPM tree changes (f.e., due to the addition of
+a new prefix), we unbind the old tree and then bind the new one. This
+may result in temporary packet loss.
+
+Instead, overwrite the old binding with the new one.
+
+Fixes: 6b75c4807db3 ("mlxsw: spectrum_router: Add virtual router management")
+Signed-off-by: Ido Schimmel <idosch@mellanox.com>
+Signed-off-by: Jiri Pirko <jiri@mellanox.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c |   30 ++++++++++++------
+ 1 file changed, 20 insertions(+), 10 deletions(-)
+
+--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
++++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+@@ -500,30 +500,40 @@ static int
+ mlxsw_sp_vr_lpm_tree_check(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_vr *vr,
+                          struct mlxsw_sp_prefix_usage *req_prefix_usage)
+ {
+-      struct mlxsw_sp_lpm_tree *lpm_tree;
++      struct mlxsw_sp_lpm_tree *lpm_tree = vr->lpm_tree;
++      struct mlxsw_sp_lpm_tree *new_tree;
++      int err;
+ 
+-      if (mlxsw_sp_prefix_usage_eq(req_prefix_usage,
+-                                   &vr->lpm_tree->prefix_usage))
++      if (mlxsw_sp_prefix_usage_eq(req_prefix_usage, &lpm_tree->prefix_usage))
+               return 0;
+ 
+-      lpm_tree = mlxsw_sp_lpm_tree_get(mlxsw_sp, req_prefix_usage,
++      new_tree = mlxsw_sp_lpm_tree_get(mlxsw_sp, req_prefix_usage,
+                                        vr->proto, false);
+-      if (IS_ERR(lpm_tree)) {
++      if (IS_ERR(new_tree)) {
+               /* We failed to get a tree according to the required
+                * prefix usage. However, the current tree might be still good
+                * for us if our requirement is subset of the prefixes used
+                * in the tree.
+                */
+               if (mlxsw_sp_prefix_usage_subset(req_prefix_usage,
+-                                               &vr->lpm_tree->prefix_usage))
++                                               &lpm_tree->prefix_usage))
+                       return 0;
+-              return PTR_ERR(lpm_tree);
++              return PTR_ERR(new_tree);
+       }
+ 
+-      mlxsw_sp_vr_lpm_tree_unbind(mlxsw_sp, vr);
+-      mlxsw_sp_lpm_tree_put(mlxsw_sp, vr->lpm_tree);
++      /* Prevent packet loss by overwriting existing binding */
++      vr->lpm_tree = new_tree;
++      err = mlxsw_sp_vr_lpm_tree_bind(mlxsw_sp, vr);
++      if (err)
++              goto err_tree_bind;
++      mlxsw_sp_lpm_tree_put(mlxsw_sp, lpm_tree);
++
++      return 0;
++
++err_tree_bind:
+       vr->lpm_tree = lpm_tree;
+-      return mlxsw_sp_vr_lpm_tree_bind(mlxsw_sp, vr);
++      mlxsw_sp_lpm_tree_put(mlxsw_sp, new_tree);
++      return err;
+ }
+ 
+ static struct mlxsw_sp_vr *mlxsw_sp_vr_get(struct mlxsw_sp *mlxsw_sp,
diff --git a/queue-4.9/mpls-do-not-decrement-alive-counter-for-unregister-events.patch b/queue-4.9/mpls-do-not-decrement-alive-counter-for-unregister-events.patch

new file mode 100644 (file)

index 0000000..85ab3b5
--- /dev/null
+++ b/queue-4.9/mpls-do-not-decrement-alive-counter-for-unregister-events.patch
@@ -0,0 +1,53 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: David Ahern <dsa@cumulusnetworks.com>
+Date: Fri, 10 Mar 2017 14:11:39 -0800
+Subject: mpls: Do not decrement alive counter for unregister events
+
+From: David Ahern <dsa@cumulusnetworks.com>
+
+
+[ Upstream commit 79099aab38c8f5c746748b066ae74ba984fe2cc8 ]
+
+Multipath routes can be rendered useless when a device in one of the
+paths is deleted. For example:
+
+$ ip -f mpls ro ls
+100
+       nexthop as to 200 via inet 172.16.2.2  dev virt12
+       nexthop as to 300 via inet 172.16.3.2  dev br0
+101
+       nexthop as to 201 via inet6 2000:2::2  dev virt12
+       nexthop as to 301 via inet6 2000:3::2  dev br0
+
+$ ip li del br0
+
+When br0 is deleted the other hop is not considered in
+mpls_select_multipath because of the alive check -- rt_nhn_alive
+is 0.
+
+rt_nhn_alive is decremented once in mpls_ifdown when the device is taken
+down (NETDEV_DOWN) and again when it is deleted (NETDEV_UNREGISTER). For
+a 2 hop route, deleting one device drops the alive count to 0. Since
+devices are taken down before unregistering, the decrement on
+NETDEV_UNREGISTER is redundant.
+
+Fixes: c89359a42e2a4 ("mpls: support for dead routes")
+Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mpls/af_mpls.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/net/mpls/af_mpls.c
++++ b/net/mpls/af_mpls.c
+@@ -956,7 +956,8 @@ static void mpls_ifdown(struct net_devic
+                               /* fall through */
+                       case NETDEV_CHANGE:
+                               nh->nh_flags |= RTNH_F_LINKDOWN;
+-                              ACCESS_ONCE(rt->rt_nhn_alive) = rt->rt_nhn_alive - 1;
++                              if (event != NETDEV_UNREGISTER)
++                                      ACCESS_ONCE(rt->rt_nhn_alive) = rt->rt_nhn_alive - 1;
+                               break;
+                       }
+                       if (event == NETDEV_UNREGISTER)
diff --git a/queue-4.9/mpls-send-route-delete-notifications-when-router-module-is-unloaded.patch b/queue-4.9/mpls-send-route-delete-notifications-when-router-module-is-unloaded.patch

new file mode 100644 (file)

index 0000000..8e3aff3
--- /dev/null
+++ b/queue-4.9/mpls-send-route-delete-notifications-when-router-module-is-unloaded.patch
@@ -0,0 +1,33 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: David Ahern <dsa@cumulusnetworks.com>
+Date: Fri, 10 Mar 2017 09:46:15 -0800
+Subject: mpls: Send route delete notifications when router module is unloaded
+
+From: David Ahern <dsa@cumulusnetworks.com>
+
+
+[ Upstream commit e37791ec1ad785b59022ae211f63a16189bacebf ]
+
+When the mpls_router module is unloaded, mpls routes are deleted but
+notifications are not sent to userspace leaving userspace caches
+out of sync. Add the call to mpls_notify_route in mpls_net_exit as
+routes are freed.
+
+Fixes: 0189197f44160 ("mpls: Basic routing support")
+Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mpls/af_mpls.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/net/mpls/af_mpls.c
++++ b/net/mpls/af_mpls.c
+@@ -1696,6 +1696,7 @@ static void mpls_net_exit(struct net *ne
+       for (index = 0; index < platform_labels; index++) {
+               struct mpls_route *rt = rtnl_dereference(platform_label[index]);
+               RCU_INIT_POINTER(platform_label[index], NULL);
++              mpls_notify_route(net, index, rt, NULL, NULL);
+               mpls_rt_free(rt);
+       }
+       rtnl_unlock();
diff --git a/queue-4.9/net-bridge-allow-ipv6-when-multicast-flood-is-disabled.patch b/queue-4.9/net-bridge-allow-ipv6-when-multicast-flood-is-disabled.patch

new file mode 100644 (file)

index 0000000..416061f
--- /dev/null
+++ b/queue-4.9/net-bridge-allow-ipv6-when-multicast-flood-is-disabled.patch
@@ -0,0 +1,37 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Mike Manning <mmanning@brocade.com>
+Date: Wed, 1 Mar 2017 09:55:28 +0000
+Subject: net: bridge: allow IPv6 when multicast flood is disabled
+
+From: Mike Manning <mmanning@brocade.com>
+
+
+[ Upstream commit 8953de2f02ad7b15e4964c82f9afd60f128e4e98 ]
+
+Even with multicast flooding turned off, IPv6 ND should still work so
+that IPv6 connectivity is provided. Allow this by continuing to flood
+multicast traffic originated by us.
+
+Fixes: b6cb5ac8331b ("net: bridge: add per-port multicast flood flag")
+Cc: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
+Signed-off-by: Mike Manning <mmanning@brocade.com>
+Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/bridge/br_forward.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/net/bridge/br_forward.c
++++ b/net/bridge/br_forward.c
+@@ -186,8 +186,9 @@ void br_flood(struct net_bridge *br, str
+               /* Do not flood unicast traffic to ports that turn it off */
+               if (pkt_type == BR_PKT_UNICAST && !(p->flags & BR_FLOOD))
+                       continue;
++              /* Do not flood if mc off, except for traffic we originate */
+               if (pkt_type == BR_PKT_MULTICAST &&
+-                  !(p->flags & BR_MCAST_FLOOD))
++                  !(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev)
+                       continue;
+ 
+               /* Do not flood to ports that enable proxy ARP */
diff --git a/queue-4.9/net-don-t-call-strlen-on-the-user-buffer-in-packet_bind_spkt.patch b/queue-4.9/net-don-t-call-strlen-on-the-user-buffer-in-packet_bind_spkt.patch

new file mode 100644 (file)

index 0000000..1731bec
--- /dev/null
+++ b/queue-4.9/net-don-t-call-strlen-on-the-user-buffer-in-packet_bind_spkt.patch
@@ -0,0 +1,108 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Alexander Potapenko <glider@google.com>
+Date: Wed, 1 Mar 2017 12:57:20 +0100
+Subject: net: don't call strlen() on the user buffer in packet_bind_spkt()
+
+From: Alexander Potapenko <glider@google.com>
+
+
+[ Upstream commit 540e2894f7905538740aaf122bd8e0548e1c34a4 ]
+
+KMSAN (KernelMemorySanitizer, a new error detection tool) reports use of
+uninitialized memory in packet_bind_spkt():
+Acked-by: Eric Dumazet <edumazet@google.com>
+
+==================================================================
+BUG: KMSAN: use of unitialized memory
+CPU: 0 PID: 1074 Comm: packet Not tainted 4.8.0-rc6+ #1891
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs
+01/01/2011
+ 0000000000000000 ffff88006b6dfc08 ffffffff82559ae8 ffff88006b6dfb48
+ ffffffff818a7c91 ffffffff85b9c870 0000000000000092 ffffffff85b9c550
+ 0000000000000000 0000000000000092 00000000ec400911 0000000000000002
+Call Trace:
+ [<     inline     >] __dump_stack lib/dump_stack.c:15
+ [<ffffffff82559ae8>] dump_stack+0x238/0x290 lib/dump_stack.c:51
+ [<ffffffff818a6626>] kmsan_report+0x276/0x2e0 mm/kmsan/kmsan.c:1003
+ [<ffffffff818a783b>] __msan_warning+0x5b/0xb0
+mm/kmsan/kmsan_instr.c:424
+ [<     inline     >] strlen lib/string.c:484
+ [<ffffffff8259b58d>] strlcpy+0x9d/0x200 lib/string.c:144
+ [<ffffffff84b2eca4>] packet_bind_spkt+0x144/0x230
+net/packet/af_packet.c:3132
+ [<ffffffff84242e4d>] SYSC_bind+0x40d/0x5f0 net/socket.c:1370
+ [<ffffffff84242a22>] SyS_bind+0x82/0xa0 net/socket.c:1356
+ [<ffffffff8515991b>] entry_SYSCALL_64_fastpath+0x13/0x8f
+arch/x86/entry/entry_64.o:?
+chained origin: 00000000eba00911
+ [<ffffffff810bb787>] save_stack_trace+0x27/0x50
+arch/x86/kernel/stacktrace.c:67
+ [<     inline     >] kmsan_save_stack_with_flags mm/kmsan/kmsan.c:322
+ [<     inline     >] kmsan_save_stack mm/kmsan/kmsan.c:334
+ [<ffffffff818a59f8>] kmsan_internal_chain_origin+0x118/0x1e0
+mm/kmsan/kmsan.c:527
+ [<ffffffff818a7773>] __msan_set_alloca_origin4+0xc3/0x130
+mm/kmsan/kmsan_instr.c:380
+ [<ffffffff84242b69>] SYSC_bind+0x129/0x5f0 net/socket.c:1356
+ [<ffffffff84242a22>] SyS_bind+0x82/0xa0 net/socket.c:1356
+ [<ffffffff8515991b>] entry_SYSCALL_64_fastpath+0x13/0x8f
+arch/x86/entry/entry_64.o:?
+origin description: ----address@SYSC_bind (origin=00000000eb400911)
+==================================================================
+(the line numbers are relative to 4.8-rc6, but the bug persists
+upstream)
+
+, when I run the following program as root:
+
+=====================================
+ #include <string.h>
+ #include <sys/socket.h>
+ #include <netpacket/packet.h>
+ #include <net/ethernet.h>
+
+ int main() {
+   struct sockaddr addr;
+   memset(&addr, 0xff, sizeof(addr));
+   addr.sa_family = AF_PACKET;
+   int fd = socket(PF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
+   bind(fd, &addr, sizeof(addr));
+   return 0;
+ }
+=====================================
+
+This happens because addr.sa_data copied from the userspace is not
+zero-terminated, and copying it with strlcpy() in packet_bind_spkt()
+results in calling strlen() on the kernel copy of that non-terminated
+buffer.
+
+Signed-off-by: Alexander Potapenko <glider@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/packet/af_packet.c |    8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+--- a/net/packet/af_packet.c
++++ b/net/packet/af_packet.c
+@@ -3140,7 +3140,7 @@ static int packet_bind_spkt(struct socke
+                           int addr_len)
+ {
+       struct sock *sk = sock->sk;
+-      char name[15];
++      char name[sizeof(uaddr->sa_data) + 1];
+ 
+       /*
+        *      Check legality
+@@ -3148,7 +3148,11 @@ static int packet_bind_spkt(struct socke
+ 
+       if (addr_len != sizeof(struct sockaddr))
+               return -EINVAL;
+-      strlcpy(name, uaddr->sa_data, sizeof(name));
++      /* uaddr->sa_data comes from the userspace, it's not guaranteed to be
++       * zero-terminated.
++       */
++      memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
++      name[sizeof(uaddr->sa_data)] = 0;
+ 
+       return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
+ }
diff --git a/queue-4.9/net-fix-socket-refcounting-in-skb_complete_tx_timestamp.patch b/queue-4.9/net-fix-socket-refcounting-in-skb_complete_tx_timestamp.patch

new file mode 100644 (file)

index 0000000..08bfe56
--- /dev/null
+++ b/queue-4.9/net-fix-socket-refcounting-in-skb_complete_tx_timestamp.patch
@@ -0,0 +1,53 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Eric Dumazet <edumazet@google.com>
+Date: Fri, 3 Mar 2017 21:01:03 -0800
+Subject: net: fix socket refcounting in skb_complete_tx_timestamp()
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit 9ac25fc063751379cb77434fef9f3b088cd3e2f7 ]
+
+TX skbs do not necessarily hold a reference on skb->sk->sk_refcnt.
+By the time TX completion happens, sk_refcnt might be already 0.
+
+sock_hold()/sock_put() would then corrupt critical state, like
+sk_wmem_alloc and lead to leaks or use after free.
+
+Fixes: 62bccb8cdb69 ("net-timestamp: Make the clone operation stand-alone from phy timestamping")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Cc: Alexander Duyck <alexander.h.duyck@intel.com>
+Cc: Johannes Berg <johannes@sipsolutions.net>
+Cc: Soheil Hassas Yeganeh <soheil@google.com>
+Cc: Willem de Bruijn <willemb@google.com>
+Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/skbuff.c |   15 ++++++++-------
+ 1 file changed, 8 insertions(+), 7 deletions(-)
+
+--- a/net/core/skbuff.c
++++ b/net/core/skbuff.c
+@@ -3814,13 +3814,14 @@ void skb_complete_tx_timestamp(struct sk
+       if (!skb_may_tx_timestamp(sk, false))
+               return;
+ 
+-      /* take a reference to prevent skb_orphan() from freeing the socket */
+-      sock_hold(sk);
+-
+-      *skb_hwtstamps(skb) = *hwtstamps;
+-      __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND);
+-
+-      sock_put(sk);
++      /* Take a reference to prevent skb_orphan() from freeing the socket,
++       * but only if the socket refcount is not zero.
++       */
++      if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) {
++              *skb_hwtstamps(skb) = *hwtstamps;
++              __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND);
++              sock_put(sk);
++      }
+ }
+ EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
+ 
diff --git a/queue-4.9/net-fix-socket-refcounting-in-skb_complete_wifi_ack.patch b/queue-4.9/net-fix-socket-refcounting-in-skb_complete_wifi_ack.patch

new file mode 100644 (file)

index 0000000..98ee783
--- /dev/null
+++ b/queue-4.9/net-fix-socket-refcounting-in-skb_complete_wifi_ack.patch
@@ -0,0 +1,62 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Eric Dumazet <edumazet@google.com>
+Date: Fri, 3 Mar 2017 21:01:02 -0800
+Subject: net: fix socket refcounting in skb_complete_wifi_ack()
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit dd4f10722aeb10f4f582948839f066bebe44e5fb ]
+
+TX skbs do not necessarily hold a reference on skb->sk->sk_refcnt.
+By the time TX completion happens, sk_refcnt might be already 0.
+
+sock_hold()/sock_put() would then corrupt critical state, like
+sk_wmem_alloc.
+
+Fixes: bf7fa551e0ce ("mac80211: Resolve sk_refcnt/sk_wmem_alloc issue in wifi ack path")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Cc: Alexander Duyck <alexander.h.duyck@intel.com>
+Cc: Johannes Berg <johannes@sipsolutions.net>
+Cc: Soheil Hassas Yeganeh <soheil@google.com>
+Cc: Willem de Bruijn <willemb@google.com>
+Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/skbuff.c |   15 ++++++++-------
+ 1 file changed, 8 insertions(+), 7 deletions(-)
+
+--- a/net/core/skbuff.c
++++ b/net/core/skbuff.c
+@@ -3871,7 +3871,7 @@ void skb_complete_wifi_ack(struct sk_buf
+ {
+       struct sock *sk = skb->sk;
+       struct sock_exterr_skb *serr;
+-      int err;
++      int err = 1;
+ 
+       skb->wifi_acked_valid = 1;
+       skb->wifi_acked = acked;
+@@ -3881,14 +3881,15 @@ void skb_complete_wifi_ack(struct sk_buf
+       serr->ee.ee_errno = ENOMSG;
+       serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
+ 
+-      /* take a reference to prevent skb_orphan() from freeing the socket */
+-      sock_hold(sk);
+-
+-      err = sock_queue_err_skb(sk, skb);
++      /* Take a reference to prevent skb_orphan() from freeing the socket,
++       * but only if the socket refcount is not zero.
++       */
++      if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) {
++              err = sock_queue_err_skb(sk, skb);
++              sock_put(sk);
++      }
+       if (err)
+               kfree_skb(skb);
+-
+-      sock_put(sk);
+ }
+ EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
+ 
diff --git a/queue-4.9/net-mlx5e-do-not-reduce-lro-wqe-size-when-not-using-build_skb.patch b/queue-4.9/net-mlx5e-do-not-reduce-lro-wqe-size-when-not-using-build_skb.patch

new file mode 100644 (file)

index 0000000..9e5e4c2
--- /dev/null
+++ b/queue-4.9/net-mlx5e-do-not-reduce-lro-wqe-size-when-not-using-build_skb.patch
@@ -0,0 +1,56 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Tariq Toukan <tariqt@mellanox.com>
+Date: Wed, 22 Feb 2017 17:20:13 +0200
+Subject: net/mlx5e: Do not reduce LRO WQE size when not using build_skb
+
+From: Tariq Toukan <tariqt@mellanox.com>
+
+
+[ Upstream commit 4078e637c12f1e0a74293f1ec9563f42bff14a03 ]
+
+When rq_type is Striding RQ, no room of SKB_RESERVE is needed
+as SKB allocation is not done via build_skb.
+
+Fixes: e4b85508072b ("net/mlx5e: Slightly reduce hardware LRO size")
+Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en_main.c |   11 +++++------
+ 1 file changed, 5 insertions(+), 6 deletions(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+@@ -81,6 +81,7 @@ static bool mlx5e_check_fragmented_strid
+ static void mlx5e_set_rq_type_params(struct mlx5e_priv *priv, u8 rq_type)
+ {
+       priv->params.rq_wq_type = rq_type;
++      priv->params.lro_wqe_sz = MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ;
+       switch (priv->params.rq_wq_type) {
+       case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+               priv->params.log_rq_size = MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE_MPW;
+@@ -92,6 +93,10 @@ static void mlx5e_set_rq_type_params(str
+               break;
+       default: /* MLX5_WQ_TYPE_LINKED_LIST */
+               priv->params.log_rq_size = MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE;
++
++              /* Extra room needed for build_skb */
++              priv->params.lro_wqe_sz -= MLX5_RX_HEADROOM +
++                      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+       }
+       priv->params.min_rx_wqes = mlx5_min_rx_wqes(priv->params.rq_wq_type,
+                                              BIT(priv->params.log_rq_size));
+@@ -3473,12 +3478,6 @@ static void mlx5e_build_nic_netdev_priv(
+       mlx5e_build_default_indir_rqt(mdev, priv->params.indirection_rqt,
+                                     MLX5E_INDIR_RQT_SIZE, profile->max_nch(mdev));
+ 
+-      priv->params.lro_wqe_sz =
+-              MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ -
+-              /* Extra room needed for build_skb */
+-              MLX5_RX_HEADROOM -
+-              SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+-
+       /* Initialize pflags */
+       MLX5E_SET_PRIV_FLAG(priv, MLX5E_PFLAG_RX_CQE_BASED_MODER,
+                           priv->params.rx_cq_period_mode == MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
diff --git a/queue-4.9/net-mlx5e-fix-wrong-cqe-decompression.patch b/queue-4.9/net-mlx5e-fix-wrong-cqe-decompression.patch

new file mode 100644 (file)

index 0000000..ee7984c
--- /dev/null
+++ b/queue-4.9/net-mlx5e-fix-wrong-cqe-decompression.patch
@@ -0,0 +1,71 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Tariq Toukan <tariqt@mellanox.com>
+Date: Wed, 22 Feb 2017 17:20:16 +0200
+Subject: net/mlx5e: Fix wrong CQE decompression
+
+From: Tariq Toukan <tariqt@mellanox.com>
+
+
+[ Upstream commit 36154be40a28e4afaa0416da2681d80b7e2ca319 ]
+
+In cqe compression with striding RQ, the decompression of the CQE field
+wqe_counter was done with a wrong wraparound value.
+This caused handling cqes with a wrong pointer to wqe (rx descriptor)
+and creating SKBs with wrong data, pointing to wrong (and already consumed)
+strides/pages.
+
+In striding RQ, the CQE field wqe_counter holds the stride index
+instead of the WQE index. Hence, when decompressing a CQE,
+wqe_counter should have wrapped around at the number of strides
+in a single multi-packet WQE.
+
+We drop this wrap-around mask altogether in CQE decompression of striding
+RQ. It is not needed, as in such cases the CQE compression session would
+break because of a different value of the wqe_id field, starting a new
+compression session.
+
+Tested:
+ ethtool -K ethxx lro off/on
+ ethtool --set-priv-flags ethxx rx_cqe_compress on
+ super_netperf 16 {ipv4,ipv6} -t TCP_STREAM -m 50 -D
+ verified no csum errors and no page refcount issues.
+
+Fixes: 7219ab34f184 ("net/mlx5e: CQE compression")
+Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
+Reported-by: Tom Herbert <tom@herbertland.com>
+Cc: kernel-team@fb.com
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en_rx.c |   13 ++++++-------
+ 1 file changed, 6 insertions(+), 7 deletions(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+@@ -92,19 +92,18 @@ static inline void mlx5e_cqes_update_own
+ static inline void mlx5e_decompress_cqe(struct mlx5e_rq *rq,
+                                       struct mlx5e_cq *cq, u32 cqcc)
+ {
+-      u16 wqe_cnt_step;
+-
+       cq->title.byte_cnt     = cq->mini_arr[cq->mini_arr_idx].byte_cnt;
+       cq->title.check_sum    = cq->mini_arr[cq->mini_arr_idx].checksum;
+       cq->title.op_own      &= 0xf0;
+       cq->title.op_own      |= 0x01 & (cqcc >> cq->wq.log_sz);
+       cq->title.wqe_counter  = cpu_to_be16(cq->decmprs_wqe_counter);
+ 
+-      wqe_cnt_step =
+-              rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ ?
+-              mpwrq_get_cqe_consumed_strides(&cq->title) : 1;
+-      cq->decmprs_wqe_counter =
+-              (cq->decmprs_wqe_counter + wqe_cnt_step) & rq->wq.sz_m1;
++      if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ)
++              cq->decmprs_wqe_counter +=
++                      mpwrq_get_cqe_consumed_strides(&cq->title);
++      else
++              cq->decmprs_wqe_counter =
++                      (cq->decmprs_wqe_counter + 1) & rq->wq.sz_m1;
+ }
+ 
+ static inline void mlx5e_decompress_cqe_no_hash(struct mlx5e_rq *rq,
diff --git a/queue-4.9/net-mlx5e-register-unregister-vport-representors-on-interface-attach-detach.patch b/queue-4.9/net-mlx5e-register-unregister-vport-representors-on-interface-attach-detach.patch

new file mode 100644 (file)

index 0000000..e0bc3aa
--- /dev/null
+++ b/queue-4.9/net-mlx5e-register-unregister-vport-representors-on-interface-attach-detach.patch
@@ -0,0 +1,90 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Saeed Mahameed <saeedm@mellanox.com>
+Date: Wed, 22 Feb 2017 17:20:12 +0200
+Subject: net/mlx5e: Register/unregister vport representors on interface attach/detach
+
+From: Saeed Mahameed <saeedm@mellanox.com>
+
+
+[ Upstream commit 6f08a22c5fb2b9aefb8ecd8496758e7a677c1fde ]
+
+Currently vport representors are added only on driver load and removed on
+driver unload.  Apparently we forgot to handle them when we added the
+seamless reset flow feature.  This left the representor netdevs
+alive and active with open HW resources on pci shutdown and on
+error reset flows.
+
+To overcome this we move their handling to interface attach/detach, so
+they would be cleaned up on shutdown and recreated on reset flows.
+
+Fixes: 26e59d8077a3 ("net/mlx5e: Implement mlx5e interface attach/detach callbacks")
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Reviewed-by: Hadar Hen Zion <hadarh@mellanox.com>
+Reviewed-by: Roi Dayan <roid@mellanox.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en_main.c |   23 ++++++++++++++--------
+ 1 file changed, 15 insertions(+), 8 deletions(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+@@ -3936,6 +3936,19 @@ static void mlx5e_register_vport_rep(str
+       }
+ }
+ 
++static void mlx5e_unregister_vport_rep(struct mlx5_core_dev *mdev)
++{
++      struct mlx5_eswitch *esw = mdev->priv.eswitch;
++      int total_vfs = MLX5_TOTAL_VPORTS(mdev);
++      int vport;
++
++      if (!MLX5_CAP_GEN(mdev, vport_group_manager))
++              return;
++
++      for (vport = 1; vport < total_vfs; vport++)
++              mlx5_eswitch_unregister_vport_rep(esw, vport);
++}
++
+ void mlx5e_detach_netdev(struct mlx5_core_dev *mdev, struct net_device *netdev)
+ {
+       struct mlx5e_priv *priv = netdev_priv(netdev);
+@@ -3983,6 +3996,7 @@ static int mlx5e_attach(struct mlx5_core
+               return err;
+       }
+ 
++      mlx5e_register_vport_rep(mdev);
+       return 0;
+ }
+ 
+@@ -3994,6 +4008,7 @@ static void mlx5e_detach(struct mlx5_cor
+       if (!netif_device_present(netdev))
+               return;
+ 
++      mlx5e_unregister_vport_rep(mdev);
+       mlx5e_detach_netdev(mdev, netdev);
+       mlx5e_destroy_mdev_resources(mdev);
+ }
+@@ -4012,8 +4027,6 @@ static void *mlx5e_add(struct mlx5_core_
+       if (err)
+               return NULL;
+ 
+-      mlx5e_register_vport_rep(mdev);
+-
+       if (MLX5_CAP_GEN(mdev, vport_group_manager))
+               ppriv = &esw->offloads.vport_reps[0];
+ 
+@@ -4065,13 +4078,7 @@ void mlx5e_destroy_netdev(struct mlx5_co
+ 
+ static void mlx5e_remove(struct mlx5_core_dev *mdev, void *vpriv)
+ {
+-      struct mlx5_eswitch *esw = mdev->priv.eswitch;
+-      int total_vfs = MLX5_TOTAL_VPORTS(mdev);
+       struct mlx5e_priv *priv = vpriv;
+-      int vport;
+-
+-      for (vport = 1; vport < total_vfs; vport++)
+-              mlx5_eswitch_unregister_vport_rep(esw, vport);
+ 
+       unregister_netdev(priv->netdev);
+       mlx5e_detach(mdev, vpriv);
diff --git a/queue-4.9/net-net_enable_timestamp-can-be-called-from-irq-contexts.patch b/queue-4.9/net-net_enable_timestamp-can-be-called-from-irq-contexts.patch

new file mode 100644 (file)

index 0000000..b2e2e3e
--- /dev/null
+++ b/queue-4.9/net-net_enable_timestamp-can-be-called-from-irq-contexts.patch
@@ -0,0 +1,96 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 1 Mar 2017 14:28:39 -0800
+Subject: net: net_enable_timestamp() can be called from irq contexts
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit 13baa00ad01bb3a9f893e3a08cbc2d072fc0c15d ]
+
+It is now very clear that silly TCP listeners might play with
+enabling/disabling timestamping while new children are added
+to their accept queue.
+
+Meaning net_enable_timestamp() can be called from BH context
+while current state of the static key is not enabled.
+
+Let's play it safe and allow all contexts.
+
+The work queue is scheduled only under the problematic cases,
+which are the static key enable/disable transition, to not slow down
+critical paths.
+
+This extends and improves what we did in commit 5fa8bbda38c6 ("net: use
+a work queue to defer net_disable_timestamp() work")
+
+Fixes: b90e5794c5bd ("net: dont call jump_label_dec from irq context")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: Dmitry Vyukov <dvyukov@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/dev.c |   35 +++++++++++++++++++++++++++++++----
+ 1 file changed, 31 insertions(+), 4 deletions(-)
+
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -1697,27 +1697,54 @@ EXPORT_SYMBOL_GPL(net_dec_egress_queue);
+ static struct static_key netstamp_needed __read_mostly;
+ #ifdef HAVE_JUMP_LABEL
+ static atomic_t netstamp_needed_deferred;
++static atomic_t netstamp_wanted;
+ static void netstamp_clear(struct work_struct *work)
+ {
+       int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
++      int wanted;
+ 
+-      while (deferred--)
+-              static_key_slow_dec(&netstamp_needed);
++      wanted = atomic_add_return(deferred, &netstamp_wanted);
++      if (wanted > 0)
++              static_key_enable(&netstamp_needed);
++      else
++              static_key_disable(&netstamp_needed);
+ }
+ static DECLARE_WORK(netstamp_work, netstamp_clear);
+ #endif
+ 
+ void net_enable_timestamp(void)
+ {
++#ifdef HAVE_JUMP_LABEL
++      int wanted;
++
++      while (1) {
++              wanted = atomic_read(&netstamp_wanted);
++              if (wanted <= 0)
++                      break;
++              if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
++                      return;
++      }
++      atomic_inc(&netstamp_needed_deferred);
++      schedule_work(&netstamp_work);
++#else
+       static_key_slow_inc(&netstamp_needed);
++#endif
+ }
+ EXPORT_SYMBOL(net_enable_timestamp);
+ 
+ void net_disable_timestamp(void)
+ {
+ #ifdef HAVE_JUMP_LABEL
+-      /* net_disable_timestamp() can be called from non process context */
+-      atomic_inc(&netstamp_needed_deferred);
++      int wanted;
++
++      while (1) {
++              wanted = atomic_read(&netstamp_wanted);
++              if (wanted <= 1)
++                      break;
++              if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
++                      return;
++      }
++      atomic_dec(&netstamp_needed_deferred);
+       schedule_work(&netstamp_work);
+ #else
+       static_key_slow_dec(&netstamp_needed);
diff --git a/queue-4.9/net-phy-avoid-deadlock-during-phy_error.patch b/queue-4.9/net-phy-avoid-deadlock-during-phy_error.patch

new file mode 100644 (file)

index 0000000..267ea2f
--- /dev/null
+++ b/queue-4.9/net-phy-avoid-deadlock-during-phy_error.patch
@@ -0,0 +1,76 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Florian Fainelli <f.fainelli@gmail.com>
+Date: Fri, 20 Jan 2017 15:31:52 -0800
+Subject: net: phy: Avoid deadlock during phy_error()
+
+From: Florian Fainelli <f.fainelli@gmail.com>
+
+
+[ Upstream commit eab127717a6af54401ba534790c793ec143cd1fc ]
+
+phy_error() is called in the PHY state machine workqueue context, and
+calls phy_trigger_machine() which does a cancel_delayed_work_sync() of
+the workqueue we execute from, causing a deadlock situation.
+
+Augment phy_trigger_machine() with a sync boolean indicating
+whether we should use cancel_*_sync() or just cancel_*_work().
+
+Fixes: 3c293f4e08b5 ("net: phy: Trigger state machine on state change and not polling.")
+Reported-by: Russell King <rmk+kernel@armlinux.org.uk>
+Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/phy/phy.c |   14 +++++++++-----
+ 1 file changed, 9 insertions(+), 5 deletions(-)
+
+--- a/drivers/net/phy/phy.c
++++ b/drivers/net/phy/phy.c
+@@ -611,14 +611,18 @@ void phy_start_machine(struct phy_device
+  * phy_trigger_machine - trigger the state machine to run
+  *
+  * @phydev: the phy_device struct
++ * @sync: indicate whether we should wait for the workqueue cancelation
+  *
+  * Description: There has been a change in state which requires that the
+  *   state machine runs.
+  */
+ 
+-static void phy_trigger_machine(struct phy_device *phydev)
++static void phy_trigger_machine(struct phy_device *phydev, bool sync)
+ {
+-      cancel_delayed_work_sync(&phydev->state_queue);
++      if (sync)
++              cancel_delayed_work_sync(&phydev->state_queue);
++      else
++              cancel_delayed_work(&phydev->state_queue);
+       queue_delayed_work(system_power_efficient_wq, &phydev->state_queue, 0);
+ }
+ 
+@@ -655,7 +659,7 @@ static void phy_error(struct phy_device
+       phydev->state = PHY_HALTED;
+       mutex_unlock(&phydev->lock);
+ 
+-      phy_trigger_machine(phydev);
++      phy_trigger_machine(phydev, false);
+ }
+ 
+ /**
+@@ -817,7 +821,7 @@ void phy_change(struct work_struct *work
+       }
+ 
+       /* reschedule state queue work to run as soon as possible */
+-      phy_trigger_machine(phydev);
++      phy_trigger_machine(phydev, true);
+       return;
+ 
+ ignore:
+@@ -907,7 +911,7 @@ void phy_start(struct phy_device *phydev
+       if (do_resume)
+               phy_resume(phydev);
+ 
+-      phy_trigger_machine(phydev);
++      phy_trigger_machine(phydev, true);
+ }
+ EXPORT_SYMBOL(phy_start);
+ 
diff --git a/queue-4.9/net-sched-act_skbmod-remove-unneeded-rcu_read_unlock-in-tcf_skbmod_dump.patch b/queue-4.9/net-sched-act_skbmod-remove-unneeded-rcu_read_unlock-in-tcf_skbmod_dump.patch

new file mode 100644 (file)

index 0000000..764ac6c
--- /dev/null
+++ b/queue-4.9/net-sched-act_skbmod-remove-unneeded-rcu_read_unlock-in-tcf_skbmod_dump.patch
@@ -0,0 +1,30 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Alexey Khoroshilov <khoroshilov@ispras.ru>
+Date: Sun, 5 Mar 2017 03:01:55 +0300
+Subject: net/sched: act_skbmod: remove unneeded rcu_read_unlock in tcf_skbmod_dump
+
+From: Alexey Khoroshilov <khoroshilov@ispras.ru>
+
+
+[ Upstream commit 6c4dc75c251721f517e9daeb5370ea606b5b35ce ]
+
+Found by Linux Driver Verification project (linuxtesting.org).
+
+Signed-off-by: Alexey Khoroshilov <khoroshilov@ispras.ru>
+Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/sched/act_skbmod.c |    1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/net/sched/act_skbmod.c
++++ b/net/sched/act_skbmod.c
+@@ -228,7 +228,6 @@ static int tcf_skbmod_dump(struct sk_buf
+ 
+       return skb->len;
+ nla_put_failure:
+-      rcu_read_unlock();
+       nlmsg_trim(skb, b);
+       return -1;
+ }
diff --git a/queue-4.9/net-sched-actions-decrement-module-reference-count-after-table-flush.patch b/queue-4.9/net-sched-actions-decrement-module-reference-count-after-table-flush.patch

new file mode 100644 (file)

index 0000000..b8e7a96
--- /dev/null
+++ b/queue-4.9/net-sched-actions-decrement-module-reference-count-after-table-flush.patch
@@ -0,0 +1,94 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Roman Mashak <mrv@mojatatu.com>
+Date: Fri, 24 Feb 2017 11:00:32 -0500
+Subject: net sched actions: decrement module reference count after table flush.
+
+From: Roman Mashak <mrv@mojatatu.com>
+
+
+[ Upstream commit edb9d1bff4bbe19b8ae0e71b1f38732591a9eeb2 ]
+
+When tc actions are loaded as a module and no actions have been installed,
+flushing them would result in actions removed from the memory, but modules
+reference count not being decremented, so that the modules would not be
+unloaded.
+
+Following is example with GACT action:
+
+% sudo modprobe act_gact
+% lsmod
+Module                  Size  Used by
+act_gact               16384  0
+%
+% sudo tc actions ls action gact
+%
+% sudo tc actions flush action gact
+% lsmod
+Module                  Size  Used by
+act_gact               16384  1
+% sudo tc actions flush action gact
+% lsmod
+Module                  Size  Used by
+act_gact               16384  2
+% sudo rmmod act_gact
+rmmod: ERROR: Module act_gact is in use
+....
+
+After the fix:
+% lsmod
+Module                  Size  Used by
+act_gact               16384  0
+%
+% sudo tc actions add action pass index 1
+% sudo tc actions add action pass index 2
+% sudo tc actions add action pass index 3
+% lsmod
+Module                  Size  Used by
+act_gact               16384  3
+%
+% sudo tc actions flush action gact
+% lsmod
+Module                  Size  Used by
+act_gact               16384  0
+%
+% sudo tc actions flush action gact
+% lsmod
+Module                  Size  Used by
+act_gact               16384  0
+% sudo rmmod act_gact
+% lsmod
+Module                  Size  Used by
+%
+
+Fixes: f97017cdefef ("net-sched: Fix actions flushing")
+Signed-off-by: Roman Mashak <mrv@mojatatu.com>
+Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
+Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/sched/act_api.c |    5 +----
+ 1 file changed, 1 insertion(+), 4 deletions(-)
+
+--- a/net/sched/act_api.c
++++ b/net/sched/act_api.c
+@@ -820,10 +820,8 @@ static int tca_action_flush(struct net *
+               goto out_module_put;
+ 
+       err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops);
+-      if (err < 0)
++      if (err <= 0)
+               goto out_module_put;
+-      if (err == 0)
+-              goto noflush_out;
+ 
+       nla_nest_end(skb, nest);
+ 
+@@ -840,7 +838,6 @@ static int tca_action_flush(struct net *
+ out_module_put:
+       module_put(ops->owner);
+ err_out:
+-noflush_out:
+       kfree_skb(skb);
+       return err;
+ }
diff --git a/queue-4.9/net-tunnel-set-inner-protocol-in-network-gro-hooks.patch b/queue-4.9/net-tunnel-set-inner-protocol-in-network-gro-hooks.patch

new file mode 100644 (file)

index 0000000..a9f4e6e
--- /dev/null
+++ b/queue-4.9/net-tunnel-set-inner-protocol-in-network-gro-hooks.patch
@@ -0,0 +1,70 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Paolo Abeni <pabeni@redhat.com>
+Date: Tue, 7 Mar 2017 18:33:31 +0100
+Subject: net/tunnel: set inner protocol in network gro hooks
+
+From: Paolo Abeni <pabeni@redhat.com>
+
+
+[ Upstream commit 294acf1c01bace5cea5d30b510504238bf5f7c25 ]
+
+The gso code of several tunnel types (gre and udp tunnels)
+takes for granted that the skb->inner_protocol is properly
+initialized and drops the packet otherwise.
+
+On the forwarding path no one is initializing such field,
+so gro encapsulated packets are dropped on forward.
+
+Since commit 38720352412a ("gre: Use inner_proto to obtain
+inner header protocol"), this can be reproduced when the
+encapsulated packets use gre as the tunneling protocol.
+
+The issue happens also with vxlan and geneve tunnels since
+commit 8bce6d7d0d1e ("udp: Generalize skb_udp_segment"), if the
+forwarding host's ingress nic has h/w offload for such tunnel
+and a vxlan/geneve device is configured on top of it, regardless
+of the configured peer address and vni.
+
+To address the issue, this change initializes the inner_protocol
+field for encapsulated packets in both ipv4 and ipv6 gro complete
+callbacks.
+
+Fixes: 38720352412a ("gre: Use inner_proto to obtain inner header protocol")
+Fixes: 8bce6d7d0d1e ("udp: Generalize skb_udp_segment")
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Acked-by: Alexander Duyck <alexander.h.duyck@intel.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/af_inet.c     |    4 +++-
+ net/ipv6/ip6_offload.c |    4 +++-
+ 2 files changed, 6 insertions(+), 2 deletions(-)
+
+--- a/net/ipv4/af_inet.c
++++ b/net/ipv4/af_inet.c
+@@ -1460,8 +1460,10 @@ int inet_gro_complete(struct sk_buff *sk
+       int proto = iph->protocol;
+       int err = -ENOSYS;
+ 
+-      if (skb->encapsulation)
++      if (skb->encapsulation) {
++              skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IP));
+               skb_set_inner_network_header(skb, nhoff);
++      }
+ 
+       csum_replace2(&iph->check, iph->tot_len, newlen);
+       iph->tot_len = newlen;
+--- a/net/ipv6/ip6_offload.c
++++ b/net/ipv6/ip6_offload.c
+@@ -294,8 +294,10 @@ static int ipv6_gro_complete(struct sk_b
+       struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff);
+       int err = -ENOSYS;
+ 
+-      if (skb->encapsulation)
++      if (skb->encapsulation) {
++              skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IPV6));
+               skb_set_inner_network_header(skb, nhoff);
++      }
+ 
+       iph->payload_len = htons(skb->len - nhoff - sizeof(*iph));
+ 
diff --git a/queue-4.9/strparser-destroy-workqueue-on-module-exit.patch b/queue-4.9/strparser-destroy-workqueue-on-module-exit.patch

new file mode 100644 (file)

index 0000000..329bd21
--- /dev/null
+++ b/queue-4.9/strparser-destroy-workqueue-on-module-exit.patch
@@ -0,0 +1,29 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: WANG Cong <xiyou.wangcong@gmail.com>
+Date: Fri, 3 Mar 2017 12:21:14 -0800
+Subject: strparser: destroy workqueue on module exit
+
+From: WANG Cong <xiyou.wangcong@gmail.com>
+
+
+[ Upstream commit f78ef7cd9a0686b979679d0de061c6dbfd8d649e ]
+
+Fixes: 43a0c6751a32 ("strparser: Stream parser for messages")
+Cc: Tom Herbert <tom@herbertland.com>
+Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/strparser/strparser.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/net/strparser/strparser.c
++++ b/net/strparser/strparser.c
+@@ -504,6 +504,7 @@ static int __init strp_mod_init(void)
+ 
+ static void __exit strp_mod_exit(void)
+ {
++      destroy_workqueue(strp_wq);
+ }
+ module_init(strp_mod_init);
+ module_exit(strp_mod_exit);
diff --git a/queue-4.9/tcp-dccp-block-bh-for-syn-processing.patch b/queue-4.9/tcp-dccp-block-bh-for-syn-processing.patch

new file mode 100644 (file)

index 0000000..0e7c996
--- /dev/null
+++ b/queue-4.9/tcp-dccp-block-bh-for-syn-processing.patch
@@ -0,0 +1,206 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 1 Mar 2017 08:39:49 -0800
+Subject: tcp/dccp: block BH for SYN processing
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit 449809a66c1d0b1563dee84493e14bf3104d2d7e ]
+
+SYN processing really was meant to be handled from BH.
+
+When I got rid of BH blocking while processing socket backlog
+in commit 5413d1babe8f ("net: do not block BH while processing socket
+backlog"), I forgot that a malicious user could transition to TCP_LISTEN
+from a state that allowed (SYN) packets to be parked in the socket
+backlog while socket is owned by the thread doing the listen() call.
+
+Sure enough syzkaller found this and reported the bug ;)
+
+=================================
+[ INFO: inconsistent lock state ]
+4.10.0+ #60 Not tainted
+---------------------------------
+inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage.
+syz-executor0/5090 [HC0[0]:SC0[0]:HE1:SE1] takes:
+ (&(&hashinfo->ehash_locks[i])->rlock){+.?...}, at:
+[<ffffffff83a6a370>] spin_lock include/linux/spinlock.h:299 [inline]
+ (&(&hashinfo->ehash_locks[i])->rlock){+.?...}, at:
+[<ffffffff83a6a370>] inet_ehash_insert+0x240/0xad0
+net/ipv4/inet_hashtables.c:407
+{IN-SOFTIRQ-W} state was registered at:
+  mark_irqflags kernel/locking/lockdep.c:2923 [inline]
+  __lock_acquire+0xbcf/0x3270 kernel/locking/lockdep.c:3295
+  lock_acquire+0x241/0x580 kernel/locking/lockdep.c:3753
+  __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline]
+  _raw_spin_lock+0x33/0x50 kernel/locking/spinlock.c:151
+  spin_lock include/linux/spinlock.h:299 [inline]
+  inet_ehash_insert+0x240/0xad0 net/ipv4/inet_hashtables.c:407
+  reqsk_queue_hash_req net/ipv4/inet_connection_sock.c:753 [inline]
+  inet_csk_reqsk_queue_hash_add+0x1b7/0x2a0 net/ipv4/inet_connection_sock.c:764
+  tcp_conn_request+0x25cc/0x3310 net/ipv4/tcp_input.c:6399
+  tcp_v4_conn_request+0x157/0x220 net/ipv4/tcp_ipv4.c:1262
+  tcp_rcv_state_process+0x802/0x4130 net/ipv4/tcp_input.c:5889
+  tcp_v4_do_rcv+0x56b/0x940 net/ipv4/tcp_ipv4.c:1433
+  tcp_v4_rcv+0x2e12/0x3210 net/ipv4/tcp_ipv4.c:1711
+  ip_local_deliver_finish+0x4ce/0xc40 net/ipv4/ip_input.c:216
+  NF_HOOK include/linux/netfilter.h:257 [inline]
+  ip_local_deliver+0x1ce/0x710 net/ipv4/ip_input.c:257
+  dst_input include/net/dst.h:492 [inline]
+  ip_rcv_finish+0xb1d/0x2110 net/ipv4/ip_input.c:396
+  NF_HOOK include/linux/netfilter.h:257 [inline]
+  ip_rcv+0xd90/0x19c0 net/ipv4/ip_input.c:487
+  __netif_receive_skb_core+0x1ad1/0x3400 net/core/dev.c:4179
+  __netif_receive_skb+0x2a/0x170 net/core/dev.c:4217
+  netif_receive_skb_internal+0x1d6/0x430 net/core/dev.c:4245
+  napi_skb_finish net/core/dev.c:4602 [inline]
+  napi_gro_receive+0x4e6/0x680 net/core/dev.c:4636
+  e1000_receive_skb drivers/net/ethernet/intel/e1000/e1000_main.c:4033 [inline]
+  e1000_clean_rx_irq+0x5e0/0x1490
+drivers/net/ethernet/intel/e1000/e1000_main.c:4489
+  e1000_clean+0xb9a/0x2910 drivers/net/ethernet/intel/e1000/e1000_main.c:3834
+  napi_poll net/core/dev.c:5171 [inline]
+  net_rx_action+0xe70/0x1900 net/core/dev.c:5236
+  __do_softirq+0x2fb/0xb7d kernel/softirq.c:284
+  invoke_softirq kernel/softirq.c:364 [inline]
+  irq_exit+0x19e/0x1d0 kernel/softirq.c:405
+  exiting_irq arch/x86/include/asm/apic.h:658 [inline]
+  do_IRQ+0x81/0x1a0 arch/x86/kernel/irq.c:250
+  ret_from_intr+0x0/0x20
+  native_safe_halt+0x6/0x10 arch/x86/include/asm/irqflags.h:53
+  arch_safe_halt arch/x86/include/asm/paravirt.h:98 [inline]
+  default_idle+0x8f/0x410 arch/x86/kernel/process.c:271
+  arch_cpu_idle+0xa/0x10 arch/x86/kernel/process.c:262
+  default_idle_call+0x36/0x60 kernel/sched/idle.c:96
+  cpuidle_idle_call kernel/sched/idle.c:154 [inline]
+  do_idle+0x348/0x440 kernel/sched/idle.c:243
+  cpu_startup_entry+0x18/0x20 kernel/sched/idle.c:345
+  start_secondary+0x344/0x440 arch/x86/kernel/smpboot.c:272
+  verify_cpu+0x0/0xfc
+irq event stamp: 1741
+hardirqs last  enabled at (1741): [<ffffffff84d49d77>]
+__raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:160
+[inline]
+hardirqs last  enabled at (1741): [<ffffffff84d49d77>]
+_raw_spin_unlock_irqrestore+0xf7/0x1a0 kernel/locking/spinlock.c:191
+hardirqs last disabled at (1740): [<ffffffff84d4a732>]
+__raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:108 [inline]
+hardirqs last disabled at (1740): [<ffffffff84d4a732>]
+_raw_spin_lock_irqsave+0xa2/0x110 kernel/locking/spinlock.c:159
+softirqs last  enabled at (1738): [<ffffffff84d4deff>]
+__do_softirq+0x7cf/0xb7d kernel/softirq.c:310
+softirqs last disabled at (1571): [<ffffffff84d4b92c>]
+do_softirq_own_stack+0x1c/0x30 arch/x86/entry/entry_64.S:902
+
+other info that might help us debug this:
+ Possible unsafe locking scenario:
+
+       CPU0
+       ----
+  lock(&(&hashinfo->ehash_locks[i])->rlock);
+  <Interrupt>
+    lock(&(&hashinfo->ehash_locks[i])->rlock);
+
+ *** DEADLOCK ***
+
+1 lock held by syz-executor0/5090:
+ #0:  (sk_lock-AF_INET6){+.+.+.}, at: [<ffffffff83406b43>] lock_sock
+include/net/sock.h:1460 [inline]
+ #0:  (sk_lock-AF_INET6){+.+.+.}, at: [<ffffffff83406b43>]
+sock_setsockopt+0x233/0x1e40 net/core/sock.c:683
+
+stack backtrace:
+CPU: 1 PID: 5090 Comm: syz-executor0 Not tainted 4.10.0+ #60
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
+Call Trace:
+ __dump_stack lib/dump_stack.c:15 [inline]
+ dump_stack+0x292/0x398 lib/dump_stack.c:51
+ print_usage_bug+0x3ef/0x450 kernel/locking/lockdep.c:2387
+ valid_state kernel/locking/lockdep.c:2400 [inline]
+ mark_lock_irq kernel/locking/lockdep.c:2602 [inline]
+ mark_lock+0xf30/0x1410 kernel/locking/lockdep.c:3065
+ mark_irqflags kernel/locking/lockdep.c:2941 [inline]
+ __lock_acquire+0x6dc/0x3270 kernel/locking/lockdep.c:3295
+ lock_acquire+0x241/0x580 kernel/locking/lockdep.c:3753
+ __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline]
+ _raw_spin_lock+0x33/0x50 kernel/locking/spinlock.c:151
+ spin_lock include/linux/spinlock.h:299 [inline]
+ inet_ehash_insert+0x240/0xad0 net/ipv4/inet_hashtables.c:407
+ reqsk_queue_hash_req net/ipv4/inet_connection_sock.c:753 [inline]
+ inet_csk_reqsk_queue_hash_add+0x1b7/0x2a0 net/ipv4/inet_connection_sock.c:764
+ dccp_v6_conn_request+0xada/0x11b0 net/dccp/ipv6.c:380
+ dccp_rcv_state_process+0x51e/0x1660 net/dccp/input.c:606
+ dccp_v6_do_rcv+0x213/0x350 net/dccp/ipv6.c:632
+ sk_backlog_rcv include/net/sock.h:896 [inline]
+ __release_sock+0x127/0x3a0 net/core/sock.c:2052
+ release_sock+0xa5/0x2b0 net/core/sock.c:2539
+ sock_setsockopt+0x60f/0x1e40 net/core/sock.c:1016
+ SYSC_setsockopt net/socket.c:1782 [inline]
+ SyS_setsockopt+0x2fb/0x3a0 net/socket.c:1765
+ entry_SYSCALL_64_fastpath+0x1f/0xc2
+RIP: 0033:0x4458b9
+RSP: 002b:00007fe8b26c2b58 EFLAGS: 00000292 ORIG_RAX: 0000000000000036
+RAX: ffffffffffffffda RBX: 0000000000000006 RCX: 00000000004458b9
+RDX: 000000000000001a RSI: 0000000000000001 RDI: 0000000000000006
+RBP: 00000000006e2110 R08: 0000000000000010 R09: 0000000000000000
+R10: 00000000208c3000 R11: 0000000000000292 R12: 0000000000708000
+R13: 0000000020000000 R14: 0000000000001000 R15: 0000000000000000
+
+Fixes: 5413d1babe8f ("net: do not block BH while processing socket backlog")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: Andrey Konovalov <andreyknvl@google.com>
+Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/dccp/input.c     |   10 ++++++++--
+ net/ipv4/tcp_input.c |   10 ++++++++--
+ 2 files changed, 16 insertions(+), 4 deletions(-)
+
+--- a/net/dccp/input.c
++++ b/net/dccp/input.c
+@@ -577,6 +577,7 @@ int dccp_rcv_state_process(struct sock *
+       struct dccp_sock *dp = dccp_sk(sk);
+       struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb);
+       const int old_state = sk->sk_state;
++      bool acceptable;
+       int queued = 0;
+ 
+       /*
+@@ -603,8 +604,13 @@ int dccp_rcv_state_process(struct sock *
+        */
+       if (sk->sk_state == DCCP_LISTEN) {
+               if (dh->dccph_type == DCCP_PKT_REQUEST) {
+-                      if (inet_csk(sk)->icsk_af_ops->conn_request(sk,
+-                                                                  skb) < 0)
++                      /* It is possible that we process SYN packets from backlog,
++                       * so we need to make sure to disable BH right there.
++                       */
++                      local_bh_disable();
++                      acceptable = inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) >= 0;
++                      local_bh_enable();
++                      if (!acceptable)
+                               return 1;
+                       consume_skb(skb);
+                       return 0;
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -5916,9 +5916,15 @@ int tcp_rcv_state_process(struct sock *s
+               if (th->syn) {
+                       if (th->fin)
+                               goto discard;
+-                      if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
+-                              return 1;
++                      /* It is possible that we process SYN packets from backlog,
++                       * so we need to make sure to disable BH right there.
++                       */
++                      local_bh_disable();
++                      acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
++                      local_bh_enable();
+ 
++                      if (!acceptable)
++                              return 1;
+                       consume_skb(skb);
+                       return 0;
+               }
diff --git a/queue-4.9/tcp-fix-various-issues-for-sockets-morphing-to-listen-state.patch b/queue-4.9/tcp-fix-various-issues-for-sockets-morphing-to-listen-state.patch

new file mode 100644 (file)

index 0000000..11f24a1
--- /dev/null
+++ b/queue-4.9/tcp-fix-various-issues-for-sockets-morphing-to-listen-state.patch
@@ -0,0 +1,74 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Eric Dumazet <edumazet@google.com>
+Date: Fri, 3 Mar 2017 14:08:21 -0800
+Subject: tcp: fix various issues for sockets morphing to listen state
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit 02b2faaf0af1d85585f6d6980e286d53612acfc2 ]
+
+Dmitry Vyukov reported a divide by 0 triggered by syzkaller, exploiting
+tcp_disconnect() path that was never really considered and/or used
+before syzkaller ;)
+
+I was not able to reproduce the bug, but it seems issues here are the
+three possible actions that assumed they would never trigger on a
+listener.
+
+1) tcp_write_timer_handler
+2) tcp_delack_timer_handler
+3) MTU reduction
+
+Only IPv6 MTU reduction was properly testing TCP_CLOSE and TCP_LISTEN
+ states from tcp_v6_mtu_reduced()
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: Dmitry Vyukov <dvyukov@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_ipv4.c  |    7 +++++--
+ net/ipv4/tcp_timer.c |    6 ++++--
+ 2 files changed, 9 insertions(+), 4 deletions(-)
+
+--- a/net/ipv4/tcp_ipv4.c
++++ b/net/ipv4/tcp_ipv4.c
+@@ -269,10 +269,13 @@ EXPORT_SYMBOL(tcp_v4_connect);
+  */
+ void tcp_v4_mtu_reduced(struct sock *sk)
+ {
+-      struct dst_entry *dst;
+       struct inet_sock *inet = inet_sk(sk);
+-      u32 mtu = tcp_sk(sk)->mtu_info;
++      struct dst_entry *dst;
++      u32 mtu;
+ 
++      if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
++              return;
++      mtu = tcp_sk(sk)->mtu_info;
+       dst = inet_csk_update_pmtu(sk, mtu);
+       if (!dst)
+               return;
+--- a/net/ipv4/tcp_timer.c
++++ b/net/ipv4/tcp_timer.c
+@@ -249,7 +249,8 @@ void tcp_delack_timer_handler(struct soc
+ 
+       sk_mem_reclaim_partial(sk);
+ 
+-      if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
++      if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
++          !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
+               goto out;
+ 
+       if (time_after(icsk->icsk_ack.timeout, jiffies)) {
+@@ -552,7 +553,8 @@ void tcp_write_timer_handler(struct sock
+       struct inet_connection_sock *icsk = inet_csk(sk);
+       int event;
+ 
+-      if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
++      if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) ||
++          !icsk->icsk_pending)
+               goto out;
+ 
+       if (time_after(icsk->icsk_timeout, jiffies)) {
diff --git a/queue-4.9/tun-fix-premature-pollout-notification-on-tun-devices.patch b/queue-4.9/tun-fix-premature-pollout-notification-on-tun-devices.patch

new file mode 100644 (file)

index 0000000..05d6a62
--- /dev/null
+++ b/queue-4.9/tun-fix-premature-pollout-notification-on-tun-devices.patch
@@ -0,0 +1,78 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Hannes Frederic Sowa <hannes@stressinduktion.org>
+Date: Mon, 13 Mar 2017 00:00:26 +0100
+Subject: tun: fix premature POLLOUT notification on tun devices
+
+From: Hannes Frederic Sowa <hannes@stressinduktion.org>
+
+
+[ Upstream commit b20e2d54789c6acbf6bd0efdbec2cf5fa4d90ef1 ]
+
+aszlig observed failing ssh tunnels (-w) during initialization since
+commit cc9da6cc4f56e0 ("ipv6: addrconf: use stable address generator for
+ARPHRD_NONE"). We already had reports that the mentioned commit breaks
+Juniper VPN connections. I can't clearly say that the Juniper VPN client
+has the same problem, but it is worth a try to hint to this patch.
+
+Because of the early generation of link local addresses, the kernel now
+can start asking for routers on the local subnet much earlier than usual.
+Those router solicitation packets arrive inside the ssh channels and
+should be transmitted to the tun fd before the configuration scripts
+might have upped the interface and made it ready for transmission.
+
+ssh polls on the interface and receives back a POLL_OUT. It tries to send
+the early router solicitation packet to the tun interface.  Unfortunately
+it hasn't been up'ed yet by config scripts, thus failing with -EIO. ssh
+doesn't retry again and considers the tun interface broken forever.
+
+Link: https://bugzilla.kernel.org/show_bug.cgi?id=121131
+Fixes: cc9da6cc4f56 ("ipv6: addrconf: use stable address generator for ARPHRD_NONE")
+Cc: Bjørn Mork <bjorn@mork.no>
+Reported-by: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
+Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
+Reported-by: Jonas Lippuner <jonas@lippuner.ca>
+Cc: Jonas Lippuner <jonas@lippuner.ca>
+Reported-by: aszlig <aszlig@redmoonstudios.org>
+Cc: aszlig <aszlig@redmoonstudios.org>
+Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/tun.c |   18 +++++++++++++++---
+ 1 file changed, 15 insertions(+), 3 deletions(-)
+
+--- a/drivers/net/tun.c
++++ b/drivers/net/tun.c
+@@ -819,7 +819,18 @@ static void tun_net_uninit(struct net_de
+ /* Net device open. */
+ static int tun_net_open(struct net_device *dev)
+ {
++      struct tun_struct *tun = netdev_priv(dev);
++      int i;
++
+       netif_tx_start_all_queues(dev);
++
++      for (i = 0; i < tun->numqueues; i++) {
++              struct tun_file *tfile;
++
++              tfile = rtnl_dereference(tun->tfiles[i]);
++              tfile->socket.sk->sk_write_space(tfile->socket.sk);
++      }
++
+       return 0;
+ }
+ 
+@@ -1116,9 +1127,10 @@ static unsigned int tun_chr_poll(struct
+       if (!skb_array_empty(&tfile->tx_array))
+               mask |= POLLIN | POLLRDNORM;
+ 
+-      if (sock_writeable(sk) ||
+-          (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
+-           sock_writeable(sk)))
++      if (tun->dev->flags & IFF_UP &&
++          (sock_writeable(sk) ||
++           (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
++            sock_writeable(sk))))
+               mask |= POLLOUT | POLLWRNORM;
+ 
+       if (tun->dev->reg_state != NETREG_REGISTERED)
diff --git a/queue-4.9/uapi-fix-linux-packet_diag.h-userspace-compilation-error.patch b/queue-4.9/uapi-fix-linux-packet_diag.h-userspace-compilation-error.patch

new file mode 100644 (file)

index 0000000..5f3884c
--- /dev/null
+++ b/queue-4.9/uapi-fix-linux-packet_diag.h-userspace-compilation-error.patch
@@ -0,0 +1,44 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: "Dmitry V. Levin" <ldv@altlinux.org>
+Date: Tue, 7 Mar 2017 23:50:50 +0300
+Subject: uapi: fix linux/packet_diag.h userspace compilation error
+
+From: "Dmitry V. Levin" <ldv@altlinux.org>
+
+
+[ Upstream commit 745cb7f8a5de0805cade3de3991b7a95317c7c73 ]
+
+Replace MAX_ADDR_LEN with its numeric value to fix the following
+linux/packet_diag.h userspace compilation error:
+
+/usr/include/linux/packet_diag.h:67:17: error: 'MAX_ADDR_LEN' undeclared here (not in a function)
+  __u8 pdmc_addr[MAX_ADDR_LEN];
+
+This is not the first case in the UAPI where the numeric value
+of MAX_ADDR_LEN is used instead of symbolic one, uapi/linux/if_link.h
+already does the same:
+
+$ grep MAX_ADDR_LEN include/uapi/linux/if_link.h
+       __u8 mac[32]; /* MAX_ADDR_LEN */
+
+There are no UAPI headers besides these two that use MAX_ADDR_LEN.
+
+Signed-off-by: Dmitry V. Levin <ldv@altlinux.org>
+Acked-by: Pavel Emelyanov <xemul@virtuozzo.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/uapi/linux/packet_diag.h |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/include/uapi/linux/packet_diag.h
++++ b/include/uapi/linux/packet_diag.h
+@@ -64,7 +64,7 @@ struct packet_diag_mclist {
+       __u32   pdmc_count;
+       __u16   pdmc_type;
+       __u16   pdmc_alen;
+-      __u8    pdmc_addr[MAX_ADDR_LEN];
++      __u8    pdmc_addr[32]; /* MAX_ADDR_LEN */
+ };
+ 
+ struct packet_diag_ring {
diff --git a/queue-4.9/vrf-fix-use-after-free-in-vrf_xmit.patch b/queue-4.9/vrf-fix-use-after-free-in-vrf_xmit.patch

new file mode 100644 (file)

index 0000000..de72799
--- /dev/null
+++ b/queue-4.9/vrf-fix-use-after-free-in-vrf_xmit.patch
@@ -0,0 +1,56 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: David Ahern <dsa@cumulusnetworks.com>
+Date: Mon, 6 Mar 2017 08:53:04 -0800
+Subject: vrf: Fix use-after-free in vrf_xmit
+
+From: David Ahern <dsa@cumulusnetworks.com>
+
+
+[ Upstream commit f7887d40e541f74402df0684a1463c0a0bb68c68 ]
+
+KASAN detected a use-after-free:
+
+[  269.467067] BUG: KASAN: use-after-free in vrf_xmit+0x7f1/0x827 [vrf] at addr ffff8800350a21c0
+[  269.467067] Read of size 4 by task ssh/1879
+[  269.467067] CPU: 1 PID: 1879 Comm: ssh Not tainted 4.10.0+ #249
+[  269.467067] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014
+[  269.467067] Call Trace:
+[  269.467067]  dump_stack+0x81/0xb6
+[  269.467067]  kasan_object_err+0x21/0x78
+[  269.467067]  kasan_report+0x2f7/0x450
+[  269.467067]  ? vrf_xmit+0x7f1/0x827 [vrf]
+[  269.467067]  ? ip_output+0xa4/0xdb
+[  269.467067]  __asan_load4+0x6b/0x6d
+[  269.467067]  vrf_xmit+0x7f1/0x827 [vrf]
+...
+
+Which corresponds to the skb access after xmit handling. Fix by saving
+skb->len and using the saved value to update stats.
+
+Fixes: 193125dbd8eb2 ("net: Introduce VRF device driver")
+Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/vrf.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/drivers/net/vrf.c
++++ b/drivers/net/vrf.c
+@@ -346,6 +346,7 @@ static netdev_tx_t is_ip_tx_frame(struct
+ 
+ static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev)
+ {
++      int len = skb->len;
+       netdev_tx_t ret = is_ip_tx_frame(skb, dev);
+ 
+       if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
+@@ -353,7 +354,7 @@ static netdev_tx_t vrf_xmit(struct sk_bu
+ 
+               u64_stats_update_begin(&dstats->syncp);
+               dstats->tx_pkts++;
+-              dstats->tx_bytes += skb->len;
++              dstats->tx_bytes += len;
+               u64_stats_update_end(&dstats->syncp);
+       } else {
+               this_cpu_inc(dev->dstats->tx_drps);
diff --git a/queue-4.9/vti6-return-gre_key-for-vti6.patch b/queue-4.9/vti6-return-gre_key-for-vti6.patch

new file mode 100644 (file)

index 0000000..c8320b8
--- /dev/null
+++ b/queue-4.9/vti6-return-gre_key-for-vti6.patch
@@ -0,0 +1,33 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: David Forster <dforster@brocade.com>
+Date: Fri, 24 Feb 2017 14:20:32 +0000
+Subject: vti6: return GRE_KEY for vti6
+
+From: David Forster <dforster@brocade.com>
+
+
+[ Upstream commit 7dcdf941cdc96692ab99fd790c8cc68945514851 ]
+
+Align vti6 with vti by returning GRE_KEY flag. This enables iproute2
+to display tunnel keys on "ip -6 tunnel show"
+
+Signed-off-by: David Forster <dforster@brocade.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv6/ip6_vti.c |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/net/ipv6/ip6_vti.c
++++ b/net/ipv6/ip6_vti.c
+@@ -691,6 +691,10 @@ vti6_parm_to_user(struct ip6_tnl_parm2 *
+       u->link = p->link;
+       u->i_key = p->i_key;
+       u->o_key = p->o_key;
++      if (u->i_key)
++              u->i_flags |= GRE_KEY;
++      if (u->o_key)
++              u->o_flags |= GRE_KEY;
+       u->proto = p->proto;
+ 
+       memcpy(u->name, p->name, sizeof(u->name));
diff --git a/queue-4.9/vxlan-correctly-validate-vxlan-id-against-vxlan_n_vid.patch b/queue-4.9/vxlan-correctly-validate-vxlan-id-against-vxlan_n_vid.patch

new file mode 100644 (file)

index 0000000..175a4f0
--- /dev/null
+++ b/queue-4.9/vxlan-correctly-validate-vxlan-id-against-vxlan_n_vid.patch
@@ -0,0 +1,33 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Matthias Schiffer <mschiffer@universe-factory.net>
+Date: Thu, 23 Feb 2017 17:19:41 +0100
+Subject: vxlan: correctly validate VXLAN ID against VXLAN_N_VID
+
+From: Matthias Schiffer <mschiffer@universe-factory.net>
+
+
+[ Upstream commit 4e37d6911f36545b286d15073f6f2222f840e81c ]
+
+The incorrect check caused an off-by-one error: the maximum VID 0xffffff
+was unusable.
+
+Fixes: d342894c5d2f ("vxlan: virtual extensible lan")
+Signed-off-by: Matthias Schiffer <mschiffer@universe-factory.net>
+Acked-by: Jiri Benc <jbenc@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/vxlan.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/net/vxlan.c
++++ b/drivers/net/vxlan.c
+@@ -2637,7 +2637,7 @@ static int vxlan_validate(struct nlattr
+ 
+       if (data[IFLA_VXLAN_ID]) {
+               __u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
+-              if (id >= VXLAN_VID_MASK)
++              if (id >= VXLAN_N_VID)
+                       return -ERANGE;
+       }
+ 
diff --git a/queue-4.9/vxlan-don-t-allow-overwrite-of-config-src-addr.patch b/queue-4.9/vxlan-don-t-allow-overwrite-of-config-src-addr.patch

new file mode 100644 (file)

index 0000000..af9b962
--- /dev/null
+++ b/queue-4.9/vxlan-don-t-allow-overwrite-of-config-src-addr.patch
@@ -0,0 +1,94 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Brian Russell <brussell@brocade.com>
+Date: Fri, 24 Feb 2017 17:47:11 +0000
+Subject: vxlan: don't allow overwrite of config src addr
+
+From: Brian Russell <brussell@brocade.com>
+
+
+[ Upstream commit 1158632b5a2dcce0786c1b1b99654e81cc867981 ]
+
+When using IPv6 transport and a default dst, a pointer to the configured
+source address is passed into the route lookup. If no source address is
+configured, then the value is overwritten.
+
+IPv6 route lookup ignores egress ifindex match if the source address is set,
+so if egress ifindex match is desired, the source address must be passed
+as any. The overwrite breaks this for subsequent lookups.
+
+Avoid this by copying the configured address to an existing stack variable
+and pass a pointer to that instead.
+
+Fixes: 272d96a5ab10 ("net: vxlan: lwt: Use source ip address during route lookup.")
+
+Signed-off-by: Brian Russell <brussell@brocade.com>
+Acked-by: Jiri Benc <jbenc@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/vxlan.c |   12 +++++-------
+ 1 file changed, 5 insertions(+), 7 deletions(-)
+
+--- a/drivers/net/vxlan.c
++++ b/drivers/net/vxlan.c
+@@ -1942,7 +1942,6 @@ static void vxlan_xmit_one(struct sk_buf
+       const struct iphdr *old_iph;
+       union vxlan_addr *dst;
+       union vxlan_addr remote_ip, local_ip;
+-      union vxlan_addr *src;
+       struct vxlan_metadata _md;
+       struct vxlan_metadata *md = &_md;
+       __be16 src_port = 0, dst_port;
+@@ -1960,7 +1959,7 @@ static void vxlan_xmit_one(struct sk_buf
+               dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port;
+               vni = rdst->remote_vni;
+               dst = &rdst->remote_ip;
+-              src = &vxlan->cfg.saddr;
++              local_ip = vxlan->cfg.saddr;
+               dst_cache = &rdst->dst_cache;
+       } else {
+               if (!info) {
+@@ -1979,7 +1978,6 @@ static void vxlan_xmit_one(struct sk_buf
+                       local_ip.sin6.sin6_addr = info->key.u.ipv6.src;
+               }
+               dst = &remote_ip;
+-              src = &local_ip;
+               dst_cache = &info->dst_cache;
+       }
+ 
+@@ -2028,7 +2026,7 @@ static void vxlan_xmit_one(struct sk_buf
+               rt = vxlan_get_route(vxlan, skb,
+                                    rdst ? rdst->remote_ifindex : 0, tos,
+                                    dst->sin.sin_addr.s_addr,
+-                                   &src->sin.sin_addr.s_addr,
++                                   &local_ip.sin.sin_addr.s_addr,
+                                    dst_cache, info);
+               if (IS_ERR(rt)) {
+                       netdev_dbg(dev, "no route to %pI4\n",
+@@ -2071,7 +2069,7 @@ static void vxlan_xmit_one(struct sk_buf
+               if (err < 0)
+                       goto xmit_tx_error;
+ 
+-              udp_tunnel_xmit_skb(rt, sk, skb, src->sin.sin_addr.s_addr,
++              udp_tunnel_xmit_skb(rt, sk, skb, local_ip.sin.sin_addr.s_addr,
+                                   dst->sin.sin_addr.s_addr, tos, ttl, df,
+                                   src_port, dst_port, xnet, !udp_sum);
+ #if IS_ENABLED(CONFIG_IPV6)
+@@ -2087,7 +2085,7 @@ static void vxlan_xmit_one(struct sk_buf
+               ndst = vxlan6_get_route(vxlan, skb,
+                                       rdst ? rdst->remote_ifindex : 0, tos,
+                                       label, &dst->sin6.sin6_addr,
+-                                      &src->sin6.sin6_addr,
++                                      &local_ip.sin6.sin6_addr,
+                                       dst_cache, info);
+               if (IS_ERR(ndst)) {
+                       netdev_dbg(dev, "no route to %pI6\n",
+@@ -2134,7 +2132,7 @@ static void vxlan_xmit_one(struct sk_buf
+                       return;
+               }
+               udp_tunnel6_xmit_skb(ndst, sk, skb, dev,
+-                                   &src->sin6.sin6_addr,
++                                   &local_ip.sin6.sin6_addr,
+                                    &dst->sin6.sin6_addr, tos, ttl,
+                                    label, src_port, dst_port, !udp_sum);
+ #endif
diff --git a/queue-4.9/vxlan-lock-rcu-on-tx-path.patch b/queue-4.9/vxlan-lock-rcu-on-tx-path.patch

new file mode 100644 (file)

index 0000000..e34a842
--- /dev/null
+++ b/queue-4.9/vxlan-lock-rcu-on-tx-path.patch
@@ -0,0 +1,85 @@
+From foo@baz Sat Mar 18 22:03:25 CST 2017
+From: Jakub Kicinski <jakub.kicinski@netronome.com>
+Date: Fri, 24 Feb 2017 11:43:36 -0800
+Subject: vxlan: lock RCU on TX path
+
+From: Jakub Kicinski <jakub.kicinski@netronome.com>
+
+
+[ Upstream commit 56de859e9967c070464a9a9f4f18d73f9447298e ]
+
+There are no guarantees that callers of the TX path will hold
+the RCU lock.  Grab it explicitly.
+
+Fixes: c6fcc4fc5f8b ("vxlan: avoid using stale vxlan socket.")
+Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/vxlan.c |   13 ++++++++-----
+ 1 file changed, 8 insertions(+), 5 deletions(-)
+
+--- a/drivers/net/vxlan.c
++++ b/drivers/net/vxlan.c
+@@ -1955,6 +1955,7 @@ static void vxlan_xmit_one(struct sk_buf
+ 
+       info = skb_tunnel_info(skb);
+ 
++      rcu_read_lock();
+       if (rdst) {
+               dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port;
+               vni = rdst->remote_vni;
+@@ -1985,7 +1986,7 @@ static void vxlan_xmit_one(struct sk_buf
+               if (did_rsc) {
+                       /* short-circuited back to local bridge */
+                       vxlan_encap_bypass(skb, vxlan, vxlan);
+-                      return;
++                      goto out_unlock;
+               }
+               goto drop;
+       }
+@@ -2054,7 +2055,7 @@ static void vxlan_xmit_one(struct sk_buf
+                       if (!dst_vxlan)
+                               goto tx_error;
+                       vxlan_encap_bypass(skb, vxlan, dst_vxlan);
+-                      return;
++                      goto out_unlock;
+               }
+ 
+               if (!info)
+@@ -2115,7 +2116,7 @@ static void vxlan_xmit_one(struct sk_buf
+                       if (!dst_vxlan)
+                               goto tx_error;
+                       vxlan_encap_bypass(skb, vxlan, dst_vxlan);
+-                      return;
++                      goto out_unlock;
+               }
+ 
+               if (!info)
+@@ -2129,7 +2130,7 @@ static void vxlan_xmit_one(struct sk_buf
+               if (err < 0) {
+                       dst_release(ndst);
+                       dev->stats.tx_errors++;
+-                      return;
++                      goto out_unlock;
+               }
+               udp_tunnel6_xmit_skb(ndst, sk, skb, dev,
+                                    &local_ip.sin6.sin6_addr,
+@@ -2137,7 +2138,8 @@ static void vxlan_xmit_one(struct sk_buf
+                                    label, src_port, dst_port, !udp_sum);
+ #endif
+       }
+-
++out_unlock:
++      rcu_read_unlock();
+       return;
+ 
+ drop:
+@@ -2153,6 +2155,7 @@ tx_error:
+       dev->stats.tx_errors++;
+ tx_free:
+       dev_kfree_skb(skb);
++      rcu_read_unlock();
+ }
+ 
+ /* Transmit local packets over Vxlan
author	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Sat, 18 Mar 2017 14:06:49 +0000 (22:06 +0800)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Sat, 18 Mar 2017 14:06:49 +0000 (22:06 +0800)
queue-4.9/act_connmark-avoid-crashing-on-malformed-nlattrs-with-null-parms.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/bpf-detect-identical-ptr_to_map_value_or_null-registers.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/bpf-fix-mark_reg_unknown_value-for-spilled-regs-on-map-value-marking.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/bpf-fix-regression-on-verifier-pruning-wrt-map-lookups.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/bpf-fix-state-equivalence.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/bridge-drop-netfilter-fake-rtable-unconditionally.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/dccp-fix-memory-leak-during-tear-down-of-unsuccessful-connection-request.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/dccp-fix-use-after-free-in-dccp_feat_activate_values.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/dccp-tcp-fix-routing-redirect-race.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/dccp-unlock-sock-before-calling-sk_free.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/geneve-lock-rcu-on-tx-path.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/ipv4-mask-tos-for-input-route.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/ipv6-avoid-write-to-a-possibly-cloned-skb.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/ipv6-make-ecmp-route-replacement-less-greedy.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/ipv6-orphan-skbs-in-reassembly-unit.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/l2tp-avoid-use-after-free-caused-by-l2tp_ip_backlog_recv.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/mlxsw-spectrum_router-avoid-potential-packets-loss.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/mpls-do-not-decrement-alive-counter-for-unregister-events.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/mpls-send-route-delete-notifications-when-router-module-is-unloaded.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/net-bridge-allow-ipv6-when-multicast-flood-is-disabled.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/net-don-t-call-strlen-on-the-user-buffer-in-packet_bind_spkt.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/net-fix-socket-refcounting-in-skb_complete_tx_timestamp.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/net-fix-socket-refcounting-in-skb_complete_wifi_ack.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/net-mlx5e-do-not-reduce-lro-wqe-size-when-not-using-build_skb.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/net-mlx5e-fix-wrong-cqe-decompression.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/net-mlx5e-register-unregister-vport-representors-on-interface-attach-detach.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/net-net_enable_timestamp-can-be-called-from-irq-contexts.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/net-phy-avoid-deadlock-during-phy_error.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/net-sched-act_skbmod-remove-unneeded-rcu_read_unlock-in-tcf_skbmod_dump.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/net-sched-actions-decrement-module-reference-count-after-table-flush.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/net-tunnel-set-inner-protocol-in-network-gro-hooks.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/strparser-destroy-workqueue-on-module-exit.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/tcp-dccp-block-bh-for-syn-processing.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/tcp-fix-various-issues-for-sockets-morphing-to-listen-state.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/tun-fix-premature-pollout-notification-on-tun-devices.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/uapi-fix-linux-packet_diag.h-userspace-compilation-error.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/vrf-fix-use-after-free-in-vrf_xmit.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/vti6-return-gre_key-for-vti6.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/vxlan-correctly-validate-vxlan-id-against-vxlan_n_vid.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/vxlan-don-t-allow-overwrite-of-config-src-addr.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/vxlan-lock-rcu-on-tx-path.patch	[new file with mode: 0644]	patch \| blob