From: Greg Kroah-Hartman Date: Sat, 18 Mar 2017 14:06:49 +0000 (+0800) Subject: 4.9-stable patches X-Git-Tag: v4.4.56~13 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=91608625f4ca539ae9958d0c390c410bc95880d7;p=thirdparty%2Fkernel%2Fstable-queue.git 4.9-stable patches added patches: act_connmark-avoid-crashing-on-malformed-nlattrs-with-null-parms.patch bpf-detect-identical-ptr_to_map_value_or_null-registers.patch bpf-fix-mark_reg_unknown_value-for-spilled-regs-on-map-value-marking.patch bpf-fix-regression-on-verifier-pruning-wrt-map-lookups.patch bpf-fix-state-equivalence.patch bridge-drop-netfilter-fake-rtable-unconditionally.patch dccp-fix-memory-leak-during-tear-down-of-unsuccessful-connection-request.patch dccp-fix-use-after-free-in-dccp_feat_activate_values.patch dccp-tcp-fix-routing-redirect-race.patch dccp-unlock-sock-before-calling-sk_free.patch geneve-lock-rcu-on-tx-path.patch ipv4-mask-tos-for-input-route.patch ipv6-avoid-write-to-a-possibly-cloned-skb.patch ipv6-make-ecmp-route-replacement-less-greedy.patch ipv6-orphan-skbs-in-reassembly-unit.patch l2tp-avoid-use-after-free-caused-by-l2tp_ip_backlog_recv.patch mlxsw-spectrum_router-avoid-potential-packets-loss.patch mpls-do-not-decrement-alive-counter-for-unregister-events.patch mpls-send-route-delete-notifications-when-router-module-is-unloaded.patch net-bridge-allow-ipv6-when-multicast-flood-is-disabled.patch net-don-t-call-strlen-on-the-user-buffer-in-packet_bind_spkt.patch net-fix-socket-refcounting-in-skb_complete_tx_timestamp.patch net-fix-socket-refcounting-in-skb_complete_wifi_ack.patch net-mlx5e-do-not-reduce-lro-wqe-size-when-not-using-build_skb.patch net-mlx5e-fix-wrong-cqe-decompression.patch net-mlx5e-register-unregister-vport-representors-on-interface-attach-detach.patch net-net_enable_timestamp-can-be-called-from-irq-contexts.patch net-phy-avoid-deadlock-during-phy_error.patch net-sched-act_skbmod-remove-unneeded-rcu_read_unlock-in-tcf_skbmod_dump.patch 
net-sched-actions-decrement-module-reference-count-after-table-flush.patch net-tunnel-set-inner-protocol-in-network-gro-hooks.patch strparser-destroy-workqueue-on-module-exit.patch tcp-dccp-block-bh-for-syn-processing.patch tcp-fix-various-issues-for-sockets-morphing-to-listen-state.patch tun-fix-premature-pollout-notification-on-tun-devices.patch uapi-fix-linux-packet_diag.h-userspace-compilation-error.patch vrf-fix-use-after-free-in-vrf_xmit.patch vti6-return-gre_key-for-vti6.patch vxlan-correctly-validate-vxlan-id-against-vxlan_n_vid.patch vxlan-don-t-allow-overwrite-of-config-src-addr.patch vxlan-lock-rcu-on-tx-path.patch --- diff --git a/queue-4.9/act_connmark-avoid-crashing-on-malformed-nlattrs-with-null-parms.patch b/queue-4.9/act_connmark-avoid-crashing-on-malformed-nlattrs-with-null-parms.patch new file mode 100644 index 00000000000..c6a90f0878a --- /dev/null +++ b/queue-4.9/act_connmark-avoid-crashing-on-malformed-nlattrs-with-null-parms.patch @@ -0,0 +1,57 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Etienne Noss +Date: Fri, 10 Mar 2017 16:55:32 +0100 +Subject: act_connmark: avoid crashing on malformed nlattrs with null parms + +From: Etienne Noss + + +[ Upstream commit 52491c7607c5527138095edf44c53169dc1ddb82 ] + +tcf_connmark_init does not check in its configuration if TCA_CONNMARK_PARMS +is set, resulting in a null pointer dereference when trying to access it. + +[501099.043007] BUG: unable to handle kernel NULL pointer dereference at 0000000000000004 +[501099.043039] IP: [] tcf_connmark_init+0x8b/0x180 [act_connmark] +... +[501099.044334] Call Trace: +[501099.044345] [] ? tcf_action_init_1+0x198/0x1b0 +[501099.044363] [] ? tcf_action_init+0xb0/0x120 +[501099.044380] [] ? tcf_exts_validate+0xc4/0x110 +[501099.044398] [] ? u32_set_parms+0xa7/0x270 [cls_u32] +[501099.044417] [] ? u32_change+0x680/0x87b [cls_u32] +[501099.044436] [] ? tc_ctl_tfilter+0x4dd/0x8a0 +[501099.044454] [] ? security_capable+0x41/0x60 +[501099.044471] [] ? 
rtnetlink_rcv_msg+0xe1/0x220 +[501099.044490] [] ? rtnl_newlink+0x870/0x870 +[501099.044507] [] ? netlink_rcv_skb+0xa1/0xc0 +[501099.044524] [] ? rtnetlink_rcv+0x24/0x30 +[501099.044541] [] ? netlink_unicast+0x184/0x230 +[501099.044558] [] ? netlink_sendmsg+0x2f8/0x3b0 +[501099.044576] [] ? sock_sendmsg+0x30/0x40 +[501099.044592] [] ? SYSC_sendto+0xd3/0x150 +[501099.044608] [] ? __do_page_fault+0x2d1/0x510 +[501099.044626] [] ? system_call_fast_compare_end+0xc/0x9b + +Fixes: 22a5dc0e5e3e ("net: sched: Introduce connmark action") +Signed-off-by: Étienne Noss +Signed-off-by: Victorien Molle +Acked-by: Cong Wang +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/sched/act_connmark.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/net/sched/act_connmark.c ++++ b/net/sched/act_connmark.c +@@ -113,6 +113,9 @@ static int tcf_connmark_init(struct net + if (ret < 0) + return ret; + ++ if (!tb[TCA_CONNMARK_PARMS]) ++ return -EINVAL; ++ + parm = nla_data(tb[TCA_CONNMARK_PARMS]); + + if (!tcf_hash_check(tn, parm->index, a, bind)) { diff --git a/queue-4.9/bpf-detect-identical-ptr_to_map_value_or_null-registers.patch b/queue-4.9/bpf-detect-identical-ptr_to_map_value_or_null-registers.patch new file mode 100644 index 00000000000..d888eb8b1ad --- /dev/null +++ b/queue-4.9/bpf-detect-identical-ptr_to_map_value_or_null-registers.patch @@ -0,0 +1,173 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Thomas Graf +Date: Tue, 18 Oct 2016 19:51:19 +0200 +Subject: bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers + +From: Thomas Graf + + +[ Upstream commit 57a09bf0a416700676e77102c28f9cfcb48267e0 ] + +A BPF program is required to check the return register of a +map_elem_lookup() call before accessing memory. The verifier keeps +track of this by converting the type of the result register from +PTR_TO_MAP_VALUE_OR_NULL to PTR_TO_MAP_VALUE after a conditional +jump ensures safety. 
This check is currently exclusively performed +for the result register 0. + +In the event the compiler reorders instructions, BPF_MOV64_REG +instructions may be moved before the conditional jump which causes +them to keep their type PTR_TO_MAP_VALUE_OR_NULL to which the +verifier objects when the register is accessed: + +0: (b7) r1 = 10 +1: (7b) *(u64 *)(r10 -8) = r1 +2: (bf) r2 = r10 +3: (07) r2 += -8 +4: (18) r1 = 0x59c00000 +6: (85) call 1 +7: (bf) r4 = r0 +8: (15) if r0 == 0x0 goto pc+1 + R0=map_value(ks=8,vs=8) R4=map_value_or_null(ks=8,vs=8) R10=fp +9: (7a) *(u64 *)(r4 +0) = 0 +R4 invalid mem access 'map_value_or_null' + +This commit extends the verifier to keep track of all identical +PTR_TO_MAP_VALUE_OR_NULL registers after a map_elem_lookup() by +assigning them an ID and then marking them all when the conditional +jump is observed. + +Signed-off-by: Thomas Graf +Reviewed-by: Josef Bacik +Acked-by: Daniel Borkmann +Acked-by: Alexei Starovoitov +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/bpf_verifier.h | 2 - + kernel/bpf/verifier.c | 61 +++++++++++++++++++++++++++++++------------ + 2 files changed, 46 insertions(+), 17 deletions(-) + +--- a/include/linux/bpf_verifier.h ++++ b/include/linux/bpf_verifier.h +@@ -24,13 +24,13 @@ struct bpf_reg_state { + */ + s64 min_value; + u64 max_value; ++ u32 id; + union { + /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */ + s64 imm; + + /* valid when type == PTR_TO_PACKET* */ + struct { +- u32 id; + u16 off; + u16 range; + }; +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -212,9 +212,10 @@ static void print_verifier_state(struct + else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || + t == PTR_TO_MAP_VALUE_OR_NULL || + t == PTR_TO_MAP_VALUE_ADJ) +- verbose("(ks=%d,vs=%d)", ++ verbose("(ks=%d,vs=%d,id=%u)", + reg->map_ptr->key_size, +- reg->map_ptr->value_size); ++ reg->map_ptr->value_size, ++ reg->id); + if (reg->min_value != 
BPF_REGISTER_MIN_RANGE) + verbose(",min_value=%lld", + (long long)reg->min_value); +@@ -447,6 +448,7 @@ static void mark_reg_unknown_value(struc + { + BUG_ON(regno >= MAX_BPF_REG); + regs[regno].type = UNKNOWN_VALUE; ++ regs[regno].id = 0; + regs[regno].imm = 0; + } + +@@ -1252,6 +1254,7 @@ static int check_call(struct bpf_verifie + return -EINVAL; + } + regs[BPF_REG_0].map_ptr = meta.map_ptr; ++ regs[BPF_REG_0].id = ++env->id_gen; + } else { + verbose("unknown return type %d of func %d\n", + fn->ret_type, func_id); +@@ -1668,8 +1671,7 @@ static int check_alu_op(struct bpf_verif + insn->src_reg); + return -EACCES; + } +- regs[insn->dst_reg].type = UNKNOWN_VALUE; +- regs[insn->dst_reg].map_ptr = NULL; ++ mark_reg_unknown_value(regs, insn->dst_reg); + } + } else { + /* case: R = imm +@@ -1931,6 +1933,38 @@ static void reg_set_min_max_inv(struct b + check_reg_overflow(true_reg); + } + ++static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, ++ enum bpf_reg_type type) ++{ ++ struct bpf_reg_state *reg = &regs[regno]; ++ ++ if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { ++ reg->type = type; ++ if (type == UNKNOWN_VALUE) ++ mark_reg_unknown_value(regs, regno); ++ } ++} ++ ++/* The logic is similar to find_good_pkt_pointers(), both could eventually ++ * be folded together at some point. 
++ */ ++static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, ++ enum bpf_reg_type type) ++{ ++ struct bpf_reg_state *regs = state->regs; ++ int i; ++ ++ for (i = 0; i < MAX_BPF_REG; i++) ++ mark_map_reg(regs, i, regs[regno].id, type); ++ ++ for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { ++ if (state->stack_slot_type[i] != STACK_SPILL) ++ continue; ++ mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, ++ regs[regno].id, type); ++ } ++} ++ + static int check_cond_jmp_op(struct bpf_verifier_env *env, + struct bpf_insn *insn, int *insn_idx) + { +@@ -2018,18 +2052,13 @@ static int check_cond_jmp_op(struct bpf_ + if (BPF_SRC(insn->code) == BPF_K && + insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && + dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { +- if (opcode == BPF_JEQ) { +- /* next fallthrough insn can access memory via +- * this register +- */ +- regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; +- /* branch targer cannot access it, since reg == 0 */ +- mark_reg_unknown_value(other_branch->regs, +- insn->dst_reg); +- } else { +- other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; +- mark_reg_unknown_value(regs, insn->dst_reg); +- } ++ /* Mark all identical map registers in each branch as either ++ * safe or unknown depending R == 0 or R != 0 conditional. ++ */ ++ mark_map_regs(this_branch, insn->dst_reg, ++ opcode == BPF_JEQ ? PTR_TO_MAP_VALUE : UNKNOWN_VALUE); ++ mark_map_regs(other_branch, insn->dst_reg, ++ opcode == BPF_JEQ ? 
UNKNOWN_VALUE : PTR_TO_MAP_VALUE); + } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT && + dst_reg->type == PTR_TO_PACKET && + regs[insn->src_reg].type == PTR_TO_PACKET_END) { diff --git a/queue-4.9/bpf-fix-mark_reg_unknown_value-for-spilled-regs-on-map-value-marking.patch b/queue-4.9/bpf-fix-mark_reg_unknown_value-for-spilled-regs-on-map-value-marking.patch new file mode 100644 index 00000000000..c2db16d91e1 --- /dev/null +++ b/queue-4.9/bpf-fix-mark_reg_unknown_value-for-spilled-regs-on-map-value-marking.patch @@ -0,0 +1,103 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Daniel Borkmann +Date: Sun, 18 Dec 2016 01:52:59 +0100 +Subject: bpf: fix mark_reg_unknown_value for spilled regs on map value marking + +From: Daniel Borkmann + + +[ Upstream commit 6760bf2ddde8ad64f8205a651223a93de3a35494 ] + +Martin reported a verifier issue that hit the BUG_ON() for his +test case in the mark_reg_unknown_value() function: + + [ 202.861380] kernel BUG at kernel/bpf/verifier.c:467! + [...] + [ 203.291109] Call Trace: + [ 203.296501] [] mark_map_reg+0x45/0x50 + [ 203.308225] [] mark_map_regs+0x78/0x90 + [ 203.320140] [] do_check+0x226d/0x2c90 + [ 203.331865] [] bpf_check+0x48b/0x780 + [ 203.343403] [] bpf_prog_load+0x27e/0x440 + [ 203.355705] [] ? handle_mm_fault+0x11af/0x1230 + [ 203.369158] [] ? security_capable+0x48/0x60 + [ 203.382035] [] SyS_bpf+0x124/0x960 + [ 203.393185] [] ? __do_page_fault+0x276/0x490 + [ 203.406258] [] entry_SYSCALL_64_fastpath+0x13/0x94 + +This issue got uncovered after the fix in a08dd0da5307 ("bpf: fix +regression on verifier pruning wrt map lookups"). The reason why it +wasn't noticed before was, because as mentioned in a08dd0da5307, +mark_map_regs() was doing the id matching incorrectly based on the +uncached regs[regno].id. 
So, in the first loop, we walked all regs +and as soon as we found regno == i, then this reg's id was cleared +when calling mark_reg_unknown_value() thus that every subsequent +register was probed against id of 0 (which, in combination with the +PTR_TO_MAP_VALUE_OR_NULL type is an invalid condition that no other +register state can hold), and therefore wasn't type transitioned such +as in the spilled register case for the second loop. + +Now since that got fixed, it turned out that 57a09bf0a416 ("bpf: +Detect identical PTR_TO_MAP_VALUE_OR_NULL registers") used +mark_reg_unknown_value() incorrectly for the spilled regs, and thus +hitting the BUG_ON() in some cases due to regno >= MAX_BPF_REG. + +Although spilled regs have the same type as the non-spilled regs +for the verifier state, that is, struct bpf_reg_state, they are +semantically different from the non-spilled regs. In other words, +there can be up to 64 (MAX_BPF_STACK / BPF_REG_SIZE) spilled regs +in the stack, for example, register R could have been spilled by +the program to stack location X, Y, Z, and in mark_map_regs() we +need to scan these stack slots of type STACK_SPILL for potential +registers that we have to transition from PTR_TO_MAP_VALUE_OR_NULL. +Therefore, depending on the location, the spilled_regs regno can +be a lot higher than just MAX_BPF_REG's value since we operate on +stack instead. The reset in mark_reg_unknown_value() itself is +just fine, only that the BUG_ON() was inappropriate for this. Fix +it by making a __mark_reg_unknown_value() version that can be +called from mark_map_reg() generically; we know for the non-spilled +case that the regno is always < MAX_BPF_REG anyway. + +Fixes: 57a09bf0a416 ("bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers") +Reported-by: Martin KaFai Lau +Signed-off-by: Daniel Borkmann +Acked-by: Alexei Starovoitov +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + kernel/bpf/verifier.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -444,14 +444,19 @@ static void init_reg_state(struct bpf_re + regs[BPF_REG_1].type = PTR_TO_CTX; + } + +-static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) ++static void __mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) + { +- BUG_ON(regno >= MAX_BPF_REG); + regs[regno].type = UNKNOWN_VALUE; + regs[regno].id = 0; + regs[regno].imm = 0; + } + ++static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno) ++{ ++ BUG_ON(regno >= MAX_BPF_REG); ++ __mark_reg_unknown_value(regs, regno); ++} ++ + static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno) + { + regs[regno].min_value = BPF_REGISTER_MIN_RANGE; +@@ -1946,7 +1951,7 @@ static void mark_map_reg(struct bpf_reg_ + */ + reg->id = 0; + if (type == UNKNOWN_VALUE) +- mark_reg_unknown_value(regs, regno); ++ __mark_reg_unknown_value(regs, regno); + } + } + diff --git a/queue-4.9/bpf-fix-regression-on-verifier-pruning-wrt-map-lookups.patch b/queue-4.9/bpf-fix-regression-on-verifier-pruning-wrt-map-lookups.patch new file mode 100644 index 00000000000..81aa8a08575 --- /dev/null +++ b/queue-4.9/bpf-fix-regression-on-verifier-pruning-wrt-map-lookups.patch @@ -0,0 +1,107 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Daniel Borkmann +Date: Thu, 15 Dec 2016 01:30:06 +0100 +Subject: bpf: fix regression on verifier pruning wrt map lookups + +From: Daniel Borkmann + + +[ Upstream commit a08dd0da5307ba01295c8383923e51e7997c3576 ] + +Commit 57a09bf0a416 ("bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL +registers") introduced a regression where existing programs stopped +loading due to reaching the verifier's maximum complexity limit, +whereas prior to this commit they were loading just fine; the affected +program has roughly 2k instructions. 
+ +What was found is that state pruning couldn't be performed effectively +anymore due to mismatches of the verifier's register state, in particular +in the id tracking. It doesn't mean that 57a09bf0a416 is incorrect per +se, but rather that verifier needs to perform a lot more work for the +same program with regards to involved map lookups. + +Since commit 57a09bf0a416 is only about tracking registers with type +PTR_TO_MAP_VALUE_OR_NULL, the id is only needed to follow registers +until they are promoted through pattern matching with a NULL check to +either PTR_TO_MAP_VALUE or UNKNOWN_VALUE type. After that point, the +id becomes irrelevant for the transitioned types. + +For UNKNOWN_VALUE, id is already reset to 0 via mark_reg_unknown_value(), +but not so for PTR_TO_MAP_VALUE where id is becoming stale. It's even +transferred further into other types that don't make use of it. Among +others, one example is where UNKNOWN_VALUE is set on function call +return with RET_INTEGER return type. + +states_equal() will then fall through the memcmp() on register state; +note that the second memcmp() uses offsetofend(), so the id is part of +that since d2a4dd37f6b4 ("bpf: fix state equivalence"). But the bisect +pointed already to 57a09bf0a416, where we really reach beyond complexity +limit. What I found was that states_equal() often failed in this +case due to id mismatches in spilled regs with registers in type +PTR_TO_MAP_VALUE. Unlike non-spilled regs, spilled regs just perform +a memcmp() on their reg state and don't have any other optimizations +in place, therefore also id was relevant in this case for making a +pruning decision. + +We can safely reset id to 0 as well when converting to PTR_TO_MAP_VALUE. +For the affected program, it resulted in a ~17 fold reduction of +complexity and let the program load fine again. Selftest suite also +runs fine. 
The only other place where env->id_gen is used currently is +through direct packet access, but for these cases id is long living, thus +a different scenario. + +Also, the current logic in mark_map_regs() is not fully correct when +marking NULL branch with UNKNOWN_VALUE. We need to cache the destination +reg's id in any case. Otherwise, once we marked that reg as UNKNOWN_VALUE, +its id is reset and any subsequent registers that hold the original id +and are of type PTR_TO_MAP_VALUE_OR_NULL won't be marked UNKNOWN_VALUE +anymore, since mark_map_reg() reuses the uncached regs[regno].id that +was just overridden. Note, we don't need to cache it outside of +mark_map_regs(), since it's called once on this_branch and the other +time on other_branch, which are both two independent verifier states. +A test case for this is added here, too. + +Fixes: 57a09bf0a416 ("bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers") +Signed-off-by: Daniel Borkmann +Acked-by: Thomas Graf +Acked-by: Alexei Starovoitov +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + kernel/bpf/verifier.c | 11 ++++++++--- + 1 file changed, 8 insertions(+), 3 deletions(-) + +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -1940,6 +1940,11 @@ static void mark_map_reg(struct bpf_reg_ + + if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { + reg->type = type; ++ /* We don't need id from this point onwards anymore, thus we ++ * should better reset it, so that state pruning has chances ++ * to take effect. 
++ */ ++ reg->id = 0; + if (type == UNKNOWN_VALUE) + mark_reg_unknown_value(regs, regno); + } +@@ -1952,16 +1957,16 @@ static void mark_map_regs(struct bpf_ver + enum bpf_reg_type type) + { + struct bpf_reg_state *regs = state->regs; ++ u32 id = regs[regno].id; + int i; + + for (i = 0; i < MAX_BPF_REG; i++) +- mark_map_reg(regs, i, regs[regno].id, type); ++ mark_map_reg(regs, i, id, type); + + for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { + if (state->stack_slot_type[i] != STACK_SPILL) + continue; +- mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, +- regs[regno].id, type); ++ mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, type); + } + } + diff --git a/queue-4.9/bpf-fix-state-equivalence.patch b/queue-4.9/bpf-fix-state-equivalence.patch new file mode 100644 index 00000000000..4e3adfd340f --- /dev/null +++ b/queue-4.9/bpf-fix-state-equivalence.patch @@ -0,0 +1,68 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Alexei Starovoitov +Date: Wed, 7 Dec 2016 10:57:59 -0800 +Subject: bpf: fix state equivalence + +From: Alexei Starovoitov + + +[ Upstream commit d2a4dd37f6b41fbcad76efbf63124eb3126c66fe ] + +Commits 57a09bf0a416 ("bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers") +and 484611357c19 ("bpf: allow access into map value arrays") by themselves +are correct, but in combination they make state equivalence ignore 'id' field +of the register state which can lead to accepting invalid program. + +Fixes: 57a09bf0a416 ("bpf: Detect identical PTR_TO_MAP_VALUE_OR_NULL registers") +Fixes: 484611357c19 ("bpf: allow access into map value arrays") +Signed-off-by: Alexei Starovoitov +Acked-by: Daniel Borkmann +Acked-by: Thomas Graf +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/bpf_verifier.h | 14 +++++++------- + kernel/bpf/verifier.c | 2 +- + 2 files changed, 8 insertions(+), 8 deletions(-) + +--- a/include/linux/bpf_verifier.h ++++ b/include/linux/bpf_verifier.h +@@ -18,13 +18,6 @@ + + struct bpf_reg_state { + enum bpf_reg_type type; +- /* +- * Used to determine if any memory access using this register will +- * result in a bad access. +- */ +- s64 min_value; +- u64 max_value; +- u32 id; + union { + /* valid when type == CONST_IMM | PTR_TO_STACK | UNKNOWN_VALUE */ + s64 imm; +@@ -40,6 +33,13 @@ struct bpf_reg_state { + */ + struct bpf_map *map_ptr; + }; ++ u32 id; ++ /* Used to determine if any memory access using this register will ++ * result in a bad access. These two fields must be last. ++ * See states_equal() ++ */ ++ s64 min_value; ++ u64 max_value; + }; + + enum bpf_stack_slot_type { +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -2498,7 +2498,7 @@ static bool states_equal(struct bpf_veri + * we didn't do a variable access into a map then we are a-ok. + */ + if (!varlen_map_access && +- rold->type == rcur->type && rold->imm == rcur->imm) ++ memcmp(rold, rcur, offsetofend(struct bpf_reg_state, id)) == 0) + continue; + + /* If we didn't map access then again we don't care about the diff --git a/queue-4.9/bridge-drop-netfilter-fake-rtable-unconditionally.patch b/queue-4.9/bridge-drop-netfilter-fake-rtable-unconditionally.patch new file mode 100644 index 00000000000..244c03c65a1 --- /dev/null +++ b/queue-4.9/bridge-drop-netfilter-fake-rtable-unconditionally.patch @@ -0,0 +1,83 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Florian Westphal +Date: Mon, 13 Mar 2017 17:38:17 +0100 +Subject: bridge: drop netfilter fake rtable unconditionally + +From: Florian Westphal + + +[ Upstream commit a13b2082ece95247779b9995c4e91b4246bed023 ] + +Andreas reports kernel oops during rmmod of the br_netfilter module. 
+Hannes debugged the oops down to a NULL rt6info->rt6i_indev. + +Problem is that br_netfilter has the nasty concept of adding a fake +rtable to skb->dst; this happens in a br_netfilter prerouting hook. + +A second hook (in bridge LOCAL_IN) is supposed to remove these again +before the skb is handed up the stack. + +However, on module unload hooks get unregistered which means an +skb could traverse the prerouting hook that attaches the fake_rtable, +while the 'fake rtable remove' hook gets removed from the hooklist +immediately after. + +Fixes: 34666d467cbf1e2e3c7 ("netfilter: bridge: move br_netfilter out of the core") +Reported-by: Andreas Karis +Debugged-by: Hannes Frederic Sowa +Signed-off-by: Florian Westphal +Acked-by: Pablo Neira Ayuso +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/bridge/br_input.c | 1 + + net/bridge/br_netfilter_hooks.c | 21 --------------------- + 2 files changed, 1 insertion(+), 21 deletions(-) + +--- a/net/bridge/br_input.c ++++ b/net/bridge/br_input.c +@@ -29,6 +29,7 @@ EXPORT_SYMBOL(br_should_route_hook); + static int + br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb) + { ++ br_drop_fake_rtable(skb); + return netif_receive_skb(skb); + } + +--- a/net/bridge/br_netfilter_hooks.c ++++ b/net/bridge/br_netfilter_hooks.c +@@ -521,21 +521,6 @@ static unsigned int br_nf_pre_routing(vo + } + + +-/* PF_BRIDGE/LOCAL_IN ************************************************/ +-/* The packet is locally destined, which requires a real +- * dst_entry, so detach the fake one. On the way up, the +- * packet would pass through PRE_ROUTING again (which already +- * took place when the packet entered the bridge), but we +- * register an IPv4 PRE_ROUTING 'sabotage' hook that will +- * prevent this from happening. 
*/ +-static unsigned int br_nf_local_in(void *priv, +- struct sk_buff *skb, +- const struct nf_hook_state *state) +-{ +- br_drop_fake_rtable(skb); +- return NF_ACCEPT; +-} +- + /* PF_BRIDGE/FORWARD *************************************************/ + static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb) + { +@@ -906,12 +891,6 @@ static struct nf_hook_ops br_nf_ops[] __ + .priority = NF_BR_PRI_BRNF, + }, + { +- .hook = br_nf_local_in, +- .pf = NFPROTO_BRIDGE, +- .hooknum = NF_BR_LOCAL_IN, +- .priority = NF_BR_PRI_BRNF, +- }, +- { + .hook = br_nf_forward_ip, + .pf = NFPROTO_BRIDGE, + .hooknum = NF_BR_FORWARD, diff --git a/queue-4.9/dccp-fix-memory-leak-during-tear-down-of-unsuccessful-connection-request.patch b/queue-4.9/dccp-fix-memory-leak-during-tear-down-of-unsuccessful-connection-request.patch new file mode 100644 index 00000000000..9fda3c4d142 --- /dev/null +++ b/queue-4.9/dccp-fix-memory-leak-during-tear-down-of-unsuccessful-connection-request.patch @@ -0,0 +1,33 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Hannes Frederic Sowa +Date: Mon, 13 Mar 2017 00:01:30 +0100 +Subject: dccp: fix memory leak during tear-down of unsuccessful connection request + +From: Hannes Frederic Sowa + + +[ Upstream commit 72ef9c4125c7b257e3a714d62d778ab46583d6a3 ] + +This patch fixes a memory leak, which happens if the connection request +is not fulfilled between parsing the DCCP options and handling the SYN +(because e.g. the backlog is full), because we forgot to free the +list of ack vectors. + +Reported-by: Jianwen Ji +Signed-off-by: Hannes Frederic Sowa +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/dccp/ccids/ccid2.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/net/dccp/ccids/ccid2.c ++++ b/net/dccp/ccids/ccid2.c +@@ -749,6 +749,7 @@ static void ccid2_hc_tx_exit(struct sock + for (i = 0; i < hc->tx_seqbufc; i++) + kfree(hc->tx_seqbuf[i]); + hc->tx_seqbufc = 0; ++ dccp_ackvec_parsed_cleanup(&hc->tx_av_chunks); + } + + static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) diff --git a/queue-4.9/dccp-fix-use-after-free-in-dccp_feat_activate_values.patch b/queue-4.9/dccp-fix-use-after-free-in-dccp_feat_activate_values.patch new file mode 100644 index 00000000000..159e444ca5b --- /dev/null +++ b/queue-4.9/dccp-fix-use-after-free-in-dccp_feat_activate_values.patch @@ -0,0 +1,237 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Eric Dumazet +Date: Sun, 5 Mar 2017 10:52:16 -0800 +Subject: dccp: fix use-after-free in dccp_feat_activate_values + +From: Eric Dumazet + + +[ Upstream commit 62f8f4d9066c1c6f2474845d1ca7e2891f2ae3fd ] + +Dmitry reported crashes in DCCP stack [1] + +Problem here is that when I got rid of listener spinlock, I missed the +fact that DCCP stores a complex state in struct dccp_request_sock, +while TCP does not. + +Since multiple cpus could access it at the same time, we need to add +protection. 
+ +[1] +BUG: KASAN: use-after-free in dccp_feat_activate_values+0x967/0xab0 +net/dccp/feat.c:1541 at addr ffff88003713be68 +Read of size 8 by task syz-executor2/8457 +CPU: 2 PID: 8457 Comm: syz-executor2 Not tainted 4.10.0-rc7+ #127 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 +Call Trace: + + __dump_stack lib/dump_stack.c:15 [inline] + dump_stack+0x292/0x398 lib/dump_stack.c:51 + kasan_object_err+0x1c/0x70 mm/kasan/report.c:162 + print_address_description mm/kasan/report.c:200 [inline] + kasan_report_error mm/kasan/report.c:289 [inline] + kasan_report.part.1+0x20e/0x4e0 mm/kasan/report.c:311 + kasan_report mm/kasan/report.c:332 [inline] + __asan_report_load8_noabort+0x29/0x30 mm/kasan/report.c:332 + dccp_feat_activate_values+0x967/0xab0 net/dccp/feat.c:1541 + dccp_create_openreq_child+0x464/0x610 net/dccp/minisocks.c:121 + dccp_v6_request_recv_sock+0x1f6/0x1960 net/dccp/ipv6.c:457 + dccp_check_req+0x335/0x5a0 net/dccp/minisocks.c:186 + dccp_v6_rcv+0x69e/0x1d00 net/dccp/ipv6.c:711 + ip6_input_finish+0x46d/0x17a0 net/ipv6/ip6_input.c:279 + NF_HOOK include/linux/netfilter.h:257 [inline] + ip6_input+0xdb/0x590 net/ipv6/ip6_input.c:322 + dst_input include/net/dst.h:507 [inline] + ip6_rcv_finish+0x289/0x890 net/ipv6/ip6_input.c:69 + NF_HOOK include/linux/netfilter.h:257 [inline] + ipv6_rcv+0x12ec/0x23d0 net/ipv6/ip6_input.c:203 + __netif_receive_skb_core+0x1ae5/0x3400 net/core/dev.c:4190 + __netif_receive_skb+0x2a/0x170 net/core/dev.c:4228 + process_backlog+0xe5/0x6c0 net/core/dev.c:4839 + napi_poll net/core/dev.c:5202 [inline] + net_rx_action+0xe70/0x1900 net/core/dev.c:5267 + __do_softirq+0x2fb/0xb7d kernel/softirq.c:284 + do_softirq_own_stack+0x1c/0x30 arch/x86/entry/entry_64.S:902 + + do_softirq.part.17+0x1e8/0x230 kernel/softirq.c:328 + do_softirq kernel/softirq.c:176 [inline] + __local_bh_enable_ip+0x1f2/0x200 kernel/softirq.c:181 + local_bh_enable include/linux/bottom_half.h:31 [inline] + rcu_read_unlock_bh 
include/linux/rcupdate.h:971 [inline] + ip6_finish_output2+0xbb0/0x23d0 net/ipv6/ip6_output.c:123 + ip6_finish_output+0x302/0x960 net/ipv6/ip6_output.c:148 + NF_HOOK_COND include/linux/netfilter.h:246 [inline] + ip6_output+0x1cb/0x8d0 net/ipv6/ip6_output.c:162 + ip6_xmit+0xcdf/0x20d0 include/net/dst.h:501 + inet6_csk_xmit+0x320/0x5f0 net/ipv6/inet6_connection_sock.c:179 + dccp_transmit_skb+0xb09/0x1120 net/dccp/output.c:141 + dccp_xmit_packet+0x215/0x760 net/dccp/output.c:280 + dccp_write_xmit+0x168/0x1d0 net/dccp/output.c:362 + dccp_sendmsg+0x79c/0xb10 net/dccp/proto.c:796 + inet_sendmsg+0x164/0x5b0 net/ipv4/af_inet.c:744 + sock_sendmsg_nosec net/socket.c:635 [inline] + sock_sendmsg+0xca/0x110 net/socket.c:645 + SYSC_sendto+0x660/0x810 net/socket.c:1687 + SyS_sendto+0x40/0x50 net/socket.c:1655 + entry_SYSCALL_64_fastpath+0x1f/0xc2 +RIP: 0033:0x4458b9 +RSP: 002b:00007f8ceb77bb58 EFLAGS: 00000282 ORIG_RAX: 000000000000002c +RAX: ffffffffffffffda RBX: 0000000000000017 RCX: 00000000004458b9 +RDX: 0000000000000023 RSI: 0000000020e60000 RDI: 0000000000000017 +RBP: 00000000006e1b90 R08: 00000000200f9fe1 R09: 0000000000000020 +R10: 0000000000008010 R11: 0000000000000282 R12: 00000000007080a8 +R13: 0000000000000000 R14: 00007f8ceb77c9c0 R15: 00007f8ceb77c700 +Object at ffff88003713be50, in cache kmalloc-64 size: 64 +Allocated: +PID = 8446 + save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:57 + save_stack+0x43/0xd0 mm/kasan/kasan.c:502 + set_track mm/kasan/kasan.c:514 [inline] + kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:605 + kmem_cache_alloc_trace+0x82/0x270 mm/slub.c:2738 + kmalloc include/linux/slab.h:490 [inline] + dccp_feat_entry_new+0x214/0x410 net/dccp/feat.c:467 + dccp_feat_push_change+0x38/0x220 net/dccp/feat.c:487 + __feat_register_sp+0x223/0x2f0 net/dccp/feat.c:741 + dccp_feat_propagate_ccid+0x22b/0x2b0 net/dccp/feat.c:949 + dccp_feat_server_ccid_dependencies+0x1b3/0x250 net/dccp/feat.c:1012 + dccp_make_response+0x1f1/0xc90 net/dccp/output.c:423 + 
dccp_v6_send_response+0x4ec/0xc20 net/dccp/ipv6.c:217 + dccp_v6_conn_request+0xaba/0x11b0 net/dccp/ipv6.c:377 + dccp_rcv_state_process+0x51e/0x1650 net/dccp/input.c:606 + dccp_v6_do_rcv+0x213/0x350 net/dccp/ipv6.c:632 + sk_backlog_rcv include/net/sock.h:893 [inline] + __sk_receive_skb+0x36f/0xcc0 net/core/sock.c:479 + dccp_v6_rcv+0xba5/0x1d00 net/dccp/ipv6.c:742 + ip6_input_finish+0x46d/0x17a0 net/ipv6/ip6_input.c:279 + NF_HOOK include/linux/netfilter.h:257 [inline] + ip6_input+0xdb/0x590 net/ipv6/ip6_input.c:322 + dst_input include/net/dst.h:507 [inline] + ip6_rcv_finish+0x289/0x890 net/ipv6/ip6_input.c:69 + NF_HOOK include/linux/netfilter.h:257 [inline] + ipv6_rcv+0x12ec/0x23d0 net/ipv6/ip6_input.c:203 + __netif_receive_skb_core+0x1ae5/0x3400 net/core/dev.c:4190 + __netif_receive_skb+0x2a/0x170 net/core/dev.c:4228 + process_backlog+0xe5/0x6c0 net/core/dev.c:4839 + napi_poll net/core/dev.c:5202 [inline] + net_rx_action+0xe70/0x1900 net/core/dev.c:5267 + __do_softirq+0x2fb/0xb7d kernel/softirq.c:284 +Freed: +PID = 15 + save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:57 + save_stack+0x43/0xd0 mm/kasan/kasan.c:502 + set_track mm/kasan/kasan.c:514 [inline] + kasan_slab_free+0x73/0xc0 mm/kasan/kasan.c:578 + slab_free_hook mm/slub.c:1355 [inline] + slab_free_freelist_hook mm/slub.c:1377 [inline] + slab_free mm/slub.c:2954 [inline] + kfree+0xe8/0x2b0 mm/slub.c:3874 + dccp_feat_entry_destructor.part.4+0x48/0x60 net/dccp/feat.c:418 + dccp_feat_entry_destructor net/dccp/feat.c:416 [inline] + dccp_feat_list_pop net/dccp/feat.c:541 [inline] + dccp_feat_activate_values+0x57f/0xab0 net/dccp/feat.c:1543 + dccp_create_openreq_child+0x464/0x610 net/dccp/minisocks.c:121 + dccp_v6_request_recv_sock+0x1f6/0x1960 net/dccp/ipv6.c:457 + dccp_check_req+0x335/0x5a0 net/dccp/minisocks.c:186 + dccp_v6_rcv+0x69e/0x1d00 net/dccp/ipv6.c:711 + ip6_input_finish+0x46d/0x17a0 net/ipv6/ip6_input.c:279 + NF_HOOK include/linux/netfilter.h:257 [inline] + ip6_input+0xdb/0x590 
net/ipv6/ip6_input.c:322 + dst_input include/net/dst.h:507 [inline] + ip6_rcv_finish+0x289/0x890 net/ipv6/ip6_input.c:69 + NF_HOOK include/linux/netfilter.h:257 [inline] + ipv6_rcv+0x12ec/0x23d0 net/ipv6/ip6_input.c:203 + __netif_receive_skb_core+0x1ae5/0x3400 net/core/dev.c:4190 + __netif_receive_skb+0x2a/0x170 net/core/dev.c:4228 + process_backlog+0xe5/0x6c0 net/core/dev.c:4839 + napi_poll net/core/dev.c:5202 [inline] + net_rx_action+0xe70/0x1900 net/core/dev.c:5267 + __do_softirq+0x2fb/0xb7d kernel/softirq.c:284 +Memory state around the buggy address: + ffff88003713bd00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc + ffff88003713bd80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc +>ffff88003713be00: fc fc fc fc fc fc fc fc fc fc fb fb fb fb fb fb + ^ + +Fixes: 079096f103fa ("tcp/dccp: install syn_recv requests into ehash table") +Signed-off-by: Eric Dumazet +Reported-by: Dmitry Vyukov +Tested-by: Dmitry Vyukov +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/dccp.h | 1 + + net/dccp/minisocks.c | 24 ++++++++++++++++-------- + 2 files changed, 17 insertions(+), 8 deletions(-) + +--- a/include/linux/dccp.h ++++ b/include/linux/dccp.h +@@ -163,6 +163,7 @@ struct dccp_request_sock { + __u64 dreq_isr; + __u64 dreq_gsr; + __be32 dreq_service; ++ spinlock_t dreq_lock; + struct list_head dreq_featneg; + __u32 dreq_timestamp_echo; + __u32 dreq_timestamp_time; +--- a/net/dccp/minisocks.c ++++ b/net/dccp/minisocks.c +@@ -146,6 +146,13 @@ struct sock *dccp_check_req(struct sock + struct dccp_request_sock *dreq = dccp_rsk(req); + bool own_req; + ++ /* TCP/DCCP listeners became lockless. ++ * DCCP stores complex state in its request_sock, so we need ++ * a protection for them, now this code runs without being protected ++ * by the parent (listener) lock. 
++ */ ++ spin_lock_bh(&dreq->dreq_lock); ++ + /* Check for retransmitted REQUEST */ + if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) { + +@@ -160,7 +167,7 @@ struct sock *dccp_check_req(struct sock + inet_rtx_syn_ack(sk, req); + } + /* Network Duplicate, discard packet */ +- return NULL; ++ goto out; + } + + DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; +@@ -186,20 +193,20 @@ struct sock *dccp_check_req(struct sock + + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, + req, &own_req); +- if (!child) +- goto listen_overflow; +- +- return inet_csk_complete_hashdance(sk, child, req, own_req); ++ if (child) { ++ child = inet_csk_complete_hashdance(sk, child, req, own_req); ++ goto out; ++ } + +-listen_overflow: +- dccp_pr_debug("listen_overflow!\n"); + DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY; + drop: + if (dccp_hdr(skb)->dccph_type != DCCP_PKT_RESET) + req->rsk_ops->send_reset(sk, skb); + + inet_csk_reqsk_queue_drop(sk, req); +- return NULL; ++out: ++ spin_unlock_bh(&dreq->dreq_lock); ++ return child; + } + + EXPORT_SYMBOL_GPL(dccp_check_req); +@@ -250,6 +257,7 @@ int dccp_reqsk_init(struct request_sock + { + struct dccp_request_sock *dreq = dccp_rsk(req); + ++ spin_lock_init(&dreq->dreq_lock); + inet_rsk(req)->ir_rmt_port = dccp_hdr(skb)->dccph_sport; + inet_rsk(req)->ir_num = ntohs(dccp_hdr(skb)->dccph_dport); + inet_rsk(req)->acked = 0; diff --git a/queue-4.9/dccp-tcp-fix-routing-redirect-race.patch b/queue-4.9/dccp-tcp-fix-routing-redirect-race.patch new file mode 100644 index 00000000000..4b0ce16a478 --- /dev/null +++ b/queue-4.9/dccp-tcp-fix-routing-redirect-race.patch @@ -0,0 +1,160 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Jon Maxwell +Date: Fri, 10 Mar 2017 16:40:33 +1100 +Subject: dccp/tcp: fix routing redirect race + +From: Jon Maxwell + + +[ Upstream commit 45caeaa5ac0b4b11784ac6f932c0ad4c6b67cda0 ] + +As Eric Dumazet pointed out this also needs to be fixed in IPv6. 
+v2: Contains the IPv6 tcp/IPv6 dccp patches as well. + +We have seen a few incidents lately where a dst_entry has been freed +with a dangling TCP socket reference (sk->sk_dst_cache) pointing to that +dst_entry. If the conditions/timings are right a crash then ensues when the +freed dst_entry is referenced later on. A common crashing back trace is: + + #8 [] page_fault at ffffffff8163e648 + [exception RIP: __tcp_ack_snd_check+74] +. +. + #9 [] tcp_rcv_established at ffffffff81580b64 +#10 [] tcp_v4_do_rcv at ffffffff8158b54a +#11 [] tcp_v4_rcv at ffffffff8158cd02 +#12 [] ip_local_deliver_finish at ffffffff815668f4 +#13 [] ip_local_deliver at ffffffff81566bd9 +#14 [] ip_rcv_finish at ffffffff8156656d +#15 [] ip_rcv at ffffffff81566f06 +#16 [] __netif_receive_skb_core at ffffffff8152b3a2 +#17 [] __netif_receive_skb at ffffffff8152b608 +#18 [] netif_receive_skb at ffffffff8152b690 +#19 [] vmxnet3_rq_rx_complete at ffffffffa015eeaf [vmxnet3] +#20 [] vmxnet3_poll_rx_only at ffffffffa015f32a [vmxnet3] +#21 [] net_rx_action at ffffffff8152bac2 +#22 [] __do_softirq at ffffffff81084b4f +#23 [] call_softirq at ffffffff8164845c +#24 [] do_softirq at ffffffff81016fc5 +#25 [] irq_exit at ffffffff81084ee5 +#26 [] do_IRQ at ffffffff81648ff8 + +Of course it may happen with other NIC drivers as well. + +The freed dst_entry was found here: + + 224 static bool tcp_in_quickack_mode(struct sock *sk)↩ + 225 {↩ + 226 ▹ const struct inet_connection_sock *icsk = inet_csk(sk);↩ + 227 ▹ const struct dst_entry *dst = __sk_dst_get(sk);↩ + 228 ↩ + 229 ▹ return (dst && dst_metric(dst, RTAX_QUICKACK)) ||↩ + 230 ▹ ▹ (icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong);↩ + 231 }↩ + +But there are other backtraces attributed to the same freed dst_entry in +netfilter code as well. + +All the vmcores showed 2 significant clues: + +- Remote hosts behind the default gateway had always been redirected to a +different gateway. A rtable/dst_entry will be added for that host.
Making +more dst_entrys with lower reference counts. Making this more probable. + +- All vmcores showed a positive LockDroppedIcmps value, e.g.: + +LockDroppedIcmps 267 + +A closer look at the tcp_v4_err() handler revealed that do_redirect() will run +regardless of whether user space has the socket locked. This can result in a +race condition where the same dst_entry cached in sk->sk_dst_cache can be +decremented twice for the same socket via: + +do_redirect()->__sk_dst_check()-> dst_release(). + +Which leads to the dst_entry being prematurely freed with another socket +pointing to it via sk->sk_dst_cache and a subsequent crash. + +To fix this skip do_redirect() if user space has the socket locked. Instead let +the redirect take place later when user space does not have the socket +locked. + +The dccp/IPv6 code is very similar in this respect, so fixing it there too. + +As Eric Garver pointed out the following commit now invalidates routes. Which +can set the dst->obsolete flag so that ipv4_dst_check() returns null and +triggers the dst_release(). + +Fixes: ceb3320610d6 ("ipv4: Kill routes during PMTU/redirect updates.") +Cc: Eric Garver +Cc: Hannes Sowa +Signed-off-by: Jon Maxwell +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/dccp/ipv4.c | 3 ++- + net/dccp/ipv6.c | 8 +++++--- + net/ipv4/tcp_ipv4.c | 3 ++- + net/ipv6/tcp_ipv6.c | 8 +++++--- + 4 files changed, 14 insertions(+), 8 deletions(-) + +--- a/net/dccp/ipv4.c ++++ b/net/dccp/ipv4.c +@@ -289,7 +289,8 @@ static void dccp_v4_err(struct sk_buff * + + switch (type) { + case ICMP_REDIRECT: +- dccp_do_redirect(skb, sk); ++ if (!sock_owned_by_user(sk)) ++ dccp_do_redirect(skb, sk); + goto out; + case ICMP_SOURCE_QUENCH: + /* Just silently ignore these.
*/ +--- a/net/dccp/ipv6.c ++++ b/net/dccp/ipv6.c +@@ -122,10 +122,12 @@ static void dccp_v6_err(struct sk_buff * + np = inet6_sk(sk); + + if (type == NDISC_REDIRECT) { +- struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); ++ if (!sock_owned_by_user(sk)) { ++ struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); + +- if (dst) +- dst->ops->redirect(dst, sk, skb); ++ if (dst) ++ dst->ops->redirect(dst, sk, skb); ++ } + goto out; + } + +--- a/net/ipv4/tcp_ipv4.c ++++ b/net/ipv4/tcp_ipv4.c +@@ -421,7 +421,8 @@ void tcp_v4_err(struct sk_buff *icmp_skb + + switch (type) { + case ICMP_REDIRECT: +- do_redirect(icmp_skb, sk); ++ if (!sock_owned_by_user(sk)) ++ do_redirect(icmp_skb, sk); + goto out; + case ICMP_SOURCE_QUENCH: + /* Just silently ignore these. */ +--- a/net/ipv6/tcp_ipv6.c ++++ b/net/ipv6/tcp_ipv6.c +@@ -375,10 +375,12 @@ static void tcp_v6_err(struct sk_buff *s + np = inet6_sk(sk); + + if (type == NDISC_REDIRECT) { +- struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); ++ if (!sock_owned_by_user(sk)) { ++ struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); + +- if (dst) +- dst->ops->redirect(dst, sk, skb); ++ if (dst) ++ dst->ops->redirect(dst, sk, skb); ++ } + goto out; + } + diff --git a/queue-4.9/dccp-unlock-sock-before-calling-sk_free.patch b/queue-4.9/dccp-unlock-sock-before-calling-sk_free.patch new file mode 100644 index 00000000000..9153f584485 --- /dev/null +++ b/queue-4.9/dccp-unlock-sock-before-calling-sk_free.patch @@ -0,0 +1,81 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Arnaldo Carvalho de Melo +Date: Wed, 1 Mar 2017 16:35:07 -0300 +Subject: dccp: Unlock sock before calling sk_free() + +From: Arnaldo Carvalho de Melo + + +[ Upstream commit d5afb6f9b6bb2c57bd0c05e76e12489dc0d037d9 ] + +The code where sk_clone() came from created a new socket and locked it, +but then, on the error path didn't unlock it. 
+ +This problem stayed there for a long while, till b0691c8ee7c2 ("net: +Unlock sock before calling sk_free()") fixed it, but unfortunately the +callers of sk_clone() (now sk_clone_lock()) were not audited and the +one in dccp_create_openreq_child() remained. + +Now in the age of the syzkaller fuzzer, this was finally uncovered, as +reported by Dmitry: + + ---- 8< ---- + +I've got the following report while running syzkaller fuzzer on +86292b33d4b7 ("Merge branch 'akpm' (patches from Andrew)") + + [ BUG: held lock freed! ] + 4.10.0+ #234 Not tainted + ------------------------- + syz-executor6/6898 is freeing memory + ffff88006286cac0-ffff88006286d3b7, with a lock still held there! + (slock-AF_INET6){+.-...}, at: [] spin_lock + include/linux/spinlock.h:299 [inline] + (slock-AF_INET6){+.-...}, at: [] + sk_clone_lock+0x3d9/0x12c0 net/core/sock.c:1504 + 5 locks held by syz-executor6/6898: + #0: (sk_lock-AF_INET6){+.+.+.}, at: [] lock_sock + include/net/sock.h:1460 [inline] + #0: (sk_lock-AF_INET6){+.+.+.}, at: [] + inet_stream_connect+0x44/0xa0 net/ipv4/af_inet.c:681 + #1: (rcu_read_lock){......}, at: [] + inet6_csk_xmit+0x12a/0x5d0 net/ipv6/inet6_connection_sock.c:126 + #2: (rcu_read_lock){......}, at: [] __skb_unlink + include/linux/skbuff.h:1767 [inline] + #2: (rcu_read_lock){......}, at: [] __skb_dequeue + include/linux/skbuff.h:1783 [inline] + #2: (rcu_read_lock){......}, at: [] + process_backlog+0x264/0x730 net/core/dev.c:4835 + #3: (rcu_read_lock){......}, at: [] + ip6_input_finish+0x0/0x1700 net/ipv6/ip6_input.c:59 + #4: (slock-AF_INET6){+.-...}, at: [] spin_lock + include/linux/spinlock.h:299 [inline] + #4: (slock-AF_INET6){+.-...}, at: [] + sk_clone_lock+0x3d9/0x12c0 net/core/sock.c:1504 + +Fix it just as was done by b0691c8ee7c2 ("net: Unlock sock before calling +sk_free()").
+ +Reported-by: Dmitry Vyukov +Cc: Cong Wang +Cc: Eric Dumazet +Cc: Gerrit Renker +Cc: Thomas Gleixner +Link: http://lkml.kernel.org/r/20170301153510.GE15145@kernel.org +Signed-off-by: Arnaldo Carvalho de Melo +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/dccp/minisocks.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/net/dccp/minisocks.c ++++ b/net/dccp/minisocks.c +@@ -122,6 +122,7 @@ struct sock *dccp_create_openreq_child(c + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() */ + newsk->sk_destruct = NULL; ++ bh_unlock_sock(newsk); + sk_free(newsk); + return NULL; + } diff --git a/queue-4.9/geneve-lock-rcu-on-tx-path.patch b/queue-4.9/geneve-lock-rcu-on-tx-path.patch new file mode 100644 index 00000000000..2cd033070bb --- /dev/null +++ b/queue-4.9/geneve-lock-rcu-on-tx-path.patch @@ -0,0 +1,48 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Jakub Kicinski +Date: Fri, 24 Feb 2017 11:43:37 -0800 +Subject: geneve: lock RCU on TX path + +From: Jakub Kicinski + + +[ Upstream commit a717e3f740803cc88bd5c9a70c93504f6a368663 ] + +There are no guarantees that callers of the TX path will hold +the RCU lock. Grab it explicitly. + +Fixes: fceb9c3e3825 ("geneve: avoid using stale geneve socket.") +Signed-off-by: Jakub Kicinski +Signed-off-by: David S.
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/geneve.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +--- a/drivers/net/geneve.c ++++ b/drivers/net/geneve.c +@@ -1039,16 +1039,22 @@ static netdev_tx_t geneve_xmit(struct sk + { + struct geneve_dev *geneve = netdev_priv(dev); + struct ip_tunnel_info *info = NULL; ++ int err; + + if (geneve->collect_md) + info = skb_tunnel_info(skb); + ++ rcu_read_lock(); + #if IS_ENABLED(CONFIG_IPV6) + if ((info && ip_tunnel_info_af(info) == AF_INET6) || + (!info && geneve->remote.sa.sa_family == AF_INET6)) +- return geneve6_xmit_skb(skb, dev, info); ++ err = geneve6_xmit_skb(skb, dev, info); ++ else + #endif +- return geneve_xmit_skb(skb, dev, info); ++ err = geneve_xmit_skb(skb, dev, info); ++ rcu_read_unlock(); ++ ++ return err; + } + + static int __geneve_change_mtu(struct net_device *dev, int new_mtu, bool strict) diff --git a/queue-4.9/ipv4-mask-tos-for-input-route.patch b/queue-4.9/ipv4-mask-tos-for-input-route.patch new file mode 100644 index 00000000000..b3946a53f42 --- /dev/null +++ b/queue-4.9/ipv4-mask-tos-for-input-route.patch @@ -0,0 +1,35 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Julian Anastasov +Date: Sun, 26 Feb 2017 17:14:35 +0200 +Subject: ipv4: mask tos for input route + +From: Julian Anastasov + + +[ Upstream commit 6e28099d38c0e50d62c1afc054e37e573adf3d21 ] + +Restore the lost masking of TOS in input route code to +allow ip rules to match it properly. + +Problem [1] noticed by Shmulik Ladkani + +[1] http://marc.info/?t=137331755300040&r=1&w=2 + +Fixes: 89aef8921bfb ("ipv4: Delete routing cache.") +Signed-off-by: Julian Anastasov +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/route.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/net/ipv4/route.c ++++ b/net/ipv4/route.c +@@ -1968,6 +1968,7 @@ int ip_route_input_noref(struct sk_buff + { + int res; + ++ tos &= IPTOS_RT_MASK; + rcu_read_lock(); + + /* Multicast recognition logic is moved from route cache to here. diff --git a/queue-4.9/ipv6-avoid-write-to-a-possibly-cloned-skb.patch b/queue-4.9/ipv6-avoid-write-to-a-possibly-cloned-skb.patch new file mode 100644 index 00000000000..ceafd11fe74 --- /dev/null +++ b/queue-4.9/ipv6-avoid-write-to-a-possibly-cloned-skb.patch @@ -0,0 +1,65 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Florian Westphal +Date: Mon, 13 Mar 2017 16:24:28 +0100 +Subject: ipv6: avoid write to a possibly cloned skb + +From: Florian Westphal + + +[ Upstream commit 79e49503efe53a8c51d8b695bedc8a346c5e4a87 ] + +ip6_fragment, in case skb has a fraglist, checks if the +skb is cloned. If it is, it moves to the 'slow path' and allocates +new skbs for each fragment. + +However, right before entering the slowpath loop, it updates the +nexthdr value of the last ipv6 extension header to NEXTHDR_FRAGMENT, +to account for the fragment header that will be inserted in the new +ipv6-fragment skbs. + +In case original skb is cloned this munges nexthdr value of another +skb. Avoid this by doing the nexthdr update for each of the new fragment +skbs separately. + +This was observed with tcpdump on a bridge device where netfilter ipv6 +reassembly is active: tcpdump shows malformed fragment headers as +the l4 header (icmpv6, tcp, etc.) is decoded as a fragment header. + +Cc: Hannes Frederic Sowa +Reported-by: Andreas Karis +Signed-off-by: Florian Westphal +Signed-off-by: David S.
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/ip6_output.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +--- a/net/ipv6/ip6_output.c ++++ b/net/ipv6/ip6_output.c +@@ -757,13 +757,14 @@ slow_path: + * Fragment the datagram. + */ + +- *prevhdr = NEXTHDR_FRAGMENT; + troom = rt->dst.dev->needed_tailroom; + + /* + * Keep copying data until we run out. + */ + while (left > 0) { ++ u8 *fragnexthdr_offset; ++ + len = left; + /* IF: it doesn't fit, use 'mtu' - the data space left */ + if (len > mtu) +@@ -808,6 +809,10 @@ slow_path: + */ + skb_copy_from_linear_data(skb, skb_network_header(frag), hlen); + ++ fragnexthdr_offset = skb_network_header(frag); ++ fragnexthdr_offset += prevhdr - skb_network_header(skb); ++ *fragnexthdr_offset = NEXTHDR_FRAGMENT; ++ + /* + * Build fragment header. + */ diff --git a/queue-4.9/ipv6-make-ecmp-route-replacement-less-greedy.patch b/queue-4.9/ipv6-make-ecmp-route-replacement-less-greedy.patch new file mode 100644 index 00000000000..f6ff8218a34 --- /dev/null +++ b/queue-4.9/ipv6-make-ecmp-route-replacement-less-greedy.patch @@ -0,0 +1,71 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Sabrina Dubroca +Date: Mon, 13 Mar 2017 13:28:09 +0100 +Subject: ipv6: make ECMP route replacement less greedy + +From: Sabrina Dubroca + + +[ Upstream commit 67e194007be08d071294456274dd53e0a04fdf90 ] + +Commit 27596472473a ("ipv6: fix ECMP route replacement") introduced a +loop that removes all siblings of an ECMP route that is being +replaced. However, this loop doesn't stop when it has replaced +siblings, and keeps removing other routes with a higher metric. +We also end up triggering the WARN_ON after the loop, because after +this nsiblings < 0. + +Instead, stop the loop when we have taken care of all routes with the +same metric as the route being replaced. 
+ + Reproducer: + =========== + #!/bin/sh + + ip netns add ns1 + ip netns add ns2 + ip -net ns1 link set lo up + + for x in 0 1 2 ; do + ip link add veth$x netns ns2 type veth peer name eth$x netns ns1 + ip -net ns1 link set eth$x up + ip -net ns2 link set veth$x up + done + + ip -net ns1 -6 r a 2000::/64 nexthop via fe80::0 dev eth0 \ + nexthop via fe80::1 dev eth1 nexthop via fe80::2 dev eth2 + ip -net ns1 -6 r a 2000::/64 via fe80::42 dev eth0 metric 256 + ip -net ns1 -6 r a 2000::/64 via fe80::43 dev eth0 metric 2048 + + echo "before replace, 3 routes" + ip -net ns1 -6 r | grep -v '^fe80\|^ff00' + echo + + ip -net ns1 -6 r c 2000::/64 nexthop via fe80::4 dev eth0 \ + nexthop via fe80::5 dev eth1 nexthop via fe80::6 dev eth2 + + echo "after replace, only 2 routes, metric 2048 is gone" + ip -net ns1 -6 r | grep -v '^fe80\|^ff00' + +Fixes: 27596472473a ("ipv6: fix ECMP route replacement") +Signed-off-by: Sabrina Dubroca +Acked-by: Nicolas Dichtel +Reviewed-by: Xin Long +Reviewed-by: Michal Kubecek +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/ip6_fib.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/net/ipv6/ip6_fib.c ++++ b/net/ipv6/ip6_fib.c +@@ -908,6 +908,8 @@ add: + ins = &rt->dst.rt6_next; + iter = *ins; + while (iter) { ++ if (iter->rt6i_metric > rt->rt6i_metric) ++ break; + if (rt6_qualify_for_ecmp(iter)) { + *ins = iter->dst.rt6_next; + fib6_purge_rt(iter, fn, info->nl_net); diff --git a/queue-4.9/ipv6-orphan-skbs-in-reassembly-unit.patch b/queue-4.9/ipv6-orphan-skbs-in-reassembly-unit.patch new file mode 100644 index 00000000000..6a88a73d2dd --- /dev/null +++ b/queue-4.9/ipv6-orphan-skbs-in-reassembly-unit.patch @@ -0,0 +1,172 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Eric Dumazet +Date: Wed, 1 Mar 2017 14:45:06 -0800 +Subject: ipv6: orphan skbs in reassembly unit + +From: Eric Dumazet + + +[ Upstream commit 48cac18ecf1de82f76259a54402c3adb7839ad01 ] + +Andrey reported a use-after-free in IPv6 stack. 
+ +Issue here is that we free the socket while it still has skb +in TX path and in some queues. + +It happens here because IPv6 reassembly unit messes skb->truesize, +breaking skb_set_owner_w() badly. + +We fixed a similar issue for IPV4 in commit 8282f27449bf ("inet: frag: +Always orphan skbs inside ip_defrag()") +Acked-by: Joe Stringer + +================================================================== +BUG: KASAN: use-after-free in sock_wfree+0x118/0x120 +Read of size 8 at addr ffff880062da0060 by task a.out/4140 + +page:ffffea00018b6800 count:1 mapcount:0 mapping: (null) +index:0x0 compound_mapcount: 0 +flags: 0x100000000008100(slab|head) +raw: 0100000000008100 0000000000000000 0000000000000000 0000000180130013 +raw: dead000000000100 dead000000000200 ffff88006741f140 0000000000000000 +page dumped because: kasan: bad access detected + +CPU: 0 PID: 4140 Comm: a.out Not tainted 4.10.0-rc3+ #59 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 +Call Trace: + __dump_stack lib/dump_stack.c:15 + dump_stack+0x292/0x398 lib/dump_stack.c:51 + describe_address mm/kasan/report.c:262 + kasan_report_error+0x121/0x560 mm/kasan/report.c:370 + kasan_report mm/kasan/report.c:392 + __asan_report_load8_noabort+0x3e/0x40 mm/kasan/report.c:413 + sock_flag ./arch/x86/include/asm/bitops.h:324 + sock_wfree+0x118/0x120 net/core/sock.c:1631 + skb_release_head_state+0xfc/0x250 net/core/skbuff.c:655 + skb_release_all+0x15/0x60 net/core/skbuff.c:668 + __kfree_skb+0x15/0x20 net/core/skbuff.c:684 + kfree_skb+0x16e/0x4e0 net/core/skbuff.c:705 + inet_frag_destroy+0x121/0x290 net/ipv4/inet_fragment.c:304 + inet_frag_put ./include/net/inet_frag.h:133 + nf_ct_frag6_gather+0x1125/0x38b0 net/ipv6/netfilter/nf_conntrack_reasm.c:617 + ipv6_defrag+0x21b/0x350 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c:68 + nf_hook_entry_hookfn ./include/linux/netfilter.h:102 + nf_hook_slow+0xc3/0x290 net/netfilter/core.c:310 + nf_hook ./include/linux/netfilter.h:212 + 
__ip6_local_out+0x52c/0xaf0 net/ipv6/output_core.c:160 + ip6_local_out+0x2d/0x170 net/ipv6/output_core.c:170 + ip6_send_skb+0xa1/0x340 net/ipv6/ip6_output.c:1722 + ip6_push_pending_frames+0xb3/0xe0 net/ipv6/ip6_output.c:1742 + rawv6_push_pending_frames net/ipv6/raw.c:613 + rawv6_sendmsg+0x2cff/0x4130 net/ipv6/raw.c:927 + inet_sendmsg+0x164/0x5b0 net/ipv4/af_inet.c:744 + sock_sendmsg_nosec net/socket.c:635 + sock_sendmsg+0xca/0x110 net/socket.c:645 + sock_write_iter+0x326/0x620 net/socket.c:848 + new_sync_write fs/read_write.c:499 + __vfs_write+0x483/0x760 fs/read_write.c:512 + vfs_write+0x187/0x530 fs/read_write.c:560 + SYSC_write fs/read_write.c:607 + SyS_write+0xfb/0x230 fs/read_write.c:599 + entry_SYSCALL_64_fastpath+0x1f/0xc2 arch/x86/entry/entry_64.S:203 +RIP: 0033:0x7ff26e6f5b79 +RSP: 002b:00007ff268e0ed98 EFLAGS: 00000206 ORIG_RAX: 0000000000000001 +RAX: ffffffffffffffda RBX: 00007ff268e0f9c0 RCX: 00007ff26e6f5b79 +RDX: 0000000000000010 RSI: 0000000020f50fe1 RDI: 0000000000000003 +RBP: 00007ff26ebc1220 R08: 0000000000000000 R09: 0000000000000000 +R10: 0000000000000000 R11: 0000000000000206 R12: 0000000000000000 +R13: 00007ff268e0f9c0 R14: 00007ff26efec040 R15: 0000000000000003 + +The buggy address belongs to the object at ffff880062da0000 + which belongs to the cache RAWv6 of size 1504 +The buggy address ffff880062da0060 is located 96 bytes inside + of 1504-byte region [ffff880062da0000, ffff880062da05e0) + +Freed by task 4113: + save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:57 + save_stack+0x43/0xd0 mm/kasan/kasan.c:502 + set_track mm/kasan/kasan.c:514 + kasan_slab_free+0x73/0xc0 mm/kasan/kasan.c:578 + slab_free_hook mm/slub.c:1352 + slab_free_freelist_hook mm/slub.c:1374 + slab_free mm/slub.c:2951 + kmem_cache_free+0xb2/0x2c0 mm/slub.c:2973 + sk_prot_free net/core/sock.c:1377 + __sk_destruct+0x49c/0x6e0 net/core/sock.c:1452 + sk_destruct+0x47/0x80 net/core/sock.c:1460 + __sk_free+0x57/0x230 net/core/sock.c:1468 + sk_free+0x23/0x30 
net/core/sock.c:1479 + sock_put ./include/net/sock.h:1638 + sk_common_release+0x31e/0x4e0 net/core/sock.c:2782 + rawv6_close+0x54/0x80 net/ipv6/raw.c:1214 + inet_release+0xed/0x1c0 net/ipv4/af_inet.c:425 + inet6_release+0x50/0x70 net/ipv6/af_inet6.c:431 + sock_release+0x8d/0x1e0 net/socket.c:599 + sock_close+0x16/0x20 net/socket.c:1063 + __fput+0x332/0x7f0 fs/file_table.c:208 + ____fput+0x15/0x20 fs/file_table.c:244 + task_work_run+0x19b/0x270 kernel/task_work.c:116 + exit_task_work ./include/linux/task_work.h:21 + do_exit+0x186b/0x2800 kernel/exit.c:839 + do_group_exit+0x149/0x420 kernel/exit.c:943 + SYSC_exit_group kernel/exit.c:954 + SyS_exit_group+0x1d/0x20 kernel/exit.c:952 + entry_SYSCALL_64_fastpath+0x1f/0xc2 arch/x86/entry/entry_64.S:203 + +Allocated by task 4115: + save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:57 + save_stack+0x43/0xd0 mm/kasan/kasan.c:502 + set_track mm/kasan/kasan.c:514 + kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:605 + kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:544 + slab_post_alloc_hook mm/slab.h:432 + slab_alloc_node mm/slub.c:2708 + slab_alloc mm/slub.c:2716 + kmem_cache_alloc+0x1af/0x250 mm/slub.c:2721 + sk_prot_alloc+0x65/0x2a0 net/core/sock.c:1334 + sk_alloc+0x105/0x1010 net/core/sock.c:1396 + inet6_create+0x44d/0x1150 net/ipv6/af_inet6.c:183 + __sock_create+0x4f6/0x880 net/socket.c:1199 + sock_create net/socket.c:1239 + SYSC_socket net/socket.c:1269 + SyS_socket+0xf9/0x230 net/socket.c:1249 + entry_SYSCALL_64_fastpath+0x1f/0xc2 arch/x86/entry/entry_64.S:203 + +Memory state around the buggy address: + ffff880062d9ff00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc + ffff880062d9ff80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc +>ffff880062da0000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb + ^ + ffff880062da0080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb + ffff880062da0100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb +================================================================== + +Reported-by: 
Andrey Konovalov +Signed-off-by: Eric Dumazet +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/netfilter/nf_conntrack_reasm.c | 1 + + net/openvswitch/conntrack.c | 1 - + 2 files changed, 1 insertion(+), 1 deletion(-) + +--- a/net/ipv6/netfilter/nf_conntrack_reasm.c ++++ b/net/ipv6/netfilter/nf_conntrack_reasm.c +@@ -589,6 +589,7 @@ int nf_ct_frag6_gather(struct net *net, + hdr = ipv6_hdr(skb); + fhdr = (struct frag_hdr *)skb_transport_header(skb); + ++ skb_orphan(skb); + fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr, + skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); + if (fq == NULL) { +--- a/net/openvswitch/conntrack.c ++++ b/net/openvswitch/conntrack.c +@@ -367,7 +367,6 @@ static int handle_fragments(struct net * + } else if (key->eth.type == htons(ETH_P_IPV6)) { + enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone; + +- skb_orphan(skb); + memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); + err = nf_ct_frag6_gather(net, skb, user); + if (err) { diff --git a/queue-4.9/l2tp-avoid-use-after-free-caused-by-l2tp_ip_backlog_recv.patch b/queue-4.9/l2tp-avoid-use-after-free-caused-by-l2tp_ip_backlog_recv.patch new file mode 100644 index 00000000000..f7af6975e5c --- /dev/null +++ b/queue-4.9/l2tp-avoid-use-after-free-caused-by-l2tp_ip_backlog_recv.patch @@ -0,0 +1,32 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Paul Hüber +Date: Sun, 26 Feb 2017 17:58:19 +0100 +Subject: l2tp: avoid use-after-free caused by l2tp_ip_backlog_recv + +From: Paul Hüber + + +[ Upstream commit 51fb60eb162ab84c5edf2ae9c63cf0b878e5547e ] + +l2tp_ip_backlog_recv may not return -1 if the packet gets dropped. +The return value is passed up to ip_local_deliver_finish, which treats +negative values as an IP protocol number for resubmission. + +Signed-off-by: Paul Hüber +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/l2tp/l2tp_ip.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/l2tp/l2tp_ip.c ++++ b/net/l2tp/l2tp_ip.c +@@ -388,7 +388,7 @@ static int l2tp_ip_backlog_recv(struct s + drop: + IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS); + kfree_skb(skb); +- return -1; ++ return 0; + } + + /* Userspace will call sendmsg() on the tunnel socket to send L2TP diff --git a/queue-4.9/mlxsw-spectrum_router-avoid-potential-packets-loss.patch b/queue-4.9/mlxsw-spectrum_router-avoid-potential-packets-loss.patch new file mode 100644 index 00000000000..90b49a80a9f --- /dev/null +++ b/queue-4.9/mlxsw-spectrum_router-avoid-potential-packets-loss.patch @@ -0,0 +1,78 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Ido Schimmel +Date: Tue, 28 Feb 2017 08:55:40 +0100 +Subject: mlxsw: spectrum_router: Avoid potential packets loss + +From: Ido Schimmel + + +[ Upstream commit f7df4923fa986247e93ec2cdff5ca168fff14dcf ] + +When the structure of the LPM tree changes (f.e., due to the addition of +a new prefix), we unbind the old tree and then bind the new one. This +may result in temporary packet loss. + +Instead, overwrite the old binding with the new one. + +Fixes: 6b75c4807db3 ("mlxsw: spectrum_router: Add virtual router management") +Signed-off-by: Ido Schimmel +Signed-off-by: Jiri Pirko +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 30 ++++++++++++------ + 1 file changed, 20 insertions(+), 10 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c ++++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +@@ -500,30 +500,40 @@ static int + mlxsw_sp_vr_lpm_tree_check(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_vr *vr, + struct mlxsw_sp_prefix_usage *req_prefix_usage) + { +- struct mlxsw_sp_lpm_tree *lpm_tree; ++ struct mlxsw_sp_lpm_tree *lpm_tree = vr->lpm_tree; ++ struct mlxsw_sp_lpm_tree *new_tree; ++ int err; + +- if (mlxsw_sp_prefix_usage_eq(req_prefix_usage, +- &vr->lpm_tree->prefix_usage)) ++ if (mlxsw_sp_prefix_usage_eq(req_prefix_usage, &lpm_tree->prefix_usage)) + return 0; + +- lpm_tree = mlxsw_sp_lpm_tree_get(mlxsw_sp, req_prefix_usage, ++ new_tree = mlxsw_sp_lpm_tree_get(mlxsw_sp, req_prefix_usage, + vr->proto, false); +- if (IS_ERR(lpm_tree)) { ++ if (IS_ERR(new_tree)) { + /* We failed to get a tree according to the required + * prefix usage. However, the current tree might be still good + * for us if our requirement is subset of the prefixes used + * in the tree. 
+ */ + if (mlxsw_sp_prefix_usage_subset(req_prefix_usage, +- &vr->lpm_tree->prefix_usage)) ++ &lpm_tree->prefix_usage)) + return 0; +- return PTR_ERR(lpm_tree); ++ return PTR_ERR(new_tree); + } + +- mlxsw_sp_vr_lpm_tree_unbind(mlxsw_sp, vr); +- mlxsw_sp_lpm_tree_put(mlxsw_sp, vr->lpm_tree); ++ /* Prevent packet loss by overwriting existing binding */ ++ vr->lpm_tree = new_tree; ++ err = mlxsw_sp_vr_lpm_tree_bind(mlxsw_sp, vr); ++ if (err) ++ goto err_tree_bind; ++ mlxsw_sp_lpm_tree_put(mlxsw_sp, lpm_tree); ++ ++ return 0; ++ ++err_tree_bind: + vr->lpm_tree = lpm_tree; +- return mlxsw_sp_vr_lpm_tree_bind(mlxsw_sp, vr); ++ mlxsw_sp_lpm_tree_put(mlxsw_sp, new_tree); ++ return err; + } + + static struct mlxsw_sp_vr *mlxsw_sp_vr_get(struct mlxsw_sp *mlxsw_sp, diff --git a/queue-4.9/mpls-do-not-decrement-alive-counter-for-unregister-events.patch b/queue-4.9/mpls-do-not-decrement-alive-counter-for-unregister-events.patch new file mode 100644 index 00000000000..85ab3b59287 --- /dev/null +++ b/queue-4.9/mpls-do-not-decrement-alive-counter-for-unregister-events.patch @@ -0,0 +1,53 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: David Ahern +Date: Fri, 10 Mar 2017 14:11:39 -0800 +Subject: mpls: Do not decrement alive counter for unregister events + +From: David Ahern + + +[ Upstream commit 79099aab38c8f5c746748b066ae74ba984fe2cc8 ] + +Multipath routes can be rendered useless when a device in one of the +paths is deleted. For example: + +$ ip -f mpls ro ls +100 + nexthop as to 200 via inet 172.16.2.2 dev virt12 + nexthop as to 300 via inet 172.16.3.2 dev br0 +101 + nexthop as to 201 via inet6 2000:2::2 dev virt12 + nexthop as to 301 via inet6 2000:3::2 dev br0 + +$ ip li del br0 + +When br0 is deleted the other hop is not considered in +mpls_select_multipath because of the alive check -- rt_nhn_alive +is 0. + +rt_nhn_alive is decremented once in mpls_ifdown when the device is taken +down (NETDEV_DOWN) and again when it is deleted (NETDEV_UNREGISTER). 
For +a 2 hop route, deleting one device drops the alive count to 0. Since +devices are taken down before unregistering, the decrement on +NETDEV_UNREGISTER is redundant. + +Fixes: c89359a42e2a4 ("mpls: support for dead routes") +Signed-off-by: David Ahern +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/mpls/af_mpls.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/net/mpls/af_mpls.c ++++ b/net/mpls/af_mpls.c +@@ -956,7 +956,8 @@ static void mpls_ifdown(struct net_devic + /* fall through */ + case NETDEV_CHANGE: + nh->nh_flags |= RTNH_F_LINKDOWN; +- ACCESS_ONCE(rt->rt_nhn_alive) = rt->rt_nhn_alive - 1; ++ if (event != NETDEV_UNREGISTER) ++ ACCESS_ONCE(rt->rt_nhn_alive) = rt->rt_nhn_alive - 1; + break; + } + if (event == NETDEV_UNREGISTER) diff --git a/queue-4.9/mpls-send-route-delete-notifications-when-router-module-is-unloaded.patch b/queue-4.9/mpls-send-route-delete-notifications-when-router-module-is-unloaded.patch new file mode 100644 index 00000000000..8e3aff38455 --- /dev/null +++ b/queue-4.9/mpls-send-route-delete-notifications-when-router-module-is-unloaded.patch @@ -0,0 +1,33 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: David Ahern +Date: Fri, 10 Mar 2017 09:46:15 -0800 +Subject: mpls: Send route delete notifications when router module is unloaded + +From: David Ahern + + +[ Upstream commit e37791ec1ad785b59022ae211f63a16189bacebf ] + +When the mpls_router module is unloaded, mpls routes are deleted but +notifications are not sent to userspace leaving userspace caches +out of sync. Add the call to mpls_notify_route in mpls_net_exit as +routes are freed. + +Fixes: 0189197f44160 ("mpls: Basic routing support") +Signed-off-by: David Ahern +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/mpls/af_mpls.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/net/mpls/af_mpls.c ++++ b/net/mpls/af_mpls.c +@@ -1696,6 +1696,7 @@ static void mpls_net_exit(struct net *ne + for (index = 0; index < platform_labels; index++) { + struct mpls_route *rt = rtnl_dereference(platform_label[index]); + RCU_INIT_POINTER(platform_label[index], NULL); ++ mpls_notify_route(net, index, rt, NULL, NULL); + mpls_rt_free(rt); + } + rtnl_unlock(); diff --git a/queue-4.9/net-bridge-allow-ipv6-when-multicast-flood-is-disabled.patch b/queue-4.9/net-bridge-allow-ipv6-when-multicast-flood-is-disabled.patch new file mode 100644 index 00000000000..416061fe181 --- /dev/null +++ b/queue-4.9/net-bridge-allow-ipv6-when-multicast-flood-is-disabled.patch @@ -0,0 +1,37 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Mike Manning +Date: Wed, 1 Mar 2017 09:55:28 +0000 +Subject: net: bridge: allow IPv6 when multicast flood is disabled + +From: Mike Manning + + +[ Upstream commit 8953de2f02ad7b15e4964c82f9afd60f128e4e98 ] + +Even with multicast flooding turned off, IPv6 ND should still work so +that IPv6 connectivity is provided. Allow this by continuing to flood +multicast traffic originated by us. + +Fixes: b6cb5ac8331b ("net: bridge: add per-port multicast flood flag") +Cc: Nikolay Aleksandrov +Signed-off-by: Mike Manning +Acked-by: Nikolay Aleksandrov +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/bridge/br_forward.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/net/bridge/br_forward.c ++++ b/net/bridge/br_forward.c +@@ -186,8 +186,9 @@ void br_flood(struct net_bridge *br, str + /* Do not flood unicast traffic to ports that turn it off */ + if (pkt_type == BR_PKT_UNICAST && !(p->flags & BR_FLOOD)) + continue; ++ /* Do not flood if mc off, except for traffic we originate */ + if (pkt_type == BR_PKT_MULTICAST && +- !(p->flags & BR_MCAST_FLOOD)) ++ !(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev) + continue; + + /* Do not flood to ports that enable proxy ARP */ diff --git a/queue-4.9/net-don-t-call-strlen-on-the-user-buffer-in-packet_bind_spkt.patch b/queue-4.9/net-don-t-call-strlen-on-the-user-buffer-in-packet_bind_spkt.patch new file mode 100644 index 00000000000..1731bec51c3 --- /dev/null +++ b/queue-4.9/net-don-t-call-strlen-on-the-user-buffer-in-packet_bind_spkt.patch @@ -0,0 +1,108 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Alexander Potapenko +Date: Wed, 1 Mar 2017 12:57:20 +0100 +Subject: net: don't call strlen() on the user buffer in packet_bind_spkt() + +From: Alexander Potapenko + + +[ Upstream commit 540e2894f7905538740aaf122bd8e0548e1c34a4 ] + +KMSAN (KernelMemorySanitizer, a new error detection tool) reports use of +uninitialized memory in packet_bind_spkt(): +Acked-by: Eric Dumazet + +================================================================== +BUG: KMSAN: use of unitialized memory +CPU: 0 PID: 1074 Comm: packet Not tainted 4.8.0-rc6+ #1891 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs +01/01/2011 + 0000000000000000 ffff88006b6dfc08 ffffffff82559ae8 ffff88006b6dfb48 + ffffffff818a7c91 ffffffff85b9c870 0000000000000092 ffffffff85b9c550 + 0000000000000000 0000000000000092 00000000ec400911 0000000000000002 +Call Trace: + [< inline >] __dump_stack lib/dump_stack.c:15 + [] dump_stack+0x238/0x290 lib/dump_stack.c:51 + [] 
kmsan_report+0x276/0x2e0 mm/kmsan/kmsan.c:1003 + [] __msan_warning+0x5b/0xb0 +mm/kmsan/kmsan_instr.c:424 + [< inline >] strlen lib/string.c:484 + [] strlcpy+0x9d/0x200 lib/string.c:144 + [] packet_bind_spkt+0x144/0x230 +net/packet/af_packet.c:3132 + [] SYSC_bind+0x40d/0x5f0 net/socket.c:1370 + [] SyS_bind+0x82/0xa0 net/socket.c:1356 + [] entry_SYSCALL_64_fastpath+0x13/0x8f +arch/x86/entry/entry_64.o:? +chained origin: 00000000eba00911 + [] save_stack_trace+0x27/0x50 +arch/x86/kernel/stacktrace.c:67 + [< inline >] kmsan_save_stack_with_flags mm/kmsan/kmsan.c:322 + [< inline >] kmsan_save_stack mm/kmsan/kmsan.c:334 + [] kmsan_internal_chain_origin+0x118/0x1e0 +mm/kmsan/kmsan.c:527 + [] __msan_set_alloca_origin4+0xc3/0x130 +mm/kmsan/kmsan_instr.c:380 + [] SYSC_bind+0x129/0x5f0 net/socket.c:1356 + [] SyS_bind+0x82/0xa0 net/socket.c:1356 + [] entry_SYSCALL_64_fastpath+0x13/0x8f +arch/x86/entry/entry_64.o:? +origin description: ----address@SYSC_bind (origin=00000000eb400911) +================================================================== +(the line numbers are relative to 4.8-rc6, but the bug persists +upstream) + +, when I run the following program as root: + +===================================== + #include + #include + #include + #include + + int main() { + struct sockaddr addr; + memset(&addr, 0xff, sizeof(addr)); + addr.sa_family = AF_PACKET; + int fd = socket(PF_PACKET, SOCK_PACKET, htons(ETH_P_ALL)); + bind(fd, &addr, sizeof(addr)); + return 0; + } +===================================== + +This happens because addr.sa_data copied from the userspace is not +zero-terminated, and copying it with strlcpy() in packet_bind_spkt() +results in calling strlen() on the kernel copy of that non-terminated +buffer. + +Signed-off-by: Alexander Potapenko +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/packet/af_packet.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +--- a/net/packet/af_packet.c ++++ b/net/packet/af_packet.c +@@ -3140,7 +3140,7 @@ static int packet_bind_spkt(struct socke + int addr_len) + { + struct sock *sk = sock->sk; +- char name[15]; ++ char name[sizeof(uaddr->sa_data) + 1]; + + /* + * Check legality +@@ -3148,7 +3148,11 @@ static int packet_bind_spkt(struct socke + + if (addr_len != sizeof(struct sockaddr)) + return -EINVAL; +- strlcpy(name, uaddr->sa_data, sizeof(name)); ++ /* uaddr->sa_data comes from the userspace, it's not guaranteed to be ++ * zero-terminated. ++ */ ++ memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data)); ++ name[sizeof(uaddr->sa_data)] = 0; + + return packet_do_bind(sk, name, 0, pkt_sk(sk)->num); + } diff --git a/queue-4.9/net-fix-socket-refcounting-in-skb_complete_tx_timestamp.patch b/queue-4.9/net-fix-socket-refcounting-in-skb_complete_tx_timestamp.patch new file mode 100644 index 00000000000..08bfe567e10 --- /dev/null +++ b/queue-4.9/net-fix-socket-refcounting-in-skb_complete_tx_timestamp.patch @@ -0,0 +1,53 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Eric Dumazet +Date: Fri, 3 Mar 2017 21:01:03 -0800 +Subject: net: fix socket refcounting in skb_complete_tx_timestamp() + +From: Eric Dumazet + + +[ Upstream commit 9ac25fc063751379cb77434fef9f3b088cd3e2f7 ] + +TX skbs do not necessarily hold a reference on skb->sk->sk_refcnt +By the time TX completion happens, sk_refcnt might be already 0. + +sock_hold()/sock_put() would then corrupt critical state, like +sk_wmem_alloc and lead to leaks or use after free. + +Fixes: 62bccb8cdb69 ("net-timestamp: Make the clone operation stand-alone from phy timestamping") +Signed-off-by: Eric Dumazet +Cc: Alexander Duyck +Cc: Johannes Berg +Cc: Soheil Hassas Yeganeh +Cc: Willem de Bruijn +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/skbuff.c | 15 ++++++++------- + 1 file changed, 8 insertions(+), 7 deletions(-) + +--- a/net/core/skbuff.c ++++ b/net/core/skbuff.c +@@ -3814,13 +3814,14 @@ void skb_complete_tx_timestamp(struct sk + if (!skb_may_tx_timestamp(sk, false)) + return; + +- /* take a reference to prevent skb_orphan() from freeing the socket */ +- sock_hold(sk); +- +- *skb_hwtstamps(skb) = *hwtstamps; +- __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND); +- +- sock_put(sk); ++ /* Take a reference to prevent skb_orphan() from freeing the socket, ++ * but only if the socket refcount is not zero. ++ */ ++ if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) { ++ *skb_hwtstamps(skb) = *hwtstamps; ++ __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND); ++ sock_put(sk); ++ } + } + EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp); + diff --git a/queue-4.9/net-fix-socket-refcounting-in-skb_complete_wifi_ack.patch b/queue-4.9/net-fix-socket-refcounting-in-skb_complete_wifi_ack.patch new file mode 100644 index 00000000000..98ee783a72d --- /dev/null +++ b/queue-4.9/net-fix-socket-refcounting-in-skb_complete_wifi_ack.patch @@ -0,0 +1,62 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Eric Dumazet +Date: Fri, 3 Mar 2017 21:01:02 -0800 +Subject: net: fix socket refcounting in skb_complete_wifi_ack() + +From: Eric Dumazet + + +[ Upstream commit dd4f10722aeb10f4f582948839f066bebe44e5fb ] + +TX skbs do not necessarily hold a reference on skb->sk->sk_refcnt +By the time TX completion happens, sk_refcnt might be already 0. + +sock_hold()/sock_put() would then corrupt critical state, like +sk_wmem_alloc. + +Fixes: bf7fa551e0ce ("mac80211: Resolve sk_refcnt/sk_wmem_alloc issue in wifi ack path") +Signed-off-by: Eric Dumazet +Cc: Alexander Duyck +Cc: Johannes Berg +Cc: Soheil Hassas Yeganeh +Cc: Willem de Bruijn +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/skbuff.c | 15 ++++++++------- + 1 file changed, 8 insertions(+), 7 deletions(-) + +--- a/net/core/skbuff.c ++++ b/net/core/skbuff.c +@@ -3871,7 +3871,7 @@ void skb_complete_wifi_ack(struct sk_buf + { + struct sock *sk = skb->sk; + struct sock_exterr_skb *serr; +- int err; ++ int err = 1; + + skb->wifi_acked_valid = 1; + skb->wifi_acked = acked; +@@ -3881,14 +3881,15 @@ void skb_complete_wifi_ack(struct sk_buf + serr->ee.ee_errno = ENOMSG; + serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS; + +- /* take a reference to prevent skb_orphan() from freeing the socket */ +- sock_hold(sk); +- +- err = sock_queue_err_skb(sk, skb); ++ /* Take a reference to prevent skb_orphan() from freeing the socket, ++ * but only if the socket refcount is not zero. ++ */ ++ if (likely(atomic_inc_not_zero(&sk->sk_refcnt))) { ++ err = sock_queue_err_skb(sk, skb); ++ sock_put(sk); ++ } + if (err) + kfree_skb(skb); +- +- sock_put(sk); + } + EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); + diff --git a/queue-4.9/net-mlx5e-do-not-reduce-lro-wqe-size-when-not-using-build_skb.patch b/queue-4.9/net-mlx5e-do-not-reduce-lro-wqe-size-when-not-using-build_skb.patch new file mode 100644 index 00000000000..9e5e4c2cf55 --- /dev/null +++ b/queue-4.9/net-mlx5e-do-not-reduce-lro-wqe-size-when-not-using-build_skb.patch @@ -0,0 +1,56 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Tariq Toukan +Date: Wed, 22 Feb 2017 17:20:13 +0200 +Subject: net/mlx5e: Do not reduce LRO WQE size when not using build_skb + +From: Tariq Toukan + + +[ Upstream commit 4078e637c12f1e0a74293f1ec9563f42bff14a03 ] + +When rq_type is Striding RQ, no room of SKB_RESERVE is needed +as SKB allocation is not done via build_skb. + +Fixes: e4b85508072b ("net/mlx5e: Slightly reduce hardware LRO size") +Signed-off-by: Tariq Toukan +Signed-off-by: Saeed Mahameed +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 11 +++++------ + 1 file changed, 5 insertions(+), 6 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +@@ -81,6 +81,7 @@ static bool mlx5e_check_fragmented_strid + static void mlx5e_set_rq_type_params(struct mlx5e_priv *priv, u8 rq_type) + { + priv->params.rq_wq_type = rq_type; ++ priv->params.lro_wqe_sz = MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ; + switch (priv->params.rq_wq_type) { + case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ: + priv->params.log_rq_size = MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE_MPW; +@@ -92,6 +93,10 @@ static void mlx5e_set_rq_type_params(str + break; + default: /* MLX5_WQ_TYPE_LINKED_LIST */ + priv->params.log_rq_size = MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE; ++ ++ /* Extra room needed for build_skb */ ++ priv->params.lro_wqe_sz -= MLX5_RX_HEADROOM + ++ SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + } + priv->params.min_rx_wqes = mlx5_min_rx_wqes(priv->params.rq_wq_type, + BIT(priv->params.log_rq_size)); +@@ -3473,12 +3478,6 @@ static void mlx5e_build_nic_netdev_priv( + mlx5e_build_default_indir_rqt(mdev, priv->params.indirection_rqt, + MLX5E_INDIR_RQT_SIZE, profile->max_nch(mdev)); + +- priv->params.lro_wqe_sz = +- MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ - +- /* Extra room needed for build_skb */ +- MLX5_RX_HEADROOM - +- SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); +- + /* Initialize pflags */ + MLX5E_SET_PRIV_FLAG(priv, MLX5E_PFLAG_RX_CQE_BASED_MODER, + priv->params.rx_cq_period_mode == MLX5_CQ_PERIOD_MODE_START_FROM_CQE); diff --git a/queue-4.9/net-mlx5e-fix-wrong-cqe-decompression.patch b/queue-4.9/net-mlx5e-fix-wrong-cqe-decompression.patch new file mode 100644 index 00000000000..ee7984c57be --- /dev/null +++ b/queue-4.9/net-mlx5e-fix-wrong-cqe-decompression.patch @@ -0,0 +1,71 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Tariq Toukan +Date: Wed, 22 Feb 2017 17:20:16 +0200 
+Subject: net/mlx5e: Fix wrong CQE decompression + +From: Tariq Toukan + + +[ Upstream commit 36154be40a28e4afaa0416da2681d80b7e2ca319 ] + +In cqe compression with striding RQ, the decompression of the CQE field +wqe_counter was done with a wrong wraparound value. +This caused handling cqes with a wrong pointer to wqe (rx descriptor) +and creating SKBs with wrong data, pointing to wrong (and already consumed) +strides/pages. + +The meaning of the CQE field wqe_counter in striding RQ holds the +stride index instead of the WQE index. Hence, when decompressing +a CQE, wqe_counter should have wrapped-around the number of strides +in a single multi-packet WQE. + +We dropped this wrap-around mask at all in CQE decompression of striding +RQ. It is not needed as in such cases the CQE compression session would +break because of different value of wqe_id field, starting a new +compression session. + +Tested: + ethtool -K ethxx lro off/on + ethtool --set-priv-flags ethxx rx_cqe_compress on + super_netperf 16 {ipv4,ipv6} -t TCP_STREAM -m 50 -D + verified no csum errors and no page refcount issues. + +Fixes: 7219ab34f184 ("net/mlx5e: CQE compression") +Signed-off-by: Tariq Toukan +Reported-by: Tom Herbert +Cc: kernel-team@fb.com +Signed-off-by: Saeed Mahameed +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 13 ++++++------- + 1 file changed, 6 insertions(+), 7 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +@@ -92,19 +92,18 @@ static inline void mlx5e_cqes_update_own + static inline void mlx5e_decompress_cqe(struct mlx5e_rq *rq, + struct mlx5e_cq *cq, u32 cqcc) + { +- u16 wqe_cnt_step; +- + cq->title.byte_cnt = cq->mini_arr[cq->mini_arr_idx].byte_cnt; + cq->title.check_sum = cq->mini_arr[cq->mini_arr_idx].checksum; + cq->title.op_own &= 0xf0; + cq->title.op_own |= 0x01 & (cqcc >> cq->wq.log_sz); + cq->title.wqe_counter = cpu_to_be16(cq->decmprs_wqe_counter); + +- wqe_cnt_step = +- rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ ? +- mpwrq_get_cqe_consumed_strides(&cq->title) : 1; +- cq->decmprs_wqe_counter = +- (cq->decmprs_wqe_counter + wqe_cnt_step) & rq->wq.sz_m1; ++ if (rq->wq_type == MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ) ++ cq->decmprs_wqe_counter += ++ mpwrq_get_cqe_consumed_strides(&cq->title); ++ else ++ cq->decmprs_wqe_counter = ++ (cq->decmprs_wqe_counter + 1) & rq->wq.sz_m1; + } + + static inline void mlx5e_decompress_cqe_no_hash(struct mlx5e_rq *rq, diff --git a/queue-4.9/net-mlx5e-register-unregister-vport-representors-on-interface-attach-detach.patch b/queue-4.9/net-mlx5e-register-unregister-vport-representors-on-interface-attach-detach.patch new file mode 100644 index 00000000000..e0bc3aae378 --- /dev/null +++ b/queue-4.9/net-mlx5e-register-unregister-vport-representors-on-interface-attach-detach.patch @@ -0,0 +1,90 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Saeed Mahameed +Date: Wed, 22 Feb 2017 17:20:12 +0200 +Subject: net/mlx5e: Register/unregister vport representors on interface attach/detach + +From: Saeed Mahameed + + +[ Upstream commit 6f08a22c5fb2b9aefb8ecd8496758e7a677c1fde ] + +Currently vport representors are added only on driver load and removed on 
+driver unload. Apparently we forgot to handle them when we added the +seamless reset flow feature. This caused to leave the representors +netdevs alive and active with open HW resources on pci shutdown and on +error reset flows. + +To overcome this we move their handling to interface attach/detach, so +they would be cleaned up on shutdown and recreated on reset flows. + +Fixes: 26e59d8077a3 ("net/mlx5e: Implement mlx5e interface attach/detach callbacks") +Signed-off-by: Saeed Mahameed +Reviewed-by: Hadar Hen Zion +Reviewed-by: Roi Dayan +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 23 ++++++++++++++-------- + 1 file changed, 15 insertions(+), 8 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +@@ -3936,6 +3936,19 @@ static void mlx5e_register_vport_rep(str + } + } + ++static void mlx5e_unregister_vport_rep(struct mlx5_core_dev *mdev) ++{ ++ struct mlx5_eswitch *esw = mdev->priv.eswitch; ++ int total_vfs = MLX5_TOTAL_VPORTS(mdev); ++ int vport; ++ ++ if (!MLX5_CAP_GEN(mdev, vport_group_manager)) ++ return; ++ ++ for (vport = 1; vport < total_vfs; vport++) ++ mlx5_eswitch_unregister_vport_rep(esw, vport); ++} ++ + void mlx5e_detach_netdev(struct mlx5_core_dev *mdev, struct net_device *netdev) + { + struct mlx5e_priv *priv = netdev_priv(netdev); +@@ -3983,6 +3996,7 @@ static int mlx5e_attach(struct mlx5_core + return err; + } + ++ mlx5e_register_vport_rep(mdev); + return 0; + } + +@@ -3994,6 +4008,7 @@ static void mlx5e_detach(struct mlx5_cor + if (!netif_device_present(netdev)) + return; + ++ mlx5e_unregister_vport_rep(mdev); + mlx5e_detach_netdev(mdev, netdev); + mlx5e_destroy_mdev_resources(mdev); + } +@@ -4012,8 +4027,6 @@ static void *mlx5e_add(struct mlx5_core_ + if (err) + return NULL; + +- mlx5e_register_vport_rep(mdev); +- + if (MLX5_CAP_GEN(mdev, vport_group_manager)) + ppriv = 
&esw->offloads.vport_reps[0]; + +@@ -4065,13 +4078,7 @@ void mlx5e_destroy_netdev(struct mlx5_co + + static void mlx5e_remove(struct mlx5_core_dev *mdev, void *vpriv) + { +- struct mlx5_eswitch *esw = mdev->priv.eswitch; +- int total_vfs = MLX5_TOTAL_VPORTS(mdev); + struct mlx5e_priv *priv = vpriv; +- int vport; +- +- for (vport = 1; vport < total_vfs; vport++) +- mlx5_eswitch_unregister_vport_rep(esw, vport); + + unregister_netdev(priv->netdev); + mlx5e_detach(mdev, vpriv); diff --git a/queue-4.9/net-net_enable_timestamp-can-be-called-from-irq-contexts.patch b/queue-4.9/net-net_enable_timestamp-can-be-called-from-irq-contexts.patch new file mode 100644 index 00000000000..b2e2e3e1c63 --- /dev/null +++ b/queue-4.9/net-net_enable_timestamp-can-be-called-from-irq-contexts.patch @@ -0,0 +1,96 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Eric Dumazet +Date: Wed, 1 Mar 2017 14:28:39 -0800 +Subject: net: net_enable_timestamp() can be called from irq contexts + +From: Eric Dumazet + + +[ Upstream commit 13baa00ad01bb3a9f893e3a08cbc2d072fc0c15d ] + +It is now very clear that silly TCP listeners might play with +enabling/disabling timestamping while new children are added +to their accept queue. + +Meaning net_enable_timestamp() can be called from BH context +while current state of the static key is not enabled. + +Lets play safe and allow all contexts. + +The work queue is scheduled only under the problematic cases, +which are the static key enable/disable transition, to not slow down +critical paths. + +This extends and improves what we did in commit 5fa8bbda38c6 ("net: use +a work queue to defer net_disable_timestamp() work") + +Fixes: b90e5794c5bd ("net: dont call jump_label_dec from irq context") +Signed-off-by: Eric Dumazet +Reported-by: Dmitry Vyukov +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/dev.c | 35 +++++++++++++++++++++++++++++++---- + 1 file changed, 31 insertions(+), 4 deletions(-) + +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -1697,27 +1697,54 @@ EXPORT_SYMBOL_GPL(net_dec_egress_queue); + static struct static_key netstamp_needed __read_mostly; + #ifdef HAVE_JUMP_LABEL + static atomic_t netstamp_needed_deferred; ++static atomic_t netstamp_wanted; + static void netstamp_clear(struct work_struct *work) + { + int deferred = atomic_xchg(&netstamp_needed_deferred, 0); ++ int wanted; + +- while (deferred--) +- static_key_slow_dec(&netstamp_needed); ++ wanted = atomic_add_return(deferred, &netstamp_wanted); ++ if (wanted > 0) ++ static_key_enable(&netstamp_needed); ++ else ++ static_key_disable(&netstamp_needed); + } + static DECLARE_WORK(netstamp_work, netstamp_clear); + #endif + + void net_enable_timestamp(void) + { ++#ifdef HAVE_JUMP_LABEL ++ int wanted; ++ ++ while (1) { ++ wanted = atomic_read(&netstamp_wanted); ++ if (wanted <= 0) ++ break; ++ if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted) ++ return; ++ } ++ atomic_inc(&netstamp_needed_deferred); ++ schedule_work(&netstamp_work); ++#else + static_key_slow_inc(&netstamp_needed); ++#endif + } + EXPORT_SYMBOL(net_enable_timestamp); + + void net_disable_timestamp(void) + { + #ifdef HAVE_JUMP_LABEL +- /* net_disable_timestamp() can be called from non process context */ +- atomic_inc(&netstamp_needed_deferred); ++ int wanted; ++ ++ while (1) { ++ wanted = atomic_read(&netstamp_wanted); ++ if (wanted <= 1) ++ break; ++ if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted) ++ return; ++ } ++ atomic_dec(&netstamp_needed_deferred); + schedule_work(&netstamp_work); + #else + static_key_slow_dec(&netstamp_needed); diff --git a/queue-4.9/net-phy-avoid-deadlock-during-phy_error.patch b/queue-4.9/net-phy-avoid-deadlock-during-phy_error.patch new file mode 100644 index 00000000000..267ea2f11d8 --- /dev/null +++ 
b/queue-4.9/net-phy-avoid-deadlock-during-phy_error.patch @@ -0,0 +1,76 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Florian Fainelli +Date: Fri, 20 Jan 2017 15:31:52 -0800 +Subject: net: phy: Avoid deadlock during phy_error() + +From: Florian Fainelli + + +[ Upstream commit eab127717a6af54401ba534790c793ec143cd1fc ] + +phy_error() is called in the PHY state machine workqueue context, and +calls phy_trigger_machine() which does a cancel_delayed_work_sync() of +the workqueue we execute from, causing a deadlock situation. + +Augment phy_trigger_machine() machine with a sync boolean indicating +whether we should use cancel_*_sync() or just cancel_*_work(). + +Fixes: 3c293f4e08b5 ("net: phy: Trigger state machine on state change and not polling.") +Reported-by: Russell King +Signed-off-by: Florian Fainelli +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/phy/phy.c | 14 +++++++++----- + 1 file changed, 9 insertions(+), 5 deletions(-) + +--- a/drivers/net/phy/phy.c ++++ b/drivers/net/phy/phy.c +@@ -611,14 +611,18 @@ void phy_start_machine(struct phy_device + * phy_trigger_machine - trigger the state machine to run + * + * @phydev: the phy_device struct ++ * @sync: indicate whether we should wait for the workqueue cancelation + * + * Description: There has been a change in state which requires that the + * state machine runs. 
+ */ + +-static void phy_trigger_machine(struct phy_device *phydev) ++static void phy_trigger_machine(struct phy_device *phydev, bool sync) + { +- cancel_delayed_work_sync(&phydev->state_queue); ++ if (sync) ++ cancel_delayed_work_sync(&phydev->state_queue); ++ else ++ cancel_delayed_work(&phydev->state_queue); + queue_delayed_work(system_power_efficient_wq, &phydev->state_queue, 0); + } + +@@ -655,7 +659,7 @@ static void phy_error(struct phy_device + phydev->state = PHY_HALTED; + mutex_unlock(&phydev->lock); + +- phy_trigger_machine(phydev); ++ phy_trigger_machine(phydev, false); + } + + /** +@@ -817,7 +821,7 @@ void phy_change(struct work_struct *work + } + + /* reschedule state queue work to run as soon as possible */ +- phy_trigger_machine(phydev); ++ phy_trigger_machine(phydev, true); + return; + + ignore: +@@ -907,7 +911,7 @@ void phy_start(struct phy_device *phydev + if (do_resume) + phy_resume(phydev); + +- phy_trigger_machine(phydev); ++ phy_trigger_machine(phydev, true); + } + EXPORT_SYMBOL(phy_start); + diff --git a/queue-4.9/net-sched-act_skbmod-remove-unneeded-rcu_read_unlock-in-tcf_skbmod_dump.patch b/queue-4.9/net-sched-act_skbmod-remove-unneeded-rcu_read_unlock-in-tcf_skbmod_dump.patch new file mode 100644 index 00000000000..764ac6c740e --- /dev/null +++ b/queue-4.9/net-sched-act_skbmod-remove-unneeded-rcu_read_unlock-in-tcf_skbmod_dump.patch @@ -0,0 +1,30 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Alexey Khoroshilov +Date: Sun, 5 Mar 2017 03:01:55 +0300 +Subject: net/sched: act_skbmod: remove unneeded rcu_read_unlock in tcf_skbmod_dump + +From: Alexey Khoroshilov + + +[ Upstream commit 6c4dc75c251721f517e9daeb5370ea606b5b35ce ] + +Found by Linux Driver Verification project (linuxtesting.org). + +Signed-off-by: Alexey Khoroshilov +Acked-by: Jamal Hadi Salim +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/sched/act_skbmod.c | 1 - + 1 file changed, 1 deletion(-) + +--- a/net/sched/act_skbmod.c ++++ b/net/sched/act_skbmod.c +@@ -228,7 +228,6 @@ static int tcf_skbmod_dump(struct sk_buf + + return skb->len; + nla_put_failure: +- rcu_read_unlock(); + nlmsg_trim(skb, b); + return -1; + } diff --git a/queue-4.9/net-sched-actions-decrement-module-reference-count-after-table-flush.patch b/queue-4.9/net-sched-actions-decrement-module-reference-count-after-table-flush.patch new file mode 100644 index 00000000000..b8e7a96df4b --- /dev/null +++ b/queue-4.9/net-sched-actions-decrement-module-reference-count-after-table-flush.patch @@ -0,0 +1,94 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Roman Mashak +Date: Fri, 24 Feb 2017 11:00:32 -0500 +Subject: net sched actions: decrement module reference count after table flush. + +From: Roman Mashak + + +[ Upstream commit edb9d1bff4bbe19b8ae0e71b1f38732591a9eeb2 ] + +When tc actions are loaded as a module and no actions have been installed, +flushing them would result in actions removed from the memory, but modules +reference count not being decremented, so that the modules would not be +unloaded. + +Following is example with GACT action: + +% sudo modprobe act_gact +% lsmod +Module Size Used by +act_gact 16384 0 +% +% sudo tc actions ls action gact +% +% sudo tc actions flush action gact +% lsmod +Module Size Used by +act_gact 16384 1 +% sudo tc actions flush action gact +% lsmod +Module Size Used by +act_gact 16384 2 +% sudo rmmod act_gact +rmmod: ERROR: Module act_gact is in use +.... 
+ +After the fix: +% lsmod +Module Size Used by +act_gact 16384 0 +% +% sudo tc actions add action pass index 1 +% sudo tc actions add action pass index 2 +% sudo tc actions add action pass index 3 +% lsmod +Module Size Used by +act_gact 16384 3 +% +% sudo tc actions flush action gact +% lsmod +Module Size Used by +act_gact 16384 0 +% +% sudo tc actions flush action gact +% lsmod +Module Size Used by +act_gact 16384 0 +% sudo rmmod act_gact +% lsmod +Module Size Used by +% + +Fixes: f97017cdefef ("net-sched: Fix actions flushing") +Signed-off-by: Roman Mashak +Signed-off-by: Jamal Hadi Salim +Acked-by: Cong Wang +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/sched/act_api.c | 5 +---- + 1 file changed, 1 insertion(+), 4 deletions(-) + +--- a/net/sched/act_api.c ++++ b/net/sched/act_api.c +@@ -820,10 +820,8 @@ static int tca_action_flush(struct net * + goto out_module_put; + + err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops); +- if (err < 0) ++ if (err <= 0) + goto out_module_put; +- if (err == 0) +- goto noflush_out; + + nla_nest_end(skb, nest); + +@@ -840,7 +838,6 @@ static int tca_action_flush(struct net * + out_module_put: + module_put(ops->owner); + err_out: +-noflush_out: + kfree_skb(skb); + return err; + } diff --git a/queue-4.9/net-tunnel-set-inner-protocol-in-network-gro-hooks.patch b/queue-4.9/net-tunnel-set-inner-protocol-in-network-gro-hooks.patch new file mode 100644 index 00000000000..a9f4e6ead91 --- /dev/null +++ b/queue-4.9/net-tunnel-set-inner-protocol-in-network-gro-hooks.patch @@ -0,0 +1,70 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Paolo Abeni +Date: Tue, 7 Mar 2017 18:33:31 +0100 +Subject: net/tunnel: set inner protocol in network gro hooks + +From: Paolo Abeni + + +[ Upstream commit 294acf1c01bace5cea5d30b510504238bf5f7c25 ] + +The gso code of several tunnels type (gre and udp tunnels) +takes for granted that the skb->inner_protocol is properly +initialized and drops the packet elsewhere. 
+ +On the forwarding path no one is initializing this field, +so gro encapsulated packets are dropped on forward. + +Since commit 38720352412a ("gre: Use inner_proto to obtain +inner header protocol"), this can be reproduced when the +encapsulated packets use gre as the tunneling protocol. + +The issue happens also with vxlan and geneve tunnels since +commit 8bce6d7d0d1e ("udp: Generalize skb_udp_segment"), if the +forwarding host's ingress nic has h/w offload for such tunnel +and a vxlan/geneve device is configured on top of it, regardless +of the configured peer address and vni. + +To address the issue, this change initializes the inner_protocol +field for encapsulated packets in both ipv4 and ipv6 gro complete +callbacks. + +Fixes: 38720352412a ("gre: Use inner_proto to obtain inner header protocol") +Fixes: 8bce6d7d0d1e ("udp: Generalize skb_udp_segment") +Signed-off-by: Paolo Abeni +Acked-by: Alexander Duyck +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/af_inet.c | 4 +++- + net/ipv6/ip6_offload.c | 4 +++- + 2 files changed, 6 insertions(+), 2 deletions(-) + +--- a/net/ipv4/af_inet.c ++++ b/net/ipv4/af_inet.c +@@ -1460,8 +1460,10 @@ int inet_gro_complete(struct sk_buff *sk + int proto = iph->protocol; + int err = -ENOSYS; + +- if (skb->encapsulation) ++ if (skb->encapsulation) { ++ skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IP)); + skb_set_inner_network_header(skb, nhoff); ++ } + + csum_replace2(&iph->check, iph->tot_len, newlen); + iph->tot_len = newlen; +--- a/net/ipv6/ip6_offload.c ++++ b/net/ipv6/ip6_offload.c +@@ -294,8 +294,10 @@ static int ipv6_gro_complete(struct sk_b + struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + nhoff); + int err = -ENOSYS; + +- if (skb->encapsulation) ++ if (skb->encapsulation) { ++ skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IPV6)); + skb_set_inner_network_header(skb, nhoff); ++ } + + iph->payload_len = htons(skb->len - nhoff - sizeof(*iph)); + diff --git
a/queue-4.9/strparser-destroy-workqueue-on-module-exit.patch b/queue-4.9/strparser-destroy-workqueue-on-module-exit.patch new file mode 100644 index 00000000000..329bd21631b --- /dev/null +++ b/queue-4.9/strparser-destroy-workqueue-on-module-exit.patch @@ -0,0 +1,29 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: WANG Cong +Date: Fri, 3 Mar 2017 12:21:14 -0800 +Subject: strparser: destroy workqueue on module exit + +From: WANG Cong + + +[ Upstream commit f78ef7cd9a0686b979679d0de061c6dbfd8d649e ] + +Fixes: 43a0c6751a32 ("strparser: Stream parser for messages") +Cc: Tom Herbert +Signed-off-by: Cong Wang +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/strparser/strparser.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/net/strparser/strparser.c ++++ b/net/strparser/strparser.c +@@ -504,6 +504,7 @@ static int __init strp_mod_init(void) + + static void __exit strp_mod_exit(void) + { ++ destroy_workqueue(strp_wq); + } + module_init(strp_mod_init); + module_exit(strp_mod_exit); diff --git a/queue-4.9/tcp-dccp-block-bh-for-syn-processing.patch b/queue-4.9/tcp-dccp-block-bh-for-syn-processing.patch new file mode 100644 index 00000000000..0e7c996feec --- /dev/null +++ b/queue-4.9/tcp-dccp-block-bh-for-syn-processing.patch @@ -0,0 +1,206 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Eric Dumazet +Date: Wed, 1 Mar 2017 08:39:49 -0800 +Subject: tcp/dccp: block BH for SYN processing + +From: Eric Dumazet + + +[ Upstream commit 449809a66c1d0b1563dee84493e14bf3104d2d7e ] + +SYN processing really was meant to be handled from BH. + +When I got rid of BH blocking while processing socket backlog +in commit 5413d1babe8f ("net: do not block BH while processing socket +backlog"), I forgot that a malicious user could transition to TCP_LISTEN +from a state that allowed (SYN) packets to be parked in the socket +backlog while socket is owned by the thread doing the listen() call. 
+ +Sure enough syzkaller found this and reported the bug ;) + +================================= +[ INFO: inconsistent lock state ] +4.10.0+ #60 Not tainted +--------------------------------- +inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage. +syz-executor0/5090 [HC0[0]:SC0[0]:HE1:SE1] takes: + (&(&hashinfo->ehash_locks[i])->rlock){+.?...}, at: +[] spin_lock include/linux/spinlock.h:299 [inline] + (&(&hashinfo->ehash_locks[i])->rlock){+.?...}, at: +[] inet_ehash_insert+0x240/0xad0 +net/ipv4/inet_hashtables.c:407 +{IN-SOFTIRQ-W} state was registered at: + mark_irqflags kernel/locking/lockdep.c:2923 [inline] + __lock_acquire+0xbcf/0x3270 kernel/locking/lockdep.c:3295 + lock_acquire+0x241/0x580 kernel/locking/lockdep.c:3753 + __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] + _raw_spin_lock+0x33/0x50 kernel/locking/spinlock.c:151 + spin_lock include/linux/spinlock.h:299 [inline] + inet_ehash_insert+0x240/0xad0 net/ipv4/inet_hashtables.c:407 + reqsk_queue_hash_req net/ipv4/inet_connection_sock.c:753 [inline] + inet_csk_reqsk_queue_hash_add+0x1b7/0x2a0 net/ipv4/inet_connection_sock.c:764 + tcp_conn_request+0x25cc/0x3310 net/ipv4/tcp_input.c:6399 + tcp_v4_conn_request+0x157/0x220 net/ipv4/tcp_ipv4.c:1262 + tcp_rcv_state_process+0x802/0x4130 net/ipv4/tcp_input.c:5889 + tcp_v4_do_rcv+0x56b/0x940 net/ipv4/tcp_ipv4.c:1433 + tcp_v4_rcv+0x2e12/0x3210 net/ipv4/tcp_ipv4.c:1711 + ip_local_deliver_finish+0x4ce/0xc40 net/ipv4/ip_input.c:216 + NF_HOOK include/linux/netfilter.h:257 [inline] + ip_local_deliver+0x1ce/0x710 net/ipv4/ip_input.c:257 + dst_input include/net/dst.h:492 [inline] + ip_rcv_finish+0xb1d/0x2110 net/ipv4/ip_input.c:396 + NF_HOOK include/linux/netfilter.h:257 [inline] + ip_rcv+0xd90/0x19c0 net/ipv4/ip_input.c:487 + __netif_receive_skb_core+0x1ad1/0x3400 net/core/dev.c:4179 + __netif_receive_skb+0x2a/0x170 net/core/dev.c:4217 + netif_receive_skb_internal+0x1d6/0x430 net/core/dev.c:4245 + napi_skb_finish net/core/dev.c:4602 [inline] + 
napi_gro_receive+0x4e6/0x680 net/core/dev.c:4636 + e1000_receive_skb drivers/net/ethernet/intel/e1000/e1000_main.c:4033 [inline] + e1000_clean_rx_irq+0x5e0/0x1490 +drivers/net/ethernet/intel/e1000/e1000_main.c:4489 + e1000_clean+0xb9a/0x2910 drivers/net/ethernet/intel/e1000/e1000_main.c:3834 + napi_poll net/core/dev.c:5171 [inline] + net_rx_action+0xe70/0x1900 net/core/dev.c:5236 + __do_softirq+0x2fb/0xb7d kernel/softirq.c:284 + invoke_softirq kernel/softirq.c:364 [inline] + irq_exit+0x19e/0x1d0 kernel/softirq.c:405 + exiting_irq arch/x86/include/asm/apic.h:658 [inline] + do_IRQ+0x81/0x1a0 arch/x86/kernel/irq.c:250 + ret_from_intr+0x0/0x20 + native_safe_halt+0x6/0x10 arch/x86/include/asm/irqflags.h:53 + arch_safe_halt arch/x86/include/asm/paravirt.h:98 [inline] + default_idle+0x8f/0x410 arch/x86/kernel/process.c:271 + arch_cpu_idle+0xa/0x10 arch/x86/kernel/process.c:262 + default_idle_call+0x36/0x60 kernel/sched/idle.c:96 + cpuidle_idle_call kernel/sched/idle.c:154 [inline] + do_idle+0x348/0x440 kernel/sched/idle.c:243 + cpu_startup_entry+0x18/0x20 kernel/sched/idle.c:345 + start_secondary+0x344/0x440 arch/x86/kernel/smpboot.c:272 + verify_cpu+0x0/0xfc +irq event stamp: 1741 +hardirqs last enabled at (1741): [] +__raw_spin_unlock_irqrestore include/linux/spinlock_api_smp.h:160 +[inline] +hardirqs last enabled at (1741): [] +_raw_spin_unlock_irqrestore+0xf7/0x1a0 kernel/locking/spinlock.c:191 +hardirqs last disabled at (1740): [] +__raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:108 [inline] +hardirqs last disabled at (1740): [] +_raw_spin_lock_irqsave+0xa2/0x110 kernel/locking/spinlock.c:159 +softirqs last enabled at (1738): [] +__do_softirq+0x7cf/0xb7d kernel/softirq.c:310 +softirqs last disabled at (1571): [] +do_softirq_own_stack+0x1c/0x30 arch/x86/entry/entry_64.S:902 + +other info that might help us debug this: + Possible unsafe locking scenario: + + CPU0 + ---- + lock(&(&hashinfo->ehash_locks[i])->rlock); + + lock(&(&hashinfo->ehash_locks[i])->rlock); 
+ + *** DEADLOCK *** + +1 lock held by syz-executor0/5090: + #0: (sk_lock-AF_INET6){+.+.+.}, at: [] lock_sock +include/net/sock.h:1460 [inline] + #0: (sk_lock-AF_INET6){+.+.+.}, at: [] +sock_setsockopt+0x233/0x1e40 net/core/sock.c:683 + +stack backtrace: +CPU: 1 PID: 5090 Comm: syz-executor0 Not tainted 4.10.0+ #60 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 +Call Trace: + __dump_stack lib/dump_stack.c:15 [inline] + dump_stack+0x292/0x398 lib/dump_stack.c:51 + print_usage_bug+0x3ef/0x450 kernel/locking/lockdep.c:2387 + valid_state kernel/locking/lockdep.c:2400 [inline] + mark_lock_irq kernel/locking/lockdep.c:2602 [inline] + mark_lock+0xf30/0x1410 kernel/locking/lockdep.c:3065 + mark_irqflags kernel/locking/lockdep.c:2941 [inline] + __lock_acquire+0x6dc/0x3270 kernel/locking/lockdep.c:3295 + lock_acquire+0x241/0x580 kernel/locking/lockdep.c:3753 + __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline] + _raw_spin_lock+0x33/0x50 kernel/locking/spinlock.c:151 + spin_lock include/linux/spinlock.h:299 [inline] + inet_ehash_insert+0x240/0xad0 net/ipv4/inet_hashtables.c:407 + reqsk_queue_hash_req net/ipv4/inet_connection_sock.c:753 [inline] + inet_csk_reqsk_queue_hash_add+0x1b7/0x2a0 net/ipv4/inet_connection_sock.c:764 + dccp_v6_conn_request+0xada/0x11b0 net/dccp/ipv6.c:380 + dccp_rcv_state_process+0x51e/0x1660 net/dccp/input.c:606 + dccp_v6_do_rcv+0x213/0x350 net/dccp/ipv6.c:632 + sk_backlog_rcv include/net/sock.h:896 [inline] + __release_sock+0x127/0x3a0 net/core/sock.c:2052 + release_sock+0xa5/0x2b0 net/core/sock.c:2539 + sock_setsockopt+0x60f/0x1e40 net/core/sock.c:1016 + SYSC_setsockopt net/socket.c:1782 [inline] + SyS_setsockopt+0x2fb/0x3a0 net/socket.c:1765 + entry_SYSCALL_64_fastpath+0x1f/0xc2 +RIP: 0033:0x4458b9 +RSP: 002b:00007fe8b26c2b58 EFLAGS: 00000292 ORIG_RAX: 0000000000000036 +RAX: ffffffffffffffda RBX: 0000000000000006 RCX: 00000000004458b9 +RDX: 000000000000001a RSI: 0000000000000001 RDI: 0000000000000006 +RBP: 
00000000006e2110 R08: 0000000000000010 R09: 0000000000000000 +R10: 00000000208c3000 R11: 0000000000000292 R12: 0000000000708000 +R13: 0000000020000000 R14: 0000000000001000 R15: 0000000000000000 + +Fixes: 5413d1babe8f ("net: do not block BH while processing socket backlog") +Signed-off-by: Eric Dumazet +Reported-by: Andrey Konovalov +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/dccp/input.c | 10 ++++++++-- + net/ipv4/tcp_input.c | 10 ++++++++-- + 2 files changed, 16 insertions(+), 4 deletions(-) + +--- a/net/dccp/input.c ++++ b/net/dccp/input.c +@@ -577,6 +577,7 @@ int dccp_rcv_state_process(struct sock * + struct dccp_sock *dp = dccp_sk(sk); + struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); + const int old_state = sk->sk_state; ++ bool acceptable; + int queued = 0; + + /* +@@ -603,8 +604,13 @@ int dccp_rcv_state_process(struct sock * + */ + if (sk->sk_state == DCCP_LISTEN) { + if (dh->dccph_type == DCCP_PKT_REQUEST) { +- if (inet_csk(sk)->icsk_af_ops->conn_request(sk, +- skb) < 0) ++ /* It is possible that we process SYN packets from backlog, ++ * so we need to make sure to disable BH right there. ++ */ ++ local_bh_disable(); ++ acceptable = inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) >= 0; ++ local_bh_enable(); ++ if (!acceptable) + return 1; + consume_skb(skb); + return 0; +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -5916,9 +5916,15 @@ int tcp_rcv_state_process(struct sock *s + if (th->syn) { + if (th->fin) + goto discard; +- if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) +- return 1; ++ /* It is possible that we process SYN packets from backlog, ++ * so we need to make sure to disable BH right there. 
++ */ ++ local_bh_disable(); ++ acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0; ++ local_bh_enable(); + ++ if (!acceptable) ++ return 1; + consume_skb(skb); + return 0; + } diff --git a/queue-4.9/tcp-fix-various-issues-for-sockets-morphing-to-listen-state.patch b/queue-4.9/tcp-fix-various-issues-for-sockets-morphing-to-listen-state.patch new file mode 100644 index 00000000000..11f24a1e1b0 --- /dev/null +++ b/queue-4.9/tcp-fix-various-issues-for-sockets-morphing-to-listen-state.patch @@ -0,0 +1,74 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Eric Dumazet +Date: Fri, 3 Mar 2017 14:08:21 -0800 +Subject: tcp: fix various issues for sockets morphing to listen state + +From: Eric Dumazet + + +[ Upstream commit 02b2faaf0af1d85585f6d6980e286d53612acfc2 ] + +Dmitry Vyukov reported a divide by 0 triggered by syzkaller, exploiting +tcp_disconnect() path that was never really considered and/or used +before syzkaller ;) + +I was not able to reproduce the bug, but it seems issues here are the +three possible actions that assumed they would never trigger on a +listener. + +1) tcp_write_timer_handler +2) tcp_delack_timer_handler +3) MTU reduction + +Only IPv6 MTU reduction was properly testing TCP_CLOSE and TCP_LISTEN + states from tcp_v6_mtu_reduced() + +Signed-off-by: Eric Dumazet +Reported-by: Dmitry Vyukov +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_ipv4.c | 7 +++++-- + net/ipv4/tcp_timer.c | 6 ++++-- + 2 files changed, 9 insertions(+), 4 deletions(-) + +--- a/net/ipv4/tcp_ipv4.c ++++ b/net/ipv4/tcp_ipv4.c +@@ -269,10 +269,13 @@ EXPORT_SYMBOL(tcp_v4_connect); + */ + void tcp_v4_mtu_reduced(struct sock *sk) + { +- struct dst_entry *dst; + struct inet_sock *inet = inet_sk(sk); +- u32 mtu = tcp_sk(sk)->mtu_info; ++ struct dst_entry *dst; ++ u32 mtu; + ++ if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) ++ return; ++ mtu = tcp_sk(sk)->mtu_info; + dst = inet_csk_update_pmtu(sk, mtu); + if (!dst) + return; +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -249,7 +249,8 @@ void tcp_delack_timer_handler(struct soc + + sk_mem_reclaim_partial(sk); + +- if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) ++ if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) || ++ !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) + goto out; + + if (time_after(icsk->icsk_ack.timeout, jiffies)) { +@@ -552,7 +553,8 @@ void tcp_write_timer_handler(struct sock + struct inet_connection_sock *icsk = inet_csk(sk); + int event; + +- if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending) ++ if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) || ++ !icsk->icsk_pending) + goto out; + + if (time_after(icsk->icsk_timeout, jiffies)) { diff --git a/queue-4.9/tun-fix-premature-pollout-notification-on-tun-devices.patch b/queue-4.9/tun-fix-premature-pollout-notification-on-tun-devices.patch new file mode 100644 index 00000000000..05d6a624317 --- /dev/null +++ b/queue-4.9/tun-fix-premature-pollout-notification-on-tun-devices.patch @@ -0,0 +1,78 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Hannes Frederic Sowa +Date: Mon, 13 Mar 2017 00:00:26 +0100 +Subject: tun: fix premature POLLOUT notification on tun devices + +From: Hannes Frederic Sowa + + +[ Upstream commit b20e2d54789c6acbf6bd0efdbec2cf5fa4d90ef1 ] + +aszlig observed failing ssh tunnels (-w) 
during initialization since +commit cc9da6cc4f56e0 ("ipv6: addrconf: use stable address generator for +ARPHRD_NONE"). We already had reports that the mentioned commit breaks +Juniper VPN connections. I can't clearly say that the Juniper VPN client +has the same problem, but it is worth a try to hint to this patch. + +Because of the early generation of link local addresses, the kernel now +can start asking for routers on the local subnet much earlier than usual. +Those router solicitation packets arrive inside the ssh channels and +should be transmitted to the tun fd before the configuration scripts +might have upped the interface and made it ready for transmission. + +ssh polls on the interface and receives back a POLL_OUT. It tries to send +the early router solicitation packet to the tun interface. Unfortunately +it hasn't been up'ed yet by config scripts, thus failing with -EIO. ssh +doesn't retry again and considers the tun interface broken forever. + +Link: https://bugzilla.kernel.org/show_bug.cgi?id=121131 +Fixes: cc9da6cc4f56 ("ipv6: addrconf: use stable address generator for ARPHRD_NONE") +Cc: Bjørn Mork +Reported-by: Valdis Kletnieks +Cc: Valdis Kletnieks +Reported-by: Jonas Lippuner +Cc: Jonas Lippuner +Reported-by: aszlig +Cc: aszlig +Signed-off-by: Hannes Frederic Sowa +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/tun.c | 18 +++++++++++++++--- + 1 file changed, 15 insertions(+), 3 deletions(-) + +--- a/drivers/net/tun.c ++++ b/drivers/net/tun.c +@@ -819,7 +819,18 @@ static void tun_net_uninit(struct net_de + /* Net device open.
*/ + static int tun_net_open(struct net_device *dev) + { ++ struct tun_struct *tun = netdev_priv(dev); ++ int i; ++ + netif_tx_start_all_queues(dev); ++ ++ for (i = 0; i < tun->numqueues; i++) { ++ struct tun_file *tfile; ++ ++ tfile = rtnl_dereference(tun->tfiles[i]); ++ tfile->socket.sk->sk_write_space(tfile->socket.sk); ++ } ++ + return 0; + } + +@@ -1116,9 +1127,10 @@ static unsigned int tun_chr_poll(struct + if (!skb_array_empty(&tfile->tx_array)) + mask |= POLLIN | POLLRDNORM; + +- if (sock_writeable(sk) || +- (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) && +- sock_writeable(sk))) ++ if (tun->dev->flags & IFF_UP && ++ (sock_writeable(sk) || ++ (!test_and_set_bit(SOCKWQ_ASYNC_NOSPACE, &sk->sk_socket->flags) && ++ sock_writeable(sk)))) + mask |= POLLOUT | POLLWRNORM; + + if (tun->dev->reg_state != NETREG_REGISTERED) diff --git a/queue-4.9/uapi-fix-linux-packet_diag.h-userspace-compilation-error.patch b/queue-4.9/uapi-fix-linux-packet_diag.h-userspace-compilation-error.patch new file mode 100644 index 00000000000..5f3884c14f2 --- /dev/null +++ b/queue-4.9/uapi-fix-linux-packet_diag.h-userspace-compilation-error.patch @@ -0,0 +1,44 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: "Dmitry V. Levin" +Date: Tue, 7 Mar 2017 23:50:50 +0300 +Subject: uapi: fix linux/packet_diag.h userspace compilation error + +From: "Dmitry V. 
Levin" + + +[ Upstream commit 745cb7f8a5de0805cade3de3991b7a95317c7c73 ] + +Replace MAX_ADDR_LEN with its numeric value to fix the following +linux/packet_diag.h userspace compilation error: + +/usr/include/linux/packet_diag.h:67:17: error: 'MAX_ADDR_LEN' undeclared here (not in a function) + __u8 pdmc_addr[MAX_ADDR_LEN]; + +This is not the first case in the UAPI where the numeric value +of MAX_ADDR_LEN is used instead of symbolic one, uapi/linux/if_link.h +already does the same: + +$ grep MAX_ADDR_LEN include/uapi/linux/if_link.h + __u8 mac[32]; /* MAX_ADDR_LEN */ + +There are no UAPI headers besides these two that use MAX_ADDR_LEN. + +Signed-off-by: Dmitry V. Levin +Acked-by: Pavel Emelyanov +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/uapi/linux/packet_diag.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/include/uapi/linux/packet_diag.h ++++ b/include/uapi/linux/packet_diag.h +@@ -64,7 +64,7 @@ struct packet_diag_mclist { + __u32 pdmc_count; + __u16 pdmc_type; + __u16 pdmc_alen; +- __u8 pdmc_addr[MAX_ADDR_LEN]; ++ __u8 pdmc_addr[32]; /* MAX_ADDR_LEN */ + }; + + struct packet_diag_ring { diff --git a/queue-4.9/vrf-fix-use-after-free-in-vrf_xmit.patch b/queue-4.9/vrf-fix-use-after-free-in-vrf_xmit.patch new file mode 100644 index 00000000000..de72799d26b --- /dev/null +++ b/queue-4.9/vrf-fix-use-after-free-in-vrf_xmit.patch @@ -0,0 +1,56 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: David Ahern +Date: Mon, 6 Mar 2017 08:53:04 -0800 +Subject: vrf: Fix use-after-free in vrf_xmit + +From: David Ahern + + +[ Upstream commit f7887d40e541f74402df0684a1463c0a0bb68c68 ] + +KASAN detected a use-after-free: + +[ 269.467067] BUG: KASAN: use-after-free in vrf_xmit+0x7f1/0x827 [vrf] at addr ffff8800350a21c0 +[ 269.467067] Read of size 4 by task ssh/1879 +[ 269.467067] CPU: 1 PID: 1879 Comm: ssh Not tainted 4.10.0+ #249 +[ 269.467067] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
1.7.5-20140531_083030-gandalf 04/01/2014 +[ 269.467067] Call Trace: +[ 269.467067] dump_stack+0x81/0xb6 +[ 269.467067] kasan_object_err+0x21/0x78 +[ 269.467067] kasan_report+0x2f7/0x450 +[ 269.467067] ? vrf_xmit+0x7f1/0x827 [vrf] +[ 269.467067] ? ip_output+0xa4/0xdb +[ 269.467067] __asan_load4+0x6b/0x6d +[ 269.467067] vrf_xmit+0x7f1/0x827 [vrf] +... + +Which corresponds to the skb access after xmit handling. Fix by saving +skb->len and using the saved value to update stats. + +Fixes: 193125dbd8eb2 ("net: Introduce VRF device driver") +Signed-off-by: David Ahern +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/vrf.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/drivers/net/vrf.c ++++ b/drivers/net/vrf.c +@@ -346,6 +346,7 @@ static netdev_tx_t is_ip_tx_frame(struct + + static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev) + { ++ int len = skb->len; + netdev_tx_t ret = is_ip_tx_frame(skb, dev); + + if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { +@@ -353,7 +354,7 @@ static netdev_tx_t vrf_xmit(struct sk_bu + + u64_stats_update_begin(&dstats->syncp); + dstats->tx_pkts++; +- dstats->tx_bytes += skb->len; ++ dstats->tx_bytes += len; + u64_stats_update_end(&dstats->syncp); + } else { + this_cpu_inc(dev->dstats->tx_drps); diff --git a/queue-4.9/vti6-return-gre_key-for-vti6.patch b/queue-4.9/vti6-return-gre_key-for-vti6.patch new file mode 100644 index 00000000000..c8320b8e1d1 --- /dev/null +++ b/queue-4.9/vti6-return-gre_key-for-vti6.patch @@ -0,0 +1,33 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: David Forster +Date: Fri, 24 Feb 2017 14:20:32 +0000 +Subject: vti6: return GRE_KEY for vti6 + +From: David Forster + + +[ Upstream commit 7dcdf941cdc96692ab99fd790c8cc68945514851 ] + +Align vti6 with vti by returning GRE_KEY flag. This enables iproute2 +to display tunnel keys on "ip -6 tunnel show" + +Signed-off-by: David Forster +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/ip6_vti.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/net/ipv6/ip6_vti.c ++++ b/net/ipv6/ip6_vti.c +@@ -691,6 +691,10 @@ vti6_parm_to_user(struct ip6_tnl_parm2 * + u->link = p->link; + u->i_key = p->i_key; + u->o_key = p->o_key; ++ if (u->i_key) ++ u->i_flags |= GRE_KEY; ++ if (u->o_key) ++ u->o_flags |= GRE_KEY; + u->proto = p->proto; + + memcpy(u->name, p->name, sizeof(u->name)); diff --git a/queue-4.9/vxlan-correctly-validate-vxlan-id-against-vxlan_n_vid.patch b/queue-4.9/vxlan-correctly-validate-vxlan-id-against-vxlan_n_vid.patch new file mode 100644 index 00000000000..175a4f06637 --- /dev/null +++ b/queue-4.9/vxlan-correctly-validate-vxlan-id-against-vxlan_n_vid.patch @@ -0,0 +1,33 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Matthias Schiffer +Date: Thu, 23 Feb 2017 17:19:41 +0100 +Subject: vxlan: correctly validate VXLAN ID against VXLAN_N_VID + +From: Matthias Schiffer + + +[ Upstream commit 4e37d6911f36545b286d15073f6f2222f840e81c ] + +The incorrect check caused an off-by-one error: the maximum VID 0xffffff +was unusable. + +Fixes: d342894c5d2f ("vxlan: virtual extensible lan") +Signed-off-by: Matthias Schiffer +Acked-by: Jiri Benc +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/vxlan.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/vxlan.c ++++ b/drivers/net/vxlan.c +@@ -2637,7 +2637,7 @@ static int vxlan_validate(struct nlattr + + if (data[IFLA_VXLAN_ID]) { + __u32 id = nla_get_u32(data[IFLA_VXLAN_ID]); +- if (id >= VXLAN_VID_MASK) ++ if (id >= VXLAN_N_VID) + return -ERANGE; + } + diff --git a/queue-4.9/vxlan-don-t-allow-overwrite-of-config-src-addr.patch b/queue-4.9/vxlan-don-t-allow-overwrite-of-config-src-addr.patch new file mode 100644 index 00000000000..af9b962198f --- /dev/null +++ b/queue-4.9/vxlan-don-t-allow-overwrite-of-config-src-addr.patch @@ -0,0 +1,94 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Brian Russell +Date: Fri, 24 Feb 2017 17:47:11 +0000 +Subject: vxlan: don't allow overwrite of config src addr + +From: Brian Russell + + +[ Upstream commit 1158632b5a2dcce0786c1b1b99654e81cc867981 ] + +When using IPv6 transport and a default dst, a pointer to the configured +source address is passed into the route lookup. If no source address is +configured, then the value is overwritten. + +IPv6 route lookup ignores egress ifindex match if the source address is set, +so if egress ifindex match is desired, the source address must be passed +as any. The overwrite breaks this for subsequent lookups. + +Avoid this by copying the configured address to an existing stack variable +and pass a pointer to that instead. + +Fixes: 272d96a5ab10 ("net: vxlan: lwt: Use source ip address during route lookup.") + +Signed-off-by: Brian Russell +Acked-by: Jiri Benc +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/vxlan.c | 12 +++++------- + 1 file changed, 5 insertions(+), 7 deletions(-) + +--- a/drivers/net/vxlan.c ++++ b/drivers/net/vxlan.c +@@ -1942,7 +1942,6 @@ static void vxlan_xmit_one(struct sk_buf + const struct iphdr *old_iph; + union vxlan_addr *dst; + union vxlan_addr remote_ip, local_ip; +- union vxlan_addr *src; + struct vxlan_metadata _md; + struct vxlan_metadata *md = &_md; + __be16 src_port = 0, dst_port; +@@ -1960,7 +1959,7 @@ static void vxlan_xmit_one(struct sk_buf + dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port; + vni = rdst->remote_vni; + dst = &rdst->remote_ip; +- src = &vxlan->cfg.saddr; ++ local_ip = vxlan->cfg.saddr; + dst_cache = &rdst->dst_cache; + } else { + if (!info) { +@@ -1979,7 +1978,6 @@ static void vxlan_xmit_one(struct sk_buf + local_ip.sin6.sin6_addr = info->key.u.ipv6.src; + } + dst = &remote_ip; +- src = &local_ip; + dst_cache = &info->dst_cache; + } + +@@ -2028,7 +2026,7 @@ static void vxlan_xmit_one(struct sk_buf + rt = vxlan_get_route(vxlan, skb, + rdst ? rdst->remote_ifindex : 0, tos, + dst->sin.sin_addr.s_addr, +- &src->sin.sin_addr.s_addr, ++ &local_ip.sin.sin_addr.s_addr, + dst_cache, info); + if (IS_ERR(rt)) { + netdev_dbg(dev, "no route to %pI4\n", +@@ -2071,7 +2069,7 @@ static void vxlan_xmit_one(struct sk_buf + if (err < 0) + goto xmit_tx_error; + +- udp_tunnel_xmit_skb(rt, sk, skb, src->sin.sin_addr.s_addr, ++ udp_tunnel_xmit_skb(rt, sk, skb, local_ip.sin.sin_addr.s_addr, + dst->sin.sin_addr.s_addr, tos, ttl, df, + src_port, dst_port, xnet, !udp_sum); + #if IS_ENABLED(CONFIG_IPV6) +@@ -2087,7 +2085,7 @@ static void vxlan_xmit_one(struct sk_buf + ndst = vxlan6_get_route(vxlan, skb, + rdst ? 
rdst->remote_ifindex : 0, tos, + label, &dst->sin6.sin6_addr, +- &src->sin6.sin6_addr, ++ &local_ip.sin6.sin6_addr, + dst_cache, info); + if (IS_ERR(ndst)) { + netdev_dbg(dev, "no route to %pI6\n", +@@ -2134,7 +2132,7 @@ static void vxlan_xmit_one(struct sk_buf + return; + } + udp_tunnel6_xmit_skb(ndst, sk, skb, dev, +- &src->sin6.sin6_addr, ++ &local_ip.sin6.sin6_addr, + &dst->sin6.sin6_addr, tos, ttl, + label, src_port, dst_port, !udp_sum); + #endif diff --git a/queue-4.9/vxlan-lock-rcu-on-tx-path.patch b/queue-4.9/vxlan-lock-rcu-on-tx-path.patch new file mode 100644 index 00000000000..e34a8423301 --- /dev/null +++ b/queue-4.9/vxlan-lock-rcu-on-tx-path.patch @@ -0,0 +1,85 @@ +From foo@baz Sat Mar 18 22:03:25 CST 2017 +From: Jakub Kicinski +Date: Fri, 24 Feb 2017 11:43:36 -0800 +Subject: vxlan: lock RCU on TX path + +From: Jakub Kicinski + + +[ Upstream commit 56de859e9967c070464a9a9f4f18d73f9447298e ] + +There are no guarantees that callers of the TX path will hold +the RCU lock. Grab it explicitly. + +Fixes: c6fcc4fc5f8b ("vxlan: avoid using stale vxlan socket.") +Signed-off-by: Jakub Kicinski +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/vxlan.c | 13 ++++++++----- + 1 file changed, 8 insertions(+), 5 deletions(-) + +--- a/drivers/net/vxlan.c ++++ b/drivers/net/vxlan.c +@@ -1955,6 +1955,7 @@ static void vxlan_xmit_one(struct sk_buf + + info = skb_tunnel_info(skb); + ++ rcu_read_lock(); + if (rdst) { + dst_port = rdst->remote_port ?
rdst->remote_port : vxlan->cfg.dst_port; + vni = rdst->remote_vni; +@@ -1985,7 +1986,7 @@ static void vxlan_xmit_one(struct sk_buf + if (did_rsc) { + /* short-circuited back to local bridge */ + vxlan_encap_bypass(skb, vxlan, vxlan); +- return; ++ goto out_unlock; + } + goto drop; + } +@@ -2054,7 +2055,7 @@ static void vxlan_xmit_one(struct sk_buf + if (!dst_vxlan) + goto tx_error; + vxlan_encap_bypass(skb, vxlan, dst_vxlan); +- return; ++ goto out_unlock; + } + + if (!info) +@@ -2115,7 +2116,7 @@ static void vxlan_xmit_one(struct sk_buf + if (!dst_vxlan) + goto tx_error; + vxlan_encap_bypass(skb, vxlan, dst_vxlan); +- return; ++ goto out_unlock; + } + + if (!info) +@@ -2129,7 +2130,7 @@ static void vxlan_xmit_one(struct sk_buf + if (err < 0) { + dst_release(ndst); + dev->stats.tx_errors++; +- return; ++ goto out_unlock; + } + udp_tunnel6_xmit_skb(ndst, sk, skb, dev, + &local_ip.sin6.sin6_addr, +@@ -2137,7 +2138,8 @@ static void vxlan_xmit_one(struct sk_buf + label, src_port, dst_port, !udp_sum); + #endif + } +- ++out_unlock: ++ rcu_read_unlock(); + return; + + drop: +@@ -2153,6 +2155,7 @@ tx_error: + dev->stats.tx_errors++; + tx_free: + dev_kfree_skb(skb); ++ rcu_read_unlock(); + } + + /* Transmit local packets over Vxlan