From: Greg Kroah-Hartman
Date: Wed, 7 Mar 2018 03:31:04 +0000 (-0800)
Subject: 4.15-stable patches
X-Git-Tag: v4.14.25~35
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=0a986a52401a8a4b23dccf1fa7cc48ce9aff8581;p=thirdparty%2Fkernel%2Fstable-queue.git

4.15-stable patches

added patches:
amd-xgbe-restore-pci-interrupt-enablement-setting-on-resume.patch
bridge-check-brport-attr-show-in-brport_show.patch
bridge-fix-vlan-reference-count-problem.patch
cls_u32-fix-use-after-free-in-u32_destroy_key.patch
cxgb4-fix-trailing-zero-in-cim-la-dump.patch
doc-change-the-min-default-value-of-tcp_wmem-tcp_rmem.patch
fib_semantics-don-t-match-route-with-mismatching-tclassid.patch
hdlc_ppp-carrier-detect-ok-don-t-turn-off-negotiation.patch
ipv6-sit-work-around-bogus-gcc-8-wrestrict-warning.patch
l2tp-don-t-use-inet_shutdown-on-ppp-session-destroy.patch
l2tp-don-t-use-inet_shutdown-on-tunnel-destroy.patch
l2tp-fix-race-in-pppol2tp_release-with-session-object-destroy.patch
l2tp-fix-races-with-tunnel-socket-close.patch
l2tp-fix-tunnel-lookup-use-after-free-race.patch
mlxsw-spectrum_router-do-not-unconditionally-clear-route-offload-indication.patch
mlxsw-spectrum_router-fix-error-path-in-mlxsw_sp_vr_create.patch
mlxsw-spectrum_switchdev-check-success-of-fdb-add-operation.patch
net-amd-xgbe-fix-comparison-to-bitshift-when-dealing-with-a-mask.patch
net-ethernet-ti-cpsw-fix-net-watchdog-timeout.patch
net-fix-race-on-decreasing-number-of-tx-queues.patch
net-ipv4-don-t-allow-setting-net.ipv4.route.min_pmtu-below-68.patch
net-ipv4-set-addr_type-in-hash_keys-for-forwarded-case.patch
net-mlx5-fix-error-handling-when-adding-flow-rules.patch
net-mlx5e-fix-loopback-self-test-when-gro-is-off.patch
net-mlx5e-fix-tcp-checksum-in-lro-buffers.patch
net-mlx5e-specify-numa-node-when-allocating-drop-rq.patch
net-mlx5e-verify-inline-header-size-do-not-exceed-skb-linear-size.patch
net-phy-fix-phy_start-to-consider-phy_ignore_interrupt.patch
net-phy-restore-phy_resume-locking-assumption.patch
net-sched-cls_u32-fix-cls_u32-on-filter-replace.patch
net-sched-report-if-filter-is-too-large-to-dump.patch
net_sched-gen_estimator-fix-broken-estimators-based-on-percpu-stats.patch
netlink-ensure-to-loop-over-all-netns-in-genlmsg_multicast_allns.patch
netlink-put-module-reference-if-dump-start-fails.patch
ppp-prevent-unregistered-channels-from-connecting-to-ppp-units.patch
revert-s390-qeth-fix-using-of-ref-counter-for-rxip-addresses.patch
rxrpc-fix-send-in-rxrpc_send_data_packet.patch
s390-qeth-fix-double-free-on-ip-add-remove-race.patch
s390-qeth-fix-ip-address-lookup-for-l3-devices.patch
s390-qeth-fix-ip-removal-on-offline-cards.patch
s390-qeth-fix-ipa-command-submission-race.patch
s390-qeth-fix-overestimated-count-of-buffer-elements.patch
s390-qeth-fix-setip-command-handling.patch
s390-qeth-fix-underestimated-count-of-buffer-elements.patch
sctp-do-not-pr_err-for-the-duplicated-node-in-transport-rhlist.patch
sctp-fix-dst-refcnt-leak-in-sctp_v4_get_dst.patch
sctp-fix-dst-refcnt-leak-in-sctp_v6_get_dst.patch
sctp-verify-size-of-a-new-chunk-in-_sctp_make_chunk.patch
tcp-honor-the-eor-bit-in-tcp_mtu_probe.patch
tcp-purge-write-queue-upon-rst.patch
tcp-revert-f-rto-extension-to-detect-more-spurious-timeouts.patch
tcp-revert-f-rto-middle-box-workaround.patch
tcp-tracepoint-only-call-trace_tcp_send_reset-with-full-socket.patch
tcp_bbr-better-deal-with-suboptimal-gso.patch
tls-use-correct-sk-sk_prot-for-ipv6.patch
tuntap-correctly-add-the-missing-xdp-flush.patch
tuntap-disable-preemption-during-xdp-processing.patch
udplite-fix-partial-checksum-initialization.patch virtio-net-disable-napi-only-when-enabled-during-xdp-set.patch --- diff --git a/queue-4.15/amd-xgbe-restore-pci-interrupt-enablement-setting-on-resume.patch b/queue-4.15/amd-xgbe-restore-pci-interrupt-enablement-setting-on-resume.patch new file mode 100644 index 00000000000..962e71423af --- /dev/null +++ b/queue-4.15/amd-xgbe-restore-pci-interrupt-enablement-setting-on-resume.patch @@ -0,0 +1,31 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Tom Lendacky +Date: Tue, 20 Feb 2018 15:22:05 -0600 +Subject: amd-xgbe: Restore PCI interrupt enablement setting on resume + +From: Tom Lendacky + + +[ Upstream commit cfd092f2db8b4b6727e1c03ef68a7842e1023573 ] + +After resuming from suspend, the PCI device support must re-enable the +interrupt setting so that interrupts are actually delivered. + +Signed-off-by: Tom Lendacky +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/amd/xgbe/xgbe-pci.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/drivers/net/ethernet/amd/xgbe/xgbe-pci.c ++++ b/drivers/net/ethernet/amd/xgbe/xgbe-pci.c +@@ -426,6 +426,8 @@ static int xgbe_pci_resume(struct pci_de + struct net_device *netdev = pdata->netdev; + int ret = 0; + ++ XP_IOWRITE(pdata, XP_INT_EN, 0x1fffff); ++ + pdata->lpm_ctrl &= ~MDIO_CTRL1_LPOWER; + XMDIO_WRITE(pdata, MDIO_MMD_PCS, MDIO_CTRL1, pdata->lpm_ctrl); + diff --git a/queue-4.15/bridge-check-brport-attr-show-in-brport_show.patch b/queue-4.15/bridge-check-brport-attr-show-in-brport_show.patch new file mode 100644 index 00000000000..191b43cb036 --- /dev/null +++ b/queue-4.15/bridge-check-brport-attr-show-in-brport_show.patch @@ -0,0 +1,48 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Xin Long +Date: Mon, 12 Feb 2018 17:15:40 +0800 +Subject: bridge: check brport attr show in brport_show + +From: Xin Long + + +[ Upstream commit 1b12580af1d0677c3c3a19e35bfe5d59b03f737f ] + +Now br_sysfs_if file flush doesn't have attr show. To read it will +cause kernel panic after users chmod u+r this file. + +Xiong found this issue when running the commands: + + ip link add br0 type bridge + ip link add type veth + ip link set veth0 master br0 + chmod u+r /sys/devices/virtual/net/veth0/brport/flush + timeout 3 cat /sys/devices/virtual/net/veth0/brport/flush + +kernel crashed with NULL a pointer dereference call trace. + +This patch is to fix it by return -EINVAL when brport_attr->show +is null, just the same as the check for brport_attr->store in +brport_store(). + +Fixes: 9cf637473c85 ("bridge: add sysfs hook to flush forwarding table") +Reported-by: Xiong Zhou +Signed-off-by: Xin Long +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/bridge/br_sysfs_if.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/net/bridge/br_sysfs_if.c ++++ b/net/bridge/br_sysfs_if.c +@@ -255,6 +255,9 @@ static ssize_t brport_show(struct kobjec + struct brport_attribute *brport_attr = to_brport_attr(attr); + struct net_bridge_port *p = to_brport(kobj); + ++ if (!brport_attr->show) ++ return -EINVAL; ++ + return brport_attr->show(p, buf); + } + diff --git a/queue-4.15/bridge-fix-vlan-reference-count-problem.patch b/queue-4.15/bridge-fix-vlan-reference-count-problem.patch new file mode 100644 index 00000000000..5d4bd5b1158 --- /dev/null +++ b/queue-4.15/bridge-fix-vlan-reference-count-problem.patch @@ -0,0 +1,40 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Ido Schimmel +Date: Sun, 25 Feb 2018 21:59:06 +0200 +Subject: bridge: Fix VLAN reference count problem + +From: Ido Schimmel + + +[ Upstream commit 0e5a82efda872c2469c210957d7d4161ef8f4391 ] + +When a VLAN is added on a port, a reference is taken on the +corresponding master VLAN entry. If it does not already exist, then it +is created and a reference taken. + +However, in the second case a reference is not really taken when +CONFIG_REFCOUNT_FULL is enabled as refcount_inc() is replaced by +refcount_inc_not_zero(). + +Fix this by using refcount_set() on a newly created master VLAN entry. + +Fixes: 251277598596 ("net, bridge: convert net_bridge_vlan.refcnt from atomic_t to refcount_t") +Signed-off-by: Ido Schimmel +Acked-by: Nikolay Aleksandrov +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/bridge/br_vlan.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/net/bridge/br_vlan.c ++++ b/net/bridge/br_vlan.c +@@ -168,6 +168,8 @@ static struct net_bridge_vlan *br_vlan_g + masterv = br_vlan_find(vg, vid); + if (WARN_ON(!masterv)) + return NULL; ++ refcount_set(&masterv->refcnt, 1); ++ return masterv; + } + refcount_inc(&masterv->refcnt); + diff --git a/queue-4.15/cls_u32-fix-use-after-free-in-u32_destroy_key.patch b/queue-4.15/cls_u32-fix-use-after-free-in-u32_destroy_key.patch new file mode 100644 index 00000000000..d30825e89fe --- /dev/null +++ b/queue-4.15/cls_u32-fix-use-after-free-in-u32_destroy_key.patch @@ -0,0 +1,193 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Paolo Abeni +Date: Mon, 5 Feb 2018 22:23:01 +0100 +Subject: cls_u32: fix use after free in u32_destroy_key() + +From: Paolo Abeni + + +[ Upstream commit d7cdee5ea8d28ae1b6922deb0c1badaa3aa0ef8c ] + +Li Shuang reported an Oops with cls_u32 due to an use-after-free +in u32_destroy_key(). The use-after-free can be triggered with: + +dev=lo +tc qdisc add dev $dev root handle 1: htb default 10 +tc filter add dev $dev parent 1: prio 5 handle 1: protocol ip u32 divisor 256 +tc filter add dev $dev protocol ip parent 1: prio 5 u32 ht 800:: match ip dst\ + 10.0.0.0/8 hashkey mask 0x0000ff00 at 16 link 1: +tc qdisc del dev $dev root + +Which causes the following kasan splat: + + ================================================================== + BUG: KASAN: use-after-free in u32_destroy_key.constprop.21+0x117/0x140 [cls_u32] + Read of size 4 at addr ffff881b83dae618 by task kworker/u48:5/571 + + CPU: 17 PID: 571 Comm: kworker/u48:5 Not tainted 4.15.0+ #87 + Hardware name: Dell Inc. PowerEdge R730/072T6D, BIOS 2.1.7 06/16/2016 + Workqueue: tc_filter_workqueue u32_delete_key_freepf_work [cls_u32] + Call Trace: + dump_stack+0xd6/0x182 + ? dma_virt_map_sg+0x22e/0x22e + print_address_description+0x73/0x290 + kasan_report+0x277/0x360 + ? 
u32_destroy_key.constprop.21+0x117/0x140 [cls_u32] + u32_destroy_key.constprop.21+0x117/0x140 [cls_u32] + u32_delete_key_freepf_work+0x1c/0x30 [cls_u32] + process_one_work+0xae0/0x1c80 + ? sched_clock+0x5/0x10 + ? pwq_dec_nr_in_flight+0x3c0/0x3c0 + ? _raw_spin_unlock_irq+0x29/0x40 + ? trace_hardirqs_on_caller+0x381/0x570 + ? _raw_spin_unlock_irq+0x29/0x40 + ? finish_task_switch+0x1e5/0x760 + ? finish_task_switch+0x208/0x760 + ? preempt_notifier_dec+0x20/0x20 + ? __schedule+0x839/0x1ee0 + ? check_noncircular+0x20/0x20 + ? firmware_map_remove+0x73/0x73 + ? find_held_lock+0x39/0x1c0 + ? worker_thread+0x434/0x1820 + ? lock_contended+0xee0/0xee0 + ? lock_release+0x1100/0x1100 + ? init_rescuer.part.16+0x150/0x150 + ? retint_kernel+0x10/0x10 + worker_thread+0x216/0x1820 + ? process_one_work+0x1c80/0x1c80 + ? lock_acquire+0x1a5/0x540 + ? lock_downgrade+0x6b0/0x6b0 + ? sched_clock+0x5/0x10 + ? lock_release+0x1100/0x1100 + ? compat_start_thread+0x80/0x80 + ? do_raw_spin_trylock+0x190/0x190 + ? _raw_spin_unlock_irq+0x29/0x40 + ? trace_hardirqs_on_caller+0x381/0x570 + ? _raw_spin_unlock_irq+0x29/0x40 + ? finish_task_switch+0x1e5/0x760 + ? finish_task_switch+0x208/0x760 + ? preempt_notifier_dec+0x20/0x20 + ? __schedule+0x839/0x1ee0 + ? kmem_cache_alloc_trace+0x143/0x320 + ? firmware_map_remove+0x73/0x73 + ? sched_clock+0x5/0x10 + ? sched_clock_cpu+0x18/0x170 + ? find_held_lock+0x39/0x1c0 + ? schedule+0xf3/0x3b0 + ? lock_downgrade+0x6b0/0x6b0 + ? __schedule+0x1ee0/0x1ee0 + ? do_wait_intr_irq+0x340/0x340 + ? do_raw_spin_trylock+0x190/0x190 + ? _raw_spin_unlock_irqrestore+0x32/0x60 + ? process_one_work+0x1c80/0x1c80 + ? process_one_work+0x1c80/0x1c80 + kthread+0x312/0x3d0 + ? kthread_create_worker_on_cpu+0xc0/0xc0 + ret_from_fork+0x3a/0x50 + + Allocated by task 1688: + kasan_kmalloc+0xa0/0xd0 + __kmalloc+0x162/0x380 + u32_change+0x1220/0x3c9e [cls_u32] + tc_ctl_tfilter+0x1ba6/0x2f80 + rtnetlink_rcv_msg+0x4f0/0x9d0 + netlink_rcv_skb+0x124/0x320 + netlink_unicast+0x430/0x600 + netlink_sendmsg+0x8fa/0xd60 + sock_sendmsg+0xb1/0xe0 + ___sys_sendmsg+0x678/0x980 + __sys_sendmsg+0xc4/0x210 + do_syscall_64+0x232/0x7f0 + return_from_SYSCALL_64+0x0/0x75 + + Freed by task 112: + kasan_slab_free+0x71/0xc0 + kfree+0x114/0x320 + rcu_process_callbacks+0xc3f/0x1600 + __do_softirq+0x2bf/0xc06 + + The buggy address belongs to the object at ffff881b83dae600 + which belongs to the cache kmalloc-4096 of size 4096 + The buggy address is located 24 bytes inside of + 4096-byte region [ffff881b83dae600, ffff881b83daf600) + The buggy address belongs to the page: + page:ffffea006e0f6a00 count:1 mapcount:0 mapping: (null) index:0x0 compound_mapcount: 0 + flags: 0x17ffffc0008100(slab|head) + raw: 0017ffffc0008100 0000000000000000 0000000000000000 0000000100070007 + raw: dead000000000100 dead000000000200 ffff880187c0e600 0000000000000000 + page dumped because: kasan: bad access detected + + Memory state around the buggy address: + ffff881b83dae500: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc + ffff881b83dae580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc + >ffff881b83dae600: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb + ^ + ffff881b83dae680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb + ffff881b83dae700: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb + ================================================================== + +The problem is that the htnode is freed before the linked knodes and the +latter will try to access the first at u32_destroy_key() time. 
+This change addresses the issue using the htnode refcnt to guarantee +the correct free order. While at it also add a RCU annotation, +to keep sparse happy. + +v1 -> v2: use rtnl_derefence() instead of RCU read locks +v2 -> v3: + - don't check refcnt in u32_destroy_hnode() + - cleaned-up u32_destroy() implementation + - cleaned-up code comment +v3 -> v4: + - dropped unneeded comment + +Reported-by: Li Shuang +Fixes: c0d378ef1266 ("net_sched: use tcf_queue_work() in u32 filter") +Signed-off-by: Paolo Abeni +Acked-by: Cong Wang +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/sched/cls_u32.c | 21 +++++++++++---------- + 1 file changed, 11 insertions(+), 10 deletions(-) + +--- a/net/sched/cls_u32.c ++++ b/net/sched/cls_u32.c +@@ -397,10 +397,12 @@ static int u32_init(struct tcf_proto *tp + static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n, + bool free_pf) + { ++ struct tc_u_hnode *ht = rtnl_dereference(n->ht_down); ++ + tcf_exts_destroy(&n->exts); + tcf_exts_put_net(&n->exts); +- if (n->ht_down) +- n->ht_down->refcnt--; ++ if (ht && --ht->refcnt == 0) ++ kfree(ht); + #ifdef CONFIG_CLS_U32_PERF + if (free_pf) + free_percpu(n->pf); +@@ -653,16 +655,15 @@ static void u32_destroy(struct tcf_proto + + hlist_del(&tp_c->hnode); + +- for (ht = rtnl_dereference(tp_c->hlist); +- ht; +- ht = rtnl_dereference(ht->next)) { +- ht->refcnt--; +- u32_clear_hnode(tp, ht); +- } +- + while ((ht = rtnl_dereference(tp_c->hlist)) != NULL) { ++ u32_clear_hnode(tp, ht); + RCU_INIT_POINTER(tp_c->hlist, ht->next); +- kfree_rcu(ht, rcu); ++ ++ /* u32_destroy_key() will later free ht for us, if it's ++ * still referenced by some knode ++ */ ++ if (--ht->refcnt == 0) ++ kfree_rcu(ht, rcu); + } + + idr_destroy(&tp_c->handle_idr); diff --git a/queue-4.15/cxgb4-fix-trailing-zero-in-cim-la-dump.patch b/queue-4.15/cxgb4-fix-trailing-zero-in-cim-la-dump.patch new file mode 100644 index 00000000000..48e61340e7e --- /dev/null +++ b/queue-4.15/cxgb4-fix-trailing-zero-in-cim-la-dump.patch @@ -0,0 +1,44 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Rahul Lakkireddy +Date: Thu, 15 Feb 2018 18:20:01 +0530 +Subject: cxgb4: fix trailing zero in CIM LA dump + +From: Rahul Lakkireddy + + +[ Upstream commit e6f02a4d57cc438099bc8abfba43ba1400d77b38 ] + +Set correct size of the CIM LA dump for T6. + +Fixes: 27887bc7cb7f ("cxgb4: collect hardware LA dumps") +Signed-off-by: Rahul Lakkireddy +Signed-off-by: Ganesh Goudar +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c | 2 +- + drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c ++++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c +@@ -156,7 +156,7 @@ int cudbg_collect_cim_la(struct cudbg_in + + if (is_t6(padap->params.chip)) { + size = padap->params.cim_la_size / 10 + 1; +- size *= 11 * sizeof(u32); ++ size *= 10 * sizeof(u32); + } else { + size = padap->params.cim_la_size / 8; + size *= 8 * sizeof(u32); +--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c ++++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c +@@ -97,7 +97,7 @@ static u32 cxgb4_get_entity_length(struc + case CUDBG_CIM_LA: + if (is_t6(adap->params.chip)) { + len = adap->params.cim_la_size / 10 + 1; +- len *= 11 * sizeof(u32); ++ len *= 10 * sizeof(u32); + } else { + len = adap->params.cim_la_size / 8; + len *= 8 * sizeof(u32); diff --git a/queue-4.15/doc-change-the-min-default-value-of-tcp_wmem-tcp_rmem.patch b/queue-4.15/doc-change-the-min-default-value-of-tcp_wmem-tcp_rmem.patch new file mode 100644 index 00000000000..f036721c0bc --- /dev/null +++ b/queue-4.15/doc-change-the-min-default-value-of-tcp_wmem-tcp_rmem.patch @@ -0,0 +1,42 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Tonghao Zhang +Date: Sun, 4 Feb 2018 18:07:10 -0800 +Subject: doc: Change the min default value of tcp_wmem/tcp_rmem. + +From: Tonghao Zhang + + +[ Upstream commit a61a86f8db92923a2a4c857c49a795bcae754497 ] + +The SK_MEM_QUANTUM was changed from PAGE_SIZE to 4096. And the +tcp_wmem/tcp_rmem min default values are 4096. + +Fixes: bd68a2a854ad ("net: set SK_MEM_QUANTUM to 4096") +Cc: Eric Dumazet +Signed-off-by: Tonghao Zhang +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + Documentation/networking/ip-sysctl.txt | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/Documentation/networking/ip-sysctl.txt ++++ b/Documentation/networking/ip-sysctl.txt +@@ -508,7 +508,7 @@ tcp_rmem - vector of 3 INTEGERs: min, de + min: Minimal size of receive buffer used by TCP sockets. + It is guaranteed to each TCP socket, even under moderate memory + pressure. +- Default: 1 page ++ Default: 4K + + default: initial size of receive buffer used by TCP sockets. + This value overrides net.core.rmem_default used by other protocols. +@@ -666,7 +666,7 @@ tcp_window_scaling - BOOLEAN + tcp_wmem - vector of 3 INTEGERs: min, default, max + min: Amount of memory reserved for send buffers for TCP sockets. + Each TCP socket has rights to use it due to fact of its birth. +- Default: 1 page ++ Default: 4K + + default: initial size of send buffer used by TCP sockets. This + value overrides net.core.wmem_default used by other protocols. 
diff --git a/queue-4.15/fib_semantics-don-t-match-route-with-mismatching-tclassid.patch b/queue-4.15/fib_semantics-don-t-match-route-with-mismatching-tclassid.patch new file mode 100644 index 00000000000..0cb24aa241c --- /dev/null +++ b/queue-4.15/fib_semantics-don-t-match-route-with-mismatching-tclassid.patch @@ -0,0 +1,66 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Stefano Brivio +Date: Thu, 15 Feb 2018 09:46:03 +0100 +Subject: fib_semantics: Don't match route with mismatching tclassid + +From: Stefano Brivio + + +[ Upstream commit a8c6db1dfd1b1d18359241372bb204054f2c3174 ] + +In fib_nh_match(), if output interface or gateway are passed in +the FIB configuration, we don't have to check next hops of +multipath routes to conclude whether we have a match or not. + +However, we might still have routes with different realms +matching the same output interface and gateway configuration, +and this needs to cause the match to fail. Otherwise the first +route inserted in the FIB will match, regardless of the realms: + + # ip route add 1.1.1.1 dev eth0 table 1234 realms 1/2 + # ip route append 1.1.1.1 dev eth0 table 1234 realms 3/4 + # ip route list table 1234 + 1.1.1.1 dev eth0 scope link realms 1/2 + 1.1.1.1 dev eth0 scope link realms 3/4 + # ip route del 1.1.1.1 dev ens3 table 1234 realms 3/4 + # ip route list table 1234 + 1.1.1.1 dev ens3 scope link realms 3/4 + +whereas route with realms 3/4 should have been deleted instead. + +Explicitly check for fc_flow passed in the FIB configuration +(this comes from RTA_FLOW extracted by rtm_to_fib_config()) and +fail matching if it differs from nh_tclassid. + +The handling of RTA_FLOW for multipath routes later in +fib_nh_match() is still needed, as we can have multiple RTA_FLOW +attributes that need to be matched against the tclassid of each +next hop. + +v2: Check that fc_flow is set before discarding the match, so + that the user can still select the first matching rule by + not specifying any realm, as suggested by David Ahern. + +Reported-by: Jianlin Shi +Signed-off-by: Stefano Brivio +Acked-by: David Ahern +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/fib_semantics.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/net/ipv4/fib_semantics.c ++++ b/net/ipv4/fib_semantics.c +@@ -646,6 +646,11 @@ int fib_nh_match(struct fib_config *cfg, + fi->fib_nh, cfg, extack)) + return 1; + } ++#ifdef CONFIG_IP_ROUTE_CLASSID ++ if (cfg->fc_flow && ++ cfg->fc_flow != fi->fib_nh->nh_tclassid) ++ return 1; ++#endif + if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) && + (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw)) + return 0; diff --git a/queue-4.15/hdlc_ppp-carrier-detect-ok-don-t-turn-off-negotiation.patch b/queue-4.15/hdlc_ppp-carrier-detect-ok-don-t-turn-off-negotiation.patch new file mode 100644 index 00000000000..156c0fab674 --- /dev/null +++ b/queue-4.15/hdlc_ppp-carrier-detect-ok-don-t-turn-off-negotiation.patch @@ -0,0 +1,37 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Denis Du +Date: Sat, 24 Feb 2018 16:51:42 -0500 +Subject: hdlc_ppp: carrier detect ok, don't turn off negotiation + +From: Denis Du + + +[ Upstream commit b6c3bad1ba83af1062a7ff6986d9edc4f3d7fc8e ] + +Sometimes when physical lines have a just good noise to make the protocol +handshaking fail, but the carrier detect still good. Then after remove of +the noise, nobody will trigger this protocol to be start again to cause +the link to never come back. 
The fix is when the carrier is still on, not +terminate the protocol handshaking. + +Signed-off-by: Denis Du +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/wan/hdlc_ppp.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +--- a/drivers/net/wan/hdlc_ppp.c ++++ b/drivers/net/wan/hdlc_ppp.c +@@ -574,7 +574,10 @@ static void ppp_timer(struct timer_list + ppp_cp_event(proto->dev, proto->pid, TO_GOOD, 0, 0, + 0, NULL); + proto->restart_counter--; +- } else ++ } else if (netif_carrier_ok(proto->dev)) ++ ppp_cp_event(proto->dev, proto->pid, TO_GOOD, 0, 0, ++ 0, NULL); ++ else + ppp_cp_event(proto->dev, proto->pid, TO_BAD, 0, 0, + 0, NULL); + break; diff --git a/queue-4.15/ipv6-sit-work-around-bogus-gcc-8-wrestrict-warning.patch b/queue-4.15/ipv6-sit-work-around-bogus-gcc-8-wrestrict-warning.patch new file mode 100644 index 00000000000..d891305069f --- /dev/null +++ b/queue-4.15/ipv6-sit-work-around-bogus-gcc-8-wrestrict-warning.patch @@ -0,0 +1,49 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Arnd Bergmann +Date: Thu, 22 Feb 2018 16:55:34 +0100 +Subject: ipv6 sit: work around bogus gcc-8 -Wrestrict warning + +From: Arnd Bergmann + + +[ Upstream commit ca79bec237f5809a7c3c59bd41cd0880aa889966 ] + +gcc-8 has a new warning that detects overlapping input and output arguments +in memcpy(). It triggers for sit_init_net() calling ipip6_tunnel_clone_6rd(), +which is actually correct: + +net/ipv6/sit.c: In function 'sit_init_net': +net/ipv6/sit.c:192:3: error: 'memcpy' source argument is the same as destination [-Werror=restrict] + +The problem here is that the logic detecting the memcpy() arguments finds them +to be the same, but the conditional that tests for the input and output of +ipip6_tunnel_clone_6rd() to be identical is not a compile-time constant. + +We know that netdev_priv(t->dev) is the same as t for a tunnel device, +and comparing "dev" directly here lets the compiler figure out as well +that 'dev == sitn->fb_tunnel_dev' when called from sit_init_net(), so +it no longer warns. + +This code is old, so Cc stable to make sure that we don't get the warning +for older kernels built with new gcc. + +Cc: Martin Sebor +Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83456 +Signed-off-by: Arnd Bergmann +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/sit.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/ipv6/sit.c ++++ b/net/ipv6/sit.c +@@ -182,7 +182,7 @@ static void ipip6_tunnel_clone_6rd(struc + #ifdef CONFIG_IPV6_SIT_6RD + struct ip_tunnel *t = netdev_priv(dev); + +- if (t->dev == sitn->fb_tunnel_dev) { ++ if (dev == sitn->fb_tunnel_dev) { + ipv6_addr_set(&t->ip6rd.prefix, htonl(0x20020000), 0, 0, 0); + t->ip6rd.relay_prefix = 0; + t->ip6rd.prefixlen = 16; diff --git a/queue-4.15/l2tp-don-t-use-inet_shutdown-on-ppp-session-destroy.patch b/queue-4.15/l2tp-don-t-use-inet_shutdown-on-ppp-session-destroy.patch new file mode 100644 index 00000000000..595c16435dd --- /dev/null +++ b/queue-4.15/l2tp-don-t-use-inet_shutdown-on-ppp-session-destroy.patch @@ -0,0 +1,118 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: James Chapman +Date: Fri, 23 Feb 2018 17:45:44 +0000 +Subject: l2tp: don't use inet_shutdown on ppp session destroy + +From: James Chapman + + +[ Upstream commit 225eb26489d05c679a4c4197ffcb81c81e9dcaf4 ] + +Previously, if a ppp session was closed, we called inet_shutdown to mark +the socket as unconnected such that userspace would get errors and +then close the socket. 
This could race with userspace closing the +socket. Instead, leave userspace to close the socket in its own time +(our session will be detached anyway). + +BUG: KASAN: use-after-free in inet_shutdown+0x5d/0x1c0 +Read of size 4 at addr ffff880010ea3ac0 by task syzbot_347bd5ac/8296 + +CPU: 3 PID: 8296 Comm: syzbot_347bd5ac Not tainted 4.16.0-rc1+ #91 +Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 +Call Trace: + dump_stack+0x101/0x157 + ? inet_shutdown+0x5d/0x1c0 + print_address_description+0x78/0x260 + ? inet_shutdown+0x5d/0x1c0 + kasan_report+0x240/0x360 + __asan_load4+0x78/0x80 + inet_shutdown+0x5d/0x1c0 + ? pppol2tp_show+0x80/0x80 + pppol2tp_session_close+0x68/0xb0 + l2tp_tunnel_closeall+0x199/0x210 + ? udp_v6_flush_pending_frames+0x90/0x90 + l2tp_udp_encap_destroy+0x6b/0xc0 + ? l2tp_tunnel_del_work+0x2e0/0x2e0 + udpv6_destroy_sock+0x8c/0x90 + sk_common_release+0x47/0x190 + udp_lib_close+0x15/0x20 + inet_release+0x85/0xd0 + inet6_release+0x43/0x60 + sock_release+0x53/0x100 + ? sock_alloc_file+0x260/0x260 + sock_close+0x1b/0x20 + __fput+0x19f/0x380 + ____fput+0x1a/0x20 + task_work_run+0xd2/0x110 + exit_to_usermode_loop+0x18d/0x190 + do_syscall_64+0x389/0x3b0 + entry_SYSCALL_64_after_hwframe+0x26/0x9b +RIP: 0033:0x7fe240a45259 +RSP: 002b:00007fe241132df8 EFLAGS: 00000297 ORIG_RAX: 0000000000000003 +RAX: 0000000000000000 RBX: 0000000000000000 RCX: 00007fe240a45259 +RDX: 00007fe240a45259 RSI: 0000000000000000 RDI: 00000000000000a5 +RBP: 00007fe241132e20 R08: 00007fe241133700 R09: 0000000000000000 +R10: 00007fe241133700 R11: 0000000000000297 R12: 0000000000000000 +R13: 00007ffc49aff84f R14: 0000000000000000 R15: 00007fe241141040 + +Allocated by task 8331: + save_stack+0x43/0xd0 + kasan_kmalloc+0xad/0xe0 + kasan_slab_alloc+0x12/0x20 + kmem_cache_alloc+0x144/0x3e0 + sock_alloc_inode+0x22/0x130 + alloc_inode+0x3d/0xf0 + new_inode_pseudo+0x1c/0x90 + sock_alloc+0x30/0x110 + __sock_create+0xaa/0x4c0 + SyS_socket+0xbe/0x130 + do_syscall_64+0x128/0x3b0 + entry_SYSCALL_64_after_hwframe+0x26/0x9b + +Freed by task 8314: + save_stack+0x43/0xd0 + __kasan_slab_free+0x11a/0x170 + kasan_slab_free+0xe/0x10 + kmem_cache_free+0x88/0x2b0 + sock_destroy_inode+0x49/0x50 + destroy_inode+0x77/0xb0 + evict+0x285/0x340 + iput+0x429/0x530 + dentry_unlink_inode+0x28c/0x2c0 + __dentry_kill+0x1e3/0x2f0 + dput.part.21+0x500/0x560 + dput+0x24/0x30 + __fput+0x2aa/0x380 + ____fput+0x1a/0x20 + task_work_run+0xd2/0x110 + exit_to_usermode_loop+0x18d/0x190 + do_syscall_64+0x389/0x3b0 + entry_SYSCALL_64_after_hwframe+0x26/0x9b + +Fixes: fd558d186df2c ("l2tp: Split pppol2tp patch into separate l2tp and ppp parts") +Signed-off-by: James Chapman +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/l2tp/l2tp_ppp.c | 10 ---------- + 1 file changed, 10 deletions(-) + +--- a/net/l2tp/l2tp_ppp.c ++++ b/net/l2tp/l2tp_ppp.c +@@ -420,16 +420,6 @@ abort: + */ + static void pppol2tp_session_close(struct l2tp_session *session) + { +- struct sock *sk; +- +- BUG_ON(session->magic != L2TP_SESSION_MAGIC); +- +- sk = pppol2tp_session_get_sock(session); +- if (sk) { +- if (sk->sk_socket) +- inet_shutdown(sk->sk_socket, SEND_SHUTDOWN); +- sock_put(sk); +- } + } + + /* Really kill the session socket. 
(Called from sock_put() if diff --git a/queue-4.15/l2tp-don-t-use-inet_shutdown-on-tunnel-destroy.patch b/queue-4.15/l2tp-don-t-use-inet_shutdown-on-tunnel-destroy.patch new file mode 100644 index 00000000000..93f6a0ebce1 --- /dev/null +++ b/queue-4.15/l2tp-don-t-use-inet_shutdown-on-tunnel-destroy.patch @@ -0,0 +1,87 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: James Chapman +Date: Fri, 23 Feb 2018 17:45:43 +0000 +Subject: l2tp: don't use inet_shutdown on tunnel destroy + +From: James Chapman + + +[ Upstream commit 76a6abdb2513ad4ea0ded55d2c66160491f2e848 ] + +Previously, if a tunnel was closed, we called inet_shutdown to mark +the socket as unconnected such that userspace would get errors and +then close the socket. This could race with userspace closing the +socket. Instead, leave userspace to close the socket in its own time +(our tunnel will be detached anyway). + +BUG: unable to handle kernel NULL pointer dereference at 00000000000000a0 +IP: __lock_acquire+0x263/0x1630 +PGD 0 P4D 0 +Oops: 0000 [#1] SMP KASAN +Modules linked in: +CPU: 2 PID: 42 Comm: kworker/u8:2 Not tainted 4.15.0-rc7+ #129 +Workqueue: l2tp l2tp_tunnel_del_work +RIP: 0010:__lock_acquire+0x263/0x1630 +RSP: 0018:ffff88001a37fc70 EFLAGS: 00010002 +RAX: 0000000000000001 RBX: 0000000000000088 RCX: 0000000000000000 +RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000 +RBP: ffff88001a37fd18 R08: 0000000000000001 R09: 0000000000000000 +R10: 0000000000000000 R11: 00000000000076fd R12: 00000000000000a0 +R13: ffff88001a3722c0 R14: 0000000000000001 R15: 0000000000000000 +FS: 0000000000000000(0000) GS:ffff88001ad00000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 00000000000000a0 CR3: 000000001730b000 CR4: 00000000000006e0 +Call Trace: + ? __lock_acquire+0xc77/0x1630 + ? console_trylock+0x11/0xa0 + lock_acquire+0x117/0x230 + ? lock_sock_nested+0x3a/0xa0 + _raw_spin_lock_bh+0x3a/0x50 + ? lock_sock_nested+0x3a/0xa0 + lock_sock_nested+0x3a/0xa0 + inet_shutdown+0x33/0xf0 + l2tp_tunnel_del_work+0x60/0xef + process_one_work+0x1ea/0x5f0 + ? process_one_work+0x162/0x5f0 + worker_thread+0x48/0x3e0 + ? trace_hardirqs_on+0xd/0x10 + kthread+0x108/0x140 + ? process_one_work+0x5f0/0x5f0 + ? kthread_stop+0x2a0/0x2a0 + ret_from_fork+0x24/0x30 +Code: 00 41 81 ff ff 1f 00 00 0f 87 7a 13 00 00 45 85 f6 49 8b 85 +68 08 00 00 0f 84 ae 03 00 00 c7 44 24 18 00 00 00 00 e9 f0 00 00 00 <49> 81 3c +24 80 93 3f 83 b8 00 00 00 00 44 0f 44 c0 83 fe 01 0f +RIP: __lock_acquire+0x263/0x1630 RSP: ffff88001a37fc70 +CR2: 00000000000000a0 + +Fixes: 309795f4bec2d ("l2tp: Add netlink control API for L2TP") +Signed-off-by: James Chapman +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/l2tp/l2tp_core.c | 11 ++--------- + 1 file changed, 2 insertions(+), 9 deletions(-) + +--- a/net/l2tp/l2tp_core.c ++++ b/net/l2tp/l2tp_core.c +@@ -1336,17 +1336,10 @@ static void l2tp_tunnel_del_work(struct + + sock = sk->sk_socket; + +- /* If the tunnel socket was created by userspace, then go through the +- * inet layer to shut the socket down, and let userspace close it. +- * Otherwise, if we created the socket directly within the kernel, use ++ /* If the tunnel socket was created within the kernel, use + * the sk API to release it here. +- * In either case the tunnel resources are freed in the socket +- * destructor when the tunnel socket goes away. 
+ */ +- if (tunnel->fd >= 0) { +- if (sock) +- inet_shutdown(sock, 2); +- } else { ++ if (tunnel->fd < 0) { + if (sock) { + kernel_sock_shutdown(sock, SHUT_RDWR); + sock_release(sock); diff --git a/queue-4.15/l2tp-fix-race-in-pppol2tp_release-with-session-object-destroy.patch b/queue-4.15/l2tp-fix-race-in-pppol2tp_release-with-session-object-destroy.patch new file mode 100644 index 00000000000..e880747fec2 --- /dev/null +++ b/queue-4.15/l2tp-fix-race-in-pppol2tp_release-with-session-object-destroy.patch @@ -0,0 +1,172 @@ +From foo@baz Tue Mar 6 19:02:57 PST 2018 +From: James Chapman +Date: Fri, 23 Feb 2018 17:45:46 +0000 +Subject: l2tp: fix race in pppol2tp_release with session object destroy + +From: James Chapman + + +[ Upstream commit d02ba2a6110c530a32926af8ad441111774d2893 ] + +pppol2tp_release uses call_rcu to put the final ref on its socket. But +the session object doesn't hold a ref on the session socket so may be +freed while the pppol2tp_put_sk RCU callback is scheduled. Fix this by +having the session hold a ref on its socket until the session is +destroyed. It is this ref that is dropped via call_rcu. + +Sessions are also deleted via l2tp_tunnel_closeall. This must now also put +the final ref via call_rcu. So move the call_rcu call site into +pppol2tp_session_close so that this happens in both destroy paths. A +common destroy path should really be implemented, perhaps with +l2tp_tunnel_closeall calling l2tp_session_delete like pppol2tp_release +does, but this will be looked at later. + +ODEBUG: activate active (active state 1) object type: rcu_head hint: (null) +WARNING: CPU: 3 PID: 13407 at lib/debugobjects.c:291 debug_print_object+0x166/0x220 +Modules linked in: +CPU: 3 PID: 13407 Comm: syzbot_19c09769 Not tainted 4.16.0-rc2+ #38 +Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 +RIP: 0010:debug_print_object+0x166/0x220 +RSP: 0018:ffff880013647a00 EFLAGS: 00010082 +RAX: dffffc0000000008 RBX: 0000000000000003 RCX: ffffffff814d3333 +RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff88001a59f6d0 +RBP: ffff880013647a40 R08: 0000000000000000 R09: 0000000000000001 +R10: ffff8800136479a8 R11: 0000000000000000 R12: 0000000000000001 +R13: ffffffff86161420 R14: ffffffff85648b60 R15: 0000000000000000 +FS: 0000000000000000(0000) GS:ffff88001a580000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 0000000020e77000 CR3: 0000000006022000 CR4: 00000000000006e0 +Call Trace: + debug_object_activate+0x38b/0x530 + ? debug_object_assert_init+0x3b0/0x3b0 + ? __mutex_unlock_slowpath+0x85/0x8b0 + ? pppol2tp_session_destruct+0x110/0x110 + __call_rcu.constprop.66+0x39/0x890 + ? __call_rcu.constprop.66+0x39/0x890 + call_rcu_sched+0x17/0x20 + pppol2tp_release+0x2c7/0x440 + ? fcntl_setlk+0xca0/0xca0 + ? sock_alloc_file+0x340/0x340 + sock_release+0x92/0x1e0 + sock_close+0x1b/0x20 + __fput+0x296/0x6e0 + ____fput+0x1a/0x20 + task_work_run+0x127/0x1a0 + do_exit+0x7f9/0x2ce0 + ? SYSC_connect+0x212/0x310 + ? mm_update_next_owner+0x690/0x690 + ? up_read+0x1f/0x40 + ? __do_page_fault+0x3c8/0xca0 + do_group_exit+0x10d/0x330 + ? do_group_exit+0x330/0x330 + SyS_exit_group+0x22/0x30 + do_syscall_64+0x1e0/0x730 + ? 
trace_hardirqs_off_thunk+0x1a/0x1c + entry_SYSCALL_64_after_hwframe+0x42/0xb7 +RIP: 0033:0x7f362e471259 +RSP: 002b:00007ffe389abe08 EFLAGS: 00000202 ORIG_RAX: 00000000000000e7 +RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f362e471259 +RDX: 00007f362e471259 RSI: 000000000000002e RDI: 0000000000000000 +RBP: 00007ffe389abe30 R08: 0000000000000000 R09: 00007f362e944270 +R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000400b60 +R13: 00007ffe389abf50 R14: 0000000000000000 R15: 0000000000000000 +Code: 8d 3c dd a0 8f 64 85 48 89 fa 48 c1 ea 03 80 3c 02 00 75 7b 48 8b 14 dd a0 8f 64 85 4c 89 f6 48 c7 c7 20 85 64 85 e +8 2a 55 14 ff <0f> 0b 83 05 ad 2a 68 04 01 48 83 c4 18 5b 41 5c 41 5d 41 5e 41 + +Fixes: ee40fb2e1eb5b ("l2tp: protect sock pointer of struct pppol2tp_session with RCU") +Signed-off-by: James Chapman +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/l2tp/l2tp_ppp.c | 52 +++++++++++++++++++++++++++------------------------- + 1 file changed, 27 insertions(+), 25 deletions(-) + +--- a/net/l2tp/l2tp_ppp.c ++++ b/net/l2tp/l2tp_ppp.c +@@ -416,10 +416,28 @@ abort: + * Session (and tunnel control) socket create/destroy. + *****************************************************************************/ + ++static void pppol2tp_put_sk(struct rcu_head *head) ++{ ++ struct pppol2tp_session *ps; ++ ++ ps = container_of(head, typeof(*ps), rcu); ++ sock_put(ps->__sk); ++} ++ + /* Called by l2tp_core when a session socket is being closed. + */ + static void pppol2tp_session_close(struct l2tp_session *session) + { ++ struct pppol2tp_session *ps; ++ ++ ps = l2tp_session_priv(session); ++ mutex_lock(&ps->sk_lock); ++ ps->__sk = rcu_dereference_protected(ps->sk, ++ lockdep_is_held(&ps->sk_lock)); ++ RCU_INIT_POINTER(ps->sk, NULL); ++ if (ps->__sk) ++ call_rcu(&ps->rcu, pppol2tp_put_sk); ++ mutex_unlock(&ps->sk_lock); + } + + /* Really kill the session socket. (Called from sock_put() if +@@ -439,14 +457,6 @@ static void pppol2tp_session_destruct(st + } + } + +-static void pppol2tp_put_sk(struct rcu_head *head) +-{ +- struct pppol2tp_session *ps; +- +- ps = container_of(head, typeof(*ps), rcu); +- sock_put(ps->__sk); +-} +- + /* Called when the PPPoX socket (session) is closed. + */ + static int pppol2tp_release(struct socket *sock) +@@ -470,26 +480,17 @@ static int pppol2tp_release(struct socke + sock_orphan(sk); + sock->sk = NULL; + ++ /* If the socket is associated with a session, ++ * l2tp_session_delete will call pppol2tp_session_close which ++ * will drop the session's ref on the socket. ++ */ + session = pppol2tp_sock_to_session(sk); +- +- if (session != NULL) { +- struct pppol2tp_session *ps; +- ++ if (session) { + l2tp_session_delete(session); +- +- ps = l2tp_session_priv(session); +- mutex_lock(&ps->sk_lock); +- ps->__sk = rcu_dereference_protected(ps->sk, +- lockdep_is_held(&ps->sk_lock)); +- RCU_INIT_POINTER(ps->sk, NULL); +- mutex_unlock(&ps->sk_lock); +- call_rcu(&ps->rcu, pppol2tp_put_sk); +- +- /* Rely on the sock_put() call at the end of the function for +- * dropping the reference held by pppol2tp_sock_to_session(). +- * The last reference will be dropped by pppol2tp_put_sk(). +- */ ++ /* drop the ref obtained by pppol2tp_sock_to_session */ ++ sock_put(sk); + } ++ + release_sock(sk); + + /* This will delete the session context via +@@ -786,6 +787,7 @@ static int pppol2tp_connect(struct socke + + out_no_ppp: + /* This is how we get the session context from the socket. 
*/ ++ sock_hold(sk); + sk->sk_user_data = session; + rcu_assign_pointer(ps->sk, sk); + mutex_unlock(&ps->sk_lock); diff --git a/queue-4.15/l2tp-fix-races-with-tunnel-socket-close.patch b/queue-4.15/l2tp-fix-races-with-tunnel-socket-close.patch new file mode 100644 index 00000000000..6b13f6c7e69 --- /dev/null +++ b/queue-4.15/l2tp-fix-races-with-tunnel-socket-close.patch @@ -0,0 +1,417 @@ +From foo@baz Tue Mar 6 19:02:57 PST 2018 +From: James Chapman +Date: Fri, 23 Feb 2018 17:45:45 +0000 +Subject: l2tp: fix races with tunnel socket close + +From: James Chapman + + +[ Upstream commit d00fa9adc528c1b0e64d532556764852df8bd7b9 ] + +The tunnel socket tunnel->sock (struct sock) is accessed when +preparing a new ppp session on a tunnel at pppol2tp_session_init. If +the socket is closed by a thread while another is creating a new +session, the threads race. In pppol2tp_connect, the tunnel object may +be created if the pppol2tp socket is associated with the special +session_id 0 and the tunnel socket is looked up using the provided +fd. When handling this, pppol2tp_connect cannot sock_hold the tunnel +socket to prevent it being destroyed during pppol2tp_connect since +this may itself may race with the socket being destroyed. Doing +sockfd_lookup in pppol2tp_connect isn't sufficient to prevent +tunnel->sock going away either because a given tunnel socket fd may be +reused between calls to pppol2tp_connect. Instead, have +l2tp_tunnel_create sock_hold the tunnel socket before it does +sockfd_put. This ensures that the tunnel's socket is always extant +while the tunnel object exists. Hold a ref on the socket until the +tunnel is destroyed and ensure that all tunnel destroy paths go +through a common function (l2tp_tunnel_delete) since this will do the +final sock_put to release the tunnel socket. + +Since the tunnel's socket is now guaranteed to exist if the tunnel +exists, we no longer need to use sockfd_lookup via l2tp_sock_to_tunnel +to derive the tunnel from the socket since this is always +sk_user_data. + +Also, sessions no longer sock_hold the tunnel socket since sessions +already hold a tunnel ref and the tunnel sock will not be freed until +the tunnel is freed. Removing these sock_holds in +l2tp_session_register avoids a possible sock leak in the +pppol2tp_connect error path if l2tp_session_register succeeds but +attaching a ppp channel fails. The pppol2tp_connect error path could +have been fixed instead and have the sock ref dropped when the session +is freed, but doing a sock_put of the tunnel socket when the session +is freed would require a new session_free callback. It is simpler to +just remove the sock_hold of the tunnel socket in +l2tp_session_register, now that the tunnel socket lifetime is +guaranteed. + +Finally, some init code in l2tp_tunnel_create is reordered to ensure +that the new tunnel object's refcount is set and the tunnel socket ref +is taken before the tunnel socket destructor callbacks are set. 
+ +kasan: CONFIG_KASAN_INLINE enabled +kasan: GPF could be caused by NULL-ptr deref or user memory access +general protection fault: 0000 [#1] SMP KASAN +Modules linked in: +CPU: 0 PID: 4360 Comm: syzbot_19c09769 Not tainted 4.16.0-rc2+ #34 +Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 +RIP: 0010:pppol2tp_session_init+0x1d6/0x500 +RSP: 0018:ffff88001377fb40 EFLAGS: 00010212 +RAX: dffffc0000000000 RBX: ffff88001636a940 RCX: ffffffff84836c1d +RDX: 0000000000000045 RSI: 0000000055976744 RDI: 0000000000000228 +RBP: ffff88001377fb60 R08: ffffffff84836bc8 R09: 0000000000000002 +R10: ffff88001377fab8 R11: 0000000000000001 R12: 0000000000000000 +R13: ffff88001636aac8 R14: ffff8800160f81c0 R15: 1ffff100026eff76 +FS: 00007ffb3ea66700(0000) GS:ffff88001a400000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 0000000020e77000 CR3: 0000000016261000 CR4: 00000000000006f0 +Call Trace: + pppol2tp_connect+0xd18/0x13c0 + ? pppol2tp_session_create+0x170/0x170 + ? __might_fault+0x115/0x1d0 + ? lock_downgrade+0x860/0x860 + ? __might_fault+0xe5/0x1d0 + ? security_socket_connect+0x8e/0xc0 + SYSC_connect+0x1b6/0x310 + ? SYSC_bind+0x280/0x280 + ? __do_page_fault+0x5d1/0xca0 + ? up_read+0x1f/0x40 + ? __do_page_fault+0x3c8/0xca0 + SyS_connect+0x29/0x30 + ? SyS_accept+0x40/0x40 + do_syscall_64+0x1e0/0x730 + ? trace_hardirqs_off_thunk+0x1a/0x1c + entry_SYSCALL_64_after_hwframe+0x42/0xb7 +RIP: 0033:0x7ffb3e376259 +RSP: 002b:00007ffeda4f6508 EFLAGS: 00000202 ORIG_RAX: 000000000000002a +RAX: ffffffffffffffda RBX: 0000000020e77012 RCX: 00007ffb3e376259 +RDX: 000000000000002e RSI: 0000000020e77000 RDI: 0000000000000004 +RBP: 00007ffeda4f6540 R08: 0000000000000000 R09: 0000000000000000 +R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000400b60 +R13: 00007ffeda4f6660 R14: 0000000000000000 R15: 0000000000000000 +Code: 80 3d b0 ff 06 02 00 0f 84 07 02 00 00 e8 13 d6 db fc 49 8d bc 24 28 02 00 00 48 b8 00 00 00 00 00 fc ff df 48 89 f +a 48 c1 ea 03 <80> 3c 02 00 0f 85 ed 02 00 00 4d 8b a4 24 28 02 00 00 e8 13 16 + +Fixes: 80d84ef3ff1dd ("l2tp: prevent l2tp_tunnel_delete racing with userspace close") +Signed-off-by: James Chapman +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/l2tp/l2tp_core.c | 117 ++++++++++++++------------------------------------- + net/l2tp/l2tp_core.h | 23 ---------- + net/l2tp/l2tp_ip.c | 10 +--- + net/l2tp/l2tp_ip6.c | 8 +-- + 4 files changed, 42 insertions(+), 116 deletions(-) + +--- a/net/l2tp/l2tp_core.c ++++ b/net/l2tp/l2tp_core.c +@@ -136,51 +136,6 @@ l2tp_session_id_hash_2(struct l2tp_net * + + } + +-/* Lookup the tunnel socket, possibly involving the fs code if the socket is +- * owned by userspace. A struct sock returned from this function must be +- * released using l2tp_tunnel_sock_put once you're done with it. +- */ +-static struct sock *l2tp_tunnel_sock_lookup(struct l2tp_tunnel *tunnel) +-{ +- int err = 0; +- struct socket *sock = NULL; +- struct sock *sk = NULL; +- +- if (!tunnel) +- goto out; +- +- if (tunnel->fd >= 0) { +- /* Socket is owned by userspace, who might be in the process +- * of closing it. Look the socket up using the fd to ensure +- * consistency. +- */ +- sock = sockfd_lookup(tunnel->fd, &err); +- if (sock) +- sk = sock->sk; +- } else { +- /* Socket is owned by kernelspace */ +- sk = tunnel->sock; +- sock_hold(sk); +- } +- +-out: +- return sk; +-} +- +-/* Drop a reference to a tunnel socket obtained via. 
l2tp_tunnel_sock_put */ +-static void l2tp_tunnel_sock_put(struct sock *sk) +-{ +- struct l2tp_tunnel *tunnel = l2tp_sock_to_tunnel(sk); +- if (tunnel) { +- if (tunnel->fd >= 0) { +- /* Socket is owned by userspace */ +- sockfd_put(sk->sk_socket); +- } +- sock_put(sk); +- } +- sock_put(sk); +-} +- + /* Session hash list. + * The session_id SHOULD be random according to RFC2661, but several + * L2TP implementations (Cisco and Microsoft) use incrementing +@@ -193,6 +148,13 @@ l2tp_session_id_hash(struct l2tp_tunnel + return &tunnel->session_hlist[hash_32(session_id, L2TP_HASH_BITS)]; + } + ++void l2tp_tunnel_free(struct l2tp_tunnel *tunnel) ++{ ++ sock_put(tunnel->sock); ++ /* the tunnel is freed in the socket destructor */ ++} ++EXPORT_SYMBOL(l2tp_tunnel_free); ++ + /* Lookup a tunnel. A new reference is held on the returned tunnel. */ + struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id) + { +@@ -345,13 +307,11 @@ int l2tp_session_register(struct l2tp_se + } + + l2tp_tunnel_inc_refcount(tunnel); +- sock_hold(tunnel->sock); + hlist_add_head_rcu(&session->global_hlist, g_head); + + spin_unlock_bh(&pn->l2tp_session_hlist_lock); + } else { + l2tp_tunnel_inc_refcount(tunnel); +- sock_hold(tunnel->sock); + } + + hlist_add_head(&session->hlist, head); +@@ -975,7 +935,7 @@ int l2tp_udp_encap_recv(struct sock *sk, + { + struct l2tp_tunnel *tunnel; + +- tunnel = l2tp_sock_to_tunnel(sk); ++ tunnel = l2tp_tunnel(sk); + if (tunnel == NULL) + goto pass_up; + +@@ -983,13 +943,10 @@ int l2tp_udp_encap_recv(struct sock *sk, + tunnel->name, skb->len); + + if (l2tp_udp_recv_core(tunnel, skb, tunnel->recv_payload_hook)) +- goto pass_up_put; ++ goto pass_up; + +- sock_put(sk); + return 0; + +-pass_up_put: +- sock_put(sk); + pass_up: + return 1; + } +@@ -1223,7 +1180,6 @@ static void l2tp_tunnel_destruct(struct + + l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: closing...\n", tunnel->name); + +- + /* Disable udp encapsulation */ + switch (tunnel->encap) { + case L2TP_ENCAPTYPE_UDP: +@@ -1246,12 +1202,11 @@ static void l2tp_tunnel_destruct(struct + list_del_rcu(&tunnel->list); + spin_unlock_bh(&pn->l2tp_tunnel_list_lock); + +- tunnel->sock = NULL; +- l2tp_tunnel_dec_refcount(tunnel); +- + /* Call the original destructor */ + if (sk->sk_destruct) + (*sk->sk_destruct)(sk); ++ ++ kfree_rcu(tunnel, rcu); + end: + return; + } +@@ -1312,30 +1267,22 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_closeall); + /* Tunnel socket destroy hook for UDP encapsulation */ + static void l2tp_udp_encap_destroy(struct sock *sk) + { +- struct l2tp_tunnel *tunnel = l2tp_sock_to_tunnel(sk); +- if (tunnel) { +- l2tp_tunnel_closeall(tunnel); +- sock_put(sk); +- } ++ struct l2tp_tunnel *tunnel = l2tp_tunnel(sk); ++ ++ if (tunnel) ++ l2tp_tunnel_delete(tunnel); + } + + /* Workqueue tunnel deletion function */ + static void l2tp_tunnel_del_work(struct work_struct *work) + { +- struct l2tp_tunnel *tunnel = NULL; +- struct socket *sock = NULL; +- struct sock *sk = NULL; +- +- tunnel = container_of(work, struct l2tp_tunnel, del_work); ++ struct l2tp_tunnel *tunnel = container_of(work, struct l2tp_tunnel, ++ del_work); ++ struct sock *sk = tunnel->sock; ++ struct socket *sock = sk->sk_socket; + + l2tp_tunnel_closeall(tunnel); + +- sk = l2tp_tunnel_sock_lookup(tunnel); +- if (!sk) +- goto out; +- +- sock = sk->sk_socket; +- + /* If the tunnel socket was created within the kernel, use + * the sk API to release it here. 
+ */ +@@ -1346,8 +1293,10 @@ static void l2tp_tunnel_del_work(struct + } + } + +- l2tp_tunnel_sock_put(sk); +-out: ++ /* drop initial ref */ ++ l2tp_tunnel_dec_refcount(tunnel); ++ ++ /* drop workqueue ref */ + l2tp_tunnel_dec_refcount(tunnel); + } + +@@ -1600,13 +1549,22 @@ int l2tp_tunnel_create(struct net *net, + sk->sk_user_data = tunnel; + } + ++ /* Bump the reference count. The tunnel context is deleted ++ * only when this drops to zero. A reference is also held on ++ * the tunnel socket to ensure that it is not released while ++ * the tunnel is extant. Must be done before sk_destruct is ++ * set. ++ */ ++ refcount_set(&tunnel->ref_count, 1); ++ sock_hold(sk); ++ tunnel->sock = sk; ++ tunnel->fd = fd; ++ + /* Hook on the tunnel socket destructor so that we can cleanup + * if the tunnel socket goes away. + */ + tunnel->old_sk_destruct = sk->sk_destruct; + sk->sk_destruct = &l2tp_tunnel_destruct; +- tunnel->sock = sk; +- tunnel->fd = fd; + lockdep_set_class_and_name(&sk->sk_lock.slock, &l2tp_socket_class, "l2tp_sock"); + + sk->sk_allocation = GFP_ATOMIC; +@@ -1616,11 +1574,6 @@ int l2tp_tunnel_create(struct net *net, + + /* Add tunnel to our list */ + INIT_LIST_HEAD(&tunnel->list); +- +- /* Bump the reference count. The tunnel context is deleted +- * only when this drops to zero. Must be done before list insertion +- */ +- refcount_set(&tunnel->ref_count, 1); + spin_lock_bh(&pn->l2tp_tunnel_list_lock); + list_add_rcu(&tunnel->list, &pn->l2tp_tunnel_list); + spin_unlock_bh(&pn->l2tp_tunnel_list_lock); +@@ -1661,8 +1614,6 @@ void l2tp_session_free(struct l2tp_sessi + + if (tunnel) { + BUG_ON(tunnel->magic != L2TP_TUNNEL_MAGIC); +- sock_put(tunnel->sock); +- session->tunnel = NULL; + l2tp_tunnel_dec_refcount(tunnel); + } + +--- a/net/l2tp/l2tp_core.h ++++ b/net/l2tp/l2tp_core.h +@@ -219,27 +219,8 @@ static inline void *l2tp_session_priv(st + return &session->priv[0]; + } + +-static inline struct l2tp_tunnel *l2tp_sock_to_tunnel(struct sock *sk) +-{ +- struct l2tp_tunnel *tunnel; +- +- if (sk == NULL) +- return NULL; +- +- sock_hold(sk); +- tunnel = (struct l2tp_tunnel *)(sk->sk_user_data); +- if (tunnel == NULL) { +- sock_put(sk); +- goto out; +- } +- +- BUG_ON(tunnel->magic != L2TP_TUNNEL_MAGIC); +- +-out: +- return tunnel; +-} +- + struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id); ++void l2tp_tunnel_free(struct l2tp_tunnel *tunnel); + + struct l2tp_session *l2tp_session_get(const struct net *net, + struct l2tp_tunnel *tunnel, +@@ -288,7 +269,7 @@ static inline void l2tp_tunnel_inc_refco + static inline void l2tp_tunnel_dec_refcount(struct l2tp_tunnel *tunnel) + { + if (refcount_dec_and_test(&tunnel->ref_count)) +- kfree_rcu(tunnel, rcu); ++ l2tp_tunnel_free(tunnel); + } + + /* Session reference counts. 
Incremented when code obtains a reference +--- a/net/l2tp/l2tp_ip.c ++++ b/net/l2tp/l2tp_ip.c +@@ -234,17 +234,13 @@ static void l2tp_ip_close(struct sock *s + static void l2tp_ip_destroy_sock(struct sock *sk) + { + struct sk_buff *skb; +- struct l2tp_tunnel *tunnel = l2tp_sock_to_tunnel(sk); ++ struct l2tp_tunnel *tunnel = sk->sk_user_data; + + while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) + kfree_skb(skb); + +- if (tunnel) { +- l2tp_tunnel_closeall(tunnel); +- sock_put(sk); +- } +- +- sk_refcnt_debug_dec(sk); ++ if (tunnel) ++ l2tp_tunnel_delete(tunnel); + } + + static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) +--- a/net/l2tp/l2tp_ip6.c ++++ b/net/l2tp/l2tp_ip6.c +@@ -248,16 +248,14 @@ static void l2tp_ip6_close(struct sock * + + static void l2tp_ip6_destroy_sock(struct sock *sk) + { +- struct l2tp_tunnel *tunnel = l2tp_sock_to_tunnel(sk); ++ struct l2tp_tunnel *tunnel = sk->sk_user_data; + + lock_sock(sk); + ip6_flush_pending_frames(sk); + release_sock(sk); + +- if (tunnel) { +- l2tp_tunnel_closeall(tunnel); +- sock_put(sk); +- } ++ if (tunnel) ++ l2tp_tunnel_delete(tunnel); + + inet6_destroy_sock(sk); + } diff --git a/queue-4.15/l2tp-fix-tunnel-lookup-use-after-free-race.patch b/queue-4.15/l2tp-fix-tunnel-lookup-use-after-free-race.patch new file mode 100644 index 00000000000..16fae98a1ee --- /dev/null +++ b/queue-4.15/l2tp-fix-tunnel-lookup-use-after-free-race.patch @@ -0,0 +1,116 @@ +From foo@baz Tue Mar 6 19:02:57 PST 2018 +From: James Chapman +Date: Fri, 23 Feb 2018 17:45:47 +0000 +Subject: l2tp: fix tunnel lookup use-after-free race + +From: James Chapman + + +[ Upstream commit 28f5bfb819195ad9c2eb9486babe7b0e4efe925f ] + +l2tp_tunnel_get walks the tunnel list to find a matching tunnel +instance and if a match is found, its refcount is increased before +returning the tunnel pointer. But when tunnel objects are destroyed, +they are on the tunnel list after their refcount hits zero. Fix this +by moving the code that removes the tunnel from the tunnel list from +the tunnel socket destructor into in the l2tp_tunnel_delete path, +before the tunnel refcount is decremented. + +refcount_t: increment on 0; use-after-free. +WARNING: CPU: 3 PID: 13507 at lib/refcount.c:153 refcount_inc+0x47/0x50 +Modules linked in: +CPU: 3 PID: 13507 Comm: syzbot_6e6a5ec8 Not tainted 4.16.0-rc2+ #36 +Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 +RIP: 0010:refcount_inc+0x47/0x50 +RSP: 0018:ffff8800136ffb20 EFLAGS: 00010286 +RAX: dffffc0000000008 RBX: ffff880017068e68 RCX: ffffffff814d3333 +RDX: 0000000000000000 RSI: ffff88001a59f6d8 RDI: ffff88001a59f6d8 +RBP: ffff8800136ffb28 R08: 0000000000000000 R09: 0000000000000000 +R10: ffff8800136ffab0 R11: 0000000000000000 R12: ffff880017068e50 +R13: 0000000000000000 R14: ffff8800174da800 R15: 0000000000000004 +FS: 00007f403ab1e700(0000) GS:ffff88001a580000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 00000000205fafd2 CR3: 0000000016770000 CR4: 00000000000006e0 +Call Trace: + l2tp_tunnel_get+0x2dd/0x4e0 + pppol2tp_connect+0x428/0x13c0 + ? pppol2tp_session_create+0x170/0x170 + ? __might_fault+0x115/0x1d0 + ? lock_downgrade+0x860/0x860 + ? __might_fault+0xe5/0x1d0 + ? security_socket_connect+0x8e/0xc0 + SYSC_connect+0x1b6/0x310 + ? SYSC_bind+0x280/0x280 + ? __do_page_fault+0x5d1/0xca0 + ? up_read+0x1f/0x40 + ? __do_page_fault+0x3c8/0xca0 + SyS_connect+0x29/0x30 + ? SyS_accept+0x40/0x40 + do_syscall_64+0x1e0/0x730 + ? 
trace_hardirqs_off_thunk+0x1a/0x1c + entry_SYSCALL_64_after_hwframe+0x42/0xb7 +RIP: 0033:0x7f403a42f259 +RSP: 002b:00007f403ab1dee8 EFLAGS: 00000296 ORIG_RAX: 000000000000002a +RAX: ffffffffffffffda RBX: 00000000205fafe4 RCX: 00007f403a42f259 +RDX: 000000000000002e RSI: 00000000205fafd2 RDI: 0000000000000004 +RBP: 00007f403ab1df20 R08: 00007f403ab1e700 R09: 0000000000000000 +R10: 00007f403ab1e700 R11: 0000000000000296 R12: 0000000000000000 +R13: 00007ffc81906cbf R14: 0000000000000000 R15: 00007f403ab2b040 +Code: 3b ff 5b 5d c3 e8 ca 5f 3b ff 80 3d 49 8e 66 04 00 75 ea e8 bc 5f 3b ff 48 c7 c7 60 69 64 85 c6 05 34 8e 66 04 01 e8 59 49 15 ff <0f> 0b eb ce 0f 1f 44 00 00 55 48 89 e5 41 56 41 55 41 54 53 49 + +Fixes: f8ccac0e44934 ("l2tp: put tunnel socket release on a workqueue") +Reported-and-tested-by: syzbot+19c09769f14b48810113@syzkaller.appspotmail.com +Reported-and-tested-by: syzbot+347bd5acde002e353a36@syzkaller.appspotmail.com +Reported-and-tested-by: syzbot+6e6a5ec8de31a94cd015@syzkaller.appspotmail.com +Reported-and-tested-by: syzbot+9df43faf09bd400f2993@syzkaller.appspotmail.com +Signed-off-by: James Chapman +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/l2tp/l2tp_core.c | 14 +++++++------- + 1 file changed, 7 insertions(+), 7 deletions(-) + +--- a/net/l2tp/l2tp_core.c ++++ b/net/l2tp/l2tp_core.c +@@ -1173,7 +1173,6 @@ EXPORT_SYMBOL_GPL(l2tp_xmit_skb); + static void l2tp_tunnel_destruct(struct sock *sk) + { + struct l2tp_tunnel *tunnel = l2tp_tunnel(sk); +- struct l2tp_net *pn; + + if (tunnel == NULL) + goto end; +@@ -1196,12 +1195,6 @@ static void l2tp_tunnel_destruct(struct + sk->sk_destruct = tunnel->old_sk_destruct; + sk->sk_user_data = NULL; + +- /* Remove the tunnel struct from the tunnel list */ +- pn = l2tp_pernet(tunnel->l2tp_net); +- spin_lock_bh(&pn->l2tp_tunnel_list_lock); +- list_del_rcu(&tunnel->list); +- spin_unlock_bh(&pn->l2tp_tunnel_list_lock); +- + /* Call the original destructor */ + if (sk->sk_destruct) + (*sk->sk_destruct)(sk); +@@ -1280,6 +1273,7 @@ static void l2tp_tunnel_del_work(struct + del_work); + struct sock *sk = tunnel->sock; + struct socket *sock = sk->sk_socket; ++ struct l2tp_net *pn; + + l2tp_tunnel_closeall(tunnel); + +@@ -1293,6 +1287,12 @@ static void l2tp_tunnel_del_work(struct + } + } + ++ /* Remove the tunnel struct from the tunnel list */ ++ pn = l2tp_pernet(tunnel->l2tp_net); ++ spin_lock_bh(&pn->l2tp_tunnel_list_lock); ++ list_del_rcu(&tunnel->list); ++ spin_unlock_bh(&pn->l2tp_tunnel_list_lock); ++ + /* drop initial ref */ + l2tp_tunnel_dec_refcount(tunnel); + diff --git a/queue-4.15/mlxsw-spectrum_router-do-not-unconditionally-clear-route-offload-indication.patch b/queue-4.15/mlxsw-spectrum_router-do-not-unconditionally-clear-route-offload-indication.patch new file mode 100644 index 00000000000..b275ee0daeb --- /dev/null +++ b/queue-4.15/mlxsw-spectrum_router-do-not-unconditionally-clear-route-offload-indication.patch @@ -0,0 +1,41 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Ido Schimmel +Date: Sat, 17 Feb 2018 00:30:44 +0100 +Subject: mlxsw: spectrum_router: Do not unconditionally clear route offload indication + +From: Ido Schimmel + + +[ Upstream commit d1c95af366961101819f07e3c64d44f3be7f0367 ] + +When mlxsw replaces (or deletes) a route it removes the offload +indication from the replaced route. This is problematic for IPv4 routes, +as the offload indication is stored in the fib_info which is usually +shared between multiple routes. 
+ +Instead of unconditionally clearing the offload indication, only clear +it if no other route is using the fib_info. + +Fixes: 3984d1a89fe7 ("mlxsw: spectrum_router: Provide offload indication using nexthop flags") +Signed-off-by: Ido Schimmel +Reported-by: Alexander Petrovskiy +Tested-by: Alexander Petrovskiy +Signed-off-by: Jiri Pirko +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c ++++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +@@ -3765,6 +3765,9 @@ mlxsw_sp_fib4_entry_offload_unset(struct + struct mlxsw_sp_nexthop_group *nh_grp = fib_entry->nh_group; + int i; + ++ if (!list_is_singular(&nh_grp->fib_list)) ++ return; ++ + for (i = 0; i < nh_grp->count; i++) { + struct mlxsw_sp_nexthop *nh = &nh_grp->nexthops[i]; + diff --git a/queue-4.15/mlxsw-spectrum_router-fix-error-path-in-mlxsw_sp_vr_create.patch b/queue-4.15/mlxsw-spectrum_router-fix-error-path-in-mlxsw_sp_vr_create.patch new file mode 100644 index 00000000000..46222bf8198 --- /dev/null +++ b/queue-4.15/mlxsw-spectrum_router-fix-error-path-in-mlxsw_sp_vr_create.patch @@ -0,0 +1,92 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Jiri Pirko +Date: Tue, 13 Feb 2018 11:22:42 +0100 +Subject: mlxsw: spectrum_router: Fix error path in mlxsw_sp_vr_create + +From: Jiri Pirko + + +[ Upstream commit 0f2d2b2736b08dafa3bde31d048750fbc8df3a31 ] + +Since mlxsw_sp_fib_create() and mlxsw_sp_mr_table_create() +use ERR_PTR macro to propagate int err through return of a pointer, +the return value is not NULL in case of failure. So if one +of the calls fails, one of vr->fib4, vr->fib6 or vr->mr4_table +is not NULL and mlxsw_sp_vr_is_used wrongly assumes +that vr is in use which leads to crash like following one: + +[ 1293.949291] BUG: unable to handle kernel NULL pointer dereference at 00000000000006c9 +[ 1293.952729] IP: mlxsw_sp_mr_table_flush+0x15/0x70 [mlxsw_spectrum] + +Fix this by using local variables to hold the pointers and set vr->* +only in case everything went fine. + +Fixes: 76610ebbde18 ("mlxsw: spectrum_router: Refactor virtual router handling") +Fixes: a3d9bc506d64 ("mlxsw: spectrum_router: Extend virtual routers with IPv6 support") +Fixes: d42b0965b1d4 ("mlxsw: spectrum_router: Add multicast routes notification handling functionality") +Signed-off-by: Jiri Pirko +Reviewed-by: Ido Schimmel +Signed-off-by: Jiri Pirko +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 32 ++++++++++-------- + 1 file changed, 18 insertions(+), 14 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c ++++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +@@ -737,6 +737,9 @@ static struct mlxsw_sp_vr *mlxsw_sp_vr_c + u32 tb_id, + struct netlink_ext_ack *extack) + { ++ struct mlxsw_sp_mr_table *mr4_table; ++ struct mlxsw_sp_fib *fib4; ++ struct mlxsw_sp_fib *fib6; + struct mlxsw_sp_vr *vr; + int err; + +@@ -745,29 +748,30 @@ static struct mlxsw_sp_vr *mlxsw_sp_vr_c + NL_SET_ERR_MSG(extack, "spectrum: Exceeded number of supported virtual routers"); + return ERR_PTR(-EBUSY); + } +- vr->fib4 = mlxsw_sp_fib_create(vr, MLXSW_SP_L3_PROTO_IPV4); +- if (IS_ERR(vr->fib4)) +- return ERR_CAST(vr->fib4); +- vr->fib6 = mlxsw_sp_fib_create(vr, MLXSW_SP_L3_PROTO_IPV6); +- if (IS_ERR(vr->fib6)) { +- err = PTR_ERR(vr->fib6); ++ fib4 = mlxsw_sp_fib_create(vr, MLXSW_SP_L3_PROTO_IPV4); ++ if (IS_ERR(fib4)) ++ return ERR_CAST(fib4); ++ fib6 = mlxsw_sp_fib_create(vr, MLXSW_SP_L3_PROTO_IPV6); ++ if (IS_ERR(fib6)) { ++ err = PTR_ERR(fib6); + goto err_fib6_create; + } +- vr->mr4_table = mlxsw_sp_mr_table_create(mlxsw_sp, vr->id, +- MLXSW_SP_L3_PROTO_IPV4); +- if (IS_ERR(vr->mr4_table)) { +- err = PTR_ERR(vr->mr4_table); ++ mr4_table = mlxsw_sp_mr_table_create(mlxsw_sp, vr->id, ++ MLXSW_SP_L3_PROTO_IPV4); ++ if (IS_ERR(mr4_table)) { ++ err = PTR_ERR(mr4_table); + goto err_mr_table_create; + } ++ vr->fib4 = fib4; ++ vr->fib6 = fib6; ++ vr->mr4_table = mr4_table; + vr->tb_id = tb_id; + return vr; + + err_mr_table_create: +- mlxsw_sp_fib_destroy(vr->fib6); +- vr->fib6 = NULL; ++ mlxsw_sp_fib_destroy(fib6); + err_fib6_create: +- mlxsw_sp_fib_destroy(vr->fib4); +- vr->fib4 = NULL; ++ mlxsw_sp_fib_destroy(fib4); + return ERR_PTR(err); + } + diff --git a/queue-4.15/mlxsw-spectrum_switchdev-check-success-of-fdb-add-operation.patch b/queue-4.15/mlxsw-spectrum_switchdev-check-success-of-fdb-add-operation.patch new file mode 100644 index 00000000000..56d967cfc55 --- /dev/null +++ b/queue-4.15/mlxsw-spectrum_switchdev-check-success-of-fdb-add-operation.patch @@ -0,0 +1,104 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Shalom Toledo +Date: Thu, 1 Mar 2018 11:37:05 +0100 +Subject: mlxsw: spectrum_switchdev: Check success of FDB add operation + +From: Shalom Toledo + + +[ Upstream commit 0a8a1bf17e3af34f1f8d2368916a6327f8b3bfd5 ] + +Until now, we assumed that in case of error when adding FDB entries, the +write operation will fail, but this is not the case. Instead, we need to +check that the number of entries reported in the response is equal to +the number of entries specified in the request. + +Fixes: 56ade8fe3fe1 ("mlxsw: spectrum: Add initial support for Spectrum ASIC") +Reported-by: Ido Schimmel +Signed-off-by: Shalom Toledo +Reviewed-by: Ido Schimmel +Signed-off-by: Jiri Pirko +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 29 +++++++++++++-- + 1 file changed, 27 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c ++++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +@@ -1203,6 +1203,7 @@ static int __mlxsw_sp_port_fdb_uc_op(str + bool dynamic) + { + char *sfd_pl; ++ u8 num_rec; + int err; + + sfd_pl = kmalloc(MLXSW_REG_SFD_LEN, GFP_KERNEL); +@@ -1212,9 +1213,16 @@ static int __mlxsw_sp_port_fdb_uc_op(str + mlxsw_reg_sfd_pack(sfd_pl, mlxsw_sp_sfd_op(adding), 0); + mlxsw_reg_sfd_uc_pack(sfd_pl, 0, mlxsw_sp_sfd_rec_policy(dynamic), + mac, fid, action, local_port); ++ num_rec = mlxsw_reg_sfd_num_rec_get(sfd_pl); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sfd), sfd_pl); +- kfree(sfd_pl); ++ if (err) ++ goto out; ++ ++ if (num_rec != mlxsw_reg_sfd_num_rec_get(sfd_pl)) ++ err = -EBUSY; + ++out: ++ kfree(sfd_pl); + return err; + } + +@@ -1239,6 +1247,7 @@ static int mlxsw_sp_port_fdb_uc_lag_op(s + bool adding, bool dynamic) + { + char *sfd_pl; ++ u8 num_rec; + int err; + + sfd_pl = kmalloc(MLXSW_REG_SFD_LEN, GFP_KERNEL); +@@ -1249,9 +1258,16 @@ static int mlxsw_sp_port_fdb_uc_lag_op(s + mlxsw_reg_sfd_uc_lag_pack(sfd_pl, 0, mlxsw_sp_sfd_rec_policy(dynamic), + mac, fid, MLXSW_REG_SFD_REC_ACTION_NOP, + lag_vid, lag_id); ++ num_rec = mlxsw_reg_sfd_num_rec_get(sfd_pl); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sfd), sfd_pl); +- kfree(sfd_pl); ++ if (err) ++ goto out; + ++ if (num_rec != mlxsw_reg_sfd_num_rec_get(sfd_pl)) ++ err = -EBUSY; ++ ++out: ++ kfree(sfd_pl); + return err; + } + +@@ -1296,6 +1312,7 @@ static int mlxsw_sp_port_mdb_op(struct m + u16 fid, u16 mid_idx, bool adding) + { + char *sfd_pl; ++ u8 num_rec; + int err; + + sfd_pl = kmalloc(MLXSW_REG_SFD_LEN, GFP_KERNEL); +@@ -1305,7 +1322,15 @@ static int mlxsw_sp_port_mdb_op(struct m + mlxsw_reg_sfd_pack(sfd_pl, mlxsw_sp_sfd_op(adding), 0); + mlxsw_reg_sfd_mc_pack(sfd_pl, 0, addr, fid, + MLXSW_REG_SFD_REC_ACTION_NOP, mid_idx); ++ num_rec = mlxsw_reg_sfd_num_rec_get(sfd_pl); + err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sfd), sfd_pl); ++ if (err) ++ goto out; ++ ++ if (num_rec != mlxsw_reg_sfd_num_rec_get(sfd_pl)) ++ err = -EBUSY; ++ ++out: + kfree(sfd_pl); + return err; + } diff --git a/queue-4.15/net-amd-xgbe-fix-comparison-to-bitshift-when-dealing-with-a-mask.patch b/queue-4.15/net-amd-xgbe-fix-comparison-to-bitshift-when-dealing-with-a-mask.patch new file mode 100644 index 00000000000..052075bcf24 --- /dev/null +++ b/queue-4.15/net-amd-xgbe-fix-comparison-to-bitshift-when-dealing-with-a-mask.patch @@ -0,0 +1,32 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Wolfram Sang +Date: Mon, 5 Feb 2018 21:10:01 +0100 +Subject: net: amd-xgbe: fix comparison to bitshift when dealing with a mask + +From: Wolfram Sang + + +[ Upstream commit a3276892db7a588bedc33168e502572008f714a9 ] + +Due to a typo, the mask was destroyed by a comparison instead of a bit +shift. + +Signed-off-by: Wolfram Sang +Acked-by: Tom Lendacky +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/amd/xgbe/xgbe-drv.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c ++++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c +@@ -595,7 +595,7 @@ isr_done: + + reissue_mask = 1 << 0; + if (!pdata->per_channel_irq) +- reissue_mask |= 0xffff < 4; ++ reissue_mask |= 0xffff << 4; + + XP_IOWRITE(pdata, XP_INT_REISSUE_EN, reissue_mask); + } diff --git a/queue-4.15/net-ethernet-ti-cpsw-fix-net-watchdog-timeout.patch b/queue-4.15/net-ethernet-ti-cpsw-fix-net-watchdog-timeout.patch new file mode 100644 index 00000000000..217aa6cfcde --- /dev/null +++ b/queue-4.15/net-ethernet-ti-cpsw-fix-net-watchdog-timeout.patch @@ -0,0 +1,84 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Grygorii Strashko +Date: Tue, 6 Feb 2018 19:17:06 -0600 +Subject: net: ethernet: ti: cpsw: fix net watchdog timeout + +From: Grygorii Strashko + + +[ Upstream commit 62f94c2101f35cd45775df00ba09bde77580e26a ] + +It was discovered that a simple program which indefinitely sends 200b UDP +packets and runs on TI AM574x SoC (SMP) under RT Kernel triggers a network +watchdog timeout in the TI CPSW driver (<6 hours run). The network watchdog +timeout is triggered due to a race between cpsw_ndo_start_xmit() and +cpsw_tx_handler() [NAPI] + +cpsw_ndo_start_xmit() + if (unlikely(!cpdma_check_free_tx_desc(txch))) { + txq = netdev_get_tx_queue(ndev, q_idx); + netif_tx_stop_queue(txq); + +^^ as per [1] a barrier has to be used after set_bit(), otherwise the new value +might not be visible to other cpus + } + +cpsw_tx_handler() + if (unlikely(netif_tx_queue_stopped(txq))) + netif_tx_wake_queue(txq); + +and when it happens the ndev TX queue becomes disabled forever while the driver's HW +TX queue is empty. + +Fix this by adding smp_mb__after_atomic() after the netif_tx_stop_queue() +calls and double-checking for free TX descriptors after stopping the ndev TX queue +- if there are free TX descriptors, wake up the ndev TX queue. + +[1] https://www.kernel.org/doc/html/latest/core-api/atomic_ops.html +Signed-off-by: Grygorii Strashko +Reviewed-by: Ivan Khoronzhuk +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/ti/cpsw.c | 16 ++++++++++++++-- + 1 file changed, 14 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/ti/cpsw.c ++++ b/drivers/net/ethernet/ti/cpsw.c +@@ -1618,6 +1618,7 @@ static netdev_tx_t cpsw_ndo_start_xmit(s + q_idx = q_idx % cpsw->tx_ch_num; + + txch = cpsw->txv[q_idx].ch; ++ txq = netdev_get_tx_queue(ndev, q_idx); + ret = cpsw_tx_packet_submit(priv, skb, txch); + if (unlikely(ret != 0)) { + cpsw_err(priv, tx_err, "desc submit failed\n"); +@@ -1628,15 +1629,26 @@ static netdev_tx_t cpsw_ndo_start_xmit(s + * tell the kernel to stop sending us tx frames.
+ */ + if (unlikely(!cpdma_check_free_tx_desc(txch))) { +- txq = netdev_get_tx_queue(ndev, q_idx); + netif_tx_stop_queue(txq); ++ ++ /* Barrier, so that stop_queue visible to other cpus */ ++ smp_mb__after_atomic(); ++ ++ if (cpdma_check_free_tx_desc(txch)) ++ netif_tx_wake_queue(txq); + } + + return NETDEV_TX_OK; + fail: + ndev->stats.tx_dropped++; +- txq = netdev_get_tx_queue(ndev, skb_get_queue_mapping(skb)); + netif_tx_stop_queue(txq); ++ ++ /* Barrier, so that stop_queue visible to other cpus */ ++ smp_mb__after_atomic(); ++ ++ if (cpdma_check_free_tx_desc(txch)) ++ netif_tx_wake_queue(txq); ++ + return NETDEV_TX_BUSY; + } + diff --git a/queue-4.15/net-fix-race-on-decreasing-number-of-tx-queues.patch b/queue-4.15/net-fix-race-on-decreasing-number-of-tx-queues.patch new file mode 100644 index 00000000000..82b2083df13 --- /dev/null +++ b/queue-4.15/net-fix-race-on-decreasing-number-of-tx-queues.patch @@ -0,0 +1,68 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Jakub Kicinski +Date: Mon, 12 Feb 2018 21:35:31 -0800 +Subject: net: fix race on decreasing number of TX queues + +From: Jakub Kicinski + + +[ Upstream commit ac5b70198adc25c73fba28de4f78adcee8f6be0b ] + +netif_set_real_num_tx_queues() can be called when netdev is up. +That usually happens when user requests change of number of +channels/rings with ethtool -L. The procedure for changing +the number of queues involves resetting the qdiscs and setting +dev->num_tx_queues to the new value. When the new value is +lower than the old one, extra care has to be taken to ensure +ordering of accesses to the number of queues vs qdisc reset. + +Currently the queues are reset before new dev->num_tx_queues +is assigned, leaving a window of time where packets can be +enqueued onto the queues going down, leading to a likely +crash in the drivers, since most drivers don't check if TX +skbs are assigned to an active queue. + +Fixes: e6484930d7c7 ("net: allocate tx queues in register_netdevice") +Signed-off-by: Jakub Kicinski +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/dev.c | 11 +++++++++-- + 1 file changed, 9 insertions(+), 2 deletions(-) + +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -2366,8 +2366,11 @@ EXPORT_SYMBOL(netdev_set_num_tc); + */ + int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) + { ++ bool disabling; + int rc; + ++ disabling = txq < dev->real_num_tx_queues; ++ + if (txq < 1 || txq > dev->num_tx_queues) + return -EINVAL; + +@@ -2383,15 +2386,19 @@ int netif_set_real_num_tx_queues(struct + if (dev->num_tc) + netif_setup_tc(dev, txq); + +- if (txq < dev->real_num_tx_queues) { ++ dev->real_num_tx_queues = txq; ++ ++ if (disabling) { ++ synchronize_net(); + qdisc_reset_all_tx_gt(dev, txq); + #ifdef CONFIG_XPS + netif_reset_xps_queues_gt(dev, txq); + #endif + } ++ } else { ++ dev->real_num_tx_queues = txq; + } + +- dev->real_num_tx_queues = txq; + return 0; + } + EXPORT_SYMBOL(netif_set_real_num_tx_queues); diff --git a/queue-4.15/net-ipv4-don-t-allow-setting-net.ipv4.route.min_pmtu-below-68.patch b/queue-4.15/net-ipv4-don-t-allow-setting-net.ipv4.route.min_pmtu-below-68.patch new file mode 100644 index 00000000000..c08ff7c02a8 --- /dev/null +++ b/queue-4.15/net-ipv4-don-t-allow-setting-net.ipv4.route.min_pmtu-below-68.patch @@ -0,0 +1,64 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Sabrina Dubroca +Date: Mon, 26 Feb 2018 16:13:43 +0100 +Subject: net: ipv4: don't allow setting net.ipv4.route.min_pmtu below 68 + +From: Sabrina Dubroca + + +[ Upstream commit c7272c2f1229125f74f22dcdd59de9bbd804f1c8 ] + +According to RFC 1191 sections 3 and 4, ICMP frag-needed messages +indicating an MTU below 68 should be rejected: + + A host MUST never reduce its estimate of the Path MTU below 68 + octets. + +and (talking about ICMP frag-needed's Next-Hop MTU field): + + This field will never contain a value less than 68, since every + router "must be able to forward a datagram of 68 octets without + fragmentation". + +Furthermore, by letting net.ipv4.route.min_pmtu be set to negative +values, we can end up with a very large PMTU when (-1) is cast into u32. + +Let's also make ip_rt_min_pmtu a u32, since it's only ever compared to +unsigned ints. + +Reported-by: Jianlin Shi +Signed-off-by: Sabrina Dubroca +Reviewed-by: Stefano Brivio +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/route.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +--- a/net/ipv4/route.c ++++ b/net/ipv4/route.c +@@ -128,10 +128,13 @@ static int ip_rt_redirect_silence __read + static int ip_rt_error_cost __read_mostly = HZ; + static int ip_rt_error_burst __read_mostly = 5 * HZ; + static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; +-static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; ++static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; + static int ip_rt_min_advmss __read_mostly = 256; + + static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; ++ ++static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU; ++ + /* + * Interface to generic destination cache. 
+ */ +@@ -2934,7 +2937,8 @@ static struct ctl_table ipv4_route_table + .data = &ip_rt_min_pmtu, + .maxlen = sizeof(int), + .mode = 0644, +- .proc_handler = proc_dointvec, ++ .proc_handler = proc_dointvec_minmax, ++ .extra1 = &ip_min_valid_pmtu, + }, + { + .procname = "min_adv_mss", diff --git a/queue-4.15/net-ipv4-set-addr_type-in-hash_keys-for-forwarded-case.patch b/queue-4.15/net-ipv4-set-addr_type-in-hash_keys-for-forwarded-case.patch new file mode 100644 index 00000000000..d16954998e3 --- /dev/null +++ b/queue-4.15/net-ipv4-set-addr_type-in-hash_keys-for-forwarded-case.patch @@ -0,0 +1,36 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: David Ahern +Date: Wed, 21 Feb 2018 11:00:54 -0800 +Subject: net: ipv4: Set addr_type in hash_keys for forwarded case + +From: David Ahern + + +[ Upstream commit 1fe4b1184c2ae2bfbf9e8b14c9c0c1945c98f205 ] + +The result of the skb flow dissect is copied from keys to hash_keys to +ensure only the intended data is hashed. The original L4 hash patch +overlooked setting the addr_type for this case; add it. + +Fixes: bf4e0a3db97eb ("net: ipv4: add support for ECMP hash policy choice") +Reported-by: Ido Schimmel +Signed-off-by: David Ahern +Acked-by: Nikolay Aleksandrov +Reviewed-by: Ido Schimmel +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/route.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/net/ipv4/route.c ++++ b/net/ipv4/route.c +@@ -1832,6 +1832,8 @@ int fib_multipath_hash(const struct fib_ + return skb_get_hash_raw(skb) >> 1; + memset(&hash_keys, 0, sizeof(hash_keys)); + skb_flow_dissect_flow_keys(skb, &keys, flag); ++ + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; + hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; + hash_keys.ports.src = keys.ports.src; diff --git a/queue-4.15/net-mlx5-fix-error-handling-when-adding-flow-rules.patch b/queue-4.15/net-mlx5-fix-error-handling-when-adding-flow-rules.patch new file mode 100644 index 00000000000..8925769c30e --- /dev/null +++ b/queue-4.15/net-mlx5-fix-error-handling-when-adding-flow-rules.patch @@ -0,0 +1,52 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Vlad Buslov +Date: Tue, 6 Feb 2018 10:52:19 +0200 +Subject: net/mlx5: Fix error handling when adding flow rules + +From: Vlad Buslov + + +[ Upstream commit 9238e380e823a39983ee8d6b6ee8d1a9c4ba8a65 ] + +If building the match list or adding an existing fg failed while the +node was locked, the function returned without unlocking it. +This happened if the node version changed, or if adding an existing fg +returned with EAGAIN after jumping to the search_again_locked label.
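+ +The shape of the fix, lifted from the first hunk below: every early +return taken while the write reference on the table node is still held +must drop it first: + + err = build_match_list(&match_head, ft, spec); + if (err) { + if (take_write) + up_write_ref_node(&ft->node); /* was missing */ + return ERR_PTR(err); + }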
+ +Fixes: bd71b08ec2ee ("net/mlx5: Support multiple updates of steering rules in parallel") +Signed-off-by: Vlad Buslov +Reviewed-by: Maor Gottlieb +Signed-off-by: Saeed Mahameed +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +@@ -1755,8 +1755,11 @@ search_again_locked: + + /* Collect all fgs which has a matching match_criteria */ + err = build_match_list(&match_head, ft, spec); +- if (err) ++ if (err) { ++ if (take_write) ++ up_write_ref_node(&ft->node); + return ERR_PTR(err); ++ } + + if (!take_write) + up_read_ref_node(&ft->node); +@@ -1765,8 +1768,11 @@ search_again_locked: + dest_num, version); + free_match_list(&match_head); + if (!IS_ERR(rule) || +- (PTR_ERR(rule) != -ENOENT && PTR_ERR(rule) != -EAGAIN)) ++ (PTR_ERR(rule) != -ENOENT && PTR_ERR(rule) != -EAGAIN)) { ++ if (take_write) ++ up_write_ref_node(&ft->node); + return rule; ++ } + + if (!take_write) { + nested_down_write_ref_node(&ft->node, FS_LOCK_GRANDPARENT); diff --git a/queue-4.15/net-mlx5e-fix-loopback-self-test-when-gro-is-off.patch b/queue-4.15/net-mlx5e-fix-loopback-self-test-when-gro-is-off.patch new file mode 100644 index 00000000000..1cc558119d4 --- /dev/null +++ b/queue-4.15/net-mlx5e-fix-loopback-self-test-when-gro-is-off.patch @@ -0,0 +1,36 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Inbar Karmy +Date: Thu, 7 Dec 2017 17:26:33 +0200 +Subject: net/mlx5e: Fix loopback self test when GRO is off + +From: Inbar Karmy + + +[ Upstream commit ef7a3518f7dd4f4cf5e5b5358c93d1eb78df28fb ] + +When GRO is off, the transport header pointer in sk_buff is +initialized to the network header. + +To find the udp header, instead of using udp_hdr() which assumes +skb_transport_header was set, manually calculate the udp header offset. + +Fixes: 0952da791c97 ("net/mlx5e: Add support for loopback selftest") +Signed-off-by: Inbar Karmy +Signed-off-by: Saeed Mahameed +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c +@@ -216,7 +216,8 @@ mlx5e_test_loopback_validate(struct sk_b + if (iph->protocol != IPPROTO_UDP) + goto out; + +- udph = udp_hdr(skb); ++ /* Don't assume skb_transport_header() was set */ ++ udph = (struct udphdr *)((u8 *)iph + 4 * iph->ihl); + if (udph->dest != htons(9)) + goto out; + diff --git a/queue-4.15/net-mlx5e-fix-tcp-checksum-in-lro-buffers.patch b/queue-4.15/net-mlx5e-fix-tcp-checksum-in-lro-buffers.patch new file mode 100644 index 00000000000..6bcfbec653d --- /dev/null +++ b/queue-4.15/net-mlx5e-fix-tcp-checksum-in-lro-buffers.patch @@ -0,0 +1,125 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Gal Pressman +Date: Wed, 20 Dec 2017 08:48:24 +0200 +Subject: net/mlx5e: Fix TCP checksum in LRO buffers + +From: Gal Pressman + + +[ Upstream commit 8babd44d2079079f9d5a4aca7005aed80236efe0 ] + +When receiving an LRO packet, the checksum field is set by the hardware +to the checksum of the first coalesced packet. Obviously, this checksum +is not valid for the merged LRO packet and should be fixed. We can use +the CQE checksum, which covers the entire merged packet's +TCP payload, to help us calculate the checksum incrementally.
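+ +For IPv4 the math is the usual incremental-checksum pattern (a sketch +of the hunk below): zero the checksum field, fold the rewritten TCP +header into the hardware value with csum_partial(), then add the +pseudo header: + + tcp->check = 0; + check = csum_partial(tcp, tcp->doff * 4, + csum_unfold((__force __sum16)cqe->check_sum)); + tcp->check = csum_tcpudp_magic(ipv4->saddr, ipv4->daddr, + tot_len - sizeof(struct iphdr), + IPPROTO_TCP, check);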
+ +Tested by sending IPv4/6 traffic with LRO enabled, RX checksum disabled +and watching nstat checksum error counters (in addition to the obvious +bandwidth drop caused by checksum errors). + +This bug is usually "hidden" since LRO packets would go through the +CHECKSUM_UNNECESSARY flow which does not validate the packet checksum. + +It's important to note that previous to this patch, LRO packets provided +with CHECKSUM_UNNECESSARY are indeed packets with a correct validated +checksum (even though the checksum inside the TCP header is incorrect), +since the hardware LRO aggregation is terminated upon receiving a packet +with bad checksum. + +Fixes: e586b3b0baee ("net/mlx5: Ethernet Datapath files") +Signed-off-by: Gal Pressman +Signed-off-by: Saeed Mahameed +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 47 +++++++++++++++++------- + 1 file changed, 34 insertions(+), 13 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +@@ -36,6 +36,7 @@ + #include + #include + #include ++#include + #include "en.h" + #include "en_tc.h" + #include "eswitch.h" +@@ -547,20 +548,33 @@ bool mlx5e_post_rx_mpwqes(struct mlx5e_r + return true; + } + ++static void mlx5e_lro_update_tcp_hdr(struct mlx5_cqe64 *cqe, struct tcphdr *tcp) ++{ ++ u8 l4_hdr_type = get_cqe_l4_hdr_type(cqe); ++ u8 tcp_ack = (l4_hdr_type == CQE_L4_HDR_TYPE_TCP_ACK_NO_DATA) || ++ (l4_hdr_type == CQE_L4_HDR_TYPE_TCP_ACK_AND_DATA); ++ ++ tcp->check = 0; ++ tcp->psh = get_cqe_lro_tcppsh(cqe); ++ ++ if (tcp_ack) { ++ tcp->ack = 1; ++ tcp->ack_seq = cqe->lro_ack_seq_num; ++ tcp->window = cqe->lro_tcp_win; ++ } ++} ++ + static void mlx5e_lro_update_hdr(struct sk_buff *skb, struct mlx5_cqe64 *cqe, + u32 cqe_bcnt) + { + struct ethhdr *eth = (struct ethhdr *)(skb->data); + struct tcphdr *tcp; + int network_depth = 0; ++ __wsum check; + __be16 proto; + u16 tot_len; + void *ip_p; + +- u8 l4_hdr_type = get_cqe_l4_hdr_type(cqe); +- u8 tcp_ack = (l4_hdr_type == CQE_L4_HDR_TYPE_TCP_ACK_NO_DATA) || +- (l4_hdr_type == CQE_L4_HDR_TYPE_TCP_ACK_AND_DATA); +- + proto = __vlan_get_protocol(skb, eth->h_proto, &network_depth); + + tot_len = cqe_bcnt - network_depth; +@@ -577,23 +591,30 @@ static void mlx5e_lro_update_hdr(struct + ipv4->check = 0; + ipv4->check = ip_fast_csum((unsigned char *)ipv4, + ipv4->ihl); ++ ++ mlx5e_lro_update_tcp_hdr(cqe, tcp); ++ check = csum_partial(tcp, tcp->doff * 4, ++ csum_unfold((__force __sum16)cqe->check_sum)); ++ /* Almost done, don't forget the pseudo header */ ++ tcp->check = csum_tcpudp_magic(ipv4->saddr, ipv4->daddr, ++ tot_len - sizeof(struct iphdr), ++ IPPROTO_TCP, check); + } else { ++ u16 payload_len = tot_len - sizeof(struct ipv6hdr); + struct ipv6hdr *ipv6 = ip_p; + + tcp = ip_p + sizeof(struct ipv6hdr); + skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; + + ipv6->hop_limit = cqe->lro_min_ttl; +- ipv6->payload_len = cpu_to_be16(tot_len - +- sizeof(struct ipv6hdr)); +- } ++ ipv6->payload_len = cpu_to_be16(payload_len); + +- tcp->psh = get_cqe_lro_tcppsh(cqe); +- +- if (tcp_ack) { +- tcp->ack = 1; +- tcp->ack_seq = cqe->lro_ack_seq_num; +- tcp->window = cqe->lro_tcp_win; ++ mlx5e_lro_update_tcp_hdr(cqe, tcp); ++ check = csum_partial(tcp, tcp->doff * 4, ++ csum_unfold((__force __sum16)cqe->check_sum)); ++ /* Almost done, don't forget the pseudo header */ ++ tcp->check = csum_ipv6_magic(&ipv6->saddr, &ipv6->daddr, payload_len, ++ IPPROTO_TCP, check); + } + } + diff --git 
a/queue-4.15/net-mlx5e-specify-numa-node-when-allocating-drop-rq.patch b/queue-4.15/net-mlx5e-specify-numa-node-when-allocating-drop-rq.patch new file mode 100644 index 00000000000..cc182ebec7a --- /dev/null +++ b/queue-4.15/net-mlx5e-specify-numa-node-when-allocating-drop-rq.patch @@ -0,0 +1,65 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Gal Pressman +Date: Thu, 25 Jan 2018 18:00:41 +0200 +Subject: net/mlx5e: Specify numa node when allocating drop rq + +From: Gal Pressman + + +[ Upstream commit 2f0db87901698cd73d828cc6fb1957b8916fc911 ] + +When allocating a drop rq, no numa node is explicitly set, which means +allocations are done on node zero. This is not necessarily the nearest +numa node to the HCA, and worse, might even be a memoryless numa +node. + +Choose the numa_node given to us by the pci device in order to properly +allocate the coherent dma memory instead of assuming zero is valid. + +Fixes: 556dd1b9c313 ("net/mlx5e: Set drop RQ's necessary parameters only") +Signed-off-by: Gal Pressman +Signed-off-by: Saeed Mahameed +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +@@ -1911,13 +1911,16 @@ static void mlx5e_build_rq_param(struct + param->wq.linear = 1; + } + +-static void mlx5e_build_drop_rq_param(struct mlx5e_rq_param *param) ++static void mlx5e_build_drop_rq_param(struct mlx5_core_dev *mdev, ++ struct mlx5e_rq_param *param) + { + void *rqc = param->rqc; + void *wq = MLX5_ADDR_OF(rqc, rqc, wq); + + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST); + MLX5_SET(wq, wq, log_wq_stride, ilog2(sizeof(struct mlx5e_rx_wqe))); ++ ++ param->wq.buf_numa_node = dev_to_node(&mdev->pdev->dev); + } + + static void mlx5e_build_sq_param_common(struct mlx5e_priv *priv, +@@ -2774,6 +2777,9 @@ static int mlx5e_alloc_drop_cq(struct ml + struct mlx5e_cq *cq, + struct mlx5e_cq_param *param) + { ++ param->wq.buf_numa_node = dev_to_node(&mdev->pdev->dev); ++ param->wq.db_numa_node = dev_to_node(&mdev->pdev->dev); ++ + return mlx5e_alloc_cq_common(mdev, param, cq); + } + +@@ -2785,7 +2791,7 @@ static int mlx5e_open_drop_rq(struct mlx + struct mlx5e_cq *cq = &drop_rq->cq; + int err; + +- mlx5e_build_drop_rq_param(&rq_param); ++ mlx5e_build_drop_rq_param(mdev, &rq_param); + + err = mlx5e_alloc_drop_cq(mdev, cq, &cq_param); + if (err) diff --git a/queue-4.15/net-mlx5e-verify-inline-header-size-do-not-exceed-skb-linear-size.patch b/queue-4.15/net-mlx5e-verify-inline-header-size-do-not-exceed-skb-linear-size.patch new file mode 100644 index 00000000000..10fda683be3 --- /dev/null +++ b/queue-4.15/net-mlx5e-verify-inline-header-size-do-not-exceed-skb-linear-size.patch @@ -0,0 +1,40 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Eran Ben Elisha +Date: Thu, 25 Jan 2018 11:18:09 +0200 +Subject: net/mlx5e: Verify inline header size do not exceed SKB linear size + +From: Eran Ben Elisha + + +[ Upstream commit f600c6088018d1dbc5777d18daa83660f7ea4a64 ] + +The driver tries to copy at least MLX5E_MIN_INLINE bytes into the control +segment of the WQE. It assumes that the linear part contains at least +MLX5E_MIN_INLINE bytes, which can be wrong. + +The cited commit verified that the driver will not copy more bytes into the +inline header part than the actual size of the packet. Re-factor this +check to make sure we do not exceed the linear part as well.
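+ +skb_headlen() is skb->len minus skb->data_len, i.e. the bytes that +actually reside in the linear buffer, so the clamp below (sketch) is +what keeps the inline copy inside it: + + /* hlen bytes are copied from skb->data into the WQE, so bound + * the copy by the linear part, not by the full packet length. + */ + return min_t(u16, hlen, skb_headlen(skb));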
+ +This fix is aligned with the current driver's assumption that the entire +L2 will be present in the linear part of the SKB. + +Fixes: 6aace17e64f4 ("net/mlx5e: Fix inline header size for small packets") +Signed-off-by: Eran Ben Elisha +Signed-off-by: Saeed Mahameed +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +@@ -176,7 +176,7 @@ static inline u16 mlx5e_calc_min_inline( + default: + hlen = mlx5e_skb_l2_header_offset(skb); + } +- return min_t(u16, hlen, skb->len); ++ return min_t(u16, hlen, skb_headlen(skb)); + } + + static inline void mlx5e_tx_skb_pull_inline(unsigned char **skb_data, diff --git a/queue-4.15/net-phy-fix-phy_start-to-consider-phy_ignore_interrupt.patch b/queue-4.15/net-phy-fix-phy_start-to-consider-phy_ignore_interrupt.patch new file mode 100644 index 00000000000..e0463b32943 --- /dev/null +++ b/queue-4.15/net-phy-fix-phy_start-to-consider-phy_ignore_interrupt.patch @@ -0,0 +1,34 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Heiner Kallweit +Date: Thu, 8 Feb 2018 21:01:48 +0100 +Subject: net: phy: fix phy_start to consider PHY_IGNORE_INTERRUPT + +From: Heiner Kallweit + + +[ Upstream commit 08f5138512180a479ce6b9d23b825c9f4cd3be77 ] + +This condition wasn't adjusted when PHY_IGNORE_INTERRUPT (-2) was added +long ago. In case of PHY_IGNORE_INTERRUPT the MAC interrupt also +indicates PHY state changes and we should do what the symbol says. + +Fixes: 84a527a41f38 ("net: phylib: fix interrupts re-enablement in phy_start") +Signed-off-by: Heiner Kallweit +Reviewed-by: Florian Fainelli +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/phy/phy.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/phy/phy.c ++++ b/drivers/net/phy/phy.c +@@ -844,7 +844,7 @@ void phy_start(struct phy_device *phydev + phy_resume(phydev); + + /* make sure interrupts are re-enabled for the PHY */ +- if (phydev->irq != PHY_POLL) { ++ if (phy_interrupt_is_valid(phydev)) { + err = phy_enable_interrupts(phydev); + if (err < 0) + break; diff --git a/queue-4.15/net-phy-restore-phy_resume-locking-assumption.patch b/queue-4.15/net-phy-restore-phy_resume-locking-assumption.patch new file mode 100644 index 00000000000..2cdaf5e8ee5 --- /dev/null +++ b/queue-4.15/net-phy-restore-phy_resume-locking-assumption.patch @@ -0,0 +1,101 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Andrew Lunn +Date: Tue, 27 Feb 2018 01:56:06 +0100 +Subject: net: phy: Restore phy_resume() locking assumption + +From: Andrew Lunn + + +[ Upstream commit 9c2c2e62df3fa30fb13fbeb7512a4eede729383b ] + +commit f5e64032a799 ("net: phy: fix resume handling") changes the +locking semantics for phy_resume() such that the caller now needs to +hold the phy mutex. Not all call sites were adapted to this new +semantic, resulting in warnings from the added +WARN_ON(!mutex_is_locked(&phydev->lock)). Rather than change the +semantics, add a __phy_resume() and restore the old behavior of +phy_resume(). + +Reported-by: Heiner Kallweit +Fixes: f5e64032a799 ("net: phy: fix resume handling") +Signed-off-by: Andrew Lunn +Reviewed-by: Florian Fainelli +Signed-off-by: David S.
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/phy/phy.c | 2 +- + drivers/net/phy/phy_device.c | 18 +++++++++++++----- + include/linux/phy.h | 1 + + 3 files changed, 15 insertions(+), 6 deletions(-) + +--- a/drivers/net/phy/phy.c ++++ b/drivers/net/phy/phy.c +@@ -841,7 +841,7 @@ void phy_start(struct phy_device *phydev + break; + case PHY_HALTED: + /* if phy was suspended, bring the physical link up again */ +- phy_resume(phydev); ++ __phy_resume(phydev); + + /* make sure interrupts are re-enabled for the PHY */ + if (phy_interrupt_is_valid(phydev)) { +--- a/drivers/net/phy/phy_device.c ++++ b/drivers/net/phy/phy_device.c +@@ -135,9 +135,7 @@ static int mdio_bus_phy_resume(struct de + if (!mdio_bus_phy_may_suspend(phydev)) + goto no_resume; + +- mutex_lock(&phydev->lock); + ret = phy_resume(phydev); +- mutex_unlock(&phydev->lock); + if (ret < 0) + return ret; + +@@ -1028,9 +1026,7 @@ int phy_attach_direct(struct net_device + if (err) + goto error; + +- mutex_lock(&phydev->lock); + phy_resume(phydev); +- mutex_unlock(&phydev->lock); + phy_led_triggers_register(phydev); + + return err; +@@ -1156,7 +1152,7 @@ int phy_suspend(struct phy_device *phyde + } + EXPORT_SYMBOL(phy_suspend); + +-int phy_resume(struct phy_device *phydev) ++int __phy_resume(struct phy_device *phydev) + { + struct phy_driver *phydrv = to_phy_driver(phydev->mdio.dev.driver); + int ret = 0; +@@ -1173,6 +1169,18 @@ int phy_resume(struct phy_device *phydev + + return ret; + } ++EXPORT_SYMBOL(__phy_resume); ++ ++int phy_resume(struct phy_device *phydev) ++{ ++ int ret; ++ ++ mutex_lock(&phydev->lock); ++ ret = __phy_resume(phydev); ++ mutex_unlock(&phydev->lock); ++ ++ return ret; ++} + EXPORT_SYMBOL(phy_resume); + + int phy_loopback(struct phy_device *phydev, bool enable) +--- a/include/linux/phy.h ++++ b/include/linux/phy.h +@@ -819,6 +819,7 @@ void phy_device_remove(struct phy_device + int phy_init_hw(struct phy_device *phydev); + int phy_suspend(struct phy_device *phydev); + int phy_resume(struct phy_device *phydev); ++int __phy_resume(struct phy_device *phydev); + int phy_loopback(struct phy_device *phydev, bool enable); + struct phy_device *phy_attach(struct net_device *dev, const char *bus_id, + phy_interface_t interface); diff --git a/queue-4.15/net-sched-cls_u32-fix-cls_u32-on-filter-replace.patch b/queue-4.15/net-sched-cls_u32-fix-cls_u32-on-filter-replace.patch new file mode 100644 index 00000000000..d3cf5681391 --- /dev/null +++ b/queue-4.15/net-sched-cls_u32-fix-cls_u32-on-filter-replace.patch @@ -0,0 +1,50 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Ivan Vecera +Date: Thu, 8 Feb 2018 16:10:39 +0100 +Subject: net/sched: cls_u32: fix cls_u32 on filter replace + +From: Ivan Vecera + + +[ Upstream commit eb53f7af6f15285e2f6ada97285395343ce9f433 ] + +The following sequence is currently broken: + + # tc qdisc add dev foo ingress + # tc filter replace dev foo protocol all ingress \ + u32 match u8 0 0 action mirred egress mirror dev bar1 + # tc filter replace dev foo protocol all ingress \ + handle 800::800 pref 49152 \ + u32 match u8 0 0 action mirred egress mirror dev bar2 + Error: cls_u32: Key node flags do not match passed flags. + We have an error talking to the kernel, -1 + +The error comes from u32_change() when comparing new and +existing flags. The existing ones always contain one of the +TCA_CLS_FLAGS_{,NOT}_IN_HW flags, depending on offloading state. +These flags cannot be passed from userspace, so the condition +(n->flags != flags) in u32_change() always fails.
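+ +The fix below uses the usual XOR-and-mask idiom for "equal except for +some bits": n->flags ^ flags sets exactly the bits on which the two +words differ, and masking with the complement of the offload status +bits discards the differences we tolerate. A worked example with +made-up values: 0x9 vs 0x1 while ignoring 0x8 gives +(0x9 ^ 0x1) & ~0x8 == 0, i.e. a match: + + /* do the flag words differ outside the offload status bits? */ + if ((n->flags ^ flags) & + ~(TCA_CLS_FLAGS_IN_HW | TCA_CLS_FLAGS_NOT_IN_HW)) + return -EINVAL;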
+ +Fix the condition so the flags TCA_CLS_FLAGS_NOT_IN_HW and +TCA_CLS_FLAGS_IN_HW are not taken into account. + +Fixes: 24d3dc6d27ea ("net/sched: cls_u32: Reflect HW offload status") +Signed-off-by: Ivan Vecera +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/sched/cls_u32.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/net/sched/cls_u32.c ++++ b/net/sched/cls_u32.c +@@ -928,7 +928,8 @@ static int u32_change(struct net *net, s + if (TC_U32_KEY(n->handle) == 0) + return -EINVAL; + +- if (n->flags != flags) ++ if ((n->flags ^ flags) & ++ ~(TCA_CLS_FLAGS_IN_HW | TCA_CLS_FLAGS_NOT_IN_HW)) + return -EINVAL; + + new = u32_init_knode(tp, n); diff --git a/queue-4.15/net-sched-report-if-filter-is-too-large-to-dump.patch b/queue-4.15/net-sched-report-if-filter-is-too-large-to-dump.patch new file mode 100644 index 00000000000..28cec557299 --- /dev/null +++ b/queue-4.15/net-sched-report-if-filter-is-too-large-to-dump.patch @@ -0,0 +1,54 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Roman Kapl +Date: Mon, 19 Feb 2018 21:32:51 +0100 +Subject: net: sched: report if filter is too large to dump + +From: Roman Kapl + + +[ Upstream commit 5ae437ad5a2ed573b1ebb04e0afa70b8869f88dd ] + +So far, if the filter was too large to fit in the allocated skb, the +kernel did not return any error and stopped dumping. Modify the dumper +so that it returns -EMSGSIZE when a filter fails to dump and it is the +first filter in the skb. If we are not first, we will get another chance +with more room. + +I understand this is pretty near to being an API change, but the +original design (silent truncation) can be considered a bug. + +Note: The error case can happen pretty easily if you create a filter +with 32 actions and have 4kb pages. Also recent versions of iproute try +to be clever with their buffer allocation size, which in turn leads to +this size limit being hit more easily. + +Signed-off-by: Roman Kapl +Acked-by: Jiri Pirko +Acked-by: Cong Wang +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/sched/cls_api.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +--- a/net/sched/cls_api.c ++++ b/net/sched/cls_api.c +@@ -1054,13 +1054,18 @@ static int tc_dump_tfilter(struct sk_buf + nla_get_u32(tca[TCA_CHAIN]) != chain->index) + continue; + if (!tcf_chain_dump(chain, q, parent, skb, cb, +- index_start, &index)) ++ index_start, &index)) { ++ err = -EMSGSIZE; + break; ++ } + } + + cb->args[0] = index; + + out: ++ /* If we did no progress, the error (EMSGSIZE) is real */ ++ if (skb->len == 0 && err) ++ return err; + return skb->len; + } + diff --git a/queue-4.15/net_sched-gen_estimator-fix-broken-estimators-based-on-percpu-stats.patch b/queue-4.15/net_sched-gen_estimator-fix-broken-estimators-based-on-percpu-stats.patch new file mode 100644 index 00000000000..eae31bf5748 --- /dev/null +++ b/queue-4.15/net_sched-gen_estimator-fix-broken-estimators-based-on-percpu-stats.patch @@ -0,0 +1,37 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Eric Dumazet +Date: Thu, 22 Feb 2018 19:45:27 -0800 +Subject: net_sched: gen_estimator: fix broken estimators based on percpu stats + +From: Eric Dumazet + + +[ Upstream commit a5f7add332b4ea6d4b9480971b3b0f5e66466ae9 ] + +pfifo_fast got percpu stats lately, uncovering a bug I introduced last +year in linux-4.10. + +I missed the fact that we have to clear our temporary storage +before calling __gnet_stats_copy_basic() in the case of percpu stats.
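+ +The memset matters because, for percpu stats, the copy helper +accumulates each CPU's counters into the caller's buffer instead of +overwriting it, roughly (simplified sketch, not the exact helper): + + memset(b, 0, sizeof(*b)); /* summation must start from zero */ + for_each_possible_cpu(i) { + struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(e->cpu_bstats, i); + + b->bytes += bcpu->bstats.bytes; + b->packets += bcpu->bstats.packets; + }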
+ +Without this fix, rate estimators (tc qd replace dev xxx root est 1sec +4sec pfifo_fast) are utterly broken. + +Fixes: 1c0d32fde5bd ("net_sched: gen_estimator: complete rewrite of rate estimators") +Signed-off-by: Eric Dumazet +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/gen_estimator.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/net/core/gen_estimator.c ++++ b/net/core/gen_estimator.c +@@ -66,6 +66,7 @@ struct net_rate_estimator { + static void est_fetch_counters(struct net_rate_estimator *e, + struct gnet_stats_basic_packed *b) + { ++ memset(b, 0, sizeof(*b)); + if (e->stats_lock) + spin_lock(e->stats_lock); + diff --git a/queue-4.15/netlink-ensure-to-loop-over-all-netns-in-genlmsg_multicast_allns.patch b/queue-4.15/netlink-ensure-to-loop-over-all-netns-in-genlmsg_multicast_allns.patch new file mode 100644 index 00000000000..cb31fea07b7 --- /dev/null +++ b/queue-4.15/netlink-ensure-to-loop-over-all-netns-in-genlmsg_multicast_allns.patch @@ -0,0 +1,62 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Nicolas Dichtel +Date: Tue, 6 Feb 2018 14:48:32 +0100 +Subject: netlink: ensure to loop over all netns in genlmsg_multicast_allns() + +From: Nicolas Dichtel + + +[ Upstream commit cb9f7a9a5c96a773bbc9c70660dc600cfff82f82 ] + +Nowadays, nlmsg_multicast() returns only 0 or -ESRCH but this was not the +case when commit 134e63756d5f was pushed. +However, there was no reason to stop the loop if a netns does not have +listeners. +Return -ESRCH only if there were no listeners in any netns. + +To avoid having the same problem in the future, I didn't rely on the +assumption that nlmsg_multicast() returns only 0 or -ESRCH. + +Fixes: 134e63756d5f ("genetlink: make netns aware") +CC: Johannes Berg +Signed-off-by: Nicolas Dichtel +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/netlink/genetlink.c | 12 ++++++++++-- + 1 file changed, 10 insertions(+), 2 deletions(-) + +--- a/net/netlink/genetlink.c ++++ b/net/netlink/genetlink.c +@@ -1081,6 +1081,7 @@ static int genlmsg_mcast(struct sk_buff + { + struct sk_buff *tmp; + struct net *net, *prev = NULL; ++ bool delivered = false; + int err; + + for_each_net_rcu(net) { +@@ -1092,14 +1093,21 @@ static int genlmsg_mcast(struct sk_buff + } + err = nlmsg_multicast(prev->genl_sock, tmp, + portid, group, flags); +- if (err) ++ if (!err) ++ delivered = true; ++ else if (err != -ESRCH) + goto error; + } + + prev = net; + } + +- return nlmsg_multicast(prev->genl_sock, skb, portid, group, flags); ++ err = nlmsg_multicast(prev->genl_sock, skb, portid, group, flags); ++ if (!err) ++ delivered = true; ++ else if (err != -ESRCH) ++ goto error; ++ return delivered ? 0 : -ESRCH; + error: + kfree_skb(skb); + return err; diff --git a/queue-4.15/netlink-put-module-reference-if-dump-start-fails.patch b/queue-4.15/netlink-put-module-reference-if-dump-start-fails.patch new file mode 100644 index 00000000000..017a28ea560 --- /dev/null +++ b/queue-4.15/netlink-put-module-reference-if-dump-start-fails.patch @@ -0,0 +1,48 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: "Jason A. Donenfeld" +Date: Wed, 21 Feb 2018 04:41:59 +0100 +Subject: netlink: put module reference if dump start fails + +From: "Jason A. Donenfeld" + + +[ Upstream commit b87b6194be631c94785fe93398651e804ed43e28 ] + +Before, if cb->start() failed, the module reference would never be put, +because cb->cb_running is intentionally false at this point.
Users are +generally annoyed by this because they can no longer unload modules that +leak references. Also, it may be possible to tediously wrap a reference +counter back to zero, especially since module.c still uses atomic_inc +instead of refcount_inc. + +This patch expands the error path to simply call module_put if +cb->start() fails. + +Fixes: 41c87425a1ac ("netlink: do not set cb_running if dump's start() errs") +Signed-off-by: Jason A. Donenfeld +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/netlink/af_netlink.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/net/netlink/af_netlink.c ++++ b/net/netlink/af_netlink.c +@@ -2275,7 +2275,7 @@ int __netlink_dump_start(struct sock *ss + if (cb->start) { + ret = cb->start(cb); + if (ret) +- goto error_unlock; ++ goto error_put; + } + + nlk->cb_running = true; +@@ -2295,6 +2295,8 @@ int __netlink_dump_start(struct sock *ss + */ + return -EINTR; + ++error_put: ++ module_put(control->module); + error_unlock: + sock_put(sk); + mutex_unlock(nlk->cb_mutex); diff --git a/queue-4.15/ppp-prevent-unregistered-channels-from-connecting-to-ppp-units.patch b/queue-4.15/ppp-prevent-unregistered-channels-from-connecting-to-ppp-units.patch new file mode 100644 index 00000000000..b4bfb7eebe2 --- /dev/null +++ b/queue-4.15/ppp-prevent-unregistered-channels-from-connecting-to-ppp-units.patch @@ -0,0 +1,60 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Guillaume Nault +Date: Fri, 2 Mar 2018 18:41:16 +0100 +Subject: ppp: prevent unregistered channels from connecting to ppp units + +From: Guillaume Nault + + +[ Upstream commit 77f840e3e5f09c6d7d727e85e6e08276dd813d11 ] + +PPP units don't hold any reference on the channels connected to them. +It is the channel's responsibility to ensure that it disconnects from +its unit before being destroyed. +In practice, this is ensured by ppp_unregister_channel() disconnecting +the channel from the unit before dropping a reference on the channel. + +However, it is possible for an unregistered channel to connect to a PPP +unit: register a channel with ppp_register_net_channel(), attach a +/dev/ppp file to it with ioctl(PPPIOCATTCHAN), unregister the channel +with ppp_unregister_channel() and finally connect the /dev/ppp file to +a PPP unit with ioctl(PPPIOCCONNECT). + +Once in this situation, the channel is only held by the /dev/ppp file, +which can be released at any time, freeing the channel without letting +the parent PPP unit know. Then the ppp structure ends up with dangling +pointers in its ->channels list. + +Prevent this scenario by forbidding unregistered channels from +connecting to PPP units. This maintains the code logic by keeping +ppp_unregister_channel() responsible for disconnecting the channel if +necessary and avoids modifications to the reference counting mechanism. + +This issue seems to predate git history (successfully reproduced on +Linux 2.6.26 and earlier PPP commits are unrelated). + +Signed-off-by: Guillaume Nault +Signed-off-by: David S.
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ppp/ppp_generic.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +--- a/drivers/net/ppp/ppp_generic.c ++++ b/drivers/net/ppp/ppp_generic.c +@@ -3161,6 +3161,15 @@ ppp_connect_channel(struct channel *pch, + goto outl; + + ppp_lock(ppp); ++ spin_lock_bh(&pch->downl); ++ if (!pch->chan) { ++ /* Don't connect unregistered channels */ ++ spin_unlock_bh(&pch->downl); ++ ppp_unlock(ppp); ++ ret = -ENOTCONN; ++ goto outl; ++ } ++ spin_unlock_bh(&pch->downl); + if (pch->file.hdrlen > ppp->file.hdrlen) + ppp->file.hdrlen = pch->file.hdrlen; + hdrlen = pch->file.hdrlen + 2; /* for protocol bytes */ diff --git a/queue-4.15/revert-s390-qeth-fix-using-of-ref-counter-for-rxip-addresses.patch b/queue-4.15/revert-s390-qeth-fix-using-of-ref-counter-for-rxip-addresses.patch new file mode 100644 index 00000000000..07dcde7cc2b --- /dev/null +++ b/queue-4.15/revert-s390-qeth-fix-using-of-ref-counter-for-rxip-addresses.patch @@ -0,0 +1,51 @@ +From foo@baz Tue Mar 6 19:02:57 PST 2018 +From: Julian Wiedmann +Date: Tue, 27 Feb 2018 18:58:15 +0100 +Subject: Revert "s390/qeth: fix using of ref counter for rxip addresses" + +From: Julian Wiedmann + + +[ Upstream commit 4964c66fd49b2e2342da35358f2ff74614bcbaee ] + +This reverts commit cb816192d986f7596009dedcf2201fe2e5bc2aa7. + +The issue this attempted to fix never actually occurs. +l3_add_rxip() checks (via l3_ip_from_hash()) if the requested address +was previously added to the card. If so, it returns -EEXIST and doesn't +call l3_add_ip(). +As a result, the "address exists" path in l3_add_ip() is never taken +for rxip addresses, and this patch had no effect. + +Fixes: cb816192d986 ("s390/qeth: fix using of ref counter for rxip addresses") +Signed-off-by: Julian Wiedmann +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/s390/net/qeth_l3_main.c | 8 +++----- + 1 file changed, 3 insertions(+), 5 deletions(-) + +--- a/drivers/s390/net/qeth_l3_main.c ++++ b/drivers/s390/net/qeth_l3_main.c +@@ -250,8 +250,7 @@ int qeth_l3_delete_ip(struct qeth_card * + return -ENOENT; + + addr->ref_counter--; +- if (addr->ref_counter > 0 && (addr->type == QETH_IP_TYPE_NORMAL || +- addr->type == QETH_IP_TYPE_RXIP)) ++ if (addr->type == QETH_IP_TYPE_NORMAL && addr->ref_counter > 0) + return rc; + if (addr->in_progress) + return -EINPROGRESS; +@@ -329,9 +328,8 @@ int qeth_l3_add_ip(struct qeth_card *car + kfree(addr); + } + } else { +- if (addr->type == QETH_IP_TYPE_NORMAL || +- addr->type == QETH_IP_TYPE_RXIP) +- addr->ref_counter++; ++ if (addr->type == QETH_IP_TYPE_NORMAL) ++ addr->ref_counter++; + } + + return rc; diff --git a/queue-4.15/rxrpc-fix-send-in-rxrpc_send_data_packet.patch b/queue-4.15/rxrpc-fix-send-in-rxrpc_send_data_packet.patch new file mode 100644 index 00000000000..dc21bde6805 --- /dev/null +++ b/queue-4.15/rxrpc-fix-send-in-rxrpc_send_data_packet.patch @@ -0,0 +1,37 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: David Howells +Date: Thu, 22 Feb 2018 14:38:14 +0000 +Subject: rxrpc: Fix send in rxrpc_send_data_packet() + +From: David Howells + + +[ Upstream commit 93c62c45ed5fad1b87e3a45835b251cd68de9c46 ] + +All the kernel_sendmsg() calls in rxrpc_send_data_packet() need to send +both parts of the iov[] buffer, but one of them does not. Fix it so that +it does. + +Without this, short IPv6 rxrpc DATA packets may be seen that have the rxrpc +header included, but no payload. 
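+ +kernel_sendmsg() takes the kvec array, the number of entries in it and +the total byte count across all entries, so sending both fragments has +to look like this (the shape of the fix below): + + /* iov[0] = rxrpc header, iov[1] = payload; len covers both */ + ret = kernel_sendmsg(conn->params.local->socket, &msg, + iov, 2, len);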
+ +Fixes: 5a924b8951f8 ("rxrpc: Don't store the rxrpc header in the Tx queue sk_buffs") +Reported-by: Marc Dionne +Signed-off-by: David Howells +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/rxrpc/output.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/rxrpc/output.c ++++ b/net/rxrpc/output.c +@@ -445,7 +445,7 @@ send_fragmentable: + (char *)&opt, sizeof(opt)); + if (ret == 0) { + ret = kernel_sendmsg(conn->params.local->socket, &msg, +- iov, 1, iov[0].iov_len); ++ iov, 2, len); + + opt = IPV6_PMTUDISC_DO; + kernel_setsockopt(conn->params.local->socket, diff --git a/queue-4.15/s390-qeth-fix-double-free-on-ip-add-remove-race.patch b/queue-4.15/s390-qeth-fix-double-free-on-ip-add-remove-race.patch new file mode 100644 index 00000000000..facf4d9f588 --- /dev/null +++ b/queue-4.15/s390-qeth-fix-double-free-on-ip-add-remove-race.patch @@ -0,0 +1,47 @@ +From foo@baz Tue Mar 6 19:02:57 PST 2018 +From: Julian Wiedmann +Date: Tue, 27 Feb 2018 18:58:14 +0100 +Subject: s390/qeth: fix double-free on IP add/remove race + +From: Julian Wiedmann + + +[ Upstream commit 14d066c3531a87f727968cacd85bd95c75f59843 ] + +Registering an IPv4 address with the HW takes quite a while, so we +temporarily drop the ip_htable lock. Any concurrent add/remove of the +same IP adjusts the IP's use count, and (on remove) is then blocked by +addr->in_progress. +After the register call has completed, we check the use count for +concurrently attempted add/remove calls - and possibly straight-away +deregister the IP again. This happens via l3_delete_ip(), which +1) looks up the queried IP in the htable (getting a reference to the + *same* queried object), +2) deregisters the IP from the HW, and +3) frees the IP object. + +The caller in l3_add_ip() then does a second free on the same object. + +For this case, skip all the extra checks and lookups in l3_delete_ip() +and just deregister & free the IP object ourselves. + +Fixes: 5f78e29ceebf ("qeth: optimize IP handling in rx_mode callback") +Signed-off-by: Julian Wiedmann +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/s390/net/qeth_l3_main.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/drivers/s390/net/qeth_l3_main.c ++++ b/drivers/s390/net/qeth_l3_main.c +@@ -320,7 +320,8 @@ int qeth_l3_add_ip(struct qeth_card *car + (rc == IPA_RC_LAN_OFFLINE)) { + addr->disp_flag = QETH_DISP_ADDR_DO_NOTHING; + if (addr->ref_counter < 1) { +- qeth_l3_delete_ip(card, addr); ++ qeth_l3_deregister_addr_entry(card, addr); ++ hash_del(&addr->hnode); + kfree(addr); + } + } else { diff --git a/queue-4.15/s390-qeth-fix-ip-address-lookup-for-l3-devices.patch b/queue-4.15/s390-qeth-fix-ip-address-lookup-for-l3-devices.patch new file mode 100644 index 00000000000..8132c010110 --- /dev/null +++ b/queue-4.15/s390-qeth-fix-ip-address-lookup-for-l3-devices.patch @@ -0,0 +1,255 @@ +From foo@baz Tue Mar 6 19:02:57 PST 2018 +From: Julian Wiedmann +Date: Tue, 27 Feb 2018 18:58:16 +0100 +Subject: s390/qeth: fix IP address lookup for L3 devices + +From: Julian Wiedmann + + +[ Upstream commit c5c48c58b259bb8f0482398370ee539d7a12df3e ] + +Current code ("qeth_l3_ip_from_hash()") matches a queried address object +against objects in the IP table by IP address, Mask/Prefix Length and +MAC address ("qeth_l3_ipaddrs_is_equal()"). But what callers actually +require is either +a) "is this IP address registered" (ie. match by IP address only), +before adding a new address. +b) or "is this address object registered" (ie. 
match all relevant + attributes), before deleting an address. + +Right now +1. the ADD path is too strict in its lookup, and eg. doesn't detect +conflicts between an existing NORMAL address and a new VIPA address +(because the NORMAL address will have mask != 0, while VIPA has +a mask == 0), +2. the DELETE path is not strict enough, and eg. allows del_rxip() to +delete a VIPA address as long as the IP address matches. + +Fix all this by adding helpers (_addr_match_ip() and _addr_match_all()) +that do the appropriate checking. + +Note that the ADD path for NORMAL addresses is special, as qeth keeps +track of how many times such an address is in use (and there is no +immediate way of returning errors to the caller). So when a requested +NORMAL address _fully_ matches an existing one, it's not considered a +conflict and we merely increment the refcount. + +Fixes: 5f78e29ceebf ("qeth: optimize IP handling in rx_mode callback") +Signed-off-by: Julian Wiedmann +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/s390/net/qeth_l3.h | 34 ++++++++++++++ + drivers/s390/net/qeth_l3_main.c | 91 ++++++++++++++++++---------------------- + 2 files changed, 74 insertions(+), 51 deletions(-) + +--- a/drivers/s390/net/qeth_l3.h ++++ b/drivers/s390/net/qeth_l3.h +@@ -40,8 +40,40 @@ struct qeth_ipaddr { + unsigned int pfxlen; + } a6; + } u; +- + }; ++ ++static inline bool qeth_l3_addr_match_ip(struct qeth_ipaddr *a1, ++ struct qeth_ipaddr *a2) ++{ ++ if (a1->proto != a2->proto) ++ return false; ++ if (a1->proto == QETH_PROT_IPV6) ++ return ipv6_addr_equal(&a1->u.a6.addr, &a2->u.a6.addr); ++ return a1->u.a4.addr == a2->u.a4.addr; ++} ++ ++static inline bool qeth_l3_addr_match_all(struct qeth_ipaddr *a1, ++ struct qeth_ipaddr *a2) ++{ ++ /* Assumes that the pair was obtained via qeth_l3_addr_find_by_ip(), ++ * so 'proto' and 'addr' match for sure. ++ * ++ * For ucast: ++ * - 'mac' is always 0. ++ * - 'mask'/'pfxlen' for RXIP/VIPA is always 0. For NORMAL, matching ++ * values are required to avoid mixups in takeover eligibility. ++ * ++ * For mcast, ++ * - 'mac' is mapped from the IP, and thus always matches. ++ * - 'mask'/'pfxlen' is always 0. 
++ */ ++ if (a1->type != a2->type) ++ return false; ++ if (a1->proto == QETH_PROT_IPV6) ++ return a1->u.a6.pfxlen == a2->u.a6.pfxlen; ++ return a1->u.a4.mask == a2->u.a4.mask; ++} ++ + static inline u64 qeth_l3_ipaddr_hash(struct qeth_ipaddr *addr) + { + u64 ret = 0; +--- a/drivers/s390/net/qeth_l3_main.c ++++ b/drivers/s390/net/qeth_l3_main.c +@@ -150,6 +150,24 @@ int qeth_l3_string_to_ipaddr(const char + return -EINVAL; + } + ++static struct qeth_ipaddr *qeth_l3_find_addr_by_ip(struct qeth_card *card, ++ struct qeth_ipaddr *query) ++{ ++ u64 key = qeth_l3_ipaddr_hash(query); ++ struct qeth_ipaddr *addr; ++ ++ if (query->is_multicast) { ++ hash_for_each_possible(card->ip_mc_htable, addr, hnode, key) ++ if (qeth_l3_addr_match_ip(addr, query)) ++ return addr; ++ } else { ++ hash_for_each_possible(card->ip_htable, addr, hnode, key) ++ if (qeth_l3_addr_match_ip(addr, query)) ++ return addr; ++ } ++ return NULL; ++} ++ + static void qeth_l3_convert_addr_to_bits(u8 *addr, u8 *bits, int len) + { + int i, j; +@@ -203,34 +221,6 @@ static bool qeth_l3_is_addr_covered_by_i + return rc; + } + +-inline int +-qeth_l3_ipaddrs_is_equal(struct qeth_ipaddr *addr1, struct qeth_ipaddr *addr2) +-{ +- return addr1->proto == addr2->proto && +- !memcmp(&addr1->u, &addr2->u, sizeof(addr1->u)) && +- !memcmp(&addr1->mac, &addr2->mac, sizeof(addr1->mac)); +-} +- +-static struct qeth_ipaddr * +-qeth_l3_ip_from_hash(struct qeth_card *card, struct qeth_ipaddr *tmp_addr) +-{ +- struct qeth_ipaddr *addr; +- +- if (tmp_addr->is_multicast) { +- hash_for_each_possible(card->ip_mc_htable, addr, +- hnode, qeth_l3_ipaddr_hash(tmp_addr)) +- if (qeth_l3_ipaddrs_is_equal(tmp_addr, addr)) +- return addr; +- } else { +- hash_for_each_possible(card->ip_htable, addr, +- hnode, qeth_l3_ipaddr_hash(tmp_addr)) +- if (qeth_l3_ipaddrs_is_equal(tmp_addr, addr)) +- return addr; +- } +- +- return NULL; +-} +- + int qeth_l3_delete_ip(struct qeth_card *card, struct qeth_ipaddr *tmp_addr) + { + int rc = 0; +@@ -245,8 +235,8 @@ int qeth_l3_delete_ip(struct qeth_card * + QETH_CARD_HEX(card, 4, ((char *)&tmp_addr->u.a6.addr) + 8, 8); + } + +- addr = qeth_l3_ip_from_hash(card, tmp_addr); +- if (!addr) ++ addr = qeth_l3_find_addr_by_ip(card, tmp_addr); ++ if (!addr || !qeth_l3_addr_match_all(addr, tmp_addr)) + return -ENOENT; + + addr->ref_counter--; +@@ -268,6 +258,7 @@ int qeth_l3_add_ip(struct qeth_card *car + { + int rc = 0; + struct qeth_ipaddr *addr; ++ char buf[40]; + + QETH_CARD_TEXT(card, 4, "addip"); + +@@ -278,8 +269,20 @@ int qeth_l3_add_ip(struct qeth_card *car + QETH_CARD_HEX(card, 4, ((char *)&tmp_addr->u.a6.addr) + 8, 8); + } + +- addr = qeth_l3_ip_from_hash(card, tmp_addr); +- if (!addr) { ++ addr = qeth_l3_find_addr_by_ip(card, tmp_addr); ++ if (addr) { ++ if (tmp_addr->type != QETH_IP_TYPE_NORMAL) ++ return -EADDRINUSE; ++ if (qeth_l3_addr_match_all(addr, tmp_addr)) { ++ addr->ref_counter++; ++ return 0; ++ } ++ qeth_l3_ipaddr_to_string(tmp_addr->proto, (u8 *)&tmp_addr->u, ++ buf); ++ dev_warn(&card->gdev->dev, ++ "Registering IP address %s failed\n", buf); ++ return -EADDRINUSE; ++ } else { + addr = qeth_l3_get_addr_buffer(tmp_addr->proto); + if (!addr) + return -ENOMEM; +@@ -327,11 +330,7 @@ int qeth_l3_add_ip(struct qeth_card *car + hash_del(&addr->hnode); + kfree(addr); + } +- } else { +- if (addr->type == QETH_IP_TYPE_NORMAL) +- addr->ref_counter++; + } +- + return rc; + } + +@@ -715,12 +714,7 @@ int qeth_l3_add_vipa(struct qeth_card *c + return -ENOMEM; + + spin_lock_bh(&card->ip_lock); +- +- if (qeth_l3_ip_from_hash(card, 
ipaddr))
+- rc = -EEXIST;
+- else
+- qeth_l3_add_ip(card, ipaddr);
+-
++ rc = qeth_l3_add_ip(card, ipaddr);
+ spin_unlock_bh(&card->ip_lock);
+
+ kfree(ipaddr);
+@@ -783,12 +777,7 @@ int qeth_l3_add_rxip(struct qeth_card *c
+ return -ENOMEM;
+
+ spin_lock_bh(&card->ip_lock);
+-
+- if (qeth_l3_ip_from_hash(card, ipaddr))
+- rc = -EEXIST;
+- else
+- qeth_l3_add_ip(card, ipaddr);
+-
++ rc = qeth_l3_add_ip(card, ipaddr);
+ spin_unlock_bh(&card->ip_lock);
+
+ kfree(ipaddr);
+@@ -1396,8 +1385,9 @@ qeth_l3_add_mc_to_hash(struct qeth_card
+ memcpy(tmp->mac, buf, sizeof(tmp->mac));
+ tmp->is_multicast = 1;
+
+- ipm = qeth_l3_ip_from_hash(card, tmp);
++ ipm = qeth_l3_find_addr_by_ip(card, tmp);
+ if (ipm) {
++ /* for mcast, by-IP match means full match */
+ ipm->disp_flag = QETH_DISP_ADDR_DO_NOTHING;
+ } else {
+ ipm = qeth_l3_get_addr_buffer(QETH_PROT_IPV4);
+@@ -1480,8 +1470,9 @@ qeth_l3_add_mc6_to_hash(struct qeth_card
+ sizeof(struct in6_addr));
+ tmp->is_multicast = 1;
+
+- ipm = qeth_l3_ip_from_hash(card, tmp);
++ ipm = qeth_l3_find_addr_by_ip(card, tmp);
+ if (ipm) {
++ /* for mcast, by-IP match means full match */
+ ipm->disp_flag = QETH_DISP_ADDR_DO_NOTHING;
+ continue;
+ }
diff --git a/queue-4.15/s390-qeth-fix-ip-removal-on-offline-cards.patch b/queue-4.15/s390-qeth-fix-ip-removal-on-offline-cards.patch
new file mode 100644
index 00000000000..3b526760949
--- /dev/null
+++ b/queue-4.15/s390-qeth-fix-ip-removal-on-offline-cards.patch
@@ -0,0 +1,58 @@
+From foo@baz Tue Mar 6 19:02:57 PST 2018
+From: Julian Wiedmann
+Date: Tue, 27 Feb 2018 18:58:13 +0100
+Subject: s390/qeth: fix IP removal on offline cards
+
+From: Julian Wiedmann
+
+
+[ Upstream commit 98d823ab1fbdcb13abc25b420f9bb71bade42056 ]
+
+If the HW is not reachable, then none of the IPs in qeth's internal
+table has been registered with the HW yet. So when deleting such an IP,
+there's no need to stage it for deregistration - just drop it from
+the table.
+
+This fixes the "add-delete-add" scenario on an offline card, where
+the second "add" merely increments the IP's use count. But as the IP is
+still set to DISP_ADDR_DELETE from the previous "delete" step,
+l3_recover_ip() won't register it with the HW when the card goes online.
+
+Fixes: 5f78e29ceebf ("qeth: optimize IP handling in rx_mode callback")
+Signed-off-by: Julian Wiedmann
+Signed-off-by: David S. Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/s390/net/qeth_l3_main.c | 14 +++-----------
+ 1 file changed, 3 insertions(+), 11 deletions(-)
+
+--- a/drivers/s390/net/qeth_l3_main.c
++++ b/drivers/s390/net/qeth_l3_main.c
+@@ -256,12 +256,8 @@ int qeth_l3_delete_ip(struct qeth_card *
+ if (addr->in_progress)
+ return -EINPROGRESS;
+
+- if (!qeth_card_hw_is_reachable(card)) {
+- addr->disp_flag = QETH_DISP_ADDR_DELETE;
+- return 0;
+- }
+-
+- rc = qeth_l3_deregister_addr_entry(card, addr);
++ if (qeth_card_hw_is_reachable(card))
++ rc = qeth_l3_deregister_addr_entry(card, addr);
+
+ hash_del(&addr->hnode);
+ kfree(addr);
+@@ -404,11 +400,7 @@ static void qeth_l3_recover_ip(struct qe
+ spin_lock_bh(&card->ip_lock);
+
+ hash_for_each_safe(card->ip_htable, i, tmp, addr, hnode) {
+- if (addr->disp_flag == QETH_DISP_ADDR_DELETE) {
+- qeth_l3_deregister_addr_entry(card, addr);
+- hash_del(&addr->hnode);
+- kfree(addr);
+- } else if (addr->disp_flag == QETH_DISP_ADDR_ADD) {
++ if (addr->disp_flag == QETH_DISP_ADDR_ADD) {
+ if (addr->proto == QETH_PROT_IPV4) {
+ addr->in_progress = 1;
+ spin_unlock_bh(&card->ip_lock);
diff --git a/queue-4.15/s390-qeth-fix-ipa-command-submission-race.patch b/queue-4.15/s390-qeth-fix-ipa-command-submission-race.patch
new file mode 100644
index 00000000000..31adc905780
--- /dev/null
+++ b/queue-4.15/s390-qeth-fix-ipa-command-submission-race.patch
@@ -0,0 +1,83 @@
+From foo@baz Tue Mar 6 19:02:57 PST 2018
+From: Julian Wiedmann
+Date: Tue, 27 Feb 2018 18:58:17 +0100
+Subject: s390/qeth: fix IPA command submission race
+
+From: Julian Wiedmann
+
+
+[ Upstream commit d22ffb5a712f9211ffd104c38fc17cbfb1b5e2b0 ]
+
+If multiple IPA commands are built & sent out concurrently,
+fill_ipacmd_header() may assign a seqno value to a command that's
+different from what send_control_data() later assigns to this command's
+reply.
+This is due to other commands passing through send_control_data(),
+and incrementing card->seqno.ipa along the way.
+
+So one IPA command has no reply that's waiting for its seqno, while some
+other IPA command has multiple reply objects waiting for it.
+Only one of those waiting replies wins, and the other(s) time out and
+trigger a recovery via send_ipa_cmd().
+
+Fix this by making sure that the same seqno value is assigned to
+a command and its reply object.
+Do so immediately before submitting the command & while holding the
+irq_pending "lock", to produce nicely ascending seqnos.
+
+As a side effect, *all* IPA commands now use a reply object that's
+waiting for its actual seqno. Previously, early IPA commands that were
+submitted while the card was still DOWN used the "catch-all" IDX seqno.
+
+Signed-off-by: Julian Wiedmann
+Signed-off-by: David S. Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/s390/net/qeth_core_main.c | 19 ++++++++++---------
+ 1 file changed, 10 insertions(+), 9 deletions(-)
+
+--- a/drivers/s390/net/qeth_core_main.c
++++ b/drivers/s390/net/qeth_core_main.c
+@@ -2071,24 +2071,25 @@ int qeth_send_control_data(struct qeth_c
+ }
+ reply->callback = reply_cb;
+ reply->param = reply_param;
+- if (card->state == CARD_STATE_DOWN)
+- reply->seqno = QETH_IDX_COMMAND_SEQNO;
+- else
+- reply->seqno = card->seqno.ipa++;
++
+ init_waitqueue_head(&reply->wait_q);
+- spin_lock_irqsave(&card->lock, flags);
+- list_add_tail(&reply->list, &card->cmd_waiter_list);
+- spin_unlock_irqrestore(&card->lock, flags);
+
+ while (atomic_cmpxchg(&card->write.irq_pending, 0, 1)) ;
+- qeth_prepare_control_data(card, len, iob);
+
+ if (IS_IPA(iob->data)) {
+ cmd = __ipa_cmd(iob);
++ cmd->hdr.seqno = card->seqno.ipa++;
++ reply->seqno = cmd->hdr.seqno;
+ event_timeout = QETH_IPA_TIMEOUT;
+ } else {
++ reply->seqno = QETH_IDX_COMMAND_SEQNO;
+ event_timeout = QETH_TIMEOUT;
+ }
++ qeth_prepare_control_data(card, len, iob);
++
++ spin_lock_irqsave(&card->lock, flags);
++ list_add_tail(&reply->list, &card->cmd_waiter_list);
++ spin_unlock_irqrestore(&card->lock, flags);
+
+ timeout = jiffies + event_timeout;
+
+@@ -2870,7 +2871,7 @@ static void qeth_fill_ipacmd_header(stru
+ memset(cmd, 0, sizeof(struct qeth_ipa_cmd));
+ cmd->hdr.command = command;
+ cmd->hdr.initiator = IPA_CMD_INITIATOR_HOST;
+- cmd->hdr.seqno = card->seqno.ipa;
++ /* cmd->hdr.seqno is set by qeth_send_control_data() */
+ cmd->hdr.adapter_type = qeth_get_ipa_adp_type(card->info.link_type);
+ cmd->hdr.rel_adapter_no = (__u8) card->info.portno;
+ if (card->options.layer2)
diff --git a/queue-4.15/s390-qeth-fix-overestimated-count-of-buffer-elements.patch b/queue-4.15/s390-qeth-fix-overestimated-count-of-buffer-elements.patch
new file mode 100644
index 00000000000..c8ad32642d9
--- /dev/null
+++ b/queue-4.15/s390-qeth-fix-overestimated-count-of-buffer-elements.patch
@@ -0,0 +1,74 @@
+From foo@baz Tue Mar 6 19:02:57 PST 2018
+From: Julian Wiedmann
+Date: Tue, 27 Feb 2018 18:58:12 +0100
+Subject: s390/qeth: fix overestimated count of buffer elements
+
+From: Julian Wiedmann
+
+
+[ Upstream commit 12472af89632beb1ed8dea29d4efe208ca05b06a ]
+
+qeth_get_elements_for_range() doesn't know how to handle a 0-length
+range (i.e. start == end), and returns 1 when it should return 0.
+Such ranges occur on TSO skbs, where the L2/L3/L4 headers (and thus all
+of the skb's linear data) are skipped when mapping the skb into regular
+buffer elements.
+
+This overestimation may cause several performance-related issues:
+1. sub-optimal IO buffer selection, where the next buffer gets selected
+   even though the skb would actually still fit into the current buffer.
+2. forced linearization, if the element count for a non-linear skb
+   exceeds QETH_MAX_BUFFER_ELEMENTS.
+
+Rather than modifying qeth_get_elements_for_range() and adding overhead
+to every caller, fix up those callers that are at risk of passing a
+0-length range.
+
+Fixes: 2863c61334aa ("qeth: refactor calculation of SBALE count")
+Signed-off-by: Julian Wiedmann
+Signed-off-by: David S.
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/s390/net/qeth_core_main.c | 10 ++++++---- + drivers/s390/net/qeth_l3_main.c | 11 ++++++----- + 2 files changed, 12 insertions(+), 9 deletions(-) + +--- a/drivers/s390/net/qeth_core_main.c ++++ b/drivers/s390/net/qeth_core_main.c +@@ -3835,10 +3835,12 @@ EXPORT_SYMBOL_GPL(qeth_get_elements_for_ + int qeth_get_elements_no(struct qeth_card *card, + struct sk_buff *skb, int extra_elems, int data_offset) + { +- int elements = qeth_get_elements_for_range( +- (addr_t)skb->data + data_offset, +- (addr_t)skb->data + skb_headlen(skb)) + +- qeth_get_elements_for_frags(skb); ++ addr_t end = (addr_t)skb->data + skb_headlen(skb); ++ int elements = qeth_get_elements_for_frags(skb); ++ addr_t start = (addr_t)skb->data + data_offset; ++ ++ if (start != end) ++ elements += qeth_get_elements_for_range(start, end); + + if ((elements + extra_elems) > QETH_MAX_BUFFER_ELEMENTS(card)) { + QETH_DBF_MESSAGE(2, "Invalid size of IP packet " +--- a/drivers/s390/net/qeth_l3_main.c ++++ b/drivers/s390/net/qeth_l3_main.c +@@ -2629,11 +2629,12 @@ static void qeth_tso_fill_header(struct + static int qeth_l3_get_elements_no_tso(struct qeth_card *card, + struct sk_buff *skb, int extra_elems) + { +- addr_t tcpdptr = (addr_t)tcp_hdr(skb) + tcp_hdrlen(skb); +- int elements = qeth_get_elements_for_range( +- tcpdptr, +- (addr_t)skb->data + skb_headlen(skb)) + +- qeth_get_elements_for_frags(skb); ++ addr_t start = (addr_t)tcp_hdr(skb) + tcp_hdrlen(skb); ++ addr_t end = (addr_t)skb->data + skb_headlen(skb); ++ int elements = qeth_get_elements_for_frags(skb); ++ ++ if (start != end) ++ elements += qeth_get_elements_for_range(start, end); + + if ((elements + extra_elems) > QETH_MAX_BUFFER_ELEMENTS(card)) { + QETH_DBF_MESSAGE(2, diff --git a/queue-4.15/s390-qeth-fix-setip-command-handling.patch b/queue-4.15/s390-qeth-fix-setip-command-handling.patch new file mode 100644 index 00000000000..54f19cde4d6 --- /dev/null +++ b/queue-4.15/s390-qeth-fix-setip-command-handling.patch @@ -0,0 +1,76 @@ +From foo@baz Tue Mar 6 19:02:57 PST 2018 +From: Julian Wiedmann +Date: Fri, 9 Feb 2018 11:03:50 +0100 +Subject: s390/qeth: fix SETIP command handling + +From: Julian Wiedmann + + +[ Upstream commit 1c5b2216fbb973a9410e0b06389740b5c1289171 ] + +send_control_data() applies some special handling to SETIP v4 IPA +commands. But current code parses *all* command types for the SETIP +command code. Limit the command code check to IPA commands. + +Fixes: 5b54e16f1a54 ("qeth: do not spin for SETIP ip assist command") +Signed-off-by: Julian Wiedmann +Signed-off-by: David S. 
Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/s390/net/qeth_core.h | 5 +++++
+ drivers/s390/net/qeth_core_main.c | 14 ++++++++------
+ 2 files changed, 13 insertions(+), 6 deletions(-)
+
+--- a/drivers/s390/net/qeth_core.h
++++ b/drivers/s390/net/qeth_core.h
+@@ -581,6 +581,11 @@ struct qeth_cmd_buffer {
+ void (*callback) (struct qeth_channel *, struct qeth_cmd_buffer *);
+ };
+
++static inline struct qeth_ipa_cmd *__ipa_cmd(struct qeth_cmd_buffer *iob)
++{
++ return (struct qeth_ipa_cmd *)(iob->data + IPA_PDU_HEADER_SIZE);
++}
++
+ /**
+ * definition of a qeth channel, used for read and write
+ */
+--- a/drivers/s390/net/qeth_core_main.c
++++ b/drivers/s390/net/qeth_core_main.c
+@@ -2057,7 +2057,7 @@ int qeth_send_control_data(struct qeth_c
+ unsigned long flags;
+ struct qeth_reply *reply = NULL;
+ unsigned long timeout, event_timeout;
+- struct qeth_ipa_cmd *cmd;
++ struct qeth_ipa_cmd *cmd = NULL;
+
+ QETH_CARD_TEXT(card, 2, "sendctl");
+
+@@ -2083,10 +2083,13 @@ int qeth_send_control_data(struct qeth_c
+ while (atomic_cmpxchg(&card->write.irq_pending, 0, 1)) ;
+ qeth_prepare_control_data(card, len, iob);
+
+- if (IS_IPA(iob->data))
++ if (IS_IPA(iob->data)) {
++ cmd = __ipa_cmd(iob);
+ event_timeout = QETH_IPA_TIMEOUT;
+- else
++ } else {
+ event_timeout = QETH_TIMEOUT;
++ }
++
+ timeout = jiffies + event_timeout;
+
+ QETH_CARD_TEXT(card, 6, "noirqpnd");
+@@ -2111,9 +2114,8 @@ int qeth_send_control_data(struct qeth_c
+
+ /* we have only one long running ipassist, since we can ensure
+ process context of this command we can sleep */
+- cmd = (struct qeth_ipa_cmd *)(iob->data+IPA_PDU_HEADER_SIZE);
+- if ((cmd->hdr.command == IPA_CMD_SETIP) &&
+- (cmd->hdr.prot_version == QETH_PROT_IPV4)) {
++ if (cmd && cmd->hdr.command == IPA_CMD_SETIP &&
++ cmd->hdr.prot_version == QETH_PROT_IPV4) {
+ if (!wait_event_timeout(reply->wait_q,
+ atomic_read(&reply->received), event_timeout))
+ goto time_err;
diff --git a/queue-4.15/s390-qeth-fix-underestimated-count-of-buffer-elements.patch b/queue-4.15/s390-qeth-fix-underestimated-count-of-buffer-elements.patch
new file mode 100644
index 00000000000..8058639329b
--- /dev/null
+++ b/queue-4.15/s390-qeth-fix-underestimated-count-of-buffer-elements.patch
@@ -0,0 +1,40 @@
+From foo@baz Tue Mar 6 19:02:57 PST 2018
+From: Ursula Braun
+Date: Fri, 9 Feb 2018 11:03:49 +0100
+Subject: s390/qeth: fix underestimated count of buffer elements
+
+From: Ursula Braun
+
+
+[ Upstream commit 89271c65edd599207dd982007900506283c90ae3 ]
+
+For a memory range/skb where the last byte falls onto a page boundary
+(i.e. 'end' is of the form xxx...xxx001), the PFN_UP() part of the
+calculation currently doesn't round up to the next PFN due to an
+off-by-one error.
+Thus qeth believes that the skb occupies one page less than it
+actually does, and may select an IO buffer that doesn't have enough
+spare buffer elements to fit all of the skb's data.
+HW detects this as a malformed buffer descriptor, and raises an
+exception which then triggers device recovery.
+
+Fixes: 2863c61334aa ("qeth: refactor calculation of SBALE count")
+Signed-off-by: Ursula Braun
+Signed-off-by: Julian Wiedmann
+Signed-off-by: David S. Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/s390/net/qeth_core.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/s390/net/qeth_core.h
++++ b/drivers/s390/net/qeth_core.h
+@@ -836,7 +836,7 @@ struct qeth_trap_id {
+ */
+ static inline int qeth_get_elements_for_range(addr_t start, addr_t end)
+ {
+- return PFN_UP(end - 1) - PFN_DOWN(start);
++ return PFN_UP(end) - PFN_DOWN(start);
+ }
+
+ static inline int qeth_get_micros(void)
diff --git a/queue-4.15/sctp-do-not-pr_err-for-the-duplicated-node-in-transport-rhlist.patch b/queue-4.15/sctp-do-not-pr_err-for-the-duplicated-node-in-transport-rhlist.patch
new file mode 100644
index 00000000000..e4016efa49f
--- /dev/null
+++ b/queue-4.15/sctp-do-not-pr_err-for-the-duplicated-node-in-transport-rhlist.patch
@@ -0,0 +1,49 @@
+From foo@baz Tue Mar 6 19:02:56 PST 2018
+From: Xin Long
+Date: Mon, 12 Feb 2018 18:29:06 +0800
+Subject: sctp: do not pr_err for the duplicated node in transport rhlist
+
+From: Xin Long
+
+
+[ Upstream commit 27af86bb038d9c8b8066cd17854ddaf2ea92bce1 ]
+
+The pr_err in sctp_hash_transport was supposed to report a sctp bug
+for using rhashtable/rhlist.
+
+The err '-EEXIST' introduced in Commit cd2b70875058 ("sctp: check
+duplicate node before inserting a new transport") doesn't belong
+to that case.
+
+So just return -EEXIST without logging any kernel message via pr_err.
+
+Fixes: cd2b70875058 ("sctp: check duplicate node before inserting a new transport")
+Reported-by: Wei Chen
+Signed-off-by: Xin Long
+Acked-by: Marcelo Ricardo Leitner
+Acked-by: Neil Horman
+Signed-off-by: David S. Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/sctp/input.c | 5 +----
+ 1 file changed, 1 insertion(+), 4 deletions(-)
+
+--- a/net/sctp/input.c
++++ b/net/sctp/input.c
+@@ -897,15 +897,12 @@ int sctp_hash_transport(struct sctp_tran
+ rhl_for_each_entry_rcu(transport, tmp, list, node)
+ if (transport->asoc->ep == t->asoc->ep) {
+ rcu_read_unlock();
+- err = -EEXIST;
+- goto out;
++ return -EEXIST;
+ }
+ rcu_read_unlock();
+
+ err = rhltable_insert_key(&sctp_transport_hashtable, &arg,
+ &t->node, sctp_hash_params);
+-
+-out:
+ if (err)
+ pr_err_once("insert transport fail, errno %d\n", err);
+
diff --git a/queue-4.15/sctp-fix-dst-refcnt-leak-in-sctp_v4_get_dst.patch b/queue-4.15/sctp-fix-dst-refcnt-leak-in-sctp_v4_get_dst.patch
new file mode 100644
index 00000000000..eb919c6dcc1
--- /dev/null
+++ b/queue-4.15/sctp-fix-dst-refcnt-leak-in-sctp_v4_get_dst.patch
@@ -0,0 +1,86 @@
+From foo@baz Tue Mar 6 19:02:56 PST 2018
+From: Tommi Rantala
+Date: Mon, 5 Feb 2018 21:48:14 +0200
+Subject: sctp: fix dst refcnt leak in sctp_v4_get_dst
+
+From: Tommi Rantala
+
+
+[ Upstream commit 4a31a6b19f9ddf498c81f5c9b089742b7472a6f8 ]
+
+Fix dst reference count leak in sctp_v4_get_dst() introduced in commit
+410f03831 ("sctp: add routing output fallback"):
+
+When walking the address_list, successive ip_route_output_key() calls
+may return the same rt->dst with the reference incremented on each call.
+
+The code would not decrement the dst refcount when the dst pointer was
+identical to the previous iteration's, causing the dst refcnt leak.
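+
+In pseudocode, the leaky pattern was roughly the following (an
+illustrative sketch of the loop, not the literal driver code):
+
+	for each laddr in address_list:
+		rt = ip_route_output_key(...);	/* takes a dst reference */
+		if (!dst)
+			dst = &rt->dst;		/* remember first candidate */
+		...
+		if (&rt->dst != dst)		/* false when the same dst is */
+			dst_release(&rt->dst);	/* returned again: ref leaked */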
+
+Testcase:
+ ip netns add TEST
+ ip netns exec TEST ip link set lo up
+ ip link add dummy0 type dummy
+ ip link add dummy1 type dummy
+ ip link add dummy2 type dummy
+ ip link set dev dummy0 netns TEST
+ ip link set dev dummy1 netns TEST
+ ip link set dev dummy2 netns TEST
+ ip netns exec TEST ip addr add 192.168.1.1/24 dev dummy0
+ ip netns exec TEST ip link set dummy0 up
+ ip netns exec TEST ip addr add 192.168.1.2/24 dev dummy1
+ ip netns exec TEST ip link set dummy1 up
+ ip netns exec TEST ip addr add 192.168.1.3/24 dev dummy2
+ ip netns exec TEST ip link set dummy2 up
+ ip netns exec TEST sctp_test -H 192.168.1.2 -P 20002 -h 192.168.1.1 -p 20000 -s -B 192.168.1.3
+ ip netns del TEST
+
+In 4.4 and 4.9 kernels this results in:
+ [ 354.179591] unregister_netdevice: waiting for lo to become free. Usage count = 1
+ [ 364.419674] unregister_netdevice: waiting for lo to become free. Usage count = 1
+ [ 374.663664] unregister_netdevice: waiting for lo to become free. Usage count = 1
+ [ 384.903717] unregister_netdevice: waiting for lo to become free. Usage count = 1
+ [ 395.143724] unregister_netdevice: waiting for lo to become free. Usage count = 1
+ [ 405.383645] unregister_netdevice: waiting for lo to become free. Usage count = 1
+ ...
+
+Fixes: 410f03831 ("sctp: add routing output fallback")
+Fixes: 0ca50d12f ("sctp: fix src address selection if using secondary addresses")
+Signed-off-by: Tommi Rantala
+Acked-by: Marcelo Ricardo Leitner
+Acked-by: Neil Horman
+Signed-off-by: David S. Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/sctp/protocol.c | 10 ++++------
+ 1 file changed, 4 insertions(+), 6 deletions(-)
+
+--- a/net/sctp/protocol.c
++++ b/net/sctp/protocol.c
+@@ -514,22 +514,20 @@ static void sctp_v4_get_dst(struct sctp_
+ if (IS_ERR(rt))
+ continue;
+
+- if (!dst)
+- dst = &rt->dst;
+-
+ /* Ensure the src address belongs to the output
+ * interface.
+ */
+ odev = __ip_dev_find(sock_net(sk), laddr->a.v4.sin_addr.s_addr,
+ false);
+ if (!odev || odev->ifindex != fl4->flowi4_oif) {
+- if (&rt->dst != dst)
++ if (!dst)
++ dst = &rt->dst;
++ else
+ dst_release(&rt->dst);
+ continue;
+ }
+
+- if (dst != &rt->dst)
+- dst_release(dst);
++ dst_release(dst);
+ dst = &rt->dst;
+ break;
+ }
diff --git a/queue-4.15/sctp-fix-dst-refcnt-leak-in-sctp_v6_get_dst.patch b/queue-4.15/sctp-fix-dst-refcnt-leak-in-sctp_v6_get_dst.patch
new file mode 100644
index 00000000000..c7aa19f8c02
--- /dev/null
+++ b/queue-4.15/sctp-fix-dst-refcnt-leak-in-sctp_v6_get_dst.patch
@@ -0,0 +1,57 @@
+From foo@baz Tue Mar 6 19:02:56 PST 2018
+From: Alexey Kodanev
+Date: Mon, 5 Feb 2018 15:10:35 +0300
+Subject: sctp: fix dst refcnt leak in sctp_v6_get_dst()
+
+From: Alexey Kodanev
+
+
+[ Upstream commit 957d761cf91cdbb175ad7d8f5472336a4d54dbf2 ]
+
+When going through the bind address list in sctp_v6_get_dst() and
+the previously found address is better ('matchlen > bmatchlen'),
+the code continues to the next iteration without releasing the
+currently held destination.
+
+Fix it by releasing 'bdst' before continuing to the next iteration, and
+instead of introducing one more '!IS_ERR(bdst)' check for dst_release(),
+move the already existing one right after ip6_dst_lookup_flow(), i.e. we
+shouldn't proceed further if we get an error for the route lookup.
+
+Fixes: dbc2b5e9a09e ("sctp: fix src address selection if using secondary addresses for ipv6")
+Signed-off-by: Alexey Kodanev
+Acked-by: Neil Horman
+Acked-by: Marcelo Ricardo Leitner
+Signed-off-by: David S. Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/sctp/ipv6.c | 10 +++++++---
+ 1 file changed, 7 insertions(+), 3 deletions(-)
+
+--- a/net/sctp/ipv6.c
++++ b/net/sctp/ipv6.c
+@@ -326,8 +326,10 @@ static void sctp_v6_get_dst(struct sctp_
+ final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
+ bdst = ip6_dst_lookup_flow(sk, fl6, final_p);
+
+- if (!IS_ERR(bdst) &&
+- ipv6_chk_addr(dev_net(bdst->dev),
++ if (IS_ERR(bdst))
++ continue;
++
++ if (ipv6_chk_addr(dev_net(bdst->dev),
+ &laddr->a.v6.sin6_addr, bdst->dev, 1)) {
+ if (!IS_ERR_OR_NULL(dst))
+ dst_release(dst);
+@@ -336,8 +338,10 @@ static void sctp_v6_get_dst(struct sctp_
+ }
+
+ bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a);
+- if (matchlen > bmatchlen)
++ if (matchlen > bmatchlen) {
++ dst_release(bdst);
+ continue;
++ }
+
+ if (!IS_ERR_OR_NULL(dst))
+ dst_release(dst);
diff --git a/queue-4.15/sctp-verify-size-of-a-new-chunk-in-_sctp_make_chunk.patch b/queue-4.15/sctp-verify-size-of-a-new-chunk-in-_sctp_make_chunk.patch
new file mode 100644
index 00000000000..29ac84cf833
--- /dev/null
+++ b/queue-4.15/sctp-verify-size-of-a-new-chunk-in-_sctp_make_chunk.patch
@@ -0,0 +1,86 @@
+From foo@baz Tue Mar 6 19:02:56 PST 2018
+From: Alexey Kodanev
+Date: Fri, 9 Feb 2018 17:35:23 +0300
+Subject: sctp: verify size of a new chunk in _sctp_make_chunk()
+
+From: Alexey Kodanev
+
+
+[ Upstream commit 07f2c7ab6f8d0a7e7c5764c4e6cc9c52951b9d9c ]
+
+When SCTP makes an INIT or INIT_ACK packet, the total chunk length
+can exceed SCTP_MAX_CHUNK_LEN, which leads to a kernel panic when
+transmitting these packets, e.g. the crash on sending INIT_ACK:
+
+[ 597.804948] skbuff: skb_over_panic: text:00000000ffae06e4 len:120168
+ put:120156 head:000000007aa47635 data:00000000d991c2de
+ tail:0x1d640 end:0xfec0 dev:<NULL>
+...
+[ 597.976970] ------------[ cut here ]------------
+[ 598.033408] kernel BUG at net/core/skbuff.c:104!
+[ 600.314841] Call Trace:
+[ 600.345829] <IRQ>
+[ 600.371639] ? sctp_packet_transmit+0x2095/0x26d0 [sctp]
+[ 600.436934] skb_put+0x16c/0x200
+[ 600.477295] sctp_packet_transmit+0x2095/0x26d0 [sctp]
+[ 600.540630] ? sctp_packet_config+0x890/0x890 [sctp]
+[ 600.601781] ? __sctp_packet_append_chunk+0x3b4/0xd00 [sctp]
+[ 600.671356] ? sctp_cmp_addr_exact+0x3f/0x90 [sctp]
+[ 600.731482] sctp_outq_flush+0x663/0x30d0 [sctp]
+[ 600.788565] ? sctp_make_init+0xbf0/0xbf0 [sctp]
+[ 600.845555] ? sctp_check_transmitted+0x18f0/0x18f0 [sctp]
+[ 600.912945] ? sctp_outq_tail+0x631/0x9d0 [sctp]
+[ 600.969936] sctp_cmd_interpreter.isra.22+0x3be1/0x5cb0 [sctp]
+[ 601.041593] ? sctp_sf_do_5_1B_init+0x85f/0xc30 [sctp]
+[ 601.104837] ? sctp_generate_t1_cookie_event+0x20/0x20 [sctp]
+[ 601.175436] ? sctp_eat_data+0x1710/0x1710 [sctp]
+[ 601.233575] sctp_do_sm+0x182/0x560 [sctp]
+[ 601.284328] ? sctp_has_association+0x70/0x70 [sctp]
+[ 601.345586] ? sctp_rcv+0xef4/0x32f0 [sctp]
+[ 601.397478] ? sctp6_rcv+0xa/0x20 [sctp]
+...
+
+Here the chunk size for the INIT_ACK packet becomes too big, mostly
+because of the state cookie (the INIT packet has a large size with
+many address parameters), plus additional server parameters.
+
+Later this chunk causes the panic in skb_put_data():
+
+  skb_packet_transmit()
+    sctp_packet_pack()
+      skb_put_data(nskb, chunk->skb->data, chunk->skb->len);
+
+'nskb' (head skb) was previously allocated with packet->size
+from u16 'chunk->chunk_hdr->length'.
+
+As suggested by Marcelo we should check the chunk's length in
+_sctp_make_chunk() before trying to allocate an skb for it and
+discard a chunk if its size is bigger than SCTP_MAX_CHUNK_LEN.
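+
+Assuming SCTP_PAD4() rounds a length up to the next 4-byte boundary
+(in the kernel it is defined along the lines of (((s) + 3) & ~3)),
+the added guard amounts to this sketch:
+
+  int chunklen = SCTP_PAD4(sizeof(struct sctp_chunkhdr) + paylen);
+
+  if (chunklen > SCTP_MAX_CHUNK_LEN)  /* cannot fit the u16 chunk length */
+          goto nodata;                /* discard instead of panicking */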
+ +Signed-off-by: Alexey Kodanev +Acked-by: Marcelo Ricardo Leitner +Acked-by: Neil Horman +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/sctp/sm_make_chunk.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +--- a/net/sctp/sm_make_chunk.c ++++ b/net/sctp/sm_make_chunk.c +@@ -1378,9 +1378,14 @@ static struct sctp_chunk *_sctp_make_chu + struct sctp_chunk *retval; + struct sk_buff *skb; + struct sock *sk; ++ int chunklen; ++ ++ chunklen = SCTP_PAD4(sizeof(*chunk_hdr) + paylen); ++ if (chunklen > SCTP_MAX_CHUNK_LEN) ++ goto nodata; + + /* No need to allocate LL here, as this is only a chunk. */ +- skb = alloc_skb(SCTP_PAD4(sizeof(*chunk_hdr) + paylen), gfp); ++ skb = alloc_skb(chunklen, gfp); + if (!skb) + goto nodata; + diff --git a/queue-4.15/series b/queue-4.15/series index 6f2f35f849a..d278d34b009 100644 --- a/queue-4.15/series +++ b/queue-4.15/series @@ -36,3 +36,62 @@ direct-io-fix-sleep-in-atomic-due-to-sync-aio.patch x86-xen-zero-msr_ia32_spec_ctrl-before-suspend.patch x86-platform-intel-mid-handle-intel-edison-reboot-correctly.patch x86-cpu_entry_area-sync-cpu_entry_area-to-initial_page_table.patch +bridge-check-brport-attr-show-in-brport_show.patch +fib_semantics-don-t-match-route-with-mismatching-tclassid.patch +hdlc_ppp-carrier-detect-ok-don-t-turn-off-negotiation.patch +ipv6-sit-work-around-bogus-gcc-8-wrestrict-warning.patch +net-amd-xgbe-fix-comparison-to-bitshift-when-dealing-with-a-mask.patch +net-ethernet-ti-cpsw-fix-net-watchdog-timeout.patch +net-fix-race-on-decreasing-number-of-tx-queues.patch +net-ipv4-don-t-allow-setting-net.ipv4.route.min_pmtu-below-68.patch +netlink-ensure-to-loop-over-all-netns-in-genlmsg_multicast_allns.patch +net-sched-report-if-filter-is-too-large-to-dump.patch +ppp-prevent-unregistered-channels-from-connecting-to-ppp-units.patch +sctp-verify-size-of-a-new-chunk-in-_sctp_make_chunk.patch +udplite-fix-partial-checksum-initialization.patch +net-mlx5e-fix-tcp-checksum-in-lro-buffers.patch +sctp-fix-dst-refcnt-leak-in-sctp_v4_get_dst.patch +mlxsw-spectrum_switchdev-check-success-of-fdb-add-operation.patch +net-mlx5e-specify-numa-node-when-allocating-drop-rq.patch +net-phy-fix-phy_start-to-consider-phy_ignore_interrupt.patch +tcp-honor-the-eor-bit-in-tcp_mtu_probe.patch +rxrpc-fix-send-in-rxrpc_send_data_packet.patch +tcp_bbr-better-deal-with-suboptimal-gso.patch +doc-change-the-min-default-value-of-tcp_wmem-tcp_rmem.patch +net-mlx5e-fix-loopback-self-test-when-gro-is-off.patch +net_sched-gen_estimator-fix-broken-estimators-based-on-percpu-stats.patch +net-sched-cls_u32-fix-cls_u32-on-filter-replace.patch +sctp-do-not-pr_err-for-the-duplicated-node-in-transport-rhlist.patch +mlxsw-spectrum_router-fix-error-path-in-mlxsw_sp_vr_create.patch +net-ipv4-set-addr_type-in-hash_keys-for-forwarded-case.patch +sctp-fix-dst-refcnt-leak-in-sctp_v6_get_dst.patch +bridge-fix-vlan-reference-count-problem.patch +net-mlx5e-verify-inline-header-size-do-not-exceed-skb-linear-size.patch +tls-use-correct-sk-sk_prot-for-ipv6.patch +amd-xgbe-restore-pci-interrupt-enablement-setting-on-resume.patch +cls_u32-fix-use-after-free-in-u32_destroy_key.patch +mlxsw-spectrum_router-do-not-unconditionally-clear-route-offload-indication.patch +netlink-put-module-reference-if-dump-start-fails.patch +tcp-purge-write-queue-upon-rst.patch +tuntap-correctly-add-the-missing-xdp-flush.patch +tuntap-disable-preemption-during-xdp-processing.patch +virtio-net-disable-napi-only-when-enabled-during-xdp-set.patch 
+cxgb4-fix-trailing-zero-in-cim-la-dump.patch +net-mlx5-fix-error-handling-when-adding-flow-rules.patch +net-phy-restore-phy_resume-locking-assumption.patch +tcp-tracepoint-only-call-trace_tcp_send_reset-with-full-socket.patch +l2tp-don-t-use-inet_shutdown-on-tunnel-destroy.patch +l2tp-don-t-use-inet_shutdown-on-ppp-session-destroy.patch +l2tp-fix-races-with-tunnel-socket-close.patch +l2tp-fix-race-in-pppol2tp_release-with-session-object-destroy.patch +l2tp-fix-tunnel-lookup-use-after-free-race.patch +s390-qeth-fix-underestimated-count-of-buffer-elements.patch +s390-qeth-fix-setip-command-handling.patch +s390-qeth-fix-overestimated-count-of-buffer-elements.patch +s390-qeth-fix-ip-removal-on-offline-cards.patch +s390-qeth-fix-double-free-on-ip-add-remove-race.patch +revert-s390-qeth-fix-using-of-ref-counter-for-rxip-addresses.patch +s390-qeth-fix-ip-address-lookup-for-l3-devices.patch +s390-qeth-fix-ipa-command-submission-race.patch +tcp-revert-f-rto-middle-box-workaround.patch +tcp-revert-f-rto-extension-to-detect-more-spurious-timeouts.patch diff --git a/queue-4.15/tcp-honor-the-eor-bit-in-tcp_mtu_probe.patch b/queue-4.15/tcp-honor-the-eor-bit-in-tcp_mtu_probe.patch new file mode 100644 index 00000000000..a8897a45fc1 --- /dev/null +++ b/queue-4.15/tcp-honor-the-eor-bit-in-tcp_mtu_probe.patch @@ -0,0 +1,69 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Ilya Lesokhin +Date: Mon, 12 Feb 2018 12:57:04 +0200 +Subject: tcp: Honor the eor bit in tcp_mtu_probe + +From: Ilya Lesokhin + + +[ Upstream commit 808cf9e38cd7923036a99f459ccc8cf2955e47af ] + +Avoid SKB coalescing if eor bit is set in one of the relevant +SKBs. + +Fixes: c134ecb87817 ("tcp: Make use of MSG_EOR in tcp_sendmsg") +Signed-off-by: Ilya Lesokhin +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_output.c | 25 +++++++++++++++++++++++++ + 1 file changed, 25 insertions(+) + +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -2026,6 +2026,24 @@ static inline void tcp_mtu_check_reprobe + } + } + ++static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len) ++{ ++ struct sk_buff *skb, *next; ++ ++ skb = tcp_send_head(sk); ++ tcp_for_write_queue_from_safe(skb, next, sk) { ++ if (len <= skb->len) ++ break; ++ ++ if (unlikely(TCP_SKB_CB(skb)->eor)) ++ return false; ++ ++ len -= skb->len; ++ } ++ ++ return true; ++} ++ + /* Create a new MTU probe if we are ready. + * MTU probe is regularly attempting to increase the path MTU by + * deliberately sending larger packets. This discovers routing +@@ -2098,6 +2116,9 @@ static int tcp_mtu_probe(struct sock *sk + return 0; + } + ++ if (!tcp_can_coalesce_send_queue_head(sk, probe_size)) ++ return -1; ++ + /* We're allowed to probe. Build it now. */ + nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false); + if (!nskb) +@@ -2133,6 +2154,10 @@ static int tcp_mtu_probe(struct sock *sk + /* We've eaten all the data from this skb. + * Throw it away. */ + TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; ++ /* If this is the last SKB we copy and eor is set ++ * we need to propagate it to the new skb. 
++ */
++ TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
+ tcp_unlink_write_queue(skb, sk);
+ sk_wmem_free_skb(sk, skb);
+ } else {
diff --git a/queue-4.15/tcp-purge-write-queue-upon-rst.patch b/queue-4.15/tcp-purge-write-queue-upon-rst.patch
new file mode 100644
index 00000000000..b925c8c91f8
--- /dev/null
+++ b/queue-4.15/tcp-purge-write-queue-upon-rst.patch
@@ -0,0 +1,44 @@
+From foo@baz Tue Mar 6 19:02:56 PST 2018
+From: Soheil Hassas Yeganeh
+Date: Tue, 27 Feb 2018 18:32:18 -0500
+Subject: tcp: purge write queue upon RST
+
+From: Soheil Hassas Yeganeh
+
+
+[ Upstream commit a27fd7a8ed3856faaf5a2ff1c8c5f00c0667aaa0 ]
+
+When the connection is reset, there is no point in
+keeping the packets on the write queue until the connection
+is closed.
+
+RFC 793 (page 70) and RFC 793-bis (page 64) both suggest
+purging the write queue upon RST:
+https://tools.ietf.org/html/draft-ietf-tcpm-rfc793bis-07
+
+Moreover, this is essential for a correct MSG_ZEROCOPY
+implementation, because userspace cannot call close(fd)
+before receiving zerocopy signals even when the connection
+is reset.
+
+Fixes: f214f915e7db ("tcp: enable MSG_ZEROCOPY")
+Signed-off-by: Soheil Hassas Yeganeh
+Reviewed-by: Eric Dumazet
+Signed-off-by: Yuchung Cheng
+Signed-off-by: Neal Cardwell
+Signed-off-by: David S. Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/ipv4/tcp_input.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -3988,6 +3988,7 @@ void tcp_reset(struct sock *sk)
+ /* This barrier is coupled with smp_rmb() in tcp_poll() */
+ smp_wmb();
+
++ tcp_write_queue_purge(sk);
+ tcp_done(sk);
+
+ if (!sock_flag(sk, SOCK_DEAD))
diff --git a/queue-4.15/tcp-revert-f-rto-extension-to-detect-more-spurious-timeouts.patch b/queue-4.15/tcp-revert-f-rto-extension-to-detect-more-spurious-timeouts.patch
new file mode 100644
index 00000000000..65a3e7e3a52
--- /dev/null
+++ b/queue-4.15/tcp-revert-f-rto-extension-to-detect-more-spurious-timeouts.patch
@@ -0,0 +1,84 @@
+From foo@baz Tue Mar 6 19:02:56 PST 2018
+From: Yuchung Cheng
+Date: Tue, 27 Feb 2018 14:15:02 -0800
+Subject: tcp: revert F-RTO extension to detect more spurious timeouts
+
+From: Yuchung Cheng
+
+
+[ Upstream commit fc68e171d376c322e6777a3d7ac2f0278b68b17f ]
+
+This reverts commit 89fe18e44f7ee5ab1c90d0dff5835acee7751427.
+
+While the patch could detect more spurious timeouts, it could cause
+poor TCP performance on broken middle-boxes that modify TCP packets
+(e.g. receive window, SACK options). Since the performance gain is
+much smaller than the potential loss, the best solution is to fully
+revert the change.
+
+Fixes: 89fe18e44f7e ("tcp: extend F-RTO to catch more spurious timeouts")
+Reported-by: Teodor Milkov
+Signed-off-by: Yuchung Cheng
+Signed-off-by: Neal Cardwell
+Signed-off-by: David S. Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/ipv4/tcp_input.c | 30 ++++++++++++------------------
+ 1 file changed, 12 insertions(+), 18 deletions(-)
+
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -1915,6 +1915,7 @@ void tcp_enter_loss(struct sock *sk)
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
+ struct sk_buff *skb;
++ bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
+ bool is_reneg; /* is receiver reneging on SACKs? */
+ bool mark_lost;
+
+@@ -1973,15 +1974,12 @@ void tcp_enter_loss(struct sock *sk)
+ tp->high_seq = tp->snd_nxt;
+ tcp_ecn_queue_cwr(tp);
+
+- /* F-RTO RFC5682 sec 3.1 step 1 mandates to disable F-RTO
+- * if a previous recovery is underway, otherwise it may incorrectly
+- * call a timeout spurious if some previously retransmitted packets
+- * are s/acked (sec 3.2). We do not apply that retriction since
+- * retransmitted skbs are permanently tagged with TCPCB_EVER_RETRANS
+- * so FLAG_ORIG_SACK_ACKED is always correct. But we do disable F-RTO
+- * on PTMU discovery to avoid sending new data.
++ /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
++ * loss recovery is underway except recurring timeout(s) on
++ * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
+ */
+ tp->frto = net->ipv4.sysctl_tcp_frto &&
++ (new_recovery || icsk->icsk_retransmits) &&
+ !inet_csk(sk)->icsk_mtup.probe_size;
+ }
+
+@@ -2634,18 +2632,14 @@ static void tcp_process_loss(struct sock
+ tcp_try_undo_loss(sk, false))
+ return;
+
+- /* The ACK (s)acks some never-retransmitted data meaning not all
+- * the data packets before the timeout were lost. Therefore we
+- * undo the congestion window and state. This is essentially
+- * the operation in F-RTO (RFC5682 section 3.1 step 3.b). Since
+- * a retransmitted skb is permantly marked, we can apply such an
+- * operation even if F-RTO was not used.
+- */
+- if ((flag & FLAG_ORIG_SACK_ACKED) &&
+- tcp_try_undo_loss(sk, tp->undo_marker))
+- return;
+-
+ if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
++ /* Step 3.b. A timeout is spurious if not all data are
++ * lost, i.e., never-retransmitted data are (s)acked.
++ */
++ if ((flag & FLAG_ORIG_SACK_ACKED) &&
++ tcp_try_undo_loss(sk, true))
++ return;
++
+ if (after(tp->snd_nxt, tp->high_seq)) {
+ if (flag & FLAG_DATA_SACKED || is_dupack)
+ tp->frto = 0; /* Step 3.a. loss was real */
diff --git a/queue-4.15/tcp-revert-f-rto-middle-box-workaround.patch b/queue-4.15/tcp-revert-f-rto-middle-box-workaround.patch
new file mode 100644
index 00000000000..e3d3c3fcb4b
--- /dev/null
+++ b/queue-4.15/tcp-revert-f-rto-middle-box-workaround.patch
@@ -0,0 +1,65 @@
+From foo@baz Tue Mar 6 19:02:56 PST 2018
+From: Yuchung Cheng
+Date: Tue, 27 Feb 2018 14:15:01 -0800
+Subject: tcp: revert F-RTO middle-box workaround
+
+From: Yuchung Cheng
+
+
+[ Upstream commit d4131f09770d9b7471c9da65e6ecd2477746ac5c ]
+
+This reverts commit cc663f4d4c97b7297fb45135ab23cfd508b35a77. While fixing
+some broken middle-boxes that modify receive window fields, it does not
+address middle-boxes that strip off SACK options. The best solution is
+to fully revert this patch and the root F-RTO enhancement.
+
+Fixes: cc663f4d4c97 ("tcp: restrict F-RTO to work-around broken middle-boxes")
+Reported-by: Teodor Milkov
+Signed-off-by: Yuchung Cheng
+Signed-off-by: Neal Cardwell
+Signed-off-by: David S. Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/ipv4/tcp_input.c | 17 +++++++----------
+ 1 file changed, 7 insertions(+), 10 deletions(-)
+
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index 45f750e85714..50963f92a67d 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -1915,7 +1915,6 @@ void tcp_enter_loss(struct sock *sk)
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
+ struct sk_buff *skb;
+- bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
+ bool is_reneg; /* is receiver reneging on SACKs?
*/ + bool mark_lost; + +@@ -1974,17 +1973,15 @@ void tcp_enter_loss(struct sock *sk) + tp->high_seq = tp->snd_nxt; + tcp_ecn_queue_cwr(tp); + +- /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous +- * loss recovery is underway except recurring timeout(s) on +- * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing +- * +- * In theory F-RTO can be used repeatedly during loss recovery. +- * In practice this interacts badly with broken middle-boxes that +- * falsely raise the receive window, which results in repeated +- * timeouts and stop-and-go behavior. ++ /* F-RTO RFC5682 sec 3.1 step 1 mandates to disable F-RTO ++ * if a previous recovery is underway, otherwise it may incorrectly ++ * call a timeout spurious if some previously retransmitted packets ++ * are s/acked (sec 3.2). We do not apply that retriction since ++ * retransmitted skbs are permanently tagged with TCPCB_EVER_RETRANS ++ * so FLAG_ORIG_SACK_ACKED is always correct. But we do disable F-RTO ++ * on PTMU discovery to avoid sending new data. + */ + tp->frto = net->ipv4.sysctl_tcp_frto && +- (new_recovery || icsk->icsk_retransmits) && + !inet_csk(sk)->icsk_mtup.probe_size; + } + +-- +2.14.3 + diff --git a/queue-4.15/tcp-tracepoint-only-call-trace_tcp_send_reset-with-full-socket.patch b/queue-4.15/tcp-tracepoint-only-call-trace_tcp_send_reset-with-full-socket.patch new file mode 100644 index 00000000000..aca5ce6e339 --- /dev/null +++ b/queue-4.15/tcp-tracepoint-only-call-trace_tcp_send_reset-with-full-socket.patch @@ -0,0 +1,55 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Song Liu +Date: Tue, 6 Feb 2018 20:50:23 -0800 +Subject: tcp: tracepoint: only call trace_tcp_send_reset with full socket + +From: Song Liu + + +[ Upstream commit 5c487bb9adddbc1d23433e09d2548759375c2b52 ] + +tracepoint tcp_send_reset requires a full socket to work. However, it +may be called when in TCP_TIME_WAIT: + + case TCP_TW_RST: + tcp_v6_send_reset(sk, skb); + inet_twsk_deschedule_put(inet_twsk(sk)); + goto discard_it; + +To avoid this problem, this patch checks the socket with sk_fullsock() +before calling trace_tcp_send_reset(). + +Fixes: c24b14c46bb8 ("tcp: add tracepoint trace_tcp_send_reset") +Signed-off-by: Song Liu +Reviewed-by: Lawrence Brakmo +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_ipv4.c | 3 ++- + net/ipv6/tcp_ipv6.c | 3 ++- + 2 files changed, 4 insertions(+), 2 deletions(-) + +--- a/net/ipv4/tcp_ipv4.c ++++ b/net/ipv4/tcp_ipv4.c +@@ -705,7 +705,8 @@ static void tcp_v4_send_reset(const stru + */ + if (sk) { + arg.bound_dev_if = sk->sk_bound_dev_if; +- trace_tcp_send_reset(sk, skb); ++ if (sk_fullsock(sk)) ++ trace_tcp_send_reset(sk, skb); + } + + BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) != +--- a/net/ipv6/tcp_ipv6.c ++++ b/net/ipv6/tcp_ipv6.c +@@ -943,7 +943,8 @@ static void tcp_v6_send_reset(const stru + + if (sk) { + oif = sk->sk_bound_dev_if; +- trace_tcp_send_reset(sk, skb); ++ if (sk_fullsock(sk)) ++ trace_tcp_send_reset(sk, skb); + } + + tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0); diff --git a/queue-4.15/tcp_bbr-better-deal-with-suboptimal-gso.patch b/queue-4.15/tcp_bbr-better-deal-with-suboptimal-gso.patch new file mode 100644 index 00000000000..13f36cfa33e --- /dev/null +++ b/queue-4.15/tcp_bbr-better-deal-with-suboptimal-gso.patch @@ -0,0 +1,89 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Eric Dumazet +Date: Wed, 21 Feb 2018 06:43:03 -0800 +Subject: tcp_bbr: better deal with suboptimal GSO + +From: Eric Dumazet + + +[ Upstream commit 350c9f484bde93ef229682eedd98cd5f74350f7f ] + +BBR uses tcp_tso_autosize() in an attempt to probe what would be the +burst sizes and to adjust cwnd in bbr_target_cwnd() with following +gold formula : + +/* Allow enough full-sized skbs in flight to utilize end systems. */ +cwnd += 3 * bbr->tso_segs_goal; + +But GSO can be lacking or be constrained to very small +units (ip link set dev ... gso_max_segs 2) + +What we really want is to have enough packets in flight so that both +GSO and GRO are efficient. + +So in the case GSO is off or downgraded, we still want to have the same +number of packets in flight as if GSO/TSO was fully operational, so +that GRO can hopefully be working efficiently. + +To fix this issue, we make tcp_tso_autosize() unaware of +sk->sk_gso_max_segs + +Only tcp_tso_segs() has to enforce the gso_max_segs limit. + +Tested: + +ethtool -K eth0 tso off gso off +tc qd replace dev eth0 root pfifo_fast + +Before patch: +for f in {1..5}; do ./super_netperf 1 -H lpaa24 -- -K bbr; done +    691  (ss -temoi shows cwnd is stuck around 6 ) +    667 +    651 +    631 +    517 + +After patch : +# for f in {1..5}; do ./super_netperf 1 -H lpaa24 -- -K bbr; done +   1733 (ss -temoi shows cwnd is around 386 ) +   1778 +   1746 +   1781 +   1718 + +Fixes: 0f8782ea1497 ("tcp_bbr: add BBR congestion control") +Signed-off-by: Eric Dumazet +Reported-by: Oleksandr Natalenko +Acked-by: Neal Cardwell +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_output.c | 9 +++++---- + 1 file changed, 5 insertions(+), 4 deletions(-) + +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -1730,7 +1730,7 @@ u32 tcp_tso_autosize(const struct sock * + */ + segs = max_t(u32, bytes / mss_now, min_tso_segs); + +- return min_t(u32, segs, sk->sk_gso_max_segs); ++ return segs; + } + EXPORT_SYMBOL(tcp_tso_autosize); + +@@ -1742,9 +1742,10 @@ static u32 tcp_tso_segs(struct sock *sk, + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; + u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0; + +- return tso_segs ? 
:
+- tcp_tso_autosize(sk, mss_now,
+- sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
++ if (!tso_segs)
++ tso_segs = tcp_tso_autosize(sk, mss_now,
++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
++ return min_t(u32, tso_segs, sk->sk_gso_max_segs);
+ }
+
+ /* Returns the portion of skb which can be sent right away */
diff --git a/queue-4.15/tls-use-correct-sk-sk_prot-for-ipv6.patch b/queue-4.15/tls-use-correct-sk-sk_prot-for-ipv6.patch
new file mode 100644
index 00000000000..f1e46c02b6b
--- /dev/null
+++ b/queue-4.15/tls-use-correct-sk-sk_prot-for-ipv6.patch
@@ -0,0 +1,122 @@
+From foo@baz Tue Mar 6 19:02:56 PST 2018
+From: Boris Pismenny
+Date: Tue, 27 Feb 2018 14:18:39 +0200
+Subject: tls: Use correct sk->sk_prot for IPV6
+
+From: Boris Pismenny
+
+
+[ Upstream commit c113187d38ff85dc302a1bb55864b203ebb2ba10 ]
+
+The tls ulp overrides sk->prot with new tls specific proto structs.
+The tls specific structs were previously based on the ipv4 specific
+tcp_prot struct.
+As a result, attaching the tls ulp to an ipv6 tcp socket replaced
+some ipv6 callbacks with the ipv4 equivalents.
+
+This patch adds ipv6 tls proto structs and uses them when
+attached to ipv6 sockets.
+
+Fixes: 3c4d7559159b ('tls: kernel TLS support')
+Signed-off-by: Boris Pismenny
+Signed-off-by: Ilya Lesokhin
+Signed-off-by: David S. Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/tls/tls_main.c | 52 +++++++++++++++++++++++++++++++++++++---------------
+ 1 file changed, 37 insertions(+), 15 deletions(-)
+
+--- a/net/tls/tls_main.c
++++ b/net/tls/tls_main.c
+@@ -46,16 +46,26 @@ MODULE_DESCRIPTION("Transport Layer Secu
+ MODULE_LICENSE("Dual BSD/GPL");
+
+ enum {
++ TLSV4,
++ TLSV6,
++ TLS_NUM_PROTS,
++};
++
++enum {
+ TLS_BASE_TX,
+ TLS_SW_TX,
+ TLS_NUM_CONFIG,
+ };
+
+-static struct proto tls_prots[TLS_NUM_CONFIG];
++static struct proto *saved_tcpv6_prot;
++static DEFINE_MUTEX(tcpv6_prot_mutex);
++static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG];
+
+ static inline void update_sk_prot(struct sock *sk, struct tls_context *ctx)
+ {
+- sk->sk_prot = &tls_prots[ctx->tx_conf];
++ int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4;
++
++ sk->sk_prot = &tls_prots[ip_ver][ctx->tx_conf];
+ }
+
+ int wait_on_pending_writer(struct sock *sk, long *timeo)
+@@ -450,8 +460,21 @@ static int tls_setsockopt(struct sock *s
+ return do_tls_setsockopt(sk, optname, optval, optlen);
+ }
+
++static void build_protos(struct proto *prot, struct proto *base)
++{
++ prot[TLS_BASE_TX] = *base;
++ prot[TLS_BASE_TX].setsockopt = tls_setsockopt;
++ prot[TLS_BASE_TX].getsockopt = tls_getsockopt;
++ prot[TLS_BASE_TX].close = tls_sk_proto_close;
++
++ prot[TLS_SW_TX] = prot[TLS_BASE_TX];
++ prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg;
++ prot[TLS_SW_TX].sendpage = tls_sw_sendpage;
++}
++
+ static int tls_init(struct sock *sk)
+ {
++ int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4;
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tls_context *ctx;
+ int rc = 0;
+@@ -476,6 +499,17 @@ static int tls_init(struct sock *sk)
+ ctx->getsockopt = sk->sk_prot->getsockopt;
+ ctx->sk_proto_close = sk->sk_prot->close;
+
++ /* Build IPv6 TLS whenever the address of tcpv6_prot changes */
++ if (ip_ver == TLSV6 &&
++ unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) {
++ mutex_lock(&tcpv6_prot_mutex);
++ if (likely(sk->sk_prot != saved_tcpv6_prot)) {
++ build_protos(tls_prots[TLSV6], sk->sk_prot);
++ smp_store_release(&saved_tcpv6_prot, sk->sk_prot);
++ }
++ mutex_unlock(&tcpv6_prot_mutex);
++ }
++
+ ctx->tx_conf = TLS_BASE_TX;
+ update_sk_prot(sk, ctx);
+ out:
+@@ -488,21 +522,9 @@ static struct tcp_ulp_ops tcp_tls_ulp_op
+ .init = tls_init,
+ };
+
+-static void build_protos(struct proto *prot, struct proto *base)
+-{
+- prot[TLS_BASE_TX] = *base;
+- prot[TLS_BASE_TX].setsockopt = tls_setsockopt;
+- prot[TLS_BASE_TX].getsockopt = tls_getsockopt;
+- prot[TLS_BASE_TX].close = tls_sk_proto_close;
+-
+- prot[TLS_SW_TX] = prot[TLS_BASE_TX];
+- prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg;
+- prot[TLS_SW_TX].sendpage = tls_sw_sendpage;
+-}
+-
+ static int __init tls_register(void)
+ {
+- build_protos(tls_prots, &tcp_prot);
++ build_protos(tls_prots[TLSV4], &tcp_prot);
+
+ tcp_register_ulp(&tcp_tls_ulp_ops);
+
diff --git a/queue-4.15/tuntap-correctly-add-the-missing-xdp-flush.patch b/queue-4.15/tuntap-correctly-add-the-missing-xdp-flush.patch
new file mode 100644
index 00000000000..1c66e47db4b
--- /dev/null
+++ b/queue-4.15/tuntap-correctly-add-the-missing-xdp-flush.patch
@@ -0,0 +1,37 @@
+From foo@baz Tue Mar 6 19:02:56 PST 2018
+From: Jason Wang
+Date: Sat, 24 Feb 2018 11:32:26 +0800
+Subject: tuntap: correctly add the missing XDP flush
+
+From: Jason Wang
+
+
+[ Upstream commit 1bb4f2e868a2891ab8bc668b8173d6ccb8c4ce6f ]
+
+We don't flush batched XDP packets through xdp_do_flush_map(), and this
+will cause packets to stall at the TX queue. Since we don't do XDP on
+NAPI poll(), the only possible fix is to call xdp_do_flush_map()
+immediately after xdp_do_redirect().
+
+Note, this in fact won't try to batch packets through devmap; we could
+address that in the future.
+
+Reported-by: Christoffer Dall
+Fixes: 761876c857cb ("tap: XDP support")
+Signed-off-by: Jason Wang
+Signed-off-by: David S. Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/net/tun.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/net/tun.c
++++ b/drivers/net/tun.c
+@@ -1490,6 +1490,7 @@ static struct sk_buff *tun_build_skb(str
+ get_page(alloc_frag->page);
+ alloc_frag->offset += buflen;
+ err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
++ xdp_do_flush_map();
+ if (err)
+ goto err_redirect;
+ rcu_read_unlock();
diff --git a/queue-4.15/tuntap-disable-preemption-during-xdp-processing.patch b/queue-4.15/tuntap-disable-preemption-during-xdp-processing.patch
new file mode 100644
index 00000000000..db31a0850d3
--- /dev/null
+++ b/queue-4.15/tuntap-disable-preemption-during-xdp-processing.patch
@@ -0,0 +1,75 @@
+From foo@baz Tue Mar 6 19:02:56 PST 2018
+From: Jason Wang
+Date: Sat, 24 Feb 2018 11:32:25 +0800
+Subject: tuntap: disable preemption during XDP processing
+
+From: Jason Wang
+
+
+[ Upstream commit 23e43f07f896f8578318cfcc9466f1e8b8ab21b6 ]
+
+Except for tuntap, all other drivers' XDP was implemented in the NAPI
+poll() routine in a bh. This guarantees all XDP operations were done on
+the same CPU, which is required by e.g. BPF_MAP_TYPE_PERCPU_ARRAY.
But +for tuntap, we do it in process context and we try to protect XDP +processing by RCU reader lock. This is insufficient since +CONFIG_PREEMPT_RCU can preempt the RCU reader critical section which +breaks the assumption that all XDP were processed in the same CPU. + +Fixing this by simply disabling preemption during XDP processing. + +Fixes: 761876c857cb ("tap: XDP support") +Signed-off-by: Jason Wang +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/tun.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/drivers/net/tun.c ++++ b/drivers/net/tun.c +@@ -1471,6 +1471,7 @@ static struct sk_buff *tun_build_skb(str + else + *skb_xdp = 0; + ++ preempt_disable(); + rcu_read_lock(); + xdp_prog = rcu_dereference(tun->xdp_prog); + if (xdp_prog && !*skb_xdp) { +@@ -1494,6 +1495,7 @@ static struct sk_buff *tun_build_skb(str + if (err) + goto err_redirect; + rcu_read_unlock(); ++ preempt_enable(); + return NULL; + case XDP_TX: + xdp_xmit = true; +@@ -1515,6 +1517,7 @@ static struct sk_buff *tun_build_skb(str + skb = build_skb(buf, buflen); + if (!skb) { + rcu_read_unlock(); ++ preempt_enable(); + return ERR_PTR(-ENOMEM); + } + +@@ -1527,10 +1530,12 @@ static struct sk_buff *tun_build_skb(str + skb->dev = tun->dev; + generic_xdp_tx(skb, xdp_prog); + rcu_read_unlock(); ++ preempt_enable(); + return NULL; + } + + rcu_read_unlock(); ++ preempt_enable(); + + return skb; + +@@ -1538,6 +1543,7 @@ err_redirect: + put_page(alloc_frag->page); + err_xdp: + rcu_read_unlock(); ++ preempt_enable(); + this_cpu_inc(tun->pcpu_stats->rx_dropped); + return NULL; + } diff --git a/queue-4.15/udplite-fix-partial-checksum-initialization.patch b/queue-4.15/udplite-fix-partial-checksum-initialization.patch new file mode 100644 index 00000000000..ee73e2badc9 --- /dev/null +++ b/queue-4.15/udplite-fix-partial-checksum-initialization.patch @@ -0,0 +1,76 @@ +From foo@baz Tue Mar 6 19:02:56 PST 2018 +From: Alexey Kodanev +Date: Thu, 15 Feb 2018 20:18:43 +0300 +Subject: udplite: fix partial checksum initialization + +From: Alexey Kodanev + + +[ Upstream commit 15f35d49c93f4fa9875235e7bf3e3783d2dd7a1b ] + +Since UDP-Lite is always using checksum, the following path is +triggered when calculating pseudo header for it: + + udp4_csum_init() or udp6_csum_init() + skb_checksum_init_zero_check() + __skb_checksum_validate_complete() + +The problem can appear if skb->len is less than CHECKSUM_BREAK. In +this particular case __skb_checksum_validate_complete() also invokes +__skb_checksum_complete(skb). If UDP-Lite is using partial checksum +that covers only part of a packet, the function will return bad +checksum and the packet will be dropped. + +It can be fixed if we skip skb_checksum_init_zero_check() and only +set the required pseudo header checksum for UDP-Lite with partial +checksum before udp4_csum_init()/udp6_csum_init() functions return. + +Fixes: ed70fcfcee95 ("net: Call skb_checksum_init in IPv4") +Fixes: e4f45b7f40bd ("net: Call skb_checksum_init in IPv6") +Signed-off-by: Alexey Kodanev +Signed-off-by: David S. 
Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ include/net/udplite.h | 1 +
+ net/ipv4/udp.c | 5 +++++
+ net/ipv6/ip6_checksum.c | 5 +++++
+ 3 files changed, 11 insertions(+)
+
+--- a/include/net/udplite.h
++++ b/include/net/udplite.h
+@@ -64,6 +64,7 @@ static inline int udplite_checksum_init(
+ UDP_SKB_CB(skb)->cscov = cscov;
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->ip_summed = CHECKSUM_NONE;
++ skb->csum_valid = 0;
+ }
+
+ return 0;
+--- a/net/ipv4/udp.c
++++ b/net/ipv4/udp.c
+@@ -2031,6 +2031,11 @@ static inline int udp4_csum_init(struct
+ err = udplite_checksum_init(skb, uh);
+ if (err)
+ return err;
++
++ if (UDP_SKB_CB(skb)->partial_cov) {
++ skb->csum = inet_compute_pseudo(skb, proto);
++ return 0;
++ }
+ }
+
+ /* Note, we are only interested in != 0 or == 0, thus the
+--- a/net/ipv6/ip6_checksum.c
++++ b/net/ipv6/ip6_checksum.c
+@@ -73,6 +73,11 @@ int udp6_csum_init(struct sk_buff *skb,
+ err = udplite_checksum_init(skb, uh);
+ if (err)
+ return err;
++
++ if (UDP_SKB_CB(skb)->partial_cov) {
++ skb->csum = ip6_compute_pseudo(skb, proto);
++ return 0;
++ }
+ }
+
+ /* To support RFC 6936 (allow zero checksum in UDP/IPV6 for tunnels)
diff --git a/queue-4.15/virtio-net-disable-napi-only-when-enabled-during-xdp-set.patch b/queue-4.15/virtio-net-disable-napi-only-when-enabled-during-xdp-set.patch
new file mode 100644
index 00000000000..31376ddbeaf
--- /dev/null
+++ b/queue-4.15/virtio-net-disable-napi-only-when-enabled-during-xdp-set.patch
@@ -0,0 +1,49 @@
+From foo@baz Tue Mar 6 19:02:56 PST 2018
+From: Jason Wang
+Date: Wed, 28 Feb 2018 18:20:04 +0800
+Subject: virtio-net: disable NAPI only when enabled during XDP set
+
+From: Jason Wang
+
+
+[ Upstream commit 4e09ff5362843dff3accfa84c805c7f3a99de9cd ]
+
+We try to disable NAPI to prevent a single XDP TX queue being used by
+multiple cpus. But we don't check if the device is up (NAPI is
+enabled), which could result in a stall because of an infinite wait in
+napi_disable(). Fix this by checking the device state through
+netif_running() first.
+
+Fixes: 4941d472bf95b ("virtio-net: do not reset during XDP set")
+Signed-off-by: Jason Wang
+Acked-by: Michael S. Tsirkin
+Signed-off-by: David S. Miller
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/net/virtio_net.c | 8 +++++---
+ 1 file changed, 5 insertions(+), 3 deletions(-)
+
+--- a/drivers/net/virtio_net.c
++++ b/drivers/net/virtio_net.c
+@@ -2040,8 +2040,9 @@ static int virtnet_xdp_set(struct net_de
+ }
+
+ /* Make sure NAPI is not using any XDP TX queues for RX. */
+- for (i = 0; i < vi->max_queue_pairs; i++)
+- napi_disable(&vi->rq[i].napi);
++ if (netif_running(dev))
++ for (i = 0; i < vi->max_queue_pairs; i++)
++ napi_disable(&vi->rq[i].napi);
+
+ netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
+ err = _virtnet_set_queues(vi, curr_qp + xdp_qp);
+@@ -2060,7 +2061,8 @@ static int virtnet_xdp_set(struct net_de
+ }
+ if (old_prog)
+ bpf_prog_put(old_prog);
+- virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
++ if (netif_running(dev))
++ virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
+ }
+
+ return 0;