git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.15-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 7 Mar 2018 03:31:04 +0000 (19:31 -0800)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 7 Mar 2018 03:31:04 +0000 (19:31 -0800)
added patches:
amd-xgbe-restore-pci-interrupt-enablement-setting-on-resume.patch
bridge-check-brport-attr-show-in-brport_show.patch
bridge-fix-vlan-reference-count-problem.patch
cls_u32-fix-use-after-free-in-u32_destroy_key.patch
cxgb4-fix-trailing-zero-in-cim-la-dump.patch
doc-change-the-min-default-value-of-tcp_wmem-tcp_rmem.patch
fib_semantics-don-t-match-route-with-mismatching-tclassid.patch
hdlc_ppp-carrier-detect-ok-don-t-turn-off-negotiation.patch
ipv6-sit-work-around-bogus-gcc-8-wrestrict-warning.patch
l2tp-don-t-use-inet_shutdown-on-ppp-session-destroy.patch
l2tp-don-t-use-inet_shutdown-on-tunnel-destroy.patch
l2tp-fix-race-in-pppol2tp_release-with-session-object-destroy.patch
l2tp-fix-races-with-tunnel-socket-close.patch
l2tp-fix-tunnel-lookup-use-after-free-race.patch
mlxsw-spectrum_router-do-not-unconditionally-clear-route-offload-indication.patch
mlxsw-spectrum_router-fix-error-path-in-mlxsw_sp_vr_create.patch
mlxsw-spectrum_switchdev-check-success-of-fdb-add-operation.patch
net-amd-xgbe-fix-comparison-to-bitshift-when-dealing-with-a-mask.patch
net-ethernet-ti-cpsw-fix-net-watchdog-timeout.patch
net-fix-race-on-decreasing-number-of-tx-queues.patch
net-ipv4-don-t-allow-setting-net.ipv4.route.min_pmtu-below-68.patch
net-ipv4-set-addr_type-in-hash_keys-for-forwarded-case.patch
net-mlx5-fix-error-handling-when-adding-flow-rules.patch
net-mlx5e-fix-loopback-self-test-when-gro-is-off.patch
net-mlx5e-fix-tcp-checksum-in-lro-buffers.patch
net-mlx5e-specify-numa-node-when-allocating-drop-rq.patch
net-mlx5e-verify-inline-header-size-do-not-exceed-skb-linear-size.patch
net-phy-fix-phy_start-to-consider-phy_ignore_interrupt.patch
net-phy-restore-phy_resume-locking-assumption.patch
net-sched-cls_u32-fix-cls_u32-on-filter-replace.patch
net-sched-report-if-filter-is-too-large-to-dump.patch
net_sched-gen_estimator-fix-broken-estimators-based-on-percpu-stats.patch
netlink-ensure-to-loop-over-all-netns-in-genlmsg_multicast_allns.patch
netlink-put-module-reference-if-dump-start-fails.patch
ppp-prevent-unregistered-channels-from-connecting-to-ppp-units.patch
revert-s390-qeth-fix-using-of-ref-counter-for-rxip-addresses.patch
rxrpc-fix-send-in-rxrpc_send_data_packet.patch
s390-qeth-fix-double-free-on-ip-add-remove-race.patch
s390-qeth-fix-ip-address-lookup-for-l3-devices.patch
s390-qeth-fix-ip-removal-on-offline-cards.patch
s390-qeth-fix-ipa-command-submission-race.patch
s390-qeth-fix-overestimated-count-of-buffer-elements.patch
s390-qeth-fix-setip-command-handling.patch
s390-qeth-fix-underestimated-count-of-buffer-elements.patch
sctp-do-not-pr_err-for-the-duplicated-node-in-transport-rhlist.patch
sctp-fix-dst-refcnt-leak-in-sctp_v4_get_dst.patch
sctp-fix-dst-refcnt-leak-in-sctp_v6_get_dst.patch
sctp-verify-size-of-a-new-chunk-in-_sctp_make_chunk.patch
tcp-honor-the-eor-bit-in-tcp_mtu_probe.patch
tcp-purge-write-queue-upon-rst.patch
tcp-revert-f-rto-extension-to-detect-more-spurious-timeouts.patch
tcp-revert-f-rto-middle-box-workaround.patch
tcp-tracepoint-only-call-trace_tcp_send_reset-with-full-socket.patch
tcp_bbr-better-deal-with-suboptimal-gso.patch
tls-use-correct-sk-sk_prot-for-ipv6.patch
tuntap-correctly-add-the-missing-xdp-flush.patch
tuntap-disable-preemption-during-xdp-processing.patch
udplite-fix-partial-checksum-initialization.patch
virtio-net-disable-napi-only-when-enabled-during-xdp-set.patch

60 files changed:
queue-4.15/amd-xgbe-restore-pci-interrupt-enablement-setting-on-resume.patch [new file with mode: 0644]
queue-4.15/bridge-check-brport-attr-show-in-brport_show.patch [new file with mode: 0644]
queue-4.15/bridge-fix-vlan-reference-count-problem.patch [new file with mode: 0644]
queue-4.15/cls_u32-fix-use-after-free-in-u32_destroy_key.patch [new file with mode: 0644]
queue-4.15/cxgb4-fix-trailing-zero-in-cim-la-dump.patch [new file with mode: 0644]
queue-4.15/doc-change-the-min-default-value-of-tcp_wmem-tcp_rmem.patch [new file with mode: 0644]
queue-4.15/fib_semantics-don-t-match-route-with-mismatching-tclassid.patch [new file with mode: 0644]
queue-4.15/hdlc_ppp-carrier-detect-ok-don-t-turn-off-negotiation.patch [new file with mode: 0644]
queue-4.15/ipv6-sit-work-around-bogus-gcc-8-wrestrict-warning.patch [new file with mode: 0644]
queue-4.15/l2tp-don-t-use-inet_shutdown-on-ppp-session-destroy.patch [new file with mode: 0644]
queue-4.15/l2tp-don-t-use-inet_shutdown-on-tunnel-destroy.patch [new file with mode: 0644]
queue-4.15/l2tp-fix-race-in-pppol2tp_release-with-session-object-destroy.patch [new file with mode: 0644]
queue-4.15/l2tp-fix-races-with-tunnel-socket-close.patch [new file with mode: 0644]
queue-4.15/l2tp-fix-tunnel-lookup-use-after-free-race.patch [new file with mode: 0644]
queue-4.15/mlxsw-spectrum_router-do-not-unconditionally-clear-route-offload-indication.patch [new file with mode: 0644]
queue-4.15/mlxsw-spectrum_router-fix-error-path-in-mlxsw_sp_vr_create.patch [new file with mode: 0644]
queue-4.15/mlxsw-spectrum_switchdev-check-success-of-fdb-add-operation.patch [new file with mode: 0644]
queue-4.15/net-amd-xgbe-fix-comparison-to-bitshift-when-dealing-with-a-mask.patch [new file with mode: 0644]
queue-4.15/net-ethernet-ti-cpsw-fix-net-watchdog-timeout.patch [new file with mode: 0644]
queue-4.15/net-fix-race-on-decreasing-number-of-tx-queues.patch [new file with mode: 0644]
queue-4.15/net-ipv4-don-t-allow-setting-net.ipv4.route.min_pmtu-below-68.patch [new file with mode: 0644]
queue-4.15/net-ipv4-set-addr_type-in-hash_keys-for-forwarded-case.patch [new file with mode: 0644]
queue-4.15/net-mlx5-fix-error-handling-when-adding-flow-rules.patch [new file with mode: 0644]
queue-4.15/net-mlx5e-fix-loopback-self-test-when-gro-is-off.patch [new file with mode: 0644]
queue-4.15/net-mlx5e-fix-tcp-checksum-in-lro-buffers.patch [new file with mode: 0644]
queue-4.15/net-mlx5e-specify-numa-node-when-allocating-drop-rq.patch [new file with mode: 0644]
queue-4.15/net-mlx5e-verify-inline-header-size-do-not-exceed-skb-linear-size.patch [new file with mode: 0644]
queue-4.15/net-phy-fix-phy_start-to-consider-phy_ignore_interrupt.patch [new file with mode: 0644]
queue-4.15/net-phy-restore-phy_resume-locking-assumption.patch [new file with mode: 0644]
queue-4.15/net-sched-cls_u32-fix-cls_u32-on-filter-replace.patch [new file with mode: 0644]
queue-4.15/net-sched-report-if-filter-is-too-large-to-dump.patch [new file with mode: 0644]
queue-4.15/net_sched-gen_estimator-fix-broken-estimators-based-on-percpu-stats.patch [new file with mode: 0644]
queue-4.15/netlink-ensure-to-loop-over-all-netns-in-genlmsg_multicast_allns.patch [new file with mode: 0644]
queue-4.15/netlink-put-module-reference-if-dump-start-fails.patch [new file with mode: 0644]
queue-4.15/ppp-prevent-unregistered-channels-from-connecting-to-ppp-units.patch [new file with mode: 0644]
queue-4.15/revert-s390-qeth-fix-using-of-ref-counter-for-rxip-addresses.patch [new file with mode: 0644]
queue-4.15/rxrpc-fix-send-in-rxrpc_send_data_packet.patch [new file with mode: 0644]
queue-4.15/s390-qeth-fix-double-free-on-ip-add-remove-race.patch [new file with mode: 0644]
queue-4.15/s390-qeth-fix-ip-address-lookup-for-l3-devices.patch [new file with mode: 0644]
queue-4.15/s390-qeth-fix-ip-removal-on-offline-cards.patch [new file with mode: 0644]
queue-4.15/s390-qeth-fix-ipa-command-submission-race.patch [new file with mode: 0644]
queue-4.15/s390-qeth-fix-overestimated-count-of-buffer-elements.patch [new file with mode: 0644]
queue-4.15/s390-qeth-fix-setip-command-handling.patch [new file with mode: 0644]
queue-4.15/s390-qeth-fix-underestimated-count-of-buffer-elements.patch [new file with mode: 0644]
queue-4.15/sctp-do-not-pr_err-for-the-duplicated-node-in-transport-rhlist.patch [new file with mode: 0644]
queue-4.15/sctp-fix-dst-refcnt-leak-in-sctp_v4_get_dst.patch [new file with mode: 0644]
queue-4.15/sctp-fix-dst-refcnt-leak-in-sctp_v6_get_dst.patch [new file with mode: 0644]
queue-4.15/sctp-verify-size-of-a-new-chunk-in-_sctp_make_chunk.patch [new file with mode: 0644]
queue-4.15/series
queue-4.15/tcp-honor-the-eor-bit-in-tcp_mtu_probe.patch [new file with mode: 0644]
queue-4.15/tcp-purge-write-queue-upon-rst.patch [new file with mode: 0644]
queue-4.15/tcp-revert-f-rto-extension-to-detect-more-spurious-timeouts.patch [new file with mode: 0644]
queue-4.15/tcp-revert-f-rto-middle-box-workaround.patch [new file with mode: 0644]
queue-4.15/tcp-tracepoint-only-call-trace_tcp_send_reset-with-full-socket.patch [new file with mode: 0644]
queue-4.15/tcp_bbr-better-deal-with-suboptimal-gso.patch [new file with mode: 0644]
queue-4.15/tls-use-correct-sk-sk_prot-for-ipv6.patch [new file with mode: 0644]
queue-4.15/tuntap-correctly-add-the-missing-xdp-flush.patch [new file with mode: 0644]
queue-4.15/tuntap-disable-preemption-during-xdp-processing.patch [new file with mode: 0644]
queue-4.15/udplite-fix-partial-checksum-initialization.patch [new file with mode: 0644]
queue-4.15/virtio-net-disable-napi-only-when-enabled-during-xdp-set.patch [new file with mode: 0644]

diff --git a/queue-4.15/amd-xgbe-restore-pci-interrupt-enablement-setting-on-resume.patch b/queue-4.15/amd-xgbe-restore-pci-interrupt-enablement-setting-on-resume.patch
new file mode 100644 (file)
index 0000000..962e714
--- /dev/null
@@ -0,0 +1,31 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Tom Lendacky <thomas.lendacky@amd.com>
+Date: Tue, 20 Feb 2018 15:22:05 -0600
+Subject: amd-xgbe: Restore PCI interrupt enablement setting on resume
+
+From: Tom Lendacky <thomas.lendacky@amd.com>
+
+
+[ Upstream commit cfd092f2db8b4b6727e1c03ef68a7842e1023573 ]
+
+After resuming from suspend, the PCI device support must re-enable the
+interrupt setting so that interrupts are actually delivered.
+
+Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/amd/xgbe/xgbe-pci.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/drivers/net/ethernet/amd/xgbe/xgbe-pci.c
++++ b/drivers/net/ethernet/amd/xgbe/xgbe-pci.c
+@@ -426,6 +426,8 @@ static int xgbe_pci_resume(struct pci_de
+       struct net_device *netdev = pdata->netdev;
+       int ret = 0;
++      XP_IOWRITE(pdata, XP_INT_EN, 0x1fffff);
++
+       pdata->lpm_ctrl &= ~MDIO_CTRL1_LPOWER;
+       XMDIO_WRITE(pdata, MDIO_MMD_PCS, MDIO_CTRL1, pdata->lpm_ctrl);
diff --git a/queue-4.15/bridge-check-brport-attr-show-in-brport_show.patch b/queue-4.15/bridge-check-brport-attr-show-in-brport_show.patch
new file mode 100644 (file)
index 0000000..191b43c
--- /dev/null
@@ -0,0 +1,48 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Xin Long <lucien.xin@gmail.com>
+Date: Mon, 12 Feb 2018 17:15:40 +0800
+Subject: bridge: check brport attr show in brport_show
+
+From: Xin Long <lucien.xin@gmail.com>
+
+
+[ Upstream commit 1b12580af1d0677c3c3a19e35bfe5d59b03f737f ]
+
+The br_sysfs_if "flush" file does not have a show attribute. Reading it
+causes a kernel panic once a user has run chmod u+r on this file.
+
+Xiong found this issue when running the commands:
+
+  ip link add br0 type bridge
+  ip link add type veth
+  ip link set veth0 master br0
+  chmod u+r /sys/devices/virtual/net/veth0/brport/flush
+  timeout 3 cat /sys/devices/virtual/net/veth0/brport/flush
+
+The kernel crashed with a NULL pointer dereference call trace.
+
+This patch fixes it by returning -EINVAL when brport_attr->show
+is NULL, the same as the check for brport_attr->store in
+brport_store().
+
+Fixes: 9cf637473c85 ("bridge: add sysfs hook to flush forwarding table")
+Reported-by: Xiong Zhou <xzhou@redhat.com>
+Signed-off-by: Xin Long <lucien.xin@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/bridge/br_sysfs_if.c |    3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/net/bridge/br_sysfs_if.c
++++ b/net/bridge/br_sysfs_if.c
+@@ -255,6 +255,9 @@ static ssize_t brport_show(struct kobjec
+       struct brport_attribute *brport_attr = to_brport_attr(attr);
+       struct net_bridge_port *p = to_brport(kobj);
++      if (!brport_attr->show)
++              return -EINVAL;
++
+       return brport_attr->show(p, buf);
+ }
diff --git a/queue-4.15/bridge-fix-vlan-reference-count-problem.patch b/queue-4.15/bridge-fix-vlan-reference-count-problem.patch
new file mode 100644 (file)
index 0000000..5d4bd5b
--- /dev/null
@@ -0,0 +1,40 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Ido Schimmel <idosch@mellanox.com>
+Date: Sun, 25 Feb 2018 21:59:06 +0200
+Subject: bridge: Fix VLAN reference count problem
+
+From: Ido Schimmel <idosch@mellanox.com>
+
+
+[ Upstream commit 0e5a82efda872c2469c210957d7d4161ef8f4391 ]
+
+When a VLAN is added on a port, a reference is taken on the
+corresponding master VLAN entry. If it does not already exist, then it
+is created and a reference taken.
+
+However, in the second case a reference is not really taken when
+CONFIG_REFCOUNT_FULL is enabled as refcount_inc() is replaced by
+refcount_inc_not_zero().
+
+Fix this by using refcount_set() on a newly created master VLAN entry.
+
+Fixes: 251277598596 ("net, bridge: convert net_bridge_vlan.refcnt from atomic_t to refcount_t")
+Signed-off-by: Ido Schimmel <idosch@mellanox.com>
+Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/bridge/br_vlan.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/net/bridge/br_vlan.c
++++ b/net/bridge/br_vlan.c
+@@ -168,6 +168,8 @@ static struct net_bridge_vlan *br_vlan_g
+               masterv = br_vlan_find(vg, vid);
+               if (WARN_ON(!masterv))
+                       return NULL;
++              refcount_set(&masterv->refcnt, 1);
++              return masterv;
+       }
+       refcount_inc(&masterv->refcnt);
diff --git a/queue-4.15/cls_u32-fix-use-after-free-in-u32_destroy_key.patch b/queue-4.15/cls_u32-fix-use-after-free-in-u32_destroy_key.patch
new file mode 100644 (file)
index 0000000..d30825e
--- /dev/null
@@ -0,0 +1,193 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Paolo Abeni <pabeni@redhat.com>
+Date: Mon, 5 Feb 2018 22:23:01 +0100
+Subject: cls_u32: fix use after free in u32_destroy_key()
+
+From: Paolo Abeni <pabeni@redhat.com>
+
+
+[ Upstream commit d7cdee5ea8d28ae1b6922deb0c1badaa3aa0ef8c ]
+
+Li Shuang reported an Oops with cls_u32 due to a use-after-free
+in u32_destroy_key(). The use-after-free can be triggered with:
+
+dev=lo
+tc qdisc add dev $dev root handle 1: htb default 10
+tc filter add dev $dev parent 1: prio 5 handle 1: protocol ip u32 divisor 256
+tc filter add dev $dev protocol ip parent 1: prio 5 u32 ht 800:: match ip dst\
+ 10.0.0.0/8 hashkey mask 0x0000ff00 at 16 link 1:
+tc qdisc del dev $dev root
+
+Which causes the following kasan splat:
+
+ ==================================================================
+ BUG: KASAN: use-after-free in u32_destroy_key.constprop.21+0x117/0x140 [cls_u32]
+ Read of size 4 at addr ffff881b83dae618 by task kworker/u48:5/571
+
+ CPU: 17 PID: 571 Comm: kworker/u48:5 Not tainted 4.15.0+ #87
+ Hardware name: Dell Inc. PowerEdge R730/072T6D, BIOS 2.1.7 06/16/2016
+ Workqueue: tc_filter_workqueue u32_delete_key_freepf_work [cls_u32]
+ Call Trace:
+  dump_stack+0xd6/0x182
+  ? dma_virt_map_sg+0x22e/0x22e
+  print_address_description+0x73/0x290
+  kasan_report+0x277/0x360
+  ? u32_destroy_key.constprop.21+0x117/0x140 [cls_u32]
+  u32_destroy_key.constprop.21+0x117/0x140 [cls_u32]
+  u32_delete_key_freepf_work+0x1c/0x30 [cls_u32]
+  process_one_work+0xae0/0x1c80
+  ? sched_clock+0x5/0x10
+  ? pwq_dec_nr_in_flight+0x3c0/0x3c0
+  ? _raw_spin_unlock_irq+0x29/0x40
+  ? trace_hardirqs_on_caller+0x381/0x570
+  ? _raw_spin_unlock_irq+0x29/0x40
+  ? finish_task_switch+0x1e5/0x760
+  ? finish_task_switch+0x208/0x760
+  ? preempt_notifier_dec+0x20/0x20
+  ? __schedule+0x839/0x1ee0
+  ? check_noncircular+0x20/0x20
+  ? firmware_map_remove+0x73/0x73
+  ? find_held_lock+0x39/0x1c0
+  ? worker_thread+0x434/0x1820
+  ? lock_contended+0xee0/0xee0
+  ? lock_release+0x1100/0x1100
+  ? init_rescuer.part.16+0x150/0x150
+  ? retint_kernel+0x10/0x10
+  worker_thread+0x216/0x1820
+  ? process_one_work+0x1c80/0x1c80
+  ? lock_acquire+0x1a5/0x540
+  ? lock_downgrade+0x6b0/0x6b0
+  ? sched_clock+0x5/0x10
+  ? lock_release+0x1100/0x1100
+  ? compat_start_thread+0x80/0x80
+  ? do_raw_spin_trylock+0x190/0x190
+  ? _raw_spin_unlock_irq+0x29/0x40
+  ? trace_hardirqs_on_caller+0x381/0x570
+  ? _raw_spin_unlock_irq+0x29/0x40
+  ? finish_task_switch+0x1e5/0x760
+  ? finish_task_switch+0x208/0x760
+  ? preempt_notifier_dec+0x20/0x20
+  ? __schedule+0x839/0x1ee0
+  ? kmem_cache_alloc_trace+0x143/0x320
+  ? firmware_map_remove+0x73/0x73
+  ? sched_clock+0x5/0x10
+  ? sched_clock_cpu+0x18/0x170
+  ? find_held_lock+0x39/0x1c0
+  ? schedule+0xf3/0x3b0
+  ? lock_downgrade+0x6b0/0x6b0
+  ? __schedule+0x1ee0/0x1ee0
+  ? do_wait_intr_irq+0x340/0x340
+  ? do_raw_spin_trylock+0x190/0x190
+  ? _raw_spin_unlock_irqrestore+0x32/0x60
+  ? process_one_work+0x1c80/0x1c80
+  ? process_one_work+0x1c80/0x1c80
+  kthread+0x312/0x3d0
+  ? kthread_create_worker_on_cpu+0xc0/0xc0
+  ret_from_fork+0x3a/0x50
+
+ Allocated by task 1688:
+  kasan_kmalloc+0xa0/0xd0
+  __kmalloc+0x162/0x380
+  u32_change+0x1220/0x3c9e [cls_u32]
+  tc_ctl_tfilter+0x1ba6/0x2f80
+  rtnetlink_rcv_msg+0x4f0/0x9d0
+  netlink_rcv_skb+0x124/0x320
+  netlink_unicast+0x430/0x600
+  netlink_sendmsg+0x8fa/0xd60
+  sock_sendmsg+0xb1/0xe0
+  ___sys_sendmsg+0x678/0x980
+  __sys_sendmsg+0xc4/0x210
+  do_syscall_64+0x232/0x7f0
+  return_from_SYSCALL_64+0x0/0x75
+
+ Freed by task 112:
+  kasan_slab_free+0x71/0xc0
+  kfree+0x114/0x320
+  rcu_process_callbacks+0xc3f/0x1600
+  __do_softirq+0x2bf/0xc06
+
+ The buggy address belongs to the object at ffff881b83dae600
+  which belongs to the cache kmalloc-4096 of size 4096
+ The buggy address is located 24 bytes inside of
+  4096-byte region [ffff881b83dae600, ffff881b83daf600)
+ The buggy address belongs to the page:
+ page:ffffea006e0f6a00 count:1 mapcount:0 mapping:          (null) index:0x0 compound_mapcount: 0
+ flags: 0x17ffffc0008100(slab|head)
+ raw: 0017ffffc0008100 0000000000000000 0000000000000000 0000000100070007
+ raw: dead000000000100 dead000000000200 ffff880187c0e600 0000000000000000
+ page dumped because: kasan: bad access detected
+
+ Memory state around the buggy address:
+  ffff881b83dae500: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+  ffff881b83dae580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ >ffff881b83dae600: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+                             ^
+  ffff881b83dae680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+  ffff881b83dae700: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+ ==================================================================
+
+The problem is that the htnode is freed before the linked knodes and the
+latter will try to access the first at u32_destroy_key() time.
+This change addresses the issue using the htnode refcnt to guarantee
+the correct free order. While at it also add a RCU annotation,
+to keep sparse happy.
+
+v1 -> v2: use rtnl_dereference() instead of RCU read locks
+v2 -> v3:
+  - don't check refcnt in u32_destroy_hnode()
+  - cleaned-up u32_destroy() implementation
+  - cleaned-up code comment
+v3 -> v4:
+  - dropped unneeded comment
+
+Reported-by: Li Shuang <shuali@redhat.com>
+Fixes: c0d378ef1266 ("net_sched: use tcf_queue_work() in u32 filter")
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/sched/cls_u32.c |   21 +++++++++++----------
+ 1 file changed, 11 insertions(+), 10 deletions(-)
+
+--- a/net/sched/cls_u32.c
++++ b/net/sched/cls_u32.c
+@@ -397,10 +397,12 @@ static int u32_init(struct tcf_proto *tp
+ static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n,
+                          bool free_pf)
+ {
++      struct tc_u_hnode *ht = rtnl_dereference(n->ht_down);
++
+       tcf_exts_destroy(&n->exts);
+       tcf_exts_put_net(&n->exts);
+-      if (n->ht_down)
+-              n->ht_down->refcnt--;
++      if (ht && --ht->refcnt == 0)
++              kfree(ht);
+ #ifdef CONFIG_CLS_U32_PERF
+       if (free_pf)
+               free_percpu(n->pf);
+@@ -653,16 +655,15 @@ static void u32_destroy(struct tcf_proto
+               hlist_del(&tp_c->hnode);
+-              for (ht = rtnl_dereference(tp_c->hlist);
+-                   ht;
+-                   ht = rtnl_dereference(ht->next)) {
+-                      ht->refcnt--;
+-                      u32_clear_hnode(tp, ht);
+-              }
+-
+               while ((ht = rtnl_dereference(tp_c->hlist)) != NULL) {
++                      u32_clear_hnode(tp, ht);
+                       RCU_INIT_POINTER(tp_c->hlist, ht->next);
+-                      kfree_rcu(ht, rcu);
++
++                      /* u32_destroy_key() will later free ht for us, if it's
++                       * still referenced by some knode
++                       */
++                      if (--ht->refcnt == 0)
++                              kfree_rcu(ht, rcu);
+               }
+               idr_destroy(&tp_c->handle_idr);
diff --git a/queue-4.15/cxgb4-fix-trailing-zero-in-cim-la-dump.patch b/queue-4.15/cxgb4-fix-trailing-zero-in-cim-la-dump.patch
new file mode 100644 (file)
index 0000000..48e6134
--- /dev/null
@@ -0,0 +1,44 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
+Date: Thu, 15 Feb 2018 18:20:01 +0530
+Subject: cxgb4: fix trailing zero in CIM LA dump
+
+From: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
+
+
+[ Upstream commit e6f02a4d57cc438099bc8abfba43ba1400d77b38 ]
+
+Set correct size of the CIM LA dump for T6.
+
+Fixes: 27887bc7cb7f ("cxgb4: collect hardware LA dumps")
+Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
+Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c   |    2 +-
+ drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c |    2 +-
+ 2 files changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c
++++ b/drivers/net/ethernet/chelsio/cxgb4/cudbg_lib.c
+@@ -156,7 +156,7 @@ int cudbg_collect_cim_la(struct cudbg_in
+       if (is_t6(padap->params.chip)) {
+               size = padap->params.cim_la_size / 10 + 1;
+-              size *= 11 * sizeof(u32);
++              size *= 10 * sizeof(u32);
+       } else {
+               size = padap->params.cim_la_size / 8;
+               size *= 8 * sizeof(u32);
+--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c
++++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_cudbg.c
+@@ -97,7 +97,7 @@ static u32 cxgb4_get_entity_length(struc
+       case CUDBG_CIM_LA:
+               if (is_t6(adap->params.chip)) {
+                       len = adap->params.cim_la_size / 10 + 1;
+-                      len *= 11 * sizeof(u32);
++                      len *= 10 * sizeof(u32);
+               } else {
+                       len = adap->params.cim_la_size / 8;
+                       len *= 8 * sizeof(u32);
diff --git a/queue-4.15/doc-change-the-min-default-value-of-tcp_wmem-tcp_rmem.patch b/queue-4.15/doc-change-the-min-default-value-of-tcp_wmem-tcp_rmem.patch
new file mode 100644 (file)
index 0000000..f036721
--- /dev/null
@@ -0,0 +1,42 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
+Date: Sun, 4 Feb 2018 18:07:10 -0800
+Subject: doc: Change the min default value of tcp_wmem/tcp_rmem.
+
+From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
+
+
+[ Upstream commit a61a86f8db92923a2a4c857c49a795bcae754497 ]
+
+SK_MEM_QUANTUM was changed from PAGE_SIZE to 4096, and the
+tcp_wmem/tcp_rmem min default values are now 4096.
+
+Fixes: bd68a2a854ad ("net: set SK_MEM_QUANTUM to 4096")
+Cc: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ Documentation/networking/ip-sysctl.txt |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/Documentation/networking/ip-sysctl.txt
++++ b/Documentation/networking/ip-sysctl.txt
+@@ -508,7 +508,7 @@ tcp_rmem - vector of 3 INTEGERs: min, de
+       min: Minimal size of receive buffer used by TCP sockets.
+       It is guaranteed to each TCP socket, even under moderate memory
+       pressure.
+-      Default: 1 page
++      Default: 4K
+       default: initial size of receive buffer used by TCP sockets.
+       This value overrides net.core.rmem_default used by other protocols.
+@@ -666,7 +666,7 @@ tcp_window_scaling - BOOLEAN
+ tcp_wmem - vector of 3 INTEGERs: min, default, max
+       min: Amount of memory reserved for send buffers for TCP sockets.
+       Each TCP socket has rights to use it due to fact of its birth.
+-      Default: 1 page
++      Default: 4K
+       default: initial size of send buffer used by TCP sockets.  This
+       value overrides net.core.wmem_default used by other protocols.
diff --git a/queue-4.15/fib_semantics-don-t-match-route-with-mismatching-tclassid.patch b/queue-4.15/fib_semantics-don-t-match-route-with-mismatching-tclassid.patch
new file mode 100644 (file)
index 0000000..0cb24aa
--- /dev/null
@@ -0,0 +1,66 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Stefano Brivio <sbrivio@redhat.com>
+Date: Thu, 15 Feb 2018 09:46:03 +0100
+Subject: fib_semantics: Don't match route with mismatching tclassid
+
+From: Stefano Brivio <sbrivio@redhat.com>
+
+
+[ Upstream commit a8c6db1dfd1b1d18359241372bb204054f2c3174 ]
+
+In fib_nh_match(), if output interface or gateway are passed in
+the FIB configuration, we don't have to check next hops of
+multipath routes to conclude whether we have a match or not.
+
+However, we might still have routes with different realms
+matching the same output interface and gateway configuration,
+and this needs to cause the match to fail. Otherwise the first
+route inserted in the FIB will match, regardless of the realms:
+
+ # ip route add 1.1.1.1 dev eth0 table 1234 realms 1/2
+ # ip route append 1.1.1.1 dev eth0 table 1234 realms 3/4
+ # ip route list table 1234
+ 1.1.1.1 dev eth0 scope link realms 1/2
+ 1.1.1.1 dev eth0 scope link realms 3/4
+ # ip route del 1.1.1.1 dev eth0 table 1234 realms 3/4
+ # ip route list table 1234
+ 1.1.1.1 dev eth0 scope link realms 3/4
+
+whereas route with realms 3/4 should have been deleted instead.
+
+Explicitly check for fc_flow passed in the FIB configuration
+(this comes from RTA_FLOW extracted by rtm_to_fib_config()) and
+fail matching if it differs from nh_tclassid.
+
+The handling of RTA_FLOW for multipath routes later in
+fib_nh_match() is still needed, as we can have multiple RTA_FLOW
+attributes that need to be matched against the tclassid of each
+next hop.
+
+v2: Check that fc_flow is set before discarding the match, so
+    that the user can still select the first matching rule by
+    not specifying any realm, as suggested by David Ahern.
+
+Reported-by: Jianlin Shi <jishi@redhat.com>
+Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
+Acked-by: David Ahern <dsahern@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/fib_semantics.c |    5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/net/ipv4/fib_semantics.c
++++ b/net/ipv4/fib_semantics.c
+@@ -646,6 +646,11 @@ int fib_nh_match(struct fib_config *cfg,
+                                           fi->fib_nh, cfg, extack))
+                               return 1;
+               }
++#ifdef CONFIG_IP_ROUTE_CLASSID
++              if (cfg->fc_flow &&
++                  cfg->fc_flow != fi->fib_nh->nh_tclassid)
++                      return 1;
++#endif
+               if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
+                   (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
+                       return 0;
diff --git a/queue-4.15/hdlc_ppp-carrier-detect-ok-don-t-turn-off-negotiation.patch b/queue-4.15/hdlc_ppp-carrier-detect-ok-don-t-turn-off-negotiation.patch
new file mode 100644 (file)
index 0000000..156c0fa
--- /dev/null
@@ -0,0 +1,37 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Denis Du <dudenis2000@yahoo.ca>
+Date: Sat, 24 Feb 2018 16:51:42 -0500
+Subject: hdlc_ppp: carrier detect ok, don't turn off negotiation
+
+From: Denis Du <dudenis2000@yahoo.ca>
+
+
+[ Upstream commit b6c3bad1ba83af1062a7ff6986d9edc4f3d7fc8e ]
+
+Sometimes a physical line has just enough noise to make the protocol
+handshaking fail while carrier detect is still good. After the noise is
+removed, nothing triggers the protocol to start again, so the link never
+comes back. The fix is to not terminate the protocol handshaking while
+the carrier is still on.
+
+Signed-off-by: Denis Du <dudenis2000@yahoo.ca>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/wan/hdlc_ppp.c |    5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/drivers/net/wan/hdlc_ppp.c
++++ b/drivers/net/wan/hdlc_ppp.c
+@@ -574,7 +574,10 @@ static void ppp_timer(struct timer_list
+                       ppp_cp_event(proto->dev, proto->pid, TO_GOOD, 0, 0,
+                                    0, NULL);
+                       proto->restart_counter--;
+-              } else
++              } else if (netif_carrier_ok(proto->dev))
++                      ppp_cp_event(proto->dev, proto->pid, TO_GOOD, 0, 0,
++                                   0, NULL);
++              else
+                       ppp_cp_event(proto->dev, proto->pid, TO_BAD, 0, 0,
+                                    0, NULL);
+               break;
diff --git a/queue-4.15/ipv6-sit-work-around-bogus-gcc-8-wrestrict-warning.patch b/queue-4.15/ipv6-sit-work-around-bogus-gcc-8-wrestrict-warning.patch
new file mode 100644 (file)
index 0000000..d891305
--- /dev/null
@@ -0,0 +1,49 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Arnd Bergmann <arnd@arndb.de>
+Date: Thu, 22 Feb 2018 16:55:34 +0100
+Subject: ipv6 sit: work around bogus gcc-8 -Wrestrict warning
+
+From: Arnd Bergmann <arnd@arndb.de>
+
+
+[ Upstream commit ca79bec237f5809a7c3c59bd41cd0880aa889966 ]
+
+gcc-8 has a new warning that detects overlapping input and output arguments
+in memcpy(). It triggers for sit_init_net() calling ipip6_tunnel_clone_6rd(),
+which is actually correct:
+
+net/ipv6/sit.c: In function 'sit_init_net':
+net/ipv6/sit.c:192:3: error: 'memcpy' source argument is the same as destination [-Werror=restrict]
+
+The problem here is that the logic detecting the memcpy() arguments finds them
+to be the same, but the conditional that tests for the input and output of
+ipip6_tunnel_clone_6rd() to be identical is not a compile-time constant.
+
+We know that netdev_priv(t->dev) is the same as t for a tunnel device,
+and comparing "dev" directly here lets the compiler figure out as well
+that 'dev == sitn->fb_tunnel_dev' when called from sit_init_net(), so
+it no longer warns.
+
+This code is old, so Cc stable to make sure that we don't get the warning
+for older kernels built with new gcc.
+
+Cc: Martin Sebor <msebor@gmail.com>
+Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=83456
+Signed-off-by: Arnd Bergmann <arnd@arndb.de>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv6/sit.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/ipv6/sit.c
++++ b/net/ipv6/sit.c
+@@ -182,7 +182,7 @@ static void ipip6_tunnel_clone_6rd(struc
+ #ifdef CONFIG_IPV6_SIT_6RD
+       struct ip_tunnel *t = netdev_priv(dev);
+-      if (t->dev == sitn->fb_tunnel_dev) {
++      if (dev == sitn->fb_tunnel_dev) {
+               ipv6_addr_set(&t->ip6rd.prefix, htonl(0x20020000), 0, 0, 0);
+               t->ip6rd.relay_prefix = 0;
+               t->ip6rd.prefixlen = 16;
diff --git a/queue-4.15/l2tp-don-t-use-inet_shutdown-on-ppp-session-destroy.patch b/queue-4.15/l2tp-don-t-use-inet_shutdown-on-ppp-session-destroy.patch
new file mode 100644 (file)
index 0000000..595c164
--- /dev/null
@@ -0,0 +1,118 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: James Chapman <jchapman@katalix.com>
+Date: Fri, 23 Feb 2018 17:45:44 +0000
+Subject: l2tp: don't use inet_shutdown on ppp session destroy
+
+From: James Chapman <jchapman@katalix.com>
+
+
+[ Upstream commit 225eb26489d05c679a4c4197ffcb81c81e9dcaf4 ]
+
+Previously, if a ppp session was closed, we called inet_shutdown to mark
+the socket as unconnected such that userspace would get errors and
+then close the socket. This could race with userspace closing the
+socket. Instead, leave userspace to close the socket in its own time
+(our session will be detached anyway).
+
+BUG: KASAN: use-after-free in inet_shutdown+0x5d/0x1c0
+Read of size 4 at addr ffff880010ea3ac0 by task syzbot_347bd5ac/8296
+
+CPU: 3 PID: 8296 Comm: syzbot_347bd5ac Not tainted 4.16.0-rc1+ #91
+Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
+Call Trace:
+ dump_stack+0x101/0x157
+ ? inet_shutdown+0x5d/0x1c0
+ print_address_description+0x78/0x260
+ ? inet_shutdown+0x5d/0x1c0
+ kasan_report+0x240/0x360
+ __asan_load4+0x78/0x80
+ inet_shutdown+0x5d/0x1c0
+ ? pppol2tp_show+0x80/0x80
+ pppol2tp_session_close+0x68/0xb0
+ l2tp_tunnel_closeall+0x199/0x210
+ ? udp_v6_flush_pending_frames+0x90/0x90
+ l2tp_udp_encap_destroy+0x6b/0xc0
+ ? l2tp_tunnel_del_work+0x2e0/0x2e0
+ udpv6_destroy_sock+0x8c/0x90
+ sk_common_release+0x47/0x190
+ udp_lib_close+0x15/0x20
+ inet_release+0x85/0xd0
+ inet6_release+0x43/0x60
+ sock_release+0x53/0x100
+ ? sock_alloc_file+0x260/0x260
+ sock_close+0x1b/0x20
+ __fput+0x19f/0x380
+ ____fput+0x1a/0x20
+ task_work_run+0xd2/0x110
+ exit_to_usermode_loop+0x18d/0x190
+ do_syscall_64+0x389/0x3b0
+ entry_SYSCALL_64_after_hwframe+0x26/0x9b
+RIP: 0033:0x7fe240a45259
+RSP: 002b:00007fe241132df8 EFLAGS: 00000297 ORIG_RAX: 0000000000000003
+RAX: 0000000000000000 RBX: 0000000000000000 RCX: 00007fe240a45259
+RDX: 00007fe240a45259 RSI: 0000000000000000 RDI: 00000000000000a5
+RBP: 00007fe241132e20 R08: 00007fe241133700 R09: 0000000000000000
+R10: 00007fe241133700 R11: 0000000000000297 R12: 0000000000000000
+R13: 00007ffc49aff84f R14: 0000000000000000 R15: 00007fe241141040
+
+Allocated by task 8331:
+ save_stack+0x43/0xd0
+ kasan_kmalloc+0xad/0xe0
+ kasan_slab_alloc+0x12/0x20
+ kmem_cache_alloc+0x144/0x3e0
+ sock_alloc_inode+0x22/0x130
+ alloc_inode+0x3d/0xf0
+ new_inode_pseudo+0x1c/0x90
+ sock_alloc+0x30/0x110
+ __sock_create+0xaa/0x4c0
+ SyS_socket+0xbe/0x130
+ do_syscall_64+0x128/0x3b0
+ entry_SYSCALL_64_after_hwframe+0x26/0x9b
+
+Freed by task 8314:
+ save_stack+0x43/0xd0
+ __kasan_slab_free+0x11a/0x170
+ kasan_slab_free+0xe/0x10
+ kmem_cache_free+0x88/0x2b0
+ sock_destroy_inode+0x49/0x50
+ destroy_inode+0x77/0xb0
+ evict+0x285/0x340
+ iput+0x429/0x530
+ dentry_unlink_inode+0x28c/0x2c0
+ __dentry_kill+0x1e3/0x2f0
+ dput.part.21+0x500/0x560
+ dput+0x24/0x30
+ __fput+0x2aa/0x380
+ ____fput+0x1a/0x20
+ task_work_run+0xd2/0x110
+ exit_to_usermode_loop+0x18d/0x190
+ do_syscall_64+0x389/0x3b0
+ entry_SYSCALL_64_after_hwframe+0x26/0x9b
+
+Fixes: fd558d186df2c ("l2tp: Split pppol2tp patch into separate l2tp and ppp parts")
+Signed-off-by: James Chapman <jchapman@katalix.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/l2tp/l2tp_ppp.c |   10 ----------
+ 1 file changed, 10 deletions(-)
+
+--- a/net/l2tp/l2tp_ppp.c
++++ b/net/l2tp/l2tp_ppp.c
+@@ -420,16 +420,6 @@ abort:
+  */
+ static void pppol2tp_session_close(struct l2tp_session *session)
+ {
+-      struct sock *sk;
+-
+-      BUG_ON(session->magic != L2TP_SESSION_MAGIC);
+-
+-      sk = pppol2tp_session_get_sock(session);
+-      if (sk) {
+-              if (sk->sk_socket)
+-                      inet_shutdown(sk->sk_socket, SEND_SHUTDOWN);
+-              sock_put(sk);
+-      }
+ }
+ /* Really kill the session socket. (Called from sock_put() if
diff --git a/queue-4.15/l2tp-don-t-use-inet_shutdown-on-tunnel-destroy.patch b/queue-4.15/l2tp-don-t-use-inet_shutdown-on-tunnel-destroy.patch
new file mode 100644 (file)
index 0000000..93f6a0e
--- /dev/null
@@ -0,0 +1,87 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: James Chapman <jchapman@katalix.com>
+Date: Fri, 23 Feb 2018 17:45:43 +0000
+Subject: l2tp: don't use inet_shutdown on tunnel destroy
+
+From: James Chapman <jchapman@katalix.com>
+
+
+[ Upstream commit 76a6abdb2513ad4ea0ded55d2c66160491f2e848 ]
+
+Previously, if a tunnel was closed, we called inet_shutdown to mark
+the socket as unconnected such that userspace would get errors and
+then close the socket. This could race with userspace closing the
+socket. Instead, leave userspace to close the socket in its own time
+(our tunnel will be detached anyway).
+
+BUG: unable to handle kernel NULL pointer dereference at 00000000000000a0
+IP: __lock_acquire+0x263/0x1630
+PGD 0 P4D 0
+Oops: 0000 [#1] SMP KASAN
+Modules linked in:
+CPU: 2 PID: 42 Comm: kworker/u8:2 Not tainted 4.15.0-rc7+ #129
+Workqueue: l2tp l2tp_tunnel_del_work
+RIP: 0010:__lock_acquire+0x263/0x1630
+RSP: 0018:ffff88001a37fc70 EFLAGS: 00010002
+RAX: 0000000000000001 RBX: 0000000000000088 RCX: 0000000000000000
+RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
+RBP: ffff88001a37fd18 R08: 0000000000000001 R09: 0000000000000000
+R10: 0000000000000000 R11: 00000000000076fd R12: 00000000000000a0
+R13: ffff88001a3722c0 R14: 0000000000000001 R15: 0000000000000000
+FS:  0000000000000000(0000) GS:ffff88001ad00000(0000) knlGS:0000000000000000
+CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 00000000000000a0 CR3: 000000001730b000 CR4: 00000000000006e0
+Call Trace:
+ ? __lock_acquire+0xc77/0x1630
+ ? console_trylock+0x11/0xa0
+ lock_acquire+0x117/0x230
+ ? lock_sock_nested+0x3a/0xa0
+ _raw_spin_lock_bh+0x3a/0x50
+ ? lock_sock_nested+0x3a/0xa0
+ lock_sock_nested+0x3a/0xa0
+ inet_shutdown+0x33/0xf0
+ l2tp_tunnel_del_work+0x60/0xef
+ process_one_work+0x1ea/0x5f0
+ ? process_one_work+0x162/0x5f0
+ worker_thread+0x48/0x3e0
+ ? trace_hardirqs_on+0xd/0x10
+ kthread+0x108/0x140
+ ? process_one_work+0x5f0/0x5f0
+ ? kthread_stop+0x2a0/0x2a0
+ ret_from_fork+0x24/0x30
+Code: 00 41 81 ff ff 1f 00 00 0f 87 7a 13 00 00 45 85 f6 49 8b 85
+68 08 00 00 0f 84 ae 03 00 00 c7 44 24 18 00 00 00 00 e9 f0 00 00 00 <49> 81 3c
+24 80 93 3f 83 b8 00 00 00 00 44 0f 44 c0 83 fe 01 0f
+RIP: __lock_acquire+0x263/0x1630 RSP: ffff88001a37fc70
+CR2: 00000000000000a0
+
+Fixes: 309795f4bec2d ("l2tp: Add netlink control API for L2TP")
+Signed-off-by: James Chapman <jchapman@katalix.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/l2tp/l2tp_core.c |   11 ++---------
+ 1 file changed, 2 insertions(+), 9 deletions(-)
+
+--- a/net/l2tp/l2tp_core.c
++++ b/net/l2tp/l2tp_core.c
+@@ -1336,17 +1336,10 @@ static void l2tp_tunnel_del_work(struct
+       sock = sk->sk_socket;
+-      /* If the tunnel socket was created by userspace, then go through the
+-       * inet layer to shut the socket down, and let userspace close it.
+-       * Otherwise, if we created the socket directly within the kernel, use
++      /* If the tunnel socket was created within the kernel, use
+        * the sk API to release it here.
+-       * In either case the tunnel resources are freed in the socket
+-       * destructor when the tunnel socket goes away.
+        */
+-      if (tunnel->fd >= 0) {
+-              if (sock)
+-                      inet_shutdown(sock, 2);
+-      } else {
++      if (tunnel->fd < 0) {
+               if (sock) {
+                       kernel_sock_shutdown(sock, SHUT_RDWR);
+                       sock_release(sock);
diff --git a/queue-4.15/l2tp-fix-race-in-pppol2tp_release-with-session-object-destroy.patch b/queue-4.15/l2tp-fix-race-in-pppol2tp_release-with-session-object-destroy.patch
new file mode 100644 (file)
index 0000000..e880747
--- /dev/null
@@ -0,0 +1,172 @@
+From foo@baz Tue Mar  6 19:02:57 PST 2018
+From: James Chapman <jchapman@katalix.com>
+Date: Fri, 23 Feb 2018 17:45:46 +0000
+Subject: l2tp: fix race in pppol2tp_release with session object destroy
+
+From: James Chapman <jchapman@katalix.com>
+
+
+[ Upstream commit d02ba2a6110c530a32926af8ad441111774d2893 ]
+
+pppol2tp_release uses call_rcu to put the final ref on its socket. But
+the session object doesn't hold a ref on the session socket so may be
+freed while the pppol2tp_put_sk RCU callback is scheduled. Fix this by
+having the session hold a ref on its socket until the session is
+destroyed. It is this ref that is dropped via call_rcu.
+
+Sessions are also deleted via l2tp_tunnel_closeall. This must now also put
+the final ref via call_rcu. So move the call_rcu call site into
+pppol2tp_session_close so that this happens in both destroy paths. A
+common destroy path should really be implemented, perhaps with
+l2tp_tunnel_closeall calling l2tp_session_delete like pppol2tp_release
+does, but this will be looked at later.
+
+ODEBUG: activate active (active state 1) object type: rcu_head hint:           (null)
+WARNING: CPU: 3 PID: 13407 at lib/debugobjects.c:291 debug_print_object+0x166/0x220
+Modules linked in:
+CPU: 3 PID: 13407 Comm: syzbot_19c09769 Not tainted 4.16.0-rc2+ #38
+Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
+RIP: 0010:debug_print_object+0x166/0x220
+RSP: 0018:ffff880013647a00 EFLAGS: 00010082
+RAX: dffffc0000000008 RBX: 0000000000000003 RCX: ffffffff814d3333
+RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffff88001a59f6d0
+RBP: ffff880013647a40 R08: 0000000000000000 R09: 0000000000000001
+R10: ffff8800136479a8 R11: 0000000000000000 R12: 0000000000000001
+R13: ffffffff86161420 R14: ffffffff85648b60 R15: 0000000000000000
+FS:  0000000000000000(0000) GS:ffff88001a580000(0000) knlGS:0000000000000000
+CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 0000000020e77000 CR3: 0000000006022000 CR4: 00000000000006e0
+Call Trace:
+ debug_object_activate+0x38b/0x530
+ ? debug_object_assert_init+0x3b0/0x3b0
+ ? __mutex_unlock_slowpath+0x85/0x8b0
+ ? pppol2tp_session_destruct+0x110/0x110
+ __call_rcu.constprop.66+0x39/0x890
+ ? __call_rcu.constprop.66+0x39/0x890
+ call_rcu_sched+0x17/0x20
+ pppol2tp_release+0x2c7/0x440
+ ? fcntl_setlk+0xca0/0xca0
+ ? sock_alloc_file+0x340/0x340
+ sock_release+0x92/0x1e0
+ sock_close+0x1b/0x20
+ __fput+0x296/0x6e0
+ ____fput+0x1a/0x20
+ task_work_run+0x127/0x1a0
+ do_exit+0x7f9/0x2ce0
+ ? SYSC_connect+0x212/0x310
+ ? mm_update_next_owner+0x690/0x690
+ ? up_read+0x1f/0x40
+ ? __do_page_fault+0x3c8/0xca0
+ do_group_exit+0x10d/0x330
+ ? do_group_exit+0x330/0x330
+ SyS_exit_group+0x22/0x30
+ do_syscall_64+0x1e0/0x730
+ ? trace_hardirqs_off_thunk+0x1a/0x1c
+ entry_SYSCALL_64_after_hwframe+0x42/0xb7
+RIP: 0033:0x7f362e471259
+RSP: 002b:00007ffe389abe08 EFLAGS: 00000202 ORIG_RAX: 00000000000000e7
+RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f362e471259
+RDX: 00007f362e471259 RSI: 000000000000002e RDI: 0000000000000000
+RBP: 00007ffe389abe30 R08: 0000000000000000 R09: 00007f362e944270
+R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000400b60
+R13: 00007ffe389abf50 R14: 0000000000000000 R15: 0000000000000000
+Code: 8d 3c dd a0 8f 64 85 48 89 fa 48 c1 ea 03 80 3c 02 00 75 7b 48 8b 14 dd a0 8f 64 85 4c 89 f6 48 c7 c7 20 85 64 85 e
+8 2a 55 14 ff <0f> 0b 83 05 ad 2a 68 04 01 48 83 c4 18 5b 41 5c 41 5d 41 5e 41
+
+Fixes: ee40fb2e1eb5b ("l2tp: protect sock pointer of struct pppol2tp_session with RCU")
+Signed-off-by: James Chapman <jchapman@katalix.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/l2tp/l2tp_ppp.c |   52 +++++++++++++++++++++++++++-------------------------
+ 1 file changed, 27 insertions(+), 25 deletions(-)
+
+--- a/net/l2tp/l2tp_ppp.c
++++ b/net/l2tp/l2tp_ppp.c
+@@ -416,10 +416,28 @@ abort:
+  * Session (and tunnel control) socket create/destroy.
+  *****************************************************************************/
++static void pppol2tp_put_sk(struct rcu_head *head)
++{
++      struct pppol2tp_session *ps;
++
++      ps = container_of(head, typeof(*ps), rcu);
++      sock_put(ps->__sk);
++}
++
+ /* Called by l2tp_core when a session socket is being closed.
+  */
+ static void pppol2tp_session_close(struct l2tp_session *session)
+ {
++      struct pppol2tp_session *ps;
++
++      ps = l2tp_session_priv(session);
++      mutex_lock(&ps->sk_lock);
++      ps->__sk = rcu_dereference_protected(ps->sk,
++                                           lockdep_is_held(&ps->sk_lock));
++      RCU_INIT_POINTER(ps->sk, NULL);
++      if (ps->__sk)
++              call_rcu(&ps->rcu, pppol2tp_put_sk);
++      mutex_unlock(&ps->sk_lock);
+ }
+ /* Really kill the session socket. (Called from sock_put() if
+@@ -439,14 +457,6 @@ static void pppol2tp_session_destruct(st
+       }
+ }
+-static void pppol2tp_put_sk(struct rcu_head *head)
+-{
+-      struct pppol2tp_session *ps;
+-
+-      ps = container_of(head, typeof(*ps), rcu);
+-      sock_put(ps->__sk);
+-}
+-
+ /* Called when the PPPoX socket (session) is closed.
+  */
+ static int pppol2tp_release(struct socket *sock)
+@@ -470,26 +480,17 @@ static int pppol2tp_release(struct socke
+       sock_orphan(sk);
+       sock->sk = NULL;
++      /* If the socket is associated with a session,
++       * l2tp_session_delete will call pppol2tp_session_close which
++       * will drop the session's ref on the socket.
++       */
+       session = pppol2tp_sock_to_session(sk);
+-
+-      if (session != NULL) {
+-              struct pppol2tp_session *ps;
+-
++      if (session) {
+               l2tp_session_delete(session);
+-
+-              ps = l2tp_session_priv(session);
+-              mutex_lock(&ps->sk_lock);
+-              ps->__sk = rcu_dereference_protected(ps->sk,
+-                                                   lockdep_is_held(&ps->sk_lock));
+-              RCU_INIT_POINTER(ps->sk, NULL);
+-              mutex_unlock(&ps->sk_lock);
+-              call_rcu(&ps->rcu, pppol2tp_put_sk);
+-
+-              /* Rely on the sock_put() call at the end of the function for
+-               * dropping the reference held by pppol2tp_sock_to_session().
+-               * The last reference will be dropped by pppol2tp_put_sk().
+-               */
++              /* drop the ref obtained by pppol2tp_sock_to_session */
++              sock_put(sk);
+       }
++
+       release_sock(sk);
+       /* This will delete the session context via
+@@ -786,6 +787,7 @@ static int pppol2tp_connect(struct socke
+ out_no_ppp:
+       /* This is how we get the session context from the socket. */
++      sock_hold(sk);
+       sk->sk_user_data = session;
+       rcu_assign_pointer(ps->sk, sk);
+       mutex_unlock(&ps->sk_lock);
diff --git a/queue-4.15/l2tp-fix-races-with-tunnel-socket-close.patch b/queue-4.15/l2tp-fix-races-with-tunnel-socket-close.patch
new file mode 100644 (file)
index 0000000..6b13f6c
--- /dev/null
@@ -0,0 +1,417 @@
+From foo@baz Tue Mar  6 19:02:57 PST 2018
+From: James Chapman <jchapman@katalix.com>
+Date: Fri, 23 Feb 2018 17:45:45 +0000
+Subject: l2tp: fix races with tunnel socket close
+
+From: James Chapman <jchapman@katalix.com>
+
+
+[ Upstream commit d00fa9adc528c1b0e64d532556764852df8bd7b9 ]
+
+The tunnel socket tunnel->sock (struct sock) is accessed when
+preparing a new ppp session on a tunnel at pppol2tp_session_init. If
+the socket is closed by a thread while another is creating a new
+session, the threads race. In pppol2tp_connect, the tunnel object may
+be created if the pppol2tp socket is associated with the special
+session_id 0 and the tunnel socket is looked up using the provided
+fd. When handling this, pppol2tp_connect cannot sock_hold the tunnel
+socket to prevent it being destroyed during pppol2tp_connect since
+this may itself race with the socket being destroyed. Doing
+sockfd_lookup in pppol2tp_connect isn't sufficient to prevent
+tunnel->sock going away either because a given tunnel socket fd may be
+reused between calls to pppol2tp_connect. Instead, have
+l2tp_tunnel_create sock_hold the tunnel socket before it does
+sockfd_put. This ensures that the tunnel's socket is always extant
+while the tunnel object exists. Hold a ref on the socket until the
+tunnel is destroyed and ensure that all tunnel destroy paths go
+through a common function (l2tp_tunnel_delete) since this will do the
+final sock_put to release the tunnel socket.
+
+Since the tunnel's socket is now guaranteed to exist if the tunnel
+exists, we no longer need to use sockfd_lookup via l2tp_sock_to_tunnel
+to derive the tunnel from the socket since this is always
+sk_user_data.
+
+Also, sessions no longer sock_hold the tunnel socket since sessions
+already hold a tunnel ref and the tunnel sock will not be freed until
+the tunnel is freed. Removing these sock_holds in
+l2tp_session_register avoids a possible sock leak in the
+pppol2tp_connect error path if l2tp_session_register succeeds but
+attaching a ppp channel fails. The pppol2tp_connect error path could
+have been fixed instead and have the sock ref dropped when the session
+is freed, but doing a sock_put of the tunnel socket when the session
+is freed would require a new session_free callback. It is simpler to
+just remove the sock_hold of the tunnel socket in
+l2tp_session_register, now that the tunnel socket lifetime is
+guaranteed.
+
+Finally, some init code in l2tp_tunnel_create is reordered to ensure
+that the new tunnel object's refcount is set and the tunnel socket ref
+is taken before the tunnel socket destructor callbacks are set.
+
+kasan: CONFIG_KASAN_INLINE enabled
+kasan: GPF could be caused by NULL-ptr deref or user memory access
+general protection fault: 0000 [#1] SMP KASAN
+Modules linked in:
+CPU: 0 PID: 4360 Comm: syzbot_19c09769 Not tainted 4.16.0-rc2+ #34
+Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
+RIP: 0010:pppol2tp_session_init+0x1d6/0x500
+RSP: 0018:ffff88001377fb40 EFLAGS: 00010212
+RAX: dffffc0000000000 RBX: ffff88001636a940 RCX: ffffffff84836c1d
+RDX: 0000000000000045 RSI: 0000000055976744 RDI: 0000000000000228
+RBP: ffff88001377fb60 R08: ffffffff84836bc8 R09: 0000000000000002
+R10: ffff88001377fab8 R11: 0000000000000001 R12: 0000000000000000
+R13: ffff88001636aac8 R14: ffff8800160f81c0 R15: 1ffff100026eff76
+FS:  00007ffb3ea66700(0000) GS:ffff88001a400000(0000) knlGS:0000000000000000
+CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 0000000020e77000 CR3: 0000000016261000 CR4: 00000000000006f0
+Call Trace:
+ pppol2tp_connect+0xd18/0x13c0
+ ? pppol2tp_session_create+0x170/0x170
+ ? __might_fault+0x115/0x1d0
+ ? lock_downgrade+0x860/0x860
+ ? __might_fault+0xe5/0x1d0
+ ? security_socket_connect+0x8e/0xc0
+ SYSC_connect+0x1b6/0x310
+ ? SYSC_bind+0x280/0x280
+ ? __do_page_fault+0x5d1/0xca0
+ ? up_read+0x1f/0x40
+ ? __do_page_fault+0x3c8/0xca0
+ SyS_connect+0x29/0x30
+ ? SyS_accept+0x40/0x40
+ do_syscall_64+0x1e0/0x730
+ ? trace_hardirqs_off_thunk+0x1a/0x1c
+ entry_SYSCALL_64_after_hwframe+0x42/0xb7
+RIP: 0033:0x7ffb3e376259
+RSP: 002b:00007ffeda4f6508 EFLAGS: 00000202 ORIG_RAX: 000000000000002a
+RAX: ffffffffffffffda RBX: 0000000020e77012 RCX: 00007ffb3e376259
+RDX: 000000000000002e RSI: 0000000020e77000 RDI: 0000000000000004
+RBP: 00007ffeda4f6540 R08: 0000000000000000 R09: 0000000000000000
+R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000400b60
+R13: 00007ffeda4f6660 R14: 0000000000000000 R15: 0000000000000000
+Code: 80 3d b0 ff 06 02 00 0f 84 07 02 00 00 e8 13 d6 db fc 49 8d bc 24 28 02 00 00 48 b8 00 00 00 00 00 fc ff df 48 89 f
+a 48 c1 ea 03 <80> 3c 02 00 0f 85 ed 02 00 00 4d 8b a4 24 28 02 00 00 e8 13 16
+
+Fixes: 80d84ef3ff1dd ("l2tp: prevent l2tp_tunnel_delete racing with userspace close")
+Signed-off-by: James Chapman <jchapman@katalix.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/l2tp/l2tp_core.c |  117 ++++++++++++++-------------------------------------
+ net/l2tp/l2tp_core.h |   23 ----------
+ net/l2tp/l2tp_ip.c   |   10 +---
+ net/l2tp/l2tp_ip6.c  |    8 +--
+ 4 files changed, 42 insertions(+), 116 deletions(-)
+
+--- a/net/l2tp/l2tp_core.c
++++ b/net/l2tp/l2tp_core.c
+@@ -136,51 +136,6 @@ l2tp_session_id_hash_2(struct l2tp_net *
+ }
+-/* Lookup the tunnel socket, possibly involving the fs code if the socket is
+- * owned by userspace.  A struct sock returned from this function must be
+- * released using l2tp_tunnel_sock_put once you're done with it.
+- */
+-static struct sock *l2tp_tunnel_sock_lookup(struct l2tp_tunnel *tunnel)
+-{
+-      int err = 0;
+-      struct socket *sock = NULL;
+-      struct sock *sk = NULL;
+-
+-      if (!tunnel)
+-              goto out;
+-
+-      if (tunnel->fd >= 0) {
+-              /* Socket is owned by userspace, who might be in the process
+-               * of closing it.  Look the socket up using the fd to ensure
+-               * consistency.
+-               */
+-              sock = sockfd_lookup(tunnel->fd, &err);
+-              if (sock)
+-                      sk = sock->sk;
+-      } else {
+-              /* Socket is owned by kernelspace */
+-              sk = tunnel->sock;
+-              sock_hold(sk);
+-      }
+-
+-out:
+-      return sk;
+-}
+-
+-/* Drop a reference to a tunnel socket obtained via. l2tp_tunnel_sock_put */
+-static void l2tp_tunnel_sock_put(struct sock *sk)
+-{
+-      struct l2tp_tunnel *tunnel = l2tp_sock_to_tunnel(sk);
+-      if (tunnel) {
+-              if (tunnel->fd >= 0) {
+-                      /* Socket is owned by userspace */
+-                      sockfd_put(sk->sk_socket);
+-              }
+-              sock_put(sk);
+-      }
+-      sock_put(sk);
+-}
+-
+ /* Session hash list.
+  * The session_id SHOULD be random according to RFC2661, but several
+  * L2TP implementations (Cisco and Microsoft) use incrementing
+@@ -193,6 +148,13 @@ l2tp_session_id_hash(struct l2tp_tunnel
+       return &tunnel->session_hlist[hash_32(session_id, L2TP_HASH_BITS)];
+ }
++void l2tp_tunnel_free(struct l2tp_tunnel *tunnel)
++{
++      sock_put(tunnel->sock);
++      /* the tunnel is freed in the socket destructor */
++}
++EXPORT_SYMBOL(l2tp_tunnel_free);
++
+ /* Lookup a tunnel. A new reference is held on the returned tunnel. */
+ struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id)
+ {
+@@ -345,13 +307,11 @@ int l2tp_session_register(struct l2tp_se
+                       }
+               l2tp_tunnel_inc_refcount(tunnel);
+-              sock_hold(tunnel->sock);
+               hlist_add_head_rcu(&session->global_hlist, g_head);
+               spin_unlock_bh(&pn->l2tp_session_hlist_lock);
+       } else {
+               l2tp_tunnel_inc_refcount(tunnel);
+-              sock_hold(tunnel->sock);
+       }
+       hlist_add_head(&session->hlist, head);
+@@ -975,7 +935,7 @@ int l2tp_udp_encap_recv(struct sock *sk,
+ {
+       struct l2tp_tunnel *tunnel;
+-      tunnel = l2tp_sock_to_tunnel(sk);
++      tunnel = l2tp_tunnel(sk);
+       if (tunnel == NULL)
+               goto pass_up;
+@@ -983,13 +943,10 @@ int l2tp_udp_encap_recv(struct sock *sk,
+                tunnel->name, skb->len);
+       if (l2tp_udp_recv_core(tunnel, skb, tunnel->recv_payload_hook))
+-              goto pass_up_put;
++              goto pass_up;
+-      sock_put(sk);
+       return 0;
+-pass_up_put:
+-      sock_put(sk);
+ pass_up:
+       return 1;
+ }
+@@ -1223,7 +1180,6 @@ static void l2tp_tunnel_destruct(struct
+       l2tp_info(tunnel, L2TP_MSG_CONTROL, "%s: closing...\n", tunnel->name);
+-
+       /* Disable udp encapsulation */
+       switch (tunnel->encap) {
+       case L2TP_ENCAPTYPE_UDP:
+@@ -1246,12 +1202,11 @@ static void l2tp_tunnel_destruct(struct
+       list_del_rcu(&tunnel->list);
+       spin_unlock_bh(&pn->l2tp_tunnel_list_lock);
+-      tunnel->sock = NULL;
+-      l2tp_tunnel_dec_refcount(tunnel);
+-
+       /* Call the original destructor */
+       if (sk->sk_destruct)
+               (*sk->sk_destruct)(sk);
++
++      kfree_rcu(tunnel, rcu);
+ end:
+       return;
+ }
+@@ -1312,30 +1267,22 @@ EXPORT_SYMBOL_GPL(l2tp_tunnel_closeall);
+ /* Tunnel socket destroy hook for UDP encapsulation */
+ static void l2tp_udp_encap_destroy(struct sock *sk)
+ {
+-      struct l2tp_tunnel *tunnel = l2tp_sock_to_tunnel(sk);
+-      if (tunnel) {
+-              l2tp_tunnel_closeall(tunnel);
+-              sock_put(sk);
+-      }
++      struct l2tp_tunnel *tunnel = l2tp_tunnel(sk);
++
++      if (tunnel)
++              l2tp_tunnel_delete(tunnel);
+ }
+ /* Workqueue tunnel deletion function */
+ static void l2tp_tunnel_del_work(struct work_struct *work)
+ {
+-      struct l2tp_tunnel *tunnel = NULL;
+-      struct socket *sock = NULL;
+-      struct sock *sk = NULL;
+-
+-      tunnel = container_of(work, struct l2tp_tunnel, del_work);
++      struct l2tp_tunnel *tunnel = container_of(work, struct l2tp_tunnel,
++                                                del_work);
++      struct sock *sk = tunnel->sock;
++      struct socket *sock = sk->sk_socket;
+       l2tp_tunnel_closeall(tunnel);
+-      sk = l2tp_tunnel_sock_lookup(tunnel);
+-      if (!sk)
+-              goto out;
+-
+-      sock = sk->sk_socket;
+-
+       /* If the tunnel socket was created within the kernel, use
+        * the sk API to release it here.
+        */
+@@ -1346,8 +1293,10 @@ static void l2tp_tunnel_del_work(struct
+               }
+       }
+-      l2tp_tunnel_sock_put(sk);
+-out:
++      /* drop initial ref */
++      l2tp_tunnel_dec_refcount(tunnel);
++
++      /* drop workqueue ref */
+       l2tp_tunnel_dec_refcount(tunnel);
+ }
+@@ -1600,13 +1549,22 @@ int l2tp_tunnel_create(struct net *net,
+               sk->sk_user_data = tunnel;
+       }
++      /* Bump the reference count. The tunnel context is deleted
++       * only when this drops to zero. A reference is also held on
++       * the tunnel socket to ensure that it is not released while
++       * the tunnel is extant. Must be done before sk_destruct is
++       * set.
++       */
++      refcount_set(&tunnel->ref_count, 1);
++      sock_hold(sk);
++      tunnel->sock = sk;
++      tunnel->fd = fd;
++
+       /* Hook on the tunnel socket destructor so that we can cleanup
+        * if the tunnel socket goes away.
+        */
+       tunnel->old_sk_destruct = sk->sk_destruct;
+       sk->sk_destruct = &l2tp_tunnel_destruct;
+-      tunnel->sock = sk;
+-      tunnel->fd = fd;
+       lockdep_set_class_and_name(&sk->sk_lock.slock, &l2tp_socket_class, "l2tp_sock");
+       sk->sk_allocation = GFP_ATOMIC;
+@@ -1616,11 +1574,6 @@ int l2tp_tunnel_create(struct net *net,
+       /* Add tunnel to our list */
+       INIT_LIST_HEAD(&tunnel->list);
+-
+-      /* Bump the reference count. The tunnel context is deleted
+-       * only when this drops to zero. Must be done before list insertion
+-       */
+-      refcount_set(&tunnel->ref_count, 1);
+       spin_lock_bh(&pn->l2tp_tunnel_list_lock);
+       list_add_rcu(&tunnel->list, &pn->l2tp_tunnel_list);
+       spin_unlock_bh(&pn->l2tp_tunnel_list_lock);
+@@ -1661,8 +1614,6 @@ void l2tp_session_free(struct l2tp_sessi
+       if (tunnel) {
+               BUG_ON(tunnel->magic != L2TP_TUNNEL_MAGIC);
+-              sock_put(tunnel->sock);
+-              session->tunnel = NULL;
+               l2tp_tunnel_dec_refcount(tunnel);
+       }
+--- a/net/l2tp/l2tp_core.h
++++ b/net/l2tp/l2tp_core.h
+@@ -219,27 +219,8 @@ static inline void *l2tp_session_priv(st
+       return &session->priv[0];
+ }
+-static inline struct l2tp_tunnel *l2tp_sock_to_tunnel(struct sock *sk)
+-{
+-      struct l2tp_tunnel *tunnel;
+-
+-      if (sk == NULL)
+-              return NULL;
+-
+-      sock_hold(sk);
+-      tunnel = (struct l2tp_tunnel *)(sk->sk_user_data);
+-      if (tunnel == NULL) {
+-              sock_put(sk);
+-              goto out;
+-      }
+-
+-      BUG_ON(tunnel->magic != L2TP_TUNNEL_MAGIC);
+-
+-out:
+-      return tunnel;
+-}
+-
+ struct l2tp_tunnel *l2tp_tunnel_get(const struct net *net, u32 tunnel_id);
++void l2tp_tunnel_free(struct l2tp_tunnel *tunnel);
+ struct l2tp_session *l2tp_session_get(const struct net *net,
+                                     struct l2tp_tunnel *tunnel,
+@@ -288,7 +269,7 @@ static inline void l2tp_tunnel_inc_refco
+ static inline void l2tp_tunnel_dec_refcount(struct l2tp_tunnel *tunnel)
+ {
+       if (refcount_dec_and_test(&tunnel->ref_count))
+-              kfree_rcu(tunnel, rcu);
++              l2tp_tunnel_free(tunnel);
+ }
+ /* Session reference counts. Incremented when code obtains a reference
+--- a/net/l2tp/l2tp_ip.c
++++ b/net/l2tp/l2tp_ip.c
+@@ -234,17 +234,13 @@ static void l2tp_ip_close(struct sock *s
+ static void l2tp_ip_destroy_sock(struct sock *sk)
+ {
+       struct sk_buff *skb;
+-      struct l2tp_tunnel *tunnel = l2tp_sock_to_tunnel(sk);
++      struct l2tp_tunnel *tunnel = sk->sk_user_data;
+       while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
+               kfree_skb(skb);
+-      if (tunnel) {
+-              l2tp_tunnel_closeall(tunnel);
+-              sock_put(sk);
+-      }
+-
+-      sk_refcnt_debug_dec(sk);
++      if (tunnel)
++              l2tp_tunnel_delete(tunnel);
+ }
+ static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+--- a/net/l2tp/l2tp_ip6.c
++++ b/net/l2tp/l2tp_ip6.c
+@@ -248,16 +248,14 @@ static void l2tp_ip6_close(struct sock *
+ static void l2tp_ip6_destroy_sock(struct sock *sk)
+ {
+-      struct l2tp_tunnel *tunnel = l2tp_sock_to_tunnel(sk);
++      struct l2tp_tunnel *tunnel = sk->sk_user_data;
+       lock_sock(sk);
+       ip6_flush_pending_frames(sk);
+       release_sock(sk);
+-      if (tunnel) {
+-              l2tp_tunnel_closeall(tunnel);
+-              sock_put(sk);
+-      }
++      if (tunnel)
++              l2tp_tunnel_delete(tunnel);
+       inet6_destroy_sock(sk);
+ }
diff --git a/queue-4.15/l2tp-fix-tunnel-lookup-use-after-free-race.patch b/queue-4.15/l2tp-fix-tunnel-lookup-use-after-free-race.patch
new file mode 100644 (file)
index 0000000..16fae98
--- /dev/null
@@ -0,0 +1,116 @@
+From foo@baz Tue Mar  6 19:02:57 PST 2018
+From: James Chapman <jchapman@katalix.com>
+Date: Fri, 23 Feb 2018 17:45:47 +0000
+Subject: l2tp: fix tunnel lookup use-after-free race
+
+From: James Chapman <jchapman@katalix.com>
+
+
+[ Upstream commit 28f5bfb819195ad9c2eb9486babe7b0e4efe925f ]
+
+l2tp_tunnel_get walks the tunnel list to find a matching tunnel
+instance and if a match is found, its refcount is increased before
+returning the tunnel pointer. But when tunnel objects are destroyed,
+they are on the tunnel list after their refcount hits zero. Fix this
+by moving the code that removes the tunnel from the tunnel list from
+the tunnel socket destructor into the l2tp_tunnel_delete path,
+before the tunnel refcount is decremented.
+
+refcount_t: increment on 0; use-after-free.
+WARNING: CPU: 3 PID: 13507 at lib/refcount.c:153 refcount_inc+0x47/0x50
+Modules linked in:
+CPU: 3 PID: 13507 Comm: syzbot_6e6a5ec8 Not tainted 4.16.0-rc2+ #36
+Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
+RIP: 0010:refcount_inc+0x47/0x50
+RSP: 0018:ffff8800136ffb20 EFLAGS: 00010286
+RAX: dffffc0000000008 RBX: ffff880017068e68 RCX: ffffffff814d3333
+RDX: 0000000000000000 RSI: ffff88001a59f6d8 RDI: ffff88001a59f6d8
+RBP: ffff8800136ffb28 R08: 0000000000000000 R09: 0000000000000000
+R10: ffff8800136ffab0 R11: 0000000000000000 R12: ffff880017068e50
+R13: 0000000000000000 R14: ffff8800174da800 R15: 0000000000000004
+FS:  00007f403ab1e700(0000) GS:ffff88001a580000(0000) knlGS:0000000000000000
+CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 00000000205fafd2 CR3: 0000000016770000 CR4: 00000000000006e0
+Call Trace:
+ l2tp_tunnel_get+0x2dd/0x4e0
+ pppol2tp_connect+0x428/0x13c0
+ ? pppol2tp_session_create+0x170/0x170
+ ? __might_fault+0x115/0x1d0
+ ? lock_downgrade+0x860/0x860
+ ? __might_fault+0xe5/0x1d0
+ ? security_socket_connect+0x8e/0xc0
+ SYSC_connect+0x1b6/0x310
+ ? SYSC_bind+0x280/0x280
+ ? __do_page_fault+0x5d1/0xca0
+ ? up_read+0x1f/0x40
+ ? __do_page_fault+0x3c8/0xca0
+ SyS_connect+0x29/0x30
+ ? SyS_accept+0x40/0x40
+ do_syscall_64+0x1e0/0x730
+ ? trace_hardirqs_off_thunk+0x1a/0x1c
+ entry_SYSCALL_64_after_hwframe+0x42/0xb7
+RIP: 0033:0x7f403a42f259
+RSP: 002b:00007f403ab1dee8 EFLAGS: 00000296 ORIG_RAX: 000000000000002a
+RAX: ffffffffffffffda RBX: 00000000205fafe4 RCX: 00007f403a42f259
+RDX: 000000000000002e RSI: 00000000205fafd2 RDI: 0000000000000004
+RBP: 00007f403ab1df20 R08: 00007f403ab1e700 R09: 0000000000000000
+R10: 00007f403ab1e700 R11: 0000000000000296 R12: 0000000000000000
+R13: 00007ffc81906cbf R14: 0000000000000000 R15: 00007f403ab2b040
+Code: 3b ff 5b 5d c3 e8 ca 5f 3b ff 80 3d 49 8e 66 04 00 75 ea e8 bc 5f 3b ff 48 c7 c7 60 69 64 85 c6 05 34 8e 66 04 01 e8 59 49 15 ff <0f> 0b eb ce 0f 1f 44 00 00 55 48 89 e5 41 56 41 55 41 54 53 49
+
+Fixes: f8ccac0e44934 ("l2tp: put tunnel socket release on a workqueue")
+Reported-and-tested-by: syzbot+19c09769f14b48810113@syzkaller.appspotmail.com
+Reported-and-tested-by: syzbot+347bd5acde002e353a36@syzkaller.appspotmail.com
+Reported-and-tested-by: syzbot+6e6a5ec8de31a94cd015@syzkaller.appspotmail.com
+Reported-and-tested-by: syzbot+9df43faf09bd400f2993@syzkaller.appspotmail.com
+Signed-off-by: James Chapman <jchapman@katalix.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/l2tp/l2tp_core.c |   14 +++++++-------
+ 1 file changed, 7 insertions(+), 7 deletions(-)
+
+--- a/net/l2tp/l2tp_core.c
++++ b/net/l2tp/l2tp_core.c
+@@ -1173,7 +1173,6 @@ EXPORT_SYMBOL_GPL(l2tp_xmit_skb);
+ static void l2tp_tunnel_destruct(struct sock *sk)
+ {
+       struct l2tp_tunnel *tunnel = l2tp_tunnel(sk);
+-      struct l2tp_net *pn;
+       if (tunnel == NULL)
+               goto end;
+@@ -1196,12 +1195,6 @@ static void l2tp_tunnel_destruct(struct
+       sk->sk_destruct = tunnel->old_sk_destruct;
+       sk->sk_user_data = NULL;
+-      /* Remove the tunnel struct from the tunnel list */
+-      pn = l2tp_pernet(tunnel->l2tp_net);
+-      spin_lock_bh(&pn->l2tp_tunnel_list_lock);
+-      list_del_rcu(&tunnel->list);
+-      spin_unlock_bh(&pn->l2tp_tunnel_list_lock);
+-
+       /* Call the original destructor */
+       if (sk->sk_destruct)
+               (*sk->sk_destruct)(sk);
+@@ -1280,6 +1273,7 @@ static void l2tp_tunnel_del_work(struct
+                                                 del_work);
+       struct sock *sk = tunnel->sock;
+       struct socket *sock = sk->sk_socket;
++      struct l2tp_net *pn;
+       l2tp_tunnel_closeall(tunnel);
+@@ -1293,6 +1287,12 @@ static void l2tp_tunnel_del_work(struct
+               }
+       }
++      /* Remove the tunnel struct from the tunnel list */
++      pn = l2tp_pernet(tunnel->l2tp_net);
++      spin_lock_bh(&pn->l2tp_tunnel_list_lock);
++      list_del_rcu(&tunnel->list);
++      spin_unlock_bh(&pn->l2tp_tunnel_list_lock);
++
+       /* drop initial ref */
+       l2tp_tunnel_dec_refcount(tunnel);
diff --git a/queue-4.15/mlxsw-spectrum_router-do-not-unconditionally-clear-route-offload-indication.patch b/queue-4.15/mlxsw-spectrum_router-do-not-unconditionally-clear-route-offload-indication.patch
new file mode 100644 (file)
index 0000000..b275ee0
--- /dev/null
@@ -0,0 +1,41 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Ido Schimmel <idosch@mellanox.com>
+Date: Sat, 17 Feb 2018 00:30:44 +0100
+Subject: mlxsw: spectrum_router: Do not unconditionally clear route offload indication
+
+From: Ido Schimmel <idosch@mellanox.com>
+
+
+[ Upstream commit d1c95af366961101819f07e3c64d44f3be7f0367 ]
+
+When mlxsw replaces (or deletes) a route it removes the offload
+indication from the replaced route. This is problematic for IPv4 routes,
+as the offload indication is stored in the fib_info which is usually
+shared between multiple routes.
+
+Instead of unconditionally clearing the offload indication, only clear
+it if no other route is using the fib_info.
+
+Fixes: 3984d1a89fe7 ("mlxsw: spectrum_router: Provide offload indication using nexthop flags")
+Signed-off-by: Ido Schimmel <idosch@mellanox.com>
+Reported-by: Alexander Petrovskiy <alexpe@mellanox.com>
+Tested-by: Alexander Petrovskiy <alexpe@mellanox.com>
+Signed-off-by: Jiri Pirko <jiri@mellanox.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c |    3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
++++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+@@ -3765,6 +3765,9 @@ mlxsw_sp_fib4_entry_offload_unset(struct
+       struct mlxsw_sp_nexthop_group *nh_grp = fib_entry->nh_group;
+       int i;
++      if (!list_is_singular(&nh_grp->fib_list))
++              return;
++
+       for (i = 0; i < nh_grp->count; i++) {
+               struct mlxsw_sp_nexthop *nh = &nh_grp->nexthops[i];
diff --git a/queue-4.15/mlxsw-spectrum_router-fix-error-path-in-mlxsw_sp_vr_create.patch b/queue-4.15/mlxsw-spectrum_router-fix-error-path-in-mlxsw_sp_vr_create.patch
new file mode 100644 (file)
index 0000000..46222bf
--- /dev/null
@@ -0,0 +1,92 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Jiri Pirko <jiri@mellanox.com>
+Date: Tue, 13 Feb 2018 11:22:42 +0100
+Subject: mlxsw: spectrum_router: Fix error path in mlxsw_sp_vr_create
+
+From: Jiri Pirko <jiri@mellanox.com>
+
+
+[ Upstream commit 0f2d2b2736b08dafa3bde31d048750fbc8df3a31 ]
+
+Since mlxsw_sp_fib_create() and mlxsw_sp_mr_table_create()
+use ERR_PTR macro to propagate int err through return of a pointer,
+the return value is not NULL in case of failure. So if one
+of the calls fails, one of vr->fib4, vr->fib6 or vr->mr4_table
+is not NULL and mlxsw_sp_vr_is_used wrongly assumes
+that vr is in use, which leads to a crash like the following one:
+
+[ 1293.949291] BUG: unable to handle kernel NULL pointer dereference at 00000000000006c9
+[ 1293.952729] IP: mlxsw_sp_mr_table_flush+0x15/0x70 [mlxsw_spectrum]
+
+Fix this by using local variables to hold the pointers and set vr->*
+only in case everything went fine.
+
+Fixes: 76610ebbde18 ("mlxsw: spectrum_router: Refactor virtual router handling")
+Fixes: a3d9bc506d64 ("mlxsw: spectrum_router: Extend virtual routers with IPv6 support")
+Fixes: d42b0965b1d4 ("mlxsw: spectrum_router: Add multicast routes notification handling functionality")
+Signed-off-by: Jiri Pirko <jiri@mellanox.com>
+Reviewed-by: Ido Schimmel <idosch@mellanox.com>
+Signed-off-by: Jiri Pirko <jiri@mellanox.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c |   32 ++++++++++--------
+ 1 file changed, 18 insertions(+), 14 deletions(-)
+
+--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
++++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+@@ -737,6 +737,9 @@ static struct mlxsw_sp_vr *mlxsw_sp_vr_c
+                                             u32 tb_id,
+                                             struct netlink_ext_ack *extack)
+ {
++      struct mlxsw_sp_mr_table *mr4_table;
++      struct mlxsw_sp_fib *fib4;
++      struct mlxsw_sp_fib *fib6;
+       struct mlxsw_sp_vr *vr;
+       int err;
+@@ -745,29 +748,30 @@ static struct mlxsw_sp_vr *mlxsw_sp_vr_c
+               NL_SET_ERR_MSG(extack, "spectrum: Exceeded number of supported virtual routers");
+               return ERR_PTR(-EBUSY);
+       }
+-      vr->fib4 = mlxsw_sp_fib_create(vr, MLXSW_SP_L3_PROTO_IPV4);
+-      if (IS_ERR(vr->fib4))
+-              return ERR_CAST(vr->fib4);
+-      vr->fib6 = mlxsw_sp_fib_create(vr, MLXSW_SP_L3_PROTO_IPV6);
+-      if (IS_ERR(vr->fib6)) {
+-              err = PTR_ERR(vr->fib6);
++      fib4 = mlxsw_sp_fib_create(vr, MLXSW_SP_L3_PROTO_IPV4);
++      if (IS_ERR(fib4))
++              return ERR_CAST(fib4);
++      fib6 = mlxsw_sp_fib_create(vr, MLXSW_SP_L3_PROTO_IPV6);
++      if (IS_ERR(fib6)) {
++              err = PTR_ERR(fib6);
+               goto err_fib6_create;
+       }
+-      vr->mr4_table = mlxsw_sp_mr_table_create(mlxsw_sp, vr->id,
+-                                               MLXSW_SP_L3_PROTO_IPV4);
+-      if (IS_ERR(vr->mr4_table)) {
+-              err = PTR_ERR(vr->mr4_table);
++      mr4_table = mlxsw_sp_mr_table_create(mlxsw_sp, vr->id,
++                                           MLXSW_SP_L3_PROTO_IPV4);
++      if (IS_ERR(mr4_table)) {
++              err = PTR_ERR(mr4_table);
+               goto err_mr_table_create;
+       }
++      vr->fib4 = fib4;
++      vr->fib6 = fib6;
++      vr->mr4_table = mr4_table;
+       vr->tb_id = tb_id;
+       return vr;
+ err_mr_table_create:
+-      mlxsw_sp_fib_destroy(vr->fib6);
+-      vr->fib6 = NULL;
++      mlxsw_sp_fib_destroy(fib6);
+ err_fib6_create:
+-      mlxsw_sp_fib_destroy(vr->fib4);
+-      vr->fib4 = NULL;
++      mlxsw_sp_fib_destroy(fib4);
+       return ERR_PTR(err);
+ }
diff --git a/queue-4.15/mlxsw-spectrum_switchdev-check-success-of-fdb-add-operation.patch b/queue-4.15/mlxsw-spectrum_switchdev-check-success-of-fdb-add-operation.patch
new file mode 100644 (file)
index 0000000..56d967c
--- /dev/null
@@ -0,0 +1,104 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Shalom Toledo <shalomt@mellanox.com>
+Date: Thu, 1 Mar 2018 11:37:05 +0100
+Subject: mlxsw: spectrum_switchdev: Check success of FDB add operation
+
+From: Shalom Toledo <shalomt@mellanox.com>
+
+
+[ Upstream commit 0a8a1bf17e3af34f1f8d2368916a6327f8b3bfd5 ]
+
+Until now, we assumed that in case of error when adding FDB entries, the
+write operation will fail, but this is not the case. Instead, we need to
+check that the number of entries reported in the response is equal to
+the number of entries specified in the request.
+
+Fixes: 56ade8fe3fe1 ("mlxsw: spectrum: Add initial support for Spectrum ASIC")
+Reported-by: Ido Schimmel <idosch@mellanox.com>
+Signed-off-by: Shalom Toledo <shalomt@mellanox.com>
+Reviewed-by: Ido Schimmel <idosch@mellanox.com>
+Signed-off-by: Jiri Pirko <jiri@mellanox.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c |   29 +++++++++++++--
+ 1 file changed, 27 insertions(+), 2 deletions(-)
+
+--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
++++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
+@@ -1203,6 +1203,7 @@ static int __mlxsw_sp_port_fdb_uc_op(str
+                                    bool dynamic)
+ {
+       char *sfd_pl;
++      u8 num_rec;
+       int err;
+       sfd_pl = kmalloc(MLXSW_REG_SFD_LEN, GFP_KERNEL);
+@@ -1212,9 +1213,16 @@ static int __mlxsw_sp_port_fdb_uc_op(str
+       mlxsw_reg_sfd_pack(sfd_pl, mlxsw_sp_sfd_op(adding), 0);
+       mlxsw_reg_sfd_uc_pack(sfd_pl, 0, mlxsw_sp_sfd_rec_policy(dynamic),
+                             mac, fid, action, local_port);
++      num_rec = mlxsw_reg_sfd_num_rec_get(sfd_pl);
+       err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sfd), sfd_pl);
+-      kfree(sfd_pl);
++      if (err)
++              goto out;
++
++      if (num_rec != mlxsw_reg_sfd_num_rec_get(sfd_pl))
++              err = -EBUSY;
++out:
++      kfree(sfd_pl);
+       return err;
+ }
+@@ -1239,6 +1247,7 @@ static int mlxsw_sp_port_fdb_uc_lag_op(s
+                                      bool adding, bool dynamic)
+ {
+       char *sfd_pl;
++      u8 num_rec;
+       int err;
+       sfd_pl = kmalloc(MLXSW_REG_SFD_LEN, GFP_KERNEL);
+@@ -1249,9 +1258,16 @@ static int mlxsw_sp_port_fdb_uc_lag_op(s
+       mlxsw_reg_sfd_uc_lag_pack(sfd_pl, 0, mlxsw_sp_sfd_rec_policy(dynamic),
+                                 mac, fid, MLXSW_REG_SFD_REC_ACTION_NOP,
+                                 lag_vid, lag_id);
++      num_rec = mlxsw_reg_sfd_num_rec_get(sfd_pl);
+       err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sfd), sfd_pl);
+-      kfree(sfd_pl);
++      if (err)
++              goto out;
++      if (num_rec != mlxsw_reg_sfd_num_rec_get(sfd_pl))
++              err = -EBUSY;
++
++out:
++      kfree(sfd_pl);
+       return err;
+ }
+@@ -1296,6 +1312,7 @@ static int mlxsw_sp_port_mdb_op(struct m
+                               u16 fid, u16 mid_idx, bool adding)
+ {
+       char *sfd_pl;
++      u8 num_rec;
+       int err;
+       sfd_pl = kmalloc(MLXSW_REG_SFD_LEN, GFP_KERNEL);
+@@ -1305,7 +1322,15 @@ static int mlxsw_sp_port_mdb_op(struct m
+       mlxsw_reg_sfd_pack(sfd_pl, mlxsw_sp_sfd_op(adding), 0);
+       mlxsw_reg_sfd_mc_pack(sfd_pl, 0, addr, fid,
+                             MLXSW_REG_SFD_REC_ACTION_NOP, mid_idx);
++      num_rec = mlxsw_reg_sfd_num_rec_get(sfd_pl);
+       err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(sfd), sfd_pl);
++      if (err)
++              goto out;
++
++      if (num_rec != mlxsw_reg_sfd_num_rec_get(sfd_pl))
++              err = -EBUSY;
++
++out:
+       kfree(sfd_pl);
+       return err;
+ }
diff --git a/queue-4.15/net-amd-xgbe-fix-comparison-to-bitshift-when-dealing-with-a-mask.patch b/queue-4.15/net-amd-xgbe-fix-comparison-to-bitshift-when-dealing-with-a-mask.patch
new file mode 100644 (file)
index 0000000..052075b
--- /dev/null
@@ -0,0 +1,32 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Wolfram Sang <wsa+renesas@sang-engineering.com>
+Date: Mon, 5 Feb 2018 21:10:01 +0100
+Subject: net: amd-xgbe: fix comparison to bitshift when dealing with a mask
+
+From: Wolfram Sang <wsa+renesas@sang-engineering.com>
+
+
+[ Upstream commit a3276892db7a588bedc33168e502572008f714a9 ]
+
+Due to a typo, the mask was destroyed by a comparison instead of a bit
+shift.
+
+Signed-off-by: Wolfram Sang <wsa+renesas@sang-engineering.com>
+Acked-by: Tom Lendacky <thomas.lendacky@amd.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/amd/xgbe/xgbe-drv.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
++++ b/drivers/net/ethernet/amd/xgbe/xgbe-drv.c
+@@ -595,7 +595,7 @@ isr_done:
+               reissue_mask = 1 << 0;
+               if (!pdata->per_channel_irq)
+-                      reissue_mask |= 0xffff < 4;
++                      reissue_mask |= 0xffff << 4;
+               XP_IOWRITE(pdata, XP_INT_REISSUE_EN, reissue_mask);
+       }
diff --git a/queue-4.15/net-ethernet-ti-cpsw-fix-net-watchdog-timeout.patch b/queue-4.15/net-ethernet-ti-cpsw-fix-net-watchdog-timeout.patch
new file mode 100644 (file)
index 0000000..217aa6c
--- /dev/null
@@ -0,0 +1,84 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Grygorii Strashko <grygorii.strashko@ti.com>
+Date: Tue, 6 Feb 2018 19:17:06 -0600
+Subject: net: ethernet: ti: cpsw: fix net watchdog timeout
+
+From: Grygorii Strashko <grygorii.strashko@ti.com>
+
+
+[ Upstream commit 62f94c2101f35cd45775df00ba09bde77580e26a ]
+
+It was discovered that a simple program which indefinitely sends 200b UDP
+packets and runs on TI AM574x SoC (SMP) under RT Kernel triggers network
+watchdog timeout in TI CPSW driver (<6 hours run). The network watchdog
+timeout is triggered due to race between cpsw_ndo_start_xmit() and
+cpsw_tx_handler() [NAPI]
+
+cpsw_ndo_start_xmit()
+       if (unlikely(!cpdma_check_free_tx_desc(txch))) {
+               txq = netdev_get_tx_queue(ndev, q_idx);
+               netif_tx_stop_queue(txq);
+
+^^ as per [1] a barrier has to be used after set_bit(), otherwise new value
+might not be visible to other cpus
+       }
+
+cpsw_tx_handler()
+       if (unlikely(netif_tx_queue_stopped(txq)))
+               netif_tx_wake_queue(txq);
+
+and when it happens the ndev TX queue becomes disabled forever while the driver's HW
+TX queue is empty.
+
+Fix this, by adding smp_mb__after_atomic() after netif_tx_stop_queue()
+calls and double check for free TX descriptors after stopping ndev TX queue
+- if there are free TX descriptors wake up ndev TX queue.
+
+[1] https://www.kernel.org/doc/html/latest/core-api/atomic_ops.html
+Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
+Reviewed-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/ti/cpsw.c |   16 ++++++++++++++--
+ 1 file changed, 14 insertions(+), 2 deletions(-)
+
+--- a/drivers/net/ethernet/ti/cpsw.c
++++ b/drivers/net/ethernet/ti/cpsw.c
+@@ -1618,6 +1618,7 @@ static netdev_tx_t cpsw_ndo_start_xmit(s
+               q_idx = q_idx % cpsw->tx_ch_num;
+       txch = cpsw->txv[q_idx].ch;
++      txq = netdev_get_tx_queue(ndev, q_idx);
+       ret = cpsw_tx_packet_submit(priv, skb, txch);
+       if (unlikely(ret != 0)) {
+               cpsw_err(priv, tx_err, "desc submit failed\n");
+@@ -1628,15 +1629,26 @@ static netdev_tx_t cpsw_ndo_start_xmit(s
+        * tell the kernel to stop sending us tx frames.
+        */
+       if (unlikely(!cpdma_check_free_tx_desc(txch))) {
+-              txq = netdev_get_tx_queue(ndev, q_idx);
+               netif_tx_stop_queue(txq);
++
++              /* Barrier, so that stop_queue visible to other cpus */
++              smp_mb__after_atomic();
++
++              if (cpdma_check_free_tx_desc(txch))
++                      netif_tx_wake_queue(txq);
+       }
+       return NETDEV_TX_OK;
+ fail:
+       ndev->stats.tx_dropped++;
+-      txq = netdev_get_tx_queue(ndev, skb_get_queue_mapping(skb));
+       netif_tx_stop_queue(txq);
++
++      /* Barrier, so that stop_queue visible to other cpus */
++      smp_mb__after_atomic();
++
++      if (cpdma_check_free_tx_desc(txch))
++              netif_tx_wake_queue(txq);
++
+       return NETDEV_TX_BUSY;
+ }
diff --git a/queue-4.15/net-fix-race-on-decreasing-number-of-tx-queues.patch b/queue-4.15/net-fix-race-on-decreasing-number-of-tx-queues.patch
new file mode 100644 (file)
index 0000000..82b2083
--- /dev/null
@@ -0,0 +1,68 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Jakub Kicinski <jakub.kicinski@netronome.com>
+Date: Mon, 12 Feb 2018 21:35:31 -0800
+Subject: net: fix race on decreasing number of TX queues
+
+From: Jakub Kicinski <jakub.kicinski@netronome.com>
+
+
+[ Upstream commit ac5b70198adc25c73fba28de4f78adcee8f6be0b ]
+
+netif_set_real_num_tx_queues() can be called when netdev is up.
+That usually happens when user requests change of number of
+channels/rings with ethtool -L.  The procedure for changing
+the number of queues involves resetting the qdiscs and setting
+dev->num_tx_queues to the new value.  When the new value is
+lower than the old one, extra care has to be taken to ensure
+ordering of accesses to the number of queues vs qdisc reset.
+
+Currently the queues are reset before new dev->num_tx_queues
+is assigned, leaving a window of time where packets can be
+enqueued onto the queues going down, leading to a likely
+crash in the drivers, since most drivers don't check if TX
+skbs are assigned to an active queue.
+
+Fixes: e6484930d7c7 ("net: allocate tx queues in register_netdevice")
+Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/dev.c |   11 +++++++++--
+ 1 file changed, 9 insertions(+), 2 deletions(-)
+
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -2366,8 +2366,11 @@ EXPORT_SYMBOL(netdev_set_num_tc);
+  */
+ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
+ {
++      bool disabling;
+       int rc;
++      disabling = txq < dev->real_num_tx_queues;
++
+       if (txq < 1 || txq > dev->num_tx_queues)
+               return -EINVAL;
+@@ -2383,15 +2386,19 @@ int netif_set_real_num_tx_queues(struct
+               if (dev->num_tc)
+                       netif_setup_tc(dev, txq);
+-              if (txq < dev->real_num_tx_queues) {
++              dev->real_num_tx_queues = txq;
++
++              if (disabling) {
++                      synchronize_net();
+                       qdisc_reset_all_tx_gt(dev, txq);
+ #ifdef CONFIG_XPS
+                       netif_reset_xps_queues_gt(dev, txq);
+ #endif
+               }
++      } else {
++              dev->real_num_tx_queues = txq;
+       }
+-      dev->real_num_tx_queues = txq;
+       return 0;
+ }
+ EXPORT_SYMBOL(netif_set_real_num_tx_queues);
diff --git a/queue-4.15/net-ipv4-don-t-allow-setting-net.ipv4.route.min_pmtu-below-68.patch b/queue-4.15/net-ipv4-don-t-allow-setting-net.ipv4.route.min_pmtu-below-68.patch
new file mode 100644 (file)
index 0000000..c08ff7c
--- /dev/null
@@ -0,0 +1,64 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Sabrina Dubroca <sd@queasysnail.net>
+Date: Mon, 26 Feb 2018 16:13:43 +0100
+Subject: net: ipv4: don't allow setting net.ipv4.route.min_pmtu below 68
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+
+[ Upstream commit c7272c2f1229125f74f22dcdd59de9bbd804f1c8 ]
+
+According to RFC 1191 sections 3 and 4, ICMP frag-needed messages
+indicating an MTU below 68 should be rejected:
+
+    A host MUST never reduce its estimate of the Path MTU below 68
+    octets.
+
+and (talking about ICMP frag-needed's Next-Hop MTU field):
+
+    This field will never contain a value less than 68, since every
+    router "must be able to forward a datagram of 68 octets without
+    fragmentation".
+
+Furthermore, by letting net.ipv4.route.min_pmtu be set to negative
+values, we can end up with a very large PMTU when (-1) is cast into u32.
+
+Let's also make ip_rt_min_pmtu a u32, since it's only ever compared to
+unsigned ints.
+
+Reported-by: Jianlin Shi <jishi@redhat.com>
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/route.c |    8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+--- a/net/ipv4/route.c
++++ b/net/ipv4/route.c
+@@ -128,10 +128,13 @@ static int ip_rt_redirect_silence __read
+ static int ip_rt_error_cost __read_mostly     = HZ;
+ static int ip_rt_error_burst __read_mostly    = 5 * HZ;
+ static int ip_rt_mtu_expires __read_mostly    = 10 * 60 * HZ;
+-static int ip_rt_min_pmtu __read_mostly               = 512 + 20 + 20;
++static u32 ip_rt_min_pmtu __read_mostly               = 512 + 20 + 20;
+ static int ip_rt_min_advmss __read_mostly     = 256;
+ static int ip_rt_gc_timeout __read_mostly     = RT_GC_TIMEOUT;
++
++static int ip_min_valid_pmtu __read_mostly    = IPV4_MIN_MTU;
++
+ /*
+  *    Interface to generic destination cache.
+  */
+@@ -2934,7 +2937,8 @@ static struct ctl_table ipv4_route_table
+               .data           = &ip_rt_min_pmtu,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+-              .proc_handler   = proc_dointvec,
++              .proc_handler   = proc_dointvec_minmax,
++              .extra1         = &ip_min_valid_pmtu,
+       },
+       {
+               .procname       = "min_adv_mss",
diff --git a/queue-4.15/net-ipv4-set-addr_type-in-hash_keys-for-forwarded-case.patch b/queue-4.15/net-ipv4-set-addr_type-in-hash_keys-for-forwarded-case.patch
new file mode 100644 (file)
index 0000000..d169549
--- /dev/null
@@ -0,0 +1,36 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: David Ahern <dsahern@gmail.com>
+Date: Wed, 21 Feb 2018 11:00:54 -0800
+Subject: net: ipv4: Set addr_type in hash_keys for forwarded case
+
+From: David Ahern <dsahern@gmail.com>
+
+
+[ Upstream commit 1fe4b1184c2ae2bfbf9e8b14c9c0c1945c98f205 ]
+
+The result of the skb flow dissect is copied from keys to hash_keys to
+ensure only the intended data is hashed. The original L4 hash patch
+overlooked setting the addr_type for this case; add it.
+
+Fixes: bf4e0a3db97eb ("net: ipv4: add support for ECMP hash policy choice")
+Reported-by: Ido Schimmel <idosch@idosch.org>
+Signed-off-by: David Ahern <dsahern@gmail.com>
+Acked-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
+Reviewed-by: Ido Schimmel <idosch@mellanox.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/route.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/net/ipv4/route.c
++++ b/net/ipv4/route.c
+@@ -1832,6 +1832,8 @@ int fib_multipath_hash(const struct fib_
+                               return skb_get_hash_raw(skb) >> 1;
+                       memset(&hash_keys, 0, sizeof(hash_keys));
+                       skb_flow_dissect_flow_keys(skb, &keys, flag);
++
++                      hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+                       hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
+                       hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
+                       hash_keys.ports.src = keys.ports.src;
diff --git a/queue-4.15/net-mlx5-fix-error-handling-when-adding-flow-rules.patch b/queue-4.15/net-mlx5-fix-error-handling-when-adding-flow-rules.patch
new file mode 100644 (file)
index 0000000..8925769
--- /dev/null
@@ -0,0 +1,52 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Vlad Buslov <vladbu@mellanox.com>
+Date: Tue, 6 Feb 2018 10:52:19 +0200
+Subject: net/mlx5: Fix error handling when adding flow rules
+
+From: Vlad Buslov <vladbu@mellanox.com>
+
+
+[ Upstream commit 9238e380e823a39983ee8d6b6ee8d1a9c4ba8a65 ]
+
+If building match list or adding existing fg fails when
+node is locked, function returned without unlocking it.
+This happened if node version changed or adding existing fg
+returned with EAGAIN after jumping to search_again_locked label.
+
+Fixes: bd71b08ec2ee ("net/mlx5: Support multiple updates of steering rules in parallel")
+Signed-off-by: Vlad Buslov <vladbu@mellanox.com>
+Reviewed-by: Maor Gottlieb <maorg@mellanox.com>
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/fs_core.c |   10 ++++++++--
+ 1 file changed, 8 insertions(+), 2 deletions(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
+@@ -1755,8 +1755,11 @@ search_again_locked:
+       /* Collect all fgs which has a matching match_criteria */
+       err = build_match_list(&match_head, ft, spec);
+-      if (err)
++      if (err) {
++              if (take_write)
++                      up_write_ref_node(&ft->node);
+               return ERR_PTR(err);
++      }
+       if (!take_write)
+               up_read_ref_node(&ft->node);
+@@ -1765,8 +1768,11 @@ search_again_locked:
+                                     dest_num, version);
+       free_match_list(&match_head);
+       if (!IS_ERR(rule) ||
+-          (PTR_ERR(rule) != -ENOENT && PTR_ERR(rule) != -EAGAIN))
++          (PTR_ERR(rule) != -ENOENT && PTR_ERR(rule) != -EAGAIN)) {
++              if (take_write)
++                      up_write_ref_node(&ft->node);
+               return rule;
++      }
+       if (!take_write) {
+               nested_down_write_ref_node(&ft->node, FS_LOCK_GRANDPARENT);
diff --git a/queue-4.15/net-mlx5e-fix-loopback-self-test-when-gro-is-off.patch b/queue-4.15/net-mlx5e-fix-loopback-self-test-when-gro-is-off.patch
new file mode 100644 (file)
index 0000000..1cc5581
--- /dev/null
@@ -0,0 +1,36 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Inbar Karmy <inbark@mellanox.com>
+Date: Thu, 7 Dec 2017 17:26:33 +0200
+Subject: net/mlx5e: Fix loopback self test when GRO is off
+
+From: Inbar Karmy <inbark@mellanox.com>
+
+
+[ Upstream commit ef7a3518f7dd4f4cf5e5b5358c93d1eb78df28fb ]
+
+When GRO is off, the transport header pointer in sk_buff is
+initialized to network's header.
+
+To find the udp header, instead of using udp_hdr() which assumes
+skb_transport_header was set, manually calculate the udp header offset.
+
+Fixes: 0952da791c97 ("net/mlx5e: Add support for loopback selftest")
+Signed-off-by: Inbar Karmy <inbark@mellanox.com>
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c
+@@ -216,7 +216,8 @@ mlx5e_test_loopback_validate(struct sk_b
+       if (iph->protocol != IPPROTO_UDP)
+               goto out;
+-      udph = udp_hdr(skb);
++      /* Don't assume skb_transport_header() was set */
++      udph = (struct udphdr *)((u8 *)iph + 4 * iph->ihl);
+       if (udph->dest != htons(9))
+               goto out;
diff --git a/queue-4.15/net-mlx5e-fix-tcp-checksum-in-lro-buffers.patch b/queue-4.15/net-mlx5e-fix-tcp-checksum-in-lro-buffers.patch
new file mode 100644 (file)
index 0000000..6bcfbec
--- /dev/null
@@ -0,0 +1,125 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Gal Pressman <galp@mellanox.com>
+Date: Wed, 20 Dec 2017 08:48:24 +0200
+Subject: net/mlx5e: Fix TCP checksum in LRO buffers
+
+From: Gal Pressman <galp@mellanox.com>
+
+
+[ Upstream commit 8babd44d2079079f9d5a4aca7005aed80236efe0 ]
+
+When receiving an LRO packet, the checksum field is set by the hardware
+to the checksum of the first coalesced packet. Obviously, this checksum
+is not valid for the merged LRO packet and should be fixed.  We can use
+the CQE checksum which covers the checksum of the entire merged packet
+TCP payload to help us calculate the checksum incrementally.
+
+Tested by sending IPv4/6 traffic with LRO enabled, RX checksum disabled
+and watching nstat checksum error counters (in addition to the obvious
+bandwidth drop caused by checksum errors).
+
+This bug is usually "hidden" since LRO packets would go through the
+CHECKSUM_UNNECESSARY flow which does not validate the packet checksum.
+
+It's important to note that previous to this patch, LRO packets provided
+with CHECKSUM_UNNECESSARY are indeed packets with a correct validated
+checksum (even though the checksum inside the TCP header is incorrect),
+since the hardware LRO aggregation is terminated upon receiving a packet
+with bad checksum.
+
+Fixes: e586b3b0baee ("net/mlx5: Ethernet Datapath files")
+Signed-off-by: Gal Pressman <galp@mellanox.com>
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en_rx.c |   47 +++++++++++++++++-------
+ 1 file changed, 34 insertions(+), 13 deletions(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+@@ -36,6 +36,7 @@
+ #include <linux/tcp.h>
+ #include <linux/bpf_trace.h>
+ #include <net/busy_poll.h>
++#include <net/ip6_checksum.h>
+ #include "en.h"
+ #include "en_tc.h"
+ #include "eswitch.h"
+@@ -547,20 +548,33 @@ bool mlx5e_post_rx_mpwqes(struct mlx5e_r
+       return true;
+ }
++static void mlx5e_lro_update_tcp_hdr(struct mlx5_cqe64 *cqe, struct tcphdr *tcp)
++{
++      u8 l4_hdr_type = get_cqe_l4_hdr_type(cqe);
++      u8 tcp_ack     = (l4_hdr_type == CQE_L4_HDR_TYPE_TCP_ACK_NO_DATA) ||
++                       (l4_hdr_type == CQE_L4_HDR_TYPE_TCP_ACK_AND_DATA);
++
++      tcp->check                      = 0;
++      tcp->psh                        = get_cqe_lro_tcppsh(cqe);
++
++      if (tcp_ack) {
++              tcp->ack                = 1;
++              tcp->ack_seq            = cqe->lro_ack_seq_num;
++              tcp->window             = cqe->lro_tcp_win;
++      }
++}
++
+ static void mlx5e_lro_update_hdr(struct sk_buff *skb, struct mlx5_cqe64 *cqe,
+                                u32 cqe_bcnt)
+ {
+       struct ethhdr   *eth = (struct ethhdr *)(skb->data);
+       struct tcphdr   *tcp;
+       int network_depth = 0;
++      __wsum check;
+       __be16 proto;
+       u16 tot_len;
+       void *ip_p;
+-      u8 l4_hdr_type = get_cqe_l4_hdr_type(cqe);
+-      u8 tcp_ack = (l4_hdr_type == CQE_L4_HDR_TYPE_TCP_ACK_NO_DATA) ||
+-              (l4_hdr_type == CQE_L4_HDR_TYPE_TCP_ACK_AND_DATA);
+-
+       proto = __vlan_get_protocol(skb, eth->h_proto, &network_depth);
+       tot_len = cqe_bcnt - network_depth;
+@@ -577,23 +591,30 @@ static void mlx5e_lro_update_hdr(struct
+               ipv4->check             = 0;
+               ipv4->check             = ip_fast_csum((unsigned char *)ipv4,
+                                                      ipv4->ihl);
++
++              mlx5e_lro_update_tcp_hdr(cqe, tcp);
++              check = csum_partial(tcp, tcp->doff * 4,
++                                   csum_unfold((__force __sum16)cqe->check_sum));
++              /* Almost done, don't forget the pseudo header */
++              tcp->check = csum_tcpudp_magic(ipv4->saddr, ipv4->daddr,
++                                             tot_len - sizeof(struct iphdr),
++                                             IPPROTO_TCP, check);
+       } else {
++              u16 payload_len = tot_len - sizeof(struct ipv6hdr);
+               struct ipv6hdr *ipv6 = ip_p;
+               tcp = ip_p + sizeof(struct ipv6hdr);
+               skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
+               ipv6->hop_limit         = cqe->lro_min_ttl;
+-              ipv6->payload_len       = cpu_to_be16(tot_len -
+-                                                    sizeof(struct ipv6hdr));
+-      }
++              ipv6->payload_len       = cpu_to_be16(payload_len);
+-      tcp->psh = get_cqe_lro_tcppsh(cqe);
+-
+-      if (tcp_ack) {
+-              tcp->ack                = 1;
+-              tcp->ack_seq            = cqe->lro_ack_seq_num;
+-              tcp->window             = cqe->lro_tcp_win;
++              mlx5e_lro_update_tcp_hdr(cqe, tcp);
++              check = csum_partial(tcp, tcp->doff * 4,
++                                   csum_unfold((__force __sum16)cqe->check_sum));
++              /* Almost done, don't forget the pseudo header */
++              tcp->check = csum_ipv6_magic(&ipv6->saddr, &ipv6->daddr, payload_len,
++                                           IPPROTO_TCP, check);
+       }
+ }
diff --git a/queue-4.15/net-mlx5e-specify-numa-node-when-allocating-drop-rq.patch b/queue-4.15/net-mlx5e-specify-numa-node-when-allocating-drop-rq.patch
new file mode 100644 (file)
index 0000000..cc182eb
--- /dev/null
@@ -0,0 +1,65 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Gal Pressman <galp@mellanox.com>
+Date: Thu, 25 Jan 2018 18:00:41 +0200
+Subject: net/mlx5e: Specify numa node when allocating drop rq
+
+From: Gal Pressman <galp@mellanox.com>
+
+
+[ Upstream commit 2f0db87901698cd73d828cc6fb1957b8916fc911 ]
+
+When allocating a drop rq, no numa node is explicitly set which means
+allocations are done on node zero. This is not necessarily the nearest
+numa node to the HCA, and even worse, might even be a memoryless numa
+node.
+
+Choose the numa_node given to us by the pci device in order to properly
+allocate the coherent dma memory instead of assuming zero is valid.
+
+Fixes: 556dd1b9c313 ("net/mlx5e: Set drop RQ's necessary parameters only")
+Signed-off-by: Gal Pressman <galp@mellanox.com>
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en_main.c |   10 ++++++++--
+ 1 file changed, 8 insertions(+), 2 deletions(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+@@ -1911,13 +1911,16 @@ static void mlx5e_build_rq_param(struct
+       param->wq.linear = 1;
+ }
+-static void mlx5e_build_drop_rq_param(struct mlx5e_rq_param *param)
++static void mlx5e_build_drop_rq_param(struct mlx5_core_dev *mdev,
++                                    struct mlx5e_rq_param *param)
+ {
+       void *rqc = param->rqc;
+       void *wq = MLX5_ADDR_OF(rqc, rqc, wq);
+       MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST);
+       MLX5_SET(wq, wq, log_wq_stride,    ilog2(sizeof(struct mlx5e_rx_wqe)));
++
++      param->wq.buf_numa_node = dev_to_node(&mdev->pdev->dev);
+ }
+ static void mlx5e_build_sq_param_common(struct mlx5e_priv *priv,
+@@ -2774,6 +2777,9 @@ static int mlx5e_alloc_drop_cq(struct ml
+                              struct mlx5e_cq *cq,
+                              struct mlx5e_cq_param *param)
+ {
++      param->wq.buf_numa_node = dev_to_node(&mdev->pdev->dev);
++      param->wq.db_numa_node  = dev_to_node(&mdev->pdev->dev);
++
+       return mlx5e_alloc_cq_common(mdev, param, cq);
+ }
+@@ -2785,7 +2791,7 @@ static int mlx5e_open_drop_rq(struct mlx
+       struct mlx5e_cq *cq = &drop_rq->cq;
+       int err;
+-      mlx5e_build_drop_rq_param(&rq_param);
++      mlx5e_build_drop_rq_param(mdev, &rq_param);
+       err = mlx5e_alloc_drop_cq(mdev, cq, &cq_param);
+       if (err)
diff --git a/queue-4.15/net-mlx5e-verify-inline-header-size-do-not-exceed-skb-linear-size.patch b/queue-4.15/net-mlx5e-verify-inline-header-size-do-not-exceed-skb-linear-size.patch
new file mode 100644 (file)
index 0000000..10fda68
--- /dev/null
@@ -0,0 +1,40 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Eran Ben Elisha <eranbe@mellanox.com>
+Date: Thu, 25 Jan 2018 11:18:09 +0200
+Subject: net/mlx5e: Verify inline header size do not exceed SKB linear size
+
+From: Eran Ben Elisha <eranbe@mellanox.com>
+
+
+[ Upstream commit f600c6088018d1dbc5777d18daa83660f7ea4a64 ]
+
+Driver tries to copy at least MLX5E_MIN_INLINE bytes into the control
+segment of the WQE. It assumes that the linear part contains at least
+MLX5E_MIN_INLINE bytes, which can be wrong.
+
+Cited commit verified that driver will not copy more bytes into the
+inline header part than the actual size of the packet. Re-factor this
+check to make sure we do not exceed the linear part as well.
+
+This fix is aligned with the current driver's assumption that the entire
+L2 will be present in the linear part of the SKB.
+
+Fixes: 6aace17e64f4 ("net/mlx5e: Fix inline header size for small packets")
+Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en_tx.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c
+@@ -176,7 +176,7 @@ static inline u16 mlx5e_calc_min_inline(
+       default:
+               hlen = mlx5e_skb_l2_header_offset(skb);
+       }
+-      return min_t(u16, hlen, skb->len);
++      return min_t(u16, hlen, skb_headlen(skb));
+ }
+ static inline void mlx5e_tx_skb_pull_inline(unsigned char **skb_data,
diff --git a/queue-4.15/net-phy-fix-phy_start-to-consider-phy_ignore_interrupt.patch b/queue-4.15/net-phy-fix-phy_start-to-consider-phy_ignore_interrupt.patch
new file mode 100644 (file)
index 0000000..e0463b3
--- /dev/null
@@ -0,0 +1,34 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Heiner Kallweit <hkallweit1@gmail.com>
+Date: Thu, 8 Feb 2018 21:01:48 +0100
+Subject: net: phy: fix phy_start to consider PHY_IGNORE_INTERRUPT
+
+From: Heiner Kallweit <hkallweit1@gmail.com>
+
+
+[ Upstream commit 08f5138512180a479ce6b9d23b825c9f4cd3be77 ]
+
+This condition wasn't adjusted when PHY_IGNORE_INTERRUPT (-2) was added
+long ago. In case of PHY_IGNORE_INTERRUPT the MAC interrupt indicates
+also PHY state changes and we should do what the symbol says.
+
+Fixes: 84a527a41f38 ("net: phylib: fix interrupts re-enablement in phy_start")
+Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
+Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/phy/phy.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/net/phy/phy.c
++++ b/drivers/net/phy/phy.c
+@@ -844,7 +844,7 @@ void phy_start(struct phy_device *phydev
+               phy_resume(phydev);
+               /* make sure interrupts are re-enabled for the PHY */
+-              if (phydev->irq != PHY_POLL) {
++              if (phy_interrupt_is_valid(phydev)) {
+                       err = phy_enable_interrupts(phydev);
+                       if (err < 0)
+                               break;
diff --git a/queue-4.15/net-phy-restore-phy_resume-locking-assumption.patch b/queue-4.15/net-phy-restore-phy_resume-locking-assumption.patch
new file mode 100644 (file)
index 0000000..2cdaf5e
--- /dev/null
@@ -0,0 +1,101 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Andrew Lunn <andrew@lunn.ch>
+Date: Tue, 27 Feb 2018 01:56:06 +0100
+Subject: net: phy: Restore phy_resume() locking assumption
+
+From: Andrew Lunn <andrew@lunn.ch>
+
+
+[ Upstream commit 9c2c2e62df3fa30fb13fbeb7512a4eede729383b ]
+
+commit f5e64032a799 ("net: phy: fix resume handling") changes the
+locking semantics for phy_resume() such that the caller now needs to
+hold the phy mutex. Not all call sites were adapted to this new
+semantic, resulting in warnings from the added
+WARN_ON(!mutex_is_locked(&phydev->lock)).  Rather than change the
+semantics, add a __phy_resume() and restore the old behavior of
+phy_resume().
+
+Reported-by: Heiner Kallweit <hkallweit1@gmail.com>
+Fixes: f5e64032a799 ("net: phy: fix resume handling")
+Signed-off-by: Andrew Lunn <andrew@lunn.ch>
+Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/phy/phy.c        |    2 +-
+ drivers/net/phy/phy_device.c |   18 +++++++++++++-----
+ include/linux/phy.h          |    1 +
+ 3 files changed, 15 insertions(+), 6 deletions(-)
+
+--- a/drivers/net/phy/phy.c
++++ b/drivers/net/phy/phy.c
+@@ -841,7 +841,7 @@ void phy_start(struct phy_device *phydev
+               break;
+       case PHY_HALTED:
+               /* if phy was suspended, bring the physical link up again */
+-              phy_resume(phydev);
++              __phy_resume(phydev);
+               /* make sure interrupts are re-enabled for the PHY */
+               if (phy_interrupt_is_valid(phydev)) {
+--- a/drivers/net/phy/phy_device.c
++++ b/drivers/net/phy/phy_device.c
+@@ -135,9 +135,7 @@ static int mdio_bus_phy_resume(struct de
+       if (!mdio_bus_phy_may_suspend(phydev))
+               goto no_resume;
+-      mutex_lock(&phydev->lock);
+       ret = phy_resume(phydev);
+-      mutex_unlock(&phydev->lock);
+       if (ret < 0)
+               return ret;
+@@ -1028,9 +1026,7 @@ int phy_attach_direct(struct net_device
+       if (err)
+               goto error;
+-      mutex_lock(&phydev->lock);
+       phy_resume(phydev);
+-      mutex_unlock(&phydev->lock);
+       phy_led_triggers_register(phydev);
+       return err;
+@@ -1156,7 +1152,7 @@ int phy_suspend(struct phy_device *phyde
+ }
+ EXPORT_SYMBOL(phy_suspend);
+-int phy_resume(struct phy_device *phydev)
++int __phy_resume(struct phy_device *phydev)
+ {
+       struct phy_driver *phydrv = to_phy_driver(phydev->mdio.dev.driver);
+       int ret = 0;
+@@ -1173,6 +1169,18 @@ int phy_resume(struct phy_device *phydev
+       return ret;
+ }
++EXPORT_SYMBOL(__phy_resume);
++
++int phy_resume(struct phy_device *phydev)
++{
++      int ret;
++
++      mutex_lock(&phydev->lock);
++      ret = __phy_resume(phydev);
++      mutex_unlock(&phydev->lock);
++
++      return ret;
++}
+ EXPORT_SYMBOL(phy_resume);
+ int phy_loopback(struct phy_device *phydev, bool enable)
+--- a/include/linux/phy.h
++++ b/include/linux/phy.h
+@@ -819,6 +819,7 @@ void phy_device_remove(struct phy_device
+ int phy_init_hw(struct phy_device *phydev);
+ int phy_suspend(struct phy_device *phydev);
+ int phy_resume(struct phy_device *phydev);
++int __phy_resume(struct phy_device *phydev);
+ int phy_loopback(struct phy_device *phydev, bool enable);
+ struct phy_device *phy_attach(struct net_device *dev, const char *bus_id,
+                             phy_interface_t interface);
diff --git a/queue-4.15/net-sched-cls_u32-fix-cls_u32-on-filter-replace.patch b/queue-4.15/net-sched-cls_u32-fix-cls_u32-on-filter-replace.patch
new file mode 100644 (file)
index 0000000..d3cf568
--- /dev/null
@@ -0,0 +1,50 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Ivan Vecera <ivecera@redhat.com>
+Date: Thu, 8 Feb 2018 16:10:39 +0100
+Subject: net/sched: cls_u32: fix cls_u32 on filter replace
+
+From: Ivan Vecera <ivecera@redhat.com>
+
+
+[ Upstream commit eb53f7af6f15285e2f6ada97285395343ce9f433 ]
+
+The following sequence is currently broken:
+
+ # tc qdisc add dev foo ingress
+ # tc filter replace dev foo protocol all ingress \
+   u32 match u8 0 0 action mirred egress mirror dev bar1
+ # tc filter replace dev foo protocol all ingress \
+   handle 800::800 pref 49152 \
+   u32 match u8 0 0 action mirred egress mirror dev bar2
+ Error: cls_u32: Key node flags do not match passed flags.
+ We have an error talking to the kernel, -1
+
+The error comes from u32_change() when comparing new and
+existing flags. The existing ones always contain one of the
+TCA_CLS_FLAGS_{,NOT}_IN_HW flags depending on offloading state.
+These flags cannot be passed from userspace so the condition
+(n->flags != flags) in u32_change() always fails.
+
+Fix the condition so the flags TCA_CLS_FLAGS_NOT_IN_HW and
+TCA_CLS_FLAGS_IN_HW are not taken into account.
+
+Fixes: 24d3dc6d27ea ("net/sched: cls_u32: Reflect HW offload status")
+Signed-off-by: Ivan Vecera <ivecera@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/sched/cls_u32.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/net/sched/cls_u32.c
++++ b/net/sched/cls_u32.c
+@@ -928,7 +928,8 @@ static int u32_change(struct net *net, s
+               if (TC_U32_KEY(n->handle) == 0)
+                       return -EINVAL;
+-              if (n->flags != flags)
++              if ((n->flags ^ flags) &
++                  ~(TCA_CLS_FLAGS_IN_HW | TCA_CLS_FLAGS_NOT_IN_HW))
+                       return -EINVAL;
+               new = u32_init_knode(tp, n);
diff --git a/queue-4.15/net-sched-report-if-filter-is-too-large-to-dump.patch b/queue-4.15/net-sched-report-if-filter-is-too-large-to-dump.patch
new file mode 100644 (file)
index 0000000..28cec55
--- /dev/null
@@ -0,0 +1,54 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Roman Kapl <code@rkapl.cz>
+Date: Mon, 19 Feb 2018 21:32:51 +0100
+Subject: net: sched: report if filter is too large to dump
+
+From: Roman Kapl <code@rkapl.cz>
+
+
+[ Upstream commit 5ae437ad5a2ed573b1ebb04e0afa70b8869f88dd ]
+
+So far, if the filter was too large to fit in the allocated skb, the
+kernel did not return any error and stopped dumping. Modify the dumper
+so that it returns -EMSGSIZE when a filter fails to dump and it is the
+first filter in the skb. If we are not first, we will get a next chance
+with more room.
+
+I understand this is pretty near to being an API change, but the
+original design (silent truncation) can be considered a bug.
+
+Note: The error case can happen pretty easily if you create a filter
+with 32 actions and have 4kb pages. Also recent versions of iproute try
+to be clever with their buffer allocation size, which in turn leads to
+
+Signed-off-by: Roman Kapl <code@rkapl.cz>
+Acked-by: Jiri Pirko <jiri@mellanox.com>
+Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/sched/cls_api.c |    7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+--- a/net/sched/cls_api.c
++++ b/net/sched/cls_api.c
+@@ -1054,13 +1054,18 @@ static int tc_dump_tfilter(struct sk_buf
+                   nla_get_u32(tca[TCA_CHAIN]) != chain->index)
+                       continue;
+               if (!tcf_chain_dump(chain, q, parent, skb, cb,
+-                                  index_start, &index))
++                                  index_start, &index)) {
++                      err = -EMSGSIZE;
+                       break;
++              }
+       }
+       cb->args[0] = index;
+ out:
++      /* If we did no progress, the error (EMSGSIZE) is real */
++      if (skb->len == 0 && err)
++              return err;
+       return skb->len;
+ }
diff --git a/queue-4.15/net_sched-gen_estimator-fix-broken-estimators-based-on-percpu-stats.patch b/queue-4.15/net_sched-gen_estimator-fix-broken-estimators-based-on-percpu-stats.patch
new file mode 100644 (file)
index 0000000..eae31bf
--- /dev/null
@@ -0,0 +1,37 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Eric Dumazet <edumazet@google.com>
+Date: Thu, 22 Feb 2018 19:45:27 -0800
+Subject: net_sched: gen_estimator: fix broken estimators based on percpu stats
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit a5f7add332b4ea6d4b9480971b3b0f5e66466ae9 ]
+
+pfifo_fast got percpu stats lately, uncovering a bug I introduced last
+year in linux-4.10.
+
+I missed the fact that we have to clear our temporary storage
+before calling __gnet_stats_copy_basic() in the case of percpu stats.
+
+Without this fix, rate estimators (tc qd replace dev xxx root est 1sec
+4sec pfifo_fast) are utterly broken.
+
+Fixes: 1c0d32fde5bd ("net_sched: gen_estimator: complete rewrite of rate estimators")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/gen_estimator.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/net/core/gen_estimator.c
++++ b/net/core/gen_estimator.c
+@@ -66,6 +66,7 @@ struct net_rate_estimator {
+ static void est_fetch_counters(struct net_rate_estimator *e,
+                              struct gnet_stats_basic_packed *b)
+ {
++      memset(b, 0, sizeof(*b));
+       if (e->stats_lock)
+               spin_lock(e->stats_lock);
diff --git a/queue-4.15/netlink-ensure-to-loop-over-all-netns-in-genlmsg_multicast_allns.patch b/queue-4.15/netlink-ensure-to-loop-over-all-netns-in-genlmsg_multicast_allns.patch
new file mode 100644 (file)
index 0000000..cb31fea
--- /dev/null
@@ -0,0 +1,62 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
+Date: Tue, 6 Feb 2018 14:48:32 +0100
+Subject: netlink: ensure to loop over all netns in genlmsg_multicast_allns()
+
+From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
+
+
+[ Upstream commit cb9f7a9a5c96a773bbc9c70660dc600cfff82f82 ]
+
+Nowadays, nlmsg_multicast() returns only 0 or -ESRCH but this was not the
+case when commit 134e63756d5f was pushed.
+However, there was no reason to stop the loop if a netns does not have
+listeners.
+Return -ESRCH only if there were no listeners in any netns.
+
+To avoid having the same problem in the future, I didn't take the
+assumption that nlmsg_multicast() returns only 0 or -ESRCH.
+
+Fixes: 134e63756d5f ("genetlink: make netns aware")
+CC: Johannes Berg <johannes.berg@intel.com>
+Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/netlink/genetlink.c |   12 ++++++++++--
+ 1 file changed, 10 insertions(+), 2 deletions(-)
+
+--- a/net/netlink/genetlink.c
++++ b/net/netlink/genetlink.c
+@@ -1081,6 +1081,7 @@ static int genlmsg_mcast(struct sk_buff
+ {
+       struct sk_buff *tmp;
+       struct net *net, *prev = NULL;
++      bool delivered = false;
+       int err;
+       for_each_net_rcu(net) {
+@@ -1092,14 +1093,21 @@ static int genlmsg_mcast(struct sk_buff
+                       }
+                       err = nlmsg_multicast(prev->genl_sock, tmp,
+                                             portid, group, flags);
+-                      if (err)
++                      if (!err)
++                              delivered = true;
++                      else if (err != -ESRCH)
+                               goto error;
+               }
+               prev = net;
+       }
+-      return nlmsg_multicast(prev->genl_sock, skb, portid, group, flags);
++      err = nlmsg_multicast(prev->genl_sock, skb, portid, group, flags);
++      if (!err)
++              delivered = true;
++      else if (err != -ESRCH)
++              goto error;
++      return delivered ? 0 : -ESRCH;
+  error:
+       kfree_skb(skb);
+       return err;
diff --git a/queue-4.15/netlink-put-module-reference-if-dump-start-fails.patch b/queue-4.15/netlink-put-module-reference-if-dump-start-fails.patch
new file mode 100644 (file)
index 0000000..017a28e
--- /dev/null
@@ -0,0 +1,48 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Wed, 21 Feb 2018 04:41:59 +0100
+Subject: netlink: put module reference if dump start fails
+
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+
+
+[ Upstream commit b87b6194be631c94785fe93398651e804ed43e28 ]
+
+Before, if cb->start() failed, the module reference would never be put,
+because cb->cb_running is intentionally false at this point. Users are
+generally annoyed by this because they can no longer unload modules that
+leak references. Also, it may be possible to tediously wrap a reference
+counter back to zero, especially since module.c still uses atomic_inc
+instead of refcount_inc.
+
+This patch expands the error path to simply call module_put if
+cb->start() fails.
+
+Fixes: 41c87425a1ac ("netlink: do not set cb_running if dump's start() errs")
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/netlink/af_netlink.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/net/netlink/af_netlink.c
++++ b/net/netlink/af_netlink.c
+@@ -2275,7 +2275,7 @@ int __netlink_dump_start(struct sock *ss
+       if (cb->start) {
+               ret = cb->start(cb);
+               if (ret)
+-                      goto error_unlock;
++                      goto error_put;
+       }
+       nlk->cb_running = true;
+@@ -2295,6 +2295,8 @@ int __netlink_dump_start(struct sock *ss
+        */
+       return -EINTR;
++error_put:
++      module_put(control->module);
+ error_unlock:
+       sock_put(sk);
+       mutex_unlock(nlk->cb_mutex);
diff --git a/queue-4.15/ppp-prevent-unregistered-channels-from-connecting-to-ppp-units.patch b/queue-4.15/ppp-prevent-unregistered-channels-from-connecting-to-ppp-units.patch
new file mode 100644 (file)
index 0000000..b4bfb7e
--- /dev/null
@@ -0,0 +1,60 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Guillaume Nault <g.nault@alphalink.fr>
+Date: Fri, 2 Mar 2018 18:41:16 +0100
+Subject: ppp: prevent unregistered channels from connecting to PPP units
+
+From: Guillaume Nault <g.nault@alphalink.fr>
+
+
+[ Upstream commit 77f840e3e5f09c6d7d727e85e6e08276dd813d11 ]
+
+PPP units don't hold any reference on the channels connected to them.
+It is the channel's responsibility to ensure that it disconnects from
+its unit before being destroyed.
+In practice, this is ensured by ppp_unregister_channel() disconnecting
+the channel from the unit before dropping a reference on the channel.
+
+However, it is possible for an unregistered channel to connect to a PPP
+unit: register a channel with ppp_register_net_channel(), attach a
+/dev/ppp file to it with ioctl(PPPIOCATTCHAN), unregister the channel
+with ppp_unregister_channel() and finally connect the /dev/ppp file to
+a PPP unit with ioctl(PPPIOCCONNECT).
+
+Once in this situation, the channel is only held by the /dev/ppp file,
+which can be released at any time and free the channel without letting
+the parent PPP unit know. Then the ppp structure ends up with dangling
+pointers in its ->channels list.
+
+Prevent this scenario by forbidding unregistered channels from
+connecting to PPP units. This maintains the code logic by keeping
+ppp_unregister_channel() responsible for disconnecting the channel if
+necessary and avoids modification on the reference counting mechanism.
+
+This issue seems to predate git history (successfully reproduced on
+Linux 2.6.26 and earlier PPP commits are unrelated).
+
+Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ppp/ppp_generic.c |    9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+--- a/drivers/net/ppp/ppp_generic.c
++++ b/drivers/net/ppp/ppp_generic.c
+@@ -3161,6 +3161,15 @@ ppp_connect_channel(struct channel *pch,
+               goto outl;
+       ppp_lock(ppp);
++      spin_lock_bh(&pch->downl);
++      if (!pch->chan) {
++              /* Don't connect unregistered channels */
++              spin_unlock_bh(&pch->downl);
++              ppp_unlock(ppp);
++              ret = -ENOTCONN;
++              goto outl;
++      }
++      spin_unlock_bh(&pch->downl);
+       if (pch->file.hdrlen > ppp->file.hdrlen)
+               ppp->file.hdrlen = pch->file.hdrlen;
+       hdrlen = pch->file.hdrlen + 2;  /* for protocol bytes */
diff --git a/queue-4.15/revert-s390-qeth-fix-using-of-ref-counter-for-rxip-addresses.patch b/queue-4.15/revert-s390-qeth-fix-using-of-ref-counter-for-rxip-addresses.patch
new file mode 100644 (file)
index 0000000..07dcde7
--- /dev/null
@@ -0,0 +1,51 @@
+From foo@baz Tue Mar  6 19:02:57 PST 2018
+From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+Date: Tue, 27 Feb 2018 18:58:15 +0100
+Subject: Revert "s390/qeth: fix using of ref counter for rxip addresses"
+
+From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+
+
+[ Upstream commit 4964c66fd49b2e2342da35358f2ff74614bcbaee ]
+
+This reverts commit cb816192d986f7596009dedcf2201fe2e5bc2aa7.
+
+The issue this attempted to fix never actually occurs.
+l3_add_rxip() checks (via l3_ip_from_hash()) if the requested address
+was previously added to the card. If so, it returns -EEXIST and doesn't
+call l3_add_ip().
+As a result, the "address exists" path in l3_add_ip() is never taken
+for rxip addresses, and this patch had no effect.
+
+Fixes: cb816192d986 ("s390/qeth: fix using of ref counter for rxip addresses")
+Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/s390/net/qeth_l3_main.c |    8 +++-----
+ 1 file changed, 3 insertions(+), 5 deletions(-)
+
+--- a/drivers/s390/net/qeth_l3_main.c
++++ b/drivers/s390/net/qeth_l3_main.c
+@@ -250,8 +250,7 @@ int qeth_l3_delete_ip(struct qeth_card *
+               return -ENOENT;
+       addr->ref_counter--;
+-      if (addr->ref_counter > 0 && (addr->type == QETH_IP_TYPE_NORMAL ||
+-                                    addr->type == QETH_IP_TYPE_RXIP))
++      if (addr->type == QETH_IP_TYPE_NORMAL && addr->ref_counter > 0)
+               return rc;
+       if (addr->in_progress)
+               return -EINPROGRESS;
+@@ -329,9 +328,8 @@ int qeth_l3_add_ip(struct qeth_card *car
+                       kfree(addr);
+               }
+       } else {
+-              if (addr->type == QETH_IP_TYPE_NORMAL ||
+-                  addr->type == QETH_IP_TYPE_RXIP)
+-                      addr->ref_counter++;
++                      if (addr->type == QETH_IP_TYPE_NORMAL)
++                              addr->ref_counter++;
+       }
+       return rc;
diff --git a/queue-4.15/rxrpc-fix-send-in-rxrpc_send_data_packet.patch b/queue-4.15/rxrpc-fix-send-in-rxrpc_send_data_packet.patch
new file mode 100644 (file)
index 0000000..dc21bde
--- /dev/null
@@ -0,0 +1,37 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: David Howells <dhowells@redhat.com>
+Date: Thu, 22 Feb 2018 14:38:14 +0000
+Subject: rxrpc: Fix send in rxrpc_send_data_packet()
+
+From: David Howells <dhowells@redhat.com>
+
+
+[ Upstream commit 93c62c45ed5fad1b87e3a45835b251cd68de9c46 ]
+
+All the kernel_sendmsg() calls in rxrpc_send_data_packet() need to send
+both parts of the iov[] buffer, but one of them does not.  Fix it so that
+it does.
+
+Without this, short IPv6 rxrpc DATA packets may be seen that have the rxrpc
+header included, but no payload.
+
+Fixes: 5a924b8951f8 ("rxrpc: Don't store the rxrpc header in the Tx queue sk_buffs")
+Reported-by: Marc Dionne <marc.dionne@auristor.com>
+Signed-off-by: David Howells <dhowells@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/rxrpc/output.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/rxrpc/output.c
++++ b/net/rxrpc/output.c
+@@ -445,7 +445,7 @@ send_fragmentable:
+                                       (char *)&opt, sizeof(opt));
+               if (ret == 0) {
+                       ret = kernel_sendmsg(conn->params.local->socket, &msg,
+-                                           iov, 1, iov[0].iov_len);
++                                           iov, 2, len);
+                       opt = IPV6_PMTUDISC_DO;
+                       kernel_setsockopt(conn->params.local->socket,
diff --git a/queue-4.15/s390-qeth-fix-double-free-on-ip-add-remove-race.patch b/queue-4.15/s390-qeth-fix-double-free-on-ip-add-remove-race.patch
new file mode 100644 (file)
index 0000000..facf4d9
--- /dev/null
@@ -0,0 +1,47 @@
+From foo@baz Tue Mar  6 19:02:57 PST 2018
+From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+Date: Tue, 27 Feb 2018 18:58:14 +0100
+Subject: s390/qeth: fix double-free on IP add/remove race
+
+From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+
+
+[ Upstream commit 14d066c3531a87f727968cacd85bd95c75f59843 ]
+
+Registering an IPv4 address with the HW takes quite a while, so we
+temporarily drop the ip_htable lock. Any concurrent add/remove of the
+same IP adjusts the IP's use count, and (on remove) is then blocked by
+addr->in_progress.
+After the register call has completed, we check the use count for
+concurrently attempted add/remove calls - and possibly straight-away
+deregister the IP again. This happens via l3_delete_ip(), which
+1) looks up the queried IP in the htable (getting a reference to the
+   *same* queried object),
+2) deregisters the IP from the HW, and
+3) frees the IP object.
+
+The caller in l3_add_ip() then does a second free on the same object.
+
+For this case, skip all the extra checks and lookups in l3_delete_ip()
+and just deregister & free the IP object ourselves.
+
+Fixes: 5f78e29ceebf ("qeth: optimize IP handling in rx_mode callback")
+Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/s390/net/qeth_l3_main.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/drivers/s390/net/qeth_l3_main.c
++++ b/drivers/s390/net/qeth_l3_main.c
+@@ -320,7 +320,8 @@ int qeth_l3_add_ip(struct qeth_card *car
+                               (rc == IPA_RC_LAN_OFFLINE)) {
+                       addr->disp_flag = QETH_DISP_ADDR_DO_NOTHING;
+                       if (addr->ref_counter < 1) {
+-                              qeth_l3_delete_ip(card, addr);
++                              qeth_l3_deregister_addr_entry(card, addr);
++                              hash_del(&addr->hnode);
+                               kfree(addr);
+                       }
+               } else {
diff --git a/queue-4.15/s390-qeth-fix-ip-address-lookup-for-l3-devices.patch b/queue-4.15/s390-qeth-fix-ip-address-lookup-for-l3-devices.patch
new file mode 100644 (file)
index 0000000..8132c01
--- /dev/null
@@ -0,0 +1,255 @@
+From foo@baz Tue Mar  6 19:02:57 PST 2018
+From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+Date: Tue, 27 Feb 2018 18:58:16 +0100
+Subject: s390/qeth: fix IP address lookup for L3 devices
+
+From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+
+
+[ Upstream commit c5c48c58b259bb8f0482398370ee539d7a12df3e ]
+
+Current code ("qeth_l3_ip_from_hash()") matches a queried address object
+against objects in the IP table by IP address, Mask/Prefix Length and
+MAC address ("qeth_l3_ipaddrs_is_equal()"). But what callers actually
+require is either
+a) "is this IP address registered" (ie. match by IP address only),
+before adding a new address.
+b) or "is this address object registered" (ie. match all relevant
+   attributes), before deleting an address.
+
+Right now
+1. the ADD path is too strict in its lookup, and eg. doesn't detect
+conflicts between an existing NORMAL address and a new VIPA address
+(because the NORMAL address will have mask != 0, while VIPA has
+a mask == 0),
+2. the DELETE path is not strict enough, and eg. allows del_rxip() to
+delete a VIPA address as long as the IP address matches.
+
+Fix all this by adding helpers (_addr_match_ip() and _addr_match_all())
+that do the appropriate checking.
+
+Note that the ADD path for NORMAL addresses is special, as qeth keeps
+track of how many times such an address is in use (and there is no
+immediate way of returning errors to the caller). So when a requested
+NORMAL address _fully_ matches an existing one, it's not considered a
+conflict and we merely increment the refcount.
+
+Fixes: 5f78e29ceebf ("qeth: optimize IP handling in rx_mode callback")
+Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/s390/net/qeth_l3.h      |   34 ++++++++++++++
+ drivers/s390/net/qeth_l3_main.c |   91 ++++++++++++++++++----------------------
+ 2 files changed, 74 insertions(+), 51 deletions(-)
+
+--- a/drivers/s390/net/qeth_l3.h
++++ b/drivers/s390/net/qeth_l3.h
+@@ -40,8 +40,40 @@ struct qeth_ipaddr {
+                       unsigned int pfxlen;
+               } a6;
+       } u;
+-
+ };
++
++static inline bool qeth_l3_addr_match_ip(struct qeth_ipaddr *a1,
++                                       struct qeth_ipaddr *a2)
++{
++      if (a1->proto != a2->proto)
++              return false;
++      if (a1->proto == QETH_PROT_IPV6)
++              return ipv6_addr_equal(&a1->u.a6.addr, &a2->u.a6.addr);
++      return a1->u.a4.addr == a2->u.a4.addr;
++}
++
++static inline bool qeth_l3_addr_match_all(struct qeth_ipaddr *a1,
++                                        struct qeth_ipaddr *a2)
++{
++      /* Assumes that the pair was obtained via qeth_l3_addr_find_by_ip(),
++       * so 'proto' and 'addr' match for sure.
++       *
++       * For ucast:
++       * -    'mac' is always 0.
++       * -    'mask'/'pfxlen' for RXIP/VIPA is always 0. For NORMAL, matching
++       *      values are required to avoid mixups in takeover eligibility.
++       *
++       * For mcast,
++       * -    'mac' is mapped from the IP, and thus always matches.
++       * -    'mask'/'pfxlen' is always 0.
++       */
++      if (a1->type != a2->type)
++              return false;
++      if (a1->proto == QETH_PROT_IPV6)
++              return a1->u.a6.pfxlen == a2->u.a6.pfxlen;
++      return a1->u.a4.mask == a2->u.a4.mask;
++}
++
+ static inline  u64 qeth_l3_ipaddr_hash(struct qeth_ipaddr *addr)
+ {
+       u64  ret = 0;
+--- a/drivers/s390/net/qeth_l3_main.c
++++ b/drivers/s390/net/qeth_l3_main.c
+@@ -150,6 +150,24 @@ int qeth_l3_string_to_ipaddr(const char
+               return -EINVAL;
+ }
++static struct qeth_ipaddr *qeth_l3_find_addr_by_ip(struct qeth_card *card,
++                                                 struct qeth_ipaddr *query)
++{
++      u64 key = qeth_l3_ipaddr_hash(query);
++      struct qeth_ipaddr *addr;
++
++      if (query->is_multicast) {
++              hash_for_each_possible(card->ip_mc_htable, addr, hnode, key)
++                      if (qeth_l3_addr_match_ip(addr, query))
++                              return addr;
++      } else {
++              hash_for_each_possible(card->ip_htable,  addr, hnode, key)
++                      if (qeth_l3_addr_match_ip(addr, query))
++                              return addr;
++      }
++      return NULL;
++}
++
+ static void qeth_l3_convert_addr_to_bits(u8 *addr, u8 *bits, int len)
+ {
+       int i, j;
+@@ -203,34 +221,6 @@ static bool qeth_l3_is_addr_covered_by_i
+       return rc;
+ }
+-inline int
+-qeth_l3_ipaddrs_is_equal(struct qeth_ipaddr *addr1, struct qeth_ipaddr *addr2)
+-{
+-      return addr1->proto == addr2->proto &&
+-              !memcmp(&addr1->u, &addr2->u, sizeof(addr1->u))  &&
+-              !memcmp(&addr1->mac, &addr2->mac, sizeof(addr1->mac));
+-}
+-
+-static struct qeth_ipaddr *
+-qeth_l3_ip_from_hash(struct qeth_card *card, struct qeth_ipaddr *tmp_addr)
+-{
+-      struct qeth_ipaddr *addr;
+-
+-      if (tmp_addr->is_multicast) {
+-              hash_for_each_possible(card->ip_mc_htable,  addr,
+-                              hnode, qeth_l3_ipaddr_hash(tmp_addr))
+-                      if (qeth_l3_ipaddrs_is_equal(tmp_addr, addr))
+-                              return addr;
+-      } else {
+-              hash_for_each_possible(card->ip_htable,  addr,
+-                              hnode, qeth_l3_ipaddr_hash(tmp_addr))
+-                      if (qeth_l3_ipaddrs_is_equal(tmp_addr, addr))
+-                              return addr;
+-      }
+-
+-      return NULL;
+-}
+-
+ int qeth_l3_delete_ip(struct qeth_card *card, struct qeth_ipaddr *tmp_addr)
+ {
+       int rc = 0;
+@@ -245,8 +235,8 @@ int qeth_l3_delete_ip(struct qeth_card *
+               QETH_CARD_HEX(card, 4, ((char *)&tmp_addr->u.a6.addr) + 8, 8);
+       }
+-      addr = qeth_l3_ip_from_hash(card, tmp_addr);
+-      if (!addr)
++      addr = qeth_l3_find_addr_by_ip(card, tmp_addr);
++      if (!addr || !qeth_l3_addr_match_all(addr, tmp_addr))
+               return -ENOENT;
+       addr->ref_counter--;
+@@ -268,6 +258,7 @@ int qeth_l3_add_ip(struct qeth_card *car
+ {
+       int rc = 0;
+       struct qeth_ipaddr *addr;
++      char buf[40];
+       QETH_CARD_TEXT(card, 4, "addip");
+@@ -278,8 +269,20 @@ int qeth_l3_add_ip(struct qeth_card *car
+               QETH_CARD_HEX(card, 4, ((char *)&tmp_addr->u.a6.addr) + 8, 8);
+       }
+-      addr = qeth_l3_ip_from_hash(card, tmp_addr);
+-      if (!addr) {
++      addr = qeth_l3_find_addr_by_ip(card, tmp_addr);
++      if (addr) {
++              if (tmp_addr->type != QETH_IP_TYPE_NORMAL)
++                      return -EADDRINUSE;
++              if (qeth_l3_addr_match_all(addr, tmp_addr)) {
++                      addr->ref_counter++;
++                      return 0;
++              }
++              qeth_l3_ipaddr_to_string(tmp_addr->proto, (u8 *)&tmp_addr->u,
++                                       buf);
++              dev_warn(&card->gdev->dev,
++                       "Registering IP address %s failed\n", buf);
++              return -EADDRINUSE;
++      } else {
+               addr = qeth_l3_get_addr_buffer(tmp_addr->proto);
+               if (!addr)
+                       return -ENOMEM;
+@@ -327,11 +330,7 @@ int qeth_l3_add_ip(struct qeth_card *car
+                       hash_del(&addr->hnode);
+                       kfree(addr);
+               }
+-      } else {
+-                      if (addr->type == QETH_IP_TYPE_NORMAL)
+-                              addr->ref_counter++;
+       }
+-
+       return rc;
+ }
+@@ -715,12 +714,7 @@ int qeth_l3_add_vipa(struct qeth_card *c
+               return -ENOMEM;
+       spin_lock_bh(&card->ip_lock);
+-
+-      if (qeth_l3_ip_from_hash(card, ipaddr))
+-              rc = -EEXIST;
+-      else
+-              qeth_l3_add_ip(card, ipaddr);
+-
++      rc = qeth_l3_add_ip(card, ipaddr);
+       spin_unlock_bh(&card->ip_lock);
+       kfree(ipaddr);
+@@ -783,12 +777,7 @@ int qeth_l3_add_rxip(struct qeth_card *c
+               return -ENOMEM;
+       spin_lock_bh(&card->ip_lock);
+-
+-      if (qeth_l3_ip_from_hash(card, ipaddr))
+-              rc = -EEXIST;
+-      else
+-              qeth_l3_add_ip(card, ipaddr);
+-
++      rc = qeth_l3_add_ip(card, ipaddr);
+       spin_unlock_bh(&card->ip_lock);
+       kfree(ipaddr);
+@@ -1396,8 +1385,9 @@ qeth_l3_add_mc_to_hash(struct qeth_card
+               memcpy(tmp->mac, buf, sizeof(tmp->mac));
+               tmp->is_multicast = 1;
+-              ipm = qeth_l3_ip_from_hash(card, tmp);
++              ipm = qeth_l3_find_addr_by_ip(card, tmp);
+               if (ipm) {
++                      /* for mcast, by-IP match means full match */
+                       ipm->disp_flag = QETH_DISP_ADDR_DO_NOTHING;
+               } else {
+                       ipm = qeth_l3_get_addr_buffer(QETH_PROT_IPV4);
+@@ -1480,8 +1470,9 @@ qeth_l3_add_mc6_to_hash(struct qeth_card
+                      sizeof(struct in6_addr));
+               tmp->is_multicast = 1;
+-              ipm = qeth_l3_ip_from_hash(card, tmp);
++              ipm = qeth_l3_find_addr_by_ip(card, tmp);
+               if (ipm) {
++                      /* for mcast, by-IP match means full match */
+                       ipm->disp_flag = QETH_DISP_ADDR_DO_NOTHING;
+                       continue;
+               }
diff --git a/queue-4.15/s390-qeth-fix-ip-removal-on-offline-cards.patch b/queue-4.15/s390-qeth-fix-ip-removal-on-offline-cards.patch
new file mode 100644 (file)
index 0000000..3b52676
--- /dev/null
@@ -0,0 +1,58 @@
+From foo@baz Tue Mar  6 19:02:57 PST 2018
+From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+Date: Tue, 27 Feb 2018 18:58:13 +0100
+Subject: s390/qeth: fix IP removal on offline cards
+
+From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+
+
+[ Upstream commit 98d823ab1fbdcb13abc25b420f9bb71bade42056 ]
+
+If the HW is not reachable, then none of the IPs in qeth's internal
+table has been registered with the HW yet. So when deleting such an IP,
+there's no need to stage it for deregistration - just drop it from
+the table.
+
+This fixes the "add-delete-add" scenario on an offline card, where the
+second "add" merely increments the IP's use count. But as the IP is
+still set to DISP_ADDR_DELETE from the previous "delete" step,
+l3_recover_ip() won't register it with the HW when the card goes online.
+
+Fixes: 5f78e29ceebf ("qeth: optimize IP handling in rx_mode callback")
+Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/s390/net/qeth_l3_main.c |   14 +++-----------
+ 1 file changed, 3 insertions(+), 11 deletions(-)
+
+--- a/drivers/s390/net/qeth_l3_main.c
++++ b/drivers/s390/net/qeth_l3_main.c
+@@ -256,12 +256,8 @@ int qeth_l3_delete_ip(struct qeth_card *
+       if (addr->in_progress)
+               return -EINPROGRESS;
+-      if (!qeth_card_hw_is_reachable(card)) {
+-              addr->disp_flag = QETH_DISP_ADDR_DELETE;
+-              return 0;
+-      }
+-
+-      rc = qeth_l3_deregister_addr_entry(card, addr);
++      if (qeth_card_hw_is_reachable(card))
++              rc = qeth_l3_deregister_addr_entry(card, addr);
+       hash_del(&addr->hnode);
+       kfree(addr);
+@@ -404,11 +400,7 @@ static void qeth_l3_recover_ip(struct qe
+       spin_lock_bh(&card->ip_lock);
+       hash_for_each_safe(card->ip_htable, i, tmp, addr, hnode) {
+-              if (addr->disp_flag == QETH_DISP_ADDR_DELETE) {
+-                      qeth_l3_deregister_addr_entry(card, addr);
+-                      hash_del(&addr->hnode);
+-                      kfree(addr);
+-              } else if (addr->disp_flag == QETH_DISP_ADDR_ADD) {
++              if (addr->disp_flag == QETH_DISP_ADDR_ADD) {
+                       if (addr->proto == QETH_PROT_IPV4) {
+                               addr->in_progress = 1;
+                               spin_unlock_bh(&card->ip_lock);
diff --git a/queue-4.15/s390-qeth-fix-ipa-command-submission-race.patch b/queue-4.15/s390-qeth-fix-ipa-command-submission-race.patch
new file mode 100644 (file)
index 0000000..31adc90
--- /dev/null
@@ -0,0 +1,83 @@
+From foo@baz Tue Mar  6 19:02:57 PST 2018
+From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+Date: Tue, 27 Feb 2018 18:58:17 +0100
+Subject: s390/qeth: fix IPA command submission race
+
+From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+
+
+[ Upstream commit d22ffb5a712f9211ffd104c38fc17cbfb1b5e2b0 ]
+
+If multiple IPA commands are built & sent out concurrently,
+fill_ipacmd_header() may assign a seqno value to a command that's
+different from what send_control_data() later assigns to this command's
+reply.
+This is due to other commands passing through send_control_data(),
+and incrementing card->seqno.ipa along the way.
+
+So one IPA command has no reply that's waiting for its seqno, while some
+other IPA command has multiple reply objects waiting for it.
+Only one of those waiting replies wins, and the other(s) times out and
+triggers a recovery via send_ipa_cmd().
+
+Fix this by making sure that the same seqno value is assigned to
+a command and its reply object.
+Do so immediately before submitting the command & while holding the
+irq_pending "lock", to produce nicely ascending seqnos.
+
+As a side effect, *all* IPA commands now use a reply object that's
+waiting for its actual seqno. Previously, early IPA commands that were
+submitted while the card was still DOWN used the "catch-all" IDX seqno.
+
+Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/s390/net/qeth_core_main.c |   19 ++++++++++---------
+ 1 file changed, 10 insertions(+), 9 deletions(-)
+
+--- a/drivers/s390/net/qeth_core_main.c
++++ b/drivers/s390/net/qeth_core_main.c
+@@ -2071,24 +2071,25 @@ int qeth_send_control_data(struct qeth_c
+       }
+       reply->callback = reply_cb;
+       reply->param = reply_param;
+-      if (card->state == CARD_STATE_DOWN)
+-              reply->seqno = QETH_IDX_COMMAND_SEQNO;
+-      else
+-              reply->seqno = card->seqno.ipa++;
++
+       init_waitqueue_head(&reply->wait_q);
+-      spin_lock_irqsave(&card->lock, flags);
+-      list_add_tail(&reply->list, &card->cmd_waiter_list);
+-      spin_unlock_irqrestore(&card->lock, flags);
+       while (atomic_cmpxchg(&card->write.irq_pending, 0, 1)) ;
+-      qeth_prepare_control_data(card, len, iob);
+       if (IS_IPA(iob->data)) {
+               cmd = __ipa_cmd(iob);
++              cmd->hdr.seqno = card->seqno.ipa++;
++              reply->seqno = cmd->hdr.seqno;
+               event_timeout = QETH_IPA_TIMEOUT;
+       } else {
++              reply->seqno = QETH_IDX_COMMAND_SEQNO;
+               event_timeout = QETH_TIMEOUT;
+       }
++      qeth_prepare_control_data(card, len, iob);
++
++      spin_lock_irqsave(&card->lock, flags);
++      list_add_tail(&reply->list, &card->cmd_waiter_list);
++      spin_unlock_irqrestore(&card->lock, flags);
+       timeout = jiffies + event_timeout;
+@@ -2870,7 +2871,7 @@ static void qeth_fill_ipacmd_header(stru
+       memset(cmd, 0, sizeof(struct qeth_ipa_cmd));
+       cmd->hdr.command = command;
+       cmd->hdr.initiator = IPA_CMD_INITIATOR_HOST;
+-      cmd->hdr.seqno = card->seqno.ipa;
++      /* cmd->hdr.seqno is set by qeth_send_control_data() */
+       cmd->hdr.adapter_type = qeth_get_ipa_adp_type(card->info.link_type);
+       cmd->hdr.rel_adapter_no = (__u8) card->info.portno;
+       if (card->options.layer2)
diff --git a/queue-4.15/s390-qeth-fix-overestimated-count-of-buffer-elements.patch b/queue-4.15/s390-qeth-fix-overestimated-count-of-buffer-elements.patch
new file mode 100644 (file)
index 0000000..c8ad326
--- /dev/null
@@ -0,0 +1,74 @@
+From foo@baz Tue Mar  6 19:02:57 PST 2018
+From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+Date: Tue, 27 Feb 2018 18:58:12 +0100
+Subject: s390/qeth: fix overestimated count of buffer elements
+
+From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+
+
+[ Upstream commit 12472af89632beb1ed8dea29d4efe208ca05b06a ]
+
+qeth_get_elements_for_range() doesn't know how to handle a 0-length
+range (ie. start == end), and returns 1 when it should return 0.
+Such ranges occur on TSO skbs, where the L2/L3/L4 headers (and thus all
+of the skb's linear data) are skipped when mapping the skb into regular
+buffer elements.
+
+This overestimation may cause several performance-related issues:
+1. sub-optimal IO buffer selection, where the next buffer gets selected
+   even though the skb would actually still fit into the current buffer.
+2. forced linearization, if the element count for a non-linear skb
+   exceeds QETH_MAX_BUFFER_ELEMENTS.
+
+Rather than modifying qeth_get_elements_for_range() and adding overhead
+to every caller, fix up those callers that are at risk of passing a
+0-length range.
+
+Fixes: 2863c61334aa ("qeth: refactor calculation of SBALE count")
+Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/s390/net/qeth_core_main.c |   10 ++++++----
+ drivers/s390/net/qeth_l3_main.c   |   11 ++++++-----
+ 2 files changed, 12 insertions(+), 9 deletions(-)
+
+--- a/drivers/s390/net/qeth_core_main.c
++++ b/drivers/s390/net/qeth_core_main.c
+@@ -3835,10 +3835,12 @@ EXPORT_SYMBOL_GPL(qeth_get_elements_for_
+ int qeth_get_elements_no(struct qeth_card *card,
+                    struct sk_buff *skb, int extra_elems, int data_offset)
+ {
+-      int elements = qeth_get_elements_for_range(
+-                              (addr_t)skb->data + data_offset,
+-                              (addr_t)skb->data + skb_headlen(skb)) +
+-                      qeth_get_elements_for_frags(skb);
++      addr_t end = (addr_t)skb->data + skb_headlen(skb);
++      int elements = qeth_get_elements_for_frags(skb);
++      addr_t start = (addr_t)skb->data + data_offset;
++
++      if (start != end)
++              elements += qeth_get_elements_for_range(start, end);
+       if ((elements + extra_elems) > QETH_MAX_BUFFER_ELEMENTS(card)) {
+               QETH_DBF_MESSAGE(2, "Invalid size of IP packet "
+--- a/drivers/s390/net/qeth_l3_main.c
++++ b/drivers/s390/net/qeth_l3_main.c
+@@ -2629,11 +2629,12 @@ static void qeth_tso_fill_header(struct
+ static int qeth_l3_get_elements_no_tso(struct qeth_card *card,
+                       struct sk_buff *skb, int extra_elems)
+ {
+-      addr_t tcpdptr = (addr_t)tcp_hdr(skb) + tcp_hdrlen(skb);
+-      int elements = qeth_get_elements_for_range(
+-                              tcpdptr,
+-                              (addr_t)skb->data + skb_headlen(skb)) +
+-                              qeth_get_elements_for_frags(skb);
++      addr_t start = (addr_t)tcp_hdr(skb) + tcp_hdrlen(skb);
++      addr_t end = (addr_t)skb->data + skb_headlen(skb);
++      int elements = qeth_get_elements_for_frags(skb);
++
++      if (start != end)
++              elements += qeth_get_elements_for_range(start, end);
+       if ((elements + extra_elems) > QETH_MAX_BUFFER_ELEMENTS(card)) {
+               QETH_DBF_MESSAGE(2,
diff --git a/queue-4.15/s390-qeth-fix-setip-command-handling.patch b/queue-4.15/s390-qeth-fix-setip-command-handling.patch
new file mode 100644 (file)
index 0000000..54f19cd
--- /dev/null
@@ -0,0 +1,76 @@
+From foo@baz Tue Mar  6 19:02:57 PST 2018
+From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+Date: Fri, 9 Feb 2018 11:03:50 +0100
+Subject: s390/qeth: fix SETIP command handling
+
+From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+
+
+[ Upstream commit 1c5b2216fbb973a9410e0b06389740b5c1289171 ]
+
+send_control_data() applies some special handling to SETIP v4 IPA
+commands. But current code parses *all* command types for the SETIP
+command code. Limit the command code check to IPA commands.
+
+Fixes: 5b54e16f1a54 ("qeth: do not spin for SETIP ip assist command")
+Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/s390/net/qeth_core.h      |    5 +++++
+ drivers/s390/net/qeth_core_main.c |   14 ++++++++------
+ 2 files changed, 13 insertions(+), 6 deletions(-)
+
+--- a/drivers/s390/net/qeth_core.h
++++ b/drivers/s390/net/qeth_core.h
+@@ -581,6 +581,11 @@ struct qeth_cmd_buffer {
+       void (*callback) (struct qeth_channel *, struct qeth_cmd_buffer *);
+ };
++static inline struct qeth_ipa_cmd *__ipa_cmd(struct qeth_cmd_buffer *iob)
++{
++      return (struct qeth_ipa_cmd *)(iob->data + IPA_PDU_HEADER_SIZE);
++}
++
+ /**
+  * definition of a qeth channel, used for read and write
+  */
+--- a/drivers/s390/net/qeth_core_main.c
++++ b/drivers/s390/net/qeth_core_main.c
+@@ -2057,7 +2057,7 @@ int qeth_send_control_data(struct qeth_c
+       unsigned long flags;
+       struct qeth_reply *reply = NULL;
+       unsigned long timeout, event_timeout;
+-      struct qeth_ipa_cmd *cmd;
++      struct qeth_ipa_cmd *cmd = NULL;
+       QETH_CARD_TEXT(card, 2, "sendctl");
+@@ -2083,10 +2083,13 @@ int qeth_send_control_data(struct qeth_c
+       while (atomic_cmpxchg(&card->write.irq_pending, 0, 1)) ;
+       qeth_prepare_control_data(card, len, iob);
+-      if (IS_IPA(iob->data))
++      if (IS_IPA(iob->data)) {
++              cmd = __ipa_cmd(iob);
+               event_timeout = QETH_IPA_TIMEOUT;
+-      else
++      } else {
+               event_timeout = QETH_TIMEOUT;
++      }
++
+       timeout = jiffies + event_timeout;
+       QETH_CARD_TEXT(card, 6, "noirqpnd");
+@@ -2111,9 +2114,8 @@ int qeth_send_control_data(struct qeth_c
+       /* we have only one long running ipassist, since we can ensure
+          process context of this command we can sleep */
+-      cmd = (struct qeth_ipa_cmd *)(iob->data+IPA_PDU_HEADER_SIZE);
+-      if ((cmd->hdr.command == IPA_CMD_SETIP) &&
+-          (cmd->hdr.prot_version == QETH_PROT_IPV4)) {
++      if (cmd && cmd->hdr.command == IPA_CMD_SETIP &&
++          cmd->hdr.prot_version == QETH_PROT_IPV4) {
+               if (!wait_event_timeout(reply->wait_q,
+                   atomic_read(&reply->received), event_timeout))
+                       goto time_err;
diff --git a/queue-4.15/s390-qeth-fix-underestimated-count-of-buffer-elements.patch b/queue-4.15/s390-qeth-fix-underestimated-count-of-buffer-elements.patch
new file mode 100644 (file)
index 0000000..8058639
--- /dev/null
@@ -0,0 +1,40 @@
+From foo@baz Tue Mar  6 19:02:57 PST 2018
+From: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Date: Fri, 9 Feb 2018 11:03:49 +0100
+Subject: s390/qeth: fix underestimated count of buffer elements
+
+From: Ursula Braun <ubraun@linux.vnet.ibm.com>
+
+
+[ Upstream commit 89271c65edd599207dd982007900506283c90ae3 ]
+
+For a memory range/skb where the last byte falls onto a page boundary
+(ie. 'end' is of the form xxx...xxx001), the PFN_UP() part of the
+calculation currently doesn't round up to the next PFN due to an
+off-by-one error.
+Thus qeth believes that the skb occupies one page less than it
+actually does, and may select an IO buffer that doesn't have enough spare
+buffer elements to fit all of the skb's data.
+HW detects this as a malformed buffer descriptor, and raises an
+exception which then triggers device recovery.
+
+Fixes: 2863c61334aa ("qeth: refactor calculation of SBALE count")
+Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
+Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/s390/net/qeth_core.h |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/s390/net/qeth_core.h
++++ b/drivers/s390/net/qeth_core.h
+@@ -836,7 +836,7 @@ struct qeth_trap_id {
+  */
+ static inline int qeth_get_elements_for_range(addr_t start, addr_t end)
+ {
+-      return PFN_UP(end - 1) - PFN_DOWN(start);
++      return PFN_UP(end) - PFN_DOWN(start);
+ }
+ static inline int qeth_get_micros(void)
diff --git a/queue-4.15/sctp-do-not-pr_err-for-the-duplicated-node-in-transport-rhlist.patch b/queue-4.15/sctp-do-not-pr_err-for-the-duplicated-node-in-transport-rhlist.patch
new file mode 100644 (file)
index 0000000..e4016ef
--- /dev/null
@@ -0,0 +1,49 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Xin Long <lucien.xin@gmail.com>
+Date: Mon, 12 Feb 2018 18:29:06 +0800
+Subject: sctp: do not pr_err for the duplicated node in transport rhlist
+
+From: Xin Long <lucien.xin@gmail.com>
+
+
+[ Upstream commit 27af86bb038d9c8b8066cd17854ddaf2ea92bce1 ]
+
+The pr_err in sctp_hash_transport was supposed to report a sctp bug
+for using rhashtable/rhlist.
+
+The err '-EEXIST' introduced in Commit cd2b70875058 ("sctp: check
+duplicate node before inserting a new transport") doesn't belong
+to that case.
+
+So just return -EEXIST without emitting any kernel message via pr_err.
+
+Fixes: cd2b70875058 ("sctp: check duplicate node before inserting a new transport")
+Reported-by: Wei Chen <weichen@redhat.com>
+Signed-off-by: Xin Long <lucien.xin@gmail.com>
+Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Acked-by: Neil Horman <nhorman@tuxdriver.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/sctp/input.c |    5 +----
+ 1 file changed, 1 insertion(+), 4 deletions(-)
+
+--- a/net/sctp/input.c
++++ b/net/sctp/input.c
+@@ -897,15 +897,12 @@ int sctp_hash_transport(struct sctp_tran
+       rhl_for_each_entry_rcu(transport, tmp, list, node)
+               if (transport->asoc->ep == t->asoc->ep) {
+                       rcu_read_unlock();
+-                      err = -EEXIST;
+-                      goto out;
++                      return -EEXIST;
+               }
+       rcu_read_unlock();
+       err = rhltable_insert_key(&sctp_transport_hashtable, &arg,
+                                 &t->node, sctp_hash_params);
+-
+-out:
+       if (err)
+               pr_err_once("insert transport fail, errno %d\n", err);
diff --git a/queue-4.15/sctp-fix-dst-refcnt-leak-in-sctp_v4_get_dst.patch b/queue-4.15/sctp-fix-dst-refcnt-leak-in-sctp_v4_get_dst.patch
new file mode 100644 (file)
index 0000000..eb919c6
--- /dev/null
@@ -0,0 +1,86 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Tommi Rantala <tommi.t.rantala@nokia.com>
+Date: Mon, 5 Feb 2018 21:48:14 +0200
+Subject: sctp: fix dst refcnt leak in sctp_v4_get_dst
+
+From: Tommi Rantala <tommi.t.rantala@nokia.com>
+
+
+[ Upstream commit 4a31a6b19f9ddf498c81f5c9b089742b7472a6f8 ]
+
+Fix dst reference count leak in sctp_v4_get_dst() introduced in commit
+410f03831 ("sctp: add routing output fallback"):
+
+When walking the address_list, successive ip_route_output_key() calls
+may return the same rt->dst with the reference incremented on each call.
+
+The code would not decrement the dst refcount when the dst pointer was
+identical from the previous iteration, causing the dst refcnt leak.
+
+Testcase:
+  ip netns add TEST
+  ip netns exec TEST ip link set lo up
+  ip link add dummy0 type dummy
+  ip link add dummy1 type dummy
+  ip link add dummy2 type dummy
+  ip link set dev dummy0 netns TEST
+  ip link set dev dummy1 netns TEST
+  ip link set dev dummy2 netns TEST
+  ip netns exec TEST ip addr add 192.168.1.1/24 dev dummy0
+  ip netns exec TEST ip link set dummy0 up
+  ip netns exec TEST ip addr add 192.168.1.2/24 dev dummy1
+  ip netns exec TEST ip link set dummy1 up
+  ip netns exec TEST ip addr add 192.168.1.3/24 dev dummy2
+  ip netns exec TEST ip link set dummy2 up
+  ip netns exec TEST sctp_test -H 192.168.1.2 -P 20002 -h 192.168.1.1 -p 20000 -s -B 192.168.1.3
+  ip netns del TEST
+
+In 4.4 and 4.9 kernels this results in:
+  [  354.179591] unregister_netdevice: waiting for lo to become free. Usage count = 1
+  [  364.419674] unregister_netdevice: waiting for lo to become free. Usage count = 1
+  [  374.663664] unregister_netdevice: waiting for lo to become free. Usage count = 1
+  [  384.903717] unregister_netdevice: waiting for lo to become free. Usage count = 1
+  [  395.143724] unregister_netdevice: waiting for lo to become free. Usage count = 1
+  [  405.383645] unregister_netdevice: waiting for lo to become free. Usage count = 1
+  ...
+
+Fixes: 410f03831 ("sctp: add routing output fallback")
+Fixes: 0ca50d12f ("sctp: fix src address selection if using secondary addresses")
+Signed-off-by: Tommi Rantala <tommi.t.rantala@nokia.com>
+Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Acked-by: Neil Horman <nhorman@tuxdriver.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/sctp/protocol.c |   10 ++++------
+ 1 file changed, 4 insertions(+), 6 deletions(-)
+
+--- a/net/sctp/protocol.c
++++ b/net/sctp/protocol.c
+@@ -514,22 +514,20 @@ static void sctp_v4_get_dst(struct sctp_
+               if (IS_ERR(rt))
+                       continue;
+-              if (!dst)
+-                      dst = &rt->dst;
+-
+               /* Ensure the src address belongs to the output
+                * interface.
+                */
+               odev = __ip_dev_find(sock_net(sk), laddr->a.v4.sin_addr.s_addr,
+                                    false);
+               if (!odev || odev->ifindex != fl4->flowi4_oif) {
+-                      if (&rt->dst != dst)
++                      if (!dst)
++                              dst = &rt->dst;
++                      else
+                               dst_release(&rt->dst);
+                       continue;
+               }
+-              if (dst != &rt->dst)
+-                      dst_release(dst);
++              dst_release(dst);
+               dst = &rt->dst;
+               break;
+       }
diff --git a/queue-4.15/sctp-fix-dst-refcnt-leak-in-sctp_v6_get_dst.patch b/queue-4.15/sctp-fix-dst-refcnt-leak-in-sctp_v6_get_dst.patch
new file mode 100644 (file)
index 0000000..c7aa19f
--- /dev/null
@@ -0,0 +1,57 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Alexey Kodanev <alexey.kodanev@oracle.com>
+Date: Mon, 5 Feb 2018 15:10:35 +0300
+Subject: sctp: fix dst refcnt leak in sctp_v6_get_dst()
+
+From: Alexey Kodanev <alexey.kodanev@oracle.com>
+
+
+[ Upstream commit 957d761cf91cdbb175ad7d8f5472336a4d54dbf2 ]
+
+When going through the bind address list in sctp_v6_get_dst() and
+the previously found address is better ('matchlen > bmatchlen'),
+the code continues to the next iteration without releasing currently
+held destination.
+
+Fix it by releasing 'bdst' before continuing to the next iteration, and
+instead of introducing one more '!IS_ERR(bdst)' check for dst_release(),
+move the already existing one right after ip6_dst_lookup_flow(), i.e. we
+shouldn't proceed further if we get an error for the route lookup.
+
+Fixes: dbc2b5e9a09e ("sctp: fix src address selection if using secondary addresses for ipv6")
+Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
+Acked-by: Neil Horman <nhorman@tuxdriver.com>
+Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/sctp/ipv6.c |   10 +++++++---
+ 1 file changed, 7 insertions(+), 3 deletions(-)
+
+--- a/net/sctp/ipv6.c
++++ b/net/sctp/ipv6.c
+@@ -326,8 +326,10 @@ static void sctp_v6_get_dst(struct sctp_
+               final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
+               bdst = ip6_dst_lookup_flow(sk, fl6, final_p);
+-              if (!IS_ERR(bdst) &&
+-                  ipv6_chk_addr(dev_net(bdst->dev),
++              if (IS_ERR(bdst))
++                      continue;
++
++              if (ipv6_chk_addr(dev_net(bdst->dev),
+                                 &laddr->a.v6.sin6_addr, bdst->dev, 1)) {
+                       if (!IS_ERR_OR_NULL(dst))
+                               dst_release(dst);
+@@ -336,8 +338,10 @@ static void sctp_v6_get_dst(struct sctp_
+               }
+               bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a);
+-              if (matchlen > bmatchlen)
++              if (matchlen > bmatchlen) {
++                      dst_release(bdst);
+                       continue;
++              }
+               if (!IS_ERR_OR_NULL(dst))
+                       dst_release(dst);
diff --git a/queue-4.15/sctp-verify-size-of-a-new-chunk-in-_sctp_make_chunk.patch b/queue-4.15/sctp-verify-size-of-a-new-chunk-in-_sctp_make_chunk.patch
new file mode 100644 (file)
index 0000000..29ac84c
--- /dev/null
@@ -0,0 +1,86 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Alexey Kodanev <alexey.kodanev@oracle.com>
+Date: Fri, 9 Feb 2018 17:35:23 +0300
+Subject: sctp: verify size of a new chunk in _sctp_make_chunk()
+
+From: Alexey Kodanev <alexey.kodanev@oracle.com>
+
+
+[ Upstream commit 07f2c7ab6f8d0a7e7c5764c4e6cc9c52951b9d9c ]
+
+When SCTP makes INIT or INIT_ACK packet the total chunk length
+can exceed SCTP_MAX_CHUNK_LEN which leads to kernel panic when
+transmitting these packets, e.g. the crash on sending INIT_ACK:
+
+[  597.804948] skbuff: skb_over_panic: text:00000000ffae06e4 len:120168
+               put:120156 head:000000007aa47635 data:00000000d991c2de
+               tail:0x1d640 end:0xfec0 dev:<NULL>
+...
+[  597.976970] ------------[ cut here ]------------
+[  598.033408] kernel BUG at net/core/skbuff.c:104!
+[  600.314841] Call Trace:
+[  600.345829]  <IRQ>
+[  600.371639]  ? sctp_packet_transmit+0x2095/0x26d0 [sctp]
+[  600.436934]  skb_put+0x16c/0x200
+[  600.477295]  sctp_packet_transmit+0x2095/0x26d0 [sctp]
+[  600.540630]  ? sctp_packet_config+0x890/0x890 [sctp]
+[  600.601781]  ? __sctp_packet_append_chunk+0x3b4/0xd00 [sctp]
+[  600.671356]  ? sctp_cmp_addr_exact+0x3f/0x90 [sctp]
+[  600.731482]  sctp_outq_flush+0x663/0x30d0 [sctp]
+[  600.788565]  ? sctp_make_init+0xbf0/0xbf0 [sctp]
+[  600.845555]  ? sctp_check_transmitted+0x18f0/0x18f0 [sctp]
+[  600.912945]  ? sctp_outq_tail+0x631/0x9d0 [sctp]
+[  600.969936]  sctp_cmd_interpreter.isra.22+0x3be1/0x5cb0 [sctp]
+[  601.041593]  ? sctp_sf_do_5_1B_init+0x85f/0xc30 [sctp]
+[  601.104837]  ? sctp_generate_t1_cookie_event+0x20/0x20 [sctp]
+[  601.175436]  ? sctp_eat_data+0x1710/0x1710 [sctp]
+[  601.233575]  sctp_do_sm+0x182/0x560 [sctp]
+[  601.284328]  ? sctp_has_association+0x70/0x70 [sctp]
+[  601.345586]  ? sctp_rcv+0xef4/0x32f0 [sctp]
+[  601.397478]  ? sctp6_rcv+0xa/0x20 [sctp]
+...
+
+Here the chunk size for INIT_ACK packet becomes too big, mostly
+because of the state cookie (INIT packet has large size with
+many address parameters), plus additional server parameters.
+
+Later this chunk causes the panic in skb_put_data():
+
+  sctp_packet_transmit()
+      sctp_packet_pack()
+          skb_put_data(nskb, chunk->skb->data, chunk->skb->len);
+
+'nskb' (head skb) was previously allocated with packet->size
+from u16 'chunk->chunk_hdr->length'.
+
+As suggested by Marcelo we should check the chunk's length in
+_sctp_make_chunk() before trying to allocate skb for it and
+discard a chunk if its size is bigger than SCTP_MAX_CHUNK_LEN.
+
+Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
+Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Acked-by: Neil Horman <nhorman@tuxdriver.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/sctp/sm_make_chunk.c |    7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+--- a/net/sctp/sm_make_chunk.c
++++ b/net/sctp/sm_make_chunk.c
+@@ -1378,9 +1378,14 @@ static struct sctp_chunk *_sctp_make_chu
+       struct sctp_chunk *retval;
+       struct sk_buff *skb;
+       struct sock *sk;
++      int chunklen;
++
++      chunklen = SCTP_PAD4(sizeof(*chunk_hdr) + paylen);
++      if (chunklen > SCTP_MAX_CHUNK_LEN)
++              goto nodata;
+       /* No need to allocate LL here, as this is only a chunk. */
+-      skb = alloc_skb(SCTP_PAD4(sizeof(*chunk_hdr) + paylen), gfp);
++      skb = alloc_skb(chunklen, gfp);
+       if (!skb)
+               goto nodata;
index 6f2f35f849ae105a380ac2b0eb335fc0e2cbc1cd..d278d34b0091ad6b1cbbaf1198be738f9a81bb7a 100644 (file)
@@ -36,3 +36,62 @@ direct-io-fix-sleep-in-atomic-due-to-sync-aio.patch
 x86-xen-zero-msr_ia32_spec_ctrl-before-suspend.patch
 x86-platform-intel-mid-handle-intel-edison-reboot-correctly.patch
 x86-cpu_entry_area-sync-cpu_entry_area-to-initial_page_table.patch
+bridge-check-brport-attr-show-in-brport_show.patch
+fib_semantics-don-t-match-route-with-mismatching-tclassid.patch
+hdlc_ppp-carrier-detect-ok-don-t-turn-off-negotiation.patch
+ipv6-sit-work-around-bogus-gcc-8-wrestrict-warning.patch
+net-amd-xgbe-fix-comparison-to-bitshift-when-dealing-with-a-mask.patch
+net-ethernet-ti-cpsw-fix-net-watchdog-timeout.patch
+net-fix-race-on-decreasing-number-of-tx-queues.patch
+net-ipv4-don-t-allow-setting-net.ipv4.route.min_pmtu-below-68.patch
+netlink-ensure-to-loop-over-all-netns-in-genlmsg_multicast_allns.patch
+net-sched-report-if-filter-is-too-large-to-dump.patch
+ppp-prevent-unregistered-channels-from-connecting-to-ppp-units.patch
+sctp-verify-size-of-a-new-chunk-in-_sctp_make_chunk.patch
+udplite-fix-partial-checksum-initialization.patch
+net-mlx5e-fix-tcp-checksum-in-lro-buffers.patch
+sctp-fix-dst-refcnt-leak-in-sctp_v4_get_dst.patch
+mlxsw-spectrum_switchdev-check-success-of-fdb-add-operation.patch
+net-mlx5e-specify-numa-node-when-allocating-drop-rq.patch
+net-phy-fix-phy_start-to-consider-phy_ignore_interrupt.patch
+tcp-honor-the-eor-bit-in-tcp_mtu_probe.patch
+rxrpc-fix-send-in-rxrpc_send_data_packet.patch
+tcp_bbr-better-deal-with-suboptimal-gso.patch
+doc-change-the-min-default-value-of-tcp_wmem-tcp_rmem.patch
+net-mlx5e-fix-loopback-self-test-when-gro-is-off.patch
+net_sched-gen_estimator-fix-broken-estimators-based-on-percpu-stats.patch
+net-sched-cls_u32-fix-cls_u32-on-filter-replace.patch
+sctp-do-not-pr_err-for-the-duplicated-node-in-transport-rhlist.patch
+mlxsw-spectrum_router-fix-error-path-in-mlxsw_sp_vr_create.patch
+net-ipv4-set-addr_type-in-hash_keys-for-forwarded-case.patch
+sctp-fix-dst-refcnt-leak-in-sctp_v6_get_dst.patch
+bridge-fix-vlan-reference-count-problem.patch
+net-mlx5e-verify-inline-header-size-do-not-exceed-skb-linear-size.patch
+tls-use-correct-sk-sk_prot-for-ipv6.patch
+amd-xgbe-restore-pci-interrupt-enablement-setting-on-resume.patch
+cls_u32-fix-use-after-free-in-u32_destroy_key.patch
+mlxsw-spectrum_router-do-not-unconditionally-clear-route-offload-indication.patch
+netlink-put-module-reference-if-dump-start-fails.patch
+tcp-purge-write-queue-upon-rst.patch
+tuntap-correctly-add-the-missing-xdp-flush.patch
+tuntap-disable-preemption-during-xdp-processing.patch
+virtio-net-disable-napi-only-when-enabled-during-xdp-set.patch
+cxgb4-fix-trailing-zero-in-cim-la-dump.patch
+net-mlx5-fix-error-handling-when-adding-flow-rules.patch
+net-phy-restore-phy_resume-locking-assumption.patch
+tcp-tracepoint-only-call-trace_tcp_send_reset-with-full-socket.patch
+l2tp-don-t-use-inet_shutdown-on-tunnel-destroy.patch
+l2tp-don-t-use-inet_shutdown-on-ppp-session-destroy.patch
+l2tp-fix-races-with-tunnel-socket-close.patch
+l2tp-fix-race-in-pppol2tp_release-with-session-object-destroy.patch
+l2tp-fix-tunnel-lookup-use-after-free-race.patch
+s390-qeth-fix-underestimated-count-of-buffer-elements.patch
+s390-qeth-fix-setip-command-handling.patch
+s390-qeth-fix-overestimated-count-of-buffer-elements.patch
+s390-qeth-fix-ip-removal-on-offline-cards.patch
+s390-qeth-fix-double-free-on-ip-add-remove-race.patch
+revert-s390-qeth-fix-using-of-ref-counter-for-rxip-addresses.patch
+s390-qeth-fix-ip-address-lookup-for-l3-devices.patch
+s390-qeth-fix-ipa-command-submission-race.patch
+tcp-revert-f-rto-middle-box-workaround.patch
+tcp-revert-f-rto-extension-to-detect-more-spurious-timeouts.patch
diff --git a/queue-4.15/tcp-honor-the-eor-bit-in-tcp_mtu_probe.patch b/queue-4.15/tcp-honor-the-eor-bit-in-tcp_mtu_probe.patch
new file mode 100644 (file)
index 0000000..a8897a4
--- /dev/null
@@ -0,0 +1,69 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Ilya Lesokhin <ilyal@mellanox.com>
+Date: Mon, 12 Feb 2018 12:57:04 +0200
+Subject: tcp: Honor the eor bit in tcp_mtu_probe
+
+From: Ilya Lesokhin <ilyal@mellanox.com>
+
+
+[ Upstream commit 808cf9e38cd7923036a99f459ccc8cf2955e47af ]
+
+Avoid SKB coalescing if eor bit is set in one of the relevant
+SKBs.
+
+Fixes: c134ecb87817 ("tcp: Make use of MSG_EOR in tcp_sendmsg")
+Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_output.c |   25 +++++++++++++++++++++++++
+ 1 file changed, 25 insertions(+)
+
+--- a/net/ipv4/tcp_output.c
++++ b/net/ipv4/tcp_output.c
+@@ -2026,6 +2026,24 @@ static inline void tcp_mtu_check_reprobe
+       }
+ }
++static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
++{
++      struct sk_buff *skb, *next;
++
++      skb = tcp_send_head(sk);
++      tcp_for_write_queue_from_safe(skb, next, sk) {
++              if (len <= skb->len)
++                      break;
++
++              if (unlikely(TCP_SKB_CB(skb)->eor))
++                      return false;
++
++              len -= skb->len;
++      }
++
++      return true;
++}
++
+ /* Create a new MTU probe if we are ready.
+  * MTU probe is regularly attempting to increase the path MTU by
+  * deliberately sending larger packets.  This discovers routing
+@@ -2098,6 +2116,9 @@ static int tcp_mtu_probe(struct sock *sk
+                       return 0;
+       }
++      if (!tcp_can_coalesce_send_queue_head(sk, probe_size))
++              return -1;
++
+       /* We're allowed to probe.  Build it now. */
+       nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
+       if (!nskb)
+@@ -2133,6 +2154,10 @@ static int tcp_mtu_probe(struct sock *sk
+                       /* We've eaten all the data from this skb.
+                        * Throw it away. */
+                       TCP_SKB_CB(nskb)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
++                      /* If this is the last SKB we copy and eor is set
++                       * we need to propagate it to the new skb.
++                       */
++                      TCP_SKB_CB(nskb)->eor = TCP_SKB_CB(skb)->eor;
+                       tcp_unlink_write_queue(skb, sk);
+                       sk_wmem_free_skb(sk, skb);
+               } else {
diff --git a/queue-4.15/tcp-purge-write-queue-upon-rst.patch b/queue-4.15/tcp-purge-write-queue-upon-rst.patch
new file mode 100644 (file)
index 0000000..b925c8c
--- /dev/null
@@ -0,0 +1,44 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Soheil Hassas Yeganeh <soheil@google.com>
+Date: Tue, 27 Feb 2018 18:32:18 -0500
+Subject: tcp: purge write queue upon RST
+
+From: Soheil Hassas Yeganeh <soheil@google.com>
+
+
+[ Upstream commit a27fd7a8ed3856faaf5a2ff1c8c5f00c0667aaa0 ]
+
+When the connection is reset, there is no point in
+keeping the packets on the write queue until the connection
+is closed.
+
+RFC 793 (page 70) and RFC 793-bis (page 64) both suggest
+purging the write queue upon RST:
+https://tools.ietf.org/html/draft-ietf-tcpm-rfc793bis-07
+
+Moreover, this is essential for a correct MSG_ZEROCOPY
+implementation, because userspace cannot call close(fd)
+before receiving zerocopy signals even when the connection
+is reset.
+
+Fixes: f214f915e7db ("tcp: enable MSG_ZEROCOPY")
+Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Yuchung Cheng <ycheng@google.com>
+Signed-off-by: Neal Cardwell <ncardwell@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_input.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -3988,6 +3988,7 @@ void tcp_reset(struct sock *sk)
+       /* This barrier is coupled with smp_rmb() in tcp_poll() */
+       smp_wmb();
++      tcp_write_queue_purge(sk);
+       tcp_done(sk);
+       if (!sock_flag(sk, SOCK_DEAD))
diff --git a/queue-4.15/tcp-revert-f-rto-extension-to-detect-more-spurious-timeouts.patch b/queue-4.15/tcp-revert-f-rto-extension-to-detect-more-spurious-timeouts.patch
new file mode 100644 (file)
index 0000000..65a3e7e
--- /dev/null
@@ -0,0 +1,84 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Yuchung Cheng <ycheng@google.com>
+Date: Tue, 27 Feb 2018 14:15:02 -0800
+Subject: tcp: revert F-RTO extension to detect more spurious timeouts
+
+From: Yuchung Cheng <ycheng@google.com>
+
+
+[ Upstream commit fc68e171d376c322e6777a3d7ac2f0278b68b17f ]
+
+This reverts commit 89fe18e44f7ee5ab1c90d0dff5835acee7751427.
+
+While the patch could detect more spurious timeouts, it could cause
+poor TCP performance on broken middle-boxes that modify TCP packets
+(e.g. receive window, SACK options). Since the performance gain is
+much smaller compared to the potential loss, the best solution is
+to fully revert the change.
+
+Fixes: 89fe18e44f7e ("tcp: extend F-RTO to catch more spurious timeouts")
+Reported-by: Teodor Milkov <tm@del.bg>
+Signed-off-by: Yuchung Cheng <ycheng@google.com>
+Signed-off-by: Neal Cardwell <ncardwell@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_input.c |   30 ++++++++++++------------------
+ 1 file changed, 12 insertions(+), 18 deletions(-)
+
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -1915,6 +1915,7 @@ void tcp_enter_loss(struct sock *sk)
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct net *net = sock_net(sk);
+       struct sk_buff *skb;
++      bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
+       bool is_reneg;                  /* is receiver reneging on SACKs? */
+       bool mark_lost;
+@@ -1973,15 +1974,12 @@ void tcp_enter_loss(struct sock *sk)
+       tp->high_seq = tp->snd_nxt;
+       tcp_ecn_queue_cwr(tp);
+-      /* F-RTO RFC5682 sec 3.1 step 1 mandates to disable F-RTO
+-       * if a previous recovery is underway, otherwise it may incorrectly
+-       * call a timeout spurious if some previously retransmitted packets
+-       * are s/acked (sec 3.2). We do not apply that retriction since
+-       * retransmitted skbs are permanently tagged with TCPCB_EVER_RETRANS
+-       * so FLAG_ORIG_SACK_ACKED is always correct. But we do disable F-RTO
+-       * on PTMU discovery to avoid sending new data.
++      /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
++       * loss recovery is underway except recurring timeout(s) on
++       * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
+        */
+       tp->frto = net->ipv4.sysctl_tcp_frto &&
++                 (new_recovery || icsk->icsk_retransmits) &&
+                  !inet_csk(sk)->icsk_mtup.probe_size;
+ }
+@@ -2634,18 +2632,14 @@ static void tcp_process_loss(struct sock
+           tcp_try_undo_loss(sk, false))
+               return;
+-      /* The ACK (s)acks some never-retransmitted data meaning not all
+-       * the data packets before the timeout were lost. Therefore we
+-       * undo the congestion window and state. This is essentially
+-       * the operation in F-RTO (RFC5682 section 3.1 step 3.b). Since
+-       * a retransmitted skb is permantly marked, we can apply such an
+-       * operation even if F-RTO was not used.
+-       */
+-      if ((flag & FLAG_ORIG_SACK_ACKED) &&
+-          tcp_try_undo_loss(sk, tp->undo_marker))
+-              return;
+-
+       if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
++              /* Step 3.b. A timeout is spurious if not all data are
++               * lost, i.e., never-retransmitted data are (s)acked.
++               */
++              if ((flag & FLAG_ORIG_SACK_ACKED) &&
++                  tcp_try_undo_loss(sk, true))
++                      return;
++
+               if (after(tp->snd_nxt, tp->high_seq)) {
+                       if (flag & FLAG_DATA_SACKED || is_dupack)
+                               tp->frto = 0; /* Step 3.a. loss was real */
diff --git a/queue-4.15/tcp-revert-f-rto-middle-box-workaround.patch b/queue-4.15/tcp-revert-f-rto-middle-box-workaround.patch
new file mode 100644 (file)
index 0000000..e3d3c3f
--- /dev/null
@@ -0,0 +1,65 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Yuchung Cheng <ycheng@google.com>
+Date: Tue, 27 Feb 2018 14:15:01 -0800
+Subject: tcp: revert F-RTO middle-box workaround
+
+From: Yuchung Cheng <ycheng@google.com>
+
+
+[ Upstream commit d4131f09770d9b7471c9da65e6ecd2477746ac5c ]
+
+This reverts commit cc663f4d4c97b7297fb45135ab23cfd508b35a77. While fixing
+some broken middle-boxes that modify receive window fields, it does not
+address middle-boxes that strip off SACK options. The best solution is
+to fully revert this patch and the root F-RTO enhancement.
+
+Fixes: cc663f4d4c97 ("tcp: restrict F-RTO to work-around broken middle-boxes")
+Reported-by: Teodor Milkov <tm@del.bg>
+Signed-off-by: Yuchung Cheng <ycheng@google.com>
+Signed-off-by: Neal Cardwell <ncardwell@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_input.c | 17 +++++++----------
+ 1 file changed, 7 insertions(+), 10 deletions(-)
+
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index 45f750e85714..50963f92a67d 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -1915,7 +1915,6 @@ void tcp_enter_loss(struct sock *sk)
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct net *net = sock_net(sk);
+       struct sk_buff *skb;
+-      bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
+       bool is_reneg;                  /* is receiver reneging on SACKs? */
+       bool mark_lost;
+@@ -1974,17 +1973,15 @@ void tcp_enter_loss(struct sock *sk)
+       tp->high_seq = tp->snd_nxt;
+       tcp_ecn_queue_cwr(tp);
+-      /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
+-       * loss recovery is underway except recurring timeout(s) on
+-       * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing
+-       *
+-       * In theory F-RTO can be used repeatedly during loss recovery.
+-       * In practice this interacts badly with broken middle-boxes that
+-       * falsely raise the receive window, which results in repeated
+-       * timeouts and stop-and-go behavior.
++      /* F-RTO RFC5682 sec 3.1 step 1 mandates to disable F-RTO
++       * if a previous recovery is underway, otherwise it may incorrectly
++       * call a timeout spurious if some previously retransmitted packets
++       * are s/acked (sec 3.2). We do not apply that retriction since
++       * retransmitted skbs are permanently tagged with TCPCB_EVER_RETRANS
++       * so FLAG_ORIG_SACK_ACKED is always correct. But we do disable F-RTO
++       * on PTMU discovery to avoid sending new data.
+        */
+       tp->frto = net->ipv4.sysctl_tcp_frto &&
+-                 (new_recovery || icsk->icsk_retransmits) &&
+                  !inet_csk(sk)->icsk_mtup.probe_size;
+ }
+-- 
+2.14.3
+
diff --git a/queue-4.15/tcp-tracepoint-only-call-trace_tcp_send_reset-with-full-socket.patch b/queue-4.15/tcp-tracepoint-only-call-trace_tcp_send_reset-with-full-socket.patch
new file mode 100644 (file)
index 0000000..aca5ce6
--- /dev/null
@@ -0,0 +1,55 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Song Liu <songliubraving@fb.com>
+Date: Tue, 6 Feb 2018 20:50:23 -0800
+Subject: tcp: tracepoint: only call trace_tcp_send_reset with full socket
+
+From: Song Liu <songliubraving@fb.com>
+
+
+[ Upstream commit 5c487bb9adddbc1d23433e09d2548759375c2b52 ]
+
+tracepoint tcp_send_reset requires a full socket to work. However, it
+may be called when in TCP_TIME_WAIT:
+
+        case TCP_TW_RST:
+                tcp_v6_send_reset(sk, skb);
+                inet_twsk_deschedule_put(inet_twsk(sk));
+                goto discard_it;
+
+To avoid this problem, this patch checks the socket with sk_fullsock()
+before calling trace_tcp_send_reset().
+
+Fixes: c24b14c46bb8 ("tcp: add tracepoint trace_tcp_send_reset")
+Signed-off-by: Song Liu <songliubraving@fb.com>
+Reviewed-by: Lawrence Brakmo <brakmo@fb.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_ipv4.c |    3 ++-
+ net/ipv6/tcp_ipv6.c |    3 ++-
+ 2 files changed, 4 insertions(+), 2 deletions(-)
+
+--- a/net/ipv4/tcp_ipv4.c
++++ b/net/ipv4/tcp_ipv4.c
+@@ -705,7 +705,8 @@ static void tcp_v4_send_reset(const stru
+        */
+       if (sk) {
+               arg.bound_dev_if = sk->sk_bound_dev_if;
+-              trace_tcp_send_reset(sk, skb);
++              if (sk_fullsock(sk))
++                      trace_tcp_send_reset(sk, skb);
+       }
+       BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
+--- a/net/ipv6/tcp_ipv6.c
++++ b/net/ipv6/tcp_ipv6.c
+@@ -943,7 +943,8 @@ static void tcp_v6_send_reset(const stru
+       if (sk) {
+               oif = sk->sk_bound_dev_if;
+-              trace_tcp_send_reset(sk, skb);
++              if (sk_fullsock(sk))
++                      trace_tcp_send_reset(sk, skb);
+       }
+       tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, 0, 0);
diff --git a/queue-4.15/tcp_bbr-better-deal-with-suboptimal-gso.patch b/queue-4.15/tcp_bbr-better-deal-with-suboptimal-gso.patch
new file mode 100644 (file)
index 0000000..13f36cf
--- /dev/null
@@ -0,0 +1,89 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 21 Feb 2018 06:43:03 -0800
+Subject: tcp_bbr: better deal with suboptimal GSO
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit 350c9f484bde93ef229682eedd98cd5f74350f7f ]
+
+BBR uses tcp_tso_autosize() in an attempt to probe what would be the
+burst sizes and to adjust cwnd in bbr_target_cwnd() with following
+gold formula :
+
+/* Allow enough full-sized skbs in flight to utilize end systems. */
+cwnd += 3 * bbr->tso_segs_goal;
+
+But GSO can be lacking or be constrained to very small
+units (ip link set dev ... gso_max_segs 2)
+
+What we really want is to have enough packets in flight so that both
+GSO and GRO are efficient.
+
+So in the case GSO is off or downgraded, we still want to have the same
+number of packets in flight as if GSO/TSO was fully operational, so
+that GRO can hopefully be working efficiently.
+
+To fix this issue, we make tcp_tso_autosize() unaware of
+sk->sk_gso_max_segs
+
+Only tcp_tso_segs() has to enforce the gso_max_segs limit.
+
+Tested:
+
+ethtool -K eth0 tso off gso off
+tc qd replace dev eth0 root pfifo_fast
+
+Before patch:
+for f in {1..5}; do ./super_netperf 1 -H lpaa24 -- -K bbr; done
+    691  (ss -temoi shows cwnd is stuck around 6 )
+    667
+    651
+    631
+    517
+
+After patch :
+# for f in {1..5}; do ./super_netperf 1 -H lpaa24 -- -K bbr; done
+   1733 (ss -temoi shows cwnd is around 386 )
+   1778
+   1746
+   1781
+   1718
+
+Fixes: 0f8782ea1497 ("tcp_bbr: add BBR congestion control")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: Oleksandr Natalenko <oleksandr@natalenko.name>
+Acked-by: Neal Cardwell <ncardwell@google.com>
+Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_output.c |    9 +++++----
+ 1 file changed, 5 insertions(+), 4 deletions(-)
+
+--- a/net/ipv4/tcp_output.c
++++ b/net/ipv4/tcp_output.c
+@@ -1730,7 +1730,7 @@ u32 tcp_tso_autosize(const struct sock *
+        */
+       segs = max_t(u32, bytes / mss_now, min_tso_segs);
+-      return min_t(u32, segs, sk->sk_gso_max_segs);
++      return segs;
+ }
+ EXPORT_SYMBOL(tcp_tso_autosize);
+@@ -1742,9 +1742,10 @@ static u32 tcp_tso_segs(struct sock *sk,
+       const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
+       u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0;
+-      return tso_segs ? :
+-              tcp_tso_autosize(sk, mss_now,
+-                               sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
++      if (!tso_segs)
++              tso_segs = tcp_tso_autosize(sk, mss_now,
++                              sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs);
++      return min_t(u32, tso_segs, sk->sk_gso_max_segs);
+ }
+ /* Returns the portion of skb which can be sent right away */
diff --git a/queue-4.15/tls-use-correct-sk-sk_prot-for-ipv6.patch b/queue-4.15/tls-use-correct-sk-sk_prot-for-ipv6.patch
new file mode 100644 (file)
index 0000000..f1e46c0
--- /dev/null
@@ -0,0 +1,122 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Boris Pismenny <borisp@mellanox.com>
+Date: Tue, 27 Feb 2018 14:18:39 +0200
+Subject: tls: Use correct sk->sk_prot for IPV6
+
+From: Boris Pismenny <borisp@mellanox.com>
+
+
+[ Upstream commit c113187d38ff85dc302a1bb55864b203ebb2ba10 ]
+
+The tls ulp overrides sk->sk_prot with new tls specific proto structs.
+The tls specific structs were previously based on the ipv4 specific
+tcp_prot struct.
+As a result, attaching the tls ulp to an ipv6 tcp socket replaced
+some ipv6 callbacks with the ipv4 equivalents.
+
+This patch adds ipv6 tls proto structs and uses them when
+attached to ipv6 sockets.
+
+Fixes: 3c4d7559159b ('tls: kernel TLS support')
+Signed-off-by: Boris Pismenny <borisp@mellanox.com>
+Signed-off-by: Ilya Lesokhin <ilyal@mellanox.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/tls/tls_main.c |   52 +++++++++++++++++++++++++++++++++++++---------------
+ 1 file changed, 37 insertions(+), 15 deletions(-)
+
+--- a/net/tls/tls_main.c
++++ b/net/tls/tls_main.c
+@@ -46,16 +46,26 @@ MODULE_DESCRIPTION("Transport Layer Secu
+ MODULE_LICENSE("Dual BSD/GPL");
+ enum {
++      TLSV4,
++      TLSV6,
++      TLS_NUM_PROTS,
++};
++
++enum {
+       TLS_BASE_TX,
+       TLS_SW_TX,
+       TLS_NUM_CONFIG,
+ };
+-static struct proto tls_prots[TLS_NUM_CONFIG];
++static struct proto *saved_tcpv6_prot;
++static DEFINE_MUTEX(tcpv6_prot_mutex);
++static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG];
+ static inline void update_sk_prot(struct sock *sk, struct tls_context *ctx)
+ {
+-      sk->sk_prot = &tls_prots[ctx->tx_conf];
++      int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4;
++
++      sk->sk_prot = &tls_prots[ip_ver][ctx->tx_conf];
+ }
+ int wait_on_pending_writer(struct sock *sk, long *timeo)
+@@ -450,8 +460,21 @@ static int tls_setsockopt(struct sock *s
+       return do_tls_setsockopt(sk, optname, optval, optlen);
+ }
++static void build_protos(struct proto *prot, struct proto *base)
++{
++      prot[TLS_BASE_TX] = *base;
++      prot[TLS_BASE_TX].setsockopt    = tls_setsockopt;
++      prot[TLS_BASE_TX].getsockopt    = tls_getsockopt;
++      prot[TLS_BASE_TX].close         = tls_sk_proto_close;
++
++      prot[TLS_SW_TX] = prot[TLS_BASE_TX];
++      prot[TLS_SW_TX].sendmsg         = tls_sw_sendmsg;
++      prot[TLS_SW_TX].sendpage        = tls_sw_sendpage;
++}
++
+ static int tls_init(struct sock *sk)
+ {
++      int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4;
+       struct inet_connection_sock *icsk = inet_csk(sk);
+       struct tls_context *ctx;
+       int rc = 0;
+@@ -476,6 +499,17 @@ static int tls_init(struct sock *sk)
+       ctx->getsockopt = sk->sk_prot->getsockopt;
+       ctx->sk_proto_close = sk->sk_prot->close;
++      /* Build IPv6 TLS whenever the address of tcpv6_prot changes */
++      if (ip_ver == TLSV6 &&
++          unlikely(sk->sk_prot != smp_load_acquire(&saved_tcpv6_prot))) {
++              mutex_lock(&tcpv6_prot_mutex);
++              if (likely(sk->sk_prot != saved_tcpv6_prot)) {
++                      build_protos(tls_prots[TLSV6], sk->sk_prot);
++                      smp_store_release(&saved_tcpv6_prot, sk->sk_prot);
++              }
++              mutex_unlock(&tcpv6_prot_mutex);
++      }
++
+       ctx->tx_conf = TLS_BASE_TX;
+       update_sk_prot(sk, ctx);
+ out:
+@@ -488,21 +522,9 @@ static struct tcp_ulp_ops tcp_tls_ulp_op
+       .init                   = tls_init,
+ };
+-static void build_protos(struct proto *prot, struct proto *base)
+-{
+-      prot[TLS_BASE_TX] = *base;
+-      prot[TLS_BASE_TX].setsockopt    = tls_setsockopt;
+-      prot[TLS_BASE_TX].getsockopt    = tls_getsockopt;
+-      prot[TLS_BASE_TX].close         = tls_sk_proto_close;
+-
+-      prot[TLS_SW_TX] = prot[TLS_BASE_TX];
+-      prot[TLS_SW_TX].sendmsg         = tls_sw_sendmsg;
+-      prot[TLS_SW_TX].sendpage        = tls_sw_sendpage;
+-}
+-
+ static int __init tls_register(void)
+ {
+-      build_protos(tls_prots, &tcp_prot);
++      build_protos(tls_prots[TLSV4], &tcp_prot);
+       tcp_register_ulp(&tcp_tls_ulp_ops);
diff --git a/queue-4.15/tuntap-correctly-add-the-missing-xdp-flush.patch b/queue-4.15/tuntap-correctly-add-the-missing-xdp-flush.patch
new file mode 100644 (file)
index 0000000..1c66e47
--- /dev/null
@@ -0,0 +1,37 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Jason Wang <jasowang@redhat.com>
+Date: Sat, 24 Feb 2018 11:32:26 +0800
+Subject: tuntap: correctly add the missing XDP flush
+
+From: Jason Wang <jasowang@redhat.com>
+
+
+[ Upstream commit 1bb4f2e868a2891ab8bc668b8173d6ccb8c4ce6f ]
+
+We don't flush batched XDP packets through xdp_do_flush_map(), which
+causes packets to stall at the TX queue. Considering we don't do XDP on
+NAPI poll(), the only possible fix is to call xdp_do_flush_map()
+immediately after xdp_do_redirect().
+
+Note, this in fact won't try to batch packets through devmap; we could
+address that in the future.
+
+Reported-by: Christoffer Dall <christoffer.dall@linaro.org>
+Fixes: 761876c857cb ("tap: XDP support")
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/tun.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/net/tun.c
++++ b/drivers/net/tun.c
+@@ -1490,6 +1490,7 @@ static struct sk_buff *tun_build_skb(str
+                       get_page(alloc_frag->page);
+                       alloc_frag->offset += buflen;
+                       err = xdp_do_redirect(tun->dev, &xdp, xdp_prog);
++                      xdp_do_flush_map();
+                       if (err)
+                               goto err_redirect;
+                       rcu_read_unlock();
diff --git a/queue-4.15/tuntap-disable-preemption-during-xdp-processing.patch b/queue-4.15/tuntap-disable-preemption-during-xdp-processing.patch
new file mode 100644 (file)
index 0000000..db31a08
--- /dev/null
@@ -0,0 +1,75 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Jason Wang <jasowang@redhat.com>
+Date: Sat, 24 Feb 2018 11:32:25 +0800
+Subject: tuntap: disable preemption during XDP processing
+
+From: Jason Wang <jasowang@redhat.com>
+
+
+[ Upstream commit 23e43f07f896f8578318cfcc9466f1e8b8ab21b6 ]
+
+Except for tuntap, all other drivers implement XDP in the NAPI poll()
+routine in a bh. This guarantees that all XDP operations are done on
+the same CPU, which is required by e.g. BPF_MAP_TYPE_PERCPU_ARRAY. But
+for tuntap, we do it in process context and we try to protect XDP
+processing with the RCU reader lock. This is insufficient since
+CONFIG_PREEMPT_RCU can preempt the RCU reader critical section, which
+breaks the assumption that all XDP processing happens on the same CPU.
+
+Fix this by simply disabling preemption during XDP processing.
+
+Fixes: 761876c857cb ("tap: XDP support")
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/tun.c |    6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/drivers/net/tun.c
++++ b/drivers/net/tun.c
+@@ -1471,6 +1471,7 @@ static struct sk_buff *tun_build_skb(str
+       else
+               *skb_xdp = 0;
++      preempt_disable();
+       rcu_read_lock();
+       xdp_prog = rcu_dereference(tun->xdp_prog);
+       if (xdp_prog && !*skb_xdp) {
+@@ -1494,6 +1495,7 @@ static struct sk_buff *tun_build_skb(str
+                       if (err)
+                               goto err_redirect;
+                       rcu_read_unlock();
++                      preempt_enable();
+                       return NULL;
+               case XDP_TX:
+                       xdp_xmit = true;
+@@ -1515,6 +1517,7 @@ static struct sk_buff *tun_build_skb(str
+       skb = build_skb(buf, buflen);
+       if (!skb) {
+               rcu_read_unlock();
++              preempt_enable();
+               return ERR_PTR(-ENOMEM);
+       }
+@@ -1527,10 +1530,12 @@ static struct sk_buff *tun_build_skb(str
+               skb->dev = tun->dev;
+               generic_xdp_tx(skb, xdp_prog);
+               rcu_read_unlock();
++              preempt_enable();
+               return NULL;
+       }
+       rcu_read_unlock();
++      preempt_enable();
+       return skb;
+@@ -1538,6 +1543,7 @@ err_redirect:
+       put_page(alloc_frag->page);
+ err_xdp:
+       rcu_read_unlock();
++      preempt_enable();
+       this_cpu_inc(tun->pcpu_stats->rx_dropped);
+       return NULL;
+ }
diff --git a/queue-4.15/udplite-fix-partial-checksum-initialization.patch b/queue-4.15/udplite-fix-partial-checksum-initialization.patch
new file mode 100644 (file)
index 0000000..ee73e2b
--- /dev/null
@@ -0,0 +1,76 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Alexey Kodanev <alexey.kodanev@oracle.com>
+Date: Thu, 15 Feb 2018 20:18:43 +0300
+Subject: udplite: fix partial checksum initialization
+
+From: Alexey Kodanev <alexey.kodanev@oracle.com>
+
+
+[ Upstream commit 15f35d49c93f4fa9875235e7bf3e3783d2dd7a1b ]
+
+Since UDP-Lite is always using checksum, the following path is
+triggered when calculating pseudo header for it:
+
+  udp4_csum_init() or udp6_csum_init()
+    skb_checksum_init_zero_check()
+      __skb_checksum_validate_complete()
+
+The problem can appear if skb->len is less than CHECKSUM_BREAK. In
+this particular case __skb_checksum_validate_complete() also invokes
+__skb_checksum_complete(skb). If UDP-Lite is using partial checksum
+that covers only part of a packet, the function will return bad
+checksum and the packet will be dropped.
+
+It can be fixed if we skip skb_checksum_init_zero_check() and only
+set the required pseudo header checksum for UDP-Lite with partial
+checksum before udp4_csum_init()/udp6_csum_init() functions return.
+
+Fixes: ed70fcfcee95 ("net: Call skb_checksum_init in IPv4")
+Fixes: e4f45b7f40bd ("net: Call skb_checksum_init in IPv6")
+Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/udplite.h   |    1 +
+ net/ipv4/udp.c          |    5 +++++
+ net/ipv6/ip6_checksum.c |    5 +++++
+ 3 files changed, 11 insertions(+)
+
+--- a/include/net/udplite.h
++++ b/include/net/udplite.h
+@@ -64,6 +64,7 @@ static inline int udplite_checksum_init(
+               UDP_SKB_CB(skb)->cscov = cscov;
+               if (skb->ip_summed == CHECKSUM_COMPLETE)
+                       skb->ip_summed = CHECKSUM_NONE;
++              skb->csum_valid = 0;
+         }
+       return 0;
+--- a/net/ipv4/udp.c
++++ b/net/ipv4/udp.c
+@@ -2031,6 +2031,11 @@ static inline int udp4_csum_init(struct
+               err = udplite_checksum_init(skb, uh);
+               if (err)
+                       return err;
++
++              if (UDP_SKB_CB(skb)->partial_cov) {
++                      skb->csum = inet_compute_pseudo(skb, proto);
++                      return 0;
++              }
+       }
+       /* Note, we are only interested in != 0 or == 0, thus the
+--- a/net/ipv6/ip6_checksum.c
++++ b/net/ipv6/ip6_checksum.c
+@@ -73,6 +73,11 @@ int udp6_csum_init(struct sk_buff *skb,
+               err = udplite_checksum_init(skb, uh);
+               if (err)
+                       return err;
++
++              if (UDP_SKB_CB(skb)->partial_cov) {
++                      skb->csum = ip6_compute_pseudo(skb, proto);
++                      return 0;
++              }
+       }
+       /* To support RFC 6936 (allow zero checksum in UDP/IPV6 for tunnels)
diff --git a/queue-4.15/virtio-net-disable-napi-only-when-enabled-during-xdp-set.patch b/queue-4.15/virtio-net-disable-napi-only-when-enabled-during-xdp-set.patch
new file mode 100644 (file)
index 0000000..31376dd
--- /dev/null
@@ -0,0 +1,49 @@
+From foo@baz Tue Mar  6 19:02:56 PST 2018
+From: Jason Wang <jasowang@redhat.com>
+Date: Wed, 28 Feb 2018 18:20:04 +0800
+Subject: virtio-net: disable NAPI only when enabled during XDP set
+
+From: Jason Wang <jasowang@redhat.com>
+
+
+[ Upstream commit 4e09ff5362843dff3accfa84c805c7f3a99de9cd ]
+
+We try to disable NAPI to prevent a single XDP TX queue from being used
+by multiple CPUs. But we don't check whether the device is up (NAPI is
+enabled), which could result in a stall because of an infinite wait in
+napi_disable(). Fix this by checking the device state through
+netif_running() first.
+
+Fixes: 4941d472bf95b ("virtio-net: do not reset during XDP set")
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+Acked-by: Michael S. Tsirkin <mst@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/virtio_net.c |    8 +++++---
+ 1 file changed, 5 insertions(+), 3 deletions(-)
+
+--- a/drivers/net/virtio_net.c
++++ b/drivers/net/virtio_net.c
+@@ -2040,8 +2040,9 @@ static int virtnet_xdp_set(struct net_de
+       }
+       /* Make sure NAPI is not using any XDP TX queues for RX. */
+-      for (i = 0; i < vi->max_queue_pairs; i++)
+-              napi_disable(&vi->rq[i].napi);
++      if (netif_running(dev))
++              for (i = 0; i < vi->max_queue_pairs; i++)
++                      napi_disable(&vi->rq[i].napi);
+       netif_set_real_num_rx_queues(dev, curr_qp + xdp_qp);
+       err = _virtnet_set_queues(vi, curr_qp + xdp_qp);
+@@ -2060,7 +2061,8 @@ static int virtnet_xdp_set(struct net_de
+               }
+               if (old_prog)
+                       bpf_prog_put(old_prog);
+-              virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
++              if (netif_running(dev))
++                      virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
+       }
+       return 0;