--- /dev/null
+From a826b04303a40d52439aa141035fca5654ccaccd Mon Sep 17 00:00:00 2001
+From: Matteo Croce <mcroce@microsoft.com>
+Date: Fri, 15 Jan 2021 19:42:08 +0100
+Subject: ipv6: create multicast route with RTPROT_KERNEL
+
+From: Matteo Croce <mcroce@microsoft.com>
+
+commit a826b04303a40d52439aa141035fca5654ccaccd upstream.
+
+The ff00::/8 multicast route is created without specifying the fc_protocol
+field, so the default RTPROT_BOOT value is used:
+
+ $ ip -6 -d route
+ unicast ::1 dev lo proto kernel scope global metric 256 pref medium
+ unicast fe80::/64 dev eth0 proto kernel scope global metric 256 pref medium
+ unicast ff00::/8 dev eth0 proto boot scope global metric 256 pref medium
+
+As the documentation says, this value identifies routes installed during
+boot, but the route is created when the interface is set up.
+Change the value to RTPROT_KERNEL which is a better value.
+
+Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
+Signed-off-by: Matteo Croce <mcroce@microsoft.com>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/ipv6/addrconf.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/net/ipv6/addrconf.c
++++ b/net/ipv6/addrconf.c
+@@ -2468,6 +2468,7 @@ static void addrconf_add_mroute(struct n
+ .fc_flags = RTF_UP,
+ .fc_type = RTN_UNICAST,
+ .fc_nlinfo.nl_net = dev_net(dev),
++ .fc_protocol = RTPROT_KERNEL,
+ };
+
+ ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0);
--- /dev/null
+From ceed9038b2783d14e0422bdc6fd04f70580efb4c Mon Sep 17 00:00:00 2001
+From: Matteo Croce <mcroce@microsoft.com>
+Date: Fri, 15 Jan 2021 19:42:09 +0100
+Subject: ipv6: set multicast flag on the multicast route
+
+From: Matteo Croce <mcroce@microsoft.com>
+
+commit ceed9038b2783d14e0422bdc6fd04f70580efb4c upstream.
+
+The multicast route ff00::/8 is created with type RTN_UNICAST:
+
+ $ ip -6 -d route
+ unicast ::1 dev lo proto kernel scope global metric 256 pref medium
+ unicast fe80::/64 dev eth0 proto kernel scope global metric 256 pref medium
+ unicast ff00::/8 dev eth0 proto kernel scope global metric 256 pref medium
+
+Set the type to RTN_MULTICAST which is more appropriate.
+
+Fixes: e8478e80e5a7 ("net/ipv6: Save route type in rt6_info")
+Signed-off-by: Matteo Croce <mcroce@microsoft.com>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/ipv6/addrconf.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/ipv6/addrconf.c
++++ b/net/ipv6/addrconf.c
+@@ -2466,7 +2466,7 @@ static void addrconf_add_mroute(struct n
+ .fc_ifindex = dev->ifindex,
+ .fc_dst_len = 8,
+ .fc_flags = RTF_UP,
+- .fc_type = RTN_UNICAST,
++ .fc_type = RTN_MULTICAST,
+ .fc_nlinfo.nl_net = dev_net(dev),
+ .fc_protocol = RTPROT_KERNEL,
+ };
--- /dev/null
+From 5dabd1712cd056814f9ab15f1d68157ceb04e741 Mon Sep 17 00:00:00 2001
+From: Lecopzer Chen <lecopzer@gmail.com>
+Date: Sat, 23 Jan 2021 21:01:29 -0800
+Subject: kasan: fix incorrect arguments passing in kasan_add_zero_shadow
+
+From: Lecopzer Chen <lecopzer@gmail.com>
+
+commit 5dabd1712cd056814f9ab15f1d68157ceb04e741 upstream.
+
+kasan_remove_zero_shadow() shall use original virtual address, start and
+size, instead of shadow address.
+
+Link: https://lkml.kernel.org/r/20210103063847.5963-1-lecopzer@gmail.com
+Fixes: 0207df4fa1a86 ("kernel/memremap, kasan: make ZONE_DEVICE with work with KASAN")
+Signed-off-by: Lecopzer Chen <lecopzer.chen@mediatek.com>
+Reviewed-by: Andrey Konovalov <andreyknvl@google.com>
+Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Dmitry Vyukov <dvyukov@google.com>
+Cc: Alexander Potapenko <glider@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/kasan/init.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/mm/kasan/init.c
++++ b/mm/kasan/init.c
+@@ -492,7 +492,6 @@ int kasan_add_zero_shadow(void *start, u
+
+ ret = kasan_populate_early_shadow(shadow_start, shadow_end);
+ if (ret)
+- kasan_remove_zero_shadow(shadow_start,
+- size >> KASAN_SHADOW_SCALE_SHIFT);
++ kasan_remove_zero_shadow(start, size);
+ return ret;
+ }
--- /dev/null
+From a11a496ee6e2ab6ed850233c96b94caf042af0b9 Mon Sep 17 00:00:00 2001
+From: Lecopzer Chen <lecopzer@gmail.com>
+Date: Sat, 23 Jan 2021 21:01:25 -0800
+Subject: kasan: fix unaligned address is unhandled in kasan_remove_zero_shadow
+
+From: Lecopzer Chen <lecopzer@gmail.com>
+
+commit a11a496ee6e2ab6ed850233c96b94caf042af0b9 upstream.
+
+During testing kasan_populate_early_shadow and kasan_remove_zero_shadow,
+if the shadow start and end address in kasan_remove_zero_shadow() is not
+aligned to PMD_SIZE, the remaining unaligned PTEs won't be removed.
+
+In the test case for kasan_remove_zero_shadow():
+
+ shadow_start: 0xffffffb802000000, shadow end: 0xffffffbfbe000000
+
+ 3-level page table:
+ PUD_SIZE: 0x40000000 PMD_SIZE: 0x200000 PAGE_SIZE: 4K
+
+0xffffffbf80000000 ~ 0xffffffbfbdf80000 will not be removed because in
+kasan_remove_pud_table(), kasan_pmd_table(*pud) is true but the next
+address is 0xffffffbfbdf80000 which is not aligned to PUD_SIZE.
+
+In the correct condition, this should fallback to the next level
+kasan_remove_pmd_table() but the condition flow always continue to skip
+the unaligned part.
+
+Fix by correcting the condition for the case when neither next nor addr is aligned.
+
+Link: https://lkml.kernel.org/r/20210103135621.83129-1-lecopzer@gmail.com
+Fixes: 0207df4fa1a86 ("kernel/memremap, kasan: make ZONE_DEVICE with work with KASAN")
+Signed-off-by: Lecopzer Chen <lecopzer.chen@mediatek.com>
+Cc: Andrey Ryabinin <aryabinin@virtuozzo.com>
+Cc: Dan Williams <dan.j.williams@intel.com>
+Cc: Dmitry Vyukov <dvyukov@google.com>
+Cc: Alexander Potapenko <glider@google.com>
+Cc: YJ Chiang <yj.chiang@mediatek.com>
+Cc: Andrey Konovalov <andreyknvl@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/kasan/init.c | 20 ++++++++++++--------
+ 1 file changed, 12 insertions(+), 8 deletions(-)
+
+--- a/mm/kasan/init.c
++++ b/mm/kasan/init.c
+@@ -377,9 +377,10 @@ static void kasan_remove_pmd_table(pmd_t
+
+ if (kasan_pte_table(*pmd)) {
+ if (IS_ALIGNED(addr, PMD_SIZE) &&
+- IS_ALIGNED(next, PMD_SIZE))
++ IS_ALIGNED(next, PMD_SIZE)) {
+ pmd_clear(pmd);
+- continue;
++ continue;
++ }
+ }
+ pte = pte_offset_kernel(pmd, addr);
+ kasan_remove_pte_table(pte, addr, next);
+@@ -402,9 +403,10 @@ static void kasan_remove_pud_table(pud_t
+
+ if (kasan_pmd_table(*pud)) {
+ if (IS_ALIGNED(addr, PUD_SIZE) &&
+- IS_ALIGNED(next, PUD_SIZE))
++ IS_ALIGNED(next, PUD_SIZE)) {
+ pud_clear(pud);
+- continue;
++ continue;
++ }
+ }
+ pmd = pmd_offset(pud, addr);
+ pmd_base = pmd_offset(pud, 0);
+@@ -428,9 +430,10 @@ static void kasan_remove_p4d_table(p4d_t
+
+ if (kasan_pud_table(*p4d)) {
+ if (IS_ALIGNED(addr, P4D_SIZE) &&
+- IS_ALIGNED(next, P4D_SIZE))
++ IS_ALIGNED(next, P4D_SIZE)) {
+ p4d_clear(p4d);
+- continue;
++ continue;
++ }
+ }
+ pud = pud_offset(p4d, addr);
+ kasan_remove_pud_table(pud, addr, next);
+@@ -462,9 +465,10 @@ void kasan_remove_zero_shadow(void *star
+
+ if (kasan_p4d_table(*pgd)) {
+ if (IS_ALIGNED(addr, PGDIR_SIZE) &&
+- IS_ALIGNED(next, PGDIR_SIZE))
++ IS_ALIGNED(next, PGDIR_SIZE)) {
+ pgd_clear(pgd);
+- continue;
++ continue;
++ }
+ }
+
+ p4d = p4d_offset(pgd, addr);
--- /dev/null
+From 7e238de8283acd32c26c2bc2a50672d0ea862ff7 Mon Sep 17 00:00:00 2001
+From: Oleksandr Mazur <oleksandr.mazur@plvision.eu>
+Date: Tue, 19 Jan 2021 10:53:33 +0200
+Subject: net: core: devlink: use right genl user_ptr when handling port param get/set
+
+From: Oleksandr Mazur <oleksandr.mazur@plvision.eu>
+
+commit 7e238de8283acd32c26c2bc2a50672d0ea862ff7 upstream.
+
+Fix incorrect user_ptr dereferencing when handling port param get/set:
+
+ idx [0] stores the 'struct devlink' pointer;
+ idx [1] stores the 'struct devlink_port' pointer;
+
+Fixes: 637989b5d77e ("devlink: Always use user_ptr[0] for devlink and simplify post_doit")
+CC: Parav Pandit <parav@mellanox.com>
+Signed-off-by: Oleksandr Mazur <oleksandr.mazur@plvision.eu>
+Signed-off-by: Vadym Kochan <vadym.kochan@plvision.eu>
+Link: https://lore.kernel.org/r/20210119085333.16833-1-vadym.kochan@plvision.eu
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/core/devlink.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/net/core/devlink.c
++++ b/net/core/devlink.c
+@@ -4134,7 +4134,7 @@ out:
+ static int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+ {
+- struct devlink_port *devlink_port = info->user_ptr[0];
++ struct devlink_port *devlink_port = info->user_ptr[1];
+ struct devlink_param_item *param_item;
+ struct sk_buff *msg;
+ int err;
+@@ -4163,7 +4163,7 @@ static int devlink_nl_cmd_port_param_get
+ static int devlink_nl_cmd_port_param_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+ {
+- struct devlink_port *devlink_port = info->user_ptr[0];
++ struct devlink_port *devlink_port = info->user_ptr[1];
+
+ return __devlink_nl_cmd_param_set_doit(devlink_port->devlink,
+ devlink_port->index,
--- /dev/null
+From a3eb4e9d4c9218476d05c52dfd2be3d6fdce6b91 Mon Sep 17 00:00:00 2001
+From: Tariq Toukan <tariqt@nvidia.com>
+Date: Sun, 17 Jan 2021 17:15:38 +0200
+Subject: net: Disable NETIF_F_HW_TLS_RX when RXCSUM is disabled
+
+From: Tariq Toukan <tariqt@nvidia.com>
+
+commit a3eb4e9d4c9218476d05c52dfd2be3d6fdce6b91 upstream.
+
+With NETIF_F_HW_TLS_RX packets are decrypted in HW. This cannot be
+logically done when RXCSUM offload is off.
+
+Fixes: 14136564c8ee ("net: Add TLS RX offload feature")
+Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
+Reviewed-by: Boris Pismenny <borisp@nvidia.com>
+Link: https://lore.kernel.org/r/20210117151538.9411-1-tariqt@nvidia.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/core/dev.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -9602,6 +9602,11 @@ static netdev_features_t netdev_fix_feat
+ }
+ }
+
++ if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
++ netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
++ features &= ~NETIF_F_HW_TLS_RX;
++ }
++
+ return features;
+ }
+
--- /dev/null
+From 8e4052c32d6b4b39c1e13c652c7e33748d447409 Mon Sep 17 00:00:00 2001
+From: Dan Carpenter <dan.carpenter@oracle.com>
+Date: Tue, 19 Jan 2021 17:48:03 +0300
+Subject: net: dsa: b53: fix an off by one in checking "vlan->vid"
+
+From: Dan Carpenter <dan.carpenter@oracle.com>
+
+commit 8e4052c32d6b4b39c1e13c652c7e33748d447409 upstream.
+
+The > comparison should be >= to prevent accessing one element beyond
+the end of the dev->vlans[] array in the caller function, b53_vlan_add().
+The "dev->vlans" array is allocated in the b53_switch_init() function
+and it has "dev->num_vlans" elements.
+
+Fixes: a2482d2ce349 ("net: dsa: b53: Plug in VLAN support")
+Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
+Acked-by: Florian Fainelli <f.fainelli@gmail.com>
+Link: https://lore.kernel.org/r/YAbxI97Dl/pmBy5V@mwanda
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/net/dsa/b53/b53_common.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/net/dsa/b53/b53_common.c
++++ b/drivers/net/dsa/b53/b53_common.c
+@@ -1404,7 +1404,7 @@ int b53_vlan_prepare(struct dsa_switch *
+ !(vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED))
+ return -EINVAL;
+
+- if (vlan->vid_end > dev->num_vlans)
++ if (vlan->vid_end >= dev->num_vlans)
+ return -ERANGE;
+
+ b53_enable_vlan(dev, true, ds->vlan_filtering);
--- /dev/null
+From 79267ae22615496655feee2db0848f6786bcf67a Mon Sep 17 00:00:00 2001
+From: Vladimir Oltean <vladimir.oltean@nxp.com>
+Date: Mon, 18 Jan 2021 15:52:10 +0200
+Subject: net: mscc: ocelot: allow offloading of bridge on top of LAG
+
+From: Vladimir Oltean <vladimir.oltean@nxp.com>
+
+commit 79267ae22615496655feee2db0848f6786bcf67a upstream.
+
+The blamed commit was too aggressive, and it made ocelot_netdevice_event
+react only to network interface events emitted for the ocelot switch
+ports.
+
+In fact, only the PRECHANGEUPPER should have had that check.
+
+When we ignore all events that are not for us, we miss the fact that the
+upper of the LAG changes, and the bonding interface gets enslaved to a
+bridge. This is an operation we could offload under certain conditions.
+
+Fixes: 7afb3e575e5a ("net: mscc: ocelot: don't handle netdev events for other netdevs")
+Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
+Reviewed-by: Alexandre Belloni <alexandre.belloni@bootlin.com>
+Link: https://lore.kernel.org/r/20210118135210.2666246-1-olteanv@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/net/ethernet/mscc/ocelot_net.c | 4 +---
+ 1 file changed, 1 insertion(+), 3 deletions(-)
+
+--- a/drivers/net/ethernet/mscc/ocelot_net.c
++++ b/drivers/net/ethernet/mscc/ocelot_net.c
+@@ -952,10 +952,8 @@ static int ocelot_netdevice_event(struct
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ int ret = 0;
+
+- if (!ocelot_netdevice_dev_check(dev))
+- return 0;
+-
+ if (event == NETDEV_PRECHANGEUPPER &&
++ ocelot_netdevice_dev_check(dev) &&
+ netif_is_lag_master(info->upper_dev)) {
+ struct netdev_lag_upper_info *lag_upper_info = info->upper_info;
+ struct netlink_ext_ack *extack;
--- /dev/null
+From 584b7cfcdc7d6d416a9d6fece9516764bd977d2e Mon Sep 17 00:00:00 2001
+From: Alban Bedel <alban.bedel@aerq.com>
+Date: Tue, 19 Jan 2021 15:06:38 +0100
+Subject: net: mscc: ocelot: Fix multicast to the CPU port
+
+From: Alban Bedel <alban.bedel@aerq.com>
+
+commit 584b7cfcdc7d6d416a9d6fece9516764bd977d2e upstream.
+
+Multicast entries in the MAC table use the high bits of the MAC
+address to encode the ports that should get the packets. But this port
+mask does not work for the CPU port, to receive these packets on the
+CPU port the MAC_CPU_COPY flag must be set.
+
+Because of this IPv6 was effectively not working because neighbor
+solicitations were never received. This was not apparent before commit
+9403c158 (net: mscc: ocelot: support IPv4, IPv6 and plain Ethernet mdb
+entries) as the IPv6 entries were broken so all incoming IPv6
+multicast was then treated as unknown and flooded on all ports.
+
+To fix this problem rework ocelot_mact_learn() to set the
+MAC_CPU_COPY flag when a multicast entry that targets the CPU port is
+added. For this we have to read back the ports encoded in the pseudo
+MAC address by the caller. It is not a very nice design but it avoids
+changing the callers and should make backporting easier.
+
+Signed-off-by: Alban Bedel <alban.bedel@aerq.com>
+Fixes: 9403c158b872 ("net: mscc: ocelot: support IPv4, IPv6 and plain Ethernet mdb entries")
+Link: https://lore.kernel.org/r/20210119140638.203374-1-alban.bedel@aerq.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/net/ethernet/mscc/ocelot.c | 23 ++++++++++++++++++-----
+ 1 file changed, 18 insertions(+), 5 deletions(-)
+
+--- a/drivers/net/ethernet/mscc/ocelot.c
++++ b/drivers/net/ethernet/mscc/ocelot.c
+@@ -60,14 +60,27 @@ int ocelot_mact_learn(struct ocelot *oce
+ const unsigned char mac[ETH_ALEN],
+ unsigned int vid, enum macaccess_entry_type type)
+ {
++ u32 cmd = ANA_TABLES_MACACCESS_VALID |
++ ANA_TABLES_MACACCESS_DEST_IDX(port) |
++ ANA_TABLES_MACACCESS_ENTRYTYPE(type) |
++ ANA_TABLES_MACACCESS_MAC_TABLE_CMD(MACACCESS_CMD_LEARN);
++ unsigned int mc_ports;
++
++ /* Set MAC_CPU_COPY if the CPU port is used by a multicast entry */
++ if (type == ENTRYTYPE_MACv4)
++ mc_ports = (mac[1] << 8) | mac[2];
++ else if (type == ENTRYTYPE_MACv6)
++ mc_ports = (mac[0] << 8) | mac[1];
++ else
++ mc_ports = 0;
++
++ if (mc_ports & BIT(ocelot->num_phys_ports))
++ cmd |= ANA_TABLES_MACACCESS_MAC_CPU_COPY;
++
+ ocelot_mact_select(ocelot, mac, vid);
+
+ /* Issue a write command */
+- ocelot_write(ocelot, ANA_TABLES_MACACCESS_VALID |
+- ANA_TABLES_MACACCESS_DEST_IDX(port) |
+- ANA_TABLES_MACACCESS_ENTRYTYPE(type) |
+- ANA_TABLES_MACACCESS_MAC_TABLE_CMD(MACACCESS_CMD_LEARN),
+- ANA_TABLES_MACACCESS);
++ ocelot_write(ocelot, cmd, ANA_TABLES_MACACCESS);
+
+ return ocelot_mact_wait_for_completion(ocelot);
+ }
--- /dev/null
+From bcd0cf19ef8258ac31b9a20248b05c15a1f4b4b0 Mon Sep 17 00:00:00 2001
+From: Eric Dumazet <edumazet@google.com>
+Date: Thu, 14 Jan 2021 10:52:29 -0800
+Subject: net_sched: avoid shift-out-of-bounds in tcindex_set_parms()
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit bcd0cf19ef8258ac31b9a20248b05c15a1f4b4b0 upstream.
+
+tc_index being 16bit wide, we need to check that TCA_TCINDEX_SHIFT
+attribute is not silly.
+
+UBSAN: shift-out-of-bounds in net/sched/cls_tcindex.c:260:29
+shift exponent 255 is too large for 32-bit type 'int'
+CPU: 0 PID: 8516 Comm: syz-executor228 Not tainted 5.10.0-syzkaller #0
+Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
+Call Trace:
+ __dump_stack lib/dump_stack.c:79 [inline]
+ dump_stack+0x107/0x163 lib/dump_stack.c:120
+ ubsan_epilogue+0xb/0x5a lib/ubsan.c:148
+ __ubsan_handle_shift_out_of_bounds.cold+0xb1/0x181 lib/ubsan.c:395
+ valid_perfect_hash net/sched/cls_tcindex.c:260 [inline]
+ tcindex_set_parms.cold+0x1b/0x215 net/sched/cls_tcindex.c:425
+ tcindex_change+0x232/0x340 net/sched/cls_tcindex.c:546
+ tc_new_tfilter+0x13fb/0x21b0 net/sched/cls_api.c:2127
+ rtnetlink_rcv_msg+0x8b6/0xb80 net/core/rtnetlink.c:5555
+ netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2494
+ netlink_unicast_kernel net/netlink/af_netlink.c:1304 [inline]
+ netlink_unicast+0x533/0x7d0 net/netlink/af_netlink.c:1330
+ netlink_sendmsg+0x907/0xe40 net/netlink/af_netlink.c:1919
+ sock_sendmsg_nosec net/socket.c:652 [inline]
+ sock_sendmsg+0xcf/0x120 net/socket.c:672
+ ____sys_sendmsg+0x6e8/0x810 net/socket.c:2336
+ ___sys_sendmsg+0xf3/0x170 net/socket.c:2390
+ __sys_sendmsg+0xe5/0x1b0 net/socket.c:2423
+ do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
+ entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: syzbot <syzkaller@googlegroups.com>
+Link: https://lore.kernel.org/r/20210114185229.1742255-1-eric.dumazet@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/sched/cls_tcindex.c | 8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+--- a/net/sched/cls_tcindex.c
++++ b/net/sched/cls_tcindex.c
+@@ -366,9 +366,13 @@ tcindex_set_parms(struct net *net, struc
+ if (tb[TCA_TCINDEX_MASK])
+ cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]);
+
+- if (tb[TCA_TCINDEX_SHIFT])
++ if (tb[TCA_TCINDEX_SHIFT]) {
+ cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]);
+-
++ if (cp->shift > 16) {
++ err = -EINVAL;
++ goto errout;
++ }
++ }
+ if (!cp->hash) {
+ /* Hash not specified, use perfect hash if the upper limit
+ * of the hashing index is below the threshold.
--- /dev/null
+From dd5e073381f2ada3630f36be42833c6e9c78b75e Mon Sep 17 00:00:00 2001
+From: Eric Dumazet <edumazet@google.com>
+Date: Thu, 14 Jan 2021 10:19:29 -0800
+Subject: net_sched: gen_estimator: support large ewma log
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit dd5e073381f2ada3630f36be42833c6e9c78b75e upstream.
+
+syzbot's report reminded us that very big ewma_log values were supported
+in the past, even if they made little sense.
+
+tc qdisc replace dev xxx root est 1sec 131072sec ...
+
+While fixing the bug, also add boundary checks for ewma_log, in line
+with range supported by iproute2.
+
+UBSAN: shift-out-of-bounds in net/core/gen_estimator.c:83:38
+shift exponent -1 is negative
+CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.10.0-syzkaller #0
+Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
+Call Trace:
+ <IRQ>
+ __dump_stack lib/dump_stack.c:79 [inline]
+ dump_stack+0x107/0x163 lib/dump_stack.c:120
+ ubsan_epilogue+0xb/0x5a lib/ubsan.c:148
+ __ubsan_handle_shift_out_of_bounds.cold+0xb1/0x181 lib/ubsan.c:395
+ est_timer.cold+0xbb/0x12d net/core/gen_estimator.c:83
+ call_timer_fn+0x1a5/0x710 kernel/time/timer.c:1417
+ expire_timers kernel/time/timer.c:1462 [inline]
+ __run_timers.part.0+0x692/0xa80 kernel/time/timer.c:1731
+ __run_timers kernel/time/timer.c:1712 [inline]
+ run_timer_softirq+0xb3/0x1d0 kernel/time/timer.c:1744
+ __do_softirq+0x2bc/0xa77 kernel/softirq.c:343
+ asm_call_irq_on_stack+0xf/0x20
+ </IRQ>
+ __run_on_irqstack arch/x86/include/asm/irq_stack.h:26 [inline]
+ run_on_irqstack_cond arch/x86/include/asm/irq_stack.h:77 [inline]
+ do_softirq_own_stack+0xaa/0xd0 arch/x86/kernel/irq_64.c:77
+ invoke_softirq kernel/softirq.c:226 [inline]
+ __irq_exit_rcu+0x17f/0x200 kernel/softirq.c:420
+ irq_exit_rcu+0x5/0x20 kernel/softirq.c:432
+ sysvec_apic_timer_interrupt+0x4d/0x100 arch/x86/kernel/apic/apic.c:1096
+ asm_sysvec_apic_timer_interrupt+0x12/0x20 arch/x86/include/asm/idtentry.h:628
+RIP: 0010:native_save_fl arch/x86/include/asm/irqflags.h:29 [inline]
+RIP: 0010:arch_local_save_flags arch/x86/include/asm/irqflags.h:79 [inline]
+RIP: 0010:arch_irqs_disabled arch/x86/include/asm/irqflags.h:169 [inline]
+RIP: 0010:acpi_safe_halt drivers/acpi/processor_idle.c:111 [inline]
+RIP: 0010:acpi_idle_do_entry+0x1c9/0x250 drivers/acpi/processor_idle.c:516
+
+Fixes: 1c0d32fde5bd ("net_sched: gen_estimator: complete rewrite of rate estimators")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: syzbot <syzkaller@googlegroups.com>
+Link: https://lore.kernel.org/r/20210114181929.1717985-1-eric.dumazet@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/core/gen_estimator.c | 11 +++++++----
+ 1 file changed, 7 insertions(+), 4 deletions(-)
+
+--- a/net/core/gen_estimator.c
++++ b/net/core/gen_estimator.c
+@@ -80,11 +80,11 @@ static void est_timer(struct timer_list
+ u64 rate, brate;
+
+ est_fetch_counters(est, &b);
+- brate = (b.bytes - est->last_bytes) << (10 - est->ewma_log - est->intvl_log);
+- brate -= (est->avbps >> est->ewma_log);
++ brate = (b.bytes - est->last_bytes) << (10 - est->intvl_log);
++ brate = (brate >> est->ewma_log) - (est->avbps >> est->ewma_log);
+
+- rate = (b.packets - est->last_packets) << (10 - est->ewma_log - est->intvl_log);
+- rate -= (est->avpps >> est->ewma_log);
++ rate = (b.packets - est->last_packets) << (10 - est->intvl_log);
++ rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log);
+
+ write_seqcount_begin(&est->seq);
+ est->avbps += brate;
+@@ -143,6 +143,9 @@ int gen_new_estimator(struct gnet_stats_
+ if (parm->interval < -2 || parm->interval > 3)
+ return -EINVAL;
+
++ if (parm->ewma_log == 0 || parm->ewma_log >= 31)
++ return -EINVAL;
++
+ est = kzalloc(sizeof(*est), GFP_KERNEL);
+ if (!est)
+ return -ENOBUFS;
--- /dev/null
+From e4bedf48aaa5552bc1f49703abd17606e7e6e82a Mon Sep 17 00:00:00 2001
+From: Eric Dumazet <edumazet@google.com>
+Date: Thu, 14 Jan 2021 08:06:37 -0800
+Subject: net_sched: reject silly cell_log in qdisc_get_rtab()
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit e4bedf48aaa5552bc1f49703abd17606e7e6e82a upstream.
+
+iproute2 probably never goes beyond 8 for the cell exponent,
+but stick to the max shift exponent for signed 32bit.
+
+UBSAN reported:
+UBSAN: shift-out-of-bounds in net/sched/sch_api.c:389:22
+shift exponent 130 is too large for 32-bit type 'int'
+CPU: 1 PID: 8450 Comm: syz-executor586 Not tainted 5.11.0-rc3-syzkaller #0
+Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
+Call Trace:
+ __dump_stack lib/dump_stack.c:79 [inline]
+ dump_stack+0x183/0x22e lib/dump_stack.c:120
+ ubsan_epilogue lib/ubsan.c:148 [inline]
+ __ubsan_handle_shift_out_of_bounds+0x432/0x4d0 lib/ubsan.c:395
+ __detect_linklayer+0x2a9/0x330 net/sched/sch_api.c:389
+ qdisc_get_rtab+0x2b5/0x410 net/sched/sch_api.c:435
+ cbq_init+0x28f/0x12c0 net/sched/sch_cbq.c:1180
+ qdisc_create+0x801/0x1470 net/sched/sch_api.c:1246
+ tc_modify_qdisc+0x9e3/0x1fc0 net/sched/sch_api.c:1662
+ rtnetlink_rcv_msg+0xb1d/0xe60 net/core/rtnetlink.c:5564
+ netlink_rcv_skb+0x1f0/0x460 net/netlink/af_netlink.c:2494
+ netlink_unicast_kernel net/netlink/af_netlink.c:1304 [inline]
+ netlink_unicast+0x7de/0x9b0 net/netlink/af_netlink.c:1330
+ netlink_sendmsg+0xaa6/0xe90 net/netlink/af_netlink.c:1919
+ sock_sendmsg_nosec net/socket.c:652 [inline]
+ sock_sendmsg net/socket.c:672 [inline]
+ ____sys_sendmsg+0x5a2/0x900 net/socket.c:2345
+ ___sys_sendmsg net/socket.c:2399 [inline]
+ __sys_sendmsg+0x319/0x400 net/socket.c:2432
+ do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
+ entry_SYSCALL_64_after_hwframe+0x44/0xa9
+
+Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: syzbot <syzkaller@googlegroups.com>
+Acked-by: Cong Wang <cong.wang@bytedance.com>
+Link: https://lore.kernel.org/r/20210114160637.1660597-1-eric.dumazet@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/sched/sch_api.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/net/sched/sch_api.c
++++ b/net/sched/sch_api.c
+@@ -412,7 +412,8 @@ struct qdisc_rate_table *qdisc_get_rtab(
+ {
+ struct qdisc_rate_table *rtab;
+
+- if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
++ if (tab == NULL || r->rate == 0 ||
++ r->cell_log == 0 || r->cell_log >= 32 ||
+ nla_len(tab) != TC_RTAB_SIZE) {
+ NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
+ return NULL;
cachefiles-drop-superfluous-readpages-aops-null-check.patch
lightnvm-fix-memory-leak-when-submit-fails.patch
skbuff-back-tiny-skbs-with-kmalloc-in-__netdev_alloc_skb-too.patch
+kasan-fix-unaligned-address-is-unhandled-in-kasan_remove_zero_shadow.patch
+kasan-fix-incorrect-arguments-passing-in-kasan_add_zero_shadow.patch
+tcp-fix-tcp-socket-rehash-stats-mis-accounting.patch
+net_sched-gen_estimator-support-large-ewma-log.patch
+udp-mask-tos-bits-in-udp_v4_early_demux.patch
+ipv6-create-multicast-route-with-rtprot_kernel.patch
+net_sched-avoid-shift-out-of-bounds-in-tcindex_set_parms.patch
+net_sched-reject-silly-cell_log-in-qdisc_get_rtab.patch
+ipv6-set-multicast-flag-on-the-multicast-route.patch
+net-mscc-ocelot-allow-offloading-of-bridge-on-top-of-lag.patch
+net-disable-netif_f_hw_tls_rx-when-rxcsum-is-disabled.patch
+net-dsa-b53-fix-an-off-by-one-in-checking-vlan-vid.patch
+tcp-do-not-mess-with-cloned-skbs-in-tcp_add_backlog.patch
+tcp-fix-tcp_user_timeout-with-zero-window.patch
+net-mscc-ocelot-fix-multicast-to-the-cpu-port.patch
+net-core-devlink-use-right-genl-user_ptr-when-handling-port-param-get-set.patch
--- /dev/null
+From b160c28548bc0a87cbd16d5af6d3edcfd70b8c9a Mon Sep 17 00:00:00 2001
+From: Eric Dumazet <edumazet@google.com>
+Date: Tue, 19 Jan 2021 08:49:00 -0800
+Subject: tcp: do not mess with cloned skbs in tcp_add_backlog()
+
+From: Eric Dumazet <edumazet@google.com>
+
+commit b160c28548bc0a87cbd16d5af6d3edcfd70b8c9a upstream.
+
+Heiner Kallweit reported that some skbs were sent with
+the following invalid GSO properties :
+- gso_size > 0
+- gso_type == 0
+
+This was triggering a WARN_ON_ONCE() in rtl8169_tso_csum_v2.
+
+Juerg Haefliger was able to reproduce a similar issue using
+a lan78xx NIC and a workload mixing TCP incoming traffic
+and forwarded packets.
+
+The problem is that tcp_add_backlog() is writing
+over gso_segs and gso_size even if the incoming packet will not
+be coalesced to the backlog tail packet.
+
+While skb_try_coalesce() would bail out if tail packet is cloned,
+this overwriting would lead to corruptions of other packets
+cooked by lan78xx, sharing a common super-packet.
+
+The strategy used by lan78xx is to use a big skb, and split
+it into all received packets using skb_clone() to avoid copies.
+The drawback of this strategy is that all the small skb share a common
+struct skb_shared_info.
+
+This patch rewrites TCP gso_size/gso_segs handling to only
+happen on the tail skb, since skb_try_coalesce() made sure
+it was not cloned.
+
+Fixes: 4f693b55c3d2 ("tcp: implement coalescing on backlog queue")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Bisected-by: Juerg Haefliger <juergh@canonical.com>
+Tested-by: Juerg Haefliger <juergh@canonical.com>
+Reported-by: Heiner Kallweit <hkallweit1@gmail.com>
+Link: https://bugzilla.kernel.org/show_bug.cgi?id=209423
+Link: https://lore.kernel.org/r/20210119164900.766957-1-eric.dumazet@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/ipv4/tcp_ipv4.c | 25 +++++++++++++------------
+ 1 file changed, 13 insertions(+), 12 deletions(-)
+
+--- a/net/ipv4/tcp_ipv4.c
++++ b/net/ipv4/tcp_ipv4.c
+@@ -1755,6 +1755,7 @@ int tcp_v4_early_demux(struct sk_buff *s
+ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
+ {
+ u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf);
++ u32 tail_gso_size, tail_gso_segs;
+ struct skb_shared_info *shinfo;
+ const struct tcphdr *th;
+ struct tcphdr *thtail;
+@@ -1762,6 +1763,7 @@ bool tcp_add_backlog(struct sock *sk, st
+ unsigned int hdrlen;
+ bool fragstolen;
+ u32 gso_segs;
++ u32 gso_size;
+ int delta;
+
+ /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
+@@ -1787,13 +1789,6 @@ bool tcp_add_backlog(struct sock *sk, st
+ */
+ th = (const struct tcphdr *)skb->data;
+ hdrlen = th->doff * 4;
+- shinfo = skb_shinfo(skb);
+-
+- if (!shinfo->gso_size)
+- shinfo->gso_size = skb->len - hdrlen;
+-
+- if (!shinfo->gso_segs)
+- shinfo->gso_segs = 1;
+
+ tail = sk->sk_backlog.tail;
+ if (!tail)
+@@ -1816,6 +1811,15 @@ bool tcp_add_backlog(struct sock *sk, st
+ goto no_coalesce;
+
+ __skb_pull(skb, hdrlen);
++
++ shinfo = skb_shinfo(skb);
++ gso_size = shinfo->gso_size ?: skb->len;
++ gso_segs = shinfo->gso_segs ?: 1;
++
++ shinfo = skb_shinfo(tail);
++ tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
++ tail_gso_segs = shinfo->gso_segs ?: 1;
++
+ if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
+ TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
+
+@@ -1842,11 +1846,8 @@ bool tcp_add_backlog(struct sock *sk, st
+ }
+
+ /* Not as strict as GRO. We only need to carry mss max value */
+- skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
+- skb_shinfo(tail)->gso_size);
+-
+- gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
+- skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
++ shinfo->gso_size = max(gso_size, tail_gso_size);
++ shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
+
+ sk->sk_backlog.len += delta;
+ __NET_INC_STATS(sock_net(sk),
--- /dev/null
+From 9c30ae8398b0813e237bde387d67a7f74ab2db2d Mon Sep 17 00:00:00 2001
+From: Yuchung Cheng <ycheng@google.com>
+Date: Tue, 19 Jan 2021 11:26:19 -0800
+Subject: tcp: fix TCP socket rehash stats mis-accounting
+
+From: Yuchung Cheng <ycheng@google.com>
+
+commit 9c30ae8398b0813e237bde387d67a7f74ab2db2d upstream.
+
+The previous commit 32efcc06d2a1 ("tcp: export count for rehash attempts")
+would mis-account rehashing SNMP and socket stats:
+
+ a. During handshake of an active open, only counts the first
+ SYN timeout
+
+ b. After handshake of passive and active open, stop updating
+ after (roughly) TCP_RETRIES1 recurring RTOs
+
+ c. After the socket aborts, over count timeout_rehash by 1
+
+This patch fixes this by checking the rehash result from sk_rethink_txhash.
+
+Fixes: 32efcc06d2a1 ("tcp: export count for rehash attempts")
+Signed-off-by: Yuchung Cheng <ycheng@google.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Neal Cardwell <ncardwell@google.com>
+Link: https://lore.kernel.org/r/20210119192619.1848270-1-ycheng@google.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/net/sock.h | 17 ++++++++++++-----
+ net/ipv4/tcp_input.c | 5 ++---
+ net/ipv4/tcp_timer.c | 22 ++++++++--------------
+ 3 files changed, 22 insertions(+), 22 deletions(-)
+
+--- a/include/net/sock.h
++++ b/include/net/sock.h
+@@ -1903,10 +1903,13 @@ static inline void sk_set_txhash(struct
+ sk->sk_txhash = net_tx_rndhash();
+ }
+
+-static inline void sk_rethink_txhash(struct sock *sk)
++static inline bool sk_rethink_txhash(struct sock *sk)
+ {
+- if (sk->sk_txhash)
++ if (sk->sk_txhash) {
+ sk_set_txhash(sk);
++ return true;
++ }
++ return false;
+ }
+
+ static inline struct dst_entry *
+@@ -1929,12 +1932,10 @@ sk_dst_get(struct sock *sk)
+ return dst;
+ }
+
+-static inline void dst_negative_advice(struct sock *sk)
++static inline void __dst_negative_advice(struct sock *sk)
+ {
+ struct dst_entry *ndst, *dst = __sk_dst_get(sk);
+
+- sk_rethink_txhash(sk);
+-
+ if (dst && dst->ops->negative_advice) {
+ ndst = dst->ops->negative_advice(dst);
+
+@@ -1946,6 +1947,12 @@ static inline void dst_negative_advice(s
+ }
+ }
+
++static inline void dst_negative_advice(struct sock *sk)
++{
++ sk_rethink_txhash(sk);
++ __dst_negative_advice(sk);
++}
++
+ static inline void
+ __sk_dst_set(struct sock *sk, struct dst_entry *dst)
+ {
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4379,10 +4379,9 @@ static void tcp_rcv_spurious_retrans(str
+ * The receiver remembers and reflects via DSACKs. Leverage the
+ * DSACK state and change the txhash to re-route speculatively.
+ */
+- if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq) {
+- sk_rethink_txhash(sk);
++ if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq &&
++ sk_rethink_txhash(sk))
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH);
+- }
+ }
+
+ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
+--- a/net/ipv4/tcp_timer.c
++++ b/net/ipv4/tcp_timer.c
+@@ -219,14 +219,8 @@ static int tcp_write_timeout(struct sock
+ int retry_until;
+
+ if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+- if (icsk->icsk_retransmits) {
+- dst_negative_advice(sk);
+- } else {
+- sk_rethink_txhash(sk);
+- tp->timeout_rehash++;
+- __NET_INC_STATS(sock_net(sk),
+- LINUX_MIB_TCPTIMEOUTREHASH);
+- }
++ if (icsk->icsk_retransmits)
++ __dst_negative_advice(sk);
+ retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
+ expired = icsk->icsk_retransmits >= retry_until;
+ } else {
+@@ -234,12 +228,7 @@ static int tcp_write_timeout(struct sock
+ /* Black hole detection */
+ tcp_mtu_probing(icsk, sk);
+
+- dst_negative_advice(sk);
+- } else {
+- sk_rethink_txhash(sk);
+- tp->timeout_rehash++;
+- __NET_INC_STATS(sock_net(sk),
+- LINUX_MIB_TCPTIMEOUTREHASH);
++ __dst_negative_advice(sk);
+ }
+
+ retry_until = net->ipv4.sysctl_tcp_retries2;
+@@ -270,6 +259,11 @@ static int tcp_write_timeout(struct sock
+ return 1;
+ }
+
++ if (sk_rethink_txhash(sk)) {
++ tp->timeout_rehash++;
++ __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTREHASH);
++ }
++
+ return 0;
+ }
+
--- /dev/null
+From 9d9b1ee0b2d1c9e02b2338c4a4b0a062d2d3edac Mon Sep 17 00:00:00 2001
+From: Enke Chen <enchen@paloaltonetworks.com>
+Date: Fri, 15 Jan 2021 14:30:58 -0800
+Subject: tcp: fix TCP_USER_TIMEOUT with zero window
+
+From: Enke Chen <enchen@paloaltonetworks.com>
+
+commit 9d9b1ee0b2d1c9e02b2338c4a4b0a062d2d3edac upstream.
+
+The TCP session does not terminate with TCP_USER_TIMEOUT when data
+remain untransmitted due to zero window.
+
+The number of unanswered zero-window probes (tcp_probes_out) is
+reset to zero with incoming acks irrespective of the window size,
+as described in tcp_probe_timer():
+
+ RFC 1122 4.2.2.17 requires the sender to stay open indefinitely
+ as long as the receiver continues to respond probes. We support
+ this by default and reset icsk_probes_out with incoming ACKs.
+
+This counter, however, is the wrong one to be used in calculating the
+duration that the window remains closed and data remain untransmitted.
+Thanks to Jonathan Maxwell <jmaxwell37@gmail.com> for diagnosing the
+actual issue.
+
+In this patch a new timestamp is introduced for the socket in order to
+track the elapsed time for the zero-window probes that have not been
+answered with any non-zero window ack.
+
+Fixes: 9721e709fa68 ("tcp: simplify window probe aborting on USER_TIMEOUT")
+Reported-by: William McCall <william.mccall@gmail.com>
+Co-developed-by: Neal Cardwell <ncardwell@google.com>
+Signed-off-by: Neal Cardwell <ncardwell@google.com>
+Signed-off-by: Enke Chen <enchen@paloaltonetworks.com>
+Reviewed-by: Yuchung Cheng <ycheng@google.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Link: https://lore.kernel.org/r/20210115223058.GA39267@localhost.localdomain
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/net/inet_connection_sock.h | 3 +++
+ net/ipv4/inet_connection_sock.c | 1 +
+ net/ipv4/tcp.c | 1 +
+ net/ipv4/tcp_input.c | 1 +
+ net/ipv4/tcp_output.c | 1 +
+ net/ipv4/tcp_timer.c | 14 +++++++-------
+ 6 files changed, 14 insertions(+), 7 deletions(-)
+
+--- a/include/net/inet_connection_sock.h
++++ b/include/net/inet_connection_sock.h
+@@ -76,6 +76,8 @@ struct inet_connection_sock_af_ops {
+ * @icsk_ext_hdr_len: Network protocol overhead (IP/IPv6 options)
+ * @icsk_ack: Delayed ACK control data
+ * @icsk_mtup; MTU probing control data
++ * @icsk_probes_tstamp: Probe timestamp (cleared by non-zero window ack)
++ * @icsk_user_timeout: TCP_USER_TIMEOUT value
+ */
+ struct inet_connection_sock {
+ /* inet_sock has to be the first member! */
+@@ -129,6 +131,7 @@ struct inet_connection_sock {
+
+ u32 probe_timestamp;
+ } icsk_mtup;
++ u32 icsk_probes_tstamp;
+ u32 icsk_user_timeout;
+
+ u64 icsk_ca_priv[104 / sizeof(u64)];
+--- a/net/ipv4/inet_connection_sock.c
++++ b/net/ipv4/inet_connection_sock.c
+@@ -851,6 +851,7 @@ struct sock *inet_csk_clone_lock(const s
+ newicsk->icsk_retransmits = 0;
+ newicsk->icsk_backoff = 0;
+ newicsk->icsk_probes_out = 0;
++ newicsk->icsk_probes_tstamp = 0;
+
+ /* Deinitialize accept_queue to trap illegal accesses. */
+ memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
+--- a/net/ipv4/tcp.c
++++ b/net/ipv4/tcp.c
+@@ -2685,6 +2685,7 @@ int tcp_disconnect(struct sock *sk, int
+
+ icsk->icsk_backoff = 0;
+ icsk->icsk_probes_out = 0;
++ icsk->icsk_probes_tstamp = 0;
+ icsk->icsk_rto = TCP_TIMEOUT_INIT;
+ icsk->icsk_rto_min = TCP_RTO_MIN;
+ icsk->icsk_delack_max = TCP_DELACK_MAX;
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -3370,6 +3370,7 @@ static void tcp_ack_probe(struct sock *s
+ return;
+ if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) {
+ icsk->icsk_backoff = 0;
++ icsk->icsk_probes_tstamp = 0;
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0);
+ /* Socket must be waked up by subsequent tcp_data_snd_check().
+ * This function is not for random using!
+--- a/net/ipv4/tcp_output.c
++++ b/net/ipv4/tcp_output.c
+@@ -4080,6 +4080,7 @@ void tcp_send_probe0(struct sock *sk)
+ /* Cancel probe timer, if it is not required. */
+ icsk->icsk_probes_out = 0;
+ icsk->icsk_backoff = 0;
++ icsk->icsk_probes_tstamp = 0;
+ return;
+ }
+
+--- a/net/ipv4/tcp_timer.c
++++ b/net/ipv4/tcp_timer.c
+@@ -343,6 +343,7 @@ static void tcp_probe_timer(struct sock
+
+ if (tp->packets_out || !skb) {
+ icsk->icsk_probes_out = 0;
++ icsk->icsk_probes_tstamp = 0;
+ return;
+ }
+
+@@ -354,13 +355,12 @@ static void tcp_probe_timer(struct sock
+ * corresponding system limit. We also implement similar policy when
+ * we use RTO to probe window in tcp_retransmit_timer().
+ */
+- if (icsk->icsk_user_timeout) {
+- u32 elapsed = tcp_model_timeout(sk, icsk->icsk_probes_out,
+- tcp_probe0_base(sk));
+-
+- if (elapsed >= icsk->icsk_user_timeout)
+- goto abort;
+- }
++ if (!icsk->icsk_probes_tstamp)
++ icsk->icsk_probes_tstamp = tcp_jiffies32;
++ else if (icsk->icsk_user_timeout &&
++ (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >=
++ msecs_to_jiffies(icsk->icsk_user_timeout))
++ goto abort;
+
+ max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2;
+ if (sock_flag(sk, SOCK_DEAD)) {
--- /dev/null
+From 8d2b51b008c25240914984208b2ced57d1dd25a5 Mon Sep 17 00:00:00 2001
+From: Guillaume Nault <gnault@redhat.com>
+Date: Sat, 16 Jan 2021 11:44:22 +0100
+Subject: udp: mask TOS bits in udp_v4_early_demux()
+
+From: Guillaume Nault <gnault@redhat.com>
+
+commit 8d2b51b008c25240914984208b2ced57d1dd25a5 upstream.
+
+udp_v4_early_demux() is the only function that calls
+ip_mc_validate_source() with a TOS that hasn't been masked with
+IPTOS_RT_MASK.
+
+This results in different behaviours for incoming multicast UDPv4
+packets, depending on if ip_mc_validate_source() is called from the
+early-demux path (udp_v4_early_demux) or from the regular input path
+(ip_route_input_noref).
+
+ECN would normally not be used with UDP multicast packets, so the
+practical consequences should be limited on that side. However,
+IPTOS_RT_MASK is also used to mask the TOS' high order bits, to align
+with the non-early-demux path behaviour.
+
+Reproducer:
+
+ Setup two netns, connected with veth:
+ $ ip netns add ns0
+ $ ip netns add ns1
+ $ ip -netns ns0 link set dev lo up
+ $ ip -netns ns1 link set dev lo up
+ $ ip link add name veth01 netns ns0 type veth peer name veth10 netns ns1
+ $ ip -netns ns0 link set dev veth01 up
+ $ ip -netns ns1 link set dev veth10 up
+ $ ip -netns ns0 address add 192.0.2.10 peer 192.0.2.11/32 dev veth01
+ $ ip -netns ns1 address add 192.0.2.11 peer 192.0.2.10/32 dev veth10
+
+ In ns0, add route to multicast address 224.0.2.0/24 using source
+ address 198.51.100.10:
+ $ ip -netns ns0 address add 198.51.100.10/32 dev lo
+ $ ip -netns ns0 route add 224.0.2.0/24 dev veth01 src 198.51.100.10
+
+ In ns1, define route to 198.51.100.10, only for packets with TOS 4:
+ $ ip -netns ns1 route add 198.51.100.10/32 tos 4 dev veth10
+
+ Also activate rp_filter in ns1, so that incoming packets not matching
+ the above route get dropped:
+ $ ip netns exec ns1 sysctl -wq net.ipv4.conf.veth10.rp_filter=1
+
+ Now try to receive packets on 224.0.2.11:
+ $ ip netns exec ns1 socat UDP-RECVFROM:1111,ip-add-membership=224.0.2.11:veth10,ignoreeof -
+
+ In ns0, send packet to 224.0.2.11 with TOS 4 and ECT(0) (that is,
+ tos 6 for socat):
+ $ echo test0 | ip netns exec ns0 socat - UDP-DATAGRAM:224.0.2.11:1111,bind=:1111,tos=6
+
+ The "test0" message is properly received by socat in ns1, because
+ early-demux has no cached dst to use, so source address validation
+ is done by ip_route_input_mc(), which receives a TOS that has the
+ ECN bits masked.
+
+ Now send another packet to 224.0.2.11, still with TOS 4 and ECT(0):
+ $ echo test1 | ip netns exec ns0 socat - UDP-DATAGRAM:224.0.2.11:1111,bind=:1111,tos=6
+
+ The "test1" message isn't received by socat in ns1, because, now,
+ early-demux has a cached dst to use and calls ip_mc_validate_source()
+ immediately, without masking the ECN bits.
+
+Fixes: bc044e8db796 ("udp: perform source validation for mcast early demux")
+Signed-off-by: Guillaume Nault <gnault@redhat.com>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/ipv4/udp.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/net/ipv4/udp.c
++++ b/net/ipv4/udp.c
+@@ -2553,7 +2553,8 @@ int udp_v4_early_demux(struct sk_buff *s
+ */
+ if (!inet_sk(sk)->inet_daddr && in_dev)
+ return ip_mc_validate_source(skb, iph->daddr,
+- iph->saddr, iph->tos,
++ iph->saddr,
++ iph->tos & IPTOS_RT_MASK,
+ skb->dev, in_dev, &itag);
+ }
+ return 0;