git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
Fixes for 6.1
author Sasha Levin <sashal@kernel.org>
Fri, 19 Apr 2024 11:43:49 +0000 (07:43 -0400)
committer Sasha Levin <sashal@kernel.org>
Fri, 19 Apr 2024 11:43:49 +0000 (07:43 -0400)
Signed-off-by: Sasha Levin <sashal@kernel.org>
15 files changed:
queue-6.1/af_unix-call-manage_oob-for-every-skb-in-unix_stream.patch [new file with mode: 0644]
queue-6.1/af_unix-don-t-peek-oob-data-without-msg_oob.patch [new file with mode: 0644]
queue-6.1/ice-tc-allow-zero-flags-in-parsing-tc-flower.patch [new file with mode: 0644]
queue-6.1/net-dsa-mt7530-fix-mirroring-frames-received-on-loca.patch [new file with mode: 0644]
queue-6.1/net-ethernet-ti-am65-cpsw-nuss-cleanup-dma-channels-.patch [new file with mode: 0644]
queue-6.1/net-mlx5-lag-restore-buckets-number-to-default-after.patch [new file with mode: 0644]
queue-6.1/net-mlx5e-prevent-deadlock-while-disabling-arfs.patch [new file with mode: 0644]
queue-6.1/netfilter-br_netfilter-skip-conntrack-input-hook-for.patch [new file with mode: 0644]
queue-6.1/netfilter-flowtable-incorrect-pppoe-tuple.patch [new file with mode: 0644]
queue-6.1/netfilter-flowtable-validate-pppoe-header.patch [new file with mode: 0644]
queue-6.1/netfilter-nf_tables-fix-potential-data-race-in-__nft.patch [new file with mode: 0644]
queue-6.1/netfilter-nf_tables-fix-potential-data-race-in-__nft.patch-21398 [new file with mode: 0644]
queue-6.1/netfilter-nft_set_pipapo-do-not-free-live-element.patch [new file with mode: 0644]
queue-6.1/series
queue-6.1/tun-limit-printing-rate-when-illegal-packet-received.patch [new file with mode: 0644]

diff --git a/queue-6.1/af_unix-call-manage_oob-for-every-skb-in-unix_stream.patch b/queue-6.1/af_unix-call-manage_oob-for-every-skb-in-unix_stream.patch
new file mode 100644 (file)
index 0000000..923195f
--- /dev/null
@@ -0,0 +1,73 @@
+From 59c9cdf43109085055c2b243e6a3ee6ea3ceb7a7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 10 Apr 2024 10:10:15 -0700
+Subject: af_unix: Call manage_oob() for every skb in
+ unix_stream_read_generic().
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+[ Upstream commit 283454c8a123072e5c386a5a2b5fc576aa455b6f ]
+
+When we call recv() for an AF_UNIX socket, we first peek one skb and
+call manage_oob() to check if the skb was sent with MSG_OOB.
+
+However, when we fetch the next (and the following) skb, manage_oob()
+is not called now, leading to wrong behaviour.
+
+Let's say a socket send()s "hello" with MSG_OOB and the peer tries
+to recv() 5 bytes with MSG_PEEK.  Here, we should get only "hell"
+without 'o', but actually not:
+
+  >>> from socket import *
+  >>> c1, c2 = socketpair(AF_UNIX, SOCK_STREAM)
+  >>> c1.send(b'hello', MSG_OOB)
+  5
+  >>> c2.recv(5, MSG_PEEK)
+  b'hello'
+
+The first skb fills 4 bytes, and the next skb is peeked but not
+properly checked by manage_oob().
+
+Let's move up the again label to call manage_oob() for every skb.
+
+With this patch:
+
+  >>> from socket import *
+  >>> c1, c2 = socketpair(AF_UNIX, SOCK_STREAM)
+  >>> c1.send(b'hello', MSG_OOB)
+  5
+  >>> c2.recv(5, MSG_PEEK)
+  b'hell'
+
+Fixes: 314001f0bf92 ("af_unix: Add OOB support")
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Link: https://lore.kernel.org/r/20240410171016.7621-2-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/unix/af_unix.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
+index 0a75d76535f75..6af6f82e89464 100644
+--- a/net/unix/af_unix.c
++++ b/net/unix/af_unix.c
+@@ -2753,6 +2753,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
+               last = skb = skb_peek(&sk->sk_receive_queue);
+               last_len = last ? last->len : 0;
++again:
+ #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
+               if (skb) {
+                       skb = manage_oob(skb, sk, flags, copied);
+@@ -2764,7 +2765,6 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
+                       }
+               }
+ #endif
+-again:
+               if (skb == NULL) {
+                       if (copied >= target)
+                               goto unlock;
+-- 
+2.43.0
+
diff --git a/queue-6.1/af_unix-don-t-peek-oob-data-without-msg_oob.patch b/queue-6.1/af_unix-don-t-peek-oob-data-without-msg_oob.patch
new file mode 100644 (file)
index 0000000..86d127c
--- /dev/null
@@ -0,0 +1,86 @@
+From d6458b46de4e59d551f79a78e82850db9209d4b3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 10 Apr 2024 10:10:16 -0700
+Subject: af_unix: Don't peek OOB data without MSG_OOB.
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+[ Upstream commit 22dd70eb2c3d754862964377a75abafd3167346b ]
+
+Currently, we can read OOB data without MSG_OOB by using MSG_PEEK
+when OOB data is sitting on the front row, which is apparently
+wrong.
+
+  >>> from socket import *
+  >>> c1, c2 = socketpair(AF_UNIX, SOCK_STREAM)
+  >>> c1.send(b'a', MSG_OOB)
+  1
+  >>> c2.recv(1, MSG_PEEK | MSG_DONTWAIT)
+  b'a'
+
+If manage_oob() is called when no data has been copied, we only
+check if the socket enables SO_OOBINLINE or MSG_PEEK is not used.
+Otherwise, the skb is returned as is.
+
+However, here we should return NULL if MSG_PEEK is set and no data
+has been copied.
+
+Also, in such a case, we should not jump to the redo label because
+we will be caught in the loop and hog the CPU until normal data
+comes in.
+
+Then, we need to handle skb == NULL case with the if-clause below
+the manage_oob() block.
+
+With this patch:
+
+  >>> from socket import *
+  >>> c1, c2 = socketpair(AF_UNIX, SOCK_STREAM)
+  >>> c1.send(b'a', MSG_OOB)
+  1
+  >>> c2.recv(1, MSG_PEEK | MSG_DONTWAIT)
+  Traceback (most recent call last):
+    File "<stdin>", line 1, in <module>
+  BlockingIOError: [Errno 11] Resource temporarily unavailable
+
+Fixes: 314001f0bf92 ("af_unix: Add OOB support")
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Link: https://lore.kernel.org/r/20240410171016.7621-3-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/unix/af_unix.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
+index 6af6f82e89464..f28e2956fea58 100644
+--- a/net/unix/af_unix.c
++++ b/net/unix/af_unix.c
+@@ -2675,7 +2675,9 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
+                                       WRITE_ONCE(u->oob_skb, NULL);
+                                       consume_skb(skb);
+                               }
+-                      } else if (!(flags & MSG_PEEK)) {
++                      } else if (flags & MSG_PEEK) {
++                              skb = NULL;
++                      } else {
+                               skb_unlink(skb, &sk->sk_receive_queue);
+                               WRITE_ONCE(u->oob_skb, NULL);
+                               if (!WARN_ON_ONCE(skb_unref(skb)))
+@@ -2757,11 +2759,9 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state,
+ #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
+               if (skb) {
+                       skb = manage_oob(skb, sk, flags, copied);
+-                      if (!skb) {
++                      if (!skb && copied) {
+                               unix_state_unlock(sk);
+-                              if (copied)
+-                                      break;
+-                              goto redo;
++                              break;
+                       }
+               }
+ #endif
+-- 
+2.43.0
+
diff --git a/queue-6.1/ice-tc-allow-zero-flags-in-parsing-tc-flower.patch b/queue-6.1/ice-tc-allow-zero-flags-in-parsing-tc-flower.patch
new file mode 100644 (file)
index 0000000..e6408cc
--- /dev/null
@@ -0,0 +1,49 @@
+From 36fbe5e4463909341c634ca6f81c0559c84dafd8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 15 Mar 2024 12:08:21 +0100
+Subject: ice: tc: allow zero flags in parsing tc flower
+
+From: Michal Swiatkowski <michal.swiatkowski@linux.intel.com>
+
+[ Upstream commit 73278715725a8347032acf233082ca4eb31e6a56 ]
+
+The check for flags is done to not pass empty lookups to adding switch
+rule functions. Since metadata is always added to lookups there is no
+need to check against the flag.
+
+It also fixes the problem with a rule such as:
+$ tc filter add dev gtp_dev ingress protocol ip prio 0 flower \
+       enc_dst_port 2123 action drop
+The switch block in the GTP case can't parse the destination port, since
+it should always be set to the GTP-specific value. The same applies to the
+ethertype. The result is that there are no matching criteria other than
+the GTP tunnel. In this case flags is 0, and the rule can't be added only
+because of the defensive check against flags.
+
+Fixes: 9a225f81f540 ("ice: Support GTP-U and GTP-C offload in switchdev")
+Reviewed-by: Wojciech Drewek <wojciech.drewek@intel.com>
+Signed-off-by: Michal Swiatkowski <michal.swiatkowski@linux.intel.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Tested-by: Sujai Buvaneswaran <sujai.buvaneswaran@intel.com>
+Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/ice/ice_tc_lib.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/intel/ice/ice_tc_lib.c b/drivers/net/ethernet/intel/ice/ice_tc_lib.c
+index 652ef09eeb305..ec6628aacc13b 100644
+--- a/drivers/net/ethernet/intel/ice/ice_tc_lib.c
++++ b/drivers/net/ethernet/intel/ice/ice_tc_lib.c
+@@ -663,7 +663,7 @@ ice_eswitch_add_tc_fltr(struct ice_vsi *vsi, struct ice_tc_flower_fltr *fltr)
+       int ret;
+       int i;
+-      if (!flags || (flags & ICE_TC_FLWR_FIELD_ENC_SRC_L4_PORT)) {
++      if (flags & ICE_TC_FLWR_FIELD_ENC_SRC_L4_PORT) {
+               NL_SET_ERR_MSG_MOD(fltr->extack, "Unsupported encap field(s)");
+               return -EOPNOTSUPP;
+       }
+-- 
+2.43.0
+
diff --git a/queue-6.1/net-dsa-mt7530-fix-mirroring-frames-received-on-loca.patch b/queue-6.1/net-dsa-mt7530-fix-mirroring-frames-received-on-loca.patch
new file mode 100644 (file)
index 0000000..35c575e
--- /dev/null
@@ -0,0 +1,81 @@
+From 532cb4e8e8c9d7bc28dd8cb1c98269e4ca271988 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 13 Apr 2024 16:01:39 +0300
+Subject: net: dsa: mt7530: fix mirroring frames received on local port
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Arınç ÜNAL <arinc.unal@arinc9.com>
+
+[ Upstream commit d59cf049c8378677053703e724808836f180888e ]
+
+This switch intellectual property provides a bit on the ARL global control
+register which controls allowing mirroring frames which are received on the
+local port (monitor port). This bit is unset after reset.
+
+This ability must be enabled to fully support the port mirroring feature on
+this switch intellectual property.
+
+Therefore, this patch fixes the traffic not being reflected on a port,
+which would be configured like below:
+
+  tc qdisc add dev swp0 clsact
+
+  tc filter add dev swp0 ingress matchall skip_sw \
+  action mirred egress mirror dev swp0
+
+As a side note, this configuration provides the hairpinning feature for a
+single port.
+
+Fixes: 37feab6076aa ("net: dsa: mt7530: add support for port mirroring")
+Signed-off-by: Arınç ÜNAL <arinc.unal@arinc9.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/dsa/mt7530.c | 6 ++++++
+ drivers/net/dsa/mt7530.h | 4 ++++
+ 2 files changed, 10 insertions(+)
+
+diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c
+index d4515c19a5f34..b5f61a9a378eb 100644
+--- a/drivers/net/dsa/mt7530.c
++++ b/drivers/net/dsa/mt7530.c
+@@ -2461,6 +2461,9 @@ mt7530_setup(struct dsa_switch *ds)
+                          PVC_EG_TAG(MT7530_VLAN_EG_CONSISTENT));
+       }
++      /* Allow mirroring frames received on the local port (monitor port). */
++      mt7530_set(priv, MT753X_AGC, LOCAL_EN);
++
+       /* Setup VLAN ID 0 for VLAN-unaware bridges */
+       ret = mt7530_setup_vlan0(priv);
+       if (ret)
+@@ -2577,6 +2580,9 @@ mt7531_setup_common(struct dsa_switch *ds)
+                          PVC_EG_TAG(MT7530_VLAN_EG_CONSISTENT));
+       }
++      /* Allow mirroring frames received on the local port (monitor port). */
++      mt7530_set(priv, MT753X_AGC, LOCAL_EN);
++
+       /* Flush the FDB table */
+       ret = mt7530_fdb_cmd(priv, MT7530_FDB_FLUSH, NULL);
+       if (ret < 0)
+diff --git a/drivers/net/dsa/mt7530.h b/drivers/net/dsa/mt7530.h
+index 2d1ea390f05ab..af18f47f22141 100644
+--- a/drivers/net/dsa/mt7530.h
++++ b/drivers/net/dsa/mt7530.h
+@@ -31,6 +31,10 @@ enum mt753x_id {
+ #define SYSC_REG_RSTCTRL              0x34
+ #define  RESET_MCM                    BIT(2)
++/* Register for ARL global control */
++#define MT753X_AGC                    0xc
++#define  LOCAL_EN                     BIT(7)
++
+ /* Registers to mac forward control for unknown frames */
+ #define MT7530_MFC                    0x10
+ #define  BC_FFP(x)                    (((x) & 0xff) << 24)
+-- 
+2.43.0
+
diff --git a/queue-6.1/net-ethernet-ti-am65-cpsw-nuss-cleanup-dma-channels-.patch b/queue-6.1/net-ethernet-ti-am65-cpsw-nuss-cleanup-dma-channels-.patch
new file mode 100644 (file)
index 0000000..d55c141
--- /dev/null
@@ -0,0 +1,66 @@
+From ccc09f495f21e613feafb1acd3c6c998a0608644 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 17 Apr 2024 15:24:25 +0530
+Subject: net: ethernet: ti: am65-cpsw-nuss: cleanup DMA Channels before using
+ them
+
+From: Siddharth Vadapalli <s-vadapalli@ti.com>
+
+[ Upstream commit c24cd679b075b0e953ea167b0aa2b2d59e4eba7f ]
+
+The TX and RX DMA Channels used by the driver to exchange data with CPSW
+are not guaranteed to be in a clean state during driver initialization.
+The Bootloader could have used the same DMA Channels without cleaning them
+up in the event of failure. Thus, reset and disable the DMA Channels to
+ensure that they are in a clean state before using them.
+
+Fixes: 93a76530316a ("net: ethernet: ti: introduce am65x/j721e gigabit eth subsystem driver")
+Reported-by: Schuyler Patton <spatton@ti.com>
+Signed-off-by: Siddharth Vadapalli <s-vadapalli@ti.com>
+Reviewed-by: Roger Quadros <rogerq@kernel.org>
+Link: https://lore.kernel.org/r/20240417095425.2253876-1-s-vadapalli@ti.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/ti/am65-cpsw-nuss.c | 18 ++++++++++++++++++
+ 1 file changed, 18 insertions(+)
+
+diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
+index 76fabeae512db..33df06a2de13a 100644
+--- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c
++++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
+@@ -2549,6 +2549,8 @@ static void am65_cpsw_unregister_devlink(struct am65_cpsw_common *common)
+ static int am65_cpsw_nuss_register_ndevs(struct am65_cpsw_common *common)
+ {
++      struct am65_cpsw_rx_chn *rx_chan = &common->rx_chns;
++      struct am65_cpsw_tx_chn *tx_chan = common->tx_chns;
+       struct device *dev = common->dev;
+       struct devlink_port *dl_port;
+       struct am65_cpsw_port *port;
+@@ -2567,6 +2569,22 @@ static int am65_cpsw_nuss_register_ndevs(struct am65_cpsw_common *common)
+               return ret;
+       }
++      /* The DMA Channels are not guaranteed to be in a clean state.
++       * Reset and disable them to ensure that they are back to the
++       * clean state and ready to be used.
++       */
++      for (i = 0; i < common->tx_ch_num; i++) {
++              k3_udma_glue_reset_tx_chn(tx_chan[i].tx_chn, &tx_chan[i],
++                                        am65_cpsw_nuss_tx_cleanup);
++              k3_udma_glue_disable_tx_chn(tx_chan[i].tx_chn);
++      }
++
++      for (i = 0; i < AM65_CPSW_MAX_RX_FLOWS; i++)
++              k3_udma_glue_reset_rx_chn(rx_chan->rx_chn, i, rx_chan,
++                                        am65_cpsw_nuss_rx_cleanup, !!i);
++
++      k3_udma_glue_disable_rx_chn(rx_chan->rx_chn);
++
+       ret = am65_cpsw_nuss_register_devlink(common);
+       if (ret)
+               return ret;
+-- 
+2.43.0
+
diff --git a/queue-6.1/net-mlx5-lag-restore-buckets-number-to-default-after.patch b/queue-6.1/net-mlx5-lag-restore-buckets-number-to-default-after.patch
new file mode 100644 (file)
index 0000000..87e1074
--- /dev/null
@@ -0,0 +1,49 @@
+From 83f896f0387b53a68020c579eb70281fff5fabcd Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 11 Apr 2024 14:54:39 +0300
+Subject: net/mlx5: Lag, restore buckets number to default after hash LAG
+ deactivation
+
+From: Shay Drory <shayd@nvidia.com>
+
+[ Upstream commit 37cc10da3a50e6d0cb9808a90b7da9b4868794dd ]
+
+The cited patch introduces the concept of buckets in LAG in hash mode.
+However, the patch doesn't clear the number of buckets in the LAG
+deactivation. This results in using the wrong number of buckets in
+case the user creates a hash mode LAG and afterwards creates a
+non-hash mode LAG.
+
+Hence, restore buckets number to default after hash mode LAG
+deactivation.
+
+Fixes: 352899f384d4 ("net/mlx5: Lag, use buckets in hash mode")
+Signed-off-by: Shay Drory <shayd@nvidia.com>
+Reviewed-by: Maor Gottlieb <maorg@nvidia.com>
+Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
+Link: https://lore.kernel.org/r/20240411115444.374475-2-tariqt@nvidia.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
+index ad32b80e85018..01c0e1ee918d8 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
+@@ -679,8 +679,10 @@ static int mlx5_deactivate_lag(struct mlx5_lag *ldev)
+               return err;
+       }
+-      if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags))
++      if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) {
+               mlx5_lag_port_sel_destroy(ldev);
++              ldev->buckets = 1;
++      }
+       if (mlx5_lag_has_drop_rule(ldev))
+               mlx5_lag_drop_rule_cleanup(ldev);
+-- 
+2.43.0
+
diff --git a/queue-6.1/net-mlx5e-prevent-deadlock-while-disabling-arfs.patch b/queue-6.1/net-mlx5e-prevent-deadlock-while-disabling-arfs.patch
new file mode 100644 (file)
index 0000000..295abb7
--- /dev/null
@@ -0,0 +1,213 @@
+From 247c49ffcc4ffe91bda55a3bb05910762e684448 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 11 Apr 2024 14:54:44 +0300
+Subject: net/mlx5e: Prevent deadlock while disabling aRFS
+
+From: Carolina Jubran <cjubran@nvidia.com>
+
+[ Upstream commit fef965764cf562f28afb997b626fc7c3cec99693 ]
+
+When disabling aRFS under the `priv->state_lock`, any scheduled
+aRFS works are canceled using the `cancel_work_sync` function,
+which waits for the work to end if it has already started.
+However, while waiting for the work handler, the handler will
+try to acquire the `state_lock` which is already acquired.
+
+The worker acquires the lock to delete the rules if the state
+is down, which is not the worker's responsibility since
+disabling aRFS deletes the rules.
+
+Add an aRFS state variable, which indicates whether the aRFS is
+enabled and prevent adding rules when the aRFS is disabled.
+
+Kernel log:
+
+======================================================
+WARNING: possible circular locking dependency detected
+6.7.0-rc4_net_next_mlx5_5483eb2 #1 Tainted: G          I
+------------------------------------------------------
+ethtool/386089 is trying to acquire lock:
+ffff88810f21ce68 ((work_completion)(&rule->arfs_work)){+.+.}-{0:0}, at: __flush_work+0x74/0x4e0
+
+but task is already holding lock:
+ffff8884a1808cc0 (&priv->state_lock){+.+.}-{3:3}, at: mlx5e_ethtool_set_channels+0x53/0x200 [mlx5_core]
+
+which lock already depends on the new lock.
+
+the existing dependency chain (in reverse order) is:
+
+-> #1 (&priv->state_lock){+.+.}-{3:3}:
+       __mutex_lock+0x80/0xc90
+       arfs_handle_work+0x4b/0x3b0 [mlx5_core]
+       process_one_work+0x1dc/0x4a0
+       worker_thread+0x1bf/0x3c0
+       kthread+0xd7/0x100
+       ret_from_fork+0x2d/0x50
+       ret_from_fork_asm+0x11/0x20
+
+-> #0 ((work_completion)(&rule->arfs_work)){+.+.}-{0:0}:
+       __lock_acquire+0x17b4/0x2c80
+       lock_acquire+0xd0/0x2b0
+       __flush_work+0x7a/0x4e0
+       __cancel_work_timer+0x131/0x1c0
+       arfs_del_rules+0x143/0x1e0 [mlx5_core]
+       mlx5e_arfs_disable+0x1b/0x30 [mlx5_core]
+       mlx5e_ethtool_set_channels+0xcb/0x200 [mlx5_core]
+       ethnl_set_channels+0x28f/0x3b0
+       ethnl_default_set_doit+0xec/0x240
+       genl_family_rcv_msg_doit+0xd0/0x120
+       genl_rcv_msg+0x188/0x2c0
+       netlink_rcv_skb+0x54/0x100
+       genl_rcv+0x24/0x40
+       netlink_unicast+0x1a1/0x270
+       netlink_sendmsg+0x214/0x460
+       __sock_sendmsg+0x38/0x60
+       __sys_sendto+0x113/0x170
+       __x64_sys_sendto+0x20/0x30
+       do_syscall_64+0x40/0xe0
+       entry_SYSCALL_64_after_hwframe+0x46/0x4e
+
+other info that might help us debug this:
+
+ Possible unsafe locking scenario:
+
+       CPU0                    CPU1
+       ----                    ----
+  lock(&priv->state_lock);
+                               lock((work_completion)(&rule->arfs_work));
+                               lock(&priv->state_lock);
+  lock((work_completion)(&rule->arfs_work));
+
+ *** DEADLOCK ***
+
+3 locks held by ethtool/386089:
+ #0: ffffffff82ea7210 (cb_lock){++++}-{3:3}, at: genl_rcv+0x15/0x40
+ #1: ffffffff82e94c88 (rtnl_mutex){+.+.}-{3:3}, at: ethnl_default_set_doit+0xd3/0x240
+ #2: ffff8884a1808cc0 (&priv->state_lock){+.+.}-{3:3}, at: mlx5e_ethtool_set_channels+0x53/0x200 [mlx5_core]
+
+stack backtrace:
+CPU: 15 PID: 386089 Comm: ethtool Tainted: G          I        6.7.0-rc4_net_next_mlx5_5483eb2 #1
+Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
+Call Trace:
+ <TASK>
+ dump_stack_lvl+0x60/0xa0
+ check_noncircular+0x144/0x160
+ __lock_acquire+0x17b4/0x2c80
+ lock_acquire+0xd0/0x2b0
+ ? __flush_work+0x74/0x4e0
+ ? save_trace+0x3e/0x360
+ ? __flush_work+0x74/0x4e0
+ __flush_work+0x7a/0x4e0
+ ? __flush_work+0x74/0x4e0
+ ? __lock_acquire+0xa78/0x2c80
+ ? lock_acquire+0xd0/0x2b0
+ ? mark_held_locks+0x49/0x70
+ __cancel_work_timer+0x131/0x1c0
+ ? mark_held_locks+0x49/0x70
+ arfs_del_rules+0x143/0x1e0 [mlx5_core]
+ mlx5e_arfs_disable+0x1b/0x30 [mlx5_core]
+ mlx5e_ethtool_set_channels+0xcb/0x200 [mlx5_core]
+ ethnl_set_channels+0x28f/0x3b0
+ ethnl_default_set_doit+0xec/0x240
+ genl_family_rcv_msg_doit+0xd0/0x120
+ genl_rcv_msg+0x188/0x2c0
+ ? ethnl_ops_begin+0xb0/0xb0
+ ? genl_family_rcv_msg_dumpit+0xf0/0xf0
+ netlink_rcv_skb+0x54/0x100
+ genl_rcv+0x24/0x40
+ netlink_unicast+0x1a1/0x270
+ netlink_sendmsg+0x214/0x460
+ __sock_sendmsg+0x38/0x60
+ __sys_sendto+0x113/0x170
+ ? do_user_addr_fault+0x53f/0x8f0
+ __x64_sys_sendto+0x20/0x30
+ do_syscall_64+0x40/0xe0
+ entry_SYSCALL_64_after_hwframe+0x46/0x4e
+ </TASK>
+
+Fixes: 45bf454ae884 ("net/mlx5e: Enabling aRFS mechanism")
+Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
+Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
+Link: https://lore.kernel.org/r/20240411115444.374475-7-tariqt@nvidia.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../net/ethernet/mellanox/mlx5/core/en_arfs.c | 27 +++++++++++--------
+ 1 file changed, 16 insertions(+), 11 deletions(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
+index 58eacba6de8cd..ad51edf553185 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
+@@ -45,6 +45,10 @@ struct arfs_table {
+       struct hlist_head        rules_hash[ARFS_HASH_SIZE];
+ };
++enum {
++      MLX5E_ARFS_STATE_ENABLED,
++};
++
+ enum arfs_type {
+       ARFS_IPV4_TCP,
+       ARFS_IPV6_TCP,
+@@ -60,6 +64,7 @@ struct mlx5e_arfs_tables {
+       struct list_head               rules;
+       int                            last_filter_id;
+       struct workqueue_struct        *wq;
++      unsigned long                  state;
+ };
+ struct arfs_tuple {
+@@ -170,6 +175,8 @@ int mlx5e_arfs_enable(struct mlx5e_flow_steering *fs)
+                       return err;
+               }
+       }
++      set_bit(MLX5E_ARFS_STATE_ENABLED, &arfs->state);
++
+       return 0;
+ }
+@@ -454,6 +461,8 @@ static void arfs_del_rules(struct mlx5e_flow_steering *fs)
+       int i;
+       int j;
++      clear_bit(MLX5E_ARFS_STATE_ENABLED, &arfs->state);
++
+       spin_lock_bh(&arfs->arfs_lock);
+       mlx5e_for_each_arfs_rule(rule, htmp, arfs->arfs_tables, i, j) {
+               hlist_del_init(&rule->hlist);
+@@ -621,17 +630,8 @@ static void arfs_handle_work(struct work_struct *work)
+       struct mlx5_flow_handle *rule;
+       arfs = mlx5e_fs_get_arfs(priv->fs);
+-      mutex_lock(&priv->state_lock);
+-      if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) {
+-              spin_lock_bh(&arfs->arfs_lock);
+-              hlist_del(&arfs_rule->hlist);
+-              spin_unlock_bh(&arfs->arfs_lock);
+-
+-              mutex_unlock(&priv->state_lock);
+-              kfree(arfs_rule);
+-              goto out;
+-      }
+-      mutex_unlock(&priv->state_lock);
++      if (!test_bit(MLX5E_ARFS_STATE_ENABLED, &arfs->state))
++              return;
+       if (!arfs_rule->rule) {
+               rule = arfs_add_rule(priv, arfs_rule);
+@@ -744,6 +744,11 @@ int mlx5e_rx_flow_steer(struct net_device *dev, const struct sk_buff *skb,
+               return -EPROTONOSUPPORT;
+       spin_lock_bh(&arfs->arfs_lock);
++      if (!test_bit(MLX5E_ARFS_STATE_ENABLED, &arfs->state)) {
++              spin_unlock_bh(&arfs->arfs_lock);
++              return -EPERM;
++      }
++
+       arfs_rule = arfs_find_rule(arfs_t, &fk);
+       if (arfs_rule) {
+               if (arfs_rule->rxq == rxq_index) {
+-- 
+2.43.0
+
diff --git a/queue-6.1/netfilter-br_netfilter-skip-conntrack-input-hook-for.patch b/queue-6.1/netfilter-br_netfilter-skip-conntrack-input-hook-for.patch
new file mode 100644 (file)
index 0000000..1fcd5b6
--- /dev/null
@@ -0,0 +1,220 @@
+From b153d3016ef466f490c91a7cc46edd0db716507c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 9 Apr 2024 11:24:59 +0200
+Subject: netfilter: br_netfilter: skip conntrack input hook for promisc
+ packets
+
+From: Pablo Neira Ayuso <pablo@netfilter.org>
+
+[ Upstream commit 751de2012eafa4d46d8081056761fa0e9cc8a178 ]
+
+For historical reasons, when bridge device is in promisc mode, packets
+that are directed to the taps follow bridge input hook path. This patch
+adds a workaround to reset conntrack for these packets.
+
+Jianbo Liu reports warning splats in their test infrastructure where
+cloned packets reach the br_netfilter input hook to confirm the
+conntrack object.
+
+Scratch one bit from BR_INPUT_SKB_CB to annotate that this packet has
+reached the input hook because it is passed up to the bridge device to
+reach the taps.
+
+[   57.571874] WARNING: CPU: 1 PID: 0 at net/bridge/br_netfilter_hooks.c:616 br_nf_local_in+0x157/0x180 [br_netfilter]
+[   57.572749] Modules linked in: xt_MASQUERADE nf_conntrack_netlink nfnetlink iptable_nat xt_addrtype xt_conntrack nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_isc si ib_umad rdma_cm ib_ipoib iw_cm ib_cm mlx5_ib ib_uverbs ib_core mlx5ctl mlx5_core
+[   57.575158] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 6.8.0+ #19
+[   57.575700] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
+[   57.576662] RIP: 0010:br_nf_local_in+0x157/0x180 [br_netfilter]
+[   57.577195] Code: fe ff ff 41 bd 04 00 00 00 be 04 00 00 00 e9 4a ff ff ff be 04 00 00 00 48 89 ef e8 f3 a9 3c e1 66 83 ad b4 00 00 00 04 eb 91 <0f> 0b e9 f1 fe ff ff 0f 0b e9 df fe ff ff 48 89 df e8 b3 53 47 e1
+[   57.578722] RSP: 0018:ffff88885f845a08 EFLAGS: 00010202
+[   57.579207] RAX: 0000000000000002 RBX: ffff88812dfe8000 RCX: 0000000000000000
+[   57.579830] RDX: ffff88885f845a60 RSI: ffff8881022dc300 RDI: 0000000000000000
+[   57.580454] RBP: ffff88885f845a60 R08: 0000000000000001 R09: 0000000000000003
+[   57.581076] R10: 00000000ffff1300 R11: 0000000000000002 R12: 0000000000000000
+[   57.581695] R13: ffff8881047ffe00 R14: ffff888108dbee00 R15: ffff88814519b800
+[   57.582313] FS:  0000000000000000(0000) GS:ffff88885f840000(0000) knlGS:0000000000000000
+[   57.583040] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[   57.583564] CR2: 000000c4206aa000 CR3: 0000000103847001 CR4: 0000000000370eb0
+[   57.584194] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
+0000000000000000
+[   57.584820] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
+0000000000000400
+[   57.585440] Call Trace:
+[   57.585721]  <IRQ>
+[   57.585976]  ? __warn+0x7d/0x130
+[   57.586323]  ? br_nf_local_in+0x157/0x180 [br_netfilter]
+[   57.586811]  ? report_bug+0xf1/0x1c0
+[   57.587177]  ? handle_bug+0x3f/0x70
+[   57.587539]  ? exc_invalid_op+0x13/0x60
+[   57.587929]  ? asm_exc_invalid_op+0x16/0x20
+[   57.588336]  ? br_nf_local_in+0x157/0x180 [br_netfilter]
+[   57.588825]  nf_hook_slow+0x3d/0xd0
+[   57.589188]  ? br_handle_vlan+0x4b/0x110
+[   57.589579]  br_pass_frame_up+0xfc/0x150
+[   57.589970]  ? br_port_flags_change+0x40/0x40
+[   57.590396]  br_handle_frame_finish+0x346/0x5e0
+[   57.590837]  ? ipt_do_table+0x32e/0x430
+[   57.591221]  ? br_handle_local_finish+0x20/0x20
+[   57.591656]  br_nf_hook_thresh+0x4b/0xf0 [br_netfilter]
+[   57.592286]  ? br_handle_local_finish+0x20/0x20
+[   57.592802]  br_nf_pre_routing_finish+0x178/0x480 [br_netfilter]
+[   57.593348]  ? br_handle_local_finish+0x20/0x20
+[   57.593782]  ? nf_nat_ipv4_pre_routing+0x25/0x60 [nf_nat]
+[   57.594279]  br_nf_pre_routing+0x24c/0x550 [br_netfilter]
+[   57.594780]  ? br_nf_hook_thresh+0xf0/0xf0 [br_netfilter]
+[   57.595280]  br_handle_frame+0x1f3/0x3d0
+[   57.595676]  ? br_handle_local_finish+0x20/0x20
+[   57.596118]  ? br_handle_frame_finish+0x5e0/0x5e0
+[   57.596566]  __netif_receive_skb_core+0x25b/0xfc0
+[   57.597017]  ? __napi_build_skb+0x37/0x40
+[   57.597418]  __netif_receive_skb_list_core+0xfb/0x220
+
+Fixes: 62e7151ae3eb ("netfilter: bridge: confirm multicast packets before passing them up the stack")
+Reported-by: Jianbo Liu <jianbol@nvidia.com>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/bridge/br_input.c                      | 15 +++++++++++----
+ net/bridge/br_netfilter_hooks.c            |  6 ++++++
+ net/bridge/br_private.h                    |  1 +
+ net/bridge/netfilter/nf_conntrack_bridge.c | 14 ++++++++++----
+ 4 files changed, 28 insertions(+), 8 deletions(-)
+
+diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
+index 6bb272894c960..b94a1783902ea 100644
+--- a/net/bridge/br_input.c
++++ b/net/bridge/br_input.c
+@@ -30,7 +30,7 @@ br_netif_receive_skb(struct net *net, struct sock *sk, struct sk_buff *skb)
+       return netif_receive_skb(skb);
+ }
+-static int br_pass_frame_up(struct sk_buff *skb)
++static int br_pass_frame_up(struct sk_buff *skb, bool promisc)
+ {
+       struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev;
+       struct net_bridge *br = netdev_priv(brdev);
+@@ -65,6 +65,8 @@ static int br_pass_frame_up(struct sk_buff *skb)
+       br_multicast_count(br, NULL, skb, br_multicast_igmp_type(skb),
+                          BR_MCAST_DIR_TX);
++      BR_INPUT_SKB_CB(skb)->promisc = promisc;
++
+       return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN,
+                      dev_net(indev), NULL, skb, indev, NULL,
+                      br_netif_receive_skb);
+@@ -82,6 +84,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
+       struct net_bridge_mcast *brmctx;
+       struct net_bridge_vlan *vlan;
+       struct net_bridge *br;
++      bool promisc;
+       u16 vid = 0;
+       u8 state;
+@@ -120,7 +123,9 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
+       if (p->flags & BR_LEARNING)
+               br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, 0);
+-      local_rcv = !!(br->dev->flags & IFF_PROMISC);
++      promisc = !!(br->dev->flags & IFF_PROMISC);
++      local_rcv = promisc;
++
+       if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) {
+               /* by definition the broadcast is also a multicast address */
+               if (is_broadcast_ether_addr(eth_hdr(skb)->h_dest)) {
+@@ -183,7 +188,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
+               unsigned long now = jiffies;
+               if (test_bit(BR_FDB_LOCAL, &dst->flags))
+-                      return br_pass_frame_up(skb);
++                      return br_pass_frame_up(skb, false);
+               if (now != dst->used)
+                       dst->used = now;
+@@ -196,7 +201,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
+       }
+       if (local_rcv)
+-              return br_pass_frame_up(skb);
++              return br_pass_frame_up(skb, promisc);
+ out:
+       return 0;
+@@ -368,6 +373,8 @@ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
+                               goto forward;
+               }
++              BR_INPUT_SKB_CB(skb)->promisc = false;
++
+               /* The else clause should be hit when nf_hook():
+                *   - returns < 0 (drop/error)
+                *   - returns = 0 (stolen/nf_queue)
+diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
+index bff48d5763635..9ac70c27da835 100644
+--- a/net/bridge/br_netfilter_hooks.c
++++ b/net/bridge/br_netfilter_hooks.c
+@@ -600,11 +600,17 @@ static unsigned int br_nf_local_in(void *priv,
+                                  struct sk_buff *skb,
+                                  const struct nf_hook_state *state)
+ {
++      bool promisc = BR_INPUT_SKB_CB(skb)->promisc;
+       struct nf_conntrack *nfct = skb_nfct(skb);
+       const struct nf_ct_hook *ct_hook;
+       struct nf_conn *ct;
+       int ret;
++      if (promisc) {
++              nf_reset_ct(skb);
++              return NF_ACCEPT;
++      }
++
+       if (!nfct || skb->pkt_type == PACKET_HOST)
+               return NF_ACCEPT;
+diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
+index 51d010f64e066..940de95167689 100644
+--- a/net/bridge/br_private.h
++++ b/net/bridge/br_private.h
+@@ -559,6 +559,7 @@ struct br_input_skb_cb {
+ #endif
+       u8 proxyarp_replied:1;
+       u8 src_port_isolated:1;
++      u8 promisc:1;
+ #ifdef CONFIG_BRIDGE_VLAN_FILTERING
+       u8 vlan_filtered:1;
+ #endif
+diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c
+index c7c27ada67044..e60c38670f220 100644
+--- a/net/bridge/netfilter/nf_conntrack_bridge.c
++++ b/net/bridge/netfilter/nf_conntrack_bridge.c
+@@ -294,18 +294,24 @@ static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb,
+ static unsigned int nf_ct_bridge_in(void *priv, struct sk_buff *skb,
+                                   const struct nf_hook_state *state)
+ {
+-      enum ip_conntrack_info ctinfo;
++      bool promisc = BR_INPUT_SKB_CB(skb)->promisc;
++      struct nf_conntrack *nfct = skb_nfct(skb);
+       struct nf_conn *ct;
+-      if (skb->pkt_type == PACKET_HOST)
++      if (promisc) {
++              nf_reset_ct(skb);
++              return NF_ACCEPT;
++      }
++
++      if (!nfct || skb->pkt_type == PACKET_HOST)
+               return NF_ACCEPT;
+       /* nf_conntrack_confirm() cannot handle concurrent clones,
+        * this happens for broad/multicast frames with e.g. macvlan on top
+        * of the bridge device.
+        */
+-      ct = nf_ct_get(skb, &ctinfo);
+-      if (!ct || nf_ct_is_confirmed(ct) || nf_ct_is_template(ct))
++      ct = container_of(nfct, struct nf_conn, ct_general);
++      if (nf_ct_is_confirmed(ct) || nf_ct_is_template(ct))
+               return NF_ACCEPT;
+       /* let inet prerouting call conntrack again */
+-- 
+2.43.0
+
diff --git a/queue-6.1/netfilter-flowtable-incorrect-pppoe-tuple.patch b/queue-6.1/netfilter-flowtable-incorrect-pppoe-tuple.patch
new file mode 100644 (file)
index 0000000..9810635
--- /dev/null
@@ -0,0 +1,37 @@
+From 8eb1eb619ae267f63678529bb56cc93e386a5013 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 11 Apr 2024 00:09:00 +0200
+Subject: netfilter: flowtable: incorrect pppoe tuple
+
+From: Pablo Neira Ayuso <pablo@netfilter.org>
+
+[ Upstream commit 6db5dc7b351b9569940cd1cf445e237c42cd6d27 ]
+
+pppoe traffic reaching ingress path does not match the flowtable entry
+because the pppoe header is expected to be at the network header offset.
+This bug causes a mismatch in the flow table lookup, so pppoe packets
+enter the classical forwarding path.
+
+Fixes: 72efd585f714 ("netfilter: flowtable: add pppoe support")
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_flow_table_ip.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
+index 306e1ba6012e2..22bc0e3d8a0b5 100644
+--- a/net/netfilter/nf_flow_table_ip.c
++++ b/net/netfilter/nf_flow_table_ip.c
+@@ -156,7 +156,7 @@ static void nf_flow_tuple_encap(struct sk_buff *skb,
+               tuple->encap[i].proto = skb->protocol;
+               break;
+       case htons(ETH_P_PPP_SES):
+-              phdr = (struct pppoe_hdr *)skb_mac_header(skb);
++              phdr = (struct pppoe_hdr *)skb_network_header(skb);
+               tuple->encap[i].id = ntohs(phdr->sid);
+               tuple->encap[i].proto = skb->protocol;
+               break;
+-- 
+2.43.0
+
diff --git a/queue-6.1/netfilter-flowtable-validate-pppoe-header.patch b/queue-6.1/netfilter-flowtable-validate-pppoe-header.patch
new file mode 100644 (file)
index 0000000..7c4fe64
--- /dev/null
@@ -0,0 +1,106 @@
+From baba5ca18028264838911096e06746e96ecb047c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 9 Apr 2024 13:47:33 +0200
+Subject: netfilter: flowtable: validate pppoe header
+
+From: Pablo Neira Ayuso <pablo@netfilter.org>
+
+[ Upstream commit 87b3593bed1868b2d9fe096c01bcdf0ea86cbebf ]
+
+Ensure there is sufficient room to access the protocol field of the
+PPPoe header. Validate it once before the flowtable lookup, then use a
+helper function to access protocol field.
+
+Reported-by: syzbot+b6f07e1c07ef40199081@syzkaller.appspotmail.com
+Fixes: 72efd585f714 ("netfilter: flowtable: add pppoe support")
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/netfilter/nf_flow_table.h | 12 +++++++++++-
+ net/netfilter/nf_flow_table_inet.c    |  3 ++-
+ net/netfilter/nf_flow_table_ip.c      |  8 +++++---
+ 3 files changed, 18 insertions(+), 5 deletions(-)
+
+diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
+index 4a767b3d20b9d..df7775afb92b9 100644
+--- a/include/net/netfilter/nf_flow_table.h
++++ b/include/net/netfilter/nf_flow_table.h
+@@ -335,7 +335,7 @@ int nf_flow_rule_route_ipv6(struct net *net, struct flow_offload *flow,
+ int nf_flow_table_offload_init(void);
+ void nf_flow_table_offload_exit(void);
+-static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb)
++static inline __be16 __nf_flow_pppoe_proto(const struct sk_buff *skb)
+ {
+       __be16 proto;
+@@ -351,6 +351,16 @@ static inline __be16 nf_flow_pppoe_proto(const struct sk_buff *skb)
+       return 0;
+ }
++static inline bool nf_flow_pppoe_proto(struct sk_buff *skb, __be16 *inner_proto)
++{
++      if (!pskb_may_pull(skb, PPPOE_SES_HLEN))
++              return false;
++
++      *inner_proto = __nf_flow_pppoe_proto(skb);
++
++      return true;
++}
++
+ #define NF_FLOW_TABLE_STAT_INC(net, count) __this_cpu_inc((net)->ft.stat->count)
+ #define NF_FLOW_TABLE_STAT_DEC(net, count) __this_cpu_dec((net)->ft.stat->count)
+ #define NF_FLOW_TABLE_STAT_INC_ATOMIC(net, count)     \
+diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c
+index 9505f9d188ff2..6eef15648b7b0 100644
+--- a/net/netfilter/nf_flow_table_inet.c
++++ b/net/netfilter/nf_flow_table_inet.c
+@@ -21,7 +21,8 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb,
+               proto = veth->h_vlan_encapsulated_proto;
+               break;
+       case htons(ETH_P_PPP_SES):
+-              proto = nf_flow_pppoe_proto(skb);
++              if (!nf_flow_pppoe_proto(skb, &proto))
++                      return NF_ACCEPT;
+               break;
+       default:
+               proto = skb->protocol;
+diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
+index 6feaac9ab05c8..306e1ba6012e2 100644
+--- a/net/netfilter/nf_flow_table_ip.c
++++ b/net/netfilter/nf_flow_table_ip.c
+@@ -267,10 +267,11 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb,
+       return NF_STOLEN;
+ }
+-static bool nf_flow_skb_encap_protocol(const struct sk_buff *skb, __be16 proto,
++static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto,
+                                      u32 *offset)
+ {
+       struct vlan_ethhdr *veth;
++      __be16 inner_proto;
+       switch (skb->protocol) {
+       case htons(ETH_P_8021Q):
+@@ -281,7 +282,8 @@ static bool nf_flow_skb_encap_protocol(const struct sk_buff *skb, __be16 proto,
+               }
+               break;
+       case htons(ETH_P_PPP_SES):
+-              if (nf_flow_pppoe_proto(skb) == proto) {
++              if (nf_flow_pppoe_proto(skb, &inner_proto) &&
++                  inner_proto == proto) {
+                       *offset += PPPOE_SES_HLEN;
+                       return true;
+               }
+@@ -310,7 +312,7 @@ static void nf_flow_encap_pop(struct sk_buff *skb,
+                       skb_reset_network_header(skb);
+                       break;
+               case htons(ETH_P_PPP_SES):
+-                      skb->protocol = nf_flow_pppoe_proto(skb);
++                      skb->protocol = __nf_flow_pppoe_proto(skb);
+                       skb_pull(skb, PPPOE_SES_HLEN);
+                       skb_reset_network_header(skb);
+                       break;
+-- 
+2.43.0
+
diff --git a/queue-6.1/netfilter-nf_tables-fix-potential-data-race-in-__nft.patch b/queue-6.1/netfilter-nf_tables-fix-potential-data-race-in-__nft.patch
new file mode 100644 (file)
index 0000000..c6ab9ef
--- /dev/null
@@ -0,0 +1,58 @@
+From a78d81b5a55862818a240dd9ddb7f43bd4171f34 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 7 Apr 2024 14:56:04 +0800
+Subject: netfilter: nf_tables: Fix potential data-race in
+ __nft_expr_type_get()
+
+From: Ziyang Xuan <william.xuanziyang@huawei.com>
+
+[ Upstream commit f969eb84ce482331a991079ab7a5c4dc3b7f89bf ]
+
+nft_unregister_expr() can run concurrently with __nft_expr_type_get(),
+and there is no protection when iterating over the nf_tables_expressions
+list in __nft_expr_type_get(). Therefore, there is a potential data race
+on nf_tables_expressions list entries.
+
+Use list_for_each_entry_rcu() to iterate over nf_tables_expressions
+list in __nft_expr_type_get(), and use rcu_read_lock() in the caller
+nft_expr_type_get() to protect the entire type query process.
+
+Fixes: ef1f7df9170d ("netfilter: nf_tables: expression ops overloading")
+Signed-off-by: Ziyang Xuan <william.xuanziyang@huawei.com>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_tables_api.c | 8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
+index 8152a69d82681..ba63866914f18 100644
+--- a/net/netfilter/nf_tables_api.c
++++ b/net/netfilter/nf_tables_api.c
+@@ -2891,7 +2891,7 @@ static const struct nft_expr_type *__nft_expr_type_get(u8 family,
+ {
+       const struct nft_expr_type *type, *candidate = NULL;
+-      list_for_each_entry(type, &nf_tables_expressions, list) {
++      list_for_each_entry_rcu(type, &nf_tables_expressions, list) {
+               if (!nla_strcmp(nla, type->name)) {
+                       if (!type->family && !candidate)
+                               candidate = type;
+@@ -2923,9 +2923,13 @@ static const struct nft_expr_type *nft_expr_type_get(struct net *net,
+       if (nla == NULL)
+               return ERR_PTR(-EINVAL);
++      rcu_read_lock();
+       type = __nft_expr_type_get(family, nla);
+-      if (type != NULL && try_module_get(type->owner))
++      if (type != NULL && try_module_get(type->owner)) {
++              rcu_read_unlock();
+               return type;
++      }
++      rcu_read_unlock();
+       lockdep_nfnl_nft_mutex_not_held();
+ #ifdef CONFIG_MODULES
+-- 
+2.43.0
+
diff --git a/queue-6.1/netfilter-nf_tables-fix-potential-data-race-in-__nft.patch-21398 b/queue-6.1/netfilter-nf_tables-fix-potential-data-race-in-__nft.patch-21398
new file mode 100644 (file)
index 0000000..3310b13
--- /dev/null
@@ -0,0 +1,57 @@
+From 9f9ac30f402f20d9219f324ebb91d6eb1c444443 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 7 Apr 2024 14:56:05 +0800
+Subject: netfilter: nf_tables: Fix potential data-race in __nft_obj_type_get()
+
+From: Ziyang Xuan <william.xuanziyang@huawei.com>
+
+[ Upstream commit d78d867dcea69c328db30df665be5be7d0148484 ]
+
+nft_unregister_obj() can run concurrently with __nft_obj_type_get(),
+and there is no protection when iterating over the nf_tables_objects
+list in __nft_obj_type_get(). Therefore, there is a potential data race
+on nf_tables_objects list entries.
+
+Use list_for_each_entry_rcu() to iterate over nf_tables_objects
+list in __nft_obj_type_get(), and use rcu_read_lock() in the caller
+nft_obj_type_get() to protect the entire type query process.
+
+Fixes: e50092404c1b ("netfilter: nf_tables: add stateful objects")
+Signed-off-by: Ziyang Xuan <william.xuanziyang@huawei.com>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_tables_api.c | 8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
+index ba63866914f18..1c4b7a8ec2cc6 100644
+--- a/net/netfilter/nf_tables_api.c
++++ b/net/netfilter/nf_tables_api.c
+@@ -7175,7 +7175,7 @@ static const struct nft_object_type *__nft_obj_type_get(u32 objtype, u8 family)
+ {
+       const struct nft_object_type *type;
+-      list_for_each_entry(type, &nf_tables_objects, list) {
++      list_for_each_entry_rcu(type, &nf_tables_objects, list) {
+               if (type->family != NFPROTO_UNSPEC &&
+                   type->family != family)
+                       continue;
+@@ -7191,9 +7191,13 @@ nft_obj_type_get(struct net *net, u32 objtype, u8 family)
+ {
+       const struct nft_object_type *type;
++      rcu_read_lock();
+       type = __nft_obj_type_get(objtype, family);
+-      if (type != NULL && try_module_get(type->owner))
++      if (type != NULL && try_module_get(type->owner)) {
++              rcu_read_unlock();
+               return type;
++      }
++      rcu_read_unlock();
+       lockdep_nfnl_nft_mutex_not_held();
+ #ifdef CONFIG_MODULES
+-- 
+2.43.0
+
diff --git a/queue-6.1/netfilter-nft_set_pipapo-do-not-free-live-element.patch b/queue-6.1/netfilter-nft_set_pipapo-do-not-free-live-element.patch
new file mode 100644 (file)
index 0000000..f9e984c
--- /dev/null
@@ -0,0 +1,105 @@
+From f47c1acc43314a6675a44898f437a4b63e4029ce Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 10 Apr 2024 21:05:13 +0200
+Subject: netfilter: nft_set_pipapo: do not free live element
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit 3cfc9ec039af60dbd8965ae085b2c2ccdcfbe1cc ]
+
+Pablo reports a crash with large batches of elements with a
+back-to-back add/remove pattern.  Quoting Pablo:
+
+  add_elem("00000000") timeout 100 ms
+  ...
+  add_elem("0000000X") timeout 100 ms
+  del_elem("0000000X") <---------------- delete one that was just added
+  ...
+  add_elem("00005000") timeout 100 ms
+
+  1) nft_pipapo_remove() removes element 0000000X
+  Then, KASAN shows a splat.
+
+Looking at the remove function there is a chance that we will drop a
+rule that maps to a non-deactivated element.
+
+Removal happens in two steps, first we do a lookup for key k and return the
+to-be-removed element and mark it as inactive in the next generation.
+Then, in a second step, the element gets removed from the set/map.
+
+The _remove function does not work correctly if we have more than one
+element that share the same key.
+
+This can happen if we insert an element into a set when the set already
+holds an element with same key, but the element mapping to the existing
+key has timed out or is not active in the next generation.
+
+In such a case it's possible that removal will unmap the wrong element.
+If this happens, we will leak the non-deactivated element: it becomes
+unreachable.
+
+The element that got deactivated (and will be freed later) will
+remain reachable in the set data structure, this can result in
+a crash when such an element is retrieved during lookup (stale
+pointer).
+
+Add a check that the fully matching key does in fact map to the element
+that we have marked as inactive in the deactivation step.
+If not, we need to continue searching.
+
+Add a bug/warn trap at the end of the function as well, the remove
+function must not ever be called with an invisible/unreachable/non-existent
+element.
+
+v2: avoid unneeded temporary variable (Stefano)
+
+Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges")
+Reported-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nft_set_pipapo.c | 14 +++++++++-----
+ 1 file changed, 9 insertions(+), 5 deletions(-)
+
+diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
+index 58eca26162735..2299ced939c47 100644
+--- a/net/netfilter/nft_set_pipapo.c
++++ b/net/netfilter/nft_set_pipapo.c
+@@ -1994,6 +1994,8 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set,
+               rules_fx = rules_f0;
+               nft_pipapo_for_each_field(f, i, m) {
++                      bool last = i == m->field_count - 1;
++
+                       if (!pipapo_match_field(f, start, rules_fx,
+                                               match_start, match_end))
+                               break;
+@@ -2006,16 +2008,18 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set,
+                       match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
+                       match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
+-              }
+-              if (i == m->field_count) {
+-                      priv->dirty = true;
+-                      pipapo_drop(m, rulemap);
+-                      return;
++                      if (last && f->mt[rulemap[i].to].e == e) {
++                              priv->dirty = true;
++                              pipapo_drop(m, rulemap);
++                              return;
++                      }
+               }
+               first_rule += rules_f0;
+       }
++
++      WARN_ON_ONCE(1); /* elem_priv not found */
+ }
+ /**
+-- 
+2.43.0
+
index 096e4203d5c05a9ddd0f7c2a4c26402959810de7..e1d2ad17bd553bf7806816c67316957a32d531c5 100644 (file)
@@ -31,3 +31,17 @@ x86-head-64-move-the-__head-definition-to-asm-init.h.patch
 x86-sme-move-early-sme-kernel-encryption-handling-into-.head.text.patch
 x86-sev-move-early-startup-code-into-.head.text-section.patch
 x86-efistub-remap-kernel-text-read-only-before-dropping-nx-attribute.patch
+netfilter-nf_tables-fix-potential-data-race-in-__nft.patch
+netfilter-nf_tables-fix-potential-data-race-in-__nft.patch-21398
+netfilter-br_netfilter-skip-conntrack-input-hook-for.patch
+netfilter-nft_set_pipapo-do-not-free-live-element.patch
+netfilter-flowtable-validate-pppoe-header.patch
+netfilter-flowtable-incorrect-pppoe-tuple.patch
+af_unix-call-manage_oob-for-every-skb-in-unix_stream.patch
+af_unix-don-t-peek-oob-data-without-msg_oob.patch
+net-mlx5-lag-restore-buckets-number-to-default-after.patch
+net-mlx5e-prevent-deadlock-while-disabling-arfs.patch
+ice-tc-allow-zero-flags-in-parsing-tc-flower.patch
+tun-limit-printing-rate-when-illegal-packet-received.patch
+net-dsa-mt7530-fix-mirroring-frames-received-on-loca.patch
+net-ethernet-ti-am65-cpsw-nuss-cleanup-dma-channels-.patch
diff --git a/queue-6.1/tun-limit-printing-rate-when-illegal-packet-received.patch b/queue-6.1/tun-limit-printing-rate-when-illegal-packet-received.patch
new file mode 100644 (file)
index 0000000..6d51f9b
--- /dev/null
@@ -0,0 +1,91 @@
+From 3f1bd052b894042c97a8a371c73fb25004a2b32d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 14 Apr 2024 22:02:46 -0400
+Subject: tun: limit printing rate when illegal packet received by tun dev
+
+From: Lei Chen <lei.chen@smartx.com>
+
+[ Upstream commit f8bbc07ac535593139c875ffa19af924b1084540 ]
+
+vhost_worker will call tun callbacks to receive packets. If too many
+illegal packets arrive, tun_do_read will keep dumping packet contents.
+When the console is enabled, it costs much more CPU time to dump the
+packets, and a soft lockup will be detected.
+
+net_ratelimit mechanism can be used to limit the dumping rate.
+
+PID: 33036    TASK: ffff949da6f20000  CPU: 23   COMMAND: "vhost-32980"
+ #0 [fffffe00003fce50] crash_nmi_callback at ffffffff89249253
+ #1 [fffffe00003fce58] nmi_handle at ffffffff89225fa3
+ #2 [fffffe00003fceb0] default_do_nmi at ffffffff8922642e
+ #3 [fffffe00003fced0] do_nmi at ffffffff8922660d
+ #4 [fffffe00003fcef0] end_repeat_nmi at ffffffff89c01663
+    [exception RIP: io_serial_in+20]
+    RIP: ffffffff89792594  RSP: ffffa655314979e8  RFLAGS: 00000002
+    RAX: ffffffff89792500  RBX: ffffffff8af428a0  RCX: 0000000000000000
+    RDX: 00000000000003fd  RSI: 0000000000000005  RDI: ffffffff8af428a0
+    RBP: 0000000000002710   R8: 0000000000000004   R9: 000000000000000f
+    R10: 0000000000000000  R11: ffffffff8acbf64f  R12: 0000000000000020
+    R13: ffffffff8acbf698  R14: 0000000000000058  R15: 0000000000000000
+    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
+ #5 [ffffa655314979e8] io_serial_in at ffffffff89792594
+ #6 [ffffa655314979e8] wait_for_xmitr at ffffffff89793470
+ #7 [ffffa65531497a08] serial8250_console_putchar at ffffffff897934f6
+ #8 [ffffa65531497a20] uart_console_write at ffffffff8978b605
+ #9 [ffffa65531497a48] serial8250_console_write at ffffffff89796558
+ #10 [ffffa65531497ac8] console_unlock at ffffffff89316124
+ #11 [ffffa65531497b10] vprintk_emit at ffffffff89317c07
+ #12 [ffffa65531497b68] printk at ffffffff89318306
+ #13 [ffffa65531497bc8] print_hex_dump at ffffffff89650765
+ #14 [ffffa65531497ca8] tun_do_read at ffffffffc0b06c27 [tun]
+ #15 [ffffa65531497d38] tun_recvmsg at ffffffffc0b06e34 [tun]
+ #16 [ffffa65531497d68] handle_rx at ffffffffc0c5d682 [vhost_net]
+ #17 [ffffa65531497ed0] vhost_worker at ffffffffc0c644dc [vhost]
+ #18 [ffffa65531497f10] kthread at ffffffff892d2e72
+ #19 [ffffa65531497f50] ret_from_fork at ffffffff89c0022f
+
+Fixes: ef3db4a59542 ("tun: avoid BUG, dump packet on GSO errors")
+Signed-off-by: Lei Chen <lei.chen@smartx.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Acked-by: Jason Wang <jasowang@redhat.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Acked-by: Michael S. Tsirkin <mst@redhat.com>
+Link: https://lore.kernel.org/r/20240415020247.2207781-1-lei.chen@smartx.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/tun.c | 18 ++++++++++--------
+ 1 file changed, 10 insertions(+), 8 deletions(-)
+
+diff --git a/drivers/net/tun.c b/drivers/net/tun.c
+index 922d6f16d99d1..4af1ba5d074c0 100644
+--- a/drivers/net/tun.c
++++ b/drivers/net/tun.c
+@@ -2121,14 +2121,16 @@ static ssize_t tun_put_user(struct tun_struct *tun,
+                                           tun_is_little_endian(tun), true,
+                                           vlan_hlen)) {
+                       struct skb_shared_info *sinfo = skb_shinfo(skb);
+-                      pr_err("unexpected GSO type: "
+-                             "0x%x, gso_size %d, hdr_len %d\n",
+-                             sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size),
+-                             tun16_to_cpu(tun, gso.hdr_len));
+-                      print_hex_dump(KERN_ERR, "tun: ",
+-                                     DUMP_PREFIX_NONE,
+-                                     16, 1, skb->head,
+-                                     min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true);
++
++                      if (net_ratelimit()) {
++                              netdev_err(tun->dev, "unexpected GSO type: 0x%x, gso_size %d, hdr_len %d\n",
++                                         sinfo->gso_type, tun16_to_cpu(tun, gso.gso_size),
++                                         tun16_to_cpu(tun, gso.hdr_len));
++                              print_hex_dump(KERN_ERR, "tun: ",
++                                             DUMP_PREFIX_NONE,
++                                             16, 1, skb->head,
++                                             min((int)tun16_to_cpu(tun, gso.hdr_len), 64), true);
++                      }
+                       WARN_ON_ONCE(1);
+                       return -EINVAL;
+               }
+-- 
+2.43.0
+