5.10-stable patches

author Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Tue, 22 Apr 2025 12:32:54 +0000 (14:32 +0200)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Tue, 22 Apr 2025 12:32:54 +0000 (14:32 +0200)
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 22 Apr 2025 12:32:54 +0000 (14:32 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Tue, 22 Apr 2025 12:32:54 +0000 (14:32 +0200)
diff --git a/queue-5.10/blk-cgroup-support-to-track-if-policy-is-online.patch b/queue-5.10/blk-cgroup-support-to-track-if-policy-is-online.patch

new file mode 100644 (file)

index 0000000..b9631ea
--- /dev/null
+++ b/queue-5.10/blk-cgroup-support-to-track-if-policy-is-online.patch
@@ -0,0 +1,120 @@
+From dfd6200a095440b663099d8d42f1efb0175a1ce3 Mon Sep 17 00:00:00 2001
+From: Yu Kuai <yukuai3@huawei.com>
+Date: Thu, 19 Jan 2023 19:03:49 +0800
+Subject: blk-cgroup: support to track if policy is online
+
+From: Yu Kuai <yukuai3@huawei.com>
+
+commit dfd6200a095440b663099d8d42f1efb0175a1ce3 upstream.
+
+A new field 'online' is added to blkg_policy_data to fix the following
+2 problems:
+
+1) In blkcg_activate_policy(), if pd_alloc_fn() with 'GFP_NOWAIT'
+   failed, 'queue_lock' will be dropped and pd_alloc_fn() will try again
+   without 'GFP_NOWAIT'. In the meantime, cgroup removal can race with
+   it, and pd_offline_fn() will be called without pd_init_fn() and
+   pd_online_fn(). This way a null-ptr-dereference can be triggered.
+
+2) In order to synchronize pd_free_fn() from blkg_free_workfn() and
+   blkcg_deactivate_policy(), 'list_del_init(&blkg->q_node)' will be
+   delayed to blkg_free_workfn(), hence pd_offline_fn() can be called
+   first in blkg_destroy(), and then blkcg_deactivate_policy() will
+   call it again; we must prevent that.
+
+The new field 'online' will be set after pd_online_fn() and will be
+cleared after pd_offline_fn(), in the meantime pd_offline_fn() will only
+be called if 'online' is set.
+
+Signed-off-by: Yu Kuai <yukuai3@huawei.com>
+Acked-by: Tejun Heo <tj@kernel.org>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Link: https://lore.kernel.org/r/20230119110350.2287325-3-yukuai1@huaweicloud.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Bin Lan <bin.lan.cn@windriver.com>
+Signed-off-by: He Zhe <zhe.he@windriver.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ block/blk-cgroup.c         |   24 +++++++++++++++++-------
+ include/linux/blk-cgroup.h |    1 +
+ 2 files changed, 18 insertions(+), 7 deletions(-)
+
+--- a/block/blk-cgroup.c
++++ b/block/blk-cgroup.c
+@@ -191,6 +191,7 @@ static struct blkcg_gq *blkg_alloc(struc
+               blkg->pd[i] = pd;
+               pd->blkg = blkg;
+               pd->plid = i;
++              pd->online = false;
+       }
+ 
+       return blkg;
+@@ -288,8 +289,11 @@ static struct blkcg_gq *blkg_create(stru
+               for (i = 0; i < BLKCG_MAX_POLS; i++) {
+                       struct blkcg_policy *pol = blkcg_policy[i];
+ 
+-                      if (blkg->pd[i] && pol->pd_online_fn)
+-                              pol->pd_online_fn(blkg->pd[i]);
++                      if (blkg->pd[i]) {
++                              if (pol->pd_online_fn)
++                                      pol->pd_online_fn(blkg->pd[i]);
++                              blkg->pd[i]->online = true;
++                      }
+               }
+       }
+       blkg->online = true;
+@@ -389,8 +393,11 @@ static void blkg_destroy(struct blkcg_gq
+       for (i = 0; i < BLKCG_MAX_POLS; i++) {
+               struct blkcg_policy *pol = blkcg_policy[i];
+ 
+-              if (blkg->pd[i] && pol->pd_offline_fn)
+-                      pol->pd_offline_fn(blkg->pd[i]);
++              if (blkg->pd[i] && blkg->pd[i]->online) {
++                      if (pol->pd_offline_fn)
++                              pol->pd_offline_fn(blkg->pd[i]);
++                      blkg->pd[i]->online = false;
++              }
+       }
+ 
+       blkg->online = false;
+@@ -1364,6 +1371,7 @@ retry:
+               blkg->pd[pol->plid] = pd;
+               pd->blkg = blkg;
+               pd->plid = pol->plid;
++              pd->online = false;
+       }
+ 
+       /* all allocated, init in the same order */
+@@ -1371,9 +1379,11 @@ retry:
+               list_for_each_entry_reverse(blkg, &q->blkg_list, q_node)
+                       pol->pd_init_fn(blkg->pd[pol->plid]);
+ 
+-      if (pol->pd_online_fn)
+-              list_for_each_entry_reverse(blkg, &q->blkg_list, q_node)
++      list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
++              if (pol->pd_online_fn)
+                       pol->pd_online_fn(blkg->pd[pol->plid]);
++              blkg->pd[pol->plid]->online = true;
++      }
+ 
+       __set_bit(pol->plid, q->blkcg_pols);
+       ret = 0;
+@@ -1435,7 +1445,7 @@ void blkcg_deactivate_policy(struct requ
+ 
+               spin_lock(&blkcg->lock);
+               if (blkg->pd[pol->plid]) {
+-                      if (pol->pd_offline_fn)
++                      if (blkg->pd[pol->plid]->online && pol->pd_offline_fn)
+                               pol->pd_offline_fn(blkg->pd[pol->plid]);
+                       pol->pd_free_fn(blkg->pd[pol->plid]);
+                       blkg->pd[pol->plid] = NULL;
+--- a/include/linux/blk-cgroup.h
++++ b/include/linux/blk-cgroup.h
+@@ -87,6 +87,7 @@ struct blkg_policy_data {
+       /* the blkg and policy id this per-policy data belongs to */
+       struct blkcg_gq                 *blkg;
+       int                             plid;
++      bool                            online;
+ };
+ 
+ /*
diff --git a/queue-5.10/blk-iocost-do-not-warn-if-iocg-was-already-offlined.patch b/queue-5.10/blk-iocost-do-not-warn-if-iocg-was-already-offlined.patch

new file mode 100644 (file)

index 0000000..8c5ea2b
--- /dev/null
+++ b/queue-5.10/blk-iocost-do-not-warn-if-iocg-was-already-offlined.patch
@@ -0,0 +1,59 @@
+From 01bc4fda9ea0a6b52f12326486f07a4910666cf6 Mon Sep 17 00:00:00 2001
+From: Li Nan <linan122@huawei.com>
+Date: Fri, 19 Apr 2024 17:32:57 +0800
+Subject: blk-iocost: do not WARN if iocg was already offlined
+
+From: Li Nan <linan122@huawei.com>
+
+commit 01bc4fda9ea0a6b52f12326486f07a4910666cf6 upstream.
+
+In iocg_pay_debt(), warn is triggered if 'active_list' is empty, which
+is intended to confirm iocg is active when it has debt. However, warn
+can be triggered during a blkcg or disk removal, if iocg_waitq_timer_fn()
+is run at that time:
+
+  WARNING: CPU: 0 PID: 2344971 at block/blk-iocost.c:1402 iocg_pay_debt+0x14c/0x190
+  Call trace:
+  iocg_pay_debt+0x14c/0x190
+  iocg_kick_waitq+0x438/0x4c0
+  iocg_waitq_timer_fn+0xd8/0x130
+  __run_hrtimer+0x144/0x45c
+  __hrtimer_run_queues+0x16c/0x244
+  hrtimer_interrupt+0x2cc/0x7b0
+
+The warn in this situation is meaningless. Since this iocg is being
+removed, the state of the 'active_list' is irrelevant, and 'waitq_timer'
+is canceled after removing 'active_list' in ioc_pd_free(), which ensures
+iocg is freed after iocg_waitq_timer_fn() returns.
+
+Therefore, add the check if iocg was already offlined to avoid warn
+when removing a blkcg or disk.
+
+Signed-off-by: Li Nan <linan122@huawei.com>
+Reviewed-by: Yu Kuai <yukuai3@huawei.com>
+Acked-by: Tejun Heo <tj@kernel.org>
+Link: https://lore.kernel.org/r/20240419093257.3004211-1-linan666@huaweicloud.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Bin Lan <bin.lan.cn@windriver.com>
+Signed-off-by: He Zhe <zhe.he@windriver.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ block/blk-iocost.c |    7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+--- a/block/blk-iocost.c
++++ b/block/blk-iocost.c
+@@ -1389,8 +1389,11 @@ static void iocg_pay_debt(struct ioc_gq
+       lockdep_assert_held(&iocg->ioc->lock);
+       lockdep_assert_held(&iocg->waitq.lock);
+ 
+-      /* make sure that nobody messed with @iocg */
+-      WARN_ON_ONCE(list_empty(&iocg->active_list));
++      /*
++       * make sure that nobody messed with @iocg. Check iocg->pd.online
++       * to avoid warn when removing blkcg or disk.
++       */
++      WARN_ON_ONCE(list_empty(&iocg->active_list) && iocg->pd.online);
+       WARN_ON_ONCE(iocg->inuse > 1);
+ 
+       iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt);
diff --git a/queue-5.10/ext4-fix-timer-use-after-free-on-failed-mount.patch b/queue-5.10/ext4-fix-timer-use-after-free-on-failed-mount.patch

new file mode 100644 (file)

index 0000000..e26ec88
--- /dev/null
+++ b/queue-5.10/ext4-fix-timer-use-after-free-on-failed-mount.patch
@@ -0,0 +1,49 @@
+From 0ce160c5bdb67081a62293028dc85758a8efb22a Mon Sep 17 00:00:00 2001
+From: Xiaxi Shen <shenxiaxi26@gmail.com>
+Date: Sun, 14 Jul 2024 21:33:36 -0700
+Subject: ext4: fix timer use-after-free on failed mount
+
+From: Xiaxi Shen <shenxiaxi26@gmail.com>
+
+commit 0ce160c5bdb67081a62293028dc85758a8efb22a upstream.
+
+Syzbot has found an ODEBUG bug in ext4_fill_super.
+
+The del_timer_sync function cancels the s_err_report timer,
+which reminds about filesystem errors daily. We should
+guarantee the timer is no longer active before kfree(sbi).
+
+When filesystem mounting fails, the flow goes to failed_mount3,
+where an error occurs when ext4_stop_mmpd is called, causing
+a read I/O failure. This triggers the ext4_handle_error function
+that ultimately re-arms the timer,
+leaving the s_err_report timer active before kfree(sbi) is called.
+
+Fix the issue by canceling the s_err_report timer after calling ext4_stop_mmpd.
+
+Signed-off-by: Xiaxi Shen <shenxiaxi26@gmail.com>
+Reported-and-tested-by: syzbot+59e0101c430934bc9a36@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=59e0101c430934bc9a36
+Link: https://patch.msgid.link/20240715043336.98097-1-shenxiaxi26@gmail.com
+Signed-off-by: Theodore Ts'o <tytso@mit.edu>
+Cc: stable@kernel.org
+[Minor context change fixed]
+Signed-off-by: Xiangyu Chen <xiangyu.chen@windriver.com>
+Signed-off-by: He Zhe <zhe.he@windriver.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/ext4/super.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -5185,8 +5185,8 @@ failed_mount_wq:
+ failed_mount3a:
+       ext4_es_unregister_shrinker(sbi);
+ failed_mount3:
+-      del_timer_sync(&sbi->s_err_report);
+       ext4_stop_mmpd(sbi);
++      del_timer_sync(&sbi->s_err_report);
+ failed_mount2:
+       rcu_read_lock();
+       group_desc = rcu_dereference(sbi->s_group_desc);
diff --git a/queue-5.10/ipvs-properly-dereference-pe-in-ip_vs_add_service.patch b/queue-5.10/ipvs-properly-dereference-pe-in-ip_vs_add_service.patch

new file mode 100644 (file)

index 0000000..80884c5
--- /dev/null
+++ b/queue-5.10/ipvs-properly-dereference-pe-in-ip_vs_add_service.patch
@@ -0,0 +1,53 @@
+From cbd070a4ae62f119058973f6d2c984e325bce6e7 Mon Sep 17 00:00:00 2001
+From: Chen Hanxiao <chenhx.fnst@fujitsu.com>
+Date: Thu, 27 Jun 2024 14:15:15 +0800
+Subject: ipvs: properly dereference pe in ip_vs_add_service
+
+From: Chen Hanxiao <chenhx.fnst@fujitsu.com>
+
+commit cbd070a4ae62f119058973f6d2c984e325bce6e7 upstream.
+
+Use pe directly to resolve sparse warning:
+
+  net/netfilter/ipvs/ip_vs_ctl.c:1471:27: warning: dereference of noderef expression
+
+Fixes: 39b972231536 ("ipvs: handle connections started by real-servers")
+Signed-off-by: Chen Hanxiao <chenhx.fnst@fujitsu.com>
+Acked-by: Julian Anastasov <ja@ssi.bg>
+Acked-by: Simon Horman <horms@kernel.org>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Cliff Liu <donghua.liu@windriver.com>
+Signed-off-by: He Zhe <Zhe.He@windriver.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/netfilter/ipvs/ip_vs_ctl.c |   10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+--- a/net/netfilter/ipvs/ip_vs_ctl.c
++++ b/net/netfilter/ipvs/ip_vs_ctl.c
+@@ -1384,20 +1384,20 @@ ip_vs_add_service(struct netns_ipvs *ipv
+               sched = NULL;
+       }
+ 
+-      /* Bind the ct retriever */
+-      RCU_INIT_POINTER(svc->pe, pe);
+-      pe = NULL;
+-
+       /* Update the virtual service counters */
+       if (svc->port == FTPPORT)
+               atomic_inc(&ipvs->ftpsvc_counter);
+       else if (svc->port == 0)
+               atomic_inc(&ipvs->nullsvc_counter);
+-      if (svc->pe && svc->pe->conn_out)
++      if (pe && pe->conn_out)
+               atomic_inc(&ipvs->conn_out_counter);
+ 
+       ip_vs_start_estimator(ipvs, &svc->stats);
+ 
++      /* Bind the ct retriever */
++      RCU_INIT_POINTER(svc->pe, pe);
++      pe = NULL;
++
+       /* Count only IPv4 services for old get/setsockopt interface */
+       if (svc->af == AF_INET)
+               ipvs->num_services++;
diff --git a/queue-5.10/mmc-sdhci-brcmstb-add-ability-to-increase-max-clock-rate-for-72116b0.patch b/queue-5.10/mmc-sdhci-brcmstb-add-ability-to-increase-max-clock-rate-for-72116b0.patch

new file mode 100644 (file)

index 0000000..6a01f81
--- /dev/null
+++ b/queue-5.10/mmc-sdhci-brcmstb-add-ability-to-increase-max-clock-rate-for-72116b0.patch
@@ -0,0 +1,142 @@
+From 97904a59855c7ac7c613085bc6bdc550d48524ff Mon Sep 17 00:00:00 2001
+From: Kamal Dasu <kdasu.kdev@gmail.com>
+Date: Fri, 20 May 2022 14:31:08 -0400
+Subject: mmc: sdhci-brcmstb: Add ability to increase max clock rate for 72116b0
+
+From: Kamal Dasu <kdasu.kdev@gmail.com>
+
+commit 97904a59855c7ac7c613085bc6bdc550d48524ff upstream.
+
+The 72116B0 has improved SDIO controllers that allow the max clock
+rate to be increased from a max of 100MHz to a max of 150MHz. The
+driver will need to get the clock and increase its default rate
+and override the caps register, that still indicates a max of 100MHz.
+The new clock will be named "sdio_freq" in the DT node's "clock-names"
+list. The driver will use a DT property, "clock-frequency", to
+enable this functionality and will get the actual rate in MHz
+from the property to allow various speeds to be requested.
+
+Signed-off-by: Al Cooper <alcooperx@gmail.com>
+Signed-off-by: Kamal Dasu <kdasu.kdev@gmail.com>
+Acked-by: Florian Fainelli <f.fainelli@gmail.com>
+Link: https://lore.kernel.org/r/20220520183108.47358-3-kdasu.kdev@gmail.com
+Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
+Signed-off-by: Kamal Dasu <kamal.dasu@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/mmc/host/sdhci-brcmstb.c |   69 ++++++++++++++++++++++++++++++++++++++-
+ 1 file changed, 68 insertions(+), 1 deletion(-)
+
+--- a/drivers/mmc/host/sdhci-brcmstb.c
++++ b/drivers/mmc/host/sdhci-brcmstb.c
+@@ -32,6 +32,8 @@
+ struct sdhci_brcmstb_priv {
+       void __iomem *cfg_regs;
+       unsigned int flags;
++      struct clk *base_clk;
++      u32 base_freq_hz;
+ };
+ 
+ struct brcmstb_match_priv {
+@@ -251,9 +253,11 @@ static int sdhci_brcmstb_probe(struct pl
+       struct sdhci_pltfm_host *pltfm_host;
+       const struct of_device_id *match;
+       struct sdhci_brcmstb_priv *priv;
++      u32 actual_clock_mhz;
+       struct sdhci_host *host;
+       struct resource *iomem;
+       struct clk *clk;
++      struct clk *base_clk;
+       int res;
+ 
+       match = of_match_node(sdhci_brcm_of_match, pdev->dev.of_node);
+@@ -331,6 +335,35 @@ static int sdhci_brcmstb_probe(struct pl
+       if (match_priv->flags & BRCMSTB_MATCH_FLAGS_BROKEN_TIMEOUT)
+               host->quirks |= SDHCI_QUIRK_BROKEN_TIMEOUT_VAL;
+ 
++      /* Change the base clock frequency if the DT property exists */
++      if (device_property_read_u32(&pdev->dev, "clock-frequency",
++                                   &priv->base_freq_hz) != 0)
++              goto add_host;
++
++      base_clk = devm_clk_get_optional(&pdev->dev, "sdio_freq");
++      if (IS_ERR(base_clk)) {
++              dev_warn(&pdev->dev, "Clock for \"sdio_freq\" not found\n");
++              goto add_host;
++      }
++
++      res = clk_prepare_enable(base_clk);
++      if (res)
++              goto err;
++
++      /* set improved clock rate */
++      clk_set_rate(base_clk, priv->base_freq_hz);
++      actual_clock_mhz = clk_get_rate(base_clk) / 1000000;
++
++      host->caps &= ~SDHCI_CLOCK_V3_BASE_MASK;
++      host->caps |= (actual_clock_mhz << SDHCI_CLOCK_BASE_SHIFT);
++      /* Disable presets because they are now incorrect */
++      host->quirks2 |= SDHCI_QUIRK2_PRESET_VALUE_BROKEN;
++
++      dev_dbg(&pdev->dev, "Base Clock Frequency changed to %dMHz\n",
++              actual_clock_mhz);
++      priv->base_clk = base_clk;
++
++add_host:
+       res = sdhci_brcmstb_add_host(host, priv);
+       if (res)
+               goto err;
+@@ -341,6 +374,7 @@ static int sdhci_brcmstb_probe(struct pl
+ err:
+       sdhci_pltfm_free(pdev);
+ err_clk:
++      clk_disable_unprepare(base_clk);
+       clk_disable_unprepare(clk);
+       return res;
+ }
+@@ -352,11 +386,44 @@ static void sdhci_brcmstb_shutdown(struc
+ 
+ MODULE_DEVICE_TABLE(of, sdhci_brcm_of_match);
+ 
++#ifdef CONFIG_PM_SLEEP
++static int sdhci_brcmstb_suspend(struct device *dev)
++{
++      struct sdhci_host *host = dev_get_drvdata(dev);
++      struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
++      struct sdhci_brcmstb_priv *priv = sdhci_pltfm_priv(pltfm_host);
++
++      clk_disable_unprepare(priv->base_clk);
++      return sdhci_pltfm_suspend(dev);
++}
++
++static int sdhci_brcmstb_resume(struct device *dev)
++{
++      struct sdhci_host *host = dev_get_drvdata(dev);
++      struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
++      struct sdhci_brcmstb_priv *priv = sdhci_pltfm_priv(pltfm_host);
++      int ret;
++
++      ret = sdhci_pltfm_resume(dev);
++      if (!ret && priv->base_freq_hz) {
++              ret = clk_prepare_enable(priv->base_clk);
++              if (!ret)
++                      ret = clk_set_rate(priv->base_clk, priv->base_freq_hz);
++      }
++
++      return ret;
++}
++#endif
++
++static const struct dev_pm_ops sdhci_brcmstb_pmops = {
++      SET_SYSTEM_SLEEP_PM_OPS(sdhci_brcmstb_suspend, sdhci_brcmstb_resume)
++};
++
+ static struct platform_driver sdhci_brcmstb_driver = {
+       .driver         = {
+               .name   = "sdhci-brcmstb",
+               .probe_type = PROBE_PREFER_ASYNCHRONOUS,
+-              .pm     = &sdhci_pltfm_pmops,
++              .pm     = &sdhci_brcmstb_pmops,
+               .of_match_table = of_match_ptr(sdhci_brcm_of_match),
+       },
+       .probe          = sdhci_brcmstb_probe,
diff --git a/queue-5.10/mmc-sdhci-brcmstb-initialize-base_clk-to-null-in-sdhci_brcmstb_probe.patch b/queue-5.10/mmc-sdhci-brcmstb-initialize-base_clk-to-null-in-sdhci_brcmstb_probe.patch

new file mode 100644 (file)

index 0000000..c573330
--- /dev/null
+++ b/queue-5.10/mmc-sdhci-brcmstb-initialize-base_clk-to-null-in-sdhci_brcmstb_probe.patch
@@ -0,0 +1,46 @@
+From c3c0ed75ffbff5c70667030b5139bbb75b0a30f5 Mon Sep 17 00:00:00 2001
+From: Nathan Chancellor <nathan@kernel.org>
+Date: Wed, 8 Jun 2022 08:27:57 -0700
+Subject: mmc: sdhci-brcmstb: Initialize base_clk to NULL in sdhci_brcmstb_probe()
+
+From: Nathan Chancellor <nathan@kernel.org>
+
+commit c3c0ed75ffbff5c70667030b5139bbb75b0a30f5 upstream.
+
+Clang warns a few times along the lines of:
+
+  drivers/mmc/host/sdhci-brcmstb.c:302:6: warning: variable 'base_clk' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized]
+          if (res)
+              ^~~
+  drivers/mmc/host/sdhci-brcmstb.c:376:24: note: uninitialized use occurs here
+          clk_disable_unprepare(base_clk);
+                                ^~~~~~~~
+
+base_clk is used in the error path before it is initialized. Initialize
+it to NULL, as clk_disable_unprepare() calls clk_disable() and
+clk_unprepare(), which both handle NULL pointers gracefully.
+
+Link: https://github.com/ClangBuiltLinux/linux/issues/1650
+Reported-by: kernel test robot <lkp@intel.com>
+Signed-off-by: Nathan Chancellor <nathan@kernel.org>
+Acked-by: Florian Fainelli <f.fainelli@gmail.com>
+Acked-by: Adrian Hunter <adrian.hunter@intel.com>
+Link: https://lore.kernel.org/r/20220608152757.82529-1-nathan@kernel.org
+Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
+Signed-off-by: Kamal Dasu <kamal.dasu@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/mmc/host/sdhci-brcmstb.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/mmc/host/sdhci-brcmstb.c
++++ b/drivers/mmc/host/sdhci-brcmstb.c
+@@ -257,7 +257,7 @@ static int sdhci_brcmstb_probe(struct pl
+       struct sdhci_host *host;
+       struct resource *iomem;
+       struct clk *clk;
+-      struct clk *base_clk;
++      struct clk *base_clk = NULL;
+       int res;
+ 
+       match = of_match_node(sdhci_brcm_of_match, pdev->dev.of_node);
diff --git a/queue-5.10/mmc-sdhci-brcmstb-use-clk_get_rate-base_clk-in-pm-resume.patch b/queue-5.10/mmc-sdhci-brcmstb-use-clk_get_rate-base_clk-in-pm-resume.patch

new file mode 100644 (file)

index 0000000..6df89a0
--- /dev/null
+++ b/queue-5.10/mmc-sdhci-brcmstb-use-clk_get_rate-base_clk-in-pm-resume.patch
@@ -0,0 +1,43 @@
+From 886201c70a1cab34ef96f867c2b2dd6379ffa7b9 Mon Sep 17 00:00:00 2001
+From: Kamal Dasu <kdasu.kdev@gmail.com>
+Date: Thu, 14 Jul 2022 13:41:32 -0400
+Subject: mmc: sdhci-brcmstb: use clk_get_rate(base_clk) in PM resume
+
+From: Kamal Dasu <kdasu.kdev@gmail.com>
+
+commit 886201c70a1cab34ef96f867c2b2dd6379ffa7b9 upstream.
+
+Use clk_get_rate for base_clk on resume before setting new rate.
+This change ensures that the clock api returns current rate
+and sets the clock to the desired rate and honors the
+CLK_GET_RATE_NOCACHE attribute used by the clock api.
+
+Fixes: 97904a59855c (mmc: sdhci-brcmstb: Add ability to increase max clock rate for 72116b0)
+Signed-off-by: Kamal Dasu <kdasu.kdev@gmail.com>
+Acked-by: Florian Fainelli <f.fainelli@gmail.com>
+Link: https://lore.kernel.org/r/20220714174132.18541-1-kdasu.kdev@gmail.com
+Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
+Signed-off-by: Kamal Dasu <kamal.dasu@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/mmc/host/sdhci-brcmstb.c |    9 ++++++++-
+ 1 file changed, 8 insertions(+), 1 deletion(-)
+
+--- a/drivers/mmc/host/sdhci-brcmstb.c
++++ b/drivers/mmc/host/sdhci-brcmstb.c
+@@ -407,7 +407,14 @@ static int sdhci_brcmstb_resume(struct d
+       ret = sdhci_pltfm_resume(dev);
+       if (!ret && priv->base_freq_hz) {
+               ret = clk_prepare_enable(priv->base_clk);
+-              if (!ret)
++              /*
++               * Note: using clk_get_rate() below as clk_get_rate()
++               * honors CLK_GET_RATE_NOCACHE attribute, but clk_set_rate()
++               * may do implicit get_rate() calls that do not honor
++               * CLK_GET_RATE_NOCACHE.
++               */
++              if (!ret &&
++                  (clk_get_rate(priv->base_clk) != priv->base_freq_hz))
+                       ret = clk_set_rate(priv->base_clk, priv->base_freq_hz);
+       }
+ 
diff --git a/queue-5.10/net-mlx5e-fix-use-after-free-of-encap-entry-in-neigh-update-handler.patch b/queue-5.10/net-mlx5e-fix-use-after-free-of-encap-entry-in-neigh-update-handler.patch

new file mode 100644 (file)

index 0000000..8187f04
--- /dev/null
+++ b/queue-5.10/net-mlx5e-fix-use-after-free-of-encap-entry-in-neigh-update-handler.patch
@@ -0,0 +1,278 @@
+From fb1a3132ee1ac968316e45d21a48703a6db0b6c3 Mon Sep 17 00:00:00 2001
+From: Vlad Buslov <vladbu@nvidia.com>
+Date: Mon, 31 May 2021 16:28:39 +0300
+Subject: net/mlx5e: Fix use-after-free of encap entry in neigh update handler
+
+From: Vlad Buslov <vladbu@nvidia.com>
+
+commit fb1a3132ee1ac968316e45d21a48703a6db0b6c3 upstream.
+
+Function mlx5e_rep_neigh_update() wasn't updated to accommodate rtnl lock
+removal from TC filter update path and properly handle concurrent encap
+entry insertion/deletion which can lead to the following use-after-free:
+
+ [23827.464923] ==================================================================
+ [23827.469446] BUG: KASAN: use-after-free in mlx5e_encap_take+0x72/0x140 [mlx5_core]
+ [23827.470971] Read of size 4 at addr ffff8881d132228c by task kworker/u20:6/21635
+ [23827.472251]
+ [23827.472615] CPU: 9 PID: 21635 Comm: kworker/u20:6 Not tainted 5.13.0-rc3+ #5
+ [23827.473788] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
+ [23827.475639] Workqueue: mlx5e mlx5e_rep_neigh_update [mlx5_core]
+ [23827.476731] Call Trace:
+ [23827.477260]  dump_stack+0xbb/0x107
+ [23827.477906]  print_address_description.constprop.0+0x18/0x140
+ [23827.478896]  ? mlx5e_encap_take+0x72/0x140 [mlx5_core]
+ [23827.479879]  ? mlx5e_encap_take+0x72/0x140 [mlx5_core]
+ [23827.480905]  kasan_report.cold+0x7c/0xd8
+ [23827.481701]  ? mlx5e_encap_take+0x72/0x140 [mlx5_core]
+ [23827.482744]  kasan_check_range+0x145/0x1a0
+ [23827.493112]  mlx5e_encap_take+0x72/0x140 [mlx5_core]
+ [23827.494054]  ? mlx5e_tc_tun_encap_info_equal_generic+0x140/0x140 [mlx5_core]
+ [23827.495296]  mlx5e_rep_neigh_update+0x41e/0x5e0 [mlx5_core]
+ [23827.496338]  ? mlx5e_rep_neigh_entry_release+0xb80/0xb80 [mlx5_core]
+ [23827.497486]  ? read_word_at_a_time+0xe/0x20
+ [23827.498250]  ? strscpy+0xa0/0x2a0
+ [23827.498889]  process_one_work+0x8ac/0x14e0
+ [23827.499638]  ? lockdep_hardirqs_on_prepare+0x400/0x400
+ [23827.500537]  ? pwq_dec_nr_in_flight+0x2c0/0x2c0
+ [23827.501359]  ? rwlock_bug.part.0+0x90/0x90
+ [23827.502116]  worker_thread+0x53b/0x1220
+ [23827.502831]  ? process_one_work+0x14e0/0x14e0
+ [23827.503627]  kthread+0x328/0x3f0
+ [23827.504254]  ? _raw_spin_unlock_irq+0x24/0x40
+ [23827.505065]  ? __kthread_bind_mask+0x90/0x90
+ [23827.505912]  ret_from_fork+0x1f/0x30
+ [23827.506621]
+ [23827.506987] Allocated by task 28248:
+ [23827.507694]  kasan_save_stack+0x1b/0x40
+ [23827.508476]  __kasan_kmalloc+0x7c/0x90
+ [23827.509197]  mlx5e_attach_encap+0xde1/0x1d40 [mlx5_core]
+ [23827.510194]  mlx5e_tc_add_fdb_flow+0x397/0xc40 [mlx5_core]
+ [23827.511218]  __mlx5e_add_fdb_flow+0x519/0xb30 [mlx5_core]
+ [23827.512234]  mlx5e_configure_flower+0x191c/0x4870 [mlx5_core]
+ [23827.513298]  tc_setup_cb_add+0x1d5/0x420
+ [23827.514023]  fl_hw_replace_filter+0x382/0x6a0 [cls_flower]
+ [23827.514975]  fl_change+0x2ceb/0x4a51 [cls_flower]
+ [23827.515821]  tc_new_tfilter+0x89a/0x2070
+ [23827.516548]  rtnetlink_rcv_msg+0x644/0x8c0
+ [23827.517300]  netlink_rcv_skb+0x11d/0x340
+ [23827.518021]  netlink_unicast+0x42b/0x700
+ [23827.518742]  netlink_sendmsg+0x743/0xc20
+ [23827.519467]  sock_sendmsg+0xb2/0xe0
+ [23827.520131]  ____sys_sendmsg+0x590/0x770
+ [23827.520851]  ___sys_sendmsg+0xd8/0x160
+ [23827.521552]  __sys_sendmsg+0xb7/0x140
+ [23827.522238]  do_syscall_64+0x3a/0x70
+ [23827.522907]  entry_SYSCALL_64_after_hwframe+0x44/0xae
+ [23827.523797]
+ [23827.524163] Freed by task 25948:
+ [23827.524780]  kasan_save_stack+0x1b/0x40
+ [23827.525488]  kasan_set_track+0x1c/0x30
+ [23827.526187]  kasan_set_free_info+0x20/0x30
+ [23827.526968]  __kasan_slab_free+0xed/0x130
+ [23827.527709]  slab_free_freelist_hook+0xcf/0x1d0
+ [23827.528528]  kmem_cache_free_bulk+0x33a/0x6e0
+ [23827.529317]  kfree_rcu_work+0x55f/0xb70
+ [23827.530024]  process_one_work+0x8ac/0x14e0
+ [23827.530770]  worker_thread+0x53b/0x1220
+ [23827.531480]  kthread+0x328/0x3f0
+ [23827.532114]  ret_from_fork+0x1f/0x30
+ [23827.532785]
+ [23827.533147] Last potentially related work creation:
+ [23827.534007]  kasan_save_stack+0x1b/0x40
+ [23827.534710]  kasan_record_aux_stack+0xab/0xc0
+ [23827.535492]  kvfree_call_rcu+0x31/0x7b0
+ [23827.536206]  mlx5e_tc_del_fdb_flow+0x577/0xef0 [mlx5_core]
+ [23827.537305]  mlx5e_flow_put+0x49/0x80 [mlx5_core]
+ [23827.538290]  mlx5e_delete_flower+0x6d1/0xe60 [mlx5_core]
+ [23827.539300]  tc_setup_cb_destroy+0x18e/0x2f0
+ [23827.540144]  fl_hw_destroy_filter+0x1d2/0x310 [cls_flower]
+ [23827.541148]  __fl_delete+0x4dc/0x660 [cls_flower]
+ [23827.541985]  fl_delete+0x97/0x160 [cls_flower]
+ [23827.542782]  tc_del_tfilter+0x7ab/0x13d0
+ [23827.543503]  rtnetlink_rcv_msg+0x644/0x8c0
+ [23827.544257]  netlink_rcv_skb+0x11d/0x340
+ [23827.544981]  netlink_unicast+0x42b/0x700
+ [23827.545700]  netlink_sendmsg+0x743/0xc20
+ [23827.546424]  sock_sendmsg+0xb2/0xe0
+ [23827.547084]  ____sys_sendmsg+0x590/0x770
+ [23827.547850]  ___sys_sendmsg+0xd8/0x160
+ [23827.548606]  __sys_sendmsg+0xb7/0x140
+ [23827.549303]  do_syscall_64+0x3a/0x70
+ [23827.549969]  entry_SYSCALL_64_after_hwframe+0x44/0xae
+ [23827.550853]
+ [23827.551217] The buggy address belongs to the object at ffff8881d1322200
+ [23827.551217]  which belongs to the cache kmalloc-256 of size 256
+ [23827.553341] The buggy address is located 140 bytes inside of
+ [23827.553341]  256-byte region [ffff8881d1322200, ffff8881d1322300)
+ [23827.555747] The buggy address belongs to the page:
+ [23827.556847] page:00000000898762aa refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1d1320
+ [23827.558651] head:00000000898762aa order:2 compound_mapcount:0 compound_pincount:0
+ [23827.559961] flags: 0x2ffff800010200(slab|head|node=0|zone=2|lastcpupid=0x1ffff)
+ [23827.561243] raw: 002ffff800010200 dead000000000100 dead000000000122 ffff888100042b40
+ [23827.562653] raw: 0000000000000000 0000000000200020 00000001ffffffff 0000000000000000
+ [23827.564112] page dumped because: kasan: bad access detected
+ [23827.565439]
+ [23827.565932] Memory state around the buggy address:
+ [23827.566917]  ffff8881d1322180: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ [23827.568485]  ffff8881d1322200: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+ [23827.569818] >ffff8881d1322280: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+ [23827.571143]                       ^
+ [23827.571879]  ffff8881d1322300: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ [23827.573283]  ffff8881d1322380: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ [23827.574654] ==================================================================
+
+Most of the necessary logic is already correctly implemented by
+mlx5e_get_next_valid_encap() helper that is used in neigh stats update
+handler. Make the handler generic by renaming it to
+mlx5e_get_next_matching_encap() and use callback to test whether flow is
+matching instead of hardcoded check for 'valid' flag value. Implement
+mlx5e_get_next_valid_encap() by calling mlx5e_get_next_matching_encap()
+with callback that tests encap MLX5_ENCAP_ENTRY_VALID flag. Implement new
+mlx5e_get_next_init_encap() helper by calling
+mlx5e_get_next_matching_encap() with callback that tests encap completion
+result to be non-error and use it in mlx5e_rep_neigh_update() to safely
+iterate over nhe->encap_list.
+
+Remove encap completion logic from mlx5e_rep_update_flows() since the encap
+entries passed to this function are already guaranteed to be properly
+initialized by similar code in mlx5e_get_next_init_encap().
+
+Fixes: 2a1f1768fa17 ("net/mlx5e: Refactor neigh update for concurrent execution")
+Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
+Reviewed-by: Roi Dayan <roid@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+[ Kernel 5.10 doesn't have commit 0d9f96471493
+  ("net/mlx5e: Extract tc tunnel encap/decap code to dedicated file"),
+  which moved encap/decap from en_tc.c to tc_tun_encap.c, so backport and
+  move the additional functions to en_tc.c instead of tc_tun_encap.c ]
+Signed-off-by: Xiangyu Chen <xiangyu.chen@windriver.com>
+Signed-off-by: He Zhe <zhe.he@windriver.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.c |   17 +++-----
+ drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c    |    6 ---
+ drivers/net/ethernet/mellanox/mlx5/core/en_tc.c        |   33 +++++++++++++++--
+ drivers/net/ethernet/mellanox/mlx5/core/en_tc.h        |    3 +
+ 4 files changed, 41 insertions(+), 18 deletions(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/neigh.c
+@@ -129,9 +129,8 @@ static void mlx5e_rep_neigh_update(struc
+                                                            work);
+       struct mlx5e_neigh_hash_entry *nhe = update_work->nhe;
+       struct neighbour *n = update_work->n;
+-      struct mlx5e_encap_entry *e;
++      struct mlx5e_encap_entry *e = NULL;
+       unsigned char ha[ETH_ALEN];
+-      struct mlx5e_priv *priv;
+       bool neigh_connected;
+       u8 nud_state, dead;
+ 
+@@ -152,14 +151,12 @@ static void mlx5e_rep_neigh_update(struc
+ 
+       trace_mlx5e_rep_neigh_update(nhe, ha, neigh_connected);
+ 
+-      list_for_each_entry(e, &nhe->encap_list, encap_list) {
+-              if (!mlx5e_encap_take(e))
+-                      continue;
+-
+-              priv = netdev_priv(e->out_dev);
+-              mlx5e_rep_update_flows(priv, e, neigh_connected, ha);
+-              mlx5e_encap_put(priv, e);
+-      }
++      /* mlx5e_get_next_init_encap() releases previous encap before returning
++       * the next one.
++       */
++      while ((e = mlx5e_get_next_init_encap(nhe, e)) != NULL)
++              mlx5e_rep_update_flows(netdev_priv(e->out_dev), e, neigh_connected, ha);
++
+       rtnl_unlock();
+       mlx5e_release_neigh_update_work(update_work);
+ }
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c
+@@ -91,13 +91,9 @@ void mlx5e_rep_update_flows(struct mlx5e
+ 
+       ASSERT_RTNL();
+ 
+-      /* wait for encap to be fully initialized */
+-      wait_for_completion(&e->res_ready);
+-
+       mutex_lock(&esw->offloads.encap_tbl_lock);
+       encap_connected = !!(e->flags & MLX5_ENCAP_ENTRY_VALID);
+-      if (e->compl_result < 0 || (encap_connected == neigh_connected &&
+-                                  ether_addr_equal(e->h_dest, ha)))
++      if (encap_connected == neigh_connected && ether_addr_equal(e->h_dest, ha))
+               goto unlock;
+ 
+       mlx5e_take_all_encap_flows(e, &flow_list);
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+@@ -1653,9 +1653,12 @@ void mlx5e_put_encap_flow_list(struct ml
+               mlx5e_flow_put(priv, flow);
+ }
+ 
++typedef bool (match_cb)(struct mlx5e_encap_entry *);
++
+ static struct mlx5e_encap_entry *
+-mlx5e_get_next_valid_encap(struct mlx5e_neigh_hash_entry *nhe,
+-                         struct mlx5e_encap_entry *e)
++mlx5e_get_next_matching_encap(struct mlx5e_neigh_hash_entry *nhe,
++                         struct mlx5e_encap_entry *e,
++                         match_cb match)
+ {
+       struct mlx5e_encap_entry *next = NULL;
+ 
+@@ -1690,7 +1693,7 @@ retry:
+       /* wait for encap to be fully initialized */
+       wait_for_completion(&next->res_ready);
+       /* continue searching if encap entry is not in valid state after completion */
+-      if (!(next->flags & MLX5_ENCAP_ENTRY_VALID)) {
++      if (!match(next)) {
+               e = next;
+               goto retry;
+       }
+@@ -1698,6 +1701,30 @@ retry:
+       return next;
+ }
+ 
++static bool mlx5e_encap_valid(struct mlx5e_encap_entry *e)
++{
++      return e->flags & MLX5_ENCAP_ENTRY_VALID;
++}
++
++static struct mlx5e_encap_entry *
++mlx5e_get_next_valid_encap(struct mlx5e_neigh_hash_entry *nhe,
++                         struct mlx5e_encap_entry *e)
++{
++      return mlx5e_get_next_matching_encap(nhe, e, mlx5e_encap_valid);
++}
++
++static bool mlx5e_encap_initialized(struct mlx5e_encap_entry *e)
++{
++      return e->compl_result >= 0;
++}
++
++struct mlx5e_encap_entry *
++mlx5e_get_next_init_encap(struct mlx5e_neigh_hash_entry *nhe,
++                        struct mlx5e_encap_entry *e)
++{
++      return mlx5e_get_next_matching_encap(nhe, e, mlx5e_encap_initialized);
++}
++
+ void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe)
+ {
+       struct mlx5e_neigh *m_neigh = &nhe->m_neigh;
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
+@@ -161,6 +161,9 @@ void mlx5e_take_all_encap_flows(struct m
+ void mlx5e_put_encap_flow_list(struct mlx5e_priv *priv, struct list_head *flow_list);
+ 
+ struct mlx5e_neigh_hash_entry;
++struct mlx5e_encap_entry *
++mlx5e_get_next_init_encap(struct mlx5e_neigh_hash_entry *nhe,
++                        struct mlx5e_encap_entry *e);
+ void mlx5e_tc_update_neigh_used_value(struct mlx5e_neigh_hash_entry *nhe);
+ 
+ void mlx5e_tc_reoffload_flows_work(struct work_struct *work);
diff --git a/queue-5.10/net-openvswitch-fix-race-on-port-output.patch b/queue-5.10/net-openvswitch-fix-race-on-port-output.patch

new file mode 100644 (file)

index 0000000..dad11d5
--- /dev/null
+++ b/queue-5.10/net-openvswitch-fix-race-on-port-output.patch
@@ -0,0 +1,235 @@
+From 066b86787fa3d97b7aefb5ac0a99a22dad2d15f8 Mon Sep 17 00:00:00 2001
+From: Felix Huettner <felix.huettner@mail.schwarz>
+Date: Wed, 5 Apr 2023 07:53:41 +0000
+Subject: net: openvswitch: fix race on port output
+
+From: Felix Huettner <felix.huettner@mail.schwarz>
+
+commit 066b86787fa3d97b7aefb5ac0a99a22dad2d15f8 upstream.
+
+assume the following setup on a single machine:
+1. An openvswitch instance with one bridge and default flows
+2. two network namespaces "server" and "client"
+3. two ovs interfaces "server" and "client" on the bridge
+4. for each ovs interface a veth pair with a matching name and 32 rx and
+   tx queues
+5. move the ends of the veth pairs to the respective network namespaces
+6. assign ip addresses to each of the veth ends in the namespaces (needs
+   to be the same subnet)
+7. start some http server on the server network namespace
+8. test if a client in the client namespace can reach the http server
+
+when following the actions below the host has a chance of getting a cpu
+stuck in an infinite loop:
+1. send a large amount of parallel requests to the http server (around
+   3000 curls should work)
+2. in parallel delete the network namespace (do not delete interfaces or
+   stop the server, just kill the namespace)
+
+there is a low chance that this will cause the below kernel cpu stuck
+message. If this does not happen just retry.
+Below there is also the output of bpftrace for the functions mentioned
+in the output.
+
+The series of events happening here is:
+1. the network namespace is deleted calling
+   `unregister_netdevice_many_notify` somewhere in the process
+2. this sets first `NETREG_UNREGISTERING` on both ends of the veth and
+   then runs `synchronize_net`
+3. it then calls `call_netdevice_notifiers` with `NETDEV_UNREGISTER`
+4. this is then handled by `dp_device_event` which calls
+   `ovs_netdev_detach_dev` (if a vport is found, which is the case for
+   the veth interface attached to ovs)
+5. this removes the rx_handlers of the device but does not prevent
+   packets from being sent to the device
+6. `dp_device_event` then queues the vport deletion to work in
+   background as a ovs_lock is needed that we do not hold in the
+   unregistration path
+7. `unregister_netdevice_many_notify` continues to call
+   `netdev_unregister_kobject` which sets `real_num_tx_queues` to 0
+8. port deletion continues (but details are not relevant for this issue)
+9. at some future point the background task deletes the vport
+
+If after 7. but before 9. a packet is sent to the ovs vport (which is
+not deleted at this point in time), the vport forwards it to the
+`dev_queue_xmit` flow even though the device is unregistering.
+In `skb_tx_hash` (which is called in the `dev_queue_xmit` path) there is
+a while loop (if the packet has a rx_queue recorded) that is infinite if
+`dev->real_num_tx_queues` is zero.
+
+To prevent this from happening we update `do_output` to handle devices
+without carrier the same as if the device is not found (which would
+be the code path after 9. is done).
+
+Additionally we now produce a warning in `skb_tx_hash` if we will hit
+the infinite loop.
+
+bpftrace (first word is function name):
+
+__dev_queue_xmit server: real_num_tx_queues: 1, cpu: 2, pid: 28024, tid: 28024, skb_addr: 0xffff9edb6f207000, reg_state: 1
+netdev_core_pick_tx server: addr: 0xffff9f0a46d4a000 real_num_tx_queues: 1, cpu: 2, pid: 28024, tid: 28024, skb_addr: 0xffff9edb6f207000, reg_state: 1
+dp_device_event server: real_num_tx_queues: 1 cpu 9, pid: 21024, tid: 21024, event 2, reg_state: 1
+synchronize_rcu_expedited: cpu 9, pid: 21024, tid: 21024
+synchronize_rcu_expedited: cpu 9, pid: 21024, tid: 21024
+synchronize_rcu_expedited: cpu 9, pid: 21024, tid: 21024
+synchronize_rcu_expedited: cpu 9, pid: 21024, tid: 21024
+dp_device_event server: real_num_tx_queues: 1 cpu 9, pid: 21024, tid: 21024, event 6, reg_state: 2
+ovs_netdev_detach_dev server: real_num_tx_queues: 1 cpu 9, pid: 21024, tid: 21024, reg_state: 2
+netdev_rx_handler_unregister server: real_num_tx_queues: 1, cpu: 9, pid: 21024, tid: 21024, reg_state: 2
+synchronize_rcu_expedited: cpu 9, pid: 21024, tid: 21024
+netdev_rx_handler_unregister ret server: real_num_tx_queues: 1, cpu: 9, pid: 21024, tid: 21024, reg_state: 2
+dp_device_event server: real_num_tx_queues: 1 cpu 9, pid: 21024, tid: 21024, event 27, reg_state: 2
+dp_device_event server: real_num_tx_queues: 1 cpu 9, pid: 21024, tid: 21024, event 22, reg_state: 2
+dp_device_event server: real_num_tx_queues: 1 cpu 9, pid: 21024, tid: 21024, event 18, reg_state: 2
+netdev_unregister_kobject: real_num_tx_queues: 1, cpu: 9, pid: 21024, tid: 21024
+synchronize_rcu_expedited: cpu 9, pid: 21024, tid: 21024
+ovs_vport_send server: real_num_tx_queues: 0, cpu: 2, pid: 28024, tid: 28024, skb_addr: 0xffff9edb6f207000, reg_state: 2
+__dev_queue_xmit server: real_num_tx_queues: 0, cpu: 2, pid: 28024, tid: 28024, skb_addr: 0xffff9edb6f207000, reg_state: 2
+netdev_core_pick_tx server: addr: 0xffff9f0a46d4a000 real_num_tx_queues: 0, cpu: 2, pid: 28024, tid: 28024, skb_addr: 0xffff9edb6f207000, reg_state: 2
+broken device server: real_num_tx_queues: 0, cpu: 2, pid: 28024, tid: 28024
+ovs_dp_detach_port server: real_num_tx_queues: 0 cpu 9, pid: 9124, tid: 9124, reg_state: 2
+synchronize_rcu_expedited: cpu 9, pid: 33604, tid: 33604
+
+stuck message:
+
+watchdog: BUG: soft lockup - CPU#5 stuck for 26s! [curl:1929279]
+Modules linked in: veth pktgen bridge stp llc ip_set_hash_net nft_counter xt_set nft_compat nf_tables ip_set_hash_ip ip_set nfnetlink_cttimeout nfnetlink openvswitch nsh nf_conncount nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 tls binfmt_misc nls_iso8859_1 input_leds joydev serio_raw dm_multipath scsi_dh_rdac scsi_dh_emc scsi_dh_alua sch_fq_codel drm efi_pstore virtio_rng ip_tables x_tables autofs4 btrfs blake2b_generic zstd_compress raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c raid1 raid0 multipath linear hid_generic usbhid hid crct10dif_pclmul crc32_pclmul ghash_clmulni_intel aesni_intel virtio_net ahci net_failover crypto_simd cryptd psmouse libahci virtio_blk failover
+CPU: 5 PID: 1929279 Comm: curl Not tainted 5.15.0-67-generic #74-Ubuntu
+Hardware name: OpenStack Foundation OpenStack Nova, BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
+RIP: 0010:netdev_pick_tx+0xf1/0x320
+Code: 00 00 8d 48 ff 0f b7 c1 66 39 ca 0f 86 e9 01 00 00 45 0f b7 ff 41 39 c7 0f 87 5b 01 00 00 44 29 f8 41 39 c7 0f 87 4f 01 00 00 <eb> f2 0f 1f 44 00 00 49 8b 94 24 28 04 00 00 48 85 d2 0f 84 53 01
+RSP: 0018:ffffb78b40298820 EFLAGS: 00000246
+RAX: 0000000000000000 RBX: ffff9c8773adc2e0 RCX: 000000000000083f
+RDX: 0000000000000000 RSI: ffff9c8773adc2e0 RDI: ffff9c870a25e000
+RBP: ffffb78b40298858 R08: 0000000000000001 R09: 0000000000000000
+R10: 0000000000000000 R11: 0000000000000000 R12: ffff9c870a25e000
+R13: ffff9c870a25e000 R14: ffff9c87fe043480 R15: 0000000000000000
+FS:  00007f7b80008f00(0000) GS:ffff9c8e5f740000(0000) knlGS:0000000000000000
+CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 00007f7b80f6a0b0 CR3: 0000000329d66000 CR4: 0000000000350ee0
+Call Trace:
+ <IRQ>
+ netdev_core_pick_tx+0xa4/0xb0
+ __dev_queue_xmit+0xf8/0x510
+ ? __bpf_prog_exit+0x1e/0x30
+ dev_queue_xmit+0x10/0x20
+ ovs_vport_send+0xad/0x170 [openvswitch]
+ do_output+0x59/0x180 [openvswitch]
+ do_execute_actions+0xa80/0xaa0 [openvswitch]
+ ? kfree+0x1/0x250
+ ? kfree+0x1/0x250
+ ? kprobe_perf_func+0x4f/0x2b0
+ ? flow_lookup.constprop.0+0x5c/0x110 [openvswitch]
+ ovs_execute_actions+0x4c/0x120 [openvswitch]
+ ovs_dp_process_packet+0xa1/0x200 [openvswitch]
+ ? ovs_ct_update_key.isra.0+0xa8/0x120 [openvswitch]
+ ? ovs_ct_fill_key+0x1d/0x30 [openvswitch]
+ ? ovs_flow_key_extract+0x2db/0x350 [openvswitch]
+ ovs_vport_receive+0x77/0xd0 [openvswitch]
+ ? __htab_map_lookup_elem+0x4e/0x60
+ ? bpf_prog_680e8aff8547aec1_kfree+0x3b/0x714
+ ? trace_call_bpf+0xc8/0x150
+ ? kfree+0x1/0x250
+ ? kfree+0x1/0x250
+ ? kprobe_perf_func+0x4f/0x2b0
+ ? kprobe_perf_func+0x4f/0x2b0
+ ? __mod_memcg_lruvec_state+0x63/0xe0
+ netdev_port_receive+0xc4/0x180 [openvswitch]
+ ? netdev_port_receive+0x180/0x180 [openvswitch]
+ netdev_frame_hook+0x1f/0x40 [openvswitch]
+ __netif_receive_skb_core.constprop.0+0x23d/0xf00
+ __netif_receive_skb_one_core+0x3f/0xa0
+ __netif_receive_skb+0x15/0x60
+ process_backlog+0x9e/0x170
+ __napi_poll+0x33/0x180
+ net_rx_action+0x126/0x280
+ ? ttwu_do_activate+0x72/0xf0
+ __do_softirq+0xd9/0x2e7
+ ? rcu_report_exp_cpu_mult+0x1b0/0x1b0
+ do_softirq+0x7d/0xb0
+ </IRQ>
+ <TASK>
+ __local_bh_enable_ip+0x54/0x60
+ ip_finish_output2+0x191/0x460
+ __ip_finish_output+0xb7/0x180
+ ip_finish_output+0x2e/0xc0
+ ip_output+0x78/0x100
+ ? __ip_finish_output+0x180/0x180
+ ip_local_out+0x5e/0x70
+ __ip_queue_xmit+0x184/0x440
+ ? tcp_syn_options+0x1f9/0x300
+ ip_queue_xmit+0x15/0x20
+ __tcp_transmit_skb+0x910/0x9c0
+ ? __mod_memcg_state+0x44/0xa0
+ tcp_connect+0x437/0x4e0
+ ? ktime_get_with_offset+0x60/0xf0
+ tcp_v4_connect+0x436/0x530
+ __inet_stream_connect+0xd4/0x3a0
+ ? kprobe_perf_func+0x4f/0x2b0
+ ? aa_sk_perm+0x43/0x1c0
+ inet_stream_connect+0x3b/0x60
+ __sys_connect_file+0x63/0x70
+ __sys_connect+0xa6/0xd0
+ ? setfl+0x108/0x170
+ ? do_fcntl+0xe8/0x5a0
+ __x64_sys_connect+0x18/0x20
+ do_syscall_64+0x5c/0xc0
+ ? __x64_sys_fcntl+0xa9/0xd0
+ ? exit_to_user_mode_prepare+0x37/0xb0
+ ? syscall_exit_to_user_mode+0x27/0x50
+ ? do_syscall_64+0x69/0xc0
+ ? __sys_setsockopt+0xea/0x1e0
+ ? exit_to_user_mode_prepare+0x37/0xb0
+ ? syscall_exit_to_user_mode+0x27/0x50
+ ? __x64_sys_setsockopt+0x1f/0x30
+ ? do_syscall_64+0x69/0xc0
+ ? irqentry_exit+0x1d/0x30
+ ? exc_page_fault+0x89/0x170
+ entry_SYSCALL_64_after_hwframe+0x61/0xcb
+RIP: 0033:0x7f7b8101c6a7
+Code: 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 2a 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 18 89 54 24 0c 48 89 34 24 89
+RSP: 002b:00007ffffd6b2198 EFLAGS: 00000246 ORIG_RAX: 000000000000002a
+RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f7b8101c6a7
+RDX: 0000000000000010 RSI: 00007ffffd6b2360 RDI: 0000000000000005
+RBP: 0000561f1370d560 R08: 00002795ad21d1ac R09: 0030312e302e302e
+R10: 00007ffffd73f080 R11: 0000000000000246 R12: 0000561f1370c410
+R13: 0000000000000000 R14: 0000000000000005 R15: 0000000000000000
+ </TASK>
+
+Fixes: 7f8a436eaa2c ("openvswitch: Add conntrack action")
+Co-developed-by: Luca Czesla <luca.czesla@mail.schwarz>
+Signed-off-by: Luca Czesla <luca.czesla@mail.schwarz>
+Signed-off-by: Felix Huettner <felix.huettner@mail.schwarz>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: Simon Horman <simon.horman@corigine.com>
+Link: https://lore.kernel.org/r/ZC0pBXBAgh7c76CA@kernel-bug-kernel-bug
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Carlos Soto <carlos.soto@broadcom.com>
+Signed-off-by: Florian Fainelli <florian.fainelli@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/dev.c            |    1 +
+ net/openvswitch/actions.c |    2 +-
+ 2 files changed, 2 insertions(+), 1 deletion(-)
+
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -3186,6 +3186,7 @@ static u16 skb_tx_hash(const struct net_
+       }
+ 
+       if (skb_rx_queue_recorded(skb)) {
++              BUILD_BUG_ON_INVALID(qcount == 0);
+               hash = skb_get_rx_queue(skb);
+               if (hash >= qoffset)
+                       hash -= qoffset;
+--- a/net/openvswitch/actions.c
++++ b/net/openvswitch/actions.c
+@@ -912,7 +912,7 @@ static void do_output(struct datapath *d
+ {
+       struct vport *vport = ovs_vport_rcu(dp, out_port);
+ 
+-      if (likely(vport)) {
++      if (likely(vport && netif_carrier_ok(vport->dev))) {
+               u16 mru = OVS_CB(skb)->mru;
+               u32 cutlen = OVS_CB(skb)->cutlen;
+ 
diff --git a/queue-5.10/openvswitch-fix-lockup-on-tx-to-unregistering-netdev-with-carrier.patch b/queue-5.10/openvswitch-fix-lockup-on-tx-to-unregistering-netdev-with-carrier.patch

new file mode 100644 (file)

index 0000000..fc47c64
--- /dev/null
+++ b/queue-5.10/openvswitch-fix-lockup-on-tx-to-unregistering-netdev-with-carrier.patch
@@ -0,0 +1,77 @@
+From 47e55e4b410f7d552e43011baa5be1aab4093990 Mon Sep 17 00:00:00 2001
+From: Ilya Maximets <i.maximets@ovn.org>
+Date: Thu, 9 Jan 2025 13:21:24 +0100
+Subject: openvswitch: fix lockup on tx to unregistering netdev with carrier
+
+From: Ilya Maximets <i.maximets@ovn.org>
+
+commit 47e55e4b410f7d552e43011baa5be1aab4093990 upstream.
+
+Commit in a fixes tag attempted to fix the issue in the following
+sequence of calls:
+
+    do_output
+    -> ovs_vport_send
+       -> dev_queue_xmit
+          -> __dev_queue_xmit
+             -> netdev_core_pick_tx
+                -> skb_tx_hash
+
+When device is unregistering, the 'dev->real_num_tx_queues' goes to
+zero and the 'while (unlikely(hash >= qcount))' loop inside the
+'skb_tx_hash' becomes infinite, locking up the core forever.
+
+But unfortunately, checking just the carrier status is not enough to
+fix the issue, because some devices may still be in unregistering
+state while reporting carrier status OK.
+
+One example of such device is a net/dummy.  It sets carrier ON
+on start, but it doesn't implement .ndo_stop to set the carrier off.
+And it makes sense, because dummy doesn't really have a carrier.
+Therefore, while this device is unregistering, it's still easy to hit
+the infinite loop in the skb_tx_hash() from the OVS datapath.  There
+might be other drivers that do the same, but dummy by itself is
+important for the OVS ecosystem, because it is frequently used as a
+packet sink for tcpdump while debugging OVS deployments.  And when the
+issue is hit, the only way to recover is to reboot.
+
+Fix that by also checking if the device is running.  The running
+state is handled by the net core during unregistering, so it covers
+unregistering case better, and we don't really need to send packets
+to devices that are not running anyway.
+
+While only checking the running state might be enough, the carrier
+check is preserved.  The running and the carrier states seem disjoined
+throughout the code and different drivers.  And other core functions
+like __dev_direct_xmit() check both before attempting to transmit
+a packet.  So, it seems safer to check both flags in OVS as well.
+
+Fixes: 066b86787fa3 ("net: openvswitch: fix race on port output")
+Reported-by: Friedrich Weber <f.weber@proxmox.com>
+Closes: https://mail.openvswitch.org/pipermail/ovs-discuss/2025-January/053423.html
+Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
+Tested-by: Friedrich Weber <f.weber@proxmox.com>
+Reviewed-by: Aaron Conole <aconole@redhat.com>
+Link: https://patch.msgid.link/20250109122225.4034688-1-i.maximets@ovn.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+Signed-off-by: Carlos Soto <carlos.soto@broadcom.com>
+Signed-off-by: Florian Fainelli <florian.fainelli@broadcom.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/openvswitch/actions.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/net/openvswitch/actions.c
++++ b/net/openvswitch/actions.c
+@@ -912,7 +912,9 @@ static void do_output(struct datapath *d
+ {
+       struct vport *vport = ovs_vport_rcu(dp, out_port);
+ 
+-      if (likely(vport && netif_carrier_ok(vport->dev))) {
++      if (likely(vport &&
++                 netif_running(vport->dev) &&
++                 netif_carrier_ok(vport->dev))) {
+               u16 mru = OVS_CB(skb)->mru;
+               u32 cutlen = OVS_CB(skb)->cutlen;
+ 
diff --git a/queue-5.10/series b/queue-5.10/series

index 31356db87d7ad31af7d6e825fc6b2f3746c64aff..78b11ed41942d86360ca08dbe2d059b2d1ca39f5 100644 (file)
--- a/queue-5.10/series
+++ b/queue-5.10/series
@@ -165,3 +165,13 @@ smb-client-fix-uaf-in-async-decryption.patch
  smb-client-fix-null-ptr-deref-in-crypto_aead_setkey.patch
  bpf-avoid-holding-freeze_mutex-during-mmap-operation.patch
  bpf-check-rcu_read_lock_trace_held-before-calling-bpf-map-helpers.patch
+blk-cgroup-support-to-track-if-policy-is-online.patch
+blk-iocost-do-not-warn-if-iocg-was-already-offlined.patch
+ext4-fix-timer-use-after-free-on-failed-mount.patch
+mmc-sdhci-brcmstb-add-ability-to-increase-max-clock-rate-for-72116b0.patch
+mmc-sdhci-brcmstb-initialize-base_clk-to-null-in-sdhci_brcmstb_probe.patch
+mmc-sdhci-brcmstb-use-clk_get_rate-base_clk-in-pm-resume.patch
+net-mlx5e-fix-use-after-free-of-encap-entry-in-neigh-update-handler.patch
+ipvs-properly-dereference-pe-in-ip_vs_add_service.patch
+net-openvswitch-fix-race-on-port-output.patch
+openvswitch-fix-lockup-on-tx-to-unregistering-netdev-with-carrier.patch
author	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Tue, 22 Apr 2025 12:32:54 +0000 (14:32 +0200)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Tue, 22 Apr 2025 12:32:54 +0000 (14:32 +0200)
queue-5.10/blk-cgroup-support-to-track-if-policy-is-online.patch	[new file with mode: 0644]	patch \| blob
queue-5.10/blk-iocost-do-not-warn-if-iocg-was-already-offlined.patch	[new file with mode: 0644]	patch \| blob
queue-5.10/ext4-fix-timer-use-after-free-on-failed-mount.patch	[new file with mode: 0644]	patch \| blob
queue-5.10/ipvs-properly-dereference-pe-in-ip_vs_add_service.patch	[new file with mode: 0644]	patch \| blob
queue-5.10/mmc-sdhci-brcmstb-add-ability-to-increase-max-clock-rate-for-72116b0.patch	[new file with mode: 0644]	patch \| blob
queue-5.10/mmc-sdhci-brcmstb-initialize-base_clk-to-null-in-sdhci_brcmstb_probe.patch	[new file with mode: 0644]	patch \| blob
queue-5.10/mmc-sdhci-brcmstb-use-clk_get_rate-base_clk-in-pm-resume.patch	[new file with mode: 0644]	patch \| blob
queue-5.10/net-mlx5e-fix-use-after-free-of-encap-entry-in-neigh-update-handler.patch	[new file with mode: 0644]	patch \| blob
queue-5.10/net-openvswitch-fix-race-on-port-output.patch	[new file with mode: 0644]	patch \| blob
queue-5.10/openvswitch-fix-lockup-on-tx-to-unregistering-netdev-with-carrier.patch	[new file with mode: 0644]	patch \| blob
queue-5.10/series		patch \| blob \| blame \| history