From: Sasha Levin Date: Sun, 15 Jun 2025 13:01:13 +0000 (-0400) Subject: Fixes for 5.4 X-Git-Tag: v6.6.94~59 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=f724d2960e671efa0e5bcb51327690f791923e4b;p=thirdparty%2Fkernel%2Fstable-queue.git Fixes for 5.4 Signed-off-by: Sasha Levin --- diff --git a/queue-5.4/i40e-retry-vflr-handling-if-there-is-ongoing-vf-rese.patch b/queue-5.4/i40e-retry-vflr-handling-if-there-is-ongoing-vf-rese.patch new file mode 100644 index 0000000000..08ad550868 --- /dev/null +++ b/queue-5.4/i40e-retry-vflr-handling-if-there-is-ongoing-vf-rese.patch @@ -0,0 +1,44 @@ +From a4420b7629b53a00678d795c81ee42e59f4587eb Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 20 May 2025 10:31:52 +0200 +Subject: i40e: retry VFLR handling if there is ongoing VF reset + +From: Robert Malz + +[ Upstream commit fb4e9239e029954a37a00818b21e837cebf2aa10 ] + +When a VFLR interrupt is received during a VF reset initiated from a +different source, the VFLR may be not fully handled. This can +leave the VF in an undefined state. +To address this, set the I40E_VFLR_EVENT_PENDING bit again during VFLR +handling if the reset is not yet complete. This ensures the driver +will properly complete the VF reset in such scenarios. 
+ +Fixes: 52424f974bc5 ("i40e: Fix VF hang when reset is triggered on another VF") +Signed-off-by: Robert Malz +Tested-by: Rafal Romanowski +Signed-off-by: Tony Nguyen +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +index 783581739417f..d8ba409122032 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +@@ -4125,7 +4125,10 @@ int i40e_vc_process_vflr_event(struct i40e_pf *pf) + reg = rd32(hw, I40E_GLGEN_VFLRSTAT(reg_idx)); + if (reg & BIT(bit_idx)) + /* i40e_reset_vf will clear the bit in GLGEN_VFLRSTAT */ +- i40e_reset_vf(vf, true); ++ if (!i40e_reset_vf(vf, true)) { ++ /* At least one VF did not finish resetting, retry next time */ ++ set_bit(__I40E_VFLR_EVENT_PENDING, pf->state); ++ } + } + + return 0; +-- +2.39.5 + diff --git a/queue-5.4/i40e-return-false-from-i40e_reset_vf-if-reset-is-in-.patch b/queue-5.4/i40e-return-false-from-i40e_reset_vf-if-reset-is-in-.patch new file mode 100644 index 0000000000..726dc120a0 --- /dev/null +++ b/queue-5.4/i40e-return-false-from-i40e_reset_vf-if-reset-is-in-.patch @@ -0,0 +1,55 @@ +From a65a2d1ffc886e667901fca34f3527996122ac32 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 20 May 2025 10:31:51 +0200 +Subject: i40e: return false from i40e_reset_vf if reset is in progress + +From: Robert Malz + +[ Upstream commit a2c90d63b71223d69a813333c1abf4fdacddbbe5 ] + +The function i40e_vc_reset_vf attempts, up to 20 times, to handle a +VF reset request, using the return value of i40e_reset_vf as an indicator +of whether the reset was successfully triggered. Currently, i40e_reset_vf +always returns true, which causes new reset requests to be ignored if a +different VF reset is already in progress. 
+ +This patch updates the return value of i40e_reset_vf to reflect when +another VF reset is in progress, allowing the caller to properly use +the retry mechanism. + +Fixes: 52424f974bc5 ("i40e: Fix VF hang when reset is triggered on another VF") +Signed-off-by: Robert Malz +Tested-by: Rafal Romanowski +Signed-off-by: Tony Nguyen +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +index 81f428d0b7a4c..783581739417f 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +@@ -1456,8 +1456,8 @@ static void i40e_cleanup_reset_vf(struct i40e_vf *vf) + * @vf: pointer to the VF structure + * @flr: VFLR was issued or not + * +- * Returns true if the VF is in reset, resets successfully, or resets +- * are disabled and false otherwise. ++ * Return: True if reset was performed successfully or if resets are disabled. ++ * False if reset is already in progress. + **/ + bool i40e_reset_vf(struct i40e_vf *vf, bool flr) + { +@@ -1476,7 +1476,7 @@ bool i40e_reset_vf(struct i40e_vf *vf, bool flr) + + /* If VF is being reset already we don't need to continue. 
*/ + if (test_and_set_bit(I40E_VF_STATE_RESETTING, &vf->vf_states)) +- return true; ++ return false; + + i40e_trigger_vf_reset(vf, flr); + +-- +2.39.5 + diff --git a/queue-5.4/net-mdio-c22-is-now-optional-eopnotsupp-if-not-provi.patch b/queue-5.4/net-mdio-c22-is-now-optional-eopnotsupp-if-not-provi.patch new file mode 100644 index 0000000000..c3d5d8f36e --- /dev/null +++ b/queue-5.4/net-mdio-c22-is-now-optional-eopnotsupp-if-not-provi.patch @@ -0,0 +1,53 @@ +From da597af19f72462841e173c8d8d66c61eb0e489d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 9 Jan 2023 16:30:44 +0100 +Subject: net: mdio: C22 is now optional, EOPNOTSUPP if not provided + +From: Andrew Lunn + +[ Upstream commit b063b1924fd9bf0bc157cf644764dc2151d04ccc ] + +When performing a C22 operation, check that the bus driver actually +provides the methods, and return -EOPNOTSUPP if not. C45 only busses +do exist, and in future their C22 methods will be NULL. + +Signed-off-by: Andrew Lunn +Signed-off-by: Michael Walle +Signed-off-by: Jakub Kicinski +Stable-dep-of: 0e629694126c ("net/mdiobus: Fix potential out-of-bounds read/write access") +Signed-off-by: Sasha Levin +--- + drivers/net/phy/mdio_bus.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c +index fdf8221f46fa5..e5c25beae21e0 100644 +--- a/drivers/net/phy/mdio_bus.c ++++ b/drivers/net/phy/mdio_bus.c +@@ -565,7 +565,10 @@ int __mdiobus_read(struct mii_bus *bus, int addr, u32 regnum) + + WARN_ON_ONCE(!mutex_is_locked(&bus->mdio_lock)); + +- retval = bus->read(bus, addr, regnum); ++ if (bus->read) ++ retval = bus->read(bus, addr, regnum); ++ else ++ retval = -EOPNOTSUPP; + + trace_mdio_access(bus, 1, addr, regnum, retval, retval); + +@@ -590,7 +593,10 @@ int __mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val) + + WARN_ON_ONCE(!mutex_is_locked(&bus->mdio_lock)); + +- err = bus->write(bus, addr, regnum, val); ++ if (bus->write) ++ err = 
bus->write(bus, addr, regnum, val); ++ else ++ err = -EOPNOTSUPP; + + trace_mdio_access(bus, 0, addr, regnum, val, err); + +-- +2.39.5 + diff --git a/queue-5.4/net-mdiobus-fix-potential-out-of-bounds-read-write-a.patch b/queue-5.4/net-mdiobus-fix-potential-out-of-bounds-read-write-a.patch new file mode 100644 index 0000000000..0c7a994236 --- /dev/null +++ b/queue-5.4/net-mdiobus-fix-potential-out-of-bounds-read-write-a.patch @@ -0,0 +1,58 @@ +From c5257e2fd52f6ecd93b70d16c4cd49adf3ef7a5b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 9 Jun 2025 17:31:46 +0200 +Subject: net/mdiobus: Fix potential out-of-bounds read/write access + +From: Jakub Raczynski + +[ Upstream commit 0e629694126ca388916f059453a1c36adde219c4 ] + +When using publicly available tools like 'mdio-tools' to read/write data +from/to network interface and its PHY via mdiobus, there is no verification of +parameters passed to the ioctl and it accepts any mdio address. +Currently there is support for 32 addresses in kernel via PHY_MAX_ADDR define, +but it is possible to pass higher value than that via ioctl. +While read/write operation should generally fail in this case, +mdiobus provides stats array, where wrong address may allow out-of-bounds +read/write. + +Fix that by adding address verification before read/write operation. +While this excludes this access from any statistics, it improves security of +read/write operation. + +Fixes: 080bb352fad00 ("net: phy: Maintain MDIO device and bus statistics") +Signed-off-by: Jakub Raczynski +Reported-by: Wenjing Shan +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + drivers/net/phy/mdio_bus.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c +index e5c25beae21e0..931b9a6c5dc50 100644 +--- a/drivers/net/phy/mdio_bus.c ++++ b/drivers/net/phy/mdio_bus.c +@@ -565,6 +565,9 @@ int __mdiobus_read(struct mii_bus *bus, int addr, u32 regnum) + + WARN_ON_ONCE(!mutex_is_locked(&bus->mdio_lock)); + ++ if (addr >= PHY_MAX_ADDR) ++ return -ENXIO; ++ + if (bus->read) + retval = bus->read(bus, addr, regnum); + else +@@ -593,6 +596,9 @@ int __mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val) + + WARN_ON_ONCE(!mutex_is_locked(&bus->mdio_lock)); + ++ if (addr >= PHY_MAX_ADDR) ++ return -ENXIO; ++ + if (bus->write) + err = bus->write(bus, addr, regnum, val); + else +-- +2.39.5 + diff --git a/queue-5.4/net-mlx5-fix-return-value-when-searching-for-existin.patch b/queue-5.4/net-mlx5-fix-return-value-when-searching-for-existin.patch new file mode 100644 index 0000000000..11a0aecdc8 --- /dev/null +++ b/queue-5.4/net-mlx5-fix-return-value-when-searching-for-existin.patch @@ -0,0 +1,61 @@ +From 4e2ebe4fd1f6d78a8eff03c08e54290dec94b24c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 10 Jun 2025 18:15:08 +0300 +Subject: net/mlx5: Fix return value when searching for existing flow group + +From: Patrisious Haddad + +[ Upstream commit 8ec40e3f1f72bf8f8accf18020d487caa99f46a4 ] + +When attempting to add a rule to an existing flow group, if a matching +flow group exists but is not active, the error code returned should be +EAGAIN, so that the rule can be added to the matching flow group once +it is active, rather than ENOENT, which indicates that no matching +flow group was found. 
+ +Fixes: bd71b08ec2ee ("net/mlx5: Support multiple updates of steering rules in parallel") +Signed-off-by: Gavi Teitz +Signed-off-by: Roi Dayan +Signed-off-by: Patrisious Haddad +Reviewed-by: Tariq Toukan +Signed-off-by: Mark Bloch +Link: https://patch.msgid.link/20250610151514.1094735-4-mbloch@nvidia.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +index 25f9185d5a15e..22318edff5514 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +@@ -1716,6 +1716,7 @@ try_add_to_existing_fg(struct mlx5_flow_table *ft, + struct mlx5_flow_handle *rule; + struct match_list *iter; + bool take_write = false; ++ bool try_again = false; + struct fs_fte *fte; + u64 version; + int err; +@@ -1771,6 +1772,7 @@ try_add_to_existing_fg(struct mlx5_flow_table *ft, + nested_down_write_ref_node(&g->node, FS_LOCK_PARENT); + + if (!g->node.active) { ++ try_again = true; + up_write_ref_node(&g->node, false); + continue; + } +@@ -1792,7 +1794,8 @@ try_add_to_existing_fg(struct mlx5_flow_table *ft, + tree_put_node(&fte->node, false); + return rule; + } +- rule = ERR_PTR(-ENOENT); ++ err = try_again ? 
-EAGAIN : -ENOENT; ++ rule = ERR_PTR(err); + out: + kmem_cache_free(steering->ftes_cache, fte); + return rule; +-- +2.39.5 + diff --git a/queue-5.4/net-mlx5-wait-for-inactive-autogroups.patch b/queue-5.4/net-mlx5-wait-for-inactive-autogroups.patch new file mode 100644 index 0000000000..a7999bd33f --- /dev/null +++ b/queue-5.4/net-mlx5-wait-for-inactive-autogroups.patch @@ -0,0 +1,51 @@ +From cd6d5cdc02c52ffec8dc01caeda3733baef69a9b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 7 May 2020 12:01:39 +0300 +Subject: net/mlx5: Wait for inactive autogroups + +From: Paul Blakey + +[ Upstream commit 49c0355d301b4e0e01e0f19ddbb023bd7d0ee48c ] + +Currently, if one thread tries to add an entry to an autogrouped table +with no free matching group, while another thread is in the process of +creating a new matching autogroup, it doesn't wait for the new group +creation, and creates an unnecessary new autogroup. + +Instead of skipping inactive, wait on the write lock of those groups. + +Signed-off-by: Paul Blakey +Reviewed-by: Roi Dayan +Reviewed-by: Mark Bloch +Reviewed-by: Maor Gottlieb +Signed-off-by: Saeed Mahameed +Stable-dep-of: 8ec40e3f1f72 ("net/mlx5: Fix return value when searching for existing flow group") +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +index 30d5b7f52a2a0..25f9185d5a15e 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +@@ -1768,11 +1768,13 @@ try_add_to_existing_fg(struct mlx5_flow_table *ft, + list_for_each_entry(iter, match_head, list) { + g = iter->g; + +- if (!g->node.active) +- continue; +- + nested_down_write_ref_node(&g->node, FS_LOCK_PARENT); + ++ if (!g->node.active) { ++ up_write_ref_node(&g->node, false); ++ continue; ++ } ++ + err = 
insert_fte(g, fte); + if (err) { + up_write_ref_node(&g->node, false); +-- +2.39.5 + diff --git a/queue-5.4/net-sch_ets-add-a-new-qdisc.patch b/queue-5.4/net-sch_ets-add-a-new-qdisc.patch new file mode 100644 index 0000000000..c4eb4b1472 --- /dev/null +++ b/queue-5.4/net-sch_ets-add-a-new-qdisc.patch @@ -0,0 +1,840 @@ +From 8a50b65fef0c97bc395ff2166a42ea28e89d2f90 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 18 Dec 2019 14:55:13 +0000 +Subject: net: sch_ets: Add a new Qdisc + +From: Petr Machata + +[ Upstream commit dcc68b4d8084e1ac9af0d4022d6b1aff6a139a33 ] + +Introduces a new Qdisc, which is based on 802.1Q-2014 wording. It is +PRIO-like in how it is configured, meaning one needs to specify how many +bands there are, how many are strict and how many are dwrr, quanta for the +latter, and priomap. + +The new Qdisc operates like the PRIO / DRR combo would when configured as +per the standard. The strict classes, if any, are tried for traffic first. +When there's no traffic in any of the strict queues, the ETS ones (if any) +are treated in the same way as in DRR. + +Signed-off-by: Petr Machata +Acked-by: Jiri Pirko +Signed-off-by: David S. 
Miller +Stable-dep-of: d92adacdd8c2 ("net_sched: ets: fix a race in ets_qdisc_change()") +Signed-off-by: Sasha Levin +--- + include/uapi/linux/pkt_sched.h | 17 + + net/sched/Kconfig | 17 + + net/sched/Makefile | 1 + + net/sched/sch_ets.c | 733 +++++++++++++++++++++++++++++++++ + 4 files changed, 768 insertions(+) + create mode 100644 net/sched/sch_ets.c + +diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h +index 4a245d7a5c8d6..3933f589118c7 100644 +--- a/include/uapi/linux/pkt_sched.h ++++ b/include/uapi/linux/pkt_sched.h +@@ -1183,4 +1183,21 @@ enum { + + #define TCA_TAPRIO_ATTR_MAX (__TCA_TAPRIO_ATTR_MAX - 1) + ++/* ETS */ ++ ++#define TCQ_ETS_MAX_BANDS 16 ++ ++enum { ++ TCA_ETS_UNSPEC, ++ TCA_ETS_NBANDS, /* u8 */ ++ TCA_ETS_NSTRICT, /* u8 */ ++ TCA_ETS_QUANTA, /* nested TCA_ETS_QUANTA_BAND */ ++ TCA_ETS_QUANTA_BAND, /* u32 */ ++ TCA_ETS_PRIOMAP, /* nested TCA_ETS_PRIOMAP_BAND */ ++ TCA_ETS_PRIOMAP_BAND, /* u8 */ ++ __TCA_ETS_MAX, ++}; ++ ++#define TCA_ETS_MAX (__TCA_ETS_MAX - 1) ++ + #endif +diff --git a/net/sched/Kconfig b/net/sched/Kconfig +index 49521aa33ab9f..8b255f8914426 100644 +--- a/net/sched/Kconfig ++++ b/net/sched/Kconfig +@@ -367,6 +367,23 @@ config NET_SCH_PLUG + To compile this code as a module, choose M here: the + module will be called sch_plug. + ++config NET_SCH_ETS ++ tristate "Enhanced transmission selection scheduler (ETS)" ++ help ++ The Enhanced Transmission Selection scheduler is a classful ++ queuing discipline that merges functionality of PRIO and DRR ++ qdiscs in one scheduler. ETS makes it easy to configure a set of ++ strict and bandwidth-sharing bands to implement the transmission ++ selection described in 802.1Qaz. ++ ++ Say Y here if you want to use the ETS packet scheduling ++ algorithm. ++ ++ To compile this driver as a module, choose M here: the module ++ will be called sch_ets. ++ ++ If unsure, say N. 
++ + menuconfig NET_SCH_DEFAULT + bool "Allow override default queue discipline" + ---help--- +diff --git a/net/sched/Makefile b/net/sched/Makefile +index e5ea44ec13c58..5b457f80de6db 100644 +--- a/net/sched/Makefile ++++ b/net/sched/Makefile +@@ -45,6 +45,7 @@ obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o + obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o + obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o + obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o ++obj-$(CONFIG_NET_SCH_ETS) += sch_ets.o + obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o + obj-$(CONFIG_NET_SCH_SKBPRIO) += sch_skbprio.o + obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o +diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c +new file mode 100644 +index 0000000000000..e6194b23e9b0d +--- /dev/null ++++ b/net/sched/sch_ets.c +@@ -0,0 +1,733 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* ++ * net/sched/sch_ets.c Enhanced Transmission Selection scheduler ++ * ++ * Description ++ * ----------- ++ * ++ * The Enhanced Transmission Selection scheduler is a classful queuing ++ * discipline that merges functionality of PRIO and DRR qdiscs in one scheduler. ++ * ETS makes it easy to configure a set of strict and bandwidth-sharing bands to ++ * implement the transmission selection described in 802.1Qaz. ++ * ++ * Although ETS is technically classful, it's not possible to add and remove ++ * classes at will. Instead one specifies number of classes, how many are ++ * PRIO-like and how many DRR-like, and quanta for the latter. ++ * ++ * Algorithm ++ * --------- ++ * ++ * The strict classes, if any, are tried for traffic first: first band 0, if it ++ * has no traffic then band 1, etc. ++ * ++ * When there is no traffic in any of the strict queues, the bandwidth-sharing ++ * ones are tried next. Each band is assigned a deficit counter, initialized to ++ * "quantum" of that band. ETS maintains a list of active bandwidth-sharing ++ * bands whose qdiscs are non-empty. 
A packet is dequeued from the band at the ++ * head of the list if the packet size is smaller or equal to the deficit ++ * counter. If the counter is too small, it is increased by "quantum" and the ++ * scheduler moves on to the next band in the active list. ++ */ ++ ++#include <linux/module.h> ++#include <net/gen_stats.h> ++#include <net/netlink.h> ++#include <net/pkt_cls.h> ++#include <net/pkt_sched.h> ++#include <net/sch_generic.h> ++ ++struct ets_class { ++ struct list_head alist; /* In struct ets_sched.active. */ ++ struct Qdisc *qdisc; ++ u32 quantum; ++ u32 deficit; ++ struct gnet_stats_basic_packed bstats; ++ struct gnet_stats_queue qstats; ++}; ++ ++struct ets_sched { ++ struct list_head active; ++ struct tcf_proto __rcu *filter_list; ++ struct tcf_block *block; ++ unsigned int nbands; ++ unsigned int nstrict; ++ u8 prio2band[TC_PRIO_MAX + 1]; ++ struct ets_class classes[TCQ_ETS_MAX_BANDS]; ++}; ++ ++static const struct nla_policy ets_policy[TCA_ETS_MAX + 1] = { ++ [TCA_ETS_NBANDS] = { .type = NLA_U8 }, ++ [TCA_ETS_NSTRICT] = { .type = NLA_U8 }, ++ [TCA_ETS_QUANTA] = { .type = NLA_NESTED }, ++ [TCA_ETS_PRIOMAP] = { .type = NLA_NESTED }, ++}; ++ ++static const struct nla_policy ets_priomap_policy[TCA_ETS_MAX + 1] = { ++ [TCA_ETS_PRIOMAP_BAND] = { .type = NLA_U8 }, ++}; ++ ++static const struct nla_policy ets_quanta_policy[TCA_ETS_MAX + 1] = { ++ [TCA_ETS_QUANTA_BAND] = { .type = NLA_U32 }, ++}; ++ ++static const struct nla_policy ets_class_policy[TCA_ETS_MAX + 1] = { ++ [TCA_ETS_QUANTA_BAND] = { .type = NLA_U32 }, ++}; ++ ++static int ets_quantum_parse(struct Qdisc *sch, const struct nlattr *attr, ++ unsigned int *quantum, ++ struct netlink_ext_ack *extack) ++{ ++ *quantum = nla_get_u32(attr); ++ if (!*quantum) { ++ NL_SET_ERR_MSG(extack, "ETS quantum cannot be zero"); ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++static struct ets_class * ++ets_class_from_arg(struct Qdisc *sch, unsigned long arg) ++{ ++ struct ets_sched *q = qdisc_priv(sch); ++ ++ return &q->classes[arg - 1]; ++} ++ ++static u32 ets_class_id(struct Qdisc *sch, const struct ets_class *cl) ++{ 
++ struct ets_sched *q = qdisc_priv(sch); ++ int band = cl - q->classes; ++ ++ return TC_H_MAKE(sch->handle, band + 1); ++} ++ ++static bool ets_class_is_strict(struct ets_sched *q, const struct ets_class *cl) ++{ ++ unsigned int band = cl - q->classes; ++ ++ return band < q->nstrict; ++} ++ ++static int ets_class_change(struct Qdisc *sch, u32 classid, u32 parentid, ++ struct nlattr **tca, unsigned long *arg, ++ struct netlink_ext_ack *extack) ++{ ++ struct ets_class *cl = ets_class_from_arg(sch, *arg); ++ struct ets_sched *q = qdisc_priv(sch); ++ struct nlattr *opt = tca[TCA_OPTIONS]; ++ struct nlattr *tb[TCA_ETS_MAX + 1]; ++ unsigned int quantum; ++ int err; ++ ++ /* Classes can be added and removed only through Qdisc_ops.change ++ * interface. ++ */ ++ if (!cl) { ++ NL_SET_ERR_MSG(extack, "Fine-grained class addition and removal is not supported"); ++ return -EOPNOTSUPP; ++ } ++ ++ if (!opt) { ++ NL_SET_ERR_MSG(extack, "ETS options are required for this operation"); ++ return -EINVAL; ++ } ++ ++ err = nla_parse_nested(tb, TCA_ETS_MAX, opt, ets_class_policy, extack); ++ if (err < 0) ++ return err; ++ ++ if (!tb[TCA_ETS_QUANTA_BAND]) ++ /* Nothing to configure. 
*/ ++ return 0; ++ ++ if (ets_class_is_strict(q, cl)) { ++ NL_SET_ERR_MSG(extack, "Strict bands do not have a configurable quantum"); ++ return -EINVAL; ++ } ++ ++ err = ets_quantum_parse(sch, tb[TCA_ETS_QUANTA_BAND], &quantum, ++ extack); ++ if (err) ++ return err; ++ ++ sch_tree_lock(sch); ++ cl->quantum = quantum; ++ sch_tree_unlock(sch); ++ return 0; ++} ++ ++static int ets_class_graft(struct Qdisc *sch, unsigned long arg, ++ struct Qdisc *new, struct Qdisc **old, ++ struct netlink_ext_ack *extack) ++{ ++ struct ets_class *cl = ets_class_from_arg(sch, arg); ++ ++ if (!new) { ++ new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, ++ ets_class_id(sch, cl), NULL); ++ if (!new) ++ new = &noop_qdisc; ++ else ++ qdisc_hash_add(new, true); ++ } ++ ++ *old = qdisc_replace(sch, new, &cl->qdisc); ++ return 0; ++} ++ ++static struct Qdisc *ets_class_leaf(struct Qdisc *sch, unsigned long arg) ++{ ++ struct ets_class *cl = ets_class_from_arg(sch, arg); ++ ++ return cl->qdisc; ++} ++ ++static unsigned long ets_class_find(struct Qdisc *sch, u32 classid) ++{ ++ unsigned long band = TC_H_MIN(classid); ++ struct ets_sched *q = qdisc_priv(sch); ++ ++ if (band - 1 >= q->nbands) ++ return 0; ++ return band; ++} ++ ++static void ets_class_qlen_notify(struct Qdisc *sch, unsigned long arg) ++{ ++ struct ets_class *cl = ets_class_from_arg(sch, arg); ++ struct ets_sched *q = qdisc_priv(sch); ++ ++ /* We get notified about zero-length child Qdiscs as well if they are ++ * offloaded. Those aren't on the active list though, so don't attempt ++ * to remove them. 
++ */ ++ if (!ets_class_is_strict(q, cl) && sch->q.qlen) ++ list_del(&cl->alist); ++} ++ ++static int ets_class_dump(struct Qdisc *sch, unsigned long arg, ++ struct sk_buff *skb, struct tcmsg *tcm) ++{ ++ struct ets_class *cl = ets_class_from_arg(sch, arg); ++ struct ets_sched *q = qdisc_priv(sch); ++ struct nlattr *nest; ++ ++ tcm->tcm_parent = TC_H_ROOT; ++ tcm->tcm_handle = ets_class_id(sch, cl); ++ tcm->tcm_info = cl->qdisc->handle; ++ ++ nest = nla_nest_start_noflag(skb, TCA_OPTIONS); ++ if (!nest) ++ goto nla_put_failure; ++ if (!ets_class_is_strict(q, cl)) { ++ if (nla_put_u32(skb, TCA_ETS_QUANTA_BAND, cl->quantum)) ++ goto nla_put_failure; ++ } ++ return nla_nest_end(skb, nest); ++ ++nla_put_failure: ++ nla_nest_cancel(skb, nest); ++ return -EMSGSIZE; ++} ++ ++static int ets_class_dump_stats(struct Qdisc *sch, unsigned long arg, ++ struct gnet_dump *d) ++{ ++ struct ets_class *cl = ets_class_from_arg(sch, arg); ++ struct Qdisc *cl_q = cl->qdisc; ++ ++ if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), ++ d, NULL, &cl_q->bstats) < 0 || ++ qdisc_qstats_copy(d, cl_q) < 0) ++ return -1; ++ ++ return 0; ++} ++ ++static void ets_qdisc_walk(struct Qdisc *sch, struct qdisc_walker *arg) ++{ ++ struct ets_sched *q = qdisc_priv(sch); ++ int i; ++ ++ if (arg->stop) ++ return; ++ ++ for (i = 0; i < q->nbands; i++) { ++ if (arg->count < arg->skip) { ++ arg->count++; ++ continue; ++ } ++ if (arg->fn(sch, i + 1, arg) < 0) { ++ arg->stop = 1; ++ break; ++ } ++ arg->count++; ++ } ++} ++ ++static struct tcf_block * ++ets_qdisc_tcf_block(struct Qdisc *sch, unsigned long cl, ++ struct netlink_ext_ack *extack) ++{ ++ struct ets_sched *q = qdisc_priv(sch); ++ ++ if (cl) { ++ NL_SET_ERR_MSG(extack, "ETS classid must be zero"); ++ return NULL; ++ } ++ ++ return q->block; ++} ++ ++static unsigned long ets_qdisc_bind_tcf(struct Qdisc *sch, unsigned long parent, ++ u32 classid) ++{ ++ return ets_class_find(sch, classid); ++} ++ ++static void ets_qdisc_unbind_tcf(struct Qdisc 
*sch, unsigned long arg) ++{ ++} ++ ++static struct ets_class *ets_classify(struct sk_buff *skb, struct Qdisc *sch, ++ int *qerr) ++{ ++ struct ets_sched *q = qdisc_priv(sch); ++ u32 band = skb->priority; ++ struct tcf_result res; ++ struct tcf_proto *fl; ++ int err; ++ ++ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; ++ if (TC_H_MAJ(skb->priority) != sch->handle) { ++ fl = rcu_dereference_bh(q->filter_list); ++ err = tcf_classify(skb, fl, &res, false); ++#ifdef CONFIG_NET_CLS_ACT ++ switch (err) { ++ case TC_ACT_STOLEN: ++ case TC_ACT_QUEUED: ++ case TC_ACT_TRAP: ++ *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; ++ /* fall through */ ++ case TC_ACT_SHOT: ++ return NULL; ++ } ++#endif ++ if (!fl || err < 0) { ++ if (TC_H_MAJ(band)) ++ band = 0; ++ return &q->classes[q->prio2band[band & TC_PRIO_MAX]]; ++ } ++ band = res.classid; ++ } ++ band = TC_H_MIN(band) - 1; ++ if (band >= q->nbands) ++ return &q->classes[q->prio2band[0]]; ++ return &q->classes[band]; ++} ++ ++static int ets_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, ++ struct sk_buff **to_free) ++{ ++ unsigned int len = qdisc_pkt_len(skb); ++ struct ets_sched *q = qdisc_priv(sch); ++ struct ets_class *cl; ++ int err = 0; ++ bool first; ++ ++ cl = ets_classify(skb, sch, &err); ++ if (!cl) { ++ if (err & __NET_XMIT_BYPASS) ++ qdisc_qstats_drop(sch); ++ __qdisc_drop(skb, to_free); ++ return err; ++ } ++ ++ first = !cl->qdisc->q.qlen; ++ err = qdisc_enqueue(skb, cl->qdisc, to_free); ++ if (unlikely(err != NET_XMIT_SUCCESS)) { ++ if (net_xmit_drop_count(err)) { ++ cl->qstats.drops++; ++ qdisc_qstats_drop(sch); ++ } ++ return err; ++ } ++ ++ if (first && !ets_class_is_strict(q, cl)) { ++ list_add_tail(&cl->alist, &q->active); ++ cl->deficit = cl->quantum; ++ } ++ ++ sch->qstats.backlog += len; ++ sch->q.qlen++; ++ return err; ++} ++ ++static struct sk_buff * ++ets_qdisc_dequeue_skb(struct Qdisc *sch, struct sk_buff *skb) ++{ ++ qdisc_bstats_update(sch, skb); ++ qdisc_qstats_backlog_dec(sch, skb); ++ 
sch->q.qlen--; ++ return skb; ++} ++ ++static struct sk_buff *ets_qdisc_dequeue(struct Qdisc *sch) ++{ ++ struct ets_sched *q = qdisc_priv(sch); ++ struct ets_class *cl; ++ struct sk_buff *skb; ++ unsigned int band; ++ unsigned int len; ++ ++ while (1) { ++ for (band = 0; band < q->nstrict; band++) { ++ cl = &q->classes[band]; ++ skb = qdisc_dequeue_peeked(cl->qdisc); ++ if (skb) ++ return ets_qdisc_dequeue_skb(sch, skb); ++ } ++ ++ if (list_empty(&q->active)) ++ goto out; ++ ++ cl = list_first_entry(&q->active, struct ets_class, alist); ++ skb = cl->qdisc->ops->peek(cl->qdisc); ++ if (!skb) { ++ qdisc_warn_nonwc(__func__, cl->qdisc); ++ goto out; ++ } ++ ++ len = qdisc_pkt_len(skb); ++ if (len <= cl->deficit) { ++ cl->deficit -= len; ++ skb = qdisc_dequeue_peeked(cl->qdisc); ++ if (unlikely(!skb)) ++ goto out; ++ if (cl->qdisc->q.qlen == 0) ++ list_del(&cl->alist); ++ return ets_qdisc_dequeue_skb(sch, skb); ++ } ++ ++ cl->deficit += cl->quantum; ++ list_move_tail(&cl->alist, &q->active); ++ } ++out: ++ return NULL; ++} ++ ++static int ets_qdisc_priomap_parse(struct nlattr *priomap_attr, ++ unsigned int nbands, u8 *priomap, ++ struct netlink_ext_ack *extack) ++{ ++ const struct nlattr *attr; ++ int prio = 0; ++ u8 band; ++ int rem; ++ int err; ++ ++ err = __nla_validate_nested(priomap_attr, TCA_ETS_MAX, ++ ets_priomap_policy, NL_VALIDATE_STRICT, ++ extack); ++ if (err) ++ return err; ++ ++ nla_for_each_nested(attr, priomap_attr, rem) { ++ switch (nla_type(attr)) { ++ case TCA_ETS_PRIOMAP_BAND: ++ if (prio > TC_PRIO_MAX) { ++ NL_SET_ERR_MSG_MOD(extack, "Too many priorities in ETS priomap"); ++ return -EINVAL; ++ } ++ band = nla_get_u8(attr); ++ if (band >= nbands) { ++ NL_SET_ERR_MSG_MOD(extack, "Invalid band number in ETS priomap"); ++ return -EINVAL; ++ } ++ priomap[prio++] = band; ++ break; ++ default: ++ WARN_ON_ONCE(1); /* Validate should have caught this. 
*/ ++ return -EINVAL; ++ } ++ } ++ ++ return 0; ++} ++ ++static int ets_qdisc_quanta_parse(struct Qdisc *sch, struct nlattr *quanta_attr, ++ unsigned int nbands, unsigned int nstrict, ++ unsigned int *quanta, ++ struct netlink_ext_ack *extack) ++{ ++ const struct nlattr *attr; ++ int band = nstrict; ++ int rem; ++ int err; ++ ++ err = __nla_validate_nested(quanta_attr, TCA_ETS_MAX, ++ ets_quanta_policy, NL_VALIDATE_STRICT, ++ extack); ++ if (err < 0) ++ return err; ++ ++ nla_for_each_nested(attr, quanta_attr, rem) { ++ switch (nla_type(attr)) { ++ case TCA_ETS_QUANTA_BAND: ++ if (band >= nbands) { ++ NL_SET_ERR_MSG_MOD(extack, "ETS quanta has more values than bands"); ++ return -EINVAL; ++ } ++ err = ets_quantum_parse(sch, attr, &quanta[band++], ++ extack); ++ if (err) ++ return err; ++ break; ++ default: ++ WARN_ON_ONCE(1); /* Validate should have caught this. */ ++ return -EINVAL; ++ } ++ } ++ ++ return 0; ++} ++ ++static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, ++ struct netlink_ext_ack *extack) ++{ ++ unsigned int quanta[TCQ_ETS_MAX_BANDS] = {0}; ++ struct Qdisc *queues[TCQ_ETS_MAX_BANDS]; ++ struct ets_sched *q = qdisc_priv(sch); ++ struct nlattr *tb[TCA_ETS_MAX + 1]; ++ unsigned int oldbands = q->nbands; ++ u8 priomap[TC_PRIO_MAX + 1]; ++ unsigned int nstrict = 0; ++ unsigned int nbands; ++ unsigned int i; ++ int err; ++ ++ if (!opt) { ++ NL_SET_ERR_MSG(extack, "ETS options are required for this operation"); ++ return -EINVAL; ++ } ++ ++ err = nla_parse_nested(tb, TCA_ETS_MAX, opt, ets_policy, extack); ++ if (err < 0) ++ return err; ++ ++ if (!tb[TCA_ETS_NBANDS]) { ++ NL_SET_ERR_MSG_MOD(extack, "Number of bands is a required argument"); ++ return -EINVAL; ++ } ++ nbands = nla_get_u8(tb[TCA_ETS_NBANDS]); ++ if (nbands < 1 || nbands > TCQ_ETS_MAX_BANDS) { ++ NL_SET_ERR_MSG_MOD(extack, "Invalid number of bands"); ++ return -EINVAL; ++ } ++ /* Unless overridden, traffic goes to the last band. 
*/ ++ memset(priomap, nbands - 1, sizeof(priomap)); ++ ++ if (tb[TCA_ETS_NSTRICT]) { ++ nstrict = nla_get_u8(tb[TCA_ETS_NSTRICT]); ++ if (nstrict > nbands) { ++ NL_SET_ERR_MSG_MOD(extack, "Invalid number of strict bands"); ++ return -EINVAL; ++ } ++ } ++ ++ if (tb[TCA_ETS_PRIOMAP]) { ++ err = ets_qdisc_priomap_parse(tb[TCA_ETS_PRIOMAP], ++ nbands, priomap, extack); ++ if (err) ++ return err; ++ } ++ ++ if (tb[TCA_ETS_QUANTA]) { ++ err = ets_qdisc_quanta_parse(sch, tb[TCA_ETS_QUANTA], ++ nbands, nstrict, quanta, extack); ++ if (err) ++ return err; ++ } ++ /* If there are more bands than strict + quanta provided, the remaining ++ * ones are ETS with quantum of MTU. Initialize the missing values here. ++ */ ++ for (i = nstrict; i < nbands; i++) { ++ if (!quanta[i]) ++ quanta[i] = psched_mtu(qdisc_dev(sch)); ++ } ++ ++ /* Before commit, make sure we can allocate all new qdiscs */ ++ for (i = oldbands; i < nbands; i++) { ++ queues[i] = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, ++ ets_class_id(sch, &q->classes[i]), ++ extack); ++ if (!queues[i]) { ++ while (i > oldbands) ++ qdisc_put(queues[--i]); ++ return -ENOMEM; ++ } ++ } ++ ++ sch_tree_lock(sch); ++ ++ q->nbands = nbands; ++ q->nstrict = nstrict; ++ memcpy(q->prio2band, priomap, sizeof(priomap)); ++ ++ for (i = q->nbands; i < oldbands; i++) ++ qdisc_tree_flush_backlog(q->classes[i].qdisc); ++ ++ for (i = 0; i < q->nbands; i++) ++ q->classes[i].quantum = quanta[i]; ++ ++ for (i = oldbands; i < q->nbands; i++) { ++ q->classes[i].qdisc = queues[i]; ++ if (q->classes[i].qdisc != &noop_qdisc) ++ qdisc_hash_add(q->classes[i].qdisc, true); ++ } ++ ++ sch_tree_unlock(sch); ++ ++ for (i = q->nbands; i < oldbands; i++) { ++ qdisc_put(q->classes[i].qdisc); ++ memset(&q->classes[i], 0, sizeof(q->classes[i])); ++ } ++ return 0; ++} ++ ++static int ets_qdisc_init(struct Qdisc *sch, struct nlattr *opt, ++ struct netlink_ext_ack *extack) ++{ ++ struct ets_sched *q = qdisc_priv(sch); ++ int err; ++ ++ if (!opt) ++ return 
-EINVAL; ++ ++ err = tcf_block_get(&q->block, &q->filter_list, sch, extack); ++ if (err) ++ return err; ++ ++ INIT_LIST_HEAD(&q->active); ++ return ets_qdisc_change(sch, opt, extack); ++} ++ ++static void ets_qdisc_reset(struct Qdisc *sch) ++{ ++ struct ets_sched *q = qdisc_priv(sch); ++ int band; ++ ++ for (band = q->nstrict; band < q->nbands; band++) { ++ if (q->classes[band].qdisc->q.qlen) ++ list_del(&q->classes[band].alist); ++ } ++ for (band = 0; band < q->nbands; band++) ++ qdisc_reset(q->classes[band].qdisc); ++ sch->qstats.backlog = 0; ++ sch->q.qlen = 0; ++} ++ ++static void ets_qdisc_destroy(struct Qdisc *sch) ++{ ++ struct ets_sched *q = qdisc_priv(sch); ++ int band; ++ ++ tcf_block_put(q->block); ++ for (band = 0; band < q->nbands; band++) ++ qdisc_put(q->classes[band].qdisc); ++} ++ ++static int ets_qdisc_dump(struct Qdisc *sch, struct sk_buff *skb) ++{ ++ struct ets_sched *q = qdisc_priv(sch); ++ struct nlattr *opts; ++ struct nlattr *nest; ++ int band; ++ int prio; ++ ++ opts = nla_nest_start_noflag(skb, TCA_OPTIONS); ++ if (!opts) ++ goto nla_err; ++ ++ if (nla_put_u8(skb, TCA_ETS_NBANDS, q->nbands)) ++ goto nla_err; ++ ++ if (q->nstrict && ++ nla_put_u8(skb, TCA_ETS_NSTRICT, q->nstrict)) ++ goto nla_err; ++ ++ if (q->nbands > q->nstrict) { ++ nest = nla_nest_start(skb, TCA_ETS_QUANTA); ++ if (!nest) ++ goto nla_err; ++ ++ for (band = q->nstrict; band < q->nbands; band++) { ++ if (nla_put_u32(skb, TCA_ETS_QUANTA_BAND, ++ q->classes[band].quantum)) ++ goto nla_err; ++ } ++ ++ nla_nest_end(skb, nest); ++ } ++ ++ nest = nla_nest_start(skb, TCA_ETS_PRIOMAP); ++ if (!nest) ++ goto nla_err; ++ ++ for (prio = 0; prio <= TC_PRIO_MAX; prio++) { ++ if (nla_put_u8(skb, TCA_ETS_PRIOMAP_BAND, q->prio2band[prio])) ++ goto nla_err; ++ } ++ ++ nla_nest_end(skb, nest); ++ ++ return nla_nest_end(skb, opts); ++ ++nla_err: ++ nla_nest_cancel(skb, opts); ++ return -EMSGSIZE; ++} ++ ++static const struct Qdisc_class_ops ets_class_ops = { ++ .change = ets_class_change, 
++ .graft = ets_class_graft, ++ .leaf = ets_class_leaf, ++ .find = ets_class_find, ++ .qlen_notify = ets_class_qlen_notify, ++ .dump = ets_class_dump, ++ .dump_stats = ets_class_dump_stats, ++ .walk = ets_qdisc_walk, ++ .tcf_block = ets_qdisc_tcf_block, ++ .bind_tcf = ets_qdisc_bind_tcf, ++ .unbind_tcf = ets_qdisc_unbind_tcf, ++}; ++ ++static struct Qdisc_ops ets_qdisc_ops __read_mostly = { ++ .cl_ops = &ets_class_ops, ++ .id = "ets", ++ .priv_size = sizeof(struct ets_sched), ++ .enqueue = ets_qdisc_enqueue, ++ .dequeue = ets_qdisc_dequeue, ++ .peek = qdisc_peek_dequeued, ++ .change = ets_qdisc_change, ++ .init = ets_qdisc_init, ++ .reset = ets_qdisc_reset, ++ .destroy = ets_qdisc_destroy, ++ .dump = ets_qdisc_dump, ++ .owner = THIS_MODULE, ++}; ++ ++static int __init ets_init(void) ++{ ++ return register_qdisc(&ets_qdisc_ops); ++} ++ ++static void __exit ets_exit(void) ++{ ++ unregister_qdisc(&ets_qdisc_ops); ++} ++ ++module_init(ets_init); ++module_exit(ets_exit); ++MODULE_LICENSE("GPL"); +-- +2.39.5 + diff --git a/queue-5.4/net-sched-ets-fix-crash-when-flipping-from-strict-to.patch b/queue-5.4/net-sched-ets-fix-crash-when-flipping-from-strict-to.patch new file mode 100644 index 0000000000..9504f779ff --- /dev/null +++ b/queue-5.4/net-sched-ets-fix-crash-when-flipping-from-strict-to.patch @@ -0,0 +1,105 @@ +From bfdb7362c7047a3ca8a2fe830cf6cf8ae6630a3f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 25 Aug 2021 00:33:48 +0200 +Subject: net/sched: ets: fix crash when flipping from 'strict' to 'quantum' + +From: Davide Caratti + +[ Upstream commit cd9b50adc6bb9ad3f7d244590a389522215865c4 ] + +While running kselftests, Hangbin observed that sch_ets.sh often crashes, +and splats like the following one are seen in the output of 'dmesg': + + BUG: kernel NULL pointer dereference, address: 0000000000000000 + #PF: supervisor read access in kernel mode + #PF: error_code(0x0000) - not-present page + PGD 159f12067 P4D 159f12067 PUD 159f13067 PMD 0 + Oops: 0000 [#1] 
SMP NOPTI + CPU: 2 PID: 921 Comm: tc Not tainted 5.14.0-rc6+ #458 + Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014 + RIP: 0010:__list_del_entry_valid+0x2d/0x50 + Code: 48 8b 57 08 48 b9 00 01 00 00 00 00 ad de 48 39 c8 0f 84 ac 6e 5b 00 48 b9 22 01 00 00 00 00 ad de 48 39 ca 0f 84 cf 6e 5b 00 <48> 8b 32 48 39 fe 0f 85 af 6e 5b 00 48 8b 50 08 48 39 f2 0f 85 94 + RSP: 0018:ffffb2da005c3890 EFLAGS: 00010217 + RAX: 0000000000000000 RBX: ffff9073ba23f800 RCX: dead000000000122 + RDX: 0000000000000000 RSI: 0000000000000008 RDI: ffff9073ba23fbc8 + RBP: ffff9073ba23f890 R08: 0000000000000001 R09: 0000000000000001 + R10: 0000000000000001 R11: 0000000000000001 R12: dead000000000100 + R13: ffff9073ba23fb00 R14: 0000000000000002 R15: 0000000000000002 + FS: 00007f93e5564e40(0000) GS:ffff9073bba00000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 0000000000000000 CR3: 000000014ad34000 CR4: 0000000000350ee0 + Call Trace: + ets_qdisc_reset+0x6e/0x100 [sch_ets] + qdisc_reset+0x49/0x1d0 + tbf_reset+0x15/0x60 [sch_tbf] + qdisc_reset+0x49/0x1d0 + dev_reset_queue.constprop.42+0x2f/0x90 + dev_deactivate_many+0x1d3/0x3d0 + dev_deactivate+0x56/0x90 + qdisc_graft+0x47e/0x5a0 + tc_get_qdisc+0x1db/0x3e0 + rtnetlink_rcv_msg+0x164/0x4c0 + netlink_rcv_skb+0x50/0x100 + netlink_unicast+0x1a5/0x280 + netlink_sendmsg+0x242/0x480 + sock_sendmsg+0x5b/0x60 + ____sys_sendmsg+0x1f2/0x260 + ___sys_sendmsg+0x7c/0xc0 + __sys_sendmsg+0x57/0xa0 + do_syscall_64+0x3a/0x80 + entry_SYSCALL_64_after_hwframe+0x44/0xae + RIP: 0033:0x7f93e44b8338 + Code: 89 02 48 c7 c0 ff ff ff ff eb b5 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 25 43 2c 00 8b 00 85 c0 75 17 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 41 54 41 89 d4 55 + RSP: 002b:00007ffc0db737a8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e + RAX: ffffffffffffffda RBX: 0000000061255c06 RCX: 00007f93e44b8338 + RDX: 0000000000000000 RSI: 00007ffc0db73810 RDI: 
0000000000000003 + RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000000 + R10: 000000000000000b R11: 0000000000000246 R12: 0000000000000001 + R13: 0000000000687880 R14: 0000000000000000 R15: 0000000000000000 + Modules linked in: sch_ets sch_tbf dummy rfkill iTCO_wdt iTCO_vendor_support intel_rapl_msr intel_rapl_common joydev i2c_i801 pcspkr i2c_smbus lpc_ich virtio_balloon ip_tables xfs libcrc32c crct10dif_pclmul crc32_pclmul crc32c_intel ahci libahci ghash_clmulni_intel libata serio_raw virtio_blk virtio_console virtio_net net_failover failover sunrpc dm_mirror dm_region_hash dm_log dm_mod + CR2: 0000000000000000 + +When the change() function decreases the value of 'nstrict', we must take +into account that packets might be already enqueued on a class that flips +from 'strict' to 'quantum': otherwise that class will not be added to the +bandwidth-sharing list. Then, a call to ets_qdisc_reset() will attempt to +do list_del(&alist) with 'alist' filled with zero, hence the NULL pointer +dereference. +For classes flipping from 'strict' to 'quantum', initialize an empty list +and eventually add it to the bandwidth-sharing list, if there are packets +already enqueued. In this way, the kernel will: + a) prevent crashing as described above. + b) avoid retaining the backlog packets (for an arbitrarily long time) in + case no packet is enqueued after a change from 'strict' to 'quantum'. + +Reported-by: Hangbin Liu +Fixes: dcc68b4d8084 ("net: sch_ets: Add a new Qdisc") +Signed-off-by: Davide Caratti +Signed-off-by: David S. 
Miller +Stable-dep-of: d92adacdd8c2 ("net_sched: ets: fix a race in ets_qdisc_change()") +Signed-off-by: Sasha Levin +--- + net/sched/sch_ets.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c +index e6194b23e9b0d..367efeed84e4f 100644 +--- a/net/sched/sch_ets.c ++++ b/net/sched/sch_ets.c +@@ -572,6 +572,13 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, + sch_tree_lock(sch); + + q->nbands = nbands; ++ for (i = nstrict; i < q->nstrict; i++) { ++ INIT_LIST_HEAD(&q->classes[i].alist); ++ if (q->classes[i].qdisc->q.qlen) { ++ list_add_tail(&q->classes[i].alist, &q->active); ++ q->classes[i].deficit = quanta[i]; ++ } ++ } + q->nstrict = nstrict; + memcpy(q->prio2band, priomap, sizeof(priomap)); + +-- +2.39.5 + diff --git a/queue-5.4/net-sched-sch_ets-don-t-peek-at-classes-beyond-nband.patch b/queue-5.4/net-sched-sch_ets-don-t-peek-at-classes-beyond-nband.patch new file mode 100644 index 0000000000..cd6395e2df --- /dev/null +++ b/queue-5.4/net-sched-sch_ets-don-t-peek-at-classes-beyond-nband.patch @@ -0,0 +1,102 @@ +From de6a6ce880cea7e589293cbbfb63511ae10fe3a6 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 24 Nov 2021 17:14:40 +0100 +Subject: net/sched: sch_ets: don't peek at classes beyond 'nbands' + +From: Davide Caratti + +[ Upstream commit de6d25924c2a8c2988c6a385990cafbe742061bf ] + +when the number of DRR classes decreases, the round-robin active list can +contain elements that have already been freed in ets_qdisc_change(). 
As a +consequence, it's possible to see a NULL dereference crash, caused by the +attempt to call cl->qdisc->ops->peek(cl->qdisc) when cl->qdisc is NULL: + + BUG: kernel NULL pointer dereference, address: 0000000000000018 + #PF: supervisor read access in kernel mode + #PF: error_code(0x0000) - not-present page + PGD 0 P4D 0 + Oops: 0000 [#1] PREEMPT SMP NOPTI + CPU: 1 PID: 910 Comm: mausezahn Not tainted 5.16.0-rc1+ #475 + Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014 + RIP: 0010:ets_qdisc_dequeue+0x129/0x2c0 [sch_ets] + Code: c5 01 41 39 ad e4 02 00 00 0f 87 18 ff ff ff 49 8b 85 c0 02 00 00 49 39 c4 0f 84 ba 00 00 00 49 8b ad c0 02 00 00 48 8b 7d 10 <48> 8b 47 18 48 8b 40 38 0f ae e8 ff d0 48 89 c3 48 85 c0 0f 84 9d + RSP: 0000:ffffbb36c0b5fdd8 EFLAGS: 00010287 + RAX: ffff956678efed30 RBX: 0000000000000000 RCX: 0000000000000000 + RDX: 0000000000000002 RSI: ffffffff9b938dc9 RDI: 0000000000000000 + RBP: ffff956678efed30 R08: e2f3207fe360129c R09: 0000000000000000 + R10: 0000000000000001 R11: 0000000000000001 R12: ffff956678efeac0 + R13: ffff956678efe800 R14: ffff956611545000 R15: ffff95667ac8f100 + FS: 00007f2aa9120740(0000) GS:ffff95667b800000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 0000000000000018 CR3: 000000011070c000 CR4: 0000000000350ee0 + Call Trace: + + qdisc_peek_dequeued+0x29/0x70 [sch_ets] + tbf_dequeue+0x22/0x260 [sch_tbf] + __qdisc_run+0x7f/0x630 + net_tx_action+0x290/0x4c0 + __do_softirq+0xee/0x4f8 + irq_exit_rcu+0xf4/0x130 + sysvec_apic_timer_interrupt+0x52/0xc0 + asm_sysvec_apic_timer_interrupt+0x12/0x20 + RIP: 0033:0x7f2aa7fc9ad4 + Code: b9 ff ff 48 8b 54 24 18 48 83 c4 08 48 89 ee 48 89 df 5b 5d e9 ed fc ff ff 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 f3 0f 1e fa <53> 48 83 ec 10 48 8b 05 10 64 33 00 48 8b 00 48 85 c0 0f 85 84 00 + RSP: 002b:00007ffe5d33fab8 EFLAGS: 00000202 + RAX: 0000000000000002 RBX: 0000561f72c31460 RCX: 0000561f72c31720 + RDX: 0000000000000002 RSI: 
0000561f72c31722 RDI: 0000561f72c31720 + RBP: 000000000000002a R08: 00007ffe5d33fa40 R09: 0000000000000014 + R10: 0000000000000000 R11: 0000000000000246 R12: 0000561f7187e380 + R13: 0000000000000000 R14: 0000000000000000 R15: 0000561f72c31460 + + Modules linked in: sch_ets sch_tbf dummy rfkill iTCO_wdt intel_rapl_msr iTCO_vendor_support intel_rapl_common joydev virtio_balloon lpc_ich i2c_i801 i2c_smbus pcspkr ip_tables xfs libcrc32c crct10dif_pclmul crc32_pclmul crc32c_intel ahci libahci ghash_clmulni_intel serio_raw libata virtio_blk virtio_console virtio_net net_failover failover sunrpc dm_mirror dm_region_hash dm_log dm_mod + CR2: 0000000000000018 + +Ensuring that 'alist' was never zeroed [1] was not sufficient, we need to +remove from the active list those elements that are no more SP nor DRR. + +[1] https://lore.kernel.org/netdev/60d274838bf09777f0371253416e8af71360bc08.1633609148.git.dcaratti@redhat.com/ + +v3: fix race between ets_qdisc_change() and ets_qdisc_dequeue() delisting + DRR classes beyond 'nbands' in ets_qdisc_change() with the qdisc lock + acquired, thanks to Cong Wang. + +v2: when a NULL qdisc is found in the DRR active list, try to dequeue skb + from the next list item. 
+ +Reported-by: Hangbin Liu +Fixes: dcc68b4d8084 ("net: sch_ets: Add a new Qdisc") +Signed-off-by: Davide Caratti +Link: https://lore.kernel.org/r/7a5c496eed2d62241620bdbb83eb03fb9d571c99.1637762721.git.dcaratti@redhat.com +Signed-off-by: Jakub Kicinski +Stable-dep-of: d92adacdd8c2 ("net_sched: ets: fix a race in ets_qdisc_change()") +Signed-off-by: Sasha Levin +--- + net/sched/sch_ets.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c +index 367efeed84e4f..e2c6f87d7ca99 100644 +--- a/net/sched/sch_ets.c ++++ b/net/sched/sch_ets.c +@@ -579,12 +579,14 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, + q->classes[i].deficit = quanta[i]; + } + } ++ for (i = q->nbands; i < oldbands; i++) { ++ qdisc_tree_flush_backlog(q->classes[i].qdisc); ++ if (i >= q->nstrict) ++ list_del(&q->classes[i].alist); ++ } + q->nstrict = nstrict; + memcpy(q->prio2band, priomap, sizeof(priomap)); + +- for (i = q->nbands; i < oldbands; i++) +- qdisc_tree_flush_backlog(q->classes[i].qdisc); +- + for (i = 0; i < q->nbands; i++) + q->classes[i].quantum = quanta[i]; + +-- +2.39.5 + diff --git a/queue-5.4/net-sched-sch_ets-don-t-remove-idle-classes-from-the.patch b/queue-5.4/net-sched-sch_ets-don-t-remove-idle-classes-from-the.patch new file mode 100644 index 0000000000..ff6b009281 --- /dev/null +++ b/queue-5.4/net-sched-sch_ets-don-t-remove-idle-classes-from-the.patch @@ -0,0 +1,110 @@ +From d7fa2ee587e3488c5347dea2fbff0044211dc683 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 10 Dec 2021 17:42:47 +0100 +Subject: net/sched: sch_ets: don't remove idle classes from the round-robin + list + +From: Davide Caratti + +[ Upstream commit c062f2a0b04d86c5b8c9d973bea43493eaca3d32 ] + +Shuang reported that the following script: + + 1) tc qdisc add dev ddd0 handle 10: parent 1: ets bands 8 strict 4 priomap 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 + 2) mausezahn ddd0 -A 10.10.10.1 -B 10.10.10.2 -c 0 -a own -b 
00:c1:a0:c1:a0:00 -t udp & + 3) tc qdisc change dev ddd0 handle 10: ets bands 4 strict 2 quanta 2500 2500 priomap 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 + +crashes systematically when line 2) is commented: + + list_del corruption, ffff8e028404bd30->next is LIST_POISON1 (dead000000000100) + ------------[ cut here ]------------ + kernel BUG at lib/list_debug.c:47! + invalid opcode: 0000 [#1] PREEMPT SMP NOPTI + CPU: 0 PID: 954 Comm: tc Not tainted 5.16.0-rc4+ #478 + Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014 + RIP: 0010:__list_del_entry_valid.cold.1+0x12/0x47 + Code: fe ff 0f 0b 48 89 c1 4c 89 c6 48 c7 c7 08 42 1b 87 e8 1d c5 fe ff 0f 0b 48 89 fe 48 89 c2 48 c7 c7 98 42 1b 87 e8 09 c5 fe ff <0f> 0b 48 c7 c7 48 43 1b 87 e8 fb c4 fe ff 0f 0b 48 89 f2 48 89 fe + RSP: 0018:ffffae46807a3888 EFLAGS: 00010246 + RAX: 000000000000004e RBX: 0000000000000007 RCX: 0000000000000202 + RDX: 0000000000000000 RSI: ffffffff871ac536 RDI: 00000000ffffffff + RBP: ffffae46807a3a10 R08: 0000000000000000 R09: c0000000ffff7fff + R10: 0000000000000001 R11: ffffae46807a36a8 R12: ffff8e028404b800 + R13: ffff8e028404bd30 R14: dead000000000100 R15: ffff8e02fafa2400 + FS: 00007efdc92e4480(0000) GS:ffff8e02fb600000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 0000000000682f48 CR3: 00000001058be000 CR4: 0000000000350ef0 + Call Trace: + + ets_qdisc_change+0x58b/0xa70 [sch_ets] + tc_modify_qdisc+0x323/0x880 + rtnetlink_rcv_msg+0x169/0x4a0 + netlink_rcv_skb+0x50/0x100 + netlink_unicast+0x1a5/0x280 + netlink_sendmsg+0x257/0x4d0 + sock_sendmsg+0x5b/0x60 + ____sys_sendmsg+0x1f2/0x260 + ___sys_sendmsg+0x7c/0xc0 + __sys_sendmsg+0x57/0xa0 + do_syscall_64+0x3a/0x80 + entry_SYSCALL_64_after_hwframe+0x44/0xae + RIP: 0033:0x7efdc8031338 + Code: 89 02 48 c7 c0 ff ff ff ff eb b5 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 25 43 2c 00 8b 00 85 c0 75 17 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 41 54 41 89 d4 55 + 
RSP: 002b:00007ffdf1ce9828 EFLAGS: 00000246 ORIG_RAX: 000000000000002e + RAX: ffffffffffffffda RBX: 0000000061b37a97 RCX: 00007efdc8031338 + RDX: 0000000000000000 RSI: 00007ffdf1ce9890 RDI: 0000000000000003 + RBP: 0000000000000000 R08: 0000000000000001 R09: 000000000078a940 + R10: 000000000000000c R11: 0000000000000246 R12: 0000000000000001 + R13: 0000000000688880 R14: 0000000000000000 R15: 0000000000000000 + + Modules linked in: sch_ets sch_tbf dummy rfkill iTCO_wdt iTCO_vendor_support intel_rapl_msr intel_rapl_common joydev pcspkr i2c_i801 virtio_balloon i2c_smbus lpc_ich ip_tables xfs libcrc32c crct10dif_pclmul crc32_pclmul crc32c_intel serio_raw ghash_clmulni_intel ahci libahci libata virtio_blk virtio_console virtio_net net_failover failover sunrpc dm_mirror dm_region_hash dm_log dm_mod [last unloaded: sch_ets] + ---[ end trace f35878d1912655c2 ]--- + RIP: 0010:__list_del_entry_valid.cold.1+0x12/0x47 + Code: fe ff 0f 0b 48 89 c1 4c 89 c6 48 c7 c7 08 42 1b 87 e8 1d c5 fe ff 0f 0b 48 89 fe 48 89 c2 48 c7 c7 98 42 1b 87 e8 09 c5 fe ff <0f> 0b 48 c7 c7 48 43 1b 87 e8 fb c4 fe ff 0f 0b 48 89 f2 48 89 fe + RSP: 0018:ffffae46807a3888 EFLAGS: 00010246 + RAX: 000000000000004e RBX: 0000000000000007 RCX: 0000000000000202 + RDX: 0000000000000000 RSI: ffffffff871ac536 RDI: 00000000ffffffff + RBP: ffffae46807a3a10 R08: 0000000000000000 R09: c0000000ffff7fff + R10: 0000000000000001 R11: ffffae46807a36a8 R12: ffff8e028404b800 + R13: ffff8e028404bd30 R14: dead000000000100 R15: ffff8e02fafa2400 + FS: 00007efdc92e4480(0000) GS:ffff8e02fb600000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 0000000000682f48 CR3: 00000001058be000 CR4: 0000000000350ef0 + Kernel panic - not syncing: Fatal exception in interrupt + Kernel Offset: 0x4e00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) + ---[ end Kernel panic - not syncing: Fatal exception in interrupt ]--- + +we can remove 'q->classes[i].alist' only if DRR 
class 'i' was part of the +active list. In the ETS scheduler DRR classes belong to that list only if +the queue length is greater than zero: we need to test for non-zero value +of 'q->classes[i].qdisc->q.qlen' before removing from the list, similarly +to what has been done elsewhere in the ETS code. + +Fixes: de6d25924c2a ("net/sched: sch_ets: don't peek at classes beyond 'nbands'") +Reported-by: Shuang Li +Signed-off-by: Davide Caratti +Signed-off-by: David S. Miller +Stable-dep-of: d92adacdd8c2 ("net_sched: ets: fix a race in ets_qdisc_change()") +Signed-off-by: Sasha Levin +--- + net/sched/sch_ets.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c +index e2c6f87d7ca99..d4a47bb709c5f 100644 +--- a/net/sched/sch_ets.c ++++ b/net/sched/sch_ets.c +@@ -580,9 +580,9 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, + } + } + for (i = q->nbands; i < oldbands; i++) { +- qdisc_tree_flush_backlog(q->classes[i].qdisc); +- if (i >= q->nstrict) ++ if (i >= q->nstrict && q->classes[i].qdisc->q.qlen) + list_del(&q->classes[i].alist); ++ qdisc_tree_flush_backlog(q->classes[i].qdisc); + } + q->nstrict = nstrict; + memcpy(q->prio2band, priomap, sizeof(priomap)); +-- +2.39.5 + diff --git a/queue-5.4/net_sched-ets-fix-a-race-in-ets_qdisc_change.patch b/queue-5.4/net_sched-ets-fix-a-race-in-ets_qdisc_change.patch new file mode 100644 index 0000000000..33555ba457 --- /dev/null +++ b/queue-5.4/net_sched-ets-fix-a-race-in-ets_qdisc_change.patch @@ -0,0 +1,58 @@ +From 31b31faea6a9353df07e031aa44ae5e59b40e7f4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 11 Jun 2025 11:15:14 +0000 +Subject: net_sched: ets: fix a race in ets_qdisc_change() + +From: Eric Dumazet + +[ Upstream commit d92adacdd8c2960be856e0b82acc5b7c5395fddb ] + +Gerrard Tai reported a race condition in ETS, whenever SFQ perturb timer +fires at the wrong time. 
+ +The race is as follows: + +CPU 0 CPU 1 +[1]: lock root +[2]: qdisc_tree_flush_backlog() +[3]: unlock root + | + | [5]: lock root + | [6]: rehash + | [7]: qdisc_tree_reduce_backlog() + | +[4]: qdisc_put() + +This can be abused to underflow a parent's qlen. + +Calling qdisc_purge_queue() instead of qdisc_tree_flush_backlog() +should fix the race, because all packets will be purged from the qdisc +before releasing the lock. + +Fixes: b05972f01e7d ("net: sched: tbf: don't call qdisc_put() while holding tree lock") +Reported-by: Gerrard Tai +Suggested-by: Gerrard Tai +Signed-off-by: Eric Dumazet +Link: https://patch.msgid.link/20250611111515.1983366-5-edumazet@google.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/sched/sch_ets.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c +index fa4eeff4f2fe8..63185983a4206 100644 +--- a/net/sched/sch_ets.c ++++ b/net/sched/sch_ets.c +@@ -582,7 +582,7 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, + for (i = q->nbands; i < oldbands; i++) { + if (i >= q->nstrict && q->classes[i].qdisc->q.qlen) + list_del_init(&q->classes[i].alist); +- qdisc_tree_flush_backlog(q->classes[i].qdisc); ++ qdisc_purge_queue(q->classes[i].qdisc); + } + q->nstrict = nstrict; + memcpy(q->prio2band, priomap, sizeof(priomap)); +-- +2.39.5 + diff --git a/queue-5.4/net_sched-prio-fix-a-race-in-prio_tune.patch b/queue-5.4/net_sched-prio-fix-a-race-in-prio_tune.patch new file mode 100644 index 0000000000..0f1b8838e8 --- /dev/null +++ b/queue-5.4/net_sched-prio-fix-a-race-in-prio_tune.patch @@ -0,0 +1,58 @@ +From 33a9ca8a022090fd38f791fc44e40fd97c2d61dd Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 11 Jun 2025 11:15:11 +0000 +Subject: net_sched: prio: fix a race in prio_tune() + +From: Eric Dumazet + +[ Upstream commit d35acc1be3480505b5931f17e4ea9b7617fea4d3 ] + +Gerrard Tai reported a race condition in PRIO, whenever SFQ perturb timer 
+fires at the wrong time. + +The race is as follows: + +CPU 0 CPU 1 +[1]: lock root +[2]: qdisc_tree_flush_backlog() +[3]: unlock root + | + | [5]: lock root + | [6]: rehash + | [7]: qdisc_tree_reduce_backlog() + | +[4]: qdisc_put() + +This can be abused to underflow a parent's qlen. + +Calling qdisc_purge_queue() instead of qdisc_tree_flush_backlog() +should fix the race, because all packets will be purged from the qdisc +before releasing the lock. + +Fixes: 7b8e0b6e6599 ("net: sched: prio: delay destroying child qdiscs on change") +Reported-by: Gerrard Tai +Suggested-by: Gerrard Tai +Signed-off-by: Eric Dumazet +Link: https://patch.msgid.link/20250611111515.1983366-2-edumazet@google.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/sched/sch_prio.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c +index 647941702f9fc..62c1b1f352b26 100644 +--- a/net/sched/sch_prio.c ++++ b/net/sched/sch_prio.c +@@ -213,7 +213,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt, + memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1); + + for (i = q->bands; i < oldbands; i++) +- qdisc_tree_flush_backlog(q->queues[i]); ++ qdisc_purge_queue(q->queues[i]); + + for (i = oldbands; i < q->bands; i++) { + q->queues[i] = queues[i]; +-- +2.39.5 + diff --git a/queue-5.4/net_sched-red-fix-a-race-in-__red_change.patch b/queue-5.4/net_sched-red-fix-a-race-in-__red_change.patch new file mode 100644 index 0000000000..88b26646df --- /dev/null +++ b/queue-5.4/net_sched-red-fix-a-race-in-__red_change.patch @@ -0,0 +1,58 @@ +From 504bea9f538661d15767688cb01cab5e452bd840 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 11 Jun 2025 11:15:12 +0000 +Subject: net_sched: red: fix a race in __red_change() + +From: Eric Dumazet + +[ Upstream commit 85a3e0ede38450ea3053b8c45d28cf55208409b8 ] + +Gerrard Tai reported a race condition in RED, whenever SFQ perturb timer +fires at the wrong time. 
+ +The race is as follows: + +CPU 0 CPU 1 +[1]: lock root +[2]: qdisc_tree_flush_backlog() +[3]: unlock root + | + | [5]: lock root + | [6]: rehash + | [7]: qdisc_tree_reduce_backlog() + | +[4]: qdisc_put() + +This can be abused to underflow a parent's qlen. + +Calling qdisc_purge_queue() instead of qdisc_tree_flush_backlog() +should fix the race, because all packets will be purged from the qdisc +before releasing the lock. + +Fixes: 0c8d13ac9607 ("net: sched: red: delay destroying child qdisc on replace") +Reported-by: Gerrard Tai +Suggested-by: Gerrard Tai +Signed-off-by: Eric Dumazet +Link: https://patch.msgid.link/20250611111515.1983366-3-edumazet@google.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/sched/sch_red.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c +index 476853ff69894..64532ee591a96 100644 +--- a/net/sched/sch_red.c ++++ b/net/sched/sch_red.c +@@ -235,7 +235,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt, + q->flags = ctl->flags; + q->limit = ctl->limit; + if (child) { +- qdisc_tree_flush_backlog(q->qdisc); ++ qdisc_purge_queue(q->qdisc); + old_child = q->qdisc; + q->qdisc = child; + } +-- +2.39.5 + diff --git a/queue-5.4/net_sched-sch_sfq-fix-a-potential-crash-on-gso_skb-h.patch b/queue-5.4/net_sched-sch_sfq-fix-a-potential-crash-on-gso_skb-h.patch new file mode 100644 index 0000000000..7b67ca8439 --- /dev/null +++ b/queue-5.4/net_sched-sch_sfq-fix-a-potential-crash-on-gso_skb-h.patch @@ -0,0 +1,70 @@ +From f0150f0c20f017f4ac5a7889f47fdc59711ed0bc Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 6 Jun 2025 16:51:27 +0000 +Subject: net_sched: sch_sfq: fix a potential crash on gso_skb handling +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Eric Dumazet + +[ Upstream commit 82ffbe7776d0ac084031f114167712269bf3d832 ] + +SFQ has an assumption of always being able to queue at least 
one packet. + +However, after the blamed commit, sch->q.len can be inflated by packets +in sch->gso_skb, and an enqueue() on an empty SFQ qdisc can be followed +by an immediate drop. + +Fix sfq_drop() to properly clear q->tail in this situation. + +Tested: + +ip netns add lb +ip link add dev to-lb type veth peer name in-lb netns lb +ethtool -K to-lb tso off # force qdisc to requeue gso_skb +ip netns exec lb ethtool -K in-lb gro on # enable NAPI +ip link set dev to-lb up +ip -netns lb link set dev in-lb up +ip addr add dev to-lb 192.168.20.1/24 +ip -netns lb addr add dev in-lb 192.168.20.2/24 +tc qdisc replace dev to-lb root sfq limit 100 + +ip netns exec lb netserver + +netperf -H 192.168.20.2 -l 100 & +netperf -H 192.168.20.2 -l 100 & +netperf -H 192.168.20.2 -l 100 & +netperf -H 192.168.20.2 -l 100 & + +Fixes: a53851e2c321 ("net: sched: explicit locking in gso_cpu fallback") +Reported-by: Marcus Wichelmann +Closes: https://lore.kernel.org/netdev/9da42688-bfaa-4364-8797-e9271f3bdaef@hetzner-cloud.de/ +Signed-off-by: Eric Dumazet +Reviewed-by: Toke Høiland-Jørgensen +Link: https://patch.msgid.link/20250606165127.3629486-1-edumazet@google.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/sched/sch_sfq.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c +index d7f910610de97..acda653710288 100644 +--- a/net/sched/sch_sfq.c ++++ b/net/sched/sch_sfq.c +@@ -317,7 +317,10 @@ static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free) + /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. 
*/ + x = q->tail->next; + slot = &q->slots[x]; +- q->tail->next = slot->next; ++ if (slot->next == x) ++ q->tail = NULL; /* no more active slots */ ++ else ++ q->tail->next = slot->next; + q->ht[slot->hash] = SFQ_EMPTY_SLOT; + goto drop; + } +-- +2.39.5 + diff --git a/queue-5.4/net_sched-tbf-fix-a-race-in-tbf_change.patch b/queue-5.4/net_sched-tbf-fix-a-race-in-tbf_change.patch new file mode 100644 index 0000000000..a0097b049c --- /dev/null +++ b/queue-5.4/net_sched-tbf-fix-a-race-in-tbf_change.patch @@ -0,0 +1,59 @@ +From f1993a8f274f80b38c751bea3a415629b7654a9f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 11 Jun 2025 11:15:13 +0000 +Subject: net_sched: tbf: fix a race in tbf_change() + +From: Eric Dumazet + +[ Upstream commit 43eb466041216d25dedaef1c383ad7bd89929cbc ] + +Gerrard Tai reported a race condition in TBF, whenever SFQ perturb timer +fires at the wrong time. + +The race is as follows: + +CPU 0 CPU 1 +[1]: lock root +[2]: qdisc_tree_flush_backlog() +[3]: unlock root + | + | [5]: lock root + | [6]: rehash + | [7]: qdisc_tree_reduce_backlog() + | +[4]: qdisc_put() + +This can be abused to underflow a parent's qlen. + +Calling qdisc_purge_queue() instead of qdisc_tree_flush_backlog() +should fix the race, because all packets will be purged from the qdisc +before releasing the lock. 
+ +Fixes: b05972f01e7d ("net: sched: tbf: don't call qdisc_put() while holding tree lock") +Reported-by: Gerrard Tai +Suggested-by: Gerrard Tai +Signed-off-by: Eric Dumazet +Cc: Zhengchao Shao +Link: https://patch.msgid.link/20250611111515.1983366-4-edumazet@google.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/sched/sch_tbf.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c +index 259a39ca99bfb..9b11e9256336c 100644 +--- a/net/sched/sch_tbf.c ++++ b/net/sched/sch_tbf.c +@@ -394,7 +394,7 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt, + + sch_tree_lock(sch); + if (child) { +- qdisc_tree_flush_backlog(q->qdisc); ++ qdisc_purge_queue(q->qdisc); + old = q->qdisc; + q->qdisc = child; + } +-- +2.39.5 + diff --git a/queue-5.4/sch_ets-make-est_qlen_notify-idempotent.patch b/queue-5.4/sch_ets-make-est_qlen_notify-idempotent.patch new file mode 100644 index 0000000000..fc701cdc2e --- /dev/null +++ b/queue-5.4/sch_ets-make-est_qlen_notify-idempotent.patch @@ -0,0 +1,71 @@ +From bb8903bffb7764dd4b21a5c0f111197acf45a827 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 3 Apr 2025 14:10:27 -0700 +Subject: sch_ets: make est_qlen_notify() idempotent + +From: Cong Wang + +[ Upstream commit a7a15f39c682ac4268624da2abdb9114bdde96d5 ] + +est_qlen_notify() deletes its class from its active list with +list_del() when qlen is 0, therefore, it is not idempotent and +not friendly to its callers, like fq_codel_dequeue(). + +Let's make it idempotent to ease qdisc_tree_reduce_backlog() callers' +life. Also change other list_del()'s to list_del_init() just to be +extra safe. 
+ +Reported-by: Gerrard Tai +Signed-off-by: Cong Wang +Link: https://patch.msgid.link/20250403211033.166059-6-xiyou.wangcong@gmail.com +Acked-by: Jamal Hadi Salim +Signed-off-by: Paolo Abeni +Stable-dep-of: d92adacdd8c2 ("net_sched: ets: fix a race in ets_qdisc_change()") +Signed-off-by: Sasha Levin +--- + net/sched/sch_ets.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c +index d4a47bb709c5f..fa4eeff4f2fe8 100644 +--- a/net/sched/sch_ets.c ++++ b/net/sched/sch_ets.c +@@ -203,7 +203,7 @@ static void ets_class_qlen_notify(struct Qdisc *sch, unsigned long arg) + * to remove them. + */ + if (!ets_class_is_strict(q, cl) && sch->q.qlen) +- list_del(&cl->alist); ++ list_del_init(&cl->alist); + } + + static int ets_class_dump(struct Qdisc *sch, unsigned long arg, +@@ -406,7 +406,7 @@ static struct sk_buff *ets_qdisc_dequeue(struct Qdisc *sch) + if (unlikely(!skb)) + goto out; + if (cl->qdisc->q.qlen == 0) +- list_del(&cl->alist); ++ list_del_init(&cl->alist); + return ets_qdisc_dequeue_skb(sch, skb); + } + +@@ -581,7 +581,7 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, + } + for (i = q->nbands; i < oldbands; i++) { + if (i >= q->nstrict && q->classes[i].qdisc->q.qlen) +- list_del(&q->classes[i].alist); ++ list_del_init(&q->classes[i].alist); + qdisc_tree_flush_backlog(q->classes[i].qdisc); + } + q->nstrict = nstrict; +@@ -629,7 +629,7 @@ static void ets_qdisc_reset(struct Qdisc *sch) + + for (band = q->nstrict; band < q->nbands; band++) { + if (q->classes[band].qdisc->q.qlen) +- list_del(&q->classes[band].alist); ++ list_del_init(&q->classes[band].alist); + } + for (band = 0; band < q->nbands; band++) + qdisc_reset(q->classes[band].qdisc); +-- +2.39.5 + diff --git a/queue-5.4/scsi-iscsi-fix-incorrect-error-path-labels-for-flash.patch b/queue-5.4/scsi-iscsi-fix-incorrect-error-path-labels-for-flash.patch new file mode 100644 index 0000000000..28506d099f --- /dev/null +++ 
b/queue-5.4/scsi-iscsi-fix-incorrect-error-path-labels-for-flash.patch @@ -0,0 +1,99 @@ +From 708ce798450a747fc19cd6d01889e0a64bd9df34 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 30 May 2025 12:29:35 -0700 +Subject: scsi: iscsi: Fix incorrect error path labels for flashnode operations + +From: Alok Tiwari + +[ Upstream commit 9b17621366d210ffee83262a8754086ebbde5e55 ] + +Correct the error handling goto labels used when host lookup fails in +various flashnode-related event handlers: + + - iscsi_new_flashnode() + - iscsi_del_flashnode() + - iscsi_login_flashnode() + - iscsi_logout_flashnode() + - iscsi_logout_flashnode_sid() + +scsi_host_put() is not required when shost is NULL, so jumping to the +correct label avoids unnecessary operations. These functions previously +jumped to the wrong goto label (put_host), which did not match the +intended cleanup logic. + +Use the correct exit labels (exit_new_fnode, exit_del_fnode, etc.) to +ensure proper error handling. Also remove the unused put_host label +under iscsi_new_flashnode() as it is no longer needed. + +No functional changes beyond accurate error path correction. + +Fixes: c6a4bb2ef596 ("[SCSI] scsi_transport_iscsi: Add flash node mgmt support") +Signed-off-by: Alok Tiwari +Link: https://lore.kernel.org/r/20250530193012.3312911-1-alok.a.tiwari@oracle.com +Reviewed-by: Mike Christie +Signed-off-by: Martin K. 
Petersen +Signed-off-by: Sasha Levin +--- + drivers/scsi/scsi_transport_iscsi.c | 11 +++++------ + 1 file changed, 5 insertions(+), 6 deletions(-) + +diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c +index d75097f13efcc..0977e4a09db03 100644 +--- a/drivers/scsi/scsi_transport_iscsi.c ++++ b/drivers/scsi/scsi_transport_iscsi.c +@@ -3235,7 +3235,7 @@ static int iscsi_new_flashnode(struct iscsi_transport *transport, + pr_err("%s could not find host no %u\n", + __func__, ev->u.new_flashnode.host_no); + err = -ENODEV; +- goto put_host; ++ goto exit_new_fnode; + } + + index = transport->new_flashnode(shost, data, len); +@@ -3245,7 +3245,6 @@ static int iscsi_new_flashnode(struct iscsi_transport *transport, + else + err = -EIO; + +-put_host: + scsi_host_put(shost); + + exit_new_fnode: +@@ -3270,7 +3269,7 @@ static int iscsi_del_flashnode(struct iscsi_transport *transport, + pr_err("%s could not find host no %u\n", + __func__, ev->u.del_flashnode.host_no); + err = -ENODEV; +- goto put_host; ++ goto exit_del_fnode; + } + + idx = ev->u.del_flashnode.flashnode_idx; +@@ -3312,7 +3311,7 @@ static int iscsi_login_flashnode(struct iscsi_transport *transport, + pr_err("%s could not find host no %u\n", + __func__, ev->u.login_flashnode.host_no); + err = -ENODEV; +- goto put_host; ++ goto exit_login_fnode; + } + + idx = ev->u.login_flashnode.flashnode_idx; +@@ -3364,7 +3363,7 @@ static int iscsi_logout_flashnode(struct iscsi_transport *transport, + pr_err("%s could not find host no %u\n", + __func__, ev->u.logout_flashnode.host_no); + err = -ENODEV; +- goto put_host; ++ goto exit_logout_fnode; + } + + idx = ev->u.logout_flashnode.flashnode_idx; +@@ -3414,7 +3413,7 @@ static int iscsi_logout_flashnode_sid(struct iscsi_transport *transport, + pr_err("%s could not find host no %u\n", + __func__, ev->u.logout_flashnode.host_no); + err = -ENODEV; +- goto put_host; ++ goto exit_logout_sid; + } + + session = 
iscsi_session_lookup(ev->u.logout_flashnode_sid.sid); +-- +2.39.5 + diff --git a/queue-5.4/series b/queue-5.4/series index c01e5cc285..b7ff9d3086 100644 --- a/queue-5.4/series +++ b/queue-5.4/series @@ -74,3 +74,20 @@ input-synaptics-rmi4-convert-to-use-sysfs_emit-apis.patch input-synaptics-rmi-fix-crash-with-unsupported-versi.patch nfsd-fix-ia_size-underflow.patch nfsd-fix-nfsv3-setattr-create-s-handling-of-large-fi.patch +scsi-iscsi-fix-incorrect-error-path-labels-for-flash.patch +net_sched-sch_sfq-fix-a-potential-crash-on-gso_skb-h.patch +i40e-return-false-from-i40e_reset_vf-if-reset-is-in-.patch +i40e-retry-vflr-handling-if-there-is-ongoing-vf-rese.patch +net-mlx5-wait-for-inactive-autogroups.patch +net-mlx5-fix-return-value-when-searching-for-existin.patch +net_sched-prio-fix-a-race-in-prio_tune.patch +net_sched-red-fix-a-race-in-__red_change.patch +net_sched-tbf-fix-a-race-in-tbf_change.patch +net-sch_ets-add-a-new-qdisc.patch +net-sched-ets-fix-crash-when-flipping-from-strict-to.patch +net-sched-sch_ets-don-t-peek-at-classes-beyond-nband.patch +net-sched-sch_ets-don-t-remove-idle-classes-from-the.patch +sch_ets-make-est_qlen_notify-idempotent.patch +net_sched-ets-fix-a-race-in-ets_qdisc_change.patch +net-mdio-c22-is-now-optional-eopnotsupp-if-not-provi.patch +net-mdiobus-fix-potential-out-of-bounds-read-write-a.patch