From e5d3713dc6f3edc3b5a809650b5437c6bd6a3d04 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 5 Apr 2024 13:09:43 +0200 Subject: [PATCH] 6.8-stable patches added patches: ax25-fix-use-after-free-bugs-caused-by-ax25_ds_del_timer.patch e1000e-workaround-for-sporadic-mdi-error-on-meteor-lake-systems.patch erspan-make-sure-erspan_base_hdr-is-present-in-skb-head.patch i40e-enforce-software-interrupt-during-busy-poll-exit.patch i40e-fix-i40e_count_filters-to-count-only-active-new-filters.patch i40e-fix-vf-mac-filter-removal.patch i40e-fix-vf-may-be-used-uninitialized-in-this-function-warning.patch ice-fix-enabling-rx-vlan-filtering.patch idpf-fix-kernel-panic-on-unknown-packet-types.patch ipv6-fix-infinite-recursion-in-fib6_dump_done.patch mlxbf_gige-stop-interface-during-shutdown.patch octeontx2-af-add-array-index-check.patch octeontx2-af-fix-issue-with-loading-coalesced-kpu-profiles.patch octeontx2-pf-check-negative-error-code-in-otx2_open.patch r8169-skip-dash-fw-status-checks-when-dash-is-disabled.patch selftests-reuseaddr_conflict-add-missing-new-line-at-the-end-of-the-output.patch tcp-fix-bind-regression-for-v6-only-wildcard-and-v4-mapped-v6-non-wildcard-addresses-again.patch tcp-fix-bind-regression-for-v6-only-wildcard-and-v4-mapped-v6-non-wildcard-addresses.patch udp-do-not-accept-non-tunnel-gso-skbs-landing-in-a-tunnel.patch udp-do-not-transition-udp-gro-fraglist-partial-checksums-to-unnecessary.patch udp-prevent-local-udp-tunnel-packets-from-being-groed.patch --- ...ree-bugs-caused-by-ax25_ds_del_timer.patch | 48 +++ ...dic-mdi-error-on-meteor-lake-systems.patch | 397 ++++++++++++++++++ ...span_base_hdr-is-present-in-skb-head.patch | 121 ++++++ ...ware-interrupt-during-busy-poll-exit.patch | 307 ++++++++++++++ ...ers-to-count-only-active-new-filters.patch | 44 ++ .../i40e-fix-vf-mac-filter-removal.patch | 65 +++ ...initialized-in-this-function-warning.patch | 146 +++++++ .../ice-fix-enabling-rx-vlan-filtering.patch | 65 +++ 
...kernel-panic-on-unknown-packet-types.patch | 52 +++ ...infinite-recursion-in-fib6_dump_done.patch | 134 ++++++ ..._gige-stop-interface-during-shutdown.patch | 109 +++++ .../octeontx2-af-add-array-index-check.patch | 34 ++ ...-with-loading-coalesced-kpu-profiles.patch | 36 ++ ...eck-negative-error-code-in-otx2_open.patch | 35 ++ ...-status-checks-when-dash-is-disabled.patch | 99 +++++ ...ng-new-line-at-the-end-of-the-output.patch | 39 ++ queue-6.8/series | 21 + ...pped-v6-non-wildcard-addresses-again.patch | 94 +++++ ...-v4-mapped-v6-non-wildcard-addresses.patch | 78 ++++ ...-tunnel-gso-skbs-landing-in-a-tunnel.patch | 146 +++++++ ...ist-partial-checksums-to-unnecessary.patch | 74 ++++ ...-udp-tunnel-packets-from-being-groed.patch | 54 +++ 22 files changed, 2198 insertions(+) create mode 100644 queue-6.8/ax25-fix-use-after-free-bugs-caused-by-ax25_ds_del_timer.patch create mode 100644 queue-6.8/e1000e-workaround-for-sporadic-mdi-error-on-meteor-lake-systems.patch create mode 100644 queue-6.8/erspan-make-sure-erspan_base_hdr-is-present-in-skb-head.patch create mode 100644 queue-6.8/i40e-enforce-software-interrupt-during-busy-poll-exit.patch create mode 100644 queue-6.8/i40e-fix-i40e_count_filters-to-count-only-active-new-filters.patch create mode 100644 queue-6.8/i40e-fix-vf-mac-filter-removal.patch create mode 100644 queue-6.8/i40e-fix-vf-may-be-used-uninitialized-in-this-function-warning.patch create mode 100644 queue-6.8/ice-fix-enabling-rx-vlan-filtering.patch create mode 100644 queue-6.8/idpf-fix-kernel-panic-on-unknown-packet-types.patch create mode 100644 queue-6.8/ipv6-fix-infinite-recursion-in-fib6_dump_done.patch create mode 100644 queue-6.8/mlxbf_gige-stop-interface-during-shutdown.patch create mode 100644 queue-6.8/octeontx2-af-add-array-index-check.patch create mode 100644 queue-6.8/octeontx2-af-fix-issue-with-loading-coalesced-kpu-profiles.patch create mode 100644 queue-6.8/octeontx2-pf-check-negative-error-code-in-otx2_open.patch create mode 100644 
queue-6.8/r8169-skip-dash-fw-status-checks-when-dash-is-disabled.patch create mode 100644 queue-6.8/selftests-reuseaddr_conflict-add-missing-new-line-at-the-end-of-the-output.patch create mode 100644 queue-6.8/tcp-fix-bind-regression-for-v6-only-wildcard-and-v4-mapped-v6-non-wildcard-addresses-again.patch create mode 100644 queue-6.8/tcp-fix-bind-regression-for-v6-only-wildcard-and-v4-mapped-v6-non-wildcard-addresses.patch create mode 100644 queue-6.8/udp-do-not-accept-non-tunnel-gso-skbs-landing-in-a-tunnel.patch create mode 100644 queue-6.8/udp-do-not-transition-udp-gro-fraglist-partial-checksums-to-unnecessary.patch create mode 100644 queue-6.8/udp-prevent-local-udp-tunnel-packets-from-being-groed.patch diff --git a/queue-6.8/ax25-fix-use-after-free-bugs-caused-by-ax25_ds_del_timer.patch b/queue-6.8/ax25-fix-use-after-free-bugs-caused-by-ax25_ds_del_timer.patch new file mode 100644 index 00000000000..d903c1342d7 --- /dev/null +++ b/queue-6.8/ax25-fix-use-after-free-bugs-caused-by-ax25_ds_del_timer.patch @@ -0,0 +1,48 @@ +From fd819ad3ecf6f3c232a06b27423ce9ed8c20da89 Mon Sep 17 00:00:00 2001 +From: Duoming Zhou +Date: Fri, 29 Mar 2024 09:50:23 +0800 +Subject: ax25: fix use-after-free bugs caused by ax25_ds_del_timer + +From: Duoming Zhou + +commit fd819ad3ecf6f3c232a06b27423ce9ed8c20da89 upstream. + +When the ax25 device is detaching, the ax25_dev_device_down() +calls ax25_ds_del_timer() to cleanup the slave_timer. When +the timer handler is running, the ax25_ds_del_timer() that +calls del_timer() in it will return directly. As a result, +the use-after-free bugs could happen, one of the scenarios +is shown below: + + (Thread 1) | (Thread 2) + | ax25_ds_timeout() +ax25_dev_device_down() | + ax25_ds_del_timer() | + del_timer() | + ax25_dev_put() //FREE | + | ax25_dev-> //USE + +In order to mitigate bugs, when the device is detaching, use +timer_shutdown_sync() to stop the timer. 
+ +Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") +Signed-off-by: Duoming Zhou +Reviewed-by: Simon Horman +Link: https://lore.kernel.org/r/20240329015023.9223-1-duoming@zju.edu.cn +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + net/ax25/ax25_dev.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/ax25/ax25_dev.c ++++ b/net/ax25/ax25_dev.c +@@ -105,7 +105,7 @@ void ax25_dev_device_down(struct net_dev + spin_lock_bh(&ax25_dev_lock); + + #ifdef CONFIG_AX25_DAMA_SLAVE +- ax25_ds_del_timer(ax25_dev); ++ timer_shutdown_sync(&ax25_dev->dama.slave_timer); + #endif + + /* diff --git a/queue-6.8/e1000e-workaround-for-sporadic-mdi-error-on-meteor-lake-systems.patch b/queue-6.8/e1000e-workaround-for-sporadic-mdi-error-on-meteor-lake-systems.patch new file mode 100644 index 00000000000..33a98ea4160 --- /dev/null +++ b/queue-6.8/e1000e-workaround-for-sporadic-mdi-error-on-meteor-lake-systems.patch @@ -0,0 +1,397 @@ +From 6dbdd4de0362c37e54e8b049781402e5a409e7d0 Mon Sep 17 00:00:00 2001 +From: Vitaly Lifshits +Date: Thu, 4 Jan 2024 16:16:52 +0200 +Subject: e1000e: Workaround for sporadic MDI error on Meteor Lake systems + +From: Vitaly Lifshits + +commit 6dbdd4de0362c37e54e8b049781402e5a409e7d0 upstream. + +On some Meteor Lake systems accessing the PHY via the MDIO interface may +result in an MDI error. This issue happens sporadically and in most cases +a second access to the PHY via the MDIO interface results in success. + +As a workaround, introduce a retry counter which is set to 3 on Meteor +Lake systems. The driver will only return an error if 3 consecutive PHY +access attempts fail. The retry mechanism is disabled in specific flows, +where MDI errors are expected. 
+ +Fixes: cc23f4f0b6b9 ("e1000e: Add support for Meteor Lake") +Suggested-by: Nikolay Mushayev +Co-developed-by: Nir Efrati +Signed-off-by: Nir Efrati +Signed-off-by: Vitaly Lifshits +Tested-by: Naama Meir +Signed-off-by: Tony Nguyen +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/intel/e1000e/hw.h | 2 + drivers/net/ethernet/intel/e1000e/ich8lan.c | 33 ++++ + drivers/net/ethernet/intel/e1000e/phy.c | 190 +++++++++++++++++----------- + drivers/net/ethernet/intel/e1000e/phy.h | 2 + 4 files changed, 154 insertions(+), 73 deletions(-) + +--- a/drivers/net/ethernet/intel/e1000e/hw.h ++++ b/drivers/net/ethernet/intel/e1000e/hw.h +@@ -628,6 +628,7 @@ struct e1000_phy_info { + u32 id; + u32 reset_delay_us; /* in usec */ + u32 revision; ++ u32 retry_count; + + enum e1000_media_type media_type; + +@@ -644,6 +645,7 @@ struct e1000_phy_info { + bool polarity_correction; + bool speed_downgraded; + bool autoneg_wait_to_complete; ++ bool retry_enabled; + }; + + struct e1000_nvm_info { +--- a/drivers/net/ethernet/intel/e1000e/ich8lan.c ++++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c +@@ -222,11 +222,18 @@ out: + if (hw->mac.type >= e1000_pch_lpt) { + /* Only unforce SMBus if ME is not active */ + if (!(er32(FWSM) & E1000_ICH_FWSM_FW_VALID)) { ++ /* Switching PHY interface always returns MDI error ++ * so disable retry mechanism to avoid wasting time ++ */ ++ e1000e_disable_phy_retry(hw); ++ + /* Unforce SMBus mode in PHY */ + e1e_rphy_locked(hw, CV_SMB_CTRL, &phy_reg); + phy_reg &= ~CV_SMB_CTRL_FORCE_SMBUS; + e1e_wphy_locked(hw, CV_SMB_CTRL, phy_reg); + ++ e1000e_enable_phy_retry(hw); ++ + /* Unforce SMBus mode in MAC */ + mac_reg = er32(CTRL_EXT); + mac_reg &= ~E1000_CTRL_EXT_FORCE_SMBUS; +@@ -310,6 +317,11 @@ static s32 e1000_init_phy_workarounds_pc + goto out; + } + ++ /* There is no guarantee that the PHY is accessible at this time ++ * so disable retry mechanism to avoid wasting time ++ */ ++ e1000e_disable_phy_retry(hw); ++ + /* The MAC-PHY interconnect may 
be in SMBus mode. If the PHY is + * inaccessible and resetting the PHY is not blocked, toggle the + * LANPHYPC Value bit to force the interconnect to PCIe mode. +@@ -380,6 +392,8 @@ static s32 e1000_init_phy_workarounds_pc + break; + } + ++ e1000e_enable_phy_retry(hw); ++ + hw->phy.ops.release(hw); + if (!ret_val) { + +@@ -449,6 +463,11 @@ static s32 e1000_init_phy_params_pchlan( + + phy->id = e1000_phy_unknown; + ++ if (hw->mac.type == e1000_pch_mtp) { ++ phy->retry_count = 2; ++ e1000e_enable_phy_retry(hw); ++ } ++ + ret_val = e1000_init_phy_workarounds_pchlan(hw); + if (ret_val) + return ret_val; +@@ -1146,6 +1165,11 @@ s32 e1000_enable_ulp_lpt_lp(struct e1000 + if (ret_val) + goto out; + ++ /* Switching PHY interface always returns MDI error ++ * so disable retry mechanism to avoid wasting time ++ */ ++ e1000e_disable_phy_retry(hw); ++ + /* Force SMBus mode in PHY */ + ret_val = e1000_read_phy_reg_hv_locked(hw, CV_SMB_CTRL, &phy_reg); + if (ret_val) +@@ -1153,6 +1177,8 @@ s32 e1000_enable_ulp_lpt_lp(struct e1000 + phy_reg |= CV_SMB_CTRL_FORCE_SMBUS; + e1000_write_phy_reg_hv_locked(hw, CV_SMB_CTRL, phy_reg); + ++ e1000e_enable_phy_retry(hw); ++ + /* Force SMBus mode in MAC */ + mac_reg = er32(CTRL_EXT); + mac_reg |= E1000_CTRL_EXT_FORCE_SMBUS; +@@ -1313,6 +1339,11 @@ static s32 e1000_disable_ulp_lpt_lp(stru + /* Toggle LANPHYPC Value bit */ + e1000_toggle_lanphypc_pch_lpt(hw); + ++ /* Switching PHY interface always returns MDI error ++ * so disable retry mechanism to avoid wasting time ++ */ ++ e1000e_disable_phy_retry(hw); ++ + /* Unforce SMBus mode in PHY */ + ret_val = e1000_read_phy_reg_hv_locked(hw, CV_SMB_CTRL, &phy_reg); + if (ret_val) { +@@ -1333,6 +1364,8 @@ static s32 e1000_disable_ulp_lpt_lp(stru + phy_reg &= ~CV_SMB_CTRL_FORCE_SMBUS; + e1000_write_phy_reg_hv_locked(hw, CV_SMB_CTRL, phy_reg); + ++ e1000e_enable_phy_retry(hw); ++ + /* Unforce SMBus mode in MAC */ + mac_reg = er32(CTRL_EXT); + mac_reg &= ~E1000_CTRL_EXT_FORCE_SMBUS; +--- 
a/drivers/net/ethernet/intel/e1000e/phy.c ++++ b/drivers/net/ethernet/intel/e1000e/phy.c +@@ -107,6 +107,16 @@ s32 e1000e_phy_reset_dsp(struct e1000_hw + return e1e_wphy(hw, M88E1000_PHY_GEN_CONTROL, 0); + } + ++void e1000e_disable_phy_retry(struct e1000_hw *hw) ++{ ++ hw->phy.retry_enabled = false; ++} ++ ++void e1000e_enable_phy_retry(struct e1000_hw *hw) ++{ ++ hw->phy.retry_enabled = true; ++} ++ + /** + * e1000e_read_phy_reg_mdic - Read MDI control register + * @hw: pointer to the HW structure +@@ -118,55 +128,73 @@ s32 e1000e_phy_reset_dsp(struct e1000_hw + **/ + s32 e1000e_read_phy_reg_mdic(struct e1000_hw *hw, u32 offset, u16 *data) + { ++ u32 i, mdic = 0, retry_counter, retry_max; + struct e1000_phy_info *phy = &hw->phy; +- u32 i, mdic = 0; ++ bool success; + + if (offset > MAX_PHY_REG_ADDRESS) { + e_dbg("PHY Address %d is out of range\n", offset); + return -E1000_ERR_PARAM; + } + ++ retry_max = phy->retry_enabled ? phy->retry_count : 0; ++ + /* Set up Op-code, Phy Address, and register offset in the MDI + * Control register. The MAC will take care of interfacing with the + * PHY to retrieve the desired data. 
+ */ +- mdic = ((offset << E1000_MDIC_REG_SHIFT) | +- (phy->addr << E1000_MDIC_PHY_SHIFT) | +- (E1000_MDIC_OP_READ)); +- +- ew32(MDIC, mdic); +- +- /* Poll the ready bit to see if the MDI read completed +- * Increasing the time out as testing showed failures with +- * the lower time out +- */ +- for (i = 0; i < (E1000_GEN_POLL_TIMEOUT * 3); i++) { +- udelay(50); +- mdic = er32(MDIC); +- if (mdic & E1000_MDIC_READY) +- break; +- } +- if (!(mdic & E1000_MDIC_READY)) { +- e_dbg("MDI Read PHY Reg Address %d did not complete\n", offset); +- return -E1000_ERR_PHY; +- } +- if (mdic & E1000_MDIC_ERROR) { +- e_dbg("MDI Read PHY Reg Address %d Error\n", offset); +- return -E1000_ERR_PHY; +- } +- if (FIELD_GET(E1000_MDIC_REG_MASK, mdic) != offset) { +- e_dbg("MDI Read offset error - requested %d, returned %d\n", +- offset, FIELD_GET(E1000_MDIC_REG_MASK, mdic)); +- return -E1000_ERR_PHY; ++ for (retry_counter = 0; retry_counter <= retry_max; retry_counter++) { ++ success = true; ++ ++ mdic = ((offset << E1000_MDIC_REG_SHIFT) | ++ (phy->addr << E1000_MDIC_PHY_SHIFT) | ++ (E1000_MDIC_OP_READ)); ++ ++ ew32(MDIC, mdic); ++ ++ /* Poll the ready bit to see if the MDI read completed ++ * Increasing the time out as testing showed failures with ++ * the lower time out ++ */ ++ for (i = 0; i < (E1000_GEN_POLL_TIMEOUT * 3); i++) { ++ usleep_range(50, 60); ++ mdic = er32(MDIC); ++ if (mdic & E1000_MDIC_READY) ++ break; ++ } ++ if (!(mdic & E1000_MDIC_READY)) { ++ e_dbg("MDI Read PHY Reg Address %d did not complete\n", ++ offset); ++ success = false; ++ } ++ if (mdic & E1000_MDIC_ERROR) { ++ e_dbg("MDI Read PHY Reg Address %d Error\n", offset); ++ success = false; ++ } ++ if (FIELD_GET(E1000_MDIC_REG_MASK, mdic) != offset) { ++ e_dbg("MDI Read offset error - requested %d, returned %d\n", ++ offset, FIELD_GET(E1000_MDIC_REG_MASK, mdic)); ++ success = false; ++ } ++ ++ /* Allow some time after each MDIC transaction to avoid ++ * reading duplicate data in the next MDIC transaction. 
++ */ ++ if (hw->mac.type == e1000_pch2lan) ++ usleep_range(100, 150); ++ ++ if (success) { ++ *data = (u16)mdic; ++ return 0; ++ } ++ ++ if (retry_counter != retry_max) { ++ e_dbg("Perform retry on PHY transaction...\n"); ++ mdelay(10); ++ } + } +- *data = (u16)mdic; + +- /* Allow some time after each MDIC transaction to avoid +- * reading duplicate data in the next MDIC transaction. +- */ +- if (hw->mac.type == e1000_pch2lan) +- udelay(100); +- return 0; ++ return -E1000_ERR_PHY; + } + + /** +@@ -179,56 +207,72 @@ s32 e1000e_read_phy_reg_mdic(struct e100 + **/ + s32 e1000e_write_phy_reg_mdic(struct e1000_hw *hw, u32 offset, u16 data) + { ++ u32 i, mdic = 0, retry_counter, retry_max; + struct e1000_phy_info *phy = &hw->phy; +- u32 i, mdic = 0; ++ bool success; + + if (offset > MAX_PHY_REG_ADDRESS) { + e_dbg("PHY Address %d is out of range\n", offset); + return -E1000_ERR_PARAM; + } + ++ retry_max = phy->retry_enabled ? phy->retry_count : 0; ++ + /* Set up Op-code, Phy Address, and register offset in the MDI + * Control register. The MAC will take care of interfacing with the + * PHY to retrieve the desired data. 
+ */ +- mdic = (((u32)data) | +- (offset << E1000_MDIC_REG_SHIFT) | +- (phy->addr << E1000_MDIC_PHY_SHIFT) | +- (E1000_MDIC_OP_WRITE)); +- +- ew32(MDIC, mdic); +- +- /* Poll the ready bit to see if the MDI read completed +- * Increasing the time out as testing showed failures with +- * the lower time out +- */ +- for (i = 0; i < (E1000_GEN_POLL_TIMEOUT * 3); i++) { +- udelay(50); +- mdic = er32(MDIC); +- if (mdic & E1000_MDIC_READY) +- break; +- } +- if (!(mdic & E1000_MDIC_READY)) { +- e_dbg("MDI Write PHY Reg Address %d did not complete\n", offset); +- return -E1000_ERR_PHY; +- } +- if (mdic & E1000_MDIC_ERROR) { +- e_dbg("MDI Write PHY Red Address %d Error\n", offset); +- return -E1000_ERR_PHY; +- } +- if (FIELD_GET(E1000_MDIC_REG_MASK, mdic) != offset) { +- e_dbg("MDI Write offset error - requested %d, returned %d\n", +- offset, FIELD_GET(E1000_MDIC_REG_MASK, mdic)); +- return -E1000_ERR_PHY; +- } ++ for (retry_counter = 0; retry_counter <= retry_max; retry_counter++) { ++ success = true; + +- /* Allow some time after each MDIC transaction to avoid +- * reading duplicate data in the next MDIC transaction. 
+- */ +- if (hw->mac.type == e1000_pch2lan) +- udelay(100); ++ mdic = (((u32)data) | ++ (offset << E1000_MDIC_REG_SHIFT) | ++ (phy->addr << E1000_MDIC_PHY_SHIFT) | ++ (E1000_MDIC_OP_WRITE)); ++ ++ ew32(MDIC, mdic); ++ ++ /* Poll the ready bit to see if the MDI read completed ++ * Increasing the time out as testing showed failures with ++ * the lower time out ++ */ ++ for (i = 0; i < (E1000_GEN_POLL_TIMEOUT * 3); i++) { ++ usleep_range(50, 60); ++ mdic = er32(MDIC); ++ if (mdic & E1000_MDIC_READY) ++ break; ++ } ++ if (!(mdic & E1000_MDIC_READY)) { ++ e_dbg("MDI Write PHY Reg Address %d did not complete\n", ++ offset); ++ success = false; ++ } ++ if (mdic & E1000_MDIC_ERROR) { ++ e_dbg("MDI Write PHY Reg Address %d Error\n", offset); ++ success = false; ++ } ++ if (FIELD_GET(E1000_MDIC_REG_MASK, mdic) != offset) { ++ e_dbg("MDI Write offset error - requested %d, returned %d\n", ++ offset, FIELD_GET(E1000_MDIC_REG_MASK, mdic)); ++ success = false; ++ } ++ ++ /* Allow some time after each MDIC transaction to avoid ++ * reading duplicate data in the next MDIC transaction. 
++ */ ++ if (hw->mac.type == e1000_pch2lan) ++ usleep_range(100, 150); ++ ++ if (success) ++ return 0; ++ ++ if (retry_counter != retry_max) { ++ e_dbg("Perform retry on PHY transaction...\n"); ++ mdelay(10); ++ } ++ } + +- return 0; ++ return -E1000_ERR_PHY; + } + + /** +--- a/drivers/net/ethernet/intel/e1000e/phy.h ++++ b/drivers/net/ethernet/intel/e1000e/phy.h +@@ -51,6 +51,8 @@ s32 e1000e_read_phy_reg_bm2(struct e1000 + s32 e1000e_write_phy_reg_bm2(struct e1000_hw *hw, u32 offset, u16 data); + void e1000_power_up_phy_copper(struct e1000_hw *hw); + void e1000_power_down_phy_copper(struct e1000_hw *hw); ++void e1000e_disable_phy_retry(struct e1000_hw *hw); ++void e1000e_enable_phy_retry(struct e1000_hw *hw); + s32 e1000e_read_phy_reg_mdic(struct e1000_hw *hw, u32 offset, u16 *data); + s32 e1000e_write_phy_reg_mdic(struct e1000_hw *hw, u32 offset, u16 data); + s32 e1000_read_phy_reg_hv(struct e1000_hw *hw, u32 offset, u16 *data); diff --git a/queue-6.8/erspan-make-sure-erspan_base_hdr-is-present-in-skb-head.patch b/queue-6.8/erspan-make-sure-erspan_base_hdr-is-present-in-skb-head.patch new file mode 100644 index 00000000000..eb06db27af3 --- /dev/null +++ b/queue-6.8/erspan-make-sure-erspan_base_hdr-is-present-in-skb-head.patch @@ -0,0 +1,121 @@ +From 17af420545a750f763025149fa7b833a4fc8b8f0 Mon Sep 17 00:00:00 2001 +From: Eric Dumazet +Date: Thu, 28 Mar 2024 11:22:48 +0000 +Subject: erspan: make sure erspan_base_hdr is present in skb->head + +From: Eric Dumazet + +commit 17af420545a750f763025149fa7b833a4fc8b8f0 upstream. + +syzbot reported a problem in ip6erspan_rcv() [1] + +Issue is that ip6erspan_rcv() (and erspan_rcv()) no longer make +sure erspan_base_hdr is present in skb linear part (skb->head) +before getting @ver field from it. + +Add the missing pskb_may_pull() calls. + +v2: Reload iph pointer in erspan_rcv() after pskb_may_pull() + because skb->head might have changed. 
+ +[1] + + BUG: KMSAN: uninit-value in pskb_may_pull_reason include/linux/skbuff.h:2742 [inline] + BUG: KMSAN: uninit-value in pskb_may_pull include/linux/skbuff.h:2756 [inline] + BUG: KMSAN: uninit-value in ip6erspan_rcv net/ipv6/ip6_gre.c:541 [inline] + BUG: KMSAN: uninit-value in gre_rcv+0x11f8/0x1930 net/ipv6/ip6_gre.c:610 + pskb_may_pull_reason include/linux/skbuff.h:2742 [inline] + pskb_may_pull include/linux/skbuff.h:2756 [inline] + ip6erspan_rcv net/ipv6/ip6_gre.c:541 [inline] + gre_rcv+0x11f8/0x1930 net/ipv6/ip6_gre.c:610 + ip6_protocol_deliver_rcu+0x1d4c/0x2ca0 net/ipv6/ip6_input.c:438 + ip6_input_finish net/ipv6/ip6_input.c:483 [inline] + NF_HOOK include/linux/netfilter.h:314 [inline] + ip6_input+0x15d/0x430 net/ipv6/ip6_input.c:492 + ip6_mc_input+0xa7e/0xc80 net/ipv6/ip6_input.c:586 + dst_input include/net/dst.h:460 [inline] + ip6_rcv_finish+0x955/0x970 net/ipv6/ip6_input.c:79 + NF_HOOK include/linux/netfilter.h:314 [inline] + ipv6_rcv+0xde/0x390 net/ipv6/ip6_input.c:310 + __netif_receive_skb_one_core net/core/dev.c:5538 [inline] + __netif_receive_skb+0x1da/0xa00 net/core/dev.c:5652 + netif_receive_skb_internal net/core/dev.c:5738 [inline] + netif_receive_skb+0x58/0x660 net/core/dev.c:5798 + tun_rx_batched+0x3ee/0x980 drivers/net/tun.c:1549 + tun_get_user+0x5566/0x69e0 drivers/net/tun.c:2002 + tun_chr_write_iter+0x3af/0x5d0 drivers/net/tun.c:2048 + call_write_iter include/linux/fs.h:2108 [inline] + new_sync_write fs/read_write.c:497 [inline] + vfs_write+0xb63/0x1520 fs/read_write.c:590 + ksys_write+0x20f/0x4c0 fs/read_write.c:643 + __do_sys_write fs/read_write.c:655 [inline] + __se_sys_write fs/read_write.c:652 [inline] + __x64_sys_write+0x93/0xe0 fs/read_write.c:652 + do_syscall_64+0xd5/0x1f0 + entry_SYSCALL_64_after_hwframe+0x6d/0x75 + +Uninit was created at: + slab_post_alloc_hook mm/slub.c:3804 [inline] + slab_alloc_node mm/slub.c:3845 [inline] + kmem_cache_alloc_node+0x613/0xc50 mm/slub.c:3888 + kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:577 + 
__alloc_skb+0x35b/0x7a0 net/core/skbuff.c:668 + alloc_skb include/linux/skbuff.h:1318 [inline] + alloc_skb_with_frags+0xc8/0xbf0 net/core/skbuff.c:6504 + sock_alloc_send_pskb+0xa81/0xbf0 net/core/sock.c:2795 + tun_alloc_skb drivers/net/tun.c:1525 [inline] + tun_get_user+0x209a/0x69e0 drivers/net/tun.c:1846 + tun_chr_write_iter+0x3af/0x5d0 drivers/net/tun.c:2048 + call_write_iter include/linux/fs.h:2108 [inline] + new_sync_write fs/read_write.c:497 [inline] + vfs_write+0xb63/0x1520 fs/read_write.c:590 + ksys_write+0x20f/0x4c0 fs/read_write.c:643 + __do_sys_write fs/read_write.c:655 [inline] + __se_sys_write fs/read_write.c:652 [inline] + __x64_sys_write+0x93/0xe0 fs/read_write.c:652 + do_syscall_64+0xd5/0x1f0 + entry_SYSCALL_64_after_hwframe+0x6d/0x75 + +CPU: 1 PID: 5045 Comm: syz-executor114 Not tainted 6.9.0-rc1-syzkaller-00021-g962490525cff #0 + +Fixes: cb73ee40b1b3 ("net: ip_gre: use erspan key field for tunnel lookup") +Reported-by: syzbot+1c1cf138518bf0c53d68@syzkaller.appspotmail.com +Closes: https://lore.kernel.org/netdev/000000000000772f2c0614b66ef7@google.com/ +Signed-off-by: Eric Dumazet +Cc: Lorenzo Bianconi +Link: https://lore.kernel.org/r/20240328112248.1101491-1-edumazet@google.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/ip_gre.c | 5 +++++ + net/ipv6/ip6_gre.c | 3 +++ + 2 files changed, 8 insertions(+) + +--- a/net/ipv4/ip_gre.c ++++ b/net/ipv4/ip_gre.c +@@ -280,8 +280,13 @@ static int erspan_rcv(struct sk_buff *sk + tpi->flags | TUNNEL_NO_KEY, + iph->saddr, iph->daddr, 0); + } else { ++ if (unlikely(!pskb_may_pull(skb, ++ gre_hdr_len + sizeof(*ershdr)))) ++ return PACKET_REJECT; ++ + ershdr = (struct erspan_base_hdr *)(skb->data + gre_hdr_len); + ver = ershdr->ver; ++ iph = ip_hdr(skb); + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, + tpi->flags | TUNNEL_KEY, + iph->saddr, iph->daddr, tpi->key); +--- a/net/ipv6/ip6_gre.c ++++ b/net/ipv6/ip6_gre.c +@@ -528,6 +528,9 @@ static int ip6erspan_rcv(struct 
sk_buff + struct ip6_tnl *tunnel; + u8 ver; + ++ if (unlikely(!pskb_may_pull(skb, sizeof(*ershdr)))) ++ return PACKET_REJECT; ++ + ipv6h = ipv6_hdr(skb); + ershdr = (struct erspan_base_hdr *)skb->data; + ver = ershdr->ver; diff --git a/queue-6.8/i40e-enforce-software-interrupt-during-busy-poll-exit.patch b/queue-6.8/i40e-enforce-software-interrupt-during-busy-poll-exit.patch new file mode 100644 index 00000000000..bf4efa8b716 --- /dev/null +++ b/queue-6.8/i40e-enforce-software-interrupt-during-busy-poll-exit.patch @@ -0,0 +1,307 @@ +From ea558de7238bb12c3435c47f0631e9d17bf4a09f Mon Sep 17 00:00:00 2001 +From: Ivan Vecera +Date: Sat, 16 Mar 2024 12:38:29 +0100 +Subject: i40e: Enforce software interrupt during busy-poll exit + +From: Ivan Vecera + +commit ea558de7238bb12c3435c47f0631e9d17bf4a09f upstream. + +As for ice bug fixed by commit b7306b42beaf ("ice: manage interrupts +during poll exit") followed by commit 23be7075b318 ("ice: fix software +generating extra interrupts") I'm seeing the similar issue also with +i40e driver. + +In certain situation when busy-loop is enabled together with adaptive +coalescing, the driver occasionally misses that there are outstanding +descriptors to clean when exiting busy poll. + +Try to catch the remaining work by triggering a software interrupt +when exiting busy poll. No extra interrupts will be generated when +busy polling is not used. + +The issue was found when running sockperf ping-pong tcp test with +adaptive coalescing and busy poll enabled (50 as value busy_poll +and busy_read sysctl knobs) and results in huge latency spikes +with more than 100000us. + +The fix is inspired from the ice driver and do the following: +1) During napi poll exit in case of busy-poll (napi_complete_done() + returns false) this is recorded to q_vector that we were in busy + loop. 
+2) Extends i40e_buildreg_itr() to be able to add an enforced software + interrupt into built value +3) In i40e_update_enable_itr() enforces a software interrupt trigger + if we are exiting busy poll to catch any pending clean-ups +4) Reuses unused 3rd ITR (interrupt throttle) index and set it to + 20K interrupts per second to limit the number of these sw interrupts. + +Test results +============ +Prior: +[root@dell-per640-07 net]# sockperf ping-pong -i 10.9.9.1 --tcp -m 1000 --mps=max -t 120 +sockperf: == version #3.10-no.git == +sockperf[CLIENT] send on:sockperf: using recvfrom() to block on socket(s) + +[ 0] IP = 10.9.9.1 PORT = 11111 # TCP +sockperf: Warmup stage (sending a few dummy messages)... +sockperf: Starting test... +sockperf: Test end (interrupted by timer) +sockperf: Test ended +sockperf: [Total Run] RunTime=119.999 sec; Warm up time=400 msec; SentMessages=2438563; ReceivedMessages=2438562 +sockperf: ========= Printing statistics for Server No: 0 +sockperf: [Valid Duration] RunTime=119.549 sec; SentMessages=2429473; ReceivedMessages=2429473 +sockperf: ====> avg-latency=24.571 (std-dev=93.297, mean-ad=4.904, median-ad=1.510, siqr=1.063, cv=3.797, std-error=0.060, 99.0% ci=[24.417, 24.725]) +sockperf: # dropped messages = 0; # duplicated messages = 0; # out-of-order messages = 0 +sockperf: Summary: Latency is 24.571 usec +sockperf: Total 2429473 observations; each percentile contains 24294.73 observations +sockperf: ---> observation = 103294.331 +sockperf: ---> percentile 99.999 = 45.633 +sockperf: ---> percentile 99.990 = 37.013 +sockperf: ---> percentile 99.900 = 35.910 +sockperf: ---> percentile 99.000 = 33.390 +sockperf: ---> percentile 90.000 = 28.626 +sockperf: ---> percentile 75.000 = 27.741 +sockperf: ---> percentile 50.000 = 26.743 +sockperf: ---> percentile 25.000 = 25.614 +sockperf: ---> observation = 12.220 + +After: +[root@dell-per640-07 net]# sockperf ping-pong -i 10.9.9.1 --tcp -m 1000 --mps=max -t 120 +sockperf: == version #3.10-no.git 
== +sockperf[CLIENT] send on:sockperf: using recvfrom() to block on socket(s) + +[ 0] IP = 10.9.9.1 PORT = 11111 # TCP +sockperf: Warmup stage (sending a few dummy messages)... +sockperf: Starting test... +sockperf: Test end (interrupted by timer) +sockperf: Test ended +sockperf: [Total Run] RunTime=119.999 sec; Warm up time=400 msec; SentMessages=2400055; ReceivedMessages=2400054 +sockperf: ========= Printing statistics for Server No: 0 +sockperf: [Valid Duration] RunTime=119.549 sec; SentMessages=2391186; ReceivedMessages=2391186 +sockperf: ====> avg-latency=24.965 (std-dev=5.934, mean-ad=4.642, median-ad=1.485, siqr=1.067, cv=0.238, std-error=0.004, 99.0% ci=[24.955, 24.975]) +sockperf: # dropped messages = 0; # duplicated messages = 0; # out-of-order messages = 0 +sockperf: Summary: Latency is 24.965 usec +sockperf: Total 2391186 observations; each percentile contains 23911.86 observations +sockperf: ---> observation = 195.841 +sockperf: ---> percentile 99.999 = 45.026 +sockperf: ---> percentile 99.990 = 39.009 +sockperf: ---> percentile 99.900 = 35.922 +sockperf: ---> percentile 99.000 = 33.482 +sockperf: ---> percentile 90.000 = 28.902 +sockperf: ---> percentile 75.000 = 27.821 +sockperf: ---> percentile 50.000 = 26.860 +sockperf: ---> percentile 25.000 = 25.685 +sockperf: ---> observation = 12.277 + +Fixes: 0bcd952feec7 ("ethernet/intel: consolidate NAPI and NAPI exit") +Reported-by: Hugo Ferreira +Reviewed-by: Michal Schmidt +Signed-off-by: Ivan Vecera +Reviewed-by: Jesse Brandeburg +Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) +Signed-off-by: Tony Nguyen +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/intel/i40e/i40e.h | 1 + drivers/net/ethernet/intel/i40e/i40e_main.c | 6 + + drivers/net/ethernet/intel/i40e/i40e_register.h | 3 + drivers/net/ethernet/intel/i40e/i40e_txrx.c | 82 +++++++++++++++++------- + drivers/net/ethernet/intel/i40e/i40e_txrx.h | 1 + 5 files changed, 72 insertions(+), 21 deletions(-) + +--- 
a/drivers/net/ethernet/intel/i40e/i40e.h ++++ b/drivers/net/ethernet/intel/i40e/i40e.h +@@ -908,6 +908,7 @@ struct i40e_q_vector { + struct rcu_head rcu; /* to avoid race with update stats on free */ + char name[I40E_INT_NAME_STR_LEN]; + bool arm_wb_state; ++ bool in_busy_poll; + int irq_num; /* IRQ assigned to this q_vector */ + } ____cacheline_internodealigned_in_smp; + +--- a/drivers/net/ethernet/intel/i40e/i40e_main.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_main.c +@@ -3918,6 +3918,12 @@ static void i40e_vsi_configure_msix(stru + q_vector->tx.target_itr >> 1); + q_vector->tx.current_itr = q_vector->tx.target_itr; + ++ /* Set ITR for software interrupts triggered after exiting ++ * busy-loop polling. ++ */ ++ wr32(hw, I40E_PFINT_ITRN(I40E_SW_ITR, vector - 1), ++ I40E_ITR_20K); ++ + wr32(hw, I40E_PFINT_RATEN(vector - 1), + i40e_intrl_usec_to_reg(vsi->int_rate_limit)); + +--- a/drivers/net/ethernet/intel/i40e/i40e_register.h ++++ b/drivers/net/ethernet/intel/i40e/i40e_register.h +@@ -333,8 +333,11 @@ + #define I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT 3 + #define I40E_PFINT_DYN_CTLN_ITR_INDX_MASK I40E_MASK(0x3, I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT) + #define I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT 5 ++#define I40E_PFINT_DYN_CTLN_INTERVAL_MASK I40E_MASK(0xFFF, I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT) + #define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_SHIFT 24 + #define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_MASK I40E_MASK(0x1, I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_SHIFT) ++#define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_SHIFT 25 ++#define I40E_PFINT_DYN_CTLN_SW_ITR_INDX_MASK I40E_MASK(0x3, I40E_PFINT_DYN_CTLN_SW_ITR_INDX_SHIFT) + #define I40E_PFINT_ICR0 0x00038780 /* Reset: CORER */ + #define I40E_PFINT_ICR0_INTEVENT_SHIFT 0 + #define I40E_PFINT_ICR0_INTEVENT_MASK I40E_MASK(0x1, I40E_PFINT_ICR0_INTEVENT_SHIFT) +--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c +@@ -2630,7 +2630,22 @@ process_next: + return failure ? 
budget : (int)total_rx_packets; + } + +-static inline u32 i40e_buildreg_itr(const int type, u16 itr) ++/** ++ * i40e_buildreg_itr - build a value for writing to I40E_PFINT_DYN_CTLN register ++ * @itr_idx: interrupt throttling index ++ * @interval: interrupt throttling interval value in usecs ++ * @force_swint: force software interrupt ++ * ++ * The function builds a value for I40E_PFINT_DYN_CTLN register that ++ * is used to update interrupt throttling interval for specified ITR index ++ * and optionally enforces a software interrupt. If the @itr_idx is equal ++ * to I40E_ITR_NONE then no interval change is applied and only @force_swint ++ * parameter is taken into account. If the interval change and enforced ++ * software interrupt are not requested then the built value just enables ++ * appropriate vector interrupt. ++ **/ ++static u32 i40e_buildreg_itr(enum i40e_dyn_idx itr_idx, u16 interval, ++ bool force_swint) + { + u32 val; + +@@ -2644,23 +2659,33 @@ static inline u32 i40e_buildreg_itr(cons + * an event in the PBA anyway so we need to rely on the automask + * to hold pending events for us until the interrupt is re-enabled + * +- * The itr value is reported in microseconds, and the register +- * value is recorded in 2 microsecond units. For this reason we +- * only need to shift by the interval shift - 1 instead of the +- * full value. ++ * We have to shift the given value as it is reported in microseconds ++ * and the register value is recorded in 2 microsecond units. + */ +- itr &= I40E_ITR_MASK; ++ interval >>= 1; + ++ /* 1. Enable vector interrupt ++ * 2. 
Update the interval for the specified ITR index ++ * (I40E_ITR_NONE in the register is used to indicate that ++ * no interval update is requested) ++ */ + val = I40E_PFINT_DYN_CTLN_INTENA_MASK | +- (type << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT) | +- (itr << (I40E_PFINT_DYN_CTLN_INTERVAL_SHIFT - 1)); ++ FIELD_PREP(I40E_PFINT_DYN_CTLN_ITR_INDX_MASK, itr_idx) | ++ FIELD_PREP(I40E_PFINT_DYN_CTLN_INTERVAL_MASK, interval); ++ ++ /* 3. Enforce software interrupt trigger if requested ++ * (These software interrupts rate is limited by ITR2 that is ++ * set to 20K interrupts per second) ++ */ ++ if (force_swint) ++ val |= I40E_PFINT_DYN_CTLN_SWINT_TRIG_MASK | ++ I40E_PFINT_DYN_CTLN_SW_ITR_INDX_ENA_MASK | ++ FIELD_PREP(I40E_PFINT_DYN_CTLN_SW_ITR_INDX_MASK, ++ I40E_SW_ITR); + + return val; + } + +-/* a small macro to shorten up some long lines */ +-#define INTREG I40E_PFINT_DYN_CTLN +- + /* The act of updating the ITR will cause it to immediately trigger. In order + * to prevent this from throwing off adaptive update statistics we defer the + * update so that it can only happen so often. 
So after either Tx or Rx are +@@ -2679,8 +2704,10 @@ static inline u32 i40e_buildreg_itr(cons + static inline void i40e_update_enable_itr(struct i40e_vsi *vsi, + struct i40e_q_vector *q_vector) + { ++ enum i40e_dyn_idx itr_idx = I40E_ITR_NONE; + struct i40e_hw *hw = &vsi->back->hw; +- u32 intval; ++ u16 interval = 0; ++ u32 itr_val; + + /* If we don't have MSIX, then we only need to re-enable icr0 */ + if (!test_bit(I40E_FLAG_MSIX_ENA, vsi->back->flags)) { +@@ -2702,8 +2729,8 @@ static inline void i40e_update_enable_it + */ + if (q_vector->rx.target_itr < q_vector->rx.current_itr) { + /* Rx ITR needs to be reduced, this is highest priority */ +- intval = i40e_buildreg_itr(I40E_RX_ITR, +- q_vector->rx.target_itr); ++ itr_idx = I40E_RX_ITR; ++ interval = q_vector->rx.target_itr; + q_vector->rx.current_itr = q_vector->rx.target_itr; + q_vector->itr_countdown = ITR_COUNTDOWN_START; + } else if ((q_vector->tx.target_itr < q_vector->tx.current_itr) || +@@ -2712,25 +2739,36 @@ static inline void i40e_update_enable_it + /* Tx ITR needs to be reduced, this is second priority + * Tx ITR needs to be increased more than Rx, fourth priority + */ +- intval = i40e_buildreg_itr(I40E_TX_ITR, +- q_vector->tx.target_itr); ++ itr_idx = I40E_TX_ITR; ++ interval = q_vector->tx.target_itr; + q_vector->tx.current_itr = q_vector->tx.target_itr; + q_vector->itr_countdown = ITR_COUNTDOWN_START; + } else if (q_vector->rx.current_itr != q_vector->rx.target_itr) { + /* Rx ITR needs to be increased, third priority */ +- intval = i40e_buildreg_itr(I40E_RX_ITR, +- q_vector->rx.target_itr); ++ itr_idx = I40E_RX_ITR; ++ interval = q_vector->rx.target_itr; + q_vector->rx.current_itr = q_vector->rx.target_itr; + q_vector->itr_countdown = ITR_COUNTDOWN_START; + } else { + /* No ITR update, lowest priority */ +- intval = i40e_buildreg_itr(I40E_ITR_NONE, 0); + if (q_vector->itr_countdown) + q_vector->itr_countdown--; + } + +- if (!test_bit(__I40E_VSI_DOWN, vsi->state)) +- wr32(hw, 
INTREG(q_vector->reg_idx), intval); ++ /* Do not update interrupt control register if VSI is down */ ++ if (test_bit(__I40E_VSI_DOWN, vsi->state)) ++ return; ++ ++ /* Update ITR interval if necessary and enforce software interrupt ++ * if we are exiting busy poll. ++ */ ++ if (q_vector->in_busy_poll) { ++ itr_val = i40e_buildreg_itr(itr_idx, interval, true); ++ q_vector->in_busy_poll = false; ++ } else { ++ itr_val = i40e_buildreg_itr(itr_idx, interval, false); ++ } ++ wr32(hw, I40E_PFINT_DYN_CTLN(q_vector->reg_idx), itr_val); + } + + /** +@@ -2845,6 +2883,8 @@ tx_only: + */ + if (likely(napi_complete_done(napi, work_done))) + i40e_update_enable_itr(vsi, q_vector); ++ else ++ q_vector->in_busy_poll = true; + + return min(work_done, budget - 1); + } +--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h ++++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h +@@ -68,6 +68,7 @@ enum i40e_dyn_idx { + /* these are indexes into ITRN registers */ + #define I40E_RX_ITR I40E_IDX_ITR0 + #define I40E_TX_ITR I40E_IDX_ITR1 ++#define I40E_SW_ITR I40E_IDX_ITR2 + + /* Supported RSS offloads */ + #define I40E_DEFAULT_RSS_HENA ( \ diff --git a/queue-6.8/i40e-fix-i40e_count_filters-to-count-only-active-new-filters.patch b/queue-6.8/i40e-fix-i40e_count_filters-to-count-only-active-new-filters.patch new file mode 100644 index 00000000000..2da4f9cfb3a --- /dev/null +++ b/queue-6.8/i40e-fix-i40e_count_filters-to-count-only-active-new-filters.patch @@ -0,0 +1,44 @@ +From eb58c598ce45b7e787568fe27016260417c3d807 Mon Sep 17 00:00:00 2001 +From: Aleksandr Loktionov +Date: Wed, 13 Mar 2024 10:44:00 +0100 +Subject: i40e: fix i40e_count_filters() to count only active/new filters + +From: Aleksandr Loktionov + +commit eb58c598ce45b7e787568fe27016260417c3d807 upstream. + +The bug usually affects untrusted VFs, because they are limited to 18 MACs, +it affects them badly, not letting to create MAC all filters. 
+Not stable to reproduce, it happens when VF user creates MAC filters +when other MACVLAN operations are happening in parallel. +But the consequence is that VF can't receive desired traffic. + +Fix counter to be bumped only for new or active filters. + +Fixes: 621650cabee5 ("i40e: Refactoring VF MAC filters counting to make more reliable") +Signed-off-by: Aleksandr Loktionov +Reviewed-by: Arkadiusz Kubalewski +Reviewed-by: Paul Menzel +Tested-by: Rafal Romanowski +Signed-off-by: Tony Nguyen +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/intel/i40e/i40e_main.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/intel/i40e/i40e_main.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_main.c +@@ -1257,8 +1257,11 @@ int i40e_count_filters(struct i40e_vsi * + int bkt; + int cnt = 0; + +- hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) +- ++cnt; ++ hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) { ++ if (f->state == I40E_FILTER_NEW || ++ f->state == I40E_FILTER_ACTIVE) ++ ++cnt; ++ } + + return cnt; + } diff --git a/queue-6.8/i40e-fix-vf-mac-filter-removal.patch b/queue-6.8/i40e-fix-vf-mac-filter-removal.patch new file mode 100644 index 00000000000..8319316a838 --- /dev/null +++ b/queue-6.8/i40e-fix-vf-mac-filter-removal.patch @@ -0,0 +1,65 @@ +From ea2a1cfc3b2019bdea6324acd3c03606b60d71ad Mon Sep 17 00:00:00 2001 +From: Ivan Vecera +Date: Fri, 29 Mar 2024 11:06:37 -0700 +Subject: i40e: Fix VF MAC filter removal + +From: Ivan Vecera + +commit ea2a1cfc3b2019bdea6324acd3c03606b60d71ad upstream. + +Commit 73d9629e1c8c ("i40e: Do not allow untrusted VF to remove +administratively set MAC") fixed an issue where untrusted VF was +allowed to remove its own MAC address although this was assigned +administratively from PF. Unfortunately the introduced check +is wrong because it causes that MAC filters for other MAC addresses +including multi-cast ones are not removed. 
+ + + if (ether_addr_equal(addr, vf->default_lan_addr.addr) && + i40e_can_vf_change_mac(vf)) + was_unimac_deleted = true; + else + continue; + + if (i40e_del_mac_filter(vsi, al->list[i].addr)) { + ... + + +The else path with `continue` effectively skips any MAC filter +removal except one for primary MAC addr when VF is allowed to do so. +Fix the check condition so the `continue` is only done for primary +MAC address. + +Fixes: 73d9629e1c8c ("i40e: Do not allow untrusted VF to remove administratively set MAC") +Signed-off-by: Ivan Vecera +Reviewed-by: Michal Schmidt +Reviewed-by: Brett Creeley +Tested-by: Rafal Romanowski +Signed-off-by: Tony Nguyen +Link: https://lore.kernel.org/r/20240329180638.211412-1-anthony.l.nguyen@intel.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +@@ -3143,11 +3143,12 @@ static int i40e_vc_del_mac_addr_msg(stru + /* Allow to delete VF primary MAC only if it was not set + * administratively by PF or if VF is trusted. 
+ */ +- if (ether_addr_equal(addr, vf->default_lan_addr.addr) && +- i40e_can_vf_change_mac(vf)) +- was_unimac_deleted = true; +- else +- continue; ++ if (ether_addr_equal(addr, vf->default_lan_addr.addr)) { ++ if (i40e_can_vf_change_mac(vf)) ++ was_unimac_deleted = true; ++ else ++ continue; ++ } + + if (i40e_del_mac_filter(vsi, al->list[i].addr)) { + ret = -EINVAL; diff --git a/queue-6.8/i40e-fix-vf-may-be-used-uninitialized-in-this-function-warning.patch b/queue-6.8/i40e-fix-vf-may-be-used-uninitialized-in-this-function-warning.patch new file mode 100644 index 00000000000..20371901940 --- /dev/null +++ b/queue-6.8/i40e-fix-vf-may-be-used-uninitialized-in-this-function-warning.patch @@ -0,0 +1,146 @@ +From f37c4eac99c258111d414d31b740437e1925b8e8 Mon Sep 17 00:00:00 2001 +From: Aleksandr Loktionov +Date: Wed, 13 Mar 2024 10:56:39 +0100 +Subject: i40e: fix vf may be used uninitialized in this function warning + +From: Aleksandr Loktionov + +commit f37c4eac99c258111d414d31b740437e1925b8e8 upstream. + +This fixes the regression introduced by commit 52424f974bc5, which causes +servers to hang in very hard to reproduce conditions with reset races. +Using two sources for the information is the root cause. +In this function before the fix bumping v didn't mean bumping vf +pointer. But the code used these variables interchangeably, so stale vf +could point to different/not intended vf. + +Remove redundant "v" variable and iterate via single VF pointer across +whole function instead to guarantee VF pointer validity. 
+ +Fixes: 52424f974bc5 ("i40e: Fix VF hang when reset is triggered on another VF") +Signed-off-by: Aleksandr Loktionov +Reviewed-by: Arkadiusz Kubalewski +Reviewed-by: Przemek Kitszel +Reviewed-by: Paul Menzel +Tested-by: Rafal Romanowski +Signed-off-by: Tony Nguyen +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 34 +++++++++------------ + 1 file changed, 16 insertions(+), 18 deletions(-) + +--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +@@ -1628,8 +1628,8 @@ bool i40e_reset_all_vfs(struct i40e_pf * + { + struct i40e_hw *hw = &pf->hw; + struct i40e_vf *vf; +- int i, v; + u32 reg; ++ int i; + + /* If we don't have any VFs, then there is nothing to reset */ + if (!pf->num_alloc_vfs) +@@ -1640,11 +1640,10 @@ bool i40e_reset_all_vfs(struct i40e_pf * + return false; + + /* Begin reset on all VFs at once */ +- for (v = 0; v < pf->num_alloc_vfs; v++) { +- vf = &pf->vf[v]; ++ for (vf = &pf->vf[0]; vf < &pf->vf[pf->num_alloc_vfs]; ++vf) { + /* If VF is being reset no need to trigger reset again */ + if (!test_bit(I40E_VF_STATE_RESETTING, &vf->vf_states)) +- i40e_trigger_vf_reset(&pf->vf[v], flr); ++ i40e_trigger_vf_reset(vf, flr); + } + + /* HW requires some time to make sure it can flush the FIFO for a VF +@@ -1653,14 +1652,13 @@ bool i40e_reset_all_vfs(struct i40e_pf * + * the VFs using a simple iterator that increments once that VF has + * finished resetting. + */ +- for (i = 0, v = 0; i < 10 && v < pf->num_alloc_vfs; i++) { ++ for (i = 0, vf = &pf->vf[0]; i < 10 && vf < &pf->vf[pf->num_alloc_vfs]; ++i) { + usleep_range(10000, 20000); + + /* Check each VF in sequence, beginning with the VF to fail + * the previous check. 
+ */ +- while (v < pf->num_alloc_vfs) { +- vf = &pf->vf[v]; ++ while (vf < &pf->vf[pf->num_alloc_vfs]) { + if (!test_bit(I40E_VF_STATE_RESETTING, &vf->vf_states)) { + reg = rd32(hw, I40E_VPGEN_VFRSTAT(vf->vf_id)); + if (!(reg & I40E_VPGEN_VFRSTAT_VFRD_MASK)) +@@ -1670,7 +1668,7 @@ bool i40e_reset_all_vfs(struct i40e_pf * + /* If the current VF has finished resetting, move on + * to the next VF in sequence. + */ +- v++; ++ ++vf; + } + } + +@@ -1680,39 +1678,39 @@ bool i40e_reset_all_vfs(struct i40e_pf * + /* Display a warning if at least one VF didn't manage to reset in + * time, but continue on with the operation. + */ +- if (v < pf->num_alloc_vfs) ++ if (vf < &pf->vf[pf->num_alloc_vfs]) + dev_err(&pf->pdev->dev, "VF reset check timeout on VF %d\n", +- pf->vf[v].vf_id); ++ vf->vf_id); + usleep_range(10000, 20000); + + /* Begin disabling all the rings associated with VFs, but do not wait + * between each VF. + */ +- for (v = 0; v < pf->num_alloc_vfs; v++) { ++ for (vf = &pf->vf[0]; vf < &pf->vf[pf->num_alloc_vfs]; ++vf) { + /* On initial reset, we don't have any queues to disable */ +- if (pf->vf[v].lan_vsi_idx == 0) ++ if (vf->lan_vsi_idx == 0) + continue; + + /* If VF is reset in another thread just continue */ + if (test_bit(I40E_VF_STATE_RESETTING, &vf->vf_states)) + continue; + +- i40e_vsi_stop_rings_no_wait(pf->vsi[pf->vf[v].lan_vsi_idx]); ++ i40e_vsi_stop_rings_no_wait(pf->vsi[vf->lan_vsi_idx]); + } + + /* Now that we've notified HW to disable all of the VF rings, wait + * until they finish. 
+ */ +- for (v = 0; v < pf->num_alloc_vfs; v++) { ++ for (vf = &pf->vf[0]; vf < &pf->vf[pf->num_alloc_vfs]; ++vf) { + /* On initial reset, we don't have any queues to disable */ +- if (pf->vf[v].lan_vsi_idx == 0) ++ if (vf->lan_vsi_idx == 0) + continue; + + /* If VF is reset in another thread just continue */ + if (test_bit(I40E_VF_STATE_RESETTING, &vf->vf_states)) + continue; + +- i40e_vsi_wait_queues_disabled(pf->vsi[pf->vf[v].lan_vsi_idx]); ++ i40e_vsi_wait_queues_disabled(pf->vsi[vf->lan_vsi_idx]); + } + + /* Hw may need up to 50ms to finish disabling the RX queues. We +@@ -1721,12 +1719,12 @@ bool i40e_reset_all_vfs(struct i40e_pf * + mdelay(50); + + /* Finish the reset on each VF */ +- for (v = 0; v < pf->num_alloc_vfs; v++) { ++ for (vf = &pf->vf[0]; vf < &pf->vf[pf->num_alloc_vfs]; ++vf) { + /* If VF is reset in another thread just continue */ + if (test_bit(I40E_VF_STATE_RESETTING, &vf->vf_states)) + continue; + +- i40e_cleanup_reset_vf(&pf->vf[v]); ++ i40e_cleanup_reset_vf(vf); + } + + i40e_flush(hw); diff --git a/queue-6.8/ice-fix-enabling-rx-vlan-filtering.patch b/queue-6.8/ice-fix-enabling-rx-vlan-filtering.patch new file mode 100644 index 00000000000..b1d904e5f08 --- /dev/null +++ b/queue-6.8/ice-fix-enabling-rx-vlan-filtering.patch @@ -0,0 +1,65 @@ +From 8edfc7a40e3300fc6c5fa7a3228a24d5bcd86ba5 Mon Sep 17 00:00:00 2001 +From: Petr Oros +Date: Mon, 25 Mar 2024 21:19:01 +0100 +Subject: ice: fix enabling RX VLAN filtering + +From: Petr Oros + +commit 8edfc7a40e3300fc6c5fa7a3228a24d5bcd86ba5 upstream. + +ice_port_vlan_on/off() was introduced in commit 2946204b3fa8 ("ice: +implement bridge port vlan"). But ice_port_vlan_on() incorrectly assigns +ena_rx_filtering to inner_vlan_ops in DVM mode. +This causes an error when rx_filtering cannot be enabled in legacy mode. 
+ +Reproducer: + echo 1 > /sys/class/net/$PF/device/sriov_numvfs + ip link set $PF vf 0 spoofchk off trust on vlan 3 +dmesg: + ice 0000:41:00.0: failed to enable Rx VLAN filtering for VF 0 VSI 9 during VF rebuild, error -95 + +Fixes: 2946204b3fa8 ("ice: implement bridge port vlan") +Signed-off-by: Petr Oros +Reviewed-by: Michal Swiatkowski +Tested-by: Rafal Romanowski +Signed-off-by: Tony Nguyen +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c | 18 ++++++++---------- + 1 file changed, 8 insertions(+), 10 deletions(-) + +--- a/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c ++++ b/drivers/net/ethernet/intel/ice/ice_vf_vsi_vlan_ops.c +@@ -26,24 +26,22 @@ static void ice_port_vlan_on(struct ice_ + struct ice_vsi_vlan_ops *vlan_ops; + struct ice_pf *pf = vsi->back; + +- if (ice_is_dvm_ena(&pf->hw)) { +- vlan_ops = &vsi->outer_vlan_ops; ++ /* setup inner VLAN ops */ ++ vlan_ops = &vsi->inner_vlan_ops; + +- /* setup outer VLAN ops */ +- vlan_ops->set_port_vlan = ice_vsi_set_outer_port_vlan; +- vlan_ops->clear_port_vlan = ice_vsi_clear_outer_port_vlan; +- +- /* setup inner VLAN ops */ +- vlan_ops = &vsi->inner_vlan_ops; ++ if (ice_is_dvm_ena(&pf->hw)) { + vlan_ops->add_vlan = noop_vlan_arg; + vlan_ops->del_vlan = noop_vlan_arg; + vlan_ops->ena_stripping = ice_vsi_ena_inner_stripping; + vlan_ops->dis_stripping = ice_vsi_dis_inner_stripping; + vlan_ops->ena_insertion = ice_vsi_ena_inner_insertion; + vlan_ops->dis_insertion = ice_vsi_dis_inner_insertion; +- } else { +- vlan_ops = &vsi->inner_vlan_ops; + ++ /* setup outer VLAN ops */ ++ vlan_ops = &vsi->outer_vlan_ops; ++ vlan_ops->set_port_vlan = ice_vsi_set_outer_port_vlan; ++ vlan_ops->clear_port_vlan = ice_vsi_clear_outer_port_vlan; ++ } else { + vlan_ops->set_port_vlan = ice_vsi_set_inner_port_vlan; + vlan_ops->clear_port_vlan = ice_vsi_clear_inner_port_vlan; + } diff --git a/queue-6.8/idpf-fix-kernel-panic-on-unknown-packet-types.patch 
b/queue-6.8/idpf-fix-kernel-panic-on-unknown-packet-types.patch new file mode 100644 index 00000000000..b21ac97fb13 --- /dev/null +++ b/queue-6.8/idpf-fix-kernel-panic-on-unknown-packet-types.patch @@ -0,0 +1,52 @@ +From dd19e827d63ac60debf117676d1126bff884bdb8 Mon Sep 17 00:00:00 2001 +From: Joshua Hay +Date: Wed, 20 Mar 2024 17:09:25 -0700 +Subject: idpf: fix kernel panic on unknown packet types + +From: Joshua Hay + +commit dd19e827d63ac60debf117676d1126bff884bdb8 upstream. + +In the very rare case where a packet type is unknown to the driver, +idpf_rx_process_skb_fields would return early without calling +eth_type_trans to set the skb protocol / the network layer handler. +This is especially problematic if tcpdump is running when such a +packet is received, i.e. it would cause a kernel panic. + +Instead, call eth_type_trans for every single packet, even when +the packet type is unknown. + +Fixes: 3a8845af66ed ("idpf: add RX splitq napi poll support") +Reported-by: Balazs Nemeth +Signed-off-by: Joshua Hay +Reviewed-by: Jesse Brandeburg +Reviewed-by: Przemek Kitszel +Tested-by: Salvatore Daniele +Signed-off-by: Pavan Kumar Linga +Tested-by: Krishneil Singh +Signed-off-by: Tony Nguyen +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/intel/idpf/idpf_txrx.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c ++++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c +@@ -2940,6 +2940,8 @@ static int idpf_rx_process_skb_fields(st + rx_ptype = le16_get_bits(rx_desc->ptype_err_fflags0, + VIRTCHNL2_RX_FLEX_DESC_ADV_PTYPE_M); + ++ skb->protocol = eth_type_trans(skb, rxq->vport->netdev); ++ + decoded = rxq->vport->rx_ptype_lkup[rx_ptype]; + /* If we don't know the ptype we can't do anything else with it. Just + * pass it up the stack as-is. 
+@@ -2950,8 +2952,6 @@ static int idpf_rx_process_skb_fields(st + /* process RSS/hash */ + idpf_rx_hash(rxq, skb, rx_desc, &decoded); + +- skb->protocol = eth_type_trans(skb, rxq->vport->netdev); +- + if (le16_get_bits(rx_desc->hdrlen_flags, + VIRTCHNL2_RX_FLEX_DESC_ADV_RSC_M)) + return idpf_rx_rsc(rxq, skb, rx_desc, &decoded); diff --git a/queue-6.8/ipv6-fix-infinite-recursion-in-fib6_dump_done.patch b/queue-6.8/ipv6-fix-infinite-recursion-in-fib6_dump_done.patch new file mode 100644 index 00000000000..780311e6e02 --- /dev/null +++ b/queue-6.8/ipv6-fix-infinite-recursion-in-fib6_dump_done.patch @@ -0,0 +1,134 @@ +From d21d40605bca7bd5fc23ef03d4c1ca1f48bc2cae Mon Sep 17 00:00:00 2001 +From: Kuniyuki Iwashima +Date: Mon, 1 Apr 2024 14:10:04 -0700 +Subject: ipv6: Fix infinite recursion in fib6_dump_done(). + +From: Kuniyuki Iwashima + +commit d21d40605bca7bd5fc23ef03d4c1ca1f48bc2cae upstream. + +syzkaller reported infinite recursive calls of fib6_dump_done() during +netlink socket destruction. [1] + +From the log, syzkaller sent an AF_UNSPEC RTM_GETROUTE message, and then +the response was generated. The following recvmmsg() resumed the dump +for IPv6, but the first call of inet6_dump_fib() failed at kzalloc() due +to the fault injection. [0] + + 12:01:34 executing program 3: + r0 = socket$nl_route(0x10, 0x3, 0x0) + sendmsg$nl_route(r0, ... snip ...) + recvmmsg(r0, ... snip ...) (fail_nth: 8) + +Here, fib6_dump_done() was set to nlk_sk(sk)->cb.done, and the next call +of inet6_dump_fib() set it to nlk_sk(sk)->cb.args[3]. syzkaller stopped +receiving the response halfway through, and finally netlink_sock_destruct() +called nlk_sk(sk)->cb.done(). + +fib6_dump_done() calls fib6_dump_end() and nlk_sk(sk)->cb.done() if it +is still not NULL. fib6_dump_end() rewrites nlk_sk(sk)->cb.done() by +nlk_sk(sk)->cb.args[3], but it has the same function, not NULL, calling +itself recursively and hitting the stack guard page. 
+ +To avoid the issue, let's set the destructor after kzalloc(). + +[0]: +FAULT_INJECTION: forcing a failure. +name failslab, interval 1, probability 0, space 0, times 0 +CPU: 1 PID: 432110 Comm: syz-executor.3 Not tainted 6.8.0-12821-g537c2e91d354-dirty #11 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 +Call Trace: + + dump_stack_lvl (lib/dump_stack.c:117) + should_fail_ex (lib/fault-inject.c:52 lib/fault-inject.c:153) + should_failslab (mm/slub.c:3733) + kmalloc_trace (mm/slub.c:3748 mm/slub.c:3827 mm/slub.c:3992) + inet6_dump_fib (./include/linux/slab.h:628 ./include/linux/slab.h:749 net/ipv6/ip6_fib.c:662) + rtnl_dump_all (net/core/rtnetlink.c:4029) + netlink_dump (net/netlink/af_netlink.c:2269) + netlink_recvmsg (net/netlink/af_netlink.c:1988) + ____sys_recvmsg (net/socket.c:1046 net/socket.c:2801) + ___sys_recvmsg (net/socket.c:2846) + do_recvmmsg (net/socket.c:2943) + __x64_sys_recvmmsg (net/socket.c:3041 net/socket.c:3034 net/socket.c:3034) + +[1]: +BUG: TASK stack guard page was hit at 00000000f2fa9af1 (stack is 00000000b7912430..000000009a436beb) +stack guard page: 0000 [#1] PREEMPT SMP KASAN +CPU: 1 PID: 223719 Comm: kworker/1:3 Not tainted 6.8.0-12821-g537c2e91d354-dirty #11 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 +Workqueue: events netlink_sock_destruct_work +RIP: 0010:fib6_dump_done (net/ipv6/ip6_fib.c:570) +Code: 3c 24 e8 f3 e9 51 fd e9 28 fd ff ff 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 f3 0f 1e fa 41 57 41 56 41 55 41 54 55 48 89 fd <53> 48 8d 5d 60 e8 b6 4d 07 fd 48 89 da 48 b8 00 00 00 00 00 fc ff +RSP: 0018:ffffc9000d980000 EFLAGS: 00010293 +RAX: 0000000000000000 RBX: ffffffff84405990 RCX: ffffffff844059d3 +RDX: ffff8881028e0000 RSI: ffffffff84405ac2 RDI: ffff88810c02f358 +RBP: ffff88810c02f358 R08: 0000000000000007 R09: 0000000000000000 +R10: 0000000000000000 R11: 0000000000000224 R12: 
0000000000000000 +R13: ffff888007c82c78 R14: ffff888007c82c68 R15: ffff888007c82c68 +FS: 0000000000000000(0000) GS:ffff88811b100000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: ffffc9000d97fff8 CR3: 0000000102309002 CR4: 0000000000770ef0 +PKRU: 55555554 +Call Trace: + <#DF> + + + fib6_dump_done (net/ipv6/ip6_fib.c:572 (discriminator 1)) + fib6_dump_done (net/ipv6/ip6_fib.c:572 (discriminator 1)) + ... + fib6_dump_done (net/ipv6/ip6_fib.c:572 (discriminator 1)) + fib6_dump_done (net/ipv6/ip6_fib.c:572 (discriminator 1)) + netlink_sock_destruct (net/netlink/af_netlink.c:401) + __sk_destruct (net/core/sock.c:2177 (discriminator 2)) + sk_destruct (net/core/sock.c:2224) + __sk_free (net/core/sock.c:2235) + sk_free (net/core/sock.c:2246) + process_one_work (kernel/workqueue.c:3259) + worker_thread (kernel/workqueue.c:3329 kernel/workqueue.c:3416) + kthread (kernel/kthread.c:388) + ret_from_fork (arch/x86/kernel/process.c:153) + ret_from_fork_asm (arch/x86/entry/entry_64.S:256) +Modules linked in: + +Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") +Reported-by: syzkaller +Signed-off-by: Kuniyuki Iwashima +Reviewed-by: Eric Dumazet +Reviewed-by: David Ahern +Link: https://lore.kernel.org/r/20240401211003.25274-1-kuniyu@amazon.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/ip6_fib.c | 14 +++++++------- + 1 file changed, 7 insertions(+), 7 deletions(-) + +--- a/net/ipv6/ip6_fib.c ++++ b/net/ipv6/ip6_fib.c +@@ -645,19 +645,19 @@ static int inet6_dump_fib(struct sk_buff + if (!w) { + /* New dump: + * +- * 1. hook callback destructor. +- */ +- cb->args[3] = (long)cb->done; +- cb->done = fib6_dump_done; +- +- /* +- * 2. allocate and initialize walker. ++ * 1. allocate and initialize walker. + */ + w = kzalloc(sizeof(*w), GFP_ATOMIC); + if (!w) + return -ENOMEM; + w->func = fib6_dump_node; + cb->args[2] = (long)w; ++ ++ /* 2. hook callback destructor. 
++ */ ++ cb->args[3] = (long)cb->done; ++ cb->done = fib6_dump_done; ++ + } + + arg.skb = skb; diff --git a/queue-6.8/mlxbf_gige-stop-interface-during-shutdown.patch b/queue-6.8/mlxbf_gige-stop-interface-during-shutdown.patch new file mode 100644 index 00000000000..6dcbcc86348 --- /dev/null +++ b/queue-6.8/mlxbf_gige-stop-interface-during-shutdown.patch @@ -0,0 +1,109 @@ +From 09ba28e1cd3cf715daab1fca6e1623e22fd754a6 Mon Sep 17 00:00:00 2001 +From: David Thompson +Date: Mon, 25 Mar 2024 17:09:29 -0400 +Subject: mlxbf_gige: stop interface during shutdown + +From: David Thompson + +commit 09ba28e1cd3cf715daab1fca6e1623e22fd754a6 upstream. + +The mlxbf_gige driver intermittently encounters a NULL pointer +exception while the system is shutting down via "reboot" command. +The mlxbf_gige driver will experience an exception right after executing +its shutdown() method. One example of this exception is: + +Unable to handle kernel NULL pointer dereference at virtual address 0000000000000070 +Mem abort info: + ESR = 0x0000000096000004 + EC = 0x25: DABT (current EL), IL = 32 bits + SET = 0, FnV = 0 + EA = 0, S1PTW = 0 + FSC = 0x04: level 0 translation fault +Data abort info: + ISV = 0, ISS = 0x00000004 + CM = 0, WnR = 0 +user pgtable: 4k pages, 48-bit VAs, pgdp=000000011d373000 +[0000000000000070] pgd=0000000000000000, p4d=0000000000000000 +Internal error: Oops: 96000004 [#1] SMP +CPU: 0 PID: 13 Comm: ksoftirqd/0 Tainted: G S OE 5.15.0-bf.6.gef6992a #1 +Hardware name: https://www.mellanox.com BlueField SoC/BlueField SoC, BIOS 4.0.2.12669 Apr 21 2023 +pstate: 20400009 (nzCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) +pc : mlxbf_gige_handle_tx_complete+0xc8/0x170 [mlxbf_gige] +lr : mlxbf_gige_poll+0x54/0x160 [mlxbf_gige] +sp : ffff8000080d3c10 +x29: ffff8000080d3c10 x28: ffffcce72cbb7000 x27: ffff8000080d3d58 +x26: ffff0000814e7340 x25: ffff331cd1a05000 x24: ffffcce72c4ea008 +x23: ffff0000814e4b40 x22: ffff0000814e4d10 x21: ffff0000814e4128 +x20: 0000000000000000 x19: 
ffff0000814e4a80 x18: ffffffffffffffff +x17: 000000000000001c x16: ffffcce72b4553f4 x15: ffff80008805b8a7 +x14: 0000000000000000 x13: 0000000000000030 x12: 0101010101010101 +x11: 7f7f7f7f7f7f7f7f x10: c2ac898b17576267 x9 : ffffcce720fa5404 +x8 : ffff000080812138 x7 : 0000000000002e9a x6 : 0000000000000080 +x5 : ffff00008de3b000 x4 : 0000000000000000 x3 : 0000000000000001 +x2 : 0000000000000000 x1 : 0000000000000000 x0 : 0000000000000000 +Call trace: + mlxbf_gige_handle_tx_complete+0xc8/0x170 [mlxbf_gige] + mlxbf_gige_poll+0x54/0x160 [mlxbf_gige] + __napi_poll+0x40/0x1c8 + net_rx_action+0x314/0x3a0 + __do_softirq+0x128/0x334 + run_ksoftirqd+0x54/0x6c + smpboot_thread_fn+0x14c/0x190 + kthread+0x10c/0x110 + ret_from_fork+0x10/0x20 +Code: 8b070000 f9000ea0 f95056c0 f86178a1 (b9407002) +---[ end trace 7cc3941aa0d8e6a4 ]--- +Kernel panic - not syncing: Oops: Fatal exception in interrupt +Kernel Offset: 0x4ce722520000 from 0xffff800008000000 +PHYS_OFFSET: 0x80000000 +CPU features: 0x000005c1,a3330e5a +Memory Limit: none +---[ end Kernel panic - not syncing: Oops: Fatal exception in interrupt ]--- + +During system shutdown, the mlxbf_gige driver's shutdown() is always executed. +However, the driver's stop() method will only execute if networking interface +configuration logic within the Linux distribution has been setup to do so. + +If shutdown() executes but stop() does not execute, NAPI remains enabled +and this can lead to an exception if NAPI is scheduled while the hardware +interface has only been partially deinitialized. + +The networking interface managed by the mlxbf_gige driver must be properly +stopped during system shutdown so that IFF_UP is cleared, the hardware +interface is put into a clean state, and NAPI is fully deinitialized. 
+ +Fixes: f92e1869d74e ("Add Mellanox BlueField Gigabit Ethernet driver") +Signed-off-by: David Thompson +Link: https://lore.kernel.org/r/20240325210929.25362-1-davthompson@nvidia.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c ++++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c +@@ -14,6 +14,7 @@ + #include + #include + #include ++#include + #include + + #include "mlxbf_gige.h" +@@ -492,8 +493,13 @@ static void mlxbf_gige_shutdown(struct p + { + struct mlxbf_gige *priv = platform_get_drvdata(pdev); + +- writeq(0, priv->base + MLXBF_GIGE_INT_EN); +- mlxbf_gige_clean_port(priv); ++ rtnl_lock(); ++ netif_device_detach(priv->netdev); ++ ++ if (netif_running(priv->netdev)) ++ dev_close(priv->netdev); ++ ++ rtnl_unlock(); + } + + static const struct acpi_device_id __maybe_unused mlxbf_gige_acpi_match[] = { diff --git a/queue-6.8/octeontx2-af-add-array-index-check.patch b/queue-6.8/octeontx2-af-add-array-index-check.patch new file mode 100644 index 00000000000..57ba30ee3d4 --- /dev/null +++ b/queue-6.8/octeontx2-af-add-array-index-check.patch @@ -0,0 +1,34 @@ +From ef15ddeeb6bee87c044bf7754fac524545bf71e8 Mon Sep 17 00:00:00 2001 +From: Aleksandr Mishin +Date: Thu, 28 Mar 2024 19:55:05 +0300 +Subject: octeontx2-af: Add array index check + +From: Aleksandr Mishin + +commit ef15ddeeb6bee87c044bf7754fac524545bf71e8 upstream. + +In rvu_map_cgx_lmac_pf() the 'iter', which is used as an array index, can reach +value (up to 14) that exceed the size (MAX_LMAC_COUNT = 8) of the array. +Fix this bug by adding 'iter' value check. + +Found by Linux Verification Center (linuxtesting.org) with SVACE. + +Fixes: 91c6945ea1f9 ("octeontx2-af: cn10k: Add RPM MAC support") +Signed-off-by: Aleksandr Mishin +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c ++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c +@@ -160,6 +160,8 @@ static int rvu_map_cgx_lmac_pf(struct rv + continue; + lmac_bmap = cgx_get_lmac_bmap(rvu_cgx_pdata(cgx, rvu)); + for_each_set_bit(iter, &lmac_bmap, rvu->hw->lmac_per_cgx) { ++ if (iter >= MAX_LMAC_COUNT) ++ continue; + lmac = cgx_get_lmacid(rvu_cgx_pdata(cgx, rvu), + iter); + rvu->pf2cgxlmac_map[pf] = cgxlmac_id_to_bmap(cgx, lmac); diff --git a/queue-6.8/octeontx2-af-fix-issue-with-loading-coalesced-kpu-profiles.patch b/queue-6.8/octeontx2-af-fix-issue-with-loading-coalesced-kpu-profiles.patch new file mode 100644 index 00000000000..2064c687c15 --- /dev/null +++ b/queue-6.8/octeontx2-af-fix-issue-with-loading-coalesced-kpu-profiles.patch @@ -0,0 +1,36 @@ +From 0ba80d96585662299d4ea4624043759ce9015421 Mon Sep 17 00:00:00 2001 +From: Hariprasad Kelam +Date: Tue, 26 Mar 2024 17:51:49 +0530 +Subject: octeontx2-af: Fix issue with loading coalesced KPU profiles + +From: Hariprasad Kelam + +commit 0ba80d96585662299d4ea4624043759ce9015421 upstream. + +The current implementation for loading coalesced KPU profiles has +a limitation. The "offset" field, which is used to locate profiles +within the profile is restricted to a u16. + +This restricts the number of profiles that can be loaded. This patch +addresses this limitation by increasing the size of the "offset" field. + +Fixes: 11c730bfbf5b ("octeontx2-af: support for coalescing KPU profiles") +Signed-off-by: Hariprasad Kelam +Reviewed-by: Kalesh AP +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c ++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c +@@ -1657,7 +1657,7 @@ static int npc_fwdb_detect_load_prfl_img + struct npc_coalesced_kpu_prfl *img_data = NULL; + int i = 0, rc = -EINVAL; + void __iomem *kpu_prfl_addr; +- u16 offset; ++ u32 offset; + + img_data = (struct npc_coalesced_kpu_prfl __force *)rvu->kpu_prfl_addr; + if (le64_to_cpu(img_data->signature) == KPU_SIGN && diff --git a/queue-6.8/octeontx2-pf-check-negative-error-code-in-otx2_open.patch b/queue-6.8/octeontx2-pf-check-negative-error-code-in-otx2_open.patch new file mode 100644 index 00000000000..ac5e92956bb --- /dev/null +++ b/queue-6.8/octeontx2-pf-check-negative-error-code-in-otx2_open.patch @@ -0,0 +1,35 @@ +From e709acbd84fb6ef32736331b0147f027a3ef4c20 Mon Sep 17 00:00:00 2001 +From: Su Hui +Date: Thu, 28 Mar 2024 10:06:21 +0800 +Subject: octeontx2-pf: check negative error code in otx2_open() + +From: Su Hui + +commit e709acbd84fb6ef32736331b0147f027a3ef4c20 upstream. + +otx2_rxtx_enable() return negative error code such as -EIO, +check -EIO rather than EIO to fix this problem. + +Fixes: c926252205c4 ("octeontx2-pf: Disable packet I/O for graceful exit") +Signed-off-by: Su Hui +Reviewed-by: Subbaraya Sundeep +Reviewed-by: Simon Horman +Reviewed-by: Kalesh AP +Link: https://lore.kernel.org/r/20240328020620.4054692-1-suhui@nfschina.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c ++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +@@ -1933,7 +1933,7 @@ int otx2_open(struct net_device *netdev) + * mcam entries are enabled to receive the packets. 
Hence disable the + * packet I/O. + */ +- if (err == EIO) ++ if (err == -EIO) + goto err_disable_rxtx; + else if (err) + goto err_tx_stop_queues; diff --git a/queue-6.8/r8169-skip-dash-fw-status-checks-when-dash-is-disabled.patch b/queue-6.8/r8169-skip-dash-fw-status-checks-when-dash-is-disabled.patch new file mode 100644 index 00000000000..75d4832e49e --- /dev/null +++ b/queue-6.8/r8169-skip-dash-fw-status-checks-when-dash-is-disabled.patch @@ -0,0 +1,99 @@ +From 5e864d90b20803edf6bd44a99fb9afa7171785f2 Mon Sep 17 00:00:00 2001 +From: Atlas Yu +Date: Thu, 28 Mar 2024 13:51:52 +0800 +Subject: r8169: skip DASH fw status checks when DASH is disabled + +From: Atlas Yu + +commit 5e864d90b20803edf6bd44a99fb9afa7171785f2 upstream. + +On devices that support DASH, the current code in the "rtl_loop_wait" function +raises false alarms when DASH is disabled. This occurs because the function +attempts to wait for the DASH firmware to be ready, even though it's not +relevant in this case. + +r8169 0000:0c:00.0 eth0: RTL8168ep/8111ep, 38:7c:76:49:08:d9, XID 502, IRQ 86 +r8169 0000:0c:00.0 eth0: jumbo features [frames: 9194 bytes, tx checksumming: ko] +r8169 0000:0c:00.0 eth0: DASH disabled +... +r8169 0000:0c:00.0 eth0: rtl_ep_ocp_read_cond == 0 (loop: 30, delay: 10000). + +This patch modifies the driver start/stop functions to skip checking the DASH +firmware status when DASH is explicitly disabled. This prevents unnecessary +delays and false alarms. + +The patch has been tested on several ThinkStation P8/PX workstations. 
+ +Fixes: 0ab0c45d8aae ("r8169: add handling DASH when DASH is disabled") +Signed-off-by: Atlas Yu +Reviewed-by: Heiner Kallweit +Link: https://lore.kernel.org/r/20240328055152.18443-1-atlas.yu@canonical.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/realtek/r8169_main.c | 31 ++++++++++++++++++++++++++---- + 1 file changed, 27 insertions(+), 4 deletions(-) + +--- a/drivers/net/ethernet/realtek/r8169_main.c ++++ b/drivers/net/ethernet/realtek/r8169_main.c +@@ -1255,17 +1255,40 @@ static void rtl8168ep_stop_cmac(struct r + RTL_W8(tp, IBCR0, RTL_R8(tp, IBCR0) & ~0x01); + } + ++static void rtl_dash_loop_wait(struct rtl8169_private *tp, ++ const struct rtl_cond *c, ++ unsigned long usecs, int n, bool high) ++{ ++ if (!tp->dash_enabled) ++ return; ++ rtl_loop_wait(tp, c, usecs, n, high); ++} ++ ++static void rtl_dash_loop_wait_high(struct rtl8169_private *tp, ++ const struct rtl_cond *c, ++ unsigned long d, int n) ++{ ++ rtl_dash_loop_wait(tp, c, d, n, true); ++} ++ ++static void rtl_dash_loop_wait_low(struct rtl8169_private *tp, ++ const struct rtl_cond *c, ++ unsigned long d, int n) ++{ ++ rtl_dash_loop_wait(tp, c, d, n, false); ++} ++ + static void rtl8168dp_driver_start(struct rtl8169_private *tp) + { + r8168dp_oob_notify(tp, OOB_CMD_DRIVER_START); +- rtl_loop_wait_high(tp, &rtl_dp_ocp_read_cond, 10000, 10); ++ rtl_dash_loop_wait_high(tp, &rtl_dp_ocp_read_cond, 10000, 10); + } + + static void rtl8168ep_driver_start(struct rtl8169_private *tp) + { + r8168ep_ocp_write(tp, 0x01, 0x180, OOB_CMD_DRIVER_START); + r8168ep_ocp_write(tp, 0x01, 0x30, r8168ep_ocp_read(tp, 0x30) | 0x01); +- rtl_loop_wait_high(tp, &rtl_ep_ocp_read_cond, 10000, 30); ++ rtl_dash_loop_wait_high(tp, &rtl_ep_ocp_read_cond, 10000, 30); + } + + static void rtl8168_driver_start(struct rtl8169_private *tp) +@@ -1279,7 +1302,7 @@ static void rtl8168_driver_start(struct + static void rtl8168dp_driver_stop(struct rtl8169_private *tp) + { + 
r8168dp_oob_notify(tp, OOB_CMD_DRIVER_STOP); +- rtl_loop_wait_low(tp, &rtl_dp_ocp_read_cond, 10000, 10); ++ rtl_dash_loop_wait_low(tp, &rtl_dp_ocp_read_cond, 10000, 10); + } + + static void rtl8168ep_driver_stop(struct rtl8169_private *tp) +@@ -1287,7 +1310,7 @@ static void rtl8168ep_driver_stop(struct + rtl8168ep_stop_cmac(tp); + r8168ep_ocp_write(tp, 0x01, 0x180, OOB_CMD_DRIVER_STOP); + r8168ep_ocp_write(tp, 0x01, 0x30, r8168ep_ocp_read(tp, 0x30) | 0x01); +- rtl_loop_wait_low(tp, &rtl_ep_ocp_read_cond, 10000, 10); ++ rtl_dash_loop_wait_low(tp, &rtl_ep_ocp_read_cond, 10000, 10); + } + + static void rtl8168_driver_stop(struct rtl8169_private *tp) diff --git a/queue-6.8/selftests-reuseaddr_conflict-add-missing-new-line-at-the-end-of-the-output.patch b/queue-6.8/selftests-reuseaddr_conflict-add-missing-new-line-at-the-end-of-the-output.patch new file mode 100644 index 00000000000..dcdc3e9857f --- /dev/null +++ b/queue-6.8/selftests-reuseaddr_conflict-add-missing-new-line-at-the-end-of-the-output.patch @@ -0,0 +1,39 @@ +From 31974122cfdeaf56abc18d8ab740d580d9833e90 Mon Sep 17 00:00:00 2001 +From: Jakub Kicinski +Date: Fri, 29 Mar 2024 09:05:59 -0700 +Subject: selftests: reuseaddr_conflict: add missing new line at the end of the output + +From: Jakub Kicinski + +commit 31974122cfdeaf56abc18d8ab740d580d9833e90 upstream. + +The netdev CI runs in a VM and captures serial, so stdout and +stderr get combined. 
Because there's a missing new line in +stderr the test ends up corrupting KTAP: + + # Successok 1 selftests: net: reuseaddr_conflict + +which should have been: + + # Success + ok 1 selftests: net: reuseaddr_conflict + +Fixes: 422d8dc6fd3a ("selftest: add a reuseaddr test") +Reviewed-by: Muhammad Usama Anjum +Link: https://lore.kernel.org/r/20240329160559.249476-1-kuba@kernel.org +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + tools/testing/selftests/net/reuseaddr_conflict.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/tools/testing/selftests/net/reuseaddr_conflict.c ++++ b/tools/testing/selftests/net/reuseaddr_conflict.c +@@ -109,6 +109,6 @@ int main(void) + fd1 = open_port(0, 1); + if (fd1 >= 0) + error(1, 0, "Was allowed to create an ipv4 reuseport on an already bound non-reuseport socket with no ipv6"); +- fprintf(stderr, "Success"); ++ fprintf(stderr, "Success\n"); + return 0; + } diff --git a/queue-6.8/series b/queue-6.8/series index eef244341f9..76c872658c2 100644 --- a/queue-6.8/series +++ b/queue-6.8/series @@ -106,3 +106,24 @@ net-phy-micrel-fix-potential-null-pointer-dereference.patch net-dsa-mv88e6xxx-fix-usable-ports-on-88e6020.patch selftests-net-gro-fwd-update-vxlan-gro-test-expectations.patch gro-fix-ownership-transfer.patch +idpf-fix-kernel-panic-on-unknown-packet-types.patch +ice-fix-enabling-rx-vlan-filtering.patch +i40e-fix-vf-mac-filter-removal.patch +tcp-fix-bind-regression-for-v6-only-wildcard-and-v4-mapped-v6-non-wildcard-addresses.patch +erspan-make-sure-erspan_base_hdr-is-present-in-skb-head.patch +selftests-reuseaddr_conflict-add-missing-new-line-at-the-end-of-the-output.patch +tcp-fix-bind-regression-for-v6-only-wildcard-and-v4-mapped-v6-non-wildcard-addresses-again.patch +ax25-fix-use-after-free-bugs-caused-by-ax25_ds_del_timer.patch +e1000e-workaround-for-sporadic-mdi-error-on-meteor-lake-systems.patch +ipv6-fix-infinite-recursion-in-fib6_dump_done.patch 
+mlxbf_gige-stop-interface-during-shutdown.patch +r8169-skip-dash-fw-status-checks-when-dash-is-disabled.patch +udp-do-not-accept-non-tunnel-gso-skbs-landing-in-a-tunnel.patch +udp-do-not-transition-udp-gro-fraglist-partial-checksums-to-unnecessary.patch +udp-prevent-local-udp-tunnel-packets-from-being-groed.patch +octeontx2-af-fix-issue-with-loading-coalesced-kpu-profiles.patch +octeontx2-pf-check-negative-error-code-in-otx2_open.patch +octeontx2-af-add-array-index-check.patch +i40e-fix-i40e_count_filters-to-count-only-active-new-filters.patch +i40e-fix-vf-may-be-used-uninitialized-in-this-function-warning.patch +i40e-enforce-software-interrupt-during-busy-poll-exit.patch diff --git a/queue-6.8/tcp-fix-bind-regression-for-v6-only-wildcard-and-v4-mapped-v6-non-wildcard-addresses-again.patch b/queue-6.8/tcp-fix-bind-regression-for-v6-only-wildcard-and-v4-mapped-v6-non-wildcard-addresses-again.patch new file mode 100644 index 00000000000..fdecf07ba49 --- /dev/null +++ b/queue-6.8/tcp-fix-bind-regression-for-v6-only-wildcard-and-v4-mapped-v6-non-wildcard-addresses-again.patch @@ -0,0 +1,94 @@ +From d91ef1e1b55f730bee8ce286b02b7bdccbc42973 Mon Sep 17 00:00:00 2001 +From: Kuniyuki Iwashima +Date: Tue, 26 Mar 2024 13:42:45 -0700 +Subject: tcp: Fix bind() regression for v6-only wildcard and v4(-mapped-v6) non-wildcard addresses. + +From: Kuniyuki Iwashima + +commit d91ef1e1b55f730bee8ce286b02b7bdccbc42973 upstream. + +Jianguo Wu reported another bind() regression introduced by bhash2. + +Calling bind() for the following 3 addresses on the same port, the +3rd one should fail but now succeeds. + + 1. 0.0.0.0 or ::ffff:0.0.0.0 + 2. [::] w/ IPV6_V6ONLY + 3. 
IPv4 non-wildcard address or v4-mapped-v6 non-wildcard address + +The first two bind() create tb2 like this: + + bhash2 -> tb2(:: w/ IPV6_V6ONLY) -> tb2(0.0.0.0) + +The 3rd bind() will match with the IPv6 only wildcard address bucket +in inet_bind2_bucket_match_addr_any(), however, no conflicting socket +exists in the bucket. So, inet_bhash2_conflict() will return false, +and thus, inet_bhash2_addr_any_conflict() returns false consequently. + +As a result, the 3rd bind() bypasses conflict check, which should be +done against the IPv4 wildcard address bucket. + +So, in inet_bhash2_addr_any_conflict(), we must iterate over all buckets. + +Note that we cannot add ipv6_only flag for inet_bind2_bucket as it +would confuse the following pattern. + + 1. [::] w/ SO_REUSE{ADDR,PORT} and IPV6_V6ONLY + 2. [::] w/ SO_REUSE{ADDR,PORT} + 3. IPv4 non-wildcard address or v4-mapped-v6 non-wildcard address + +The first bind() would create a bucket with ipv6_only flag true, +the second bind() would add the [::] socket into the same bucket, +and the third bind() could succeed based on the wrong assumption +that ipv6_only bucket would not conflict with v4(-mapped-v6) address. 
+ +Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address") +Diagnosed-by: Jianguo Wu +Signed-off-by: Kuniyuki Iwashima +Link: https://lore.kernel.org/r/20240326204251.51301-3-kuniyu@amazon.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/inet_connection_sock.c | 21 ++++++++++++--------- + 1 file changed, 12 insertions(+), 9 deletions(-) + +--- a/net/ipv4/inet_connection_sock.c ++++ b/net/ipv4/inet_connection_sock.c +@@ -294,6 +294,7 @@ static bool inet_bhash2_addr_any_conflic + struct sock_reuseport *reuseport_cb; + struct inet_bind_hashbucket *head2; + struct inet_bind2_bucket *tb2; ++ bool conflict = false; + bool reuseport_cb_ok; + + rcu_read_lock(); +@@ -306,18 +307,20 @@ static bool inet_bhash2_addr_any_conflic + + spin_lock(&head2->lock); + +- inet_bind_bucket_for_each(tb2, &head2->chain) +- if (inet_bind2_bucket_match_addr_any(tb2, net, port, l3mdev, sk)) +- break; +- +- if (tb2 && inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, +- reuseport_ok)) { +- spin_unlock(&head2->lock); +- return true; ++ inet_bind_bucket_for_each(tb2, &head2->chain) { ++ if (!inet_bind2_bucket_match_addr_any(tb2, net, port, l3mdev, sk)) ++ continue; ++ ++ if (!inet_bhash2_conflict(sk, tb2, uid, relax, reuseport_cb_ok, reuseport_ok)) ++ continue; ++ ++ conflict = true; ++ break; + } + + spin_unlock(&head2->lock); +- return false; ++ ++ return conflict; + } + + /* diff --git a/queue-6.8/tcp-fix-bind-regression-for-v6-only-wildcard-and-v4-mapped-v6-non-wildcard-addresses.patch b/queue-6.8/tcp-fix-bind-regression-for-v6-only-wildcard-and-v4-mapped-v6-non-wildcard-addresses.patch new file mode 100644 index 00000000000..144940bdc9c --- /dev/null +++ b/queue-6.8/tcp-fix-bind-regression-for-v6-only-wildcard-and-v4-mapped-v6-non-wildcard-addresses.patch @@ -0,0 +1,78 @@ +From ea111449501ea32bf6da82750de860243691efc7 Mon Sep 17 00:00:00 2001 +From: Kuniyuki Iwashima +Date: Tue, 26 Mar 2024 13:42:44 -0700 +Subject: tcp: Fix 
bind() regression for v6-only wildcard and v4-mapped-v6 non-wildcard addresses. + +From: Kuniyuki Iwashima + +commit ea111449501ea32bf6da82750de860243691efc7 upstream. + +Commit 5e07e672412b ("tcp: Use bhash2 for v4-mapped-v6 non-wildcard +address.") introduced bind() regression for v4-mapped-v6 address. + +When we bind() the following two addresses on the same port, the 2nd +bind() should succeed but fails now. + + 1. [::] w/ IPV6_ONLY + 2. ::ffff:127.0.0.1 + +After the change, v4-mapped-v6 uses bhash2 instead of bhash to +detect conflict faster, but I forgot to add a necessary change. + +During the 2nd bind(), inet_bind2_bucket_match_addr_any() returns +the tb2 bucket of [::], and inet_bhash2_conflict() finally calls +inet_bind_conflict(), which returns true, meaning conflict. + + inet_bhash2_addr_any_conflict + |- inet_bind2_bucket_match_addr_any <-- return [::] bucket + `- inet_bhash2_conflict + `- __inet_bhash2_conflict <-- checks IPV6_ONLY for AF_INET + | but not for v4-mapped-v6 address + `- inet_bind_conflict <-- does not check address + +inet_bind_conflict() does not check socket addresses because +__inet_bhash2_conflict() is expected to do so. + +However, it checks IPV6_V6ONLY attribute only against AF_INET +socket, and not for v4-mapped-v6 address. + +As a result, v4-mapped-v6 address conflicts with v6-only wildcard +address. + +To avoid that, let's add the missing test to use bhash2 for +v4-mapped-v6 address. 
+ +Fixes: 5e07e672412b ("tcp: Use bhash2 for v4-mapped-v6 non-wildcard address.") +Signed-off-by: Kuniyuki Iwashima +Link: https://lore.kernel.org/r/20240326204251.51301-2-kuniyu@amazon.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/inet_connection_sock.c | 11 +++++++++-- + 1 file changed, 9 insertions(+), 2 deletions(-) + +diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c +index c038e28e2f1e..4184d45f890c 100644 +--- a/net/ipv4/inet_connection_sock.c ++++ b/net/ipv4/inet_connection_sock.c +@@ -203,8 +203,15 @@ static bool __inet_bhash2_conflict(const struct sock *sk, struct sock *sk2, + kuid_t sk_uid, bool relax, + bool reuseport_cb_ok, bool reuseport_ok) + { +- if (sk->sk_family == AF_INET && ipv6_only_sock(sk2)) +- return false; ++ if (ipv6_only_sock(sk2)) { ++ if (sk->sk_family == AF_INET) ++ return false; ++ ++#if IS_ENABLED(CONFIG_IPV6) ++ if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) ++ return false; ++#endif ++ } + + return inet_bind_conflict(sk, sk2, sk_uid, relax, + reuseport_cb_ok, reuseport_ok); +-- +2.44.0 + diff --git a/queue-6.8/udp-do-not-accept-non-tunnel-gso-skbs-landing-in-a-tunnel.patch b/queue-6.8/udp-do-not-accept-non-tunnel-gso-skbs-landing-in-a-tunnel.patch new file mode 100644 index 00000000000..5499019ddec --- /dev/null +++ b/queue-6.8/udp-do-not-accept-non-tunnel-gso-skbs-landing-in-a-tunnel.patch @@ -0,0 +1,146 @@ +From 3d010c8031e39f5fa1e8b13ada77e0321091011f Mon Sep 17 00:00:00 2001 +From: Antoine Tenart +Date: Tue, 26 Mar 2024 12:33:58 +0100 +Subject: udp: do not accept non-tunnel GSO skbs landing in a tunnel + +From: Antoine Tenart + +commit 3d010c8031e39f5fa1e8b13ada77e0321091011f upstream. + +When rx-udp-gro-forwarding is enabled UDP packets might be GROed when +being forwarded. If such packets might land in a tunnel this can cause +various issues and udp_gro_receive makes sure this isn't the case by +looking for a matching socket. 
This is performed in +udp4/6_gro_lookup_skb but only in the current netns. This is an issue +with tunneled packets when the endpoint is in another netns. In such +cases the packets will be GROed at the UDP level, which leads to various +issues later on. The same thing can happen with rx-gro-list. + +We saw this with geneve packets being GROed at the UDP level. In such +case gso_size is set; later the packet goes through the geneve rx path, +the geneve header is pulled, the offset are adjusted and frag_list skbs +are not adjusted with regard to geneve. When those skbs hit +skb_fragment, it will misbehave. Different outcomes are possible +depending on what the GROed skbs look like; from corrupted packets to +kernel crashes. + +One example is a BUG_ON[1] triggered in skb_segment while processing the +frag_list. Because gso_size is wrong (geneve header was pulled) +skb_segment thinks there is "geneve header size" of data in frag_list, +although it's in fact the next packet. The BUG_ON itself has nothing to +do with the issue. This is only one of the potential issues. + +Looking up for a matching socket in udp_gro_receive is fragile: the +lookup could be extended to all netns (not speaking about performances) +but nothing prevents those packets from being modified in between and we +could still not find a matching socket. It's OK to keep the current +logic there as it should cover most cases but we also need to make sure +we handle tunnel packets being GROed too early. + +This is done by extending the checks in udp_unexpected_gso: GSO packets +lacking the SKB_GSO_UDP_TUNNEL/_CSUM bits and landing in a tunnel must +be segmented. + +[1] kernel BUG at net/core/skbuff.c:4408! 
+ RIP: 0010:skb_segment+0xd2a/0xf70 + __udp_gso_segment+0xaa/0x560 + +Fixes: 9fd1ff5d2ac7 ("udp: Support UDP fraglist GRO/GSO.") +Fixes: 36707061d6ba ("udp: allow forwarding of plain (non-fraglisted) UDP GRO packets") +Signed-off-by: Antoine Tenart +Reviewed-by: Willem de Bruijn +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/udp.h | 28 ++++++++++++++++++++++++++++ + net/ipv4/udp.c | 7 +++++++ + net/ipv4/udp_offload.c | 6 ++++-- + net/ipv6/udp.c | 2 +- + 4 files changed, 40 insertions(+), 3 deletions(-) + +--- a/include/linux/udp.h ++++ b/include/linux/udp.h +@@ -140,6 +140,24 @@ static inline void udp_cmsg_recv(struct + } + } + ++DECLARE_STATIC_KEY_FALSE(udp_encap_needed_key); ++#if IS_ENABLED(CONFIG_IPV6) ++DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key); ++#endif ++ ++static inline bool udp_encap_needed(void) ++{ ++ if (static_branch_unlikely(&udp_encap_needed_key)) ++ return true; ++ ++#if IS_ENABLED(CONFIG_IPV6) ++ if (static_branch_unlikely(&udpv6_encap_needed_key)) ++ return true; ++#endif ++ ++ return false; ++} ++ + static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb) + { + if (!skb_is_gso(skb)) +@@ -153,6 +171,16 @@ static inline bool udp_unexpected_gso(st + !udp_test_bit(ACCEPT_FRAGLIST, sk)) + return true; + ++ /* GSO packets lacking the SKB_GSO_UDP_TUNNEL/_CSUM bits might still ++ * land in a tunnel as the socket check in udp_gro_receive cannot be ++ * foolproof. 
++ */ ++ if (udp_encap_needed() && ++ READ_ONCE(udp_sk(sk)->encap_rcv) && ++ !(skb_shinfo(skb)->gso_type & ++ (SKB_GSO_UDP_TUNNEL | SKB_GSO_UDP_TUNNEL_CSUM))) ++ return true; ++ + return false; + } + +--- a/net/ipv4/udp.c ++++ b/net/ipv4/udp.c +@@ -584,6 +584,13 @@ static inline bool __udp_is_mcast_sock(s + } + + DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); ++EXPORT_SYMBOL(udp_encap_needed_key); ++ ++#if IS_ENABLED(CONFIG_IPV6) ++DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); ++EXPORT_SYMBOL(udpv6_encap_needed_key); ++#endif ++ + void udp_encap_enable(void) + { + static_branch_inc(&udp_encap_needed_key); +--- a/net/ipv4/udp_offload.c ++++ b/net/ipv4/udp_offload.c +@@ -552,8 +552,10 @@ struct sk_buff *udp_gro_receive(struct l + unsigned int off = skb_gro_offset(skb); + int flush = 1; + +- /* we can do L4 aggregation only if the packet can't land in a tunnel +- * otherwise we could corrupt the inner stream ++ /* We can do L4 aggregation only if the packet can't land in a tunnel ++ * otherwise we could corrupt the inner stream. Detecting such packets ++ * cannot be foolproof and the aggregation might still happen in some ++ * cases. Such packets should be caught in udp_unexpected_gso later. 
+ */ + NAPI_GRO_CB(skb)->is_flist = 0; + if (!sk || !udp_sk(sk)->gro_receive) { +--- a/net/ipv6/udp.c ++++ b/net/ipv6/udp.c +@@ -450,7 +450,7 @@ csum_copy_err: + goto try_again; + } + +-DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); ++DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key); + void udpv6_encap_enable(void) + { + static_branch_inc(&udpv6_encap_needed_key); diff --git a/queue-6.8/udp-do-not-transition-udp-gro-fraglist-partial-checksums-to-unnecessary.patch b/queue-6.8/udp-do-not-transition-udp-gro-fraglist-partial-checksums-to-unnecessary.patch new file mode 100644 index 00000000000..822c3f5f6f2 --- /dev/null +++ b/queue-6.8/udp-do-not-transition-udp-gro-fraglist-partial-checksums-to-unnecessary.patch @@ -0,0 +1,74 @@ +From f0b8c30345565344df2e33a8417a27503589247d Mon Sep 17 00:00:00 2001 +From: Antoine Tenart +Date: Tue, 26 Mar 2024 12:34:00 +0100 +Subject: udp: do not transition UDP GRO fraglist partial checksums to unnecessary + +From: Antoine Tenart + +commit f0b8c30345565344df2e33a8417a27503589247d upstream. + +UDP GRO validates checksums and in udp4/6_gro_complete fraglist packets +are converted to CHECKSUM_UNNECESSARY to avoid later checks. However +this is an issue for CHECKSUM_PARTIAL packets as they can be looped in +an egress path and then their partial checksums are not fixed. + +Different issues can be observed, from invalid checksum on packets to +traces like: + + gen01: hw csum failure + skb len=3008 headroom=160 headlen=1376 tailroom=0 + mac=(106,14) net=(120,40) trans=160 + shinfo(txflags=0 nr_frags=0 gso(size=0 type=0 segs=0)) + csum(0xffff232e ip_summed=2 complete_sw=0 valid=0 level=0) + hash(0x77e3d716 sw=1 l4=1) proto=0x86dd pkttype=0 iif=12 + ... + +Fix this by only converting CHECKSUM_NONE packets to +CHECKSUM_UNNECESSARY by reusing __skb_incr_checksum_unnecessary. All +other checksum types are kept as-is, including CHECKSUM_COMPLETE as +fraglist packets being segmented back would have their skb->csum valid. 
+ +Fixes: 9fd1ff5d2ac7 ("udp: Support UDP fraglist GRO/GSO.") +Signed-off-by: Antoine Tenart +Reviewed-by: Willem de Bruijn +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/udp_offload.c | 8 +------- + net/ipv6/udp_offload.c | 8 +------- + 2 files changed, 2 insertions(+), 14 deletions(-) + +--- a/net/ipv4/udp_offload.c ++++ b/net/ipv4/udp_offload.c +@@ -722,13 +722,7 @@ INDIRECT_CALLABLE_SCOPE int udp4_gro_com + skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4); + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; + +- if (skb->ip_summed == CHECKSUM_UNNECESSARY) { +- if (skb->csum_level < SKB_MAX_CSUM_LEVEL) +- skb->csum_level++; +- } else { +- skb->ip_summed = CHECKSUM_UNNECESSARY; +- skb->csum_level = 0; +- } ++ __skb_incr_checksum_unnecessary(skb); + + return 0; + } +--- a/net/ipv6/udp_offload.c ++++ b/net/ipv6/udp_offload.c +@@ -174,13 +174,7 @@ INDIRECT_CALLABLE_SCOPE int udp6_gro_com + skb_shinfo(skb)->gso_type |= (SKB_GSO_FRAGLIST|SKB_GSO_UDP_L4); + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; + +- if (skb->ip_summed == CHECKSUM_UNNECESSARY) { +- if (skb->csum_level < SKB_MAX_CSUM_LEVEL) +- skb->csum_level++; +- } else { +- skb->ip_summed = CHECKSUM_UNNECESSARY; +- skb->csum_level = 0; +- } ++ __skb_incr_checksum_unnecessary(skb); + + return 0; + } diff --git a/queue-6.8/udp-prevent-local-udp-tunnel-packets-from-being-groed.patch b/queue-6.8/udp-prevent-local-udp-tunnel-packets-from-being-groed.patch new file mode 100644 index 00000000000..19b9a0d106e --- /dev/null +++ b/queue-6.8/udp-prevent-local-udp-tunnel-packets-from-being-groed.patch @@ -0,0 +1,54 @@ +From 64235eabc4b5b18c507c08a1f16cdac6c5661220 Mon Sep 17 00:00:00 2001 +From: Antoine Tenart +Date: Tue, 26 Mar 2024 12:34:01 +0100 +Subject: udp: prevent local UDP tunnel packets from being GROed + +From: Antoine Tenart + +commit 64235eabc4b5b18c507c08a1f16cdac6c5661220 upstream. 
+ +GRO has a fundamental issue with UDP tunnel packets as it can't detect +those in a foolproof way and GRO could happen before they reach the +tunnel endpoint. Previous commits have fixed issues when UDP tunnel +packets come from a remote host, but if those packets are issued locally +they could run into checksum issues. + +If the inner packet has a partial checksum the information will be lost +in the GRO logic, either in udp4/6_gro_complete or in +udp_gro_complete_segment and packets will have an invalid checksum when +leaving the host. + +Prevent local UDP tunnel packets from ever being GROed at the outer UDP +level. + +Due to skb->encapsulation being wrongly used in some drivers this is +actually only preventing UDP tunnel packets with a partial checksum to +be GROed (see iptunnel_handle_offloads) but those were also the packets +triggering issues so in practice this should be sufficient. + +Fixes: 9fd1ff5d2ac7 ("udp: Support UDP fraglist GRO/GSO.") +Fixes: 36707061d6ba ("udp: allow forwarding of plain (non-fraglisted) UDP GRO packets") +Suggested-by: Paolo Abeni +Signed-off-by: Antoine Tenart +Reviewed-by: Willem de Bruijn +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/udp_offload.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/net/ipv4/udp_offload.c ++++ b/net/ipv4/udp_offload.c +@@ -559,6 +559,12 @@ struct sk_buff *udp_gro_receive(struct l + */ + NAPI_GRO_CB(skb)->is_flist = 0; + if (!sk || !udp_sk(sk)->gro_receive) { ++ /* If the packet was locally encapsulated in a UDP tunnel that ++ * wasn't detected above, do not GRO. ++ */ ++ if (skb->encapsulation) ++ goto out; ++ + if (skb->dev->features & NETIF_F_GRO_FRAGLIST) + NAPI_GRO_CB(skb)->is_flist = sk ? !udp_test_bit(GRO_ENABLED, sk) : 1; + -- 2.47.2