--- /dev/null
+From 4ee3561ae2a5201c99335c17f13f5ba05802f179 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Dec 2023 19:20:35 +0100
+Subject: ALSA: hda/realtek: Add quirk for Lenovo Yoga Pro 7
+
+From: Takashi Iwai <tiwai@suse.de>
+
+[ Upstream commit 634e5e1e06f5cdd614a1bc429ecb243a51cc009d ]
+
+Lenovo Yoga Pro 7 14APH8 (PCI SSID 17aa:3882) seems to require a
+workaround similar to the one for the Yoga 9 model for the bass speaker.
+
+Cc: <stable@vger.kernel.org>
+Link: https://lore.kernel.org/r/CAGGk=CRRQ1L9p771HsXTN_ebZP41Qj+3gw35Gezurn+nokRewg@mail.gmail.com
+Link: https://lore.kernel.org/r/20231207182035.30248-1-tiwai@suse.de
+Signed-off-by: Takashi Iwai <tiwai@suse.de>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ sound/pci/hda/patch_realtek.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
+index a88ed60dcd96a..48155aa52828c 100644
+--- a/sound/pci/hda/patch_realtek.c
++++ b/sound/pci/hda/patch_realtek.c
+@@ -9904,6 +9904,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
+ SND_PCI_QUIRK(0x1558, 0xc019, "Clevo NH77D[BE]Q", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE),
+ SND_PCI_QUIRK(0x1558, 0xc022, "Clevo NH77[DC][QW]", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE),
+ SND_PCI_QUIRK(0x17aa, 0x1036, "Lenovo P520", ALC233_FIXUP_LENOVO_MULTI_CODECS),
++ SND_PCI_QUIRK(0x17aa, 0x3882, "Lenovo Yoga Pro 7 14APH8", ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK_PIN),
+ SND_PCI_QUIRK(0x17aa, 0x1048, "ThinkCentre Station", ALC623_FIXUP_LENOVO_THINKSTATION_P340),
+ SND_PCI_QUIRK(0x17aa, 0x20f2, "Thinkpad SL410/510", ALC269_FIXUP_SKU_IGNORE),
+ SND_PCI_QUIRK(0x17aa, 0x215e, "Thinkpad L512", ALC269_FIXUP_SKU_IGNORE),
+--
+2.43.0
+
--- /dev/null
+From 3e16e0cda98b5db7e47533ca0dcd626b759cc327 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 28 Dec 2023 20:39:02 +0100
+Subject: ARM: sun9i: smp: Fix array-index-out-of-bounds read in
+ sunxi_mc_smp_init
+
+From: Stefan Wahren <wahrenst@gmx.net>
+
+[ Upstream commit 72ad3b772b6d393701df58ba1359b0bb346a19ed ]
+
+Running a multi-arch kernel (multi_v7_defconfig) on a Raspberry Pi 3B+
+with enabled CONFIG_UBSAN triggers the following warning:
+
+ UBSAN: array-index-out-of-bounds in arch/arm/mach-sunxi/mc_smp.c:810:29
+ index 2 is out of range for type 'sunxi_mc_smp_data [2]'
+ CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.7.0-rc6-00248-g5254c0cbc92d
+ Hardware name: BCM2835
+ unwind_backtrace from show_stack+0x10/0x14
+ show_stack from dump_stack_lvl+0x40/0x4c
+ dump_stack_lvl from ubsan_epilogue+0x8/0x34
+ ubsan_epilogue from __ubsan_handle_out_of_bounds+0x78/0x80
+ __ubsan_handle_out_of_bounds from sunxi_mc_smp_init+0xe4/0x4cc
+ sunxi_mc_smp_init from do_one_initcall+0xa0/0x2fc
+ do_one_initcall from kernel_init_freeable+0xf4/0x2f4
+ kernel_init_freeable from kernel_init+0x18/0x158
+ kernel_init from ret_from_fork+0x14/0x28
+
+Since the enabled method couldn't match with any entry from
+sunxi_mc_smp_data, the value of the index shouldn't be used right after
+the loop. So move it after the check of ret in order to have a valid
+index.
+
+Fixes: 1631090e34f5 ("ARM: sun9i: smp: Add is_a83t field")
+Signed-off-by: Stefan Wahren <wahrenst@gmx.net>
+Link: https://lore.kernel.org/r/20231228193903.9078-1-wahrenst@gmx.net
+Reviewed-by: Chen-Yu Tsai <wens@csie.org>
+Signed-off-by: Arnd Bergmann <arnd@arndb.de>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/arm/mach-sunxi/mc_smp.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/arch/arm/mach-sunxi/mc_smp.c b/arch/arm/mach-sunxi/mc_smp.c
+index 26cbce1353387..b2f5f4f28705f 100644
+--- a/arch/arm/mach-sunxi/mc_smp.c
++++ b/arch/arm/mach-sunxi/mc_smp.c
+@@ -808,12 +808,12 @@ static int __init sunxi_mc_smp_init(void)
+ break;
+ }
+
+- is_a83t = sunxi_mc_smp_data[i].is_a83t;
+-
+ of_node_put(node);
+ if (ret)
+ return -ENODEV;
+
++ is_a83t = sunxi_mc_smp_data[i].is_a83t;
++
+ if (!sunxi_mc_smp_cpu_table_init())
+ return -EINVAL;
+
+--
+2.43.0
+
--- /dev/null
+From e19d878f0cb36876b9df3d6fa13866e0e1f207f3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 27 Jan 2023 12:43:42 +0100
+Subject: arm64: dts: qcom: sdm845: align RPMh regulator nodes with bindings
+
+From: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
+
+[ Upstream commit 86dd19bbdea2b7d3feb69c0c39f141de30a18ec9 ]
+
+Device node names should be generic and bindings expect certain pattern
+for RPMh regulator nodes.
+
+Signed-off-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
+Reviewed-by: Konrad Dybcio <konrad.dybcio@linaro.org>
+Signed-off-by: Bjorn Andersson <andersson@kernel.org>
+Link: https://lore.kernel.org/r/20230127114347.235963-6-krzysztof.kozlowski@linaro.org
+Stable-dep-of: a5f01673d394 ("arm64: dts: qcom: sdm845: Fix PSCI power domain names")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi | 4 ++--
+ arch/arm64/boot/dts/qcom/sdm845-db845c.dts | 4 ++--
+ arch/arm64/boot/dts/qcom/sdm845-lg-common.dtsi | 6 +++---
+ arch/arm64/boot/dts/qcom/sdm845-mtp.dts | 6 +++---
+ arch/arm64/boot/dts/qcom/sdm845-oneplus-common.dtsi | 6 +++---
+ arch/arm64/boot/dts/qcom/sdm845-shift-axolotl.dts | 6 +++---
+ arch/arm64/boot/dts/qcom/sdm845-sony-xperia-tama.dtsi | 6 +++---
+ arch/arm64/boot/dts/qcom/sdm845-xiaomi-beryllium.dts | 2 +-
+ arch/arm64/boot/dts/qcom/sdm845-xiaomi-polaris.dts | 6 +++---
+ arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts | 2 +-
+ arch/arm64/boot/dts/qcom/sdm850-samsung-w737.dts | 2 +-
+ 11 files changed, 25 insertions(+), 25 deletions(-)
+
+diff --git a/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi b/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi
+index a5c0c788969fb..985824032c522 100644
+--- a/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi
++++ b/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi
+@@ -351,7 +351,7 @@ flash@0 {
+
+
+ &apps_rsc {
+- pm8998-rpmh-regulators {
++ regulators-0 {
+ compatible = "qcom,pm8998-rpmh-regulators";
+ qcom,pmic-id = "a";
+
+@@ -633,7 +633,7 @@ src_pp1800_lvs2: lvs2 {
+ };
+ };
+
+- pm8005-rpmh-regulators {
++ regulators-1 {
+ compatible = "qcom,pm8005-rpmh-regulators";
+ qcom,pmic-id = "c";
+
+diff --git a/arch/arm64/boot/dts/qcom/sdm845-db845c.dts b/arch/arm64/boot/dts/qcom/sdm845-db845c.dts
+index c9efcb894a52f..8c9ccf5b4ea41 100644
+--- a/arch/arm64/boot/dts/qcom/sdm845-db845c.dts
++++ b/arch/arm64/boot/dts/qcom/sdm845-db845c.dts
+@@ -271,7 +271,7 @@ &adsp_pas {
+ };
+
+ &apps_rsc {
+- pm8998-rpmh-regulators {
++ regulators-0 {
+ compatible = "qcom,pm8998-rpmh-regulators";
+ qcom,pmic-id = "a";
+ vdd-s1-supply = <&vph_pwr>;
+@@ -396,7 +396,7 @@ vreg_lvs2a_1p8: lvs2 {
+ };
+ };
+
+- pmi8998-rpmh-regulators {
++ regulators-1 {
+ compatible = "qcom,pmi8998-rpmh-regulators";
+ qcom,pmic-id = "b";
+
+diff --git a/arch/arm64/boot/dts/qcom/sdm845-lg-common.dtsi b/arch/arm64/boot/dts/qcom/sdm845-lg-common.dtsi
+index 20f275f8694dc..e2921640880a1 100644
+--- a/arch/arm64/boot/dts/qcom/sdm845-lg-common.dtsi
++++ b/arch/arm64/boot/dts/qcom/sdm845-lg-common.dtsi
+@@ -166,7 +166,7 @@ &adsp_pas {
+ };
+
+ &apps_rsc {
+- pm8998-rpmh-regulators {
++ regulators-0 {
+ compatible = "qcom,pm8998-rpmh-regulators";
+ qcom,pmic-id = "a";
+
+@@ -419,7 +419,7 @@ vreg_lvs2a_1p8: lvs2 {
+ };
+ };
+
+- pmi8998-rpmh-regulators {
++ regulators-1 {
+ compatible = "qcom,pmi8998-rpmh-regulators";
+ qcom,pmic-id = "b";
+
+@@ -433,7 +433,7 @@ vreg_bob: bob {
+ };
+ };
+
+- pm8005-rpmh-regulators {
++ regulators-2 {
+ compatible = "qcom,pm8005-rpmh-regulators";
+ qcom,pmic-id = "c";
+
+diff --git a/arch/arm64/boot/dts/qcom/sdm845-mtp.dts b/arch/arm64/boot/dts/qcom/sdm845-mtp.dts
+index 64958dee17d8b..b47e333aa3510 100644
+--- a/arch/arm64/boot/dts/qcom/sdm845-mtp.dts
++++ b/arch/arm64/boot/dts/qcom/sdm845-mtp.dts
+@@ -117,7 +117,7 @@ &adsp_pas {
+ };
+
+ &apps_rsc {
+- pm8998-rpmh-regulators {
++ regulators-0 {
+ compatible = "qcom,pm8998-rpmh-regulators";
+ qcom,pmic-id = "a";
+
+@@ -382,7 +382,7 @@ vreg_lvs2a_1p8: lvs2 {
+ };
+ };
+
+- pmi8998-rpmh-regulators {
++ regulators-1 {
+ compatible = "qcom,pmi8998-rpmh-regulators";
+ qcom,pmic-id = "b";
+
+@@ -396,7 +396,7 @@ vreg_bob: bob {
+ };
+ };
+
+- pm8005-rpmh-regulators {
++ regulators-2 {
+ compatible = "qcom,pm8005-rpmh-regulators";
+ qcom,pmic-id = "c";
+
+diff --git a/arch/arm64/boot/dts/qcom/sdm845-oneplus-common.dtsi b/arch/arm64/boot/dts/qcom/sdm845-oneplus-common.dtsi
+index 392461c29e76e..0713b774a97be 100644
+--- a/arch/arm64/boot/dts/qcom/sdm845-oneplus-common.dtsi
++++ b/arch/arm64/boot/dts/qcom/sdm845-oneplus-common.dtsi
+@@ -144,7 +144,7 @@ &adsp_pas {
+ };
+
+ &apps_rsc {
+- pm8998-rpmh-regulators {
++ regulators-0 {
+ compatible = "qcom,pm8998-rpmh-regulators";
+ qcom,pmic-id = "a";
+
+@@ -280,7 +280,7 @@ vreg_l28a_3p0: ldo28 {
+ };
+ };
+
+- pmi8998-rpmh-regulators {
++ regulators-1 {
+ compatible = "qcom,pmi8998-rpmh-regulators";
+ qcom,pmic-id = "b";
+
+@@ -294,7 +294,7 @@ vreg_bob: bob {
+ };
+ };
+
+- pm8005-rpmh-regulators {
++ regulators-2 {
+ compatible = "qcom,pm8005-rpmh-regulators";
+ qcom,pmic-id = "c";
+
+diff --git a/arch/arm64/boot/dts/qcom/sdm845-shift-axolotl.dts b/arch/arm64/boot/dts/qcom/sdm845-shift-axolotl.dts
+index 83261c9bb4f23..b65c35865dab9 100644
+--- a/arch/arm64/boot/dts/qcom/sdm845-shift-axolotl.dts
++++ b/arch/arm64/boot/dts/qcom/sdm845-shift-axolotl.dts
+@@ -110,7 +110,7 @@ &adsp_pas {
+ };
+
+ &apps_rsc {
+- pm8998-rpmh-regulators {
++ regulators-0 {
+ compatible = "qcom,pm8998-rpmh-regulators";
+ qcom,pmic-id = "a";
+
+@@ -375,7 +375,7 @@ vreg_lvs2a_1p8: lvs2 {
+ };
+ };
+
+- pmi8998-rpmh-regulators {
++ regulators-1 {
+ compatible = "qcom,pmi8998-rpmh-regulators";
+ qcom,pmic-id = "b";
+
+@@ -389,7 +389,7 @@ vreg_bob: bob {
+ };
+ };
+
+- pm8005-rpmh-regulators {
++ regulators-2 {
+ compatible = "qcom,pm8005-rpmh-regulators";
+ qcom,pmic-id = "c";
+
+diff --git a/arch/arm64/boot/dts/qcom/sdm845-sony-xperia-tama.dtsi b/arch/arm64/boot/dts/qcom/sdm845-sony-xperia-tama.dtsi
+index d6918e6d19799..249a715d5aae1 100644
+--- a/arch/arm64/boot/dts/qcom/sdm845-sony-xperia-tama.dtsi
++++ b/arch/arm64/boot/dts/qcom/sdm845-sony-xperia-tama.dtsi
+@@ -78,7 +78,7 @@ ramoops@ffc00000 {
+ };
+
+ &apps_rsc {
+- pm8998-rpmh-regulators {
++ regulators-0 {
+ compatible = "qcom,pm8998-rpmh-regulators";
+ qcom,pmic-id = "a";
+
+@@ -308,7 +308,7 @@ vreg_lvs2a_1p8: lvs2 {
+ };
+ };
+
+- pmi8998-rpmh-regulators {
++ regulators-1 {
+ compatible = "qcom,pmi8998-rpmh-regulators";
+ qcom,pmic-id = "b";
+
+@@ -319,7 +319,7 @@ src_vreg_bob: bob {
+ };
+ };
+
+- pm8005-rpmh-regulators {
++ regulators-2 {
+ compatible = "qcom,pm8005-rpmh-regulators";
+ qcom,pmic-id = "c";
+
+diff --git a/arch/arm64/boot/dts/qcom/sdm845-xiaomi-beryllium.dts b/arch/arm64/boot/dts/qcom/sdm845-xiaomi-beryllium.dts
+index 0f470cf1ed1c1..6d6b3dd699475 100644
+--- a/arch/arm64/boot/dts/qcom/sdm845-xiaomi-beryllium.dts
++++ b/arch/arm64/boot/dts/qcom/sdm845-xiaomi-beryllium.dts
+@@ -125,7 +125,7 @@ &adsp_pas {
+ };
+
+ &apps_rsc {
+- pm8998-rpmh-regulators {
++ regulators-0 {
+ compatible = "qcom,pm8998-rpmh-regulators";
+ qcom,pmic-id = "a";
+
+diff --git a/arch/arm64/boot/dts/qcom/sdm845-xiaomi-polaris.dts b/arch/arm64/boot/dts/qcom/sdm845-xiaomi-polaris.dts
+index 093b04359ec39..ffbe45a99b74a 100644
+--- a/arch/arm64/boot/dts/qcom/sdm845-xiaomi-polaris.dts
++++ b/arch/arm64/boot/dts/qcom/sdm845-xiaomi-polaris.dts
+@@ -143,7 +143,7 @@ vreg_s4a_1p8: vreg-s4a-1p8 {
+ };
+
+ &apps_rsc {
+- pm8998-rpmh-regulators {
++ regulators-0 {
+ compatible = "qcom,pm8998-rpmh-regulators";
+ qcom,pmic-id = "a";
+
+@@ -343,7 +343,7 @@ vreg_lvs2a_1p8: lvs2 {
+ };
+ };
+
+- pmi8998-rpmh-regulators {
++ regulators-1 {
+ compatible = "qcom,pmi8998-rpmh-regulators";
+ qcom,pmic-id = "b";
+
+@@ -355,7 +355,7 @@ vreg_bob: bob {
+ };
+ };
+
+- pm8005-rpmh-regulators {
++ regulators-2 {
+ compatible = "qcom,pm8005-rpmh-regulators";
+ qcom,pmic-id = "c";
+
+diff --git a/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts b/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts
+index 74f43da51fa50..48a41ace8fc58 100644
+--- a/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts
++++ b/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts
+@@ -99,7 +99,7 @@ &adsp_pas {
+ };
+
+ &apps_rsc {
+- pm8998-rpmh-regulators {
++ regulators-0 {
+ compatible = "qcom,pm8998-rpmh-regulators";
+ qcom,pmic-id = "a";
+
+diff --git a/arch/arm64/boot/dts/qcom/sdm850-samsung-w737.dts b/arch/arm64/boot/dts/qcom/sdm850-samsung-w737.dts
+index d028a7eb364a6..c169d2870bdf4 100644
+--- a/arch/arm64/boot/dts/qcom/sdm850-samsung-w737.dts
++++ b/arch/arm64/boot/dts/qcom/sdm850-samsung-w737.dts
+@@ -129,7 +129,7 @@ &adsp_pas {
+ };
+
+ &apps_rsc {
+- pm8998-rpmh-regulators {
++ regulators-0 {
+ compatible = "qcom,pm8998-rpmh-regulators";
+ qcom,pmic-id = "a";
+
+--
+2.43.0
+
--- /dev/null
+From e0335f9198238cec81a096f299f7f121093303f7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Sep 2023 12:42:03 +0530
+Subject: arm64: dts: qcom: sdm845: Fix PSCI power domain names
+
+From: David Heidelberg <david@ixit.cz>
+
+[ Upstream commit a5f01673d3946e424091e6b8ff274716f9c21454 ]
+
+The original commit hasn't been updated according to
+refactoring done in sdm845.dtsi.
+
+Fixes: a1ade6cac5a2 ("arm64: dts: qcom: sdm845: Switch PSCI cpu idle states from PC to OSI")
+Suggested-by: Dmitry Baryshkov <dmitry.baryshkov@linaro.org>
+Reviewed-by: Douglas Anderson <dianders@chromium.org>
+Signed-off-by: David Heidelberg <david@ixit.cz>
+Reviewed-by: Stephen Boyd <swboyd@chromium.org>
+Reviewed-by: Abel Vesa <abel.vesa@linaro.org>
+Link: https://lore.kernel.org/r/20230912071205.11502-1-david@ixit.cz
+Signed-off-by: Bjorn Andersson <andersson@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi | 20 +++++++++++---------
+ 1 file changed, 11 insertions(+), 9 deletions(-)
+
+diff --git a/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi b/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi
+index 985824032c522..43ee28db61aa8 100644
+--- a/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi
++++ b/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi
+@@ -150,15 +150,15 @@ &cpufreq_hw {
+ };
+
+ &psci {
+- /delete-node/ cpu0;
+- /delete-node/ cpu1;
+- /delete-node/ cpu2;
+- /delete-node/ cpu3;
+- /delete-node/ cpu4;
+- /delete-node/ cpu5;
+- /delete-node/ cpu6;
+- /delete-node/ cpu7;
+- /delete-node/ cpu-cluster0;
++ /delete-node/ power-domain-cpu0;
++ /delete-node/ power-domain-cpu1;
++ /delete-node/ power-domain-cpu2;
++ /delete-node/ power-domain-cpu3;
++ /delete-node/ power-domain-cpu4;
++ /delete-node/ power-domain-cpu5;
++ /delete-node/ power-domain-cpu6;
++ /delete-node/ power-domain-cpu7;
++ /delete-node/ power-domain-cluster;
+ };
+
+ &cpus {
+@@ -351,6 +351,8 @@ flash@0 {
+
+
+ &apps_rsc {
++ /delete-property/ power-domains;
++
+ regulators-0 {
+ compatible = "qcom,pm8998-rpmh-regulators";
+ qcom,pmic-id = "a";
+--
+2.43.0
+
--- /dev/null
+From b44044d49c2870abfd79fa40e990603cafdfaf2e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 3 Jan 2024 03:35:34 +0000
+Subject: asix: Add check for usbnet_get_endpoints
+
+From: Chen Ni <nichen@iscas.ac.cn>
+
+[ Upstream commit eaac6a2d26b65511e164772bec6918fcbc61938e ]
+
+Add check for usbnet_get_endpoints() and return the error if it fails
+in order to transfer the error.
+
+Fixes: 16626b0cc3d5 ("asix: Add a new driver for the AX88172A")
+Signed-off-by: Chen Ni <nichen@iscas.ac.cn>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/usb/ax88172a.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/usb/ax88172a.c b/drivers/net/usb/ax88172a.c
+index 3777c7e2e6fc0..e47bb125048d4 100644
+--- a/drivers/net/usb/ax88172a.c
++++ b/drivers/net/usb/ax88172a.c
+@@ -161,7 +161,9 @@ static int ax88172a_bind(struct usbnet *dev, struct usb_interface *intf)
+ u8 buf[ETH_ALEN];
+ struct ax88172a_private *priv;
+
+- usbnet_get_endpoints(dev, intf);
++ ret = usbnet_get_endpoints(dev, intf);
++ if (ret)
++ return ret;
+
+ priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+--
+2.43.0
+
--- /dev/null
+From 49ca35addbb9c2d01e802de2b473789fe8bb5f35 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 25 Dec 2023 17:06:08 +0900
+Subject: ASoC: fsl_rpmsg: Fix error handler with pm_runtime_enable
+
+From: Chancel Liu <chancel.liu@nxp.com>
+
+[ Upstream commit f9d378fc68c43fd41b35133edec9cd902ec334ec ]
+
+There is an error message when a deferred probe happens:
+
+fsl_rpmsg rpmsg_audio: Unbalanced pm_runtime_enable!
+
+Fix the error handler with pm_runtime_enable.
+
+Fixes: b73d9e6225e8 ("ASoC: fsl_rpmsg: Add CPU DAI driver for audio base on rpmsg")
+Signed-off-by: Chancel Liu <chancel.liu@nxp.com>
+Acked-by: Shengjiu Wang <shengjiu.wang@gmail.com>
+Link: https://lore.kernel.org/r/20231225080608.967953-1-chancel.liu@nxp.com
+Signed-off-by: Mark Brown <broonie@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ sound/soc/fsl/fsl_rpmsg.c | 10 ++++++++--
+ 1 file changed, 8 insertions(+), 2 deletions(-)
+
+diff --git a/sound/soc/fsl/fsl_rpmsg.c b/sound/soc/fsl/fsl_rpmsg.c
+index bf94838bdbefe..5c07a8ff0c9c0 100644
+--- a/sound/soc/fsl/fsl_rpmsg.c
++++ b/sound/soc/fsl/fsl_rpmsg.c
+@@ -231,7 +231,7 @@ static int fsl_rpmsg_probe(struct platform_device *pdev)
+ ret = devm_snd_soc_register_component(&pdev->dev, &fsl_component,
+ &fsl_rpmsg_dai, 1);
+ if (ret)
+- return ret;
++ goto err_pm_disable;
+
+ rpmsg->card_pdev = platform_device_register_data(&pdev->dev,
+ "imx-audio-rpmsg",
+@@ -241,16 +241,22 @@ static int fsl_rpmsg_probe(struct platform_device *pdev)
+ if (IS_ERR(rpmsg->card_pdev)) {
+ dev_err(&pdev->dev, "failed to register rpmsg card\n");
+ ret = PTR_ERR(rpmsg->card_pdev);
+- return ret;
++ goto err_pm_disable;
+ }
+
+ return 0;
++
++err_pm_disable:
++ pm_runtime_disable(&pdev->dev);
++ return ret;
+ }
+
+ static int fsl_rpmsg_remove(struct platform_device *pdev)
+ {
+ struct fsl_rpmsg *rpmsg = platform_get_drvdata(pdev);
+
++ pm_runtime_disable(&pdev->dev);
++
+ if (rpmsg->card_pdev)
+ platform_device_unregister(rpmsg->card_pdev);
+
+--
+2.43.0
+
--- /dev/null
+From e73ec909528dde319b11d6058116ce61ef4cf670 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 29 Dec 2023 13:43:42 +0200
+Subject: ASoC: mediatek: mt8186: fix AUD_PAD_TOP register and offset
+
+From: Eugen Hristev <eugen.hristev@collabora.com>
+
+[ Upstream commit 38744c3fa00109c51076121c2deb4f02e2f09194 ]
+
+AUD_PAD_TOP widget's correct register is AFE_AUD_PAD_TOP, and not zero.
+Having a zero as register, it would mean that the `snd_soc_dapm_new_widgets`
+would try to read the register at offset zero when trying to get the power
+status of this widget, which is incorrect.
+
+Fixes: b65c466220b3 ("ASoC: mediatek: mt8186: support adda in platform driver")
+Signed-off-by: Eugen Hristev <eugen.hristev@collabora.com>
+Link: https://lore.kernel.org/r/20231229114342.195867-1-eugen.hristev@collabora.com
+Signed-off-by: Mark Brown <broonie@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ sound/soc/mediatek/mt8186/mt8186-dai-adda.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/sound/soc/mediatek/mt8186/mt8186-dai-adda.c b/sound/soc/mediatek/mt8186/mt8186-dai-adda.c
+index 094402470dc23..858b95b199dcb 100644
+--- a/sound/soc/mediatek/mt8186/mt8186-dai-adda.c
++++ b/sound/soc/mediatek/mt8186/mt8186-dai-adda.c
+@@ -499,7 +499,7 @@ static const struct snd_soc_dapm_widget mtk_dai_adda_widgets[] = {
+ SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMD),
+
+ SND_SOC_DAPM_SUPPLY_S("AUD_PAD_TOP", SUPPLY_SEQ_ADDA_AUD_PAD_TOP,
+- 0, 0, 0,
++ AFE_AUD_PAD_TOP, RG_RX_FIFO_ON_SFT, 0,
+ mtk_adda_pad_top_event,
+ SND_SOC_DAPM_PRE_PMU),
+ SND_SOC_DAPM_SUPPLY_S("ADDA_MTKAIF_CFG", SUPPLY_SEQ_ADDA_MTKAIF_CFG,
+--
+2.43.0
+
--- /dev/null
+From 1aae4192aa31ef02321a89bd34bbf0650c634bb1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 3 Jan 2024 18:34:03 +0000
+Subject: ASoC: meson: g12a-toacodec: Fix event generation
+
+From: Mark Brown <broonie@kernel.org>
+
+[ Upstream commit 172c88244b5f2d3375403ebb504d407be0fded59 ]
+
+When a control changes value the return value from _put() should be 1 so
+we get events generated to userspace notifying applications of the change.
+We are checking if there has been a change and exiting early if not but we
+are not providing the correct return value in the latter case, fix this.
+
+Fixes: af2618a2eee8 ("ASoC: meson: g12a: add internal DAC glue driver")
+Signed-off-by: Mark Brown <broonie@kernel.org>
+Link: https://lore.kernel.org/r/20240103-meson-enum-val-v1-3-424af7a8fb91@kernel.org
+Signed-off-by: Mark Brown <broonie@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ sound/soc/meson/g12a-toacodec.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/sound/soc/meson/g12a-toacodec.c b/sound/soc/meson/g12a-toacodec.c
+index 3b1ce9143c653..8d8d848ebd58b 100644
+--- a/sound/soc/meson/g12a-toacodec.c
++++ b/sound/soc/meson/g12a-toacodec.c
+@@ -104,7 +104,7 @@ static int g12a_toacodec_mux_put_enum(struct snd_kcontrol *kcontrol,
+
+ snd_soc_dapm_mux_update_power(dapm, kcontrol, mux, e, NULL);
+
+- return 0;
++ return 1;
+ }
+
+ static SOC_ENUM_SINGLE_DECL(g12a_toacodec_mux_enum, TOACODEC_CTRL0,
+--
+2.43.0
+
--- /dev/null
+From 69dc7179c8414af40c52fc10df2814e6916b95a8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 3 Jan 2024 18:34:01 +0000
+Subject: ASoC: meson: g12a-toacodec: Validate written enum values
+
+From: Mark Brown <broonie@kernel.org>
+
+[ Upstream commit 3150b70e944ead909260285dfb5707d0bedcf87b ]
+
+When writing to an enum we need to verify that the value written is valid
+for the enumeration, the helper function snd_soc_enum_item_to_val() doesn't
+do it since it needs to return an unsigned (and in any case we'd need to
+check the return value).
+
+Fixes: af2618a2eee8 ("ASoC: meson: g12a: add internal DAC glue driver")
+Signed-off-by: Mark Brown <broonie@kernel.org>
+Link: https://lore.kernel.org/r/20240103-meson-enum-val-v1-1-424af7a8fb91@kernel.org
+Signed-off-by: Mark Brown <broonie@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ sound/soc/meson/g12a-toacodec.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/sound/soc/meson/g12a-toacodec.c b/sound/soc/meson/g12a-toacodec.c
+index ddc667956cf5e..3b1ce9143c653 100644
+--- a/sound/soc/meson/g12a-toacodec.c
++++ b/sound/soc/meson/g12a-toacodec.c
+@@ -71,6 +71,9 @@ static int g12a_toacodec_mux_put_enum(struct snd_kcontrol *kcontrol,
+ struct soc_enum *e = (struct soc_enum *)kcontrol->private_value;
+ unsigned int mux, reg;
+
++ if (ucontrol->value.enumerated.item[0] >= e->items)
++ return -EINVAL;
++
+ mux = snd_soc_enum_item_to_val(e, ucontrol->value.enumerated.item[0]);
+ regmap_field_read(priv->field_dat_sel, &reg);
+
+--
+2.43.0
+
--- /dev/null
+From 4329af718ecce922ee648d6baabf15690d321821 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 3 Jan 2024 18:34:04 +0000
+Subject: ASoC: meson: g12a-tohdmitx: Fix event generation for S/PDIF mux
+
+From: Mark Brown <broonie@kernel.org>
+
+[ Upstream commit b036d8ef3120b996751495ce25994eea58032a98 ]
+
+When a control changes value the return value from _put() should be 1 so
+we get events generated to userspace notifying applications of the change.
+While the I2S mux gets this right the S/PDIF mux does not, fix the return
+value.
+
+Fixes: c8609f3870f7 ("ASoC: meson: add g12a tohdmitx control")
+Signed-off-by: Mark Brown <broonie@kernel.org>
+Link: https://lore.kernel.org/r/20240103-meson-enum-val-v1-4-424af7a8fb91@kernel.org
+Signed-off-by: Mark Brown <broonie@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ sound/soc/meson/g12a-tohdmitx.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/sound/soc/meson/g12a-tohdmitx.c b/sound/soc/meson/g12a-tohdmitx.c
+index 46d1f04e0e8a3..154c324fdd42a 100644
+--- a/sound/soc/meson/g12a-tohdmitx.c
++++ b/sound/soc/meson/g12a-tohdmitx.c
+@@ -118,7 +118,7 @@ static int g12a_tohdmitx_spdif_mux_put_enum(struct snd_kcontrol *kcontrol,
+
+ snd_soc_dapm_mux_update_power(dapm, kcontrol, mux, e, NULL);
+
+- return 0;
++ return 1;
+ }
+
+ static SOC_ENUM_SINGLE_DECL(g12a_tohdmitx_spdif_mux_enum, TOHDMITX_CTRL0,
+--
+2.43.0
+
--- /dev/null
+From c652c462e27c9733ea410b8af1d8eccf55790e67 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 3 Jan 2024 18:34:02 +0000
+Subject: ASoC: meson: g12a-tohdmitx: Validate written enum values
+
+From: Mark Brown <broonie@kernel.org>
+
+[ Upstream commit 1e001206804be3f3d21f4a1cf16e5d059d75643f ]
+
+When writing to an enum we need to verify that the value written is valid
+for the enumeration, the helper function snd_soc_enum_item_to_val() doesn't
+do it since it needs to return an unsigned (and in any case we'd need to
+check the return value).
+
+Fixes: c8609f3870f7 ("ASoC: meson: add g12a tohdmitx control")
+Signed-off-by: Mark Brown <broonie@kernel.org>
+Link: https://lore.kernel.org/r/20240103-meson-enum-val-v1-2-424af7a8fb91@kernel.org
+Signed-off-by: Mark Brown <broonie@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ sound/soc/meson/g12a-tohdmitx.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/sound/soc/meson/g12a-tohdmitx.c b/sound/soc/meson/g12a-tohdmitx.c
+index 579a04ad4d197..46d1f04e0e8a3 100644
+--- a/sound/soc/meson/g12a-tohdmitx.c
++++ b/sound/soc/meson/g12a-tohdmitx.c
+@@ -45,6 +45,9 @@ static int g12a_tohdmitx_i2s_mux_put_enum(struct snd_kcontrol *kcontrol,
+ struct soc_enum *e = (struct soc_enum *)kcontrol->private_value;
+ unsigned int mux, changed;
+
++ if (ucontrol->value.enumerated.item[0] >= e->items)
++ return -EINVAL;
++
+ mux = snd_soc_enum_item_to_val(e, ucontrol->value.enumerated.item[0]);
+ changed = snd_soc_component_test_bits(component, e->reg,
+ CTRL0_I2S_DAT_SEL,
+@@ -93,6 +96,9 @@ static int g12a_tohdmitx_spdif_mux_put_enum(struct snd_kcontrol *kcontrol,
+ struct soc_enum *e = (struct soc_enum *)kcontrol->private_value;
+ unsigned int mux, changed;
+
++ if (ucontrol->value.enumerated.item[0] >= e->items)
++ return -EINVAL;
++
+ mux = snd_soc_enum_item_to_val(e, ucontrol->value.enumerated.item[0]);
+ changed = snd_soc_component_test_bits(component, TOHDMITX_CTRL0,
+ CTRL0_SPDIF_SEL,
+--
+2.43.0
+
--- /dev/null
+From 3e937b5c8e6387b2914b22893d6bc030db02f58f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 13 Nov 2023 11:52:31 +0800
+Subject: blk-mq: make sure active queue usage is held for bio_integrity_prep()
+
+From: Christoph Hellwig <hch@infradead.org>
+
+[ Upstream commit b0077e269f6c152e807fdac90b58caf012cdbaab ]
+
+blk_integrity_unregister() can come if queue usage counter isn't held
+for one bio with integrity prepared, so this request may be completed with
+calling profile->complete_fn, then kernel panic.
+
+Another constraint is that bio_integrity_prep() needs to be called
+before bio merge.
+
+Fix the issue by:
+
+- call bio_integrity_prep() with one queue usage counter grabbed reliably
+
+- call bio_integrity_prep() before bio merge
+
+Fixes: 900e080752025f00 ("block: move queue enter logic into blk_mq_submit_bio()")
+Reported-by: Yi Zhang <yi.zhang@redhat.com>
+Cc: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Ming Lei <ming.lei@redhat.com>
+Tested-by: Yi Zhang <yi.zhang@redhat.com>
+Link: https://lore.kernel.org/r/20231113035231.2708053-1-ming.lei@redhat.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ block/blk-mq.c | 75 +++++++++++++++++++++++++-------------------------
+ 1 file changed, 38 insertions(+), 37 deletions(-)
+
+diff --git a/block/blk-mq.c b/block/blk-mq.c
+index 100fb0c3114f8..383d94615e502 100644
+--- a/block/blk-mq.c
++++ b/block/blk-mq.c
+@@ -2855,11 +2855,8 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
+ };
+ struct request *rq;
+
+- if (unlikely(bio_queue_enter(bio)))
+- return NULL;
+-
+ if (blk_mq_attempt_bio_merge(q, bio, nsegs))
+- goto queue_exit;
++ return NULL;
+
+ rq_qos_throttle(q, bio);
+
+@@ -2875,35 +2872,23 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
+ rq_qos_cleanup(q, bio);
+ if (bio->bi_opf & REQ_NOWAIT)
+ bio_wouldblock_error(bio);
+-queue_exit:
+- blk_queue_exit(q);
+ return NULL;
+ }
+
+-static inline struct request *blk_mq_get_cached_request(struct request_queue *q,
+- struct blk_plug *plug, struct bio **bio, unsigned int nsegs)
++/* return true if this @rq can be used for @bio */
++static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug,
++ struct bio *bio)
+ {
+- struct request *rq;
+- enum hctx_type type, hctx_type;
++ enum hctx_type type = blk_mq_get_hctx_type(bio->bi_opf);
++ enum hctx_type hctx_type = rq->mq_hctx->type;
+
+- if (!plug)
+- return NULL;
+- rq = rq_list_peek(&plug->cached_rq);
+- if (!rq || rq->q != q)
+- return NULL;
+-
+- if (blk_mq_attempt_bio_merge(q, *bio, nsegs)) {
+- *bio = NULL;
+- return NULL;
+- }
++ WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq);
+
+- type = blk_mq_get_hctx_type((*bio)->bi_opf);
+- hctx_type = rq->mq_hctx->type;
+ if (type != hctx_type &&
+ !(type == HCTX_TYPE_READ && hctx_type == HCTX_TYPE_DEFAULT))
+- return NULL;
+- if (op_is_flush(rq->cmd_flags) != op_is_flush((*bio)->bi_opf))
+- return NULL;
++ return false;
++ if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf))
++ return false;
+
+ /*
+ * If any qos ->throttle() end up blocking, we will have flushed the
+@@ -2911,11 +2896,11 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q,
+ * before we throttle.
+ */
+ plug->cached_rq = rq_list_next(rq);
+- rq_qos_throttle(q, *bio);
++ rq_qos_throttle(rq->q, bio);
+
+- rq->cmd_flags = (*bio)->bi_opf;
++ rq->cmd_flags = bio->bi_opf;
+ INIT_LIST_HEAD(&rq->queuelist);
+- return rq;
++ return true;
+ }
+
+ static void bio_set_ioprio(struct bio *bio)
+@@ -2944,7 +2929,7 @@ void blk_mq_submit_bio(struct bio *bio)
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ struct blk_plug *plug = blk_mq_plug(bio);
+ const int is_sync = op_is_sync(bio->bi_opf);
+- struct request *rq;
++ struct request *rq = NULL;
+ unsigned int nr_segs = 1;
+ blk_status_t ret;
+
+@@ -2955,20 +2940,36 @@ void blk_mq_submit_bio(struct bio *bio)
+ return;
+ }
+
+- if (!bio_integrity_prep(bio))
+- return;
+-
+ bio_set_ioprio(bio);
+
+- rq = blk_mq_get_cached_request(q, plug, &bio, nr_segs);
+- if (!rq) {
+- if (!bio)
++ if (plug) {
++ rq = rq_list_peek(&plug->cached_rq);
++ if (rq && rq->q != q)
++ rq = NULL;
++ }
++ if (rq) {
++ if (!bio_integrity_prep(bio))
+ return;
+- rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
+- if (unlikely(!rq))
++ if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
+ return;
++ if (blk_mq_can_use_cached_rq(rq, plug, bio))
++ goto done;
++ percpu_ref_get(&q->q_usage_counter);
++ } else {
++ if (unlikely(bio_queue_enter(bio)))
++ return;
++ if (!bio_integrity_prep(bio))
++ goto fail;
++ }
++
++ rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
++ if (unlikely(!rq)) {
++fail:
++ blk_queue_exit(q);
++ return;
+ }
+
++done:
+ trace_block_getrq(bio);
+
+ rq_qos_track(q, rq, bio);
+--
+2.43.0
+
--- /dev/null
+From 4620179873d798d8815800dcaa3f411857d6aee7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 25 Oct 2023 16:10:18 +0200
+Subject: block: update the stable_writes flag in bdev_add
+
+From: Christoph Hellwig <hch@lst.de>
+
+[ Upstream commit 1898efcdbed32bb1c67269c985a50bab0dbc9493 ]
+
+Propagate the per-queue stable_write flags into each bdev inode in bdev_add.
+This makes sure devices that require stable writes have it set for I/O
+on the block device node as well.
+
+Note that this doesn't cover the case of a flag changing on a live device
+yet. We should handle that as well, but I plan to cover it as part of a
+more general rework of how changing runtime parameters on block devices
+works.
+
+Fixes: 1cb039f3dc16 ("bdi: replace BDI_CAP_STABLE_WRITES with a queue and a sb flag")
+Reported-by: Ilya Dryomov <idryomov@gmail.com>
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Link: https://lore.kernel.org/r/20231025141020.192413-3-hch@lst.de
+Tested-by: Ilya Dryomov <idryomov@gmail.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ block/bdev.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/block/bdev.c b/block/bdev.c
+index d699ecdb32604..b61502ec8da06 100644
+--- a/block/bdev.c
++++ b/block/bdev.c
+@@ -507,6 +507,8 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
+
+ void bdev_add(struct block_device *bdev, dev_t dev)
+ {
++ if (bdev_stable_writes(bdev))
++ mapping_set_stable_writes(bdev->bd_inode->i_mapping);
+ bdev->bd_dev = dev;
+ bdev->bd_inode->i_rdev = dev;
+ bdev->bd_inode->i_ino = dev;
+--
+2.43.0
+
--- /dev/null
+From af39ac5b0695d95e5a080366d3ec9115d9fa2e72 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 3 Jan 2024 16:59:24 -0800
+Subject: bnxt_en: Remove mis-applied code from bnxt_cfg_ntp_filters()
+
+From: Michael Chan <michael.chan@broadcom.com>
+
+[ Upstream commit e009b2efb7a8850498796b360043ac25c8d3d28f ]
+
+The 2 lines to check for the BNXT_HWRM_PF_UNLOAD_SP_EVENT bit were
+mis-applied to bnxt_cfg_ntp_filters() and should have been applied to
+bnxt_sp_task().
+
+Fixes: 19241368443f ("bnxt_en: Send PF driver unload notification to all VFs.")
+Reviewed-by: Andy Gospodarek <andrew.gospodarek@broadcom.com>
+Signed-off-by: Michael Chan <michael.chan@broadcom.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/broadcom/bnxt/bnxt.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+index 623cdeb29ed90..df4d88d35701b 100644
+--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+@@ -12081,6 +12081,8 @@ static void bnxt_sp_task(struct work_struct *work)
+ bnxt_cfg_ntp_filters(bp);
+ if (test_and_clear_bit(BNXT_HWRM_EXEC_FWD_REQ_SP_EVENT, &bp->sp_event))
+ bnxt_hwrm_exec_fwd_req(bp);
++ if (test_and_clear_bit(BNXT_HWRM_PF_UNLOAD_SP_EVENT, &bp->sp_event))
++ netdev_info(bp->dev, "Receive PF driver unload event!\n");
+ if (test_and_clear_bit(BNXT_PERIODIC_STATS_SP_EVENT, &bp->sp_event)) {
+ bnxt_hwrm_port_qstats(bp, 0);
+ bnxt_hwrm_port_qstats_ext(bp, 0);
+@@ -13059,8 +13061,6 @@ static void bnxt_cfg_ntp_filters(struct bnxt *bp)
+ }
+ }
+ }
+- if (test_and_clear_bit(BNXT_HWRM_PF_UNLOAD_SP_EVENT, &bp->sp_event))
+- netdev_info(bp->dev, "Receive PF driver unload event!\n");
+ }
+
+ #else
+--
+2.43.0
+
--- /dev/null
+From 02818dc2580eae9be766e1be3885bdeeeb7ef526 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 2 Mar 2023 15:50:04 -0800
+Subject: bpf: clean up visit_insn()'s instruction processing
+
+From: Andrii Nakryiko <andrii@kernel.org>
+
+[ Upstream commit 653ae3a874aca6764a4c1f5a8bf1b072ade0d6f4 ]
+
+Instead of referencing processed instruction repeatedly as insns[t]
+throughout entire visit_insn() function, take a local insn pointer and
+work with it in a cleaner way.
+
+It makes enhancing this function further a bit easier as well.
+
+Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
+Link: https://lore.kernel.org/r/20230302235015.2044271-7-andrii@kernel.org
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Stable-dep-of: 3feb263bb516 ("bpf: handle ldimm64 properly in check_cfg()")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/bpf/verifier.c | 25 ++++++++++++-------------
+ 1 file changed, 12 insertions(+), 13 deletions(-)
+
+diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
+index d1393e07ab2c9..73d500c51bd86 100644
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -11115,44 +11115,43 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
+ */
+ static int visit_insn(int t, struct bpf_verifier_env *env)
+ {
+- struct bpf_insn *insns = env->prog->insnsi;
++ struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t];
+ int ret;
+
+- if (bpf_pseudo_func(insns + t))
++ if (bpf_pseudo_func(insn))
+ return visit_func_call_insn(t, insns, env, true);
+
+ /* All non-branch instructions have a single fall-through edge. */
+- if (BPF_CLASS(insns[t].code) != BPF_JMP &&
+- BPF_CLASS(insns[t].code) != BPF_JMP32)
++ if (BPF_CLASS(insn->code) != BPF_JMP &&
++ BPF_CLASS(insn->code) != BPF_JMP32)
+ return push_insn(t, t + 1, FALLTHROUGH, env, false);
+
+- switch (BPF_OP(insns[t].code)) {
++ switch (BPF_OP(insn->code)) {
+ case BPF_EXIT:
+ return DONE_EXPLORING;
+
+ case BPF_CALL:
+- if (insns[t].imm == BPF_FUNC_timer_set_callback)
++ if (insn->imm == BPF_FUNC_timer_set_callback)
+ /* Mark this call insn as a prune point to trigger
+ * is_state_visited() check before call itself is
+ * processed by __check_func_call(). Otherwise new
+ * async state will be pushed for further exploration.
+ */
+ mark_prune_point(env, t);
+- return visit_func_call_insn(t, insns, env,
+- insns[t].src_reg == BPF_PSEUDO_CALL);
++ return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL);
+
+ case BPF_JA:
+- if (BPF_SRC(insns[t].code) != BPF_K)
++ if (BPF_SRC(insn->code) != BPF_K)
+ return -EINVAL;
+
+ /* unconditional jump with single edge */
+- ret = push_insn(t, t + insns[t].off + 1, FALLTHROUGH, env,
++ ret = push_insn(t, t + insn->off + 1, FALLTHROUGH, env,
+ true);
+ if (ret)
+ return ret;
+
+- mark_prune_point(env, t + insns[t].off + 1);
+- mark_jmp_point(env, t + insns[t].off + 1);
++ mark_prune_point(env, t + insn->off + 1);
++ mark_jmp_point(env, t + insn->off + 1);
+
+ return ret;
+
+@@ -11164,7 +11163,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
+ if (ret)
+ return ret;
+
+- return push_insn(t, t + insns[t].off + 1, BRANCH, env, true);
++ return push_insn(t, t + insn->off + 1, BRANCH, env, true);
+ }
+ }
+
+--
+2.43.0
+
--- /dev/null
+From 1d848bcf5df37f2bdcc07a0518140fe62ed6383b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 6 Dec 2022 15:33:43 -0800
+Subject: bpf: decouple prune and jump points
+
+From: Andrii Nakryiko <andrii@kernel.org>
+
+[ Upstream commit bffdeaa8a5af7200b0e74c9d5a41167f86626a36 ]
+
+BPF verifier marks some instructions as prune points. Currently these
+prune points serve two purposes.
+
+It's a point where verifier tries to find previously verified state and
+check current state's equivalence to short circuit verification for
+current code path.
+
+But also currently it's a point where jump history, used for precision
+backtracking, is updated. This is done so that non-linear flow of
+execution could be properly backtracked.
+
+Such coupling is coincidental and unnecessary. Some prune points are not
+part of some non-linear jump path, so don't need update of jump history.
+On the other hand, not all instructions which have to be recorded in
+jump history necessarily are good prune points.
+
+This patch splits prune and jump points into independent flags.
+Currently all prune points are marked as jump points to minimize amount
+of changes in this patch, but next patch will perform some optimization
+of prune vs jmp point placement.
+
+No functional changes are intended.
+
+Acked-by: John Fastabend <john.fastabend@gmail.com>
+Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
+Link: https://lore.kernel.org/r/20221206233345.438540-2-andrii@kernel.org
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Stable-dep-of: 3feb263bb516 ("bpf: handle ldimm64 properly in check_cfg()")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/bpf_verifier.h | 1 +
+ kernel/bpf/verifier.c | 57 +++++++++++++++++++++++++++---------
+ 2 files changed, 44 insertions(+), 14 deletions(-)
+
+diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
+index 1a32baa78ce26..f080ccf27d256 100644
+--- a/include/linux/bpf_verifier.h
++++ b/include/linux/bpf_verifier.h
+@@ -429,6 +429,7 @@ struct bpf_insn_aux_data {
+ /* below fields are initialized once */
+ unsigned int orig_idx; /* original instruction index */
+ bool prune_point;
++ bool jmp_point;
+ };
+
+ #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
+diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
+index ee6e811b43158..ec688665aaa25 100644
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -2512,6 +2512,16 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
+ return 0;
+ }
+
++static void mark_jmp_point(struct bpf_verifier_env *env, int idx)
++{
++ env->insn_aux_data[idx].jmp_point = true;
++}
++
++static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx)
++{
++ return env->insn_aux_data[insn_idx].jmp_point;
++}
++
+ /* for any branch, call, exit record the history of jmps in the given state */
+ static int push_jmp_history(struct bpf_verifier_env *env,
+ struct bpf_verifier_state *cur)
+@@ -2520,6 +2530,9 @@ static int push_jmp_history(struct bpf_verifier_env *env,
+ struct bpf_idx_pair *p;
+ size_t alloc_size;
+
++ if (!is_jmp_point(env, env->insn_idx))
++ return 0;
++
+ cnt++;
+ alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p)));
+ p = krealloc(cur->jmp_history, alloc_size, GFP_USER);
+@@ -11000,11 +11013,16 @@ static struct bpf_verifier_state_list **explored_state(
+ return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)];
+ }
+
+-static void init_explored_state(struct bpf_verifier_env *env, int idx)
++static void mark_prune_point(struct bpf_verifier_env *env, int idx)
+ {
+ env->insn_aux_data[idx].prune_point = true;
+ }
+
++static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx)
++{
++ return env->insn_aux_data[insn_idx].prune_point;
++}
++
+ enum {
+ DONE_EXPLORING = 0,
+ KEEP_EXPLORING = 1,
+@@ -11033,9 +11051,11 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
+ return -EINVAL;
+ }
+
+- if (e == BRANCH)
++ if (e == BRANCH) {
+ /* mark branch target for state pruning */
+- init_explored_state(env, w);
++ mark_prune_point(env, w);
++ mark_jmp_point(env, w);
++ }
+
+ if (insn_state[w] == 0) {
+ /* tree-edge */
+@@ -11073,10 +11093,13 @@ static int visit_func_call_insn(int t, int insn_cnt,
+ if (ret)
+ return ret;
+
+- if (t + 1 < insn_cnt)
+- init_explored_state(env, t + 1);
++ if (t + 1 < insn_cnt) {
++ mark_prune_point(env, t + 1);
++ mark_jmp_point(env, t + 1);
++ }
+ if (visit_callee) {
+- init_explored_state(env, t);
++ mark_prune_point(env, t);
++ mark_jmp_point(env, t);
+ ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env,
+ /* It's ok to allow recursion from CFG point of
+ * view. __check_func_call() will do the actual
+@@ -11110,13 +11133,15 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
+ return DONE_EXPLORING;
+
+ case BPF_CALL:
+- if (insns[t].imm == BPF_FUNC_timer_set_callback)
++ if (insns[t].imm == BPF_FUNC_timer_set_callback) {
+ /* Mark this call insn to trigger is_state_visited() check
+ * before call itself is processed by __check_func_call().
+ * Otherwise new async state will be pushed for further
+ * exploration.
+ */
+- init_explored_state(env, t);
++ mark_prune_point(env, t);
++ mark_jmp_point(env, t);
++ }
+ return visit_func_call_insn(t, insn_cnt, insns, env,
+ insns[t].src_reg == BPF_PSEUDO_CALL);
+
+@@ -11134,18 +11159,22 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
+ * but it's marked, since backtracking needs
+ * to record jmp history in is_state_visited().
+ */
+- init_explored_state(env, t + insns[t].off + 1);
++ mark_prune_point(env, t + insns[t].off + 1);
++ mark_jmp_point(env, t + insns[t].off + 1);
+ /* tell verifier to check for equivalent states
+ * after every call and jump
+ */
+- if (t + 1 < insn_cnt)
+- init_explored_state(env, t + 1);
++ if (t + 1 < insn_cnt) {
++ mark_prune_point(env, t + 1);
++ mark_jmp_point(env, t + 1);
++ }
+
+ return ret;
+
+ default:
+ /* conditional jump with two edges */
+- init_explored_state(env, t);
++ mark_prune_point(env, t);
++ mark_jmp_point(env, t);
+ ret = push_insn(t, t + 1, FALLTHROUGH, env, true);
+ if (ret)
+ return ret;
+@@ -12178,11 +12207,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
+ bool add_new_state = env->test_state_freq ? true : false;
+
+ cur->last_insn_idx = env->prev_insn_idx;
+- if (!env->insn_aux_data[insn_idx].prune_point)
++ if (!is_prune_point(env, insn_idx))
+ /* this 'insn_idx' instruction wasn't marked, so we will not
+ * be doing state search here
+ */
+- return 0;
++ return push_jmp_history(env, cur);
+
+ /* bpf progs typically have pruning point every 4 instructions
+ * http://vger.kernel.org/bpfconf2019.html#session-1
+--
+2.43.0
+
--- /dev/null
+From 5f576d9732e2017e5f5e1da533df5a11be2b311b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 9 Nov 2023 16:26:37 -0800
+Subject: bpf: fix precision backtracking instruction iteration
+
+From: Andrii Nakryiko <andrii@kernel.org>
+
+[ Upstream commit 4bb7ea946a370707315ab774432963ce47291946 ]
+
+Fix an edge case in __mark_chain_precision() which prematurely stops
+backtracking instructions in a state if it happens that state's first
+and last instruction indexes are the same. This situation doesn't
+necessarily mean that there were no instructions simulated in a state,
+but rather that we started from the instruction, jumped around a bit,
+and then ended up at the same instruction before checkpointing or
+marking precision.
+
+To distinguish between these two possible situations, we need to consult
+jump history. If it's empty or contains a single record "bridging" parent
+state and first instruction of processed state, then we indeed
+backtracked all instructions in this state. But if history is not empty,
+we are definitely not done yet.
+
+Move this logic inside get_prev_insn_idx() to contain it more nicely.
+Use -ENOENT return code to denote "we are out of instructions"
+situation.
+
+This bug was exposed by verifier_loop1.c's bounded_recursion subtest, once
+the next fix in this patch set is applied.
+
+Acked-by: Eduard Zingerman <eddyz87@gmail.com>
+Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking")
+Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
+Link: https://lore.kernel.org/r/20231110002638.4168352-3-andrii@kernel.org
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/bpf/verifier.c | 21 +++++++++++++++++++--
+ 1 file changed, 19 insertions(+), 2 deletions(-)
+
+diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
+index 95521beec66c5..142e10d49fd81 100644
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -2551,12 +2551,29 @@ static int push_jmp_history(struct bpf_verifier_env *env,
+
+ /* Backtrack one insn at a time. If idx is not at the top of recorded
+ * history then previous instruction came from straight line execution.
++ * Return -ENOENT if we exhausted all instructions within given state.
++ *
++ * It's legal to have a bit of a looping with the same starting and ending
++ * insn index within the same state, e.g.: 3->4->5->3, so just because current
++ * instruction index is the same as state's first_idx doesn't mean we are
++ * done. If there is still some jump history left, we should keep going. We
++ * need to take into account that we might have a jump history between given
++ * state's parent and itself, due to checkpointing. In this case, we'll have
++ * history entry recording a jump from last instruction of parent state and
++ * first instruction of given state.
+ */
+ static int get_prev_insn_idx(struct bpf_verifier_state *st, int i,
+ u32 *history)
+ {
+ u32 cnt = *history;
+
++ if (i == st->first_insn_idx) {
++ if (cnt == 0)
++ return -ENOENT;
++ if (cnt == 1 && st->jmp_history[0].idx == i)
++ return -ENOENT;
++ }
++
+ if (cnt && st->jmp_history[cnt - 1].idx == i) {
+ i = st->jmp_history[cnt - 1].prev_idx;
+ (*history)--;
+@@ -3052,9 +3069,9 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r
+ * Nothing to be tracked further in the parent state.
+ */
+ return 0;
+- if (i == first_idx)
+- break;
+ i = get_prev_insn_idx(st, i, &history);
++ if (i == -ENOENT)
++ break;
+ if (i >= env->prog->len) {
+ /* This can happen if backtracking reached insn 0
+ * and there are still reg_mask or stack_mask
+--
+2.43.0
+
--- /dev/null
+From e72d96cb30d0d7cac5d70679da65b38e3fded5d9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 9 Nov 2023 16:26:36 -0800
+Subject: bpf: handle ldimm64 properly in check_cfg()
+
+From: Andrii Nakryiko <andrii@kernel.org>
+
+[ Upstream commit 3feb263bb516ee7e1da0acd22b15afbb9a7daa19 ]
+
+ldimm64 instructions are 16-byte long, and so have to be handled
+appropriately in check_cfg(), just like the rest of BPF verifier does.
+
+This has implications in three places:
+ - when determining next instruction for non-jump instructions;
+ - when determining next instruction for callback address ldimm64
+ instructions (in visit_func_call_insn());
+ - when checking for unreachable instructions, where second half of
+ ldimm64 is expected to be unreachable;
+
+We take this also as an opportunity to report jump into the middle of
+ldimm64. And adjust few test_verifier tests accordingly.
+
+Acked-by: Eduard Zingerman <eddyz87@gmail.com>
+Reported-by: Hao Sun <sunhao.th@gmail.com>
+Fixes: 475fb78fbf48 ("bpf: verifier (add branch/goto checks)")
+Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
+Link: https://lore.kernel.org/r/20231110002638.4168352-2-andrii@kernel.org
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/bpf.h | 8 ++++--
+ kernel/bpf/verifier.c | 27 ++++++++++++++-----
+ .../testing/selftests/bpf/verifier/ld_imm64.c | 8 +++---
+ 3 files changed, 30 insertions(+), 13 deletions(-)
+
+diff --git a/include/linux/bpf.h b/include/linux/bpf.h
+index 619fcba84be22..ba22cf4f5fc0e 100644
+--- a/include/linux/bpf.h
++++ b/include/linux/bpf.h
+@@ -702,10 +702,14 @@ bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size)
+ aux->ctx_field_size = size;
+ }
+
++static bool bpf_is_ldimm64(const struct bpf_insn *insn)
++{
++ return insn->code == (BPF_LD | BPF_IMM | BPF_DW);
++}
++
+ static inline bool bpf_pseudo_func(const struct bpf_insn *insn)
+ {
+- return insn->code == (BPF_LD | BPF_IMM | BPF_DW) &&
+- insn->src_reg == BPF_PSEUDO_FUNC;
++ return bpf_is_ldimm64(insn) && insn->src_reg == BPF_PSEUDO_FUNC;
+ }
+
+ struct bpf_prog_ops {
+diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
+index dd025f66efabc..95521beec66c5 100644
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -11090,15 +11090,16 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
+ struct bpf_verifier_env *env,
+ bool visit_callee)
+ {
+- int ret;
++ int ret, insn_sz;
+
+- ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
++ insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1;
++ ret = push_insn(t, t + insn_sz, FALLTHROUGH, env, false);
+ if (ret)
+ return ret;
+
+- mark_prune_point(env, t + 1);
++ mark_prune_point(env, t + insn_sz);
+ /* when we exit from subprog, we need to record non-linear history */
+- mark_jmp_point(env, t + 1);
++ mark_jmp_point(env, t + insn_sz);
+
+ if (visit_callee) {
+ mark_prune_point(env, t);
+@@ -11120,15 +11121,17 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
+ static int visit_insn(int t, struct bpf_verifier_env *env)
+ {
+ struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t];
+- int ret, off;
++ int ret, off, insn_sz;
+
+ if (bpf_pseudo_func(insn))
+ return visit_func_call_insn(t, insns, env, true);
+
+ /* All non-branch instructions have a single fall-through edge. */
+ if (BPF_CLASS(insn->code) != BPF_JMP &&
+- BPF_CLASS(insn->code) != BPF_JMP32)
+- return push_insn(t, t + 1, FALLTHROUGH, env, false);
++ BPF_CLASS(insn->code) != BPF_JMP32) {
++ insn_sz = bpf_is_ldimm64(insn) ? 2 : 1;
++ return push_insn(t, t + insn_sz, FALLTHROUGH, env, false);
++ }
+
+ switch (BPF_OP(insn->code)) {
+ case BPF_EXIT:
+@@ -11227,11 +11230,21 @@ static int check_cfg(struct bpf_verifier_env *env)
+ }
+
+ for (i = 0; i < insn_cnt; i++) {
++ struct bpf_insn *insn = &env->prog->insnsi[i];
++
+ if (insn_state[i] != EXPLORED) {
+ verbose(env, "unreachable insn %d\n", i);
+ ret = -EINVAL;
+ goto err_free;
+ }
++ if (bpf_is_ldimm64(insn)) {
++ if (insn_state[i + 1] != 0) {
++ verbose(env, "jump into the middle of ldimm64 insn %d\n", i);
++ ret = -EINVAL;
++ goto err_free;
++ }
++ i++; /* skip second half of ldimm64 */
++ }
+ }
+ ret = 0; /* cfg looks good */
+
+diff --git a/tools/testing/selftests/bpf/verifier/ld_imm64.c b/tools/testing/selftests/bpf/verifier/ld_imm64.c
+index f9297900cea6d..78f19c255f20b 100644
+--- a/tools/testing/selftests/bpf/verifier/ld_imm64.c
++++ b/tools/testing/selftests/bpf/verifier/ld_imm64.c
+@@ -9,8 +9,8 @@
+ BPF_MOV64_IMM(BPF_REG_0, 2),
+ BPF_EXIT_INSN(),
+ },
+- .errstr = "invalid BPF_LD_IMM insn",
+- .errstr_unpriv = "R1 pointer comparison",
++ .errstr = "jump into the middle of ldimm64 insn 1",
++ .errstr_unpriv = "jump into the middle of ldimm64 insn 1",
+ .result = REJECT,
+ },
+ {
+@@ -23,8 +23,8 @@
+ BPF_LD_IMM64(BPF_REG_0, 1),
+ BPF_EXIT_INSN(),
+ },
+- .errstr = "invalid BPF_LD_IMM insn",
+- .errstr_unpriv = "R1 pointer comparison",
++ .errstr = "jump into the middle of ldimm64 insn 1",
++ .errstr_unpriv = "jump into the middle of ldimm64 insn 1",
+ .result = REJECT,
+ },
+ {
+--
+2.43.0
+
--- /dev/null
+From 90b6441df9cf455cd5ad99ec2231d29e605a5a47 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 6 Dec 2022 15:33:45 -0800
+Subject: bpf: remove unnecessary prune and jump points
+
+From: Andrii Nakryiko <andrii@kernel.org>
+
+[ Upstream commit 618945fbed501b6e5865042068a51edfb2dda948 ]
+
+Don't mark some instructions as jump points when there are actually no
+jumps and instructions are just processed sequentially. Such case is
+handled naturally by precision backtracking logic without the need to
+update jump history. See get_prev_insn_idx(). It goes back linearly by
+one instruction, unless current top of jmp_history is pointing to
+current instruction. In such case we use `st->jmp_history[cnt - 1].prev_idx`
+to find instruction from which we jumped to the current instruction
+non-linearly.
+
+Also remove both jump and prune point marking for instruction right
+after unconditional jumps, as program flow can get to the instruction
+right after unconditional jump instruction only if there is a jump to
+that instruction from somewhere else in the program. In such case we'll
+mark such instruction as prune/jump point because it's a destination of
+a jump.
+
+This change has no changes in terms of number of instructions or states
+processed across Cilium and selftests programs.
+
+Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
+Acked-by: John Fastabend <john.fastabend@gmail.com>
+Link: https://lore.kernel.org/r/20221206233345.438540-4-andrii@kernel.org
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Stable-dep-of: 3feb263bb516 ("bpf: handle ldimm64 properly in check_cfg()")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/bpf/verifier.c | 34 ++++++++++------------------------
+ 1 file changed, 10 insertions(+), 24 deletions(-)
+
+diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
+index ec688665aaa25..09631797d9e0c 100644
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -11093,13 +11093,12 @@ static int visit_func_call_insn(int t, int insn_cnt,
+ if (ret)
+ return ret;
+
+- if (t + 1 < insn_cnt) {
+- mark_prune_point(env, t + 1);
+- mark_jmp_point(env, t + 1);
+- }
++ mark_prune_point(env, t + 1);
++ /* when we exit from subprog, we need to record non-linear history */
++ mark_jmp_point(env, t + 1);
++
+ if (visit_callee) {
+ mark_prune_point(env, t);
+- mark_jmp_point(env, t);
+ ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env,
+ /* It's ok to allow recursion from CFG point of
+ * view. __check_func_call() will do the actual
+@@ -11133,15 +11132,13 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
+ return DONE_EXPLORING;
+
+ case BPF_CALL:
+- if (insns[t].imm == BPF_FUNC_timer_set_callback) {
+- /* Mark this call insn to trigger is_state_visited() check
+- * before call itself is processed by __check_func_call().
+- * Otherwise new async state will be pushed for further
+- * exploration.
++ if (insns[t].imm == BPF_FUNC_timer_set_callback)
++ /* Mark this call insn as a prune point to trigger
++ * is_state_visited() check before call itself is
++ * processed by __check_func_call(). Otherwise new
++ * async state will be pushed for further exploration.
+ */
+ mark_prune_point(env, t);
+- mark_jmp_point(env, t);
+- }
+ return visit_func_call_insn(t, insn_cnt, insns, env,
+ insns[t].src_reg == BPF_PSEUDO_CALL);
+
+@@ -11155,26 +11152,15 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
+ if (ret)
+ return ret;
+
+- /* unconditional jmp is not a good pruning point,
+- * but it's marked, since backtracking needs
+- * to record jmp history in is_state_visited().
+- */
+ mark_prune_point(env, t + insns[t].off + 1);
+ mark_jmp_point(env, t + insns[t].off + 1);
+- /* tell verifier to check for equivalent states
+- * after every call and jump
+- */
+- if (t + 1 < insn_cnt) {
+- mark_prune_point(env, t + 1);
+- mark_jmp_point(env, t + 1);
+- }
+
+ return ret;
+
+ default:
+ /* conditional jump with two edges */
+ mark_prune_point(env, t);
+- mark_jmp_point(env, t);
++
+ ret = push_insn(t, t + 1, FALLTHROUGH, env, true);
+ if (ret)
+ return ret;
+--
+2.43.0
+
--- /dev/null
+From b9857568c364a47cb907e60b86ee7c0a1f73a7b7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 7 Dec 2022 11:55:34 -0800
+Subject: bpf: Remove unused insn_cnt argument from visit_[func_call_]insn()
+
+From: Andrii Nakryiko <andrii@kernel.org>
+
+[ Upstream commit dcb2288b1fd9a8cdf2f3b8c0c7b3763346ef515f ]
+
+Number of total instructions in BPF program (including subprogs) can and
+is accessed from env->prog->len. visit_func_call_insn() doesn't do any
+checks against insn_cnt anymore, relying on push_insn() to do this check
+internally. So remove unnecessary insn_cnt input argument from
+visit_func_call_insn() and visit_insn() functions.
+
+Suggested-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Link: https://lore.kernel.org/bpf/20221207195534.2866030-1-andrii@kernel.org
+Stable-dep-of: 3feb263bb516 ("bpf: handle ldimm64 properly in check_cfg()")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/bpf/verifier.c | 11 +++++------
+ 1 file changed, 5 insertions(+), 6 deletions(-)
+
+diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
+index 09631797d9e0c..d1393e07ab2c9 100644
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -11082,8 +11082,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env,
+ return DONE_EXPLORING;
+ }
+
+-static int visit_func_call_insn(int t, int insn_cnt,
+- struct bpf_insn *insns,
++static int visit_func_call_insn(int t, struct bpf_insn *insns,
+ struct bpf_verifier_env *env,
+ bool visit_callee)
+ {
+@@ -11114,13 +11113,13 @@ static int visit_func_call_insn(int t, int insn_cnt,
+ * DONE_EXPLORING - the instruction was fully explored
+ * KEEP_EXPLORING - there is still work to be done before it is fully explored
+ */
+-static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
++static int visit_insn(int t, struct bpf_verifier_env *env)
+ {
+ struct bpf_insn *insns = env->prog->insnsi;
+ int ret;
+
+ if (bpf_pseudo_func(insns + t))
+- return visit_func_call_insn(t, insn_cnt, insns, env, true);
++ return visit_func_call_insn(t, insns, env, true);
+
+ /* All non-branch instructions have a single fall-through edge. */
+ if (BPF_CLASS(insns[t].code) != BPF_JMP &&
+@@ -11139,7 +11138,7 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env)
+ * async state will be pushed for further exploration.
+ */
+ mark_prune_point(env, t);
+- return visit_func_call_insn(t, insn_cnt, insns, env,
++ return visit_func_call_insn(t, insns, env,
+ insns[t].src_reg == BPF_PSEUDO_CALL);
+
+ case BPF_JA:
+@@ -11196,7 +11195,7 @@ static int check_cfg(struct bpf_verifier_env *env)
+ while (env->cfg.cur_stack > 0) {
+ int t = insn_stack[env->cfg.cur_stack - 1];
+
+- ret = visit_insn(t, insn_cnt, env);
++ ret = visit_insn(t, env);
+ switch (ret) {
+ case DONE_EXPLORING:
+ insn_state[t] = EXPLORED;
+--
+2.43.0
+
--- /dev/null
+From 2e5ec045cba65071ef0736ce3d6a2e56106c261d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 28 Nov 2023 17:25:56 -0800
+Subject: bpf, sockmap: af_unix stream sockets need to hold ref for pair sock
+
+From: John Fastabend <john.fastabend@gmail.com>
+
+[ Upstream commit 8866730aed5100f06d3d965c22f1c61f74942541 ]
+
+AF_UNIX stream sockets are a paired socket. So sending on one of the pairs
+will lookup the paired socket as part of the send operation. It is possible
+however to put just one of the pairs in a BPF map. This currently increments
+the refcnt on the sock in the sockmap to ensure it is not free'd by the
+stack before sockmap cleans up its state and stops any skbs being sent/recv'd
+to that socket.
+
+But we missed a case. If the peer socket is closed it will be free'd by the
+stack. However, the paired socket can still be referenced from BPF sockmap
+side because we hold a reference there. Then if we are sending traffic through
+BPF sockmap to that socket it will try to dereference the free'd pair in its
+send logic creating a use after free. And following splat:
+
+ [59.900375] BUG: KASAN: slab-use-after-free in sk_wake_async+0x31/0x1b0
+ [59.901211] Read of size 8 at addr ffff88811acbf060 by task kworker/1:2/954
+ [...]
+ [59.905468] Call Trace:
+ [59.905787] <TASK>
+ [59.906066] dump_stack_lvl+0x130/0x1d0
+ [59.908877] print_report+0x16f/0x740
+ [59.910629] kasan_report+0x118/0x160
+ [59.912576] sk_wake_async+0x31/0x1b0
+ [59.913554] sock_def_readable+0x156/0x2a0
+ [59.914060] unix_stream_sendmsg+0x3f9/0x12a0
+ [59.916398] sock_sendmsg+0x20e/0x250
+ [59.916854] skb_send_sock+0x236/0xac0
+ [59.920527] sk_psock_backlog+0x287/0xaa0
+
+To fix let BPF sockmap hold a refcnt on both the socket in the sockmap and its
+paired socket. It wasn't obvious how to contain the fix to bpf_unix logic. The
+primary problem with keeping this logic in bpf_unix was: In the sock close()
+we could handle the deref by having a close handler. But, when we are destroying
+the psock through a map delete operation we wouldn't have gotten any signal
+through the proto struct other than it being replaced. If we do the deref from
+the proto replace it's too early because we need to deref the sk_pair after the
+backlog worker has been stopped.
+
+Given all this it seems best to just cache it at the end of the psock and eat 8B
+for the af_unix and vsock users. Notice dgram sockets are OK because they handle
+locking already.
+
+Fixes: 94531cfcbe79 ("af_unix: Add unix_stream_proto for sockmap")
+Signed-off-by: John Fastabend <john.fastabend@gmail.com>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
+Link: https://lore.kernel.org/bpf/20231129012557.95371-2-john.fastabend@gmail.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/skmsg.h | 1 +
+ include/net/af_unix.h | 1 +
+ net/core/skmsg.c | 2 ++
+ net/unix/af_unix.c | 2 --
+ net/unix/unix_bpf.c | 5 +++++
+ 5 files changed, 9 insertions(+), 2 deletions(-)
+
+diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
+index c1637515a8a41..c953b8c0d2f43 100644
+--- a/include/linux/skmsg.h
++++ b/include/linux/skmsg.h
+@@ -106,6 +106,7 @@ struct sk_psock {
+ struct mutex work_mutex;
+ struct sk_psock_work_state work_state;
+ struct delayed_work work;
++ struct sock *sk_pair;
+ struct rcu_work rwork;
+ };
+
+diff --git a/include/net/af_unix.h b/include/net/af_unix.h
+index 480fa579787e5..55ca217c626b7 100644
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -77,6 +77,7 @@ static inline struct unix_sock *unix_sk(const struct sock *sk)
+ {
+ return (struct unix_sock *)sk;
+ }
++#define unix_peer(sk) (unix_sk(sk)->peer)
+
+ #define peer_wait peer_wq.wait
+
+diff --git a/net/core/skmsg.c b/net/core/skmsg.c
+index a5c1f67dc96ec..3818035ea0021 100644
+--- a/net/core/skmsg.c
++++ b/net/core/skmsg.c
+@@ -825,6 +825,8 @@ static void sk_psock_destroy(struct work_struct *work)
+
+ if (psock->sk_redir)
+ sock_put(psock->sk_redir);
++ if (psock->sk_pair)
++ sock_put(psock->sk_pair);
+ sock_put(psock->sk);
+ kfree(psock);
+ }
+diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
+index 6dbeb80073338..be2ed7b0fe21c 100644
+--- a/net/unix/af_unix.c
++++ b/net/unix/af_unix.c
+@@ -211,8 +211,6 @@ static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
+ }
+ #endif /* CONFIG_SECURITY_NETWORK */
+
+-#define unix_peer(sk) (unix_sk(sk)->peer)
+-
+ static inline int unix_our_peer(struct sock *sk, struct sock *osk)
+ {
+ return unix_peer(osk) == sk;
+diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c
+index 2f9d8271c6ec7..7ea7c3a0d0d06 100644
+--- a/net/unix/unix_bpf.c
++++ b/net/unix/unix_bpf.c
+@@ -159,12 +159,17 @@ int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool re
+
+ int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
+ {
++ struct sock *sk_pair;
++
+ if (restore) {
+ sk->sk_write_space = psock->saved_write_space;
+ sock_replace_proto(sk, psock->sk_proto);
+ return 0;
+ }
+
++ sk_pair = unix_peer(sk);
++ sock_hold(sk_pair);
++ psock->sk_pair = sk_pair;
+ unix_stream_bpf_check_needs_rebuild(psock->sk_proto);
+ sock_replace_proto(sk, &unix_stream_bpf_prot);
+ return 0;
+--
+2.43.0
+
--- /dev/null
+From 7c7c0669562f577fee8dbb0e780a26bc7a1b146a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 27 Jul 2023 18:12:31 -0700
+Subject: bpf: Support new 32bit offset jmp instruction
+
+From: Yonghong Song <yonghong.song@linux.dev>
+
+[ Upstream commit 4cd58e9af8b9d9fff6b7145e742abbfcda0af4af ]
+
+Add interpreter/jit/verifier support for 32bit offset jmp instruction.
+If a conditional jmp instruction needs more than 16bit offset,
+it can be simulated with a conditional jmp + a 32bit jmp insn.
+
+Acked-by: Eduard Zingerman <eddyz87@gmail.com>
+Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
+Link: https://lore.kernel.org/r/20230728011231.3716103-1-yonghong.song@linux.dev
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Stable-dep-of: 3feb263bb516 ("bpf: handle ldimm64 properly in check_cfg()")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/net/bpf_jit_comp.c | 28 ++++++++++++++++++----------
+ kernel/bpf/core.c | 19 ++++++++++++++++---
+ kernel/bpf/verifier.c | 32 ++++++++++++++++++++++----------
+ 3 files changed, 56 insertions(+), 23 deletions(-)
+
+diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
+index 84c695ae1940f..b69aee6245e4a 100644
+--- a/arch/x86/net/bpf_jit_comp.c
++++ b/arch/x86/net/bpf_jit_comp.c
+@@ -1625,16 +1625,24 @@ st: if (is_imm8(insn->off))
+ break;
+
+ case BPF_JMP | BPF_JA:
+- if (insn->off == -1)
+- /* -1 jmp instructions will always jump
+- * backwards two bytes. Explicitly handling
+- * this case avoids wasting too many passes
+- * when there are long sequences of replaced
+- * dead code.
+- */
+- jmp_offset = -2;
+- else
+- jmp_offset = addrs[i + insn->off] - addrs[i];
++ case BPF_JMP32 | BPF_JA:
++ if (BPF_CLASS(insn->code) == BPF_JMP) {
++ if (insn->off == -1)
++ /* -1 jmp instructions will always jump
++ * backwards two bytes. Explicitly handling
++ * this case avoids wasting too many passes
++ * when there are long sequences of replaced
++ * dead code.
++ */
++ jmp_offset = -2;
++ else
++ jmp_offset = addrs[i + insn->off] - addrs[i];
++ } else {
++ if (insn->imm == -1)
++ jmp_offset = -2;
++ else
++ jmp_offset = addrs[i + insn->imm] - addrs[i];
++ }
+
+ if (!jmp_offset) {
+ /*
+diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
+index 7225cb67c0d3a..0b55ebf4a9b1f 100644
+--- a/kernel/bpf/core.c
++++ b/kernel/bpf/core.c
+@@ -367,7 +367,12 @@ static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
+ {
+ const s32 off_min = S16_MIN, off_max = S16_MAX;
+ s32 delta = end_new - end_old;
+- s32 off = insn->off;
++ s32 off;
++
++ if (insn->code == (BPF_JMP32 | BPF_JA))
++ off = insn->imm;
++ else
++ off = insn->off;
+
+ if (curr < pos && curr + off + 1 >= end_old)
+ off += delta;
+@@ -375,8 +380,12 @@ static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old,
+ off -= delta;
+ if (off < off_min || off > off_max)
+ return -ERANGE;
+- if (!probe_pass)
+- insn->off = off;
++ if (!probe_pass) {
++ if (insn->code == (BPF_JMP32 | BPF_JA))
++ insn->imm = off;
++ else
++ insn->off = off;
++ }
+ return 0;
+ }
+
+@@ -1586,6 +1595,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
+ INSN_3(JMP, JSLE, K), \
+ INSN_3(JMP, JSET, K), \
+ INSN_2(JMP, JA), \
++ INSN_2(JMP32, JA), \
+ /* Store instructions. */ \
+ /* Register based. */ \
+ INSN_3(STX, MEM, B), \
+@@ -1862,6 +1872,9 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
+ JMP_JA:
+ insn += insn->off;
+ CONT;
++ JMP32_JA:
++ insn += insn->imm;
++ CONT;
+ JMP_EXIT:
+ return BPF_R0;
+ /* JMP */
+diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
+index 73d500c51bd86..dd025f66efabc 100644
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -2254,7 +2254,10 @@ static int check_subprogs(struct bpf_verifier_env *env)
+ goto next;
+ if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL)
+ goto next;
+- off = i + insn[i].off + 1;
++ if (code == (BPF_JMP32 | BPF_JA))
++ off = i + insn[i].imm + 1;
++ else
++ off = i + insn[i].off + 1;
+ if (off < subprog_start || off >= subprog_end) {
+ verbose(env, "jump out of range from insn %d to %d\n", i, off);
+ return -EINVAL;
+@@ -2266,6 +2269,7 @@ static int check_subprogs(struct bpf_verifier_env *env)
+ * or unconditional jump back
+ */
+ if (code != (BPF_JMP | BPF_EXIT) &&
++ code != (BPF_JMP32 | BPF_JA) &&
+ code != (BPF_JMP | BPF_JA)) {
+ verbose(env, "last insn is not an exit or jmp\n");
+ return -EINVAL;
+@@ -11116,7 +11120,7 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns,
+ static int visit_insn(int t, struct bpf_verifier_env *env)
+ {
+ struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t];
+- int ret;
++ int ret, off;
+
+ if (bpf_pseudo_func(insn))
+ return visit_func_call_insn(t, insns, env, true);
+@@ -11144,14 +11148,19 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
+ if (BPF_SRC(insn->code) != BPF_K)
+ return -EINVAL;
+
++ if (BPF_CLASS(insn->code) == BPF_JMP)
++ off = insn->off;
++ else
++ off = insn->imm;
++
+ /* unconditional jump with single edge */
+- ret = push_insn(t, t + insn->off + 1, FALLTHROUGH, env,
++ ret = push_insn(t, t + off + 1, FALLTHROUGH, env,
+ true);
+ if (ret)
+ return ret;
+
+- mark_prune_point(env, t + insn->off + 1);
+- mark_jmp_point(env, t + insn->off + 1);
++ mark_prune_point(env, t + off + 1);
++ mark_jmp_point(env, t + off + 1);
+
+ return ret;
+
+@@ -12687,15 +12696,18 @@ static int do_check(struct bpf_verifier_env *env)
+ return err;
+ } else if (opcode == BPF_JA) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+- insn->imm != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0 ||
+- class == BPF_JMP32) {
++ (class == BPF_JMP && insn->imm != 0) ||
++ (class == BPF_JMP32 && insn->off != 0)) {
+ verbose(env, "BPF_JA uses reserved fields\n");
+ return -EINVAL;
+ }
+
+- env->insn_idx += insn->off + 1;
++ if (class == BPF_JMP)
++ env->insn_idx += insn->off + 1;
++ else
++ env->insn_idx += insn->imm + 1;
+ continue;
+
+ } else if (opcode == BPF_EXIT) {
+@@ -13521,13 +13533,13 @@ static bool insn_is_cond_jump(u8 code)
+ {
+ u8 op;
+
++ op = BPF_OP(code);
+ if (BPF_CLASS(code) == BPF_JMP32)
+- return true;
++ return op != BPF_JA;
+
+ if (BPF_CLASS(code) != BPF_JMP)
+ return false;
+
+- op = BPF_OP(code);
+ return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL;
+ }
+
+--
+2.43.0
+
--- /dev/null
+From 1a57e1d64338a8af8a056d73c2ebac861d202331 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Sep 2023 23:04:41 +0800
+Subject: bpf, x64: Fix tailcall infinite loop
+
+From: Leon Hwang <hffilwlqm@gmail.com>
+
+[ Upstream commit 2b5dcb31a19a2e0acd869b12c9db9b2d696ef544 ]
+
+From commit ebf7d1f508a73871 ("bpf, x64: rework pro/epilogue and tailcall
+handling in JIT"), the tailcall on x64 works better than before.
+
+From commit e411901c0b775a3a ("bpf: allow for tailcalls in BPF subprograms
+for x64 JIT"), tailcall is able to run in BPF subprograms on x64.
+
+From commit 5b92a28aae4dd0f8 ("bpf: Support attaching tracing BPF program
+to other BPF programs"), BPF program is able to trace other BPF programs.
+
+How about combining them all together?
+
+1. FENTRY/FEXIT on a BPF subprogram.
+2. A tailcall runs in the BPF subprogram.
+3. The tailcall calls the subprogram's caller.
+
+As a result, a tailcall infinite loop comes up. And the loop would halt
+the machine.
+
+As we know, in tail call context, the tail_call_cnt propagates by stack
+and rax register between BPF subprograms. The same applies in trampolines.
+
+Fixes: ebf7d1f508a7 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT")
+Fixes: e411901c0b77 ("bpf: allow for tailcalls in BPF subprograms for x64 JIT")
+Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Signed-off-by: Leon Hwang <hffilwlqm@gmail.com>
+Link: https://lore.kernel.org/r/20230912150442.2009-3-hffilwlqm@gmail.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/net/bpf_jit_comp.c | 28 ++++++++++++++++++++++------
+ include/linux/bpf.h | 5 +++++
+ kernel/bpf/trampoline.c | 4 ++--
+ kernel/bpf/verifier.c | 3 +++
+ 4 files changed, 32 insertions(+), 8 deletions(-)
+
+diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
+index 4686c1d9d0cfd..e6a031f8dd2e9 100644
+--- a/arch/x86/net/bpf_jit_comp.c
++++ b/arch/x86/net/bpf_jit_comp.c
+@@ -893,6 +893,10 @@ static void emit_nops(u8 **pprog, int len)
+
+ #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
+
++/* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */
++#define RESTORE_TAIL_CALL_CNT(stack) \
++ EMIT3_off32(0x48, 0x8B, 0x85, -round_up(stack, 8) - 8)
++
+ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
+ int oldproglen, struct jit_context *ctx, bool jmp_padding)
+ {
+@@ -1436,9 +1440,7 @@ st: if (is_imm8(insn->off))
+ case BPF_JMP | BPF_CALL:
+ func = (u8 *) __bpf_call_base + imm32;
+ if (tail_call_reachable) {
+- /* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */
+- EMIT3_off32(0x48, 0x8B, 0x85,
+- -round_up(bpf_prog->aux->stack_depth, 8) - 8);
++ RESTORE_TAIL_CALL_CNT(bpf_prog->aux->stack_depth);
+ if (!imm32 || emit_call(&prog, func, image + addrs[i - 1] + 7))
+ return -EINVAL;
+ } else {
+@@ -2070,6 +2072,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
+ * RBP - ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag
+ *
+ * RBP - run_ctx_off [ bpf_tramp_run_ctx ]
++ * RSP [ tail_call_cnt ] BPF_TRAMP_F_TAIL_CALL_CTX
+ */
+
+ /* room for return value of orig_call or fentry prog */
+@@ -2106,6 +2109,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
+ EMIT1(0x55); /* push rbp */
+ EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */
+ EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */
++ if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
++ EMIT1(0x50); /* push rax */
+ EMIT1(0x53); /* push rbx */
+
+ /* Store number of argument registers of the traced function:
+@@ -2156,9 +2161,15 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
+ if (flags & BPF_TRAMP_F_CALL_ORIG) {
+ restore_regs(m, &prog, nr_args, regs_off);
+
++ if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
++ /* Before calling the original function, restore the
++ * tail_call_cnt from stack to rax.
++ */
++ RESTORE_TAIL_CALL_CNT(stack_size);
++
+ if (flags & BPF_TRAMP_F_ORIG_STACK) {
+- emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8);
+- EMIT2(0xff, 0xd0); /* call *rax */
++ emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, 8);
++ EMIT2(0xff, 0xd3); /* call *rbx */
+ } else {
+ /* call original function */
+ if (emit_call(&prog, orig_call, prog)) {
+@@ -2209,7 +2220,12 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
+ ret = -EINVAL;
+ goto cleanup;
+ }
+- }
++ } else if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
++ /* Before running the original function, restore the
++ * tail_call_cnt from stack to rax.
++ */
++ RESTORE_TAIL_CALL_CNT(stack_size);
++
+ /* restore return value of orig_call or fentry prog back into RAX */
+ if (save_ret)
+ emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8);
+diff --git a/include/linux/bpf.h b/include/linux/bpf.h
+index 3ce9e39ecdb85..619fcba84be22 100644
+--- a/include/linux/bpf.h
++++ b/include/linux/bpf.h
+@@ -825,6 +825,11 @@ struct btf_func_model {
+ */
+ #define BPF_TRAMP_F_SHARE_IPMODIFY BIT(6)
+
++/* Indicate that current trampoline is in a tail call context. Then, it has to
++ * cache and restore tail_call_cnt to avoid infinite tail call loop.
++ */
++#define BPF_TRAMP_F_TAIL_CALL_CTX BIT(7)
++
+ /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50
+ * bytes on x86.
+ */
+diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
+index c4381dfcd6b09..748ac86169941 100644
+--- a/kernel/bpf/trampoline.c
++++ b/kernel/bpf/trampoline.c
+@@ -443,8 +443,8 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut
+ goto out;
+ }
+
+- /* clear all bits except SHARE_IPMODIFY */
+- tr->flags &= BPF_TRAMP_F_SHARE_IPMODIFY;
++ /* clear all bits except SHARE_IPMODIFY and TAIL_CALL_CTX */
++ tr->flags &= (BPF_TRAMP_F_SHARE_IPMODIFY | BPF_TRAMP_F_TAIL_CALL_CTX);
+
+ if (tlinks[BPF_TRAMP_FEXIT].nr_links ||
+ tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) {
+diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
+index 12d360d80c149..ee6e811b43158 100644
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -15442,6 +15442,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
+ if (!tr)
+ return -ENOMEM;
+
++ if (tgt_prog && tgt_prog->aux->tail_call_reachable)
++ tr->flags = BPF_TRAMP_F_TAIL_CALL_CTX;
++
+ prog->aux->dst_trampoline = tr;
+ return 0;
+ }
+--
+2.43.0
+
--- /dev/null
+From c56095a745ac4ce4fa4f5e267d8e5610efb53c12 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 13 Jul 2023 12:07:36 +0800
+Subject: bpf, x86: save/restore regs with BPF_DW size
+
+From: Menglong Dong <imagedong@tencent.com>
+
+[ Upstream commit 02a6dfa8ff43efb1c989f87a4d862aedf436088a ]
+
+As we already reserve 8 byte in the stack for each reg, it is ok to
+store/restore the regs in BPF_DW size. This will make the code in
+save_regs()/restore_regs() simpler.
+
+Signed-off-by: Menglong Dong <imagedong@tencent.com>
+Acked-by: Yonghong Song <yhs@fb.com>
+Link: https://lore.kernel.org/r/20230713040738.1789742-2-imagedong@tencent.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Stable-dep-of: 2b5dcb31a19a ("bpf, x64: Fix tailcall infinite loop")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/net/bpf_jit_comp.c | 35 ++++++-----------------------------
+ 1 file changed, 6 insertions(+), 29 deletions(-)
+
+diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
+index 87cea23f2da16..84c695ae1940f 100644
+--- a/arch/x86/net/bpf_jit_comp.c
++++ b/arch/x86/net/bpf_jit_comp.c
+@@ -1755,57 +1755,34 @@ st: if (is_imm8(insn->off))
+ static void save_regs(const struct btf_func_model *m, u8 **prog, int nr_regs,
+ int stack_size)
+ {
+- int i, j, arg_size;
+- bool next_same_struct = false;
++ int i;
+
+ /* Store function arguments to stack.
+ * For a function that accepts two pointers the sequence will be:
+ * mov QWORD PTR [rbp-0x10],rdi
+ * mov QWORD PTR [rbp-0x8],rsi
+ */
+- for (i = 0, j = 0; i < min(nr_regs, 6); i++) {
+- /* The arg_size is at most 16 bytes, enforced by the verifier. */
+- arg_size = m->arg_size[j];
+- if (arg_size > 8) {
+- arg_size = 8;
+- next_same_struct = !next_same_struct;
+- }
+-
+- emit_stx(prog, bytes_to_bpf_size(arg_size),
+- BPF_REG_FP,
++ for (i = 0; i < min(nr_regs, 6); i++)
++ emit_stx(prog, BPF_DW, BPF_REG_FP,
+ i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
+ -(stack_size - i * 8));
+-
+- j = next_same_struct ? j : j + 1;
+- }
+ }
+
+ static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_regs,
+ int stack_size)
+ {
+- int i, j, arg_size;
+- bool next_same_struct = false;
++ int i;
+
+ /* Restore function arguments from stack.
+ * For a function that accepts two pointers the sequence will be:
+ * EMIT4(0x48, 0x8B, 0x7D, 0xF0); mov rdi,QWORD PTR [rbp-0x10]
+ * EMIT4(0x48, 0x8B, 0x75, 0xF8); mov rsi,QWORD PTR [rbp-0x8]
+ */
+- for (i = 0, j = 0; i < min(nr_regs, 6); i++) {
+- /* The arg_size is at most 16 bytes, enforced by the verifier. */
+- arg_size = m->arg_size[j];
+- if (arg_size > 8) {
+- arg_size = 8;
+- next_same_struct = !next_same_struct;
+- }
+-
+- emit_ldx(prog, bytes_to_bpf_size(arg_size),
++ for (i = 0; i < min(nr_regs, 6); i++)
++ emit_ldx(prog, BPF_DW,
+ i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
+ BPF_REG_FP,
+ -(stack_size - i * 8));
+-
+- j = next_same_struct ? j : j + 1;
+- }
+ }
+
+ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
+--
+2.43.0
+
--- /dev/null
+From 0cc5afc6ba7a0afb7289de880f54e9d3715ee8be Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 5 Jan 2023 11:50:26 +0800
+Subject: bpf, x86: Simplify the parsing logic of structure parameters
+
+From: Pu Lehui <pulehui@huawei.com>
+
+[ Upstream commit 7f7880495770329d095d402c2865bfa7089192f8 ]
+
+Extra_nregs of structure parameters and nr_args can be
+added directly at the beginning, and using a flip flag
+to identify structure parameters. Meantime, renaming
+some variables to make them make more sense.
+
+Signed-off-by: Pu Lehui <pulehui@huawei.com>
+Acked-by: Yonghong Song <yhs@fb.com>
+Link: https://lore.kernel.org/r/20230105035026.3091988-1-pulehui@huaweicloud.com
+Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
+Stable-dep-of: 2b5dcb31a19a ("bpf, x64: Fix tailcall infinite loop")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/net/bpf_jit_comp.c | 101 +++++++++++++++++-------------------
+ 1 file changed, 48 insertions(+), 53 deletions(-)
+
+diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
+index e6a031f8dd2e9..87cea23f2da16 100644
+--- a/arch/x86/net/bpf_jit_comp.c
++++ b/arch/x86/net/bpf_jit_comp.c
+@@ -1752,62 +1752,59 @@ st: if (is_imm8(insn->off))
+ return proglen;
+ }
+
+-static void save_regs(const struct btf_func_model *m, u8 **prog, int nr_args,
++static void save_regs(const struct btf_func_model *m, u8 **prog, int nr_regs,
+ int stack_size)
+ {
+- int i, j, arg_size, nr_regs;
++ int i, j, arg_size;
++ bool next_same_struct = false;
++
+ /* Store function arguments to stack.
+ * For a function that accepts two pointers the sequence will be:
+ * mov QWORD PTR [rbp-0x10],rdi
+ * mov QWORD PTR [rbp-0x8],rsi
+ */
+- for (i = 0, j = 0; i < min(nr_args, 6); i++) {
+- if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) {
+- nr_regs = (m->arg_size[i] + 7) / 8;
++ for (i = 0, j = 0; i < min(nr_regs, 6); i++) {
++ /* The arg_size is at most 16 bytes, enforced by the verifier. */
++ arg_size = m->arg_size[j];
++ if (arg_size > 8) {
+ arg_size = 8;
+- } else {
+- nr_regs = 1;
+- arg_size = m->arg_size[i];
++ next_same_struct = !next_same_struct;
+ }
+
+- while (nr_regs) {
+- emit_stx(prog, bytes_to_bpf_size(arg_size),
+- BPF_REG_FP,
+- j == 5 ? X86_REG_R9 : BPF_REG_1 + j,
+- -(stack_size - j * 8));
+- nr_regs--;
+- j++;
+- }
++ emit_stx(prog, bytes_to_bpf_size(arg_size),
++ BPF_REG_FP,
++ i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
++ -(stack_size - i * 8));
++
++ j = next_same_struct ? j : j + 1;
+ }
+ }
+
+-static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args,
++static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_regs,
+ int stack_size)
+ {
+- int i, j, arg_size, nr_regs;
++ int i, j, arg_size;
++ bool next_same_struct = false;
+
+ /* Restore function arguments from stack.
+ * For a function that accepts two pointers the sequence will be:
+ * EMIT4(0x48, 0x8B, 0x7D, 0xF0); mov rdi,QWORD PTR [rbp-0x10]
+ * EMIT4(0x48, 0x8B, 0x75, 0xF8); mov rsi,QWORD PTR [rbp-0x8]
+ */
+- for (i = 0, j = 0; i < min(nr_args, 6); i++) {
+- if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) {
+- nr_regs = (m->arg_size[i] + 7) / 8;
++ for (i = 0, j = 0; i < min(nr_regs, 6); i++) {
++ /* The arg_size is at most 16 bytes, enforced by the verifier. */
++ arg_size = m->arg_size[j];
++ if (arg_size > 8) {
+ arg_size = 8;
+- } else {
+- nr_regs = 1;
+- arg_size = m->arg_size[i];
++ next_same_struct = !next_same_struct;
+ }
+
+- while (nr_regs) {
+- emit_ldx(prog, bytes_to_bpf_size(arg_size),
+- j == 5 ? X86_REG_R9 : BPF_REG_1 + j,
+- BPF_REG_FP,
+- -(stack_size - j * 8));
+- nr_regs--;
+- j++;
+- }
++ emit_ldx(prog, bytes_to_bpf_size(arg_size),
++ i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
++ BPF_REG_FP,
++ -(stack_size - i * 8));
++
++ j = next_same_struct ? j : j + 1;
+ }
+ }
+
+@@ -2033,8 +2030,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
+ struct bpf_tramp_links *tlinks,
+ void *func_addr)
+ {
+- int ret, i, nr_args = m->nr_args, extra_nregs = 0;
+- int regs_off, ip_off, args_off, stack_size = nr_args * 8, run_ctx_off;
++ int i, ret, nr_regs = m->nr_args, stack_size = 0;
++ int regs_off, nregs_off, ip_off, run_ctx_off;
+ struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
+ struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
+ struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN];
+@@ -2043,17 +2040,14 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
+ u8 *prog;
+ bool save_ret;
+
+- /* x86-64 supports up to 6 arguments. 7+ can be added in the future */
+- if (nr_args > 6)
+- return -ENOTSUPP;
+-
+- for (i = 0; i < MAX_BPF_FUNC_ARGS; i++) {
++ /* extra registers for struct arguments */
++ for (i = 0; i < m->nr_args; i++)
+ if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
+- extra_nregs += (m->arg_size[i] + 7) / 8 - 1;
+- }
+- if (nr_args + extra_nregs > 6)
++ nr_regs += (m->arg_size[i] + 7) / 8 - 1;
++
++ /* x86-64 supports up to 6 arguments. 7+ can be added in the future */
++ if (nr_regs > 6)
+ return -ENOTSUPP;
+- stack_size += extra_nregs * 8;
+
+ /* Generated trampoline stack layout:
+ *
+@@ -2067,7 +2061,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
+ * [ ... ]
+ * RBP - regs_off [ reg_arg1 ] program's ctx pointer
+ *
+- * RBP - args_off [ arg regs count ] always
++ * RBP - nregs_off [ regs count ] always
+ *
+ * RBP - ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag
+ *
+@@ -2080,11 +2074,12 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
+ if (save_ret)
+ stack_size += 8;
+
++ stack_size += nr_regs * 8;
+ regs_off = stack_size;
+
+- /* args count */
++ /* regs count */
+ stack_size += 8;
+- args_off = stack_size;
++ nregs_off = stack_size;
+
+ if (flags & BPF_TRAMP_F_IP_ARG)
+ stack_size += 8; /* room for IP address argument */
+@@ -2114,11 +2109,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
+ EMIT1(0x53); /* push rbx */
+
+ /* Store number of argument registers of the traced function:
+- * mov rax, nr_args + extra_nregs
+- * mov QWORD PTR [rbp - args_off], rax
++ * mov rax, nr_regs
++ * mov QWORD PTR [rbp - nregs_off], rax
+ */
+- emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_args + extra_nregs);
+- emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -args_off);
++ emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_regs);
++ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -nregs_off);
+
+ if (flags & BPF_TRAMP_F_IP_ARG) {
+ /* Store IP address of the traced function:
+@@ -2129,7 +2124,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
+ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -ip_off);
+ }
+
+- save_regs(m, &prog, nr_args, regs_off);
++ save_regs(m, &prog, nr_regs, regs_off);
+
+ if (flags & BPF_TRAMP_F_CALL_ORIG) {
+ /* arg1: mov rdi, im */
+@@ -2159,7 +2154,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
+ }
+
+ if (flags & BPF_TRAMP_F_CALL_ORIG) {
+- restore_regs(m, &prog, nr_args, regs_off);
++ restore_regs(m, &prog, nr_regs, regs_off);
+
+ if (flags & BPF_TRAMP_F_TAIL_CALL_CTX)
+ /* Before calling the original function, restore the
+@@ -2206,7 +2201,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
+ }
+
+ if (flags & BPF_TRAMP_F_RESTORE_REGS)
+- restore_regs(m, &prog, nr_args, regs_off);
++ restore_regs(m, &prog, nr_regs, regs_off);
+
+ /* This needs to be done regardless. If there were fmod_ret programs,
+ * the return value is only updated on the stack and still needs to be
+--
+2.43.0
+
--- /dev/null
+From 513d47d3ddb69a718de4359b8ccbf68b1431cdde Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 1 Dec 2023 13:00:10 -0800
+Subject: btrfs: fix qgroup_free_reserved_data int overflow
+
+From: Boris Burkov <boris@bur.io>
+
+[ Upstream commit 9e65bfca24cf1d77e4a5c7a170db5867377b3fe7 ]
+
+The reserved data counter and input parameter is a u64, but we
+inadvertently accumulate it in an int. Overflowing that int results in
+freeing the wrong amount of data and breaking reserve accounting.
+
+Unfortunately, this overflow rot spreads from there, as the qgroup
+release/free functions rely on returning an int to take advantage of
+negative values for error codes.
+
+Therefore, the full fix is to return the "released" or "freed" amount by
+a u64 argument and to return 0 or negative error code via the return
+value.
+
+Most of the call sites simply ignore the return value, though some
+of them handle the error and count the returned bytes. Change all of
+them accordingly.
+
+CC: stable@vger.kernel.org # 6.1+
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Boris Burkov <boris@bur.io>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/delalloc-space.c | 2 +-
+ fs/btrfs/file.c | 2 +-
+ fs/btrfs/inode.c | 16 ++++++++--------
+ fs/btrfs/ordered-data.c | 7 ++++---
+ fs/btrfs/qgroup.c | 25 +++++++++++++++----------
+ fs/btrfs/qgroup.h | 4 ++--
+ 6 files changed, 31 insertions(+), 25 deletions(-)
+
+diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
+index 0b62ce77053f5..f2bc5563c0f92 100644
+--- a/fs/btrfs/delalloc-space.c
++++ b/fs/btrfs/delalloc-space.c
+@@ -197,7 +197,7 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
+ start = round_down(start, fs_info->sectorsize);
+
+ btrfs_free_reserved_data_space_noquota(fs_info, len);
+- btrfs_qgroup_free_data(inode, reserved, start, len);
++ btrfs_qgroup_free_data(inode, reserved, start, len, NULL);
+ }
+
+ /**
+diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
+index 0a46fff3dd067..1783a0fbf1665 100644
+--- a/fs/btrfs/file.c
++++ b/fs/btrfs/file.c
+@@ -3191,7 +3191,7 @@ static long btrfs_fallocate(struct file *file, int mode,
+ qgroup_reserved -= range->len;
+ } else if (qgroup_reserved > 0) {
+ btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved,
+- range->start, range->len);
++ range->start, range->len, NULL);
+ qgroup_reserved -= range->len;
+ }
+ list_del(&range->list);
+diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
+index 81eac121c6b23..9a7d77c410e22 100644
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -466,7 +466,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
+ * And at reserve time, it's always aligned to page size, so
+ * just free one page here.
+ */
+- btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
++ btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL);
+ btrfs_free_path(path);
+ btrfs_end_transaction(trans);
+ return ret;
+@@ -5372,7 +5372,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
+ */
+ if (state_flags & EXTENT_DELALLOC)
+ btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
+- end - start + 1);
++ end - start + 1, NULL);
+
+ clear_extent_bit(io_tree, start, end,
+ EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
+@@ -8440,7 +8440,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
+ * reserved data space.
+ * Since the IO will never happen for this page.
+ */
+- btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur);
++ btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL);
+ if (!inode_evicting) {
+ clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
+ EXTENT_DELALLOC | EXTENT_UPTODATE |
+@@ -9902,7 +9902,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
+ struct btrfs_path *path;
+ u64 start = ins->objectid;
+ u64 len = ins->offset;
+- int qgroup_released;
++ u64 qgroup_released = 0;
+ int ret;
+
+ memset(&stack_fi, 0, sizeof(stack_fi));
+@@ -9915,9 +9915,9 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
+ btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
+ /* Encryption and other encoding is reserved and all 0 */
+
+- qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len);
+- if (qgroup_released < 0)
+- return ERR_PTR(qgroup_released);
++ ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released);
++ if (ret < 0)
++ return ERR_PTR(ret);
+
+ if (trans) {
+ ret = insert_reserved_file_extent(trans, inode,
+@@ -10903,7 +10903,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
+ btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
+ out_qgroup_free_data:
+ if (ret < 0)
+- btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes);
++ btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL);
+ out_free_data_space:
+ /*
+ * If btrfs_reserve_extent() succeeded, then we already decremented
+diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
+index 0321753c16b9f..1b2af4785c0e2 100644
+--- a/fs/btrfs/ordered-data.c
++++ b/fs/btrfs/ordered-data.c
+@@ -172,11 +172,12 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
+ struct rb_node *node;
+ struct btrfs_ordered_extent *entry;
+ int ret;
++ u64 qgroup_rsv = 0;
+
+ if (flags &
+ ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) {
+ /* For nocow write, we can release the qgroup rsv right now */
+- ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
++ ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv);
+ if (ret < 0)
+ return ret;
+ ret = 0;
+@@ -185,7 +186,7 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
+ * The ordered extent has reserved qgroup space, release now
+ * and pass the reserved number for qgroup_record to free.
+ */
+- ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes);
++ ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes, &qgroup_rsv);
+ if (ret < 0)
+ return ret;
+ }
+@@ -203,7 +204,7 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
+ entry->inode = igrab(&inode->vfs_inode);
+ entry->compress_type = compress_type;
+ entry->truncated_len = (u64)-1;
+- entry->qgroup_rsv = ret;
++ entry->qgroup_rsv = qgroup_rsv;
+ entry->physical = (u64)-1;
+
+ ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);
+diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
+index 26cabffd59710..96ec9ccc2ef61 100644
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -3833,13 +3833,14 @@ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
+
+ /* Free ranges specified by @reserved, normally in error path */
+ static int qgroup_free_reserved_data(struct btrfs_inode *inode,
+- struct extent_changeset *reserved, u64 start, u64 len)
++ struct extent_changeset *reserved,
++ u64 start, u64 len, u64 *freed_ret)
+ {
+ struct btrfs_root *root = inode->root;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ struct extent_changeset changeset;
+- int freed = 0;
++ u64 freed = 0;
+ int ret;
+
+ extent_changeset_init(&changeset);
+@@ -3880,7 +3881,9 @@ static int qgroup_free_reserved_data(struct btrfs_inode *inode,
+ }
+ btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed,
+ BTRFS_QGROUP_RSV_DATA);
+- ret = freed;
++ if (freed_ret)
++ *freed_ret = freed;
++ ret = 0;
+ out:
+ extent_changeset_release(&changeset);
+ return ret;
+@@ -3888,7 +3891,7 @@ static int qgroup_free_reserved_data(struct btrfs_inode *inode,
+
+ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
+ struct extent_changeset *reserved, u64 start, u64 len,
+- int free)
++ u64 *released, int free)
+ {
+ struct extent_changeset changeset;
+ int trace_op = QGROUP_RELEASE;
+@@ -3900,7 +3903,7 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
+ /* In release case, we shouldn't have @reserved */
+ WARN_ON(!free && reserved);
+ if (free && reserved)
+- return qgroup_free_reserved_data(inode, reserved, start, len);
++ return qgroup_free_reserved_data(inode, reserved, start, len, released);
+ extent_changeset_init(&changeset);
+ ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1,
+ EXTENT_QGROUP_RESERVED, &changeset);
+@@ -3915,7 +3918,8 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
+ btrfs_qgroup_free_refroot(inode->root->fs_info,
+ inode->root->root_key.objectid,
+ changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
+- ret = changeset.bytes_changed;
++ if (released)
++ *released = changeset.bytes_changed;
+ out:
+ extent_changeset_release(&changeset);
+ return ret;
+@@ -3934,9 +3938,10 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode,
+ * NOTE: This function may sleep for memory allocation.
+ */
+ int btrfs_qgroup_free_data(struct btrfs_inode *inode,
+- struct extent_changeset *reserved, u64 start, u64 len)
++ struct extent_changeset *reserved,
++ u64 start, u64 len, u64 *freed)
+ {
+- return __btrfs_qgroup_release_data(inode, reserved, start, len, 1);
++ return __btrfs_qgroup_release_data(inode, reserved, start, len, freed, 1);
+ }
+
+ /*
+@@ -3954,9 +3959,9 @@ int btrfs_qgroup_free_data(struct btrfs_inode *inode,
+ *
+ * NOTE: This function may sleep for memory allocation.
+ */
+-int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len)
++int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released)
+ {
+- return __btrfs_qgroup_release_data(inode, NULL, start, len, 0);
++ return __btrfs_qgroup_release_data(inode, NULL, start, len, released, 0);
+ }
+
+ static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes,
+diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
+index 578c77e94200f..c382923f7628e 100644
+--- a/fs/btrfs/qgroup.h
++++ b/fs/btrfs/qgroup.h
+@@ -360,10 +360,10 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
+ /* New io_tree based accurate qgroup reserve API */
+ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
+ struct extent_changeset **reserved, u64 start, u64 len);
+-int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len);
++int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released);
+ int btrfs_qgroup_free_data(struct btrfs_inode *inode,
+ struct extent_changeset *reserved, u64 start,
+- u64 len);
++ u64 len, u64 *freed);
+ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type, bool enforce);
+ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+--
+2.43.0
+
--- /dev/null
+From c5154bfdcfc857cf2ee5f1b2d6b0778c026c11b2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 May 2023 17:03:06 +0200
+Subject: btrfs: mark the len field in struct btrfs_ordered_sum as unsigned
+
+From: Christoph Hellwig <hch@lst.de>
+
+[ Upstream commit 6e4b2479ab38b3f949a85964da212295d32102f0 ]
+
+len can't ever be negative, so mark it as an u32 instead of int.
+
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Stable-dep-of: 9e65bfca24cf ("btrfs: fix qgroup_free_reserved_data int overflow")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/file-item.c | 2 +-
+ fs/btrfs/ordered-data.h | 2 +-
+ 2 files changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
+index b14d2da9b26d3..14478da875313 100644
+--- a/fs/btrfs/file-item.c
++++ b/fs/btrfs/file-item.c
+@@ -602,7 +602,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
+ }
+
+ sums->bytenr = start;
+- sums->len = (int)size;
++ sums->len = size;
+
+ offset = (start - key.offset) >> fs_info->sectorsize_bits;
+ offset *= csum_size;
+diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
+index f59f2dbdb25ed..cc3ca4bb9bd54 100644
+--- a/fs/btrfs/ordered-data.h
++++ b/fs/btrfs/ordered-data.h
+@@ -20,7 +20,7 @@ struct btrfs_ordered_sum {
+ /*
+ * this is the length in bytes covered by the sums array below.
+ */
+- int len;
++ u32 len;
+ struct list_head list;
+ /* last field is a variable length array of csums */
+ u8 sums[];
+--
+2.43.0
+
--- /dev/null
+From 17fe236d4580c1fb90b59345b81b667d28253b36 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 9 Dec 2022 10:10:08 +0100
+Subject: can: raw: add support for SO_MARK
+
+From: Marc Kleine-Budde <mkl@pengutronix.de>
+
+[ Upstream commit 0826e82b8a32e646b7b32ba8b68ba30812028e47 ]
+
+Add support for SO_MARK to the CAN_RAW protocol. This makes it
+possible to add traffic control filters based on the fwmark.
+
+Link: https://lore.kernel.org/all/20221210113653.170346-1-mkl@pengutronix.de
+Acked-by: Oliver Hartkopp <socketcan@hartkopp.net>
+Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de>
+Stable-dep-of: 7f6ca95d16b9 ("net: Implement missing getsockopt(SO_TIMESTAMPING_NEW)")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/can/raw.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/net/can/raw.c b/net/can/raw.c
+index 8c104339d538d..488320738e319 100644
+--- a/net/can/raw.c
++++ b/net/can/raw.c
+@@ -881,6 +881,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
+
+ skb->dev = dev;
+ skb->priority = sk->sk_priority;
++ skb->mark = sk->sk_mark;
+ skb->tstamp = sockc.transmit_time;
+
+ skb_setup_tx_timestamp(skb, sockc.tsflags);
+--
+2.43.0
+
--- /dev/null
+From 853dc4a7fe0d006cee6fde50262d67a545487936 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 5 Jul 2023 16:51:39 +0200
+Subject: cpu/SMT: Create topology_smt_thread_allowed()
+
+From: Michael Ellerman <mpe@ellerman.id.au>
+
+[ Upstream commit 38253464bc821d6de6bba81bb1412ebb36f6cbd1 ]
+
+Some architectures allow partial SMT states, i.e. when not all SMT threads
+are brought online.
+
+To support that, add an architecture helper which checks whether a given
+CPU is allowed to be brought online depending on how many SMT threads are
+currently enabled. Since this is only applicable to architectures supporting
+partial SMT, only these architectures should select the new configuration
+variable CONFIG_SMT_NUM_THREADS_DYNAMIC. For the other architectures, which do
+not support partial SMT states, there is no need to define
+topology_smt_thread_allowed(); the generic code assumes that either all the
+threads are allowed or only the primary ones.
+
+Call the helper from cpu_smt_enable(), and cpu_smt_allowed() when SMT is
+enabled, to check if the particular thread should be onlined. Notably,
+also call it from cpu_smt_disable() if CPU_SMT_ENABLED, to allow
+offlining some threads to move from a higher to lower number of threads
+online.
+
+[ ldufour: Slightly reword the commit's description ]
+[ ldufour: Introduce CONFIG_SMT_NUM_THREADS_DYNAMIC ]
+
+Suggested-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
+Signed-off-by: Laurent Dufour <ldufour@linux.ibm.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Tested-by: Zhang Rui <rui.zhang@intel.com>
+Link: https://lore.kernel.org/r/20230705145143.40545-7-ldufour@linux.ibm.com
+Stable-dep-of: d91bdd96b55c ("cpu/SMT: Make SMT control more robust against enumeration failures")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/Kconfig | 3 +++
+ kernel/cpu.c | 24 +++++++++++++++++++++++-
+ 2 files changed, 26 insertions(+), 1 deletion(-)
+
+diff --git a/arch/Kconfig b/arch/Kconfig
+index b60d271bf76a9..14273a6203dfc 100644
+--- a/arch/Kconfig
++++ b/arch/Kconfig
+@@ -34,6 +34,9 @@ config ARCH_HAS_SUBPAGE_FAULTS
+ config HOTPLUG_SMT
+ bool
+
++config SMT_NUM_THREADS_DYNAMIC
++ bool
++
+ config GENERIC_ENTRY
+ bool
+
+diff --git a/kernel/cpu.c b/kernel/cpu.c
+index 551468d9c5a85..c37f1758a4865 100644
+--- a/kernel/cpu.c
++++ b/kernel/cpu.c
+@@ -446,9 +446,23 @@ static int __init smt_cmdline_disable(char *str)
+ }
+ early_param("nosmt", smt_cmdline_disable);
+
++/*
++ * For Archicture supporting partial SMT states check if the thread is allowed.
++ * Otherwise this has already been checked through cpu_smt_max_threads when
++ * setting the SMT level.
++ */
++static inline bool cpu_smt_thread_allowed(unsigned int cpu)
++{
++#ifdef CONFIG_SMT_NUM_THREADS_DYNAMIC
++ return topology_smt_thread_allowed(cpu);
++#else
++ return true;
++#endif
++}
++
+ static inline bool cpu_smt_allowed(unsigned int cpu)
+ {
+- if (cpu_smt_control == CPU_SMT_ENABLED)
++ if (cpu_smt_control == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu))
+ return true;
+
+ if (topology_is_primary_thread(cpu))
+@@ -2294,6 +2308,12 @@ int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval)
+ for_each_online_cpu(cpu) {
+ if (topology_is_primary_thread(cpu))
+ continue;
++ /*
++ * Disable can be called with CPU_SMT_ENABLED when changing
++ * from a higher to lower number of SMT threads per core.
++ */
++ if (ctrlval == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu))
++ continue;
+ ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE);
+ if (ret)
+ break;
+@@ -2328,6 +2348,8 @@ int cpuhp_smt_enable(void)
+ /* Skip online CPUs and CPUs on offline nodes */
+ if (cpu_online(cpu) || !node_online(cpu_to_node(cpu)))
+ continue;
++ if (!cpu_smt_thread_allowed(cpu))
++ continue;
+ ret = _cpu_up(cpu, 0, CPUHP_ONLINE);
+ if (ret)
+ break;
+--
+2.43.0
+
--- /dev/null
+From 09e97aec954cf0a31689861b27a859e63d278e0a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 14 Aug 2023 10:18:27 +0200
+Subject: cpu/SMT: Make SMT control more robust against enumeration failures
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+[ Upstream commit d91bdd96b55cc3ce98d883a60f133713821b80a6 ]
+
+The SMT control mechanism got added as speculation attack vector
+mitigation. The implemented logic relies on the primary thread mask to
+be set up properly.
+
+This turns out to be an issue with XEN/PV guests because their CPU hotplug
+mechanics do not enumerate APICs and therefore the mask is never correctly
+populated.
+
+This went unnoticed so far because by chance XEN/PV ends up with
+smp_num_siblings == 2. So smt_hotplug_control stays at its default value
+CPU_SMT_ENABLED and the primary thread mask is never evaluated in the
+context of CPU hotplug.
+
+This stopped "working" with the upcoming overhaul of the topology
+evaluation which legitimately provides a fake topology for XEN/PV. That
+sets smp_num_siblings to 1, which causes the CPU hot-plug core to
+refuse to bring up the APs.
+
+This happens because smt_hotplug_control is set to CPU_SMT_NOT_SUPPORTED
+which causes cpu_smt_allowed() to evaluate the unpopulated primary thread
+mask with the conclusion that all non-boot CPUs are not valid to be
+plugged.
+
+Make cpu_smt_allowed() more robust and take CPU_SMT_NOT_SUPPORTED and
+CPU_SMT_NOT_IMPLEMENTED into account. Rename it to cpu_bootable() while at
+it as that makes it more clear what the function is about.
+
+The primary mask issue on x86 XEN/PV needs to be addressed separately as
+there are users outside of the CPU hotplug code too.
+
+Fixes: 05736e4ac13c ("cpu/hotplug: Provide knobs to control SMT")
+Reported-by: Juergen Gross <jgross@suse.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Tested-by: Juergen Gross <jgross@suse.com>
+Tested-by: Sohil Mehta <sohil.mehta@intel.com>
+Tested-by: Michael Kelley <mikelley@microsoft.com>
+Tested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Tested-by: Zhang Rui <rui.zhang@intel.com>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lore.kernel.org/r/20230814085112.149440843@linutronix.de
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/cpu.c | 18 +++++++++++++-----
+ 1 file changed, 13 insertions(+), 5 deletions(-)
+
+diff --git a/kernel/cpu.c b/kernel/cpu.c
+index c37f1758a4865..e6f0101941ed8 100644
+--- a/kernel/cpu.c
++++ b/kernel/cpu.c
+@@ -460,11 +460,19 @@ static inline bool cpu_smt_thread_allowed(unsigned int cpu)
+ #endif
+ }
+
+-static inline bool cpu_smt_allowed(unsigned int cpu)
++static inline bool cpu_bootable(unsigned int cpu)
+ {
+ if (cpu_smt_control == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu))
+ return true;
+
++ /* All CPUs are bootable if controls are not configured */
++ if (cpu_smt_control == CPU_SMT_NOT_IMPLEMENTED)
++ return true;
++
++ /* All CPUs are bootable if CPU is not SMT capable */
++ if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED)
++ return true;
++
+ if (topology_is_primary_thread(cpu))
+ return true;
+
+@@ -485,7 +493,7 @@ bool cpu_smt_possible(void)
+ }
+ EXPORT_SYMBOL_GPL(cpu_smt_possible);
+ #else
+-static inline bool cpu_smt_allowed(unsigned int cpu) { return true; }
++static inline bool cpu_bootable(unsigned int cpu) { return true; }
+ #endif
+
+ static inline enum cpuhp_state
+@@ -588,10 +596,10 @@ static int bringup_wait_for_ap(unsigned int cpu)
+ * SMT soft disabling on X86 requires to bring the CPU out of the
+ * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The
+ * CPU marked itself as booted_once in notify_cpu_starting() so the
+- * cpu_smt_allowed() check will now return false if this is not the
++ * cpu_bootable() check will now return false if this is not the
+ * primary sibling.
+ */
+- if (!cpu_smt_allowed(cpu))
++ if (!cpu_bootable(cpu))
+ return -ECANCELED;
+
+ if (st->target <= CPUHP_AP_ONLINE_IDLE)
+@@ -1478,7 +1486,7 @@ static int cpu_up(unsigned int cpu, enum cpuhp_state target)
+ err = -EBUSY;
+ goto out;
+ }
+- if (!cpu_smt_allowed(cpu)) {
++ if (!cpu_bootable(cpu)) {
+ err = -EPERM;
+ goto out;
+ }
+--
+2.43.0
+
--- /dev/null
+From 95fa91911ce94d90029ca22af93007ce4b006574 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 24 Nov 2023 12:28:05 +0200
+Subject: dpaa2-eth: recycle the RX buffer only after all processing done
+
+From: Ioana Ciornei <ioana.ciornei@nxp.com>
+
+[ Upstream commit beb1930f966d1517921488bd5d64147f58f79abf ]
+
+The blamed commit added support for Rx copybreak. This meant that for
+certain frame sizes, a new skb was allocated and the initial data buffer
+was recycled. Instead of waiting to recycle the Rx buffer only after all
+processing was done on it (like accessing the parse results or timestamp
+information), the code path just went ahead and re-used the buffer right
+away.
+
+This sometimes led to corrupted HW and SW annotation areas.
+Fix this by delaying the moment when the buffer is recycled.
+
+Fixes: 50f826999a80 ("dpaa2-eth: add rx copybreak support")
+Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
+index b58162ce81d87..de62eee58a00e 100644
+--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
++++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c
+@@ -509,8 +509,6 @@ static struct sk_buff *dpaa2_eth_copybreak(struct dpaa2_eth_channel *ch,
+
+ memcpy(skb->data, fd_vaddr + fd_offset, fd_length);
+
+- dpaa2_eth_recycle_buf(priv, ch, dpaa2_fd_get_addr(fd));
+-
+ return skb;
+ }
+
+@@ -528,6 +526,7 @@ static void dpaa2_eth_rx(struct dpaa2_eth_priv *priv,
+ struct dpaa2_eth_drv_stats *percpu_extras;
+ struct device *dev = priv->net_dev->dev.parent;
+ struct dpaa2_fas *fas;
++ bool recycle_rx_buf = false;
+ void *buf_data;
+ u32 status = 0;
+ u32 xdp_act;
+@@ -560,6 +559,8 @@ static void dpaa2_eth_rx(struct dpaa2_eth_priv *priv,
+ dma_unmap_page(dev, addr, priv->rx_buf_size,
+ DMA_BIDIRECTIONAL);
+ skb = dpaa2_eth_build_linear_skb(ch, fd, vaddr);
++ } else {
++ recycle_rx_buf = true;
+ }
+ } else if (fd_format == dpaa2_fd_sg) {
+ WARN_ON(priv->xdp_prog);
+@@ -607,6 +608,8 @@ static void dpaa2_eth_rx(struct dpaa2_eth_priv *priv,
+
+ list_add_tail(&skb->list, ch->rx_list);
+
++ if (recycle_rx_buf)
++ dpaa2_eth_recycle_buf(priv, ch, dpaa2_fd_get_addr(fd));
+ return;
+
+ err_build_skb:
+--
+2.43.0
+
--- /dev/null
+From 57568971e8ca978db98bde4b8e417daebc3ba871 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Dec 2023 12:37:52 -0800
+Subject: drm/bridge: ti-sn65dsi86: Never store more than msg->size bytes in
+ AUX xfer
+
+From: Douglas Anderson <dianders@chromium.org>
+
+[ Upstream commit aca58eac52b88138ab98c814afb389a381725cd7 ]
+
+For aux reads, the value `msg->size` indicates the size of the buffer
+provided by `msg->buffer`. We should never in any circumstances write
+more bytes to the buffer since it may overflow the buffer.
+
+In the ti-sn65dsi86 driver there is one code path that reads the
+transfer length from hardware. Even though it's never been seen to be
+a problem, we should make extra sure that the hardware isn't
+increasing the length since doing so would cause us to overrun the
+buffer.
+
+Fixes: 982f589bde7a ("drm/bridge: ti-sn65dsi86: Update reply on aux failures")
+Reviewed-by: Stephen Boyd <swboyd@chromium.org>
+Reviewed-by: Guenter Roeck <groeck@chromium.org>
+Signed-off-by: Douglas Anderson <dianders@chromium.org>
+Link: https://patchwork.freedesktop.org/patch/msgid/20231214123752.v3.2.I7b83c0f31aeedc6b1dc98c7c741d3e1f94f040f8@changeid
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/gpu/drm/bridge/ti-sn65dsi86.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/gpu/drm/bridge/ti-sn65dsi86.c b/drivers/gpu/drm/bridge/ti-sn65dsi86.c
+index 1b5c27ed27370..ff4d0564122a3 100644
+--- a/drivers/gpu/drm/bridge/ti-sn65dsi86.c
++++ b/drivers/gpu/drm/bridge/ti-sn65dsi86.c
+@@ -527,6 +527,7 @@ static ssize_t ti_sn_aux_transfer(struct drm_dp_aux *aux,
+ u32 request_val = AUX_CMD_REQ(msg->request);
+ u8 *buf = msg->buffer;
+ unsigned int len = msg->size;
++ unsigned int short_len;
+ unsigned int val;
+ int ret;
+ u8 addr_len[SN_AUX_LENGTH_REG + 1 - SN_AUX_ADDR_19_16_REG];
+@@ -600,7 +601,8 @@ static ssize_t ti_sn_aux_transfer(struct drm_dp_aux *aux,
+ }
+
+ if (val & AUX_IRQ_STATUS_AUX_SHORT) {
+- ret = regmap_read(pdata->regmap, SN_AUX_LENGTH_REG, &len);
++ ret = regmap_read(pdata->regmap, SN_AUX_LENGTH_REG, &short_len);
++ len = min(len, short_len);
+ if (ret)
+ goto exit;
+ } else if (val & AUX_IRQ_STATUS_NAT_I2C_FAIL) {
+--
+2.43.0
+
--- /dev/null
+From 232617028da8530bf010d2b095c3985e085efc4d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 13 Dec 2023 13:15:42 -0800
+Subject: drm/i915/dp: Fix passing the correct DPCD_REV for
+ drm_dp_set_phy_test_pattern
+
+From: Khaled Almahallawy <khaled.almahallawy@intel.com>
+
+[ Upstream commit 2bd7a06a1208aaacb4e7a2a5436c23bce8d70801 ]
+
+Using link_status to get DPCD_REV fails when disabling/defaulting
+phy pattern. Use intel_dp->dpcd to access DPCD_REV correctly.
+
+Fixes: 8cdf72711928 ("drm/i915/dp: Program vswing, pre-emphasis, test-pattern")
+Cc: Jani Nikula <jani.nikula@intel.com>
+Cc: Imre Deak <imre.deak@intel.com>
+Cc: Lee Shawn C <shawn.c.lee@intel.com>
+Signed-off-by: Khaled Almahallawy <khaled.almahallawy@intel.com>
+Signed-off-by: Jani Nikula <jani.nikula@intel.com>
+Link: https://patchwork.freedesktop.org/patch/msgid/20231213211542.3585105-3-khaled.almahallawy@intel.com
+(cherry picked from commit 3ee302ec22d6e1d7d1e6d381b0d507ee80f2135c)
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/gpu/drm/i915/display/intel_dp.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c
+index 5970f4149090f..4699c21102261 100644
+--- a/drivers/gpu/drm/i915/display/intel_dp.c
++++ b/drivers/gpu/drm/i915/display/intel_dp.c
+@@ -3707,7 +3707,7 @@ static void intel_dp_process_phy_request(struct intel_dp *intel_dp,
+ intel_dp->train_set, crtc_state->lane_count);
+
+ drm_dp_set_phy_test_pattern(&intel_dp->aux, data,
+- link_status[DP_DPCD_REV]);
++ intel_dp->dpcd[DP_DPCD_REV]);
+ }
+
+ static u8 intel_dp_autotest_phy_pattern(struct intel_dp *intel_dp)
+--
+2.43.0
+
--- /dev/null
+From e5e3d5fd00ba6004228b46e43f6ee0e8588c8fa3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 26 Nov 2023 14:58:06 -0800
+Subject: ethtool: don't propagate EOPNOTSUPP from dumps
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit cbeb989e41f4094f54bec2cecce993f26f547bea ]
+
+The default dump handler needs to clear ret before returning.
+Otherwise if the last interface returns an inconsequential
+error this error will propagate to user space.
+
+This may confuse user space (ethtool CLI seems to ignore it,
+but YNL doesn't). It will also terminate the dump early
+for multi-skb dump, because netlink core treats EOPNOTSUPP
+as a real error.
+
+Fixes: 728480f12442 ("ethtool: default handlers for GET requests")
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://lore.kernel.org/r/20231126225806.2143528-1-kuba@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ethtool/netlink.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
+index 1a4c11356c96c..fc4ccecf9495c 100644
+--- a/net/ethtool/netlink.c
++++ b/net/ethtool/netlink.c
+@@ -509,7 +509,7 @@ static int ethnl_default_dumpit(struct sk_buff *skb,
+ cont:
+ idx++;
+ }
+-
++ ret = 0;
+ }
+ rtnl_unlock();
+
+--
+2.43.0
+
--- /dev/null
+From 89cf2bd933e2b50444696df7ae8d806046d290e5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 17 Nov 2022 23:30:52 -0800
+Subject: ext4: convert move_extent_per_page() to use folios
+
+From: Vishal Moola (Oracle) <vishal.moola@gmail.com>
+
+[ Upstream commit 6dd8fe86fa84729538d8bed3149faf9c5886bb5b ]
+
+Patch series "Removing the try_to_release_page() wrapper", v3.
+
+This patchset replaces the remaining calls of try_to_release_page() with
+the folio equivalent: filemap_release_folio(). This allows us to remove
+the wrapper.
+
+This patch (of 4):
+
+Convert move_extent_per_page() to use folios. This change removes 5 calls
+to compound_head() and is in preparation for the removal of the
+try_to_release_page() wrapper.
+
+Link: https://lkml.kernel.org/r/20221118073055.55694-1-vishal.moola@gmail.com
+Link: https://lkml.kernel.org/r/20221118073055.55694-2-vishal.moola@gmail.com
+Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
+Cc: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Stable-dep-of: 1898efcdbed3 ("block: update the stable_writes flag in bdev_add")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/ext4/move_extent.c | 52 ++++++++++++++++++++++++++-----------------
+ 1 file changed, 31 insertions(+), 21 deletions(-)
+
+diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
+index 044e34cd835c1..8dbb87edf24c4 100644
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -253,6 +253,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
+ {
+ struct inode *orig_inode = file_inode(o_filp);
+ struct page *pagep[2] = {NULL, NULL};
++ struct folio *folio[2] = {NULL, NULL};
+ handle_t *handle;
+ ext4_lblk_t orig_blk_offset, donor_blk_offset;
+ unsigned long blocksize = orig_inode->i_sb->s_blocksize;
+@@ -313,6 +314,13 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
+ * hold page's lock, if it is still the case data copy is not
+ * necessary, just swap data blocks between orig and donor.
+ */
++ folio[0] = page_folio(pagep[0]);
++ folio[1] = page_folio(pagep[1]);
++
++ VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]);
++ VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]);
++ VM_BUG_ON_FOLIO(folio_nr_pages(folio[0]) != folio_nr_pages(folio[1]), folio[1]);
++
+ if (unwritten) {
+ ext4_double_down_write_data_sem(orig_inode, donor_inode);
+ /* If any of extents in range became initialized we have to
+@@ -331,10 +339,10 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
+ ext4_double_up_write_data_sem(orig_inode, donor_inode);
+ goto data_copy;
+ }
+- if ((page_has_private(pagep[0]) &&
+- !try_to_release_page(pagep[0], 0)) ||
+- (page_has_private(pagep[1]) &&
+- !try_to_release_page(pagep[1], 0))) {
++ if ((folio_has_private(folio[0]) &&
++ !filemap_release_folio(folio[0], 0)) ||
++ (folio_has_private(folio[1]) &&
++ !filemap_release_folio(folio[1], 0))) {
+ *err = -EBUSY;
+ goto drop_data_sem;
+ }
+@@ -344,19 +352,21 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
+ block_len_in_page, 1, err);
+ drop_data_sem:
+ ext4_double_up_write_data_sem(orig_inode, donor_inode);
+- goto unlock_pages;
++ goto unlock_folios;
+ }
+ data_copy:
+- *err = mext_page_mkuptodate(pagep[0], from, from + replaced_size);
++ *err = mext_page_mkuptodate(&folio[0]->page, from, from + replaced_size);
+ if (*err)
+- goto unlock_pages;
++ goto unlock_folios;
+
+ /* At this point all buffers in range are uptodate, old mapping layout
+ * is no longer required, try to drop it now. */
+- if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) ||
+- (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) {
++ if ((folio_has_private(folio[0]) &&
++ !filemap_release_folio(folio[0], 0)) ||
++ (folio_has_private(folio[1]) &&
++ !filemap_release_folio(folio[1], 0))) {
+ *err = -EBUSY;
+- goto unlock_pages;
++ goto unlock_folios;
+ }
+ ext4_double_down_write_data_sem(orig_inode, donor_inode);
+ replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode,
+@@ -369,13 +379,13 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
+ replaced_size =
+ block_len_in_page << orig_inode->i_blkbits;
+ } else
+- goto unlock_pages;
++ goto unlock_folios;
+ }
+ /* Perform all necessary steps similar write_begin()/write_end()
+ * but keeping in mind that i_size will not change */
+- if (!page_has_buffers(pagep[0]))
+- create_empty_buffers(pagep[0], 1 << orig_inode->i_blkbits, 0);
+- bh = page_buffers(pagep[0]);
++ if (!folio_buffers(folio[0]))
++ create_empty_buffers(&folio[0]->page, 1 << orig_inode->i_blkbits, 0);
++ bh = folio_buffers(folio[0]);
+ for (i = 0; i < data_offset_in_page; i++)
+ bh = bh->b_this_page;
+ for (i = 0; i < block_len_in_page; i++) {
+@@ -385,7 +395,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
+ bh = bh->b_this_page;
+ }
+ if (!*err)
+- *err = block_commit_write(pagep[0], from, from + replaced_size);
++ *err = block_commit_write(&folio[0]->page, from, from + replaced_size);
+
+ if (unlikely(*err < 0))
+ goto repair_branches;
+@@ -395,11 +405,11 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
+ *err = ext4_jbd2_inode_add_write(handle, orig_inode,
+ (loff_t)orig_page_offset << PAGE_SHIFT, replaced_size);
+
+-unlock_pages:
+- unlock_page(pagep[0]);
+- put_page(pagep[0]);
+- unlock_page(pagep[1]);
+- put_page(pagep[1]);
++unlock_folios:
++ folio_unlock(folio[0]);
++ folio_put(folio[0]);
++ folio_unlock(folio[1]);
++ folio_put(folio[1]);
+ stop_journal:
+ ext4_journal_stop(handle);
+ if (*err == -ENOSPC &&
+@@ -430,7 +440,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
+ *err = -EIO;
+ }
+ replaced_count = 0;
+- goto unlock_pages;
++ goto unlock_folios;
+ }
+
+ /**
+--
+2.43.0
+
--- /dev/null
+From e4a655eebbd80e0178fe542d71ec653c4f3486cb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 12 Jun 2023 12:58:34 -0700
+Subject: f2fs: assign default compression level
+
+From: Jaegeuk Kim <jaegeuk@kernel.org>
+
+[ Upstream commit 00e120b5e4b5638cf19eee96d4332f2d100746ba ]
+
+Let's avoid any confusion from assigning compress_level=0 for LZ4HC and ZSTD.
+
+Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
+Stable-dep-of: f5f3bd903a5d ("f2fs: set the default compress_level on ioctl")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/f2fs/compress.c | 3 +--
+ fs/f2fs/f2fs.h | 2 ++
+ fs/f2fs/super.c | 12 +++++++-----
+ 3 files changed, 10 insertions(+), 7 deletions(-)
+
+diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
+index c3ba202a7c29f..4cb58e8d699e2 100644
+--- a/fs/f2fs/compress.c
++++ b/fs/f2fs/compress.c
+@@ -331,8 +331,6 @@ static const struct f2fs_compress_ops f2fs_lz4_ops = {
+ #endif
+
+ #ifdef CONFIG_F2FS_FS_ZSTD
+-#define F2FS_ZSTD_DEFAULT_CLEVEL 1
+-
+ static int zstd_init_compress_ctx(struct compress_ctx *cc)
+ {
+ zstd_parameters params;
+@@ -341,6 +339,7 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc)
+ unsigned int workspace_size;
+ unsigned char level = F2FS_I(cc->inode)->i_compress_level;
+
++ /* Need to remain this for backward compatibility */
+ if (!level)
+ level = F2FS_ZSTD_DEFAULT_CLEVEL;
+
+diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
+index 6fa3ac2097b27..5c76ba764b71f 100644
+--- a/fs/f2fs/f2fs.h
++++ b/fs/f2fs/f2fs.h
+@@ -1501,6 +1501,8 @@ struct compress_data {
+
+ #define F2FS_COMPRESSED_PAGE_MAGIC 0xF5F2C000
+
++#define F2FS_ZSTD_DEFAULT_CLEVEL 1
++
+ #define COMPRESS_LEVEL_OFFSET 8
+
+ /* compress context */
+diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
+index 4f87e0e374c25..584fe00fdeeb1 100644
+--- a/fs/f2fs/super.c
++++ b/fs/f2fs/super.c
+@@ -613,14 +613,12 @@ static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str)
+ {
+ #ifdef CONFIG_F2FS_FS_LZ4HC
+ unsigned int level;
+-#endif
+
+ if (strlen(str) == 3) {
+- F2FS_OPTION(sbi).compress_level = 0;
++ F2FS_OPTION(sbi).compress_level = LZ4HC_DEFAULT_CLEVEL;
+ return 0;
+ }
+
+-#ifdef CONFIG_F2FS_FS_LZ4HC
+ str += 3;
+
+ if (str[0] != ':') {
+@@ -638,6 +636,10 @@ static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str)
+ F2FS_OPTION(sbi).compress_level = level;
+ return 0;
+ #else
++ if (strlen(str) == 3) {
++ F2FS_OPTION(sbi).compress_level = 0;
++ return 0;
++ }
+ f2fs_info(sbi, "kernel doesn't support lz4hc compression");
+ return -EINVAL;
+ #endif
+@@ -651,7 +653,7 @@ static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str)
+ int len = 4;
+
+ if (strlen(str) == len) {
+- F2FS_OPTION(sbi).compress_level = 0;
++ F2FS_OPTION(sbi).compress_level = F2FS_ZSTD_DEFAULT_CLEVEL;
+ return 0;
+ }
+
+@@ -664,7 +666,7 @@ static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str)
+ if (kstrtouint(str + 1, 10, &level))
+ return -EINVAL;
+
+- if (!level || level > zstd_max_clevel()) {
++ if (level < zstd_min_clevel() || level > zstd_max_clevel()) {
+ f2fs_info(sbi, "invalid zstd compress level: %d", level);
+ return -EINVAL;
+ }
+--
+2.43.0
+
--- /dev/null
+From f8166c0421b9a097ec4c3230e5a09dec56b64c23 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 28 Jan 2023 18:30:11 +0800
+Subject: f2fs: clean up i_compress_flag and i_compress_level usage
+
+From: Chao Yu <chao@kernel.org>
+
+[ Upstream commit b90e5086df6bf5ba819216d5ecf0667370bd565f ]
+
+.i_compress_level was introduced by commit 3fde13f817e2 ("f2fs: compress:
+support compress level"), but was never used.
+
+This patch updates as below:
+- load high 8-bits of on-disk .i_compress_flag to in-memory .i_compress_level
+- load low 8-bits of on-disk .i_compress_flag to in-memory .i_compress_flag
+- change type of in-memory .i_compress_flag from unsigned short to unsigned
+char.
+
+With the above changes, we can avoid an unneeded bit shift during
+.init_compress_ctx(), and shrink the size of struct f2fs_inode_info.
+
+Signed-off-by: Chao Yu <chao@kernel.org>
+Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
+Stable-dep-of: f5f3bd903a5d ("f2fs: set the default compress_level on ioctl")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/f2fs/compress.c | 8 +++-----
+ fs/f2fs/f2fs.h | 7 +++----
+ fs/f2fs/inode.c | 16 +++++++++++++---
+ 3 files changed, 19 insertions(+), 12 deletions(-)
+
+diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
+index 11d9dce994dbe..d509b47381d51 100644
+--- a/fs/f2fs/compress.c
++++ b/fs/f2fs/compress.c
+@@ -241,7 +241,7 @@ static int lz4_init_compress_ctx(struct compress_ctx *cc)
+ unsigned int size = LZ4_MEM_COMPRESS;
+
+ #ifdef CONFIG_F2FS_FS_LZ4HC
+- if (F2FS_I(cc->inode)->i_compress_flag >> COMPRESS_LEVEL_OFFSET)
++ if (F2FS_I(cc->inode)->i_compress_level)
+ size = LZ4HC_MEM_COMPRESS;
+ #endif
+
+@@ -267,8 +267,7 @@ static void lz4_destroy_compress_ctx(struct compress_ctx *cc)
+ #ifdef CONFIG_F2FS_FS_LZ4HC
+ static int lz4hc_compress_pages(struct compress_ctx *cc)
+ {
+- unsigned char level = F2FS_I(cc->inode)->i_compress_flag >>
+- COMPRESS_LEVEL_OFFSET;
++ unsigned char level = F2FS_I(cc->inode)->i_compress_level;
+ int len;
+
+ if (level)
+@@ -340,8 +339,7 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc)
+ zstd_cstream *stream;
+ void *workspace;
+ unsigned int workspace_size;
+- unsigned char level = F2FS_I(cc->inode)->i_compress_flag >>
+- COMPRESS_LEVEL_OFFSET;
++ unsigned char level = F2FS_I(cc->inode)->i_compress_level;
+
+ if (!level)
+ level = F2FS_ZSTD_DEFAULT_CLEVEL;
+diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
+index f56abb39601ac..faf1a4953e845 100644
+--- a/fs/f2fs/f2fs.h
++++ b/fs/f2fs/f2fs.h
+@@ -840,7 +840,7 @@ struct f2fs_inode_info {
+ unsigned char i_compress_algorithm; /* algorithm type */
+ unsigned char i_log_cluster_size; /* log of cluster size */
+ unsigned char i_compress_level; /* compress level (lz4hc,zstd) */
+- unsigned short i_compress_flag; /* compress flag */
++ unsigned char i_compress_flag; /* compress flag */
+ unsigned int i_cluster_size; /* cluster size */
+
+ unsigned int atomic_write_cnt;
+@@ -4339,9 +4339,8 @@ static inline int set_compress_context(struct inode *inode)
+ if ((F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 ||
+ F2FS_I(inode)->i_compress_algorithm == COMPRESS_ZSTD) &&
+ F2FS_OPTION(sbi).compress_level)
+- F2FS_I(inode)->i_compress_flag |=
+- F2FS_OPTION(sbi).compress_level <<
+- COMPRESS_LEVEL_OFFSET;
++ F2FS_I(inode)->i_compress_level =
++ F2FS_OPTION(sbi).compress_level;
+ F2FS_I(inode)->i_flags |= F2FS_COMPR_FL;
+ set_inode_flag(inode, FI_COMPRESSED_FILE);
+ stat_inc_compr_inode(inode);
+diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
+index 1fc7760499f10..933554985d328 100644
+--- a/fs/f2fs/inode.c
++++ b/fs/f2fs/inode.c
+@@ -450,11 +450,17 @@ static int do_read_inode(struct inode *inode)
+ (fi->i_flags & F2FS_COMPR_FL)) {
+ if (F2FS_FITS_IN_INODE(ri, fi->i_extra_isize,
+ i_log_cluster_size)) {
++ unsigned short compress_flag;
++
+ atomic_set(&fi->i_compr_blocks,
+ le64_to_cpu(ri->i_compr_blocks));
+ fi->i_compress_algorithm = ri->i_compress_algorithm;
+ fi->i_log_cluster_size = ri->i_log_cluster_size;
+- fi->i_compress_flag = le16_to_cpu(ri->i_compress_flag);
++ compress_flag = le16_to_cpu(ri->i_compress_flag);
++ fi->i_compress_level = compress_flag >>
++ COMPRESS_LEVEL_OFFSET;
++ fi->i_compress_flag = compress_flag &
++ (BIT(COMPRESS_LEVEL_OFFSET) - 1);
+ fi->i_cluster_size = 1 << fi->i_log_cluster_size;
+ set_inode_flag(inode, FI_COMPRESSED_FILE);
+ }
+@@ -675,13 +681,17 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
+ if (f2fs_sb_has_compression(F2FS_I_SB(inode)) &&
+ F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize,
+ i_log_cluster_size)) {
++ unsigned short compress_flag;
++
+ ri->i_compr_blocks =
+ cpu_to_le64(atomic_read(
+ &F2FS_I(inode)->i_compr_blocks));
+ ri->i_compress_algorithm =
+ F2FS_I(inode)->i_compress_algorithm;
+- ri->i_compress_flag =
+- cpu_to_le16(F2FS_I(inode)->i_compress_flag);
++ compress_flag = F2FS_I(inode)->i_compress_flag |
++ F2FS_I(inode)->i_compress_level <<
++ COMPRESS_LEVEL_OFFSET;
++ ri->i_compress_flag = cpu_to_le16(compress_flag);
+ ri->i_log_cluster_size =
+ F2FS_I(inode)->i_log_cluster_size;
+ }
+--
+2.43.0
+
--- /dev/null
+From 5a0421b515853a9187b83dfe12fc55938c7eaa84 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 16 Feb 2023 21:53:24 +0800
+Subject: f2fs: convert to use bitmap API
+
+From: Yangtao Li <frank.li@vivo.com>
+
+[ Upstream commit 447286ebadaafa551550704ff0b42eb08b1d1cb2 ]
+
+Let's use BIT() and GENMASK() instead of open-coding the shifts and masks.
+
+Signed-off-by: Yangtao Li <frank.li@vivo.com>
+Reviewed-by: Chao Yu <chao@kernel.org>
+Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
+Stable-dep-of: f5f3bd903a5d ("f2fs: set the default compress_level on ioctl")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/f2fs/checkpoint.c | 2 +-
+ fs/f2fs/compress.c | 4 ++--
+ fs/f2fs/data.c | 12 ++++++------
+ fs/f2fs/dir.c | 2 +-
+ fs/f2fs/f2fs.h | 26 +++++++++++++-------------
+ fs/f2fs/file.c | 2 +-
+ fs/f2fs/inode.c | 4 ++--
+ fs/f2fs/node.h | 20 +++++++++-----------
+ fs/f2fs/super.c | 16 ++++++++--------
+ fs/f2fs/sysfs.c | 2 +-
+ include/linux/f2fs_fs.h | 9 ++++-----
+ 11 files changed, 48 insertions(+), 51 deletions(-)
+
+diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
+index 5df04ed010cae..eb4d69f53337f 100644
+--- a/fs/f2fs/checkpoint.c
++++ b/fs/f2fs/checkpoint.c
+@@ -984,7 +984,7 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi)
+
+ cp_blk_no = le32_to_cpu(fsb->cp_blkaddr);
+ if (cur_page == cp2)
+- cp_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg);
++ cp_blk_no += BIT(le32_to_cpu(fsb->log_blocks_per_seg));
+
+ for (i = 1; i < cp_blks; i++) {
+ void *sit_bitmap_ptr;
+diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
+index d509b47381d51..c3ba202a7c29f 100644
+--- a/fs/f2fs/compress.c
++++ b/fs/f2fs/compress.c
+@@ -673,7 +673,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
+
+ cc->cbuf->clen = cpu_to_le32(cc->clen);
+
+- if (fi->i_compress_flag & 1 << COMPRESS_CHKSUM)
++ if (fi->i_compress_flag & BIT(COMPRESS_CHKSUM))
+ chksum = f2fs_crc32(F2FS_I_SB(cc->inode),
+ cc->cbuf->cdata, cc->clen);
+ cc->cbuf->chksum = cpu_to_le32(chksum);
+@@ -771,7 +771,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task)
+
+ ret = cops->decompress_pages(dic);
+
+- if (!ret && (fi->i_compress_flag & 1 << COMPRESS_CHKSUM)) {
++ if (!ret && (fi->i_compress_flag & BIT(COMPRESS_CHKSUM))) {
+ u32 provided = le32_to_cpu(dic->cbuf->chksum);
+ u32 calculated = f2fs_crc32(sbi, dic->cbuf->cdata, dic->clen);
+
+diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
+index ea05710ca9bdf..3666c1fd77a64 100644
+--- a/fs/f2fs/data.c
++++ b/fs/f2fs/data.c
+@@ -95,17 +95,17 @@ static enum count_type __read_io_type(struct page *page)
+ /* postprocessing steps for read bios */
+ enum bio_post_read_step {
+ #ifdef CONFIG_FS_ENCRYPTION
+- STEP_DECRYPT = 1 << 0,
++ STEP_DECRYPT = BIT(0),
+ #else
+ STEP_DECRYPT = 0, /* compile out the decryption-related code */
+ #endif
+ #ifdef CONFIG_F2FS_FS_COMPRESSION
+- STEP_DECOMPRESS = 1 << 1,
++ STEP_DECOMPRESS = BIT(1),
+ #else
+ STEP_DECOMPRESS = 0, /* compile out the decompression-related code */
+ #endif
+ #ifdef CONFIG_FS_VERITY
+- STEP_VERITY = 1 << 2,
++ STEP_VERITY = BIT(2),
+ #else
+ STEP_VERITY = 0, /* compile out the verity-related code */
+ #endif
+@@ -409,7 +409,7 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr)
+
+ static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio)
+ {
+- unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1;
++ unsigned int temp_mask = GENMASK(NR_TEMP_TYPE - 1, 0);
+ unsigned int fua_flag, meta_flag, io_flag;
+ blk_opf_t op_flags = 0;
+
+@@ -431,9 +431,9 @@ static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio)
+ * 5 | 4 | 3 | 2 | 1 | 0 |
+ * Cold | Warm | Hot | Cold | Warm | Hot |
+ */
+- if ((1 << fio->temp) & meta_flag)
++ if (BIT(fio->temp) & meta_flag)
+ op_flags |= REQ_META;
+- if ((1 << fio->temp) & fua_flag)
++ if (BIT(fio->temp) & fua_flag)
+ op_flags |= REQ_FUA;
+ return op_flags;
+ }
+diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
+index 8373eba3a1337..510736d2ae110 100644
+--- a/fs/f2fs/dir.c
++++ b/fs/f2fs/dir.c
+@@ -29,7 +29,7 @@ static unsigned long dir_blocks(struct inode *inode)
+ static unsigned int dir_buckets(unsigned int level, int dir_level)
+ {
+ if (level + dir_level < MAX_DIR_HASH_DEPTH / 2)
+- return 1 << (level + dir_level);
++ return BIT(level + dir_level);
+ else
+ return MAX_DIR_BUCKETS;
+ }
+diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
+index faf1a4953e845..6fa3ac2097b27 100644
+--- a/fs/f2fs/f2fs.h
++++ b/fs/f2fs/f2fs.h
+@@ -64,7 +64,7 @@ enum {
+ };
+
+ #ifdef CONFIG_F2FS_FAULT_INJECTION
+-#define F2FS_ALL_FAULT_TYPE ((1 << FAULT_MAX) - 1)
++#define F2FS_ALL_FAULT_TYPE (GENMASK(FAULT_MAX - 1, 0))
+
+ struct f2fs_fault_info {
+ atomic_t inject_ops;
+@@ -73,7 +73,7 @@ struct f2fs_fault_info {
+ };
+
+ extern const char *f2fs_fault_name[FAULT_MAX];
+-#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type)))
++#define IS_FAULT_SET(fi, type) ((fi)->inject_type & BIT(type))
+ #endif
+
+ /*
+@@ -1412,7 +1412,7 @@ static inline void set_page_private_##name(struct page *page) \
+ static inline void clear_page_private_##name(struct page *page) \
+ { \
+ clear_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \
+- if (page_private(page) == 1 << PAGE_PRIVATE_NOT_POINTER) { \
++ if (page_private(page) == BIT(PAGE_PRIVATE_NOT_POINTER)) { \
+ set_page_private(page, 0); \
+ if (PagePrivate(page)) { \
+ ClearPagePrivate(page); \
+@@ -1462,8 +1462,8 @@ static inline void set_page_private_data(struct page *page, unsigned long data)
+
+ static inline void clear_page_private_data(struct page *page)
+ {
+- page_private(page) &= (1 << PAGE_PRIVATE_MAX) - 1;
+- if (page_private(page) == 1 << PAGE_PRIVATE_NOT_POINTER) {
++ page_private(page) &= GENMASK(PAGE_PRIVATE_MAX - 1, 0);
++ if (page_private(page) == BIT(PAGE_PRIVATE_NOT_POINTER)) {
+ set_page_private(page, 0);
+ if (PagePrivate(page)) {
+ ClearPagePrivate(page);
+@@ -2882,7 +2882,7 @@ static inline int f2fs_test_bit(unsigned int nr, char *addr)
+ int mask;
+
+ addr += (nr >> 3);
+- mask = 1 << (7 - (nr & 0x07));
++ mask = BIT(7 - (nr & 0x07));
+ return mask & *addr;
+ }
+
+@@ -2891,7 +2891,7 @@ static inline void f2fs_set_bit(unsigned int nr, char *addr)
+ int mask;
+
+ addr += (nr >> 3);
+- mask = 1 << (7 - (nr & 0x07));
++ mask = BIT(7 - (nr & 0x07));
+ *addr |= mask;
+ }
+
+@@ -2900,7 +2900,7 @@ static inline void f2fs_clear_bit(unsigned int nr, char *addr)
+ int mask;
+
+ addr += (nr >> 3);
+- mask = 1 << (7 - (nr & 0x07));
++ mask = BIT(7 - (nr & 0x07));
+ *addr &= ~mask;
+ }
+
+@@ -2910,7 +2910,7 @@ static inline int f2fs_test_and_set_bit(unsigned int nr, char *addr)
+ int ret;
+
+ addr += (nr >> 3);
+- mask = 1 << (7 - (nr & 0x07));
++ mask = BIT(7 - (nr & 0x07));
+ ret = mask & *addr;
+ *addr |= mask;
+ return ret;
+@@ -2922,7 +2922,7 @@ static inline int f2fs_test_and_clear_bit(unsigned int nr, char *addr)
+ int ret;
+
+ addr += (nr >> 3);
+- mask = 1 << (7 - (nr & 0x07));
++ mask = BIT(7 - (nr & 0x07));
+ ret = mask & *addr;
+ *addr &= ~mask;
+ return ret;
+@@ -2933,7 +2933,7 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr)
+ int mask;
+
+ addr += (nr >> 3);
+- mask = 1 << (7 - (nr & 0x07));
++ mask = BIT(7 - (nr & 0x07));
+ *addr ^= mask;
+ }
+
+@@ -4333,9 +4333,9 @@ static inline int set_compress_context(struct inode *inode)
+ F2FS_OPTION(sbi).compress_log_size;
+ F2FS_I(inode)->i_compress_flag =
+ F2FS_OPTION(sbi).compress_chksum ?
+- 1 << COMPRESS_CHKSUM : 0;
++ BIT(COMPRESS_CHKSUM) : 0;
+ F2FS_I(inode)->i_cluster_size =
+- 1 << F2FS_I(inode)->i_log_cluster_size;
++ BIT(F2FS_I(inode)->i_log_cluster_size);
+ if ((F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 ||
+ F2FS_I(inode)->i_compress_algorithm == COMPRESS_ZSTD) &&
+ F2FS_OPTION(sbi).compress_level)
+diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
+index d0c17366ebf48..126c074deebdc 100644
+--- a/fs/f2fs/file.c
++++ b/fs/f2fs/file.c
+@@ -3983,7 +3983,7 @@ static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg)
+
+ F2FS_I(inode)->i_compress_algorithm = option.algorithm;
+ F2FS_I(inode)->i_log_cluster_size = option.log_cluster_size;
+- F2FS_I(inode)->i_cluster_size = 1 << option.log_cluster_size;
++ F2FS_I(inode)->i_cluster_size = BIT(option.log_cluster_size);
+ f2fs_mark_inode_dirty_sync(inode, true);
+
+ if (!f2fs_is_compress_backend_ready(inode))
+diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
+index 933554985d328..0010579f17368 100644
+--- a/fs/f2fs/inode.c
++++ b/fs/f2fs/inode.c
+@@ -460,8 +460,8 @@ static int do_read_inode(struct inode *inode)
+ fi->i_compress_level = compress_flag >>
+ COMPRESS_LEVEL_OFFSET;
+ fi->i_compress_flag = compress_flag &
+- (BIT(COMPRESS_LEVEL_OFFSET) - 1);
+- fi->i_cluster_size = 1 << fi->i_log_cluster_size;
++ GENMASK(COMPRESS_LEVEL_OFFSET - 1, 0);
++ fi->i_cluster_size = BIT(fi->i_log_cluster_size);
+ set_inode_flag(inode, FI_COMPRESSED_FILE);
+ }
+ }
+diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
+index 0aa48704c77a0..7068f3ac036a5 100644
+--- a/fs/f2fs/node.h
++++ b/fs/f2fs/node.h
+@@ -93,17 +93,15 @@ static inline void copy_node_info(struct node_info *dst,
+ static inline void set_nat_flag(struct nat_entry *ne,
+ unsigned int type, bool set)
+ {
+- unsigned char mask = 0x01 << type;
+ if (set)
+- ne->ni.flag |= mask;
++ ne->ni.flag |= BIT(type);
+ else
+- ne->ni.flag &= ~mask;
++ ne->ni.flag &= ~BIT(type);
+ }
+
+ static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type)
+ {
+- unsigned char mask = 0x01 << type;
+- return ne->ni.flag & mask;
++ return ne->ni.flag & BIT(type);
+ }
+
+ static inline void nat_reset_flag(struct nat_entry *ne)
+@@ -224,7 +222,7 @@ static inline pgoff_t next_nat_addr(struct f2fs_sb_info *sbi,
+ struct f2fs_nm_info *nm_i = NM_I(sbi);
+
+ block_addr -= nm_i->nat_blkaddr;
+- block_addr ^= 1 << sbi->log_blocks_per_seg;
++ block_addr ^= BIT(sbi->log_blocks_per_seg);
+ return block_addr + nm_i->nat_blkaddr;
+ }
+
+@@ -394,7 +392,7 @@ static inline nid_t get_nid(struct page *p, int off, bool i)
+ static inline int is_node(struct page *page, int type)
+ {
+ struct f2fs_node *rn = F2FS_NODE(page);
+- return le32_to_cpu(rn->footer.flag) & (1 << type);
++ return le32_to_cpu(rn->footer.flag) & BIT(type);
+ }
+
+ #define is_cold_node(page) is_node(page, COLD_BIT_SHIFT)
+@@ -407,9 +405,9 @@ static inline void set_cold_node(struct page *page, bool is_dir)
+ unsigned int flag = le32_to_cpu(rn->footer.flag);
+
+ if (is_dir)
+- flag &= ~(0x1 << COLD_BIT_SHIFT);
++ flag &= ~BIT(COLD_BIT_SHIFT);
+ else
+- flag |= (0x1 << COLD_BIT_SHIFT);
++ flag |= BIT(COLD_BIT_SHIFT);
+ rn->footer.flag = cpu_to_le32(flag);
+ }
+
+@@ -418,9 +416,9 @@ static inline void set_mark(struct page *page, int mark, int type)
+ struct f2fs_node *rn = F2FS_NODE(page);
+ unsigned int flag = le32_to_cpu(rn->footer.flag);
+ if (mark)
+- flag |= (0x1 << type);
++ flag |= BIT(type);
+ else
+- flag &= ~(0x1 << type);
++ flag &= ~BIT(type);
+ rn->footer.flag = cpu_to_le32(flag);
+
+ #ifdef CONFIG_F2FS_CHECK_FS
+diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
+index 1ba85ef97cbd3..4f87e0e374c25 100644
+--- a/fs/f2fs/super.c
++++ b/fs/f2fs/super.c
+@@ -898,8 +898,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
+ if (args->from && match_int(args, &arg))
+ return -EINVAL;
+ if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_VECS)) {
+- f2fs_warn(sbi, "Not support %d, larger than %d",
+- 1 << arg, BIO_MAX_VECS);
++ f2fs_warn(sbi, "Not support %ld, larger than %d",
++ BIT(arg), BIO_MAX_VECS);
+ return -EINVAL;
+ }
+ F2FS_OPTION(sbi).write_io_size_bits = arg;
+@@ -1340,7 +1340,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
+ #endif
+
+ if (F2FS_IO_SIZE_BITS(sbi) && !f2fs_lfs_mode(sbi)) {
+- f2fs_err(sbi, "Should set mode=lfs with %uKB-sized IO",
++ f2fs_err(sbi, "Should set mode=lfs with %luKB-sized IO",
+ F2FS_IO_SIZE_KB(sbi));
+ return -EINVAL;
+ }
+@@ -3356,7 +3356,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
+ total_sections = le32_to_cpu(raw_super->section_count);
+
+ /* blocks_per_seg should be 512, given the above check */
+- blocks_per_seg = 1 << le32_to_cpu(raw_super->log_blocks_per_seg);
++ blocks_per_seg = BIT(le32_to_cpu(raw_super->log_blocks_per_seg));
+
+ if (segment_count > F2FS_MAX_SEGMENT ||
+ segment_count < F2FS_MIN_SEGMENTS) {
+@@ -3625,9 +3625,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
+ sbi->log_sectors_per_block =
+ le32_to_cpu(raw_super->log_sectors_per_block);
+ sbi->log_blocksize = le32_to_cpu(raw_super->log_blocksize);
+- sbi->blocksize = 1 << sbi->log_blocksize;
++ sbi->blocksize = BIT(sbi->log_blocksize);
+ sbi->log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg);
+- sbi->blocks_per_seg = 1 << sbi->log_blocks_per_seg;
++ sbi->blocks_per_seg = BIT(sbi->log_blocks_per_seg);
+ sbi->segs_per_sec = le32_to_cpu(raw_super->segs_per_sec);
+ sbi->secs_per_zone = le32_to_cpu(raw_super->secs_per_zone);
+ sbi->total_sections = le32_to_cpu(raw_super->section_count);
+@@ -3883,7 +3883,7 @@ void f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason)
+
+ f2fs_down_write(&sbi->sb_lock);
+
+- if (raw_super->s_stop_reason[reason] < ((1 << BITS_PER_BYTE) - 1))
++ if (raw_super->s_stop_reason[reason] < GENMASK(BITS_PER_BYTE - 1, 0))
+ raw_super->s_stop_reason[reason]++;
+
+ err = f2fs_commit_super(sbi, false);
+@@ -4033,7 +4033,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
+ FDEV(i).start_blk, FDEV(i).end_blk);
+ }
+ f2fs_info(sbi,
+- "IO Block Size: %8d KB", F2FS_IO_SIZE_KB(sbi));
++ "IO Block Size: %8ld KB", F2FS_IO_SIZE_KB(sbi));
+ return 0;
+ }
+
+diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
+index 3d68bfa75cf2a..751a108e612ff 100644
+--- a/fs/f2fs/sysfs.c
++++ b/fs/f2fs/sysfs.c
+@@ -451,7 +451,7 @@ static ssize_t __sbi_store(struct f2fs_attr *a,
+ if (ret < 0)
+ return ret;
+ #ifdef CONFIG_F2FS_FAULT_INJECTION
+- if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX))
++ if (a->struct_type == FAULT_INFO_TYPE && t >= BIT(FAULT_MAX))
+ return -EINVAL;
+ if (a->struct_type == FAULT_INFO_RATE && t >= UINT_MAX)
+ return -EINVAL;
+diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
+index ee0d75d9a302d..1e0df607e40c4 100644
+--- a/include/linux/f2fs_fs.h
++++ b/include/linux/f2fs_fs.h
+@@ -40,9 +40,8 @@
+
+ #define F2FS_ENC_UTF8_12_1 1
+
+-#define F2FS_IO_SIZE(sbi) (1 << F2FS_OPTION(sbi).write_io_size_bits) /* Blocks */
+-#define F2FS_IO_SIZE_KB(sbi) (1 << (F2FS_OPTION(sbi).write_io_size_bits + 2)) /* KB */
+-#define F2FS_IO_SIZE_BYTES(sbi) (1 << (F2FS_OPTION(sbi).write_io_size_bits + 12)) /* B */
++#define F2FS_IO_SIZE(sbi) BIT(F2FS_OPTION(sbi).write_io_size_bits) /* Blocks */
++#define F2FS_IO_SIZE_KB(sbi) BIT(F2FS_OPTION(sbi).write_io_size_bits + 2) /* KB */
+ #define F2FS_IO_SIZE_BITS(sbi) (F2FS_OPTION(sbi).write_io_size_bits) /* power of 2 */
+ #define F2FS_IO_SIZE_MASK(sbi) (F2FS_IO_SIZE(sbi) - 1)
+ #define F2FS_IO_ALIGNED(sbi) (F2FS_IO_SIZE(sbi) > 1)
+@@ -340,7 +339,7 @@ enum {
+ OFFSET_BIT_SHIFT
+ };
+
+-#define OFFSET_BIT_MASK (0x07) /* (0x01 << OFFSET_BIT_SHIFT) - 1 */
++#define OFFSET_BIT_MASK GENMASK(OFFSET_BIT_SHIFT - 1, 0)
+
+ struct node_footer {
+ __le32 nid; /* node id */
+@@ -545,7 +544,7 @@ typedef __le32 f2fs_hash_t;
+ #define MAX_DIR_HASH_DEPTH 63
+
+ /* MAX buckets in one level of dir */
+-#define MAX_DIR_BUCKETS (1 << ((MAX_DIR_HASH_DEPTH / 2) - 1))
++#define MAX_DIR_BUCKETS BIT((MAX_DIR_HASH_DEPTH / 2) - 1)
+
+ /*
+ * space utilization of regular dentry and inline dentry (w/o extra reservation)
+--
+2.43.0
+
--- /dev/null
+From 58e5af6fa360d2c24949fe3057b862c27142ed6c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Sep 2023 15:41:42 -0700
+Subject: f2fs: set the default compress_level on ioctl
+
+From: Jaegeuk Kim <jaegeuk@kernel.org>
+
+[ Upstream commit f5f3bd903a5d3e3b2ba89f11e0e29db25e60c048 ]
+
+Otherwise, we'll get a broken inode.
+
+ # touch $FILE
+ # f2fs_io setflags compression $FILE
+ # f2fs_io set_coption 2 8 $FILE
+
+[ 112.227612] F2FS-fs (dm-51): sanity_check_compress_inode: inode (ino=8d3fe) has unsupported compress level: 0, run fsck to fix
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/f2fs/file.c | 9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
+index 126c074deebdc..9b9fb3c57ec6c 100644
+--- a/fs/f2fs/file.c
++++ b/fs/f2fs/file.c
+@@ -3984,6 +3984,15 @@ static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg)
+ F2FS_I(inode)->i_compress_algorithm = option.algorithm;
+ F2FS_I(inode)->i_log_cluster_size = option.log_cluster_size;
+ F2FS_I(inode)->i_cluster_size = BIT(option.log_cluster_size);
++ /* Set default level */
++ if (F2FS_I(inode)->i_compress_algorithm == COMPRESS_ZSTD)
++ F2FS_I(inode)->i_compress_level = F2FS_ZSTD_DEFAULT_CLEVEL;
++ else
++ F2FS_I(inode)->i_compress_level = 0;
++ /* Adjust mount option level */
++ if (option.algorithm == F2FS_OPTION(sbi).compress_algorithm &&
++ F2FS_OPTION(sbi).compress_level)
++ F2FS_I(inode)->i_compress_level = F2FS_OPTION(sbi).compress_level;
+ f2fs_mark_inode_dirty_sync(inode, true);
+
+ if (!f2fs_is_compress_backend_ready(inode))
+--
+2.43.0
+
--- /dev/null
+From 0dba7e14edb61efe1ef25501ef2902a7619970f8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 27 Oct 2023 15:04:56 +0300
+Subject: fbdev: imsttfb: fix double free in probe()
+
+From: Dan Carpenter <dan.carpenter@linaro.org>
+
+[ Upstream commit e08c30efda21ef4c0ec084a3a9581c220b442ba9 ]
+
+The init_imstt() function calls framebuffer_release() on error and then
+the probe() function calls it again. It should only be done in probe.
+
+Fixes: 518ecb6a209f ("fbdev: imsttfb: Fix error path of imsttfb_probe()")
+Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
+Signed-off-by: Helge Deller <deller@gmx.de>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/video/fbdev/imsttfb.c | 6 +-----
+ 1 file changed, 1 insertion(+), 5 deletions(-)
+
+diff --git a/drivers/video/fbdev/imsttfb.c b/drivers/video/fbdev/imsttfb.c
+index 3d1ae5267a738..aa51cb72cbba5 100644
+--- a/drivers/video/fbdev/imsttfb.c
++++ b/drivers/video/fbdev/imsttfb.c
+@@ -1419,7 +1419,6 @@ static int init_imstt(struct fb_info *info)
+ if ((info->var.xres * info->var.yres) * (info->var.bits_per_pixel >> 3) > info->fix.smem_len
+ || !(compute_imstt_regvals(par, info->var.xres, info->var.yres))) {
+ printk("imsttfb: %ux%ux%u not supported\n", info->var.xres, info->var.yres, info->var.bits_per_pixel);
+- framebuffer_release(info);
+ return -ENODEV;
+ }
+
+@@ -1452,14 +1451,11 @@ static int init_imstt(struct fb_info *info)
+ FBINFO_HWACCEL_FILLRECT |
+ FBINFO_HWACCEL_YPAN;
+
+- if (fb_alloc_cmap(&info->cmap, 0, 0)) {
+- framebuffer_release(info);
++ if (fb_alloc_cmap(&info->cmap, 0, 0))
+ return -ENODEV;
+- }
+
+ if (register_framebuffer(info) < 0) {
+ fb_dealloc_cmap(&info->cmap);
+- framebuffer_release(info);
+ return -ENODEV;
+ }
+
+--
+2.43.0
+
--- /dev/null
+From 1d9d0ecc1ce7f53db132d35e037803dd1265e7a4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 27 May 2023 11:28:36 +0200
+Subject: fbdev: imsttfb: Release framebuffer and dealloc cmap on error path
+
+From: Helge Deller <deller@gmx.de>
+
+[ Upstream commit 5cf9a090a39c97f4506b7b53739d469b1c05a7e9 ]
+
+Add missing cleanups in error path.
+
+Signed-off-by: Helge Deller <deller@gmx.de>
+Stable-dep-of: e08c30efda21 ("fbdev: imsttfb: fix double free in probe()")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/video/fbdev/imsttfb.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/video/fbdev/imsttfb.c b/drivers/video/fbdev/imsttfb.c
+index b194e71f07bfc..3d1ae5267a738 100644
+--- a/drivers/video/fbdev/imsttfb.c
++++ b/drivers/video/fbdev/imsttfb.c
+@@ -1452,9 +1452,13 @@ static int init_imstt(struct fb_info *info)
+ FBINFO_HWACCEL_FILLRECT |
+ FBINFO_HWACCEL_YPAN;
+
+- fb_alloc_cmap(&info->cmap, 0, 0);
++ if (fb_alloc_cmap(&info->cmap, 0, 0)) {
++ framebuffer_release(info);
++ return -ENODEV;
++ }
+
+ if (register_framebuffer(info) < 0) {
++ fb_dealloc_cmap(&info->cmap);
+ framebuffer_release(info);
+ return -ENODEV;
+ }
+--
+2.43.0
+
--- /dev/null
+From 28a5490b3586d1c511530d0848ade4165e206e96 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 25 Oct 2023 16:10:17 +0200
+Subject: filemap: add a per-mapping stable writes flag
+
+From: Christoph Hellwig <hch@lst.de>
+
+[ Upstream commit 762321dab9a72760bf9aec48362f932717c9424d ]
+
+folio_wait_stable waits for writeback to finish before modifying the
+contents of a folio again, e.g. to support check summing of the data
+in the block integrity code.
+
+Currently this behavior is controlled by the SB_I_STABLE_WRITES flag
+on the super_block, which means it is uniform for the entire file system.
+This is wrong for the block device pseudofs which is shared by all
+block devices, or file systems that can use multiple devices like XFS
+with the RT subvolume or btrfs (although btrfs currently reimplements
+folio_wait_stable anyway).
+
+Add a per-address_space AS_STABLE_WRITES flag to control the behavior
+in a more fine grained way. The existing SB_I_STABLE_WRITES is kept
+to initialize AS_STABLE_WRITES to the existing default which covers
+most cases.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Link: https://lore.kernel.org/r/20231025141020.192413-2-hch@lst.de
+Tested-by: Ilya Dryomov <idryomov@gmail.com>
+Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Stable-dep-of: 1898efcdbed3 ("block: update the stable_writes flag in bdev_add")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/inode.c | 2 ++
+ include/linux/pagemap.h | 17 +++++++++++++++++
+ mm/page-writeback.c | 2 +-
+ 3 files changed, 20 insertions(+), 1 deletion(-)
+
+diff --git a/fs/inode.c b/fs/inode.c
+index 73ad1b0d47758..8cfda7a6d5900 100644
+--- a/fs/inode.c
++++ b/fs/inode.c
+@@ -215,6 +215,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
+ lockdep_set_class_and_name(&mapping->invalidate_lock,
+ &sb->s_type->invalidate_lock_key,
+ "mapping.invalidate_lock");
++ if (sb->s_iflags & SB_I_STABLE_WRITES)
++ mapping_set_stable_writes(mapping);
+ inode->i_private = NULL;
+ inode->i_mapping = mapping;
+ INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */
+diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
+index fdbb90ae56c70..1be5a1fa6a3a8 100644
+--- a/include/linux/pagemap.h
++++ b/include/linux/pagemap.h
+@@ -200,6 +200,8 @@ enum mapping_flags {
+ AS_NO_WRITEBACK_TAGS = 5,
+ AS_LARGE_FOLIO_SUPPORT = 6,
+ AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */
++ AS_STABLE_WRITES, /* must wait for writeback before modifying
++ folio contents */
+ };
+
+ /**
+@@ -285,6 +287,21 @@ static inline void mapping_clear_release_always(struct address_space *mapping)
+ clear_bit(AS_RELEASE_ALWAYS, &mapping->flags);
+ }
+
++static inline bool mapping_stable_writes(const struct address_space *mapping)
++{
++ return test_bit(AS_STABLE_WRITES, &mapping->flags);
++}
++
++static inline void mapping_set_stable_writes(struct address_space *mapping)
++{
++ set_bit(AS_STABLE_WRITES, &mapping->flags);
++}
++
++static inline void mapping_clear_stable_writes(struct address_space *mapping)
++{
++ clear_bit(AS_STABLE_WRITES, &mapping->flags);
++}
++
+ static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
+ {
+ return mapping->gfp_mask;
+diff --git a/mm/page-writeback.c b/mm/page-writeback.c
+index 7e9d8d857ecca..de5f69921b946 100644
+--- a/mm/page-writeback.c
++++ b/mm/page-writeback.c
+@@ -3078,7 +3078,7 @@ EXPORT_SYMBOL_GPL(folio_wait_writeback_killable);
+ */
+ void folio_wait_stable(struct folio *folio)
+ {
+- if (folio_inode(folio)->i_sb->s_iflags & SB_I_STABLE_WRITES)
++ if (mapping_stable_writes(folio_mapping(folio)))
+ folio_wait_writeback(folio);
+ }
+ EXPORT_SYMBOL_GPL(folio_wait_stable);
+--
+2.43.0
+
--- /dev/null
+From 0be29a4228fb46ed71ceb9d3ce17be8b03862eba Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 30 Nov 2023 20:43:42 +0000
+Subject: firmware: arm_scmi: Fix frequency truncation by promoting multiplier
+ type
+
+From: Sudeep Holla <sudeep.holla@arm.com>
+
+[ Upstream commit 8e3c98d9187e09274fc000a7d1a77b070a42d259 ]
+
+Fix the possible frequency truncation for all values equal to or greater
+than 4GHz on 64-bit machines by updating the multiplier 'mult_factor' to
+'unsigned long' type. It is also possible that the multiplier itself can
+be greater than or equal to 2^32. So we need to also fix the equation
+computing the value of the multiplier.
+
+Fixes: a9e3fbfaa0ff ("firmware: arm_scmi: add initial support for performance protocol")
+Reported-by: Sibi Sankar <quic_sibis@quicinc.com>
+Closes: https://lore.kernel.org/all/20231129065748.19871-3-quic_sibis@quicinc.com/
+Cc: Cristian Marussi <cristian.marussi@arm.com>
+Link: https://lore.kernel.org/r/20231130204343.503076-1-sudeep.holla@arm.com
+Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/firmware/arm_scmi/perf.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/drivers/firmware/arm_scmi/perf.c b/drivers/firmware/arm_scmi/perf.c
+index 431bda9165c3d..2775bcafe40f6 100644
+--- a/drivers/firmware/arm_scmi/perf.c
++++ b/drivers/firmware/arm_scmi/perf.c
+@@ -131,7 +131,7 @@ struct perf_dom_info {
+ u32 opp_count;
+ u32 sustained_freq_khz;
+ u32 sustained_perf_level;
+- u32 mult_factor;
++ unsigned long mult_factor;
+ char name[SCMI_MAX_STR_SIZE];
+ struct scmi_opp opp[MAX_OPPS];
+ struct scmi_fc_info *fc_info;
+@@ -223,8 +223,8 @@ scmi_perf_domain_attributes_get(const struct scmi_protocol_handle *ph,
+ dom_info->mult_factor = 1000;
+ else
+ dom_info->mult_factor =
+- (dom_info->sustained_freq_khz * 1000) /
+- dom_info->sustained_perf_level;
++ (dom_info->sustained_freq_khz * 1000UL)
++ / dom_info->sustained_perf_level;
+ strscpy(dom_info->name, attr->name, SCMI_SHORT_NAME_MAX_SIZE);
+ }
+
+--
+2.43.0
+
--- /dev/null
+From 88fe3a4d6d033d9103e986952d44d4c647deba38 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 27 Dec 2022 10:29:02 +0800
+Subject: genirq/affinity: Don't pass irq_affinity_desc array to
+ irq_build_affinity_masks
+
+From: Ming Lei <ming.lei@redhat.com>
+
+[ Upstream commit e7bdd7f0cbd1c001bb9b4d3313edc5ee094bc3f8 ]
+
+Prepare for abstracting irq_build_affinity_masks() into a public function
+for assigning all CPUs evenly into several groups.
+
+Don't pass irq_affinity_desc array to irq_build_affinity_masks, instead
+return a cpumask array by storing each assigned group into one element of
+the array.
+
+This allows providing a generic interface for grouping all CPUs evenly
+from a NUMA and CPU locality viewpoint, and the cost is one extra allocation
+in irq_build_affinity_masks(), which should be fine since it is done via
+GFP_KERNEL and irq_build_affinity_masks() is a slow path anyway.
+
+Signed-off-by: Ming Lei <ming.lei@redhat.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: John Garry <john.g.garry@oracle.com>
+Reviewed-by: Jens Axboe <axboe@kernel.dk>
+Link: https://lore.kernel.org/r/20221227022905.352674-4-ming.lei@redhat.com
+Stable-dep-of: 0263f92fadbb ("lib/group_cpus.c: avoid acquiring cpu hotplug lock in group_cpus_evenly")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/irq/affinity.c | 34 ++++++++++++++++++++++++----------
+ 1 file changed, 24 insertions(+), 10 deletions(-)
+
+diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
+index da6379cd27fd4..00bba1020ecb2 100644
+--- a/kernel/irq/affinity.c
++++ b/kernel/irq/affinity.c
+@@ -249,7 +249,7 @@ static int __irq_build_affinity_masks(unsigned int startvec,
+ cpumask_var_t *node_to_cpumask,
+ const struct cpumask *cpu_mask,
+ struct cpumask *nmsk,
+- struct irq_affinity_desc *masks)
++ struct cpumask *masks)
+ {
+ unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0;
+ unsigned int last_affv = numvecs;
+@@ -270,7 +270,7 @@ static int __irq_build_affinity_masks(unsigned int startvec,
+ for_each_node_mask(n, nodemsk) {
+ /* Ensure that only CPUs which are in both masks are set */
+ cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+- cpumask_or(&masks[curvec].mask, &masks[curvec].mask, nmsk);
++ cpumask_or(&masks[curvec], &masks[curvec], nmsk);
+ if (++curvec == last_affv)
+ curvec = 0;
+ }
+@@ -321,7 +321,7 @@ static int __irq_build_affinity_masks(unsigned int startvec,
+ */
+ if (curvec >= last_affv)
+ curvec = 0;
+- irq_spread_init_one(&masks[curvec].mask, nmsk,
++ irq_spread_init_one(&masks[curvec], nmsk,
+ cpus_per_vec);
+ }
+ done += nv->nvectors;
+@@ -335,16 +335,16 @@ static int __irq_build_affinity_masks(unsigned int startvec,
+ * 1) spread present CPU on these vectors
+ * 2) spread other possible CPUs on these vectors
+ */
+-static int irq_build_affinity_masks(unsigned int numvecs,
+- struct irq_affinity_desc *masks)
++static struct cpumask *irq_build_affinity_masks(unsigned int numvecs)
+ {
+ unsigned int curvec = 0, nr_present = 0, nr_others = 0;
+ cpumask_var_t *node_to_cpumask;
+ cpumask_var_t nmsk, npresmsk;
+ int ret = -ENOMEM;
++ struct cpumask *masks = NULL;
+
+ if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
+- return ret;
++ return NULL;
+
+ if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
+ goto fail_nmsk;
+@@ -353,6 +353,10 @@ static int irq_build_affinity_masks(unsigned int numvecs,
+ if (!node_to_cpumask)
+ goto fail_npresmsk;
+
++ masks = kcalloc(numvecs, sizeof(*masks), GFP_KERNEL);
++ if (!masks)
++ goto fail_node_to_cpumask;
++
+ /* Stabilize the cpumasks */
+ cpus_read_lock();
+ build_node_to_cpumask(node_to_cpumask);
+@@ -386,6 +390,7 @@ static int irq_build_affinity_masks(unsigned int numvecs,
+ if (ret >= 0)
+ WARN_ON(nr_present + nr_others < numvecs);
+
++ fail_node_to_cpumask:
+ free_node_to_cpumask(node_to_cpumask);
+
+ fail_npresmsk:
+@@ -393,7 +398,11 @@ static int irq_build_affinity_masks(unsigned int numvecs,
+
+ fail_nmsk:
+ free_cpumask_var(nmsk);
+- return ret < 0 ? ret : 0;
++ if (ret < 0) {
++ kfree(masks);
++ return NULL;
++ }
++ return masks;
+ }
+
+ static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
+@@ -457,13 +466,18 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
+ */
+ for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
+ unsigned int this_vecs = affd->set_size[i];
+- int ret;
++ int j;
++ struct cpumask *result = irq_build_affinity_masks(this_vecs);
+
+- ret = irq_build_affinity_masks(this_vecs, &masks[curvec]);
+- if (ret) {
++ if (!result) {
+ kfree(masks);
+ return NULL;
+ }
++
++ for (j = 0; j < this_vecs; j++)
++ cpumask_copy(&masks[curvec + j].mask, &result[j]);
++ kfree(result);
++
+ curvec += this_vecs;
+ usedvecs += this_vecs;
+ }
+--
+2.43.0
+
--- /dev/null
+From ef4de3476be1d045915045b849fb143020fc8b84 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 27 Dec 2022 10:29:04 +0800
+Subject: genirq/affinity: Move group_cpus_evenly() into lib/
+
+From: Ming Lei <ming.lei@redhat.com>
+
+[ Upstream commit f7b3ea8cf72f3d6060fe08e461805181e7450a13 ]
+
+group_cpus_evenly() has become a generic function which can be used for
+other subsystems than the interrupt subsystem, so move it into lib/.
+
+Signed-off-by: Ming Lei <ming.lei@redhat.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Jens Axboe <axboe@kernel.dk>
+Link: https://lore.kernel.org/r/20221227022905.352674-6-ming.lei@redhat.com
+Stable-dep-of: 0263f92fadbb ("lib/group_cpus.c: avoid acquiring cpu hotplug lock in group_cpus_evenly")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ MAINTAINERS | 2 +
+ include/linux/group_cpus.h | 14 ++
+ kernel/irq/affinity.c | 398 +---------------------------------
+ lib/Makefile | 2 +
+ lib/group_cpus.c | 427 +++++++++++++++++++++++++++++++++++++
+ 5 files changed, 446 insertions(+), 397 deletions(-)
+ create mode 100644 include/linux/group_cpus.h
+ create mode 100644 lib/group_cpus.c
+
+diff --git a/MAINTAINERS b/MAINTAINERS
+index 07a9c274c0e29..13d1078808bb5 100644
+--- a/MAINTAINERS
++++ b/MAINTAINERS
+@@ -10803,6 +10803,8 @@ L: linux-kernel@vger.kernel.org
+ S: Maintained
+ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/core
+ F: kernel/irq/
++F: include/linux/group_cpus.h
++F: lib/group_cpus.c
+
+ IRQCHIP DRIVERS
+ M: Thomas Gleixner <tglx@linutronix.de>
+diff --git a/include/linux/group_cpus.h b/include/linux/group_cpus.h
+new file mode 100644
+index 0000000000000..e42807ec61f6e
+--- /dev/null
++++ b/include/linux/group_cpus.h
+@@ -0,0 +1,14 @@
++/* SPDX-License-Identifier: GPL-2.0-only */
++/*
++ * Copyright (C) 2016 Thomas Gleixner.
++ * Copyright (C) 2016-2017 Christoph Hellwig.
++ */
++
++#ifndef __LINUX_GROUP_CPUS_H
++#define __LINUX_GROUP_CPUS_H
++#include <linux/kernel.h>
++#include <linux/cpu.h>
++
++struct cpumask *group_cpus_evenly(unsigned int numgrps);
++
++#endif
+diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
+index 54083331f1bcb..44a4eba80315c 100644
+--- a/kernel/irq/affinity.c
++++ b/kernel/irq/affinity.c
+@@ -7,403 +7,7 @@
+ #include <linux/kernel.h>
+ #include <linux/slab.h>
+ #include <linux/cpu.h>
+-#include <linux/sort.h>
+-
+-static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
+- unsigned int cpus_per_grp)
+-{
+- const struct cpumask *siblmsk;
+- int cpu, sibl;
+-
+- for ( ; cpus_per_grp > 0; ) {
+- cpu = cpumask_first(nmsk);
+-
+- /* Should not happen, but I'm too lazy to think about it */
+- if (cpu >= nr_cpu_ids)
+- return;
+-
+- cpumask_clear_cpu(cpu, nmsk);
+- cpumask_set_cpu(cpu, irqmsk);
+- cpus_per_grp--;
+-
+- /* If the cpu has siblings, use them first */
+- siblmsk = topology_sibling_cpumask(cpu);
+- for (sibl = -1; cpus_per_grp > 0; ) {
+- sibl = cpumask_next(sibl, siblmsk);
+- if (sibl >= nr_cpu_ids)
+- break;
+- if (!cpumask_test_and_clear_cpu(sibl, nmsk))
+- continue;
+- cpumask_set_cpu(sibl, irqmsk);
+- cpus_per_grp--;
+- }
+- }
+-}
+-
+-static cpumask_var_t *alloc_node_to_cpumask(void)
+-{
+- cpumask_var_t *masks;
+- int node;
+-
+- masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL);
+- if (!masks)
+- return NULL;
+-
+- for (node = 0; node < nr_node_ids; node++) {
+- if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL))
+- goto out_unwind;
+- }
+-
+- return masks;
+-
+-out_unwind:
+- while (--node >= 0)
+- free_cpumask_var(masks[node]);
+- kfree(masks);
+- return NULL;
+-}
+-
+-static void free_node_to_cpumask(cpumask_var_t *masks)
+-{
+- int node;
+-
+- for (node = 0; node < nr_node_ids; node++)
+- free_cpumask_var(masks[node]);
+- kfree(masks);
+-}
+-
+-static void build_node_to_cpumask(cpumask_var_t *masks)
+-{
+- int cpu;
+-
+- for_each_possible_cpu(cpu)
+- cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
+-}
+-
+-static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
+- const struct cpumask *mask, nodemask_t *nodemsk)
+-{
+- int n, nodes = 0;
+-
+- /* Calculate the number of nodes in the supplied affinity mask */
+- for_each_node(n) {
+- if (cpumask_intersects(mask, node_to_cpumask[n])) {
+- node_set(n, *nodemsk);
+- nodes++;
+- }
+- }
+- return nodes;
+-}
+-
+-struct node_groups {
+- unsigned id;
+-
+- union {
+- unsigned ngroups;
+- unsigned ncpus;
+- };
+-};
+-
+-static int ncpus_cmp_func(const void *l, const void *r)
+-{
+- const struct node_groups *ln = l;
+- const struct node_groups *rn = r;
+-
+- return ln->ncpus - rn->ncpus;
+-}
+-
+-/*
+- * Allocate group number for each node, so that for each node:
+- *
+- * 1) the allocated number is >= 1
+- *
+- * 2) the allocated number is <= active CPU number of this node
+- *
+- * The actual allocated total groups may be less than @numgrps when
+- * active total CPU number is less than @numgrps.
+- *
+- * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
+- * for each node.
+- */
+-static void alloc_nodes_groups(unsigned int numgrps,
+- cpumask_var_t *node_to_cpumask,
+- const struct cpumask *cpu_mask,
+- const nodemask_t nodemsk,
+- struct cpumask *nmsk,
+- struct node_groups *node_groups)
+-{
+- unsigned n, remaining_ncpus = 0;
+-
+- for (n = 0; n < nr_node_ids; n++) {
+- node_groups[n].id = n;
+- node_groups[n].ncpus = UINT_MAX;
+- }
+-
+- for_each_node_mask(n, nodemsk) {
+- unsigned ncpus;
+-
+- cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+- ncpus = cpumask_weight(nmsk);
+-
+- if (!ncpus)
+- continue;
+- remaining_ncpus += ncpus;
+- node_groups[n].ncpus = ncpus;
+- }
+-
+- numgrps = min_t(unsigned, remaining_ncpus, numgrps);
+-
+- sort(node_groups, nr_node_ids, sizeof(node_groups[0]),
+- ncpus_cmp_func, NULL);
+-
+- /*
+- * Allocate groups for each node according to the ratio of this
+- * node's nr_cpus to remaining un-assigned ncpus. 'numgrps' is
+- * bigger than number of active numa nodes. Always start the
+- * allocation from the node with minimized nr_cpus.
+- *
+- * This way guarantees that each active node gets allocated at
+- * least one group, and the theory is simple: over-allocation
+- * is only done when this node is assigned by one group, so
+- * other nodes will be allocated >= 1 groups, since 'numgrps' is
+- * bigger than number of numa nodes.
+- *
+- * One perfect invariant is that number of allocated groups for
+- * each node is <= CPU count of this node:
+- *
+- * 1) suppose there are two nodes: A and B
+- * ncpu(X) is CPU count of node X
+- * grps(X) is the group count allocated to node X via this
+- * algorithm
+- *
+- * ncpu(A) <= ncpu(B)
+- * ncpu(A) + ncpu(B) = N
+- * grps(A) + grps(B) = G
+- *
+- * grps(A) = max(1, round_down(G * ncpu(A) / N))
+- * grps(B) = G - grps(A)
+- *
+- * both N and G are integer, and 2 <= G <= N, suppose
+- * G = N - delta, and 0 <= delta <= N - 2
+- *
+- * 2) obviously grps(A) <= ncpu(A) because:
+- *
+- * if grps(A) is 1, then grps(A) <= ncpu(A) given
+- * ncpu(A) >= 1
+- *
+- * otherwise,
+- * grps(A) <= G * ncpu(A) / N <= ncpu(A), given G <= N
+- *
+- * 3) prove how grps(B) <= ncpu(B):
+- *
+- * if round_down(G * ncpu(A) / N) == 0, vecs(B) won't be
+- * over-allocated, so grps(B) <= ncpu(B),
+- *
+- * otherwise:
+- *
+- * grps(A) =
+- * round_down(G * ncpu(A) / N) =
+- * round_down((N - delta) * ncpu(A) / N) =
+- * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
+- * round_down((N * ncpu(A) - delta * N) / N) =
+- * cpu(A) - delta
+- *
+- * then:
+- *
+- * grps(A) - G >= ncpu(A) - delta - G
+- * =>
+- * G - grps(A) <= G + delta - ncpu(A)
+- * =>
+- * grps(B) <= N - ncpu(A)
+- * =>
+- * grps(B) <= cpu(B)
+- *
+- * For nodes >= 3, it can be thought as one node and another big
+- * node given that is exactly what this algorithm is implemented,
+- * and we always re-calculate 'remaining_ncpus' & 'numgrps', and
+- * finally for each node X: grps(X) <= ncpu(X).
+- *
+- */
+- for (n = 0; n < nr_node_ids; n++) {
+- unsigned ngroups, ncpus;
+-
+- if (node_groups[n].ncpus == UINT_MAX)
+- continue;
+-
+- WARN_ON_ONCE(numgrps == 0);
+-
+- ncpus = node_groups[n].ncpus;
+- ngroups = max_t(unsigned, 1,
+- numgrps * ncpus / remaining_ncpus);
+- WARN_ON_ONCE(ngroups > ncpus);
+-
+- node_groups[n].ngroups = ngroups;
+-
+- remaining_ncpus -= ncpus;
+- numgrps -= ngroups;
+- }
+-}
+-
+-static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
+- cpumask_var_t *node_to_cpumask,
+- const struct cpumask *cpu_mask,
+- struct cpumask *nmsk, struct cpumask *masks)
+-{
+- unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0;
+- unsigned int last_grp = numgrps;
+- unsigned int curgrp = startgrp;
+- nodemask_t nodemsk = NODE_MASK_NONE;
+- struct node_groups *node_groups;
+-
+- if (cpumask_empty(cpu_mask))
+- return 0;
+-
+- nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);
+-
+- /*
+- * If the number of nodes in the mask is greater than or equal the
+- * number of groups we just spread the groups across the nodes.
+- */
+- if (numgrps <= nodes) {
+- for_each_node_mask(n, nodemsk) {
+- /* Ensure that only CPUs which are in both masks are set */
+- cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+- cpumask_or(&masks[curgrp], &masks[curgrp], nmsk);
+- if (++curgrp == last_grp)
+- curgrp = 0;
+- }
+- return numgrps;
+- }
+-
+- node_groups = kcalloc(nr_node_ids,
+- sizeof(struct node_groups),
+- GFP_KERNEL);
+- if (!node_groups)
+- return -ENOMEM;
+-
+- /* allocate group number for each node */
+- alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask,
+- nodemsk, nmsk, node_groups);
+- for (i = 0; i < nr_node_ids; i++) {
+- unsigned int ncpus, v;
+- struct node_groups *nv = &node_groups[i];
+-
+- if (nv->ngroups == UINT_MAX)
+- continue;
+-
+- /* Get the cpus on this node which are in the mask */
+- cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
+- ncpus = cpumask_weight(nmsk);
+- if (!ncpus)
+- continue;
+-
+- WARN_ON_ONCE(nv->ngroups > ncpus);
+-
+- /* Account for rounding errors */
+- extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups);
+-
+- /* Spread allocated groups on CPUs of the current node */
+- for (v = 0; v < nv->ngroups; v++, curgrp++) {
+- cpus_per_grp = ncpus / nv->ngroups;
+-
+- /* Account for extra groups to compensate rounding errors */
+- if (extra_grps) {
+- cpus_per_grp++;
+- --extra_grps;
+- }
+-
+- /*
+- * wrapping has to be considered given 'startgrp'
+- * may start anywhere
+- */
+- if (curgrp >= last_grp)
+- curgrp = 0;
+- grp_spread_init_one(&masks[curgrp], nmsk,
+- cpus_per_grp);
+- }
+- done += nv->ngroups;
+- }
+- kfree(node_groups);
+- return done;
+-}
+-
+-/*
+- * build affinity in two stages for each group, and try to put close CPUs
+- * in viewpoint of CPU and NUMA locality into same group, and we run
+- * two-stage grouping:
+- *
+- * 1) allocate present CPUs on these groups evenly first
+- * 2) allocate other possible CPUs on these groups evenly
+- */
+-static struct cpumask *group_cpus_evenly(unsigned int numgrps)
+-{
+- unsigned int curgrp = 0, nr_present = 0, nr_others = 0;
+- cpumask_var_t *node_to_cpumask;
+- cpumask_var_t nmsk, npresmsk;
+- int ret = -ENOMEM;
+- struct cpumask *masks = NULL;
+-
+- if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
+- return NULL;
+-
+- if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
+- goto fail_nmsk;
+-
+- node_to_cpumask = alloc_node_to_cpumask();
+- if (!node_to_cpumask)
+- goto fail_npresmsk;
+-
+- masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
+- if (!masks)
+- goto fail_node_to_cpumask;
+-
+- /* Stabilize the cpumasks */
+- cpus_read_lock();
+- build_node_to_cpumask(node_to_cpumask);
+-
+- /* grouping present CPUs first */
+- ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
+- cpu_present_mask, nmsk, masks);
+- if (ret < 0)
+- goto fail_build_affinity;
+- nr_present = ret;
+-
+- /*
+- * Allocate non present CPUs starting from the next group to be
+- * handled. If the grouping of present CPUs already exhausted the
+- * group space, assign the non present CPUs to the already
+- * allocated out groups.
+- */
+- if (nr_present >= numgrps)
+- curgrp = 0;
+- else
+- curgrp = nr_present;
+- cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
+- ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
+- npresmsk, nmsk, masks);
+- if (ret >= 0)
+- nr_others = ret;
+-
+- fail_build_affinity:
+- cpus_read_unlock();
+-
+- if (ret >= 0)
+- WARN_ON(nr_present + nr_others < numgrps);
+-
+- fail_node_to_cpumask:
+- free_node_to_cpumask(node_to_cpumask);
+-
+- fail_npresmsk:
+- free_cpumask_var(npresmsk);
+-
+- fail_nmsk:
+- free_cpumask_var(nmsk);
+- if (ret < 0) {
+- kfree(masks);
+- return NULL;
+- }
+- return masks;
+-}
++#include <linux/group_cpus.h>
+
+ static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
+ {
+diff --git a/lib/Makefile b/lib/Makefile
+index 5ffe72ec99797..6f1611d053e6a 100644
+--- a/lib/Makefile
++++ b/lib/Makefile
+@@ -361,6 +361,8 @@ obj-$(CONFIG_SBITMAP) += sbitmap.o
+
+ obj-$(CONFIG_PARMAN) += parman.o
+
++obj-y += group_cpus.o
++
+ # GCC library routines
+ obj-$(CONFIG_GENERIC_LIB_ASHLDI3) += ashldi3.o
+ obj-$(CONFIG_GENERIC_LIB_ASHRDI3) += ashrdi3.o
+diff --git a/lib/group_cpus.c b/lib/group_cpus.c
+new file mode 100644
+index 0000000000000..99f08c6cb9d97
+--- /dev/null
++++ b/lib/group_cpus.c
+@@ -0,0 +1,427 @@
++// SPDX-License-Identifier: GPL-2.0
++/*
++ * Copyright (C) 2016 Thomas Gleixner.
++ * Copyright (C) 2016-2017 Christoph Hellwig.
++ */
++#include <linux/kernel.h>
++#include <linux/slab.h>
++#include <linux/cpu.h>
++#include <linux/sort.h>
++#include <linux/group_cpus.h>
++
++static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
++ unsigned int cpus_per_grp)
++{
++ const struct cpumask *siblmsk;
++ int cpu, sibl;
++
++ for ( ; cpus_per_grp > 0; ) {
++ cpu = cpumask_first(nmsk);
++
++ /* Should not happen, but I'm too lazy to think about it */
++ if (cpu >= nr_cpu_ids)
++ return;
++
++ cpumask_clear_cpu(cpu, nmsk);
++ cpumask_set_cpu(cpu, irqmsk);
++ cpus_per_grp--;
++
++ /* If the cpu has siblings, use them first */
++ siblmsk = topology_sibling_cpumask(cpu);
++ for (sibl = -1; cpus_per_grp > 0; ) {
++ sibl = cpumask_next(sibl, siblmsk);
++ if (sibl >= nr_cpu_ids)
++ break;
++ if (!cpumask_test_and_clear_cpu(sibl, nmsk))
++ continue;
++ cpumask_set_cpu(sibl, irqmsk);
++ cpus_per_grp--;
++ }
++ }
++}
++
++static cpumask_var_t *alloc_node_to_cpumask(void)
++{
++ cpumask_var_t *masks;
++ int node;
++
++ masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL);
++ if (!masks)
++ return NULL;
++
++ for (node = 0; node < nr_node_ids; node++) {
++ if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL))
++ goto out_unwind;
++ }
++
++ return masks;
++
++out_unwind:
++ while (--node >= 0)
++ free_cpumask_var(masks[node]);
++ kfree(masks);
++ return NULL;
++}
++
++static void free_node_to_cpumask(cpumask_var_t *masks)
++{
++ int node;
++
++ for (node = 0; node < nr_node_ids; node++)
++ free_cpumask_var(masks[node]);
++ kfree(masks);
++}
++
++static void build_node_to_cpumask(cpumask_var_t *masks)
++{
++ int cpu;
++
++ for_each_possible_cpu(cpu)
++ cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
++}
++
++static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
++ const struct cpumask *mask, nodemask_t *nodemsk)
++{
++ int n, nodes = 0;
++
++ /* Calculate the number of nodes in the supplied affinity mask */
++ for_each_node(n) {
++ if (cpumask_intersects(mask, node_to_cpumask[n])) {
++ node_set(n, *nodemsk);
++ nodes++;
++ }
++ }
++ return nodes;
++}
++
++struct node_groups {
++ unsigned id;
++
++ union {
++ unsigned ngroups;
++ unsigned ncpus;
++ };
++};
++
++static int ncpus_cmp_func(const void *l, const void *r)
++{
++ const struct node_groups *ln = l;
++ const struct node_groups *rn = r;
++
++ return ln->ncpus - rn->ncpus;
++}
++
++/*
++ * Allocate group number for each node, so that for each node:
++ *
++ * 1) the allocated number is >= 1
++ *
++ * 2) the allocated number is <= active CPU number of this node
++ *
++ * The actual allocated total groups may be less than @numgrps when
++ * active total CPU number is less than @numgrps.
++ *
++ * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
++ * for each node.
++ */
++static void alloc_nodes_groups(unsigned int numgrps,
++ cpumask_var_t *node_to_cpumask,
++ const struct cpumask *cpu_mask,
++ const nodemask_t nodemsk,
++ struct cpumask *nmsk,
++ struct node_groups *node_groups)
++{
++ unsigned n, remaining_ncpus = 0;
++
++ for (n = 0; n < nr_node_ids; n++) {
++ node_groups[n].id = n;
++ node_groups[n].ncpus = UINT_MAX;
++ }
++
++ for_each_node_mask(n, nodemsk) {
++ unsigned ncpus;
++
++ cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
++ ncpus = cpumask_weight(nmsk);
++
++ if (!ncpus)
++ continue;
++ remaining_ncpus += ncpus;
++ node_groups[n].ncpus = ncpus;
++ }
++
++ numgrps = min_t(unsigned, remaining_ncpus, numgrps);
++
++ sort(node_groups, nr_node_ids, sizeof(node_groups[0]),
++ ncpus_cmp_func, NULL);
++
++ /*
++ * Allocate groups for each node according to the ratio of this
++ * node's nr_cpus to remaining un-assigned ncpus. 'numgrps' is
++ * bigger than number of active numa nodes. Always start the
++ * allocation from the node with minimized nr_cpus.
++ *
++ * This way guarantees that each active node gets allocated at
++ * least one group, and the theory is simple: over-allocation
++ * is only done when this node is assigned by one group, so
++ * other nodes will be allocated >= 1 groups, since 'numgrps' is
++ * bigger than number of numa nodes.
++ *
++ * One perfect invariant is that number of allocated groups for
++ * each node is <= CPU count of this node:
++ *
++ * 1) suppose there are two nodes: A and B
++ * ncpu(X) is CPU count of node X
++ * grps(X) is the group count allocated to node X via this
++ * algorithm
++ *
++ * ncpu(A) <= ncpu(B)
++ * ncpu(A) + ncpu(B) = N
++ * grps(A) + grps(B) = G
++ *
++ * grps(A) = max(1, round_down(G * ncpu(A) / N))
++ * grps(B) = G - grps(A)
++ *
++ * both N and G are integer, and 2 <= G <= N, suppose
++ * G = N - delta, and 0 <= delta <= N - 2
++ *
++ * 2) obviously grps(A) <= ncpu(A) because:
++ *
++ * if grps(A) is 1, then grps(A) <= ncpu(A) given
++ * ncpu(A) >= 1
++ *
++ * otherwise,
++ * grps(A) <= G * ncpu(A) / N <= ncpu(A), given G <= N
++ *
++ * 3) prove how grps(B) <= ncpu(B):
++ *
++ * if round_down(G * ncpu(A) / N) == 0, vecs(B) won't be
++ * over-allocated, so grps(B) <= ncpu(B),
++ *
++ * otherwise:
++ *
++ * grps(A) =
++ * round_down(G * ncpu(A) / N) =
++ * round_down((N - delta) * ncpu(A) / N) =
++ * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
++ * round_down((N * ncpu(A) - delta * N) / N) =
++ * cpu(A) - delta
++ *
++ * then:
++ *
++ * grps(A) - G >= ncpu(A) - delta - G
++ * =>
++ * G - grps(A) <= G + delta - ncpu(A)
++ * =>
++ * grps(B) <= N - ncpu(A)
++ * =>
++ * grps(B) <= cpu(B)
++ *
++ * For nodes >= 3, it can be thought as one node and another big
++ * node given that is exactly what this algorithm is implemented,
++ * and we always re-calculate 'remaining_ncpus' & 'numgrps', and
++ * finally for each node X: grps(X) <= ncpu(X).
++ *
++ */
++ for (n = 0; n < nr_node_ids; n++) {
++ unsigned ngroups, ncpus;
++
++ if (node_groups[n].ncpus == UINT_MAX)
++ continue;
++
++ WARN_ON_ONCE(numgrps == 0);
++
++ ncpus = node_groups[n].ncpus;
++ ngroups = max_t(unsigned, 1,
++ numgrps * ncpus / remaining_ncpus);
++ WARN_ON_ONCE(ngroups > ncpus);
++
++ node_groups[n].ngroups = ngroups;
++
++ remaining_ncpus -= ncpus;
++ numgrps -= ngroups;
++ }
++}
++
++static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
++ cpumask_var_t *node_to_cpumask,
++ const struct cpumask *cpu_mask,
++ struct cpumask *nmsk, struct cpumask *masks)
++{
++ unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0;
++ unsigned int last_grp = numgrps;
++ unsigned int curgrp = startgrp;
++ nodemask_t nodemsk = NODE_MASK_NONE;
++ struct node_groups *node_groups;
++
++ if (cpumask_empty(cpu_mask))
++ return 0;
++
++ nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);
++
++ /*
++ * If the number of nodes in the mask is greater than or equal the
++ * number of groups we just spread the groups across the nodes.
++ */
++ if (numgrps <= nodes) {
++ for_each_node_mask(n, nodemsk) {
++ /* Ensure that only CPUs which are in both masks are set */
++ cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
++ cpumask_or(&masks[curgrp], &masks[curgrp], nmsk);
++ if (++curgrp == last_grp)
++ curgrp = 0;
++ }
++ return numgrps;
++ }
++
++ node_groups = kcalloc(nr_node_ids,
++ sizeof(struct node_groups),
++ GFP_KERNEL);
++ if (!node_groups)
++ return -ENOMEM;
++
++ /* allocate group number for each node */
++ alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask,
++ nodemsk, nmsk, node_groups);
++ for (i = 0; i < nr_node_ids; i++) {
++ unsigned int ncpus, v;
++ struct node_groups *nv = &node_groups[i];
++
++ if (nv->ngroups == UINT_MAX)
++ continue;
++
++ /* Get the cpus on this node which are in the mask */
++ cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
++ ncpus = cpumask_weight(nmsk);
++ if (!ncpus)
++ continue;
++
++ WARN_ON_ONCE(nv->ngroups > ncpus);
++
++ /* Account for rounding errors */
++ extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups);
++
++ /* Spread allocated groups on CPUs of the current node */
++ for (v = 0; v < nv->ngroups; v++, curgrp++) {
++ cpus_per_grp = ncpus / nv->ngroups;
++
++ /* Account for extra groups to compensate rounding errors */
++ if (extra_grps) {
++ cpus_per_grp++;
++ --extra_grps;
++ }
++
++ /*
++ * wrapping has to be considered given 'startgrp'
++ * may start anywhere
++ */
++ if (curgrp >= last_grp)
++ curgrp = 0;
++ grp_spread_init_one(&masks[curgrp], nmsk,
++ cpus_per_grp);
++ }
++ done += nv->ngroups;
++ }
++ kfree(node_groups);
++ return done;
++}
++
++#ifdef CONFIG_SMP
++/**
++ * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
++ * @numgrps: number of groups
++ *
++ * Return: cpumask array if successful, NULL otherwise. And each element
++ * includes CPUs assigned to this group
++ *
++ * Try to put close CPUs from viewpoint of CPU and NUMA locality into
++ * same group, and run two-stage grouping:
++ * 1) allocate present CPUs on these groups evenly first
++ * 2) allocate other possible CPUs on these groups evenly
++ *
++ * We guarantee in the resulted grouping that all CPUs are covered, and
++ * no same CPU is assigned to multiple groups
++ */
++struct cpumask *group_cpus_evenly(unsigned int numgrps)
++{
++ unsigned int curgrp = 0, nr_present = 0, nr_others = 0;
++ cpumask_var_t *node_to_cpumask;
++ cpumask_var_t nmsk, npresmsk;
++ int ret = -ENOMEM;
++ struct cpumask *masks = NULL;
++
++ if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
++ return NULL;
++
++ if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
++ goto fail_nmsk;
++
++ node_to_cpumask = alloc_node_to_cpumask();
++ if (!node_to_cpumask)
++ goto fail_npresmsk;
++
++ masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
++ if (!masks)
++ goto fail_node_to_cpumask;
++
++ /* Stabilize the cpumasks */
++ cpus_read_lock();
++ build_node_to_cpumask(node_to_cpumask);
++
++ /* grouping present CPUs first */
++ ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
++ cpu_present_mask, nmsk, masks);
++ if (ret < 0)
++ goto fail_build_affinity;
++ nr_present = ret;
++
++ /*
++ * Allocate non present CPUs starting from the next group to be
++ * handled. If the grouping of present CPUs already exhausted the
++ * group space, assign the non present CPUs to the already
++ * allocated out groups.
++ */
++ if (nr_present >= numgrps)
++ curgrp = 0;
++ else
++ curgrp = nr_present;
++ cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
++ ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
++ npresmsk, nmsk, masks);
++ if (ret >= 0)
++ nr_others = ret;
++
++ fail_build_affinity:
++ cpus_read_unlock();
++
++ if (ret >= 0)
++ WARN_ON(nr_present + nr_others < numgrps);
++
++ fail_node_to_cpumask:
++ free_node_to_cpumask(node_to_cpumask);
++
++ fail_npresmsk:
++ free_cpumask_var(npresmsk);
++
++ fail_nmsk:
++ free_cpumask_var(nmsk);
++ if (ret < 0) {
++ kfree(masks);
++ return NULL;
++ }
++ return masks;
++}
++#else
++struct cpumask *group_cpus_evenly(unsigned int numgrps)
++{
++ struct cpumask *masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
++
++ if (!masks)
++ return NULL;
++
++ /* assign all CPUs(cpu 0) to the 1st group only */
++ cpumask_copy(&masks[0], cpu_possible_mask);
++ return masks;
++}
++#endif
+--
+2.43.0
+
--- /dev/null
+From 8dadc19b3f0f31cb7d083c07257a1a72dc988e35 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 27 Dec 2022 10:29:01 +0800
+Subject: genirq/affinity: Pass affinity managed mask array to
+ irq_build_affinity_masks
+
+From: Ming Lei <ming.lei@redhat.com>
+
+[ Upstream commit 1f962d91a15af54301c63febb8ac2ba07aa3654f ]
+
+Pass affinity managed mask array to irq_build_affinity_masks() so that the
+index of the first affinity managed vector is always zero.
+
+This allows simplifying the implementation a bit.
+
+Signed-off-by: Ming Lei <ming.lei@redhat.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: John Garry <john.g.garry@oracle.com>
+Reviewed-by: Jens Axboe <axboe@kernel.dk>
+Link: https://lore.kernel.org/r/20221227022905.352674-3-ming.lei@redhat.com
+Stable-dep-of: 0263f92fadbb ("lib/group_cpus.c: avoid acquiring cpu hotplug lock in group_cpus_evenly")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/irq/affinity.c | 28 ++++++++++++----------------
+ 1 file changed, 12 insertions(+), 16 deletions(-)
+
+diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
+index 3361e36ebaa1e..da6379cd27fd4 100644
+--- a/kernel/irq/affinity.c
++++ b/kernel/irq/affinity.c
+@@ -246,14 +246,13 @@ static void alloc_nodes_vectors(unsigned int numvecs,
+
+ static int __irq_build_affinity_masks(unsigned int startvec,
+ unsigned int numvecs,
+- unsigned int firstvec,
+ cpumask_var_t *node_to_cpumask,
+ const struct cpumask *cpu_mask,
+ struct cpumask *nmsk,
+ struct irq_affinity_desc *masks)
+ {
+ unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0;
+- unsigned int last_affv = firstvec + numvecs;
++ unsigned int last_affv = numvecs;
+ unsigned int curvec = startvec;
+ nodemask_t nodemsk = NODE_MASK_NONE;
+ struct node_vectors *node_vectors;
+@@ -273,7 +272,7 @@ static int __irq_build_affinity_masks(unsigned int startvec,
+ cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+ cpumask_or(&masks[curvec].mask, &masks[curvec].mask, nmsk);
+ if (++curvec == last_affv)
+- curvec = firstvec;
++ curvec = 0;
+ }
+ return numvecs;
+ }
+@@ -321,7 +320,7 @@ static int __irq_build_affinity_masks(unsigned int startvec,
+ * may start anywhere
+ */
+ if (curvec >= last_affv)
+- curvec = firstvec;
++ curvec = 0;
+ irq_spread_init_one(&masks[curvec].mask, nmsk,
+ cpus_per_vec);
+ }
+@@ -336,11 +335,10 @@ static int __irq_build_affinity_masks(unsigned int startvec,
+ * 1) spread present CPU on these vectors
+ * 2) spread other possible CPUs on these vectors
+ */
+-static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
++static int irq_build_affinity_masks(unsigned int numvecs,
+ struct irq_affinity_desc *masks)
+ {
+- unsigned int curvec = startvec, nr_present = 0, nr_others = 0;
+- unsigned int firstvec = startvec;
++ unsigned int curvec = 0, nr_present = 0, nr_others = 0;
+ cpumask_var_t *node_to_cpumask;
+ cpumask_var_t nmsk, npresmsk;
+ int ret = -ENOMEM;
+@@ -360,9 +358,8 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
+ build_node_to_cpumask(node_to_cpumask);
+
+ /* Spread on present CPUs starting from affd->pre_vectors */
+- ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
+- node_to_cpumask, cpu_present_mask,
+- nmsk, masks);
++ ret = __irq_build_affinity_masks(curvec, numvecs, node_to_cpumask,
++ cpu_present_mask, nmsk, masks);
+ if (ret < 0)
+ goto fail_build_affinity;
+ nr_present = ret;
+@@ -374,13 +371,12 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
+ * out vectors.
+ */
+ if (nr_present >= numvecs)
+- curvec = firstvec;
++ curvec = 0;
+ else
+- curvec = firstvec + nr_present;
++ curvec = nr_present;
+ cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
+- ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
+- node_to_cpumask, npresmsk, nmsk,
+- masks);
++ ret = __irq_build_affinity_masks(curvec, numvecs, node_to_cpumask,
++ npresmsk, nmsk, masks);
+ if (ret >= 0)
+ nr_others = ret;
+
+@@ -463,7 +459,7 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
+ unsigned int this_vecs = affd->set_size[i];
+ int ret;
+
+- ret = irq_build_affinity_masks(curvec, this_vecs, masks);
++ ret = irq_build_affinity_masks(this_vecs, &masks[curvec]);
+ if (ret) {
+ kfree(masks);
+ return NULL;
+--
+2.43.0
+
--- /dev/null
+From 3f9eac627a4179298074566b0149198d817ff10c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 27 Dec 2022 10:29:00 +0800
+Subject: genirq/affinity: Remove the 'firstvec' parameter from
+ irq_build_affinity_masks
+
+From: Ming Lei <ming.lei@redhat.com>
+
+[ Upstream commit cdf07f0ea48a3b52f924714d477366ac510ee870 ]
+
+The 'firstvec' parameter is always the same as the 'startvec'
+parameter, so use 'startvec' directly inside irq_build_affinity_masks().
+
+Signed-off-by: Ming Lei <ming.lei@redhat.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: John Garry <john.g.garry@oracle.com>
+Reviewed-by: Jens Axboe <axboe@kernel.dk>
+Link: https://lore.kernel.org/r/20221227022905.352674-2-ming.lei@redhat.com
+Stable-dep-of: 0263f92fadbb ("lib/group_cpus.c: avoid acquiring cpu hotplug lock in group_cpus_evenly")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/irq/affinity.c | 5 ++---
+ 1 file changed, 2 insertions(+), 3 deletions(-)
+
+diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
+index d9a5c1d65a79d..3361e36ebaa1e 100644
+--- a/kernel/irq/affinity.c
++++ b/kernel/irq/affinity.c
+@@ -337,10 +337,10 @@ static int __irq_build_affinity_masks(unsigned int startvec,
+ * 2) spread other possible CPUs on these vectors
+ */
+ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
+- unsigned int firstvec,
+ struct irq_affinity_desc *masks)
+ {
+ unsigned int curvec = startvec, nr_present = 0, nr_others = 0;
++ unsigned int firstvec = startvec;
+ cpumask_var_t *node_to_cpumask;
+ cpumask_var_t nmsk, npresmsk;
+ int ret = -ENOMEM;
+@@ -463,8 +463,7 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
+ unsigned int this_vecs = affd->set_size[i];
+ int ret;
+
+- ret = irq_build_affinity_masks(curvec, this_vecs,
+- curvec, masks);
++ ret = irq_build_affinity_masks(curvec, this_vecs, masks);
+ if (ret) {
+ kfree(masks);
+ return NULL;
+--
+2.43.0
+
--- /dev/null
+From 2b38a67a94c19fcf3c655f12980a7a16eee4e44e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 27 Dec 2022 10:29:03 +0800
+Subject: genirq/affinity: Rename irq_build_affinity_masks as group_cpus_evenly
+
+From: Ming Lei <ming.lei@redhat.com>
+
+[ Upstream commit 523f1ea76aad9025f9bd5258d77f4406fa9dbe5d ]
+
+Map irq vectors into groups, which allows the algorithm to be abstracted
+for a generic use case outside of the interrupt core.
+
+Rename irq_build_affinity_masks as group_cpus_evenly, so the API can be
+reused for blk-mq to make default queue mapping even though irq vectors
+aren't involved.
+
+No functional change, just rename vector as group.
+
+Signed-off-by: Ming Lei <ming.lei@redhat.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Jens Axboe <axboe@kernel.dk>
+Link: https://lore.kernel.org/r/20221227022905.352674-5-ming.lei@redhat.com
+Stable-dep-of: 0263f92fadbb ("lib/group_cpus.c: avoid acquiring cpu hotplug lock in group_cpus_evenly")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/irq/affinity.c | 242 +++++++++++++++++++++---------------------
+ 1 file changed, 121 insertions(+), 121 deletions(-)
+
+diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
+index 00bba1020ecb2..54083331f1bcb 100644
+--- a/kernel/irq/affinity.c
++++ b/kernel/irq/affinity.c
+@@ -9,13 +9,13 @@
+ #include <linux/cpu.h>
+ #include <linux/sort.h>
+
+-static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
+- unsigned int cpus_per_vec)
++static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
++ unsigned int cpus_per_grp)
+ {
+ const struct cpumask *siblmsk;
+ int cpu, sibl;
+
+- for ( ; cpus_per_vec > 0; ) {
++ for ( ; cpus_per_grp > 0; ) {
+ cpu = cpumask_first(nmsk);
+
+ /* Should not happen, but I'm too lazy to think about it */
+@@ -24,18 +24,18 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
+
+ cpumask_clear_cpu(cpu, nmsk);
+ cpumask_set_cpu(cpu, irqmsk);
+- cpus_per_vec--;
++ cpus_per_grp--;
+
+ /* If the cpu has siblings, use them first */
+ siblmsk = topology_sibling_cpumask(cpu);
+- for (sibl = -1; cpus_per_vec > 0; ) {
++ for (sibl = -1; cpus_per_grp > 0; ) {
+ sibl = cpumask_next(sibl, siblmsk);
+ if (sibl >= nr_cpu_ids)
+ break;
+ if (!cpumask_test_and_clear_cpu(sibl, nmsk))
+ continue;
+ cpumask_set_cpu(sibl, irqmsk);
+- cpus_per_vec--;
++ cpus_per_grp--;
+ }
+ }
+ }
+@@ -95,48 +95,48 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
+ return nodes;
+ }
+
+-struct node_vectors {
++struct node_groups {
+ unsigned id;
+
+ union {
+- unsigned nvectors;
++ unsigned ngroups;
+ unsigned ncpus;
+ };
+ };
+
+ static int ncpus_cmp_func(const void *l, const void *r)
+ {
+- const struct node_vectors *ln = l;
+- const struct node_vectors *rn = r;
++ const struct node_groups *ln = l;
++ const struct node_groups *rn = r;
+
+ return ln->ncpus - rn->ncpus;
+ }
+
+ /*
+- * Allocate vector number for each node, so that for each node:
++ * Allocate group number for each node, so that for each node:
+ *
+ * 1) the allocated number is >= 1
+ *
+- * 2) the allocated numbver is <= active CPU number of this node
++ * 2) the allocated number is <= active CPU number of this node
+ *
+- * The actual allocated total vectors may be less than @numvecs when
+- * active total CPU number is less than @numvecs.
++ * The actual allocated total groups may be less than @numgrps when
++ * active total CPU number is less than @numgrps.
+ *
+ * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
+ * for each node.
+ */
+-static void alloc_nodes_vectors(unsigned int numvecs,
+- cpumask_var_t *node_to_cpumask,
+- const struct cpumask *cpu_mask,
+- const nodemask_t nodemsk,
+- struct cpumask *nmsk,
+- struct node_vectors *node_vectors)
++static void alloc_nodes_groups(unsigned int numgrps,
++ cpumask_var_t *node_to_cpumask,
++ const struct cpumask *cpu_mask,
++ const nodemask_t nodemsk,
++ struct cpumask *nmsk,
++ struct node_groups *node_groups)
+ {
+ unsigned n, remaining_ncpus = 0;
+
+ for (n = 0; n < nr_node_ids; n++) {
+- node_vectors[n].id = n;
+- node_vectors[n].ncpus = UINT_MAX;
++ node_groups[n].id = n;
++ node_groups[n].ncpus = UINT_MAX;
+ }
+
+ for_each_node_mask(n, nodemsk) {
+@@ -148,61 +148,61 @@ static void alloc_nodes_vectors(unsigned int numvecs,
+ if (!ncpus)
+ continue;
+ remaining_ncpus += ncpus;
+- node_vectors[n].ncpus = ncpus;
++ node_groups[n].ncpus = ncpus;
+ }
+
+- numvecs = min_t(unsigned, remaining_ncpus, numvecs);
++ numgrps = min_t(unsigned, remaining_ncpus, numgrps);
+
+- sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]),
++ sort(node_groups, nr_node_ids, sizeof(node_groups[0]),
+ ncpus_cmp_func, NULL);
+
+ /*
+- * Allocate vectors for each node according to the ratio of this
+- * node's nr_cpus to remaining un-assigned ncpus. 'numvecs' is
++ * Allocate groups for each node according to the ratio of this
++ * node's nr_cpus to remaining un-assigned ncpus. 'numgrps' is
+ * bigger than number of active numa nodes. Always start the
+ * allocation from the node with minimized nr_cpus.
+ *
+ * This way guarantees that each active node gets allocated at
+- * least one vector, and the theory is simple: over-allocation
+- * is only done when this node is assigned by one vector, so
+- * other nodes will be allocated >= 1 vector, since 'numvecs' is
++ * least one group, and the theory is simple: over-allocation
++ * is only done when this node is assigned by one group, so
++ * other nodes will be allocated >= 1 groups, since 'numgrps' is
+ * bigger than number of numa nodes.
+ *
+- * One perfect invariant is that number of allocated vectors for
++ * One perfect invariant is that number of allocated groups for
+ * each node is <= CPU count of this node:
+ *
+ * 1) suppose there are two nodes: A and B
+ * ncpu(X) is CPU count of node X
+- * vecs(X) is the vector count allocated to node X via this
++ * grps(X) is the group count allocated to node X via this
+ * algorithm
+ *
+ * ncpu(A) <= ncpu(B)
+ * ncpu(A) + ncpu(B) = N
+- * vecs(A) + vecs(B) = V
++ * grps(A) + grps(B) = G
+ *
+- * vecs(A) = max(1, round_down(V * ncpu(A) / N))
+- * vecs(B) = V - vecs(A)
++ * grps(A) = max(1, round_down(G * ncpu(A) / N))
++ * grps(B) = G - grps(A)
+ *
+- * both N and V are integer, and 2 <= V <= N, suppose
+- * V = N - delta, and 0 <= delta <= N - 2
++ * both N and G are integer, and 2 <= G <= N, suppose
++ * G = N - delta, and 0 <= delta <= N - 2
+ *
+- * 2) obviously vecs(A) <= ncpu(A) because:
++ * 2) obviously grps(A) <= ncpu(A) because:
+ *
+- * if vecs(A) is 1, then vecs(A) <= ncpu(A) given
++ * if grps(A) is 1, then grps(A) <= ncpu(A) given
+ * ncpu(A) >= 1
+ *
+ * otherwise,
+- * vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N
++ * grps(A) <= G * ncpu(A) / N <= ncpu(A), given G <= N
+ *
+- * 3) prove how vecs(B) <= ncpu(B):
++ * 3) prove how grps(B) <= ncpu(B):
+ *
+- * if round_down(V * ncpu(A) / N) == 0, vecs(B) won't be
+- * over-allocated, so vecs(B) <= ncpu(B),
++ * if round_down(G * ncpu(A) / N) == 0, vecs(B) won't be
++ * over-allocated, so grps(B) <= ncpu(B),
+ *
+ * otherwise:
+ *
+- * vecs(A) =
+- * round_down(V * ncpu(A) / N) =
++ * grps(A) =
++ * round_down(G * ncpu(A) / N) =
+ * round_down((N - delta) * ncpu(A) / N) =
+ * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
+ * round_down((N * ncpu(A) - delta * N) / N) =
+@@ -210,52 +210,50 @@ static void alloc_nodes_vectors(unsigned int numvecs,
+ *
+ * then:
+ *
+- * vecs(A) - V >= ncpu(A) - delta - V
++ * grps(A) - G >= ncpu(A) - delta - G
+ * =>
+- * V - vecs(A) <= V + delta - ncpu(A)
++ * G - grps(A) <= G + delta - ncpu(A)
+ * =>
+- * vecs(B) <= N - ncpu(A)
++ * grps(B) <= N - ncpu(A)
+ * =>
+- * vecs(B) <= cpu(B)
++ * grps(B) <= cpu(B)
+ *
+ * For nodes >= 3, it can be thought as one node and another big
+ * node given that is exactly what this algorithm is implemented,
+- * and we always re-calculate 'remaining_ncpus' & 'numvecs', and
+- * finally for each node X: vecs(X) <= ncpu(X).
++ * and we always re-calculate 'remaining_ncpus' & 'numgrps', and
++ * finally for each node X: grps(X) <= ncpu(X).
+ *
+ */
+ for (n = 0; n < nr_node_ids; n++) {
+- unsigned nvectors, ncpus;
++ unsigned ngroups, ncpus;
+
+- if (node_vectors[n].ncpus == UINT_MAX)
++ if (node_groups[n].ncpus == UINT_MAX)
+ continue;
+
+- WARN_ON_ONCE(numvecs == 0);
++ WARN_ON_ONCE(numgrps == 0);
+
+- ncpus = node_vectors[n].ncpus;
+- nvectors = max_t(unsigned, 1,
+- numvecs * ncpus / remaining_ncpus);
+- WARN_ON_ONCE(nvectors > ncpus);
++ ncpus = node_groups[n].ncpus;
++ ngroups = max_t(unsigned, 1,
++ numgrps * ncpus / remaining_ncpus);
++ WARN_ON_ONCE(ngroups > ncpus);
+
+- node_vectors[n].nvectors = nvectors;
++ node_groups[n].ngroups = ngroups;
+
+ remaining_ncpus -= ncpus;
+- numvecs -= nvectors;
++ numgrps -= ngroups;
+ }
+ }
+
+-static int __irq_build_affinity_masks(unsigned int startvec,
+- unsigned int numvecs,
+- cpumask_var_t *node_to_cpumask,
+- const struct cpumask *cpu_mask,
+- struct cpumask *nmsk,
+- struct cpumask *masks)
++static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
++ cpumask_var_t *node_to_cpumask,
++ const struct cpumask *cpu_mask,
++ struct cpumask *nmsk, struct cpumask *masks)
+ {
+- unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0;
+- unsigned int last_affv = numvecs;
+- unsigned int curvec = startvec;
++ unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0;
++ unsigned int last_grp = numgrps;
++ unsigned int curgrp = startgrp;
+ nodemask_t nodemsk = NODE_MASK_NONE;
+- struct node_vectors *node_vectors;
++ struct node_groups *node_groups;
+
+ if (cpumask_empty(cpu_mask))
+ return 0;
+@@ -264,34 +262,33 @@ static int __irq_build_affinity_masks(unsigned int startvec,
+
+ /*
+ * If the number of nodes in the mask is greater than or equal the
+- * number of vectors we just spread the vectors across the nodes.
++ * number of groups we just spread the groups across the nodes.
+ */
+- if (numvecs <= nodes) {
++ if (numgrps <= nodes) {
+ for_each_node_mask(n, nodemsk) {
+ /* Ensure that only CPUs which are in both masks are set */
+ cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+- cpumask_or(&masks[curvec], &masks[curvec], nmsk);
+- if (++curvec == last_affv)
+- curvec = 0;
++ cpumask_or(&masks[curgrp], &masks[curgrp], nmsk);
++ if (++curgrp == last_grp)
++ curgrp = 0;
+ }
+- return numvecs;
++ return numgrps;
+ }
+
+- node_vectors = kcalloc(nr_node_ids,
+- sizeof(struct node_vectors),
++ node_groups = kcalloc(nr_node_ids,
++ sizeof(struct node_groups),
+ GFP_KERNEL);
+- if (!node_vectors)
++ if (!node_groups)
+ return -ENOMEM;
+
+- /* allocate vector number for each node */
+- alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask,
+- nodemsk, nmsk, node_vectors);
+-
++ /* allocate group number for each node */
++ alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask,
++ nodemsk, nmsk, node_groups);
+ for (i = 0; i < nr_node_ids; i++) {
+ unsigned int ncpus, v;
+- struct node_vectors *nv = &node_vectors[i];
++ struct node_groups *nv = &node_groups[i];
+
+- if (nv->nvectors == UINT_MAX)
++ if (nv->ngroups == UINT_MAX)
+ continue;
+
+ /* Get the cpus on this node which are in the mask */
+@@ -300,44 +297,47 @@ static int __irq_build_affinity_masks(unsigned int startvec,
+ if (!ncpus)
+ continue;
+
+- WARN_ON_ONCE(nv->nvectors > ncpus);
++ WARN_ON_ONCE(nv->ngroups > ncpus);
+
+ /* Account for rounding errors */
+- extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors);
++ extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups);
+
+- /* Spread allocated vectors on CPUs of the current node */
+- for (v = 0; v < nv->nvectors; v++, curvec++) {
+- cpus_per_vec = ncpus / nv->nvectors;
++ /* Spread allocated groups on CPUs of the current node */
++ for (v = 0; v < nv->ngroups; v++, curgrp++) {
++ cpus_per_grp = ncpus / nv->ngroups;
+
+- /* Account for extra vectors to compensate rounding errors */
+- if (extra_vecs) {
+- cpus_per_vec++;
+- --extra_vecs;
++ /* Account for extra groups to compensate rounding errors */
++ if (extra_grps) {
++ cpus_per_grp++;
++ --extra_grps;
+ }
+
+ /*
+- * wrapping has to be considered given 'startvec'
++ * wrapping has to be considered given 'startgrp'
+ * may start anywhere
+ */
+- if (curvec >= last_affv)
+- curvec = 0;
+- irq_spread_init_one(&masks[curvec], nmsk,
+- cpus_per_vec);
++ if (curgrp >= last_grp)
++ curgrp = 0;
++ grp_spread_init_one(&masks[curgrp], nmsk,
++ cpus_per_grp);
+ }
+- done += nv->nvectors;
++ done += nv->ngroups;
+ }
+- kfree(node_vectors);
++ kfree(node_groups);
+ return done;
+ }
+
+ /*
+- * build affinity in two stages:
+- * 1) spread present CPU on these vectors
+- * 2) spread other possible CPUs on these vectors
++ * build affinity in two stages for each group, and try to put close CPUs
++ * in viewpoint of CPU and NUMA locality into same group, and we run
++ * two-stage grouping:
++ *
++ * 1) allocate present CPUs on these groups evenly first
++ * 2) allocate other possible CPUs on these groups evenly
+ */
+-static struct cpumask *irq_build_affinity_masks(unsigned int numvecs)
++static struct cpumask *group_cpus_evenly(unsigned int numgrps)
+ {
+- unsigned int curvec = 0, nr_present = 0, nr_others = 0;
++ unsigned int curgrp = 0, nr_present = 0, nr_others = 0;
+ cpumask_var_t *node_to_cpumask;
+ cpumask_var_t nmsk, npresmsk;
+ int ret = -ENOMEM;
+@@ -353,7 +353,7 @@ static struct cpumask *irq_build_affinity_masks(unsigned int numvecs)
+ if (!node_to_cpumask)
+ goto fail_npresmsk;
+
+- masks = kcalloc(numvecs, sizeof(*masks), GFP_KERNEL);
++ masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
+ if (!masks)
+ goto fail_node_to_cpumask;
+
+@@ -361,26 +361,26 @@ static struct cpumask *irq_build_affinity_masks(unsigned int numvecs)
+ cpus_read_lock();
+ build_node_to_cpumask(node_to_cpumask);
+
+- /* Spread on present CPUs starting from affd->pre_vectors */
+- ret = __irq_build_affinity_masks(curvec, numvecs, node_to_cpumask,
+- cpu_present_mask, nmsk, masks);
++ /* grouping present CPUs first */
++ ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
++ cpu_present_mask, nmsk, masks);
+ if (ret < 0)
+ goto fail_build_affinity;
+ nr_present = ret;
+
+ /*
+- * Spread on non present CPUs starting from the next vector to be
+- * handled. If the spreading of present CPUs already exhausted the
+- * vector space, assign the non present CPUs to the already spread
+- * out vectors.
++ * Allocate non present CPUs starting from the next group to be
++ * handled. If the grouping of present CPUs already exhausted the
++ * group space, assign the non present CPUs to the already
++ * allocated out groups.
+ */
+- if (nr_present >= numvecs)
+- curvec = 0;
++ if (nr_present >= numgrps)
++ curgrp = 0;
+ else
+- curvec = nr_present;
++ curgrp = nr_present;
+ cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
+- ret = __irq_build_affinity_masks(curvec, numvecs, node_to_cpumask,
+- npresmsk, nmsk, masks);
++ ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
++ npresmsk, nmsk, masks);
+ if (ret >= 0)
+ nr_others = ret;
+
+@@ -388,7 +388,7 @@ static struct cpumask *irq_build_affinity_masks(unsigned int numvecs)
+ cpus_read_unlock();
+
+ if (ret >= 0)
+- WARN_ON(nr_present + nr_others < numvecs);
++ WARN_ON(nr_present + nr_others < numgrps);
+
+ fail_node_to_cpumask:
+ free_node_to_cpumask(node_to_cpumask);
+@@ -467,7 +467,7 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
+ for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
+ unsigned int this_vecs = affd->set_size[i];
+ int j;
+- struct cpumask *result = irq_build_affinity_masks(this_vecs);
++ struct cpumask *result = group_cpus_evenly(this_vecs);
+
+ if (!result) {
+ kfree(masks);
+--
+2.43.0
+
--- /dev/null
+From 0ddfc8bc46129c7a83ae4cf6d0cc4063fbfc2355 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 29 Nov 2023 11:23:11 +0100
+Subject: i40e: Fix filter input checks to prevent config with invalid values
+
+From: Sudheer Mogilappagari <sudheer.mogilappagari@intel.com>
+
+[ Upstream commit 3e48041d9820c17e0a51599d12e66c6e12a8d08d ]
+
+Prevent a VF from configuring filters with unsupported actions or from
+using the REDIRECT action with an invalid TC number. The current checks
+could cause an out-of-bounds access on the PF side.
+
+Fixes: e284fc280473 ("i40e: Add and delete cloud filter")
+Reviewed-by: Andrii Staikov <andrii.staikov@intel.com>
+Signed-off-by: Sudheer Mogilappagari <sudheer.mogilappagari@intel.com>
+Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Tested-by: Bharathi Sreenivas <bharathi.sreenivas@intel.com>
+Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+index cb925baf72ce0..3c38129a5224a 100644
+--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
++++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+@@ -3451,16 +3451,16 @@ static int i40e_validate_cloud_filter(struct i40e_vf *vf,
+ bool found = false;
+ int bkt;
+
+- if (!tc_filter->action) {
++ if (tc_filter->action != VIRTCHNL_ACTION_TC_REDIRECT) {
+ dev_info(&pf->pdev->dev,
+- "VF %d: Currently ADq doesn't support Drop Action\n",
+- vf->vf_id);
++ "VF %d: ADQ doesn't support this action (%d)\n",
++ vf->vf_id, tc_filter->action);
+ goto err;
+ }
+
+ /* action_meta is TC number here to which the filter is applied */
+ if (!tc_filter->action_meta ||
+- tc_filter->action_meta > I40E_MAX_VF_VSI) {
++ tc_filter->action_meta > vf->num_tc) {
+ dev_info(&pf->pdev->dev, "VF %d: Invalid TC number %u\n",
+ vf->vf_id, tc_filter->action_meta);
+ goto err;
+--
+2.43.0
+
--- /dev/null
+From 20287328081684e38abeda69f42fe548148fc294 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 18 Dec 2023 15:08:50 +0800
+Subject: i40e: fix use-after-free in i40e_aqc_add_filters()
+
+From: Ke Xiao <xiaoke@sangfor.com.cn>
+
+[ Upstream commit 6a15584e99db8918b60e507539c7446375dcf366 ]
+
+Commit 3116f59c12bd ("i40e: fix use-after-free in
+i40e_sync_filters_subtask()") avoided use-after-free issues
+by increasing the refcount while updating the VSI filter list to
+the HW. However, it missed the unicast situation.
+
+When deleting a unicast FDB entry, the i40e driver will release
+the mac_filter, and i40e_service_task will concurrently request
+firmware to add the mac_filter, which will lead to the following
+use-after-free issue.
+
+Fix again for both netdev->uc and netdev->mc.
+
+BUG: KASAN: use-after-free in i40e_aqc_add_filters+0x55c/0x5b0 [i40e]
+Read of size 2 at addr ffff888eb3452d60 by task kworker/8:7/6379
+
+CPU: 8 PID: 6379 Comm: kworker/8:7 Kdump: loaded Tainted: G
+Workqueue: i40e i40e_service_task [i40e]
+Call Trace:
+ dump_stack+0x71/0xab
+ print_address_description+0x6b/0x290
+ kasan_report+0x14a/0x2b0
+ i40e_aqc_add_filters+0x55c/0x5b0 [i40e]
+ i40e_sync_vsi_filters+0x1676/0x39c0 [i40e]
+ i40e_service_task+0x1397/0x2bb0 [i40e]
+ process_one_work+0x56a/0x11f0
+ worker_thread+0x8f/0xf40
+ kthread+0x2a0/0x390
+ ret_from_fork+0x1f/0x40
+
+Allocated by task 21948:
+ kasan_kmalloc+0xa6/0xd0
+ kmem_cache_alloc_trace+0xdb/0x1c0
+ i40e_add_filter+0x11e/0x520 [i40e]
+ i40e_addr_sync+0x37/0x60 [i40e]
+ __hw_addr_sync_dev+0x1f5/0x2f0
+ i40e_set_rx_mode+0x61/0x1e0 [i40e]
+ dev_uc_add_excl+0x137/0x190
+ i40e_ndo_fdb_add+0x161/0x260 [i40e]
+ rtnl_fdb_add+0x567/0x950
+ rtnetlink_rcv_msg+0x5db/0x880
+ netlink_rcv_skb+0x254/0x380
+ netlink_unicast+0x454/0x610
+ netlink_sendmsg+0x747/0xb00
+ sock_sendmsg+0xe2/0x120
+ __sys_sendto+0x1ae/0x290
+ __x64_sys_sendto+0xdd/0x1b0
+ do_syscall_64+0xa0/0x370
+ entry_SYSCALL_64_after_hwframe+0x65/0xca
+
+Freed by task 21948:
+ __kasan_slab_free+0x137/0x190
+ kfree+0x8b/0x1b0
+ __i40e_del_filter+0x116/0x1e0 [i40e]
+ i40e_del_mac_filter+0x16c/0x300 [i40e]
+ i40e_addr_unsync+0x134/0x1b0 [i40e]
+ __hw_addr_sync_dev+0xff/0x2f0
+ i40e_set_rx_mode+0x61/0x1e0 [i40e]
+ dev_uc_del+0x77/0x90
+ rtnl_fdb_del+0x6a5/0x860
+ rtnetlink_rcv_msg+0x5db/0x880
+ netlink_rcv_skb+0x254/0x380
+ netlink_unicast+0x454/0x610
+ netlink_sendmsg+0x747/0xb00
+ sock_sendmsg+0xe2/0x120
+ __sys_sendto+0x1ae/0x290
+ __x64_sys_sendto+0xdd/0x1b0
+ do_syscall_64+0xa0/0x370
+ entry_SYSCALL_64_after_hwframe+0x65/0xca
+
+Fixes: 3116f59c12bd ("i40e: fix use-after-free in i40e_sync_filters_subtask()")
+Fixes: 41c445ff0f48 ("i40e: main driver core")
+Signed-off-by: Ke Xiao <xiaoke@sangfor.com.cn>
+Signed-off-by: Ding Hui <dinghui@sangfor.com.cn>
+Cc: Di Zhu <zhudi2@huawei.com>
+Reviewed-by: Jan Sokolowski <jan.sokolowski@intel.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
+Tested-by: Pucha Himasekhar Reddy <himasekharx.reddy.pucha@intel.com> (A Contingent worker at Intel)
+Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/i40e/i40e_main.c | 8 +++++++-
+ 1 file changed, 7 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
+index b4157ff370a31..cdc68b78bd9ea 100644
+--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
++++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
+@@ -104,12 +104,18 @@ static struct workqueue_struct *i40e_wq;
+ static void netdev_hw_addr_refcnt(struct i40e_mac_filter *f,
+ struct net_device *netdev, int delta)
+ {
++ struct netdev_hw_addr_list *ha_list;
+ struct netdev_hw_addr *ha;
+
+ if (!f || !netdev)
+ return;
+
+- netdev_for_each_mc_addr(ha, netdev) {
++ if (is_unicast_ether_addr(f->macaddr) || is_link_local_ether_addr(f->macaddr))
++ ha_list = &netdev->uc;
++ else
++ ha_list = &netdev->mc;
++
++ netdev_hw_addr_list_for_each(ha, ha_list) {
+ if (ether_addr_equal(ha->addr, f->macaddr)) {
+ ha->refcount += delta;
+ if (ha->refcount <= 0)
+--
+2.43.0
+
--- /dev/null
+From 7b1f4a98a68f67ebaea752502865a2679eea1b6f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 21 Dec 2023 14:27:35 +0100
+Subject: i40e: Restore VF MSI-X state during PCI reset
+
+From: Andrii Staikov <andrii.staikov@intel.com>
+
+[ Upstream commit 371e576ff3e8580d91d49026e5d5faebf5565558 ]
+
+During a PCI FLR the MSI-X Enable flag in the VF PCI MSI-X capability
+register will be cleared. This can lead to issues when a VF is
+assigned to a VM because in these cases the VF driver receives no
+indication of the PF PCI error/reset and additionally it is incapable
+of restoring the cleared flag in the hypervisor configuration space
+without fully reinitializing the driver interrupt functionality.
+
+Since the VF driver is unable to easily resolve this condition on its own,
+restore the VF MSI-X flag during the PF PCI reset handling.
+
+Fixes: 19b7960b2da1 ("i40e: implement split PCI error reset handler")
+Co-developed-by: Karen Ostrowska <karen.ostrowska@intel.com>
+Signed-off-by: Karen Ostrowska <karen.ostrowska@intel.com>
+Co-developed-by: Mateusz Palczewski <mateusz.palczewski@intel.com>
+Signed-off-by: Mateusz Palczewski <mateusz.palczewski@intel.com>
+Reviewed-by: Wojciech Drewek <wojciech.drewek@intel.com>
+Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
+Signed-off-by: Andrii Staikov <andrii.staikov@intel.com>
+Tested-by: Rafal Romanowski <rafal.romanowski@intel.com>
+Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/i40e/i40e_main.c | 3 +++
+ .../ethernet/intel/i40e/i40e_virtchnl_pf.c | 26 +++++++++++++++++++
+ .../ethernet/intel/i40e/i40e_virtchnl_pf.h | 3 +++
+ 3 files changed, 32 insertions(+)
+
+diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
+index cdc68b78bd9ea..63d43ef86f9b9 100644
+--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
++++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
+@@ -16450,6 +16450,9 @@ static void i40e_pci_error_reset_done(struct pci_dev *pdev)
+ return;
+
+ i40e_reset_and_rebuild(pf, false, false);
++#ifdef CONFIG_PCI_IOV
++ i40e_restore_all_vfs_msi_state(pdev);
++#endif /* CONFIG_PCI_IOV */
+ }
+
+ /**
+diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+index 3c38129a5224a..c7d761426d6ce 100644
+--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
++++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+@@ -152,6 +152,32 @@ void i40e_vc_notify_reset(struct i40e_pf *pf)
+ (u8 *)&pfe, sizeof(struct virtchnl_pf_event));
+ }
+
++#ifdef CONFIG_PCI_IOV
++void i40e_restore_all_vfs_msi_state(struct pci_dev *pdev)
++{
++ u16 vf_id;
++ u16 pos;
++
++ /* Continue only if this is a PF */
++ if (!pdev->is_physfn)
++ return;
++
++ if (!pci_num_vf(pdev))
++ return;
++
++ pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV);
++ if (pos) {
++ struct pci_dev *vf_dev = NULL;
++
++ pci_read_config_word(pdev, pos + PCI_SRIOV_VF_DID, &vf_id);
++ while ((vf_dev = pci_get_device(pdev->vendor, vf_id, vf_dev))) {
++ if (vf_dev->is_virtfn && vf_dev->physfn == pdev)
++ pci_restore_msi_state(vf_dev);
++ }
++ }
++}
++#endif /* CONFIG_PCI_IOV */
++
+ /**
+ * i40e_vc_notify_vf_reset
+ * @vf: pointer to the VF structure
+diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h
+index 358bbdb587951..bd497cc5303a1 100644
+--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h
++++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h
+@@ -135,6 +135,9 @@ int i40e_ndo_set_vf_spoofchk(struct net_device *netdev, int vf_id, bool enable);
+
+ void i40e_vc_notify_link_state(struct i40e_pf *pf);
+ void i40e_vc_notify_reset(struct i40e_pf *pf);
++#ifdef CONFIG_PCI_IOV
++void i40e_restore_all_vfs_msi_state(struct pci_dev *pdev);
++#endif /* CONFIG_PCI_IOV */
+ int i40e_get_vf_stats(struct net_device *netdev, int vf_id,
+ struct ifla_vf_stats *vf_stats);
+
+--
+2.43.0
+
--- /dev/null
+From 2dd7c71e40d1a2ab164d9905c6bf8e507590d539 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 15 Dec 2023 12:01:56 +0100
+Subject: ice: Fix link_down_on_close message
+
+From: Katarzyna Wieczerzycka <katarzyna.wieczerzycka@intel.com>
+
+[ Upstream commit 6a8d8bb55e7001de2d50920381cc858f3a3e9fb7 ]
+
+The driver should not report an error message when, for a medialess
+port, the link_down_on_close flag is enabled and the physical link
+cannot be set down.
+
+Fixes: 8ac7132704f3 ("ice: Fix interface being down after reset with link-down-on-close flag on")
+Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
+Signed-off-by: Katarzyna Wieczerzycka <katarzyna.wieczerzycka@intel.com>
+Signed-off-by: Wojciech Drewek <wojciech.drewek@intel.com>
+Tested-by: Pucha Himasekhar Reddy <himasekharx.reddy.pucha@intel.com> (A Contingent worker at Intel)
+Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/ice/ice_main.c | 10 +++++++---
+ 1 file changed, 7 insertions(+), 3 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
+index f0f39364819ac..5eb3b80b293c0 100644
+--- a/drivers/net/ethernet/intel/ice/ice_main.c
++++ b/drivers/net/ethernet/intel/ice/ice_main.c
+@@ -2138,7 +2138,7 @@ static int ice_configure_phy(struct ice_vsi *vsi)
+
+ /* Ensure we have media as we cannot configure a medialess port */
+ if (!(phy->link_info.link_info & ICE_AQ_MEDIA_AVAILABLE))
+- return -EPERM;
++ return -ENOMEDIUM;
+
+ ice_print_topo_conflict(vsi);
+
+@@ -9065,8 +9065,12 @@ int ice_stop(struct net_device *netdev)
+ int link_err = ice_force_phys_link_state(vsi, false);
+
+ if (link_err) {
+- netdev_err(vsi->netdev, "Failed to set physical link down, VSI %d error %d\n",
+- vsi->vsi_num, link_err);
++ if (link_err == -ENOMEDIUM)
++ netdev_info(vsi->netdev, "Skipping link reconfig - no media attached, VSI %d\n",
++ vsi->vsi_num);
++ else
++ netdev_err(vsi->netdev, "Failed to set physical link down, VSI %d error %d\n",
++ vsi->vsi_num, link_err);
+ return -EIO;
+ }
+ }
+--
+2.43.0
+
--- /dev/null
+From f1ef60049882de4af95c17ef50adb9017ecbaa09 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 15 Dec 2023 12:01:57 +0100
+Subject: ice: Shut down VSI with "link-down-on-close" enabled
+
+From: Ngai-Mint Kwan <ngai-mint.kwan@intel.com>
+
+[ Upstream commit 6d05ff55ef4f4954d28551236239f297bd52ea48 ]
+
+Disabling netdev with ethtool private flag "link-down-on-close" enabled
+can cause NULL pointer dereference bug. Shut down VSI regardless of
+"link-down-on-close" state.
+
+Fixes: 8ac7132704f3 ("ice: Fix interface being down after reset with link-down-on-close flag on")
+Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
+Signed-off-by: Ngai-Mint Kwan <ngai-mint.kwan@intel.com>
+Signed-off-by: Wojciech Drewek <wojciech.drewek@intel.com>
+Tested-by: Pucha Himasekhar Reddy <himasekharx.reddy.pucha@intel.com> (A Contingent worker at Intel)
+Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/ice/ice_main.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
+index 5eb3b80b293c0..ab46cfca4028d 100644
+--- a/drivers/net/ethernet/intel/ice/ice_main.c
++++ b/drivers/net/ethernet/intel/ice/ice_main.c
+@@ -9071,6 +9071,8 @@ int ice_stop(struct net_device *netdev)
+ else
+ netdev_err(vsi->netdev, "Failed to set physical link down, VSI %d error %d\n",
+ vsi->vsi_num, link_err);
++
++ ice_vsi_close(vsi);
+ return -EIO;
+ }
+ }
+--
+2.43.0
+
--- /dev/null
+From e09a381b3b1d4fa9bd86c9d51bbd7c9766cc671a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 6 Dec 2023 15:07:18 +0100
+Subject: igc: Check VLAN EtherType mask
+
+From: Kurt Kanzenbach <kurt@linutronix.de>
+
+[ Upstream commit 7afd49a38e73afd57ff62c8d1cf5af760c4d49c0 ]
+
+Currently the driver accepts VLAN EtherType steering rules regardless of
+the configured mask. And things might fail silently or with confusing error
+messages to the user. The VLAN EtherType can only be matched by full
+mask. Therefore, add a check for that.
+
+For instance the following rule is invalid, but the driver accepts it and
+ignores the user specified mask:
+|root@host:~# ethtool -N enp3s0 flow-type ether vlan-etype 0x8100 \
+| m 0x00ff action 0
+|Added rule with ID 63
+|root@host:~# ethtool --show-ntuple enp3s0
+|4 RX rings available
+|Total 1 rules
+|
+|Filter: 63
+| Flow Type: Raw Ethernet
+| Src MAC addr: 00:00:00:00:00:00 mask: FF:FF:FF:FF:FF:FF
+| Dest MAC addr: 00:00:00:00:00:00 mask: FF:FF:FF:FF:FF:FF
+| Ethertype: 0x0 mask: 0xFFFF
+| VLAN EtherType: 0x8100 mask: 0x0
+| VLAN: 0x0 mask: 0xffff
+| User-defined: 0x0 mask: 0xffffffffffffffff
+| Action: Direct to queue 0
+
+After:
+|root@host:~# ethtool -N enp3s0 flow-type ether vlan-etype 0x8100 \
+| m 0x00ff action 0
+|rmgr: Cannot insert RX class rule: Operation not supported
+
+Fixes: 2b477d057e33 ("igc: Integrate flex filter into ethtool ops")
+Suggested-by: Suman Ghosh <sumang@marvell.com>
+Signed-off-by: Kurt Kanzenbach <kurt@linutronix.de>
+Acked-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Tested-by: Naama Meir <naamax.meir@linux.intel.com>
+Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/igc/igc_ethtool.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c
+index e146357d61a8a..2bee9cace5983 100644
+--- a/drivers/net/ethernet/intel/igc/igc_ethtool.c
++++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c
+@@ -1356,6 +1356,14 @@ static int igc_ethtool_add_nfc_rule(struct igc_adapter *adapter,
+ return -EOPNOTSUPP;
+ }
+
++ /* VLAN EtherType can only be matched by full mask. */
++ if ((fsp->flow_type & FLOW_EXT) &&
++ fsp->m_ext.vlan_etype &&
++ fsp->m_ext.vlan_etype != ETHER_TYPE_FULL_MASK) {
++ netdev_dbg(netdev, "VLAN EtherType mask not supported\n");
++ return -EOPNOTSUPP;
++ }
++
+ if (fsp->location >= IGC_MAX_RXNFC_RULES) {
+ netdev_dbg(netdev, "Invalid location\n");
+ return -EINVAL;
+--
+2.43.0
+
--- /dev/null
+From ab151d4a86bceafa58b773d11dd768f176a291af Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 1 Dec 2023 08:50:43 +0100
+Subject: igc: Check VLAN TCI mask
+
+From: Kurt Kanzenbach <kurt@linutronix.de>
+
+[ Upstream commit b5063cbe148b829e8eb97672c2cbccc058835476 ]
+
+Currently the driver accepts VLAN TCI steering rules regardless of the
+configured mask. And things might fail silently or with confusing error
+messages to the user.
+
+There are two ways to handle the VLAN TCI mask:
+
+ 1. Match on the PCP field using a VLAN prio filter
+ 2. Match on complete TCI field using a flex filter
+
+Therefore, add checks and code for that.
+
+For instance the following rule is invalid and will be converted into a
+VLAN prio rule which is not correct:
+|root@host:~# ethtool -N enp3s0 flow-type ether vlan 0x0001 m 0xf000 \
+| action 1
+|Added rule with ID 61
+|root@host:~# ethtool --show-ntuple enp3s0
+|4 RX rings available
+|Total 1 rules
+|
+|Filter: 61
+| Flow Type: Raw Ethernet
+| Src MAC addr: 00:00:00:00:00:00 mask: FF:FF:FF:FF:FF:FF
+| Dest MAC addr: 00:00:00:00:00:00 mask: FF:FF:FF:FF:FF:FF
+| Ethertype: 0x0 mask: 0xFFFF
+| VLAN EtherType: 0x0 mask: 0xffff
+| VLAN: 0x1 mask: 0x1fff
+| User-defined: 0x0 mask: 0xffffffffffffffff
+| Action: Direct to queue 1
+
+After:
+|root@host:~# ethtool -N enp3s0 flow-type ether vlan 0x0001 m 0xf000 \
+| action 1
+|rmgr: Cannot insert RX class rule: Operation not supported
+
+Fixes: 7991487ecb2d ("igc: Allow for Flex Filters to be installed")
+Signed-off-by: Kurt Kanzenbach <kurt@linutronix.de>
+Acked-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Tested-by: Naama Meir <naamax.meir@linux.intel.com>
+Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/igc/igc.h | 1 +
+ drivers/net/ethernet/intel/igc/igc_ethtool.c | 28 +++++++++++++++++---
+ 2 files changed, 26 insertions(+), 3 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h
+index 43c05b41627f7..2a894ca49d93b 100644
+--- a/drivers/net/ethernet/intel/igc/igc.h
++++ b/drivers/net/ethernet/intel/igc/igc.h
+@@ -538,6 +538,7 @@ struct igc_nfc_filter {
+ u16 etype;
+ __be16 vlan_etype;
+ u16 vlan_tci;
++ u16 vlan_tci_mask;
+ u8 src_addr[ETH_ALEN];
+ u8 dst_addr[ETH_ALEN];
+ u8 user_data[8];
+diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c
+index 51ef18060dbc4..e146357d61a8a 100644
+--- a/drivers/net/ethernet/intel/igc/igc_ethtool.c
++++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c
+@@ -957,6 +957,7 @@ static int igc_ethtool_set_coalesce(struct net_device *netdev,
+ }
+
+ #define ETHER_TYPE_FULL_MASK ((__force __be16)~0)
++#define VLAN_TCI_FULL_MASK ((__force __be16)~0)
+ static int igc_ethtool_get_nfc_rule(struct igc_adapter *adapter,
+ struct ethtool_rxnfc *cmd)
+ {
+@@ -988,7 +989,7 @@ static int igc_ethtool_get_nfc_rule(struct igc_adapter *adapter,
+ if (rule->filter.match_flags & IGC_FILTER_FLAG_VLAN_TCI) {
+ fsp->flow_type |= FLOW_EXT;
+ fsp->h_ext.vlan_tci = htons(rule->filter.vlan_tci);
+- fsp->m_ext.vlan_tci = htons(VLAN_PRIO_MASK);
++ fsp->m_ext.vlan_tci = htons(rule->filter.vlan_tci_mask);
+ }
+
+ if (rule->filter.match_flags & IGC_FILTER_FLAG_DST_MAC_ADDR) {
+@@ -1223,6 +1224,7 @@ static void igc_ethtool_init_nfc_rule(struct igc_nfc_rule *rule,
+
+ if ((fsp->flow_type & FLOW_EXT) && fsp->m_ext.vlan_tci) {
+ rule->filter.vlan_tci = ntohs(fsp->h_ext.vlan_tci);
++ rule->filter.vlan_tci_mask = ntohs(fsp->m_ext.vlan_tci);
+ rule->filter.match_flags |= IGC_FILTER_FLAG_VLAN_TCI;
+ }
+
+@@ -1260,11 +1262,19 @@ static void igc_ethtool_init_nfc_rule(struct igc_nfc_rule *rule,
+ memcpy(rule->filter.user_mask, fsp->m_ext.data, sizeof(fsp->m_ext.data));
+ }
+
+- /* When multiple filter options or user data or vlan etype is set, use a
+- * flex filter.
++ /* The i225/i226 has various different filters. Flex filters provide a
++ * way to match up to the first 128 bytes of a packet. Use them for:
++ * a) For specific user data
++ * b) For VLAN EtherType
++ * c) For full TCI match
++ * d) Or in case multiple filter criteria are set
++ *
++ * Otherwise, use the simple MAC, VLAN PRIO or EtherType filters.
+ */
+ if ((rule->filter.match_flags & IGC_FILTER_FLAG_USER_DATA) ||
+ (rule->filter.match_flags & IGC_FILTER_FLAG_VLAN_ETYPE) ||
++ ((rule->filter.match_flags & IGC_FILTER_FLAG_VLAN_TCI) &&
++ rule->filter.vlan_tci_mask == ntohs(VLAN_TCI_FULL_MASK)) ||
+ (rule->filter.match_flags & (rule->filter.match_flags - 1)))
+ rule->flex = true;
+ else
+@@ -1334,6 +1344,18 @@ static int igc_ethtool_add_nfc_rule(struct igc_adapter *adapter,
+ return -EINVAL;
+ }
+
++ /* There are two ways to match the VLAN TCI:
++ * 1. Match on PCP field and use vlan prio filter for it
++ * 2. Match on complete TCI field and use flex filter for it
++ */
++ if ((fsp->flow_type & FLOW_EXT) &&
++ fsp->m_ext.vlan_tci &&
++ fsp->m_ext.vlan_tci != htons(VLAN_PRIO_MASK) &&
++ fsp->m_ext.vlan_tci != VLAN_TCI_FULL_MASK) {
++ netdev_dbg(netdev, "VLAN mask not supported\n");
++ return -EOPNOTSUPP;
++ }
++
+ if (fsp->location >= IGC_MAX_RXNFC_RULES) {
+ netdev_dbg(netdev, "Invalid location\n");
+ return -EINVAL;
+--
+2.43.0
+
--- /dev/null
+From 4b3b14b400fefd4fa7447adb596675bb2e8637e0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Dec 2023 15:58:16 +0100
+Subject: igc: Fix hicredit calculation
+
+From: Rodrigo Cataldo <rodrigo.cadore@l-acoustics.com>
+
+[ Upstream commit 947dfc8138dfaeb6e966e2d661de89eb203e3064 ]
+
+According to the Intel Software Manual for I225, Section 7.5.2.7,
+hicredit should be multiplied by the constant link-rate value, 0x7736.
+
+Currently, the old constant link-rate value, 0x7735, from the boards
+supported on igb is being used, most likely due to a copy'n'paste, as
+the rest of the logic is the same for both drivers.
+
+Update hicredit accordingly.
+
+Fixes: 1ab011b0bf07 ("igc: Add support for CBS offloading")
+Reviewed-by: Kurt Kanzenbach <kurt@linutronix.de>
+Signed-off-by: Rodrigo Cataldo <rodrigo.cadore@l-acoustics.com>
+Acked-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
+Tested-by: Naama Meir <naamax.meir@linux.intel.com>
+Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/igc/igc_tsn.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c
+index 725db36e399d2..31ea0781b65ec 100644
+--- a/drivers/net/ethernet/intel/igc/igc_tsn.c
++++ b/drivers/net/ethernet/intel/igc/igc_tsn.c
+@@ -178,7 +178,7 @@ static int igc_tsn_enable_offload(struct igc_adapter *adapter)
+ wr32(IGC_TQAVCC(i), tqavcc);
+
+ wr32(IGC_TQAVHC(i),
+- 0x80000000 + ring->hicredit * 0x7735);
++ 0x80000000 + ring->hicredit * 0x7736);
+ } else {
+ /* Disable any CBS for the queue */
+ txqctl &= ~(IGC_TXQCTL_QAV_SEL_MASK);
+--
+2.43.0
+
--- /dev/null
+From 0d687ebbf03e0fea5331b2481ed7bc3e89afd878 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 1 Dec 2023 08:50:42 +0100
+Subject: igc: Report VLAN EtherType matching back to user
+
+From: Kurt Kanzenbach <kurt@linutronix.de>
+
+[ Upstream commit 088464abd48cf3735aee91f9e211b32da9d81117 ]
+
+Currently the driver allows to configure matching by VLAN EtherType.
+However, the retrieval function does not report it back to the user. Add
+it.
+
+Before:
+|root@host:~# ethtool -N enp3s0 flow-type ether vlan-etype 0x8100 action 0
+|Added rule with ID 63
+|root@host:~# ethtool --show-ntuple enp3s0
+|4 RX rings available
+|Total 1 rules
+|
+|Filter: 63
+| Flow Type: Raw Ethernet
+| Src MAC addr: 00:00:00:00:00:00 mask: FF:FF:FF:FF:FF:FF
+| Dest MAC addr: 00:00:00:00:00:00 mask: FF:FF:FF:FF:FF:FF
+| Ethertype: 0x0 mask: 0xFFFF
+| Action: Direct to queue 0
+
+After:
+|root@host:~# ethtool -N enp3s0 flow-type ether vlan-etype 0x8100 action 0
+|Added rule with ID 63
+|root@host:~# ethtool --show-ntuple enp3s0
+|4 RX rings available
+|Total 1 rules
+|
+|Filter: 63
+| Flow Type: Raw Ethernet
+| Src MAC addr: 00:00:00:00:00:00 mask: FF:FF:FF:FF:FF:FF
+| Dest MAC addr: 00:00:00:00:00:00 mask: FF:FF:FF:FF:FF:FF
+| Ethertype: 0x0 mask: 0xFFFF
+| VLAN EtherType: 0x8100 mask: 0x0
+| VLAN: 0x0 mask: 0xffff
+| User-defined: 0x0 mask: 0xffffffffffffffff
+| Action: Direct to queue 0
+
+Fixes: 2b477d057e33 ("igc: Integrate flex filter into ethtool ops")
+Signed-off-by: Kurt Kanzenbach <kurt@linutronix.de>
+Acked-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Tested-by: Naama Meir <naamax.meir@linux.intel.com>
+Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/igc/igc_ethtool.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c
+index 81897f7a90a91..51ef18060dbc4 100644
+--- a/drivers/net/ethernet/intel/igc/igc_ethtool.c
++++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c
+@@ -979,6 +979,12 @@ static int igc_ethtool_get_nfc_rule(struct igc_adapter *adapter,
+ fsp->m_u.ether_spec.h_proto = ETHER_TYPE_FULL_MASK;
+ }
+
++ if (rule->filter.match_flags & IGC_FILTER_FLAG_VLAN_ETYPE) {
++ fsp->flow_type |= FLOW_EXT;
++ fsp->h_ext.vlan_etype = rule->filter.vlan_etype;
++ fsp->m_ext.vlan_etype = ETHER_TYPE_FULL_MASK;
++ }
++
+ if (rule->filter.match_flags & IGC_FILTER_FLAG_VLAN_TCI) {
+ fsp->flow_type |= FLOW_EXT;
+ fsp->h_ext.vlan_tci = htons(rule->filter.vlan_tci);
+--
+2.43.0
+
--- /dev/null
+From 5611af5949dfd630156868ccdfe55a978083caf4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 7 Jun 2023 19:19:13 +0100
+Subject: ipv4, ipv6: Use splice_eof() to flush
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit 1d7e4538a5463faa0b0e26a7a7b6bd68c7dfdd78 ]
+
+Allow splice to undo the effects of MSG_MORE after prematurely ending a
+splice/sendfile due to getting an EOF condition (->splice_read() returned
+0) after splice had called sendmsg() with MSG_MORE set when the user didn't
+set MSG_MORE.
+
+For UDP, a pending packet will not be emitted if the socket is closed
+before it is flushed; with this change, it will be flushed by ->splice_eof().
+
+For TCP, it's not clear that MSG_MORE is actually effective.
+
+Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
+Link: https://lore.kernel.org/r/CAHk-=wh=V579PDYvkpnTobCLGczbgxpMgGmmhqiTyE34Cpi5Gg@mail.gmail.com/
+Signed-off-by: David Howells <dhowells@redhat.com>
+cc: Kuniyuki Iwashima <kuniyu@amazon.com>
+cc: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
+cc: David Ahern <dsahern@kernel.org>
+cc: Jens Axboe <axboe@kernel.dk>
+cc: Matthew Wilcox <willy@infradead.org>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Stable-dep-of: a0002127cd74 ("udp: move udp->no_check6_tx to udp->udp_flags")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/inet_common.h | 1 +
+ include/net/tcp.h | 1 +
+ include/net/udp.h | 1 +
+ net/ipv4/af_inet.c | 18 ++++++++++++++++++
+ net/ipv4/tcp.c | 16 ++++++++++++++++
+ net/ipv4/tcp_ipv4.c | 1 +
+ net/ipv4/udp.c | 16 ++++++++++++++++
+ net/ipv6/af_inet6.c | 1 +
+ net/ipv6/tcp_ipv6.c | 1 +
+ net/ipv6/udp.c | 15 +++++++++++++++
+ 10 files changed, 71 insertions(+)
+
+diff --git a/include/net/inet_common.h b/include/net/inet_common.h
+index cec453c18f1d6..4673bbfd2811f 100644
+--- a/include/net/inet_common.h
++++ b/include/net/inet_common.h
+@@ -33,6 +33,7 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags,
+ bool kern);
+ int inet_send_prepare(struct sock *sk);
+ int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size);
++void inet_splice_eof(struct socket *sock);
+ ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
+ size_t size, int flags);
+ int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
+diff --git a/include/net/tcp.h b/include/net/tcp.h
+index c3d56b337f358..4c838f7290dd9 100644
+--- a/include/net/tcp.h
++++ b/include/net/tcp.h
+@@ -332,6 +332,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
+ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size);
+ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
+ size_t size, struct ubuf_info *uarg);
++void tcp_splice_eof(struct socket *sock);
+ int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
+ int flags);
+ int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
+diff --git a/include/net/udp.h b/include/net/udp.h
+index fee053bcd17c6..fa4cdbe55552c 100644
+--- a/include/net/udp.h
++++ b/include/net/udp.h
+@@ -269,6 +269,7 @@ int udp_get_port(struct sock *sk, unsigned short snum,
+ int udp_err(struct sk_buff *, u32);
+ int udp_abort(struct sock *sk, int err);
+ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
++void udp_splice_eof(struct socket *sock);
+ int udp_push_pending_frames(struct sock *sk);
+ void udp_flush_pending_frames(struct sock *sk);
+ int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size);
+diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
+index 5d379df90c826..347c3768df6e8 100644
+--- a/net/ipv4/af_inet.c
++++ b/net/ipv4/af_inet.c
+@@ -838,6 +838,21 @@ int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
+ }
+ EXPORT_SYMBOL(inet_sendmsg);
+
++void inet_splice_eof(struct socket *sock)
++{
++ const struct proto *prot;
++ struct sock *sk = sock->sk;
++
++ if (unlikely(inet_send_prepare(sk)))
++ return;
++
++ /* IPV6_ADDRFORM can change sk->sk_prot under us. */
++ prot = READ_ONCE(sk->sk_prot);
++ if (prot->splice_eof)
++ prot->splice_eof(sock);
++}
++EXPORT_SYMBOL_GPL(inet_splice_eof);
++
+ ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
+ size_t size, int flags)
+ {
+@@ -1057,6 +1072,7 @@ const struct proto_ops inet_stream_ops = {
+ #ifdef CONFIG_MMU
+ .mmap = tcp_mmap,
+ #endif
++ .splice_eof = inet_splice_eof,
+ .sendpage = inet_sendpage,
+ .splice_read = tcp_splice_read,
+ .read_sock = tcp_read_sock,
+@@ -1091,6 +1107,7 @@ const struct proto_ops inet_dgram_ops = {
+ .read_skb = udp_read_skb,
+ .recvmsg = inet_recvmsg,
+ .mmap = sock_no_mmap,
++ .splice_eof = inet_splice_eof,
+ .sendpage = inet_sendpage,
+ .set_peek_off = sk_set_peek_off,
+ #ifdef CONFIG_COMPAT
+@@ -1122,6 +1139,7 @@ static const struct proto_ops inet_sockraw_ops = {
+ .sendmsg = inet_sendmsg,
+ .recvmsg = inet_recvmsg,
+ .mmap = sock_no_mmap,
++ .splice_eof = inet_splice_eof,
+ .sendpage = inet_sendpage,
+ #ifdef CONFIG_COMPAT
+ .compat_ioctl = inet_compat_ioctl,
+diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
+index 3935451ad061e..0b7844a8d5711 100644
+--- a/net/ipv4/tcp.c
++++ b/net/ipv4/tcp.c
+@@ -1492,6 +1492,22 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+ }
+ EXPORT_SYMBOL(tcp_sendmsg);
+
++void tcp_splice_eof(struct socket *sock)
++{
++ struct sock *sk = sock->sk;
++ struct tcp_sock *tp = tcp_sk(sk);
++ int mss_now, size_goal;
++
++ if (!tcp_write_queue_tail(sk))
++ return;
++
++ lock_sock(sk);
++ mss_now = tcp_send_mss(sk, &size_goal, 0);
++ tcp_push(sk, 0, mss_now, tp->nonagle, size_goal);
++ release_sock(sk);
++}
++EXPORT_SYMBOL_GPL(tcp_splice_eof);
++
+ /*
+ * Handle reading urgent data. BSD has very simple semantics for
+ * this, no blocking and very strange errors 8)
+diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
+index 7ebbbe561e402..be2c807eed15d 100644
+--- a/net/ipv4/tcp_ipv4.c
++++ b/net/ipv4/tcp_ipv4.c
+@@ -3067,6 +3067,7 @@ struct proto tcp_prot = {
+ .keepalive = tcp_set_keepalive,
+ .recvmsg = tcp_recvmsg,
+ .sendmsg = tcp_sendmsg,
++ .splice_eof = tcp_splice_eof,
+ .sendpage = tcp_sendpage,
+ .backlog_rcv = tcp_v4_do_rcv,
+ .release_cb = tcp_release_cb,
+diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
+index b49cb3df01bb4..e8dd2880ac9aa 100644
+--- a/net/ipv4/udp.c
++++ b/net/ipv4/udp.c
+@@ -1332,6 +1332,21 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+ }
+ EXPORT_SYMBOL(udp_sendmsg);
+
++void udp_splice_eof(struct socket *sock)
++{
++ struct sock *sk = sock->sk;
++ struct udp_sock *up = udp_sk(sk);
++
++ if (!up->pending || READ_ONCE(up->corkflag))
++ return;
++
++ lock_sock(sk);
++ if (up->pending && !READ_ONCE(up->corkflag))
++ udp_push_pending_frames(sk);
++ release_sock(sk);
++}
++EXPORT_SYMBOL_GPL(udp_splice_eof);
++
+ int udp_sendpage(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags)
+ {
+@@ -2907,6 +2922,7 @@ struct proto udp_prot = {
+ .getsockopt = udp_getsockopt,
+ .sendmsg = udp_sendmsg,
+ .recvmsg = udp_recvmsg,
++ .splice_eof = udp_splice_eof,
+ .sendpage = udp_sendpage,
+ .release_cb = ip4_datagram_release_cb,
+ .hash = udp_lib_hash,
+diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
+index b5309ae87fd79..a2f29ca516000 100644
+--- a/net/ipv6/af_inet6.c
++++ b/net/ipv6/af_inet6.c
+@@ -711,6 +711,7 @@ const struct proto_ops inet6_stream_ops = {
+ #ifdef CONFIG_MMU
+ .mmap = tcp_mmap,
+ #endif
++ .splice_eof = inet_splice_eof,
+ .sendpage = inet_sendpage,
+ .sendmsg_locked = tcp_sendmsg_locked,
+ .sendpage_locked = tcp_sendpage_locked,
+diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
+index 7be89dcfd5fc5..ba9a22db5805c 100644
+--- a/net/ipv6/tcp_ipv6.c
++++ b/net/ipv6/tcp_ipv6.c
+@@ -2158,6 +2158,7 @@ struct proto tcpv6_prot = {
+ .keepalive = tcp_set_keepalive,
+ .recvmsg = tcp_recvmsg,
+ .sendmsg = tcp_sendmsg,
++ .splice_eof = tcp_splice_eof,
+ .sendpage = tcp_sendpage,
+ .backlog_rcv = tcp_v6_do_rcv,
+ .release_cb = tcp_release_cb,
+diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
+index 7f49f69226a21..2a65136dca773 100644
+--- a/net/ipv6/udp.c
++++ b/net/ipv6/udp.c
+@@ -1657,6 +1657,20 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+ goto out;
+ }
+
++static void udpv6_splice_eof(struct socket *sock)
++{
++ struct sock *sk = sock->sk;
++ struct udp_sock *up = udp_sk(sk);
++
++ if (!up->pending || READ_ONCE(up->corkflag))
++ return;
++
++ lock_sock(sk);
++ if (up->pending && !READ_ONCE(up->corkflag))
++ udp_v6_push_pending_frames(sk);
++ release_sock(sk);
++}
++
+ void udpv6_destroy_sock(struct sock *sk)
+ {
+ struct udp_sock *up = udp_sk(sk);
+@@ -1768,6 +1782,7 @@ struct proto udpv6_prot = {
+ .getsockopt = udpv6_getsockopt,
+ .sendmsg = udpv6_sendmsg,
+ .recvmsg = udpv6_recvmsg,
++ .splice_eof = udpv6_splice_eof,
+ .release_cb = ip6_datagram_release_cb,
+ .hash = udp_lib_hash,
+ .unhash = udp_lib_unhash,
+--
+2.43.0
+
--- /dev/null
+From 3d1c97b9a2cc1afdd2fa063fb59338e2a8a04818 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 17 Nov 2022 23:30:53 -0800
+Subject: khugepage: replace try_to_release_page() with filemap_release_folio()
+
+From: Vishal Moola (Oracle) <vishal.moola@gmail.com>
+
+[ Upstream commit 64ab3195ea077eaeedc8b382939c3dc5ca56f369 ]
+
+Replace some calls with their folio equivalents. This change removes 4
+calls to compound_head() and is in preparation for the removal of the
+try_to_release_page() wrapper.
+
+Link: https://lkml.kernel.org/r/20221118073055.55694-3-vishal.moola@gmail.com
+Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: Naoya Horiguchi <naoya.horiguchi@nec.com>
+Cc: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Stable-dep-of: 1898efcdbed3 ("block: update the stable_writes flag in bdev_add")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/khugepaged.c | 23 ++++++++++++-----------
+ 1 file changed, 12 insertions(+), 11 deletions(-)
+
+diff --git a/mm/khugepaged.c b/mm/khugepaged.c
+index ef72d3df4b65b..6fc7db587c453 100644
+--- a/mm/khugepaged.c
++++ b/mm/khugepaged.c
+@@ -1818,6 +1818,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
+ xas_set(&xas, start);
+ for (index = start; index < end; index++) {
+ struct page *page = xas_next(&xas);
++ struct folio *folio;
+
+ VM_BUG_ON(index != xas.xa_index);
+ if (is_shmem) {
+@@ -1844,8 +1845,6 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
+ }
+
+ if (xa_is_value(page) || !PageUptodate(page)) {
+- struct folio *folio;
+-
+ xas_unlock_irq(&xas);
+ /* swap in or instantiate fallocated page */
+ if (shmem_get_folio(mapping->host, index,
+@@ -1933,13 +1932,15 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
+ goto out_unlock;
+ }
+
+- if (page_mapping(page) != mapping) {
++ folio = page_folio(page);
++
++ if (folio_mapping(folio) != mapping) {
+ result = SCAN_TRUNCATED;
+ goto out_unlock;
+ }
+
+- if (!is_shmem && (PageDirty(page) ||
+- PageWriteback(page))) {
++ if (!is_shmem && (folio_test_dirty(folio) ||
++ folio_test_writeback(folio))) {
+ /*
+ * khugepaged only works on read-only fd, so this
+ * page is dirty because it hasn't been flushed
+@@ -1949,20 +1950,20 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
+ goto out_unlock;
+ }
+
+- if (isolate_lru_page(page)) {
++ if (folio_isolate_lru(folio)) {
+ result = SCAN_DEL_PAGE_LRU;
+ goto out_unlock;
+ }
+
+- if (page_has_private(page) &&
+- !try_to_release_page(page, GFP_KERNEL)) {
++ if (folio_has_private(folio) &&
++ !filemap_release_folio(folio, GFP_KERNEL)) {
+ result = SCAN_PAGE_HAS_PRIVATE;
+- putback_lru_page(page);
++ folio_putback_lru(folio);
+ goto out_unlock;
+ }
+
+- if (page_mapped(page))
+- try_to_unmap(page_folio(page),
++ if (folio_mapped(folio))
++ try_to_unmap(folio,
+ TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH);
+
+ xas_lock_irq(&xas);
+--
+2.43.0
+
--- /dev/null
+From f07953806fd1f09054b8a7c16085bb0faaba9aec Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 20 Nov 2023 16:35:59 +0800
+Subject: lib/group_cpus.c: avoid acquiring cpu hotplug lock in
+ group_cpus_evenly
+
+From: Ming Lei <ming.lei@redhat.com>
+
+[ Upstream commit 0263f92fadbb9d294d5971ac57743f882c93b2b3 ]
+
+group_cpus_evenly() could be part of storage driver's error handler, such
+as nvme driver, which may happen during CPU hotplug, in which storage queue
+has to drain its pending IOs because all CPUs associated with the queue
+are offline and the queue is becoming inactive. And handling IO needs
+error handler to provide forward progress.
+
+Then deadlock is caused:
+
+1) inside CPU hotplug handler, CPU hotplug lock is held, and blk-mq's
+ handler is waiting for inflight IO
+
+2) error handler is waiting for CPU hotplug lock
+
+3) inflight IO can't be completed in blk-mq's CPU hotplug handler
+ because error handling can't provide forward progress.
+
+Solve the deadlock by not holding CPU hotplug lock in group_cpus_evenly(),
+in which two stage spreads are taken: 1) the 1st stage is over all present
+CPUs; 2) the end stage is over all other CPUs.
+
+It turns out the two-stage spread just needs a consistent 'cpu_present_mask',
+so remove the CPU hotplug lock and store the mask in a local cache instead.
+This doesn't change correctness, because all CPUs are still covered.
+
+Link: https://lkml.kernel.org/r/20231120083559.285174-1-ming.lei@redhat.com
+Signed-off-by: Ming Lei <ming.lei@redhat.com>
+Reported-by: Yi Zhang <yi.zhang@redhat.com>
+Reported-by: Guangwu Zhang <guazhang@redhat.com>
+Tested-by: Guangwu Zhang <guazhang@redhat.com>
+Reviewed-by: Chengming Zhou <zhouchengming@bytedance.com>
+Reviewed-by: Jens Axboe <axboe@kernel.dk>
+Cc: Keith Busch <kbusch@kernel.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ lib/group_cpus.c | 22 ++++++++++++++++------
+ 1 file changed, 16 insertions(+), 6 deletions(-)
+
+diff --git a/lib/group_cpus.c b/lib/group_cpus.c
+index 99f08c6cb9d97..156b1446d2a20 100644
+--- a/lib/group_cpus.c
++++ b/lib/group_cpus.c
+@@ -365,13 +365,25 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps)
+ if (!masks)
+ goto fail_node_to_cpumask;
+
+- /* Stabilize the cpumasks */
+- cpus_read_lock();
+ build_node_to_cpumask(node_to_cpumask);
+
++ /*
++ * Make a local cache of 'cpu_present_mask', so the two stages
++ * spread can observe consistent 'cpu_present_mask' without holding
++ * cpu hotplug lock, then we can reduce deadlock risk with cpu
++ * hotplug code.
++ *
++ * Here CPU hotplug may happen when reading `cpu_present_mask`, and
++ * we can live with the case because it only affects that hotplug
++ * CPU is handled in the 1st or 2nd stage, and either way is correct
++ * from API user viewpoint since 2-stage spread is sort of
++ * optimization.
++ */
++ cpumask_copy(npresmsk, data_race(cpu_present_mask));
++
+ /* grouping present CPUs first */
+ ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
+- cpu_present_mask, nmsk, masks);
++ npresmsk, nmsk, masks);
+ if (ret < 0)
+ goto fail_build_affinity;
+ nr_present = ret;
+@@ -386,15 +398,13 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps)
+ curgrp = 0;
+ else
+ curgrp = nr_present;
+- cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
++ cpumask_andnot(npresmsk, cpu_possible_mask, npresmsk);
+ ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
+ npresmsk, nmsk, masks);
+ if (ret >= 0)
+ nr_others = ret;
+
+ fail_build_affinity:
+- cpus_read_unlock();
+-
+ if (ret >= 0)
+ WARN_ON(nr_present + nr_others < numgrps);
+
+--
+2.43.0
+
--- /dev/null
+From e153f80eac85c4d13fc6aa0c5ddb79469a59ee34 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 9 Dec 2022 11:40:34 +0200
+Subject: media: camss: sm8250: Virtual channels for CSID
+
+From: Milen Mitkov <quic_mmitkov@quicinc.com>
+
+[ Upstream commit 3c4ed72a16bc6733cda9c65048af74a2e8eaa0eb ]
+
+CSID hardware on SM8250 can demux up to 4 simultaneous streams
+based on virtual channel (vc) or datatype (dt).
+The CSID subdevice entity now has 4 source ports that can be
+enabled/disabled and thus can control which virtual channels
+are enabled. Datatype demuxing not tested.
+
+In order to keep a valid internal state of the subdevice,
+implicit format propagation from the sink to the source pads
+has been preserved. However, the format on each source pad
+can be different and in that case it must be configured explicitly.
+
+CSID's s_stream is called when any stream is started or stopped.
+It will call configure_streams() that will rewrite IRQ settings to HW.
+When multiple streams are running simultaneously there is an issue
+when writing IRQ settings for one stream while another is still
+running, thus avoid re-writing settings if they were not changed
+in link setup, or by fully powering off the CSID hardware.
+
+Signed-off-by: Milen Mitkov <quic_mmitkov@quicinc.com>
+Reviewed-by: Robert Foss <robert.foss@linaro.org>
+Tested-by: Bryan O'Donoghue <bryan.odonoghue@linaro.org>
+Acked-by: Robert Foss <robert.foss@linaro.org>
+Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
+Stable-dep-of: e655d1ae9703 ("media: qcom: camss: Fix set CSI2_RX_CFG1_VC_MODE when VC is greater than 3")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../platform/qcom/camss/camss-csid-gen2.c | 54 ++++++++++++-------
+ .../media/platform/qcom/camss/camss-csid.c | 44 ++++++++++-----
+ .../media/platform/qcom/camss/camss-csid.h | 11 +++-
+ 3 files changed, 74 insertions(+), 35 deletions(-)
+
+diff --git a/drivers/media/platform/qcom/camss/camss-csid-gen2.c b/drivers/media/platform/qcom/camss/camss-csid-gen2.c
+index 904208f6f9546..2e015e69a6ad6 100644
+--- a/drivers/media/platform/qcom/camss/camss-csid-gen2.c
++++ b/drivers/media/platform/qcom/camss/camss-csid-gen2.c
+@@ -334,13 +334,14 @@ static const struct csid_format csid_formats[] = {
+ },
+ };
+
+-static void csid_configure_stream(struct csid_device *csid, u8 enable)
++static void __csid_configure_stream(struct csid_device *csid, u8 enable, u8 vc)
+ {
+ struct csid_testgen_config *tg = &csid->testgen;
+ u32 val;
+ u32 phy_sel = 0;
+ u8 lane_cnt = csid->phy.lane_cnt;
+- struct v4l2_mbus_framefmt *input_format = &csid->fmt[MSM_CSID_PAD_SRC];
++ /* Source pads matching RDI channels on hardware. Pad 1 -> RDI0, Pad 2 -> RDI1, etc. */
++ struct v4l2_mbus_framefmt *input_format = &csid->fmt[MSM_CSID_PAD_FIRST_SRC + vc];
+ const struct csid_format *format = csid_get_fmt_entry(csid->formats, csid->nformats,
+ input_format->code);
+
+@@ -351,8 +352,7 @@ static void csid_configure_stream(struct csid_device *csid, u8 enable)
+ phy_sel = csid->phy.csiphy_id;
+
+ if (enable) {
+- u8 vc = 0; /* Virtual Channel 0 */
+- u8 dt_id = vc * 4;
++ u8 dt_id = vc;
+
+ if (tg->enabled) {
+ /* configure one DT, infinite frames */
+@@ -392,42 +392,42 @@ static void csid_configure_stream(struct csid_device *csid, u8 enable)
+ val |= format->data_type << RDI_CFG0_DATA_TYPE;
+ val |= vc << RDI_CFG0_VIRTUAL_CHANNEL;
+ val |= dt_id << RDI_CFG0_DT_ID;
+- writel_relaxed(val, csid->base + CSID_RDI_CFG0(0));
++ writel_relaxed(val, csid->base + CSID_RDI_CFG0(vc));
+
+ /* CSID_TIMESTAMP_STB_POST_IRQ */
+ val = 2 << RDI_CFG1_TIMESTAMP_STB_SEL;
+- writel_relaxed(val, csid->base + CSID_RDI_CFG1(0));
++ writel_relaxed(val, csid->base + CSID_RDI_CFG1(vc));
+
+ val = 1;
+- writel_relaxed(val, csid->base + CSID_RDI_FRM_DROP_PERIOD(0));
++ writel_relaxed(val, csid->base + CSID_RDI_FRM_DROP_PERIOD(vc));
+
+ val = 0;
+- writel_relaxed(val, csid->base + CSID_RDI_FRM_DROP_PATTERN(0));
++ writel_relaxed(val, csid->base + CSID_RDI_FRM_DROP_PATTERN(vc));
+
+ val = 1;
+- writel_relaxed(val, csid->base + CSID_RDI_IRQ_SUBSAMPLE_PERIOD(0));
++ writel_relaxed(val, csid->base + CSID_RDI_IRQ_SUBSAMPLE_PERIOD(vc));
+
+ val = 0;
+- writel_relaxed(val, csid->base + CSID_RDI_IRQ_SUBSAMPLE_PATTERN(0));
++ writel_relaxed(val, csid->base + CSID_RDI_IRQ_SUBSAMPLE_PATTERN(vc));
+
+ val = 1;
+- writel_relaxed(val, csid->base + CSID_RDI_RPP_PIX_DROP_PERIOD(0));
++ writel_relaxed(val, csid->base + CSID_RDI_RPP_PIX_DROP_PERIOD(vc));
+
+ val = 0;
+- writel_relaxed(val, csid->base + CSID_RDI_RPP_PIX_DROP_PATTERN(0));
++ writel_relaxed(val, csid->base + CSID_RDI_RPP_PIX_DROP_PATTERN(vc));
+
+ val = 1;
+- writel_relaxed(val, csid->base + CSID_RDI_RPP_LINE_DROP_PERIOD(0));
++ writel_relaxed(val, csid->base + CSID_RDI_RPP_LINE_DROP_PERIOD(vc));
+
+ val = 0;
+- writel_relaxed(val, csid->base + CSID_RDI_RPP_LINE_DROP_PATTERN(0));
++ writel_relaxed(val, csid->base + CSID_RDI_RPP_LINE_DROP_PATTERN(vc));
+
+ val = 0;
+- writel_relaxed(val, csid->base + CSID_RDI_CTRL(0));
++ writel_relaxed(val, csid->base + CSID_RDI_CTRL(vc));
+
+- val = readl_relaxed(csid->base + CSID_RDI_CFG0(0));
++ val = readl_relaxed(csid->base + CSID_RDI_CFG0(vc));
+ val |= 1 << RDI_CFG0_ENABLE;
+- writel_relaxed(val, csid->base + CSID_RDI_CFG0(0));
++ writel_relaxed(val, csid->base + CSID_RDI_CFG0(vc));
+ }
+
+ if (tg->enabled) {
+@@ -453,7 +453,16 @@ static void csid_configure_stream(struct csid_device *csid, u8 enable)
+ val = HALT_CMD_RESUME_AT_FRAME_BOUNDARY << RDI_CTRL_HALT_CMD;
+ else
+ val = HALT_CMD_HALT_AT_FRAME_BOUNDARY << RDI_CTRL_HALT_CMD;
+- writel_relaxed(val, csid->base + CSID_RDI_CTRL(0));
++ writel_relaxed(val, csid->base + CSID_RDI_CTRL(vc));
++}
++
++static void csid_configure_stream(struct csid_device *csid, u8 enable)
++{
++ u8 i;
++ /* Loop through all enabled VCs and configure stream for each */
++ for (i = 0; i < MSM_CSID_MAX_SRC_STREAMS; i++)
++ if (csid->phy.en_vc & BIT(i))
++ __csid_configure_stream(csid, enable, i);
+ }
+
+ static int csid_configure_testgen_pattern(struct csid_device *csid, s32 val)
+@@ -499,6 +508,7 @@ static irqreturn_t csid_isr(int irq, void *dev)
+ struct csid_device *csid = dev;
+ u32 val;
+ u8 reset_done;
++ int i;
+
+ val = readl_relaxed(csid->base + CSID_TOP_IRQ_STATUS);
+ writel_relaxed(val, csid->base + CSID_TOP_IRQ_CLEAR);
+@@ -507,8 +517,12 @@ static irqreturn_t csid_isr(int irq, void *dev)
+ val = readl_relaxed(csid->base + CSID_CSI2_RX_IRQ_STATUS);
+ writel_relaxed(val, csid->base + CSID_CSI2_RX_IRQ_CLEAR);
+
+- val = readl_relaxed(csid->base + CSID_CSI2_RDIN_IRQ_STATUS(0));
+- writel_relaxed(val, csid->base + CSID_CSI2_RDIN_IRQ_CLEAR(0));
++ /* Read and clear IRQ status for each enabled RDI channel */
++ for (i = 0; i < MSM_CSID_MAX_SRC_STREAMS; i++)
++ if (csid->phy.en_vc & BIT(i)) {
++ val = readl_relaxed(csid->base + CSID_CSI2_RDIN_IRQ_STATUS(i));
++ writel_relaxed(val, csid->base + CSID_CSI2_RDIN_IRQ_CLEAR(i));
++ }
+
+ val = 1 << IRQ_CMD_CLEAR;
+ writel_relaxed(val, csid->base + CSID_IRQ_CMD);
+diff --git a/drivers/media/platform/qcom/camss/camss-csid.c b/drivers/media/platform/qcom/camss/camss-csid.c
+index 88f188e0f7501..6360314f04a63 100644
+--- a/drivers/media/platform/qcom/camss/camss-csid.c
++++ b/drivers/media/platform/qcom/camss/camss-csid.c
+@@ -196,6 +196,8 @@ static int csid_set_power(struct v4l2_subdev *sd, int on)
+ return ret;
+ }
+
++ csid->phy.need_vc_update = true;
++
+ enable_irq(csid->irq);
+
+ ret = csid->ops->reset(csid);
+@@ -249,7 +251,10 @@ static int csid_set_stream(struct v4l2_subdev *sd, int enable)
+ return -ENOLINK;
+ }
+
+- csid->ops->configure_stream(csid, enable);
++ if (csid->phy.need_vc_update) {
++ csid->ops->configure_stream(csid, enable);
++ csid->phy.need_vc_update = false;
++ }
+
+ return 0;
+ }
+@@ -460,6 +465,7 @@ static int csid_set_format(struct v4l2_subdev *sd,
+ {
+ struct csid_device *csid = v4l2_get_subdevdata(sd);
+ struct v4l2_mbus_framefmt *format;
++ int i;
+
+ format = __csid_get_format(csid, sd_state, fmt->pad, fmt->which);
+ if (format == NULL)
+@@ -468,14 +474,14 @@ static int csid_set_format(struct v4l2_subdev *sd,
+ csid_try_format(csid, sd_state, fmt->pad, &fmt->format, fmt->which);
+ *format = fmt->format;
+
+- /* Propagate the format from sink to source */
++ /* Propagate the format from sink to source pads */
+ if (fmt->pad == MSM_CSID_PAD_SINK) {
+- format = __csid_get_format(csid, sd_state, MSM_CSID_PAD_SRC,
+- fmt->which);
++ for (i = MSM_CSID_PAD_FIRST_SRC; i < MSM_CSID_PADS_NUM; ++i) {
++ format = __csid_get_format(csid, sd_state, i, fmt->which);
+
+- *format = fmt->format;
+- csid_try_format(csid, sd_state, MSM_CSID_PAD_SRC, format,
+- fmt->which);
++ *format = fmt->format;
++ csid_try_format(csid, sd_state, i, format, fmt->which);
++ }
+ }
+
+ return 0;
+@@ -738,7 +744,6 @@ static int csid_link_setup(struct media_entity *entity,
+ struct csid_device *csid;
+ struct csiphy_device *csiphy;
+ struct csiphy_lanes_cfg *lane_cfg;
+- struct v4l2_subdev_format format = { 0 };
+
+ sd = media_entity_to_v4l2_subdev(entity);
+ csid = v4l2_get_subdevdata(sd);
+@@ -761,11 +766,22 @@ static int csid_link_setup(struct media_entity *entity,
+ lane_cfg = &csiphy->cfg.csi2->lane_cfg;
+ csid->phy.lane_cnt = lane_cfg->num_data;
+ csid->phy.lane_assign = csid_get_lane_assign(lane_cfg);
++ }
++ /* Decide which virtual channels to enable based on which source pads are enabled */
++ if (local->flags & MEDIA_PAD_FL_SOURCE) {
++ struct v4l2_subdev *sd = media_entity_to_v4l2_subdev(entity);
++ struct csid_device *csid = v4l2_get_subdevdata(sd);
++ struct device *dev = csid->camss->dev;
++
++ if (flags & MEDIA_LNK_FL_ENABLED)
++ csid->phy.en_vc |= BIT(local->index - 1);
++ else
++ csid->phy.en_vc &= ~BIT(local->index - 1);
+
+- /* Reset format on source pad to sink pad format */
+- format.pad = MSM_CSID_PAD_SRC;
+- format.which = V4L2_SUBDEV_FORMAT_ACTIVE;
+- csid_set_format(&csid->subdev, NULL, &format);
++ csid->phy.need_vc_update = true;
++
++ dev_dbg(dev, "%s: Enabled CSID virtual channels mask 0x%x\n",
++ __func__, csid->phy.en_vc);
+ }
+
+ return 0;
+@@ -816,6 +832,7 @@ int msm_csid_register_entity(struct csid_device *csid,
+ struct v4l2_subdev *sd = &csid->subdev;
+ struct media_pad *pads = csid->pads;
+ struct device *dev = csid->camss->dev;
++ int i;
+ int ret;
+
+ v4l2_subdev_init(sd, &csid_v4l2_ops);
+@@ -852,7 +869,8 @@ int msm_csid_register_entity(struct csid_device *csid,
+ }
+
+ pads[MSM_CSID_PAD_SINK].flags = MEDIA_PAD_FL_SINK;
+- pads[MSM_CSID_PAD_SRC].flags = MEDIA_PAD_FL_SOURCE;
++ for (i = MSM_CSID_PAD_FIRST_SRC; i < MSM_CSID_PADS_NUM; ++i)
++ pads[i].flags = MEDIA_PAD_FL_SOURCE;
+
+ sd->entity.function = MEDIA_ENT_F_PROC_VIDEO_PIXEL_FORMATTER;
+ sd->entity.ops = &csid_media_ops;
+diff --git a/drivers/media/platform/qcom/camss/camss-csid.h b/drivers/media/platform/qcom/camss/camss-csid.h
+index f06040e44c515..d4b48432a0973 100644
+--- a/drivers/media/platform/qcom/camss/camss-csid.h
++++ b/drivers/media/platform/qcom/camss/camss-csid.h
+@@ -19,8 +19,13 @@
+ #include <media/v4l2-subdev.h>
+
+ #define MSM_CSID_PAD_SINK 0
+-#define MSM_CSID_PAD_SRC 1
+-#define MSM_CSID_PADS_NUM 2
++#define MSM_CSID_PAD_FIRST_SRC 1
++#define MSM_CSID_PADS_NUM 5
++
++#define MSM_CSID_PAD_SRC (MSM_CSID_PAD_FIRST_SRC)
++
++/* CSID hardware can demultiplex up to 4 outputs */
++#define MSM_CSID_MAX_SRC_STREAMS 4
+
+ #define DATA_TYPE_EMBEDDED_DATA_8BIT 0x12
+ #define DATA_TYPE_YUV420_8BIT 0x18
+@@ -81,6 +86,8 @@ struct csid_phy_config {
+ u8 csiphy_id;
+ u8 lane_cnt;
+ u32 lane_assign;
++ u32 en_vc;
++ u8 need_vc_update;
+ };
+
+ struct csid_device;
+--
+2.43.0
+
--- /dev/null
+From b716307f6947508dbb996139baebff85b0be36ae Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 30 Aug 2023 16:16:14 +0100
+Subject: media: qcom: camss: Fix set CSI2_RX_CFG1_VC_MODE when VC is greater
+ than 3
+
+From: Bryan O'Donoghue <bryan.odonoghue@linaro.org>
+
+[ Upstream commit e655d1ae9703286cef7fda8675cad62f649dc183 ]
+
+VC_MODE = 0 implies a two bit VC address.
+VC_MODE = 1 is required for VCs with a larger address than two bits.
+
+Fixes: eebe6d00e9bf ("media: camss: Add support for CSID hardware version Titan 170")
+Cc: stable@vger.kernel.org
+Signed-off-by: Bryan O'Donoghue <bryan.odonoghue@linaro.org>
+Reviewed-by: Laurent Pinchart <laurent.pinchart@ideasonboard.com>
+Signed-off-by: Hans Verkuil <hverkuil-cisco@xs4all.nl>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/media/platform/qcom/camss/camss-csid-gen2.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/drivers/media/platform/qcom/camss/camss-csid-gen2.c b/drivers/media/platform/qcom/camss/camss-csid-gen2.c
+index 2e015e69a6ad6..23acc387be5f0 100644
+--- a/drivers/media/platform/qcom/camss/camss-csid-gen2.c
++++ b/drivers/media/platform/qcom/camss/camss-csid-gen2.c
+@@ -446,6 +446,8 @@ static void __csid_configure_stream(struct csid_device *csid, u8 enable, u8 vc)
+ writel_relaxed(val, csid->base + CSID_CSI2_RX_CFG0);
+
+ val = 1 << CSI2_RX_CFG1_PACKET_ECC_CORRECTION_EN;
++ if (vc > 3)
++ val |= 1 << CSI2_RX_CFG1_VC_MODE;
+ val |= 1 << CSI2_RX_CFG1_MISR_EN;
+ writel_relaxed(val, csid->base + CSID_CSI2_RX_CFG1);
+
+--
+2.43.0
+
--- /dev/null
+From 86430873bd38064e37a7298e400a5f663c4efa25 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 17 Nov 2022 23:30:54 -0800
+Subject: memory-failure: convert truncate_error_page() to use folio
+
+From: Vishal Moola (Oracle) <vishal.moola@gmail.com>
+
+[ Upstream commit ac5efa782041670b63a05c36d92d02a80e50bb63 ]
+
+Replace try_to_release_page() with filemap_release_folio(). This change
+is in preparation for the removal of the try_to_release_page() wrapper.
+
+Link: https://lkml.kernel.org/r/20221118073055.55694-4-vishal.moola@gmail.com
+Signed-off-by: Vishal Moola (Oracle) <vishal.moola@gmail.com>
+Acked-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: Theodore Ts'o <tytso@mit.edu>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Stable-dep-of: 1898efcdbed3 ("block: update the stable_writes flag in bdev_add")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/memory-failure.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/mm/memory-failure.c b/mm/memory-failure.c
+index ebd717157c813..6355166a6bb28 100644
+--- a/mm/memory-failure.c
++++ b/mm/memory-failure.c
+@@ -827,12 +827,13 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
+ int ret = MF_FAILED;
+
+ if (mapping->a_ops->error_remove_page) {
++ struct folio *folio = page_folio(p);
+ int err = mapping->a_ops->error_remove_page(mapping, p);
+
+ if (err != 0) {
+ pr_info("%#lx: Failed to punch page: %d\n", pfn, err);
+- } else if (page_has_private(p) &&
+- !try_to_release_page(p, GFP_NOIO)) {
++ } else if (folio_has_private(folio) &&
++ !filemap_release_folio(folio, GFP_NOIO)) {
+ pr_info("%#lx: failed to release buffers\n", pfn);
+ } else {
+ ret = MF_RECOVERED;
+--
+2.43.0
+
--- /dev/null
+From e38ef647ff2cf5958850b2c4b30eebe83d34dcaf Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 20 Dec 2023 18:47:39 -0500
+Subject: mlxbf_gige: fix receive packet race condition
+
+From: David Thompson <davthompson@nvidia.com>
+
+[ Upstream commit dcea1bd45e6d111cc8fc1aaefa7e31694089bda3 ]
+
+Under heavy traffic, the BlueField Gigabit interface can
+become unresponsive. This is due to a possible race condition
+in the mlxbf_gige_rx_packet function, where the function exits
+with producer and consumer indices equal but there are remaining
+packet(s) to be processed. In order to prevent this situation,
+read receive consumer index *before* the HW replenish so that
+the mlxbf_gige_rx_packet function returns an accurate return
+value even if a packet is received into just-replenished buffer
+prior to exiting this routine. If the just-replenished buffer
+is received and occupies the last RX ring entry, the interface
+would not recover and instead would encounter RX packet drops
+related to internal buffer shortages since the driver RX logic
+is not being triggered to drain the RX ring. This patch will
+address and prevent this "ring full" condition.
+
+Fixes: f92e1869d74e ("Add Mellanox BlueField Gigabit Ethernet driver")
+Reviewed-by: Asmaa Mnebhi <asmaa@nvidia.com>
+Signed-off-by: David Thompson <davthompson@nvidia.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c
+index 0d5a41a2ae010..227d01cace3f0 100644
+--- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c
++++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c
+@@ -267,6 +267,13 @@ static bool mlxbf_gige_rx_packet(struct mlxbf_gige *priv, int *rx_pkts)
+ priv->stats.rx_truncate_errors++;
+ }
+
++ /* Read receive consumer index before replenish so that this routine
++ * returns accurate return value even if packet is received into
++ * just-replenished buffer prior to exiting this routine.
++ */
++ rx_ci = readq(priv->base + MLXBF_GIGE_RX_CQE_PACKET_CI);
++ rx_ci_rem = rx_ci % priv->rx_q_entries;
++
+ /* Let hardware know we've replenished one buffer */
+ rx_pi++;
+
+@@ -279,8 +286,6 @@ static bool mlxbf_gige_rx_packet(struct mlxbf_gige *priv, int *rx_pkts)
+ rx_pi_rem = rx_pi % priv->rx_q_entries;
+ if (rx_pi_rem == 0)
+ priv->valid_polarity ^= 1;
+- rx_ci = readq(priv->base + MLXBF_GIGE_RX_CQE_PACKET_CI);
+- rx_ci_rem = rx_ci % priv->rx_q_entries;
+
+ if (skb)
+ netif_receive_skb(skb);
+--
+2.43.0
+
--- /dev/null
+From 670dabf41eb1dc619547a684c591cbef6598cb48 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 20 Nov 2023 15:53:52 +0100
+Subject: mm/memory_hotplug: add missing mem_hotplug_lock
+
+From: Sumanth Korikkar <sumanthk@linux.ibm.com>
+
+[ Upstream commit 001002e73712cdf6b8d9a103648cda3040ad7647 ]
+
+From Documentation/core-api/memory-hotplug.rst:
+When adding/removing/onlining/offlining memory or adding/removing
+heterogeneous/device memory, we should always hold the mem_hotplug_lock
+in write mode to serialise memory hotplug (e.g. access to global/zone
+variables).
+
+mhp_(de)init_memmap_on_memory() functions can change zone stats and
+struct page content, but they are currently called w/o the
+mem_hotplug_lock.
+
+When a memory block is being offlined and kmemleak goes through each
+populated zone, the following theoretical race conditions could occur:
+CPU 0: | CPU 1:
+memory_offline() |
+-> offline_pages() |
+ -> mem_hotplug_begin() |
+ ... |
+ -> mem_hotplug_done() |
+ | kmemleak_scan()
+ | -> get_online_mems()
+ | ...
+-> mhp_deinit_memmap_on_memory() |
+ [not protected by mem_hotplug_begin/done()]|
+ Marks memory section as offline, | Retrieves zone_start_pfn
+ poisons vmemmap struct pages and updates | and struct page members.
+ the zone related data |
+ | ...
+ | -> put_online_mems()
+
+Fix this by ensuring mem_hotplug_lock is taken before performing
+mhp_init_memmap_on_memory(). Also ensure that
+mhp_deinit_memmap_on_memory() holds the lock.
+
+online/offline_pages() are currently only called from
+memory_block_online/offline(), so it is safe to move the locking there.
+
+Link: https://lkml.kernel.org/r/20231120145354.308999-2-sumanthk@linux.ibm.com
+Fixes: a08a2ae34613 ("mm,memory_hotplug: allocate memmap from the added memory range")
+Signed-off-by: Sumanth Korikkar <sumanthk@linux.ibm.com>
+Reviewed-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Cc: Alexander Gordeev <agordeev@linux.ibm.com>
+Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
+Cc: Anshuman Khandual <anshuman.khandual@arm.com>
+Cc: Heiko Carstens <hca@linux.ibm.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Oscar Salvador <osalvador@suse.de>
+Cc: Vasily Gorbik <gor@linux.ibm.com>
+Cc: kernel test robot <lkp@intel.com>
+Cc: <stable@vger.kernel.org> [5.15+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/base/memory.c | 18 +++++++++++++++---
+ mm/memory_hotplug.c | 13 ++++++-------
+ 2 files changed, 21 insertions(+), 10 deletions(-)
+
+diff --git a/drivers/base/memory.c b/drivers/base/memory.c
+index 9aa0da991cfb9..5d39f3e374dae 100644
+--- a/drivers/base/memory.c
++++ b/drivers/base/memory.c
+@@ -175,6 +175,9 @@ int memory_notify(unsigned long val, void *v)
+ return blocking_notifier_call_chain(&memory_chain, val, v);
+ }
+
++/*
++ * Must acquire mem_hotplug_lock in write mode.
++ */
+ static int memory_block_online(struct memory_block *mem)
+ {
+ unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
+@@ -193,10 +196,11 @@ static int memory_block_online(struct memory_block *mem)
+ * stage helps to keep accounting easier to follow - e.g vmemmaps
+ * belong to the same zone as the memory they backed.
+ */
++ mem_hotplug_begin();
+ if (nr_vmemmap_pages) {
+ ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
+ if (ret)
+- return ret;
++ goto out;
+ }
+
+ ret = online_pages(start_pfn + nr_vmemmap_pages,
+@@ -204,7 +208,7 @@ static int memory_block_online(struct memory_block *mem)
+ if (ret) {
+ if (nr_vmemmap_pages)
+ mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
+- return ret;
++ goto out;
+ }
+
+ /*
+@@ -216,9 +220,14 @@ static int memory_block_online(struct memory_block *mem)
+ nr_vmemmap_pages);
+
+ mem->zone = zone;
++out:
++ mem_hotplug_done();
+ return ret;
+ }
+
++/*
++ * Must acquire mem_hotplug_lock in write mode.
++ */
+ static int memory_block_offline(struct memory_block *mem)
+ {
+ unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr);
+@@ -233,6 +242,7 @@ static int memory_block_offline(struct memory_block *mem)
+ * Unaccount before offlining, such that unpopulated zone and kthreads
+ * can properly be torn down in offline_pages().
+ */
++ mem_hotplug_begin();
+ if (nr_vmemmap_pages)
+ adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
+ -nr_vmemmap_pages);
+@@ -244,13 +254,15 @@ static int memory_block_offline(struct memory_block *mem)
+ if (nr_vmemmap_pages)
+ adjust_present_page_count(pfn_to_page(start_pfn),
+ mem->group, nr_vmemmap_pages);
+- return ret;
++ goto out;
+ }
+
+ if (nr_vmemmap_pages)
+ mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
+
+ mem->zone = NULL;
++out:
++ mem_hotplug_done();
+ return ret;
+ }
+
+diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
+index bd2570b4f9b7b..d02722bbfcf33 100644
+--- a/mm/memory_hotplug.c
++++ b/mm/memory_hotplug.c
+@@ -1069,6 +1069,9 @@ void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages)
+ kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
+ }
+
++/*
++ * Must be called with mem_hotplug_lock in write mode.
++ */
+ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
+ struct zone *zone, struct memory_group *group)
+ {
+@@ -1089,7 +1092,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
+ !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION)))
+ return -EINVAL;
+
+- mem_hotplug_begin();
+
+ /* associate pfn range with the zone */
+ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
+@@ -1148,7 +1150,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
+ writeback_set_ratelimit();
+
+ memory_notify(MEM_ONLINE, &arg);
+- mem_hotplug_done();
+ return 0;
+
+ failed_addition:
+@@ -1157,7 +1158,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
+ (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
+ memory_notify(MEM_CANCEL_ONLINE, &arg);
+ remove_pfn_range_from_zone(zone, pfn, nr_pages);
+- mem_hotplug_done();
+ return ret;
+ }
+
+@@ -1787,6 +1787,9 @@ static int count_system_ram_pages_cb(unsigned long start_pfn,
+ return 0;
+ }
+
++/*
++ * Must be called with mem_hotplug_lock in write mode.
++ */
+ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
+ struct zone *zone, struct memory_group *group)
+ {
+@@ -1809,8 +1812,6 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
+ !IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION)))
+ return -EINVAL;
+
+- mem_hotplug_begin();
+-
+ /*
+ * Don't allow to offline memory blocks that contain holes.
+ * Consequently, memory blocks with holes can never get onlined
+@@ -1946,7 +1947,6 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
+
+ memory_notify(MEM_OFFLINE, &arg);
+ remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
+- mem_hotplug_done();
+ return 0;
+
+ failed_removal_isolated:
+@@ -1961,7 +1961,6 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
+ (unsigned long long) start_pfn << PAGE_SHIFT,
+ ((unsigned long long) end_pfn << PAGE_SHIFT) - 1,
+ reason);
+- mem_hotplug_done();
+ return ret;
+ }
+
+--
+2.43.0
+
--- /dev/null
+From 9345b30fdfb2604449065987afce0aa558347408 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 20 Nov 2023 15:53:53 +0100
+Subject: mm/memory_hotplug: fix error handling in add_memory_resource()
+
+From: Sumanth Korikkar <sumanthk@linux.ibm.com>
+
+[ Upstream commit f42ce5f087eb69e47294ababd2e7e6f88a82d308 ]
+
+In add_memory_resource(), creation of memory block devices occurs after
+successful call to arch_add_memory(). However, creation of memory block
+devices could fail. In that case, arch_remove_memory() is called to
+perform necessary cleanup.
+
+Currently with or without altmap support, arch_remove_memory() is always
+passed with altmap set to NULL during error handling. This leads to
+freeing of struct pages using free_pages(), even though the allocation
+might have been performed with altmap support via
+altmap_alloc_block_buf().
+
+Fix the error handling by passing altmap in arch_remove_memory(). This
+ensures the following:
+* When altmap is disabled, deallocation of the struct pages array occurs
+ via free_pages().
+* When altmap is enabled, deallocation occurs via vmem_altmap_free().
+
+Link: https://lkml.kernel.org/r/20231120145354.308999-3-sumanthk@linux.ibm.com
+Fixes: a08a2ae34613 ("mm,memory_hotplug: allocate memmap from the added memory range")
+Signed-off-by: Sumanth Korikkar <sumanthk@linux.ibm.com>
+Reviewed-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Cc: Alexander Gordeev <agordeev@linux.ibm.com>
+Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
+Cc: Anshuman Khandual <anshuman.khandual@arm.com>
+Cc: Heiko Carstens <hca@linux.ibm.com>
+Cc: kernel test robot <lkp@intel.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Oscar Salvador <osalvador@suse.de>
+Cc: Vasily Gorbik <gor@linux.ibm.com>
+Cc: <stable@vger.kernel.org> [5.15+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/memory_hotplug.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
+index d02722bbfcf33..3b9d3a4b43869 100644
+--- a/mm/memory_hotplug.c
++++ b/mm/memory_hotplug.c
+@@ -1382,7 +1382,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
+ ret = create_memory_block_devices(start, size, mhp_altmap.alloc,
+ group);
+ if (ret) {
+- arch_remove_memory(start, size, NULL);
++ arch_remove_memory(start, size, params.altmap);
+ goto error;
+ }
+
+--
+2.43.0
+
--- /dev/null
+From 060289f8c5d7dc83b3980d57bc014879b377c9a9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 28 Jun 2023 11:48:51 +0100
+Subject: mm: merge folio_has_private()/filemap_release_folio() call pairs
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit 0201ebf274a306a6ebb95e5dc2d6a0a27c737cac ]
+
+Patch series "mm, netfs, fscache: Stop read optimisation when folio
+removed from pagecache", v7.
+
+This fixes an optimisation in fscache whereby we don't read from the cache
+for a particular file until we know that there's data there that we don't
+have in the pagecache. The problem is that I'm no longer using PG_fscache
+(aka PG_private_2) to indicate that the page is cached and so I don't get
+a notification when a cached page is dropped from the pagecache.
+
+The first patch merges some folio_has_private() and
+filemap_release_folio() pairs and introduces a helper,
+folio_needs_release(), to indicate if a release is required.
+
+The second patch is the actual fix. Following Willy's suggestions[1], it
+adds an AS_RELEASE_ALWAYS flag to an address_space that will make
+filemap_release_folio() always call ->release_folio(), even if
+PG_private/PG_private_2 aren't set. folio_needs_release() is altered to
+add a check for this.
+
+This patch (of 2):
+
+Make filemap_release_folio() check folio_has_private(). Then, in most
+cases, where a call to folio_has_private() is immediately followed by a
+call to filemap_release_folio(), we can get rid of the test in the pair.
+
+There are a couple of sites in mm/vmscan.c that this can't so easily be
+done. In shrink_folio_list(), there are actually three cases (something
+different is done for incompletely invalidated buffers), but
+filemap_release_folio() elides two of them.
+
+In shrink_active_list(), we don't have the folio lock yet, so the
+check allows us to avoid locking the page unnecessarily.
+
+A wrapper function to check if a folio needs release is provided for those
+places that still need to do it in the mm/ directory. This will acquire
+additional parts to the condition in a future patch.
+
+After this, the only remaining caller of folio_has_private() outside of
+mm/ is a check in fuse.
+
+Link: https://lkml.kernel.org/r/20230628104852.3391651-1-dhowells@redhat.com
+Link: https://lkml.kernel.org/r/20230628104852.3391651-2-dhowells@redhat.com
+Reported-by: Rohith Surabattula <rohiths.msft@gmail.com>
+Suggested-by: Matthew Wilcox <willy@infradead.org>
+Signed-off-by: David Howells <dhowells@redhat.com>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Steve French <sfrench@samba.org>
+Cc: Shyam Prasad N <nspmangalore@gmail.com>
+Cc: Rohith Surabattula <rohiths.msft@gmail.com>
+Cc: Dave Wysochanski <dwysocha@redhat.com>
+Cc: Dominique Martinet <asmadeus@codewreck.org>
+Cc: Ilya Dryomov <idryomov@gmail.com>
+Cc: "Theodore Ts'o" <tytso@mit.edu>
+Cc: Andreas Dilger <adilger.kernel@dilger.ca>
+Cc: Xiubo Li <xiubli@redhat.com>
+Cc: Jingbo Xu <jefflexu@linux.alibaba.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Stable-dep-of: 1898efcdbed3 ("block: update the stable_writes flag in bdev_add")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/ext4/move_extent.c | 12 ++++--------
+ fs/splice.c | 3 +--
+ mm/filemap.c | 2 ++
+ mm/huge_memory.c | 3 +--
+ mm/internal.h | 8 ++++++++
+ mm/khugepaged.c | 3 +--
+ mm/memory-failure.c | 8 +++-----
+ mm/migrate.c | 3 +--
+ mm/truncate.c | 6 ++----
+ mm/vmscan.c | 8 ++++----
+ 10 files changed, 27 insertions(+), 29 deletions(-)
+
+diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
+index 8dbb87edf24c4..dedc9d445f243 100644
+--- a/fs/ext4/move_extent.c
++++ b/fs/ext4/move_extent.c
+@@ -339,10 +339,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
+ ext4_double_up_write_data_sem(orig_inode, donor_inode);
+ goto data_copy;
+ }
+- if ((folio_has_private(folio[0]) &&
+- !filemap_release_folio(folio[0], 0)) ||
+- (folio_has_private(folio[1]) &&
+- !filemap_release_folio(folio[1], 0))) {
++ if (!filemap_release_folio(folio[0], 0) ||
++ !filemap_release_folio(folio[1], 0)) {
+ *err = -EBUSY;
+ goto drop_data_sem;
+ }
+@@ -361,10 +359,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
+
+ /* At this point all buffers in range are uptodate, old mapping layout
+ * is no longer required, try to drop it now. */
+- if ((folio_has_private(folio[0]) &&
+- !filemap_release_folio(folio[0], 0)) ||
+- (folio_has_private(folio[1]) &&
+- !filemap_release_folio(folio[1], 0))) {
++ if (!filemap_release_folio(folio[0], 0) ||
++ !filemap_release_folio(folio[1], 0)) {
+ *err = -EBUSY;
+ goto unlock_folios;
+ }
+diff --git a/fs/splice.c b/fs/splice.c
+index c4ae54deac42c..d0230cf8ec571 100644
+--- a/fs/splice.c
++++ b/fs/splice.c
+@@ -65,8 +65,7 @@ static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
+ */
+ folio_wait_writeback(folio);
+
+- if (folio_has_private(folio) &&
+- !filemap_release_folio(folio, GFP_KERNEL))
++ if (!filemap_release_folio(folio, GFP_KERNEL))
+ goto out_unlock;
+
+ /*
+diff --git a/mm/filemap.c b/mm/filemap.c
+index 10fe6430693bd..2809b1174f04e 100644
+--- a/mm/filemap.c
++++ b/mm/filemap.c
+@@ -4005,6 +4005,8 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp)
+ struct address_space * const mapping = folio->mapping;
+
+ BUG_ON(!folio_test_locked(folio));
++ if (!folio_needs_release(folio))
++ return true;
+ if (folio_test_writeback(folio))
+ return false;
+
+diff --git a/mm/huge_memory.c b/mm/huge_memory.c
+index 2753fb54cdf38..59577946735b1 100644
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -2694,8 +2694,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
+ gfp = current_gfp_context(mapping_gfp_mask(mapping) &
+ GFP_RECLAIM_MASK);
+
+- if (folio_test_private(folio) &&
+- !filemap_release_folio(folio, gfp)) {
++ if (!filemap_release_folio(folio, gfp)) {
+ ret = -EBUSY;
+ goto out;
+ }
+diff --git a/mm/internal.h b/mm/internal.h
+index 6b7ef495b56d3..1fefb5181ab78 100644
+--- a/mm/internal.h
++++ b/mm/internal.h
+@@ -163,6 +163,14 @@ static inline void set_page_refcounted(struct page *page)
+ set_page_count(page, 1);
+ }
+
++/*
++ * Return true if a folio needs ->release_folio() calling upon it.
++ */
++static inline bool folio_needs_release(struct folio *folio)
++{
++ return folio_has_private(folio);
++}
++
+ extern unsigned long highest_memmap_pfn;
+
+ /*
+diff --git a/mm/khugepaged.c b/mm/khugepaged.c
+index 6fc7db587c453..65bd0b105266a 100644
+--- a/mm/khugepaged.c
++++ b/mm/khugepaged.c
+@@ -1955,8 +1955,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
+ goto out_unlock;
+ }
+
+- if (folio_has_private(folio) &&
+- !filemap_release_folio(folio, GFP_KERNEL)) {
++ if (!filemap_release_folio(folio, GFP_KERNEL)) {
+ result = SCAN_PAGE_HAS_PRIVATE;
+ folio_putback_lru(folio);
+ goto out_unlock;
+diff --git a/mm/memory-failure.c b/mm/memory-failure.c
+index 6355166a6bb28..5b846ed5dcbe9 100644
+--- a/mm/memory-failure.c
++++ b/mm/memory-failure.c
+@@ -830,14 +830,12 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
+ struct folio *folio = page_folio(p);
+ int err = mapping->a_ops->error_remove_page(mapping, p);
+
+- if (err != 0) {
++ if (err != 0)
+ pr_info("%#lx: Failed to punch page: %d\n", pfn, err);
+- } else if (folio_has_private(folio) &&
+- !filemap_release_folio(folio, GFP_NOIO)) {
++ else if (!filemap_release_folio(folio, GFP_NOIO))
+ pr_info("%#lx: failed to release buffers\n", pfn);
+- } else {
++ else
+ ret = MF_RECOVERED;
+- }
+ } else {
+ /*
+ * If the file system doesn't support it just invalidate
+diff --git a/mm/migrate.c b/mm/migrate.c
+index 91bd69c61148e..c93dd6a31c31a 100644
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -914,8 +914,7 @@ static int fallback_migrate_folio(struct address_space *mapping,
+ * Buffers may be managed in a filesystem specific way.
+ * We must have no buffers or drop them.
+ */
+- if (folio_test_private(src) &&
+- !filemap_release_folio(src, GFP_KERNEL))
++ if (!filemap_release_folio(src, GFP_KERNEL))
+ return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
+
+ return migrate_folio(mapping, dst, src, mode);
+diff --git a/mm/truncate.c b/mm/truncate.c
+index c0be77e5c0083..0d4dd233f5187 100644
+--- a/mm/truncate.c
++++ b/mm/truncate.c
+@@ -19,7 +19,6 @@
+ #include <linux/highmem.h>
+ #include <linux/pagevec.h>
+ #include <linux/task_io_accounting_ops.h>
+-#include <linux/buffer_head.h> /* grr. try_to_release_page */
+ #include <linux/shmem_fs.h>
+ #include <linux/rmap.h>
+ #include "internal.h"
+@@ -276,7 +275,7 @@ static long mapping_evict_folio(struct address_space *mapping,
+ if (folio_ref_count(folio) >
+ folio_nr_pages(folio) + folio_has_private(folio) + 1)
+ return 0;
+- if (folio_has_private(folio) && !filemap_release_folio(folio, 0))
++ if (!filemap_release_folio(folio, 0))
+ return 0;
+
+ return remove_mapping(mapping, folio);
+@@ -581,8 +580,7 @@ static int invalidate_complete_folio2(struct address_space *mapping,
+ if (folio->mapping != mapping)
+ return 0;
+
+- if (folio_has_private(folio) &&
+- !filemap_release_folio(folio, GFP_KERNEL))
++ if (!filemap_release_folio(folio, GFP_KERNEL))
+ return 0;
+
+ spin_lock(&mapping->host->i_lock);
+diff --git a/mm/vmscan.c b/mm/vmscan.c
+index 3f090faa6377f..9f3cfb7caa48d 100644
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -1992,7 +1992,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
+ * (refcount == 1) it can be freed. Otherwise, leave
+ * the folio on the LRU so it is swappable.
+ */
+- if (folio_has_private(folio)) {
++ if (folio_needs_release(folio)) {
+ if (!filemap_release_folio(folio, sc->gfp_mask))
+ goto activate_locked;
+ if (!mapping && folio_ref_count(folio) == 1) {
+@@ -2618,9 +2618,9 @@ static void shrink_active_list(unsigned long nr_to_scan,
+ }
+
+ if (unlikely(buffer_heads_over_limit)) {
+- if (folio_test_private(folio) && folio_trylock(folio)) {
+- if (folio_test_private(folio))
+- filemap_release_folio(folio, 0);
++ if (folio_needs_release(folio) &&
++ folio_trylock(folio)) {
++ filemap_release_folio(folio, 0);
+ folio_unlock(folio);
+ }
+ }
+--
+2.43.0
+
--- /dev/null
+From ed65a1b09f78fea9d521a21c25bb036dc802af12 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 28 Jun 2023 11:48:52 +0100
+Subject: mm, netfs, fscache: stop read optimisation when folio removed from
+ pagecache
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit b4fa966f03b7401ceacd4ffd7227197afb2b8376 ]
+
+Fscache has an optimisation by which reads from the cache are skipped
+until we know that (a) there's data there to be read and (b) that data
+isn't entirely covered by pages resident in the netfs pagecache. This is
+done with two flags manipulated by fscache_note_page_release():
+
+ if (...
+ test_bit(FSCACHE_COOKIE_HAVE_DATA, &cookie->flags) &&
+ test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags))
+ clear_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags);
+
+where the NO_DATA_TO_READ flag causes cachefiles_prepare_read() to
+indicate that netfslib should download from the server or clear the page
+instead.
+
+The fscache_note_page_release() function is intended to be called from
+->releasepage() - but that only gets called if PG_private or PG_private_2
+is set - and currently the former is at the discretion of the network
+filesystem and the latter is only set whilst a page is being written to
+the cache, so sometimes we miss clearing the optimisation.
+
+Fix this by following Willy's suggestion[1] and adding an address_space
+flag, AS_RELEASE_ALWAYS, that causes filemap_release_folio() to always call
+->release_folio() if it's set, even if PG_private or PG_private_2 aren't
+set.
+
+Note that this would require folio_test_private() and page_has_private() to
+become more complicated. To avoid that, in the places[*] where these are
+used to conditionalise calls to filemap_release_folio() and
+try_to_release_page(), the tests are removed and those functions are just
+jumped to unconditionally, with the test being performed there instead.
+
+[*] There are some exceptions in vmscan.c where the check guards more than
+just a call to the releaser. I've added a function, folio_needs_release()
+to wrap all the checks for that.
+
+AS_RELEASE_ALWAYS should be set if a non-NULL cookie is obtained from
+fscache and cleared in ->evict_inode() before truncate_inode_pages_final()
+is called.
+
+Additionally, the FSCACHE_COOKIE_NO_DATA_TO_READ flag needs to be cleared
+and the optimisation cancelled if a cachefiles object already contains data
+when we open it.
+
+[dwysocha@redhat.com: call folio_mapping() inside folio_needs_release()]
+ Link: https://github.com/DaveWysochanskiRH/kernel/commit/902c990e311120179fa5de99d68364b2947b79ec
+Link: https://lkml.kernel.org/r/20230628104852.3391651-3-dhowells@redhat.com
+Fixes: 1f67e6d0b188 ("fscache: Provide a function to note the release of a page")
+Fixes: 047487c947e8 ("cachefiles: Implement the I/O routines")
+Signed-off-by: David Howells <dhowells@redhat.com>
+Signed-off-by: Dave Wysochanski <dwysocha@redhat.com>
+Reported-by: Rohith Surabattula <rohiths.msft@gmail.com>
+Suggested-by: Matthew Wilcox <willy@infradead.org>
+Tested-by: SeongJae Park <sj@kernel.org>
+Cc: Daire Byrne <daire.byrne@gmail.com>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Steve French <sfrench@samba.org>
+Cc: Shyam Prasad N <nspmangalore@gmail.com>
+Cc: Rohith Surabattula <rohiths.msft@gmail.com>
+Cc: Dave Wysochanski <dwysocha@redhat.com>
+Cc: Dominique Martinet <asmadeus@codewreck.org>
+Cc: Ilya Dryomov <idryomov@gmail.com>
+Cc: Andreas Dilger <adilger.kernel@dilger.ca>
+Cc: Jingbo Xu <jefflexu@linux.alibaba.com>
+Cc: "Theodore Ts'o" <tytso@mit.edu>
+Cc: Xiubo Li <xiubli@redhat.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Stable-dep-of: 1898efcdbed3 ("block: update the stable_writes flag in bdev_add")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/9p/cache.c | 2 ++
+ fs/afs/internal.h | 2 ++
+ fs/cachefiles/namei.c | 2 ++
+ fs/ceph/cache.c | 2 ++
+ fs/nfs/fscache.c | 3 +++
+ fs/smb/client/fscache.c | 2 ++
+ include/linux/pagemap.h | 16 ++++++++++++++++
+ mm/internal.h | 5 ++++-
+ 8 files changed, 33 insertions(+), 1 deletion(-)
+
+diff --git a/fs/9p/cache.c b/fs/9p/cache.c
+index cebba4eaa0b57..12c0ae29f1857 100644
+--- a/fs/9p/cache.c
++++ b/fs/9p/cache.c
+@@ -68,6 +68,8 @@ void v9fs_cache_inode_get_cookie(struct inode *inode)
+ &path, sizeof(path),
+ &version, sizeof(version),
+ i_size_read(&v9inode->netfs.inode));
++ if (v9inode->netfs.cache)
++ mapping_set_release_always(inode->i_mapping);
+
+ p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n",
+ inode, v9fs_inode_cookie(v9inode));
+diff --git a/fs/afs/internal.h b/fs/afs/internal.h
+index fcbb598d8c85d..a25fdc3e52310 100644
+--- a/fs/afs/internal.h
++++ b/fs/afs/internal.h
+@@ -682,6 +682,8 @@ static inline void afs_vnode_set_cache(struct afs_vnode *vnode,
+ {
+ #ifdef CONFIG_AFS_FSCACHE
+ vnode->netfs.cache = cookie;
++ if (cookie)
++ mapping_set_release_always(vnode->netfs.inode.i_mapping);
+ #endif
+ }
+
+diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
+index 03ca8f2f657ab..50b2ee163af60 100644
+--- a/fs/cachefiles/namei.c
++++ b/fs/cachefiles/namei.c
+@@ -584,6 +584,8 @@ static bool cachefiles_open_file(struct cachefiles_object *object,
+ if (ret < 0)
+ goto check_failed;
+
++ clear_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &object->cookie->flags);
++
+ object->file = file;
+
+ /* Always update the atime on an object we've just looked up (this is
+diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
+index 177d8e8d73fe4..de1dee46d3df7 100644
+--- a/fs/ceph/cache.c
++++ b/fs/ceph/cache.c
+@@ -36,6 +36,8 @@ void ceph_fscache_register_inode_cookie(struct inode *inode)
+ &ci->i_vino, sizeof(ci->i_vino),
+ &ci->i_version, sizeof(ci->i_version),
+ i_size_read(inode));
++ if (ci->netfs.cache)
++ mapping_set_release_always(inode->i_mapping);
+ }
+
+ void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info *ci)
+diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
+index e731c00a9fcbc..d3c938dd2b12a 100644
+--- a/fs/nfs/fscache.c
++++ b/fs/nfs/fscache.c
+@@ -176,6 +176,9 @@ void nfs_fscache_init_inode(struct inode *inode)
+ &auxdata, /* aux_data */
+ sizeof(auxdata),
+ i_size_read(inode));
++
++ if (netfs_inode(inode)->cache)
++ mapping_set_release_always(inode->i_mapping);
+ }
+
+ /*
+diff --git a/fs/smb/client/fscache.c b/fs/smb/client/fscache.c
+index e73625b5d0cc6..f64bad513ba6d 100644
+--- a/fs/smb/client/fscache.c
++++ b/fs/smb/client/fscache.c
+@@ -108,6 +108,8 @@ void cifs_fscache_get_inode_cookie(struct inode *inode)
+ &cifsi->uniqueid, sizeof(cifsi->uniqueid),
+ &cd, sizeof(cd),
+ i_size_read(&cifsi->netfs.inode));
++ if (cifsi->netfs.cache)
++ mapping_set_release_always(inode->i_mapping);
+ }
+
+ void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update)
+diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
+index 03307b72de6c6..fdbb90ae56c70 100644
+--- a/include/linux/pagemap.h
++++ b/include/linux/pagemap.h
+@@ -199,6 +199,7 @@ enum mapping_flags {
+ /* writeback related tags are not used */
+ AS_NO_WRITEBACK_TAGS = 5,
+ AS_LARGE_FOLIO_SUPPORT = 6,
++ AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */
+ };
+
+ /**
+@@ -269,6 +270,21 @@ static inline int mapping_use_writeback_tags(struct address_space *mapping)
+ return !test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags);
+ }
+
++static inline bool mapping_release_always(const struct address_space *mapping)
++{
++ return test_bit(AS_RELEASE_ALWAYS, &mapping->flags);
++}
++
++static inline void mapping_set_release_always(struct address_space *mapping)
++{
++ set_bit(AS_RELEASE_ALWAYS, &mapping->flags);
++}
++
++static inline void mapping_clear_release_always(struct address_space *mapping)
++{
++ clear_bit(AS_RELEASE_ALWAYS, &mapping->flags);
++}
++
+ static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
+ {
+ return mapping->gfp_mask;
+diff --git a/mm/internal.h b/mm/internal.h
+index 1fefb5181ab78..d01130efce5fb 100644
+--- a/mm/internal.h
++++ b/mm/internal.h
+@@ -168,7 +168,10 @@ static inline void set_page_refcounted(struct page *page)
+ */
+ static inline bool folio_needs_release(struct folio *folio)
+ {
+- return folio_has_private(folio);
++ struct address_space *mapping = folio_mapping(folio);
++
++ return folio_has_private(folio) ||
++ (mapping && mapping_release_always(mapping));
+ }
+
+ extern unsigned long highest_memmap_pfn;
+--
+2.43.0
+
--- /dev/null
+From 1b799e9a0670b2cf155f5463f9b42e791668abaa Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 31 Aug 2023 13:52:12 +0000
+Subject: net: annotate data-races around sk->sk_bind_phc
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit 251cd405a9e6e70b92fe5afbdd17fd5caf9d3266 ]
+
+sk->sk_bind_phc is read locklessly. Add corresponding annotations.
+
+Fixes: d463126e23f1 ("net: sock: extend SO_TIMESTAMPING for PHC binding")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Cc: Yangbo Lu <yangbo.lu@nxp.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: 7f6ca95d16b9 ("net: Implement missing getsockopt(SO_TIMESTAMPING_NEW)")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/core/sock.c | 4 ++--
+ net/socket.c | 2 +-
+ 2 files changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/net/core/sock.c b/net/core/sock.c
+index 929055bc0cc7b..49b7f252ddae4 100644
+--- a/net/core/sock.c
++++ b/net/core/sock.c
+@@ -890,7 +890,7 @@ static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
+ if (!match)
+ return -EINVAL;
+
+- sk->sk_bind_phc = phc_index;
++ WRITE_ONCE(sk->sk_bind_phc, phc_index);
+
+ return 0;
+ }
+@@ -1706,7 +1706,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
+ case SO_TIMESTAMPING_OLD:
+ lv = sizeof(v.timestamping);
+ v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
+- v.timestamping.bind_phc = sk->sk_bind_phc;
++ v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
+ break;
+
+ case SO_RCVTIMEO_OLD:
+diff --git a/net/socket.c b/net/socket.c
+index 9c1fb94b12851..07470724e7358 100644
+--- a/net/socket.c
++++ b/net/socket.c
+@@ -940,7 +940,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
+
+ if (tsflags & SOF_TIMESTAMPING_BIND_PHC)
+ hwtstamp = ptp_convert_timestamp(&hwtstamp,
+- sk->sk_bind_phc);
++ READ_ONCE(sk->sk_bind_phc));
+
+ if (ktime_to_timespec64_cond(hwtstamp, tss.ts + 2)) {
+ empty = 0;
+--
+2.43.0
+
--- /dev/null
+From e1f7cc7fc59e4d300f8a27e6ce20ed53893823db Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 31 Aug 2023 13:52:11 +0000
+Subject: net: annotate data-races around sk->sk_tsflags
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit e3390b30a5dfb112e8e802a59c0f68f947b638b2 ]
+
+sk->sk_tsflags can be read locklessly, add corresponding annotations.
+
+Fixes: b9f40e21ef42 ("net-timestamp: move timestamp flags out of sk_flags")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Cc: Willem de Bruijn <willemb@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: 7f6ca95d16b9 ("net: Implement missing getsockopt(SO_TIMESTAMPING_NEW)")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/ip.h | 2 +-
+ include/net/sock.h | 17 ++++++++++-------
+ net/can/j1939/socket.c | 10 ++++++----
+ net/core/skbuff.c | 10 ++++++----
+ net/core/sock.c | 4 ++--
+ net/ipv4/ip_output.c | 2 +-
+ net/ipv4/ip_sockglue.c | 2 +-
+ net/ipv4/tcp.c | 4 ++--
+ net/ipv6/ip6_output.c | 2 +-
+ net/ipv6/ping.c | 2 +-
+ net/ipv6/raw.c | 2 +-
+ net/ipv6/udp.c | 2 +-
+ net/socket.c | 13 +++++++------
+ 13 files changed, 40 insertions(+), 32 deletions(-)
+
+diff --git a/include/net/ip.h b/include/net/ip.h
+index c286344628dba..c83c09c65623f 100644
+--- a/include/net/ip.h
++++ b/include/net/ip.h
+@@ -95,7 +95,7 @@ static inline void ipcm_init_sk(struct ipcm_cookie *ipcm,
+ ipcm_init(ipcm);
+
+ ipcm->sockc.mark = READ_ONCE(inet->sk.sk_mark);
+- ipcm->sockc.tsflags = inet->sk.sk_tsflags;
++ ipcm->sockc.tsflags = READ_ONCE(inet->sk.sk_tsflags);
+ ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if);
+ ipcm->addr = inet->inet_saddr;
+ ipcm->protocol = inet->inet_num;
+diff --git a/include/net/sock.h b/include/net/sock.h
+index b6027b01c2455..d8ed62a8e1a3e 100644
+--- a/include/net/sock.h
++++ b/include/net/sock.h
+@@ -1928,7 +1928,9 @@ struct sockcm_cookie {
+ static inline void sockcm_init(struct sockcm_cookie *sockc,
+ const struct sock *sk)
+ {
+- *sockc = (struct sockcm_cookie) { .tsflags = sk->sk_tsflags };
++ *sockc = (struct sockcm_cookie) {
++ .tsflags = READ_ONCE(sk->sk_tsflags)
++ };
+ }
+
+ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
+@@ -2741,9 +2743,9 @@ void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk,
+ static inline void
+ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
+ {
+- ktime_t kt = skb->tstamp;
+ struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb);
+-
++ u32 tsflags = READ_ONCE(sk->sk_tsflags);
++ ktime_t kt = skb->tstamp;
+ /*
+ * generate control messages if
+ * - receive time stamping in software requested
+@@ -2751,10 +2753,10 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
+ * - hardware time stamps available and wanted
+ */
+ if (sock_flag(sk, SOCK_RCVTSTAMP) ||
+- (sk->sk_tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) ||
+- (kt && sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) ||
++ (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) ||
++ (kt && tsflags & SOF_TIMESTAMPING_SOFTWARE) ||
+ (hwtstamps->hwtstamp &&
+- (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)))
++ (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)))
+ __sock_recv_timestamp(msg, sk, skb);
+ else
+ sock_write_timestamp(sk, kt);
+@@ -2776,7 +2778,8 @@ static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk,
+ #define TSFLAGS_ANY (SOF_TIMESTAMPING_SOFTWARE | \
+ SOF_TIMESTAMPING_RAW_HARDWARE)
+
+- if (sk->sk_flags & FLAGS_RECV_CMSGS || sk->sk_tsflags & TSFLAGS_ANY)
++ if (sk->sk_flags & FLAGS_RECV_CMSGS ||
++ READ_ONCE(sk->sk_tsflags) & TSFLAGS_ANY)
+ __sock_recv_cmsgs(msg, sk, skb);
+ else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP)))
+ sock_write_timestamp(sk, skb->tstamp);
+diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c
+index 9c828067b4481..b0be23559243c 100644
+--- a/net/can/j1939/socket.c
++++ b/net/can/j1939/socket.c
+@@ -974,6 +974,7 @@ static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk,
+ struct sock_exterr_skb *serr;
+ struct sk_buff *skb;
+ char *state = "UNK";
++ u32 tsflags;
+ int err;
+
+ jsk = j1939_sk(sk);
+@@ -981,13 +982,14 @@ static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk,
+ if (!(jsk->state & J1939_SOCK_ERRQUEUE))
+ return;
+
++ tsflags = READ_ONCE(sk->sk_tsflags);
+ switch (type) {
+ case J1939_ERRQUEUE_TX_ACK:
+- if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK))
++ if (!(tsflags & SOF_TIMESTAMPING_TX_ACK))
+ return;
+ break;
+ case J1939_ERRQUEUE_TX_SCHED:
+- if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED))
++ if (!(tsflags & SOF_TIMESTAMPING_TX_SCHED))
+ return;
+ break;
+ case J1939_ERRQUEUE_TX_ABORT:
+@@ -997,7 +999,7 @@ static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk,
+ case J1939_ERRQUEUE_RX_DPO:
+ fallthrough;
+ case J1939_ERRQUEUE_RX_ABORT:
+- if (!(sk->sk_tsflags & SOF_TIMESTAMPING_RX_SOFTWARE))
++ if (!(tsflags & SOF_TIMESTAMPING_RX_SOFTWARE))
+ return;
+ break;
+ default:
+@@ -1054,7 +1056,7 @@ static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk,
+ }
+
+ serr->opt_stats = true;
+- if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
++ if (tsflags & SOF_TIMESTAMPING_OPT_ID)
+ serr->ee.ee_data = session->tskey;
+
+ netdev_dbg(session->priv->ndev, "%s: 0x%p tskey: %i, state: %s\n",
+diff --git a/net/core/skbuff.c b/net/core/skbuff.c
+index 73b1e0e53534e..8a819d0a7bfb0 100644
+--- a/net/core/skbuff.c
++++ b/net/core/skbuff.c
+@@ -4913,7 +4913,7 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb,
+ serr->ee.ee_info = tstype;
+ serr->opt_stats = opt_stats;
+ serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0;
+- if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
++ if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
+ serr->ee.ee_data = skb_shinfo(skb)->tskey;
+ if (sk_is_tcp(sk))
+ serr->ee.ee_data -= atomic_read(&sk->sk_tskey);
+@@ -4969,21 +4969,23 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
+ {
+ struct sk_buff *skb;
+ bool tsonly, opt_stats = false;
++ u32 tsflags;
+
+ if (!sk)
+ return;
+
+- if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
++ tsflags = READ_ONCE(sk->sk_tsflags);
++ if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
+ skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS)
+ return;
+
+- tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY;
++ tsonly = tsflags & SOF_TIMESTAMPING_OPT_TSONLY;
+ if (!skb_may_tx_timestamp(sk, tsonly))
+ return;
+
+ if (tsonly) {
+ #ifdef CONFIG_INET
+- if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
++ if ((tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
+ sk_is_tcp(sk)) {
+ skb = tcp_get_timestamping_opt_stats(sk, orig_skb,
+ ack_skb);
+diff --git a/net/core/sock.c b/net/core/sock.c
+index 4305e55dbfba4..929055bc0cc7b 100644
+--- a/net/core/sock.c
++++ b/net/core/sock.c
+@@ -926,7 +926,7 @@ int sock_set_timestamping(struct sock *sk, int optname,
+ return ret;
+ }
+
+- sk->sk_tsflags = val;
++ WRITE_ONCE(sk->sk_tsflags, val);
+ sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
+
+ if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
+@@ -1705,7 +1705,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
+
+ case SO_TIMESTAMPING_OLD:
+ lv = sizeof(v.timestamping);
+- v.timestamping.flags = sk->sk_tsflags;
++ v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
+ v.timestamping.bind_phc = sk->sk_bind_phc;
+ break;
+
+diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
+index d8ec802f97524..e19ef88ae181f 100644
+--- a/net/ipv4/ip_output.c
++++ b/net/ipv4/ip_output.c
+@@ -991,7 +991,7 @@ static int __ip_append_data(struct sock *sk,
+ paged = !!cork->gso_size;
+
+ if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
+- sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
++ READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID)
+ tskey = atomic_inc_return(&sk->sk_tskey) - 1;
+
+ hh_len = LL_RESERVED_SPACE(rt->dst.dev);
+diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
+index 63aa52becd880..c1fb7580ea581 100644
+--- a/net/ipv4/ip_sockglue.c
++++ b/net/ipv4/ip_sockglue.c
+@@ -509,7 +509,7 @@ static bool ipv4_datagram_support_cmsg(const struct sock *sk,
+ * or without payload (SOF_TIMESTAMPING_OPT_TSONLY).
+ */
+ info = PKTINFO_SKB_CB(skb);
+- if (!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG) ||
++ if (!(READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_CMSG) ||
+ !info->ipi_ifindex)
+ return false;
+
+diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
+index 58409ea2da0af..3935451ad061e 100644
+--- a/net/ipv4/tcp.c
++++ b/net/ipv4/tcp.c
+@@ -2359,14 +2359,14 @@ void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
+ }
+ }
+
+- if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE)
++ if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_SOFTWARE)
+ has_timestamping = true;
+ else
+ tss->ts[0] = (struct timespec64) {0};
+ }
+
+ if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) {
+- if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE)
++ if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_RAW_HARDWARE)
+ has_timestamping = true;
+ else
+ tss->ts[2] = (struct timespec64) {0};
+diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
+index 04822e2cba74a..e9ae084d038d1 100644
+--- a/net/ipv6/ip6_output.c
++++ b/net/ipv6/ip6_output.c
+@@ -1507,7 +1507,7 @@ static int __ip6_append_data(struct sock *sk,
+ orig_mtu = mtu;
+
+ if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
+- sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
++ READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID)
+ tskey = atomic_inc_return(&sk->sk_tskey) - 1;
+
+ hh_len = LL_RESERVED_SPACE(rt->dst.dev);
+diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
+index 4d5a27dd9a4b2..a5d7d1915ba7e 100644
+--- a/net/ipv6/ping.c
++++ b/net/ipv6/ping.c
+@@ -119,7 +119,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+ return -EINVAL;
+
+ ipcm6_init_sk(&ipc6, np);
+- ipc6.sockc.tsflags = sk->sk_tsflags;
++ ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags);
+ ipc6.sockc.mark = READ_ONCE(sk->sk_mark);
+
+ fl6.flowi6_oif = oif;
+diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
+index df3abd9e5237c..dc31752a7edcc 100644
+--- a/net/ipv6/raw.c
++++ b/net/ipv6/raw.c
+@@ -776,7 +776,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+ fl6.flowi6_uid = sk->sk_uid;
+
+ ipcm6_init(&ipc6);
+- ipc6.sockc.tsflags = sk->sk_tsflags;
++ ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags);
+ ipc6.sockc.mark = fl6.flowi6_mark;
+
+ if (sin6) {
+diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
+index 64b36c2ba774a..7f49f69226a21 100644
+--- a/net/ipv6/udp.c
++++ b/net/ipv6/udp.c
+@@ -1358,7 +1358,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+
+ ipcm6_init(&ipc6);
+ ipc6.gso_size = READ_ONCE(up->gso_size);
+- ipc6.sockc.tsflags = sk->sk_tsflags;
++ ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags);
+ ipc6.sockc.mark = READ_ONCE(sk->sk_mark);
+
+ /* destination address check */
+diff --git a/net/socket.c b/net/socket.c
+index 04cba91c7cbe5..9c1fb94b12851 100644
+--- a/net/socket.c
++++ b/net/socket.c
+@@ -826,7 +826,7 @@ static bool skb_is_swtx_tstamp(const struct sk_buff *skb, int false_tstamp)
+
+ static ktime_t get_timestamp(struct sock *sk, struct sk_buff *skb, int *if_index)
+ {
+- bool cycles = sk->sk_tsflags & SOF_TIMESTAMPING_BIND_PHC;
++ bool cycles = READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_BIND_PHC;
+ struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
+ struct net_device *orig_dev;
+ ktime_t hwtstamp;
+@@ -878,12 +878,12 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
+ int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP);
+ int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW);
+ struct scm_timestamping_internal tss;
+-
+ int empty = 1, false_tstamp = 0;
+ struct skb_shared_hwtstamps *shhwtstamps =
+ skb_hwtstamps(skb);
+ int if_index;
+ ktime_t hwtstamp;
++ u32 tsflags;
+
+ /* Race occurred between timestamp enabling and packet
+ receiving. Fill in the current time for now. */
+@@ -925,11 +925,12 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
+ }
+
+ memset(&tss, 0, sizeof(tss));
+- if ((sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) &&
++ tsflags = READ_ONCE(sk->sk_tsflags);
++ if ((tsflags & SOF_TIMESTAMPING_SOFTWARE) &&
+ ktime_to_timespec64_cond(skb->tstamp, tss.ts + 0))
+ empty = 0;
+ if (shhwtstamps &&
+- (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
++ (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
+ !skb_is_swtx_tstamp(skb, false_tstamp)) {
+ if_index = 0;
+ if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NETDEV)
+@@ -937,14 +938,14 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk,
+ else
+ hwtstamp = shhwtstamps->hwtstamp;
+
+- if (sk->sk_tsflags & SOF_TIMESTAMPING_BIND_PHC)
++ if (tsflags & SOF_TIMESTAMPING_BIND_PHC)
+ hwtstamp = ptp_convert_timestamp(&hwtstamp,
+ sk->sk_bind_phc);
+
+ if (ktime_to_timespec64_cond(hwtstamp, tss.ts + 2)) {
+ empty = 0;
+
+- if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) &&
++ if ((tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) &&
+ !skb_is_err_queue(skb))
+ put_ts_pktinfo(msg, skb, if_index);
+ }
+--
+2.43.0
+
--- /dev/null
+From 0f89a214d5bd7890cd44370aca6aade6589a47b3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 28 Dec 2023 14:56:38 +0100
+Subject: net: bcmgenet: Fix FCS generation for fragmented skbuffs
+
+From: Adrian Cinal <adriancinal@gmail.com>
+
+[ Upstream commit e584f2ff1e6cc9b1d99e8a6b0f3415940d1b3eb3 ]
+
+The flag DMA_TX_APPEND_CRC was only written to the first DMA descriptor
+in the TX path, where each descriptor corresponds to a single skbuff
+fragment (or the skbuff head). This led to packets with no FCS appearing
+on the wire if the kernel allocated the packet in fragments, which would
+always happen when using PACKET_MMAP/TPACKET (cf. tpacket_fill_skb() in
+net/af_packet.c).
+
+Fixes: 1c1008c793fa ("net: bcmgenet: add main driver file")
+Signed-off-by: Adrian Cinal <adriancinal1@gmail.com>
+Acked-by: Doug Berger <opendmb@gmail.com>
+Acked-by: Florian Fainelli <florian.fainelli@broadcom.com>
+Link: https://lore.kernel.org/r/20231228135638.1339245-1-adriancinal1@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/broadcom/genet/bcmgenet.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+index 1ae082eb9e905..c2a9913082153 100644
+--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c
++++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c
+@@ -2131,8 +2131,10 @@ static netdev_tx_t bcmgenet_xmit(struct sk_buff *skb, struct net_device *dev)
+ /* Note: if we ever change from DMA_TX_APPEND_CRC below we
+ * will need to restore software padding of "runt" packets
+ */
++ len_stat |= DMA_TX_APPEND_CRC;
++
+ if (!i) {
+- len_stat |= DMA_TX_APPEND_CRC | DMA_SOP;
++ len_stat |= DMA_SOP;
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ len_stat |= DMA_TX_DO_CSUM;
+ }
+--
+2.43.0
+
--- /dev/null
+From 75dffd6df5e444bb377e400ba3e8acf49ca982d3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 May 2023 13:11:10 +0100
+Subject: net: Declare MSG_SPLICE_PAGES internal sendmsg() flag
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit b841b901c452d92610f739a36e54978453528876 ]
+
+Declare MSG_SPLICE_PAGES, an internal sendmsg() flag, that hints to a
+network protocol that it should splice pages from the source iterator
+rather than copying the data if it can. This flag is added to a list that
+is cleared by sendmsg syscalls on entry.
+
+This is intended as a replacement for the ->sendpage() op, allowing a way
+to splice in several multipage folios in one go.
+
+Signed-off-by: David Howells <dhowells@redhat.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+cc: Jens Axboe <axboe@kernel.dk>
+cc: Matthew Wilcox <willy@infradead.org>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Stable-dep-of: a0002127cd74 ("udp: move udp->no_check6_tx to udp->udp_flags")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/socket.h | 3 +++
+ io_uring/net.c | 2 ++
+ net/socket.c | 2 ++
+ 3 files changed, 7 insertions(+)
+
+diff --git a/include/linux/socket.h b/include/linux/socket.h
+index 1db29aab8f9c3..b3c58042bd254 100644
+--- a/include/linux/socket.h
++++ b/include/linux/socket.h
+@@ -324,6 +324,7 @@ struct ucred {
+ */
+
+ #define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */
++#define MSG_SPLICE_PAGES 0x8000000 /* Splice the pages from the iterator in sendmsg() */
+ #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */
+ #define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file
+ descriptor received through
+@@ -334,6 +335,8 @@ struct ucred {
+ #define MSG_CMSG_COMPAT 0 /* We never have 32 bit fixups */
+ #endif
+
++/* Flags to be cleared on entry by sendmsg and sendmmsg syscalls */
++#define MSG_INTERNAL_SENDMSG_FLAGS (MSG_SPLICE_PAGES)
+
+ /* Setsockoptions(2) level. Thanks to BSD these must match IPPROTO_xxx */
+ #define SOL_IP 0
+diff --git a/io_uring/net.c b/io_uring/net.c
+index 57c626cb4d1a5..67f09a40bcb21 100644
+--- a/io_uring/net.c
++++ b/io_uring/net.c
+@@ -389,6 +389,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags)
+ if (flags & MSG_WAITALL)
+ min_ret = iov_iter_count(&msg.msg_iter);
+
++ flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
+ msg.msg_flags = flags;
+ ret = sock_sendmsg(sock, &msg);
+ if (ret < min_ret) {
+@@ -1137,6 +1138,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags)
+ msg_flags |= MSG_DONTWAIT;
+ if (msg_flags & MSG_WAITALL)
+ min_ret = iov_iter_count(&msg.msg_iter);
++ msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
+
+ msg.msg_flags = msg_flags;
+ msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg;
+diff --git a/net/socket.c b/net/socket.c
+index 0104617b440dc..6f39f7b0cc85c 100644
+--- a/net/socket.c
++++ b/net/socket.c
+@@ -2131,6 +2131,7 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags,
+ msg.msg_name = (struct sockaddr *)&address;
+ msg.msg_namelen = addr_len;
+ }
++ flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
+ if (sock->file->f_flags & O_NONBLOCK)
+ flags |= MSG_DONTWAIT;
+ msg.msg_flags = flags;
+@@ -2482,6 +2483,7 @@ static int ____sys_sendmsg(struct socket *sock, struct msghdr *msg_sys,
+ msg_sys->msg_control = ctl_buf;
+ msg_sys->msg_control_is_user = false;
+ }
++ flags &= ~MSG_INTERNAL_SENDMSG_FLAGS;
+ msg_sys->msg_flags = flags;
+
+ if (sock->file->f_flags & O_NONBLOCK)
+--
+2.43.0
+
--- /dev/null
+From 8e5b100ede5240de3c21551e38e66faf2d685c09 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 18 Oct 2022 17:18:51 +0300
+Subject: net: dpaa2-eth: rearrange variable in dpaa2_eth_get_ethtool_stats
+
+From: Ioana Ciornei <ioana.ciornei@nxp.com>
+
+[ Upstream commit 3313206827678f6f036eca601a51f6c4524b559a ]
+
+Rearrange the variables in the dpaa2_eth_get_ethtool_stats() function so
+that we adhere to the reverse Christmas tree rule.
+Also, in the next patch we are adding more variables and I didn't know
+where to place them with the current ordering.
+
+Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: beb1930f966d ("dpaa2-eth: recycle the RX buffer only after all processing done")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../ethernet/freescale/dpaa2/dpaa2-ethtool.c | 18 ++++++++----------
+ 1 file changed, 8 insertions(+), 10 deletions(-)
+
+diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c
+index eea7d7a07c007..59888826469b9 100644
+--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c
++++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c
+@@ -227,17 +227,8 @@ static void dpaa2_eth_get_ethtool_stats(struct net_device *net_dev,
+ struct ethtool_stats *stats,
+ u64 *data)
+ {
+- int i = 0;
+- int j, k, err;
+- int num_cnt;
+- union dpni_statistics dpni_stats;
+- u32 fcnt, bcnt;
+- u32 fcnt_rx_total = 0, fcnt_tx_total = 0;
+- u32 bcnt_rx_total = 0, bcnt_tx_total = 0;
+- u32 buf_cnt;
+ struct dpaa2_eth_priv *priv = netdev_priv(net_dev);
+- struct dpaa2_eth_drv_stats *extras;
+- struct dpaa2_eth_ch_stats *ch_stats;
++ union dpni_statistics dpni_stats;
+ int dpni_stats_page_size[DPNI_STATISTICS_CNT] = {
+ sizeof(dpni_stats.page_0),
+ sizeof(dpni_stats.page_1),
+@@ -247,6 +238,13 @@ static void dpaa2_eth_get_ethtool_stats(struct net_device *net_dev,
+ sizeof(dpni_stats.page_5),
+ sizeof(dpni_stats.page_6),
+ };
++ u32 fcnt_rx_total = 0, fcnt_tx_total = 0;
++ u32 bcnt_rx_total = 0, bcnt_tx_total = 0;
++ struct dpaa2_eth_ch_stats *ch_stats;
++ struct dpaa2_eth_drv_stats *extras;
++ int j, k, err, num_cnt, i = 0;
++ u32 fcnt, bcnt;
++ u32 buf_cnt;
+
+ memset(data, 0,
+ sizeof(u64) * (DPAA2_ETH_NUM_STATS + DPAA2_ETH_NUM_EXTRA_STATS));
+--
+2.43.0
+
--- /dev/null
+From f3ca390d856050f4a3be15ee0cec3f772f96b860 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 22 Dec 2023 00:19:01 +0100
+Subject: net: Implement missing getsockopt(SO_TIMESTAMPING_NEW)
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Jörn-Thorben Hinz <jthinz@mailbox.tu-berlin.de>
+
+[ Upstream commit 7f6ca95d16b96567ce4cf458a2790ff17fa620c3 ]
+
+Commit 9718475e6908 ("socket: Add SO_TIMESTAMPING_NEW") added the new
+socket option SO_TIMESTAMPING_NEW. Setting the option is handled in
+sk_setsockopt(), querying it was not handled in sk_getsockopt(), though.
+
+Following remarks on an earlier submission of this patch, keep the old
+behavior of getsockopt(SO_TIMESTAMPING_OLD) which returns the active
+flags even if they actually have been set through SO_TIMESTAMPING_NEW.
+
+The new getsockopt(SO_TIMESTAMPING_NEW) is stricter, returning flags
+only if they have been set through the same option.
+
+Fixes: 9718475e6908 ("socket: Add SO_TIMESTAMPING_NEW")
+Link: https://lore.kernel.org/lkml/20230703175048.151683-1-jthinz@mailbox.tu-berlin.de/
+Link: https://lore.kernel.org/netdev/0d7cddc9-03fa-43db-a579-14f3e822615b@app.fastmail.com/
+Signed-off-by: Jörn-Thorben Hinz <jthinz@mailbox.tu-berlin.de>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/core/sock.c | 11 +++++++++--
+ 1 file changed, 9 insertions(+), 2 deletions(-)
+
+diff --git a/net/core/sock.c b/net/core/sock.c
+index 49b7f252ddae4..0d8754ec837dc 100644
+--- a/net/core/sock.c
++++ b/net/core/sock.c
+@@ -1704,9 +1704,16 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
+ break;
+
+ case SO_TIMESTAMPING_OLD:
++ case SO_TIMESTAMPING_NEW:
+ lv = sizeof(v.timestamping);
+- v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
+- v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
++ /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
++ * returning the flags when they were set through the same option.
++ * Don't change the beviour for the old case SO_TIMESTAMPING_OLD.
++ */
++ if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
++ v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
++ v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
++ }
+ break;
+
+ case SO_RCVTIMEO_OLD:
+--
+2.43.0
+
--- /dev/null
+From e6b1f3de357f796324e9e623e65680e3c7fff48f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 4 Jan 2024 09:57:44 +0100
+Subject: net: Implement missing SO_TIMESTAMPING_NEW cmsg support
+
+From: Thomas Lange <thomas@corelatus.se>
+
+[ Upstream commit 382a32018b74f407008615e0e831d05ed28e81cd ]
+
+Commit 9718475e6908 ("socket: Add SO_TIMESTAMPING_NEW") added the new
+socket option SO_TIMESTAMPING_NEW. However, it was never implemented in
+__sock_cmsg_send thus breaking SO_TIMESTAMPING cmsg for platforms using
+SO_TIMESTAMPING_NEW.
+
+Fixes: 9718475e6908 ("socket: Add SO_TIMESTAMPING_NEW")
+Link: https://lore.kernel.org/netdev/6a7281bf-bc4a-4f75-bb88-7011908ae471@app.fastmail.com/
+Signed-off-by: Thomas Lange <thomas@corelatus.se>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Link: https://lore.kernel.org/r/20240104085744.49164-1-thomas@corelatus.se
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/core/sock.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/net/core/sock.c b/net/core/sock.c
+index 0d8754ec837dc..c50a14a02edd4 100644
+--- a/net/core/sock.c
++++ b/net/core/sock.c
+@@ -2771,6 +2771,7 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
+ sockc->mark = *(u32 *)CMSG_DATA(cmsg);
+ break;
+ case SO_TIMESTAMPING_OLD:
++ case SO_TIMESTAMPING_NEW:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+ return -EINVAL;
+
+--
+2.43.0
+
--- /dev/null
+From 2a83821a4f768e3f7e4d98d0b8623c31ade327a1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 14 Nov 2023 13:58:43 -0800
+Subject: net/mlx5: Increase size of irq name buffer
+
+From: Rahul Rameshbabu <rrameshbabu@nvidia.com>
+
+[ Upstream commit 3338bebfc26a1e2cebbba82a1cf12c0159608e73 ]
+
+Without increased buffer size, will trigger -Wformat-truncation with W=1
+for the snprintf operation writing to the buffer.
+
+ drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c: In function 'mlx5_irq_alloc':
+ drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c:296:7: error: '@pci:' directive output may be truncated writing 5 bytes into a region of size between 1 and 32 [-Werror=format-truncation=]
+ 296 | "%s@pci:%s", name, pci_name(dev->pdev));
+ | ^~~~~
+ drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c:295:2: note: 'snprintf' output 6 or more bytes (assuming 37) into a destination of size 32
+ 295 | snprintf(irq->name, MLX5_MAX_IRQ_NAME,
+ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ 296 | "%s@pci:%s", name, pci_name(dev->pdev));
+ | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Fixes: ada9f5d00797 ("IB/mlx5: Fix eq names to display nicely in /proc/interrupts")
+Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6d4ab2e97dcfbcd748ae71761a9d8e5e41cc732c
+Signed-off-by: Rahul Rameshbabu <rrameshbabu@nvidia.com>
+Reviewed-by: Dragos Tatulea <dtatulea@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Link: https://lore.kernel.org/r/20231114215846.5902-13-saeed@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 6 +++---
+ drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h | 3 +++
+ 2 files changed, 6 insertions(+), 3 deletions(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
+index d136360ac6a98..a6d3fc96e1685 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
+@@ -25,7 +25,7 @@
+ struct mlx5_irq {
+ struct atomic_notifier_head nh;
+ cpumask_var_t mask;
+- char name[MLX5_MAX_IRQ_NAME];
++ char name[MLX5_MAX_IRQ_FORMATTED_NAME];
+ struct mlx5_irq_pool *pool;
+ int refcount;
+ u32 index;
+@@ -236,8 +236,8 @@ struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i,
+ else
+ irq_sf_set_name(pool, name, i);
+ ATOMIC_INIT_NOTIFIER_HEAD(&irq->nh);
+- snprintf(irq->name, MLX5_MAX_IRQ_NAME,
+- "%s@pci:%s", name, pci_name(dev->pdev));
++ snprintf(irq->name, MLX5_MAX_IRQ_FORMATTED_NAME,
++ MLX5_IRQ_NAME_FORMAT_STR, name, pci_name(dev->pdev));
+ err = request_irq(irq->irqn, irq_int_handler, 0, irq->name,
+ &irq->nh);
+ if (err) {
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h
+index 5c7e68bee43a0..4047179307c4a 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h
++++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h
+@@ -7,6 +7,9 @@
+ #include <linux/mlx5/driver.h>
+
+ #define MLX5_MAX_IRQ_NAME (32)
++#define MLX5_IRQ_NAME_FORMAT_STR ("%s@pci:%s")
++#define MLX5_MAX_IRQ_FORMATTED_NAME \
++ (MLX5_MAX_IRQ_NAME + sizeof(MLX5_IRQ_NAME_FORMAT_STR))
+ /* max irq_index is 2047, so four chars */
+ #define MLX5_MAX_IRQ_IDX_CHARS (4)
+ #define MLX5_EQ_REFS_PER_IRQ (2)
+--
+2.43.0
+
--- /dev/null
+From 9c442a6aebc6eef0931aa962bc9c2dc82e4ac4a5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 27 Dec 2023 15:02:27 +0800
+Subject: net/qla3xxx: fix potential memleak in ql_alloc_buffer_queues
+
+From: Dinghao Liu <dinghao.liu@zju.edu.cn>
+
+[ Upstream commit 89f45c30172c80e55c887f32f1af8e184124577b ]
+
+When dma_alloc_coherent() fails, we should free qdev->lrg_buf
+to prevent potential memleak.
+
+Fixes: 1357bfcf7106 ("qla3xxx: Dynamically size the rx buffer queue based on the MTU.")
+Signed-off-by: Dinghao Liu <dinghao.liu@zju.edu.cn>
+Link: https://lore.kernel.org/r/20231227070227.10527-1-dinghao.liu@zju.edu.cn
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/qlogic/qla3xxx.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/drivers/net/ethernet/qlogic/qla3xxx.c b/drivers/net/ethernet/qlogic/qla3xxx.c
+index 0d57ffcedf0c6..fc78bc959ded8 100644
+--- a/drivers/net/ethernet/qlogic/qla3xxx.c
++++ b/drivers/net/ethernet/qlogic/qla3xxx.c
+@@ -2591,6 +2591,7 @@ static int ql_alloc_buffer_queues(struct ql3_adapter *qdev)
+
+ if (qdev->lrg_buf_q_alloc_virt_addr == NULL) {
+ netdev_err(qdev->ndev, "lBufQ failed\n");
++ kfree(qdev->lrg_buf);
+ return -ENOMEM;
+ }
+ qdev->lrg_buf_q_virt_addr = qdev->lrg_buf_q_alloc_virt_addr;
+@@ -2615,6 +2616,7 @@ static int ql_alloc_buffer_queues(struct ql3_adapter *qdev)
+ qdev->lrg_buf_q_alloc_size,
+ qdev->lrg_buf_q_alloc_virt_addr,
+ qdev->lrg_buf_q_alloc_phy_addr);
++ kfree(qdev->lrg_buf);
+ return -ENOMEM;
+ }
+
+--
+2.43.0
+
--- /dev/null
+From 369ba8d2f5585f0a8e7f716d5dfd513881c4a891 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 3 Jan 2024 10:13:53 +0200
+Subject: net: ravb: Wait for operating mode to be applied
+
+From: Claudiu Beznea <claudiu.beznea.uj@bp.renesas.com>
+
+[ Upstream commit 9039cd4c61635b2d541009a7cd5e2cc052402f28 ]
+
+CSR.OPS bits specify the current operating mode and (according to
+documentation) they are updated by HW when the operating mode change
+request is processed. To comply with this check CSR.OPS before proceeding.
+
+Commit introduces ravb_set_opmode() that does all the necessities for
+setting the operating mode (set CCC.OPC (and CCC.GAC, CCC.CSEL, if any) and
+wait for CSR.OPS) and call it where needed. This should comply with all the
+HW manuals requirements as different manual variants specify that different
+modes need to be checked in CSR.OPS when setting CCC.OPC.
+
+If gPTP active in config mode is supported and it needs to be enabled, the
+CCC.GAC and CCC.CSEL needs to be configured along with CCC.OPC in the same
+write access. For this, ravb_set_opmode() allows passing GAC and CSEL as
+part of opmode and the function updates accordingly CCC register.
+
+Fixes: c156633f1353 ("Renesas Ethernet AVB driver proper")
+Signed-off-by: Claudiu Beznea <claudiu.beznea.uj@bp.renesas.com>
+Reviewed-by: Sergey Shtylyov <s.shtylyov@omp.ru>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/renesas/ravb_main.c | 65 +++++++++++++++---------
+ 1 file changed, 42 insertions(+), 23 deletions(-)
+
+diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c
+index 68cb5616ef991..c2c56a5289caf 100644
+--- a/drivers/net/ethernet/renesas/ravb_main.c
++++ b/drivers/net/ethernet/renesas/ravb_main.c
+@@ -68,16 +68,27 @@ int ravb_wait(struct net_device *ndev, enum ravb_reg reg, u32 mask, u32 value)
+ return -ETIMEDOUT;
+ }
+
+-static int ravb_config(struct net_device *ndev)
++static int ravb_set_opmode(struct net_device *ndev, u32 opmode)
+ {
++ u32 csr_ops = 1U << (opmode & CCC_OPC);
++ u32 ccc_mask = CCC_OPC;
+ int error;
+
+- /* Set config mode */
+- ravb_modify(ndev, CCC, CCC_OPC, CCC_OPC_CONFIG);
+- /* Check if the operating mode is changed to the config mode */
+- error = ravb_wait(ndev, CSR, CSR_OPS, CSR_OPS_CONFIG);
+- if (error)
+- netdev_err(ndev, "failed to switch device to config mode\n");
++ /* If gPTP active in config mode is supported it needs to be configured
++ * along with CSEL and operating mode in the same access. This is a
++ * hardware limitation.
++ */
++ if (opmode & CCC_GAC)
++ ccc_mask |= CCC_GAC | CCC_CSEL;
++
++ /* Set operating mode */
++ ravb_modify(ndev, CCC, ccc_mask, opmode);
++ /* Check if the operating mode is changed to the requested one */
++ error = ravb_wait(ndev, CSR, CSR_OPS, csr_ops);
++ if (error) {
++ netdev_err(ndev, "failed to switch device to requested mode (%u)\n",
++ opmode & CCC_OPC);
++ }
+
+ return error;
+ }
+@@ -675,7 +686,7 @@ static int ravb_dmac_init(struct net_device *ndev)
+ int error;
+
+ /* Set CONFIG mode */
+- error = ravb_config(ndev);
++ error = ravb_set_opmode(ndev, CCC_OPC_CONFIG);
+ if (error)
+ return error;
+
+@@ -684,9 +695,7 @@ static int ravb_dmac_init(struct net_device *ndev)
+ return error;
+
+ /* Setting the control will start the AVB-DMAC process. */
+- ravb_modify(ndev, CCC, CCC_OPC, CCC_OPC_OPERATION);
+-
+- return 0;
++ return ravb_set_opmode(ndev, CCC_OPC_OPERATION);
+ }
+
+ static void ravb_get_tx_tstamp(struct net_device *ndev)
+@@ -1048,7 +1057,7 @@ static int ravb_stop_dma(struct net_device *ndev)
+ return error;
+
+ /* Stop AVB-DMAC process */
+- return ravb_config(ndev);
++ return ravb_set_opmode(ndev, CCC_OPC_CONFIG);
+ }
+
+ /* E-MAC interrupt handler */
+@@ -2576,21 +2585,25 @@ static int ravb_set_gti(struct net_device *ndev)
+ return 0;
+ }
+
+-static void ravb_set_config_mode(struct net_device *ndev)
++static int ravb_set_config_mode(struct net_device *ndev)
+ {
+ struct ravb_private *priv = netdev_priv(ndev);
+ const struct ravb_hw_info *info = priv->info;
++ int error;
+
+ if (info->gptp) {
+- ravb_modify(ndev, CCC, CCC_OPC, CCC_OPC_CONFIG);
++ error = ravb_set_opmode(ndev, CCC_OPC_CONFIG);
++ if (error)
++ return error;
+ /* Set CSEL value */
+ ravb_modify(ndev, CCC, CCC_CSEL, CCC_CSEL_HPB);
+ } else if (info->ccc_gac) {
+- ravb_modify(ndev, CCC, CCC_OPC, CCC_OPC_CONFIG |
+- CCC_GAC | CCC_CSEL_HPB);
++ error = ravb_set_opmode(ndev, CCC_OPC_CONFIG | CCC_GAC | CCC_CSEL_HPB);
+ } else {
+- ravb_modify(ndev, CCC, CCC_OPC, CCC_OPC_CONFIG);
++ error = ravb_set_opmode(ndev, CCC_OPC_CONFIG);
+ }
++
++ return error;
+ }
+
+ /* Set tx and rx clock internal delay modes */
+@@ -2810,7 +2823,9 @@ static int ravb_probe(struct platform_device *pdev)
+ ndev->ethtool_ops = &ravb_ethtool_ops;
+
+ /* Set AVB config mode */
+- ravb_set_config_mode(ndev);
++ error = ravb_set_config_mode(ndev);
++ if (error)
++ goto out_disable_gptp_clk;
+
+ if (info->gptp || info->ccc_gac) {
+ /* Set GTI value */
+@@ -2933,8 +2948,7 @@ static int ravb_remove(struct platform_device *pdev)
+ dma_free_coherent(ndev->dev.parent, priv->desc_bat_size, priv->desc_bat,
+ priv->desc_bat_dma);
+
+- /* Set reset mode */
+- ravb_write(ndev, CCC_OPC_RESET, CCC);
++ ravb_set_opmode(ndev, CCC_OPC_RESET);
+
+ clk_disable_unprepare(priv->gptp_clk);
+ clk_disable_unprepare(priv->refclk);
+@@ -3018,8 +3032,11 @@ static int __maybe_unused ravb_resume(struct device *dev)
+ int ret = 0;
+
+ /* If WoL is enabled set reset mode to rearm the WoL logic */
+- if (priv->wol_enabled)
+- ravb_write(ndev, CCC_OPC_RESET, CCC);
++ if (priv->wol_enabled) {
++ ret = ravb_set_opmode(ndev, CCC_OPC_RESET);
++ if (ret)
++ return ret;
++ }
+
+ /* All register have been reset to default values.
+ * Restore all registers which where setup at probe time and
+@@ -3027,7 +3044,9 @@ static int __maybe_unused ravb_resume(struct device *dev)
+ */
+
+ /* Set AVB config mode */
+- ravb_set_config_mode(ndev);
++ ret = ravb_set_config_mode(ndev);
++ if (ret)
++ return ret;
+
+ if (info->gptp || info->ccc_gac) {
+ /* Set GTI value */
+--
+2.43.0
+
--- /dev/null
+From 1c052bf518018a0db7e7a4b8e3f63445d941d7b5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 21 Dec 2023 09:12:30 -0400
+Subject: net: Save and restore msg_namelen in sock_sendmsg
+
+From: Marc Dionne <marc.dionne@auristor.com>
+
+[ Upstream commit 01b2885d9415152bcb12ff1f7788f500a74ea0ed ]
+
+Commit 86a7e0b69bd5 ("net: prevent rewrite of msg_name in
+sock_sendmsg()") made sock_sendmsg save the incoming msg_name pointer
+and restore it before returning, to insulate the caller against
+msg_name being changed by the called code. If the address length
+was also changed however, we may return with an inconsistent structure
+where the length doesn't match the address, and attempts to reuse it may
+lead to lost packets.
+
+For example, a kernel that doesn't have commit 1c5950fc6fe9 ("udp6: fix
+potential access to stale information") will replace a v4 mapped address
+with its ipv4 equivalent, and shorten namelen accordingly from 28 to 16.
+If the caller attempts to reuse the resulting msg structure, it will have
+the original ipv6 (v4 mapped) address but an incorrect v4 length.
+
+Fixes: 86a7e0b69bd5 ("net: prevent rewrite of msg_name in sock_sendmsg()")
+Signed-off-by: Marc Dionne <marc.dionne@auristor.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/socket.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/net/socket.c b/net/socket.c
+index 07470724e7358..0104617b440dc 100644
+--- a/net/socket.c
++++ b/net/socket.c
+@@ -740,6 +740,7 @@ int sock_sendmsg(struct socket *sock, struct msghdr *msg)
+ {
+ struct sockaddr_storage *save_addr = (struct sockaddr_storage *)msg->msg_name;
+ struct sockaddr_storage address;
++ int save_len = msg->msg_namelen;
+ int ret;
+
+ if (msg->msg_name) {
+@@ -749,6 +750,7 @@ int sock_sendmsg(struct socket *sock, struct msghdr *msg)
+
+ ret = __sock_sendmsg(sock, msg);
+ msg->msg_name = save_addr;
++ msg->msg_namelen = save_len;
+
+ return ret;
+ }
+--
+2.43.0
+
--- /dev/null
+From 14e25d537fb93353328283053064f4589dcff379 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 9 Jun 2023 15:22:59 +0300
+Subject: net/sched: act_ct: Fix promotion of offloaded unreplied tuple
+
+From: Paul Blakey <paulb@nvidia.com>
+
+[ Upstream commit 41f2c7c342d3adb1c4dd5f2e3dd831adff16a669 ]
+
+Currently UNREPLIED and UNASSURED connections are added to the nf flow
+table. This causes the following connection packets to be processed
+by the flow table which then skips conntrack_in(), and thus such
+connections will remain UNREPLIED and UNASSURED even if reply traffic
+is then seen. Even still, the unoffloaded reply packets are the ones
+triggering hardware update from new to established state, and if
+there aren't any to trigger an update and/or previous update was
+missed, hardware can get out of sync with sw and still mark
+packets as new.
+
+Fix the above by:
+1) Not skipping conntrack_in() for UNASSURED packets, but still
+ refresh for hardware, as before the cited patch.
+2) Try and force a refresh by reply-direction packets that update
+ the hardware rules from new to established state.
+3) Remove any bidirectional flows that failed to update in
+ hardware for re-insertion as bidirectional once any new packet
+ arrives.
+
+Fixes: 6a9bad0069cf ("net/sched: act_ct: offload UDP NEW connections")
+Co-developed-by: Vlad Buslov <vladbu@nvidia.com>
+Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
+Signed-off-by: Paul Blakey <paulb@nvidia.com>
+Reviewed-by: Florian Westphal <fw@strlen.de>
+Link: https://lore.kernel.org/r/1686313379-117663-1-git-send-email-paulb@nvidia.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Stable-dep-of: 125f1c7f26ff ("net/sched: act_ct: Take per-cb reference to tcf_ct_flow_table")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/netfilter/nf_flow_table.h | 2 +-
+ net/netfilter/nf_flow_table_core.c | 13 ++++++++++---
+ net/netfilter/nf_flow_table_ip.c | 4 ++--
+ net/sched/act_ct.c | 9 ++++++++-
+ 4 files changed, 21 insertions(+), 7 deletions(-)
+
+diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
+index ebb28ec5b6faf..f37f9f34430c1 100644
+--- a/include/net/netfilter/nf_flow_table.h
++++ b/include/net/netfilter/nf_flow_table.h
+@@ -268,7 +268,7 @@ int flow_offload_route_init(struct flow_offload *flow,
+
+ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow);
+ void flow_offload_refresh(struct nf_flowtable *flow_table,
+- struct flow_offload *flow);
++ struct flow_offload *flow, bool force);
+
+ struct flow_offload_tuple_rhash *flow_offload_lookup(struct nf_flowtable *flow_table,
+ struct flow_offload_tuple *tuple);
+diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
+index 81c26a96c30bb..baddb93a5e8cf 100644
+--- a/net/netfilter/nf_flow_table_core.c
++++ b/net/netfilter/nf_flow_table_core.c
+@@ -314,12 +314,12 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
+ EXPORT_SYMBOL_GPL(flow_offload_add);
+
+ void flow_offload_refresh(struct nf_flowtable *flow_table,
+- struct flow_offload *flow)
++ struct flow_offload *flow, bool force)
+ {
+ u32 timeout;
+
+ timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow);
+- if (timeout - READ_ONCE(flow->timeout) > HZ)
++ if (force || timeout - READ_ONCE(flow->timeout) > HZ)
+ WRITE_ONCE(flow->timeout, timeout);
+ else
+ return;
+@@ -331,6 +331,12 @@ void flow_offload_refresh(struct nf_flowtable *flow_table,
+ }
+ EXPORT_SYMBOL_GPL(flow_offload_refresh);
+
++static bool nf_flow_is_outdated(const struct flow_offload *flow)
++{
++ return test_bit(IPS_SEEN_REPLY_BIT, &flow->ct->status) &&
++ !test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags);
++}
++
+ static inline bool nf_flow_has_expired(const struct flow_offload *flow)
+ {
+ return nf_flow_timeout_delta(flow->timeout) <= 0;
+@@ -420,7 +426,8 @@ static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table,
+ struct flow_offload *flow, void *data)
+ {
+ if (nf_flow_has_expired(flow) ||
+- nf_ct_is_dying(flow->ct))
++ nf_ct_is_dying(flow->ct) ||
++ nf_flow_is_outdated(flow))
+ flow_offload_teardown(flow);
+
+ if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) {
+diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
+index b350fe9d00b0b..6feaac9ab05c8 100644
+--- a/net/netfilter/nf_flow_table_ip.c
++++ b/net/netfilter/nf_flow_table_ip.c
+@@ -384,7 +384,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
+ if (skb_try_make_writable(skb, thoff + hdrsize))
+ return NF_DROP;
+
+- flow_offload_refresh(flow_table, flow);
++ flow_offload_refresh(flow_table, flow, false);
+
+ nf_flow_encap_pop(skb, tuplehash);
+ thoff -= offset;
+@@ -646,7 +646,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
+ if (skb_try_make_writable(skb, thoff + hdrsize))
+ return NF_DROP;
+
+- flow_offload_refresh(flow_table, flow);
++ flow_offload_refresh(flow_table, flow, false);
+
+ nf_flow_encap_pop(skb, tuplehash);
+
+diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
+index 3c063065f125f..b80a58d3bf0f3 100644
+--- a/net/sched/act_ct.c
++++ b/net/sched/act_ct.c
+@@ -606,6 +606,7 @@ static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p,
+ struct flow_offload_tuple tuple = {};
+ enum ip_conntrack_info ctinfo;
+ struct tcphdr *tcph = NULL;
++ bool force_refresh = false;
+ struct flow_offload *flow;
+ struct nf_conn *ct;
+ u8 dir;
+@@ -643,6 +644,7 @@ static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p,
+ * established state, then don't refresh.
+ */
+ return false;
++ force_refresh = true;
+ }
+
+ if (tcph && (unlikely(tcph->fin || tcph->rst))) {
+@@ -656,7 +658,12 @@ static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p,
+ else
+ ctinfo = IP_CT_ESTABLISHED_REPLY;
+
+- flow_offload_refresh(nf_ft, flow);
++ flow_offload_refresh(nf_ft, flow, force_refresh);
++ if (!test_bit(IPS_ASSURED_BIT, &ct->status)) {
++ /* Process this flow in SW to allow promoting to ASSURED */
++ return false;
++ }
++
+ nf_conntrack_get(&ct->ct_general);
+ nf_ct_set(skb, ct, ctinfo);
+ if (nf_ft->flags & NF_FLOWTABLE_COUNTER)
+--
+2.43.0
+
--- /dev/null
+From da420921aeb41458470ef982be25475a762e01c9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 1 Feb 2023 17:30:59 +0100
+Subject: net/sched: act_ct: offload UDP NEW connections
+
+From: Vlad Buslov <vladbu@nvidia.com>
+
+[ Upstream commit 6a9bad0069cf306f3df6ac53cf02438d4e15f296 ]
+
+Modify the offload algorithm of UDP connections to the following:
+
+- Offload NEW connection as unidirectional.
+
+- When connection state changes to ESTABLISHED also update the hardware
+flow. However, in order to prevent act_ct from spamming offload add wq for
+every packet coming in reply direction in this state verify whether
+connection has already been updated to ESTABLISHED in the drivers. If that
+is the case, then skip flow_table and let conntrack handle such packets
+which will also allow conntrack to potentially promote the connection to
+ASSURED.
+
+- When connection state changes to ASSURED set the flow_table flow
+NF_FLOW_HW_BIDIRECTIONAL flag which will cause refresh mechanism to offload
+the reply direction.
+
+All other protocols have their offload algorithm preserved and are always
+offloaded as bidirectional.
+
+Note that this change tries to minimize the load on flow_table add
+workqueue. First, it tracks the last ctinfo that was offloaded by using new
+flow 'NF_FLOW_HW_ESTABLISHED' flag and doesn't schedule the refresh for
+reply direction packets when the offloads have already been updated with
+current ctinfo. Second, when 'add' task executes on workqueue it always
+updates the offload with current flow state (by checking 'bidirectional'
+flow flag and obtaining actual ctinfo/cookie through meta action instead of
+caching any of these from the moment of scheduling the 'add' work)
+avoiding the need to schedule more updates if state changed
+concurrently while the 'add' work was pending on workqueue.
+
+Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: 125f1c7f26ff ("net/sched: act_ct: Take per-cb reference to tcf_ct_flow_table")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/sched/act_ct.c | 51 +++++++++++++++++++++++++++++++++++-----------
+ 1 file changed, 39 insertions(+), 12 deletions(-)
+
+diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
+index 86d269724485a..3c063065f125f 100644
+--- a/net/sched/act_ct.c
++++ b/net/sched/act_ct.c
+@@ -365,7 +365,7 @@ static void tcf_ct_flow_tc_ifidx(struct flow_offload *entry,
+
+ static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
+ struct nf_conn *ct,
+- bool tcp)
++ bool tcp, bool bidirectional)
+ {
+ struct nf_conn_act_ct_ext *act_ct_ext;
+ struct flow_offload *entry;
+@@ -384,6 +384,8 @@ static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
+ ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
+ ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
+ }
++ if (bidirectional)
++ __set_bit(NF_FLOW_HW_BIDIRECTIONAL, &entry->flags);
+
+ act_ct_ext = nf_conn_act_ct_ext_find(ct);
+ if (act_ct_ext) {
+@@ -407,26 +409,34 @@ static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+ {
+- bool tcp = false;
+-
+- if ((ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) ||
+- !test_bit(IPS_ASSURED_BIT, &ct->status))
+- return;
++ bool tcp = false, bidirectional = true;
+
+ switch (nf_ct_protonum(ct)) {
+ case IPPROTO_TCP:
+- tcp = true;
+- if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED)
++ if ((ctinfo != IP_CT_ESTABLISHED &&
++ ctinfo != IP_CT_ESTABLISHED_REPLY) ||
++ !test_bit(IPS_ASSURED_BIT, &ct->status) ||
++ ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED)
+ return;
++
++ tcp = true;
+ break;
+ case IPPROTO_UDP:
++ if (!nf_ct_is_confirmed(ct))
++ return;
++ if (!test_bit(IPS_ASSURED_BIT, &ct->status))
++ bidirectional = false;
+ break;
+ #ifdef CONFIG_NF_CT_PROTO_GRE
+ case IPPROTO_GRE: {
+ struct nf_conntrack_tuple *tuple;
+
+- if (ct->status & IPS_NAT_MASK)
++ if ((ctinfo != IP_CT_ESTABLISHED &&
++ ctinfo != IP_CT_ESTABLISHED_REPLY) ||
++ !test_bit(IPS_ASSURED_BIT, &ct->status) ||
++ ct->status & IPS_NAT_MASK)
+ return;
++
+ tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ /* No support for GRE v1 */
+ if (tuple->src.u.gre.key || tuple->dst.u.gre.key)
+@@ -442,7 +452,7 @@ static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft,
+ ct->status & IPS_SEQ_ADJUST)
+ return;
+
+- tcf_ct_flow_table_add(ct_ft, ct, tcp);
++ tcf_ct_flow_table_add(ct_ft, ct, tcp, bidirectional);
+ }
+
+ static bool
+@@ -621,13 +631,30 @@ static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p,
+ flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
+ ct = flow->ct;
+
++ if (dir == FLOW_OFFLOAD_DIR_REPLY &&
++ !test_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags)) {
++ /* Only offload reply direction after connection became
++ * assured.
++ */
++ if (test_bit(IPS_ASSURED_BIT, &ct->status))
++ set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags);
++ else if (test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags))
++ /* If flow_table flow has already been updated to the
++ * established state, then don't refresh.
++ */
++ return false;
++ }
++
+ if (tcph && (unlikely(tcph->fin || tcph->rst))) {
+ flow_offload_teardown(flow);
+ return false;
+ }
+
+- ctinfo = dir == FLOW_OFFLOAD_DIR_ORIGINAL ? IP_CT_ESTABLISHED :
+- IP_CT_ESTABLISHED_REPLY;
++ if (dir == FLOW_OFFLOAD_DIR_ORIGINAL)
++ ctinfo = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ?
++ IP_CT_ESTABLISHED : IP_CT_NEW;
++ else
++ ctinfo = IP_CT_ESTABLISHED_REPLY;
+
+ flow_offload_refresh(nf_ft, flow);
+ nf_conntrack_get(&ct->ct_general);
+--
+2.43.0
+
--- /dev/null
+From baf1515c7f3f16801fb896b07d179e5aa3fe924b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 5 Dec 2023 18:25:54 +0100
+Subject: net/sched: act_ct: Take per-cb reference to tcf_ct_flow_table
+
+From: Vlad Buslov <vladbu@nvidia.com>
+
+[ Upstream commit 125f1c7f26ffcdbf96177abe75b70c1a6ceb17bc ]
+
+The referenced change added custom cleanup code to act_ct to delete any
+callbacks registered on the parent block when deleting the
+tcf_ct_flow_table instance. However, the underlying issue is that the
+drivers don't obtain the reference to the tcf_ct_flow_table instance when
+registering callbacks which means that not only driver callbacks may still
+be on the table when deleting it but also that the driver can still have
+pointers to its internal nf_flowtable and can use it concurrently which
+results in either a warning in netfilter[0] or a use-after-free.
+
+Fix the issue by taking a reference to the underlying struct
+tcf_ct_flow_table instance when registering the callback and release the
+reference when unregistering. Expose new API required for such reference
+counting by adding two new callbacks to nf_flowtable_type and implementing
+them for act_ct flowtable_ct type. This fixes the issue by extending the
+lifetime of nf_flowtable until all users have unregistered.
+
+[0]:
+[106170.938634] ------------[ cut here ]------------
+[106170.939111] WARNING: CPU: 21 PID: 3688 at include/net/netfilter/nf_flow_table.h:262 mlx5_tc_ct_del_ft_cb+0x267/0x2b0 [mlx5_core]
+[106170.940108] Modules linked in: act_ct nf_flow_table act_mirred act_skbedit act_tunnel_key vxlan cls_matchall nfnetlink_cttimeout act_gact cls_flower sch_ingress mlx5_vdpa vringh vhost_iotlb vdpa bonding openvswitch nsh rpcrdma rdma_ucm
+ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm mlx5_ib ib_uverbs ib_core xt_MASQUERADE nf_conntrack_netlink nfnetlink iptable_nat xt_addrtype xt_conntrack nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_regis
+try overlay mlx5_core
+[106170.943496] CPU: 21 PID: 3688 Comm: kworker/u48:0 Not tainted 6.6.0-rc7_for_upstream_min_debug_2023_11_01_13_02 #1
+[106170.944361] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
+[106170.945292] Workqueue: mlx5e mlx5e_rep_neigh_update [mlx5_core]
+[106170.945846] RIP: 0010:mlx5_tc_ct_del_ft_cb+0x267/0x2b0 [mlx5_core]
+[106170.946413] Code: 89 ef 48 83 05 71 a4 14 00 01 e8 f4 06 04 e1 48 83 05 6c a4 14 00 01 48 83 c4 28 5b 5d 41 5c 41 5d c3 48 83 05 d1 8b 14 00 01 <0f> 0b 48 83 05 d7 8b 14 00 01 e9 96 fe ff ff 48 83 05 a2 90 14 00
+[106170.947924] RSP: 0018:ffff88813ff0fcb8 EFLAGS: 00010202
+[106170.948397] RAX: 0000000000000000 RBX: ffff88811eabac40 RCX: ffff88811eabad48
+[106170.949040] RDX: ffff88811eab8000 RSI: ffffffffa02cd560 RDI: 0000000000000000
+[106170.949679] RBP: ffff88811eab8000 R08: 0000000000000001 R09: ffffffffa0229700
+[106170.950317] R10: ffff888103538fc0 R11: 0000000000000001 R12: ffff88811eabad58
+[106170.950969] R13: ffff888110c01c00 R14: ffff888106b40000 R15: 0000000000000000
+[106170.951616] FS: 0000000000000000(0000) GS:ffff88885fd40000(0000) knlGS:0000000000000000
+[106170.952329] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[106170.952834] CR2: 00007f1cefd28cb0 CR3: 000000012181b006 CR4: 0000000000370ea0
+[106170.953482] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+[106170.954121] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+[106170.954766] Call Trace:
+[106170.955057] <TASK>
+[106170.955315] ? __warn+0x79/0x120
+[106170.955648] ? mlx5_tc_ct_del_ft_cb+0x267/0x2b0 [mlx5_core]
+[106170.956172] ? report_bug+0x17c/0x190
+[106170.956537] ? handle_bug+0x3c/0x60
+[106170.956891] ? exc_invalid_op+0x14/0x70
+[106170.957264] ? asm_exc_invalid_op+0x16/0x20
+[106170.957666] ? mlx5_del_flow_rules+0x10/0x310 [mlx5_core]
+[106170.958172] ? mlx5_tc_ct_block_flow_offload_add+0x1240/0x1240 [mlx5_core]
+[106170.958788] ? mlx5_tc_ct_del_ft_cb+0x267/0x2b0 [mlx5_core]
+[106170.959339] ? mlx5_tc_ct_del_ft_cb+0xc6/0x2b0 [mlx5_core]
+[106170.959854] ? mapping_remove+0x154/0x1d0 [mlx5_core]
+[106170.960342] ? mlx5e_tc_action_miss_mapping_put+0x4f/0x80 [mlx5_core]
+[106170.960927] mlx5_tc_ct_delete_flow+0x76/0xc0 [mlx5_core]
+[106170.961441] mlx5_free_flow_attr_actions+0x13b/0x220 [mlx5_core]
+[106170.962001] mlx5e_tc_del_fdb_flow+0x22c/0x3b0 [mlx5_core]
+[106170.962524] mlx5e_tc_del_flow+0x95/0x3c0 [mlx5_core]
+[106170.963034] mlx5e_flow_put+0x73/0xe0 [mlx5_core]
+[106170.963506] mlx5e_put_flow_list+0x38/0x70 [mlx5_core]
+[106170.964002] mlx5e_rep_update_flows+0xec/0x290 [mlx5_core]
+[106170.964525] mlx5e_rep_neigh_update+0x1da/0x310 [mlx5_core]
+[106170.965056] process_one_work+0x13a/0x2c0
+[106170.965443] worker_thread+0x2e5/0x3f0
+[106170.965808] ? rescuer_thread+0x410/0x410
+[106170.966192] kthread+0xc6/0xf0
+[106170.966515] ? kthread_complete_and_exit+0x20/0x20
+[106170.966970] ret_from_fork+0x2d/0x50
+[106170.967332] ? kthread_complete_and_exit+0x20/0x20
+[106170.967774] ret_from_fork_asm+0x11/0x20
+[106170.970466] </TASK>
+[106170.970726] ---[ end trace 0000000000000000 ]---
+
+Fixes: 77ac5e40c44e ("net/sched: act_ct: remove and free nf_table callbacks")
+Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
+Reviewed-by: Paul Blakey <paulb@nvidia.com>
+Acked-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/netfilter/nf_flow_table.h | 10 ++++++++
+ net/sched/act_ct.c | 34 ++++++++++++++++++++++-----
+ 2 files changed, 38 insertions(+), 6 deletions(-)
+
+diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
+index 0b163ead95c9f..dde4dd9c4012c 100644
+--- a/include/net/netfilter/nf_flow_table.h
++++ b/include/net/netfilter/nf_flow_table.h
+@@ -62,6 +62,8 @@ struct nf_flowtable_type {
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule);
+ void (*free)(struct nf_flowtable *ft);
++ void (*get)(struct nf_flowtable *ft);
++ void (*put)(struct nf_flowtable *ft);
+ nf_hookfn *hook;
+ struct module *owner;
+ };
+@@ -240,6 +242,11 @@ nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table,
+ }
+
+ list_add_tail(&block_cb->list, &block->cb_list);
++ up_write(&flow_table->flow_block_lock);
++
++ if (flow_table->type->get)
++ flow_table->type->get(flow_table);
++ return 0;
+
+ unlock:
+ up_write(&flow_table->flow_block_lock);
+@@ -262,6 +269,9 @@ nf_flow_table_offload_del_cb(struct nf_flowtable *flow_table,
+ WARN_ON(true);
+ }
+ up_write(&flow_table->flow_block_lock);
++
++ if (flow_table->type->put)
++ flow_table->type->put(flow_table);
+ }
+
+ int flow_offload_route_init(struct flow_offload *flow,
+diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
+index 4d34474f2cc0e..faf798133059b 100644
+--- a/net/sched/act_ct.c
++++ b/net/sched/act_ct.c
+@@ -280,9 +280,31 @@ static bool tcf_ct_flow_is_outdated(const struct flow_offload *flow)
+ !test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags);
+ }
+
++static void tcf_ct_flow_table_get_ref(struct tcf_ct_flow_table *ct_ft);
++
++static void tcf_ct_nf_get(struct nf_flowtable *ft)
++{
++ struct tcf_ct_flow_table *ct_ft =
++ container_of(ft, struct tcf_ct_flow_table, nf_ft);
++
++ tcf_ct_flow_table_get_ref(ct_ft);
++}
++
++static void tcf_ct_flow_table_put(struct tcf_ct_flow_table *ct_ft);
++
++static void tcf_ct_nf_put(struct nf_flowtable *ft)
++{
++ struct tcf_ct_flow_table *ct_ft =
++ container_of(ft, struct tcf_ct_flow_table, nf_ft);
++
++ tcf_ct_flow_table_put(ct_ft);
++}
++
+ static struct nf_flowtable_type flowtable_ct = {
+ .gc = tcf_ct_flow_is_outdated,
+ .action = tcf_ct_flow_table_fill_actions,
++ .get = tcf_ct_nf_get,
++ .put = tcf_ct_nf_put,
+ .owner = THIS_MODULE,
+ };
+
+@@ -331,9 +353,13 @@ static int tcf_ct_flow_table_get(struct net *net, struct tcf_ct_params *params)
+ return err;
+ }
+
++static void tcf_ct_flow_table_get_ref(struct tcf_ct_flow_table *ct_ft)
++{
++ refcount_inc(&ct_ft->ref);
++}
++
+ static void tcf_ct_flow_table_cleanup_work(struct work_struct *work)
+ {
+- struct flow_block_cb *block_cb, *tmp_cb;
+ struct tcf_ct_flow_table *ct_ft;
+ struct flow_block *block;
+
+@@ -341,13 +367,9 @@ static void tcf_ct_flow_table_cleanup_work(struct work_struct *work)
+ rwork);
+ nf_flow_table_free(&ct_ft->nf_ft);
+
+- /* Remove any remaining callbacks before cleanup */
+ block = &ct_ft->nf_ft.flow_block;
+ down_write(&ct_ft->nf_ft.flow_block_lock);
+- list_for_each_entry_safe(block_cb, tmp_cb, &block->cb_list, list) {
+- list_del(&block_cb->list);
+- flow_block_cb_free(block_cb);
+- }
++ WARN_ON(!list_empty(&block->cb_list));
+ up_write(&ct_ft->nf_ft.flow_block_lock);
+ kfree(ct_ft);
+
+--
+2.43.0
+
--- /dev/null
+From 89293e3bc421a92dfd4935a5bec34d30ab89aba1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 6 Nov 2022 15:34:16 -0500
+Subject: net: sched: call tcf_ct_params_free to free params in tcf_ct_init
+
+From: Xin Long <lucien.xin@gmail.com>
+
+[ Upstream commit 1913894100ca53205f2d56091cb34b8eba1de217 ]
+
+This patch is to make the err path simple by calling tcf_ct_params_free(),
+so that it won't cause problems when more members are added into param and
+need freeing on the err path.
+
+Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Signed-off-by: Xin Long <lucien.xin@gmail.com>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Stable-dep-of: 125f1c7f26ff ("net/sched: act_ct: Take per-cb reference to tcf_ct_flow_table")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/sched/act_ct.c | 35 ++++++++++++++++++-----------------
+ 1 file changed, 18 insertions(+), 17 deletions(-)
+
+diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
+index 4c7f7861ea967..478cedc29b737 100644
+--- a/net/sched/act_ct.c
++++ b/net/sched/act_ct.c
+@@ -345,11 +345,9 @@ static void tcf_ct_flow_table_cleanup_work(struct work_struct *work)
+ module_put(THIS_MODULE);
+ }
+
+-static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
++static void tcf_ct_flow_table_put(struct tcf_ct_flow_table *ct_ft)
+ {
+- struct tcf_ct_flow_table *ct_ft = params->ct_ft;
+-
+- if (refcount_dec_and_test(¶ms->ct_ft->ref)) {
++ if (refcount_dec_and_test(&ct_ft->ref)) {
+ rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
+ INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
+ queue_rcu_work(act_ct_wq, &ct_ft->rwork);
+@@ -832,18 +830,23 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
+ return err;
+ }
+
+-static void tcf_ct_params_free(struct rcu_head *head)
++static void tcf_ct_params_free(struct tcf_ct_params *params)
+ {
+- struct tcf_ct_params *params = container_of(head,
+- struct tcf_ct_params, rcu);
+-
+- tcf_ct_flow_table_put(params);
+-
++ if (params->ct_ft)
++ tcf_ct_flow_table_put(params->ct_ft);
+ if (params->tmpl)
+ nf_ct_put(params->tmpl);
+ kfree(params);
+ }
+
++static void tcf_ct_params_free_rcu(struct rcu_head *head)
++{
++ struct tcf_ct_params *params;
++
++ params = container_of(head, struct tcf_ct_params, rcu);
++ tcf_ct_params_free(params);
++}
++
+ #if IS_ENABLED(CONFIG_NF_NAT)
+ /* Modelled after nf_nat_ipv[46]_fn().
+ * range is only used for new, uninitialized NAT state.
+@@ -1390,7 +1393,7 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
+
+ err = tcf_ct_flow_table_get(net, params);
+ if (err)
+- goto cleanup_params;
++ goto cleanup;
+
+ spin_lock_bh(&c->tcf_lock);
+ goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
+@@ -1401,17 +1404,15 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla,
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+ if (params)
+- call_rcu(¶ms->rcu, tcf_ct_params_free);
++ call_rcu(¶ms->rcu, tcf_ct_params_free_rcu);
+
+ return res;
+
+-cleanup_params:
+- if (params->tmpl)
+- nf_ct_put(params->tmpl);
+ cleanup:
+ if (goto_ch)
+ tcf_chain_put_by_act(goto_ch);
+- kfree(params);
++ if (params)
++ tcf_ct_params_free(params);
+ tcf_idr_release(*a, bind);
+ return err;
+ }
+@@ -1423,7 +1424,7 @@ static void tcf_ct_cleanup(struct tc_action *a)
+
+ params = rcu_dereference_protected(c->params, 1);
+ if (params)
+- call_rcu(¶ms->rcu, tcf_ct_params_free);
++ call_rcu(¶ms->rcu, tcf_ct_params_free_rcu);
+ }
+
+ static int tcf_ct_dump_key_val(struct sk_buff *skb,
+--
+2.43.0
+
--- /dev/null
+From 93c23c768858a1ae196116f045b4c1ff98e4e843 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 21 Dec 2023 10:25:31 +0800
+Subject: net: sched: em_text: fix possible memory leak in em_text_destroy()
+
+From: Hangyu Hua <hbh25y@gmail.com>
+
+[ Upstream commit 8fcb0382af6f1ef50936f1be05b8149eb2f88496 ]
+
+m->data needs to be freed when em_text_destroy is called.
+
+Fixes: d675c989ed2d ("[PKT_SCHED]: Packet classification based on textsearch (ematch)")
+Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
+Signed-off-by: Hangyu Hua <hbh25y@gmail.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/sched/em_text.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/net/sched/em_text.c b/net/sched/em_text.c
+index 6f3c1fb2fb44c..f176afb70559e 100644
+--- a/net/sched/em_text.c
++++ b/net/sched/em_text.c
+@@ -97,8 +97,10 @@ static int em_text_change(struct net *net, void *data, int len,
+
+ static void em_text_destroy(struct tcf_ematch *m)
+ {
+- if (EM_TEXT_PRIV(m) && EM_TEXT_PRIV(m)->config)
++ if (EM_TEXT_PRIV(m) && EM_TEXT_PRIV(m)->config) {
+ textsearch_destroy(EM_TEXT_PRIV(m)->config);
++ kfree(EM_TEXT_PRIV(m));
++ }
+ }
+
+ static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m)
+--
+2.43.0
+
--- /dev/null
+From a55dfee1f458e66ebb434d93453283be3b49b991 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 27 Dec 2023 15:40:35 +0800
+Subject: net/smc: fix invalid link access in dumping SMC-R connections
+
+From: Wen Gu <guwen@linux.alibaba.com>
+
+[ Upstream commit 9dbe086c69b8902c85cece394760ac212e9e4ccc ]
+
+A crash was found when dumping SMC-R connections. It can be reproduced
+by the following steps:
+
+- environment: two RNICs on both sides.
+- run SMC-R between two sides, now a SMC_LGR_SYMMETRIC type link group
+ will be created.
+- set the first RNIC down on either side and link group will turn to
+ SMC_LGR_ASYMMETRIC_LOCAL then.
+- run 'smcss -R' and the crash will be triggered.
+
+ BUG: kernel NULL pointer dereference, address: 0000000000000010
+ #PF: supervisor read access in kernel mode
+ #PF: error_code(0x0000) - not-present page
+ PGD 8000000101fdd067 P4D 8000000101fdd067 PUD 10ce46067 PMD 0
+ Oops: 0000 [#1] PREEMPT SMP PTI
+ CPU: 3 PID: 1810 Comm: smcss Kdump: loaded Tainted: G W E 6.7.0-rc6+ #51
+ RIP: 0010:__smc_diag_dump.constprop.0+0x36e/0x620 [smc_diag]
+ Call Trace:
+ <TASK>
+ ? __die+0x24/0x70
+ ? page_fault_oops+0x66/0x150
+ ? exc_page_fault+0x69/0x140
+ ? asm_exc_page_fault+0x26/0x30
+ ? __smc_diag_dump.constprop.0+0x36e/0x620 [smc_diag]
+ smc_diag_dump_proto+0xd0/0xf0 [smc_diag]
+ smc_diag_dump+0x26/0x60 [smc_diag]
+ netlink_dump+0x19f/0x320
+ __netlink_dump_start+0x1dc/0x300
+ smc_diag_handler_dump+0x6a/0x80 [smc_diag]
+ ? __pfx_smc_diag_dump+0x10/0x10 [smc_diag]
+ sock_diag_rcv_msg+0x121/0x140
+ ? __pfx_sock_diag_rcv_msg+0x10/0x10
+ netlink_rcv_skb+0x5a/0x110
+ sock_diag_rcv+0x28/0x40
+ netlink_unicast+0x22a/0x330
+ netlink_sendmsg+0x240/0x4a0
+ __sock_sendmsg+0xb0/0xc0
+ ____sys_sendmsg+0x24e/0x300
+ ? copy_msghdr_from_user+0x62/0x80
+ ___sys_sendmsg+0x7c/0xd0
+ ? __do_fault+0x34/0x1a0
+ ? do_read_fault+0x5f/0x100
+ ? do_fault+0xb0/0x110
+ __sys_sendmsg+0x4d/0x80
+ do_syscall_64+0x45/0xf0
+ entry_SYSCALL_64_after_hwframe+0x6e/0x76
+
+When the first RNIC is set down, the lgr->lnk[0] will be cleared and an
+asymmetric link will be allocated in lgr->lnk[SMC_LINKS_PER_LGR_MAX - 1]
+by smc_llc_alloc_alt_link(). Then when we try to dump SMC-R connections
+in __smc_diag_dump(), the invalid lgr->lnk[0] will be accessed, resulting
+in this issue. So fix it by accessing the right link.
+
+Fixes: f16a7dd5cf27 ("smc: netlink interface for SMC sockets")
+Reported-by: henaumars <henaumars@sina.com>
+Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=7616
+Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
+Reviewed-by: Tony Lu <tonylu@linux.alibaba.com>
+Link: https://lore.kernel.org/r/1703662835-53416-1-git-send-email-guwen@linux.alibaba.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/smc/smc_diag.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
+index 80ea7d954eceb..801044e7d1949 100644
+--- a/net/smc/smc_diag.c
++++ b/net/smc/smc_diag.c
+@@ -153,8 +153,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
+ .lnk[0].link_id = link->link_id,
+ };
+
+- memcpy(linfo.lnk[0].ibname,
+- smc->conn.lgr->lnk[0].smcibdev->ibdev->name,
++ memcpy(linfo.lnk[0].ibname, link->smcibdev->ibdev->name,
+ sizeof(link->smcibdev->ibdev->name));
+ smc_gid_be16_convert(linfo.lnk[0].gid, link->gid);
+ smc_gid_be16_convert(linfo.lnk[0].peer_gid, link->peer_gid);
+--
+2.43.0
+
--- /dev/null
+From f34a1a0c97dbe98c13f2e62a01b50c39a1ff419d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 6 Mar 2023 08:07:38 -0800
+Subject: net-timestamp: extend SOF_TIMESTAMPING_OPT_ID to HW timestamps
+
+From: Vadim Fedorenko <vadfed@meta.com>
+
+[ Upstream commit 8ca5a5790b9a1ce147484d2a2c4e66d2553f3d6c ]
+
+When the feature was added it was enabled for SW timestamps only but
+with current hardware the same out-of-order timestamps can be seen.
+Let's expand the area for the feature to all types of timestamps.
+
+Signed-off-by: Vadim Fedorenko <vadfed@meta.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: 7f6ca95d16b9 ("net: Implement missing getsockopt(SO_TIMESTAMPING_NEW)")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/ip_output.c | 2 +-
+ net/ipv6/ip6_output.c | 2 +-
+ 2 files changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
+index 493c679ea54f3..d8ec802f97524 100644
+--- a/net/ipv4/ip_output.c
++++ b/net/ipv4/ip_output.c
+@@ -990,7 +990,7 @@ static int __ip_append_data(struct sock *sk,
+ mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize;
+ paged = !!cork->gso_size;
+
+- if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
++ if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
+ sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
+ tskey = atomic_inc_return(&sk->sk_tskey) - 1;
+
+diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
+index 3c2b2a85de367..04822e2cba74a 100644
+--- a/net/ipv6/ip6_output.c
++++ b/net/ipv6/ip6_output.c
+@@ -1506,7 +1506,7 @@ static int __ip6_append_data(struct sock *sk,
+ mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
+ orig_mtu = mtu;
+
+- if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP &&
++ if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
+ sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
+ tskey = atomic_inc_return(&sk->sk_tskey) - 1;
+
+--
+2.43.0
+
--- /dev/null
+From de10b8ea976d0c729b8d59713e03fe511557d6b9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 1 Feb 2023 17:30:56 +0100
+Subject: netfilter: flowtable: allow unidirectional rules
+
+From: Vlad Buslov <vladbu@nvidia.com>
+
+[ Upstream commit 8f84780b84d645d6e35467f4a6f3236b20d7f4b2 ]
+
+Modify flow table offload to support unidirectional connections by
+extending enum nf_flow_flags with new "NF_FLOW_HW_BIDIRECTIONAL" flag. Only
+offload reply direction when the flag is set. This infrastructure change is
+necessary to support offloading UDP NEW connections in original direction
+in following patches in series.
+
+Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: 125f1c7f26ff ("net/sched: act_ct: Take per-cb reference to tcf_ct_flow_table")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/netfilter/nf_flow_table.h | 1 +
+ net/netfilter/nf_flow_table_offload.c | 12 ++++++++----
+ 2 files changed, 9 insertions(+), 4 deletions(-)
+
+diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
+index cd982f4a0f50c..88ab98ab41d9f 100644
+--- a/include/net/netfilter/nf_flow_table.h
++++ b/include/net/netfilter/nf_flow_table.h
+@@ -164,6 +164,7 @@ enum nf_flow_flags {
+ NF_FLOW_HW_DYING,
+ NF_FLOW_HW_DEAD,
+ NF_FLOW_HW_PENDING,
++ NF_FLOW_HW_BIDIRECTIONAL,
+ };
+
+ enum flow_offload_type {
+diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
+index 4d9b99abe37d6..8b852f10fab4b 100644
+--- a/net/netfilter/nf_flow_table_offload.c
++++ b/net/netfilter/nf_flow_table_offload.c
+@@ -895,8 +895,9 @@ static int flow_offload_rule_add(struct flow_offload_work *offload,
+
+ ok_count += flow_offload_tuple_add(offload, flow_rule[0],
+ FLOW_OFFLOAD_DIR_ORIGINAL);
+- ok_count += flow_offload_tuple_add(offload, flow_rule[1],
+- FLOW_OFFLOAD_DIR_REPLY);
++ if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags))
++ ok_count += flow_offload_tuple_add(offload, flow_rule[1],
++ FLOW_OFFLOAD_DIR_REPLY);
+ if (ok_count == 0)
+ return -ENOENT;
+
+@@ -926,7 +927,8 @@ static void flow_offload_work_del(struct flow_offload_work *offload)
+ {
+ clear_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status);
+ flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL);
+- flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY);
++ if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags))
++ flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY);
+ set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags);
+ }
+
+@@ -946,7 +948,9 @@ static void flow_offload_work_stats(struct flow_offload_work *offload)
+ u64 lastused;
+
+ flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_ORIGINAL, &stats[0]);
+- flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_REPLY, &stats[1]);
++ if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags))
++ flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_REPLY,
++ &stats[1]);
+
+ lastused = max_t(u64, stats[0].lastused, stats[1].lastused);
+ offload->flow->timeout = max_t(u64, offload->flow->timeout,
+--
+2.43.0
+
--- /dev/null
+From aa8689eb8935d603a6f52824c4f47b3279e22da5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 1 Feb 2023 17:30:57 +0100
+Subject: netfilter: flowtable: cache info of last offload
+
+From: Vlad Buslov <vladbu@nvidia.com>
+
+[ Upstream commit 1a441a9b8be8849957a01413a144f84932c324cb ]
+
+Modify flow table offload to cache the last ct info status that was passed
+to the driver offload callbacks by extending enum nf_flow_flags with new
+"NF_FLOW_HW_ESTABLISHED" flag. Set the flag if ctinfo was 'established'
+during last act_ct meta actions fill call. This infrastructure change is
+necessary to optimize promoting of UDP connections from 'new' to
+'established' in following patches in this series.
+
+Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: 125f1c7f26ff ("net/sched: act_ct: Take per-cb reference to tcf_ct_flow_table")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/netfilter/nf_flow_table.h | 7 ++++---
+ net/netfilter/nf_flow_table_inet.c | 2 +-
+ net/netfilter/nf_flow_table_offload.c | 6 +++---
+ net/sched/act_ct.c | 12 +++++++-----
+ 4 files changed, 15 insertions(+), 12 deletions(-)
+
+diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
+index 88ab98ab41d9f..ebb28ec5b6faf 100644
+--- a/include/net/netfilter/nf_flow_table.h
++++ b/include/net/netfilter/nf_flow_table.h
+@@ -57,7 +57,7 @@ struct nf_flowtable_type {
+ struct net_device *dev,
+ enum flow_block_command cmd);
+ int (*action)(struct net *net,
+- const struct flow_offload *flow,
++ struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule);
+ void (*free)(struct nf_flowtable *ft);
+@@ -165,6 +165,7 @@ enum nf_flow_flags {
+ NF_FLOW_HW_DEAD,
+ NF_FLOW_HW_PENDING,
+ NF_FLOW_HW_BIDIRECTIONAL,
++ NF_FLOW_HW_ESTABLISHED,
+ };
+
+ enum flow_offload_type {
+@@ -313,10 +314,10 @@ void nf_flow_table_offload_flush_cleanup(struct nf_flowtable *flowtable);
+ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable,
+ struct net_device *dev,
+ enum flow_block_command cmd);
+-int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow,
++int nf_flow_rule_route_ipv4(struct net *net, struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule);
+-int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow,
++int nf_flow_rule_route_ipv6(struct net *net, struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule);
+
+diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c
+index 0ccabf3fa6aa3..9505f9d188ff2 100644
+--- a/net/netfilter/nf_flow_table_inet.c
++++ b/net/netfilter/nf_flow_table_inet.c
+@@ -39,7 +39,7 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb,
+ }
+
+ static int nf_flow_rule_route_inet(struct net *net,
+- const struct flow_offload *flow,
++ struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+ {
+diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
+index 8b852f10fab4b..1c26f03fc6617 100644
+--- a/net/netfilter/nf_flow_table_offload.c
++++ b/net/netfilter/nf_flow_table_offload.c
+@@ -679,7 +679,7 @@ nf_flow_rule_route_common(struct net *net, const struct flow_offload *flow,
+ return 0;
+ }
+
+-int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow,
++int nf_flow_rule_route_ipv4(struct net *net, struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+ {
+@@ -704,7 +704,7 @@ int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow,
+ }
+ EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv4);
+
+-int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow,
++int nf_flow_rule_route_ipv6(struct net *net, struct flow_offload *flow,
+ enum flow_offload_tuple_dir dir,
+ struct nf_flow_rule *flow_rule)
+ {
+@@ -735,7 +735,7 @@ nf_flow_offload_rule_alloc(struct net *net,
+ {
+ const struct nf_flowtable *flowtable = offload->flowtable;
+ const struct flow_offload_tuple *tuple, *other_tuple;
+- const struct flow_offload *flow = offload->flow;
++ struct flow_offload *flow = offload->flow;
+ struct dst_entry *other_dst = NULL;
+ struct nf_flow_rule *flow_rule;
+ int err = -ENOMEM;
+diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
+index 478cedc29b737..86d269724485a 100644
+--- a/net/sched/act_ct.c
++++ b/net/sched/act_ct.c
+@@ -168,11 +168,11 @@ tcf_ct_flow_table_add_action_nat_udp(const struct nf_conntrack_tuple *tuple,
+
+ static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct,
+ enum ip_conntrack_dir dir,
++ enum ip_conntrack_info ctinfo,
+ struct flow_action *action)
+ {
+ struct nf_conn_labels *ct_labels;
+ struct flow_action_entry *entry;
+- enum ip_conntrack_info ctinfo;
+ u32 *act_ct_labels;
+
+ entry = tcf_ct_flow_table_flow_action_get_next(action);
+@@ -180,8 +180,6 @@ static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct,
+ #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
+ entry->ct_metadata.mark = READ_ONCE(ct->mark);
+ #endif
+- ctinfo = dir == IP_CT_DIR_ORIGINAL ? IP_CT_ESTABLISHED :
+- IP_CT_ESTABLISHED_REPLY;
+ /* aligns with the CT reference on the SKB nf_ct_set */
+ entry->ct_metadata.cookie = (unsigned long)ct | ctinfo;
+ entry->ct_metadata.orig_dir = dir == IP_CT_DIR_ORIGINAL;
+@@ -235,22 +233,26 @@ static int tcf_ct_flow_table_add_action_nat(struct net *net,
+ }
+
+ static int tcf_ct_flow_table_fill_actions(struct net *net,
+- const struct flow_offload *flow,
++ struct flow_offload *flow,
+ enum flow_offload_tuple_dir tdir,
+ struct nf_flow_rule *flow_rule)
+ {
+ struct flow_action *action = &flow_rule->rule->action;
+ int num_entries = action->num_entries;
+ struct nf_conn *ct = flow->ct;
++ enum ip_conntrack_info ctinfo;
+ enum ip_conntrack_dir dir;
+ int i, err;
+
+ switch (tdir) {
+ case FLOW_OFFLOAD_DIR_ORIGINAL:
+ dir = IP_CT_DIR_ORIGINAL;
++ ctinfo = IP_CT_ESTABLISHED;
++ set_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags);
+ break;
+ case FLOW_OFFLOAD_DIR_REPLY:
+ dir = IP_CT_DIR_REPLY;
++ ctinfo = IP_CT_ESTABLISHED_REPLY;
+ break;
+ default:
+ return -EOPNOTSUPP;
+@@ -260,7 +262,7 @@ static int tcf_ct_flow_table_fill_actions(struct net *net,
+ if (err)
+ goto err_nat;
+
+- tcf_ct_flow_table_add_action_meta(ct, dir, action);
++ tcf_ct_flow_table_add_action_meta(ct, dir, ctinfo, action);
+ return 0;
+
+ err_nat:
+--
+2.43.0
+
--- /dev/null
+From 0449b478e6c121959ee093763e1b856302a2a0bf Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 24 Oct 2023 21:09:47 +0200
+Subject: netfilter: flowtable: GC pushes back packets to classic path
+
+From: Pablo Neira Ayuso <pablo@netfilter.org>
+
+[ Upstream commit 735795f68b37e9bb49f642407a0d49b1631ea1c7 ]
+
+Since 41f2c7c342d3 ("net/sched: act_ct: Fix promotion of offloaded
+unreplied tuple"), flowtable GC pushes flows with IPS_SEEN_REPLY
+back to classic path in every run, i.e. every second. This is because of
+a new check for NF_FLOW_HW_ESTABLISHED which is specific to sched/act_ct.
+
+In Netfilter's flowtable case, NF_FLOW_HW_ESTABLISHED never gets set,
+and IPS_SEEN_REPLY is unreliable since users decide when to offload the
+flow, possibly before that bit is set; it might be set at a later stage.
+
+Fix it by adding a custom .gc handler that sched/act_ct can use to
+deal with its NF_FLOW_HW_ESTABLISHED bit.
+
+Fixes: 41f2c7c342d3 ("net/sched: act_ct: Fix promotion of offloaded unreplied tuple")
+Reported-by: Vladimir Smelhaus <vl.sm@email.cz>
+Reviewed-by: Paul Blakey <paulb@nvidia.com>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Stable-dep-of: 125f1c7f26ff ("net/sched: act_ct: Take per-cb reference to tcf_ct_flow_table")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/netfilter/nf_flow_table.h | 1 +
+ net/netfilter/nf_flow_table_core.c | 14 +++++++-------
+ net/sched/act_ct.c | 7 +++++++
+ 3 files changed, 15 insertions(+), 7 deletions(-)
+
+diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h
+index f37f9f34430c1..0b163ead95c9f 100644
+--- a/include/net/netfilter/nf_flow_table.h
++++ b/include/net/netfilter/nf_flow_table.h
+@@ -53,6 +53,7 @@ struct nf_flowtable_type {
+ struct list_head list;
+ int family;
+ int (*init)(struct nf_flowtable *ft);
++ bool (*gc)(const struct flow_offload *flow);
+ int (*setup)(struct nf_flowtable *ft,
+ struct net_device *dev,
+ enum flow_block_command cmd);
+diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
+index baddb93a5e8cf..c1d99cb370b44 100644
+--- a/net/netfilter/nf_flow_table_core.c
++++ b/net/netfilter/nf_flow_table_core.c
+@@ -331,12 +331,6 @@ void flow_offload_refresh(struct nf_flowtable *flow_table,
+ }
+ EXPORT_SYMBOL_GPL(flow_offload_refresh);
+
+-static bool nf_flow_is_outdated(const struct flow_offload *flow)
+-{
+- return test_bit(IPS_SEEN_REPLY_BIT, &flow->ct->status) &&
+- !test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags);
+-}
+-
+ static inline bool nf_flow_has_expired(const struct flow_offload *flow)
+ {
+ return nf_flow_timeout_delta(flow->timeout) <= 0;
+@@ -422,12 +416,18 @@ nf_flow_table_iterate(struct nf_flowtable *flow_table,
+ return err;
+ }
+
++static bool nf_flow_custom_gc(struct nf_flowtable *flow_table,
++ const struct flow_offload *flow)
++{
++ return flow_table->type->gc && flow_table->type->gc(flow);
++}
++
+ static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table,
+ struct flow_offload *flow, void *data)
+ {
+ if (nf_flow_has_expired(flow) ||
+ nf_ct_is_dying(flow->ct) ||
+- nf_flow_is_outdated(flow))
++ nf_flow_custom_gc(flow_table, flow))
+ flow_offload_teardown(flow);
+
+ if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) {
+diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
+index b80a58d3bf0f3..4d34474f2cc0e 100644
+--- a/net/sched/act_ct.c
++++ b/net/sched/act_ct.c
+@@ -274,7 +274,14 @@ static int tcf_ct_flow_table_fill_actions(struct net *net,
+ return err;
+ }
+
++static bool tcf_ct_flow_is_outdated(const struct flow_offload *flow)
++{
++ return test_bit(IPS_SEEN_REPLY_BIT, &flow->ct->status) &&
++ !test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags);
++}
++
+ static struct nf_flowtable_type flowtable_ct = {
++ .gc = tcf_ct_flow_is_outdated,
+ .action = tcf_ct_flow_table_fill_actions,
+ .owner = THIS_MODULE,
+ };
+--
+2.43.0
+
--- /dev/null
+From 56cc1e9b5b7e464b9e998329d7173330be70efb2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Dec 2023 11:50:12 +0100
+Subject: netfilter: nf_tables: set transport offset from mac header for
+ netdev/egress
+
+From: Pablo Neira Ayuso <pablo@netfilter.org>
+
+[ Upstream commit 0ae8e4cca78781401b17721bfb72718fdf7b4912 ]
+
+Before this patch, transport offset (pkt->thoff) provides an offset
+relative to the network header. This is fine for the inet families
+because skb->data points to the network header in such case. However,
+from netdev/egress, skb->data points to the mac header (if available),
+thus, pkt->thoff is missing the mac header length.
+
+Add skb_network_offset() to the transport offset (pkt->thoff) for
+netdev, so transport header mangling works as expected. Adjust payload
+fast eval function to use skb->data now that pkt->thoff provides an
+absolute offset. This explains why users report that matching on
+egress/netdev works but payload mangling does not.
+
+This patch implicitly fixes payload mangling for IPv4 packets in
+netdev/egress given skb_store_bits() requires an offset from skb->data
+to reach the transport header.
+
+I suspect that nft_exthdr and the trace infra were also broken from
+netdev/egress because they also take skb->data as start, and pkt->thoff
+was not correct.
+
+Note that IPv6 is fine because ipv6_find_hdr() already provides a
+transport offset starting from skb->data, which includes
+skb_network_offset().
+
+The bridge family also uses nft_set_pktinfo_ipv4_validate(), but there
+skb_network_offset() is zero, so the update in this patch does not alter
+the existing behaviour.
+
+Fixes: 42df6e1d221d ("netfilter: Introduce egress hook")
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/netfilter/nf_tables_ipv4.h | 2 +-
+ net/netfilter/nf_tables_core.c | 2 +-
+ 2 files changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h
+index d8f6cb47ebe37..5225d2bd1a6e9 100644
+--- a/include/net/netfilter/nf_tables_ipv4.h
++++ b/include/net/netfilter/nf_tables_ipv4.h
+@@ -30,7 +30,7 @@ static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt)
+ return -1;
+
+ len = iph_totlen(pkt->skb, iph);
+- thoff = iph->ihl * 4;
++ thoff = skb_network_offset(pkt->skb) + (iph->ihl * 4);
+ if (pkt->skb->len < len)
+ return -1;
+ else if (len < thoff)
+diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
+index cee3e4e905ec8..e0c117229ee9d 100644
+--- a/net/netfilter/nf_tables_core.c
++++ b/net/netfilter/nf_tables_core.c
+@@ -141,7 +141,7 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr,
+ else {
+ if (!(pkt->flags & NFT_PKTINFO_L4PROTO))
+ return false;
+- ptr = skb_network_header(skb) + nft_thoff(pkt);
++ ptr = skb->data + nft_thoff(pkt);
+ }
+
+ ptr += priv->offset;
+--
+2.43.0
+
--- /dev/null
+From d39cbbf50dc98ed34532f9728b5fb98aa77c1b82 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 1 Jan 2024 20:15:33 +0100
+Subject: netfilter: nft_immediate: drop chain reference counter on error
+
+From: Pablo Neira Ayuso <pablo@netfilter.org>
+
+[ Upstream commit b29be0ca8e816119ccdf95cc7d7c7be9bde005f1 ]
+
+In the init path, nft_data_init() bumps the chain reference counter.
+Decrement it on error by following the error path, which calls
+nft_data_release() to restore it.
+
+Fixes: 4bedf9eee016 ("netfilter: nf_tables: fix chain binding transaction logic")
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nft_immediate.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
+index 5f59dbab3e933..55fcf0280c5c3 100644
+--- a/net/netfilter/nft_immediate.c
++++ b/net/netfilter/nft_immediate.c
+@@ -78,7 +78,7 @@ static int nft_immediate_init(const struct nft_ctx *ctx,
+ case NFT_GOTO:
+ err = nf_tables_bind_chain(ctx, chain);
+ if (err < 0)
+- return err;
++ goto err1;
+ break;
+ default:
+ break;
+--
+2.43.0
+
--- /dev/null
+From d9408d0e798b54be53f54b011e26b31027f18849 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 28 Jan 2023 10:58:34 -0500
+Subject: netfilter: use skb_ip_totlen and iph_totlen
+
+From: Xin Long <lucien.xin@gmail.com>
+
+[ Upstream commit a13fbf5ed5b4fc9095f12e955ca3a59b5507ff01 ]
+
+There are also quite a few places in netfilter that may process IPv4 TCP
+GSO packets; we need to replace them too.
+
+In length_mt(), we have to use u_int32_t/int to accept skb_ip_totlen()
+return value, otherwise it may overflow and mismatch. This change will
+also help us add selftest for IPv4 BIG TCP in the following patch.
+
+Note that we don't need to replace the one in tcpmss_tg4(), as it will
+return if there is data after tcphdr in tcpmss_mangle_packet(). The
+same in mangle_contents() in nf_nat_helper.c, it returns false when
+skb->len + extra > 65535 in enlarge_skb().
+
+Signed-off-by: Xin Long <lucien.xin@gmail.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Stable-dep-of: 0ae8e4cca787 ("netfilter: nf_tables: set transport offset from mac header for netdev/egress")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/netfilter/nf_tables_ipv4.h | 4 ++--
+ net/netfilter/ipvs/ip_vs_xmit.c | 2 +-
+ net/netfilter/nf_log_syslog.c | 2 +-
+ net/netfilter/xt_length.c | 2 +-
+ 4 files changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h
+index c4a6147b0ef8c..d8f6cb47ebe37 100644
+--- a/include/net/netfilter/nf_tables_ipv4.h
++++ b/include/net/netfilter/nf_tables_ipv4.h
+@@ -29,7 +29,7 @@ static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt)
+ if (iph->ihl < 5 || iph->version != 4)
+ return -1;
+
+- len = ntohs(iph->tot_len);
++ len = iph_totlen(pkt->skb, iph);
+ thoff = iph->ihl * 4;
+ if (pkt->skb->len < len)
+ return -1;
+@@ -62,7 +62,7 @@ static inline int nft_set_pktinfo_ipv4_ingress(struct nft_pktinfo *pkt)
+ if (iph->ihl < 5 || iph->version != 4)
+ goto inhdr_error;
+
+- len = ntohs(iph->tot_len);
++ len = iph_totlen(pkt->skb, iph);
+ thoff = iph->ihl * 4;
+ if (pkt->skb->len < len) {
+ __IP_INC_STATS(nft_net(pkt), IPSTATS_MIB_INTRUNCATEDPKTS);
+diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
+index 7243079ef3546..b452eb3ddcecb 100644
+--- a/net/netfilter/ipvs/ip_vs_xmit.c
++++ b/net/netfilter/ipvs/ip_vs_xmit.c
+@@ -994,7 +994,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
+ old_dsfield = ipv4_get_dsfield(old_iph);
+ *ttl = old_iph->ttl;
+ if (payload_len)
+- *payload_len = ntohs(old_iph->tot_len);
++ *payload_len = skb_ip_totlen(skb);
+ }
+
+ /* Implement full-functionality option for ECN encapsulation */
+diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c
+index cb894f0d63e9d..c66689ad2b491 100644
+--- a/net/netfilter/nf_log_syslog.c
++++ b/net/netfilter/nf_log_syslog.c
+@@ -322,7 +322,7 @@ dump_ipv4_packet(struct net *net, struct nf_log_buf *m,
+
+ /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
+ nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
+- ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
++ iph_totlen(skb, ih), ih->tos & IPTOS_TOS_MASK,
+ ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
+
+ /* Max length: 6 "CE DF MF " */
+diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c
+index 9fbfad13176f0..ca730cedb5d41 100644
+--- a/net/netfilter/xt_length.c
++++ b/net/netfilter/xt_length.c
+@@ -21,7 +21,7 @@ static bool
+ length_mt(const struct sk_buff *skb, struct xt_action_param *par)
+ {
+ const struct xt_length_info *info = par->matchinfo;
+- u_int16_t pktlen = ntohs(ip_hdr(skb)->tot_len);
++ u32 pktlen = skb_ip_totlen(skb);
+
+ return (pktlen >= info->min && pktlen <= info->max) ^ info->invert;
+ }
+--
+2.43.0
+
--- /dev/null
+From 931ea9a2205ca793f1dcdff5f7f215cc9d0f2826 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 19 Dec 2023 23:19:43 +0530
+Subject: nfc: llcp_core: Hold a ref to llcp_local->dev when holding a ref to
+ llcp_local
+
+From: Siddh Raman Pant <code@siddh.me>
+
+[ Upstream commit c95f919567d6f1914f13350af61a1b044ac85014 ]
+
+llcp_sock_sendmsg() calls nfc_llcp_send_ui_frame() which in turn calls
+nfc_alloc_send_skb(), which accesses the nfc_dev from the llcp_sock for
+getting the headroom and tailroom needed for skb allocation.
+
+In parallel, the nfc_dev can be freed, as the refcount is decreased via
+nfc_free_device(), leading to a UAF reported by Syzkaller, which can
+be summarized as follows:
+
+(1) llcp_sock_sendmsg() -> nfc_llcp_send_ui_frame()
+ -> nfc_alloc_send_skb() -> Dereference *nfc_dev
+(2) virtual_ncidev_close() -> nci_free_device() -> nfc_free_device()
+ -> put_device() -> nfc_release() -> Free *nfc_dev
+
+When a reference to llcp_local is acquired, we do not acquire the same
+for the nfc_dev. This leads to freeing even when the llcp_local is in
+use, and this is the case with the UAF described above too.
+
+Thus, when we acquire a reference to llcp_local, we should acquire a
+reference to nfc_dev, and release the references appropriately later.
+
+The refcount of llcp_local is initialized in nfc_llcp_register_device()
+(which is called by nfc_register_device()). Thus, we should acquire a
+reference to nfc_dev there.
+
+nfc_unregister_device() calls nfc_llcp_unregister_device() which in
+turn calls nfc_llcp_local_put(). Thus, the reference to nfc_dev is
+appropriately released later.
+
+Reported-and-tested-by: syzbot+bbe84a4010eeea00982d@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=bbe84a4010eeea00982d
+Fixes: c7aa12252f51 ("NFC: Take a reference on the LLCP local pointer when creating a socket")
+Reviewed-by: Suman Ghosh <sumang@marvell.com>
+Signed-off-by: Siddh Raman Pant <code@siddh.me>
+Reviewed-by: Krzysztof Kozlowski <krzysztof.kozlowski@linaro.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/nfc/llcp_core.c | 39 ++++++++++++++++++++++++++++++++++++---
+ 1 file changed, 36 insertions(+), 3 deletions(-)
+
+diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c
+index 1dac28136e6a3..18be13fb9b75a 100644
+--- a/net/nfc/llcp_core.c
++++ b/net/nfc/llcp_core.c
+@@ -145,6 +145,13 @@ static void nfc_llcp_socket_release(struct nfc_llcp_local *local, bool device,
+
+ static struct nfc_llcp_local *nfc_llcp_local_get(struct nfc_llcp_local *local)
+ {
++ /* Since using nfc_llcp_local may result in usage of nfc_dev, whenever
++ * we hold a reference to local, we also need to hold a reference to
++ * the device to avoid UAF.
++ */
++ if (!nfc_get_device(local->dev->idx))
++ return NULL;
++
+ kref_get(&local->ref);
+
+ return local;
+@@ -177,10 +184,18 @@ static void local_release(struct kref *ref)
+
+ int nfc_llcp_local_put(struct nfc_llcp_local *local)
+ {
++ struct nfc_dev *dev;
++ int ret;
++
+ if (local == NULL)
+ return 0;
+
+- return kref_put(&local->ref, local_release);
++ dev = local->dev;
++
++ ret = kref_put(&local->ref, local_release);
++ nfc_put_device(dev);
++
++ return ret;
+ }
+
+ static struct nfc_llcp_sock *nfc_llcp_sock_get(struct nfc_llcp_local *local,
+@@ -959,8 +974,17 @@ static void nfc_llcp_recv_connect(struct nfc_llcp_local *local,
+ }
+
+ new_sock = nfc_llcp_sock(new_sk);
+- new_sock->dev = local->dev;
++
+ new_sock->local = nfc_llcp_local_get(local);
++ if (!new_sock->local) {
++ reason = LLCP_DM_REJ;
++ sock_put(&new_sock->sk);
++ release_sock(&sock->sk);
++ sock_put(&sock->sk);
++ goto fail;
++ }
++
++ new_sock->dev = local->dev;
+ new_sock->rw = sock->rw;
+ new_sock->miux = sock->miux;
+ new_sock->nfc_protocol = sock->nfc_protocol;
+@@ -1597,7 +1621,16 @@ int nfc_llcp_register_device(struct nfc_dev *ndev)
+ if (local == NULL)
+ return -ENOMEM;
+
+- local->dev = ndev;
++ /* As we are going to initialize local's refcount, we need to get the
++ * nfc_dev to avoid UAF, otherwise there is no point in continuing.
++ * See nfc_llcp_local_get().
++ */
++ local->dev = nfc_get_device(ndev->idx);
++ if (!local->dev) {
++ kfree(local);
++ return -ENODEV;
++ }
++
+ INIT_LIST_HEAD(&local->list);
+ kref_init(&local->ref);
+ mutex_init(&local->sdp_lock);
+--
+2.43.0
+
--- /dev/null
+From 6e614dd77c02e745841357aa5fe9f9b4b1a63b2b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 2 Jan 2024 15:26:43 +0530
+Subject: octeontx2-af: Always configure NIX TX link credits based on max frame
+ size
+
+From: Naveen Mamindlapalli <naveenm@marvell.com>
+
+[ Upstream commit a0d9528f6daf7fe8de217fa80a94d2989d2a57a7 ]
+
+Currently the NIX TX link credits are initialized based on the max frame
+size that can be transmitted on a link but when the MTU is changed, the
+NIX TX link credits are reprogrammed by the SW based on the new MTU value.
+Since SMQ max packet length is programmed to max frame size by default,
+there is a chance that NIX TX may stall while sending a max frame sized
+packet on the link with insufficient credits to send the packet all at
+once. This patch avoids stall issue by not changing the link credits
+dynamically when the MTU is changed.
+
+Fixes: 1c74b89171c3 ("octeontx2-af: Wait for TX link idle for credits change")
+Signed-off-by: Naveen Mamindlapalli <naveenm@marvell.com>
+Signed-off-by: Sunil Kovvuri Goutham <sgoutham@marvell.com>
+Signed-off-by: Nithin Kumar Dabilpuram <ndabilpuram@marvell.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../ethernet/marvell/octeontx2/af/rvu_nix.c | 110 +-----------------
+ 1 file changed, 3 insertions(+), 107 deletions(-)
+
+diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
+index 959f36efdc4a6..15f698020ec44 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
+@@ -3923,90 +3923,18 @@ static void nix_find_link_frs(struct rvu *rvu,
+ req->minlen = minlen;
+ }
+
+-static int
+-nix_config_link_credits(struct rvu *rvu, int blkaddr, int link,
+- u16 pcifunc, u64 tx_credits)
+-{
+- struct rvu_hwinfo *hw = rvu->hw;
+- int pf = rvu_get_pf(pcifunc);
+- u8 cgx_id = 0, lmac_id = 0;
+- unsigned long poll_tmo;
+- bool restore_tx_en = 0;
+- struct nix_hw *nix_hw;
+- u64 cfg, sw_xoff = 0;
+- u32 schq = 0;
+- u32 credits;
+- int rc;
+-
+- nix_hw = get_nix_hw(rvu->hw, blkaddr);
+- if (!nix_hw)
+- return NIX_AF_ERR_INVALID_NIXBLK;
+-
+- if (tx_credits == nix_hw->tx_credits[link])
+- return 0;
+-
+- /* Enable cgx tx if disabled for credits to be back */
+- if (is_pf_cgxmapped(rvu, pf)) {
+- rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id);
+- restore_tx_en = !rvu_cgx_config_tx(rvu_cgx_pdata(cgx_id, rvu),
+- lmac_id, true);
+- }
+-
+- mutex_lock(&rvu->rsrc_lock);
+- /* Disable new traffic to link */
+- if (hw->cap.nix_shaping) {
+- schq = nix_get_tx_link(rvu, pcifunc);
+- sw_xoff = rvu_read64(rvu, blkaddr, NIX_AF_TL1X_SW_XOFF(schq));
+- rvu_write64(rvu, blkaddr,
+- NIX_AF_TL1X_SW_XOFF(schq), BIT_ULL(0));
+- }
+-
+- rc = NIX_AF_ERR_LINK_CREDITS;
+- poll_tmo = jiffies + usecs_to_jiffies(200000);
+- /* Wait for credits to return */
+- do {
+- if (time_after(jiffies, poll_tmo))
+- goto exit;
+- usleep_range(100, 200);
+-
+- cfg = rvu_read64(rvu, blkaddr,
+- NIX_AF_TX_LINKX_NORM_CREDIT(link));
+- credits = (cfg >> 12) & 0xFFFFFULL;
+- } while (credits != nix_hw->tx_credits[link]);
+-
+- cfg &= ~(0xFFFFFULL << 12);
+- cfg |= (tx_credits << 12);
+- rvu_write64(rvu, blkaddr, NIX_AF_TX_LINKX_NORM_CREDIT(link), cfg);
+- rc = 0;
+-
+- nix_hw->tx_credits[link] = tx_credits;
+-
+-exit:
+- /* Enable traffic back */
+- if (hw->cap.nix_shaping && !sw_xoff)
+- rvu_write64(rvu, blkaddr, NIX_AF_TL1X_SW_XOFF(schq), 0);
+-
+- /* Restore state of cgx tx */
+- if (restore_tx_en)
+- rvu_cgx_config_tx(rvu_cgx_pdata(cgx_id, rvu), lmac_id, false);
+-
+- mutex_unlock(&rvu->rsrc_lock);
+- return rc;
+-}
+-
+ int rvu_mbox_handler_nix_set_hw_frs(struct rvu *rvu, struct nix_frs_cfg *req,
+ struct msg_rsp *rsp)
+ {
+ struct rvu_hwinfo *hw = rvu->hw;
+ u16 pcifunc = req->hdr.pcifunc;
+ int pf = rvu_get_pf(pcifunc);
+- int blkaddr, schq, link = -1;
+- struct nix_txsch *txsch;
+- u64 cfg, lmac_fifo_len;
++ int blkaddr, link = -1;
+ struct nix_hw *nix_hw;
+ struct rvu_pfvf *pfvf;
+ u8 cgx = 0, lmac = 0;
+ u16 max_mtu;
++ u64 cfg;
+
+ blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NIX, pcifunc);
+ if (blkaddr < 0)
+@@ -4027,25 +3955,6 @@ int rvu_mbox_handler_nix_set_hw_frs(struct rvu *rvu, struct nix_frs_cfg *req,
+ if (req->update_minlen && req->minlen < NIC_HW_MIN_FRS)
+ return NIX_AF_ERR_FRS_INVALID;
+
+- /* Check if requester wants to update SMQ's */
+- if (!req->update_smq)
+- goto rx_frscfg;
+-
+- /* Update min/maxlen in each of the SMQ attached to this PF/VF */
+- txsch = &nix_hw->txsch[NIX_TXSCH_LVL_SMQ];
+- mutex_lock(&rvu->rsrc_lock);
+- for (schq = 0; schq < txsch->schq.max; schq++) {
+- if (TXSCH_MAP_FUNC(txsch->pfvf_map[schq]) != pcifunc)
+- continue;
+- cfg = rvu_read64(rvu, blkaddr, NIX_AF_SMQX_CFG(schq));
+- cfg = (cfg & ~(0xFFFFULL << 8)) | ((u64)req->maxlen << 8);
+- if (req->update_minlen)
+- cfg = (cfg & ~0x7FULL) | ((u64)req->minlen & 0x7F);
+- rvu_write64(rvu, blkaddr, NIX_AF_SMQX_CFG(schq), cfg);
+- }
+- mutex_unlock(&rvu->rsrc_lock);
+-
+-rx_frscfg:
+ /* Check if config is for SDP link */
+ if (req->sdp_link) {
+ if (!hw->sdp_links)
+@@ -4068,7 +3977,6 @@ int rvu_mbox_handler_nix_set_hw_frs(struct rvu *rvu, struct nix_frs_cfg *req,
+ if (link < 0)
+ return NIX_AF_ERR_RX_LINK_INVALID;
+
+-
+ linkcfg:
+ nix_find_link_frs(rvu, req, pcifunc);
+
+@@ -4078,19 +3986,7 @@ int rvu_mbox_handler_nix_set_hw_frs(struct rvu *rvu, struct nix_frs_cfg *req,
+ cfg = (cfg & ~0xFFFFULL) | req->minlen;
+ rvu_write64(rvu, blkaddr, NIX_AF_RX_LINKX_CFG(link), cfg);
+
+- if (req->sdp_link || pf == 0)
+- return 0;
+-
+- /* Update transmit credits for CGX links */
+- lmac_fifo_len = rvu_cgx_get_lmac_fifolen(rvu, cgx, lmac);
+- if (!lmac_fifo_len) {
+- dev_err(rvu->dev,
+- "%s: Failed to get CGX/RPM%d:LMAC%d FIFO size\n",
+- __func__, cgx, lmac);
+- return 0;
+- }
+- return nix_config_link_credits(rvu, blkaddr, link, pcifunc,
+- (lmac_fifo_len - req->maxlen) / 16);
++ return 0;
+ }
+
+ int rvu_mbox_handler_nix_set_rx_cfg(struct rvu *rvu, struct nix_rx_cfg *req,
+--
+2.43.0
+
--- /dev/null
+From abb25686716bdc469df9eb0f9cea42cd499e4e1e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 19 Dec 2023 19:56:33 +0530
+Subject: octeontx2-af: Fix marking couple of structure as __packed
+
+From: Suman Ghosh <sumang@marvell.com>
+
+[ Upstream commit 0ee2384a5a0f3b4eeac8d10bb01a0609d245a4d1 ]
+
+A couple of structures were not marked as __packed. This patch
+fixes that by marking them as __packed.
+
+Fixes: 42006910b5ea ("octeontx2-af: cleanup KPU config data")
+Signed-off-by: Suman Ghosh <sumang@marvell.com>
+Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/marvell/octeontx2/af/npc.h | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/marvell/octeontx2/af/npc.h b/drivers/net/ethernet/marvell/octeontx2/af/npc.h
+index d027c23b8ef8e..aaff91bc7415a 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/af/npc.h
++++ b/drivers/net/ethernet/marvell/octeontx2/af/npc.h
+@@ -514,7 +514,7 @@ struct npc_lt_def {
+ u8 ltype_mask;
+ u8 ltype_match;
+ u8 lid;
+-};
++} __packed;
+
+ struct npc_lt_def_ipsec {
+ u8 ltype_mask;
+@@ -522,7 +522,7 @@ struct npc_lt_def_ipsec {
+ u8 lid;
+ u8 spi_offset;
+ u8 spi_nz;
+-};
++} __packed;
+
+ struct npc_lt_def_apad {
+ u8 ltype_mask;
+--
+2.43.0
+
--- /dev/null
+From c1c4d52f9e1a5f81883fafcb8b866f3bb4e20f70 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Dec 2023 14:57:54 +0530
+Subject: octeontx2-af: Fix pause frame configuration
+
+From: Hariprasad Kelam <hkelam@marvell.com>
+
+[ Upstream commit e307b5a845c5951dabafc48d00b6424ee64716c4 ]
+
+The current implementation's default Pause Forward setting is causing
+unnecessary network traffic. This patch disables Pause Forward to
+address this issue.
+
+Fixes: 1121f6b02e7a ("octeontx2-af: Priority flow control configuration support")
+Signed-off-by: Hariprasad Kelam <hkelam@marvell.com>
+Signed-off-by: Sunil Kovvuri Goutham <sgoutham@marvell.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/marvell/octeontx2/af/rpm.c | 11 +++++++----
+ 1 file changed, 7 insertions(+), 4 deletions(-)
+
+diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rpm.c b/drivers/net/ethernet/marvell/octeontx2/af/rpm.c
+index a70e1153fa04b..6b4792a942d84 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/af/rpm.c
++++ b/drivers/net/ethernet/marvell/octeontx2/af/rpm.c
+@@ -283,6 +283,11 @@ void rpm_lmac_pause_frm_config(void *rpmd, int lmac_id, bool enable)
+ cfg = FIELD_SET(RPM_PFC_CLASS_MASK, 0, cfg);
+ rpm_write(rpm, lmac_id, RPMX_CMRX_PRT_CBFC_CTL, cfg);
+
++ /* Disable forward pause to driver */
++ cfg = rpm_read(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG);
++ cfg &= ~RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_FWD;
++ rpm_write(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG, cfg);
++
+ /* Enable channel mask for all LMACS */
+ rpm_write(rpm, 0, RPMX_CMR_CHAN_MSK_OR, ~0ULL);
+ }
+@@ -451,12 +456,10 @@ int rpm_lmac_pfc_config(void *rpmd, int lmac_id, u8 tx_pause, u8 rx_pause, u16 p
+
+ if (rx_pause) {
+ cfg &= ~(RPMX_MTI_MAC100X_COMMAND_CONFIG_RX_P_DISABLE |
+- RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_IGNORE |
+- RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_FWD);
++ RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_IGNORE);
+ } else {
+ cfg |= (RPMX_MTI_MAC100X_COMMAND_CONFIG_RX_P_DISABLE |
+- RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_IGNORE |
+- RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_FWD);
++ RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_IGNORE);
+ }
+
+ if (tx_pause) {
+--
+2.43.0
+
--- /dev/null
+From a82d68a811ec60556c67c80775519e4d65f02f35 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 2 Jan 2024 19:44:00 +0530
+Subject: octeontx2-af: Re-enable MAC TX in otx2_stop processing
+
+From: Naveen Mamindlapalli <naveenm@marvell.com>
+
+[ Upstream commit 818ed8933bd17bc91a9fa8b94a898189c546fc1a ]
+
+During QoS scheduling testing with multiple strict priority flows, the
+netdev tx watchdog timeout routine is invoked when a low priority QoS
+queue doesn't get a chance to transmit the packets because other high
+priority flows are completely subscribing the transmit link. The netdev
+tx watchdog timeout routine will stop MAC RX and TX functionality in
+otx2_stop() routine before cleanup of HW TX queues which results in SMQ
+flush errors because the packets belonging to low priority queues will
+never get flushed since MAC TX is disabled. This patch fixes the issue
+by re-enabling MAC TX to ensure the packets in HW pipeline get flushed
+properly.
+
+Fixes: a7faa68b4e7f ("octeontx2-af: Start/Stop traffic in CGX along with NPC")
+Signed-off-by: Naveen Mamindlapalli <naveenm@marvell.com>
+Signed-off-by: Sunil Kovvuri Goutham <sgoutham@marvell.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/marvell/octeontx2/af/rvu.h | 1 +
+ .../net/ethernet/marvell/octeontx2/af/rvu_cgx.c | 17 +++++++++++++++++
+ .../net/ethernet/marvell/octeontx2/af/rvu_nix.c | 8 +++++++-
+ 3 files changed, 25 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
+index 95a7bc396e8ea..ab78e9d020751 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
+@@ -850,6 +850,7 @@ u32 rvu_cgx_get_fifolen(struct rvu *rvu);
+ void *rvu_first_cgx_pdata(struct rvu *rvu);
+ int cgxlmac_to_pf(struct rvu *rvu, int cgx_id, int lmac_id);
+ int rvu_cgx_config_tx(void *cgxd, int lmac_id, bool enable);
++int rvu_cgx_tx_enable(struct rvu *rvu, u16 pcifunc, bool enable);
+ int rvu_cgx_prio_flow_ctrl_cfg(struct rvu *rvu, u16 pcifunc, u8 tx_pause, u8 rx_pause,
+ u16 pfc_en);
+ int rvu_cgx_cfg_pause_frm(struct rvu *rvu, u16 pcifunc, u8 tx_pause, u8 rx_pause);
+diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c
+index c60b9580ca969..fa658bd4dfb3b 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c
++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c
+@@ -456,6 +456,23 @@ int rvu_cgx_config_rxtx(struct rvu *rvu, u16 pcifunc, bool start)
+ return mac_ops->mac_rx_tx_enable(cgxd, lmac_id, start);
+ }
+
++int rvu_cgx_tx_enable(struct rvu *rvu, u16 pcifunc, bool enable)
++{
++ int pf = rvu_get_pf(pcifunc);
++ struct mac_ops *mac_ops;
++ u8 cgx_id, lmac_id;
++ void *cgxd;
++
++ if (!is_cgx_config_permitted(rvu, pcifunc))
++ return LMAC_AF_ERR_PERM_DENIED;
++
++ rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id);
++ cgxd = rvu_cgx_pdata(cgx_id, rvu);
++ mac_ops = get_mac_ops(cgxd);
++
++ return mac_ops->mac_tx_enable(cgxd, lmac_id, enable);
++}
++
+ int rvu_cgx_config_tx(void *cgxd, int lmac_id, bool enable)
+ {
+ struct mac_ops *mac_ops;
+diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
+index 15f698020ec44..7f9581ce7f1fe 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
+@@ -4506,7 +4506,13 @@ int rvu_mbox_handler_nix_lf_stop_rx(struct rvu *rvu, struct msg_req *req,
+ pfvf = rvu_get_pfvf(rvu, pcifunc);
+ clear_bit(NIXLF_INITIALIZED, &pfvf->flags);
+
+- return rvu_cgx_start_stop_io(rvu, pcifunc, false);
++ err = rvu_cgx_start_stop_io(rvu, pcifunc, false);
++ if (err)
++ return err;
++
++ rvu_cgx_tx_enable(rvu, pcifunc, true);
++
++ return 0;
+ }
+
+ #define RX_SA_BASE GENMASK_ULL(52, 7)
+--
+2.43.0
+
--- /dev/null
+From cc60721ea4736bbfd6d8355b455f0c847a4ccbe0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 5 Dec 2022 12:35:18 +0530
+Subject: octeontx2-af: Support variable number of lmacs
+
+From: Rakesh Babu Saladi <rsaladi2@marvell.com>
+
+[ Upstream commit f2e664ad503d4e5ce7c42a0862ab164331a0ef37 ]
+
+Most of the code in CGX/RPM driver assumes that the max lmacs per
+given MAC is always 4 and that the number of MAC blocks is also 4.
+With this assumption, the max number of interfaces supported is
+hardcoded to 16. This creates a problem as next gen CN10KB silicon
+MAC supports 8 lmacs per MAC block.
+
+This patch solves the problem by using the "max lmac per MAC block"
+value from constant csrs and uses the cgx_cnt_max value, which is
+populated based on the number of MAC blocks supported by silicon.
+
+Signed-off-by: Rakesh Babu Saladi <rsaladi2@marvell.com>
+Signed-off-by: Hariprasad Kelam <hkelam@marvell.com>
+Signed-off-by: Sunil Kovvuri Goutham <sgoutham@marvell.com>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Stable-dep-of: e307b5a845c5 ("octeontx2-af: Fix pause frame configuration")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../net/ethernet/marvell/octeontx2/af/cgx.c | 35 ++++++++-----------
+ .../net/ethernet/marvell/octeontx2/af/cgx.h | 6 ++--
+ .../marvell/octeontx2/af/lmac_common.h | 5 ++-
+ .../net/ethernet/marvell/octeontx2/af/rvu.h | 2 +-
+ .../ethernet/marvell/octeontx2/af/rvu_cgx.c | 26 ++++++++------
+ .../marvell/octeontx2/af/rvu_debugfs.c | 2 +-
+ .../ethernet/marvell/octeontx2/af/rvu_nix.c | 2 +-
+ .../marvell/octeontx2/af/rvu_npc_hash.c | 4 ++-
+ 8 files changed, 42 insertions(+), 40 deletions(-)
+
+diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
+index 65c0373d34d12..90be87dc105d3 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
++++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c
+@@ -78,7 +78,7 @@ static bool is_dev_rpm(void *cgxd)
+
+ bool is_lmac_valid(struct cgx *cgx, int lmac_id)
+ {
+- if (!cgx || lmac_id < 0 || lmac_id >= MAX_LMAC_PER_CGX)
++ if (!cgx || lmac_id < 0 || lmac_id >= cgx->max_lmac_per_mac)
+ return false;
+ return test_bit(lmac_id, &cgx->lmac_bmap);
+ }
+@@ -90,7 +90,7 @@ static int get_sequence_id_of_lmac(struct cgx *cgx, int lmac_id)
+ {
+ int tmp, id = 0;
+
+- for_each_set_bit(tmp, &cgx->lmac_bmap, MAX_LMAC_PER_CGX) {
++ for_each_set_bit(tmp, &cgx->lmac_bmap, cgx->max_lmac_per_mac) {
+ if (tmp == lmac_id)
+ break;
+ id++;
+@@ -121,7 +121,7 @@ u64 cgx_read(struct cgx *cgx, u64 lmac, u64 offset)
+
+ struct lmac *lmac_pdata(u8 lmac_id, struct cgx *cgx)
+ {
+- if (!cgx || lmac_id >= MAX_LMAC_PER_CGX)
++ if (!cgx || lmac_id >= cgx->max_lmac_per_mac)
+ return NULL;
+
+ return cgx->lmac_idmap[lmac_id];
+@@ -1410,7 +1410,7 @@ int cgx_get_fwdata_base(u64 *base)
+ if (!cgx)
+ return -ENXIO;
+
+- first_lmac = find_first_bit(&cgx->lmac_bmap, MAX_LMAC_PER_CGX);
++ first_lmac = find_first_bit(&cgx->lmac_bmap, cgx->max_lmac_per_mac);
+ req = FIELD_SET(CMDREG_ID, CGX_CMD_GET_FWD_BASE, req);
+ err = cgx_fwi_cmd_generic(req, &resp, cgx, first_lmac);
+ if (!err)
+@@ -1499,7 +1499,7 @@ static int cgx_fwi_link_change(struct cgx *cgx, int lmac_id, bool enable)
+
+ static inline int cgx_fwi_read_version(u64 *resp, struct cgx *cgx)
+ {
+- int first_lmac = find_first_bit(&cgx->lmac_bmap, MAX_LMAC_PER_CGX);
++ int first_lmac = find_first_bit(&cgx->lmac_bmap, cgx->max_lmac_per_mac);
+ u64 req = 0;
+
+ req = FIELD_SET(CMDREG_ID, CGX_CMD_GET_FW_VER, req);
+@@ -1537,7 +1537,7 @@ static void cgx_lmac_linkup_work(struct work_struct *work)
+ int i, err;
+
+ /* Do Link up for all the enabled lmacs */
+- for_each_set_bit(i, &cgx->lmac_bmap, MAX_LMAC_PER_CGX) {
++ for_each_set_bit(i, &cgx->lmac_bmap, cgx->max_lmac_per_mac) {
+ err = cgx_fwi_link_change(cgx, i, true);
+ if (err)
+ dev_info(dev, "cgx port %d:%d Link up command failed\n",
+@@ -1557,14 +1557,6 @@ int cgx_lmac_linkup_start(void *cgxd)
+ return 0;
+ }
+
+-static void cgx_lmac_get_fifolen(struct cgx *cgx)
+-{
+- u64 cfg;
+-
+- cfg = cgx_read(cgx, 0, CGX_CONST);
+- cgx->mac_ops->fifo_len = FIELD_GET(CGX_CONST_RXFIFO_SIZE, cfg);
+-}
+-
+ static int cgx_configure_interrupt(struct cgx *cgx, struct lmac *lmac,
+ int cnt, bool req_free)
+ {
+@@ -1619,17 +1611,14 @@ static int cgx_lmac_init(struct cgx *cgx)
+ u64 lmac_list;
+ int i, err;
+
+- cgx_lmac_get_fifolen(cgx);
+-
+- cgx->lmac_count = cgx->mac_ops->get_nr_lmacs(cgx);
+ /* lmac_list specifies which lmacs are enabled
+ * when bit n is set to 1, LMAC[n] is enabled
+ */
+ if (cgx->mac_ops->non_contiguous_serdes_lane)
+ lmac_list = cgx_read(cgx, 0, CGXX_CMRX_RX_LMACS) & 0xFULL;
+
+- if (cgx->lmac_count > MAX_LMAC_PER_CGX)
+- cgx->lmac_count = MAX_LMAC_PER_CGX;
++ if (cgx->lmac_count > cgx->max_lmac_per_mac)
++ cgx->lmac_count = cgx->max_lmac_per_mac;
+
+ for (i = 0; i < cgx->lmac_count; i++) {
+ lmac = kzalloc(sizeof(struct lmac), GFP_KERNEL);
+@@ -1707,7 +1696,7 @@ static int cgx_lmac_exit(struct cgx *cgx)
+ }
+
+ /* Free all lmac related resources */
+- for_each_set_bit(i, &cgx->lmac_bmap, MAX_LMAC_PER_CGX) {
++ for_each_set_bit(i, &cgx->lmac_bmap, cgx->max_lmac_per_mac) {
+ lmac = cgx->lmac_idmap[i];
+ if (!lmac)
+ continue;
+@@ -1723,6 +1712,12 @@ static int cgx_lmac_exit(struct cgx *cgx)
+
+ static void cgx_populate_features(struct cgx *cgx)
+ {
++ u64 cfg;
++
++ cfg = cgx_read(cgx, 0, CGX_CONST);
++ cgx->mac_ops->fifo_len = FIELD_GET(CGX_CONST_RXFIFO_SIZE, cfg);
++ cgx->max_lmac_per_mac = FIELD_GET(CGX_CONST_MAX_LMACS, cfg);
++
+ if (is_dev_rpm(cgx))
+ cgx->hw_features = (RVU_LMAC_FEAT_DMACF | RVU_MAC_RPM |
+ RVU_LMAC_FEAT_FC | RVU_LMAC_FEAT_PTP);
+diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.h b/drivers/net/ethernet/marvell/octeontx2/af/cgx.h
+index 04338db38671b..09ddb00f63cc7 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.h
++++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.h
+@@ -18,11 +18,8 @@
+ /* PCI BAR nos */
+ #define PCI_CFG_REG_BAR_NUM 0
+
+-#define CGX_ID_MASK 0x7
+-#define MAX_LMAC_PER_CGX 4
++#define CGX_ID_MASK 0xF
+ #define MAX_DMAC_ENTRIES_PER_CGX 32
+-#define CGX_FIFO_LEN 65536 /* 64K for both Rx & Tx */
+-#define CGX_OFFSET(x) ((x) * MAX_LMAC_PER_CGX)
+
+ /* Registers */
+ #define CGXX_CMRX_CFG 0x00
+@@ -56,6 +53,7 @@
+ #define CGXX_SCRATCH1_REG 0x1058
+ #define CGX_CONST 0x2000
+ #define CGX_CONST_RXFIFO_SIZE GENMASK_ULL(23, 0)
++#define CGX_CONST_MAX_LMACS GENMASK_ULL(31, 24)
+ #define CGXX_SPUX_CONTROL1 0x10000
+ #define CGXX_SPUX_LNX_FEC_CORR_BLOCKS 0x10700
+ #define CGXX_SPUX_LNX_FEC_UNCORR_BLOCKS 0x10800
+diff --git a/drivers/net/ethernet/marvell/octeontx2/af/lmac_common.h b/drivers/net/ethernet/marvell/octeontx2/af/lmac_common.h
+index 52b6016789fa4..697cfec74aa1e 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/af/lmac_common.h
++++ b/drivers/net/ethernet/marvell/octeontx2/af/lmac_common.h
+@@ -128,7 +128,10 @@ struct cgx {
+ struct pci_dev *pdev;
+ u8 cgx_id;
+ u8 lmac_count;
+- struct lmac *lmac_idmap[MAX_LMAC_PER_CGX];
++ /* number of LMACs per MAC could be 4 or 8 */
++ u8 max_lmac_per_mac;
++#define MAX_LMAC_COUNT 8
++ struct lmac *lmac_idmap[MAX_LMAC_COUNT];
+ struct work_struct cgx_cmd_work;
+ struct workqueue_struct *cgx_cmd_workq;
+ struct list_head cgx_list;
+diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
+index ab78e9d020751..0b76dfa979d4e 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h
+@@ -480,7 +480,7 @@ struct rvu {
+ u8 cgx_mapped_pfs;
+ u8 cgx_cnt_max; /* CGX port count max */
+ u8 *pf2cgxlmac_map; /* pf to cgx_lmac map */
+- u16 *cgxlmac2pf_map; /* bitmap of mapped pfs for
++ u64 *cgxlmac2pf_map; /* bitmap of mapped pfs for
+ * every cgx lmac port
+ */
+ unsigned long pf_notify_bmap; /* Flags for PF notification */
+diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c
+index fa658bd4dfb3b..bcb4385d0621c 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c
++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c
+@@ -55,8 +55,9 @@ bool is_mac_feature_supported(struct rvu *rvu, int pf, int feature)
+ return (cgx_features_get(cgxd) & feature);
+ }
+
++#define CGX_OFFSET(x) ((x) * rvu->hw->lmac_per_cgx)
+ /* Returns bitmap of mapped PFs */
+-static u16 cgxlmac_to_pfmap(struct rvu *rvu, u8 cgx_id, u8 lmac_id)
++static u64 cgxlmac_to_pfmap(struct rvu *rvu, u8 cgx_id, u8 lmac_id)
+ {
+ return rvu->cgxlmac2pf_map[CGX_OFFSET(cgx_id) + lmac_id];
+ }
+@@ -71,7 +72,8 @@ int cgxlmac_to_pf(struct rvu *rvu, int cgx_id, int lmac_id)
+ if (!pfmap)
+ return -ENODEV;
+ else
+- return find_first_bit(&pfmap, 16);
++ return find_first_bit(&pfmap,
++ rvu->cgx_cnt_max * rvu->hw->lmac_per_cgx);
+ }
+
+ static u8 cgxlmac_id_to_bmap(u8 cgx_id, u8 lmac_id)
+@@ -129,14 +131,14 @@ static int rvu_map_cgx_lmac_pf(struct rvu *rvu)
+ if (!cgx_cnt_max)
+ return 0;
+
+- if (cgx_cnt_max > 0xF || MAX_LMAC_PER_CGX > 0xF)
++ if (cgx_cnt_max > 0xF || rvu->hw->lmac_per_cgx > 0xF)
+ return -EINVAL;
+
+ /* Alloc map table
+ * An additional entry is required since PF id starts from 1 and
+ * hence entry at offset 0 is invalid.
+ */
+- size = (cgx_cnt_max * MAX_LMAC_PER_CGX + 1) * sizeof(u8);
++ size = (cgx_cnt_max * rvu->hw->lmac_per_cgx + 1) * sizeof(u8);
+ rvu->pf2cgxlmac_map = devm_kmalloc(rvu->dev, size, GFP_KERNEL);
+ if (!rvu->pf2cgxlmac_map)
+ return -ENOMEM;
+@@ -145,9 +147,10 @@ static int rvu_map_cgx_lmac_pf(struct rvu *rvu)
+ memset(rvu->pf2cgxlmac_map, 0xFF, size);
+
+ /* Reverse map table */
+- rvu->cgxlmac2pf_map = devm_kzalloc(rvu->dev,
+- cgx_cnt_max * MAX_LMAC_PER_CGX * sizeof(u16),
+- GFP_KERNEL);
++ rvu->cgxlmac2pf_map =
++ devm_kzalloc(rvu->dev,
++ cgx_cnt_max * rvu->hw->lmac_per_cgx * sizeof(u64),
++ GFP_KERNEL);
+ if (!rvu->cgxlmac2pf_map)
+ return -ENOMEM;
+
+@@ -156,7 +159,7 @@ static int rvu_map_cgx_lmac_pf(struct rvu *rvu)
+ if (!rvu_cgx_pdata(cgx, rvu))
+ continue;
+ lmac_bmap = cgx_get_lmac_bmap(rvu_cgx_pdata(cgx, rvu));
+- for_each_set_bit(iter, &lmac_bmap, MAX_LMAC_PER_CGX) {
++ for_each_set_bit(iter, &lmac_bmap, rvu->hw->lmac_per_cgx) {
+ lmac = cgx_get_lmacid(rvu_cgx_pdata(cgx, rvu),
+ iter);
+ rvu->pf2cgxlmac_map[pf] = cgxlmac_id_to_bmap(cgx, lmac);
+@@ -235,7 +238,8 @@ static void cgx_notify_pfs(struct cgx_link_event *event, struct rvu *rvu)
+ pfmap = cgxlmac_to_pfmap(rvu, event->cgx_id, event->lmac_id);
+
+ do {
+- pfid = find_first_bit(&pfmap, 16);
++ pfid = find_first_bit(&pfmap,
++ rvu->cgx_cnt_max * rvu->hw->lmac_per_cgx);
+ clear_bit(pfid, &pfmap);
+
+ /* check if notification is enabled */
+@@ -310,7 +314,7 @@ static int cgx_lmac_event_handler_init(struct rvu *rvu)
+ if (!cgxd)
+ continue;
+ lmac_bmap = cgx_get_lmac_bmap(cgxd);
+- for_each_set_bit(lmac, &lmac_bmap, MAX_LMAC_PER_CGX) {
++ for_each_set_bit(lmac, &lmac_bmap, rvu->hw->lmac_per_cgx) {
+ err = cgx_lmac_evh_register(&cb, cgxd, lmac);
+ if (err)
+ dev_err(rvu->dev,
+@@ -396,7 +400,7 @@ int rvu_cgx_exit(struct rvu *rvu)
+ if (!cgxd)
+ continue;
+ lmac_bmap = cgx_get_lmac_bmap(cgxd);
+- for_each_set_bit(lmac, &lmac_bmap, MAX_LMAC_PER_CGX)
++ for_each_set_bit(lmac, &lmac_bmap, rvu->hw->lmac_per_cgx)
+ cgx_lmac_evh_unregister(cgxd, lmac);
+ }
+
+diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
+index 5c9dc3f9262f5..cc5d342e026c7 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
+@@ -2618,7 +2618,7 @@ static void rvu_dbg_cgx_init(struct rvu *rvu)
+ rvu->rvu_dbg.cgx = debugfs_create_dir(dname,
+ rvu->rvu_dbg.cgx_root);
+
+- for_each_set_bit(lmac_id, &lmac_bmap, MAX_LMAC_PER_CGX) {
++ for_each_set_bit(lmac_id, &lmac_bmap, rvu->hw->lmac_per_cgx) {
+ /* lmac debugfs dir */
+ sprintf(dname, "lmac%d", lmac_id);
+ rvu->rvu_dbg.lmac =
+diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
+index 7f9581ce7f1fe..bb99302eab67a 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
+@@ -4079,7 +4079,7 @@ static void nix_link_config(struct rvu *rvu, int blkaddr,
+
+ /* Get LMAC id's from bitmap */
+ lmac_bmap = cgx_get_lmac_bmap(rvu_cgx_pdata(cgx, rvu));
+- for_each_set_bit(iter, &lmac_bmap, MAX_LMAC_PER_CGX) {
++ for_each_set_bit(iter, &lmac_bmap, rvu->hw->lmac_per_cgx) {
+ lmac_fifo_len = rvu_cgx_get_lmac_fifolen(rvu, cgx, iter);
+ if (!lmac_fifo_len) {
+ dev_err(rvu->dev,
+diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_hash.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_hash.c
+index 34fa59575fa91..54e0dfdc9d984 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_hash.c
++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_hash.c
+@@ -1999,7 +1999,9 @@ int rvu_npc_exact_init(struct rvu *rvu)
+ /* Install SDP drop rule */
+ drop_mcam_idx = &table->num_drop_rules;
+
+- max_lmac_cnt = rvu->cgx_cnt_max * MAX_LMAC_PER_CGX + PF_CGXMAP_BASE;
++ max_lmac_cnt = rvu->cgx_cnt_max * rvu->hw->lmac_per_cgx +
++ PF_CGXMAP_BASE;
++
+ for (i = PF_CGXMAP_BASE; i < max_lmac_cnt; i++) {
+ if (rvu->pf2cgxlmac_map[i] == 0xFF)
+ continue;
+--
+2.43.0
+
--- /dev/null
+From 583a0fa5c6f48858b3592059eba607d74041c813 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 22 Dec 2023 12:34:09 +0800
+Subject: r8169: Fix PCI error on system resume
+
+From: Kai-Heng Feng <kai.heng.feng@canonical.com>
+
+[ Upstream commit 9c476269bff2908a20930c58085bf0b05ebd569a ]
+
+Some r8168 NICs stop working upon system resume:
+
+[ 688.051096] r8169 0000:02:00.1 enp2s0f1: rtl_ep_ocp_read_cond == 0 (loop: 10, delay: 10000).
+[ 688.175131] r8169 0000:02:00.1 enp2s0f1: Link is Down
+...
+[ 691.534611] r8169 0000:02:00.1 enp2s0f1: PCI error (cmd = 0x0407, status_errs = 0x0000)
+
+Not sure if it's related, but those NICs have a BMC device at function
+0:
+02:00.0 Unassigned class [ff00]: Realtek Semiconductor Co., Ltd. Realtek RealManage BMC [10ec:816e] (rev 1a)
+
+Trial and error shows that increasing the loop wait on
+rtl_ep_ocp_read_cond to 30 eliminates the issue, so let
+rtl8168ep_driver_start() wait a bit longer.
+
+Fixes: e6d6ca6e1204 ("r8169: Add support for another RTL8168FP")
+Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
+Reviewed-by: Heiner Kallweit <hkallweit1@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/realtek/r8169_main.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
+index d22457f2cf9cf..06663c11ca96d 100644
+--- a/drivers/net/ethernet/realtek/r8169_main.c
++++ b/drivers/net/ethernet/realtek/r8169_main.c
+@@ -1145,7 +1145,7 @@ static void rtl8168ep_driver_start(struct rtl8169_private *tp)
+ {
+ r8168ep_ocp_write(tp, 0x01, 0x180, OOB_CMD_DRIVER_START);
+ r8168ep_ocp_write(tp, 0x01, 0x30, r8168ep_ocp_read(tp, 0x30) | 0x01);
+- rtl_loop_wait_high(tp, &rtl_ep_ocp_read_cond, 10000, 10);
++ rtl_loop_wait_high(tp, &rtl_ep_ocp_read_cond, 10000, 30);
+ }
+
+ static void rtl8168_driver_start(struct rtl8169_private *tp)
+--
+2.43.0
+
--- /dev/null
+From 861ba5891da49cc7768295f98827c32ed0dcd73a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Dec 2023 14:30:49 -0500
+Subject: ring-buffer: Fix 32-bit rb_time_read() race with rb_time_cmpxchg()
+
+From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+
+[ Upstream commit dec890089bf79a4954b61482715ee2d084364856 ]
+
+The following race can cause rb_time_read() to observe a corrupted time
+stamp:
+
+rb_time_cmpxchg()
+[...]
+ if (!rb_time_read_cmpxchg(&t->msb, msb, msb2))
+ return false;
+ if (!rb_time_read_cmpxchg(&t->top, top, top2))
+ return false;
+<interrupted before updating bottom>
+__rb_time_read()
+[...]
+ do {
+ c = local_read(&t->cnt);
+ top = local_read(&t->top);
+ bottom = local_read(&t->bottom);
+ msb = local_read(&t->msb);
+ } while (c != local_read(&t->cnt));
+
+ *cnt = rb_time_cnt(top);
+
+ /* If top and msb counts don't match, this interrupted a write */
+ if (*cnt != rb_time_cnt(msb))
+ return false;
+ ^ this check fails to catch that "bottom" is still not updated.
+
+So the old "bottom" value is returned, which is wrong.
+
+Fix this by checking that all three of msb, top, and bottom 2-bit cnt
+values match.
+
+The reason to favor checking all three fields over requiring a specific
+update order for both rb_time_set() and rb_time_cmpxchg() is because
+checking all three fields is more robust to handle partial failures of
+rb_time_cmpxchg() when interrupted by nested rb_time_set().
+
+Link: https://lore.kernel.org/lkml/20231211201324.652870-1-mathieu.desnoyers@efficios.com/
+Link: https://lore.kernel.org/linux-trace-kernel/20231212193049.680122-1-mathieu.desnoyers@efficios.com
+
+Fixes: f458a1453424e ("ring-buffer: Test last update in 32bit version of __rb_time_read()")
+Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/trace/ring_buffer.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
+index 06d52525407b8..71cad4f1323c6 100644
+--- a/kernel/trace/ring_buffer.c
++++ b/kernel/trace/ring_buffer.c
+@@ -646,8 +646,8 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
+
+ *cnt = rb_time_cnt(top);
+
+- /* If top and msb counts don't match, this interrupted a write */
+- if (*cnt != rb_time_cnt(msb))
++ /* If top, msb or bottom counts don't match, this interrupted a write */
++ if (*cnt != rb_time_cnt(msb) || *cnt != rb_time_cnt(bottom))
+ return false;
+
+ /* The shift to msb will lose its cnt bits */
+--
+2.43.0
+
--- /dev/null
+From 06bb501441103a4e4dd88b341771b77debd509e7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 23 Dec 2022 11:03:32 +0100
+Subject: s390/cpumf: support user space events for counting
+
+From: Thomas Richter <tmricht@linux.ibm.com>
+
+[ Upstream commit 91d5364dc673fa9cf3a5b7b30cf33c70803eb3a4 ]
+
+CPU Measurement counting facility events PROBLEM_STATE_CPU_CYCLES(32)
+and PROBLEM_STATE_INSTRUCTIONS(33) are valid events. However the device
+driver returns error -EOPNOTSUPP when these events are to be installed.
+
+Fix this and allow installation of events PROBLEM_STATE_CPU_CYCLES,
+PROBLEM_STATE_CPU_CYCLES:u, PROBLEM_STATE_INSTRUCTIONS and
+PROBLEM_STATE_INSTRUCTIONS:u.
+Kernel space counting only is still not supported by s390.
+
+Signed-off-by: Thomas Richter <tmricht@linux.ibm.com>
+Acked-by: Sumanth Korikkar <sumanthk@linux.ibm.com>
+Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
+Stable-dep-of: 09cda0a40051 ("s390/mm: add missing arch_set_page_dat() call to vmem_crst_alloc()")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/s390/kernel/perf_cpum_cf.c | 35 ++++++++++++++++++++++-----------
+ 1 file changed, 24 insertions(+), 11 deletions(-)
+
+diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c
+index f043a7ff220b7..28fa80fd69fa0 100644
+--- a/arch/s390/kernel/perf_cpum_cf.c
++++ b/arch/s390/kernel/perf_cpum_cf.c
+@@ -2,7 +2,7 @@
+ /*
+ * Performance event support for s390x - CPU-measurement Counter Facility
+ *
+- * Copyright IBM Corp. 2012, 2021
++ * Copyright IBM Corp. 2012, 2022
+ * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com>
+ * Thomas Richter <tmricht@linux.ibm.com>
+ */
+@@ -434,6 +434,12 @@ static void cpumf_hw_inuse(void)
+ mutex_unlock(&pmc_reserve_mutex);
+ }
+
++static int is_userspace_event(u64 ev)
++{
++ return cpumf_generic_events_user[PERF_COUNT_HW_CPU_CYCLES] == ev ||
++ cpumf_generic_events_user[PERF_COUNT_HW_INSTRUCTIONS] == ev;
++}
++
+ static int __hw_perf_event_init(struct perf_event *event, unsigned int type)
+ {
+ struct perf_event_attr *attr = &event->attr;
+@@ -456,19 +462,26 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type)
+ if (is_sampling_event(event)) /* No sampling support */
+ return -ENOENT;
+ ev = attr->config;
+- /* Count user space (problem-state) only */
+ if (!attr->exclude_user && attr->exclude_kernel) {
+- if (ev >= ARRAY_SIZE(cpumf_generic_events_user))
+- return -EOPNOTSUPP;
+- ev = cpumf_generic_events_user[ev];
+-
+- /* No support for kernel space counters only */
++ /*
++ * Count user space (problem-state) only
++ * Handle events 32 and 33 as 0:u and 1:u
++ */
++ if (!is_userspace_event(ev)) {
++ if (ev >= ARRAY_SIZE(cpumf_generic_events_user))
++ return -EOPNOTSUPP;
++ ev = cpumf_generic_events_user[ev];
++ }
+ } else if (!attr->exclude_kernel && attr->exclude_user) {
++ /* No support for kernel space counters only */
+ return -EOPNOTSUPP;
+- } else { /* Count user and kernel space */
+- if (ev >= ARRAY_SIZE(cpumf_generic_events_basic))
+- return -EOPNOTSUPP;
+- ev = cpumf_generic_events_basic[ev];
++ } else {
++ /* Count user and kernel space, incl. events 32 + 33 */
++ if (!is_userspace_event(ev)) {
++ if (ev >= ARRAY_SIZE(cpumf_generic_events_basic))
++ return -EOPNOTSUPP;
++ ev = cpumf_generic_events_basic[ev];
++ }
+ }
+ break;
+
+--
+2.43.0
+
--- /dev/null
+From 2c2e7b36450c06fc93a5c80dd815dfff68ba45f9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 17 Oct 2023 21:07:04 +0200
+Subject: s390/mm: add missing arch_set_page_dat() call to vmem_crst_alloc()
+
+From: Heiko Carstens <hca@linux.ibm.com>
+
+[ Upstream commit 09cda0a400519b1541591c506e54c9c48e3101bf ]
+
+If the cmma no-dat feature is available all pages that are not used for
+dynamic address translation are marked as "no-dat" with the ESSA
+instruction. This information is visible to the hypervisor, so that the
+hypervisor can optimize purging of guest TLB entries. This also means that
+pages which are used for dynamic address translation must not be marked as
+"no-dat", since the hypervisor may then incorrectly not purge guest TLB
+entries.
+
+Region and segment tables allocated via vmem_crst_alloc() are incorrectly
+marked as "no-dat", as soon as slab_is_available() returns true.
+
+Such tables are allocated e.g. when kernel page tables are split, memory is
+hotplugged, or a DCSS segment is loaded.
+
+Fix this by adding the missing arch_set_page_dat() call.
+
+Cc: <stable@vger.kernel.org>
+Reviewed-by: Claudio Imbrenda <imbrenda@linux.ibm.com>
+Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
+Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/s390/mm/vmem.c | 8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
+index 9a0ce5315f36d..3cbb461820666 100644
+--- a/arch/s390/mm/vmem.c
++++ b/arch/s390/mm/vmem.c
+@@ -11,6 +11,7 @@
+ #include <linux/list.h>
+ #include <linux/hugetlb.h>
+ #include <linux/slab.h>
++#include <asm/page-states.h>
+ #include <asm/cacheflush.h>
+ #include <asm/nospec-branch.h>
+ #include <asm/pgalloc.h>
+@@ -44,8 +45,11 @@ void *vmem_crst_alloc(unsigned long val)
+ unsigned long *table;
+
+ table = vmem_alloc_pages(CRST_ALLOC_ORDER);
+- if (table)
+- crst_table_init(table, val);
++ if (!table)
++ return NULL;
++ crst_table_init(table, val);
++ if (slab_is_available())
++ arch_set_page_dat(virt_to_page(table), CRST_ALLOC_ORDER);
+ return table;
+ }
+
+--
+2.43.0
+
--- /dev/null
+From d7b27f0f9c2a4de4109f6bda4aae6edb2b4cc9b7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 23 Dec 2023 20:59:22 +0800
+Subject: selftests: bonding: do not set port down when adding to bond
+
+From: Hangbin Liu <liuhangbin@gmail.com>
+
+[ Upstream commit 61fa2493ca76fd7bb74e13f0205274f4ab0aa696 ]
+
+Similar to commit be809424659c ("selftests: bonding: do not set port down
+before adding to bond"). The bond-arp-interval-causes-panic test failed
+after commit a4abfa627c38 ("net: rtnetlink: Enslave device before bringing
+it up") as the kernel will set the port down _after_ adding it to the bond
+if the port is explicitly set down.
+
+Fix it by removing the link down operation when adding to bond.
+
+Fixes: 2ffd57327ff1 ("selftests: bonding: cause oops in bond_rr_gen_slave_id")
+Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
+Tested-by: Benjamin Poirier <benjamin.poirier@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../drivers/net/bonding/bond-arp-interval-causes-panic.sh | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/tools/testing/selftests/drivers/net/bonding/bond-arp-interval-causes-panic.sh b/tools/testing/selftests/drivers/net/bonding/bond-arp-interval-causes-panic.sh
+index 71c00bfafbc99..2ff58fed76e28 100755
+--- a/tools/testing/selftests/drivers/net/bonding/bond-arp-interval-causes-panic.sh
++++ b/tools/testing/selftests/drivers/net/bonding/bond-arp-interval-causes-panic.sh
+@@ -33,16 +33,16 @@ ip netns add "client"
+ ip link set dev link1_1 netns client down name eth0
+ ip netns exec client ip link add dev bond0 down type bond mode 1 \
+ miimon 100 all_slaves_active 1
+-ip netns exec client ip link set dev eth0 down master bond0
++ip netns exec client ip link set dev eth0 master bond0
+ ip netns exec client ip link set dev bond0 up
+ ip netns exec client ip addr add ${client_ip4}/24 dev bond0
+ ip netns exec client ping -c 5 $server_ip4 >/dev/null
+
+-ip netns exec client ip link set dev eth0 down nomaster
++ip netns exec client ip link set dev eth0 nomaster
+ ip netns exec client ip link set dev bond0 down
+ ip netns exec client ip link set dev bond0 type bond mode 0 \
+ arp_interval 1000 arp_ip_target "+${server_ip4}"
+-ip netns exec client ip link set dev eth0 down master bond0
++ip netns exec client ip link set dev eth0 master bond0
+ ip netns exec client ip link set dev bond0 up
+ ip netns exec client ping -c 5 $server_ip4 >/dev/null
+
+--
+2.43.0
+
--- /dev/null
+From 329325f50f83617625c97e5ba5e4dcfd74ed1e79 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 14 Nov 2023 00:16:17 +0100
+Subject: selftests: mptcp: fix fastclose with csum failure
+
+From: Paolo Abeni <pabeni@redhat.com>
+
+[ Upstream commit 7cefbe5e1dacc7236caa77e9d072423f21422fe2 ]
+
+Running the mp_join selftest manually with the following command line:
+
+ ./mptcp_join.sh -z -C
+
+leads to some failures:
+
+ 002 fastclose server test
+ # ...
+ rtx [fail] got 1 MP_RST[s] TX expected 0
+ # ...
+ rstrx [fail] got 1 MP_RST[s] RX expected 0
+
+The problem is really in the wrong expectations for the RST checks
+implied by the csum validation. Note that the same check is repeated
+explicitly in the same test-case, with the correct expectation, and
+passes successfully.
+
+Address the issue by explicitly setting the correct expectation for
+the failing checks.
+
+Reported-by: Xiumei Mu <xmu@redhat.com>
+Fixes: 6bf41020b72b ("selftests: mptcp: update and extend fastclose test-cases")
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Reviewed-by: Matthieu Baerts <matttbe@kernel.org>
+Signed-off-by: Matthieu Baerts <matttbe@kernel.org>
+Link: https://lore.kernel.org/r/20231114-upstream-net-20231113-mptcp-misc-fixes-6-7-rc2-v1-5-7b9cd6a7b7f4@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/testing/selftests/net/mptcp/mptcp_join.sh | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
+index e52d513009fb0..9d8dde3b5c332 100755
+--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
++++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
+@@ -3041,7 +3041,7 @@ fastclose_tests()
+
+ if reset_check_counter "fastclose server test" "MPTcpExtMPFastcloseRx"; then
+ run_tests $ns1 $ns2 10.0.1.1 1024 0 fastclose_server
+- chk_join_nr 0 0 0
++ chk_join_nr 0 0 0 0 0 0 1
+ chk_fclose_nr 1 1 invert
+ chk_rst_nr 1 1
+ fi
+--
+2.43.0
+
--- /dev/null
+From 6465bc887b9ddfd7a4e8a118a4c52b4bf285ea3f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 23 Jun 2023 10:34:09 -0700
+Subject: selftests: mptcp: set FAILING_LINKS in run_tests
+
+From: Geliang Tang <geliang.tang@suse.com>
+
+[ Upstream commit be7e9786c9155c2942cd53b813e4723be67e07c4 ]
+
+Set FAILING_LINKS as an env var with a limited scope only when calling
+run_tests().
+
+Reviewed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
+Signed-off-by: Geliang Tang <geliang.tang@suse.com>
+Signed-off-by: Mat Martineau <martineau@kernel.org>
+Link: https://lore.kernel.org/r/20230623-send-net-next-20230623-v1-3-a883213c8ba9@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Stable-dep-of: 7cefbe5e1dac ("selftests: mptcp: fix fastclose with csum failure")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/testing/selftests/net/mptcp/mptcp_join.sh | 12 ++++++------
+ 1 file changed, 6 insertions(+), 6 deletions(-)
+
+diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh
+index 9d8dde3b5c332..2107579e2939d 100755
+--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh
++++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh
+@@ -2167,9 +2167,9 @@ link_failure_tests()
+ pm_nl_set_limits $ns1 0 2
+ pm_nl_add_endpoint $ns1 10.0.2.1 dev ns1eth2 flags signal
+ pm_nl_set_limits $ns2 1 2
+- FAILING_LINKS="1"
+ pm_nl_add_endpoint $ns2 10.0.3.2 dev ns2eth3 flags subflow,backup
+- run_tests $ns1 $ns2 10.0.1.1 1
++ FAILING_LINKS="1" \
++ run_tests $ns1 $ns2 10.0.1.1 1
+ chk_join_nr 2 2 2
+ chk_add_nr 1 1
+ chk_link_usage $ns2 ns2eth3 $cinsent 0
+@@ -2183,8 +2183,8 @@ link_failure_tests()
+ pm_nl_add_endpoint $ns1 10.0.2.1 dev ns1eth2 flags signal
+ pm_nl_set_limits $ns2 1 2
+ pm_nl_add_endpoint $ns2 10.0.3.2 dev ns2eth3 flags subflow,backup
+- FAILING_LINKS="1 2"
+- run_tests $ns1 $ns2 10.0.1.1 1
++ FAILING_LINKS="1 2" \
++ run_tests $ns1 $ns2 10.0.1.1 1
+ chk_join_nr 2 2 2
+ chk_add_nr 1 1
+ chk_stale_nr $ns2 2 4 2
+@@ -2199,8 +2199,8 @@ link_failure_tests()
+ pm_nl_add_endpoint $ns1 10.0.2.1 dev ns1eth2 flags signal
+ pm_nl_set_limits $ns2 1 3
+ pm_nl_add_endpoint $ns2 10.0.3.2 dev ns2eth3 flags subflow,backup
+- FAILING_LINKS="1 2"
+- run_tests $ns1 $ns2 10.0.1.1 2
++ FAILING_LINKS="1 2" \
++ run_tests $ns1 $ns2 10.0.1.1 2
+ chk_join_nr 2 2 2
+ chk_add_nr 1 1
+ chk_stale_nr $ns2 1 -1 2
+--
+2.43.0
+
--- /dev/null
+From 966552f614ba6fbff3836f33c83f31ff6ef93760 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Dec 2023 15:19:30 +0500
+Subject: selftests: secretmem: floor the memory size to the multiple of
+ page_size
+
+From: Muhammad Usama Anjum <usama.anjum@collabora.com>
+
+[ Upstream commit 0aac13add26d546ac74c89d2883b3a5f0fbea039 ]
+
+The "locked-in-memory size" limit per process can be a non-multiple of
+page_size. mmap() fails if we try to allocate locked-in-memory with the
+same size as the allowed limit when it isn't a multiple of page_size,
+because mmap() rounds the size to be allocated up to the next multiple
+of page_size.
+
+Fix this by flooring the length to be allocated with mmap() to the
+previous multiple of the page_size.
+
+This was getting triggered on KernelCI regularly because of different
+ulimit settings which weren't a multiple of the page_size. Find logs
+here: https://linux.kernelci.org/test/plan/id/657654bd8e81e654fae13532/
+The bug was present from the time the test was first added.
+
+Link: https://lkml.kernel.org/r/20231214101931.1155586-1-usama.anjum@collabora.com
+Fixes: 76fe17ef588a ("secretmem: test: add basic selftest for memfd_secret(2)")
+Signed-off-by: Muhammad Usama Anjum <usama.anjum@collabora.com>
+Reported-by: "kernelci.org bot" <bot@kernelci.org>
+Closes: https://linux.kernelci.org/test/plan/id/657654bd8e81e654fae13532/
+Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
+Cc: Mike Rapoport (IBM) <rppt@kernel.org>
+Cc: Shuah Khan <shuah@kernel.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/testing/selftests/vm/memfd_secret.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/tools/testing/selftests/vm/memfd_secret.c b/tools/testing/selftests/vm/memfd_secret.c
+index 957b9e18c7295..9b298f6a04b37 100644
+--- a/tools/testing/selftests/vm/memfd_secret.c
++++ b/tools/testing/selftests/vm/memfd_secret.c
+@@ -62,6 +62,9 @@ static void test_mlock_limit(int fd)
+ char *mem;
+
+ len = mlock_limit_cur;
++ if (len % page_size != 0)
++ len = (len/page_size) * page_size;
++
+ mem = mmap(NULL, len, prot, mode, fd, 0);
+ if (mem == MAP_FAILED) {
+ fail("unable to mmap secret memory\n");
+--
+2.43.0
+
keys-dns-fix-missing-size-check-of-v1-server-list-header.patch
block-don-t-invalidate-pagecache-for-invalid-falloc-modes.patch
+wifi-iwlwifi-pcie-don-t-synchronize-irqs-from-irq.patch
+drm-bridge-ti-sn65dsi86-never-store-more-than-msg-si.patch
+netfilter-use-skb_ip_totlen-and-iph_totlen.patch
+netfilter-nf_tables-set-transport-offset-from-mac-he.patch
+nfc-llcp_core-hold-a-ref-to-llcp_local-dev-when-hold.patch
+octeontx2-af-fix-marking-couple-of-structure-as-__pa.patch
+drm-i915-dp-fix-passing-the-correct-dpcd_rev-for-drm.patch
+ice-fix-link_down_on_close-message.patch
+ice-shut-down-vsi-with-link-down-on-close-enabled.patch
+i40e-fix-filter-input-checks-to-prevent-config-with-.patch
+igc-report-vlan-ethertype-matching-back-to-user.patch
+igc-check-vlan-tci-mask.patch
+igc-check-vlan-ethertype-mask.patch
+asoc-fsl_rpmsg-fix-error-handler-with-pm_runtime_ena.patch
+asoc-mediatek-mt8186-fix-aud_pad_top-register-and-of.patch
+mlxbf_gige-fix-receive-packet-race-condition.patch
+net-sched-em_text-fix-possible-memory-leak-in-em_tex.patch
+r8169-fix-pci-error-on-system-resume.patch
+can-raw-add-support-for-so_mark.patch
+net-timestamp-extend-sof_timestamping_opt_id-to-hw-t.patch
+net-annotate-data-races-around-sk-sk_tsflags.patch
+net-annotate-data-races-around-sk-sk_bind_phc.patch
+net-implement-missing-getsockopt-so_timestamping_new.patch
+selftests-bonding-do-not-set-port-down-when-adding-t.patch
+arm-sun9i-smp-fix-array-index-out-of-bounds-read-in-.patch
+sfc-fix-a-double-free-bug-in-efx_probe_filters.patch
+net-bcmgenet-fix-fcs-generation-for-fragmented-skbuf.patch
+netfilter-nft_immediate-drop-chain-reference-counter.patch
+net-save-and-restore-msg_namelen-in-sock_sendmsg.patch
+i40e-fix-use-after-free-in-i40e_aqc_add_filters.patch
+asoc-meson-g12a-toacodec-validate-written-enum-value.patch
+asoc-meson-g12a-tohdmitx-validate-written-enum-value.patch
+asoc-meson-g12a-toacodec-fix-event-generation.patch
+asoc-meson-g12a-tohdmitx-fix-event-generation-for-s-.patch
+i40e-restore-vf-msi-x-state-during-pci-reset.patch
+igc-fix-hicredit-calculation.patch
+net-qla3xxx-fix-potential-memleak-in-ql_alloc_buffer.patch
+net-smc-fix-invalid-link-access-in-dumping-smc-r-con.patch
+octeontx2-af-always-configure-nix-tx-link-credits-ba.patch
+octeontx2-af-re-enable-mac-tx-in-otx2_stop-processin.patch
+asix-add-check-for-usbnet_get_endpoints.patch
+net-ravb-wait-for-operating-mode-to-be-applied.patch
+bnxt_en-remove-mis-applied-code-from-bnxt_cfg_ntp_fi.patch
+net-implement-missing-so_timestamping_new-cmsg-suppo.patch
+selftests-secretmem-floor-the-memory-size-to-the-mul.patch
+cpu-smt-create-topology_smt_thread_allowed.patch
+cpu-smt-make-smt-control-more-robust-against-enumera.patch
+srcu-fix-callbacks-acceleration-mishandling.patch
+bpf-x64-fix-tailcall-infinite-loop.patch
+bpf-x86-simplify-the-parsing-logic-of-structure-para.patch
+bpf-x86-save-restore-regs-with-bpf_dw-size.patch
+net-declare-msg_splice_pages-internal-sendmsg-flag.patch
+udp-convert-udp_sendpage-to-use-msg_splice_pages.patch
+splice-net-add-a-splice_eof-op-to-file-ops-and-socke.patch
+ipv4-ipv6-use-splice_eof-to-flush.patch
+udp-introduce-udp-udp_flags.patch
+udp-move-udp-no_check6_tx-to-udp-udp_flags.patch
+udp-move-udp-no_check6_rx-to-udp-udp_flags.patch
+udp-move-udp-gro_enabled-to-udp-udp_flags.patch
+udp-move-udp-accept_udp_-l4-fraglist-to-udp-udp_flag.patch
+udp-lockless-udp_encap_l2tpinudp-udp_gro.patch
+udp-annotate-data-races-around-udp-encap_type.patch
+wifi-iwlwifi-yoyo-swap-cdb-and-jacket-bits-values.patch
+arm64-dts-qcom-sdm845-align-rpmh-regulator-nodes-wit.patch
+arm64-dts-qcom-sdm845-fix-psci-power-domain-names.patch
+fbdev-imsttfb-release-framebuffer-and-dealloc-cmap-o.patch
+fbdev-imsttfb-fix-double-free-in-probe.patch
+bpf-decouple-prune-and-jump-points.patch
+bpf-remove-unnecessary-prune-and-jump-points.patch
+bpf-remove-unused-insn_cnt-argument-from-visit_-func.patch
+bpf-clean-up-visit_insn-s-instruction-processing.patch
+bpf-support-new-32bit-offset-jmp-instruction.patch
+bpf-handle-ldimm64-properly-in-check_cfg.patch
+bpf-fix-precision-backtracking-instruction-iteration.patch
+blk-mq-make-sure-active-queue-usage-is-held-for-bio_.patch
+net-mlx5-increase-size-of-irq-name-buffer.patch
+s390-mm-add-missing-arch_set_page_dat-call-to-vmem_c.patch
+s390-cpumf-support-user-space-events-for-counting.patch
+f2fs-clean-up-i_compress_flag-and-i_compress_level-u.patch
+f2fs-convert-to-use-bitmap-api.patch
+f2fs-assign-default-compression-level.patch
+f2fs-set-the-default-compress_level-on-ioctl.patch
+selftests-mptcp-fix-fastclose-with-csum-failure.patch
+selftests-mptcp-set-failing_links-in-run_tests.patch
+media-camss-sm8250-virtual-channels-for-csid.patch
+media-qcom-camss-fix-set-csi2_rx_cfg1_vc_mode-when-v.patch
+ext4-convert-move_extent_per_page-to-use-folios.patch
+khugepage-replace-try_to_release_page-with-filemap_r.patch
+memory-failure-convert-truncate_error_page-to-use-fo.patch
+mm-merge-folio_has_private-filemap_release_folio-cal.patch
+mm-netfs-fscache-stop-read-optimisation-when-folio-r.patch
+filemap-add-a-per-mapping-stable-writes-flag.patch
+block-update-the-stable_writes-flag-in-bdev_add.patch
+smb-client-fix-missing-mode-bits-for-smb-symlinks.patch
+net-dpaa2-eth-rearrange-variable-in-dpaa2_eth_get_et.patch
+dpaa2-eth-recycle-the-rx-buffer-only-after-all-proce.patch
+ethtool-don-t-propagate-eopnotsupp-from-dumps.patch
+bpf-sockmap-af_unix-stream-sockets-need-to-hold-ref-.patch
+firmware-arm_scmi-fix-frequency-truncation-by-promot.patch
+alsa-hda-realtek-add-quirk-for-lenovo-yoga-pro-7.patch
+genirq-affinity-remove-the-firstvec-parameter-from-i.patch
+genirq-affinity-pass-affinity-managed-mask-array-to-.patch
+genirq-affinity-don-t-pass-irq_affinity_desc-array-t.patch
+genirq-affinity-rename-irq_build_affinity_masks-as-g.patch
+genirq-affinity-move-group_cpus_evenly-into-lib.patch
+lib-group_cpus.c-avoid-acquiring-cpu-hotplug-lock-in.patch
+mm-memory_hotplug-add-missing-mem_hotplug_lock.patch
+mm-memory_hotplug-fix-error-handling-in-add_memory_r.patch
+net-sched-call-tcf_ct_params_free-to-free-params-in-.patch
+netfilter-flowtable-allow-unidirectional-rules.patch
+netfilter-flowtable-cache-info-of-last-offload.patch
+net-sched-act_ct-offload-udp-new-connections.patch
+net-sched-act_ct-fix-promotion-of-offloaded-unreplie.patch
+netfilter-flowtable-gc-pushes-back-packets-to-classi.patch
+net-sched-act_ct-take-per-cb-reference-to-tcf_ct_flo.patch
+octeontx2-af-fix-pause-frame-configuration.patch
+octeontx2-af-support-variable-number-of-lmacs.patch
+btrfs-fix-qgroup_free_reserved_data-int-overflow.patch
+btrfs-mark-the-len-field-in-struct-btrfs_ordered_sum.patch
+ring-buffer-fix-32-bit-rb_time_read-race-with-rb_tim.patch
--- /dev/null
+From 8c1095d26a73f2833e1f8e43056a8371a254b8f5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 25 Dec 2023 19:29:14 +0800
+Subject: sfc: fix a double-free bug in efx_probe_filters
+
+From: Zhipeng Lu <alexious@zju.edu.cn>
+
+[ Upstream commit d5a306aedba34e640b11d7026dbbafb78ee3a5f6 ]
+
+In efx_probe_filters, the channel->rps_flow_id is freed in an
+efx_for_each_channel macro when success equals 0.
+However, after the following call chain:
+
+ef100_net_open
+ |-> efx_probe_filters
+ |-> ef100_net_stop
+ |-> efx_remove_filters
+
+The channel->rps_flow_id is freed again in the efx_for_each_channel of
+efx_remove_filters, triggering a double-free bug.
+
+Fixes: a9dc3d5612ce ("sfc_ef100: RX filter table management and related gubbins")
+Reviewed-by: Simon Horman <horms@kernel.org>
+Reviewed-by: Edward Cree <ecree.xilinx@gmail.com>
+Signed-off-by: Zhipeng Lu <alexious@zju.edu.cn>
+Link: https://lore.kernel.org/r/20231225112915.3544581-1-alexious@zju.edu.cn
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/sfc/rx_common.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/sfc/rx_common.c b/drivers/net/ethernet/sfc/rx_common.c
+index 9220afeddee81..3f290791df1c4 100644
+--- a/drivers/net/ethernet/sfc/rx_common.c
++++ b/drivers/net/ethernet/sfc/rx_common.c
+@@ -820,8 +820,10 @@ int efx_probe_filters(struct efx_nic *efx)
+ }
+
+ if (!success) {
+- efx_for_each_channel(channel, efx)
++ efx_for_each_channel(channel, efx) {
+ kfree(channel->rps_flow_id);
++ channel->rps_flow_id = NULL;
++ }
+ efx->type->filter_table_remove(efx);
+ rc = -ENOMEM;
+ goto out_unlock;
+--
+2.43.0
+
--- /dev/null
+From 9ab12c755c1e01616c2dcfea64cf55fb2c382d0b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 25 Nov 2023 23:55:10 -0300
+Subject: smb: client: fix missing mode bits for SMB symlinks
+
+From: Paulo Alcantara <pc@manguebit.com>
+
+[ Upstream commit ef22bb800d967616c7638d204bc1b425beac7f5f ]
+
+When instantiating inodes for SMB symlinks, add the mode bits from
+@cifs_sb->ctx->file_mode as we already do for the other special files.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Paulo Alcantara (SUSE) <pc@manguebit.com>
+Signed-off-by: Steve French <stfrench@microsoft.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/smb/client/inode.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
+index 7be51f9d2fa18..5343898bac8a6 100644
+--- a/fs/smb/client/inode.c
++++ b/fs/smb/client/inode.c
+@@ -264,7 +264,7 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info,
+ fattr->cf_dtype = DT_REG;
+ break;
+ case UNIX_SYMLINK:
+- fattr->cf_mode |= S_IFLNK;
++ fattr->cf_mode |= S_IFLNK | cifs_sb->ctx->file_mode;
+ fattr->cf_dtype = DT_LNK;
+ break;
+ case UNIX_DIR:
+--
+2.43.0
+
--- /dev/null
+From 4539a0bb3af7906c4281c52b7c7e1f6ccdebe5e6 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 7 Jun 2023 19:19:10 +0100
+Subject: splice, net: Add a splice_eof op to file-ops and socket-ops
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit 2bfc66850952b6921b2033b09729ec59eabbc81d ]
+
+Add an optional method, ->splice_eof(), to allow splice to indicate the
+premature termination of a splice to struct file_operations and struct
+proto_ops.
+
+This is called if sendfile() or splice() encounters all of the following
+conditions inside splice_direct_to_actor():
+
+ (1) the user did not set SPLICE_F_MORE (splice only), and
+
+ (2) an EOF condition occurred (->splice_read() returned 0), and
+
+ (3) we haven't read enough to fulfill the request (ie. len > 0 still), and
+
+ (4) we have already spliced at least one byte.
+
+A further patch will modify the behaviour of SPLICE_F_MORE to always be
+passed to the actor if either the user set it or we haven't yet read
+sufficient data to fulfill the request.
+
+Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
+Link: https://lore.kernel.org/r/CAHk-=wh=V579PDYvkpnTobCLGczbgxpMgGmmhqiTyE34Cpi5Gg@mail.gmail.com/
+Signed-off-by: David Howells <dhowells@redhat.com>
+Reviewed-by: Jakub Kicinski <kuba@kernel.org>
+cc: Jens Axboe <axboe@kernel.dk>
+cc: Christoph Hellwig <hch@lst.de>
+cc: Al Viro <viro@zeniv.linux.org.uk>
+cc: Matthew Wilcox <willy@infradead.org>
+cc: Jan Kara <jack@suse.cz>
+cc: Jeff Layton <jlayton@kernel.org>
+cc: David Hildenbrand <david@redhat.com>
+cc: Christian Brauner <brauner@kernel.org>
+cc: Chuck Lever <chuck.lever@oracle.com>
+cc: Boris Pismenny <borisp@nvidia.com>
+cc: John Fastabend <john.fastabend@gmail.com>
+cc: linux-mm@kvack.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Stable-dep-of: a0002127cd74 ("udp: move udp->no_check6_tx to udp->udp_flags")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/splice.c | 31 ++++++++++++++++++++++++++++++-
+ include/linux/fs.h | 1 +
+ include/linux/net.h | 1 +
+ include/linux/splice.h | 1 +
+ include/net/sock.h | 1 +
+ net/socket.c | 10 ++++++++++
+ 6 files changed, 44 insertions(+), 1 deletion(-)
+
+diff --git a/fs/splice.c b/fs/splice.c
+index 5969b7a1d353a..c4ae54deac42c 100644
+--- a/fs/splice.c
++++ b/fs/splice.c
+@@ -764,6 +764,17 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
+ return out->f_op->splice_write(pipe, out, ppos, len, flags);
+ }
+
++/*
++ * Indicate to the caller that there was a premature EOF when reading from the
++ * source and the caller didn't indicate they would be sending more data after
++ * this.
++ */
++static void do_splice_eof(struct splice_desc *sd)
++{
++ if (sd->splice_eof)
++ sd->splice_eof(sd);
++}
++
+ /*
+ * Attempt to initiate a splice from a file to a pipe.
+ */
+@@ -864,7 +875,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
+
+ ret = do_splice_to(in, &pos, pipe, len, flags);
+ if (unlikely(ret <= 0))
+- goto out_release;
++ goto read_failure;
+
+ read_len = ret;
+ sd->total_len = read_len;
+@@ -904,6 +915,15 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
+ file_accessed(in);
+ return bytes;
+
++read_failure:
++ /*
++ * If the user did *not* set SPLICE_F_MORE *and* we didn't hit that
++ * "use all of len" case that cleared SPLICE_F_MORE, *and* we did a
++ * "->splice_in()" that returned EOF (ie zero) *and* we have sent at
++ * least 1 byte *then* we will also do the ->splice_eof() call.
++ */
++ if (ret == 0 && !more && len > 0 && bytes)
++ do_splice_eof(sd);
+ out_release:
+ /*
+ * If we did an incomplete transfer we must release
+@@ -932,6 +952,14 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
+ sd->flags);
+ }
+
++static void direct_file_splice_eof(struct splice_desc *sd)
++{
++ struct file *file = sd->u.file;
++
++ if (file->f_op->splice_eof)
++ file->f_op->splice_eof(file);
++}
++
+ /**
+ * do_splice_direct - splices data directly between two files
+ * @in: file to splice from
+@@ -957,6 +985,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
+ .flags = flags,
+ .pos = *ppos,
+ .u.file = out,
++ .splice_eof = direct_file_splice_eof,
+ .opos = opos,
+ };
+ long ret;
+diff --git a/include/linux/fs.h b/include/linux/fs.h
+index b6af6abc7a77f..4a1911dcf834b 100644
+--- a/include/linux/fs.h
++++ b/include/linux/fs.h
+@@ -2177,6 +2177,7 @@ struct file_operations {
+ int (*flock) (struct file *, int, struct file_lock *);
+ ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
+ ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
++ void (*splice_eof)(struct file *file);
+ int (*setlease)(struct file *, long, struct file_lock **, void **);
+ long (*fallocate)(struct file *file, int mode, loff_t offset,
+ loff_t len);
+diff --git a/include/linux/net.h b/include/linux/net.h
+index 18d942bbdf6e0..25baca60f6cba 100644
+--- a/include/linux/net.h
++++ b/include/linux/net.h
+@@ -209,6 +209,7 @@ struct proto_ops {
+ int offset, size_t size, int flags);
+ ssize_t (*splice_read)(struct socket *sock, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len, unsigned int flags);
++ void (*splice_eof)(struct socket *sock);
+ int (*set_peek_off)(struct sock *sk, int val);
+ int (*peek_len)(struct socket *sock);
+
+diff --git a/include/linux/splice.h b/include/linux/splice.h
+index a55179fd60fc3..41a70687be853 100644
+--- a/include/linux/splice.h
++++ b/include/linux/splice.h
+@@ -38,6 +38,7 @@ struct splice_desc {
+ struct file *file; /* file to read/write */
+ void *data; /* cookie */
+ } u;
++ void (*splice_eof)(struct splice_desc *sd); /* Unexpected EOF handler */
+ loff_t pos; /* file position */
+ loff_t *opos; /* sendfile: output position */
+ size_t num_spliced; /* number of bytes already spliced */
+diff --git a/include/net/sock.h b/include/net/sock.h
+index d8ed62a8e1a3e..9de9f070537cc 100644
+--- a/include/net/sock.h
++++ b/include/net/sock.h
+@@ -1279,6 +1279,7 @@ struct proto {
+ size_t len, int flags, int *addr_len);
+ int (*sendpage)(struct sock *sk, struct page *page,
+ int offset, size_t size, int flags);
++ void (*splice_eof)(struct socket *sock);
+ int (*bind)(struct sock *sk,
+ struct sockaddr *addr, int addr_len);
+ int (*bind_add)(struct sock *sk,
+diff --git a/net/socket.c b/net/socket.c
+index 6f39f7b0cc85c..639d76f20384e 100644
+--- a/net/socket.c
++++ b/net/socket.c
+@@ -130,6 +130,7 @@ static ssize_t sock_sendpage(struct file *file, struct page *page,
+ static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags);
++static void sock_splice_eof(struct file *file);
+
+ #ifdef CONFIG_PROC_FS
+ static void sock_show_fdinfo(struct seq_file *m, struct file *f)
+@@ -164,6 +165,7 @@ static const struct file_operations socket_file_ops = {
+ .sendpage = sock_sendpage,
+ .splice_write = generic_splice_sendpage,
+ .splice_read = sock_splice_read,
++ .splice_eof = sock_splice_eof,
+ .show_fdinfo = sock_show_fdinfo,
+ };
+
+@@ -1091,6 +1093,14 @@ static ssize_t sock_splice_read(struct file *file, loff_t *ppos,
+ return sock->ops->splice_read(sock, ppos, pipe, len, flags);
+ }
+
++static void sock_splice_eof(struct file *file)
++{
++ struct socket *sock = file->private_data;
++
++ if (sock->ops->splice_eof)
++ sock->ops->splice_eof(sock);
++}
++
+ static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to)
+ {
+ struct file *file = iocb->ki_filp;
+--
+2.43.0
+
--- /dev/null
+From 87882bb82acf16fc4e9d159032c1e6e7a25a3f87 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 4 Oct 2023 01:28:59 +0200
+Subject: srcu: Fix callbacks acceleration mishandling
+
+From: Frederic Weisbecker <frederic@kernel.org>
+
+[ Upstream commit 4a8e65b0c348e42107c64381e692e282900be361 ]
+
+SRCU callbacks acceleration might fail if the preceding callbacks
+advance also fails. This can happen when the following steps are met:
+
+1) The RCU_WAIT_TAIL segment has callbacks (say for gp_num 8) and the
+ RCU_NEXT_READY_TAIL also has callbacks (say for gp_num 12).
+
+2) The grace period for RCU_WAIT_TAIL is observed as started but not yet
+ completed so rcu_seq_current() returns 4 + SRCU_STATE_SCAN1 = 5.
+
+3) This value is passed to rcu_segcblist_advance() which can't move
+ any segment forward and fails.
+
+4) srcu_gp_start_if_needed() still proceeds with callback acceleration.
+ But then the call to rcu_seq_snap() observes the grace period for the
+ RCU_WAIT_TAIL segment (gp_num 8) as completed and the subsequent one
+ for the RCU_NEXT_READY_TAIL segment as started
+ (ie: 8 + SRCU_STATE_SCAN1 = 9) so it returns a snapshot of the
+ next grace period, which is 16.
+
+5) The value of 16 is passed to rcu_segcblist_accelerate() but the
+ freshly enqueued callback in RCU_NEXT_TAIL can't move to
+ RCU_NEXT_READY_TAIL which already has callbacks for a previous grace
+ period (gp_num = 12). So acceleration fails.
+
+6) Note that in all these steps, srcu_invoke_callbacks() hadn't had a
+ chance to run.
+
+Then some very bad outcome may happen if the following happens:
+
+7) Some other CPU races and starts the grace period number 16 before the
+ CPU handling previous steps had a chance. Therefore srcu_gp_start()
+ isn't called on the latter sdp to fix the acceleration leak from
+ previous steps with a new pair of call to advance/accelerate.
+
+8) The grace period 16 completes and srcu_invoke_callbacks() is finally
+ called. All the callbacks from previous grace periods (8 and 12) are
+ correctly advanced and executed but callbacks in RCU_NEXT_READY_TAIL
+ still remain. Then rcu_segcblist_accelerate() is called with a
+ snapshot of 20.
+
+9) Since nothing started the grace period number 20, callbacks stay
+ unhandled.
+
+This has been reported in real load:
+
+ [3144162.608392] INFO: task kworker/136:12:252684 blocked for more
+ than 122 seconds.
+ [3144162.615986] Tainted: G O K 5.4.203-1-tlinux4-0011.1 #1
+ [3144162.623053] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs"
+ disables this message.
+ [3144162.631162] kworker/136:12 D 0 252684 2 0x90004000
+ [3144162.631189] Workqueue: kvm-irqfd-cleanup irqfd_shutdown [kvm]
+ [3144162.631192] Call Trace:
+ [3144162.631202] __schedule+0x2ee/0x660
+ [3144162.631206] schedule+0x33/0xa0
+ [3144162.631209] schedule_timeout+0x1c4/0x340
+ [3144162.631214] ? update_load_avg+0x82/0x660
+ [3144162.631217] ? raw_spin_rq_lock_nested+0x1f/0x30
+ [3144162.631218] wait_for_completion+0x119/0x180
+ [3144162.631220] ? wake_up_q+0x80/0x80
+ [3144162.631224] __synchronize_srcu.part.19+0x81/0xb0
+ [3144162.631226] ? __bpf_trace_rcu_utilization+0x10/0x10
+ [3144162.631227] synchronize_srcu+0x5f/0xc0
+ [3144162.631236] irqfd_shutdown+0x3c/0xb0 [kvm]
+ [3144162.631239] ? __schedule+0x2f6/0x660
+ [3144162.631243] process_one_work+0x19a/0x3a0
+ [3144162.631244] worker_thread+0x37/0x3a0
+ [3144162.631247] kthread+0x117/0x140
+ [3144162.631247] ? process_one_work+0x3a0/0x3a0
+ [3144162.631248] ? __kthread_cancel_work+0x40/0x40
+ [3144162.631250] ret_from_fork+0x1f/0x30
+
+Fix this by taking the snapshot for acceleration _before_ the read
+of the current grace period number.
+
+The only side effect of this solution is that callbacks advancing happen
+then _after_ the full barrier in rcu_seq_snap(). This is not a problem
+because that barrier only cares about:
+
+1) Ordering accesses of the update side before call_srcu() so they don't
+ bleed.
+2) See all the accesses prior to the grace period of the current gp_num
+
+The only things callbacks advancing need to be ordered against are
+carried by snp locking.
+
+Reported-by: Yong He <alexyonghe@tencent.com>
+Co-developed-by: Yong He <alexyonghe@tencent.com>
+Signed-off-by: Yong He <alexyonghe@tencent.com>
+Co-developed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
+Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
+Co-developed-by: Neeraj upadhyay <Neeraj.Upadhyay@amd.com>
+Signed-off-by: Neeraj upadhyay <Neeraj.Upadhyay@amd.com>
+Link: http://lore.kernel.org/CANZk6aR+CqZaqmMWrC2eRRPY12qAZnDZLwLnHZbNi=xXMB401g@mail.gmail.com
+Fixes: da915ad5cf25 ("srcu: Parallelize callback handling")
+Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/rcu/srcutree.c | 31 +++++++++++++++++++++++++++++--
+ 1 file changed, 29 insertions(+), 2 deletions(-)
+
+diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
+index 8fdf076720384..929dcbc04d29c 100644
+--- a/kernel/rcu/srcutree.c
++++ b/kernel/rcu/srcutree.c
+@@ -1100,10 +1100,37 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
+ spin_lock_irqsave_sdp_contention(sdp, &flags);
+ if (rhp)
+ rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
++ /*
++ * The snapshot for acceleration must be taken _before_ the read of the
++ * current gp sequence used for advancing, otherwise advancing may fail
++ * and acceleration may then fail too.
++ *
++ * This could happen if:
++ *
++ * 1) The RCU_WAIT_TAIL segment has callbacks (gp_num = X + 4) and the
++ * RCU_NEXT_READY_TAIL also has callbacks (gp_num = X + 8).
++ *
++ * 2) The grace period for RCU_WAIT_TAIL is seen as started but not
++ * completed so rcu_seq_current() returns X + SRCU_STATE_SCAN1.
++ *
++ * 3) This value is passed to rcu_segcblist_advance() which can't move
++ * any segment forward and fails.
++ *
++ * 4) srcu_gp_start_if_needed() still proceeds with callback acceleration.
++ * But then the call to rcu_seq_snap() observes the grace period for the
++ * RCU_WAIT_TAIL segment as completed and the subsequent one for the
++ * RCU_NEXT_READY_TAIL segment as started (ie: X + 4 + SRCU_STATE_SCAN1)
++ * so it returns a snapshot of the next grace period, which is X + 12.
++ *
++ * 5) The value of X + 12 is passed to rcu_segcblist_accelerate() but the
++ * freshly enqueued callback in RCU_NEXT_TAIL can't move to
++ * RCU_NEXT_READY_TAIL which already has callbacks for a previous grace
++ * period (gp_num = X + 8). So acceleration fails.
++ */
++ s = rcu_seq_snap(&ssp->srcu_gp_seq);
+ rcu_segcblist_advance(&sdp->srcu_cblist,
+ rcu_seq_current(&ssp->srcu_gp_seq));
+- s = rcu_seq_snap(&ssp->srcu_gp_seq);
+- (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
++ WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s) && rhp);
+ if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
+ sdp->srcu_gp_seq_needed = s;
+ needgp = true;
+--
+2.43.0
+
--- /dev/null
+From be36ec0d25d48e42c708f492bd350b9e4cc0e19e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Sep 2023 09:17:28 +0000
+Subject: udp: annotate data-races around udp->encap_type
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit 70a36f571362a8de8b8c02d21ae524fc776287f2 ]
+
+syzbot/KCSAN complained about UDP_ENCAP_L2TPINUDP setsockopt() racing.
+
+Add READ_ONCE()/WRITE_ONCE() to document races on this lockless field.
+
+syzbot report was:
+BUG: KCSAN: data-race in udp_lib_setsockopt / udp_lib_setsockopt
+
+read-write to 0xffff8881083603fa of 1 bytes by task 16557 on cpu 0:
+udp_lib_setsockopt+0x682/0x6c0
+udp_setsockopt+0x73/0xa0 net/ipv4/udp.c:2779
+sock_common_setsockopt+0x61/0x70 net/core/sock.c:3697
+__sys_setsockopt+0x1c9/0x230 net/socket.c:2263
+__do_sys_setsockopt net/socket.c:2274 [inline]
+__se_sys_setsockopt net/socket.c:2271 [inline]
+__x64_sys_setsockopt+0x66/0x80 net/socket.c:2271
+do_syscall_x64 arch/x86/entry/common.c:50 [inline]
+do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80
+entry_SYSCALL_64_after_hwframe+0x63/0xcd
+
+read-write to 0xffff8881083603fa of 1 bytes by task 16554 on cpu 1:
+udp_lib_setsockopt+0x682/0x6c0
+udp_setsockopt+0x73/0xa0 net/ipv4/udp.c:2779
+sock_common_setsockopt+0x61/0x70 net/core/sock.c:3697
+__sys_setsockopt+0x1c9/0x230 net/socket.c:2263
+__do_sys_setsockopt net/socket.c:2274 [inline]
+__se_sys_setsockopt net/socket.c:2271 [inline]
+__x64_sys_setsockopt+0x66/0x80 net/socket.c:2271
+do_syscall_x64 arch/x86/entry/common.c:50 [inline]
+do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80
+entry_SYSCALL_64_after_hwframe+0x63/0xcd
+
+value changed: 0x01 -> 0x05
+
+Reported by Kernel Concurrency Sanitizer on:
+CPU: 1 PID: 16554 Comm: syz-executor.5 Not tainted 6.5.0-rc7-syzkaller-00004-gf7757129e3de #0
+
+Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
+Reported-by: syzbot <syzkaller@googlegroups.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/gtp.c | 4 ++--
+ net/ipv4/udp.c | 9 +++++----
+ net/ipv4/xfrm4_input.c | 4 ++--
+ net/ipv6/udp.c | 5 +++--
+ net/ipv6/xfrm6_input.c | 4 ++--
+ net/l2tp/l2tp_core.c | 6 +++---
+ 6 files changed, 17 insertions(+), 15 deletions(-)
+
+diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c
+index 477b4d4f860bd..bace989591f75 100644
+--- a/drivers/net/gtp.c
++++ b/drivers/net/gtp.c
+@@ -629,7 +629,7 @@ static void __gtp_encap_destroy(struct sock *sk)
+ gtp->sk0 = NULL;
+ else
+ gtp->sk1u = NULL;
+- udp_sk(sk)->encap_type = 0;
++ WRITE_ONCE(udp_sk(sk)->encap_type, 0);
+ rcu_assign_sk_user_data(sk, NULL);
+ release_sock(sk);
+ sock_put(sk);
+@@ -681,7 +681,7 @@ static int gtp_encap_recv(struct sock *sk, struct sk_buff *skb)
+
+ netdev_dbg(gtp->dev, "encap_recv sk=%p\n", sk);
+
+- switch (udp_sk(sk)->encap_type) {
++ switch (READ_ONCE(udp_sk(sk)->encap_type)) {
+ case UDP_ENCAP_GTP0:
+ netdev_dbg(gtp->dev, "received GTP0 packet\n");
+ ret = gtp0_udp_encap_recv(gtp, skb);
+diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
+index 267f77633a8f3..5672d9a86c5d2 100644
+--- a/net/ipv4/udp.c
++++ b/net/ipv4/udp.c
+@@ -733,7 +733,7 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
+ iph->saddr, uh->source, skb->dev->ifindex,
+ inet_sdif(skb), udptable, NULL);
+
+- if (!sk || udp_sk(sk)->encap_type) {
++ if (!sk || READ_ONCE(udp_sk(sk)->encap_type)) {
+ /* No socket for error: try tunnels before discarding */
+ if (static_branch_unlikely(&udp_encap_needed_key)) {
+ sk = __udp4_lib_err_encap(net, iph, uh, udptable, sk, skb,
+@@ -2114,7 +2114,8 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
+ }
+ nf_reset_ct(skb);
+
+- if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) {
++ if (static_branch_unlikely(&udp_encap_needed_key) &&
++ READ_ONCE(up->encap_type)) {
+ int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
+
+ /*
+@@ -2699,7 +2700,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
+ #endif
+ fallthrough;
+ case UDP_ENCAP_L2TPINUDP:
+- up->encap_type = val;
++ WRITE_ONCE(up->encap_type, val);
+ udp_tunnel_encap_enable(sk);
+ break;
+ default:
+@@ -2800,7 +2801,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
+ break;
+
+ case UDP_ENCAP:
+- val = up->encap_type;
++ val = READ_ONCE(up->encap_type);
+ break;
+
+ case UDP_NO_CHECK6_TX:
+diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
+index eac206a290d05..183f6dc372429 100644
+--- a/net/ipv4/xfrm4_input.c
++++ b/net/ipv4/xfrm4_input.c
+@@ -85,11 +85,11 @@ int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb)
+ struct udphdr *uh;
+ struct iphdr *iph;
+ int iphlen, len;
+-
+ __u8 *udpdata;
+ __be32 *udpdata32;
+- __u16 encap_type = up->encap_type;
++ u16 encap_type;
+
++ encap_type = READ_ONCE(up->encap_type);
+ /* if this is not encapsulated socket, then just return now */
+ if (!encap_type)
+ return 1;
+diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
+index 5b7c4f8e2ed03..961106eda69d0 100644
+--- a/net/ipv6/udp.c
++++ b/net/ipv6/udp.c
+@@ -598,7 +598,7 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source,
+ inet6_iif(skb), inet6_sdif(skb), udptable, NULL);
+
+- if (!sk || udp_sk(sk)->encap_type) {
++ if (!sk || READ_ONCE(udp_sk(sk)->encap_type)) {
+ /* No socket for error: try tunnels before discarding */
+ if (static_branch_unlikely(&udpv6_encap_needed_key)) {
+ sk = __udp6_lib_err_encap(net, hdr, offset, uh,
+@@ -712,7 +712,8 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
+ }
+ nf_reset_ct(skb);
+
+- if (static_branch_unlikely(&udpv6_encap_needed_key) && up->encap_type) {
++ if (static_branch_unlikely(&udpv6_encap_needed_key) &&
++ READ_ONCE(up->encap_type)) {
+ int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
+
+ /*
+diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
+index 4907ab241d6be..4156387248e40 100644
+--- a/net/ipv6/xfrm6_input.c
++++ b/net/ipv6/xfrm6_input.c
+@@ -81,14 +81,14 @@ int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb)
+ struct ipv6hdr *ip6h;
+ int len;
+ int ip6hlen = sizeof(struct ipv6hdr);
+-
+ __u8 *udpdata;
+ __be32 *udpdata32;
+- __u16 encap_type = up->encap_type;
++ u16 encap_type;
+
+ if (skb->protocol == htons(ETH_P_IP))
+ return xfrm4_udp_encap_rcv(sk, skb);
+
++ encap_type = READ_ONCE(up->encap_type);
+ /* if this is not encapsulated socket, then just return now */
+ if (!encap_type)
+ return 1;
+diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
+index 03608d3ded4b8..8d21ff25f1602 100644
+--- a/net/l2tp/l2tp_core.c
++++ b/net/l2tp/l2tp_core.c
+@@ -1139,9 +1139,9 @@ static void l2tp_tunnel_destruct(struct sock *sk)
+ switch (tunnel->encap) {
+ case L2TP_ENCAPTYPE_UDP:
+ /* No longer an encapsulation socket. See net/ipv4/udp.c */
+- (udp_sk(sk))->encap_type = 0;
+- (udp_sk(sk))->encap_rcv = NULL;
+- (udp_sk(sk))->encap_destroy = NULL;
++ WRITE_ONCE(udp_sk(sk)->encap_type, 0);
++ udp_sk(sk)->encap_rcv = NULL;
++ udp_sk(sk)->encap_destroy = NULL;
+ break;
+ case L2TP_ENCAPTYPE_IP:
+ break;
+--
+2.43.0
+
--- /dev/null
+From 18625f6ea3f1ce6d5e70c59bd187fa0323530c26 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 May 2023 13:11:22 +0100
+Subject: udp: Convert udp_sendpage() to use MSG_SPLICE_PAGES
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit 7ac7c987850c3ec617c778f7bd871804dc1c648d ]
+
+Convert udp_sendpage() to use sendmsg() with MSG_SPLICE_PAGES rather than
+directly splicing in the pages itself.
+
+This allows ->sendpage() to be replaced by something that can handle
+multiple multipage folios in a single transaction.
+
+Signed-off-by: David Howells <dhowells@redhat.com>
+cc: Willem de Bruijn <willemdebruijn.kernel@gmail.com>
+cc: David Ahern <dsahern@kernel.org>
+cc: Jens Axboe <axboe@kernel.dk>
+cc: Matthew Wilcox <willy@infradead.org>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Stable-dep-of: a0002127cd74 ("udp: move udp->no_check6_tx to udp->udp_flags")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/udp.c | 51 ++++++--------------------------------------------
+ 1 file changed, 6 insertions(+), 45 deletions(-)
+
+diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
+index 65abc92a81bd0..b49cb3df01bb4 100644
+--- a/net/ipv4/udp.c
++++ b/net/ipv4/udp.c
+@@ -1335,54 +1335,15 @@ EXPORT_SYMBOL(udp_sendmsg);
+ int udp_sendpage(struct sock *sk, struct page *page, int offset,
+ size_t size, int flags)
+ {
+- struct inet_sock *inet = inet_sk(sk);
+- struct udp_sock *up = udp_sk(sk);
+- int ret;
++ struct bio_vec bvec;
++ struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES };
+
+ if (flags & MSG_SENDPAGE_NOTLAST)
+- flags |= MSG_MORE;
+-
+- if (!up->pending) {
+- struct msghdr msg = { .msg_flags = flags|MSG_MORE };
+-
+- /* Call udp_sendmsg to specify destination address which
+- * sendpage interface can't pass.
+- * This will succeed only when the socket is connected.
+- */
+- ret = udp_sendmsg(sk, &msg, 0);
+- if (ret < 0)
+- return ret;
+- }
+-
+- lock_sock(sk);
++ msg.msg_flags |= MSG_MORE;
+
+- if (unlikely(!up->pending)) {
+- release_sock(sk);
+-
+- net_dbg_ratelimited("cork failed\n");
+- return -EINVAL;
+- }
+-
+- ret = ip_append_page(sk, &inet->cork.fl.u.ip4,
+- page, offset, size, flags);
+- if (ret == -EOPNOTSUPP) {
+- release_sock(sk);
+- return sock_no_sendpage(sk->sk_socket, page, offset,
+- size, flags);
+- }
+- if (ret < 0) {
+- udp_flush_pending_frames(sk);
+- goto out;
+- }
+-
+- up->len += size;
+- if (!(READ_ONCE(up->corkflag) || (flags&MSG_MORE)))
+- ret = udp_push_pending_frames(sk);
+- if (!ret)
+- ret = size;
+-out:
+- release_sock(sk);
+- return ret;
++ bvec_set_page(&bvec, page, size, offset);
++ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size);
++ return udp_sendmsg(sk, &msg, size);
+ }
+
+ #define UDP_SKB_IS_STATELESS 0x80000000
+--
+2.43.0
+
--- /dev/null
+From ceb0fec094adcb6586e574c81ad754f61512c4eb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Sep 2023 09:17:21 +0000
+Subject: udp: introduce udp->udp_flags
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit 81b36803ac139827538ac5ce4028e750a3c53f53 ]
+
+According to syzbot, it is time to use proper atomic flags
+for various UDP flags.
+
+Add udp_flags field, and convert udp->corkflag to first
+bit in it.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Stable-dep-of: a0002127cd74 ("udp: move udp->no_check6_tx to udp->udp_flags")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/udp.h | 28 +++++++++++++++++++++-------
+ net/ipv4/udp.c | 12 ++++++------
+ net/ipv6/udp.c | 6 +++---
+ 3 files changed, 30 insertions(+), 16 deletions(-)
+
+diff --git a/include/linux/udp.h b/include/linux/udp.h
+index e96da4157d04d..10b56b8231e3c 100644
+--- a/include/linux/udp.h
++++ b/include/linux/udp.h
+@@ -30,14 +30,20 @@ static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask)
+ return (num + net_hash_mix(net)) & mask;
+ }
+
++enum {
++ UDP_FLAGS_CORK, /* Cork is required */
++};
++
+ struct udp_sock {
+ /* inet_sock has to be the first member */
+ struct inet_sock inet;
+ #define udp_port_hash inet.sk.__sk_common.skc_u16hashes[0]
+ #define udp_portaddr_hash inet.sk.__sk_common.skc_u16hashes[1]
+ #define udp_portaddr_node inet.sk.__sk_common.skc_portaddr_node
++
++ unsigned long udp_flags;
++
+ int pending; /* Any pending frames ? */
+- unsigned int corkflag; /* Cork is required */
+ __u8 encap_type; /* Is this an Encapsulation socket? */
+ unsigned char no_check6_tx:1,/* Send zero UDP6 checksums on TX? */
+ no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */
+@@ -49,6 +55,11 @@ struct udp_sock {
+ gro_enabled:1, /* Request GRO aggregation */
+ accept_udp_l4:1,
+ accept_udp_fraglist:1;
++/* indicator bits used by pcflag: */
++#define UDPLITE_BIT 0x1 /* set by udplite proto init function */
++#define UDPLITE_SEND_CC 0x2 /* set via udplite setsockopt */
++#define UDPLITE_RECV_CC 0x4 /* set via udplite setsocktopt */
++ __u8 pcflag; /* marks socket as UDP-Lite if > 0 */
+ /*
+ * Following member retains the information to create a UDP header
+ * when the socket is uncorked.
+@@ -60,12 +71,6 @@ struct udp_sock {
+ */
+ __u16 pcslen;
+ __u16 pcrlen;
+-/* indicator bits used by pcflag: */
+-#define UDPLITE_BIT 0x1 /* set by udplite proto init function */
+-#define UDPLITE_SEND_CC 0x2 /* set via udplite setsockopt */
+-#define UDPLITE_RECV_CC 0x4 /* set via udplite setsocktopt */
+- __u8 pcflag; /* marks socket as UDP-Lite if > 0 */
+- __u8 unused[3];
+ /*
+ * For encapsulation sockets.
+ */
+@@ -89,6 +94,15 @@ struct udp_sock {
+ int forward_deficit;
+ };
+
++#define udp_test_bit(nr, sk) \
++ test_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
++#define udp_set_bit(nr, sk) \
++ set_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
++#define udp_clear_bit(nr, sk) \
++ clear_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
++#define udp_assign_bit(nr, sk, val) \
++ assign_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags, val)
++
+ #define UDP_MAX_SEGMENTS (1 << 6UL)
+
+ static inline struct udp_sock *udp_sk(const struct sock *sk)
+diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
+index e8dd2880ac9aa..60a754477efb2 100644
+--- a/net/ipv4/udp.c
++++ b/net/ipv4/udp.c
+@@ -1068,7 +1068,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+ __be16 dport;
+ u8 tos;
+ int err, is_udplite = IS_UDPLITE(sk);
+- int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE;
++ int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE;
+ int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
+ struct sk_buff *skb;
+ struct ip_options_data opt_copy;
+@@ -1337,11 +1337,11 @@ void udp_splice_eof(struct socket *sock)
+ struct sock *sk = sock->sk;
+ struct udp_sock *up = udp_sk(sk);
+
+- if (!up->pending || READ_ONCE(up->corkflag))
++ if (!up->pending || udp_test_bit(CORK, sk))
+ return;
+
+ lock_sock(sk);
+- if (up->pending && !READ_ONCE(up->corkflag))
++ if (up->pending && !udp_test_bit(CORK, sk))
+ udp_push_pending_frames(sk);
+ release_sock(sk);
+ }
+@@ -2673,9 +2673,9 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
+ switch (optname) {
+ case UDP_CORK:
+ if (val != 0) {
+- WRITE_ONCE(up->corkflag, 1);
++ udp_set_bit(CORK, sk);
+ } else {
+- WRITE_ONCE(up->corkflag, 0);
++ udp_clear_bit(CORK, sk);
+ lock_sock(sk);
+ push_pending_frames(sk);
+ release_sock(sk);
+@@ -2800,7 +2800,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
+
+ switch (optname) {
+ case UDP_CORK:
+- val = READ_ONCE(up->corkflag);
++ val = udp_test_bit(CORK, sk);
+ break;
+
+ case UDP_ENCAP:
+diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
+index 2a65136dca773..85653e3a04fe8 100644
+--- a/net/ipv6/udp.c
++++ b/net/ipv6/udp.c
+@@ -1351,7 +1351,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+ int addr_len = msg->msg_namelen;
+ bool connected = false;
+ int ulen = len;
+- int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE;
++ int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE;
+ int err;
+ int is_udplite = IS_UDPLITE(sk);
+ int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
+@@ -1662,11 +1662,11 @@ static void udpv6_splice_eof(struct socket *sock)
+ struct sock *sk = sock->sk;
+ struct udp_sock *up = udp_sk(sk);
+
+- if (!up->pending || READ_ONCE(up->corkflag))
++ if (!up->pending || udp_test_bit(CORK, sk))
+ return;
+
+ lock_sock(sk);
+- if (up->pending && !READ_ONCE(up->corkflag))
++ if (up->pending && !udp_test_bit(CORK, sk))
+ udp_v6_push_pending_frames(sk);
+ release_sock(sk);
+ }
+--
+2.43.0
+
--- /dev/null
+From d2f165afbbc9ce0af6beddcde9af3f3d368908f5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Sep 2023 09:17:27 +0000
+Subject: udp: lockless UDP_ENCAP_L2TPINUDP / UDP_GRO
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit ac9a7f4ce5dda1472e8f44096f33066c6ec1a3b4 ]
+
+Move udp->encap_enabled to udp->udp_flags.
+
+Add udp_test_and_set_bit() helper to allow lockless
+udp_tunnel_encap_enable() implementation.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Stable-dep-of: 70a36f571362 ("udp: annotate data-races around udp->encap_type")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/udp.h | 9 ++++-----
+ include/net/udp_tunnel.h | 9 +++------
+ net/ipv4/udp.c | 10 +++-------
+ net/ipv4/udp_tunnel_core.c | 2 +-
+ net/ipv6/udp.c | 2 +-
+ 5 files changed, 12 insertions(+), 20 deletions(-)
+
+diff --git a/include/linux/udp.h b/include/linux/udp.h
+index 0e6880856246a..efd9ab6df3797 100644
+--- a/include/linux/udp.h
++++ b/include/linux/udp.h
+@@ -37,6 +37,7 @@ enum {
+ UDP_FLAGS_GRO_ENABLED, /* Request GRO aggregation */
+ UDP_FLAGS_ACCEPT_FRAGLIST,
+ UDP_FLAGS_ACCEPT_L4,
++ UDP_FLAGS_ENCAP_ENABLED, /* This socket enabled encap */
+ };
+
+ struct udp_sock {
+@@ -50,11 +51,7 @@ struct udp_sock {
+
+ int pending; /* Any pending frames ? */
+ __u8 encap_type; /* Is this an Encapsulation socket? */
+- unsigned char encap_enabled:1; /* This socket enabled encap
+- * processing; UDP tunnels and
+- * different encapsulation layer set
+- * this
+- */
++
+ /* indicator bits used by pcflag: */
+ #define UDPLITE_BIT 0x1 /* set by udplite proto init function */
+ #define UDPLITE_SEND_CC 0x2 /* set via udplite setsockopt */
+@@ -98,6 +95,8 @@ struct udp_sock {
+ test_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
+ #define udp_set_bit(nr, sk) \
+ set_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
++#define udp_test_and_set_bit(nr, sk) \
++ test_and_set_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
+ #define udp_clear_bit(nr, sk) \
+ clear_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags)
+ #define udp_assign_bit(nr, sk, val) \
+diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
+index 72394f441dad8..e5f81710b18f4 100644
+--- a/include/net/udp_tunnel.h
++++ b/include/net/udp_tunnel.h
+@@ -174,16 +174,13 @@ static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum)
+ }
+ #endif
+
+-static inline void udp_tunnel_encap_enable(struct socket *sock)
++static inline void udp_tunnel_encap_enable(struct sock *sk)
+ {
+- struct udp_sock *up = udp_sk(sock->sk);
+-
+- if (up->encap_enabled)
++ if (udp_test_and_set_bit(ENCAP_ENABLED, sk))
+ return;
+
+- up->encap_enabled = 1;
+ #if IS_ENABLED(CONFIG_IPV6)
+- if (sock->sk->sk_family == PF_INET6)
++ if (READ_ONCE(sk->sk_family) == PF_INET6)
+ ipv6_stub->udpv6_encap_enable();
+ #endif
+ udp_encap_enable();
+diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
+index df0ea45b8b8f2..267f77633a8f3 100644
+--- a/net/ipv4/udp.c
++++ b/net/ipv4/udp.c
+@@ -2645,7 +2645,7 @@ void udp_destroy_sock(struct sock *sk)
+ if (encap_destroy)
+ encap_destroy(sk);
+ }
+- if (up->encap_enabled)
++ if (udp_test_bit(ENCAP_ENABLED, sk))
+ static_branch_dec(&udp_encap_needed_key);
+ }
+ }
+@@ -2700,9 +2700,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
+ fallthrough;
+ case UDP_ENCAP_L2TPINUDP:
+ up->encap_type = val;
+- lock_sock(sk);
+- udp_tunnel_encap_enable(sk->sk_socket);
+- release_sock(sk);
++ udp_tunnel_encap_enable(sk);
+ break;
+ default:
+ err = -ENOPROTOOPT;
+@@ -2725,14 +2723,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
+ break;
+
+ case UDP_GRO:
+- lock_sock(sk);
+
+ /* when enabling GRO, accept the related GSO packet type */
+ if (valbool)
+- udp_tunnel_encap_enable(sk->sk_socket);
++ udp_tunnel_encap_enable(sk);
+ udp_assign_bit(GRO_ENABLED, sk, valbool);
+ udp_assign_bit(ACCEPT_L4, sk, valbool);
+- release_sock(sk);
+ break;
+
+ /*
+diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c
+index 5f8104cf082d0..732e21b75ba28 100644
+--- a/net/ipv4/udp_tunnel_core.c
++++ b/net/ipv4/udp_tunnel_core.c
+@@ -78,7 +78,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
+ udp_sk(sk)->gro_receive = cfg->gro_receive;
+ udp_sk(sk)->gro_complete = cfg->gro_complete;
+
+- udp_tunnel_encap_enable(sock);
++ udp_tunnel_encap_enable(sk);
+ }
+ EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock);
+
+diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
+index ddd17b5ea4259..5b7c4f8e2ed03 100644
+--- a/net/ipv6/udp.c
++++ b/net/ipv6/udp.c
+@@ -1688,7 +1688,7 @@ void udpv6_destroy_sock(struct sock *sk)
+ if (encap_destroy)
+ encap_destroy(sk);
+ }
+- if (up->encap_enabled) {
++ if (udp_test_bit(ENCAP_ENABLED, sk)) {
+ static_branch_dec(&udpv6_encap_needed_key);
+ udp_encap_disable();
+ }
+--
+2.43.0
+
--- /dev/null
+From f8848188eeb61db01317edf7b603cd83e93eef38 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Sep 2023 09:17:26 +0000
+Subject: udp: move udp->accept_udp_{l4|fraglist} to udp->udp_flags
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit f5f52f0884a595ff99ab1a608643fe4025fca2d5 ]
+
+These are read locklessly; move them to udp_flags to fix data races.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Stable-dep-of: 70a36f571362 ("udp: annotate data-races around udp->encap_type")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/udp.h | 16 +++++++++-------
+ net/ipv4/udp.c | 2 +-
+ 2 files changed, 10 insertions(+), 8 deletions(-)
+
+diff --git a/include/linux/udp.h b/include/linux/udp.h
+index f87e2123fe7b0..0e6880856246a 100644
+--- a/include/linux/udp.h
++++ b/include/linux/udp.h
+@@ -35,6 +35,8 @@ enum {
+ UDP_FLAGS_NO_CHECK6_TX, /* Send zero UDP6 checksums on TX? */
+ UDP_FLAGS_NO_CHECK6_RX, /* Allow zero UDP6 checksums on RX? */
+ UDP_FLAGS_GRO_ENABLED, /* Request GRO aggregation */
++ UDP_FLAGS_ACCEPT_FRAGLIST,
++ UDP_FLAGS_ACCEPT_L4,
+ };
+
+ struct udp_sock {
+@@ -48,13 +50,11 @@ struct udp_sock {
+
+ int pending; /* Any pending frames ? */
+ __u8 encap_type; /* Is this an Encapsulation socket? */
+- unsigned char encap_enabled:1, /* This socket enabled encap
++ unsigned char encap_enabled:1; /* This socket enabled encap
+ * processing; UDP tunnels and
+ * different encapsulation layer set
+ * this
+ */
+- accept_udp_l4:1,
+- accept_udp_fraglist:1;
+ /* indicator bits used by pcflag: */
+ #define UDPLITE_BIT 0x1 /* set by udplite proto init function */
+ #define UDPLITE_SEND_CC 0x2 /* set via udplite setsockopt */
+@@ -146,10 +146,12 @@ static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb)
+ if (!skb_is_gso(skb))
+ return false;
+
+- if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && !udp_sk(sk)->accept_udp_l4)
++ if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 &&
++ !udp_test_bit(ACCEPT_L4, sk))
+ return true;
+
+- if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST && !udp_sk(sk)->accept_udp_fraglist)
++ if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST &&
++ !udp_test_bit(ACCEPT_FRAGLIST, sk))
+ return true;
+
+ return false;
+@@ -157,8 +159,8 @@ static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb)
+
+ static inline void udp_allow_gso(struct sock *sk)
+ {
+- udp_sk(sk)->accept_udp_l4 = 1;
+- udp_sk(sk)->accept_udp_fraglist = 1;
++ udp_set_bit(ACCEPT_L4, sk);
++ udp_set_bit(ACCEPT_FRAGLIST, sk);
+ }
+
+ #define udp_portaddr_for_each_entry(__sk, list) \
+diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
+index 28292fcf07075..df0ea45b8b8f2 100644
+--- a/net/ipv4/udp.c
++++ b/net/ipv4/udp.c
+@@ -2731,7 +2731,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
+ if (valbool)
+ udp_tunnel_encap_enable(sk->sk_socket);
+ udp_assign_bit(GRO_ENABLED, sk, valbool);
+- up->accept_udp_l4 = valbool;
++ udp_assign_bit(ACCEPT_L4, sk, valbool);
+ release_sock(sk);
+ break;
+
+--
+2.43.0
+
--- /dev/null
+From a7beff020a1a4657b4250f8b820c5cfbd77d49a5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Sep 2023 09:17:24 +0000
+Subject: udp: move udp->gro_enabled to udp->udp_flags
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit e1dc0615c6b08ef36414f08c011965b8fb56198b ]
+
+syzbot reported that udp->gro_enabled can be read locklessly.
+Use one atomic bit from udp->udp_flags.
+
+Fixes: e20cf8d3f1f7 ("udp: implement GRO for plain UDP sockets.")
+Reported-by: syzbot <syzkaller@googlegroups.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/udp.h | 2 +-
+ net/ipv4/udp.c | 6 +++---
+ net/ipv4/udp_offload.c | 4 ++--
+ net/ipv6/udp.c | 2 +-
+ 4 files changed, 7 insertions(+), 7 deletions(-)
+
+diff --git a/include/linux/udp.h b/include/linux/udp.h
+index e6cd46e2b0831..f87e2123fe7b0 100644
+--- a/include/linux/udp.h
++++ b/include/linux/udp.h
+@@ -34,6 +34,7 @@ enum {
+ UDP_FLAGS_CORK, /* Cork is required */
+ UDP_FLAGS_NO_CHECK6_TX, /* Send zero UDP6 checksums on TX? */
+ UDP_FLAGS_NO_CHECK6_RX, /* Allow zero UDP6 checksums on RX? */
++ UDP_FLAGS_GRO_ENABLED, /* Request GRO aggregation */
+ };
+
+ struct udp_sock {
+@@ -52,7 +53,6 @@ struct udp_sock {
+ * different encapsulation layer set
+ * this
+ */
+- gro_enabled:1, /* Request GRO aggregation */
+ accept_udp_l4:1,
+ accept_udp_fraglist:1;
+ /* indicator bits used by pcflag: */
+diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
+index 01e74919885ad..28292fcf07075 100644
+--- a/net/ipv4/udp.c
++++ b/net/ipv4/udp.c
+@@ -1901,7 +1901,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
+ (struct sockaddr *)sin);
+ }
+
+- if (udp_sk(sk)->gro_enabled)
++ if (udp_test_bit(GRO_ENABLED, sk))
+ udp_cmsg_recv(msg, sk, skb);
+
+ if (inet->cmsg_flags)
+@@ -2730,7 +2730,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
+ /* when enabling GRO, accept the related GSO packet type */
+ if (valbool)
+ udp_tunnel_encap_enable(sk->sk_socket);
+- up->gro_enabled = valbool;
++ udp_assign_bit(GRO_ENABLED, sk, valbool);
+ up->accept_udp_l4 = valbool;
+ release_sock(sk);
+ break;
+@@ -2820,7 +2820,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
+ break;
+
+ case UDP_GRO:
+- val = up->gro_enabled;
++ val = udp_test_bit(GRO_ENABLED, sk);
+ break;
+
+ /* The following two cannot be changed on UDP sockets, the return is
+diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
+index 6d1a4bec2614d..8096576fd9bde 100644
+--- a/net/ipv4/udp_offload.c
++++ b/net/ipv4/udp_offload.c
+@@ -549,10 +549,10 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb,
+ NAPI_GRO_CB(skb)->is_flist = 0;
+ if (!sk || !udp_sk(sk)->gro_receive) {
+ if (skb->dev->features & NETIF_F_GRO_FRAGLIST)
+- NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled : 1;
++ NAPI_GRO_CB(skb)->is_flist = sk ? !udp_test_bit(GRO_ENABLED, sk) : 1;
+
+ if ((!sk && (skb->dev->features & NETIF_F_GRO_UDP_FWD)) ||
+- (sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist)
++ (sk && udp_test_bit(GRO_ENABLED, sk)) || NAPI_GRO_CB(skb)->is_flist)
+ return call_gro_receive(udp_gro_receive_segment, head, skb);
+
+ /* no GRO, be sure flush the current packet */
+diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
+index ae4f7f983f951..ddd17b5ea4259 100644
+--- a/net/ipv6/udp.c
++++ b/net/ipv6/udp.c
+@@ -440,7 +440,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+ (struct sockaddr *)sin6);
+ }
+
+- if (udp_sk(sk)->gro_enabled)
++ if (udp_test_bit(GRO_ENABLED, sk))
+ udp_cmsg_recv(msg, sk, skb);
+
+ if (np->rxopt.all)
+--
+2.43.0
+
--- /dev/null
+From e1834f0244ebec827e6c0f8f4cf0bce3dc679841 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Sep 2023 09:17:23 +0000
+Subject: udp: move udp->no_check6_rx to udp->udp_flags
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit bcbc1b1de884647aa0318bf74eb7f293d72a1e40 ]
+
+syzbot reported that udp->no_check6_rx can be read locklessly.
+Use one atomic bit from udp->udp_flags.
+
+Fixes: 1c19448c9ba6 ("net: Make enabling of zero UDP6 csums more restrictive")
+Reported-by: syzbot <syzkaller@googlegroups.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/udp.h | 10 +++++-----
+ net/ipv4/udp.c | 4 ++--
+ net/ipv6/udp.c | 6 +++---
+ 3 files changed, 10 insertions(+), 10 deletions(-)
+
+diff --git a/include/linux/udp.h b/include/linux/udp.h
+index b5ca5760ae34b..e6cd46e2b0831 100644
+--- a/include/linux/udp.h
++++ b/include/linux/udp.h
+@@ -33,6 +33,7 @@ static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask)
+ enum {
+ UDP_FLAGS_CORK, /* Cork is required */
+ UDP_FLAGS_NO_CHECK6_TX, /* Send zero UDP6 checksums on TX? */
++ UDP_FLAGS_NO_CHECK6_RX, /* Allow zero UDP6 checksums on RX? */
+ };
+
+ struct udp_sock {
+@@ -46,8 +47,7 @@ struct udp_sock {
+
+ int pending; /* Any pending frames ? */
+ __u8 encap_type; /* Is this an Encapsulation socket? */
+- unsigned char no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */
+- encap_enabled:1, /* This socket enabled encap
++ unsigned char encap_enabled:1, /* This socket enabled encap
+ * processing; UDP tunnels and
+ * different encapsulation layer set
+ * this
+@@ -117,7 +117,7 @@ static inline void udp_set_no_check6_tx(struct sock *sk, bool val)
+
+ static inline void udp_set_no_check6_rx(struct sock *sk, bool val)
+ {
+- udp_sk(sk)->no_check6_rx = val;
++ udp_assign_bit(NO_CHECK6_RX, sk, val);
+ }
+
+ static inline bool udp_get_no_check6_tx(const struct sock *sk)
+@@ -125,9 +125,9 @@ static inline bool udp_get_no_check6_tx(const struct sock *sk)
+ return udp_test_bit(NO_CHECK6_TX, sk);
+ }
+
+-static inline bool udp_get_no_check6_rx(struct sock *sk)
++static inline bool udp_get_no_check6_rx(const struct sock *sk)
+ {
+- return udp_sk(sk)->no_check6_rx;
++ return udp_test_bit(NO_CHECK6_RX, sk);
+ }
+
+ static inline void udp_cmsg_recv(struct msghdr *msg, struct sock *sk,
+diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
+index 513035e83a820..01e74919885ad 100644
+--- a/net/ipv4/udp.c
++++ b/net/ipv4/udp.c
+@@ -2715,7 +2715,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
+ break;
+
+ case UDP_NO_CHECK6_RX:
+- up->no_check6_rx = valbool;
++ udp_set_no_check6_rx(sk, valbool);
+ break;
+
+ case UDP_SEGMENT:
+@@ -2812,7 +2812,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
+ break;
+
+ case UDP_NO_CHECK6_RX:
+- val = up->no_check6_rx;
++ val = udp_get_no_check6_rx(sk);
+ break;
+
+ case UDP_SEGMENT:
+diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
+index c6e20293c521f..ae4f7f983f951 100644
+--- a/net/ipv6/udp.c
++++ b/net/ipv6/udp.c
+@@ -882,7 +882,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
+ /* If zero checksum and no_check is not on for
+ * the socket then skip it.
+ */
+- if (!uh->check && !udp_sk(sk)->no_check6_rx)
++ if (!uh->check && !udp_get_no_check6_rx(sk))
+ continue;
+ if (!first) {
+ first = sk;
+@@ -1000,7 +1000,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
+ if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst))
+ udp6_sk_rx_dst_set(sk, dst);
+
+- if (!uh->check && !udp_sk(sk)->no_check6_rx) {
++ if (!uh->check && !udp_get_no_check6_rx(sk)) {
+ if (refcounted)
+ sock_put(sk);
+ goto report_csum_error;
+@@ -1022,7 +1022,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
+ /* Unicast */
+ sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
+ if (sk) {
+- if (!uh->check && !udp_sk(sk)->no_check6_rx)
++ if (!uh->check && !udp_get_no_check6_rx(sk))
+ goto report_csum_error;
+ return udp6_unicast_rcv_skb(sk, skb, uh);
+ }
+--
+2.43.0
+
--- /dev/null
+From db4859b6d666990e4f3bb767f895153034944c21 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Sep 2023 09:17:22 +0000
+Subject: udp: move udp->no_check6_tx to udp->udp_flags
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit a0002127cd746fcaa182ad3386ef6931c37f3bda ]
+
+syzbot reported that udp->no_check6_tx can be read locklessly.
+Use one atomic bit from udp->udp_flags.
+
+Fixes: 1c19448c9ba6 ("net: Make enabling of zero UDP6 csums more restrictive")
+Reported-by: syzbot <syzkaller@googlegroups.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/udp.h | 10 +++++-----
+ net/ipv4/udp.c | 4 ++--
+ net/ipv6/udp.c | 4 ++--
+ 3 files changed, 9 insertions(+), 9 deletions(-)
+
+diff --git a/include/linux/udp.h b/include/linux/udp.h
+index 10b56b8231e3c..b5ca5760ae34b 100644
+--- a/include/linux/udp.h
++++ b/include/linux/udp.h
+@@ -32,6 +32,7 @@ static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask)
+
+ enum {
+ UDP_FLAGS_CORK, /* Cork is required */
++ UDP_FLAGS_NO_CHECK6_TX, /* Send zero UDP6 checksums on TX? */
+ };
+
+ struct udp_sock {
+@@ -45,8 +46,7 @@ struct udp_sock {
+
+ int pending; /* Any pending frames ? */
+ __u8 encap_type; /* Is this an Encapsulation socket? */
+- unsigned char no_check6_tx:1,/* Send zero UDP6 checksums on TX? */
+- no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */
++ unsigned char no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */
+ encap_enabled:1, /* This socket enabled encap
+ * processing; UDP tunnels and
+ * different encapsulation layer set
+@@ -112,7 +112,7 @@ static inline struct udp_sock *udp_sk(const struct sock *sk)
+
+ static inline void udp_set_no_check6_tx(struct sock *sk, bool val)
+ {
+- udp_sk(sk)->no_check6_tx = val;
++ udp_assign_bit(NO_CHECK6_TX, sk, val);
+ }
+
+ static inline void udp_set_no_check6_rx(struct sock *sk, bool val)
+@@ -120,9 +120,9 @@ static inline void udp_set_no_check6_rx(struct sock *sk, bool val)
+ udp_sk(sk)->no_check6_rx = val;
+ }
+
+-static inline bool udp_get_no_check6_tx(struct sock *sk)
++static inline bool udp_get_no_check6_tx(const struct sock *sk)
+ {
+- return udp_sk(sk)->no_check6_tx;
++ return udp_test_bit(NO_CHECK6_TX, sk);
+ }
+
+ static inline bool udp_get_no_check6_rx(struct sock *sk)
+diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
+index 60a754477efb2..513035e83a820 100644
+--- a/net/ipv4/udp.c
++++ b/net/ipv4/udp.c
+@@ -2711,7 +2711,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname,
+ break;
+
+ case UDP_NO_CHECK6_TX:
+- up->no_check6_tx = valbool;
++ udp_set_no_check6_tx(sk, valbool);
+ break;
+
+ case UDP_NO_CHECK6_RX:
+@@ -2808,7 +2808,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname,
+ break;
+
+ case UDP_NO_CHECK6_TX:
+- val = up->no_check6_tx;
++ val = udp_get_no_check6_tx(sk);
+ break;
+
+ case UDP_NO_CHECK6_RX:
+diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
+index 85653e3a04fe8..c6e20293c521f 100644
+--- a/net/ipv6/udp.c
++++ b/net/ipv6/udp.c
+@@ -1260,7 +1260,7 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+- if (udp_sk(sk)->no_check6_tx) {
++ if (udp_get_no_check6_tx(sk)) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+@@ -1281,7 +1281,7 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6,
+
+ if (is_udplite)
+ csum = udplite_csum(skb);
+- else if (udp_sk(sk)->no_check6_tx) { /* UDP csum disabled */
++ else if (udp_get_no_check6_tx(sk)) { /* UDP csum disabled */
+ skb->ip_summed = CHECKSUM_NONE;
+ goto send;
+ } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */
+--
+2.43.0
+
--- /dev/null
+From 011b4aa30e6bcbdc4307b512c1563b39d38981b5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 15 Dec 2023 11:13:34 +0100
+Subject: wifi: iwlwifi: pcie: don't synchronize IRQs from IRQ
+
+From: Johannes Berg <johannes.berg@intel.com>
+
+[ Upstream commit 400f6ebbc175286576c7f7fddf3c347d09d12310 ]
+
+On older devices (before unified image!) we can end up calling
+stop_device from an rfkill interrupt. However, in stop_device
+we attempt to synchronize IRQs, which then of course deadlocks.
+
+Avoid this by checking the context: if running from the IRQ
+thread, then don't synchronize. This wouldn't be correct on a
+new device since RSS is supported, but older devices only have
+a single interrupt/queue.
+
+Fixes: 37fb29bd1f90 ("wifi: iwlwifi: pcie: synchronize IRQs before NAPI")
+Reviewed-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
+Reviewed-by: Emmanuel Grumbach <emmanuel.grumbach@intel.com>
+Signed-off-by: Johannes Berg <johannes.berg@intel.com>
+Signed-off-by: Kalle Valo <kvalo@kernel.org>
+Link: https://msgid.link/20231215111335.59aab00baed7.Iadfe154d6248e7f9dfd69522e5429dbbd72925d7@changeid
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../net/wireless/intel/iwlwifi/pcie/internal.h | 4 ++--
+ drivers/net/wireless/intel/iwlwifi/pcie/rx.c | 8 ++++----
+ drivers/net/wireless/intel/iwlwifi/pcie/trans.c | 17 +++++++++--------
+ 3 files changed, 15 insertions(+), 14 deletions(-)
+
+diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h
+index 69b95ad5993b0..2ec4ee8ab317c 100644
+--- a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h
++++ b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h
+@@ -745,7 +745,7 @@ static inline void iwl_enable_rfkill_int(struct iwl_trans *trans)
+ }
+ }
+
+-void iwl_pcie_handle_rfkill_irq(struct iwl_trans *trans);
++void iwl_pcie_handle_rfkill_irq(struct iwl_trans *trans, bool from_irq);
+
+ static inline bool iwl_is_rfkill_set(struct iwl_trans *trans)
+ {
+@@ -792,7 +792,7 @@ static inline bool iwl_pcie_dbg_on(struct iwl_trans *trans)
+ return (trans->dbg.dest_tlv || iwl_trans_dbg_ini_valid(trans));
+ }
+
+-void iwl_trans_pcie_rf_kill(struct iwl_trans *trans, bool state);
++void iwl_trans_pcie_rf_kill(struct iwl_trans *trans, bool state, bool from_irq);
+ void iwl_trans_pcie_dump_regs(struct iwl_trans *trans);
+
+ #ifdef CONFIG_IWLWIFI_DEBUGFS
+diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c
+index 90a46faaaffdf..57a11ee05bc36 100644
+--- a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c
++++ b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c
+@@ -1781,7 +1781,7 @@ static u32 iwl_pcie_int_cause_ict(struct iwl_trans *trans)
+ return inta;
+ }
+
+-void iwl_pcie_handle_rfkill_irq(struct iwl_trans *trans)
++void iwl_pcie_handle_rfkill_irq(struct iwl_trans *trans, bool from_irq)
+ {
+ struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans);
+ struct isr_statistics *isr_stats = &trans_pcie->isr_stats;
+@@ -1805,7 +1805,7 @@ void iwl_pcie_handle_rfkill_irq(struct iwl_trans *trans)
+ isr_stats->rfkill++;
+
+ if (prev != report)
+- iwl_trans_pcie_rf_kill(trans, report);
++ iwl_trans_pcie_rf_kill(trans, report, from_irq);
+ mutex_unlock(&trans_pcie->mutex);
+
+ if (hw_rfkill) {
+@@ -1945,7 +1945,7 @@ irqreturn_t iwl_pcie_irq_handler(int irq, void *dev_id)
+
+ /* HW RF KILL switch toggled */
+ if (inta & CSR_INT_BIT_RF_KILL) {
+- iwl_pcie_handle_rfkill_irq(trans);
++ iwl_pcie_handle_rfkill_irq(trans, true);
+ handled |= CSR_INT_BIT_RF_KILL;
+ }
+
+@@ -2362,7 +2362,7 @@ irqreturn_t iwl_pcie_irq_msix_handler(int irq, void *dev_id)
+
+ /* HW RF KILL switch toggled */
+ if (inta_hw & MSIX_HW_INT_CAUSES_REG_RF_KILL)
+- iwl_pcie_handle_rfkill_irq(trans);
++ iwl_pcie_handle_rfkill_irq(trans, true);
+
+ if (inta_hw & MSIX_HW_INT_CAUSES_REG_HW_ERR) {
+ IWL_ERR(trans,
+diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c
+index 796972f224326..c7ed35b3dd8d5 100644
+--- a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c
++++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c
+@@ -1080,7 +1080,7 @@ bool iwl_pcie_check_hw_rf_kill(struct iwl_trans *trans)
+ report = test_bit(STATUS_RFKILL_OPMODE, &trans->status);
+
+ if (prev != report)
+- iwl_trans_pcie_rf_kill(trans, report);
++ iwl_trans_pcie_rf_kill(trans, report, false);
+
+ return hw_rfkill;
+ }
+@@ -1234,7 +1234,7 @@ static void iwl_pcie_init_msix(struct iwl_trans_pcie *trans_pcie)
+ trans_pcie->hw_mask = trans_pcie->hw_init_mask;
+ }
+
+-static void _iwl_trans_pcie_stop_device(struct iwl_trans *trans)
++static void _iwl_trans_pcie_stop_device(struct iwl_trans *trans, bool from_irq)
+ {
+ struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans);
+
+@@ -1261,7 +1261,8 @@ static void _iwl_trans_pcie_stop_device(struct iwl_trans *trans)
+ if (test_and_clear_bit(STATUS_DEVICE_ENABLED, &trans->status)) {
+ IWL_DEBUG_INFO(trans,
+ "DEVICE_ENABLED bit was set and is now cleared\n");
+- iwl_pcie_synchronize_irqs(trans);
++ if (!from_irq)
++ iwl_pcie_synchronize_irqs(trans);
+ iwl_pcie_rx_napi_sync(trans);
+ iwl_pcie_tx_stop(trans);
+ iwl_pcie_rx_stop(trans);
+@@ -1451,7 +1452,7 @@ void iwl_trans_pcie_handle_stop_rfkill(struct iwl_trans *trans,
+ clear_bit(STATUS_RFKILL_OPMODE, &trans->status);
+ }
+ if (hw_rfkill != was_in_rfkill)
+- iwl_trans_pcie_rf_kill(trans, hw_rfkill);
++ iwl_trans_pcie_rf_kill(trans, hw_rfkill, false);
+ }
+
+ static void iwl_trans_pcie_stop_device(struct iwl_trans *trans)
+@@ -1466,12 +1467,12 @@ static void iwl_trans_pcie_stop_device(struct iwl_trans *trans)
+ mutex_lock(&trans_pcie->mutex);
+ trans_pcie->opmode_down = true;
+ was_in_rfkill = test_bit(STATUS_RFKILL_OPMODE, &trans->status);
+- _iwl_trans_pcie_stop_device(trans);
++ _iwl_trans_pcie_stop_device(trans, false);
+ iwl_trans_pcie_handle_stop_rfkill(trans, was_in_rfkill);
+ mutex_unlock(&trans_pcie->mutex);
+ }
+
+-void iwl_trans_pcie_rf_kill(struct iwl_trans *trans, bool state)
++void iwl_trans_pcie_rf_kill(struct iwl_trans *trans, bool state, bool from_irq)
+ {
+ struct iwl_trans_pcie __maybe_unused *trans_pcie =
+ IWL_TRANS_GET_PCIE_TRANS(trans);
+@@ -1484,7 +1485,7 @@ void iwl_trans_pcie_rf_kill(struct iwl_trans *trans, bool state)
+ if (trans->trans_cfg->gen2)
+ _iwl_trans_pcie_gen2_stop_device(trans);
+ else
+- _iwl_trans_pcie_stop_device(trans);
++ _iwl_trans_pcie_stop_device(trans, from_irq);
+ }
+ }
+
+@@ -2815,7 +2816,7 @@ static ssize_t iwl_dbgfs_rfkill_write(struct file *file,
+ IWL_WARN(trans, "changing debug rfkill %d->%d\n",
+ trans_pcie->debug_rfkill, new_value);
+ trans_pcie->debug_rfkill = new_value;
+- iwl_pcie_handle_rfkill_irq(trans);
++ iwl_pcie_handle_rfkill_irq(trans, false);
+
+ return count;
+ }
+--
+2.43.0
+
--- /dev/null
+From 11af298a892bfd2816d2ccff3eaa3db927072f70 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 4 Oct 2023 12:36:22 +0300
+Subject: wifi: iwlwifi: yoyo: swap cdb and jacket bits values
+
+From: Rotem Saado <rotem.saado@intel.com>
+
+[ Upstream commit 65008777b9dcd2002414ddb2c2158293a6e2fd6f ]
+
+The bit values are swapped: the jacket bit should be bit 5 and the
+CDB bit should be bit 4. Fix it.
+
+Fixes: 1f171f4f1437 ("iwlwifi: Add support for getting rf id with blank otp")
+Signed-off-by: Rotem Saado <rotem.saado@intel.com>
+Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
+Link: https://lore.kernel.org/r/20231004123422.356d8dacda2f.I349ab888b43a11baa2453a1d6978a6a703e422f0@changeid
+Signed-off-by: Johannes Berg <johannes.berg@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/wireless/intel/iwlwifi/iwl-prph.h | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-prph.h b/drivers/net/wireless/intel/iwlwifi/iwl-prph.h
+index 157d1f31c4871..c5a306b01fe20 100644
+--- a/drivers/net/wireless/intel/iwlwifi/iwl-prph.h
++++ b/drivers/net/wireless/intel/iwlwifi/iwl-prph.h
+@@ -348,8 +348,8 @@
+ #define RFIC_REG_RD 0xAD0470
+ #define WFPM_CTRL_REG 0xA03030
+ #define WFPM_OTP_CFG1_ADDR 0x00a03098
+-#define WFPM_OTP_CFG1_IS_JACKET_BIT BIT(4)
+-#define WFPM_OTP_CFG1_IS_CDB_BIT BIT(5)
++#define WFPM_OTP_CFG1_IS_JACKET_BIT BIT(5)
++#define WFPM_OTP_CFG1_IS_CDB_BIT BIT(4)
+
+ #define WFPM_GP2 0xA030B4
+
+--
+2.43.0
+