From 17b9ff90c9bfcb43bd2e66a83aa285b31fc6cb86 Mon Sep 17 00:00:00 2001
From: Sasha Levin
Date: Sat, 6 Jan 2024 20:58:50 -0500
Subject: [PATCH] Fixes for 6.1

Signed-off-by: Sasha Levin
---
 ...ltek-add-quirk-for-lenovo-yoga-pro-7.patch | 36 +
 ...x-array-index-out-of-bounds-read-in-.patch | 64 ++
 ...dm845-align-rpmh-regulator-nodes-wit.patch | 304 ++++++
 ...m-sdm845-fix-psci-power-domain-names.patch | 66 ++
 ...x-add-check-for-usbnet_get_endpoints.patch | 38 +
 ...ix-error-handler-with-pm_runtime_ena.patch | 65 ++
 ...8186-fix-aud_pad_top-register-and-of.patch | 39 +
 ...n-g12a-toacodec-fix-event-generation.patch | 39 +
 ...toacodec-validate-written-enum-value.patch | 40 +
 ...tohdmitx-fix-event-generation-for-s-.patch | 39 +
 ...tohdmitx-validate-written-enum-value.patch | 50 +
 ...-active-queue-usage-is-held-for-bio_.patch | 168 ++++
 ...e-the-stable_writes-flag-in-bdev_add.patch | 46 +
 ...is-applied-code-from-bnxt_cfg_ntp_fi.patch | 47 +
 ...-visit_insn-s-instruction-processing.patch | 96 ++
 .../bpf-decouple-prune-and-jump-points.patch | 197 ++++
 ...n-backtracking-instruction-iteration.patch | 89 ++
 ...handle-ldimm64-properly-in-check_cfg.patch | 153 +++
 ...ve-unnecessary-prune-and-jump-points.patch | 112 +++
 ...d-insn_cnt-argument-from-visit_-func.patch | 76 ++
 ...nix-stream-sockets-need-to-hold-ref-.patch | 139 +++
 ...ort-new-32bit-offset-jmp-instruction.patch | 212 ++++
 .../bpf-x64-fix-tailcall-infinite-loop.patch | 167 ++++
 ...6-save-restore-regs-with-bpf_dw-size.patch | 94 ++
 ...-the-parsing-logic-of-structure-para.patch | 225 +++++
 ...roup_free_reserved_data-int-overflow.patch | 269 +++++
 ...en-field-in-struct-btrfs_ordered_sum.patch | 51 +
 .../can-raw-add-support-for-so_mark.patch | 36 +
 ...t-create-topology_smt_thread_allowed.patch | 111 +++
 ...-control-more-robust-against-enumera.patch | 113 +++
 ...e-the-rx-buffer-only-after-all-proce.patch | 69 ++
 ...65dsi86-never-store-more-than-msg-si.patch | 55 ++
 ...passing-the-correct-dpcd_rev-for-drm.patch | 42 +
 ...on-t-propagate-eopnotsupp-from-dumps.patch | 43 +
 ...t-move_extent_per_page-to-use-folios.patch | 159 +++
 ...2fs-assign-default-compression-level.patch | 106 ++
 ...compress_flag-and-i_compress_level-u.patch | 135 +++
 .../f2fs-convert-to-use-bitmap-api.patch | 440 +++++++++
 ...-the-default-compress_level-on-ioctl.patch | 47 +
 ...dev-imsttfb-fix-double-free-in-probe.patch | 51 +
 ...lease-framebuffer-and-dealloc-cmap-o.patch | 40 +
 ...add-a-per-mapping-stable-writes-flag.patch | 103 ++
 ...i-fix-frequency-truncation-by-promot.patch | 54 +
 ...don-t-pass-irq_affinity_desc-array-t.patch | 142 +++
 ...nity-move-group_cpus_evenly-into-lib.patch | 920 ++++++++++++++++++
 ...pass-affinity-managed-mask-array-to-.patch | 121 +++
 ...remove-the-firstvec-parameter-from-i.patch | 54 +
 ...rename-irq_build_affinity_masks-as-g.patch | 485 +++++++++
 ...input-checks-to-prevent-config-with-.patch | 53 +
 ...e-after-free-in-i40e_aqc_add_filters.patch | 120 +++
 ...tore-vf-msi-x-state-during-pci-reset.patch | 104 ++
 .../ice-fix-link_down_on_close-message.patch | 55 ++
 ...-vsi-with-link-down-on-close-enabled.patch | 40 +
 queue-6.1/igc-check-vlan-ethertype-mask.patch | 72 ++
 queue-6.1/igc-check-vlan-tci-mask.patch | 141 +++
 queue-6.1/igc-fix-hicredit-calculation.patch | 45 +
 ...vlan-ethertype-matching-back-to-user.patch | 75 ++
 .../ipv4-ipv6-use-splice_eof-to-flush.patch | 262 +++++
 ...e-try_to_release_page-with-filemap_r.patch | 95 ++
 ...-avoid-acquiring-cpu-hotplug-lock-in.patch | 102 ++
 ...mss-sm8250-virtual-channels-for-csid.patch | 307 ++++++
 ...-fix-set-csi2_rx_cfg1_vc_mode-when-v.patch | 39 +
 ...onvert-truncate_error_page-to-use-fo.patch | 47 +
 ...ge-fix-receive-packet-race-condition.patch | 63 ++
 ...hotplug-add-missing-mem_hotplug_lock.patch | 218 +++++
 ...g-fix-error-handling-in-add_memory_r.patch | 62 ++
 ...as_private-filemap_release_folio-cal.patch | 282 ++++++
 ...-stop-read-optimisation-when-folio-r.patch | 222 +++++
 ...ate-data-races-around-sk-sk_bind_phc.patch | 60 ++
 ...tate-data-races-around-sk-sk_tsflags.patch | 367 +++++++
 ...-fcs-generation-for-fragmented-skbuf.patch | 46 +
 ...g_splice_pages-internal-sendmsg-flag.patch | 94 ++
 ...arrange-variable-in-dpaa2_eth_get_et.patch | 62 ++
 ...ssing-getsockopt-so_timestamping_new.patch | 60 ++
 ...ssing-so_timestamping_new-cmsg-suppo.patch | 40 +
 ...lx5-increase-size-of-irq-name-buffer.patch | 76 ++
 ...potential-memleak-in-ql_alloc_buffer.patch | 44 +
 ...ait-for-operating-mode-to-be-applied.patch | 181 ++++
 ...-restore-msg_namelen-in-sock_sendmsg.patch | 55 ++
 ...-fix-promotion-of-offloaded-unreplie.patch | 158 +++
 ...d-act_ct-offload-udp-new-connections.patch | 157 +++
 ...-take-per-cb-reference-to-tcf_ct_flo.patch | 195 ++++
 ...cf_ct_params_free-to-free-params-in-.patch | 112 +++
 ...t-fix-possible-memory-leak-in-em_tex.patch | 40 +
 ...lid-link-access-in-dumping-smc-r-con.patch | 91 ++
 ...tend-sof_timestamping_opt_id-to-hw-t.patch | 52 +
 ...flowtable-allow-unidirectional-rules.patch | 76 ++
 ...flowtable-cache-info-of-last-offload.patch | 171 ++++
 ...ble-gc-pushes-back-packets-to-classi.patch | 104 ++
 ...les-set-transport-offset-from-mac-he.patch | 75 ++
 ...mediate-drop-chain-reference-counter.patch | 36 +
 ...ter-use-skb_ip_totlen-and-iph_totlen.patch | 97 ++
 ...ld-a-ref-to-llcp_local-dev-when-hold.patch | 128 +++
 ...ays-configure-nix-tx-link-credits-ba.patch | 184 ++++
 ...-marking-couple-of-structure-as-__pa.patch | 46 +
 ...tx2-af-fix-pause-frame-configuration.patch | 56 ++
 ...enable-mac-tx-in-otx2_stop-processin.patch | 93 ++
 ...-af-support-variable-number-of-lmacs.patch | 342 +++++++
 ...r8169-fix-pci-error-on-system-resume.patch | 49 +
 ...32-bit-rb_time_read-race-with-rb_tim.patch | 74 ++
 ...pport-user-space-events-for-counting.patch | 93 ++
 ...ing-arch_set_page_dat-call-to-vmem_c.patch | 63 ++
 ...g-do-not-set-port-down-when-adding-t.patch | 53 +
 ...ptcp-fix-fastclose-with-csum-failure.patch | 58 ++
 ...mptcp-set-failing_links-in-run_tests.patch | 64 ++
 ...mem-floor-the-memory-size-to-the-mul.patch | 56 ++
 queue-6.1/series | 120 +++
 ...double-free-bug-in-efx_probe_filters.patch | 51 +
 ...x-missing-mode-bits-for-smb-symlinks.patch | 36 +
 ...-splice_eof-op-to-file-ops-and-socke.patch | 212 ++++
 ...x-callbacks-acceleration-mishandling.patch | 157 +++
 ...ate-data-races-around-udp-encap_type.patch | 205 ++++
 ...udp_sendpage-to-use-msg_splice_pages.patch | 95 ++
 queue-6.1/udp-introduce-udp-udp_flags.patch | 171 ++++
 ...lockless-udp_encap_l2tpinudp-udp_gro.patch | 154 +++
 ...ept_udp_-l4-fraglist-to-udp-udp_flag.patch | 91 ++
 ...ove-udp-gro_enabled-to-udp-udp_flags.patch | 109 +++
 ...ve-udp-no_check6_rx-to-udp-udp_flags.patch | 123 +++
 ...ve-udp-no_check6_tx-to-udp-udp_flags.patch | 114 +++
 ...pcie-don-t-synchronize-irqs-from-irq.patch | 170 ++++
 ...yoyo-swap-cdb-and-jacket-bits-values.patch | 40 +
 121 files changed, 14307 insertions(+)
 create mode 100644 queue-6.1/alsa-hda-realtek-add-quirk-for-lenovo-yoga-pro-7.patch
 create mode 100644 queue-6.1/arm-sun9i-smp-fix-array-index-out-of-bounds-read-in-.patch
 create mode 100644
queue-6.1/arm64-dts-qcom-sdm845-align-rpmh-regulator-nodes-wit.patch create mode 100644 queue-6.1/arm64-dts-qcom-sdm845-fix-psci-power-domain-names.patch create mode 100644 queue-6.1/asix-add-check-for-usbnet_get_endpoints.patch create mode 100644 queue-6.1/asoc-fsl_rpmsg-fix-error-handler-with-pm_runtime_ena.patch create mode 100644 queue-6.1/asoc-mediatek-mt8186-fix-aud_pad_top-register-and-of.patch create mode 100644 queue-6.1/asoc-meson-g12a-toacodec-fix-event-generation.patch create mode 100644 queue-6.1/asoc-meson-g12a-toacodec-validate-written-enum-value.patch create mode 100644 queue-6.1/asoc-meson-g12a-tohdmitx-fix-event-generation-for-s-.patch create mode 100644 queue-6.1/asoc-meson-g12a-tohdmitx-validate-written-enum-value.patch create mode 100644 queue-6.1/blk-mq-make-sure-active-queue-usage-is-held-for-bio_.patch create mode 100644 queue-6.1/block-update-the-stable_writes-flag-in-bdev_add.patch create mode 100644 queue-6.1/bnxt_en-remove-mis-applied-code-from-bnxt_cfg_ntp_fi.patch create mode 100644 queue-6.1/bpf-clean-up-visit_insn-s-instruction-processing.patch create mode 100644 queue-6.1/bpf-decouple-prune-and-jump-points.patch create mode 100644 queue-6.1/bpf-fix-precision-backtracking-instruction-iteration.patch create mode 100644 queue-6.1/bpf-handle-ldimm64-properly-in-check_cfg.patch create mode 100644 queue-6.1/bpf-remove-unnecessary-prune-and-jump-points.patch create mode 100644 queue-6.1/bpf-remove-unused-insn_cnt-argument-from-visit_-func.patch create mode 100644 queue-6.1/bpf-sockmap-af_unix-stream-sockets-need-to-hold-ref-.patch create mode 100644 queue-6.1/bpf-support-new-32bit-offset-jmp-instruction.patch create mode 100644 queue-6.1/bpf-x64-fix-tailcall-infinite-loop.patch create mode 100644 queue-6.1/bpf-x86-save-restore-regs-with-bpf_dw-size.patch create mode 100644 queue-6.1/bpf-x86-simplify-the-parsing-logic-of-structure-para.patch create mode 100644 queue-6.1/btrfs-fix-qgroup_free_reserved_data-int-overflow.patch create mode 100644 queue-6.1/btrfs-mark-the-len-field-in-struct-btrfs_ordered_sum.patch create mode 100644 queue-6.1/can-raw-add-support-for-so_mark.patch create mode 100644 queue-6.1/cpu-smt-create-topology_smt_thread_allowed.patch create mode 100644 queue-6.1/cpu-smt-make-smt-control-more-robust-against-enumera.patch create mode 100644 queue-6.1/dpaa2-eth-recycle-the-rx-buffer-only-after-all-proce.patch create mode 100644 queue-6.1/drm-bridge-ti-sn65dsi86-never-store-more-than-msg-si.patch create mode 100644 queue-6.1/drm-i915-dp-fix-passing-the-correct-dpcd_rev-for-drm.patch create mode 100644 queue-6.1/ethtool-don-t-propagate-eopnotsupp-from-dumps.patch create mode 100644 queue-6.1/ext4-convert-move_extent_per_page-to-use-folios.patch create mode 100644 queue-6.1/f2fs-assign-default-compression-level.patch create mode 100644 queue-6.1/f2fs-clean-up-i_compress_flag-and-i_compress_level-u.patch create mode 100644 queue-6.1/f2fs-convert-to-use-bitmap-api.patch create mode 100644 queue-6.1/f2fs-set-the-default-compress_level-on-ioctl.patch create mode 100644 queue-6.1/fbdev-imsttfb-fix-double-free-in-probe.patch create mode 100644 queue-6.1/fbdev-imsttfb-release-framebuffer-and-dealloc-cmap-o.patch create mode 100644 queue-6.1/filemap-add-a-per-mapping-stable-writes-flag.patch create mode 100644 queue-6.1/firmware-arm_scmi-fix-frequency-truncation-by-promot.patch create mode 100644 queue-6.1/genirq-affinity-don-t-pass-irq_affinity_desc-array-t.patch create mode 100644 queue-6.1/genirq-affinity-move-group_cpus_evenly-into-lib.patch create mode 
100644 queue-6.1/genirq-affinity-pass-affinity-managed-mask-array-to-.patch create mode 100644 queue-6.1/genirq-affinity-remove-the-firstvec-parameter-from-i.patch create mode 100644 queue-6.1/genirq-affinity-rename-irq_build_affinity_masks-as-g.patch create mode 100644 queue-6.1/i40e-fix-filter-input-checks-to-prevent-config-with-.patch create mode 100644 queue-6.1/i40e-fix-use-after-free-in-i40e_aqc_add_filters.patch create mode 100644 queue-6.1/i40e-restore-vf-msi-x-state-during-pci-reset.patch create mode 100644 queue-6.1/ice-fix-link_down_on_close-message.patch create mode 100644 queue-6.1/ice-shut-down-vsi-with-link-down-on-close-enabled.patch create mode 100644 queue-6.1/igc-check-vlan-ethertype-mask.patch create mode 100644 queue-6.1/igc-check-vlan-tci-mask.patch create mode 100644 queue-6.1/igc-fix-hicredit-calculation.patch create mode 100644 queue-6.1/igc-report-vlan-ethertype-matching-back-to-user.patch create mode 100644 queue-6.1/ipv4-ipv6-use-splice_eof-to-flush.patch create mode 100644 queue-6.1/khugepage-replace-try_to_release_page-with-filemap_r.patch create mode 100644 queue-6.1/lib-group_cpus.c-avoid-acquiring-cpu-hotplug-lock-in.patch create mode 100644 queue-6.1/media-camss-sm8250-virtual-channels-for-csid.patch create mode 100644 queue-6.1/media-qcom-camss-fix-set-csi2_rx_cfg1_vc_mode-when-v.patch create mode 100644 queue-6.1/memory-failure-convert-truncate_error_page-to-use-fo.patch create mode 100644 queue-6.1/mlxbf_gige-fix-receive-packet-race-condition.patch create mode 100644 queue-6.1/mm-memory_hotplug-add-missing-mem_hotplug_lock.patch create mode 100644 queue-6.1/mm-memory_hotplug-fix-error-handling-in-add_memory_r.patch create mode 100644 queue-6.1/mm-merge-folio_has_private-filemap_release_folio-cal.patch create mode 100644 queue-6.1/mm-netfs-fscache-stop-read-optimisation-when-folio-r.patch create mode 100644 queue-6.1/net-annotate-data-races-around-sk-sk_bind_phc.patch create mode 100644 queue-6.1/net-annotate-data-races-around-sk-sk_tsflags.patch create mode 100644 queue-6.1/net-bcmgenet-fix-fcs-generation-for-fragmented-skbuf.patch create mode 100644 queue-6.1/net-declare-msg_splice_pages-internal-sendmsg-flag.patch create mode 100644 queue-6.1/net-dpaa2-eth-rearrange-variable-in-dpaa2_eth_get_et.patch create mode 100644 queue-6.1/net-implement-missing-getsockopt-so_timestamping_new.patch create mode 100644 queue-6.1/net-implement-missing-so_timestamping_new-cmsg-suppo.patch create mode 100644 queue-6.1/net-mlx5-increase-size-of-irq-name-buffer.patch create mode 100644 queue-6.1/net-qla3xxx-fix-potential-memleak-in-ql_alloc_buffer.patch create mode 100644 queue-6.1/net-ravb-wait-for-operating-mode-to-be-applied.patch create mode 100644 queue-6.1/net-save-and-restore-msg_namelen-in-sock_sendmsg.patch create mode 100644 queue-6.1/net-sched-act_ct-fix-promotion-of-offloaded-unreplie.patch create mode 100644 queue-6.1/net-sched-act_ct-offload-udp-new-connections.patch create mode 100644 queue-6.1/net-sched-act_ct-take-per-cb-reference-to-tcf_ct_flo.patch create mode 100644 queue-6.1/net-sched-call-tcf_ct_params_free-to-free-params-in-.patch create mode 100644 queue-6.1/net-sched-em_text-fix-possible-memory-leak-in-em_tex.patch create mode 100644 queue-6.1/net-smc-fix-invalid-link-access-in-dumping-smc-r-con.patch create mode 100644 queue-6.1/net-timestamp-extend-sof_timestamping_opt_id-to-hw-t.patch create mode 100644 queue-6.1/netfilter-flowtable-allow-unidirectional-rules.patch create mode 100644 
queue-6.1/netfilter-flowtable-cache-info-of-last-offload.patch create mode 100644 queue-6.1/netfilter-flowtable-gc-pushes-back-packets-to-classi.patch create mode 100644 queue-6.1/netfilter-nf_tables-set-transport-offset-from-mac-he.patch create mode 100644 queue-6.1/netfilter-nft_immediate-drop-chain-reference-counter.patch create mode 100644 queue-6.1/netfilter-use-skb_ip_totlen-and-iph_totlen.patch create mode 100644 queue-6.1/nfc-llcp_core-hold-a-ref-to-llcp_local-dev-when-hold.patch create mode 100644 queue-6.1/octeontx2-af-always-configure-nix-tx-link-credits-ba.patch create mode 100644 queue-6.1/octeontx2-af-fix-marking-couple-of-structure-as-__pa.patch create mode 100644 queue-6.1/octeontx2-af-fix-pause-frame-configuration.patch create mode 100644 queue-6.1/octeontx2-af-re-enable-mac-tx-in-otx2_stop-processin.patch create mode 100644 queue-6.1/octeontx2-af-support-variable-number-of-lmacs.patch create mode 100644 queue-6.1/r8169-fix-pci-error-on-system-resume.patch create mode 100644 queue-6.1/ring-buffer-fix-32-bit-rb_time_read-race-with-rb_tim.patch create mode 100644 queue-6.1/s390-cpumf-support-user-space-events-for-counting.patch create mode 100644 queue-6.1/s390-mm-add-missing-arch_set_page_dat-call-to-vmem_c.patch create mode 100644 queue-6.1/selftests-bonding-do-not-set-port-down-when-adding-t.patch create mode 100644 queue-6.1/selftests-mptcp-fix-fastclose-with-csum-failure.patch create mode 100644 queue-6.1/selftests-mptcp-set-failing_links-in-run_tests.patch create mode 100644 queue-6.1/selftests-secretmem-floor-the-memory-size-to-the-mul.patch create mode 100644 queue-6.1/sfc-fix-a-double-free-bug-in-efx_probe_filters.patch create mode 100644 queue-6.1/smb-client-fix-missing-mode-bits-for-smb-symlinks.patch create mode 100644 queue-6.1/splice-net-add-a-splice_eof-op-to-file-ops-and-socke.patch create mode 100644 queue-6.1/srcu-fix-callbacks-acceleration-mishandling.patch create mode 100644 queue-6.1/udp-annotate-data-races-around-udp-encap_type.patch create mode 100644 queue-6.1/udp-convert-udp_sendpage-to-use-msg_splice_pages.patch create mode 100644 queue-6.1/udp-introduce-udp-udp_flags.patch create mode 100644 queue-6.1/udp-lockless-udp_encap_l2tpinudp-udp_gro.patch create mode 100644 queue-6.1/udp-move-udp-accept_udp_-l4-fraglist-to-udp-udp_flag.patch create mode 100644 queue-6.1/udp-move-udp-gro_enabled-to-udp-udp_flags.patch create mode 100644 queue-6.1/udp-move-udp-no_check6_rx-to-udp-udp_flags.patch create mode 100644 queue-6.1/udp-move-udp-no_check6_tx-to-udp-udp_flags.patch create mode 100644 queue-6.1/wifi-iwlwifi-pcie-don-t-synchronize-irqs-from-irq.patch create mode 100644 queue-6.1/wifi-iwlwifi-yoyo-swap-cdb-and-jacket-bits-values.patch diff --git a/queue-6.1/alsa-hda-realtek-add-quirk-for-lenovo-yoga-pro-7.patch b/queue-6.1/alsa-hda-realtek-add-quirk-for-lenovo-yoga-pro-7.patch new file mode 100644 index 00000000000..54a423e018d --- /dev/null +++ b/queue-6.1/alsa-hda-realtek-add-quirk-for-lenovo-yoga-pro-7.patch @@ -0,0 +1,36 @@ +From 4ee3561ae2a5201c99335c17f13f5ba05802f179 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 7 Dec 2023 19:20:35 +0100 +Subject: ALSA: hda/realtek: Add quirk for Lenovo Yoga Pro 7 + +From: Takashi Iwai + +[ Upstream commit 634e5e1e06f5cdd614a1bc429ecb243a51cc009d ] + +Lenovo Yoga Pro 7 14APH8 (PCI SSID 17aa:3882) seems requiring the +similar workaround like Yoga 9 model for the bass speaker. 
+ +Cc: +Link: https://lore.kernel.org/r/CAGGk=CRRQ1L9p771HsXTN_ebZP41Qj+3gw35Gezurn+nokRewg@mail.gmail.com +Link: https://lore.kernel.org/r/20231207182035.30248-1-tiwai@suse.de +Signed-off-by: Takashi Iwai +Signed-off-by: Sasha Levin +--- + sound/pci/hda/patch_realtek.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c +index a88ed60dcd96a..48155aa52828c 100644 +--- a/sound/pci/hda/patch_realtek.c ++++ b/sound/pci/hda/patch_realtek.c +@@ -9904,6 +9904,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { + SND_PCI_QUIRK(0x1558, 0xc019, "Clevo NH77D[BE]Q", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x1558, 0xc022, "Clevo NH77[DC][QW]", ALC293_FIXUP_SYSTEM76_MIC_NO_PRESENCE), + SND_PCI_QUIRK(0x17aa, 0x1036, "Lenovo P520", ALC233_FIXUP_LENOVO_MULTI_CODECS), ++ SND_PCI_QUIRK(0x17aa, 0x3882, "Lenovo Yoga Pro 7 14APH8", ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK_PIN), + SND_PCI_QUIRK(0x17aa, 0x1048, "ThinkCentre Station", ALC623_FIXUP_LENOVO_THINKSTATION_P340), + SND_PCI_QUIRK(0x17aa, 0x20f2, "Thinkpad SL410/510", ALC269_FIXUP_SKU_IGNORE), + SND_PCI_QUIRK(0x17aa, 0x215e, "Thinkpad L512", ALC269_FIXUP_SKU_IGNORE), +-- +2.43.0 + diff --git a/queue-6.1/arm-sun9i-smp-fix-array-index-out-of-bounds-read-in-.patch b/queue-6.1/arm-sun9i-smp-fix-array-index-out-of-bounds-read-in-.patch new file mode 100644 index 00000000000..71b08b4187e --- /dev/null +++ b/queue-6.1/arm-sun9i-smp-fix-array-index-out-of-bounds-read-in-.patch @@ -0,0 +1,64 @@ +From 3e16e0cda98b5db7e47533ca0dcd626b759cc327 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 28 Dec 2023 20:39:02 +0100 +Subject: ARM: sun9i: smp: Fix array-index-out-of-bounds read in + sunxi_mc_smp_init + +From: Stefan Wahren + +[ Upstream commit 72ad3b772b6d393701df58ba1359b0bb346a19ed ] + +Running a multi-arch kernel (multi_v7_defconfig) on a Raspberry Pi 3B+ +with enabled CONFIG_UBSAN triggers the following warning: + + UBSAN: array-index-out-of-bounds in arch/arm/mach-sunxi/mc_smp.c:810:29 + index 2 is out of range for type 'sunxi_mc_smp_data [2]' + CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.7.0-rc6-00248-g5254c0cbc92d + Hardware name: BCM2835 + unwind_backtrace from show_stack+0x10/0x14 + show_stack from dump_stack_lvl+0x40/0x4c + dump_stack_lvl from ubsan_epilogue+0x8/0x34 + ubsan_epilogue from __ubsan_handle_out_of_bounds+0x78/0x80 + __ubsan_handle_out_of_bounds from sunxi_mc_smp_init+0xe4/0x4cc + sunxi_mc_smp_init from do_one_initcall+0xa0/0x2fc + do_one_initcall from kernel_init_freeable+0xf4/0x2f4 + kernel_init_freeable from kernel_init+0x18/0x158 + kernel_init from ret_from_fork+0x14/0x28 + +Since the enabled method couldn't match with any entry from +sunxi_mc_smp_data, the value of the index shouldn't be used right after +the loop. So move it after the check of ret in order to have a valid +index. 
+ +Fixes: 1631090e34f5 ("ARM: sun9i: smp: Add is_a83t field") +Signed-off-by: Stefan Wahren +Link: https://lore.kernel.org/r/20231228193903.9078-1-wahrenst@gmx.net +Reviewed-by: Chen-Yu Tsai +Signed-off-by: Arnd Bergmann +Signed-off-by: Sasha Levin +--- + arch/arm/mach-sunxi/mc_smp.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/arch/arm/mach-sunxi/mc_smp.c b/arch/arm/mach-sunxi/mc_smp.c +index 26cbce1353387..b2f5f4f28705f 100644 +--- a/arch/arm/mach-sunxi/mc_smp.c ++++ b/arch/arm/mach-sunxi/mc_smp.c +@@ -808,12 +808,12 @@ static int __init sunxi_mc_smp_init(void) + break; + } + +- is_a83t = sunxi_mc_smp_data[i].is_a83t; +- + of_node_put(node); + if (ret) + return -ENODEV; + ++ is_a83t = sunxi_mc_smp_data[i].is_a83t; ++ + if (!sunxi_mc_smp_cpu_table_init()) + return -EINVAL; + +-- +2.43.0 + diff --git a/queue-6.1/arm64-dts-qcom-sdm845-align-rpmh-regulator-nodes-wit.patch b/queue-6.1/arm64-dts-qcom-sdm845-align-rpmh-regulator-nodes-wit.patch new file mode 100644 index 00000000000..04538d83eb2 --- /dev/null +++ b/queue-6.1/arm64-dts-qcom-sdm845-align-rpmh-regulator-nodes-wit.patch @@ -0,0 +1,304 @@ +From e19d878f0cb36876b9df3d6fa13866e0e1f207f3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 27 Jan 2023 12:43:42 +0100 +Subject: arm64: dts: qcom: sdm845: align RPMh regulator nodes with bindings + +From: Krzysztof Kozlowski + +[ Upstream commit 86dd19bbdea2b7d3feb69c0c39f141de30a18ec9 ] + +Device node names should be generic and bindings expect certain pattern +for RPMh regulator nodes. + +Signed-off-by: Krzysztof Kozlowski +Reviewed-by: Konrad Dybcio +Signed-off-by: Bjorn Andersson +Link: https://lore.kernel.org/r/20230127114347.235963-6-krzysztof.kozlowski@linaro.org +Stable-dep-of: a5f01673d394 ("arm64: dts: qcom: sdm845: Fix PSCI power domain names") +Signed-off-by: Sasha Levin +--- + arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi | 4 ++-- + arch/arm64/boot/dts/qcom/sdm845-db845c.dts | 4 ++-- + arch/arm64/boot/dts/qcom/sdm845-lg-common.dtsi | 6 +++--- + arch/arm64/boot/dts/qcom/sdm845-mtp.dts | 6 +++--- + arch/arm64/boot/dts/qcom/sdm845-oneplus-common.dtsi | 6 +++--- + arch/arm64/boot/dts/qcom/sdm845-shift-axolotl.dts | 6 +++--- + arch/arm64/boot/dts/qcom/sdm845-sony-xperia-tama.dtsi | 6 +++--- + arch/arm64/boot/dts/qcom/sdm845-xiaomi-beryllium.dts | 2 +- + arch/arm64/boot/dts/qcom/sdm845-xiaomi-polaris.dts | 6 +++--- + arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts | 2 +- + arch/arm64/boot/dts/qcom/sdm850-samsung-w737.dts | 2 +- + 11 files changed, 25 insertions(+), 25 deletions(-) + +diff --git a/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi b/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi +index a5c0c788969fb..985824032c522 100644 +--- a/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi ++++ b/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi +@@ -351,7 +351,7 @@ flash@0 { + + + &apps_rsc { +- pm8998-rpmh-regulators { ++ regulators-0 { + compatible = "qcom,pm8998-rpmh-regulators"; + qcom,pmic-id = "a"; + +@@ -633,7 +633,7 @@ src_pp1800_lvs2: lvs2 { + }; + }; + +- pm8005-rpmh-regulators { ++ regulators-1 { + compatible = "qcom,pm8005-rpmh-regulators"; + qcom,pmic-id = "c"; + +diff --git a/arch/arm64/boot/dts/qcom/sdm845-db845c.dts b/arch/arm64/boot/dts/qcom/sdm845-db845c.dts +index c9efcb894a52f..8c9ccf5b4ea41 100644 +--- a/arch/arm64/boot/dts/qcom/sdm845-db845c.dts ++++ b/arch/arm64/boot/dts/qcom/sdm845-db845c.dts +@@ -271,7 +271,7 @@ &adsp_pas { + }; + + &apps_rsc { +- pm8998-rpmh-regulators { ++ regulators-0 { + compatible = "qcom,pm8998-rpmh-regulators"; + 
qcom,pmic-id = "a"; + vdd-s1-supply = <&vph_pwr>; +@@ -396,7 +396,7 @@ vreg_lvs2a_1p8: lvs2 { + }; + }; + +- pmi8998-rpmh-regulators { ++ regulators-1 { + compatible = "qcom,pmi8998-rpmh-regulators"; + qcom,pmic-id = "b"; + +diff --git a/arch/arm64/boot/dts/qcom/sdm845-lg-common.dtsi b/arch/arm64/boot/dts/qcom/sdm845-lg-common.dtsi +index 20f275f8694dc..e2921640880a1 100644 +--- a/arch/arm64/boot/dts/qcom/sdm845-lg-common.dtsi ++++ b/arch/arm64/boot/dts/qcom/sdm845-lg-common.dtsi +@@ -166,7 +166,7 @@ &adsp_pas { + }; + + &apps_rsc { +- pm8998-rpmh-regulators { ++ regulators-0 { + compatible = "qcom,pm8998-rpmh-regulators"; + qcom,pmic-id = "a"; + +@@ -419,7 +419,7 @@ vreg_lvs2a_1p8: lvs2 { + }; + }; + +- pmi8998-rpmh-regulators { ++ regulators-1 { + compatible = "qcom,pmi8998-rpmh-regulators"; + qcom,pmic-id = "b"; + +@@ -433,7 +433,7 @@ vreg_bob: bob { + }; + }; + +- pm8005-rpmh-regulators { ++ regulators-2 { + compatible = "qcom,pm8005-rpmh-regulators"; + qcom,pmic-id = "c"; + +diff --git a/arch/arm64/boot/dts/qcom/sdm845-mtp.dts b/arch/arm64/boot/dts/qcom/sdm845-mtp.dts +index 64958dee17d8b..b47e333aa3510 100644 +--- a/arch/arm64/boot/dts/qcom/sdm845-mtp.dts ++++ b/arch/arm64/boot/dts/qcom/sdm845-mtp.dts +@@ -117,7 +117,7 @@ &adsp_pas { + }; + + &apps_rsc { +- pm8998-rpmh-regulators { ++ regulators-0 { + compatible = "qcom,pm8998-rpmh-regulators"; + qcom,pmic-id = "a"; + +@@ -382,7 +382,7 @@ vreg_lvs2a_1p8: lvs2 { + }; + }; + +- pmi8998-rpmh-regulators { ++ regulators-1 { + compatible = "qcom,pmi8998-rpmh-regulators"; + qcom,pmic-id = "b"; + +@@ -396,7 +396,7 @@ vreg_bob: bob { + }; + }; + +- pm8005-rpmh-regulators { ++ regulators-2 { + compatible = "qcom,pm8005-rpmh-regulators"; + qcom,pmic-id = "c"; + +diff --git a/arch/arm64/boot/dts/qcom/sdm845-oneplus-common.dtsi b/arch/arm64/boot/dts/qcom/sdm845-oneplus-common.dtsi +index 392461c29e76e..0713b774a97be 100644 +--- a/arch/arm64/boot/dts/qcom/sdm845-oneplus-common.dtsi ++++ b/arch/arm64/boot/dts/qcom/sdm845-oneplus-common.dtsi +@@ -144,7 +144,7 @@ &adsp_pas { + }; + + &apps_rsc { +- pm8998-rpmh-regulators { ++ regulators-0 { + compatible = "qcom,pm8998-rpmh-regulators"; + qcom,pmic-id = "a"; + +@@ -280,7 +280,7 @@ vreg_l28a_3p0: ldo28 { + }; + }; + +- pmi8998-rpmh-regulators { ++ regulators-1 { + compatible = "qcom,pmi8998-rpmh-regulators"; + qcom,pmic-id = "b"; + +@@ -294,7 +294,7 @@ vreg_bob: bob { + }; + }; + +- pm8005-rpmh-regulators { ++ regulators-2 { + compatible = "qcom,pm8005-rpmh-regulators"; + qcom,pmic-id = "c"; + +diff --git a/arch/arm64/boot/dts/qcom/sdm845-shift-axolotl.dts b/arch/arm64/boot/dts/qcom/sdm845-shift-axolotl.dts +index 83261c9bb4f23..b65c35865dab9 100644 +--- a/arch/arm64/boot/dts/qcom/sdm845-shift-axolotl.dts ++++ b/arch/arm64/boot/dts/qcom/sdm845-shift-axolotl.dts +@@ -110,7 +110,7 @@ &adsp_pas { + }; + + &apps_rsc { +- pm8998-rpmh-regulators { ++ regulators-0 { + compatible = "qcom,pm8998-rpmh-regulators"; + qcom,pmic-id = "a"; + +@@ -375,7 +375,7 @@ vreg_lvs2a_1p8: lvs2 { + }; + }; + +- pmi8998-rpmh-regulators { ++ regulators-1 { + compatible = "qcom,pmi8998-rpmh-regulators"; + qcom,pmic-id = "b"; + +@@ -389,7 +389,7 @@ vreg_bob: bob { + }; + }; + +- pm8005-rpmh-regulators { ++ regulators-2 { + compatible = "qcom,pm8005-rpmh-regulators"; + qcom,pmic-id = "c"; + +diff --git a/arch/arm64/boot/dts/qcom/sdm845-sony-xperia-tama.dtsi b/arch/arm64/boot/dts/qcom/sdm845-sony-xperia-tama.dtsi +index d6918e6d19799..249a715d5aae1 100644 +--- a/arch/arm64/boot/dts/qcom/sdm845-sony-xperia-tama.dtsi ++++ 
b/arch/arm64/boot/dts/qcom/sdm845-sony-xperia-tama.dtsi +@@ -78,7 +78,7 @@ ramoops@ffc00000 { + }; + + &apps_rsc { +- pm8998-rpmh-regulators { ++ regulators-0 { + compatible = "qcom,pm8998-rpmh-regulators"; + qcom,pmic-id = "a"; + +@@ -308,7 +308,7 @@ vreg_lvs2a_1p8: lvs2 { + }; + }; + +- pmi8998-rpmh-regulators { ++ regulators-1 { + compatible = "qcom,pmi8998-rpmh-regulators"; + qcom,pmic-id = "b"; + +@@ -319,7 +319,7 @@ src_vreg_bob: bob { + }; + }; + +- pm8005-rpmh-regulators { ++ regulators-2 { + compatible = "qcom,pm8005-rpmh-regulators"; + qcom,pmic-id = "c"; + +diff --git a/arch/arm64/boot/dts/qcom/sdm845-xiaomi-beryllium.dts b/arch/arm64/boot/dts/qcom/sdm845-xiaomi-beryllium.dts +index 0f470cf1ed1c1..6d6b3dd699475 100644 +--- a/arch/arm64/boot/dts/qcom/sdm845-xiaomi-beryllium.dts ++++ b/arch/arm64/boot/dts/qcom/sdm845-xiaomi-beryllium.dts +@@ -125,7 +125,7 @@ &adsp_pas { + }; + + &apps_rsc { +- pm8998-rpmh-regulators { ++ regulators-0 { + compatible = "qcom,pm8998-rpmh-regulators"; + qcom,pmic-id = "a"; + +diff --git a/arch/arm64/boot/dts/qcom/sdm845-xiaomi-polaris.dts b/arch/arm64/boot/dts/qcom/sdm845-xiaomi-polaris.dts +index 093b04359ec39..ffbe45a99b74a 100644 +--- a/arch/arm64/boot/dts/qcom/sdm845-xiaomi-polaris.dts ++++ b/arch/arm64/boot/dts/qcom/sdm845-xiaomi-polaris.dts +@@ -143,7 +143,7 @@ vreg_s4a_1p8: vreg-s4a-1p8 { + }; + + &apps_rsc { +- pm8998-rpmh-regulators { ++ regulators-0 { + compatible = "qcom,pm8998-rpmh-regulators"; + qcom,pmic-id = "a"; + +@@ -343,7 +343,7 @@ vreg_lvs2a_1p8: lvs2 { + }; + }; + +- pmi8998-rpmh-regulators { ++ regulators-1 { + compatible = "qcom,pmi8998-rpmh-regulators"; + qcom,pmic-id = "b"; + +@@ -355,7 +355,7 @@ vreg_bob: bob { + }; + }; + +- pm8005-rpmh-regulators { ++ regulators-2 { + compatible = "qcom,pm8005-rpmh-regulators"; + qcom,pmic-id = "c"; + +diff --git a/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts b/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts +index 74f43da51fa50..48a41ace8fc58 100644 +--- a/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts ++++ b/arch/arm64/boot/dts/qcom/sdm850-lenovo-yoga-c630.dts +@@ -99,7 +99,7 @@ &adsp_pas { + }; + + &apps_rsc { +- pm8998-rpmh-regulators { ++ regulators-0 { + compatible = "qcom,pm8998-rpmh-regulators"; + qcom,pmic-id = "a"; + +diff --git a/arch/arm64/boot/dts/qcom/sdm850-samsung-w737.dts b/arch/arm64/boot/dts/qcom/sdm850-samsung-w737.dts +index d028a7eb364a6..c169d2870bdf4 100644 +--- a/arch/arm64/boot/dts/qcom/sdm850-samsung-w737.dts ++++ b/arch/arm64/boot/dts/qcom/sdm850-samsung-w737.dts +@@ -129,7 +129,7 @@ &adsp_pas { + }; + + &apps_rsc { +- pm8998-rpmh-regulators { ++ regulators-0 { + compatible = "qcom,pm8998-rpmh-regulators"; + qcom,pmic-id = "a"; + +-- +2.43.0 + diff --git a/queue-6.1/arm64-dts-qcom-sdm845-fix-psci-power-domain-names.patch b/queue-6.1/arm64-dts-qcom-sdm845-fix-psci-power-domain-names.patch new file mode 100644 index 00000000000..769a9fad6ab --- /dev/null +++ b/queue-6.1/arm64-dts-qcom-sdm845-fix-psci-power-domain-names.patch @@ -0,0 +1,66 @@ +From e0335f9198238cec81a096f299f7f121093303f7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 12 Sep 2023 12:42:03 +0530 +Subject: arm64: dts: qcom: sdm845: Fix PSCI power domain names + +From: David Heidelberg + +[ Upstream commit a5f01673d3946e424091e6b8ff274716f9c21454 ] + +The original commit hasn't been updated according to +refactoring done in sdm845.dtsi. 
+ +Fixes: a1ade6cac5a2 ("arm64: dts: qcom: sdm845: Switch PSCI cpu idle states from PC to OSI") +Suggested-by: Dmitry Baryshkov +Reviewed-by: Douglas Anderson +Signed-off-by: David Heidelberg +Reviewed-by: Stephen Boyd +Reviewed-by: Abel Vesa +Link: https://lore.kernel.org/r/20230912071205.11502-1-david@ixit.cz +Signed-off-by: Bjorn Andersson +Signed-off-by: Sasha Levin +--- + arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi | 20 +++++++++++--------- + 1 file changed, 11 insertions(+), 9 deletions(-) + +diff --git a/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi b/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi +index 985824032c522..43ee28db61aa8 100644 +--- a/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi ++++ b/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi +@@ -150,15 +150,15 @@ &cpufreq_hw { + }; + + &psci { +- /delete-node/ cpu0; +- /delete-node/ cpu1; +- /delete-node/ cpu2; +- /delete-node/ cpu3; +- /delete-node/ cpu4; +- /delete-node/ cpu5; +- /delete-node/ cpu6; +- /delete-node/ cpu7; +- /delete-node/ cpu-cluster0; ++ /delete-node/ power-domain-cpu0; ++ /delete-node/ power-domain-cpu1; ++ /delete-node/ power-domain-cpu2; ++ /delete-node/ power-domain-cpu3; ++ /delete-node/ power-domain-cpu4; ++ /delete-node/ power-domain-cpu5; ++ /delete-node/ power-domain-cpu6; ++ /delete-node/ power-domain-cpu7; ++ /delete-node/ power-domain-cluster; + }; + + &cpus { +@@ -351,6 +351,8 @@ flash@0 { + + + &apps_rsc { ++ /delete-property/ power-domains; ++ + regulators-0 { + compatible = "qcom,pm8998-rpmh-regulators"; + qcom,pmic-id = "a"; +-- +2.43.0 + diff --git a/queue-6.1/asix-add-check-for-usbnet_get_endpoints.patch b/queue-6.1/asix-add-check-for-usbnet_get_endpoints.patch new file mode 100644 index 00000000000..3feea064e36 --- /dev/null +++ b/queue-6.1/asix-add-check-for-usbnet_get_endpoints.patch @@ -0,0 +1,38 @@ +From b44044d49c2870abfd79fa40e990603cafdfaf2e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 3 Jan 2024 03:35:34 +0000 +Subject: asix: Add check for usbnet_get_endpoints + +From: Chen Ni + +[ Upstream commit eaac6a2d26b65511e164772bec6918fcbc61938e ] + +Add check for usbnet_get_endpoints() and return the error if it fails +in order to transfer the error. + +Fixes: 16626b0cc3d5 ("asix: Add a new driver for the AX88172A") +Signed-off-by: Chen Ni +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + drivers/net/usb/ax88172a.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/drivers/net/usb/ax88172a.c b/drivers/net/usb/ax88172a.c +index 3777c7e2e6fc0..e47bb125048d4 100644 +--- a/drivers/net/usb/ax88172a.c ++++ b/drivers/net/usb/ax88172a.c +@@ -161,7 +161,9 @@ static int ax88172a_bind(struct usbnet *dev, struct usb_interface *intf) + u8 buf[ETH_ALEN]; + struct ax88172a_private *priv; + +- usbnet_get_endpoints(dev, intf); ++ ret = usbnet_get_endpoints(dev, intf); ++ if (ret) ++ return ret; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) +-- +2.43.0 + diff --git a/queue-6.1/asoc-fsl_rpmsg-fix-error-handler-with-pm_runtime_ena.patch b/queue-6.1/asoc-fsl_rpmsg-fix-error-handler-with-pm_runtime_ena.patch new file mode 100644 index 00000000000..76b1a85ce47 --- /dev/null +++ b/queue-6.1/asoc-fsl_rpmsg-fix-error-handler-with-pm_runtime_ena.patch @@ -0,0 +1,65 @@ +From 49ca35addbb9c2d01e802de2b473789fe8bb5f35 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 25 Dec 2023 17:06:08 +0900 +Subject: ASoC: fsl_rpmsg: Fix error handler with pm_runtime_enable + +From: Chancel Liu + +[ Upstream commit f9d378fc68c43fd41b35133edec9cd902ec334ec ] + +There is error message when defer probe happens: + +fsl_rpmsg rpmsg_audio: Unbalanced pm_runtime_enable! + +Fix the error handler with pm_runtime_enable. + +Fixes: b73d9e6225e8 ("ASoC: fsl_rpmsg: Add CPU DAI driver for audio base on rpmsg") +Signed-off-by: Chancel Liu +Acked-by: Shengjiu Wang +Link: https://lore.kernel.org/r/20231225080608.967953-1-chancel.liu@nxp.com +Signed-off-by: Mark Brown +Signed-off-by: Sasha Levin +--- + sound/soc/fsl/fsl_rpmsg.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +diff --git a/sound/soc/fsl/fsl_rpmsg.c b/sound/soc/fsl/fsl_rpmsg.c +index bf94838bdbefe..5c07a8ff0c9c0 100644 +--- a/sound/soc/fsl/fsl_rpmsg.c ++++ b/sound/soc/fsl/fsl_rpmsg.c +@@ -231,7 +231,7 @@ static int fsl_rpmsg_probe(struct platform_device *pdev) + ret = devm_snd_soc_register_component(&pdev->dev, &fsl_component, + &fsl_rpmsg_dai, 1); + if (ret) +- return ret; ++ goto err_pm_disable; + + rpmsg->card_pdev = platform_device_register_data(&pdev->dev, + "imx-audio-rpmsg", +@@ -241,16 +241,22 @@ static int fsl_rpmsg_probe(struct platform_device *pdev) + if (IS_ERR(rpmsg->card_pdev)) { + dev_err(&pdev->dev, "failed to register rpmsg card\n"); + ret = PTR_ERR(rpmsg->card_pdev); +- return ret; ++ goto err_pm_disable; + } + + return 0; ++ ++err_pm_disable: ++ pm_runtime_disable(&pdev->dev); ++ return ret; + } + + static int fsl_rpmsg_remove(struct platform_device *pdev) + { + struct fsl_rpmsg *rpmsg = platform_get_drvdata(pdev); + ++ pm_runtime_disable(&pdev->dev); ++ + if (rpmsg->card_pdev) + platform_device_unregister(rpmsg->card_pdev); + +-- +2.43.0 + diff --git a/queue-6.1/asoc-mediatek-mt8186-fix-aud_pad_top-register-and-of.patch b/queue-6.1/asoc-mediatek-mt8186-fix-aud_pad_top-register-and-of.patch new file mode 100644 index 00000000000..b3d59b763f4 --- /dev/null +++ b/queue-6.1/asoc-mediatek-mt8186-fix-aud_pad_top-register-and-of.patch @@ -0,0 +1,39 @@ +From e73ec909528dde319b11d6058116ce61ef4cf670 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 29 Dec 2023 13:43:42 +0200 +Subject: ASoC: mediatek: mt8186: fix AUD_PAD_TOP register and offset + +From: Eugen Hristev + +[ Upstream commit 38744c3fa00109c51076121c2deb4f02e2f09194 ] + +AUD_PAD_TOP widget's correct register is AFE_AUD_PAD_TOP , and not zero. 
+Having a zero as register, it would mean that the `snd_soc_dapm_new_widgets` +would try to read the register at offset zero when trying to get the power +status of this widget, which is incorrect. + +Fixes: b65c466220b3 ("ASoC: mediatek: mt8186: support adda in platform driver") +Signed-off-by: Eugen Hristev +Link: https://lore.kernel.org/r/20231229114342.195867-1-eugen.hristev@collabora.com +Signed-off-by: Mark Brown +Signed-off-by: Sasha Levin +--- + sound/soc/mediatek/mt8186/mt8186-dai-adda.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/sound/soc/mediatek/mt8186/mt8186-dai-adda.c b/sound/soc/mediatek/mt8186/mt8186-dai-adda.c +index 094402470dc23..858b95b199dcb 100644 +--- a/sound/soc/mediatek/mt8186/mt8186-dai-adda.c ++++ b/sound/soc/mediatek/mt8186/mt8186-dai-adda.c +@@ -499,7 +499,7 @@ static const struct snd_soc_dapm_widget mtk_dai_adda_widgets[] = { + SND_SOC_DAPM_PRE_PMU | SND_SOC_DAPM_POST_PMD), + + SND_SOC_DAPM_SUPPLY_S("AUD_PAD_TOP", SUPPLY_SEQ_ADDA_AUD_PAD_TOP, +- 0, 0, 0, ++ AFE_AUD_PAD_TOP, RG_RX_FIFO_ON_SFT, 0, + mtk_adda_pad_top_event, + SND_SOC_DAPM_PRE_PMU), + SND_SOC_DAPM_SUPPLY_S("ADDA_MTKAIF_CFG", SUPPLY_SEQ_ADDA_MTKAIF_CFG, +-- +2.43.0 + diff --git a/queue-6.1/asoc-meson-g12a-toacodec-fix-event-generation.patch b/queue-6.1/asoc-meson-g12a-toacodec-fix-event-generation.patch new file mode 100644 index 00000000000..9ccd2961fd7 --- /dev/null +++ b/queue-6.1/asoc-meson-g12a-toacodec-fix-event-generation.patch @@ -0,0 +1,39 @@ +From 1aae4192aa31ef02321a89bd34bbf0650c634bb1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 3 Jan 2024 18:34:03 +0000 +Subject: ASoC: meson: g12a-toacodec: Fix event generation + +From: Mark Brown + +[ Upstream commit 172c88244b5f2d3375403ebb504d407be0fded59 ] + +When a control changes value the return value from _put() should be 1 so +we get events generated to userspace notifying applications of the change. +We are checking if there has been a change and exiting early if not but we +are not providing the correct return value in the latter case, fix this. 
+ +Fixes: af2618a2eee8 ("ASoC: meson: g12a: add internal DAC glue driver") +Signed-off-by: Mark Brown +Link: https://lore.kernel.org/r/20240103-meson-enum-val-v1-3-424af7a8fb91@kernel.org +Signed-off-by: Mark Brown +Signed-off-by: Sasha Levin +--- + sound/soc/meson/g12a-toacodec.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/sound/soc/meson/g12a-toacodec.c b/sound/soc/meson/g12a-toacodec.c +index 3b1ce9143c653..8d8d848ebd58b 100644 +--- a/sound/soc/meson/g12a-toacodec.c ++++ b/sound/soc/meson/g12a-toacodec.c +@@ -104,7 +104,7 @@ static int g12a_toacodec_mux_put_enum(struct snd_kcontrol *kcontrol, + + snd_soc_dapm_mux_update_power(dapm, kcontrol, mux, e, NULL); + +- return 0; ++ return 1; + } + + static SOC_ENUM_SINGLE_DECL(g12a_toacodec_mux_enum, TOACODEC_CTRL0, +-- +2.43.0 + diff --git a/queue-6.1/asoc-meson-g12a-toacodec-validate-written-enum-value.patch b/queue-6.1/asoc-meson-g12a-toacodec-validate-written-enum-value.patch new file mode 100644 index 00000000000..4196bab4d66 --- /dev/null +++ b/queue-6.1/asoc-meson-g12a-toacodec-validate-written-enum-value.patch @@ -0,0 +1,40 @@ +From 69dc7179c8414af40c52fc10df2814e6916b95a8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 3 Jan 2024 18:34:01 +0000 +Subject: ASoC: meson: g12a-toacodec: Validate written enum values + +From: Mark Brown + +[ Upstream commit 3150b70e944ead909260285dfb5707d0bedcf87b ] + +When writing to an enum we need to verify that the value written is valid +for the enumeration, the helper function snd_soc_item_enum_to_val() doesn't +do it since it needs to return an unsigned (and in any case we'd need to +check the return value). + +Fixes: af2618a2eee8 ("ASoC: meson: g12a: add internal DAC glue driver") +Signed-off-by: Mark Brown +Link: https://lore.kernel.org/r/20240103-meson-enum-val-v1-1-424af7a8fb91@kernel.org +Signed-off-by: Mark Brown +Signed-off-by: Sasha Levin +--- + sound/soc/meson/g12a-toacodec.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/sound/soc/meson/g12a-toacodec.c b/sound/soc/meson/g12a-toacodec.c +index ddc667956cf5e..3b1ce9143c653 100644 +--- a/sound/soc/meson/g12a-toacodec.c ++++ b/sound/soc/meson/g12a-toacodec.c +@@ -71,6 +71,9 @@ static int g12a_toacodec_mux_put_enum(struct snd_kcontrol *kcontrol, + struct soc_enum *e = (struct soc_enum *)kcontrol->private_value; + unsigned int mux, reg; + ++ if (ucontrol->value.enumerated.item[0] >= e->items) ++ return -EINVAL; ++ + mux = snd_soc_enum_item_to_val(e, ucontrol->value.enumerated.item[0]); + regmap_field_read(priv->field_dat_sel, ®); + +-- +2.43.0 + diff --git a/queue-6.1/asoc-meson-g12a-tohdmitx-fix-event-generation-for-s-.patch b/queue-6.1/asoc-meson-g12a-tohdmitx-fix-event-generation-for-s-.patch new file mode 100644 index 00000000000..89d91a2cfa4 --- /dev/null +++ b/queue-6.1/asoc-meson-g12a-tohdmitx-fix-event-generation-for-s-.patch @@ -0,0 +1,39 @@ +From 4329af718ecce922ee648d6baabf15690d321821 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 3 Jan 2024 18:34:04 +0000 +Subject: ASoC: meson: g12a-tohdmitx: Fix event generation for S/PDIF mux + +From: Mark Brown + +[ Upstream commit b036d8ef3120b996751495ce25994eea58032a98 ] + +When a control changes value the return value from _put() should be 1 so +we get events generated to userspace notifying applications of the change. +While the I2S mux gets this right the S/PDIF mux does not, fix the return +value. 
+ +Fixes: c8609f3870f7 ("ASoC: meson: add g12a tohdmitx control") +Signed-off-by: Mark Brown +Link: https://lore.kernel.org/r/20240103-meson-enum-val-v1-4-424af7a8fb91@kernel.org +Signed-off-by: Mark Brown +Signed-off-by: Sasha Levin +--- + sound/soc/meson/g12a-tohdmitx.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/sound/soc/meson/g12a-tohdmitx.c b/sound/soc/meson/g12a-tohdmitx.c +index 46d1f04e0e8a3..154c324fdd42a 100644 +--- a/sound/soc/meson/g12a-tohdmitx.c ++++ b/sound/soc/meson/g12a-tohdmitx.c +@@ -118,7 +118,7 @@ static int g12a_tohdmitx_spdif_mux_put_enum(struct snd_kcontrol *kcontrol, + + snd_soc_dapm_mux_update_power(dapm, kcontrol, mux, e, NULL); + +- return 0; ++ return 1; + } + + static SOC_ENUM_SINGLE_DECL(g12a_tohdmitx_spdif_mux_enum, TOHDMITX_CTRL0, +-- +2.43.0 + diff --git a/queue-6.1/asoc-meson-g12a-tohdmitx-validate-written-enum-value.patch b/queue-6.1/asoc-meson-g12a-tohdmitx-validate-written-enum-value.patch new file mode 100644 index 00000000000..45199199924 --- /dev/null +++ b/queue-6.1/asoc-meson-g12a-tohdmitx-validate-written-enum-value.patch @@ -0,0 +1,50 @@ +From c652c462e27c9733ea410b8af1d8eccf55790e67 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 3 Jan 2024 18:34:02 +0000 +Subject: ASoC: meson: g12a-tohdmitx: Validate written enum values + +From: Mark Brown + +[ Upstream commit 1e001206804be3f3d21f4a1cf16e5d059d75643f ] + +When writing to an enum we need to verify that the value written is valid +for the enumeration, the helper function snd_soc_item_enum_to_val() doesn't +do it since it needs to return an unsigned (and in any case we'd need to +check the return value). + +Fixes: c8609f3870f7 ("ASoC: meson: add g12a tohdmitx control") +Signed-off-by: Mark Brown +Link: https://lore.kernel.org/r/20240103-meson-enum-val-v1-2-424af7a8fb91@kernel.org +Signed-off-by: Mark Brown +Signed-off-by: Sasha Levin +--- + sound/soc/meson/g12a-tohdmitx.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/sound/soc/meson/g12a-tohdmitx.c b/sound/soc/meson/g12a-tohdmitx.c +index 579a04ad4d197..46d1f04e0e8a3 100644 +--- a/sound/soc/meson/g12a-tohdmitx.c ++++ b/sound/soc/meson/g12a-tohdmitx.c +@@ -45,6 +45,9 @@ static int g12a_tohdmitx_i2s_mux_put_enum(struct snd_kcontrol *kcontrol, + struct soc_enum *e = (struct soc_enum *)kcontrol->private_value; + unsigned int mux, changed; + ++ if (ucontrol->value.enumerated.item[0] >= e->items) ++ return -EINVAL; ++ + mux = snd_soc_enum_item_to_val(e, ucontrol->value.enumerated.item[0]); + changed = snd_soc_component_test_bits(component, e->reg, + CTRL0_I2S_DAT_SEL, +@@ -93,6 +96,9 @@ static int g12a_tohdmitx_spdif_mux_put_enum(struct snd_kcontrol *kcontrol, + struct soc_enum *e = (struct soc_enum *)kcontrol->private_value; + unsigned int mux, changed; + ++ if (ucontrol->value.enumerated.item[0] >= e->items) ++ return -EINVAL; ++ + mux = snd_soc_enum_item_to_val(e, ucontrol->value.enumerated.item[0]); + changed = snd_soc_component_test_bits(component, TOHDMITX_CTRL0, + CTRL0_SPDIF_SEL, +-- +2.43.0 + diff --git a/queue-6.1/blk-mq-make-sure-active-queue-usage-is-held-for-bio_.patch b/queue-6.1/blk-mq-make-sure-active-queue-usage-is-held-for-bio_.patch new file mode 100644 index 00000000000..1f907b6d1ab --- /dev/null +++ b/queue-6.1/blk-mq-make-sure-active-queue-usage-is-held-for-bio_.patch @@ -0,0 +1,168 @@ +From 3e937b5c8e6387b2914b22893d6bc030db02f58f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 13 Nov 2023 11:52:31 +0800 +Subject: blk-mq: make sure active queue usage is held for 
bio_integrity_prep() + +From: Christoph Hellwig + +[ Upstream commit b0077e269f6c152e807fdac90b58caf012cdbaab ] + +blk_integrity_unregister() can come if queue usage counter isn't held +for one bio with integrity prepared, so this request may be completed with +calling profile->complete_fn, then kernel panic. + +Another constraint is that bio_integrity_prep() needs to be called +before bio merge. + +Fix the issue by: + +- call bio_integrity_prep() with one queue usage counter grabbed reliably + +- call bio_integrity_prep() before bio merge + +Fixes: 900e080752025f00 ("block: move queue enter logic into blk_mq_submit_bio()") +Reported-by: Yi Zhang +Cc: Christoph Hellwig +Signed-off-by: Ming Lei +Tested-by: Yi Zhang +Link: https://lore.kernel.org/r/20231113035231.2708053-1-ming.lei@redhat.com +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + block/blk-mq.c | 75 +++++++++++++++++++++++++------------------------- + 1 file changed, 38 insertions(+), 37 deletions(-) + +diff --git a/block/blk-mq.c b/block/blk-mq.c +index 100fb0c3114f8..383d94615e502 100644 +--- a/block/blk-mq.c ++++ b/block/blk-mq.c +@@ -2855,11 +2855,8 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, + }; + struct request *rq; + +- if (unlikely(bio_queue_enter(bio))) +- return NULL; +- + if (blk_mq_attempt_bio_merge(q, bio, nsegs)) +- goto queue_exit; ++ return NULL; + + rq_qos_throttle(q, bio); + +@@ -2875,35 +2872,23 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, + rq_qos_cleanup(q, bio); + if (bio->bi_opf & REQ_NOWAIT) + bio_wouldblock_error(bio); +-queue_exit: +- blk_queue_exit(q); + return NULL; + } + +-static inline struct request *blk_mq_get_cached_request(struct request_queue *q, +- struct blk_plug *plug, struct bio **bio, unsigned int nsegs) ++/* return true if this @rq can be used for @bio */ ++static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug, ++ struct bio *bio) + { +- struct request *rq; +- enum hctx_type type, hctx_type; ++ enum hctx_type type = blk_mq_get_hctx_type(bio->bi_opf); ++ enum hctx_type hctx_type = rq->mq_hctx->type; + +- if (!plug) +- return NULL; +- rq = rq_list_peek(&plug->cached_rq); +- if (!rq || rq->q != q) +- return NULL; +- +- if (blk_mq_attempt_bio_merge(q, *bio, nsegs)) { +- *bio = NULL; +- return NULL; +- } ++ WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq); + +- type = blk_mq_get_hctx_type((*bio)->bi_opf); +- hctx_type = rq->mq_hctx->type; + if (type != hctx_type && + !(type == HCTX_TYPE_READ && hctx_type == HCTX_TYPE_DEFAULT)) +- return NULL; +- if (op_is_flush(rq->cmd_flags) != op_is_flush((*bio)->bi_opf)) +- return NULL; ++ return false; ++ if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf)) ++ return false; + + /* + * If any qos ->throttle() end up blocking, we will have flushed the +@@ -2911,11 +2896,11 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q, + * before we throttle. 
+ */ + plug->cached_rq = rq_list_next(rq); +- rq_qos_throttle(q, *bio); ++ rq_qos_throttle(rq->q, bio); + +- rq->cmd_flags = (*bio)->bi_opf; ++ rq->cmd_flags = bio->bi_opf; + INIT_LIST_HEAD(&rq->queuelist); +- return rq; ++ return true; + } + + static void bio_set_ioprio(struct bio *bio) +@@ -2944,7 +2929,7 @@ void blk_mq_submit_bio(struct bio *bio) + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + struct blk_plug *plug = blk_mq_plug(bio); + const int is_sync = op_is_sync(bio->bi_opf); +- struct request *rq; ++ struct request *rq = NULL; + unsigned int nr_segs = 1; + blk_status_t ret; + +@@ -2955,20 +2940,36 @@ void blk_mq_submit_bio(struct bio *bio) + return; + } + +- if (!bio_integrity_prep(bio)) +- return; +- + bio_set_ioprio(bio); + +- rq = blk_mq_get_cached_request(q, plug, &bio, nr_segs); +- if (!rq) { +- if (!bio) ++ if (plug) { ++ rq = rq_list_peek(&plug->cached_rq); ++ if (rq && rq->q != q) ++ rq = NULL; ++ } ++ if (rq) { ++ if (!bio_integrity_prep(bio)) + return; +- rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); +- if (unlikely(!rq)) ++ if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) + return; ++ if (blk_mq_can_use_cached_rq(rq, plug, bio)) ++ goto done; ++ percpu_ref_get(&q->q_usage_counter); ++ } else { ++ if (unlikely(bio_queue_enter(bio))) ++ return; ++ if (!bio_integrity_prep(bio)) ++ goto fail; ++ } ++ ++ rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); ++ if (unlikely(!rq)) { ++fail: ++ blk_queue_exit(q); ++ return; + } + ++done: + trace_block_getrq(bio); + + rq_qos_track(q, rq, bio); +-- +2.43.0 + diff --git a/queue-6.1/block-update-the-stable_writes-flag-in-bdev_add.patch b/queue-6.1/block-update-the-stable_writes-flag-in-bdev_add.patch new file mode 100644 index 00000000000..4030c68bf65 --- /dev/null +++ b/queue-6.1/block-update-the-stable_writes-flag-in-bdev_add.patch @@ -0,0 +1,46 @@ +From 4620179873d798d8815800dcaa3f411857d6aee7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 25 Oct 2023 16:10:18 +0200 +Subject: block: update the stable_writes flag in bdev_add + +From: Christoph Hellwig + +[ Upstream commit 1898efcdbed32bb1c67269c985a50bab0dbc9493 ] + +Propagate the per-queue stable_write flags into each bdev inode in bdev_add. +This makes sure devices that require stable writes have it set for I/O +on the block device node as well. + +Note that this doesn't cover the case of a flag changing on a live device +yet. We should handle that as well, but I plan to cover it as part of a +more general rework of how changing runtime paramters on block devices +works. + +Fixes: 1cb039f3dc16 ("bdi: replace BDI_CAP_STABLE_WRITES with a queue and a sb flag") +Reported-by: Ilya Dryomov +Signed-off-by: Christoph Hellwig +Link: https://lore.kernel.org/r/20231025141020.192413-3-hch@lst.de +Tested-by: Ilya Dryomov +Reviewed-by: Darrick J. 
Wong +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + block/bdev.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/block/bdev.c b/block/bdev.c +index d699ecdb32604..b61502ec8da06 100644 +--- a/block/bdev.c ++++ b/block/bdev.c +@@ -507,6 +507,8 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) + + void bdev_add(struct block_device *bdev, dev_t dev) + { ++ if (bdev_stable_writes(bdev)) ++ mapping_set_stable_writes(bdev->bd_inode->i_mapping); + bdev->bd_dev = dev; + bdev->bd_inode->i_rdev = dev; + bdev->bd_inode->i_ino = dev; +-- +2.43.0 + diff --git a/queue-6.1/bnxt_en-remove-mis-applied-code-from-bnxt_cfg_ntp_fi.patch b/queue-6.1/bnxt_en-remove-mis-applied-code-from-bnxt_cfg_ntp_fi.patch new file mode 100644 index 00000000000..ebaec4bed42 --- /dev/null +++ b/queue-6.1/bnxt_en-remove-mis-applied-code-from-bnxt_cfg_ntp_fi.patch @@ -0,0 +1,47 @@ +From af39ac5b0695d95e5a080366d3ec9115d9fa2e72 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 3 Jan 2024 16:59:24 -0800 +Subject: bnxt_en: Remove mis-applied code from bnxt_cfg_ntp_filters() + +From: Michael Chan + +[ Upstream commit e009b2efb7a8850498796b360043ac25c8d3d28f ] + +The 2 lines to check for the BNXT_HWRM_PF_UNLOAD_SP_EVENT bit was +mis-applied to bnxt_cfg_ntp_filters() and should have been applied to +bnxt_sp_task(). + +Fixes: 19241368443f ("bnxt_en: Send PF driver unload notification to all VFs.") +Reviewed-by: Andy Gospodarek +Signed-off-by: Michael Chan +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/broadcom/bnxt/bnxt.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c +index 623cdeb29ed90..df4d88d35701b 100644 +--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c ++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c +@@ -12081,6 +12081,8 @@ static void bnxt_sp_task(struct work_struct *work) + bnxt_cfg_ntp_filters(bp); + if (test_and_clear_bit(BNXT_HWRM_EXEC_FWD_REQ_SP_EVENT, &bp->sp_event)) + bnxt_hwrm_exec_fwd_req(bp); ++ if (test_and_clear_bit(BNXT_HWRM_PF_UNLOAD_SP_EVENT, &bp->sp_event)) ++ netdev_info(bp->dev, "Receive PF driver unload event!\n"); + if (test_and_clear_bit(BNXT_PERIODIC_STATS_SP_EVENT, &bp->sp_event)) { + bnxt_hwrm_port_qstats(bp, 0); + bnxt_hwrm_port_qstats_ext(bp, 0); +@@ -13059,8 +13061,6 @@ static void bnxt_cfg_ntp_filters(struct bnxt *bp) + } + } + } +- if (test_and_clear_bit(BNXT_HWRM_PF_UNLOAD_SP_EVENT, &bp->sp_event)) +- netdev_info(bp->dev, "Receive PF driver unload event!\n"); + } + + #else +-- +2.43.0 + diff --git a/queue-6.1/bpf-clean-up-visit_insn-s-instruction-processing.patch b/queue-6.1/bpf-clean-up-visit_insn-s-instruction-processing.patch new file mode 100644 index 00000000000..595498a9ad6 --- /dev/null +++ b/queue-6.1/bpf-clean-up-visit_insn-s-instruction-processing.patch @@ -0,0 +1,96 @@ +From 02818dc2580eae9be766e1be3885bdeeeb7ef526 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 2 Mar 2023 15:50:04 -0800 +Subject: bpf: clean up visit_insn()'s instruction processing + +From: Andrii Nakryiko + +[ Upstream commit 653ae3a874aca6764a4c1f5a8bf1b072ade0d6f4 ] + +Instead of referencing processed instruction repeatedly as insns[t] +throughout entire visit_insn() function, take a local insn pointer and +work with it in a cleaner way. + +It makes enhancing this function further a bit easier as well. 
+ +Signed-off-by: Andrii Nakryiko +Link: https://lore.kernel.org/r/20230302235015.2044271-7-andrii@kernel.org +Signed-off-by: Alexei Starovoitov +Stable-dep-of: 3feb263bb516 ("bpf: handle ldimm64 properly in check_cfg()") +Signed-off-by: Sasha Levin +--- + kernel/bpf/verifier.c | 25 ++++++++++++------------- + 1 file changed, 12 insertions(+), 13 deletions(-) + +diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c +index d1393e07ab2c9..73d500c51bd86 100644 +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -11115,44 +11115,43 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, + */ + static int visit_insn(int t, struct bpf_verifier_env *env) + { +- struct bpf_insn *insns = env->prog->insnsi; ++ struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t]; + int ret; + +- if (bpf_pseudo_func(insns + t)) ++ if (bpf_pseudo_func(insn)) + return visit_func_call_insn(t, insns, env, true); + + /* All non-branch instructions have a single fall-through edge. */ +- if (BPF_CLASS(insns[t].code) != BPF_JMP && +- BPF_CLASS(insns[t].code) != BPF_JMP32) ++ if (BPF_CLASS(insn->code) != BPF_JMP && ++ BPF_CLASS(insn->code) != BPF_JMP32) + return push_insn(t, t + 1, FALLTHROUGH, env, false); + +- switch (BPF_OP(insns[t].code)) { ++ switch (BPF_OP(insn->code)) { + case BPF_EXIT: + return DONE_EXPLORING; + + case BPF_CALL: +- if (insns[t].imm == BPF_FUNC_timer_set_callback) ++ if (insn->imm == BPF_FUNC_timer_set_callback) + /* Mark this call insn as a prune point to trigger + * is_state_visited() check before call itself is + * processed by __check_func_call(). Otherwise new + * async state will be pushed for further exploration. + */ + mark_prune_point(env, t); +- return visit_func_call_insn(t, insns, env, +- insns[t].src_reg == BPF_PSEUDO_CALL); ++ return visit_func_call_insn(t, insns, env, insn->src_reg == BPF_PSEUDO_CALL); + + case BPF_JA: +- if (BPF_SRC(insns[t].code) != BPF_K) ++ if (BPF_SRC(insn->code) != BPF_K) + return -EINVAL; + + /* unconditional jump with single edge */ +- ret = push_insn(t, t + insns[t].off + 1, FALLTHROUGH, env, ++ ret = push_insn(t, t + insn->off + 1, FALLTHROUGH, env, + true); + if (ret) + return ret; + +- mark_prune_point(env, t + insns[t].off + 1); +- mark_jmp_point(env, t + insns[t].off + 1); ++ mark_prune_point(env, t + insn->off + 1); ++ mark_jmp_point(env, t + insn->off + 1); + + return ret; + +@@ -11164,7 +11163,7 @@ static int visit_insn(int t, struct bpf_verifier_env *env) + if (ret) + return ret; + +- return push_insn(t, t + insns[t].off + 1, BRANCH, env, true); ++ return push_insn(t, t + insn->off + 1, BRANCH, env, true); + } + } + +-- +2.43.0 + diff --git a/queue-6.1/bpf-decouple-prune-and-jump-points.patch b/queue-6.1/bpf-decouple-prune-and-jump-points.patch new file mode 100644 index 00000000000..5a9392d3dbf --- /dev/null +++ b/queue-6.1/bpf-decouple-prune-and-jump-points.patch @@ -0,0 +1,197 @@ +From 1d848bcf5df37f2bdcc07a0518140fe62ed6383b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 6 Dec 2022 15:33:43 -0800 +Subject: bpf: decouple prune and jump points + +From: Andrii Nakryiko + +[ Upstream commit bffdeaa8a5af7200b0e74c9d5a41167f86626a36 ] + +BPF verifier marks some instructions as prune points. Currently these +prune points serve two purposes. + +It's a point where verifier tries to find previously verified state and +check current state's equivalence to short circuit verification for +current code path. + +But also currently it's a point where jump history, used for precision +backtracking, is updated. 
This is done so that non-linear flow of +execution could be properly backtracked. + +Such coupling is coincidental and unnecessary. Some prune points are not +part of some non-linear jump path, so don't need update of jump history. +On the other hand, not all instructions which have to be recorded in +jump history necessarily are good prune points. + +This patch splits prune and jump points into independent flags. +Currently all prune points are marked as jump points to minimize amount +of changes in this patch, but next patch will perform some optimization +of prune vs jmp point placement. + +No functional changes are intended. + +Acked-by: John Fastabend +Signed-off-by: Andrii Nakryiko +Link: https://lore.kernel.org/r/20221206233345.438540-2-andrii@kernel.org +Signed-off-by: Alexei Starovoitov +Stable-dep-of: 3feb263bb516 ("bpf: handle ldimm64 properly in check_cfg()") +Signed-off-by: Sasha Levin +--- + include/linux/bpf_verifier.h | 1 + + kernel/bpf/verifier.c | 57 +++++++++++++++++++++++++++--------- + 2 files changed, 44 insertions(+), 14 deletions(-) + +diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h +index 1a32baa78ce26..f080ccf27d256 100644 +--- a/include/linux/bpf_verifier.h ++++ b/include/linux/bpf_verifier.h +@@ -429,6 +429,7 @@ struct bpf_insn_aux_data { + /* below fields are initialized once */ + unsigned int orig_idx; /* original instruction index */ + bool prune_point; ++ bool jmp_point; + }; + + #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ +diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c +index ee6e811b43158..ec688665aaa25 100644 +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -2512,6 +2512,16 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, + return 0; + } + ++static void mark_jmp_point(struct bpf_verifier_env *env, int idx) ++{ ++ env->insn_aux_data[idx].jmp_point = true; ++} ++ ++static bool is_jmp_point(struct bpf_verifier_env *env, int insn_idx) ++{ ++ return env->insn_aux_data[insn_idx].jmp_point; ++} ++ + /* for any branch, call, exit record the history of jmps in the given state */ + static int push_jmp_history(struct bpf_verifier_env *env, + struct bpf_verifier_state *cur) +@@ -2520,6 +2530,9 @@ static int push_jmp_history(struct bpf_verifier_env *env, + struct bpf_idx_pair *p; + size_t alloc_size; + ++ if (!is_jmp_point(env, env->insn_idx)) ++ return 0; ++ + cnt++; + alloc_size = kmalloc_size_roundup(size_mul(cnt, sizeof(*p))); + p = krealloc(cur->jmp_history, alloc_size, GFP_USER); +@@ -11000,11 +11013,16 @@ static struct bpf_verifier_state_list **explored_state( + return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)]; + } + +-static void init_explored_state(struct bpf_verifier_env *env, int idx) ++static void mark_prune_point(struct bpf_verifier_env *env, int idx) + { + env->insn_aux_data[idx].prune_point = true; + } + ++static bool is_prune_point(struct bpf_verifier_env *env, int insn_idx) ++{ ++ return env->insn_aux_data[insn_idx].prune_point; ++} ++ + enum { + DONE_EXPLORING = 0, + KEEP_EXPLORING = 1, +@@ -11033,9 +11051,11 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, + return -EINVAL; + } + +- if (e == BRANCH) ++ if (e == BRANCH) { + /* mark branch target for state pruning */ +- init_explored_state(env, w); ++ mark_prune_point(env, w); ++ mark_jmp_point(env, w); ++ } + + if (insn_state[w] == 0) { + /* tree-edge */ +@@ -11073,10 +11093,13 @@ static int visit_func_call_insn(int t, int insn_cnt, + if (ret) 
+ return ret; + +- if (t + 1 < insn_cnt) +- init_explored_state(env, t + 1); ++ if (t + 1 < insn_cnt) { ++ mark_prune_point(env, t + 1); ++ mark_jmp_point(env, t + 1); ++ } + if (visit_callee) { +- init_explored_state(env, t); ++ mark_prune_point(env, t); ++ mark_jmp_point(env, t); + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env, + /* It's ok to allow recursion from CFG point of + * view. __check_func_call() will do the actual +@@ -11110,13 +11133,15 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) + return DONE_EXPLORING; + + case BPF_CALL: +- if (insns[t].imm == BPF_FUNC_timer_set_callback) ++ if (insns[t].imm == BPF_FUNC_timer_set_callback) { + /* Mark this call insn to trigger is_state_visited() check + * before call itself is processed by __check_func_call(). + * Otherwise new async state will be pushed for further + * exploration. + */ +- init_explored_state(env, t); ++ mark_prune_point(env, t); ++ mark_jmp_point(env, t); ++ } + return visit_func_call_insn(t, insn_cnt, insns, env, + insns[t].src_reg == BPF_PSEUDO_CALL); + +@@ -11134,18 +11159,22 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) + * but it's marked, since backtracking needs + * to record jmp history in is_state_visited(). + */ +- init_explored_state(env, t + insns[t].off + 1); ++ mark_prune_point(env, t + insns[t].off + 1); ++ mark_jmp_point(env, t + insns[t].off + 1); + /* tell verifier to check for equivalent states + * after every call and jump + */ +- if (t + 1 < insn_cnt) +- init_explored_state(env, t + 1); ++ if (t + 1 < insn_cnt) { ++ mark_prune_point(env, t + 1); ++ mark_jmp_point(env, t + 1); ++ } + + return ret; + + default: + /* conditional jump with two edges */ +- init_explored_state(env, t); ++ mark_prune_point(env, t); ++ mark_jmp_point(env, t); + ret = push_insn(t, t + 1, FALLTHROUGH, env, true); + if (ret) + return ret; +@@ -12178,11 +12207,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) + bool add_new_state = env->test_state_freq ? true : false; + + cur->last_insn_idx = env->prev_insn_idx; +- if (!env->insn_aux_data[insn_idx].prune_point) ++ if (!is_prune_point(env, insn_idx)) + /* this 'insn_idx' instruction wasn't marked, so we will not + * be doing state search here + */ +- return 0; ++ return push_jmp_history(env, cur); + + /* bpf progs typically have pruning point every 4 instructions + * http://vger.kernel.org/bpfconf2019.html#session-1 +-- +2.43.0 + diff --git a/queue-6.1/bpf-fix-precision-backtracking-instruction-iteration.patch b/queue-6.1/bpf-fix-precision-backtracking-instruction-iteration.patch new file mode 100644 index 00000000000..83a120e90bf --- /dev/null +++ b/queue-6.1/bpf-fix-precision-backtracking-instruction-iteration.patch @@ -0,0 +1,89 @@ +From 5f576d9732e2017e5f5e1da533df5a11be2b311b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 9 Nov 2023 16:26:37 -0800 +Subject: bpf: fix precision backtracking instruction iteration + +From: Andrii Nakryiko + +[ Upstream commit 4bb7ea946a370707315ab774432963ce47291946 ] + +Fix an edge case in __mark_chain_precision() which prematurely stops +backtracking instructions in a state if it happens that state's first +and last instruction indexes are the same. This situations doesn't +necessarily mean that there were no instructions simulated in a state, +but rather that we starting from the instruction, jumped around a bit, +and then ended up at the same instruction before checkpointing or +marking precision. 
+ +To distinguish between these two possible situations, we need to consult +jump history. If it's empty or contain a single record "bridging" parent +state and first instruction of processed state, then we indeed +backtracked all instructions in this state. But if history is not empty, +we are definitely not done yet. + +Move this logic inside get_prev_insn_idx() to contain it more nicely. +Use -ENOENT return code to denote "we are out of instructions" +situation. + +This bug was exposed by verifier_loop1.c's bounded_recursion subtest, once +the next fix in this patch set is applied. + +Acked-by: Eduard Zingerman +Fixes: b5dc0163d8fd ("bpf: precise scalar_value tracking") +Signed-off-by: Andrii Nakryiko +Link: https://lore.kernel.org/r/20231110002638.4168352-3-andrii@kernel.org +Signed-off-by: Alexei Starovoitov +Signed-off-by: Sasha Levin +--- + kernel/bpf/verifier.c | 21 +++++++++++++++++++-- + 1 file changed, 19 insertions(+), 2 deletions(-) + +diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c +index 95521beec66c5..142e10d49fd81 100644 +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -2551,12 +2551,29 @@ static int push_jmp_history(struct bpf_verifier_env *env, + + /* Backtrack one insn at a time. If idx is not at the top of recorded + * history then previous instruction came from straight line execution. ++ * Return -ENOENT if we exhausted all instructions within given state. ++ * ++ * It's legal to have a bit of a looping with the same starting and ending ++ * insn index within the same state, e.g.: 3->4->5->3, so just because current ++ * instruction index is the same as state's first_idx doesn't mean we are ++ * done. If there is still some jump history left, we should keep going. We ++ * need to take into account that we might have a jump history between given ++ * state's parent and itself, due to checkpointing. In this case, we'll have ++ * history entry recording a jump from last instruction of parent state and ++ * first instruction of given state. + */ + static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, + u32 *history) + { + u32 cnt = *history; + ++ if (i == st->first_insn_idx) { ++ if (cnt == 0) ++ return -ENOENT; ++ if (cnt == 1 && st->jmp_history[0].idx == i) ++ return -ENOENT; ++ } ++ + if (cnt && st->jmp_history[cnt - 1].idx == i) { + i = st->jmp_history[cnt - 1].prev_idx; + (*history)--; +@@ -3052,9 +3069,9 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int frame, int r + * Nothing to be tracked further in the parent state. + */ + return 0; +- if (i == first_idx) +- break; + i = get_prev_insn_idx(st, i, &history); ++ if (i == -ENOENT) ++ break; + if (i >= env->prog->len) { + /* This can happen if backtracking reached insn 0 + * and there are still reg_mask or stack_mask +-- +2.43.0 + diff --git a/queue-6.1/bpf-handle-ldimm64-properly-in-check_cfg.patch b/queue-6.1/bpf-handle-ldimm64-properly-in-check_cfg.patch new file mode 100644 index 00000000000..546520d0817 --- /dev/null +++ b/queue-6.1/bpf-handle-ldimm64-properly-in-check_cfg.patch @@ -0,0 +1,153 @@ +From e72d96cb30d0d7cac5d70679da65b38e3fded5d9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 9 Nov 2023 16:26:36 -0800 +Subject: bpf: handle ldimm64 properly in check_cfg() + +From: Andrii Nakryiko + +[ Upstream commit 3feb263bb516ee7e1da0acd22b15afbb9a7daa19 ] + +ldimm64 instructions are 16-byte long, and so have to be handled +appropriately in check_cfg(), just like the rest of BPF verifier does. 
+ +This has implications in three places: + - when determining next instruction for non-jump instructions; + - when determining next instruction for callback address ldimm64 + instructions (in visit_func_call_insn()); + - when checking for unreachable instructions, where second half of + ldimm64 is expected to be unreachable; + +We take this also as an opportunity to report jump into the middle of +ldimm64. And adjust few test_verifier tests accordingly. + +Acked-by: Eduard Zingerman +Reported-by: Hao Sun +Fixes: 475fb78fbf48 ("bpf: verifier (add branch/goto checks)") +Signed-off-by: Andrii Nakryiko +Link: https://lore.kernel.org/r/20231110002638.4168352-2-andrii@kernel.org +Signed-off-by: Alexei Starovoitov +Signed-off-by: Sasha Levin +--- + include/linux/bpf.h | 8 ++++-- + kernel/bpf/verifier.c | 27 ++++++++++++++----- + .../testing/selftests/bpf/verifier/ld_imm64.c | 8 +++--- + 3 files changed, 30 insertions(+), 13 deletions(-) + +diff --git a/include/linux/bpf.h b/include/linux/bpf.h +index 619fcba84be22..ba22cf4f5fc0e 100644 +--- a/include/linux/bpf.h ++++ b/include/linux/bpf.h +@@ -702,10 +702,14 @@ bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size) + aux->ctx_field_size = size; + } + ++static bool bpf_is_ldimm64(const struct bpf_insn *insn) ++{ ++ return insn->code == (BPF_LD | BPF_IMM | BPF_DW); ++} ++ + static inline bool bpf_pseudo_func(const struct bpf_insn *insn) + { +- return insn->code == (BPF_LD | BPF_IMM | BPF_DW) && +- insn->src_reg == BPF_PSEUDO_FUNC; ++ return bpf_is_ldimm64(insn) && insn->src_reg == BPF_PSEUDO_FUNC; + } + + struct bpf_prog_ops { +diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c +index dd025f66efabc..95521beec66c5 100644 +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -11090,15 +11090,16 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, + struct bpf_verifier_env *env, + bool visit_callee) + { +- int ret; ++ int ret, insn_sz; + +- ret = push_insn(t, t + 1, FALLTHROUGH, env, false); ++ insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1; ++ ret = push_insn(t, t + insn_sz, FALLTHROUGH, env, false); + if (ret) + return ret; + +- mark_prune_point(env, t + 1); ++ mark_prune_point(env, t + insn_sz); + /* when we exit from subprog, we need to record non-linear history */ +- mark_jmp_point(env, t + 1); ++ mark_jmp_point(env, t + insn_sz); + + if (visit_callee) { + mark_prune_point(env, t); +@@ -11120,15 +11121,17 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, + static int visit_insn(int t, struct bpf_verifier_env *env) + { + struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t]; +- int ret, off; ++ int ret, off, insn_sz; + + if (bpf_pseudo_func(insn)) + return visit_func_call_insn(t, insns, env, true); + + /* All non-branch instructions have a single fall-through edge. */ + if (BPF_CLASS(insn->code) != BPF_JMP && +- BPF_CLASS(insn->code) != BPF_JMP32) +- return push_insn(t, t + 1, FALLTHROUGH, env, false); ++ BPF_CLASS(insn->code) != BPF_JMP32) { ++ insn_sz = bpf_is_ldimm64(insn) ? 
2 : 1; ++ return push_insn(t, t + insn_sz, FALLTHROUGH, env, false); ++ } + + switch (BPF_OP(insn->code)) { + case BPF_EXIT: +@@ -11227,11 +11230,21 @@ static int check_cfg(struct bpf_verifier_env *env) + } + + for (i = 0; i < insn_cnt; i++) { ++ struct bpf_insn *insn = &env->prog->insnsi[i]; ++ + if (insn_state[i] != EXPLORED) { + verbose(env, "unreachable insn %d\n", i); + ret = -EINVAL; + goto err_free; + } ++ if (bpf_is_ldimm64(insn)) { ++ if (insn_state[i + 1] != 0) { ++ verbose(env, "jump into the middle of ldimm64 insn %d\n", i); ++ ret = -EINVAL; ++ goto err_free; ++ } ++ i++; /* skip second half of ldimm64 */ ++ } + } + ret = 0; /* cfg looks good */ + +diff --git a/tools/testing/selftests/bpf/verifier/ld_imm64.c b/tools/testing/selftests/bpf/verifier/ld_imm64.c +index f9297900cea6d..78f19c255f20b 100644 +--- a/tools/testing/selftests/bpf/verifier/ld_imm64.c ++++ b/tools/testing/selftests/bpf/verifier/ld_imm64.c +@@ -9,8 +9,8 @@ + BPF_MOV64_IMM(BPF_REG_0, 2), + BPF_EXIT_INSN(), + }, +- .errstr = "invalid BPF_LD_IMM insn", +- .errstr_unpriv = "R1 pointer comparison", ++ .errstr = "jump into the middle of ldimm64 insn 1", ++ .errstr_unpriv = "jump into the middle of ldimm64 insn 1", + .result = REJECT, + }, + { +@@ -23,8 +23,8 @@ + BPF_LD_IMM64(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, +- .errstr = "invalid BPF_LD_IMM insn", +- .errstr_unpriv = "R1 pointer comparison", ++ .errstr = "jump into the middle of ldimm64 insn 1", ++ .errstr_unpriv = "jump into the middle of ldimm64 insn 1", + .result = REJECT, + }, + { +-- +2.43.0 + diff --git a/queue-6.1/bpf-remove-unnecessary-prune-and-jump-points.patch b/queue-6.1/bpf-remove-unnecessary-prune-and-jump-points.patch new file mode 100644 index 00000000000..1096eae636f --- /dev/null +++ b/queue-6.1/bpf-remove-unnecessary-prune-and-jump-points.patch @@ -0,0 +1,112 @@ +From 90b6441df9cf455cd5ad99ec2231d29e605a5a47 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 6 Dec 2022 15:33:45 -0800 +Subject: bpf: remove unnecessary prune and jump points + +From: Andrii Nakryiko + +[ Upstream commit 618945fbed501b6e5865042068a51edfb2dda948 ] + +Don't mark some instructions as jump points when there are actually no +jumps and instructions are just processed sequentially. Such case is +handled naturally by precision backtracking logic without the need to +update jump history. See get_prev_insn_idx(). It goes back linearly by +one instruction, unless current top of jmp_history is pointing to +current instruction. In such case we use `st->jmp_history[cnt - 1].prev_idx` +to find instruction from which we jumped to the current instruction +non-linearly. + +Also remove both jump and prune point marking for instruction right +after unconditional jumps, as program flow can get to the instruction +right after unconditional jump instruction only if there is a jump to +that instruction from somewhere else in the program. In such case we'll +mark such instruction as prune/jump point because it's a destination of +a jump. + +This change has no changes in terms of number of instructions or states +processes across Cilium and selftests programs. 
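
As an illustrative aside (not part of the upstream commit), here is a minimal standalone C model of the walk-back that get_prev_insn_idx() performs, as described above: step back one instruction at a time, except when the newest jump-history entry records a jump that landed on the current instruction. The instruction indexes and the single history entry are invented for the demonstration.

#include <stdio.h>

struct jmp_entry { int prev_idx, idx; };

static int prev_insn_idx(int i, const struct jmp_entry *hist, int *cnt)
{
	if (*cnt && hist[*cnt - 1].idx == i) {
		i = hist[*cnt - 1].prev_idx;	/* non-linear: follow the recorded jump */
		(*cnt)--;
	} else {
		i--;				/* straight-line execution: step back by one */
	}
	return i;
}

int main(void)
{
	/* pretend the program jumped from insn 2 to insn 7, then ran 7, 8, 9 linearly */
	struct jmp_entry hist[] = { { .prev_idx = 2, .idx = 7 } };
	int cnt = 1, i = 9;

	while (i > 0) {
		printf("%d ", i);
		i = prev_insn_idx(i, hist, &cnt);
	}
	printf("%d\n", i);	/* backtrack order: 9 8 7 2 1 0 */
	return 0;
}

Running this toy prints 9 8 7 2 1 0: the walk is linear until it reaches insn 7, where the recorded history entry sends it back to insn 2. That is why instructions reached only by straight-line execution do not need a prune/jump mark of their own.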
+ +Signed-off-by: Andrii Nakryiko +Acked-by: John Fastabend +Link: https://lore.kernel.org/r/20221206233345.438540-4-andrii@kernel.org +Signed-off-by: Alexei Starovoitov +Stable-dep-of: 3feb263bb516 ("bpf: handle ldimm64 properly in check_cfg()") +Signed-off-by: Sasha Levin +--- + kernel/bpf/verifier.c | 34 ++++++++++------------------------ + 1 file changed, 10 insertions(+), 24 deletions(-) + +diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c +index ec688665aaa25..09631797d9e0c 100644 +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -11093,13 +11093,12 @@ static int visit_func_call_insn(int t, int insn_cnt, + if (ret) + return ret; + +- if (t + 1 < insn_cnt) { +- mark_prune_point(env, t + 1); +- mark_jmp_point(env, t + 1); +- } ++ mark_prune_point(env, t + 1); ++ /* when we exit from subprog, we need to record non-linear history */ ++ mark_jmp_point(env, t + 1); ++ + if (visit_callee) { + mark_prune_point(env, t); +- mark_jmp_point(env, t); + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env, + /* It's ok to allow recursion from CFG point of + * view. __check_func_call() will do the actual +@@ -11133,15 +11132,13 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) + return DONE_EXPLORING; + + case BPF_CALL: +- if (insns[t].imm == BPF_FUNC_timer_set_callback) { +- /* Mark this call insn to trigger is_state_visited() check +- * before call itself is processed by __check_func_call(). +- * Otherwise new async state will be pushed for further +- * exploration. ++ if (insns[t].imm == BPF_FUNC_timer_set_callback) ++ /* Mark this call insn as a prune point to trigger ++ * is_state_visited() check before call itself is ++ * processed by __check_func_call(). Otherwise new ++ * async state will be pushed for further exploration. + */ + mark_prune_point(env, t); +- mark_jmp_point(env, t); +- } + return visit_func_call_insn(t, insn_cnt, insns, env, + insns[t].src_reg == BPF_PSEUDO_CALL); + +@@ -11155,26 +11152,15 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) + if (ret) + return ret; + +- /* unconditional jmp is not a good pruning point, +- * but it's marked, since backtracking needs +- * to record jmp history in is_state_visited(). +- */ + mark_prune_point(env, t + insns[t].off + 1); + mark_jmp_point(env, t + insns[t].off + 1); +- /* tell verifier to check for equivalent states +- * after every call and jump +- */ +- if (t + 1 < insn_cnt) { +- mark_prune_point(env, t + 1); +- mark_jmp_point(env, t + 1); +- } + + return ret; + + default: + /* conditional jump with two edges */ + mark_prune_point(env, t); +- mark_jmp_point(env, t); ++ + ret = push_insn(t, t + 1, FALLTHROUGH, env, true); + if (ret) + return ret; +-- +2.43.0 + diff --git a/queue-6.1/bpf-remove-unused-insn_cnt-argument-from-visit_-func.patch b/queue-6.1/bpf-remove-unused-insn_cnt-argument-from-visit_-func.patch new file mode 100644 index 00000000000..6e5546db2c4 --- /dev/null +++ b/queue-6.1/bpf-remove-unused-insn_cnt-argument-from-visit_-func.patch @@ -0,0 +1,76 @@ +From b9857568c364a47cb907e60b86ee7c0a1f73a7b7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 7 Dec 2022 11:55:34 -0800 +Subject: bpf: Remove unused insn_cnt argument from visit_[func_call_]insn() + +From: Andrii Nakryiko + +[ Upstream commit dcb2288b1fd9a8cdf2f3b8c0c7b3763346ef515f ] + +Number of total instructions in BPF program (including subprogs) can and +is accessed from env->prog->len. 
visit_func_call_insn() doesn't do any +checks against insn_cnt anymore, relying on push_insn() to do this check +internally. So remove unnecessary insn_cnt input argument from +visit_func_call_insn() and visit_insn() functions. + +Suggested-by: Alexei Starovoitov +Signed-off-by: Andrii Nakryiko +Signed-off-by: Daniel Borkmann +Link: https://lore.kernel.org/bpf/20221207195534.2866030-1-andrii@kernel.org +Stable-dep-of: 3feb263bb516 ("bpf: handle ldimm64 properly in check_cfg()") +Signed-off-by: Sasha Levin +--- + kernel/bpf/verifier.c | 11 +++++------ + 1 file changed, 5 insertions(+), 6 deletions(-) + +diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c +index 09631797d9e0c..d1393e07ab2c9 100644 +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -11082,8 +11082,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, + return DONE_EXPLORING; + } + +-static int visit_func_call_insn(int t, int insn_cnt, +- struct bpf_insn *insns, ++static int visit_func_call_insn(int t, struct bpf_insn *insns, + struct bpf_verifier_env *env, + bool visit_callee) + { +@@ -11114,13 +11113,13 @@ static int visit_func_call_insn(int t, int insn_cnt, + * DONE_EXPLORING - the instruction was fully explored + * KEEP_EXPLORING - there is still work to be done before it is fully explored + */ +-static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) ++static int visit_insn(int t, struct bpf_verifier_env *env) + { + struct bpf_insn *insns = env->prog->insnsi; + int ret; + + if (bpf_pseudo_func(insns + t)) +- return visit_func_call_insn(t, insn_cnt, insns, env, true); ++ return visit_func_call_insn(t, insns, env, true); + + /* All non-branch instructions have a single fall-through edge. */ + if (BPF_CLASS(insns[t].code) != BPF_JMP && +@@ -11139,7 +11138,7 @@ static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) + * async state will be pushed for further exploration. + */ + mark_prune_point(env, t); +- return visit_func_call_insn(t, insn_cnt, insns, env, ++ return visit_func_call_insn(t, insns, env, + insns[t].src_reg == BPF_PSEUDO_CALL); + + case BPF_JA: +@@ -11196,7 +11195,7 @@ static int check_cfg(struct bpf_verifier_env *env) + while (env->cfg.cur_stack > 0) { + int t = insn_stack[env->cfg.cur_stack - 1]; + +- ret = visit_insn(t, insn_cnt, env); ++ ret = visit_insn(t, env); + switch (ret) { + case DONE_EXPLORING: + insn_state[t] = EXPLORED; +-- +2.43.0 + diff --git a/queue-6.1/bpf-sockmap-af_unix-stream-sockets-need-to-hold-ref-.patch b/queue-6.1/bpf-sockmap-af_unix-stream-sockets-need-to-hold-ref-.patch new file mode 100644 index 00000000000..e8786d2e6e6 --- /dev/null +++ b/queue-6.1/bpf-sockmap-af_unix-stream-sockets-need-to-hold-ref-.patch @@ -0,0 +1,139 @@ +From 2e5ec045cba65071ef0736ce3d6a2e56106c261d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 28 Nov 2023 17:25:56 -0800 +Subject: bpf, sockmap: af_unix stream sockets need to hold ref for pair sock + +From: John Fastabend + +[ Upstream commit 8866730aed5100f06d3d965c22f1c61f74942541 ] + +AF_UNIX stream sockets are a paired socket. So sending on one of the pairs +will lookup the paired socket as part of the send operation. It is possible +however to put just one of the pairs in a BPF map. This currently increments +the refcnt on the sock in the sockmap to ensure it is not free'd by the +stack before sockmap cleans up its state and stops any skbs being sent/recv'd +to that socket. + +But we missed a case. If the peer socket is closed it will be free'd by the +stack. 
However, the paired socket can still be referenced from BPF sockmap +side because we hold a reference there. Then if we are sending traffic through +BPF sockmap to that socket it will try to dereference the free'd pair in its +send logic creating a use after free. And following splat: + + [59.900375] BUG: KASAN: slab-use-after-free in sk_wake_async+0x31/0x1b0 + [59.901211] Read of size 8 at addr ffff88811acbf060 by task kworker/1:2/954 + [...] + [59.905468] Call Trace: + [59.905787] + [59.906066] dump_stack_lvl+0x130/0x1d0 + [59.908877] print_report+0x16f/0x740 + [59.910629] kasan_report+0x118/0x160 + [59.912576] sk_wake_async+0x31/0x1b0 + [59.913554] sock_def_readable+0x156/0x2a0 + [59.914060] unix_stream_sendmsg+0x3f9/0x12a0 + [59.916398] sock_sendmsg+0x20e/0x250 + [59.916854] skb_send_sock+0x236/0xac0 + [59.920527] sk_psock_backlog+0x287/0xaa0 + +To fix let BPF sockmap hold a refcnt on both the socket in the sockmap and its +paired socket. It wasn't obvious how to contain the fix to bpf_unix logic. The +primarily problem with keeping this logic in bpf_unix was: In the sock close() +we could handle the deref by having a close handler. But, when we are destroying +the psock through a map delete operation we wouldn't have gotten any signal +thorugh the proto struct other than it being replaced. If we do the deref from +the proto replace its too early because we need to deref the sk_pair after the +backlog worker has been stopped. + +Given all this it seems best to just cache it at the end of the psock and eat 8B +for the af_unix and vsock users. Notice dgram sockets are OK because they handle +locking already. + +Fixes: 94531cfcbe79 ("af_unix: Add unix_stream_proto for sockmap") +Signed-off-by: John Fastabend +Signed-off-by: Daniel Borkmann +Reviewed-by: Jakub Sitnicki +Link: https://lore.kernel.org/bpf/20231129012557.95371-2-john.fastabend@gmail.com +Signed-off-by: Sasha Levin +--- + include/linux/skmsg.h | 1 + + include/net/af_unix.h | 1 + + net/core/skmsg.c | 2 ++ + net/unix/af_unix.c | 2 -- + net/unix/unix_bpf.c | 5 +++++ + 5 files changed, 9 insertions(+), 2 deletions(-) + +diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h +index c1637515a8a41..c953b8c0d2f43 100644 +--- a/include/linux/skmsg.h ++++ b/include/linux/skmsg.h +@@ -106,6 +106,7 @@ struct sk_psock { + struct mutex work_mutex; + struct sk_psock_work_state work_state; + struct delayed_work work; ++ struct sock *sk_pair; + struct rcu_work rwork; + }; + +diff --git a/include/net/af_unix.h b/include/net/af_unix.h +index 480fa579787e5..55ca217c626b7 100644 +--- a/include/net/af_unix.h ++++ b/include/net/af_unix.h +@@ -77,6 +77,7 @@ static inline struct unix_sock *unix_sk(const struct sock *sk) + { + return (struct unix_sock *)sk; + } ++#define unix_peer(sk) (unix_sk(sk)->peer) + + #define peer_wait peer_wq.wait + +diff --git a/net/core/skmsg.c b/net/core/skmsg.c +index a5c1f67dc96ec..3818035ea0021 100644 +--- a/net/core/skmsg.c ++++ b/net/core/skmsg.c +@@ -825,6 +825,8 @@ static void sk_psock_destroy(struct work_struct *work) + + if (psock->sk_redir) + sock_put(psock->sk_redir); ++ if (psock->sk_pair) ++ sock_put(psock->sk_pair); + sock_put(psock->sk); + kfree(psock); + } +diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c +index 6dbeb80073338..be2ed7b0fe21c 100644 +--- a/net/unix/af_unix.c ++++ b/net/unix/af_unix.c +@@ -211,8 +211,6 @@ static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) + } + #endif /* CONFIG_SECURITY_NETWORK */ + +-#define unix_peer(sk) (unix_sk(sk)->peer) +- + static 
inline int unix_our_peer(struct sock *sk, struct sock *osk) + { + return unix_peer(osk) == sk; +diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c +index 2f9d8271c6ec7..7ea7c3a0d0d06 100644 +--- a/net/unix/unix_bpf.c ++++ b/net/unix/unix_bpf.c +@@ -159,12 +159,17 @@ int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool re + + int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) + { ++ struct sock *sk_pair; ++ + if (restore) { + sk->sk_write_space = psock->saved_write_space; + sock_replace_proto(sk, psock->sk_proto); + return 0; + } + ++ sk_pair = unix_peer(sk); ++ sock_hold(sk_pair); ++ psock->sk_pair = sk_pair; + unix_stream_bpf_check_needs_rebuild(psock->sk_proto); + sock_replace_proto(sk, &unix_stream_bpf_prot); + return 0; +-- +2.43.0 + diff --git a/queue-6.1/bpf-support-new-32bit-offset-jmp-instruction.patch b/queue-6.1/bpf-support-new-32bit-offset-jmp-instruction.patch new file mode 100644 index 00000000000..d4048e1ef0a --- /dev/null +++ b/queue-6.1/bpf-support-new-32bit-offset-jmp-instruction.patch @@ -0,0 +1,212 @@ +From 7c7c0669562f577fee8dbb0e780a26bc7a1b146a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 27 Jul 2023 18:12:31 -0700 +Subject: bpf: Support new 32bit offset jmp instruction + +From: Yonghong Song + +[ Upstream commit 4cd58e9af8b9d9fff6b7145e742abbfcda0af4af ] + +Add interpreter/jit/verifier support for 32bit offset jmp instruction. +If a conditional jmp instruction needs more than 16bit offset, +it can be simulated with a conditional jmp + a 32bit jmp insn. + +Acked-by: Eduard Zingerman +Signed-off-by: Yonghong Song +Link: https://lore.kernel.org/r/20230728011231.3716103-1-yonghong.song@linux.dev +Signed-off-by: Alexei Starovoitov +Stable-dep-of: 3feb263bb516 ("bpf: handle ldimm64 properly in check_cfg()") +Signed-off-by: Sasha Levin +--- + arch/x86/net/bpf_jit_comp.c | 28 ++++++++++++++++++---------- + kernel/bpf/core.c | 19 ++++++++++++++++--- + kernel/bpf/verifier.c | 32 ++++++++++++++++++++++---------- + 3 files changed, 56 insertions(+), 23 deletions(-) + +diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c +index 84c695ae1940f..b69aee6245e4a 100644 +--- a/arch/x86/net/bpf_jit_comp.c ++++ b/arch/x86/net/bpf_jit_comp.c +@@ -1625,16 +1625,24 @@ st: if (is_imm8(insn->off)) + break; + + case BPF_JMP | BPF_JA: +- if (insn->off == -1) +- /* -1 jmp instructions will always jump +- * backwards two bytes. Explicitly handling +- * this case avoids wasting too many passes +- * when there are long sequences of replaced +- * dead code. +- */ +- jmp_offset = -2; +- else +- jmp_offset = addrs[i + insn->off] - addrs[i]; ++ case BPF_JMP32 | BPF_JA: ++ if (BPF_CLASS(insn->code) == BPF_JMP) { ++ if (insn->off == -1) ++ /* -1 jmp instructions will always jump ++ * backwards two bytes. Explicitly handling ++ * this case avoids wasting too many passes ++ * when there are long sequences of replaced ++ * dead code. 
++ */ ++ jmp_offset = -2; ++ else ++ jmp_offset = addrs[i + insn->off] - addrs[i]; ++ } else { ++ if (insn->imm == -1) ++ jmp_offset = -2; ++ else ++ jmp_offset = addrs[i + insn->imm] - addrs[i]; ++ } + + if (!jmp_offset) { + /* +diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c +index 7225cb67c0d3a..0b55ebf4a9b1f 100644 +--- a/kernel/bpf/core.c ++++ b/kernel/bpf/core.c +@@ -367,7 +367,12 @@ static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, + { + const s32 off_min = S16_MIN, off_max = S16_MAX; + s32 delta = end_new - end_old; +- s32 off = insn->off; ++ s32 off; ++ ++ if (insn->code == (BPF_JMP32 | BPF_JA)) ++ off = insn->imm; ++ else ++ off = insn->off; + + if (curr < pos && curr + off + 1 >= end_old) + off += delta; +@@ -375,8 +380,12 @@ static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, + off -= delta; + if (off < off_min || off > off_max) + return -ERANGE; +- if (!probe_pass) +- insn->off = off; ++ if (!probe_pass) { ++ if (insn->code == (BPF_JMP32 | BPF_JA)) ++ insn->imm = off; ++ else ++ insn->off = off; ++ } + return 0; + } + +@@ -1586,6 +1595,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); + INSN_3(JMP, JSLE, K), \ + INSN_3(JMP, JSET, K), \ + INSN_2(JMP, JA), \ ++ INSN_2(JMP32, JA), \ + /* Store instructions. */ \ + /* Register based. */ \ + INSN_3(STX, MEM, B), \ +@@ -1862,6 +1872,9 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn) + JMP_JA: + insn += insn->off; + CONT; ++ JMP32_JA: ++ insn += insn->imm; ++ CONT; + JMP_EXIT: + return BPF_R0; + /* JMP */ +diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c +index 73d500c51bd86..dd025f66efabc 100644 +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -2254,7 +2254,10 @@ static int check_subprogs(struct bpf_verifier_env *env) + goto next; + if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) + goto next; +- off = i + insn[i].off + 1; ++ if (code == (BPF_JMP32 | BPF_JA)) ++ off = i + insn[i].imm + 1; ++ else ++ off = i + insn[i].off + 1; + if (off < subprog_start || off >= subprog_end) { + verbose(env, "jump out of range from insn %d to %d\n", i, off); + return -EINVAL; +@@ -2266,6 +2269,7 @@ static int check_subprogs(struct bpf_verifier_env *env) + * or unconditional jump back + */ + if (code != (BPF_JMP | BPF_EXIT) && ++ code != (BPF_JMP32 | BPF_JA) && + code != (BPF_JMP | BPF_JA)) { + verbose(env, "last insn is not an exit or jmp\n"); + return -EINVAL; +@@ -11116,7 +11120,7 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, + static int visit_insn(int t, struct bpf_verifier_env *env) + { + struct bpf_insn *insns = env->prog->insnsi, *insn = &insns[t]; +- int ret; ++ int ret, off; + + if (bpf_pseudo_func(insn)) + return visit_func_call_insn(t, insns, env, true); +@@ -11144,14 +11148,19 @@ static int visit_insn(int t, struct bpf_verifier_env *env) + if (BPF_SRC(insn->code) != BPF_K) + return -EINVAL; + ++ if (BPF_CLASS(insn->code) == BPF_JMP) ++ off = insn->off; ++ else ++ off = insn->imm; ++ + /* unconditional jump with single edge */ +- ret = push_insn(t, t + insn->off + 1, FALLTHROUGH, env, ++ ret = push_insn(t, t + off + 1, FALLTHROUGH, env, + true); + if (ret) + return ret; + +- mark_prune_point(env, t + insn->off + 1); +- mark_jmp_point(env, t + insn->off + 1); ++ mark_prune_point(env, t + off + 1); ++ mark_jmp_point(env, t + off + 1); + + return ret; + +@@ -12687,15 +12696,18 @@ static int do_check(struct bpf_verifier_env *env) + return err; + } else if (opcode == BPF_JA) { + if (BPF_SRC(insn->code) != BPF_K || +- insn->imm 
!= 0 || + insn->src_reg != BPF_REG_0 || + insn->dst_reg != BPF_REG_0 || +- class == BPF_JMP32) { ++ (class == BPF_JMP && insn->imm != 0) || ++ (class == BPF_JMP32 && insn->off != 0)) { + verbose(env, "BPF_JA uses reserved fields\n"); + return -EINVAL; + } + +- env->insn_idx += insn->off + 1; ++ if (class == BPF_JMP) ++ env->insn_idx += insn->off + 1; ++ else ++ env->insn_idx += insn->imm + 1; + continue; + + } else if (opcode == BPF_EXIT) { +@@ -13521,13 +13533,13 @@ static bool insn_is_cond_jump(u8 code) + { + u8 op; + ++ op = BPF_OP(code); + if (BPF_CLASS(code) == BPF_JMP32) +- return true; ++ return op != BPF_JA; + + if (BPF_CLASS(code) != BPF_JMP) + return false; + +- op = BPF_OP(code); + return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL; + } + +-- +2.43.0 + diff --git a/queue-6.1/bpf-x64-fix-tailcall-infinite-loop.patch b/queue-6.1/bpf-x64-fix-tailcall-infinite-loop.patch new file mode 100644 index 00000000000..cd789f08860 --- /dev/null +++ b/queue-6.1/bpf-x64-fix-tailcall-infinite-loop.patch @@ -0,0 +1,167 @@ +From 1a57e1d64338a8af8a056d73c2ebac861d202331 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 12 Sep 2023 23:04:41 +0800 +Subject: bpf, x64: Fix tailcall infinite loop + +From: Leon Hwang + +[ Upstream commit 2b5dcb31a19a2e0acd869b12c9db9b2d696ef544 ] + +From commit ebf7d1f508a73871 ("bpf, x64: rework pro/epilogue and tailcall +handling in JIT"), the tailcall on x64 works better than before. + +From commit e411901c0b775a3a ("bpf: allow for tailcalls in BPF subprograms +for x64 JIT"), tailcall is able to run in BPF subprograms on x64. + +From commit 5b92a28aae4dd0f8 ("bpf: Support attaching tracing BPF program +to other BPF programs"), BPF program is able to trace other BPF programs. + +How about combining them all together? + +1. FENTRY/FEXIT on a BPF subprogram. +2. A tailcall runs in the BPF subprogram. +3. The tailcall calls the subprogram's caller. + +As a result, a tailcall infinite loop comes up. And the loop would halt +the machine. + +As we know, in tail call context, the tail_call_cnt propagates by stack +and rax register between BPF subprograms. So do in trampolines. 
+ +Fixes: ebf7d1f508a7 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT") +Fixes: e411901c0b77 ("bpf: allow for tailcalls in BPF subprograms for x64 JIT") +Reviewed-by: Maciej Fijalkowski +Signed-off-by: Leon Hwang +Link: https://lore.kernel.org/r/20230912150442.2009-3-hffilwlqm@gmail.com +Signed-off-by: Alexei Starovoitov +Signed-off-by: Sasha Levin +--- + arch/x86/net/bpf_jit_comp.c | 28 ++++++++++++++++++++++------ + include/linux/bpf.h | 5 +++++ + kernel/bpf/trampoline.c | 4 ++-- + kernel/bpf/verifier.c | 3 +++ + 4 files changed, 32 insertions(+), 8 deletions(-) + +diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c +index 4686c1d9d0cfd..e6a031f8dd2e9 100644 +--- a/arch/x86/net/bpf_jit_comp.c ++++ b/arch/x86/net/bpf_jit_comp.c +@@ -893,6 +893,10 @@ static void emit_nops(u8 **pprog, int len) + + #define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp))) + ++/* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */ ++#define RESTORE_TAIL_CALL_CNT(stack) \ ++ EMIT3_off32(0x48, 0x8B, 0x85, -round_up(stack, 8) - 8) ++ + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image, + int oldproglen, struct jit_context *ctx, bool jmp_padding) + { +@@ -1436,9 +1440,7 @@ st: if (is_imm8(insn->off)) + case BPF_JMP | BPF_CALL: + func = (u8 *) __bpf_call_base + imm32; + if (tail_call_reachable) { +- /* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */ +- EMIT3_off32(0x48, 0x8B, 0x85, +- -round_up(bpf_prog->aux->stack_depth, 8) - 8); ++ RESTORE_TAIL_CALL_CNT(bpf_prog->aux->stack_depth); + if (!imm32 || emit_call(&prog, func, image + addrs[i - 1] + 7)) + return -EINVAL; + } else { +@@ -2070,6 +2072,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i + * RBP - ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag + * + * RBP - run_ctx_off [ bpf_tramp_run_ctx ] ++ * RSP [ tail_call_cnt ] BPF_TRAMP_F_TAIL_CALL_CTX + */ + + /* room for return value of orig_call or fentry prog */ +@@ -2106,6 +2109,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i + EMIT1(0x55); /* push rbp */ + EMIT3(0x48, 0x89, 0xE5); /* mov rbp, rsp */ + EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */ ++ if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) ++ EMIT1(0x50); /* push rax */ + EMIT1(0x53); /* push rbx */ + + /* Store number of argument registers of the traced function: +@@ -2156,9 +2161,15 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i + if (flags & BPF_TRAMP_F_CALL_ORIG) { + restore_regs(m, &prog, nr_args, regs_off); + ++ if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) ++ /* Before calling the original function, restore the ++ * tail_call_cnt from stack to rax. ++ */ ++ RESTORE_TAIL_CALL_CNT(stack_size); ++ + if (flags & BPF_TRAMP_F_ORIG_STACK) { +- emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8); +- EMIT2(0xff, 0xd0); /* call *rax */ ++ emit_ldx(&prog, BPF_DW, BPF_REG_6, BPF_REG_FP, 8); ++ EMIT2(0xff, 0xd3); /* call *rbx */ + } else { + /* call original function */ + if (emit_call(&prog, orig_call, prog)) { +@@ -2209,7 +2220,12 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i + ret = -EINVAL; + goto cleanup; + } +- } ++ } else if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) ++ /* Before running the original function, restore the ++ * tail_call_cnt from stack to rax. 
++ */ ++ RESTORE_TAIL_CALL_CNT(stack_size); ++ + /* restore return value of orig_call or fentry prog back into RAX */ + if (save_ret) + emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); +diff --git a/include/linux/bpf.h b/include/linux/bpf.h +index 3ce9e39ecdb85..619fcba84be22 100644 +--- a/include/linux/bpf.h ++++ b/include/linux/bpf.h +@@ -825,6 +825,11 @@ struct btf_func_model { + */ + #define BPF_TRAMP_F_SHARE_IPMODIFY BIT(6) + ++/* Indicate that current trampoline is in a tail call context. Then, it has to ++ * cache and restore tail_call_cnt to avoid infinite tail call loop. ++ */ ++#define BPF_TRAMP_F_TAIL_CALL_CTX BIT(7) ++ + /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 + * bytes on x86. + */ +diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c +index c4381dfcd6b09..748ac86169941 100644 +--- a/kernel/bpf/trampoline.c ++++ b/kernel/bpf/trampoline.c +@@ -443,8 +443,8 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mut + goto out; + } + +- /* clear all bits except SHARE_IPMODIFY */ +- tr->flags &= BPF_TRAMP_F_SHARE_IPMODIFY; ++ /* clear all bits except SHARE_IPMODIFY and TAIL_CALL_CTX */ ++ tr->flags &= (BPF_TRAMP_F_SHARE_IPMODIFY | BPF_TRAMP_F_TAIL_CALL_CTX); + + if (tlinks[BPF_TRAMP_FEXIT].nr_links || + tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) { +diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c +index 12d360d80c149..ee6e811b43158 100644 +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -15442,6 +15442,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) + if (!tr) + return -ENOMEM; + ++ if (tgt_prog && tgt_prog->aux->tail_call_reachable) ++ tr->flags = BPF_TRAMP_F_TAIL_CALL_CTX; ++ + prog->aux->dst_trampoline = tr; + return 0; + } +-- +2.43.0 + diff --git a/queue-6.1/bpf-x86-save-restore-regs-with-bpf_dw-size.patch b/queue-6.1/bpf-x86-save-restore-regs-with-bpf_dw-size.patch new file mode 100644 index 00000000000..ccc233f0151 --- /dev/null +++ b/queue-6.1/bpf-x86-save-restore-regs-with-bpf_dw-size.patch @@ -0,0 +1,94 @@ +From c56095a745ac4ce4fa4f5e267d8e5610efb53c12 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 13 Jul 2023 12:07:36 +0800 +Subject: bpf, x86: save/restore regs with BPF_DW size + +From: Menglong Dong + +[ Upstream commit 02a6dfa8ff43efb1c989f87a4d862aedf436088a ] + +As we already reserve 8 byte in the stack for each reg, it is ok to +store/restore the regs in BPF_DW size. This will make the code in +save_regs()/restore_regs() simpler. + +Signed-off-by: Menglong Dong +Acked-by: Yonghong Song +Link: https://lore.kernel.org/r/20230713040738.1789742-2-imagedong@tencent.com +Signed-off-by: Alexei Starovoitov +Stable-dep-of: 2b5dcb31a19a ("bpf, x64: Fix tailcall infinite loop") +Signed-off-by: Sasha Levin +--- + arch/x86/net/bpf_jit_comp.c | 35 ++++++----------------------------- + 1 file changed, 6 insertions(+), 29 deletions(-) + +diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c +index 87cea23f2da16..84c695ae1940f 100644 +--- a/arch/x86/net/bpf_jit_comp.c ++++ b/arch/x86/net/bpf_jit_comp.c +@@ -1755,57 +1755,34 @@ st: if (is_imm8(insn->off)) + static void save_regs(const struct btf_func_model *m, u8 **prog, int nr_regs, + int stack_size) + { +- int i, j, arg_size; +- bool next_same_struct = false; ++ int i; + + /* Store function arguments to stack. 
+ * For a function that accepts two pointers the sequence will be: + * mov QWORD PTR [rbp-0x10],rdi + * mov QWORD PTR [rbp-0x8],rsi + */ +- for (i = 0, j = 0; i < min(nr_regs, 6); i++) { +- /* The arg_size is at most 16 bytes, enforced by the verifier. */ +- arg_size = m->arg_size[j]; +- if (arg_size > 8) { +- arg_size = 8; +- next_same_struct = !next_same_struct; +- } +- +- emit_stx(prog, bytes_to_bpf_size(arg_size), +- BPF_REG_FP, ++ for (i = 0; i < min(nr_regs, 6); i++) ++ emit_stx(prog, BPF_DW, BPF_REG_FP, + i == 5 ? X86_REG_R9 : BPF_REG_1 + i, + -(stack_size - i * 8)); +- +- j = next_same_struct ? j : j + 1; +- } + } + + static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_regs, + int stack_size) + { +- int i, j, arg_size; +- bool next_same_struct = false; ++ int i; + + /* Restore function arguments from stack. + * For a function that accepts two pointers the sequence will be: + * EMIT4(0x48, 0x8B, 0x7D, 0xF0); mov rdi,QWORD PTR [rbp-0x10] + * EMIT4(0x48, 0x8B, 0x75, 0xF8); mov rsi,QWORD PTR [rbp-0x8] + */ +- for (i = 0, j = 0; i < min(nr_regs, 6); i++) { +- /* The arg_size is at most 16 bytes, enforced by the verifier. */ +- arg_size = m->arg_size[j]; +- if (arg_size > 8) { +- arg_size = 8; +- next_same_struct = !next_same_struct; +- } +- +- emit_ldx(prog, bytes_to_bpf_size(arg_size), ++ for (i = 0; i < min(nr_regs, 6); i++) ++ emit_ldx(prog, BPF_DW, + i == 5 ? X86_REG_R9 : BPF_REG_1 + i, + BPF_REG_FP, + -(stack_size - i * 8)); +- +- j = next_same_struct ? j : j + 1; +- } + } + + static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, +-- +2.43.0 + diff --git a/queue-6.1/bpf-x86-simplify-the-parsing-logic-of-structure-para.patch b/queue-6.1/bpf-x86-simplify-the-parsing-logic-of-structure-para.patch new file mode 100644 index 00000000000..5d1b4cc7a88 --- /dev/null +++ b/queue-6.1/bpf-x86-simplify-the-parsing-logic-of-structure-para.patch @@ -0,0 +1,225 @@ +From 0cc5afc6ba7a0afb7289de880f54e9d3715ee8be Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 5 Jan 2023 11:50:26 +0800 +Subject: bpf, x86: Simplify the parsing logic of structure parameters + +From: Pu Lehui + +[ Upstream commit 7f7880495770329d095d402c2865bfa7089192f8 ] + +Extra_nregs of structure parameters and nr_args can be +added directly at the beginning, and using a flip flag +to identifiy structure parameters. Meantime, renaming +some variables to make them more sense. + +Signed-off-by: Pu Lehui +Acked-by: Yonghong Song +Link: https://lore.kernel.org/r/20230105035026.3091988-1-pulehui@huaweicloud.com +Signed-off-by: Martin KaFai Lau +Stable-dep-of: 2b5dcb31a19a ("bpf, x64: Fix tailcall infinite loop") +Signed-off-by: Sasha Levin +--- + arch/x86/net/bpf_jit_comp.c | 101 +++++++++++++++++------------------- + 1 file changed, 48 insertions(+), 53 deletions(-) + +diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c +index e6a031f8dd2e9..87cea23f2da16 100644 +--- a/arch/x86/net/bpf_jit_comp.c ++++ b/arch/x86/net/bpf_jit_comp.c +@@ -1752,62 +1752,59 @@ st: if (is_imm8(insn->off)) + return proglen; + } + +-static void save_regs(const struct btf_func_model *m, u8 **prog, int nr_args, ++static void save_regs(const struct btf_func_model *m, u8 **prog, int nr_regs, + int stack_size) + { +- int i, j, arg_size, nr_regs; ++ int i, j, arg_size; ++ bool next_same_struct = false; ++ + /* Store function arguments to stack. 
+ * For a function that accepts two pointers the sequence will be: + * mov QWORD PTR [rbp-0x10],rdi + * mov QWORD PTR [rbp-0x8],rsi + */ +- for (i = 0, j = 0; i < min(nr_args, 6); i++) { +- if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) { +- nr_regs = (m->arg_size[i] + 7) / 8; ++ for (i = 0, j = 0; i < min(nr_regs, 6); i++) { ++ /* The arg_size is at most 16 bytes, enforced by the verifier. */ ++ arg_size = m->arg_size[j]; ++ if (arg_size > 8) { + arg_size = 8; +- } else { +- nr_regs = 1; +- arg_size = m->arg_size[i]; ++ next_same_struct = !next_same_struct; + } + +- while (nr_regs) { +- emit_stx(prog, bytes_to_bpf_size(arg_size), +- BPF_REG_FP, +- j == 5 ? X86_REG_R9 : BPF_REG_1 + j, +- -(stack_size - j * 8)); +- nr_regs--; +- j++; +- } ++ emit_stx(prog, bytes_to_bpf_size(arg_size), ++ BPF_REG_FP, ++ i == 5 ? X86_REG_R9 : BPF_REG_1 + i, ++ -(stack_size - i * 8)); ++ ++ j = next_same_struct ? j : j + 1; + } + } + +-static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args, ++static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_regs, + int stack_size) + { +- int i, j, arg_size, nr_regs; ++ int i, j, arg_size; ++ bool next_same_struct = false; + + /* Restore function arguments from stack. + * For a function that accepts two pointers the sequence will be: + * EMIT4(0x48, 0x8B, 0x7D, 0xF0); mov rdi,QWORD PTR [rbp-0x10] + * EMIT4(0x48, 0x8B, 0x75, 0xF8); mov rsi,QWORD PTR [rbp-0x8] + */ +- for (i = 0, j = 0; i < min(nr_args, 6); i++) { +- if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) { +- nr_regs = (m->arg_size[i] + 7) / 8; ++ for (i = 0, j = 0; i < min(nr_regs, 6); i++) { ++ /* The arg_size is at most 16 bytes, enforced by the verifier. */ ++ arg_size = m->arg_size[j]; ++ if (arg_size > 8) { + arg_size = 8; +- } else { +- nr_regs = 1; +- arg_size = m->arg_size[i]; ++ next_same_struct = !next_same_struct; + } + +- while (nr_regs) { +- emit_ldx(prog, bytes_to_bpf_size(arg_size), +- j == 5 ? X86_REG_R9 : BPF_REG_1 + j, +- BPF_REG_FP, +- -(stack_size - j * 8)); +- nr_regs--; +- j++; +- } ++ emit_ldx(prog, bytes_to_bpf_size(arg_size), ++ i == 5 ? X86_REG_R9 : BPF_REG_1 + i, ++ BPF_REG_FP, ++ -(stack_size - i * 8)); ++ ++ j = next_same_struct ? j : j + 1; + } + } + +@@ -2033,8 +2030,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i + struct bpf_tramp_links *tlinks, + void *func_addr) + { +- int ret, i, nr_args = m->nr_args, extra_nregs = 0; +- int regs_off, ip_off, args_off, stack_size = nr_args * 8, run_ctx_off; ++ int i, ret, nr_regs = m->nr_args, stack_size = 0; ++ int regs_off, nregs_off, ip_off, run_ctx_off; + struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY]; + struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT]; + struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN]; +@@ -2043,17 +2040,14 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i + u8 *prog; + bool save_ret; + +- /* x86-64 supports up to 6 arguments. 7+ can be added in the future */ +- if (nr_args > 6) +- return -ENOTSUPP; +- +- for (i = 0; i < MAX_BPF_FUNC_ARGS; i++) { ++ /* extra registers for struct arguments */ ++ for (i = 0; i < m->nr_args; i++) + if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) +- extra_nregs += (m->arg_size[i] + 7) / 8 - 1; +- } +- if (nr_args + extra_nregs > 6) ++ nr_regs += (m->arg_size[i] + 7) / 8 - 1; ++ ++ /* x86-64 supports up to 6 arguments. 
7+ can be added in the future */ ++ if (nr_regs > 6) + return -ENOTSUPP; +- stack_size += extra_nregs * 8; + + /* Generated trampoline stack layout: + * +@@ -2067,7 +2061,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i + * [ ... ] + * RBP - regs_off [ reg_arg1 ] program's ctx pointer + * +- * RBP - args_off [ arg regs count ] always ++ * RBP - nregs_off [ regs count ] always + * + * RBP - ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag + * +@@ -2080,11 +2074,12 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i + if (save_ret) + stack_size += 8; + ++ stack_size += nr_regs * 8; + regs_off = stack_size; + +- /* args count */ ++ /* regs count */ + stack_size += 8; +- args_off = stack_size; ++ nregs_off = stack_size; + + if (flags & BPF_TRAMP_F_IP_ARG) + stack_size += 8; /* room for IP address argument */ +@@ -2114,11 +2109,11 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i + EMIT1(0x53); /* push rbx */ + + /* Store number of argument registers of the traced function: +- * mov rax, nr_args + extra_nregs +- * mov QWORD PTR [rbp - args_off], rax ++ * mov rax, nr_regs ++ * mov QWORD PTR [rbp - nregs_off], rax + */ +- emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_args + extra_nregs); +- emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -args_off); ++ emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_regs); ++ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -nregs_off); + + if (flags & BPF_TRAMP_F_IP_ARG) { + /* Store IP address of the traced function: +@@ -2129,7 +2124,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i + emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -ip_off); + } + +- save_regs(m, &prog, nr_args, regs_off); ++ save_regs(m, &prog, nr_regs, regs_off); + + if (flags & BPF_TRAMP_F_CALL_ORIG) { + /* arg1: mov rdi, im */ +@@ -2159,7 +2154,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i + } + + if (flags & BPF_TRAMP_F_CALL_ORIG) { +- restore_regs(m, &prog, nr_args, regs_off); ++ restore_regs(m, &prog, nr_regs, regs_off); + + if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) + /* Before calling the original function, restore the +@@ -2206,7 +2201,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i + } + + if (flags & BPF_TRAMP_F_RESTORE_REGS) +- restore_regs(m, &prog, nr_args, regs_off); ++ restore_regs(m, &prog, nr_regs, regs_off); + + /* This needs to be done regardless. If there were fmod_ret programs, + * the return value is only updated on the stack and still needs to be +-- +2.43.0 + diff --git a/queue-6.1/btrfs-fix-qgroup_free_reserved_data-int-overflow.patch b/queue-6.1/btrfs-fix-qgroup_free_reserved_data-int-overflow.patch new file mode 100644 index 00000000000..a375df2e220 --- /dev/null +++ b/queue-6.1/btrfs-fix-qgroup_free_reserved_data-int-overflow.patch @@ -0,0 +1,269 @@ +From 513d47d3ddb69a718de4359b8ccbf68b1431cdde Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 1 Dec 2023 13:00:10 -0800 +Subject: btrfs: fix qgroup_free_reserved_data int overflow + +From: Boris Burkov + +[ Upstream commit 9e65bfca24cf1d77e4a5c7a170db5867377b3fe7 ] + +The reserved data counter and input parameter is a u64, but we +inadvertently accumulate it in an int. Overflowing that int results in +freeing the wrong amount of data and breaking reserve accounting. 
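
As an illustrative aside (not from the patch), here is a minimal userspace C sketch of the truncation described above; the 3 GiB length and variable names are invented for the example.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t range_len = 3ULL << 30;	/* 3 GiB of reserved data */
	int freed = 0;				/* the old, too-narrow accumulator */
	uint64_t freed64 = 0;			/* a full-width accumulator */

	freed += range_len;			/* wraps, typically to -1073741824 */
	freed64 += range_len;

	printf("int accumulator: %d\n", freed);
	printf("u64 accumulator: %llu\n", (unsigned long long)freed64);
	return 0;
}

Compiled and run, the int accumulator reports a bogus negative "freed" amount while the u64 one keeps the true 3 GiB, which is why the fix below threads the byte count through a u64 out-parameter and keeps the int return value purely for 0/negative error codes.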
+ +Unfortunately, this overflow rot spreads from there, as the qgroup +release/free functions rely on returning an int to take advantage of +negative values for error codes. + +Therefore, the full fix is to return the "released" or "freed" amount by +a u64 argument and to return 0 or negative error code via the return +value. + +Most of the call sites simply ignore the return value, though some +of them handle the error and count the returned bytes. Change all of +them accordingly. + +CC: stable@vger.kernel.org # 6.1+ +Reviewed-by: Qu Wenruo +Signed-off-by: Boris Burkov +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/delalloc-space.c | 2 +- + fs/btrfs/file.c | 2 +- + fs/btrfs/inode.c | 16 ++++++++-------- + fs/btrfs/ordered-data.c | 7 ++++--- + fs/btrfs/qgroup.c | 25 +++++++++++++++---------- + fs/btrfs/qgroup.h | 4 ++-- + 6 files changed, 31 insertions(+), 25 deletions(-) + +diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c +index 0b62ce77053f5..f2bc5563c0f92 100644 +--- a/fs/btrfs/delalloc-space.c ++++ b/fs/btrfs/delalloc-space.c +@@ -197,7 +197,7 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode, + start = round_down(start, fs_info->sectorsize); + + btrfs_free_reserved_data_space_noquota(fs_info, len); +- btrfs_qgroup_free_data(inode, reserved, start, len); ++ btrfs_qgroup_free_data(inode, reserved, start, len, NULL); + } + + /** +diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c +index 0a46fff3dd067..1783a0fbf1665 100644 +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -3191,7 +3191,7 @@ static long btrfs_fallocate(struct file *file, int mode, + qgroup_reserved -= range->len; + } else if (qgroup_reserved > 0) { + btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved, +- range->start, range->len); ++ range->start, range->len, NULL); + qgroup_reserved -= range->len; + } + list_del(&range->list); +diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c +index 81eac121c6b23..9a7d77c410e22 100644 +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -466,7 +466,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, + * And at reserve time, it's always aligned to page size, so + * just free one page here. + */ +- btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE); ++ btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL); + btrfs_free_path(path); + btrfs_end_transaction(trans); + return ret; +@@ -5372,7 +5372,7 @@ static void evict_inode_truncate_pages(struct inode *inode) + */ + if (state_flags & EXTENT_DELALLOC) + btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start, +- end - start + 1); ++ end - start + 1, NULL); + + clear_extent_bit(io_tree, start, end, + EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING, +@@ -8440,7 +8440,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, + * reserved data space. + * Since the IO will never happen for this page. 
+ */ +- btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur); ++ btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL); + if (!inode_evicting) { + clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | + EXTENT_DELALLOC | EXTENT_UPTODATE | +@@ -9902,7 +9902,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( + struct btrfs_path *path; + u64 start = ins->objectid; + u64 len = ins->offset; +- int qgroup_released; ++ u64 qgroup_released = 0; + int ret; + + memset(&stack_fi, 0, sizeof(stack_fi)); +@@ -9915,9 +9915,9 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( + btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE); + /* Encryption and other encoding is reserved and all 0 */ + +- qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len); +- if (qgroup_released < 0) +- return ERR_PTR(qgroup_released); ++ ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released); ++ if (ret < 0) ++ return ERR_PTR(ret); + + if (trans) { + ret = insert_reserved_file_extent(trans, inode, +@@ -10903,7 +10903,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, + btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0); + out_qgroup_free_data: + if (ret < 0) +- btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes); ++ btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL); + out_free_data_space: + /* + * If btrfs_reserve_extent() succeeded, then we already decremented +diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c +index 0321753c16b9f..1b2af4785c0e2 100644 +--- a/fs/btrfs/ordered-data.c ++++ b/fs/btrfs/ordered-data.c +@@ -172,11 +172,12 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, + struct rb_node *node; + struct btrfs_ordered_extent *entry; + int ret; ++ u64 qgroup_rsv = 0; + + if (flags & + ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) { + /* For nocow write, we can release the qgroup rsv right now */ +- ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes); ++ ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv); + if (ret < 0) + return ret; + ret = 0; +@@ -185,7 +186,7 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, + * The ordered extent has reserved qgroup space, release now + * and pass the reserved number for qgroup_record to free. 
+ */ +- ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes); ++ ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes, &qgroup_rsv); + if (ret < 0) + return ret; + } +@@ -203,7 +204,7 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, + entry->inode = igrab(&inode->vfs_inode); + entry->compress_type = compress_type; + entry->truncated_len = (u64)-1; +- entry->qgroup_rsv = ret; ++ entry->qgroup_rsv = qgroup_rsv; + entry->physical = (u64)-1; + + ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0); +diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c +index 26cabffd59710..96ec9ccc2ef61 100644 +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -3833,13 +3833,14 @@ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, + + /* Free ranges specified by @reserved, normally in error path */ + static int qgroup_free_reserved_data(struct btrfs_inode *inode, +- struct extent_changeset *reserved, u64 start, u64 len) ++ struct extent_changeset *reserved, ++ u64 start, u64 len, u64 *freed_ret) + { + struct btrfs_root *root = inode->root; + struct ulist_node *unode; + struct ulist_iterator uiter; + struct extent_changeset changeset; +- int freed = 0; ++ u64 freed = 0; + int ret; + + extent_changeset_init(&changeset); +@@ -3880,7 +3881,9 @@ static int qgroup_free_reserved_data(struct btrfs_inode *inode, + } + btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed, + BTRFS_QGROUP_RSV_DATA); +- ret = freed; ++ if (freed_ret) ++ *freed_ret = freed; ++ ret = 0; + out: + extent_changeset_release(&changeset); + return ret; +@@ -3888,7 +3891,7 @@ static int qgroup_free_reserved_data(struct btrfs_inode *inode, + + static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, + struct extent_changeset *reserved, u64 start, u64 len, +- int free) ++ u64 *released, int free) + { + struct extent_changeset changeset; + int trace_op = QGROUP_RELEASE; +@@ -3900,7 +3903,7 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, + /* In release case, we shouldn't have @reserved */ + WARN_ON(!free && reserved); + if (free && reserved) +- return qgroup_free_reserved_data(inode, reserved, start, len); ++ return qgroup_free_reserved_data(inode, reserved, start, len, released); + extent_changeset_init(&changeset); + ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1, + EXTENT_QGROUP_RESERVED, &changeset); +@@ -3915,7 +3918,8 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, + btrfs_qgroup_free_refroot(inode->root->fs_info, + inode->root->root_key.objectid, + changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); +- ret = changeset.bytes_changed; ++ if (released) ++ *released = changeset.bytes_changed; + out: + extent_changeset_release(&changeset); + return ret; +@@ -3934,9 +3938,10 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, + * NOTE: This function may sleep for memory allocation. + */ + int btrfs_qgroup_free_data(struct btrfs_inode *inode, +- struct extent_changeset *reserved, u64 start, u64 len) ++ struct extent_changeset *reserved, ++ u64 start, u64 len, u64 *freed) + { +- return __btrfs_qgroup_release_data(inode, reserved, start, len, 1); ++ return __btrfs_qgroup_release_data(inode, reserved, start, len, freed, 1); + } + + /* +@@ -3954,9 +3959,9 @@ int btrfs_qgroup_free_data(struct btrfs_inode *inode, + * + * NOTE: This function may sleep for memory allocation. 
+ */ +-int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len) ++int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released) + { +- return __btrfs_qgroup_release_data(inode, NULL, start, len, 0); ++ return __btrfs_qgroup_release_data(inode, NULL, start, len, released, 0); + } + + static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes, +diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h +index 578c77e94200f..c382923f7628e 100644 +--- a/fs/btrfs/qgroup.h ++++ b/fs/btrfs/qgroup.h +@@ -360,10 +360,10 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, + /* New io_tree based accurate qgroup reserve API */ + int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, + struct extent_changeset **reserved, u64 start, u64 len); +-int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len); ++int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released); + int btrfs_qgroup_free_data(struct btrfs_inode *inode, + struct extent_changeset *reserved, u64 start, +- u64 len); ++ u64 len, u64 *freed); + int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type, bool enforce); + int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, +-- +2.43.0 + diff --git a/queue-6.1/btrfs-mark-the-len-field-in-struct-btrfs_ordered_sum.patch b/queue-6.1/btrfs-mark-the-len-field-in-struct-btrfs_ordered_sum.patch new file mode 100644 index 00000000000..bbbb423faeb --- /dev/null +++ b/queue-6.1/btrfs-mark-the-len-field-in-struct-btrfs_ordered_sum.patch @@ -0,0 +1,51 @@ +From c5154bfdcfc857cf2ee5f1b2d6b0778c026c11b2 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 24 May 2023 17:03:06 +0200 +Subject: btrfs: mark the len field in struct btrfs_ordered_sum as unsigned + +From: Christoph Hellwig + +[ Upstream commit 6e4b2479ab38b3f949a85964da212295d32102f0 ] + +len can't ever be negative, so mark it as an u32 instead of int. + +Reviewed-by: Johannes Thumshirn +Signed-off-by: Christoph Hellwig +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Stable-dep-of: 9e65bfca24cf ("btrfs: fix qgroup_free_reserved_data int overflow") +Signed-off-by: Sasha Levin +--- + fs/btrfs/file-item.c | 2 +- + fs/btrfs/ordered-data.h | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c +index b14d2da9b26d3..14478da875313 100644 +--- a/fs/btrfs/file-item.c ++++ b/fs/btrfs/file-item.c +@@ -602,7 +602,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, + } + + sums->bytenr = start; +- sums->len = (int)size; ++ sums->len = size; + + offset = (start - key.offset) >> fs_info->sectorsize_bits; + offset *= csum_size; +diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h +index f59f2dbdb25ed..cc3ca4bb9bd54 100644 +--- a/fs/btrfs/ordered-data.h ++++ b/fs/btrfs/ordered-data.h +@@ -20,7 +20,7 @@ struct btrfs_ordered_sum { + /* + * this is the length in bytes covered by the sums array below. 
+ */ +- int len; ++ u32 len; + struct list_head list; + /* last field is a variable length array of csums */ + u8 sums[]; +-- +2.43.0 + diff --git a/queue-6.1/can-raw-add-support-for-so_mark.patch b/queue-6.1/can-raw-add-support-for-so_mark.patch new file mode 100644 index 00000000000..7da0798bbba --- /dev/null +++ b/queue-6.1/can-raw-add-support-for-so_mark.patch @@ -0,0 +1,36 @@ +From 17fe236d4580c1fb90b59345b81b667d28253b36 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 9 Dec 2022 10:10:08 +0100 +Subject: can: raw: add support for SO_MARK + +From: Marc Kleine-Budde + +[ Upstream commit 0826e82b8a32e646b7b32ba8b68ba30812028e47 ] + +Add support for SO_MARK to the CAN_RAW protocol. This makes it +possible to add traffic control filters based on the fwmark. + +Link: https://lore.kernel.org/all/20221210113653.170346-1-mkl@pengutronix.de +Acked-by: Oliver Hartkopp +Signed-off-by: Marc Kleine-Budde +Stable-dep-of: 7f6ca95d16b9 ("net: Implement missing getsockopt(SO_TIMESTAMPING_NEW)") +Signed-off-by: Sasha Levin +--- + net/can/raw.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/net/can/raw.c b/net/can/raw.c +index 8c104339d538d..488320738e319 100644 +--- a/net/can/raw.c ++++ b/net/can/raw.c +@@ -881,6 +881,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) + + skb->dev = dev; + skb->priority = sk->sk_priority; ++ skb->mark = sk->sk_mark; + skb->tstamp = sockc.transmit_time; + + skb_setup_tx_timestamp(skb, sockc.tsflags); +-- +2.43.0 + diff --git a/queue-6.1/cpu-smt-create-topology_smt_thread_allowed.patch b/queue-6.1/cpu-smt-create-topology_smt_thread_allowed.patch new file mode 100644 index 00000000000..161626b1245 --- /dev/null +++ b/queue-6.1/cpu-smt-create-topology_smt_thread_allowed.patch @@ -0,0 +1,111 @@ +From 853dc4a7fe0d006cee6fde50262d67a545487936 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 5 Jul 2023 16:51:39 +0200 +Subject: cpu/SMT: Create topology_smt_thread_allowed() + +From: Michael Ellerman + +[ Upstream commit 38253464bc821d6de6bba81bb1412ebb36f6cbd1 ] + +Some architectures allows partial SMT states, i.e. when not all SMT threads +are brought online. + +To support that, add an architecture helper which checks whether a given +CPU is allowed to be brought online depending on how many SMT threads are +currently enabled. Since this is only applicable to architecture supporting +partial SMT, only these architectures should select the new configuration +variable CONFIG_SMT_NUM_THREADS_DYNAMIC. For the other architectures, not +supporting the partial SMT states, there is no need to define +topology_cpu_smt_allowed(), the generic code assumed that all the threads +are allowed or only the primary ones. + +Call the helper from cpu_smt_enable(), and cpu_smt_allowed() when SMT is +enabled, to check if the particular thread should be onlined. Notably, +also call it from cpu_smt_disable() if CPU_SMT_ENABLED, to allow +offlining some threads to move from a higher to lower number of threads +online. 
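
The architecture-side topology_smt_thread_allowed() helper is not part of this backport, so the standalone C sketch below only illustrates the kind of policy such a helper encodes. The mapping of CPU number to thread index (cpu modulo threads-per-core) and the topology constants are assumptions made for the example, not how any particular architecture numbers its CPUs.

    #include <stdbool.h>
    #include <stdio.h>

    /* Assumed example topology: 8 SMT threads per core. */
    static const unsigned int threads_per_core = 8;
    /* Current partial-SMT setting: only this many threads per core online. */
    static unsigned int smt_threads_enabled = 2;

    /* Rough model of the decision an arch helper makes for a given CPU. */
    static bool example_smt_thread_allowed(unsigned int cpu)
    {
            unsigned int thread_idx = cpu % threads_per_core; /* assumption */

            return thread_idx < smt_threads_enabled;
    }

    int main(void)
    {
            for (unsigned int cpu = 0; cpu < 16; cpu++)
                    printf("cpu%-2u -> %s\n", cpu,
                           example_smt_thread_allowed(cpu) ?
                           "allowed online" : "kept offline");
            return 0;
    }

With smt_threads_enabled lowered from a higher value, the same check run from the offlining path identifies which already-online threads now fall outside the allowed range, which is why the disable path also consults it when CPU_SMT_ENABLED.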
+ +[ ldufour: Slightly reword the commit's description ] +[ ldufour: Introduce CONFIG_SMT_NUM_THREADS_DYNAMIC ] + +Suggested-by: Thomas Gleixner +Signed-off-by: Michael Ellerman +Signed-off-by: Laurent Dufour +Signed-off-by: Thomas Gleixner +Tested-by: Zhang Rui +Link: https://lore.kernel.org/r/20230705145143.40545-7-ldufour@linux.ibm.com +Stable-dep-of: d91bdd96b55c ("cpu/SMT: Make SMT control more robust against enumeration failures") +Signed-off-by: Sasha Levin +--- + arch/Kconfig | 3 +++ + kernel/cpu.c | 24 +++++++++++++++++++++++- + 2 files changed, 26 insertions(+), 1 deletion(-) + +diff --git a/arch/Kconfig b/arch/Kconfig +index b60d271bf76a9..14273a6203dfc 100644 +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -34,6 +34,9 @@ config ARCH_HAS_SUBPAGE_FAULTS + config HOTPLUG_SMT + bool + ++config SMT_NUM_THREADS_DYNAMIC ++ bool ++ + config GENERIC_ENTRY + bool + +diff --git a/kernel/cpu.c b/kernel/cpu.c +index 551468d9c5a85..c37f1758a4865 100644 +--- a/kernel/cpu.c ++++ b/kernel/cpu.c +@@ -446,9 +446,23 @@ static int __init smt_cmdline_disable(char *str) + } + early_param("nosmt", smt_cmdline_disable); + ++/* ++ * For Archicture supporting partial SMT states check if the thread is allowed. ++ * Otherwise this has already been checked through cpu_smt_max_threads when ++ * setting the SMT level. ++ */ ++static inline bool cpu_smt_thread_allowed(unsigned int cpu) ++{ ++#ifdef CONFIG_SMT_NUM_THREADS_DYNAMIC ++ return topology_smt_thread_allowed(cpu); ++#else ++ return true; ++#endif ++} ++ + static inline bool cpu_smt_allowed(unsigned int cpu) + { +- if (cpu_smt_control == CPU_SMT_ENABLED) ++ if (cpu_smt_control == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu)) + return true; + + if (topology_is_primary_thread(cpu)) +@@ -2294,6 +2308,12 @@ int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) + for_each_online_cpu(cpu) { + if (topology_is_primary_thread(cpu)) + continue; ++ /* ++ * Disable can be called with CPU_SMT_ENABLED when changing ++ * from a higher to lower number of SMT threads per core. ++ */ ++ if (ctrlval == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu)) ++ continue; + ret = cpu_down_maps_locked(cpu, CPUHP_OFFLINE); + if (ret) + break; +@@ -2328,6 +2348,8 @@ int cpuhp_smt_enable(void) + /* Skip online CPUs and CPUs on offline nodes */ + if (cpu_online(cpu) || !node_online(cpu_to_node(cpu))) + continue; ++ if (!cpu_smt_thread_allowed(cpu)) ++ continue; + ret = _cpu_up(cpu, 0, CPUHP_ONLINE); + if (ret) + break; +-- +2.43.0 + diff --git a/queue-6.1/cpu-smt-make-smt-control-more-robust-against-enumera.patch b/queue-6.1/cpu-smt-make-smt-control-more-robust-against-enumera.patch new file mode 100644 index 00000000000..f768759cce0 --- /dev/null +++ b/queue-6.1/cpu-smt-make-smt-control-more-robust-against-enumera.patch @@ -0,0 +1,113 @@ +From 09e97aec954cf0a31689861b27a859e63d278e0a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 14 Aug 2023 10:18:27 +0200 +Subject: cpu/SMT: Make SMT control more robust against enumeration failures + +From: Thomas Gleixner + +[ Upstream commit d91bdd96b55cc3ce98d883a60f133713821b80a6 ] + +The SMT control mechanism got added as speculation attack vector +mitigation. The implemented logic relies on the primary thread mask to +be set up properly. + +This turns out to be an issue with XEN/PV guests because their CPU hotplug +mechanics do not enumerate APICs and therefore the mask is never correctly +populated. + +This went unnoticed so far because by chance XEN/PV ends up with +smp_num_siblings == 2. 
So smt_hotplug_control stays at its default value +CPU_SMT_ENABLED and the primary thread mask is never evaluated in the +context of CPU hotplug. + +This stopped "working" with the upcoming overhaul of the topology +evaluation which legitimately provides a fake topology for XEN/PV. That +sets smp_num_siblings to 1, which causes the core CPU hot-plug core to +refuse to bring up the APs. + +This happens because smt_hotplug_control is set to CPU_SMT_NOT_SUPPORTED +which causes cpu_smt_allowed() to evaluate the unpopulated primary thread +mask with the conclusion that all non-boot CPUs are not valid to be +plugged. + +Make cpu_smt_allowed() more robust and take CPU_SMT_NOT_SUPPORTED and +CPU_SMT_NOT_IMPLEMENTED into account. Rename it to cpu_bootable() while at +it as that makes it more clear what the function is about. + +The primary mask issue on x86 XEN/PV needs to be addressed separately as +there are users outside of the CPU hotplug code too. + +Fixes: 05736e4ac13c ("cpu/hotplug: Provide knobs to control SMT") +Reported-by: Juergen Gross +Signed-off-by: Thomas Gleixner +Tested-by: Juergen Gross +Tested-by: Sohil Mehta +Tested-by: Michael Kelley +Tested-by: Peter Zijlstra (Intel) +Tested-by: Zhang Rui +Acked-by: Peter Zijlstra (Intel) +Link: https://lore.kernel.org/r/20230814085112.149440843@linutronix.de +Signed-off-by: Sasha Levin +--- + kernel/cpu.c | 18 +++++++++++++----- + 1 file changed, 13 insertions(+), 5 deletions(-) + +diff --git a/kernel/cpu.c b/kernel/cpu.c +index c37f1758a4865..e6f0101941ed8 100644 +--- a/kernel/cpu.c ++++ b/kernel/cpu.c +@@ -460,11 +460,19 @@ static inline bool cpu_smt_thread_allowed(unsigned int cpu) + #endif + } + +-static inline bool cpu_smt_allowed(unsigned int cpu) ++static inline bool cpu_bootable(unsigned int cpu) + { + if (cpu_smt_control == CPU_SMT_ENABLED && cpu_smt_thread_allowed(cpu)) + return true; + ++ /* All CPUs are bootable if controls are not configured */ ++ if (cpu_smt_control == CPU_SMT_NOT_IMPLEMENTED) ++ return true; ++ ++ /* All CPUs are bootable if CPU is not SMT capable */ ++ if (cpu_smt_control == CPU_SMT_NOT_SUPPORTED) ++ return true; ++ + if (topology_is_primary_thread(cpu)) + return true; + +@@ -485,7 +493,7 @@ bool cpu_smt_possible(void) + } + EXPORT_SYMBOL_GPL(cpu_smt_possible); + #else +-static inline bool cpu_smt_allowed(unsigned int cpu) { return true; } ++static inline bool cpu_bootable(unsigned int cpu) { return true; } + #endif + + static inline enum cpuhp_state +@@ -588,10 +596,10 @@ static int bringup_wait_for_ap(unsigned int cpu) + * SMT soft disabling on X86 requires to bring the CPU out of the + * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The + * CPU marked itself as booted_once in notify_cpu_starting() so the +- * cpu_smt_allowed() check will now return false if this is not the ++ * cpu_bootable() check will now return false if this is not the + * primary sibling. 
+ */ +- if (!cpu_smt_allowed(cpu)) ++ if (!cpu_bootable(cpu)) + return -ECANCELED; + + if (st->target <= CPUHP_AP_ONLINE_IDLE) +@@ -1478,7 +1486,7 @@ static int cpu_up(unsigned int cpu, enum cpuhp_state target) + err = -EBUSY; + goto out; + } +- if (!cpu_smt_allowed(cpu)) { ++ if (!cpu_bootable(cpu)) { + err = -EPERM; + goto out; + } +-- +2.43.0 + diff --git a/queue-6.1/dpaa2-eth-recycle-the-rx-buffer-only-after-all-proce.patch b/queue-6.1/dpaa2-eth-recycle-the-rx-buffer-only-after-all-proce.patch new file mode 100644 index 00000000000..77c132658cc --- /dev/null +++ b/queue-6.1/dpaa2-eth-recycle-the-rx-buffer-only-after-all-proce.patch @@ -0,0 +1,69 @@ +From 95fa91911ce94d90029ca22af93007ce4b006574 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 24 Nov 2023 12:28:05 +0200 +Subject: dpaa2-eth: recycle the RX buffer only after all processing done + +From: Ioana Ciornei + +[ Upstream commit beb1930f966d1517921488bd5d64147f58f79abf ] + +The blamed commit added support for Rx copybreak. This meant that for +certain frame sizes, a new skb was allocated and the initial data buffer +was recycled. Instead of waiting to recycle the Rx buffer only after all +processing was done on it (like accessing the parse results or timestamp +information), the code path just went ahead and re-used the buffer right +away. + +This sometimes lead to corrupted HW and SW annotation areas. +Fix this by delaying the moment when the buffer is recycled. + +Fixes: 50f826999a80 ("dpaa2-eth: add rx copybreak support") +Signed-off-by: Ioana Ciornei +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +index b58162ce81d87..de62eee58a00e 100644 +--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c ++++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +@@ -509,8 +509,6 @@ static struct sk_buff *dpaa2_eth_copybreak(struct dpaa2_eth_channel *ch, + + memcpy(skb->data, fd_vaddr + fd_offset, fd_length); + +- dpaa2_eth_recycle_buf(priv, ch, dpaa2_fd_get_addr(fd)); +- + return skb; + } + +@@ -528,6 +526,7 @@ static void dpaa2_eth_rx(struct dpaa2_eth_priv *priv, + struct dpaa2_eth_drv_stats *percpu_extras; + struct device *dev = priv->net_dev->dev.parent; + struct dpaa2_fas *fas; ++ bool recycle_rx_buf = false; + void *buf_data; + u32 status = 0; + u32 xdp_act; +@@ -560,6 +559,8 @@ static void dpaa2_eth_rx(struct dpaa2_eth_priv *priv, + dma_unmap_page(dev, addr, priv->rx_buf_size, + DMA_BIDIRECTIONAL); + skb = dpaa2_eth_build_linear_skb(ch, fd, vaddr); ++ } else { ++ recycle_rx_buf = true; + } + } else if (fd_format == dpaa2_fd_sg) { + WARN_ON(priv->xdp_prog); +@@ -607,6 +608,8 @@ static void dpaa2_eth_rx(struct dpaa2_eth_priv *priv, + + list_add_tail(&skb->list, ch->rx_list); + ++ if (recycle_rx_buf) ++ dpaa2_eth_recycle_buf(priv, ch, dpaa2_fd_get_addr(fd)); + return; + + err_build_skb: +-- +2.43.0 + diff --git a/queue-6.1/drm-bridge-ti-sn65dsi86-never-store-more-than-msg-si.patch b/queue-6.1/drm-bridge-ti-sn65dsi86-never-store-more-than-msg-si.patch new file mode 100644 index 00000000000..2b64c927acc --- /dev/null +++ b/queue-6.1/drm-bridge-ti-sn65dsi86-never-store-more-than-msg-si.patch @@ -0,0 +1,55 @@ +From 57568971e8ca978db98bde4b8e417daebc3ba871 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Dec 2023 12:37:52 -0800 +Subject: drm/bridge: ti-sn65dsi86: Never store more 
than msg->size bytes in + AUX xfer + +From: Douglas Anderson + +[ Upstream commit aca58eac52b88138ab98c814afb389a381725cd7 ] + +For aux reads, the value `msg->size` indicates the size of the buffer +provided by `msg->buffer`. We should never in any circumstances write +more bytes to the buffer since it may overflow the buffer. + +In the ti-sn65dsi86 driver there is one code path that reads the +transfer length from hardware. Even though it's never been seen to be +a problem, we should make extra sure that the hardware isn't +increasing the length since doing so would cause us to overrun the +buffer. + +Fixes: 982f589bde7a ("drm/bridge: ti-sn65dsi86: Update reply on aux failures") +Reviewed-by: Stephen Boyd +Reviewed-by: Guenter Roeck +Signed-off-by: Douglas Anderson +Link: https://patchwork.freedesktop.org/patch/msgid/20231214123752.v3.2.I7b83c0f31aeedc6b1dc98c7c741d3e1f94f040f8@changeid +Signed-off-by: Sasha Levin +--- + drivers/gpu/drm/bridge/ti-sn65dsi86.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/drivers/gpu/drm/bridge/ti-sn65dsi86.c b/drivers/gpu/drm/bridge/ti-sn65dsi86.c +index 1b5c27ed27370..ff4d0564122a3 100644 +--- a/drivers/gpu/drm/bridge/ti-sn65dsi86.c ++++ b/drivers/gpu/drm/bridge/ti-sn65dsi86.c +@@ -527,6 +527,7 @@ static ssize_t ti_sn_aux_transfer(struct drm_dp_aux *aux, + u32 request_val = AUX_CMD_REQ(msg->request); + u8 *buf = msg->buffer; + unsigned int len = msg->size; ++ unsigned int short_len; + unsigned int val; + int ret; + u8 addr_len[SN_AUX_LENGTH_REG + 1 - SN_AUX_ADDR_19_16_REG]; +@@ -600,7 +601,8 @@ static ssize_t ti_sn_aux_transfer(struct drm_dp_aux *aux, + } + + if (val & AUX_IRQ_STATUS_AUX_SHORT) { +- ret = regmap_read(pdata->regmap, SN_AUX_LENGTH_REG, &len); ++ ret = regmap_read(pdata->regmap, SN_AUX_LENGTH_REG, &short_len); ++ len = min(len, short_len); + if (ret) + goto exit; + } else if (val & AUX_IRQ_STATUS_NAT_I2C_FAIL) { +-- +2.43.0 + diff --git a/queue-6.1/drm-i915-dp-fix-passing-the-correct-dpcd_rev-for-drm.patch b/queue-6.1/drm-i915-dp-fix-passing-the-correct-dpcd_rev-for-drm.patch new file mode 100644 index 00000000000..ee1500eeafd --- /dev/null +++ b/queue-6.1/drm-i915-dp-fix-passing-the-correct-dpcd_rev-for-drm.patch @@ -0,0 +1,42 @@ +From 232617028da8530bf010d2b095c3985e085efc4d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 13 Dec 2023 13:15:42 -0800 +Subject: drm/i915/dp: Fix passing the correct DPCD_REV for + drm_dp_set_phy_test_pattern + +From: Khaled Almahallawy + +[ Upstream commit 2bd7a06a1208aaacb4e7a2a5436c23bce8d70801 ] + +Using link_status to get DPCD_REV fails when disabling/defaulting +phy pattern. Use intel_dp->dpcd to access DPCD_REV correctly. 
+ +Fixes: 8cdf72711928 ("drm/i915/dp: Program vswing, pre-emphasis, test-pattern") +Cc: Jani Nikula +Cc: Imre Deak +Cc: Lee Shawn C +Signed-off-by: Khaled Almahallawy +Signed-off-by: Jani Nikula +Link: https://patchwork.freedesktop.org/patch/msgid/20231213211542.3585105-3-khaled.almahallawy@intel.com +(cherry picked from commit 3ee302ec22d6e1d7d1e6d381b0d507ee80f2135c) +Signed-off-by: Sasha Levin +--- + drivers/gpu/drm/i915/display/intel_dp.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c +index 5970f4149090f..4699c21102261 100644 +--- a/drivers/gpu/drm/i915/display/intel_dp.c ++++ b/drivers/gpu/drm/i915/display/intel_dp.c +@@ -3707,7 +3707,7 @@ static void intel_dp_process_phy_request(struct intel_dp *intel_dp, + intel_dp->train_set, crtc_state->lane_count); + + drm_dp_set_phy_test_pattern(&intel_dp->aux, data, +- link_status[DP_DPCD_REV]); ++ intel_dp->dpcd[DP_DPCD_REV]); + } + + static u8 intel_dp_autotest_phy_pattern(struct intel_dp *intel_dp) +-- +2.43.0 + diff --git a/queue-6.1/ethtool-don-t-propagate-eopnotsupp-from-dumps.patch b/queue-6.1/ethtool-don-t-propagate-eopnotsupp-from-dumps.patch new file mode 100644 index 00000000000..9af0257b2a9 --- /dev/null +++ b/queue-6.1/ethtool-don-t-propagate-eopnotsupp-from-dumps.patch @@ -0,0 +1,43 @@ +From e5e3d5fd00ba6004228b46e43f6ee0e8588c8fa3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 26 Nov 2023 14:58:06 -0800 +Subject: ethtool: don't propagate EOPNOTSUPP from dumps + +From: Jakub Kicinski + +[ Upstream commit cbeb989e41f4094f54bec2cecce993f26f547bea ] + +The default dump handler needs to clear ret before returning. +Otherwise if the last interface returns an inconsequential +error this error will propagate to user space. + +This may confuse user space (ethtool CLI seems to ignore it, +but YNL doesn't). It will also terminate the dump early +for mutli-skb dump, because netlink core treats EOPNOTSUPP +as a real error. + +Fixes: 728480f12442 ("ethtool: default handlers for GET requests") +Reviewed-by: Simon Horman +Link: https://lore.kernel.org/r/20231126225806.2143528-1-kuba@kernel.org +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/ethtool/netlink.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c +index 1a4c11356c96c..fc4ccecf9495c 100644 +--- a/net/ethtool/netlink.c ++++ b/net/ethtool/netlink.c +@@ -509,7 +509,7 @@ static int ethnl_default_dumpit(struct sk_buff *skb, + cont: + idx++; + } +- ++ ret = 0; + } + rtnl_unlock(); + +-- +2.43.0 + diff --git a/queue-6.1/ext4-convert-move_extent_per_page-to-use-folios.patch b/queue-6.1/ext4-convert-move_extent_per_page-to-use-folios.patch new file mode 100644 index 00000000000..feb31bd337d --- /dev/null +++ b/queue-6.1/ext4-convert-move_extent_per_page-to-use-folios.patch @@ -0,0 +1,159 @@ +From 89cf2bd933e2b50444696df7ae8d806046d290e5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 17 Nov 2022 23:30:52 -0800 +Subject: ext4: convert move_extent_per_page() to use folios + +From: Vishal Moola (Oracle) + +[ Upstream commit 6dd8fe86fa84729538d8bed3149faf9c5886bb5b ] + +Patch series "Removing the try_to_release_page() wrapper", v3. + +This patchset replaces the remaining calls of try_to_release_page() with +the folio equivalent: filemap_release_folio(). This allows us to remove +the wrapper. + +This patch (of 4): + +Convert move_extent_per_page() to use folios. 
This change removes 5 calls +to compound_head() and is in preparation for the removal of the +try_to_release_page() wrapper. + +Link: https://lkml.kernel.org/r/20221118073055.55694-1-vishal.moola@gmail.com +Link: https://lkml.kernel.org/r/20221118073055.55694-2-vishal.moola@gmail.com +Signed-off-by: Vishal Moola (Oracle) +Cc: Matthew Wilcox +Cc: Naoya Horiguchi +Cc: Theodore Ts'o +Signed-off-by: Andrew Morton +Stable-dep-of: 1898efcdbed3 ("block: update the stable_writes flag in bdev_add") +Signed-off-by: Sasha Levin +--- + fs/ext4/move_extent.c | 52 ++++++++++++++++++++++++++----------------- + 1 file changed, 31 insertions(+), 21 deletions(-) + +diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c +index 044e34cd835c1..8dbb87edf24c4 100644 +--- a/fs/ext4/move_extent.c ++++ b/fs/ext4/move_extent.c +@@ -253,6 +253,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, + { + struct inode *orig_inode = file_inode(o_filp); + struct page *pagep[2] = {NULL, NULL}; ++ struct folio *folio[2] = {NULL, NULL}; + handle_t *handle; + ext4_lblk_t orig_blk_offset, donor_blk_offset; + unsigned long blocksize = orig_inode->i_sb->s_blocksize; +@@ -313,6 +314,13 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, + * hold page's lock, if it is still the case data copy is not + * necessary, just swap data blocks between orig and donor. + */ ++ folio[0] = page_folio(pagep[0]); ++ folio[1] = page_folio(pagep[1]); ++ ++ VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]); ++ VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]); ++ VM_BUG_ON_FOLIO(folio_nr_pages(folio[0]) != folio_nr_pages(folio[1]), folio[1]); ++ + if (unwritten) { + ext4_double_down_write_data_sem(orig_inode, donor_inode); + /* If any of extents in range became initialized we have to +@@ -331,10 +339,10 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, + ext4_double_up_write_data_sem(orig_inode, donor_inode); + goto data_copy; + } +- if ((page_has_private(pagep[0]) && +- !try_to_release_page(pagep[0], 0)) || +- (page_has_private(pagep[1]) && +- !try_to_release_page(pagep[1], 0))) { ++ if ((folio_has_private(folio[0]) && ++ !filemap_release_folio(folio[0], 0)) || ++ (folio_has_private(folio[1]) && ++ !filemap_release_folio(folio[1], 0))) { + *err = -EBUSY; + goto drop_data_sem; + } +@@ -344,19 +352,21 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, + block_len_in_page, 1, err); + drop_data_sem: + ext4_double_up_write_data_sem(orig_inode, donor_inode); +- goto unlock_pages; ++ goto unlock_folios; + } + data_copy: +- *err = mext_page_mkuptodate(pagep[0], from, from + replaced_size); ++ *err = mext_page_mkuptodate(&folio[0]->page, from, from + replaced_size); + if (*err) +- goto unlock_pages; ++ goto unlock_folios; + + /* At this point all buffers in range are uptodate, old mapping layout + * is no longer required, try to drop it now. 
*/ +- if ((page_has_private(pagep[0]) && !try_to_release_page(pagep[0], 0)) || +- (page_has_private(pagep[1]) && !try_to_release_page(pagep[1], 0))) { ++ if ((folio_has_private(folio[0]) && ++ !filemap_release_folio(folio[0], 0)) || ++ (folio_has_private(folio[1]) && ++ !filemap_release_folio(folio[1], 0))) { + *err = -EBUSY; +- goto unlock_pages; ++ goto unlock_folios; + } + ext4_double_down_write_data_sem(orig_inode, donor_inode); + replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode, +@@ -369,13 +379,13 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, + replaced_size = + block_len_in_page << orig_inode->i_blkbits; + } else +- goto unlock_pages; ++ goto unlock_folios; + } + /* Perform all necessary steps similar write_begin()/write_end() + * but keeping in mind that i_size will not change */ +- if (!page_has_buffers(pagep[0])) +- create_empty_buffers(pagep[0], 1 << orig_inode->i_blkbits, 0); +- bh = page_buffers(pagep[0]); ++ if (!folio_buffers(folio[0])) ++ create_empty_buffers(&folio[0]->page, 1 << orig_inode->i_blkbits, 0); ++ bh = folio_buffers(folio[0]); + for (i = 0; i < data_offset_in_page; i++) + bh = bh->b_this_page; + for (i = 0; i < block_len_in_page; i++) { +@@ -385,7 +395,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, + bh = bh->b_this_page; + } + if (!*err) +- *err = block_commit_write(pagep[0], from, from + replaced_size); ++ *err = block_commit_write(&folio[0]->page, from, from + replaced_size); + + if (unlikely(*err < 0)) + goto repair_branches; +@@ -395,11 +405,11 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, + *err = ext4_jbd2_inode_add_write(handle, orig_inode, + (loff_t)orig_page_offset << PAGE_SHIFT, replaced_size); + +-unlock_pages: +- unlock_page(pagep[0]); +- put_page(pagep[0]); +- unlock_page(pagep[1]); +- put_page(pagep[1]); ++unlock_folios: ++ folio_unlock(folio[0]); ++ folio_put(folio[0]); ++ folio_unlock(folio[1]); ++ folio_put(folio[1]); + stop_journal: + ext4_journal_stop(handle); + if (*err == -ENOSPC && +@@ -430,7 +440,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, + *err = -EIO; + } + replaced_count = 0; +- goto unlock_pages; ++ goto unlock_folios; + } + + /** +-- +2.43.0 + diff --git a/queue-6.1/f2fs-assign-default-compression-level.patch b/queue-6.1/f2fs-assign-default-compression-level.patch new file mode 100644 index 00000000000..f0c80be1ddb --- /dev/null +++ b/queue-6.1/f2fs-assign-default-compression-level.patch @@ -0,0 +1,106 @@ +From e4a655eebbd80e0178fe542d71ec653c4f3486cb Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 12 Jun 2023 12:58:34 -0700 +Subject: f2fs: assign default compression level + +From: Jaegeuk Kim + +[ Upstream commit 00e120b5e4b5638cf19eee96d4332f2d100746ba ] + +Let's avoid any confusion from assigning compress_level=0 for LZ4HC and ZSTD. 
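
As a standalone illustration of the fallback this change introduces (a requested level of 0 now means "use the algorithm's default" rather than literal level 0), the sketch below models the decision. The zstd constant matches the F2FS_ZSTD_DEFAULT_CLEVEL definition visible in this patch; the LZ4HC constant is a placeholder standing in for LZ4HC_DEFAULT_CLEVEL, not a value taken from the kernel headers.

    #include <stdio.h>

    #define EXAMPLE_ZSTD_DEFAULT_LEVEL   1  /* as defined in this patch */
    #define EXAMPLE_LZ4HC_DEFAULT_LEVEL  9  /* placeholder for the example */

    enum example_algo { EXAMPLE_LZ4HC, EXAMPLE_ZSTD };

    /* A level of 0 no longer means "level 0"; it selects the default. */
    static int example_effective_level(enum example_algo algo, int requested)
    {
            if (requested)
                    return requested;
            return algo == EXAMPLE_ZSTD ? EXAMPLE_ZSTD_DEFAULT_LEVEL
                                        : EXAMPLE_LZ4HC_DEFAULT_LEVEL;
    }

    int main(void)
    {
            printf("zstd,  no level -> %d\n",
                   example_effective_level(EXAMPLE_ZSTD, 0));
            printf("lz4hc, no level -> %d\n",
                   example_effective_level(EXAMPLE_LZ4HC, 0));
            printf("zstd,  level 3  -> %d\n",
                   example_effective_level(EXAMPLE_ZSTD, 3));
            return 0;
    }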
+ +Signed-off-by: Jaegeuk Kim +Stable-dep-of: f5f3bd903a5d ("f2fs: set the default compress_level on ioctl") +Signed-off-by: Sasha Levin +--- + fs/f2fs/compress.c | 3 +-- + fs/f2fs/f2fs.h | 2 ++ + fs/f2fs/super.c | 12 +++++++----- + 3 files changed, 10 insertions(+), 7 deletions(-) + +diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c +index c3ba202a7c29f..4cb58e8d699e2 100644 +--- a/fs/f2fs/compress.c ++++ b/fs/f2fs/compress.c +@@ -331,8 +331,6 @@ static const struct f2fs_compress_ops f2fs_lz4_ops = { + #endif + + #ifdef CONFIG_F2FS_FS_ZSTD +-#define F2FS_ZSTD_DEFAULT_CLEVEL 1 +- + static int zstd_init_compress_ctx(struct compress_ctx *cc) + { + zstd_parameters params; +@@ -341,6 +339,7 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc) + unsigned int workspace_size; + unsigned char level = F2FS_I(cc->inode)->i_compress_level; + ++ /* Need to remain this for backward compatibility */ + if (!level) + level = F2FS_ZSTD_DEFAULT_CLEVEL; + +diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h +index 6fa3ac2097b27..5c76ba764b71f 100644 +--- a/fs/f2fs/f2fs.h ++++ b/fs/f2fs/f2fs.h +@@ -1501,6 +1501,8 @@ struct compress_data { + + #define F2FS_COMPRESSED_PAGE_MAGIC 0xF5F2C000 + ++#define F2FS_ZSTD_DEFAULT_CLEVEL 1 ++ + #define COMPRESS_LEVEL_OFFSET 8 + + /* compress context */ +diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c +index 4f87e0e374c25..584fe00fdeeb1 100644 +--- a/fs/f2fs/super.c ++++ b/fs/f2fs/super.c +@@ -613,14 +613,12 @@ static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str) + { + #ifdef CONFIG_F2FS_FS_LZ4HC + unsigned int level; +-#endif + + if (strlen(str) == 3) { +- F2FS_OPTION(sbi).compress_level = 0; ++ F2FS_OPTION(sbi).compress_level = LZ4HC_DEFAULT_CLEVEL; + return 0; + } + +-#ifdef CONFIG_F2FS_FS_LZ4HC + str += 3; + + if (str[0] != ':') { +@@ -638,6 +636,10 @@ static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str) + F2FS_OPTION(sbi).compress_level = level; + return 0; + #else ++ if (strlen(str) == 3) { ++ F2FS_OPTION(sbi).compress_level = 0; ++ return 0; ++ } + f2fs_info(sbi, "kernel doesn't support lz4hc compression"); + return -EINVAL; + #endif +@@ -651,7 +653,7 @@ static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str) + int len = 4; + + if (strlen(str) == len) { +- F2FS_OPTION(sbi).compress_level = 0; ++ F2FS_OPTION(sbi).compress_level = F2FS_ZSTD_DEFAULT_CLEVEL; + return 0; + } + +@@ -664,7 +666,7 @@ static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str) + if (kstrtouint(str + 1, 10, &level)) + return -EINVAL; + +- if (!level || level > zstd_max_clevel()) { ++ if (level < zstd_min_clevel() || level > zstd_max_clevel()) { + f2fs_info(sbi, "invalid zstd compress level: %d", level); + return -EINVAL; + } +-- +2.43.0 + diff --git a/queue-6.1/f2fs-clean-up-i_compress_flag-and-i_compress_level-u.patch b/queue-6.1/f2fs-clean-up-i_compress_flag-and-i_compress_level-u.patch new file mode 100644 index 00000000000..990b2a81aaf --- /dev/null +++ b/queue-6.1/f2fs-clean-up-i_compress_flag-and-i_compress_level-u.patch @@ -0,0 +1,135 @@ +From f8166c0421b9a097ec4c3230e5a09dec56b64c23 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 28 Jan 2023 18:30:11 +0800 +Subject: f2fs: clean up i_compress_flag and i_compress_level usage + +From: Chao Yu + +[ Upstream commit b90e5086df6bf5ba819216d5ecf0667370bd565f ] + +.i_compress_level was introduced by commit 3fde13f817e2 ("f2fs: compress: +support compress level"), but never be used. 
+ +This patch updates as below: +- load high 8-bits of on-disk .i_compress_flag to in-memory .i_compress_level +- load low 8-bits of on-disk .i_compress_flag to in-memory .i_compress_flag +- change type of in-memory .i_compress_flag from unsigned short to unsigned +char. + +w/ above changes, we can avoid unneeded bit shift whenever during +.init_compress_ctx(), and shrink size of struct f2fs_inode_info. + +Signed-off-by: Chao Yu +Signed-off-by: Jaegeuk Kim +Stable-dep-of: f5f3bd903a5d ("f2fs: set the default compress_level on ioctl") +Signed-off-by: Sasha Levin +--- + fs/f2fs/compress.c | 8 +++----- + fs/f2fs/f2fs.h | 7 +++---- + fs/f2fs/inode.c | 16 +++++++++++++--- + 3 files changed, 19 insertions(+), 12 deletions(-) + +diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c +index 11d9dce994dbe..d509b47381d51 100644 +--- a/fs/f2fs/compress.c ++++ b/fs/f2fs/compress.c +@@ -241,7 +241,7 @@ static int lz4_init_compress_ctx(struct compress_ctx *cc) + unsigned int size = LZ4_MEM_COMPRESS; + + #ifdef CONFIG_F2FS_FS_LZ4HC +- if (F2FS_I(cc->inode)->i_compress_flag >> COMPRESS_LEVEL_OFFSET) ++ if (F2FS_I(cc->inode)->i_compress_level) + size = LZ4HC_MEM_COMPRESS; + #endif + +@@ -267,8 +267,7 @@ static void lz4_destroy_compress_ctx(struct compress_ctx *cc) + #ifdef CONFIG_F2FS_FS_LZ4HC + static int lz4hc_compress_pages(struct compress_ctx *cc) + { +- unsigned char level = F2FS_I(cc->inode)->i_compress_flag >> +- COMPRESS_LEVEL_OFFSET; ++ unsigned char level = F2FS_I(cc->inode)->i_compress_level; + int len; + + if (level) +@@ -340,8 +339,7 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc) + zstd_cstream *stream; + void *workspace; + unsigned int workspace_size; +- unsigned char level = F2FS_I(cc->inode)->i_compress_flag >> +- COMPRESS_LEVEL_OFFSET; ++ unsigned char level = F2FS_I(cc->inode)->i_compress_level; + + if (!level) + level = F2FS_ZSTD_DEFAULT_CLEVEL; +diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h +index f56abb39601ac..faf1a4953e845 100644 +--- a/fs/f2fs/f2fs.h ++++ b/fs/f2fs/f2fs.h +@@ -840,7 +840,7 @@ struct f2fs_inode_info { + unsigned char i_compress_algorithm; /* algorithm type */ + unsigned char i_log_cluster_size; /* log of cluster size */ + unsigned char i_compress_level; /* compress level (lz4hc,zstd) */ +- unsigned short i_compress_flag; /* compress flag */ ++ unsigned char i_compress_flag; /* compress flag */ + unsigned int i_cluster_size; /* cluster size */ + + unsigned int atomic_write_cnt; +@@ -4339,9 +4339,8 @@ static inline int set_compress_context(struct inode *inode) + if ((F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 || + F2FS_I(inode)->i_compress_algorithm == COMPRESS_ZSTD) && + F2FS_OPTION(sbi).compress_level) +- F2FS_I(inode)->i_compress_flag |= +- F2FS_OPTION(sbi).compress_level << +- COMPRESS_LEVEL_OFFSET; ++ F2FS_I(inode)->i_compress_level = ++ F2FS_OPTION(sbi).compress_level; + F2FS_I(inode)->i_flags |= F2FS_COMPR_FL; + set_inode_flag(inode, FI_COMPRESSED_FILE); + stat_inc_compr_inode(inode); +diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c +index 1fc7760499f10..933554985d328 100644 +--- a/fs/f2fs/inode.c ++++ b/fs/f2fs/inode.c +@@ -450,11 +450,17 @@ static int do_read_inode(struct inode *inode) + (fi->i_flags & F2FS_COMPR_FL)) { + if (F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, + i_log_cluster_size)) { ++ unsigned short compress_flag; ++ + atomic_set(&fi->i_compr_blocks, + le64_to_cpu(ri->i_compr_blocks)); + fi->i_compress_algorithm = ri->i_compress_algorithm; + fi->i_log_cluster_size = ri->i_log_cluster_size; +- fi->i_compress_flag = 
le16_to_cpu(ri->i_compress_flag); ++ compress_flag = le16_to_cpu(ri->i_compress_flag); ++ fi->i_compress_level = compress_flag >> ++ COMPRESS_LEVEL_OFFSET; ++ fi->i_compress_flag = compress_flag & ++ (BIT(COMPRESS_LEVEL_OFFSET) - 1); + fi->i_cluster_size = 1 << fi->i_log_cluster_size; + set_inode_flag(inode, FI_COMPRESSED_FILE); + } +@@ -675,13 +681,17 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) + if (f2fs_sb_has_compression(F2FS_I_SB(inode)) && + F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, + i_log_cluster_size)) { ++ unsigned short compress_flag; ++ + ri->i_compr_blocks = + cpu_to_le64(atomic_read( + &F2FS_I(inode)->i_compr_blocks)); + ri->i_compress_algorithm = + F2FS_I(inode)->i_compress_algorithm; +- ri->i_compress_flag = +- cpu_to_le16(F2FS_I(inode)->i_compress_flag); ++ compress_flag = F2FS_I(inode)->i_compress_flag | ++ F2FS_I(inode)->i_compress_level << ++ COMPRESS_LEVEL_OFFSET; ++ ri->i_compress_flag = cpu_to_le16(compress_flag); + ri->i_log_cluster_size = + F2FS_I(inode)->i_log_cluster_size; + } +-- +2.43.0 + diff --git a/queue-6.1/f2fs-convert-to-use-bitmap-api.patch b/queue-6.1/f2fs-convert-to-use-bitmap-api.patch new file mode 100644 index 00000000000..95b95b78dad --- /dev/null +++ b/queue-6.1/f2fs-convert-to-use-bitmap-api.patch @@ -0,0 +1,440 @@ +From 5a0421b515853a9187b83dfe12fc55938c7eaa84 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 16 Feb 2023 21:53:24 +0800 +Subject: f2fs: convert to use bitmap API + +From: Yangtao Li + +[ Upstream commit 447286ebadaafa551550704ff0b42eb08b1d1cb2 ] + +Let's use BIT() and GENMASK() instead of open it. + +Signed-off-by: Yangtao Li +Reviewed-by: Chao Yu +Signed-off-by: Jaegeuk Kim +Stable-dep-of: f5f3bd903a5d ("f2fs: set the default compress_level on ioctl") +Signed-off-by: Sasha Levin +--- + fs/f2fs/checkpoint.c | 2 +- + fs/f2fs/compress.c | 4 ++-- + fs/f2fs/data.c | 12 ++++++------ + fs/f2fs/dir.c | 2 +- + fs/f2fs/f2fs.h | 26 +++++++++++++------------- + fs/f2fs/file.c | 2 +- + fs/f2fs/inode.c | 4 ++-- + fs/f2fs/node.h | 20 +++++++++----------- + fs/f2fs/super.c | 16 ++++++++-------- + fs/f2fs/sysfs.c | 2 +- + include/linux/f2fs_fs.h | 9 ++++----- + 11 files changed, 48 insertions(+), 51 deletions(-) + +diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c +index 5df04ed010cae..eb4d69f53337f 100644 +--- a/fs/f2fs/checkpoint.c ++++ b/fs/f2fs/checkpoint.c +@@ -984,7 +984,7 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi) + + cp_blk_no = le32_to_cpu(fsb->cp_blkaddr); + if (cur_page == cp2) +- cp_blk_no += 1 << le32_to_cpu(fsb->log_blocks_per_seg); ++ cp_blk_no += BIT(le32_to_cpu(fsb->log_blocks_per_seg)); + + for (i = 1; i < cp_blks; i++) { + void *sit_bitmap_ptr; +diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c +index d509b47381d51..c3ba202a7c29f 100644 +--- a/fs/f2fs/compress.c ++++ b/fs/f2fs/compress.c +@@ -673,7 +673,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) + + cc->cbuf->clen = cpu_to_le32(cc->clen); + +- if (fi->i_compress_flag & 1 << COMPRESS_CHKSUM) ++ if (fi->i_compress_flag & BIT(COMPRESS_CHKSUM)) + chksum = f2fs_crc32(F2FS_I_SB(cc->inode), + cc->cbuf->cdata, cc->clen); + cc->cbuf->chksum = cpu_to_le32(chksum); +@@ -771,7 +771,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) + + ret = cops->decompress_pages(dic); + +- if (!ret && (fi->i_compress_flag & 1 << COMPRESS_CHKSUM)) { ++ if (!ret && (fi->i_compress_flag & BIT(COMPRESS_CHKSUM))) { + u32 provided = le32_to_cpu(dic->cbuf->chksum); + u32 calculated = 
f2fs_crc32(sbi, dic->cbuf->cdata, dic->clen); + +diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c +index ea05710ca9bdf..3666c1fd77a64 100644 +--- a/fs/f2fs/data.c ++++ b/fs/f2fs/data.c +@@ -95,17 +95,17 @@ static enum count_type __read_io_type(struct page *page) + /* postprocessing steps for read bios */ + enum bio_post_read_step { + #ifdef CONFIG_FS_ENCRYPTION +- STEP_DECRYPT = 1 << 0, ++ STEP_DECRYPT = BIT(0), + #else + STEP_DECRYPT = 0, /* compile out the decryption-related code */ + #endif + #ifdef CONFIG_F2FS_FS_COMPRESSION +- STEP_DECOMPRESS = 1 << 1, ++ STEP_DECOMPRESS = BIT(1), + #else + STEP_DECOMPRESS = 0, /* compile out the decompression-related code */ + #endif + #ifdef CONFIG_FS_VERITY +- STEP_VERITY = 1 << 2, ++ STEP_VERITY = BIT(2), + #else + STEP_VERITY = 0, /* compile out the verity-related code */ + #endif +@@ -409,7 +409,7 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) + + static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio) + { +- unsigned int temp_mask = (1 << NR_TEMP_TYPE) - 1; ++ unsigned int temp_mask = GENMASK(NR_TEMP_TYPE - 1, 0); + unsigned int fua_flag, meta_flag, io_flag; + blk_opf_t op_flags = 0; + +@@ -431,9 +431,9 @@ static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio) + * 5 | 4 | 3 | 2 | 1 | 0 | + * Cold | Warm | Hot | Cold | Warm | Hot | + */ +- if ((1 << fio->temp) & meta_flag) ++ if (BIT(fio->temp) & meta_flag) + op_flags |= REQ_META; +- if ((1 << fio->temp) & fua_flag) ++ if (BIT(fio->temp) & fua_flag) + op_flags |= REQ_FUA; + return op_flags; + } +diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c +index 8373eba3a1337..510736d2ae110 100644 +--- a/fs/f2fs/dir.c ++++ b/fs/f2fs/dir.c +@@ -29,7 +29,7 @@ static unsigned long dir_blocks(struct inode *inode) + static unsigned int dir_buckets(unsigned int level, int dir_level) + { + if (level + dir_level < MAX_DIR_HASH_DEPTH / 2) +- return 1 << (level + dir_level); ++ return BIT(level + dir_level); + else + return MAX_DIR_BUCKETS; + } +diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h +index faf1a4953e845..6fa3ac2097b27 100644 +--- a/fs/f2fs/f2fs.h ++++ b/fs/f2fs/f2fs.h +@@ -64,7 +64,7 @@ enum { + }; + + #ifdef CONFIG_F2FS_FAULT_INJECTION +-#define F2FS_ALL_FAULT_TYPE ((1 << FAULT_MAX) - 1) ++#define F2FS_ALL_FAULT_TYPE (GENMASK(FAULT_MAX - 1, 0)) + + struct f2fs_fault_info { + atomic_t inject_ops; +@@ -73,7 +73,7 @@ struct f2fs_fault_info { + }; + + extern const char *f2fs_fault_name[FAULT_MAX]; +-#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type))) ++#define IS_FAULT_SET(fi, type) ((fi)->inject_type & BIT(type)) + #endif + + /* +@@ -1412,7 +1412,7 @@ static inline void set_page_private_##name(struct page *page) \ + static inline void clear_page_private_##name(struct page *page) \ + { \ + clear_bit(PAGE_PRIVATE_##flagname, &page_private(page)); \ +- if (page_private(page) == 1 << PAGE_PRIVATE_NOT_POINTER) { \ ++ if (page_private(page) == BIT(PAGE_PRIVATE_NOT_POINTER)) { \ + set_page_private(page, 0); \ + if (PagePrivate(page)) { \ + ClearPagePrivate(page); \ +@@ -1462,8 +1462,8 @@ static inline void set_page_private_data(struct page *page, unsigned long data) + + static inline void clear_page_private_data(struct page *page) + { +- page_private(page) &= (1 << PAGE_PRIVATE_MAX) - 1; +- if (page_private(page) == 1 << PAGE_PRIVATE_NOT_POINTER) { ++ page_private(page) &= GENMASK(PAGE_PRIVATE_MAX - 1, 0); ++ if (page_private(page) == BIT(PAGE_PRIVATE_NOT_POINTER)) { + set_page_private(page, 0); + if (PagePrivate(page)) { + ClearPagePrivate(page); +@@ -2882,7 +2882,7 @@ static inline 
int f2fs_test_bit(unsigned int nr, char *addr) + int mask; + + addr += (nr >> 3); +- mask = 1 << (7 - (nr & 0x07)); ++ mask = BIT(7 - (nr & 0x07)); + return mask & *addr; + } + +@@ -2891,7 +2891,7 @@ static inline void f2fs_set_bit(unsigned int nr, char *addr) + int mask; + + addr += (nr >> 3); +- mask = 1 << (7 - (nr & 0x07)); ++ mask = BIT(7 - (nr & 0x07)); + *addr |= mask; + } + +@@ -2900,7 +2900,7 @@ static inline void f2fs_clear_bit(unsigned int nr, char *addr) + int mask; + + addr += (nr >> 3); +- mask = 1 << (7 - (nr & 0x07)); ++ mask = BIT(7 - (nr & 0x07)); + *addr &= ~mask; + } + +@@ -2910,7 +2910,7 @@ static inline int f2fs_test_and_set_bit(unsigned int nr, char *addr) + int ret; + + addr += (nr >> 3); +- mask = 1 << (7 - (nr & 0x07)); ++ mask = BIT(7 - (nr & 0x07)); + ret = mask & *addr; + *addr |= mask; + return ret; +@@ -2922,7 +2922,7 @@ static inline int f2fs_test_and_clear_bit(unsigned int nr, char *addr) + int ret; + + addr += (nr >> 3); +- mask = 1 << (7 - (nr & 0x07)); ++ mask = BIT(7 - (nr & 0x07)); + ret = mask & *addr; + *addr &= ~mask; + return ret; +@@ -2933,7 +2933,7 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) + int mask; + + addr += (nr >> 3); +- mask = 1 << (7 - (nr & 0x07)); ++ mask = BIT(7 - (nr & 0x07)); + *addr ^= mask; + } + +@@ -4333,9 +4333,9 @@ static inline int set_compress_context(struct inode *inode) + F2FS_OPTION(sbi).compress_log_size; + F2FS_I(inode)->i_compress_flag = + F2FS_OPTION(sbi).compress_chksum ? +- 1 << COMPRESS_CHKSUM : 0; ++ BIT(COMPRESS_CHKSUM) : 0; + F2FS_I(inode)->i_cluster_size = +- 1 << F2FS_I(inode)->i_log_cluster_size; ++ BIT(F2FS_I(inode)->i_log_cluster_size); + if ((F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 || + F2FS_I(inode)->i_compress_algorithm == COMPRESS_ZSTD) && + F2FS_OPTION(sbi).compress_level) +diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c +index d0c17366ebf48..126c074deebdc 100644 +--- a/fs/f2fs/file.c ++++ b/fs/f2fs/file.c +@@ -3983,7 +3983,7 @@ static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg) + + F2FS_I(inode)->i_compress_algorithm = option.algorithm; + F2FS_I(inode)->i_log_cluster_size = option.log_cluster_size; +- F2FS_I(inode)->i_cluster_size = 1 << option.log_cluster_size; ++ F2FS_I(inode)->i_cluster_size = BIT(option.log_cluster_size); + f2fs_mark_inode_dirty_sync(inode, true); + + if (!f2fs_is_compress_backend_ready(inode)) +diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c +index 933554985d328..0010579f17368 100644 +--- a/fs/f2fs/inode.c ++++ b/fs/f2fs/inode.c +@@ -460,8 +460,8 @@ static int do_read_inode(struct inode *inode) + fi->i_compress_level = compress_flag >> + COMPRESS_LEVEL_OFFSET; + fi->i_compress_flag = compress_flag & +- (BIT(COMPRESS_LEVEL_OFFSET) - 1); +- fi->i_cluster_size = 1 << fi->i_log_cluster_size; ++ GENMASK(COMPRESS_LEVEL_OFFSET - 1, 0); ++ fi->i_cluster_size = BIT(fi->i_log_cluster_size); + set_inode_flag(inode, FI_COMPRESSED_FILE); + } + } +diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h +index 0aa48704c77a0..7068f3ac036a5 100644 +--- a/fs/f2fs/node.h ++++ b/fs/f2fs/node.h +@@ -93,17 +93,15 @@ static inline void copy_node_info(struct node_info *dst, + static inline void set_nat_flag(struct nat_entry *ne, + unsigned int type, bool set) + { +- unsigned char mask = 0x01 << type; + if (set) +- ne->ni.flag |= mask; ++ ne->ni.flag |= BIT(type); + else +- ne->ni.flag &= ~mask; ++ ne->ni.flag &= ~BIT(type); + } + + static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type) + { +- unsigned char mask = 0x01 << type; +- return 
ne->ni.flag & mask; ++ return ne->ni.flag & BIT(type); + } + + static inline void nat_reset_flag(struct nat_entry *ne) +@@ -224,7 +222,7 @@ static inline pgoff_t next_nat_addr(struct f2fs_sb_info *sbi, + struct f2fs_nm_info *nm_i = NM_I(sbi); + + block_addr -= nm_i->nat_blkaddr; +- block_addr ^= 1 << sbi->log_blocks_per_seg; ++ block_addr ^= BIT(sbi->log_blocks_per_seg); + return block_addr + nm_i->nat_blkaddr; + } + +@@ -394,7 +392,7 @@ static inline nid_t get_nid(struct page *p, int off, bool i) + static inline int is_node(struct page *page, int type) + { + struct f2fs_node *rn = F2FS_NODE(page); +- return le32_to_cpu(rn->footer.flag) & (1 << type); ++ return le32_to_cpu(rn->footer.flag) & BIT(type); + } + + #define is_cold_node(page) is_node(page, COLD_BIT_SHIFT) +@@ -407,9 +405,9 @@ static inline void set_cold_node(struct page *page, bool is_dir) + unsigned int flag = le32_to_cpu(rn->footer.flag); + + if (is_dir) +- flag &= ~(0x1 << COLD_BIT_SHIFT); ++ flag &= ~BIT(COLD_BIT_SHIFT); + else +- flag |= (0x1 << COLD_BIT_SHIFT); ++ flag |= BIT(COLD_BIT_SHIFT); + rn->footer.flag = cpu_to_le32(flag); + } + +@@ -418,9 +416,9 @@ static inline void set_mark(struct page *page, int mark, int type) + struct f2fs_node *rn = F2FS_NODE(page); + unsigned int flag = le32_to_cpu(rn->footer.flag); + if (mark) +- flag |= (0x1 << type); ++ flag |= BIT(type); + else +- flag &= ~(0x1 << type); ++ flag &= ~BIT(type); + rn->footer.flag = cpu_to_le32(flag); + + #ifdef CONFIG_F2FS_CHECK_FS +diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c +index 1ba85ef97cbd3..4f87e0e374c25 100644 +--- a/fs/f2fs/super.c ++++ b/fs/f2fs/super.c +@@ -898,8 +898,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) + if (args->from && match_int(args, &arg)) + return -EINVAL; + if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_VECS)) { +- f2fs_warn(sbi, "Not support %d, larger than %d", +- 1 << arg, BIO_MAX_VECS); ++ f2fs_warn(sbi, "Not support %ld, larger than %d", ++ BIT(arg), BIO_MAX_VECS); + return -EINVAL; + } + F2FS_OPTION(sbi).write_io_size_bits = arg; +@@ -1340,7 +1340,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) + #endif + + if (F2FS_IO_SIZE_BITS(sbi) && !f2fs_lfs_mode(sbi)) { +- f2fs_err(sbi, "Should set mode=lfs with %uKB-sized IO", ++ f2fs_err(sbi, "Should set mode=lfs with %luKB-sized IO", + F2FS_IO_SIZE_KB(sbi)); + return -EINVAL; + } +@@ -3356,7 +3356,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, + total_sections = le32_to_cpu(raw_super->section_count); + + /* blocks_per_seg should be 512, given the above check */ +- blocks_per_seg = 1 << le32_to_cpu(raw_super->log_blocks_per_seg); ++ blocks_per_seg = BIT(le32_to_cpu(raw_super->log_blocks_per_seg)); + + if (segment_count > F2FS_MAX_SEGMENT || + segment_count < F2FS_MIN_SEGMENTS) { +@@ -3625,9 +3625,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi) + sbi->log_sectors_per_block = + le32_to_cpu(raw_super->log_sectors_per_block); + sbi->log_blocksize = le32_to_cpu(raw_super->log_blocksize); +- sbi->blocksize = 1 << sbi->log_blocksize; ++ sbi->blocksize = BIT(sbi->log_blocksize); + sbi->log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); +- sbi->blocks_per_seg = 1 << sbi->log_blocks_per_seg; ++ sbi->blocks_per_seg = BIT(sbi->log_blocks_per_seg); + sbi->segs_per_sec = le32_to_cpu(raw_super->segs_per_sec); + sbi->secs_per_zone = le32_to_cpu(raw_super->secs_per_zone); + sbi->total_sections = le32_to_cpu(raw_super->section_count); +@@ -3883,7 +3883,7 @@ void 
f2fs_handle_stop(struct f2fs_sb_info *sbi, unsigned char reason) + + f2fs_down_write(&sbi->sb_lock); + +- if (raw_super->s_stop_reason[reason] < ((1 << BITS_PER_BYTE) - 1)) ++ if (raw_super->s_stop_reason[reason] < GENMASK(BITS_PER_BYTE - 1, 0)) + raw_super->s_stop_reason[reason]++; + + err = f2fs_commit_super(sbi, false); +@@ -4033,7 +4033,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) + FDEV(i).start_blk, FDEV(i).end_blk); + } + f2fs_info(sbi, +- "IO Block Size: %8d KB", F2FS_IO_SIZE_KB(sbi)); ++ "IO Block Size: %8ld KB", F2FS_IO_SIZE_KB(sbi)); + return 0; + } + +diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c +index 3d68bfa75cf2a..751a108e612ff 100644 +--- a/fs/f2fs/sysfs.c ++++ b/fs/f2fs/sysfs.c +@@ -451,7 +451,7 @@ static ssize_t __sbi_store(struct f2fs_attr *a, + if (ret < 0) + return ret; + #ifdef CONFIG_F2FS_FAULT_INJECTION +- if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) ++ if (a->struct_type == FAULT_INFO_TYPE && t >= BIT(FAULT_MAX)) + return -EINVAL; + if (a->struct_type == FAULT_INFO_RATE && t >= UINT_MAX) + return -EINVAL; +diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h +index ee0d75d9a302d..1e0df607e40c4 100644 +--- a/include/linux/f2fs_fs.h ++++ b/include/linux/f2fs_fs.h +@@ -40,9 +40,8 @@ + + #define F2FS_ENC_UTF8_12_1 1 + +-#define F2FS_IO_SIZE(sbi) (1 << F2FS_OPTION(sbi).write_io_size_bits) /* Blocks */ +-#define F2FS_IO_SIZE_KB(sbi) (1 << (F2FS_OPTION(sbi).write_io_size_bits + 2)) /* KB */ +-#define F2FS_IO_SIZE_BYTES(sbi) (1 << (F2FS_OPTION(sbi).write_io_size_bits + 12)) /* B */ ++#define F2FS_IO_SIZE(sbi) BIT(F2FS_OPTION(sbi).write_io_size_bits) /* Blocks */ ++#define F2FS_IO_SIZE_KB(sbi) BIT(F2FS_OPTION(sbi).write_io_size_bits + 2) /* KB */ + #define F2FS_IO_SIZE_BITS(sbi) (F2FS_OPTION(sbi).write_io_size_bits) /* power of 2 */ + #define F2FS_IO_SIZE_MASK(sbi) (F2FS_IO_SIZE(sbi) - 1) + #define F2FS_IO_ALIGNED(sbi) (F2FS_IO_SIZE(sbi) > 1) +@@ -340,7 +339,7 @@ enum { + OFFSET_BIT_SHIFT + }; + +-#define OFFSET_BIT_MASK (0x07) /* (0x01 << OFFSET_BIT_SHIFT) - 1 */ ++#define OFFSET_BIT_MASK GENMASK(OFFSET_BIT_SHIFT - 1, 0) + + struct node_footer { + __le32 nid; /* node id */ +@@ -545,7 +544,7 @@ typedef __le32 f2fs_hash_t; + #define MAX_DIR_HASH_DEPTH 63 + + /* MAX buckets in one level of dir */ +-#define MAX_DIR_BUCKETS (1 << ((MAX_DIR_HASH_DEPTH / 2) - 1)) ++#define MAX_DIR_BUCKETS BIT((MAX_DIR_HASH_DEPTH / 2) - 1) + + /* + * space utilization of regular dentry and inline dentry (w/o extra reservation) +-- +2.43.0 + diff --git a/queue-6.1/f2fs-set-the-default-compress_level-on-ioctl.patch b/queue-6.1/f2fs-set-the-default-compress_level-on-ioctl.patch new file mode 100644 index 00000000000..3ed08a4075a --- /dev/null +++ b/queue-6.1/f2fs-set-the-default-compress_level-on-ioctl.patch @@ -0,0 +1,47 @@ +From 58e5af6fa360d2c24949fe3057b862c27142ed6c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 8 Sep 2023 15:41:42 -0700 +Subject: f2fs: set the default compress_level on ioctl + +From: Jaegeuk Kim + +[ Upstream commit f5f3bd903a5d3e3b2ba89f11e0e29db25e60c048 ] + +Otherwise, we'll get a broken inode. 
+ + # touch $FILE + # f2fs_io setflags compression $FILE + # f2fs_io set_coption 2 8 $FILE + +[ 112.227612] F2FS-fs (dm-51): sanity_check_compress_inode: inode (ino=8d3fe) has unsupported compress level: 0, run fsck to fix + +Cc: stable@vger.kernel.org +Signed-off-by: Jaegeuk Kim +Signed-off-by: Sasha Levin +--- + fs/f2fs/file.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c +index 126c074deebdc..9b9fb3c57ec6c 100644 +--- a/fs/f2fs/file.c ++++ b/fs/f2fs/file.c +@@ -3984,6 +3984,15 @@ static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg) + F2FS_I(inode)->i_compress_algorithm = option.algorithm; + F2FS_I(inode)->i_log_cluster_size = option.log_cluster_size; + F2FS_I(inode)->i_cluster_size = BIT(option.log_cluster_size); ++ /* Set default level */ ++ if (F2FS_I(inode)->i_compress_algorithm == COMPRESS_ZSTD) ++ F2FS_I(inode)->i_compress_level = F2FS_ZSTD_DEFAULT_CLEVEL; ++ else ++ F2FS_I(inode)->i_compress_level = 0; ++ /* Adjust mount option level */ ++ if (option.algorithm == F2FS_OPTION(sbi).compress_algorithm && ++ F2FS_OPTION(sbi).compress_level) ++ F2FS_I(inode)->i_compress_level = F2FS_OPTION(sbi).compress_level; + f2fs_mark_inode_dirty_sync(inode, true); + + if (!f2fs_is_compress_backend_ready(inode)) +-- +2.43.0 + diff --git a/queue-6.1/fbdev-imsttfb-fix-double-free-in-probe.patch b/queue-6.1/fbdev-imsttfb-fix-double-free-in-probe.patch new file mode 100644 index 00000000000..eeba0229412 --- /dev/null +++ b/queue-6.1/fbdev-imsttfb-fix-double-free-in-probe.patch @@ -0,0 +1,51 @@ +From 0dba7e14edb61efe1ef25501ef2902a7619970f8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 27 Oct 2023 15:04:56 +0300 +Subject: fbdev: imsttfb: fix double free in probe() + +From: Dan Carpenter + +[ Upstream commit e08c30efda21ef4c0ec084a3a9581c220b442ba9 ] + +The init_imstt() function calls framebuffer_release() on error and then +the probe() function calls it again. It should only be done in probe. 
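
A standalone sketch of the ownership rule the fix restores (all names below are invented for the illustration and are not the driver's functions): the allocation and its error-path release belong to the same function, so a helper that fails must only report the error and leave the object for its caller to free.

    #include <stdlib.h>

    struct example_fb_info { int ready; };

    /* Helper: may fail, but must NOT free an object it did not allocate. */
    static int example_init(struct example_fb_info *info)
    {
            if (!info->ready) {
                    /* Before the fix, the equivalent path also released the
                     * framebuffer here, so the caller freed it a second time. */
                    return -1;
            }
            return 0;
    }

    static int example_probe(void)
    {
            struct example_fb_info *info = calloc(1, sizeof(*info));

            if (!info)
                    return -1;

            if (example_init(info)) {
                    free(info);     /* the only error-path release */
                    return -1;
            }

            free(info);
            return 0;
    }

    int main(void)
    {
            return example_probe() ? 1 : 0;
    }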
+ +Fixes: 518ecb6a209f ("fbdev: imsttfb: Fix error path of imsttfb_probe()") +Signed-off-by: Dan Carpenter +Signed-off-by: Helge Deller +Signed-off-by: Sasha Levin +--- + drivers/video/fbdev/imsttfb.c | 6 +----- + 1 file changed, 1 insertion(+), 5 deletions(-) + +diff --git a/drivers/video/fbdev/imsttfb.c b/drivers/video/fbdev/imsttfb.c +index 3d1ae5267a738..aa51cb72cbba5 100644 +--- a/drivers/video/fbdev/imsttfb.c ++++ b/drivers/video/fbdev/imsttfb.c +@@ -1419,7 +1419,6 @@ static int init_imstt(struct fb_info *info) + if ((info->var.xres * info->var.yres) * (info->var.bits_per_pixel >> 3) > info->fix.smem_len + || !(compute_imstt_regvals(par, info->var.xres, info->var.yres))) { + printk("imsttfb: %ux%ux%u not supported\n", info->var.xres, info->var.yres, info->var.bits_per_pixel); +- framebuffer_release(info); + return -ENODEV; + } + +@@ -1452,14 +1451,11 @@ static int init_imstt(struct fb_info *info) + FBINFO_HWACCEL_FILLRECT | + FBINFO_HWACCEL_YPAN; + +- if (fb_alloc_cmap(&info->cmap, 0, 0)) { +- framebuffer_release(info); ++ if (fb_alloc_cmap(&info->cmap, 0, 0)) + return -ENODEV; +- } + + if (register_framebuffer(info) < 0) { + fb_dealloc_cmap(&info->cmap); +- framebuffer_release(info); + return -ENODEV; + } + +-- +2.43.0 + diff --git a/queue-6.1/fbdev-imsttfb-release-framebuffer-and-dealloc-cmap-o.patch b/queue-6.1/fbdev-imsttfb-release-framebuffer-and-dealloc-cmap-o.patch new file mode 100644 index 00000000000..8571648b2a4 --- /dev/null +++ b/queue-6.1/fbdev-imsttfb-release-framebuffer-and-dealloc-cmap-o.patch @@ -0,0 +1,40 @@ +From 1d9d0ecc1ce7f53db132d35e037803dd1265e7a4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 27 May 2023 11:28:36 +0200 +Subject: fbdev: imsttfb: Release framebuffer and dealloc cmap on error path + +From: Helge Deller + +[ Upstream commit 5cf9a090a39c97f4506b7b53739d469b1c05a7e9 ] + +Add missing cleanups in error path. + +Signed-off-by: Helge Deller +Stable-dep-of: e08c30efda21 ("fbdev: imsttfb: fix double free in probe()") +Signed-off-by: Sasha Levin +--- + drivers/video/fbdev/imsttfb.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/drivers/video/fbdev/imsttfb.c b/drivers/video/fbdev/imsttfb.c +index b194e71f07bfc..3d1ae5267a738 100644 +--- a/drivers/video/fbdev/imsttfb.c ++++ b/drivers/video/fbdev/imsttfb.c +@@ -1452,9 +1452,13 @@ static int init_imstt(struct fb_info *info) + FBINFO_HWACCEL_FILLRECT | + FBINFO_HWACCEL_YPAN; + +- fb_alloc_cmap(&info->cmap, 0, 0); ++ if (fb_alloc_cmap(&info->cmap, 0, 0)) { ++ framebuffer_release(info); ++ return -ENODEV; ++ } + + if (register_framebuffer(info) < 0) { ++ fb_dealloc_cmap(&info->cmap); + framebuffer_release(info); + return -ENODEV; + } +-- +2.43.0 + diff --git a/queue-6.1/filemap-add-a-per-mapping-stable-writes-flag.patch b/queue-6.1/filemap-add-a-per-mapping-stable-writes-flag.patch new file mode 100644 index 00000000000..8d79232decc --- /dev/null +++ b/queue-6.1/filemap-add-a-per-mapping-stable-writes-flag.patch @@ -0,0 +1,103 @@ +From 28a5490b3586d1c511530d0848ade4165e206e96 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 25 Oct 2023 16:10:17 +0200 +Subject: filemap: add a per-mapping stable writes flag + +From: Christoph Hellwig + +[ Upstream commit 762321dab9a72760bf9aec48362f932717c9424d ] + +folio_wait_stable waits for writeback to finish before modifying the +contents of a folio again, e.g. to support check summing of the data +in the block integrity code. 
+ +Currently this behavior is controlled by the SB_I_STABLE_WRITES flag +on the super_block, which means it is uniform for the entire file system. +This is wrong for the block device pseudofs which is shared by all +block devices, or file systems that can use multiple devices like XFS +witht the RT subvolume or btrfs (although btrfs currently reimplements +folio_wait_stable anyway). + +Add a per-address_space AS_STABLE_WRITES flag to control the behavior +in a more fine grained way. The existing SB_I_STABLE_WRITES is kept +to initialize AS_STABLE_WRITES to the existing default which covers +most cases. + +Signed-off-by: Christoph Hellwig +Link: https://lore.kernel.org/r/20231025141020.192413-2-hch@lst.de +Tested-by: Ilya Dryomov +Reviewed-by: Matthew Wilcox (Oracle) +Reviewed-by: Darrick J. Wong +Signed-off-by: Christian Brauner +Stable-dep-of: 1898efcdbed3 ("block: update the stable_writes flag in bdev_add") +Signed-off-by: Sasha Levin +--- + fs/inode.c | 2 ++ + include/linux/pagemap.h | 17 +++++++++++++++++ + mm/page-writeback.c | 2 +- + 3 files changed, 20 insertions(+), 1 deletion(-) + +diff --git a/fs/inode.c b/fs/inode.c +index 73ad1b0d47758..8cfda7a6d5900 100644 +--- a/fs/inode.c ++++ b/fs/inode.c +@@ -215,6 +215,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode) + lockdep_set_class_and_name(&mapping->invalidate_lock, + &sb->s_type->invalidate_lock_key, + "mapping.invalidate_lock"); ++ if (sb->s_iflags & SB_I_STABLE_WRITES) ++ mapping_set_stable_writes(mapping); + inode->i_private = NULL; + inode->i_mapping = mapping; + INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ +diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h +index fdbb90ae56c70..1be5a1fa6a3a8 100644 +--- a/include/linux/pagemap.h ++++ b/include/linux/pagemap.h +@@ -200,6 +200,8 @@ enum mapping_flags { + AS_NO_WRITEBACK_TAGS = 5, + AS_LARGE_FOLIO_SUPPORT = 6, + AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */ ++ AS_STABLE_WRITES, /* must wait for writeback before modifying ++ folio contents */ + }; + + /** +@@ -285,6 +287,21 @@ static inline void mapping_clear_release_always(struct address_space *mapping) + clear_bit(AS_RELEASE_ALWAYS, &mapping->flags); + } + ++static inline bool mapping_stable_writes(const struct address_space *mapping) ++{ ++ return test_bit(AS_STABLE_WRITES, &mapping->flags); ++} ++ ++static inline void mapping_set_stable_writes(struct address_space *mapping) ++{ ++ set_bit(AS_STABLE_WRITES, &mapping->flags); ++} ++ ++static inline void mapping_clear_stable_writes(struct address_space *mapping) ++{ ++ clear_bit(AS_STABLE_WRITES, &mapping->flags); ++} ++ + static inline gfp_t mapping_gfp_mask(struct address_space * mapping) + { + return mapping->gfp_mask; +diff --git a/mm/page-writeback.c b/mm/page-writeback.c +index 7e9d8d857ecca..de5f69921b946 100644 +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -3078,7 +3078,7 @@ EXPORT_SYMBOL_GPL(folio_wait_writeback_killable); + */ + void folio_wait_stable(struct folio *folio) + { +- if (folio_inode(folio)->i_sb->s_iflags & SB_I_STABLE_WRITES) ++ if (mapping_stable_writes(folio_mapping(folio))) + folio_wait_writeback(folio); + } + EXPORT_SYMBOL_GPL(folio_wait_stable); +-- +2.43.0 + diff --git a/queue-6.1/firmware-arm_scmi-fix-frequency-truncation-by-promot.patch b/queue-6.1/firmware-arm_scmi-fix-frequency-truncation-by-promot.patch new file mode 100644 index 00000000000..4ab2bec1049 --- /dev/null +++ b/queue-6.1/firmware-arm_scmi-fix-frequency-truncation-by-promot.patch @@ -0,0 
+1,54 @@ +From 0be29a4228fb46ed71ceb9d3ce17be8b03862eba Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 30 Nov 2023 20:43:42 +0000 +Subject: firmware: arm_scmi: Fix frequency truncation by promoting multiplier + type + +From: Sudeep Holla + +[ Upstream commit 8e3c98d9187e09274fc000a7d1a77b070a42d259 ] + +Fix the possible frequency truncation for all values equal to or greater +4GHz on 64bit machines by updating the multiplier 'mult_factor' to +'unsigned long' type. It is also possible that the multiplier itself can +be greater than or equal to 2^32. So we need to also fix the equation +computing the value of the multiplier. + +Fixes: a9e3fbfaa0ff ("firmware: arm_scmi: add initial support for performance protocol") +Reported-by: Sibi Sankar +Closes: https://lore.kernel.org/all/20231129065748.19871-3-quic_sibis@quicinc.com/ +Cc: Cristian Marussi +Link: https://lore.kernel.org/r/20231130204343.503076-1-sudeep.holla@arm.com +Signed-off-by: Sudeep Holla +Signed-off-by: Sasha Levin +--- + drivers/firmware/arm_scmi/perf.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/drivers/firmware/arm_scmi/perf.c b/drivers/firmware/arm_scmi/perf.c +index 431bda9165c3d..2775bcafe40f6 100644 +--- a/drivers/firmware/arm_scmi/perf.c ++++ b/drivers/firmware/arm_scmi/perf.c +@@ -131,7 +131,7 @@ struct perf_dom_info { + u32 opp_count; + u32 sustained_freq_khz; + u32 sustained_perf_level; +- u32 mult_factor; ++ unsigned long mult_factor; + char name[SCMI_MAX_STR_SIZE]; + struct scmi_opp opp[MAX_OPPS]; + struct scmi_fc_info *fc_info; +@@ -223,8 +223,8 @@ scmi_perf_domain_attributes_get(const struct scmi_protocol_handle *ph, + dom_info->mult_factor = 1000; + else + dom_info->mult_factor = +- (dom_info->sustained_freq_khz * 1000) / +- dom_info->sustained_perf_level; ++ (dom_info->sustained_freq_khz * 1000UL) ++ / dom_info->sustained_perf_level; + strscpy(dom_info->name, attr->name, SCMI_SHORT_NAME_MAX_SIZE); + } + +-- +2.43.0 + diff --git a/queue-6.1/genirq-affinity-don-t-pass-irq_affinity_desc-array-t.patch b/queue-6.1/genirq-affinity-don-t-pass-irq_affinity_desc-array-t.patch new file mode 100644 index 00000000000..c12b7d62434 --- /dev/null +++ b/queue-6.1/genirq-affinity-don-t-pass-irq_affinity_desc-array-t.patch @@ -0,0 +1,142 @@ +From 88fe3a4d6d033d9103e986952d44d4c647deba38 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 27 Dec 2022 10:29:02 +0800 +Subject: genirq/affinity: Don't pass irq_affinity_desc array to + irq_build_affinity_masks + +From: Ming Lei + +[ Upstream commit e7bdd7f0cbd1c001bb9b4d3313edc5ee094bc3f8 ] + +Prepare for abstracting irq_build_affinity_masks() into a public function +for assigning all CPUs evenly into several groups. + +Don't pass irq_affinity_desc array to irq_build_affinity_masks, instead +return a cpumask array by storing each assigned group into one element of +the array. + +This allows to provide a generic interface for grouping all CPUs evenly +from a NUMA and CPU locality viewpoint, and the cost is one extra allocation +in irq_build_affinity_masks(), which should be fine since it is done via +GFP_KERNEL and irq_build_affinity_masks() is a slow path anyway. 
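From the caller's side the new contract is small: irq_build_affinity_masks() hands back a kcalloc()'ed array of struct cpumask (NULL on failure), and irq_create_affinity_masks() copies each element into its irq_affinity_desc and frees the array, as the hunk below shows:

    struct cpumask *result = irq_build_affinity_masks(this_vecs);

    if (!result) {
            kfree(masks);
            return NULL;
    }
    for (j = 0; j < this_vecs; j++)
            cpumask_copy(&masks[curvec + j].mask, &result[j]);
    kfree(result);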
+ +Signed-off-by: Ming Lei +Signed-off-by: Thomas Gleixner +Reviewed-by: Christoph Hellwig +Reviewed-by: John Garry +Reviewed-by: Jens Axboe +Link: https://lore.kernel.org/r/20221227022905.352674-4-ming.lei@redhat.com +Stable-dep-of: 0263f92fadbb ("lib/group_cpus.c: avoid acquiring cpu hotplug lock in group_cpus_evenly") +Signed-off-by: Sasha Levin +--- + kernel/irq/affinity.c | 34 ++++++++++++++++++++++++---------- + 1 file changed, 24 insertions(+), 10 deletions(-) + +diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c +index da6379cd27fd4..00bba1020ecb2 100644 +--- a/kernel/irq/affinity.c ++++ b/kernel/irq/affinity.c +@@ -249,7 +249,7 @@ static int __irq_build_affinity_masks(unsigned int startvec, + cpumask_var_t *node_to_cpumask, + const struct cpumask *cpu_mask, + struct cpumask *nmsk, +- struct irq_affinity_desc *masks) ++ struct cpumask *masks) + { + unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0; + unsigned int last_affv = numvecs; +@@ -270,7 +270,7 @@ static int __irq_build_affinity_masks(unsigned int startvec, + for_each_node_mask(n, nodemsk) { + /* Ensure that only CPUs which are in both masks are set */ + cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); +- cpumask_or(&masks[curvec].mask, &masks[curvec].mask, nmsk); ++ cpumask_or(&masks[curvec], &masks[curvec], nmsk); + if (++curvec == last_affv) + curvec = 0; + } +@@ -321,7 +321,7 @@ static int __irq_build_affinity_masks(unsigned int startvec, + */ + if (curvec >= last_affv) + curvec = 0; +- irq_spread_init_one(&masks[curvec].mask, nmsk, ++ irq_spread_init_one(&masks[curvec], nmsk, + cpus_per_vec); + } + done += nv->nvectors; +@@ -335,16 +335,16 @@ static int __irq_build_affinity_masks(unsigned int startvec, + * 1) spread present CPU on these vectors + * 2) spread other possible CPUs on these vectors + */ +-static int irq_build_affinity_masks(unsigned int numvecs, +- struct irq_affinity_desc *masks) ++static struct cpumask *irq_build_affinity_masks(unsigned int numvecs) + { + unsigned int curvec = 0, nr_present = 0, nr_others = 0; + cpumask_var_t *node_to_cpumask; + cpumask_var_t nmsk, npresmsk; + int ret = -ENOMEM; ++ struct cpumask *masks = NULL; + + if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) +- return ret; ++ return NULL; + + if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) + goto fail_nmsk; +@@ -353,6 +353,10 @@ static int irq_build_affinity_masks(unsigned int numvecs, + if (!node_to_cpumask) + goto fail_npresmsk; + ++ masks = kcalloc(numvecs, sizeof(*masks), GFP_KERNEL); ++ if (!masks) ++ goto fail_node_to_cpumask; ++ + /* Stabilize the cpumasks */ + cpus_read_lock(); + build_node_to_cpumask(node_to_cpumask); +@@ -386,6 +390,7 @@ static int irq_build_affinity_masks(unsigned int numvecs, + if (ret >= 0) + WARN_ON(nr_present + nr_others < numvecs); + ++ fail_node_to_cpumask: + free_node_to_cpumask(node_to_cpumask); + + fail_npresmsk: +@@ -393,7 +398,11 @@ static int irq_build_affinity_masks(unsigned int numvecs, + + fail_nmsk: + free_cpumask_var(nmsk); +- return ret < 0 ? 
ret : 0; ++ if (ret < 0) { ++ kfree(masks); ++ return NULL; ++ } ++ return masks; + } + + static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs) +@@ -457,13 +466,18 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) + */ + for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) { + unsigned int this_vecs = affd->set_size[i]; +- int ret; ++ int j; ++ struct cpumask *result = irq_build_affinity_masks(this_vecs); + +- ret = irq_build_affinity_masks(this_vecs, &masks[curvec]); +- if (ret) { ++ if (!result) { + kfree(masks); + return NULL; + } ++ ++ for (j = 0; j < this_vecs; j++) ++ cpumask_copy(&masks[curvec + j].mask, &result[j]); ++ kfree(result); ++ + curvec += this_vecs; + usedvecs += this_vecs; + } +-- +2.43.0 + diff --git a/queue-6.1/genirq-affinity-move-group_cpus_evenly-into-lib.patch b/queue-6.1/genirq-affinity-move-group_cpus_evenly-into-lib.patch new file mode 100644 index 00000000000..fb55891bb2b --- /dev/null +++ b/queue-6.1/genirq-affinity-move-group_cpus_evenly-into-lib.patch @@ -0,0 +1,920 @@ +From ef4de3476be1d045915045b849fb143020fc8b84 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 27 Dec 2022 10:29:04 +0800 +Subject: genirq/affinity: Move group_cpus_evenly() into lib/ + +From: Ming Lei + +[ Upstream commit f7b3ea8cf72f3d6060fe08e461805181e7450a13 ] + +group_cpus_evenly() has become a generic function which can be used for +other subsystems than the interrupt subsystem, so move it into lib/. + +Signed-off-by: Ming Lei +Signed-off-by: Thomas Gleixner +Reviewed-by: Christoph Hellwig +Reviewed-by: Jens Axboe +Link: https://lore.kernel.org/r/20221227022905.352674-6-ming.lei@redhat.com +Stable-dep-of: 0263f92fadbb ("lib/group_cpus.c: avoid acquiring cpu hotplug lock in group_cpus_evenly") +Signed-off-by: Sasha Levin +--- + MAINTAINERS | 2 + + include/linux/group_cpus.h | 14 ++ + kernel/irq/affinity.c | 398 +--------------------------------- + lib/Makefile | 2 + + lib/group_cpus.c | 427 +++++++++++++++++++++++++++++++++++++ + 5 files changed, 446 insertions(+), 397 deletions(-) + create mode 100644 include/linux/group_cpus.h + create mode 100644 lib/group_cpus.c + +diff --git a/MAINTAINERS b/MAINTAINERS +index 07a9c274c0e29..13d1078808bb5 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -10803,6 +10803,8 @@ L: linux-kernel@vger.kernel.org + S: Maintained + T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/core + F: kernel/irq/ ++F: include/linux/group_cpus.h ++F: lib/group_cpus.c + + IRQCHIP DRIVERS + M: Thomas Gleixner +diff --git a/include/linux/group_cpus.h b/include/linux/group_cpus.h +new file mode 100644 +index 0000000000000..e42807ec61f6e +--- /dev/null ++++ b/include/linux/group_cpus.h +@@ -0,0 +1,14 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* ++ * Copyright (C) 2016 Thomas Gleixner. ++ * Copyright (C) 2016-2017 Christoph Hellwig. 
++ */ ++ ++#ifndef __LINUX_GROUP_CPUS_H ++#define __LINUX_GROUP_CPUS_H ++#include ++#include ++ ++struct cpumask *group_cpus_evenly(unsigned int numgrps); ++ ++#endif +diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c +index 54083331f1bcb..44a4eba80315c 100644 +--- a/kernel/irq/affinity.c ++++ b/kernel/irq/affinity.c +@@ -7,403 +7,7 @@ + #include + #include + #include +-#include +- +-static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, +- unsigned int cpus_per_grp) +-{ +- const struct cpumask *siblmsk; +- int cpu, sibl; +- +- for ( ; cpus_per_grp > 0; ) { +- cpu = cpumask_first(nmsk); +- +- /* Should not happen, but I'm too lazy to think about it */ +- if (cpu >= nr_cpu_ids) +- return; +- +- cpumask_clear_cpu(cpu, nmsk); +- cpumask_set_cpu(cpu, irqmsk); +- cpus_per_grp--; +- +- /* If the cpu has siblings, use them first */ +- siblmsk = topology_sibling_cpumask(cpu); +- for (sibl = -1; cpus_per_grp > 0; ) { +- sibl = cpumask_next(sibl, siblmsk); +- if (sibl >= nr_cpu_ids) +- break; +- if (!cpumask_test_and_clear_cpu(sibl, nmsk)) +- continue; +- cpumask_set_cpu(sibl, irqmsk); +- cpus_per_grp--; +- } +- } +-} +- +-static cpumask_var_t *alloc_node_to_cpumask(void) +-{ +- cpumask_var_t *masks; +- int node; +- +- masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL); +- if (!masks) +- return NULL; +- +- for (node = 0; node < nr_node_ids; node++) { +- if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL)) +- goto out_unwind; +- } +- +- return masks; +- +-out_unwind: +- while (--node >= 0) +- free_cpumask_var(masks[node]); +- kfree(masks); +- return NULL; +-} +- +-static void free_node_to_cpumask(cpumask_var_t *masks) +-{ +- int node; +- +- for (node = 0; node < nr_node_ids; node++) +- free_cpumask_var(masks[node]); +- kfree(masks); +-} +- +-static void build_node_to_cpumask(cpumask_var_t *masks) +-{ +- int cpu; +- +- for_each_possible_cpu(cpu) +- cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); +-} +- +-static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, +- const struct cpumask *mask, nodemask_t *nodemsk) +-{ +- int n, nodes = 0; +- +- /* Calculate the number of nodes in the supplied affinity mask */ +- for_each_node(n) { +- if (cpumask_intersects(mask, node_to_cpumask[n])) { +- node_set(n, *nodemsk); +- nodes++; +- } +- } +- return nodes; +-} +- +-struct node_groups { +- unsigned id; +- +- union { +- unsigned ngroups; +- unsigned ncpus; +- }; +-}; +- +-static int ncpus_cmp_func(const void *l, const void *r) +-{ +- const struct node_groups *ln = l; +- const struct node_groups *rn = r; +- +- return ln->ncpus - rn->ncpus; +-} +- +-/* +- * Allocate group number for each node, so that for each node: +- * +- * 1) the allocated number is >= 1 +- * +- * 2) the allocated number is <= active CPU number of this node +- * +- * The actual allocated total groups may be less than @numgrps when +- * active total CPU number is less than @numgrps. +- * +- * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]' +- * for each node. 
+- */ +-static void alloc_nodes_groups(unsigned int numgrps, +- cpumask_var_t *node_to_cpumask, +- const struct cpumask *cpu_mask, +- const nodemask_t nodemsk, +- struct cpumask *nmsk, +- struct node_groups *node_groups) +-{ +- unsigned n, remaining_ncpus = 0; +- +- for (n = 0; n < nr_node_ids; n++) { +- node_groups[n].id = n; +- node_groups[n].ncpus = UINT_MAX; +- } +- +- for_each_node_mask(n, nodemsk) { +- unsigned ncpus; +- +- cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); +- ncpus = cpumask_weight(nmsk); +- +- if (!ncpus) +- continue; +- remaining_ncpus += ncpus; +- node_groups[n].ncpus = ncpus; +- } +- +- numgrps = min_t(unsigned, remaining_ncpus, numgrps); +- +- sort(node_groups, nr_node_ids, sizeof(node_groups[0]), +- ncpus_cmp_func, NULL); +- +- /* +- * Allocate groups for each node according to the ratio of this +- * node's nr_cpus to remaining un-assigned ncpus. 'numgrps' is +- * bigger than number of active numa nodes. Always start the +- * allocation from the node with minimized nr_cpus. +- * +- * This way guarantees that each active node gets allocated at +- * least one group, and the theory is simple: over-allocation +- * is only done when this node is assigned by one group, so +- * other nodes will be allocated >= 1 groups, since 'numgrps' is +- * bigger than number of numa nodes. +- * +- * One perfect invariant is that number of allocated groups for +- * each node is <= CPU count of this node: +- * +- * 1) suppose there are two nodes: A and B +- * ncpu(X) is CPU count of node X +- * grps(X) is the group count allocated to node X via this +- * algorithm +- * +- * ncpu(A) <= ncpu(B) +- * ncpu(A) + ncpu(B) = N +- * grps(A) + grps(B) = G +- * +- * grps(A) = max(1, round_down(G * ncpu(A) / N)) +- * grps(B) = G - grps(A) +- * +- * both N and G are integer, and 2 <= G <= N, suppose +- * G = N - delta, and 0 <= delta <= N - 2 +- * +- * 2) obviously grps(A) <= ncpu(A) because: +- * +- * if grps(A) is 1, then grps(A) <= ncpu(A) given +- * ncpu(A) >= 1 +- * +- * otherwise, +- * grps(A) <= G * ncpu(A) / N <= ncpu(A), given G <= N +- * +- * 3) prove how grps(B) <= ncpu(B): +- * +- * if round_down(G * ncpu(A) / N) == 0, vecs(B) won't be +- * over-allocated, so grps(B) <= ncpu(B), +- * +- * otherwise: +- * +- * grps(A) = +- * round_down(G * ncpu(A) / N) = +- * round_down((N - delta) * ncpu(A) / N) = +- * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >= +- * round_down((N * ncpu(A) - delta * N) / N) = +- * cpu(A) - delta +- * +- * then: +- * +- * grps(A) - G >= ncpu(A) - delta - G +- * => +- * G - grps(A) <= G + delta - ncpu(A) +- * => +- * grps(B) <= N - ncpu(A) +- * => +- * grps(B) <= cpu(B) +- * +- * For nodes >= 3, it can be thought as one node and another big +- * node given that is exactly what this algorithm is implemented, +- * and we always re-calculate 'remaining_ncpus' & 'numgrps', and +- * finally for each node X: grps(X) <= ncpu(X). 
+- * +- */ +- for (n = 0; n < nr_node_ids; n++) { +- unsigned ngroups, ncpus; +- +- if (node_groups[n].ncpus == UINT_MAX) +- continue; +- +- WARN_ON_ONCE(numgrps == 0); +- +- ncpus = node_groups[n].ncpus; +- ngroups = max_t(unsigned, 1, +- numgrps * ncpus / remaining_ncpus); +- WARN_ON_ONCE(ngroups > ncpus); +- +- node_groups[n].ngroups = ngroups; +- +- remaining_ncpus -= ncpus; +- numgrps -= ngroups; +- } +-} +- +-static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps, +- cpumask_var_t *node_to_cpumask, +- const struct cpumask *cpu_mask, +- struct cpumask *nmsk, struct cpumask *masks) +-{ +- unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0; +- unsigned int last_grp = numgrps; +- unsigned int curgrp = startgrp; +- nodemask_t nodemsk = NODE_MASK_NONE; +- struct node_groups *node_groups; +- +- if (cpumask_empty(cpu_mask)) +- return 0; +- +- nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk); +- +- /* +- * If the number of nodes in the mask is greater than or equal the +- * number of groups we just spread the groups across the nodes. +- */ +- if (numgrps <= nodes) { +- for_each_node_mask(n, nodemsk) { +- /* Ensure that only CPUs which are in both masks are set */ +- cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); +- cpumask_or(&masks[curgrp], &masks[curgrp], nmsk); +- if (++curgrp == last_grp) +- curgrp = 0; +- } +- return numgrps; +- } +- +- node_groups = kcalloc(nr_node_ids, +- sizeof(struct node_groups), +- GFP_KERNEL); +- if (!node_groups) +- return -ENOMEM; +- +- /* allocate group number for each node */ +- alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask, +- nodemsk, nmsk, node_groups); +- for (i = 0; i < nr_node_ids; i++) { +- unsigned int ncpus, v; +- struct node_groups *nv = &node_groups[i]; +- +- if (nv->ngroups == UINT_MAX) +- continue; +- +- /* Get the cpus on this node which are in the mask */ +- cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]); +- ncpus = cpumask_weight(nmsk); +- if (!ncpus) +- continue; +- +- WARN_ON_ONCE(nv->ngroups > ncpus); +- +- /* Account for rounding errors */ +- extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups); +- +- /* Spread allocated groups on CPUs of the current node */ +- for (v = 0; v < nv->ngroups; v++, curgrp++) { +- cpus_per_grp = ncpus / nv->ngroups; +- +- /* Account for extra groups to compensate rounding errors */ +- if (extra_grps) { +- cpus_per_grp++; +- --extra_grps; +- } +- +- /* +- * wrapping has to be considered given 'startgrp' +- * may start anywhere +- */ +- if (curgrp >= last_grp) +- curgrp = 0; +- grp_spread_init_one(&masks[curgrp], nmsk, +- cpus_per_grp); +- } +- done += nv->ngroups; +- } +- kfree(node_groups); +- return done; +-} +- +-/* +- * build affinity in two stages for each group, and try to put close CPUs +- * in viewpoint of CPU and NUMA locality into same group, and we run +- * two-stage grouping: +- * +- * 1) allocate present CPUs on these groups evenly first +- * 2) allocate other possible CPUs on these groups evenly +- */ +-static struct cpumask *group_cpus_evenly(unsigned int numgrps) +-{ +- unsigned int curgrp = 0, nr_present = 0, nr_others = 0; +- cpumask_var_t *node_to_cpumask; +- cpumask_var_t nmsk, npresmsk; +- int ret = -ENOMEM; +- struct cpumask *masks = NULL; +- +- if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) +- return NULL; +- +- if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) +- goto fail_nmsk; +- +- node_to_cpumask = alloc_node_to_cpumask(); +- if (!node_to_cpumask) +- goto fail_npresmsk; +- +- masks = kcalloc(numgrps, sizeof(*masks), 
GFP_KERNEL); +- if (!masks) +- goto fail_node_to_cpumask; +- +- /* Stabilize the cpumasks */ +- cpus_read_lock(); +- build_node_to_cpumask(node_to_cpumask); +- +- /* grouping present CPUs first */ +- ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, +- cpu_present_mask, nmsk, masks); +- if (ret < 0) +- goto fail_build_affinity; +- nr_present = ret; +- +- /* +- * Allocate non present CPUs starting from the next group to be +- * handled. If the grouping of present CPUs already exhausted the +- * group space, assign the non present CPUs to the already +- * allocated out groups. +- */ +- if (nr_present >= numgrps) +- curgrp = 0; +- else +- curgrp = nr_present; +- cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); +- ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, +- npresmsk, nmsk, masks); +- if (ret >= 0) +- nr_others = ret; +- +- fail_build_affinity: +- cpus_read_unlock(); +- +- if (ret >= 0) +- WARN_ON(nr_present + nr_others < numgrps); +- +- fail_node_to_cpumask: +- free_node_to_cpumask(node_to_cpumask); +- +- fail_npresmsk: +- free_cpumask_var(npresmsk); +- +- fail_nmsk: +- free_cpumask_var(nmsk); +- if (ret < 0) { +- kfree(masks); +- return NULL; +- } +- return masks; +-} ++#include + + static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs) + { +diff --git a/lib/Makefile b/lib/Makefile +index 5ffe72ec99797..6f1611d053e6a 100644 +--- a/lib/Makefile ++++ b/lib/Makefile +@@ -361,6 +361,8 @@ obj-$(CONFIG_SBITMAP) += sbitmap.o + + obj-$(CONFIG_PARMAN) += parman.o + ++obj-y += group_cpus.o ++ + # GCC library routines + obj-$(CONFIG_GENERIC_LIB_ASHLDI3) += ashldi3.o + obj-$(CONFIG_GENERIC_LIB_ASHRDI3) += ashrdi3.o +diff --git a/lib/group_cpus.c b/lib/group_cpus.c +new file mode 100644 +index 0000000000000..99f08c6cb9d97 +--- /dev/null ++++ b/lib/group_cpus.c +@@ -0,0 +1,427 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Copyright (C) 2016 Thomas Gleixner. ++ * Copyright (C) 2016-2017 Christoph Hellwig. 
++ */ ++#include ++#include ++#include ++#include ++#include ++ ++static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, ++ unsigned int cpus_per_grp) ++{ ++ const struct cpumask *siblmsk; ++ int cpu, sibl; ++ ++ for ( ; cpus_per_grp > 0; ) { ++ cpu = cpumask_first(nmsk); ++ ++ /* Should not happen, but I'm too lazy to think about it */ ++ if (cpu >= nr_cpu_ids) ++ return; ++ ++ cpumask_clear_cpu(cpu, nmsk); ++ cpumask_set_cpu(cpu, irqmsk); ++ cpus_per_grp--; ++ ++ /* If the cpu has siblings, use them first */ ++ siblmsk = topology_sibling_cpumask(cpu); ++ for (sibl = -1; cpus_per_grp > 0; ) { ++ sibl = cpumask_next(sibl, siblmsk); ++ if (sibl >= nr_cpu_ids) ++ break; ++ if (!cpumask_test_and_clear_cpu(sibl, nmsk)) ++ continue; ++ cpumask_set_cpu(sibl, irqmsk); ++ cpus_per_grp--; ++ } ++ } ++} ++ ++static cpumask_var_t *alloc_node_to_cpumask(void) ++{ ++ cpumask_var_t *masks; ++ int node; ++ ++ masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL); ++ if (!masks) ++ return NULL; ++ ++ for (node = 0; node < nr_node_ids; node++) { ++ if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL)) ++ goto out_unwind; ++ } ++ ++ return masks; ++ ++out_unwind: ++ while (--node >= 0) ++ free_cpumask_var(masks[node]); ++ kfree(masks); ++ return NULL; ++} ++ ++static void free_node_to_cpumask(cpumask_var_t *masks) ++{ ++ int node; ++ ++ for (node = 0; node < nr_node_ids; node++) ++ free_cpumask_var(masks[node]); ++ kfree(masks); ++} ++ ++static void build_node_to_cpumask(cpumask_var_t *masks) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); ++} ++ ++static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, ++ const struct cpumask *mask, nodemask_t *nodemsk) ++{ ++ int n, nodes = 0; ++ ++ /* Calculate the number of nodes in the supplied affinity mask */ ++ for_each_node(n) { ++ if (cpumask_intersects(mask, node_to_cpumask[n])) { ++ node_set(n, *nodemsk); ++ nodes++; ++ } ++ } ++ return nodes; ++} ++ ++struct node_groups { ++ unsigned id; ++ ++ union { ++ unsigned ngroups; ++ unsigned ncpus; ++ }; ++}; ++ ++static int ncpus_cmp_func(const void *l, const void *r) ++{ ++ const struct node_groups *ln = l; ++ const struct node_groups *rn = r; ++ ++ return ln->ncpus - rn->ncpus; ++} ++ ++/* ++ * Allocate group number for each node, so that for each node: ++ * ++ * 1) the allocated number is >= 1 ++ * ++ * 2) the allocated number is <= active CPU number of this node ++ * ++ * The actual allocated total groups may be less than @numgrps when ++ * active total CPU number is less than @numgrps. ++ * ++ * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]' ++ * for each node. 
++ */ ++static void alloc_nodes_groups(unsigned int numgrps, ++ cpumask_var_t *node_to_cpumask, ++ const struct cpumask *cpu_mask, ++ const nodemask_t nodemsk, ++ struct cpumask *nmsk, ++ struct node_groups *node_groups) ++{ ++ unsigned n, remaining_ncpus = 0; ++ ++ for (n = 0; n < nr_node_ids; n++) { ++ node_groups[n].id = n; ++ node_groups[n].ncpus = UINT_MAX; ++ } ++ ++ for_each_node_mask(n, nodemsk) { ++ unsigned ncpus; ++ ++ cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); ++ ncpus = cpumask_weight(nmsk); ++ ++ if (!ncpus) ++ continue; ++ remaining_ncpus += ncpus; ++ node_groups[n].ncpus = ncpus; ++ } ++ ++ numgrps = min_t(unsigned, remaining_ncpus, numgrps); ++ ++ sort(node_groups, nr_node_ids, sizeof(node_groups[0]), ++ ncpus_cmp_func, NULL); ++ ++ /* ++ * Allocate groups for each node according to the ratio of this ++ * node's nr_cpus to remaining un-assigned ncpus. 'numgrps' is ++ * bigger than number of active numa nodes. Always start the ++ * allocation from the node with minimized nr_cpus. ++ * ++ * This way guarantees that each active node gets allocated at ++ * least one group, and the theory is simple: over-allocation ++ * is only done when this node is assigned by one group, so ++ * other nodes will be allocated >= 1 groups, since 'numgrps' is ++ * bigger than number of numa nodes. ++ * ++ * One perfect invariant is that number of allocated groups for ++ * each node is <= CPU count of this node: ++ * ++ * 1) suppose there are two nodes: A and B ++ * ncpu(X) is CPU count of node X ++ * grps(X) is the group count allocated to node X via this ++ * algorithm ++ * ++ * ncpu(A) <= ncpu(B) ++ * ncpu(A) + ncpu(B) = N ++ * grps(A) + grps(B) = G ++ * ++ * grps(A) = max(1, round_down(G * ncpu(A) / N)) ++ * grps(B) = G - grps(A) ++ * ++ * both N and G are integer, and 2 <= G <= N, suppose ++ * G = N - delta, and 0 <= delta <= N - 2 ++ * ++ * 2) obviously grps(A) <= ncpu(A) because: ++ * ++ * if grps(A) is 1, then grps(A) <= ncpu(A) given ++ * ncpu(A) >= 1 ++ * ++ * otherwise, ++ * grps(A) <= G * ncpu(A) / N <= ncpu(A), given G <= N ++ * ++ * 3) prove how grps(B) <= ncpu(B): ++ * ++ * if round_down(G * ncpu(A) / N) == 0, vecs(B) won't be ++ * over-allocated, so grps(B) <= ncpu(B), ++ * ++ * otherwise: ++ * ++ * grps(A) = ++ * round_down(G * ncpu(A) / N) = ++ * round_down((N - delta) * ncpu(A) / N) = ++ * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >= ++ * round_down((N * ncpu(A) - delta * N) / N) = ++ * cpu(A) - delta ++ * ++ * then: ++ * ++ * grps(A) - G >= ncpu(A) - delta - G ++ * => ++ * G - grps(A) <= G + delta - ncpu(A) ++ * => ++ * grps(B) <= N - ncpu(A) ++ * => ++ * grps(B) <= cpu(B) ++ * ++ * For nodes >= 3, it can be thought as one node and another big ++ * node given that is exactly what this algorithm is implemented, ++ * and we always re-calculate 'remaining_ncpus' & 'numgrps', and ++ * finally for each node X: grps(X) <= ncpu(X). 
++ * ++ */ ++ for (n = 0; n < nr_node_ids; n++) { ++ unsigned ngroups, ncpus; ++ ++ if (node_groups[n].ncpus == UINT_MAX) ++ continue; ++ ++ WARN_ON_ONCE(numgrps == 0); ++ ++ ncpus = node_groups[n].ncpus; ++ ngroups = max_t(unsigned, 1, ++ numgrps * ncpus / remaining_ncpus); ++ WARN_ON_ONCE(ngroups > ncpus); ++ ++ node_groups[n].ngroups = ngroups; ++ ++ remaining_ncpus -= ncpus; ++ numgrps -= ngroups; ++ } ++} ++ ++static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps, ++ cpumask_var_t *node_to_cpumask, ++ const struct cpumask *cpu_mask, ++ struct cpumask *nmsk, struct cpumask *masks) ++{ ++ unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0; ++ unsigned int last_grp = numgrps; ++ unsigned int curgrp = startgrp; ++ nodemask_t nodemsk = NODE_MASK_NONE; ++ struct node_groups *node_groups; ++ ++ if (cpumask_empty(cpu_mask)) ++ return 0; ++ ++ nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk); ++ ++ /* ++ * If the number of nodes in the mask is greater than or equal the ++ * number of groups we just spread the groups across the nodes. ++ */ ++ if (numgrps <= nodes) { ++ for_each_node_mask(n, nodemsk) { ++ /* Ensure that only CPUs which are in both masks are set */ ++ cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); ++ cpumask_or(&masks[curgrp], &masks[curgrp], nmsk); ++ if (++curgrp == last_grp) ++ curgrp = 0; ++ } ++ return numgrps; ++ } ++ ++ node_groups = kcalloc(nr_node_ids, ++ sizeof(struct node_groups), ++ GFP_KERNEL); ++ if (!node_groups) ++ return -ENOMEM; ++ ++ /* allocate group number for each node */ ++ alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask, ++ nodemsk, nmsk, node_groups); ++ for (i = 0; i < nr_node_ids; i++) { ++ unsigned int ncpus, v; ++ struct node_groups *nv = &node_groups[i]; ++ ++ if (nv->ngroups == UINT_MAX) ++ continue; ++ ++ /* Get the cpus on this node which are in the mask */ ++ cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]); ++ ncpus = cpumask_weight(nmsk); ++ if (!ncpus) ++ continue; ++ ++ WARN_ON_ONCE(nv->ngroups > ncpus); ++ ++ /* Account for rounding errors */ ++ extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups); ++ ++ /* Spread allocated groups on CPUs of the current node */ ++ for (v = 0; v < nv->ngroups; v++, curgrp++) { ++ cpus_per_grp = ncpus / nv->ngroups; ++ ++ /* Account for extra groups to compensate rounding errors */ ++ if (extra_grps) { ++ cpus_per_grp++; ++ --extra_grps; ++ } ++ ++ /* ++ * wrapping has to be considered given 'startgrp' ++ * may start anywhere ++ */ ++ if (curgrp >= last_grp) ++ curgrp = 0; ++ grp_spread_init_one(&masks[curgrp], nmsk, ++ cpus_per_grp); ++ } ++ done += nv->ngroups; ++ } ++ kfree(node_groups); ++ return done; ++} ++ ++#ifdef CONFIG_SMP ++/** ++ * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality ++ * @numgrps: number of groups ++ * ++ * Return: cpumask array if successful, NULL otherwise. 
And each element ++ * includes CPUs assigned to this group ++ * ++ * Try to put close CPUs from viewpoint of CPU and NUMA locality into ++ * same group, and run two-stage grouping: ++ * 1) allocate present CPUs on these groups evenly first ++ * 2) allocate other possible CPUs on these groups evenly ++ * ++ * We guarantee in the resulted grouping that all CPUs are covered, and ++ * no same CPU is assigned to multiple groups ++ */ ++struct cpumask *group_cpus_evenly(unsigned int numgrps) ++{ ++ unsigned int curgrp = 0, nr_present = 0, nr_others = 0; ++ cpumask_var_t *node_to_cpumask; ++ cpumask_var_t nmsk, npresmsk; ++ int ret = -ENOMEM; ++ struct cpumask *masks = NULL; ++ ++ if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) ++ return NULL; ++ ++ if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) ++ goto fail_nmsk; ++ ++ node_to_cpumask = alloc_node_to_cpumask(); ++ if (!node_to_cpumask) ++ goto fail_npresmsk; ++ ++ masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL); ++ if (!masks) ++ goto fail_node_to_cpumask; ++ ++ /* Stabilize the cpumasks */ ++ cpus_read_lock(); ++ build_node_to_cpumask(node_to_cpumask); ++ ++ /* grouping present CPUs first */ ++ ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, ++ cpu_present_mask, nmsk, masks); ++ if (ret < 0) ++ goto fail_build_affinity; ++ nr_present = ret; ++ ++ /* ++ * Allocate non present CPUs starting from the next group to be ++ * handled. If the grouping of present CPUs already exhausted the ++ * group space, assign the non present CPUs to the already ++ * allocated out groups. ++ */ ++ if (nr_present >= numgrps) ++ curgrp = 0; ++ else ++ curgrp = nr_present; ++ cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); ++ ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, ++ npresmsk, nmsk, masks); ++ if (ret >= 0) ++ nr_others = ret; ++ ++ fail_build_affinity: ++ cpus_read_unlock(); ++ ++ if (ret >= 0) ++ WARN_ON(nr_present + nr_others < numgrps); ++ ++ fail_node_to_cpumask: ++ free_node_to_cpumask(node_to_cpumask); ++ ++ fail_npresmsk: ++ free_cpumask_var(npresmsk); ++ ++ fail_nmsk: ++ free_cpumask_var(nmsk); ++ if (ret < 0) { ++ kfree(masks); ++ return NULL; ++ } ++ return masks; ++} ++#else ++struct cpumask *group_cpus_evenly(unsigned int numgrps) ++{ ++ struct cpumask *masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL); ++ ++ if (!masks) ++ return NULL; ++ ++ /* assign all CPUs(cpu 0) to the 1st group only */ ++ cpumask_copy(&masks[0], cpu_possible_mask); ++ return masks; ++} ++#endif +-- +2.43.0 + diff --git a/queue-6.1/genirq-affinity-pass-affinity-managed-mask-array-to-.patch b/queue-6.1/genirq-affinity-pass-affinity-managed-mask-array-to-.patch new file mode 100644 index 00000000000..09a8df87cd9 --- /dev/null +++ b/queue-6.1/genirq-affinity-pass-affinity-managed-mask-array-to-.patch @@ -0,0 +1,121 @@ +From 8dadc19b3f0f31cb7d083c07257a1a72dc988e35 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 27 Dec 2022 10:29:01 +0800 +Subject: genirq/affinity: Pass affinity managed mask array to + irq_build_affinity_masks + +From: Ming Lei + +[ Upstream commit 1f962d91a15af54301c63febb8ac2ba07aa3654f ] + +Pass affinity managed mask array to irq_build_affinity_masks() so that the +index of the first affinity managed vector is always zero. + +This allows to simplify the implementation a bit. 
+ +Signed-off-by: Ming Lei +Signed-off-by: Thomas Gleixner +Reviewed-by: Christoph Hellwig +Reviewed-by: John Garry +Reviewed-by: Jens Axboe +Link: https://lore.kernel.org/r/20221227022905.352674-3-ming.lei@redhat.com +Stable-dep-of: 0263f92fadbb ("lib/group_cpus.c: avoid acquiring cpu hotplug lock in group_cpus_evenly") +Signed-off-by: Sasha Levin +--- + kernel/irq/affinity.c | 28 ++++++++++++---------------- + 1 file changed, 12 insertions(+), 16 deletions(-) + +diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c +index 3361e36ebaa1e..da6379cd27fd4 100644 +--- a/kernel/irq/affinity.c ++++ b/kernel/irq/affinity.c +@@ -246,14 +246,13 @@ static void alloc_nodes_vectors(unsigned int numvecs, + + static int __irq_build_affinity_masks(unsigned int startvec, + unsigned int numvecs, +- unsigned int firstvec, + cpumask_var_t *node_to_cpumask, + const struct cpumask *cpu_mask, + struct cpumask *nmsk, + struct irq_affinity_desc *masks) + { + unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0; +- unsigned int last_affv = firstvec + numvecs; ++ unsigned int last_affv = numvecs; + unsigned int curvec = startvec; + nodemask_t nodemsk = NODE_MASK_NONE; + struct node_vectors *node_vectors; +@@ -273,7 +272,7 @@ static int __irq_build_affinity_masks(unsigned int startvec, + cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); + cpumask_or(&masks[curvec].mask, &masks[curvec].mask, nmsk); + if (++curvec == last_affv) +- curvec = firstvec; ++ curvec = 0; + } + return numvecs; + } +@@ -321,7 +320,7 @@ static int __irq_build_affinity_masks(unsigned int startvec, + * may start anywhere + */ + if (curvec >= last_affv) +- curvec = firstvec; ++ curvec = 0; + irq_spread_init_one(&masks[curvec].mask, nmsk, + cpus_per_vec); + } +@@ -336,11 +335,10 @@ static int __irq_build_affinity_masks(unsigned int startvec, + * 1) spread present CPU on these vectors + * 2) spread other possible CPUs on these vectors + */ +-static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, ++static int irq_build_affinity_masks(unsigned int numvecs, + struct irq_affinity_desc *masks) + { +- unsigned int curvec = startvec, nr_present = 0, nr_others = 0; +- unsigned int firstvec = startvec; ++ unsigned int curvec = 0, nr_present = 0, nr_others = 0; + cpumask_var_t *node_to_cpumask; + cpumask_var_t nmsk, npresmsk; + int ret = -ENOMEM; +@@ -360,9 +358,8 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, + build_node_to_cpumask(node_to_cpumask); + + /* Spread on present CPUs starting from affd->pre_vectors */ +- ret = __irq_build_affinity_masks(curvec, numvecs, firstvec, +- node_to_cpumask, cpu_present_mask, +- nmsk, masks); ++ ret = __irq_build_affinity_masks(curvec, numvecs, node_to_cpumask, ++ cpu_present_mask, nmsk, masks); + if (ret < 0) + goto fail_build_affinity; + nr_present = ret; +@@ -374,13 +371,12 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, + * out vectors. 
+ */ + if (nr_present >= numvecs) +- curvec = firstvec; ++ curvec = 0; + else +- curvec = firstvec + nr_present; ++ curvec = nr_present; + cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); +- ret = __irq_build_affinity_masks(curvec, numvecs, firstvec, +- node_to_cpumask, npresmsk, nmsk, +- masks); ++ ret = __irq_build_affinity_masks(curvec, numvecs, node_to_cpumask, ++ npresmsk, nmsk, masks); + if (ret >= 0) + nr_others = ret; + +@@ -463,7 +459,7 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) + unsigned int this_vecs = affd->set_size[i]; + int ret; + +- ret = irq_build_affinity_masks(curvec, this_vecs, masks); ++ ret = irq_build_affinity_masks(this_vecs, &masks[curvec]); + if (ret) { + kfree(masks); + return NULL; +-- +2.43.0 + diff --git a/queue-6.1/genirq-affinity-remove-the-firstvec-parameter-from-i.patch b/queue-6.1/genirq-affinity-remove-the-firstvec-parameter-from-i.patch new file mode 100644 index 00000000000..97e95a27278 --- /dev/null +++ b/queue-6.1/genirq-affinity-remove-the-firstvec-parameter-from-i.patch @@ -0,0 +1,54 @@ +From 3f9eac627a4179298074566b0149198d817ff10c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 27 Dec 2022 10:29:00 +0800 +Subject: genirq/affinity: Remove the 'firstvec' parameter from + irq_build_affinity_masks + +From: Ming Lei + +[ Upstream commit cdf07f0ea48a3b52f924714d477366ac510ee870 ] + +The 'firstvec' parameter is always same with the parameter of +'startvec', so use 'startvec' directly inside irq_build_affinity_masks(). + +Signed-off-by: Ming Lei +Signed-off-by: Thomas Gleixner +Reviewed-by: Christoph Hellwig +Reviewed-by: John Garry +Reviewed-by: Jens Axboe +Link: https://lore.kernel.org/r/20221227022905.352674-2-ming.lei@redhat.com +Stable-dep-of: 0263f92fadbb ("lib/group_cpus.c: avoid acquiring cpu hotplug lock in group_cpus_evenly") +Signed-off-by: Sasha Levin +--- + kernel/irq/affinity.c | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c +index d9a5c1d65a79d..3361e36ebaa1e 100644 +--- a/kernel/irq/affinity.c ++++ b/kernel/irq/affinity.c +@@ -337,10 +337,10 @@ static int __irq_build_affinity_masks(unsigned int startvec, + * 2) spread other possible CPUs on these vectors + */ + static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, +- unsigned int firstvec, + struct irq_affinity_desc *masks) + { + unsigned int curvec = startvec, nr_present = 0, nr_others = 0; ++ unsigned int firstvec = startvec; + cpumask_var_t *node_to_cpumask; + cpumask_var_t nmsk, npresmsk; + int ret = -ENOMEM; +@@ -463,8 +463,7 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) + unsigned int this_vecs = affd->set_size[i]; + int ret; + +- ret = irq_build_affinity_masks(curvec, this_vecs, +- curvec, masks); ++ ret = irq_build_affinity_masks(curvec, this_vecs, masks); + if (ret) { + kfree(masks); + return NULL; +-- +2.43.0 + diff --git a/queue-6.1/genirq-affinity-rename-irq_build_affinity_masks-as-g.patch b/queue-6.1/genirq-affinity-rename-irq_build_affinity_masks-as-g.patch new file mode 100644 index 00000000000..2e543c326c6 --- /dev/null +++ b/queue-6.1/genirq-affinity-rename-irq_build_affinity_masks-as-g.patch @@ -0,0 +1,485 @@ +From 2b38a67a94c19fcf3c655f12980a7a16eee4e44e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 27 Dec 2022 10:29:03 +0800 +Subject: genirq/affinity: Rename irq_build_affinity_masks as group_cpus_evenly + +From: Ming Lei + +[ Upstream commit 
523f1ea76aad9025f9bd5258d77f4406fa9dbe5d ] + +Map irq vector into group, which allows to abstract the algorithm for +a generic use case outside of the interrupt core. + +Rename irq_build_affinity_masks as group_cpus_evenly, so the API can be +reused for blk-mq to make default queue mapping even though irq vectors +aren't involved. + +No functional change, just rename vector as group. + +Signed-off-by: Ming Lei +Signed-off-by: Thomas Gleixner +Reviewed-by: Christoph Hellwig +Reviewed-by: Jens Axboe +Link: https://lore.kernel.org/r/20221227022905.352674-5-ming.lei@redhat.com +Stable-dep-of: 0263f92fadbb ("lib/group_cpus.c: avoid acquiring cpu hotplug lock in group_cpus_evenly") +Signed-off-by: Sasha Levin +--- + kernel/irq/affinity.c | 242 +++++++++++++++++++++--------------------- + 1 file changed, 121 insertions(+), 121 deletions(-) + +diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c +index 00bba1020ecb2..54083331f1bcb 100644 +--- a/kernel/irq/affinity.c ++++ b/kernel/irq/affinity.c +@@ -9,13 +9,13 @@ + #include + #include + +-static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, +- unsigned int cpus_per_vec) ++static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, ++ unsigned int cpus_per_grp) + { + const struct cpumask *siblmsk; + int cpu, sibl; + +- for ( ; cpus_per_vec > 0; ) { ++ for ( ; cpus_per_grp > 0; ) { + cpu = cpumask_first(nmsk); + + /* Should not happen, but I'm too lazy to think about it */ +@@ -24,18 +24,18 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, + + cpumask_clear_cpu(cpu, nmsk); + cpumask_set_cpu(cpu, irqmsk); +- cpus_per_vec--; ++ cpus_per_grp--; + + /* If the cpu has siblings, use them first */ + siblmsk = topology_sibling_cpumask(cpu); +- for (sibl = -1; cpus_per_vec > 0; ) { ++ for (sibl = -1; cpus_per_grp > 0; ) { + sibl = cpumask_next(sibl, siblmsk); + if (sibl >= nr_cpu_ids) + break; + if (!cpumask_test_and_clear_cpu(sibl, nmsk)) + continue; + cpumask_set_cpu(sibl, irqmsk); +- cpus_per_vec--; ++ cpus_per_grp--; + } + } + } +@@ -95,48 +95,48 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, + return nodes; + } + +-struct node_vectors { ++struct node_groups { + unsigned id; + + union { +- unsigned nvectors; ++ unsigned ngroups; + unsigned ncpus; + }; + }; + + static int ncpus_cmp_func(const void *l, const void *r) + { +- const struct node_vectors *ln = l; +- const struct node_vectors *rn = r; ++ const struct node_groups *ln = l; ++ const struct node_groups *rn = r; + + return ln->ncpus - rn->ncpus; + } + + /* +- * Allocate vector number for each node, so that for each node: ++ * Allocate group number for each node, so that for each node: + * + * 1) the allocated number is >= 1 + * +- * 2) the allocated numbver is <= active CPU number of this node ++ * 2) the allocated number is <= active CPU number of this node + * +- * The actual allocated total vectors may be less than @numvecs when +- * active total CPU number is less than @numvecs. ++ * The actual allocated total groups may be less than @numgrps when ++ * active total CPU number is less than @numgrps. + * + * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]' + * for each node. 
+ */ +-static void alloc_nodes_vectors(unsigned int numvecs, +- cpumask_var_t *node_to_cpumask, +- const struct cpumask *cpu_mask, +- const nodemask_t nodemsk, +- struct cpumask *nmsk, +- struct node_vectors *node_vectors) ++static void alloc_nodes_groups(unsigned int numgrps, ++ cpumask_var_t *node_to_cpumask, ++ const struct cpumask *cpu_mask, ++ const nodemask_t nodemsk, ++ struct cpumask *nmsk, ++ struct node_groups *node_groups) + { + unsigned n, remaining_ncpus = 0; + + for (n = 0; n < nr_node_ids; n++) { +- node_vectors[n].id = n; +- node_vectors[n].ncpus = UINT_MAX; ++ node_groups[n].id = n; ++ node_groups[n].ncpus = UINT_MAX; + } + + for_each_node_mask(n, nodemsk) { +@@ -148,61 +148,61 @@ static void alloc_nodes_vectors(unsigned int numvecs, + if (!ncpus) + continue; + remaining_ncpus += ncpus; +- node_vectors[n].ncpus = ncpus; ++ node_groups[n].ncpus = ncpus; + } + +- numvecs = min_t(unsigned, remaining_ncpus, numvecs); ++ numgrps = min_t(unsigned, remaining_ncpus, numgrps); + +- sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]), ++ sort(node_groups, nr_node_ids, sizeof(node_groups[0]), + ncpus_cmp_func, NULL); + + /* +- * Allocate vectors for each node according to the ratio of this +- * node's nr_cpus to remaining un-assigned ncpus. 'numvecs' is ++ * Allocate groups for each node according to the ratio of this ++ * node's nr_cpus to remaining un-assigned ncpus. 'numgrps' is + * bigger than number of active numa nodes. Always start the + * allocation from the node with minimized nr_cpus. + * + * This way guarantees that each active node gets allocated at +- * least one vector, and the theory is simple: over-allocation +- * is only done when this node is assigned by one vector, so +- * other nodes will be allocated >= 1 vector, since 'numvecs' is ++ * least one group, and the theory is simple: over-allocation ++ * is only done when this node is assigned by one group, so ++ * other nodes will be allocated >= 1 groups, since 'numgrps' is + * bigger than number of numa nodes. 
+ * +- * One perfect invariant is that number of allocated vectors for ++ * One perfect invariant is that number of allocated groups for + * each node is <= CPU count of this node: + * + * 1) suppose there are two nodes: A and B + * ncpu(X) is CPU count of node X +- * vecs(X) is the vector count allocated to node X via this ++ * grps(X) is the group count allocated to node X via this + * algorithm + * + * ncpu(A) <= ncpu(B) + * ncpu(A) + ncpu(B) = N +- * vecs(A) + vecs(B) = V ++ * grps(A) + grps(B) = G + * +- * vecs(A) = max(1, round_down(V * ncpu(A) / N)) +- * vecs(B) = V - vecs(A) ++ * grps(A) = max(1, round_down(G * ncpu(A) / N)) ++ * grps(B) = G - grps(A) + * +- * both N and V are integer, and 2 <= V <= N, suppose +- * V = N - delta, and 0 <= delta <= N - 2 ++ * both N and G are integer, and 2 <= G <= N, suppose ++ * G = N - delta, and 0 <= delta <= N - 2 + * +- * 2) obviously vecs(A) <= ncpu(A) because: ++ * 2) obviously grps(A) <= ncpu(A) because: + * +- * if vecs(A) is 1, then vecs(A) <= ncpu(A) given ++ * if grps(A) is 1, then grps(A) <= ncpu(A) given + * ncpu(A) >= 1 + * + * otherwise, +- * vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N ++ * grps(A) <= G * ncpu(A) / N <= ncpu(A), given G <= N + * +- * 3) prove how vecs(B) <= ncpu(B): ++ * 3) prove how grps(B) <= ncpu(B): + * +- * if round_down(V * ncpu(A) / N) == 0, vecs(B) won't be +- * over-allocated, so vecs(B) <= ncpu(B), ++ * if round_down(G * ncpu(A) / N) == 0, vecs(B) won't be ++ * over-allocated, so grps(B) <= ncpu(B), + * + * otherwise: + * +- * vecs(A) = +- * round_down(V * ncpu(A) / N) = ++ * grps(A) = ++ * round_down(G * ncpu(A) / N) = + * round_down((N - delta) * ncpu(A) / N) = + * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >= + * round_down((N * ncpu(A) - delta * N) / N) = +@@ -210,52 +210,50 @@ static void alloc_nodes_vectors(unsigned int numvecs, + * + * then: + * +- * vecs(A) - V >= ncpu(A) - delta - V ++ * grps(A) - G >= ncpu(A) - delta - G + * => +- * V - vecs(A) <= V + delta - ncpu(A) ++ * G - grps(A) <= G + delta - ncpu(A) + * => +- * vecs(B) <= N - ncpu(A) ++ * grps(B) <= N - ncpu(A) + * => +- * vecs(B) <= cpu(B) ++ * grps(B) <= cpu(B) + * + * For nodes >= 3, it can be thought as one node and another big + * node given that is exactly what this algorithm is implemented, +- * and we always re-calculate 'remaining_ncpus' & 'numvecs', and +- * finally for each node X: vecs(X) <= ncpu(X). ++ * and we always re-calculate 'remaining_ncpus' & 'numgrps', and ++ * finally for each node X: grps(X) <= ncpu(X). 
+ * + */ + for (n = 0; n < nr_node_ids; n++) { +- unsigned nvectors, ncpus; ++ unsigned ngroups, ncpus; + +- if (node_vectors[n].ncpus == UINT_MAX) ++ if (node_groups[n].ncpus == UINT_MAX) + continue; + +- WARN_ON_ONCE(numvecs == 0); ++ WARN_ON_ONCE(numgrps == 0); + +- ncpus = node_vectors[n].ncpus; +- nvectors = max_t(unsigned, 1, +- numvecs * ncpus / remaining_ncpus); +- WARN_ON_ONCE(nvectors > ncpus); ++ ncpus = node_groups[n].ncpus; ++ ngroups = max_t(unsigned, 1, ++ numgrps * ncpus / remaining_ncpus); ++ WARN_ON_ONCE(ngroups > ncpus); + +- node_vectors[n].nvectors = nvectors; ++ node_groups[n].ngroups = ngroups; + + remaining_ncpus -= ncpus; +- numvecs -= nvectors; ++ numgrps -= ngroups; + } + } + +-static int __irq_build_affinity_masks(unsigned int startvec, +- unsigned int numvecs, +- cpumask_var_t *node_to_cpumask, +- const struct cpumask *cpu_mask, +- struct cpumask *nmsk, +- struct cpumask *masks) ++static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps, ++ cpumask_var_t *node_to_cpumask, ++ const struct cpumask *cpu_mask, ++ struct cpumask *nmsk, struct cpumask *masks) + { +- unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0; +- unsigned int last_affv = numvecs; +- unsigned int curvec = startvec; ++ unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0; ++ unsigned int last_grp = numgrps; ++ unsigned int curgrp = startgrp; + nodemask_t nodemsk = NODE_MASK_NONE; +- struct node_vectors *node_vectors; ++ struct node_groups *node_groups; + + if (cpumask_empty(cpu_mask)) + return 0; +@@ -264,34 +262,33 @@ static int __irq_build_affinity_masks(unsigned int startvec, + + /* + * If the number of nodes in the mask is greater than or equal the +- * number of vectors we just spread the vectors across the nodes. ++ * number of groups we just spread the groups across the nodes. 
+ */ +- if (numvecs <= nodes) { ++ if (numgrps <= nodes) { + for_each_node_mask(n, nodemsk) { + /* Ensure that only CPUs which are in both masks are set */ + cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); +- cpumask_or(&masks[curvec], &masks[curvec], nmsk); +- if (++curvec == last_affv) +- curvec = 0; ++ cpumask_or(&masks[curgrp], &masks[curgrp], nmsk); ++ if (++curgrp == last_grp) ++ curgrp = 0; + } +- return numvecs; ++ return numgrps; + } + +- node_vectors = kcalloc(nr_node_ids, +- sizeof(struct node_vectors), ++ node_groups = kcalloc(nr_node_ids, ++ sizeof(struct node_groups), + GFP_KERNEL); +- if (!node_vectors) ++ if (!node_groups) + return -ENOMEM; + +- /* allocate vector number for each node */ +- alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask, +- nodemsk, nmsk, node_vectors); +- ++ /* allocate group number for each node */ ++ alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask, ++ nodemsk, nmsk, node_groups); + for (i = 0; i < nr_node_ids; i++) { + unsigned int ncpus, v; +- struct node_vectors *nv = &node_vectors[i]; ++ struct node_groups *nv = &node_groups[i]; + +- if (nv->nvectors == UINT_MAX) ++ if (nv->ngroups == UINT_MAX) + continue; + + /* Get the cpus on this node which are in the mask */ +@@ -300,44 +297,47 @@ static int __irq_build_affinity_masks(unsigned int startvec, + if (!ncpus) + continue; + +- WARN_ON_ONCE(nv->nvectors > ncpus); ++ WARN_ON_ONCE(nv->ngroups > ncpus); + + /* Account for rounding errors */ +- extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors); ++ extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups); + +- /* Spread allocated vectors on CPUs of the current node */ +- for (v = 0; v < nv->nvectors; v++, curvec++) { +- cpus_per_vec = ncpus / nv->nvectors; ++ /* Spread allocated groups on CPUs of the current node */ ++ for (v = 0; v < nv->ngroups; v++, curgrp++) { ++ cpus_per_grp = ncpus / nv->ngroups; + +- /* Account for extra vectors to compensate rounding errors */ +- if (extra_vecs) { +- cpus_per_vec++; +- --extra_vecs; ++ /* Account for extra groups to compensate rounding errors */ ++ if (extra_grps) { ++ cpus_per_grp++; ++ --extra_grps; + } + + /* +- * wrapping has to be considered given 'startvec' ++ * wrapping has to be considered given 'startgrp' + * may start anywhere + */ +- if (curvec >= last_affv) +- curvec = 0; +- irq_spread_init_one(&masks[curvec], nmsk, +- cpus_per_vec); ++ if (curgrp >= last_grp) ++ curgrp = 0; ++ grp_spread_init_one(&masks[curgrp], nmsk, ++ cpus_per_grp); + } +- done += nv->nvectors; ++ done += nv->ngroups; + } +- kfree(node_vectors); ++ kfree(node_groups); + return done; + } + + /* +- * build affinity in two stages: +- * 1) spread present CPU on these vectors +- * 2) spread other possible CPUs on these vectors ++ * build affinity in two stages for each group, and try to put close CPUs ++ * in viewpoint of CPU and NUMA locality into same group, and we run ++ * two-stage grouping: ++ * ++ * 1) allocate present CPUs on these groups evenly first ++ * 2) allocate other possible CPUs on these groups evenly + */ +-static struct cpumask *irq_build_affinity_masks(unsigned int numvecs) ++static struct cpumask *group_cpus_evenly(unsigned int numgrps) + { +- unsigned int curvec = 0, nr_present = 0, nr_others = 0; ++ unsigned int curgrp = 0, nr_present = 0, nr_others = 0; + cpumask_var_t *node_to_cpumask; + cpumask_var_t nmsk, npresmsk; + int ret = -ENOMEM; +@@ -353,7 +353,7 @@ static struct cpumask *irq_build_affinity_masks(unsigned int numvecs) + if (!node_to_cpumask) + goto fail_npresmsk; + +- masks = 
kcalloc(numvecs, sizeof(*masks), GFP_KERNEL); ++ masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL); + if (!masks) + goto fail_node_to_cpumask; + +@@ -361,26 +361,26 @@ static struct cpumask *irq_build_affinity_masks(unsigned int numvecs) + cpus_read_lock(); + build_node_to_cpumask(node_to_cpumask); + +- /* Spread on present CPUs starting from affd->pre_vectors */ +- ret = __irq_build_affinity_masks(curvec, numvecs, node_to_cpumask, +- cpu_present_mask, nmsk, masks); ++ /* grouping present CPUs first */ ++ ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, ++ cpu_present_mask, nmsk, masks); + if (ret < 0) + goto fail_build_affinity; + nr_present = ret; + + /* +- * Spread on non present CPUs starting from the next vector to be +- * handled. If the spreading of present CPUs already exhausted the +- * vector space, assign the non present CPUs to the already spread +- * out vectors. ++ * Allocate non present CPUs starting from the next group to be ++ * handled. If the grouping of present CPUs already exhausted the ++ * group space, assign the non present CPUs to the already ++ * allocated out groups. + */ +- if (nr_present >= numvecs) +- curvec = 0; ++ if (nr_present >= numgrps) ++ curgrp = 0; + else +- curvec = nr_present; ++ curgrp = nr_present; + cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); +- ret = __irq_build_affinity_masks(curvec, numvecs, node_to_cpumask, +- npresmsk, nmsk, masks); ++ ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, ++ npresmsk, nmsk, masks); + if (ret >= 0) + nr_others = ret; + +@@ -388,7 +388,7 @@ static struct cpumask *irq_build_affinity_masks(unsigned int numvecs) + cpus_read_unlock(); + + if (ret >= 0) +- WARN_ON(nr_present + nr_others < numvecs); ++ WARN_ON(nr_present + nr_others < numgrps); + + fail_node_to_cpumask: + free_node_to_cpumask(node_to_cpumask); +@@ -467,7 +467,7 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) + for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) { + unsigned int this_vecs = affd->set_size[i]; + int j; +- struct cpumask *result = irq_build_affinity_masks(this_vecs); ++ struct cpumask *result = group_cpus_evenly(this_vecs); + + if (!result) { + kfree(masks); +-- +2.43.0 + diff --git a/queue-6.1/i40e-fix-filter-input-checks-to-prevent-config-with-.patch b/queue-6.1/i40e-fix-filter-input-checks-to-prevent-config-with-.patch new file mode 100644 index 00000000000..7c1ad181b0a --- /dev/null +++ b/queue-6.1/i40e-fix-filter-input-checks-to-prevent-config-with-.patch @@ -0,0 +1,53 @@ +From 0ddfc8bc46129c7a83ae4cf6d0cc4063fbfc2355 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 29 Nov 2023 11:23:11 +0100 +Subject: i40e: Fix filter input checks to prevent config with invalid values + +From: Sudheer Mogilappagari + +[ Upstream commit 3e48041d9820c17e0a51599d12e66c6e12a8d08d ] + +Prevent VF from configuring filters with unsupported actions or use +REDIRECT action with invalid tc number. Current checks could cause +out of bounds access on PF side. 
+ +Fixes: e284fc280473 ("i40e: Add and delete cloud filter") +Reviewed-by: Andrii Staikov +Signed-off-by: Sudheer Mogilappagari +Signed-off-by: Aleksandr Loktionov +Reviewed-by: Simon Horman +Tested-by: Bharathi Sreenivas +Signed-off-by: Tony Nguyen +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +index cb925baf72ce0..3c38129a5224a 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +@@ -3451,16 +3451,16 @@ static int i40e_validate_cloud_filter(struct i40e_vf *vf, + bool found = false; + int bkt; + +- if (!tc_filter->action) { ++ if (tc_filter->action != VIRTCHNL_ACTION_TC_REDIRECT) { + dev_info(&pf->pdev->dev, +- "VF %d: Currently ADq doesn't support Drop Action\n", +- vf->vf_id); ++ "VF %d: ADQ doesn't support this action (%d)\n", ++ vf->vf_id, tc_filter->action); + goto err; + } + + /* action_meta is TC number here to which the filter is applied */ + if (!tc_filter->action_meta || +- tc_filter->action_meta > I40E_MAX_VF_VSI) { ++ tc_filter->action_meta > vf->num_tc) { + dev_info(&pf->pdev->dev, "VF %d: Invalid TC number %u\n", + vf->vf_id, tc_filter->action_meta); + goto err; +-- +2.43.0 + diff --git a/queue-6.1/i40e-fix-use-after-free-in-i40e_aqc_add_filters.patch b/queue-6.1/i40e-fix-use-after-free-in-i40e_aqc_add_filters.patch new file mode 100644 index 00000000000..c71b68fa12d --- /dev/null +++ b/queue-6.1/i40e-fix-use-after-free-in-i40e_aqc_add_filters.patch @@ -0,0 +1,120 @@ +From 20287328081684e38abeda69f42fe548148fc294 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 18 Dec 2023 15:08:50 +0800 +Subject: i40e: fix use-after-free in i40e_aqc_add_filters() + +From: Ke Xiao + +[ Upstream commit 6a15584e99db8918b60e507539c7446375dcf366 ] + +Commit 3116f59c12bd ("i40e: fix use-after-free in +i40e_sync_filters_subtask()") avoided use-after-free issues, +by increasing refcount during update the VSI filter list to +the HW. However, it missed the unicast situation. + +When deleting an unicast FDB entry, the i40e driver will release +the mac_filter, and i40e_service_task will concurrently request +firmware to add the mac_filter, which will lead to the following +use-after-free issue. + +Fix again for both netdev->uc and netdev->mc. 
+ +BUG: KASAN: use-after-free in i40e_aqc_add_filters+0x55c/0x5b0 [i40e] +Read of size 2 at addr ffff888eb3452d60 by task kworker/8:7/6379 + +CPU: 8 PID: 6379 Comm: kworker/8:7 Kdump: loaded Tainted: G +Workqueue: i40e i40e_service_task [i40e] +Call Trace: + dump_stack+0x71/0xab + print_address_description+0x6b/0x290 + kasan_report+0x14a/0x2b0 + i40e_aqc_add_filters+0x55c/0x5b0 [i40e] + i40e_sync_vsi_filters+0x1676/0x39c0 [i40e] + i40e_service_task+0x1397/0x2bb0 [i40e] + process_one_work+0x56a/0x11f0 + worker_thread+0x8f/0xf40 + kthread+0x2a0/0x390 + ret_from_fork+0x1f/0x40 + +Allocated by task 21948: + kasan_kmalloc+0xa6/0xd0 + kmem_cache_alloc_trace+0xdb/0x1c0 + i40e_add_filter+0x11e/0x520 [i40e] + i40e_addr_sync+0x37/0x60 [i40e] + __hw_addr_sync_dev+0x1f5/0x2f0 + i40e_set_rx_mode+0x61/0x1e0 [i40e] + dev_uc_add_excl+0x137/0x190 + i40e_ndo_fdb_add+0x161/0x260 [i40e] + rtnl_fdb_add+0x567/0x950 + rtnetlink_rcv_msg+0x5db/0x880 + netlink_rcv_skb+0x254/0x380 + netlink_unicast+0x454/0x610 + netlink_sendmsg+0x747/0xb00 + sock_sendmsg+0xe2/0x120 + __sys_sendto+0x1ae/0x290 + __x64_sys_sendto+0xdd/0x1b0 + do_syscall_64+0xa0/0x370 + entry_SYSCALL_64_after_hwframe+0x65/0xca + +Freed by task 21948: + __kasan_slab_free+0x137/0x190 + kfree+0x8b/0x1b0 + __i40e_del_filter+0x116/0x1e0 [i40e] + i40e_del_mac_filter+0x16c/0x300 [i40e] + i40e_addr_unsync+0x134/0x1b0 [i40e] + __hw_addr_sync_dev+0xff/0x2f0 + i40e_set_rx_mode+0x61/0x1e0 [i40e] + dev_uc_del+0x77/0x90 + rtnl_fdb_del+0x6a5/0x860 + rtnetlink_rcv_msg+0x5db/0x880 + netlink_rcv_skb+0x254/0x380 + netlink_unicast+0x454/0x610 + netlink_sendmsg+0x747/0xb00 + sock_sendmsg+0xe2/0x120 + __sys_sendto+0x1ae/0x290 + __x64_sys_sendto+0xdd/0x1b0 + do_syscall_64+0xa0/0x370 + entry_SYSCALL_64_after_hwframe+0x65/0xca + +Fixes: 3116f59c12bd ("i40e: fix use-after-free in i40e_sync_filters_subtask()") +Fixes: 41c445ff0f48 ("i40e: main driver core") +Signed-off-by: Ke Xiao +Signed-off-by: Ding Hui +Cc: Di Zhu +Reviewed-by: Jan Sokolowski +Reviewed-by: Simon Horman +Reviewed-by: Jacob Keller +Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) +Signed-off-by: Tony Nguyen +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/i40e/i40e_main.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c +index b4157ff370a31..cdc68b78bd9ea 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_main.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_main.c +@@ -104,12 +104,18 @@ static struct workqueue_struct *i40e_wq; + static void netdev_hw_addr_refcnt(struct i40e_mac_filter *f, + struct net_device *netdev, int delta) + { ++ struct netdev_hw_addr_list *ha_list; + struct netdev_hw_addr *ha; + + if (!f || !netdev) + return; + +- netdev_for_each_mc_addr(ha, netdev) { ++ if (is_unicast_ether_addr(f->macaddr) || is_link_local_ether_addr(f->macaddr)) ++ ha_list = &netdev->uc; ++ else ++ ha_list = &netdev->mc; ++ ++ netdev_hw_addr_list_for_each(ha, ha_list) { + if (ether_addr_equal(ha->addr, f->macaddr)) { + ha->refcount += delta; + if (ha->refcount <= 0) +-- +2.43.0 + diff --git a/queue-6.1/i40e-restore-vf-msi-x-state-during-pci-reset.patch b/queue-6.1/i40e-restore-vf-msi-x-state-during-pci-reset.patch new file mode 100644 index 00000000000..3bf1bef6cca --- /dev/null +++ b/queue-6.1/i40e-restore-vf-msi-x-state-during-pci-reset.patch @@ -0,0 +1,104 @@ +From 7b1f4a98a68f67ebaea752502865a2679eea1b6f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 21 Dec 2023 
14:27:35 +0100 +Subject: i40e: Restore VF MSI-X state during PCI reset + +From: Andrii Staikov + +[ Upstream commit 371e576ff3e8580d91d49026e5d5faebf5565558 ] + +During a PCI FLR the MSI-X Enable flag in the VF PCI MSI-X capability +register will be cleared. This can lead to issues when a VF is +assigned to a VM because in these cases the VF driver receives no +indication of the PF PCI error/reset and additionally it is incapable +of restoring the cleared flag in the hypervisor configuration space +without fully reinitializing the driver interrupt functionality. + +Since the VF driver is unable to easily resolve this condition on its own, +restore the VF MSI-X flag during the PF PCI reset handling. + +Fixes: 19b7960b2da1 ("i40e: implement split PCI error reset handler") +Co-developed-by: Karen Ostrowska +Signed-off-by: Karen Ostrowska +Co-developed-by: Mateusz Palczewski +Signed-off-by: Mateusz Palczewski +Reviewed-by: Wojciech Drewek +Reviewed-by: Przemek Kitszel +Signed-off-by: Andrii Staikov +Tested-by: Rafal Romanowski +Signed-off-by: Tony Nguyen +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/i40e/i40e_main.c | 3 +++ + .../ethernet/intel/i40e/i40e_virtchnl_pf.c | 26 +++++++++++++++++++ + .../ethernet/intel/i40e/i40e_virtchnl_pf.h | 3 +++ + 3 files changed, 32 insertions(+) + +diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c +index cdc68b78bd9ea..63d43ef86f9b9 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_main.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_main.c +@@ -16450,6 +16450,9 @@ static void i40e_pci_error_reset_done(struct pci_dev *pdev) + return; + + i40e_reset_and_rebuild(pf, false, false); ++#ifdef CONFIG_PCI_IOV ++ i40e_restore_all_vfs_msi_state(pdev); ++#endif /* CONFIG_PCI_IOV */ + } + + /** +diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +index 3c38129a5224a..c7d761426d6ce 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +@@ -152,6 +152,32 @@ void i40e_vc_notify_reset(struct i40e_pf *pf) + (u8 *)&pfe, sizeof(struct virtchnl_pf_event)); + } + ++#ifdef CONFIG_PCI_IOV ++void i40e_restore_all_vfs_msi_state(struct pci_dev *pdev) ++{ ++ u16 vf_id; ++ u16 pos; ++ ++ /* Continue only if this is a PF */ ++ if (!pdev->is_physfn) ++ return; ++ ++ if (!pci_num_vf(pdev)) ++ return; ++ ++ pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_SRIOV); ++ if (pos) { ++ struct pci_dev *vf_dev = NULL; ++ ++ pci_read_config_word(pdev, pos + PCI_SRIOV_VF_DID, &vf_id); ++ while ((vf_dev = pci_get_device(pdev->vendor, vf_id, vf_dev))) { ++ if (vf_dev->is_virtfn && vf_dev->physfn == pdev) ++ pci_restore_msi_state(vf_dev); ++ } ++ } ++} ++#endif /* CONFIG_PCI_IOV */ ++ + /** + * i40e_vc_notify_vf_reset + * @vf: pointer to the VF structure +diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h +index 358bbdb587951..bd497cc5303a1 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h ++++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h +@@ -135,6 +135,9 @@ int i40e_ndo_set_vf_spoofchk(struct net_device *netdev, int vf_id, bool enable); + + void i40e_vc_notify_link_state(struct i40e_pf *pf); + void i40e_vc_notify_reset(struct i40e_pf *pf); ++#ifdef CONFIG_PCI_IOV ++void i40e_restore_all_vfs_msi_state(struct pci_dev *pdev); ++#endif /* CONFIG_PCI_IOV */ + int i40e_get_vf_stats(struct net_device *netdev, int 
vf_id, + struct ifla_vf_stats *vf_stats); + +-- +2.43.0 + diff --git a/queue-6.1/ice-fix-link_down_on_close-message.patch b/queue-6.1/ice-fix-link_down_on_close-message.patch new file mode 100644 index 00000000000..e9926e9de63 --- /dev/null +++ b/queue-6.1/ice-fix-link_down_on_close-message.patch @@ -0,0 +1,55 @@ +From 2dd7c71e40d1a2ab164d9905c6bf8e507590d539 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 15 Dec 2023 12:01:56 +0100 +Subject: ice: Fix link_down_on_close message + +From: Katarzyna Wieczerzycka + +[ Upstream commit 6a8d8bb55e7001de2d50920381cc858f3a3e9fb7 ] + +The driver should not report an error message when for a medialess port +the link_down_on_close flag is enabled and the physical link cannot be +set down. + +Fixes: 8ac7132704f3 ("ice: Fix interface being down after reset with link-down-on-close flag on") +Reviewed-by: Przemek Kitszel +Signed-off-by: Katarzyna Wieczerzycka +Signed-off-by: Wojciech Drewek +Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) +Signed-off-by: Tony Nguyen +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/ice/ice_main.c | 10 +++++++--- + 1 file changed, 7 insertions(+), 3 deletions(-) + +diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c +index f0f39364819ac..5eb3b80b293c0 100644 +--- a/drivers/net/ethernet/intel/ice/ice_main.c ++++ b/drivers/net/ethernet/intel/ice/ice_main.c +@@ -2138,7 +2138,7 @@ static int ice_configure_phy(struct ice_vsi *vsi) + + /* Ensure we have media as we cannot configure a medialess port */ + if (!(phy->link_info.link_info & ICE_AQ_MEDIA_AVAILABLE)) +- return -EPERM; ++ return -ENOMEDIUM; + + ice_print_topo_conflict(vsi); + +@@ -9065,8 +9065,12 @@ int ice_stop(struct net_device *netdev) + int link_err = ice_force_phys_link_state(vsi, false); + + if (link_err) { +- netdev_err(vsi->netdev, "Failed to set physical link down, VSI %d error %d\n", +- vsi->vsi_num, link_err); ++ if (link_err == -ENOMEDIUM) ++ netdev_info(vsi->netdev, "Skipping link reconfig - no media attached, VSI %d\n", ++ vsi->vsi_num); ++ else ++ netdev_err(vsi->netdev, "Failed to set physical link down, VSI %d error %d\n", ++ vsi->vsi_num, link_err); + return -EIO; + } + } +-- +2.43.0 + diff --git a/queue-6.1/ice-shut-down-vsi-with-link-down-on-close-enabled.patch b/queue-6.1/ice-shut-down-vsi-with-link-down-on-close-enabled.patch new file mode 100644 index 00000000000..b95ffe09596 --- /dev/null +++ b/queue-6.1/ice-shut-down-vsi-with-link-down-on-close-enabled.patch @@ -0,0 +1,40 @@ +From f1ef60049882de4af95c17ef50adb9017ecbaa09 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 15 Dec 2023 12:01:57 +0100 +Subject: ice: Shut down VSI with "link-down-on-close" enabled + +From: Ngai-Mint Kwan + +[ Upstream commit 6d05ff55ef4f4954d28551236239f297bd52ea48 ] + +Disabling netdev with ethtool private flag "link-down-on-close" enabled +can cause NULL pointer dereference bug. Shut down VSI regardless of +"link-down-on-close" state. 
+ +Fixes: 8ac7132704f3 ("ice: Fix interface being down after reset with link-down-on-close flag on") +Reviewed-by: Przemek Kitszel +Signed-off-by: Ngai-Mint Kwan +Signed-off-by: Wojciech Drewek +Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) +Signed-off-by: Tony Nguyen +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/ice/ice_main.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c +index 5eb3b80b293c0..ab46cfca4028d 100644 +--- a/drivers/net/ethernet/intel/ice/ice_main.c ++++ b/drivers/net/ethernet/intel/ice/ice_main.c +@@ -9071,6 +9071,8 @@ int ice_stop(struct net_device *netdev) + else + netdev_err(vsi->netdev, "Failed to set physical link down, VSI %d error %d\n", + vsi->vsi_num, link_err); ++ ++ ice_vsi_close(vsi); + return -EIO; + } + } +-- +2.43.0 + diff --git a/queue-6.1/igc-check-vlan-ethertype-mask.patch b/queue-6.1/igc-check-vlan-ethertype-mask.patch new file mode 100644 index 00000000000..a79d1ce04e6 --- /dev/null +++ b/queue-6.1/igc-check-vlan-ethertype-mask.patch @@ -0,0 +1,72 @@ +From e09a381b3b1d4fa9bd86c9d51bbd7c9766cc671a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 6 Dec 2023 15:07:18 +0100 +Subject: igc: Check VLAN EtherType mask + +From: Kurt Kanzenbach + +[ Upstream commit 7afd49a38e73afd57ff62c8d1cf5af760c4d49c0 ] + +Currently the driver accepts VLAN EtherType steering rules regardless of +the configured mask. And things might fail silently or with confusing error +messages to the user. The VLAN EtherType can only be matched by full +mask. Therefore, add a check for that. + +For instance the following rule is invalid, but the driver accepts it and +ignores the user specified mask: +|root@host:~# ethtool -N enp3s0 flow-type ether vlan-etype 0x8100 \ +| m 0x00ff action 0 +|Added rule with ID 63 +|root@host:~# ethtool --show-ntuple enp3s0 +|4 RX rings available +|Total 1 rules +| +|Filter: 63 +| Flow Type: Raw Ethernet +| Src MAC addr: 00:00:00:00:00:00 mask: FF:FF:FF:FF:FF:FF +| Dest MAC addr: 00:00:00:00:00:00 mask: FF:FF:FF:FF:FF:FF +| Ethertype: 0x0 mask: 0xFFFF +| VLAN EtherType: 0x8100 mask: 0x0 +| VLAN: 0x0 mask: 0xffff +| User-defined: 0x0 mask: 0xffffffffffffffff +| Action: Direct to queue 0 + +After: +|root@host:~# ethtool -N enp3s0 flow-type ether vlan-etype 0x8100 \ +| m 0x00ff action 0 +|rmgr: Cannot insert RX class rule: Operation not supported + +Fixes: 2b477d057e33 ("igc: Integrate flex filter into ethtool ops") +Suggested-by: Suman Ghosh +Signed-off-by: Kurt Kanzenbach +Acked-by: Vinicius Costa Gomes +Reviewed-by: Simon Horman +Tested-by: Naama Meir +Signed-off-by: Tony Nguyen +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/igc/igc_ethtool.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c +index e146357d61a8a..2bee9cace5983 100644 +--- a/drivers/net/ethernet/intel/igc/igc_ethtool.c ++++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c +@@ -1356,6 +1356,14 @@ static int igc_ethtool_add_nfc_rule(struct igc_adapter *adapter, + return -EOPNOTSUPP; + } + ++ /* VLAN EtherType can only be matched by full mask. 
*/ ++ if ((fsp->flow_type & FLOW_EXT) && ++ fsp->m_ext.vlan_etype && ++ fsp->m_ext.vlan_etype != ETHER_TYPE_FULL_MASK) { ++ netdev_dbg(netdev, "VLAN EtherType mask not supported\n"); ++ return -EOPNOTSUPP; ++ } ++ + if (fsp->location >= IGC_MAX_RXNFC_RULES) { + netdev_dbg(netdev, "Invalid location\n"); + return -EINVAL; +-- +2.43.0 + diff --git a/queue-6.1/igc-check-vlan-tci-mask.patch b/queue-6.1/igc-check-vlan-tci-mask.patch new file mode 100644 index 00000000000..164e81e1df9 --- /dev/null +++ b/queue-6.1/igc-check-vlan-tci-mask.patch @@ -0,0 +1,141 @@ +From ab151d4a86bceafa58b773d11dd768f176a291af Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 1 Dec 2023 08:50:43 +0100 +Subject: igc: Check VLAN TCI mask + +From: Kurt Kanzenbach + +[ Upstream commit b5063cbe148b829e8eb97672c2cbccc058835476 ] + +Currently the driver accepts VLAN TCI steering rules regardless of the +configured mask. And things might fail silently or with confusing error +messages to the user. + +There are two ways to handle the VLAN TCI mask: + + 1. Match on the PCP field using a VLAN prio filter + 2. Match on complete TCI field using a flex filter + +Therefore, add checks and code for that. + +For instance the following rule is invalid and will be converted into a +VLAN prio rule which is not correct: +|root@host:~# ethtool -N enp3s0 flow-type ether vlan 0x0001 m 0xf000 \ +| action 1 +|Added rule with ID 61 +|root@host:~# ethtool --show-ntuple enp3s0 +|4 RX rings available +|Total 1 rules +| +|Filter: 61 +| Flow Type: Raw Ethernet +| Src MAC addr: 00:00:00:00:00:00 mask: FF:FF:FF:FF:FF:FF +| Dest MAC addr: 00:00:00:00:00:00 mask: FF:FF:FF:FF:FF:FF +| Ethertype: 0x0 mask: 0xFFFF +| VLAN EtherType: 0x0 mask: 0xffff +| VLAN: 0x1 mask: 0x1fff +| User-defined: 0x0 mask: 0xffffffffffffffff +| Action: Direct to queue 1 + +After: +|root@host:~# ethtool -N enp3s0 flow-type ether vlan 0x0001 m 0xf000 \ +| action 1 +|rmgr: Cannot insert RX class rule: Operation not supported + +Fixes: 7991487ecb2d ("igc: Allow for Flex Filters to be installed") +Signed-off-by: Kurt Kanzenbach +Acked-by: Vinicius Costa Gomes +Reviewed-by: Simon Horman +Tested-by: Naama Meir +Signed-off-by: Tony Nguyen +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/igc/igc.h | 1 + + drivers/net/ethernet/intel/igc/igc_ethtool.c | 28 +++++++++++++++++--- + 2 files changed, 26 insertions(+), 3 deletions(-) + +diff --git a/drivers/net/ethernet/intel/igc/igc.h b/drivers/net/ethernet/intel/igc/igc.h +index 43c05b41627f7..2a894ca49d93b 100644 +--- a/drivers/net/ethernet/intel/igc/igc.h ++++ b/drivers/net/ethernet/intel/igc/igc.h +@@ -538,6 +538,7 @@ struct igc_nfc_filter { + u16 etype; + __be16 vlan_etype; + u16 vlan_tci; ++ u16 vlan_tci_mask; + u8 src_addr[ETH_ALEN]; + u8 dst_addr[ETH_ALEN]; + u8 user_data[8]; +diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c +index 51ef18060dbc4..e146357d61a8a 100644 +--- a/drivers/net/ethernet/intel/igc/igc_ethtool.c ++++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c +@@ -957,6 +957,7 @@ static int igc_ethtool_set_coalesce(struct net_device *netdev, + } + + #define ETHER_TYPE_FULL_MASK ((__force __be16)~0) ++#define VLAN_TCI_FULL_MASK ((__force __be16)~0) + static int igc_ethtool_get_nfc_rule(struct igc_adapter *adapter, + struct ethtool_rxnfc *cmd) + { +@@ -988,7 +989,7 @@ static int igc_ethtool_get_nfc_rule(struct igc_adapter *adapter, + if (rule->filter.match_flags & IGC_FILTER_FLAG_VLAN_TCI) { + fsp->flow_type |= FLOW_EXT; + fsp->h_ext.vlan_tci = 
htons(rule->filter.vlan_tci); +- fsp->m_ext.vlan_tci = htons(VLAN_PRIO_MASK); ++ fsp->m_ext.vlan_tci = htons(rule->filter.vlan_tci_mask); + } + + if (rule->filter.match_flags & IGC_FILTER_FLAG_DST_MAC_ADDR) { +@@ -1223,6 +1224,7 @@ static void igc_ethtool_init_nfc_rule(struct igc_nfc_rule *rule, + + if ((fsp->flow_type & FLOW_EXT) && fsp->m_ext.vlan_tci) { + rule->filter.vlan_tci = ntohs(fsp->h_ext.vlan_tci); ++ rule->filter.vlan_tci_mask = ntohs(fsp->m_ext.vlan_tci); + rule->filter.match_flags |= IGC_FILTER_FLAG_VLAN_TCI; + } + +@@ -1260,11 +1262,19 @@ static void igc_ethtool_init_nfc_rule(struct igc_nfc_rule *rule, + memcpy(rule->filter.user_mask, fsp->m_ext.data, sizeof(fsp->m_ext.data)); + } + +- /* When multiple filter options or user data or vlan etype is set, use a +- * flex filter. ++ /* The i225/i226 has various different filters. Flex filters provide a ++ * way to match up to the first 128 bytes of a packet. Use them for: ++ * a) For specific user data ++ * b) For VLAN EtherType ++ * c) For full TCI match ++ * d) Or in case multiple filter criteria are set ++ * ++ * Otherwise, use the simple MAC, VLAN PRIO or EtherType filters. + */ + if ((rule->filter.match_flags & IGC_FILTER_FLAG_USER_DATA) || + (rule->filter.match_flags & IGC_FILTER_FLAG_VLAN_ETYPE) || ++ ((rule->filter.match_flags & IGC_FILTER_FLAG_VLAN_TCI) && ++ rule->filter.vlan_tci_mask == ntohs(VLAN_TCI_FULL_MASK)) || + (rule->filter.match_flags & (rule->filter.match_flags - 1))) + rule->flex = true; + else +@@ -1334,6 +1344,18 @@ static int igc_ethtool_add_nfc_rule(struct igc_adapter *adapter, + return -EINVAL; + } + ++ /* There are two ways to match the VLAN TCI: ++ * 1. Match on PCP field and use vlan prio filter for it ++ * 2. Match on complete TCI field and use flex filter for it ++ */ ++ if ((fsp->flow_type & FLOW_EXT) && ++ fsp->m_ext.vlan_tci && ++ fsp->m_ext.vlan_tci != htons(VLAN_PRIO_MASK) && ++ fsp->m_ext.vlan_tci != VLAN_TCI_FULL_MASK) { ++ netdev_dbg(netdev, "VLAN mask not supported\n"); ++ return -EOPNOTSUPP; ++ } ++ + if (fsp->location >= IGC_MAX_RXNFC_RULES) { + netdev_dbg(netdev, "Invalid location\n"); + return -EINVAL; +-- +2.43.0 + diff --git a/queue-6.1/igc-fix-hicredit-calculation.patch b/queue-6.1/igc-fix-hicredit-calculation.patch new file mode 100644 index 00000000000..cc61a83e454 --- /dev/null +++ b/queue-6.1/igc-fix-hicredit-calculation.patch @@ -0,0 +1,45 @@ +From 4b3b14b400fefd4fa7447adb596675bb2e8637e0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 8 Dec 2023 15:58:16 +0100 +Subject: igc: Fix hicredit calculation + +From: Rodrigo Cataldo + +[ Upstream commit 947dfc8138dfaeb6e966e2d661de89eb203e3064 ] + +According to the Intel Software Manual for I225, Section 7.5.2.7, +hicredit should be multiplied by the constant link-rate value, 0x7736. + +Currently, the old constant link-rate value, 0x7735, from the boards +supported on igb are being used, most likely due to a copy'n'paste, as +the rest of the logic is the same for both drivers. + +Update hicredit accordingly. 
+ +Fixes: 1ab011b0bf07 ("igc: Add support for CBS offloading") +Reviewed-by: Kurt Kanzenbach +Signed-off-by: Rodrigo Cataldo +Acked-by: Vinicius Costa Gomes +Tested-by: Naama Meir +Signed-off-by: Tony Nguyen +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/igc/igc_tsn.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/intel/igc/igc_tsn.c b/drivers/net/ethernet/intel/igc/igc_tsn.c +index 725db36e399d2..31ea0781b65ec 100644 +--- a/drivers/net/ethernet/intel/igc/igc_tsn.c ++++ b/drivers/net/ethernet/intel/igc/igc_tsn.c +@@ -178,7 +178,7 @@ static int igc_tsn_enable_offload(struct igc_adapter *adapter) + wr32(IGC_TQAVCC(i), tqavcc); + + wr32(IGC_TQAVHC(i), +- 0x80000000 + ring->hicredit * 0x7735); ++ 0x80000000 + ring->hicredit * 0x7736); + } else { + /* Disable any CBS for the queue */ + txqctl &= ~(IGC_TXQCTL_QAV_SEL_MASK); +-- +2.43.0 + diff --git a/queue-6.1/igc-report-vlan-ethertype-matching-back-to-user.patch b/queue-6.1/igc-report-vlan-ethertype-matching-back-to-user.patch new file mode 100644 index 00000000000..87852f6ba3e --- /dev/null +++ b/queue-6.1/igc-report-vlan-ethertype-matching-back-to-user.patch @@ -0,0 +1,75 @@ +From 0d687ebbf03e0fea5331b2481ed7bc3e89afd878 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 1 Dec 2023 08:50:42 +0100 +Subject: igc: Report VLAN EtherType matching back to user + +From: Kurt Kanzenbach + +[ Upstream commit 088464abd48cf3735aee91f9e211b32da9d81117 ] + +Currently the driver allows to configure matching by VLAN EtherType. +However, the retrieval function does not report it back to the user. Add +it. + +Before: +|root@host:~# ethtool -N enp3s0 flow-type ether vlan-etype 0x8100 action 0 +|Added rule with ID 63 +|root@host:~# ethtool --show-ntuple enp3s0 +|4 RX rings available +|Total 1 rules +| +|Filter: 63 +| Flow Type: Raw Ethernet +| Src MAC addr: 00:00:00:00:00:00 mask: FF:FF:FF:FF:FF:FF +| Dest MAC addr: 00:00:00:00:00:00 mask: FF:FF:FF:FF:FF:FF +| Ethertype: 0x0 mask: 0xFFFF +| Action: Direct to queue 0 + +After: +|root@host:~# ethtool -N enp3s0 flow-type ether vlan-etype 0x8100 action 0 +|Added rule with ID 63 +|root@host:~# ethtool --show-ntuple enp3s0 +|4 RX rings available +|Total 1 rules +| +|Filter: 63 +| Flow Type: Raw Ethernet +| Src MAC addr: 00:00:00:00:00:00 mask: FF:FF:FF:FF:FF:FF +| Dest MAC addr: 00:00:00:00:00:00 mask: FF:FF:FF:FF:FF:FF +| Ethertype: 0x0 mask: 0xFFFF +| VLAN EtherType: 0x8100 mask: 0x0 +| VLAN: 0x0 mask: 0xffff +| User-defined: 0x0 mask: 0xffffffffffffffff +| Action: Direct to queue 0 + +Fixes: 2b477d057e33 ("igc: Integrate flex filter into ethtool ops") +Signed-off-by: Kurt Kanzenbach +Acked-by: Vinicius Costa Gomes +Reviewed-by: Simon Horman +Tested-by: Naama Meir +Signed-off-by: Tony Nguyen +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/igc/igc_ethtool.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/drivers/net/ethernet/intel/igc/igc_ethtool.c b/drivers/net/ethernet/intel/igc/igc_ethtool.c +index 81897f7a90a91..51ef18060dbc4 100644 +--- a/drivers/net/ethernet/intel/igc/igc_ethtool.c ++++ b/drivers/net/ethernet/intel/igc/igc_ethtool.c +@@ -979,6 +979,12 @@ static int igc_ethtool_get_nfc_rule(struct igc_adapter *adapter, + fsp->m_u.ether_spec.h_proto = ETHER_TYPE_FULL_MASK; + } + ++ if (rule->filter.match_flags & IGC_FILTER_FLAG_VLAN_ETYPE) { ++ fsp->flow_type |= FLOW_EXT; ++ fsp->h_ext.vlan_etype = rule->filter.vlan_etype; ++ fsp->m_ext.vlan_etype = ETHER_TYPE_FULL_MASK; ++ } ++ + if (rule->filter.match_flags & 
IGC_FILTER_FLAG_VLAN_TCI) { + fsp->flow_type |= FLOW_EXT; + fsp->h_ext.vlan_tci = htons(rule->filter.vlan_tci); +-- +2.43.0 + diff --git a/queue-6.1/ipv4-ipv6-use-splice_eof-to-flush.patch b/queue-6.1/ipv4-ipv6-use-splice_eof-to-flush.patch new file mode 100644 index 00000000000..fe1c46675d3 --- /dev/null +++ b/queue-6.1/ipv4-ipv6-use-splice_eof-to-flush.patch @@ -0,0 +1,262 @@ +From 5611af5949dfd630156868ccdfe55a978083caf4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 7 Jun 2023 19:19:13 +0100 +Subject: ipv4, ipv6: Use splice_eof() to flush + +From: David Howells + +[ Upstream commit 1d7e4538a5463faa0b0e26a7a7b6bd68c7dfdd78 ] + +Allow splice to undo the effects of MSG_MORE after prematurely ending a +splice/sendfile due to getting an EOF condition (->splice_read() returned +0) after splice had called sendmsg() with MSG_MORE set when the user didn't +set MSG_MORE. + +For UDP, a pending packet will not be emitted if the socket is closed +before it is flushed; with this change, it be flushed by ->splice_eof(). + +For TCP, it's not clear that MSG_MORE is actually effective. + +Suggested-by: Linus Torvalds +Link: https://lore.kernel.org/r/CAHk-=wh=V579PDYvkpnTobCLGczbgxpMgGmmhqiTyE34Cpi5Gg@mail.gmail.com/ +Signed-off-by: David Howells +cc: Kuniyuki Iwashima +cc: Willem de Bruijn +cc: David Ahern +cc: Jens Axboe +cc: Matthew Wilcox +Signed-off-by: Jakub Kicinski +Stable-dep-of: a0002127cd74 ("udp: move udp->no_check6_tx to udp->udp_flags") +Signed-off-by: Sasha Levin +--- + include/net/inet_common.h | 1 + + include/net/tcp.h | 1 + + include/net/udp.h | 1 + + net/ipv4/af_inet.c | 18 ++++++++++++++++++ + net/ipv4/tcp.c | 16 ++++++++++++++++ + net/ipv4/tcp_ipv4.c | 1 + + net/ipv4/udp.c | 16 ++++++++++++++++ + net/ipv6/af_inet6.c | 1 + + net/ipv6/tcp_ipv6.c | 1 + + net/ipv6/udp.c | 15 +++++++++++++++ + 10 files changed, 71 insertions(+) + +diff --git a/include/net/inet_common.h b/include/net/inet_common.h +index cec453c18f1d6..4673bbfd2811f 100644 +--- a/include/net/inet_common.h ++++ b/include/net/inet_common.h +@@ -33,6 +33,7 @@ int inet_accept(struct socket *sock, struct socket *newsock, int flags, + bool kern); + int inet_send_prepare(struct sock *sk); + int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size); ++void inet_splice_eof(struct socket *sock); + ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, + size_t size, int flags); + int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, +diff --git a/include/net/tcp.h b/include/net/tcp.h +index c3d56b337f358..4c838f7290dd9 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -332,6 +332,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); + int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size); + int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, + size_t size, struct ubuf_info *uarg); ++void tcp_splice_eof(struct socket *sock); + int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, + int flags); + int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, +diff --git a/include/net/udp.h b/include/net/udp.h +index fee053bcd17c6..fa4cdbe55552c 100644 +--- a/include/net/udp.h ++++ b/include/net/udp.h +@@ -269,6 +269,7 @@ int udp_get_port(struct sock *sk, unsigned short snum, + int udp_err(struct sk_buff *, u32); + int udp_abort(struct sock *sk, int err); + int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); ++void udp_splice_eof(struct socket *sock); + int 
udp_push_pending_frames(struct sock *sk); + void udp_flush_pending_frames(struct sock *sk); + int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size); +diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c +index 5d379df90c826..347c3768df6e8 100644 +--- a/net/ipv4/af_inet.c ++++ b/net/ipv4/af_inet.c +@@ -838,6 +838,21 @@ int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) + } + EXPORT_SYMBOL(inet_sendmsg); + ++void inet_splice_eof(struct socket *sock) ++{ ++ const struct proto *prot; ++ struct sock *sk = sock->sk; ++ ++ if (unlikely(inet_send_prepare(sk))) ++ return; ++ ++ /* IPV6_ADDRFORM can change sk->sk_prot under us. */ ++ prot = READ_ONCE(sk->sk_prot); ++ if (prot->splice_eof) ++ prot->splice_eof(sock); ++} ++EXPORT_SYMBOL_GPL(inet_splice_eof); ++ + ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, + size_t size, int flags) + { +@@ -1057,6 +1072,7 @@ const struct proto_ops inet_stream_ops = { + #ifdef CONFIG_MMU + .mmap = tcp_mmap, + #endif ++ .splice_eof = inet_splice_eof, + .sendpage = inet_sendpage, + .splice_read = tcp_splice_read, + .read_sock = tcp_read_sock, +@@ -1091,6 +1107,7 @@ const struct proto_ops inet_dgram_ops = { + .read_skb = udp_read_skb, + .recvmsg = inet_recvmsg, + .mmap = sock_no_mmap, ++ .splice_eof = inet_splice_eof, + .sendpage = inet_sendpage, + .set_peek_off = sk_set_peek_off, + #ifdef CONFIG_COMPAT +@@ -1122,6 +1139,7 @@ static const struct proto_ops inet_sockraw_ops = { + .sendmsg = inet_sendmsg, + .recvmsg = inet_recvmsg, + .mmap = sock_no_mmap, ++ .splice_eof = inet_splice_eof, + .sendpage = inet_sendpage, + #ifdef CONFIG_COMPAT + .compat_ioctl = inet_compat_ioctl, +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 3935451ad061e..0b7844a8d5711 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -1492,6 +1492,22 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) + } + EXPORT_SYMBOL(tcp_sendmsg); + ++void tcp_splice_eof(struct socket *sock) ++{ ++ struct sock *sk = sock->sk; ++ struct tcp_sock *tp = tcp_sk(sk); ++ int mss_now, size_goal; ++ ++ if (!tcp_write_queue_tail(sk)) ++ return; ++ ++ lock_sock(sk); ++ mss_now = tcp_send_mss(sk, &size_goal, 0); ++ tcp_push(sk, 0, mss_now, tp->nonagle, size_goal); ++ release_sock(sk); ++} ++EXPORT_SYMBOL_GPL(tcp_splice_eof); ++ + /* + * Handle reading urgent data. 
BSD has very simple semantics for + * this, no blocking and very strange errors 8) +diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c +index 7ebbbe561e402..be2c807eed15d 100644 +--- a/net/ipv4/tcp_ipv4.c ++++ b/net/ipv4/tcp_ipv4.c +@@ -3067,6 +3067,7 @@ struct proto tcp_prot = { + .keepalive = tcp_set_keepalive, + .recvmsg = tcp_recvmsg, + .sendmsg = tcp_sendmsg, ++ .splice_eof = tcp_splice_eof, + .sendpage = tcp_sendpage, + .backlog_rcv = tcp_v4_do_rcv, + .release_cb = tcp_release_cb, +diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c +index b49cb3df01bb4..e8dd2880ac9aa 100644 +--- a/net/ipv4/udp.c ++++ b/net/ipv4/udp.c +@@ -1332,6 +1332,21 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) + } + EXPORT_SYMBOL(udp_sendmsg); + ++void udp_splice_eof(struct socket *sock) ++{ ++ struct sock *sk = sock->sk; ++ struct udp_sock *up = udp_sk(sk); ++ ++ if (!up->pending || READ_ONCE(up->corkflag)) ++ return; ++ ++ lock_sock(sk); ++ if (up->pending && !READ_ONCE(up->corkflag)) ++ udp_push_pending_frames(sk); ++ release_sock(sk); ++} ++EXPORT_SYMBOL_GPL(udp_splice_eof); ++ + int udp_sendpage(struct sock *sk, struct page *page, int offset, + size_t size, int flags) + { +@@ -2907,6 +2922,7 @@ struct proto udp_prot = { + .getsockopt = udp_getsockopt, + .sendmsg = udp_sendmsg, + .recvmsg = udp_recvmsg, ++ .splice_eof = udp_splice_eof, + .sendpage = udp_sendpage, + .release_cb = ip4_datagram_release_cb, + .hash = udp_lib_hash, +diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c +index b5309ae87fd79..a2f29ca516000 100644 +--- a/net/ipv6/af_inet6.c ++++ b/net/ipv6/af_inet6.c +@@ -711,6 +711,7 @@ const struct proto_ops inet6_stream_ops = { + #ifdef CONFIG_MMU + .mmap = tcp_mmap, + #endif ++ .splice_eof = inet_splice_eof, + .sendpage = inet_sendpage, + .sendmsg_locked = tcp_sendmsg_locked, + .sendpage_locked = tcp_sendpage_locked, +diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c +index 7be89dcfd5fc5..ba9a22db5805c 100644 +--- a/net/ipv6/tcp_ipv6.c ++++ b/net/ipv6/tcp_ipv6.c +@@ -2158,6 +2158,7 @@ struct proto tcpv6_prot = { + .keepalive = tcp_set_keepalive, + .recvmsg = tcp_recvmsg, + .sendmsg = tcp_sendmsg, ++ .splice_eof = tcp_splice_eof, + .sendpage = tcp_sendpage, + .backlog_rcv = tcp_v6_do_rcv, + .release_cb = tcp_release_cb, +diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c +index 7f49f69226a21..2a65136dca773 100644 +--- a/net/ipv6/udp.c ++++ b/net/ipv6/udp.c +@@ -1657,6 +1657,20 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) + goto out; + } + ++static void udpv6_splice_eof(struct socket *sock) ++{ ++ struct sock *sk = sock->sk; ++ struct udp_sock *up = udp_sk(sk); ++ ++ if (!up->pending || READ_ONCE(up->corkflag)) ++ return; ++ ++ lock_sock(sk); ++ if (up->pending && !READ_ONCE(up->corkflag)) ++ udp_v6_push_pending_frames(sk); ++ release_sock(sk); ++} ++ + void udpv6_destroy_sock(struct sock *sk) + { + struct udp_sock *up = udp_sk(sk); +@@ -1768,6 +1782,7 @@ struct proto udpv6_prot = { + .getsockopt = udpv6_getsockopt, + .sendmsg = udpv6_sendmsg, + .recvmsg = udpv6_recvmsg, ++ .splice_eof = udpv6_splice_eof, + .release_cb = ip6_datagram_release_cb, + .hash = udp_lib_hash, + .unhash = udp_lib_unhash, +-- +2.43.0 + diff --git a/queue-6.1/khugepage-replace-try_to_release_page-with-filemap_r.patch b/queue-6.1/khugepage-replace-try_to_release_page-with-filemap_r.patch new file mode 100644 index 00000000000..bdbdadc9d89 --- /dev/null +++ b/queue-6.1/khugepage-replace-try_to_release_page-with-filemap_r.patch @@ -0,0 +1,95 @@ +From 
3d1c97b9a2cc1afdd2fa063fb59338e2a8a04818 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 17 Nov 2022 23:30:53 -0800 +Subject: khugepage: replace try_to_release_page() with filemap_release_folio() + +From: Vishal Moola (Oracle) + +[ Upstream commit 64ab3195ea077eaeedc8b382939c3dc5ca56f369 ] + +Replace some calls with their folio equivalents. This change removes 4 +calls to compound_head() and is in preparation for the removal of the +try_to_release_page() wrapper. + +Link: https://lkml.kernel.org/r/20221118073055.55694-3-vishal.moola@gmail.com +Signed-off-by: Vishal Moola (Oracle) +Cc: Matthew Wilcox +Cc: Naoya Horiguchi +Cc: Theodore Ts'o +Signed-off-by: Andrew Morton +Stable-dep-of: 1898efcdbed3 ("block: update the stable_writes flag in bdev_add") +Signed-off-by: Sasha Levin +--- + mm/khugepaged.c | 23 ++++++++++++----------- + 1 file changed, 12 insertions(+), 11 deletions(-) + +diff --git a/mm/khugepaged.c b/mm/khugepaged.c +index ef72d3df4b65b..6fc7db587c453 100644 +--- a/mm/khugepaged.c ++++ b/mm/khugepaged.c +@@ -1818,6 +1818,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, + xas_set(&xas, start); + for (index = start; index < end; index++) { + struct page *page = xas_next(&xas); ++ struct folio *folio; + + VM_BUG_ON(index != xas.xa_index); + if (is_shmem) { +@@ -1844,8 +1845,6 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, + } + + if (xa_is_value(page) || !PageUptodate(page)) { +- struct folio *folio; +- + xas_unlock_irq(&xas); + /* swap in or instantiate fallocated page */ + if (shmem_get_folio(mapping->host, index, +@@ -1933,13 +1932,15 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, + goto out_unlock; + } + +- if (page_mapping(page) != mapping) { ++ folio = page_folio(page); ++ ++ if (folio_mapping(folio) != mapping) { + result = SCAN_TRUNCATED; + goto out_unlock; + } + +- if (!is_shmem && (PageDirty(page) || +- PageWriteback(page))) { ++ if (!is_shmem && (folio_test_dirty(folio) || ++ folio_test_writeback(folio))) { + /* + * khugepaged only works on read-only fd, so this + * page is dirty because it hasn't been flushed +@@ -1949,20 +1950,20 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, + goto out_unlock; + } + +- if (isolate_lru_page(page)) { ++ if (folio_isolate_lru(folio)) { + result = SCAN_DEL_PAGE_LRU; + goto out_unlock; + } + +- if (page_has_private(page) && +- !try_to_release_page(page, GFP_KERNEL)) { ++ if (folio_has_private(folio) && ++ !filemap_release_folio(folio, GFP_KERNEL)) { + result = SCAN_PAGE_HAS_PRIVATE; +- putback_lru_page(page); ++ folio_putback_lru(folio); + goto out_unlock; + } + +- if (page_mapped(page)) +- try_to_unmap(page_folio(page), ++ if (folio_mapped(folio)) ++ try_to_unmap(folio, + TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH); + + xas_lock_irq(&xas); +-- +2.43.0 + diff --git a/queue-6.1/lib-group_cpus.c-avoid-acquiring-cpu-hotplug-lock-in.patch b/queue-6.1/lib-group_cpus.c-avoid-acquiring-cpu-hotplug-lock-in.patch new file mode 100644 index 00000000000..02d52714674 --- /dev/null +++ b/queue-6.1/lib-group_cpus.c-avoid-acquiring-cpu-hotplug-lock-in.patch @@ -0,0 +1,102 @@ +From f07953806fd1f09054b8a7c16085bb0faaba9aec Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 20 Nov 2023 16:35:59 +0800 +Subject: lib/group_cpus.c: avoid acquiring cpu hotplug lock in + group_cpus_evenly + +From: Ming Lei + +[ Upstream commit 0263f92fadbb9d294d5971ac57743f882c93b2b3 ] + +group_cpus_evenly() could be part of storage driver's error handler, such +as nvme driver, when 
may happen during CPU hotplug, in which storage queue +has to drain its pending IOs because all CPUs associated with the queue +are offline and the queue is becoming inactive. And handling IO needs +error handler to provide forward progress. + +Then deadlock is caused: + +1) inside CPU hotplug handler, CPU hotplug lock is held, and blk-mq's + handler is waiting for inflight IO + +2) error handler is waiting for CPU hotplug lock + +3) inflight IO can't be completed in blk-mq's CPU hotplug handler + because error handling can't provide forward progress. + +Solve the deadlock by not holding CPU hotplug lock in group_cpus_evenly(), +in which two stage spreads are taken: 1) the 1st stage is over all present +CPUs; 2) the end stage is over all other CPUs. + +Turns out the two stage spread just needs consistent 'cpu_present_mask', +and remove the CPU hotplug lock by storing it into one local cache. This +way doesn't change correctness, because all CPUs are still covered. + +Link: https://lkml.kernel.org/r/20231120083559.285174-1-ming.lei@redhat.com +Signed-off-by: Ming Lei +Reported-by: Yi Zhang +Reported-by: Guangwu Zhang +Tested-by: Guangwu Zhang +Reviewed-by: Chengming Zhou +Reviewed-by: Jens Axboe +Cc: Keith Busch +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Sasha Levin +--- + lib/group_cpus.c | 22 ++++++++++++++++------ + 1 file changed, 16 insertions(+), 6 deletions(-) + +diff --git a/lib/group_cpus.c b/lib/group_cpus.c +index 99f08c6cb9d97..156b1446d2a20 100644 +--- a/lib/group_cpus.c ++++ b/lib/group_cpus.c +@@ -365,13 +365,25 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps) + if (!masks) + goto fail_node_to_cpumask; + +- /* Stabilize the cpumasks */ +- cpus_read_lock(); + build_node_to_cpumask(node_to_cpumask); + ++ /* ++ * Make a local cache of 'cpu_present_mask', so the two stages ++ * spread can observe consistent 'cpu_present_mask' without holding ++ * cpu hotplug lock, then we can reduce deadlock risk with cpu ++ * hotplug code. ++ * ++ * Here CPU hotplug may happen when reading `cpu_present_mask`, and ++ * we can live with the case because it only affects that hotplug ++ * CPU is handled in the 1st or 2nd stage, and either way is correct ++ * from API user viewpoint since 2-stage spread is sort of ++ * optimization. 
++ */ ++ cpumask_copy(npresmsk, data_race(cpu_present_mask)); ++ + /* grouping present CPUs first */ + ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, +- cpu_present_mask, nmsk, masks); ++ npresmsk, nmsk, masks); + if (ret < 0) + goto fail_build_affinity; + nr_present = ret; +@@ -386,15 +398,13 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps) + curgrp = 0; + else + curgrp = nr_present; +- cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); ++ cpumask_andnot(npresmsk, cpu_possible_mask, npresmsk); + ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, + npresmsk, nmsk, masks); + if (ret >= 0) + nr_others = ret; + + fail_build_affinity: +- cpus_read_unlock(); +- + if (ret >= 0) + WARN_ON(nr_present + nr_others < numgrps); + +-- +2.43.0 + diff --git a/queue-6.1/media-camss-sm8250-virtual-channels-for-csid.patch b/queue-6.1/media-camss-sm8250-virtual-channels-for-csid.patch new file mode 100644 index 00000000000..7b657bf9f1d --- /dev/null +++ b/queue-6.1/media-camss-sm8250-virtual-channels-for-csid.patch @@ -0,0 +1,307 @@ +From e153f80eac85c4d13fc6aa0c5ddb79469a59ee34 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 9 Dec 2022 11:40:34 +0200 +Subject: media: camss: sm8250: Virtual channels for CSID + +From: Milen Mitkov + +[ Upstream commit 3c4ed72a16bc6733cda9c65048af74a2e8eaa0eb ] + +CSID hardware on SM8250 can demux up to 4 simultaneous streams +based on virtual channel (vc) or datatype (dt). +The CSID subdevice entity now has 4 source ports that can be +enabled/disabled and thus can control which virtual channels +are enabled. Datatype demuxing not tested. + +In order to keep a valid internal state of the subdevice, +implicit format propagation from the sink to the source pads +has been preserved. However, the format on each source pad +can be different and in that case it must be configured explicitly. + +CSID's s_stream is called when any stream is started or stopped. +It will call configure_streams() that will rewrite IRQ settings to HW. +When multiple streams are running simultaneously there is an issue +when writing IRQ settings for one stream while another is still +running, thus avoid re-writing settings if they were not changed +in link setup, or by fully powering off the CSID hardware. 
+ +Signed-off-by: Milen Mitkov +Reviewed-by: Robert Foss +Tested-by: Bryan O'Donoghue +Acked-by: Robert Foss +Signed-off-by: Hans Verkuil +Stable-dep-of: e655d1ae9703 ("media: qcom: camss: Fix set CSI2_RX_CFG1_VC_MODE when VC is greater than 3") +Signed-off-by: Sasha Levin +--- + .../platform/qcom/camss/camss-csid-gen2.c | 54 ++++++++++++------- + .../media/platform/qcom/camss/camss-csid.c | 44 ++++++++++----- + .../media/platform/qcom/camss/camss-csid.h | 11 +++- + 3 files changed, 74 insertions(+), 35 deletions(-) + +diff --git a/drivers/media/platform/qcom/camss/camss-csid-gen2.c b/drivers/media/platform/qcom/camss/camss-csid-gen2.c +index 904208f6f9546..2e015e69a6ad6 100644 +--- a/drivers/media/platform/qcom/camss/camss-csid-gen2.c ++++ b/drivers/media/platform/qcom/camss/camss-csid-gen2.c +@@ -334,13 +334,14 @@ static const struct csid_format csid_formats[] = { + }, + }; + +-static void csid_configure_stream(struct csid_device *csid, u8 enable) ++static void __csid_configure_stream(struct csid_device *csid, u8 enable, u8 vc) + { + struct csid_testgen_config *tg = &csid->testgen; + u32 val; + u32 phy_sel = 0; + u8 lane_cnt = csid->phy.lane_cnt; +- struct v4l2_mbus_framefmt *input_format = &csid->fmt[MSM_CSID_PAD_SRC]; ++ /* Source pads matching RDI channels on hardware. Pad 1 -> RDI0, Pad 2 -> RDI1, etc. */ ++ struct v4l2_mbus_framefmt *input_format = &csid->fmt[MSM_CSID_PAD_FIRST_SRC + vc]; + const struct csid_format *format = csid_get_fmt_entry(csid->formats, csid->nformats, + input_format->code); + +@@ -351,8 +352,7 @@ static void csid_configure_stream(struct csid_device *csid, u8 enable) + phy_sel = csid->phy.csiphy_id; + + if (enable) { +- u8 vc = 0; /* Virtual Channel 0 */ +- u8 dt_id = vc * 4; ++ u8 dt_id = vc; + + if (tg->enabled) { + /* configure one DT, infinite frames */ +@@ -392,42 +392,42 @@ static void csid_configure_stream(struct csid_device *csid, u8 enable) + val |= format->data_type << RDI_CFG0_DATA_TYPE; + val |= vc << RDI_CFG0_VIRTUAL_CHANNEL; + val |= dt_id << RDI_CFG0_DT_ID; +- writel_relaxed(val, csid->base + CSID_RDI_CFG0(0)); ++ writel_relaxed(val, csid->base + CSID_RDI_CFG0(vc)); + + /* CSID_TIMESTAMP_STB_POST_IRQ */ + val = 2 << RDI_CFG1_TIMESTAMP_STB_SEL; +- writel_relaxed(val, csid->base + CSID_RDI_CFG1(0)); ++ writel_relaxed(val, csid->base + CSID_RDI_CFG1(vc)); + + val = 1; +- writel_relaxed(val, csid->base + CSID_RDI_FRM_DROP_PERIOD(0)); ++ writel_relaxed(val, csid->base + CSID_RDI_FRM_DROP_PERIOD(vc)); + + val = 0; +- writel_relaxed(val, csid->base + CSID_RDI_FRM_DROP_PATTERN(0)); ++ writel_relaxed(val, csid->base + CSID_RDI_FRM_DROP_PATTERN(vc)); + + val = 1; +- writel_relaxed(val, csid->base + CSID_RDI_IRQ_SUBSAMPLE_PERIOD(0)); ++ writel_relaxed(val, csid->base + CSID_RDI_IRQ_SUBSAMPLE_PERIOD(vc)); + + val = 0; +- writel_relaxed(val, csid->base + CSID_RDI_IRQ_SUBSAMPLE_PATTERN(0)); ++ writel_relaxed(val, csid->base + CSID_RDI_IRQ_SUBSAMPLE_PATTERN(vc)); + + val = 1; +- writel_relaxed(val, csid->base + CSID_RDI_RPP_PIX_DROP_PERIOD(0)); ++ writel_relaxed(val, csid->base + CSID_RDI_RPP_PIX_DROP_PERIOD(vc)); + + val = 0; +- writel_relaxed(val, csid->base + CSID_RDI_RPP_PIX_DROP_PATTERN(0)); ++ writel_relaxed(val, csid->base + CSID_RDI_RPP_PIX_DROP_PATTERN(vc)); + + val = 1; +- writel_relaxed(val, csid->base + CSID_RDI_RPP_LINE_DROP_PERIOD(0)); ++ writel_relaxed(val, csid->base + CSID_RDI_RPP_LINE_DROP_PERIOD(vc)); + + val = 0; +- writel_relaxed(val, csid->base + CSID_RDI_RPP_LINE_DROP_PATTERN(0)); ++ writel_relaxed(val, csid->base + 
CSID_RDI_RPP_LINE_DROP_PATTERN(vc)); + + val = 0; +- writel_relaxed(val, csid->base + CSID_RDI_CTRL(0)); ++ writel_relaxed(val, csid->base + CSID_RDI_CTRL(vc)); + +- val = readl_relaxed(csid->base + CSID_RDI_CFG0(0)); ++ val = readl_relaxed(csid->base + CSID_RDI_CFG0(vc)); + val |= 1 << RDI_CFG0_ENABLE; +- writel_relaxed(val, csid->base + CSID_RDI_CFG0(0)); ++ writel_relaxed(val, csid->base + CSID_RDI_CFG0(vc)); + } + + if (tg->enabled) { +@@ -453,7 +453,16 @@ static void csid_configure_stream(struct csid_device *csid, u8 enable) + val = HALT_CMD_RESUME_AT_FRAME_BOUNDARY << RDI_CTRL_HALT_CMD; + else + val = HALT_CMD_HALT_AT_FRAME_BOUNDARY << RDI_CTRL_HALT_CMD; +- writel_relaxed(val, csid->base + CSID_RDI_CTRL(0)); ++ writel_relaxed(val, csid->base + CSID_RDI_CTRL(vc)); ++} ++ ++static void csid_configure_stream(struct csid_device *csid, u8 enable) ++{ ++ u8 i; ++ /* Loop through all enabled VCs and configure stream for each */ ++ for (i = 0; i < MSM_CSID_MAX_SRC_STREAMS; i++) ++ if (csid->phy.en_vc & BIT(i)) ++ __csid_configure_stream(csid, enable, i); + } + + static int csid_configure_testgen_pattern(struct csid_device *csid, s32 val) +@@ -499,6 +508,7 @@ static irqreturn_t csid_isr(int irq, void *dev) + struct csid_device *csid = dev; + u32 val; + u8 reset_done; ++ int i; + + val = readl_relaxed(csid->base + CSID_TOP_IRQ_STATUS); + writel_relaxed(val, csid->base + CSID_TOP_IRQ_CLEAR); +@@ -507,8 +517,12 @@ static irqreturn_t csid_isr(int irq, void *dev) + val = readl_relaxed(csid->base + CSID_CSI2_RX_IRQ_STATUS); + writel_relaxed(val, csid->base + CSID_CSI2_RX_IRQ_CLEAR); + +- val = readl_relaxed(csid->base + CSID_CSI2_RDIN_IRQ_STATUS(0)); +- writel_relaxed(val, csid->base + CSID_CSI2_RDIN_IRQ_CLEAR(0)); ++ /* Read and clear IRQ status for each enabled RDI channel */ ++ for (i = 0; i < MSM_CSID_MAX_SRC_STREAMS; i++) ++ if (csid->phy.en_vc & BIT(i)) { ++ val = readl_relaxed(csid->base + CSID_CSI2_RDIN_IRQ_STATUS(i)); ++ writel_relaxed(val, csid->base + CSID_CSI2_RDIN_IRQ_CLEAR(i)); ++ } + + val = 1 << IRQ_CMD_CLEAR; + writel_relaxed(val, csid->base + CSID_IRQ_CMD); +diff --git a/drivers/media/platform/qcom/camss/camss-csid.c b/drivers/media/platform/qcom/camss/camss-csid.c +index 88f188e0f7501..6360314f04a63 100644 +--- a/drivers/media/platform/qcom/camss/camss-csid.c ++++ b/drivers/media/platform/qcom/camss/camss-csid.c +@@ -196,6 +196,8 @@ static int csid_set_power(struct v4l2_subdev *sd, int on) + return ret; + } + ++ csid->phy.need_vc_update = true; ++ + enable_irq(csid->irq); + + ret = csid->ops->reset(csid); +@@ -249,7 +251,10 @@ static int csid_set_stream(struct v4l2_subdev *sd, int enable) + return -ENOLINK; + } + +- csid->ops->configure_stream(csid, enable); ++ if (csid->phy.need_vc_update) { ++ csid->ops->configure_stream(csid, enable); ++ csid->phy.need_vc_update = false; ++ } + + return 0; + } +@@ -460,6 +465,7 @@ static int csid_set_format(struct v4l2_subdev *sd, + { + struct csid_device *csid = v4l2_get_subdevdata(sd); + struct v4l2_mbus_framefmt *format; ++ int i; + + format = __csid_get_format(csid, sd_state, fmt->pad, fmt->which); + if (format == NULL) +@@ -468,14 +474,14 @@ static int csid_set_format(struct v4l2_subdev *sd, + csid_try_format(csid, sd_state, fmt->pad, &fmt->format, fmt->which); + *format = fmt->format; + +- /* Propagate the format from sink to source */ ++ /* Propagate the format from sink to source pads */ + if (fmt->pad == MSM_CSID_PAD_SINK) { +- format = __csid_get_format(csid, sd_state, MSM_CSID_PAD_SRC, +- fmt->which); ++ for (i = 
MSM_CSID_PAD_FIRST_SRC; i < MSM_CSID_PADS_NUM; ++i) { ++ format = __csid_get_format(csid, sd_state, i, fmt->which); + +- *format = fmt->format; +- csid_try_format(csid, sd_state, MSM_CSID_PAD_SRC, format, +- fmt->which); ++ *format = fmt->format; ++ csid_try_format(csid, sd_state, i, format, fmt->which); ++ } + } + + return 0; +@@ -738,7 +744,6 @@ static int csid_link_setup(struct media_entity *entity, + struct csid_device *csid; + struct csiphy_device *csiphy; + struct csiphy_lanes_cfg *lane_cfg; +- struct v4l2_subdev_format format = { 0 }; + + sd = media_entity_to_v4l2_subdev(entity); + csid = v4l2_get_subdevdata(sd); +@@ -761,11 +766,22 @@ static int csid_link_setup(struct media_entity *entity, + lane_cfg = &csiphy->cfg.csi2->lane_cfg; + csid->phy.lane_cnt = lane_cfg->num_data; + csid->phy.lane_assign = csid_get_lane_assign(lane_cfg); ++ } ++ /* Decide which virtual channels to enable based on which source pads are enabled */ ++ if (local->flags & MEDIA_PAD_FL_SOURCE) { ++ struct v4l2_subdev *sd = media_entity_to_v4l2_subdev(entity); ++ struct csid_device *csid = v4l2_get_subdevdata(sd); ++ struct device *dev = csid->camss->dev; ++ ++ if (flags & MEDIA_LNK_FL_ENABLED) ++ csid->phy.en_vc |= BIT(local->index - 1); ++ else ++ csid->phy.en_vc &= ~BIT(local->index - 1); + +- /* Reset format on source pad to sink pad format */ +- format.pad = MSM_CSID_PAD_SRC; +- format.which = V4L2_SUBDEV_FORMAT_ACTIVE; +- csid_set_format(&csid->subdev, NULL, &format); ++ csid->phy.need_vc_update = true; ++ ++ dev_dbg(dev, "%s: Enabled CSID virtual channels mask 0x%x\n", ++ __func__, csid->phy.en_vc); + } + + return 0; +@@ -816,6 +832,7 @@ int msm_csid_register_entity(struct csid_device *csid, + struct v4l2_subdev *sd = &csid->subdev; + struct media_pad *pads = csid->pads; + struct device *dev = csid->camss->dev; ++ int i; + int ret; + + v4l2_subdev_init(sd, &csid_v4l2_ops); +@@ -852,7 +869,8 @@ int msm_csid_register_entity(struct csid_device *csid, + } + + pads[MSM_CSID_PAD_SINK].flags = MEDIA_PAD_FL_SINK; +- pads[MSM_CSID_PAD_SRC].flags = MEDIA_PAD_FL_SOURCE; ++ for (i = MSM_CSID_PAD_FIRST_SRC; i < MSM_CSID_PADS_NUM; ++i) ++ pads[i].flags = MEDIA_PAD_FL_SOURCE; + + sd->entity.function = MEDIA_ENT_F_PROC_VIDEO_PIXEL_FORMATTER; + sd->entity.ops = &csid_media_ops; +diff --git a/drivers/media/platform/qcom/camss/camss-csid.h b/drivers/media/platform/qcom/camss/camss-csid.h +index f06040e44c515..d4b48432a0973 100644 +--- a/drivers/media/platform/qcom/camss/camss-csid.h ++++ b/drivers/media/platform/qcom/camss/camss-csid.h +@@ -19,8 +19,13 @@ + #include + + #define MSM_CSID_PAD_SINK 0 +-#define MSM_CSID_PAD_SRC 1 +-#define MSM_CSID_PADS_NUM 2 ++#define MSM_CSID_PAD_FIRST_SRC 1 ++#define MSM_CSID_PADS_NUM 5 ++ ++#define MSM_CSID_PAD_SRC (MSM_CSID_PAD_FIRST_SRC) ++ ++/* CSID hardware can demultiplex up to 4 outputs */ ++#define MSM_CSID_MAX_SRC_STREAMS 4 + + #define DATA_TYPE_EMBEDDED_DATA_8BIT 0x12 + #define DATA_TYPE_YUV420_8BIT 0x18 +@@ -81,6 +86,8 @@ struct csid_phy_config { + u8 csiphy_id; + u8 lane_cnt; + u32 lane_assign; ++ u32 en_vc; ++ u8 need_vc_update; + }; + + struct csid_device; +-- +2.43.0 + diff --git a/queue-6.1/media-qcom-camss-fix-set-csi2_rx_cfg1_vc_mode-when-v.patch b/queue-6.1/media-qcom-camss-fix-set-csi2_rx_cfg1_vc_mode-when-v.patch new file mode 100644 index 00000000000..a318ff4b8ee --- /dev/null +++ b/queue-6.1/media-qcom-camss-fix-set-csi2_rx_cfg1_vc_mode-when-v.patch @@ -0,0 +1,39 @@ +From b716307f6947508dbb996139baebff85b0be36ae Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 30 
Aug 2023 16:16:14 +0100 +Subject: media: qcom: camss: Fix set CSI2_RX_CFG1_VC_MODE when VC is greater + than 3 + +From: Bryan O'Donoghue + +[ Upstream commit e655d1ae9703286cef7fda8675cad62f649dc183 ] + +VC_MODE = 0 implies a two bit VC address. +VC_MODE = 1 is required for VCs with a larger address than two bits. + +Fixes: eebe6d00e9bf ("media: camss: Add support for CSID hardware version Titan 170") +Cc: stable@vger.kernel.org +Signed-off-by: Bryan O'Donoghue +Reviewed-by: Laurent Pinchart +Signed-off-by: Hans Verkuil +Signed-off-by: Sasha Levin +--- + drivers/media/platform/qcom/camss/camss-csid-gen2.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/drivers/media/platform/qcom/camss/camss-csid-gen2.c b/drivers/media/platform/qcom/camss/camss-csid-gen2.c +index 2e015e69a6ad6..23acc387be5f0 100644 +--- a/drivers/media/platform/qcom/camss/camss-csid-gen2.c ++++ b/drivers/media/platform/qcom/camss/camss-csid-gen2.c +@@ -446,6 +446,8 @@ static void __csid_configure_stream(struct csid_device *csid, u8 enable, u8 vc) + writel_relaxed(val, csid->base + CSID_CSI2_RX_CFG0); + + val = 1 << CSI2_RX_CFG1_PACKET_ECC_CORRECTION_EN; ++ if (vc > 3) ++ val |= 1 << CSI2_RX_CFG1_VC_MODE; + val |= 1 << CSI2_RX_CFG1_MISR_EN; + writel_relaxed(val, csid->base + CSID_CSI2_RX_CFG1); + +-- +2.43.0 + diff --git a/queue-6.1/memory-failure-convert-truncate_error_page-to-use-fo.patch b/queue-6.1/memory-failure-convert-truncate_error_page-to-use-fo.patch new file mode 100644 index 00000000000..35b503537ea --- /dev/null +++ b/queue-6.1/memory-failure-convert-truncate_error_page-to-use-fo.patch @@ -0,0 +1,47 @@ +From 86430873bd38064e37a7298e400a5f663c4efa25 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 17 Nov 2022 23:30:54 -0800 +Subject: memory-failure: convert truncate_error_page() to use folio + +From: Vishal Moola (Oracle) + +[ Upstream commit ac5efa782041670b63a05c36d92d02a80e50bb63 ] + +Replace try_to_release_page() with filemap_release_folio(). This change +is in preparation for the removal of the try_to_release_page() wrapper. 
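+
+In outline, the hunk below replaces the call pair
+
+        page_has_private(p) && !try_to_release_page(p, GFP_NOIO)
+
+with its folio equivalent
+
+        struct folio *folio = page_folio(p);
+        ...
+        folio_has_private(folio) && !filemap_release_folio(folio, GFP_NOIO)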
+ +Link: https://lkml.kernel.org/r/20221118073055.55694-4-vishal.moola@gmail.com +Signed-off-by: Vishal Moola (Oracle) +Acked-by: Naoya Horiguchi +Cc: Matthew Wilcox +Cc: Theodore Ts'o +Signed-off-by: Andrew Morton +Stable-dep-of: 1898efcdbed3 ("block: update the stable_writes flag in bdev_add") +Signed-off-by: Sasha Levin +--- + mm/memory-failure.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/mm/memory-failure.c b/mm/memory-failure.c +index ebd717157c813..6355166a6bb28 100644 +--- a/mm/memory-failure.c ++++ b/mm/memory-failure.c +@@ -827,12 +827,13 @@ static int truncate_error_page(struct page *p, unsigned long pfn, + int ret = MF_FAILED; + + if (mapping->a_ops->error_remove_page) { ++ struct folio *folio = page_folio(p); + int err = mapping->a_ops->error_remove_page(mapping, p); + + if (err != 0) { + pr_info("%#lx: Failed to punch page: %d\n", pfn, err); +- } else if (page_has_private(p) && +- !try_to_release_page(p, GFP_NOIO)) { ++ } else if (folio_has_private(folio) && ++ !filemap_release_folio(folio, GFP_NOIO)) { + pr_info("%#lx: failed to release buffers\n", pfn); + } else { + ret = MF_RECOVERED; +-- +2.43.0 + diff --git a/queue-6.1/mlxbf_gige-fix-receive-packet-race-condition.patch b/queue-6.1/mlxbf_gige-fix-receive-packet-race-condition.patch new file mode 100644 index 00000000000..8792e43a984 --- /dev/null +++ b/queue-6.1/mlxbf_gige-fix-receive-packet-race-condition.patch @@ -0,0 +1,63 @@ +From e38ef647ff2cf5958850b2c4b30eebe83d34dcaf Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 20 Dec 2023 18:47:39 -0500 +Subject: mlxbf_gige: fix receive packet race condition + +From: David Thompson + +[ Upstream commit dcea1bd45e6d111cc8fc1aaefa7e31694089bda3 ] + +Under heavy traffic, the BlueField Gigabit interface can +become unresponsive. This is due to a possible race condition +in the mlxbf_gige_rx_packet function, where the function exits +with producer and consumer indices equal but there are remaining +packet(s) to be processed. In order to prevent this situation, +read receive consumer index *before* the HW replenish so that +the mlxbf_gige_rx_packet function returns an accurate return +value even if a packet is received into just-replenished buffer +prior to exiting this routine. If the just-replenished buffer +is received and occupies the last RX ring entry, the interface +would not recover and instead would encounter RX packet drops +related to internal buffer shortages since the driver RX logic +is not being triggered to drain the RX ring. This patch will +address and prevent this "ring full" condition. + +Fixes: f92e1869d74e ("Add Mellanox BlueField Gigabit Ethernet driver") +Reviewed-by: Asmaa Mnebhi +Signed-off-by: David Thompson +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c +index 0d5a41a2ae010..227d01cace3f0 100644 +--- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c ++++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_rx.c +@@ -267,6 +267,13 @@ static bool mlxbf_gige_rx_packet(struct mlxbf_gige *priv, int *rx_pkts) + priv->stats.rx_truncate_errors++; + } + ++ /* Read receive consumer index before replenish so that this routine ++ * returns accurate return value even if packet is received into ++ * just-replenished buffer prior to exiting this routine. ++ */ ++ rx_ci = readq(priv->base + MLXBF_GIGE_RX_CQE_PACKET_CI); ++ rx_ci_rem = rx_ci % priv->rx_q_entries; ++ + /* Let hardware know we've replenished one buffer */ + rx_pi++; + +@@ -279,8 +286,6 @@ static bool mlxbf_gige_rx_packet(struct mlxbf_gige *priv, int *rx_pkts) + rx_pi_rem = rx_pi % priv->rx_q_entries; + if (rx_pi_rem == 0) + priv->valid_polarity ^= 1; +- rx_ci = readq(priv->base + MLXBF_GIGE_RX_CQE_PACKET_CI); +- rx_ci_rem = rx_ci % priv->rx_q_entries; + + if (skb) + netif_receive_skb(skb); +-- +2.43.0 + diff --git a/queue-6.1/mm-memory_hotplug-add-missing-mem_hotplug_lock.patch b/queue-6.1/mm-memory_hotplug-add-missing-mem_hotplug_lock.patch new file mode 100644 index 00000000000..dc74f9293fc --- /dev/null +++ b/queue-6.1/mm-memory_hotplug-add-missing-mem_hotplug_lock.patch @@ -0,0 +1,218 @@ +From 670dabf41eb1dc619547a684c591cbef6598cb48 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 20 Nov 2023 15:53:52 +0100 +Subject: mm/memory_hotplug: add missing mem_hotplug_lock + +From: Sumanth Korikkar + +[ Upstream commit 001002e73712cdf6b8d9a103648cda3040ad7647 ] + +From Documentation/core-api/memory-hotplug.rst: +When adding/removing/onlining/offlining memory or adding/removing +heterogeneous/device memory, we should always hold the mem_hotplug_lock +in write mode to serialise memory hotplug (e.g. access to global/zone +variables). + +mhp_(de)init_memmap_on_memory() functions can change zone stats and +struct page content, but they are currently called w/o the +mem_hotplug_lock. + +When memory block is being offlined and when kmemleak goes through each +populated zone, the following theoretical race conditions could occur: +CPU 0: | CPU 1: +memory_offline() | +-> offline_pages() | + -> mem_hotplug_begin() | + ... | + -> mem_hotplug_done() | + | kmemleak_scan() + | -> get_online_mems() + | ... +-> mhp_deinit_memmap_on_memory() | + [not protected by mem_hotplug_begin/done()]| + Marks memory section as offline, | Retrieves zone_start_pfn + poisons vmemmap struct pages and updates | and struct page members. + the zone related data | + | ... + | -> put_online_mems() + +Fix this by ensuring mem_hotplug_lock is taken before performing +mhp_init_memmap_on_memory(). Also ensure that +mhp_deinit_memmap_on_memory() holds the lock. + +online/offline_pages() are currently only called from +memory_block_online/offline(), so it is safe to move the locking there. 
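+
+A minimal sketch of the resulting flow in memory_block_online() (error
+handling and accounting elided; the complete change is in the diff below):
+
+        mem_hotplug_begin();
+        if (nr_vmemmap_pages)
+                ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone);
+        if (!ret)
+                ret = online_pages(start_pfn + nr_vmemmap_pages, ...);
+        mem_hotplug_done();
+
+memory_block_offline() is restructured the same way, and
+online_pages()/offline_pages() drop their own mem_hotplug_begin()/
+mem_hotplug_done() calls.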
+ +Link: https://lkml.kernel.org/r/20231120145354.308999-2-sumanthk@linux.ibm.com +Fixes: a08a2ae34613 ("mm,memory_hotplug: allocate memmap from the added memory range") +Signed-off-by: Sumanth Korikkar +Reviewed-by: Gerald Schaefer +Acked-by: David Hildenbrand +Cc: Alexander Gordeev +Cc: Aneesh Kumar K.V +Cc: Anshuman Khandual +Cc: Heiko Carstens +Cc: Michal Hocko +Cc: Oscar Salvador +Cc: Vasily Gorbik +Cc: kernel test robot +Cc: [5.15+] +Signed-off-by: Andrew Morton +Signed-off-by: Sasha Levin +--- + drivers/base/memory.c | 18 +++++++++++++++--- + mm/memory_hotplug.c | 13 ++++++------- + 2 files changed, 21 insertions(+), 10 deletions(-) + +diff --git a/drivers/base/memory.c b/drivers/base/memory.c +index 9aa0da991cfb9..5d39f3e374dae 100644 +--- a/drivers/base/memory.c ++++ b/drivers/base/memory.c +@@ -175,6 +175,9 @@ int memory_notify(unsigned long val, void *v) + return blocking_notifier_call_chain(&memory_chain, val, v); + } + ++/* ++ * Must acquire mem_hotplug_lock in write mode. ++ */ + static int memory_block_online(struct memory_block *mem) + { + unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); +@@ -193,10 +196,11 @@ static int memory_block_online(struct memory_block *mem) + * stage helps to keep accounting easier to follow - e.g vmemmaps + * belong to the same zone as the memory they backed. + */ ++ mem_hotplug_begin(); + if (nr_vmemmap_pages) { + ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone); + if (ret) +- return ret; ++ goto out; + } + + ret = online_pages(start_pfn + nr_vmemmap_pages, +@@ -204,7 +208,7 @@ static int memory_block_online(struct memory_block *mem) + if (ret) { + if (nr_vmemmap_pages) + mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages); +- return ret; ++ goto out; + } + + /* +@@ -216,9 +220,14 @@ static int memory_block_online(struct memory_block *mem) + nr_vmemmap_pages); + + mem->zone = zone; ++out: ++ mem_hotplug_done(); + return ret; + } + ++/* ++ * Must acquire mem_hotplug_lock in write mode. ++ */ + static int memory_block_offline(struct memory_block *mem) + { + unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); +@@ -233,6 +242,7 @@ static int memory_block_offline(struct memory_block *mem) + * Unaccount before offlining, such that unpopulated zone and kthreads + * can properly be torn down in offline_pages(). + */ ++ mem_hotplug_begin(); + if (nr_vmemmap_pages) + adjust_present_page_count(pfn_to_page(start_pfn), mem->group, + -nr_vmemmap_pages); +@@ -244,13 +254,15 @@ static int memory_block_offline(struct memory_block *mem) + if (nr_vmemmap_pages) + adjust_present_page_count(pfn_to_page(start_pfn), + mem->group, nr_vmemmap_pages); +- return ret; ++ goto out; + } + + if (nr_vmemmap_pages) + mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages); + + mem->zone = NULL; ++out: ++ mem_hotplug_done(); + return ret; + } + +diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c +index bd2570b4f9b7b..d02722bbfcf33 100644 +--- a/mm/memory_hotplug.c ++++ b/mm/memory_hotplug.c +@@ -1069,6 +1069,9 @@ void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages) + kasan_remove_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages)); + } + ++/* ++ * Must be called with mem_hotplug_lock in write mode. 
++ */ + int __ref online_pages(unsigned long pfn, unsigned long nr_pages, + struct zone *zone, struct memory_group *group) + { +@@ -1089,7 +1092,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, + !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION))) + return -EINVAL; + +- mem_hotplug_begin(); + + /* associate pfn range with the zone */ + move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE); +@@ -1148,7 +1150,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, + writeback_set_ratelimit(); + + memory_notify(MEM_ONLINE, &arg); +- mem_hotplug_done(); + return 0; + + failed_addition: +@@ -1157,7 +1158,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, + (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); + memory_notify(MEM_CANCEL_ONLINE, &arg); + remove_pfn_range_from_zone(zone, pfn, nr_pages); +- mem_hotplug_done(); + return ret; + } + +@@ -1787,6 +1787,9 @@ static int count_system_ram_pages_cb(unsigned long start_pfn, + return 0; + } + ++/* ++ * Must be called with mem_hotplug_lock in write mode. ++ */ + int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, + struct zone *zone, struct memory_group *group) + { +@@ -1809,8 +1812,6 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, + !IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))) + return -EINVAL; + +- mem_hotplug_begin(); +- + /* + * Don't allow to offline memory blocks that contain holes. + * Consequently, memory blocks with holes can never get onlined +@@ -1946,7 +1947,6 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, + + memory_notify(MEM_OFFLINE, &arg); + remove_pfn_range_from_zone(zone, start_pfn, nr_pages); +- mem_hotplug_done(); + return 0; + + failed_removal_isolated: +@@ -1961,7 +1961,6 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, + (unsigned long long) start_pfn << PAGE_SHIFT, + ((unsigned long long) end_pfn << PAGE_SHIFT) - 1, + reason); +- mem_hotplug_done(); + return ret; + } + +-- +2.43.0 + diff --git a/queue-6.1/mm-memory_hotplug-fix-error-handling-in-add_memory_r.patch b/queue-6.1/mm-memory_hotplug-fix-error-handling-in-add_memory_r.patch new file mode 100644 index 00000000000..5f2af8e9a8a --- /dev/null +++ b/queue-6.1/mm-memory_hotplug-fix-error-handling-in-add_memory_r.patch @@ -0,0 +1,62 @@ +From 9345b30fdfb2604449065987afce0aa558347408 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 20 Nov 2023 15:53:53 +0100 +Subject: mm/memory_hotplug: fix error handling in add_memory_resource() + +From: Sumanth Korikkar + +[ Upstream commit f42ce5f087eb69e47294ababd2e7e6f88a82d308 ] + +In add_memory_resource(), creation of memory block devices occurs after +successful call to arch_add_memory(). However, creation of memory block +devices could fail. In that case, arch_remove_memory() is called to +perform necessary cleanup. + +Currently with or without altmap support, arch_remove_memory() is always +passed with altmap set to NULL during error handling. This leads to +freeing of struct pages using free_pages(), eventhough the allocation +might have been performed with altmap support via +altmap_alloc_block_buf(). + +Fix the error handling by passing altmap in arch_remove_memory(). This +ensures the following: +* When altmap is disabled, deallocation of the struct pages array occurs + via free_pages(). +* When altmap is enabled, deallocation occurs via vmem_altmap_free(). 
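+
+The fix itself is the one-line change shown below; condensed:
+
+        ret = create_memory_block_devices(start, size, mhp_altmap.alloc, group);
+        if (ret) {
+                /* tear down with the same altmap used when adding the memory */
+                arch_remove_memory(start, size, params.altmap);
+                goto error;
+        }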
+ +Link: https://lkml.kernel.org/r/20231120145354.308999-3-sumanthk@linux.ibm.com +Fixes: a08a2ae34613 ("mm,memory_hotplug: allocate memmap from the added memory range") +Signed-off-by: Sumanth Korikkar +Reviewed-by: Gerald Schaefer +Acked-by: David Hildenbrand +Cc: Alexander Gordeev +Cc: Aneesh Kumar K.V +Cc: Anshuman Khandual +Cc: Heiko Carstens +Cc: kernel test robot +Cc: Michal Hocko +Cc: Oscar Salvador +Cc: Vasily Gorbik +Cc: [5.15+] +Signed-off-by: Andrew Morton +Signed-off-by: Sasha Levin +--- + mm/memory_hotplug.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c +index d02722bbfcf33..3b9d3a4b43869 100644 +--- a/mm/memory_hotplug.c ++++ b/mm/memory_hotplug.c +@@ -1382,7 +1382,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) + ret = create_memory_block_devices(start, size, mhp_altmap.alloc, + group); + if (ret) { +- arch_remove_memory(start, size, NULL); ++ arch_remove_memory(start, size, params.altmap); + goto error; + } + +-- +2.43.0 + diff --git a/queue-6.1/mm-merge-folio_has_private-filemap_release_folio-cal.patch b/queue-6.1/mm-merge-folio_has_private-filemap_release_folio-cal.patch new file mode 100644 index 00000000000..5ddd7a60302 --- /dev/null +++ b/queue-6.1/mm-merge-folio_has_private-filemap_release_folio-cal.patch @@ -0,0 +1,282 @@ +From 060289f8c5d7dc83b3980d57bc014879b377c9a9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 28 Jun 2023 11:48:51 +0100 +Subject: mm: merge folio_has_private()/filemap_release_folio() call pairs + +From: David Howells + +[ Upstream commit 0201ebf274a306a6ebb95e5dc2d6a0a27c737cac ] + +Patch series "mm, netfs, fscache: Stop read optimisation when folio +removed from pagecache", v7. + +This fixes an optimisation in fscache whereby we don't read from the cache +for a particular file until we know that there's data there that we don't +have in the pagecache. The problem is that I'm no longer using PG_fscache +(aka PG_private_2) to indicate that the page is cached and so I don't get +a notification when a cached page is dropped from the pagecache. + +The first patch merges some folio_has_private() and +filemap_release_folio() pairs and introduces a helper, +folio_needs_release(), to indicate if a release is required. + +The second patch is the actual fix. Following Willy's suggestions[1], it +adds an AS_RELEASE_ALWAYS flag to an address_space that will make +filemap_release_folio() always call ->release_folio(), even if +PG_private/PG_private_2 aren't set. folio_needs_release() is altered to +add a check for this. + +This patch (of 2): + +Make filemap_release_folio() check folio_has_private(). Then, in most +cases, where a call to folio_has_private() is immediately followed by a +call to filemap_release_folio(), we can get rid of the test in the pair. + +There are a couple of sites in mm/vscan.c that this can't so easily be +done. In shrink_folio_list(), there are actually three cases (something +different is done for incompletely invalidated buffers), but +filemap_release_folio() elides two of them. + +In shrink_active_list(), we don't have have the folio lock yet, so the +check allows us to avoid locking the page unnecessarily. + +A wrapper function to check if a folio needs release is provided for those +places that still need to do it in the mm/ directory. This will acquire +additional parts to the condition in a future patch. + +After this, the only remaining caller of folio_has_private() outside of +mm/ is a check in fuse. 
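+
+The call-site transformation is mechanical: each failure test of the form
+
+        folio_has_private(folio) && !filemap_release_folio(folio, gfp)
+
+collapses to
+
+        !filemap_release_folio(folio, gfp)
+
+since filemap_release_folio() now begins with a folio_needs_release()
+check and returns true when there is nothing to release.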
+ +Link: https://lkml.kernel.org/r/20230628104852.3391651-1-dhowells@redhat.com +Link: https://lkml.kernel.org/r/20230628104852.3391651-2-dhowells@redhat.com +Reported-by: Rohith Surabattula +Suggested-by: Matthew Wilcox +Signed-off-by: David Howells +Cc: Matthew Wilcox +Cc: Linus Torvalds +Cc: Steve French +Cc: Shyam Prasad N +Cc: Rohith Surabattula +Cc: Dave Wysochanski +Cc: Dominique Martinet +Cc: Ilya Dryomov +Cc: "Theodore Ts'o" +Cc: Andreas Dilger +Cc: Xiubo Li +Cc: Jingbo Xu +Signed-off-by: Andrew Morton +Stable-dep-of: 1898efcdbed3 ("block: update the stable_writes flag in bdev_add") +Signed-off-by: Sasha Levin +--- + fs/ext4/move_extent.c | 12 ++++-------- + fs/splice.c | 3 +-- + mm/filemap.c | 2 ++ + mm/huge_memory.c | 3 +-- + mm/internal.h | 8 ++++++++ + mm/khugepaged.c | 3 +-- + mm/memory-failure.c | 8 +++----- + mm/migrate.c | 3 +-- + mm/truncate.c | 6 ++---- + mm/vmscan.c | 8 ++++---- + 10 files changed, 27 insertions(+), 29 deletions(-) + +diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c +index 8dbb87edf24c4..dedc9d445f243 100644 +--- a/fs/ext4/move_extent.c ++++ b/fs/ext4/move_extent.c +@@ -339,10 +339,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, + ext4_double_up_write_data_sem(orig_inode, donor_inode); + goto data_copy; + } +- if ((folio_has_private(folio[0]) && +- !filemap_release_folio(folio[0], 0)) || +- (folio_has_private(folio[1]) && +- !filemap_release_folio(folio[1], 0))) { ++ if (!filemap_release_folio(folio[0], 0) || ++ !filemap_release_folio(folio[1], 0)) { + *err = -EBUSY; + goto drop_data_sem; + } +@@ -361,10 +359,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, + + /* At this point all buffers in range are uptodate, old mapping layout + * is no longer required, try to drop it now. 
*/ +- if ((folio_has_private(folio[0]) && +- !filemap_release_folio(folio[0], 0)) || +- (folio_has_private(folio[1]) && +- !filemap_release_folio(folio[1], 0))) { ++ if (!filemap_release_folio(folio[0], 0) || ++ !filemap_release_folio(folio[1], 0)) { + *err = -EBUSY; + goto unlock_folios; + } +diff --git a/fs/splice.c b/fs/splice.c +index c4ae54deac42c..d0230cf8ec571 100644 +--- a/fs/splice.c ++++ b/fs/splice.c +@@ -65,8 +65,7 @@ static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe, + */ + folio_wait_writeback(folio); + +- if (folio_has_private(folio) && +- !filemap_release_folio(folio, GFP_KERNEL)) ++ if (!filemap_release_folio(folio, GFP_KERNEL)) + goto out_unlock; + + /* +diff --git a/mm/filemap.c b/mm/filemap.c +index 10fe6430693bd..2809b1174f04e 100644 +--- a/mm/filemap.c ++++ b/mm/filemap.c +@@ -4005,6 +4005,8 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp) + struct address_space * const mapping = folio->mapping; + + BUG_ON(!folio_test_locked(folio)); ++ if (!folio_needs_release(folio)) ++ return true; + if (folio_test_writeback(folio)) + return false; + +diff --git a/mm/huge_memory.c b/mm/huge_memory.c +index 2753fb54cdf38..59577946735b1 100644 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -2694,8 +2694,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) + gfp = current_gfp_context(mapping_gfp_mask(mapping) & + GFP_RECLAIM_MASK); + +- if (folio_test_private(folio) && +- !filemap_release_folio(folio, gfp)) { ++ if (!filemap_release_folio(folio, gfp)) { + ret = -EBUSY; + goto out; + } +diff --git a/mm/internal.h b/mm/internal.h +index 6b7ef495b56d3..1fefb5181ab78 100644 +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -163,6 +163,14 @@ static inline void set_page_refcounted(struct page *page) + set_page_count(page, 1); + } + ++/* ++ * Return true if a folio needs ->release_folio() calling upon it. ++ */ ++static inline bool folio_needs_release(struct folio *folio) ++{ ++ return folio_has_private(folio); ++} ++ + extern unsigned long highest_memmap_pfn; + + /* +diff --git a/mm/khugepaged.c b/mm/khugepaged.c +index 6fc7db587c453..65bd0b105266a 100644 +--- a/mm/khugepaged.c ++++ b/mm/khugepaged.c +@@ -1955,8 +1955,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, + goto out_unlock; + } + +- if (folio_has_private(folio) && +- !filemap_release_folio(folio, GFP_KERNEL)) { ++ if (!filemap_release_folio(folio, GFP_KERNEL)) { + result = SCAN_PAGE_HAS_PRIVATE; + folio_putback_lru(folio); + goto out_unlock; +diff --git a/mm/memory-failure.c b/mm/memory-failure.c +index 6355166a6bb28..5b846ed5dcbe9 100644 +--- a/mm/memory-failure.c ++++ b/mm/memory-failure.c +@@ -830,14 +830,12 @@ static int truncate_error_page(struct page *p, unsigned long pfn, + struct folio *folio = page_folio(p); + int err = mapping->a_ops->error_remove_page(mapping, p); + +- if (err != 0) { ++ if (err != 0) + pr_info("%#lx: Failed to punch page: %d\n", pfn, err); +- } else if (folio_has_private(folio) && +- !filemap_release_folio(folio, GFP_NOIO)) { ++ else if (!filemap_release_folio(folio, GFP_NOIO)) + pr_info("%#lx: failed to release buffers\n", pfn); +- } else { ++ else + ret = MF_RECOVERED; +- } + } else { + /* + * If the file system doesn't support it just invalidate +diff --git a/mm/migrate.c b/mm/migrate.c +index 91bd69c61148e..c93dd6a31c31a 100644 +--- a/mm/migrate.c ++++ b/mm/migrate.c +@@ -914,8 +914,7 @@ static int fallback_migrate_folio(struct address_space *mapping, + * Buffers may be managed in a filesystem specific way. 
+ * We must have no buffers or drop them. + */ +- if (folio_test_private(src) && +- !filemap_release_folio(src, GFP_KERNEL)) ++ if (!filemap_release_folio(src, GFP_KERNEL)) + return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY; + + return migrate_folio(mapping, dst, src, mode); +diff --git a/mm/truncate.c b/mm/truncate.c +index c0be77e5c0083..0d4dd233f5187 100644 +--- a/mm/truncate.c ++++ b/mm/truncate.c +@@ -19,7 +19,6 @@ + #include + #include + #include +-#include /* grr. try_to_release_page */ + #include + #include + #include "internal.h" +@@ -276,7 +275,7 @@ static long mapping_evict_folio(struct address_space *mapping, + if (folio_ref_count(folio) > + folio_nr_pages(folio) + folio_has_private(folio) + 1) + return 0; +- if (folio_has_private(folio) && !filemap_release_folio(folio, 0)) ++ if (!filemap_release_folio(folio, 0)) + return 0; + + return remove_mapping(mapping, folio); +@@ -581,8 +580,7 @@ static int invalidate_complete_folio2(struct address_space *mapping, + if (folio->mapping != mapping) + return 0; + +- if (folio_has_private(folio) && +- !filemap_release_folio(folio, GFP_KERNEL)) ++ if (!filemap_release_folio(folio, GFP_KERNEL)) + return 0; + + spin_lock(&mapping->host->i_lock); +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 3f090faa6377f..9f3cfb7caa48d 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -1992,7 +1992,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, + * (refcount == 1) it can be freed. Otherwise, leave + * the folio on the LRU so it is swappable. + */ +- if (folio_has_private(folio)) { ++ if (folio_needs_release(folio)) { + if (!filemap_release_folio(folio, sc->gfp_mask)) + goto activate_locked; + if (!mapping && folio_ref_count(folio) == 1) { +@@ -2618,9 +2618,9 @@ static void shrink_active_list(unsigned long nr_to_scan, + } + + if (unlikely(buffer_heads_over_limit)) { +- if (folio_test_private(folio) && folio_trylock(folio)) { +- if (folio_test_private(folio)) +- filemap_release_folio(folio, 0); ++ if (folio_needs_release(folio) && ++ folio_trylock(folio)) { ++ filemap_release_folio(folio, 0); + folio_unlock(folio); + } + } +-- +2.43.0 + diff --git a/queue-6.1/mm-netfs-fscache-stop-read-optimisation-when-folio-r.patch b/queue-6.1/mm-netfs-fscache-stop-read-optimisation-when-folio-r.patch new file mode 100644 index 00000000000..28f64330f6b --- /dev/null +++ b/queue-6.1/mm-netfs-fscache-stop-read-optimisation-when-folio-r.patch @@ -0,0 +1,222 @@ +From ed65a1b09f78fea9d521a21c25bb036dc802af12 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 28 Jun 2023 11:48:52 +0100 +Subject: mm, netfs, fscache: stop read optimisation when folio removed from + pagecache + +From: David Howells + +[ Upstream commit b4fa966f03b7401ceacd4ffd7227197afb2b8376 ] + +Fscache has an optimisation by which reads from the cache are skipped +until we know that (a) there's data there to be read and (b) that data +isn't entirely covered by pages resident in the netfs pagecache. This is +done with two flags manipulated by fscache_note_page_release(): + + if (... + test_bit(FSCACHE_COOKIE_HAVE_DATA, &cookie->flags) && + test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) + clear_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags); + +where the NO_DATA_TO_READ flag causes cachefiles_prepare_read() to +indicate that netfslib should download from the server or clear the page +instead. 
+ +The fscache_note_page_release() function is intended to be called from +->releasepage() - but that only gets called if PG_private or PG_private_2 +is set - and currently the former is at the discretion of the network +filesystem and the latter is only set whilst a page is being written to +the cache, so sometimes we miss clearing the optimisation. + +Fix this by following Willy's suggestion[1] and adding an address_space +flag, AS_RELEASE_ALWAYS, that causes filemap_release_folio() to always call +->release_folio() if it's set, even if PG_private or PG_private_2 aren't +set. + +Note that this would require folio_test_private() and page_has_private() to +become more complicated. To avoid that, in the places[*] where these are +used to conditionalise calls to filemap_release_folio() and +try_to_release_page(), the tests are removed the those functions just +jumped to unconditionally and the test is performed there. + +[*] There are some exceptions in vmscan.c where the check guards more than +just a call to the releaser. I've added a function, folio_needs_release() +to wrap all the checks for that. + +AS_RELEASE_ALWAYS should be set if a non-NULL cookie is obtained from +fscache and cleared in ->evict_inode() before truncate_inode_pages_final() +is called. + +Additionally, the FSCACHE_COOKIE_NO_DATA_TO_READ flag needs to be cleared +and the optimisation cancelled if a cachefiles object already contains data +when we open it. + +[dwysocha@redhat.com: call folio_mapping() inside folio_needs_release()] + Link: https://github.com/DaveWysochanskiRH/kernel/commit/902c990e311120179fa5de99d68364b2947b79ec +Link: https://lkml.kernel.org/r/20230628104852.3391651-3-dhowells@redhat.com +Fixes: 1f67e6d0b188 ("fscache: Provide a function to note the release of a page") +Fixes: 047487c947e8 ("cachefiles: Implement the I/O routines") +Signed-off-by: David Howells +Signed-off-by: Dave Wysochanski +Reported-by: Rohith Surabattula +Suggested-by: Matthew Wilcox +Tested-by: SeongJae Park +Cc: Daire Byrne +Cc: Matthew Wilcox +Cc: Linus Torvalds +Cc: Steve French +Cc: Shyam Prasad N +Cc: Rohith Surabattula +Cc: Dave Wysochanski +Cc: Dominique Martinet +Cc: Ilya Dryomov +Cc: Andreas Dilger +Cc: Jingbo Xu +Cc: "Theodore Ts'o" +Cc: Xiubo Li +Signed-off-by: Andrew Morton +Stable-dep-of: 1898efcdbed3 ("block: update the stable_writes flag in bdev_add") +Signed-off-by: Sasha Levin +--- + fs/9p/cache.c | 2 ++ + fs/afs/internal.h | 2 ++ + fs/cachefiles/namei.c | 2 ++ + fs/ceph/cache.c | 2 ++ + fs/nfs/fscache.c | 3 +++ + fs/smb/client/fscache.c | 2 ++ + include/linux/pagemap.h | 16 ++++++++++++++++ + mm/internal.h | 5 ++++- + 8 files changed, 33 insertions(+), 1 deletion(-) + +diff --git a/fs/9p/cache.c b/fs/9p/cache.c +index cebba4eaa0b57..12c0ae29f1857 100644 +--- a/fs/9p/cache.c ++++ b/fs/9p/cache.c +@@ -68,6 +68,8 @@ void v9fs_cache_inode_get_cookie(struct inode *inode) + &path, sizeof(path), + &version, sizeof(version), + i_size_read(&v9inode->netfs.inode)); ++ if (v9inode->netfs.cache) ++ mapping_set_release_always(inode->i_mapping); + + p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n", + inode, v9fs_inode_cookie(v9inode)); +diff --git a/fs/afs/internal.h b/fs/afs/internal.h +index fcbb598d8c85d..a25fdc3e52310 100644 +--- a/fs/afs/internal.h ++++ b/fs/afs/internal.h +@@ -682,6 +682,8 @@ static inline void afs_vnode_set_cache(struct afs_vnode *vnode, + { + #ifdef CONFIG_AFS_FSCACHE + vnode->netfs.cache = cookie; ++ if (cookie) ++ mapping_set_release_always(vnode->netfs.inode.i_mapping); + #endif + } + +diff --git 
a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c +index 03ca8f2f657ab..50b2ee163af60 100644 +--- a/fs/cachefiles/namei.c ++++ b/fs/cachefiles/namei.c +@@ -584,6 +584,8 @@ static bool cachefiles_open_file(struct cachefiles_object *object, + if (ret < 0) + goto check_failed; + ++ clear_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &object->cookie->flags); ++ + object->file = file; + + /* Always update the atime on an object we've just looked up (this is +diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c +index 177d8e8d73fe4..de1dee46d3df7 100644 +--- a/fs/ceph/cache.c ++++ b/fs/ceph/cache.c +@@ -36,6 +36,8 @@ void ceph_fscache_register_inode_cookie(struct inode *inode) + &ci->i_vino, sizeof(ci->i_vino), + &ci->i_version, sizeof(ci->i_version), + i_size_read(inode)); ++ if (ci->netfs.cache) ++ mapping_set_release_always(inode->i_mapping); + } + + void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info *ci) +diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c +index e731c00a9fcbc..d3c938dd2b12a 100644 +--- a/fs/nfs/fscache.c ++++ b/fs/nfs/fscache.c +@@ -176,6 +176,9 @@ void nfs_fscache_init_inode(struct inode *inode) + &auxdata, /* aux_data */ + sizeof(auxdata), + i_size_read(inode)); ++ ++ if (netfs_inode(inode)->cache) ++ mapping_set_release_always(inode->i_mapping); + } + + /* +diff --git a/fs/smb/client/fscache.c b/fs/smb/client/fscache.c +index e73625b5d0cc6..f64bad513ba6d 100644 +--- a/fs/smb/client/fscache.c ++++ b/fs/smb/client/fscache.c +@@ -108,6 +108,8 @@ void cifs_fscache_get_inode_cookie(struct inode *inode) + &cifsi->uniqueid, sizeof(cifsi->uniqueid), + &cd, sizeof(cd), + i_size_read(&cifsi->netfs.inode)); ++ if (cifsi->netfs.cache) ++ mapping_set_release_always(inode->i_mapping); + } + + void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update) +diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h +index 03307b72de6c6..fdbb90ae56c70 100644 +--- a/include/linux/pagemap.h ++++ b/include/linux/pagemap.h +@@ -199,6 +199,7 @@ enum mapping_flags { + /* writeback related tags are not used */ + AS_NO_WRITEBACK_TAGS = 5, + AS_LARGE_FOLIO_SUPPORT = 6, ++ AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */ + }; + + /** +@@ -269,6 +270,21 @@ static inline int mapping_use_writeback_tags(struct address_space *mapping) + return !test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags); + } + ++static inline bool mapping_release_always(const struct address_space *mapping) ++{ ++ return test_bit(AS_RELEASE_ALWAYS, &mapping->flags); ++} ++ ++static inline void mapping_set_release_always(struct address_space *mapping) ++{ ++ set_bit(AS_RELEASE_ALWAYS, &mapping->flags); ++} ++ ++static inline void mapping_clear_release_always(struct address_space *mapping) ++{ ++ clear_bit(AS_RELEASE_ALWAYS, &mapping->flags); ++} ++ + static inline gfp_t mapping_gfp_mask(struct address_space * mapping) + { + return mapping->gfp_mask; +diff --git a/mm/internal.h b/mm/internal.h +index 1fefb5181ab78..d01130efce5fb 100644 +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -168,7 +168,10 @@ static inline void set_page_refcounted(struct page *page) + */ + static inline bool folio_needs_release(struct folio *folio) + { +- return folio_has_private(folio); ++ struct address_space *mapping = folio_mapping(folio); ++ ++ return folio_has_private(folio) || ++ (mapping && mapping_release_always(mapping)); + } + + extern unsigned long highest_memmap_pfn; +-- +2.43.0 + diff --git a/queue-6.1/net-annotate-data-races-around-sk-sk_bind_phc.patch b/queue-6.1/net-annotate-data-races-around-sk-sk_bind_phc.patch 
new file mode 100644 index 00000000000..c53afba645d --- /dev/null +++ b/queue-6.1/net-annotate-data-races-around-sk-sk_bind_phc.patch @@ -0,0 +1,60 @@ +From 1b799e9a0670b2cf155f5463f9b42e791668abaa Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 31 Aug 2023 13:52:12 +0000 +Subject: net: annotate data-races around sk->sk_bind_phc + +From: Eric Dumazet + +[ Upstream commit 251cd405a9e6e70b92fe5afbdd17fd5caf9d3266 ] + +sk->sk_bind_phc is read locklessly. Add corresponding annotations. + +Fixes: d463126e23f1 ("net: sock: extend SO_TIMESTAMPING for PHC binding") +Signed-off-by: Eric Dumazet +Cc: Yangbo Lu +Signed-off-by: David S. Miller +Stable-dep-of: 7f6ca95d16b9 ("net: Implement missing getsockopt(SO_TIMESTAMPING_NEW)") +Signed-off-by: Sasha Levin +--- + net/core/sock.c | 4 ++-- + net/socket.c | 2 +- + 2 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/net/core/sock.c b/net/core/sock.c +index 929055bc0cc7b..49b7f252ddae4 100644 +--- a/net/core/sock.c ++++ b/net/core/sock.c +@@ -890,7 +890,7 @@ static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) + if (!match) + return -EINVAL; + +- sk->sk_bind_phc = phc_index; ++ WRITE_ONCE(sk->sk_bind_phc, phc_index); + + return 0; + } +@@ -1706,7 +1706,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, + case SO_TIMESTAMPING_OLD: + lv = sizeof(v.timestamping); + v.timestamping.flags = READ_ONCE(sk->sk_tsflags); +- v.timestamping.bind_phc = sk->sk_bind_phc; ++ v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc); + break; + + case SO_RCVTIMEO_OLD: +diff --git a/net/socket.c b/net/socket.c +index 9c1fb94b12851..07470724e7358 100644 +--- a/net/socket.c ++++ b/net/socket.c +@@ -940,7 +940,7 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, + + if (tsflags & SOF_TIMESTAMPING_BIND_PHC) + hwtstamp = ptp_convert_timestamp(&hwtstamp, +- sk->sk_bind_phc); ++ READ_ONCE(sk->sk_bind_phc)); + + if (ktime_to_timespec64_cond(hwtstamp, tss.ts + 2)) { + empty = 0; +-- +2.43.0 + diff --git a/queue-6.1/net-annotate-data-races-around-sk-sk_tsflags.patch b/queue-6.1/net-annotate-data-races-around-sk-sk_tsflags.patch new file mode 100644 index 00000000000..fadc546cbbe --- /dev/null +++ b/queue-6.1/net-annotate-data-races-around-sk-sk_tsflags.patch @@ -0,0 +1,367 @@ +From e1f7cc7fc59e4d300f8a27e6ce20ed53893823db Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 31 Aug 2023 13:52:11 +0000 +Subject: net: annotate data-races around sk->sk_tsflags + +From: Eric Dumazet + +[ Upstream commit e3390b30a5dfb112e8e802a59c0f68f947b638b2 ] + +sk->sk_tsflags can be read locklessly, add corresponding annotations. + +Fixes: b9f40e21ef42 ("net-timestamp: move timestamp flags out of sk_flags") +Signed-off-by: Eric Dumazet +Cc: Willem de Bruijn +Signed-off-by: David S. 
Miller +Stable-dep-of: 7f6ca95d16b9 ("net: Implement missing getsockopt(SO_TIMESTAMPING_NEW)") +Signed-off-by: Sasha Levin +--- + include/net/ip.h | 2 +- + include/net/sock.h | 17 ++++++++++------- + net/can/j1939/socket.c | 10 ++++++---- + net/core/skbuff.c | 10 ++++++---- + net/core/sock.c | 4 ++-- + net/ipv4/ip_output.c | 2 +- + net/ipv4/ip_sockglue.c | 2 +- + net/ipv4/tcp.c | 4 ++-- + net/ipv6/ip6_output.c | 2 +- + net/ipv6/ping.c | 2 +- + net/ipv6/raw.c | 2 +- + net/ipv6/udp.c | 2 +- + net/socket.c | 13 +++++++------ + 13 files changed, 40 insertions(+), 32 deletions(-) + +diff --git a/include/net/ip.h b/include/net/ip.h +index c286344628dba..c83c09c65623f 100644 +--- a/include/net/ip.h ++++ b/include/net/ip.h +@@ -95,7 +95,7 @@ static inline void ipcm_init_sk(struct ipcm_cookie *ipcm, + ipcm_init(ipcm); + + ipcm->sockc.mark = READ_ONCE(inet->sk.sk_mark); +- ipcm->sockc.tsflags = inet->sk.sk_tsflags; ++ ipcm->sockc.tsflags = READ_ONCE(inet->sk.sk_tsflags); + ipcm->oif = READ_ONCE(inet->sk.sk_bound_dev_if); + ipcm->addr = inet->inet_saddr; + ipcm->protocol = inet->inet_num; +diff --git a/include/net/sock.h b/include/net/sock.h +index b6027b01c2455..d8ed62a8e1a3e 100644 +--- a/include/net/sock.h ++++ b/include/net/sock.h +@@ -1928,7 +1928,9 @@ struct sockcm_cookie { + static inline void sockcm_init(struct sockcm_cookie *sockc, + const struct sock *sk) + { +- *sockc = (struct sockcm_cookie) { .tsflags = sk->sk_tsflags }; ++ *sockc = (struct sockcm_cookie) { ++ .tsflags = READ_ONCE(sk->sk_tsflags) ++ }; + } + + int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, +@@ -2741,9 +2743,9 @@ void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, + static inline void + sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) + { +- ktime_t kt = skb->tstamp; + struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb); +- ++ u32 tsflags = READ_ONCE(sk->sk_tsflags); ++ ktime_t kt = skb->tstamp; + /* + * generate control messages if + * - receive time stamping in software requested +@@ -2751,10 +2753,10 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) + * - hardware time stamps available and wanted + */ + if (sock_flag(sk, SOCK_RCVTSTAMP) || +- (sk->sk_tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) || +- (kt && sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) || ++ (tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) || ++ (kt && tsflags & SOF_TIMESTAMPING_SOFTWARE) || + (hwtstamps->hwtstamp && +- (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE))) ++ (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE))) + __sock_recv_timestamp(msg, sk, skb); + else + sock_write_timestamp(sk, kt); +@@ -2776,7 +2778,8 @@ static inline void sock_recv_cmsgs(struct msghdr *msg, struct sock *sk, + #define TSFLAGS_ANY (SOF_TIMESTAMPING_SOFTWARE | \ + SOF_TIMESTAMPING_RAW_HARDWARE) + +- if (sk->sk_flags & FLAGS_RECV_CMSGS || sk->sk_tsflags & TSFLAGS_ANY) ++ if (sk->sk_flags & FLAGS_RECV_CMSGS || ++ READ_ONCE(sk->sk_tsflags) & TSFLAGS_ANY) + __sock_recv_cmsgs(msg, sk, skb); + else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP))) + sock_write_timestamp(sk, skb->tstamp); +diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c +index 9c828067b4481..b0be23559243c 100644 +--- a/net/can/j1939/socket.c ++++ b/net/can/j1939/socket.c +@@ -974,6 +974,7 @@ static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk, + struct sock_exterr_skb *serr; + struct sk_buff *skb; + char *state = "UNK"; ++ u32 tsflags; + int err; + + jsk = j1939_sk(sk); +@@ -981,13 
+982,14 @@ static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk, + if (!(jsk->state & J1939_SOCK_ERRQUEUE)) + return; + ++ tsflags = READ_ONCE(sk->sk_tsflags); + switch (type) { + case J1939_ERRQUEUE_TX_ACK: +- if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_ACK)) ++ if (!(tsflags & SOF_TIMESTAMPING_TX_ACK)) + return; + break; + case J1939_ERRQUEUE_TX_SCHED: +- if (!(sk->sk_tsflags & SOF_TIMESTAMPING_TX_SCHED)) ++ if (!(tsflags & SOF_TIMESTAMPING_TX_SCHED)) + return; + break; + case J1939_ERRQUEUE_TX_ABORT: +@@ -997,7 +999,7 @@ static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk, + case J1939_ERRQUEUE_RX_DPO: + fallthrough; + case J1939_ERRQUEUE_RX_ABORT: +- if (!(sk->sk_tsflags & SOF_TIMESTAMPING_RX_SOFTWARE)) ++ if (!(tsflags & SOF_TIMESTAMPING_RX_SOFTWARE)) + return; + break; + default: +@@ -1054,7 +1056,7 @@ static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk, + } + + serr->opt_stats = true; +- if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) ++ if (tsflags & SOF_TIMESTAMPING_OPT_ID) + serr->ee.ee_data = session->tskey; + + netdev_dbg(session->priv->ndev, "%s: 0x%p tskey: %i, state: %s\n", +diff --git a/net/core/skbuff.c b/net/core/skbuff.c +index 73b1e0e53534e..8a819d0a7bfb0 100644 +--- a/net/core/skbuff.c ++++ b/net/core/skbuff.c +@@ -4913,7 +4913,7 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb, + serr->ee.ee_info = tstype; + serr->opt_stats = opt_stats; + serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0; +- if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { ++ if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { + serr->ee.ee_data = skb_shinfo(skb)->tskey; + if (sk_is_tcp(sk)) + serr->ee.ee_data -= atomic_read(&sk->sk_tskey); +@@ -4969,21 +4969,23 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, + { + struct sk_buff *skb; + bool tsonly, opt_stats = false; ++ u32 tsflags; + + if (!sk) + return; + +- if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && ++ tsflags = READ_ONCE(sk->sk_tsflags); ++ if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && + skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) + return; + +- tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; ++ tsonly = tsflags & SOF_TIMESTAMPING_OPT_TSONLY; + if (!skb_may_tx_timestamp(sk, tsonly)) + return; + + if (tsonly) { + #ifdef CONFIG_INET +- if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && ++ if ((tsflags & SOF_TIMESTAMPING_OPT_STATS) && + sk_is_tcp(sk)) { + skb = tcp_get_timestamping_opt_stats(sk, orig_skb, + ack_skb); +diff --git a/net/core/sock.c b/net/core/sock.c +index 4305e55dbfba4..929055bc0cc7b 100644 +--- a/net/core/sock.c ++++ b/net/core/sock.c +@@ -926,7 +926,7 @@ int sock_set_timestamping(struct sock *sk, int optname, + return ret; + } + +- sk->sk_tsflags = val; ++ WRITE_ONCE(sk->sk_tsflags, val); + sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); + + if (val & SOF_TIMESTAMPING_RX_SOFTWARE) +@@ -1705,7 +1705,7 @@ int sk_getsockopt(struct sock *sk, int level, int optname, + + case SO_TIMESTAMPING_OLD: + lv = sizeof(v.timestamping); +- v.timestamping.flags = sk->sk_tsflags; ++ v.timestamping.flags = READ_ONCE(sk->sk_tsflags); + v.timestamping.bind_phc = sk->sk_bind_phc; + break; + +diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c +index d8ec802f97524..e19ef88ae181f 100644 +--- a/net/ipv4/ip_output.c ++++ b/net/ipv4/ip_output.c +@@ -991,7 +991,7 @@ static int __ip_append_data(struct sock *sk, + paged = !!cork->gso_size; + + if (cork->tx_flags & 
SKBTX_ANY_TSTAMP && +- sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) ++ READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) + tskey = atomic_inc_return(&sk->sk_tskey) - 1; + + hh_len = LL_RESERVED_SPACE(rt->dst.dev); +diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c +index 63aa52becd880..c1fb7580ea581 100644 +--- a/net/ipv4/ip_sockglue.c ++++ b/net/ipv4/ip_sockglue.c +@@ -509,7 +509,7 @@ static bool ipv4_datagram_support_cmsg(const struct sock *sk, + * or without payload (SOF_TIMESTAMPING_OPT_TSONLY). + */ + info = PKTINFO_SKB_CB(skb); +- if (!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG) || ++ if (!(READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_CMSG) || + !info->ipi_ifindex) + return false; + +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 58409ea2da0af..3935451ad061e 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -2359,14 +2359,14 @@ void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, + } + } + +- if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) ++ if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_SOFTWARE) + has_timestamping = true; + else + tss->ts[0] = (struct timespec64) {0}; + } + + if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) { +- if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) ++ if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_RAW_HARDWARE) + has_timestamping = true; + else + tss->ts[2] = (struct timespec64) {0}; +diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c +index 04822e2cba74a..e9ae084d038d1 100644 +--- a/net/ipv6/ip6_output.c ++++ b/net/ipv6/ip6_output.c +@@ -1507,7 +1507,7 @@ static int __ip6_append_data(struct sock *sk, + orig_mtu = mtu; + + if (cork->tx_flags & SKBTX_ANY_TSTAMP && +- sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) ++ READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) + tskey = atomic_inc_return(&sk->sk_tskey) - 1; + + hh_len = LL_RESERVED_SPACE(rt->dst.dev); +diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c +index 4d5a27dd9a4b2..a5d7d1915ba7e 100644 +--- a/net/ipv6/ping.c ++++ b/net/ipv6/ping.c +@@ -119,7 +119,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) + return -EINVAL; + + ipcm6_init_sk(&ipc6, np); +- ipc6.sockc.tsflags = sk->sk_tsflags; ++ ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); + ipc6.sockc.mark = READ_ONCE(sk->sk_mark); + + fl6.flowi6_oif = oif; +diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c +index df3abd9e5237c..dc31752a7edcc 100644 +--- a/net/ipv6/raw.c ++++ b/net/ipv6/raw.c +@@ -776,7 +776,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) + fl6.flowi6_uid = sk->sk_uid; + + ipcm6_init(&ipc6); +- ipc6.sockc.tsflags = sk->sk_tsflags; ++ ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); + ipc6.sockc.mark = fl6.flowi6_mark; + + if (sin6) { +diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c +index 64b36c2ba774a..7f49f69226a21 100644 +--- a/net/ipv6/udp.c ++++ b/net/ipv6/udp.c +@@ -1358,7 +1358,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) + + ipcm6_init(&ipc6); + ipc6.gso_size = READ_ONCE(up->gso_size); +- ipc6.sockc.tsflags = sk->sk_tsflags; ++ ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags); + ipc6.sockc.mark = READ_ONCE(sk->sk_mark); + + /* destination address check */ +diff --git a/net/socket.c b/net/socket.c +index 04cba91c7cbe5..9c1fb94b12851 100644 +--- a/net/socket.c ++++ b/net/socket.c +@@ -826,7 +826,7 @@ static bool skb_is_swtx_tstamp(const struct sk_buff *skb, int false_tstamp) + + static ktime_t get_timestamp(struct sock *sk, struct sk_buff *skb, int *if_index) + { +- bool cycles = sk->sk_tsflags & 
SOF_TIMESTAMPING_BIND_PHC; ++ bool cycles = READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_BIND_PHC; + struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); + struct net_device *orig_dev; + ktime_t hwtstamp; +@@ -878,12 +878,12 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, + int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP); + int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); + struct scm_timestamping_internal tss; +- + int empty = 1, false_tstamp = 0; + struct skb_shared_hwtstamps *shhwtstamps = + skb_hwtstamps(skb); + int if_index; + ktime_t hwtstamp; ++ u32 tsflags; + + /* Race occurred between timestamp enabling and packet + receiving. Fill in the current time for now. */ +@@ -925,11 +925,12 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, + } + + memset(&tss, 0, sizeof(tss)); +- if ((sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) && ++ tsflags = READ_ONCE(sk->sk_tsflags); ++ if ((tsflags & SOF_TIMESTAMPING_SOFTWARE) && + ktime_to_timespec64_cond(skb->tstamp, tss.ts + 0)) + empty = 0; + if (shhwtstamps && +- (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && ++ (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && + !skb_is_swtx_tstamp(skb, false_tstamp)) { + if_index = 0; + if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NETDEV) +@@ -937,14 +938,14 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, + else + hwtstamp = shhwtstamps->hwtstamp; + +- if (sk->sk_tsflags & SOF_TIMESTAMPING_BIND_PHC) ++ if (tsflags & SOF_TIMESTAMPING_BIND_PHC) + hwtstamp = ptp_convert_timestamp(&hwtstamp, + sk->sk_bind_phc); + + if (ktime_to_timespec64_cond(hwtstamp, tss.ts + 2)) { + empty = 0; + +- if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) && ++ if ((tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) && + !skb_is_err_queue(skb)) + put_ts_pktinfo(msg, skb, if_index); + } +-- +2.43.0 + diff --git a/queue-6.1/net-bcmgenet-fix-fcs-generation-for-fragmented-skbuf.patch b/queue-6.1/net-bcmgenet-fix-fcs-generation-for-fragmented-skbuf.patch new file mode 100644 index 00000000000..78419a9cb47 --- /dev/null +++ b/queue-6.1/net-bcmgenet-fix-fcs-generation-for-fragmented-skbuf.patch @@ -0,0 +1,46 @@ +From 0f89a214d5bd7890cd44370aca6aade6589a47b3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 28 Dec 2023 14:56:38 +0100 +Subject: net: bcmgenet: Fix FCS generation for fragmented skbuffs + +From: Adrian Cinal + +[ Upstream commit e584f2ff1e6cc9b1d99e8a6b0f3415940d1b3eb3 ] + +The flag DMA_TX_APPEND_CRC was only written to the first DMA descriptor +in the TX path, where each descriptor corresponds to a single skbuff +fragment (or the skbuff head). This led to packets with no FCS appearing +on the wire if the kernel allocated the packet in fragments, which would +always happen when using PACKET_MMAP/TPACKET (cf. tpacket_fill_skb() in +net/af_packet.c). 
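+
+In outline (see the hunk below), the per-descriptor flag setup moves from
+
+        if (!i)
+                len_stat |= DMA_TX_APPEND_CRC | DMA_SOP;
+
+to
+
+        len_stat |= DMA_TX_APPEND_CRC;  /* on every fragment */
+        if (!i)
+                len_stat |= DMA_SOP;
+
+so the CRC-append request is no longer limited to the first descriptor.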
+ +Fixes: 1c1008c793fa ("net: bcmgenet: add main driver file") +Signed-off-by: Adrian Cinal +Acked-by: Doug Berger +Acked-by: Florian Fainelli +Link: https://lore.kernel.org/r/20231228135638.1339245-1-adriancinal1@gmail.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/broadcom/genet/bcmgenet.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c +index 1ae082eb9e905..c2a9913082153 100644 +--- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c ++++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c +@@ -2131,8 +2131,10 @@ static netdev_tx_t bcmgenet_xmit(struct sk_buff *skb, struct net_device *dev) + /* Note: if we ever change from DMA_TX_APPEND_CRC below we + * will need to restore software padding of "runt" packets + */ ++ len_stat |= DMA_TX_APPEND_CRC; ++ + if (!i) { +- len_stat |= DMA_TX_APPEND_CRC | DMA_SOP; ++ len_stat |= DMA_SOP; + if (skb->ip_summed == CHECKSUM_PARTIAL) + len_stat |= DMA_TX_DO_CSUM; + } +-- +2.43.0 + diff --git a/queue-6.1/net-declare-msg_splice_pages-internal-sendmsg-flag.patch b/queue-6.1/net-declare-msg_splice_pages-internal-sendmsg-flag.patch new file mode 100644 index 00000000000..0323681a866 --- /dev/null +++ b/queue-6.1/net-declare-msg_splice_pages-internal-sendmsg-flag.patch @@ -0,0 +1,94 @@ +From 75dffd6df5e444bb377e400ba3e8acf49ca982d3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 May 2023 13:11:10 +0100 +Subject: net: Declare MSG_SPLICE_PAGES internal sendmsg() flag + +From: David Howells + +[ Upstream commit b841b901c452d92610f739a36e54978453528876 ] + +Declare MSG_SPLICE_PAGES, an internal sendmsg() flag, that hints to a +network protocol that it should splice pages from the source iterator +rather than copying the data if it can. This flag is added to a list that +is cleared by sendmsg syscalls on entry. + +This is intended as a replacement for the ->sendpage() op, allowing a way +to splice in several multipage folios in one go. + +Signed-off-by: David Howells +Reviewed-by: Willem de Bruijn +cc: Jens Axboe +cc: Matthew Wilcox +Signed-off-by: Jakub Kicinski +Stable-dep-of: a0002127cd74 ("udp: move udp->no_check6_tx to udp->udp_flags") +Signed-off-by: Sasha Levin +--- + include/linux/socket.h | 3 +++ + io_uring/net.c | 2 ++ + net/socket.c | 2 ++ + 3 files changed, 7 insertions(+) + +diff --git a/include/linux/socket.h b/include/linux/socket.h +index 1db29aab8f9c3..b3c58042bd254 100644 +--- a/include/linux/socket.h ++++ b/include/linux/socket.h +@@ -324,6 +324,7 @@ struct ucred { + */ + + #define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ ++#define MSG_SPLICE_PAGES 0x8000000 /* Splice the pages from the iterator in sendmsg() */ + #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ + #define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file + descriptor received through +@@ -334,6 +335,8 @@ struct ucred { + #define MSG_CMSG_COMPAT 0 /* We never have 32 bit fixups */ + #endif + ++/* Flags to be cleared on entry by sendmsg and sendmmsg syscalls */ ++#define MSG_INTERNAL_SENDMSG_FLAGS (MSG_SPLICE_PAGES) + + /* Setsockoptions(2) level. 
Thanks to BSD these must match IPPROTO_xxx */ + #define SOL_IP 0 +diff --git a/io_uring/net.c b/io_uring/net.c +index 57c626cb4d1a5..67f09a40bcb21 100644 +--- a/io_uring/net.c ++++ b/io_uring/net.c +@@ -389,6 +389,7 @@ int io_send(struct io_kiocb *req, unsigned int issue_flags) + if (flags & MSG_WAITALL) + min_ret = iov_iter_count(&msg.msg_iter); + ++ flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; + msg.msg_flags = flags; + ret = sock_sendmsg(sock, &msg); + if (ret < min_ret) { +@@ -1137,6 +1138,7 @@ int io_send_zc(struct io_kiocb *req, unsigned int issue_flags) + msg_flags |= MSG_DONTWAIT; + if (msg_flags & MSG_WAITALL) + min_ret = iov_iter_count(&msg.msg_iter); ++ msg_flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; + + msg.msg_flags = msg_flags; + msg.msg_ubuf = &io_notif_to_data(zc->notif)->uarg; +diff --git a/net/socket.c b/net/socket.c +index 0104617b440dc..6f39f7b0cc85c 100644 +--- a/net/socket.c ++++ b/net/socket.c +@@ -2131,6 +2131,7 @@ int __sys_sendto(int fd, void __user *buff, size_t len, unsigned int flags, + msg.msg_name = (struct sockaddr *)&address; + msg.msg_namelen = addr_len; + } ++ flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; + if (sock->file->f_flags & O_NONBLOCK) + flags |= MSG_DONTWAIT; + msg.msg_flags = flags; +@@ -2482,6 +2483,7 @@ static int ____sys_sendmsg(struct socket *sock, struct msghdr *msg_sys, + msg_sys->msg_control = ctl_buf; + msg_sys->msg_control_is_user = false; + } ++ flags &= ~MSG_INTERNAL_SENDMSG_FLAGS; + msg_sys->msg_flags = flags; + + if (sock->file->f_flags & O_NONBLOCK) +-- +2.43.0 + diff --git a/queue-6.1/net-dpaa2-eth-rearrange-variable-in-dpaa2_eth_get_et.patch b/queue-6.1/net-dpaa2-eth-rearrange-variable-in-dpaa2_eth_get_et.patch new file mode 100644 index 00000000000..014b5758f28 --- /dev/null +++ b/queue-6.1/net-dpaa2-eth-rearrange-variable-in-dpaa2_eth_get_et.patch @@ -0,0 +1,62 @@ +From 8e5b100ede5240de3c21551e38e66faf2d685c09 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 18 Oct 2022 17:18:51 +0300 +Subject: net: dpaa2-eth: rearrange variable in dpaa2_eth_get_ethtool_stats + +From: Ioana Ciornei + +[ Upstream commit 3313206827678f6f036eca601a51f6c4524b559a ] + +Rearrange the variables in the dpaa2_eth_get_ethtool_stats() function so +that we adhere to the reverse Christmas tree rule. +Also, in the next patch we are adding more variables and I didn't know +where to place them with the current ordering. + +Signed-off-by: Ioana Ciornei +Signed-off-by: David S. 
Miller +Stable-dep-of: beb1930f966d ("dpaa2-eth: recycle the RX buffer only after all processing done") +Signed-off-by: Sasha Levin +--- + .../ethernet/freescale/dpaa2/dpaa2-ethtool.c | 18 ++++++++---------- + 1 file changed, 8 insertions(+), 10 deletions(-) + +diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c +index eea7d7a07c007..59888826469b9 100644 +--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c ++++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c +@@ -227,17 +227,8 @@ static void dpaa2_eth_get_ethtool_stats(struct net_device *net_dev, + struct ethtool_stats *stats, + u64 *data) + { +- int i = 0; +- int j, k, err; +- int num_cnt; +- union dpni_statistics dpni_stats; +- u32 fcnt, bcnt; +- u32 fcnt_rx_total = 0, fcnt_tx_total = 0; +- u32 bcnt_rx_total = 0, bcnt_tx_total = 0; +- u32 buf_cnt; + struct dpaa2_eth_priv *priv = netdev_priv(net_dev); +- struct dpaa2_eth_drv_stats *extras; +- struct dpaa2_eth_ch_stats *ch_stats; ++ union dpni_statistics dpni_stats; + int dpni_stats_page_size[DPNI_STATISTICS_CNT] = { + sizeof(dpni_stats.page_0), + sizeof(dpni_stats.page_1), +@@ -247,6 +238,13 @@ static void dpaa2_eth_get_ethtool_stats(struct net_device *net_dev, + sizeof(dpni_stats.page_5), + sizeof(dpni_stats.page_6), + }; ++ u32 fcnt_rx_total = 0, fcnt_tx_total = 0; ++ u32 bcnt_rx_total = 0, bcnt_tx_total = 0; ++ struct dpaa2_eth_ch_stats *ch_stats; ++ struct dpaa2_eth_drv_stats *extras; ++ int j, k, err, num_cnt, i = 0; ++ u32 fcnt, bcnt; ++ u32 buf_cnt; + + memset(data, 0, + sizeof(u64) * (DPAA2_ETH_NUM_STATS + DPAA2_ETH_NUM_EXTRA_STATS)); +-- +2.43.0 + diff --git a/queue-6.1/net-implement-missing-getsockopt-so_timestamping_new.patch b/queue-6.1/net-implement-missing-getsockopt-so_timestamping_new.patch new file mode 100644 index 00000000000..340dbc02a63 --- /dev/null +++ b/queue-6.1/net-implement-missing-getsockopt-so_timestamping_new.patch @@ -0,0 +1,60 @@ +From f3ca390d856050f4a3be15ee0cec3f772f96b860 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Dec 2023 00:19:01 +0100 +Subject: net: Implement missing getsockopt(SO_TIMESTAMPING_NEW) +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Jörn-Thorben Hinz + +[ Upstream commit 7f6ca95d16b96567ce4cf458a2790ff17fa620c3 ] + +Commit 9718475e6908 ("socket: Add SO_TIMESTAMPING_NEW") added the new +socket option SO_TIMESTAMPING_NEW. Setting the option is handled in +sk_setsockopt(), querying it was not handled in sk_getsockopt(), though. + +Following remarks on an earlier submission of this patch, keep the old +behavior of getsockopt(SO_TIMESTAMPING_OLD) which returns the active +flags even if they actually have been set through SO_TIMESTAMPING_NEW. + +The new getsockopt(SO_TIMESTAMPING_NEW) is stricter, returning flags +only if they have been set through the same option. + +Fixes: 9718475e6908 ("socket: Add SO_TIMESTAMPING_NEW") +Link: https://lore.kernel.org/lkml/20230703175048.151683-1-jthinz@mailbox.tu-berlin.de/ +Link: https://lore.kernel.org/netdev/0d7cddc9-03fa-43db-a579-14f3e822615b@app.fastmail.com/ +Signed-off-by: Jörn-Thorben Hinz +Reviewed-by: Willem de Bruijn +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + net/core/sock.c | 11 +++++++++-- + 1 file changed, 9 insertions(+), 2 deletions(-) + +diff --git a/net/core/sock.c b/net/core/sock.c +index 49b7f252ddae4..0d8754ec837dc 100644 +--- a/net/core/sock.c ++++ b/net/core/sock.c +@@ -1704,9 +1704,16 @@ int sk_getsockopt(struct sock *sk, int level, int optname, + break; + + case SO_TIMESTAMPING_OLD: ++ case SO_TIMESTAMPING_NEW: + lv = sizeof(v.timestamping); +- v.timestamping.flags = READ_ONCE(sk->sk_tsflags); +- v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc); ++ /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only ++ * returning the flags when they were set through the same option. ++ * Don't change the beviour for the old case SO_TIMESTAMPING_OLD. ++ */ ++ if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) { ++ v.timestamping.flags = READ_ONCE(sk->sk_tsflags); ++ v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc); ++ } + break; + + case SO_RCVTIMEO_OLD: +-- +2.43.0 + diff --git a/queue-6.1/net-implement-missing-so_timestamping_new-cmsg-suppo.patch b/queue-6.1/net-implement-missing-so_timestamping_new-cmsg-suppo.patch new file mode 100644 index 00000000000..9dc4c47a662 --- /dev/null +++ b/queue-6.1/net-implement-missing-so_timestamping_new-cmsg-suppo.patch @@ -0,0 +1,40 @@ +From e6b1f3de357f796324e9e623e65680e3c7fff48f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 4 Jan 2024 09:57:44 +0100 +Subject: net: Implement missing SO_TIMESTAMPING_NEW cmsg support + +From: Thomas Lange + +[ Upstream commit 382a32018b74f407008615e0e831d05ed28e81cd ] + +Commit 9718475e6908 ("socket: Add SO_TIMESTAMPING_NEW") added the new +socket option SO_TIMESTAMPING_NEW. However, it was never implemented in +__sock_cmsg_send thus breaking SO_TIMESTAMPING cmsg for platforms using +SO_TIMESTAMPING_NEW. + +Fixes: 9718475e6908 ("socket: Add SO_TIMESTAMPING_NEW") +Link: https://lore.kernel.org/netdev/6a7281bf-bc4a-4f75-bb88-7011908ae471@app.fastmail.com/ +Signed-off-by: Thomas Lange +Reviewed-by: Willem de Bruijn +Link: https://lore.kernel.org/r/20240104085744.49164-1-thomas@corelatus.se +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/core/sock.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/net/core/sock.c b/net/core/sock.c +index 0d8754ec837dc..c50a14a02edd4 100644 +--- a/net/core/sock.c ++++ b/net/core/sock.c +@@ -2771,6 +2771,7 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, + sockc->mark = *(u32 *)CMSG_DATA(cmsg); + break; + case SO_TIMESTAMPING_OLD: ++ case SO_TIMESTAMPING_NEW: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) + return -EINVAL; + +-- +2.43.0 + diff --git a/queue-6.1/net-mlx5-increase-size-of-irq-name-buffer.patch b/queue-6.1/net-mlx5-increase-size-of-irq-name-buffer.patch new file mode 100644 index 00000000000..aa683dd36ce --- /dev/null +++ b/queue-6.1/net-mlx5-increase-size-of-irq-name-buffer.patch @@ -0,0 +1,76 @@ +From 2a83821a4f768e3f7e4d98d0b8623c31ade327a1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 14 Nov 2023 13:58:43 -0800 +Subject: net/mlx5: Increase size of irq name buffer + +From: Rahul Rameshbabu + +[ Upstream commit 3338bebfc26a1e2cebbba82a1cf12c0159608e73 ] + +Without increased buffer size, will trigger -Wformat-truncation with W=1 +for the snprintf operation writing to the buffer. 
+ + drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c: In function 'mlx5_irq_alloc': + drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c:296:7: error: '@pci:' directive output may be truncated writing 5 bytes into a region of size between 1 and 32 [-Werror=format-truncation=] + 296 | "%s@pci:%s", name, pci_name(dev->pdev)); + | ^~~~~ + drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c:295:2: note: 'snprintf' output 6 or more bytes (assuming 37) into a destination of size 32 + 295 | snprintf(irq->name, MLX5_MAX_IRQ_NAME, + | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + 296 | "%s@pci:%s", name, pci_name(dev->pdev)); + | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Fixes: ada9f5d00797 ("IB/mlx5: Fix eq names to display nicely in /proc/interrupts") +Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6d4ab2e97dcfbcd748ae71761a9d8e5e41cc732c +Signed-off-by: Rahul Rameshbabu +Reviewed-by: Dragos Tatulea +Signed-off-by: Saeed Mahameed +Link: https://lore.kernel.org/r/20231114215846.5902-13-saeed@kernel.org +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 6 +++--- + drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h | 3 +++ + 2 files changed, 6 insertions(+), 3 deletions(-) + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +index d136360ac6a98..a6d3fc96e1685 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +@@ -25,7 +25,7 @@ + struct mlx5_irq { + struct atomic_notifier_head nh; + cpumask_var_t mask; +- char name[MLX5_MAX_IRQ_NAME]; ++ char name[MLX5_MAX_IRQ_FORMATTED_NAME]; + struct mlx5_irq_pool *pool; + int refcount; + u32 index; +@@ -236,8 +236,8 @@ struct mlx5_irq *mlx5_irq_alloc(struct mlx5_irq_pool *pool, int i, + else + irq_sf_set_name(pool, name, i); + ATOMIC_INIT_NOTIFIER_HEAD(&irq->nh); +- snprintf(irq->name, MLX5_MAX_IRQ_NAME, +- "%s@pci:%s", name, pci_name(dev->pdev)); ++ snprintf(irq->name, MLX5_MAX_IRQ_FORMATTED_NAME, ++ MLX5_IRQ_NAME_FORMAT_STR, name, pci_name(dev->pdev)); + err = request_irq(irq->irqn, irq_int_handler, 0, irq->name, + &irq->nh); + if (err) { +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h +index 5c7e68bee43a0..4047179307c4a 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.h +@@ -7,6 +7,9 @@ + #include + + #define MLX5_MAX_IRQ_NAME (32) ++#define MLX5_IRQ_NAME_FORMAT_STR ("%s@pci:%s") ++#define MLX5_MAX_IRQ_FORMATTED_NAME \ ++ (MLX5_MAX_IRQ_NAME + sizeof(MLX5_IRQ_NAME_FORMAT_STR)) + /* max irq_index is 2047, so four chars */ + #define MLX5_MAX_IRQ_IDX_CHARS (4) + #define MLX5_EQ_REFS_PER_IRQ (2) +-- +2.43.0 + diff --git a/queue-6.1/net-qla3xxx-fix-potential-memleak-in-ql_alloc_buffer.patch b/queue-6.1/net-qla3xxx-fix-potential-memleak-in-ql_alloc_buffer.patch new file mode 100644 index 00000000000..c35fba3dfe4 --- /dev/null +++ b/queue-6.1/net-qla3xxx-fix-potential-memleak-in-ql_alloc_buffer.patch @@ -0,0 +1,44 @@ +From 9c442a6aebc6eef0931aa962bc9c2dc82e4ac4a5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 27 Dec 2023 15:02:27 +0800 +Subject: net/qla3xxx: fix potential memleak in ql_alloc_buffer_queues + +From: Dinghao Liu + +[ Upstream commit 89f45c30172c80e55c887f32f1af8e184124577b ] + +When dma_alloc_coherent() fails, we should free qdev->lrg_buf +to prevent potential memleak. 
+ +Fixes: 1357bfcf7106 ("qla3xxx: Dynamically size the rx buffer queue based on the MTU.") +Signed-off-by: Dinghao Liu +Link: https://lore.kernel.org/r/20231227070227.10527-1-dinghao.liu@zju.edu.cn +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/qlogic/qla3xxx.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/drivers/net/ethernet/qlogic/qla3xxx.c b/drivers/net/ethernet/qlogic/qla3xxx.c +index 0d57ffcedf0c6..fc78bc959ded8 100644 +--- a/drivers/net/ethernet/qlogic/qla3xxx.c ++++ b/drivers/net/ethernet/qlogic/qla3xxx.c +@@ -2591,6 +2591,7 @@ static int ql_alloc_buffer_queues(struct ql3_adapter *qdev) + + if (qdev->lrg_buf_q_alloc_virt_addr == NULL) { + netdev_err(qdev->ndev, "lBufQ failed\n"); ++ kfree(qdev->lrg_buf); + return -ENOMEM; + } + qdev->lrg_buf_q_virt_addr = qdev->lrg_buf_q_alloc_virt_addr; +@@ -2615,6 +2616,7 @@ static int ql_alloc_buffer_queues(struct ql3_adapter *qdev) + qdev->lrg_buf_q_alloc_size, + qdev->lrg_buf_q_alloc_virt_addr, + qdev->lrg_buf_q_alloc_phy_addr); ++ kfree(qdev->lrg_buf); + return -ENOMEM; + } + +-- +2.43.0 + diff --git a/queue-6.1/net-ravb-wait-for-operating-mode-to-be-applied.patch b/queue-6.1/net-ravb-wait-for-operating-mode-to-be-applied.patch new file mode 100644 index 00000000000..09181e25851 --- /dev/null +++ b/queue-6.1/net-ravb-wait-for-operating-mode-to-be-applied.patch @@ -0,0 +1,181 @@ +From 369ba8d2f5585f0a8e7f716d5dfd513881c4a891 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 3 Jan 2024 10:13:53 +0200 +Subject: net: ravb: Wait for operating mode to be applied + +From: Claudiu Beznea + +[ Upstream commit 9039cd4c61635b2d541009a7cd5e2cc052402f28 ] + +CSR.OPS bits specify the current operating mode and (according to +documentation) they are updated by HW when the operating mode change +request is processed. To comply with this check CSR.OPS before proceeding. + +Commit introduces ravb_set_opmode() that does all the necessities for +setting the operating mode (set CCC.OPC (and CCC.GAC, CCC.CSEL, if any) and +wait for CSR.OPS) and call it where needed. This should comply with all the +HW manuals requirements as different manual variants specify that different +modes need to be checked in CSR.OPS when setting CCC.OPC. + +If gPTP active in config mode is supported and it needs to be enabled, the +CCC.GAC and CCC.CSEL needs to be configured along with CCC.OPC in the same +write access. For this, ravb_set_opmode() allows passing GAC and CSEL as +part of opmode and the function updates accordingly CCC register. + +Fixes: c156633f1353 ("Renesas Ethernet AVB driver proper") +Signed-off-by: Claudiu Beznea +Reviewed-by: Sergey Shtylyov +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/renesas/ravb_main.c | 65 +++++++++++++++--------- + 1 file changed, 42 insertions(+), 23 deletions(-) + +diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c +index 68cb5616ef991..c2c56a5289caf 100644 +--- a/drivers/net/ethernet/renesas/ravb_main.c ++++ b/drivers/net/ethernet/renesas/ravb_main.c +@@ -68,16 +68,27 @@ int ravb_wait(struct net_device *ndev, enum ravb_reg reg, u32 mask, u32 value) + return -ETIMEDOUT; + } + +-static int ravb_config(struct net_device *ndev) ++static int ravb_set_opmode(struct net_device *ndev, u32 opmode) + { ++ u32 csr_ops = 1U << (opmode & CCC_OPC); ++ u32 ccc_mask = CCC_OPC; + int error; + +- /* Set config mode */ +- ravb_modify(ndev, CCC, CCC_OPC, CCC_OPC_CONFIG); +- /* Check if the operating mode is changed to the config mode */ +- error = ravb_wait(ndev, CSR, CSR_OPS, CSR_OPS_CONFIG); +- if (error) +- netdev_err(ndev, "failed to switch device to config mode\n"); ++ /* If gPTP active in config mode is supported it needs to be configured ++ * along with CSEL and operating mode in the same access. This is a ++ * hardware limitation. ++ */ ++ if (opmode & CCC_GAC) ++ ccc_mask |= CCC_GAC | CCC_CSEL; ++ ++ /* Set operating mode */ ++ ravb_modify(ndev, CCC, ccc_mask, opmode); ++ /* Check if the operating mode is changed to the requested one */ ++ error = ravb_wait(ndev, CSR, CSR_OPS, csr_ops); ++ if (error) { ++ netdev_err(ndev, "failed to switch device to requested mode (%u)\n", ++ opmode & CCC_OPC); ++ } + + return error; + } +@@ -675,7 +686,7 @@ static int ravb_dmac_init(struct net_device *ndev) + int error; + + /* Set CONFIG mode */ +- error = ravb_config(ndev); ++ error = ravb_set_opmode(ndev, CCC_OPC_CONFIG); + if (error) + return error; + +@@ -684,9 +695,7 @@ static int ravb_dmac_init(struct net_device *ndev) + return error; + + /* Setting the control will start the AVB-DMAC process. 
*/ +- ravb_modify(ndev, CCC, CCC_OPC, CCC_OPC_OPERATION); +- +- return 0; ++ return ravb_set_opmode(ndev, CCC_OPC_OPERATION); + } + + static void ravb_get_tx_tstamp(struct net_device *ndev) +@@ -1048,7 +1057,7 @@ static int ravb_stop_dma(struct net_device *ndev) + return error; + + /* Stop AVB-DMAC process */ +- return ravb_config(ndev); ++ return ravb_set_opmode(ndev, CCC_OPC_CONFIG); + } + + /* E-MAC interrupt handler */ +@@ -2576,21 +2585,25 @@ static int ravb_set_gti(struct net_device *ndev) + return 0; + } + +-static void ravb_set_config_mode(struct net_device *ndev) ++static int ravb_set_config_mode(struct net_device *ndev) + { + struct ravb_private *priv = netdev_priv(ndev); + const struct ravb_hw_info *info = priv->info; ++ int error; + + if (info->gptp) { +- ravb_modify(ndev, CCC, CCC_OPC, CCC_OPC_CONFIG); ++ error = ravb_set_opmode(ndev, CCC_OPC_CONFIG); ++ if (error) ++ return error; + /* Set CSEL value */ + ravb_modify(ndev, CCC, CCC_CSEL, CCC_CSEL_HPB); + } else if (info->ccc_gac) { +- ravb_modify(ndev, CCC, CCC_OPC, CCC_OPC_CONFIG | +- CCC_GAC | CCC_CSEL_HPB); ++ error = ravb_set_opmode(ndev, CCC_OPC_CONFIG | CCC_GAC | CCC_CSEL_HPB); + } else { +- ravb_modify(ndev, CCC, CCC_OPC, CCC_OPC_CONFIG); ++ error = ravb_set_opmode(ndev, CCC_OPC_CONFIG); + } ++ ++ return error; + } + + /* Set tx and rx clock internal delay modes */ +@@ -2810,7 +2823,9 @@ static int ravb_probe(struct platform_device *pdev) + ndev->ethtool_ops = &ravb_ethtool_ops; + + /* Set AVB config mode */ +- ravb_set_config_mode(ndev); ++ error = ravb_set_config_mode(ndev); ++ if (error) ++ goto out_disable_gptp_clk; + + if (info->gptp || info->ccc_gac) { + /* Set GTI value */ +@@ -2933,8 +2948,7 @@ static int ravb_remove(struct platform_device *pdev) + dma_free_coherent(ndev->dev.parent, priv->desc_bat_size, priv->desc_bat, + priv->desc_bat_dma); + +- /* Set reset mode */ +- ravb_write(ndev, CCC_OPC_RESET, CCC); ++ ravb_set_opmode(ndev, CCC_OPC_RESET); + + clk_disable_unprepare(priv->gptp_clk); + clk_disable_unprepare(priv->refclk); +@@ -3018,8 +3032,11 @@ static int __maybe_unused ravb_resume(struct device *dev) + int ret = 0; + + /* If WoL is enabled set reset mode to rearm the WoL logic */ +- if (priv->wol_enabled) +- ravb_write(ndev, CCC_OPC_RESET, CCC); ++ if (priv->wol_enabled) { ++ ret = ravb_set_opmode(ndev, CCC_OPC_RESET); ++ if (ret) ++ return ret; ++ } + + /* All register have been reset to default values. 
+ * Restore all registers which where setup at probe time and +@@ -3027,7 +3044,9 @@ static int __maybe_unused ravb_resume(struct device *dev) + */ + + /* Set AVB config mode */ +- ravb_set_config_mode(ndev); ++ ret = ravb_set_config_mode(ndev); ++ if (ret) ++ return ret; + + if (info->gptp || info->ccc_gac) { + /* Set GTI value */ +-- +2.43.0 + diff --git a/queue-6.1/net-save-and-restore-msg_namelen-in-sock_sendmsg.patch b/queue-6.1/net-save-and-restore-msg_namelen-in-sock_sendmsg.patch new file mode 100644 index 00000000000..49b044712a8 --- /dev/null +++ b/queue-6.1/net-save-and-restore-msg_namelen-in-sock_sendmsg.patch @@ -0,0 +1,55 @@ +From 1c052bf518018a0db7e7a4b8e3f63445d941d7b5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 21 Dec 2023 09:12:30 -0400 +Subject: net: Save and restore msg_namelen in sock_sendmsg + +From: Marc Dionne + +[ Upstream commit 01b2885d9415152bcb12ff1f7788f500a74ea0ed ] + +Commit 86a7e0b69bd5 ("net: prevent rewrite of msg_name in +sock_sendmsg()") made sock_sendmsg save the incoming msg_name pointer +and restore it before returning, to insulate the caller against +msg_name being changed by the called code. If the address length +was also changed however, we may return with an inconsistent structure +where the length doesn't match the address, and attempts to reuse it may +lead to lost packets. + +For example, a kernel that doesn't have commit 1c5950fc6fe9 ("udp6: fix +potential access to stale information") will replace a v4 mapped address +with its ipv4 equivalent, and shorten namelen accordingly from 28 to 16. +If the caller attempts to reuse the resulting msg structure, it will have +the original ipv6 (v4 mapped) address but an incorrect v4 length. + +Fixes: 86a7e0b69bd5 ("net: prevent rewrite of msg_name in sock_sendmsg()") +Signed-off-by: Marc Dionne +Reviewed-by: Willem de Bruijn +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + net/socket.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/net/socket.c b/net/socket.c +index 07470724e7358..0104617b440dc 100644 +--- a/net/socket.c ++++ b/net/socket.c +@@ -740,6 +740,7 @@ int sock_sendmsg(struct socket *sock, struct msghdr *msg) + { + struct sockaddr_storage *save_addr = (struct sockaddr_storage *)msg->msg_name; + struct sockaddr_storage address; ++ int save_len = msg->msg_namelen; + int ret; + + if (msg->msg_name) { +@@ -749,6 +750,7 @@ int sock_sendmsg(struct socket *sock, struct msghdr *msg) + + ret = __sock_sendmsg(sock, msg); + msg->msg_name = save_addr; ++ msg->msg_namelen = save_len; + + return ret; + } +-- +2.43.0 + diff --git a/queue-6.1/net-sched-act_ct-fix-promotion-of-offloaded-unreplie.patch b/queue-6.1/net-sched-act_ct-fix-promotion-of-offloaded-unreplie.patch new file mode 100644 index 00000000000..19b468d44de --- /dev/null +++ b/queue-6.1/net-sched-act_ct-fix-promotion-of-offloaded-unreplie.patch @@ -0,0 +1,158 @@ +From 14e25d537fb93353328283053064f4589dcff379 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 9 Jun 2023 15:22:59 +0300 +Subject: net/sched: act_ct: Fix promotion of offloaded unreplied tuple + +From: Paul Blakey + +[ Upstream commit 41f2c7c342d3adb1c4dd5f2e3dd831adff16a669 ] + +Currently UNREPLIED and UNASSURED connections are added to the nf flow +table. This causes the following connection packets to be processed +by the flow table which then skips conntrack_in(), and thus such the +connections will remain UNREPLIED and UNASSURED even if reply traffic +is then seen. 
Even still, the unoffloaded reply packets are the ones +triggering hardware update from new to established state, and if +there aren't any to triger an update and/or previous update was +missed, hardware can get out of sync with sw and still mark +packets as new. + +Fix the above by: +1) Not skipping conntrack_in() for UNASSURED packets, but still + refresh for hardware, as before the cited patch. +2) Try and force a refresh by reply-direction packets that update + the hardware rules from new to established state. +3) Remove any bidirectional flows that didn't failed to update in + hardware for re-insertion as bidrectional once any new packet + arrives. + +Fixes: 6a9bad0069cf ("net/sched: act_ct: offload UDP NEW connections") +Co-developed-by: Vlad Buslov +Signed-off-by: Vlad Buslov +Signed-off-by: Paul Blakey +Reviewed-by: Florian Westphal +Link: https://lore.kernel.org/r/1686313379-117663-1-git-send-email-paulb@nvidia.com +Signed-off-by: Paolo Abeni +Stable-dep-of: 125f1c7f26ff ("net/sched: act_ct: Take per-cb reference to tcf_ct_flow_table") +Signed-off-by: Sasha Levin +--- + include/net/netfilter/nf_flow_table.h | 2 +- + net/netfilter/nf_flow_table_core.c | 13 ++++++++++--- + net/netfilter/nf_flow_table_ip.c | 4 ++-- + net/sched/act_ct.c | 9 ++++++++- + 4 files changed, 21 insertions(+), 7 deletions(-) + +diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h +index ebb28ec5b6faf..f37f9f34430c1 100644 +--- a/include/net/netfilter/nf_flow_table.h ++++ b/include/net/netfilter/nf_flow_table.h +@@ -268,7 +268,7 @@ int flow_offload_route_init(struct flow_offload *flow, + + int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow); + void flow_offload_refresh(struct nf_flowtable *flow_table, +- struct flow_offload *flow); ++ struct flow_offload *flow, bool force); + + struct flow_offload_tuple_rhash *flow_offload_lookup(struct nf_flowtable *flow_table, + struct flow_offload_tuple *tuple); +diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c +index 81c26a96c30bb..baddb93a5e8cf 100644 +--- a/net/netfilter/nf_flow_table_core.c ++++ b/net/netfilter/nf_flow_table_core.c +@@ -314,12 +314,12 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) + EXPORT_SYMBOL_GPL(flow_offload_add); + + void flow_offload_refresh(struct nf_flowtable *flow_table, +- struct flow_offload *flow) ++ struct flow_offload *flow, bool force) + { + u32 timeout; + + timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow); +- if (timeout - READ_ONCE(flow->timeout) > HZ) ++ if (force || timeout - READ_ONCE(flow->timeout) > HZ) + WRITE_ONCE(flow->timeout, timeout); + else + return; +@@ -331,6 +331,12 @@ void flow_offload_refresh(struct nf_flowtable *flow_table, + } + EXPORT_SYMBOL_GPL(flow_offload_refresh); + ++static bool nf_flow_is_outdated(const struct flow_offload *flow) ++{ ++ return test_bit(IPS_SEEN_REPLY_BIT, &flow->ct->status) && ++ !test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags); ++} ++ + static inline bool nf_flow_has_expired(const struct flow_offload *flow) + { + return nf_flow_timeout_delta(flow->timeout) <= 0; +@@ -420,7 +426,8 @@ static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table, + struct flow_offload *flow, void *data) + { + if (nf_flow_has_expired(flow) || +- nf_ct_is_dying(flow->ct)) ++ nf_ct_is_dying(flow->ct) || ++ nf_flow_is_outdated(flow)) + flow_offload_teardown(flow); + + if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) { +diff --git 
a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c +index b350fe9d00b0b..6feaac9ab05c8 100644 +--- a/net/netfilter/nf_flow_table_ip.c ++++ b/net/netfilter/nf_flow_table_ip.c +@@ -384,7 +384,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, + if (skb_try_make_writable(skb, thoff + hdrsize)) + return NF_DROP; + +- flow_offload_refresh(flow_table, flow); ++ flow_offload_refresh(flow_table, flow, false); + + nf_flow_encap_pop(skb, tuplehash); + thoff -= offset; +@@ -646,7 +646,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, + if (skb_try_make_writable(skb, thoff + hdrsize)) + return NF_DROP; + +- flow_offload_refresh(flow_table, flow); ++ flow_offload_refresh(flow_table, flow, false); + + nf_flow_encap_pop(skb, tuplehash); + +diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c +index 3c063065f125f..b80a58d3bf0f3 100644 +--- a/net/sched/act_ct.c ++++ b/net/sched/act_ct.c +@@ -606,6 +606,7 @@ static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p, + struct flow_offload_tuple tuple = {}; + enum ip_conntrack_info ctinfo; + struct tcphdr *tcph = NULL; ++ bool force_refresh = false; + struct flow_offload *flow; + struct nf_conn *ct; + u8 dir; +@@ -643,6 +644,7 @@ static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p, + * established state, then don't refresh. + */ + return false; ++ force_refresh = true; + } + + if (tcph && (unlikely(tcph->fin || tcph->rst))) { +@@ -656,7 +658,12 @@ static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p, + else + ctinfo = IP_CT_ESTABLISHED_REPLY; + +- flow_offload_refresh(nf_ft, flow); ++ flow_offload_refresh(nf_ft, flow, force_refresh); ++ if (!test_bit(IPS_ASSURED_BIT, &ct->status)) { ++ /* Process this flow in SW to allow promoting to ASSURED */ ++ return false; ++ } ++ + nf_conntrack_get(&ct->ct_general); + nf_ct_set(skb, ct, ctinfo); + if (nf_ft->flags & NF_FLOWTABLE_COUNTER) +-- +2.43.0 + diff --git a/queue-6.1/net-sched-act_ct-offload-udp-new-connections.patch b/queue-6.1/net-sched-act_ct-offload-udp-new-connections.patch new file mode 100644 index 00000000000..86b6ea9a94e --- /dev/null +++ b/queue-6.1/net-sched-act_ct-offload-udp-new-connections.patch @@ -0,0 +1,157 @@ +From da420921aeb41458470ef982be25475a762e01c9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 1 Feb 2023 17:30:59 +0100 +Subject: net/sched: act_ct: offload UDP NEW connections + +From: Vlad Buslov + +[ Upstream commit 6a9bad0069cf306f3df6ac53cf02438d4e15f296 ] + +Modify the offload algorithm of UDP connections to the following: + +- Offload NEW connection as unidirectional. + +- When connection state changes to ESTABLISHED also update the hardware +flow. However, in order to prevent act_ct from spamming offload add wq for +every packet coming in reply direction in this state verify whether +connection has already been updated to ESTABLISHED in the drivers. If that +it the case, then skip flow_table and let conntrack handle such packets +which will also allow conntrack to potentially promote the connection to +ASSURED. + +- When connection state changes to ASSURED set the flow_table flow +NF_FLOW_HW_BIDIRECTIONAL flag which will cause refresh mechanism to offload +the reply direction. + +All other protocols have their offload algorithm preserved and are always +offloaded as bidirectional. + +Note that this change tries to minimize the load on flow_table add +workqueue. 
First, it tracks the last ctinfo that was offloaded by using new +flow 'NF_FLOW_HW_ESTABLISHED' flag and doesn't schedule the refresh for +reply direction packets when the offloads have already been updated with +current ctinfo. Second, when 'add' task executes on workqueue it always +update the offload with current flow state (by checking 'bidirectional' +flow flag and obtaining actual ctinfo/cookie through meta action instead of +caching any of these from the moment of scheduling the 'add' work) +preventing the need from scheduling more updates if state changed +concurrently while the 'add' work was pending on workqueue. + +Signed-off-by: Vlad Buslov +Signed-off-by: David S. Miller +Stable-dep-of: 125f1c7f26ff ("net/sched: act_ct: Take per-cb reference to tcf_ct_flow_table") +Signed-off-by: Sasha Levin +--- + net/sched/act_ct.c | 51 +++++++++++++++++++++++++++++++++++----------- + 1 file changed, 39 insertions(+), 12 deletions(-) + +diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c +index 86d269724485a..3c063065f125f 100644 +--- a/net/sched/act_ct.c ++++ b/net/sched/act_ct.c +@@ -365,7 +365,7 @@ static void tcf_ct_flow_tc_ifidx(struct flow_offload *entry, + + static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft, + struct nf_conn *ct, +- bool tcp) ++ bool tcp, bool bidirectional) + { + struct nf_conn_act_ct_ext *act_ct_ext; + struct flow_offload *entry; +@@ -384,6 +384,8 @@ static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft, + ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; + ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; + } ++ if (bidirectional) ++ __set_bit(NF_FLOW_HW_BIDIRECTIONAL, &entry->flags); + + act_ct_ext = nf_conn_act_ct_ext_find(ct); + if (act_ct_ext) { +@@ -407,26 +409,34 @@ static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo) + { +- bool tcp = false; +- +- if ((ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) || +- !test_bit(IPS_ASSURED_BIT, &ct->status)) +- return; ++ bool tcp = false, bidirectional = true; + + switch (nf_ct_protonum(ct)) { + case IPPROTO_TCP: +- tcp = true; +- if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) ++ if ((ctinfo != IP_CT_ESTABLISHED && ++ ctinfo != IP_CT_ESTABLISHED_REPLY) || ++ !test_bit(IPS_ASSURED_BIT, &ct->status) || ++ ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) + return; ++ ++ tcp = true; + break; + case IPPROTO_UDP: ++ if (!nf_ct_is_confirmed(ct)) ++ return; ++ if (!test_bit(IPS_ASSURED_BIT, &ct->status)) ++ bidirectional = false; + break; + #ifdef CONFIG_NF_CT_PROTO_GRE + case IPPROTO_GRE: { + struct nf_conntrack_tuple *tuple; + +- if (ct->status & IPS_NAT_MASK) ++ if ((ctinfo != IP_CT_ESTABLISHED && ++ ctinfo != IP_CT_ESTABLISHED_REPLY) || ++ !test_bit(IPS_ASSURED_BIT, &ct->status) || ++ ct->status & IPS_NAT_MASK) + return; ++ + tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + /* No support for GRE v1 */ + if (tuple->src.u.gre.key || tuple->dst.u.gre.key) +@@ -442,7 +452,7 @@ static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft, + ct->status & IPS_SEQ_ADJUST) + return; + +- tcf_ct_flow_table_add(ct_ft, ct, tcp); ++ tcf_ct_flow_table_add(ct_ft, ct, tcp, bidirectional); + } + + static bool +@@ -621,13 +631,30 @@ static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p, + flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + ct = flow->ct; + ++ if (dir == FLOW_OFFLOAD_DIR_REPLY && ++ !test_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags)) { ++ 
/* Only offload reply direction after connection became ++ * assured. ++ */ ++ if (test_bit(IPS_ASSURED_BIT, &ct->status)) ++ set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags); ++ else if (test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags)) ++ /* If flow_table flow has already been updated to the ++ * established state, then don't refresh. ++ */ ++ return false; ++ } ++ + if (tcph && (unlikely(tcph->fin || tcph->rst))) { + flow_offload_teardown(flow); + return false; + } + +- ctinfo = dir == FLOW_OFFLOAD_DIR_ORIGINAL ? IP_CT_ESTABLISHED : +- IP_CT_ESTABLISHED_REPLY; ++ if (dir == FLOW_OFFLOAD_DIR_ORIGINAL) ++ ctinfo = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ? ++ IP_CT_ESTABLISHED : IP_CT_NEW; ++ else ++ ctinfo = IP_CT_ESTABLISHED_REPLY; + + flow_offload_refresh(nf_ft, flow); + nf_conntrack_get(&ct->ct_general); +-- +2.43.0 + diff --git a/queue-6.1/net-sched-act_ct-take-per-cb-reference-to-tcf_ct_flo.patch b/queue-6.1/net-sched-act_ct-take-per-cb-reference-to-tcf_ct_flo.patch new file mode 100644 index 00000000000..cecac2bfe79 --- /dev/null +++ b/queue-6.1/net-sched-act_ct-take-per-cb-reference-to-tcf_ct_flo.patch @@ -0,0 +1,195 @@ +From baf1515c7f3f16801fb896b07d179e5aa3fe924b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 5 Dec 2023 18:25:54 +0100 +Subject: net/sched: act_ct: Take per-cb reference to tcf_ct_flow_table + +From: Vlad Buslov + +[ Upstream commit 125f1c7f26ffcdbf96177abe75b70c1a6ceb17bc ] + +The referenced change added custom cleanup code to act_ct to delete any +callbacks registered on the parent block when deleting the +tcf_ct_flow_table instance. However, the underlying issue is that the +drivers don't obtain the reference to the tcf_ct_flow_table instance when +registering callbacks which means that not only driver callbacks may still +be on the table when deleting it but also that the driver can still have +pointers to its internal nf_flowtable and can use it concurrently which +results either warning in netfilter[0] or use-after-free. + +Fix the issue by taking a reference to the underlying struct +tcf_ct_flow_table instance when registering the callback and release the +reference when unregistering. Expose new API required for such reference +counting by adding two new callbacks to nf_flowtable_type and implementing +them for act_ct flowtable_ct type. This fixes the issue by extending the +lifetime of nf_flowtable until all users have unregistered. 
+ +[0]: +[106170.938634] ------------[ cut here ]------------ +[106170.939111] WARNING: CPU: 21 PID: 3688 at include/net/netfilter/nf_flow_table.h:262 mlx5_tc_ct_del_ft_cb+0x267/0x2b0 [mlx5_core] +[106170.940108] Modules linked in: act_ct nf_flow_table act_mirred act_skbedit act_tunnel_key vxlan cls_matchall nfnetlink_cttimeout act_gact cls_flower sch_ingress mlx5_vdpa vringh vhost_iotlb vdpa bonding openvswitch nsh rpcrdma rdma_ucm +ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm mlx5_ib ib_uverbs ib_core xt_MASQUERADE nf_conntrack_netlink nfnetlink iptable_nat xt_addrtype xt_conntrack nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_regis +try overlay mlx5_core +[106170.943496] CPU: 21 PID: 3688 Comm: kworker/u48:0 Not tainted 6.6.0-rc7_for_upstream_min_debug_2023_11_01_13_02 #1 +[106170.944361] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 +[106170.945292] Workqueue: mlx5e mlx5e_rep_neigh_update [mlx5_core] +[106170.945846] RIP: 0010:mlx5_tc_ct_del_ft_cb+0x267/0x2b0 [mlx5_core] +[106170.946413] Code: 89 ef 48 83 05 71 a4 14 00 01 e8 f4 06 04 e1 48 83 05 6c a4 14 00 01 48 83 c4 28 5b 5d 41 5c 41 5d c3 48 83 05 d1 8b 14 00 01 <0f> 0b 48 83 05 d7 8b 14 00 01 e9 96 fe ff ff 48 83 05 a2 90 14 00 +[106170.947924] RSP: 0018:ffff88813ff0fcb8 EFLAGS: 00010202 +[106170.948397] RAX: 0000000000000000 RBX: ffff88811eabac40 RCX: ffff88811eabad48 +[106170.949040] RDX: ffff88811eab8000 RSI: ffffffffa02cd560 RDI: 0000000000000000 +[106170.949679] RBP: ffff88811eab8000 R08: 0000000000000001 R09: ffffffffa0229700 +[106170.950317] R10: ffff888103538fc0 R11: 0000000000000001 R12: ffff88811eabad58 +[106170.950969] R13: ffff888110c01c00 R14: ffff888106b40000 R15: 0000000000000000 +[106170.951616] FS: 0000000000000000(0000) GS:ffff88885fd40000(0000) knlGS:0000000000000000 +[106170.952329] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[106170.952834] CR2: 00007f1cefd28cb0 CR3: 000000012181b006 CR4: 0000000000370ea0 +[106170.953482] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +[106170.954121] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +[106170.954766] Call Trace: +[106170.955057] +[106170.955315] ? __warn+0x79/0x120 +[106170.955648] ? mlx5_tc_ct_del_ft_cb+0x267/0x2b0 [mlx5_core] +[106170.956172] ? report_bug+0x17c/0x190 +[106170.956537] ? handle_bug+0x3c/0x60 +[106170.956891] ? exc_invalid_op+0x14/0x70 +[106170.957264] ? asm_exc_invalid_op+0x16/0x20 +[106170.957666] ? mlx5_del_flow_rules+0x10/0x310 [mlx5_core] +[106170.958172] ? mlx5_tc_ct_block_flow_offload_add+0x1240/0x1240 [mlx5_core] +[106170.958788] ? mlx5_tc_ct_del_ft_cb+0x267/0x2b0 [mlx5_core] +[106170.959339] ? mlx5_tc_ct_del_ft_cb+0xc6/0x2b0 [mlx5_core] +[106170.959854] ? mapping_remove+0x154/0x1d0 [mlx5_core] +[106170.960342] ? mlx5e_tc_action_miss_mapping_put+0x4f/0x80 [mlx5_core] +[106170.960927] mlx5_tc_ct_delete_flow+0x76/0xc0 [mlx5_core] +[106170.961441] mlx5_free_flow_attr_actions+0x13b/0x220 [mlx5_core] +[106170.962001] mlx5e_tc_del_fdb_flow+0x22c/0x3b0 [mlx5_core] +[106170.962524] mlx5e_tc_del_flow+0x95/0x3c0 [mlx5_core] +[106170.963034] mlx5e_flow_put+0x73/0xe0 [mlx5_core] +[106170.963506] mlx5e_put_flow_list+0x38/0x70 [mlx5_core] +[106170.964002] mlx5e_rep_update_flows+0xec/0x290 [mlx5_core] +[106170.964525] mlx5e_rep_neigh_update+0x1da/0x310 [mlx5_core] +[106170.965056] process_one_work+0x13a/0x2c0 +[106170.965443] worker_thread+0x2e5/0x3f0 +[106170.965808] ? 
rescuer_thread+0x410/0x410 +[106170.966192] kthread+0xc6/0xf0 +[106170.966515] ? kthread_complete_and_exit+0x20/0x20 +[106170.966970] ret_from_fork+0x2d/0x50 +[106170.967332] ? kthread_complete_and_exit+0x20/0x20 +[106170.967774] ret_from_fork_asm+0x11/0x20 +[106170.970466] +[106170.970726] ---[ end trace 0000000000000000 ]--- + +Fixes: 77ac5e40c44e ("net/sched: act_ct: remove and free nf_table callbacks") +Signed-off-by: Vlad Buslov +Reviewed-by: Paul Blakey +Acked-by: Pablo Neira Ayuso +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + include/net/netfilter/nf_flow_table.h | 10 ++++++++ + net/sched/act_ct.c | 34 ++++++++++++++++++++++----- + 2 files changed, 38 insertions(+), 6 deletions(-) + +diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h +index 0b163ead95c9f..dde4dd9c4012c 100644 +--- a/include/net/netfilter/nf_flow_table.h ++++ b/include/net/netfilter/nf_flow_table.h +@@ -62,6 +62,8 @@ struct nf_flowtable_type { + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule); + void (*free)(struct nf_flowtable *ft); ++ void (*get)(struct nf_flowtable *ft); ++ void (*put)(struct nf_flowtable *ft); + nf_hookfn *hook; + struct module *owner; + }; +@@ -240,6 +242,11 @@ nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table, + } + + list_add_tail(&block_cb->list, &block->cb_list); ++ up_write(&flow_table->flow_block_lock); ++ ++ if (flow_table->type->get) ++ flow_table->type->get(flow_table); ++ return 0; + + unlock: + up_write(&flow_table->flow_block_lock); +@@ -262,6 +269,9 @@ nf_flow_table_offload_del_cb(struct nf_flowtable *flow_table, + WARN_ON(true); + } + up_write(&flow_table->flow_block_lock); ++ ++ if (flow_table->type->put) ++ flow_table->type->put(flow_table); + } + + int flow_offload_route_init(struct flow_offload *flow, +diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c +index 4d34474f2cc0e..faf798133059b 100644 +--- a/net/sched/act_ct.c ++++ b/net/sched/act_ct.c +@@ -280,9 +280,31 @@ static bool tcf_ct_flow_is_outdated(const struct flow_offload *flow) + !test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags); + } + ++static void tcf_ct_flow_table_get_ref(struct tcf_ct_flow_table *ct_ft); ++ ++static void tcf_ct_nf_get(struct nf_flowtable *ft) ++{ ++ struct tcf_ct_flow_table *ct_ft = ++ container_of(ft, struct tcf_ct_flow_table, nf_ft); ++ ++ tcf_ct_flow_table_get_ref(ct_ft); ++} ++ ++static void tcf_ct_flow_table_put(struct tcf_ct_flow_table *ct_ft); ++ ++static void tcf_ct_nf_put(struct nf_flowtable *ft) ++{ ++ struct tcf_ct_flow_table *ct_ft = ++ container_of(ft, struct tcf_ct_flow_table, nf_ft); ++ ++ tcf_ct_flow_table_put(ct_ft); ++} ++ + static struct nf_flowtable_type flowtable_ct = { + .gc = tcf_ct_flow_is_outdated, + .action = tcf_ct_flow_table_fill_actions, ++ .get = tcf_ct_nf_get, ++ .put = tcf_ct_nf_put, + .owner = THIS_MODULE, + }; + +@@ -331,9 +353,13 @@ static int tcf_ct_flow_table_get(struct net *net, struct tcf_ct_params *params) + return err; + } + ++static void tcf_ct_flow_table_get_ref(struct tcf_ct_flow_table *ct_ft) ++{ ++ refcount_inc(&ct_ft->ref); ++} ++ + static void tcf_ct_flow_table_cleanup_work(struct work_struct *work) + { +- struct flow_block_cb *block_cb, *tmp_cb; + struct tcf_ct_flow_table *ct_ft; + struct flow_block *block; + +@@ -341,13 +367,9 @@ static void tcf_ct_flow_table_cleanup_work(struct work_struct *work) + rwork); + nf_flow_table_free(&ct_ft->nf_ft); + +- /* Remove any remaining callbacks before cleanup */ + block = &ct_ft->nf_ft.flow_block; + 
down_write(&ct_ft->nf_ft.flow_block_lock); +- list_for_each_entry_safe(block_cb, tmp_cb, &block->cb_list, list) { +- list_del(&block_cb->list); +- flow_block_cb_free(block_cb); +- } ++ WARN_ON(!list_empty(&block->cb_list)); + up_write(&ct_ft->nf_ft.flow_block_lock); + kfree(ct_ft); + +-- +2.43.0 + diff --git a/queue-6.1/net-sched-call-tcf_ct_params_free-to-free-params-in-.patch b/queue-6.1/net-sched-call-tcf_ct_params_free-to-free-params-in-.patch new file mode 100644 index 00000000000..827f58b00a7 --- /dev/null +++ b/queue-6.1/net-sched-call-tcf_ct_params_free-to-free-params-in-.patch @@ -0,0 +1,112 @@ +From 89293e3bc421a92dfd4935a5bec34d30ab89aba1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 6 Nov 2022 15:34:16 -0500 +Subject: net: sched: call tcf_ct_params_free to free params in tcf_ct_init + +From: Xin Long + +[ Upstream commit 1913894100ca53205f2d56091cb34b8eba1de217 ] + +This patch is to make the err path simple by calling tcf_ct_params_free(), +so that it won't cause problems when more members are added into param and +need freeing on the err path. + +Acked-by: Marcelo Ricardo Leitner +Signed-off-by: Xin Long +Signed-off-by: Paolo Abeni +Stable-dep-of: 125f1c7f26ff ("net/sched: act_ct: Take per-cb reference to tcf_ct_flow_table") +Signed-off-by: Sasha Levin +--- + net/sched/act_ct.c | 35 ++++++++++++++++++----------------- + 1 file changed, 18 insertions(+), 17 deletions(-) + +diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c +index 4c7f7861ea967..478cedc29b737 100644 +--- a/net/sched/act_ct.c ++++ b/net/sched/act_ct.c +@@ -345,11 +345,9 @@ static void tcf_ct_flow_table_cleanup_work(struct work_struct *work) + module_put(THIS_MODULE); + } + +-static void tcf_ct_flow_table_put(struct tcf_ct_params *params) ++static void tcf_ct_flow_table_put(struct tcf_ct_flow_table *ct_ft) + { +- struct tcf_ct_flow_table *ct_ft = params->ct_ft; +- +- if (refcount_dec_and_test(¶ms->ct_ft->ref)) { ++ if (refcount_dec_and_test(&ct_ft->ref)) { + rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params); + INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work); + queue_rcu_work(act_ct_wq, &ct_ft->rwork); +@@ -832,18 +830,23 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb, + return err; + } + +-static void tcf_ct_params_free(struct rcu_head *head) ++static void tcf_ct_params_free(struct tcf_ct_params *params) + { +- struct tcf_ct_params *params = container_of(head, +- struct tcf_ct_params, rcu); +- +- tcf_ct_flow_table_put(params); +- ++ if (params->ct_ft) ++ tcf_ct_flow_table_put(params->ct_ft); + if (params->tmpl) + nf_ct_put(params->tmpl); + kfree(params); + } + ++static void tcf_ct_params_free_rcu(struct rcu_head *head) ++{ ++ struct tcf_ct_params *params; ++ ++ params = container_of(head, struct tcf_ct_params, rcu); ++ tcf_ct_params_free(params); ++} ++ + #if IS_ENABLED(CONFIG_NF_NAT) + /* Modelled after nf_nat_ipv[46]_fn(). + * range is only used for new, uninitialized NAT state. 
+@@ -1390,7 +1393,7 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, + + err = tcf_ct_flow_table_get(net, params); + if (err) +- goto cleanup_params; ++ goto cleanup; + + spin_lock_bh(&c->tcf_lock); + goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); +@@ -1401,17 +1404,15 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, + if (goto_ch) + tcf_chain_put_by_act(goto_ch); + if (params) +- call_rcu(¶ms->rcu, tcf_ct_params_free); ++ call_rcu(¶ms->rcu, tcf_ct_params_free_rcu); + + return res; + +-cleanup_params: +- if (params->tmpl) +- nf_ct_put(params->tmpl); + cleanup: + if (goto_ch) + tcf_chain_put_by_act(goto_ch); +- kfree(params); ++ if (params) ++ tcf_ct_params_free(params); + tcf_idr_release(*a, bind); + return err; + } +@@ -1423,7 +1424,7 @@ static void tcf_ct_cleanup(struct tc_action *a) + + params = rcu_dereference_protected(c->params, 1); + if (params) +- call_rcu(¶ms->rcu, tcf_ct_params_free); ++ call_rcu(¶ms->rcu, tcf_ct_params_free_rcu); + } + + static int tcf_ct_dump_key_val(struct sk_buff *skb, +-- +2.43.0 + diff --git a/queue-6.1/net-sched-em_text-fix-possible-memory-leak-in-em_tex.patch b/queue-6.1/net-sched-em_text-fix-possible-memory-leak-in-em_tex.patch new file mode 100644 index 00000000000..b4907c2d3e7 --- /dev/null +++ b/queue-6.1/net-sched-em_text-fix-possible-memory-leak-in-em_tex.patch @@ -0,0 +1,40 @@ +From 93c23c768858a1ae196116f045b4c1ff98e4e843 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 21 Dec 2023 10:25:31 +0800 +Subject: net: sched: em_text: fix possible memory leak in em_text_destroy() + +From: Hangyu Hua + +[ Upstream commit 8fcb0382af6f1ef50936f1be05b8149eb2f88496 ] + +m->data needs to be freed when em_text_destroy is called. + +Fixes: d675c989ed2d ("[PKT_SCHED]: Packet classification based on textsearch (ematch)") +Acked-by: Jamal Hadi Salim +Signed-off-by: Hangyu Hua +Reviewed-by: Simon Horman +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + net/sched/em_text.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/net/sched/em_text.c b/net/sched/em_text.c +index 6f3c1fb2fb44c..f176afb70559e 100644 +--- a/net/sched/em_text.c ++++ b/net/sched/em_text.c +@@ -97,8 +97,10 @@ static int em_text_change(struct net *net, void *data, int len, + + static void em_text_destroy(struct tcf_ematch *m) + { +- if (EM_TEXT_PRIV(m) && EM_TEXT_PRIV(m)->config) ++ if (EM_TEXT_PRIV(m) && EM_TEXT_PRIV(m)->config) { + textsearch_destroy(EM_TEXT_PRIV(m)->config); ++ kfree(EM_TEXT_PRIV(m)); ++ } + } + + static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m) +-- +2.43.0 + diff --git a/queue-6.1/net-smc-fix-invalid-link-access-in-dumping-smc-r-con.patch b/queue-6.1/net-smc-fix-invalid-link-access-in-dumping-smc-r-con.patch new file mode 100644 index 00000000000..98bc1545022 --- /dev/null +++ b/queue-6.1/net-smc-fix-invalid-link-access-in-dumping-smc-r-con.patch @@ -0,0 +1,91 @@ +From a55dfee1f458e66ebb434d93453283be3b49b991 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 27 Dec 2023 15:40:35 +0800 +Subject: net/smc: fix invalid link access in dumping SMC-R connections + +From: Wen Gu + +[ Upstream commit 9dbe086c69b8902c85cece394760ac212e9e4ccc ] + +A crash was found when dumping SMC-R connections. It can be reproduced +by following steps: + +- environment: two RNICs on both sides. +- run SMC-R between two sides, now a SMC_LGR_SYMMETRIC type link group + will be created. +- set the first RNIC down on either side and link group will turn to + SMC_LGR_ASYMMETRIC_LOCAL then. 
+- run 'smcss -R' and the crash will be triggered. + + BUG: kernel NULL pointer dereference, address: 0000000000000010 + #PF: supervisor read access in kernel mode + #PF: error_code(0x0000) - not-present page + PGD 8000000101fdd067 P4D 8000000101fdd067 PUD 10ce46067 PMD 0 + Oops: 0000 [#1] PREEMPT SMP PTI + CPU: 3 PID: 1810 Comm: smcss Kdump: loaded Tainted: G W E 6.7.0-rc6+ #51 + RIP: 0010:__smc_diag_dump.constprop.0+0x36e/0x620 [smc_diag] + Call Trace: + + ? __die+0x24/0x70 + ? page_fault_oops+0x66/0x150 + ? exc_page_fault+0x69/0x140 + ? asm_exc_page_fault+0x26/0x30 + ? __smc_diag_dump.constprop.0+0x36e/0x620 [smc_diag] + smc_diag_dump_proto+0xd0/0xf0 [smc_diag] + smc_diag_dump+0x26/0x60 [smc_diag] + netlink_dump+0x19f/0x320 + __netlink_dump_start+0x1dc/0x300 + smc_diag_handler_dump+0x6a/0x80 [smc_diag] + ? __pfx_smc_diag_dump+0x10/0x10 [smc_diag] + sock_diag_rcv_msg+0x121/0x140 + ? __pfx_sock_diag_rcv_msg+0x10/0x10 + netlink_rcv_skb+0x5a/0x110 + sock_diag_rcv+0x28/0x40 + netlink_unicast+0x22a/0x330 + netlink_sendmsg+0x240/0x4a0 + __sock_sendmsg+0xb0/0xc0 + ____sys_sendmsg+0x24e/0x300 + ? copy_msghdr_from_user+0x62/0x80 + ___sys_sendmsg+0x7c/0xd0 + ? __do_fault+0x34/0x1a0 + ? do_read_fault+0x5f/0x100 + ? do_fault+0xb0/0x110 + __sys_sendmsg+0x4d/0x80 + do_syscall_64+0x45/0xf0 + entry_SYSCALL_64_after_hwframe+0x6e/0x76 + +When the first RNIC is set down, the lgr->lnk[0] will be cleared and an +asymmetric link will be allocated in lgr->link[SMC_LINKS_PER_LGR_MAX - 1] +by smc_llc_alloc_alt_link(). Then when we try to dump SMC-R connections +in __smc_diag_dump(), the invalid lgr->lnk[0] will be accessed, resulting +in this issue. So fix it by accessing the right link. + +Fixes: f16a7dd5cf27 ("smc: netlink interface for SMC sockets") +Reported-by: henaumars +Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=7616 +Signed-off-by: Wen Gu +Reviewed-by: Tony Lu +Link: https://lore.kernel.org/r/1703662835-53416-1-git-send-email-guwen@linux.alibaba.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/smc/smc_diag.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c +index 80ea7d954eceb..801044e7d1949 100644 +--- a/net/smc/smc_diag.c ++++ b/net/smc/smc_diag.c +@@ -153,8 +153,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, + .lnk[0].link_id = link->link_id, + }; + +- memcpy(linfo.lnk[0].ibname, +- smc->conn.lgr->lnk[0].smcibdev->ibdev->name, ++ memcpy(linfo.lnk[0].ibname, link->smcibdev->ibdev->name, + sizeof(link->smcibdev->ibdev->name)); + smc_gid_be16_convert(linfo.lnk[0].gid, link->gid); + smc_gid_be16_convert(linfo.lnk[0].peer_gid, link->peer_gid); +-- +2.43.0 + diff --git a/queue-6.1/net-timestamp-extend-sof_timestamping_opt_id-to-hw-t.patch b/queue-6.1/net-timestamp-extend-sof_timestamping_opt_id-to-hw-t.patch new file mode 100644 index 00000000000..f052b5e6986 --- /dev/null +++ b/queue-6.1/net-timestamp-extend-sof_timestamping_opt_id-to-hw-t.patch @@ -0,0 +1,52 @@ +From f34a1a0c97dbe98c13f2e62a01b50c39a1ff419d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 6 Mar 2023 08:07:38 -0800 +Subject: net-timestamp: extend SOF_TIMESTAMPING_OPT_ID to HW timestamps + +From: Vadim Fedorenko + +[ Upstream commit 8ca5a5790b9a1ce147484d2a2c4e66d2553f3d6c ] + +When the feature was added it was enabled for SW timestamps only but +with current hardware the same out-of-order timestamps can be seen. +Let's expand the area for the feature to all types of timestamps. 
+ +Signed-off-by: Vadim Fedorenko +Reviewed-by: Willem de Bruijn +Signed-off-by: David S. Miller +Stable-dep-of: 7f6ca95d16b9 ("net: Implement missing getsockopt(SO_TIMESTAMPING_NEW)") +Signed-off-by: Sasha Levin +--- + net/ipv4/ip_output.c | 2 +- + net/ipv6/ip6_output.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c +index 493c679ea54f3..d8ec802f97524 100644 +--- a/net/ipv4/ip_output.c ++++ b/net/ipv4/ip_output.c +@@ -990,7 +990,7 @@ static int __ip_append_data(struct sock *sk, + mtu = cork->gso_size ? IP_MAX_MTU : cork->fragsize; + paged = !!cork->gso_size; + +- if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && ++ if (cork->tx_flags & SKBTX_ANY_TSTAMP && + sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + tskey = atomic_inc_return(&sk->sk_tskey) - 1; + +diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c +index 3c2b2a85de367..04822e2cba74a 100644 +--- a/net/ipv6/ip6_output.c ++++ b/net/ipv6/ip6_output.c +@@ -1506,7 +1506,7 @@ static int __ip6_append_data(struct sock *sk, + mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize; + orig_mtu = mtu; + +- if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && ++ if (cork->tx_flags & SKBTX_ANY_TSTAMP && + sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) + tskey = atomic_inc_return(&sk->sk_tskey) - 1; + +-- +2.43.0 + diff --git a/queue-6.1/netfilter-flowtable-allow-unidirectional-rules.patch b/queue-6.1/netfilter-flowtable-allow-unidirectional-rules.patch new file mode 100644 index 00000000000..fcb18397321 --- /dev/null +++ b/queue-6.1/netfilter-flowtable-allow-unidirectional-rules.patch @@ -0,0 +1,76 @@ +From de10b8ea976d0c729b8d59713e03fe511557d6b9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 1 Feb 2023 17:30:56 +0100 +Subject: netfilter: flowtable: allow unidirectional rules + +From: Vlad Buslov + +[ Upstream commit 8f84780b84d645d6e35467f4a6f3236b20d7f4b2 ] + +Modify flow table offload to support unidirectional connections by +extending enum nf_flow_flags with new "NF_FLOW_HW_BIDIRECTIONAL" flag. Only +offload reply direction when the flag is set. This infrastructure change is +necessary to support offloading UDP NEW connections in original direction +in following patches in series. + +Signed-off-by: Vlad Buslov +Signed-off-by: David S. 
Miller +Stable-dep-of: 125f1c7f26ff ("net/sched: act_ct: Take per-cb reference to tcf_ct_flow_table") +Signed-off-by: Sasha Levin +--- + include/net/netfilter/nf_flow_table.h | 1 + + net/netfilter/nf_flow_table_offload.c | 12 ++++++++---- + 2 files changed, 9 insertions(+), 4 deletions(-) + +diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h +index cd982f4a0f50c..88ab98ab41d9f 100644 +--- a/include/net/netfilter/nf_flow_table.h ++++ b/include/net/netfilter/nf_flow_table.h +@@ -164,6 +164,7 @@ enum nf_flow_flags { + NF_FLOW_HW_DYING, + NF_FLOW_HW_DEAD, + NF_FLOW_HW_PENDING, ++ NF_FLOW_HW_BIDIRECTIONAL, + }; + + enum flow_offload_type { +diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c +index 4d9b99abe37d6..8b852f10fab4b 100644 +--- a/net/netfilter/nf_flow_table_offload.c ++++ b/net/netfilter/nf_flow_table_offload.c +@@ -895,8 +895,9 @@ static int flow_offload_rule_add(struct flow_offload_work *offload, + + ok_count += flow_offload_tuple_add(offload, flow_rule[0], + FLOW_OFFLOAD_DIR_ORIGINAL); +- ok_count += flow_offload_tuple_add(offload, flow_rule[1], +- FLOW_OFFLOAD_DIR_REPLY); ++ if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags)) ++ ok_count += flow_offload_tuple_add(offload, flow_rule[1], ++ FLOW_OFFLOAD_DIR_REPLY); + if (ok_count == 0) + return -ENOENT; + +@@ -926,7 +927,8 @@ static void flow_offload_work_del(struct flow_offload_work *offload) + { + clear_bit(IPS_HW_OFFLOAD_BIT, &offload->flow->ct->status); + flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_ORIGINAL); +- flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY); ++ if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags)) ++ flow_offload_tuple_del(offload, FLOW_OFFLOAD_DIR_REPLY); + set_bit(NF_FLOW_HW_DEAD, &offload->flow->flags); + } + +@@ -946,7 +948,9 @@ static void flow_offload_work_stats(struct flow_offload_work *offload) + u64 lastused; + + flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_ORIGINAL, &stats[0]); +- flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_REPLY, &stats[1]); ++ if (test_bit(NF_FLOW_HW_BIDIRECTIONAL, &offload->flow->flags)) ++ flow_offload_tuple_stats(offload, FLOW_OFFLOAD_DIR_REPLY, ++ &stats[1]); + + lastused = max_t(u64, stats[0].lastused, stats[1].lastused); + offload->flow->timeout = max_t(u64, offload->flow->timeout, +-- +2.43.0 + diff --git a/queue-6.1/netfilter-flowtable-cache-info-of-last-offload.patch b/queue-6.1/netfilter-flowtable-cache-info-of-last-offload.patch new file mode 100644 index 00000000000..f468da93ca2 --- /dev/null +++ b/queue-6.1/netfilter-flowtable-cache-info-of-last-offload.patch @@ -0,0 +1,171 @@ +From aa8689eb8935d603a6f52824c4f47b3279e22da5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 1 Feb 2023 17:30:57 +0100 +Subject: netfilter: flowtable: cache info of last offload + +From: Vlad Buslov + +[ Upstream commit 1a441a9b8be8849957a01413a144f84932c324cb ] + +Modify flow table offload to cache the last ct info status that was passed +to the driver offload callbacks by extending enum nf_flow_flags with new +"NF_FLOW_HW_ESTABLISHED" flag. Set the flag if ctinfo was 'established' +during last act_ct meta actions fill call. This infrastructure change is +necessary to optimize promoting of UDP connections from 'new' to +'established' in following patches in this series. + +Signed-off-by: Vlad Buslov +Signed-off-by: David S. 
Miller +Stable-dep-of: 125f1c7f26ff ("net/sched: act_ct: Take per-cb reference to tcf_ct_flow_table") +Signed-off-by: Sasha Levin +--- + include/net/netfilter/nf_flow_table.h | 7 ++++--- + net/netfilter/nf_flow_table_inet.c | 2 +- + net/netfilter/nf_flow_table_offload.c | 6 +++--- + net/sched/act_ct.c | 12 +++++++----- + 4 files changed, 15 insertions(+), 12 deletions(-) + +diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h +index 88ab98ab41d9f..ebb28ec5b6faf 100644 +--- a/include/net/netfilter/nf_flow_table.h ++++ b/include/net/netfilter/nf_flow_table.h +@@ -57,7 +57,7 @@ struct nf_flowtable_type { + struct net_device *dev, + enum flow_block_command cmd); + int (*action)(struct net *net, +- const struct flow_offload *flow, ++ struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule); + void (*free)(struct nf_flowtable *ft); +@@ -165,6 +165,7 @@ enum nf_flow_flags { + NF_FLOW_HW_DEAD, + NF_FLOW_HW_PENDING, + NF_FLOW_HW_BIDIRECTIONAL, ++ NF_FLOW_HW_ESTABLISHED, + }; + + enum flow_offload_type { +@@ -313,10 +314,10 @@ void nf_flow_table_offload_flush_cleanup(struct nf_flowtable *flowtable); + int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, + struct net_device *dev, + enum flow_block_command cmd); +-int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, ++int nf_flow_rule_route_ipv4(struct net *net, struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule); +-int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow, ++int nf_flow_rule_route_ipv6(struct net *net, struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule); + +diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c +index 0ccabf3fa6aa3..9505f9d188ff2 100644 +--- a/net/netfilter/nf_flow_table_inet.c ++++ b/net/netfilter/nf_flow_table_inet.c +@@ -39,7 +39,7 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb, + } + + static int nf_flow_rule_route_inet(struct net *net, +- const struct flow_offload *flow, ++ struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) + { +diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c +index 8b852f10fab4b..1c26f03fc6617 100644 +--- a/net/netfilter/nf_flow_table_offload.c ++++ b/net/netfilter/nf_flow_table_offload.c +@@ -679,7 +679,7 @@ nf_flow_rule_route_common(struct net *net, const struct flow_offload *flow, + return 0; + } + +-int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, ++int nf_flow_rule_route_ipv4(struct net *net, struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) + { +@@ -704,7 +704,7 @@ int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow, + } + EXPORT_SYMBOL_GPL(nf_flow_rule_route_ipv4); + +-int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow, ++int nf_flow_rule_route_ipv6(struct net *net, struct flow_offload *flow, + enum flow_offload_tuple_dir dir, + struct nf_flow_rule *flow_rule) + { +@@ -735,7 +735,7 @@ nf_flow_offload_rule_alloc(struct net *net, + { + const struct nf_flowtable *flowtable = offload->flowtable; + const struct flow_offload_tuple *tuple, *other_tuple; +- const struct flow_offload *flow = offload->flow; ++ struct flow_offload *flow = offload->flow; + struct dst_entry *other_dst = NULL; + struct nf_flow_rule *flow_rule; + int 
err = -ENOMEM; +diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c +index 478cedc29b737..86d269724485a 100644 +--- a/net/sched/act_ct.c ++++ b/net/sched/act_ct.c +@@ -168,11 +168,11 @@ tcf_ct_flow_table_add_action_nat_udp(const struct nf_conntrack_tuple *tuple, + + static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct, + enum ip_conntrack_dir dir, ++ enum ip_conntrack_info ctinfo, + struct flow_action *action) + { + struct nf_conn_labels *ct_labels; + struct flow_action_entry *entry; +- enum ip_conntrack_info ctinfo; + u32 *act_ct_labels; + + entry = tcf_ct_flow_table_flow_action_get_next(action); +@@ -180,8 +180,6 @@ static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct, + #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) + entry->ct_metadata.mark = READ_ONCE(ct->mark); + #endif +- ctinfo = dir == IP_CT_DIR_ORIGINAL ? IP_CT_ESTABLISHED : +- IP_CT_ESTABLISHED_REPLY; + /* aligns with the CT reference on the SKB nf_ct_set */ + entry->ct_metadata.cookie = (unsigned long)ct | ctinfo; + entry->ct_metadata.orig_dir = dir == IP_CT_DIR_ORIGINAL; +@@ -235,22 +233,26 @@ static int tcf_ct_flow_table_add_action_nat(struct net *net, + } + + static int tcf_ct_flow_table_fill_actions(struct net *net, +- const struct flow_offload *flow, ++ struct flow_offload *flow, + enum flow_offload_tuple_dir tdir, + struct nf_flow_rule *flow_rule) + { + struct flow_action *action = &flow_rule->rule->action; + int num_entries = action->num_entries; + struct nf_conn *ct = flow->ct; ++ enum ip_conntrack_info ctinfo; + enum ip_conntrack_dir dir; + int i, err; + + switch (tdir) { + case FLOW_OFFLOAD_DIR_ORIGINAL: + dir = IP_CT_DIR_ORIGINAL; ++ ctinfo = IP_CT_ESTABLISHED; ++ set_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags); + break; + case FLOW_OFFLOAD_DIR_REPLY: + dir = IP_CT_DIR_REPLY; ++ ctinfo = IP_CT_ESTABLISHED_REPLY; + break; + default: + return -EOPNOTSUPP; +@@ -260,7 +262,7 @@ static int tcf_ct_flow_table_fill_actions(struct net *net, + if (err) + goto err_nat; + +- tcf_ct_flow_table_add_action_meta(ct, dir, action); ++ tcf_ct_flow_table_add_action_meta(ct, dir, ctinfo, action); + return 0; + + err_nat: +-- +2.43.0 + diff --git a/queue-6.1/netfilter-flowtable-gc-pushes-back-packets-to-classi.patch b/queue-6.1/netfilter-flowtable-gc-pushes-back-packets-to-classi.patch new file mode 100644 index 00000000000..1c70921c422 --- /dev/null +++ b/queue-6.1/netfilter-flowtable-gc-pushes-back-packets-to-classi.patch @@ -0,0 +1,104 @@ +From 0449b478e6c121959ee093763e1b856302a2a0bf Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 24 Oct 2023 21:09:47 +0200 +Subject: netfilter: flowtable: GC pushes back packets to classic path + +From: Pablo Neira Ayuso + +[ Upstream commit 735795f68b37e9bb49f642407a0d49b1631ea1c7 ] + +Since 41f2c7c342d3 ("net/sched: act_ct: Fix promotion of offloaded +unreplied tuple"), flowtable GC pushes back flows with IPS_SEEN_REPLY +back to classic path in every run, ie. every second. This is because of +a new check for NF_FLOW_HW_ESTABLISHED which is specific of sched/act_ct. + +In Netfilter's flowtable case, NF_FLOW_HW_ESTABLISHED never gets set on +and IPS_SEEN_REPLY is unreliable since users decide when to offload the +flow before, such bit might be set on at a later stage. + +Fix it by adding a custom .gc handler that sched/act_ct can use to +deal with its NF_FLOW_HW_ESTABLISHED bit. 
+ +Fixes: 41f2c7c342d3 ("net/sched: act_ct: Fix promotion of offloaded unreplied tuple") +Reported-by: Vladimir Smelhaus +Reviewed-by: Paul Blakey +Signed-off-by: Pablo Neira Ayuso +Stable-dep-of: 125f1c7f26ff ("net/sched: act_ct: Take per-cb reference to tcf_ct_flow_table") +Signed-off-by: Sasha Levin +--- + include/net/netfilter/nf_flow_table.h | 1 + + net/netfilter/nf_flow_table_core.c | 14 +++++++------- + net/sched/act_ct.c | 7 +++++++ + 3 files changed, 15 insertions(+), 7 deletions(-) + +diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h +index f37f9f34430c1..0b163ead95c9f 100644 +--- a/include/net/netfilter/nf_flow_table.h ++++ b/include/net/netfilter/nf_flow_table.h +@@ -53,6 +53,7 @@ struct nf_flowtable_type { + struct list_head list; + int family; + int (*init)(struct nf_flowtable *ft); ++ bool (*gc)(const struct flow_offload *flow); + int (*setup)(struct nf_flowtable *ft, + struct net_device *dev, + enum flow_block_command cmd); +diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c +index baddb93a5e8cf..c1d99cb370b44 100644 +--- a/net/netfilter/nf_flow_table_core.c ++++ b/net/netfilter/nf_flow_table_core.c +@@ -331,12 +331,6 @@ void flow_offload_refresh(struct nf_flowtable *flow_table, + } + EXPORT_SYMBOL_GPL(flow_offload_refresh); + +-static bool nf_flow_is_outdated(const struct flow_offload *flow) +-{ +- return test_bit(IPS_SEEN_REPLY_BIT, &flow->ct->status) && +- !test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags); +-} +- + static inline bool nf_flow_has_expired(const struct flow_offload *flow) + { + return nf_flow_timeout_delta(flow->timeout) <= 0; +@@ -422,12 +416,18 @@ nf_flow_table_iterate(struct nf_flowtable *flow_table, + return err; + } + ++static bool nf_flow_custom_gc(struct nf_flowtable *flow_table, ++ const struct flow_offload *flow) ++{ ++ return flow_table->type->gc && flow_table->type->gc(flow); ++} ++ + static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table, + struct flow_offload *flow, void *data) + { + if (nf_flow_has_expired(flow) || + nf_ct_is_dying(flow->ct) || +- nf_flow_is_outdated(flow)) ++ nf_flow_custom_gc(flow_table, flow)) + flow_offload_teardown(flow); + + if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) { +diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c +index b80a58d3bf0f3..4d34474f2cc0e 100644 +--- a/net/sched/act_ct.c ++++ b/net/sched/act_ct.c +@@ -274,7 +274,14 @@ static int tcf_ct_flow_table_fill_actions(struct net *net, + return err; + } + ++static bool tcf_ct_flow_is_outdated(const struct flow_offload *flow) ++{ ++ return test_bit(IPS_SEEN_REPLY_BIT, &flow->ct->status) && ++ !test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags); ++} ++ + static struct nf_flowtable_type flowtable_ct = { ++ .gc = tcf_ct_flow_is_outdated, + .action = tcf_ct_flow_table_fill_actions, + .owner = THIS_MODULE, + }; +-- +2.43.0 + diff --git a/queue-6.1/netfilter-nf_tables-set-transport-offset-from-mac-he.patch b/queue-6.1/netfilter-nf_tables-set-transport-offset-from-mac-he.patch new file mode 100644 index 00000000000..6ded120e31d --- /dev/null +++ b/queue-6.1/netfilter-nf_tables-set-transport-offset-from-mac-he.patch @@ -0,0 +1,75 @@ +From 56cc1e9b5b7e464b9e998329d7173330be70efb2 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Dec 2023 11:50:12 +0100 +Subject: netfilter: nf_tables: set transport offset from mac header for + netdev/egress + +From: Pablo Neira Ayuso + +[ Upstream commit 0ae8e4cca78781401b17721bfb72718fdf7b4912 ] + +Before this patch, transport offset (pkt->thoff) 
provides an offset +relative to the network header. This is fine for the inet families +because skb->data points to the network header in such case. However, +from netdev/egress, skb->data points to the mac header (if available), +thus, pkt->thoff is missing the mac header length. + +Add skb_network_offset() to the transport offset (pkt->thoff) for +netdev, so transport header mangling works as expected. Adjust payload +fast eval function to use skb->data now that pkt->thoff provides an +absolute offset. This explains why users report that matching on +egress/netdev works but payload mangling does not. + +This patch implicitly fixes payload mangling for IPv4 packets in +netdev/egress given skb_store_bits() requires an offset from skb->data +to reach the transport header. + +I suspect that nft_exthdr and the trace infra were also broken from +netdev/egress because they also take skb->data as start, and pkt->thoff +was not correct. + +Note that IPv6 is fine because ipv6_find_hdr() already provides a +transport offset starting from skb->data, which includes +skb_network_offset(). + +The bridge family also uses nft_set_pktinfo_ipv4_validate(), but there +skb_network_offset() is zero, so the update in this patch does not alter +the existing behaviour. + +Fixes: 42df6e1d221d ("netfilter: Introduce egress hook") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + include/net/netfilter/nf_tables_ipv4.h | 2 +- + net/netfilter/nf_tables_core.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h +index d8f6cb47ebe37..5225d2bd1a6e9 100644 +--- a/include/net/netfilter/nf_tables_ipv4.h ++++ b/include/net/netfilter/nf_tables_ipv4.h +@@ -30,7 +30,7 @@ static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) + return -1; + + len = iph_totlen(pkt->skb, iph); +- thoff = iph->ihl * 4; ++ thoff = skb_network_offset(pkt->skb) + (iph->ihl * 4); + if (pkt->skb->len < len) + return -1; + else if (len < thoff) +diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c +index cee3e4e905ec8..e0c117229ee9d 100644 +--- a/net/netfilter/nf_tables_core.c ++++ b/net/netfilter/nf_tables_core.c +@@ -141,7 +141,7 @@ static bool nft_payload_fast_eval(const struct nft_expr *expr, + else { + if (!(pkt->flags & NFT_PKTINFO_L4PROTO)) + return false; +- ptr = skb_network_header(skb) + nft_thoff(pkt); ++ ptr = skb->data + nft_thoff(pkt); + } + + ptr += priv->offset; +-- +2.43.0 + diff --git a/queue-6.1/netfilter-nft_immediate-drop-chain-reference-counter.patch b/queue-6.1/netfilter-nft_immediate-drop-chain-reference-counter.patch new file mode 100644 index 00000000000..adc031e98bf --- /dev/null +++ b/queue-6.1/netfilter-nft_immediate-drop-chain-reference-counter.patch @@ -0,0 +1,36 @@ +From d39cbbf50dc98ed34532f9728b5fb98aa77c1b82 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 1 Jan 2024 20:15:33 +0100 +Subject: netfilter: nft_immediate: drop chain reference counter on error + +From: Pablo Neira Ayuso + +[ Upstream commit b29be0ca8e816119ccdf95cc7d7c7be9bde005f1 ] + +In the init path, nft_data_init() bumps the chain reference counter, +decrement it on error by following the error path which calls +nft_data_release() to restore it. 
+ +Fixes: 4bedf9eee016 ("netfilter: nf_tables: fix chain binding transaction logic") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nft_immediate.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c +index 5f59dbab3e933..55fcf0280c5c3 100644 +--- a/net/netfilter/nft_immediate.c ++++ b/net/netfilter/nft_immediate.c +@@ -78,7 +78,7 @@ static int nft_immediate_init(const struct nft_ctx *ctx, + case NFT_GOTO: + err = nf_tables_bind_chain(ctx, chain); + if (err < 0) +- return err; ++ goto err1; + break; + default: + break; +-- +2.43.0 + diff --git a/queue-6.1/netfilter-use-skb_ip_totlen-and-iph_totlen.patch b/queue-6.1/netfilter-use-skb_ip_totlen-and-iph_totlen.patch new file mode 100644 index 00000000000..556c8c8d772 --- /dev/null +++ b/queue-6.1/netfilter-use-skb_ip_totlen-and-iph_totlen.patch @@ -0,0 +1,97 @@ +From d9408d0e798b54be53f54b011e26b31027f18849 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 28 Jan 2023 10:58:34 -0500 +Subject: netfilter: use skb_ip_totlen and iph_totlen + +From: Xin Long + +[ Upstream commit a13fbf5ed5b4fc9095f12e955ca3a59b5507ff01 ] + +There are also quite some places in netfilter that may process IPv4 TCP +GSO packets, we need to replace them too. + +In length_mt(), we have to use u_int32_t/int to accept skb_ip_totlen() +return value, otherwise it may overflow and mismatch. This change will +also help us add selftest for IPv4 BIG TCP in the following patch. + +Note that we don't need to replace the one in tcpmss_tg4(), as it will +return if there is data after tcphdr in tcpmss_mangle_packet(). The +same in mangle_contents() in nf_nat_helper.c, it returns false when +skb->len + extra > 65535 in enlarge_skb(). 
+ +Signed-off-by: Xin Long +Reviewed-by: Eric Dumazet +Signed-off-by: Jakub Kicinski +Stable-dep-of: 0ae8e4cca787 ("netfilter: nf_tables: set transport offset from mac header for netdev/egress") +Signed-off-by: Sasha Levin +--- + include/net/netfilter/nf_tables_ipv4.h | 4 ++-- + net/netfilter/ipvs/ip_vs_xmit.c | 2 +- + net/netfilter/nf_log_syslog.c | 2 +- + net/netfilter/xt_length.c | 2 +- + 4 files changed, 5 insertions(+), 5 deletions(-) + +diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h +index c4a6147b0ef8c..d8f6cb47ebe37 100644 +--- a/include/net/netfilter/nf_tables_ipv4.h ++++ b/include/net/netfilter/nf_tables_ipv4.h +@@ -29,7 +29,7 @@ static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) + if (iph->ihl < 5 || iph->version != 4) + return -1; + +- len = ntohs(iph->tot_len); ++ len = iph_totlen(pkt->skb, iph); + thoff = iph->ihl * 4; + if (pkt->skb->len < len) + return -1; +@@ -62,7 +62,7 @@ static inline int nft_set_pktinfo_ipv4_ingress(struct nft_pktinfo *pkt) + if (iph->ihl < 5 || iph->version != 4) + goto inhdr_error; + +- len = ntohs(iph->tot_len); ++ len = iph_totlen(pkt->skb, iph); + thoff = iph->ihl * 4; + if (pkt->skb->len < len) { + __IP_INC_STATS(nft_net(pkt), IPSTATS_MIB_INTRUNCATEDPKTS); +diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c +index 7243079ef3546..b452eb3ddcecb 100644 +--- a/net/netfilter/ipvs/ip_vs_xmit.c ++++ b/net/netfilter/ipvs/ip_vs_xmit.c +@@ -994,7 +994,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, + old_dsfield = ipv4_get_dsfield(old_iph); + *ttl = old_iph->ttl; + if (payload_len) +- *payload_len = ntohs(old_iph->tot_len); ++ *payload_len = skb_ip_totlen(skb); + } + + /* Implement full-functionality option for ECN encapsulation */ +diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c +index cb894f0d63e9d..c66689ad2b491 100644 +--- a/net/netfilter/nf_log_syslog.c ++++ b/net/netfilter/nf_log_syslog.c +@@ -322,7 +322,7 @@ dump_ipv4_packet(struct net *net, struct nf_log_buf *m, + + /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ + nf_log_buf_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", +- ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, ++ iph_totlen(skb, ih), ih->tos & IPTOS_TOS_MASK, + ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); + + /* Max length: 6 "CE DF MF " */ +diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c +index 9fbfad13176f0..ca730cedb5d41 100644 +--- a/net/netfilter/xt_length.c ++++ b/net/netfilter/xt_length.c +@@ -21,7 +21,7 @@ static bool + length_mt(const struct sk_buff *skb, struct xt_action_param *par) + { + const struct xt_length_info *info = par->matchinfo; +- u_int16_t pktlen = ntohs(ip_hdr(skb)->tot_len); ++ u32 pktlen = skb_ip_totlen(skb); + + return (pktlen >= info->min && pktlen <= info->max) ^ info->invert; + } +-- +2.43.0 + diff --git a/queue-6.1/nfc-llcp_core-hold-a-ref-to-llcp_local-dev-when-hold.patch b/queue-6.1/nfc-llcp_core-hold-a-ref-to-llcp_local-dev-when-hold.patch new file mode 100644 index 00000000000..8eabbedee26 --- /dev/null +++ b/queue-6.1/nfc-llcp_core-hold-a-ref-to-llcp_local-dev-when-hold.patch @@ -0,0 +1,128 @@ +From 931ea9a2205ca793f1dcdff5f7f215cc9d0f2826 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 19 Dec 2023 23:19:43 +0530 +Subject: nfc: llcp_core: Hold a ref to llcp_local->dev when holding a ref to + llcp_local + +From: Siddh Raman Pant + +[ Upstream commit c95f919567d6f1914f13350af61a1b044ac85014 ] + 
+llcp_sock_sendmsg() calls nfc_llcp_send_ui_frame() which in turn calls +nfc_alloc_send_skb(), which accesses the nfc_dev from the llcp_sock for +getting the headroom and tailroom needed for skb allocation. + +Parallelly the nfc_dev can be freed, as the refcount is decreased via +nfc_free_device(), leading to a UAF reported by Syzkaller, which can +be summarized as follows: + +(1) llcp_sock_sendmsg() -> nfc_llcp_send_ui_frame() + -> nfc_alloc_send_skb() -> Dereference *nfc_dev +(2) virtual_ncidev_close() -> nci_free_device() -> nfc_free_device() + -> put_device() -> nfc_release() -> Free *nfc_dev + +When a reference to llcp_local is acquired, we do not acquire the same +for the nfc_dev. This leads to freeing even when the llcp_local is in +use, and this is the case with the UAF described above too. + +Thus, when we acquire a reference to llcp_local, we should acquire a +reference to nfc_dev, and release the references appropriately later. + +References for llcp_local is initialized in nfc_llcp_register_device() +(which is called by nfc_register_device()). Thus, we should acquire a +reference to nfc_dev there. + +nfc_unregister_device() calls nfc_llcp_unregister_device() which in +turn calls nfc_llcp_local_put(). Thus, the reference to nfc_dev is +appropriately released later. + +Reported-and-tested-by: syzbot+bbe84a4010eeea00982d@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=bbe84a4010eeea00982d +Fixes: c7aa12252f51 ("NFC: Take a reference on the LLCP local pointer when creating a socket") +Reviewed-by: Suman Ghosh +Signed-off-by: Siddh Raman Pant +Reviewed-by: Krzysztof Kozlowski +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + net/nfc/llcp_core.c | 39 ++++++++++++++++++++++++++++++++++++--- + 1 file changed, 36 insertions(+), 3 deletions(-) + +diff --git a/net/nfc/llcp_core.c b/net/nfc/llcp_core.c +index 1dac28136e6a3..18be13fb9b75a 100644 +--- a/net/nfc/llcp_core.c ++++ b/net/nfc/llcp_core.c +@@ -145,6 +145,13 @@ static void nfc_llcp_socket_release(struct nfc_llcp_local *local, bool device, + + static struct nfc_llcp_local *nfc_llcp_local_get(struct nfc_llcp_local *local) + { ++ /* Since using nfc_llcp_local may result in usage of nfc_dev, whenever ++ * we hold a reference to local, we also need to hold a reference to ++ * the device to avoid UAF. 
++ */ ++ if (!nfc_get_device(local->dev->idx)) ++ return NULL; ++ + kref_get(&local->ref); + + return local; +@@ -177,10 +184,18 @@ static void local_release(struct kref *ref) + + int nfc_llcp_local_put(struct nfc_llcp_local *local) + { ++ struct nfc_dev *dev; ++ int ret; ++ + if (local == NULL) + return 0; + +- return kref_put(&local->ref, local_release); ++ dev = local->dev; ++ ++ ret = kref_put(&local->ref, local_release); ++ nfc_put_device(dev); ++ ++ return ret; + } + + static struct nfc_llcp_sock *nfc_llcp_sock_get(struct nfc_llcp_local *local, +@@ -959,8 +974,17 @@ static void nfc_llcp_recv_connect(struct nfc_llcp_local *local, + } + + new_sock = nfc_llcp_sock(new_sk); +- new_sock->dev = local->dev; ++ + new_sock->local = nfc_llcp_local_get(local); ++ if (!new_sock->local) { ++ reason = LLCP_DM_REJ; ++ sock_put(&new_sock->sk); ++ release_sock(&sock->sk); ++ sock_put(&sock->sk); ++ goto fail; ++ } ++ ++ new_sock->dev = local->dev; + new_sock->rw = sock->rw; + new_sock->miux = sock->miux; + new_sock->nfc_protocol = sock->nfc_protocol; +@@ -1597,7 +1621,16 @@ int nfc_llcp_register_device(struct nfc_dev *ndev) + if (local == NULL) + return -ENOMEM; + +- local->dev = ndev; ++ /* As we are going to initialize local's refcount, we need to get the ++ * nfc_dev to avoid UAF, otherwise there is no point in continuing. ++ * See nfc_llcp_local_get(). ++ */ ++ local->dev = nfc_get_device(ndev->idx); ++ if (!local->dev) { ++ kfree(local); ++ return -ENODEV; ++ } ++ + INIT_LIST_HEAD(&local->list); + kref_init(&local->ref); + mutex_init(&local->sdp_lock); +-- +2.43.0 + diff --git a/queue-6.1/octeontx2-af-always-configure-nix-tx-link-credits-ba.patch b/queue-6.1/octeontx2-af-always-configure-nix-tx-link-credits-ba.patch new file mode 100644 index 00000000000..c4a723215be --- /dev/null +++ b/queue-6.1/octeontx2-af-always-configure-nix-tx-link-credits-ba.patch @@ -0,0 +1,184 @@ +From 6e614dd77c02e745841357aa5fe9f9b4b1a63b2b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 2 Jan 2024 15:26:43 +0530 +Subject: octeontx2-af: Always configure NIX TX link credits based on max frame + size + +From: Naveen Mamindlapalli + +[ Upstream commit a0d9528f6daf7fe8de217fa80a94d2989d2a57a7 ] + +Currently the NIX TX link credits are initialized based on the max frame +size that can be transmitted on a link but when the MTU is changed, the +NIX TX link credits are reprogrammed by the SW based on the new MTU value. +Since SMQ max packet length is programmed to max frame size by default, +there is a chance that NIX TX may stall while sending a max frame sized +packet on the link with insufficient credits to send the packet all at +once. This patch avoids stall issue by not changing the link credits +dynamically when the MTU is changed. + +Fixes: 1c74b89171c3 ("octeontx2-af: Wait for TX link idle for credits change") +Signed-off-by: Naveen Mamindlapalli +Signed-off-by: Sunil Kovvuri Goutham +Signed-off-by: Nithin Kumar Dabilpuram +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + .../ethernet/marvell/octeontx2/af/rvu_nix.c | 110 +----------------- + 1 file changed, 3 insertions(+), 107 deletions(-) + +diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +index 959f36efdc4a6..15f698020ec44 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c ++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +@@ -3923,90 +3923,18 @@ static void nix_find_link_frs(struct rvu *rvu, + req->minlen = minlen; + } + +-static int +-nix_config_link_credits(struct rvu *rvu, int blkaddr, int link, +- u16 pcifunc, u64 tx_credits) +-{ +- struct rvu_hwinfo *hw = rvu->hw; +- int pf = rvu_get_pf(pcifunc); +- u8 cgx_id = 0, lmac_id = 0; +- unsigned long poll_tmo; +- bool restore_tx_en = 0; +- struct nix_hw *nix_hw; +- u64 cfg, sw_xoff = 0; +- u32 schq = 0; +- u32 credits; +- int rc; +- +- nix_hw = get_nix_hw(rvu->hw, blkaddr); +- if (!nix_hw) +- return NIX_AF_ERR_INVALID_NIXBLK; +- +- if (tx_credits == nix_hw->tx_credits[link]) +- return 0; +- +- /* Enable cgx tx if disabled for credits to be back */ +- if (is_pf_cgxmapped(rvu, pf)) { +- rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id); +- restore_tx_en = !rvu_cgx_config_tx(rvu_cgx_pdata(cgx_id, rvu), +- lmac_id, true); +- } +- +- mutex_lock(&rvu->rsrc_lock); +- /* Disable new traffic to link */ +- if (hw->cap.nix_shaping) { +- schq = nix_get_tx_link(rvu, pcifunc); +- sw_xoff = rvu_read64(rvu, blkaddr, NIX_AF_TL1X_SW_XOFF(schq)); +- rvu_write64(rvu, blkaddr, +- NIX_AF_TL1X_SW_XOFF(schq), BIT_ULL(0)); +- } +- +- rc = NIX_AF_ERR_LINK_CREDITS; +- poll_tmo = jiffies + usecs_to_jiffies(200000); +- /* Wait for credits to return */ +- do { +- if (time_after(jiffies, poll_tmo)) +- goto exit; +- usleep_range(100, 200); +- +- cfg = rvu_read64(rvu, blkaddr, +- NIX_AF_TX_LINKX_NORM_CREDIT(link)); +- credits = (cfg >> 12) & 0xFFFFFULL; +- } while (credits != nix_hw->tx_credits[link]); +- +- cfg &= ~(0xFFFFFULL << 12); +- cfg |= (tx_credits << 12); +- rvu_write64(rvu, blkaddr, NIX_AF_TX_LINKX_NORM_CREDIT(link), cfg); +- rc = 0; +- +- nix_hw->tx_credits[link] = tx_credits; +- +-exit: +- /* Enable traffic back */ +- if (hw->cap.nix_shaping && !sw_xoff) +- rvu_write64(rvu, blkaddr, NIX_AF_TL1X_SW_XOFF(schq), 0); +- +- /* Restore state of cgx tx */ +- if (restore_tx_en) +- rvu_cgx_config_tx(rvu_cgx_pdata(cgx_id, rvu), lmac_id, false); +- +- mutex_unlock(&rvu->rsrc_lock); +- return rc; +-} +- + int rvu_mbox_handler_nix_set_hw_frs(struct rvu *rvu, struct nix_frs_cfg *req, + struct msg_rsp *rsp) + { + struct rvu_hwinfo *hw = rvu->hw; + u16 pcifunc = req->hdr.pcifunc; + int pf = rvu_get_pf(pcifunc); +- int blkaddr, schq, link = -1; +- struct nix_txsch *txsch; +- u64 cfg, lmac_fifo_len; ++ int blkaddr, link = -1; + struct nix_hw *nix_hw; + struct rvu_pfvf *pfvf; + u8 cgx = 0, lmac = 0; + u16 max_mtu; ++ u64 cfg; + + blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NIX, pcifunc); + if (blkaddr < 0) +@@ -4027,25 +3955,6 @@ int rvu_mbox_handler_nix_set_hw_frs(struct rvu *rvu, struct nix_frs_cfg *req, + if (req->update_minlen && req->minlen < NIC_HW_MIN_FRS) + return NIX_AF_ERR_FRS_INVALID; + +- /* Check if requester wants to update SMQ's */ +- if (!req->update_smq) +- goto rx_frscfg; +- +- /* Update min/maxlen in each of the SMQ attached to this PF/VF */ +- txsch = &nix_hw->txsch[NIX_TXSCH_LVL_SMQ]; +- mutex_lock(&rvu->rsrc_lock); +- for (schq = 0; schq < txsch->schq.max; schq++) { +- if (TXSCH_MAP_FUNC(txsch->pfvf_map[schq]) != pcifunc) +- 
continue; +- cfg = rvu_read64(rvu, blkaddr, NIX_AF_SMQX_CFG(schq)); +- cfg = (cfg & ~(0xFFFFULL << 8)) | ((u64)req->maxlen << 8); +- if (req->update_minlen) +- cfg = (cfg & ~0x7FULL) | ((u64)req->minlen & 0x7F); +- rvu_write64(rvu, blkaddr, NIX_AF_SMQX_CFG(schq), cfg); +- } +- mutex_unlock(&rvu->rsrc_lock); +- +-rx_frscfg: + /* Check if config is for SDP link */ + if (req->sdp_link) { + if (!hw->sdp_links) +@@ -4068,7 +3977,6 @@ int rvu_mbox_handler_nix_set_hw_frs(struct rvu *rvu, struct nix_frs_cfg *req, + if (link < 0) + return NIX_AF_ERR_RX_LINK_INVALID; + +- + linkcfg: + nix_find_link_frs(rvu, req, pcifunc); + +@@ -4078,19 +3986,7 @@ int rvu_mbox_handler_nix_set_hw_frs(struct rvu *rvu, struct nix_frs_cfg *req, + cfg = (cfg & ~0xFFFFULL) | req->minlen; + rvu_write64(rvu, blkaddr, NIX_AF_RX_LINKX_CFG(link), cfg); + +- if (req->sdp_link || pf == 0) +- return 0; +- +- /* Update transmit credits for CGX links */ +- lmac_fifo_len = rvu_cgx_get_lmac_fifolen(rvu, cgx, lmac); +- if (!lmac_fifo_len) { +- dev_err(rvu->dev, +- "%s: Failed to get CGX/RPM%d:LMAC%d FIFO size\n", +- __func__, cgx, lmac); +- return 0; +- } +- return nix_config_link_credits(rvu, blkaddr, link, pcifunc, +- (lmac_fifo_len - req->maxlen) / 16); ++ return 0; + } + + int rvu_mbox_handler_nix_set_rx_cfg(struct rvu *rvu, struct nix_rx_cfg *req, +-- +2.43.0 + diff --git a/queue-6.1/octeontx2-af-fix-marking-couple-of-structure-as-__pa.patch b/queue-6.1/octeontx2-af-fix-marking-couple-of-structure-as-__pa.patch new file mode 100644 index 00000000000..a32a0a426be --- /dev/null +++ b/queue-6.1/octeontx2-af-fix-marking-couple-of-structure-as-__pa.patch @@ -0,0 +1,46 @@ +From abb25686716bdc469df9eb0f9cea42cd499e4e1e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 19 Dec 2023 19:56:33 +0530 +Subject: octeontx2-af: Fix marking couple of structure as __packed + +From: Suman Ghosh + +[ Upstream commit 0ee2384a5a0f3b4eeac8d10bb01a0609d245a4d1 ] + +Couple of structures was not marked as __packed. This patch +fixes the same and mark them as __packed. + +Fixes: 42006910b5ea ("octeontx2-af: cleanup KPU config data") +Signed-off-by: Suman Ghosh +Reviewed-by: Jacob Keller +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/marvell/octeontx2/af/npc.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/ethernet/marvell/octeontx2/af/npc.h b/drivers/net/ethernet/marvell/octeontx2/af/npc.h +index d027c23b8ef8e..aaff91bc7415a 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/af/npc.h ++++ b/drivers/net/ethernet/marvell/octeontx2/af/npc.h +@@ -514,7 +514,7 @@ struct npc_lt_def { + u8 ltype_mask; + u8 ltype_match; + u8 lid; +-}; ++} __packed; + + struct npc_lt_def_ipsec { + u8 ltype_mask; +@@ -522,7 +522,7 @@ struct npc_lt_def_ipsec { + u8 lid; + u8 spi_offset; + u8 spi_nz; +-}; ++} __packed; + + struct npc_lt_def_apad { + u8 ltype_mask; +-- +2.43.0 + diff --git a/queue-6.1/octeontx2-af-fix-pause-frame-configuration.patch b/queue-6.1/octeontx2-af-fix-pause-frame-configuration.patch new file mode 100644 index 00000000000..0f9960f96df --- /dev/null +++ b/queue-6.1/octeontx2-af-fix-pause-frame-configuration.patch @@ -0,0 +1,56 @@ +From c1c4d52f9e1a5f81883fafcb8b866f3bb4e20f70 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 8 Dec 2023 14:57:54 +0530 +Subject: octeontx2-af: Fix pause frame configuration + +From: Hariprasad Kelam + +[ Upstream commit e307b5a845c5951dabafc48d00b6424ee64716c4 ] + +The current implementation's default Pause Forward setting is causing +unnecessary network traffic. This patch disables Pause Forward to +address this issue. + +Fixes: 1121f6b02e7a ("octeontx2-af: Priority flow control configuration support") +Signed-off-by: Hariprasad Kelam +Signed-off-by: Sunil Kovvuri Goutham +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/marvell/octeontx2/af/rpm.c | 11 +++++++---- + 1 file changed, 7 insertions(+), 4 deletions(-) + +diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rpm.c b/drivers/net/ethernet/marvell/octeontx2/af/rpm.c +index a70e1153fa04b..6b4792a942d84 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/af/rpm.c ++++ b/drivers/net/ethernet/marvell/octeontx2/af/rpm.c +@@ -283,6 +283,11 @@ void rpm_lmac_pause_frm_config(void *rpmd, int lmac_id, bool enable) + cfg = FIELD_SET(RPM_PFC_CLASS_MASK, 0, cfg); + rpm_write(rpm, lmac_id, RPMX_CMRX_PRT_CBFC_CTL, cfg); + ++ /* Disable forward pause to driver */ ++ cfg = rpm_read(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG); ++ cfg &= ~RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_FWD; ++ rpm_write(rpm, lmac_id, RPMX_MTI_MAC100X_COMMAND_CONFIG, cfg); ++ + /* Enable channel mask for all LMACS */ + rpm_write(rpm, 0, RPMX_CMR_CHAN_MSK_OR, ~0ULL); + } +@@ -451,12 +456,10 @@ int rpm_lmac_pfc_config(void *rpmd, int lmac_id, u8 tx_pause, u8 rx_pause, u16 p + + if (rx_pause) { + cfg &= ~(RPMX_MTI_MAC100X_COMMAND_CONFIG_RX_P_DISABLE | +- RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_IGNORE | +- RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_FWD); ++ RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_IGNORE); + } else { + cfg |= (RPMX_MTI_MAC100X_COMMAND_CONFIG_RX_P_DISABLE | +- RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_IGNORE | +- RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_FWD); ++ RPMX_MTI_MAC100X_COMMAND_CONFIG_PAUSE_IGNORE); + } + + if (tx_pause) { +-- +2.43.0 + diff --git a/queue-6.1/octeontx2-af-re-enable-mac-tx-in-otx2_stop-processin.patch b/queue-6.1/octeontx2-af-re-enable-mac-tx-in-otx2_stop-processin.patch new file mode 100644 index 00000000000..a41ad10e878 --- /dev/null +++ b/queue-6.1/octeontx2-af-re-enable-mac-tx-in-otx2_stop-processin.patch @@ -0,0 +1,93 @@ +From a82d68a811ec60556c67c80775519e4d65f02f35 Mon Sep 17 00:00:00 2001 +From: Sasha 
Levin +Date: Tue, 2 Jan 2024 19:44:00 +0530 +Subject: octeontx2-af: Re-enable MAC TX in otx2_stop processing + +From: Naveen Mamindlapalli + +[ Upstream commit 818ed8933bd17bc91a9fa8b94a898189c546fc1a ] + +During QoS scheduling testing with multiple strict priority flows, the +netdev tx watchdog timeout routine is invoked when a low priority QoS +queue doesn't get a chance to transmit the packets because other high +priority flows are completely subscribing the transmit link. The netdev +tx watchdog timeout routine will stop MAC RX and TX functionality in +otx2_stop() routine before cleanup of HW TX queues which results in SMQ +flush errors because the packets belonging to low priority queues will +never gets flushed since MAC TX is disabled. This patch fixes the issue +by re-enabling MAC TX to ensure the packets in HW pipeline gets flushed +properly. + +Fixes: a7faa68b4e7f ("octeontx2-af: Start/Stop traffic in CGX along with NPC") +Signed-off-by: Naveen Mamindlapalli +Signed-off-by: Sunil Kovvuri Goutham +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/marvell/octeontx2/af/rvu.h | 1 + + .../net/ethernet/marvell/octeontx2/af/rvu_cgx.c | 17 +++++++++++++++++ + .../net/ethernet/marvell/octeontx2/af/rvu_nix.c | 8 +++++++- + 3 files changed, 25 insertions(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +index 95a7bc396e8ea..ab78e9d020751 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h ++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +@@ -850,6 +850,7 @@ u32 rvu_cgx_get_fifolen(struct rvu *rvu); + void *rvu_first_cgx_pdata(struct rvu *rvu); + int cgxlmac_to_pf(struct rvu *rvu, int cgx_id, int lmac_id); + int rvu_cgx_config_tx(void *cgxd, int lmac_id, bool enable); ++int rvu_cgx_tx_enable(struct rvu *rvu, u16 pcifunc, bool enable); + int rvu_cgx_prio_flow_ctrl_cfg(struct rvu *rvu, u16 pcifunc, u8 tx_pause, u8 rx_pause, + u16 pfc_en); + int rvu_cgx_cfg_pause_frm(struct rvu *rvu, u16 pcifunc, u8 tx_pause, u8 rx_pause); +diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c +index c60b9580ca969..fa658bd4dfb3b 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c ++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c +@@ -456,6 +456,23 @@ int rvu_cgx_config_rxtx(struct rvu *rvu, u16 pcifunc, bool start) + return mac_ops->mac_rx_tx_enable(cgxd, lmac_id, start); + } + ++int rvu_cgx_tx_enable(struct rvu *rvu, u16 pcifunc, bool enable) ++{ ++ int pf = rvu_get_pf(pcifunc); ++ struct mac_ops *mac_ops; ++ u8 cgx_id, lmac_id; ++ void *cgxd; ++ ++ if (!is_cgx_config_permitted(rvu, pcifunc)) ++ return LMAC_AF_ERR_PERM_DENIED; ++ ++ rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id); ++ cgxd = rvu_cgx_pdata(cgx_id, rvu); ++ mac_ops = get_mac_ops(cgxd); ++ ++ return mac_ops->mac_tx_enable(cgxd, lmac_id, enable); ++} ++ + int rvu_cgx_config_tx(void *cgxd, int lmac_id, bool enable) + { + struct mac_ops *mac_ops; +diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +index 15f698020ec44..7f9581ce7f1fe 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c ++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +@@ -4506,7 +4506,13 @@ int rvu_mbox_handler_nix_lf_stop_rx(struct rvu *rvu, struct msg_req *req, + pfvf = rvu_get_pfvf(rvu, pcifunc); + clear_bit(NIXLF_INITIALIZED, &pfvf->flags); + +- return 
rvu_cgx_start_stop_io(rvu, pcifunc, false); ++ err = rvu_cgx_start_stop_io(rvu, pcifunc, false); ++ if (err) ++ return err; ++ ++ rvu_cgx_tx_enable(rvu, pcifunc, true); ++ ++ return 0; + } + + #define RX_SA_BASE GENMASK_ULL(52, 7) +-- +2.43.0 + diff --git a/queue-6.1/octeontx2-af-support-variable-number-of-lmacs.patch b/queue-6.1/octeontx2-af-support-variable-number-of-lmacs.patch new file mode 100644 index 00000000000..fec38be5af0 --- /dev/null +++ b/queue-6.1/octeontx2-af-support-variable-number-of-lmacs.patch @@ -0,0 +1,342 @@ +From cc60721ea4736bbfd6d8355b455f0c847a4ccbe0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 5 Dec 2022 12:35:18 +0530 +Subject: octeontx2-af: Support variable number of lmacs + +From: Rakesh Babu Saladi + +[ Upstream commit f2e664ad503d4e5ce7c42a0862ab164331a0ef37 ] + +Most of the code in CGX/RPM driver assumes that max lmacs per +given MAC as always, 4 and the number of MAC blocks also as 4. +With this assumption, the max number of interfaces supported is +hardcoded to 16. This creates a problem as next gen CN10KB silicon +MAC supports 8 lmacs per MAC block. + +This patch solves the problem by using "max lmac per MAC block" +value from constant csrs and uses cgx_cnt_max value which is +populated based number of MAC blocks supported by silicon. + +Signed-off-by: Rakesh Babu Saladi +Signed-off-by: Hariprasad Kelam +Signed-off-by: Sunil Kovvuri Goutham +Signed-off-by: Paolo Abeni +Stable-dep-of: e307b5a845c5 ("octeontx2-af: Fix pause frame configuration") +Signed-off-by: Sasha Levin +--- + .../net/ethernet/marvell/octeontx2/af/cgx.c | 35 ++++++++----------- + .../net/ethernet/marvell/octeontx2/af/cgx.h | 6 ++-- + .../marvell/octeontx2/af/lmac_common.h | 5 ++- + .../net/ethernet/marvell/octeontx2/af/rvu.h | 2 +- + .../ethernet/marvell/octeontx2/af/rvu_cgx.c | 26 ++++++++------ + .../marvell/octeontx2/af/rvu_debugfs.c | 2 +- + .../ethernet/marvell/octeontx2/af/rvu_nix.c | 2 +- + .../marvell/octeontx2/af/rvu_npc_hash.c | 4 ++- + 8 files changed, 42 insertions(+), 40 deletions(-) + +diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c +index 65c0373d34d12..90be87dc105d3 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c ++++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c +@@ -78,7 +78,7 @@ static bool is_dev_rpm(void *cgxd) + + bool is_lmac_valid(struct cgx *cgx, int lmac_id) + { +- if (!cgx || lmac_id < 0 || lmac_id >= MAX_LMAC_PER_CGX) ++ if (!cgx || lmac_id < 0 || lmac_id >= cgx->max_lmac_per_mac) + return false; + return test_bit(lmac_id, &cgx->lmac_bmap); + } +@@ -90,7 +90,7 @@ static int get_sequence_id_of_lmac(struct cgx *cgx, int lmac_id) + { + int tmp, id = 0; + +- for_each_set_bit(tmp, &cgx->lmac_bmap, MAX_LMAC_PER_CGX) { ++ for_each_set_bit(tmp, &cgx->lmac_bmap, cgx->max_lmac_per_mac) { + if (tmp == lmac_id) + break; + id++; +@@ -121,7 +121,7 @@ u64 cgx_read(struct cgx *cgx, u64 lmac, u64 offset) + + struct lmac *lmac_pdata(u8 lmac_id, struct cgx *cgx) + { +- if (!cgx || lmac_id >= MAX_LMAC_PER_CGX) ++ if (!cgx || lmac_id >= cgx->max_lmac_per_mac) + return NULL; + + return cgx->lmac_idmap[lmac_id]; +@@ -1410,7 +1410,7 @@ int cgx_get_fwdata_base(u64 *base) + if (!cgx) + return -ENXIO; + +- first_lmac = find_first_bit(&cgx->lmac_bmap, MAX_LMAC_PER_CGX); ++ first_lmac = find_first_bit(&cgx->lmac_bmap, cgx->max_lmac_per_mac); + req = FIELD_SET(CMDREG_ID, CGX_CMD_GET_FWD_BASE, req); + err = cgx_fwi_cmd_generic(req, &resp, cgx, first_lmac); + if (!err) +@@ -1499,7 +1499,7 @@ static 
int cgx_fwi_link_change(struct cgx *cgx, int lmac_id, bool enable) + + static inline int cgx_fwi_read_version(u64 *resp, struct cgx *cgx) + { +- int first_lmac = find_first_bit(&cgx->lmac_bmap, MAX_LMAC_PER_CGX); ++ int first_lmac = find_first_bit(&cgx->lmac_bmap, cgx->max_lmac_per_mac); + u64 req = 0; + + req = FIELD_SET(CMDREG_ID, CGX_CMD_GET_FW_VER, req); +@@ -1537,7 +1537,7 @@ static void cgx_lmac_linkup_work(struct work_struct *work) + int i, err; + + /* Do Link up for all the enabled lmacs */ +- for_each_set_bit(i, &cgx->lmac_bmap, MAX_LMAC_PER_CGX) { ++ for_each_set_bit(i, &cgx->lmac_bmap, cgx->max_lmac_per_mac) { + err = cgx_fwi_link_change(cgx, i, true); + if (err) + dev_info(dev, "cgx port %d:%d Link up command failed\n", +@@ -1557,14 +1557,6 @@ int cgx_lmac_linkup_start(void *cgxd) + return 0; + } + +-static void cgx_lmac_get_fifolen(struct cgx *cgx) +-{ +- u64 cfg; +- +- cfg = cgx_read(cgx, 0, CGX_CONST); +- cgx->mac_ops->fifo_len = FIELD_GET(CGX_CONST_RXFIFO_SIZE, cfg); +-} +- + static int cgx_configure_interrupt(struct cgx *cgx, struct lmac *lmac, + int cnt, bool req_free) + { +@@ -1619,17 +1611,14 @@ static int cgx_lmac_init(struct cgx *cgx) + u64 lmac_list; + int i, err; + +- cgx_lmac_get_fifolen(cgx); +- +- cgx->lmac_count = cgx->mac_ops->get_nr_lmacs(cgx); + /* lmac_list specifies which lmacs are enabled + * when bit n is set to 1, LMAC[n] is enabled + */ + if (cgx->mac_ops->non_contiguous_serdes_lane) + lmac_list = cgx_read(cgx, 0, CGXX_CMRX_RX_LMACS) & 0xFULL; + +- if (cgx->lmac_count > MAX_LMAC_PER_CGX) +- cgx->lmac_count = MAX_LMAC_PER_CGX; ++ if (cgx->lmac_count > cgx->max_lmac_per_mac) ++ cgx->lmac_count = cgx->max_lmac_per_mac; + + for (i = 0; i < cgx->lmac_count; i++) { + lmac = kzalloc(sizeof(struct lmac), GFP_KERNEL); +@@ -1707,7 +1696,7 @@ static int cgx_lmac_exit(struct cgx *cgx) + } + + /* Free all lmac related resources */ +- for_each_set_bit(i, &cgx->lmac_bmap, MAX_LMAC_PER_CGX) { ++ for_each_set_bit(i, &cgx->lmac_bmap, cgx->max_lmac_per_mac) { + lmac = cgx->lmac_idmap[i]; + if (!lmac) + continue; +@@ -1723,6 +1712,12 @@ static int cgx_lmac_exit(struct cgx *cgx) + + static void cgx_populate_features(struct cgx *cgx) + { ++ u64 cfg; ++ ++ cfg = cgx_read(cgx, 0, CGX_CONST); ++ cgx->mac_ops->fifo_len = FIELD_GET(CGX_CONST_RXFIFO_SIZE, cfg); ++ cgx->max_lmac_per_mac = FIELD_GET(CGX_CONST_MAX_LMACS, cfg); ++ + if (is_dev_rpm(cgx)) + cgx->hw_features = (RVU_LMAC_FEAT_DMACF | RVU_MAC_RPM | + RVU_LMAC_FEAT_FC | RVU_LMAC_FEAT_PTP); +diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.h b/drivers/net/ethernet/marvell/octeontx2/af/cgx.h +index 04338db38671b..09ddb00f63cc7 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.h ++++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.h +@@ -18,11 +18,8 @@ + /* PCI BAR nos */ + #define PCI_CFG_REG_BAR_NUM 0 + +-#define CGX_ID_MASK 0x7 +-#define MAX_LMAC_PER_CGX 4 ++#define CGX_ID_MASK 0xF + #define MAX_DMAC_ENTRIES_PER_CGX 32 +-#define CGX_FIFO_LEN 65536 /* 64K for both Rx & Tx */ +-#define CGX_OFFSET(x) ((x) * MAX_LMAC_PER_CGX) + + /* Registers */ + #define CGXX_CMRX_CFG 0x00 +@@ -56,6 +53,7 @@ + #define CGXX_SCRATCH1_REG 0x1058 + #define CGX_CONST 0x2000 + #define CGX_CONST_RXFIFO_SIZE GENMASK_ULL(23, 0) ++#define CGX_CONST_MAX_LMACS GENMASK_ULL(31, 24) + #define CGXX_SPUX_CONTROL1 0x10000 + #define CGXX_SPUX_LNX_FEC_CORR_BLOCKS 0x10700 + #define CGXX_SPUX_LNX_FEC_UNCORR_BLOCKS 0x10800 +diff --git a/drivers/net/ethernet/marvell/octeontx2/af/lmac_common.h 
b/drivers/net/ethernet/marvell/octeontx2/af/lmac_common.h +index 52b6016789fa4..697cfec74aa1e 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/af/lmac_common.h ++++ b/drivers/net/ethernet/marvell/octeontx2/af/lmac_common.h +@@ -128,7 +128,10 @@ struct cgx { + struct pci_dev *pdev; + u8 cgx_id; + u8 lmac_count; +- struct lmac *lmac_idmap[MAX_LMAC_PER_CGX]; ++ /* number of LMACs per MAC could be 4 or 8 */ ++ u8 max_lmac_per_mac; ++#define MAX_LMAC_COUNT 8 ++ struct lmac *lmac_idmap[MAX_LMAC_COUNT]; + struct work_struct cgx_cmd_work; + struct workqueue_struct *cgx_cmd_workq; + struct list_head cgx_list; +diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +index ab78e9d020751..0b76dfa979d4e 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h ++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +@@ -480,7 +480,7 @@ struct rvu { + u8 cgx_mapped_pfs; + u8 cgx_cnt_max; /* CGX port count max */ + u8 *pf2cgxlmac_map; /* pf to cgx_lmac map */ +- u16 *cgxlmac2pf_map; /* bitmap of mapped pfs for ++ u64 *cgxlmac2pf_map; /* bitmap of mapped pfs for + * every cgx lmac port + */ + unsigned long pf_notify_bmap; /* Flags for PF notification */ +diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c +index fa658bd4dfb3b..bcb4385d0621c 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c ++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c +@@ -55,8 +55,9 @@ bool is_mac_feature_supported(struct rvu *rvu, int pf, int feature) + return (cgx_features_get(cgxd) & feature); + } + ++#define CGX_OFFSET(x) ((x) * rvu->hw->lmac_per_cgx) + /* Returns bitmap of mapped PFs */ +-static u16 cgxlmac_to_pfmap(struct rvu *rvu, u8 cgx_id, u8 lmac_id) ++static u64 cgxlmac_to_pfmap(struct rvu *rvu, u8 cgx_id, u8 lmac_id) + { + return rvu->cgxlmac2pf_map[CGX_OFFSET(cgx_id) + lmac_id]; + } +@@ -71,7 +72,8 @@ int cgxlmac_to_pf(struct rvu *rvu, int cgx_id, int lmac_id) + if (!pfmap) + return -ENODEV; + else +- return find_first_bit(&pfmap, 16); ++ return find_first_bit(&pfmap, ++ rvu->cgx_cnt_max * rvu->hw->lmac_per_cgx); + } + + static u8 cgxlmac_id_to_bmap(u8 cgx_id, u8 lmac_id) +@@ -129,14 +131,14 @@ static int rvu_map_cgx_lmac_pf(struct rvu *rvu) + if (!cgx_cnt_max) + return 0; + +- if (cgx_cnt_max > 0xF || MAX_LMAC_PER_CGX > 0xF) ++ if (cgx_cnt_max > 0xF || rvu->hw->lmac_per_cgx > 0xF) + return -EINVAL; + + /* Alloc map table + * An additional entry is required since PF id starts from 1 and + * hence entry at offset 0 is invalid. 
+ */ +- size = (cgx_cnt_max * MAX_LMAC_PER_CGX + 1) * sizeof(u8); ++ size = (cgx_cnt_max * rvu->hw->lmac_per_cgx + 1) * sizeof(u8); + rvu->pf2cgxlmac_map = devm_kmalloc(rvu->dev, size, GFP_KERNEL); + if (!rvu->pf2cgxlmac_map) + return -ENOMEM; +@@ -145,9 +147,10 @@ static int rvu_map_cgx_lmac_pf(struct rvu *rvu) + memset(rvu->pf2cgxlmac_map, 0xFF, size); + + /* Reverse map table */ +- rvu->cgxlmac2pf_map = devm_kzalloc(rvu->dev, +- cgx_cnt_max * MAX_LMAC_PER_CGX * sizeof(u16), +- GFP_KERNEL); ++ rvu->cgxlmac2pf_map = ++ devm_kzalloc(rvu->dev, ++ cgx_cnt_max * rvu->hw->lmac_per_cgx * sizeof(u64), ++ GFP_KERNEL); + if (!rvu->cgxlmac2pf_map) + return -ENOMEM; + +@@ -156,7 +159,7 @@ static int rvu_map_cgx_lmac_pf(struct rvu *rvu) + if (!rvu_cgx_pdata(cgx, rvu)) + continue; + lmac_bmap = cgx_get_lmac_bmap(rvu_cgx_pdata(cgx, rvu)); +- for_each_set_bit(iter, &lmac_bmap, MAX_LMAC_PER_CGX) { ++ for_each_set_bit(iter, &lmac_bmap, rvu->hw->lmac_per_cgx) { + lmac = cgx_get_lmacid(rvu_cgx_pdata(cgx, rvu), + iter); + rvu->pf2cgxlmac_map[pf] = cgxlmac_id_to_bmap(cgx, lmac); +@@ -235,7 +238,8 @@ static void cgx_notify_pfs(struct cgx_link_event *event, struct rvu *rvu) + pfmap = cgxlmac_to_pfmap(rvu, event->cgx_id, event->lmac_id); + + do { +- pfid = find_first_bit(&pfmap, 16); ++ pfid = find_first_bit(&pfmap, ++ rvu->cgx_cnt_max * rvu->hw->lmac_per_cgx); + clear_bit(pfid, &pfmap); + + /* check if notification is enabled */ +@@ -310,7 +314,7 @@ static int cgx_lmac_event_handler_init(struct rvu *rvu) + if (!cgxd) + continue; + lmac_bmap = cgx_get_lmac_bmap(cgxd); +- for_each_set_bit(lmac, &lmac_bmap, MAX_LMAC_PER_CGX) { ++ for_each_set_bit(lmac, &lmac_bmap, rvu->hw->lmac_per_cgx) { + err = cgx_lmac_evh_register(&cb, cgxd, lmac); + if (err) + dev_err(rvu->dev, +@@ -396,7 +400,7 @@ int rvu_cgx_exit(struct rvu *rvu) + if (!cgxd) + continue; + lmac_bmap = cgx_get_lmac_bmap(cgxd); +- for_each_set_bit(lmac, &lmac_bmap, MAX_LMAC_PER_CGX) ++ for_each_set_bit(lmac, &lmac_bmap, rvu->hw->lmac_per_cgx) + cgx_lmac_evh_unregister(cgxd, lmac); + } + +diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c +index 5c9dc3f9262f5..cc5d342e026c7 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c ++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c +@@ -2618,7 +2618,7 @@ static void rvu_dbg_cgx_init(struct rvu *rvu) + rvu->rvu_dbg.cgx = debugfs_create_dir(dname, + rvu->rvu_dbg.cgx_root); + +- for_each_set_bit(lmac_id, &lmac_bmap, MAX_LMAC_PER_CGX) { ++ for_each_set_bit(lmac_id, &lmac_bmap, rvu->hw->lmac_per_cgx) { + /* lmac debugfs dir */ + sprintf(dname, "lmac%d", lmac_id); + rvu->rvu_dbg.lmac = +diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +index 7f9581ce7f1fe..bb99302eab67a 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c ++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +@@ -4079,7 +4079,7 @@ static void nix_link_config(struct rvu *rvu, int blkaddr, + + /* Get LMAC id's from bitmap */ + lmac_bmap = cgx_get_lmac_bmap(rvu_cgx_pdata(cgx, rvu)); +- for_each_set_bit(iter, &lmac_bmap, MAX_LMAC_PER_CGX) { ++ for_each_set_bit(iter, &lmac_bmap, rvu->hw->lmac_per_cgx) { + lmac_fifo_len = rvu_cgx_get_lmac_fifolen(rvu, cgx, iter); + if (!lmac_fifo_len) { + dev_err(rvu->dev, +diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_hash.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_hash.c +index 34fa59575fa91..54e0dfdc9d984 
100644 +--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_hash.c ++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_hash.c +@@ -1999,7 +1999,9 @@ int rvu_npc_exact_init(struct rvu *rvu) + /* Install SDP drop rule */ + drop_mcam_idx = &table->num_drop_rules; + +- max_lmac_cnt = rvu->cgx_cnt_max * MAX_LMAC_PER_CGX + PF_CGXMAP_BASE; ++ max_lmac_cnt = rvu->cgx_cnt_max * rvu->hw->lmac_per_cgx + ++ PF_CGXMAP_BASE; ++ + for (i = PF_CGXMAP_BASE; i < max_lmac_cnt; i++) { + if (rvu->pf2cgxlmac_map[i] == 0xFF) + continue; +-- +2.43.0 + diff --git a/queue-6.1/r8169-fix-pci-error-on-system-resume.patch b/queue-6.1/r8169-fix-pci-error-on-system-resume.patch new file mode 100644 index 00000000000..c9305e56a43 --- /dev/null +++ b/queue-6.1/r8169-fix-pci-error-on-system-resume.patch @@ -0,0 +1,49 @@ +From 583a0fa5c6f48858b3592059eba607d74041c813 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Dec 2023 12:34:09 +0800 +Subject: r8169: Fix PCI error on system resume + +From: Kai-Heng Feng + +[ Upstream commit 9c476269bff2908a20930c58085bf0b05ebd569a ] + +Some r8168 NICs stop working upon system resume: + +[ 688.051096] r8169 0000:02:00.1 enp2s0f1: rtl_ep_ocp_read_cond == 0 (loop: 10, delay: 10000). +[ 688.175131] r8169 0000:02:00.1 enp2s0f1: Link is Down +... +[ 691.534611] r8169 0000:02:00.1 enp2s0f1: PCI error (cmd = 0x0407, status_errs = 0x0000) + +Not sure if it's related, but those NICs have a BMC device at function +0: +02:00.0 Unassigned class [ff00]: Realtek Semiconductor Co., Ltd. Realtek RealManage BMC [10ec:816e] (rev 1a) + +Trial and error shows that increase the loop wait on +rtl_ep_ocp_read_cond to 30 can eliminate the issue, so let +rtl8168ep_driver_start() to wait a bit longer. + +Fixes: e6d6ca6e1204 ("r8169: Add support for another RTL8168FP") +Signed-off-by: Kai-Heng Feng +Reviewed-by: Heiner Kallweit +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/realtek/r8169_main.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c +index d22457f2cf9cf..06663c11ca96d 100644 +--- a/drivers/net/ethernet/realtek/r8169_main.c ++++ b/drivers/net/ethernet/realtek/r8169_main.c +@@ -1145,7 +1145,7 @@ static void rtl8168ep_driver_start(struct rtl8169_private *tp) + { + r8168ep_ocp_write(tp, 0x01, 0x180, OOB_CMD_DRIVER_START); + r8168ep_ocp_write(tp, 0x01, 0x30, r8168ep_ocp_read(tp, 0x30) | 0x01); +- rtl_loop_wait_high(tp, &rtl_ep_ocp_read_cond, 10000, 10); ++ rtl_loop_wait_high(tp, &rtl_ep_ocp_read_cond, 10000, 30); + } + + static void rtl8168_driver_start(struct rtl8169_private *tp) +-- +2.43.0 + diff --git a/queue-6.1/ring-buffer-fix-32-bit-rb_time_read-race-with-rb_tim.patch b/queue-6.1/ring-buffer-fix-32-bit-rb_time_read-race-with-rb_tim.patch new file mode 100644 index 00000000000..60f68491c7b --- /dev/null +++ b/queue-6.1/ring-buffer-fix-32-bit-rb_time_read-race-with-rb_tim.patch @@ -0,0 +1,74 @@ +From 861ba5891da49cc7768295f98827c32ed0dcd73a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 12 Dec 2023 14:30:49 -0500 +Subject: ring-buffer: Fix 32-bit rb_time_read() race with rb_time_cmpxchg() + +From: Mathieu Desnoyers + +[ Upstream commit dec890089bf79a4954b61482715ee2d084364856 ] + +The following race can cause rb_time_read() to observe a corrupted time +stamp: + +rb_time_cmpxchg() +[...] 
+ if (!rb_time_read_cmpxchg(&t->msb, msb, msb2)) + return false; + if (!rb_time_read_cmpxchg(&t->top, top, top2)) + return false; + +__rb_time_read() +[...] + do { + c = local_read(&t->cnt); + top = local_read(&t->top); + bottom = local_read(&t->bottom); + msb = local_read(&t->msb); + } while (c != local_read(&t->cnt)); + + *cnt = rb_time_cnt(top); + + /* If top and msb counts don't match, this interrupted a write */ + if (*cnt != rb_time_cnt(msb)) + return false; + ^ this check fails to catch that "bottom" is still not updated. + +So the old "bottom" value is returned, which is wrong. + +Fix this by checking that all three of msb, top, and bottom 2-bit cnt +values match. + +The reason to favor checking all three fields over requiring a specific +update order for both rb_time_set() and rb_time_cmpxchg() is because +checking all three fields is more robust to handle partial failures of +rb_time_cmpxchg() when interrupted by nested rb_time_set(). + +Link: https://lore.kernel.org/lkml/20231211201324.652870-1-mathieu.desnoyers@efficios.com/ +Link: https://lore.kernel.org/linux-trace-kernel/20231212193049.680122-1-mathieu.desnoyers@efficios.com + +Fixes: f458a1453424e ("ring-buffer: Test last update in 32bit version of __rb_time_read()") +Signed-off-by: Mathieu Desnoyers +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Sasha Levin +--- + kernel/trace/ring_buffer.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c +index 06d52525407b8..71cad4f1323c6 100644 +--- a/kernel/trace/ring_buffer.c ++++ b/kernel/trace/ring_buffer.c +@@ -646,8 +646,8 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt) + + *cnt = rb_time_cnt(top); + +- /* If top and msb counts don't match, this interrupted a write */ +- if (*cnt != rb_time_cnt(msb)) ++ /* If top, msb or bottom counts don't match, this interrupted a write */ ++ if (*cnt != rb_time_cnt(msb) || *cnt != rb_time_cnt(bottom)) + return false; + + /* The shift to msb will lose its cnt bits */ +-- +2.43.0 + diff --git a/queue-6.1/s390-cpumf-support-user-space-events-for-counting.patch b/queue-6.1/s390-cpumf-support-user-space-events-for-counting.patch new file mode 100644 index 00000000000..b38d22dc75b --- /dev/null +++ b/queue-6.1/s390-cpumf-support-user-space-events-for-counting.patch @@ -0,0 +1,93 @@ +From 06bb501441103a4e4dd88b341771b77debd509e7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 23 Dec 2022 11:03:32 +0100 +Subject: s390/cpumf: support user space events for counting + +From: Thomas Richter + +[ Upstream commit 91d5364dc673fa9cf3a5b7b30cf33c70803eb3a4 ] + +CPU Measurement counting facility events PROBLEM_STATE_CPU_CYCLES(32) +and PROBLEM_STATE_INSTRUCTIONS(33) are valid events. However the device +driver returns error -EOPNOTSUPP when these event are to be installed. + +Fix this and allow installation of events PROBLEM_STATE_CPU_CYCLES, +PROBLEM_STATE_CPU_CYCLES:u, PROBLEM_STATE_INSTRUCTIONS and +PROBLEM_STATE_INSTRUCTIONS:u. +Kernel space counting only is still not supported by s390. 
+ +Signed-off-by: Thomas Richter +Acked-by: Sumanth Korikkar +Signed-off-by: Heiko Carstens +Stable-dep-of: 09cda0a40051 ("s390/mm: add missing arch_set_page_dat() call to vmem_crst_alloc()") +Signed-off-by: Sasha Levin +--- + arch/s390/kernel/perf_cpum_cf.c | 35 ++++++++++++++++++++++----------- + 1 file changed, 24 insertions(+), 11 deletions(-) + +diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c +index f043a7ff220b7..28fa80fd69fa0 100644 +--- a/arch/s390/kernel/perf_cpum_cf.c ++++ b/arch/s390/kernel/perf_cpum_cf.c +@@ -2,7 +2,7 @@ + /* + * Performance event support for s390x - CPU-measurement Counter Facility + * +- * Copyright IBM Corp. 2012, 2021 ++ * Copyright IBM Corp. 2012, 2022 + * Author(s): Hendrik Brueckner + * Thomas Richter + */ +@@ -434,6 +434,12 @@ static void cpumf_hw_inuse(void) + mutex_unlock(&pmc_reserve_mutex); + } + ++static int is_userspace_event(u64 ev) ++{ ++ return cpumf_generic_events_user[PERF_COUNT_HW_CPU_CYCLES] == ev || ++ cpumf_generic_events_user[PERF_COUNT_HW_INSTRUCTIONS] == ev; ++} ++ + static int __hw_perf_event_init(struct perf_event *event, unsigned int type) + { + struct perf_event_attr *attr = &event->attr; +@@ -456,19 +462,26 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type) + if (is_sampling_event(event)) /* No sampling support */ + return -ENOENT; + ev = attr->config; +- /* Count user space (problem-state) only */ + if (!attr->exclude_user && attr->exclude_kernel) { +- if (ev >= ARRAY_SIZE(cpumf_generic_events_user)) +- return -EOPNOTSUPP; +- ev = cpumf_generic_events_user[ev]; +- +- /* No support for kernel space counters only */ ++ /* ++ * Count user space (problem-state) only ++ * Handle events 32 and 33 as 0:u and 1:u ++ */ ++ if (!is_userspace_event(ev)) { ++ if (ev >= ARRAY_SIZE(cpumf_generic_events_user)) ++ return -EOPNOTSUPP; ++ ev = cpumf_generic_events_user[ev]; ++ } + } else if (!attr->exclude_kernel && attr->exclude_user) { ++ /* No support for kernel space counters only */ + return -EOPNOTSUPP; +- } else { /* Count user and kernel space */ +- if (ev >= ARRAY_SIZE(cpumf_generic_events_basic)) +- return -EOPNOTSUPP; +- ev = cpumf_generic_events_basic[ev]; ++ } else { ++ /* Count user and kernel space, incl. events 32 + 33 */ ++ if (!is_userspace_event(ev)) { ++ if (ev >= ARRAY_SIZE(cpumf_generic_events_basic)) ++ return -EOPNOTSUPP; ++ ev = cpumf_generic_events_basic[ev]; ++ } + } + break; + +-- +2.43.0 + diff --git a/queue-6.1/s390-mm-add-missing-arch_set_page_dat-call-to-vmem_c.patch b/queue-6.1/s390-mm-add-missing-arch_set_page_dat-call-to-vmem_c.patch new file mode 100644 index 00000000000..0b98ed273e5 --- /dev/null +++ b/queue-6.1/s390-mm-add-missing-arch_set_page_dat-call-to-vmem_c.patch @@ -0,0 +1,63 @@ +From 2c2e7b36450c06fc93a5c80dd815dfff68ba45f9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 17 Oct 2023 21:07:04 +0200 +Subject: s390/mm: add missing arch_set_page_dat() call to vmem_crst_alloc() + +From: Heiko Carstens + +[ Upstream commit 09cda0a400519b1541591c506e54c9c48e3101bf ] + +If the cmma no-dat feature is available all pages that are not used for +dynamic address translation are marked as "no-dat" with the ESSA +instruction. This information is visible to the hypervisor, so that the +hypervisor can optimize purging of guest TLB entries. This also means that +pages which are used for dynamic address translation must not be marked as +"no-dat", since the hypervisor may then incorrectly not purge guest TLB +entries. 
+ +Region and segment tables allocated via vmem_crst_alloc() are incorrectly +marked as "no-dat", as soon as slab_is_available() returns true. + +Such tables are allocated e.g. when kernel page tables are split, memory is +hotplugged, or a DCSS segment is loaded. + +Fix this by adding the missing arch_set_page_dat() call. + +Cc: +Reviewed-by: Claudio Imbrenda +Signed-off-by: Heiko Carstens +Signed-off-by: Vasily Gorbik +Signed-off-by: Sasha Levin +--- + arch/s390/mm/vmem.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c +index 9a0ce5315f36d..3cbb461820666 100644 +--- a/arch/s390/mm/vmem.c ++++ b/arch/s390/mm/vmem.c +@@ -11,6 +11,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -44,8 +45,11 @@ void *vmem_crst_alloc(unsigned long val) + unsigned long *table; + + table = vmem_alloc_pages(CRST_ALLOC_ORDER); +- if (table) +- crst_table_init(table, val); ++ if (!table) ++ return NULL; ++ crst_table_init(table, val); ++ if (slab_is_available()) ++ arch_set_page_dat(virt_to_page(table), CRST_ALLOC_ORDER); + return table; + } + +-- +2.43.0 + diff --git a/queue-6.1/selftests-bonding-do-not-set-port-down-when-adding-t.patch b/queue-6.1/selftests-bonding-do-not-set-port-down-when-adding-t.patch new file mode 100644 index 00000000000..8f553906244 --- /dev/null +++ b/queue-6.1/selftests-bonding-do-not-set-port-down-when-adding-t.patch @@ -0,0 +1,53 @@ +From d7b27f0f9c2a4de4109f6bda4aae6edb2b4cc9b7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 23 Dec 2023 20:59:22 +0800 +Subject: selftests: bonding: do not set port down when adding to bond + +From: Hangbin Liu + +[ Upstream commit 61fa2493ca76fd7bb74e13f0205274f4ab0aa696 ] + +Similar to commit be809424659c ("selftests: bonding: do not set port down +before adding to bond"). The bond-arp-interval-causes-panic test failed +after commit a4abfa627c38 ("net: rtnetlink: Enslave device before bringing +it up") as the kernel will set the port down _after_ adding to bond if setting +port down specifically. + +Fix it by removing the link down operation when adding to bond. + +Fixes: 2ffd57327ff1 ("selftests: bonding: cause oops in bond_rr_gen_slave_id") +Signed-off-by: Hangbin Liu +Tested-by: Benjamin Poirier +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + .../drivers/net/bonding/bond-arp-interval-causes-panic.sh | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/tools/testing/selftests/drivers/net/bonding/bond-arp-interval-causes-panic.sh b/tools/testing/selftests/drivers/net/bonding/bond-arp-interval-causes-panic.sh +index 71c00bfafbc99..2ff58fed76e28 100755 +--- a/tools/testing/selftests/drivers/net/bonding/bond-arp-interval-causes-panic.sh ++++ b/tools/testing/selftests/drivers/net/bonding/bond-arp-interval-causes-panic.sh +@@ -33,16 +33,16 @@ ip netns add "client" + ip link set dev link1_1 netns client down name eth0 + ip netns exec client ip link add dev bond0 down type bond mode 1 \ + miimon 100 all_slaves_active 1 +-ip netns exec client ip link set dev eth0 down master bond0 ++ip netns exec client ip link set dev eth0 master bond0 + ip netns exec client ip link set dev bond0 up + ip netns exec client ip addr add ${client_ip4}/24 dev bond0 + ip netns exec client ping -c 5 $server_ip4 >/dev/null + +-ip netns exec client ip link set dev eth0 down nomaster ++ip netns exec client ip link set dev eth0 nomaster + ip netns exec client ip link set dev bond0 down + ip netns exec client ip link set dev bond0 type bond mode 0 \ + arp_interval 1000 arp_ip_target "+${server_ip4}" +-ip netns exec client ip link set dev eth0 down master bond0 ++ip netns exec client ip link set dev eth0 master bond0 + ip netns exec client ip link set dev bond0 up + ip netns exec client ping -c 5 $server_ip4 >/dev/null + +-- +2.43.0 + diff --git a/queue-6.1/selftests-mptcp-fix-fastclose-with-csum-failure.patch b/queue-6.1/selftests-mptcp-fix-fastclose-with-csum-failure.patch new file mode 100644 index 00000000000..d7e4aeba017 --- /dev/null +++ b/queue-6.1/selftests-mptcp-fix-fastclose-with-csum-failure.patch @@ -0,0 +1,58 @@ +From 329325f50f83617625c97e5ba5e4dcfd74ed1e79 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 14 Nov 2023 00:16:17 +0100 +Subject: selftests: mptcp: fix fastclose with csum failure + +From: Paolo Abeni + +[ Upstream commit 7cefbe5e1dacc7236caa77e9d072423f21422fe2 ] + +Running the mp_join selftest manually with the following command line: + + ./mptcp_join.sh -z -C + +leads to some failures: + + 002 fastclose server test + # ... + rtx [fail] got 1 MP_RST[s] TX expected 0 + # ... + rstrx [fail] got 1 MP_RST[s] RX expected 0 + +The problem is really in the wrong expectations for the RST checks +implied by the csum validation. Note that the same check is repeated +explicitly in the same test-case, with the correct expectation and +pass successfully. + +Address the issue explicitly setting the correct expectation for +the failing checks. 
+ +Reported-by: Xiumei Mu +Fixes: 6bf41020b72b ("selftests: mptcp: update and extend fastclose test-cases") +Cc: stable@vger.kernel.org +Signed-off-by: Paolo Abeni +Reviewed-by: Matthieu Baerts +Signed-off-by: Matthieu Baerts +Link: https://lore.kernel.org/r/20231114-upstream-net-20231113-mptcp-misc-fixes-6-7-rc2-v1-5-7b9cd6a7b7f4@kernel.org +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + tools/testing/selftests/net/mptcp/mptcp_join.sh | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh +index e52d513009fb0..9d8dde3b5c332 100755 +--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh ++++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh +@@ -3041,7 +3041,7 @@ fastclose_tests() + + if reset_check_counter "fastclose server test" "MPTcpExtMPFastcloseRx"; then + run_tests $ns1 $ns2 10.0.1.1 1024 0 fastclose_server +- chk_join_nr 0 0 0 ++ chk_join_nr 0 0 0 0 0 0 1 + chk_fclose_nr 1 1 invert + chk_rst_nr 1 1 + fi +-- +2.43.0 + diff --git a/queue-6.1/selftests-mptcp-set-failing_links-in-run_tests.patch b/queue-6.1/selftests-mptcp-set-failing_links-in-run_tests.patch new file mode 100644 index 00000000000..65556f63a58 --- /dev/null +++ b/queue-6.1/selftests-mptcp-set-failing_links-in-run_tests.patch @@ -0,0 +1,64 @@ +From 6465bc887b9ddfd7a4e8a118a4c52b4bf285ea3f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 23 Jun 2023 10:34:09 -0700 +Subject: selftests: mptcp: set FAILING_LINKS in run_tests + +From: Geliang Tang + +[ Upstream commit be7e9786c9155c2942cd53b813e4723be67e07c4 ] + +Set FAILING_LINKS as an env var with a limited scope only when calling +run_tests(). + +Reviewed-by: Matthieu Baerts +Signed-off-by: Geliang Tang +Signed-off-by: Mat Martineau +Link: https://lore.kernel.org/r/20230623-send-net-next-20230623-v1-3-a883213c8ba9@kernel.org +Signed-off-by: Jakub Kicinski +Stable-dep-of: 7cefbe5e1dac ("selftests: mptcp: fix fastclose with csum failure") +Signed-off-by: Sasha Levin +--- + tools/testing/selftests/net/mptcp/mptcp_join.sh | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh +index 9d8dde3b5c332..2107579e2939d 100755 +--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh ++++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh +@@ -2167,9 +2167,9 @@ link_failure_tests() + pm_nl_set_limits $ns1 0 2 + pm_nl_add_endpoint $ns1 10.0.2.1 dev ns1eth2 flags signal + pm_nl_set_limits $ns2 1 2 +- FAILING_LINKS="1" + pm_nl_add_endpoint $ns2 10.0.3.2 dev ns2eth3 flags subflow,backup +- run_tests $ns1 $ns2 10.0.1.1 1 ++ FAILING_LINKS="1" \ ++ run_tests $ns1 $ns2 10.0.1.1 1 + chk_join_nr 2 2 2 + chk_add_nr 1 1 + chk_link_usage $ns2 ns2eth3 $cinsent 0 +@@ -2183,8 +2183,8 @@ link_failure_tests() + pm_nl_add_endpoint $ns1 10.0.2.1 dev ns1eth2 flags signal + pm_nl_set_limits $ns2 1 2 + pm_nl_add_endpoint $ns2 10.0.3.2 dev ns2eth3 flags subflow,backup +- FAILING_LINKS="1 2" +- run_tests $ns1 $ns2 10.0.1.1 1 ++ FAILING_LINKS="1 2" \ ++ run_tests $ns1 $ns2 10.0.1.1 1 + chk_join_nr 2 2 2 + chk_add_nr 1 1 + chk_stale_nr $ns2 2 4 2 +@@ -2199,8 +2199,8 @@ link_failure_tests() + pm_nl_add_endpoint $ns1 10.0.2.1 dev ns1eth2 flags signal + pm_nl_set_limits $ns2 1 3 + pm_nl_add_endpoint $ns2 10.0.3.2 dev ns2eth3 flags subflow,backup +- FAILING_LINKS="1 2" +- run_tests $ns1 $ns2 10.0.1.1 2 ++ FAILING_LINKS="1 2" \ ++ run_tests $ns1 $ns2 10.0.1.1 
2 + chk_join_nr 2 2 2 + chk_add_nr 1 1 + chk_stale_nr $ns2 1 -1 2 +-- +2.43.0 + diff --git a/queue-6.1/selftests-secretmem-floor-the-memory-size-to-the-mul.patch b/queue-6.1/selftests-secretmem-floor-the-memory-size-to-the-mul.patch new file mode 100644 index 00000000000..b1723b8a3de --- /dev/null +++ b/queue-6.1/selftests-secretmem-floor-the-memory-size-to-the-mul.patch @@ -0,0 +1,56 @@ +From 966552f614ba6fbff3836f33c83f31ff6ef93760 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 14 Dec 2023 15:19:30 +0500 +Subject: selftests: secretmem: floor the memory size to the multiple of + page_size + +From: Muhammad Usama Anjum + +[ Upstream commit 0aac13add26d546ac74c89d2883b3a5f0fbea039 ] + +The "locked-in-memory size" limit per process can be non-multiple of +page_size. The mmap() fails if we try to allocate locked-in-memory with +same size as the allowed limit if it isn't multiple of the page_size +because mmap() rounds off the memory size to be allocated to next multiple +of page_size. + +Fix this by flooring the length to be allocated with mmap() to the +previous multiple of the page_size. + +This was getting triggered on KernelCI regularly because of different +ulimit settings which wasn't multiple of the page_size. Find logs +here: https://linux.kernelci.org/test/plan/id/657654bd8e81e654fae13532/ +The bug in was present from the time test was first added. + +Link: https://lkml.kernel.org/r/20231214101931.1155586-1-usama.anjum@collabora.com +Fixes: 76fe17ef588a ("secretmem: test: add basic selftest for memfd_secret(2)") +Signed-off-by: Muhammad Usama Anjum +Reported-by: "kernelci.org bot" +Closes: https://linux.kernelci.org/test/plan/id/657654bd8e81e654fae13532/ +Cc: "James E.J. Bottomley" +Cc: Mike Rapoport (IBM) +Cc: Shuah Khan +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Sasha Levin +--- + tools/testing/selftests/vm/memfd_secret.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/tools/testing/selftests/vm/memfd_secret.c b/tools/testing/selftests/vm/memfd_secret.c +index 957b9e18c7295..9b298f6a04b37 100644 +--- a/tools/testing/selftests/vm/memfd_secret.c ++++ b/tools/testing/selftests/vm/memfd_secret.c +@@ -62,6 +62,9 @@ static void test_mlock_limit(int fd) + char *mem; + + len = mlock_limit_cur; ++ if (len % page_size != 0) ++ len = (len/page_size) * page_size; ++ + mem = mmap(NULL, len, prot, mode, fd, 0); + if (mem == MAP_FAILED) { + fail("unable to mmap secret memory\n"); +-- +2.43.0 + diff --git a/queue-6.1/series b/queue-6.1/series index c071d30dc1d..10889fb5446 100644 --- a/queue-6.1/series +++ b/queue-6.1/series @@ -1,2 +1,122 @@ keys-dns-fix-missing-size-check-of-v1-server-list-header.patch block-don-t-invalidate-pagecache-for-invalid-falloc-modes.patch +wifi-iwlwifi-pcie-don-t-synchronize-irqs-from-irq.patch +drm-bridge-ti-sn65dsi86-never-store-more-than-msg-si.patch +netfilter-use-skb_ip_totlen-and-iph_totlen.patch +netfilter-nf_tables-set-transport-offset-from-mac-he.patch +nfc-llcp_core-hold-a-ref-to-llcp_local-dev-when-hold.patch +octeontx2-af-fix-marking-couple-of-structure-as-__pa.patch +drm-i915-dp-fix-passing-the-correct-dpcd_rev-for-drm.patch +ice-fix-link_down_on_close-message.patch +ice-shut-down-vsi-with-link-down-on-close-enabled.patch +i40e-fix-filter-input-checks-to-prevent-config-with-.patch +igc-report-vlan-ethertype-matching-back-to-user.patch +igc-check-vlan-tci-mask.patch +igc-check-vlan-ethertype-mask.patch +asoc-fsl_rpmsg-fix-error-handler-with-pm_runtime_ena.patch +asoc-mediatek-mt8186-fix-aud_pad_top-register-and-of.patch 
+mlxbf_gige-fix-receive-packet-race-condition.patch +net-sched-em_text-fix-possible-memory-leak-in-em_tex.patch +r8169-fix-pci-error-on-system-resume.patch +can-raw-add-support-for-so_mark.patch +net-timestamp-extend-sof_timestamping_opt_id-to-hw-t.patch +net-annotate-data-races-around-sk-sk_tsflags.patch +net-annotate-data-races-around-sk-sk_bind_phc.patch +net-implement-missing-getsockopt-so_timestamping_new.patch +selftests-bonding-do-not-set-port-down-when-adding-t.patch +arm-sun9i-smp-fix-array-index-out-of-bounds-read-in-.patch +sfc-fix-a-double-free-bug-in-efx_probe_filters.patch +net-bcmgenet-fix-fcs-generation-for-fragmented-skbuf.patch +netfilter-nft_immediate-drop-chain-reference-counter.patch +net-save-and-restore-msg_namelen-in-sock_sendmsg.patch +i40e-fix-use-after-free-in-i40e_aqc_add_filters.patch +asoc-meson-g12a-toacodec-validate-written-enum-value.patch +asoc-meson-g12a-tohdmitx-validate-written-enum-value.patch +asoc-meson-g12a-toacodec-fix-event-generation.patch +asoc-meson-g12a-tohdmitx-fix-event-generation-for-s-.patch +i40e-restore-vf-msi-x-state-during-pci-reset.patch +igc-fix-hicredit-calculation.patch +net-qla3xxx-fix-potential-memleak-in-ql_alloc_buffer.patch +net-smc-fix-invalid-link-access-in-dumping-smc-r-con.patch +octeontx2-af-always-configure-nix-tx-link-credits-ba.patch +octeontx2-af-re-enable-mac-tx-in-otx2_stop-processin.patch +asix-add-check-for-usbnet_get_endpoints.patch +net-ravb-wait-for-operating-mode-to-be-applied.patch +bnxt_en-remove-mis-applied-code-from-bnxt_cfg_ntp_fi.patch +net-implement-missing-so_timestamping_new-cmsg-suppo.patch +selftests-secretmem-floor-the-memory-size-to-the-mul.patch +cpu-smt-create-topology_smt_thread_allowed.patch +cpu-smt-make-smt-control-more-robust-against-enumera.patch +srcu-fix-callbacks-acceleration-mishandling.patch +bpf-x64-fix-tailcall-infinite-loop.patch +bpf-x86-simplify-the-parsing-logic-of-structure-para.patch +bpf-x86-save-restore-regs-with-bpf_dw-size.patch +net-declare-msg_splice_pages-internal-sendmsg-flag.patch +udp-convert-udp_sendpage-to-use-msg_splice_pages.patch +splice-net-add-a-splice_eof-op-to-file-ops-and-socke.patch +ipv4-ipv6-use-splice_eof-to-flush.patch +udp-introduce-udp-udp_flags.patch +udp-move-udp-no_check6_tx-to-udp-udp_flags.patch +udp-move-udp-no_check6_rx-to-udp-udp_flags.patch +udp-move-udp-gro_enabled-to-udp-udp_flags.patch +udp-move-udp-accept_udp_-l4-fraglist-to-udp-udp_flag.patch +udp-lockless-udp_encap_l2tpinudp-udp_gro.patch +udp-annotate-data-races-around-udp-encap_type.patch +wifi-iwlwifi-yoyo-swap-cdb-and-jacket-bits-values.patch +arm64-dts-qcom-sdm845-align-rpmh-regulator-nodes-wit.patch +arm64-dts-qcom-sdm845-fix-psci-power-domain-names.patch +fbdev-imsttfb-release-framebuffer-and-dealloc-cmap-o.patch +fbdev-imsttfb-fix-double-free-in-probe.patch +bpf-decouple-prune-and-jump-points.patch +bpf-remove-unnecessary-prune-and-jump-points.patch +bpf-remove-unused-insn_cnt-argument-from-visit_-func.patch +bpf-clean-up-visit_insn-s-instruction-processing.patch +bpf-support-new-32bit-offset-jmp-instruction.patch +bpf-handle-ldimm64-properly-in-check_cfg.patch +bpf-fix-precision-backtracking-instruction-iteration.patch +blk-mq-make-sure-active-queue-usage-is-held-for-bio_.patch +net-mlx5-increase-size-of-irq-name-buffer.patch +s390-mm-add-missing-arch_set_page_dat-call-to-vmem_c.patch +s390-cpumf-support-user-space-events-for-counting.patch +f2fs-clean-up-i_compress_flag-and-i_compress_level-u.patch +f2fs-convert-to-use-bitmap-api.patch 
+f2fs-assign-default-compression-level.patch +f2fs-set-the-default-compress_level-on-ioctl.patch +selftests-mptcp-fix-fastclose-with-csum-failure.patch +selftests-mptcp-set-failing_links-in-run_tests.patch +media-camss-sm8250-virtual-channels-for-csid.patch +media-qcom-camss-fix-set-csi2_rx_cfg1_vc_mode-when-v.patch +ext4-convert-move_extent_per_page-to-use-folios.patch +khugepage-replace-try_to_release_page-with-filemap_r.patch +memory-failure-convert-truncate_error_page-to-use-fo.patch +mm-merge-folio_has_private-filemap_release_folio-cal.patch +mm-netfs-fscache-stop-read-optimisation-when-folio-r.patch +filemap-add-a-per-mapping-stable-writes-flag.patch +block-update-the-stable_writes-flag-in-bdev_add.patch +smb-client-fix-missing-mode-bits-for-smb-symlinks.patch +net-dpaa2-eth-rearrange-variable-in-dpaa2_eth_get_et.patch +dpaa2-eth-recycle-the-rx-buffer-only-after-all-proce.patch +ethtool-don-t-propagate-eopnotsupp-from-dumps.patch +bpf-sockmap-af_unix-stream-sockets-need-to-hold-ref-.patch +firmware-arm_scmi-fix-frequency-truncation-by-promot.patch +alsa-hda-realtek-add-quirk-for-lenovo-yoga-pro-7.patch +genirq-affinity-remove-the-firstvec-parameter-from-i.patch +genirq-affinity-pass-affinity-managed-mask-array-to-.patch +genirq-affinity-don-t-pass-irq_affinity_desc-array-t.patch +genirq-affinity-rename-irq_build_affinity_masks-as-g.patch +genirq-affinity-move-group_cpus_evenly-into-lib.patch +lib-group_cpus.c-avoid-acquiring-cpu-hotplug-lock-in.patch +mm-memory_hotplug-add-missing-mem_hotplug_lock.patch +mm-memory_hotplug-fix-error-handling-in-add_memory_r.patch +net-sched-call-tcf_ct_params_free-to-free-params-in-.patch +netfilter-flowtable-allow-unidirectional-rules.patch +netfilter-flowtable-cache-info-of-last-offload.patch +net-sched-act_ct-offload-udp-new-connections.patch +net-sched-act_ct-fix-promotion-of-offloaded-unreplie.patch +netfilter-flowtable-gc-pushes-back-packets-to-classi.patch +net-sched-act_ct-take-per-cb-reference-to-tcf_ct_flo.patch +octeontx2-af-fix-pause-frame-configuration.patch +octeontx2-af-support-variable-number-of-lmacs.patch +btrfs-fix-qgroup_free_reserved_data-int-overflow.patch +btrfs-mark-the-len-field-in-struct-btrfs_ordered_sum.patch +ring-buffer-fix-32-bit-rb_time_read-race-with-rb_tim.patch diff --git a/queue-6.1/sfc-fix-a-double-free-bug-in-efx_probe_filters.patch b/queue-6.1/sfc-fix-a-double-free-bug-in-efx_probe_filters.patch new file mode 100644 index 00000000000..296cc2357d6 --- /dev/null +++ b/queue-6.1/sfc-fix-a-double-free-bug-in-efx_probe_filters.patch @@ -0,0 +1,51 @@ +From 8c1095d26a73f2833e1f8e43056a8371a254b8f5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 25 Dec 2023 19:29:14 +0800 +Subject: sfc: fix a double-free bug in efx_probe_filters + +From: Zhipeng Lu + +[ Upstream commit d5a306aedba34e640b11d7026dbbafb78ee3a5f6 ] + +In efx_probe_filters, the channel->rps_flow_id is freed in a +efx_for_each_channel marco when success equals to 0. +However, after the following call chain: + +ef100_net_open + |-> efx_probe_filters + |-> ef100_net_stop + |-> efx_remove_filters + +The channel->rps_flow_id is freed again in the efx_for_each_channel of +efx_remove_filters, triggering a double-free bug. 
+ +Fixes: a9dc3d5612ce ("sfc_ef100: RX filter table management and related gubbins") +Reviewed-by: Simon Horman +Reviewed-by: Edward Cree +Signed-off-by: Zhipeng Lu +Link: https://lore.kernel.org/r/20231225112915.3544581-1-alexious@zju.edu.cn +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/sfc/rx_common.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/sfc/rx_common.c b/drivers/net/ethernet/sfc/rx_common.c +index 9220afeddee81..3f290791df1c4 100644 +--- a/drivers/net/ethernet/sfc/rx_common.c ++++ b/drivers/net/ethernet/sfc/rx_common.c +@@ -820,8 +820,10 @@ int efx_probe_filters(struct efx_nic *efx) + } + + if (!success) { +- efx_for_each_channel(channel, efx) ++ efx_for_each_channel(channel, efx) { + kfree(channel->rps_flow_id); ++ channel->rps_flow_id = NULL; ++ } + efx->type->filter_table_remove(efx); + rc = -ENOMEM; + goto out_unlock; +-- +2.43.0 + diff --git a/queue-6.1/smb-client-fix-missing-mode-bits-for-smb-symlinks.patch b/queue-6.1/smb-client-fix-missing-mode-bits-for-smb-symlinks.patch new file mode 100644 index 00000000000..68b97b5f766 --- /dev/null +++ b/queue-6.1/smb-client-fix-missing-mode-bits-for-smb-symlinks.patch @@ -0,0 +1,36 @@ +From 9ab12c755c1e01616c2dcfea64cf55fb2c382d0b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 25 Nov 2023 23:55:10 -0300 +Subject: smb: client: fix missing mode bits for SMB symlinks + +From: Paulo Alcantara + +[ Upstream commit ef22bb800d967616c7638d204bc1b425beac7f5f ] + +When instantiating inodes for SMB symlinks, add the mode bits from +@cifs_sb->ctx->file_mode as we already do for the other special files. + +Cc: stable@vger.kernel.org +Signed-off-by: Paulo Alcantara (SUSE) +Signed-off-by: Steve French +Signed-off-by: Sasha Levin +--- + fs/smb/client/inode.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c +index 7be51f9d2fa18..5343898bac8a6 100644 +--- a/fs/smb/client/inode.c ++++ b/fs/smb/client/inode.c +@@ -264,7 +264,7 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info, + fattr->cf_dtype = DT_REG; + break; + case UNIX_SYMLINK: +- fattr->cf_mode |= S_IFLNK; ++ fattr->cf_mode |= S_IFLNK | cifs_sb->ctx->file_mode; + fattr->cf_dtype = DT_LNK; + break; + case UNIX_DIR: +-- +2.43.0 + diff --git a/queue-6.1/splice-net-add-a-splice_eof-op-to-file-ops-and-socke.patch b/queue-6.1/splice-net-add-a-splice_eof-op-to-file-ops-and-socke.patch new file mode 100644 index 00000000000..1ec5fdd9b6c --- /dev/null +++ b/queue-6.1/splice-net-add-a-splice_eof-op-to-file-ops-and-socke.patch @@ -0,0 +1,212 @@ +From 4539a0bb3af7906c4281c52b7c7e1f6ccdebe5e6 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 7 Jun 2023 19:19:10 +0100 +Subject: splice, net: Add a splice_eof op to file-ops and socket-ops + +From: David Howells + +[ Upstream commit 2bfc66850952b6921b2033b09729ec59eabbc81d ] + +Add an optional method, ->splice_eof(), to allow splice to indicate the +premature termination of a splice to struct file_operations and struct +proto_ops. + +This is called if sendfile() or splice() encounters all of the following +conditions inside splice_direct_to_actor(): + + (1) the user did not set SPLICE_F_MORE (splice only), and + + (2) an EOF condition occurred (->splice_read() returned 0), and + + (3) we haven't read enough to fulfill the request (ie. len > 0 still), and + + (4) we have already spliced at least one byte. 
+ +A further patch will modify the behaviour of SPLICE_F_MORE to always be +passed to the actor if either the user set it or we haven't yet read +sufficient data to fulfill the request. + +Suggested-by: Linus Torvalds +Link: https://lore.kernel.org/r/CAHk-=wh=V579PDYvkpnTobCLGczbgxpMgGmmhqiTyE34Cpi5Gg@mail.gmail.com/ +Signed-off-by: David Howells +Reviewed-by: Jakub Kicinski +cc: Jens Axboe +cc: Christoph Hellwig +cc: Al Viro +cc: Matthew Wilcox +cc: Jan Kara +cc: Jeff Layton +cc: David Hildenbrand +cc: Christian Brauner +cc: Chuck Lever +cc: Boris Pismenny +cc: John Fastabend +cc: linux-mm@kvack.org +Signed-off-by: Jakub Kicinski +Stable-dep-of: a0002127cd74 ("udp: move udp->no_check6_tx to udp->udp_flags") +Signed-off-by: Sasha Levin +--- + fs/splice.c | 31 ++++++++++++++++++++++++++++++- + include/linux/fs.h | 1 + + include/linux/net.h | 1 + + include/linux/splice.h | 1 + + include/net/sock.h | 1 + + net/socket.c | 10 ++++++++++ + 6 files changed, 44 insertions(+), 1 deletion(-) + +diff --git a/fs/splice.c b/fs/splice.c +index 5969b7a1d353a..c4ae54deac42c 100644 +--- a/fs/splice.c ++++ b/fs/splice.c +@@ -764,6 +764,17 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, + return out->f_op->splice_write(pipe, out, ppos, len, flags); + } + ++/* ++ * Indicate to the caller that there was a premature EOF when reading from the ++ * source and the caller didn't indicate they would be sending more data after ++ * this. ++ */ ++static void do_splice_eof(struct splice_desc *sd) ++{ ++ if (sd->splice_eof) ++ sd->splice_eof(sd); ++} ++ + /* + * Attempt to initiate a splice from a file to a pipe. + */ +@@ -864,7 +875,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, + + ret = do_splice_to(in, &pos, pipe, len, flags); + if (unlikely(ret <= 0)) +- goto out_release; ++ goto read_failure; + + read_len = ret; + sd->total_len = read_len; +@@ -904,6 +915,15 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, + file_accessed(in); + return bytes; + ++read_failure: ++ /* ++ * If the user did *not* set SPLICE_F_MORE *and* we didn't hit that ++ * "use all of len" case that cleared SPLICE_F_MORE, *and* we did a ++ * "->splice_in()" that returned EOF (ie zero) *and* we have sent at ++ * least 1 byte *then* we will also do the ->splice_eof() call. 
++ */ ++ if (ret == 0 && !more && len > 0 && bytes) ++ do_splice_eof(sd); + out_release: + /* + * If we did an incomplete transfer we must release +@@ -932,6 +952,14 @@ static int direct_splice_actor(struct pipe_inode_info *pipe, + sd->flags); + } + ++static void direct_file_splice_eof(struct splice_desc *sd) ++{ ++ struct file *file = sd->u.file; ++ ++ if (file->f_op->splice_eof) ++ file->f_op->splice_eof(file); ++} ++ + /** + * do_splice_direct - splices data directly between two files + * @in: file to splice from +@@ -957,6 +985,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + .flags = flags, + .pos = *ppos, + .u.file = out, ++ .splice_eof = direct_file_splice_eof, + .opos = opos, + }; + long ret; +diff --git a/include/linux/fs.h b/include/linux/fs.h +index b6af6abc7a77f..4a1911dcf834b 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -2177,6 +2177,7 @@ struct file_operations { + int (*flock) (struct file *, int, struct file_lock *); + ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); + ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); ++ void (*splice_eof)(struct file *file); + int (*setlease)(struct file *, long, struct file_lock **, void **); + long (*fallocate)(struct file *file, int mode, loff_t offset, + loff_t len); +diff --git a/include/linux/net.h b/include/linux/net.h +index 18d942bbdf6e0..25baca60f6cba 100644 +--- a/include/linux/net.h ++++ b/include/linux/net.h +@@ -209,6 +209,7 @@ struct proto_ops { + int offset, size_t size, int flags); + ssize_t (*splice_read)(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, unsigned int flags); ++ void (*splice_eof)(struct socket *sock); + int (*set_peek_off)(struct sock *sk, int val); + int (*peek_len)(struct socket *sock); + +diff --git a/include/linux/splice.h b/include/linux/splice.h +index a55179fd60fc3..41a70687be853 100644 +--- a/include/linux/splice.h ++++ b/include/linux/splice.h +@@ -38,6 +38,7 @@ struct splice_desc { + struct file *file; /* file to read/write */ + void *data; /* cookie */ + } u; ++ void (*splice_eof)(struct splice_desc *sd); /* Unexpected EOF handler */ + loff_t pos; /* file position */ + loff_t *opos; /* sendfile: output position */ + size_t num_spliced; /* number of bytes already spliced */ +diff --git a/include/net/sock.h b/include/net/sock.h +index d8ed62a8e1a3e..9de9f070537cc 100644 +--- a/include/net/sock.h ++++ b/include/net/sock.h +@@ -1279,6 +1279,7 @@ struct proto { + size_t len, int flags, int *addr_len); + int (*sendpage)(struct sock *sk, struct page *page, + int offset, size_t size, int flags); ++ void (*splice_eof)(struct socket *sock); + int (*bind)(struct sock *sk, + struct sockaddr *addr, int addr_len); + int (*bind_add)(struct sock *sk, +diff --git a/net/socket.c b/net/socket.c +index 6f39f7b0cc85c..639d76f20384e 100644 +--- a/net/socket.c ++++ b/net/socket.c +@@ -130,6 +130,7 @@ static ssize_t sock_sendpage(struct file *file, struct page *page, + static ssize_t sock_splice_read(struct file *file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags); ++static void sock_splice_eof(struct file *file); + + #ifdef CONFIG_PROC_FS + static void sock_show_fdinfo(struct seq_file *m, struct file *f) +@@ -164,6 +165,7 @@ static const struct file_operations socket_file_ops = { + .sendpage = sock_sendpage, + .splice_write = generic_splice_sendpage, + .splice_read = sock_splice_read, ++ .splice_eof = 
sock_splice_eof, + .show_fdinfo = sock_show_fdinfo, + }; + +@@ -1091,6 +1093,14 @@ static ssize_t sock_splice_read(struct file *file, loff_t *ppos, + return sock->ops->splice_read(sock, ppos, pipe, len, flags); + } + ++static void sock_splice_eof(struct file *file) ++{ ++ struct socket *sock = file->private_data; ++ ++ if (sock->ops->splice_eof) ++ sock->ops->splice_eof(sock); ++} ++ + static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to) + { + struct file *file = iocb->ki_filp; +-- +2.43.0 + diff --git a/queue-6.1/srcu-fix-callbacks-acceleration-mishandling.patch b/queue-6.1/srcu-fix-callbacks-acceleration-mishandling.patch new file mode 100644 index 00000000000..8223bedb3a5 --- /dev/null +++ b/queue-6.1/srcu-fix-callbacks-acceleration-mishandling.patch @@ -0,0 +1,157 @@ +From 87882bb82acf16fc4e9d159032c1e6e7a25a3f87 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 4 Oct 2023 01:28:59 +0200 +Subject: srcu: Fix callbacks acceleration mishandling + +From: Frederic Weisbecker + +[ Upstream commit 4a8e65b0c348e42107c64381e692e282900be361 ] + +SRCU callbacks acceleration might fail if the preceding callbacks +advance also fails. This can happen when the following steps are met: + +1) The RCU_WAIT_TAIL segment has callbacks (say for gp_num 8) and the + RCU_NEXT_READY_TAIL also has callbacks (say for gp_num 12). + +2) The grace period for RCU_WAIT_TAIL is observed as started but not yet + completed so rcu_seq_current() returns 4 + SRCU_STATE_SCAN1 = 5. + +3) This value is passed to rcu_segcblist_advance() which can't move + any segment forward and fails. + +4) srcu_gp_start_if_needed() still proceeds with callback acceleration. + But then the call to rcu_seq_snap() observes the grace period for the + RCU_WAIT_TAIL segment (gp_num 8) as completed and the subsequent one + for the RCU_NEXT_READY_TAIL segment as started + (ie: 8 + SRCU_STATE_SCAN1 = 9) so it returns a snapshot of the + next grace period, which is 16. + +5) The value of 16 is passed to rcu_segcblist_accelerate() but the + freshly enqueued callback in RCU_NEXT_TAIL can't move to + RCU_NEXT_READY_TAIL which already has callbacks for a previous grace + period (gp_num = 12). So acceleration fails. + +6) Note in all these steps, srcu_invoke_callbacks() hadn't had a chance + to run srcu_invoke_callbacks(). + +Then some very bad outcome may happen if the following happens: + +7) Some other CPU races and starts the grace period number 16 before the + CPU handling previous steps had a chance. Therefore srcu_gp_start() + isn't called on the latter sdp to fix the acceleration leak from + previous steps with a new pair of call to advance/accelerate. + +8) The grace period 16 completes and srcu_invoke_callbacks() is finally + called. All the callbacks from previous grace periods (8 and 12) are + correctly advanced and executed but callbacks in RCU_NEXT_READY_TAIL + still remain. Then rcu_segcblist_accelerate() is called with a + snaphot of 20. + +9) Since nothing started the grace period number 20, callbacks stay + unhandled. + +This has been reported in real load: + + [3144162.608392] INFO: task kworker/136:12:252684 blocked for more + than 122 seconds. + [3144162.615986] Tainted: G O K 5.4.203-1-tlinux4-0011.1 #1 + [3144162.623053] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" + disables this message. 
+ [3144162.631162] kworker/136:12 D 0 252684 2 0x90004000 + [3144162.631189] Workqueue: kvm-irqfd-cleanup irqfd_shutdown [kvm] + [3144162.631192] Call Trace: + [3144162.631202] __schedule+0x2ee/0x660 + [3144162.631206] schedule+0x33/0xa0 + [3144162.631209] schedule_timeout+0x1c4/0x340 + [3144162.631214] ? update_load_avg+0x82/0x660 + [3144162.631217] ? raw_spin_rq_lock_nested+0x1f/0x30 + [3144162.631218] wait_for_completion+0x119/0x180 + [3144162.631220] ? wake_up_q+0x80/0x80 + [3144162.631224] __synchronize_srcu.part.19+0x81/0xb0 + [3144162.631226] ? __bpf_trace_rcu_utilization+0x10/0x10 + [3144162.631227] synchronize_srcu+0x5f/0xc0 + [3144162.631236] irqfd_shutdown+0x3c/0xb0 [kvm] + [3144162.631239] ? __schedule+0x2f6/0x660 + [3144162.631243] process_one_work+0x19a/0x3a0 + [3144162.631244] worker_thread+0x37/0x3a0 + [3144162.631247] kthread+0x117/0x140 + [3144162.631247] ? process_one_work+0x3a0/0x3a0 + [3144162.631248] ? __kthread_cancel_work+0x40/0x40 + [3144162.631250] ret_from_fork+0x1f/0x30 + +Fix this with taking the snapshot for acceleration _before_ the read +of the current grace period number. + +The only side effect of this solution is that callbacks advancing happen +then _after_ the full barrier in rcu_seq_snap(). This is not a problem +because that barrier only cares about: + +1) Ordering accesses of the update side before call_srcu() so they don't + bleed. +2) See all the accesses prior to the grace period of the current gp_num + +The only things callbacks advancing need to be ordered against are +carried by snp locking. + +Reported-by: Yong He +Co-developed-by:: Yong He +Signed-off-by: Yong He +Co-developed-by: Joel Fernandes (Google) +Signed-off-by: Joel Fernandes (Google) +Co-developed-by: Neeraj upadhyay +Signed-off-by: Neeraj upadhyay +Link: http://lore.kernel.org/CANZk6aR+CqZaqmMWrC2eRRPY12qAZnDZLwLnHZbNi=xXMB401g@mail.gmail.com +Fixes: da915ad5cf25 ("srcu: Parallelize callback handling") +Signed-off-by: Frederic Weisbecker +Signed-off-by: Sasha Levin +--- + kernel/rcu/srcutree.c | 31 +++++++++++++++++++++++++++++-- + 1 file changed, 29 insertions(+), 2 deletions(-) + +diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c +index 8fdf076720384..929dcbc04d29c 100644 +--- a/kernel/rcu/srcutree.c ++++ b/kernel/rcu/srcutree.c +@@ -1100,10 +1100,37 @@ static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, + spin_lock_irqsave_sdp_contention(sdp, &flags); + if (rhp) + rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); ++ /* ++ * The snapshot for acceleration must be taken _before_ the read of the ++ * current gp sequence used for advancing, otherwise advancing may fail ++ * and acceleration may then fail too. ++ * ++ * This could happen if: ++ * ++ * 1) The RCU_WAIT_TAIL segment has callbacks (gp_num = X + 4) and the ++ * RCU_NEXT_READY_TAIL also has callbacks (gp_num = X + 8). ++ * ++ * 2) The grace period for RCU_WAIT_TAIL is seen as started but not ++ * completed so rcu_seq_current() returns X + SRCU_STATE_SCAN1. ++ * ++ * 3) This value is passed to rcu_segcblist_advance() which can't move ++ * any segment forward and fails. ++ * ++ * 4) srcu_gp_start_if_needed() still proceeds with callback acceleration. ++ * But then the call to rcu_seq_snap() observes the grace period for the ++ * RCU_WAIT_TAIL segment as completed and the subsequent one for the ++ * RCU_NEXT_READY_TAIL segment as started (ie: X + 4 + SRCU_STATE_SCAN1) ++ * so it returns a snapshot of the next grace period, which is X + 12. 
++ * ++ * 5) The value of X + 12 is passed to rcu_segcblist_accelerate() but the ++ * freshly enqueued callback in RCU_NEXT_TAIL can't move to ++ * RCU_NEXT_READY_TAIL which already has callbacks for a previous grace ++ * period (gp_num = X + 8). So acceleration fails. ++ */ ++ s = rcu_seq_snap(&ssp->srcu_gp_seq); + rcu_segcblist_advance(&sdp->srcu_cblist, + rcu_seq_current(&ssp->srcu_gp_seq)); +- s = rcu_seq_snap(&ssp->srcu_gp_seq); +- (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); ++ WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s) && rhp); + if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { + sdp->srcu_gp_seq_needed = s; + needgp = true; +-- +2.43.0 + diff --git a/queue-6.1/udp-annotate-data-races-around-udp-encap_type.patch b/queue-6.1/udp-annotate-data-races-around-udp-encap_type.patch new file mode 100644 index 00000000000..681e0515008 --- /dev/null +++ b/queue-6.1/udp-annotate-data-races-around-udp-encap_type.patch @@ -0,0 +1,205 @@ +From be36ec0d25d48e42c708f492bd350b9e4cc0e19e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 12 Sep 2023 09:17:28 +0000 +Subject: udp: annotate data-races around udp->encap_type + +From: Eric Dumazet + +[ Upstream commit 70a36f571362a8de8b8c02d21ae524fc776287f2 ] + +syzbot/KCSAN complained about UDP_ENCAP_L2TPINUDP setsockopt() racing. + +Add READ_ONCE()/WRITE_ONCE() to document races on this lockless field. + +syzbot report was: +BUG: KCSAN: data-race in udp_lib_setsockopt / udp_lib_setsockopt + +read-write to 0xffff8881083603fa of 1 bytes by task 16557 on cpu 0: +udp_lib_setsockopt+0x682/0x6c0 +udp_setsockopt+0x73/0xa0 net/ipv4/udp.c:2779 +sock_common_setsockopt+0x61/0x70 net/core/sock.c:3697 +__sys_setsockopt+0x1c9/0x230 net/socket.c:2263 +__do_sys_setsockopt net/socket.c:2274 [inline] +__se_sys_setsockopt net/socket.c:2271 [inline] +__x64_sys_setsockopt+0x66/0x80 net/socket.c:2271 +do_syscall_x64 arch/x86/entry/common.c:50 [inline] +do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 +entry_SYSCALL_64_after_hwframe+0x63/0xcd + +read-write to 0xffff8881083603fa of 1 bytes by task 16554 on cpu 1: +udp_lib_setsockopt+0x682/0x6c0 +udp_setsockopt+0x73/0xa0 net/ipv4/udp.c:2779 +sock_common_setsockopt+0x61/0x70 net/core/sock.c:3697 +__sys_setsockopt+0x1c9/0x230 net/socket.c:2263 +__do_sys_setsockopt net/socket.c:2274 [inline] +__se_sys_setsockopt net/socket.c:2271 [inline] +__x64_sys_setsockopt+0x66/0x80 net/socket.c:2271 +do_syscall_x64 arch/x86/entry/common.c:50 [inline] +do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 +entry_SYSCALL_64_after_hwframe+0x63/0xcd + +value changed: 0x01 -> 0x05 + +Reported by Kernel Concurrency Sanitizer on: +CPU: 1 PID: 16554 Comm: syz-executor.5 Not tainted 6.5.0-rc7-syzkaller-00004-gf7757129e3de #0 + +Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") +Reported-by: syzbot +Signed-off-by: Eric Dumazet +Reviewed-by: Willem de Bruijn +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + drivers/net/gtp.c | 4 ++-- + net/ipv4/udp.c | 9 +++++---- + net/ipv4/xfrm4_input.c | 4 ++-- + net/ipv6/udp.c | 5 +++-- + net/ipv6/xfrm6_input.c | 4 ++-- + net/l2tp/l2tp_core.c | 6 +++--- + 6 files changed, 17 insertions(+), 15 deletions(-) + +diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c +index 477b4d4f860bd..bace989591f75 100644 +--- a/drivers/net/gtp.c ++++ b/drivers/net/gtp.c +@@ -629,7 +629,7 @@ static void __gtp_encap_destroy(struct sock *sk) + gtp->sk0 = NULL; + else + gtp->sk1u = NULL; +- udp_sk(sk)->encap_type = 0; ++ WRITE_ONCE(udp_sk(sk)->encap_type, 0); + rcu_assign_sk_user_data(sk, NULL); + 
release_sock(sk); + sock_put(sk); +@@ -681,7 +681,7 @@ static int gtp_encap_recv(struct sock *sk, struct sk_buff *skb) + + netdev_dbg(gtp->dev, "encap_recv sk=%p\n", sk); + +- switch (udp_sk(sk)->encap_type) { ++ switch (READ_ONCE(udp_sk(sk)->encap_type)) { + case UDP_ENCAP_GTP0: + netdev_dbg(gtp->dev, "received GTP0 packet\n"); + ret = gtp0_udp_encap_recv(gtp, skb); +diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c +index 267f77633a8f3..5672d9a86c5d2 100644 +--- a/net/ipv4/udp.c ++++ b/net/ipv4/udp.c +@@ -733,7 +733,7 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) + iph->saddr, uh->source, skb->dev->ifindex, + inet_sdif(skb), udptable, NULL); + +- if (!sk || udp_sk(sk)->encap_type) { ++ if (!sk || READ_ONCE(udp_sk(sk)->encap_type)) { + /* No socket for error: try tunnels before discarding */ + if (static_branch_unlikely(&udp_encap_needed_key)) { + sk = __udp4_lib_err_encap(net, iph, uh, udptable, sk, skb, +@@ -2114,7 +2114,8 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) + } + nf_reset_ct(skb); + +- if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) { ++ if (static_branch_unlikely(&udp_encap_needed_key) && ++ READ_ONCE(up->encap_type)) { + int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); + + /* +@@ -2699,7 +2700,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, + #endif + fallthrough; + case UDP_ENCAP_L2TPINUDP: +- up->encap_type = val; ++ WRITE_ONCE(up->encap_type, val); + udp_tunnel_encap_enable(sk); + break; + default: +@@ -2800,7 +2801,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, + break; + + case UDP_ENCAP: +- val = up->encap_type; ++ val = READ_ONCE(up->encap_type); + break; + + case UDP_NO_CHECK6_TX: +diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c +index eac206a290d05..183f6dc372429 100644 +--- a/net/ipv4/xfrm4_input.c ++++ b/net/ipv4/xfrm4_input.c +@@ -85,11 +85,11 @@ int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) + struct udphdr *uh; + struct iphdr *iph; + int iphlen, len; +- + __u8 *udpdata; + __be32 *udpdata32; +- __u16 encap_type = up->encap_type; ++ u16 encap_type; + ++ encap_type = READ_ONCE(up->encap_type); + /* if this is not encapsulated socket, then just return now */ + if (!encap_type) + return 1; +diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c +index 5b7c4f8e2ed03..961106eda69d0 100644 +--- a/net/ipv6/udp.c ++++ b/net/ipv6/udp.c +@@ -598,7 +598,7 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source, + inet6_iif(skb), inet6_sdif(skb), udptable, NULL); + +- if (!sk || udp_sk(sk)->encap_type) { ++ if (!sk || READ_ONCE(udp_sk(sk)->encap_type)) { + /* No socket for error: try tunnels before discarding */ + if (static_branch_unlikely(&udpv6_encap_needed_key)) { + sk = __udp6_lib_err_encap(net, hdr, offset, uh, +@@ -712,7 +712,8 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) + } + nf_reset_ct(skb); + +- if (static_branch_unlikely(&udpv6_encap_needed_key) && up->encap_type) { ++ if (static_branch_unlikely(&udpv6_encap_needed_key) && ++ READ_ONCE(up->encap_type)) { + int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); + + /* +diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c +index 4907ab241d6be..4156387248e40 100644 +--- a/net/ipv6/xfrm6_input.c ++++ b/net/ipv6/xfrm6_input.c +@@ -81,14 +81,14 @@ int xfrm6_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) + struct ipv6hdr *ip6h; + int len; + int 
ip6hlen = sizeof(struct ipv6hdr); +- + __u8 *udpdata; + __be32 *udpdata32; +- __u16 encap_type = up->encap_type; ++ u16 encap_type; + + if (skb->protocol == htons(ETH_P_IP)) + return xfrm4_udp_encap_rcv(sk, skb); + ++ encap_type = READ_ONCE(up->encap_type); + /* if this is not encapsulated socket, then just return now */ + if (!encap_type) + return 1; +diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c +index 03608d3ded4b8..8d21ff25f1602 100644 +--- a/net/l2tp/l2tp_core.c ++++ b/net/l2tp/l2tp_core.c +@@ -1139,9 +1139,9 @@ static void l2tp_tunnel_destruct(struct sock *sk) + switch (tunnel->encap) { + case L2TP_ENCAPTYPE_UDP: + /* No longer an encapsulation socket. See net/ipv4/udp.c */ +- (udp_sk(sk))->encap_type = 0; +- (udp_sk(sk))->encap_rcv = NULL; +- (udp_sk(sk))->encap_destroy = NULL; ++ WRITE_ONCE(udp_sk(sk)->encap_type, 0); ++ udp_sk(sk)->encap_rcv = NULL; ++ udp_sk(sk)->encap_destroy = NULL; + break; + case L2TP_ENCAPTYPE_IP: + break; +-- +2.43.0 + diff --git a/queue-6.1/udp-convert-udp_sendpage-to-use-msg_splice_pages.patch b/queue-6.1/udp-convert-udp_sendpage-to-use-msg_splice_pages.patch new file mode 100644 index 00000000000..7bd883f96f7 --- /dev/null +++ b/queue-6.1/udp-convert-udp_sendpage-to-use-msg_splice_pages.patch @@ -0,0 +1,95 @@ +From 18625f6ea3f1ce6d5e70c59bd187fa0323530c26 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 May 2023 13:11:22 +0100 +Subject: udp: Convert udp_sendpage() to use MSG_SPLICE_PAGES + +From: David Howells + +[ Upstream commit 7ac7c987850c3ec617c778f7bd871804dc1c648d ] + +Convert udp_sendpage() to use sendmsg() with MSG_SPLICE_PAGES rather than +directly splicing in the pages itself. + +This allows ->sendpage() to be replaced by something that can handle +multiple multipage folios in a single transaction. + +Signed-off-by: David Howells +cc: Willem de Bruijn +cc: David Ahern +cc: Jens Axboe +cc: Matthew Wilcox +Signed-off-by: Jakub Kicinski +Stable-dep-of: a0002127cd74 ("udp: move udp->no_check6_tx to udp->udp_flags") +Signed-off-by: Sasha Levin +--- + net/ipv4/udp.c | 51 ++++++-------------------------------------------- + 1 file changed, 6 insertions(+), 45 deletions(-) + +diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c +index 65abc92a81bd0..b49cb3df01bb4 100644 +--- a/net/ipv4/udp.c ++++ b/net/ipv4/udp.c +@@ -1335,54 +1335,15 @@ EXPORT_SYMBOL(udp_sendmsg); + int udp_sendpage(struct sock *sk, struct page *page, int offset, + size_t size, int flags) + { +- struct inet_sock *inet = inet_sk(sk); +- struct udp_sock *up = udp_sk(sk); +- int ret; ++ struct bio_vec bvec; ++ struct msghdr msg = { .msg_flags = flags | MSG_SPLICE_PAGES }; + + if (flags & MSG_SENDPAGE_NOTLAST) +- flags |= MSG_MORE; +- +- if (!up->pending) { +- struct msghdr msg = { .msg_flags = flags|MSG_MORE }; +- +- /* Call udp_sendmsg to specify destination address which +- * sendpage interface can't pass. +- * This will succeed only when the socket is connected. 
+- */ +- ret = udp_sendmsg(sk, &msg, 0); +- if (ret < 0) +- return ret; +- } +- +- lock_sock(sk); ++ msg.msg_flags |= MSG_MORE; + +- if (unlikely(!up->pending)) { +- release_sock(sk); +- +- net_dbg_ratelimited("cork failed\n"); +- return -EINVAL; +- } +- +- ret = ip_append_page(sk, &inet->cork.fl.u.ip4, +- page, offset, size, flags); +- if (ret == -EOPNOTSUPP) { +- release_sock(sk); +- return sock_no_sendpage(sk->sk_socket, page, offset, +- size, flags); +- } +- if (ret < 0) { +- udp_flush_pending_frames(sk); +- goto out; +- } +- +- up->len += size; +- if (!(READ_ONCE(up->corkflag) || (flags&MSG_MORE))) +- ret = udp_push_pending_frames(sk); +- if (!ret) +- ret = size; +-out: +- release_sock(sk); +- return ret; ++ bvec_set_page(&bvec, page, size, offset); ++ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); ++ return udp_sendmsg(sk, &msg, size); + } + + #define UDP_SKB_IS_STATELESS 0x80000000 +-- +2.43.0 + diff --git a/queue-6.1/udp-introduce-udp-udp_flags.patch b/queue-6.1/udp-introduce-udp-udp_flags.patch new file mode 100644 index 00000000000..6dc60074fdf --- /dev/null +++ b/queue-6.1/udp-introduce-udp-udp_flags.patch @@ -0,0 +1,171 @@ +From ceb0fec094adcb6586e574c81ad754f61512c4eb Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 12 Sep 2023 09:17:21 +0000 +Subject: udp: introduce udp->udp_flags + +From: Eric Dumazet + +[ Upstream commit 81b36803ac139827538ac5ce4028e750a3c53f53 ] + +According to syzbot, it is time to use proper atomic flags +for various UDP flags. + +Add udp_flags field, and convert udp->corkflag to first +bit in it. + +Signed-off-by: Eric Dumazet +Reviewed-by: Willem de Bruijn +Signed-off-by: Paolo Abeni +Stable-dep-of: a0002127cd74 ("udp: move udp->no_check6_tx to udp->udp_flags") +Signed-off-by: Sasha Levin +--- + include/linux/udp.h | 28 +++++++++++++++++++++------- + net/ipv4/udp.c | 12 ++++++------ + net/ipv6/udp.c | 6 +++--- + 3 files changed, 30 insertions(+), 16 deletions(-) + +diff --git a/include/linux/udp.h b/include/linux/udp.h +index e96da4157d04d..10b56b8231e3c 100644 +--- a/include/linux/udp.h ++++ b/include/linux/udp.h +@@ -30,14 +30,20 @@ static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask) + return (num + net_hash_mix(net)) & mask; + } + ++enum { ++ UDP_FLAGS_CORK, /* Cork is required */ ++}; ++ + struct udp_sock { + /* inet_sock has to be the first member */ + struct inet_sock inet; + #define udp_port_hash inet.sk.__sk_common.skc_u16hashes[0] + #define udp_portaddr_hash inet.sk.__sk_common.skc_u16hashes[1] + #define udp_portaddr_node inet.sk.__sk_common.skc_portaddr_node ++ ++ unsigned long udp_flags; ++ + int pending; /* Any pending frames ? */ +- unsigned int corkflag; /* Cork is required */ + __u8 encap_type; /* Is this an Encapsulation socket? */ + unsigned char no_check6_tx:1,/* Send zero UDP6 checksums on TX? */ + no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */ +@@ -49,6 +55,11 @@ struct udp_sock { + gro_enabled:1, /* Request GRO aggregation */ + accept_udp_l4:1, + accept_udp_fraglist:1; ++/* indicator bits used by pcflag: */ ++#define UDPLITE_BIT 0x1 /* set by udplite proto init function */ ++#define UDPLITE_SEND_CC 0x2 /* set via udplite setsockopt */ ++#define UDPLITE_RECV_CC 0x4 /* set via udplite setsocktopt */ ++ __u8 pcflag; /* marks socket as UDP-Lite if > 0 */ + /* + * Following member retains the information to create a UDP header + * when the socket is uncorked. 
+@@ -60,12 +71,6 @@ struct udp_sock { + */ + __u16 pcslen; + __u16 pcrlen; +-/* indicator bits used by pcflag: */ +-#define UDPLITE_BIT 0x1 /* set by udplite proto init function */ +-#define UDPLITE_SEND_CC 0x2 /* set via udplite setsockopt */ +-#define UDPLITE_RECV_CC 0x4 /* set via udplite setsocktopt */ +- __u8 pcflag; /* marks socket as UDP-Lite if > 0 */ +- __u8 unused[3]; + /* + * For encapsulation sockets. + */ +@@ -89,6 +94,15 @@ struct udp_sock { + int forward_deficit; + }; + ++#define udp_test_bit(nr, sk) \ ++ test_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags) ++#define udp_set_bit(nr, sk) \ ++ set_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags) ++#define udp_clear_bit(nr, sk) \ ++ clear_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags) ++#define udp_assign_bit(nr, sk, val) \ ++ assign_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags, val) ++ + #define UDP_MAX_SEGMENTS (1 << 6UL) + + static inline struct udp_sock *udp_sk(const struct sock *sk) +diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c +index e8dd2880ac9aa..60a754477efb2 100644 +--- a/net/ipv4/udp.c ++++ b/net/ipv4/udp.c +@@ -1068,7 +1068,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) + __be16 dport; + u8 tos; + int err, is_udplite = IS_UDPLITE(sk); +- int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE; ++ int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE; + int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); + struct sk_buff *skb; + struct ip_options_data opt_copy; +@@ -1337,11 +1337,11 @@ void udp_splice_eof(struct socket *sock) + struct sock *sk = sock->sk; + struct udp_sock *up = udp_sk(sk); + +- if (!up->pending || READ_ONCE(up->corkflag)) ++ if (!up->pending || udp_test_bit(CORK, sk)) + return; + + lock_sock(sk); +- if (up->pending && !READ_ONCE(up->corkflag)) ++ if (up->pending && !udp_test_bit(CORK, sk)) + udp_push_pending_frames(sk); + release_sock(sk); + } +@@ -2673,9 +2673,9 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, + switch (optname) { + case UDP_CORK: + if (val != 0) { +- WRITE_ONCE(up->corkflag, 1); ++ udp_set_bit(CORK, sk); + } else { +- WRITE_ONCE(up->corkflag, 0); ++ udp_clear_bit(CORK, sk); + lock_sock(sk); + push_pending_frames(sk); + release_sock(sk); +@@ -2800,7 +2800,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, + + switch (optname) { + case UDP_CORK: +- val = READ_ONCE(up->corkflag); ++ val = udp_test_bit(CORK, sk); + break; + + case UDP_ENCAP: +diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c +index 2a65136dca773..85653e3a04fe8 100644 +--- a/net/ipv6/udp.c ++++ b/net/ipv6/udp.c +@@ -1351,7 +1351,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) + int addr_len = msg->msg_namelen; + bool connected = false; + int ulen = len; +- int corkreq = READ_ONCE(up->corkflag) || msg->msg_flags&MSG_MORE; ++ int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE; + int err; + int is_udplite = IS_UDPLITE(sk); + int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); +@@ -1662,11 +1662,11 @@ static void udpv6_splice_eof(struct socket *sock) + struct sock *sk = sock->sk; + struct udp_sock *up = udp_sk(sk); + +- if (!up->pending || READ_ONCE(up->corkflag)) ++ if (!up->pending || udp_test_bit(CORK, sk)) + return; + + lock_sock(sk); +- if (up->pending && !READ_ONCE(up->corkflag)) ++ if (up->pending && !udp_test_bit(CORK, sk)) + udp_v6_push_pending_frames(sk); + release_sock(sk); + } +-- +2.43.0 + diff --git a/queue-6.1/udp-lockless-udp_encap_l2tpinudp-udp_gro.patch 
b/queue-6.1/udp-lockless-udp_encap_l2tpinudp-udp_gro.patch new file mode 100644 index 00000000000..86655331418 --- /dev/null +++ b/queue-6.1/udp-lockless-udp_encap_l2tpinudp-udp_gro.patch @@ -0,0 +1,154 @@ +From d2f165afbbc9ce0af6beddcde9af3f3d368908f5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 12 Sep 2023 09:17:27 +0000 +Subject: udp: lockless UDP_ENCAP_L2TPINUDP / UDP_GRO + +From: Eric Dumazet + +[ Upstream commit ac9a7f4ce5dda1472e8f44096f33066c6ec1a3b4 ] + +Move udp->encap_enabled to udp->udp_flags. + +Add udp_test_and_set_bit() helper to allow lockless +udp_tunnel_encap_enable() implementation. + +Signed-off-by: Eric Dumazet +Reviewed-by: Willem de Bruijn +Signed-off-by: Paolo Abeni +Stable-dep-of: 70a36f571362 ("udp: annotate data-races around udp->encap_type") +Signed-off-by: Sasha Levin +--- + include/linux/udp.h | 9 ++++----- + include/net/udp_tunnel.h | 9 +++------ + net/ipv4/udp.c | 10 +++------- + net/ipv4/udp_tunnel_core.c | 2 +- + net/ipv6/udp.c | 2 +- + 5 files changed, 12 insertions(+), 20 deletions(-) + +diff --git a/include/linux/udp.h b/include/linux/udp.h +index 0e6880856246a..efd9ab6df3797 100644 +--- a/include/linux/udp.h ++++ b/include/linux/udp.h +@@ -37,6 +37,7 @@ enum { + UDP_FLAGS_GRO_ENABLED, /* Request GRO aggregation */ + UDP_FLAGS_ACCEPT_FRAGLIST, + UDP_FLAGS_ACCEPT_L4, ++ UDP_FLAGS_ENCAP_ENABLED, /* This socket enabled encap */ + }; + + struct udp_sock { +@@ -50,11 +51,7 @@ struct udp_sock { + + int pending; /* Any pending frames ? */ + __u8 encap_type; /* Is this an Encapsulation socket? */ +- unsigned char encap_enabled:1; /* This socket enabled encap +- * processing; UDP tunnels and +- * different encapsulation layer set +- * this +- */ ++ + /* indicator bits used by pcflag: */ + #define UDPLITE_BIT 0x1 /* set by udplite proto init function */ + #define UDPLITE_SEND_CC 0x2 /* set via udplite setsockopt */ +@@ -98,6 +95,8 @@ struct udp_sock { + test_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags) + #define udp_set_bit(nr, sk) \ + set_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags) ++#define udp_test_and_set_bit(nr, sk) \ ++ test_and_set_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags) + #define udp_clear_bit(nr, sk) \ + clear_bit(UDP_FLAGS_##nr, &udp_sk(sk)->udp_flags) + #define udp_assign_bit(nr, sk, val) \ +diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h +index 72394f441dad8..e5f81710b18f4 100644 +--- a/include/net/udp_tunnel.h ++++ b/include/net/udp_tunnel.h +@@ -174,16 +174,13 @@ static inline int udp_tunnel_handle_offloads(struct sk_buff *skb, bool udp_csum) + } + #endif + +-static inline void udp_tunnel_encap_enable(struct socket *sock) ++static inline void udp_tunnel_encap_enable(struct sock *sk) + { +- struct udp_sock *up = udp_sk(sock->sk); +- +- if (up->encap_enabled) ++ if (udp_test_and_set_bit(ENCAP_ENABLED, sk)) + return; + +- up->encap_enabled = 1; + #if IS_ENABLED(CONFIG_IPV6) +- if (sock->sk->sk_family == PF_INET6) ++ if (READ_ONCE(sk->sk_family) == PF_INET6) + ipv6_stub->udpv6_encap_enable(); + #endif + udp_encap_enable(); +diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c +index df0ea45b8b8f2..267f77633a8f3 100644 +--- a/net/ipv4/udp.c ++++ b/net/ipv4/udp.c +@@ -2645,7 +2645,7 @@ void udp_destroy_sock(struct sock *sk) + if (encap_destroy) + encap_destroy(sk); + } +- if (up->encap_enabled) ++ if (udp_test_bit(ENCAP_ENABLED, sk)) + static_branch_dec(&udp_encap_needed_key); + } + } +@@ -2700,9 +2700,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, + fallthrough; + case UDP_ENCAP_L2TPINUDP: + up->encap_type = 
val; +- lock_sock(sk); +- udp_tunnel_encap_enable(sk->sk_socket); +- release_sock(sk); ++ udp_tunnel_encap_enable(sk); + break; + default: + err = -ENOPROTOOPT; +@@ -2725,14 +2723,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, + break; + + case UDP_GRO: +- lock_sock(sk); + + /* when enabling GRO, accept the related GSO packet type */ + if (valbool) +- udp_tunnel_encap_enable(sk->sk_socket); ++ udp_tunnel_encap_enable(sk); + udp_assign_bit(GRO_ENABLED, sk, valbool); + udp_assign_bit(ACCEPT_L4, sk, valbool); +- release_sock(sk); + break; + + /* +diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c +index 5f8104cf082d0..732e21b75ba28 100644 +--- a/net/ipv4/udp_tunnel_core.c ++++ b/net/ipv4/udp_tunnel_core.c +@@ -78,7 +78,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, + udp_sk(sk)->gro_receive = cfg->gro_receive; + udp_sk(sk)->gro_complete = cfg->gro_complete; + +- udp_tunnel_encap_enable(sock); ++ udp_tunnel_encap_enable(sk); + } + EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); + +diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c +index ddd17b5ea4259..5b7c4f8e2ed03 100644 +--- a/net/ipv6/udp.c ++++ b/net/ipv6/udp.c +@@ -1688,7 +1688,7 @@ void udpv6_destroy_sock(struct sock *sk) + if (encap_destroy) + encap_destroy(sk); + } +- if (up->encap_enabled) { ++ if (udp_test_bit(ENCAP_ENABLED, sk)) { + static_branch_dec(&udpv6_encap_needed_key); + udp_encap_disable(); + } +-- +2.43.0 + diff --git a/queue-6.1/udp-move-udp-accept_udp_-l4-fraglist-to-udp-udp_flag.patch b/queue-6.1/udp-move-udp-accept_udp_-l4-fraglist-to-udp-udp_flag.patch new file mode 100644 index 00000000000..c3353f0e5a6 --- /dev/null +++ b/queue-6.1/udp-move-udp-accept_udp_-l4-fraglist-to-udp-udp_flag.patch @@ -0,0 +1,91 @@ +From f8848188eeb61db01317edf7b603cd83e93eef38 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 12 Sep 2023 09:17:26 +0000 +Subject: udp: move udp->accept_udp_{l4|fraglist} to udp->udp_flags + +From: Eric Dumazet + +[ Upstream commit f5f52f0884a595ff99ab1a608643fe4025fca2d5 ] + +These are read locklessly, move them to udp_flags to fix data-races. + +Signed-off-by: Eric Dumazet +Reviewed-by: Willem de Bruijn +Signed-off-by: Paolo Abeni +Stable-dep-of: 70a36f571362 ("udp: annotate data-races around udp->encap_type") +Signed-off-by: Sasha Levin +--- + include/linux/udp.h | 16 +++++++++------- + net/ipv4/udp.c | 2 +- + 2 files changed, 10 insertions(+), 8 deletions(-) + +diff --git a/include/linux/udp.h b/include/linux/udp.h +index f87e2123fe7b0..0e6880856246a 100644 +--- a/include/linux/udp.h ++++ b/include/linux/udp.h +@@ -35,6 +35,8 @@ enum { + UDP_FLAGS_NO_CHECK6_TX, /* Send zero UDP6 checksums on TX? */ + UDP_FLAGS_NO_CHECK6_RX, /* Allow zero UDP6 checksums on RX? */ + UDP_FLAGS_GRO_ENABLED, /* Request GRO aggregation */ ++ UDP_FLAGS_ACCEPT_FRAGLIST, ++ UDP_FLAGS_ACCEPT_L4, + }; + + struct udp_sock { +@@ -48,13 +50,11 @@ struct udp_sock { + + int pending; /* Any pending frames ? */ + __u8 encap_type; /* Is this an Encapsulation socket? 
*/ +- unsigned char encap_enabled:1, /* This socket enabled encap ++ unsigned char encap_enabled:1; /* This socket enabled encap + * processing; UDP tunnels and + * different encapsulation layer set + * this + */ +- accept_udp_l4:1, +- accept_udp_fraglist:1; + /* indicator bits used by pcflag: */ + #define UDPLITE_BIT 0x1 /* set by udplite proto init function */ + #define UDPLITE_SEND_CC 0x2 /* set via udplite setsockopt */ +@@ -146,10 +146,12 @@ static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb) + if (!skb_is_gso(skb)) + return false; + +- if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && !udp_sk(sk)->accept_udp_l4) ++ if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && ++ !udp_test_bit(ACCEPT_L4, sk)) + return true; + +- if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST && !udp_sk(sk)->accept_udp_fraglist) ++ if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST && ++ !udp_test_bit(ACCEPT_FRAGLIST, sk)) + return true; + + return false; +@@ -157,8 +159,8 @@ static inline bool udp_unexpected_gso(struct sock *sk, struct sk_buff *skb) + + static inline void udp_allow_gso(struct sock *sk) + { +- udp_sk(sk)->accept_udp_l4 = 1; +- udp_sk(sk)->accept_udp_fraglist = 1; ++ udp_set_bit(ACCEPT_L4, sk); ++ udp_set_bit(ACCEPT_FRAGLIST, sk); + } + + #define udp_portaddr_for_each_entry(__sk, list) \ +diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c +index 28292fcf07075..df0ea45b8b8f2 100644 +--- a/net/ipv4/udp.c ++++ b/net/ipv4/udp.c +@@ -2731,7 +2731,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, + if (valbool) + udp_tunnel_encap_enable(sk->sk_socket); + udp_assign_bit(GRO_ENABLED, sk, valbool); +- up->accept_udp_l4 = valbool; ++ udp_assign_bit(ACCEPT_L4, sk, valbool); + release_sock(sk); + break; + +-- +2.43.0 + diff --git a/queue-6.1/udp-move-udp-gro_enabled-to-udp-udp_flags.patch b/queue-6.1/udp-move-udp-gro_enabled-to-udp-udp_flags.patch new file mode 100644 index 00000000000..4aae43cec87 --- /dev/null +++ b/queue-6.1/udp-move-udp-gro_enabled-to-udp-udp_flags.patch @@ -0,0 +1,109 @@ +From a7beff020a1a4657b4250f8b820c5cfbd77d49a5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 12 Sep 2023 09:17:24 +0000 +Subject: udp: move udp->gro_enabled to udp->udp_flags + +From: Eric Dumazet + +[ Upstream commit e1dc0615c6b08ef36414f08c011965b8fb56198b ] + +syzbot reported that udp->gro_enabled can be read locklessly. +Use one atomic bit from udp->udp_flags. + +Fixes: e20cf8d3f1f7 ("udp: implement GRO for plain UDP sockets.") +Reported-by: syzbot +Signed-off-by: Eric Dumazet +Reviewed-by: Willem de Bruijn +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + include/linux/udp.h | 2 +- + net/ipv4/udp.c | 6 +++--- + net/ipv4/udp_offload.c | 4 ++-- + net/ipv6/udp.c | 2 +- + 4 files changed, 7 insertions(+), 7 deletions(-) + +diff --git a/include/linux/udp.h b/include/linux/udp.h +index e6cd46e2b0831..f87e2123fe7b0 100644 +--- a/include/linux/udp.h ++++ b/include/linux/udp.h +@@ -34,6 +34,7 @@ enum { + UDP_FLAGS_CORK, /* Cork is required */ + UDP_FLAGS_NO_CHECK6_TX, /* Send zero UDP6 checksums on TX? */ + UDP_FLAGS_NO_CHECK6_RX, /* Allow zero UDP6 checksums on RX? 
*/ ++ UDP_FLAGS_GRO_ENABLED, /* Request GRO aggregation */ + }; + + struct udp_sock { +@@ -52,7 +53,6 @@ struct udp_sock { + * different encapsulation layer set + * this + */ +- gro_enabled:1, /* Request GRO aggregation */ + accept_udp_l4:1, + accept_udp_fraglist:1; + /* indicator bits used by pcflag: */ +diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c +index 01e74919885ad..28292fcf07075 100644 +--- a/net/ipv4/udp.c ++++ b/net/ipv4/udp.c +@@ -1901,7 +1901,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, + (struct sockaddr *)sin); + } + +- if (udp_sk(sk)->gro_enabled) ++ if (udp_test_bit(GRO_ENABLED, sk)) + udp_cmsg_recv(msg, sk, skb); + + if (inet->cmsg_flags) +@@ -2730,7 +2730,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, + /* when enabling GRO, accept the related GSO packet type */ + if (valbool) + udp_tunnel_encap_enable(sk->sk_socket); +- up->gro_enabled = valbool; ++ udp_assign_bit(GRO_ENABLED, sk, valbool); + up->accept_udp_l4 = valbool; + release_sock(sk); + break; +@@ -2820,7 +2820,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, + break; + + case UDP_GRO: +- val = up->gro_enabled; ++ val = udp_test_bit(GRO_ENABLED, sk); + break; + + /* The following two cannot be changed on UDP sockets, the return is +diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c +index 6d1a4bec2614d..8096576fd9bde 100644 +--- a/net/ipv4/udp_offload.c ++++ b/net/ipv4/udp_offload.c +@@ -549,10 +549,10 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, + NAPI_GRO_CB(skb)->is_flist = 0; + if (!sk || !udp_sk(sk)->gro_receive) { + if (skb->dev->features & NETIF_F_GRO_FRAGLIST) +- NAPI_GRO_CB(skb)->is_flist = sk ? !udp_sk(sk)->gro_enabled : 1; ++ NAPI_GRO_CB(skb)->is_flist = sk ? !udp_test_bit(GRO_ENABLED, sk) : 1; + + if ((!sk && (skb->dev->features & NETIF_F_GRO_UDP_FWD)) || +- (sk && udp_sk(sk)->gro_enabled) || NAPI_GRO_CB(skb)->is_flist) ++ (sk && udp_test_bit(GRO_ENABLED, sk)) || NAPI_GRO_CB(skb)->is_flist) + return call_gro_receive(udp_gro_receive_segment, head, skb); + + /* no GRO, be sure flush the current packet */ +diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c +index ae4f7f983f951..ddd17b5ea4259 100644 +--- a/net/ipv6/udp.c ++++ b/net/ipv6/udp.c +@@ -440,7 +440,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + (struct sockaddr *)sin6); + } + +- if (udp_sk(sk)->gro_enabled) ++ if (udp_test_bit(GRO_ENABLED, sk)) + udp_cmsg_recv(msg, sk, skb); + + if (np->rxopt.all) +-- +2.43.0 + diff --git a/queue-6.1/udp-move-udp-no_check6_rx-to-udp-udp_flags.patch b/queue-6.1/udp-move-udp-no_check6_rx-to-udp-udp_flags.patch new file mode 100644 index 00000000000..9a420d670cf --- /dev/null +++ b/queue-6.1/udp-move-udp-no_check6_rx-to-udp-udp_flags.patch @@ -0,0 +1,123 @@ +From e1834f0244ebec827e6c0f8f4cf0bce3dc679841 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 12 Sep 2023 09:17:23 +0000 +Subject: udp: move udp->no_check6_rx to udp->udp_flags + +From: Eric Dumazet + +[ Upstream commit bcbc1b1de884647aa0318bf74eb7f293d72a1e40 ] + +syzbot reported that udp->no_check6_rx can be read locklessly. +Use one atomic bit from udp->udp_flags. 
+ +Fixes: 1c19448c9ba6 ("net: Make enabling of zero UDP6 csums more restrictive") +Reported-by: syzbot +Signed-off-by: Eric Dumazet +Reviewed-by: Willem de Bruijn +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + include/linux/udp.h | 10 +++++----- + net/ipv4/udp.c | 4 ++-- + net/ipv6/udp.c | 6 +++--- + 3 files changed, 10 insertions(+), 10 deletions(-) + +diff --git a/include/linux/udp.h b/include/linux/udp.h +index b5ca5760ae34b..e6cd46e2b0831 100644 +--- a/include/linux/udp.h ++++ b/include/linux/udp.h +@@ -33,6 +33,7 @@ static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask) + enum { + UDP_FLAGS_CORK, /* Cork is required */ + UDP_FLAGS_NO_CHECK6_TX, /* Send zero UDP6 checksums on TX? */ ++ UDP_FLAGS_NO_CHECK6_RX, /* Allow zero UDP6 checksums on RX? */ + }; + + struct udp_sock { +@@ -46,8 +47,7 @@ struct udp_sock { + + int pending; /* Any pending frames ? */ + __u8 encap_type; /* Is this an Encapsulation socket? */ +- unsigned char no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */ +- encap_enabled:1, /* This socket enabled encap ++ unsigned char encap_enabled:1, /* This socket enabled encap + * processing; UDP tunnels and + * different encapsulation layer set + * this +@@ -117,7 +117,7 @@ static inline void udp_set_no_check6_tx(struct sock *sk, bool val) + + static inline void udp_set_no_check6_rx(struct sock *sk, bool val) + { +- udp_sk(sk)->no_check6_rx = val; ++ udp_assign_bit(NO_CHECK6_RX, sk, val); + } + + static inline bool udp_get_no_check6_tx(const struct sock *sk) +@@ -125,9 +125,9 @@ static inline bool udp_get_no_check6_tx(const struct sock *sk) + return udp_test_bit(NO_CHECK6_TX, sk); + } + +-static inline bool udp_get_no_check6_rx(struct sock *sk) ++static inline bool udp_get_no_check6_rx(const struct sock *sk) + { +- return udp_sk(sk)->no_check6_rx; ++ return udp_test_bit(NO_CHECK6_RX, sk); + } + + static inline void udp_cmsg_recv(struct msghdr *msg, struct sock *sk, +diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c +index 513035e83a820..01e74919885ad 100644 +--- a/net/ipv4/udp.c ++++ b/net/ipv4/udp.c +@@ -2715,7 +2715,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, + break; + + case UDP_NO_CHECK6_RX: +- up->no_check6_rx = valbool; ++ udp_set_no_check6_rx(sk, valbool); + break; + + case UDP_SEGMENT: +@@ -2812,7 +2812,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, + break; + + case UDP_NO_CHECK6_RX: +- val = up->no_check6_rx; ++ val = udp_get_no_check6_rx(sk); + break; + + case UDP_SEGMENT: +diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c +index c6e20293c521f..ae4f7f983f951 100644 +--- a/net/ipv6/udp.c ++++ b/net/ipv6/udp.c +@@ -882,7 +882,7 @@ static int __udp6_lib_mcast_deliver(struct net *net, struct sk_buff *skb, + /* If zero checksum and no_check is not on for + * the socket then skip it. 
+ */ +- if (!uh->check && !udp_sk(sk)->no_check6_rx) ++ if (!uh->check && !udp_get_no_check6_rx(sk)) + continue; + if (!first) { + first = sk; +@@ -1000,7 +1000,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, + if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst)) + udp6_sk_rx_dst_set(sk, dst); + +- if (!uh->check && !udp_sk(sk)->no_check6_rx) { ++ if (!uh->check && !udp_get_no_check6_rx(sk)) { + if (refcounted) + sock_put(sk); + goto report_csum_error; +@@ -1022,7 +1022,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, + /* Unicast */ + sk = __udp6_lib_lookup_skb(skb, uh->source, uh->dest, udptable); + if (sk) { +- if (!uh->check && !udp_sk(sk)->no_check6_rx) ++ if (!uh->check && !udp_get_no_check6_rx(sk)) + goto report_csum_error; + return udp6_unicast_rcv_skb(sk, skb, uh); + } +-- +2.43.0 + diff --git a/queue-6.1/udp-move-udp-no_check6_tx-to-udp-udp_flags.patch b/queue-6.1/udp-move-udp-no_check6_tx-to-udp-udp_flags.patch new file mode 100644 index 00000000000..0b5f161bab5 --- /dev/null +++ b/queue-6.1/udp-move-udp-no_check6_tx-to-udp-udp_flags.patch @@ -0,0 +1,114 @@ +From db4859b6d666990e4f3bb767f895153034944c21 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 12 Sep 2023 09:17:22 +0000 +Subject: udp: move udp->no_check6_tx to udp->udp_flags + +From: Eric Dumazet + +[ Upstream commit a0002127cd746fcaa182ad3386ef6931c37f3bda ] + +syzbot reported that udp->no_check6_tx can be read locklessly. +Use one atomic bit from udp->udp_flags + +Fixes: 1c19448c9ba6 ("net: Make enabling of zero UDP6 csums more restrictive") +Reported-by: syzbot +Signed-off-by: Eric Dumazet +Reviewed-by: Willem de Bruijn +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + include/linux/udp.h | 10 +++++----- + net/ipv4/udp.c | 4 ++-- + net/ipv6/udp.c | 4 ++-- + 3 files changed, 9 insertions(+), 9 deletions(-) + +diff --git a/include/linux/udp.h b/include/linux/udp.h +index 10b56b8231e3c..b5ca5760ae34b 100644 +--- a/include/linux/udp.h ++++ b/include/linux/udp.h +@@ -32,6 +32,7 @@ static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask) + + enum { + UDP_FLAGS_CORK, /* Cork is required */ ++ UDP_FLAGS_NO_CHECK6_TX, /* Send zero UDP6 checksums on TX? */ + }; + + struct udp_sock { +@@ -45,8 +46,7 @@ struct udp_sock { + + int pending; /* Any pending frames ? */ + __u8 encap_type; /* Is this an Encapsulation socket? */ +- unsigned char no_check6_tx:1,/* Send zero UDP6 checksums on TX? */ +- no_check6_rx:1,/* Allow zero UDP6 checksums on RX? */ ++ unsigned char no_check6_rx:1,/* Allow zero UDP6 checksums on RX? 
*/ + encap_enabled:1, /* This socket enabled encap + * processing; UDP tunnels and + * different encapsulation layer set +@@ -112,7 +112,7 @@ static inline struct udp_sock *udp_sk(const struct sock *sk) + + static inline void udp_set_no_check6_tx(struct sock *sk, bool val) + { +- udp_sk(sk)->no_check6_tx = val; ++ udp_assign_bit(NO_CHECK6_TX, sk, val); + } + + static inline void udp_set_no_check6_rx(struct sock *sk, bool val) +@@ -120,9 +120,9 @@ static inline void udp_set_no_check6_rx(struct sock *sk, bool val) + udp_sk(sk)->no_check6_rx = val; + } + +-static inline bool udp_get_no_check6_tx(struct sock *sk) ++static inline bool udp_get_no_check6_tx(const struct sock *sk) + { +- return udp_sk(sk)->no_check6_tx; ++ return udp_test_bit(NO_CHECK6_TX, sk); + } + + static inline bool udp_get_no_check6_rx(struct sock *sk) +diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c +index 60a754477efb2..513035e83a820 100644 +--- a/net/ipv4/udp.c ++++ b/net/ipv4/udp.c +@@ -2711,7 +2711,7 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, + break; + + case UDP_NO_CHECK6_TX: +- up->no_check6_tx = valbool; ++ udp_set_no_check6_tx(sk, valbool); + break; + + case UDP_NO_CHECK6_RX: +@@ -2808,7 +2808,7 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, + break; + + case UDP_NO_CHECK6_TX: +- val = up->no_check6_tx; ++ val = udp_get_no_check6_tx(sk); + break; + + case UDP_NO_CHECK6_RX: +diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c +index 85653e3a04fe8..c6e20293c521f 100644 +--- a/net/ipv6/udp.c ++++ b/net/ipv6/udp.c +@@ -1260,7 +1260,7 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, + kfree_skb(skb); + return -EINVAL; + } +- if (udp_sk(sk)->no_check6_tx) { ++ if (udp_get_no_check6_tx(sk)) { + kfree_skb(skb); + return -EINVAL; + } +@@ -1281,7 +1281,7 @@ static int udp_v6_send_skb(struct sk_buff *skb, struct flowi6 *fl6, + + if (is_udplite) + csum = udplite_csum(skb); +- else if (udp_sk(sk)->no_check6_tx) { /* UDP csum disabled */ ++ else if (udp_get_no_check6_tx(sk)) { /* UDP csum disabled */ + skb->ip_summed = CHECKSUM_NONE; + goto send; + } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ +-- +2.43.0 + diff --git a/queue-6.1/wifi-iwlwifi-pcie-don-t-synchronize-irqs-from-irq.patch b/queue-6.1/wifi-iwlwifi-pcie-don-t-synchronize-irqs-from-irq.patch new file mode 100644 index 00000000000..96697b7b7c5 --- /dev/null +++ b/queue-6.1/wifi-iwlwifi-pcie-don-t-synchronize-irqs-from-irq.patch @@ -0,0 +1,170 @@ +From 011b4aa30e6bcbdc4307b512c1563b39d38981b5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 15 Dec 2023 11:13:34 +0100 +Subject: wifi: iwlwifi: pcie: don't synchronize IRQs from IRQ + +From: Johannes Berg + +[ Upstream commit 400f6ebbc175286576c7f7fddf3c347d09d12310 ] + +On older devices (before unified image!) we can end up calling +stop_device from an rfkill interrupt. However, in stop_device +we attempt to synchronize IRQs, which then of course deadlocks. + +Avoid this by checking the context, if running from the IRQ +thread then don't synchronize. This wouldn't be correct on a +new device since RSS is supported, but older devices only have +a single interrupt/queue. 
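
To make the deadlock scenario concrete, here is a small standalone sketch, not driver code, with invented stub names (stop_device, synchronize_irqs, rfkill_irq_handler): a stop path that unconditionally waits for interrupt handlers to complete can never return when it is invoked from the interrupt handler itself, so the wait has to be skipped when from_irq is true.

/* Minimal standalone sketch -- hypothetical stubs, not the iwlwifi driver. */
#include <stdbool.h>
#include <stdio.h>

static void synchronize_irqs(void)
{
	/* Stand-in for waiting until all interrupt handlers have returned;
	 * this can never complete if the caller is one of those handlers. */
	printf("waiting for IRQ handlers to finish\n");
}

static void stop_device(bool from_irq)
{
	if (!from_irq)
		synchronize_irqs();	/* safe: called from process context */
	printf("stopping device (from_irq=%d)\n", from_irq);
}

static void rfkill_irq_handler(void)
{
	/* Running in the single IRQ thread of a pre-RSS device, so the
	 * from_irq=true path must skip the wait to avoid deadlocking. */
	stop_device(true);
}

int main(void)
{
	stop_device(false);	/* e.g. normal opmode shutdown */
	rfkill_irq_handler();	/* e.g. hardware rfkill toggled */
	return 0;
}
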
+ +Fixes: 37fb29bd1f90 ("wifi: iwlwifi: pcie: synchronize IRQs before NAPI") +Reviewed-by: Miri Korenblit +Reviewed-by: Emmanuel Grumbach +Signed-off-by: Johannes Berg +Signed-off-by: Kalle Valo +Link: https://msgid.link/20231215111335.59aab00baed7.Iadfe154d6248e7f9dfd69522e5429dbbd72925d7@changeid +Signed-off-by: Sasha Levin +--- + .../net/wireless/intel/iwlwifi/pcie/internal.h | 4 ++-- + drivers/net/wireless/intel/iwlwifi/pcie/rx.c | 8 ++++---- + drivers/net/wireless/intel/iwlwifi/pcie/trans.c | 17 +++++++++-------- + 3 files changed, 15 insertions(+), 14 deletions(-) + +diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h +index 69b95ad5993b0..2ec4ee8ab317c 100644 +--- a/drivers/net/wireless/intel/iwlwifi/pcie/internal.h ++++ b/drivers/net/wireless/intel/iwlwifi/pcie/internal.h +@@ -745,7 +745,7 @@ static inline void iwl_enable_rfkill_int(struct iwl_trans *trans) + } + } + +-void iwl_pcie_handle_rfkill_irq(struct iwl_trans *trans); ++void iwl_pcie_handle_rfkill_irq(struct iwl_trans *trans, bool from_irq); + + static inline bool iwl_is_rfkill_set(struct iwl_trans *trans) + { +@@ -792,7 +792,7 @@ static inline bool iwl_pcie_dbg_on(struct iwl_trans *trans) + return (trans->dbg.dest_tlv || iwl_trans_dbg_ini_valid(trans)); + } + +-void iwl_trans_pcie_rf_kill(struct iwl_trans *trans, bool state); ++void iwl_trans_pcie_rf_kill(struct iwl_trans *trans, bool state, bool from_irq); + void iwl_trans_pcie_dump_regs(struct iwl_trans *trans); + + #ifdef CONFIG_IWLWIFI_DEBUGFS +diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c +index 90a46faaaffdf..57a11ee05bc36 100644 +--- a/drivers/net/wireless/intel/iwlwifi/pcie/rx.c ++++ b/drivers/net/wireless/intel/iwlwifi/pcie/rx.c +@@ -1781,7 +1781,7 @@ static u32 iwl_pcie_int_cause_ict(struct iwl_trans *trans) + return inta; + } + +-void iwl_pcie_handle_rfkill_irq(struct iwl_trans *trans) ++void iwl_pcie_handle_rfkill_irq(struct iwl_trans *trans, bool from_irq) + { + struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); + struct isr_statistics *isr_stats = &trans_pcie->isr_stats; +@@ -1805,7 +1805,7 @@ void iwl_pcie_handle_rfkill_irq(struct iwl_trans *trans) + isr_stats->rfkill++; + + if (prev != report) +- iwl_trans_pcie_rf_kill(trans, report); ++ iwl_trans_pcie_rf_kill(trans, report, from_irq); + mutex_unlock(&trans_pcie->mutex); + + if (hw_rfkill) { +@@ -1945,7 +1945,7 @@ irqreturn_t iwl_pcie_irq_handler(int irq, void *dev_id) + + /* HW RF KILL switch toggled */ + if (inta & CSR_INT_BIT_RF_KILL) { +- iwl_pcie_handle_rfkill_irq(trans); ++ iwl_pcie_handle_rfkill_irq(trans, true); + handled |= CSR_INT_BIT_RF_KILL; + } + +@@ -2362,7 +2362,7 @@ irqreturn_t iwl_pcie_irq_msix_handler(int irq, void *dev_id) + + /* HW RF KILL switch toggled */ + if (inta_hw & MSIX_HW_INT_CAUSES_REG_RF_KILL) +- iwl_pcie_handle_rfkill_irq(trans); ++ iwl_pcie_handle_rfkill_irq(trans, true); + + if (inta_hw & MSIX_HW_INT_CAUSES_REG_HW_ERR) { + IWL_ERR(trans, +diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c +index 796972f224326..c7ed35b3dd8d5 100644 +--- a/drivers/net/wireless/intel/iwlwifi/pcie/trans.c ++++ b/drivers/net/wireless/intel/iwlwifi/pcie/trans.c +@@ -1080,7 +1080,7 @@ bool iwl_pcie_check_hw_rf_kill(struct iwl_trans *trans) + report = test_bit(STATUS_RFKILL_OPMODE, &trans->status); + + if (prev != report) +- iwl_trans_pcie_rf_kill(trans, report); ++ iwl_trans_pcie_rf_kill(trans, 
report, false); + + return hw_rfkill; + } +@@ -1234,7 +1234,7 @@ static void iwl_pcie_init_msix(struct iwl_trans_pcie *trans_pcie) + trans_pcie->hw_mask = trans_pcie->hw_init_mask; + } + +-static void _iwl_trans_pcie_stop_device(struct iwl_trans *trans) ++static void _iwl_trans_pcie_stop_device(struct iwl_trans *trans, bool from_irq) + { + struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); + +@@ -1261,7 +1261,8 @@ static void _iwl_trans_pcie_stop_device(struct iwl_trans *trans) + if (test_and_clear_bit(STATUS_DEVICE_ENABLED, &trans->status)) { + IWL_DEBUG_INFO(trans, + "DEVICE_ENABLED bit was set and is now cleared\n"); +- iwl_pcie_synchronize_irqs(trans); ++ if (!from_irq) ++ iwl_pcie_synchronize_irqs(trans); + iwl_pcie_rx_napi_sync(trans); + iwl_pcie_tx_stop(trans); + iwl_pcie_rx_stop(trans); +@@ -1451,7 +1452,7 @@ void iwl_trans_pcie_handle_stop_rfkill(struct iwl_trans *trans, + clear_bit(STATUS_RFKILL_OPMODE, &trans->status); + } + if (hw_rfkill != was_in_rfkill) +- iwl_trans_pcie_rf_kill(trans, hw_rfkill); ++ iwl_trans_pcie_rf_kill(trans, hw_rfkill, false); + } + + static void iwl_trans_pcie_stop_device(struct iwl_trans *trans) +@@ -1466,12 +1467,12 @@ static void iwl_trans_pcie_stop_device(struct iwl_trans *trans) + mutex_lock(&trans_pcie->mutex); + trans_pcie->opmode_down = true; + was_in_rfkill = test_bit(STATUS_RFKILL_OPMODE, &trans->status); +- _iwl_trans_pcie_stop_device(trans); ++ _iwl_trans_pcie_stop_device(trans, false); + iwl_trans_pcie_handle_stop_rfkill(trans, was_in_rfkill); + mutex_unlock(&trans_pcie->mutex); + } + +-void iwl_trans_pcie_rf_kill(struct iwl_trans *trans, bool state) ++void iwl_trans_pcie_rf_kill(struct iwl_trans *trans, bool state, bool from_irq) + { + struct iwl_trans_pcie __maybe_unused *trans_pcie = + IWL_TRANS_GET_PCIE_TRANS(trans); +@@ -1484,7 +1485,7 @@ void iwl_trans_pcie_rf_kill(struct iwl_trans *trans, bool state) + if (trans->trans_cfg->gen2) + _iwl_trans_pcie_gen2_stop_device(trans); + else +- _iwl_trans_pcie_stop_device(trans); ++ _iwl_trans_pcie_stop_device(trans, from_irq); + } + } + +@@ -2815,7 +2816,7 @@ static ssize_t iwl_dbgfs_rfkill_write(struct file *file, + IWL_WARN(trans, "changing debug rfkill %d->%d\n", + trans_pcie->debug_rfkill, new_value); + trans_pcie->debug_rfkill = new_value; +- iwl_pcie_handle_rfkill_irq(trans); ++ iwl_pcie_handle_rfkill_irq(trans, false); + + return count; + } +-- +2.43.0 + diff --git a/queue-6.1/wifi-iwlwifi-yoyo-swap-cdb-and-jacket-bits-values.patch b/queue-6.1/wifi-iwlwifi-yoyo-swap-cdb-and-jacket-bits-values.patch new file mode 100644 index 00000000000..f493be62463 --- /dev/null +++ b/queue-6.1/wifi-iwlwifi-yoyo-swap-cdb-and-jacket-bits-values.patch @@ -0,0 +1,40 @@ +From 11af298a892bfd2816d2ccff3eaa3db927072f70 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 4 Oct 2023 12:36:22 +0300 +Subject: wifi: iwlwifi: yoyo: swap cdb and jacket bits values + +From: Rotem Saado + +[ Upstream commit 65008777b9dcd2002414ddb2c2158293a6e2fd6f ] + +The bits are wrong, the jacket bit should be 5 and cdb bit 4. +Fix it. 
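
For reference, the effect of the swap can be checked with a few lines of standalone C; the CFG1 value 0x10 below is a made-up example, not a value taken from hardware. After the fix, a word with only bit 4 set reports CDB rather than jacket.

/* Standalone check -- not driver code; cfg1 value is a made-up example. */
#include <stdio.h>

#define BIT(n)				(1U << (n))
#define WFPM_OTP_CFG1_IS_JACKET_BIT	BIT(5)	/* corrected: bit 5 */
#define WFPM_OTP_CFG1_IS_CDB_BIT	BIT(4)	/* corrected: bit 4 */

int main(void)
{
	unsigned int cfg1 = 0x10;	/* only bit 4 set */

	printf("jacket=%d cdb=%d\n",
	       !!(cfg1 & WFPM_OTP_CFG1_IS_JACKET_BIT),
	       !!(cfg1 & WFPM_OTP_CFG1_IS_CDB_BIT));
	return 0;
}
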
+ +Fixes: 1f171f4f1437 ("iwlwifi: Add support for getting rf id with blank otp") +Signed-off-by: Rotem Saado +Signed-off-by: Gregory Greenman +Link: https://lore.kernel.org/r/20231004123422.356d8dacda2f.I349ab888b43a11baa2453a1d6978a6a703e422f0@changeid +Signed-off-by: Johannes Berg +Signed-off-by: Sasha Levin +--- + drivers/net/wireless/intel/iwlwifi/iwl-prph.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-prph.h b/drivers/net/wireless/intel/iwlwifi/iwl-prph.h +index 157d1f31c4871..c5a306b01fe20 100644 +--- a/drivers/net/wireless/intel/iwlwifi/iwl-prph.h ++++ b/drivers/net/wireless/intel/iwlwifi/iwl-prph.h +@@ -348,8 +348,8 @@ + #define RFIC_REG_RD 0xAD0470 + #define WFPM_CTRL_REG 0xA03030 + #define WFPM_OTP_CFG1_ADDR 0x00a03098 +-#define WFPM_OTP_CFG1_IS_JACKET_BIT BIT(4) +-#define WFPM_OTP_CFG1_IS_CDB_BIT BIT(5) ++#define WFPM_OTP_CFG1_IS_JACKET_BIT BIT(5) ++#define WFPM_OTP_CFG1_IS_CDB_BIT BIT(4) + + #define WFPM_GP2 0xA030B4 + +-- +2.43.0 + -- 2.47.3