From ef17ce4ad0a3aac0464344b77a04dcdef9de492e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 19 Aug 2024 12:08:29 +0200 Subject: [PATCH] 6.10-stable patches added patches: acpi-ec-evaluate-_reg-outside-the-ec-scope-more-carefully.patch acpica-add-a-depth-argument-to-acpi_execute_reg_methods.patch alloc_tag-introduce-clear_page_tag_ref-helper-function.patch alloc_tag-mark-pages-reserved-during-cma-activation-as-not-tagged.patch alsa-hda-tas2781-fix-wrong-calibrated-data-order.patch alsa-timer-relax-start-tick-time-check-for-slave-timer-elements.patch alsa-usb-audio-add-delay-quirk-for-vivo-usb-c-xe710-headset.patch alsa-usb-audio-support-yamaha-p-125-quirk-entry.patch arm64-acpi-numa-initialize-all-values-of-acpi_early_node_map-to-numa_no_node.patch btrfs-check-delayed-refs-when-we-re-checking-if-a-ref-exists.patch btrfs-only-enable-extent-map-shrinker-for-debug-builds.patch btrfs-only-run-the-extent-map-shrinker-from-kswapd-tasks.patch btrfs-send-allow-cloning-non-aligned-extent-if-it-ends-at-i_size.patch btrfs-tree-checker-add-dev-extent-item-checks.patch btrfs-tree-checker-reject-btrfs_ft_unknown-dir-type.patch btrfs-zoned-properly-take-lock-to-read-update-block-group-s-zoned-variables.patch char-xillybus-check-usb-endpoints-when-probing-device.patch char-xillybus-don-t-destroy-workqueue-from-work-item-running-on-it.patch char-xillybus-refine-workqueue-handling.patch dm-persistent-data-fix-memory-allocation-failure.patch dm-resume-don-t-return-einval-when-signalled.patch drm-amdgpu-actually-check-flags-for-all-context-ops.patch fix-bitmap-corruption-on-close_range-with-close_range_unshare.patch fs-netfs-fscache_cookie-add-missing-n_accesses-check.patch i2c-qcom-geni-add-missing-geni_icc_disable-in-geni_i2c_runtime_resume.patch i2c-tegra-do-not-mark-acpi-devices-as-irq-safe.patch keys-trusted-dcp-fix-leak-of-blob-encryption-key.patch keys-trusted-fix-dcp-blob-payload-length-assignment.patch 
kvm-s390-fix-validity-interception-issue-when-gisa-is-switched-off.patch md-raid1-fix-data-corruption-for-degraded-array-with-slow-disk.patch media-atomisp-fix-streaming-no-longer-working-on-byt-isp2400-devices.patch memcg_write_event_control-fix-a-user-triggerable-oops.patch mm-fix-endless-reclaim-on-machines-with-unaccepted-memory.patch mm-hugetlb-fix-hugetlb-vs.-core-mm-pt-locking.patch mm-memory-failure-use-raw_spinlock_t-in-struct-memory_failure_cpu.patch mm-numa-no-task_numa_fault-call-if-pmd-is-changed.patch mm-numa-no-task_numa_fault-call-if-pte-is-changed.patch mm-vmalloc-fix-page-mapping-if-vm_area_alloc_pages-with-high-order-fallback-to-order-0.patch mseal-fix-is_madv_discard.patch net-mana-fix-doorbell-out-of-order-violation-and-avoid-unnecessary-doorbell-rings.patch net-mana-fix-rx-buf-alloc_size-alignment-and-atomic-op-panic.patch perf-bpf-don-t-call-bpf_overflow_handler-for-tracing-events.patch riscv-change-xip-s-kernel_map.size-to-be-size-of-the-entire-kernel.patch riscv-entry-always-initialize-regs-a0-to-enosys.patch rtla-osnoise-prevent-null-dereference-in-error-handling.patch s390-dasd-fix-error-recovery-leading-to-data-corruption-on-ese-devices.patch selftests-memfd_secret-don-t-build-memfd_secret-test-on-unsupported-arches.patch selinux-add-the-processing-of-the-failure-of-avc_add_xperms_decision.patch selinux-fix-potential-counting-error-in-avc_add_xperms_decision.patch smb3-fix-lock-breakage-for-cached-writes.patch thermal-gov_bang_bang-call-__thermal_cdev_update-directly.patch thunderbolt-mark-xdomain-as-unplugged-when-router-is-removed.patch tracing-return-from-tracing_buffers_read-if-the-file-has-been-closed.patch usb-misc-ljca-add-lunar-lake-ljca-gpio-hid-to-ljca_gpio_hids.patch usb-xhci-check-for-xhci-interrupters-being-allocated-in-xhci_mem_clearup.patch vfs-don-t-evict-inode-under-the-inode-lru-traversing-context.patch wifi-brcmfmac-cfg80211-handle-ssid-based-pmksa-deletion.patch 
xhci-fix-panther-point-null-pointer-deref-at-full-speed-re-enumeration.patch --- ...-outside-the-ec-scope-more-carefully.patch | 103 +++++++ ...argument-to-acpi_execute_reg_methods.patch | 135 ++++++++++ ...e-clear_page_tag_ref-helper-function.patch | 100 +++++++ ...-during-cma-activation-as-not-tagged.patch | 41 +++ ...2781-fix-wrong-calibrated-data-order.patch | 66 +++++ ...-time-check-for-slave-timer-elements.patch | 38 +++ ...y-quirk-for-vivo-usb-c-xe710-headset.patch | 32 +++ ...dio-support-yamaha-p-125-quirk-entry.patch | 33 +++ ...-acpi_early_node_map-to-numa_no_node.patch | 42 +++ ...-when-we-re-checking-if-a-ref-exists.patch | 253 ++++++++++++++++++ ...extent-map-shrinker-for-debug-builds.patch | 45 ++++ ...xtent-map-shrinker-from-kswapd-tasks.patch | 139 ++++++++++ ...-aligned-extent-if-it-ends-at-i_size.patch | 188 +++++++++++++ ...e-checker-add-dev-extent-item-checks.patch | 162 +++++++++++ ...ker-reject-btrfs_ft_unknown-dir-type.patch | 58 ++++ ...update-block-group-s-zoned-variables.patch | 82 ++++++ ...ck-usb-endpoints-when-probing-device.patch | 93 +++++++ ...rkqueue-from-work-item-running-on-it.patch | 83 ++++++ ...r-xillybus-refine-workqueue-handling.patch | 52 ++++ ...t-data-fix-memory-allocation-failure.patch | 45 ++++ ...e-don-t-return-einval-when-signalled.patch | 60 +++++ ...ally-check-flags-for-all-context-ops.patch | 50 ++++ ...close_range-with-close_range_unshare.patch | 184 +++++++++++++ ..._cookie-add-missing-n_accesses-check.patch | 114 ++++++++ ...c_disable-in-geni_i2c_runtime_resume.patch | 39 +++ ...do-not-mark-acpi-devices-as-irq-safe.patch | 58 ++++ ...-dcp-fix-leak-of-blob-encryption-key.patch | 141 ++++++++++ ...x-dcp-blob-payload-length-assignment.patch | 44 +++ ...tion-issue-when-gisa-is-switched-off.patch | 88 ++++++ ...on-for-degraded-array-with-slow-disk.patch | 93 +++++++ ...onger-working-on-byt-isp2400-devices.patch | 98 +++++++ ..._control-fix-a-user-triggerable-oops.patch | 39 +++ 
...m-on-machines-with-unaccepted-memory.patch | 157 +++++++++++ ...b-fix-hugetlb-vs.-core-mm-pt-locking.patch | 161 +++++++++++ ...nlock_t-in-struct-memory_failure_cpu.patch | 129 +++++++++ ...sk_numa_fault-call-if-pmd-is-changed.patch | 93 +++++++ ...sk_numa_fault-call-if-pte-is-changed.patch | 97 +++++++ ...-with-high-order-fallback-to-order-0.patch | 68 +++++ queue-6.10/mseal-fix-is_madv_discard.patch | 66 +++++ ...and-avoid-unnecessary-doorbell-rings.patch | 89 ++++++ ...c_size-alignment-and-atomic-op-panic.patch | 67 +++++ ..._overflow_handler-for-tracing-events.patch | 51 ++++ ...size-to-be-size-of-the-entire-kernel.patch | 50 ++++ ...-always-initialize-regs-a0-to-enosys.patch | 49 ++++ ...t-null-dereference-in-error-handling.patch | 52 ++++ ...ng-to-data-corruption-on-ese-devices.patch | 242 +++++++++++++++++ ...fd_secret-test-on-unsupported-arches.patch | 73 +++++ ...e-failure-of-avc_add_xperms_decision.patch | 39 +++ ...ing-error-in-avc_add_xperms_decision.patch | 38 +++ queue-6.10/series | 58 ++++ ...-fix-lock-breakage-for-cached-writes.patch | 66 +++++ ...-call-__thermal_cdev_update-directly.patch | 46 ++++ ...-as-unplugged-when-router-is-removed.patch | 39 +++ ...ers_read-if-the-file-has-been-closed.patch | 59 ++++ ...lake-ljca-gpio-hid-to-ljca_gpio_hids.patch | 32 +++ ...-being-allocated-in-xhci_mem_clearup.patch | 45 ++++ ...der-the-inode-lru-traversing-context.patch | 215 +++++++++++++++ ...211-handle-ssid-based-pmksa-deletion.patch | 49 ++++ ...r-deref-at-full-speed-re-enumeration.patch | 82 ++++++ 59 files changed, 5010 insertions(+) create mode 100644 queue-6.10/acpi-ec-evaluate-_reg-outside-the-ec-scope-more-carefully.patch create mode 100644 queue-6.10/acpica-add-a-depth-argument-to-acpi_execute_reg_methods.patch create mode 100644 queue-6.10/alloc_tag-introduce-clear_page_tag_ref-helper-function.patch create mode 100644 queue-6.10/alloc_tag-mark-pages-reserved-during-cma-activation-as-not-tagged.patch create mode 100644 
queue-6.10/alsa-hda-tas2781-fix-wrong-calibrated-data-order.patch create mode 100644 queue-6.10/alsa-timer-relax-start-tick-time-check-for-slave-timer-elements.patch create mode 100644 queue-6.10/alsa-usb-audio-add-delay-quirk-for-vivo-usb-c-xe710-headset.patch create mode 100644 queue-6.10/alsa-usb-audio-support-yamaha-p-125-quirk-entry.patch create mode 100644 queue-6.10/arm64-acpi-numa-initialize-all-values-of-acpi_early_node_map-to-numa_no_node.patch create mode 100644 queue-6.10/btrfs-check-delayed-refs-when-we-re-checking-if-a-ref-exists.patch create mode 100644 queue-6.10/btrfs-only-enable-extent-map-shrinker-for-debug-builds.patch create mode 100644 queue-6.10/btrfs-only-run-the-extent-map-shrinker-from-kswapd-tasks.patch create mode 100644 queue-6.10/btrfs-send-allow-cloning-non-aligned-extent-if-it-ends-at-i_size.patch create mode 100644 queue-6.10/btrfs-tree-checker-add-dev-extent-item-checks.patch create mode 100644 queue-6.10/btrfs-tree-checker-reject-btrfs_ft_unknown-dir-type.patch create mode 100644 queue-6.10/btrfs-zoned-properly-take-lock-to-read-update-block-group-s-zoned-variables.patch create mode 100644 queue-6.10/char-xillybus-check-usb-endpoints-when-probing-device.patch create mode 100644 queue-6.10/char-xillybus-don-t-destroy-workqueue-from-work-item-running-on-it.patch create mode 100644 queue-6.10/char-xillybus-refine-workqueue-handling.patch create mode 100644 queue-6.10/dm-persistent-data-fix-memory-allocation-failure.patch create mode 100644 queue-6.10/dm-resume-don-t-return-einval-when-signalled.patch create mode 100644 queue-6.10/drm-amdgpu-actually-check-flags-for-all-context-ops.patch create mode 100644 queue-6.10/fix-bitmap-corruption-on-close_range-with-close_range_unshare.patch create mode 100644 queue-6.10/fs-netfs-fscache_cookie-add-missing-n_accesses-check.patch create mode 100644 queue-6.10/i2c-qcom-geni-add-missing-geni_icc_disable-in-geni_i2c_runtime_resume.patch create mode 100644 
queue-6.10/i2c-tegra-do-not-mark-acpi-devices-as-irq-safe.patch create mode 100644 queue-6.10/keys-trusted-dcp-fix-leak-of-blob-encryption-key.patch create mode 100644 queue-6.10/keys-trusted-fix-dcp-blob-payload-length-assignment.patch create mode 100644 queue-6.10/kvm-s390-fix-validity-interception-issue-when-gisa-is-switched-off.patch create mode 100644 queue-6.10/md-raid1-fix-data-corruption-for-degraded-array-with-slow-disk.patch create mode 100644 queue-6.10/media-atomisp-fix-streaming-no-longer-working-on-byt-isp2400-devices.patch create mode 100644 queue-6.10/memcg_write_event_control-fix-a-user-triggerable-oops.patch create mode 100644 queue-6.10/mm-fix-endless-reclaim-on-machines-with-unaccepted-memory.patch create mode 100644 queue-6.10/mm-hugetlb-fix-hugetlb-vs.-core-mm-pt-locking.patch create mode 100644 queue-6.10/mm-memory-failure-use-raw_spinlock_t-in-struct-memory_failure_cpu.patch create mode 100644 queue-6.10/mm-numa-no-task_numa_fault-call-if-pmd-is-changed.patch create mode 100644 queue-6.10/mm-numa-no-task_numa_fault-call-if-pte-is-changed.patch create mode 100644 queue-6.10/mm-vmalloc-fix-page-mapping-if-vm_area_alloc_pages-with-high-order-fallback-to-order-0.patch create mode 100644 queue-6.10/mseal-fix-is_madv_discard.patch create mode 100644 queue-6.10/net-mana-fix-doorbell-out-of-order-violation-and-avoid-unnecessary-doorbell-rings.patch create mode 100644 queue-6.10/net-mana-fix-rx-buf-alloc_size-alignment-and-atomic-op-panic.patch create mode 100644 queue-6.10/perf-bpf-don-t-call-bpf_overflow_handler-for-tracing-events.patch create mode 100644 queue-6.10/riscv-change-xip-s-kernel_map.size-to-be-size-of-the-entire-kernel.patch create mode 100644 queue-6.10/riscv-entry-always-initialize-regs-a0-to-enosys.patch create mode 100644 queue-6.10/rtla-osnoise-prevent-null-dereference-in-error-handling.patch create mode 100644 queue-6.10/s390-dasd-fix-error-recovery-leading-to-data-corruption-on-ese-devices.patch create mode 100644 
queue-6.10/selftests-memfd_secret-don-t-build-memfd_secret-test-on-unsupported-arches.patch create mode 100644 queue-6.10/selinux-add-the-processing-of-the-failure-of-avc_add_xperms_decision.patch create mode 100644 queue-6.10/selinux-fix-potential-counting-error-in-avc_add_xperms_decision.patch create mode 100644 queue-6.10/smb3-fix-lock-breakage-for-cached-writes.patch create mode 100644 queue-6.10/thermal-gov_bang_bang-call-__thermal_cdev_update-directly.patch create mode 100644 queue-6.10/thunderbolt-mark-xdomain-as-unplugged-when-router-is-removed.patch create mode 100644 queue-6.10/tracing-return-from-tracing_buffers_read-if-the-file-has-been-closed.patch create mode 100644 queue-6.10/usb-misc-ljca-add-lunar-lake-ljca-gpio-hid-to-ljca_gpio_hids.patch create mode 100644 queue-6.10/usb-xhci-check-for-xhci-interrupters-being-allocated-in-xhci_mem_clearup.patch create mode 100644 queue-6.10/vfs-don-t-evict-inode-under-the-inode-lru-traversing-context.patch create mode 100644 queue-6.10/wifi-brcmfmac-cfg80211-handle-ssid-based-pmksa-deletion.patch create mode 100644 queue-6.10/xhci-fix-panther-point-null-pointer-deref-at-full-speed-re-enumeration.patch diff --git a/queue-6.10/acpi-ec-evaluate-_reg-outside-the-ec-scope-more-carefully.patch b/queue-6.10/acpi-ec-evaluate-_reg-outside-the-ec-scope-more-carefully.patch new file mode 100644 index 00000000000..bbeb125796b --- /dev/null +++ b/queue-6.10/acpi-ec-evaluate-_reg-outside-the-ec-scope-more-carefully.patch @@ -0,0 +1,103 @@ +From 71bf41b8e913ec9fc91f0d39ab8fb320229ec604 Mon Sep 17 00:00:00 2001 +From: "Rafael J. Wysocki" +Date: Mon, 12 Aug 2024 15:16:21 +0200 +Subject: ACPI: EC: Evaluate _REG outside the EC scope more carefully + +From: Rafael J. Wysocki + +commit 71bf41b8e913ec9fc91f0d39ab8fb320229ec604 upstream. 
+ +Commit 60fa6ae6e6d0 ("ACPI: EC: Install address space handler at the +namespace root") caused _REG methods for EC operation regions outside +the EC device scope to be evaluated which on some systems leads to the +evaluation of _REG methods in the scopes of device objects representing +devices that are not present and not functional according to the _STA +return values. Some of those device objects represent EC "alternatives" +and if _REG is evaluated for their operation regions, the platform +firmware may be confused and the platform may start to behave +incorrectly. + +To avoid this problem, only evaluate _REG for EC operation regions +located in the scopes of device objects representing known-to-be-present +devices. + +For this purpose, partially revert commit 60fa6ae6e6d0 and trigger the +evaluation of _REG for EC operation regions from acpi_bus_attach() for +the known-valid devices. + +Fixes: 60fa6ae6e6d0 ("ACPI: EC: Install address space handler at the namespace root") +Link: https://lore.kernel.org/linux-acpi/1f76b7e2-1928-4598-8037-28a1785c2d13@redhat.com +Link: https://bugzilla.redhat.com/show_bug.cgi?id=2298938 +Link: https://bugzilla.redhat.com/show_bug.cgi?id=2302253 +Reported-by: Hans de Goede +Signed-off-by: Rafael J. Wysocki +Reviewed-by: Hans de Goede +Cc: All applicable +Link: https://patch.msgid.link/23612351.6Emhk5qWAg@rjwysocki.net +Signed-off-by: Greg Kroah-Hartman +--- + drivers/acpi/ec.c | 11 +++++++++-- + drivers/acpi/internal.h | 1 + + drivers/acpi/scan.c | 2 ++ + 3 files changed, 12 insertions(+), 2 deletions(-) + +--- a/drivers/acpi/ec.c ++++ b/drivers/acpi/ec.c +@@ -1487,12 +1487,13 @@ static bool install_gpio_irq_event_handl + static int ec_install_handlers(struct acpi_ec *ec, struct acpi_device *device, + bool call_reg) + { +- acpi_handle scope_handle = ec == first_ec ? 
ACPI_ROOT_OBJECT : ec->handle; + acpi_status status; + + acpi_ec_start(ec, false); + + if (!test_bit(EC_FLAGS_EC_HANDLER_INSTALLED, &ec->flags)) { ++ acpi_handle scope_handle = ec == first_ec ? ACPI_ROOT_OBJECT : ec->handle; ++ + acpi_ec_enter_noirq(ec); + status = acpi_install_address_space_handler_no_reg(scope_handle, + ACPI_ADR_SPACE_EC, +@@ -1506,7 +1507,7 @@ static int ec_install_handlers(struct ac + } + + if (call_reg && !test_bit(EC_FLAGS_EC_REG_CALLED, &ec->flags)) { +- acpi_execute_reg_methods(scope_handle, ACPI_UINT32_MAX, ACPI_ADR_SPACE_EC); ++ acpi_execute_reg_methods(ec->handle, ACPI_UINT32_MAX, ACPI_ADR_SPACE_EC); + set_bit(EC_FLAGS_EC_REG_CALLED, &ec->flags); + } + +@@ -1721,6 +1722,12 @@ static void acpi_ec_remove(struct acpi_d + } + } + ++void acpi_ec_register_opregions(struct acpi_device *adev) ++{ ++ if (first_ec && first_ec->handle != adev->handle) ++ acpi_execute_reg_methods(adev->handle, 1, ACPI_ADR_SPACE_EC); ++} ++ + static acpi_status + ec_parse_io_ports(struct acpi_resource *resource, void *context) + { +--- a/drivers/acpi/internal.h ++++ b/drivers/acpi/internal.h +@@ -223,6 +223,7 @@ int acpi_ec_add_query_handler(struct acp + acpi_handle handle, acpi_ec_query_func func, + void *data); + void acpi_ec_remove_query_handler(struct acpi_ec *ec, u8 query_bit); ++void acpi_ec_register_opregions(struct acpi_device *adev); + + #ifdef CONFIG_PM_SLEEP + void acpi_ec_flush_work(void); +--- a/drivers/acpi/scan.c ++++ b/drivers/acpi/scan.c +@@ -2264,6 +2264,8 @@ static int acpi_bus_attach(struct acpi_d + if (device->handler) + goto ok; + ++ acpi_ec_register_opregions(device); ++ + if (!device->flags.initialized) { + device->flags.power_manageable = + device->power.states[ACPI_STATE_D0].flags.valid; diff --git a/queue-6.10/acpica-add-a-depth-argument-to-acpi_execute_reg_methods.patch b/queue-6.10/acpica-add-a-depth-argument-to-acpi_execute_reg_methods.patch new file mode 100644 index 00000000000..e43edaedbeb --- /dev/null +++ 
b/queue-6.10/acpica-add-a-depth-argument-to-acpi_execute_reg_methods.patch @@ -0,0 +1,135 @@ +From cdf65d73e001fde600b18d7e45afadf559425ce5 Mon Sep 17 00:00:00 2001 +From: "Rafael J. Wysocki" +Date: Mon, 12 Aug 2024 15:11:42 +0200 +Subject: ACPICA: Add a depth argument to acpi_execute_reg_methods() + +From: Rafael J. Wysocki + +commit cdf65d73e001fde600b18d7e45afadf559425ce5 upstream. + +A subsequent change will need to pass a depth argument to +acpi_execute_reg_methods(), so prepare that function for it. + +No intentional functional changes. + +Signed-off-by: Rafael J. Wysocki +Reviewed-by: Hans de Goede +Cc: All applicable +Link: https://patch.msgid.link/8451567.NyiUUSuA9g@rjwysocki.net +Signed-off-by: Greg Kroah-Hartman +--- + drivers/acpi/acpica/acevents.h | 2 +- + drivers/acpi/acpica/evregion.c | 6 ++++-- + drivers/acpi/acpica/evxfregn.c | 10 +++++++--- + drivers/acpi/ec.c | 2 +- + include/acpi/acpixf.h | 1 + + 5 files changed, 14 insertions(+), 7 deletions(-) + +--- a/drivers/acpi/acpica/acevents.h ++++ b/drivers/acpi/acpica/acevents.h +@@ -188,7 +188,7 @@ acpi_ev_detach_region(union acpi_operand + u8 acpi_ns_is_locked); + + void +-acpi_ev_execute_reg_methods(struct acpi_namespace_node *node, ++acpi_ev_execute_reg_methods(struct acpi_namespace_node *node, u32 max_depth, + acpi_adr_space_type space_id, u32 function); + + acpi_status +--- a/drivers/acpi/acpica/evregion.c ++++ b/drivers/acpi/acpica/evregion.c +@@ -65,6 +65,7 @@ acpi_status acpi_ev_initialize_op_region + acpi_gbl_default_address_spaces + [i])) { + acpi_ev_execute_reg_methods(acpi_gbl_root_node, ++ ACPI_UINT32_MAX, + acpi_gbl_default_address_spaces + [i], ACPI_REG_CONNECT); + } +@@ -672,6 +673,7 @@ cleanup1: + * FUNCTION: acpi_ev_execute_reg_methods + * + * PARAMETERS: node - Namespace node for the device ++ * max_depth - Depth to which search for _REG + * space_id - The address space ID + * function - Passed to _REG: On (1) or Off (0) + * +@@ -683,7 +685,7 @@ cleanup1: + 
******************************************************************************/ + + void +-acpi_ev_execute_reg_methods(struct acpi_namespace_node *node, ++acpi_ev_execute_reg_methods(struct acpi_namespace_node *node, u32 max_depth, + acpi_adr_space_type space_id, u32 function) + { + struct acpi_reg_walk_info info; +@@ -717,7 +719,7 @@ acpi_ev_execute_reg_methods(struct acpi_ + * regions and _REG methods. (i.e. handlers must be installed for all + * regions of this Space ID before we can run any _REG methods) + */ +- (void)acpi_ns_walk_namespace(ACPI_TYPE_ANY, node, ACPI_UINT32_MAX, ++ (void)acpi_ns_walk_namespace(ACPI_TYPE_ANY, node, max_depth, + ACPI_NS_WALK_UNLOCK, acpi_ev_reg_run, NULL, + &info, NULL); + +--- a/drivers/acpi/acpica/evxfregn.c ++++ b/drivers/acpi/acpica/evxfregn.c +@@ -85,7 +85,8 @@ acpi_install_address_space_handler_inter + /* Run all _REG methods for this address space */ + + if (run_reg) { +- acpi_ev_execute_reg_methods(node, space_id, ACPI_REG_CONNECT); ++ acpi_ev_execute_reg_methods(node, ACPI_UINT32_MAX, space_id, ++ ACPI_REG_CONNECT); + } + + unlock_and_exit: +@@ -263,6 +264,7 @@ ACPI_EXPORT_SYMBOL(acpi_remove_address_s + * FUNCTION: acpi_execute_reg_methods + * + * PARAMETERS: device - Handle for the device ++ * max_depth - Depth to which search for _REG + * space_id - The address space ID + * + * RETURN: Status +@@ -271,7 +273,8 @@ ACPI_EXPORT_SYMBOL(acpi_remove_address_s + * + ******************************************************************************/ + acpi_status +-acpi_execute_reg_methods(acpi_handle device, acpi_adr_space_type space_id) ++acpi_execute_reg_methods(acpi_handle device, u32 max_depth, ++ acpi_adr_space_type space_id) + { + struct acpi_namespace_node *node; + acpi_status status; +@@ -296,7 +299,8 @@ acpi_execute_reg_methods(acpi_handle dev + + /* Run all _REG methods for this address space */ + +- acpi_ev_execute_reg_methods(node, space_id, ACPI_REG_CONNECT); ++ acpi_ev_execute_reg_methods(node, max_depth, space_id, 
++ ACPI_REG_CONNECT); + } else { + status = AE_BAD_PARAMETER; + } +--- a/drivers/acpi/ec.c ++++ b/drivers/acpi/ec.c +@@ -1506,7 +1506,7 @@ static int ec_install_handlers(struct ac + } + + if (call_reg && !test_bit(EC_FLAGS_EC_REG_CALLED, &ec->flags)) { +- acpi_execute_reg_methods(scope_handle, ACPI_ADR_SPACE_EC); ++ acpi_execute_reg_methods(scope_handle, ACPI_UINT32_MAX, ACPI_ADR_SPACE_EC); + set_bit(EC_FLAGS_EC_REG_CALLED, &ec->flags); + } + +--- a/include/acpi/acpixf.h ++++ b/include/acpi/acpixf.h +@@ -660,6 +660,7 @@ ACPI_EXTERNAL_RETURN_STATUS(acpi_status + void *context)) + ACPI_EXTERNAL_RETURN_STATUS(acpi_status + acpi_execute_reg_methods(acpi_handle device, ++ u32 nax_depth, + acpi_adr_space_type + space_id)) + ACPI_EXTERNAL_RETURN_STATUS(acpi_status diff --git a/queue-6.10/alloc_tag-introduce-clear_page_tag_ref-helper-function.patch b/queue-6.10/alloc_tag-introduce-clear_page_tag_ref-helper-function.patch new file mode 100644 index 00000000000..1546afed8e6 --- /dev/null +++ b/queue-6.10/alloc_tag-introduce-clear_page_tag_ref-helper-function.patch @@ -0,0 +1,100 @@ +From a8fc28dad6d574582cdf2f7e78c73c59c623df30 Mon Sep 17 00:00:00 2001 +From: Suren Baghdasaryan +Date: Tue, 13 Aug 2024 08:07:56 -0700 +Subject: alloc_tag: introduce clear_page_tag_ref() helper function + +From: Suren Baghdasaryan + +commit a8fc28dad6d574582cdf2f7e78c73c59c623df30 upstream. + +In several cases we are freeing pages which were not allocated using +common page allocators. For such cases, in order to keep allocation +accounting correct, we should clear the page tag to indicate that the page +being freed is expected to not have a valid allocation tag. Introduce +clear_page_tag_ref() helper function to be used for this. 
+ +Link: https://lkml.kernel.org/r/20240813150758.855881-1-surenb@google.com +Fixes: d224eb0287fb ("codetag: debug: mark codetags for reserved pages as empty") +Signed-off-by: Suren Baghdasaryan +Suggested-by: David Hildenbrand +Acked-by: David Hildenbrand +Reviewed-by: Pasha Tatashin +Cc: Kees Cook +Cc: Kent Overstreet +Cc: Sourav Panda +Cc: Vlastimil Babka +Cc: [6.10] +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/pgalloc_tag.h | 13 +++++++++++++ + mm/mm_init.c | 10 +--------- + mm/page_alloc.c | 9 +-------- + 3 files changed, 15 insertions(+), 17 deletions(-) + +--- a/include/linux/pgalloc_tag.h ++++ b/include/linux/pgalloc_tag.h +@@ -43,6 +43,18 @@ static inline void put_page_tag_ref(unio + page_ext_put(page_ext_from_codetag_ref(ref)); + } + ++static inline void clear_page_tag_ref(struct page *page) ++{ ++ if (mem_alloc_profiling_enabled()) { ++ union codetag_ref *ref = get_page_tag_ref(page); ++ ++ if (ref) { ++ set_codetag_empty(ref); ++ put_page_tag_ref(ref); ++ } ++ } ++} ++ + static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, + unsigned int nr) + { +@@ -126,6 +138,7 @@ static inline void pgalloc_tag_sub_pages + + static inline union codetag_ref *get_page_tag_ref(struct page *page) { return NULL; } + static inline void put_page_tag_ref(union codetag_ref *ref) {} ++static inline void clear_page_tag_ref(struct page *page) {} + static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, + unsigned int nr) {} + static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} +--- a/mm/mm_init.c ++++ b/mm/mm_init.c +@@ -2507,15 +2507,7 @@ void __init memblock_free_pages(struct p + } + + /* pages were reserved and not allocated */ +- if (mem_alloc_profiling_enabled()) { +- union codetag_ref *ref = get_page_tag_ref(page); +- +- if (ref) { +- set_codetag_empty(ref); +- put_page_tag_ref(ref); +- } +- } +- ++ clear_page_tag_ref(page); + __free_pages_core(page, order); + } 
+ +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -5806,14 +5806,7 @@ unsigned long free_reserved_area(void *s + + void free_reserved_page(struct page *page) + { +- if (mem_alloc_profiling_enabled()) { +- union codetag_ref *ref = get_page_tag_ref(page); +- +- if (ref) { +- set_codetag_empty(ref); +- put_page_tag_ref(ref); +- } +- } ++ clear_page_tag_ref(page); + ClearPageReserved(page); + init_page_count(page); + __free_page(page); diff --git a/queue-6.10/alloc_tag-mark-pages-reserved-during-cma-activation-as-not-tagged.patch b/queue-6.10/alloc_tag-mark-pages-reserved-during-cma-activation-as-not-tagged.patch new file mode 100644 index 00000000000..7dbf30b7e7f --- /dev/null +++ b/queue-6.10/alloc_tag-mark-pages-reserved-during-cma-activation-as-not-tagged.patch @@ -0,0 +1,41 @@ +From 766c163c2068b45330664fb67df67268e588a22d Mon Sep 17 00:00:00 2001 +From: Suren Baghdasaryan +Date: Tue, 13 Aug 2024 08:07:57 -0700 +Subject: alloc_tag: mark pages reserved during CMA activation as not tagged + +From: Suren Baghdasaryan + +commit 766c163c2068b45330664fb67df67268e588a22d upstream. + +During CMA activation, pages in CMA area are prepared and then freed +without being allocated. This triggers warnings when memory allocation +debug config (CONFIG_MEM_ALLOC_PROFILING_DEBUG) is enabled. Fix this by +marking these pages not tagged before freeing them. 
+ +Link: https://lkml.kernel.org/r/20240813150758.855881-2-surenb@google.com +Fixes: d224eb0287fb ("codetag: debug: mark codetags for reserved pages as empty") +Signed-off-by: Suren Baghdasaryan +Acked-by: David Hildenbrand +Cc: Kees Cook +Cc: Kent Overstreet +Cc: Pasha Tatashin +Cc: Sourav Panda +Cc: Vlastimil Babka +Cc: [6.10] +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/mm_init.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/mm/mm_init.c ++++ b/mm/mm_init.c +@@ -2293,6 +2293,8 @@ void __init init_cma_reserved_pageblock( + + set_pageblock_migratetype(page, MIGRATE_CMA); + set_page_refcounted(page); ++ /* pages were reserved and not allocated */ ++ clear_page_tag_ref(page); + __free_pages(page, pageblock_order); + + adjust_managed_page_count(page, pageblock_nr_pages); diff --git a/queue-6.10/alsa-hda-tas2781-fix-wrong-calibrated-data-order.patch b/queue-6.10/alsa-hda-tas2781-fix-wrong-calibrated-data-order.patch new file mode 100644 index 00000000000..fde0e8f16dd --- /dev/null +++ b/queue-6.10/alsa-hda-tas2781-fix-wrong-calibrated-data-order.patch @@ -0,0 +1,66 @@ +From 3beddef84d90590270465a907de1cfe2539ac70d Mon Sep 17 00:00:00 2001 +From: Baojun Xu +Date: Tue, 13 Aug 2024 12:37:48 +0800 +Subject: ALSA: hda/tas2781: fix wrong calibrated data order + +From: Baojun Xu + +commit 3beddef84d90590270465a907de1cfe2539ac70d upstream. + +Wrong calibration data order causes sound too low in some devices. +Fix wrong calibrated data order, add calibration data conversion +by get_unaligned_be32() after reading from UEFI. 
+ +Fixes: 5be27f1e3ec9 ("ALSA: hda/tas2781: Add tas2781 HDA driver") +Cc: +Signed-off-by: Baojun Xu +Link: https://patch.msgid.link/20240813043749.108-1-shenghao-ding@ti.com +Signed-off-by: Takashi Iwai +Signed-off-by: Greg Kroah-Hartman +--- + sound/pci/hda/tas2781_hda_i2c.c | 14 +++++++++----- + 1 file changed, 9 insertions(+), 5 deletions(-) + +--- a/sound/pci/hda/tas2781_hda_i2c.c ++++ b/sound/pci/hda/tas2781_hda_i2c.c +@@ -2,10 +2,12 @@ + // + // TAS2781 HDA I2C driver + // +-// Copyright 2023 Texas Instruments, Inc. ++// Copyright 2023 - 2024 Texas Instruments, Inc. + // + // Author: Shenghao Ding ++// Current maintainer: Baojun Xu + ++#include + #include + #include + #include +@@ -519,20 +521,22 @@ static void tas2781_apply_calib(struct t + static const unsigned char rgno_array[CALIB_MAX] = { + 0x74, 0x0c, 0x14, 0x70, 0x7c, + }; +- unsigned char *data; ++ int offset = 0; + int i, j, rc; ++ __be32 data; + + for (i = 0; i < tas_priv->ndev; i++) { +- data = tas_priv->cali_data.data + +- i * TASDEVICE_SPEAKER_CALIBRATION_SIZE; + for (j = 0; j < CALIB_MAX; j++) { ++ data = get_unaligned_be32( ++ &tas_priv->cali_data.data[offset]); + rc = tasdevice_dev_bulk_write(tas_priv, i, + TASDEVICE_REG(0, page_array[j], rgno_array[j]), +- &(data[4 * j]), 4); ++ (unsigned char *)&data, 4); + if (rc < 0) + dev_err(tas_priv->dev, + "chn %d calib %d bulk_wr err = %d\n", + i, j, rc); ++ offset += 4; + } + } + } diff --git a/queue-6.10/alsa-timer-relax-start-tick-time-check-for-slave-timer-elements.patch b/queue-6.10/alsa-timer-relax-start-tick-time-check-for-slave-timer-elements.patch new file mode 100644 index 00000000000..c9d6c57ea5a --- /dev/null +++ b/queue-6.10/alsa-timer-relax-start-tick-time-check-for-slave-timer-elements.patch @@ -0,0 +1,38 @@ +From ccbfcac05866ebe6eb3bc6d07b51d4ed4fcde436 Mon Sep 17 00:00:00 2001 +From: Takashi Iwai +Date: Sat, 10 Aug 2024 10:48:32 +0200 +Subject: ALSA: timer: Relax start tick time check for slave timer elements + +From: Takashi Iwai + 
+commit ccbfcac05866ebe6eb3bc6d07b51d4ed4fcde436 upstream. + +The recent addition of a sanity check for a too low start tick time +seems to break some applications that use aloop with a certain slave +timer setup. They may have the initial resolution 0, hence it's +treated as if it were a too low value. + +Relax and skip the check for the slave timer instance for addressing +the regression. + +Fixes: 4a63bd179fa8 ("ALSA: timer: Set lower bound of start tick time") +Cc: +Link: https://github.com/raspberrypi/linux/issues/6294 +Link: https://patch.msgid.link/20240810084833.10939-1-tiwai@suse.de +Signed-off-by: Takashi Iwai +Signed-off-by: Greg Kroah-Hartman +--- + sound/core/timer.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/sound/core/timer.c ++++ b/sound/core/timer.c +@@ -547,7 +547,7 @@ static int snd_timer_start1(struct snd_t + /* check the actual time for the start tick; + * bail out as error if it's way too low (< 100us) + */ +- if (start) { ++ if (start && !(timer->hw.flags & SNDRV_TIMER_HW_SLAVE)) { + if ((u64)snd_timer_hw_resolution(timer) * ticks < 100000) + return -EINVAL; + } diff --git a/queue-6.10/alsa-usb-audio-add-delay-quirk-for-vivo-usb-c-xe710-headset.patch b/queue-6.10/alsa-usb-audio-add-delay-quirk-for-vivo-usb-c-xe710-headset.patch new file mode 100644 index 00000000000..979e48506cf --- /dev/null +++ b/queue-6.10/alsa-usb-audio-add-delay-quirk-for-vivo-usb-c-xe710-headset.patch @@ -0,0 +1,32 @@ +From 004eb8ba776ccd3e296ea6f78f7ae7985b12824e Mon Sep 17 00:00:00 2001 +From: Lianqin Hu +Date: Sun, 11 Aug 2024 08:30:11 +0000 +Subject: ALSA: usb-audio: Add delay quirk for VIVO USB-C-XE710 HEADSET + +From: Lianqin Hu + +commit 004eb8ba776ccd3e296ea6f78f7ae7985b12824e upstream. + +Audio control requests that set the sampling frequency sometimes fail on +this card. Adding a delay between control messages eliminates that problem. 
+ +Signed-off-by: Lianqin Hu +Cc: +Link: https://patch.msgid.link/TYUPR06MB6217FF67076AF3E49E12C877D2842@TYUPR06MB6217.apcprd06.prod.outlook.com +Signed-off-by: Takashi Iwai +Signed-off-by: Greg Kroah-Hartman +--- + sound/usb/quirks.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/sound/usb/quirks.c ++++ b/sound/usb/quirks.c +@@ -2221,6 +2221,8 @@ static const struct usb_audio_quirk_flag + QUIRK_FLAG_GENERIC_IMPLICIT_FB), + DEVICE_FLG(0x2b53, 0x0031, /* Fiero SC-01 (firmware v1.1.0) */ + QUIRK_FLAG_GENERIC_IMPLICIT_FB), ++ DEVICE_FLG(0x2d95, 0x8021, /* VIVO USB-C-XE710 HEADSET */ ++ QUIRK_FLAG_CTL_MSG_DELAY_1M), + DEVICE_FLG(0x30be, 0x0101, /* Schiit Hel */ + QUIRK_FLAG_IGNORE_CTL_ERROR), + DEVICE_FLG(0x413c, 0xa506, /* Dell AE515 sound bar */ diff --git a/queue-6.10/alsa-usb-audio-support-yamaha-p-125-quirk-entry.patch b/queue-6.10/alsa-usb-audio-support-yamaha-p-125-quirk-entry.patch new file mode 100644 index 00000000000..21d1842e5eb --- /dev/null +++ b/queue-6.10/alsa-usb-audio-support-yamaha-p-125-quirk-entry.patch @@ -0,0 +1,33 @@ +From c286f204ce6ba7b48e3dcba53eda7df8eaa64dd9 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Juan=20Jos=C3=A9=20Arboleda?= +Date: Tue, 13 Aug 2024 11:10:53 -0500 +Subject: ALSA: usb-audio: Support Yamaha P-125 quirk entry +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Juan José Arboleda + +commit c286f204ce6ba7b48e3dcba53eda7df8eaa64dd9 upstream. + +This patch adds a USB quirk for the Yamaha P-125 digital piano. 
+ +Signed-off-by: Juan José Arboleda +Cc: +Link: https://patch.msgid.link/20240813161053.70256-1-soyjuanarbol@gmail.com +Signed-off-by: Takashi Iwai +Signed-off-by: Greg Kroah-Hartman +--- + sound/usb/quirks-table.h | 1 + + 1 file changed, 1 insertion(+) + +--- a/sound/usb/quirks-table.h ++++ b/sound/usb/quirks-table.h +@@ -273,6 +273,7 @@ YAMAHA_DEVICE(0x105a, NULL), + YAMAHA_DEVICE(0x105b, NULL), + YAMAHA_DEVICE(0x105c, NULL), + YAMAHA_DEVICE(0x105d, NULL), ++YAMAHA_DEVICE(0x1718, "P-125"), + { + USB_DEVICE(0x0499, 0x1503), + .driver_info = (unsigned long) & (const struct snd_usb_audio_quirk) { diff --git a/queue-6.10/arm64-acpi-numa-initialize-all-values-of-acpi_early_node_map-to-numa_no_node.patch b/queue-6.10/arm64-acpi-numa-initialize-all-values-of-acpi_early_node_map-to-numa_no_node.patch new file mode 100644 index 00000000000..6d20bbd2495 --- /dev/null +++ b/queue-6.10/arm64-acpi-numa-initialize-all-values-of-acpi_early_node_map-to-numa_no_node.patch @@ -0,0 +1,42 @@ +From a21dcf0ea8566ebbe011c79d6ed08cdfea771de3 Mon Sep 17 00:00:00 2001 +From: Haibo Xu +Date: Mon, 5 Aug 2024 11:30:24 +0800 +Subject: arm64: ACPI: NUMA: initialize all values of acpi_early_node_map to NUMA_NO_NODE + +From: Haibo Xu + +commit a21dcf0ea8566ebbe011c79d6ed08cdfea771de3 upstream. + +Currently, only acpi_early_node_map[0] was initialized to NUMA_NO_NODE. +To ensure all the values were properly initialized, switch to initialize +all of them to NUMA_NO_NODE. 
+ +Fixes: e18962491696 ("arm64: numa: rework ACPI NUMA initialization") +Cc: # 4.19.x +Reported-by: Andrew Jones +Suggested-by: Andrew Jones +Signed-off-by: Haibo Xu +Reviewed-by: Anshuman Khandual +Reviewed-by: Sunil V L +Reviewed-by: Andrew Jones +Acked-by: Catalin Marinas +Acked-by: Lorenzo Pieralisi +Reviewed-by: Hanjun Guo +Link: https://lore.kernel.org/r/853d7f74aa243f6f5999e203246f0d1ae92d2b61.1722828421.git.haibo1.xu@intel.com +Signed-off-by: Catalin Marinas +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/kernel/acpi_numa.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/arm64/kernel/acpi_numa.c ++++ b/arch/arm64/kernel/acpi_numa.c +@@ -27,7 +27,7 @@ + + #include + +-static int acpi_early_node_map[NR_CPUS] __initdata = { NUMA_NO_NODE }; ++static int acpi_early_node_map[NR_CPUS] __initdata = { [0 ... NR_CPUS - 1] = NUMA_NO_NODE }; + + int __init acpi_numa_get_nid(unsigned int cpu) + { diff --git a/queue-6.10/btrfs-check-delayed-refs-when-we-re-checking-if-a-ref-exists.patch b/queue-6.10/btrfs-check-delayed-refs-when-we-re-checking-if-a-ref-exists.patch new file mode 100644 index 00000000000..1c6abb4c061 --- /dev/null +++ b/queue-6.10/btrfs-check-delayed-refs-when-we-re-checking-if-a-ref-exists.patch @@ -0,0 +1,253 @@ +From 42fac187b5c746227c92d024f1caf33bc1d337e4 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Thu, 11 Apr 2024 16:41:20 -0400 +Subject: btrfs: check delayed refs when we're checking if a ref exists + +From: Josef Bacik + +commit 42fac187b5c746227c92d024f1caf33bc1d337e4 upstream. + +In the patch 78c52d9eb6b7 ("btrfs: check for refs on snapshot delete +resume") I added some code to handle file systems that had been +corrupted by a bug that incorrectly skipped updating the drop progress +key while dropping a snapshot. This code would check to see if we had +already deleted our reference for a child block, and skip the deletion +if we had already. 
+ +Unfortunately there is a bug, as the check would only check the on-disk +references. I made an incorrect assumption that blocks in an already +deleted snapshot that was having the deletion resume on mount wouldn't +be modified. + +If we have 2 pending deleted snapshots that share blocks, we can easily +modify the rules for a block. Take the following example + +subvolume a exists, and subvolume b is a snapshot of subvolume a. They +share references to block 1. Block 1 will have 2 full references, one +for subvolume a and one for subvolume b, and it belongs to subvolume a +(btrfs_header_owner(block 1) == subvolume a). + +When deleting subvolume a, we will drop our full reference for block 1, +and because we are the owner we will drop our full reference for all of +block 1's children, convert block 1 to FULL BACKREF, and add a shared +reference to all of block 1's children. + +Then we will start the snapshot deletion of subvolume b. We look up the +extent info for block 1, which checks delayed refs and tells us that +FULL BACKREF is set, so sets parent to the bytenr of block 1. However +because this is a resumed snapshot deletion, we call into +check_ref_exists(). Because check_ref_exists() only looks at the disk, +it doesn't find the shared backref for the child of block 1, and thus +returns 0 and we skip deleting the reference for the child of block 1 +and continue. This orphans the child of block 1. + +The fix is to lookup the delayed refs, similar to what we do in +btrfs_lookup_extent_info(). However we only care about whether the +reference exists or not. If we fail to find our reference on disk, go +look up the bytenr in the delayed refs, and if it exists look for an +existing ref in the delayed ref head. If that exists then we know we +can delete the reference safely and carry on. If it doesn't exist we +know we have to skip over this block. 
+ +This bug has existed since I introduced this fix, however requires +having multiple deleted snapshots pending when we unmount. We noticed +this in production because our shutdown path stops the container on the +system, which deletes a bunch of subvolumes, and then reboots the box. +This gives us plenty of opportunities to hit this issue. Looking at the +history we've seen this occasionally in production, but we had a big +spike recently thanks to faster machines getting jobs with multiple +subvolumes in the job. + +Chris Mason wrote a reproducer which does the following + +mount /dev/nvme4n1 /btrfs +btrfs subvol create /btrfs/s1 +simoop -E -f 4k -n 200000 -z /btrfs/s1 +while(true) ; do + btrfs subvol snap /btrfs/s1 /btrfs/s2 + simoop -f 4k -n 200000 -r 10 -z /btrfs/s2 + btrfs subvol snap /btrfs/s2 /btrfs/s3 + btrfs balance start -dusage=80 /btrfs + btrfs subvol del /btrfs/s2 /btrfs/s3 + umount /btrfs + btrfsck /dev/nvme4n1 || exit 1 + mount /dev/nvme4n1 /btrfs +done + +On the second loop this would fail consistently, with my patch it has +been running for hours and hasn't failed. + +I also used dm-log-writes to capture the state of the failure so I could +debug the problem. Using the existing failure case to test my patch +validated that it fixes the problem. 
+ +Fixes: 78c52d9eb6b7 ("btrfs: check for refs on snapshot delete resume") +CC: stable@vger.kernel.org # 5.4+ +Reviewed-by: Filipe Manana +Signed-off-by: Josef Bacik +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/delayed-ref.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++ + fs/btrfs/delayed-ref.h | 2 + + fs/btrfs/extent-tree.c | 51 ++++++++++++++++++++++++++++++++----- + 3 files changed, 114 insertions(+), 6 deletions(-) + +--- a/fs/btrfs/delayed-ref.c ++++ b/fs/btrfs/delayed-ref.c +@@ -1169,6 +1169,73 @@ btrfs_find_delayed_ref_head(struct btrfs + return find_ref_head(delayed_refs, bytenr, false); + } + ++static int find_comp(struct btrfs_delayed_ref_node *entry, u64 root, u64 parent) ++{ ++ int type = parent ? BTRFS_SHARED_BLOCK_REF_KEY : BTRFS_TREE_BLOCK_REF_KEY; ++ ++ if (type < entry->type) ++ return -1; ++ if (type > entry->type) ++ return 1; ++ ++ if (type == BTRFS_TREE_BLOCK_REF_KEY) { ++ if (root < entry->ref_root) ++ return -1; ++ if (root > entry->ref_root) ++ return 1; ++ } else { ++ if (parent < entry->parent) ++ return -1; ++ if (parent > entry->parent) ++ return 1; ++ } ++ return 0; ++} ++ ++/* ++ * Check to see if a given root/parent reference is attached to the head. This ++ * only checks for BTRFS_ADD_DELAYED_REF references that match, as that ++ * indicates the reference exists for the given root or parent. This is for ++ * tree blocks only. ++ * ++ * @head: the head of the bytenr we're searching. ++ * @root: the root objectid of the reference if it is a normal reference. ++ * @parent: the parent if this is a shared backref. 
++ */ ++bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, ++ u64 root, u64 parent) ++{ ++ struct rb_node *node; ++ bool found = false; ++ ++ lockdep_assert_held(&head->mutex); ++ ++ spin_lock(&head->lock); ++ node = head->ref_tree.rb_root.rb_node; ++ while (node) { ++ struct btrfs_delayed_ref_node *entry; ++ int ret; ++ ++ entry = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); ++ ret = find_comp(entry, root, parent); ++ if (ret < 0) { ++ node = node->rb_left; ++ } else if (ret > 0) { ++ node = node->rb_right; ++ } else { ++ /* ++ * We only want to count ADD actions, as drops mean the ++ * ref doesn't exist. ++ */ ++ if (entry->action == BTRFS_ADD_DELAYED_REF) ++ found = true; ++ break; ++ } ++ } ++ spin_unlock(&head->lock); ++ return found; ++} ++ + void __cold btrfs_delayed_ref_exit(void) + { + kmem_cache_destroy(btrfs_delayed_ref_head_cachep); +--- a/fs/btrfs/delayed-ref.h ++++ b/fs/btrfs/delayed-ref.h +@@ -389,6 +389,8 @@ int btrfs_delayed_refs_rsv_refill(struct + void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, + u64 num_bytes); + bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); ++bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head, ++ u64 root, u64 parent); + + static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node) + { +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -5387,23 +5387,62 @@ static int check_ref_exists(struct btrfs + struct btrfs_root *root, u64 bytenr, u64 parent, + int level) + { ++ struct btrfs_delayed_ref_root *delayed_refs; ++ struct btrfs_delayed_ref_head *head; + struct btrfs_path *path; + struct btrfs_extent_inline_ref *iref; + int ret; ++ bool exists = false; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; +- ++again: + ret = lookup_extent_backref(trans, path, &iref, bytenr, + root->fs_info->nodesize, parent, + btrfs_root_id(root), level, 0); ++ if (ret != -ENOENT) { ++ /* ++ * If we get 0 then we 
found our reference, return 1, else ++ * return the error if it's not -ENOENT; ++ */ ++ btrfs_free_path(path); ++ return (ret < 0 ) ? ret : 1; ++ } ++ ++ /* ++ * We could have a delayed ref with this reference, so look it up while ++ * we're holding the path open to make sure we don't race with the ++ * delayed ref running. ++ */ ++ delayed_refs = &trans->transaction->delayed_refs; ++ spin_lock(&delayed_refs->lock); ++ head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); ++ if (!head) ++ goto out; ++ if (!mutex_trylock(&head->mutex)) { ++ /* ++ * We're contended, means that the delayed ref is running, get a ++ * reference and wait for the ref head to be complete and then ++ * try again. ++ */ ++ refcount_inc(&head->refs); ++ spin_unlock(&delayed_refs->lock); ++ ++ btrfs_release_path(path); ++ ++ mutex_lock(&head->mutex); ++ mutex_unlock(&head->mutex); ++ btrfs_put_delayed_ref_head(head); ++ goto again; ++ } ++ ++ exists = btrfs_find_delayed_tree_ref(head, root->root_key.objectid, parent); ++ mutex_unlock(&head->mutex); ++out: ++ spin_unlock(&delayed_refs->lock); + btrfs_free_path(path); +- if (ret == -ENOENT) +- return 0; +- if (ret < 0) +- return ret; +- return 1; ++ return exists ? 1 : 0; + } + + /* diff --git a/queue-6.10/btrfs-only-enable-extent-map-shrinker-for-debug-builds.patch b/queue-6.10/btrfs-only-enable-extent-map-shrinker-for-debug-builds.patch new file mode 100644 index 00000000000..640e1cffca8 --- /dev/null +++ b/queue-6.10/btrfs-only-enable-extent-map-shrinker-for-debug-builds.patch @@ -0,0 +1,45 @@ +From 534f7eff9239c1b0af852fc33f5af2b62c00eddf Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Fri, 16 Aug 2024 10:40:38 +0930 +Subject: btrfs: only enable extent map shrinker for DEBUG builds + +From: Qu Wenruo + +commit 534f7eff9239c1b0af852fc33f5af2b62c00eddf upstream. + +Although there are several patches improving the extent map shrinker, +there are still reports of too frequent shrinker behavior, taking too +much CPU for the kswapd process. 
+ +So let's only enable the extent map shrinker for DEBUG builds for now, +until we get a more comprehensive understanding and a better solution. + +Link: https://lore.kernel.org/linux-btrfs/3df4acd616a07ef4d2dc6bad668701504b412ffc.camel@intelfx.name/ +Link: https://lore.kernel.org/linux-btrfs/c30fd6b3-ca7a-4759-8a53-d42878bf84f7@gmail.com/ +Fixes: 956a17d9d050 ("btrfs: add a shrinker for extent maps") +CC: stable@vger.kernel.org # 6.10+ +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/super.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/super.c ++++ b/fs/btrfs/super.c +@@ -2387,7 +2387,13 @@ static long btrfs_nr_cached_objects(stru + + trace_btrfs_extent_map_shrinker_count(fs_info, nr); + +- return nr; ++ /* ++ * Only report the real number for DEBUG builds, as there are reports of ++ * serious performance degradation caused by too frequent shrinks. ++ */ ++ if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) ++ return nr; ++ return 0; + } + + static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_control *sc) diff --git a/queue-6.10/btrfs-only-run-the-extent-map-shrinker-from-kswapd-tasks.patch b/queue-6.10/btrfs-only-run-the-extent-map-shrinker-from-kswapd-tasks.patch new file mode 100644 index 00000000000..320ef5672df --- /dev/null +++ b/queue-6.10/btrfs-only-run-the-extent-map-shrinker-from-kswapd-tasks.patch @@ -0,0 +1,139 @@ +From ae1e766f623f7a2a889a0b09eb076dd9a60efbe9 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Sun, 11 Aug 2024 11:53:42 +0100 +Subject: btrfs: only run the extent map shrinker from kswapd tasks + +From: Filipe Manana + +commit ae1e766f623f7a2a889a0b09eb076dd9a60efbe9 upstream. + +Currently the extent map shrinker can be run by any task when attempting +to allocate memory and there's enough memory pressure to trigger it. + +To avoid too much latency we stop iterating over extent maps and removing +them once the task needs to reschedule. 
This logic was introduced in commit +b3ebb9b7e92a ("btrfs: stop extent map shrinker if reschedule is needed"). + +While that solved high latency problems for some use cases, it's still +not enough because with a too high number of tasks entering the extent map +shrinker code, either due to memory allocations or because they are a +kswapd task, we end up having a very high level of contention on some +spin locks, namely: + +1) The fs_info->fs_roots_radix_lock spin lock, which we need to find + roots to iterate over their inodes; + +2) The spin lock of the xarray used to track open inodes for a root + (struct btrfs_root::inodes) - on 6.10 kernels and below, it used to + be a red black tree and the spin lock was root->inode_lock; + +3) The fs_info->delayed_iput_lock spin lock since the shrinker adds + delayed iputs (calls btrfs_add_delayed_iput()). + +Instead of allowing the extent map shrinker to be run by any task, make +it run only by kswapd tasks. This still solves the problem of running +into OOM situations due to an unbounded extent map creation, which is +simple to trigger by direct IO writes, as described in the changelog +of commit 956a17d9d050 ("btrfs: add a shrinker for extent maps"), and +by a similar case when doing buffered IO on files with a very large +number of holes (keeping the file open and creating many holes, whose +extent maps are only released when the file is closed). 
+ +Reported-by: kzd +Link: https://bugzilla.kernel.org/show_bug.cgi?id=219121 +Reported-by: Octavia Togami +Link: https://lore.kernel.org/linux-btrfs/CAHPNGSSt-a4ZZWrtJdVyYnJFscFjP9S7rMcvEMaNSpR556DdLA@mail.gmail.com/ +Fixes: 956a17d9d050 ("btrfs: add a shrinker for extent maps") +CC: stable@vger.kernel.org # 6.10+ +Tested-by: kzd +Tested-by: Octavia Togami +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/extent_map.c | 22 ++++++---------------- + fs/btrfs/super.c | 10 ++++++++++ + 2 files changed, 16 insertions(+), 16 deletions(-) + +--- a/fs/btrfs/extent_map.c ++++ b/fs/btrfs/extent_map.c +@@ -1065,8 +1065,7 @@ static long btrfs_scan_inode(struct btrf + return 0; + + /* +- * We want to be fast because we can be called from any path trying to +- * allocate memory, so if the lock is busy we don't want to spend time ++ * We want to be fast so if the lock is busy we don't want to spend time + * waiting for it - either some task is about to do IO for the inode or + * we may have another task shrinking extent maps, here in this code, so + * skip this inode. +@@ -1109,9 +1108,7 @@ next: + /* + * Stop if we need to reschedule or there's contention on the + * lock. This is to avoid slowing other tasks trying to take the +- * lock and because the shrinker might be called during a memory +- * allocation path and we want to avoid taking a very long time +- * and slowing down all sorts of tasks. ++ * lock. + */ + if (need_resched() || rwlock_needbreak(&tree->lock)) + break; +@@ -1139,12 +1136,7 @@ static long btrfs_scan_root(struct btrfs + if (ctx->scanned >= ctx->nr_to_scan) + break; + +- /* +- * We may be called from memory allocation paths, so we don't +- * want to take too much time and slowdown tasks. 
+- */ +- if (need_resched()) +- break; ++ cond_resched(); + + inode = btrfs_find_first_inode(root, min_ino); + } +@@ -1202,14 +1194,12 @@ long btrfs_free_extent_maps(struct btrfs + ctx.last_ino); + } + +- /* +- * We may be called from memory allocation paths, so we don't want to +- * take too much time and slowdown tasks, so stop if we need reschedule. +- */ +- while (ctx.scanned < ctx.nr_to_scan && !need_resched()) { ++ while (ctx.scanned < ctx.nr_to_scan) { + struct btrfs_root *root; + unsigned long count; + ++ cond_resched(); ++ + spin_lock(&fs_info->fs_roots_radix_lock); + count = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)&root, +--- a/fs/btrfs/super.c ++++ b/fs/btrfs/super.c +@@ -28,6 +28,7 @@ + #include + #include + #include ++#include + #include "messages.h" + #include "delayed-inode.h" + #include "ctree.h" +@@ -2394,6 +2395,15 @@ static long btrfs_free_cached_objects(st + const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan); + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + ++ /* ++ * We may be called from any task trying to allocate memory and we don't ++ * want to slow it down with scanning and dropping extent maps. It would ++ * also cause heavy lock contention if many tasks concurrently enter ++ * here. Therefore only allow kswapd tasks to scan and drop extent maps. 
++ */ ++ if (!current_is_kswapd()) ++ return 0; ++ + return btrfs_free_extent_maps(fs_info, nr_to_scan); + } + diff --git a/queue-6.10/btrfs-send-allow-cloning-non-aligned-extent-if-it-ends-at-i_size.patch b/queue-6.10/btrfs-send-allow-cloning-non-aligned-extent-if-it-ends-at-i_size.patch new file mode 100644 index 00000000000..bfc84dd56ea --- /dev/null +++ b/queue-6.10/btrfs-send-allow-cloning-non-aligned-extent-if-it-ends-at-i_size.patch @@ -0,0 +1,188 @@ +From 46a6e10a1ab16cc71d4a3cab73e79aabadd6b8ea Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 12 Aug 2024 14:18:06 +0100 +Subject: btrfs: send: allow cloning non-aligned extent if it ends at i_size + +From: Filipe Manana + +commit 46a6e10a1ab16cc71d4a3cab73e79aabadd6b8ea upstream. + +If we find that an extent is shared but its end offset is not sector +size aligned, then we don't clone it and issue write operations instead. +This is because the reflink (remap_file_range) operation does not allow +cloning unaligned ranges, except if the end offset of the range matches +the i_size of the source and destination files (and the start offset is +sector size aligned). + +While this is not incorrect because send can only guarantee that a file +has the same data in the source and destination snapshots, it's not +optimal and generates confusion and surprising behaviour for users. + +For example, running this test: + + $ cat test.sh + #!/bin/bash + + DEV=/dev/sdi + MNT=/mnt/sdi + + mkfs.btrfs -f $DEV + mount $DEV $MNT + + # Use a file size not aligned to any possible sector size. 
+ file_size=$((1 * 1024 * 1024 + 5)) # 1MB + 5 bytes + dd if=/dev/random of=$MNT/foo bs=$file_size count=1 + cp --reflink=always $MNT/foo $MNT/bar + + btrfs subvolume snapshot -r $MNT/ $MNT/snap + rm -f /tmp/send-test + btrfs send -f /tmp/send-test $MNT/snap + + umount $MNT + mkfs.btrfs -f $DEV + mount $DEV $MNT + + btrfs receive -vv -f /tmp/send-test $MNT + + xfs_io -r -c "fiemap -v" $MNT/snap/bar + + umount $MNT + +Gives the following result: + + (...) + mkfile o258-7-0 + rename o258-7-0 -> bar + write bar - offset=0 length=49152 + write bar - offset=49152 length=49152 + write bar - offset=98304 length=49152 + write bar - offset=147456 length=49152 + write bar - offset=196608 length=49152 + write bar - offset=245760 length=49152 + write bar - offset=294912 length=49152 + write bar - offset=344064 length=49152 + write bar - offset=393216 length=49152 + write bar - offset=442368 length=49152 + write bar - offset=491520 length=49152 + write bar - offset=540672 length=49152 + write bar - offset=589824 length=49152 + write bar - offset=638976 length=49152 + write bar - offset=688128 length=49152 + write bar - offset=737280 length=49152 + write bar - offset=786432 length=49152 + write bar - offset=835584 length=49152 + write bar - offset=884736 length=49152 + write bar - offset=933888 length=49152 + write bar - offset=983040 length=49152 + write bar - offset=1032192 length=16389 + chown bar - uid=0, gid=0 + chmod bar - mode=0644 + utimes bar + utimes + BTRFS_IOC_SET_RECEIVED_SUBVOL uuid=06d640da-9ca1-604c-b87c-3375175a8eb3, stransid=7 + /mnt/sdi/snap/bar: + EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS + 0: [0..2055]: 26624..28679 2056 0x1 + +There's no clone operation to clone extents from the file foo into file +bar and fiemap confirms there's no shared flag (0x2000). + +So update send_write_or_clone() so that it proceeds with cloning if the +source and destination ranges end at the i_size of the respective files. + +After this changes the result of the test is: + + (...) 
+ mkfile o258-7-0 + rename o258-7-0 -> bar + clone bar - source=foo source offset=0 offset=0 length=1048581 + chown bar - uid=0, gid=0 + chmod bar - mode=0644 + utimes bar + utimes + BTRFS_IOC_SET_RECEIVED_SUBVOL uuid=582420f3-ea7d-564e-bbe5-ce440d622190, stransid=7 + /mnt/sdi/snap/bar: + EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS + 0: [0..2055]: 26624..28679 2056 0x2001 + +A test case for fstests will also follow up soon. + +Link: https://github.com/kdave/btrfs-progs/issues/572#issuecomment-2282841416 +CC: stable@vger.kernel.org # 5.10+ +Reviewed-by: Qu Wenruo +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/send.c | 54 ++++++++++++++++++++++++++++++++++++++++-------------- + 1 file changed, 40 insertions(+), 14 deletions(-) + +--- a/fs/btrfs/send.c ++++ b/fs/btrfs/send.c +@@ -6158,25 +6158,51 @@ static int send_write_or_clone(struct se + u64 offset = key->offset; + u64 end; + u64 bs = sctx->send_root->fs_info->sectorsize; ++ struct btrfs_file_extent_item *ei; ++ u64 disk_byte; ++ u64 data_offset; ++ u64 num_bytes; ++ struct btrfs_inode_info info = { 0 }; + + end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size); + if (offset >= end) + return 0; + +- if (clone_root && IS_ALIGNED(end, bs)) { +- struct btrfs_file_extent_item *ei; +- u64 disk_byte; +- u64 data_offset; +- +- ei = btrfs_item_ptr(path->nodes[0], path->slots[0], +- struct btrfs_file_extent_item); +- disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); +- data_offset = btrfs_file_extent_offset(path->nodes[0], ei); +- ret = clone_range(sctx, path, clone_root, disk_byte, +- data_offset, offset, end - offset); +- } else { +- ret = send_extent_data(sctx, path, offset, end - offset); +- } ++ num_bytes = end - offset; ++ ++ if (!clone_root) ++ goto write_data; ++ ++ if (IS_ALIGNED(end, bs)) ++ goto clone_data; ++ ++ /* ++ * If the extent end is not aligned, we can clone if the extent ends at ++ * the 
i_size of the inode and the clone range ends at the i_size of the ++ * source inode, otherwise the clone operation fails with -EINVAL. ++ */ ++ if (end != sctx->cur_inode_size) ++ goto write_data; ++ ++ ret = get_inode_info(clone_root->root, clone_root->ino, &info); ++ if (ret < 0) ++ return ret; ++ ++ if (clone_root->offset + num_bytes == info.size) ++ goto clone_data; ++ ++write_data: ++ ret = send_extent_data(sctx, path, offset, num_bytes); ++ sctx->cur_inode_next_write_offset = end; ++ return ret; ++ ++clone_data: ++ ei = btrfs_item_ptr(path->nodes[0], path->slots[0], ++ struct btrfs_file_extent_item); ++ disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); ++ data_offset = btrfs_file_extent_offset(path->nodes[0], ei); ++ ret = clone_range(sctx, path, clone_root, disk_byte, data_offset, offset, ++ num_bytes); + sctx->cur_inode_next_write_offset = end; + return ret; + } diff --git a/queue-6.10/btrfs-tree-checker-add-dev-extent-item-checks.patch b/queue-6.10/btrfs-tree-checker-add-dev-extent-item-checks.patch new file mode 100644 index 00000000000..83f5b01e191 --- /dev/null +++ b/queue-6.10/btrfs-tree-checker-add-dev-extent-item-checks.patch @@ -0,0 +1,162 @@ +From 008e2512dc5696ab2dc5bf264e98a9fe9ceb830e Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Sun, 11 Aug 2024 15:00:22 +0930 +Subject: btrfs: tree-checker: add dev extent item checks + +From: Qu Wenruo + +commit 008e2512dc5696ab2dc5bf264e98a9fe9ceb830e upstream. + +[REPORT] +There is a corruption report that btrfs refused to mount a fs that has +overlapping dev extents: + + BTRFS error (device sdc): dev extent devid 4 physical offset 14263979671552 overlap with previous dev extent end 14263980982272 + BTRFS error (device sdc): failed to verify dev extents against chunks: -117 + BTRFS error (device sdc): open_ctree failed + +[CAUSE] +The direct cause is very obvious, there is a bad dev extent item with +incorrect length. 
+ +With btrfs check reporting two overlapping extents, the second one shows +some clue on the cause: + + ERROR: dev extent devid 4 offset 14263979671552 len 6488064 overlap with previous dev extent end 14263980982272 + ERROR: dev extent devid 13 offset 2257707008000 len 6488064 overlap with previous dev extent end 2257707270144 + ERROR: errors found in extent allocation tree or chunk allocation + +The second one looks like a bitflip happened during new chunk +allocation: +hex(2257707008000) = 0x20da9d30000 +hex(2257707270144) = 0x20da9d70000 +diff = 0x00000040000 + +So it looks like a bitflip happened during new dev extent allocation, +resulting in the second overlap. + +Currently we only do the dev-extent verification at mount time, but if the +corruption is caused by memory bitflip, we really want to catch it before +writing the corruption to the storage. + +Furthermore the dev extent items have the following key definition: + + (<device id> DEV_EXTENT <physical offset>) + +Thus we can not just rely on the generic key order check to make sure +there is no overlap. + +[ENHANCEMENT] +Introduce dedicated dev extent checks, including: + +- Fixed member checks + * chunk_tree should always be BTRFS_CHUNK_TREE_OBJECTID (3) + * chunk_objectid should always be + BTRFS_FIRST_CHUNK_TREE_OBJECTID (256) + +- Alignment checks + * chunk_offset should be aligned to sectorsize + * length should be aligned to sectorsize + * key.offset should be aligned to sectorsize + +- Overlap checks + If the previous key is also a dev-extent item, with the same + device id, make sure we do not overlap with the previous dev extent.
+ +Reported: Stefan N +Link: https://lore.kernel.org/linux-btrfs/CA+W5K0rSO3koYTo=nzxxTm1-Pdu1HYgVxEpgJ=aGc7d=E8mGEg@mail.gmail.com/ +CC: stable@vger.kernel.org # 5.10+ +Reviewed-by: Anand Jain +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/tree-checker.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 69 insertions(+) + +--- a/fs/btrfs/tree-checker.c ++++ b/fs/btrfs/tree-checker.c +@@ -1718,6 +1718,72 @@ static int check_raid_stripe_extent(cons + return 0; + } + ++static int check_dev_extent_item(const struct extent_buffer *leaf, ++ const struct btrfs_key *key, ++ int slot, ++ struct btrfs_key *prev_key) ++{ ++ struct btrfs_dev_extent *de; ++ const u32 sectorsize = leaf->fs_info->sectorsize; ++ ++ de = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent); ++ /* Basic fixed member checks. */ ++ if (unlikely(btrfs_dev_extent_chunk_tree(leaf, de) != ++ BTRFS_CHUNK_TREE_OBJECTID)) { ++ generic_err(leaf, slot, ++ "invalid dev extent chunk tree id, has %llu expect %llu", ++ btrfs_dev_extent_chunk_tree(leaf, de), ++ BTRFS_CHUNK_TREE_OBJECTID); ++ return -EUCLEAN; ++ } ++ if (unlikely(btrfs_dev_extent_chunk_objectid(leaf, de) != ++ BTRFS_FIRST_CHUNK_TREE_OBJECTID)) { ++ generic_err(leaf, slot, ++ "invalid dev extent chunk objectid, has %llu expect %llu", ++ btrfs_dev_extent_chunk_objectid(leaf, de), ++ BTRFS_FIRST_CHUNK_TREE_OBJECTID); ++ return -EUCLEAN; ++ } ++ /* Alignment check. 
*/ ++ if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) { ++ generic_err(leaf, slot, ++ "invalid dev extent key.offset, has %llu not aligned to %u", ++ key->offset, sectorsize); ++ return -EUCLEAN; ++ } ++ if (unlikely(!IS_ALIGNED(btrfs_dev_extent_chunk_offset(leaf, de), ++ sectorsize))) { ++ generic_err(leaf, slot, ++ "invalid dev extent chunk offset, has %llu not aligned to %u", ++ btrfs_dev_extent_chunk_objectid(leaf, de), ++ sectorsize); ++ return -EUCLEAN; ++ } ++ if (unlikely(!IS_ALIGNED(btrfs_dev_extent_length(leaf, de), ++ sectorsize))) { ++ generic_err(leaf, slot, ++ "invalid dev extent length, has %llu not aligned to %u", ++ btrfs_dev_extent_length(leaf, de), sectorsize); ++ return -EUCLEAN; ++ } ++ /* Overlap check with previous dev extent. */ ++ if (slot && prev_key->objectid == key->objectid && ++ prev_key->type == key->type) { ++ struct btrfs_dev_extent *prev_de; ++ u64 prev_len; ++ ++ prev_de = btrfs_item_ptr(leaf, slot - 1, struct btrfs_dev_extent); ++ prev_len = btrfs_dev_extent_length(leaf, prev_de); ++ if (unlikely(prev_key->offset + prev_len > key->offset)) { ++ generic_err(leaf, slot, ++ "dev extent overlap, prev offset %llu len %llu current offset %llu", ++ prev_key->objectid, prev_len, key->offset); ++ return -EUCLEAN; ++ } ++ } ++ return 0; ++} ++ + /* + * Common point to switch the item-specific validation. 
+ */ +@@ -1754,6 +1820,9 @@ static enum btrfs_tree_block_status chec + case BTRFS_DEV_ITEM_KEY: + ret = check_dev_item(leaf, key, slot); + break; ++ case BTRFS_DEV_EXTENT_KEY: ++ ret = check_dev_extent_item(leaf, key, slot, prev_key); ++ break; + case BTRFS_INODE_ITEM_KEY: + ret = check_inode_item(leaf, key, slot); + break; diff --git a/queue-6.10/btrfs-tree-checker-reject-btrfs_ft_unknown-dir-type.patch b/queue-6.10/btrfs-tree-checker-reject-btrfs_ft_unknown-dir-type.patch new file mode 100644 index 00000000000..3196fa3ea1a --- /dev/null +++ b/queue-6.10/btrfs-tree-checker-reject-btrfs_ft_unknown-dir-type.patch @@ -0,0 +1,58 @@ +From 31723c9542dba1681cc3720571fdf12ffe0eddd9 Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Mon, 12 Aug 2024 08:52:44 +0930 +Subject: btrfs: tree-checker: reject BTRFS_FT_UNKNOWN dir type + +From: Qu Wenruo + +commit 31723c9542dba1681cc3720571fdf12ffe0eddd9 upstream. + +[REPORT] +There is a bug report that kernel is rejecting a mismatching inode mode +and its dir item: + + [ 1881.553937] BTRFS critical (device dm-0): inode mode mismatch with + dir: inode mode=040700 btrfs type=2 dir type=0 + +[CAUSE] +It looks like the inode mode is correct, while the dir item type +0 is BTRFS_FT_UNKNOWN, which should not be generated by btrfs at all. + +This may be caused by a memory bit flip. + +[ENHANCEMENT] +Although tree-checker is not able to do any cross-leaf verification, for +this particular case we can at least reject any dir type with +BTRFS_FT_UNKNOWN. + +So here we enhance the dir type check from [0, BTRFS_FT_MAX), to +(0, BTRFS_FT_MAX). +Although the existing corruption can not be fixed just by such enhanced +checking, it should prevent the same 0x2->0x0 bitflip for dir type to +reach disk in the future. 
+ +Reported-by: Kota +Link: https://lore.kernel.org/linux-btrfs/CACsxjPYnQF9ZF-0OhH16dAx50=BXXOcP74MxBc3BG+xae4vTTw@mail.gmail.com/ +CC: stable@vger.kernel.org # 5.4+ +Signed-off-by: Qu Wenruo +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/tree-checker.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/tree-checker.c ++++ b/fs/btrfs/tree-checker.c +@@ -551,9 +551,10 @@ static int check_dir_item(struct extent_ + + /* dir type check */ + dir_type = btrfs_dir_ftype(leaf, di); +- if (unlikely(dir_type >= BTRFS_FT_MAX)) { ++ if (unlikely(dir_type <= BTRFS_FT_UNKNOWN || ++ dir_type >= BTRFS_FT_MAX)) { + dir_item_err(leaf, slot, +- "invalid dir item type, have %u expect [0, %u)", ++ "invalid dir item type, have %u expect (0, %u)", + dir_type, BTRFS_FT_MAX); + return -EUCLEAN; + } diff --git a/queue-6.10/btrfs-zoned-properly-take-lock-to-read-update-block-group-s-zoned-variables.patch b/queue-6.10/btrfs-zoned-properly-take-lock-to-read-update-block-group-s-zoned-variables.patch new file mode 100644 index 00000000000..487836f16e9 --- /dev/null +++ b/queue-6.10/btrfs-zoned-properly-take-lock-to-read-update-block-group-s-zoned-variables.patch @@ -0,0 +1,82 @@ +From e30729d4bd4001881be4d1ad4332a5d4985398f8 Mon Sep 17 00:00:00 2001 +From: Naohiro Aota +Date: Thu, 1 Aug 2024 16:47:52 +0900 +Subject: btrfs: zoned: properly take lock to read/update block group's zoned variables + +From: Naohiro Aota + +commit e30729d4bd4001881be4d1ad4332a5d4985398f8 upstream. + +__btrfs_add_free_space_zoned() references and modifies bg's alloc_offset, +ro, and zone_unusable, but without taking the lock. It is mostly safe +because they monotonically increase (at least for now) and this function is +mostly called by a transaction commit, which is serialized by itself. + +Still, taking the lock is a safer and correct option and I'm going to add a +change to reset zone_unusable while a block group is still alive. 
So, add +locking around the operations. + +Fixes: 169e0da91a21 ("btrfs: zoned: track unusable bytes for zones") +CC: stable@vger.kernel.org # 5.15+ +Reviewed-by: Johannes Thumshirn +Signed-off-by: Naohiro Aota +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/free-space-cache.c | 14 ++++++++------ + 1 file changed, 8 insertions(+), 6 deletions(-) + +--- a/fs/btrfs/free-space-cache.c ++++ b/fs/btrfs/free-space-cache.c +@@ -2698,15 +2698,16 @@ static int __btrfs_add_free_space_zoned( + u64 offset = bytenr - block_group->start; + u64 to_free, to_unusable; + int bg_reclaim_threshold = 0; +- bool initial = ((size == block_group->length) && (block_group->alloc_offset == 0)); ++ bool initial; + u64 reclaimable_unusable; + +- WARN_ON(!initial && offset + size > block_group->zone_capacity); ++ spin_lock(&block_group->lock); + ++ initial = ((size == block_group->length) && (block_group->alloc_offset == 0)); ++ WARN_ON(!initial && offset + size > block_group->zone_capacity); + if (!initial) + bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); + +- spin_lock(&ctl->tree_lock); + if (!used) + to_free = size; + else if (initial) +@@ -2719,7 +2720,9 @@ static int __btrfs_add_free_space_zoned( + to_free = offset + size - block_group->alloc_offset; + to_unusable = size - to_free; + ++ spin_lock(&ctl->tree_lock); + ctl->free_space += to_free; ++ spin_unlock(&ctl->tree_lock); + /* + * If the block group is read-only, we should account freed space into + * bytes_readonly. 
+@@ -2728,11 +2731,8 @@ static int __btrfs_add_free_space_zoned( + block_group->zone_unusable += to_unusable; + WARN_ON(block_group->zone_unusable > block_group->length); + } +- spin_unlock(&ctl->tree_lock); + if (!used) { +- spin_lock(&block_group->lock); + block_group->alloc_offset -= size; +- spin_unlock(&block_group->lock); + } + + reclaimable_unusable = block_group->zone_unusable - +@@ -2746,6 +2746,8 @@ static int __btrfs_add_free_space_zoned( + btrfs_mark_bg_to_reclaim(block_group); + } + ++ spin_unlock(&block_group->lock); ++ + return 0; + } + diff --git a/queue-6.10/char-xillybus-check-usb-endpoints-when-probing-device.patch b/queue-6.10/char-xillybus-check-usb-endpoints-when-probing-device.patch new file mode 100644 index 00000000000..393ff94442b --- /dev/null +++ b/queue-6.10/char-xillybus-check-usb-endpoints-when-probing-device.patch @@ -0,0 +1,93 @@ +From 2374bf7558de915edc6ec8cb10ec3291dfab9594 Mon Sep 17 00:00:00 2001 +From: Eli Billauer +Date: Fri, 16 Aug 2024 10:02:00 +0300 +Subject: char: xillybus: Check USB endpoints when probing device + +From: Eli Billauer + +commit 2374bf7558de915edc6ec8cb10ec3291dfab9594 upstream. + +Ensure, as the driver probes the device, that all endpoints that the +driver may attempt to access exist and are of the correct type. + +All XillyUSB devices must have a Bulk IN and Bulk OUT endpoint at +address 1. This is verified in xillyusb_setup_base_eps(). + +On top of that, a XillyUSB device may have additional Bulk OUT +endpoints. The information about these endpoints' addresses is deduced +from a data structure (the IDT) that the driver fetches from the device +while probing it. These endpoints are checked in setup_channels(). + +A XillyUSB device never has more than one IN endpoint, as all data +towards the host is multiplexed in this single Bulk IN endpoint. This is +why setup_channels() only checks OUT endpoints. 
+ +Reported-by: syzbot+eac39cba052f2e750dbe@syzkaller.appspotmail.com +Cc: stable +Closes: https://lore.kernel.org/all/0000000000001d44a6061f7a54ee@google.com/T/ +Fixes: a53d1202aef1 ("char: xillybus: Add driver for XillyUSB (Xillybus variant for USB)"). +Signed-off-by: Eli Billauer +Link: https://lore.kernel.org/r/20240816070200.50695-2-eli.billauer@gmail.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/char/xillybus/xillyusb.c | 22 ++++++++++++++++++++-- + 1 file changed, 20 insertions(+), 2 deletions(-) + +--- a/drivers/char/xillybus/xillyusb.c ++++ b/drivers/char/xillybus/xillyusb.c +@@ -1903,6 +1903,13 @@ static const struct file_operations xill + + static int xillyusb_setup_base_eps(struct xillyusb_dev *xdev) + { ++ struct usb_device *udev = xdev->udev; ++ ++ /* Verify that device has the two fundamental bulk in/out endpoints */ ++ if (usb_pipe_type_check(udev, usb_sndbulkpipe(udev, MSG_EP_NUM)) || ++ usb_pipe_type_check(udev, usb_rcvbulkpipe(udev, IN_EP_NUM))) ++ return -ENODEV; ++ + xdev->msg_ep = endpoint_alloc(xdev, MSG_EP_NUM | USB_DIR_OUT, + bulk_out_work, 1, 2); + if (!xdev->msg_ep) +@@ -1932,14 +1939,15 @@ static int setup_channels(struct xillyus + __le16 *chandesc, + int num_channels) + { +- struct xillyusb_channel *chan; ++ struct usb_device *udev = xdev->udev; ++ struct xillyusb_channel *chan, *new_channels; + int i; + + chan = kcalloc(num_channels, sizeof(*chan), GFP_KERNEL); + if (!chan) + return -ENOMEM; + +- xdev->channels = chan; ++ new_channels = chan; + + for (i = 0; i < num_channels; i++, chan++) { + unsigned int in_desc = le16_to_cpu(*chandesc++); +@@ -1968,6 +1976,15 @@ static int setup_channels(struct xillyus + */ + + if ((out_desc & 0x80) && i < 14) { /* Entry is valid */ ++ if (usb_pipe_type_check(udev, ++ usb_sndbulkpipe(udev, i + 2))) { ++ dev_err(xdev->dev, ++ "Missing BULK OUT endpoint %d\n", ++ i + 2); ++ kfree(new_channels); ++ return -ENODEV; ++ } ++ + chan->writable = 1; + chan->out_synchronous = !!(out_desc & 0x40); + 
chan->out_seekable = !!(out_desc & 0x20); +@@ -1977,6 +1994,7 @@ static int setup_channels(struct xillyus + } + } + ++ xdev->channels = new_channels; + return 0; + } + diff --git a/queue-6.10/char-xillybus-don-t-destroy-workqueue-from-work-item-running-on-it.patch b/queue-6.10/char-xillybus-don-t-destroy-workqueue-from-work-item-running-on-it.patch new file mode 100644 index 00000000000..79c860c8dc1 --- /dev/null +++ b/queue-6.10/char-xillybus-don-t-destroy-workqueue-from-work-item-running-on-it.patch @@ -0,0 +1,83 @@ +From ccbde4b128ef9c73d14d0d7817d68ef795f6d131 Mon Sep 17 00:00:00 2001 +From: Eli Billauer +Date: Thu, 1 Aug 2024 15:11:26 +0300 +Subject: char: xillybus: Don't destroy workqueue from work item running on it + +From: Eli Billauer + +commit ccbde4b128ef9c73d14d0d7817d68ef795f6d131 upstream. + +Triggered by a kref decrement, destroy_workqueue() may be called from +within a work item for destroying its own workqueue. This illegal +situation is averted by adding a module-global workqueue for exclusive +use of the offending work item. Other work items continue to be queued +on per-device workqueues to ensure performance. + +Reported-by: syzbot+91dbdfecdd3287734d8e@syzkaller.appspotmail.com +Cc: stable +Closes: https://lore.kernel.org/lkml/0000000000000ab25a061e1dfe9f@google.com/ +Signed-off-by: Eli Billauer +Link: https://lore.kernel.org/r/20240801121126.60183-1-eli.billauer@gmail.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/char/xillybus/xillyusb.c | 16 +++++++++++----- + 1 file changed, 11 insertions(+), 5 deletions(-) + +--- a/drivers/char/xillybus/xillyusb.c ++++ b/drivers/char/xillybus/xillyusb.c +@@ -50,6 +50,7 @@ MODULE_LICENSE("GPL v2"); + static const char xillyname[] = "xillyusb"; + + static unsigned int fifo_buf_order; ++static struct workqueue_struct *wakeup_wq; + + #define USB_VENDOR_ID_XILINX 0x03fd + #define USB_VENDOR_ID_ALTERA 0x09fb +@@ -569,10 +570,6 @@ static void cleanup_dev(struct kref *kre + * errors if executed. 
The mechanism relies on that xdev->error is assigned + * a non-zero value by report_io_error() prior to queueing wakeup_all(), + * which prevents bulk_in_work() from calling process_bulk_in(). +- * +- * The fact that wakeup_all() and bulk_in_work() are queued on the same +- * workqueue makes their concurrent execution very unlikely, however the +- * kernel's API doesn't seem to ensure this strictly. + */ + + static void wakeup_all(struct work_struct *work) +@@ -627,7 +624,7 @@ static void report_io_error(struct xilly + + if (do_once) { + kref_get(&xdev->kref); /* xdev is used by work item */ +- queue_work(xdev->workq, &xdev->wakeup_workitem); ++ queue_work(wakeup_wq, &xdev->wakeup_workitem); + } + } + +@@ -2258,6 +2255,10 @@ static int __init xillyusb_init(void) + { + int rc = 0; + ++ wakeup_wq = alloc_workqueue(xillyname, 0, 0); ++ if (!wakeup_wq) ++ return -ENOMEM; ++ + if (LOG2_INITIAL_FIFO_BUF_SIZE > PAGE_SHIFT) + fifo_buf_order = LOG2_INITIAL_FIFO_BUF_SIZE - PAGE_SHIFT; + else +@@ -2265,11 +2266,16 @@ static int __init xillyusb_init(void) + + rc = usb_register(&xillyusb_driver); + ++ if (rc) ++ destroy_workqueue(wakeup_wq); ++ + return rc; + } + + static void __exit xillyusb_exit(void) + { ++ destroy_workqueue(wakeup_wq); ++ + usb_deregister(&xillyusb_driver); + } + diff --git a/queue-6.10/char-xillybus-refine-workqueue-handling.patch b/queue-6.10/char-xillybus-refine-workqueue-handling.patch new file mode 100644 index 00000000000..cc782a39d7f --- /dev/null +++ b/queue-6.10/char-xillybus-refine-workqueue-handling.patch @@ -0,0 +1,52 @@ +From ad899c301c880766cc709aad277991b3ab671b66 Mon Sep 17 00:00:00 2001 +From: Eli Billauer +Date: Fri, 16 Aug 2024 10:01:59 +0300 +Subject: char: xillybus: Refine workqueue handling + +From: Eli Billauer + +commit ad899c301c880766cc709aad277991b3ab671b66 upstream. + +As the wakeup work item now runs on a separate workqueue, it needs to be +flushed separately along with flushing the device's workqueue. 
+ +Also, move the destroy_workqueue() call to the end of the exit method, +so that deinitialization is done in the opposite order of +initialization. + +Fixes: ccbde4b128ef ("char: xillybus: Don't destroy workqueue from work item running on it") +Cc: stable +Signed-off-by: Eli Billauer +Link: https://lore.kernel.org/r/20240816070200.50695-1-eli.billauer@gmail.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/char/xillybus/xillyusb.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +--- a/drivers/char/xillybus/xillyusb.c ++++ b/drivers/char/xillybus/xillyusb.c +@@ -2093,9 +2093,11 @@ static int xillyusb_discovery(struct usb + * just after responding with the IDT, there is no reason for any + * work item to be running now. To be sure that xdev->channels + * is updated on anything that might run in parallel, flush the +- * workqueue, which rarely does anything. ++ * device's workqueue and the wakeup work item. This rarely ++ * does anything. + */ + flush_workqueue(xdev->workq); ++ flush_work(&xdev->wakeup_workitem); + + xdev->num_channels = num_channels; + +@@ -2274,9 +2276,9 @@ static int __init xillyusb_init(void) + + static void __exit xillyusb_exit(void) + { +- destroy_workqueue(wakeup_wq); +- + usb_deregister(&xillyusb_driver); ++ ++ destroy_workqueue(wakeup_wq); + } + + module_init(xillyusb_init); diff --git a/queue-6.10/dm-persistent-data-fix-memory-allocation-failure.patch b/queue-6.10/dm-persistent-data-fix-memory-allocation-failure.patch new file mode 100644 index 00000000000..3b5c8a3f8a0 --- /dev/null +++ b/queue-6.10/dm-persistent-data-fix-memory-allocation-failure.patch @@ -0,0 +1,45 @@ +From faada2174c08662ae98b439c69efe3e79382c538 Mon Sep 17 00:00:00 2001 +From: Mikulas Patocka +Date: Tue, 13 Aug 2024 16:35:14 +0200 +Subject: dm persistent data: fix memory allocation failure + +From: Mikulas Patocka + +commit faada2174c08662ae98b439c69efe3e79382c538 upstream. + +kmalloc is unreliable when allocating more than 8 pages of memory. 
It may +fail when there is plenty of free memory but the memory is fragmented. +Zdenek Kabelac observed such failure in his tests. + +This commit changes kmalloc to kvmalloc - kvmalloc will fall back to +vmalloc if the large allocation fails. + +Signed-off-by: Mikulas Patocka +Reported-by: Zdenek Kabelac +Reviewed-by: Mike Snitzer +Cc: stable@vger.kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/persistent-data/dm-space-map-metadata.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/md/persistent-data/dm-space-map-metadata.c ++++ b/drivers/md/persistent-data/dm-space-map-metadata.c +@@ -277,7 +277,7 @@ static void sm_metadata_destroy(struct d + { + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + +- kfree(smm); ++ kvfree(smm); + } + + static int sm_metadata_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count) +@@ -772,7 +772,7 @@ struct dm_space_map *dm_sm_metadata_init + { + struct sm_metadata *smm; + +- smm = kmalloc(sizeof(*smm), GFP_KERNEL); ++ smm = kvmalloc(sizeof(*smm), GFP_KERNEL); + if (!smm) + return ERR_PTR(-ENOMEM); + diff --git a/queue-6.10/dm-resume-don-t-return-einval-when-signalled.patch b/queue-6.10/dm-resume-don-t-return-einval-when-signalled.patch new file mode 100644 index 00000000000..486f01315a5 --- /dev/null +++ b/queue-6.10/dm-resume-don-t-return-einval-when-signalled.patch @@ -0,0 +1,60 @@ +From 7a636b4f03af9d541205f69e373672e7b2b60a8a Mon Sep 17 00:00:00 2001 +From: Khazhismel Kumykov +Date: Tue, 13 Aug 2024 12:39:52 +0200 +Subject: dm resume: don't return EINVAL when signalled + +From: Khazhismel Kumykov + +commit 7a636b4f03af9d541205f69e373672e7b2b60a8a upstream. + +If the dm_resume method is called on a device that is not suspended, the +method will suspend the device briefly, before resuming it (so that the +table will be swapped). + +However, there was a bug that the return value of dm_suspend was not +checked.
dm_suspend may return an error when it is interrupted by a +signal. In this case, do_resume would call dm_swap_table, which would +return -EINVAL. + +This commit fixes the logic, so that error returned by dm_suspend is +checked and the resume operation is undone. + +Signed-off-by: Mikulas Patocka +Signed-off-by: Khazhismel Kumykov +Cc: stable@vger.kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/dm-ioctl.c | 22 ++++++++++++++++++++-- + 1 file changed, 20 insertions(+), 2 deletions(-) + +--- a/drivers/md/dm-ioctl.c ++++ b/drivers/md/dm-ioctl.c +@@ -1181,8 +1181,26 @@ static int do_resume(struct dm_ioctl *pa + suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG; + if (param->flags & DM_NOFLUSH_FLAG) + suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG; +- if (!dm_suspended_md(md)) +- dm_suspend(md, suspend_flags); ++ if (!dm_suspended_md(md)) { ++ r = dm_suspend(md, suspend_flags); ++ if (r) { ++ down_write(&_hash_lock); ++ hc = dm_get_mdptr(md); ++ if (hc && !hc->new_map) { ++ hc->new_map = new_map; ++ new_map = NULL; ++ } else { ++ r = -ENXIO; ++ } ++ up_write(&_hash_lock); ++ if (new_map) { ++ dm_sync_table(md); ++ dm_table_destroy(new_map); ++ } ++ dm_put(md); ++ return r; ++ } ++ } + + old_size = dm_get_size(md); + old_map = dm_swap_table(md, new_map); diff --git a/queue-6.10/drm-amdgpu-actually-check-flags-for-all-context-ops.patch b/queue-6.10/drm-amdgpu-actually-check-flags-for-all-context-ops.patch new file mode 100644 index 00000000000..e5a399a2ef1 --- /dev/null +++ b/queue-6.10/drm-amdgpu-actually-check-flags-for-all-context-ops.patch @@ -0,0 +1,50 @@ +From 0573a1e2ea7e35bff08944a40f1adf2bb35cea61 Mon Sep 17 00:00:00 2001 +From: Bas Nieuwenhuizen +Date: Tue, 6 Aug 2024 22:27:32 +0200 +Subject: drm/amdgpu: Actually check flags for all context ops. + +From: Bas Nieuwenhuizen + +commit 0573a1e2ea7e35bff08944a40f1adf2bb35cea61 upstream. + +Missing validation ... + +Checked libdrm and it clears all the structs, so we should be +safe to just check everything.
+ +Signed-off-by: Bas Nieuwenhuizen +Signed-off-by: Alex Deucher +(cherry picked from commit c6b86421f1f9ddf9d706f2453159813ee39d0cf9) +Cc: stable@vger.kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c +@@ -685,16 +685,24 @@ int amdgpu_ctx_ioctl(struct drm_device * + + switch (args->in.op) { + case AMDGPU_CTX_OP_ALLOC_CTX: ++ if (args->in.flags) ++ return -EINVAL; + r = amdgpu_ctx_alloc(adev, fpriv, filp, priority, &id); + args->out.alloc.ctx_id = id; + break; + case AMDGPU_CTX_OP_FREE_CTX: ++ if (args->in.flags) ++ return -EINVAL; + r = amdgpu_ctx_free(fpriv, id); + break; + case AMDGPU_CTX_OP_QUERY_STATE: ++ if (args->in.flags) ++ return -EINVAL; + r = amdgpu_ctx_query(adev, fpriv, id, &args->out); + break; + case AMDGPU_CTX_OP_QUERY_STATE2: ++ if (args->in.flags) ++ return -EINVAL; + r = amdgpu_ctx_query2(adev, fpriv, id, &args->out); + break; + case AMDGPU_CTX_OP_GET_STABLE_PSTATE: diff --git a/queue-6.10/fix-bitmap-corruption-on-close_range-with-close_range_unshare.patch b/queue-6.10/fix-bitmap-corruption-on-close_range-with-close_range_unshare.patch new file mode 100644 index 00000000000..e864ddf457a --- /dev/null +++ b/queue-6.10/fix-bitmap-corruption-on-close_range-with-close_range_unshare.patch @@ -0,0 +1,184 @@ +From 9a2fa1472083580b6c66bdaf291f591e1170123a Mon Sep 17 00:00:00 2001 +From: Al Viro +Date: Sat, 3 Aug 2024 18:02:00 -0400 +Subject: fix bitmap corruption on close_range() with CLOSE_RANGE_UNSHARE + +From: Al Viro + +commit 9a2fa1472083580b6c66bdaf291f591e1170123a upstream. + +copy_fd_bitmaps(new, old, count) is expected to copy the first +count/BITS_PER_LONG bits from old->full_fds_bits[] and fill +the rest with zeroes. What it does is copying enough words +(BITS_TO_LONGS(count/BITS_PER_LONG)), then memsets the rest. 
+That works fine, *if* all bits past the cutoff point are +clear. Otherwise we are risking garbage from the last word +we'd copied. + +For most of the callers that is true - expand_fdtable() has +count equal to old->max_fds, so there's no open descriptors +past count, let alone fully occupied words in ->open_fds[], +which is what bits in ->full_fds_bits[] correspond to. + +The other caller (dup_fd()) passes sane_fdtable_size(old_fdt, max_fds), +which is the smallest multiple of BITS_PER_LONG that covers all +opened descriptors below max_fds. In the common case (copying on +fork()) max_fds is ~0U, so all opened descriptors will be below +it and we are fine, by the same reasons why the call in expand_fdtable() +is safe. + +Unfortunately, there is a case where max_fds is less than that +and where we might, indeed, end up with junk in ->full_fds_bits[] - +close_range(from, to, CLOSE_RANGE_UNSHARE) with + * descriptor table being currently shared + * 'to' being above the current capacity of descriptor table + * 'from' being just under some chunk of opened descriptors. +In that case we end up with observably wrong behaviour - e.g. spawn +a child with CLONE_FILES, get all descriptors in range 0..127 open, +then close_range(64, ~0U, CLOSE_RANGE_UNSHARE) and watch dup(0) ending +up with descriptor #128, despite #64 being observably not open. + +The minimally invasive fix would be to deal with that in dup_fd(). +If this proves to add measurable overhead, we can go that way, but +let's try to fix copy_fd_bitmaps() first. + +* new helper: bitmap_copy_and_extend(to, from, bits_to_copy, size). +* make copy_fd_bitmaps() take the bitmap size in words, rather than +bits; its 'count' argument is always a multiple of BITS_PER_LONG, +so we are not losing any information, and that way we can use the +same helper for all three bitmaps - compiler will see that count +is a multiple of BITS_PER_LONG for the large ones, so it'll generate +plain memcpy()+memset().
+ +Reproducer added to tools/testing/selftests/core/close_range_test.c + +Cc: stable@vger.kernel.org +Signed-off-by: Al Viro +Signed-off-by: Greg Kroah-Hartman +--- + fs/file.c | 28 ++++++++----------- + include/linux/bitmap.h | 12 ++++++++ + tools/testing/selftests/core/close_range_test.c | 35 ++++++++++++++++++++++++ + 3 files changed, 59 insertions(+), 16 deletions(-) + +--- a/fs/file.c ++++ b/fs/file.c +@@ -46,27 +46,23 @@ static void free_fdtable_rcu(struct rcu_ + #define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr)) + #define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long)) + ++#define fdt_words(fdt) ((fdt)->max_fds / BITS_PER_LONG) // words in ->open_fds + /* + * Copy 'count' fd bits from the old table to the new table and clear the extra + * space if any. This does not copy the file pointers. Called with the files + * spinlock held for write. + */ +-static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, +- unsigned int count) ++static inline void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt, ++ unsigned int copy_words) + { +- unsigned int cpy, set; ++ unsigned int nwords = fdt_words(nfdt); + +- cpy = count / BITS_PER_BYTE; +- set = (nfdt->max_fds - count) / BITS_PER_BYTE; +- memcpy(nfdt->open_fds, ofdt->open_fds, cpy); +- memset((char *)nfdt->open_fds + cpy, 0, set); +- memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy); +- memset((char *)nfdt->close_on_exec + cpy, 0, set); +- +- cpy = BITBIT_SIZE(count); +- set = BITBIT_SIZE(nfdt->max_fds) - cpy; +- memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy); +- memset((char *)nfdt->full_fds_bits + cpy, 0, set); ++ bitmap_copy_and_extend(nfdt->open_fds, ofdt->open_fds, ++ copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); ++ bitmap_copy_and_extend(nfdt->close_on_exec, ofdt->close_on_exec, ++ copy_words * BITS_PER_LONG, nwords * BITS_PER_LONG); ++ bitmap_copy_and_extend(nfdt->full_fds_bits, ofdt->full_fds_bits, ++ copy_words, nwords); + } + + /* +@@ -84,7 +80,7 @@ static void 
copy_fdtable(struct fdtable + memcpy(nfdt->fd, ofdt->fd, cpy); + memset((char *)nfdt->fd + cpy, 0, set); + +- copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds); ++ copy_fd_bitmaps(nfdt, ofdt, fdt_words(ofdt)); + } + + /* +@@ -379,7 +375,7 @@ struct files_struct *dup_fd(struct files + open_files = sane_fdtable_size(old_fdt, max_fds); + } + +- copy_fd_bitmaps(new_fdt, old_fdt, open_files); ++ copy_fd_bitmaps(new_fdt, old_fdt, open_files / BITS_PER_LONG); + + old_fds = old_fdt->fd; + new_fds = new_fdt->fd; +--- a/include/linux/bitmap.h ++++ b/include/linux/bitmap.h +@@ -270,6 +270,18 @@ static inline void bitmap_copy_clear_tai + dst[nbits / BITS_PER_LONG] &= BITMAP_LAST_WORD_MASK(nbits); + } + ++static inline void bitmap_copy_and_extend(unsigned long *to, ++ const unsigned long *from, ++ unsigned int count, unsigned int size) ++{ ++ unsigned int copy = BITS_TO_LONGS(count); ++ ++ memcpy(to, from, copy * sizeof(long)); ++ if (count % BITS_PER_LONG) ++ to[copy - 1] &= BITMAP_LAST_WORD_MASK(count); ++ memset(to + copy, 0, bitmap_size(size) - copy * sizeof(long)); ++} ++ + /* + * On 32-bit systems bitmaps are represented as u32 arrays internally. On LE64 + * machines the order of hi and lo parts of numbers match the bitmap structure. 
+--- a/tools/testing/selftests/core/close_range_test.c ++++ b/tools/testing/selftests/core/close_range_test.c +@@ -589,4 +589,39 @@ TEST(close_range_cloexec_unshare_syzbot) + EXPECT_EQ(close(fd3), 0); + } + ++TEST(close_range_bitmap_corruption) ++{ ++ pid_t pid; ++ int status; ++ struct __clone_args args = { ++ .flags = CLONE_FILES, ++ .exit_signal = SIGCHLD, ++ }; ++ ++ /* get the first 128 descriptors open */ ++ for (int i = 2; i < 128; i++) ++ EXPECT_GE(dup2(0, i), 0); ++ ++ /* get descriptor table shared */ ++ pid = sys_clone3(&args, sizeof(args)); ++ ASSERT_GE(pid, 0); ++ ++ if (pid == 0) { ++ /* unshare and truncate descriptor table down to 64 */ ++ if (sys_close_range(64, ~0U, CLOSE_RANGE_UNSHARE)) ++ exit(EXIT_FAILURE); ++ ++ ASSERT_EQ(fcntl(64, F_GETFD), -1); ++ /* ... and verify that the range 64..127 is not ++ stuck "fully used" according to secondary bitmap */ ++ EXPECT_EQ(dup(0), 64) ++ exit(EXIT_FAILURE); ++ exit(EXIT_SUCCESS); ++ } ++ ++ EXPECT_EQ(waitpid(pid, &status, 0), pid); ++ EXPECT_EQ(true, WIFEXITED(status)); ++ EXPECT_EQ(0, WEXITSTATUS(status)); ++} ++ + TEST_HARNESS_MAIN diff --git a/queue-6.10/fs-netfs-fscache_cookie-add-missing-n_accesses-check.patch b/queue-6.10/fs-netfs-fscache_cookie-add-missing-n_accesses-check.patch new file mode 100644 index 00000000000..774deb346ff --- /dev/null +++ b/queue-6.10/fs-netfs-fscache_cookie-add-missing-n_accesses-check.patch @@ -0,0 +1,114 @@ +From f71aa06398aabc2e3eaac25acdf3d62e0094ba70 Mon Sep 17 00:00:00 2001 +From: Max Kellermann +Date: Mon, 29 Jul 2024 17:19:30 +0100 +Subject: fs/netfs/fscache_cookie: add missing "n_accesses" check + +From: Max Kellermann + +commit f71aa06398aabc2e3eaac25acdf3d62e0094ba70 upstream. 
+ +This fixes a NULL pointer dereference bug due to a data race which +looks like this: + + BUG: kernel NULL pointer dereference, address: 0000000000000008 + #PF: supervisor read access in kernel mode + #PF: error_code(0x0000) - not-present page + PGD 0 P4D 0 + Oops: 0000 [#1] SMP PTI + CPU: 33 PID: 16573 Comm: kworker/u97:799 Not tainted 6.8.7-cm4all1-hp+ #43 + Hardware name: HP ProLiant DL380 Gen9/ProLiant DL380 Gen9, BIOS P89 10/17/2018 + Workqueue: events_unbound netfs_rreq_write_to_cache_work + RIP: 0010:cachefiles_prepare_write+0x30/0xa0 + Code: 57 41 56 45 89 ce 41 55 49 89 cd 41 54 49 89 d4 55 53 48 89 fb 48 83 ec 08 48 8b 47 08 48 83 7f 10 00 48 89 34 24 48 8b 68 20 <48> 8b 45 08 4c 8b 38 74 45 49 8b 7f 50 e8 4e a9 b0 ff 48 8b 73 10 + RSP: 0018:ffffb4e78113bde0 EFLAGS: 00010286 + RAX: ffff976126be6d10 RBX: ffff97615cdb8438 RCX: 0000000000020000 + RDX: ffff97605e6c4c68 RSI: ffff97605e6c4c60 RDI: ffff97615cdb8438 + RBP: 0000000000000000 R08: 0000000000278333 R09: 0000000000000001 + R10: ffff97605e6c4600 R11: 0000000000000001 R12: ffff97605e6c4c68 + R13: 0000000000020000 R14: 0000000000000001 R15: ffff976064fe2c00 + FS: 0000000000000000(0000) GS:ffff9776dfd40000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 0000000000000008 CR3: 000000005942c002 CR4: 00000000001706f0 + Call Trace: + + ? __die+0x1f/0x70 + ? page_fault_oops+0x15d/0x440 + ? search_module_extables+0xe/0x40 + ? fixup_exception+0x22/0x2f0 + ? exc_page_fault+0x5f/0x100 + ? asm_exc_page_fault+0x22/0x30 + ? cachefiles_prepare_write+0x30/0xa0 + netfs_rreq_write_to_cache_work+0x135/0x2e0 + process_one_work+0x137/0x2c0 + worker_thread+0x2e9/0x400 + ? __pfx_worker_thread+0x10/0x10 + kthread+0xcc/0x100 + ? __pfx_kthread+0x10/0x10 + ret_from_fork+0x30/0x50 + ? 
__pfx_kthread+0x10/0x10 + ret_from_fork_asm+0x1b/0x30 + + Modules linked in: + CR2: 0000000000000008 + ---[ end trace 0000000000000000 ]--- + +This happened because fscache_cookie_state_machine() was slow and was +still running while another process invoked fscache_unuse_cookie(); +this led to a fscache_cookie_lru_do_one() call, setting the +FSCACHE_COOKIE_DO_LRU_DISCARD flag, which was picked up by +fscache_cookie_state_machine(), withdrawing the cookie via +cachefiles_withdraw_cookie(), clearing cookie->cache_priv. + +At the same time, yet another process invoked +cachefiles_prepare_write(), which found a NULL pointer in this code +line: + + struct cachefiles_object *object = cachefiles_cres_object(cres); + +The next line crashes, obviously: + + struct cachefiles_cache *cache = object->volume->cache; + +During cachefiles_prepare_write(), the "n_accesses" counter is +non-zero (via fscache_begin_operation()). The cookie must not be +withdrawn until it drops to zero. + +The counter is checked by fscache_cookie_state_machine() before +switching to FSCACHE_COOKIE_STATE_RELINQUISHING and +FSCACHE_COOKIE_STATE_WITHDRAWING (in "case +FSCACHE_COOKIE_STATE_FAILED"), but not for +FSCACHE_COOKIE_STATE_LRU_DISCARDING ("case +FSCACHE_COOKIE_STATE_ACTIVE"). + +This patch adds the missing check. With a non-zero access counter, +the function returns and the next fscache_end_cookie_access() call +will queue another fscache_cookie_state_machine() call to handle the +still-pending FSCACHE_COOKIE_DO_LRU_DISCARD. 
+ +Fixes: 12bb21a29c19 ("fscache: Implement cookie user counting and resource pinning") +Signed-off-by: Max Kellermann +Signed-off-by: David Howells +Link: https://lore.kernel.org/r/20240729162002.3436763-2-dhowells@redhat.com +cc: Jeff Layton +cc: netfs@lists.linux.dev +cc: linux-fsdevel@vger.kernel.org +cc: stable@vger.kernel.org +Signed-off-by: Christian Brauner +Signed-off-by: Greg Kroah-Hartman +--- + fs/netfs/fscache_cookie.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/fs/netfs/fscache_cookie.c ++++ b/fs/netfs/fscache_cookie.c +@@ -741,6 +741,10 @@ again_locked: + spin_lock(&cookie->lock); + } + if (test_bit(FSCACHE_COOKIE_DO_LRU_DISCARD, &cookie->flags)) { ++ if (atomic_read(&cookie->n_accesses) != 0) ++ /* still being accessed: postpone it */ ++ break; ++ + __fscache_set_cookie_state(cookie, + FSCACHE_COOKIE_STATE_LRU_DISCARDING); + wake = true; diff --git a/queue-6.10/i2c-qcom-geni-add-missing-geni_icc_disable-in-geni_i2c_runtime_resume.patch b/queue-6.10/i2c-qcom-geni-add-missing-geni_icc_disable-in-geni_i2c_runtime_resume.patch new file mode 100644 index 00000000000..a5c15a22767 --- /dev/null +++ b/queue-6.10/i2c-qcom-geni-add-missing-geni_icc_disable-in-geni_i2c_runtime_resume.patch @@ -0,0 +1,39 @@ +From 4e91fa1ef3ce6290b4c598e54b5eb6cf134fbec8 Mon Sep 17 00:00:00 2001 +From: Andi Shyti +Date: Mon, 12 Aug 2024 21:40:28 +0200 +Subject: i2c: qcom-geni: Add missing geni_icc_disable in geni_i2c_runtime_resume + +From: Andi Shyti + +commit 4e91fa1ef3ce6290b4c598e54b5eb6cf134fbec8 upstream. + +Add the missing geni_icc_disable() call before returning in the +geni_i2c_runtime_resume() function. + +Commit 9ba48db9f77c ("i2c: qcom-geni: Add missing +geni_icc_disable in geni_i2c_runtime_resume") by Gaosheng missed +disabling the interconnect in one case. 
+ +Fixes: bf225ed357c6 ("i2c: i2c-qcom-geni: Add interconnect support") +Cc: Gaosheng Cui +Cc: stable@vger.kernel.org # v5.9+ +Signed-off-by: Andi Shyti +Signed-off-by: Greg Kroah-Hartman +--- + drivers/i2c/busses/i2c-qcom-geni.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/drivers/i2c/busses/i2c-qcom-geni.c ++++ b/drivers/i2c/busses/i2c-qcom-geni.c +@@ -986,8 +986,10 @@ static int __maybe_unused geni_i2c_runti + return ret; + + ret = clk_prepare_enable(gi2c->core_clk); +- if (ret) ++ if (ret) { ++ geni_icc_disable(&gi2c->se); + return ret; ++ } + + ret = geni_se_resources_on(&gi2c->se); + if (ret) { diff --git a/queue-6.10/i2c-tegra-do-not-mark-acpi-devices-as-irq-safe.patch b/queue-6.10/i2c-tegra-do-not-mark-acpi-devices-as-irq-safe.patch new file mode 100644 index 00000000000..821a8e985bc --- /dev/null +++ b/queue-6.10/i2c-tegra-do-not-mark-acpi-devices-as-irq-safe.patch @@ -0,0 +1,58 @@ +From 14d069d92951a3e150c0a81f2ca3b93e54da913b Mon Sep 17 00:00:00 2001 +From: Breno Leitao +Date: Tue, 13 Aug 2024 09:12:53 -0700 +Subject: i2c: tegra: Do not mark ACPI devices as irq safe + +From: Breno Leitao + +commit 14d069d92951a3e150c0a81f2ca3b93e54da913b upstream. + +On ACPI machines, the tegra i2c module encounters an issue due to a +mutex being called inside a spinlock. This leads to the following bug: + + BUG: sleeping function called from invalid context at kernel/locking/mutex.c:585 + ... + + Call trace: + __might_sleep + __mutex_lock_common + mutex_lock_nested + acpi_subsys_runtime_resume + rpm_resume + tegra_i2c_xfer + +The problem arises because during __pm_runtime_resume(), the spinlock +&dev->power.lock is acquired before rpm_resume() is called. Later, +rpm_resume() invokes acpi_subsys_runtime_resume(), which relies on +mutexes, triggering the error. + +To address this issue, devices on ACPI are now marked as not IRQ-safe, +considering the dependency of acpi_subsys_runtime_resume() on mutexes. 
+ +Fixes: bd2fdedbf2ba ("i2c: tegra: Add the ACPI support") +Cc: # v5.17+ +Co-developed-by: Michael van der Westhuizen +Signed-off-by: Michael van der Westhuizen +Signed-off-by: Breno Leitao +Reviewed-by: Dmitry Osipenko +Reviewed-by: Andy Shevchenko +Signed-off-by: Andi Shyti +Signed-off-by: Greg Kroah-Hartman +--- + drivers/i2c/busses/i2c-tegra.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/i2c/busses/i2c-tegra.c ++++ b/drivers/i2c/busses/i2c-tegra.c +@@ -1802,9 +1802,9 @@ static int tegra_i2c_probe(struct platfo + * domain. + * + * VI I2C device shouldn't be marked as IRQ-safe because VI I2C won't +- * be used for atomic transfers. ++ * be used for atomic transfers. ACPI device is not IRQ safe also. + */ +- if (!IS_VI(i2c_dev)) ++ if (!IS_VI(i2c_dev) && !has_acpi_companion(i2c_dev->dev)) + pm_runtime_irq_safe(i2c_dev->dev); + + pm_runtime_enable(i2c_dev->dev); diff --git a/queue-6.10/keys-trusted-dcp-fix-leak-of-blob-encryption-key.patch b/queue-6.10/keys-trusted-dcp-fix-leak-of-blob-encryption-key.patch new file mode 100644 index 00000000000..487df44230e --- /dev/null +++ b/queue-6.10/keys-trusted-dcp-fix-leak-of-blob-encryption-key.patch @@ -0,0 +1,141 @@ +From 0e28bf61a5f9ab30be3f3b4eafb8d097e39446bb Mon Sep 17 00:00:00 2001 +From: David Gstir +Date: Wed, 17 Jul 2024 13:28:45 +0200 +Subject: KEYS: trusted: dcp: fix leak of blob encryption key + +From: David Gstir + +commit 0e28bf61a5f9ab30be3f3b4eafb8d097e39446bb upstream. + +Trusted keys unseal the key blob on load, but keep the sealed payload in +the blob field so that every subsequent read (export) will simply +convert this field to hex and send it to userspace. + +With DCP-based trusted keys, we decrypt the blob encryption key (BEK) +in the kernel due to hardware limitations and then decrypt the blob payload. +BEK decryption is done in-place which means that the trusted key blob +field is modified and it consequently holds the BEK in plain text.
+Every subsequent read of that key thus sends the plain text BEK instead +of the encrypted BEK to userspace. + +This issue only occurs when importing a trusted DCP-based key and +then exporting it again. This should rarely happen as the common use cases +are to either create a new trusted key and export it, or import a key +blob and then just use it without exporting it again. + +Fix this by performing BEK decryption and encryption in a dedicated +buffer. Further always wipe the plain text BEK buffer to prevent leaking +the key via uninitialized memory. + +Cc: stable@vger.kernel.org # v6.10+ +Fixes: 2e8a0f40a39c ("KEYS: trusted: Introduce NXP DCP-backed trusted keys") +Signed-off-by: David Gstir +Reviewed-by: Jarkko Sakkinen +Signed-off-by: Jarkko Sakkinen +Signed-off-by: Greg Kroah-Hartman +--- + security/keys/trusted-keys/trusted_dcp.c | 33 +++++++++++++++--------- + 1 file changed, 21 insertions(+), 12 deletions(-) + +diff --git a/security/keys/trusted-keys/trusted_dcp.c b/security/keys/trusted-keys/trusted_dcp.c +index b0947f072a98..4edc5bbbcda3 100644 +--- a/security/keys/trusted-keys/trusted_dcp.c ++++ b/security/keys/trusted-keys/trusted_dcp.c +@@ -186,20 +186,21 @@ static int do_aead_crypto(u8 *in, u8 *out, size_t len, u8 *key, u8 *nonce, + return ret; + } + +-static int decrypt_blob_key(u8 *key) ++static int decrypt_blob_key(u8 *encrypted_key, u8 *plain_key) + { +- return do_dcp_crypto(key, key, false); ++ return do_dcp_crypto(encrypted_key, plain_key, false); + } + +-static int encrypt_blob_key(u8 *key) ++static int encrypt_blob_key(u8 *plain_key, u8 *encrypted_key) + { +- return do_dcp_crypto(key, key, true); ++ return do_dcp_crypto(plain_key, encrypted_key, true); + } + + static int trusted_dcp_seal(struct trusted_key_payload *p, char *datablob) + { + struct dcp_blob_fmt *b = (struct dcp_blob_fmt *)p->blob; + int blen, ret; ++ u8 plain_blob_key[AES_KEYSIZE_128]; + + blen = calc_blob_len(p->key_len); + if (blen > MAX_BLOB_SIZE) +@@ -207,30 +208,36 @@
static int trusted_dcp_seal(struct trusted_key_payload *p, char *datablob) + + b->fmt_version = DCP_BLOB_VERSION; + get_random_bytes(b->nonce, AES_KEYSIZE_128); +- get_random_bytes(b->blob_key, AES_KEYSIZE_128); ++ get_random_bytes(plain_blob_key, AES_KEYSIZE_128); + +- ret = do_aead_crypto(p->key, b->payload, p->key_len, b->blob_key, ++ ret = do_aead_crypto(p->key, b->payload, p->key_len, plain_blob_key, + b->nonce, true); + if (ret) { + pr_err("Unable to encrypt blob payload: %i\n", ret); +- return ret; ++ goto out; + } + +- ret = encrypt_blob_key(b->blob_key); ++ ret = encrypt_blob_key(plain_blob_key, b->blob_key); + if (ret) { + pr_err("Unable to encrypt blob key: %i\n", ret); +- return ret; ++ goto out; + } + + put_unaligned_le32(p->key_len, &b->payload_len); + p->blob_len = blen; +- return 0; ++ ret = 0; ++ ++out: ++ memzero_explicit(plain_blob_key, sizeof(plain_blob_key)); ++ ++ return ret; + } + + static int trusted_dcp_unseal(struct trusted_key_payload *p, char *datablob) + { + struct dcp_blob_fmt *b = (struct dcp_blob_fmt *)p->blob; + int blen, ret; ++ u8 plain_blob_key[AES_KEYSIZE_128]; + + if (b->fmt_version != DCP_BLOB_VERSION) { + pr_err("DCP blob has bad version: %i, expected %i\n", +@@ -248,14 +255,14 @@ static int trusted_dcp_unseal(struct trusted_key_payload *p, char *datablob) + goto out; + } + +- ret = decrypt_blob_key(b->blob_key); ++ ret = decrypt_blob_key(b->blob_key, plain_blob_key); + if (ret) { + pr_err("Unable to decrypt blob key: %i\n", ret); + goto out; + } + + ret = do_aead_crypto(b->payload, p->key, p->key_len + DCP_BLOB_AUTHLEN, +- b->blob_key, b->nonce, false); ++ plain_blob_key, b->nonce, false); + if (ret) { + pr_err("Unwrap of DCP payload failed: %i\n", ret); + goto out; +@@ -263,6 +270,8 @@ static int trusted_dcp_unseal(struct trusted_key_payload *p, char *datablob) + + ret = 0; + out: ++ memzero_explicit(plain_blob_key, sizeof(plain_blob_key)); ++ + return ret; + } + +-- +2.46.0 + diff --git 
a/queue-6.10/keys-trusted-fix-dcp-blob-payload-length-assignment.patch b/queue-6.10/keys-trusted-fix-dcp-blob-payload-length-assignment.patch new file mode 100644 index 00000000000..92558cd63a0 --- /dev/null +++ b/queue-6.10/keys-trusted-fix-dcp-blob-payload-length-assignment.patch @@ -0,0 +1,44 @@ +From 6486cad00a8b7f8585983408c152bbe33dda529b Mon Sep 17 00:00:00 2001 +From: David Gstir +Date: Wed, 17 Jul 2024 13:28:44 +0200 +Subject: KEYS: trusted: fix DCP blob payload length assignment + +From: David Gstir + +commit 6486cad00a8b7f8585983408c152bbe33dda529b upstream. + +The DCP trusted key type uses the wrong helper function to store +the blob's payload length which can lead to the wrong byte order +being used in case this would ever run on big endian architectures. + +Fix by using correct helper function. + +Cc: stable@vger.kernel.org # v6.10+ +Fixes: 2e8a0f40a39c ("KEYS: trusted: Introduce NXP DCP-backed trusted keys") +Suggested-by: Richard Weinberger +Reported-by: kernel test robot +Closes: https://lore.kernel.org/oe-kbuild-all/202405240610.fj53EK0q-lkp@intel.com/ +Signed-off-by: David Gstir +Reviewed-by: Jarkko Sakkinen +Signed-off-by: Jarkko Sakkinen +Signed-off-by: Greg Kroah-Hartman +--- + security/keys/trusted-keys/trusted_dcp.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/security/keys/trusted-keys/trusted_dcp.c b/security/keys/trusted-keys/trusted_dcp.c +index b5f81a05be36..b0947f072a98 100644 +--- a/security/keys/trusted-keys/trusted_dcp.c ++++ b/security/keys/trusted-keys/trusted_dcp.c +@@ -222,7 +222,7 @@ static int trusted_dcp_seal(struct trusted_key_payload *p, char *datablob) + return ret; + } + +- b->payload_len = get_unaligned_le32(&p->key_len); ++ put_unaligned_le32(p->key_len, &b->payload_len); + p->blob_len = blen; + return 0; + } +-- +2.46.0 + diff --git a/queue-6.10/kvm-s390-fix-validity-interception-issue-when-gisa-is-switched-off.patch 
b/queue-6.10/kvm-s390-fix-validity-interception-issue-when-gisa-is-switched-off.patch new file mode 100644 index 00000000000..6542222ebf6 --- /dev/null +++ b/queue-6.10/kvm-s390-fix-validity-interception-issue-when-gisa-is-switched-off.patch @@ -0,0 +1,88 @@ +From 5a44bb061d04b0306f2aa8add761d86d152b9377 Mon Sep 17 00:00:00 2001 +From: Michael Mueller +Date: Thu, 1 Aug 2024 14:31:09 +0200 +Subject: KVM: s390: fix validity interception issue when gisa is switched off + +From: Michael Mueller + +commit 5a44bb061d04b0306f2aa8add761d86d152b9377 upstream. + +We might run into a SIE validity if gisa has been disabled either via using +kernel parameter "kvm.use_gisa=0" or by setting the related sysfs +attribute to N (echo N >/sys/module/kvm/parameters/use_gisa). + +The validity is caused by an invalid value in the SIE control block's +gisa designation. That happens because we pass the uninitialized gisa +origin to virt_to_phys() before writing it to the gisa designation. + +To fix this we return 0 in kvm_s390_get_gisa_desc() if the origin is 0. +kvm_s390_get_gisa_desc() is used to determine which gisa designation to +set in the SIE control block. A value of 0 in the gisa designation disables +gisa usage. + +The issue surfaces in the host kernel with the following kernel message as +soon as a new kvm guest start is attempted.
+ +kvm: unhandled validity intercept 0x1011 +WARNING: CPU: 0 PID: 781237 at arch/s390/kvm/intercept.c:101 kvm_handle_sie_intercept+0x42e/0x4d0 [kvm] +Modules linked in: vhost_net tap tun xt_CHECKSUM xt_MASQUERADE xt_conntrack ipt_REJECT xt_tcpudp nft_compat x_tables nf_nat_tftp nf_conntrack_tftp vfio_pci_core irqbypass vhost_vsock vmw_vsock_virtio_transport_common vsock vhost vhost_iotlb kvm nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ip_set nf_tables sunrpc mlx5_ib ib_uverbs ib_core mlx5_core uvdevice s390_trng eadm_sch vfio_ccw zcrypt_cex4 mdev vfio_iommu_type1 vfio sch_fq_codel drm i2c_core loop drm_panel_orientation_quirks configfs nfnetlink lcs ctcm fsm dm_service_time ghash_s390 prng chacha_s390 libchacha aes_s390 des_s390 libdes sha3_512_s390 sha3_256_s390 sha512_s390 sha256_s390 sha1_s390 sha_common dm_mirror dm_region_hash dm_log zfcp scsi_transport_fc scsi_dh_rdac scsi_dh_emc scsi_dh_alua pkey zcrypt dm_multipath rng_core autofs4 [last unloaded: vfio_pci] +CPU: 0 PID: 781237 Comm: CPU 0/KVM Not tainted 6.10.0-08682-gcad9f11498ea #6 +Hardware name: IBM 3931 A01 701 (LPAR) +Krnl PSW : 0704c00180000000 000003d93deb0122 (kvm_handle_sie_intercept+0x432/0x4d0 [kvm]) + R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3 +Krnl GPRS: 000003d900000027 000003d900000023 0000000000000028 000002cd00000000 + 000002d063a00900 00000359c6daf708 00000000000bebb5 0000000000001eff + 000002cfd82e9000 000002cfd80bc000 0000000000001011 000003d93deda412 + 000003ff8962df98 000003d93de77ce0 000003d93deb011e 00000359c6daf960 +Krnl Code: 000003d93deb0112: c020fffe7259 larl %r2,000003d93de7e5c4 + 000003d93deb0118: c0e53fa8beac brasl %r14,000003d9bd3c7e70 + #000003d93deb011e: af000000 mc 0,0 + >000003d93deb0122: a728ffea lhi %r2,-22 + 000003d93deb0126: a7f4fe24 brc 15,000003d93deafd6e + 000003d93deb012a: 9101f0b0 tm 176(%r15),1 + 000003d93deb012e: 
a774fe48 brc 7,000003d93deafdbe + 000003d93deb0132: 40a0f0ae sth %r10,174(%r15) +Call Trace: + [<000003d93deb0122>] kvm_handle_sie_intercept+0x432/0x4d0 [kvm] +([<000003d93deb011e>] kvm_handle_sie_intercept+0x42e/0x4d0 [kvm]) + [<000003d93deacc10>] vcpu_post_run+0x1d0/0x3b0 [kvm] + [<000003d93deaceda>] __vcpu_run+0xea/0x2d0 [kvm] + [<000003d93dead9da>] kvm_arch_vcpu_ioctl_run+0x16a/0x430 [kvm] + [<000003d93de93ee0>] kvm_vcpu_ioctl+0x190/0x7c0 [kvm] + [<000003d9bd728b4e>] vfs_ioctl+0x2e/0x70 + [<000003d9bd72a092>] __s390x_sys_ioctl+0xc2/0xd0 + [<000003d9be0e9222>] __do_syscall+0x1f2/0x2e0 + [<000003d9be0f9a90>] system_call+0x70/0x98 +Last Breaking-Event-Address: + [<000003d9bd3c7f58>] __warn_printk+0xe8/0xf0 + +Cc: stable@vger.kernel.org +Reported-by: Christian Borntraeger +Fixes: fe0ef0030463 ("KVM: s390: sort out physical vs virtual pointers usage") +Signed-off-by: Michael Mueller +Tested-by: Christian Borntraeger +Reviewed-by: Janosch Frank +Link: https://lore.kernel.org/r/20240801123109.2782155-1-mimu@linux.ibm.com +Message-ID: <20240801123109.2782155-1-mimu@linux.ibm.com> +Signed-off-by: Janosch Frank +Signed-off-by: Greg Kroah-Hartman +--- + arch/s390/kvm/kvm-s390.h | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +--- a/arch/s390/kvm/kvm-s390.h ++++ b/arch/s390/kvm/kvm-s390.h +@@ -267,7 +267,12 @@ static inline unsigned long kvm_s390_get + + static inline u32 kvm_s390_get_gisa_desc(struct kvm *kvm) + { +- u32 gd = virt_to_phys(kvm->arch.gisa_int.origin); ++ u32 gd; ++ ++ if (!kvm->arch.gisa_int.origin) ++ return 0; ++ ++ gd = virt_to_phys(kvm->arch.gisa_int.origin); + + if (gd && sclp.has_gisaf) + gd |= GISA_FORMAT1; diff --git a/queue-6.10/md-raid1-fix-data-corruption-for-degraded-array-with-slow-disk.patch b/queue-6.10/md-raid1-fix-data-corruption-for-degraded-array-with-slow-disk.patch new file mode 100644 index 00000000000..91b287bcc26 --- /dev/null +++ b/queue-6.10/md-raid1-fix-data-corruption-for-degraded-array-with-slow-disk.patch @@ -0,0 
+1,93 @@ +From c916ca35308d3187c9928664f9be249b22a3a701 Mon Sep 17 00:00:00 2001 +From: Yu Kuai +Date: Sat, 3 Aug 2024 17:11:37 +0800 +Subject: md/raid1: Fix data corruption for degraded array with slow disk +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Yu Kuai + +commit c916ca35308d3187c9928664f9be249b22a3a701 upstream. + +read_balance() will avoid reading from slow disks as much as possible, +however, if valid data only lands in slow disks, and a new normal disk +is still in recovery, unrecovered data can be read: + +raid1_read_request + read_balance + raid1_should_read_first + -> return false + choose_best_rdev + -> normal disk is not recovered, return -1 + choose_bb_rdev + -> missing the checking of recovery, return the normal disk + -> read unrecovered data + +Root cause is that the checking of recovery is missing in +choose_bb_rdev(). Hence add such checking to fix the problem. + +Also fix similar problem in choose_slow_rdev(). 
+ +Cc: stable@vger.kernel.org +Fixes: 9f3ced792203 ("md/raid1: factor out choose_bb_rdev() from read_balance()") +Fixes: dfa8ecd167c1 ("md/raid1: factor out choose_slow_rdev() from read_balance()") +Reported-and-tested-by: Mateusz Jończyk +Closes: https://lore.kernel.org/all/9952f532-2554-44bf-b906-4880b2e88e3a@o2.pl/ +Signed-off-by: Yu Kuai +Link: https://lore.kernel.org/r/20240803091137.3197008-1-yukuai1@huaweicloud.com +Signed-off-by: Song Liu +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/raid1.c | 14 ++++++++++---- + 1 file changed, 10 insertions(+), 4 deletions(-) + +diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c +index 7acfe7c9dc8d..761989d67906 100644 +--- a/drivers/md/raid1.c ++++ b/drivers/md/raid1.c +@@ -617,6 +617,12 @@ static int choose_first_rdev(struct r1conf *conf, struct r1bio *r1_bio, + return -1; + } + ++static bool rdev_in_recovery(struct md_rdev *rdev, struct r1bio *r1_bio) ++{ ++ return !test_bit(In_sync, &rdev->flags) && ++ rdev->recovery_offset < r1_bio->sector + r1_bio->sectors; ++} ++ + static int choose_bb_rdev(struct r1conf *conf, struct r1bio *r1_bio, + int *max_sectors) + { +@@ -635,6 +641,7 @@ static int choose_bb_rdev(struct r1conf *conf, struct r1bio *r1_bio, + + rdev = conf->mirrors[disk].rdev; + if (!rdev || test_bit(Faulty, &rdev->flags) || ++ rdev_in_recovery(rdev, r1_bio) || + test_bit(WriteMostly, &rdev->flags)) + continue; + +@@ -673,7 +680,8 @@ static int choose_slow_rdev(struct r1conf *conf, struct r1bio *r1_bio, + + rdev = conf->mirrors[disk].rdev; + if (!rdev || test_bit(Faulty, &rdev->flags) || +- !test_bit(WriteMostly, &rdev->flags)) ++ !test_bit(WriteMostly, &rdev->flags) || ++ rdev_in_recovery(rdev, r1_bio)) + continue; + + /* there are no bad blocks, we can use this disk */ +@@ -733,9 +741,7 @@ static bool rdev_readable(struct md_rdev *rdev, struct r1bio *r1_bio) + if (!rdev || test_bit(Faulty, &rdev->flags)) + return false; + +- /* still in recovery */ +- if (!test_bit(In_sync, &rdev->flags) && +- 
rdev->recovery_offset < r1_bio->sector + r1_bio->sectors) ++ if (rdev_in_recovery(rdev, r1_bio)) + return false; + + /* don't read from slow disk unless have to */ +-- +2.46.0 + diff --git a/queue-6.10/media-atomisp-fix-streaming-no-longer-working-on-byt-isp2400-devices.patch b/queue-6.10/media-atomisp-fix-streaming-no-longer-working-on-byt-isp2400-devices.patch new file mode 100644 index 00000000000..0300f4ac6e3 --- /dev/null +++ b/queue-6.10/media-atomisp-fix-streaming-no-longer-working-on-byt-isp2400-devices.patch @@ -0,0 +1,98 @@ +From 63de936b513f7a9ce559194d3269ac291f4f4662 Mon Sep 17 00:00:00 2001 +From: Hans de Goede +Date: Sun, 21 Jul 2024 17:38:40 +0200 +Subject: media: atomisp: Fix streaming no longer working on BYT / ISP2400 devices + +From: Hans de Goede + +commit 63de936b513f7a9ce559194d3269ac291f4f4662 upstream. + +Commit a0821ca14bb8 ("media: atomisp: Remove test pattern generator (TPG) +support") broke BYT support because it removed a seemingly unused field +from struct sh_css_sp_config and a seemingly unused value from enum +ia_css_input_mode. + +But these are part of the ABI between the kernel and firmware on ISP2400 +and this part of the TPG support removal changes broke ISP2400 support. + +ISP2401 support was not affected because on ISP2401 only a part of +struct sh_css_sp_config is used. + +Restore the removed field and enum value to fix this. 
+ +Fixes: a0821ca14bb8 ("media: atomisp: Remove test pattern generator (TPG) support") +Cc: stable@vger.kernel.org +Signed-off-by: Hans de Goede +Signed-off-by: Hans Verkuil +Signed-off-by: Greg Kroah-Hartman +--- + drivers/staging/media/atomisp/pci/ia_css_stream_public.h | 8 ++++-- + drivers/staging/media/atomisp/pci/sh_css_internal.h | 19 ++++++++++++--- + 2 files changed, 22 insertions(+), 5 deletions(-) + +--- a/drivers/staging/media/atomisp/pci/ia_css_stream_public.h ++++ b/drivers/staging/media/atomisp/pci/ia_css_stream_public.h +@@ -27,12 +27,16 @@ + #include "ia_css_prbs.h" + #include "ia_css_input_port.h" + +-/* Input modes, these enumerate all supported input modes. +- * Note that not all ISP modes support all input modes. ++/* ++ * Input modes, these enumerate all supported input modes. ++ * This enum is part of the atomisp firmware ABI and must ++ * NOT be changed! ++ * Note that not all ISP modes support all input modes. + */ + enum ia_css_input_mode { + IA_CSS_INPUT_MODE_SENSOR, /** data from sensor */ + IA_CSS_INPUT_MODE_FIFO, /** data from input-fifo */ ++ IA_CSS_INPUT_MODE_TPG, /** data from test-pattern generator */ + IA_CSS_INPUT_MODE_PRBS, /** data from pseudo-random bit stream */ + IA_CSS_INPUT_MODE_MEMORY, /** data from a frame in memory */ + IA_CSS_INPUT_MODE_BUFFERED_SENSOR /** data is sent through mipi buffer */ +--- a/drivers/staging/media/atomisp/pci/sh_css_internal.h ++++ b/drivers/staging/media/atomisp/pci/sh_css_internal.h +@@ -341,7 +341,14 @@ struct sh_css_sp_input_formatter_set { + + #define IA_CSS_MIPI_SIZE_CHECK_MAX_NOF_ENTRIES_PER_PORT (3) + +-/* SP configuration information */ ++/* ++ * SP configuration information ++ * ++ * This struct is part of the atomisp firmware ABI and is directly copied ++ * to ISP DRAM by sh_css_store_sp_group_to_ddr() ++ * ++ * Do NOT change this struct's layout or remove seemingly unused fields! 
++ */ + struct sh_css_sp_config { + u8 no_isp_sync; /* Signal host immediately after start */ + u8 enable_raw_pool_locking; /** Enable Raw Buffer Locking for HALv3 Support */ +@@ -351,6 +358,10 @@ struct sh_css_sp_config { + host (true) or when they are passed to the preview/video pipe + (false). */ + ++ /* ++ * Note the fields below are only used on the ISP2400 not on the ISP2401, ++ * sh_css_store_sp_group_to_ddr() skip copying these when run on the ISP2401. ++ */ + struct { + u8 a_changed; + u8 b_changed; +@@ -360,11 +371,13 @@ struct sh_css_sp_config { + } input_formatter; + + sync_generator_cfg_t sync_gen; ++ tpg_cfg_t tpg; + prbs_cfg_t prbs; + input_system_cfg_t input_circuit; + u8 input_circuit_cfg_changed; +- u32 mipi_sizes_for_check[N_CSI_PORTS][IA_CSS_MIPI_SIZE_CHECK_MAX_NOF_ENTRIES_PER_PORT]; +- u8 enable_isys_event_queue; ++ u32 mipi_sizes_for_check[N_CSI_PORTS][IA_CSS_MIPI_SIZE_CHECK_MAX_NOF_ENTRIES_PER_PORT]; ++ /* These last 2 fields are used on both the ISP2400 and the ISP2401 */ ++ u8 enable_isys_event_queue; + u8 disable_cont_vf; + }; + diff --git a/queue-6.10/memcg_write_event_control-fix-a-user-triggerable-oops.patch b/queue-6.10/memcg_write_event_control-fix-a-user-triggerable-oops.patch new file mode 100644 index 00000000000..4ebef5820e5 --- /dev/null +++ b/queue-6.10/memcg_write_event_control-fix-a-user-triggerable-oops.patch @@ -0,0 +1,39 @@ +From 046667c4d3196938e992fba0dfcde570aa85cd0e Mon Sep 17 00:00:00 2001 +From: Al Viro +Date: Sun, 21 Jul 2024 14:45:08 -0400 +Subject: memcg_write_event_control(): fix a user-triggerable oops + +From: Al Viro + +commit 046667c4d3196938e992fba0dfcde570aa85cd0e upstream. + +we are *not* guaranteed that anything past the terminating NUL +is mapped (let alone initialized with anything sane). 
+ +Fixes: 0dea116876ee ("cgroup: implement eventfd-based generic API for notifications") +Cc: stable@vger.kernel.org +Cc: Andrew Morton +Acked-by: Michal Hocko +Signed-off-by: Al Viro +Signed-off-by: Greg Kroah-Hartman +--- + mm/memcontrol.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -5282,9 +5282,12 @@ static ssize_t memcg_write_event_control + buf = endp + 1; + + cfd = simple_strtoul(buf, &endp, 10); +- if ((*endp != ' ') && (*endp != '\0')) ++ if (*endp == '\0') ++ buf = endp; ++ else if (*endp == ' ') ++ buf = endp + 1; ++ else + return -EINVAL; +- buf = endp + 1; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) diff --git a/queue-6.10/mm-fix-endless-reclaim-on-machines-with-unaccepted-memory.patch b/queue-6.10/mm-fix-endless-reclaim-on-machines-with-unaccepted-memory.patch new file mode 100644 index 00000000000..ac7e863e556 --- /dev/null +++ b/queue-6.10/mm-fix-endless-reclaim-on-machines-with-unaccepted-memory.patch @@ -0,0 +1,157 @@ +From 807174a93d24c456503692dc3f5af322ee0b640a Mon Sep 17 00:00:00 2001 +From: "Kirill A. Shutemov" +Date: Fri, 9 Aug 2024 14:48:47 +0300 +Subject: mm: fix endless reclaim on machines with unaccepted memory + +From: Kirill A. Shutemov + +commit 807174a93d24c456503692dc3f5af322ee0b640a upstream. + +Unaccepted memory is considered unusable free memory, which is not counted +as free on the zone watermark check. This causes get_page_from_freelist() +to accept more memory to hit the high watermark, but it creates problems +in the reclaim path. + +The reclaim path encounters a failed zone watermark check and attempts to +reclaim memory. This is usually successful, but if there is little or no +reclaimable memory, it can result in endless reclaim with little to no +progress. This can occur early in the boot process, just after start of +the init process when the only reclaimable memory is the page cache of the +init executable and its libraries. 
+ +Make unaccepted memory free from watermark check point of view. This way +unaccepted memory will never be the trigger of memory reclaim. Accept +more memory in the get_page_from_freelist() if needed. + +Link: https://lkml.kernel.org/r/20240809114854.3745464-2-kirill.shutemov@linux.intel.com +Fixes: dcdfdd40fa82 ("mm: Add support for unaccepted memory") +Signed-off-by: Kirill A. Shutemov +Reported-by: Jianxiong Gao +Acked-by: David Hildenbrand +Tested-by: Jianxiong Gao +Cc: Borislav Petkov +Cc: Johannes Weiner +Cc: Kirill A. Shutemov +Cc: Matthew Wilcox +Cc: Mel Gorman +Cc: Mike Rapoport (Microsoft) +Cc: Tom Lendacky +Cc: Vlastimil Babka +Cc: [6.5+] +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/page_alloc.c | 42 ++++++++++++++++++++---------------------- + 1 file changed, 20 insertions(+), 22 deletions(-) + +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -287,7 +287,7 @@ EXPORT_SYMBOL(nr_online_nodes); + + static bool page_contains_unaccepted(struct page *page, unsigned int order); + static void accept_page(struct page *page, unsigned int order); +-static bool try_to_accept_memory(struct zone *zone, unsigned int order); ++static bool cond_accept_memory(struct zone *zone, unsigned int order); + static inline bool has_unaccepted_memory(void); + static bool __free_unaccepted(struct page *page); + +@@ -3059,9 +3059,6 @@ static inline long __zone_watermark_unus + if (!(alloc_flags & ALLOC_CMA)) + unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES); + #endif +-#ifdef CONFIG_UNACCEPTED_MEMORY +- unusable_free += zone_page_state(z, NR_UNACCEPTED); +-#endif + + return unusable_free; + } +@@ -3355,6 +3352,8 @@ retry: + } + } + ++ cond_accept_memory(zone, order); ++ + /* + * Detect whether the number of free pages is below high + * watermark. 
If so, we will decrease pcp->high and free +@@ -3380,10 +3379,8 @@ check_alloc_wmark: + gfp_mask)) { + int ret; + +- if (has_unaccepted_memory()) { +- if (try_to_accept_memory(zone, order)) +- goto try_this_zone; +- } ++ if (cond_accept_memory(zone, order)) ++ goto try_this_zone; + + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + /* +@@ -3437,10 +3434,8 @@ try_this_zone: + + return page; + } else { +- if (has_unaccepted_memory()) { +- if (try_to_accept_memory(zone, order)) +- goto try_this_zone; +- } ++ if (cond_accept_memory(zone, order)) ++ goto try_this_zone; + + #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + /* Try again if zone has deferred pages */ +@@ -6933,9 +6928,6 @@ static bool try_to_accept_memory_one(str + struct page *page; + bool last; + +- if (list_empty(&zone->unaccepted_pages)) +- return false; +- + spin_lock_irqsave(&zone->lock, flags); + page = list_first_entry_or_null(&zone->unaccepted_pages, + struct page, lru); +@@ -6961,23 +6953,29 @@ static bool try_to_accept_memory_one(str + return true; + } + +-static bool try_to_accept_memory(struct zone *zone, unsigned int order) ++static bool cond_accept_memory(struct zone *zone, unsigned int order) + { + long to_accept; +- int ret = false; ++ bool ret = false; ++ ++ if (!has_unaccepted_memory()) ++ return false; ++ ++ if (list_empty(&zone->unaccepted_pages)) ++ return false; + + /* How much to accept to get to high watermark? 
*/ + to_accept = high_wmark_pages(zone) - + (zone_page_state(zone, NR_FREE_PAGES) - +- __zone_watermark_unusable_free(zone, order, 0)); ++ __zone_watermark_unusable_free(zone, order, 0) - ++ zone_page_state(zone, NR_UNACCEPTED)); + +- /* Accept at least one page */ +- do { ++ while (to_accept > 0) { + if (!try_to_accept_memory_one(zone)) + break; + ret = true; + to_accept -= MAX_ORDER_NR_PAGES; +- } while (to_accept > 0); ++ } + + return ret; + } +@@ -7020,7 +7018,7 @@ static void accept_page(struct page *pag + { + } + +-static bool try_to_accept_memory(struct zone *zone, unsigned int order) ++static bool cond_accept_memory(struct zone *zone, unsigned int order) + { + return false; + } diff --git a/queue-6.10/mm-hugetlb-fix-hugetlb-vs.-core-mm-pt-locking.patch b/queue-6.10/mm-hugetlb-fix-hugetlb-vs.-core-mm-pt-locking.patch new file mode 100644 index 00000000000..0d2b20aa1b5 --- /dev/null +++ b/queue-6.10/mm-hugetlb-fix-hugetlb-vs.-core-mm-pt-locking.patch @@ -0,0 +1,161 @@ +From 5f75cfbd6bb02295ddaed48adf667b6c828ce07b Mon Sep 17 00:00:00 2001 +From: David Hildenbrand +Date: Thu, 1 Aug 2024 22:47:48 +0200 +Subject: mm/hugetlb: fix hugetlb vs. core-mm PT locking + +From: David Hildenbrand + +commit 5f75cfbd6bb02295ddaed48adf667b6c828ce07b upstream. + +We recently made GUP's common page table walking code to also walk hugetlb +VMAs without most hugetlb special-casing, preparing for the future of +having less hugetlb-specific page table walking code in the codebase. +Turns out that we missed one page table locking detail: page table locking +for hugetlb folios that are not mapped using a single PMD/PUD. + +Assume we have hugetlb folio that spans multiple PTEs (e.g., 64 KiB +hugetlb folios on arm64 with 4 KiB base page size). GUP, as it walks the +page tables, will perform a pte_offset_map_lock() to grab the PTE table +lock. 
+ +However, hugetlb that concurrently modifies these page tables would +actually grab the mm->page_table_lock: with USE_SPLIT_PTE_PTLOCKS, the +locks would differ. Something similar can happen right now with hugetlb +folios that span multiple PMDs when USE_SPLIT_PMD_PTLOCKS. + +This issue can be reproduced [1], for example triggering: + +[ 3105.936100] ------------[ cut here ]------------ +[ 3105.939323] WARNING: CPU: 31 PID: 2732 at mm/gup.c:142 try_grab_folio+0x11c/0x188 +[ 3105.944634] Modules linked in: [...] +[ 3105.974841] CPU: 31 PID: 2732 Comm: reproducer Not tainted 6.10.0-64.eln141.aarch64 #1 +[ 3105.980406] Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20240524-4.fc40 05/24/2024 +[ 3105.986185] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) +[ 3105.991108] pc : try_grab_folio+0x11c/0x188 +[ 3105.994013] lr : follow_page_pte+0xd8/0x430 +[ 3105.996986] sp : ffff80008eafb8f0 +[ 3105.999346] x29: ffff80008eafb900 x28: ffffffe8d481f380 x27: 00f80001207cff43 +[ 3106.004414] x26: 0000000000000001 x25: 0000000000000000 x24: ffff80008eafba48 +[ 3106.009520] x23: 0000ffff9372f000 x22: ffff7a54459e2000 x21: ffff7a546c1aa978 +[ 3106.014529] x20: ffffffe8d481f3c0 x19: 0000000000610041 x18: 0000000000000001 +[ 3106.019506] x17: 0000000000000001 x16: ffffffffffffffff x15: 0000000000000000 +[ 3106.024494] x14: ffffb85477fdfe08 x13: 0000ffff9372ffff x12: 0000000000000000 +[ 3106.029469] x11: 1fffef4a88a96be1 x10: ffff7a54454b5f0c x9 : ffffb854771b12f0 +[ 3106.034324] x8 : 0008000000000000 x7 : ffff7a546c1aa980 x6 : 0008000000000080 +[ 3106.038902] x5 : 00000000001207cf x4 : 0000ffff9372f000 x3 : ffffffe8d481f000 +[ 3106.043420] x2 : 0000000000610041 x1 : 0000000000000001 x0 : 0000000000000000 +[ 3106.047957] Call trace: +[ 3106.049522] try_grab_folio+0x11c/0x188 +[ 3106.051996] follow_pmd_mask.constprop.0.isra.0+0x150/0x2e0 +[ 3106.055527] follow_page_mask+0x1a0/0x2b8 +[ 3106.058118] __get_user_pages+0xf0/0x348 +[ 3106.060647] 
faultin_page_range+0xb0/0x360 +[ 3106.063651] do_madvise+0x340/0x598 + +Let's make huge_pte_lockptr() effectively use the same PT locks as any +core-mm page table walker would. Add ptep_lockptr() to obtain the PTE +page table lock using a pte pointer -- unfortunately we cannot convert +pte_lockptr() because virt_to_page() doesn't work with kmap'ed page tables +we can have with CONFIG_HIGHPTE. + +Handle CONFIG_PGTABLE_LEVELS correctly by checking in reverse order, such +that when e.g., CONFIG_PGTABLE_LEVELS==2 with +PGDIR_SIZE==P4D_SIZE==PUD_SIZE==PMD_SIZE will work as expected. Document +why that works. + +There is one ugly case: powerpc 8xx, whereby we have an 8 MiB hugetlb +folio being mapped using two PTE page tables. While hugetlb wants to take +the PMD table lock, core-mm would grab the PTE table lock of one of both +PTE page tables. In such corner cases, we have to make sure that both +locks match, which is (fortunately!) currently guaranteed for 8xx as it +does not support SMP and consequently doesn't use split PT locks. 
+ +[1] https://lore.kernel.org/all/1bbfcc7f-f222-45a5-ac44-c5a1381c596d@redhat.com/ + +Link: https://lkml.kernel.org/r/20240801204748.99107-1-david@redhat.com +Fixes: 9cb28da54643 ("mm/gup: handle hugetlb in the generic follow_page_mask code") +Signed-off-by: David Hildenbrand +Acked-by: Peter Xu +Reviewed-by: Baolin Wang +Tested-by: Baolin Wang +Cc: Peter Xu +Cc: Oscar Salvador +Cc: Muchun Song +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/hugetlb.h | 33 ++++++++++++++++++++++++++++++--- + include/linux/mm.h | 11 +++++++++++ + 2 files changed, 41 insertions(+), 3 deletions(-) + +--- a/include/linux/hugetlb.h ++++ b/include/linux/hugetlb.h +@@ -967,10 +967,37 @@ static inline bool htlb_allow_alloc_fall + static inline spinlock_t *huge_pte_lockptr(struct hstate *h, + struct mm_struct *mm, pte_t *pte) + { +- if (huge_page_size(h) == PMD_SIZE) ++ const unsigned long size = huge_page_size(h); ++ ++ VM_WARN_ON(size == PAGE_SIZE); ++ ++ /* ++ * hugetlb must use the exact same PT locks as core-mm page table ++ * walkers would. When modifying a PTE table, hugetlb must take the ++ * PTE PT lock, when modifying a PMD table, hugetlb must take the PMD ++ * PT lock etc. ++ * ++ * The expectation is that any hugetlb folio smaller than a PMD is ++ * always mapped into a single PTE table and that any hugetlb folio ++ * smaller than a PUD (but at least as big as a PMD) is always mapped ++ * into a single PMD table. ++ * ++ * If that does not hold for an architecture, then that architecture ++ * must disable split PT locks such that all *_lockptr() functions ++ * will give us the same result: the per-MM PT lock. ++ * ++ * Note that with e.g., CONFIG_PGTABLE_LEVELS=2 where ++ * PGDIR_SIZE==P4D_SIZE==PUD_SIZE==PMD_SIZE, we'd use pud_lockptr() ++ * and core-mm would use pmd_lockptr(). However, in such configurations ++ * split PMD locks are disabled -- they don't make sense on a single ++ * PGDIR page table -- and the end result is the same. 
++ */ ++ if (size >= PUD_SIZE) ++ return pud_lockptr(mm, (pud_t *) pte); ++ else if (size >= PMD_SIZE || IS_ENABLED(CONFIG_HIGHPTE)) + return pmd_lockptr(mm, (pmd_t *) pte); +- VM_BUG_ON(huge_page_size(h) == PAGE_SIZE); +- return &mm->page_table_lock; ++ /* pte_alloc_huge() only applies with !CONFIG_HIGHPTE */ ++ return ptep_lockptr(mm, pte); + } + + #ifndef hugepages_supported +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -2960,6 +2960,13 @@ static inline spinlock_t *pte_lockptr(st + return ptlock_ptr(page_ptdesc(pmd_page(*pmd))); + } + ++static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte) ++{ ++ BUILD_BUG_ON(IS_ENABLED(CONFIG_HIGHPTE)); ++ BUILD_BUG_ON(MAX_PTRS_PER_PTE * sizeof(pte_t) > PAGE_SIZE); ++ return ptlock_ptr(virt_to_ptdesc(pte)); ++} ++ + static inline bool ptlock_init(struct ptdesc *ptdesc) + { + /* +@@ -2984,6 +2991,10 @@ static inline spinlock_t *pte_lockptr(st + { + return &mm->page_table_lock; + } ++static inline spinlock_t *ptep_lockptr(struct mm_struct *mm, pte_t *pte) ++{ ++ return &mm->page_table_lock; ++} + static inline void ptlock_cache_init(void) {} + static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } + static inline void ptlock_free(struct ptdesc *ptdesc) {} diff --git a/queue-6.10/mm-memory-failure-use-raw_spinlock_t-in-struct-memory_failure_cpu.patch b/queue-6.10/mm-memory-failure-use-raw_spinlock_t-in-struct-memory_failure_cpu.patch new file mode 100644 index 00000000000..1132422c985 --- /dev/null +++ b/queue-6.10/mm-memory-failure-use-raw_spinlock_t-in-struct-memory_failure_cpu.patch @@ -0,0 +1,129 @@ +From d75abd0d0bc29e6ebfebbf76d11b4067b35844af Mon Sep 17 00:00:00 2001 +From: Waiman Long +Date: Tue, 6 Aug 2024 12:41:07 -0400 +Subject: mm/memory-failure: use raw_spinlock_t in struct memory_failure_cpu + +From: Waiman Long + +commit d75abd0d0bc29e6ebfebbf76d11b4067b35844af upstream. + +The memory_failure_cpu structure is a per-cpu structure. 
Access to its +content requires the use of get_cpu_var() to lock in the current CPU and +disable preemption. The use of a regular spinlock_t for locking purpose +is fine for a non-RT kernel. + +Since the integration of RT spinlock support into the v5.15 kernel, a +spinlock_t in a RT kernel becomes a sleeping lock and taking a sleeping +lock in a preemption disabled context is illegal resulting in the +following kind of warning. + + [12135.732244] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 + [12135.732248] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 270076, name: kworker/0:0 + [12135.732252] preempt_count: 1, expected: 0 + [12135.732255] RCU nest depth: 2, expected: 2 + : + [12135.732420] Hardware name: Dell Inc. PowerEdge R640/0HG0J8, BIOS 2.10.2 02/24/2021 + [12135.732423] Workqueue: kacpi_notify acpi_os_execute_deferred + [12135.732433] Call Trace: + [12135.732436] + [12135.732450] dump_stack_lvl+0x57/0x81 + [12135.732461] __might_resched.cold+0xf4/0x12f + [12135.732479] rt_spin_lock+0x4c/0x100 + [12135.732491] memory_failure_queue+0x40/0xe0 + [12135.732503] ghes_do_memory_failure+0x53/0x390 + [12135.732516] ghes_do_proc.constprop.0+0x229/0x3e0 + [12135.732575] ghes_proc+0xf9/0x1a0 + [12135.732591] ghes_notify_hed+0x6a/0x150 + [12135.732602] notifier_call_chain+0x43/0xb0 + [12135.732626] blocking_notifier_call_chain+0x43/0x60 + [12135.732637] acpi_ev_notify_dispatch+0x47/0x70 + [12135.732648] acpi_os_execute_deferred+0x13/0x20 + [12135.732654] process_one_work+0x41f/0x500 + [12135.732695] worker_thread+0x192/0x360 + [12135.732715] kthread+0x111/0x140 + [12135.732733] ret_from_fork+0x29/0x50 + [12135.732779] + +Fix it by using a raw_spinlock_t for locking instead. + +Also move the pr_err() out of the lock critical section and after +put_cpu_ptr() to avoid indeterminate latency and the possibility of sleep +with this call. 
+ +[longman@redhat.com: don't hold percpu ref across pr_err(), per Miaohe] + Link: https://lkml.kernel.org/r/20240807181130.1122660-1-longman@redhat.com +Link: https://lkml.kernel.org/r/20240806164107.1044956-1-longman@redhat.com +Fixes: 0f383b6dc96e ("locking/spinlock: Provide RT variant") +Signed-off-by: Waiman Long +Acked-by: Miaohe Lin +Cc: "Huang, Ying" +Cc: Juri Lelli +Cc: Len Brown +Cc: Naoya Horiguchi +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/memory-failure.c | 20 +++++++++++--------- + 1 file changed, 11 insertions(+), 9 deletions(-) + +--- a/mm/memory-failure.c ++++ b/mm/memory-failure.c +@@ -2406,7 +2406,7 @@ struct memory_failure_entry { + struct memory_failure_cpu { + DECLARE_KFIFO(fifo, struct memory_failure_entry, + MEMORY_FAILURE_FIFO_SIZE); +- spinlock_t lock; ++ raw_spinlock_t lock; + struct work_struct work; + }; + +@@ -2432,20 +2432,22 @@ void memory_failure_queue(unsigned long + { + struct memory_failure_cpu *mf_cpu; + unsigned long proc_flags; ++ bool buffer_overflow; + struct memory_failure_entry entry = { + .pfn = pfn, + .flags = flags, + }; + + mf_cpu = &get_cpu_var(memory_failure_cpu); +- spin_lock_irqsave(&mf_cpu->lock, proc_flags); +- if (kfifo_put(&mf_cpu->fifo, entry)) ++ raw_spin_lock_irqsave(&mf_cpu->lock, proc_flags); ++ buffer_overflow = !kfifo_put(&mf_cpu->fifo, entry); ++ if (!buffer_overflow) + schedule_work_on(smp_processor_id(), &mf_cpu->work); +- else ++ raw_spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); ++ put_cpu_var(memory_failure_cpu); ++ if (buffer_overflow) + pr_err("buffer overflow when queuing memory failure at %#lx\n", + pfn); +- spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); +- put_cpu_var(memory_failure_cpu); + } + EXPORT_SYMBOL_GPL(memory_failure_queue); + +@@ -2458,9 +2460,9 @@ static void memory_failure_work_func(str + + mf_cpu = container_of(work, struct memory_failure_cpu, work); + for (;;) { +- spin_lock_irqsave(&mf_cpu->lock, proc_flags); ++ 
raw_spin_lock_irqsave(&mf_cpu->lock, proc_flags); + gotten = kfifo_get(&mf_cpu->fifo, &entry); +- spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); ++ raw_spin_unlock_irqrestore(&mf_cpu->lock, proc_flags); + if (!gotten) + break; + if (entry.flags & MF_SOFT_OFFLINE) +@@ -2490,7 +2492,7 @@ static int __init memory_failure_init(vo + + for_each_possible_cpu(cpu) { + mf_cpu = &per_cpu(memory_failure_cpu, cpu); +- spin_lock_init(&mf_cpu->lock); ++ raw_spin_lock_init(&mf_cpu->lock); + INIT_KFIFO(mf_cpu->fifo); + INIT_WORK(&mf_cpu->work, memory_failure_work_func); + } diff --git a/queue-6.10/mm-numa-no-task_numa_fault-call-if-pmd-is-changed.patch b/queue-6.10/mm-numa-no-task_numa_fault-call-if-pmd-is-changed.patch new file mode 100644 index 00000000000..ef189260d4d --- /dev/null +++ b/queue-6.10/mm-numa-no-task_numa_fault-call-if-pmd-is-changed.patch @@ -0,0 +1,93 @@ +From fd8c35a92910f4829b7c99841f39b1b952c259d5 Mon Sep 17 00:00:00 2001 +From: Zi Yan +Date: Fri, 9 Aug 2024 10:59:05 -0400 +Subject: mm/numa: no task_numa_fault() call if PMD is changed + +From: Zi Yan + +commit fd8c35a92910f4829b7c99841f39b1b952c259d5 upstream. + +When handling a numa page fault, task_numa_fault() should be called by a +process that restores the page table of the faulted folio to avoid +duplicated stats counting. Commit c5b5a3dd2c1f ("mm: thp: refactor NUMA +fault handling") restructured do_huge_pmd_numa_page() and did not avoid +task_numa_fault() call in the second page table check after a numa +migration failure. Fix it by making all !pmd_same() return immediately. + +This issue can cause task_numa_fault() being called more than necessary +and lead to unexpected numa balancing results (It is hard to tell whether +the issue will cause positive or negative performance impact due to +duplicated numa fault counting). 
+ +Link: https://lkml.kernel.org/r/20240809145906.1513458-3-ziy@nvidia.com +Fixes: c5b5a3dd2c1f ("mm: thp: refactor NUMA fault handling") +Reported-by: "Huang, Ying" +Closes: https://lore.kernel.org/linux-mm/87zfqfw0yw.fsf@yhuang6-desk2.ccr.corp.intel.com/ +Signed-off-by: Zi Yan +Acked-by: David Hildenbrand +Cc: Baolin Wang +Cc: "Huang, Ying" +Cc: Kefeng Wang +Cc: Mel Gorman +Cc: Yang Shi +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/huge_memory.c | 29 +++++++++++++---------------- + 1 file changed, 13 insertions(+), 16 deletions(-) + +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -1672,7 +1672,7 @@ vm_fault_t do_huge_pmd_numa_page(struct + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) { + spin_unlock(vmf->ptl); +- goto out; ++ return 0; + } + + pmd = pmd_modify(oldpmd, vma->vm_page_prot); +@@ -1715,22 +1715,16 @@ vm_fault_t do_huge_pmd_numa_page(struct + if (!migrate_misplaced_folio(folio, vma, target_nid)) { + flags |= TNF_MIGRATED; + nid = target_nid; +- } else { +- flags |= TNF_MIGRATE_FAIL; +- vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); +- if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) { +- spin_unlock(vmf->ptl); +- goto out; +- } +- goto out_map; +- } +- +-out: +- if (nid != NUMA_NO_NODE) + task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags); ++ return 0; ++ } + +- return 0; +- ++ flags |= TNF_MIGRATE_FAIL; ++ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); ++ if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) { ++ spin_unlock(vmf->ptl); ++ return 0; ++ } + out_map: + /* Restore the PMD */ + pmd = pmd_modify(oldpmd, vma->vm_page_prot); +@@ -1740,7 +1734,10 @@ out_map: + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); + spin_unlock(vmf->ptl); +- goto out; ++ ++ if (nid != NUMA_NO_NODE) ++ task_numa_fault(last_cpupid, nid, HPAGE_PMD_NR, flags); ++ return 0; + } + + /* diff --git 
a/queue-6.10/mm-numa-no-task_numa_fault-call-if-pte-is-changed.patch b/queue-6.10/mm-numa-no-task_numa_fault-call-if-pte-is-changed.patch new file mode 100644 index 00000000000..4f4f6a1039b --- /dev/null +++ b/queue-6.10/mm-numa-no-task_numa_fault-call-if-pte-is-changed.patch @@ -0,0 +1,97 @@ +From 40b760cfd44566bca791c80e0720d70d75382b84 Mon Sep 17 00:00:00 2001 +From: Zi Yan +Date: Fri, 9 Aug 2024 10:59:04 -0400 +Subject: mm/numa: no task_numa_fault() call if PTE is changed + +From: Zi Yan + +commit 40b760cfd44566bca791c80e0720d70d75382b84 upstream. + +When handling a numa page fault, task_numa_fault() should be called by a +process that restores the page table of the faulted folio to avoid +duplicated stats counting. Commit b99a342d4f11 ("NUMA balancing: reduce +TLB flush via delaying mapping on hint page fault") restructured +do_numa_page() and did not avoid task_numa_fault() call in the second page +table check after a numa migration failure. Fix it by making all +!pte_same() return immediately. + +This issue can cause task_numa_fault() being called more than necessary +and lead to unexpected numa balancing results (It is hard to tell whether +the issue will cause positive or negative performance impact due to +duplicated numa fault counting). 
+ +Link: https://lkml.kernel.org/r/20240809145906.1513458-2-ziy@nvidia.com +Fixes: b99a342d4f11 ("NUMA balancing: reduce TLB flush via delaying mapping on hint page fault") +Signed-off-by: Zi Yan +Reported-by: "Huang, Ying" +Closes: https://lore.kernel.org/linux-mm/87zfqfw0yw.fsf@yhuang6-desk2.ccr.corp.intel.com/ +Acked-by: David Hildenbrand +Cc: Baolin Wang +Cc: Kefeng Wang +Cc: Mel Gorman +Cc: Yang Shi +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/memory.c | 33 ++++++++++++++++----------------- + 1 file changed, 16 insertions(+), 17 deletions(-) + +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -5155,7 +5155,7 @@ static vm_fault_t do_numa_page(struct vm + + if (unlikely(!pte_same(old_pte, vmf->orig_pte))) { + pte_unmap_unlock(vmf->pte, vmf->ptl); +- goto out; ++ return 0; + } + + pte = pte_modify(old_pte, vma->vm_page_prot); +@@ -5218,23 +5218,19 @@ static vm_fault_t do_numa_page(struct vm + if (!migrate_misplaced_folio(folio, vma, target_nid)) { + nid = target_nid; + flags |= TNF_MIGRATED; +- } else { +- flags |= TNF_MIGRATE_FAIL; +- vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, +- vmf->address, &vmf->ptl); +- if (unlikely(!vmf->pte)) +- goto out; +- if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { +- pte_unmap_unlock(vmf->pte, vmf->ptl); +- goto out; +- } +- goto out_map; ++ task_numa_fault(last_cpupid, nid, nr_pages, flags); ++ return 0; + } + +-out: +- if (nid != NUMA_NO_NODE) +- task_numa_fault(last_cpupid, nid, nr_pages, flags); +- return 0; ++ flags |= TNF_MIGRATE_FAIL; ++ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, ++ vmf->address, &vmf->ptl); ++ if (unlikely(!vmf->pte)) ++ return 0; ++ if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) { ++ pte_unmap_unlock(vmf->pte, vmf->ptl); ++ return 0; ++ } + out_map: + /* + * Make it present again, depending on how arch implements +@@ -5247,7 +5243,10 @@ out_map: + numa_rebuild_single_mapping(vmf, vma, vmf->address, vmf->pte, + writable); + 
pte_unmap_unlock(vmf->pte, vmf->ptl); +- goto out; ++ ++ if (nid != NUMA_NO_NODE) ++ task_numa_fault(last_cpupid, nid, nr_pages, flags); ++ return 0; + } + + static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) diff --git a/queue-6.10/mm-vmalloc-fix-page-mapping-if-vm_area_alloc_pages-with-high-order-fallback-to-order-0.patch b/queue-6.10/mm-vmalloc-fix-page-mapping-if-vm_area_alloc_pages-with-high-order-fallback-to-order-0.patch new file mode 100644 index 00000000000..279ea805823 --- /dev/null +++ b/queue-6.10/mm-vmalloc-fix-page-mapping-if-vm_area_alloc_pages-with-high-order-fallback-to-order-0.patch @@ -0,0 +1,68 @@ +From 61ebe5a747da649057c37be1c37eb934b4af79ca Mon Sep 17 00:00:00 2001 +From: Hailong Liu +Date: Thu, 8 Aug 2024 20:19:56 +0800 +Subject: mm/vmalloc: fix page mapping if vm_area_alloc_pages() with high order fallback to order 0 + +From: Hailong Liu + +commit 61ebe5a747da649057c37be1c37eb934b4af79ca upstream. + +The __vmap_pages_range_noflush() assumes its argument pages** contains +pages with the same page shift. However, since commit e9c3cda4d86e ("mm, +vmalloc: fix high order __GFP_NOFAIL allocations"), if gfp_flags includes +__GFP_NOFAIL with high order in vm_area_alloc_pages() and page allocation +failed for high order, the pages** may contain two different page shifts +(high order and order-0). This could lead __vmap_pages_range_noflush() to +perform incorrect mappings, potentially resulting in memory corruption. + +Users might encounter this as follows (vmap_allow_huge = true, 2M is for +PMD_SIZE): + +kvmalloc(2M, __GFP_NOFAIL|GFP_X) + __vmalloc_node_range_noprof(vm_flags=VM_ALLOW_HUGE_VMAP) + vm_area_alloc_pages(order=9) ---> order-9 allocation failed and fallback to order-0 + vmap_pages_range() + vmap_pages_range_noflush() + __vmap_pages_range_noflush(page_shift = 21) ----> wrong mapping happens + +We can remove the fallback code because if a high-order allocation fails, +__vmalloc_node_range_noprof() will retry with order-0. 
Therefore, it is +unnecessary to fallback to order-0 here. Therefore, fix this by removing +the fallback code. + +Link: https://lkml.kernel.org/r/20240808122019.3361-1-hailong.liu@oppo.com +Fixes: e9c3cda4d86e ("mm, vmalloc: fix high order __GFP_NOFAIL allocations") +Signed-off-by: Hailong Liu +Reported-by: Tangquan Zheng +Reviewed-by: Baoquan He +Reviewed-by: Uladzislau Rezki (Sony) +Acked-by: Barry Song +Acked-by: Michal Hocko +Cc: Matthew Wilcox +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/vmalloc.c | 11 ++--------- + 1 file changed, 2 insertions(+), 9 deletions(-) + +--- a/mm/vmalloc.c ++++ b/mm/vmalloc.c +@@ -3583,15 +3583,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid, + page = alloc_pages_noprof(alloc_gfp, order); + else + page = alloc_pages_node_noprof(nid, alloc_gfp, order); +- if (unlikely(!page)) { +- if (!nofail) +- break; +- +- /* fall back to the zero order allocations */ +- alloc_gfp |= __GFP_NOFAIL; +- order = 0; +- continue; +- } ++ if (unlikely(!page)) ++ break; + + /* + * Higher order allocations must be able to be treated as diff --git a/queue-6.10/mseal-fix-is_madv_discard.patch b/queue-6.10/mseal-fix-is_madv_discard.patch new file mode 100644 index 00000000000..6f662379b1c --- /dev/null +++ b/queue-6.10/mseal-fix-is_madv_discard.patch @@ -0,0 +1,66 @@ +From e46bc2e7eb90a370bc27fa2fd98cb8251e7da1ec Mon Sep 17 00:00:00 2001 +From: Pedro Falcato +Date: Wed, 7 Aug 2024 18:33:35 +0100 +Subject: mseal: fix is_madv_discard() + +From: Pedro Falcato + +commit e46bc2e7eb90a370bc27fa2fd98cb8251e7da1ec upstream. + +is_madv_discard did its check wrong. MADV_ flags are not bitwise, +they're normal sequential numbers. So, for instance: + behavior & (/* ... */ | MADV_REMOVE) + +tagged both MADV_REMOVE and MADV_RANDOM (bit 0 set) as discard +operations. 
+ +As a result the kernel could erroneously block certain madvises (e.g +MADV_RANDOM or MADV_HUGEPAGE) on sealed VMAs due to them sharing bits +with blocked MADV operations (e.g REMOVE or WIPEONFORK). + +This is obviously incorrect, so use a switch statement instead. + +Link: https://lkml.kernel.org/r/20240807173336.2523757-1-pedro.falcato@gmail.com +Link: https://lkml.kernel.org/r/20240807173336.2523757-2-pedro.falcato@gmail.com +Fixes: 8be7258aad44 ("mseal: add mseal syscall") +Signed-off-by: Pedro Falcato +Tested-by: Jeff Xu +Reviewed-by: Jeff Xu +Cc: Kees Cook +Cc: Liam R. Howlett +Cc: Shuah Khan +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/mseal.c | 14 +++++++++++--- + 1 file changed, 11 insertions(+), 3 deletions(-) + +diff --git a/mm/mseal.c b/mm/mseal.c +index bf783bba8ed0..15bba28acc00 100644 +--- a/mm/mseal.c ++++ b/mm/mseal.c +@@ -40,9 +40,17 @@ static bool can_modify_vma(struct vm_area_struct *vma) + + static bool is_madv_discard(int behavior) + { +- return behavior & +- (MADV_FREE | MADV_DONTNEED | MADV_DONTNEED_LOCKED | +- MADV_REMOVE | MADV_DONTFORK | MADV_WIPEONFORK); ++ switch (behavior) { ++ case MADV_FREE: ++ case MADV_DONTNEED: ++ case MADV_DONTNEED_LOCKED: ++ case MADV_REMOVE: ++ case MADV_DONTFORK: ++ case MADV_WIPEONFORK: ++ return true; ++ } ++ ++ return false; + } + + static bool is_ro_anon(struct vm_area_struct *vma) +-- +2.46.0 + diff --git a/queue-6.10/net-mana-fix-doorbell-out-of-order-violation-and-avoid-unnecessary-doorbell-rings.patch b/queue-6.10/net-mana-fix-doorbell-out-of-order-violation-and-avoid-unnecessary-doorbell-rings.patch new file mode 100644 index 00000000000..5c01b20f48d --- /dev/null +++ b/queue-6.10/net-mana-fix-doorbell-out-of-order-violation-and-avoid-unnecessary-doorbell-rings.patch @@ -0,0 +1,89 @@ +From 58a63729c957621f1990c3494c702711188ca347 Mon Sep 17 00:00:00 2001 +From: Long Li +Date: Fri, 9 Aug 2024 08:58:58 -0700 +Subject: net: mana: Fix doorbell out of order violation 
and avoid unnecessary doorbell rings + +From: Long Li + +commit 58a63729c957621f1990c3494c702711188ca347 upstream. + +After napi_complete_done() is called when NAPI is polling in the current +process context, another NAPI may be scheduled and start running in +softirq on another CPU and may ring the doorbell before the current CPU +does. When combined with unnecessary rings when there is no need to arm +the CQ, it triggers error paths in the hardware. + +This patch fixes this by calling napi_complete_done() after doorbell +rings. It limits the number of unnecessary rings when there is +no need to arm. MANA hardware specifies that there must be one doorbell +ring every 8 CQ wraparounds. This driver guarantees one doorbell ring as +soon as the number of consumed CQEs exceeds 4 CQ wraparounds. In practical +workloads, the 4 CQ wraparounds proves to be big enough that it rarely +exceeds this limit before all the napi weight is consumed. + +To implement this, add a per-CQ counter cq->work_done_since_doorbell, +and make sure the CQ is armed as soon as passing 4 wraparounds of the CQ. 
+ +Cc: stable@vger.kernel.org +Fixes: e1b5683ff62e ("net: mana: Move NAPI from EQ to CQ") +Reviewed-by: Haiyang Zhang +Signed-off-by: Long Li +Link: https://patch.msgid.link/1723219138-29887-1-git-send-email-longli@linuxonhyperv.com +Signed-off-by: Paolo Abeni +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/microsoft/mana/mana_en.c | 22 ++++++++++++++-------- + include/net/mana/mana.h | 1 + + 2 files changed, 15 insertions(+), 8 deletions(-) + +--- a/drivers/net/ethernet/microsoft/mana/mana_en.c ++++ b/drivers/net/ethernet/microsoft/mana/mana_en.c +@@ -1777,7 +1777,6 @@ static void mana_poll_rx_cq(struct mana_ + static int mana_cq_handler(void *context, struct gdma_queue *gdma_queue) + { + struct mana_cq *cq = context; +- u8 arm_bit; + int w; + + WARN_ON_ONCE(cq->gdma_cq != gdma_queue); +@@ -1788,16 +1787,23 @@ static int mana_cq_handler(void *context + mana_poll_tx_cq(cq); + + w = cq->work_done; ++ cq->work_done_since_doorbell += w; + +- if (w < cq->budget && +- napi_complete_done(&cq->napi, w)) { +- arm_bit = SET_ARM_BIT; +- } else { +- arm_bit = 0; ++ if (w < cq->budget) { ++ mana_gd_ring_cq(gdma_queue, SET_ARM_BIT); ++ cq->work_done_since_doorbell = 0; ++ napi_complete_done(&cq->napi, w); ++ } else if (cq->work_done_since_doorbell > ++ cq->gdma_cq->queue_size / COMP_ENTRY_SIZE * 4) { ++ /* MANA hardware requires at least one doorbell ring every 8 ++ * wraparounds of CQ even if there is no need to arm the CQ. ++ * This driver rings the doorbell as soon as we have exceeded ++ * 4 wraparounds. 
++ */ ++ mana_gd_ring_cq(gdma_queue, 0); ++ cq->work_done_since_doorbell = 0; + } + +- mana_gd_ring_cq(gdma_queue, arm_bit); +- + return w; + } + +--- a/include/net/mana/mana.h ++++ b/include/net/mana/mana.h +@@ -274,6 +274,7 @@ struct mana_cq { + /* NAPI data */ + struct napi_struct napi; + int work_done; ++ int work_done_since_doorbell; + int budget; + }; + diff --git a/queue-6.10/net-mana-fix-rx-buf-alloc_size-alignment-and-atomic-op-panic.patch b/queue-6.10/net-mana-fix-rx-buf-alloc_size-alignment-and-atomic-op-panic.patch new file mode 100644 index 00000000000..26ac6054315 --- /dev/null +++ b/queue-6.10/net-mana-fix-rx-buf-alloc_size-alignment-and-atomic-op-panic.patch @@ -0,0 +1,67 @@ +From 32316f676b4ee87c0404d333d248ccf777f739bc Mon Sep 17 00:00:00 2001 +From: Haiyang Zhang +Date: Fri, 9 Aug 2024 14:01:24 -0700 +Subject: net: mana: Fix RX buf alloc_size alignment and atomic op panic + +From: Haiyang Zhang + +commit 32316f676b4ee87c0404d333d248ccf777f739bc upstream. + +The MANA driver's RX buffer alloc_size is passed into napi_build_skb() to +create SKB. skb_shinfo(skb) is located at the end of skb, and its alignment +is affected by the alloc_size passed into napi_build_skb(). The size needs +to be aligned properly for better performance and atomic operations. +Otherwise, on ARM64 CPU, for certain MTU settings like 4000, atomic +operations may panic on the skb_shinfo(skb)->dataref due to alignment fault. + +To fix this bug, add proper alignment to the alloc_size calculation. 
+ +Sample panic info: +[ 253.298819] Unable to handle kernel paging request at virtual address ffff000129ba5cce +[ 253.300900] Mem abort info: +[ 253.301760] ESR = 0x0000000096000021 +[ 253.302825] EC = 0x25: DABT (current EL), IL = 32 bits +[ 253.304268] SET = 0, FnV = 0 +[ 253.305172] EA = 0, S1PTW = 0 +[ 253.306103] FSC = 0x21: alignment fault +Call trace: + __skb_clone+0xfc/0x198 + skb_clone+0x78/0xe0 + raw6_local_deliver+0xfc/0x228 + ip6_protocol_deliver_rcu+0x80/0x500 + ip6_input_finish+0x48/0x80 + ip6_input+0x48/0xc0 + ip6_sublist_rcv_finish+0x50/0x78 + ip6_sublist_rcv+0x1cc/0x2b8 + ipv6_list_rcv+0x100/0x150 + __netif_receive_skb_list_core+0x180/0x220 + netif_receive_skb_list_internal+0x198/0x2a8 + __napi_poll+0x138/0x250 + net_rx_action+0x148/0x330 + handle_softirqs+0x12c/0x3a0 + +Cc: stable@vger.kernel.org +Fixes: 80f6215b450e ("net: mana: Add support for jumbo frame") +Signed-off-by: Haiyang Zhang +Reviewed-by: Long Li +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/microsoft/mana/mana_en.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/microsoft/mana/mana_en.c ++++ b/drivers/net/ethernet/microsoft/mana/mana_en.c +@@ -599,7 +599,11 @@ static void mana_get_rxbuf_cfg(int mtu, + else + *headroom = XDP_PACKET_HEADROOM; + +- *alloc_size = mtu + MANA_RXBUF_PAD + *headroom; ++ *alloc_size = SKB_DATA_ALIGN(mtu + MANA_RXBUF_PAD + *headroom); ++ ++ /* Using page pool in this case, so alloc_size is PAGE_SIZE */ ++ if (*alloc_size < PAGE_SIZE) ++ *alloc_size = PAGE_SIZE; + + *datasize = mtu + ETH_HLEN; + } diff --git a/queue-6.10/perf-bpf-don-t-call-bpf_overflow_handler-for-tracing-events.patch b/queue-6.10/perf-bpf-don-t-call-bpf_overflow_handler-for-tracing-events.patch new file mode 100644 index 00000000000..46bd327e783 --- /dev/null +++ b/queue-6.10/perf-bpf-don-t-call-bpf_overflow_handler-for-tracing-events.patch @@ -0,0 +1,51 @@ +From 
100bff23818eb61751ed05d64a7df36ce9728a4d Mon Sep 17 00:00:00 2001 +From: Kyle Huey +Date: Tue, 13 Aug 2024 15:17:27 +0000 +Subject: perf/bpf: Don't call bpf_overflow_handler() for tracing events + +From: Kyle Huey + +commit 100bff23818eb61751ed05d64a7df36ce9728a4d upstream. + +The regressing commit is new in 6.10. It assumed that anytime event->prog +is set bpf_overflow_handler() should be invoked to execute the attached bpf +program. This assumption is false for tracing events, and as a result the +regressing commit broke bpftrace by invoking the bpf handler with garbage +inputs on overflow. + +Prior to the regression the overflow handlers formed a chain (of length 0, +1, or 2) and perf_event_set_bpf_handler() (the !tracing case) added +bpf_overflow_handler() to that chain, while perf_event_attach_bpf_prog() +(the tracing case) did not. Both set event->prog. The chain of overflow +handlers was replaced by a single overflow handler slot and a fixed call to +bpf_overflow_handler() when appropriate. This modifies the condition there +to check event->prog->type == BPF_PROG_TYPE_PERF_EVENT, restoring the +previous behavior and fixing bpftrace. 
+ +Signed-off-by: Kyle Huey +Suggested-by: Andrii Nakryiko +Reported-by: Joe Damato +Closes: https://lore.kernel.org/lkml/ZpFfocvyF3KHaSzF@LQ3V64L9R2/ +Fixes: f11f10bfa1ca ("perf/bpf: Call BPF handler directly, not through overflow machinery") +Cc: stable@vger.kernel.org +Tested-by: Joe Damato # bpftrace +Acked-by: Andrii Nakryiko +Link: https://lore.kernel.org/r/20240813151727.28797-1-jdamato@fastly.com +Signed-off-by: Alexei Starovoitov +Signed-off-by: Greg Kroah-Hartman +--- + kernel/events/core.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/kernel/events/core.c ++++ b/kernel/events/core.c +@@ -9708,7 +9708,8 @@ static int __perf_event_overflow(struct + + ret = __perf_event_account_interrupt(event, throttle); + +- if (event->prog && !bpf_overflow_handler(event, data, regs)) ++ if (event->prog && event->prog->type == BPF_PROG_TYPE_PERF_EVENT && ++ !bpf_overflow_handler(event, data, regs)) + return ret; + + /* diff --git a/queue-6.10/riscv-change-xip-s-kernel_map.size-to-be-size-of-the-entire-kernel.patch b/queue-6.10/riscv-change-xip-s-kernel_map.size-to-be-size-of-the-entire-kernel.patch new file mode 100644 index 00000000000..faf6590c866 --- /dev/null +++ b/queue-6.10/riscv-change-xip-s-kernel_map.size-to-be-size-of-the-entire-kernel.patch @@ -0,0 +1,50 @@ +From 57d76bc51fd80824bcc0c84a5b5ec944f1b51edd Mon Sep 17 00:00:00 2001 +From: Nam Cao +Date: Wed, 8 May 2024 21:19:17 +0200 +Subject: riscv: change XIP's kernel_map.size to be size of the entire kernel + +From: Nam Cao + +commit 57d76bc51fd80824bcc0c84a5b5ec944f1b51edd upstream. + +With XIP kernel, kernel_map.size is set to be only the size of data part of +the kernel. This is inconsistent with "normal" kernel, who sets it to be +the size of the entire kernel. 
+ +More importantly, XIP kernel fails to boot if CONFIG_DEBUG_VIRTUAL is +enabled, because there are checks on virtual addresses with the assumption +that kernel_map.size is the size of the entire kernel (these checks are in +arch/riscv/mm/physaddr.c). + +Change XIP's kernel_map.size to be the size of the entire kernel. + +Signed-off-by: Nam Cao +Cc: # v6.1+ +Reviewed-by: Alexandre Ghiti +Link: https://lore.kernel.org/r/20240508191917.2892064-1-namcao@linutronix.de +Signed-off-by: Palmer Dabbelt +Signed-off-by: Greg Kroah-Hartman +--- + arch/riscv/mm/init.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/arch/riscv/mm/init.c ++++ b/arch/riscv/mm/init.c +@@ -931,7 +931,7 @@ static void __init create_kernel_page_ta + PMD_SIZE, PAGE_KERNEL_EXEC); + + /* Map the data in RAM */ +- end_va = kernel_map.virt_addr + XIP_OFFSET + kernel_map.size; ++ end_va = kernel_map.virt_addr + kernel_map.size; + for (va = kernel_map.virt_addr + XIP_OFFSET; va < end_va; va += PMD_SIZE) + create_pgd_mapping(pgdir, va, + kernel_map.phys_addr + (va - (kernel_map.virt_addr + XIP_OFFSET)), +@@ -1100,7 +1100,7 @@ asmlinkage void __init setup_vm(uintptr_ + + phys_ram_base = CONFIG_PHYS_RAM_BASE; + kernel_map.phys_addr = (uintptr_t)CONFIG_PHYS_RAM_BASE; +- kernel_map.size = (uintptr_t)(&_end) - (uintptr_t)(&_sdata); ++ kernel_map.size = (uintptr_t)(&_end) - (uintptr_t)(&_start); + + kernel_map.va_kernel_xip_pa_offset = kernel_map.virt_addr - kernel_map.xiprom; + #else diff --git a/queue-6.10/riscv-entry-always-initialize-regs-a0-to-enosys.patch b/queue-6.10/riscv-entry-always-initialize-regs-a0-to-enosys.patch new file mode 100644 index 00000000000..3cfef885797 --- /dev/null +++ b/queue-6.10/riscv-entry-always-initialize-regs-a0-to-enosys.patch @@ -0,0 +1,49 @@ +From 61119394631f219e23ce98bcc3eb993a64a8ea64 Mon Sep 17 00:00:00 2001 +From: Celeste Liu +Date: Thu, 27 Jun 2024 22:23:39 +0800 +Subject: riscv: entry: always initialize regs->a0 to -ENOSYS +MIME-Version: 1.0 
+Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Celeste Liu + +commit 61119394631f219e23ce98bcc3eb993a64a8ea64 upstream. + +Otherwise when the tracer changes syscall number to -1, the kernel fails +to initialize a0 with -ENOSYS and subsequently fails to return the error +code of the failed syscall to userspace. For example, it will break +strace syscall tampering. + +Fixes: 52449c17bdd1 ("riscv: entry: set a0 = -ENOSYS only when syscall != -1") +Reported-by: "Dmitry V. Levin" +Reviewed-by: Björn Töpel +Cc: stable@vger.kernel.org +Signed-off-by: Celeste Liu +Link: https://lore.kernel.org/r/20240627142338.5114-2-CoelacanthusHex@gmail.com +Signed-off-by: Palmer Dabbelt +Signed-off-by: Greg Kroah-Hartman +--- + arch/riscv/kernel/traps.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/arch/riscv/kernel/traps.c ++++ b/arch/riscv/kernel/traps.c +@@ -319,6 +319,7 @@ void do_trap_ecall_u(struct pt_regs *reg + + regs->epc += 4; + regs->orig_a0 = regs->a0; ++ regs->a0 = -ENOSYS; + + riscv_v_vstate_discard(regs); + +@@ -328,8 +329,7 @@ void do_trap_ecall_u(struct pt_regs *reg + + if (syscall >= 0 && syscall < NR_syscalls) + syscall_handler(regs, syscall); +- else if (syscall != -1) +- regs->a0 = -ENOSYS; ++ + /* + * Ultimately, this value will get limited by KSTACK_OFFSET_MAX(), + * so the maximum stack offset is 1k bytes (10 bits). 
diff --git a/queue-6.10/rtla-osnoise-prevent-null-dereference-in-error-handling.patch b/queue-6.10/rtla-osnoise-prevent-null-dereference-in-error-handling.patch new file mode 100644 index 00000000000..3066d024c05 --- /dev/null +++ b/queue-6.10/rtla-osnoise-prevent-null-dereference-in-error-handling.patch @@ -0,0 +1,52 @@ +From 90574d2a675947858b47008df8d07f75ea50d0d0 Mon Sep 17 00:00:00 2001 +From: Dan Carpenter +Date: Fri, 9 Aug 2024 15:34:30 +0300 +Subject: rtla/osnoise: Prevent NULL dereference in error handling + +From: Dan Carpenter + +commit 90574d2a675947858b47008df8d07f75ea50d0d0 upstream. + +If the "tool->data" allocation fails then there is no need to call +osnoise_free_top() and, in fact, doing so will lead to a NULL dereference. + +Cc: stable@vger.kernel.org +Cc: John Kacur +Cc: "Luis Claudio R. Goncalves" +Cc: Clark Williams +Fixes: 1eceb2fc2ca5 ("rtla/osnoise: Add osnoise top mode") +Link: https://lore.kernel.org/f964ed1f-64d2-4fde-ad3e-708331f8f358@stanley.mountain +Signed-off-by: Dan Carpenter +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Greg Kroah-Hartman +--- + tools/tracing/rtla/src/osnoise_top.c | 11 ++++------- + 1 file changed, 4 insertions(+), 7 deletions(-) + +--- a/tools/tracing/rtla/src/osnoise_top.c ++++ b/tools/tracing/rtla/src/osnoise_top.c +@@ -640,8 +640,10 @@ struct osnoise_tool *osnoise_init_top(st + return NULL; + + tool->data = osnoise_alloc_top(nr_cpus); +- if (!tool->data) +- goto out_err; ++ if (!tool->data) { ++ osnoise_destroy_tool(tool); ++ return NULL; ++ } + + tool->params = params; + +@@ -649,11 +651,6 @@ struct osnoise_tool *osnoise_init_top(st + osnoise_top_handler, NULL); + + return tool; +- +-out_err: +- osnoise_free_top(tool->data); +- osnoise_destroy_tool(tool); +- return NULL; + } + + static int stop_tracing; diff --git a/queue-6.10/s390-dasd-fix-error-recovery-leading-to-data-corruption-on-ese-devices.patch b/queue-6.10/s390-dasd-fix-error-recovery-leading-to-data-corruption-on-ese-devices.patch new 
file mode 100644 index 00000000000..291eefe6596 --- /dev/null +++ b/queue-6.10/s390-dasd-fix-error-recovery-leading-to-data-corruption-on-ese-devices.patch @@ -0,0 +1,242 @@ +From 7db4042336580dfd75cb5faa82c12cd51098c90b Mon Sep 17 00:00:00 2001 +From: Stefan Haberland +Date: Mon, 12 Aug 2024 14:57:33 +0200 +Subject: s390/dasd: fix error recovery leading to data corruption on ESE devices + +From: Stefan Haberland + +commit 7db4042336580dfd75cb5faa82c12cd51098c90b upstream. + +Extent Space Efficient (ESE) or thin provisioned volumes need to be +formatted on demand during usual IO processing. + +The dasd_ese_needs_format function checks for error codes that signal +the non-existence of a proper track format. + +The check for incorrect length is too imprecise since other error cases +leading to transport of insufficient data also have this flag set. +This might lead to data corruption in certain error cases for example +during a storage server warmstart. + +Fix by removing the check for incorrect length and replacing it by +explicitly checking for invalid track format in transport mode. + +Also remove the check for file protected since this is not a valid +ESE handling case. 
+ +Cc: stable@vger.kernel.org # 5.3+ +Fixes: 5e2b17e712cf ("s390/dasd: Add dynamic formatting support for ESE volumes") +Reviewed-by: Jan Hoeppner +Signed-off-by: Stefan Haberland +Link: https://lore.kernel.org/r/20240812125733.126431-3-sth@linux.ibm.com +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + drivers/s390/block/dasd.c | 36 +++++++++++++++--------- + drivers/s390/block/dasd_3990_erp.c | 10 +----- + drivers/s390/block/dasd_eckd.c | 55 ++++++++++++++++--------------------- + drivers/s390/block/dasd_int.h | 2 - + 4 files changed, 50 insertions(+), 53 deletions(-) + +--- a/drivers/s390/block/dasd.c ++++ b/drivers/s390/block/dasd.c +@@ -1601,9 +1601,15 @@ static int dasd_ese_needs_format(struct + if (!sense) + return 0; + +- return !!(sense[1] & SNS1_NO_REC_FOUND) || +- !!(sense[1] & SNS1_FILE_PROTECTED) || +- scsw_cstat(&irb->scsw) == SCHN_STAT_INCORR_LEN; ++ if (sense[1] & SNS1_NO_REC_FOUND) ++ return 1; ++ ++ if ((sense[1] & SNS1_INV_TRACK_FORMAT) && ++ scsw_is_tm(&irb->scsw) && ++ !(sense[2] & SNS2_ENV_DATA_PRESENT)) ++ return 1; ++ ++ return 0; + } + + static int dasd_ese_oos_cond(u8 *sense) +@@ -1624,7 +1630,7 @@ void dasd_int_handler(struct ccw_device + struct dasd_device *device; + unsigned long now; + int nrf_suppressed = 0; +- int fp_suppressed = 0; ++ int it_suppressed = 0; + struct request *req; + u8 *sense = NULL; + int expires; +@@ -1679,8 +1685,9 @@ void dasd_int_handler(struct ccw_device + */ + sense = dasd_get_sense(irb); + if (sense) { +- fp_suppressed = (sense[1] & SNS1_FILE_PROTECTED) && +- test_bit(DASD_CQR_SUPPRESS_FP, &cqr->flags); ++ it_suppressed = (sense[1] & SNS1_INV_TRACK_FORMAT) && ++ !(sense[2] & SNS2_ENV_DATA_PRESENT) && ++ test_bit(DASD_CQR_SUPPRESS_IT, &cqr->flags); + nrf_suppressed = (sense[1] & SNS1_NO_REC_FOUND) && + test_bit(DASD_CQR_SUPPRESS_NRF, &cqr->flags); + +@@ -1695,7 +1702,7 @@ void dasd_int_handler(struct ccw_device + return; + } + } +- if (!(fp_suppressed || nrf_suppressed)) ++ if 
(!(it_suppressed || nrf_suppressed)) + device->discipline->dump_sense_dbf(device, irb, "int"); + + if (device->features & DASD_FEATURE_ERPLOG) +@@ -2459,14 +2466,17 @@ retry: + rc = 0; + list_for_each_entry_safe(cqr, n, ccw_queue, blocklist) { + /* +- * In some cases the 'File Protected' or 'Incorrect Length' +- * error might be expected and error recovery would be +- * unnecessary in these cases. Check if the according suppress +- * bit is set. ++ * In some cases certain errors might be expected and ++ * error recovery would be unnecessary in these cases. ++ * Check if the according suppress bit is set. + */ + sense = dasd_get_sense(&cqr->irb); +- if (sense && sense[1] & SNS1_FILE_PROTECTED && +- test_bit(DASD_CQR_SUPPRESS_FP, &cqr->flags)) ++ if (sense && (sense[1] & SNS1_INV_TRACK_FORMAT) && ++ !(sense[2] & SNS2_ENV_DATA_PRESENT) && ++ test_bit(DASD_CQR_SUPPRESS_IT, &cqr->flags)) ++ continue; ++ if (sense && (sense[1] & SNS1_NO_REC_FOUND) && ++ test_bit(DASD_CQR_SUPPRESS_NRF, &cqr->flags)) + continue; + if (scsw_cstat(&cqr->irb.scsw) == 0x40 && + test_bit(DASD_CQR_SUPPRESS_IL, &cqr->flags)) +--- a/drivers/s390/block/dasd_3990_erp.c ++++ b/drivers/s390/block/dasd_3990_erp.c +@@ -1386,14 +1386,8 @@ dasd_3990_erp_file_prot(struct dasd_ccw_ + + struct dasd_device *device = erp->startdev; + +- /* +- * In some cases the 'File Protected' error might be expected and +- * log messages shouldn't be written then. +- * Check if the according suppress bit is set. 
+- */ +- if (!test_bit(DASD_CQR_SUPPRESS_FP, &erp->flags)) +- dev_err(&device->cdev->dev, +- "Accessing the DASD failed because of a hardware error\n"); ++ dev_err(&device->cdev->dev, ++ "Accessing the DASD failed because of a hardware error\n"); + + return dasd_3990_erp_cleanup(erp, DASD_CQR_FAILED); + +--- a/drivers/s390/block/dasd_eckd.c ++++ b/drivers/s390/block/dasd_eckd.c +@@ -2274,6 +2274,7 @@ dasd_eckd_analysis_ccw(struct dasd_devic + cqr->status = DASD_CQR_FILLED; + /* Set flags to suppress output for expected errors */ + set_bit(DASD_CQR_SUPPRESS_NRF, &cqr->flags); ++ set_bit(DASD_CQR_SUPPRESS_IT, &cqr->flags); + + return cqr; + } +@@ -2555,7 +2556,6 @@ dasd_eckd_build_check_tcw(struct dasd_de + cqr->buildclk = get_tod_clock(); + cqr->status = DASD_CQR_FILLED; + /* Set flags to suppress output for expected errors */ +- set_bit(DASD_CQR_SUPPRESS_FP, &cqr->flags); + set_bit(DASD_CQR_SUPPRESS_IL, &cqr->flags); + + return cqr; +@@ -4129,8 +4129,6 @@ static struct dasd_ccw_req *dasd_eckd_bu + + /* Set flags to suppress output for expected errors */ + if (dasd_eckd_is_ese(basedev)) { +- set_bit(DASD_CQR_SUPPRESS_FP, &cqr->flags); +- set_bit(DASD_CQR_SUPPRESS_IL, &cqr->flags); + set_bit(DASD_CQR_SUPPRESS_NRF, &cqr->flags); + } + +@@ -4632,9 +4630,8 @@ static struct dasd_ccw_req *dasd_eckd_bu + + /* Set flags to suppress output for expected errors */ + if (dasd_eckd_is_ese(basedev)) { +- set_bit(DASD_CQR_SUPPRESS_FP, &cqr->flags); +- set_bit(DASD_CQR_SUPPRESS_IL, &cqr->flags); + set_bit(DASD_CQR_SUPPRESS_NRF, &cqr->flags); ++ set_bit(DASD_CQR_SUPPRESS_IT, &cqr->flags); + } + + return cqr; +@@ -5779,36 +5776,32 @@ static void dasd_eckd_dump_sense(struct + { + u8 *sense = dasd_get_sense(irb); + +- if (scsw_is_tm(&irb->scsw)) { +- /* +- * In some cases the 'File Protected' or 'Incorrect Length' +- * error might be expected and log messages shouldn't be written +- * then. Check if the according suppress bit is set. 
+- */ +- if (sense && (sense[1] & SNS1_FILE_PROTECTED) && +- test_bit(DASD_CQR_SUPPRESS_FP, &req->flags)) +- return; +- if (scsw_cstat(&irb->scsw) == 0x40 && +- test_bit(DASD_CQR_SUPPRESS_IL, &req->flags)) +- return; ++ /* ++ * In some cases certain errors might be expected and ++ * log messages shouldn't be written then. ++ * Check if the according suppress bit is set. ++ */ ++ if (sense && (sense[1] & SNS1_INV_TRACK_FORMAT) && ++ !(sense[2] & SNS2_ENV_DATA_PRESENT) && ++ test_bit(DASD_CQR_SUPPRESS_IT, &req->flags)) ++ return; + +- dasd_eckd_dump_sense_tcw(device, req, irb); +- } else { +- /* +- * In some cases the 'Command Reject' or 'No Record Found' +- * error might be expected and log messages shouldn't be +- * written then. Check if the according suppress bit is set. +- */ +- if (sense && sense[0] & SNS0_CMD_REJECT && +- test_bit(DASD_CQR_SUPPRESS_CR, &req->flags)) +- return; ++ if (sense && sense[0] & SNS0_CMD_REJECT && ++ test_bit(DASD_CQR_SUPPRESS_CR, &req->flags)) ++ return; + +- if (sense && sense[1] & SNS1_NO_REC_FOUND && +- test_bit(DASD_CQR_SUPPRESS_NRF, &req->flags)) +- return; ++ if (sense && sense[1] & SNS1_NO_REC_FOUND && ++ test_bit(DASD_CQR_SUPPRESS_NRF, &req->flags)) ++ return; + ++ if (scsw_cstat(&irb->scsw) == 0x40 && ++ test_bit(DASD_CQR_SUPPRESS_IL, &req->flags)) ++ return; ++ ++ if (scsw_is_tm(&irb->scsw)) ++ dasd_eckd_dump_sense_tcw(device, req, irb); ++ else + dasd_eckd_dump_sense_ccw(device, req, irb); +- } + } + + static int dasd_eckd_reload_device(struct dasd_device *device) +--- a/drivers/s390/block/dasd_int.h ++++ b/drivers/s390/block/dasd_int.h +@@ -196,7 +196,7 @@ struct dasd_ccw_req { + * The following flags are used to suppress output of certain errors. 
+ */ + #define DASD_CQR_SUPPRESS_NRF 4 /* Suppress 'No Record Found' error */ +-#define DASD_CQR_SUPPRESS_FP 5 /* Suppress 'File Protected' error*/ ++#define DASD_CQR_SUPPRESS_IT 5 /* Suppress 'Invalid Track' error*/ + #define DASD_CQR_SUPPRESS_IL 6 /* Suppress 'Incorrect Length' error */ + #define DASD_CQR_SUPPRESS_CR 7 /* Suppress 'Command Reject' error */ + diff --git a/queue-6.10/selftests-memfd_secret-don-t-build-memfd_secret-test-on-unsupported-arches.patch b/queue-6.10/selftests-memfd_secret-don-t-build-memfd_secret-test-on-unsupported-arches.patch new file mode 100644 index 00000000000..42d340fd767 --- /dev/null +++ b/queue-6.10/selftests-memfd_secret-don-t-build-memfd_secret-test-on-unsupported-arches.patch @@ -0,0 +1,73 @@ +From 7c5e8d212d7d81991a580e7de3904ea213d9a852 Mon Sep 17 00:00:00 2001 +From: Muhammad Usama Anjum +Date: Fri, 9 Aug 2024 12:56:42 +0500 +Subject: selftests: memfd_secret: don't build memfd_secret test on unsupported arches + +From: Muhammad Usama Anjum + +commit 7c5e8d212d7d81991a580e7de3904ea213d9a852 upstream. + +[1] mentions that memfd_secret is only supported on arm64, riscv, x86 and +x86_64 for now. It doesn't support other architectures. I found the +build error on arm and decided to send the fix as it was creating noise on +KernelCI: + +memfd_secret.c: In function 'memfd_secret': +memfd_secret.c:42:24: error: '__NR_memfd_secret' undeclared (first use in this function); +did you mean 'memfd_secret'? + 42 | return syscall(__NR_memfd_secret, flags); + | ^~~~~~~~~~~~~~~~~ + | memfd_secret + +Hence I'm adding condition that memfd_secret should only be compiled on +supported architectures. + +Also check in run_vmtests script if memfd_secret binary is present before +executing it. 
+ +Link: https://lkml.kernel.org/r/20240812061522.1933054-1-usama.anjum@collabora.com +Link: https://lore.kernel.org/all/20210518072034.31572-7-rppt@kernel.org/ [1] +Link: https://lkml.kernel.org/r/20240809075642.403247-1-usama.anjum@collabora.com +Fixes: 76fe17ef588a ("secretmem: test: add basic selftest for memfd_secret(2)") +Signed-off-by: Muhammad Usama Anjum +Reviewed-by: Shuah Khan +Acked-by: Mike Rapoport (Microsoft) +Cc: Albert Ou +Cc: James Bottomley +Cc: Mike Rapoport (Microsoft) +Cc: Palmer Dabbelt +Cc: Paul Walmsley +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + tools/testing/selftests/mm/Makefile | 2 ++ + tools/testing/selftests/mm/run_vmtests.sh | 3 +++ + 2 files changed, 5 insertions(+) + +--- a/tools/testing/selftests/mm/Makefile ++++ b/tools/testing/selftests/mm/Makefile +@@ -51,7 +51,9 @@ TEST_GEN_FILES += madv_populate + TEST_GEN_FILES += map_fixed_noreplace + TEST_GEN_FILES += map_hugetlb + TEST_GEN_FILES += map_populate ++ifneq (,$(filter $(ARCH),arm64 riscv riscv64 x86 x86_64)) + TEST_GEN_FILES += memfd_secret ++endif + TEST_GEN_FILES += migration + TEST_GEN_FILES += mkdirty + TEST_GEN_FILES += mlock-random-test +--- a/tools/testing/selftests/mm/run_vmtests.sh ++++ b/tools/testing/selftests/mm/run_vmtests.sh +@@ -367,8 +367,11 @@ CATEGORY="hmm" run_test bash ./test_hmm. 
+ # MADV_POPULATE_READ and MADV_POPULATE_WRITE tests + CATEGORY="madv_populate" run_test ./madv_populate + ++if [ -x ./memfd_secret ] ++then + (echo 0 | sudo tee /proc/sys/kernel/yama/ptrace_scope 2>&1) | tap_prefix + CATEGORY="memfd_secret" run_test ./memfd_secret ++fi + + # KSM KSM_MERGE_TIME_HUGE_PAGES test with size of 100 + CATEGORY="ksm" run_test ./ksm_tests -H -s 100 diff --git a/queue-6.10/selinux-add-the-processing-of-the-failure-of-avc_add_xperms_decision.patch b/queue-6.10/selinux-add-the-processing-of-the-failure-of-avc_add_xperms_decision.patch new file mode 100644 index 00000000000..894c67c8cbb --- /dev/null +++ b/queue-6.10/selinux-add-the-processing-of-the-failure-of-avc_add_xperms_decision.patch @@ -0,0 +1,39 @@ +From 6dd1e4c045afa6a4ba5d46f044c83bd357c593c2 Mon Sep 17 00:00:00 2001 +From: Zhen Lei +Date: Wed, 7 Aug 2024 17:00:56 +0800 +Subject: selinux: add the processing of the failure of avc_add_xperms_decision() + +From: Zhen Lei + +commit 6dd1e4c045afa6a4ba5d46f044c83bd357c593c2 upstream. + +When avc_add_xperms_decision() fails, the information recorded by the new +avc node is incomplete. In this case, the new avc node should be released +instead of replacing the old avc node. 
+ +Cc: stable@vger.kernel.org +Fixes: fa1aa143ac4a ("selinux: extended permissions for ioctls") +Suggested-by: Stephen Smalley +Signed-off-by: Zhen Lei +Acked-by: Stephen Smalley +Signed-off-by: Paul Moore +Signed-off-by: Greg Kroah-Hartman +--- + security/selinux/avc.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/security/selinux/avc.c ++++ b/security/selinux/avc.c +@@ -907,7 +907,11 @@ static int avc_update_node(u32 event, u3 + node->ae.avd.auditdeny &= ~perms; + break; + case AVC_CALLBACK_ADD_XPERMS: +- avc_add_xperms_decision(node, xpd); ++ rc = avc_add_xperms_decision(node, xpd); ++ if (rc) { ++ avc_node_kill(node); ++ goto out_unlock; ++ } + break; + } + avc_node_replace(node, orig); diff --git a/queue-6.10/selinux-fix-potential-counting-error-in-avc_add_xperms_decision.patch b/queue-6.10/selinux-fix-potential-counting-error-in-avc_add_xperms_decision.patch new file mode 100644 index 00000000000..cf61a2bf8bb --- /dev/null +++ b/queue-6.10/selinux-fix-potential-counting-error-in-avc_add_xperms_decision.patch @@ -0,0 +1,38 @@ +From 379d9af3f3da2da1bbfa67baf1820c72a080d1f1 Mon Sep 17 00:00:00 2001 +From: Zhen Lei +Date: Tue, 6 Aug 2024 14:51:13 +0800 +Subject: selinux: fix potential counting error in avc_add_xperms_decision() + +From: Zhen Lei + +commit 379d9af3f3da2da1bbfa67baf1820c72a080d1f1 upstream. + +The count increases only when a node is successfully added to +the linked list. 
+ +Cc: stable@vger.kernel.org +Fixes: fa1aa143ac4a ("selinux: extended permissions for ioctls") +Signed-off-by: Zhen Lei +Acked-by: Stephen Smalley +Signed-off-by: Paul Moore +Signed-off-by: Greg Kroah-Hartman +--- + security/selinux/avc.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/security/selinux/avc.c ++++ b/security/selinux/avc.c +@@ -330,12 +330,12 @@ static int avc_add_xperms_decision(struc + { + struct avc_xperms_decision_node *dest_xpd; + +- node->ae.xp_node->xp.len++; + dest_xpd = avc_xperms_decision_alloc(src->used); + if (!dest_xpd) + return -ENOMEM; + avc_copy_xperms_decision(&dest_xpd->xpd, src); + list_add(&dest_xpd->xpd_list, &node->ae.xp_node->xpd_head); ++ node->ae.xp_node->xp.len++; + return 0; + } + diff --git a/queue-6.10/series b/queue-6.10/series index 7b411195f83..71c1d88717a 100644 --- a/queue-6.10/series +++ b/queue-6.10/series @@ -8,3 +8,61 @@ revert-usb-typec-tcpm-clear-pd_event-queue-in-port_reset.patch selinux-revert-our-use-of-vma_is_initial_heap.patch netfs-ceph-revert-netfs-remove-deprecated-use-of-pg_private_2-as-a-second-writeback-flag.patch fuse-initialize-beyond-eof-page-contents-before-setting-uptodate.patch +char-xillybus-don-t-destroy-workqueue-from-work-item-running-on-it.patch +char-xillybus-refine-workqueue-handling.patch +char-xillybus-check-usb-endpoints-when-probing-device.patch +alsa-usb-audio-add-delay-quirk-for-vivo-usb-c-xe710-headset.patch +alsa-usb-audio-support-yamaha-p-125-quirk-entry.patch +usb-misc-ljca-add-lunar-lake-ljca-gpio-hid-to-ljca_gpio_hids.patch +usb-xhci-check-for-xhci-interrupters-being-allocated-in-xhci_mem_clearup.patch +xhci-fix-panther-point-null-pointer-deref-at-full-speed-re-enumeration.patch +thunderbolt-mark-xdomain-as-unplugged-when-router-is-removed.patch +alsa-hda-tas2781-fix-wrong-calibrated-data-order.patch +alsa-timer-relax-start-tick-time-check-for-slave-timer-elements.patch +s390-dasd-fix-error-recovery-leading-to-data-corruption-on-ese-devices.patch 
+kvm-s390-fix-validity-interception-issue-when-gisa-is-switched-off.patch +thermal-gov_bang_bang-call-__thermal_cdev_update-directly.patch +keys-trusted-fix-dcp-blob-payload-length-assignment.patch +keys-trusted-dcp-fix-leak-of-blob-encryption-key.patch +riscv-change-xip-s-kernel_map.size-to-be-size-of-the-entire-kernel.patch +riscv-entry-always-initialize-regs-a0-to-enosys.patch +smb3-fix-lock-breakage-for-cached-writes.patch +i2c-tegra-do-not-mark-acpi-devices-as-irq-safe.patch +acpica-add-a-depth-argument-to-acpi_execute_reg_methods.patch +acpi-ec-evaluate-_reg-outside-the-ec-scope-more-carefully.patch +arm64-acpi-numa-initialize-all-values-of-acpi_early_node_map-to-numa_no_node.patch +dm-resume-don-t-return-einval-when-signalled.patch +dm-persistent-data-fix-memory-allocation-failure.patch +vfs-don-t-evict-inode-under-the-inode-lru-traversing-context.patch +fix-bitmap-corruption-on-close_range-with-close_range_unshare.patch +i2c-qcom-geni-add-missing-geni_icc_disable-in-geni_i2c_runtime_resume.patch +tracing-return-from-tracing_buffers_read-if-the-file-has-been-closed.patch +perf-bpf-don-t-call-bpf_overflow_handler-for-tracing-events.patch +mseal-fix-is_madv_discard.patch +rtla-osnoise-prevent-null-dereference-in-error-handling.patch +mm-fix-endless-reclaim-on-machines-with-unaccepted-memory.patch +mm-hugetlb-fix-hugetlb-vs.-core-mm-pt-locking.patch +md-raid1-fix-data-corruption-for-degraded-array-with-slow-disk.patch +net-mana-fix-rx-buf-alloc_size-alignment-and-atomic-op-panic.patch +media-atomisp-fix-streaming-no-longer-working-on-byt-isp2400-devices.patch +net-mana-fix-doorbell-out-of-order-violation-and-avoid-unnecessary-doorbell-rings.patch +wifi-brcmfmac-cfg80211-handle-ssid-based-pmksa-deletion.patch +fs-netfs-fscache_cookie-add-missing-n_accesses-check.patch +selinux-fix-potential-counting-error-in-avc_add_xperms_decision.patch +selinux-add-the-processing-of-the-failure-of-avc_add_xperms_decision.patch 
+alloc_tag-mark-pages-reserved-during-cma-activation-as-not-tagged.patch +mm-memory-failure-use-raw_spinlock_t-in-struct-memory_failure_cpu.patch +selftests-memfd_secret-don-t-build-memfd_secret-test-on-unsupported-arches.patch +alloc_tag-introduce-clear_page_tag_ref-helper-function.patch +mm-numa-no-task_numa_fault-call-if-pmd-is-changed.patch +mm-vmalloc-fix-page-mapping-if-vm_area_alloc_pages-with-high-order-fallback-to-order-0.patch +mm-numa-no-task_numa_fault-call-if-pte-is-changed.patch +btrfs-tree-checker-reject-btrfs_ft_unknown-dir-type.patch +btrfs-send-allow-cloning-non-aligned-extent-if-it-ends-at-i_size.patch +btrfs-check-delayed-refs-when-we-re-checking-if-a-ref-exists.patch +btrfs-only-run-the-extent-map-shrinker-from-kswapd-tasks.patch +btrfs-zoned-properly-take-lock-to-read-update-block-group-s-zoned-variables.patch +btrfs-tree-checker-add-dev-extent-item-checks.patch +btrfs-only-enable-extent-map-shrinker-for-debug-builds.patch +drm-amdgpu-actually-check-flags-for-all-context-ops.patch +memcg_write_event_control-fix-a-user-triggerable-oops.patch diff --git a/queue-6.10/smb3-fix-lock-breakage-for-cached-writes.patch b/queue-6.10/smb3-fix-lock-breakage-for-cached-writes.patch new file mode 100644 index 00000000000..5c6e210cabb --- /dev/null +++ b/queue-6.10/smb3-fix-lock-breakage-for-cached-writes.patch @@ -0,0 +1,66 @@ +From 836bb3268db405cf9021496ac4dbc26d3e4758fe Mon Sep 17 00:00:00 2001 +From: Steve French +Date: Thu, 15 Aug 2024 14:03:43 -0500 +Subject: smb3: fix lock breakage for cached writes + +From: Steve French + +commit 836bb3268db405cf9021496ac4dbc26d3e4758fe upstream. + +Mandatory locking is enforced for cached writes, which violates +default posix semantics, and also it is enforced inconsistently. +This apparently breaks recent versions of libreoffice, but can +also be demonstrated by opening a file twice from the same +client, locking it from handle one and writing to it from +handle two (which fails, returning EACCES). 
+ +Since there was already a mount option "forcemandatorylock" +(which defaults to off), with this change only when the user +intentionally specifies "forcemandatorylock" on mount will we +break posix semantics on write to a locked range (ie we will +only fail the write in this case, if the user mounts with +"forcemandatorylock"). + +Fixes: 85160e03a79e ("CIFS: Implement caching mechanism for mandatory brlocks") +Cc: stable@vger.kernel.org +Cc: Pavel Shilovsky +Reported-by: abartlet@samba.org +Reported-by: Kevin Ottens +Reviewed-by: David Howells +Signed-off-by: Steve French +Signed-off-by: Greg Kroah-Hartman +--- + fs/smb/client/file.c | 13 +++++++++---- + 1 file changed, 9 insertions(+), 4 deletions(-) + +--- a/fs/smb/client/file.c ++++ b/fs/smb/client/file.c +@@ -2719,6 +2719,7 @@ cifs_writev(struct kiocb *iocb, struct i + struct inode *inode = file->f_mapping->host; + struct cifsInodeInfo *cinode = CIFS_I(inode); + struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; ++ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + ssize_t rc; + + rc = netfs_start_io_write(inode); +@@ -2735,12 +2736,16 @@ cifs_writev(struct kiocb *iocb, struct i + if (rc <= 0) + goto out; + +- if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(from), ++ if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) && ++ (cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(from), + server->vals->exclusive_lock_type, 0, +- NULL, CIFS_WRITE_OP)) +- rc = netfs_buffered_write_iter_locked(iocb, from, NULL); +- else ++ NULL, CIFS_WRITE_OP))) { + rc = -EACCES; ++ goto out; ++ } ++ ++ rc = netfs_buffered_write_iter_locked(iocb, from, NULL); ++ + out: + up_read(&cinode->lock_sem); + netfs_end_io_write(inode); diff --git a/queue-6.10/thermal-gov_bang_bang-call-__thermal_cdev_update-directly.patch b/queue-6.10/thermal-gov_bang_bang-call-__thermal_cdev_update-directly.patch new file mode 100644 index 00000000000..d4c513fa152 --- /dev/null +++ 
b/queue-6.10/thermal-gov_bang_bang-call-__thermal_cdev_update-directly.patch @@ -0,0 +1,46 @@ +From b9b6ee6fe258ce4d89592593efcd3d798c418859 Mon Sep 17 00:00:00 2001 +From: "Rafael J. Wysocki" +Date: Tue, 13 Aug 2024 16:25:19 +0200 +Subject: thermal: gov_bang_bang: Call __thermal_cdev_update() directly +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Rafael J. Wysocki + +commit b9b6ee6fe258ce4d89592593efcd3d798c418859 upstream. + +Instead of clearing the "updated" flag for each cooling device +affected by the trip point crossing in bang_bang_control() and +walking all thermal instances to run thermal_cdev_update() for all +of the affected cooling devices, call __thermal_cdev_update() +directly for each of them. + +No intentional functional impact. + +Signed-off-by: Rafael J. Wysocki +Acked-by: Peter Kästle +Reviewed-by: Zhang Rui +Cc: 6.10+ # 6.10+ +Link: https://patch.msgid.link/13583081.uLZWGnKmhe@rjwysocki.net +Signed-off-by: Greg Kroah-Hartman +--- + drivers/thermal/gov_bang_bang.c | 5 +---- + 1 file changed, 1 insertion(+), 4 deletions(-) + +--- a/drivers/thermal/gov_bang_bang.c ++++ b/drivers/thermal/gov_bang_bang.c +@@ -79,12 +79,9 @@ static void bang_bang_control(struct the + dev_dbg(&instance->cdev->device, "target=%ld\n", instance->target); + + mutex_lock(&instance->cdev->lock); +- instance->cdev->updated = false; /* cdev needs update */ ++ __thermal_cdev_update(instance->cdev); + mutex_unlock(&instance->cdev->lock); + } +- +- list_for_each_entry(instance, &tz->thermal_instances, tz_node) +- thermal_cdev_update(instance->cdev); + } + + static struct thermal_governor thermal_gov_bang_bang = { diff --git a/queue-6.10/thunderbolt-mark-xdomain-as-unplugged-when-router-is-removed.patch b/queue-6.10/thunderbolt-mark-xdomain-as-unplugged-when-router-is-removed.patch new file mode 100644 index 00000000000..ce4c2da62be --- /dev/null +++ 
b/queue-6.10/thunderbolt-mark-xdomain-as-unplugged-when-router-is-removed.patch @@ -0,0 +1,39 @@ +From e2006140ad2e01a02ed0aff49cc2ae3ceeb11f8d Mon Sep 17 00:00:00 2001 +From: Mika Westerberg +Date: Thu, 13 Jun 2024 15:05:03 +0300 +Subject: thunderbolt: Mark XDomain as unplugged when router is removed + +From: Mika Westerberg + +commit e2006140ad2e01a02ed0aff49cc2ae3ceeb11f8d upstream. + +I noticed that when we do a discrete host router NVM upgrade and it gets +hot-removed from the PCIe side as a result of NVM firmware authentication, +if there is another host connected with enabled paths we hang in tearing +them down. This is due to the fact that the Thunderbolt networking driver +also tries to clean up the paths and ends up blocking in +tb_disconnect_xdomain_paths() waiting for the domain lock. + +However, at this point we already cleaned the paths in tb_stop() so +there is really no need for tb_disconnect_xdomain_paths() to do that +anymore. Furthermore it already checks if the XDomain is unplugged and +bails out early so take advantage of that and mark the XDomain as +unplugged when we remove the parent router. 
+ +Cc: stable@vger.kernel.org +Signed-off-by: Mika Westerberg +Signed-off-by: Greg Kroah-Hartman +--- + drivers/thunderbolt/switch.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/thunderbolt/switch.c ++++ b/drivers/thunderbolt/switch.c +@@ -3392,6 +3392,7 @@ void tb_switch_remove(struct tb_switch * + tb_switch_remove(port->remote->sw); + port->remote = NULL; + } else if (port->xdomain) { ++ port->xdomain->is_unplugged = true; + tb_xdomain_remove(port->xdomain); + port->xdomain = NULL; + } diff --git a/queue-6.10/tracing-return-from-tracing_buffers_read-if-the-file-has-been-closed.patch b/queue-6.10/tracing-return-from-tracing_buffers_read-if-the-file-has-been-closed.patch new file mode 100644 index 00000000000..321d7bc094b --- /dev/null +++ b/queue-6.10/tracing-return-from-tracing_buffers_read-if-the-file-has-been-closed.patch @@ -0,0 +1,59 @@ +From d0949cd44a62c4c41b30ea7ae94d8c887f586882 Mon Sep 17 00:00:00 2001 +From: Steven Rostedt +Date: Thu, 8 Aug 2024 23:57:30 -0400 +Subject: tracing: Return from tracing_buffers_read() if the file has been closed + +From: Steven Rostedt + +commit d0949cd44a62c4c41b30ea7ae94d8c887f586882 upstream. + +When running the following: + + # cd /sys/kernel/tracing/ + # echo 1 > events/sched/sched_waking/enable + # echo 1 > events/sched/sched_switch/enable + # echo 0 > tracing_on + # dd if=per_cpu/cpu0/trace_pipe_raw of=/tmp/raw0.dat + +The dd task would get stuck in an infinite loop in the kernel. What would +happen is the following: + +When ring_buffer_read_page() returns -1 (no data) then a check is made to +see if the buffer is empty (as happens when the page is not full), it will +call wait_on_pipe() to wait until the ring buffer has data. When it is it +will try again to read data (unless O_NONBLOCK is set). + +The issue happens when there's a reader and the file descriptor is closed. +The wait_on_pipe() will return when that is the case. 
But this loop will +continue to try again and wait_on_pipe() will again return immediately and +the loop will continue and never stop. + +Simply check if the file was closed before looping and exit out if it is. + +Cc: stable@vger.kernel.org +Cc: Masami Hiramatsu +Cc: Mathieu Desnoyers +Link: https://lore.kernel.org/20240808235730.78bf63e5@rorschach.local.home +Fixes: 2aa043a55b9a7 ("tracing/ring-buffer: Fix wait_on_pipe() race") +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/trace/trace.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c +index 10cd38bce2f1..ebe7ce2f5f4a 100644 +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -7956,7 +7956,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, + trace_access_unlock(iter->cpu_file); + + if (ret < 0) { +- if (trace_empty(iter)) { ++ if (trace_empty(iter) && !iter->closed) { + if ((filp->f_flags & O_NONBLOCK)) + return -EAGAIN; + +-- +2.46.0 + diff --git a/queue-6.10/usb-misc-ljca-add-lunar-lake-ljca-gpio-hid-to-ljca_gpio_hids.patch b/queue-6.10/usb-misc-ljca-add-lunar-lake-ljca-gpio-hid-to-ljca_gpio_hids.patch new file mode 100644 index 00000000000..84f80311d2d --- /dev/null +++ b/queue-6.10/usb-misc-ljca-add-lunar-lake-ljca-gpio-hid-to-ljca_gpio_hids.patch @@ -0,0 +1,32 @@ +From 3ed486e383ccee9b0c8d727608f12a937c6603ca Mon Sep 17 00:00:00 2001 +From: Hans de Goede +Date: Mon, 12 Aug 2024 11:50:38 +0200 +Subject: usb: misc: ljca: Add Lunar Lake ljca GPIO HID to ljca_gpio_hids[] + +From: Hans de Goede + +commit 3ed486e383ccee9b0c8d727608f12a937c6603ca upstream. + +Add LJCA GPIO support for the Lunar Lake platform. + +New HID taken from out of tree ivsc-driver git repo. 
+ +Link: https://github.com/intel/ivsc-driver/commit/47e7c4a446c8ea8c741ff5a32fa7b19f9e6fd47e +Cc: stable +Signed-off-by: Hans de Goede +Link: https://lore.kernel.org/r/20240812095038.555837-1-hdegoede@redhat.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/usb/misc/usb-ljca.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/usb/misc/usb-ljca.c ++++ b/drivers/usb/misc/usb-ljca.c +@@ -169,6 +169,7 @@ static const struct acpi_device_id ljca_ + { "INTC1096" }, + { "INTC100B" }, + { "INTC10D1" }, ++ { "INTC10B5" }, + {}, + }; + diff --git a/queue-6.10/usb-xhci-check-for-xhci-interrupters-being-allocated-in-xhci_mem_clearup.patch b/queue-6.10/usb-xhci-check-for-xhci-interrupters-being-allocated-in-xhci_mem_clearup.patch new file mode 100644 index 00000000000..49056858ef8 --- /dev/null +++ b/queue-6.10/usb-xhci-check-for-xhci-interrupters-being-allocated-in-xhci_mem_clearup.patch @@ -0,0 +1,45 @@ +From dcdb52d948f3a17ccd3fce757d9bd981d7c32039 Mon Sep 17 00:00:00 2001 +From: Marc Zyngier +Date: Fri, 9 Aug 2024 15:44:07 +0300 +Subject: usb: xhci: Check for xhci->interrupters being allocated in xhci_mem_clearup() + +From: Marc Zyngier + +commit dcdb52d948f3a17ccd3fce757d9bd981d7c32039 upstream. + +If xhci_mem_init() fails, it calls into xhci_mem_cleanup() to mop +up the damage. If it fails early enough, before xhci->interrupters +is allocated but after xhci->max_interrupters has been set, which +happens in most (all?) cases, things get uglier, as xhci_mem_cleanup() +unconditionally derefences xhci->interrupters. With prejudice. + +Gate the interrupt freeing loop with a check on xhci->interrupters +being non-NULL. + +Found while debugging a DMA allocation issue that led the XHCI driver +on this exact path. 
+ +Fixes: c99b38c41234 ("xhci: add support to allocate several interrupters") +Cc: Mathias Nyman +Cc: Wesley Cheng +Cc: Greg Kroah-Hartman +Signed-off-by: Marc Zyngier +Cc: stable@vger.kernel.org # 6.8+ +Signed-off-by: Mathias Nyman +Link: https://lore.kernel.org/r/20240809124408.505786-2-mathias.nyman@linux.intel.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/usb/host/xhci-mem.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/usb/host/xhci-mem.c ++++ b/drivers/usb/host/xhci-mem.c +@@ -1877,7 +1877,7 @@ void xhci_mem_cleanup(struct xhci_hcd *x + + cancel_delayed_work_sync(&xhci->cmd_timer); + +- for (i = 0; i < xhci->max_interrupters; i++) { ++ for (i = 0; xhci->interrupters && i < xhci->max_interrupters; i++) { + if (xhci->interrupters[i]) { + xhci_remove_interrupter(xhci, xhci->interrupters[i]); + xhci_free_interrupter(xhci, xhci->interrupters[i]); diff --git a/queue-6.10/vfs-don-t-evict-inode-under-the-inode-lru-traversing-context.patch b/queue-6.10/vfs-don-t-evict-inode-under-the-inode-lru-traversing-context.patch new file mode 100644 index 00000000000..e12035b47e5 --- /dev/null +++ b/queue-6.10/vfs-don-t-evict-inode-under-the-inode-lru-traversing-context.patch @@ -0,0 +1,215 @@ +From 2a0629834cd82f05d424bbc193374f9a43d1f87d Mon Sep 17 00:00:00 2001 +From: Zhihao Cheng +Date: Fri, 9 Aug 2024 11:16:28 +0800 +Subject: vfs: Don't evict inode under the inode lru traversing context +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Zhihao Cheng + +commit 2a0629834cd82f05d424bbc193374f9a43d1f87d upstream. + +The inode reclaiming process(See function prune_icache_sb) collects all +reclaimable inodes and mark them with I_FREEING flag at first, at that +time, other processes will be stuck if they try getting these inodes +(See function find_inode_fast), then the reclaiming process destroy the +inodes by function dispose_list(). Some filesystems(eg. 
ext4 with +ea_inode feature, ubifs with xattr) may do inode lookup in the inode +evicting callback function, if the inode lookup is operated under the +inode lru traversing context, deadlock problems may happen. + +Case 1: In function ext4_evict_inode(), the ea inode lookup could happen + if ea_inode feature is enabled, the lookup process will be stuck + under the evicting context like this: + + 1. File A has inode i_reg and an ea inode i_ea + 2. getfattr(A, xattr_buf) // i_ea is added into lru // lru->i_ea + 3. Then, following three processes running like this: + + PA PB + echo 2 > /proc/sys/vm/drop_caches + shrink_slab + prune_dcache_sb + // i_reg is added into lru, lru->i_ea->i_reg + prune_icache_sb + list_lru_walk_one + inode_lru_isolate + i_ea->i_state |= I_FREEING // set inode state + inode_lru_isolate + __iget(i_reg) + spin_unlock(&i_reg->i_lock) + spin_unlock(lru_lock) + rm file A + i_reg->nlink = 0 + iput(i_reg) // i_reg->nlink is 0, do evict + ext4_evict_inode + ext4_xattr_delete_inode + ext4_xattr_inode_dec_ref_all + ext4_xattr_inode_iget + ext4_iget(i_ea->i_ino) + iget_locked + find_inode_fast + __wait_on_freeing_inode(i_ea) ----→ AA deadlock + dispose_list // cannot be executed by prune_icache_sb + wake_up_bit(&i_ea->i_state) + +Case 2: In deleted inode writing function ubifs_jnl_write_inode(), file + deleting process holds BASEHD's wbuf->io_mutex while getting the + xattr inode, which could race with inode reclaiming process(The + reclaiming process could try locking BASEHD's wbuf->io_mutex in + inode evicting function), then an ABBA deadlock problem would + happen as following: + + 1. File A has inode ia and a xattr(with inode ixa), regular file B has + inode ib and a xattr. + 2. getfattr(A, xattr_buf) // ixa is added into lru // lru->ixa + 3. 
Then, following three processes running like this: + + PA PB PC + echo 2 > /proc/sys/vm/drop_caches + shrink_slab + prune_dcache_sb + // ib and ia are added into lru, lru->ixa->ib->ia + prune_icache_sb + list_lru_walk_one + inode_lru_isolate + ixa->i_state |= I_FREEING // set inode state + inode_lru_isolate + __iget(ib) + spin_unlock(&ib->i_lock) + spin_unlock(lru_lock) + rm file B + ib->nlink = 0 + rm file A + iput(ia) + ubifs_evict_inode(ia) + ubifs_jnl_delete_inode(ia) + ubifs_jnl_write_inode(ia) + make_reservation(BASEHD) // Lock wbuf->io_mutex + ubifs_iget(ixa->i_ino) + iget_locked + find_inode_fast + __wait_on_freeing_inode(ixa) + | iput(ib) // ib->nlink is 0, do evict + | ubifs_evict_inode + | ubifs_jnl_delete_inode(ib) + ↓ ubifs_jnl_write_inode + ABBA deadlock ←-----make_reservation(BASEHD) + dispose_list // cannot be executed by prune_icache_sb + wake_up_bit(&ixa->i_state) + +Fix the possible deadlock by using new inode state flag I_LRU_ISOLATING +to pin the inode in memory while inode_lru_isolate() reclaims its pages +instead of using ordinary inode reference. This way inode deletion +cannot be triggered from inode_lru_isolate() thus avoiding the deadlock. +evict() is made to wait for I_LRU_ISOLATING to be cleared before +proceeding with inode cleanup. 
+ +Link: https://lore.kernel.org/all/37c29c42-7685-d1f0-067d-63582ffac405@huaweicloud.com/ +Link: https://bugzilla.kernel.org/show_bug.cgi?id=219022 +Fixes: e50e5129f384 ("ext4: xattr-in-inode support") +Fixes: 7959cf3a7506 ("ubifs: journal: Handle xattrs like files") +Cc: stable@vger.kernel.org +Signed-off-by: Zhihao Cheng +Link: https://lore.kernel.org/r/20240809031628.1069873-1-chengzhihao@huaweicloud.com +Reviewed-by: Jan Kara +Suggested-by: Jan Kara +Suggested-by: Mateusz Guzik +Signed-off-by: Christian Brauner +Signed-off-by: Greg Kroah-Hartman +--- + fs/inode.c | 39 +++++++++++++++++++++++++++++++++++++-- + include/linux/fs.h | 5 +++++ + 2 files changed, 42 insertions(+), 2 deletions(-) + +--- a/fs/inode.c ++++ b/fs/inode.c +@@ -486,6 +486,39 @@ static void inode_lru_list_del(struct in + this_cpu_dec(nr_unused); + } + ++static void inode_pin_lru_isolating(struct inode *inode) ++{ ++ lockdep_assert_held(&inode->i_lock); ++ WARN_ON(inode->i_state & (I_LRU_ISOLATING | I_FREEING | I_WILL_FREE)); ++ inode->i_state |= I_LRU_ISOLATING; ++} ++ ++static void inode_unpin_lru_isolating(struct inode *inode) ++{ ++ spin_lock(&inode->i_lock); ++ WARN_ON(!(inode->i_state & I_LRU_ISOLATING)); ++ inode->i_state &= ~I_LRU_ISOLATING; ++ smp_mb(); ++ wake_up_bit(&inode->i_state, __I_LRU_ISOLATING); ++ spin_unlock(&inode->i_lock); ++} ++ ++static void inode_wait_for_lru_isolating(struct inode *inode) ++{ ++ spin_lock(&inode->i_lock); ++ if (inode->i_state & I_LRU_ISOLATING) { ++ DEFINE_WAIT_BIT(wq, &inode->i_state, __I_LRU_ISOLATING); ++ wait_queue_head_t *wqh; ++ ++ wqh = bit_waitqueue(&inode->i_state, __I_LRU_ISOLATING); ++ spin_unlock(&inode->i_lock); ++ __wait_on_bit(wqh, &wq, bit_wait, TASK_UNINTERRUPTIBLE); ++ spin_lock(&inode->i_lock); ++ WARN_ON(inode->i_state & I_LRU_ISOLATING); ++ } ++ spin_unlock(&inode->i_lock); ++} ++ + /** + * inode_sb_list_add - add inode to the superblock list of inodes + * @inode: inode to add +@@ -655,6 +688,8 @@ static void evict(struct inode 
*inode) + + inode_sb_list_del(inode); + ++ inode_wait_for_lru_isolating(inode); ++ + /* + * Wait for flusher thread to be done with the inode so that filesystem + * does not start destroying it while writeback is still running. Since +@@ -843,7 +878,7 @@ static enum lru_status inode_lru_isolate + * be under pressure before the cache inside the highmem zone. + */ + if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) { +- __iget(inode); ++ inode_pin_lru_isolating(inode); + spin_unlock(&inode->i_lock); + spin_unlock(lru_lock); + if (remove_inode_buffers(inode)) { +@@ -855,7 +890,7 @@ static enum lru_status inode_lru_isolate + __count_vm_events(PGINODESTEAL, reap); + mm_account_reclaimed_pages(reap); + } +- iput(inode); ++ inode_unpin_lru_isolating(inode); + spin_lock(lru_lock); + return LRU_RETRY; + } +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -2370,6 +2370,9 @@ static inline void kiocb_clone(struct ki + * + * I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback. + * ++ * I_LRU_ISOLATING Inode is pinned being isolated from LRU without holding ++ * i_count. ++ * + * Q: What is the difference between I_WILL_FREE and I_FREEING? 
+ */ + #define I_DIRTY_SYNC (1 << 0) +@@ -2393,6 +2396,8 @@ static inline void kiocb_clone(struct ki + #define I_DONTCACHE (1 << 16) + #define I_SYNC_QUEUED (1 << 17) + #define I_PINNING_NETFS_WB (1 << 18) ++#define __I_LRU_ISOLATING 19 ++#define I_LRU_ISOLATING (1 << __I_LRU_ISOLATING) + + #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) + #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) diff --git a/queue-6.10/wifi-brcmfmac-cfg80211-handle-ssid-based-pmksa-deletion.patch b/queue-6.10/wifi-brcmfmac-cfg80211-handle-ssid-based-pmksa-deletion.patch new file mode 100644 index 00000000000..011cc29af80 --- /dev/null +++ b/queue-6.10/wifi-brcmfmac-cfg80211-handle-ssid-based-pmksa-deletion.patch @@ -0,0 +1,49 @@ +From 2ad4e1ada8eebafa2d75a4b75eeeca882de6ada1 Mon Sep 17 00:00:00 2001 +From: Janne Grunau +Date: Sat, 3 Aug 2024 21:52:55 +0200 +Subject: wifi: brcmfmac: cfg80211: Handle SSID based pmksa deletion + +From: Janne Grunau + +commit 2ad4e1ada8eebafa2d75a4b75eeeca882de6ada1 upstream. + +wpa_supplicant 2.11 sends since 1efdba5fdc2c ("Handle PMKSA flush in the +driver for SAE/OWE offload cases") SSID based PMKSA del commands. +brcmfmac is not prepared and tries to dereference the NULL bssid and +pmkid pointers in cfg80211_pmksa. PMKID_V3 operations support SSID based +updates so copy the SSID. 
+ +Fixes: a96202acaea4 ("wifi: brcmfmac: cfg80211: Add support for PMKID_V3 operations") +Cc: stable@vger.kernel.org # 6.4.x +Signed-off-by: Janne Grunau +Reviewed-by: Neal Gompa +Acked-by: Arend van Spriel +Signed-off-by: Kalle Valo +Link: https://patch.msgid.link/20240803-brcmfmac_pmksa_del_ssid-v1-1-4e85f19135e1@jannau.net +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c | 13 +++++++++--- + 1 file changed, 10 insertions(+), 3 deletions(-) + +--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c ++++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +@@ -4320,9 +4320,16 @@ brcmf_pmksa_v3_op(struct brcmf_if *ifp, + /* Single PMK operation */ + pmk_op->count = cpu_to_le16(1); + length += sizeof(struct brcmf_pmksa_v3); +- memcpy(pmk_op->pmk[0].bssid, pmksa->bssid, ETH_ALEN); +- memcpy(pmk_op->pmk[0].pmkid, pmksa->pmkid, WLAN_PMKID_LEN); +- pmk_op->pmk[0].pmkid_len = WLAN_PMKID_LEN; ++ if (pmksa->bssid) ++ memcpy(pmk_op->pmk[0].bssid, pmksa->bssid, ETH_ALEN); ++ if (pmksa->pmkid) { ++ memcpy(pmk_op->pmk[0].pmkid, pmksa->pmkid, WLAN_PMKID_LEN); ++ pmk_op->pmk[0].pmkid_len = WLAN_PMKID_LEN; ++ } ++ if (pmksa->ssid && pmksa->ssid_len) { ++ memcpy(pmk_op->pmk[0].ssid.SSID, pmksa->ssid, pmksa->ssid_len); ++ pmk_op->pmk[0].ssid.SSID_len = pmksa->ssid_len; ++ } + pmk_op->pmk[0].time_left = cpu_to_le32(alive ? 
BRCMF_PMKSA_NO_EXPIRY : 0); + } + diff --git a/queue-6.10/xhci-fix-panther-point-null-pointer-deref-at-full-speed-re-enumeration.patch b/queue-6.10/xhci-fix-panther-point-null-pointer-deref-at-full-speed-re-enumeration.patch new file mode 100644 index 00000000000..822fa9a59b6 --- /dev/null +++ b/queue-6.10/xhci-fix-panther-point-null-pointer-deref-at-full-speed-re-enumeration.patch @@ -0,0 +1,82 @@ +From af8e119f52e9c13e556be9e03f27957554a84656 Mon Sep 17 00:00:00 2001 +From: Mathias Nyman +Date: Thu, 15 Aug 2024 17:11:17 +0300 +Subject: xhci: Fix Panther point NULL pointer deref at full-speed re-enumeration + +From: Mathias Nyman + +commit af8e119f52e9c13e556be9e03f27957554a84656 upstream. + +re-enumerating full-speed devices after a failed address device command +can trigger a NULL pointer dereference. + +Full-speed devices may need to reconfigure the endpoint 0 Max Packet Size +value during enumeration. Usb core calls usb_ep0_reinit() in this case, +which ends up calling xhci_configure_endpoint(). + +On Panther point xHC the xhci_configure_endpoint() function will +additionally check and reserve bandwidth in software. Other hosts do +this in hardware + +If xHC address device command fails then a new xhci_virt_device structure +is allocated as part of re-enabling the slot, but the bandwidth table +pointers are not set up properly here. +This triggers the NULL pointer dereference the next time usb_ep0_reinit() +is called and xhci_configure_endpoint() tries to check and reserve +bandwidth + +[46710.713538] usb 3-1: new full-speed USB device number 5 using xhci_hcd +[46710.713699] usb 3-1: Device not responding to setup address. +[46710.917684] usb 3-1: Device not responding to setup address. 
+[46711.125536] usb 3-1: device not accepting address 5, error -71 +[46711.125594] BUG: kernel NULL pointer dereference, address: 0000000000000008 +[46711.125600] #PF: supervisor read access in kernel mode +[46711.125603] #PF: error_code(0x0000) - not-present page +[46711.125606] PGD 0 P4D 0 +[46711.125610] Oops: Oops: 0000 [#1] PREEMPT SMP PTI +[46711.125615] CPU: 1 PID: 25760 Comm: kworker/1:2 Not tainted 6.10.3_2 #1 +[46711.125620] Hardware name: Gigabyte Technology Co., Ltd. +[46711.125623] Workqueue: usb_hub_wq hub_event [usbcore] +[46711.125668] RIP: 0010:xhci_reserve_bandwidth (drivers/usb/host/xhci.c + +Fix this by making sure bandwidth table pointers are set up correctly +after a failed address device command, and additionally by avoiding +checking for bandwidth in cases like this where no actual endpoints are +added or removed, i.e. only context for default control endpoint 0 is +evaluated. + +Reported-by: Karel Balej +Closes: https://lore.kernel.org/linux-usb/D3CKQQAETH47.1MUO22RTCH2O3@matfyz.cz/ +Cc: stable@vger.kernel.org +Fixes: 651aaf36a7d7 ("usb: xhci: Handle USB transaction error on address command") +Signed-off-by: Mathias Nyman +Link: https://lore.kernel.org/r/20240815141117.2702314-2-mathias.nyman@linux.intel.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/usb/host/xhci.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +--- a/drivers/usb/host/xhci.c ++++ b/drivers/usb/host/xhci.c +@@ -2837,7 +2837,7 @@ static int xhci_configure_endpoint(struc + xhci->num_active_eps); + return -ENOMEM; + } +- if ((xhci->quirks & XHCI_SW_BW_CHECKING) && ++ if ((xhci->quirks & XHCI_SW_BW_CHECKING) && !ctx_change && + xhci_reserve_bandwidth(xhci, virt_dev, command->in_ctx)) { + if ((xhci->quirks & XHCI_EP_LIMIT_QUIRK)) + xhci_free_host_resources(xhci, ctrl_ctx); +@@ -4200,8 +4200,10 @@ static int xhci_setup_device(struct usb_ + mutex_unlock(&xhci->mutex); + ret = xhci_disable_slot(xhci, udev->slot_id); + xhci_free_virt_device(xhci, 
udev->slot_id); +- if (!ret) +- xhci_alloc_dev(hcd, udev); ++ if (!ret) { ++ if (xhci_alloc_dev(hcd, udev) == 1) ++ xhci_setup_addressable_virt_dev(xhci, udev); ++ } + kfree(command->completion); + kfree(command); + return -EPROTO; -- 2.47.3