From: Greg Kroah-Hartman Date: Wed, 15 Oct 2025 10:56:42 +0000 (+0200) Subject: 6.17-stable patches X-Git-Tag: v5.15.195~118 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=381f84ab711d92c8f7448598514350342dd7383a;p=thirdparty%2Fkernel%2Fstable-queue.git 6.17-stable patches added patches: arm64-map-_text-_stext-virtual-address-range-non-executable-read-only.patch btrfs-fix-the-incorrect-max_bytes-value-for-find_lock_delalloc_range.patch cxl-acpi-hmat-update-cxl-access-coordinates-directly-instead-of-through-hmat.patch fs-always-return-zero-on-success-from-replace_fd.patch fscontext-do-not-consume-log-entries-when-returning-emsgsize.patch rseq-protect-event-mask-against-membarrier-ipi.patch series --- diff --git a/queue-6.17/arm64-map-_text-_stext-virtual-address-range-non-executable-read-only.patch b/queue-6.17/arm64-map-_text-_stext-virtual-address-range-non-executable-read-only.patch new file mode 100644 index 0000000000..29a5619c22 --- /dev/null +++ b/queue-6.17/arm64-map-_text-_stext-virtual-address-range-non-executable-read-only.patch @@ -0,0 +1,146 @@ +From 5973a62efa34c80c9a4e5eac1fca6f6209b902af Mon Sep 17 00:00:00 2001 +From: Omar Sandoval +Date: Fri, 19 Sep 2025 14:27:51 -0700 +Subject: arm64: map [_text, _stext) virtual address range non-executable+read-only + +From: Omar Sandoval + +commit 5973a62efa34c80c9a4e5eac1fca6f6209b902af upstream. + +Since the referenced fixes commit, the kernel's .text section is only +mapped starting from _stext; the region [_text, _stext) is omitted. As a +result, other vmalloc/vmap allocations may use the virtual addresses +nominally in the range [_text, _stext). This address reuse confuses +multiple things: + +1. crash_prepare_elf64_headers() sets up a segment in /proc/vmcore + mapping the entire range [_text, _end) to + [__pa_symbol(_text), __pa_symbol(_end)). Reading an address in + [_text, _stext) from /proc/vmcore therefore gives the incorrect + result. +2. Tools doing symbolization (either by reading /proc/kallsyms or based + on the vmlinux ELF file) will incorrectly identify vmalloc/vmap + allocations in [_text, _stext) as kernel symbols. + +In practice, both of these issues affect the drgn debugger. +Specifically, there were cases where the vmap IRQ stacks for some CPUs +were allocated in [_text, _stext). As a result, drgn could not get the +stack trace for a crash in an IRQ handler because the core dump +contained invalid data for the IRQ stack address. The stack addresses +were also symbolized as being in the _text symbol. + +Fix this by bringing back the mapping of [_text, _stext), but now make +it non-executable and read-only. This prevents other allocations from +using it while still achieving the original goal of not mapping +unpredictable data as executable. Other than the changed protection, +this is effectively a revert of the fixes commit. + +Fixes: e2a073dde921 ("arm64: omit [_text, _stext) from permanent kernel mapping") +Cc: stable@vger.kernel.org +Signed-off-by: Omar Sandoval +Signed-off-by: Will Deacon +Signed-off-by: Greg Kroah-Hartman +--- + arch/arm64/kernel/pi/map_kernel.c | 6 ++++++ + arch/arm64/kernel/setup.c | 4 ++-- + arch/arm64/mm/init.c | 2 +- + arch/arm64/mm/mmu.c | 14 +++++++++----- + 4 files changed, 18 insertions(+), 8 deletions(-) + +--- a/arch/arm64/kernel/pi/map_kernel.c ++++ b/arch/arm64/kernel/pi/map_kernel.c +@@ -78,6 +78,12 @@ static void __init map_kernel(u64 kaslr_ + twopass |= enable_scs; + prot = twopass ? 
data_prot : text_prot; + ++ /* ++ * [_stext, _text) isn't executed after boot and contains some ++ * non-executable, unpredictable data, so map it non-executable. ++ */ ++ map_segment(init_pg_dir, &pgdp, va_offset, _text, _stext, data_prot, ++ false, root_level); + map_segment(init_pg_dir, &pgdp, va_offset, _stext, _etext, prot, + !twopass, root_level); + map_segment(init_pg_dir, &pgdp, va_offset, __start_rodata, +--- a/arch/arm64/kernel/setup.c ++++ b/arch/arm64/kernel/setup.c +@@ -214,7 +214,7 @@ static void __init request_standard_reso + unsigned long i = 0; + size_t res_size; + +- kernel_code.start = __pa_symbol(_stext); ++ kernel_code.start = __pa_symbol(_text); + kernel_code.end = __pa_symbol(__init_begin - 1); + kernel_data.start = __pa_symbol(_sdata); + kernel_data.end = __pa_symbol(_end - 1); +@@ -280,7 +280,7 @@ u64 cpu_logical_map(unsigned int cpu) + + void __init __no_sanitize_address setup_arch(char **cmdline_p) + { +- setup_initial_init_mm(_stext, _etext, _edata, _end); ++ setup_initial_init_mm(_text, _etext, _edata, _end); + + *cmdline_p = boot_command_line; + +--- a/arch/arm64/mm/init.c ++++ b/arch/arm64/mm/init.c +@@ -279,7 +279,7 @@ void __init arm64_memblock_init(void) + * Register the kernel text, kernel data, initrd, and initial + * pagetables with memblock. + */ +- memblock_reserve(__pa_symbol(_stext), _end - _stext); ++ memblock_reserve(__pa_symbol(_text), _end - _text); + if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) { + /* the generic initrd code expects virtual addresses */ + initrd_start = __phys_to_virt(phys_initrd_start); +--- a/arch/arm64/mm/mmu.c ++++ b/arch/arm64/mm/mmu.c +@@ -574,8 +574,8 @@ void __init mark_linear_text_alias_ro(vo + /* + * Remove the write permissions from the linear alias of .text/.rodata + */ +- update_mapping_prot(__pa_symbol(_stext), (unsigned long)lm_alias(_stext), +- (unsigned long)__init_begin - (unsigned long)_stext, ++ update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text), ++ (unsigned long)__init_begin - (unsigned long)_text, + PAGE_KERNEL_RO); + } + +@@ -636,7 +636,7 @@ static inline void arm64_kfence_map_pool + static void __init map_mem(pgd_t *pgdp) + { + static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN); +- phys_addr_t kernel_start = __pa_symbol(_stext); ++ phys_addr_t kernel_start = __pa_symbol(_text); + phys_addr_t kernel_end = __pa_symbol(__init_begin); + phys_addr_t start, end; + phys_addr_t early_kfence_pool; +@@ -683,7 +683,7 @@ static void __init map_mem(pgd_t *pgdp) + } + + /* +- * Map the linear alias of the [_stext, __init_begin) interval ++ * Map the linear alias of the [_text, __init_begin) interval + * as non-executable now, and remove the write permission in + * mark_linear_text_alias_ro() below (which will be called after + * alternative patching has completed). This makes the contents +@@ -710,6 +710,10 @@ void mark_rodata_ro(void) + WRITE_ONCE(rodata_is_rw, false); + update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata, + section_size, PAGE_KERNEL_RO); ++ /* mark the range between _text and _stext as read only. 
*/
++	update_mapping_prot(__pa_symbol(_text), (unsigned long)_text,
++		(unsigned long)_stext - (unsigned long)_text,
++		PAGE_KERNEL_RO);
+ }
+ 
+ static void __init declare_vma(struct vm_struct *vma,
+@@ -780,7 +784,7 @@ static void __init declare_kernel_vmas(v
+ {
+ 	static struct vm_struct vmlinux_seg[KERNEL_SEGMENT_COUNT];
+ 
+-	declare_vma(&vmlinux_seg[0], _stext, _etext, VM_NO_GUARD);
++	declare_vma(&vmlinux_seg[0], _text, _etext, VM_NO_GUARD);
+ 	declare_vma(&vmlinux_seg[1], __start_rodata, __inittext_begin, VM_NO_GUARD);
+ 	declare_vma(&vmlinux_seg[2], __inittext_begin, __inittext_end, VM_NO_GUARD);
+ 	declare_vma(&vmlinux_seg[3], __initdata_begin, __initdata_end, VM_NO_GUARD);
diff --git a/queue-6.17/btrfs-fix-the-incorrect-max_bytes-value-for-find_lock_delalloc_range.patch b/queue-6.17/btrfs-fix-the-incorrect-max_bytes-value-for-find_lock_delalloc_range.patch
new file mode 100644
index 0000000000..a0e58be383
--- /dev/null
+++ b/queue-6.17/btrfs-fix-the-incorrect-max_bytes-value-for-find_lock_delalloc_range.patch
@@ -0,0 +1,142 @@
+From 7b26da407420e5054e3f06c5d13271697add9423 Mon Sep 17 00:00:00 2001
+From: Qu Wenruo
+Date: Fri, 19 Sep 2025 14:33:23 +0930
+Subject: btrfs: fix the incorrect max_bytes value for find_lock_delalloc_range()
+
+From: Qu Wenruo
+
+commit 7b26da407420e5054e3f06c5d13271697add9423 upstream.
+
+[BUG]
+With my local branch to enable bs > ps support for btrfs, sometimes I
+hit the following ASSERT() inside submit_one_sector():
+
+	ASSERT(block_start != EXTENT_MAP_HOLE);
+
+Please note that it's not yet possible to hit this ASSERT() in the wild,
+as it requires btrfs bs > ps support, which is not even in the
+development branch.
+
+But on the other hand, there is also a very low chance to hit the above
+ASSERT() with bs < ps cases, so this is an existing bug affecting not only
+the incoming bs > ps support but also the existing bs < ps support.
+
+[CAUSE]
+Firstly, that ASSERT() means we're trying to submit a dirty block
+without a real extent map or ordered extent map backing it.
+
+Furthermore, with extra debugging, the folio triggering such ASSERT() is
+always larger than the fs block size in my bs > ps case.
+(8K block size, 4K page size)
+
+After some more debugging, the ASSERT() is triggered by the following
+sequence:
+
+ extent_writepage()
+ | We got a 32K folio (4 fs blocks) at file offset 0, and the fs block
+ | size is 8K, page size is 4K.
+ | And there is another 8K folio at file offset 32K, which is also
+ | dirty.
+ | So the filemap layout looks like the following:
+ |
+ | "||" is the folio boundary in the filemap.
+ | "//" is the dirty range.
+ |
+ | 0 8K 16K 24K 32K 40K
+ | |////////| |//////////////////////||////////|
+ |
+ |- writepage_delalloc()
+ | |- find_lock_delalloc_range() for [0, 8K)
+ | | Now range [0, 8K) is properly locked.
+ | |
+ | |- find_lock_delalloc_range() for [16K, 40K)
+ | | |- btrfs_find_delalloc_range() returned range [16K, 40K)
+ | | |- lock_delalloc_folios() locked folio 0 successfully
+ | | |
+ | | | The filemap range [32K, 40K) got dropped from filemap.
+ | | |
+ | | |- lock_delalloc_folios() failed with -EAGAIN on folio 32K
+ | | | As the folio at 32K is dropped.
+ | | |
+ | | |- loops = 1;
+ | | |- max_bytes = PAGE_SIZE;
+ | | |- goto again;
+ | | | This will re-do the lookup for dirty delalloc ranges.
+ | | |
+ | | |- btrfs_find_delalloc_range() called with @max_bytes == 4K
+ | | | This is smaller than block size, so
+ | | | btrfs_find_delalloc_range() is unable to return any range.
+ | | \- return false;
+ | |
+ | \- Now only range [0, 8K) has an OE for it, but for dirty range
+ | [16K, 32K) it's dirty without an OE.
+ | This breaks the assumption that writepage_delalloc() will find
+ | and lock all dirty ranges inside the folio.
+ |
+ |- extent_writepage_io()
+ |- submit_one_sector() for [0, 8K)
+ | Succeeded
+ |
+ |- submit_one_sector() for [16K, 24K)
+ Triggering the ASSERT(), as there is no OE, and the original
+ extent map is a hole.
+
+Please note that this also exposed the same problem for bs < ps
+support, e.g. with 64K page size and 4K block size.
+
+If we fail to lock a folio and fall back into the "loops = 1;"
+branch, we will re-do the search using 64K as max_bytes.
+This may again fail to lock the next folio, and exit early without
+handling all dirty blocks inside the folio.
+
+[FIX]
+Instead of using the fixed size PAGE_SIZE as @max_bytes, use
+@sectorsize, so that we are guaranteed to find and lock any remaining
+blocks inside the folio.
+
+And since we're here, add an extra ASSERT()
+before calling btrfs_find_delalloc_range() to make sure @max_bytes is
+at least no smaller than a block, to avoid a false negative.
+
+Cc: stable@vger.kernel.org # 5.15+
+Signed-off-by: Qu Wenruo
+Signed-off-by: David Sterba
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/btrfs/extent_io.c | 14 +++++++++++---
+ 1 file changed, 11 insertions(+), 3 deletions(-)
+
+--- a/fs/btrfs/extent_io.c
++++ b/fs/btrfs/extent_io.c
+@@ -345,6 +345,13 @@ again:
+ 	/* step one, find a bunch of delalloc bytes starting at start */
+ 	delalloc_start = *start;
+ 	delalloc_end = 0;
++
++	/*
++	 * If @max_bytes is smaller than a block, btrfs_find_delalloc_range() can
++	 * return early without handling any dirty ranges.
++	 */
++	ASSERT(max_bytes >= fs_info->sectorsize);
++
+ 	found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
+ 					  max_bytes, &cached_state);
+ 	if (!found || delalloc_end <= *start || delalloc_start > orig_end) {
+@@ -375,13 +382,14 @@ again:
+ 					  delalloc_end);
+ 		ASSERT(!ret || ret == -EAGAIN);
+ 		if (ret == -EAGAIN) {
+-			/* some of the folios are gone, lets avoid looping by
+-			 * shortening the size of the delalloc range we're searching
++			/*
++			 * Some of the folios are gone, lets avoid looping by
++			 * shortening the size of the delalloc range we're searching.
+ 			 */
+ 			btrfs_free_extent_state(cached_state);
+ 			cached_state = NULL;
+ 			if (!loops) {
+-				max_bytes = PAGE_SIZE;
++				max_bytes = fs_info->sectorsize;
+ 				loops = 1;
+ 				goto again;
+ 			} else {
diff --git a/queue-6.17/cxl-acpi-hmat-update-cxl-access-coordinates-directly-instead-of-through-hmat.patch b/queue-6.17/cxl-acpi-hmat-update-cxl-access-coordinates-directly-instead-of-through-hmat.patch
new file mode 100644
index 0000000000..e94eb9f7b9
--- /dev/null
+++ b/queue-6.17/cxl-acpi-hmat-update-cxl-access-coordinates-directly-instead-of-through-hmat.patch
@@ -0,0 +1,172 @@
+From 2e454fb8056df6da4bba7d89a57bf60e217463c0 Mon Sep 17 00:00:00 2001
+From: Dave Jiang
+Date: Fri, 29 Aug 2025 15:29:06 -0700
+Subject: cxl, acpi/hmat: Update CXL access coordinates directly instead of through HMAT
+
+From: Dave Jiang
+
+commit 2e454fb8056df6da4bba7d89a57bf60e217463c0 upstream.
+
+The current implementation of the CXL memory hotplug notifier gets called
+before the HMAT memory hotplug notifier. The CXL driver calculates the
+access coordinates (bandwidth and latency values) for the CXL end to
+end path (i.e. CPU to endpoint). When the CXL region is onlined, the CXL
+memory hotplug notifier writes the access coordinates to the HMAT target
+structs. Then the HMAT memory hotplug notifier is called and it creates
+the access coordinates for the node sysfs attributes.
+
+During testing on an Intel platform, it was found that although the
+newly calculated coordinates were pushed to sysfs, the sysfs attributes for
+the access coordinates showed up with the wrong initiator. The system has
+4 nodes (0, 1, 2, 3) where nodes 0 and 1 are CPU nodes and nodes 2 and 3 are
+CXL nodes. The expectation is that node 2 would show up as a target to node
+0:
+/sys/devices/system/node/node2/access0/initiators/node0
+
+However, it was observed that node 2 showed up as a target under node 1:
+/sys/devices/system/node/node2/access0/initiators/node1
+
+The original intent of the 'ext_updated' flag in HMAT handling code was to
+stop the HMAT memory hotplug callback from clobbering the access coordinates
+after CXL has injected its calculated coordinates and replaced the generic
+target access coordinates provided by the HMAT table in the HMAT target
+structs. However, the flag is hacky at best and blocks the updates from
+other CXL regions that are onlined in the same node later on. Remove the
+'ext_updated' flag usage and just update the access coordinates for the
+nodes directly without touching HMAT target data.
+
+The hotplug memory callback ordering is changed. Instead of changing CXL,
+move HMAT back so there's room for the levels rather than having CXL share
+the same level as SLAB_CALLBACK_PRI. The change results in the CXL
+callback being executed after the HMAT callback.
+
+With the change, the CXL hotplug memory notifier runs after the HMAT
+callback. The HMAT callback will create the node sysfs attributes for
+access coordinates. The CXL callback will write the access coordinates to
+the now-created node sysfs attributes directly and will not pollute the
+HMAT target values.
+
+A nodemask is introduced to track whether a node has been updated and
+to prevent further updates.
+
+Fixes: 067353a46d8c ("cxl/region: Add memory hotplug notifier for cxl region")
+Cc: stable@vger.kernel.org
+Tested-by: Marc Herbert
+Reviewed-by: Dan Williams
+Reviewed-by: Jonathan Cameron
+Link: https://patch.msgid.link/20250829222907.1290912-4-dave.jiang@intel.com
+Signed-off-by: Dave Jiang
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/acpi/numa/hmat.c | 6 ------
+ drivers/cxl/core/cdat.c | 5 -----
+ drivers/cxl/core/core.h | 1 -
+ drivers/cxl/core/region.c | 20 ++++++++++++--------
+ include/linux/memory.h | 2 +-
+ 5 files changed, 13 insertions(+), 21 deletions(-)
+
+--- a/drivers/acpi/numa/hmat.c
++++ b/drivers/acpi/numa/hmat.c
+@@ -74,7 +74,6 @@ struct memory_target {
+ 	struct node_cache_attrs cache_attrs;
+ 	u8 gen_port_device_handle[ACPI_SRAT_DEVICE_HANDLE_SIZE];
+ 	bool registered;
+-	bool ext_updated; /* externally updated */
+ };
+ 
+ struct memory_initiator {
+@@ -391,7 +390,6 @@ int hmat_update_target_coordinates(int n
+ 				   coord->read_bandwidth, access);
+ 	hmat_update_target_access(target, ACPI_HMAT_WRITE_BANDWIDTH,
+ 				   coord->write_bandwidth, access);
+-	target->ext_updated = true;
+ 
+ 	return 0;
+ }
+@@ -773,10 +771,6 @@ static void hmat_update_target_attrs(str
+ 	u32 best = 0;
+ 	int i;
+ 
+-	/* Don't update if an external agent has changed the data.
*/ +- if (target->ext_updated) +- return; +- + /* Don't update for generic port if there's no device handle */ + if ((access == NODE_ACCESS_CLASS_GENPORT_SINK_LOCAL || + access == NODE_ACCESS_CLASS_GENPORT_SINK_CPU) && +--- a/drivers/cxl/core/cdat.c ++++ b/drivers/cxl/core/cdat.c +@@ -1081,8 +1081,3 @@ int cxl_update_hmat_access_coordinates(i + { + return hmat_update_target_coordinates(nid, &cxlr->coord[access], access); + } +- +-bool cxl_need_node_perf_attrs_update(int nid) +-{ +- return !acpi_node_backed_by_real_pxm(nid); +-} +--- a/drivers/cxl/core/core.h ++++ b/drivers/cxl/core/core.h +@@ -139,7 +139,6 @@ long cxl_pci_get_latency(struct pci_dev + int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c); + int cxl_update_hmat_access_coordinates(int nid, struct cxl_region *cxlr, + enum access_coordinate_class access); +-bool cxl_need_node_perf_attrs_update(int nid); + int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port, + struct access_coordinate *c); + +--- a/drivers/cxl/core/region.c ++++ b/drivers/cxl/core/region.c +@@ -30,6 +30,12 @@ + * 3. Decoder targets + */ + ++/* ++ * nodemask that sets per node when the access_coordinates for the node has ++ * been updated by the CXL memory hotplug notifier. ++ */ ++static nodemask_t nodemask_region_seen = NODE_MASK_NONE; ++ + static struct cxl_region *to_cxl_region(struct device *dev); + + #define __ACCESS_ATTR_RO(_level, _name) { \ +@@ -2442,14 +2448,8 @@ static bool cxl_region_update_coordinate + + for (int i = 0; i < ACCESS_COORDINATE_MAX; i++) { + if (cxlr->coord[i].read_bandwidth) { +- rc = 0; +- if (cxl_need_node_perf_attrs_update(nid)) +- node_set_perf_attrs(nid, &cxlr->coord[i], i); +- else +- rc = cxl_update_hmat_access_coordinates(nid, cxlr, i); +- +- if (rc == 0) +- cset++; ++ node_update_perf_attrs(nid, &cxlr->coord[i], i); ++ cset++; + } + } + +@@ -2487,6 +2487,10 @@ static int cxl_region_perf_attrs_callbac + if (nid != region_nid) + return NOTIFY_DONE; + ++ /* No action needed if node bit already set */ ++ if (node_test_and_set(nid, nodemask_region_seen)) ++ return NOTIFY_DONE; ++ + if (!cxl_region_update_coordinates(cxlr, nid)) + return NOTIFY_DONE; + +--- a/include/linux/memory.h ++++ b/include/linux/memory.h +@@ -120,8 +120,8 @@ struct mem_section; + */ + #define DEFAULT_CALLBACK_PRI 0 + #define SLAB_CALLBACK_PRI 1 +-#define HMAT_CALLBACK_PRI 2 + #define CXL_CALLBACK_PRI 5 ++#define HMAT_CALLBACK_PRI 6 + #define MM_COMPUTE_BATCH_PRI 10 + #define CPUSET_CALLBACK_PRI 10 + #define MEMTIER_HOTPLUG_PRI 100 diff --git a/queue-6.17/fs-always-return-zero-on-success-from-replace_fd.patch b/queue-6.17/fs-always-return-zero-on-success-from-replace_fd.patch new file mode 100644 index 0000000000..ef6c58a43d --- /dev/null +++ b/queue-6.17/fs-always-return-zero-on-success-from-replace_fd.patch @@ -0,0 +1,48 @@ +From 708c04a5c2b78e22f56e2350de41feba74dfccd9 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= +Date: Tue, 5 Aug 2025 14:38:08 +0200 +Subject: fs: always return zero on success from replace_fd() +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Thomas Weißschuh + +commit 708c04a5c2b78e22f56e2350de41feba74dfccd9 upstream. + +replace_fd() returns the number of the new file descriptor through the +return value of do_dup2(). However its callers never care about the +specific returned number. 
In fact the caller in receive_fd_replace() treats +any non-zero return value as an error and therefore never calls +__receive_sock() for most file descriptors, which is a bug. + +To fix the bug in receive_fd_replace() and to avoid the same issue +happening in future callers, signal success through a plain zero. + +Suggested-by: Al Viro +Link: https://lore.kernel.org/lkml/20250801220215.GS222315@ZenIV/ +Fixes: 173817151b15 ("fs: Expand __receive_fd() to accept existing fd") +Fixes: 42eb0d54c08a ("fs: split receive_fd_replace from __receive_fd") +Cc: stable@vger.kernel.org +Signed-off-by: Thomas Weißschuh +Link: https://lore.kernel.org/20250805-fix-receive_fd_replace-v3-1-b72ba8b34bac@linutronix.de +Signed-off-by: Christian Brauner +Signed-off-by: Greg Kroah-Hartman +--- + fs/file.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +--- a/fs/file.c ++++ b/fs/file.c +@@ -1330,7 +1330,10 @@ int replace_fd(unsigned fd, struct file + err = expand_files(files, fd); + if (unlikely(err < 0)) + goto out_unlock; +- return do_dup2(files, file, fd, flags); ++ err = do_dup2(files, file, fd, flags); ++ if (err < 0) ++ return err; ++ return 0; + + out_unlock: + spin_unlock(&files->file_lock); diff --git a/queue-6.17/fscontext-do-not-consume-log-entries-when-returning-emsgsize.patch b/queue-6.17/fscontext-do-not-consume-log-entries-when-returning-emsgsize.patch new file mode 100644 index 0000000000..80edfb9696 --- /dev/null +++ b/queue-6.17/fscontext-do-not-consume-log-entries-when-returning-emsgsize.patch @@ -0,0 +1,121 @@ +From 72d271a7baa7062cb27e774ac37c5459c6d20e22 Mon Sep 17 00:00:00 2001 +From: Aleksa Sarai +Date: Thu, 7 Aug 2025 03:55:23 +1000 +Subject: fscontext: do not consume log entries when returning -EMSGSIZE + +From: Aleksa Sarai + +commit 72d271a7baa7062cb27e774ac37c5459c6d20e22 upstream. + +Userspace generally expects APIs that return -EMSGSIZE to allow for them +to adjust their buffer size and retry the operation. However, the +fscontext log would previously clear the message even in the -EMSGSIZE +case. + +Given that it is very cheap for us to check whether the buffer is too +small before we remove the message from the ring buffer, let's just do +that instead. While we're at it, refactor some fscontext_read() into a +separate helper to make the ring buffer logic a bit easier to read. + +Fixes: 007ec26cdc9f ("vfs: Implement logging through fs_context") +Cc: David Howells +Cc: stable@vger.kernel.org # v5.2+ +Signed-off-by: Aleksa Sarai +Link: https://lore.kernel.org/20250807-fscontext-log-cleanups-v3-1-8d91d6242dc3@cyphar.com +Signed-off-by: Christian Brauner +Signed-off-by: Greg Kroah-Hartman +--- + fs/fsopen.c | 70 ++++++++++++++++++++++++++++++++---------------------------- + 1 file changed, 38 insertions(+), 32 deletions(-) + +--- a/fs/fsopen.c ++++ b/fs/fsopen.c +@@ -18,50 +18,56 @@ + #include "internal.h" + #include "mount.h" + ++static inline const char *fetch_message_locked(struct fc_log *log, size_t len, ++ bool *need_free) ++{ ++ const char *p; ++ int index; ++ ++ if (unlikely(log->head == log->tail)) ++ return ERR_PTR(-ENODATA); ++ ++ index = log->tail & (ARRAY_SIZE(log->buffer) - 1); ++ p = log->buffer[index]; ++ if (unlikely(strlen(p) > len)) ++ return ERR_PTR(-EMSGSIZE); ++ ++ log->buffer[index] = NULL; ++ *need_free = log->need_free & (1 << index); ++ log->need_free &= ~(1 << index); ++ log->tail++; ++ ++ return p; ++} ++ + /* + * Allow the user to read back any error, warning or informational messages. ++ * Only one message is returned for each read(2) call. 
+ */ + static ssize_t fscontext_read(struct file *file, + char __user *_buf, size_t len, loff_t *pos) + { + struct fs_context *fc = file->private_data; +- struct fc_log *log = fc->log.log; +- unsigned int logsize = ARRAY_SIZE(log->buffer); +- ssize_t ret; +- char *p; ++ ssize_t err; ++ const char *p __free(kfree) = NULL, *message; + bool need_free; +- int index, n; +- +- ret = mutex_lock_interruptible(&fc->uapi_mutex); +- if (ret < 0) +- return ret; +- +- if (log->head == log->tail) { +- mutex_unlock(&fc->uapi_mutex); +- return -ENODATA; +- } ++ int n; + +- index = log->tail & (logsize - 1); +- p = log->buffer[index]; +- need_free = log->need_free & (1 << index); +- log->buffer[index] = NULL; +- log->need_free &= ~(1 << index); +- log->tail++; ++ err = mutex_lock_interruptible(&fc->uapi_mutex); ++ if (err < 0) ++ return err; ++ message = fetch_message_locked(fc->log.log, len, &need_free); + mutex_unlock(&fc->uapi_mutex); ++ if (IS_ERR(message)) ++ return PTR_ERR(message); + +- ret = -EMSGSIZE; +- n = strlen(p); +- if (n > len) +- goto err_free; +- ret = -EFAULT; +- if (copy_to_user(_buf, p, n) != 0) +- goto err_free; +- ret = n; +- +-err_free: + if (need_free) +- kfree(p); +- return ret; ++ p = message; ++ ++ n = strlen(message); ++ if (copy_to_user(_buf, message, n)) ++ return -EFAULT; ++ return n; + } + + static int fscontext_release(struct inode *inode, struct file *file) diff --git a/queue-6.17/rseq-protect-event-mask-against-membarrier-ipi.patch b/queue-6.17/rseq-protect-event-mask-against-membarrier-ipi.patch new file mode 100644 index 0000000000..eb05c242bd --- /dev/null +++ b/queue-6.17/rseq-protect-event-mask-against-membarrier-ipi.patch @@ -0,0 +1,75 @@ +From 6eb350a2233100a283f882c023e5ad426d0ed63b Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Wed, 13 Aug 2025 17:02:30 +0200 +Subject: rseq: Protect event mask against membarrier IPI + +From: Thomas Gleixner + +commit 6eb350a2233100a283f882c023e5ad426d0ed63b upstream. + +rseq_need_restart() reads and clears task::rseq_event_mask with preemption +disabled to guard against the scheduler. + +But membarrier() uses an IPI and sets the PREEMPT bit in the event mask +from the IPI, which leaves that RMW operation unprotected. + +Use guard(irq) if CONFIG_MEMBARRIER is enabled to fix that. + +Fixes: 2a36ab717e8f ("rseq/membarrier: Add MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ") +Signed-off-by: Thomas Gleixner +Reviewed-by: Boqun Feng +Reviewed-by: Mathieu Desnoyers +Cc: stable@vger.kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/rseq.h | 11 ++++++++--- + kernel/rseq.c | 10 +++++----- + 2 files changed, 13 insertions(+), 8 deletions(-) + +--- a/include/linux/rseq.h ++++ b/include/linux/rseq.h +@@ -7,6 +7,12 @@ + #include + #include + ++#ifdef CONFIG_MEMBARRIER ++# define RSEQ_EVENT_GUARD irq ++#else ++# define RSEQ_EVENT_GUARD preempt ++#endif ++ + /* + * Map the event mask on the user-space ABI enum rseq_cs_flags + * for direct mask checks. +@@ -41,9 +47,8 @@ static inline void rseq_handle_notify_re + static inline void rseq_signal_deliver(struct ksignal *ksig, + struct pt_regs *regs) + { +- preempt_disable(); +- __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask); +- preempt_enable(); ++ scoped_guard(RSEQ_EVENT_GUARD) ++ __set_bit(RSEQ_EVENT_SIGNAL_BIT, ¤t->rseq_event_mask); + rseq_handle_notify_resume(ksig, regs); + } + +--- a/kernel/rseq.c ++++ b/kernel/rseq.c +@@ -342,12 +342,12 @@ static int rseq_need_restart(struct task + + /* + * Load and clear event mask atomically with respect to +- * scheduler preemption. 
++ * scheduler preemption and membarrier IPIs. + */ +- preempt_disable(); +- event_mask = t->rseq_event_mask; +- t->rseq_event_mask = 0; +- preempt_enable(); ++ scoped_guard(RSEQ_EVENT_GUARD) { ++ event_mask = t->rseq_event_mask; ++ t->rseq_event_mask = 0; ++ } + + return !!event_mask; + } diff --git a/queue-6.17/series b/queue-6.17/series new file mode 100644 index 0000000000..8801f19b32 --- /dev/null +++ b/queue-6.17/series @@ -0,0 +1,6 @@ +fs-always-return-zero-on-success-from-replace_fd.patch +fscontext-do-not-consume-log-entries-when-returning-emsgsize.patch +btrfs-fix-the-incorrect-max_bytes-value-for-find_lock_delalloc_range.patch +arm64-map-_text-_stext-virtual-address-range-non-executable-read-only.patch +cxl-acpi-hmat-update-cxl-access-coordinates-directly-instead-of-through-hmat.patch +rseq-protect-event-mask-against-membarrier-ipi.patch
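The rseq patch above closes a classic lost-update race: the read-and-clear of
task::rseq_event_mask was only protected against preemption, while the membarrier
IPI can set a bit in that mask from interrupt context in the middle of the RMW.
As a rough illustration only (plain userspace C, with a POSIX signal handler
standing in for the membarrier IPI; this is an analogy, not kernel code and not
part of the patch), the shape of the fix is: block the asynchronous writer around
the read-and-clear so no event set in between is lost.

```c
#include <signal.h>
#include <stdio.h>

/* Event mask written asynchronously (the stand-in for the IPI writer). */
static volatile sig_atomic_t event_mask;

static void event_handler(int sig)
{
	(void)sig;
	event_mask |= 1;	/* like the IPI setting RSEQ_EVENT_PREEMPT_BIT */
}

int main(void)
{
	struct sigaction sa = { 0 };
	sigset_t guard, old;
	int events;

	sa.sa_handler = event_handler;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGUSR1, &sa, NULL);

	sigemptyset(&guard);
	sigaddset(&guard, SIGUSR1);

	raise(SIGUSR1);		/* an "event" arrives at some point */

	/*
	 * Guarded read-and-clear: with the asynchronous writer blocked, no
	 * event can be delivered between reading the mask and zeroing it, so
	 * nothing is lost. This is the role the irq guard (RSEQ_EVENT_GUARD)
	 * plays in rseq_need_restart() in the patch above.
	 */
	sigprocmask(SIG_BLOCK, &guard, &old);
	events = event_mask;
	event_mask = 0;
	sigprocmask(SIG_SETMASK, &old, NULL);

	printf("consumed events: %d\n", events);
	return 0;
}
```

In the kernel change itself, RSEQ_EVENT_GUARD is "irq" when CONFIG_MEMBARRIER is
enabled, so the read-and-clear of t->rseq_event_mask runs with interrupts disabled
and the membarrier IPI can no longer slip in between the load and the store.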