From d1eb2de35e10e73384ce34d83cc978d1a08df2cf Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 13 Jun 2024 09:33:22 +0200 Subject: [PATCH] 6.9-stable patches added patches: acpi-apei-einj-fix-einj_dev-release-leak.patch filemap-add-helper-mapping_max_folio_size.patch io_uring-check-for-non-null-file-pointer-in-io_file_can_poll.patch io_uring-napi-fix-timeout-calculation.patch iomap-fault-in-smaller-chunks-for-non-large-folio-mappings.patch --- ...-apei-einj-fix-einj_dev-release-leak.patch | 42 ++++++++++ ...ap-add-helper-mapping_max_folio_size.patch | 83 ++++++++++++++++++ ...ull-file-pointer-in-io_file_can_poll.patch | 76 +++++++++++++++++ ...o_uring-napi-fix-timeout-calculation.patch | 84 +++++++++++++++++++ ...-chunks-for-non-large-folio-mappings.patch | 81 ++++++++++++++++++ queue-6.9/series | 5 ++ 6 files changed, 371 insertions(+) create mode 100644 queue-6.9/acpi-apei-einj-fix-einj_dev-release-leak.patch create mode 100644 queue-6.9/filemap-add-helper-mapping_max_folio_size.patch create mode 100644 queue-6.9/io_uring-check-for-non-null-file-pointer-in-io_file_can_poll.patch create mode 100644 queue-6.9/io_uring-napi-fix-timeout-calculation.patch create mode 100644 queue-6.9/iomap-fault-in-smaller-chunks-for-non-large-folio-mappings.patch diff --git a/queue-6.9/acpi-apei-einj-fix-einj_dev-release-leak.patch b/queue-6.9/acpi-apei-einj-fix-einj_dev-release-leak.patch new file mode 100644 index 00000000000..b49bbd37d03 --- /dev/null +++ b/queue-6.9/acpi-apei-einj-fix-einj_dev-release-leak.patch @@ -0,0 +1,42 @@ +From 7ff6c798eca05e4a9dcb80163cb454d7787a4bc3 Mon Sep 17 00:00:00 2001 +From: Dan Williams +Date: Tue, 21 May 2024 15:46:32 -0700 +Subject: ACPI: APEI: EINJ: Fix einj_dev release leak + +From: Dan Williams + +commit 7ff6c798eca05e4a9dcb80163cb454d7787a4bc3 upstream. + +The platform driver conversion of EINJ mistakenly used +platform_device_del() to unwind platform_device_register_full() at +module exit. 
This leads to a small leak of one 'struct platform_device' +instance per module load/unload cycle. Switch to +platform_device_unregister() which performs both device_del() and final +put_device(). + +Fixes: 5621fafaac00 ("EINJ: Migrate to a platform driver") +Cc: 6.9+ # 6.9+ +Signed-off-by: Dan Williams +Reviewed-by: Ben Cheatham +Signed-off-by: Rafael J. Wysocki +Signed-off-by: Greg Kroah-Hartman +--- + drivers/acpi/apei/einj-core.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/acpi/apei/einj-core.c b/drivers/acpi/apei/einj-core.c +index 9515bcfe5e97..73903a497d73 100644 +--- a/drivers/acpi/apei/einj-core.c ++++ b/drivers/acpi/apei/einj-core.c +@@ -909,7 +909,7 @@ static void __exit einj_exit(void) + if (einj_initialized) + platform_driver_unregister(&einj_driver); + +- platform_device_del(einj_dev); ++ platform_device_unregister(einj_dev); + } + + module_init(einj_init); +-- +2.45.2 + diff --git a/queue-6.9/filemap-add-helper-mapping_max_folio_size.patch b/queue-6.9/filemap-add-helper-mapping_max_folio_size.patch new file mode 100644 index 00000000000..87c1d56d4ae --- /dev/null +++ b/queue-6.9/filemap-add-helper-mapping_max_folio_size.patch @@ -0,0 +1,83 @@ +From 79c137454815ba5554caa8eeb4ad5c94e96e45ce Mon Sep 17 00:00:00 2001 +From: Xu Yang +Date: Tue, 21 May 2024 19:49:38 +0800 +Subject: filemap: add helper mapping_max_folio_size() + +From: Xu Yang + +commit 79c137454815ba5554caa8eeb4ad5c94e96e45ce upstream. + +Add mapping_max_folio_size() to get the maximum folio size for this +pagecache mapping. + +Fixes: 5d8edfb900d5 ("iomap: Copy larger chunks from userspace") +Cc: stable@vger.kernel.org +Reviewed-by: Darrick J. 
Wong +Signed-off-by: Xu Yang +Link: https://lore.kernel.org/r/20240521114939.2541461-1-xu.yang_2@nxp.com +Reviewed-by: Ritesh Harjani (IBM) +Reviewed-by: Christoph Hellwig +Reviewed-by: Matthew Wilcox (Oracle) +Signed-off-by: Christian Brauner +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/pagemap.h | 34 +++++++++++++++++++++------------- + 1 file changed, 21 insertions(+), 13 deletions(-) + +--- a/include/linux/pagemap.h ++++ b/include/linux/pagemap.h +@@ -344,6 +344,19 @@ static inline void mapping_set_gfp_mask( + m->gfp_mask = mask; + } + ++/* ++ * There are some parts of the kernel which assume that PMD entries ++ * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then, ++ * limit the maximum allocation order to PMD size. I'm not aware of any ++ * assumptions about maximum order if THP are disabled, but 8 seems like ++ * a good order (that's 1MB if you're using 4kB pages) ++ */ ++#ifdef CONFIG_TRANSPARENT_HUGEPAGE ++#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER ++#else ++#define MAX_PAGECACHE_ORDER 8 ++#endif ++ + /** + * mapping_set_large_folios() - Indicate the file supports large folios. + * @mapping: The file. +@@ -370,6 +383,14 @@ static inline bool mapping_large_folio_s + test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); + } + ++/* Return the maximum folio size for this pagecache mapping, in bytes. */ ++static inline size_t mapping_max_folio_size(struct address_space *mapping) ++{ ++ if (mapping_large_folio_support(mapping)) ++ return PAGE_SIZE << MAX_PAGECACHE_ORDER; ++ return PAGE_SIZE; ++} ++ + static inline int filemap_nr_thps(struct address_space *mapping) + { + #ifdef CONFIG_READ_ONLY_THP_FOR_FS +@@ -528,19 +549,6 @@ static inline void *detach_page_private( + return folio_detach_private(page_folio(page)); + } + +-/* +- * There are some parts of the kernel which assume that PMD entries +- * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then, +- * limit the maximum allocation order to PMD size. 
I'm not aware of any +- * assumptions about maximum order if THP are disabled, but 8 seems like +- * a good order (that's 1MB if you're using 4kB pages) +- */ +-#ifdef CONFIG_TRANSPARENT_HUGEPAGE +-#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER +-#else +-#define MAX_PAGECACHE_ORDER 8 +-#endif +- + #ifdef CONFIG_NUMA + struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order); + #else diff --git a/queue-6.9/io_uring-check-for-non-null-file-pointer-in-io_file_can_poll.patch b/queue-6.9/io_uring-check-for-non-null-file-pointer-in-io_file_can_poll.patch new file mode 100644 index 00000000000..c404275bd77 --- /dev/null +++ b/queue-6.9/io_uring-check-for-non-null-file-pointer-in-io_file_can_poll.patch @@ -0,0 +1,76 @@ +From 5fc16fa5f13b3c06fdb959ef262050bd810416a2 Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Sat, 1 Jun 2024 12:25:35 -0600 +Subject: io_uring: check for non-NULL file pointer in io_file_can_poll() + +From: Jens Axboe + +commit 5fc16fa5f13b3c06fdb959ef262050bd810416a2 upstream. + +In earlier kernels, it was possible to trigger a NULL pointer +dereference off the forced async preparation path, if no file had +been assigned. 
The trace leading to that looks as follows: + +BUG: kernel NULL pointer dereference, address: 00000000000000b0 +PGD 0 P4D 0 +Oops: 0000 [#1] PREEMPT SMP +CPU: 67 PID: 1633 Comm: buf-ring-invali Not tainted 6.8.0-rc3+ #1 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS unknown 2/2/2022 +RIP: 0010:io_buffer_select+0xc3/0x210 +Code: 00 00 48 39 d1 0f 82 ae 00 00 00 48 81 4b 48 00 00 01 00 48 89 73 70 0f b7 50 0c 66 89 53 42 85 ed 0f 85 d2 00 00 00 48 8b 13 <48> 8b 92 b0 00 00 00 48 83 7a 40 00 0f 84 21 01 00 00 4c 8b 20 5b +RSP: 0018:ffffb7bec38c7d88 EFLAGS: 00010246 +RAX: ffff97af2be61000 RBX: ffff97af234f1700 RCX: 0000000000000040 +RDX: 0000000000000000 RSI: ffff97aecfb04820 RDI: ffff97af234f1700 +RBP: 0000000000000000 R08: 0000000000200030 R09: 0000000000000020 +R10: ffffb7bec38c7dc8 R11: 000000000000c000 R12: ffffb7bec38c7db8 +R13: ffff97aecfb05800 R14: ffff97aecfb05800 R15: ffff97af2be5e000 +FS: 00007f852f74b740(0000) GS:ffff97b1eeec0000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 00000000000000b0 CR3: 000000016deab005 CR4: 0000000000370ef0 +Call Trace: + + ? __die+0x1f/0x60 + ? page_fault_oops+0x14d/0x420 + ? do_user_addr_fault+0x61/0x6a0 + ? exc_page_fault+0x6c/0x150 + ? asm_exc_page_fault+0x22/0x30 + ? io_buffer_select+0xc3/0x210 + __io_import_iovec+0xb5/0x120 + io_readv_prep_async+0x36/0x70 + io_queue_sqe_fallback+0x20/0x260 + io_submit_sqes+0x314/0x630 + __do_sys_io_uring_enter+0x339/0xbc0 + ? __do_sys_io_uring_register+0x11b/0xc50 + ? vm_mmap_pgoff+0xce/0x160 + do_syscall_64+0x5f/0x180 + entry_SYSCALL_64_after_hwframe+0x46/0x4e +RIP: 0033:0x55e0a110a67e +Code: ba cc 00 00 00 45 31 c0 44 0f b6 92 d0 00 00 00 31 d2 41 b9 08 00 00 00 41 83 e2 01 41 c1 e2 04 41 09 c2 b8 aa 01 00 00 0f 05 90 89 30 eb a9 0f 1f 40 00 48 8b 42 20 8b 00 a8 06 75 af 85 f6 + +because the request is marked forced ASYNC and has a bad file fd, and +hence takes the forced async prep path. 
+ +Current kernels with the request async prep cleaned up can no longer hit +this issue, but for ease of backporting, let's add this safety check in +here too as it really doesn't hurt. For both cases, this will inevitably +end with a CQE posted with -EBADF. + +Cc: stable@vger.kernel.org +Fixes: a76c0b31eef5 ("io_uring: commit non-pollable provided mapped buffers upfront") +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + io_uring/io_uring.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/io_uring/io_uring.h ++++ b/io_uring/io_uring.h +@@ -442,7 +442,7 @@ static inline bool io_file_can_poll(stru + { + if (req->flags & REQ_F_CAN_POLL) + return true; +- if (file_can_poll(req->file)) { ++ if (req->file && file_can_poll(req->file)) { + req->flags |= REQ_F_CAN_POLL; + return true; + } diff --git a/queue-6.9/io_uring-napi-fix-timeout-calculation.patch b/queue-6.9/io_uring-napi-fix-timeout-calculation.patch new file mode 100644 index 00000000000..36e06d1583e --- /dev/null +++ b/queue-6.9/io_uring-napi-fix-timeout-calculation.patch @@ -0,0 +1,84 @@ +From 415ce0ea55c5a3afea501a773e002be9ed7149f5 Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Mon, 3 Jun 2024 13:56:53 -0600 +Subject: io_uring/napi: fix timeout calculation + +From: Jens Axboe + +commit 415ce0ea55c5a3afea501a773e002be9ed7149f5 upstream. + +Not quite sure what __io_napi_adjust_timeout() was attempting to do, it's +adjusting both the NAPI timeout and the general overall timeout, and +calculating a value that is never used. The overall timeout is a super +set of the NAPI timeout, and doesn't need adjusting. The only thing we +really need to care about is that the NAPI timeout doesn't exceed the +overall timeout. If a user asked for a timeout of eg 5 usec and NAPI +timeout is 10 usec, then we should not spin for 10 usec. + +While in there, sanitize the time checking a bit. If we have a negative +value in the passed in timeout, discard it. 
Round up the value as well, +so we don't end up with a NAPI timeout for the majority of the wait, +with only a tiny sleep value at the end. + +Hence the only case we need to care about is if the NAPI timeout is +larger than the overall timeout. If it is, cap the NAPI timeout at what +the overall timeout is. + +Cc: stable@vger.kernel.org +Fixes: 8d0c12a80cde ("io-uring: add napi busy poll support") +Reported-by: Lewis Baker +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + io_uring/napi.c | 22 ++++++++++++---------- + 1 file changed, 12 insertions(+), 10 deletions(-) + +diff --git a/io_uring/napi.c b/io_uring/napi.c +index 883a1a665907..8c18ede595c4 100644 +--- a/io_uring/napi.c ++++ b/io_uring/napi.c +@@ -261,12 +261,14 @@ int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) + } + + /* +- * __io_napi_adjust_timeout() - Add napi id to the busy poll list ++ * __io_napi_adjust_timeout() - adjust busy loop timeout + * @ctx: pointer to io-uring context structure + * @iowq: pointer to io wait queue + * @ts: pointer to timespec or NULL + * + * Adjust the busy loop timeout according to timespec and busy poll timeout. ++ * If the specified NAPI timeout is bigger than the wait timeout, then adjust ++ * the NAPI timeout accordingly. 
+ */ +void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iowq, + struct timespec64 *ts) +@@ -274,16 +276,16 @@ void __io_napi_adjust_timeout(struct io_ring_ctx *ctx, struct io_wait_queue *iow + unsigned int poll_to = READ_ONCE(ctx->napi_busy_poll_to); + + if (ts) { +- struct timespec64 poll_to_ts = ns_to_timespec64(1000 * (s64)poll_to); ++ struct timespec64 poll_to_ts; + +- if (timespec64_compare(ts, &poll_to_ts) > 0) { +- *ts = timespec64_sub(*ts, poll_to_ts); +- } else { +- u64 to = timespec64_to_ns(ts); +- +- do_div(to, 1000); +- ts->tv_sec = 0; +- ts->tv_nsec = 0; ++ poll_to_ts = ns_to_timespec64(1000 * (s64)poll_to); ++ if (timespec64_compare(ts, &poll_to_ts) < 0) { ++ s64 poll_to_ns = timespec64_to_ns(ts); ++ if (poll_to_ns > 0) { ++ u64 val = poll_to_ns + 999; ++ do_div(val, (s64) 1000); ++ poll_to = val; ++ } + } + } + +-- +2.45.2 + diff --git a/queue-6.9/iomap-fault-in-smaller-chunks-for-non-large-folio-mappings.patch b/queue-6.9/iomap-fault-in-smaller-chunks-for-non-large-folio-mappings.patch new file mode 100644 index 00000000000..a776a996c19 --- /dev/null +++ b/queue-6.9/iomap-fault-in-smaller-chunks-for-non-large-folio-mappings.patch @@ -0,0 +1,81 @@ +From 4e527d5841e24623181edc7fd6f6598ffa810e10 Mon Sep 17 00:00:00 2001 +From: Xu Yang +Date: Tue, 21 May 2024 19:49:39 +0800 +Subject: iomap: fault in smaller chunks for non-large folio mappings + +From: Xu Yang + +commit 4e527d5841e24623181edc7fd6f6598ffa810e10 upstream. + +Since commit (5d8edfb900d5 "iomap: Copy larger chunks from userspace"), +iomap will try to copy in larger chunks than PAGE_SIZE. However, if the +mapping doesn't support large folio, only one page of maximum 4KB will +be created and 4KB data will be written to pagecache each time. Then, +next 4KB will be handled in next iteration. This will cause potential +write performance problem. + +If chunk is 2MB, total 512 pages need to be handled finally. 
During this +period, fault_in_iov_iter_readable() is called to check iov_iter readable +validity. Since only 4KB will be handled each time, below address space +will be checked over and over again: + +start end +- +buf, buf+2MB +buf+4KB, buf+2MB +buf+8KB, buf+2MB +... +buf+2044KB buf+2MB + +Obviously the checking size is wrong since only 4KB will be handled each +time. So this will get a correct chunk to let iomap work well in non-large +folio case. + +With this change, the write speed will be stable. Tested on ARM64 device. + +Before: + + - dd if=/dev/zero of=/dev/sda bs=400K count=10485 (334 MB/s) + - dd if=/dev/zero of=/dev/sda bs=800K count=5242 (278 MB/s) + - dd if=/dev/zero of=/dev/sda bs=1600K count=2621 (204 MB/s) + - dd if=/dev/zero of=/dev/sda bs=2200K count=1906 (170 MB/s) + - dd if=/dev/zero of=/dev/sda bs=3000K count=1398 (150 MB/s) + - dd if=/dev/zero of=/dev/sda bs=4500K count=932 (139 MB/s) + +After: + + - dd if=/dev/zero of=/dev/sda bs=400K count=10485 (339 MB/s) + - dd if=/dev/zero of=/dev/sda bs=800K count=5242 (330 MB/s) + - dd if=/dev/zero of=/dev/sda bs=1600K count=2621 (332 MB/s) + - dd if=/dev/zero of=/dev/sda bs=2200K count=1906 (333 MB/s) + - dd if=/dev/zero of=/dev/sda bs=3000K count=1398 (333 MB/s) + - dd if=/dev/zero of=/dev/sda bs=4500K count=932 (333 MB/s) + +Fixes: 5d8edfb900d5 ("iomap: Copy larger chunks from userspace") +Cc: stable@vger.kernel.org +Reviewed-by: Darrick J. 
Wong +Signed-off-by: Xu Yang +Link: https://lore.kernel.org/r/20240521114939.2541461-2-xu.yang_2@nxp.com +Reviewed-by: Christoph Hellwig +Reviewed-by: Matthew Wilcox (Oracle) +Signed-off-by: Christian Brauner +Signed-off-by: Greg Kroah-Hartman +--- + fs/iomap/buffered-io.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/iomap/buffered-io.c ++++ b/fs/iomap/buffered-io.c +@@ -909,11 +909,11 @@ static size_t iomap_write_end(struct iom + static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) + { + loff_t length = iomap_length(iter); +- size_t chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER; + loff_t pos = iter->pos; + ssize_t written = 0; + long status = 0; + struct address_space *mapping = iter->inode->i_mapping; ++ size_t chunk = mapping_max_folio_size(mapping); + unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0; + + do { diff --git a/queue-6.9/series b/queue-6.9/series index 86905c30796..f038bce8c9a 100644 --- a/queue-6.9/series +++ b/queue-6.9/series @@ -71,3 +71,8 @@ crypto-ecrdsa-fix-module-auto-load-on-add_key.patch crypto-qat-fix-adf_dev_reset_sync-memory-leak.patch kbuild-remove-support-for-clang-s-thinlto-caching.patch mm-fix-race-between-__split_huge_pmd_locked-and-gup-fast.patch +io_uring-napi-fix-timeout-calculation.patch +io_uring-check-for-non-null-file-pointer-in-io_file_can_poll.patch +filemap-add-helper-mapping_max_folio_size.patch +iomap-fault-in-smaller-chunks-for-non-large-folio-mappings.patch +acpi-apei-einj-fix-einj_dev-release-leak.patch -- 2.47.3