From 8bd23cdfc46979ed3c4d298e7a6063c74ce08e8c Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Tue, 5 Nov 2024 20:54:05 -0500 Subject: [PATCH] Fixes for 6.1 Signed-off-by: Sasha Levin --- ...-fix-headset-mic-on-tuxedo-stellaris.patch | 36 ++ ...-limit-internal-mic-boost-on-dell-pl.patch | 97 +++++ ...anity-checks-in-blk_rq_map_user_bvec.patch | 59 +++ ...-dedicated-workqueue-for-cgroup-bpf-.patch | 154 +++++++ ...xl-acpi-move-rescan-to-the-workqueue.patch | 114 ++++++ ...cxl_bus_rescan-vs-bus_rescan_devices.patch | 65 +++ ...eate-kiocb_-start-end-_write-helpers.patch | 77 ++++ ...-rename-kiocb_end_write-local-helper.patch | 75 ++++ ...missing-nowait-check-for-o_direct-st.patch | 121 ++++++ ...-use-kiocb_-start-end-_write-helpers.patch | 69 ++++ .../kasan-remove-vmalloc_percpu-test.patch | 87 ++++ .../mctp-i2c-handle-null-header-address.patch | 44 ++ ...-convert-migrate_pages-to-use-folios.patch | 380 ++++++++++++++++++ ...convert-unmap_and_move-to-use-folios.patch | 158 ++++++++ ...ganize-stats-with-struct-migrate_pag.patch | 261 ++++++++++++ ...strict-number-of-pages-to-migrate-in.patch | 364 +++++++++++++++++ ...es-separate-hugetlb-folios-migration.patch | 253 ++++++++++++ ...lit-unmap_and_move-to-_unmap-and-_mo.patch | 310 ++++++++++++++ ...gain-if-thp-split-is-failed-due-to-p.patch | 118 ++++++ ...grate.c-stop-using-0-as-null-pointer.patch | 39 ++ ...plicitly-define-how-__gfp_high-non-b.patch | 151 +++++++ ...plicitly-define-what-alloc-flags-dep.patch | 113 ++++++ ...plicitly-record-high-order-atomic-al.patch | 124 ++++++ ...t-gfp_atomic-order-0-allocs-access-h.patch | 88 ++++ ...name-alloc_high-to-alloc_min_reserve.patch | 113 ++++++ ...treat-rt-tasks-similar-to-__gfp_high.patch | 55 +++ ...n-dh_key-to-null-after-kfree_sensiti.patch | 41 ++ ...o-ocfs2_truncate_inline-maybe-overfl.patch | 60 +++ ...set-nx-compat-flag-in-pe-coff-header.patch | 48 +++ .../riscv-remove-duplicated-get_rm.patch | 38 ++ ...remove-unused-generating_asm_offsets.patch | 44 ++ ...cv-use-u-to-format-the-output-of-cpu.patch | 43 ++ ...nt-the-compiler-from-inserting-calls.patch | 40 ++ queue-6.1/series | 34 ++ ...ix-page-count-imbalance-on-node-stat.patch | 75 ++++ 35 files changed, 3948 insertions(+) create mode 100644 queue-6.1/alsa-hda-realtek-fix-headset-mic-on-tuxedo-stellaris.patch create mode 100644 queue-6.1/alsa-hda-realtek-limit-internal-mic-boost-on-dell-pl.patch create mode 100644 queue-6.1/block-fix-sanity-checks-in-blk_rq_map_user_bvec.patch create mode 100644 queue-6.1/cgroup-bpf-use-a-dedicated-workqueue-for-cgroup-bpf-.patch create mode 100644 queue-6.1/cxl-acpi-move-rescan-to-the-workqueue.patch create mode 100644 queue-6.1/cxl-port-fix-cxl_bus_rescan-vs-bus_rescan_devices.patch create mode 100644 queue-6.1/fs-create-kiocb_-start-end-_write-helpers.patch create mode 100644 queue-6.1/io_uring-rename-kiocb_end_write-local-helper.patch create mode 100644 queue-6.1/io_uring-rw-fix-missing-nowait-check-for-o_direct-st.patch create mode 100644 queue-6.1/io_uring-use-kiocb_-start-end-_write-helpers.patch create mode 100644 queue-6.1/kasan-remove-vmalloc_percpu-test.patch create mode 100644 queue-6.1/mctp-i2c-handle-null-header-address.patch create mode 100644 queue-6.1/migrate-convert-migrate_pages-to-use-folios.patch create mode 100644 queue-6.1/migrate-convert-unmap_and_move-to-use-folios.patch create mode 100644 queue-6.1/migrate_pages-organize-stats-with-struct-migrate_pag.patch create mode 100644 queue-6.1/migrate_pages-restrict-number-of-pages-to-migrate-in.patch create mode 100644 
queue-6.1/migrate_pages-separate-hugetlb-folios-migration.patch create mode 100644 queue-6.1/migrate_pages-split-unmap_and_move-to-_unmap-and-_mo.patch create mode 100644 queue-6.1/mm-migrate-try-again-if-thp-split-is-failed-due-to-p.patch create mode 100644 queue-6.1/mm-migrate.c-stop-using-0-as-null-pointer.patch create mode 100644 queue-6.1/mm-page_alloc-explicitly-define-how-__gfp_high-non-b.patch create mode 100644 queue-6.1/mm-page_alloc-explicitly-define-what-alloc-flags-dep.patch create mode 100644 queue-6.1/mm-page_alloc-explicitly-record-high-order-atomic-al.patch create mode 100644 queue-6.1/mm-page_alloc-let-gfp_atomic-order-0-allocs-access-h.patch create mode 100644 queue-6.1/mm-page_alloc-rename-alloc_high-to-alloc_min_reserve.patch create mode 100644 queue-6.1/mm-page_alloc-treat-rt-tasks-similar-to-__gfp_high.patch create mode 100644 queue-6.1/nvmet-auth-assign-dh_key-to-null-after-kfree_sensiti.patch create mode 100644 queue-6.1/ocfs2-pass-u64-to-ocfs2_truncate_inline-maybe-overfl.patch create mode 100644 queue-6.1/riscv-efi-set-nx-compat-flag-in-pe-coff-header.patch create mode 100644 queue-6.1/riscv-remove-duplicated-get_rm.patch create mode 100644 queue-6.1/riscv-remove-unused-generating_asm_offsets.patch create mode 100644 queue-6.1/riscv-use-u-to-format-the-output-of-cpu.patch create mode 100644 queue-6.1/riscv-vdso-prevent-the-compiler-from-inserting-calls.patch create mode 100644 queue-6.1/vmscan-migrate-fix-page-count-imbalance-on-node-stat.patch diff --git a/queue-6.1/alsa-hda-realtek-fix-headset-mic-on-tuxedo-stellaris.patch b/queue-6.1/alsa-hda-realtek-fix-headset-mic-on-tuxedo-stellaris.patch new file mode 100644 index 00000000000..b43584ed64b --- /dev/null +++ b/queue-6.1/alsa-hda-realtek-fix-headset-mic-on-tuxedo-stellaris.patch @@ -0,0 +1,36 @@ +From 0e01a20897e4404df3b1eaa37e3b37f829ab4363 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 29 Oct 2024 16:16:53 +0100 +Subject: ALSA: hda/realtek: Fix headset mic on TUXEDO Stellaris 16 Gen6 mb1 + +From: Christoffer Sandberg + +[ Upstream commit e49370d769e71456db3fbd982e95bab8c69f73e8 ] + +Quirk is needed to enable headset microphone on missing pin 0x19. 
+ +Signed-off-by: Christoffer Sandberg +Signed-off-by: Werner Sembach +Cc: +Link: https://patch.msgid.link/20241029151653.80726-2-wse@tuxedocomputers.com +Signed-off-by: Takashi Iwai +Signed-off-by: Sasha Levin +--- + sound/pci/hda/patch_realtek.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c +index 3cbd9cf80be96..d750c6e6eb984 100644 +--- a/sound/pci/hda/patch_realtek.c ++++ b/sound/pci/hda/patch_realtek.c +@@ -10214,6 +10214,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { + SND_PCI_QUIRK(0x1d05, 0x115c, "TongFang GMxTGxx", ALC269_FIXUP_NO_SHUTUP), + SND_PCI_QUIRK(0x1d05, 0x121b, "TongFang GMxAGxx", ALC269_FIXUP_NO_SHUTUP), + SND_PCI_QUIRK(0x1d05, 0x1387, "TongFang GMxIXxx", ALC2XX_FIXUP_HEADSET_MIC), ++ SND_PCI_QUIRK(0x1d05, 0x1409, "TongFang GMxIXxx", ALC2XX_FIXUP_HEADSET_MIC), + SND_PCI_QUIRK(0x1d17, 0x3288, "Haier Boyue G42", ALC269VC_FIXUP_ACER_VCOPPERBOX_PINS), + SND_PCI_QUIRK(0x1d72, 0x1602, "RedmiBook", ALC255_FIXUP_XIAOMI_HEADSET_MIC), + SND_PCI_QUIRK(0x1d72, 0x1701, "XiaomiNotebook Pro", ALC298_FIXUP_DELL1_MIC_NO_PRESENCE), +-- +2.43.0 + diff --git a/queue-6.1/alsa-hda-realtek-limit-internal-mic-boost-on-dell-pl.patch b/queue-6.1/alsa-hda-realtek-limit-internal-mic-boost-on-dell-pl.patch new file mode 100644 index 00000000000..214d60f302f --- /dev/null +++ b/queue-6.1/alsa-hda-realtek-limit-internal-mic-boost-on-dell-pl.patch @@ -0,0 +1,97 @@ +From cdd6f79946b70304f691527e0efecb41c8c114d3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 18 Oct 2024 13:53:24 +0800 +Subject: ALSA: hda/realtek: Limit internal Mic boost on Dell platform + +From: Kailang Yang + +[ Upstream commit 78e7be018784934081afec77f96d49a2483f9188 ] + +Dell want to limit internal Mic boost on all Dell platform. 
+ +Signed-off-by: Kailang Yang +Cc: +Link: https://lore.kernel.org/561fc5f5eff04b6cbd79ed173cd1c1db@realtek.com +Signed-off-by: Takashi Iwai +Signed-off-by: Sasha Levin +--- + sound/pci/hda/patch_realtek.c | 21 ++++++++++++++++++--- + 1 file changed, 18 insertions(+), 3 deletions(-) + +diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c +index a8bc95ffa41a3..3cbd9cf80be96 100644 +--- a/sound/pci/hda/patch_realtek.c ++++ b/sound/pci/hda/patch_realtek.c +@@ -7159,6 +7159,7 @@ enum { + ALC286_FIXUP_SONY_MIC_NO_PRESENCE, + ALC269_FIXUP_PINCFG_NO_HP_TO_LINEOUT, + ALC269_FIXUP_DELL1_MIC_NO_PRESENCE, ++ ALC269_FIXUP_DELL1_LIMIT_INT_MIC_BOOST, + ALC269_FIXUP_DELL2_MIC_NO_PRESENCE, + ALC269_FIXUP_DELL3_MIC_NO_PRESENCE, + ALC269_FIXUP_DELL4_MIC_NO_PRESENCE, +@@ -7193,6 +7194,7 @@ enum { + ALC255_FIXUP_ACER_MIC_NO_PRESENCE, + ALC255_FIXUP_ASUS_MIC_NO_PRESENCE, + ALC255_FIXUP_DELL1_MIC_NO_PRESENCE, ++ ALC255_FIXUP_DELL1_LIMIT_INT_MIC_BOOST, + ALC255_FIXUP_DELL2_MIC_NO_PRESENCE, + ALC255_FIXUP_HEADSET_MODE, + ALC255_FIXUP_HEADSET_MODE_NO_HP_MIC, +@@ -7658,6 +7660,12 @@ static const struct hda_fixup alc269_fixups[] = { + .chained = true, + .chain_id = ALC269_FIXUP_HEADSET_MODE + }, ++ [ALC269_FIXUP_DELL1_LIMIT_INT_MIC_BOOST] = { ++ .type = HDA_FIXUP_FUNC, ++ .v.func = alc269_fixup_limit_int_mic_boost, ++ .chained = true, ++ .chain_id = ALC269_FIXUP_DELL1_MIC_NO_PRESENCE ++ }, + [ALC269_FIXUP_DELL2_MIC_NO_PRESENCE] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { +@@ -7938,6 +7946,12 @@ static const struct hda_fixup alc269_fixups[] = { + .chained = true, + .chain_id = ALC255_FIXUP_HEADSET_MODE + }, ++ [ALC255_FIXUP_DELL1_LIMIT_INT_MIC_BOOST] = { ++ .type = HDA_FIXUP_FUNC, ++ .v.func = alc269_fixup_limit_int_mic_boost, ++ .chained = true, ++ .chain_id = ALC255_FIXUP_DELL1_MIC_NO_PRESENCE ++ }, + [ALC255_FIXUP_DELL2_MIC_NO_PRESENCE] = { + .type = HDA_FIXUP_PINS, + .v.pins = (const struct hda_pintbl[]) { +@@ -10294,6 +10308,7 @@ static const struct hda_model_fixup alc269_fixup_models[] = { + {.id = ALC269_FIXUP_DELL2_MIC_NO_PRESENCE, .name = "dell-headset-dock"}, + {.id = ALC269_FIXUP_DELL3_MIC_NO_PRESENCE, .name = "dell-headset3"}, + {.id = ALC269_FIXUP_DELL4_MIC_NO_PRESENCE, .name = "dell-headset4"}, ++ {.id = ALC269_FIXUP_DELL4_MIC_NO_PRESENCE_QUIET, .name = "dell-headset4-quiet"}, + {.id = ALC283_FIXUP_CHROME_BOOK, .name = "alc283-dac-wcaps"}, + {.id = ALC283_FIXUP_SENSE_COMBO_JACK, .name = "alc283-sense-combo"}, + {.id = ALC292_FIXUP_TPT440_DOCK, .name = "tpt440-dock"}, +@@ -10841,16 +10856,16 @@ static const struct snd_hda_pin_quirk alc269_fallback_pin_fixup_tbl[] = { + SND_HDA_PIN_QUIRK(0x10ec0289, 0x1028, "Dell", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE, + {0x19, 0x40000000}, + {0x1b, 0x40000000}), +- SND_HDA_PIN_QUIRK(0x10ec0295, 0x1028, "Dell", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE, ++ SND_HDA_PIN_QUIRK(0x10ec0295, 0x1028, "Dell", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE_QUIET, + {0x19, 0x40000000}, + {0x1b, 0x40000000}), + SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE, + {0x19, 0x40000000}, + {0x1a, 0x40000000}), +- SND_HDA_PIN_QUIRK(0x10ec0236, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE, ++ SND_HDA_PIN_QUIRK(0x10ec0236, 0x1028, "Dell", ALC255_FIXUP_DELL1_LIMIT_INT_MIC_BOOST, + {0x19, 0x40000000}, + {0x1a, 0x40000000}), +- SND_HDA_PIN_QUIRK(0x10ec0274, 0x1028, "Dell", ALC274_FIXUP_DELL_AIO_LINEOUT_VERB, ++ SND_HDA_PIN_QUIRK(0x10ec0274, 0x1028, "Dell", ALC269_FIXUP_DELL1_LIMIT_INT_MIC_BOOST, + {0x19, 0x40000000}, + {0x1a, 
0x40000000}), + SND_HDA_PIN_QUIRK(0x10ec0256, 0x1043, "ASUS", ALC2XX_FIXUP_HEADSET_MIC, +-- +2.43.0 + diff --git a/queue-6.1/block-fix-sanity-checks-in-blk_rq_map_user_bvec.patch b/queue-6.1/block-fix-sanity-checks-in-blk_rq_map_user_bvec.patch new file mode 100644 index 00000000000..03e2ff64cb1 --- /dev/null +++ b/queue-6.1/block-fix-sanity-checks-in-blk_rq_map_user_bvec.patch @@ -0,0 +1,59 @@ +From 9a988c5c336b6dfb5c813c357a36faaacac25c88 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 23 Oct 2024 15:15:19 -0600 +Subject: block: fix sanity checks in blk_rq_map_user_bvec + +From: Xinyu Zhang + +[ Upstream commit 2ff949441802a8d076d9013c7761f63e8ae5a9bd ] + +blk_rq_map_user_bvec contains a check bytes + bv->bv_len > nr_iter which +causes unnecessary failures in NVMe passthrough I/O, reproducible as +follows: + +- register a 2 page, page-aligned buffer against a ring +- use that buffer to do a 1 page io_uring NVMe passthrough read + +The second (i = 1) iteration of the loop in blk_rq_map_user_bvec will +then have nr_iter == 1 page, bytes == 1 page, bv->bv_len == 1 page, so +the check bytes + bv->bv_len > nr_iter will succeed, causing the I/O to +fail. This failure is unnecessary, as when the check succeeds, it means +we've checked the entire buffer that will be used by the request - i.e. +blk_rq_map_user_bvec should complete successfully. Therefore, terminate +the loop early and return successfully when the check bytes + bv->bv_len +> nr_iter succeeds. + +While we're at it, also remove the check that all segments in the bvec +are single-page. While this seems to be true for all users of the +function, it doesn't appear to be required anywhere downstream. + +CC: stable@vger.kernel.org +Signed-off-by: Xinyu Zhang +Co-developed-by: Uday Shankar +Signed-off-by: Uday Shankar +Fixes: 37987547932c ("block: extend functionality to map bvec iterator") +Link: https://lore.kernel.org/r/20241023211519.4177873-1-ushankar@purestorage.com +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + block/blk-map.c | 4 +--- + 1 file changed, 1 insertion(+), 3 deletions(-) + +diff --git a/block/blk-map.c b/block/blk-map.c +index b337ae347bfa3..a2fa387560375 100644 +--- a/block/blk-map.c ++++ b/block/blk-map.c +@@ -597,9 +597,7 @@ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter) + if (nsegs >= nr_segs || bytes > UINT_MAX - bv->bv_len) + goto put_bio; + if (bytes + bv->bv_len > nr_iter) +- goto put_bio; +- if (bv->bv_offset + bv->bv_len > PAGE_SIZE) +- goto put_bio; ++ break; + + nsegs++; + bytes += bv->bv_len; +-- +2.43.0 + diff --git a/queue-6.1/cgroup-bpf-use-a-dedicated-workqueue-for-cgroup-bpf-.patch b/queue-6.1/cgroup-bpf-use-a-dedicated-workqueue-for-cgroup-bpf-.patch new file mode 100644 index 00000000000..09733cae91c --- /dev/null +++ b/queue-6.1/cgroup-bpf-use-a-dedicated-workqueue-for-cgroup-bpf-.patch @@ -0,0 +1,154 @@ +From 0c340c704aa5935085c6ae2de631adada19df11e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 8 Oct 2024 11:24:56 +0000 +Subject: cgroup/bpf: use a dedicated workqueue for cgroup bpf destruction + +From: Chen Ridong + +[ Upstream commit 117932eea99b729ee5d12783601a4f7f5fd58a23 ] + +A hung_task problem shown below was found: + +INFO: task kworker/0:0:8 blocked for more than 327 seconds. +"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. +Workqueue: events cgroup_bpf_release +Call Trace: + + __schedule+0x5a2/0x2050 + ? find_held_lock+0x33/0x100 + ? 
wq_worker_sleeping+0x9e/0xe0 + schedule+0x9f/0x180 + schedule_preempt_disabled+0x25/0x50 + __mutex_lock+0x512/0x740 + ? cgroup_bpf_release+0x1e/0x4d0 + ? cgroup_bpf_release+0xcf/0x4d0 + ? process_scheduled_works+0x161/0x8a0 + ? cgroup_bpf_release+0x1e/0x4d0 + ? mutex_lock_nested+0x2b/0x40 + ? __pfx_delay_tsc+0x10/0x10 + mutex_lock_nested+0x2b/0x40 + cgroup_bpf_release+0xcf/0x4d0 + ? process_scheduled_works+0x161/0x8a0 + ? trace_event_raw_event_workqueue_execute_start+0x64/0xd0 + ? process_scheduled_works+0x161/0x8a0 + process_scheduled_works+0x23a/0x8a0 + worker_thread+0x231/0x5b0 + ? __pfx_worker_thread+0x10/0x10 + kthread+0x14d/0x1c0 + ? __pfx_kthread+0x10/0x10 + ret_from_fork+0x59/0x70 + ? __pfx_kthread+0x10/0x10 + ret_from_fork_asm+0x1b/0x30 + + +This issue can be reproduced by the following pressure test: +1. A large number of cpuset cgroups are deleted. +2. Set cpu on and off repeatedly. +3. Set watchdog_thresh repeatedly. +The scripts can be obtained at LINK mentioned above the signature. + +The reason for this issue is that cgroup_mutex and cpu_hotplug_lock are +acquired in different tasks, which may lead to deadlock. +It can lead to a deadlock through the following steps: +1. A large number of cpusets are deleted asynchronously, which puts a + large number of cgroup_bpf_release works into system_wq. The max_active + of system_wq is WQ_DFL_ACTIVE(256). Consequently, all active works are + cgroup_bpf_release works, and many cgroup_bpf_release works will be put + into inactive queue. As illustrated in the diagram, there are 256 (in + the active queue) + n (in the inactive queue) works. +2. Setting watchdog_thresh will hold cpu_hotplug_lock.read and put + smp_call_on_cpu work into system_wq. However step 1 has already filled + system_wq, 'sscs.work' is put into inactive queue. 'sscs.work' has + to wait until the works that were put into the inactive queue earlier + have executed (n cgroup_bpf_release), so it will be blocked for a while. +3. Cpu offline requires cpu_hotplug_lock.write, which is blocked by step 2. +4. Cpusets that were deleted at step 1 put cgroup_release works into + cgroup_destroy_wq. They are competing to get cgroup_mutex all the time. + When cgroup_mutex is acquired by work at css_killed_work_fn, it will + call cpuset_css_offline, which needs to acquire cpu_hotplug_lock.read. + However, cpuset_css_offline will be blocked for step 3. +5. At this moment, there are 256 works in active queue that are + cgroup_bpf_release, they are attempting to acquire cgroup_mutex, and as + a result, all of them are blocked. Consequently, sscs.work can not be + executed. Ultimately, this situation leads to four processes being + blocked, forming a deadlock. + +system_wq(step1) WatchDog(step2) cpu offline(step3) cgroup_destroy_wq(step4) +... +2000+ cgroups deleted asyn +256 actives + n inactives + __lockup_detector_reconfigure + P(cpu_hotplug_lock.read) + put sscs.work into system_wq +256 + n + 1(sscs.work) +sscs.work wait to be executed + waiting sscs.work finish + percpu_down_write + P(cpu_hotplug_lock.write) + ...blocking... + css_killed_work_fn + P(cgroup_mutex) + cpuset_css_offline + P(cpu_hotplug_lock.read) + ...blocking... +256 cgroup_bpf_release +mutex_lock(&cgroup_mutex); +..blocking... + +To fix the problem, place cgroup_bpf_release works on a dedicated +workqueue which can break the loop and solve the problem. System wqs are +for misc things which shouldn't create a large number of concurrent work +items. 
If something is going to generate >WQ_DFL_ACTIVE(256) concurrent +work items, it should use its own dedicated workqueue. + +Fixes: 4bfc0bb2c60e ("bpf: decouple the lifetime of cgroup_bpf from cgroup itself") +Cc: stable@vger.kernel.org # v5.3+ +Link: https://lore.kernel.org/cgroups/e90c32d2-2a85-4f28-9154-09c7d320cb60@huawei.com/T/#t +Tested-by: Vishal Chourasia +Signed-off-by: Chen Ridong +Signed-off-by: Tejun Heo +Signed-off-by: Sasha Levin +--- + kernel/bpf/cgroup.c | 19 ++++++++++++++++++- + 1 file changed, 18 insertions(+), 1 deletion(-) + +diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c +index bb70f400c25eb..2cb04e0e118d9 100644 +--- a/kernel/bpf/cgroup.c ++++ b/kernel/bpf/cgroup.c +@@ -24,6 +24,23 @@ + DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE); + EXPORT_SYMBOL(cgroup_bpf_enabled_key); + ++/* ++ * cgroup bpf destruction makes heavy use of work items and there can be a lot ++ * of concurrent destructions. Use a separate workqueue so that cgroup bpf ++ * destruction work items don't end up filling up max_active of system_wq ++ * which may lead to deadlock. ++ */ ++static struct workqueue_struct *cgroup_bpf_destroy_wq; ++ ++static int __init cgroup_bpf_wq_init(void) ++{ ++ cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 0, 1); ++ if (!cgroup_bpf_destroy_wq) ++ panic("Failed to alloc workqueue for cgroup bpf destroy.\n"); ++ return 0; ++} ++core_initcall(cgroup_bpf_wq_init); ++ + /* __always_inline is necessary to prevent indirect call through run_prog + * function pointer. + */ +@@ -334,7 +351,7 @@ static void cgroup_bpf_release_fn(struct percpu_ref *ref) + struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt); + + INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release); +- queue_work(system_wq, &cgrp->bpf.release_work); ++ queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work); + } + + /* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through +-- +2.43.0 + diff --git a/queue-6.1/cxl-acpi-move-rescan-to-the-workqueue.patch b/queue-6.1/cxl-acpi-move-rescan-to-the-workqueue.patch new file mode 100644 index 00000000000..1745fde24c9 --- /dev/null +++ b/queue-6.1/cxl-acpi-move-rescan-to-the-workqueue.patch @@ -0,0 +1,114 @@ +From 2c6aa71b070247d3da2b5ed5820e41eb07a2cf17 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 1 Dec 2022 13:33:48 -0800 +Subject: cxl/acpi: Move rescan to the workqueue + +From: Dan Williams + +[ Upstream commit 4029c32fb601d505dfb92bdf0db9fdcc41fe1434 ] + +Now that the cxl_mem driver has a need to take the root device lock, the +cxl_bus_rescan() needs to run outside of the root lock context. That +need arises from RCH topologies and the locking that the cxl_mem driver +does to attach a descendant to an upstream port. In the RCH case the +lock needed is the CXL root device lock [1]. 
+ +Link: http://lore.kernel.org/r/166993045621.1882361.1730100141527044744.stgit@dwillia2-xfh.jf.intel.com [1] +Tested-by: Robert Richter +Link: http://lore.kernel.org/r/166993042884.1882361.5633723613683058881.stgit@dwillia2-xfh.jf.intel.com +Reviewed-by: Jonathan Cameron +Signed-off-by: Dan Williams +Stable-dep-of: 3d6ebf16438d ("cxl/port: Fix cxl_bus_rescan() vs bus_rescan_devices()") +Signed-off-by: Sasha Levin +--- + drivers/cxl/acpi.c | 17 +++++++++++++++-- + drivers/cxl/core/port.c | 19 +++++++++++++++++-- + drivers/cxl/cxl.h | 3 ++- + 3 files changed, 34 insertions(+), 5 deletions(-) + +diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c +index dd610556a3afa..d7d789211c173 100644 +--- a/drivers/cxl/acpi.c ++++ b/drivers/cxl/acpi.c +@@ -509,7 +509,8 @@ static int cxl_acpi_probe(struct platform_device *pdev) + return rc; + + /* In case PCI is scanned before ACPI re-trigger memdev attach */ +- return cxl_bus_rescan(); ++ cxl_bus_rescan(); ++ return 0; + } + + static const struct acpi_device_id cxl_acpi_ids[] = { +@@ -533,7 +534,19 @@ static struct platform_driver cxl_acpi_driver = { + .id_table = cxl_test_ids, + }; + +-module_platform_driver(cxl_acpi_driver); ++static int __init cxl_acpi_init(void) ++{ ++ return platform_driver_register(&cxl_acpi_driver); ++} ++ ++static void __exit cxl_acpi_exit(void) ++{ ++ platform_driver_unregister(&cxl_acpi_driver); ++ cxl_bus_drain(); ++} ++ ++module_init(cxl_acpi_init); ++module_exit(cxl_acpi_exit); + MODULE_LICENSE("GPL v2"); + MODULE_IMPORT_NS(CXL); + MODULE_IMPORT_NS(ACPI); +diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c +index 1f1483a9e5252..f0875fa86c616 100644 +--- a/drivers/cxl/core/port.c ++++ b/drivers/cxl/core/port.c +@@ -1786,12 +1786,27 @@ static void cxl_bus_remove(struct device *dev) + + static struct workqueue_struct *cxl_bus_wq; + +-int cxl_bus_rescan(void) ++static void cxl_bus_rescan_queue(struct work_struct *w) + { +- return bus_rescan_devices(&cxl_bus_type); ++ int rc = bus_rescan_devices(&cxl_bus_type); ++ ++ pr_debug("CXL bus rescan result: %d\n", rc); ++} ++ ++void cxl_bus_rescan(void) ++{ ++ static DECLARE_WORK(rescan_work, cxl_bus_rescan_queue); ++ ++ queue_work(cxl_bus_wq, &rescan_work); + } + EXPORT_SYMBOL_NS_GPL(cxl_bus_rescan, CXL); + ++void cxl_bus_drain(void) ++{ ++ drain_workqueue(cxl_bus_wq); ++} ++EXPORT_SYMBOL_NS_GPL(cxl_bus_drain, CXL); ++ + bool schedule_cxl_memdev_detach(struct cxl_memdev *cxlmd) + { + return queue_work(cxl_bus_wq, &cxlmd->detach_work); +diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h +index 7750ccb7652db..827fa94cddda1 100644 +--- a/drivers/cxl/cxl.h ++++ b/drivers/cxl/cxl.h +@@ -564,7 +564,8 @@ struct cxl_port *devm_cxl_add_port(struct device *host, struct device *uport, + struct cxl_dport *parent_dport); + struct cxl_port *find_cxl_root(struct device *dev); + int devm_cxl_enumerate_ports(struct cxl_memdev *cxlmd); +-int cxl_bus_rescan(void); ++void cxl_bus_rescan(void); ++void cxl_bus_drain(void); + struct cxl_port *cxl_mem_find_port(struct cxl_memdev *cxlmd, + struct cxl_dport **dport); + bool schedule_cxl_memdev_detach(struct cxl_memdev *cxlmd); +-- +2.43.0 + diff --git a/queue-6.1/cxl-port-fix-cxl_bus_rescan-vs-bus_rescan_devices.patch b/queue-6.1/cxl-port-fix-cxl_bus_rescan-vs-bus_rescan_devices.patch new file mode 100644 index 00000000000..22e445c3066 --- /dev/null +++ b/queue-6.1/cxl-port-fix-cxl_bus_rescan-vs-bus_rescan_devices.patch @@ -0,0 +1,65 @@ +From d7f1f35cfbdd17cb6884cd3722f902922f725701 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 22 Oct 
2024 18:43:32 -0700 +Subject: cxl/port: Fix cxl_bus_rescan() vs bus_rescan_devices() + +From: Dan Williams + +[ Upstream commit 3d6ebf16438de5d712030fefbb4182b46373d677 ] + +It turns out since its original introduction, pre-2.6.12, +bus_rescan_devices() has skipped devices that might be in the process of +attaching or detaching from their driver. For CXL this behavior is +unwanted and expects that cxl_bus_rescan() is a probe barrier. + +That behavior is simple enough to achieve with bus_for_each_dev() paired +with call to device_attach(), and it is unclear why bus_rescan_devices() +took the position of lockless consumption of dev->driver which is racy. + +The "Fixes:" but no "Cc: stable" on this patch reflects that the issue +is merely by inspection since the bug that triggered the discovery of +this potential problem [1] is fixed by other means. However, a stable +backport should do no harm. + +Fixes: 8dd2bc0f8e02 ("cxl/mem: Add the cxl_mem driver") +Link: http://lore.kernel.org/20241004212504.1246-1-gourry@gourry.net [1] +Signed-off-by: Dan Williams +Tested-by: Gregory Price +Reviewed-by: Jonathan Cameron +Reviewed-by: Ira Weiny +Link: https://patch.msgid.link/172964781104.81806.4277549800082443769.stgit@dwillia2-xfh.jf.intel.com +Signed-off-by: Ira Weiny +Signed-off-by: Sasha Levin +--- + drivers/cxl/core/port.c | 13 ++++++++++--- + 1 file changed, 10 insertions(+), 3 deletions(-) + +diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c +index f0875fa86c616..20f052d3759e0 100644 +--- a/drivers/cxl/core/port.c ++++ b/drivers/cxl/core/port.c +@@ -1786,11 +1786,18 @@ static void cxl_bus_remove(struct device *dev) + + static struct workqueue_struct *cxl_bus_wq; + +-static void cxl_bus_rescan_queue(struct work_struct *w) ++static int cxl_rescan_attach(struct device *dev, void *data) + { +- int rc = bus_rescan_devices(&cxl_bus_type); ++ int rc = device_attach(dev); ++ ++ dev_vdbg(dev, "rescan: %s\n", rc ? "attach" : "detached"); + +- pr_debug("CXL bus rescan result: %d\n", rc); ++ return 0; ++} ++ ++static void cxl_bus_rescan_queue(struct work_struct *w) ++{ ++ bus_for_each_dev(&cxl_bus_type, NULL, NULL, cxl_rescan_attach); + } + + void cxl_bus_rescan(void) +-- +2.43.0 + diff --git a/queue-6.1/fs-create-kiocb_-start-end-_write-helpers.patch b/queue-6.1/fs-create-kiocb_-start-end-_write-helpers.patch new file mode 100644 index 00000000000..e9fb0747f27 --- /dev/null +++ b/queue-6.1/fs-create-kiocb_-start-end-_write-helpers.patch @@ -0,0 +1,77 @@ +From fc9f40091fb160075127a025b7278dbdf985a5b4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 17 Aug 2023 17:13:33 +0300 +Subject: fs: create kiocb_{start,end}_write() helpers + +From: Amir Goldstein + +[ Upstream commit ed0360bbab72b829437b67ebb2f9cfac19f59dfe ] + +aio, io_uring, cachefiles and overlayfs, all open code an ugly variant +of file_{start,end}_write() to silence lockdep warnings. + +Create helpers for this lockdep dance so we can use the helpers in all +the callers. 
+ +Suggested-by: Jan Kara +Signed-off-by: Amir Goldstein +Reviewed-by: Jan Kara +Reviewed-by: Jens Axboe +Message-Id: <20230817141337.1025891-4-amir73il@gmail.com> +Signed-off-by: Christian Brauner +Stable-dep-of: 1d60d74e8526 ("io_uring/rw: fix missing NOWAIT check for O_DIRECT start write") +Signed-off-by: Sasha Levin +--- + include/linux/fs.h | 36 ++++++++++++++++++++++++++++++++++++ + 1 file changed, 36 insertions(+) + +diff --git a/include/linux/fs.h b/include/linux/fs.h +index 33c4961309833..0d32634c5cf0d 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -3029,6 +3029,42 @@ static inline void file_end_write(struct file *file) + __sb_end_write(file_inode(file)->i_sb, SB_FREEZE_WRITE); + } + ++/** ++ * kiocb_start_write - get write access to a superblock for async file io ++ * @iocb: the io context we want to submit the write with ++ * ++ * This is a variant of sb_start_write() for async io submission. ++ * Should be matched with a call to kiocb_end_write(). ++ */ ++static inline void kiocb_start_write(struct kiocb *iocb) ++{ ++ struct inode *inode = file_inode(iocb->ki_filp); ++ ++ sb_start_write(inode->i_sb); ++ /* ++ * Fool lockdep by telling it the lock got released so that it ++ * doesn't complain about the held lock when we return to userspace. ++ */ ++ __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE); ++} ++ ++/** ++ * kiocb_end_write - drop write access to a superblock after async file io ++ * @iocb: the io context we sumbitted the write with ++ * ++ * Should be matched with a call to kiocb_start_write(). ++ */ ++static inline void kiocb_end_write(struct kiocb *iocb) ++{ ++ struct inode *inode = file_inode(iocb->ki_filp); ++ ++ /* ++ * Tell lockdep we inherited freeze protection from submission thread. ++ */ ++ __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); ++ sb_end_write(inode->i_sb); ++} ++ + /* + * This is used for regular files where some users -- especially the + * currently executed binary in a process, previously handled via +-- +2.43.0 + diff --git a/queue-6.1/io_uring-rename-kiocb_end_write-local-helper.patch b/queue-6.1/io_uring-rename-kiocb_end_write-local-helper.patch new file mode 100644 index 00000000000..ca9e820fae7 --- /dev/null +++ b/queue-6.1/io_uring-rename-kiocb_end_write-local-helper.patch @@ -0,0 +1,75 @@ +From 85ca1f838a54f739f7d8ddcc16ace4d7f67af19f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 17 Aug 2023 17:13:31 +0300 +Subject: io_uring: rename kiocb_end_write() local helper + +From: Amir Goldstein + +[ Upstream commit a370167fe526123637965f60859a9f1f3e1a58b7 ] + +This helper does not take a kiocb as input and we want to create a +common helper by that name that takes a kiocb as input. 
+ +Signed-off-by: Amir Goldstein +Reviewed-by: Jan Kara +Reviewed-by: Jens Axboe +Message-Id: <20230817141337.1025891-2-amir73il@gmail.com> +Signed-off-by: Christian Brauner +Stable-dep-of: 1d60d74e8526 ("io_uring/rw: fix missing NOWAIT check for O_DIRECT start write") +Signed-off-by: Sasha Levin +--- + io_uring/rw.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/io_uring/rw.c b/io_uring/rw.c +index 038e6b13a7496..4eb42fc29c151 100644 +--- a/io_uring/rw.c ++++ b/io_uring/rw.c +@@ -220,7 +220,7 @@ static bool io_rw_should_reissue(struct io_kiocb *req) + } + #endif + +-static void kiocb_end_write(struct io_kiocb *req) ++static void io_req_end_write(struct io_kiocb *req) + { + /* + * Tell lockdep we inherited freeze protection from submission +@@ -243,7 +243,7 @@ static void io_req_io_end(struct io_kiocb *req) + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + + if (rw->kiocb.ki_flags & IOCB_WRITE) { +- kiocb_end_write(req); ++ io_req_end_write(req); + fsnotify_modify(req->file); + } else { + fsnotify_access(req->file); +@@ -307,7 +307,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) + struct io_kiocb *req = cmd_to_io_kiocb(rw); + + if (kiocb->ki_flags & IOCB_WRITE) +- kiocb_end_write(req); ++ io_req_end_write(req); + if (unlikely(res != req->cqe.res)) { + if (res == -EAGAIN && io_rw_should_reissue(req)) { + req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO; +@@ -956,7 +956,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) + io->bytes_done += ret2; + + if (kiocb->ki_flags & IOCB_WRITE) +- kiocb_end_write(req); ++ io_req_end_write(req); + return ret ? ret : -EAGAIN; + } + done: +@@ -967,7 +967,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) + ret = io_setup_async_rw(req, iovec, s, false); + if (!ret) { + if (kiocb->ki_flags & IOCB_WRITE) +- kiocb_end_write(req); ++ io_req_end_write(req); + return -EAGAIN; + } + return ret; +-- +2.43.0 + diff --git a/queue-6.1/io_uring-rw-fix-missing-nowait-check-for-o_direct-st.patch b/queue-6.1/io_uring-rw-fix-missing-nowait-check-for-o_direct-st.patch new file mode 100644 index 00000000000..a7aabfaea41 --- /dev/null +++ b/queue-6.1/io_uring-rw-fix-missing-nowait-check-for-o_direct-st.patch @@ -0,0 +1,121 @@ +From 7cc8484de2585d9324d89c118dfda19dc847ebab Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 31 Oct 2024 08:05:44 -0600 +Subject: io_uring/rw: fix missing NOWAIT check for O_DIRECT start write + +From: Jens Axboe + +[ Upstream commit 1d60d74e852647255bd8e76f5a22dc42531e4389 ] + +When io_uring starts a write, it'll call kiocb_start_write() to bump the +super block rwsem, preventing any freezes from happening while that +write is in-flight. The freeze side will grab that rwsem for writing, +excluding any new writers from happening and waiting for existing writes +to finish. But io_uring unconditionally uses kiocb_start_write(), which +will block if someone is currently attempting to freeze the mount point. +This causes a deadlock where freeze is waiting for previous writes to +complete, but the previous writes cannot complete, as the task that is +supposed to complete them is blocked waiting on starting a new write. 
+This results in the following stuck trace showing that dependency with +the write blocked starting a new write: + +task:fio state:D stack:0 pid:886 tgid:886 ppid:876 +Call trace: + __switch_to+0x1d8/0x348 + __schedule+0x8e8/0x2248 + schedule+0x110/0x3f0 + percpu_rwsem_wait+0x1e8/0x3f8 + __percpu_down_read+0xe8/0x500 + io_write+0xbb8/0xff8 + io_issue_sqe+0x10c/0x1020 + io_submit_sqes+0x614/0x2110 + __arm64_sys_io_uring_enter+0x524/0x1038 + invoke_syscall+0x74/0x268 + el0_svc_common.constprop.0+0x160/0x238 + do_el0_svc+0x44/0x60 + el0_svc+0x44/0xb0 + el0t_64_sync_handler+0x118/0x128 + el0t_64_sync+0x168/0x170 +INFO: task fsfreeze:7364 blocked for more than 15 seconds. + Not tainted 6.12.0-rc5-00063-g76aaf945701c #7963 + +with the attempting freezer stuck trying to grab the rwsem: + +task:fsfreeze state:D stack:0 pid:7364 tgid:7364 ppid:995 +Call trace: + __switch_to+0x1d8/0x348 + __schedule+0x8e8/0x2248 + schedule+0x110/0x3f0 + percpu_down_write+0x2b0/0x680 + freeze_super+0x248/0x8a8 + do_vfs_ioctl+0x149c/0x1b18 + __arm64_sys_ioctl+0xd0/0x1a0 + invoke_syscall+0x74/0x268 + el0_svc_common.constprop.0+0x160/0x238 + do_el0_svc+0x44/0x60 + el0_svc+0x44/0xb0 + el0t_64_sync_handler+0x118/0x128 + el0t_64_sync+0x168/0x170 + +Fix this by having the io_uring side honor IOCB_NOWAIT, and only attempt a +blocking grab of the super block rwsem if it isn't set. For normal issue +where IOCB_NOWAIT would always be set, this returns -EAGAIN which will +have io_uring core issue a blocking attempt of the write. That will in +turn also get completions run, ensuring forward progress. + +Since freezing requires CAP_SYS_ADMIN in the first place, this isn't +something that can be triggered by a regular user. + +Cc: stable@vger.kernel.org # 5.10+ +Reported-by: Peter Mann +Link: https://lore.kernel.org/io-uring/38c94aec-81c9-4f62-b44e-1d87f5597644@sh.cz +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + io_uring/rw.c | 23 +++++++++++++++++++++-- + 1 file changed, 21 insertions(+), 2 deletions(-) + +diff --git a/io_uring/rw.c b/io_uring/rw.c +index c15c7873813b3..9d6e17a244ae7 100644 +--- a/io_uring/rw.c ++++ b/io_uring/rw.c +@@ -839,6 +839,25 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags) + return kiocb_done(req, ret, issue_flags); + } + ++static bool io_kiocb_start_write(struct io_kiocb *req, struct kiocb *kiocb) ++{ ++ struct inode *inode; ++ bool ret; ++ ++ if (!(req->flags & REQ_F_ISREG)) ++ return true; ++ if (!(kiocb->ki_flags & IOCB_NOWAIT)) { ++ kiocb_start_write(kiocb); ++ return true; ++ } ++ ++ inode = file_inode(kiocb->ki_filp); ++ ret = sb_start_write_trylock(inode->i_sb); ++ if (ret) ++ __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE); ++ return ret; ++} ++ + int io_write(struct io_kiocb *req, unsigned int issue_flags) + { + struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); +@@ -892,8 +911,8 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) + return ret; + } + +- if (req->flags & REQ_F_ISREG) +- kiocb_start_write(kiocb); ++ if (unlikely(!io_kiocb_start_write(req, kiocb))) ++ return -EAGAIN; + kiocb->ki_flags |= IOCB_WRITE; + + if (likely(req->file->f_op->write_iter)) +-- +2.43.0 + diff --git a/queue-6.1/io_uring-use-kiocb_-start-end-_write-helpers.patch b/queue-6.1/io_uring-use-kiocb_-start-end-_write-helpers.patch new file mode 100644 index 00000000000..61b48e7ebd9 --- /dev/null +++ b/queue-6.1/io_uring-use-kiocb_-start-end-_write-helpers.patch @@ -0,0 +1,69 @@ +From 4cf01f8e6f316d434b3882d6b4fff5666ee05971 Mon Sep 17 00:00:00 2001 +From: Sasha Levin 
+Date: Thu, 17 Aug 2023 17:13:34 +0300 +Subject: io_uring: use kiocb_{start,end}_write() helpers + +From: Amir Goldstein + +[ Upstream commit e484fd73f4bdcb00c2188100c2d84e9f3f5c9f7d ] + +Use helpers instead of the open coded dance to silence lockdep warnings. + +Suggested-by: Jan Kara +Signed-off-by: Amir Goldstein +Reviewed-by: Jan Kara +Reviewed-by: Jens Axboe +Message-Id: <20230817141337.1025891-5-amir73il@gmail.com> +Signed-off-by: Christian Brauner +Stable-dep-of: 1d60d74e8526 ("io_uring/rw: fix missing NOWAIT check for O_DIRECT start write") +Signed-off-by: Sasha Levin +--- + io_uring/rw.c | 23 ++++------------------- + 1 file changed, 4 insertions(+), 19 deletions(-) + +diff --git a/io_uring/rw.c b/io_uring/rw.c +index 4eb42fc29c151..c15c7873813b3 100644 +--- a/io_uring/rw.c ++++ b/io_uring/rw.c +@@ -222,15 +222,10 @@ static bool io_rw_should_reissue(struct io_kiocb *req) + + static void io_req_end_write(struct io_kiocb *req) + { +- /* +- * Tell lockdep we inherited freeze protection from submission +- * thread. +- */ + if (req->flags & REQ_F_ISREG) { +- struct super_block *sb = file_inode(req->file)->i_sb; ++ struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw); + +- __sb_writers_acquired(sb, SB_FREEZE_WRITE); +- sb_end_write(sb); ++ kiocb_end_write(&rw->kiocb); + } + } + +@@ -897,18 +892,8 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) + return ret; + } + +- /* +- * Open-code file_start_write here to grab freeze protection, +- * which will be released by another thread in +- * io_complete_rw(). Fool lockdep by telling it the lock got +- * released so that it doesn't complain about the held lock when +- * we return to userspace. +- */ +- if (req->flags & REQ_F_ISREG) { +- sb_start_write(file_inode(req->file)->i_sb); +- __sb_writers_release(file_inode(req->file)->i_sb, +- SB_FREEZE_WRITE); +- } ++ if (req->flags & REQ_F_ISREG) ++ kiocb_start_write(kiocb); + kiocb->ki_flags |= IOCB_WRITE; + + if (likely(req->file->f_op->write_iter)) +-- +2.43.0 + diff --git a/queue-6.1/kasan-remove-vmalloc_percpu-test.patch b/queue-6.1/kasan-remove-vmalloc_percpu-test.patch new file mode 100644 index 00000000000..01d971a65bc --- /dev/null +++ b/queue-6.1/kasan-remove-vmalloc_percpu-test.patch @@ -0,0 +1,87 @@ +From 4ed3283db4accb8de6dae5d596250f67d6afd12b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 22 Oct 2024 18:07:06 +0200 +Subject: kasan: remove vmalloc_percpu test + +From: Andrey Konovalov + +[ Upstream commit 330d8df81f3673d6fb74550bbc9bb159d81b35f7 ] + +Commit 1a2473f0cbc0 ("kasan: improve vmalloc tests") added the +vmalloc_percpu KASAN test with the assumption that __alloc_percpu always +uses vmalloc internally, which is tagged by KASAN. + +However, __alloc_percpu might allocate memory from the first per-CPU +chunk, which is not allocated via vmalloc(). As a result, the test might +fail. + +Remove the test until proper KASAN annotation for the per-CPU allocated +are added; tracked in https://bugzilla.kernel.org/show_bug.cgi?id=215019. 
+ +Link: https://lkml.kernel.org/r/20241022160706.38943-1-andrey.konovalov@linux.dev +Fixes: 1a2473f0cbc0 ("kasan: improve vmalloc tests") +Signed-off-by: Andrey Konovalov +Reported-by: Samuel Holland +Link: https://lore.kernel.org/all/4a245fff-cc46-44d1-a5f9-fd2f1c3764ae@sifive.com/ +Reported-by: Sabyrzhan Tasbolatov +Link: https://lore.kernel.org/all/CACzwLxiWzNqPBp4C1VkaXZ2wDwvY3yZeetCi1TLGFipKW77drA@mail.gmail.com/ +Cc: Alexander Potapenko +Cc: Andrey Ryabinin +Cc: Dmitry Vyukov +Cc: Marco Elver +Cc: Sabyrzhan Tasbolatov +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Sasha Levin +--- + mm/kasan/kasan_test.c | 27 --------------------------- + 1 file changed, 27 deletions(-) + +diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c +index cef683a2e0d2e..df9658299a08a 100644 +--- a/mm/kasan/kasan_test.c ++++ b/mm/kasan/kasan_test.c +@@ -1260,32 +1260,6 @@ static void vm_map_ram_tags(struct kunit *test) + free_pages((unsigned long)p_ptr, 1); + } + +-static void vmalloc_percpu(struct kunit *test) +-{ +- char __percpu *ptr; +- int cpu; +- +- /* +- * This test is specifically crafted for the software tag-based mode, +- * the only tag-based mode that poisons percpu mappings. +- */ +- KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS); +- +- ptr = __alloc_percpu(PAGE_SIZE, PAGE_SIZE); +- +- for_each_possible_cpu(cpu) { +- char *c_ptr = per_cpu_ptr(ptr, cpu); +- +- KUNIT_EXPECT_GE(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_MIN); +- KUNIT_EXPECT_LT(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_KERNEL); +- +- /* Make sure that in-bounds accesses don't crash the kernel. */ +- *c_ptr = 0; +- } +- +- free_percpu(ptr); +-} +- + /* + * Check that the assigned pointer tag falls within the [KASAN_TAG_MIN, + * KASAN_TAG_KERNEL) range (note: excluding the match-all tag) for tag-based +@@ -1439,7 +1413,6 @@ static struct kunit_case kasan_kunit_test_cases[] = { + KUNIT_CASE(vmalloc_oob), + KUNIT_CASE(vmap_tags), + KUNIT_CASE(vm_map_ram_tags), +- KUNIT_CASE(vmalloc_percpu), + KUNIT_CASE(match_all_not_assigned), + KUNIT_CASE(match_all_ptr_tag), + KUNIT_CASE(match_all_mem_tag), +-- +2.43.0 + diff --git a/queue-6.1/mctp-i2c-handle-null-header-address.patch b/queue-6.1/mctp-i2c-handle-null-header-address.patch new file mode 100644 index 00000000000..d9010ad6681 --- /dev/null +++ b/queue-6.1/mctp-i2c-handle-null-header-address.patch @@ -0,0 +1,44 @@ +From f2e4472e93a1a0c03ce77a1cb2932502e907e34d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 22 Oct 2024 18:25:14 +0800 +Subject: mctp i2c: handle NULL header address + +From: Matt Johnston + +[ Upstream commit 01e215975fd80af81b5b79f009d49ddd35976c13 ] + +daddr can be NULL if there is no neighbour table entry present, +in that case the tx packet should be dropped. + +saddr will usually be set by MCTP core, but check for NULL in case a +packet is transmitted by a different protocol. 
+ +Fixes: f5b8abf9fc3d ("mctp i2c: MCTP I2C binding driver") +Cc: stable@vger.kernel.org +Reported-by: Dung Cao +Signed-off-by: Matt Johnston +Reviewed-by: Simon Horman +Link: https://patch.msgid.link/20241022-mctp-i2c-null-dest-v3-1-e929709956c5@codeconstruct.com.au +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/mctp/mctp-i2c.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/drivers/net/mctp/mctp-i2c.c b/drivers/net/mctp/mctp-i2c.c +index 1d67a3ca1fd11..7635a8b3c35cd 100644 +--- a/drivers/net/mctp/mctp-i2c.c ++++ b/drivers/net/mctp/mctp-i2c.c +@@ -547,6 +547,9 @@ static int mctp_i2c_header_create(struct sk_buff *skb, struct net_device *dev, + if (len > MCTP_I2C_MAXMTU) + return -EMSGSIZE; + ++ if (!daddr || !saddr) ++ return -EINVAL; ++ + lldst = *((u8 *)daddr); + llsrc = *((u8 *)saddr); + +-- +2.43.0 + diff --git a/queue-6.1/migrate-convert-migrate_pages-to-use-folios.patch b/queue-6.1/migrate-convert-migrate_pages-to-use-folios.patch new file mode 100644 index 00000000000..9b82ab3eb1d --- /dev/null +++ b/queue-6.1/migrate-convert-migrate_pages-to-use-folios.patch @@ -0,0 +1,380 @@ +From 3992d9e7cfbd4eec0eee3c177e5f4e11ba8c4294 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 9 Nov 2022 09:23:48 +0800 +Subject: migrate: convert migrate_pages() to use folios + +From: Huang Ying + +[ Upstream commit eaec4e639f11413ce75fbf38affd1aa5c40979e9 ] + +Quite straightforward, the page functions are converted to corresponding +folio functions. Same for comments. + +THP specific code are converted to be large folio. + +Link: https://lkml.kernel.org/r/20221109012348.93849-3-ying.huang@intel.com +Signed-off-by: "Huang, Ying" +Reviewed-by: Baolin Wang +Tested-by: Baolin Wang +Cc: Zi Yan +Cc: Yang Shi +Cc: Oscar Salvador +Cc: Matthew Wilcox +Signed-off-by: Andrew Morton +Stable-dep-of: 35e41024c4c2 ("vmscan,migrate: fix page count imbalance on node stats when demoting pages") +Signed-off-by: Sasha Levin +--- + mm/migrate.c | 210 +++++++++++++++++++++++++++------------------------ + 1 file changed, 112 insertions(+), 98 deletions(-) + +diff --git a/mm/migrate.c b/mm/migrate.c +index 16b456b927c18..562f819dc6189 100644 +--- a/mm/migrate.c ++++ b/mm/migrate.c +@@ -1385,231 +1385,245 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, + return rc; + } + +-static inline int try_split_thp(struct page *page, struct list_head *split_pages) ++static inline int try_split_folio(struct folio *folio, struct list_head *split_folios) + { + int rc; + +- lock_page(page); +- rc = split_huge_page_to_list(page, split_pages); +- unlock_page(page); ++ folio_lock(folio); ++ rc = split_folio_to_list(folio, split_folios); ++ folio_unlock(folio); + if (!rc) +- list_move_tail(&page->lru, split_pages); ++ list_move_tail(&folio->lru, split_folios); + + return rc; + } + + /* +- * migrate_pages - migrate the pages specified in a list, to the free pages ++ * migrate_pages - migrate the folios specified in a list, to the free folios + * supplied as the target for the page migration + * +- * @from: The list of pages to be migrated. +- * @get_new_page: The function used to allocate free pages to be used +- * as the target of the page migration. +- * @put_new_page: The function used to free target pages if migration ++ * @from: The list of folios to be migrated. ++ * @get_new_page: The function used to allocate free folios to be used ++ * as the target of the folio migration. 
++ * @put_new_page: The function used to free target folios if migration + * fails, or NULL if no special handling is necessary. + * @private: Private data to be passed on to get_new_page() + * @mode: The migration mode that specifies the constraints for +- * page migration, if any. +- * @reason: The reason for page migration. +- * @ret_succeeded: Set to the number of normal pages migrated successfully if ++ * folio migration, if any. ++ * @reason: The reason for folio migration. ++ * @ret_succeeded: Set to the number of folios migrated successfully if + * the caller passes a non-NULL pointer. + * +- * The function returns after 10 attempts or if no pages are movable any more +- * because the list has become empty or no retryable pages exist any more. +- * It is caller's responsibility to call putback_movable_pages() to return pages ++ * The function returns after 10 attempts or if no folios are movable any more ++ * because the list has become empty or no retryable folios exist any more. ++ * It is caller's responsibility to call putback_movable_pages() to return folios + * to the LRU or free list only if ret != 0. + * +- * Returns the number of {normal page, THP, hugetlb} that were not migrated, or +- * an error code. The number of THP splits will be considered as the number of +- * non-migrated THP, no matter how many subpages of the THP are migrated successfully. ++ * Returns the number of {normal folio, large folio, hugetlb} that were not ++ * migrated, or an error code. The number of large folio splits will be ++ * considered as the number of non-migrated large folio, no matter how many ++ * split folios of the large folio are migrated successfully. + */ + int migrate_pages(struct list_head *from, new_page_t get_new_page, + free_page_t put_new_page, unsigned long private, + enum migrate_mode mode, int reason, unsigned int *ret_succeeded) + { + int retry = 1; ++ int large_retry = 1; + int thp_retry = 1; + int nr_failed = 0; + int nr_failed_pages = 0; + int nr_retry_pages = 0; + int nr_succeeded = 0; + int nr_thp_succeeded = 0; ++ int nr_large_failed = 0; + int nr_thp_failed = 0; + int nr_thp_split = 0; + int pass = 0; ++ bool is_large = false; + bool is_thp = false; +- struct page *page; +- struct page *page2; +- int rc, nr_subpages; +- LIST_HEAD(ret_pages); +- LIST_HEAD(thp_split_pages); ++ struct folio *folio, *folio2; ++ int rc, nr_pages; ++ LIST_HEAD(ret_folios); ++ LIST_HEAD(split_folios); + bool nosplit = (reason == MR_NUMA_MISPLACED); +- bool no_subpage_counting = false; ++ bool no_split_folio_counting = false; + + trace_mm_migrate_pages_start(mode, reason); + +-thp_subpage_migration: +- for (pass = 0; pass < 10 && (retry || thp_retry); pass++) { ++split_folio_migration: ++ for (pass = 0; pass < 10 && (retry || large_retry); pass++) { + retry = 0; ++ large_retry = 0; + thp_retry = 0; + nr_retry_pages = 0; + +- list_for_each_entry_safe(page, page2, from, lru) { ++ list_for_each_entry_safe(folio, folio2, from, lru) { + /* +- * THP statistics is based on the source huge page. +- * Capture required information that might get lost +- * during migration. ++ * Large folio statistics is based on the source large ++ * folio. Capture required information that might get ++ * lost during migration. 
+ */ +- is_thp = PageTransHuge(page) && !PageHuge(page); +- nr_subpages = compound_nr(page); ++ is_large = folio_test_large(folio) && !folio_test_hugetlb(folio); ++ is_thp = is_large && folio_test_pmd_mappable(folio); ++ nr_pages = folio_nr_pages(folio); + cond_resched(); + +- if (PageHuge(page)) ++ if (folio_test_hugetlb(folio)) + rc = unmap_and_move_huge_page(get_new_page, +- put_new_page, private, page, +- pass > 2, mode, reason, +- &ret_pages); ++ put_new_page, private, ++ &folio->page, pass > 2, mode, ++ reason, ++ &ret_folios); + else + rc = unmap_and_move(get_new_page, put_new_page, +- private, page_folio(page), pass > 2, mode, +- reason, &ret_pages); ++ private, folio, pass > 2, mode, ++ reason, &ret_folios); + /* + * The rules are: +- * Success: non hugetlb page will be freed, hugetlb +- * page will be put back ++ * Success: non hugetlb folio will be freed, hugetlb ++ * folio will be put back + * -EAGAIN: stay on the from list + * -ENOMEM: stay on the from list + * -ENOSYS: stay on the from list +- * Other errno: put on ret_pages list then splice to ++ * Other errno: put on ret_folios list then splice to + * from list + */ + switch(rc) { + /* +- * THP migration might be unsupported or the +- * allocation could've failed so we should +- * retry on the same page with the THP split +- * to base pages. ++ * Large folio migration might be unsupported or ++ * the allocation could've failed so we should retry ++ * on the same folio with the large folio split ++ * to normal folios. + * +- * Sub-pages are put in thp_split_pages, and ++ * Split folios are put in split_folios, and + * we will migrate them after the rest of the + * list is processed. + */ + case -ENOSYS: +- /* THP migration is unsupported */ +- if (is_thp) { +- nr_thp_failed++; +- if (!try_split_thp(page, &thp_split_pages)) { +- nr_thp_split++; ++ /* Large folio migration is unsupported */ ++ if (is_large) { ++ nr_large_failed++; ++ nr_thp_failed += is_thp; ++ if (!try_split_folio(folio, &split_folios)) { ++ nr_thp_split += is_thp; + break; + } + /* Hugetlb migration is unsupported */ +- } else if (!no_subpage_counting) { ++ } else if (!no_split_folio_counting) { + nr_failed++; + } + +- nr_failed_pages += nr_subpages; +- list_move_tail(&page->lru, &ret_pages); ++ nr_failed_pages += nr_pages; ++ list_move_tail(&folio->lru, &ret_folios); + break; + case -ENOMEM: + /* + * When memory is low, don't bother to try to migrate +- * other pages, just exit. ++ * other folios, just exit. + */ +- if (is_thp) { +- nr_thp_failed++; +- /* THP NUMA faulting doesn't split THP to retry. */ ++ if (is_large) { ++ nr_large_failed++; ++ nr_thp_failed += is_thp; ++ /* Large folio NUMA faulting doesn't split to retry. */ + if (!nosplit) { +- int ret = try_split_thp(page, &thp_split_pages); ++ int ret = try_split_folio(folio, &split_folios); + + if (!ret) { +- nr_thp_split++; ++ nr_thp_split += is_thp; + break; + } else if (reason == MR_LONGTERM_PIN && + ret == -EAGAIN) { + /* +- * Try again to split THP to mitigate +- * the failure of longterm pinning. ++ * Try again to split large folio to ++ * mitigate the failure of longterm pinning. 
+ */ +- thp_retry++; +- nr_retry_pages += nr_subpages; ++ large_retry++; ++ thp_retry += is_thp; ++ nr_retry_pages += nr_pages; + break; + } + } +- } else if (!no_subpage_counting) { ++ } else if (!no_split_folio_counting) { + nr_failed++; + } + +- nr_failed_pages += nr_subpages + nr_retry_pages; ++ nr_failed_pages += nr_pages + nr_retry_pages; + /* +- * There might be some subpages of fail-to-migrate THPs +- * left in thp_split_pages list. Move them back to migration ++ * There might be some split folios of fail-to-migrate large ++ * folios left in split_folios list. Move them back to migration + * list so that they could be put back to the right list by +- * the caller otherwise the page refcnt will be leaked. ++ * the caller otherwise the folio refcnt will be leaked. + */ +- list_splice_init(&thp_split_pages, from); ++ list_splice_init(&split_folios, from); + /* nr_failed isn't updated for not used */ ++ nr_large_failed += large_retry; + nr_thp_failed += thp_retry; + goto out; + case -EAGAIN: +- if (is_thp) +- thp_retry++; +- else if (!no_subpage_counting) ++ if (is_large) { ++ large_retry++; ++ thp_retry += is_thp; ++ } else if (!no_split_folio_counting) { + retry++; +- nr_retry_pages += nr_subpages; ++ } ++ nr_retry_pages += nr_pages; + break; + case MIGRATEPAGE_SUCCESS: +- nr_succeeded += nr_subpages; +- if (is_thp) +- nr_thp_succeeded++; ++ nr_succeeded += nr_pages; ++ nr_thp_succeeded += is_thp; + break; + default: + /* + * Permanent failure (-EBUSY, etc.): +- * unlike -EAGAIN case, the failed page is +- * removed from migration page list and not ++ * unlike -EAGAIN case, the failed folio is ++ * removed from migration folio list and not + * retried in the next outer loop. + */ +- if (is_thp) +- nr_thp_failed++; +- else if (!no_subpage_counting) ++ if (is_large) { ++ nr_large_failed++; ++ nr_thp_failed += is_thp; ++ } else if (!no_split_folio_counting) { + nr_failed++; ++ } + +- nr_failed_pages += nr_subpages; ++ nr_failed_pages += nr_pages; + break; + } + } + } + nr_failed += retry; ++ nr_large_failed += large_retry; + nr_thp_failed += thp_retry; + nr_failed_pages += nr_retry_pages; + /* +- * Try to migrate subpages of fail-to-migrate THPs, no nr_failed +- * counting in this round, since all subpages of a THP is counted +- * as 1 failure in the first round. ++ * Try to migrate split folios of fail-to-migrate large folios, no ++ * nr_failed counting in this round, since all split folios of a ++ * large folio is counted as 1 failure in the first round. + */ +- if (!list_empty(&thp_split_pages)) { ++ if (!list_empty(&split_folios)) { + /* +- * Move non-migrated pages (after 10 retries) to ret_pages ++ * Move non-migrated folios (after 10 retries) to ret_folios + * to avoid migrating them again. + */ +- list_splice_init(from, &ret_pages); +- list_splice_init(&thp_split_pages, from); +- no_subpage_counting = true; ++ list_splice_init(from, &ret_folios); ++ list_splice_init(&split_folios, from); ++ no_split_folio_counting = true; + retry = 1; +- goto thp_subpage_migration; ++ goto split_folio_migration; + } + +- rc = nr_failed + nr_thp_failed; ++ rc = nr_failed + nr_large_failed; + out: + /* +- * Put the permanent failure page back to migration list, they ++ * Put the permanent failure folio back to migration list, they + * will be put back to the right list by the caller. + */ +- list_splice(&ret_pages, from); ++ list_splice(&ret_folios, from); + + /* +- * Return 0 in case all subpages of fail-to-migrate THPs are +- * migrated successfully. 
++ * Return 0 in case all split folios of fail-to-migrate large folios ++ * are migrated successfully. + */ + if (list_empty(from)) + rc = 0; +-- +2.43.0 + diff --git a/queue-6.1/migrate-convert-unmap_and_move-to-use-folios.patch b/queue-6.1/migrate-convert-unmap_and_move-to-use-folios.patch new file mode 100644 index 00000000000..f4a07cdc810 --- /dev/null +++ b/queue-6.1/migrate-convert-unmap_and_move-to-use-folios.patch @@ -0,0 +1,158 @@ +From 989ae777a6d2f1f50e59c2afe0e78e6d29cc0dde Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 9 Nov 2022 09:23:47 +0800 +Subject: migrate: convert unmap_and_move() to use folios + +From: Huang Ying + +[ Upstream commit 49f51859221a3dfee27488eaeaff800459cac6a9 ] + +Patch series "migrate: convert migrate_pages()/unmap_and_move() to use +folios", v2. + +The conversion is quite straightforward, just replace the page API to the +corresponding folio API. migrate_pages() and unmap_and_move() mostly work +with folios (head pages) only. + +This patch (of 2): + +Quite straightforward, the page functions are converted to corresponding +folio functions. Same for comments. + +Link: https://lkml.kernel.org/r/20221109012348.93849-1-ying.huang@intel.com +Link: https://lkml.kernel.org/r/20221109012348.93849-2-ying.huang@intel.com +Signed-off-by: "Huang, Ying" +Reviewed-by: Yang Shi +Reviewed-by: Zi Yan +Reviewed-by: Matthew Wilcox (Oracle) +Reviewed-by: Baolin Wang +Cc: Oscar Salvador +Signed-off-by: Andrew Morton +Stable-dep-of: 35e41024c4c2 ("vmscan,migrate: fix page count imbalance on node stats when demoting pages") +Signed-off-by: Sasha Levin +--- + mm/migrate.c | 54 ++++++++++++++++++++++++++-------------------------- + 1 file changed, 27 insertions(+), 27 deletions(-) + +diff --git a/mm/migrate.c b/mm/migrate.c +index b0caa89e67d5f..16b456b927c18 100644 +--- a/mm/migrate.c ++++ b/mm/migrate.c +@@ -1162,79 +1162,79 @@ static int __unmap_and_move(struct folio *src, struct folio *dst, + } + + /* +- * Obtain the lock on page, remove all ptes and migrate the page +- * to the newly allocated page in newpage. ++ * Obtain the lock on folio, remove all ptes and migrate the folio ++ * to the newly allocated folio in dst. + */ + static int unmap_and_move(new_page_t get_new_page, + free_page_t put_new_page, +- unsigned long private, struct page *page, ++ unsigned long private, struct folio *src, + int force, enum migrate_mode mode, + enum migrate_reason reason, + struct list_head *ret) + { +- struct folio *dst, *src = page_folio(page); ++ struct folio *dst; + int rc = MIGRATEPAGE_SUCCESS; + struct page *newpage = NULL; + +- if (!thp_migration_supported() && PageTransHuge(page)) ++ if (!thp_migration_supported() && folio_test_transhuge(src)) + return -ENOSYS; + +- if (page_count(page) == 1) { +- /* Page was freed from under us. So we are done. */ +- ClearPageActive(page); +- ClearPageUnevictable(page); ++ if (folio_ref_count(src) == 1) { ++ /* Folio was freed from under us. So we are done. */ ++ folio_clear_active(src); ++ folio_clear_unevictable(src); + /* free_pages_prepare() will clear PG_isolated. 
*/ + goto out; + } + +- newpage = get_new_page(page, private); ++ newpage = get_new_page(&src->page, private); + if (!newpage) + return -ENOMEM; + dst = page_folio(newpage); + +- newpage->private = 0; ++ dst->private = 0; + rc = __unmap_and_move(src, dst, force, mode); + if (rc == MIGRATEPAGE_SUCCESS) +- set_page_owner_migrate_reason(newpage, reason); ++ set_page_owner_migrate_reason(&dst->page, reason); + + out: + if (rc != -EAGAIN) { + /* +- * A page that has been migrated has all references +- * removed and will be freed. A page that has not been ++ * A folio that has been migrated has all references ++ * removed and will be freed. A folio that has not been + * migrated will have kept its references and be restored. + */ +- list_del(&page->lru); ++ list_del(&src->lru); + } + + /* + * If migration is successful, releases reference grabbed during +- * isolation. Otherwise, restore the page to right list unless ++ * isolation. Otherwise, restore the folio to right list unless + * we want to retry. + */ + if (rc == MIGRATEPAGE_SUCCESS) { + /* +- * Compaction can migrate also non-LRU pages which are ++ * Compaction can migrate also non-LRU folios which are + * not accounted to NR_ISOLATED_*. They can be recognized +- * as __PageMovable ++ * as __folio_test_movable + */ +- if (likely(!__PageMovable(page))) +- mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + +- page_is_file_lru(page), -thp_nr_pages(page)); ++ if (likely(!__folio_test_movable(src))) ++ mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON + ++ folio_is_file_lru(src), -folio_nr_pages(src)); + + if (reason != MR_MEMORY_FAILURE) + /* +- * We release the page in page_handle_poison. ++ * We release the folio in page_handle_poison. + */ +- put_page(page); ++ folio_put(src); + } else { + if (rc != -EAGAIN) +- list_add_tail(&page->lru, ret); ++ list_add_tail(&src->lru, ret); + + if (put_new_page) +- put_new_page(newpage, private); ++ put_new_page(&dst->page, private); + else +- put_page(newpage); ++ folio_put(dst); + } + + return rc; +@@ -1471,7 +1471,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, + &ret_pages); + else + rc = unmap_and_move(get_new_page, put_new_page, +- private, page, pass > 2, mode, ++ private, page_folio(page), pass > 2, mode, + reason, &ret_pages); + /* + * The rules are: +-- +2.43.0 + diff --git a/queue-6.1/migrate_pages-organize-stats-with-struct-migrate_pag.patch b/queue-6.1/migrate_pages-organize-stats-with-struct-migrate_pag.patch new file mode 100644 index 00000000000..5a1e4cf81aa --- /dev/null +++ b/queue-6.1/migrate_pages-organize-stats-with-struct-migrate_pag.patch @@ -0,0 +1,261 @@ +From 41a3f5ffed4ddea2c459d69b3b751704faa84a6f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 13 Feb 2023 20:34:36 +0800 +Subject: migrate_pages: organize stats with struct migrate_pages_stats + +From: Huang Ying + +[ Upstream commit 5b855937096aea7f81e73ad6d40d433c9dd49577 ] + +Patch series "migrate_pages(): batch TLB flushing", v5. + +Now, migrate_pages() migrates folios one by one, like the fake code as +follows, + + for each folio + unmap + flush TLB + copy + restore map + +If multiple folios are passed to migrate_pages(), there are opportunities +to batch the TLB flushing and copying. That is, we can change the code to +something as follows, + + for each folio + unmap + for each folio + flush TLB + for each folio + copy + for each folio + restore map + +The total number of TLB flushing IPI can be reduced considerably. 
And we +may use some hardware accelerator such as DSA to accelerate the folio +copying. + +So in this patch, we refactor the migrate_pages() implementation and +implement the TLB flushing batching. Base on this, hardware accelerated +folio copying can be implemented. + +If too many folios are passed to migrate_pages(), in the naive batched +implementation, we may unmap too many folios at the same time. The +possibility for a task to wait for the migrated folios to be mapped again +increases. So the latency may be hurt. To deal with this issue, the max +number of folios be unmapped in batch is restricted to no more than +HPAGE_PMD_NR in the unit of page. That is, the influence is at the same +level of THP migration. + +We use the following test to measure the performance impact of the +patchset, + +On a 2-socket Intel server, + + - Run pmbench memory accessing benchmark + + - Run `migratepages` to migrate pages of pmbench between node 0 and + node 1 back and forth. + +With the patch, the TLB flushing IPI reduces 99.1% during the test and +the number of pages migrated successfully per second increases 291.7%. + +Xin Hao helped to test the patchset on an ARM64 server with 128 cores, +2 NUMA nodes. Test results show that the page migration performance +increases up to 78%. + +This patch (of 9): + +Define struct migrate_pages_stats to organize the various statistics in +migrate_pages(). This makes it easier to collect and consume the +statistics in multiple functions. This will be needed in the following +patches in the series. + +Link: https://lkml.kernel.org/r/20230213123444.155149-1-ying.huang@intel.com +Link: https://lkml.kernel.org/r/20230213123444.155149-2-ying.huang@intel.com +Signed-off-by: "Huang, Ying" +Reviewed-by: Alistair Popple +Reviewed-by: Zi Yan +Reviewed-by: Baolin Wang +Reviewed-by: Xin Hao +Cc: Yang Shi +Cc: Oscar Salvador +Cc: Matthew Wilcox +Cc: Bharata B Rao +Cc: Minchan Kim +Cc: Mike Kravetz +Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> +Signed-off-by: Andrew Morton +Stable-dep-of: 35e41024c4c2 ("vmscan,migrate: fix page count imbalance on node stats when demoting pages") +Signed-off-by: Sasha Levin +--- + mm/migrate.c | 60 +++++++++++++++++++++++++++++----------------------- + 1 file changed, 34 insertions(+), 26 deletions(-) + +diff --git a/mm/migrate.c b/mm/migrate.c +index 81444abf54dba..b7596a0b4445f 100644 +--- a/mm/migrate.c ++++ b/mm/migrate.c +@@ -1398,6 +1398,16 @@ static inline int try_split_folio(struct folio *folio, struct list_head *split_f + return rc; + } + ++struct migrate_pages_stats { ++ int nr_succeeded; /* Normal and large folios migrated successfully, in ++ units of base pages */ ++ int nr_failed_pages; /* Normal and large folios failed to be migrated, in ++ units of base pages. 
Untried folios aren't counted */ ++ int nr_thp_succeeded; /* THP migrated successfully */ ++ int nr_thp_failed; /* THP failed to be migrated */ ++ int nr_thp_split; /* THP split before migrating */ ++}; ++ + /* + * migrate_pages - migrate the folios specified in a list, to the free folios + * supplied as the target for the page migration +@@ -1432,13 +1442,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, + int large_retry = 1; + int thp_retry = 1; + int nr_failed = 0; +- int nr_failed_pages = 0; + int nr_retry_pages = 0; +- int nr_succeeded = 0; +- int nr_thp_succeeded = 0; + int nr_large_failed = 0; +- int nr_thp_failed = 0; +- int nr_thp_split = 0; + int pass = 0; + bool is_large = false; + bool is_thp = false; +@@ -1448,9 +1453,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, + LIST_HEAD(split_folios); + bool nosplit = (reason == MR_NUMA_MISPLACED); + bool no_split_folio_counting = false; ++ struct migrate_pages_stats stats; + + trace_mm_migrate_pages_start(mode, reason); + ++ memset(&stats, 0, sizeof(stats)); + split_folio_migration: + for (pass = 0; pass < 10 && (retry || large_retry); pass++) { + retry = 0; +@@ -1504,9 +1511,9 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, + /* Large folio migration is unsupported */ + if (is_large) { + nr_large_failed++; +- nr_thp_failed += is_thp; ++ stats.nr_thp_failed += is_thp; + if (!try_split_folio(folio, &split_folios)) { +- nr_thp_split += is_thp; ++ stats.nr_thp_split += is_thp; + break; + } + /* Hugetlb migration is unsupported */ +@@ -1514,7 +1521,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, + nr_failed++; + } + +- nr_failed_pages += nr_pages; ++ stats.nr_failed_pages += nr_pages; + list_move_tail(&folio->lru, &ret_folios); + break; + case -ENOMEM: +@@ -1524,13 +1531,13 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, + */ + if (is_large) { + nr_large_failed++; +- nr_thp_failed += is_thp; ++ stats.nr_thp_failed += is_thp; + /* Large folio NUMA faulting doesn't split to retry. */ + if (!nosplit) { + int ret = try_split_folio(folio, &split_folios); + + if (!ret) { +- nr_thp_split += is_thp; ++ stats.nr_thp_split += is_thp; + break; + } else if (reason == MR_LONGTERM_PIN && + ret == -EAGAIN) { +@@ -1548,7 +1555,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, + nr_failed++; + } + +- nr_failed_pages += nr_pages + nr_retry_pages; ++ stats.nr_failed_pages += nr_pages + nr_retry_pages; + /* + * There might be some split folios of fail-to-migrate large + * folios left in split_folios list. 
Move them back to migration +@@ -1558,7 +1565,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, + list_splice_init(&split_folios, from); + /* nr_failed isn't updated for not used */ + nr_large_failed += large_retry; +- nr_thp_failed += thp_retry; ++ stats.nr_thp_failed += thp_retry; + goto out; + case -EAGAIN: + if (is_large) { +@@ -1570,8 +1577,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, + nr_retry_pages += nr_pages; + break; + case MIGRATEPAGE_SUCCESS: +- nr_succeeded += nr_pages; +- nr_thp_succeeded += is_thp; ++ stats.nr_succeeded += nr_pages; ++ stats.nr_thp_succeeded += is_thp; + break; + default: + /* +@@ -1582,20 +1589,20 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, + */ + if (is_large) { + nr_large_failed++; +- nr_thp_failed += is_thp; ++ stats.nr_thp_failed += is_thp; + } else if (!no_split_folio_counting) { + nr_failed++; + } + +- nr_failed_pages += nr_pages; ++ stats.nr_failed_pages += nr_pages; + break; + } + } + } + nr_failed += retry; + nr_large_failed += large_retry; +- nr_thp_failed += thp_retry; +- nr_failed_pages += nr_retry_pages; ++ stats.nr_thp_failed += thp_retry; ++ stats.nr_failed_pages += nr_retry_pages; + /* + * Try to migrate split folios of fail-to-migrate large folios, no + * nr_failed counting in this round, since all split folios of a +@@ -1628,16 +1635,17 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, + if (list_empty(from)) + rc = 0; + +- count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); +- count_vm_events(PGMIGRATE_FAIL, nr_failed_pages); +- count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded); +- count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed); +- count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split); +- trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded, +- nr_thp_failed, nr_thp_split, mode, reason); ++ count_vm_events(PGMIGRATE_SUCCESS, stats.nr_succeeded); ++ count_vm_events(PGMIGRATE_FAIL, stats.nr_failed_pages); ++ count_vm_events(THP_MIGRATION_SUCCESS, stats.nr_thp_succeeded); ++ count_vm_events(THP_MIGRATION_FAIL, stats.nr_thp_failed); ++ count_vm_events(THP_MIGRATION_SPLIT, stats.nr_thp_split); ++ trace_mm_migrate_pages(stats.nr_succeeded, stats.nr_failed_pages, ++ stats.nr_thp_succeeded, stats.nr_thp_failed, ++ stats.nr_thp_split, mode, reason); + + if (ret_succeeded) +- *ret_succeeded = nr_succeeded; ++ *ret_succeeded = stats.nr_succeeded; + + return rc; + } +-- +2.43.0 + diff --git a/queue-6.1/migrate_pages-restrict-number-of-pages-to-migrate-in.patch b/queue-6.1/migrate_pages-restrict-number-of-pages-to-migrate-in.patch new file mode 100644 index 00000000000..fd1dc52221f --- /dev/null +++ b/queue-6.1/migrate_pages-restrict-number-of-pages-to-migrate-in.patch @@ -0,0 +1,364 @@ +From f440d486b0dc2fe6f1bca63448860dc0b8809928 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 13 Feb 2023 20:34:38 +0800 +Subject: migrate_pages: restrict number of pages to migrate in batch + +From: Huang Ying + +[ Upstream commit 42012e0436d44aeb2e68f11a28ddd0ad3f38b61f ] + +This is a preparation patch to batch the folio unmapping and moving for +non-hugetlb folios. + +If we had batched the folio unmapping, all folios to be migrated would be +unmapped before copying the contents and flags of the folios. If the +folios that were passed to migrate_pages() were too many in unit of pages, +the execution of the processes would be stopped for too long time, thus +too long latency. 
For example, migrate_pages() syscall will call +migrate_pages() with all folios of a process. To avoid this possible +issue, in this patch, we restrict the number of pages to be migrated to be +no more than HPAGE_PMD_NR. That is, the influence is at the same level of +THP migration. + +Link: https://lkml.kernel.org/r/20230213123444.155149-4-ying.huang@intel.com +Signed-off-by: "Huang, Ying" +Reviewed-by: Baolin Wang +Cc: Zi Yan +Cc: Yang Shi +Cc: Oscar Salvador +Cc: Matthew Wilcox +Cc: Bharata B Rao +Cc: Alistair Popple +Cc: Xin Hao +Cc: Minchan Kim +Cc: Mike Kravetz +Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> +Signed-off-by: Andrew Morton +Stable-dep-of: 35e41024c4c2 ("vmscan,migrate: fix page count imbalance on node stats when demoting pages") +Signed-off-by: Sasha Levin +--- + mm/migrate.c | 174 +++++++++++++++++++++++++++++++-------------------- + 1 file changed, 106 insertions(+), 68 deletions(-) + +diff --git a/mm/migrate.c b/mm/migrate.c +index 70d0b20d06a5f..40ae91e1a026b 100644 +--- a/mm/migrate.c ++++ b/mm/migrate.c +@@ -1398,6 +1398,11 @@ static inline int try_split_folio(struct folio *folio, struct list_head *split_f + return rc; + } + ++#ifdef CONFIG_TRANSPARENT_HUGEPAGE ++#define NR_MAX_BATCHED_MIGRATION HPAGE_PMD_NR ++#else ++#define NR_MAX_BATCHED_MIGRATION 512 ++#endif + #define NR_MAX_MIGRATE_PAGES_RETRY 10 + + struct migrate_pages_stats { +@@ -1499,40 +1504,15 @@ static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page, + return nr_failed; + } + +-/* +- * migrate_pages - migrate the folios specified in a list, to the free folios +- * supplied as the target for the page migration +- * +- * @from: The list of folios to be migrated. +- * @get_new_page: The function used to allocate free folios to be used +- * as the target of the folio migration. +- * @put_new_page: The function used to free target folios if migration +- * fails, or NULL if no special handling is necessary. +- * @private: Private data to be passed on to get_new_page() +- * @mode: The migration mode that specifies the constraints for +- * folio migration, if any. +- * @reason: The reason for folio migration. +- * @ret_succeeded: Set to the number of folios migrated successfully if +- * the caller passes a non-NULL pointer. +- * +- * The function returns after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no folios +- * are movable any more because the list has become empty or no retryable folios +- * exist any more. It is caller's responsibility to call putback_movable_pages() +- * only if ret != 0. +- * +- * Returns the number of {normal folio, large folio, hugetlb} that were not +- * migrated, or an error code. The number of large folio splits will be +- * considered as the number of non-migrated large folio, no matter how many +- * split folios of the large folio are migrated successfully. 
+- */ +-int migrate_pages(struct list_head *from, new_page_t get_new_page, ++static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, + free_page_t put_new_page, unsigned long private, +- enum migrate_mode mode, int reason, unsigned int *ret_succeeded) ++ enum migrate_mode mode, int reason, struct list_head *ret_folios, ++ struct migrate_pages_stats *stats) + { + int retry = 1; + int large_retry = 1; + int thp_retry = 1; +- int nr_failed; ++ int nr_failed = 0; + int nr_retry_pages = 0; + int nr_large_failed = 0; + int pass = 0; +@@ -1540,20 +1520,9 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, + bool is_thp = false; + struct folio *folio, *folio2; + int rc, nr_pages; +- LIST_HEAD(ret_folios); + LIST_HEAD(split_folios); + bool nosplit = (reason == MR_NUMA_MISPLACED); + bool no_split_folio_counting = false; +- struct migrate_pages_stats stats; +- +- trace_mm_migrate_pages_start(mode, reason); +- +- memset(&stats, 0, sizeof(stats)); +- rc = migrate_hugetlbs(from, get_new_page, put_new_page, private, mode, reason, +- &stats, &ret_folios); +- if (rc < 0) +- goto out; +- nr_failed = rc; + + split_folio_migration: + for (pass = 0; +@@ -1565,12 +1534,6 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, + nr_retry_pages = 0; + + list_for_each_entry_safe(folio, folio2, from, lru) { +- /* Retried hugetlb folios will be kept in list */ +- if (folio_test_hugetlb(folio)) { +- list_move_tail(&folio->lru, &ret_folios); +- continue; +- } +- + /* + * Large folio statistics is based on the source large + * folio. Capture required information that might get +@@ -1584,15 +1547,14 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, + + rc = unmap_and_move(get_new_page, put_new_page, + private, folio, pass > 2, mode, +- reason, &ret_folios); ++ reason, ret_folios); + /* + * The rules are: + * Success: folio will be freed + * -EAGAIN: stay on the from list + * -ENOMEM: stay on the from list + * -ENOSYS: stay on the from list +- * Other errno: put on ret_folios list then splice to +- * from list ++ * Other errno: put on ret_folios list + */ + switch(rc) { + /* +@@ -1609,17 +1571,17 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, + /* Large folio migration is unsupported */ + if (is_large) { + nr_large_failed++; +- stats.nr_thp_failed += is_thp; ++ stats->nr_thp_failed += is_thp; + if (!try_split_folio(folio, &split_folios)) { +- stats.nr_thp_split += is_thp; ++ stats->nr_thp_split += is_thp; + break; + } + } else if (!no_split_folio_counting) { + nr_failed++; + } + +- stats.nr_failed_pages += nr_pages; +- list_move_tail(&folio->lru, &ret_folios); ++ stats->nr_failed_pages += nr_pages; ++ list_move_tail(&folio->lru, ret_folios); + break; + case -ENOMEM: + /* +@@ -1628,13 +1590,13 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, + */ + if (is_large) { + nr_large_failed++; +- stats.nr_thp_failed += is_thp; ++ stats->nr_thp_failed += is_thp; + /* Large folio NUMA faulting doesn't split to retry. 
*/ + if (!nosplit) { + int ret = try_split_folio(folio, &split_folios); + + if (!ret) { +- stats.nr_thp_split += is_thp; ++ stats->nr_thp_split += is_thp; + break; + } else if (reason == MR_LONGTERM_PIN && + ret == -EAGAIN) { +@@ -1652,17 +1614,17 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, + nr_failed++; + } + +- stats.nr_failed_pages += nr_pages + nr_retry_pages; ++ stats->nr_failed_pages += nr_pages + nr_retry_pages; + /* + * There might be some split folios of fail-to-migrate large +- * folios left in split_folios list. Move them back to migration ++ * folios left in split_folios list. Move them to ret_folios + * list so that they could be put back to the right list by + * the caller otherwise the folio refcnt will be leaked. + */ +- list_splice_init(&split_folios, from); ++ list_splice_init(&split_folios, ret_folios); + /* nr_failed isn't updated for not used */ + nr_large_failed += large_retry; +- stats.nr_thp_failed += thp_retry; ++ stats->nr_thp_failed += thp_retry; + goto out; + case -EAGAIN: + if (is_large) { +@@ -1674,8 +1636,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, + nr_retry_pages += nr_pages; + break; + case MIGRATEPAGE_SUCCESS: +- stats.nr_succeeded += nr_pages; +- stats.nr_thp_succeeded += is_thp; ++ stats->nr_succeeded += nr_pages; ++ stats->nr_thp_succeeded += is_thp; + break; + default: + /* +@@ -1686,20 +1648,20 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, + */ + if (is_large) { + nr_large_failed++; +- stats.nr_thp_failed += is_thp; ++ stats->nr_thp_failed += is_thp; + } else if (!no_split_folio_counting) { + nr_failed++; + } + +- stats.nr_failed_pages += nr_pages; ++ stats->nr_failed_pages += nr_pages; + break; + } + } + } + nr_failed += retry; + nr_large_failed += large_retry; +- stats.nr_thp_failed += thp_retry; +- stats.nr_failed_pages += nr_retry_pages; ++ stats->nr_thp_failed += thp_retry; ++ stats->nr_failed_pages += nr_retry_pages; + /* + * Try to migrate split folios of fail-to-migrate large folios, no + * nr_failed counting in this round, since all split folios of a +@@ -1710,7 +1672,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, + * Move non-migrated folios (after NR_MAX_MIGRATE_PAGES_RETRY + * retries) to ret_folios to avoid migrating them again. + */ +- list_splice_init(from, &ret_folios); ++ list_splice_init(from, ret_folios); + list_splice_init(&split_folios, from); + no_split_folio_counting = true; + retry = 1; +@@ -1718,6 +1680,82 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, + } + + rc = nr_failed + nr_large_failed; ++out: ++ return rc; ++} ++ ++/* ++ * migrate_pages - migrate the folios specified in a list, to the free folios ++ * supplied as the target for the page migration ++ * ++ * @from: The list of folios to be migrated. ++ * @get_new_page: The function used to allocate free folios to be used ++ * as the target of the folio migration. ++ * @put_new_page: The function used to free target folios if migration ++ * fails, or NULL if no special handling is necessary. ++ * @private: Private data to be passed on to get_new_page() ++ * @mode: The migration mode that specifies the constraints for ++ * folio migration, if any. ++ * @reason: The reason for folio migration. ++ * @ret_succeeded: Set to the number of folios migrated successfully if ++ * the caller passes a non-NULL pointer. 
++ * ++ * The function returns after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no folios ++ * are movable any more because the list has become empty or no retryable folios ++ * exist any more. It is caller's responsibility to call putback_movable_pages() ++ * only if ret != 0. ++ * ++ * Returns the number of {normal folio, large folio, hugetlb} that were not ++ * migrated, or an error code. The number of large folio splits will be ++ * considered as the number of non-migrated large folio, no matter how many ++ * split folios of the large folio are migrated successfully. ++ */ ++int migrate_pages(struct list_head *from, new_page_t get_new_page, ++ free_page_t put_new_page, unsigned long private, ++ enum migrate_mode mode, int reason, unsigned int *ret_succeeded) ++{ ++ int rc, rc_gather; ++ int nr_pages; ++ struct folio *folio, *folio2; ++ LIST_HEAD(folios); ++ LIST_HEAD(ret_folios); ++ struct migrate_pages_stats stats; ++ ++ trace_mm_migrate_pages_start(mode, reason); ++ ++ memset(&stats, 0, sizeof(stats)); ++ ++ rc_gather = migrate_hugetlbs(from, get_new_page, put_new_page, private, ++ mode, reason, &stats, &ret_folios); ++ if (rc_gather < 0) ++ goto out; ++again: ++ nr_pages = 0; ++ list_for_each_entry_safe(folio, folio2, from, lru) { ++ /* Retried hugetlb folios will be kept in list */ ++ if (folio_test_hugetlb(folio)) { ++ list_move_tail(&folio->lru, &ret_folios); ++ continue; ++ } ++ ++ nr_pages += folio_nr_pages(folio); ++ if (nr_pages > NR_MAX_BATCHED_MIGRATION) ++ break; ++ } ++ if (nr_pages > NR_MAX_BATCHED_MIGRATION) ++ list_cut_before(&folios, from, &folio->lru); ++ else ++ list_splice_init(from, &folios); ++ rc = migrate_pages_batch(&folios, get_new_page, put_new_page, private, ++ mode, reason, &ret_folios, &stats); ++ list_splice_tail_init(&folios, &ret_folios); ++ if (rc < 0) { ++ rc_gather = rc; ++ goto out; ++ } ++ rc_gather += rc; ++ if (!list_empty(from)) ++ goto again; + out: + /* + * Put the permanent failure folio back to migration list, they +@@ -1730,7 +1768,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, + * are migrated successfully. + */ + if (list_empty(from)) +- rc = 0; ++ rc_gather = 0; + + count_vm_events(PGMIGRATE_SUCCESS, stats.nr_succeeded); + count_vm_events(PGMIGRATE_FAIL, stats.nr_failed_pages); +@@ -1744,7 +1782,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, + if (ret_succeeded) + *ret_succeeded = stats.nr_succeeded; + +- return rc; ++ return rc_gather; + } + + struct page *alloc_migration_target(struct page *page, unsigned long private) +-- +2.43.0 + diff --git a/queue-6.1/migrate_pages-separate-hugetlb-folios-migration.patch b/queue-6.1/migrate_pages-separate-hugetlb-folios-migration.patch new file mode 100644 index 00000000000..aef60270980 --- /dev/null +++ b/queue-6.1/migrate_pages-separate-hugetlb-folios-migration.patch @@ -0,0 +1,253 @@ +From dc1b2cb876a9212a452499066bada1d7645a8442 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 13 Feb 2023 20:34:37 +0800 +Subject: migrate_pages: separate hugetlb folios migration + +From: Huang Ying + +[ Upstream commit e5bfff8b10e496378da4b7863479dd6fb907d4ea ] + +This is a preparation patch to batch the folio unmapping and moving for +the non-hugetlb folios. Based on that we can batch the TLB shootdown +during the folio migration and make it possible to use some hardware +accelerator for the folio copying. 
+ +In this patch the hugetlb folios and non-hugetlb folios migration is +separated in migrate_pages() to make it easy to change the non-hugetlb +folios migration implementation. + +Link: https://lkml.kernel.org/r/20230213123444.155149-3-ying.huang@intel.com +Signed-off-by: "Huang, Ying" +Reviewed-by: Baolin Wang +Reviewed-by: Xin Hao +Cc: Zi Yan +Cc: Yang Shi +Cc: Oscar Salvador +Cc: Matthew Wilcox +Cc: Bharata B Rao +Cc: Alistair Popple +Cc: Minchan Kim +Cc: Mike Kravetz +Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> +Signed-off-by: Andrew Morton +Stable-dep-of: 35e41024c4c2 ("vmscan,migrate: fix page count imbalance on node stats when demoting pages") +Signed-off-by: Sasha Levin +--- + mm/migrate.c | 141 +++++++++++++++++++++++++++++++++++++++++++-------- + 1 file changed, 119 insertions(+), 22 deletions(-) + +diff --git a/mm/migrate.c b/mm/migrate.c +index b7596a0b4445f..70d0b20d06a5f 100644 +--- a/mm/migrate.c ++++ b/mm/migrate.c +@@ -1398,6 +1398,8 @@ static inline int try_split_folio(struct folio *folio, struct list_head *split_f + return rc; + } + ++#define NR_MAX_MIGRATE_PAGES_RETRY 10 ++ + struct migrate_pages_stats { + int nr_succeeded; /* Normal and large folios migrated successfully, in + units of base pages */ +@@ -1408,6 +1410,95 @@ struct migrate_pages_stats { + int nr_thp_split; /* THP split before migrating */ + }; + ++/* ++ * Returns the number of hugetlb folios that were not migrated, or an error code ++ * after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no hugetlb folios are movable ++ * any more because the list has become empty or no retryable hugetlb folios ++ * exist any more. It is caller's responsibility to call putback_movable_pages() ++ * only if ret != 0. ++ */ ++static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page, ++ free_page_t put_new_page, unsigned long private, ++ enum migrate_mode mode, int reason, ++ struct migrate_pages_stats *stats, ++ struct list_head *ret_folios) ++{ ++ int retry = 1; ++ int nr_failed = 0; ++ int nr_retry_pages = 0; ++ int pass = 0; ++ struct folio *folio, *folio2; ++ int rc, nr_pages; ++ ++ for (pass = 0; pass < NR_MAX_MIGRATE_PAGES_RETRY && retry; pass++) { ++ retry = 0; ++ nr_retry_pages = 0; ++ ++ list_for_each_entry_safe(folio, folio2, from, lru) { ++ if (!folio_test_hugetlb(folio)) ++ continue; ++ ++ nr_pages = folio_nr_pages(folio); ++ ++ cond_resched(); ++ ++ rc = unmap_and_move_huge_page(get_new_page, ++ put_new_page, private, ++ &folio->page, pass > 2, mode, ++ reason, ret_folios); ++ /* ++ * The rules are: ++ * Success: hugetlb folio will be put back ++ * -EAGAIN: stay on the from list ++ * -ENOMEM: stay on the from list ++ * -ENOSYS: stay on the from list ++ * Other errno: put on ret_folios list ++ */ ++ switch(rc) { ++ case -ENOSYS: ++ /* Hugetlb migration is unsupported */ ++ nr_failed++; ++ stats->nr_failed_pages += nr_pages; ++ list_move_tail(&folio->lru, ret_folios); ++ break; ++ case -ENOMEM: ++ /* ++ * When memory is low, don't bother to try to migrate ++ * other folios, just exit. ++ */ ++ stats->nr_failed_pages += nr_pages + nr_retry_pages; ++ return -ENOMEM; ++ case -EAGAIN: ++ retry++; ++ nr_retry_pages += nr_pages; ++ break; ++ case MIGRATEPAGE_SUCCESS: ++ stats->nr_succeeded += nr_pages; ++ break; ++ default: ++ /* ++ * Permanent failure (-EBUSY, etc.): ++ * unlike -EAGAIN case, the failed folio is ++ * removed from migration folio list and not ++ * retried in the next outer loop. 
++ */ ++ nr_failed++; ++ stats->nr_failed_pages += nr_pages; ++ break; ++ } ++ } ++ } ++ /* ++ * nr_failed is number of hugetlb folios failed to be migrated. After ++ * NR_MAX_MIGRATE_PAGES_RETRY attempts, give up and count retried hugetlb ++ * folios as failed. ++ */ ++ nr_failed += retry; ++ stats->nr_failed_pages += nr_retry_pages; ++ ++ return nr_failed; ++} ++ + /* + * migrate_pages - migrate the folios specified in a list, to the free folios + * supplied as the target for the page migration +@@ -1424,10 +1515,10 @@ struct migrate_pages_stats { + * @ret_succeeded: Set to the number of folios migrated successfully if + * the caller passes a non-NULL pointer. + * +- * The function returns after 10 attempts or if no folios are movable any more +- * because the list has become empty or no retryable folios exist any more. +- * It is caller's responsibility to call putback_movable_pages() to return folios +- * to the LRU or free list only if ret != 0. ++ * The function returns after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no folios ++ * are movable any more because the list has become empty or no retryable folios ++ * exist any more. It is caller's responsibility to call putback_movable_pages() ++ * only if ret != 0. + * + * Returns the number of {normal folio, large folio, hugetlb} that were not + * migrated, or an error code. The number of large folio splits will be +@@ -1441,7 +1532,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, + int retry = 1; + int large_retry = 1; + int thp_retry = 1; +- int nr_failed = 0; ++ int nr_failed; + int nr_retry_pages = 0; + int nr_large_failed = 0; + int pass = 0; +@@ -1458,38 +1549,45 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, + trace_mm_migrate_pages_start(mode, reason); + + memset(&stats, 0, sizeof(stats)); ++ rc = migrate_hugetlbs(from, get_new_page, put_new_page, private, mode, reason, ++ &stats, &ret_folios); ++ if (rc < 0) ++ goto out; ++ nr_failed = rc; ++ + split_folio_migration: +- for (pass = 0; pass < 10 && (retry || large_retry); pass++) { ++ for (pass = 0; ++ pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry); ++ pass++) { + retry = 0; + large_retry = 0; + thp_retry = 0; + nr_retry_pages = 0; + + list_for_each_entry_safe(folio, folio2, from, lru) { ++ /* Retried hugetlb folios will be kept in list */ ++ if (folio_test_hugetlb(folio)) { ++ list_move_tail(&folio->lru, &ret_folios); ++ continue; ++ } ++ + /* + * Large folio statistics is based on the source large + * folio. Capture required information that might get + * lost during migration. 
+ */ +- is_large = folio_test_large(folio) && !folio_test_hugetlb(folio); ++ is_large = folio_test_large(folio); + is_thp = is_large && folio_test_pmd_mappable(folio); + nr_pages = folio_nr_pages(folio); ++ + cond_resched(); + +- if (folio_test_hugetlb(folio)) +- rc = unmap_and_move_huge_page(get_new_page, +- put_new_page, private, +- &folio->page, pass > 2, mode, +- reason, +- &ret_folios); +- else +- rc = unmap_and_move(get_new_page, put_new_page, +- private, folio, pass > 2, mode, +- reason, &ret_folios); ++ rc = unmap_and_move(get_new_page, put_new_page, ++ private, folio, pass > 2, mode, ++ reason, &ret_folios); + /* + * The rules are: +- * Success: non hugetlb folio will be freed, hugetlb +- * folio will be put back ++ * Success: folio will be freed + * -EAGAIN: stay on the from list + * -ENOMEM: stay on the from list + * -ENOSYS: stay on the from list +@@ -1516,7 +1614,6 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, + stats.nr_thp_split += is_thp; + break; + } +- /* Hugetlb migration is unsupported */ + } else if (!no_split_folio_counting) { + nr_failed++; + } +@@ -1610,8 +1707,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, + */ + if (!list_empty(&split_folios)) { + /* +- * Move non-migrated folios (after 10 retries) to ret_folios +- * to avoid migrating them again. ++ * Move non-migrated folios (after NR_MAX_MIGRATE_PAGES_RETRY ++ * retries) to ret_folios to avoid migrating them again. + */ + list_splice_init(from, &ret_folios); + list_splice_init(&split_folios, from); +-- +2.43.0 + diff --git a/queue-6.1/migrate_pages-split-unmap_and_move-to-_unmap-and-_mo.patch b/queue-6.1/migrate_pages-split-unmap_and_move-to-_unmap-and-_mo.patch new file mode 100644 index 00000000000..7629fc4ca29 --- /dev/null +++ b/queue-6.1/migrate_pages-split-unmap_and_move-to-_unmap-and-_mo.patch @@ -0,0 +1,310 @@ +From 742e80422397bb0e53a0352ee493019024a9c902 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 13 Feb 2023 20:34:39 +0800 +Subject: migrate_pages: split unmap_and_move() to _unmap() and _move() + +From: Huang Ying + +[ Upstream commit 64c8902ed4418317cd416c566f896bd4a92b2efc ] + +This is a preparation patch to batch the folio unmapping and moving. + +In this patch, unmap_and_move() is split to migrate_folio_unmap() and +migrate_folio_move(). So, we can batch _unmap() and _move() in different +loops later. To pass some information between unmap and move, the +original unused dst->mapping and dst->private are used. 
+ +Link: https://lkml.kernel.org/r/20230213123444.155149-5-ying.huang@intel.com +Signed-off-by: "Huang, Ying" +Reviewed-by: Baolin Wang +Reviewed-by: Xin Hao +Cc: Zi Yan +Cc: Yang Shi +Cc: Oscar Salvador +Cc: Matthew Wilcox +Cc: Bharata B Rao +Cc: Alistair Popple +Cc: Minchan Kim +Cc: Mike Kravetz +Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> +Signed-off-by: Andrew Morton +Stable-dep-of: 35e41024c4c2 ("vmscan,migrate: fix page count imbalance on node stats when demoting pages") +Signed-off-by: Sasha Levin +--- + include/linux/migrate.h | 1 + + mm/migrate.c | 169 ++++++++++++++++++++++++++++++---------- + 2 files changed, 129 insertions(+), 41 deletions(-) + +diff --git a/include/linux/migrate.h b/include/linux/migrate.h +index 3ef77f52a4f04..7376074f2e1e3 100644 +--- a/include/linux/migrate.h ++++ b/include/linux/migrate.h +@@ -18,6 +18,7 @@ struct migration_target_control; + * - zero on page migration success; + */ + #define MIGRATEPAGE_SUCCESS 0 ++#define MIGRATEPAGE_UNMAP 1 + + /** + * struct movable_operations - Driver page migration +diff --git a/mm/migrate.c b/mm/migrate.c +index 40ae91e1a026b..46a1476e188c3 100644 +--- a/mm/migrate.c ++++ b/mm/migrate.c +@@ -1011,11 +1011,53 @@ static int move_to_new_folio(struct folio *dst, struct folio *src, + return rc; + } + +-static int __unmap_and_move(struct folio *src, struct folio *dst, ++/* ++ * To record some information during migration, we use some unused ++ * fields (mapping and private) of struct folio of the newly allocated ++ * destination folio. This is safe because nobody is using them ++ * except us. ++ */ ++static void __migrate_folio_record(struct folio *dst, ++ unsigned long page_was_mapped, ++ struct anon_vma *anon_vma) ++{ ++ dst->mapping = (void *)anon_vma; ++ dst->private = (void *)page_was_mapped; ++} ++ ++static void __migrate_folio_extract(struct folio *dst, ++ int *page_was_mappedp, ++ struct anon_vma **anon_vmap) ++{ ++ *anon_vmap = (void *)dst->mapping; ++ *page_was_mappedp = (unsigned long)dst->private; ++ dst->mapping = NULL; ++ dst->private = NULL; ++} ++ ++/* Cleanup src folio upon migration success */ ++static void migrate_folio_done(struct folio *src, ++ enum migrate_reason reason) ++{ ++ /* ++ * Compaction can migrate also non-LRU pages which are ++ * not accounted to NR_ISOLATED_*. They can be recognized ++ * as __PageMovable ++ */ ++ if (likely(!__folio_test_movable(src))) ++ mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON + ++ folio_is_file_lru(src), -folio_nr_pages(src)); ++ ++ if (reason != MR_MEMORY_FAILURE) ++ /* We release the page in page_handle_poison. 
*/ ++ folio_put(src); ++} ++ ++static int __migrate_folio_unmap(struct folio *src, struct folio *dst, + int force, enum migrate_mode mode) + { + int rc = -EAGAIN; +- bool page_was_mapped = false; ++ int page_was_mapped = 0; + struct anon_vma *anon_vma = NULL; + bool is_lru = !__PageMovable(&src->page); + +@@ -1091,8 +1133,8 @@ static int __unmap_and_move(struct folio *src, struct folio *dst, + goto out_unlock; + + if (unlikely(!is_lru)) { +- rc = move_to_new_folio(dst, src, mode); +- goto out_unlock_both; ++ __migrate_folio_record(dst, page_was_mapped, anon_vma); ++ return MIGRATEPAGE_UNMAP; + } + + /* +@@ -1117,11 +1159,42 @@ static int __unmap_and_move(struct folio *src, struct folio *dst, + VM_BUG_ON_FOLIO(folio_test_anon(src) && + !folio_test_ksm(src) && !anon_vma, src); + try_to_migrate(src, 0); +- page_was_mapped = true; ++ page_was_mapped = 1; + } + +- if (!folio_mapped(src)) +- rc = move_to_new_folio(dst, src, mode); ++ if (!folio_mapped(src)) { ++ __migrate_folio_record(dst, page_was_mapped, anon_vma); ++ return MIGRATEPAGE_UNMAP; ++ } ++ ++ if (page_was_mapped) ++ remove_migration_ptes(src, src, false); ++ ++out_unlock_both: ++ folio_unlock(dst); ++out_unlock: ++ /* Drop an anon_vma reference if we took one */ ++ if (anon_vma) ++ put_anon_vma(anon_vma); ++ folio_unlock(src); ++out: ++ ++ return rc; ++} ++ ++static int __migrate_folio_move(struct folio *src, struct folio *dst, ++ enum migrate_mode mode) ++{ ++ int rc; ++ int page_was_mapped = 0; ++ struct anon_vma *anon_vma = NULL; ++ bool is_lru = !__PageMovable(&src->page); ++ ++ __migrate_folio_extract(dst, &page_was_mapped, &anon_vma); ++ ++ rc = move_to_new_folio(dst, src, mode); ++ if (unlikely(!is_lru)) ++ goto out_unlock_both; + + /* + * When successful, push dst to LRU immediately: so that if it +@@ -1144,12 +1217,10 @@ static int __unmap_and_move(struct folio *src, struct folio *dst, + + out_unlock_both: + folio_unlock(dst); +-out_unlock: + /* Drop an anon_vma reference if we took one */ + if (anon_vma) + put_anon_vma(anon_vma); + folio_unlock(src); +-out: + /* + * If migration is successful, decrease refcount of dst, + * which will not free the page because new page owner increased +@@ -1161,19 +1232,15 @@ static int __unmap_and_move(struct folio *src, struct folio *dst, + return rc; + } + +-/* +- * Obtain the lock on folio, remove all ptes and migrate the folio +- * to the newly allocated folio in dst. +- */ +-static int unmap_and_move(new_page_t get_new_page, +- free_page_t put_new_page, +- unsigned long private, struct folio *src, +- int force, enum migrate_mode mode, +- enum migrate_reason reason, +- struct list_head *ret) ++/* Obtain the lock on page, remove all ptes. */ ++static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page, ++ unsigned long private, struct folio *src, ++ struct folio **dstp, int force, ++ enum migrate_mode mode, enum migrate_reason reason, ++ struct list_head *ret) + { + struct folio *dst; +- int rc = MIGRATEPAGE_SUCCESS; ++ int rc = MIGRATEPAGE_UNMAP; + struct page *newpage = NULL; + + if (!thp_migration_supported() && folio_test_transhuge(src)) +@@ -1184,20 +1251,49 @@ static int unmap_and_move(new_page_t get_new_page, + folio_clear_active(src); + folio_clear_unevictable(src); + /* free_pages_prepare() will clear PG_isolated. 
*/ +- goto out; ++ list_del(&src->lru); ++ migrate_folio_done(src, reason); ++ return MIGRATEPAGE_SUCCESS; + } + + newpage = get_new_page(&src->page, private); + if (!newpage) + return -ENOMEM; + dst = page_folio(newpage); ++ *dstp = dst; + + dst->private = NULL; +- rc = __unmap_and_move(src, dst, force, mode); ++ rc = __migrate_folio_unmap(src, dst, force, mode); ++ if (rc == MIGRATEPAGE_UNMAP) ++ return rc; ++ ++ /* ++ * A folio that has not been unmapped will be restored to ++ * right list unless we want to retry. ++ */ ++ if (rc != -EAGAIN) ++ list_move_tail(&src->lru, ret); ++ ++ if (put_new_page) ++ put_new_page(&dst->page, private); ++ else ++ folio_put(dst); ++ ++ return rc; ++} ++ ++/* Migrate the folio to the newly allocated folio in dst. */ ++static int migrate_folio_move(free_page_t put_new_page, unsigned long private, ++ struct folio *src, struct folio *dst, ++ enum migrate_mode mode, enum migrate_reason reason, ++ struct list_head *ret) ++{ ++ int rc; ++ ++ rc = __migrate_folio_move(src, dst, mode); + if (rc == MIGRATEPAGE_SUCCESS) + set_page_owner_migrate_reason(&dst->page, reason); + +-out: + if (rc != -EAGAIN) { + /* + * A folio that has been migrated has all references +@@ -1213,20 +1309,7 @@ static int unmap_and_move(new_page_t get_new_page, + * we want to retry. + */ + if (rc == MIGRATEPAGE_SUCCESS) { +- /* +- * Compaction can migrate also non-LRU folios which are +- * not accounted to NR_ISOLATED_*. They can be recognized +- * as __folio_test_movable +- */ +- if (likely(!__folio_test_movable(src))) +- mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON + +- folio_is_file_lru(src), -folio_nr_pages(src)); +- +- if (reason != MR_MEMORY_FAILURE) +- /* +- * We release the folio in page_handle_poison. +- */ +- folio_put(src); ++ migrate_folio_done(src, reason); + } else { + if (rc != -EAGAIN) + list_add_tail(&src->lru, ret); +@@ -1518,7 +1601,7 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, + int pass = 0; + bool is_large = false; + bool is_thp = false; +- struct folio *folio, *folio2; ++ struct folio *folio, *folio2, *dst = NULL; + int rc, nr_pages; + LIST_HEAD(split_folios); + bool nosplit = (reason == MR_NUMA_MISPLACED); +@@ -1545,9 +1628,13 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page, + + cond_resched(); + +- rc = unmap_and_move(get_new_page, put_new_page, +- private, folio, pass > 2, mode, +- reason, ret_folios); ++ rc = migrate_folio_unmap(get_new_page, put_new_page, private, ++ folio, &dst, pass > 2, mode, ++ reason, ret_folios); ++ if (rc == MIGRATEPAGE_UNMAP) ++ rc = migrate_folio_move(put_new_page, private, ++ folio, dst, mode, ++ reason, ret_folios); + /* + * The rules are: + * Success: folio will be freed +-- +2.43.0 + diff --git a/queue-6.1/mm-migrate-try-again-if-thp-split-is-failed-due-to-p.patch b/queue-6.1/mm-migrate-try-again-if-thp-split-is-failed-due-to-p.patch new file mode 100644 index 00000000000..b3d2f615537 --- /dev/null +++ b/queue-6.1/mm-migrate-try-again-if-thp-split-is-failed-due-to-p.patch @@ -0,0 +1,118 @@ +From 8834c3584cdce54f769ee76fdb530bc80d0689a4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 24 Oct 2022 16:34:22 +0800 +Subject: mm: migrate: try again if THP split is failed due to page refcnt + +From: Baolin Wang + +[ Upstream commit fd4a7ac32918d3d7a2d17dc06c5520f45e36eb52 ] + +When creating a virtual machine, we will use memfd_create() to get a file +descriptor which can be used to create share memory mappings using the +mmap function, meanwhile the 
mmap() will set the MAP_POPULATE flag to +allocate physical pages for the virtual machine. + +When allocating physical pages for the guest, the host can fallback to +allocate some CMA pages for the guest when over half of the zone's free +memory is in the CMA area. + +In guest os, when the application wants to do some data transaction with +DMA, our QEMU will call VFIO_IOMMU_MAP_DMA ioctl to do longterm-pin and +create IOMMU mappings for the DMA pages. However, when calling +VFIO_IOMMU_MAP_DMA ioctl to pin the physical pages, we found it will be +failed to longterm-pin sometimes. + +After some invetigation, we found the pages used to do DMA mapping can +contain some CMA pages, and these CMA pages will cause a possible failure +of the longterm-pin, due to failed to migrate the CMA pages. The reason +of migration failure may be temporary reference count or memory allocation +failure. So that will cause the VFIO_IOMMU_MAP_DMA ioctl returns error, +which makes the application failed to start. + +I observed one migration failure case (which is not easy to reproduce) is +that, the 'thp_migration_fail' count is 1 and the 'thp_split_page_failed' +count is also 1. + +That means when migrating a THP which is in CMA area, but can not allocate +a new THP due to memory fragmentation, so it will split the THP. However +THP split is also failed, probably the reason is temporary reference count +of this THP. And the temporary reference count can be caused by dropping +page caches (I observed the drop caches operation in the system), but we +can not drop the shmem page caches due to they are already dirty at that +time. + +Especially for THP split failure, which is caused by temporary reference +count, we can try again to mitigate the failure of migration in this case +according to previous discussion [1]. + +[1] https://lore.kernel.org/all/470dc638-a300-f261-94b4-e27250e42f96@redhat.com/ +Link: https://lkml.kernel.org/r/6784730480a1df82e8f4cba1ed088e4ac767994b.1666599848.git.baolin.wang@linux.alibaba.com +Signed-off-by: Baolin Wang +Reviewed-by: "Huang, Ying" +Cc: Alistair Popple +Cc: David Hildenbrand +Cc: Yang Shi +Cc: Zi Yan +Signed-off-by: Andrew Morton +Stable-dep-of: 35e41024c4c2 ("vmscan,migrate: fix page count imbalance on node stats when demoting pages") +Signed-off-by: Sasha Levin +--- + mm/huge_memory.c | 4 ++-- + mm/migrate.c | 19 ++++++++++++++++--- + 2 files changed, 18 insertions(+), 5 deletions(-) + +diff --git a/mm/huge_memory.c b/mm/huge_memory.c +index 98a1a05f2db2d..f53bc54dacb37 100644 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -2728,7 +2728,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) + * split PMDs + */ + if (!can_split_folio(folio, &extra_pins)) { +- ret = -EBUSY; ++ ret = -EAGAIN; + goto out_unlock; + } + +@@ -2780,7 +2780,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) + xas_unlock(&xas); + local_irq_enable(); + remap_page(folio, folio_nr_pages(folio)); +- ret = -EBUSY; ++ ret = -EAGAIN; + } + + out_unlock: +diff --git a/mm/migrate.c b/mm/migrate.c +index 0252aa4ff572e..b0caa89e67d5f 100644 +--- a/mm/migrate.c ++++ b/mm/migrate.c +@@ -1518,9 +1518,22 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, + if (is_thp) { + nr_thp_failed++; + /* THP NUMA faulting doesn't split THP to retry. 
*/ +- if (!nosplit && !try_split_thp(page, &thp_split_pages)) { +- nr_thp_split++; +- break; ++ if (!nosplit) { ++ int ret = try_split_thp(page, &thp_split_pages); ++ ++ if (!ret) { ++ nr_thp_split++; ++ break; ++ } else if (reason == MR_LONGTERM_PIN && ++ ret == -EAGAIN) { ++ /* ++ * Try again to split THP to mitigate ++ * the failure of longterm pinning. ++ */ ++ thp_retry++; ++ nr_retry_pages += nr_subpages; ++ break; ++ } + } + } else if (!no_subpage_counting) { + nr_failed++; +-- +2.43.0 + diff --git a/queue-6.1/mm-migrate.c-stop-using-0-as-null-pointer.patch b/queue-6.1/mm-migrate.c-stop-using-0-as-null-pointer.patch new file mode 100644 index 00000000000..3761d5e16d9 --- /dev/null +++ b/queue-6.1/mm-migrate.c-stop-using-0-as-null-pointer.patch @@ -0,0 +1,39 @@ +From d6819b70b7c23771f73c3c02e6128860b55539bb Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 16 Nov 2022 09:23:45 +0800 +Subject: mm/migrate.c: stop using 0 as NULL pointer + +From: Yang Li + +[ Upstream commit 4c74b65f478dc9353780a6be17fc82f1b06cea80 ] + +mm/migrate.c:1198:24: warning: Using plain integer as NULL pointer + +Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=3080 +Link: https://lkml.kernel.org/r/20221116012345.84870-1-yang.lee@linux.alibaba.com +Signed-off-by: Yang Li +Reported-by: Abaci Robot +Reviewed-by: David Hildenbrand +Signed-off-by: Andrew Morton +Stable-dep-of: 35e41024c4c2 ("vmscan,migrate: fix page count imbalance on node stats when demoting pages") +Signed-off-by: Sasha Levin +--- + mm/migrate.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/mm/migrate.c b/mm/migrate.c +index 562f819dc6189..81444abf54dba 100644 +--- a/mm/migrate.c ++++ b/mm/migrate.c +@@ -1192,7 +1192,7 @@ static int unmap_and_move(new_page_t get_new_page, + return -ENOMEM; + dst = page_folio(newpage); + +- dst->private = 0; ++ dst->private = NULL; + rc = __unmap_and_move(src, dst, force, mode); + if (rc == MIGRATEPAGE_SUCCESS) + set_page_owner_migrate_reason(&dst->page, reason); +-- +2.43.0 + diff --git a/queue-6.1/mm-page_alloc-explicitly-define-how-__gfp_high-non-b.patch b/queue-6.1/mm-page_alloc-explicitly-define-how-__gfp_high-non-b.patch new file mode 100644 index 00000000000..21e642f531a --- /dev/null +++ b/queue-6.1/mm-page_alloc-explicitly-define-how-__gfp_high-non-b.patch @@ -0,0 +1,151 @@ +From 1c6a2cba75776f1944c170b81c1ad027ef2a12f4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 13 Jan 2023 11:12:16 +0000 +Subject: mm/page_alloc: explicitly define how __GFP_HIGH non-blocking + allocations accesses reserves + +From: Mel Gorman + +[ Upstream commit 1ebbb21811b76c3b932959787f37985af36f62fa ] + +GFP_ATOMIC allocations get flagged ALLOC_HARDER which is a vague +description. In preparation for the removal of GFP_ATOMIC redefine +__GFP_ATOMIC to simply mean non-blocking and renaming ALLOC_HARDER to +ALLOC_NON_BLOCK accordingly. __GFP_HIGH is required for access to +reserves but non-blocking is granted more access. For example, GFP_NOWAIT +is non-blocking but has no special access to reserves. A __GFP_NOFAIL +blocking allocation is granted access similar to __GFP_HIGH if the only +alternative is an OOM kill. 
+ +Link: https://lkml.kernel.org/r/20230113111217.14134-6-mgorman@techsingularity.net +Signed-off-by: Mel Gorman +Acked-by: Michal Hocko +Cc: Matthew Wilcox +Cc: NeilBrown +Cc: Thierry Reding +Cc: Vlastimil Babka +Signed-off-by: Andrew Morton +Stable-dep-of: 281dd25c1a01 ("mm/page_alloc: let GFP_ATOMIC order-0 allocs access highatomic reserves") +Signed-off-by: Sasha Levin +--- + mm/internal.h | 7 +++++-- + mm/page_alloc.c | 44 ++++++++++++++++++++++++-------------------- + 2 files changed, 29 insertions(+), 22 deletions(-) + +diff --git a/mm/internal.h b/mm/internal.h +index cd095ce2f199e..a50bc08337d21 100644 +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -754,7 +754,10 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, + #define ALLOC_OOM ALLOC_NO_WATERMARKS + #endif + +-#define ALLOC_HARDER 0x10 /* try to alloc harder */ ++#define ALLOC_NON_BLOCK 0x10 /* Caller cannot block. Allow access ++ * to 25% of the min watermark or ++ * 62.5% if __GFP_HIGH is set. ++ */ + #define ALLOC_MIN_RESERVE 0x20 /* __GFP_HIGH set. Allow access to 50% + * of the min watermark. + */ +@@ -769,7 +772,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, + #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ + + /* Flags that allow allocations below the min watermark. */ +-#define ALLOC_RESERVES (ALLOC_HARDER|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM) ++#define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM) + + enum ttu_flags; + struct tlbflush_unmap_batch; +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 6ab53e47ccea1..49dc4ba88c278 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -3996,18 +3996,19 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, + * __GFP_HIGH allows access to 50% of the min reserve as well + * as OOM. + */ +- if (alloc_flags & ALLOC_MIN_RESERVE) ++ if (alloc_flags & ALLOC_MIN_RESERVE) { + min -= min / 2; + +- /* +- * Non-blocking allocations can access some of the reserve +- * with more access if also __GFP_HIGH. The reasoning is that +- * a non-blocking caller may incur a more severe penalty +- * if it cannot get memory quickly, particularly if it's +- * also __GFP_HIGH. +- */ +- if (alloc_flags & ALLOC_HARDER) +- min -= min / 4; ++ /* ++ * Non-blocking allocations (e.g. GFP_ATOMIC) can ++ * access more reserves than just __GFP_HIGH. Other ++ * non-blocking allocations requests such as GFP_NOWAIT ++ * or (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) do not get ++ * access to the min reserve. ++ */ ++ if (alloc_flags & ALLOC_NON_BLOCK) ++ min -= min / 4; ++ } + + /* + * OOM victims can try even harder than the normal reserve +@@ -4858,28 +4859,30 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order) + * The caller may dip into page reserves a bit more if the caller + * cannot run direct reclaim, or if the caller has realtime scheduling + * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will +- * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_MIN_RESERVE(__GFP_HIGH). ++ * set both ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE(__GFP_HIGH). + */ + alloc_flags |= (__force int) + (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)); + +- if (gfp_mask & __GFP_ATOMIC) { ++ if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) { + /* + * Not worth trying to allocate harder for __GFP_NOMEMALLOC even + * if it can't schedule. 
+ */ + if (!(gfp_mask & __GFP_NOMEMALLOC)) { +- alloc_flags |= ALLOC_HARDER; ++ alloc_flags |= ALLOC_NON_BLOCK; + + if (order > 0) + alloc_flags |= ALLOC_HIGHATOMIC; + } + + /* +- * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the +- * comment for __cpuset_node_allowed(). ++ * Ignore cpuset mems for non-blocking __GFP_HIGH (probably ++ * GFP_ATOMIC) rather than fail, see the comment for ++ * __cpuset_node_allowed(). + */ +- alloc_flags &= ~ALLOC_CPUSET; ++ if (alloc_flags & ALLOC_MIN_RESERVE) ++ alloc_flags &= ~ALLOC_CPUSET; + } else if (unlikely(rt_task(current)) && in_task()) + alloc_flags |= ALLOC_MIN_RESERVE; + +@@ -5312,12 +5315,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, + WARN_ON_ONCE_GFP(costly_order, gfp_mask); + + /* +- * Help non-failing allocations by giving them access to memory +- * reserves but do not use ALLOC_NO_WATERMARKS because this ++ * Help non-failing allocations by giving some access to memory ++ * reserves normally used for high priority non-blocking ++ * allocations but do not use ALLOC_NO_WATERMARKS because this + * could deplete whole memory reserves which would just make +- * the situation worse ++ * the situation worse. + */ +- page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac); ++ page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_MIN_RESERVE, ac); + if (page) + goto got_pg; + +-- +2.43.0 + diff --git a/queue-6.1/mm-page_alloc-explicitly-define-what-alloc-flags-dep.patch b/queue-6.1/mm-page_alloc-explicitly-define-what-alloc-flags-dep.patch new file mode 100644 index 00000000000..2ed41d8ab1f --- /dev/null +++ b/queue-6.1/mm-page_alloc-explicitly-define-what-alloc-flags-dep.patch @@ -0,0 +1,113 @@ +From 7f56b2ec2c70a47a901ef2da605c6c7552cd71c2 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 13 Jan 2023 11:12:15 +0000 +Subject: mm/page_alloc: explicitly define what alloc flags deplete min + reserves + +From: Mel Gorman + +[ Upstream commit ab3508854353793cd35e348fde89a5c09b2fd8b5 ] + +As there are more ALLOC_ flags that affect reserves, define what flags +affect reserves and clarify the effect of each flag. + +Link: https://lkml.kernel.org/r/20230113111217.14134-5-mgorman@techsingularity.net +Signed-off-by: Mel Gorman +Acked-by: Vlastimil Babka +Acked-by: Michal Hocko +Cc: Matthew Wilcox +Cc: NeilBrown +Cc: Thierry Reding +Signed-off-by: Andrew Morton +Stable-dep-of: 281dd25c1a01 ("mm/page_alloc: let GFP_ATOMIC order-0 allocs access highatomic reserves") +Signed-off-by: Sasha Levin +--- + mm/internal.h | 3 +++ + mm/page_alloc.c | 34 ++++++++++++++++++++++------------ + 2 files changed, 25 insertions(+), 12 deletions(-) + +diff --git a/mm/internal.h b/mm/internal.h +index f0f6198462cc1..cd095ce2f199e 100644 +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -768,6 +768,9 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, + #define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */ + #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ + ++/* Flags that allow allocations below the min watermark. 
*/ ++#define ALLOC_RESERVES (ALLOC_HARDER|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM) ++ + enum ttu_flags; + struct tlbflush_unmap_batch; + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 8e1f4d779b26c..6ab53e47ccea1 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -3956,15 +3956,14 @@ ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE); + static inline long __zone_watermark_unusable_free(struct zone *z, + unsigned int order, unsigned int alloc_flags) + { +- const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); + long unusable_free = (1 << order) - 1; + + /* +- * If the caller does not have rights to ALLOC_HARDER then subtract +- * the high-atomic reserves. This will over-estimate the size of the +- * atomic reserve but it avoids a search. ++ * If the caller does not have rights to reserves below the min ++ * watermark then subtract the high-atomic reserves. This will ++ * over-estimate the size of the atomic reserve but it avoids a search. + */ +- if (likely(!alloc_harder)) ++ if (likely(!(alloc_flags & ALLOC_RESERVES))) + unusable_free += z->nr_reserved_highatomic; + + #ifdef CONFIG_CMA +@@ -3988,25 +3987,36 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, + { + long min = mark; + int o; +- const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM)); + + /* free_pages may go negative - that's OK */ + free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags); + +- if (alloc_flags & ALLOC_MIN_RESERVE) +- min -= min / 2; ++ if (unlikely(alloc_flags & ALLOC_RESERVES)) { ++ /* ++ * __GFP_HIGH allows access to 50% of the min reserve as well ++ * as OOM. ++ */ ++ if (alloc_flags & ALLOC_MIN_RESERVE) ++ min -= min / 2; + +- if (unlikely(alloc_harder)) { + /* +- * OOM victims can try even harder than normal ALLOC_HARDER ++ * Non-blocking allocations can access some of the reserve ++ * with more access if also __GFP_HIGH. The reasoning is that ++ * a non-blocking caller may incur a more severe penalty ++ * if it cannot get memory quickly, particularly if it's ++ * also __GFP_HIGH. ++ */ ++ if (alloc_flags & ALLOC_HARDER) ++ min -= min / 4; ++ ++ /* ++ * OOM victims can try even harder than the normal reserve + * users on the grounds that it's definitely going to be in + * the exit path shortly and free memory. Any allocation it + * makes during the free path will be small and short-lived. + */ + if (alloc_flags & ALLOC_OOM) + min -= min / 2; +- else +- min -= min / 4; + } + + /* +-- +2.43.0 + diff --git a/queue-6.1/mm-page_alloc-explicitly-record-high-order-atomic-al.patch b/queue-6.1/mm-page_alloc-explicitly-record-high-order-atomic-al.patch new file mode 100644 index 00000000000..c6251419c41 --- /dev/null +++ b/queue-6.1/mm-page_alloc-explicitly-record-high-order-atomic-al.patch @@ -0,0 +1,124 @@ +From 69f11057cb93d04feb54dbe7ac271978e1263a69 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 13 Jan 2023 11:12:14 +0000 +Subject: mm/page_alloc: explicitly record high-order atomic allocations in + alloc_flags + +From: Mel Gorman + +[ Upstream commit eb2e2b425c6984ca8034448a3f2c680622bd3d4d ] + +A high-order ALLOC_HARDER allocation is assumed to be atomic. While that +is accurate, it changes later in the series. In preparation, explicitly +record high-order atomic allocations in gfp_to_alloc_flags(). 
+ +Link: https://lkml.kernel.org/r/20230113111217.14134-4-mgorman@techsingularity.net +Signed-off-by: Mel Gorman +Acked-by: Vlastimil Babka +Acked-by: Michal Hocko +Cc: Matthew Wilcox +Cc: NeilBrown +Cc: Thierry Reding +Signed-off-by: Andrew Morton +Stable-dep-of: 281dd25c1a01 ("mm/page_alloc: let GFP_ATOMIC order-0 allocs access highatomic reserves") +Signed-off-by: Sasha Levin +--- + mm/internal.h | 1 + + mm/page_alloc.c | 29 +++++++++++++++++++++++------ + 2 files changed, 24 insertions(+), 6 deletions(-) + +diff --git a/mm/internal.h b/mm/internal.h +index 1be79a5147549..f0f6198462cc1 100644 +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -765,6 +765,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, + #else + #define ALLOC_NOFRAGMENT 0x0 + #endif ++#define ALLOC_HIGHATOMIC 0x200 /* Allows access to MIGRATE_HIGHATOMIC */ + #define ALLOC_KSWAPD 0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */ + + enum ttu_flags; +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index e78ab23eb1743..8e1f4d779b26c 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -3713,10 +3713,20 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, + * reserved for high-order atomic allocation, so order-0 + * request should skip it. + */ +- if (order > 0 && alloc_flags & ALLOC_HARDER) ++ if (alloc_flags & ALLOC_HIGHATOMIC) + page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); + if (!page) { + page = __rmqueue(zone, order, migratetype, alloc_flags); ++ ++ /* ++ * If the allocation fails, allow OOM handling access ++ * to HIGHATOMIC reserves as failing now is worse than ++ * failing a high-order atomic allocation in the ++ * future. ++ */ ++ if (!page && (alloc_flags & ALLOC_OOM)) ++ page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); ++ + if (!page) { + spin_unlock_irqrestore(&zone->lock, flags); + return NULL; +@@ -4030,8 +4040,10 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, + return true; + } + #endif +- if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC)) ++ if ((alloc_flags & (ALLOC_HIGHATOMIC|ALLOC_OOM)) && ++ !free_area_empty(area, MIGRATE_HIGHATOMIC)) { + return true; ++ } + } + return false; + } +@@ -4293,7 +4305,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, + * If this is a high-order atomic allocation then check + * if the pageblock should be reserved for the future + */ +- if (unlikely(order && (alloc_flags & ALLOC_HARDER))) ++ if (unlikely(alloc_flags & ALLOC_HIGHATOMIC)) + reserve_highatomic_pageblock(page, zone, order); + + return page; +@@ -4820,7 +4832,7 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask, + } + + static inline unsigned int +-gfp_to_alloc_flags(gfp_t gfp_mask) ++gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order) + { + unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; + +@@ -4846,8 +4858,13 @@ gfp_to_alloc_flags(gfp_t gfp_mask) + * Not worth trying to allocate harder for __GFP_NOMEMALLOC even + * if it can't schedule. + */ +- if (!(gfp_mask & __GFP_NOMEMALLOC)) ++ if (!(gfp_mask & __GFP_NOMEMALLOC)) { + alloc_flags |= ALLOC_HARDER; ++ ++ if (order > 0) ++ alloc_flags |= ALLOC_HIGHATOMIC; ++ } ++ + /* + * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the + * comment for __cpuset_node_allowed(). +@@ -5056,7 +5073,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, + * kswapd needs to be woken up, and to avoid the cost of setting up + * alloc_flags precisely. So we do that now. 
+ */ +- alloc_flags = gfp_to_alloc_flags(gfp_mask); ++ alloc_flags = gfp_to_alloc_flags(gfp_mask, order); + + /* + * We need to recalculate the starting point for the zonelist iterator +-- +2.43.0 + diff --git a/queue-6.1/mm-page_alloc-let-gfp_atomic-order-0-allocs-access-h.patch b/queue-6.1/mm-page_alloc-let-gfp_atomic-order-0-allocs-access-h.patch new file mode 100644 index 00000000000..4f800cf8029 --- /dev/null +++ b/queue-6.1/mm-page_alloc-let-gfp_atomic-order-0-allocs-access-h.patch @@ -0,0 +1,88 @@ +From 035af24a1a0a452608fb425c6bd69b4d36c22548 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 11 Oct 2024 13:07:37 +0100 +Subject: mm/page_alloc: let GFP_ATOMIC order-0 allocs access highatomic + reserves + +From: Matt Fleming + +[ Upstream commit 281dd25c1a018261a04d1b8bf41a0674000bfe38 ] + +Under memory pressure it's possible for GFP_ATOMIC order-0 allocations to +fail even though free pages are available in the highatomic reserves. +GFP_ATOMIC allocations cannot trigger unreserve_highatomic_pageblock() +since it's only run from reclaim. + +Given that such allocations will pass the watermarks in +__zone_watermark_unusable_free(), it makes sense to fallback to highatomic +reserves the same way that ALLOC_OOM can. + +This fixes order-0 page allocation failures observed on Cloudflare's fleet +when handling network packets: + + kswapd1: page allocation failure: order:0, mode:0x820(GFP_ATOMIC), + nodemask=(null),cpuset=/,mems_allowed=0-7 + CPU: 10 PID: 696 Comm: kswapd1 Kdump: loaded Tainted: G O 6.6.43-CUSTOM #1 + Hardware name: MACHINE + Call Trace: + + dump_stack_lvl+0x3c/0x50 + warn_alloc+0x13a/0x1c0 + __alloc_pages_slowpath.constprop.0+0xc9d/0xd10 + __alloc_pages+0x327/0x340 + __napi_alloc_skb+0x16d/0x1f0 + bnxt_rx_page_skb+0x96/0x1b0 [bnxt_en] + bnxt_rx_pkt+0x201/0x15e0 [bnxt_en] + __bnxt_poll_work+0x156/0x2b0 [bnxt_en] + bnxt_poll+0xd9/0x1c0 [bnxt_en] + __napi_poll+0x2b/0x1b0 + bpf_trampoline_6442524138+0x7d/0x1000 + __napi_poll+0x5/0x1b0 + net_rx_action+0x342/0x740 + handle_softirqs+0xcf/0x2b0 + irq_exit_rcu+0x6c/0x90 + sysvec_apic_timer_interrupt+0x72/0x90 + + +[mfleming@cloudflare.com: update comment] + Link: https://lkml.kernel.org/r/20241015125158.3597702-1-matt@readmodwrite.com +Link: https://lkml.kernel.org/r/20241011120737.3300370-1-matt@readmodwrite.com +Link: https://lore.kernel.org/all/CAGis_TWzSu=P7QJmjD58WWiu3zjMTVKSzdOwWE8ORaGytzWJwQ@mail.gmail.com/ +Fixes: 1d91df85f399 ("mm/page_alloc: handle a missing case for memalloc_nocma_{save/restore} APIs") +Signed-off-by: Matt Fleming +Suggested-by: Vlastimil Babka +Reviewed-by: Vlastimil Babka +Cc: Mel Gorman +Cc: Michal Hocko +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Sasha Levin +--- + mm/page_alloc.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 49dc4ba88c278..b87b350b2f405 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -3719,12 +3719,12 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone, + page = __rmqueue(zone, order, migratetype, alloc_flags); + + /* +- * If the allocation fails, allow OOM handling access +- * to HIGHATOMIC reserves as failing now is worse than +- * failing a high-order atomic allocation in the +- * future. ++ * If the allocation fails, allow OOM handling and ++ * order-0 (atomic) allocs access to HIGHATOMIC ++ * reserves as failing now is worse than failing a ++ * high-order atomic allocation in the future. 
+ */ +- if (!page && (alloc_flags & ALLOC_OOM)) ++ if (!page && (alloc_flags & (ALLOC_OOM|ALLOC_NON_BLOCK))) + page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); + + if (!page) { +-- +2.43.0 + diff --git a/queue-6.1/mm-page_alloc-rename-alloc_high-to-alloc_min_reserve.patch b/queue-6.1/mm-page_alloc-rename-alloc_high-to-alloc_min_reserve.patch new file mode 100644 index 00000000000..d1704f41845 --- /dev/null +++ b/queue-6.1/mm-page_alloc-rename-alloc_high-to-alloc_min_reserve.patch @@ -0,0 +1,113 @@ +From e6ad0b3e024d77a33bb122f362f753202b75a30e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 13 Jan 2023 11:12:12 +0000 +Subject: mm/page_alloc: rename ALLOC_HIGH to ALLOC_MIN_RESERVE + +From: Mel Gorman + +[ Upstream commit 524c48072e5673f4511f1ad81493e2485863fd65 ] + +Patch series "Discard __GFP_ATOMIC", v3. + +Neil's patch has been residing in mm-unstable as commit 2fafb4fe8f7a ("mm: +discard __GFP_ATOMIC") for a long time and recently brought up again. +Most recently, I was worried that __GFP_HIGH allocations could use +high-order atomic reserves which is unintentional but there was no +response so lets revisit -- this series reworks how min reserves are used, +protects highorder reserves and then finishes with Neil's patch with very +minor modifications so it fits on top. + +There was a review discussion on renaming __GFP_DIRECT_RECLAIM to +__GFP_ALLOW_BLOCKING but I didn't think it was that big an issue and is +orthogonal to the removal of __GFP_ATOMIC. + +There were some concerns about how the gfp flags affect the min reserves +but it never reached a solid conclusion so I made my own attempt. + +The series tries to iron out some of the details on how reserves are used. +ALLOC_HIGH becomes ALLOC_MIN_RESERVE and ALLOC_HARDER becomes +ALLOC_NON_BLOCK and documents how the reserves are affected. For example, +ALLOC_NON_BLOCK (no direct reclaim) on its own allows 25% of the min +reserve. ALLOC_MIN_RESERVE (__GFP_HIGH) allows 50% and both combined +allows deeper access again. ALLOC_OOM allows access to 75%. + +High-order atomic allocations are explicitly handled with the caveat that +no __GFP_ATOMIC flag means that any high-order allocation that specifies +GFP_HIGH and cannot enter direct reclaim will be treated as if it was +GFP_ATOMIC. + +This patch (of 6): + +__GFP_HIGH aliases to ALLOC_HIGH but the name does not really hint what it +means. As ALLOC_HIGH is internal to the allocator, rename it to +ALLOC_MIN_RESERVE to document that the min reserves can be depleted. + +Link: https://lkml.kernel.org/r/20230113111217.14134-1-mgorman@techsingularity.net +Link: https://lkml.kernel.org/r/20230113111217.14134-2-mgorman@techsingularity.net +Signed-off-by: Mel Gorman +Acked-by: Vlastimil Babka +Acked-by: Michal Hocko +Cc: Matthew Wilcox +Cc: NeilBrown +Cc: Thierry Reding +Signed-off-by: Andrew Morton +Stable-dep-of: 281dd25c1a01 ("mm/page_alloc: let GFP_ATOMIC order-0 allocs access highatomic reserves") +Signed-off-by: Sasha Levin +--- + mm/internal.h | 4 +++- + mm/page_alloc.c | 8 ++++---- + 2 files changed, 7 insertions(+), 5 deletions(-) + +diff --git a/mm/internal.h b/mm/internal.h +index d01130efce5fb..1be79a5147549 100644 +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -755,7 +755,9 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, + #endif + + #define ALLOC_HARDER 0x10 /* try to alloc harder */ +-#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ ++#define ALLOC_MIN_RESERVE 0x20 /* __GFP_HIGH set. Allow access to 50% ++ * of the min watermark. 
++ */ + #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ + #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ + #ifdef CONFIG_ZONE_DMA32 +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index a905b850d31c4..f5b870780d3fd 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -3983,7 +3983,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, + /* free_pages may go negative - that's OK */ + free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags); + +- if (alloc_flags & ALLOC_HIGH) ++ if (alloc_flags & ALLOC_MIN_RESERVE) + min -= min / 2; + + if (unlikely(alloc_harder)) { +@@ -4825,18 +4825,18 @@ gfp_to_alloc_flags(gfp_t gfp_mask) + unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; + + /* +- * __GFP_HIGH is assumed to be the same as ALLOC_HIGH ++ * __GFP_HIGH is assumed to be the same as ALLOC_MIN_RESERVE + * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD + * to save two branches. + */ +- BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); ++ BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_MIN_RESERVE); + BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD); + + /* + * The caller may dip into page reserves a bit more if the caller + * cannot run direct reclaim, or if the caller has realtime scheduling + * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will +- * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH). ++ * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_MIN_RESERVE(__GFP_HIGH). + */ + alloc_flags |= (__force int) + (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM)); +-- +2.43.0 + diff --git a/queue-6.1/mm-page_alloc-treat-rt-tasks-similar-to-__gfp_high.patch b/queue-6.1/mm-page_alloc-treat-rt-tasks-similar-to-__gfp_high.patch new file mode 100644 index 00000000000..98cd1f08895 --- /dev/null +++ b/queue-6.1/mm-page_alloc-treat-rt-tasks-similar-to-__gfp_high.patch @@ -0,0 +1,55 @@ +From 5fc910982082f797aee07e26aceeec356048aab5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 13 Jan 2023 11:12:13 +0000 +Subject: mm/page_alloc: treat RT tasks similar to __GFP_HIGH + +From: Mel Gorman + +[ Upstream commit c988dcbecf3fd5430921eaa3fe9054754f76d185 ] + +RT tasks are allowed to dip below the min reserve but ALLOC_HARDER is +typically combined with ALLOC_MIN_RESERVE so RT tasks are a little +unusual. While there is some justification for allowing RT tasks access +to memory reserves, there is a strong chance that a RT task that is also +under memory pressure is at risk of missing deadlines anyway. Relax how +much reserves an RT task can access by treating it the same as __GFP_HIGH +allocations. + +Note that in a future kernel release that the RT special casing will be +removed. Hard realtime tasks should be locking down resources in advance +and ensuring enough memory is available. Even a soft-realtime task like +audio or video live decoding which cannot jitter should be allocating both +memory and any disk space required up-front before the recording starts +instead of relying on reserves. At best, reserve access will only delay +the problem by a very short interval. 
+ +Link: https://lkml.kernel.org/r/20230113111217.14134-3-mgorman@techsingularity.net +Signed-off-by: Mel Gorman +Acked-by: Vlastimil Babka +Acked-by: Michal Hocko +Cc: Matthew Wilcox +Cc: NeilBrown +Cc: Thierry Reding +Signed-off-by: Andrew Morton +Stable-dep-of: 281dd25c1a01 ("mm/page_alloc: let GFP_ATOMIC order-0 allocs access highatomic reserves") +Signed-off-by: Sasha Levin +--- + mm/page_alloc.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index f5b870780d3fd..e78ab23eb1743 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -4854,7 +4854,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) + */ + alloc_flags &= ~ALLOC_CPUSET; + } else if (unlikely(rt_task(current)) && in_task()) +- alloc_flags |= ALLOC_HARDER; ++ alloc_flags |= ALLOC_MIN_RESERVE; + + alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags); + +-- +2.43.0 + diff --git a/queue-6.1/nvmet-auth-assign-dh_key-to-null-after-kfree_sensiti.patch b/queue-6.1/nvmet-auth-assign-dh_key-to-null-after-kfree_sensiti.patch new file mode 100644 index 00000000000..07624993101 --- /dev/null +++ b/queue-6.1/nvmet-auth-assign-dh_key-to-null-after-kfree_sensiti.patch @@ -0,0 +1,41 @@ +From 60268a8bc1d37e87324ab48e3fa7c47d3a7306b7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 16 Sep 2024 22:41:37 +0500 +Subject: nvmet-auth: assign dh_key to NULL after kfree_sensitive + +From: Vitaliy Shevtsov + +[ Upstream commit d2f551b1f72b4c508ab9298419f6feadc3b5d791 ] + +ctrl->dh_key might be used across multiple calls to nvmet_setup_dhgroup() +for the same controller. So it's better to nullify it after release on +error path in order to avoid double free later in nvmet_destroy_auth(). + +Found by Linux Verification Center (linuxtesting.org) with Svace. + +Fixes: 7a277c37d352 ("nvmet-auth: Diffie-Hellman key exchange support") +Cc: stable@vger.kernel.org +Signed-off-by: Vitaliy Shevtsov +Reviewed-by: Christoph Hellwig +Reviewed-by: Hannes Reinecke +Signed-off-by: Keith Busch +Signed-off-by: Sasha Levin +--- + drivers/nvme/target/auth.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c +index aacc05ec00c2b..74791078fdebc 100644 +--- a/drivers/nvme/target/auth.c ++++ b/drivers/nvme/target/auth.c +@@ -101,6 +101,7 @@ int nvmet_setup_dhgroup(struct nvmet_ctrl *ctrl, u8 dhgroup_id) + pr_debug("%s: ctrl %d failed to generate private key, err %d\n", + __func__, ctrl->cntlid, ret); + kfree_sensitive(ctrl->dh_key); ++ ctrl->dh_key = NULL; + return ret; + } + ctrl->dh_keysize = crypto_kpp_maxsize(ctrl->dh_tfm); +-- +2.43.0 + diff --git a/queue-6.1/ocfs2-pass-u64-to-ocfs2_truncate_inline-maybe-overfl.patch b/queue-6.1/ocfs2-pass-u64-to-ocfs2_truncate_inline-maybe-overfl.patch new file mode 100644 index 00000000000..bfad203f1a9 --- /dev/null +++ b/queue-6.1/ocfs2-pass-u64-to-ocfs2_truncate_inline-maybe-overfl.patch @@ -0,0 +1,60 @@ +From e385b2a0a317a67940c499f3891df7a28f222d5a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 16 Oct 2024 19:43:47 +0800 +Subject: ocfs2: pass u64 to ocfs2_truncate_inline maybe overflow + +From: Edward Adam Davis + +[ Upstream commit bc0a2f3a73fcdac651fca64df39306d1e5ebe3b0 ] + +Syzbot reported a kernel BUG in ocfs2_truncate_inline. There are two +reasons for this: first, the parameter value passed is greater than +ocfs2_max_inline_data_with_xattr, second, the start and end parameters of +ocfs2_truncate_inline are "unsigned int". 
+ +So, we need to add a sanity check for byte_start and byte_len right before +ocfs2_truncate_inline() in ocfs2_remove_inode_range(), if they are greater +than ocfs2_max_inline_data_with_xattr return -EINVAL. + +Link: https://lkml.kernel.org/r/tencent_D48DB5122ADDAEDDD11918CFB68D93258C07@qq.com +Fixes: 1afc32b95233 ("ocfs2: Write support for inline data") +Signed-off-by: Edward Adam Davis +Reported-by: syzbot+81092778aac03460d6b7@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=81092778aac03460d6b7 +Reviewed-by: Joseph Qi +Cc: Joel Becker +Cc: Joseph Qi +Cc: Mark Fasheh +Cc: Junxiao Bi +Cc: Changwei Ge +Cc: Gang He +Cc: Jun Piao +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Sasha Levin +--- + fs/ocfs2/file.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c +index f502bb2ce2ea7..ea7c79e8ce429 100644 +--- a/fs/ocfs2/file.c ++++ b/fs/ocfs2/file.c +@@ -1784,6 +1784,14 @@ int ocfs2_remove_inode_range(struct inode *inode, + return 0; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { ++ int id_count = ocfs2_max_inline_data_with_xattr(inode->i_sb, di); ++ ++ if (byte_start > id_count || byte_start + byte_len > id_count) { ++ ret = -EINVAL; ++ mlog_errno(ret); ++ goto out; ++ } ++ + ret = ocfs2_truncate_inline(inode, di_bh, byte_start, + byte_start + byte_len, 0); + if (ret) { +-- +2.43.0 + diff --git a/queue-6.1/riscv-efi-set-nx-compat-flag-in-pe-coff-header.patch b/queue-6.1/riscv-efi-set-nx-compat-flag-in-pe-coff-header.patch new file mode 100644 index 00000000000..7543ccf5124 --- /dev/null +++ b/queue-6.1/riscv-efi-set-nx-compat-flag-in-pe-coff-header.patch @@ -0,0 +1,48 @@ +From 253cd32230547e4e3a73363d58169af99d480326 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 29 Sep 2024 16:02:33 +0200 +Subject: riscv: efi: Set NX compat flag in PE/COFF header + +From: Heinrich Schuchardt + +[ Upstream commit d41373a4b910961df5a5e3527d7bde6ad45ca438 ] + +The IMAGE_DLLCHARACTERISTICS_NX_COMPAT informs the firmware that the +EFI binary does not rely on pages that are both executable and +writable. + +The flag is used by some distro versions of GRUB to decide if the EFI +binary may be executed. + +As the Linux kernel neither has RWX sections nor needs RWX pages for +relocation we should set the flag. 
+ +Cc: Ard Biesheuvel +Cc: +Signed-off-by: Heinrich Schuchardt +Reviewed-by: Emil Renner Berthing +Fixes: cb7d2dd5612a ("RISC-V: Add PE/COFF header for EFI stub") +Acked-by: Ard Biesheuvel +Link: https://lore.kernel.org/r/20240929140233.211800-1-heinrich.schuchardt@canonical.com +Signed-off-by: Palmer Dabbelt +Signed-off-by: Sasha Levin +--- + arch/riscv/kernel/efi-header.S | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/arch/riscv/kernel/efi-header.S b/arch/riscv/kernel/efi-header.S +index 8e733aa48ba6c..c306f3a6a800e 100644 +--- a/arch/riscv/kernel/efi-header.S ++++ b/arch/riscv/kernel/efi-header.S +@@ -59,7 +59,7 @@ extra_header_fields: + .long efi_header_end - _start // SizeOfHeaders + .long 0 // CheckSum + .short IMAGE_SUBSYSTEM_EFI_APPLICATION // Subsystem +- .short 0 // DllCharacteristics ++ .short IMAGE_DLL_CHARACTERISTICS_NX_COMPAT // DllCharacteristics + .quad 0 // SizeOfStackReserve + .quad 0 // SizeOfStackCommit + .quad 0 // SizeOfHeapReserve +-- +2.43.0 + diff --git a/queue-6.1/riscv-remove-duplicated-get_rm.patch b/queue-6.1/riscv-remove-duplicated-get_rm.patch new file mode 100644 index 00000000000..22e43e35349 --- /dev/null +++ b/queue-6.1/riscv-remove-duplicated-get_rm.patch @@ -0,0 +1,38 @@ +From 7c946b69fbe00d6a7ea385b4e627abd569037584 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 8 Oct 2024 17:41:39 +0800 +Subject: riscv: Remove duplicated GET_RM + +From: Chunyan Zhang + +[ Upstream commit 164f66de6bb6ef454893f193c898dc8f1da6d18b ] + +The macro GET_RM defined twice in this file, one can be removed. + +Reviewed-by: Alexandre Ghiti +Signed-off-by: Chunyan Zhang +Fixes: 956d705dd279 ("riscv: Unaligned load/store handling for M_MODE") +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20241008094141.549248-3-zhangchunyan@iscas.ac.cn +Signed-off-by: Palmer Dabbelt +Signed-off-by: Sasha Levin +--- + arch/riscv/kernel/traps_misaligned.c | 2 -- + 1 file changed, 2 deletions(-) + +diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c +index 5348d842c7453..3d16cc803220e 100644 +--- a/arch/riscv/kernel/traps_misaligned.c ++++ b/arch/riscv/kernel/traps_misaligned.c +@@ -132,8 +132,6 @@ + #define REG_PTR(insn, pos, regs) \ + (ulong *)((ulong)(regs) + REG_OFFSET(insn, pos)) + +-#define GET_RM(insn) (((insn) >> 12) & 7) +- + #define GET_RS1(insn, regs) (*REG_PTR(insn, SH_RS1, regs)) + #define GET_RS2(insn, regs) (*REG_PTR(insn, SH_RS2, regs)) + #define GET_RS1S(insn, regs) (*REG_PTR(RVC_RS1S(insn), 0, regs)) +-- +2.43.0 + diff --git a/queue-6.1/riscv-remove-unused-generating_asm_offsets.patch b/queue-6.1/riscv-remove-unused-generating_asm_offsets.patch new file mode 100644 index 00000000000..aeb7a726527 --- /dev/null +++ b/queue-6.1/riscv-remove-unused-generating_asm_offsets.patch @@ -0,0 +1,44 @@ +From c50a27b625ad9865ab4d7c4464b650f3309d5ba8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 8 Oct 2024 17:41:38 +0800 +Subject: riscv: Remove unused GENERATING_ASM_OFFSETS + +From: Chunyan Zhang + +[ Upstream commit 46d4e5ac6f2f801f97bcd0ec82365969197dc9b1 ] + +The macro is not used in the current version of kernel, it looks like +can be removed to avoid a build warning: + +../arch/riscv/kernel/asm-offsets.c: At top level: +../arch/riscv/kernel/asm-offsets.c:7: warning: macro "GENERATING_ASM_OFFSETS" is not used [-Wunused-macros] + 7 | #define GENERATING_ASM_OFFSETS + +Fixes: 9639a44394b9 ("RISC-V: Provide a cleaner raw_smp_processor_id()") +Cc: stable@vger.kernel.org +Reviewed-by: Alexandre Ghiti 
+Tested-by: Alexandre Ghiti +Signed-off-by: Chunyan Zhang +Link: https://lore.kernel.org/r/20241008094141.549248-2-zhangchunyan@iscas.ac.cn +Signed-off-by: Palmer Dabbelt +Signed-off-by: Sasha Levin +--- + arch/riscv/kernel/asm-offsets.c | 2 -- + 1 file changed, 2 deletions(-) + +diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c +index df9444397908d..1ecafbcee9a0a 100644 +--- a/arch/riscv/kernel/asm-offsets.c ++++ b/arch/riscv/kernel/asm-offsets.c +@@ -4,8 +4,6 @@ + * Copyright (C) 2017 SiFive + */ + +-#define GENERATING_ASM_OFFSETS +- + #include + #include + #include +-- +2.43.0 + diff --git a/queue-6.1/riscv-use-u-to-format-the-output-of-cpu.patch b/queue-6.1/riscv-use-u-to-format-the-output-of-cpu.patch new file mode 100644 index 00000000000..919d0937494 --- /dev/null +++ b/queue-6.1/riscv-use-u-to-format-the-output-of-cpu.patch @@ -0,0 +1,43 @@ +From ff7ce41d5795e9e45aae3d280bfb411d40248dca Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 17 Oct 2024 11:20:10 +0800 +Subject: riscv: Use '%u' to format the output of 'cpu' + +From: WangYuli + +[ Upstream commit e0872ab72630dada3ae055bfa410bf463ff1d1e0 ] + +'cpu' is an unsigned integer, so its conversion specifier should +be %u, not %d. + +Suggested-by: Wentao Guan +Suggested-by: Maciej W. Rozycki +Link: https://lore.kernel.org/all/alpine.DEB.2.21.2409122309090.40372@angie.orcam.me.uk/ +Signed-off-by: WangYuli +Reviewed-by: Charlie Jenkins +Tested-by: Charlie Jenkins +Fixes: f1e58583b9c7 ("RISC-V: Support cpu hotplug") +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/4C127DEECDA287C8+20241017032010.96772-1-wangyuli@uniontech.com +Signed-off-by: Palmer Dabbelt +Signed-off-by: Sasha Levin +--- + arch/riscv/kernel/cpu-hotplug.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/arch/riscv/kernel/cpu-hotplug.c b/arch/riscv/kernel/cpu-hotplug.c +index f7a832e3a1d1d..462b3631663f9 100644 +--- a/arch/riscv/kernel/cpu-hotplug.c ++++ b/arch/riscv/kernel/cpu-hotplug.c +@@ -65,7 +65,7 @@ void __cpu_die(unsigned int cpu) + if (cpu_ops[cpu]->cpu_is_stopped) + ret = cpu_ops[cpu]->cpu_is_stopped(cpu); + if (ret) +- pr_warn("CPU%d may not have stopped: %d\n", cpu, ret); ++ pr_warn("CPU%u may not have stopped: %d\n", cpu, ret); + } + + /* +-- +2.43.0 + diff --git a/queue-6.1/riscv-vdso-prevent-the-compiler-from-inserting-calls.patch b/queue-6.1/riscv-vdso-prevent-the-compiler-from-inserting-calls.patch new file mode 100644 index 00000000000..c7bce24f738 --- /dev/null +++ b/queue-6.1/riscv-vdso-prevent-the-compiler-from-inserting-calls.patch @@ -0,0 +1,40 @@ +From d06392f489cede0c9e276cf66ff324f9b61a1157 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 16 Oct 2024 10:36:24 +0200 +Subject: riscv: vdso: Prevent the compiler from inserting calls to memset() + +From: Alexandre Ghiti + +[ Upstream commit bf40167d54d55d4b54d0103713d86a8638fb9290 ] + +The compiler is smart enough to insert a call to memset() in +riscv_vdso_get_cpus(), which generates a dynamic relocation. + +So prevent this by using -fno-builtin option. 
+ +Fixes: e2c0cdfba7f6 ("RISC-V: User-facing API") +Cc: stable@vger.kernel.org +Signed-off-by: Alexandre Ghiti +Reviewed-by: Guo Ren +Link: https://lore.kernel.org/r/20241016083625.136311-2-alexghiti@rivosinc.com +Signed-off-by: Palmer Dabbelt +Signed-off-by: Sasha Levin +--- + arch/riscv/kernel/vdso/Makefile | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile +index 06e6b27f3bcc9..c1b68f962bada 100644 +--- a/arch/riscv/kernel/vdso/Makefile ++++ b/arch/riscv/kernel/vdso/Makefile +@@ -18,6 +18,7 @@ obj-vdso = $(patsubst %, %.o, $(vdso-syms)) note.o + + ccflags-y := -fno-stack-protector + ccflags-y += -DDISABLE_BRANCH_PROFILING ++ccflags-y += -fno-builtin + + ifneq ($(c-gettimeofday-y),) + CFLAGS_vgettimeofday.o += -fPIC -include $(c-gettimeofday-y) +-- +2.43.0 + diff --git a/queue-6.1/series b/queue-6.1/series index 6e0e48c5bba..a9b3fb705b1 100644 --- a/queue-6.1/series +++ b/queue-6.1/series @@ -77,3 +77,37 @@ staging-iio-frequency-ad9832-fix-division-by-zero-in-ad9832_calc_freqreg.patch iio-adc-ad7124-fix-division-by-zero-in-ad7124_set_channel_odr.patch iio-light-veml6030-fix-microlux-value-calculation.patch nilfs2-fix-potential-deadlock-with-newly-created-symlinks.patch +block-fix-sanity-checks-in-blk_rq_map_user_bvec.patch +cgroup-bpf-use-a-dedicated-workqueue-for-cgroup-bpf-.patch +riscv-vdso-prevent-the-compiler-from-inserting-calls.patch +alsa-hda-realtek-limit-internal-mic-boost-on-dell-pl.patch +riscv-efi-set-nx-compat-flag-in-pe-coff-header.patch +riscv-use-u-to-format-the-output-of-cpu.patch +riscv-remove-unused-generating_asm_offsets.patch +riscv-remove-duplicated-get_rm.patch +cxl-acpi-move-rescan-to-the-workqueue.patch +cxl-port-fix-cxl_bus_rescan-vs-bus_rescan_devices.patch +mm-page_alloc-rename-alloc_high-to-alloc_min_reserve.patch +mm-page_alloc-treat-rt-tasks-similar-to-__gfp_high.patch +mm-page_alloc-explicitly-record-high-order-atomic-al.patch +mm-page_alloc-explicitly-define-what-alloc-flags-dep.patch +mm-page_alloc-explicitly-define-how-__gfp_high-non-b.patch +mm-page_alloc-let-gfp_atomic-order-0-allocs-access-h.patch +ocfs2-pass-u64-to-ocfs2_truncate_inline-maybe-overfl.patch +mctp-i2c-handle-null-header-address.patch +alsa-hda-realtek-fix-headset-mic-on-tuxedo-stellaris.patch +nvmet-auth-assign-dh_key-to-null-after-kfree_sensiti.patch +kasan-remove-vmalloc_percpu-test.patch +io_uring-rename-kiocb_end_write-local-helper.patch +fs-create-kiocb_-start-end-_write-helpers.patch +io_uring-use-kiocb_-start-end-_write-helpers.patch +io_uring-rw-fix-missing-nowait-check-for-o_direct-st.patch +mm-migrate-try-again-if-thp-split-is-failed-due-to-p.patch +migrate-convert-unmap_and_move-to-use-folios.patch +migrate-convert-migrate_pages-to-use-folios.patch +mm-migrate.c-stop-using-0-as-null-pointer.patch +migrate_pages-organize-stats-with-struct-migrate_pag.patch +migrate_pages-separate-hugetlb-folios-migration.patch +migrate_pages-restrict-number-of-pages-to-migrate-in.patch +migrate_pages-split-unmap_and_move-to-_unmap-and-_mo.patch +vmscan-migrate-fix-page-count-imbalance-on-node-stat.patch diff --git a/queue-6.1/vmscan-migrate-fix-page-count-imbalance-on-node-stat.patch b/queue-6.1/vmscan-migrate-fix-page-count-imbalance-on-node-stat.patch new file mode 100644 index 00000000000..aa73f5d38c9 --- /dev/null +++ b/queue-6.1/vmscan-migrate-fix-page-count-imbalance-on-node-stat.patch @@ -0,0 +1,75 @@ +From 1f94221149da9099afe301073304fc26a2ec2ac6 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 25 Oct 2024 
10:17:24 -0400 +Subject: vmscan,migrate: fix page count imbalance on node stats when demoting + pages + +From: Gregory Price + +[ Upstream commit 35e41024c4c2b02ef8207f61b9004f6956cf037b ] + +When numa balancing is enabled with demotion, vmscan will call +migrate_pages when shrinking LRUs. migrate_pages will decrement the +the node's isolated page count, leading to an imbalanced count when +invoked from (MG)LRU code. + +The result is dmesg output like such: + +$ cat /proc/sys/vm/stat_refresh + +[77383.088417] vmstat_refresh: nr_isolated_anon -103212 +[77383.088417] vmstat_refresh: nr_isolated_file -899642 + +This negative value may impact compaction and reclaim throttling. + +The following path produces the decrement: + +shrink_folio_list + demote_folio_list + migrate_pages + migrate_pages_batch + migrate_folio_move + migrate_folio_done + mod_node_page_state(-ve) <- decrement + +This path happens for SUCCESSFUL migrations, not failures. Typically +callers to migrate_pages are required to handle putback/accounting for +failures, but this is already handled in the shrink code. + +When accounting for migrations, instead do not decrement the count when +the migration reason is MR_DEMOTION. As of v6.11, this demotion logic +is the only source of MR_DEMOTION. + +Link: https://lkml.kernel.org/r/20241025141724.17927-1-gourry@gourry.net +Fixes: 26aa2d199d6f ("mm/migrate: demote pages during reclaim") +Signed-off-by: Gregory Price +Reviewed-by: Yang Shi +Reviewed-by: Davidlohr Bueso +Reviewed-by: Shakeel Butt +Reviewed-by: "Huang, Ying" +Reviewed-by: Oscar Salvador +Cc: Dave Hansen +Cc: Wei Xu +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Sasha Levin +--- + mm/migrate.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/mm/migrate.c b/mm/migrate.c +index 46a1476e188c3..9ff5d77b61a3e 100644 +--- a/mm/migrate.c ++++ b/mm/migrate.c +@@ -1044,7 +1044,7 @@ static void migrate_folio_done(struct folio *src, + * not accounted to NR_ISOLATED_*. They can be recognized + * as __PageMovable + */ +- if (likely(!__folio_test_movable(src))) ++ if (likely(!__folio_test_movable(src)) && reason != MR_DEMOTION) + mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON + + folio_is_file_lru(src), -folio_nr_pages(src)); + +-- +2.43.0 + -- 2.47.3