git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
Fixes for 6.1
author Sasha Levin <sashal@kernel.org>
Wed, 6 Nov 2024 01:54:05 +0000 (20:54 -0500)
committer Sasha Levin <sashal@kernel.org>
Wed, 6 Nov 2024 01:55:02 +0000 (20:55 -0500)
Signed-off-by: Sasha Levin <sashal@kernel.org>
35 files changed:
queue-6.1/alsa-hda-realtek-fix-headset-mic-on-tuxedo-stellaris.patch [new file with mode: 0644]
queue-6.1/alsa-hda-realtek-limit-internal-mic-boost-on-dell-pl.patch [new file with mode: 0644]
queue-6.1/block-fix-sanity-checks-in-blk_rq_map_user_bvec.patch [new file with mode: 0644]
queue-6.1/cgroup-bpf-use-a-dedicated-workqueue-for-cgroup-bpf-.patch [new file with mode: 0644]
queue-6.1/cxl-acpi-move-rescan-to-the-workqueue.patch [new file with mode: 0644]
queue-6.1/cxl-port-fix-cxl_bus_rescan-vs-bus_rescan_devices.patch [new file with mode: 0644]
queue-6.1/fs-create-kiocb_-start-end-_write-helpers.patch [new file with mode: 0644]
queue-6.1/io_uring-rename-kiocb_end_write-local-helper.patch [new file with mode: 0644]
queue-6.1/io_uring-rw-fix-missing-nowait-check-for-o_direct-st.patch [new file with mode: 0644]
queue-6.1/io_uring-use-kiocb_-start-end-_write-helpers.patch [new file with mode: 0644]
queue-6.1/kasan-remove-vmalloc_percpu-test.patch [new file with mode: 0644]
queue-6.1/mctp-i2c-handle-null-header-address.patch [new file with mode: 0644]
queue-6.1/migrate-convert-migrate_pages-to-use-folios.patch [new file with mode: 0644]
queue-6.1/migrate-convert-unmap_and_move-to-use-folios.patch [new file with mode: 0644]
queue-6.1/migrate_pages-organize-stats-with-struct-migrate_pag.patch [new file with mode: 0644]
queue-6.1/migrate_pages-restrict-number-of-pages-to-migrate-in.patch [new file with mode: 0644]
queue-6.1/migrate_pages-separate-hugetlb-folios-migration.patch [new file with mode: 0644]
queue-6.1/migrate_pages-split-unmap_and_move-to-_unmap-and-_mo.patch [new file with mode: 0644]
queue-6.1/mm-migrate-try-again-if-thp-split-is-failed-due-to-p.patch [new file with mode: 0644]
queue-6.1/mm-migrate.c-stop-using-0-as-null-pointer.patch [new file with mode: 0644]
queue-6.1/mm-page_alloc-explicitly-define-how-__gfp_high-non-b.patch [new file with mode: 0644]
queue-6.1/mm-page_alloc-explicitly-define-what-alloc-flags-dep.patch [new file with mode: 0644]
queue-6.1/mm-page_alloc-explicitly-record-high-order-atomic-al.patch [new file with mode: 0644]
queue-6.1/mm-page_alloc-let-gfp_atomic-order-0-allocs-access-h.patch [new file with mode: 0644]
queue-6.1/mm-page_alloc-rename-alloc_high-to-alloc_min_reserve.patch [new file with mode: 0644]
queue-6.1/mm-page_alloc-treat-rt-tasks-similar-to-__gfp_high.patch [new file with mode: 0644]
queue-6.1/nvmet-auth-assign-dh_key-to-null-after-kfree_sensiti.patch [new file with mode: 0644]
queue-6.1/ocfs2-pass-u64-to-ocfs2_truncate_inline-maybe-overfl.patch [new file with mode: 0644]
queue-6.1/riscv-efi-set-nx-compat-flag-in-pe-coff-header.patch [new file with mode: 0644]
queue-6.1/riscv-remove-duplicated-get_rm.patch [new file with mode: 0644]
queue-6.1/riscv-remove-unused-generating_asm_offsets.patch [new file with mode: 0644]
queue-6.1/riscv-use-u-to-format-the-output-of-cpu.patch [new file with mode: 0644]
queue-6.1/riscv-vdso-prevent-the-compiler-from-inserting-calls.patch [new file with mode: 0644]
queue-6.1/series
queue-6.1/vmscan-migrate-fix-page-count-imbalance-on-node-stat.patch [new file with mode: 0644]

diff --git a/queue-6.1/alsa-hda-realtek-fix-headset-mic-on-tuxedo-stellaris.patch b/queue-6.1/alsa-hda-realtek-fix-headset-mic-on-tuxedo-stellaris.patch
new file mode 100644 (file)
index 0000000..b43584e
--- /dev/null
@@ -0,0 +1,36 @@
+From 0e01a20897e4404df3b1eaa37e3b37f829ab4363 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 29 Oct 2024 16:16:53 +0100
+Subject: ALSA: hda/realtek: Fix headset mic on TUXEDO Stellaris 16 Gen6 mb1
+
+From: Christoffer Sandberg <cs@tuxedo.de>
+
+[ Upstream commit e49370d769e71456db3fbd982e95bab8c69f73e8 ]
+
+Quirk is needed to enable headset microphone on missing pin 0x19.
+
+Signed-off-by: Christoffer Sandberg <cs@tuxedo.de>
+Signed-off-by: Werner Sembach <wse@tuxedocomputers.com>
+Cc: <stable@vger.kernel.org>
+Link: https://patch.msgid.link/20241029151653.80726-2-wse@tuxedocomputers.com
+Signed-off-by: Takashi Iwai <tiwai@suse.de>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ sound/pci/hda/patch_realtek.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
+index 3cbd9cf80be96..d750c6e6eb984 100644
+--- a/sound/pci/hda/patch_realtek.c
++++ b/sound/pci/hda/patch_realtek.c
+@@ -10214,6 +10214,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = {
+       SND_PCI_QUIRK(0x1d05, 0x115c, "TongFang GMxTGxx", ALC269_FIXUP_NO_SHUTUP),
+       SND_PCI_QUIRK(0x1d05, 0x121b, "TongFang GMxAGxx", ALC269_FIXUP_NO_SHUTUP),
+       SND_PCI_QUIRK(0x1d05, 0x1387, "TongFang GMxIXxx", ALC2XX_FIXUP_HEADSET_MIC),
++      SND_PCI_QUIRK(0x1d05, 0x1409, "TongFang GMxIXxx", ALC2XX_FIXUP_HEADSET_MIC),
+       SND_PCI_QUIRK(0x1d17, 0x3288, "Haier Boyue G42", ALC269VC_FIXUP_ACER_VCOPPERBOX_PINS),
+       SND_PCI_QUIRK(0x1d72, 0x1602, "RedmiBook", ALC255_FIXUP_XIAOMI_HEADSET_MIC),
+       SND_PCI_QUIRK(0x1d72, 0x1701, "XiaomiNotebook Pro", ALC298_FIXUP_DELL1_MIC_NO_PRESENCE),
+-- 
+2.43.0
+
diff --git a/queue-6.1/alsa-hda-realtek-limit-internal-mic-boost-on-dell-pl.patch b/queue-6.1/alsa-hda-realtek-limit-internal-mic-boost-on-dell-pl.patch
new file mode 100644 (file)
index 0000000..214d60f
--- /dev/null
@@ -0,0 +1,97 @@
+From cdd6f79946b70304f691527e0efecb41c8c114d3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 18 Oct 2024 13:53:24 +0800
+Subject: ALSA: hda/realtek: Limit internal Mic boost on Dell platform
+
+From: Kailang Yang <kailang@realtek.com>
+
+[ Upstream commit 78e7be018784934081afec77f96d49a2483f9188 ]
+
+Dell wants to limit internal Mic boost on all Dell platforms.
+
+Signed-off-by: Kailang Yang <kailang@realtek.com>
+Cc: <stable@vger.kernel.org>
+Link: https://lore.kernel.org/561fc5f5eff04b6cbd79ed173cd1c1db@realtek.com
+Signed-off-by: Takashi Iwai <tiwai@suse.de>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ sound/pci/hda/patch_realtek.c | 21 ++++++++++++++++++---
+ 1 file changed, 18 insertions(+), 3 deletions(-)
+
+diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
+index a8bc95ffa41a3..3cbd9cf80be96 100644
+--- a/sound/pci/hda/patch_realtek.c
++++ b/sound/pci/hda/patch_realtek.c
+@@ -7159,6 +7159,7 @@ enum {
+       ALC286_FIXUP_SONY_MIC_NO_PRESENCE,
+       ALC269_FIXUP_PINCFG_NO_HP_TO_LINEOUT,
+       ALC269_FIXUP_DELL1_MIC_NO_PRESENCE,
++      ALC269_FIXUP_DELL1_LIMIT_INT_MIC_BOOST,
+       ALC269_FIXUP_DELL2_MIC_NO_PRESENCE,
+       ALC269_FIXUP_DELL3_MIC_NO_PRESENCE,
+       ALC269_FIXUP_DELL4_MIC_NO_PRESENCE,
+@@ -7193,6 +7194,7 @@ enum {
+       ALC255_FIXUP_ACER_MIC_NO_PRESENCE,
+       ALC255_FIXUP_ASUS_MIC_NO_PRESENCE,
+       ALC255_FIXUP_DELL1_MIC_NO_PRESENCE,
++      ALC255_FIXUP_DELL1_LIMIT_INT_MIC_BOOST,
+       ALC255_FIXUP_DELL2_MIC_NO_PRESENCE,
+       ALC255_FIXUP_HEADSET_MODE,
+       ALC255_FIXUP_HEADSET_MODE_NO_HP_MIC,
+@@ -7658,6 +7660,12 @@ static const struct hda_fixup alc269_fixups[] = {
+               .chained = true,
+               .chain_id = ALC269_FIXUP_HEADSET_MODE
+       },
++      [ALC269_FIXUP_DELL1_LIMIT_INT_MIC_BOOST] = {
++              .type = HDA_FIXUP_FUNC,
++              .v.func = alc269_fixup_limit_int_mic_boost,
++              .chained = true,
++              .chain_id = ALC269_FIXUP_DELL1_MIC_NO_PRESENCE
++      },
+       [ALC269_FIXUP_DELL2_MIC_NO_PRESENCE] = {
+               .type = HDA_FIXUP_PINS,
+               .v.pins = (const struct hda_pintbl[]) {
+@@ -7938,6 +7946,12 @@ static const struct hda_fixup alc269_fixups[] = {
+               .chained = true,
+               .chain_id = ALC255_FIXUP_HEADSET_MODE
+       },
++      [ALC255_FIXUP_DELL1_LIMIT_INT_MIC_BOOST] = {
++              .type = HDA_FIXUP_FUNC,
++              .v.func = alc269_fixup_limit_int_mic_boost,
++              .chained = true,
++              .chain_id = ALC255_FIXUP_DELL1_MIC_NO_PRESENCE
++      },
+       [ALC255_FIXUP_DELL2_MIC_NO_PRESENCE] = {
+               .type = HDA_FIXUP_PINS,
+               .v.pins = (const struct hda_pintbl[]) {
+@@ -10294,6 +10308,7 @@ static const struct hda_model_fixup alc269_fixup_models[] = {
+       {.id = ALC269_FIXUP_DELL2_MIC_NO_PRESENCE, .name = "dell-headset-dock"},
+       {.id = ALC269_FIXUP_DELL3_MIC_NO_PRESENCE, .name = "dell-headset3"},
+       {.id = ALC269_FIXUP_DELL4_MIC_NO_PRESENCE, .name = "dell-headset4"},
++      {.id = ALC269_FIXUP_DELL4_MIC_NO_PRESENCE_QUIET, .name = "dell-headset4-quiet"},
+       {.id = ALC283_FIXUP_CHROME_BOOK, .name = "alc283-dac-wcaps"},
+       {.id = ALC283_FIXUP_SENSE_COMBO_JACK, .name = "alc283-sense-combo"},
+       {.id = ALC292_FIXUP_TPT440_DOCK, .name = "tpt440-dock"},
+@@ -10841,16 +10856,16 @@ static const struct snd_hda_pin_quirk alc269_fallback_pin_fixup_tbl[] = {
+       SND_HDA_PIN_QUIRK(0x10ec0289, 0x1028, "Dell", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE,
+               {0x19, 0x40000000},
+               {0x1b, 0x40000000}),
+-      SND_HDA_PIN_QUIRK(0x10ec0295, 0x1028, "Dell", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE,
++      SND_HDA_PIN_QUIRK(0x10ec0295, 0x1028, "Dell", ALC269_FIXUP_DELL4_MIC_NO_PRESENCE_QUIET,
+               {0x19, 0x40000000},
+               {0x1b, 0x40000000}),
+       SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE,
+               {0x19, 0x40000000},
+               {0x1a, 0x40000000}),
+-      SND_HDA_PIN_QUIRK(0x10ec0236, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE,
++      SND_HDA_PIN_QUIRK(0x10ec0236, 0x1028, "Dell", ALC255_FIXUP_DELL1_LIMIT_INT_MIC_BOOST,
+               {0x19, 0x40000000},
+               {0x1a, 0x40000000}),
+-      SND_HDA_PIN_QUIRK(0x10ec0274, 0x1028, "Dell", ALC274_FIXUP_DELL_AIO_LINEOUT_VERB,
++      SND_HDA_PIN_QUIRK(0x10ec0274, 0x1028, "Dell", ALC269_FIXUP_DELL1_LIMIT_INT_MIC_BOOST,
+               {0x19, 0x40000000},
+               {0x1a, 0x40000000}),
+       SND_HDA_PIN_QUIRK(0x10ec0256, 0x1043, "ASUS", ALC2XX_FIXUP_HEADSET_MIC,
+-- 
+2.43.0
+
diff --git a/queue-6.1/block-fix-sanity-checks-in-blk_rq_map_user_bvec.patch b/queue-6.1/block-fix-sanity-checks-in-blk_rq_map_user_bvec.patch
new file mode 100644 (file)
index 0000000..03e2ff6
--- /dev/null
@@ -0,0 +1,59 @@
+From 9a988c5c336b6dfb5c813c357a36faaacac25c88 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 23 Oct 2024 15:15:19 -0600
+Subject: block: fix sanity checks in blk_rq_map_user_bvec
+
+From: Xinyu Zhang <xizhang@purestorage.com>
+
+[ Upstream commit 2ff949441802a8d076d9013c7761f63e8ae5a9bd ]
+
+blk_rq_map_user_bvec contains a check bytes + bv->bv_len > nr_iter which
+causes unnecessary failures in NVMe passthrough I/O, reproducible as
+follows:
+
+- register a 2 page, page-aligned buffer against a ring
+- use that buffer to do a 1 page io_uring NVMe passthrough read
+
+The second (i = 1) iteration of the loop in blk_rq_map_user_bvec will
+then have nr_iter == 1 page, bytes == 1 page, bv->bv_len == 1 page, so
+the check bytes + bv->bv_len > nr_iter will succeed, causing the I/O to
+fail. This failure is unnecessary, as when the check succeeds, it means
+we've checked the entire buffer that will be used by the request - i.e.
+blk_rq_map_user_bvec should complete successfully. Therefore, terminate
+the loop early and return successfully when the check bytes + bv->bv_len
+> nr_iter succeeds.
+
+While we're at it, also remove the check that all segments in the bvec
+are single-page. While this seems to be true for all users of the
+function, it doesn't appear to be required anywhere downstream.
+
+CC: stable@vger.kernel.org
+Signed-off-by: Xinyu Zhang <xizhang@purestorage.com>
+Co-developed-by: Uday Shankar <ushankar@purestorage.com>
+Signed-off-by: Uday Shankar <ushankar@purestorage.com>
+Fixes: 37987547932c ("block: extend functionality to map bvec iterator")
+Link: https://lore.kernel.org/r/20241023211519.4177873-1-ushankar@purestorage.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ block/blk-map.c | 4 +---
+ 1 file changed, 1 insertion(+), 3 deletions(-)
+
+diff --git a/block/blk-map.c b/block/blk-map.c
+index b337ae347bfa3..a2fa387560375 100644
+--- a/block/blk-map.c
++++ b/block/blk-map.c
+@@ -597,9 +597,7 @@ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter)
+               if (nsegs >= nr_segs || bytes > UINT_MAX - bv->bv_len)
+                       goto put_bio;
+               if (bytes + bv->bv_len > nr_iter)
+-                      goto put_bio;
+-              if (bv->bv_offset + bv->bv_len > PAGE_SIZE)
+-                      goto put_bio;
++                      break;
+               nsegs++;
+               bytes += bv->bv_len;
+-- 
+2.43.0
+
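As a standalone illustration of the check discussed above (hypothetical names and a hard-coded page size; this is not part of the queued patch): with a two-page registered buffer and a one-page passthrough read, the "bytes + bv->bv_len > nr_iter" test fires on the second bvec even though the first page already covers the whole request, so breaking out of the loop, as the fix does, is the correct outcome.

#include <stdio.h>

#define EX_PAGE_SIZE 4096u

int main(void)
{
	/* Two-page registered buffer, described by two single-page bvecs. */
	unsigned int bv_len[2] = { EX_PAGE_SIZE, EX_PAGE_SIZE };
	/* One-page NVMe passthrough read issued against that buffer. */
	unsigned int nr_iter = EX_PAGE_SIZE;
	unsigned int bytes = 0;

	for (int i = 0; i < 2; i++) {
		if (bytes + bv_len[i] > nr_iter) {
			/*
			 * Old behaviour: fail the I/O here.
			 * Fixed behaviour: break, since the preceding bvecs
			 * already cover the whole request.
			 */
			printf("i=%d: request already covered (%u bytes)\n",
			       i, bytes);
			break;
		}
		bytes += bv_len[i];
	}
	printf("mapped %u of %u requested bytes\n", bytes, nr_iter);
	return 0;
}
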
diff --git a/queue-6.1/cgroup-bpf-use-a-dedicated-workqueue-for-cgroup-bpf-.patch b/queue-6.1/cgroup-bpf-use-a-dedicated-workqueue-for-cgroup-bpf-.patch
new file mode 100644 (file)
index 0000000..09733ca
--- /dev/null
@@ -0,0 +1,154 @@
+From 0c340c704aa5935085c6ae2de631adada19df11e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 8 Oct 2024 11:24:56 +0000
+Subject: cgroup/bpf: use a dedicated workqueue for cgroup bpf destruction
+
+From: Chen Ridong <chenridong@huawei.com>
+
+[ Upstream commit 117932eea99b729ee5d12783601a4f7f5fd58a23 ]
+
+A hung_task problem shown below was found:
+
+INFO: task kworker/0:0:8 blocked for more than 327 seconds.
+"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+Workqueue: events cgroup_bpf_release
+Call Trace:
+ <TASK>
+ __schedule+0x5a2/0x2050
+ ? find_held_lock+0x33/0x100
+ ? wq_worker_sleeping+0x9e/0xe0
+ schedule+0x9f/0x180
+ schedule_preempt_disabled+0x25/0x50
+ __mutex_lock+0x512/0x740
+ ? cgroup_bpf_release+0x1e/0x4d0
+ ? cgroup_bpf_release+0xcf/0x4d0
+ ? process_scheduled_works+0x161/0x8a0
+ ? cgroup_bpf_release+0x1e/0x4d0
+ ? mutex_lock_nested+0x2b/0x40
+ ? __pfx_delay_tsc+0x10/0x10
+ mutex_lock_nested+0x2b/0x40
+ cgroup_bpf_release+0xcf/0x4d0
+ ? process_scheduled_works+0x161/0x8a0
+ ? trace_event_raw_event_workqueue_execute_start+0x64/0xd0
+ ? process_scheduled_works+0x161/0x8a0
+ process_scheduled_works+0x23a/0x8a0
+ worker_thread+0x231/0x5b0
+ ? __pfx_worker_thread+0x10/0x10
+ kthread+0x14d/0x1c0
+ ? __pfx_kthread+0x10/0x10
+ ret_from_fork+0x59/0x70
+ ? __pfx_kthread+0x10/0x10
+ ret_from_fork_asm+0x1b/0x30
+ </TASK>
+
+This issue can be reproduced by the following pressure test:
+1. A large number of cpuset cgroups are deleted.
+2. Set cpu on and off repeatedly.
+3. Set watchdog_thresh repeatedly.
+The scripts can be obtained at LINK mentioned above the signature.
+
+The reason for this issue is that cgroup_mutex and cpu_hotplug_lock are
+acquired in different tasks, which may lead to deadlock.
+It can lead to a deadlock through the following steps:
+1. A large number of cpusets are deleted asynchronously, which puts a
+   large number of cgroup_bpf_release works into system_wq. The max_active
+   of system_wq is WQ_DFL_ACTIVE(256). Consequently, all active works are
+   cgroup_bpf_release works, and many cgroup_bpf_release works will be put
+   into inactive queue. As illustrated in the diagram, there are 256 (in
+   the active queue) + n (in the inactive queue) works.
+2. Setting watchdog_thresh will hold cpu_hotplug_lock.read and put
+   smp_call_on_cpu work into system_wq. However, step 1 has already filled
+   system_wq, so 'sscs.work' is put into the inactive queue. 'sscs.work' has
+   to wait until the works that were put into the inactive queue earlier
+   have executed (n cgroup_bpf_release), so it will be blocked for a while.
+3. Cpu offline requires cpu_hotplug_lock.write, which is blocked by step 2.
+4. Cpusets that were deleted at step 1 put cgroup_release works into
+   cgroup_destroy_wq. They are competing to get cgroup_mutex all the time.
+   When cgroup_mutex is acquired by the work at css_killed_work_fn, it will
+   call cpuset_css_offline, which needs to acquire cpu_hotplug_lock.read.
+   However, cpuset_css_offline will be blocked by step 3.
+5. At this moment, there are 256 works in active queue that are
+   cgroup_bpf_release, they are attempting to acquire cgroup_mutex, and as
+   a result, all of them are blocked. Consequently, sscs.work can not be
+   executed. Ultimately, this situation leads to four processes being
+   blocked, forming a deadlock.
+
+system_wq(step1)               WatchDog(step2)                 cpu offline(step3)      cgroup_destroy_wq(step4)
+...
+2000+ cgroups deleted async
+256 actives + n inactives
+                               __lockup_detector_reconfigure
+                               P(cpu_hotplug_lock.read)
+                               put sscs.work into system_wq
+256 + n + 1(sscs.work)
+sscs.work waits to be executed
+                               waiting for sscs.work to finish
+                                                               percpu_down_write
+                                                               P(cpu_hotplug_lock.write)
+                                                               ...blocking...
+                                                                                       css_killed_work_fn
+                                                                                       P(cgroup_mutex)
+                                                                                       cpuset_css_offline
+                                                                                       P(cpu_hotplug_lock.read)
+                                                                                       ...blocking...
+256 cgroup_bpf_release
+mutex_lock(&cgroup_mutex);
+..blocking...
+
+To fix the problem, place cgroup_bpf_release works on a dedicated
+workqueue which can break the loop and solve the problem. System wqs are
+for misc things which shouldn't create a large number of concurrent work
+items. If something is going to generate >WQ_DFL_ACTIVE(256) concurrent
+work items, it should use its own dedicated workqueue.
+
+Fixes: 4bfc0bb2c60e ("bpf: decouple the lifetime of cgroup_bpf from cgroup itself")
+Cc: stable@vger.kernel.org # v5.3+
+Link: https://lore.kernel.org/cgroups/e90c32d2-2a85-4f28-9154-09c7d320cb60@huawei.com/T/#t
+Tested-by: Vishal Chourasia <vishalc@linux.ibm.com>
+Signed-off-by: Chen Ridong <chenridong@huawei.com>
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/bpf/cgroup.c | 19 ++++++++++++++++++-
+ 1 file changed, 18 insertions(+), 1 deletion(-)
+
+diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
+index bb70f400c25eb..2cb04e0e118d9 100644
+--- a/kernel/bpf/cgroup.c
++++ b/kernel/bpf/cgroup.c
+@@ -24,6 +24,23 @@
+ DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE);
+ EXPORT_SYMBOL(cgroup_bpf_enabled_key);
++/*
++ * cgroup bpf destruction makes heavy use of work items and there can be a lot
++ * of concurrent destructions.  Use a separate workqueue so that cgroup bpf
++ * destruction work items don't end up filling up max_active of system_wq
++ * which may lead to deadlock.
++ */
++static struct workqueue_struct *cgroup_bpf_destroy_wq;
++
++static int __init cgroup_bpf_wq_init(void)
++{
++      cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 0, 1);
++      if (!cgroup_bpf_destroy_wq)
++              panic("Failed to alloc workqueue for cgroup bpf destroy.\n");
++      return 0;
++}
++core_initcall(cgroup_bpf_wq_init);
++
+ /* __always_inline is necessary to prevent indirect call through run_prog
+  * function pointer.
+  */
+@@ -334,7 +351,7 @@ static void cgroup_bpf_release_fn(struct percpu_ref *ref)
+       struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
+       INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
+-      queue_work(system_wq, &cgrp->bpf.release_work);
++      queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work);
+ }
+ /* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through
+-- 
+2.43.0
+
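The fix above moves the release work onto its own workqueue; the minimal kernel-module sketch below shows the same dedicated-workqueue pattern in isolation. All "example_*" names are hypothetical, and this is only an outline under the assumption that the object's teardown may sleep on a shared mutex, so a burst of releases must not be able to exhaust system_wq's max_active slots.

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_destroy_wq;

struct example_obj {
	struct work_struct release_work;
	/* ... object state ... */
};

static void example_release_fn(struct work_struct *work)
{
	struct example_obj *obj =
		container_of(work, struct example_obj, release_work);

	/* Teardown that may sleep on shared locks runs here. */
	kfree(obj);
}

void example_obj_release(struct example_obj *obj)
{
	INIT_WORK(&obj->release_work, example_release_fn);
	/* Dedicated queue: a burst of releases cannot clog system_wq. */
	queue_work(example_destroy_wq, &obj->release_work);
}

static int __init example_init(void)
{
	/* max_active = 1, mirroring cgroup_bpf_destroy_wq above. */
	example_destroy_wq = alloc_workqueue("example_destroy", 0, 1);
	return example_destroy_wq ? 0 : -ENOMEM;
}

static void __exit example_exit(void)
{
	destroy_workqueue(example_destroy_wq);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
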
diff --git a/queue-6.1/cxl-acpi-move-rescan-to-the-workqueue.patch b/queue-6.1/cxl-acpi-move-rescan-to-the-workqueue.patch
new file mode 100644 (file)
index 0000000..1745fde
--- /dev/null
@@ -0,0 +1,114 @@
+From 2c6aa71b070247d3da2b5ed5820e41eb07a2cf17 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 1 Dec 2022 13:33:48 -0800
+Subject: cxl/acpi: Move rescan to the workqueue
+
+From: Dan Williams <dan.j.williams@intel.com>
+
+[ Upstream commit 4029c32fb601d505dfb92bdf0db9fdcc41fe1434 ]
+
+Now that the cxl_mem driver has a need to take the root device lock, the
+cxl_bus_rescan() needs to run outside of the root lock context. That
+need arises from RCH topologies and the locking that the cxl_mem driver
+does to attach a descendant to an upstream port. In the RCH case the
+lock needed is the CXL root device lock [1].
+
+Link: http://lore.kernel.org/r/166993045621.1882361.1730100141527044744.stgit@dwillia2-xfh.jf.intel.com [1]
+Tested-by: Robert Richter <rrichter@amd.com>
+Link: http://lore.kernel.org/r/166993042884.1882361.5633723613683058881.stgit@dwillia2-xfh.jf.intel.com
+Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+Stable-dep-of: 3d6ebf16438d ("cxl/port: Fix cxl_bus_rescan() vs bus_rescan_devices()")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/cxl/acpi.c      | 17 +++++++++++++++--
+ drivers/cxl/core/port.c | 19 +++++++++++++++++--
+ drivers/cxl/cxl.h       |  3 ++-
+ 3 files changed, 34 insertions(+), 5 deletions(-)
+
+diff --git a/drivers/cxl/acpi.c b/drivers/cxl/acpi.c
+index dd610556a3afa..d7d789211c173 100644
+--- a/drivers/cxl/acpi.c
++++ b/drivers/cxl/acpi.c
+@@ -509,7 +509,8 @@ static int cxl_acpi_probe(struct platform_device *pdev)
+               return rc;
+       /* In case PCI is scanned before ACPI re-trigger memdev attach */
+-      return cxl_bus_rescan();
++      cxl_bus_rescan();
++      return 0;
+ }
+ static const struct acpi_device_id cxl_acpi_ids[] = {
+@@ -533,7 +534,19 @@ static struct platform_driver cxl_acpi_driver = {
+       .id_table = cxl_test_ids,
+ };
+-module_platform_driver(cxl_acpi_driver);
++static int __init cxl_acpi_init(void)
++{
++      return platform_driver_register(&cxl_acpi_driver);
++}
++
++static void __exit cxl_acpi_exit(void)
++{
++      platform_driver_unregister(&cxl_acpi_driver);
++      cxl_bus_drain();
++}
++
++module_init(cxl_acpi_init);
++module_exit(cxl_acpi_exit);
+ MODULE_LICENSE("GPL v2");
+ MODULE_IMPORT_NS(CXL);
+ MODULE_IMPORT_NS(ACPI);
+diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
+index 1f1483a9e5252..f0875fa86c616 100644
+--- a/drivers/cxl/core/port.c
++++ b/drivers/cxl/core/port.c
+@@ -1786,12 +1786,27 @@ static void cxl_bus_remove(struct device *dev)
+ static struct workqueue_struct *cxl_bus_wq;
+-int cxl_bus_rescan(void)
++static void cxl_bus_rescan_queue(struct work_struct *w)
+ {
+-      return bus_rescan_devices(&cxl_bus_type);
++      int rc = bus_rescan_devices(&cxl_bus_type);
++
++      pr_debug("CXL bus rescan result: %d\n", rc);
++}
++
++void cxl_bus_rescan(void)
++{
++      static DECLARE_WORK(rescan_work, cxl_bus_rescan_queue);
++
++      queue_work(cxl_bus_wq, &rescan_work);
+ }
+ EXPORT_SYMBOL_NS_GPL(cxl_bus_rescan, CXL);
++void cxl_bus_drain(void)
++{
++      drain_workqueue(cxl_bus_wq);
++}
++EXPORT_SYMBOL_NS_GPL(cxl_bus_drain, CXL);
++
+ bool schedule_cxl_memdev_detach(struct cxl_memdev *cxlmd)
+ {
+       return queue_work(cxl_bus_wq, &cxlmd->detach_work);
+diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
+index 7750ccb7652db..827fa94cddda1 100644
+--- a/drivers/cxl/cxl.h
++++ b/drivers/cxl/cxl.h
+@@ -564,7 +564,8 @@ struct cxl_port *devm_cxl_add_port(struct device *host, struct device *uport,
+                                  struct cxl_dport *parent_dport);
+ struct cxl_port *find_cxl_root(struct device *dev);
+ int devm_cxl_enumerate_ports(struct cxl_memdev *cxlmd);
+-int cxl_bus_rescan(void);
++void cxl_bus_rescan(void);
++void cxl_bus_drain(void);
+ struct cxl_port *cxl_mem_find_port(struct cxl_memdev *cxlmd,
+                                  struct cxl_dport **dport);
+ bool schedule_cxl_memdev_detach(struct cxl_memdev *cxlmd);
+-- 
+2.43.0
+
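The same idea, deferring work off the caller's lock context, can be sketched in isolation: a rescan entry point that may be reached with device locks held only queues a static work item, the actual bus walk runs later from workqueue context, and the queue is drained on module exit. The "example_*" names are hypothetical; this is an outline of the pattern, not the CXL code.

#include <linux/module.h>
#include <linux/printk.h>
#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;

static void example_rescan_fn(struct work_struct *w)
{
	/* Walk the bus and reprobe devices here, outside caller locks. */
	pr_debug("example: deferred rescan ran\n");
}

/* Safe to call from contexts that hold device locks. */
void example_bus_rescan(void)
{
	static DECLARE_WORK(rescan_work, example_rescan_fn);

	queue_work(example_wq, &rescan_work);
}

static int __init example_init(void)
{
	example_wq = alloc_workqueue("example_bus", 0, 0);
	return example_wq ? 0 : -ENOMEM;
}

static void __exit example_exit(void)
{
	drain_workqueue(example_wq);	/* flush pending rescans */
	destroy_workqueue(example_wq);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
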
diff --git a/queue-6.1/cxl-port-fix-cxl_bus_rescan-vs-bus_rescan_devices.patch b/queue-6.1/cxl-port-fix-cxl_bus_rescan-vs-bus_rescan_devices.patch
new file mode 100644 (file)
index 0000000..22e445c
--- /dev/null
@@ -0,0 +1,65 @@
+From d7f1f35cfbdd17cb6884cd3722f902922f725701 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 22 Oct 2024 18:43:32 -0700
+Subject: cxl/port: Fix cxl_bus_rescan() vs bus_rescan_devices()
+
+From: Dan Williams <dan.j.williams@intel.com>
+
+[ Upstream commit 3d6ebf16438de5d712030fefbb4182b46373d677 ]
+
+It turns out since its original introduction, pre-2.6.12,
+bus_rescan_devices() has skipped devices that might be in the process of
+attaching or detaching from their driver. For CXL this behavior is
+unwanted and expects that cxl_bus_rescan() is a probe barrier.
+
+That behavior is simple enough to achieve with bus_for_each_dev() paired
+with call to device_attach(), and it is unclear why bus_rescan_devices()
+took the position of lockless consumption of dev->driver which is racy.
+
+The "Fixes:" but no "Cc: stable" on this patch reflects that the issue
+was found merely by inspection, since the bug that triggered the discovery of
+this potential problem [1] is fixed by other means.  However, a stable
+backport should do no harm.
+
+Fixes: 8dd2bc0f8e02 ("cxl/mem: Add the cxl_mem driver")
+Link: http://lore.kernel.org/20241004212504.1246-1-gourry@gourry.net [1]
+Signed-off-by: Dan Williams <dan.j.williams@intel.com>
+Tested-by: Gregory Price <gourry@gourry.net>
+Reviewed-by: Jonathan Cameron <Jonathan.Cameron@huawei.com>
+Reviewed-by: Ira Weiny <ira.weiny@intel.com>
+Link: https://patch.msgid.link/172964781104.81806.4277549800082443769.stgit@dwillia2-xfh.jf.intel.com
+Signed-off-by: Ira Weiny <ira.weiny@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/cxl/core/port.c | 13 ++++++++++---
+ 1 file changed, 10 insertions(+), 3 deletions(-)
+
+diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c
+index f0875fa86c616..20f052d3759e0 100644
+--- a/drivers/cxl/core/port.c
++++ b/drivers/cxl/core/port.c
+@@ -1786,11 +1786,18 @@ static void cxl_bus_remove(struct device *dev)
+ static struct workqueue_struct *cxl_bus_wq;
+-static void cxl_bus_rescan_queue(struct work_struct *w)
++static int cxl_rescan_attach(struct device *dev, void *data)
+ {
+-      int rc = bus_rescan_devices(&cxl_bus_type);
++      int rc = device_attach(dev);
++
++      dev_vdbg(dev, "rescan: %s\n", rc ? "attach" : "detached");
+-      pr_debug("CXL bus rescan result: %d\n", rc);
++      return 0;
++}
++
++static void cxl_bus_rescan_queue(struct work_struct *w)
++{
++      bus_for_each_dev(&cxl_bus_type, NULL, NULL, cxl_rescan_attach);
+ }
+ void cxl_bus_rescan(void)
+-- 
+2.43.0
+
diff --git a/queue-6.1/fs-create-kiocb_-start-end-_write-helpers.patch b/queue-6.1/fs-create-kiocb_-start-end-_write-helpers.patch
new file mode 100644 (file)
index 0000000..e9fb074
--- /dev/null
@@ -0,0 +1,77 @@
+From fc9f40091fb160075127a025b7278dbdf985a5b4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 17 Aug 2023 17:13:33 +0300
+Subject: fs: create kiocb_{start,end}_write() helpers
+
+From: Amir Goldstein <amir73il@gmail.com>
+
+[ Upstream commit ed0360bbab72b829437b67ebb2f9cfac19f59dfe ]
+
+aio, io_uring, cachefiles and overlayfs all open-code an ugly variant
+of file_{start,end}_write() to silence lockdep warnings.
+
+Create helpers for this lockdep dance so we can use the helpers in all
+the callers.
+
+Suggested-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Reviewed-by: Jens Axboe <axboe@kernel.dk>
+Message-Id: <20230817141337.1025891-4-amir73il@gmail.com>
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Stable-dep-of: 1d60d74e8526 ("io_uring/rw: fix missing NOWAIT check for O_DIRECT start write")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/fs.h | 36 ++++++++++++++++++++++++++++++++++++
+ 1 file changed, 36 insertions(+)
+
+diff --git a/include/linux/fs.h b/include/linux/fs.h
+index 33c4961309833..0d32634c5cf0d 100644
+--- a/include/linux/fs.h
++++ b/include/linux/fs.h
+@@ -3029,6 +3029,42 @@ static inline void file_end_write(struct file *file)
+       __sb_end_write(file_inode(file)->i_sb, SB_FREEZE_WRITE);
+ }
++/**
++ * kiocb_start_write - get write access to a superblock for async file io
++ * @iocb: the io context we want to submit the write with
++ *
++ * This is a variant of sb_start_write() for async io submission.
++ * Should be matched with a call to kiocb_end_write().
++ */
++static inline void kiocb_start_write(struct kiocb *iocb)
++{
++      struct inode *inode = file_inode(iocb->ki_filp);
++
++      sb_start_write(inode->i_sb);
++      /*
++       * Fool lockdep by telling it the lock got released so that it
++       * doesn't complain about the held lock when we return to userspace.
++       */
++      __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
++}
++
++/**
++ * kiocb_end_write - drop write access to a superblock after async file io
+ * @iocb: the io context we submitted the write with
++ *
++ * Should be matched with a call to kiocb_start_write().
++ */
++static inline void kiocb_end_write(struct kiocb *iocb)
++{
++      struct inode *inode = file_inode(iocb->ki_filp);
++
++      /*
++       * Tell lockdep we inherited freeze protection from submission thread.
++       */
++      __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
++      sb_end_write(inode->i_sb);
++}
++
+ /*
+  * This is used for regular files where some users -- especially the
+  * currently executed binary in a process, previously handled via
+-- 
+2.43.0
+
diff --git a/queue-6.1/io_uring-rename-kiocb_end_write-local-helper.patch b/queue-6.1/io_uring-rename-kiocb_end_write-local-helper.patch
new file mode 100644 (file)
index 0000000..ca9e820
--- /dev/null
@@ -0,0 +1,75 @@
+From 85ca1f838a54f739f7d8ddcc16ace4d7f67af19f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 17 Aug 2023 17:13:31 +0300
+Subject: io_uring: rename kiocb_end_write() local helper
+
+From: Amir Goldstein <amir73il@gmail.com>
+
+[ Upstream commit a370167fe526123637965f60859a9f1f3e1a58b7 ]
+
+This helper does not take a kiocb as input and we want to create a
+common helper by that name that takes a kiocb as input.
+
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Reviewed-by: Jens Axboe <axboe@kernel.dk>
+Message-Id: <20230817141337.1025891-2-amir73il@gmail.com>
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Stable-dep-of: 1d60d74e8526 ("io_uring/rw: fix missing NOWAIT check for O_DIRECT start write")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ io_uring/rw.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/io_uring/rw.c b/io_uring/rw.c
+index 038e6b13a7496..4eb42fc29c151 100644
+--- a/io_uring/rw.c
++++ b/io_uring/rw.c
+@@ -220,7 +220,7 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
+ }
+ #endif
+-static void kiocb_end_write(struct io_kiocb *req)
++static void io_req_end_write(struct io_kiocb *req)
+ {
+       /*
+        * Tell lockdep we inherited freeze protection from submission
+@@ -243,7 +243,7 @@ static void io_req_io_end(struct io_kiocb *req)
+       struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+       if (rw->kiocb.ki_flags & IOCB_WRITE) {
+-              kiocb_end_write(req);
++              io_req_end_write(req);
+               fsnotify_modify(req->file);
+       } else {
+               fsnotify_access(req->file);
+@@ -307,7 +307,7 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res)
+       struct io_kiocb *req = cmd_to_io_kiocb(rw);
+       if (kiocb->ki_flags & IOCB_WRITE)
+-              kiocb_end_write(req);
++              io_req_end_write(req);
+       if (unlikely(res != req->cqe.res)) {
+               if (res == -EAGAIN && io_rw_should_reissue(req)) {
+                       req->flags |= REQ_F_REISSUE | REQ_F_PARTIAL_IO;
+@@ -956,7 +956,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
+                               io->bytes_done += ret2;
+                       if (kiocb->ki_flags & IOCB_WRITE)
+-                              kiocb_end_write(req);
++                              io_req_end_write(req);
+                       return ret ? ret : -EAGAIN;
+               }
+ done:
+@@ -967,7 +967,7 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
+               ret = io_setup_async_rw(req, iovec, s, false);
+               if (!ret) {
+                       if (kiocb->ki_flags & IOCB_WRITE)
+-                              kiocb_end_write(req);
++                              io_req_end_write(req);
+                       return -EAGAIN;
+               }
+               return ret;
+-- 
+2.43.0
+
diff --git a/queue-6.1/io_uring-rw-fix-missing-nowait-check-for-o_direct-st.patch b/queue-6.1/io_uring-rw-fix-missing-nowait-check-for-o_direct-st.patch
new file mode 100644 (file)
index 0000000..a7aabfa
--- /dev/null
@@ -0,0 +1,121 @@
+From 7cc8484de2585d9324d89c118dfda19dc847ebab Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 31 Oct 2024 08:05:44 -0600
+Subject: io_uring/rw: fix missing NOWAIT check for O_DIRECT start write
+
+From: Jens Axboe <axboe@kernel.dk>
+
+[ Upstream commit 1d60d74e852647255bd8e76f5a22dc42531e4389 ]
+
+When io_uring starts a write, it'll call kiocb_start_write() to bump the
+super block rwsem, preventing any freezes from happening while that
+write is in-flight. The freeze side will grab that rwsem for writing,
+excluding any new writers from happening and waiting for existing writes
+to finish. But io_uring unconditionally uses kiocb_start_write(), which
+will block if someone is currently attempting to freeze the mount point.
+This causes a deadlock where freeze is waiting for previous writes to
+complete, but the previous writes cannot complete, as the task that is
+supposed to complete them is blocked waiting on starting a new write.
+This results in the following stuck trace showing that dependency with
+the write blocked starting a new write:
+
+task:fio             state:D stack:0     pid:886   tgid:886   ppid:876
+Call trace:
+ __switch_to+0x1d8/0x348
+ __schedule+0x8e8/0x2248
+ schedule+0x110/0x3f0
+ percpu_rwsem_wait+0x1e8/0x3f8
+ __percpu_down_read+0xe8/0x500
+ io_write+0xbb8/0xff8
+ io_issue_sqe+0x10c/0x1020
+ io_submit_sqes+0x614/0x2110
+ __arm64_sys_io_uring_enter+0x524/0x1038
+ invoke_syscall+0x74/0x268
+ el0_svc_common.constprop.0+0x160/0x238
+ do_el0_svc+0x44/0x60
+ el0_svc+0x44/0xb0
+ el0t_64_sync_handler+0x118/0x128
+ el0t_64_sync+0x168/0x170
+INFO: task fsfreeze:7364 blocked for more than 15 seconds.
+      Not tainted 6.12.0-rc5-00063-g76aaf945701c #7963
+
+with the attempting freezer stuck trying to grab the rwsem:
+
+task:fsfreeze        state:D stack:0     pid:7364  tgid:7364  ppid:995
+Call trace:
+ __switch_to+0x1d8/0x348
+ __schedule+0x8e8/0x2248
+ schedule+0x110/0x3f0
+ percpu_down_write+0x2b0/0x680
+ freeze_super+0x248/0x8a8
+ do_vfs_ioctl+0x149c/0x1b18
+ __arm64_sys_ioctl+0xd0/0x1a0
+ invoke_syscall+0x74/0x268
+ el0_svc_common.constprop.0+0x160/0x238
+ do_el0_svc+0x44/0x60
+ el0_svc+0x44/0xb0
+ el0t_64_sync_handler+0x118/0x128
+ el0t_64_sync+0x168/0x170
+
+Fix this by having the io_uring side honor IOCB_NOWAIT, and only attempt a
+blocking grab of the super block rwsem if it isn't set. For normal issue
+where IOCB_NOWAIT would always be set, this returns -EAGAIN which will
+have io_uring core issue a blocking attempt of the write. That will in
+turn also get completions run, ensuring forward progress.
+
+Since freezing requires CAP_SYS_ADMIN in the first place, this isn't
+something that can be triggered by a regular user.
+
+Cc: stable@vger.kernel.org # 5.10+
+Reported-by: Peter Mann <peter.mann@sh.cz>
+Link: https://lore.kernel.org/io-uring/38c94aec-81c9-4f62-b44e-1d87f5597644@sh.cz
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ io_uring/rw.c | 23 +++++++++++++++++++++--
+ 1 file changed, 21 insertions(+), 2 deletions(-)
+
+diff --git a/io_uring/rw.c b/io_uring/rw.c
+index c15c7873813b3..9d6e17a244ae7 100644
+--- a/io_uring/rw.c
++++ b/io_uring/rw.c
+@@ -839,6 +839,25 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags)
+       return kiocb_done(req, ret, issue_flags);
+ }
++static bool io_kiocb_start_write(struct io_kiocb *req, struct kiocb *kiocb)
++{
++      struct inode *inode;
++      bool ret;
++
++      if (!(req->flags & REQ_F_ISREG))
++              return true;
++      if (!(kiocb->ki_flags & IOCB_NOWAIT)) {
++              kiocb_start_write(kiocb);
++              return true;
++      }
++
++      inode = file_inode(kiocb->ki_filp);
++      ret = sb_start_write_trylock(inode->i_sb);
++      if (ret)
++              __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
++      return ret;
++}
++
+ int io_write(struct io_kiocb *req, unsigned int issue_flags)
+ {
+       struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+@@ -892,8 +911,8 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
+               return ret;
+       }
+-      if (req->flags & REQ_F_ISREG)
+-              kiocb_start_write(kiocb);
++      if (unlikely(!io_kiocb_start_write(req, kiocb)))
++              return -EAGAIN;
+       kiocb->ki_flags |= IOCB_WRITE;
+       if (likely(req->file->f_op->write_iter))
+-- 
+2.43.0
+
diff --git a/queue-6.1/io_uring-use-kiocb_-start-end-_write-helpers.patch b/queue-6.1/io_uring-use-kiocb_-start-end-_write-helpers.patch
new file mode 100644 (file)
index 0000000..61b48e7
--- /dev/null
@@ -0,0 +1,69 @@
+From 4cf01f8e6f316d434b3882d6b4fff5666ee05971 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 17 Aug 2023 17:13:34 +0300
+Subject: io_uring: use kiocb_{start,end}_write() helpers
+
+From: Amir Goldstein <amir73il@gmail.com>
+
+[ Upstream commit e484fd73f4bdcb00c2188100c2d84e9f3f5c9f7d ]
+
+Use helpers instead of the open coded dance to silence lockdep warnings.
+
+Suggested-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Amir Goldstein <amir73il@gmail.com>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Reviewed-by: Jens Axboe <axboe@kernel.dk>
+Message-Id: <20230817141337.1025891-5-amir73il@gmail.com>
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Stable-dep-of: 1d60d74e8526 ("io_uring/rw: fix missing NOWAIT check for O_DIRECT start write")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ io_uring/rw.c | 23 ++++-------------------
+ 1 file changed, 4 insertions(+), 19 deletions(-)
+
+diff --git a/io_uring/rw.c b/io_uring/rw.c
+index 4eb42fc29c151..c15c7873813b3 100644
+--- a/io_uring/rw.c
++++ b/io_uring/rw.c
+@@ -222,15 +222,10 @@ static bool io_rw_should_reissue(struct io_kiocb *req)
+ static void io_req_end_write(struct io_kiocb *req)
+ {
+-      /*
+-       * Tell lockdep we inherited freeze protection from submission
+-       * thread.
+-       */
+       if (req->flags & REQ_F_ISREG) {
+-              struct super_block *sb = file_inode(req->file)->i_sb;
++              struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+-              __sb_writers_acquired(sb, SB_FREEZE_WRITE);
+-              sb_end_write(sb);
++              kiocb_end_write(&rw->kiocb);
+       }
+ }
+@@ -897,18 +892,8 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
+               return ret;
+       }
+-      /*
+-       * Open-code file_start_write here to grab freeze protection,
+-       * which will be released by another thread in
+-       * io_complete_rw().  Fool lockdep by telling it the lock got
+-       * released so that it doesn't complain about the held lock when
+-       * we return to userspace.
+-       */
+-      if (req->flags & REQ_F_ISREG) {
+-              sb_start_write(file_inode(req->file)->i_sb);
+-              __sb_writers_release(file_inode(req->file)->i_sb,
+-                                      SB_FREEZE_WRITE);
+-      }
++      if (req->flags & REQ_F_ISREG)
++              kiocb_start_write(kiocb);
+       kiocb->ki_flags |= IOCB_WRITE;
+       if (likely(req->file->f_op->write_iter))
+-- 
+2.43.0
+
diff --git a/queue-6.1/kasan-remove-vmalloc_percpu-test.patch b/queue-6.1/kasan-remove-vmalloc_percpu-test.patch
new file mode 100644 (file)
index 0000000..01d971a
--- /dev/null
@@ -0,0 +1,87 @@
+From 4ed3283db4accb8de6dae5d596250f67d6afd12b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 22 Oct 2024 18:07:06 +0200
+Subject: kasan: remove vmalloc_percpu test
+
+From: Andrey Konovalov <andreyknvl@gmail.com>
+
+[ Upstream commit 330d8df81f3673d6fb74550bbc9bb159d81b35f7 ]
+
+Commit 1a2473f0cbc0 ("kasan: improve vmalloc tests") added the
+vmalloc_percpu KASAN test with the assumption that __alloc_percpu always
+uses vmalloc internally, which is tagged by KASAN.
+
+However, __alloc_percpu might allocate memory from the first per-CPU
+chunk, which is not allocated via vmalloc().  As a result, the test might
+fail.
+
+Remove the test until proper KASAN annotations for the per-CPU allocator
+are added; tracked in https://bugzilla.kernel.org/show_bug.cgi?id=215019.
+
+Link: https://lkml.kernel.org/r/20241022160706.38943-1-andrey.konovalov@linux.dev
+Fixes: 1a2473f0cbc0 ("kasan: improve vmalloc tests")
+Signed-off-by: Andrey Konovalov <andreyknvl@gmail.com>
+Reported-by: Samuel Holland <samuel.holland@sifive.com>
+Link: https://lore.kernel.org/all/4a245fff-cc46-44d1-a5f9-fd2f1c3764ae@sifive.com/
+Reported-by: Sabyrzhan Tasbolatov <snovitoll@gmail.com>
+Link: https://lore.kernel.org/all/CACzwLxiWzNqPBp4C1VkaXZ2wDwvY3yZeetCi1TLGFipKW77drA@mail.gmail.com/
+Cc: Alexander Potapenko <glider@google.com>
+Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+Cc: Dmitry Vyukov <dvyukov@google.com>
+Cc: Marco Elver <elver@google.com>
+Cc: Sabyrzhan Tasbolatov <snovitoll@gmail.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/kasan/kasan_test.c | 27 ---------------------------
+ 1 file changed, 27 deletions(-)
+
+diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c
+index cef683a2e0d2e..df9658299a08a 100644
+--- a/mm/kasan/kasan_test.c
++++ b/mm/kasan/kasan_test.c
+@@ -1260,32 +1260,6 @@ static void vm_map_ram_tags(struct kunit *test)
+       free_pages((unsigned long)p_ptr, 1);
+ }
+-static void vmalloc_percpu(struct kunit *test)
+-{
+-      char __percpu *ptr;
+-      int cpu;
+-
+-      /*
+-       * This test is specifically crafted for the software tag-based mode,
+-       * the only tag-based mode that poisons percpu mappings.
+-       */
+-      KASAN_TEST_NEEDS_CONFIG_ON(test, CONFIG_KASAN_SW_TAGS);
+-
+-      ptr = __alloc_percpu(PAGE_SIZE, PAGE_SIZE);
+-
+-      for_each_possible_cpu(cpu) {
+-              char *c_ptr = per_cpu_ptr(ptr, cpu);
+-
+-              KUNIT_EXPECT_GE(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_MIN);
+-              KUNIT_EXPECT_LT(test, (u8)get_tag(c_ptr), (u8)KASAN_TAG_KERNEL);
+-
+-              /* Make sure that in-bounds accesses don't crash the kernel. */
+-              *c_ptr = 0;
+-      }
+-
+-      free_percpu(ptr);
+-}
+-
+ /*
+  * Check that the assigned pointer tag falls within the [KASAN_TAG_MIN,
+  * KASAN_TAG_KERNEL) range (note: excluding the match-all tag) for tag-based
+@@ -1439,7 +1413,6 @@ static struct kunit_case kasan_kunit_test_cases[] = {
+       KUNIT_CASE(vmalloc_oob),
+       KUNIT_CASE(vmap_tags),
+       KUNIT_CASE(vm_map_ram_tags),
+-      KUNIT_CASE(vmalloc_percpu),
+       KUNIT_CASE(match_all_not_assigned),
+       KUNIT_CASE(match_all_ptr_tag),
+       KUNIT_CASE(match_all_mem_tag),
+-- 
+2.43.0
+
diff --git a/queue-6.1/mctp-i2c-handle-null-header-address.patch b/queue-6.1/mctp-i2c-handle-null-header-address.patch
new file mode 100644 (file)
index 0000000..d9010ad
--- /dev/null
@@ -0,0 +1,44 @@
+From f2e4472e93a1a0c03ce77a1cb2932502e907e34d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 22 Oct 2024 18:25:14 +0800
+Subject: mctp i2c: handle NULL header address
+
+From: Matt Johnston <matt@codeconstruct.com.au>
+
+[ Upstream commit 01e215975fd80af81b5b79f009d49ddd35976c13 ]
+
+daddr can be NULL if there is no neighbour table entry present;
+in that case the tx packet should be dropped.
+
+saddr will usually be set by MCTP core, but check for NULL in case a
+packet is transmitted by a different protocol.
+
+Fixes: f5b8abf9fc3d ("mctp i2c: MCTP I2C binding driver")
+Cc: stable@vger.kernel.org
+Reported-by: Dung Cao <dung@os.amperecomputing.com>
+Signed-off-by: Matt Johnston <matt@codeconstruct.com.au>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://patch.msgid.link/20241022-mctp-i2c-null-dest-v3-1-e929709956c5@codeconstruct.com.au
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/mctp/mctp-i2c.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/drivers/net/mctp/mctp-i2c.c b/drivers/net/mctp/mctp-i2c.c
+index 1d67a3ca1fd11..7635a8b3c35cd 100644
+--- a/drivers/net/mctp/mctp-i2c.c
++++ b/drivers/net/mctp/mctp-i2c.c
+@@ -547,6 +547,9 @@ static int mctp_i2c_header_create(struct sk_buff *skb, struct net_device *dev,
+       if (len > MCTP_I2C_MAXMTU)
+               return -EMSGSIZE;
++      if (!daddr || !saddr)
++              return -EINVAL;
++
+       lldst = *((u8 *)daddr);
+       llsrc = *((u8 *)saddr);
+-- 
+2.43.0
+
diff --git a/queue-6.1/migrate-convert-migrate_pages-to-use-folios.patch b/queue-6.1/migrate-convert-migrate_pages-to-use-folios.patch
new file mode 100644 (file)
index 0000000..9b82ab3
--- /dev/null
@@ -0,0 +1,380 @@
+From 3992d9e7cfbd4eec0eee3c177e5f4e11ba8c4294 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 9 Nov 2022 09:23:48 +0800
+Subject: migrate: convert migrate_pages() to use folios
+
+From: Huang Ying <ying.huang@intel.com>
+
+[ Upstream commit eaec4e639f11413ce75fbf38affd1aa5c40979e9 ]
+
+Quite straightforward, the page functions are converted to corresponding
+folio functions.  Same for comments.
+
+THP-specific code is converted to handle large folios.
+
+Link: https://lkml.kernel.org/r/20221109012348.93849-3-ying.huang@intel.com
+Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
+Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
+Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
+Cc: Zi Yan <ziy@nvidia.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Oscar Salvador <osalvador@suse.de>
+Cc: Matthew Wilcox <willy@infradead.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Stable-dep-of: 35e41024c4c2 ("vmscan,migrate: fix page count imbalance on node stats when demoting pages")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/migrate.c | 210 +++++++++++++++++++++++++++------------------------
+ 1 file changed, 112 insertions(+), 98 deletions(-)
+
+diff --git a/mm/migrate.c b/mm/migrate.c
+index 16b456b927c18..562f819dc6189 100644
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -1385,231 +1385,245 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
+       return rc;
+ }
+-static inline int try_split_thp(struct page *page, struct list_head *split_pages)
++static inline int try_split_folio(struct folio *folio, struct list_head *split_folios)
+ {
+       int rc;
+-      lock_page(page);
+-      rc = split_huge_page_to_list(page, split_pages);
+-      unlock_page(page);
++      folio_lock(folio);
++      rc = split_folio_to_list(folio, split_folios);
++      folio_unlock(folio);
+       if (!rc)
+-              list_move_tail(&page->lru, split_pages);
++              list_move_tail(&folio->lru, split_folios);
+       return rc;
+ }
+ /*
+- * migrate_pages - migrate the pages specified in a list, to the free pages
++ * migrate_pages - migrate the folios specified in a list, to the free folios
+  *               supplied as the target for the page migration
+  *
+- * @from:             The list of pages to be migrated.
+- * @get_new_page:     The function used to allocate free pages to be used
+- *                    as the target of the page migration.
+- * @put_new_page:     The function used to free target pages if migration
++ * @from:             The list of folios to be migrated.
++ * @get_new_page:     The function used to allocate free folios to be used
++ *                    as the target of the folio migration.
++ * @put_new_page:     The function used to free target folios if migration
+  *                    fails, or NULL if no special handling is necessary.
+  * @private:          Private data to be passed on to get_new_page()
+  * @mode:             The migration mode that specifies the constraints for
+- *                    page migration, if any.
+- * @reason:           The reason for page migration.
+- * @ret_succeeded:    Set to the number of normal pages migrated successfully if
++ *                    folio migration, if any.
++ * @reason:           The reason for folio migration.
++ * @ret_succeeded:    Set to the number of folios migrated successfully if
+  *                    the caller passes a non-NULL pointer.
+  *
+- * The function returns after 10 attempts or if no pages are movable any more
+- * because the list has become empty or no retryable pages exist any more.
+- * It is caller's responsibility to call putback_movable_pages() to return pages
++ * The function returns after 10 attempts or if no folios are movable any more
++ * because the list has become empty or no retryable folios exist any more.
++ * It is caller's responsibility to call putback_movable_pages() to return folios
+  * to the LRU or free list only if ret != 0.
+  *
+- * Returns the number of {normal page, THP, hugetlb} that were not migrated, or
+- * an error code. The number of THP splits will be considered as the number of
+- * non-migrated THP, no matter how many subpages of the THP are migrated successfully.
++ * Returns the number of {normal folio, large folio, hugetlb} that were not
++ * migrated, or an error code. The number of large folio splits will be
++ * considered as the number of non-migrated large folio, no matter how many
++ * split folios of the large folio are migrated successfully.
+  */
+ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+               free_page_t put_new_page, unsigned long private,
+               enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
+ {
+       int retry = 1;
++      int large_retry = 1;
+       int thp_retry = 1;
+       int nr_failed = 0;
+       int nr_failed_pages = 0;
+       int nr_retry_pages = 0;
+       int nr_succeeded = 0;
+       int nr_thp_succeeded = 0;
++      int nr_large_failed = 0;
+       int nr_thp_failed = 0;
+       int nr_thp_split = 0;
+       int pass = 0;
++      bool is_large = false;
+       bool is_thp = false;
+-      struct page *page;
+-      struct page *page2;
+-      int rc, nr_subpages;
+-      LIST_HEAD(ret_pages);
+-      LIST_HEAD(thp_split_pages);
++      struct folio *folio, *folio2;
++      int rc, nr_pages;
++      LIST_HEAD(ret_folios);
++      LIST_HEAD(split_folios);
+       bool nosplit = (reason == MR_NUMA_MISPLACED);
+-      bool no_subpage_counting = false;
++      bool no_split_folio_counting = false;
+       trace_mm_migrate_pages_start(mode, reason);
+-thp_subpage_migration:
+-      for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
++split_folio_migration:
++      for (pass = 0; pass < 10 && (retry || large_retry); pass++) {
+               retry = 0;
++              large_retry = 0;
+               thp_retry = 0;
+               nr_retry_pages = 0;
+-              list_for_each_entry_safe(page, page2, from, lru) {
++              list_for_each_entry_safe(folio, folio2, from, lru) {
+                       /*
+-                       * THP statistics is based on the source huge page.
+-                       * Capture required information that might get lost
+-                       * during migration.
++                       * Large folio statistics is based on the source large
++                       * folio. Capture required information that might get
++                       * lost during migration.
+                        */
+-                      is_thp = PageTransHuge(page) && !PageHuge(page);
+-                      nr_subpages = compound_nr(page);
++                      is_large = folio_test_large(folio) && !folio_test_hugetlb(folio);
++                      is_thp = is_large && folio_test_pmd_mappable(folio);
++                      nr_pages = folio_nr_pages(folio);
+                       cond_resched();
+-                      if (PageHuge(page))
++                      if (folio_test_hugetlb(folio))
+                               rc = unmap_and_move_huge_page(get_new_page,
+-                                              put_new_page, private, page,
+-                                              pass > 2, mode, reason,
+-                                              &ret_pages);
++                                              put_new_page, private,
++                                              &folio->page, pass > 2, mode,
++                                              reason,
++                                              &ret_folios);
+                       else
+                               rc = unmap_and_move(get_new_page, put_new_page,
+-                                              private, page_folio(page), pass > 2, mode,
+-                                              reason, &ret_pages);
++                                              private, folio, pass > 2, mode,
++                                              reason, &ret_folios);
+                       /*
+                        * The rules are:
+-                       *      Success: non hugetlb page will be freed, hugetlb
+-                       *               page will be put back
++                       *      Success: non hugetlb folio will be freed, hugetlb
++                       *               folio will be put back
+                        *      -EAGAIN: stay on the from list
+                        *      -ENOMEM: stay on the from list
+                        *      -ENOSYS: stay on the from list
+-                       *      Other errno: put on ret_pages list then splice to
++                       *      Other errno: put on ret_folios list then splice to
+                        *                   from list
+                        */
+                       switch(rc) {
+                       /*
+-                       * THP migration might be unsupported or the
+-                       * allocation could've failed so we should
+-                       * retry on the same page with the THP split
+-                       * to base pages.
++                       * Large folio migration might be unsupported or
++                       * the allocation could've failed so we should retry
++                       * on the same folio with the large folio split
++                       * to normal folios.
+                        *
+-                       * Sub-pages are put in thp_split_pages, and
++                       * Split folios are put in split_folios, and
+                        * we will migrate them after the rest of the
+                        * list is processed.
+                        */
+                       case -ENOSYS:
+-                              /* THP migration is unsupported */
+-                              if (is_thp) {
+-                                      nr_thp_failed++;
+-                                      if (!try_split_thp(page, &thp_split_pages)) {
+-                                              nr_thp_split++;
++                              /* Large folio migration is unsupported */
++                              if (is_large) {
++                                      nr_large_failed++;
++                                      nr_thp_failed += is_thp;
++                                      if (!try_split_folio(folio, &split_folios)) {
++                                              nr_thp_split += is_thp;
+                                               break;
+                                       }
+                               /* Hugetlb migration is unsupported */
+-                              } else if (!no_subpage_counting) {
++                              } else if (!no_split_folio_counting) {
+                                       nr_failed++;
+                               }
+-                              nr_failed_pages += nr_subpages;
+-                              list_move_tail(&page->lru, &ret_pages);
++                              nr_failed_pages += nr_pages;
++                              list_move_tail(&folio->lru, &ret_folios);
+                               break;
+                       case -ENOMEM:
+                               /*
+                                * When memory is low, don't bother to try to migrate
+-                               * other pages, just exit.
++                               * other folios, just exit.
+                                */
+-                              if (is_thp) {
+-                                      nr_thp_failed++;
+-                                      /* THP NUMA faulting doesn't split THP to retry. */
++                              if (is_large) {
++                                      nr_large_failed++;
++                                      nr_thp_failed += is_thp;
++                                      /* Large folio NUMA faulting doesn't split to retry. */
+                                       if (!nosplit) {
+-                                              int ret = try_split_thp(page, &thp_split_pages);
++                                              int ret = try_split_folio(folio, &split_folios);
+                                               if (!ret) {
+-                                                      nr_thp_split++;
++                                                      nr_thp_split += is_thp;
+                                                       break;
+                                               } else if (reason == MR_LONGTERM_PIN &&
+                                                          ret == -EAGAIN) {
+                                                       /*
+-                                                       * Try again to split THP to mitigate
+-                                                       * the failure of longterm pinning.
++                                                       * Try again to split large folio to
++                                                       * mitigate the failure of longterm pinning.
+                                                        */
+-                                                      thp_retry++;
+-                                                      nr_retry_pages += nr_subpages;
++                                                      large_retry++;
++                                                      thp_retry += is_thp;
++                                                      nr_retry_pages += nr_pages;
+                                                       break;
+                                               }
+                                       }
+-                              } else if (!no_subpage_counting) {
++                              } else if (!no_split_folio_counting) {
+                                       nr_failed++;
+                               }
+-                              nr_failed_pages += nr_subpages + nr_retry_pages;
++                              nr_failed_pages += nr_pages + nr_retry_pages;
+                               /*
+-                               * There might be some subpages of fail-to-migrate THPs
+-                               * left in thp_split_pages list. Move them back to migration
++                               * There might be some split folios of fail-to-migrate large
++                               * folios left in split_folios list. Move them back to migration
+                                * list so that they could be put back to the right list by
+-                               * the caller otherwise the page refcnt will be leaked.
++                               * the caller otherwise the folio refcnt will be leaked.
+                                */
+-                              list_splice_init(&thp_split_pages, from);
++                              list_splice_init(&split_folios, from);
+                               /* nr_failed isn't updated for not used */
++                              nr_large_failed += large_retry;
+                               nr_thp_failed += thp_retry;
+                               goto out;
+                       case -EAGAIN:
+-                              if (is_thp)
+-                                      thp_retry++;
+-                              else if (!no_subpage_counting)
++                              if (is_large) {
++                                      large_retry++;
++                                      thp_retry += is_thp;
++                              } else if (!no_split_folio_counting) {
+                                       retry++;
+-                              nr_retry_pages += nr_subpages;
++                              }
++                              nr_retry_pages += nr_pages;
+                               break;
+                       case MIGRATEPAGE_SUCCESS:
+-                              nr_succeeded += nr_subpages;
+-                              if (is_thp)
+-                                      nr_thp_succeeded++;
++                              nr_succeeded += nr_pages;
++                              nr_thp_succeeded += is_thp;
+                               break;
+                       default:
+                               /*
+                                * Permanent failure (-EBUSY, etc.):
+-                               * unlike -EAGAIN case, the failed page is
+-                               * removed from migration page list and not
++                               * unlike -EAGAIN case, the failed folio is
++                               * removed from migration folio list and not
+                                * retried in the next outer loop.
+                                */
+-                              if (is_thp)
+-                                      nr_thp_failed++;
+-                              else if (!no_subpage_counting)
++                              if (is_large) {
++                                      nr_large_failed++;
++                                      nr_thp_failed += is_thp;
++                              } else if (!no_split_folio_counting) {
+                                       nr_failed++;
++                              }
+-                              nr_failed_pages += nr_subpages;
++                              nr_failed_pages += nr_pages;
+                               break;
+                       }
+               }
+       }
+       nr_failed += retry;
++      nr_large_failed += large_retry;
+       nr_thp_failed += thp_retry;
+       nr_failed_pages += nr_retry_pages;
+       /*
+-       * Try to migrate subpages of fail-to-migrate THPs, no nr_failed
+-       * counting in this round, since all subpages of a THP is counted
+-       * as 1 failure in the first round.
++       * Try to migrate split folios of fail-to-migrate large folios, no
++       * nr_failed counting in this round, since all split folios of a
++       * large folio is counted as 1 failure in the first round.
+        */
+-      if (!list_empty(&thp_split_pages)) {
++      if (!list_empty(&split_folios)) {
+               /*
+-               * Move non-migrated pages (after 10 retries) to ret_pages
++               * Move non-migrated folios (after 10 retries) to ret_folios
+                * to avoid migrating them again.
+                */
+-              list_splice_init(from, &ret_pages);
+-              list_splice_init(&thp_split_pages, from);
+-              no_subpage_counting = true;
++              list_splice_init(from, &ret_folios);
++              list_splice_init(&split_folios, from);
++              no_split_folio_counting = true;
+               retry = 1;
+-              goto thp_subpage_migration;
++              goto split_folio_migration;
+       }
+-      rc = nr_failed + nr_thp_failed;
++      rc = nr_failed + nr_large_failed;
+ out:
+       /*
+-       * Put the permanent failure page back to migration list, they
++       * Put the permanent failure folio back to migration list, they
+        * will be put back to the right list by the caller.
+        */
+-      list_splice(&ret_pages, from);
++      list_splice(&ret_folios, from);
+       /*
+-       * Return 0 in case all subpages of fail-to-migrate THPs are
+-       * migrated successfully.
++       * Return 0 in case all split folios of fail-to-migrate large folios
++       * are migrated successfully.
+        */
+       if (list_empty(from))
+               rc = 0;
+-- 
+2.43.0
+
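
The hunks above switch migrate_pages() to folio-based accounting: failures are charged in units of base pages (nr_failed_pages += nr_pages) and the THP counters are bumped by adding is_thp as a 0-or-1 value instead of branching. The following is a minimal stand-alone sketch of that counting convention only; it is plain userspace C, not kernel code, and every name in it (mig_counters, account_one, the result enum) is invented here for illustration.

  /*
   * Stand-alone sketch (userspace C, invented names) of the counting
   * convention above: a large folio bumps nr_large_failed by one, adds
   * is_thp (0 or 1) to the THP counter, and charges failures in base pages.
   */
  #include <stdbool.h>
  #include <stdio.h>

  enum mig_result { MIG_SUCCESS, MIG_EAGAIN, MIG_ENOSYS, MIG_OTHER };

  struct mig_counters {
          int nr_failed;          /* normal folios that failed */
          int nr_large_failed;    /* large folios that failed (THP or not) */
          int nr_thp_failed;      /* subset of the above that were THP */
          int nr_failed_pages;    /* failures in units of base pages */
          int nr_succeeded;       /* successes in units of base pages */
  };

  static void account_one(struct mig_counters *c, bool is_large, bool is_thp,
                          int nr_pages, enum mig_result rc)
  {
          switch (rc) {
          case MIG_SUCCESS:
                  c->nr_succeeded += nr_pages;
                  break;
          case MIG_EAGAIN:
                  /* retried in a later pass; nothing is charged yet */
                  break;
          default:
                  if (is_large) {
                          c->nr_large_failed++;
                          c->nr_thp_failed += is_thp;     /* 0 or 1 */
                  } else {
                          c->nr_failed++;
                  }
                  c->nr_failed_pages += nr_pages;
                  break;
          }
  }

  int main(void)
  {
          struct mig_counters c = { 0 };

          account_one(&c, true, true, 512, MIG_OTHER);   /* one failing THP */
          account_one(&c, false, false, 1, MIG_SUCCESS); /* one normal folio */
          printf("large failed %d, thp failed %d, failed pages %d, ok pages %d\n",
                 c.nr_large_failed, c.nr_thp_failed, c.nr_failed_pages,
                 c.nr_succeeded);
          return 0;
  }
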
diff --git a/queue-6.1/migrate-convert-unmap_and_move-to-use-folios.patch b/queue-6.1/migrate-convert-unmap_and_move-to-use-folios.patch
new file mode 100644 (file)
index 0000000..f4a07cd
--- /dev/null
@@ -0,0 +1,158 @@
+From 989ae777a6d2f1f50e59c2afe0e78e6d29cc0dde Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 9 Nov 2022 09:23:47 +0800
+Subject: migrate: convert unmap_and_move() to use folios
+
+From: Huang Ying <ying.huang@intel.com>
+
+[ Upstream commit 49f51859221a3dfee27488eaeaff800459cac6a9 ]
+
+Patch series "migrate: convert migrate_pages()/unmap_and_move() to use
+folios", v2.
+
+The conversion is quite straightforward: just replace the page API with the
+corresponding folio API.  migrate_pages() and unmap_and_move() mostly work
+with folios (head pages) only.
+
+This patch (of 2):
+
+Quite straightforward: the page functions are converted to the
+corresponding folio functions, and the comments are updated to match.
+
+Link: https://lkml.kernel.org/r/20221109012348.93849-1-ying.huang@intel.com
+Link: https://lkml.kernel.org/r/20221109012348.93849-2-ying.huang@intel.com
+Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
+Reviewed-by: Yang Shi <shy828301@gmail.com>
+Reviewed-by: Zi Yan <ziy@nvidia.com>
+Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
+Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
+Cc: Oscar Salvador <osalvador@suse.de>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Stable-dep-of: 35e41024c4c2 ("vmscan,migrate: fix page count imbalance on node stats when demoting pages")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/migrate.c | 54 ++++++++++++++++++++++++++--------------------------
+ 1 file changed, 27 insertions(+), 27 deletions(-)
+
+diff --git a/mm/migrate.c b/mm/migrate.c
+index b0caa89e67d5f..16b456b927c18 100644
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -1162,79 +1162,79 @@ static int __unmap_and_move(struct folio *src, struct folio *dst,
+ }
+ /*
+- * Obtain the lock on page, remove all ptes and migrate the page
+- * to the newly allocated page in newpage.
++ * Obtain the lock on folio, remove all ptes and migrate the folio
++ * to the newly allocated folio in dst.
+  */
+ static int unmap_and_move(new_page_t get_new_page,
+                                  free_page_t put_new_page,
+-                                 unsigned long private, struct page *page,
++                                 unsigned long private, struct folio *src,
+                                  int force, enum migrate_mode mode,
+                                  enum migrate_reason reason,
+                                  struct list_head *ret)
+ {
+-      struct folio *dst, *src = page_folio(page);
++      struct folio *dst;
+       int rc = MIGRATEPAGE_SUCCESS;
+       struct page *newpage = NULL;
+-      if (!thp_migration_supported() && PageTransHuge(page))
++      if (!thp_migration_supported() && folio_test_transhuge(src))
+               return -ENOSYS;
+-      if (page_count(page) == 1) {
+-              /* Page was freed from under us. So we are done. */
+-              ClearPageActive(page);
+-              ClearPageUnevictable(page);
++      if (folio_ref_count(src) == 1) {
++              /* Folio was freed from under us. So we are done. */
++              folio_clear_active(src);
++              folio_clear_unevictable(src);
+               /* free_pages_prepare() will clear PG_isolated. */
+               goto out;
+       }
+-      newpage = get_new_page(page, private);
++      newpage = get_new_page(&src->page, private);
+       if (!newpage)
+               return -ENOMEM;
+       dst = page_folio(newpage);
+-      newpage->private = 0;
++      dst->private = 0;
+       rc = __unmap_and_move(src, dst, force, mode);
+       if (rc == MIGRATEPAGE_SUCCESS)
+-              set_page_owner_migrate_reason(newpage, reason);
++              set_page_owner_migrate_reason(&dst->page, reason);
+ out:
+       if (rc != -EAGAIN) {
+               /*
+-               * A page that has been migrated has all references
+-               * removed and will be freed. A page that has not been
++               * A folio that has been migrated has all references
++               * removed and will be freed. A folio that has not been
+                * migrated will have kept its references and be restored.
+                */
+-              list_del(&page->lru);
++              list_del(&src->lru);
+       }
+       /*
+        * If migration is successful, releases reference grabbed during
+-       * isolation. Otherwise, restore the page to right list unless
++       * isolation. Otherwise, restore the folio to right list unless
+        * we want to retry.
+        */
+       if (rc == MIGRATEPAGE_SUCCESS) {
+               /*
+-               * Compaction can migrate also non-LRU pages which are
++               * Compaction can migrate also non-LRU folios which are
+                * not accounted to NR_ISOLATED_*. They can be recognized
+-               * as __PageMovable
++               * as __folio_test_movable
+                */
+-              if (likely(!__PageMovable(page)))
+-                      mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
+-                                      page_is_file_lru(page), -thp_nr_pages(page));
++              if (likely(!__folio_test_movable(src)))
++                      mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON +
++                                      folio_is_file_lru(src), -folio_nr_pages(src));
+               if (reason != MR_MEMORY_FAILURE)
+                       /*
+-                       * We release the page in page_handle_poison.
++                       * We release the folio in page_handle_poison.
+                        */
+-                      put_page(page);
++                      folio_put(src);
+       } else {
+               if (rc != -EAGAIN)
+-                      list_add_tail(&page->lru, ret);
++                      list_add_tail(&src->lru, ret);
+               if (put_new_page)
+-                      put_new_page(newpage, private);
++                      put_new_page(&dst->page, private);
+               else
+-                      put_page(newpage);
++                      folio_put(dst);
+       }
+       return rc;
+@@ -1471,7 +1471,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+                                               &ret_pages);
+                       else
+                               rc = unmap_and_move(get_new_page, put_new_page,
+-                                              private, page, pass > 2, mode,
++                                              private, page_folio(page), pass > 2, mode,
+                                               reason, &ret_pages);
+                       /*
+                        * The rules are:
+-- 
+2.43.0
+
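
The patch above makes unmap_and_move() operate on struct folio, resolving the destination folio from the newly allocated page with page_folio() and using folio_nr_pages() for the node statistics. As a rough mental model only, the sketch below treats a folio as the head page of a compound page so that any constituent page resolves to the same folio; it is toy userspace code with invented names (toy_page, toy_page_folio, toy_folio_nr_pages), not the kernel's real data structures.

  /*
   * Toy model (not the kernel implementation) of the page <-> folio
   * relationship relied on above: every page points at its head page,
   * and a "folio" is a typed view of that head page.
   */
  #include <stdio.h>

  struct toy_page {
          struct toy_page *head;  /* points to itself for the head page */
          unsigned int order;     /* valid on the head page only */
  };

  /* Equivalent in spirit to page_folio(): resolve any page to its head. */
  static struct toy_page *toy_page_folio(struct toy_page *page)
  {
          return page->head;
  }

  /* Equivalent in spirit to folio_nr_pages(). */
  static unsigned long toy_folio_nr_pages(struct toy_page *folio)
  {
          return 1UL << folio->order;
  }

  int main(void)
  {
          struct toy_page compound[4];

          for (int i = 0; i < 4; i++)
                  compound[i].head = &compound[0];
          compound[0].order = 2;  /* 4 base pages */

          /* A tail page resolves to the same folio as the head page. */
          printf("same folio: %d, nr_pages: %lu\n",
                 toy_page_folio(&compound[3]) == toy_page_folio(&compound[0]),
                 toy_folio_nr_pages(toy_page_folio(&compound[3])));
          return 0;
  }
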
diff --git a/queue-6.1/migrate_pages-organize-stats-with-struct-migrate_pag.patch b/queue-6.1/migrate_pages-organize-stats-with-struct-migrate_pag.patch
new file mode 100644 (file)
index 0000000..5a1e4cf
--- /dev/null
@@ -0,0 +1,261 @@
+From 41a3f5ffed4ddea2c459d69b3b751704faa84a6f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 13 Feb 2023 20:34:36 +0800
+Subject: migrate_pages: organize stats with struct migrate_pages_stats
+
+From: Huang Ying <ying.huang@intel.com>
+
+[ Upstream commit 5b855937096aea7f81e73ad6d40d433c9dd49577 ]
+
+Patch series "migrate_pages(): batch TLB flushing", v5.
+
+Now, migrate_pages() migrates folios one by one, as in the following
+pseudo-code:
+
+  for each folio
+    unmap
+    flush TLB
+    copy
+    restore map
+
+If multiple folios are passed to migrate_pages(), there are opportunities
+to batch the TLB flushing and copying.  That is, we can change the code to
+something as follows,
+
+  for each folio
+    unmap
+  for each folio
+    flush TLB
+  for each folio
+    copy
+  for each folio
+    restore map
+
+The total number of TLB flushing IPIs can be reduced considerably, and a
+hardware accelerator such as DSA may be used to accelerate the folio
+copying.
+
+So in this patch, we refactor the migrate_pages() implementation and
+implement the TLB flushing batching.  Based on this, hardware-accelerated
+folio copying can be implemented.
+
+If too many folios are passed to migrate_pages(), in the naive batched
+implementation, we may unmap too many folios at the same time.  The
+possibility that a task has to wait for the migrated folios to be mapped
+again increases, so latency may suffer.  To deal with this issue, the
+maximum number of folios unmapped in one batch is restricted to no more
+than HPAGE_PMD_NR, in units of base pages.  That is, the influence is at
+the same level as THP migration.
+
+We use the following test to measure the performance impact of the
+patchset,
+
+On a 2-socket Intel server,
+
+ - Run pmbench memory accessing benchmark
+
+ - Run `migratepages` to migrate pages of pmbench between node 0 and
+   node 1 back and forth.
+
+With the patch, the number of TLB flushing IPIs drops by 99.1% during the
+test and the number of pages migrated successfully per second increases by
+291.7%.
+
+Xin Hao helped to test the patchset on an ARM64 server with 128 cores,
+2 NUMA nodes.  Test results show that the page migration performance
+increases by up to 78%.
+
+This patch (of 9):
+
+Define struct migrate_pages_stats to organize the various statistics in
+migrate_pages().  This makes it easier to collect and consume the
+statistics in multiple functions.  This will be needed in the following
+patches in the series.
+
+Link: https://lkml.kernel.org/r/20230213123444.155149-1-ying.huang@intel.com
+Link: https://lkml.kernel.org/r/20230213123444.155149-2-ying.huang@intel.com
+Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
+Reviewed-by: Alistair Popple <apopple@nvidia.com>
+Reviewed-by: Zi Yan <ziy@nvidia.com>
+Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
+Reviewed-by: Xin Hao <xhao@linux.alibaba.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Oscar Salvador <osalvador@suse.de>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: Bharata B Rao <bharata@amd.com>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Mike Kravetz <mike.kravetz@oracle.com>
+Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Stable-dep-of: 35e41024c4c2 ("vmscan,migrate: fix page count imbalance on node stats when demoting pages")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/migrate.c | 60 +++++++++++++++++++++++++++++-----------------------
+ 1 file changed, 34 insertions(+), 26 deletions(-)
+
+diff --git a/mm/migrate.c b/mm/migrate.c
+index 81444abf54dba..b7596a0b4445f 100644
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -1398,6 +1398,16 @@ static inline int try_split_folio(struct folio *folio, struct list_head *split_f
+       return rc;
+ }
++struct migrate_pages_stats {
++      int nr_succeeded;       /* Normal and large folios migrated successfully, in
++                                 units of base pages */
++      int nr_failed_pages;    /* Normal and large folios failed to be migrated, in
++                                 units of base pages.  Untried folios aren't counted */
++      int nr_thp_succeeded;   /* THP migrated successfully */
++      int nr_thp_failed;      /* THP failed to be migrated */
++      int nr_thp_split;       /* THP split before migrating */
++};
++
+ /*
+  * migrate_pages - migrate the folios specified in a list, to the free folios
+  *               supplied as the target for the page migration
+@@ -1432,13 +1442,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+       int large_retry = 1;
+       int thp_retry = 1;
+       int nr_failed = 0;
+-      int nr_failed_pages = 0;
+       int nr_retry_pages = 0;
+-      int nr_succeeded = 0;
+-      int nr_thp_succeeded = 0;
+       int nr_large_failed = 0;
+-      int nr_thp_failed = 0;
+-      int nr_thp_split = 0;
+       int pass = 0;
+       bool is_large = false;
+       bool is_thp = false;
+@@ -1448,9 +1453,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+       LIST_HEAD(split_folios);
+       bool nosplit = (reason == MR_NUMA_MISPLACED);
+       bool no_split_folio_counting = false;
++      struct migrate_pages_stats stats;
+       trace_mm_migrate_pages_start(mode, reason);
++      memset(&stats, 0, sizeof(stats));
+ split_folio_migration:
+       for (pass = 0; pass < 10 && (retry || large_retry); pass++) {
+               retry = 0;
+@@ -1504,9 +1511,9 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+                               /* Large folio migration is unsupported */
+                               if (is_large) {
+                                       nr_large_failed++;
+-                                      nr_thp_failed += is_thp;
++                                      stats.nr_thp_failed += is_thp;
+                                       if (!try_split_folio(folio, &split_folios)) {
+-                                              nr_thp_split += is_thp;
++                                              stats.nr_thp_split += is_thp;
+                                               break;
+                                       }
+                               /* Hugetlb migration is unsupported */
+@@ -1514,7 +1521,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+                                       nr_failed++;
+                               }
+-                              nr_failed_pages += nr_pages;
++                              stats.nr_failed_pages += nr_pages;
+                               list_move_tail(&folio->lru, &ret_folios);
+                               break;
+                       case -ENOMEM:
+@@ -1524,13 +1531,13 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+                                */
+                               if (is_large) {
+                                       nr_large_failed++;
+-                                      nr_thp_failed += is_thp;
++                                      stats.nr_thp_failed += is_thp;
+                                       /* Large folio NUMA faulting doesn't split to retry. */
+                                       if (!nosplit) {
+                                               int ret = try_split_folio(folio, &split_folios);
+                                               if (!ret) {
+-                                                      nr_thp_split += is_thp;
++                                                      stats.nr_thp_split += is_thp;
+                                                       break;
+                                               } else if (reason == MR_LONGTERM_PIN &&
+                                                          ret == -EAGAIN) {
+@@ -1548,7 +1555,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+                                       nr_failed++;
+                               }
+-                              nr_failed_pages += nr_pages + nr_retry_pages;
++                              stats.nr_failed_pages += nr_pages + nr_retry_pages;
+                               /*
+                                * There might be some split folios of fail-to-migrate large
+                                * folios left in split_folios list. Move them back to migration
+@@ -1558,7 +1565,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+                               list_splice_init(&split_folios, from);
+                               /* nr_failed isn't updated for not used */
+                               nr_large_failed += large_retry;
+-                              nr_thp_failed += thp_retry;
++                              stats.nr_thp_failed += thp_retry;
+                               goto out;
+                       case -EAGAIN:
+                               if (is_large) {
+@@ -1570,8 +1577,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+                               nr_retry_pages += nr_pages;
+                               break;
+                       case MIGRATEPAGE_SUCCESS:
+-                              nr_succeeded += nr_pages;
+-                              nr_thp_succeeded += is_thp;
++                              stats.nr_succeeded += nr_pages;
++                              stats.nr_thp_succeeded += is_thp;
+                               break;
+                       default:
+                               /*
+@@ -1582,20 +1589,20 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+                                */
+                               if (is_large) {
+                                       nr_large_failed++;
+-                                      nr_thp_failed += is_thp;
++                                      stats.nr_thp_failed += is_thp;
+                               } else if (!no_split_folio_counting) {
+                                       nr_failed++;
+                               }
+-                              nr_failed_pages += nr_pages;
++                              stats.nr_failed_pages += nr_pages;
+                               break;
+                       }
+               }
+       }
+       nr_failed += retry;
+       nr_large_failed += large_retry;
+-      nr_thp_failed += thp_retry;
+-      nr_failed_pages += nr_retry_pages;
++      stats.nr_thp_failed += thp_retry;
++      stats.nr_failed_pages += nr_retry_pages;
+       /*
+        * Try to migrate split folios of fail-to-migrate large folios, no
+        * nr_failed counting in this round, since all split folios of a
+@@ -1628,16 +1635,17 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+       if (list_empty(from))
+               rc = 0;
+-      count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
+-      count_vm_events(PGMIGRATE_FAIL, nr_failed_pages);
+-      count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
+-      count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
+-      count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
+-      trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded,
+-                             nr_thp_failed, nr_thp_split, mode, reason);
++      count_vm_events(PGMIGRATE_SUCCESS, stats.nr_succeeded);
++      count_vm_events(PGMIGRATE_FAIL, stats.nr_failed_pages);
++      count_vm_events(THP_MIGRATION_SUCCESS, stats.nr_thp_succeeded);
++      count_vm_events(THP_MIGRATION_FAIL, stats.nr_thp_failed);
++      count_vm_events(THP_MIGRATION_SPLIT, stats.nr_thp_split);
++      trace_mm_migrate_pages(stats.nr_succeeded, stats.nr_failed_pages,
++                             stats.nr_thp_succeeded, stats.nr_thp_failed,
++                             stats.nr_thp_split, mode, reason);
+       if (ret_succeeded)
+-              *ret_succeeded = nr_succeeded;
++              *ret_succeeded = stats.nr_succeeded;
+       return rc;
+ }
+-- 
+2.43.0
+
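
The patch above gathers the loose counters of migrate_pages() into struct migrate_pages_stats, which the function zeroes with memset() and then updates field by field, so that later patches can pass the statistics by pointer into helper functions. The sketch below shows that pattern in isolation; it is a hypothetical userspace example (demo_stats and demo_migrate_batch are made-up names), not the kernel code itself.

  /*
   * Minimal userspace sketch (illustrative only) of the pattern introduced
   * above: all counters live in one struct that the caller zeroes once and
   * passes by pointer to helpers, which accumulate into it.
   */
  #include <stdio.h>
  #include <string.h>

  struct demo_stats {
          int nr_succeeded;       /* in units of base pages */
          int nr_failed_pages;    /* in units of base pages */
          int nr_thp_split;       /* large folios split before migrating */
  };

  /* Stand-in for a helper such as the later migrate_pages_batch(). */
  static void demo_migrate_batch(int nr_ok, int nr_bad, struct demo_stats *stats)
  {
          stats->nr_succeeded += nr_ok;
          stats->nr_failed_pages += nr_bad;
  }

  int main(void)
  {
          struct demo_stats stats;

          memset(&stats, 0, sizeof(stats));       /* as migrate_pages() does */
          demo_migrate_batch(512, 0, &stats);
          demo_migrate_batch(3, 1, &stats);
          printf("succeeded %d, failed %d, splits %d\n",
                 stats.nr_succeeded, stats.nr_failed_pages, stats.nr_thp_split);
          return 0;
  }
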
diff --git a/queue-6.1/migrate_pages-restrict-number-of-pages-to-migrate-in.patch b/queue-6.1/migrate_pages-restrict-number-of-pages-to-migrate-in.patch
new file mode 100644 (file)
index 0000000..fd1dc52
--- /dev/null
@@ -0,0 +1,364 @@
+From f440d486b0dc2fe6f1bca63448860dc0b8809928 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 13 Feb 2023 20:34:38 +0800
+Subject: migrate_pages: restrict number of pages to migrate in batch
+
+From: Huang Ying <ying.huang@intel.com>
+
+[ Upstream commit 42012e0436d44aeb2e68f11a28ddd0ad3f38b61f ]
+
+This is a preparation patch to batch the folio unmapping and moving for
+non-hugetlb folios.
+
+If we batched the folio unmapping, all folios to be migrated would be
+unmapped before their contents and flags are copied.  If the folios passed
+to migrate_pages() cover too many base pages, the affected processes could
+be stopped for too long, leading to excessive latency.  For example, the
+migrate_pages() syscall will call migrate_pages() with all folios of a
+process.  To avoid this possible issue, in this patch we restrict the
+number of pages to be migrated in one batch to no more than HPAGE_PMD_NR.
+That is, the influence is at the same level as THP migration.
+
+Link: https://lkml.kernel.org/r/20230213123444.155149-4-ying.huang@intel.com
+Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
+Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
+Cc: Zi Yan <ziy@nvidia.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Oscar Salvador <osalvador@suse.de>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: Bharata B Rao <bharata@amd.com>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Xin Hao <xhao@linux.alibaba.com>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Mike Kravetz <mike.kravetz@oracle.com>
+Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Stable-dep-of: 35e41024c4c2 ("vmscan,migrate: fix page count imbalance on node stats when demoting pages")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/migrate.c | 174 +++++++++++++++++++++++++++++++--------------------
+ 1 file changed, 106 insertions(+), 68 deletions(-)
+
+diff --git a/mm/migrate.c b/mm/migrate.c
+index 70d0b20d06a5f..40ae91e1a026b 100644
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -1398,6 +1398,11 @@ static inline int try_split_folio(struct folio *folio, struct list_head *split_f
+       return rc;
+ }
++#ifdef CONFIG_TRANSPARENT_HUGEPAGE
++#define NR_MAX_BATCHED_MIGRATION      HPAGE_PMD_NR
++#else
++#define NR_MAX_BATCHED_MIGRATION      512
++#endif
+ #define NR_MAX_MIGRATE_PAGES_RETRY    10
+ struct migrate_pages_stats {
+@@ -1499,40 +1504,15 @@ static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page,
+       return nr_failed;
+ }
+-/*
+- * migrate_pages - migrate the folios specified in a list, to the free folios
+- *               supplied as the target for the page migration
+- *
+- * @from:             The list of folios to be migrated.
+- * @get_new_page:     The function used to allocate free folios to be used
+- *                    as the target of the folio migration.
+- * @put_new_page:     The function used to free target folios if migration
+- *                    fails, or NULL if no special handling is necessary.
+- * @private:          Private data to be passed on to get_new_page()
+- * @mode:             The migration mode that specifies the constraints for
+- *                    folio migration, if any.
+- * @reason:           The reason for folio migration.
+- * @ret_succeeded:    Set to the number of folios migrated successfully if
+- *                    the caller passes a non-NULL pointer.
+- *
+- * The function returns after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no folios
+- * are movable any more because the list has become empty or no retryable folios
+- * exist any more. It is caller's responsibility to call putback_movable_pages()
+- * only if ret != 0.
+- *
+- * Returns the number of {normal folio, large folio, hugetlb} that were not
+- * migrated, or an error code. The number of large folio splits will be
+- * considered as the number of non-migrated large folio, no matter how many
+- * split folios of the large folio are migrated successfully.
+- */
+-int migrate_pages(struct list_head *from, new_page_t get_new_page,
++static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
+               free_page_t put_new_page, unsigned long private,
+-              enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
++              enum migrate_mode mode, int reason, struct list_head *ret_folios,
++              struct migrate_pages_stats *stats)
+ {
+       int retry = 1;
+       int large_retry = 1;
+       int thp_retry = 1;
+-      int nr_failed;
++      int nr_failed = 0;
+       int nr_retry_pages = 0;
+       int nr_large_failed = 0;
+       int pass = 0;
+@@ -1540,20 +1520,9 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+       bool is_thp = false;
+       struct folio *folio, *folio2;
+       int rc, nr_pages;
+-      LIST_HEAD(ret_folios);
+       LIST_HEAD(split_folios);
+       bool nosplit = (reason == MR_NUMA_MISPLACED);
+       bool no_split_folio_counting = false;
+-      struct migrate_pages_stats stats;
+-
+-      trace_mm_migrate_pages_start(mode, reason);
+-
+-      memset(&stats, 0, sizeof(stats));
+-      rc = migrate_hugetlbs(from, get_new_page, put_new_page, private, mode, reason,
+-                            &stats, &ret_folios);
+-      if (rc < 0)
+-              goto out;
+-      nr_failed = rc;
+ split_folio_migration:
+       for (pass = 0;
+@@ -1565,12 +1534,6 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+               nr_retry_pages = 0;
+               list_for_each_entry_safe(folio, folio2, from, lru) {
+-                      /* Retried hugetlb folios will be kept in list  */
+-                      if (folio_test_hugetlb(folio)) {
+-                              list_move_tail(&folio->lru, &ret_folios);
+-                              continue;
+-                      }
+-
+                       /*
+                        * Large folio statistics is based on the source large
+                        * folio. Capture required information that might get
+@@ -1584,15 +1547,14 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+                       rc = unmap_and_move(get_new_page, put_new_page,
+                                           private, folio, pass > 2, mode,
+-                                          reason, &ret_folios);
++                                          reason, ret_folios);
+                       /*
+                        * The rules are:
+                        *      Success: folio will be freed
+                        *      -EAGAIN: stay on the from list
+                        *      -ENOMEM: stay on the from list
+                        *      -ENOSYS: stay on the from list
+-                       *      Other errno: put on ret_folios list then splice to
+-                       *                   from list
++                       *      Other errno: put on ret_folios list
+                        */
+                       switch(rc) {
+                       /*
+@@ -1609,17 +1571,17 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+                               /* Large folio migration is unsupported */
+                               if (is_large) {
+                                       nr_large_failed++;
+-                                      stats.nr_thp_failed += is_thp;
++                                      stats->nr_thp_failed += is_thp;
+                                       if (!try_split_folio(folio, &split_folios)) {
+-                                              stats.nr_thp_split += is_thp;
++                                              stats->nr_thp_split += is_thp;
+                                               break;
+                                       }
+                               } else if (!no_split_folio_counting) {
+                                       nr_failed++;
+                               }
+-                              stats.nr_failed_pages += nr_pages;
+-                              list_move_tail(&folio->lru, &ret_folios);
++                              stats->nr_failed_pages += nr_pages;
++                              list_move_tail(&folio->lru, ret_folios);
+                               break;
+                       case -ENOMEM:
+                               /*
+@@ -1628,13 +1590,13 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+                                */
+                               if (is_large) {
+                                       nr_large_failed++;
+-                                      stats.nr_thp_failed += is_thp;
++                                      stats->nr_thp_failed += is_thp;
+                                       /* Large folio NUMA faulting doesn't split to retry. */
+                                       if (!nosplit) {
+                                               int ret = try_split_folio(folio, &split_folios);
+                                               if (!ret) {
+-                                                      stats.nr_thp_split += is_thp;
++                                                      stats->nr_thp_split += is_thp;
+                                                       break;
+                                               } else if (reason == MR_LONGTERM_PIN &&
+                                                          ret == -EAGAIN) {
+@@ -1652,17 +1614,17 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+                                       nr_failed++;
+                               }
+-                              stats.nr_failed_pages += nr_pages + nr_retry_pages;
++                              stats->nr_failed_pages += nr_pages + nr_retry_pages;
+                               /*
+                                * There might be some split folios of fail-to-migrate large
+-                               * folios left in split_folios list. Move them back to migration
++                               * folios left in split_folios list. Move them to ret_folios
+                                * list so that they could be put back to the right list by
+                                * the caller otherwise the folio refcnt will be leaked.
+                                */
+-                              list_splice_init(&split_folios, from);
++                              list_splice_init(&split_folios, ret_folios);
+                               /* nr_failed isn't updated for not used */
+                               nr_large_failed += large_retry;
+-                              stats.nr_thp_failed += thp_retry;
++                              stats->nr_thp_failed += thp_retry;
+                               goto out;
+                       case -EAGAIN:
+                               if (is_large) {
+@@ -1674,8 +1636,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+                               nr_retry_pages += nr_pages;
+                               break;
+                       case MIGRATEPAGE_SUCCESS:
+-                              stats.nr_succeeded += nr_pages;
+-                              stats.nr_thp_succeeded += is_thp;
++                              stats->nr_succeeded += nr_pages;
++                              stats->nr_thp_succeeded += is_thp;
+                               break;
+                       default:
+                               /*
+@@ -1686,20 +1648,20 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+                                */
+                               if (is_large) {
+                                       nr_large_failed++;
+-                                      stats.nr_thp_failed += is_thp;
++                                      stats->nr_thp_failed += is_thp;
+                               } else if (!no_split_folio_counting) {
+                                       nr_failed++;
+                               }
+-                              stats.nr_failed_pages += nr_pages;
++                              stats->nr_failed_pages += nr_pages;
+                               break;
+                       }
+               }
+       }
+       nr_failed += retry;
+       nr_large_failed += large_retry;
+-      stats.nr_thp_failed += thp_retry;
+-      stats.nr_failed_pages += nr_retry_pages;
++      stats->nr_thp_failed += thp_retry;
++      stats->nr_failed_pages += nr_retry_pages;
+       /*
+        * Try to migrate split folios of fail-to-migrate large folios, no
+        * nr_failed counting in this round, since all split folios of a
+@@ -1710,7 +1672,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+                * Move non-migrated folios (after NR_MAX_MIGRATE_PAGES_RETRY
+                * retries) to ret_folios to avoid migrating them again.
+                */
+-              list_splice_init(from, &ret_folios);
++              list_splice_init(from, ret_folios);
+               list_splice_init(&split_folios, from);
+               no_split_folio_counting = true;
+               retry = 1;
+@@ -1718,6 +1680,82 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+       }
+       rc = nr_failed + nr_large_failed;
++out:
++      return rc;
++}
++
++/*
++ * migrate_pages - migrate the folios specified in a list, to the free folios
++ *               supplied as the target for the page migration
++ *
++ * @from:             The list of folios to be migrated.
++ * @get_new_page:     The function used to allocate free folios to be used
++ *                    as the target of the folio migration.
++ * @put_new_page:     The function used to free target folios if migration
++ *                    fails, or NULL if no special handling is necessary.
++ * @private:          Private data to be passed on to get_new_page()
++ * @mode:             The migration mode that specifies the constraints for
++ *                    folio migration, if any.
++ * @reason:           The reason for folio migration.
++ * @ret_succeeded:    Set to the number of folios migrated successfully if
++ *                    the caller passes a non-NULL pointer.
++ *
++ * The function returns after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no folios
++ * are movable any more because the list has become empty or no retryable folios
++ * exist any more. It is caller's responsibility to call putback_movable_pages()
++ * only if ret != 0.
++ *
++ * Returns the number of {normal folio, large folio, hugetlb} that were not
++ * migrated, or an error code. The number of large folio splits will be
++ * considered as the number of non-migrated large folio, no matter how many
++ * split folios of the large folio are migrated successfully.
++ */
++int migrate_pages(struct list_head *from, new_page_t get_new_page,
++              free_page_t put_new_page, unsigned long private,
++              enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
++{
++      int rc, rc_gather;
++      int nr_pages;
++      struct folio *folio, *folio2;
++      LIST_HEAD(folios);
++      LIST_HEAD(ret_folios);
++      struct migrate_pages_stats stats;
++
++      trace_mm_migrate_pages_start(mode, reason);
++
++      memset(&stats, 0, sizeof(stats));
++
++      rc_gather = migrate_hugetlbs(from, get_new_page, put_new_page, private,
++                                   mode, reason, &stats, &ret_folios);
++      if (rc_gather < 0)
++              goto out;
++again:
++      nr_pages = 0;
++      list_for_each_entry_safe(folio, folio2, from, lru) {
++              /* Retried hugetlb folios will be kept in list  */
++              if (folio_test_hugetlb(folio)) {
++                      list_move_tail(&folio->lru, &ret_folios);
++                      continue;
++              }
++
++              nr_pages += folio_nr_pages(folio);
++              if (nr_pages > NR_MAX_BATCHED_MIGRATION)
++                      break;
++      }
++      if (nr_pages > NR_MAX_BATCHED_MIGRATION)
++              list_cut_before(&folios, from, &folio->lru);
++      else
++              list_splice_init(from, &folios);
++      rc = migrate_pages_batch(&folios, get_new_page, put_new_page, private,
++                               mode, reason, &ret_folios, &stats);
++      list_splice_tail_init(&folios, &ret_folios);
++      if (rc < 0) {
++              rc_gather = rc;
++              goto out;
++      }
++      rc_gather += rc;
++      if (!list_empty(from))
++              goto again;
+ out:
+       /*
+        * Put the permanent failure folio back to migration list, they
+@@ -1730,7 +1768,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+        * are migrated successfully.
+        */
+       if (list_empty(from))
+-              rc = 0;
++              rc_gather = 0;
+       count_vm_events(PGMIGRATE_SUCCESS, stats.nr_succeeded);
+       count_vm_events(PGMIGRATE_FAIL, stats.nr_failed_pages);
+@@ -1744,7 +1782,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+       if (ret_succeeded)
+               *ret_succeeded = stats.nr_succeeded;
+-      return rc;
++      return rc_gather;
+ }
+ struct page *alloc_migration_target(struct page *page, unsigned long private)
+-- 
+2.43.0
+
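
The patch above caps each call to the new migrate_pages_batch() at NR_MAX_BATCHED_MIGRATION base pages: migrate_pages() walks the list accumulating folio_nr_pages(), and once the running total crosses the cap it uses list_cut_before() so that the folio which tipped the total over stays behind for the next round. The loop below is a simplified userspace rendering of that carving logic, assuming arrays instead of list_heads and made-up names (TOY_BATCH_CAP, toy_migrate_batch); it is not the kernel implementation.

  /*
   * Userspace sketch (illustrative only) of the batch carving done above:
   * accumulate base pages until the cap would be crossed, hand everything
   * before the crossing entry to the batch routine, repeat.
   */
  #include <stdio.h>

  #define TOY_BATCH_CAP 512       /* stands in for NR_MAX_BATCHED_MIGRATION */

  static void toy_migrate_batch(const int *nr_pages, int start, int end)
  {
          int total = 0;

          for (int i = start; i < end; i++)
                  total += nr_pages[i];
          printf("batch of folios [%d, %d): %d base pages\n", start, end, total);
  }

  int main(void)
  {
          /* folio sizes in base pages: THPs mixed with order-0 folios */
          int nr_pages[] = { 512, 1, 1, 512, 1, 512, 1, 1 };
          int n = sizeof(nr_pages) / sizeof(nr_pages[0]);
          int start = 0;

          while (start < n) {
                  int total = 0;
                  int end = start;

                  /*
                   * Always take at least one folio, then stop before the
                   * entry whose pages would push the total over the cap;
                   * that entry stays behind for the next round, roughly
                   * what list_cut_before() on the tipping folio achieves.
                   */
                  do {
                          total += nr_pages[end];
                          end++;
                  } while (end < n && total + nr_pages[end] <= TOY_BATCH_CAP);

                  toy_migrate_batch(nr_pages, start, end);
                  start = end;
          }
          return 0;
  }
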
diff --git a/queue-6.1/migrate_pages-separate-hugetlb-folios-migration.patch b/queue-6.1/migrate_pages-separate-hugetlb-folios-migration.patch
new file mode 100644 (file)
index 0000000..aef6027
--- /dev/null
@@ -0,0 +1,253 @@
+From dc1b2cb876a9212a452499066bada1d7645a8442 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 13 Feb 2023 20:34:37 +0800
+Subject: migrate_pages: separate hugetlb folios migration
+
+From: Huang Ying <ying.huang@intel.com>
+
+[ Upstream commit e5bfff8b10e496378da4b7863479dd6fb907d4ea ]
+
+This is a preparation patch to batch the folio unmapping and moving for
+the non-hugetlb folios.  Based on that we can batch the TLB shootdown
+during the folio migration and make it possible to use some hardware
+accelerator for the folio copying.
+
+In this patch the migration of hugetlb folios and non-hugetlb folios is
+separated in migrate_pages() to make it easy to change the non-hugetlb
+folio migration implementation.
+
+Link: https://lkml.kernel.org/r/20230213123444.155149-3-ying.huang@intel.com
+Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
+Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
+Reviewed-by: Xin Hao <xhao@linux.alibaba.com>
+Cc: Zi Yan <ziy@nvidia.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Oscar Salvador <osalvador@suse.de>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: Bharata B Rao <bharata@amd.com>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Mike Kravetz <mike.kravetz@oracle.com>
+Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Stable-dep-of: 35e41024c4c2 ("vmscan,migrate: fix page count imbalance on node stats when demoting pages")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/migrate.c | 141 +++++++++++++++++++++++++++++++++++++++++++--------
+ 1 file changed, 119 insertions(+), 22 deletions(-)
+
+diff --git a/mm/migrate.c b/mm/migrate.c
+index b7596a0b4445f..70d0b20d06a5f 100644
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -1398,6 +1398,8 @@ static inline int try_split_folio(struct folio *folio, struct list_head *split_f
+       return rc;
+ }
++#define NR_MAX_MIGRATE_PAGES_RETRY    10
++
+ struct migrate_pages_stats {
+       int nr_succeeded;       /* Normal and large folios migrated successfully, in
+                                  units of base pages */
+@@ -1408,6 +1410,95 @@ struct migrate_pages_stats {
+       int nr_thp_split;       /* THP split before migrating */
+ };
++/*
++ * Returns the number of hugetlb folios that were not migrated, or an error code
++ * after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no hugetlb folios are movable
++ * any more because the list has become empty or no retryable hugetlb folios
++ * exist any more. It is caller's responsibility to call putback_movable_pages()
++ * only if ret != 0.
++ */
++static int migrate_hugetlbs(struct list_head *from, new_page_t get_new_page,
++                          free_page_t put_new_page, unsigned long private,
++                          enum migrate_mode mode, int reason,
++                          struct migrate_pages_stats *stats,
++                          struct list_head *ret_folios)
++{
++      int retry = 1;
++      int nr_failed = 0;
++      int nr_retry_pages = 0;
++      int pass = 0;
++      struct folio *folio, *folio2;
++      int rc, nr_pages;
++
++      for (pass = 0; pass < NR_MAX_MIGRATE_PAGES_RETRY && retry; pass++) {
++              retry = 0;
++              nr_retry_pages = 0;
++
++              list_for_each_entry_safe(folio, folio2, from, lru) {
++                      if (!folio_test_hugetlb(folio))
++                              continue;
++
++                      nr_pages = folio_nr_pages(folio);
++
++                      cond_resched();
++
++                      rc = unmap_and_move_huge_page(get_new_page,
++                                                    put_new_page, private,
++                                                    &folio->page, pass > 2, mode,
++                                                    reason, ret_folios);
++                      /*
++                       * The rules are:
++                       *      Success: hugetlb folio will be put back
++                       *      -EAGAIN: stay on the from list
++                       *      -ENOMEM: stay on the from list
++                       *      -ENOSYS: stay on the from list
++                       *      Other errno: put on ret_folios list
++                       */
++                      switch(rc) {
++                      case -ENOSYS:
++                              /* Hugetlb migration is unsupported */
++                              nr_failed++;
++                              stats->nr_failed_pages += nr_pages;
++                              list_move_tail(&folio->lru, ret_folios);
++                              break;
++                      case -ENOMEM:
++                              /*
++                               * When memory is low, don't bother to try to migrate
++                               * other folios, just exit.
++                               */
++                              stats->nr_failed_pages += nr_pages + nr_retry_pages;
++                              return -ENOMEM;
++                      case -EAGAIN:
++                              retry++;
++                              nr_retry_pages += nr_pages;
++                              break;
++                      case MIGRATEPAGE_SUCCESS:
++                              stats->nr_succeeded += nr_pages;
++                              break;
++                      default:
++                              /*
++                               * Permanent failure (-EBUSY, etc.):
++                               * unlike -EAGAIN case, the failed folio is
++                               * removed from migration folio list and not
++                               * retried in the next outer loop.
++                               */
++                              nr_failed++;
++                              stats->nr_failed_pages += nr_pages;
++                              break;
++                      }
++              }
++      }
++      /*
++       * nr_failed is number of hugetlb folios failed to be migrated.  After
++       * NR_MAX_MIGRATE_PAGES_RETRY attempts, give up and count retried hugetlb
++       * folios as failed.
++       */
++      nr_failed += retry;
++      stats->nr_failed_pages += nr_retry_pages;
++
++      return nr_failed;
++}
++
+ /*
+  * migrate_pages - migrate the folios specified in a list, to the free folios
+  *               supplied as the target for the page migration
+@@ -1424,10 +1515,10 @@ struct migrate_pages_stats {
+  * @ret_succeeded:    Set to the number of folios migrated successfully if
+  *                    the caller passes a non-NULL pointer.
+  *
+- * The function returns after 10 attempts or if no folios are movable any more
+- * because the list has become empty or no retryable folios exist any more.
+- * It is caller's responsibility to call putback_movable_pages() to return folios
+- * to the LRU or free list only if ret != 0.
++ * The function returns after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no folios
++ * are movable any more because the list has become empty or no retryable folios
++ * exist any more. It is caller's responsibility to call putback_movable_pages()
++ * only if ret != 0.
+  *
+  * Returns the number of {normal folio, large folio, hugetlb} that were not
+  * migrated, or an error code. The number of large folio splits will be
+@@ -1441,7 +1532,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+       int retry = 1;
+       int large_retry = 1;
+       int thp_retry = 1;
+-      int nr_failed = 0;
++      int nr_failed;
+       int nr_retry_pages = 0;
+       int nr_large_failed = 0;
+       int pass = 0;
+@@ -1458,38 +1549,45 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+       trace_mm_migrate_pages_start(mode, reason);
+       memset(&stats, 0, sizeof(stats));
++      rc = migrate_hugetlbs(from, get_new_page, put_new_page, private, mode, reason,
++                            &stats, &ret_folios);
++      if (rc < 0)
++              goto out;
++      nr_failed = rc;
++
+ split_folio_migration:
+-      for (pass = 0; pass < 10 && (retry || large_retry); pass++) {
++      for (pass = 0;
++           pass < NR_MAX_MIGRATE_PAGES_RETRY && (retry || large_retry);
++           pass++) {
+               retry = 0;
+               large_retry = 0;
+               thp_retry = 0;
+               nr_retry_pages = 0;
+               list_for_each_entry_safe(folio, folio2, from, lru) {
++                      /* Retried hugetlb folios will be kept in list  */
++                      if (folio_test_hugetlb(folio)) {
++                              list_move_tail(&folio->lru, &ret_folios);
++                              continue;
++                      }
++
+                       /*
+                        * Large folio statistics is based on the source large
+                        * folio. Capture required information that might get
+                        * lost during migration.
+                        */
+-                      is_large = folio_test_large(folio) && !folio_test_hugetlb(folio);
++                      is_large = folio_test_large(folio);
+                       is_thp = is_large && folio_test_pmd_mappable(folio);
+                       nr_pages = folio_nr_pages(folio);
++
+                       cond_resched();
+-                      if (folio_test_hugetlb(folio))
+-                              rc = unmap_and_move_huge_page(get_new_page,
+-                                              put_new_page, private,
+-                                              &folio->page, pass > 2, mode,
+-                                              reason,
+-                                              &ret_folios);
+-                      else
+-                              rc = unmap_and_move(get_new_page, put_new_page,
+-                                              private, folio, pass > 2, mode,
+-                                              reason, &ret_folios);
++                      rc = unmap_and_move(get_new_page, put_new_page,
++                                          private, folio, pass > 2, mode,
++                                          reason, &ret_folios);
+                       /*
+                        * The rules are:
+-                       *      Success: non hugetlb folio will be freed, hugetlb
+-                       *               folio will be put back
++                       *      Success: folio will be freed
+                        *      -EAGAIN: stay on the from list
+                        *      -ENOMEM: stay on the from list
+                        *      -ENOSYS: stay on the from list
+@@ -1516,7 +1614,6 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+                                               stats.nr_thp_split += is_thp;
+                                               break;
+                                       }
+-                              /* Hugetlb migration is unsupported */
+                               } else if (!no_split_folio_counting) {
+                                       nr_failed++;
+                               }
+@@ -1610,8 +1707,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+        */
+       if (!list_empty(&split_folios)) {
+               /*
+-               * Move non-migrated folios (after 10 retries) to ret_folios
+-               * to avoid migrating them again.
++               * Move non-migrated folios (after NR_MAX_MIGRATE_PAGES_RETRY
++               * retries) to ret_folios to avoid migrating them again.
+                */
+               list_splice_init(from, &ret_folios);
+               list_splice_init(&split_folios, from);
+-- 
+2.43.0
+
diff --git a/queue-6.1/migrate_pages-split-unmap_and_move-to-_unmap-and-_mo.patch b/queue-6.1/migrate_pages-split-unmap_and_move-to-_unmap-and-_mo.patch
new file mode 100644 (file)
index 0000000..7629fc4
--- /dev/null
@@ -0,0 +1,310 @@
+From 742e80422397bb0e53a0352ee493019024a9c902 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 13 Feb 2023 20:34:39 +0800
+Subject: migrate_pages: split unmap_and_move() to _unmap() and _move()
+
+From: Huang Ying <ying.huang@intel.com>
+
+[ Upstream commit 64c8902ed4418317cd416c566f896bd4a92b2efc ]
+
+This is a preparation patch to batch the folio unmapping and moving.
+
+In this patch, unmap_and_move() is split to migrate_folio_unmap() and
+migrate_folio_move().  So, we can batch _unmap() and _move() in different
+loops later.  To pass some information between unmap and move, the
+original unused dst->mapping and dst->private are used.
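+
+As a rough, hypothetical illustration of what the split enables (this is
+not the kernel code; the stub names and types below are invented), a caller
+can run the unmap phase over a whole batch of folios before running the
+move phase, instead of doing both steps per folio:
+
+  #include <stddef.h>
+
+  struct folio_stub { int id; int unmapped; };
+
+  static int stub_unmap(struct folio_stub *f) { f->unmapped = 1; return 0; }
+  static int stub_move(struct folio_stub *f)  { return f->unmapped ? 0 : -1; }
+
+  static void migrate_batch(struct folio_stub *batch, size_t n)
+  {
+          size_t i;
+
+          for (i = 0; i < n; i++)         /* phase 1: unmap every source folio */
+                  stub_unmap(&batch[i]);
+          /* batched work (e.g. a single deferred TLB flush) could sit here */
+          for (i = 0; i < n; i++)         /* phase 2: move every unmapped folio */
+                  stub_move(&batch[i]);
+  }
+
+  int main(void)
+  {
+          struct folio_stub batch[3] = { {0, 0}, {1, 0}, {2, 0} };
+
+          migrate_batch(batch, 3);
+          return 0;
+  }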
+
+Link: https://lkml.kernel.org/r/20230213123444.155149-5-ying.huang@intel.com
+Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
+Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
+Reviewed-by: Xin Hao <xhao@linux.alibaba.com>
+Cc: Zi Yan <ziy@nvidia.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Oscar Salvador <osalvador@suse.de>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: Bharata B Rao <bharata@amd.com>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Mike Kravetz <mike.kravetz@oracle.com>
+Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Stable-dep-of: 35e41024c4c2 ("vmscan,migrate: fix page count imbalance on node stats when demoting pages")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/migrate.h |   1 +
+ mm/migrate.c            | 169 ++++++++++++++++++++++++++++++----------
+ 2 files changed, 129 insertions(+), 41 deletions(-)
+
+diff --git a/include/linux/migrate.h b/include/linux/migrate.h
+index 3ef77f52a4f04..7376074f2e1e3 100644
+--- a/include/linux/migrate.h
++++ b/include/linux/migrate.h
+@@ -18,6 +18,7 @@ struct migration_target_control;
+  * - zero on page migration success;
+  */
+ #define MIGRATEPAGE_SUCCESS           0
++#define MIGRATEPAGE_UNMAP             1
+ /**
+  * struct movable_operations - Driver page migration
+diff --git a/mm/migrate.c b/mm/migrate.c
+index 40ae91e1a026b..46a1476e188c3 100644
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -1011,11 +1011,53 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
+       return rc;
+ }
+-static int __unmap_and_move(struct folio *src, struct folio *dst,
++/*
++ * To record some information during migration, we use some unused
++ * fields (mapping and private) of struct folio of the newly allocated
++ * destination folio.  This is safe because nobody is using them
++ * except us.
++ */
++static void __migrate_folio_record(struct folio *dst,
++                                 unsigned long page_was_mapped,
++                                 struct anon_vma *anon_vma)
++{
++      dst->mapping = (void *)anon_vma;
++      dst->private = (void *)page_was_mapped;
++}
++
++static void __migrate_folio_extract(struct folio *dst,
++                                 int *page_was_mappedp,
++                                 struct anon_vma **anon_vmap)
++{
++      *anon_vmap = (void *)dst->mapping;
++      *page_was_mappedp = (unsigned long)dst->private;
++      dst->mapping = NULL;
++      dst->private = NULL;
++}
++
++/* Cleanup src folio upon migration success */
++static void migrate_folio_done(struct folio *src,
++                             enum migrate_reason reason)
++{
++      /*
++       * Compaction can migrate also non-LRU pages which are
++       * not accounted to NR_ISOLATED_*. They can be recognized
++       * as __PageMovable
++       */
++      if (likely(!__folio_test_movable(src)))
++              mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON +
++                                  folio_is_file_lru(src), -folio_nr_pages(src));
++
++      if (reason != MR_MEMORY_FAILURE)
++              /* We release the page in page_handle_poison. */
++              folio_put(src);
++}
++
++static int __migrate_folio_unmap(struct folio *src, struct folio *dst,
+                               int force, enum migrate_mode mode)
+ {
+       int rc = -EAGAIN;
+-      bool page_was_mapped = false;
++      int page_was_mapped = 0;
+       struct anon_vma *anon_vma = NULL;
+       bool is_lru = !__PageMovable(&src->page);
+@@ -1091,8 +1133,8 @@ static int __unmap_and_move(struct folio *src, struct folio *dst,
+               goto out_unlock;
+       if (unlikely(!is_lru)) {
+-              rc = move_to_new_folio(dst, src, mode);
+-              goto out_unlock_both;
++              __migrate_folio_record(dst, page_was_mapped, anon_vma);
++              return MIGRATEPAGE_UNMAP;
+       }
+       /*
+@@ -1117,11 +1159,42 @@ static int __unmap_and_move(struct folio *src, struct folio *dst,
+               VM_BUG_ON_FOLIO(folio_test_anon(src) &&
+                              !folio_test_ksm(src) && !anon_vma, src);
+               try_to_migrate(src, 0);
+-              page_was_mapped = true;
++              page_was_mapped = 1;
+       }
+-      if (!folio_mapped(src))
+-              rc = move_to_new_folio(dst, src, mode);
++      if (!folio_mapped(src)) {
++              __migrate_folio_record(dst, page_was_mapped, anon_vma);
++              return MIGRATEPAGE_UNMAP;
++      }
++
++      if (page_was_mapped)
++              remove_migration_ptes(src, src, false);
++
++out_unlock_both:
++      folio_unlock(dst);
++out_unlock:
++      /* Drop an anon_vma reference if we took one */
++      if (anon_vma)
++              put_anon_vma(anon_vma);
++      folio_unlock(src);
++out:
++
++      return rc;
++}
++
++static int __migrate_folio_move(struct folio *src, struct folio *dst,
++                              enum migrate_mode mode)
++{
++      int rc;
++      int page_was_mapped = 0;
++      struct anon_vma *anon_vma = NULL;
++      bool is_lru = !__PageMovable(&src->page);
++
++      __migrate_folio_extract(dst, &page_was_mapped, &anon_vma);
++
++      rc = move_to_new_folio(dst, src, mode);
++      if (unlikely(!is_lru))
++              goto out_unlock_both;
+       /*
+        * When successful, push dst to LRU immediately: so that if it
+@@ -1144,12 +1217,10 @@ static int __unmap_and_move(struct folio *src, struct folio *dst,
+ out_unlock_both:
+       folio_unlock(dst);
+-out_unlock:
+       /* Drop an anon_vma reference if we took one */
+       if (anon_vma)
+               put_anon_vma(anon_vma);
+       folio_unlock(src);
+-out:
+       /*
+        * If migration is successful, decrease refcount of dst,
+        * which will not free the page because new page owner increased
+@@ -1161,19 +1232,15 @@ static int __unmap_and_move(struct folio *src, struct folio *dst,
+       return rc;
+ }
+-/*
+- * Obtain the lock on folio, remove all ptes and migrate the folio
+- * to the newly allocated folio in dst.
+- */
+-static int unmap_and_move(new_page_t get_new_page,
+-                                 free_page_t put_new_page,
+-                                 unsigned long private, struct folio *src,
+-                                 int force, enum migrate_mode mode,
+-                                 enum migrate_reason reason,
+-                                 struct list_head *ret)
++/* Obtain the lock on page, remove all ptes. */
++static int migrate_folio_unmap(new_page_t get_new_page, free_page_t put_new_page,
++                             unsigned long private, struct folio *src,
++                             struct folio **dstp, int force,
++                             enum migrate_mode mode, enum migrate_reason reason,
++                             struct list_head *ret)
+ {
+       struct folio *dst;
+-      int rc = MIGRATEPAGE_SUCCESS;
++      int rc = MIGRATEPAGE_UNMAP;
+       struct page *newpage = NULL;
+       if (!thp_migration_supported() && folio_test_transhuge(src))
+@@ -1184,20 +1251,49 @@ static int unmap_and_move(new_page_t get_new_page,
+               folio_clear_active(src);
+               folio_clear_unevictable(src);
+               /* free_pages_prepare() will clear PG_isolated. */
+-              goto out;
++              list_del(&src->lru);
++              migrate_folio_done(src, reason);
++              return MIGRATEPAGE_SUCCESS;
+       }
+       newpage = get_new_page(&src->page, private);
+       if (!newpage)
+               return -ENOMEM;
+       dst = page_folio(newpage);
++      *dstp = dst;
+       dst->private = NULL;
+-      rc = __unmap_and_move(src, dst, force, mode);
++      rc = __migrate_folio_unmap(src, dst, force, mode);
++      if (rc == MIGRATEPAGE_UNMAP)
++              return rc;
++
++      /*
++       * A folio that has not been unmapped will be restored to
++       * right list unless we want to retry.
++       */
++      if (rc != -EAGAIN)
++              list_move_tail(&src->lru, ret);
++
++      if (put_new_page)
++              put_new_page(&dst->page, private);
++      else
++              folio_put(dst);
++
++      return rc;
++}
++
++/* Migrate the folio to the newly allocated folio in dst. */
++static int migrate_folio_move(free_page_t put_new_page, unsigned long private,
++                            struct folio *src, struct folio *dst,
++                            enum migrate_mode mode, enum migrate_reason reason,
++                            struct list_head *ret)
++{
++      int rc;
++
++      rc = __migrate_folio_move(src, dst, mode);
+       if (rc == MIGRATEPAGE_SUCCESS)
+               set_page_owner_migrate_reason(&dst->page, reason);
+-out:
+       if (rc != -EAGAIN) {
+               /*
+                * A folio that has been migrated has all references
+@@ -1213,20 +1309,7 @@ static int unmap_and_move(new_page_t get_new_page,
+        * we want to retry.
+        */
+       if (rc == MIGRATEPAGE_SUCCESS) {
+-              /*
+-               * Compaction can migrate also non-LRU folios which are
+-               * not accounted to NR_ISOLATED_*. They can be recognized
+-               * as __folio_test_movable
+-               */
+-              if (likely(!__folio_test_movable(src)))
+-                      mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON +
+-                                      folio_is_file_lru(src), -folio_nr_pages(src));
+-
+-              if (reason != MR_MEMORY_FAILURE)
+-                      /*
+-                       * We release the folio in page_handle_poison.
+-                       */
+-                      folio_put(src);
++              migrate_folio_done(src, reason);
+       } else {
+               if (rc != -EAGAIN)
+                       list_add_tail(&src->lru, ret);
+@@ -1518,7 +1601,7 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
+       int pass = 0;
+       bool is_large = false;
+       bool is_thp = false;
+-      struct folio *folio, *folio2;
++      struct folio *folio, *folio2, *dst = NULL;
+       int rc, nr_pages;
+       LIST_HEAD(split_folios);
+       bool nosplit = (reason == MR_NUMA_MISPLACED);
+@@ -1545,9 +1628,13 @@ static int migrate_pages_batch(struct list_head *from, new_page_t get_new_page,
+                       cond_resched();
+-                      rc = unmap_and_move(get_new_page, put_new_page,
+-                                          private, folio, pass > 2, mode,
+-                                          reason, ret_folios);
++                      rc = migrate_folio_unmap(get_new_page, put_new_page, private,
++                                               folio, &dst, pass > 2, mode,
++                                               reason, ret_folios);
++                      if (rc == MIGRATEPAGE_UNMAP)
++                              rc = migrate_folio_move(put_new_page, private,
++                                                      folio, dst, mode,
++                                                      reason, ret_folios);
+                       /*
+                        * The rules are:
+                        *      Success: folio will be freed
+-- 
+2.43.0
+
diff --git a/queue-6.1/mm-migrate-try-again-if-thp-split-is-failed-due-to-p.patch b/queue-6.1/mm-migrate-try-again-if-thp-split-is-failed-due-to-p.patch
new file mode 100644 (file)
index 0000000..b3d2f61
--- /dev/null
@@ -0,0 +1,118 @@
+From 8834c3584cdce54f769ee76fdb530bc80d0689a4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 24 Oct 2022 16:34:22 +0800
+Subject: mm: migrate: try again if THP split is failed due to page refcnt
+
+From: Baolin Wang <baolin.wang@linux.alibaba.com>
+
+[ Upstream commit fd4a7ac32918d3d7a2d17dc06c5520f45e36eb52 ]
+
+When creating a virtual machine, we use memfd_create() to get a file
+descriptor which can be used to create shared memory mappings with the
+mmap function; the mmap() call also sets the MAP_POPULATE flag to
+allocate physical pages for the virtual machine.
+
+When allocating physical pages for the guest, the host can fall back to
+allocating some CMA pages for the guest when over half of the zone's free
+memory is in the CMA area.
+
+In the guest OS, when the application wants to do some data transactions
+with DMA, our QEMU will call the VFIO_IOMMU_MAP_DMA ioctl to do a
+longterm-pin and create IOMMU mappings for the DMA pages.  However, when
+calling the VFIO_IOMMU_MAP_DMA ioctl to pin the physical pages, we found
+that the longterm-pin sometimes fails.
+
+After some investigation, we found that the pages used for the DMA mapping
+can contain some CMA pages, and these CMA pages can cause the longterm-pin
+to fail because they cannot be migrated.  The migration failure may be due
+to a temporary reference count or a memory allocation failure.  That makes
+the VFIO_IOMMU_MAP_DMA ioctl return an error, so the application fails to
+start.
+
+In one migration failure case that I observed (which is not easy to
+reproduce), the 'thp_migration_fail' count is 1 and the
+'thp_split_page_failed' count is also 1.
+
+That means that, when migrating a THP which is in the CMA area, a new THP
+cannot be allocated due to memory fragmentation, so the THP is split.
+However, the THP split also fails, probably because of a temporary
+reference count on this THP.  The temporary reference count can be caused
+by dropping page caches (I observed the drop caches operation in the
+system), but we cannot drop the shmem page caches because they are already
+dirty at that time.
+
+Especially for a THP split failure caused by a temporary reference count,
+we can try again to mitigate the migration failure in this case, according
+to the previous discussion [1].
+
+[1] https://lore.kernel.org/all/470dc638-a300-f261-94b4-e27250e42f96@redhat.com/
+Link: https://lkml.kernel.org/r/6784730480a1df82e8f4cba1ed088e4ac767994b.1666599848.git.baolin.wang@linux.alibaba.com
+Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
+Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: Zi Yan <ziy@nvidia.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Stable-dep-of: 35e41024c4c2 ("vmscan,migrate: fix page count imbalance on node stats when demoting pages")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/huge_memory.c |  4 ++--
+ mm/migrate.c     | 19 ++++++++++++++++---
+ 2 files changed, 18 insertions(+), 5 deletions(-)
+
+diff --git a/mm/huge_memory.c b/mm/huge_memory.c
+index 98a1a05f2db2d..f53bc54dacb37 100644
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -2728,7 +2728,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
+        * split PMDs
+        */
+       if (!can_split_folio(folio, &extra_pins)) {
+-              ret = -EBUSY;
++              ret = -EAGAIN;
+               goto out_unlock;
+       }
+@@ -2780,7 +2780,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
+                       xas_unlock(&xas);
+               local_irq_enable();
+               remap_page(folio, folio_nr_pages(folio));
+-              ret = -EBUSY;
++              ret = -EAGAIN;
+       }
+ out_unlock:
+diff --git a/mm/migrate.c b/mm/migrate.c
+index 0252aa4ff572e..b0caa89e67d5f 100644
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -1518,9 +1518,22 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
+                               if (is_thp) {
+                                       nr_thp_failed++;
+                                       /* THP NUMA faulting doesn't split THP to retry. */
+-                                      if (!nosplit && !try_split_thp(page, &thp_split_pages)) {
+-                                              nr_thp_split++;
+-                                              break;
++                                      if (!nosplit) {
++                                              int ret = try_split_thp(page, &thp_split_pages);
++
++                                              if (!ret) {
++                                                      nr_thp_split++;
++                                                      break;
++                                              } else if (reason == MR_LONGTERM_PIN &&
++                                                         ret == -EAGAIN) {
++                                                      /*
++                                                       * Try again to split THP to mitigate
++                                                       * the failure of longterm pinning.
++                                                       */
++                                                      thp_retry++;
++                                                      nr_retry_pages += nr_subpages;
++                                                      break;
++                                              }
+                                       }
+                               } else if (!no_subpage_counting) {
+                                       nr_failed++;
+-- 
+2.43.0
+
diff --git a/queue-6.1/mm-migrate.c-stop-using-0-as-null-pointer.patch b/queue-6.1/mm-migrate.c-stop-using-0-as-null-pointer.patch
new file mode 100644 (file)
index 0000000..3761d5e
--- /dev/null
@@ -0,0 +1,39 @@
+From d6819b70b7c23771f73c3c02e6128860b55539bb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 16 Nov 2022 09:23:45 +0800
+Subject: mm/migrate.c: stop using 0 as NULL pointer
+
+From: Yang Li <yang.lee@linux.alibaba.com>
+
+[ Upstream commit 4c74b65f478dc9353780a6be17fc82f1b06cea80 ]
+
+mm/migrate.c:1198:24: warning: Using plain integer as NULL pointer
+
+Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=3080
+Link: https://lkml.kernel.org/r/20221116012345.84870-1-yang.lee@linux.alibaba.com
+Signed-off-by: Yang Li <yang.lee@linux.alibaba.com>
+Reported-by: Abaci Robot <abaci@linux.alibaba.com>
+Reviewed-by: David Hildenbrand <david@redhat.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Stable-dep-of: 35e41024c4c2 ("vmscan,migrate: fix page count imbalance on node stats when demoting pages")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/migrate.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/mm/migrate.c b/mm/migrate.c
+index 562f819dc6189..81444abf54dba 100644
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -1192,7 +1192,7 @@ static int unmap_and_move(new_page_t get_new_page,
+               return -ENOMEM;
+       dst = page_folio(newpage);
+-      dst->private = 0;
++      dst->private = NULL;
+       rc = __unmap_and_move(src, dst, force, mode);
+       if (rc == MIGRATEPAGE_SUCCESS)
+               set_page_owner_migrate_reason(&dst->page, reason);
+-- 
+2.43.0
+
diff --git a/queue-6.1/mm-page_alloc-explicitly-define-how-__gfp_high-non-b.patch b/queue-6.1/mm-page_alloc-explicitly-define-how-__gfp_high-non-b.patch
new file mode 100644 (file)
index 0000000..21e642f
--- /dev/null
@@ -0,0 +1,151 @@
+From 1c6a2cba75776f1944c170b81c1ad027ef2a12f4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 13 Jan 2023 11:12:16 +0000
+Subject: mm/page_alloc: explicitly define how __GFP_HIGH non-blocking
+ allocations accesses reserves
+
+From: Mel Gorman <mgorman@techsingularity.net>
+
+[ Upstream commit 1ebbb21811b76c3b932959787f37985af36f62fa ]
+
+GFP_ATOMIC allocations get flagged ALLOC_HARDER, which is a vague
+description.  In preparation for the removal of GFP_ATOMIC, redefine
+__GFP_ATOMIC to simply mean non-blocking and rename ALLOC_HARDER to
+ALLOC_NON_BLOCK accordingly.  __GFP_HIGH is required for access to
+reserves, but non-blocking is granted more access.  For example, GFP_NOWAIT
+is non-blocking but has no special access to reserves.  A __GFP_NOFAIL
+blocking allocation is granted access similar to __GFP_HIGH if the only
+alternative is an OOM kill.
+
+Link: https://lkml.kernel.org/r/20230113111217.14134-6-mgorman@techsingularity.net
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: NeilBrown <neilb@suse.de>
+Cc: Thierry Reding <thierry.reding@gmail.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Stable-dep-of: 281dd25c1a01 ("mm/page_alloc: let GFP_ATOMIC order-0 allocs access highatomic reserves")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/internal.h   |  7 +++++--
+ mm/page_alloc.c | 44 ++++++++++++++++++++++++--------------------
+ 2 files changed, 29 insertions(+), 22 deletions(-)
+
+diff --git a/mm/internal.h b/mm/internal.h
+index cd095ce2f199e..a50bc08337d21 100644
+--- a/mm/internal.h
++++ b/mm/internal.h
+@@ -754,7 +754,10 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
+ #define ALLOC_OOM             ALLOC_NO_WATERMARKS
+ #endif
+-#define ALLOC_HARDER           0x10 /* try to alloc harder */
++#define ALLOC_NON_BLOCK                0x10 /* Caller cannot block. Allow access
++                                     * to 25% of the min watermark or
++                                     * 62.5% if __GFP_HIGH is set.
++                                     */
+ #define ALLOC_MIN_RESERVE      0x20 /* __GFP_HIGH set. Allow access to 50%
+                                      * of the min watermark.
+                                      */
+@@ -769,7 +772,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
+ #define ALLOC_KSWAPD          0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
+ /* Flags that allow allocations below the min watermark. */
+-#define ALLOC_RESERVES (ALLOC_HARDER|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM)
++#define ALLOC_RESERVES (ALLOC_NON_BLOCK|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM)
+ enum ttu_flags;
+ struct tlbflush_unmap_batch;
+diff --git a/mm/page_alloc.c b/mm/page_alloc.c
+index 6ab53e47ccea1..49dc4ba88c278 100644
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -3996,18 +3996,19 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
+                * __GFP_HIGH allows access to 50% of the min reserve as well
+                * as OOM.
+                */
+-              if (alloc_flags & ALLOC_MIN_RESERVE)
++              if (alloc_flags & ALLOC_MIN_RESERVE) {
+                       min -= min / 2;
+-              /*
+-               * Non-blocking allocations can access some of the reserve
+-               * with more access if also __GFP_HIGH. The reasoning is that
+-               * a non-blocking caller may incur a more severe penalty
+-               * if it cannot get memory quickly, particularly if it's
+-               * also __GFP_HIGH.
+-               */
+-              if (alloc_flags & ALLOC_HARDER)
+-                      min -= min / 4;
++                      /*
++                       * Non-blocking allocations (e.g. GFP_ATOMIC) can
++                       * access more reserves than just __GFP_HIGH. Other
++                       * non-blocking allocations requests such as GFP_NOWAIT
++                       * or (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) do not get
++                       * access to the min reserve.
++                       */
++                      if (alloc_flags & ALLOC_NON_BLOCK)
++                              min -= min / 4;
++              }
+               /*
+                * OOM victims can try even harder than the normal reserve
+@@ -4858,28 +4859,30 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
+        * The caller may dip into page reserves a bit more if the caller
+        * cannot run direct reclaim, or if the caller has realtime scheduling
+        * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
+-       * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_MIN_RESERVE(__GFP_HIGH).
++       * set both ALLOC_NON_BLOCK and ALLOC_MIN_RESERVE(__GFP_HIGH).
+        */
+       alloc_flags |= (__force int)
+               (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
+-      if (gfp_mask & __GFP_ATOMIC) {
++      if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) {
+               /*
+                * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
+                * if it can't schedule.
+                */
+               if (!(gfp_mask & __GFP_NOMEMALLOC)) {
+-                      alloc_flags |= ALLOC_HARDER;
++                      alloc_flags |= ALLOC_NON_BLOCK;
+                       if (order > 0)
+                               alloc_flags |= ALLOC_HIGHATOMIC;
+               }
+               /*
+-               * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
+-               * comment for __cpuset_node_allowed().
++               * Ignore cpuset mems for non-blocking __GFP_HIGH (probably
++               * GFP_ATOMIC) rather than fail, see the comment for
++               * __cpuset_node_allowed().
+                */
+-              alloc_flags &= ~ALLOC_CPUSET;
++              if (alloc_flags & ALLOC_MIN_RESERVE)
++                      alloc_flags &= ~ALLOC_CPUSET;
+       } else if (unlikely(rt_task(current)) && in_task())
+               alloc_flags |= ALLOC_MIN_RESERVE;
+@@ -5312,12 +5315,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
+               WARN_ON_ONCE_GFP(costly_order, gfp_mask);
+               /*
+-               * Help non-failing allocations by giving them access to memory
+-               * reserves but do not use ALLOC_NO_WATERMARKS because this
++               * Help non-failing allocations by giving some access to memory
++               * reserves normally used for high priority non-blocking
++               * allocations but do not use ALLOC_NO_WATERMARKS because this
+                * could deplete whole memory reserves which would just make
+-               * the situation worse
++               * the situation worse.
+                */
+-              page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
++              page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_MIN_RESERVE, ac);
+               if (page)
+                       goto got_pg;
+-- 
+2.43.0
+
diff --git a/queue-6.1/mm-page_alloc-explicitly-define-what-alloc-flags-dep.patch b/queue-6.1/mm-page_alloc-explicitly-define-what-alloc-flags-dep.patch
new file mode 100644 (file)
index 0000000..2ed41d8
--- /dev/null
@@ -0,0 +1,113 @@
+From 7f56b2ec2c70a47a901ef2da605c6c7552cd71c2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 13 Jan 2023 11:12:15 +0000
+Subject: mm/page_alloc: explicitly define what alloc flags deplete min
+ reserves
+
+From: Mel Gorman <mgorman@techsingularity.net>
+
+[ Upstream commit ab3508854353793cd35e348fde89a5c09b2fd8b5 ]
+
+As there are more ALLOC_ flags that affect reserves, define what flags
+affect reserves and clarify the effect of each flag.
+
+Link: https://lkml.kernel.org/r/20230113111217.14134-5-mgorman@techsingularity.net
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Acked-by: Vlastimil Babka <vbabka@suse.cz>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: NeilBrown <neilb@suse.de>
+Cc: Thierry Reding <thierry.reding@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Stable-dep-of: 281dd25c1a01 ("mm/page_alloc: let GFP_ATOMIC order-0 allocs access highatomic reserves")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/internal.h   |  3 +++
+ mm/page_alloc.c | 34 ++++++++++++++++++++++------------
+ 2 files changed, 25 insertions(+), 12 deletions(-)
+
+diff --git a/mm/internal.h b/mm/internal.h
+index f0f6198462cc1..cd095ce2f199e 100644
+--- a/mm/internal.h
++++ b/mm/internal.h
+@@ -768,6 +768,9 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
+ #define ALLOC_HIGHATOMIC      0x200 /* Allows access to MIGRATE_HIGHATOMIC */
+ #define ALLOC_KSWAPD          0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
++/* Flags that allow allocations below the min watermark. */
++#define ALLOC_RESERVES (ALLOC_HARDER|ALLOC_MIN_RESERVE|ALLOC_HIGHATOMIC|ALLOC_OOM)
++
+ enum ttu_flags;
+ struct tlbflush_unmap_batch;
+diff --git a/mm/page_alloc.c b/mm/page_alloc.c
+index 8e1f4d779b26c..6ab53e47ccea1 100644
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -3956,15 +3956,14 @@ ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
+ static inline long __zone_watermark_unusable_free(struct zone *z,
+                               unsigned int order, unsigned int alloc_flags)
+ {
+-      const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
+       long unusable_free = (1 << order) - 1;
+       /*
+-       * If the caller does not have rights to ALLOC_HARDER then subtract
+-       * the high-atomic reserves. This will over-estimate the size of the
+-       * atomic reserve but it avoids a search.
++       * If the caller does not have rights to reserves below the min
++       * watermark then subtract the high-atomic reserves. This will
++       * over-estimate the size of the atomic reserve but it avoids a search.
+        */
+-      if (likely(!alloc_harder))
++      if (likely(!(alloc_flags & ALLOC_RESERVES)))
+               unusable_free += z->nr_reserved_highatomic;
+ #ifdef CONFIG_CMA
+@@ -3988,25 +3987,36 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
+ {
+       long min = mark;
+       int o;
+-      const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
+       /* free_pages may go negative - that's OK */
+       free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
+-      if (alloc_flags & ALLOC_MIN_RESERVE)
+-              min -= min / 2;
++      if (unlikely(alloc_flags & ALLOC_RESERVES)) {
++              /*
++               * __GFP_HIGH allows access to 50% of the min reserve as well
++               * as OOM.
++               */
++              if (alloc_flags & ALLOC_MIN_RESERVE)
++                      min -= min / 2;
+-      if (unlikely(alloc_harder)) {
+               /*
+-               * OOM victims can try even harder than normal ALLOC_HARDER
++               * Non-blocking allocations can access some of the reserve
++               * with more access if also __GFP_HIGH. The reasoning is that
++               * a non-blocking caller may incur a more severe penalty
++               * if it cannot get memory quickly, particularly if it's
++               * also __GFP_HIGH.
++               */
++              if (alloc_flags & ALLOC_HARDER)
++                      min -= min / 4;
++
++              /*
++               * OOM victims can try even harder than the normal reserve
+                * users on the grounds that it's definitely going to be in
+                * the exit path shortly and free memory. Any allocation it
+                * makes during the free path will be small and short-lived.
+                */
+               if (alloc_flags & ALLOC_OOM)
+                       min -= min / 2;
+-              else
+-                      min -= min / 4;
+       }
+       /*
+-- 
+2.43.0
+
diff --git a/queue-6.1/mm-page_alloc-explicitly-record-high-order-atomic-al.patch b/queue-6.1/mm-page_alloc-explicitly-record-high-order-atomic-al.patch
new file mode 100644 (file)
index 0000000..c625141
--- /dev/null
@@ -0,0 +1,124 @@
+From 69f11057cb93d04feb54dbe7ac271978e1263a69 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 13 Jan 2023 11:12:14 +0000
+Subject: mm/page_alloc: explicitly record high-order atomic allocations in
+ alloc_flags
+
+From: Mel Gorman <mgorman@techsingularity.net>
+
+[ Upstream commit eb2e2b425c6984ca8034448a3f2c680622bd3d4d ]
+
+A high-order ALLOC_HARDER allocation is assumed to be atomic.  While that
+is accurate, it changes later in the series.  In preparation, explicitly
+record high-order atomic allocations in gfp_to_alloc_flags().
+
+Link: https://lkml.kernel.org/r/20230113111217.14134-4-mgorman@techsingularity.net
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Acked-by: Vlastimil Babka <vbabka@suse.cz>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: NeilBrown <neilb@suse.de>
+Cc: Thierry Reding <thierry.reding@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Stable-dep-of: 281dd25c1a01 ("mm/page_alloc: let GFP_ATOMIC order-0 allocs access highatomic reserves")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/internal.h   |  1 +
+ mm/page_alloc.c | 29 +++++++++++++++++++++++------
+ 2 files changed, 24 insertions(+), 6 deletions(-)
+
+diff --git a/mm/internal.h b/mm/internal.h
+index 1be79a5147549..f0f6198462cc1 100644
+--- a/mm/internal.h
++++ b/mm/internal.h
+@@ -765,6 +765,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
+ #else
+ #define ALLOC_NOFRAGMENT        0x0
+ #endif
++#define ALLOC_HIGHATOMIC      0x200 /* Allows access to MIGRATE_HIGHATOMIC */
+ #define ALLOC_KSWAPD          0x800 /* allow waking of kswapd, __GFP_KSWAPD_RECLAIM set */
+ enum ttu_flags;
+diff --git a/mm/page_alloc.c b/mm/page_alloc.c
+index e78ab23eb1743..8e1f4d779b26c 100644
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -3713,10 +3713,20 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
+                * reserved for high-order atomic allocation, so order-0
+                * request should skip it.
+                */
+-              if (order > 0 && alloc_flags & ALLOC_HARDER)
++              if (alloc_flags & ALLOC_HIGHATOMIC)
+                       page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+               if (!page) {
+                       page = __rmqueue(zone, order, migratetype, alloc_flags);
++
++                      /*
++                       * If the allocation fails, allow OOM handling access
++                       * to HIGHATOMIC reserves as failing now is worse than
++                       * failing a high-order atomic allocation in the
++                       * future.
++                       */
++                      if (!page && (alloc_flags & ALLOC_OOM))
++                              page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
++
+                       if (!page) {
+                               spin_unlock_irqrestore(&zone->lock, flags);
+                               return NULL;
+@@ -4030,8 +4040,10 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
+                       return true;
+               }
+ #endif
+-              if (alloc_harder && !free_area_empty(area, MIGRATE_HIGHATOMIC))
++              if ((alloc_flags & (ALLOC_HIGHATOMIC|ALLOC_OOM)) &&
++                  !free_area_empty(area, MIGRATE_HIGHATOMIC)) {
+                       return true;
++              }
+       }
+       return false;
+ }
+@@ -4293,7 +4305,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
+                        * If this is a high-order atomic allocation then check
+                        * if the pageblock should be reserved for the future
+                        */
+-                      if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
++                      if (unlikely(alloc_flags & ALLOC_HIGHATOMIC))
+                               reserve_highatomic_pageblock(page, zone, order);
+                       return page;
+@@ -4820,7 +4832,7 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
+ }
+ static inline unsigned int
+-gfp_to_alloc_flags(gfp_t gfp_mask)
++gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order)
+ {
+       unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
+@@ -4846,8 +4858,13 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
+                * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
+                * if it can't schedule.
+                */
+-              if (!(gfp_mask & __GFP_NOMEMALLOC))
++              if (!(gfp_mask & __GFP_NOMEMALLOC)) {
+                       alloc_flags |= ALLOC_HARDER;
++
++                      if (order > 0)
++                              alloc_flags |= ALLOC_HIGHATOMIC;
++              }
++
+               /*
+                * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
+                * comment for __cpuset_node_allowed().
+@@ -5056,7 +5073,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
+        * kswapd needs to be woken up, and to avoid the cost of setting up
+        * alloc_flags precisely. So we do that now.
+        */
+-      alloc_flags = gfp_to_alloc_flags(gfp_mask);
++      alloc_flags = gfp_to_alloc_flags(gfp_mask, order);
+       /*
+        * We need to recalculate the starting point for the zonelist iterator
+-- 
+2.43.0
+
diff --git a/queue-6.1/mm-page_alloc-let-gfp_atomic-order-0-allocs-access-h.patch b/queue-6.1/mm-page_alloc-let-gfp_atomic-order-0-allocs-access-h.patch
new file mode 100644 (file)
index 0000000..4f800cf
--- /dev/null
@@ -0,0 +1,88 @@
+From 035af24a1a0a452608fb425c6bd69b4d36c22548 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 11 Oct 2024 13:07:37 +0100
+Subject: mm/page_alloc: let GFP_ATOMIC order-0 allocs access highatomic
+ reserves
+
+From: Matt Fleming <mfleming@cloudflare.com>
+
+[ Upstream commit 281dd25c1a018261a04d1b8bf41a0674000bfe38 ]
+
+Under memory pressure it's possible for GFP_ATOMIC order-0 allocations to
+fail even though free pages are available in the highatomic reserves.
+GFP_ATOMIC allocations cannot trigger unreserve_highatomic_pageblock()
+since it's only run from reclaim.
+
+Given that such allocations will pass the watermarks in
+__zone_watermark_unusable_free(), it makes sense to fall back to highatomic
+reserves the same way that ALLOC_OOM can.
+
+This fixes order-0 page allocation failures observed on Cloudflare's fleet
+when handling network packets:
+
+  kswapd1: page allocation failure: order:0, mode:0x820(GFP_ATOMIC),
+  nodemask=(null),cpuset=/,mems_allowed=0-7
+  CPU: 10 PID: 696 Comm: kswapd1 Kdump: loaded Tainted: G           O 6.6.43-CUSTOM #1
+  Hardware name: MACHINE
+  Call Trace:
+   <IRQ>
+   dump_stack_lvl+0x3c/0x50
+   warn_alloc+0x13a/0x1c0
+   __alloc_pages_slowpath.constprop.0+0xc9d/0xd10
+   __alloc_pages+0x327/0x340
+   __napi_alloc_skb+0x16d/0x1f0
+   bnxt_rx_page_skb+0x96/0x1b0 [bnxt_en]
+   bnxt_rx_pkt+0x201/0x15e0 [bnxt_en]
+   __bnxt_poll_work+0x156/0x2b0 [bnxt_en]
+   bnxt_poll+0xd9/0x1c0 [bnxt_en]
+   __napi_poll+0x2b/0x1b0
+   bpf_trampoline_6442524138+0x7d/0x1000
+   __napi_poll+0x5/0x1b0
+   net_rx_action+0x342/0x740
+   handle_softirqs+0xcf/0x2b0
+   irq_exit_rcu+0x6c/0x90
+   sysvec_apic_timer_interrupt+0x72/0x90
+   </IRQ>
+
+[mfleming@cloudflare.com: update comment]
+  Link: https://lkml.kernel.org/r/20241015125158.3597702-1-matt@readmodwrite.com
+Link: https://lkml.kernel.org/r/20241011120737.3300370-1-matt@readmodwrite.com
+Link: https://lore.kernel.org/all/CAGis_TWzSu=P7QJmjD58WWiu3zjMTVKSzdOwWE8ORaGytzWJwQ@mail.gmail.com/
+Fixes: 1d91df85f399 ("mm/page_alloc: handle a missing case for memalloc_nocma_{save/restore} APIs")
+Signed-off-by: Matt Fleming <mfleming@cloudflare.com>
+Suggested-by: Vlastimil Babka <vbabka@suse.cz>
+Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
+Cc: Mel Gorman <mgorman@techsingularity.net>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/page_alloc.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/mm/page_alloc.c b/mm/page_alloc.c
+index 49dc4ba88c278..b87b350b2f405 100644
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -3719,12 +3719,12 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
+                       page = __rmqueue(zone, order, migratetype, alloc_flags);
+                       /*
+-                       * If the allocation fails, allow OOM handling access
+-                       * to HIGHATOMIC reserves as failing now is worse than
+-                       * failing a high-order atomic allocation in the
+-                       * future.
++                       * If the allocation fails, allow OOM handling and
++                       * order-0 (atomic) allocs access to HIGHATOMIC
++                       * reserves as failing now is worse than failing a
++                       * high-order atomic allocation in the future.
+                        */
+-                      if (!page && (alloc_flags & ALLOC_OOM))
++                      if (!page && (alloc_flags & (ALLOC_OOM|ALLOC_NON_BLOCK)))
+                               page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+                       if (!page) {
+-- 
+2.43.0
+
diff --git a/queue-6.1/mm-page_alloc-rename-alloc_high-to-alloc_min_reserve.patch b/queue-6.1/mm-page_alloc-rename-alloc_high-to-alloc_min_reserve.patch
new file mode 100644 (file)
index 0000000..d1704f4
--- /dev/null
@@ -0,0 +1,113 @@
+From e6ad0b3e024d77a33bb122f362f753202b75a30e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 13 Jan 2023 11:12:12 +0000
+Subject: mm/page_alloc: rename ALLOC_HIGH to ALLOC_MIN_RESERVE
+
+From: Mel Gorman <mgorman@techsingularity.net>
+
+[ Upstream commit 524c48072e5673f4511f1ad81493e2485863fd65 ]
+
+Patch series "Discard __GFP_ATOMIC", v3.
+
+Neil's patch has been residing in mm-unstable as commit 2fafb4fe8f7a ("mm:
+discard __GFP_ATOMIC") for a long time and was recently brought up again.
+Most recently, I was worried that __GFP_HIGH allocations could use
+high-order atomic reserves, which is unintentional, but there was no
+response, so let's revisit -- this series reworks how min reserves are used,
+protects high-order reserves and then finishes with Neil's patch with very
+minor modifications so it fits on top.
+
+There was a review discussion on renaming __GFP_DIRECT_RECLAIM to
+__GFP_ALLOW_BLOCKING, but I didn't think it was that big an issue and it is
+orthogonal to the removal of __GFP_ATOMIC.
+
+There were some concerns about how the gfp flags affect the min reserves
+but it never reached a solid conclusion so I made my own attempt.
+
+The series tries to iron out some of the details of how reserves are used.
+ALLOC_HIGH becomes ALLOC_MIN_RESERVE, ALLOC_HARDER becomes ALLOC_NON_BLOCK,
+and the series documents how the reserves are affected.  For example,
+ALLOC_NON_BLOCK (no direct reclaim) on its own allows 25% of the min
+reserve.  ALLOC_MIN_RESERVE (__GFP_HIGH) allows 50%, and both combined
+allow deeper access again.  ALLOC_OOM allows access to 75%.
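+
+As a stand-alone sketch of the reserve levels summarised above (the flag
+values and the helper below are invented for illustration; this is not the
+page allocator code, and the 62.5% combined figure follows the ALLOC_NON_BLOCK
+comment added later in the series):
+
+  #include <stdio.h>
+
+  #define ALLOC_NON_BLOCK   0x10  /* request cannot enter direct reclaim */
+  #define ALLOC_MIN_RESERVE 0x20  /* __GFP_HIGH set */
+  #define ALLOC_OOM         0x40  /* OOM victim */
+
+  /* roughly what share of the min reserve a request may consume */
+  static double reserve_share(unsigned int alloc_flags)
+  {
+          if (alloc_flags & ALLOC_OOM)
+                  return 0.75;
+          if ((alloc_flags & ALLOC_MIN_RESERVE) && (alloc_flags & ALLOC_NON_BLOCK))
+                  return 0.625;   /* e.g. GFP_ATOMIC: __GFP_HIGH and non-blocking */
+          if (alloc_flags & ALLOC_MIN_RESERVE)
+                  return 0.50;    /* __GFP_HIGH alone */
+          if (alloc_flags & ALLOC_NON_BLOCK)
+                  return 0.25;    /* non-blocking without __GFP_HIGH */
+          return 0.0;             /* everything else stays above the min watermark */
+  }
+
+  int main(void)
+  {
+          printf("GFP_ATOMIC-like request: %.1f%% of the min reserve\n",
+                 100.0 * reserve_share(ALLOC_MIN_RESERVE | ALLOC_NON_BLOCK));
+          return 0;
+  }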
+
+High-order atomic allocations are explicitly handled with the caveat that
+no __GFP_ATOMIC flag means that any high-order allocation that specifies
+GFP_HIGH and cannot enter direct reclaim will be treated as if it was
+GFP_ATOMIC.
+
+This patch (of 6):
+
+__GFP_HIGH aliases to ALLOC_HIGH but the name does not really hint what it
+means.  As ALLOC_HIGH is internal to the allocator, rename it to
+ALLOC_MIN_RESERVE to document that the min reserves can be depleted.
+
+Link: https://lkml.kernel.org/r/20230113111217.14134-1-mgorman@techsingularity.net
+Link: https://lkml.kernel.org/r/20230113111217.14134-2-mgorman@techsingularity.net
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Acked-by: Vlastimil Babka <vbabka@suse.cz>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: NeilBrown <neilb@suse.de>
+Cc: Thierry Reding <thierry.reding@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Stable-dep-of: 281dd25c1a01 ("mm/page_alloc: let GFP_ATOMIC order-0 allocs access highatomic reserves")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/internal.h   | 4 +++-
+ mm/page_alloc.c | 8 ++++----
+ 2 files changed, 7 insertions(+), 5 deletions(-)
+
+diff --git a/mm/internal.h b/mm/internal.h
+index d01130efce5fb..1be79a5147549 100644
+--- a/mm/internal.h
++++ b/mm/internal.h
+@@ -755,7 +755,9 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
+ #endif
+ #define ALLOC_HARDER           0x10 /* try to alloc harder */
+-#define ALLOC_HIGH             0x20 /* __GFP_HIGH set */
++#define ALLOC_MIN_RESERVE      0x20 /* __GFP_HIGH set. Allow access to 50%
++                                     * of the min watermark.
++                                     */
+ #define ALLOC_CPUSET           0x40 /* check for correct cpuset */
+ #define ALLOC_CMA              0x80 /* allow allocations from CMA areas */
+ #ifdef CONFIG_ZONE_DMA32
+diff --git a/mm/page_alloc.c b/mm/page_alloc.c
+index a905b850d31c4..f5b870780d3fd 100644
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -3983,7 +3983,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
+       /* free_pages may go negative - that's OK */
+       free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
+-      if (alloc_flags & ALLOC_HIGH)
++      if (alloc_flags & ALLOC_MIN_RESERVE)
+               min -= min / 2;
+       if (unlikely(alloc_harder)) {
+@@ -4825,18 +4825,18 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
+       unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
+       /*
+-       * __GFP_HIGH is assumed to be the same as ALLOC_HIGH
++       * __GFP_HIGH is assumed to be the same as ALLOC_MIN_RESERVE
+        * and __GFP_KSWAPD_RECLAIM is assumed to be the same as ALLOC_KSWAPD
+        * to save two branches.
+        */
+-      BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
++      BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_MIN_RESERVE);
+       BUILD_BUG_ON(__GFP_KSWAPD_RECLAIM != (__force gfp_t) ALLOC_KSWAPD);
+       /*
+        * The caller may dip into page reserves a bit more if the caller
+        * cannot run direct reclaim, or if the caller has realtime scheduling
+        * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
+-       * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
++       * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_MIN_RESERVE(__GFP_HIGH).
+        */
+       alloc_flags |= (__force int)
+               (gfp_mask & (__GFP_HIGH | __GFP_KSWAPD_RECLAIM));
+-- 
+2.43.0
+
diff --git a/queue-6.1/mm-page_alloc-treat-rt-tasks-similar-to-__gfp_high.patch b/queue-6.1/mm-page_alloc-treat-rt-tasks-similar-to-__gfp_high.patch
new file mode 100644 (file)
index 0000000..98cd1f0
--- /dev/null
@@ -0,0 +1,55 @@
+From 5fc910982082f797aee07e26aceeec356048aab5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 13 Jan 2023 11:12:13 +0000
+Subject: mm/page_alloc: treat RT tasks similar to __GFP_HIGH
+
+From: Mel Gorman <mgorman@techsingularity.net>
+
+[ Upstream commit c988dcbecf3fd5430921eaa3fe9054754f76d185 ]
+
+RT tasks are allowed to dip below the min reserve, but ALLOC_HARDER is
+typically combined with ALLOC_MIN_RESERVE, so RT tasks are a little
+unusual.  While there is some justification for allowing RT tasks access
+to memory reserves, there is a strong chance that an RT task that is also
+under memory pressure is at risk of missing deadlines anyway.  Relax how
+much reserve an RT task can access by treating it the same as __GFP_HIGH
+allocations.
+
+Note that in a future kernel release the RT special casing will be
+removed.  Hard realtime tasks should be locking down resources in advance
+and ensuring enough memory is available.  Even a soft-realtime task like
+audio or video live decoding which cannot jitter should be allocating both
+memory and any disk space required up-front before the recording starts
+instead of relying on reserves.  At best, reserve access will only delay
+the problem by a very short interval.
+
+Link: https://lkml.kernel.org/r/20230113111217.14134-3-mgorman@techsingularity.net
+Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+Acked-by: Vlastimil Babka <vbabka@suse.cz>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: NeilBrown <neilb@suse.de>
+Cc: Thierry Reding <thierry.reding@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Stable-dep-of: 281dd25c1a01 ("mm/page_alloc: let GFP_ATOMIC order-0 allocs access highatomic reserves")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/page_alloc.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/mm/page_alloc.c b/mm/page_alloc.c
+index f5b870780d3fd..e78ab23eb1743 100644
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -4854,7 +4854,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
+                */
+               alloc_flags &= ~ALLOC_CPUSET;
+       } else if (unlikely(rt_task(current)) && in_task())
+-              alloc_flags |= ALLOC_HARDER;
++              alloc_flags |= ALLOC_MIN_RESERVE;
+       alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags);
+-- 
+2.43.0
+
diff --git a/queue-6.1/nvmet-auth-assign-dh_key-to-null-after-kfree_sensiti.patch b/queue-6.1/nvmet-auth-assign-dh_key-to-null-after-kfree_sensiti.patch
new file mode 100644 (file)
index 0000000..0762499
--- /dev/null
@@ -0,0 +1,41 @@
+From 60268a8bc1d37e87324ab48e3fa7c47d3a7306b7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 16 Sep 2024 22:41:37 +0500
+Subject: nvmet-auth: assign dh_key to NULL after kfree_sensitive
+
+From: Vitaliy Shevtsov <v.shevtsov@maxima.ru>
+
+[ Upstream commit d2f551b1f72b4c508ab9298419f6feadc3b5d791 ]
+
+ctrl->dh_key might be used across multiple calls to nvmet_setup_dhgroup()
+for the same controller. So it's better to nullify it after release on the
+error path in order to avoid a double free later in nvmet_destroy_auth().
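+
+A minimal userspace sketch of the problem and the fix (invented names; not
+the nvmet code): if an error path frees the key but leaves the pointer set,
+a later teardown path frees it again.
+
+  #include <stdlib.h>
+
+  struct ctrl_stub { unsigned char *dh_key; };
+
+  static void setup_error_path(struct ctrl_stub *c)
+  {
+          free(c->dh_key);
+          c->dh_key = NULL;       /* the fix: teardown now sees NULL */
+  }
+
+  static void teardown(struct ctrl_stub *c)
+  {
+          free(c->dh_key);        /* free(NULL) is a no-op, so no double free */
+          c->dh_key = NULL;
+  }
+
+  int main(void)
+  {
+          struct ctrl_stub c = { .dh_key = malloc(32) };
+
+          setup_error_path(&c);   /* simulated failure during setup */
+          teardown(&c);           /* would double free without the NULL assignment */
+          return 0;
+  }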
+
+Found by Linux Verification Center (linuxtesting.org) with Svace.
+
+Fixes: 7a277c37d352 ("nvmet-auth: Diffie-Hellman key exchange support")
+Cc: stable@vger.kernel.org
+Signed-off-by: Vitaliy Shevtsov <v.shevtsov@maxima.ru>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Hannes Reinecke <hare@suse.de>
+Signed-off-by: Keith Busch <kbusch@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/nvme/target/auth.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c
+index aacc05ec00c2b..74791078fdebc 100644
+--- a/drivers/nvme/target/auth.c
++++ b/drivers/nvme/target/auth.c
+@@ -101,6 +101,7 @@ int nvmet_setup_dhgroup(struct nvmet_ctrl *ctrl, u8 dhgroup_id)
+                       pr_debug("%s: ctrl %d failed to generate private key, err %d\n",
+                                __func__, ctrl->cntlid, ret);
+                       kfree_sensitive(ctrl->dh_key);
++                      ctrl->dh_key = NULL;
+                       return ret;
+               }
+               ctrl->dh_keysize = crypto_kpp_maxsize(ctrl->dh_tfm);
+-- 
+2.43.0
+
diff --git a/queue-6.1/ocfs2-pass-u64-to-ocfs2_truncate_inline-maybe-overfl.patch b/queue-6.1/ocfs2-pass-u64-to-ocfs2_truncate_inline-maybe-overfl.patch
new file mode 100644 (file)
index 0000000..bfad203
--- /dev/null
@@ -0,0 +1,60 @@
+From e385b2a0a317a67940c499f3891df7a28f222d5a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 16 Oct 2024 19:43:47 +0800
+Subject: ocfs2: pass u64 to ocfs2_truncate_inline maybe overflow
+
+From: Edward Adam Davis <eadavis@qq.com>
+
+[ Upstream commit bc0a2f3a73fcdac651fca64df39306d1e5ebe3b0 ]
+
+Syzbot reported a kernel BUG in ocfs2_truncate_inline.  There are two
+reasons for this: first, the parameter value passed is greater than
+ocfs2_max_inline_data_with_xattr; second, the start and end parameters of
+ocfs2_truncate_inline are "unsigned int".
+
+So we need to add a sanity check for byte_start and byte_len right before
+ocfs2_truncate_inline() in ocfs2_remove_inode_range(): if they are greater
+than ocfs2_max_inline_data_with_xattr, return -EINVAL.
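+
+A stand-alone sketch of that kind of range check (the helper name and types
+are invented; this is not the ocfs2 code): validate the 64-bit offsets
+against the inline-data capacity before they are narrowed to the unsigned
+int parameters of the truncate helper.
+
+  #include <stdint.h>
+  #include <errno.h>
+
+  static int check_inline_range(uint64_t byte_start, uint64_t byte_len,
+                                unsigned int id_count)
+  {
+          if (byte_start > id_count || byte_start + byte_len > id_count)
+                  return -EINVAL; /* request does not fit in the inline area */
+          return 0;
+  }
+
+  int main(void)
+  {
+          /* e.g. 10 + 190 bytes against a 190-byte inline area is rejected */
+          return check_inline_range(10, 190, 190) == -EINVAL ? 0 : 1;
+  }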
+
+Link: https://lkml.kernel.org/r/tencent_D48DB5122ADDAEDDD11918CFB68D93258C07@qq.com
+Fixes: 1afc32b95233 ("ocfs2: Write support for inline data")
+Signed-off-by: Edward Adam Davis <eadavis@qq.com>
+Reported-by: syzbot+81092778aac03460d6b7@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=81092778aac03460d6b7
+Reviewed-by: Joseph Qi <joseph.qi@linux.alibaba.com>
+Cc: Joel Becker <jlbec@evilplan.org>
+Cc: Joseph Qi <joseph.qi@linux.alibaba.com>
+Cc: Mark Fasheh <mark@fasheh.com>
+Cc: Junxiao Bi <junxiao.bi@oracle.com>
+Cc: Changwei Ge <gechangwei@live.cn>
+Cc: Gang He <ghe@suse.com>
+Cc: Jun Piao <piaojun@huawei.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/ocfs2/file.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
+index f502bb2ce2ea7..ea7c79e8ce429 100644
+--- a/fs/ocfs2/file.c
++++ b/fs/ocfs2/file.c
+@@ -1784,6 +1784,14 @@ int ocfs2_remove_inode_range(struct inode *inode,
+               return 0;
+       if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
++              int id_count = ocfs2_max_inline_data_with_xattr(inode->i_sb, di);
++
++              if (byte_start > id_count || byte_start + byte_len > id_count) {
++                      ret = -EINVAL;
++                      mlog_errno(ret);
++                      goto out;
++              }
++
+               ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
+                                           byte_start + byte_len, 0);
+               if (ret) {
+-- 
+2.43.0
+
diff --git a/queue-6.1/riscv-efi-set-nx-compat-flag-in-pe-coff-header.patch b/queue-6.1/riscv-efi-set-nx-compat-flag-in-pe-coff-header.patch
new file mode 100644 (file)
index 0000000..7543ccf
--- /dev/null
@@ -0,0 +1,48 @@
+From 253cd32230547e4e3a73363d58169af99d480326 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 29 Sep 2024 16:02:33 +0200
+Subject: riscv: efi: Set NX compat flag in PE/COFF header
+
+From: Heinrich Schuchardt <heinrich.schuchardt@canonical.com>
+
+[ Upstream commit d41373a4b910961df5a5e3527d7bde6ad45ca438 ]
+
+The IMAGE_DLLCHARACTERISTICS_NX_COMPAT flag informs the firmware that the
+EFI binary does not rely on pages that are both executable and
+writable.
+
+The flag is used by some distro versions of GRUB to decide if the EFI
+binary may be executed.
+
+As the Linux kernel neither has RWX sections nor needs RWX pages for
+relocation, we should set the flag.
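+
+For illustration only (the struct below is a stand-in, not the kernel's EFI
+header definitions): the change amounts to setting the NX-compat bit,
+0x0100 in PE/COFF terms, in the optional header's DllCharacteristics field.
+
+  #include <stdint.h>
+  #include <stdio.h>
+
+  #define IMAGE_DLLCHARACTERISTICS_NX_COMPAT 0x0100
+
+  struct pe_opt_header_stub {
+          uint16_t dll_characteristics;
+  };
+
+  int main(void)
+  {
+          struct pe_opt_header_stub hdr = { 0 };
+
+          hdr.dll_characteristics |= IMAGE_DLLCHARACTERISTICS_NX_COMPAT;
+          printf("DllCharacteristics = 0x%04x\n", hdr.dll_characteristics);
+          return 0;
+  }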
+
+Cc: Ard Biesheuvel <ardb@kernel.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Heinrich Schuchardt <heinrich.schuchardt@canonical.com>
+Reviewed-by: Emil Renner Berthing <emil.renner.berthing@canonical.com>
+Fixes: cb7d2dd5612a ("RISC-V: Add PE/COFF header for EFI stub")
+Acked-by: Ard Biesheuvel <ardb@kernel.org>
+Link: https://lore.kernel.org/r/20240929140233.211800-1-heinrich.schuchardt@canonical.com
+Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/riscv/kernel/efi-header.S | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/arch/riscv/kernel/efi-header.S b/arch/riscv/kernel/efi-header.S
+index 8e733aa48ba6c..c306f3a6a800e 100644
+--- a/arch/riscv/kernel/efi-header.S
++++ b/arch/riscv/kernel/efi-header.S
+@@ -59,7 +59,7 @@ extra_header_fields:
+       .long   efi_header_end - _start                 // SizeOfHeaders
+       .long   0                                       // CheckSum
+       .short  IMAGE_SUBSYSTEM_EFI_APPLICATION         // Subsystem
+-      .short  0                                       // DllCharacteristics
++      .short  IMAGE_DLL_CHARACTERISTICS_NX_COMPAT     // DllCharacteristics
+       .quad   0                                       // SizeOfStackReserve
+       .quad   0                                       // SizeOfStackCommit
+       .quad   0                                       // SizeOfHeapReserve
+-- 
+2.43.0
+
diff --git a/queue-6.1/riscv-remove-duplicated-get_rm.patch b/queue-6.1/riscv-remove-duplicated-get_rm.patch
new file mode 100644 (file)
index 0000000..22e43e3
--- /dev/null
@@ -0,0 +1,38 @@
+From 7c946b69fbe00d6a7ea385b4e627abd569037584 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 8 Oct 2024 17:41:39 +0800
+Subject: riscv: Remove duplicated GET_RM
+
+From: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
+
+[ Upstream commit 164f66de6bb6ef454893f193c898dc8f1da6d18b ]
+
+The macro GET_RM is defined twice in this file; one definition can be
+removed.
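+
+For reference, a small hypothetical demo of what the remaining
+definition extracts (the 3-bit field at bits 14..12 of an instruction
+word); the instruction value below is arbitrary:
+
+#include <stdio.h>
+
+#define GET_RM(insn) (((insn) >> 12) & 7)
+
+int main(void)
+{
+        unsigned int insn = 0x0000f000u;   /* bits 14..12 set */
+        printf("rm = %u\n", GET_RM(insn)); /* prints "rm = 7" */
+        return 0;
+}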
+
+Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
+Signed-off-by: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
+Fixes: 956d705dd279 ("riscv: Unaligned load/store handling for M_MODE")
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20241008094141.549248-3-zhangchunyan@iscas.ac.cn
+Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/riscv/kernel/traps_misaligned.c | 2 --
+ 1 file changed, 2 deletions(-)
+
+diff --git a/arch/riscv/kernel/traps_misaligned.c b/arch/riscv/kernel/traps_misaligned.c
+index 5348d842c7453..3d16cc803220e 100644
+--- a/arch/riscv/kernel/traps_misaligned.c
++++ b/arch/riscv/kernel/traps_misaligned.c
+@@ -132,8 +132,6 @@
+ #define REG_PTR(insn, pos, regs)      \
+       (ulong *)((ulong)(regs) + REG_OFFSET(insn, pos))
+-#define GET_RM(insn)                  (((insn) >> 12) & 7)
+-
+ #define GET_RS1(insn, regs)           (*REG_PTR(insn, SH_RS1, regs))
+ #define GET_RS2(insn, regs)           (*REG_PTR(insn, SH_RS2, regs))
+ #define GET_RS1S(insn, regs)          (*REG_PTR(RVC_RS1S(insn), 0, regs))
+-- 
+2.43.0
+
diff --git a/queue-6.1/riscv-remove-unused-generating_asm_offsets.patch b/queue-6.1/riscv-remove-unused-generating_asm_offsets.patch
new file mode 100644 (file)
index 0000000..aeb7a72
--- /dev/null
@@ -0,0 +1,44 @@
+From c50a27b625ad9865ab4d7c4464b650f3309d5ba8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 8 Oct 2024 17:41:38 +0800
+Subject: riscv: Remove unused GENERATING_ASM_OFFSETS
+
+From: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
+
+[ Upstream commit 46d4e5ac6f2f801f97bcd0ec82365969197dc9b1 ]
+
+The macro is not used in the current version of the kernel, so it can be
+removed to avoid the following build warning:
+
+../arch/riscv/kernel/asm-offsets.c: At top level:
+../arch/riscv/kernel/asm-offsets.c:7: warning: macro "GENERATING_ASM_OFFSETS" is not used [-Wunused-macros]
+    7 | #define GENERATING_ASM_OFFSETS
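+
+A hypothetical standalone demo of this warning class (the file name is
+made up; the macro mirrors the one removed below); build with
+"gcc -Wunused-macros -c unused-macro-demo.c":
+
+/* unused-macro-demo.c: -Wunused-macros fires because a macro defined in
+ * the main source file is never expanded. */
+#define GENERATING_ASM_OFFSETS
+
+int main(void)
+{
+        return 0;
+}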
+
+Fixes: 9639a44394b9 ("RISC-V: Provide a cleaner raw_smp_processor_id()")
+Cc: stable@vger.kernel.org
+Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
+Tested-by: Alexandre Ghiti <alexghiti@rivosinc.com>
+Signed-off-by: Chunyan Zhang <zhangchunyan@iscas.ac.cn>
+Link: https://lore.kernel.org/r/20241008094141.549248-2-zhangchunyan@iscas.ac.cn
+Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/riscv/kernel/asm-offsets.c | 2 --
+ 1 file changed, 2 deletions(-)
+
+diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
+index df9444397908d..1ecafbcee9a0a 100644
+--- a/arch/riscv/kernel/asm-offsets.c
++++ b/arch/riscv/kernel/asm-offsets.c
+@@ -4,8 +4,6 @@
+  * Copyright (C) 2017 SiFive
+  */
+-#define GENERATING_ASM_OFFSETS
+-
+ #include <linux/kbuild.h>
+ #include <linux/mm.h>
+ #include <linux/sched.h>
+-- 
+2.43.0
+
diff --git a/queue-6.1/riscv-use-u-to-format-the-output-of-cpu.patch b/queue-6.1/riscv-use-u-to-format-the-output-of-cpu.patch
new file mode 100644 (file)
index 0000000..919d093
--- /dev/null
@@ -0,0 +1,43 @@
+From ff7ce41d5795e9e45aae3d280bfb411d40248dca Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 17 Oct 2024 11:20:10 +0800
+Subject: riscv: Use '%u' to format the output of 'cpu'
+
+From: WangYuli <wangyuli@uniontech.com>
+
+[ Upstream commit e0872ab72630dada3ae055bfa410bf463ff1d1e0 ]
+
+'cpu' is an unsigned integer, so its conversion specifier should
+be %u, not %d.
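+
+A tiny hypothetical userspace illustration of why the specifier matters
+once the value exceeds INT_MAX:
+
+#include <stdio.h>
+
+int main(void)
+{
+        unsigned int cpu = 3000000000u;  /* hypothetical value > INT_MAX */
+
+        printf("%u\n", cpu);             /* 3000000000 */
+        printf("%d\n", (int)cpu);        /* implementation-defined, usually negative */
+        return 0;
+}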
+
+Suggested-by: Wentao Guan <guanwentao@uniontech.com>
+Suggested-by: Maciej W. Rozycki <macro@orcam.me.uk>
+Link: https://lore.kernel.org/all/alpine.DEB.2.21.2409122309090.40372@angie.orcam.me.uk/
+Signed-off-by: WangYuli <wangyuli@uniontech.com>
+Reviewed-by: Charlie Jenkins <charlie@rivosinc.com>
+Tested-by: Charlie Jenkins <charlie@rivosinc.com>
+Fixes: f1e58583b9c7 ("RISC-V: Support cpu hotplug")
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/4C127DEECDA287C8+20241017032010.96772-1-wangyuli@uniontech.com
+Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/riscv/kernel/cpu-hotplug.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/arch/riscv/kernel/cpu-hotplug.c b/arch/riscv/kernel/cpu-hotplug.c
+index f7a832e3a1d1d..462b3631663f9 100644
+--- a/arch/riscv/kernel/cpu-hotplug.c
++++ b/arch/riscv/kernel/cpu-hotplug.c
+@@ -65,7 +65,7 @@ void __cpu_die(unsigned int cpu)
+       if (cpu_ops[cpu]->cpu_is_stopped)
+               ret = cpu_ops[cpu]->cpu_is_stopped(cpu);
+       if (ret)
+-              pr_warn("CPU%d may not have stopped: %d\n", cpu, ret);
++              pr_warn("CPU%u may not have stopped: %d\n", cpu, ret);
+ }
+ /*
+-- 
+2.43.0
+
diff --git a/queue-6.1/riscv-vdso-prevent-the-compiler-from-inserting-calls.patch b/queue-6.1/riscv-vdso-prevent-the-compiler-from-inserting-calls.patch
new file mode 100644 (file)
index 0000000..c7bce24
--- /dev/null
@@ -0,0 +1,40 @@
+From d06392f489cede0c9e276cf66ff324f9b61a1157 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 16 Oct 2024 10:36:24 +0200
+Subject: riscv: vdso: Prevent the compiler from inserting calls to memset()
+
+From: Alexandre Ghiti <alexghiti@rivosinc.com>
+
+[ Upstream commit bf40167d54d55d4b54d0103713d86a8638fb9290 ]
+
+The compiler is smart enough to insert a call to memset() in
+riscv_vdso_get_cpus(), which generates a dynamic relocation.
+
+So prevent this by using the -fno-builtin option.
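+
+A rough, hypothetical illustration (the type and function names are
+invented; this is not the vDSO code): zero-initializing a local
+aggregate is an idiom the compiler may lower to a memset() call even
+though the source never names it, which is the substitution the
+-fno-builtin flag added below is meant to prevent:
+
+struct cpu_map { unsigned long bits[16]; };
+
+void fill_cpu_map(struct cpu_map *out)
+{
+        struct cpu_map tmp = { 0 };  /* may become memset(&tmp, 0, sizeof(tmp)) */
+
+        *out = tmp;
+}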
+
+Fixes: e2c0cdfba7f6 ("RISC-V: User-facing API")
+Cc: stable@vger.kernel.org
+Signed-off-by: Alexandre Ghiti <alexghiti@rivosinc.com>
+Reviewed-by: Guo Ren <guoren@kernel.org>
+Link: https://lore.kernel.org/r/20241016083625.136311-2-alexghiti@rivosinc.com
+Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/riscv/kernel/vdso/Makefile | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/arch/riscv/kernel/vdso/Makefile b/arch/riscv/kernel/vdso/Makefile
+index 06e6b27f3bcc9..c1b68f962bada 100644
+--- a/arch/riscv/kernel/vdso/Makefile
++++ b/arch/riscv/kernel/vdso/Makefile
+@@ -18,6 +18,7 @@ obj-vdso = $(patsubst %, %.o, $(vdso-syms)) note.o
+ ccflags-y := -fno-stack-protector
+ ccflags-y += -DDISABLE_BRANCH_PROFILING
++ccflags-y += -fno-builtin
+ ifneq ($(c-gettimeofday-y),)
+   CFLAGS_vgettimeofday.o += -fPIC -include $(c-gettimeofday-y)
+-- 
+2.43.0
+
index 6e0e48c5bba5583860cec77546630c1eccc6ebe8..a9b3fb705b12ba5311dae2b10523ac81168fc809 100644 (file)
@@ -77,3 +77,37 @@ staging-iio-frequency-ad9832-fix-division-by-zero-in-ad9832_calc_freqreg.patch
 iio-adc-ad7124-fix-division-by-zero-in-ad7124_set_channel_odr.patch
 iio-light-veml6030-fix-microlux-value-calculation.patch
 nilfs2-fix-potential-deadlock-with-newly-created-symlinks.patch
+block-fix-sanity-checks-in-blk_rq_map_user_bvec.patch
+cgroup-bpf-use-a-dedicated-workqueue-for-cgroup-bpf-.patch
+riscv-vdso-prevent-the-compiler-from-inserting-calls.patch
+alsa-hda-realtek-limit-internal-mic-boost-on-dell-pl.patch
+riscv-efi-set-nx-compat-flag-in-pe-coff-header.patch
+riscv-use-u-to-format-the-output-of-cpu.patch
+riscv-remove-unused-generating_asm_offsets.patch
+riscv-remove-duplicated-get_rm.patch
+cxl-acpi-move-rescan-to-the-workqueue.patch
+cxl-port-fix-cxl_bus_rescan-vs-bus_rescan_devices.patch
+mm-page_alloc-rename-alloc_high-to-alloc_min_reserve.patch
+mm-page_alloc-treat-rt-tasks-similar-to-__gfp_high.patch
+mm-page_alloc-explicitly-record-high-order-atomic-al.patch
+mm-page_alloc-explicitly-define-what-alloc-flags-dep.patch
+mm-page_alloc-explicitly-define-how-__gfp_high-non-b.patch
+mm-page_alloc-let-gfp_atomic-order-0-allocs-access-h.patch
+ocfs2-pass-u64-to-ocfs2_truncate_inline-maybe-overfl.patch
+mctp-i2c-handle-null-header-address.patch
+alsa-hda-realtek-fix-headset-mic-on-tuxedo-stellaris.patch
+nvmet-auth-assign-dh_key-to-null-after-kfree_sensiti.patch
+kasan-remove-vmalloc_percpu-test.patch
+io_uring-rename-kiocb_end_write-local-helper.patch
+fs-create-kiocb_-start-end-_write-helpers.patch
+io_uring-use-kiocb_-start-end-_write-helpers.patch
+io_uring-rw-fix-missing-nowait-check-for-o_direct-st.patch
+mm-migrate-try-again-if-thp-split-is-failed-due-to-p.patch
+migrate-convert-unmap_and_move-to-use-folios.patch
+migrate-convert-migrate_pages-to-use-folios.patch
+mm-migrate.c-stop-using-0-as-null-pointer.patch
+migrate_pages-organize-stats-with-struct-migrate_pag.patch
+migrate_pages-separate-hugetlb-folios-migration.patch
+migrate_pages-restrict-number-of-pages-to-migrate-in.patch
+migrate_pages-split-unmap_and_move-to-_unmap-and-_mo.patch
+vmscan-migrate-fix-page-count-imbalance-on-node-stat.patch
diff --git a/queue-6.1/vmscan-migrate-fix-page-count-imbalance-on-node-stat.patch b/queue-6.1/vmscan-migrate-fix-page-count-imbalance-on-node-stat.patch
new file mode 100644 (file)
index 0000000..aa73f5d
--- /dev/null
@@ -0,0 +1,75 @@
+From 1f94221149da9099afe301073304fc26a2ec2ac6 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 25 Oct 2024 10:17:24 -0400
+Subject: vmscan,migrate: fix page count imbalance on node stats when demoting
+ pages
+
+From: Gregory Price <gourry@gourry.net>
+
+[ Upstream commit 35e41024c4c2b02ef8207f61b9004f6956cf037b ]
+
+When numa balancing is enabled with demotion, vmscan will call
+migrate_pages when shrinking LRUs.  migrate_pages will decrement the
+node's isolated page count, leading to an imbalanced count when
+invoked from (MG)LRU code.
+
+The result is dmesg output like such:
+
+$ cat /proc/sys/vm/stat_refresh
+
+[77383.088417] vmstat_refresh: nr_isolated_anon -103212
+[77383.088417] vmstat_refresh: nr_isolated_file -899642
+
+This negative value may impact compaction and reclaim throttling.
+
+The following path produces the decrement:
+
+shrink_folio_list
+  demote_folio_list
+    migrate_pages
+      migrate_pages_batch
+        migrate_folio_move
+          migrate_folio_done
+            mod_node_page_state(-ve) <- decrement
+
+This path happens for SUCCESSFUL migrations, not failures.  Typically
+callers to migrate_pages are required to handle putback/accounting for
+failures, but this is already handled in the shrink code.
+
+When accounting for migrations, instead do not decrement the count when
+the migration reason is MR_DEMOTION.  As of v6.11, this demotion logic
+is the only source of MR_DEMOTION.
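+
+A hypothetical userspace model of the accounting above (only the counter
+logic is modelled; all names are simplified): the shrinker owns the
+isolated-page count, so a successful demotion must not decrement it a
+second time inside the migration code.
+
+#include <stdio.h>
+
+enum migrate_reason { MR_MEMORY_HOTPLUG, MR_DEMOTION };
+
+static long nr_isolated;
+
+static void migrate_folio_done_model(enum migrate_reason reason)
+{
+        if (reason != MR_DEMOTION)  /* the fix: skip the decrement for demotion */
+                nr_isolated--;      /* other migrate_pages callers expect this  */
+}
+
+int main(void)
+{
+        nr_isolated++;                          /* shrink_folio_list() isolates */
+        migrate_folio_done_model(MR_DEMOTION);  /* successful demotion          */
+        nr_isolated--;                          /* shrinker's own accounting    */
+        printf("nr_isolated = %ld\n", nr_isolated); /* 0 with the fix, -1 without */
+        return 0;
+}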
+
+Link: https://lkml.kernel.org/r/20241025141724.17927-1-gourry@gourry.net
+Fixes: 26aa2d199d6f ("mm/migrate: demote pages during reclaim")
+Signed-off-by: Gregory Price <gourry@gourry.net>
+Reviewed-by: Yang Shi <shy828301@gmail.com>
+Reviewed-by: Davidlohr Bueso <dave@stgolabs.net>
+Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
+Reviewed-by: "Huang, Ying" <ying.huang@intel.com>
+Reviewed-by: Oscar Salvador <osalvador@suse.de>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: Wei Xu <weixugc@google.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/migrate.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/mm/migrate.c b/mm/migrate.c
+index 46a1476e188c3..9ff5d77b61a3e 100644
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -1044,7 +1044,7 @@ static void migrate_folio_done(struct folio *src,
+        * not accounted to NR_ISOLATED_*. They can be recognized
+        * as __PageMovable
+        */
+-      if (likely(!__folio_test_movable(src)))
++      if (likely(!__folio_test_movable(src)) && reason != MR_DEMOTION)
+               mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON +
+                                   folio_is_file_lru(src), -folio_nr_pages(src));
+-- 
+2.43.0
+