From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Sat, 2 Apr 2022 11:21:05 +0000 (+0200)
Subject: 5.17-stable patches
X-Git-Tag: v5.17.2~193
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=968e0a5077970199a7e5545170c4535b1c70778a;p=thirdparty%2Fkernel%2Fstable-queue.git

5.17-stable patches

added patches:
	dm-fix-double-accounting-of-flush-with-data.patch
	dm-fix-use-after-free-in-dm_cleanup_zoned_dev.patch
	dm-integrity-set-journal-entry-unused-when-shrinking-device.patch
	dm-interlock-pending-dm_io-and-dm_wait_for_bios_completion.patch
	dm-stats-fix-too-short-end-duration_ns-when-using-precise_timestamps.patch
	drbd-fix-potential-silent-data-corruption.patch
	mm-hwpoison-unmap-poisoned-page-before-invalidation.patch
	mm-kmemleak-reset-tag-when-compare-object-pointer.patch
	mm-madvise-return-correct-bytes-advised-with-process_madvise.patch
	mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch
	mm-only-re-generate-demotion-targets-when-a-numa-node-changes-its-n_cpu-state.patch
	revert-mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch
	tracing-have-trace-event-string-test-handle-zero-length-strings.patch
---

diff --git a/queue-5.17/dm-fix-double-accounting-of-flush-with-data.patch b/queue-5.17/dm-fix-double-accounting-of-flush-with-data.patch
new file mode 100644
index 00000000000..14a8feb9368
--- /dev/null
+++ b/queue-5.17/dm-fix-double-accounting-of-flush-with-data.patch
@@ -0,0 +1,140 @@
+From 8d394bc4adf588ca4a0650745167cb83f86c18c9 Mon Sep 17 00:00:00 2001
+From: Mike Snitzer <snitzer@redhat.com>
+Date: Thu, 17 Feb 2022 23:39:57 -0500
+Subject: dm: fix double accounting of flush with data
+
+From: Mike Snitzer <snitzer@redhat.com>
+
+commit 8d394bc4adf588ca4a0650745167cb83f86c18c9 upstream.
+
+DM handles a flush with data by first issuing an empty flush and then
+once it completes the REQ_PREFLUSH flag is removed and the payload is
+issued.  The problem fixed by this commit is that both the empty flush
+bio and the data payload will account the full extent of the data
+payload.
+
+Fix this by factoring out dm_io_acct() and having it wrap all IO
+accounting to set the size of a bio with REQ_PREFLUSH to 0, account the
+IO, and then restore the original size.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Mike Snitzer <snitzer@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/md/dm-stats.c |    6 ++++--
+ drivers/md/dm-stats.h |    2 +-
+ drivers/md/dm.c       |   47 +++++++++++++++++++++++++++++++++--------------
+ 3 files changed, 38 insertions(+), 17 deletions(-)
+
+--- a/drivers/md/dm-stats.c
++++ b/drivers/md/dm-stats.c
+@@ -644,13 +644,14 @@ static void __dm_stat_bio(struct dm_stat
+ 
+ void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
+ 			 sector_t bi_sector, unsigned bi_sectors, bool end,
+-			 unsigned long duration_jiffies,
++			 unsigned long start_time,
+ 			 struct dm_stats_aux *stats_aux)
+ {
+ 	struct dm_stat *s;
+ 	sector_t end_sector;
+ 	struct dm_stats_last_position *last;
+ 	bool got_precise_time;
++	unsigned long duration_jiffies = 0;
+ 
+ 	if (unlikely(!bi_sectors))
+ 		return;
+@@ -670,7 +671,8 @@ void dm_stats_account_io(struct dm_stats
+ 				       ));
+ 		WRITE_ONCE(last->last_sector, end_sector);
+ 		WRITE_ONCE(last->last_rw, bi_rw);
+-	}
++	} else
++		duration_jiffies = jiffies - start_time;
+ 
+ 	rcu_read_lock();
+ 
+--- a/drivers/md/dm-stats.h
++++ b/drivers/md/dm-stats.h
+@@ -31,7 +31,7 @@ int dm_stats_message(struct mapped_devic
+ 
+ void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
+ 			 sector_t bi_sector, unsigned bi_sectors, bool end,
+-			 unsigned long duration_jiffies,
++			 unsigned long start_time,
+ 			 struct dm_stats_aux *aux);
+ 
+ static inline bool dm_stats_used(struct dm_stats *st)
+--- a/drivers/md/dm.c
++++ b/drivers/md/dm.c
+@@ -484,29 +484,48 @@ u64 dm_start_time_ns_from_clone(struct b
+ }
+ EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
+ 
+-static void start_io_acct(struct dm_io *io)
++static bool bio_is_flush_with_data(struct bio *bio)
+ {
+-	struct mapped_device *md = io->md;
+-	struct bio *bio = io->orig_bio;
++	return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size);
++}
++
++static void dm_io_acct(bool end, struct mapped_device *md, struct bio *bio,
++		       unsigned long start_time, struct dm_stats_aux *stats_aux)
++{
++	bool is_flush_with_data;
++	unsigned int bi_size;
++
++	/* If REQ_PREFLUSH set save any payload but do not account it */
++	is_flush_with_data = bio_is_flush_with_data(bio);
++	if (is_flush_with_data) {
++		bi_size = bio->bi_iter.bi_size;
++		bio->bi_iter.bi_size = 0;
++	}
++
++	if (!end)
++		bio_start_io_acct_time(bio, start_time);
++	else
++		bio_end_io_acct(bio, start_time);
+ 
+-	bio_start_io_acct_time(bio, io->start_time);
+ 	if (unlikely(dm_stats_used(&md->stats)))
+ 		dm_stats_account_io(&md->stats, bio_data_dir(bio),
+ 				    bio->bi_iter.bi_sector, bio_sectors(bio),
+-				    false, 0, &io->stats_aux);
++				    end, start_time, stats_aux);
++
++	/* Restore bio's payload so it does get accounted upon requeue */
++	if (is_flush_with_data)
++		bio->bi_iter.bi_size = bi_size;
++}
++
++static void start_io_acct(struct dm_io *io)
++{
++	dm_io_acct(false, io->md, io->orig_bio, io->start_time, &io->stats_aux);
+ }
+ 
+ static void end_io_acct(struct mapped_device *md, struct bio *bio,
+ 			unsigned long start_time, struct dm_stats_aux *stats_aux)
+ {
+-	unsigned long duration = jiffies - start_time;
+-
+-	bio_end_io_acct(bio, start_time);
+-
+-	if (unlikely(dm_stats_used(&md->stats)))
+-		dm_stats_account_io(&md->stats, bio_data_dir(bio),
+-				    bio->bi_iter.bi_sector, bio_sectors(bio),
+-				    true, duration, stats_aux);
++	dm_io_acct(true, md, bio, start_time, stats_aux);
+ }
+ 
+ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
+@@ -835,7 +854,7 @@ void dm_io_dec_pending(struct dm_io *io,
+ 		if (io_error == BLK_STS_DM_REQUEUE)
+ 			return;
+ 
+-		if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
++		if (bio_is_flush_with_data(bio)) {
+ 			/*
+ 			 * Preflush done for flush with data, reissue
+ 			 * without REQ_PREFLUSH.
diff --git a/queue-5.17/dm-fix-use-after-free-in-dm_cleanup_zoned_dev.patch b/queue-5.17/dm-fix-use-after-free-in-dm_cleanup_zoned_dev.patch
new file mode 100644
index 00000000000..c5171c0a6fd
--- /dev/null
+++ b/queue-5.17/dm-fix-use-after-free-in-dm_cleanup_zoned_dev.patch
@@ -0,0 +1,76 @@
+From 588b7f5df0cb64f281290c7672470c006abe7160 Mon Sep 17 00:00:00 2001
+From: Kirill Tkhai <ktkhai@virtuozzo.com>
+Date: Tue, 1 Feb 2022 11:39:52 +0300
+Subject: dm: fix use-after-free in dm_cleanup_zoned_dev()
+
+From: Kirill Tkhai <ktkhai@virtuozzo.com>
+
+commit 588b7f5df0cb64f281290c7672470c006abe7160 upstream.
+
+dm_cleanup_zoned_dev() uses queue, so it must be called
+before blk_cleanup_disk() starts its killing:
+
+blk_cleanup_disk->blk_cleanup_queue()->kobject_put()->blk_release_queue()->
+->...RCU...->blk_free_queue_rcu()->kmem_cache_free()
+
+Otherwise, RCU callback may be executed first and
+dm_cleanup_zoned_dev() will touch free'd memory:
+
+ BUG: KASAN: use-after-free in dm_cleanup_zoned_dev+0x33/0xd0
+ Read of size 8 at addr ffff88805ac6e430 by task dmsetup/681
+
+ CPU: 4 PID: 681 Comm: dmsetup Not tainted 5.17.0-rc2+ #6
+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
+ Call Trace:
+  <TASK>
+  dump_stack_lvl+0x57/0x7d
+  print_address_description.constprop.0+0x1f/0x150
+  ? dm_cleanup_zoned_dev+0x33/0xd0
+  kasan_report.cold+0x7f/0x11b
+  ? dm_cleanup_zoned_dev+0x33/0xd0
+  dm_cleanup_zoned_dev+0x33/0xd0
+  __dm_destroy+0x26a/0x400
+  ? dm_blk_ioctl+0x230/0x230
+  ? up_write+0xd8/0x270
+  dev_remove+0x156/0x1d0
+  ctl_ioctl+0x269/0x530
+  ? table_clear+0x140/0x140
+  ? lock_release+0xb2/0x750
+  ? remove_all+0x40/0x40
+  ? rcu_read_lock_sched_held+0x12/0x70
+  ? lock_downgrade+0x3c0/0x3c0
+  ? rcu_read_lock_sched_held+0x12/0x70
+  dm_ctl_ioctl+0xa/0x10
+  __x64_sys_ioctl+0xb9/0xf0
+  do_syscall_64+0x3b/0x90
+  entry_SYSCALL_64_after_hwframe+0x44/0xae
+ RIP: 0033:0x7fb6dfa95c27
+
+Fixes: bb37d77239af ("dm: introduce zone append emulation")
+Cc: stable@vger.kernel.org
+Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
+Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
+Signed-off-by: Mike Snitzer <snitzer@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/md/dm.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/md/dm.c
++++ b/drivers/md/dm.c
+@@ -1609,6 +1609,7 @@ static void cleanup_mapped_device(struct
+ 		md->dax_dev = NULL;
+ 	}
+ 
++	dm_cleanup_zoned_dev(md);
+ 	if (md->disk) {
+ 		spin_lock(&_minor_lock);
+ 		md->disk->private_data = NULL;
+@@ -1629,7 +1630,6 @@ static void cleanup_mapped_device(struct
+ 	mutex_destroy(&md->swap_bios_lock);
+ 
+ 	dm_mq_cleanup_mapped_device(md);
+-	dm_cleanup_zoned_dev(md);
+ }
+ 
+ /*
diff --git a/queue-5.17/dm-integrity-set-journal-entry-unused-when-shrinking-device.patch b/queue-5.17/dm-integrity-set-journal-entry-unused-when-shrinking-device.patch
new file mode 100644
index 00000000000..0cea50a4fac
--- /dev/null
+++ b/queue-5.17/dm-integrity-set-journal-entry-unused-when-shrinking-device.patch
@@ -0,0 +1,44 @@
+From cc09e8a9dec4f0e8299e80a7a2a8e6f54164a10b Mon Sep 17 00:00:00 2001
+From: Mikulas Patocka <mpatocka@redhat.com>
+Date: Sat, 26 Mar 2022 10:24:56 -0400
+Subject: dm integrity: set journal entry unused when shrinking device
+
+From: Mikulas Patocka <mpatocka@redhat.com>
+
+commit cc09e8a9dec4f0e8299e80a7a2a8e6f54164a10b upstream.
+
+Commit f6f72f32c22c ("dm integrity: don't replay journal data past the
+end of the device") skips journal replay if the target sector points
+beyond the end of the device. Unfortunately, it doesn't set the
+journal entry unused, which resulted in this BUG being triggered:
+BUG_ON(!journal_entry_is_unused(je))
+
+Fix this by calling journal_entry_set_unused() for this case.
+
+Fixes: f6f72f32c22c ("dm integrity: don't replay journal data past the end of the device")
+Cc: stable@vger.kernel.org # v5.7+
+Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
+Tested-by: Milan Broz <gmazyland@gmail.com>
+[snitzer: revised header]
+Signed-off-by: Mike Snitzer <snitzer@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/md/dm-integrity.c |    6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/drivers/md/dm-integrity.c
++++ b/drivers/md/dm-integrity.c
+@@ -2473,9 +2473,11 @@ static void do_journal_write(struct dm_i
+ 					dm_integrity_io_error(ic, "invalid sector in journal", -EIO);
+ 					sec &= ~(sector_t)(ic->sectors_per_block - 1);
+ 				}
++				if (unlikely(sec >= ic->provided_data_sectors)) {
++					journal_entry_set_unused(je);
++					continue;
++				}
+ 			}
+-			if (unlikely(sec >= ic->provided_data_sectors))
+-				continue;
+ 			get_area_and_offset(ic, sec, &area, &offset);
+ 			restore_last_bytes(ic, access_journal_data(ic, i, j), je);
+ 			for (k = j + 1; k < ic->journal_section_entries; k++) {
diff --git a/queue-5.17/dm-interlock-pending-dm_io-and-dm_wait_for_bios_completion.patch b/queue-5.17/dm-interlock-pending-dm_io-and-dm_wait_for_bios_completion.patch
new file mode 100644
index 00000000000..4a36f0f679d
--- /dev/null
+++ b/queue-5.17/dm-interlock-pending-dm_io-and-dm_wait_for_bios_completion.patch
@@ -0,0 +1,139 @@
+From 9f6dc633761006f974701d4c88da71ab68670749 Mon Sep 17 00:00:00 2001
+From: Mike Snitzer <snitzer@redhat.com>
+Date: Thu, 17 Feb 2022 23:40:02 -0500
+Subject: dm: interlock pending dm_io and dm_wait_for_bios_completion
+
+From: Mike Snitzer <snitzer@redhat.com>
+
+commit 9f6dc633761006f974701d4c88da71ab68670749 upstream.
+
+Commit d208b89401e0 ("dm: fix mempool NULL pointer race when
+completing IO") didn't go far enough.
+
+When bio_end_io_acct ends the count of in-flight I/Os may reach zero
+and the DM device may be suspended. There is a possibility that the
+suspend races with dm_stats_account_io.
+
+Fix this by adding percpu "pending_io" counters to track outstanding
+dm_io. Move kicking of suspend queue to dm_io_dec_pending(). Also,
+rename md_in_flight_bios() to dm_in_flight_bios() and update it to
+iterate all pending_io counters.
+
+Fixes: d208b89401e0 ("dm: fix mempool NULL pointer race when completing IO")
+Cc: stable@vger.kernel.org
+Co-developed-by: Mikulas Patocka <mpatocka@redhat.com>
+Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
+Signed-off-by: Mike Snitzer <snitzer@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/md/dm-core.h |    2 ++
+ drivers/md/dm.c      |   35 +++++++++++++++++++++++------------
+ 2 files changed, 25 insertions(+), 12 deletions(-)
+
+--- a/drivers/md/dm-core.h
++++ b/drivers/md/dm-core.h
+@@ -65,6 +65,8 @@ struct mapped_device {
+ 	struct gendisk *disk;
+ 	struct dax_device *dax_dev;
+ 
++	unsigned long __percpu *pending_io;
++
+ 	/*
+ 	 * A list of ios that arrived while we were suspended.
+ 	 */
+--- a/drivers/md/dm.c
++++ b/drivers/md/dm.c
+@@ -507,10 +507,6 @@ static void end_io_acct(struct mapped_de
+ 		dm_stats_account_io(&md->stats, bio_data_dir(bio),
+ 				    bio->bi_iter.bi_sector, bio_sectors(bio),
+ 				    true, duration, stats_aux);
+-
+-	/* nudge anyone waiting on suspend queue */
+-	if (unlikely(wq_has_sleeper(&md->wait)))
+-		wake_up(&md->wait);
+ }
+ 
+ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
+@@ -531,6 +527,7 @@ static struct dm_io *alloc_io(struct map
+ 	io->magic = DM_IO_MAGIC;
+ 	io->status = 0;
+ 	atomic_set(&io->io_count, 1);
++	this_cpu_inc(*md->pending_io);
+ 	io->orig_bio = bio;
+ 	io->md = md;
+ 	spin_lock_init(&io->endio_lock);
+@@ -828,6 +825,12 @@ void dm_io_dec_pending(struct dm_io *io,
+ 		stats_aux = io->stats_aux;
+ 		free_io(md, io);
+ 		end_io_acct(md, bio, start_time, &stats_aux);
++		smp_wmb();
++		this_cpu_dec(*md->pending_io);
++
++		/* nudge anyone waiting on suspend queue */
++		if (unlikely(wq_has_sleeper(&md->wait)))
++			wake_up(&md->wait);
+ 
+ 		if (io_error == BLK_STS_DM_REQUEUE)
+ 			return;
+@@ -1622,6 +1625,11 @@ static void cleanup_mapped_device(struct
+ 		blk_cleanup_disk(md->disk);
+ 	}
+ 
++	if (md->pending_io) {
++		free_percpu(md->pending_io);
++		md->pending_io = NULL;
++	}
++
+ 	cleanup_srcu_struct(&md->io_barrier);
+ 
+ 	mutex_destroy(&md->suspend_lock);
+@@ -1723,6 +1731,10 @@ static struct mapped_device *alloc_dev(i
+ 	if (!md->wq)
+ 		goto bad;
+ 
++	md->pending_io = alloc_percpu(unsigned long);
++	if (!md->pending_io)
++		goto bad;
++
+ 	dm_stats_init(&md->stats);
+ 
+ 	/* Populate the mapping, nobody knows we exist yet */
+@@ -2130,16 +2142,13 @@ void dm_put(struct mapped_device *md)
+ }
+ EXPORT_SYMBOL_GPL(dm_put);
+ 
+-static bool md_in_flight_bios(struct mapped_device *md)
++static bool dm_in_flight_bios(struct mapped_device *md)
+ {
+ 	int cpu;
+-	struct block_device *part = dm_disk(md)->part0;
+-	long sum = 0;
++	unsigned long sum = 0;
+ 
+-	for_each_possible_cpu(cpu) {
+-		sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
+-		sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
+-	}
++	for_each_possible_cpu(cpu)
++		sum += *per_cpu_ptr(md->pending_io, cpu);
+ 
+ 	return sum != 0;
+ }
+@@ -2152,7 +2161,7 @@ static int dm_wait_for_bios_completion(s
+ 	while (true) {
+ 		prepare_to_wait(&md->wait, &wait, task_state);
+ 
+-		if (!md_in_flight_bios(md))
++		if (!dm_in_flight_bios(md))
+ 			break;
+ 
+ 		if (signal_pending_state(task_state, current)) {
+@@ -2164,6 +2173,8 @@ static int dm_wait_for_bios_completion(s
+ 	}
+ 	finish_wait(&md->wait, &wait);
+ 
++	smp_rmb();
++
+ 	return r;
+ }
+ 
diff --git a/queue-5.17/dm-stats-fix-too-short-end-duration_ns-when-using-precise_timestamps.patch b/queue-5.17/dm-stats-fix-too-short-end-duration_ns-when-using-precise_timestamps.patch
new file mode 100644
index 00000000000..963a4eec6ba
--- /dev/null
+++ b/queue-5.17/dm-stats-fix-too-short-end-duration_ns-when-using-precise_timestamps.patch
@@ -0,0 +1,134 @@
+From 0cdb90f0f306384ecbc60dfd6dc48cdbc1f2d0d8 Mon Sep 17 00:00:00 2001
+From: Mike Snitzer <snitzer@redhat.com>
+Date: Thu, 17 Feb 2022 23:39:59 -0500
+Subject: dm stats: fix too short end duration_ns when using precise_timestamps
+
+From: Mike Snitzer <snitzer@redhat.com>
+
+commit 0cdb90f0f306384ecbc60dfd6dc48cdbc1f2d0d8 upstream.
+
+dm_stats_account_io()'s STAT_PRECISE_TIMESTAMPS support doesn't handle
+the fact that with commit b879f915bc48 ("dm: properly fix redundant
+bio-based IO accounting") io->start_time _may_ be in the past (meaning
+the start_io_acct() was deferred until later).
+
+Add a new dm_stats_recalc_precise_timestamps() helper that will
+set/clear a new 'precise_timestamps' flag in the dm_stats struct based
+on whether any configured stats enable STAT_PRECISE_TIMESTAMPS.
+And update DM core's alloc_io() to use dm_stats_record_start() to set
+stats_aux.duration_ns if stats->precise_timestamps is true.
+
+Also, remove unused 'last_sector' and 'last_rw' members from the
+dm_stats struct.
+
+Fixes: b879f915bc48 ("dm: properly fix redundant bio-based IO accounting")
+Cc: stable@vger.kernel.org
+Co-developed-by: Mikulas Patocka <mpatocka@redhat.com>
+Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
+Signed-off-by: Mike Snitzer <snitzer@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/md/dm-stats.c |   28 +++++++++++++++++++++++++---
+ drivers/md/dm-stats.h |    9 +++++++--
+ drivers/md/dm.c       |    2 ++
+ 3 files changed, 34 insertions(+), 5 deletions(-)
+
+--- a/drivers/md/dm-stats.c
++++ b/drivers/md/dm-stats.c
+@@ -195,6 +195,7 @@ void dm_stats_init(struct dm_stats *stat
+ 
+ 	mutex_init(&stats->mutex);
+ 	INIT_LIST_HEAD(&stats->list);
++	stats->precise_timestamps = false;
+ 	stats->last = alloc_percpu(struct dm_stats_last_position);
+ 	for_each_possible_cpu(cpu) {
+ 		last = per_cpu_ptr(stats->last, cpu);
+@@ -231,6 +232,22 @@ void dm_stats_cleanup(struct dm_stats *s
+ 	mutex_destroy(&stats->mutex);
+ }
+ 
++static void dm_stats_recalc_precise_timestamps(struct dm_stats *stats)
++{
++	struct list_head *l;
++	struct dm_stat *tmp_s;
++	bool precise_timestamps = false;
++
++	list_for_each(l, &stats->list) {
++		tmp_s = container_of(l, struct dm_stat, list_entry);
++		if (tmp_s->stat_flags & STAT_PRECISE_TIMESTAMPS) {
++			precise_timestamps = true;
++			break;
++		}
++	}
++	stats->precise_timestamps = precise_timestamps;
++}
++
+ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
+ 			   sector_t step, unsigned stat_flags,
+ 			   unsigned n_histogram_entries,
+@@ -376,6 +393,9 @@ static int dm_stats_create(struct dm_sta
+ 	}
+ 	ret_id = s->id;
+ 	list_add_tail_rcu(&s->list_entry, l);
++
++	dm_stats_recalc_precise_timestamps(stats);
++
+ 	mutex_unlock(&stats->mutex);
+ 
+ 	resume_callback(md);
+@@ -418,6 +438,9 @@ static int dm_stats_delete(struct dm_sta
+ 	}
+ 
+ 	list_del_rcu(&s->list_entry);
++
++	dm_stats_recalc_precise_timestamps(stats);
++
+ 	mutex_unlock(&stats->mutex);
+ 
+ 	/*
+@@ -654,9 +677,8 @@ void dm_stats_account_io(struct dm_stats
+ 	got_precise_time = false;
+ 	list_for_each_entry_rcu(s, &stats->list, list_entry) {
+ 		if (s->stat_flags & STAT_PRECISE_TIMESTAMPS && !got_precise_time) {
+-			if (!end)
+-				stats_aux->duration_ns = ktime_to_ns(ktime_get());
+-			else
++			/* start (!end) duration_ns is set by DM core's alloc_io() */
++			if (end)
+ 				stats_aux->duration_ns = ktime_to_ns(ktime_get()) - stats_aux->duration_ns;
+ 			got_precise_time = true;
+ 		}
+--- a/drivers/md/dm-stats.h
++++ b/drivers/md/dm-stats.h
+@@ -13,8 +13,7 @@ struct dm_stats {
+ 	struct mutex mutex;
+ 	struct list_head list;	/* list of struct dm_stat */
+ 	struct dm_stats_last_position __percpu *last;
+-	sector_t last_sector;
+-	unsigned last_rw;
++	bool precise_timestamps;
+ };
+ 
+ struct dm_stats_aux {
+@@ -40,4 +39,10 @@ static inline bool dm_stats_used(struct
+ 	return !list_empty(&st->list);
+ }
+ 
++static inline void dm_stats_record_start(struct dm_stats *stats, struct dm_stats_aux *aux)
++{
++	if (unlikely(stats->precise_timestamps))
++		aux->duration_ns = ktime_to_ns(ktime_get());
++}
++
+ #endif
+--- a/drivers/md/dm.c
++++ b/drivers/md/dm.c
+@@ -537,6 +537,8 @@ static struct dm_io *alloc_io(struct map
+ 
+ 	io->start_time = jiffies;
+ 
++	dm_stats_record_start(&md->stats, &io->stats_aux);
++
+ 	return io;
+ }
+ 
diff --git a/queue-5.17/drbd-fix-potential-silent-data-corruption.patch b/queue-5.17/drbd-fix-potential-silent-data-corruption.patch
new file mode 100644
index 00000000000..99cb0998d4e
--- /dev/null
+++ b/queue-5.17/drbd-fix-potential-silent-data-corruption.patch
@@ -0,0 +1,67 @@
+From f4329d1f848ac35757d9cc5487669d19dfc5979c Mon Sep 17 00:00:00 2001
+From: Lars Ellenberg <lars.ellenberg@linbit.com>
+Date: Wed, 30 Mar 2022 20:55:51 +0200
+Subject: drbd: fix potential silent data corruption
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Lars Ellenberg <lars.ellenberg@linbit.com>
+
+commit f4329d1f848ac35757d9cc5487669d19dfc5979c upstream.
+
+Scenario:
+---------
+
+bio chain generated by blk_queue_split().
+Some split bio fails and propagates its error status to the "parent" bio.
+But then the (last part of the) parent bio itself completes without error.
+
+We would clobber the already recorded error status with BLK_STS_OK,
+causing silent data corruption.
+
+Reproducer:
+-----------
+
+How to trigger this in the real world within seconds:
+
+DRBD on top of degraded parity raid,
+small stripe_cache_size, large read_ahead setting.
+Drop page cache (sysctl vm.drop_caches=1, fadvise "DONTNEED",
+umount and mount again, "reboot").
+
+Cause significant read ahead.
+
+Large read ahead request is split by blk_queue_split().
+Parts of the read ahead that are already in the stripe cache,
+or find an available stripe cache to use, can be serviced.
+Parts of the read ahead that would need "too much work",
+would need to wait for a "stripe_head" to become available,
+are rejected immediately.
+
+For larger read ahead requests that are split in many pieces, it is very
+likely that some "splits" will be serviced, but then the stripe cache is
+exhausted/busy, and the remaining ones will be rejected.
+
+Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
+Signed-off-by: Christoph Böhmwalder <christoph.boehmwalder@linbit.com>
+Cc: <stable@vger.kernel.org> # 4.13.x
+Link: https://lore.kernel.org/r/20220330185551.3553196-1-christoph.boehmwalder@linbit.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/block/drbd/drbd_req.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/drivers/block/drbd/drbd_req.c
++++ b/drivers/block/drbd/drbd_req.c
+@@ -180,7 +180,8 @@ void start_new_tl_epoch(struct drbd_conn
+ void complete_master_bio(struct drbd_device *device,
+ 		struct bio_and_error *m)
+ {
+-	m->bio->bi_status = errno_to_blk_status(m->error);
++	if (unlikely(m->error))
++		m->bio->bi_status = errno_to_blk_status(m->error);
+ 	bio_endio(m->bio);
+ 	dec_ap_bio(device);
+ }
diff --git a/queue-5.17/mm-hwpoison-unmap-poisoned-page-before-invalidation.patch b/queue-5.17/mm-hwpoison-unmap-poisoned-page-before-invalidation.patch
new file mode 100644
index 00000000000..2cf450d4040
--- /dev/null
+++ b/queue-5.17/mm-hwpoison-unmap-poisoned-page-before-invalidation.patch
@@ -0,0 +1,67 @@
+From 3149c79f3cb0e2e3bafb7cfadacec090cbd250d3 Mon Sep 17 00:00:00 2001
+From: Rik van Riel <riel@surriel.com>
+Date: Fri, 1 Apr 2022 11:28:42 -0700
+Subject: mm,hwpoison: unmap poisoned page before invalidation
+
+From: Rik van Riel <riel@surriel.com>
+
+commit 3149c79f3cb0e2e3bafb7cfadacec090cbd250d3 upstream.
+
+In some cases it appears the invalidation of a hwpoisoned page fails
+because the page is still mapped in another process.  This can cause a
+program to be continuously restarted and die when it page faults on the
+page that was not invalidated.  Avoid that problem by unmapping the
+hwpoisoned page when we find it.
+
+Another issue is that sometimes we end up oopsing in finish_fault, if
+the code tries to do something with the now-NULL vmf->page.  I did not
+hit this error when submitting the previous patch because there are
+several opportunities for alloc_set_pte to bail out before accessing
+vmf->page, and that apparently happened on those systems, and most of
+the time on other systems, too.
+
+However, across several million systems that error does occur a handful
+of times a day.  It can be avoided by returning VM_FAULT_NOPAGE which
+will cause do_read_fault to return before calling finish_fault.
+
+Link: https://lkml.kernel.org/r/20220325161428.5068d97e@imladris.surriel.com
+Fixes: e53ac7374e64 ("mm: invalidate hwpoison page cache page in fault path")
+Signed-off-by: Rik van Riel <riel@surriel.com>
+Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
+Tested-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
+Reviewed-by: Oscar Salvador <osalvador@suse.de>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/memory.c |   12 ++++++++----
+ 1 file changed, 8 insertions(+), 4 deletions(-)
+
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -3893,14 +3893,18 @@ static vm_fault_t __do_fault(struct vm_f
+ 		return ret;
+ 
+ 	if (unlikely(PageHWPoison(vmf->page))) {
++		struct page *page = vmf->page;
+ 		vm_fault_t poisonret = VM_FAULT_HWPOISON;
+ 		if (ret & VM_FAULT_LOCKED) {
++			if (page_mapped(page))
++				unmap_mapping_pages(page_mapping(page),
++						    page->index, 1, false);
+ 			/* Retry if a clean page was removed from the cache. */
+-			if (invalidate_inode_page(vmf->page))
+-				poisonret = 0;
+-			unlock_page(vmf->page);
++			if (invalidate_inode_page(page))
++				poisonret = VM_FAULT_NOPAGE;
++			unlock_page(page);
+ 		}
+-		put_page(vmf->page);
++		put_page(page);
+ 		vmf->page = NULL;
+ 		return poisonret;
+ 	}
diff --git a/queue-5.17/mm-kmemleak-reset-tag-when-compare-object-pointer.patch b/queue-5.17/mm-kmemleak-reset-tag-when-compare-object-pointer.patch
new file mode 100644
index 00000000000..ea7021fb1ea
--- /dev/null
+++ b/queue-5.17/mm-kmemleak-reset-tag-when-compare-object-pointer.patch
@@ -0,0 +1,99 @@
+From bfc8089f00fa526dea983844c880fa8106c33ac4 Mon Sep 17 00:00:00 2001
+From: Kuan-Ying Lee <Kuan-Ying.Lee@mediatek.com>
+Date: Fri, 1 Apr 2022 11:28:54 -0700
+Subject: mm/kmemleak: reset tag when compare object pointer
+
+From: Kuan-Ying Lee <Kuan-Ying.Lee@mediatek.com>
+
+commit bfc8089f00fa526dea983844c880fa8106c33ac4 upstream.
+
+When we use HW-tag based kasan and enable vmalloc support, we hit the
+following bug.  It is due to comparison between tagged object and
+non-tagged pointer.
+
+We need to reset the kasan tag when we need to compare tagged object and
+non-tagged pointer.
+
+  kmemleak: [name:kmemleak&]Scan area larger than object 0xffffffe77076f440
+  CPU: 4 PID: 1 Comm: init Tainted: G S      W         5.15.25-android13-0-g5cacf919c2bc #1
+  Hardware name: MT6983(ENG) (DT)
+  Call trace:
+   add_scan_area+0xc4/0x244
+   kmemleak_scan_area+0x40/0x9c
+   layout_and_allocate+0x1e8/0x288
+   load_module+0x2c8/0xf00
+   __se_sys_finit_module+0x190/0x1d0
+   __arm64_sys_finit_module+0x20/0x30
+   invoke_syscall+0x60/0x170
+   el0_svc_common+0xc8/0x114
+   do_el0_svc+0x28/0xa0
+   el0_svc+0x60/0xf8
+   el0t_64_sync_handler+0x88/0xec
+   el0t_64_sync+0x1b4/0x1b8
+  kmemleak: [name:kmemleak&]Object 0xf5ffffe77076b000 (size 32768):
+  kmemleak: [name:kmemleak&]  comm "init", pid 1, jiffies 4294894197
+  kmemleak: [name:kmemleak&]  min_count = 0
+  kmemleak: [name:kmemleak&]  count = 0
+  kmemleak: [name:kmemleak&]  flags = 0x1
+  kmemleak: [name:kmemleak&]  checksum = 0
+  kmemleak: [name:kmemleak&]  backtrace:
+       module_alloc+0x9c/0x120
+       move_module+0x34/0x19c
+       layout_and_allocate+0x1c4/0x288
+       load_module+0x2c8/0xf00
+       __se_sys_finit_module+0x190/0x1d0
+       __arm64_sys_finit_module+0x20/0x30
+       invoke_syscall+0x60/0x170
+       el0_svc_common+0xc8/0x114
+       do_el0_svc+0x28/0xa0
+       el0_svc+0x60/0xf8
+       el0t_64_sync_handler+0x88/0xec
+       el0t_64_sync+0x1b4/0x1b8
+
+Link: https://lkml.kernel.org/r/20220318034051.30687-1-Kuan-Ying.Lee@mediatek.com
+Signed-off-by: Kuan-Ying Lee <Kuan-Ying.Lee@mediatek.com>
+Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Matthias Brugger <matthias.bgg@gmail.com>
+Cc: Chinwen Chang <chinwen.chang@mediatek.com>
+Cc: Nicholas Tang <nicholas.tang@mediatek.com>
+Cc: Yee Lee <yee.lee@mediatek.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/kmemleak.c |    9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+--- a/mm/kmemleak.c
++++ b/mm/kmemleak.c
+@@ -796,6 +796,8 @@ static void add_scan_area(unsigned long
+ 	unsigned long flags;
+ 	struct kmemleak_object *object;
+ 	struct kmemleak_scan_area *area = NULL;
++	unsigned long untagged_ptr;
++	unsigned long untagged_objp;
+ 
+ 	object = find_and_get_object(ptr, 1);
+ 	if (!object) {
+@@ -804,6 +806,9 @@ static void add_scan_area(unsigned long
+ 		return;
+ 	}
+ 
++	untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
++	untagged_objp = (unsigned long)kasan_reset_tag((void *)object->pointer);
++
+ 	if (scan_area_cache)
+ 		area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
+ 
+@@ -815,8 +820,8 @@ static void add_scan_area(unsigned long
+ 		goto out_unlock;
+ 	}
+ 	if (size == SIZE_MAX) {
+-		size = object->pointer + object->size - ptr;
+-	} else if (ptr + size > object->pointer + object->size) {
++		size = untagged_objp + object->size - untagged_ptr;
++	} else if (untagged_ptr + size > untagged_objp + object->size) {
+ 		kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
+ 		dump_object_info(object);
+ 		kmem_cache_free(scan_area_cache, area);
diff --git a/queue-5.17/mm-madvise-return-correct-bytes-advised-with-process_madvise.patch b/queue-5.17/mm-madvise-return-correct-bytes-advised-with-process_madvise.patch
new file mode 100644
index 00000000000..c8c41118c22
--- /dev/null
+++ b/queue-5.17/mm-madvise-return-correct-bytes-advised-with-process_madvise.patch
@@ -0,0 +1,64 @@
+From 5bd009c7c9a9e888077c07535dc0c70aeab242c3 Mon Sep 17 00:00:00 2001
+From: Charan Teja Kalla <quic_charante@quicinc.com>
+Date: Tue, 22 Mar 2022 14:46:44 -0700
+Subject: mm: madvise: return correct bytes advised with process_madvise
+
+From: Charan Teja Kalla <quic_charante@quicinc.com>
+
+commit 5bd009c7c9a9e888077c07535dc0c70aeab242c3 upstream.
+
+Patch series "mm: madvise: return correct bytes processed with
+process_madvise", v2.  With the process_madvise(), always choose to return
+non-zero processed bytes over an error.  This helps the user know which
+VMA, passed in the 'struct iovec' vector list, failed to be advised, so
+the user can decide whether to retry or skip that VMA.
+
+This patch (of 2):
+
+The process_madvise() system call returns error even after processing some
+VMA's passed in the 'struct iovec' vector list which leaves the user
+confused to know where to restart the advise next.  It is also against
+this syscall man page[1] documentation where it mentions that "return
+value may be less than the total number of requested bytes, if an error
+occurred after some iovec elements were already processed.".
+
+Consider a user passing 10 VMAs in the 'struct iovec' vector list, of which
+9 are processed and one fails.  The call then just returns the error from
+the failed VMA even though the first 9 VMAs were processed, leaving the user
+unsure which VMA failed.  Returning the number of bytes processed instead
+lets the user work out which VMA failed and then retry or skip the advice
+on that VMA.
+
+[1]https://man7.org/linux/man-pages/man2/process_madvise.2.html.
+
+Link: https://lkml.kernel.org/r/cover.1647008754.git.quic_charante@quicinc.com
+Link: https://lkml.kernel.org/r/125b61a0edcee5c2db8658aed9d06a43a19ccafc.1647008754.git.quic_charante@quicinc.com
+Fixes: ecb8ac8b1f14 ("mm/madvise: introduce process_madvise() syscall: an external memory hinting API")
+Signed-off-by: Charan Teja Kalla <quic_charante@quicinc.com>
+Cc: Suren Baghdasaryan <surenb@google.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Stephen Rothwell <sfr@canb.auug.org.au>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Nadav Amit <nadav.amit@gmail.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/madvise.c |    3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/mm/madvise.c
++++ b/mm/madvise.c
+@@ -1440,8 +1440,7 @@ SYSCALL_DEFINE5(process_madvise, int, pi
+ 		iov_iter_advance(&iter, iovec.iov_len);
+ 	}
+ 
+-	if (ret == 0)
+-		ret = total_len - iov_iter_count(&iter);
++	ret = (total_len - iov_iter_count(&iter)) ? : ret;
+ 
+ release_mm:
+ 	mmput(mm);
diff --git a/queue-5.17/mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch b/queue-5.17/mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch
new file mode 100644
index 00000000000..8dbbe8db845
--- /dev/null
+++ b/queue-5.17/mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch
@@ -0,0 +1,57 @@
+From 08095d6310a7ce43256b4251577bc66a25c6e1a6 Mon Sep 17 00:00:00 2001
+From: Charan Teja Kalla <quic_charante@quicinc.com>
+Date: Tue, 22 Mar 2022 14:46:48 -0700
+Subject: mm: madvise: skip unmapped vma holes passed to process_madvise
+
+From: Charan Teja Kalla <quic_charante@quicinc.com>
+
+commit 08095d6310a7ce43256b4251577bc66a25c6e1a6 upstream.
+
+The process_madvise() system call is expected to skip holes in the VMAs
+passed through the 'struct iovec' vector list.  But do_madvise(), which
+process_madvise() calls for each VMA, returns -ENOMEM in case of unmapped
+holes, even though the VMA has been processed.
+
+Thus process_madvise() should treat -ENOMEM as expected, consider the VMA
+passed to it as processed, and continue processing the other VMAs in the
+vector list.  If -ENOMEM is returned even though the VMA was processed,
+the user is unable to figure out where to start the next madvise.
+
+Link: https://lkml.kernel.org/r/4f091776142f2ebf7b94018146de72318474e686.1647008754.git.quic_charante@quicinc.com
+Fixes: ecb8ac8b1f14 ("mm/madvise: introduce process_madvise() syscall: an external memory hinting API")
+Signed-off-by: Charan Teja Kalla <quic_charante@quicinc.com>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Nadav Amit <nadav.amit@gmail.com>
+Cc: Stephen Rothwell <sfr@canb.auug.org.au>
+Cc: Suren Baghdasaryan <surenb@google.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/madvise.c |    9 ++++++++-
+ 1 file changed, 8 insertions(+), 1 deletion(-)
+
+--- a/mm/madvise.c
++++ b/mm/madvise.c
+@@ -1426,9 +1426,16 @@ SYSCALL_DEFINE5(process_madvise, int, pi
+ 
+ 	while (iov_iter_count(&iter)) {
+ 		iovec = iov_iter_iovec(&iter);
++		/*
++		 * do_madvise returns ENOMEM if unmapped holes are present
++		 * in the passed VMA. process_madvise() is expected to skip
++		 * unmapped holes passed to it in the 'struct iovec' list
++		 * and not fail because of them. Thus treat -ENOMEM return
++		 * from do_madvise as valid and continue processing.
++		 */
+ 		ret = do_madvise(mm, (unsigned long)iovec.iov_base,
+ 					iovec.iov_len, behavior);
+-		if (ret < 0)
++		if (ret < 0 && ret != -ENOMEM)
+ 			break;
+ 		iov_iter_advance(&iter, iovec.iov_len);
+ 	}
diff --git a/queue-5.17/mm-only-re-generate-demotion-targets-when-a-numa-node-changes-its-n_cpu-state.patch b/queue-5.17/mm-only-re-generate-demotion-targets-when-a-numa-node-changes-its-n_cpu-state.patch
new file mode 100644
index 00000000000..e70fb4b8864
--- /dev/null
+++ b/queue-5.17/mm-only-re-generate-demotion-targets-when-a-numa-node-changes-its-n_cpu-state.patch
@@ -0,0 +1,183 @@
+From 734c15700cdf9062ae98d8b131c6fe873dfad26d Mon Sep 17 00:00:00 2001
+From: Oscar Salvador <osalvador@suse.de>
+Date: Tue, 22 Mar 2022 14:47:37 -0700
+Subject: mm: only re-generate demotion targets when a numa node changes its N_CPU state
+
+From: Oscar Salvador <osalvador@suse.de>
+
+commit 734c15700cdf9062ae98d8b131c6fe873dfad26d upstream.
+
+Abhishek reported that after patch [1], hotplug operations are taking
+roughly double the expected time.  [2]
+
+The reason behind this is that the CPU callbacks that
+migrate_on_reclaim_init() sets always call set_migration_target_nodes()
+whenever a CPU is brought up/down.
+
+But we only care about numa nodes going from having cpus to becoming
+cpuless, and vice versa, as that influences the demotion_target order.
+
+We do already have two CPU callbacks (vmstat_cpu_online() and
+vmstat_cpu_dead()) that check exactly that, so get rid of the CPU
+callbacks in migrate_on_reclaim_init() and only call
+set_migration_target_nodes() from vmstat_cpu_{dead,online}() whenever a
+numa node changes its N_CPU state.
+
+[1] https://lore.kernel.org/linux-mm/20210721063926.3024591-2-ying.huang@intel.com/
+[2] https://lore.kernel.org/linux-mm/eb438ddd-2919-73d4-bd9f-b7eecdd9577a@linux.vnet.ibm.com/
+
+[osalvador@suse.de: add feedback from Huang Ying]
+  Link: https://lkml.kernel.org/r/20220314150945.12694-1-osalvador@suse.de
+
+Link: https://lkml.kernel.org/r/20220310120749.23077-1-osalvador@suse.de
+Fixes: 884a6e5d1f93b ("mm/migrate: update node demotion order on hotplug events")
+Signed-off-by: Oscar Salvador <osalvador@suse.de>
+Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
+Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
+Reported-by: Abhishek Goel <huntbag@linux.vnet.ibm.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: "Huang, Ying" <ying.huang@intel.com>
+Cc: Abhishek Goel <huntbag@linux.vnet.ibm.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/migrate.h |    8 ++++++++
+ mm/migrate.c            |   47 ++++++++++-------------------------------------
+ mm/vmstat.c             |   13 ++++++++++++-
+ 3 files changed, 30 insertions(+), 38 deletions(-)
+
+--- a/include/linux/migrate.h
++++ b/include/linux/migrate.h
+@@ -48,7 +48,15 @@ int folio_migrate_mapping(struct address
+ 		struct folio *newfolio, struct folio *folio, int extra_count);
+ 
+ extern bool numa_demotion_enabled;
++extern void migrate_on_reclaim_init(void);
++#ifdef CONFIG_HOTPLUG_CPU
++extern void set_migration_target_nodes(void);
+ #else
++static inline void set_migration_target_nodes(void) {}
++#endif
++#else
++
++static inline void set_migration_target_nodes(void) {}
+ 
+ static inline void putback_movable_pages(struct list_head *l) {}
+ static inline int migrate_pages(struct list_head *l, new_page_t new,
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -3190,7 +3190,7 @@ again:
+ /*
+  * For callers that do not hold get_online_mems() already.
+  */
+-static void set_migration_target_nodes(void)
++void set_migration_target_nodes(void)
+ {
+ 	get_online_mems();
+ 	__set_migration_target_nodes();
+@@ -3254,51 +3254,24 @@ static int __meminit migrate_on_reclaim_
+ 	return notifier_from_errno(0);
+ }
+ 
+-/*
+- * React to hotplug events that might affect the migration targets
+- * like events that online or offline NUMA nodes.
+- *
+- * The ordering is also currently dependent on which nodes have
+- * CPUs.  That means we need CPU on/offline notification too.
+- */
+-static int migration_online_cpu(unsigned int cpu)
+-{
+-	set_migration_target_nodes();
+-	return 0;
+-}
+-
+-static int migration_offline_cpu(unsigned int cpu)
+-{
+-	set_migration_target_nodes();
+-	return 0;
+-}
+-
+-static int __init migrate_on_reclaim_init(void)
++void __init migrate_on_reclaim_init(void)
+ {
+-	int ret;
+-
+ 	node_demotion = kmalloc_array(nr_node_ids,
+ 				      sizeof(struct demotion_nodes),
+ 				      GFP_KERNEL);
+ 	WARN_ON(!node_demotion);
+ 
+-	ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",
+-					NULL, migration_offline_cpu);
++	hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
+ 	/*
+-	 * In the unlikely case that this fails, the automatic
+-	 * migration targets may become suboptimal for nodes
+-	 * where N_CPU changes.  With such a small impact in a
+-	 * rare case, do not bother trying to do anything special.
++	 * At this point, all numa nodes with memory/CPUs have their state
++	 * properly set, so we can build the demotion order now.
++	 * Let us hold the cpu_hotplug lock just in case, as we could possibly
++	 * have CPU hotplug events during boot.
+ 	 */
+-	WARN_ON(ret < 0);
+-	ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE, "mm/demotion:online",
+-				migration_online_cpu, NULL);
+-	WARN_ON(ret < 0);
+-
+-	hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
+-	return 0;
++	cpus_read_lock();
++	set_migration_target_nodes();
++	cpus_read_unlock();
+ }
+-late_initcall(migrate_on_reclaim_init);
+ #endif /* CONFIG_HOTPLUG_CPU */
+ 
+ bool numa_demotion_enabled = false;
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -28,6 +28,7 @@
+ #include <linux/mm_inline.h>
+ #include <linux/page_ext.h>
+ #include <linux/page_owner.h>
++#include <linux/migrate.h>
+ 
+ #include "internal.h"
+ 
+@@ -2043,7 +2044,12 @@ static void __init init_cpu_node_state(v
+ static int vmstat_cpu_online(unsigned int cpu)
+ {
+ 	refresh_zone_stat_thresholds();
+-	node_set_state(cpu_to_node(cpu), N_CPU);
++
++	if (!node_state(cpu_to_node(cpu), N_CPU)) {
++		node_set_state(cpu_to_node(cpu), N_CPU);
++		set_migration_target_nodes();
++	}
++
+ 	return 0;
+ }
+ 
+@@ -2066,6 +2072,8 @@ static int vmstat_cpu_dead(unsigned int
+ 		return 0;
+ 
+ 	node_clear_state(node, N_CPU);
++	set_migration_target_nodes();
++
+ 	return 0;
+ }
+ 
+@@ -2097,6 +2105,9 @@ void __init init_mm_internals(void)
+ 
+ 	start_shepherd_timer();
+ #endif
++#if defined(CONFIG_MIGRATION) && defined(CONFIG_HOTPLUG_CPU)
++	migrate_on_reclaim_init();
++#endif
+ #ifdef CONFIG_PROC_FS
+ 	proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
+ 	proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
diff --git a/queue-5.17/revert-mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch b/queue-5.17/revert-mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch
new file mode 100644
index 00000000000..3e355823dfd
--- /dev/null
+++ b/queue-5.17/revert-mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch
@@ -0,0 +1,57 @@
+From e6b0a7b357659c332231621e4315658d062c23ee Mon Sep 17 00:00:00 2001
+From: Charan Teja Kalla <quic_charante@quicinc.com>
+Date: Fri, 1 Apr 2022 11:28:12 -0700
+Subject: Revert "mm: madvise: skip unmapped vma holes passed to process_madvise"
+
+From: Charan Teja Kalla <quic_charante@quicinc.com>
+
+commit e6b0a7b357659c332231621e4315658d062c23ee upstream.
+
+This reverts commit 08095d6310a7 ("mm: madvise: skip unmapped vma holes
+passed to process_madvise") as process_madvise() fails to return the
+exact processed bytes in other cases too.
+
+As an example: if process_madvise() hits mlocked pages after processing
+some initial bytes passed in [start, end), it just returns EINVAL
+although some bytes are processed.  Thus making an exception only for
+ENOMEM is partially fixing the problem of returning the proper advised
+bytes.
+
+Thus revert this patch and return proper bytes advised.
+
+Link: https://lkml.kernel.org/r/e73da1304a88b6a8a11907045117cccf4c2b8374.1648046642.git.quic_charante@quicinc.com
+Fixes: 08095d6310a7ce ("mm: madvise: skip unmapped vma holes passed to process_madvise")
+Signed-off-by: Charan Teja Kalla <quic_charante@quicinc.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Suren Baghdasaryan <surenb@google.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Nadav Amit <nadav.amit@gmail.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/madvise.c |    9 +--------
+ 1 file changed, 1 insertion(+), 8 deletions(-)
+
+--- a/mm/madvise.c
++++ b/mm/madvise.c
+@@ -1426,16 +1426,9 @@ SYSCALL_DEFINE5(process_madvise, int, pi
+ 
+ 	while (iov_iter_count(&iter)) {
+ 		iovec = iov_iter_iovec(&iter);
+-		/*
+-		 * do_madvise returns ENOMEM if unmapped holes are present
+-		 * in the passed VMA. process_madvise() is expected to skip
+-		 * unmapped holes passed to it in the 'struct iovec' list
+-		 * and not fail because of them. Thus treat -ENOMEM return
+-		 * from do_madvise as valid and continue processing.
+-		 */
+ 		ret = do_madvise(mm, (unsigned long)iovec.iov_base,
+ 					iovec.iov_len, behavior);
+-		if (ret < 0 && ret != -ENOMEM)
++		if (ret < 0)
+ 			break;
+ 		iov_iter_advance(&iter, iovec.iov_len);
+ 	}
diff --git a/queue-5.17/series b/queue-5.17/series
index 773df2de633..ad0ac27e51d 100644
--- a/queue-5.17/series
+++ b/queue-5.17/series
@@ -102,3 +102,16 @@ rtc-pl031-fix-rtc-features-null-pointer-dereference.patch
 io_uring-ensure-that-fsnotify-is-always-called.patch
 ocfs2-fix-crash-when-mount-with-quota-enabled.patch
 drm-simpledrm-add-panel-orientation-property-on-non-upright-mounted-lcd-panels.patch
+mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch
+mm-madvise-return-correct-bytes-advised-with-process_madvise.patch
+revert-mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch
+mm-hwpoison-unmap-poisoned-page-before-invalidation.patch
+mm-only-re-generate-demotion-targets-when-a-numa-node-changes-its-n_cpu-state.patch
+mm-kmemleak-reset-tag-when-compare-object-pointer.patch
+dm-stats-fix-too-short-end-duration_ns-when-using-precise_timestamps.patch
+dm-fix-use-after-free-in-dm_cleanup_zoned_dev.patch
+dm-interlock-pending-dm_io-and-dm_wait_for_bios_completion.patch
+dm-fix-double-accounting-of-flush-with-data.patch
+dm-integrity-set-journal-entry-unused-when-shrinking-device.patch
+tracing-have-trace-event-string-test-handle-zero-length-strings.patch
+drbd-fix-potential-silent-data-corruption.patch
diff --git a/queue-5.17/tracing-have-trace-event-string-test-handle-zero-length-strings.patch b/queue-5.17/tracing-have-trace-event-string-test-handle-zero-length-strings.patch
new file mode 100644
index 00000000000..28c2b62ebbd
--- /dev/null
+++ b/queue-5.17/tracing-have-trace-event-string-test-handle-zero-length-strings.patch
@@ -0,0 +1,62 @@
+From eca344a7362e0f34f179298fd8366bcd556eede1 Mon Sep 17 00:00:00 2001
+From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
+Date: Wed, 23 Mar 2022 10:32:51 -0400
+Subject: tracing: Have trace event string test handle zero length strings
+
+From: Steven Rostedt (Google) <rostedt@goodmis.org>
+
+commit eca344a7362e0f34f179298fd8366bcd556eede1 upstream.
+
+If a trace event has in its TP_printk():
+
+ "%*.s", len, len ? __get_str(string) : NULL
+
+It is perfectly valid for len to be zero and for NULL to be passed in.
+Unfortunately, the runtime string check at time of reading the trace sees
+the NULL and flags it as a bad string and produces a WARN_ON().
+
+Handle this case by telling the test function whether the format has an
+asterisk (star) and, if it does and the length is zero, marking it as safe.
+
+Link: https://lore.kernel.org/all/YjsWzuw5FbWPrdqq@bfoster/
+
+Cc: stable@vger.kernel.org
+Reported-by: Brian Foster <bfoster@redhat.com>
+Tested-by: Brian Foster <bfoster@redhat.com>
+Fixes: 9a6944fee68e2 ("tracing: Add a verifier to check string pointers for trace events")
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/trace/trace.c |    9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+--- a/kernel/trace/trace.c
++++ b/kernel/trace/trace.c
+@@ -3663,12 +3663,17 @@ static char *trace_iter_expand_format(st
+ }
+ 
+ /* Returns true if the string is safe to dereference from an event */
+-static bool trace_safe_str(struct trace_iterator *iter, const char *str)
++static bool trace_safe_str(struct trace_iterator *iter, const char *str,
++			   bool star, int len)
+ {
+ 	unsigned long addr = (unsigned long)str;
+ 	struct trace_event *trace_event;
+ 	struct trace_event_call *event;
+ 
++	/* Ignore strings with no length */
++	if (star && !len)
++		return true;
++
+ 	/* OK if part of the event data */
+ 	if ((addr >= (unsigned long)iter->ent) &&
+ 	    (addr < (unsigned long)iter->ent + iter->ent_size))
+@@ -3854,7 +3859,7 @@ void trace_check_vprintf(struct trace_it
+ 		 * instead. See samples/trace_events/trace-events-sample.h
+ 		 * for reference.
+ 		 */
+-		if (WARN_ONCE(!trace_safe_str(iter, str),
++		if (WARN_ONCE(!trace_safe_str(iter, str, star, len),
+ 			      "fmt: '%s' current_buffer: '%s'",
+ 			      fmt, show_buffer(&iter->seq))) {
+ 			int ret;