--- /dev/null
+From 8d394bc4adf588ca4a0650745167cb83f86c18c9 Mon Sep 17 00:00:00 2001
+From: Mike Snitzer <snitzer@redhat.com>
+Date: Thu, 17 Feb 2022 23:39:57 -0500
+Subject: dm: fix double accounting of flush with data
+
+From: Mike Snitzer <snitzer@redhat.com>
+
+commit 8d394bc4adf588ca4a0650745167cb83f86c18c9 upstream.
+
+DM handles a flush with data by first issuing an empty flush and then,
+once it completes, removing the REQ_PREFLUSH flag and issuing the
+payload. The problem fixed by this commit is that both the empty flush
+bio and the data payload were accounted with the full size of the data
+payload.
+
+Fix this by factoring out dm_io_acct() and having it wrap all IO
+accounting: it sets the size of a bio with REQ_PREFLUSH to 0, accounts
+the IO, and then restores the original size.
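+
+As a simplified sketch of the pattern the fix introduces (account_io()
+here is a stand-in for the real block accounting calls, not an upstream
+name):
+
+	if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
+		unsigned int saved = bio->bi_iter.bi_size;
+
+		bio->bi_iter.bi_size = 0;	/* account the empty flush as 0 bytes */
+		account_io(bio);
+		bio->bi_iter.bi_size = saved;	/* payload is accounted when reissued */
+	} else {
+		account_io(bio);
+	}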
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Mike Snitzer <snitzer@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/md/dm-stats.c | 6 ++++--
+ drivers/md/dm-stats.h | 2 +-
+ drivers/md/dm.c | 47 +++++++++++++++++++++++++++++++++--------------
+ 3 files changed, 38 insertions(+), 17 deletions(-)
+
+--- a/drivers/md/dm-stats.c
++++ b/drivers/md/dm-stats.c
+@@ -644,13 +644,14 @@ static void __dm_stat_bio(struct dm_stat
+
+ void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
+ sector_t bi_sector, unsigned bi_sectors, bool end,
+- unsigned long duration_jiffies,
++ unsigned long start_time,
+ struct dm_stats_aux *stats_aux)
+ {
+ struct dm_stat *s;
+ sector_t end_sector;
+ struct dm_stats_last_position *last;
+ bool got_precise_time;
++ unsigned long duration_jiffies = 0;
+
+ if (unlikely(!bi_sectors))
+ return;
+@@ -670,7 +671,8 @@ void dm_stats_account_io(struct dm_stats
+ ));
+ WRITE_ONCE(last->last_sector, end_sector);
+ WRITE_ONCE(last->last_rw, bi_rw);
+- }
++ } else
++ duration_jiffies = jiffies - start_time;
+
+ rcu_read_lock();
+
+--- a/drivers/md/dm-stats.h
++++ b/drivers/md/dm-stats.h
+@@ -31,7 +31,7 @@ int dm_stats_message(struct mapped_devic
+
+ void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
+ sector_t bi_sector, unsigned bi_sectors, bool end,
+- unsigned long duration_jiffies,
++ unsigned long start_time,
+ struct dm_stats_aux *aux);
+
+ static inline bool dm_stats_used(struct dm_stats *st)
+--- a/drivers/md/dm.c
++++ b/drivers/md/dm.c
+@@ -484,29 +484,48 @@ u64 dm_start_time_ns_from_clone(struct b
+ }
+ EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
+
+-static void start_io_acct(struct dm_io *io)
++static bool bio_is_flush_with_data(struct bio *bio)
+ {
+- struct mapped_device *md = io->md;
+- struct bio *bio = io->orig_bio;
++ return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size);
++}
++
++static void dm_io_acct(bool end, struct mapped_device *md, struct bio *bio,
++ unsigned long start_time, struct dm_stats_aux *stats_aux)
++{
++ bool is_flush_with_data;
++ unsigned int bi_size;
++
++ /* If REQ_PREFLUSH is set, save any payload but do not account it */
++ is_flush_with_data = bio_is_flush_with_data(bio);
++ if (is_flush_with_data) {
++ bi_size = bio->bi_iter.bi_size;
++ bio->bi_iter.bi_size = 0;
++ }
++
++ if (!end)
++ bio_start_io_acct_time(bio, start_time);
++ else
++ bio_end_io_acct(bio, start_time);
+
+- bio_start_io_acct_time(bio, io->start_time);
+ if (unlikely(dm_stats_used(&md->stats)))
+ dm_stats_account_io(&md->stats, bio_data_dir(bio),
+ bio->bi_iter.bi_sector, bio_sectors(bio),
+- false, 0, &io->stats_aux);
++ end, start_time, stats_aux);
++
++ /* Restore bio's payload so it does get accounted upon requeue */
++ if (is_flush_with_data)
++ bio->bi_iter.bi_size = bi_size;
++}
++
++static void start_io_acct(struct dm_io *io)
++{
++ dm_io_acct(false, io->md, io->orig_bio, io->start_time, &io->stats_aux);
+ }
+
+ static void end_io_acct(struct mapped_device *md, struct bio *bio,
+ unsigned long start_time, struct dm_stats_aux *stats_aux)
+ {
+- unsigned long duration = jiffies - start_time;
+-
+- bio_end_io_acct(bio, start_time);
+-
+- if (unlikely(dm_stats_used(&md->stats)))
+- dm_stats_account_io(&md->stats, bio_data_dir(bio),
+- bio->bi_iter.bi_sector, bio_sectors(bio),
+- true, duration, stats_aux);
++ dm_io_acct(true, md, bio, start_time, stats_aux);
+ }
+
+ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
+@@ -835,7 +854,7 @@ void dm_io_dec_pending(struct dm_io *io,
+ if (io_error == BLK_STS_DM_REQUEUE)
+ return;
+
+- if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
++ if (bio_is_flush_with_data(bio)) {
+ /*
+ * Preflush done for flush with data, reissue
+ * without REQ_PREFLUSH.
--- /dev/null
+From 588b7f5df0cb64f281290c7672470c006abe7160 Mon Sep 17 00:00:00 2001
+From: Kirill Tkhai <ktkhai@virtuozzo.com>
+Date: Tue, 1 Feb 2022 11:39:52 +0300
+Subject: dm: fix use-after-free in dm_cleanup_zoned_dev()
+
+From: Kirill Tkhai <ktkhai@virtuozzo.com>
+
+commit 588b7f5df0cb64f281290c7672470c006abe7160 upstream.
+
+dm_cleanup_zoned_dev() uses the queue, so it must be called
+before blk_cleanup_disk() starts its killing:
+
+blk_cleanup_disk->blk_cleanup_queue()->kobject_put()->blk_release_queue()->
+->...RCU...->blk_free_queue_rcu()->kmem_cache_free()
+
+Otherwise, the RCU callback may be executed first and
+dm_cleanup_zoned_dev() will touch freed memory:
+
+ BUG: KASAN: use-after-free in dm_cleanup_zoned_dev+0x33/0xd0
+ Read of size 8 at addr ffff88805ac6e430 by task dmsetup/681
+
+ CPU: 4 PID: 681 Comm: dmsetup Not tainted 5.17.0-rc2+ #6
+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
+ Call Trace:
+ <TASK>
+ dump_stack_lvl+0x57/0x7d
+ print_address_description.constprop.0+0x1f/0x150
+ ? dm_cleanup_zoned_dev+0x33/0xd0
+ kasan_report.cold+0x7f/0x11b
+ ? dm_cleanup_zoned_dev+0x33/0xd0
+ dm_cleanup_zoned_dev+0x33/0xd0
+ __dm_destroy+0x26a/0x400
+ ? dm_blk_ioctl+0x230/0x230
+ ? up_write+0xd8/0x270
+ dev_remove+0x156/0x1d0
+ ctl_ioctl+0x269/0x530
+ ? table_clear+0x140/0x140
+ ? lock_release+0xb2/0x750
+ ? remove_all+0x40/0x40
+ ? rcu_read_lock_sched_held+0x12/0x70
+ ? lock_downgrade+0x3c0/0x3c0
+ ? rcu_read_lock_sched_held+0x12/0x70
+ dm_ctl_ioctl+0xa/0x10
+ __x64_sys_ioctl+0xb9/0xf0
+ do_syscall_64+0x3b/0x90
+ entry_SYSCALL_64_after_hwframe+0x44/0xae
+ RIP: 0033:0x7fb6dfa95c27
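+
+The fix is purely an ordering change inside cleanup_mapped_device();
+schematically (simplified, elided lines marked):
+
+	dm_cleanup_zoned_dev(md);	/* still needs md's queue: run it first */
+	if (md->disk) {
+		/* ... */
+		blk_cleanup_disk(md->disk);	/* queue can now be freed via RCU */
+	}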
+
+Fixes: bb37d77239af ("dm: introduce zone append emulation")
+Cc: stable@vger.kernel.org
+Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
+Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
+Signed-off-by: Mike Snitzer <snitzer@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/md/dm.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/md/dm.c
++++ b/drivers/md/dm.c
+@@ -1609,6 +1609,7 @@ static void cleanup_mapped_device(struct
+ md->dax_dev = NULL;
+ }
+
++ dm_cleanup_zoned_dev(md);
+ if (md->disk) {
+ spin_lock(&_minor_lock);
+ md->disk->private_data = NULL;
+@@ -1629,7 +1630,6 @@ static void cleanup_mapped_device(struct
+ mutex_destroy(&md->swap_bios_lock);
+
+ dm_mq_cleanup_mapped_device(md);
+- dm_cleanup_zoned_dev(md);
+ }
+
+ /*
--- /dev/null
+From cc09e8a9dec4f0e8299e80a7a2a8e6f54164a10b Mon Sep 17 00:00:00 2001
+From: Mikulas Patocka <mpatocka@redhat.com>
+Date: Sat, 26 Mar 2022 10:24:56 -0400
+Subject: dm integrity: set journal entry unused when shrinking device
+
+From: Mikulas Patocka <mpatocka@redhat.com>
+
+commit cc09e8a9dec4f0e8299e80a7a2a8e6f54164a10b upstream.
+
+Commit f6f72f32c22c ("dm integrity: don't replay journal data past the
+end of the device") skips journal replay if the target sector points
+beyond the end of the device. Unfortunately, it doesn't set the
+journal entry unused, which resulted in this BUG being triggered:
+BUG_ON(!journal_entry_is_unused(je))
+
+Fix this by calling journal_entry_set_unused() for this case.
+
+Fixes: f6f72f32c22c ("dm integrity: don't replay journal data past the end of the device")
+Cc: stable@vger.kernel.org # v5.7+
+Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
+Tested-by: Milan Broz <gmazyland@gmail.com>
+[snitzer: revised header]
+Signed-off-by: Mike Snitzer <snitzer@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/md/dm-integrity.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/drivers/md/dm-integrity.c
++++ b/drivers/md/dm-integrity.c
+@@ -2473,9 +2473,11 @@ static void do_journal_write(struct dm_i
+ dm_integrity_io_error(ic, "invalid sector in journal", -EIO);
+ sec &= ~(sector_t)(ic->sectors_per_block - 1);
+ }
++ if (unlikely(sec >= ic->provided_data_sectors)) {
++ journal_entry_set_unused(je);
++ continue;
++ }
+ }
+- if (unlikely(sec >= ic->provided_data_sectors))
+- continue;
+ get_area_and_offset(ic, sec, &area, &offset);
+ restore_last_bytes(ic, access_journal_data(ic, i, j), je);
+ for (k = j + 1; k < ic->journal_section_entries; k++) {
--- /dev/null
+From 9f6dc633761006f974701d4c88da71ab68670749 Mon Sep 17 00:00:00 2001
+From: Mike Snitzer <snitzer@redhat.com>
+Date: Thu, 17 Feb 2022 23:40:02 -0500
+Subject: dm: interlock pending dm_io and dm_wait_for_bios_completion
+
+From: Mike Snitzer <snitzer@redhat.com>
+
+commit 9f6dc633761006f974701d4c88da71ab68670749 upstream.
+
+Commit d208b89401e0 ("dm: fix mempool NULL pointer race when
+completing IO") didn't go far enough.
+
+When bio_end_io_acct ends, the count of in-flight I/Os may reach zero
+and the DM device may be suspended. There is a possibility that the
+suspend races with dm_stats_account_io.
+
+Fix this by adding percpu "pending_io" counters to track outstanding
+dm_io. Move kicking of suspend queue to dm_io_dec_pending(). Also,
+rename md_in_flight_bios() to dm_in_flight_bios() and update it to
+iterate all pending_io counters.
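+
+The resulting lifecycle, sketched (simplified; names as in the hunks
+below):
+
+	/* alloc_io(): */
+	this_cpu_inc(*md->pending_io);
+
+	/* dm_io_dec_pending(), once accounting is done: */
+	smp_wmb();
+	this_cpu_dec(*md->pending_io);
+	if (unlikely(wq_has_sleeper(&md->wait)))
+		wake_up(&md->wait);
+
+	/* dm_in_flight_bios(): suspend waits for the sum to reach zero */
+	for_each_possible_cpu(cpu)
+		sum += *per_cpu_ptr(md->pending_io, cpu);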
+
+Fixes: d208b89401e0 ("dm: fix mempool NULL pointer race when completing IO")
+Cc: stable@vger.kernel.org
+Co-developed-by: Mikulas Patocka <mpatocka@redhat.com>
+Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
+Signed-off-by: Mike Snitzer <snitzer@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/md/dm-core.h | 2 ++
+ drivers/md/dm.c | 35 +++++++++++++++++++++++------------
+ 2 files changed, 25 insertions(+), 12 deletions(-)
+
+--- a/drivers/md/dm-core.h
++++ b/drivers/md/dm-core.h
+@@ -65,6 +65,8 @@ struct mapped_device {
+ struct gendisk *disk;
+ struct dax_device *dax_dev;
+
++ unsigned long __percpu *pending_io;
++
+ /*
+ * A list of ios that arrived while we were suspended.
+ */
+--- a/drivers/md/dm.c
++++ b/drivers/md/dm.c
+@@ -507,10 +507,6 @@ static void end_io_acct(struct mapped_de
+ dm_stats_account_io(&md->stats, bio_data_dir(bio),
+ bio->bi_iter.bi_sector, bio_sectors(bio),
+ true, duration, stats_aux);
+-
+- /* nudge anyone waiting on suspend queue */
+- if (unlikely(wq_has_sleeper(&md->wait)))
+- wake_up(&md->wait);
+ }
+
+ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
+@@ -531,6 +527,7 @@ static struct dm_io *alloc_io(struct map
+ io->magic = DM_IO_MAGIC;
+ io->status = 0;
+ atomic_set(&io->io_count, 1);
++ this_cpu_inc(*md->pending_io);
+ io->orig_bio = bio;
+ io->md = md;
+ spin_lock_init(&io->endio_lock);
+@@ -828,6 +825,12 @@ void dm_io_dec_pending(struct dm_io *io,
+ stats_aux = io->stats_aux;
+ free_io(md, io);
+ end_io_acct(md, bio, start_time, &stats_aux);
++ smp_wmb();
++ this_cpu_dec(*md->pending_io);
++
++ /* nudge anyone waiting on suspend queue */
++ if (unlikely(wq_has_sleeper(&md->wait)))
++ wake_up(&md->wait);
+
+ if (io_error == BLK_STS_DM_REQUEUE)
+ return;
+@@ -1622,6 +1625,11 @@ static void cleanup_mapped_device(struct
+ blk_cleanup_disk(md->disk);
+ }
+
++ if (md->pending_io) {
++ free_percpu(md->pending_io);
++ md->pending_io = NULL;
++ }
++
+ cleanup_srcu_struct(&md->io_barrier);
+
+ mutex_destroy(&md->suspend_lock);
+@@ -1723,6 +1731,10 @@ static struct mapped_device *alloc_dev(i
+ if (!md->wq)
+ goto bad;
+
++ md->pending_io = alloc_percpu(unsigned long);
++ if (!md->pending_io)
++ goto bad;
++
+ dm_stats_init(&md->stats);
+
+ /* Populate the mapping, nobody knows we exist yet */
+@@ -2130,16 +2142,13 @@ void dm_put(struct mapped_device *md)
+ }
+ EXPORT_SYMBOL_GPL(dm_put);
+
+-static bool md_in_flight_bios(struct mapped_device *md)
++static bool dm_in_flight_bios(struct mapped_device *md)
+ {
+ int cpu;
+- struct block_device *part = dm_disk(md)->part0;
+- long sum = 0;
++ unsigned long sum = 0;
+
+- for_each_possible_cpu(cpu) {
+- sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
+- sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
+- }
++ for_each_possible_cpu(cpu)
++ sum += *per_cpu_ptr(md->pending_io, cpu);
+
+ return sum != 0;
+ }
+@@ -2152,7 +2161,7 @@ static int dm_wait_for_bios_completion(s
+ while (true) {
+ prepare_to_wait(&md->wait, &wait, task_state);
+
+- if (!md_in_flight_bios(md))
++ if (!dm_in_flight_bios(md))
+ break;
+
+ if (signal_pending_state(task_state, current)) {
+@@ -2164,6 +2173,8 @@ static int dm_wait_for_bios_completion(s
+ }
+ finish_wait(&md->wait, &wait);
+
++ smp_rmb();
++
+ return r;
+ }
+
--- /dev/null
+From 0cdb90f0f306384ecbc60dfd6dc48cdbc1f2d0d8 Mon Sep 17 00:00:00 2001
+From: Mike Snitzer <snitzer@redhat.com>
+Date: Thu, 17 Feb 2022 23:39:59 -0500
+Subject: dm stats: fix too short end duration_ns when using precise_timestamps
+
+From: Mike Snitzer <snitzer@redhat.com>
+
+commit 0cdb90f0f306384ecbc60dfd6dc48cdbc1f2d0d8 upstream.
+
+dm_stats_account_io()'s STAT_PRECISE_TIMESTAMPS support doesn't handle
+the fact that with commit b879f915bc48 ("dm: properly fix redundant
+bio-based IO accounting") io->start_time _may_ be in the past (meaning
+the start_io_acct() was deferred until later).
+
+Add a new dm_stats_recalc_precise_timestamps() helper that will
+set/clear a new 'precise_timestamps' flag in the dm_stats struct based
+on whether any configured stats enable STAT_PRECISE_TIMESTAMPS.
+Also update DM core's alloc_io() to use dm_stats_record_start() to set
+stats_aux.duration_ns if stats->precise_timestamps is true.
+
+Also, remove unused 'last_sector' and 'last_rw' members from the
+dm_stats struct.
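+
+Sketched, the start/end pairing then looks like (simplified from the
+hunks below):
+
+	/* DM core's alloc_io(), via dm_stats_record_start(): */
+	if (unlikely(stats->precise_timestamps))
+		aux->duration_ns = ktime_to_ns(ktime_get());
+
+	/* dm_stats_account_io(), at end: turn the stamp into a duration */
+	if (end)
+		stats_aux->duration_ns = ktime_to_ns(ktime_get()) - stats_aux->duration_ns;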
+
+Fixes: b879f915bc48 ("dm: properly fix redundant bio-based IO accounting")
+Cc: stable@vger.kernel.org
+Co-developed-by: Mikulas Patocka <mpatocka@redhat.com>
+Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
+Signed-off-by: Mike Snitzer <snitzer@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/md/dm-stats.c | 28 +++++++++++++++++++++++++---
+ drivers/md/dm-stats.h | 9 +++++++--
+ drivers/md/dm.c | 2 ++
+ 3 files changed, 34 insertions(+), 5 deletions(-)
+
+--- a/drivers/md/dm-stats.c
++++ b/drivers/md/dm-stats.c
+@@ -195,6 +195,7 @@ void dm_stats_init(struct dm_stats *stat
+
+ mutex_init(&stats->mutex);
+ INIT_LIST_HEAD(&stats->list);
++ stats->precise_timestamps = false;
+ stats->last = alloc_percpu(struct dm_stats_last_position);
+ for_each_possible_cpu(cpu) {
+ last = per_cpu_ptr(stats->last, cpu);
+@@ -231,6 +232,22 @@ void dm_stats_cleanup(struct dm_stats *s
+ mutex_destroy(&stats->mutex);
+ }
+
++static void dm_stats_recalc_precise_timestamps(struct dm_stats *stats)
++{
++ struct list_head *l;
++ struct dm_stat *tmp_s;
++ bool precise_timestamps = false;
++
++ list_for_each(l, &stats->list) {
++ tmp_s = container_of(l, struct dm_stat, list_entry);
++ if (tmp_s->stat_flags & STAT_PRECISE_TIMESTAMPS) {
++ precise_timestamps = true;
++ break;
++ }
++ }
++ stats->precise_timestamps = precise_timestamps;
++}
++
+ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
+ sector_t step, unsigned stat_flags,
+ unsigned n_histogram_entries,
+@@ -376,6 +393,9 @@ static int dm_stats_create(struct dm_sta
+ }
+ ret_id = s->id;
+ list_add_tail_rcu(&s->list_entry, l);
++
++ dm_stats_recalc_precise_timestamps(stats);
++
+ mutex_unlock(&stats->mutex);
+
+ resume_callback(md);
+@@ -418,6 +438,9 @@ static int dm_stats_delete(struct dm_sta
+ }
+
+ list_del_rcu(&s->list_entry);
++
++ dm_stats_recalc_precise_timestamps(stats);
++
+ mutex_unlock(&stats->mutex);
+
+ /*
+@@ -654,9 +677,8 @@ void dm_stats_account_io(struct dm_stats
+ got_precise_time = false;
+ list_for_each_entry_rcu(s, &stats->list, list_entry) {
+ if (s->stat_flags & STAT_PRECISE_TIMESTAMPS && !got_precise_time) {
+- if (!end)
+- stats_aux->duration_ns = ktime_to_ns(ktime_get());
+- else
++ /* start (!end) duration_ns is set by DM core's alloc_io() */
++ if (end)
+ stats_aux->duration_ns = ktime_to_ns(ktime_get()) - stats_aux->duration_ns;
+ got_precise_time = true;
+ }
+--- a/drivers/md/dm-stats.h
++++ b/drivers/md/dm-stats.h
+@@ -13,8 +13,7 @@ struct dm_stats {
+ struct mutex mutex;
+ struct list_head list; /* list of struct dm_stat */
+ struct dm_stats_last_position __percpu *last;
+- sector_t last_sector;
+- unsigned last_rw;
++ bool precise_timestamps;
+ };
+
+ struct dm_stats_aux {
+@@ -40,4 +39,10 @@ static inline bool dm_stats_used(struct
+ return !list_empty(&st->list);
+ }
+
++static inline void dm_stats_record_start(struct dm_stats *stats, struct dm_stats_aux *aux)
++{
++ if (unlikely(stats->precise_timestamps))
++ aux->duration_ns = ktime_to_ns(ktime_get());
++}
++
+ #endif
+--- a/drivers/md/dm.c
++++ b/drivers/md/dm.c
+@@ -537,6 +537,8 @@ static struct dm_io *alloc_io(struct map
+
+ io->start_time = jiffies;
+
++ dm_stats_record_start(&md->stats, &io->stats_aux);
++
+ return io;
+ }
+
--- /dev/null
+From f4329d1f848ac35757d9cc5487669d19dfc5979c Mon Sep 17 00:00:00 2001
+From: Lars Ellenberg <lars.ellenberg@linbit.com>
+Date: Wed, 30 Mar 2022 20:55:51 +0200
+Subject: drbd: fix potential silent data corruption
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Lars Ellenberg <lars.ellenberg@linbit.com>
+
+commit f4329d1f848ac35757d9cc5487669d19dfc5979c upstream.
+
+Scenario:
+---------
+
+bio chain generated by blk_queue_split().
+Some split bio fails and propagates its error status to the "parent" bio.
+But then the (last part of the) parent bio itself completes without error.
+
+We would clobber the already recorded error status with BLK_STS_OK,
+causing silent data corruption.
+
+Reproducer:
+-----------
+
+How to trigger this in the real world within seconds:
+
+DRBD on top of degraded parity raid,
+small stripe_cache_size, large read_ahead setting.
+Drop page cache (sysctl vm.drop_caches=1, fadvise "DONTNEED",
+umount and mount again, "reboot").
+
+Cause significant read ahead.
+
+Large read ahead request is split by blk_queue_split().
+Parts of the read ahead that are already in the stripe cache,
+or find an available stripe cache to use, can be serviced.
+Parts of the read ahead that would need "too much work",
+would need to wait for a "stripe_head" to become available,
+are rejected immediately.
+
+For larger read ahead requests that are split in many pieces, it is very
+likely that some "splits" will be serviced, but then the stripe cache is
+exhausted/busy, and the remaining ones will be rejected.
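+
+Sketched in terms of the code (simplified; m->error == 0 for the part
+that completed fine):
+
+	/* some split bio failed earlier and recorded its status: */
+	parent->bi_status = BLK_STS_IOERR;
+
+	/* the last part then completes OK; before this fix DRBD did: */
+	m->bio->bi_status = errno_to_blk_status(m->error);	/* BLK_STS_OK */
+	/* ...silently clobbering the recorded BLK_STS_IOERR */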
+
+Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
+Signed-off-by: Christoph Böhmwalder <christoph.boehmwalder@linbit.com>
+Cc: <stable@vger.kernel.org> # 4.13.x
+Link: https://lore.kernel.org/r/20220330185551.3553196-1-christoph.boehmwalder@linbit.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/block/drbd/drbd_req.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/drivers/block/drbd/drbd_req.c
++++ b/drivers/block/drbd/drbd_req.c
+@@ -180,7 +180,8 @@ void start_new_tl_epoch(struct drbd_conn
+ void complete_master_bio(struct drbd_device *device,
+ struct bio_and_error *m)
+ {
+- m->bio->bi_status = errno_to_blk_status(m->error);
++ if (unlikely(m->error))
++ m->bio->bi_status = errno_to_blk_status(m->error);
+ bio_endio(m->bio);
+ dec_ap_bio(device);
+ }
--- /dev/null
+From 3149c79f3cb0e2e3bafb7cfadacec090cbd250d3 Mon Sep 17 00:00:00 2001
+From: Rik van Riel <riel@surriel.com>
+Date: Fri, 1 Apr 2022 11:28:42 -0700
+Subject: mm,hwpoison: unmap poisoned page before invalidation
+
+From: Rik van Riel <riel@surriel.com>
+
+commit 3149c79f3cb0e2e3bafb7cfadacec090cbd250d3 upstream.
+
+In some cases it appears the invalidation of a hwpoisoned page fails
+because the page is still mapped in another process. This can cause a
+program to be continuously restarted and die when it page faults on the
+page that was not invalidated. Avoid that problem by unmapping the
+hwpoisoned page when we find it.
+
+Another issue is that sometimes we end up oopsing in finish_fault, if
+the code tries to do something with the now-NULL vmf->page. I did not
+hit this error when submitting the previous patch because there are
+several opportunities for alloc_set_pte to bail out before accessing
+vmf->page, and that apparently happened on those systems, and most of
+the time on other systems, too.
+
+However, across several million systems that error does occur a handful
+of times a day. It can be avoided by returning VM_FAULT_NOPAGE, which
+will cause do_read_fault to return before calling finish_fault.
+
+Link: https://lkml.kernel.org/r/20220325161428.5068d97e@imladris.surriel.com
+Fixes: e53ac7374e64 ("mm: invalidate hwpoison page cache page in fault path")
+Signed-off-by: Rik van Riel <riel@surriel.com>
+Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
+Tested-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
+Reviewed-by: Oscar Salvador <osalvador@suse.de>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/memory.c | 12 ++++++++----
+ 1 file changed, 8 insertions(+), 4 deletions(-)
+
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -3893,14 +3893,18 @@ static vm_fault_t __do_fault(struct vm_f
+ return ret;
+
+ if (unlikely(PageHWPoison(vmf->page))) {
++ struct page *page = vmf->page;
+ vm_fault_t poisonret = VM_FAULT_HWPOISON;
+ if (ret & VM_FAULT_LOCKED) {
++ if (page_mapped(page))
++ unmap_mapping_pages(page_mapping(page),
++ page->index, 1, false);
+ /* Retry if a clean page was removed from the cache. */
+- if (invalidate_inode_page(vmf->page))
+- poisonret = 0;
+- unlock_page(vmf->page);
++ if (invalidate_inode_page(page))
++ poisonret = VM_FAULT_NOPAGE;
++ unlock_page(page);
+ }
+- put_page(vmf->page);
++ put_page(page);
+ vmf->page = NULL;
+ return poisonret;
+ }
--- /dev/null
+From bfc8089f00fa526dea983844c880fa8106c33ac4 Mon Sep 17 00:00:00 2001
+From: Kuan-Ying Lee <Kuan-Ying.Lee@mediatek.com>
+Date: Fri, 1 Apr 2022 11:28:54 -0700
+Subject: mm/kmemleak: reset tag when compare object pointer
+
+From: Kuan-Ying Lee <Kuan-Ying.Lee@mediatek.com>
+
+commit bfc8089f00fa526dea983844c880fa8106c33ac4 upstream.
+
+When we use HW tag-based KASAN and enable vmalloc support, we hit the
+following bug. It is due to a comparison between a tagged object and a
+non-tagged pointer.
+
+We need to reset the kasan tag when comparing a tagged object with a
+non-tagged pointer.
+
+ kmemleak: [name:kmemleak&]Scan area larger than object 0xffffffe77076f440
+ CPU: 4 PID: 1 Comm: init Tainted: G S W 5.15.25-android13-0-g5cacf919c2bc #1
+ Hardware name: MT6983(ENG) (DT)
+ Call trace:
+ add_scan_area+0xc4/0x244
+ kmemleak_scan_area+0x40/0x9c
+ layout_and_allocate+0x1e8/0x288
+ load_module+0x2c8/0xf00
+ __se_sys_finit_module+0x190/0x1d0
+ __arm64_sys_finit_module+0x20/0x30
+ invoke_syscall+0x60/0x170
+ el0_svc_common+0xc8/0x114
+ do_el0_svc+0x28/0xa0
+ el0_svc+0x60/0xf8
+ el0t_64_sync_handler+0x88/0xec
+ el0t_64_sync+0x1b4/0x1b8
+ kmemleak: [name:kmemleak&]Object 0xf5ffffe77076b000 (size 32768):
+ kmemleak: [name:kmemleak&] comm "init", pid 1, jiffies 4294894197
+ kmemleak: [name:kmemleak&] min_count = 0
+ kmemleak: [name:kmemleak&] count = 0
+ kmemleak: [name:kmemleak&] flags = 0x1
+ kmemleak: [name:kmemleak&] checksum = 0
+ kmemleak: [name:kmemleak&] backtrace:
+ module_alloc+0x9c/0x120
+ move_module+0x34/0x19c
+ layout_and_allocate+0x1c4/0x288
+ load_module+0x2c8/0xf00
+ __se_sys_finit_module+0x190/0x1d0
+ __arm64_sys_finit_module+0x20/0x30
+ invoke_syscall+0x60/0x170
+ el0_svc_common+0xc8/0x114
+ do_el0_svc+0x28/0xa0
+ el0_svc+0x60/0xf8
+ el0t_64_sync_handler+0x88/0xec
+ el0t_64_sync+0x1b4/0x1b8
+
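+A minimal sketch of the rule the fix applies: strip the KASAN tag from
+both addresses before doing arithmetic or comparisons on them:
+
+	unsigned long untagged_ptr, untagged_objp;
+
+	untagged_ptr  = (unsigned long)kasan_reset_tag((void *)ptr);
+	untagged_objp = (unsigned long)kasan_reset_tag((void *)object->pointer);
+
+	/* both sides are now tag-free, so the comparison is meaningful */
+	if (untagged_ptr + size > untagged_objp + object->size)
+		kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
+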
+Link: https://lkml.kernel.org/r/20220318034051.30687-1-Kuan-Ying.Lee@mediatek.com
+Signed-off-by: Kuan-Ying Lee <Kuan-Ying.Lee@mediatek.com>
+Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Matthias Brugger <matthias.bgg@gmail.com>
+Cc: Chinwen Chang <chinwen.chang@mediatek.com>
+Cc: Nicholas Tang <nicholas.tang@mediatek.com>
+Cc: Yee Lee <yee.lee@mediatek.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/kmemleak.c | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+--- a/mm/kmemleak.c
++++ b/mm/kmemleak.c
+@@ -796,6 +796,8 @@ static void add_scan_area(unsigned long
+ unsigned long flags;
+ struct kmemleak_object *object;
+ struct kmemleak_scan_area *area = NULL;
++ unsigned long untagged_ptr;
++ unsigned long untagged_objp;
+
+ object = find_and_get_object(ptr, 1);
+ if (!object) {
+@@ -804,6 +806,9 @@ static void add_scan_area(unsigned long
+ return;
+ }
+
++ untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
++ untagged_objp = (unsigned long)kasan_reset_tag((void *)object->pointer);
++
+ if (scan_area_cache)
+ area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
+
+@@ -815,8 +820,8 @@ static void add_scan_area(unsigned long
+ goto out_unlock;
+ }
+ if (size == SIZE_MAX) {
+- size = object->pointer + object->size - ptr;
+- } else if (ptr + size > object->pointer + object->size) {
++ size = untagged_objp + object->size - untagged_ptr;
++ } else if (untagged_ptr + size > untagged_objp + object->size) {
+ kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
+ dump_object_info(object);
+ kmem_cache_free(scan_area_cache, area);
--- /dev/null
+From 5bd009c7c9a9e888077c07535dc0c70aeab242c3 Mon Sep 17 00:00:00 2001
+From: Charan Teja Kalla <quic_charante@quicinc.com>
+Date: Tue, 22 Mar 2022 14:46:44 -0700
+Subject: mm: madvise: return correct bytes advised with process_madvise
+
+From: Charan Teja Kalla <quic_charante@quicinc.com>
+
+commit 5bd009c7c9a9e888077c07535dc0c70aeab242c3 upstream.
+
+Patch series "mm: madvise: return correct bytes processed with
+process_madvise", v2. With the process_madvise(), always choose to return
+non zero processed bytes over an error. This can help the user to know on
+which VMA, passed in the 'struct iovec' vector list, is failed to advise
+thus can take the decission of retrying/skipping on that VMA.
+
+This patch (of 2):
+
+The process_madvise() system call returns an error even after processing
+some of the VMAs passed in the 'struct iovec' vector list, which leaves
+the user unsure where to restart the advise next. It also contradicts
+the syscall's man page[1], which states that the "return value may be
+less than the total number of requested bytes, if an error occurred
+after some iovec elements were already processed."
+
+Consider a user who passed 10 VMAs in the 'struct iovec' vector list, of
+which 9 are processed but one fails. The call then just returns the error
+caused by that failed VMA, despite the first 9 VMAs having been processed,
+leaving the user confused about which VMA it failed on. Returning the
+number of bytes processed instead lets the user identify the failing VMA
+and retry/skip the advise on it.
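+
+The fix relies on the GNU C "a ? : b" extension, which evaluates to a
+when a is non-zero and to b otherwise, so partial progress always wins
+over the error code:
+
+	/* bytes actually advised, if any; otherwise the first error */
+	ret = (total_len - iov_iter_count(&iter)) ? : ret;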
+
+[1]https://man7.org/linux/man-pages/man2/process_madvise.2.html.
+
+Link: https://lkml.kernel.org/r/cover.1647008754.git.quic_charante@quicinc.com
+Link: https://lkml.kernel.org/r/125b61a0edcee5c2db8658aed9d06a43a19ccafc.1647008754.git.quic_charante@quicinc.com
+Fixes: ecb8ac8b1f14("mm/madvise: introduce process_madvise() syscall: an external memory hinting API")
+Signed-off-by: Charan Teja Kalla <quic_charante@quicinc.com>
+Cc: Suren Baghdasaryan <surenb@google.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Stephen Rothwell <sfr@canb.auug.org.au>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Nadav Amit <nadav.amit@gmail.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/madvise.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/mm/madvise.c
++++ b/mm/madvise.c
+@@ -1440,8 +1440,7 @@ SYSCALL_DEFINE5(process_madvise, int, pi
+ iov_iter_advance(&iter, iovec.iov_len);
+ }
+
+- if (ret == 0)
+- ret = total_len - iov_iter_count(&iter);
++ ret = (total_len - iov_iter_count(&iter)) ? : ret;
+
+ release_mm:
+ mmput(mm);
--- /dev/null
+From 08095d6310a7ce43256b4251577bc66a25c6e1a6 Mon Sep 17 00:00:00 2001
+From: Charan Teja Kalla <quic_charante@quicinc.com>
+Date: Tue, 22 Mar 2022 14:46:48 -0700
+Subject: mm: madvise: skip unmapped vma holes passed to process_madvise
+
+From: Charan Teja Kalla <quic_charante@quicinc.com>
+
+commit 08095d6310a7ce43256b4251577bc66a25c6e1a6 upstream.
+
+The process_madvise() system call is expected to skip holes in the VMAs
+passed through the 'struct iovec' vector list. But do_madvise, which
+process_madvise() calls for each VMA, returns ENOMEM in case of unmapped
+holes, even though the VMA is processed.
+
+Thus process_madvise() should treat ENOMEM as expected, consider the
+passed VMA as processed, and continue processing the other VMAs in the
+vector list. If -ENOMEM is returned to the user even though the VMA was
+processed, the user is unable to figure out where to start the next
+madvise.
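+
+For example (hypothetical addresses and lengths), with two iovecs where
+the first one covers an unmapped hole:
+
+	struct iovec vec[2] = {
+		{ .iov_base = hole_addr,   .iov_len = len0 },	/* unmapped hole */
+		{ .iov_base = mapped_addr, .iov_len = len1 },
+	};
+
+	/* Before this fix: do_madvise() returns -ENOMEM for vec[0], the loop
+	 * breaks, and vec[1] is never advised.  After it: -ENOMEM is treated
+	 * as processed and vec[1] is still advised. */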
+
+Link: https://lkml.kernel.org/r/4f091776142f2ebf7b94018146de72318474e686.1647008754.git.quic_charante@quicinc.com
+Fixes: ecb8ac8b1f14("mm/madvise: introduce process_madvise() syscall: an external memory hinting API")
+Signed-off-by: Charan Teja Kalla <quic_charante@quicinc.com>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Nadav Amit <nadav.amit@gmail.com>
+Cc: Stephen Rothwell <sfr@canb.auug.org.au>
+Cc: Suren Baghdasaryan <surenb@google.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/madvise.c | 9 ++++++++-
+ 1 file changed, 8 insertions(+), 1 deletion(-)
+
+--- a/mm/madvise.c
++++ b/mm/madvise.c
+@@ -1426,9 +1426,16 @@ SYSCALL_DEFINE5(process_madvise, int, pi
+
+ while (iov_iter_count(&iter)) {
+ iovec = iov_iter_iovec(&iter);
++ /*
++ * do_madvise returns ENOMEM if unmapped holes are present
++ * in the passed VMA. process_madvise() is expected to skip
++ * unmapped holes passed to it in the 'struct iovec' list
++ * and not fail because of them. Thus treat -ENOMEM return
++ * from do_madvise as valid and continue processing.
++ */
+ ret = do_madvise(mm, (unsigned long)iovec.iov_base,
+ iovec.iov_len, behavior);
+- if (ret < 0)
++ if (ret < 0 && ret != -ENOMEM)
+ break;
+ iov_iter_advance(&iter, iovec.iov_len);
+ }
--- /dev/null
+From 734c15700cdf9062ae98d8b131c6fe873dfad26d Mon Sep 17 00:00:00 2001
+From: Oscar Salvador <osalvador@suse.de>
+Date: Tue, 22 Mar 2022 14:47:37 -0700
+Subject: mm: only re-generate demotion targets when a numa node changes its N_CPU state
+
+From: Oscar Salvador <osalvador@suse.de>
+
+commit 734c15700cdf9062ae98d8b131c6fe873dfad26d upstream.
+
+Abhishek reported that after patch [1], hotplug operations are taking
+roughly double the expected time. [2]
+
+The reason is that the CPU callbacks that
+migrate_on_reclaim_init() sets always call set_migration_target_nodes()
+whenever a CPU is brought up/down.
+
+But we only care about numa nodes going from having cpus to becoming
+cpuless, and vice versa, as that influences the demotion_target order.
+
+We do already have two CPU callbacks (vmstat_cpu_online() and
+vmstat_cpu_dead()) that check exactly that, so get rid of the CPU
+callbacks in migrate_on_reclaim_init() and only call
+set_migration_target_nodes() from vmstat_cpu_{dead,online}() whenever a
+numa node changes its N_CPU state.
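+
+Sketched, the online path then only rebuilds the order on a node-level
+N_CPU transition (as in the vmstat.c hunk below):
+
+	static int vmstat_cpu_online(unsigned int cpu)
+	{
+		refresh_zone_stat_thresholds();
+
+		/* first CPU of this node coming up: 0 -> N_CPU transition */
+		if (!node_state(cpu_to_node(cpu), N_CPU)) {
+			node_set_state(cpu_to_node(cpu), N_CPU);
+			set_migration_target_nodes();
+		}
+
+		return 0;
+	}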
+
+[1] https://lore.kernel.org/linux-mm/20210721063926.3024591-2-ying.huang@intel.com/
+[2] https://lore.kernel.org/linux-mm/eb438ddd-2919-73d4-bd9f-b7eecdd9577a@linux.vnet.ibm.com/
+
+[osalvador@suse.de: add feedback from Huang Ying]
+ Link: https://lkml.kernel.org/r/20220314150945.12694-1-osalvador@suse.de
+
+Link: https://lkml.kernel.org/r/20220310120749.23077-1-osalvador@suse.de
+Fixes: 884a6e5d1f93b ("mm/migrate: update node demotion order on hotplug events")
+Signed-off-by: Oscar Salvador <osalvador@suse.de>
+Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
+Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
+Reported-by: Abhishek Goel <huntbag@linux.vnet.ibm.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: "Huang, Ying" <ying.huang@intel.com>
+Cc: Abhishek Goel <huntbag@linux.vnet.ibm.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/migrate.h | 8 ++++++++
+ mm/migrate.c | 47 ++++++++++-------------------------------------
+ mm/vmstat.c | 13 ++++++++++++-
+ 3 files changed, 30 insertions(+), 38 deletions(-)
+
+--- a/include/linux/migrate.h
++++ b/include/linux/migrate.h
+@@ -48,7 +48,15 @@ int folio_migrate_mapping(struct address
+ struct folio *newfolio, struct folio *folio, int extra_count);
+
+ extern bool numa_demotion_enabled;
++extern void migrate_on_reclaim_init(void);
++#ifdef CONFIG_HOTPLUG_CPU
++extern void set_migration_target_nodes(void);
+ #else
++static inline void set_migration_target_nodes(void) {}
++#endif
++#else
++
++static inline void set_migration_target_nodes(void) {}
+
+ static inline void putback_movable_pages(struct list_head *l) {}
+ static inline int migrate_pages(struct list_head *l, new_page_t new,
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -3190,7 +3190,7 @@ again:
+ /*
+ * For callers that do not hold get_online_mems() already.
+ */
+-static void set_migration_target_nodes(void)
++void set_migration_target_nodes(void)
+ {
+ get_online_mems();
+ __set_migration_target_nodes();
+@@ -3254,51 +3254,24 @@ static int __meminit migrate_on_reclaim_
+ return notifier_from_errno(0);
+ }
+
+-/*
+- * React to hotplug events that might affect the migration targets
+- * like events that online or offline NUMA nodes.
+- *
+- * The ordering is also currently dependent on which nodes have
+- * CPUs. That means we need CPU on/offline notification too.
+- */
+-static int migration_online_cpu(unsigned int cpu)
+-{
+- set_migration_target_nodes();
+- return 0;
+-}
+-
+-static int migration_offline_cpu(unsigned int cpu)
+-{
+- set_migration_target_nodes();
+- return 0;
+-}
+-
+-static int __init migrate_on_reclaim_init(void)
++void __init migrate_on_reclaim_init(void)
+ {
+- int ret;
+-
+ node_demotion = kmalloc_array(nr_node_ids,
+ sizeof(struct demotion_nodes),
+ GFP_KERNEL);
+ WARN_ON(!node_demotion);
+
+- ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",
+- NULL, migration_offline_cpu);
++ hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
+ /*
+- * In the unlikely case that this fails, the automatic
+- * migration targets may become suboptimal for nodes
+- * where N_CPU changes. With such a small impact in a
+- * rare case, do not bother trying to do anything special.
++ * At this point, all numa nodes with memory/CPUs have their state
++ * properly set, so we can build the demotion order now.
++ * Let us just hold the cpu_hotplug lock, as we could possibly have
++ * CPU hotplug events during boot.
+ */
+- WARN_ON(ret < 0);
+- ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE, "mm/demotion:online",
+- migration_online_cpu, NULL);
+- WARN_ON(ret < 0);
+-
+- hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
+- return 0;
++ cpus_read_lock();
++ set_migration_target_nodes();
++ cpus_read_unlock();
+ }
+-late_initcall(migrate_on_reclaim_init);
+ #endif /* CONFIG_HOTPLUG_CPU */
+
+ bool numa_demotion_enabled = false;
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -28,6 +28,7 @@
+ #include <linux/mm_inline.h>
+ #include <linux/page_ext.h>
+ #include <linux/page_owner.h>
++#include <linux/migrate.h>
+
+ #include "internal.h"
+
+@@ -2043,7 +2044,12 @@ static void __init init_cpu_node_state(v
+ static int vmstat_cpu_online(unsigned int cpu)
+ {
+ refresh_zone_stat_thresholds();
+- node_set_state(cpu_to_node(cpu), N_CPU);
++
++ if (!node_state(cpu_to_node(cpu), N_CPU)) {
++ node_set_state(cpu_to_node(cpu), N_CPU);
++ set_migration_target_nodes();
++ }
++
+ return 0;
+ }
+
+@@ -2066,6 +2072,8 @@ static int vmstat_cpu_dead(unsigned int
+ return 0;
+
+ node_clear_state(node, N_CPU);
++ set_migration_target_nodes();
++
+ return 0;
+ }
+
+@@ -2097,6 +2105,9 @@ void __init init_mm_internals(void)
+
+ start_shepherd_timer();
+ #endif
++#if defined(CONFIG_MIGRATION) && defined(CONFIG_HOTPLUG_CPU)
++ migrate_on_reclaim_init();
++#endif
+ #ifdef CONFIG_PROC_FS
+ proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
+ proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
--- /dev/null
+From e6b0a7b357659c332231621e4315658d062c23ee Mon Sep 17 00:00:00 2001
+From: Charan Teja Kalla <quic_charante@quicinc.com>
+Date: Fri, 1 Apr 2022 11:28:12 -0700
+Subject: Revert "mm: madvise: skip unmapped vma holes passed to process_madvise"
+
+From: Charan Teja Kalla <quic_charante@quicinc.com>
+
+commit e6b0a7b357659c332231621e4315658d062c23ee upstream.
+
+This reverts commit 08095d6310a7 ("mm: madvise: skip unmapped vma holes
+passed to process_madvise") as process_madvise() fails to return the
+exact processed bytes in other cases too.
+
+As an example: if process_madvise() hits mlocked pages after processing
+some initial bytes passed in [start, end), it just returns EINVAL
+although some bytes are processed. Thus making an exception for ENOMEM
+alone only partially fixes the problem of returning the proper number of
+advised bytes.
+
+Thus revert this patch and return proper bytes advised.
+
+Link: https://lkml.kernel.org/r/e73da1304a88b6a8a11907045117cccf4c2b8374.1648046642.git.quic_charante@quicinc.com
+Fixes: 08095d6310a7ce ("mm: madvise: skip unmapped vma holes passed to process_madvise")
+Signed-off-by: Charan Teja Kalla <quic_charante@quicinc.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Suren Baghdasaryan <surenb@google.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Nadav Amit <nadav.amit@gmail.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/madvise.c | 9 +--------
+ 1 file changed, 1 insertion(+), 8 deletions(-)
+
+--- a/mm/madvise.c
++++ b/mm/madvise.c
+@@ -1426,16 +1426,9 @@ SYSCALL_DEFINE5(process_madvise, int, pi
+
+ while (iov_iter_count(&iter)) {
+ iovec = iov_iter_iovec(&iter);
+- /*
+- * do_madvise returns ENOMEM if unmapped holes are present
+- * in the passed VMA. process_madvise() is expected to skip
+- * unmapped holes passed to it in the 'struct iovec' list
+- * and not fail because of them. Thus treat -ENOMEM return
+- * from do_madvise as valid and continue processing.
+- */
+ ret = do_madvise(mm, (unsigned long)iovec.iov_base,
+ iovec.iov_len, behavior);
+- if (ret < 0 && ret != -ENOMEM)
++ if (ret < 0)
+ break;
+ iov_iter_advance(&iter, iovec.iov_len);
+ }
io_uring-ensure-that-fsnotify-is-always-called.patch
ocfs2-fix-crash-when-mount-with-quota-enabled.patch
drm-simpledrm-add-panel-orientation-property-on-non-upright-mounted-lcd-panels.patch
+mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch
+mm-madvise-return-correct-bytes-advised-with-process_madvise.patch
+revert-mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch
+mm-hwpoison-unmap-poisoned-page-before-invalidation.patch
+mm-only-re-generate-demotion-targets-when-a-numa-node-changes-its-n_cpu-state.patch
+mm-kmemleak-reset-tag-when-compare-object-pointer.patch
+dm-stats-fix-too-short-end-duration_ns-when-using-precise_timestamps.patch
+dm-fix-use-after-free-in-dm_cleanup_zoned_dev.patch
+dm-interlock-pending-dm_io-and-dm_wait_for_bios_completion.patch
+dm-fix-double-accounting-of-flush-with-data.patch
+dm-integrity-set-journal-entry-unused-when-shrinking-device.patch
+tracing-have-trace-event-string-test-handle-zero-length-strings.patch
+drbd-fix-potential-silent-data-corruption.patch
--- /dev/null
+From eca344a7362e0f34f179298fd8366bcd556eede1 Mon Sep 17 00:00:00 2001
+From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
+Date: Wed, 23 Mar 2022 10:32:51 -0400
+Subject: tracing: Have trace event string test handle zero length strings
+
+From: Steven Rostedt (Google) <rostedt@goodmis.org>
+
+commit eca344a7362e0f34f179298fd8366bcd556eede1 upstream.
+
+If a trace event has in its TP_printk():
+
+ "%*.s", len, len ? __get_str(string) : NULL
+
+It is perfectly valid for len to be zero and for NULL to be passed in.
+Unfortunately, the runtime string check at time of reading the trace sees
+the NULL and flags it as a bad string and produces a WARN_ON().
+
+Handle this case by passing the test function whether the format has an
+asterisk (star) and, if so, marking the string safe when the length is
+zero.
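+
+The added check, in essence (as in the hunk below):
+
+	/* A zero length means the string pointer is never dereferenced,
+	 * so even a NULL argument is safe here. */
+	if (star && !len)
+		return true;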
+
+Link: https://lore.kernel.org/all/YjsWzuw5FbWPrdqq@bfoster/
+
+Cc: stable@vger.kernel.org
+Reported-by: Brian Foster <bfoster@redhat.com>
+Tested-by: Brian Foster <bfoster@redhat.com>
+Fixes: 9a6944fee68e2 ("tracing: Add a verifier to check string pointers for trace events")
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/trace/trace.c | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+--- a/kernel/trace/trace.c
++++ b/kernel/trace/trace.c
+@@ -3663,12 +3663,17 @@ static char *trace_iter_expand_format(st
+ }
+
+ /* Returns true if the string is safe to dereference from an event */
+-static bool trace_safe_str(struct trace_iterator *iter, const char *str)
++static bool trace_safe_str(struct trace_iterator *iter, const char *str,
++ bool star, int len)
+ {
+ unsigned long addr = (unsigned long)str;
+ struct trace_event *trace_event;
+ struct trace_event_call *event;
+
++ /* Ignore strings with no length */
++ if (star && !len)
++ return true;
++
+ /* OK if part of the event data */
+ if ((addr >= (unsigned long)iter->ent) &&
+ (addr < (unsigned long)iter->ent + iter->ent_size))
+@@ -3854,7 +3859,7 @@ void trace_check_vprintf(struct trace_it
+ * instead. See samples/trace_events/trace-events-sample.h
+ * for reference.
+ */
+- if (WARN_ONCE(!trace_safe_str(iter, str),
++ if (WARN_ONCE(!trace_safe_str(iter, str, star, len),
+ "fmt: '%s' current_buffer: '%s'",
+ fmt, show_buffer(&iter->seq))) {
+ int ret;