From: Greg Kroah-Hartman Date: Sat, 2 Apr 2022 11:21:05 +0000 (+0200) Subject: 5.17-stable patches X-Git-Tag: v5.17.2~193 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=968e0a5077970199a7e5545170c4535b1c70778a;p=thirdparty%2Fkernel%2Fstable-queue.git 5.17-stable patches added patches: dm-fix-double-accounting-of-flush-with-data.patch dm-fix-use-after-free-in-dm_cleanup_zoned_dev.patch dm-integrity-set-journal-entry-unused-when-shrinking-device.patch dm-interlock-pending-dm_io-and-dm_wait_for_bios_completion.patch dm-stats-fix-too-short-end-duration_ns-when-using-precise_timestamps.patch drbd-fix-potential-silent-data-corruption.patch mm-hwpoison-unmap-poisoned-page-before-invalidation.patch mm-kmemleak-reset-tag-when-compare-object-pointer.patch mm-madvise-return-correct-bytes-advised-with-process_madvise.patch mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch mm-only-re-generate-demotion-targets-when-a-numa-node-changes-its-n_cpu-state.patch revert-mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch tracing-have-trace-event-string-test-handle-zero-length-strings.patch --- diff --git a/queue-5.17/dm-fix-double-accounting-of-flush-with-data.patch b/queue-5.17/dm-fix-double-accounting-of-flush-with-data.patch new file mode 100644 index 00000000000..14a8feb9368 --- /dev/null +++ b/queue-5.17/dm-fix-double-accounting-of-flush-with-data.patch @@ -0,0 +1,140 @@ +From 8d394bc4adf588ca4a0650745167cb83f86c18c9 Mon Sep 17 00:00:00 2001 +From: Mike Snitzer +Date: Thu, 17 Feb 2022 23:39:57 -0500 +Subject: dm: fix double accounting of flush with data + +From: Mike Snitzer + +commit 8d394bc4adf588ca4a0650745167cb83f86c18c9 upstream. + +DM handles a flush with data by first issuing an empty flush and then +once it completes the REQ_PREFLUSH flag is removed and the payload is +issued. The problem fixed by this commit is that both the empty flush +bio and the data payload will account the full extent of the data +payload. 
+ +Fix this by factoring out dm_io_acct() and having it wrap all IO +accounting to set the size of bio with REQ_PREFLUSH to 0, account the +IO, and then restore the original size. + +Cc: stable@vger.kernel.org +Signed-off-by: Mike Snitzer +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/dm-stats.c | 6 ++++-- + drivers/md/dm-stats.h | 2 +- + drivers/md/dm.c | 47 +++++++++++++++++++++++++++++++++-------------- + 3 files changed, 38 insertions(+), 17 deletions(-) + +--- a/drivers/md/dm-stats.c ++++ b/drivers/md/dm-stats.c +@@ -644,13 +644,14 @@ static void __dm_stat_bio(struct dm_stat + + void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw, + sector_t bi_sector, unsigned bi_sectors, bool end, +- unsigned long duration_jiffies, ++ unsigned long start_time, + struct dm_stats_aux *stats_aux) + { + struct dm_stat *s; + sector_t end_sector; + struct dm_stats_last_position *last; + bool got_precise_time; ++ unsigned long duration_jiffies = 0; + + if (unlikely(!bi_sectors)) + return; +@@ -670,7 +671,8 @@ void dm_stats_account_io(struct dm_stats + )); + WRITE_ONCE(last->last_sector, end_sector); + WRITE_ONCE(last->last_rw, bi_rw); +- } ++ } else ++ duration_jiffies = jiffies - start_time; + + rcu_read_lock(); + +--- a/drivers/md/dm-stats.h ++++ b/drivers/md/dm-stats.h +@@ -31,7 +31,7 @@ int dm_stats_message(struct mapped_devic + + void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw, + sector_t bi_sector, unsigned bi_sectors, bool end, +- unsigned long duration_jiffies, ++ unsigned long start_time, + struct dm_stats_aux *aux); + + static inline bool dm_stats_used(struct dm_stats *st) +--- a/drivers/md/dm.c ++++ b/drivers/md/dm.c +@@ -484,29 +484,48 @@ u64 dm_start_time_ns_from_clone(struct b + } + EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone); + +-static void start_io_acct(struct dm_io *io) ++static bool bio_is_flush_with_data(struct bio *bio) + { +- struct mapped_device *md = io->md; +- struct bio *bio = io->orig_bio; ++ return 
((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size); ++} ++ ++static void dm_io_acct(bool end, struct mapped_device *md, struct bio *bio, ++ unsigned long start_time, struct dm_stats_aux *stats_aux) ++{ ++ bool is_flush_with_data; ++ unsigned int bi_size; ++ ++ /* If REQ_PREFLUSH set save any payload but do not account it */ ++ is_flush_with_data = bio_is_flush_with_data(bio); ++ if (is_flush_with_data) { ++ bi_size = bio->bi_iter.bi_size; ++ bio->bi_iter.bi_size = 0; ++ } ++ ++ if (!end) ++ bio_start_io_acct_time(bio, start_time); ++ else ++ bio_end_io_acct(bio, start_time); + +- bio_start_io_acct_time(bio, io->start_time); + if (unlikely(dm_stats_used(&md->stats))) + dm_stats_account_io(&md->stats, bio_data_dir(bio), + bio->bi_iter.bi_sector, bio_sectors(bio), +- false, 0, &io->stats_aux); ++ end, start_time, stats_aux); ++ ++ /* Restore bio's payload so it does get accounted upon requeue */ ++ if (is_flush_with_data) ++ bio->bi_iter.bi_size = bi_size; ++} ++ ++static void start_io_acct(struct dm_io *io) ++{ ++ dm_io_acct(false, io->md, io->orig_bio, io->start_time, &io->stats_aux); + } + + static void end_io_acct(struct mapped_device *md, struct bio *bio, + unsigned long start_time, struct dm_stats_aux *stats_aux) + { +- unsigned long duration = jiffies - start_time; +- +- bio_end_io_acct(bio, start_time); +- +- if (unlikely(dm_stats_used(&md->stats))) +- dm_stats_account_io(&md->stats, bio_data_dir(bio), +- bio->bi_iter.bi_sector, bio_sectors(bio), +- true, duration, stats_aux); ++ dm_io_acct(true, md, bio, start_time, stats_aux); + } + + static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) +@@ -835,7 +854,7 @@ void dm_io_dec_pending(struct dm_io *io, + if (io_error == BLK_STS_DM_REQUEUE) + return; + +- if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) { ++ if (bio_is_flush_with_data(bio)) { + /* + * Preflush done for flush with data, reissue + * without REQ_PREFLUSH. 
diff --git a/queue-5.17/dm-fix-use-after-free-in-dm_cleanup_zoned_dev.patch b/queue-5.17/dm-fix-use-after-free-in-dm_cleanup_zoned_dev.patch new file mode 100644 index 00000000000..c5171c0a6fd --- /dev/null +++ b/queue-5.17/dm-fix-use-after-free-in-dm_cleanup_zoned_dev.patch @@ -0,0 +1,76 @@ +From 588b7f5df0cb64f281290c7672470c006abe7160 Mon Sep 17 00:00:00 2001 +From: Kirill Tkhai +Date: Tue, 1 Feb 2022 11:39:52 +0300 +Subject: dm: fix use-after-free in dm_cleanup_zoned_dev() + +From: Kirill Tkhai + +commit 588b7f5df0cb64f281290c7672470c006abe7160 upstream. + +dm_cleanup_zoned_dev() uses queue, so it must be called +before blk_cleanup_disk() starts its killing: + +blk_cleanup_disk->blk_cleanup_queue()->kobject_put()->blk_release_queue()-> +->...RCU...->blk_free_queue_rcu()->kmem_cache_free() + +Otherwise, RCU callback may be executed first and +dm_cleanup_zoned_dev() will touch free'd memory: + + BUG: KASAN: use-after-free in dm_cleanup_zoned_dev+0x33/0xd0 + Read of size 8 at addr ffff88805ac6e430 by task dmsetup/681 + + CPU: 4 PID: 681 Comm: dmsetup Not tainted 5.17.0-rc2+ #6 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 + Call Trace: + + dump_stack_lvl+0x57/0x7d + print_address_description.constprop.0+0x1f/0x150 + ? dm_cleanup_zoned_dev+0x33/0xd0 + kasan_report.cold+0x7f/0x11b + ? dm_cleanup_zoned_dev+0x33/0xd0 + dm_cleanup_zoned_dev+0x33/0xd0 + __dm_destroy+0x26a/0x400 + ? dm_blk_ioctl+0x230/0x230 + ? up_write+0xd8/0x270 + dev_remove+0x156/0x1d0 + ctl_ioctl+0x269/0x530 + ? table_clear+0x140/0x140 + ? lock_release+0xb2/0x750 + ? remove_all+0x40/0x40 + ? rcu_read_lock_sched_held+0x12/0x70 + ? lock_downgrade+0x3c0/0x3c0 + ? 
rcu_read_lock_sched_held+0x12/0x70 + dm_ctl_ioctl+0xa/0x10 + __x64_sys_ioctl+0xb9/0xf0 + do_syscall_64+0x3b/0x90 + entry_SYSCALL_64_after_hwframe+0x44/0xae + RIP: 0033:0x7fb6dfa95c27 + +Fixes: bb37d77239af ("dm: introduce zone append emulation") +Cc: stable@vger.kernel.org +Signed-off-by: Kirill Tkhai +Reviewed-by: Damien Le Moal +Signed-off-by: Mike Snitzer +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/dm.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/md/dm.c ++++ b/drivers/md/dm.c +@@ -1609,6 +1609,7 @@ static void cleanup_mapped_device(struct + md->dax_dev = NULL; + } + ++ dm_cleanup_zoned_dev(md); + if (md->disk) { + spin_lock(&_minor_lock); + md->disk->private_data = NULL; +@@ -1629,7 +1630,6 @@ static void cleanup_mapped_device(struct + mutex_destroy(&md->swap_bios_lock); + + dm_mq_cleanup_mapped_device(md); +- dm_cleanup_zoned_dev(md); + } + + /* diff --git a/queue-5.17/dm-integrity-set-journal-entry-unused-when-shrinking-device.patch b/queue-5.17/dm-integrity-set-journal-entry-unused-when-shrinking-device.patch new file mode 100644 index 00000000000..0cea50a4fac --- /dev/null +++ b/queue-5.17/dm-integrity-set-journal-entry-unused-when-shrinking-device.patch @@ -0,0 +1,44 @@ +From cc09e8a9dec4f0e8299e80a7a2a8e6f54164a10b Mon Sep 17 00:00:00 2001 +From: Mikulas Patocka +Date: Sat, 26 Mar 2022 10:24:56 -0400 +Subject: dm integrity: set journal entry unused when shrinking device + +From: Mikulas Patocka + +commit cc09e8a9dec4f0e8299e80a7a2a8e6f54164a10b upstream. + +Commit f6f72f32c22c ("dm integrity: don't replay journal data past the +end of the device") skips journal replay if the target sector points +beyond the end of the device. Unfortunately, it doesn't set the +journal entry unused, which resulted in this BUG being triggered: +BUG_ON(!journal_entry_is_unused(je)) + +Fix this by calling journal_entry_set_unused() for this case.
+ +Fixes: f6f72f32c22c ("dm integrity: don't replay journal data past the end of the device") +Cc: stable@vger.kernel.org # v5.7+ +Signed-off-by: Mikulas Patocka +Tested-by: Milan Broz +[snitzer: revised header] +Signed-off-by: Mike Snitzer +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/dm-integrity.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +--- a/drivers/md/dm-integrity.c ++++ b/drivers/md/dm-integrity.c +@@ -2473,9 +2473,11 @@ static void do_journal_write(struct dm_i + dm_integrity_io_error(ic, "invalid sector in journal", -EIO); + sec &= ~(sector_t)(ic->sectors_per_block - 1); + } ++ if (unlikely(sec >= ic->provided_data_sectors)) { ++ journal_entry_set_unused(je); ++ continue; ++ } + } +- if (unlikely(sec >= ic->provided_data_sectors)) +- continue; + get_area_and_offset(ic, sec, &area, &offset); + restore_last_bytes(ic, access_journal_data(ic, i, j), je); + for (k = j + 1; k < ic->journal_section_entries; k++) { diff --git a/queue-5.17/dm-interlock-pending-dm_io-and-dm_wait_for_bios_completion.patch b/queue-5.17/dm-interlock-pending-dm_io-and-dm_wait_for_bios_completion.patch new file mode 100644 index 00000000000..4a36f0f679d --- /dev/null +++ b/queue-5.17/dm-interlock-pending-dm_io-and-dm_wait_for_bios_completion.patch @@ -0,0 +1,139 @@ +From 9f6dc633761006f974701d4c88da71ab68670749 Mon Sep 17 00:00:00 2001 +From: Mike Snitzer +Date: Thu, 17 Feb 2022 23:40:02 -0500 +Subject: dm: interlock pending dm_io and dm_wait_for_bios_completion + +From: Mike Snitzer + +commit 9f6dc633761006f974701d4c88da71ab68670749 upstream. + +Commit d208b89401e0 ("dm: fix mempool NULL pointer race when +completing IO") didn't go far enough. + +When bio_end_io_acct ends the count of in-flight I/Os may reach zero +and the DM device may be suspended. There is a possibility that the +suspend races with dm_stats_account_io. + +Fix this by adding percpu "pending_io" counters to track outstanding +dm_io. Move kicking of suspend queue to dm_io_dec_pending(). 
Also, +rename md_in_flight_bios() to dm_in_flight_bios() and update it to +iterate all pending_io counters. + +Fixes: d208b89401e0 ("dm: fix mempool NULL pointer race when completing IO") +Cc: stable@vger.kernel.org +Co-developed-by: Mikulas Patocka +Signed-off-by: Mikulas Patocka +Signed-off-by: Mike Snitzer +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/dm-core.h | 2 ++ + drivers/md/dm.c | 35 +++++++++++++++++++++++------------ + 2 files changed, 25 insertions(+), 12 deletions(-) + +--- a/drivers/md/dm-core.h ++++ b/drivers/md/dm-core.h +@@ -65,6 +65,8 @@ struct mapped_device { + struct gendisk *disk; + struct dax_device *dax_dev; + ++ unsigned long __percpu *pending_io; ++ + /* + * A list of ios that arrived while we were suspended. + */ +--- a/drivers/md/dm.c ++++ b/drivers/md/dm.c +@@ -507,10 +507,6 @@ static void end_io_acct(struct mapped_de + dm_stats_account_io(&md->stats, bio_data_dir(bio), + bio->bi_iter.bi_sector, bio_sectors(bio), + true, duration, stats_aux); +- +- /* nudge anyone waiting on suspend queue */ +- if (unlikely(wq_has_sleeper(&md->wait))) +- wake_up(&md->wait); + } + + static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio) +@@ -531,6 +527,7 @@ static struct dm_io *alloc_io(struct map + io->magic = DM_IO_MAGIC; + io->status = 0; + atomic_set(&io->io_count, 1); ++ this_cpu_inc(*md->pending_io); + io->orig_bio = bio; + io->md = md; + spin_lock_init(&io->endio_lock); +@@ -828,6 +825,12 @@ void dm_io_dec_pending(struct dm_io *io, + stats_aux = io->stats_aux; + free_io(md, io); + end_io_acct(md, bio, start_time, &stats_aux); ++ smp_wmb(); ++ this_cpu_dec(*md->pending_io); ++ ++ /* nudge anyone waiting on suspend queue */ ++ if (unlikely(wq_has_sleeper(&md->wait))) ++ wake_up(&md->wait); + + if (io_error == BLK_STS_DM_REQUEUE) + return; +@@ -1622,6 +1625,11 @@ static void cleanup_mapped_device(struct + blk_cleanup_disk(md->disk); + } + ++ if (md->pending_io) { ++ free_percpu(md->pending_io); ++ md->pending_io = NULL; ++ } 
++ + cleanup_srcu_struct(&md->io_barrier); + + mutex_destroy(&md->suspend_lock); +@@ -1723,6 +1731,10 @@ static struct mapped_device *alloc_dev(i + if (!md->wq) + goto bad; + ++ md->pending_io = alloc_percpu(unsigned long); ++ if (!md->pending_io) ++ goto bad; ++ + dm_stats_init(&md->stats); + + /* Populate the mapping, nobody knows we exist yet */ +@@ -2130,16 +2142,13 @@ void dm_put(struct mapped_device *md) + } + EXPORT_SYMBOL_GPL(dm_put); + +-static bool md_in_flight_bios(struct mapped_device *md) ++static bool dm_in_flight_bios(struct mapped_device *md) + { + int cpu; +- struct block_device *part = dm_disk(md)->part0; +- long sum = 0; ++ unsigned long sum = 0; + +- for_each_possible_cpu(cpu) { +- sum += part_stat_local_read_cpu(part, in_flight[0], cpu); +- sum += part_stat_local_read_cpu(part, in_flight[1], cpu); +- } ++ for_each_possible_cpu(cpu) ++ sum += *per_cpu_ptr(md->pending_io, cpu); + + return sum != 0; + } +@@ -2152,7 +2161,7 @@ static int dm_wait_for_bios_completion(s + while (true) { + prepare_to_wait(&md->wait, &wait, task_state); + +- if (!md_in_flight_bios(md)) ++ if (!dm_in_flight_bios(md)) + break; + + if (signal_pending_state(task_state, current)) { +@@ -2164,6 +2173,8 @@ static int dm_wait_for_bios_completion(s + } + finish_wait(&md->wait, &wait); + ++ smp_rmb(); ++ + return r; + } + diff --git a/queue-5.17/dm-stats-fix-too-short-end-duration_ns-when-using-precise_timestamps.patch b/queue-5.17/dm-stats-fix-too-short-end-duration_ns-when-using-precise_timestamps.patch new file mode 100644 index 00000000000..963a4eec6ba --- /dev/null +++ b/queue-5.17/dm-stats-fix-too-short-end-duration_ns-when-using-precise_timestamps.patch @@ -0,0 +1,134 @@ +From 0cdb90f0f306384ecbc60dfd6dc48cdbc1f2d0d8 Mon Sep 17 00:00:00 2001 +From: Mike Snitzer +Date: Thu, 17 Feb 2022 23:39:59 -0500 +Subject: dm stats: fix too short end duration_ns when using precise_timestamps + +From: Mike Snitzer + +commit 0cdb90f0f306384ecbc60dfd6dc48cdbc1f2d0d8 upstream. 
+ +dm_stats_account_io()'s STAT_PRECISE_TIMESTAMPS support doesn't handle +the fact that with commit b879f915bc48 ("dm: properly fix redundant +bio-based IO accounting") io->start_time _may_ be in the past (meaning +the start_io_acct() was deferred until later). + +Add a new dm_stats_recalc_precise_timestamps() helper that will +set/clear a new 'precise_timestamps' flag in the dm_stats struct based +on whether any configured stats enable STAT_PRECISE_TIMESTAMPS. +And update DM core's alloc_io() to use dm_stats_record_start() to set +stats_aux.duration_ns if stats->precise_timestamps is true. + +Also, remove unused 'last_sector' and 'last_rw' members from the +dm_stats struct. + +Fixes: b879f915bc48 ("dm: properly fix redundant bio-based IO accounting") +Cc: stable@vger.kernel.org +Co-developed-by: Mikulas Patocka +Signed-off-by: Mikulas Patocka +Signed-off-by: Mike Snitzer +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/dm-stats.c | 28 +++++++++++++++++++++++++--- + drivers/md/dm-stats.h | 9 +++++++-- + drivers/md/dm.c | 2 ++ + 3 files changed, 34 insertions(+), 5 deletions(-) + +--- a/drivers/md/dm-stats.c ++++ b/drivers/md/dm-stats.c +@@ -195,6 +195,7 @@ void dm_stats_init(struct dm_stats *stat + + mutex_init(&stats->mutex); + INIT_LIST_HEAD(&stats->list); ++ stats->precise_timestamps = false; + stats->last = alloc_percpu(struct dm_stats_last_position); + for_each_possible_cpu(cpu) { + last = per_cpu_ptr(stats->last, cpu); +@@ -231,6 +232,22 @@ void dm_stats_cleanup(struct dm_stats *s + mutex_destroy(&stats->mutex); + } + ++static void dm_stats_recalc_precise_timestamps(struct dm_stats *stats) ++{ ++ struct list_head *l; ++ struct dm_stat *tmp_s; ++ bool precise_timestamps = false; ++ ++ list_for_each(l, &stats->list) { ++ tmp_s = container_of(l, struct dm_stat, list_entry); ++ if (tmp_s->stat_flags & STAT_PRECISE_TIMESTAMPS) { ++ precise_timestamps = true; ++ break; ++ } ++ } ++ stats->precise_timestamps = precise_timestamps; ++} ++ + static int 
dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end, + sector_t step, unsigned stat_flags, + unsigned n_histogram_entries, +@@ -376,6 +393,9 @@ static int dm_stats_create(struct dm_sta + } + ret_id = s->id; + list_add_tail_rcu(&s->list_entry, l); ++ ++ dm_stats_recalc_precise_timestamps(stats); ++ + mutex_unlock(&stats->mutex); + + resume_callback(md); +@@ -418,6 +438,9 @@ static int dm_stats_delete(struct dm_sta + } + + list_del_rcu(&s->list_entry); ++ ++ dm_stats_recalc_precise_timestamps(stats); ++ + mutex_unlock(&stats->mutex); + + /* +@@ -654,9 +677,8 @@ void dm_stats_account_io(struct dm_stats + got_precise_time = false; + list_for_each_entry_rcu(s, &stats->list, list_entry) { + if (s->stat_flags & STAT_PRECISE_TIMESTAMPS && !got_precise_time) { +- if (!end) +- stats_aux->duration_ns = ktime_to_ns(ktime_get()); +- else ++ /* start (!end) duration_ns is set by DM core's alloc_io() */ ++ if (end) + stats_aux->duration_ns = ktime_to_ns(ktime_get()) - stats_aux->duration_ns; + got_precise_time = true; + } +--- a/drivers/md/dm-stats.h ++++ b/drivers/md/dm-stats.h +@@ -13,8 +13,7 @@ struct dm_stats { + struct mutex mutex; + struct list_head list; /* list of struct dm_stat */ + struct dm_stats_last_position __percpu *last; +- sector_t last_sector; +- unsigned last_rw; ++ bool precise_timestamps; + }; + + struct dm_stats_aux { +@@ -40,4 +39,10 @@ static inline bool dm_stats_used(struct + return !list_empty(&st->list); + } + ++static inline void dm_stats_record_start(struct dm_stats *stats, struct dm_stats_aux *aux) ++{ ++ if (unlikely(stats->precise_timestamps)) ++ aux->duration_ns = ktime_to_ns(ktime_get()); ++} ++ + #endif +--- a/drivers/md/dm.c ++++ b/drivers/md/dm.c +@@ -537,6 +537,8 @@ static struct dm_io *alloc_io(struct map + + io->start_time = jiffies; + ++ dm_stats_record_start(&md->stats, &io->stats_aux); ++ + return io; + } + diff --git a/queue-5.17/drbd-fix-potential-silent-data-corruption.patch 
b/queue-5.17/drbd-fix-potential-silent-data-corruption.patch new file mode 100644 index 00000000000..99cb0998d4e --- /dev/null +++ b/queue-5.17/drbd-fix-potential-silent-data-corruption.patch @@ -0,0 +1,67 @@ +From f4329d1f848ac35757d9cc5487669d19dfc5979c Mon Sep 17 00:00:00 2001 +From: Lars Ellenberg +Date: Wed, 30 Mar 2022 20:55:51 +0200 +Subject: drbd: fix potential silent data corruption +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Lars Ellenberg + +commit f4329d1f848ac35757d9cc5487669d19dfc5979c upstream. + +Scenario: +--------- + +bio chain generated by blk_queue_split(). +Some split bio fails and propagates its error status to the "parent" bio. +But then the (last part of the) parent bio itself completes without error. + +We would clobber the already recorded error status with BLK_STS_OK, +causing silent data corruption. + +Reproducer: +----------- + +How to trigger this in the real world within seconds: + +DRBD on top of degraded parity raid, +small stripe_cache_size, large read_ahead setting. +Drop page cache (sysctl vm.drop_caches=1, fadvise "DONTNEED", +umount and mount again, "reboot"). + +Cause significant read ahead. + +Large read ahead request is split by blk_queue_split(). +Parts of the read ahead that are already in the stripe cache, +or find an available stripe cache to use, can be serviced. +Parts of the read ahead that would need "too much work", +would need to wait for a "stripe_head" to become available, +are rejected immediately. + +For larger read ahead requests that are split in many pieces, it is very +likely that some "splits" will be serviced, but then the stripe cache is +exhausted/busy, and the remaining ones will be rejected. 
+ +Signed-off-by: Lars Ellenberg +Signed-off-by: Christoph Böhmwalder +Cc: # 4.13.x +Link: https://lore.kernel.org/r/20220330185551.3553196-1-christoph.boehmwalder@linbit.com +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + drivers/block/drbd/drbd_req.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/drivers/block/drbd/drbd_req.c ++++ b/drivers/block/drbd/drbd_req.c +@@ -180,7 +180,8 @@ void start_new_tl_epoch(struct drbd_conn + void complete_master_bio(struct drbd_device *device, + struct bio_and_error *m) + { +- m->bio->bi_status = errno_to_blk_status(m->error); ++ if (unlikely(m->error)) ++ m->bio->bi_status = errno_to_blk_status(m->error); + bio_endio(m->bio); + dec_ap_bio(device); + } diff --git a/queue-5.17/mm-hwpoison-unmap-poisoned-page-before-invalidation.patch b/queue-5.17/mm-hwpoison-unmap-poisoned-page-before-invalidation.patch new file mode 100644 index 00000000000..2cf450d4040 --- /dev/null +++ b/queue-5.17/mm-hwpoison-unmap-poisoned-page-before-invalidation.patch @@ -0,0 +1,67 @@ +From 3149c79f3cb0e2e3bafb7cfadacec090cbd250d3 Mon Sep 17 00:00:00 2001 +From: Rik van Riel +Date: Fri, 1 Apr 2022 11:28:42 -0700 +Subject: mm,hwpoison: unmap poisoned page before invalidation + +From: Rik van Riel + +commit 3149c79f3cb0e2e3bafb7cfadacec090cbd250d3 upstream. + +In some cases it appears the invalidation of a hwpoisoned page fails +because the page is still mapped in another process. This can cause a +program to be continuously restarted and die when it page faults on the +page that was not invalidated. Avoid that problem by unmapping the +hwpoisoned page when we find it. + +Another issue is that sometimes we end up oopsing in finish_fault, if +the code tries to do something with the now-NULL vmf->page. 
I did not +hit this error when submitting the previous patch because there are +several opportunities for alloc_set_pte to bail out before accessing +vmf->page, and that apparently happened on those systems, and most of +the time on other systems, too. + +However, across several million systems that error does occur a handful +of times a day. It can be avoided by returning VM_FAULT_NOPAGE which +will cause do_read_fault to return before calling finish_fault. + +Link: https://lkml.kernel.org/r/20220325161428.5068d97e@imladris.surriel.com +Fixes: e53ac7374e64 ("mm: invalidate hwpoison page cache page in fault path") +Signed-off-by: Rik van Riel +Reviewed-by: Miaohe Lin +Tested-by: Naoya Horiguchi +Reviewed-by: Oscar Salvador +Cc: Mel Gorman +Cc: Johannes Weiner +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/memory.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -3893,14 +3893,18 @@ static vm_fault_t __do_fault(struct vm_f + return ret; + + if (unlikely(PageHWPoison(vmf->page))) { ++ struct page *page = vmf->page; + vm_fault_t poisonret = VM_FAULT_HWPOISON; + if (ret & VM_FAULT_LOCKED) { ++ if (page_mapped(page)) ++ unmap_mapping_pages(page_mapping(page), ++ page->index, 1, false); + /* Retry if a clean page was removed from the cache. 
*/ +- if (invalidate_inode_page(vmf->page)) +- poisonret = 0; +- unlock_page(vmf->page); ++ if (invalidate_inode_page(page)) ++ poisonret = VM_FAULT_NOPAGE; ++ unlock_page(page); + } +- put_page(vmf->page); ++ put_page(page); + vmf->page = NULL; + return poisonret; + } diff --git a/queue-5.17/mm-kmemleak-reset-tag-when-compare-object-pointer.patch b/queue-5.17/mm-kmemleak-reset-tag-when-compare-object-pointer.patch new file mode 100644 index 00000000000..ea7021fb1ea --- /dev/null +++ b/queue-5.17/mm-kmemleak-reset-tag-when-compare-object-pointer.patch @@ -0,0 +1,99 @@ +From bfc8089f00fa526dea983844c880fa8106c33ac4 Mon Sep 17 00:00:00 2001 +From: Kuan-Ying Lee +Date: Fri, 1 Apr 2022 11:28:54 -0700 +Subject: mm/kmemleak: reset tag when compare object pointer + +From: Kuan-Ying Lee + +commit bfc8089f00fa526dea983844c880fa8106c33ac4 upstream. + +When we use HW-tag based kasan and enable vmalloc support, we hit the +following bug. It is due to comparison between tagged object and +non-tagged pointer. + +We need to reset the kasan tag when we need to compare tagged object and +non-tagged pointer. 
+ + kmemleak: [name:kmemleak&]Scan area larger than object 0xffffffe77076f440 + CPU: 4 PID: 1 Comm: init Tainted: G S W 5.15.25-android13-0-g5cacf919c2bc #1 + Hardware name: MT6983(ENG) (DT) + Call trace: + add_scan_area+0xc4/0x244 + kmemleak_scan_area+0x40/0x9c + layout_and_allocate+0x1e8/0x288 + load_module+0x2c8/0xf00 + __se_sys_finit_module+0x190/0x1d0 + __arm64_sys_finit_module+0x20/0x30 + invoke_syscall+0x60/0x170 + el0_svc_common+0xc8/0x114 + do_el0_svc+0x28/0xa0 + el0_svc+0x60/0xf8 + el0t_64_sync_handler+0x88/0xec + el0t_64_sync+0x1b4/0x1b8 + kmemleak: [name:kmemleak&]Object 0xf5ffffe77076b000 (size 32768): + kmemleak: [name:kmemleak&] comm "init", pid 1, jiffies 4294894197 + kmemleak: [name:kmemleak&] min_count = 0 + kmemleak: [name:kmemleak&] count = 0 + kmemleak: [name:kmemleak&] flags = 0x1 + kmemleak: [name:kmemleak&] checksum = 0 + kmemleak: [name:kmemleak&] backtrace: + module_alloc+0x9c/0x120 + move_module+0x34/0x19c + layout_and_allocate+0x1c4/0x288 + load_module+0x2c8/0xf00 + __se_sys_finit_module+0x190/0x1d0 + __arm64_sys_finit_module+0x20/0x30 + invoke_syscall+0x60/0x170 + el0_svc_common+0xc8/0x114 + do_el0_svc+0x28/0xa0 + el0_svc+0x60/0xf8 + el0t_64_sync_handler+0x88/0xec + el0t_64_sync+0x1b4/0x1b8 + +Link: https://lkml.kernel.org/r/20220318034051.30687-1-Kuan-Ying.Lee@mediatek.com +Signed-off-by: Kuan-Ying Lee +Reviewed-by: Catalin Marinas +Cc: Matthias Brugger +Cc: Chinwen Chang +Cc: Nicholas Tang +Cc: Yee Lee +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/kmemleak.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +--- a/mm/kmemleak.c ++++ b/mm/kmemleak.c +@@ -796,6 +796,8 @@ static void add_scan_area(unsigned long + unsigned long flags; + struct kmemleak_object *object; + struct kmemleak_scan_area *area = NULL; ++ unsigned long untagged_ptr; ++ unsigned long untagged_objp; + + object = find_and_get_object(ptr, 1); + if (!object) { +@@ -804,6 +806,9 @@ static 
void add_scan_area(unsigned long + return; + } + ++ untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr); ++ untagged_objp = (unsigned long)kasan_reset_tag((void *)object->pointer); ++ + if (scan_area_cache) + area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); + +@@ -815,8 +820,8 @@ static void add_scan_area(unsigned long + goto out_unlock; + } + if (size == SIZE_MAX) { +- size = object->pointer + object->size - ptr; +- } else if (ptr + size > object->pointer + object->size) { ++ size = untagged_objp + object->size - untagged_ptr; ++ } else if (untagged_ptr + size > untagged_objp + object->size) { + kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); + dump_object_info(object); + kmem_cache_free(scan_area_cache, area); diff --git a/queue-5.17/mm-madvise-return-correct-bytes-advised-with-process_madvise.patch b/queue-5.17/mm-madvise-return-correct-bytes-advised-with-process_madvise.patch new file mode 100644 index 00000000000..c8c41118c22 --- /dev/null +++ b/queue-5.17/mm-madvise-return-correct-bytes-advised-with-process_madvise.patch @@ -0,0 +1,64 @@ +From 5bd009c7c9a9e888077c07535dc0c70aeab242c3 Mon Sep 17 00:00:00 2001 +From: Charan Teja Kalla +Date: Tue, 22 Mar 2022 14:46:44 -0700 +Subject: mm: madvise: return correct bytes advised with process_madvise + +From: Charan Teja Kalla + +commit 5bd009c7c9a9e888077c07535dc0c70aeab242c3 upstream. + +Patch series "mm: madvise: return correct bytes processed with +process_madvise", v2. With the process_madvise(), always choose to return +non zero processed bytes over an error. This can help the user to know on +which VMA, passed in the 'struct iovec' vector list, is failed to advise +thus can take the decision of retrying/skipping on that VMA. + +This patch (of 2): + +The process_madvise() system call returns error even after processing some +VMA's passed in the 'struct iovec' vector list which leaves the user +confused to know where to restart the advise next. 
It is also against +this syscall man page[1] documentation where it mentions that "return +value may be less than the total number of requested bytes, if an error +occurred after some iovec elements were already processed.". + +Consider a user passed 10 VMA's in the 'struct iovec' vector list of which +9 are processed but one. Then it just returns the error caused on that +failed VMA despite the first 9 VMA's processed, leaving the user confused +about on which VMA it is failed. Returning the number of bytes processed +here can help the user to know which VMA it is failed on and thus can +retry/skip the advise on that VMA. + +[1]https://man7.org/linux/man-pages/man2/process_madvise.2.html. + +Link: https://lkml.kernel.org/r/cover.1647008754.git.quic_charante@quicinc.com +Link: https://lkml.kernel.org/r/125b61a0edcee5c2db8658aed9d06a43a19ccafc.1647008754.git.quic_charante@quicinc.com +Fixes: ecb8ac8b1f14("mm/madvise: introduce process_madvise() syscall: an external memory hinting API") +Signed-off-by: Charan Teja Kalla +Cc: Suren Baghdasaryan +Cc: Vlastimil Babka +Cc: David Rientjes +Cc: Stephen Rothwell +Cc: Minchan Kim +Cc: Nadav Amit +Cc: Michal Hocko +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/madvise.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/mm/madvise.c ++++ b/mm/madvise.c +@@ -1440,8 +1440,7 @@ SYSCALL_DEFINE5(process_madvise, int, pi + iov_iter_advance(&iter, iovec.iov_len); + } + +- if (ret == 0) +- ret = total_len - iov_iter_count(&iter); ++ ret = (total_len - iov_iter_count(&iter)) ? 
: ret; + + release_mm: + mmput(mm); diff --git a/queue-5.17/mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch b/queue-5.17/mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch new file mode 100644 index 00000000000..8dbbe8db845 --- /dev/null +++ b/queue-5.17/mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch @@ -0,0 +1,57 @@ +From 08095d6310a7ce43256b4251577bc66a25c6e1a6 Mon Sep 17 00:00:00 2001 +From: Charan Teja Kalla +Date: Tue, 22 Mar 2022 14:46:48 -0700 +Subject: mm: madvise: skip unmapped vma holes passed to process_madvise + +From: Charan Teja Kalla + +commit 08095d6310a7ce43256b4251577bc66a25c6e1a6 upstream. + +The process_madvise() system call is expected to skip holes in vma passed +through 'struct iovec' vector list. But do_madvise, which +process_madvise() calls for each vma, returns ENOMEM in case of unmapped +holes, despite the VMA is processed. + +Thus process_madvise() should treat ENOMEM as expected and consider the +VMA passed to as processed and continue processing other vma's in the +vector list. Returning -ENOMEM to user, despite the VMA is processed, +will be unable to figure out where to start the next madvise. 
+ +Link: https://lkml.kernel.org/r/4f091776142f2ebf7b94018146de72318474e686.1647008754.git.quic_charante@quicinc.com +Fixes: ecb8ac8b1f14("mm/madvise: introduce process_madvise() syscall: an external memory hinting API") +Signed-off-by: Charan Teja Kalla +Cc: David Rientjes +Cc: Michal Hocko +Cc: Minchan Kim +Cc: Nadav Amit +Cc: Stephen Rothwell +Cc: Suren Baghdasaryan +Cc: Vlastimil Babka +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/madvise.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +--- a/mm/madvise.c ++++ b/mm/madvise.c +@@ -1426,9 +1426,16 @@ SYSCALL_DEFINE5(process_madvise, int, pi + + while (iov_iter_count(&iter)) { + iovec = iov_iter_iovec(&iter); ++ /* ++ * do_madvise returns ENOMEM if unmapped holes are present ++ * in the passed VMA. process_madvise() is expected to skip ++ * unmapped holes passed to it in the 'struct iovec' list ++ * and not fail because of them. Thus treat -ENOMEM return ++ * from do_madvise as valid and continue processing. ++ */ + ret = do_madvise(mm, (unsigned long)iovec.iov_base, + iovec.iov_len, behavior); +- if (ret < 0) ++ if (ret < 0 && ret != -ENOMEM) + break; + iov_iter_advance(&iter, iovec.iov_len); + } diff --git a/queue-5.17/mm-only-re-generate-demotion-targets-when-a-numa-node-changes-its-n_cpu-state.patch b/queue-5.17/mm-only-re-generate-demotion-targets-when-a-numa-node-changes-its-n_cpu-state.patch new file mode 100644 index 00000000000..e70fb4b8864 --- /dev/null +++ b/queue-5.17/mm-only-re-generate-demotion-targets-when-a-numa-node-changes-its-n_cpu-state.patch @@ -0,0 +1,183 @@ +From 734c15700cdf9062ae98d8b131c6fe873dfad26d Mon Sep 17 00:00:00 2001 +From: Oscar Salvador +Date: Tue, 22 Mar 2022 14:47:37 -0700 +Subject: mm: only re-generate demotion targets when a numa node changes its N_CPU state + +From: Oscar Salvador + +commit 734c15700cdf9062ae98d8b131c6fe873dfad26d upstream. 
+ +Abhishek reported that after patch [1], hotplug operations are taking +roughly double the expected time. [2] + +The reason behind is that the CPU callbacks that +migrate_on_reclaim_init() sets always call set_migration_target_nodes() +whenever a CPU is brought up/down. + +But we only care about numa nodes going from having cpus to become +cpuless, and vice versa, as that influences the demotion_target order. + +We do already have two CPU callbacks (vmstat_cpu_online() and +vmstat_cpu_dead()) that check exactly that, so get rid of the CPU +callbacks in migrate_on_reclaim_init() and only call +set_migration_target_nodes() from vmstat_cpu_{dead,online}() whenever a +numa node change its N_CPU state. + +[1] https://lore.kernel.org/linux-mm/20210721063926.3024591-2-ying.huang@intel.com/ +[2] https://lore.kernel.org/linux-mm/eb438ddd-2919-73d4-bd9f-b7eecdd9577a@linux.vnet.ibm.com/ + +[osalvador@suse.de: add feedback from Huang Ying] + Link: https://lkml.kernel.org/r/20220314150945.12694-1-osalvador@suse.de + +Link: https://lkml.kernel.org/r/20220310120749.23077-1-osalvador@suse.de +Fixes: 884a6e5d1f93b ("mm/migrate: update node demotion order on hotplug events") +Signed-off-by: Oscar Salvador +Reviewed-by: Baolin Wang +Tested-by: Baolin Wang +Reported-by: Abhishek Goel +Cc: Dave Hansen +Cc: "Huang, Ying" +Cc: Abhishek Goel +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/migrate.h | 8 ++++++++ + mm/migrate.c | 47 ++++++++++------------------------------------- + mm/vmstat.c | 13 ++++++++++++- + 3 files changed, 30 insertions(+), 38 deletions(-) + +--- a/include/linux/migrate.h ++++ b/include/linux/migrate.h +@@ -48,7 +48,15 @@ int folio_migrate_mapping(struct address + struct folio *newfolio, struct folio *folio, int extra_count); + + extern bool numa_demotion_enabled; ++extern void migrate_on_reclaim_init(void); ++#ifdef CONFIG_HOTPLUG_CPU ++extern void set_migration_target_nodes(void); + #else 
++static inline void set_migration_target_nodes(void) {} ++#endif ++#else ++ ++static inline void set_migration_target_nodes(void) {} + + static inline void putback_movable_pages(struct list_head *l) {} + static inline int migrate_pages(struct list_head *l, new_page_t new, +--- a/mm/migrate.c ++++ b/mm/migrate.c +@@ -3190,7 +3190,7 @@ again: + /* + * For callers that do not hold get_online_mems() already. + */ +-static void set_migration_target_nodes(void) ++void set_migration_target_nodes(void) + { + get_online_mems(); + __set_migration_target_nodes(); +@@ -3254,51 +3254,24 @@ static int __meminit migrate_on_reclaim_ + return notifier_from_errno(0); + } + +-/* +- * React to hotplug events that might affect the migration targets +- * like events that online or offline NUMA nodes. +- * +- * The ordering is also currently dependent on which nodes have +- * CPUs. That means we need CPU on/offline notification too. +- */ +-static int migration_online_cpu(unsigned int cpu) +-{ +- set_migration_target_nodes(); +- return 0; +-} +- +-static int migration_offline_cpu(unsigned int cpu) +-{ +- set_migration_target_nodes(); +- return 0; +-} +- +-static int __init migrate_on_reclaim_init(void) ++void __init migrate_on_reclaim_init(void) + { +- int ret; +- + node_demotion = kmalloc_array(nr_node_ids, + sizeof(struct demotion_nodes), + GFP_KERNEL); + WARN_ON(!node_demotion); + +- ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline", +- NULL, migration_offline_cpu); ++ hotplug_memory_notifier(migrate_on_reclaim_callback, 100); + /* +- * In the unlikely case that this fails, the automatic +- * migration targets may become suboptimal for nodes +- * where N_CPU changes. With such a small impact in a +- * rare case, do not bother trying to do anything special. ++ * At this point, all numa nodes with memory/CPus have their state ++ * properly set, so we can build the demotion order now. 
++ * Let us hold the cpu_hotplug lock just, as we could possibily have ++ * CPU hotplug events during boot. + */ +- WARN_ON(ret < 0); +- ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE, "mm/demotion:online", +- migration_online_cpu, NULL); +- WARN_ON(ret < 0); +- +- hotplug_memory_notifier(migrate_on_reclaim_callback, 100); +- return 0; ++ cpus_read_lock(); ++ set_migration_target_nodes(); ++ cpus_read_unlock(); + } +-late_initcall(migrate_on_reclaim_init); + #endif /* CONFIG_HOTPLUG_CPU */ + + bool numa_demotion_enabled = false; +--- a/mm/vmstat.c ++++ b/mm/vmstat.c +@@ -28,6 +28,7 @@ + #include + #include + #include ++#include + + #include "internal.h" + +@@ -2043,7 +2044,12 @@ static void __init init_cpu_node_state(v + static int vmstat_cpu_online(unsigned int cpu) + { + refresh_zone_stat_thresholds(); +- node_set_state(cpu_to_node(cpu), N_CPU); ++ ++ if (!node_state(cpu_to_node(cpu), N_CPU)) { ++ node_set_state(cpu_to_node(cpu), N_CPU); ++ set_migration_target_nodes(); ++ } ++ + return 0; + } + +@@ -2066,6 +2072,8 @@ static int vmstat_cpu_dead(unsigned int + return 0; + + node_clear_state(node, N_CPU); ++ set_migration_target_nodes(); ++ + return 0; + } + +@@ -2097,6 +2105,9 @@ void __init init_mm_internals(void) + + start_shepherd_timer(); + #endif ++#if defined(CONFIG_MIGRATION) && defined(CONFIG_HOTPLUG_CPU) ++ migrate_on_reclaim_init(); ++#endif + #ifdef CONFIG_PROC_FS + proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op); + proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op); diff --git a/queue-5.17/revert-mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch b/queue-5.17/revert-mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch new file mode 100644 index 00000000000..3e355823dfd --- /dev/null +++ b/queue-5.17/revert-mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch @@ -0,0 +1,57 @@ +From e6b0a7b357659c332231621e4315658d062c23ee Mon Sep 17 00:00:00 2001 +From: Charan Teja Kalla +Date: Fri, 
1 Apr 2022 11:28:12 -0700 +Subject: Revert "mm: madvise: skip unmapped vma holes passed to process_madvise" + +From: Charan Teja Kalla + +commit e6b0a7b357659c332231621e4315658d062c23ee upstream. + +This reverts commit 08095d6310a7 ("mm: madvise: skip unmapped vma holes +passed to process_madvise") as process_madvise() fails to return the +exact processed bytes in other cases too. + +As an example: if process_madvise() hits mlocked pages after processing +some initial bytes passed in [start, end), it just returns EINVAL +although some bytes are processed. Thus making an exception only for +ENOMEM is partially fixing the problem of returning the proper advised +bytes. + +Thus revert this patch and return proper bytes advised. + +Link: https://lkml.kernel.org/r/e73da1304a88b6a8a11907045117cccf4c2b8374.1648046642.git.quic_charante@quicinc.com +Fixes: 08095d6310a7ce ("mm: madvise: skip unmapped vma holes passed to process_madvise") +Signed-off-by: Charan Teja Kalla +Acked-by: Michal Hocko +Cc: Suren Baghdasaryan +Cc: Vlastimil Babka +Cc: David Rientjes +Cc: Nadav Amit +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/madvise.c | 9 +-------- + 1 file changed, 1 insertion(+), 8 deletions(-) + +--- a/mm/madvise.c ++++ b/mm/madvise.c +@@ -1426,16 +1426,9 @@ SYSCALL_DEFINE5(process_madvise, int, pi + + while (iov_iter_count(&iter)) { + iovec = iov_iter_iovec(&iter); +- /* +- * do_madvise returns ENOMEM if unmapped holes are present +- * in the passed VMA. process_madvise() is expected to skip +- * unmapped holes passed to it in the 'struct iovec' list +- * and not fail because of them. Thus treat -ENOMEM return +- * from do_madvise as valid and continue processing. 
+- */ + ret = do_madvise(mm, (unsigned long)iovec.iov_base, + iovec.iov_len, behavior); +- if (ret < 0 && ret != -ENOMEM) ++ if (ret < 0) + break; + iov_iter_advance(&iter, iovec.iov_len); + } diff --git a/queue-5.17/series b/queue-5.17/series index 773df2de633..ad0ac27e51d 100644 --- a/queue-5.17/series +++ b/queue-5.17/series @@ -102,3 +102,16 @@ rtc-pl031-fix-rtc-features-null-pointer-dereference.patch io_uring-ensure-that-fsnotify-is-always-called.patch ocfs2-fix-crash-when-mount-with-quota-enabled.patch drm-simpledrm-add-panel-orientation-property-on-non-upright-mounted-lcd-panels.patch +mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch +mm-madvise-return-correct-bytes-advised-with-process_madvise.patch +revert-mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch +mm-hwpoison-unmap-poisoned-page-before-invalidation.patch +mm-only-re-generate-demotion-targets-when-a-numa-node-changes-its-n_cpu-state.patch +mm-kmemleak-reset-tag-when-compare-object-pointer.patch +dm-stats-fix-too-short-end-duration_ns-when-using-precise_timestamps.patch +dm-fix-use-after-free-in-dm_cleanup_zoned_dev.patch +dm-interlock-pending-dm_io-and-dm_wait_for_bios_completion.patch +dm-fix-double-accounting-of-flush-with-data.patch +dm-integrity-set-journal-entry-unused-when-shrinking-device.patch +tracing-have-trace-event-string-test-handle-zero-length-strings.patch +drbd-fix-potential-silent-data-corruption.patch diff --git a/queue-5.17/tracing-have-trace-event-string-test-handle-zero-length-strings.patch b/queue-5.17/tracing-have-trace-event-string-test-handle-zero-length-strings.patch new file mode 100644 index 00000000000..28c2b62ebbd --- /dev/null +++ b/queue-5.17/tracing-have-trace-event-string-test-handle-zero-length-strings.patch @@ -0,0 +1,62 @@ +From eca344a7362e0f34f179298fd8366bcd556eede1 Mon Sep 17 00:00:00 2001 +From: "Steven Rostedt (Google)" +Date: Wed, 23 Mar 2022 10:32:51 -0400 +Subject: tracing: Have trace event string test handle zero 
length strings + +From: Steven Rostedt (Google) + +commit eca344a7362e0f34f179298fd8366bcd556eede1 upstream. + +If a trace event has in its TP_printk(): + + "%*.s", len, len ? __get_str(string) : NULL + +It is perfectly valid if len is zero and passing in the NULL. +Unfortunately, the runtime string check at time of reading the trace sees +the NULL and flags it as a bad string and produces a WARN_ON(). + +Handle this case by passing into the test function if the format has an +asterisk (star) and if so, if the length is zero, then mark it as safe. + +Link: https://lore.kernel.org/all/YjsWzuw5FbWPrdqq@bfoster/ + +Cc: stable@vger.kernel.org +Reported-by: Brian Foster +Tested-by: Brian Foster +Fixes: 9a6944fee68e2 ("tracing: Add a verifier to check string pointers for trace events") +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/trace/trace.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -3663,12 +3663,17 @@ static char *trace_iter_expand_format(st + } + + /* Returns true if the string is safe to dereference from an event */ +-static bool trace_safe_str(struct trace_iterator *iter, const char *str) ++static bool trace_safe_str(struct trace_iterator *iter, const char *str, ++ bool star, int len) + { + unsigned long addr = (unsigned long)str; + struct trace_event *trace_event; + struct trace_event_call *event; + ++ /* Ignore strings with no length */ ++ if (star && !len) ++ return true; ++ + /* OK if part of the event data */ + if ((addr >= (unsigned long)iter->ent) && + (addr < (unsigned long)iter->ent + iter->ent_size)) +@@ -3854,7 +3859,7 @@ void trace_check_vprintf(struct trace_it + * instead. See samples/trace_events/trace-events-sample.h + * for reference. + */ +- if (WARN_ONCE(!trace_safe_str(iter, str), ++ if (WARN_ONCE(!trace_safe_str(iter, str, star, len), + "fmt: '%s' current_buffer: '%s'", + fmt, show_buffer(&iter->seq))) { + int ret;