5.17-stable patches
author    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Sat, 2 Apr 2022 11:21:05 +0000 (13:21 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Sat, 2 Apr 2022 11:21:05 +0000 (13:21 +0200)
added patches:
dm-fix-double-accounting-of-flush-with-data.patch
dm-fix-use-after-free-in-dm_cleanup_zoned_dev.patch
dm-integrity-set-journal-entry-unused-when-shrinking-device.patch
dm-interlock-pending-dm_io-and-dm_wait_for_bios_completion.patch
dm-stats-fix-too-short-end-duration_ns-when-using-precise_timestamps.patch
drbd-fix-potential-silent-data-corruption.patch
mm-hwpoison-unmap-poisoned-page-before-invalidation.patch
mm-kmemleak-reset-tag-when-compare-object-pointer.patch
mm-madvise-return-correct-bytes-advised-with-process_madvise.patch
mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch
mm-only-re-generate-demotion-targets-when-a-numa-node-changes-its-n_cpu-state.patch
revert-mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch
tracing-have-trace-event-string-test-handle-zero-length-strings.patch

14 files changed:
queue-5.17/dm-fix-double-accounting-of-flush-with-data.patch [new file with mode: 0644]
queue-5.17/dm-fix-use-after-free-in-dm_cleanup_zoned_dev.patch [new file with mode: 0644]
queue-5.17/dm-integrity-set-journal-entry-unused-when-shrinking-device.patch [new file with mode: 0644]
queue-5.17/dm-interlock-pending-dm_io-and-dm_wait_for_bios_completion.patch [new file with mode: 0644]
queue-5.17/dm-stats-fix-too-short-end-duration_ns-when-using-precise_timestamps.patch [new file with mode: 0644]
queue-5.17/drbd-fix-potential-silent-data-corruption.patch [new file with mode: 0644]
queue-5.17/mm-hwpoison-unmap-poisoned-page-before-invalidation.patch [new file with mode: 0644]
queue-5.17/mm-kmemleak-reset-tag-when-compare-object-pointer.patch [new file with mode: 0644]
queue-5.17/mm-madvise-return-correct-bytes-advised-with-process_madvise.patch [new file with mode: 0644]
queue-5.17/mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch [new file with mode: 0644]
queue-5.17/mm-only-re-generate-demotion-targets-when-a-numa-node-changes-its-n_cpu-state.patch [new file with mode: 0644]
queue-5.17/revert-mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch [new file with mode: 0644]
queue-5.17/series
queue-5.17/tracing-have-trace-event-string-test-handle-zero-length-strings.patch [new file with mode: 0644]

diff --git a/queue-5.17/dm-fix-double-accounting-of-flush-with-data.patch b/queue-5.17/dm-fix-double-accounting-of-flush-with-data.patch
new file mode 100644 (file)
index 0000000..14a8feb
--- /dev/null
@@ -0,0 +1,140 @@
+From 8d394bc4adf588ca4a0650745167cb83f86c18c9 Mon Sep 17 00:00:00 2001
+From: Mike Snitzer <snitzer@redhat.com>
+Date: Thu, 17 Feb 2022 23:39:57 -0500
+Subject: dm: fix double accounting of flush with data
+
+From: Mike Snitzer <snitzer@redhat.com>
+
+commit 8d394bc4adf588ca4a0650745167cb83f86c18c9 upstream.
+
+DM handles a flush with data by first issuing an empty flush and then,
+once it completes, removing the REQ_PREFLUSH flag and issuing the
+payload.  The problem fixed by this commit is that both the empty flush
+bio and the data payload will account the full extent of the data
+payload.
+
+Fix this by factoring out dm_io_acct() and having it wrap all IO
+accounting to set the size of a bio with REQ_PREFLUSH to 0, account the
+IO, and then restore the original size.
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Mike Snitzer <snitzer@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/md/dm-stats.c |    6 ++++--
+ drivers/md/dm-stats.h |    2 +-
+ drivers/md/dm.c       |   47 +++++++++++++++++++++++++++++++++--------------
+ 3 files changed, 38 insertions(+), 17 deletions(-)
+
+--- a/drivers/md/dm-stats.c
++++ b/drivers/md/dm-stats.c
+@@ -644,13 +644,14 @@ static void __dm_stat_bio(struct dm_stat
+ void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
+                        sector_t bi_sector, unsigned bi_sectors, bool end,
+-                       unsigned long duration_jiffies,
++                       unsigned long start_time,
+                        struct dm_stats_aux *stats_aux)
+ {
+       struct dm_stat *s;
+       sector_t end_sector;
+       struct dm_stats_last_position *last;
+       bool got_precise_time;
++      unsigned long duration_jiffies = 0;
+       if (unlikely(!bi_sectors))
+               return;
+@@ -670,7 +671,8 @@ void dm_stats_account_io(struct dm_stats
+                                      ));
+               WRITE_ONCE(last->last_sector, end_sector);
+               WRITE_ONCE(last->last_rw, bi_rw);
+-      }
++      } else
++              duration_jiffies = jiffies - start_time;
+       rcu_read_lock();
+--- a/drivers/md/dm-stats.h
++++ b/drivers/md/dm-stats.h
+@@ -31,7 +31,7 @@ int dm_stats_message(struct mapped_devic
+ void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
+                        sector_t bi_sector, unsigned bi_sectors, bool end,
+-                       unsigned long duration_jiffies,
++                       unsigned long start_time,
+                        struct dm_stats_aux *aux);
+ static inline bool dm_stats_used(struct dm_stats *st)
+--- a/drivers/md/dm.c
++++ b/drivers/md/dm.c
+@@ -484,29 +484,48 @@ u64 dm_start_time_ns_from_clone(struct b
+ }
+ EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
+-static void start_io_acct(struct dm_io *io)
++static bool bio_is_flush_with_data(struct bio *bio)
+ {
+-      struct mapped_device *md = io->md;
+-      struct bio *bio = io->orig_bio;
++      return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size);
++}
++
++static void dm_io_acct(bool end, struct mapped_device *md, struct bio *bio,
++                     unsigned long start_time, struct dm_stats_aux *stats_aux)
++{
++      bool is_flush_with_data;
++      unsigned int bi_size;
++
++      /* If REQ_PREFLUSH set save any payload but do not account it */
++      is_flush_with_data = bio_is_flush_with_data(bio);
++      if (is_flush_with_data) {
++              bi_size = bio->bi_iter.bi_size;
++              bio->bi_iter.bi_size = 0;
++      }
++
++      if (!end)
++              bio_start_io_acct_time(bio, start_time);
++      else
++              bio_end_io_acct(bio, start_time);
+-      bio_start_io_acct_time(bio, io->start_time);
+       if (unlikely(dm_stats_used(&md->stats)))
+               dm_stats_account_io(&md->stats, bio_data_dir(bio),
+                                   bio->bi_iter.bi_sector, bio_sectors(bio),
+-                                  false, 0, &io->stats_aux);
++                                  end, start_time, stats_aux);
++
++      /* Restore bio's payload so it does get accounted upon requeue */
++      if (is_flush_with_data)
++              bio->bi_iter.bi_size = bi_size;
++}
++
++static void start_io_acct(struct dm_io *io)
++{
++      dm_io_acct(false, io->md, io->orig_bio, io->start_time, &io->stats_aux);
+ }
+ static void end_io_acct(struct mapped_device *md, struct bio *bio,
+                       unsigned long start_time, struct dm_stats_aux *stats_aux)
+ {
+-      unsigned long duration = jiffies - start_time;
+-
+-      bio_end_io_acct(bio, start_time);
+-
+-      if (unlikely(dm_stats_used(&md->stats)))
+-              dm_stats_account_io(&md->stats, bio_data_dir(bio),
+-                                  bio->bi_iter.bi_sector, bio_sectors(bio),
+-                                  true, duration, stats_aux);
++      dm_io_acct(true, md, bio, start_time, stats_aux);
+ }
+ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
+@@ -835,7 +854,7 @@ void dm_io_dec_pending(struct dm_io *io,
+               if (io_error == BLK_STS_DM_REQUEUE)
+                       return;
+-              if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
++              if (bio_is_flush_with_data(bio)) {
+                       /*
+                        * Preflush done for flush with data, reissue
+                        * without REQ_PREFLUSH.
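
For readers outside the DM code base, the save/zero/restore accounting
pattern introduced above can be sketched in plain user-space C; all names
below are hypothetical stand-ins, not kernel API:

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical stand-in for the bits of struct bio involved. */
    struct bio_like {
        bool preflush;      /* REQ_PREFLUSH set?     */
        unsigned int size;  /* payload size in bytes */
    };

    /*
     * Account an I/O.  The empty-flush half of a flush-with-data is
     * accounted with size 0 so the payload is only counted once,
     * when it is reissued without the flush flag.
     */
    static void io_acct(struct bio_like *bio, bool end)
    {
        bool flush_with_data = bio->preflush && bio->size;
        unsigned int saved = 0;

        if (flush_with_data) {
            saved = bio->size;
            bio->size = 0;          /* don't account the payload yet */
        }

        printf("%s: %u bytes\n", end ? "end" : "start", bio->size);

        if (flush_with_data)
            bio->size = saved;      /* restore for the requeue */
    }

    int main(void)
    {
        struct bio_like b = { .preflush = true, .size = 4096 };

        io_acct(&b, false);     /* empty-flush phase: 0 bytes     */
        b.preflush = false;     /* payload reissued without flush */
        io_acct(&b, true);      /* payload phase: 4096 bytes      */
        return 0;
    }
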
diff --git a/queue-5.17/dm-fix-use-after-free-in-dm_cleanup_zoned_dev.patch b/queue-5.17/dm-fix-use-after-free-in-dm_cleanup_zoned_dev.patch
new file mode 100644 (file)
index 0000000..c5171c0
--- /dev/null
@@ -0,0 +1,76 @@
+From 588b7f5df0cb64f281290c7672470c006abe7160 Mon Sep 17 00:00:00 2001
+From: Kirill Tkhai <ktkhai@virtuozzo.com>
+Date: Tue, 1 Feb 2022 11:39:52 +0300
+Subject: dm: fix use-after-free in dm_cleanup_zoned_dev()
+
+From: Kirill Tkhai <ktkhai@virtuozzo.com>
+
+commit 588b7f5df0cb64f281290c7672470c006abe7160 upstream.
+
+dm_cleanup_zoned_dev() uses the queue, so it must be called
+before blk_cleanup_disk() starts its killing:
+
+blk_cleanup_disk->blk_cleanup_queue()->kobject_put()->blk_release_queue()->
+->...RCU...->blk_free_queue_rcu()->kmem_cache_free()
+
+Otherwise, the RCU callback may be executed first and
+dm_cleanup_zoned_dev() will touch freed memory:
+
+ BUG: KASAN: use-after-free in dm_cleanup_zoned_dev+0x33/0xd0
+ Read of size 8 at addr ffff88805ac6e430 by task dmsetup/681
+
+ CPU: 4 PID: 681 Comm: dmsetup Not tainted 5.17.0-rc2+ #6
+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
+ Call Trace:
+  <TASK>
+  dump_stack_lvl+0x57/0x7d
+  print_address_description.constprop.0+0x1f/0x150
+  ? dm_cleanup_zoned_dev+0x33/0xd0
+  kasan_report.cold+0x7f/0x11b
+  ? dm_cleanup_zoned_dev+0x33/0xd0
+  dm_cleanup_zoned_dev+0x33/0xd0
+  __dm_destroy+0x26a/0x400
+  ? dm_blk_ioctl+0x230/0x230
+  ? up_write+0xd8/0x270
+  dev_remove+0x156/0x1d0
+  ctl_ioctl+0x269/0x530
+  ? table_clear+0x140/0x140
+  ? lock_release+0xb2/0x750
+  ? remove_all+0x40/0x40
+  ? rcu_read_lock_sched_held+0x12/0x70
+  ? lock_downgrade+0x3c0/0x3c0
+  ? rcu_read_lock_sched_held+0x12/0x70
+  dm_ctl_ioctl+0xa/0x10
+  __x64_sys_ioctl+0xb9/0xf0
+  do_syscall_64+0x3b/0x90
+  entry_SYSCALL_64_after_hwframe+0x44/0xae
+ RIP: 0033:0x7fb6dfa95c27
+
+Fixes: bb37d77239af ("dm: introduce zone append emulation")
+Cc: stable@vger.kernel.org
+Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
+Reviewed-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
+Signed-off-by: Mike Snitzer <snitzer@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/md/dm.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/md/dm.c
++++ b/drivers/md/dm.c
+@@ -1609,6 +1609,7 @@ static void cleanup_mapped_device(struct
+               md->dax_dev = NULL;
+       }
++      dm_cleanup_zoned_dev(md);
+       if (md->disk) {
+               spin_lock(&_minor_lock);
+               md->disk->private_data = NULL;
+@@ -1629,7 +1630,6 @@ static void cleanup_mapped_device(struct
+       mutex_destroy(&md->swap_bios_lock);
+       dm_mq_cleanup_mapped_device(md);
+-      dm_cleanup_zoned_dev(md);
+ }
+ /*
diff --git a/queue-5.17/dm-integrity-set-journal-entry-unused-when-shrinking-device.patch b/queue-5.17/dm-integrity-set-journal-entry-unused-when-shrinking-device.patch
new file mode 100644 (file)
index 0000000..0cea50a
--- /dev/null
@@ -0,0 +1,44 @@
+From cc09e8a9dec4f0e8299e80a7a2a8e6f54164a10b Mon Sep 17 00:00:00 2001
+From: Mikulas Patocka <mpatocka@redhat.com>
+Date: Sat, 26 Mar 2022 10:24:56 -0400
+Subject: dm integrity: set journal entry unused when shrinking device
+
+From: Mikulas Patocka <mpatocka@redhat.com>
+
+commit cc09e8a9dec4f0e8299e80a7a2a8e6f54164a10b upstream.
+
+Commit f6f72f32c22c ("dm integrity: don't replay journal data past the
+end of the device") skips journal replay if the target sector points
+beyond the end of the device. Unfortunately, it doesn't set the
+journal entry unused, which resulted in this BUG being triggered:
+BUG_ON(!journal_entry_is_unused(je))
+
+Fix this by calling journal_entry_set_unused() for this case.
+
+Fixes: f6f72f32c22c ("dm integrity: don't replay journal data past the end of the device")
+Cc: stable@vger.kernel.org # v5.7+
+Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
+Tested-by: Milan Broz <gmazyland@gmail.com>
+[snitzer: revised header]
+Signed-off-by: Mike Snitzer <snitzer@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/md/dm-integrity.c |    6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/drivers/md/dm-integrity.c
++++ b/drivers/md/dm-integrity.c
+@@ -2473,9 +2473,11 @@ static void do_journal_write(struct dm_i
+                                       dm_integrity_io_error(ic, "invalid sector in journal", -EIO);
+                                       sec &= ~(sector_t)(ic->sectors_per_block - 1);
+                               }
++                              if (unlikely(sec >= ic->provided_data_sectors)) {
++                                      journal_entry_set_unused(je);
++                                      continue;
++                              }
+                       }
+-                      if (unlikely(sec >= ic->provided_data_sectors))
+-                              continue;
+                       get_area_and_offset(ic, sec, &area, &offset);
+                       restore_last_bytes(ic, access_journal_data(ic, i, j), je);
+                       for (k = j + 1; k < ic->journal_section_entries; k++) {
diff --git a/queue-5.17/dm-interlock-pending-dm_io-and-dm_wait_for_bios_completion.patch b/queue-5.17/dm-interlock-pending-dm_io-and-dm_wait_for_bios_completion.patch
new file mode 100644 (file)
index 0000000..4a36f0f
--- /dev/null
@@ -0,0 +1,139 @@
+From 9f6dc633761006f974701d4c88da71ab68670749 Mon Sep 17 00:00:00 2001
+From: Mike Snitzer <snitzer@redhat.com>
+Date: Thu, 17 Feb 2022 23:40:02 -0500
+Subject: dm: interlock pending dm_io and dm_wait_for_bios_completion
+
+From: Mike Snitzer <snitzer@redhat.com>
+
+commit 9f6dc633761006f974701d4c88da71ab68670749 upstream.
+
+Commit d208b89401e0 ("dm: fix mempool NULL pointer race when
+completing IO") didn't go far enough.
+
+When bio_end_io_acct ends, the count of in-flight I/Os may reach zero
+and the DM device may be suspended. There is a possibility that the
+suspend races with dm_stats_account_io.
+
+Fix this by adding percpu "pending_io" counters to track outstanding
+dm_io. Move kicking of suspend queue to dm_io_dec_pending(). Also,
+rename md_in_flight_bios() to dm_in_flight_bios() and update it to
+iterate all pending_io counters.
+
+Fixes: d208b89401e0 ("dm: fix mempool NULL pointer race when completing IO")
+Cc: stable@vger.kernel.org
+Co-developed-by: Mikulas Patocka <mpatocka@redhat.com>
+Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
+Signed-off-by: Mike Snitzer <snitzer@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/md/dm-core.h |    2 ++
+ drivers/md/dm.c      |   35 +++++++++++++++++++++++------------
+ 2 files changed, 25 insertions(+), 12 deletions(-)
+
+--- a/drivers/md/dm-core.h
++++ b/drivers/md/dm-core.h
+@@ -65,6 +65,8 @@ struct mapped_device {
+       struct gendisk *disk;
+       struct dax_device *dax_dev;
++      unsigned long __percpu *pending_io;
++
+       /*
+        * A list of ios that arrived while we were suspended.
+        */
+--- a/drivers/md/dm.c
++++ b/drivers/md/dm.c
+@@ -507,10 +507,6 @@ static void end_io_acct(struct mapped_de
+               dm_stats_account_io(&md->stats, bio_data_dir(bio),
+                                   bio->bi_iter.bi_sector, bio_sectors(bio),
+                                   true, duration, stats_aux);
+-
+-      /* nudge anyone waiting on suspend queue */
+-      if (unlikely(wq_has_sleeper(&md->wait)))
+-              wake_up(&md->wait);
+ }
+ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
+@@ -531,6 +527,7 @@ static struct dm_io *alloc_io(struct map
+       io->magic = DM_IO_MAGIC;
+       io->status = 0;
+       atomic_set(&io->io_count, 1);
++      this_cpu_inc(*md->pending_io);
+       io->orig_bio = bio;
+       io->md = md;
+       spin_lock_init(&io->endio_lock);
+@@ -828,6 +825,12 @@ void dm_io_dec_pending(struct dm_io *io,
+               stats_aux = io->stats_aux;
+               free_io(md, io);
+               end_io_acct(md, bio, start_time, &stats_aux);
++              smp_wmb();
++              this_cpu_dec(*md->pending_io);
++
++              /* nudge anyone waiting on suspend queue */
++              if (unlikely(wq_has_sleeper(&md->wait)))
++                      wake_up(&md->wait);
+               if (io_error == BLK_STS_DM_REQUEUE)
+                       return;
+@@ -1622,6 +1625,11 @@ static void cleanup_mapped_device(struct
+               blk_cleanup_disk(md->disk);
+       }
++      if (md->pending_io) {
++              free_percpu(md->pending_io);
++              md->pending_io = NULL;
++      }
++
+       cleanup_srcu_struct(&md->io_barrier);
+       mutex_destroy(&md->suspend_lock);
+@@ -1723,6 +1731,10 @@ static struct mapped_device *alloc_dev(i
+       if (!md->wq)
+               goto bad;
++      md->pending_io = alloc_percpu(unsigned long);
++      if (!md->pending_io)
++              goto bad;
++
+       dm_stats_init(&md->stats);
+       /* Populate the mapping, nobody knows we exist yet */
+@@ -2130,16 +2142,13 @@ void dm_put(struct mapped_device *md)
+ }
+ EXPORT_SYMBOL_GPL(dm_put);
+-static bool md_in_flight_bios(struct mapped_device *md)
++static bool dm_in_flight_bios(struct mapped_device *md)
+ {
+       int cpu;
+-      struct block_device *part = dm_disk(md)->part0;
+-      long sum = 0;
++      unsigned long sum = 0;
+-      for_each_possible_cpu(cpu) {
+-              sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
+-              sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
+-      }
++      for_each_possible_cpu(cpu)
++              sum += *per_cpu_ptr(md->pending_io, cpu);
+       return sum != 0;
+ }
+@@ -2152,7 +2161,7 @@ static int dm_wait_for_bios_completion(s
+       while (true) {
+               prepare_to_wait(&md->wait, &wait, task_state);
+-              if (!md_in_flight_bios(md))
++              if (!dm_in_flight_bios(md))
+                       break;
+               if (signal_pending_state(task_state, current)) {
+@@ -2164,6 +2173,8 @@ static int dm_wait_for_bios_completion(s
+       }
+       finish_wait(&md->wait, &wait);
++      smp_rmb();
++
+       return r;
+ }
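
The pending-I/O scheme above can be approximated in user-space C11, with
atomics standing in for the kernel's percpu counters and for the
smp_wmb()/smp_rmb() pairing; the CPU count and names are illustrative:

    #include <stdatomic.h>
    #include <stdbool.h>

    #define NR_CPUS 8

    /*
     * One counter per CPU, summed by the waiter.  A single counter
     * may go negative when an I/O starts on one CPU and completes
     * on another; only the total across all CPUs is meaningful.
     */
    static _Atomic long pending_io[NR_CPUS];

    static void io_start(int cpu)
    {
        atomic_fetch_add_explicit(&pending_io[cpu], 1,
                                  memory_order_relaxed);
    }

    static void io_end(int cpu)
    {
        /* Release pairs with the acquire in in_flight(): the
         * completed I/O's effects are visible before the waiter
         * can observe the counter drop. */
        atomic_fetch_sub_explicit(&pending_io[cpu], 1,
                                  memory_order_release);
    }

    static bool in_flight(void)
    {
        long sum = 0;

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            sum += atomic_load_explicit(&pending_io[cpu],
                                        memory_order_acquire);
        return sum != 0;
    }
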
diff --git a/queue-5.17/dm-stats-fix-too-short-end-duration_ns-when-using-precise_timestamps.patch b/queue-5.17/dm-stats-fix-too-short-end-duration_ns-when-using-precise_timestamps.patch
new file mode 100644 (file)
index 0000000..963a4ee
--- /dev/null
@@ -0,0 +1,134 @@
+From 0cdb90f0f306384ecbc60dfd6dc48cdbc1f2d0d8 Mon Sep 17 00:00:00 2001
+From: Mike Snitzer <snitzer@redhat.com>
+Date: Thu, 17 Feb 2022 23:39:59 -0500
+Subject: dm stats: fix too short end duration_ns when using precise_timestamps
+
+From: Mike Snitzer <snitzer@redhat.com>
+
+commit 0cdb90f0f306384ecbc60dfd6dc48cdbc1f2d0d8 upstream.
+
+dm_stats_account_io()'s STAT_PRECISE_TIMESTAMPS support doesn't handle
+the fact that with commit b879f915bc48 ("dm: properly fix redundant
+bio-based IO accounting") io->start_time _may_ be in the past (meaning
+the start_io_acct() was deferred until later).
+
+Add a new dm_stats_recalc_precise_timestamps() helper that will
+set/clear a new 'precise_timestamps' flag in the dm_stats struct based
+on whether any configured stats enable STAT_PRECISE_TIMESTAMPS.
+And update DM core's alloc_io() to use dm_stats_record_start() to set
+stats_aux.duration_ns if stats->precise_timestamps is true.
+
+Also, remove unused 'last_sector' and 'last_rw' members from the
+dm_stats struct.
+
+Fixes: b879f915bc48 ("dm: properly fix redundant bio-based IO accounting")
+Cc: stable@vger.kernel.org
+Co-developed-by: Mikulas Patocka <mpatocka@redhat.com>
+Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
+Signed-off-by: Mike Snitzer <snitzer@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/md/dm-stats.c |   28 +++++++++++++++++++++++++---
+ drivers/md/dm-stats.h |    9 +++++++--
+ drivers/md/dm.c       |    2 ++
+ 3 files changed, 34 insertions(+), 5 deletions(-)
+
+--- a/drivers/md/dm-stats.c
++++ b/drivers/md/dm-stats.c
+@@ -195,6 +195,7 @@ void dm_stats_init(struct dm_stats *stat
+       mutex_init(&stats->mutex);
+       INIT_LIST_HEAD(&stats->list);
++      stats->precise_timestamps = false;
+       stats->last = alloc_percpu(struct dm_stats_last_position);
+       for_each_possible_cpu(cpu) {
+               last = per_cpu_ptr(stats->last, cpu);
+@@ -231,6 +232,22 @@ void dm_stats_cleanup(struct dm_stats *s
+       mutex_destroy(&stats->mutex);
+ }
++static void dm_stats_recalc_precise_timestamps(struct dm_stats *stats)
++{
++      struct list_head *l;
++      struct dm_stat *tmp_s;
++      bool precise_timestamps = false;
++
++      list_for_each(l, &stats->list) {
++              tmp_s = container_of(l, struct dm_stat, list_entry);
++              if (tmp_s->stat_flags & STAT_PRECISE_TIMESTAMPS) {
++                      precise_timestamps = true;
++                      break;
++              }
++      }
++      stats->precise_timestamps = precise_timestamps;
++}
++
+ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
+                          sector_t step, unsigned stat_flags,
+                          unsigned n_histogram_entries,
+@@ -376,6 +393,9 @@ static int dm_stats_create(struct dm_sta
+       }
+       ret_id = s->id;
+       list_add_tail_rcu(&s->list_entry, l);
++
++      dm_stats_recalc_precise_timestamps(stats);
++
+       mutex_unlock(&stats->mutex);
+       resume_callback(md);
+@@ -418,6 +438,9 @@ static int dm_stats_delete(struct dm_sta
+       }
+       list_del_rcu(&s->list_entry);
++
++      dm_stats_recalc_precise_timestamps(stats);
++
+       mutex_unlock(&stats->mutex);
+       /*
+@@ -654,9 +677,8 @@ void dm_stats_account_io(struct dm_stats
+       got_precise_time = false;
+       list_for_each_entry_rcu(s, &stats->list, list_entry) {
+               if (s->stat_flags & STAT_PRECISE_TIMESTAMPS && !got_precise_time) {
+-                      if (!end)
+-                              stats_aux->duration_ns = ktime_to_ns(ktime_get());
+-                      else
++                      /* start (!end) duration_ns is set by DM core's alloc_io() */
++                      if (end)
+                               stats_aux->duration_ns = ktime_to_ns(ktime_get()) - stats_aux->duration_ns;
+                       got_precise_time = true;
+               }
+--- a/drivers/md/dm-stats.h
++++ b/drivers/md/dm-stats.h
+@@ -13,8 +13,7 @@ struct dm_stats {
+       struct mutex mutex;
+       struct list_head list;  /* list of struct dm_stat */
+       struct dm_stats_last_position __percpu *last;
+-      sector_t last_sector;
+-      unsigned last_rw;
++      bool precise_timestamps;
+ };
+ struct dm_stats_aux {
+@@ -40,4 +39,10 @@ static inline bool dm_stats_used(struct
+       return !list_empty(&st->list);
+ }
++static inline void dm_stats_record_start(struct dm_stats *stats, struct dm_stats_aux *aux)
++{
++      if (unlikely(stats->precise_timestamps))
++              aux->duration_ns = ktime_to_ns(ktime_get());
++}
++
+ #endif
+--- a/drivers/md/dm.c
++++ b/drivers/md/dm.c
+@@ -537,6 +537,8 @@ static struct dm_io *alloc_io(struct map
+       io->start_time = jiffies;
++      dm_stats_record_start(&md->stats, &io->stats_aux);
++
+       return io;
+ }
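
The core idea of the patch — recompute a cached boolean at stat
create/delete time so the per-I/O path only tests a single flag — reduced
to a self-contained sketch, with a plain singly linked list replacing the
kernel's list_head:

    #include <stdbool.h>

    #define STAT_PRECISE_TIMESTAMPS 0x1

    struct stat_cfg {
        unsigned int flags;
        struct stat_cfg *next;
    };

    struct stats {
        struct stat_cfg *list;      /* configured stats */
        bool precise_timestamps;    /* cached, read on every I/O */
    };

    /*
     * Recompute the cached flag whenever a stat is created or
     * deleted, so the per-I/O hot path tests one boolean instead
     * of walking the list.
     */
    static void recalc_precise_timestamps(struct stats *s)
    {
        s->precise_timestamps = false;

        for (struct stat_cfg *c = s->list; c; c = c->next) {
            if (c->flags & STAT_PRECISE_TIMESTAMPS) {
                s->precise_timestamps = true;
                break;
            }
        }
    }
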
diff --git a/queue-5.17/drbd-fix-potential-silent-data-corruption.patch b/queue-5.17/drbd-fix-potential-silent-data-corruption.patch
new file mode 100644 (file)
index 0000000..99cb099
--- /dev/null
@@ -0,0 +1,67 @@
+From f4329d1f848ac35757d9cc5487669d19dfc5979c Mon Sep 17 00:00:00 2001
+From: Lars Ellenberg <lars.ellenberg@linbit.com>
+Date: Wed, 30 Mar 2022 20:55:51 +0200
+Subject: drbd: fix potential silent data corruption
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Lars Ellenberg <lars.ellenberg@linbit.com>
+
+commit f4329d1f848ac35757d9cc5487669d19dfc5979c upstream.
+
+Scenario:
+---------
+
+bio chain generated by blk_queue_split().
+Some split bio fails and propagates its error status to the "parent" bio.
+But then the (last part of the) parent bio itself completes without error.
+
+We would clobber the already recorded error status with BLK_STS_OK,
+causing silent data corruption.
+
+Reproducer:
+-----------
+
+How to trigger this in the real world within seconds:
+
+DRBD on top of degraded parity raid,
+small stripe_cache_size, large read_ahead setting.
+Drop page cache (sysctl vm.drop_caches=1, fadvise "DONTNEED",
+umount and mount again, "reboot").
+
+Cause significant read ahead.
+
+Large read ahead request is split by blk_queue_split().
+Parts of the read ahead that are already in the stripe cache,
+or find an available stripe cache to use, can be serviced.
+Parts of the read ahead that would need "too much work",
+would need to wait for a "stripe_head" to become available,
+are rejected immediately.
+
+For larger read ahead requests that are split in many pieces, it is very
+likely that some "splits" will be serviced, but then the stripe cache is
+exhausted/busy, and the remaining ones will be rejected.
+
+Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
+Signed-off-by: Christoph Böhmwalder <christoph.boehmwalder@linbit.com>
+Cc: <stable@vger.kernel.org> # 4.13.x
+Link: https://lore.kernel.org/r/20220330185551.3553196-1-christoph.boehmwalder@linbit.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/block/drbd/drbd_req.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/drivers/block/drbd/drbd_req.c
++++ b/drivers/block/drbd/drbd_req.c
+@@ -180,7 +180,8 @@ void start_new_tl_epoch(struct drbd_conn
+ void complete_master_bio(struct drbd_device *device,
+               struct bio_and_error *m)
+ {
+-      m->bio->bi_status = errno_to_blk_status(m->error);
++      if (unlikely(m->error))
++              m->bio->bi_status = errno_to_blk_status(m->error);
+       bio_endio(m->bio);
+       dec_ap_bio(device);
+ }
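
Reduced to a sketch, the rule the one-line fix enforces is: record an
error from any completed split, but never let a later successful split
overwrite an already recorded error with OK (status values below are
illustrative, not the kernel's blk_status_t):

    enum status_sketch { STS_OK = 0, STS_IOERR = 10 };

    struct parent_bio {
        enum status_sketch bi_status;
    };

    /* Called as each split completes; error is 0 on success. */
    static void complete_split(struct parent_bio *parent, int error)
    {
        if (error)                      /* record errors ...    */
            parent->bi_status = STS_IOERR;
        /* ... but never clobber an earlier error with STS_OK   */
    }
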
diff --git a/queue-5.17/mm-hwpoison-unmap-poisoned-page-before-invalidation.patch b/queue-5.17/mm-hwpoison-unmap-poisoned-page-before-invalidation.patch
new file mode 100644 (file)
index 0000000..2cf450d
--- /dev/null
@@ -0,0 +1,67 @@
+From 3149c79f3cb0e2e3bafb7cfadacec090cbd250d3 Mon Sep 17 00:00:00 2001
+From: Rik van Riel <riel@surriel.com>
+Date: Fri, 1 Apr 2022 11:28:42 -0700
+Subject: mm,hwpoison: unmap poisoned page before invalidation
+
+From: Rik van Riel <riel@surriel.com>
+
+commit 3149c79f3cb0e2e3bafb7cfadacec090cbd250d3 upstream.
+
+In some cases it appears the invalidation of a hwpoisoned page fails
+because the page is still mapped in another process.  This can cause a
+program to be continuously restarted and die when it page faults on the
+page that was not invalidated.  Avoid that problem by unmapping the
+hwpoisoned page when we find it.
+
+Another issue is that sometimes we end up oopsing in finish_fault, if
+the code tries to do something with the now-NULL vmf->page.  I did not
+hit this error when submitting the previous patch because there are
+several opportunities for alloc_set_pte to bail out before accessing
+vmf->page, and that apparently happened on those systems, and most of
+the time on other systems, too.
+
+However, across several million systems that error does occur a handful
+of times a day.  It can be avoided by returning VM_FAULT_NOPAGE which
+will cause do_read_fault to return before calling finish_fault.
+
+Link: https://lkml.kernel.org/r/20220325161428.5068d97e@imladris.surriel.com
+Fixes: e53ac7374e64 ("mm: invalidate hwpoison page cache page in fault path")
+Signed-off-by: Rik van Riel <riel@surriel.com>
+Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
+Tested-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
+Reviewed-by: Oscar Salvador <osalvador@suse.de>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/memory.c |   12 ++++++++----
+ 1 file changed, 8 insertions(+), 4 deletions(-)
+
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -3893,14 +3893,18 @@ static vm_fault_t __do_fault(struct vm_f
+               return ret;
+       if (unlikely(PageHWPoison(vmf->page))) {
++              struct page *page = vmf->page;
+               vm_fault_t poisonret = VM_FAULT_HWPOISON;
+               if (ret & VM_FAULT_LOCKED) {
++                      if (page_mapped(page))
++                              unmap_mapping_pages(page_mapping(page),
++                                                  page->index, 1, false);
+                       /* Retry if a clean page was removed from the cache. */
+-                      if (invalidate_inode_page(vmf->page))
+-                              poisonret = 0;
+-                      unlock_page(vmf->page);
++                      if (invalidate_inode_page(page))
++                              poisonret = VM_FAULT_NOPAGE;
++                      unlock_page(page);
+               }
+-              put_page(vmf->page);
++              put_page(page);
+               vmf->page = NULL;
+               return poisonret;
+       }
diff --git a/queue-5.17/mm-kmemleak-reset-tag-when-compare-object-pointer.patch b/queue-5.17/mm-kmemleak-reset-tag-when-compare-object-pointer.patch
new file mode 100644 (file)
index 0000000..ea7021f
--- /dev/null
@@ -0,0 +1,99 @@
+From bfc8089f00fa526dea983844c880fa8106c33ac4 Mon Sep 17 00:00:00 2001
+From: Kuan-Ying Lee <Kuan-Ying.Lee@mediatek.com>
+Date: Fri, 1 Apr 2022 11:28:54 -0700
+Subject: mm/kmemleak: reset tag when compare object pointer
+
+From: Kuan-Ying Lee <Kuan-Ying.Lee@mediatek.com>
+
+commit bfc8089f00fa526dea983844c880fa8106c33ac4 upstream.
+
+When we use HW-tag based kasan and enable vmalloc support, we hit the
+following bug.  It is due to a comparison between a tagged object and a
+non-tagged pointer.
+
+We need to reset the kasan tag when we compare a tagged object with a
+non-tagged pointer.
+
+  kmemleak: [name:kmemleak&]Scan area larger than object 0xffffffe77076f440
+  CPU: 4 PID: 1 Comm: init Tainted: G S      W         5.15.25-android13-0-g5cacf919c2bc #1
+  Hardware name: MT6983(ENG) (DT)
+  Call trace:
+   add_scan_area+0xc4/0x244
+   kmemleak_scan_area+0x40/0x9c
+   layout_and_allocate+0x1e8/0x288
+   load_module+0x2c8/0xf00
+   __se_sys_finit_module+0x190/0x1d0
+   __arm64_sys_finit_module+0x20/0x30
+   invoke_syscall+0x60/0x170
+   el0_svc_common+0xc8/0x114
+   do_el0_svc+0x28/0xa0
+   el0_svc+0x60/0xf8
+   el0t_64_sync_handler+0x88/0xec
+   el0t_64_sync+0x1b4/0x1b8
+  kmemleak: [name:kmemleak&]Object 0xf5ffffe77076b000 (size 32768):
+  kmemleak: [name:kmemleak&]  comm "init", pid 1, jiffies 4294894197
+  kmemleak: [name:kmemleak&]  min_count = 0
+  kmemleak: [name:kmemleak&]  count = 0
+  kmemleak: [name:kmemleak&]  flags = 0x1
+  kmemleak: [name:kmemleak&]  checksum = 0
+  kmemleak: [name:kmemleak&]  backtrace:
+       module_alloc+0x9c/0x120
+       move_module+0x34/0x19c
+       layout_and_allocate+0x1c4/0x288
+       load_module+0x2c8/0xf00
+       __se_sys_finit_module+0x190/0x1d0
+       __arm64_sys_finit_module+0x20/0x30
+       invoke_syscall+0x60/0x170
+       el0_svc_common+0xc8/0x114
+       do_el0_svc+0x28/0xa0
+       el0_svc+0x60/0xf8
+       el0t_64_sync_handler+0x88/0xec
+       el0t_64_sync+0x1b4/0x1b8
+
+Link: https://lkml.kernel.org/r/20220318034051.30687-1-Kuan-Ying.Lee@mediatek.com
+Signed-off-by: Kuan-Ying Lee <Kuan-Ying.Lee@mediatek.com>
+Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Matthias Brugger <matthias.bgg@gmail.com>
+Cc: Chinwen Chang <chinwen.chang@mediatek.com>
+Cc: Nicholas Tang <nicholas.tang@mediatek.com>
+Cc: Yee Lee <yee.lee@mediatek.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/kmemleak.c |    9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+--- a/mm/kmemleak.c
++++ b/mm/kmemleak.c
+@@ -796,6 +796,8 @@ static void add_scan_area(unsigned long
+       unsigned long flags;
+       struct kmemleak_object *object;
+       struct kmemleak_scan_area *area = NULL;
++      unsigned long untagged_ptr;
++      unsigned long untagged_objp;
+       object = find_and_get_object(ptr, 1);
+       if (!object) {
+@@ -804,6 +806,9 @@ static void add_scan_area(unsigned long
+               return;
+       }
++      untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
++      untagged_objp = (unsigned long)kasan_reset_tag((void *)object->pointer);
++
+       if (scan_area_cache)
+               area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
+@@ -815,8 +820,8 @@ static void add_scan_area(unsigned long
+               goto out_unlock;
+       }
+       if (size == SIZE_MAX) {
+-              size = object->pointer + object->size - ptr;
+-      } else if (ptr + size > object->pointer + object->size) {
++              size = untagged_objp + object->size - untagged_ptr;
++      } else if (untagged_ptr + size > untagged_objp + object->size) {
+               kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
+               dump_object_info(object);
+               kmem_cache_free(scan_area_cache, area);
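
Why the untagging matters, in sketch form: with HW tag-based KASAN the
tag occupies the pointer's top byte, so tagged and untagged addresses of
the same object differ numerically and any bounds arithmetic must strip
the tag first.  The mask below is illustrative of what kasan_reset_tag()
achieves (assuming 64-bit pointers), not its implementation:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Clear the top-byte tag so only the address bits compare. */
    static inline uintptr_t reset_tag(uintptr_t ptr)
    {
        return ptr & ~((uintptr_t)0xff << 56);
    }

    /*
     * Bounds check in the spirit of add_scan_area(): a scan area
     * [ptr, ptr + size) must lie inside [objp, objp + objsize),
     * with both addresses compared untagged.
     */
    static bool area_fits(uintptr_t objp, size_t objsize,
                          uintptr_t ptr, size_t size)
    {
        uintptr_t uobj = reset_tag(objp);
        uintptr_t uptr = reset_tag(ptr);

        return uptr + size <= uobj + objsize;
    }
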
diff --git a/queue-5.17/mm-madvise-return-correct-bytes-advised-with-process_madvise.patch b/queue-5.17/mm-madvise-return-correct-bytes-advised-with-process_madvise.patch
new file mode 100644 (file)
index 0000000..c8c4111
--- /dev/null
@@ -0,0 +1,64 @@
+From 5bd009c7c9a9e888077c07535dc0c70aeab242c3 Mon Sep 17 00:00:00 2001
+From: Charan Teja Kalla <quic_charante@quicinc.com>
+Date: Tue, 22 Mar 2022 14:46:44 -0700
+Subject: mm: madvise: return correct bytes advised with process_madvise
+
+From: Charan Teja Kalla <quic_charante@quicinc.com>
+
+commit 5bd009c7c9a9e888077c07535dc0c70aeab242c3 upstream.
+
+Patch series "mm: madvise: return correct bytes processed with
+process_madvise", v2.  With the process_madvise(), always choose to return
+non zero processed bytes over an error.  This can help the user to know on
+which VMA, passed in the 'struct iovec' vector list, is failed to advise
+thus can take the decission of retrying/skipping on that VMA.
+
+This patch (of 2):
+
+The process_madvise() system call returns an error even after processing
+some VMAs passed in the 'struct iovec' vector list, which leaves the user
+unsure where to restart the advice next.  It also contradicts the syscall
+man page[1] documentation, which says that the "return value may be less
+than the total number of requested bytes, if an error occurred after some
+iovec elements were already processed.".
+
+Consider a user who passed 10 VMAs in the 'struct iovec' vector list, of
+which 9 are processed but one fails.  The call then just returns the
+error caused on that failed VMA, despite the first 9 VMAs having been
+processed, leaving the user unsure which VMA failed.  Returning the
+number of bytes processed instead lets the user know which VMA failed,
+so they can retry or skip the advice on that VMA.
+
+[1]https://man7.org/linux/man-pages/man2/process_madvise.2.html.
+
+Link: https://lkml.kernel.org/r/cover.1647008754.git.quic_charante@quicinc.com
+Link: https://lkml.kernel.org/r/125b61a0edcee5c2db8658aed9d06a43a19ccafc.1647008754.git.quic_charante@quicinc.com
+Fixes: ecb8ac8b1f14("mm/madvise: introduce process_madvise() syscall: an external memory hinting API")
+Signed-off-by: Charan Teja Kalla <quic_charante@quicinc.com>
+Cc: Suren Baghdasaryan <surenb@google.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Stephen Rothwell <sfr@canb.auug.org.au>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Nadav Amit <nadav.amit@gmail.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/madvise.c |    3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/mm/madvise.c
++++ b/mm/madvise.c
+@@ -1440,8 +1440,7 @@ SYSCALL_DEFINE5(process_madvise, int, pi
+               iov_iter_advance(&iter, iovec.iov_len);
+       }
+-      if (ret == 0)
+-              ret = total_len - iov_iter_count(&iter);
++      ret = (total_len - iov_iter_count(&iter)) ? : ret;
+ release_mm:
+       mmput(mm);
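
A hedged user-space sketch of how a caller might consume the corrected
return value; the pidfd and iovec setup are elided placeholders, the
syscall number shown is the x86-64 one, and MADV_COLD is just an example
advice value available on recent kernels:

    #include <stdio.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <sys/uio.h>
    #include <unistd.h>

    #ifndef __NR_process_madvise
    #define __NR_process_madvise 440    /* x86-64 */
    #endif

    int main(void)
    {
        /* In real use, pidfd comes from pidfd_open() and iov[]
         * describes ranges in the target process; placeholders
         * here keep the sketch self-contained. */
        int pidfd = -1;
        struct iovec iov[2] = { { 0 } };
        size_t total = iov[0].iov_len + iov[1].iov_len;

        ssize_t done = syscall(__NR_process_madvise, pidfd, iov, 2,
                               MADV_COLD, 0);
        if (done < 0)
            perror("process_madvise");  /* nothing was advised */
        else if ((size_t)done < total)
            /* Partial success: the first `done` bytes were advised;
             * resume from the iovec element containing that offset. */
            fprintf(stderr, "advised %zd of %zu bytes\n", done, total);
        return 0;
    }
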
diff --git a/queue-5.17/mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch b/queue-5.17/mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch
new file mode 100644 (file)
index 0000000..8dbbe8d
--- /dev/null
@@ -0,0 +1,57 @@
+From 08095d6310a7ce43256b4251577bc66a25c6e1a6 Mon Sep 17 00:00:00 2001
+From: Charan Teja Kalla <quic_charante@quicinc.com>
+Date: Tue, 22 Mar 2022 14:46:48 -0700
+Subject: mm: madvise: skip unmapped vma holes passed to process_madvise
+
+From: Charan Teja Kalla <quic_charante@quicinc.com>
+
+commit 08095d6310a7ce43256b4251577bc66a25c6e1a6 upstream.
+
+The process_madvise() system call is expected to skip holes in the VMAs
+passed through the 'struct iovec' vector list.  But do_madvise, which
+process_madvise() calls for each VMA, returns ENOMEM when it encounters
+unmapped holes, even though the VMA is processed.
+
+Thus process_madvise() should treat ENOMEM as expected, consider the
+passed VMA as processed, and continue processing the other VMAs in the
+vector list.  Returning -ENOMEM to the user even though the VMA was
+processed leaves the user unable to figure out where to start the next
+madvise.
+
+Link: https://lkml.kernel.org/r/4f091776142f2ebf7b94018146de72318474e686.1647008754.git.quic_charante@quicinc.com
+Fixes: ecb8ac8b1f14("mm/madvise: introduce process_madvise() syscall: an external memory hinting API")
+Signed-off-by: Charan Teja Kalla <quic_charante@quicinc.com>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Nadav Amit <nadav.amit@gmail.com>
+Cc: Stephen Rothwell <sfr@canb.auug.org.au>
+Cc: Suren Baghdasaryan <surenb@google.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/madvise.c |    9 ++++++++-
+ 1 file changed, 8 insertions(+), 1 deletion(-)
+
+--- a/mm/madvise.c
++++ b/mm/madvise.c
+@@ -1426,9 +1426,16 @@ SYSCALL_DEFINE5(process_madvise, int, pi
+       while (iov_iter_count(&iter)) {
+               iovec = iov_iter_iovec(&iter);
++              /*
++               * do_madvise returns ENOMEM if unmapped holes are present
++               * in the passed VMA. process_madvise() is expected to skip
++               * unmapped holes passed to it in the 'struct iovec' list
++               * and not fail because of them. Thus treat -ENOMEM return
++               * from do_madvise as valid and continue processing.
++               */
+               ret = do_madvise(mm, (unsigned long)iovec.iov_base,
+                                       iovec.iov_len, behavior);
+-              if (ret < 0)
++              if (ret < 0 && ret != -ENOMEM)
+                       break;
+               iov_iter_advance(&iter, iovec.iov_len);
+       }
diff --git a/queue-5.17/mm-only-re-generate-demotion-targets-when-a-numa-node-changes-its-n_cpu-state.patch b/queue-5.17/mm-only-re-generate-demotion-targets-when-a-numa-node-changes-its-n_cpu-state.patch
new file mode 100644 (file)
index 0000000..e70fb4b
--- /dev/null
@@ -0,0 +1,183 @@
+From 734c15700cdf9062ae98d8b131c6fe873dfad26d Mon Sep 17 00:00:00 2001
+From: Oscar Salvador <osalvador@suse.de>
+Date: Tue, 22 Mar 2022 14:47:37 -0700
+Subject: mm: only re-generate demotion targets when a numa node changes its N_CPU state
+
+From: Oscar Salvador <osalvador@suse.de>
+
+commit 734c15700cdf9062ae98d8b131c6fe873dfad26d upstream.
+
+Abhishek reported that after patch [1], hotplug operations are taking
+roughly double the expected time.  [2]
+
+The reason behind this is that the CPU callbacks that
+migrate_on_reclaim_init() sets always call set_migration_target_nodes()
+whenever a CPU is brought up/down.
+
+But we only care about numa nodes going from having cpus to becoming
+cpuless, and vice versa, as that influences the demotion_target order.
+
+We do already have two CPU callbacks (vmstat_cpu_online() and
+vmstat_cpu_dead()) that check exactly that, so get rid of the CPU
+callbacks in migrate_on_reclaim_init() and only call
+set_migration_target_nodes() from vmstat_cpu_{dead,online}() whenever a
+numa node change its N_CPU state.
+
+[1] https://lore.kernel.org/linux-mm/20210721063926.3024591-2-ying.huang@intel.com/
+[2] https://lore.kernel.org/linux-mm/eb438ddd-2919-73d4-bd9f-b7eecdd9577a@linux.vnet.ibm.com/
+
+[osalvador@suse.de: add feedback from Huang Ying]
+  Link: https://lkml.kernel.org/r/20220314150945.12694-1-osalvador@suse.de
+
+Link: https://lkml.kernel.org/r/20220310120749.23077-1-osalvador@suse.de
+Fixes: 884a6e5d1f93b ("mm/migrate: update node demotion order on hotplug events")
+Signed-off-by: Oscar Salvador <osalvador@suse.de>
+Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
+Tested-by: Baolin Wang <baolin.wang@linux.alibaba.com>
+Reported-by: Abhishek Goel <huntbag@linux.vnet.ibm.com>
+Cc: Dave Hansen <dave.hansen@linux.intel.com>
+Cc: "Huang, Ying" <ying.huang@intel.com>
+Cc: Abhishek Goel <huntbag@linux.vnet.ibm.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/migrate.h |    8 ++++++++
+ mm/migrate.c            |   47 ++++++++++-------------------------------------
+ mm/vmstat.c             |   13 ++++++++++++-
+ 3 files changed, 30 insertions(+), 38 deletions(-)
+
+--- a/include/linux/migrate.h
++++ b/include/linux/migrate.h
+@@ -48,7 +48,15 @@ int folio_migrate_mapping(struct address
+               struct folio *newfolio, struct folio *folio, int extra_count);
+ extern bool numa_demotion_enabled;
++extern void migrate_on_reclaim_init(void);
++#ifdef CONFIG_HOTPLUG_CPU
++extern void set_migration_target_nodes(void);
+ #else
++static inline void set_migration_target_nodes(void) {}
++#endif
++#else
++
++static inline void set_migration_target_nodes(void) {}
+ static inline void putback_movable_pages(struct list_head *l) {}
+ static inline int migrate_pages(struct list_head *l, new_page_t new,
+--- a/mm/migrate.c
++++ b/mm/migrate.c
+@@ -3190,7 +3190,7 @@ again:
+ /*
+  * For callers that do not hold get_online_mems() already.
+  */
+-static void set_migration_target_nodes(void)
++void set_migration_target_nodes(void)
+ {
+       get_online_mems();
+       __set_migration_target_nodes();
+@@ -3254,51 +3254,24 @@ static int __meminit migrate_on_reclaim_
+       return notifier_from_errno(0);
+ }
+-/*
+- * React to hotplug events that might affect the migration targets
+- * like events that online or offline NUMA nodes.
+- *
+- * The ordering is also currently dependent on which nodes have
+- * CPUs.  That means we need CPU on/offline notification too.
+- */
+-static int migration_online_cpu(unsigned int cpu)
+-{
+-      set_migration_target_nodes();
+-      return 0;
+-}
+-
+-static int migration_offline_cpu(unsigned int cpu)
+-{
+-      set_migration_target_nodes();
+-      return 0;
+-}
+-
+-static int __init migrate_on_reclaim_init(void)
++void __init migrate_on_reclaim_init(void)
+ {
+-      int ret;
+-
+       node_demotion = kmalloc_array(nr_node_ids,
+                                     sizeof(struct demotion_nodes),
+                                     GFP_KERNEL);
+       WARN_ON(!node_demotion);
+-      ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",
+-                                      NULL, migration_offline_cpu);
++      hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
+       /*
+-       * In the unlikely case that this fails, the automatic
+-       * migration targets may become suboptimal for nodes
+-       * where N_CPU changes.  With such a small impact in a
+-       * rare case, do not bother trying to do anything special.
++       * At this point, all numa nodes with memory/CPUs have their state
++       * properly set, so we can build the demotion order now.
++       * Let us hold the cpu_hotplug lock just, as we could possibly have
++       * CPU hotplug events during boot.
+        */
+-      WARN_ON(ret < 0);
+-      ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE, "mm/demotion:online",
+-                              migration_online_cpu, NULL);
+-      WARN_ON(ret < 0);
+-
+-      hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
+-      return 0;
++      cpus_read_lock();
++      set_migration_target_nodes();
++      cpus_read_unlock();
+ }
+-late_initcall(migrate_on_reclaim_init);
+ #endif /* CONFIG_HOTPLUG_CPU */
+ bool numa_demotion_enabled = false;
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -28,6 +28,7 @@
+ #include <linux/mm_inline.h>
+ #include <linux/page_ext.h>
+ #include <linux/page_owner.h>
++#include <linux/migrate.h>
+ #include "internal.h"
+@@ -2043,7 +2044,12 @@ static void __init init_cpu_node_state(v
+ static int vmstat_cpu_online(unsigned int cpu)
+ {
+       refresh_zone_stat_thresholds();
+-      node_set_state(cpu_to_node(cpu), N_CPU);
++
++      if (!node_state(cpu_to_node(cpu), N_CPU)) {
++              node_set_state(cpu_to_node(cpu), N_CPU);
++              set_migration_target_nodes();
++      }
++
+       return 0;
+ }
+@@ -2066,6 +2072,8 @@ static int vmstat_cpu_dead(unsigned int
+               return 0;
+       node_clear_state(node, N_CPU);
++      set_migration_target_nodes();
++
+       return 0;
+ }
+@@ -2097,6 +2105,9 @@ void __init init_mm_internals(void)
+       start_shepherd_timer();
+ #endif
++#if defined(CONFIG_MIGRATION) && defined(CONFIG_HOTPLUG_CPU)
++      migrate_on_reclaim_init();
++#endif
+ #ifdef CONFIG_PROC_FS
+       proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
+       proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
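
The optimization reduces to detecting N_CPU state flips, as in this
sketch: the expensive rebuild runs only when a node gains its first CPU
or loses its last one, not on every CPU up/down event (all names are
illustrative):

    #include <stdbool.h>

    #define MAX_NODES 4

    static bool node_has_cpu[MAX_NODES];

    /* Called for every CPU online event on `node`; the expensive
     * rebuild runs only when the node's N_CPU state flips. */
    static void on_cpu_online(int node, void (*rebuild)(void))
    {
        if (!node_has_cpu[node]) {      /* first CPU of this node */
            node_has_cpu[node] = true;
            rebuild();                  /* demotion order changed */
        }
        /* otherwise the node already had CPUs: nothing to do */
    }

    /* Called once the last CPU of a node has gone away, mirroring
     * the check in vmstat_cpu_dead(). */
    static void on_node_cpuless(int node, void (*rebuild)(void))
    {
        node_has_cpu[node] = false;
        rebuild();
    }
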
diff --git a/queue-5.17/revert-mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch b/queue-5.17/revert-mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch
new file mode 100644 (file)
index 0000000..3e35582
--- /dev/null
@@ -0,0 +1,57 @@
+From e6b0a7b357659c332231621e4315658d062c23ee Mon Sep 17 00:00:00 2001
+From: Charan Teja Kalla <quic_charante@quicinc.com>
+Date: Fri, 1 Apr 2022 11:28:12 -0700
+Subject: Revert "mm: madvise: skip unmapped vma holes passed to process_madvise"
+
+From: Charan Teja Kalla <quic_charante@quicinc.com>
+
+commit e6b0a7b357659c332231621e4315658d062c23ee upstream.
+
+This reverts commit 08095d6310a7 ("mm: madvise: skip unmapped vma holes
+passed to process_madvise") as process_madvise() fails to return the
+exact processed bytes in other cases too.
+
+As an example: if process_madvise() hits mlocked pages after processing
+some initial bytes passed in [start, end), it just returns EINVAL
+although some bytes are processed.  Thus making an exception only for
+ENOMEM is partially fixing the problem of returning the proper advised
+bytes.
+
+Thus revert this patch and return proper bytes advised.
+
+Link: https://lkml.kernel.org/r/e73da1304a88b6a8a11907045117cccf4c2b8374.1648046642.git.quic_charante@quicinc.com
+Fixes: 08095d6310a7ce ("mm: madvise: skip unmapped vma holes passed to process_madvise")
+Signed-off-by: Charan Teja Kalla <quic_charante@quicinc.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Suren Baghdasaryan <surenb@google.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Nadav Amit <nadav.amit@gmail.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/madvise.c |    9 +--------
+ 1 file changed, 1 insertion(+), 8 deletions(-)
+
+--- a/mm/madvise.c
++++ b/mm/madvise.c
+@@ -1426,16 +1426,9 @@ SYSCALL_DEFINE5(process_madvise, int, pi
+       while (iov_iter_count(&iter)) {
+               iovec = iov_iter_iovec(&iter);
+-              /*
+-               * do_madvise returns ENOMEM if unmapped holes are present
+-               * in the passed VMA. process_madvise() is expected to skip
+-               * unmapped holes passed to it in the 'struct iovec' list
+-               * and not fail because of them. Thus treat -ENOMEM return
+-               * from do_madvise as valid and continue processing.
+-               */
+               ret = do_madvise(mm, (unsigned long)iovec.iov_base,
+                                       iovec.iov_len, behavior);
+-              if (ret < 0 && ret != -ENOMEM)
++              if (ret < 0)
+                       break;
+               iov_iter_advance(&iter, iovec.iov_len);
+       }
diff --git a/queue-5.17/series b/queue-5.17/series
index 773df2de6336d54545b5d584272afd75c8a16924..ad0ac27e51d681a621fe4424376575a84936be78 100644 (file)
--- a/queue-5.17/series
@@ -102,3 +102,16 @@ rtc-pl031-fix-rtc-features-null-pointer-dereference.patch
 io_uring-ensure-that-fsnotify-is-always-called.patch
 ocfs2-fix-crash-when-mount-with-quota-enabled.patch
 drm-simpledrm-add-panel-orientation-property-on-non-upright-mounted-lcd-panels.patch
+mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch
+mm-madvise-return-correct-bytes-advised-with-process_madvise.patch
+revert-mm-madvise-skip-unmapped-vma-holes-passed-to-process_madvise.patch
+mm-hwpoison-unmap-poisoned-page-before-invalidation.patch
+mm-only-re-generate-demotion-targets-when-a-numa-node-changes-its-n_cpu-state.patch
+mm-kmemleak-reset-tag-when-compare-object-pointer.patch
+dm-stats-fix-too-short-end-duration_ns-when-using-precise_timestamps.patch
+dm-fix-use-after-free-in-dm_cleanup_zoned_dev.patch
+dm-interlock-pending-dm_io-and-dm_wait_for_bios_completion.patch
+dm-fix-double-accounting-of-flush-with-data.patch
+dm-integrity-set-journal-entry-unused-when-shrinking-device.patch
+tracing-have-trace-event-string-test-handle-zero-length-strings.patch
+drbd-fix-potential-silent-data-corruption.patch
diff --git a/queue-5.17/tracing-have-trace-event-string-test-handle-zero-length-strings.patch b/queue-5.17/tracing-have-trace-event-string-test-handle-zero-length-strings.patch
new file mode 100644 (file)
index 0000000..28c2b62
--- /dev/null
@@ -0,0 +1,62 @@
+From eca344a7362e0f34f179298fd8366bcd556eede1 Mon Sep 17 00:00:00 2001
+From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
+Date: Wed, 23 Mar 2022 10:32:51 -0400
+Subject: tracing: Have trace event string test handle zero length strings
+
+From: Steven Rostedt (Google) <rostedt@goodmis.org>
+
+commit eca344a7362e0f34f179298fd8366bcd556eede1 upstream.
+
+If a trace event has in its TP_printk():
+
+ "%*.s", len, len ? __get_str(string) : NULL
+
+It is perfectly valid for len to be zero while passing in NULL.
+Unfortunately, the runtime string check at time of reading the trace sees
+the NULL and flags it as a bad string and produces a WARN_ON().
+
+Handle this case by passing into the test function whether the format has
+an asterisk (star) and, if so and the length is zero, marking it as safe.
+
+Link: https://lore.kernel.org/all/YjsWzuw5FbWPrdqq@bfoster/
+
+Cc: stable@vger.kernel.org
+Reported-by: Brian Foster <bfoster@redhat.com>
+Tested-by: Brian Foster <bfoster@redhat.com>
+Fixes: 9a6944fee68e2 ("tracing: Add a verifier to check string pointers for trace events")
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/trace/trace.c |    9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+--- a/kernel/trace/trace.c
++++ b/kernel/trace/trace.c
+@@ -3663,12 +3663,17 @@ static char *trace_iter_expand_format(st
+ }
+ /* Returns true if the string is safe to dereference from an event */
+-static bool trace_safe_str(struct trace_iterator *iter, const char *str)
++static bool trace_safe_str(struct trace_iterator *iter, const char *str,
++                         bool star, int len)
+ {
+       unsigned long addr = (unsigned long)str;
+       struct trace_event *trace_event;
+       struct trace_event_call *event;
++      /* Ignore strings with no length */
++      if (star && !len)
++              return true;
++
+       /* OK if part of the event data */
+       if ((addr >= (unsigned long)iter->ent) &&
+           (addr < (unsigned long)iter->ent + iter->ent_size))
+@@ -3854,7 +3859,7 @@ void trace_check_vprintf(struct trace_it
+                * instead. See samples/trace_events/trace-events-sample.h
+                * for reference.
+                */
+-              if (WARN_ONCE(!trace_safe_str(iter, str),
++              if (WARN_ONCE(!trace_safe_str(iter, str, star, len),
+                             "fmt: '%s' current_buffer: '%s'",
+                             fmt, show_buffer(&iter->seq))) {
+                       int ret;
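
A user-space illustration of the idiom the fix accepts: with a zero
length no byte of the string is ever read, so a NULL pointer is harmless.
The sketch uses the common "%.*s" precision form rather than the kernel's
"%*.s", and substitutes "" for NULL, since passing NULL to printf outside
the kernel's vetted path is undefined behaviour:

    #include <stdio.h>

    /* With len == 0 no byte of the string is printed (or, in the
     * trace case, dereferenced), so NULL paired with a zero length
     * is harmless and should not trip the safety check. */
    static void show(int len, const char *s)
    {
        printf("[%.*s]\n", len, len ? s : "");
    }

    int main(void)
    {
        show(0, NULL);          /* prints "[]"      */
        show(5, "hello");       /* prints "[hello]" */
        return 0;
    }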