git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.19-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 22 Jul 2021 14:51:42 +0000 (16:51 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 22 Jul 2021 14:51:42 +0000 (16:51 +0200)
added patches:
dm-writecache-fix-writing-beyond-end-of-underlying-device-when-shrinking.patch
dm-writecache-return-the-exact-table-values-that-were-set.patch
mm-slab-fix-kmem_cache_create-failed-when-sysfs-node-not-destroyed.patch

queue-4.19/dm-writecache-fix-writing-beyond-end-of-underlying-device-when-shrinking.patch [new file with mode: 0644]
queue-4.19/dm-writecache-return-the-exact-table-values-that-were-set.patch [new file with mode: 0644]
queue-4.19/mm-slab-fix-kmem_cache_create-failed-when-sysfs-node-not-destroyed.patch [new file with mode: 0644]
queue-4.19/series

diff --git a/queue-4.19/dm-writecache-fix-writing-beyond-end-of-underlying-device-when-shrinking.patch b/queue-4.19/dm-writecache-fix-writing-beyond-end-of-underlying-device-when-shrinking.patch
new file mode 100644 (file)
index 0000000..9f9bc52
--- /dev/null
@@ -0,0 +1,78 @@
+From 4134455f2aafdfeab50cabb4cccb35e916034b93 Mon Sep 17 00:00:00 2001
+From: Mikulas Patocka <mpatocka@redhat.com>
+Date: Tue, 9 Feb 2021 10:56:20 -0500
+Subject: dm writecache: fix writing beyond end of underlying device when shrinking
+
+From: Mikulas Patocka <mpatocka@redhat.com>
+
+commit 4134455f2aafdfeab50cabb4cccb35e916034b93 upstream.
+
+Do not attempt to write any data beyond the end of the underlying data
+device while shrinking it.
+
+The DM writecache device must be suspended when the underlying data
+device is shrunk.
+
+Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Mike Snitzer <snitzer@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/md/dm-writecache.c |   18 ++++++++++++++++++
+ 1 file changed, 18 insertions(+)
+
+--- a/drivers/md/dm-writecache.c
++++ b/drivers/md/dm-writecache.c
+@@ -142,6 +142,7 @@ struct dm_writecache {
+       size_t metadata_sectors;
+       size_t n_blocks;
+       uint64_t seq_count;
++      sector_t data_device_sectors;
+       void *block_start;
+       struct wc_entry *entries;
+       unsigned block_size;
+@@ -929,6 +930,8 @@ static void writecache_resume(struct dm_
+       wc_lock(wc);
++      wc->data_device_sectors = i_size_read(wc->dev->bdev->bd_inode) >> SECTOR_SHIFT;
++
+       if (WC_MODE_PMEM(wc)) {
+               persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
+       } else {
+@@ -1499,6 +1502,10 @@ static bool wc_add_block(struct writebac
+       void *address = memory_data(wc, e);
+       persistent_memory_flush_cache(address, block_size);
++
++      if (unlikely(bio_end_sector(&wb->bio) >= wc->data_device_sectors))
++              return true;
++
+       return bio_add_page(&wb->bio, persistent_memory_page(address),
+                           block_size, persistent_memory_page_offset(address)) != 0;
+ }
+@@ -1571,6 +1578,9 @@ static void __writecache_writeback_pmem(
+               if (writecache_has_error(wc)) {
+                       bio->bi_status = BLK_STS_IOERR;
+                       bio_endio(&wb->bio);
++              } else if (unlikely(!bio_sectors(&wb->bio))) {
++                      bio->bi_status = BLK_STS_OK;
++                      bio_endio(&wb->bio);
+               } else {
+                       submit_bio(&wb->bio);
+               }
+@@ -1614,6 +1624,14 @@ static void __writecache_writeback_ssd(s
+                       e = f;
+               }
++              if (unlikely(to.sector + to.count > wc->data_device_sectors)) {
++                      if (to.sector >= wc->data_device_sectors) {
++                              writecache_copy_endio(0, 0, c);
++                              continue;
++                      }
++                      from.count = to.count = wc->data_device_sectors - to.sector;
++              }
++
+               dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);
+               __writeback_throttle(wc, wbl);
diff --git a/queue-4.19/dm-writecache-return-the-exact-table-values-that-were-set.patch b/queue-4.19/dm-writecache-return-the-exact-table-values-that-were-set.patch
new file mode 100644 (file)
index 0000000..d7026ed
--- /dev/null
@@ -0,0 +1,124 @@
+From 054bee16163df023e2589db09fd27d81f7ad9e72 Mon Sep 17 00:00:00 2001
+From: Mikulas Patocka <mpatocka@redhat.com>
+Date: Thu, 4 Feb 2021 05:20:52 -0500
+Subject: dm writecache: return the exact table values that were set
+
+From: Mikulas Patocka <mpatocka@redhat.com>
+
+commit 054bee16163df023e2589db09fd27d81f7ad9e72 upstream.
+
+LVM doesn't like it when the target returns different values from what
+was set in the constructor. Fix dm-writecache so that the returned
+table values are exactly the same as requested values.
+
+Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
+Cc: stable@vger.kernel.org # v4.18+
+Signed-off-by: Mike Snitzer <snitzer@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/md/dm-writecache.c |   32 ++++++++++++++++----------------
+ 1 file changed, 16 insertions(+), 16 deletions(-)
+
+--- a/drivers/md/dm-writecache.c
++++ b/drivers/md/dm-writecache.c
+@@ -153,6 +153,7 @@ struct dm_writecache {
+       bool overwrote_committed:1;
+       bool memory_vmapped:1;
++      bool start_sector_set:1;
+       bool high_wm_percent_set:1;
+       bool low_wm_percent_set:1;
+       bool max_writeback_jobs_set:1;
+@@ -161,6 +162,10 @@ struct dm_writecache {
+       bool writeback_fua_set:1;
+       bool flush_on_suspend:1;
++      unsigned high_wm_percent_value;
++      unsigned low_wm_percent_value;
++      unsigned autocommit_time_value;
++
+       unsigned writeback_all;
+       struct workqueue_struct *writeback_wq;
+       struct work_struct writeback_work;
+@@ -2045,6 +2050,7 @@ static int writecache_ctr(struct dm_targ
+                       if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1)
+                               goto invalid_optional;
+                       wc->start_sector = start_sector;
++                      wc->start_sector_set = true;
+                       if (wc->start_sector != start_sector ||
+                           wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT)
+                               goto invalid_optional;
+@@ -2054,6 +2060,7 @@ static int writecache_ctr(struct dm_targ
+                               goto invalid_optional;
+                       if (high_wm_percent < 0 || high_wm_percent > 100)
+                               goto invalid_optional;
++                      wc->high_wm_percent_value = high_wm_percent;
+                       wc->high_wm_percent_set = true;
+               } else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
+                       string = dm_shift_arg(&as), opt_params--;
+@@ -2061,6 +2068,7 @@ static int writecache_ctr(struct dm_targ
+                               goto invalid_optional;
+                       if (low_wm_percent < 0 || low_wm_percent > 100)
+                               goto invalid_optional;
++                      wc->low_wm_percent_value = low_wm_percent;
+                       wc->low_wm_percent_set = true;
+               } else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
+                       string = dm_shift_arg(&as), opt_params--;
+@@ -2080,6 +2088,7 @@ static int writecache_ctr(struct dm_targ
+                       if (autocommit_msecs > 3600000)
+                               goto invalid_optional;
+                       wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
++                      wc->autocommit_time_value = autocommit_msecs;
+                       wc->autocommit_time_set = true;
+               } else if (!strcasecmp(string, "fua")) {
+                       if (WC_MODE_PMEM(wc)) {
+@@ -2275,7 +2284,6 @@ static void writecache_status(struct dm_
+       struct dm_writecache *wc = ti->private;
+       unsigned extra_args;
+       unsigned sz = 0;
+-      uint64_t x;
+       switch (type) {
+       case STATUSTYPE_INFO:
+@@ -2287,7 +2295,7 @@ static void writecache_status(struct dm_
+               DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
+                               wc->dev->name, wc->ssd_dev->name, wc->block_size);
+               extra_args = 0;
+-              if (wc->start_sector)
++              if (wc->start_sector_set)
+                       extra_args += 2;
+               if (wc->high_wm_percent_set)
+                       extra_args += 2;
+@@ -2303,26 +2311,18 @@ static void writecache_status(struct dm_
+                       extra_args++;
+               DMEMIT("%u", extra_args);
+-              if (wc->start_sector)
++              if (wc->start_sector_set)
+                       DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
+-              if (wc->high_wm_percent_set) {
+-                      x = (uint64_t)wc->freelist_high_watermark * 100;
+-                      x += wc->n_blocks / 2;
+-                      do_div(x, (size_t)wc->n_blocks);
+-                      DMEMIT(" high_watermark %u", 100 - (unsigned)x);
+-              }
+-              if (wc->low_wm_percent_set) {
+-                      x = (uint64_t)wc->freelist_low_watermark * 100;
+-                      x += wc->n_blocks / 2;
+-                      do_div(x, (size_t)wc->n_blocks);
+-                      DMEMIT(" low_watermark %u", 100 - (unsigned)x);
+-              }
++              if (wc->high_wm_percent_set)
++                      DMEMIT(" high_watermark %u", wc->high_wm_percent_value);
++              if (wc->low_wm_percent_set)
++                      DMEMIT(" low_watermark %u", wc->low_wm_percent_value);
+               if (wc->max_writeback_jobs_set)
+                       DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
+               if (wc->autocommit_blocks_set)
+                       DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
+               if (wc->autocommit_time_set)
+-                      DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies));
++                      DMEMIT(" autocommit_time %u", wc->autocommit_time_value);
+               if (wc->writeback_fua_set)
+                       DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
+               break;
diff --git a/queue-4.19/mm-slab-fix-kmem_cache_create-failed-when-sysfs-node-not-destroyed.patch b/queue-4.19/mm-slab-fix-kmem_cache_create-failed-when-sysfs-node-not-destroyed.patch
new file mode 100644 (file)
index 0000000..fe73ad1
--- /dev/null
@@ -0,0 +1,160 @@
+From sunnanyong@huawei.com  Thu Jul 22 16:42:41 2021
+From: Nanyong Sun <sunnanyong@huawei.com>
+Date: Tue, 20 Jul 2021 16:20:48 +0800
+Subject: mm: slab: fix kmem_cache_create failed when sysfs node not destroyed
+To: <songmuchun@bytedance.com>, <cl@linux.com>, <penberg@kernel.org>, <rientjes@google.com>, <iamjoonsoo.kim@lge.com>, <akpm@linux-foundation.org>
+Cc: <linux-mm@kvack.org>, <linux-kernel@vger.kernel.org>, <stable@vger.kernel.org>
+Message-ID: <20210720082048.2797315-1-sunnanyong@huawei.com>
+
+From: Nanyong Sun <sunnanyong@huawei.com>
+
+Commit d38a2b7a9c93 ("mm: memcg/slab: fix memory leak at non-root
+kmem_cache destroy") introduced a problem: if one thread destroys a
+kmem_cache A while another thread concurrently creates a kmem_cache B
+that is mergeable with A and has the same size as A, creating B may
+fail because of a duplicate sysfs node.
+The scenario in detail:
+1) Thread 1 calls kmem_cache_destroy() on kmem_cache A, which is
+mergeable. It decrements A's refcount and, if the refcount is 0, calls
+memcg_set_kmem_cache_dying(), which sets A->memcg_params.dying = true,
+then unlocks the slab_mutex and calls flush_memcg_workqueue(), which may
+take a while.
+Note: at this point the sysfs node of A (e.g. '/kernel/slab/:0000248') is
+still present; it is only deleted in shutdown_cache(), which is called
+after flush_memcg_workqueue() is done and the slab_mutex is locked again.
+2) Meanwhile, thread 2 calls kmem_cache_create() to create B, which is
+mergeable with A (they have the same size). It takes the slab_mutex and
+calls __kmem_cache_alias() to look for a mergeable cache. Because of the
+code below, added by commit d38a2b7a9c93 ("mm: memcg/slab: fix memory
+leak at non-root kmem_cache destroy"), B cannot be merged with A, whose
+memcg_params.dying is true.
+
+int slab_unmergeable(struct kmem_cache *s)
+       if (s->refcount < 0)
+               return 1;
+
+       /*
+        * Skip the dying kmem_cache.
+        */
+       if (s->memcg_params.dying)
+               return 1;
+
+       return 0;
+ }
+
+So B has to create its own sysfs node by calling:
+ create_cache->
+       __kmem_cache_create->
+               sysfs_slab_add->
+                       kobject_init_and_add
+Because B is itself mergeable, the name of its sysfs node is based on its
+size, e.g. '/kernel/slab/:0000248', which duplicates that of A. The sysfs
+node of A is still present at this point, so kobject_init_and_add() fails
+and, as a result, kmem_cache_create() fails.
+
+Concurrently running modprobe and rmmod on the two modules below reproduces the
+issue quickly: nf_conntrack_expect, se_sess_cache. See the call trace at the end.
+
+The v4.19.y and v5.4.y LTS kernels have this problem, whereas Linux versions
+after v5.9 do not, because the patchset "The new cgroup slab memory controller"
+refactored most of the memcg slab code.
+
+A potential solution (the one this patch implements): let the dying kmem_cache
+stay mergeable; the slab_mutex prevents the race between the thread creating the
+alias kmem_cache and the thread destroying the root kmem_cache. In the
+destroying thread, after flush_memcg_workqueue() is done, check the refcount
+again: if someone took a reference while the mutex was unlocked, we don't need
+to destroy the kmem_cache completely and can reuse it.
+
+Another potential solution: revert commit d38a2b7a9c93 ("mm: memcg/slab:
+fix memory leak at non-root kmem_cache destroy"). Compared to a failing
+kmem_cache_create(), a memory leak in a special scenario seems less harmful.
+
+Call trace:
+ sysfs: cannot create duplicate filename '/kernel/slab/:0000248'
+ Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
+ Call trace:
+  dump_backtrace+0x0/0x198
+  show_stack+0x24/0x30
+  dump_stack+0xb0/0x100
+  sysfs_warn_dup+0x6c/0x88
+  sysfs_create_dir_ns+0x104/0x120
+  kobject_add_internal+0xd0/0x378
+  kobject_init_and_add+0x90/0xd8
+  sysfs_slab_add+0x16c/0x2d0
+  __kmem_cache_create+0x16c/0x1d8
+  create_cache+0xbc/0x1f8
+  kmem_cache_create_usercopy+0x1a0/0x230
+  kmem_cache_create+0x50/0x68
+  init_se_kmem_caches+0x38/0x258 [target_core_mod]
+  target_core_init_configfs+0x8c/0x390 [target_core_mod]
+  do_one_initcall+0x54/0x230
+  do_init_module+0x64/0x1ec
+  load_module+0x150c/0x16f0
+  __se_sys_finit_module+0xf0/0x108
+  __arm64_sys_finit_module+0x24/0x30
+  el0_svc_common+0x80/0x1c0
+  el0_svc_handler+0x78/0xe0
+  el0_svc+0x10/0x260
+ kobject_add_internal failed for :0000248 with -EEXIST, don't try to register things with the same name in the same directory.
+ kmem_cache_create(se_sess_cache) failed with error -17
+ Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
+ Call trace:
+  dump_backtrace+0x0/0x198
+  show_stack+0x24/0x30
+  dump_stack+0xb0/0x100
+  kmem_cache_create_usercopy+0xa8/0x230
+  kmem_cache_create+0x50/0x68
+  init_se_kmem_caches+0x38/0x258 [target_core_mod]
+  target_core_init_configfs+0x8c/0x390 [target_core_mod]
+  do_one_initcall+0x54/0x230
+  do_init_module+0x64/0x1ec
+  load_module+0x150c/0x16f0
+  __se_sys_finit_module+0xf0/0x108
+  __arm64_sys_finit_module+0x24/0x30
+  el0_svc_common+0x80/0x1c0
+  el0_svc_handler+0x78/0xe0
+  el0_svc+0x10/0x260
+
+Fixes: d38a2b7a9c93 ("mm: memcg/slab: fix memory leak at non-root kmem_cache destroy")
+Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/slab_common.c |   18 ++++++++++--------
+ 1 file changed, 10 insertions(+), 8 deletions(-)
+
+--- a/mm/slab_common.c
++++ b/mm/slab_common.c
+@@ -311,14 +311,6 @@ int slab_unmergeable(struct kmem_cache *
+       if (s->refcount < 0)
+               return 1;
+-#ifdef CONFIG_MEMCG_KMEM
+-      /*
+-       * Skip the dying kmem_cache.
+-       */
+-      if (s->memcg_params.dying)
+-              return 1;
+-#endif
+-
+       return 0;
+ }
+@@ -918,6 +910,16 @@ void kmem_cache_destroy(struct kmem_cach
+       get_online_mems();
+       mutex_lock(&slab_mutex);
++
++      /*
++       * Another thread referenced it again
++       */
++      if (READ_ONCE(s->refcount)) {
++              spin_lock_irq(&memcg_kmem_wq_lock);
++              s->memcg_params.dying = false;
++              spin_unlock_irq(&memcg_kmem_wq_lock);
++              goto out_unlock;
++      }
+ #endif
+       err = shutdown_memcg_caches(s);
index b500e1cc7acc339ad3cceda1ba8a0405667d0ec1..248b607b0f156805e1852d3ccc64e02572880dae 100644 (file)
@@ -31,3 +31,6 @@ scsi-aic7xxx-fix-unintentional-sign-extension-issue-.patch
 scsi-libsas-add-lun-number-check-in-.slave_alloc-cal.patch
 scsi-libfc-fix-array-index-out-of-bound-exception.patch
 sched-fair-fix-cfs-bandwidth-hrtimer-expiry-type.patch
+mm-slab-fix-kmem_cache_create-failed-when-sysfs-node-not-destroyed.patch
+dm-writecache-return-the-exact-table-values-that-were-set.patch
+dm-writecache-fix-writing-beyond-end-of-underlying-device-when-shrinking.patch