From 0252da713b7b1e833b0739884ea20696c7d90a9d Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 22 Jul 2021 16:51:42 +0200 Subject: [PATCH] 4.19-stable patches added patches: dm-writecache-fix-writing-beyond-end-of-underlying-device-when-shrinking.patch dm-writecache-return-the-exact-table-values-that-were-set.patch mm-slab-fix-kmem_cache_create-failed-when-sysfs-node-not-destroyed.patch --- ...-of-underlying-device-when-shrinking.patch | 78 +++++++++ ...the-exact-table-values-that-were-set.patch | 124 ++++++++++++++ ...failed-when-sysfs-node-not-destroyed.patch | 160 ++++++++++++++++++ queue-4.19/series | 3 + 4 files changed, 365 insertions(+) create mode 100644 queue-4.19/dm-writecache-fix-writing-beyond-end-of-underlying-device-when-shrinking.patch create mode 100644 queue-4.19/dm-writecache-return-the-exact-table-values-that-were-set.patch create mode 100644 queue-4.19/mm-slab-fix-kmem_cache_create-failed-when-sysfs-node-not-destroyed.patch diff --git a/queue-4.19/dm-writecache-fix-writing-beyond-end-of-underlying-device-when-shrinking.patch b/queue-4.19/dm-writecache-fix-writing-beyond-end-of-underlying-device-when-shrinking.patch new file mode 100644 index 00000000000..9f9bc52927a --- /dev/null +++ b/queue-4.19/dm-writecache-fix-writing-beyond-end-of-underlying-device-when-shrinking.patch @@ -0,0 +1,78 @@ +From 4134455f2aafdfeab50cabb4cccb35e916034b93 Mon Sep 17 00:00:00 2001 +From: Mikulas Patocka +Date: Tue, 9 Feb 2021 10:56:20 -0500 +Subject: dm writecache: fix writing beyond end of underlying device when shrinking + +From: Mikulas Patocka + +commit 4134455f2aafdfeab50cabb4cccb35e916034b93 upstream. + +Do not attempt to write any data beyond the end of the underlying data +device while shrinking it. + +The DM writecache device must be suspended when the underlying data +device is shrunk. 
+ +Signed-off-by: Mikulas Patocka +Cc: stable@vger.kernel.org +Signed-off-by: Mike Snitzer +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/dm-writecache.c | 18 ++++++++++++++++++ + 1 file changed, 18 insertions(+) + +--- a/drivers/md/dm-writecache.c ++++ b/drivers/md/dm-writecache.c +@@ -142,6 +142,7 @@ struct dm_writecache { + size_t metadata_sectors; + size_t n_blocks; + uint64_t seq_count; ++ sector_t data_device_sectors; + void *block_start; + struct wc_entry *entries; + unsigned block_size; +@@ -929,6 +930,8 @@ static void writecache_resume(struct dm_ + + wc_lock(wc); + ++ wc->data_device_sectors = i_size_read(wc->dev->bdev->bd_inode) >> SECTOR_SHIFT; ++ + if (WC_MODE_PMEM(wc)) { + persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size); + } else { +@@ -1499,6 +1502,10 @@ static bool wc_add_block(struct writebac + void *address = memory_data(wc, e); + + persistent_memory_flush_cache(address, block_size); ++ ++ if (unlikely(bio_end_sector(&wb->bio) >= wc->data_device_sectors)) ++ return true; ++ + return bio_add_page(&wb->bio, persistent_memory_page(address), + block_size, persistent_memory_page_offset(address)) != 0; + } +@@ -1571,6 +1578,9 @@ static void __writecache_writeback_pmem( + if (writecache_has_error(wc)) { + bio->bi_status = BLK_STS_IOERR; + bio_endio(&wb->bio); ++ } else if (unlikely(!bio_sectors(&wb->bio))) { ++ bio->bi_status = BLK_STS_OK; ++ bio_endio(&wb->bio); + } else { + submit_bio(&wb->bio); + } +@@ -1614,6 +1624,14 @@ static void __writecache_writeback_ssd(s + e = f; + } + ++ if (unlikely(to.sector + to.count > wc->data_device_sectors)) { ++ if (to.sector >= wc->data_device_sectors) { ++ writecache_copy_endio(0, 0, c); ++ continue; ++ } ++ from.count = to.count = wc->data_device_sectors - to.sector; ++ } ++ + dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c); + + __writeback_throttle(wc, wbl); diff --git a/queue-4.19/dm-writecache-return-the-exact-table-values-that-were-set.patch 
b/queue-4.19/dm-writecache-return-the-exact-table-values-that-were-set.patch new file mode 100644 index 00000000000..d7026ed9d1d --- /dev/null +++ b/queue-4.19/dm-writecache-return-the-exact-table-values-that-were-set.patch @@ -0,0 +1,124 @@ +From 054bee16163df023e2589db09fd27d81f7ad9e72 Mon Sep 17 00:00:00 2001 +From: Mikulas Patocka +Date: Thu, 4 Feb 2021 05:20:52 -0500 +Subject: dm writecache: return the exact table values that were set + +From: Mikulas Patocka + +commit 054bee16163df023e2589db09fd27d81f7ad9e72 upstream. + +LVM doesn't like it when the target returns different values from what +was set in the constructor. Fix dm-writecache so that the returned +table values are exactly the same as requested values. + +Signed-off-by: Mikulas Patocka +Cc: stable@vger.kernel.org # v4.18+ +Signed-off-by: Mike Snitzer +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/dm-writecache.c | 32 ++++++++++++++++---------------- + 1 file changed, 16 insertions(+), 16 deletions(-) + +--- a/drivers/md/dm-writecache.c ++++ b/drivers/md/dm-writecache.c +@@ -153,6 +153,7 @@ struct dm_writecache { + bool overwrote_committed:1; + bool memory_vmapped:1; + ++ bool start_sector_set:1; + bool high_wm_percent_set:1; + bool low_wm_percent_set:1; + bool max_writeback_jobs_set:1; +@@ -161,6 +162,10 @@ struct dm_writecache { + bool writeback_fua_set:1; + bool flush_on_suspend:1; + ++ unsigned high_wm_percent_value; ++ unsigned low_wm_percent_value; ++ unsigned autocommit_time_value; ++ + unsigned writeback_all; + struct workqueue_struct *writeback_wq; + struct work_struct writeback_work; +@@ -2045,6 +2050,7 @@ static int writecache_ctr(struct dm_targ + if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1) + goto invalid_optional; + wc->start_sector = start_sector; ++ wc->start_sector_set = true; + if (wc->start_sector != start_sector || + wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT) + goto invalid_optional; +@@ -2054,6 +2060,7 @@ static int writecache_ctr(struct dm_targ + 
goto invalid_optional; + if (high_wm_percent < 0 || high_wm_percent > 100) + goto invalid_optional; ++ wc->high_wm_percent_value = high_wm_percent; + wc->high_wm_percent_set = true; + } else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) { + string = dm_shift_arg(&as), opt_params--; +@@ -2061,6 +2068,7 @@ static int writecache_ctr(struct dm_targ + goto invalid_optional; + if (low_wm_percent < 0 || low_wm_percent > 100) + goto invalid_optional; ++ wc->low_wm_percent_value = low_wm_percent; + wc->low_wm_percent_set = true; + } else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) { + string = dm_shift_arg(&as), opt_params--; +@@ -2080,6 +2088,7 @@ static int writecache_ctr(struct dm_targ + if (autocommit_msecs > 3600000) + goto invalid_optional; + wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs); ++ wc->autocommit_time_value = autocommit_msecs; + wc->autocommit_time_set = true; + } else if (!strcasecmp(string, "fua")) { + if (WC_MODE_PMEM(wc)) { +@@ -2275,7 +2284,6 @@ static void writecache_status(struct dm_ + struct dm_writecache *wc = ti->private; + unsigned extra_args; + unsigned sz = 0; +- uint64_t x; + + switch (type) { + case STATUSTYPE_INFO: +@@ -2287,7 +2295,7 @@ static void writecache_status(struct dm_ + DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 
'p' : 's', + wc->dev->name, wc->ssd_dev->name, wc->block_size); + extra_args = 0; +- if (wc->start_sector) ++ if (wc->start_sector_set) + extra_args += 2; + if (wc->high_wm_percent_set) + extra_args += 2; +@@ -2303,26 +2311,18 @@ static void writecache_status(struct dm_ + extra_args++; + + DMEMIT("%u", extra_args); +- if (wc->start_sector) ++ if (wc->start_sector_set) + DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector); +- if (wc->high_wm_percent_set) { +- x = (uint64_t)wc->freelist_high_watermark * 100; +- x += wc->n_blocks / 2; +- do_div(x, (size_t)wc->n_blocks); +- DMEMIT(" high_watermark %u", 100 - (unsigned)x); +- } +- if (wc->low_wm_percent_set) { +- x = (uint64_t)wc->freelist_low_watermark * 100; +- x += wc->n_blocks / 2; +- do_div(x, (size_t)wc->n_blocks); +- DMEMIT(" low_watermark %u", 100 - (unsigned)x); +- } ++ if (wc->high_wm_percent_set) ++ DMEMIT(" high_watermark %u", wc->high_wm_percent_value); ++ if (wc->low_wm_percent_set) ++ DMEMIT(" low_watermark %u", wc->low_wm_percent_value); + if (wc->max_writeback_jobs_set) + DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs); + if (wc->autocommit_blocks_set) + DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks); + if (wc->autocommit_time_set) +- DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies)); ++ DMEMIT(" autocommit_time %u", wc->autocommit_time_value); + if (wc->writeback_fua_set) + DMEMIT(" %sfua", wc->writeback_fua ? 
"" : "no"); + break; diff --git a/queue-4.19/mm-slab-fix-kmem_cache_create-failed-when-sysfs-node-not-destroyed.patch b/queue-4.19/mm-slab-fix-kmem_cache_create-failed-when-sysfs-node-not-destroyed.patch new file mode 100644 index 00000000000..fe73ad1c483 --- /dev/null +++ b/queue-4.19/mm-slab-fix-kmem_cache_create-failed-when-sysfs-node-not-destroyed.patch @@ -0,0 +1,160 @@ +From sunnanyong@huawei.com Thu Jul 22 16:42:41 2021 +From: Nanyong Sun +Date: Tue, 20 Jul 2021 16:20:48 +0800 +Subject: mm: slab: fix kmem_cache_create failed when sysfs node not destroyed +To: , , , , , +Cc: , , +Message-ID: <20210720082048.2797315-1-sunnanyong@huawei.com> + +From: Nanyong Sun + +The commit d38a2b7a9c93 ("mm: memcg/slab: fix memory leak at non-root +kmem_cache destroy") introduced a problem: If one thread destroy a +kmem_cache A and another thread concurrently create a kmem_cache B, +which is mergeable with A and has same size with A, the B may fail to +create due to the duplicate sysfs node. +The scenario in detail: +1) Thread 1 uses kmem_cache_destroy() to destroy kmem_cache A which is +mergeable, it decreases A's refcount and if refcount is 0, then call +memcg_set_kmem_cache_dying() which set A->memcg_params.dying = true, +then unlock the slab_mutex and call flush_memcg_workqueue(), it may cost +a while. +Note: now the sysfs node(like '/kernel/slab/:0000248') of A is still +present, it will be deleted in shutdown_cache() which will be called +after flush_memcg_workqueue() is done and lock the slab_mutex again. +2) Now if thread 2 is coming, it use kmem_cache_create() to create B, which +is mergeable with A(their size is same), it gain the lock of slab_mutex, +then call __kmem_cache_alias() trying to find a mergeable node, because +of the below added code in commit d38a2b7a9c93 ("mm: memcg/slab: fix +memory leak at non-root kmem_cache destroy"), B is not mergeable with +A whose memcg_params.dying is true. 
+ +int slab_unmergeable(struct kmem_cache *s) + if (s->refcount < 0) + return 1; + + /* + * Skip the dying kmem_cache. + */ + if (s->memcg_params.dying) + return 1; + + return 0; + } + +So B has to create its own sysfs node by calling: + create_cache-> + __kmem_cache_create-> + sysfs_slab_add-> + kobject_init_and_add +Because B is itself mergeable, the filename of its sysfs node is based on its size, +like '/kernel/slab/:0000248', which duplicates that of A, and the sysfs +node of A is still present at this point, so kobject_init_and_add() returns +an error, which causes kmem_cache_create() to fail. + +Concurrently running modprobe and rmmod on the two modules below reproduces the issue +quickly: nf_conntrack_expect, se_sess_cache. See the call trace at the end. + +LTS versions v4.19.y and v5.4.y have this problem, whereas Linux versions after +v5.9 do not, because the patchset ("The new cgroup slab memory +controller") refactored almost all of the memcg slab code. + +A potential solution (the one this patch implements): just let the dying kmem_cache be mergeable; +the slab_mutex lock prevents the race between the thread creating the alias kmem_cache +and the thread destroying the root kmem_cache. In the destroying thread, after +flush_memcg_workqueue() is done, check the refcount again; if someone +referenced the cache again while the lock was dropped, we don't need to destroy the kmem_cache +completely, we can reuse it. + +Another potential solution: revert the commit d38a2b7a9c93 ("mm: memcg/slab: +fix memory leak at non-root kmem_cache destroy"); compared to the failure of +kmem_cache_create, the memory leak in a special scenario seems less harmful. 
+ +Call trace: + sysfs: cannot create duplicate filename '/kernel/slab/:0000248' + Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 + Call trace: + dump_backtrace+0x0/0x198 + show_stack+0x24/0x30 + dump_stack+0xb0/0x100 + sysfs_warn_dup+0x6c/0x88 + sysfs_create_dir_ns+0x104/0x120 + kobject_add_internal+0xd0/0x378 + kobject_init_and_add+0x90/0xd8 + sysfs_slab_add+0x16c/0x2d0 + __kmem_cache_create+0x16c/0x1d8 + create_cache+0xbc/0x1f8 + kmem_cache_create_usercopy+0x1a0/0x230 + kmem_cache_create+0x50/0x68 + init_se_kmem_caches+0x38/0x258 [target_core_mod] + target_core_init_configfs+0x8c/0x390 [target_core_mod] + do_one_initcall+0x54/0x230 + do_init_module+0x64/0x1ec + load_module+0x150c/0x16f0 + __se_sys_finit_module+0xf0/0x108 + __arm64_sys_finit_module+0x24/0x30 + el0_svc_common+0x80/0x1c0 + el0_svc_handler+0x78/0xe0 + el0_svc+0x10/0x260 + kobject_add_internal failed for :0000248 with -EEXIST, don't try to register things with the same name in the same directory. + kmem_cache_create(se_sess_cache) failed with error -17 + Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 + Call trace: + dump_backtrace+0x0/0x198 + show_stack+0x24/0x30 + dump_stack+0xb0/0x100 + kmem_cache_create_usercopy+0xa8/0x230 + kmem_cache_create+0x50/0x68 + init_se_kmem_caches+0x38/0x258 [target_core_mod] + target_core_init_configfs+0x8c/0x390 [target_core_mod] + do_one_initcall+0x54/0x230 + do_init_module+0x64/0x1ec + load_module+0x150c/0x16f0 + __se_sys_finit_module+0xf0/0x108 + __arm64_sys_finit_module+0x24/0x30 + el0_svc_common+0x80/0x1c0 + el0_svc_handler+0x78/0xe0 + el0_svc+0x10/0x260 + +Fixes: d38a2b7a9c93 ("mm: memcg/slab: fix memory leak at non-root kmem_cache destroy") +Signed-off-by: Nanyong Sun +Cc: stable@vger.kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + mm/slab_common.c | 18 ++++++++++-------- + 1 file changed, 10 insertions(+), 8 deletions(-) + +--- a/mm/slab_common.c ++++ b/mm/slab_common.c +@@ -311,14 +311,6 @@ int slab_unmergeable(struct 
kmem_cache * + if (s->refcount < 0) + return 1; + +-#ifdef CONFIG_MEMCG_KMEM +- /* +- * Skip the dying kmem_cache. +- */ +- if (s->memcg_params.dying) +- return 1; +-#endif +- + return 0; + } + +@@ -918,6 +910,16 @@ void kmem_cache_destroy(struct kmem_cach + get_online_mems(); + + mutex_lock(&slab_mutex); ++ ++ /* ++ * Another thread referenced it again ++ */ ++ if (READ_ONCE(s->refcount)) { ++ spin_lock_irq(&memcg_kmem_wq_lock); ++ s->memcg_params.dying = false; ++ spin_unlock_irq(&memcg_kmem_wq_lock); ++ goto out_unlock; ++ } + #endif + + err = shutdown_memcg_caches(s); diff --git a/queue-4.19/series b/queue-4.19/series index b500e1cc7ac..248b607b0f1 100644 --- a/queue-4.19/series +++ b/queue-4.19/series @@ -31,3 +31,6 @@ scsi-aic7xxx-fix-unintentional-sign-extension-issue-.patch scsi-libsas-add-lun-number-check-in-.slave_alloc-cal.patch scsi-libfc-fix-array-index-out-of-bound-exception.patch sched-fair-fix-cfs-bandwidth-hrtimer-expiry-type.patch +mm-slab-fix-kmem_cache_create-failed-when-sysfs-node-not-destroyed.patch +dm-writecache-return-the-exact-table-values-that-were-set.patch +dm-writecache-fix-writing-beyond-end-of-underlying-device-when-shrinking.patch -- 2.47.3