From 0252da713b7b1e833b0739884ea20696c7d90a9d Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Thu, 22 Jul 2021 16:51:42 +0200
Subject: [PATCH] 4.19-stable patches

added patches:
	dm-writecache-fix-writing-beyond-end-of-underlying-device-when-shrinking.patch
	dm-writecache-return-the-exact-table-values-that-were-set.patch
	mm-slab-fix-kmem_cache_create-failed-when-sysfs-node-not-destroyed.patch
---
 ...-of-underlying-device-when-shrinking.patch |  78 +++++++++
 ...the-exact-table-values-that-were-set.patch | 124 ++++++++++++++
 ...failed-when-sysfs-node-not-destroyed.patch | 160 ++++++++++++++++++
 queue-4.19/series                             |   3 +
 4 files changed, 365 insertions(+)
 create mode 100644 queue-4.19/dm-writecache-fix-writing-beyond-end-of-underlying-device-when-shrinking.patch
 create mode 100644 queue-4.19/dm-writecache-return-the-exact-table-values-that-were-set.patch
 create mode 100644 queue-4.19/mm-slab-fix-kmem_cache_create-failed-when-sysfs-node-not-destroyed.patch

diff --git a/queue-4.19/dm-writecache-fix-writing-beyond-end-of-underlying-device-when-shrinking.patch b/queue-4.19/dm-writecache-fix-writing-beyond-end-of-underlying-device-when-shrinking.patch
new file mode 100644
index 00000000000..9f9bc52927a
--- /dev/null
+++ b/queue-4.19/dm-writecache-fix-writing-beyond-end-of-underlying-device-when-shrinking.patch
@@ -0,0 +1,78 @@
+From 4134455f2aafdfeab50cabb4cccb35e916034b93 Mon Sep 17 00:00:00 2001
+From: Mikulas Patocka <mpatocka@redhat.com>
+Date: Tue, 9 Feb 2021 10:56:20 -0500
+Subject: dm writecache: fix writing beyond end of underlying device when shrinking
+
+From: Mikulas Patocka <mpatocka@redhat.com>
+
+commit 4134455f2aafdfeab50cabb4cccb35e916034b93 upstream.
+
+Do not attempt to write any data beyond the end of the underlying data
+device while shrinking it.
+
+The DM writecache device must be suspended when the underlying data
+device is shrunk.
+
+Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Mike Snitzer <snitzer@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/md/dm-writecache.c |   18 ++++++++++++++++++
+ 1 file changed, 18 insertions(+)
+
+--- a/drivers/md/dm-writecache.c
++++ b/drivers/md/dm-writecache.c
+@@ -142,6 +142,7 @@ struct dm_writecache {
+ 	size_t metadata_sectors;
+ 	size_t n_blocks;
+ 	uint64_t seq_count;
++	sector_t data_device_sectors;
+ 	void *block_start;
+ 	struct wc_entry *entries;
+ 	unsigned block_size;
+@@ -929,6 +930,8 @@ static void writecache_resume(struct dm_
+ 
+ 	wc_lock(wc);
+ 
++	wc->data_device_sectors = i_size_read(wc->dev->bdev->bd_inode) >> SECTOR_SHIFT;
++
+ 	if (WC_MODE_PMEM(wc)) {
+ 		persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
+ 	} else {
+@@ -1499,6 +1502,10 @@ static bool wc_add_block(struct writebac
+ 	void *address = memory_data(wc, e);
+ 
+ 	persistent_memory_flush_cache(address, block_size);
++
++	if (unlikely(bio_end_sector(&wb->bio) >= wc->data_device_sectors))
++		return true;
++
+ 	return bio_add_page(&wb->bio, persistent_memory_page(address),
+ 			    block_size, persistent_memory_page_offset(address)) != 0;
+ }
+@@ -1571,6 +1578,9 @@ static void __writecache_writeback_pmem(
+ 		if (writecache_has_error(wc)) {
+ 			bio->bi_status = BLK_STS_IOERR;
+ 			bio_endio(&wb->bio);
++		} else if (unlikely(!bio_sectors(&wb->bio))) {
++			bio->bi_status = BLK_STS_OK;
++			bio_endio(&wb->bio);
+ 		} else {
+ 			submit_bio(&wb->bio);
+ 		}
+@@ -1614,6 +1624,14 @@ static void __writecache_writeback_ssd(s
+ 			e = f;
+ 		}
+ 
++		if (unlikely(to.sector + to.count > wc->data_device_sectors)) {
++			if (to.sector >= wc->data_device_sectors) {
++				writecache_copy_endio(0, 0, c);
++				continue;
++			}
++			from.count = to.count = wc->data_device_sectors - to.sector;
++		}
++
+ 		dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);
+ 
+ 		__writeback_throttle(wc, wbl);
diff --git a/queue-4.19/dm-writecache-return-the-exact-table-values-that-were-set.patch b/queue-4.19/dm-writecache-return-the-exact-table-values-that-were-set.patch
new file mode 100644
index 00000000000..d7026ed9d1d
--- /dev/null
+++ b/queue-4.19/dm-writecache-return-the-exact-table-values-that-were-set.patch
@@ -0,0 +1,124 @@
+From 054bee16163df023e2589db09fd27d81f7ad9e72 Mon Sep 17 00:00:00 2001
+From: Mikulas Patocka <mpatocka@redhat.com>
+Date: Thu, 4 Feb 2021 05:20:52 -0500
+Subject: dm writecache: return the exact table values that were set
+
+From: Mikulas Patocka <mpatocka@redhat.com>
+
+commit 054bee16163df023e2589db09fd27d81f7ad9e72 upstream.
+
+LVM doesn't like it when the target returns different values from what
+was set in the constructor. Fix dm-writecache so that the returned
+table values are exactly the same as requested values.
+
+Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
+Cc: stable@vger.kernel.org # v4.18+
+Signed-off-by: Mike Snitzer <snitzer@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/md/dm-writecache.c |   32 ++++++++++++++++----------------
+ 1 file changed, 16 insertions(+), 16 deletions(-)
+
+--- a/drivers/md/dm-writecache.c
++++ b/drivers/md/dm-writecache.c
+@@ -153,6 +153,7 @@ struct dm_writecache {
+ 	bool overwrote_committed:1;
+ 	bool memory_vmapped:1;
+ 
++	bool start_sector_set:1;
+ 	bool high_wm_percent_set:1;
+ 	bool low_wm_percent_set:1;
+ 	bool max_writeback_jobs_set:1;
+@@ -161,6 +162,10 @@ struct dm_writecache {
+ 	bool writeback_fua_set:1;
+ 	bool flush_on_suspend:1;
+ 
++	unsigned high_wm_percent_value;
++	unsigned low_wm_percent_value;
++	unsigned autocommit_time_value;
++
+ 	unsigned writeback_all;
+ 	struct workqueue_struct *writeback_wq;
+ 	struct work_struct writeback_work;
+@@ -2045,6 +2050,7 @@ static int writecache_ctr(struct dm_targ
+ 			if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1)
+ 				goto invalid_optional;
+ 			wc->start_sector = start_sector;
++			wc->start_sector_set = true;
+ 			if (wc->start_sector != start_sector ||
+ 			    wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT)
+ 				goto invalid_optional;
+@@ -2054,6 +2060,7 @@ static int writecache_ctr(struct dm_targ
+ 				goto invalid_optional;
+ 			if (high_wm_percent < 0 || high_wm_percent > 100)
+ 				goto invalid_optional;
++			wc->high_wm_percent_value = high_wm_percent;
+ 			wc->high_wm_percent_set = true;
+ 		} else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
+ 			string = dm_shift_arg(&as), opt_params--;
+@@ -2061,6 +2068,7 @@ static int writecache_ctr(struct dm_targ
+ 				goto invalid_optional;
+ 			if (low_wm_percent < 0 || low_wm_percent > 100)
+ 				goto invalid_optional;
++			wc->low_wm_percent_value = low_wm_percent;
+ 			wc->low_wm_percent_set = true;
+ 		} else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
+ 			string = dm_shift_arg(&as), opt_params--;
+@@ -2080,6 +2088,7 @@ static int writecache_ctr(struct dm_targ
+ 			if (autocommit_msecs > 3600000)
+ 				goto invalid_optional;
+ 			wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
++			wc->autocommit_time_value = autocommit_msecs;
+ 			wc->autocommit_time_set = true;
+ 		} else if (!strcasecmp(string, "fua")) {
+ 			if (WC_MODE_PMEM(wc)) {
+@@ -2275,7 +2284,6 @@ static void writecache_status(struct dm_
+ 	struct dm_writecache *wc = ti->private;
+ 	unsigned extra_args;
+ 	unsigned sz = 0;
+-	uint64_t x;
+ 
+ 	switch (type) {
+ 	case STATUSTYPE_INFO:
+@@ -2287,7 +2295,7 @@ static void writecache_status(struct dm_
+ 		DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
+ 				wc->dev->name, wc->ssd_dev->name, wc->block_size);
+ 		extra_args = 0;
+-		if (wc->start_sector)
++		if (wc->start_sector_set)
+ 			extra_args += 2;
+ 		if (wc->high_wm_percent_set)
+ 			extra_args += 2;
+@@ -2303,26 +2311,18 @@ static void writecache_status(struct dm_
+ 			extra_args++;
+ 
+ 		DMEMIT("%u", extra_args);
+-		if (wc->start_sector)
++		if (wc->start_sector_set)
+ 			DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
+-		if (wc->high_wm_percent_set) {
+-			x = (uint64_t)wc->freelist_high_watermark * 100;
+-			x += wc->n_blocks / 2;
+-			do_div(x, (size_t)wc->n_blocks);
+-			DMEMIT(" high_watermark %u", 100 - (unsigned)x);
+-		}
+-		if (wc->low_wm_percent_set) {
+-			x = (uint64_t)wc->freelist_low_watermark * 100;
+-			x += wc->n_blocks / 2;
+-			do_div(x, (size_t)wc->n_blocks);
+-			DMEMIT(" low_watermark %u", 100 - (unsigned)x);
+-		}
++		if (wc->high_wm_percent_set)
++			DMEMIT(" high_watermark %u", wc->high_wm_percent_value);
++		if (wc->low_wm_percent_set)
++			DMEMIT(" low_watermark %u", wc->low_wm_percent_value);
+ 		if (wc->max_writeback_jobs_set)
+ 			DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
+ 		if (wc->autocommit_blocks_set)
+ 			DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
+ 		if (wc->autocommit_time_set)
+-			DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies));
++			DMEMIT(" autocommit_time %u", wc->autocommit_time_value);
+ 		if (wc->writeback_fua_set)
+ 			DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
+ 		break;
diff --git a/queue-4.19/mm-slab-fix-kmem_cache_create-failed-when-sysfs-node-not-destroyed.patch b/queue-4.19/mm-slab-fix-kmem_cache_create-failed-when-sysfs-node-not-destroyed.patch
new file mode 100644
index 00000000000..fe73ad1c483
--- /dev/null
+++ b/queue-4.19/mm-slab-fix-kmem_cache_create-failed-when-sysfs-node-not-destroyed.patch
@@ -0,0 +1,160 @@
+From sunnanyong@huawei.com  Thu Jul 22 16:42:41 2021
+From: Nanyong Sun <sunnanyong@huawei.com>
+Date: Tue, 20 Jul 2021 16:20:48 +0800
+Subject: mm: slab: fix kmem_cache_create failed when sysfs node not destroyed
+To: <songmuchun@bytedance.com>, <cl@linux.com>, <penberg@kernel.org>, <rientjes@google.com>, <iamjoonsoo.kim@lge.com>, <akpm@linux-foundation.org>
+Cc: <linux-mm@kvack.org>, <linux-kernel@vger.kernel.org>, <stable@vger.kernel.org>
+Message-ID: <20210720082048.2797315-1-sunnanyong@huawei.com>
+
+From: Nanyong Sun <sunnanyong@huawei.com>
+
+The commit d38a2b7a9c93 ("mm: memcg/slab: fix memory leak at non-root
+kmem_cache destroy") introduced a problem: If one thread destroy a
+kmem_cache A and another thread concurrently create a kmem_cache B,
+which is mergeable with A and has same size with A, the B may fail to
+create due to the duplicate sysfs node.
+The scenario in detail:
+1) Thread 1 uses kmem_cache_destroy() to destroy kmem_cache A which is
+mergeable, it decreases A's refcount and if refcount is 0, then call
+memcg_set_kmem_cache_dying() which set A->memcg_params.dying = true,
+then unlock the slab_mutex and call flush_memcg_workqueue(), it may cost
+a while.
+Note: now the sysfs node(like '/kernel/slab/:0000248') of A is still
+present, it will be deleted in shutdown_cache() which will be called
+after flush_memcg_workqueue() is done and lock the slab_mutex again.
+2) Now if thread 2 is coming, it use kmem_cache_create() to create B, which
+is mergeable with A(their size is same), it gain the lock of slab_mutex,
+then call __kmem_cache_alias() trying to find a mergeable node, because
+of the below added code in commit d38a2b7a9c93 ("mm: memcg/slab: fix
+memory leak at non-root kmem_cache destroy"), B is not mergeable with
+A whose memcg_params.dying is true.
+
+int slab_unmergeable(struct kmem_cache *s)
+ 	if (s->refcount < 0)
+ 		return 1;
+
+	/*
+	 * Skip the dying kmem_cache.
+	 */
+	if (s->memcg_params.dying)
+		return 1;
+
+ 	return 0;
+ }
+
+So B has to create its own sysfs node by calling:
+ create_cache->
+	__kmem_cache_create->
+		sysfs_slab_add->
+			kobject_init_and_add
+Because B is mergeable itself, its filename of sysfs node is based on its size,
+like '/kernel/slab/:0000248', which is duplicate with A, and the sysfs
+node of A is still present now, so kobject_init_and_add() will return
+fail and result in kmem_cache_create() fail.
+
+Concurrently modprobe and rmmod the two modules below can reproduce the issue
+quickly: nf_conntrack_expect, se_sess_cache. See call trace in the end.
+
+LTS versions of v4.19.y and v5.4.y have this problem, whereas linux versions after
+v5.9 do not have this problem because the patchset: ("The new cgroup slab memory
+controller") almost refactored memcg slab.
+
+A potential solution(this patch belongs): Just let the dying kmem_cache be mergeable,
+the slab_mutex lock can prevent the race between alias kmem_cache creating thread
+and root kmem_cache destroying thread. In the destroying thread, after
+flush_memcg_workqueue() is done, judge the refcount again, if someone
+reference it again during un-lock time, we don't need to destroy the kmem_cache
+completely, we can reuse it.
+
+Another potential solution: revert the commit d38a2b7a9c93 ("mm: memcg/slab:
+fix memory leak at non-root kmem_cache destroy"), compare to the fail of
+kmem_cache_create, the memory leak in special scenario seems less harmful.
+
+Call trace:
+ sysfs: cannot create duplicate filename '/kernel/slab/:0000248'
+ Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
+ Call trace:
+  dump_backtrace+0x0/0x198
+  show_stack+0x24/0x30
+  dump_stack+0xb0/0x100
+  sysfs_warn_dup+0x6c/0x88
+  sysfs_create_dir_ns+0x104/0x120
+  kobject_add_internal+0xd0/0x378
+  kobject_init_and_add+0x90/0xd8
+  sysfs_slab_add+0x16c/0x2d0
+  __kmem_cache_create+0x16c/0x1d8
+  create_cache+0xbc/0x1f8
+  kmem_cache_create_usercopy+0x1a0/0x230
+  kmem_cache_create+0x50/0x68
+  init_se_kmem_caches+0x38/0x258 [target_core_mod]
+  target_core_init_configfs+0x8c/0x390 [target_core_mod]
+  do_one_initcall+0x54/0x230
+  do_init_module+0x64/0x1ec
+  load_module+0x150c/0x16f0
+  __se_sys_finit_module+0xf0/0x108
+  __arm64_sys_finit_module+0x24/0x30
+  el0_svc_common+0x80/0x1c0
+  el0_svc_handler+0x78/0xe0
+  el0_svc+0x10/0x260
+ kobject_add_internal failed for :0000248 with -EEXIST, don't try to register things with the same name in the same directory.
+ kmem_cache_create(se_sess_cache) failed with error -17
+ Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
+ Call trace:
+  dump_backtrace+0x0/0x198
+  show_stack+0x24/0x30
+  dump_stack+0xb0/0x100
+  kmem_cache_create_usercopy+0xa8/0x230
+  kmem_cache_create+0x50/0x68
+  init_se_kmem_caches+0x38/0x258 [target_core_mod]
+  target_core_init_configfs+0x8c/0x390 [target_core_mod]
+  do_one_initcall+0x54/0x230
+  do_init_module+0x64/0x1ec
+  load_module+0x150c/0x16f0
+  __se_sys_finit_module+0xf0/0x108
+  __arm64_sys_finit_module+0x24/0x30
+  el0_svc_common+0x80/0x1c0
+  el0_svc_handler+0x78/0xe0
+  el0_svc+0x10/0x260
+
+Fixes: d38a2b7a9c93 ("mm: memcg/slab: fix memory leak at non-root kmem_cache destroy")
+Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
+Cc: stable@vger.kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/slab_common.c |   18 ++++++++++--------
+ 1 file changed, 10 insertions(+), 8 deletions(-)
+
+--- a/mm/slab_common.c
++++ b/mm/slab_common.c
+@@ -311,14 +311,6 @@ int slab_unmergeable(struct kmem_cache *
+ 	if (s->refcount < 0)
+ 		return 1;
+ 
+-#ifdef CONFIG_MEMCG_KMEM
+-	/*
+-	 * Skip the dying kmem_cache.
+-	 */
+-	if (s->memcg_params.dying)
+-		return 1;
+-#endif
+-
+ 	return 0;
+ }
+ 
+@@ -918,6 +910,16 @@ void kmem_cache_destroy(struct kmem_cach
+ 	get_online_mems();
+ 
+ 	mutex_lock(&slab_mutex);
++
++	/*
++	 * Another thread referenced it again
++	 */
++	if (READ_ONCE(s->refcount)) {
++		spin_lock_irq(&memcg_kmem_wq_lock);
++		s->memcg_params.dying = false;
++		spin_unlock_irq(&memcg_kmem_wq_lock);
++		goto out_unlock;
++	}
+ #endif
+ 
+ 	err = shutdown_memcg_caches(s);
diff --git a/queue-4.19/series b/queue-4.19/series
index b500e1cc7ac..248b607b0f1 100644
--- a/queue-4.19/series
+++ b/queue-4.19/series
@@ -31,3 +31,6 @@ scsi-aic7xxx-fix-unintentional-sign-extension-issue-.patch
 scsi-libsas-add-lun-number-check-in-.slave_alloc-cal.patch
 scsi-libfc-fix-array-index-out-of-bound-exception.patch
 sched-fair-fix-cfs-bandwidth-hrtimer-expiry-type.patch
+mm-slab-fix-kmem_cache_create-failed-when-sysfs-node-not-destroyed.patch
+dm-writecache-return-the-exact-table-values-that-were-set.patch
+dm-writecache-fix-writing-beyond-end-of-underlying-device-when-shrinking.patch
-- 
2.47.3