From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Wed, 9 Dec 2020 09:17:01 +0000 (+0100)
Subject: 5.9-stable patches
X-Git-Tag: v5.9.14~37
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=e9bac76faa3b8b6af0a24fcd63127a84d516d13c;p=thirdparty%2Fkernel%2Fstable-queue.git

5.9-stable patches

added patches:
	genirq-irqdomain-add-an-irq_create_mapping_affinity-function.patch
	hugetlb_cgroup-fix-offline-of-hugetlb-cgroup-with-reservations.patch
	mm-list_lru-set-shrinker-map-bit-when-child-nr_items-is-not-zero.patch
	mm-swapfile-do-not-sleep-with-a-spin-lock-held.patch
---

diff --git a/queue-5.9/genirq-irqdomain-add-an-irq_create_mapping_affinity-function.patch b/queue-5.9/genirq-irqdomain-add-an-irq_create_mapping_affinity-function.patch
new file mode 100644
index 00000000000..273ff85c455
--- /dev/null
+++ b/queue-5.9/genirq-irqdomain-add-an-irq_create_mapping_affinity-function.patch
@@ -0,0 +1,104 @@
+From bb4c6910c8b41623104c2e64a30615682689a54d Mon Sep 17 00:00:00 2001
+From: Laurent Vivier <lvivier@redhat.com>
+Date: Thu, 26 Nov 2020 09:28:51 +0100
+Subject: genirq/irqdomain: Add an irq_create_mapping_affinity() function
+
+From: Laurent Vivier <lvivier@redhat.com>
+
+commit bb4c6910c8b41623104c2e64a30615682689a54d upstream.
+
+There is currently no way to convey the affinity of an interrupt
+via irq_create_mapping(), which creates issues for devices that
+expect that affinity to be managed by the kernel.
+
+In order to sort this out, rename irq_create_mapping() to
+irq_create_mapping_affinity() with an additional affinity parameter that
+can be passed down to irq_domain_alloc_descs().
+
+irq_create_mapping() is re-implemented as a wrapper around
+irq_create_mapping_affinity().
+
+No functional change.
+
+Fixes: e75eafb9b039 ("genirq/msi: Switch to new irq spreading infrastructure")
+Signed-off-by: Laurent Vivier <lvivier@redhat.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Greg Kurz <groug@kaod.org>
+Cc: Michael Ellerman <mpe@ellerman.id.au>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20201126082852.1178497-2-lvivier@redhat.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/irqdomain.h |   12 ++++++++++--
+ kernel/irq/irqdomain.c    |   13 ++++++++-----
+ 2 files changed, 18 insertions(+), 7 deletions(-)
+
+--- a/include/linux/irqdomain.h
++++ b/include/linux/irqdomain.h
+@@ -383,11 +383,19 @@ extern void irq_domain_associate_many(st
+ extern void irq_domain_disassociate(struct irq_domain *domain,
+ 				    unsigned int irq);
+ 
+-extern unsigned int irq_create_mapping(struct irq_domain *host,
+-				       irq_hw_number_t hwirq);
++extern unsigned int irq_create_mapping_affinity(struct irq_domain *host,
++				      irq_hw_number_t hwirq,
++				      const struct irq_affinity_desc *affinity);
+ extern unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec);
+ extern void irq_dispose_mapping(unsigned int virq);
+ 
++static inline unsigned int irq_create_mapping(struct irq_domain *host,
++					      irq_hw_number_t hwirq)
++{
++	return irq_create_mapping_affinity(host, hwirq, NULL);
++}
++
++
+ /**
+  * irq_linear_revmap() - Find a linux irq from a hw irq number.
+  * @domain: domain owning this hardware interrupt
+--- a/kernel/irq/irqdomain.c
++++ b/kernel/irq/irqdomain.c
+@@ -624,17 +624,19 @@ unsigned int irq_create_direct_mapping(s
+ EXPORT_SYMBOL_GPL(irq_create_direct_mapping);
+ 
+ /**
+- * irq_create_mapping() - Map a hardware interrupt into linux irq space
++ * irq_create_mapping_affinity() - Map a hardware interrupt into linux irq space
+  * @domain: domain owning this hardware interrupt or NULL for default domain
+  * @hwirq: hardware irq number in that domain space
++ * @affinity: irq affinity
+  *
+  * Only one mapping per hardware interrupt is permitted. Returns a linux
+  * irq number.
+  * If the sense/trigger is to be specified, set_irq_type() should be called
+  * on the number returned from that call.
+  */
+-unsigned int irq_create_mapping(struct irq_domain *domain,
+-				irq_hw_number_t hwirq)
++unsigned int irq_create_mapping_affinity(struct irq_domain *domain,
++				       irq_hw_number_t hwirq,
++				       const struct irq_affinity_desc *affinity)
+ {
+ 	struct device_node *of_node;
+ 	int virq;
+@@ -660,7 +662,8 @@ unsigned int irq_create_mapping(struct i
+ 	}
+ 
+ 	/* Allocate a virtual interrupt number */
+-	virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), NULL);
++	virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node),
++				      affinity);
+ 	if (virq <= 0) {
+ 		pr_debug("-> virq allocation failed\n");
+ 		return 0;
+@@ -676,7 +679,7 @@ unsigned int irq_create_mapping(struct i
+ 
+ 	return virq;
+ }
+-EXPORT_SYMBOL_GPL(irq_create_mapping);
++EXPORT_SYMBOL_GPL(irq_create_mapping_affinity);
+ 
+ /**
+  * irq_create_strict_mappings() - Map a range of hw irqs to fixed linux irqs
diff --git a/queue-5.9/hugetlb_cgroup-fix-offline-of-hugetlb-cgroup-with-reservations.patch b/queue-5.9/hugetlb_cgroup-fix-offline-of-hugetlb-cgroup-with-reservations.patch
new file mode 100644
index 00000000000..a4a5ea5bf1c
--- /dev/null
+++ b/queue-5.9/hugetlb_cgroup-fix-offline-of-hugetlb-cgroup-with-reservations.patch
@@ -0,0 +1,114 @@
+From 7a5bde37983d37783161681ff7c6122dfd081791 Mon Sep 17 00:00:00 2001
+From: Mike Kravetz <mike.kravetz@oracle.com>
+Date: Sat, 5 Dec 2020 22:15:12 -0800
+Subject: hugetlb_cgroup: fix offline of hugetlb cgroup with reservations
+
+From: Mike Kravetz <mike.kravetz@oracle.com>
+
+commit 7a5bde37983d37783161681ff7c6122dfd081791 upstream.
+
+Adrian Moreno was running a kubernetes 1.19 + containerd/docker workload
+using hugetlbfs.  In this environment the issue is reproduced by:
+
+ - Start a simple pod that uses the recently added HugePages medium
+   feature (pod yaml attached)
+
+ - Start a DPDK app. It doesn't need to run successfully (as in transfer
+   packets) nor interact with real hardware. It seems just initializing
+   the EAL layer (which handles hugepage reservation and locking) is
+   enough to trigger the issue
+
+ - Delete the Pod (or let it "Complete").
+
+This would result in a kworker thread going into a tight loop (top output):
+
+   1425 root      20   0       0      0      0 R  99.7   0.0   5:22.45 kworker/28:7+cgroup_destroy
+
+'perf top -g' reports:
+
+  -   63.28%     0.01%  [kernel]                    [k] worker_thread
+     - 49.97% worker_thread
+        - 52.64% process_one_work
+           - 62.08% css_killed_work_fn
+              - hugetlb_cgroup_css_offline
+                   41.52% _raw_spin_lock
+                 - 2.82% _cond_resched
+                      rcu_all_qs
+                   2.66% PageHuge
+        - 0.57% schedule
+           - 0.57% __schedule
+
+We are spinning in the do-while loop in hugetlb_cgroup_css_offline.
+Worse yet, we are holding the master cgroup lock (cgroup_mutex) while
+infinitely spinning.  Little else can be done on the system as the
+cgroup_mutex can not be acquired.
+
+Do note that the issue can be reproduced by simply offlining a hugetlb
+cgroup containing pages with reservation counts.
+
+The loop in hugetlb_cgroup_css_offline is moving page counts from the
+cgroup being offlined to the parent cgroup.  This is done for each
+hstate, and is repeated until hugetlb_cgroup_have_usage returns false.
+The routine moving counts (hugetlb_cgroup_move_parent) is only moving
+'usage' counts.  The routine hugetlb_cgroup_have_usage is checking for
+both 'usage' and 'reservation' counts.  What to do with reservation counts
+when reparenting was discussed here:
+
+https://lore.kernel.org/linux-kselftest/CAHS8izMFAYTgxym-Hzb_JmkTK1N_S9tGN71uS6MFV+R7swYu5A@mail.gmail.com/
+
+The decision was made to leave a zombie cgroup for cgroups with reservation
+counts.  Unfortunately, the code checking reservation counts was
+incorrectly added to hugetlb_cgroup_have_usage.
+
+To fix the issue, simply remove the check for reservation counts.  While
+fixing this issue, a related bug in hugetlb_cgroup_css_offline was
+noticed.  The hstate index is not reinitialized each time through the
+do-while loop.  Fix this as well.
+
+Fixes: 1adc4d419aa2 ("hugetlb_cgroup: add interface for charge/uncharge hugetlb reservations")
+Reported-by: Adrian Moreno <amorenoz@redhat.com>
+Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Tested-by: Adrian Moreno <amorenoz@redhat.com>
+Reviewed-by: Shakeel Butt <shakeelb@google.com>
+Cc: Mina Almasry <almasrymina@google.com>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Greg Thelen <gthelen@google.com>
+Cc: Sandipan Das <sandipan@linux.ibm.com>
+Cc: Shuah Khan <shuah@kernel.org>
+Cc: <stable@vger.kernel.org>
+Link: https://lkml.kernel.org/r/20201203220242.158165-1-mike.kravetz@oracle.com
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/hugetlb_cgroup.c |    8 +++-----
+ 1 file changed, 3 insertions(+), 5 deletions(-)
+
+--- a/mm/hugetlb_cgroup.c
++++ b/mm/hugetlb_cgroup.c
+@@ -82,11 +82,8 @@ static inline bool hugetlb_cgroup_have_u
+ 
+ 	for (idx = 0; idx < hugetlb_max_hstate; idx++) {
+ 		if (page_counter_read(
+-			    hugetlb_cgroup_counter_from_cgroup(h_cg, idx)) ||
+-		    page_counter_read(hugetlb_cgroup_counter_from_cgroup_rsvd(
+-			    h_cg, idx))) {
++				hugetlb_cgroup_counter_from_cgroup(h_cg, idx)))
+ 			return true;
+-		}
+ 	}
+ 	return false;
+ }
+@@ -202,9 +199,10 @@ static void hugetlb_cgroup_css_offline(s
+ 	struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
+ 	struct hstate *h;
+ 	struct page *page;
+-	int idx = 0;
++	int idx;
+ 
+ 	do {
++		idx = 0;
+ 		for_each_hstate(h) {
+ 			spin_lock(&hugetlb_lock);
+ 			list_for_each_entry(page, &h->hugepage_activelist, lru)
diff --git a/queue-5.9/mm-list_lru-set-shrinker-map-bit-when-child-nr_items-is-not-zero.patch b/queue-5.9/mm-list_lru-set-shrinker-map-bit-when-child-nr_items-is-not-zero.patch
new file mode 100644
index 00000000000..e5e26163105
--- /dev/null
+++ b/queue-5.9/mm-list_lru-set-shrinker-map-bit-when-child-nr_items-is-not-zero.patch
@@ -0,0 +1,131 @@
+From 8199be001a470209f5c938570cc199abb012fe53 Mon Sep 17 00:00:00 2001
+From: Yang Shi <shy828301@gmail.com>
+Date: Sat, 5 Dec 2020 22:14:48 -0800
+Subject: mm: list_lru: set shrinker map bit when child nr_items is not zero
+
+From: Yang Shi <shy828301@gmail.com>
+
+commit 8199be001a470209f5c938570cc199abb012fe53 upstream.
+
+When investigating a slab cache bloat problem, significant amount of
+negative dentry cache was seen, but confusingly they neither got shrunk
+by reclaimer (the host has very tight memory) nor got shrunk by dropping
+cache.  The vmcore shows there are over 14M negative dentry objects on
+lru, but tracing result shows they were even not scanned at all.
+
+Further investigation shows the memcg's vfs shrinker_map bit is not set.
+So the reclaimer or dropping cache just skip calling vfs shrinker.  So
+we have to reboot the hosts to get the memory back.
+
+I didn't manage to come up with a reproducer in test environment, and
+the problem can't be reproduced after rebooting.  But it seems there is
+race between shrinker map bit clear and reparenting by code inspection.
+The hypothesis is elaborated as below.
+
+The memcg hierarchy on our production environment looks like:
+
+                root
+               /    \
+          system   user
+
+The main workloads are running under user slice's children, and it
+creates and removes memcg frequently.  So reparenting happens very often
+under user slice, but no task is under user slice directly.
+
+So with the frequent reparenting and tight memory pressure, the below
+hypothetical race condition may happen:
+
+       CPU A                            CPU B
+reparent
+    dst->nr_items == 0
+                                 shrinker:
+                                     total_objects == 0
+    add src->nr_items to dst
+    set_bit
+                                     return SHRINK_EMPTY
+                                     clear_bit
+child memcg offline
+    replace child's kmemcg_id with
+    parent's (in memcg_offline_kmem())
+                                  list_lru_del() between shrinker runs
+                                     see parent's kmemcg_id
+                                     dec dst->nr_items
+reparent again
+    dst->nr_items may go negative
+    due to concurrent list_lru_del()
+
+                                 The second run of shrinker:
+                                     read nr_items without any
+                                     synchronization, so it may
+                                     see intermediate negative
+                                     nr_items then total_objects
+                                     may return 0 coincidentally
+
+                                     keep the bit cleared
+    dst->nr_items != 0
+    skip set_bit
+    add src->nr_items to dst
+
+After this point dst->nr_items may never go zero, so reparenting will not
+set shrinker_map bit anymore.  And since there is no task under user
+slice directly, so no new object will be added to its lru to set the
+shrinker map bit either.  That bit is kept cleared forever.
+
+How does list_lru_del() race with reparenting? It is because reparenting
+replaces children's kmemcg_id to parent's without protecting from
+nlru->lock, so list_lru_del() may see parent's kmemcg_id but actually
+deleting items from child's lru, but dec'ing parent's nr_items, so the
+parent's nr_items may go negative as commit 2788cf0c401c ("memcg:
+reparent list_lrus and free kmemcg_id on css offline") says.
+
+Since it is impossible that dst->nr_items goes negative and
+src->nr_items goes zero at the same time, so it seems we could set the
+shrinker map bit iff src->nr_items != 0.  We could synchronize
+list_lru_count_one() and reparenting with nlru->lock, but it seems
+checking src->nr_items in reparenting is the simplest and avoids lock
+contention.
+
+Fixes: fae91d6d8be5 ("mm/list_lru.c: set bit in memcg shrinker bitmap on first list_lru item appearance")
+Suggested-by: Roman Gushchin <guro@fb.com>
+Signed-off-by: Yang Shi <shy828301@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Reviewed-by: Roman Gushchin <guro@fb.com>
+Reviewed-by: Shakeel Butt <shakeelb@google.com>
+Acked-by: Kirill Tkhai <ktkhai@virtuozzo.com>
+Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
+Cc: <stable@vger.kernel.org>	[4.19]
+Link: https://lkml.kernel.org/r/20201202171749.264354-1-shy828301@gmail.com
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/list_lru.c |   10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+--- a/mm/list_lru.c
++++ b/mm/list_lru.c
+@@ -534,7 +534,6 @@ static void memcg_drain_list_lru_node(st
+ 	struct list_lru_node *nlru = &lru->node[nid];
+ 	int dst_idx = dst_memcg->kmemcg_id;
+ 	struct list_lru_one *src, *dst;
+-	bool set;
+ 
+ 	/*
+ 	 * Since list_lru_{add,del} may be called under an IRQ-safe lock,
+@@ -546,11 +545,12 @@ static void memcg_drain_list_lru_node(st
+ 	dst = list_lru_from_memcg_idx(nlru, dst_idx);
+ 
+ 	list_splice_init(&src->list, &dst->list);
+-	set = (!dst->nr_items && src->nr_items);
+-	dst->nr_items += src->nr_items;
+-	if (set)
++
++	if (src->nr_items) {
++		dst->nr_items += src->nr_items;
+ 		memcg_set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
+-	src->nr_items = 0;
++		src->nr_items = 0;
++	}
+ 
+ 	spin_unlock_irq(&nlru->lock);
+ }
diff --git a/queue-5.9/mm-swapfile-do-not-sleep-with-a-spin-lock-held.patch b/queue-5.9/mm-swapfile-do-not-sleep-with-a-spin-lock-held.patch
new file mode 100644
index 00000000000..0b15ab57084
--- /dev/null
+++ b/queue-5.9/mm-swapfile-do-not-sleep-with-a-spin-lock-held.patch
@@ -0,0 +1,53 @@
+From b11a76b37a5aa7b07c3e3eeeaae20b25475bddd3 Mon Sep 17 00:00:00 2001
+From: Qian Cai <qcai@redhat.com>
+Date: Sat, 5 Dec 2020 22:14:55 -0800
+Subject: mm/swapfile: do not sleep with a spin lock held
+
+From: Qian Cai <qcai@redhat.com>
+
+commit b11a76b37a5aa7b07c3e3eeeaae20b25475bddd3 upstream.
+
+We can't call kvfree() with a spin lock held, so defer it.  Fixes a
+might_sleep() runtime warning.
+
+Fixes: 873d7bcfd066 ("mm/swapfile.c: use kvzalloc for swap_info_struct allocation")
+Signed-off-by: Qian Cai <qcai@redhat.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: <stable@vger.kernel.org>
+Link: https://lkml.kernel.org/r/20201202151549.10350-1-qcai@redhat.com
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/swapfile.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/mm/swapfile.c
++++ b/mm/swapfile.c
+@@ -2868,6 +2868,7 @@ late_initcall(max_swapfiles_check);
+ static struct swap_info_struct *alloc_swap_info(void)
+ {
+ 	struct swap_info_struct *p;
++	struct swap_info_struct *defer = NULL;
+ 	unsigned int type;
+ 	int i;
+ 
+@@ -2896,7 +2897,7 @@ static struct swap_info_struct *alloc_sw
+ 		smp_wmb();
+ 		WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1);
+ 	} else {
+-		kvfree(p);
++		defer = p;
+ 		p = swap_info[type];
+ 		/*
+ 		 * Do not memset this entry: a racing procfs swap_next()
+@@ -2909,6 +2910,7 @@ static struct swap_info_struct *alloc_sw
+ 		plist_node_init(&p->avail_lists[i], 0);
+ 	p->flags = SWP_USED;
+ 	spin_unlock(&swap_lock);
++	kvfree(defer);
+ 	spin_lock_init(&p->lock);
+ 	spin_lock_init(&p->cont_lock);
+ 
diff --git a/queue-5.9/series b/queue-5.9/series
index fcd3342b8a3..94c2e92833b 100644
--- a/queue-5.9/series
+++ b/queue-5.9/series
@@ -43,9 +43,13 @@ io_uring-fix-recvmsg-setup-with-compat-buf-select.patch
 dm-writecache-advance-the-number-of-arguments-when-reporting-max_age.patch
 dm-writecache-fix-the-maximum-number-of-arguments.patch
 powerpc-64s-powernv-fix-memory-corruption-when-saving-slb-entries-on-mce.patch
+genirq-irqdomain-add-an-irq_create_mapping_affinity-function.patch
 powerpc-pseries-pass-msi-affinity-to-irq_create_mapping.patch
 dm-fix-bug-with-rcu-locking-in-dm_blk_report_zones.patch
 dm-fix-double-rcu-unlock-in-dm_dax_zero_page_range-error-path.patch
 dm-remove-invalid-sparse-__acquires-and-__releases-annotations.patch
 x86-uprobes-do-not-use-prefixes.nbytes-when-looping-over-prefixes.bytes.patch
 coredump-fix-core_pattern-parse-error.patch
+mm-list_lru-set-shrinker-map-bit-when-child-nr_items-is-not-zero.patch
+mm-swapfile-do-not-sleep-with-a-spin-lock-held.patch
+hugetlb_cgroup-fix-offline-of-hugetlb-cgroup-with-reservations.patch