From: Greg Kroah-Hartman Date: Wed, 9 Dec 2020 09:17:01 +0000 (+0100) Subject: 5.9-stable patches X-Git-Tag: v5.9.14~37 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=e9bac76faa3b8b6af0a24fcd63127a84d516d13c;p=thirdparty%2Fkernel%2Fstable-queue.git 5.9-stable patches added patches: genirq-irqdomain-add-an-irq_create_mapping_affinity-function.patch hugetlb_cgroup-fix-offline-of-hugetlb-cgroup-with-reservations.patch mm-list_lru-set-shrinker-map-bit-when-child-nr_items-is-not-zero.patch mm-swapfile-do-not-sleep-with-a-spin-lock-held.patch --- diff --git a/queue-5.9/genirq-irqdomain-add-an-irq_create_mapping_affinity-function.patch b/queue-5.9/genirq-irqdomain-add-an-irq_create_mapping_affinity-function.patch new file mode 100644 index 00000000000..273ff85c455 --- /dev/null +++ b/queue-5.9/genirq-irqdomain-add-an-irq_create_mapping_affinity-function.patch @@ -0,0 +1,104 @@ +From bb4c6910c8b41623104c2e64a30615682689a54d Mon Sep 17 00:00:00 2001 +From: Laurent Vivier +Date: Thu, 26 Nov 2020 09:28:51 +0100 +Subject: genirq/irqdomain: Add an irq_create_mapping_affinity() function + +From: Laurent Vivier + +commit bb4c6910c8b41623104c2e64a30615682689a54d upstream. + +There is currently no way to convey the affinity of an interrupt +via irq_create_mapping(), which creates issues for devices that +expect that affinity to be managed by the kernel. + +In order to sort this out, rename irq_create_mapping() to +irq_create_mapping_affinity() with an additional affinity parameter that +can be passed down to irq_domain_alloc_descs(). + +irq_create_mapping() is re-implemented as a wrapper around +irq_create_mapping_affinity(). + +No functional change. 
+ +Fixes: e75eafb9b039 ("genirq/msi: Switch to new irq spreading infrastructure") +Signed-off-by: Laurent Vivier +Signed-off-by: Thomas Gleixner +Reviewed-by: Greg Kurz +Cc: Michael Ellerman +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20201126082852.1178497-2-lvivier@redhat.com +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/irqdomain.h | 12 ++++++++++-- + kernel/irq/irqdomain.c | 13 ++++++++----- + 2 files changed, 18 insertions(+), 7 deletions(-) + +--- a/include/linux/irqdomain.h ++++ b/include/linux/irqdomain.h +@@ -383,11 +383,19 @@ extern void irq_domain_associate_many(st + extern void irq_domain_disassociate(struct irq_domain *domain, + unsigned int irq); + +-extern unsigned int irq_create_mapping(struct irq_domain *host, +- irq_hw_number_t hwirq); ++extern unsigned int irq_create_mapping_affinity(struct irq_domain *host, ++ irq_hw_number_t hwirq, ++ const struct irq_affinity_desc *affinity); + extern unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec); + extern void irq_dispose_mapping(unsigned int virq); + ++static inline unsigned int irq_create_mapping(struct irq_domain *host, ++ irq_hw_number_t hwirq) ++{ ++ return irq_create_mapping_affinity(host, hwirq, NULL); ++} ++ ++ + /** + * irq_linear_revmap() - Find a linux irq from a hw irq number. + * @domain: domain owning this hardware interrupt +--- a/kernel/irq/irqdomain.c ++++ b/kernel/irq/irqdomain.c +@@ -624,17 +624,19 @@ unsigned int irq_create_direct_mapping(s + EXPORT_SYMBOL_GPL(irq_create_direct_mapping); + + /** +- * irq_create_mapping() - Map a hardware interrupt into linux irq space ++ * irq_create_mapping_affinity() - Map a hardware interrupt into linux irq space + * @domain: domain owning this hardware interrupt or NULL for default domain + * @hwirq: hardware irq number in that domain space ++ * @affinity: irq affinity + * + * Only one mapping per hardware interrupt is permitted. Returns a linux + * irq number. 
+ * If the sense/trigger is to be specified, set_irq_type() should be called + * on the number returned from that call. + */ +-unsigned int irq_create_mapping(struct irq_domain *domain, +- irq_hw_number_t hwirq) ++unsigned int irq_create_mapping_affinity(struct irq_domain *domain, ++ irq_hw_number_t hwirq, ++ const struct irq_affinity_desc *affinity) + { + struct device_node *of_node; + int virq; +@@ -660,7 +662,8 @@ unsigned int irq_create_mapping(struct i + } + + /* Allocate a virtual interrupt number */ +- virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), NULL); ++ virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), ++ affinity); + if (virq <= 0) { + pr_debug("-> virq allocation failed\n"); + return 0; +@@ -676,7 +679,7 @@ unsigned int irq_create_mapping(struct i + + return virq; + } +-EXPORT_SYMBOL_GPL(irq_create_mapping); ++EXPORT_SYMBOL_GPL(irq_create_mapping_affinity); + + /** + * irq_create_strict_mappings() - Map a range of hw irqs to fixed linux irqs diff --git a/queue-5.9/hugetlb_cgroup-fix-offline-of-hugetlb-cgroup-with-reservations.patch b/queue-5.9/hugetlb_cgroup-fix-offline-of-hugetlb-cgroup-with-reservations.patch new file mode 100644 index 00000000000..a4a5ea5bf1c --- /dev/null +++ b/queue-5.9/hugetlb_cgroup-fix-offline-of-hugetlb-cgroup-with-reservations.patch @@ -0,0 +1,114 @@ +From 7a5bde37983d37783161681ff7c6122dfd081791 Mon Sep 17 00:00:00 2001 +From: Mike Kravetz +Date: Sat, 5 Dec 2020 22:15:12 -0800 +Subject: hugetlb_cgroup: fix offline of hugetlb cgroup with reservations + +From: Mike Kravetz + +commit 7a5bde37983d37783161681ff7c6122dfd081791 upstream. + +Adrian Moreno was running a kubernetes 1.19 + containerd/docker workload +using hugetlbfs. In this environment the issue is reproduced by: + + - Start a simple pod that uses the recently added HugePages medium + feature (pod yaml attached) + + - Start a DPDK app. 
It doesn't need to run successfully (as in transfer + packets) nor interact with real hardware. It seems just initializing + the EAL layer (which handles hugepage reservation and locking) is + enough to trigger the issue + + - Delete the Pod (or let it "Complete"). + +This would result in a kworker thread going into a tight loop (top output): + + 1425 root 20 0 0 0 0 R 99.7 0.0 5:22.45 kworker/28:7+cgroup_destroy + +'perf top -g' reports: + + - 63.28% 0.01% [kernel] [k] worker_thread + - 49.97% worker_thread + - 52.64% process_one_work + - 62.08% css_killed_work_fn + - hugetlb_cgroup_css_offline + 41.52% _raw_spin_lock + - 2.82% _cond_resched + rcu_all_qs + 2.66% PageHuge + - 0.57% schedule + - 0.57% __schedule + +We are spinning in the do-while loop in hugetlb_cgroup_css_offline. +Worse yet, we are holding the master cgroup lock (cgroup_mutex) while +infinitely spinning. Little else can be done on the system as the +cgroup_mutex can not be acquired. + +Do note that the issue can be reproduced by simply offlining a hugetlb +cgroup containing pages with reservation counts. + +The loop in hugetlb_cgroup_css_offline is moving page counts from the +cgroup being offlined to the parent cgroup. This is done for each +hstate, and is repeated until hugetlb_cgroup_have_usage returns false. +The routine moving counts (hugetlb_cgroup_move_parent) is only moving +'usage' counts. The routine hugetlb_cgroup_have_usage is checking for +both 'usage' and 'reservation' counts. What to do with +reservation counts when reparenting was discussed here: + +https://lore.kernel.org/linux-kselftest/CAHS8izMFAYTgxym-Hzb_JmkTK1N_S9tGN71uS6MFV+R7swYu5A@mail.gmail.com/ + +The decision was made to leave a zombie cgroup with reservation +counts. Unfortunately, the code checking reservation counts was +incorrectly added to hugetlb_cgroup_have_usage. + +To fix the issue, simply remove the check for reservation counts. 
While +fixing this issue, a related bug in hugetlb_cgroup_css_offline was +noticed. The hstate index is not reinitialized each time through the +do-while loop. Fix this as well. + +Fixes: 1adc4d419aa2 ("hugetlb_cgroup: add interface for charge/uncharge hugetlb reservations") +Reported-by: Adrian Moreno +Signed-off-by: Mike Kravetz +Signed-off-by: Andrew Morton +Tested-by: Adrian Moreno +Reviewed-by: Shakeel Butt +Cc: Mina Almasry +Cc: David Rientjes +Cc: Greg Thelen +Cc: Sandipan Das +Cc: Shuah Khan +Cc: +Link: https://lkml.kernel.org/r/20201203220242.158165-1-mike.kravetz@oracle.com +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/hugetlb_cgroup.c | 8 +++----- + 1 file changed, 3 insertions(+), 5 deletions(-) + +--- a/mm/hugetlb_cgroup.c ++++ b/mm/hugetlb_cgroup.c +@@ -82,11 +82,8 @@ static inline bool hugetlb_cgroup_have_u + + for (idx = 0; idx < hugetlb_max_hstate; idx++) { + if (page_counter_read( +- hugetlb_cgroup_counter_from_cgroup(h_cg, idx)) || +- page_counter_read(hugetlb_cgroup_counter_from_cgroup_rsvd( +- h_cg, idx))) { ++ hugetlb_cgroup_counter_from_cgroup(h_cg, idx))) + return true; +- } + } + return false; + } +@@ -202,9 +199,10 @@ static void hugetlb_cgroup_css_offline(s + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); + struct hstate *h; + struct page *page; +- int idx = 0; ++ int idx; + + do { ++ idx = 0; + for_each_hstate(h) { + spin_lock(&hugetlb_lock); + list_for_each_entry(page, &h->hugepage_activelist, lru) diff --git a/queue-5.9/mm-list_lru-set-shrinker-map-bit-when-child-nr_items-is-not-zero.patch b/queue-5.9/mm-list_lru-set-shrinker-map-bit-when-child-nr_items-is-not-zero.patch new file mode 100644 index 00000000000..e5e26163105 --- /dev/null +++ b/queue-5.9/mm-list_lru-set-shrinker-map-bit-when-child-nr_items-is-not-zero.patch @@ -0,0 +1,131 @@ +From 8199be001a470209f5c938570cc199abb012fe53 Mon Sep 17 00:00:00 2001 +From: Yang Shi +Date: Sat, 5 Dec 2020 22:14:48 -0800 +Subject: mm: list_lru: set 
shrinker map bit when child nr_items is not zero + +From: Yang Shi + +commit 8199be001a470209f5c938570cc199abb012fe53 upstream. + +When investigating a slab cache bloat problem, significant amount of +negative dentry cache was seen, but confusingly they neither got shrunk +by reclaimer (the host has very tight memory) nor be shrunk by dropping +cache. The vmcore shows there are over 14M negative dentry objects on +lru, but tracing result shows they were even not scanned at all. + +Further investigation shows the memcg's vfs shrinker_map bit is not set. +So the reclaimer or dropping cache just skip calling vfs shrinker. So +we have to reboot the hosts to get the memory back. + +I didn't manage to come up with a reproducer in test environment, and +the problem can't be reproduced after rebooting. But it seems there is +race between shrinker map bit clear and reparenting by code inspection. +The hypothesis is elaborated as below. + +The memcg hierarchy on our production environment looks like: + + root + / \ + system user + +The main workloads are running under user slice's children, and it +creates and removes memcg frequently. So reparenting happens very often +under user slice, but no task is under user slice directly. 
+ +So with the frequent reparenting and tight memory pressure, the below +hypothetical race condition may happen: + + CPU A CPU B +reparent + dst->nr_items == 0 + shrinker: + total_objects == 0 + add src->nr_items to dst + set_bit + return SHRINK_EMPTY + clear_bit +child memcg offline + replace child's kmemcg_id with + parent's (in memcg_offline_kmem()) + list_lru_del() between shrinker runs + see parent's kmemcg_id + dec dst->nr_items +reparent again + dst->nr_items may go negative + due to concurrent list_lru_del() + + The second run of shrinker: + read nr_items without any + synchronization, so it may + see intermediate negative + nr_items then total_objects + may return 0 coincidentally + + keep the bit cleared + dst->nr_items != 0 + skip set_bit + add src->nr_items to dst + +After this point dst->nr_items may never go zero, so reparenting will not +set shrinker_map bit anymore. And since there is no task under user +slice directly, so no new object will be added to its lru to set the +shrinker map bit either. That bit is kept cleared forever. + +How does list_lru_del() race with reparenting? It is because reparenting +replaces children's kmemcg_id to parent's without protecting from +nlru->lock, so list_lru_del() may see parent's kmemcg_id but actually +deleting items from child's lru, but dec'ing parent's nr_items, so the +parent's nr_items may go negative as commit 2788cf0c401c ("memcg: +reparent list_lrus and free kmemcg_id on css offline") says. + +Since it is impossible that dst->nr_items goes negative and +src->nr_items goes zero at the same time, so it seems we could set the +shrinker map bit iff src->nr_items != 0. We could synchronize +list_lru_count_one() and reparenting with nlru->lock, but it seems +checking src->nr_items in reparenting is the simplest and avoids lock +contention. 
+ +Fixes: fae91d6d8be5 ("mm/list_lru.c: set bit in memcg shrinker bitmap on first list_lru item appearance") +Suggested-by: Roman Gushchin +Signed-off-by: Yang Shi +Signed-off-by: Andrew Morton +Reviewed-by: Roman Gushchin +Reviewed-by: Shakeel Butt +Acked-by: Kirill Tkhai +Cc: Vladimir Davydov +Cc: [4.19] +Link: https://lkml.kernel.org/r/20201202171749.264354-1-shy828301@gmail.com +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/list_lru.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +--- a/mm/list_lru.c ++++ b/mm/list_lru.c +@@ -534,7 +534,6 @@ static void memcg_drain_list_lru_node(st + struct list_lru_node *nlru = &lru->node[nid]; + int dst_idx = dst_memcg->kmemcg_id; + struct list_lru_one *src, *dst; +- bool set; + + /* + * Since list_lru_{add,del} may be called under an IRQ-safe lock, +@@ -546,11 +545,12 @@ static void memcg_drain_list_lru_node(st + dst = list_lru_from_memcg_idx(nlru, dst_idx); + + list_splice_init(&src->list, &dst->list); +- set = (!dst->nr_items && src->nr_items); +- dst->nr_items += src->nr_items; +- if (set) ++ ++ if (src->nr_items) { ++ dst->nr_items += src->nr_items; + memcg_set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); +- src->nr_items = 0; ++ src->nr_items = 0; ++ } + + spin_unlock_irq(&nlru->lock); + } diff --git a/queue-5.9/mm-swapfile-do-not-sleep-with-a-spin-lock-held.patch b/queue-5.9/mm-swapfile-do-not-sleep-with-a-spin-lock-held.patch new file mode 100644 index 00000000000..0b15ab57084 --- /dev/null +++ b/queue-5.9/mm-swapfile-do-not-sleep-with-a-spin-lock-held.patch @@ -0,0 +1,53 @@ +From b11a76b37a5aa7b07c3e3eeeaae20b25475bddd3 Mon Sep 17 00:00:00 2001 +From: Qian Cai +Date: Sat, 5 Dec 2020 22:14:55 -0800 +Subject: mm/swapfile: do not sleep with a spin lock held + +From: Qian Cai + +commit b11a76b37a5aa7b07c3e3eeeaae20b25475bddd3 upstream. + +We can't call kvfree() with a spin lock held, so defer it. Fixes a +might_sleep() runtime warning. 
+ +Fixes: 873d7bcfd066 ("mm/swapfile.c: use kvzalloc for swap_info_struct allocation") +Signed-off-by: Qian Cai +Signed-off-by: Andrew Morton +Reviewed-by: Andrew Morton +Cc: Hugh Dickins +Cc: +Link: https://lkml.kernel.org/r/20201202151549.10350-1-qcai@redhat.com +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/swapfile.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -2868,6 +2868,7 @@ late_initcall(max_swapfiles_check); + static struct swap_info_struct *alloc_swap_info(void) + { + struct swap_info_struct *p; ++ struct swap_info_struct *defer = NULL; + unsigned int type; + int i; + +@@ -2896,7 +2897,7 @@ static struct swap_info_struct *alloc_sw + smp_wmb(); + WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1); + } else { +- kvfree(p); ++ defer = p; + p = swap_info[type]; + /* + * Do not memset this entry: a racing procfs swap_next() +@@ -2909,6 +2910,7 @@ static struct swap_info_struct *alloc_sw + plist_node_init(&p->avail_lists[i], 0); + p->flags = SWP_USED; + spin_unlock(&swap_lock); ++ kvfree(defer); + spin_lock_init(&p->lock); + spin_lock_init(&p->cont_lock); + diff --git a/queue-5.9/series b/queue-5.9/series index fcd3342b8a3..94c2e92833b 100644 --- a/queue-5.9/series +++ b/queue-5.9/series @@ -43,9 +43,13 @@ io_uring-fix-recvmsg-setup-with-compat-buf-select.patch dm-writecache-advance-the-number-of-arguments-when-reporting-max_age.patch dm-writecache-fix-the-maximum-number-of-arguments.patch powerpc-64s-powernv-fix-memory-corruption-when-saving-slb-entries-on-mce.patch +genirq-irqdomain-add-an-irq_create_mapping_affinity-function.patch powerpc-pseries-pass-msi-affinity-to-irq_create_mapping.patch dm-fix-bug-with-rcu-locking-in-dm_blk_report_zones.patch dm-fix-double-rcu-unlock-in-dm_dax_zero_page_range-error-path.patch dm-remove-invalid-sparse-__acquires-and-__releases-annotations.patch x86-uprobes-do-not-use-prefixes.nbytes-when-looping-over-prefixes.bytes.patch 
coredump-fix-core_pattern-parse-error.patch +mm-list_lru-set-shrinker-map-bit-when-child-nr_items-is-not-zero.patch +mm-swapfile-do-not-sleep-with-a-spin-lock-held.patch +hugetlb_cgroup-fix-offline-of-hugetlb-cgroup-with-reservations.patch