From: Greg Kroah-Hartman Date: Sun, 14 Aug 2016 16:14:34 +0000 (+0200) Subject: 4.4-stable patches X-Git-Tag: v3.14.76~15 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=106802a91b338db37a8156416e5645de9c2493fe;p=thirdparty%2Fkernel%2Fstable-queue.git 4.4-stable patches added patches: mm-memcontrol-fix-cgroup-creation-failure-after-many-small-jobs.patch mm-memcontrol-fix-memcg-id-ref-counter-on-swap-charge-move.patch mm-memcontrol-fix-swap-counter-leak-on-swapout-from-offline-cgroup.patch --- diff --git a/queue-4.4/mm-memcontrol-fix-cgroup-creation-failure-after-many-small-jobs.patch b/queue-4.4/mm-memcontrol-fix-cgroup-creation-failure-after-many-small-jobs.patch new file mode 100644 index 00000000000..c0088c0c750 --- /dev/null +++ b/queue-4.4/mm-memcontrol-fix-cgroup-creation-failure-after-many-small-jobs.patch @@ -0,0 +1,278 @@ +From 73f576c04b9410ed19660f74f97521bee6e1c546 Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 20 Jul 2016 15:44:57 -0700 +Subject: mm: memcontrol: fix cgroup creation failure after many small jobs + +From: Johannes Weiner + +commit 73f576c04b9410ed19660f74f97521bee6e1c546 upstream. + +The memory controller has quite a bit of state that usually outlives the +cgroup and pins its CSS until said state disappears. At the same time +it imposes a 16-bit limit on the CSS ID space to economically store IDs +in the wild. Consequently, when we use cgroups to contain frequent but +small and short-lived jobs that leave behind some page cache, we quickly +run into the 64k limitations of outstanding CSSs. Creating a new cgroup +fails with -ENOSPC while there are only a few, or even no user-visible +cgroups in existence. + +Although pinning CSSs past cgroup removal is common, there are only two +instances that actually need an ID after a cgroup is deleted: cache +shadow entries and swapout records. + +Cache shadow entries reference the ID weakly and can deal with the CSS +having disappeared when it's looked up later. They pose no hurdle. + +Swap-out records do need to pin the css to hierarchically attribute +swapins after the cgroup has been deleted; though the only pages that +remain swapped out after offlining are tmpfs/shmem pages. And those +references are under the user's control, so they are manageable. + +This patch introduces a private 16-bit memcg ID and switches swap and +cache shadow entries over to using that. This ID can then be recycled +after offlining when the CSS remains pinned only by objects that don't +specifically need it. + +This script demonstrates the problem by faulting one cache page in a new +cgroup and deleting it again: + + set -e + mkdir -p pages + for x in `seq 128000`; do + [ $((x % 1000)) -eq 0 ] && echo $x + mkdir /cgroup/foo + echo $$ >/cgroup/foo/cgroup.procs + echo trex >pages/$x + echo $$ >/cgroup/cgroup.procs + rmdir /cgroup/foo + done + +When run on an unpatched kernel, we eventually run out of possible IDs +even though there are no visible cgroups: + + [root@ham ~]# ./cssidstress.sh + [...] + 65000 + mkdir: cannot create directory '/cgroup/foo': No space left on device + +After this patch, the IDs get released upon cgroup destruction and the +cache and css objects get released once memory reclaim kicks in. 
+ +[hannes@cmpxchg.org: init the IDR] + Link: http://lkml.kernel.org/r/20160621154601.GA22431@cmpxchg.org +Fixes: b2052564e66d ("mm: memcontrol: continue cache reclaim from offlined groups") +Link: http://lkml.kernel.org/r/20160617162516.GD19084@cmpxchg.org +Signed-off-by: Johannes Weiner +Reported-by: John Garcia +Reviewed-by: Vladimir Davydov +Acked-by: Tejun Heo +Cc: Nikolay Borisov +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Michal Hocko +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/memcontrol.h | 8 +++ + mm/memcontrol.c | 95 ++++++++++++++++++++++++++++++++++++--------- + mm/slab_common.c | 2 + 3 files changed, 85 insertions(+), 20 deletions(-) + +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -174,6 +174,11 @@ struct mem_cgroup_thresholds { + struct mem_cgroup_threshold_ary *spare; + }; + ++struct mem_cgroup_id { ++ int id; ++ atomic_t ref; ++}; ++ + /* + * The memory controller data structure. The memory controller controls both + * page cache and RSS per cgroup. We would eventually like to provide +@@ -183,6 +188,9 @@ struct mem_cgroup_thresholds { + struct mem_cgroup { + struct cgroup_subsys_state css; + ++ /* Private memcg ID. Used to ID objects that outlive the cgroup */ ++ struct mem_cgroup_id id; ++ + /* Accounted resources */ + struct page_counter memory; + struct page_counter memsw; +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -272,21 +272,7 @@ static inline bool mem_cgroup_is_root(st + + static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) + { +- return memcg->css.id; +-} +- +-/* +- * A helper function to get mem_cgroup from ID. must be called under +- * rcu_read_lock(). The caller is responsible for calling +- * css_tryget_online() if the mem_cgroup is used for charging. (dropping +- * refcnt from swap can be called against removed memcg.) +- */ +-static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) +-{ +- struct cgroup_subsys_state *css; +- +- css = css_from_id(id, &memory_cgrp_subsys); +- return mem_cgroup_from_css(css); ++ return memcg->id.id; + } + + /* Writing them here to avoid exposing memcg's inner layout */ +@@ -4124,6 +4110,60 @@ static struct cftype mem_cgroup_legacy_f + { }, /* terminate */ + }; + ++/* ++ * Private memory cgroup IDR ++ * ++ * Swap-out records and page cache shadow entries need to store memcg ++ * references in constrained space, so we maintain an ID space that is ++ * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of ++ * memory-controlled cgroups to 64k. ++ * ++ * However, there usually are many references to the oflline CSS after ++ * the cgroup has been destroyed, such as page cache or reclaimable ++ * slab objects, that don't need to hang on to the ID. We want to keep ++ * those dead CSS from occupying IDs, or we might quickly exhaust the ++ * relatively small ID space and prevent the creation of new cgroups ++ * even when there are much fewer than 64k cgroups - possibly none. ++ * ++ * Maintain a private 16-bit ID space for memcg, and allow the ID to ++ * be freed and recycled when it's no longer needed, which is usually ++ * when the CSS is offlined. ++ * ++ * The only exception to that are records of swapped out tmpfs/shmem ++ * pages that need to be attributed to live ancestors on swapin. But ++ * those references are manageable from userspace. 
++ */ ++ ++static DEFINE_IDR(mem_cgroup_idr); ++ ++static void mem_cgroup_id_get(struct mem_cgroup *memcg) ++{ ++ atomic_inc(&memcg->id.ref); ++} ++ ++static void mem_cgroup_id_put(struct mem_cgroup *memcg) ++{ ++ if (atomic_dec_and_test(&memcg->id.ref)) { ++ idr_remove(&mem_cgroup_idr, memcg->id.id); ++ memcg->id.id = 0; ++ ++ /* Memcg ID pins CSS */ ++ css_put(&memcg->css); ++ } ++} ++ ++/** ++ * mem_cgroup_from_id - look up a memcg from a memcg id ++ * @id: the memcg id to look up ++ * ++ * Caller must hold rcu_read_lock(). ++ */ ++struct mem_cgroup *mem_cgroup_from_id(unsigned short id) ++{ ++ WARN_ON_ONCE(!rcu_read_lock_held()); ++ return idr_find(&mem_cgroup_idr, id); ++} ++ + static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) + { + struct mem_cgroup_per_node *pn; +@@ -4171,17 +4211,27 @@ static struct mem_cgroup *mem_cgroup_all + if (!memcg) + return NULL; + ++ memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL, ++ 1, MEM_CGROUP_ID_MAX, ++ GFP_KERNEL); ++ if (memcg->id.id < 0) ++ goto out_free; ++ + memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); + if (!memcg->stat) +- goto out_free; ++ goto out_idr; + + if (memcg_wb_domain_init(memcg, GFP_KERNEL)) + goto out_free_stat; + ++ idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); + return memcg; + + out_free_stat: + free_percpu(memcg->stat); ++out_idr: ++ if (memcg->id.id > 0) ++ idr_remove(&mem_cgroup_idr, memcg->id.id); + out_free: + kfree(memcg); + return NULL; +@@ -4277,8 +4327,9 @@ mem_cgroup_css_online(struct cgroup_subs + struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); + int ret; + +- if (css->id > MEM_CGROUP_ID_MAX) +- return -ENOSPC; ++ /* Online state pins memcg ID, memcg ID pins CSS */ ++ mem_cgroup_id_get(mem_cgroup_from_css(css)); ++ css_get(css); + + if (!parent) + return 0; +@@ -4352,6 +4403,8 @@ static void mem_cgroup_css_offline(struc + memcg_deactivate_kmem(memcg); + + wb_memcg_offline(memcg); ++ ++ mem_cgroup_id_put(memcg); + } + + static void mem_cgroup_css_released(struct cgroup_subsys_state *css) +@@ -5685,6 +5738,7 @@ void mem_cgroup_swapout(struct page *pag + if (!memcg) + return; + ++ mem_cgroup_id_get(memcg); + oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); + VM_BUG_ON_PAGE(oldid, page); + mem_cgroup_swap_statistics(memcg, true); +@@ -5703,6 +5757,9 @@ void mem_cgroup_swapout(struct page *pag + VM_BUG_ON(!irqs_disabled()); + mem_cgroup_charge_statistics(memcg, page, -1); + memcg_check_events(memcg, page); ++ ++ if (!mem_cgroup_is_root(memcg)) ++ css_put(&memcg->css); + } + + /** +@@ -5726,7 +5783,7 @@ void mem_cgroup_uncharge_swap(swp_entry_ + if (!mem_cgroup_is_root(memcg)) + page_counter_uncharge(&memcg->memsw, 1); + mem_cgroup_swap_statistics(memcg, false); +- css_put(&memcg->css); ++ mem_cgroup_id_put(memcg); + } + rcu_read_unlock(); + } +--- a/mm/slab_common.c ++++ b/mm/slab_common.c +@@ -522,7 +522,7 @@ void memcg_create_kmem_cache(struct mem_ + + cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf)); + cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, +- css->id, memcg_name_buf); ++ css->serial_nr, memcg_name_buf); + if (!cache_name) + goto out_unlock; + diff --git a/queue-4.4/mm-memcontrol-fix-memcg-id-ref-counter-on-swap-charge-move.patch b/queue-4.4/mm-memcontrol-fix-memcg-id-ref-counter-on-swap-charge-move.patch new file mode 100644 index 00000000000..c20f2017aef --- /dev/null +++ b/queue-4.4/mm-memcontrol-fix-memcg-id-ref-counter-on-swap-charge-move.patch @@ -0,0 +1,92 @@ +From 615d66c37c755c49ce022c9e5ac0875d27d2603d 
Mon Sep 17 00:00:00 2001 +From: Vladimir Davydov +Date: Thu, 11 Aug 2016 15:33:03 -0700 +Subject: mm: memcontrol: fix memcg id ref counter on swap charge move + +From: Vladimir Davydov + +commit 615d66c37c755c49ce022c9e5ac0875d27d2603d upstream. + +Since commit 73f576c04b94 ("mm: memcontrol: fix cgroup creation failure +after many small jobs") swap entries do not pin memcg->css.refcnt +directly. Instead, they pin memcg->id.ref. So we should adjust the +reference counters accordingly when moving swap charges between cgroups. + +Fixes: 73f576c04b941 ("mm: memcontrol: fix cgroup creation failure after many small jobs") +Link: http://lkml.kernel.org/r/9ce297c64954a42dc90b543bc76106c4a94f07e8.1470219853.git.vdavydov@virtuozzo.com +Signed-off-by: Vladimir Davydov +Acked-by: Michal Hocko +Acked-by: Johannes Weiner +Cc: [3.19+] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Michal Hocko +Signed-off-by: Greg Kroah-Hartman +--- + mm/memcontrol.c | 24 ++++++++++++++++++------ + 1 file changed, 18 insertions(+), 6 deletions(-) + +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -4136,9 +4136,9 @@ static struct cftype mem_cgroup_legacy_f + + static DEFINE_IDR(mem_cgroup_idr); + +-static void mem_cgroup_id_get(struct mem_cgroup *memcg) ++static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n) + { +- atomic_inc(&memcg->id.ref); ++ atomic_add(n, &memcg->id.ref); + } + + static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) +@@ -4159,9 +4159,9 @@ static struct mem_cgroup *mem_cgroup_id_ + return memcg; + } + +-static void mem_cgroup_id_put(struct mem_cgroup *memcg) ++static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n) + { +- if (atomic_dec_and_test(&memcg->id.ref)) { ++ if (atomic_sub_and_test(n, &memcg->id.ref)) { + idr_remove(&mem_cgroup_idr, memcg->id.id); + memcg->id.id = 0; + +@@ -4170,6 +4170,16 @@ static void mem_cgroup_id_put(struct mem + } + } + ++static inline void mem_cgroup_id_get(struct mem_cgroup *memcg) ++{ ++ mem_cgroup_id_get_many(memcg, 1); ++} ++ ++static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) ++{ ++ mem_cgroup_id_put_many(memcg, 1); ++} ++ + /** + * mem_cgroup_from_id - look up a memcg from a memcg id + * @id: the memcg id to look up +@@ -4856,6 +4866,8 @@ static void __mem_cgroup_clear_mc(void) + if (!mem_cgroup_is_root(mc.from)) + page_counter_uncharge(&mc.from->memsw, mc.moved_swap); + ++ mem_cgroup_id_put_many(mc.from, mc.moved_swap); ++ + /* + * we charged both to->memory and to->memsw, so we + * should uncharge to->memory. 
+@@ -4863,9 +4875,9 @@ static void __mem_cgroup_clear_mc(void) + if (!mem_cgroup_is_root(mc.to)) + page_counter_uncharge(&mc.to->memory, mc.moved_swap); + +- css_put_many(&mc.from->css, mc.moved_swap); ++ mem_cgroup_id_get_many(mc.to, mc.moved_swap); ++ css_put_many(&mc.to->css, mc.moved_swap); + +- /* we've already done css_get(mc.to) */ + mc.moved_swap = 0; + } + memcg_oom_recover(from); diff --git a/queue-4.4/mm-memcontrol-fix-swap-counter-leak-on-swapout-from-offline-cgroup.patch b/queue-4.4/mm-memcontrol-fix-swap-counter-leak-on-swapout-from-offline-cgroup.patch new file mode 100644 index 00000000000..52027194623 --- /dev/null +++ b/queue-4.4/mm-memcontrol-fix-swap-counter-leak-on-swapout-from-offline-cgroup.patch @@ -0,0 +1,103 @@ +From 1f47b61fb4077936465dcde872a4e5cc4fe708da Mon Sep 17 00:00:00 2001 +From: Vladimir Davydov +Date: Thu, 11 Aug 2016 15:33:00 -0700 +Subject: mm: memcontrol: fix swap counter leak on swapout from offline cgroup + +From: Vladimir Davydov + +commit 1f47b61fb4077936465dcde872a4e5cc4fe708da upstream. + +An offline memory cgroup might have anonymous memory or shmem left +charged to it and no swap. Since only swap entries pin the id of an +offline cgroup, such a cgroup will have no id and so an attempt to +swapout its anon/shmem will not store memory cgroup info in the swap +cgroup map. As a result, memcg->swap or memcg->memsw will never get +uncharged from it and any of its ascendants. + +Fix this by always charging swapout to the first ancestor cgroup that +hasn't released its id yet. + +[hannes@cmpxchg.org: add comment to mem_cgroup_swapout] +[vdavydov@virtuozzo.com: use WARN_ON_ONCE() in mem_cgroup_id_get_online()] + Link: http://lkml.kernel.org/r/20160803123445.GJ13263@esperanza +Fixes: 73f576c04b941 ("mm: memcontrol: fix cgroup creation failure after many small jobs") +Link: http://lkml.kernel.org/r/5336daa5c9a32e776067773d9da655d2dc126491.1470219853.git.vdavydov@virtuozzo.com +Signed-off-by: Vladimir Davydov +Acked-by: Johannes Weiner +Acked-by: Michal Hocko +Cc: [3.19+] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Michal Hocko +Signed-off-by: Greg Kroah-Hartman +--- + mm/memcontrol.c | 37 +++++++++++++++++++++++++++++++++---- + 1 file changed, 33 insertions(+), 4 deletions(-) + +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -4141,6 +4141,24 @@ static void mem_cgroup_id_get(struct mem + atomic_inc(&memcg->id.ref); + } + ++static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) ++{ ++ while (!atomic_inc_not_zero(&memcg->id.ref)) { ++ /* ++ * The root cgroup cannot be destroyed, so it's refcount must ++ * always be >= 1. 
++ */ ++ if (WARN_ON_ONCE(memcg == root_mem_cgroup)) { ++ VM_BUG_ON(1); ++ break; ++ } ++ memcg = parent_mem_cgroup(memcg); ++ if (!memcg) ++ memcg = root_mem_cgroup; ++ } ++ return memcg; ++} ++ + static void mem_cgroup_id_put(struct mem_cgroup *memcg) + { + if (atomic_dec_and_test(&memcg->id.ref)) { +@@ -5723,7 +5741,7 @@ subsys_initcall(mem_cgroup_init); + */ + void mem_cgroup_swapout(struct page *page, swp_entry_t entry) + { +- struct mem_cgroup *memcg; ++ struct mem_cgroup *memcg, *swap_memcg; + unsigned short oldid; + + VM_BUG_ON_PAGE(PageLRU(page), page); +@@ -5738,16 +5756,27 @@ void mem_cgroup_swapout(struct page *pag + if (!memcg) + return; + +- mem_cgroup_id_get(memcg); +- oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg)); ++ /* ++ * In case the memcg owning these pages has been offlined and doesn't ++ * have an ID allocated to it anymore, charge the closest online ++ * ancestor for the swap instead and transfer the memory+swap charge. ++ */ ++ swap_memcg = mem_cgroup_id_get_online(memcg); ++ oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg)); + VM_BUG_ON_PAGE(oldid, page); +- mem_cgroup_swap_statistics(memcg, true); ++ mem_cgroup_swap_statistics(swap_memcg, true); + + page->mem_cgroup = NULL; + + if (!mem_cgroup_is_root(memcg)) + page_counter_uncharge(&memcg->memory, 1); + ++ if (memcg != swap_memcg) { ++ if (!mem_cgroup_is_root(swap_memcg)) ++ page_counter_charge(&swap_memcg->memsw, 1); ++ page_counter_uncharge(&memcg->memsw, 1); ++ } ++ + /* + * Interrupts should be disabled here because the caller holds the + * mapping->tree_lock lock which is taken with interrupts-off. It is diff --git a/queue-4.4/series b/queue-4.4/series index 24a4b5d01a7..ddf5e731a7c 100644 --- a/queue-4.4/series +++ b/queue-4.4/series @@ -29,3 +29,6 @@ x86-xen-pat-remove-pat-table-init-code-from-xen.patch x86-pat-document-the-pat-initialization-sequence.patch x86-mm-pat-fix-bug_on-in-mmap_mem-on-qemu-i386.patch drm-i915-pretend-cursor-is-always-on-for-ilk-style-wm-calculations-v2.patch +mm-memcontrol-fix-cgroup-creation-failure-after-many-small-jobs.patch +mm-memcontrol-fix-swap-counter-leak-on-swapout-from-offline-cgroup.patch +mm-memcontrol-fix-memcg-id-ref-counter-on-swap-charge-move.patch
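
Taken together, the three patches queued above implement one scheme: every memcg gets a private 16-bit ID pinned by its own reference count, swap-out records take that reference (falling back to the nearest live ancestor if the group is already offline), and the ID is recycled as soon as the last reference drops. The stand-alone, single-threaded user-space sketch below models that scheme end to end. It is illustration only, not kernel code: the struct, the tiny fixed-size table and all names in it are invented, whereas the kernel uses struct mem_cgroup, an IDR and atomic reference counts.

    /*
     * Illustrative model only -- NOT kernel code.  Everything below is
     * invented for demonstration; see the patches above for the real
     * mem_cgroup_id_get/put, mem_cgroup_id_get_online and IDR usage.
     */
    #include <assert.h>
    #include <stdio.h>

    #define ID_MAX 8                        /* stand-in for MEM_CGROUP_ID_MAX */

    struct group {
            struct group *parent;
            int id;                         /* private ID, 0 = none allocated */
            int ref;                        /* references pinning the ID */
    };

    static struct group *id_table[ID_MAX];  /* stand-in for mem_cgroup_idr */

    /* Patch 1: allocate a private ID at creation; online state pins it. */
    static int id_alloc(struct group *g)
    {
            for (int i = 1; i < ID_MAX; i++) {
                    if (!id_table[i]) {
                            id_table[i] = g;
                            g->id = i;
                            g->ref = 1;
                            return i;
                    }
            }
            return -1;                      /* -ENOSPC in the kernel */
    }

    /* Patch 2: take/drop several references at once (swap charge moving). */
    static void id_get_many(struct group *g, int n)
    {
            g->ref += n;
    }

    static void id_put_many(struct group *g, int n)
    {
            g->ref -= n;
            if (g->ref == 0) {              /* last user gone: recycle the ID */
                    id_table[g->id] = NULL;
                    g->id = 0;
            }
    }

    /* Patch 3: swap-out charges the nearest ancestor whose ID is still live. */
    static struct group *id_get_online(struct group *g)
    {
            while (g->ref == 0 && g->parent)
                    g = g->parent;
            id_get_many(g, 1);
            return g;
    }

    int main(void)
    {
            struct group root = { .parent = NULL };
            struct group child = { .parent = &root };

            assert(id_alloc(&root) > 0);
            assert(id_alloc(&child) > 0);

            id_put_many(&child, 1);         /* offline: ID freed for reuse */
            assert(child.id == 0);

            struct group *swap = id_get_online(&child);
            printf("swap-out charged to id %d (the root)\n", swap->id);

            id_put_many(swap, 1);           /* swap entry freed again */
            return 0;
    }

The property the patches restore is visible in the sketch: the small ID space is pinned only by objects that genuinely need an ID (online groups and swap records), so destroying a group makes its ID reusable even while other kernel objects still hold the CSS itself.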