From ab936cbcd02072a34b60d268f94440fd5cf1970b Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Thu, 12 Jan 2012 17:17:44 -0800
Subject: memcg: add mem_cgroup_replace_page_cache() to fix LRU issue

From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

commit ab936cbcd02072a34b60d268f94440fd5cf1970b upstream.

Commit ef6a3c6311 ("mm: add replace_page_cache_page() function") added a
function, replace_page_cache_page(), which replaces a page in the
radix-tree with a new page. When doing this, the memory cgroup needs to
fix up its accounting information: memcg must check the PCG_USED bit, etc.

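For illustration only, the shape of that fix-up looks roughly like this
(a minimal sketch, assuming the 3.0-era page_cgroup API that appears in
the mm/memcontrol.c hunk below; it is not the exact upstream code):

	struct page_cgroup *pc = lookup_page_cgroup(oldpage);

	lock_page_cgroup(pc);
	if (PageCgroupUsed(pc)) {
		/* PCG_USED set: oldpage is still charged to pc->mem_cgroup,
		 * so its charge statistics must be moved over to newpage. */
	}
	unlock_page_cgroup(pc);
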
In some (many?) cases, 'newpage' is already on the LRU before
replace_page_cache_page() is called, so memcg's LRU accounting
information must be fixed up as well.

This patch adds mem_cgroup_replace_page_cache() and removes the old hooks.
In that function, the old page is uncharged without touching the
res_counter, and the new page is charged to the memcg of the old page.
When overwriting pc->mem_cgroup of the new page, zone->lru_lock is taken
to avoid races with LRU handling.

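Condensed from the mm/filemap.c hunk below, the resulting ordering in
replace_page_cache_page() is (a sketch; error handling and the
statistics updates are omitted):

	spin_lock_irq(&mapping->tree_lock);
	/* ... swap 'old' for 'new' in the radix-tree ... */
	spin_unlock_irq(&mapping->tree_lock);
	/* the memcg hook must run only after tree_lock is dropped */
	mem_cgroup_replace_page_cache(old, new);
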
Background:
replace_page_cache_page() is called by the FUSE code in its splice()
handling. Here, 'newpage' replaces 'oldpage', but this newpage is not a
newly allocated page and may already be on the LRU. LRU mis-accounting
is critical for the memory cgroup, because rmdir() checks that the
whole LRU is empty and that there is no account leak. If a page is on a
different LRU than it should be, rmdir() will fail.

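For reference, the call site looks roughly like this (a simplified
sketch of FUSE's page-stealing splice path; names follow fs/fuse/dev.c
of that era, but this is not the verbatim code):

	/* 'newpage' is a stolen pipe-buffer page; it may already be on the LRU */
	struct page *newpage = buf->page;

	err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL);
	if (err)
		return err;
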
This bug was introduced in March 2011, but there has been no bug report
yet. I guess there are not many people who use memcg and FUSE at the
same time with upstream kernels.

The result of this bug is that an admin cannot destroy a memcg because
of the account leak. So: no panic, no deadlock. And even if an active
cgroup exists, umount can still succeed, so there is no problem at
shutdown.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Miklos Szeredi <mszeredi@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>

---
 include/linux/memcontrol.h |    6 ++++++
 mm/filemap.c               |   18 ++----------------
 mm/memcontrol.c            |   44 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 52 insertions(+), 16 deletions(-)

--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -119,6 +119,8 @@ struct zone_reclaim_stat*
 mem_cgroup_get_reclaim_stat_from_page(struct page *page);
 extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
 					struct task_struct *p);
+extern void mem_cgroup_replace_page_cache(struct page *oldpage,
+					struct page *newpage);
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
 extern int do_swap_account;
@@ -370,6 +372,10 @@ static inline
 void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
 {
 }
+static inline void mem_cgroup_replace_page_cache(struct page *oldpage,
+				struct page *newpage)
+{
+}
 #endif /* CONFIG_CGROUP_MEM_CONT */
 
 #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -396,24 +396,11 @@ EXPORT_SYMBOL(filemap_write_and_wait_ran
 int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 {
 	int error;
-	struct mem_cgroup *memcg = NULL;
 
 	VM_BUG_ON(!PageLocked(old));
 	VM_BUG_ON(!PageLocked(new));
 	VM_BUG_ON(new->mapping);
 
-	/*
-	 * This is not page migration, but prepare_migration and
-	 * end_migration does enough work for charge replacement.
-	 *
-	 * In the longer term we probably want a specialized function
-	 * for moving the charge from old to new in a more efficient
-	 * manner.
-	 */
-	error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask);
-	if (error)
-		return error;
-
 	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 	if (!error) {
 		struct address_space *mapping = old->mapping;
@@ -435,13 +422,12 @@ int replace_page_cache_page(struct page
 		if (PageSwapBacked(new))
 			__inc_zone_page_state(new, NR_SHMEM);
 		spin_unlock_irq(&mapping->tree_lock);
+		/* mem_cgroup code must not be called under tree_lock */
+		mem_cgroup_replace_page_cache(old, new);
 		radix_tree_preload_end();
 		if (freepage)
 			freepage(old);
 		page_cache_release(old);
-		mem_cgroup_end_migration(memcg, old, new, true);
-	} else {
-		mem_cgroup_end_migration(memcg, old, new, false);
 	}
 
 	return error;
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3422,6 +3422,50 @@ int mem_cgroup_shmem_charge_fallback(str
 	return ret;
 }
 
+/*
+ * At page cache replacement, newpage is not under any memcg but may be
+ * on the LRU. So this function doesn't touch res_counter but handles
+ * the LRU correctly. Both pages are locked, so we cannot race with uncharge.
+ */
+void mem_cgroup_replace_page_cache(struct page *oldpage,
+				  struct page *newpage)
+{
+	struct mem_cgroup *memcg;
+	struct page_cgroup *pc;
+	struct zone *zone;
+	enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
+	unsigned long flags;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	pc = lookup_page_cgroup(oldpage);
+	/* fix accounting on old pages */
+	lock_page_cgroup(pc);
+	memcg = pc->mem_cgroup;
+	mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1);
+	ClearPageCgroupUsed(pc);
+	unlock_page_cgroup(pc);
+
+	if (PageSwapBacked(oldpage))
+		type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
+
+	zone = page_zone(newpage);
+	pc = lookup_page_cgroup(newpage);
+	/*
+	 * Even if newpage->mapping was NULL before starting the replacement,
+	 * newpage may already be on the LRU (or on a pagevec bound for the
+	 * LRU). Hold the LRU lock while we overwrite pc->mem_cgroup.
+	 */
+	spin_lock_irqsave(&zone->lru_lock, flags);
+	if (PageLRU(newpage))
+		del_page_from_lru_list(zone, newpage, page_lru(newpage));
+	__mem_cgroup_commit_charge(memcg, newpage, 1, pc, type);
+	if (PageLRU(newpage))
+		add_page_to_lru_list(zone, newpage, page_lru(newpage));
+	spin_unlock_irqrestore(&zone->lru_lock, flags);
+}
+
 #ifdef CONFIG_DEBUG_VM
 static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
 {