1 From ab936cbcd02072a34b60d268f94440fd5cf1970b Mon Sep 17 00:00:00 2001
2 From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
3 Date: Thu, 12 Jan 2012 17:17:44 -0800
4 Subject: memcg: add mem_cgroup_replace_page_cache() to fix LRU issue
6 From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
8 commit ab936cbcd02072a34b60d268f94440fd5cf1970b upstream.
10 Commit ef6a3c6311 ("mm: add replace_page_cache_page() function") added a
11 function replace_page_cache_page(). This function replaces a page in the
12 radix-tree with a new page. When doing this, memory cgroup needs to fix
13 up the accounting information. memcg needs to check the PCG_USED bit etc.
15 In some(many?) cases, 'newpage' is on LRU before calling
16 replace_page_cache(). So, memcg's LRU accounting information should be fixed, too.
19 This patch adds mem_cgroup_replace_page_cache() and removes the old hooks.
20 In that function, old pages will be unaccounted without touching
21 res_counter and new page will be accounted to the memcg (of old page).
22 When overwriting pc->mem_cgroup of newpage, take zone->lru_lock and avoid
23 races with LRU handling.
26 replace_page_cache_page() is called by FUSE code in its splice() handling.
27 Here, 'newpage' is replacing oldpage but this newpage is not a newly allocated
28 page and may be on LRU. LRU mis-accounting will be critical for memory cgroup
29 because rmdir() checks the whole LRU is empty and there is no account leak.
30 If a page is on the other LRU than it should be, rmdir() will fail.
32 This bug was added in March 2011, but no bug report yet. I guess there
33 are not many people who use memcg and FUSE at the same time with upstream kernels.
36 The result of this bug is that admin cannot destroy a memcg because of
37 account leak. So, no panic, no deadlock. And, even if an active cgroup
38 exists, umount can succeed. So no problem at shutdown.
40 Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
41 Acked-by: Johannes Weiner <hannes@cmpxchg.org>
42 Acked-by: Michal Hocko <mhocko@suse.cz>
43 Cc: Miklos Szeredi <mszeredi@suse.cz>
44 Cc: Hugh Dickins <hughd@google.com>
45 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
46 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
47 Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
50 include/linux/memcontrol.h | 6 ++++++
51 mm/filemap.c | 18 ++----------------
52 mm/memcontrol.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
53 3 files changed, 52 insertions(+), 16 deletions(-)
55 --- a/include/linux/memcontrol.h
56 +++ b/include/linux/memcontrol.h
57 @@ -119,6 +119,8 @@ struct zone_reclaim_stat*
58 mem_cgroup_get_reclaim_stat_from_page(struct page *page);
59 extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
60 struct task_struct *p);
61 +extern void mem_cgroup_replace_page_cache(struct page *oldpage,
62 + struct page *newpage);
64 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
65 extern int do_swap_account;
66 @@ -370,6 +372,10 @@ static inline
67 void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
70 +static inline void mem_cgroup_replace_page_cache(struct page *oldpage,
71 + struct page *newpage)
74 #endif /* CONFIG_CGROUP_MEM_CONT */
76 #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM)
79 @@ -396,24 +396,11 @@ EXPORT_SYMBOL(filemap_write_and_wait_ran
80 int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
83 - struct mem_cgroup *memcg = NULL;
85 VM_BUG_ON(!PageLocked(old));
86 VM_BUG_ON(!PageLocked(new));
87 VM_BUG_ON(new->mapping);
90 - * This is not page migration, but prepare_migration and
91 - * end_migration does enough work for charge replacement.
93 - * In the longer term we probably want a specialized function
94 - * for moving the charge from old to new in a more efficient
97 - error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask);
101 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
103 struct address_space *mapping = old->mapping;
104 @@ -435,13 +422,12 @@ int replace_page_cache_page(struct page
105 if (PageSwapBacked(new))
106 __inc_zone_page_state(new, NR_SHMEM);
107 spin_unlock_irq(&mapping->tree_lock);
108 + /* mem_cgroup codes must not be called under tree_lock */
109 + mem_cgroup_replace_page_cache(old, new);
110 radix_tree_preload_end();
113 page_cache_release(old);
114 - mem_cgroup_end_migration(memcg, old, new, true);
116 - mem_cgroup_end_migration(memcg, old, new, false);
120 --- a/mm/memcontrol.c
121 +++ b/mm/memcontrol.c
122 @@ -3422,6 +3422,50 @@ int mem_cgroup_shmem_charge_fallback(str
127 + * At replace page cache, newpage is not under any memcg but it's on
128 + * LRU. So, this function doesn't touch res_counter but handles LRU
129 + * in correct way. Both pages are locked so we cannot race with uncharge.
131 +void mem_cgroup_replace_page_cache(struct page *oldpage,
132 + struct page *newpage)
134 + struct mem_cgroup *memcg;
135 + struct page_cgroup *pc;
137 + enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
138 + unsigned long flags;
140 + if (mem_cgroup_disabled())
143 + pc = lookup_page_cgroup(oldpage);
144 + /* fix accounting on old pages */
145 + lock_page_cgroup(pc);
146 + memcg = pc->mem_cgroup;
147 + mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1);
148 + ClearPageCgroupUsed(pc);
149 + unlock_page_cgroup(pc);
151 + if (PageSwapBacked(oldpage))
152 + type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
154 + zone = page_zone(newpage);
155 + pc = lookup_page_cgroup(newpage);
157 + * Even if newpage->mapping was NULL before starting replacement,
158 + * the newpage may be on LRU(or pagevec for LRU) already. We lock
159 + * LRU while we overwrite pc->mem_cgroup.
161 + spin_lock_irqsave(&zone->lru_lock, flags);
162 + if (PageLRU(newpage))
163 + del_page_from_lru_list(zone, newpage, page_lru(newpage));
164 + __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type);
165 + if (PageLRU(newpage))
166 + add_page_to_lru_list(zone, newpage, page_lru(newpage));
167 + spin_unlock_irqrestore(&zone->lru_lock, flags);
170 #ifdef CONFIG_DEBUG_VM
171 static struct page_cgroup *lookup_page_cgroup_used(struct page *page)