1 From ab936cbcd02072a34b60d268f94440fd5cf1970b Mon Sep 17 00:00:00 2001
2 From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
3 Date: Thu, 12 Jan 2012 17:17:44 -0800
4 Subject: memcg: add mem_cgroup_replace_page_cache() to fix LRU issue
6 From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
8 commit ab936cbcd02072a34b60d268f94440fd5cf1970b upstream.
10 Commit ef6a3c6311 ("mm: add replace_page_cache_page() function") added a
11 function replace_page_cache_page(). This function replaces a page in the
12 radix-tree with a new page. When doing this, memory cgroup needs to fix
13 up the accounting information. memcg needs to check the PCG_USED bit etc.
15 In some(many?) cases, 'newpage' is on LRU before calling
16 replace_page_cache(). So, memcg's LRU accounting information should be fixed, too.
19 This patch adds mem_cgroup_replace_page_cache() and removes the old hooks.
20 In that function, old pages will be unaccounted without touching
21 res_counter and new page will be accounted to the memcg (of old page).
22 When overwriting pc->mem_cgroup of newpage, take zone->lru_lock and avoid
23 races with LRU handling.
26 replace_page_cache_page() is called by FUSE code in its splice() handling.
27 Here, 'newpage' is replacing oldpage but this newpage is not a newly allocated
28 page and may be on LRU. LRU mis-accounting will be critical for memory cgroup
29 because rmdir() checks the whole LRU is empty and there is no account leak.
30 If a page is on the other LRU than it should be, rmdir() will fail.
32 This bug was added in March 2011, but no bug report yet. I guess there
33 are not many people who use memcg and FUSE at the same time with upstream kernels.
36 The result of this bug is that admin cannot destroy a memcg because of
37 account leak. So, no panic, no deadlock. And, even if an active cgroup
38 exists, umount can succeed. So no problem at shutdown.
40 Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
41 Acked-by: Johannes Weiner <hannes@cmpxchg.org>
42 Acked-by: Michal Hocko <mhocko@suse.cz>
43 Cc: Miklos Szeredi <mszeredi@suse.cz>
44 Cc: Hugh Dickins <hughd@google.com>
45 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
46 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
47 Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
50 include/linux/memcontrol.h | 6 ++++++
51 mm/filemap.c | 18 ++----------------
52 mm/memcontrol.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
53 3 files changed, 52 insertions(+), 16 deletions(-)
55 --- a/include/linux/memcontrol.h
56 +++ b/include/linux/memcontrol.h
57 @@ -119,6 +119,8 @@ struct zone_reclaim_stat*
58 mem_cgroup_get_reclaim_stat_from_page(struct page *page);
59 extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
60 struct task_struct *p);
61 +extern void mem_cgroup_replace_page_cache(struct page *oldpage,
62 + struct page *newpage);
64 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
65 extern int do_swap_account;
66 @@ -370,6 +372,10 @@ static inline
67 void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
70 +static inline void mem_cgroup_replace_page_cache(struct page *oldpage,
71 + struct page *newpage)
74 #endif /* CONFIG_CGROUP_MEM_CONT */
76 #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM)
79 @@ -396,24 +396,11 @@ EXPORT_SYMBOL(filemap_write_and_wait_ran
80 int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
83 - struct mem_cgroup *memcg = NULL;
85 VM_BUG_ON(!PageLocked(old));
86 VM_BUG_ON(!PageLocked(new));
87 VM_BUG_ON(new->mapping);
90 - * This is not page migration, but prepare_migration and
91 - * end_migration does enough work for charge replacement.
93 - * In the longer term we probably want a specialized function
94 - * for moving the charge from old to new in a more efficient
97 - error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask);
101 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
103 struct address_space *mapping = old->mapping;
104 @@ -435,13 +422,12 @@ int replace_page_cache_page(struct page
105 if (PageSwapBacked(new))
106 __inc_zone_page_state(new, NR_SHMEM);
107 spin_unlock_irq(&mapping->tree_lock);
108 + /* mem_cgroup codes must not be called under tree_lock */
109 + mem_cgroup_replace_page_cache(old, new);
110 radix_tree_preload_end();
113 page_cache_release(old);
114 - mem_cgroup_end_migration(memcg, old, new, true);
116 - mem_cgroup_end_migration(memcg, old, new, false);
120 --- a/mm/memcontrol.c
121 +++ b/mm/memcontrol.c
122 @@ -3422,6 +3422,50 @@ int mem_cgroup_shmem_charge_fallback(str
127 + * At replace page cache, newpage is not under any memcg but it's on
128 + * LRU. So, this function doesn't touch res_counter but handles LRU
129 + * in correct way. Both pages are locked so we cannot race with uncharge.
131 +void mem_cgroup_replace_page_cache(struct page *oldpage,
132 + struct page *newpage)
134 + struct mem_cgroup *memcg;
135 + struct page_cgroup *pc;
137 + enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
138 + unsigned long flags;
140 + if (mem_cgroup_disabled())
143 + pc = lookup_page_cgroup(oldpage);
144 + /* fix accounting on old pages */
145 + lock_page_cgroup(pc);
146 + memcg = pc->mem_cgroup;
147 + mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -1);
148 + ClearPageCgroupUsed(pc);
149 + unlock_page_cgroup(pc);
151 + if (PageSwapBacked(oldpage))
152 + type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
154 + zone = page_zone(newpage);
155 + pc = lookup_page_cgroup(newpage);
157 + * Even if newpage->mapping was NULL before starting replacement,
158 + * the newpage may be on LRU(or pagevec for LRU) already. We lock
159 + * LRU while we overwrite pc->mem_cgroup.
161 + spin_lock_irqsave(&zone->lru_lock, flags);
162 + if (PageLRU(newpage))
163 + del_page_from_lru_list(zone, newpage, page_lru(newpage));
164 + __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type);
165 + if (PageLRU(newpage))
166 + add_page_to_lru_list(zone, newpage, page_lru(newpage));
167 + spin_unlock_irqrestore(&zone->lru_lock, flags);
170 #ifdef CONFIG_DEBUG_VM
171 static struct page_cgroup *lookup_page_cgroup_used(struct page *page)