]> git.ipfire.org Git - people/pmueller/ipfire-2.x.git/blob - src/patches/suse-2.6.27.25/patches.fixes/mm-fix-Commited_AS-underflow-on-large-NR_CPUS
Changed checkfs to auto reboot after correctable fsck fixes.
[people/pmueller/ipfire-2.x.git] / src / patches / suse-2.6.27.25 / patches.fixes / mm-fix-Commited_AS-underflow-on-large-NR_CPUS
1 From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
2 Subject: mm: fix Committed_AS underflow on large NR_CPUS environment
3 Patch-mainline: 2.6.30-rc5
4 Git-commit: 00a62ce91e554198ef28234c91c36f850f5a3bc9
5 References: bnc#505831
6
7 mm: fix Committed_AS underflow on large NR_CPUS environment
8
9 The Committed_AS field can underflow in certain situations:
10
11 > # while true; do cat /proc/meminfo | grep _AS; sleep 1; done | uniq -c
12 > 1 Committed_AS: 18446744073709323392 kB
13 > 11 Committed_AS: 18446744073709455488 kB
14 > 6 Committed_AS: 35136 kB
15 > 5 Committed_AS: 18446744073709454400 kB
16 > 7 Committed_AS: 35904 kB
17 > 3 Committed_AS: 18446744073709453248 kB
18 > 2 Committed_AS: 34752 kB
19 > 9 Committed_AS: 18446744073709453248 kB
20 > 8 Committed_AS: 34752 kB
21 > 3 Committed_AS: 18446744073709320960 kB
22 > 7 Committed_AS: 18446744073709454080 kB
23 > 3 Committed_AS: 18446744073709320960 kB
24 > 5 Committed_AS: 18446744073709454080 kB
25 > 6 Committed_AS: 18446744073709320960 kB
26
27 This happens because NR_CPUS can be greater than 1000 and
28 meminfo_proc_show() does not check for underflow.
29
30 But a calculation proportional to NR_CPUS isn't a good one. In
31 general, the likelihood of lock contention is proportional to the
32 number of online cpus, not the theoretical maximum cpus (NR_CPUS).
33
34 The current kernel has generic percpu-counter infrastructure; using it
35 is the right way. It simplifies the code, and
36 percpu_counter_read_positive() cannot produce an underflow.
37
38 Reported-by: Dave Hansen <dave@linux.vnet.ibm.com>
39 Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
40 Cc: Eric B Munson <ebmunson@us.ibm.com>
41 Cc: Mel Gorman <mel@csn.ul.ie>
42 Cc: Christoph Lameter <cl@linux-foundation.org>
43 Cc: <stable@kernel.org> [All kernel versions]
44 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
45 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
46 Backported-by: Jeff Mahoney <jeffm@suse.com>
47 Signed-off-by: Jeff Mahoney <jeffm@suse.com>
48 ---
49 fs/proc/proc_misc.c | 2 +-
50 include/linux/mman.h | 9 +++------
51 kernel/fork.c | 2 ++
52 mm/mmap.c | 8 ++------
53 mm/nommu.c | 9 +++------
54 mm/swap.c | 46 ----------------------------------------------
55 6 files changed, 11 insertions(+), 65 deletions(-)
56
57 --- a/fs/proc/proc_misc.c
58 +++ b/fs/proc/proc_misc.c
59 @@ -145,7 +145,7 @@ static int meminfo_read_proc(char *page,
60 #define K(x) ((x) << (PAGE_SHIFT - 10))
61 si_meminfo(&i);
62 si_swapinfo(&i);
63 - committed = atomic_long_read(&vm_committed_space);
64 + committed = percpu_counter_read_positive(&vm_committed_as);
65 allowed = ((totalram_pages - hugetlb_total_pages())
66 * sysctl_overcommit_ratio / 100) + total_swap_pages;
67
68 --- a/include/linux/mman.h
69 +++ b/include/linux/mman.h
70 @@ -12,21 +12,18 @@
71
72 #ifdef __KERNEL__
73 #include <linux/mm.h>
74 +#include <linux/percpu_counter.h>
75
76 #include <asm/atomic.h>
77
78 extern int sysctl_overcommit_memory;
79 extern int sysctl_overcommit_ratio;
80 -extern atomic_long_t vm_committed_space;
81 +extern struct percpu_counter vm_committed_as;
82
83 -#ifdef CONFIG_SMP
84 -extern void vm_acct_memory(long pages);
85 -#else
86 static inline void vm_acct_memory(long pages)
87 {
88 - atomic_long_add(pages, &vm_committed_space);
89 + percpu_counter_add(&vm_committed_as, pages);
90 }
91 -#endif
92
93 static inline void vm_unacct_memory(long pages)
94 {
95 --- a/kernel/fork.c
96 +++ b/kernel/fork.c
97 @@ -1442,6 +1442,8 @@ void __init proc_caches_init(void)
98 mm_cachep = kmem_cache_create("mm_struct",
99 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
100 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
101 + if (percpu_counter_init(&vm_committed_as, 0))
102 + panic("Failed to allocate vm_committed_as");
103 }
104
105 /*
106 --- a/mm/mmap.c
107 +++ b/mm/mmap.c
108 @@ -84,7 +84,7 @@ EXPORT_SYMBOL(vm_get_page_prot);
109 int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
110 int sysctl_overcommit_ratio = 50; /* default is 50% */
111 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
112 -atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
113 +struct percpu_counter vm_committed_as;
114 int heap_stack_gap __read_mostly = 1;
115
116 /*
117 @@ -178,11 +178,7 @@ int __vm_enough_memory(struct mm_struct
118 leave 3% of the size of this process for other processes */
119 allowed -= mm->total_vm / 32;
120
121 - /*
122 - * cast `allowed' as a signed long because vm_committed_space
123 - * sometimes has a negative value
124 - */
125 - if (atomic_long_read(&vm_committed_space) < (long)allowed)
126 + if (percpu_counter_read_positive(&vm_committed_as) < allowed)
127 return 0;
128 error:
129 vm_unacct_memory(pages);
130 --- a/mm/nommu.c
131 +++ b/mm/nommu.c
132 @@ -39,7 +39,7 @@ struct page *mem_map;
133 unsigned long max_mapnr;
134 unsigned long num_physpages;
135 unsigned long askedalloc, realalloc;
136 -atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
137 +struct percpu_counter vm_committed_as;
138 int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
139 int sysctl_overcommit_ratio = 50; /* default is 50% */
140 int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
141 @@ -1434,12 +1434,9 @@ int __vm_enough_memory(struct mm_struct
142 leave 3% of the size of this process for other processes */
143 allowed -= current->mm->total_vm / 32;
144
145 - /*
146 - * cast `allowed' as a signed long because vm_committed_space
147 - * sometimes has a negative value
148 - */
149 - if (atomic_long_read(&vm_committed_space) < (long)allowed)
150 + if (percpu_counter_read_positive(&vm_committed_as) < allowed)
151 return 0;
152 +
153 error:
154 vm_unacct_memory(pages);
155
156 --- a/mm/swap.c
157 +++ b/mm/swap.c
158 @@ -474,49 +474,6 @@ unsigned pagevec_lookup_tag(struct pagev
159
160 EXPORT_SYMBOL(pagevec_lookup_tag);
161
162 -#ifdef CONFIG_SMP
163 -/*
164 - * We tolerate a little inaccuracy to avoid ping-ponging the counter between
165 - * CPUs
166 - */
167 -#define ACCT_THRESHOLD max(16, NR_CPUS * 2)
168 -
169 -static DEFINE_PER_CPU(long, committed_space);
170 -
171 -void vm_acct_memory(long pages)
172 -{
173 - long *local;
174 -
175 - preempt_disable();
176 - local = &__get_cpu_var(committed_space);
177 - *local += pages;
178 - if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
179 - atomic_long_add(*local, &vm_committed_space);
180 - *local = 0;
181 - }
182 - preempt_enable();
183 -}
184 -
185 -#ifdef CONFIG_HOTPLUG_CPU
186 -
187 -/* Drop the CPU's cached committed space back into the central pool. */
188 -static int cpu_swap_callback(struct notifier_block *nfb,
189 - unsigned long action,
190 - void *hcpu)
191 -{
192 - long *committed;
193 -
194 - committed = &per_cpu(committed_space, (long)hcpu);
195 - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
196 - atomic_long_add(*committed, &vm_committed_space);
197 - *committed = 0;
198 - drain_cpu_pagevecs((long)hcpu);
199 - }
200 - return NOTIFY_OK;
201 -}
202 -#endif /* CONFIG_HOTPLUG_CPU */
203 -#endif /* CONFIG_SMP */
204 -
205 /*
206 * Perform any setup for the swap system
207 */
208 @@ -537,7 +494,4 @@ void __init swap_setup(void)
209 * Right now other parts of the system means that we
210 * _really_ don't want to cluster much more
211 */
212 -#ifdef CONFIG_HOTPLUG_CPU
213 - hotcpu_notifier(cpu_swap_callback, 0);
214 -#endif
215 }