1 From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
2 Subject: mm: fix Committed_AS underflow on large NR_CPUS environment
3 Patch-mainline: 2.6.30-rc5
4 Git-commit: 00a62ce91e554198ef28234c91c36f850f5a3bc9
7 mm: fix Committed_AS underflow on large NR_CPUS environment
9 The Committed_AS field can underflow in certain situations:
11 > # while true; do cat /proc/meminfo | grep _AS; sleep 1; done | uniq -c
12 > 1 Committed_AS: 18446744073709323392 kB
13 > 11 Committed_AS: 18446744073709455488 kB
14 > 6 Committed_AS: 35136 kB
15 > 5 Committed_AS: 18446744073709454400 kB
16 > 7 Committed_AS: 35904 kB
17 > 3 Committed_AS: 18446744073709453248 kB
18 > 2 Committed_AS: 34752 kB
19 > 9 Committed_AS: 18446744073709453248 kB
20 > 8 Committed_AS: 34752 kB
21 > 3 Committed_AS: 18446744073709320960 kB
22 > 7 Committed_AS: 18446744073709454080 kB
23 > 3 Committed_AS: 18446744073709320960 kB
24 > 5 Committed_AS: 18446744073709454080 kB
25 > 6 Committed_AS: 18446744073709320960 kB
27 This happens because NR_CPUS can be greater than 1000 and
28 meminfo_proc_show() does not check for underflow.
30 But a calculation proportional to NR_CPUS isn't a good one. In general,
31 the possibility of lock contention is proportional to the number of online
32 cpus, not the theoretical maximum number of cpus (NR_CPUS).
34 The current kernel has the generic percpu-counter infrastructure; using it
35 is the right way. It simplifies the code, and percpu_counter_read_positive()
doesn't produce an underflowed value.
38 Reported-by: Dave Hansen <dave@linux.vnet.ibm.com>
39 Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
40 Cc: Eric B Munson <ebmunson@us.ibm.com>
41 Cc: Mel Gorman <mel@csn.ul.ie>
42 Cc: Christoph Lameter <cl@linux-foundation.org>
43 Cc: <stable@kernel.org> [All kernel versions]
44 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
45 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
46 Backported-by: Jeff Mahoney <jeffm@suse.com>
47 Signed-off-by: Jeff Mahoney <jeffm@suse.com>
49 fs/proc/proc_misc.c | 2 +-
50 include/linux/mman.h | 9 +++------
51 kernel/fork.c | 2 ++
52 mm/mmap.c | 8 ++------
53 mm/nommu.c | 9 +++------
54 mm/swap.c | 46 ----------------------------------------------
55 6 files changed, 11 insertions(+), 65 deletions(-)
57 --- a/fs/proc/proc_misc.c
58 +++ b/fs/proc/proc_misc.c
59 @@ -145,7 +145,7 @@ static int meminfo_read_proc(char *page,
60 #define K(x) ((x) << (PAGE_SHIFT - 10))
63 - committed = atomic_long_read(&vm_committed_space);
64 + committed = percpu_counter_read_positive(&vm_committed_as);
65 allowed = ((totalram_pages - hugetlb_total_pages())
66 * sysctl_overcommit_ratio / 100) + total_swap_pages;
68 --- a/include/linux/mman.h
69 +++ b/include/linux/mman.h
74 +#include <linux/percpu_counter.h>
76 #include <asm/atomic.h>
78 extern int sysctl_overcommit_memory;
79 extern int sysctl_overcommit_ratio;
80 -extern atomic_long_t vm_committed_space;
81 +extern struct percpu_counter vm_committed_as;
84 -extern void vm_acct_memory(long pages);
86 static inline void vm_acct_memory(long pages)
88 - atomic_long_add(pages, &vm_committed_space);
89 + percpu_counter_add(&vm_committed_as, pages);
93 static inline void vm_unacct_memory(long pages)
97 @@ -1442,6 +1442,8 @@ void __init proc_caches_init(void)
98 mm_cachep = kmem_cache_create("mm_struct",
99 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
100 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
101 + if (percpu_counter_init(&vm_committed_as, 0))
102 + panic("Failed to allocate vm_committed_as");
108 @@ -84,7 +84,7 @@ EXPORT_SYMBOL(vm_get_page_prot);
109 int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
110 int sysctl_overcommit_ratio = 50; /* default is 50% */
111 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
112 -atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
113 +struct percpu_counter vm_committed_as;
114 int heap_stack_gap __read_mostly = 1;
117 @@ -178,11 +178,7 @@ int __vm_enough_memory(struct mm_struct
118 leave 3% of the size of this process for other processes */
119 allowed -= mm->total_vm / 32;
122 - * cast `allowed' as a signed long because vm_committed_space
123 - * sometimes has a negative value
125 - if (atomic_long_read(&vm_committed_space) < (long)allowed)
126 + if (percpu_counter_read_positive(&vm_committed_as) < allowed)
129 vm_unacct_memory(pages);
132 @@ -39,7 +39,7 @@ struct page *mem_map;
133 unsigned long max_mapnr;
134 unsigned long num_physpages;
135 unsigned long askedalloc, realalloc;
136 -atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
137 +struct percpu_counter vm_committed_as;
138 int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
139 int sysctl_overcommit_ratio = 50; /* default is 50% */
140 int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
141 @@ -1434,12 +1434,9 @@ int __vm_enough_memory(struct mm_struct
142 leave 3% of the size of this process for other processes */
143 allowed -= current->mm->total_vm / 32;
146 - * cast `allowed' as a signed long because vm_committed_space
147 - * sometimes has a negative value
149 - if (atomic_long_read(&vm_committed_space) < (long)allowed)
150 + if (percpu_counter_read_positive(&vm_committed_as) < allowed)
154 vm_unacct_memory(pages);
158 @@ -474,49 +474,6 @@ unsigned pagevec_lookup_tag(struct pagev
160 EXPORT_SYMBOL(pagevec_lookup_tag);
164 - * We tolerate a little inaccuracy to avoid ping-ponging the counter between
167 -#define ACCT_THRESHOLD max(16, NR_CPUS * 2)
169 -static DEFINE_PER_CPU(long, committed_space);
171 -void vm_acct_memory(long pages)
176 - local = &__get_cpu_var(committed_space);
178 - if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
179 - atomic_long_add(*local, &vm_committed_space);
185 -#ifdef CONFIG_HOTPLUG_CPU
187 -/* Drop the CPU's cached committed space back into the central pool. */
188 -static int cpu_swap_callback(struct notifier_block *nfb,
189 - unsigned long action,
194 - committed = &per_cpu(committed_space, (long)hcpu);
195 - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
196 - atomic_long_add(*committed, &vm_committed_space);
198 - drain_cpu_pagevecs((long)hcpu);
202 -#endif /* CONFIG_HOTPLUG_CPU */
203 -#endif /* CONFIG_SMP */
206 * Perform any setup for the swap system
208 @@ -537,7 +494,4 @@ void __init swap_setup(void)
209 * Right now other parts of the system means that we
210 * _really_ don't want to cluster much more
212 -#ifdef CONFIG_HOTPLUG_CPU
213 - hotcpu_notifier(cpu_swap_callback, 0);