]>
Commit | Line | Data |
---|---|---|
00e5a55c BS |
1 | From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> |
2 | Subject: mm: fix Committed_AS underflow on large NR_CPUS environment | |
3 | Patch-mainline: 2.6.30-rc5 | |
4 | Git-commit: 00a62ce91e554198ef28234c91c36f850f5a3bc9 | |
5 | References: bnc#505831 | |
6 | ||
7 | mm: fix Committed_AS underflow on large NR_CPUS environment | |
8 | ||
9 | The Committed_AS field can underflow in certain situations: | |
10 | ||
11 | > # while true; do cat /proc/meminfo | grep _AS; sleep 1; done | uniq -c | |
12 | > 1 Committed_AS: 18446744073709323392 kB | |
13 | > 11 Committed_AS: 18446744073709455488 kB | |
14 | > 6 Committed_AS: 35136 kB | |
15 | > 5 Committed_AS: 18446744073709454400 kB | |
16 | > 7 Committed_AS: 35904 kB | |
17 | > 3 Committed_AS: 18446744073709453248 kB | |
18 | > 2 Committed_AS: 34752 kB | |
19 | > 9 Committed_AS: 18446744073709453248 kB | |
20 | > 8 Committed_AS: 34752 kB | |
21 | > 3 Committed_AS: 18446744073709320960 kB | |
22 | > 7 Committed_AS: 18446744073709454080 kB | |
23 | > 3 Committed_AS: 18446744073709320960 kB | |
24 | > 5 Committed_AS: 18446744073709454080 kB | |
25 | > 6 Committed_AS: 18446744073709320960 kB | |
26 | ||
27 | Because NR_CPUS can be greater than 1000 and meminfo_proc_show() does | |
28 | not check for underflow. | |
29 | ||
30 | But an NR_CPUS-proportional calculation isn't good. In general, the | |
31 | possibility of lock contention is proportional to the number of online | |
32 | cpus, not the theoretical maximum number of cpus (NR_CPUS). | |
33 | ||
34 | The current kernel has generic percpu-counter infrastructure; using | |
35 | it is the right way. It simplifies the code, and | |
36 | percpu_counter_read_positive() avoids the underflow issue. | |
37 | ||
38 | Reported-by: Dave Hansen <dave@linux.vnet.ibm.com> | |
39 | Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> | |
40 | Cc: Eric B Munson <ebmunson@us.ibm.com> | |
41 | Cc: Mel Gorman <mel@csn.ul.ie> | |
42 | Cc: Christoph Lameter <cl@linux-foundation.org> | |
43 | Cc: <stable@kernel.org> [All kernel versions] | |
44 | Signed-off-by: Andrew Morton <akpm@linux-foundation.org> | |
45 | Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> | |
46 | Backported-by: Jeff Mahoney <jeffm@suse.com> | |
47 | Signed-off-by: Jeff Mahoney <jeffm@suse.com> | |
48 | --- | |
49 | fs/proc/proc_misc.c | 2 +- | |
50 | include/linux/mman.h | 9 +++------ | |
51 | kernel/fork.c | 2 ++ | |
52 | mm/mmap.c | 8 ++------ | |
53 | mm/nommu.c | 9 +++------ | |
54 | mm/swap.c | 46 ---------------------------------------------- | |
55 | 6 files changed, 11 insertions(+), 65 deletions(-) | |
56 | ||
57 | --- a/fs/proc/proc_misc.c | |
58 | +++ b/fs/proc/proc_misc.c | |
59 | @@ -145,7 +145,7 @@ static int meminfo_read_proc(char *page, | |
60 | #define K(x) ((x) << (PAGE_SHIFT - 10)) | |
61 | si_meminfo(&i); | |
62 | si_swapinfo(&i); | |
63 | - committed = atomic_long_read(&vm_committed_space); | |
64 | + committed = percpu_counter_read_positive(&vm_committed_as); | |
65 | allowed = ((totalram_pages - hugetlb_total_pages()) | |
66 | * sysctl_overcommit_ratio / 100) + total_swap_pages; | |
67 | ||
68 | --- a/include/linux/mman.h | |
69 | +++ b/include/linux/mman.h | |
70 | @@ -12,21 +12,18 @@ | |
71 | ||
72 | #ifdef __KERNEL__ | |
73 | #include <linux/mm.h> | |
74 | +#include <linux/percpu_counter.h> | |
75 | ||
76 | #include <asm/atomic.h> | |
77 | ||
78 | extern int sysctl_overcommit_memory; | |
79 | extern int sysctl_overcommit_ratio; | |
80 | -extern atomic_long_t vm_committed_space; | |
81 | +extern struct percpu_counter vm_committed_as; | |
82 | ||
83 | -#ifdef CONFIG_SMP | |
84 | -extern void vm_acct_memory(long pages); | |
85 | -#else | |
86 | static inline void vm_acct_memory(long pages) | |
87 | { | |
88 | - atomic_long_add(pages, &vm_committed_space); | |
89 | + percpu_counter_add(&vm_committed_as, pages); | |
90 | } | |
91 | -#endif | |
92 | ||
93 | static inline void vm_unacct_memory(long pages) | |
94 | { | |
95 | --- a/kernel/fork.c | |
96 | +++ b/kernel/fork.c | |
97 | @@ -1442,6 +1442,8 @@ void __init proc_caches_init(void) | |
98 | mm_cachep = kmem_cache_create("mm_struct", | |
99 | sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, | |
100 | SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); | |
101 | + if (percpu_counter_init(&vm_committed_as, 0)) | |
102 | + panic("Failed to allocate vm_committed_as"); | |
103 | } | |
104 | ||
105 | /* | |
106 | --- a/mm/mmap.c | |
107 | +++ b/mm/mmap.c | |
108 | @@ -84,7 +84,7 @@ EXPORT_SYMBOL(vm_get_page_prot); | |
109 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ | |
110 | int sysctl_overcommit_ratio = 50; /* default is 50% */ | |
111 | int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; | |
112 | -atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); | |
113 | +struct percpu_counter vm_committed_as; | |
114 | int heap_stack_gap __read_mostly = 1; | |
115 | ||
116 | /* | |
117 | @@ -178,11 +178,7 @@ int __vm_enough_memory(struct mm_struct | |
118 | leave 3% of the size of this process for other processes */ | |
119 | allowed -= mm->total_vm / 32; | |
120 | ||
121 | - /* | |
122 | - * cast `allowed' as a signed long because vm_committed_space | |
123 | - * sometimes has a negative value | |
124 | - */ | |
125 | - if (atomic_long_read(&vm_committed_space) < (long)allowed) | |
126 | + if (percpu_counter_read_positive(&vm_committed_as) < allowed) | |
127 | return 0; | |
128 | error: | |
129 | vm_unacct_memory(pages); | |
130 | --- a/mm/nommu.c | |
131 | +++ b/mm/nommu.c | |
132 | @@ -39,7 +39,7 @@ struct page *mem_map; | |
133 | unsigned long max_mapnr; | |
134 | unsigned long num_physpages; | |
135 | unsigned long askedalloc, realalloc; | |
136 | -atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); | |
137 | +struct percpu_counter vm_committed_as; | |
138 | int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ | |
139 | int sysctl_overcommit_ratio = 50; /* default is 50% */ | |
140 | int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; | |
141 | @@ -1434,12 +1434,9 @@ int __vm_enough_memory(struct mm_struct | |
142 | leave 3% of the size of this process for other processes */ | |
143 | allowed -= current->mm->total_vm / 32; | |
144 | ||
145 | - /* | |
146 | - * cast `allowed' as a signed long because vm_committed_space | |
147 | - * sometimes has a negative value | |
148 | - */ | |
149 | - if (atomic_long_read(&vm_committed_space) < (long)allowed) | |
150 | + if (percpu_counter_read_positive(&vm_committed_as) < allowed) | |
151 | return 0; | |
152 | + | |
153 | error: | |
154 | vm_unacct_memory(pages); | |
155 | ||
156 | --- a/mm/swap.c | |
157 | +++ b/mm/swap.c | |
158 | @@ -474,49 +474,6 @@ unsigned pagevec_lookup_tag(struct pagev | |
159 | ||
160 | EXPORT_SYMBOL(pagevec_lookup_tag); | |
161 | ||
162 | -#ifdef CONFIG_SMP | |
163 | -/* | |
164 | - * We tolerate a little inaccuracy to avoid ping-ponging the counter between | |
165 | - * CPUs | |
166 | - */ | |
167 | -#define ACCT_THRESHOLD max(16, NR_CPUS * 2) | |
168 | - | |
169 | -static DEFINE_PER_CPU(long, committed_space); | |
170 | - | |
171 | -void vm_acct_memory(long pages) | |
172 | -{ | |
173 | - long *local; | |
174 | - | |
175 | - preempt_disable(); | |
176 | - local = &__get_cpu_var(committed_space); | |
177 | - *local += pages; | |
178 | - if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) { | |
179 | - atomic_long_add(*local, &vm_committed_space); | |
180 | - *local = 0; | |
181 | - } | |
182 | - preempt_enable(); | |
183 | -} | |
184 | - | |
185 | -#ifdef CONFIG_HOTPLUG_CPU | |
186 | - | |
187 | -/* Drop the CPU's cached committed space back into the central pool. */ | |
188 | -static int cpu_swap_callback(struct notifier_block *nfb, | |
189 | - unsigned long action, | |
190 | - void *hcpu) | |
191 | -{ | |
192 | - long *committed; | |
193 | - | |
194 | - committed = &per_cpu(committed_space, (long)hcpu); | |
195 | - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { | |
196 | - atomic_long_add(*committed, &vm_committed_space); | |
197 | - *committed = 0; | |
198 | - drain_cpu_pagevecs((long)hcpu); | |
199 | - } | |
200 | - return NOTIFY_OK; | |
201 | -} | |
202 | -#endif /* CONFIG_HOTPLUG_CPU */ | |
203 | -#endif /* CONFIG_SMP */ | |
204 | - | |
205 | /* | |
206 | * Perform any setup for the swap system | |
207 | */ | |
208 | @@ -537,7 +494,4 @@ void __init swap_setup(void) | |
209 | * Right now other parts of the system means that we | |
210 | * _really_ don't want to cluster much more | |
211 | */ | |
212 | -#ifdef CONFIG_HOTPLUG_CPU | |
213 | - hotcpu_notifier(cpu_swap_callback, 0); | |
214 | -#endif | |
215 | } |