]> git.ipfire.org Git - people/pmueller/ipfire-2.x.git/blame - src/patches/suse-2.6.27.25/patches.fixes/mm-fix-Commited_AS-underflow-on-large-NR_CPUS
Changed checkfs to auto reboot after correctable fsck fixes.
[people/pmueller/ipfire-2.x.git] / src / patches / suse-2.6.27.25 / patches.fixes / mm-fix-Commited_AS-underflow-on-large-NR_CPUS
CommitLineData
00e5a55c
BS
1From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
2Subject: mm: fix Committed_AS underflow on large NR_CPUS environment
3Patch-mainline: 2.6.30-rc5
4Git-commit: 00a62ce91e554198ef28234c91c36f850f5a3bc9
5References: bnc#505831
6
7mm: fix Committed_AS underflow on large NR_CPUS environment
8
9The Committed_AS field can underflow in certain situations:
10
11> # while true; do cat /proc/meminfo | grep _AS; sleep 1; done | uniq -c
12> 1 Committed_AS: 18446744073709323392 kB
13> 11 Committed_AS: 18446744073709455488 kB
14> 6 Committed_AS: 35136 kB
15> 5 Committed_AS: 18446744073709454400 kB
16> 7 Committed_AS: 35904 kB
17> 3 Committed_AS: 18446744073709453248 kB
18> 2 Committed_AS: 34752 kB
19> 9 Committed_AS: 18446744073709453248 kB
20> 8 Committed_AS: 34752 kB
21> 3 Committed_AS: 18446744073709320960 kB
22> 7 Committed_AS: 18446744073709454080 kB
23> 3 Committed_AS: 18446744073709320960 kB
24> 5 Committed_AS: 18446744073709454080 kB
25> 6 Committed_AS: 18446744073709320960 kB
26
27Because NR_CPUS can be greater than 1000 and meminfo_proc_show() does
28not check for underflow.
29
30But scaling by NR_CPUS is not a good calculation. In general, the
31possibility of lock contention is proportional to the number of online
32cpus, not the theoretical maximum number of cpus (NR_CPUS).
33
34The current kernel has generic percpu-counter infrastructure; using it is
35the right way. It simplifies the code, and percpu_counter_read_positive()
36does not suffer from the underflow issue.
37
38Reported-by: Dave Hansen <dave@linux.vnet.ibm.com>
39Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
40Cc: Eric B Munson <ebmunson@us.ibm.com>
41Cc: Mel Gorman <mel@csn.ul.ie>
42Cc: Christoph Lameter <cl@linux-foundation.org>
43Cc: <stable@kernel.org> [All kernel versions]
44Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
45Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
46Backported-by: Jeff Mahoney <jeffm@suse.com>
47Signed-off-by: Jeff Mahoney <jeffm@suse.com>
48---
49 fs/proc/proc_misc.c | 2 +-
50 include/linux/mman.h | 9 +++------
51 kernel/fork.c | 2 ++
52 mm/mmap.c | 8 ++------
53 mm/nommu.c | 9 +++------
54 mm/swap.c | 46 ----------------------------------------------
55 6 files changed, 11 insertions(+), 65 deletions(-)
56
57--- a/fs/proc/proc_misc.c
58+++ b/fs/proc/proc_misc.c
59@@ -145,7 +145,7 @@ static int meminfo_read_proc(char *page,
60 #define K(x) ((x) << (PAGE_SHIFT - 10))
61 si_meminfo(&i);
62 si_swapinfo(&i);
63- committed = atomic_long_read(&vm_committed_space);
64+ committed = percpu_counter_read_positive(&vm_committed_as);
65 allowed = ((totalram_pages - hugetlb_total_pages())
66 * sysctl_overcommit_ratio / 100) + total_swap_pages;
67
68--- a/include/linux/mman.h
69+++ b/include/linux/mman.h
70@@ -12,21 +12,18 @@
71
72 #ifdef __KERNEL__
73 #include <linux/mm.h>
74+#include <linux/percpu_counter.h>
75
76 #include <asm/atomic.h>
77
78 extern int sysctl_overcommit_memory;
79 extern int sysctl_overcommit_ratio;
80-extern atomic_long_t vm_committed_space;
81+extern struct percpu_counter vm_committed_as;
82
83-#ifdef CONFIG_SMP
84-extern void vm_acct_memory(long pages);
85-#else
86 static inline void vm_acct_memory(long pages)
87 {
88- atomic_long_add(pages, &vm_committed_space);
89+ percpu_counter_add(&vm_committed_as, pages);
90 }
91-#endif
92
93 static inline void vm_unacct_memory(long pages)
94 {
95--- a/kernel/fork.c
96+++ b/kernel/fork.c
97@@ -1442,6 +1442,8 @@ void __init proc_caches_init(void)
98 mm_cachep = kmem_cache_create("mm_struct",
99 sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
100 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
101+ if (percpu_counter_init(&vm_committed_as, 0))
102+ panic("Failed to allocate vm_committed_as");
103 }
104
105 /*
106--- a/mm/mmap.c
107+++ b/mm/mmap.c
108@@ -84,7 +84,7 @@ EXPORT_SYMBOL(vm_get_page_prot);
109 int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
110 int sysctl_overcommit_ratio = 50; /* default is 50% */
111 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
112-atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
113+struct percpu_counter vm_committed_as;
114 int heap_stack_gap __read_mostly = 1;
115
116 /*
117@@ -178,11 +178,7 @@ int __vm_enough_memory(struct mm_struct
118 leave 3% of the size of this process for other processes */
119 allowed -= mm->total_vm / 32;
120
121- /*
122- * cast `allowed' as a signed long because vm_committed_space
123- * sometimes has a negative value
124- */
125- if (atomic_long_read(&vm_committed_space) < (long)allowed)
126+ if (percpu_counter_read_positive(&vm_committed_as) < allowed)
127 return 0;
128 error:
129 vm_unacct_memory(pages);
130--- a/mm/nommu.c
131+++ b/mm/nommu.c
132@@ -39,7 +39,7 @@ struct page *mem_map;
133 unsigned long max_mapnr;
134 unsigned long num_physpages;
135 unsigned long askedalloc, realalloc;
136-atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
137+struct percpu_counter vm_committed_as;
138 int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
139 int sysctl_overcommit_ratio = 50; /* default is 50% */
140 int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
141@@ -1434,12 +1434,9 @@ int __vm_enough_memory(struct mm_struct
142 leave 3% of the size of this process for other processes */
143 allowed -= current->mm->total_vm / 32;
144
145- /*
146- * cast `allowed' as a signed long because vm_committed_space
147- * sometimes has a negative value
148- */
149- if (atomic_long_read(&vm_committed_space) < (long)allowed)
150+ if (percpu_counter_read_positive(&vm_committed_as) < allowed)
151 return 0;
152+
153 error:
154 vm_unacct_memory(pages);
155
156--- a/mm/swap.c
157+++ b/mm/swap.c
158@@ -474,49 +474,6 @@ unsigned pagevec_lookup_tag(struct pagev
159
160 EXPORT_SYMBOL(pagevec_lookup_tag);
161
162-#ifdef CONFIG_SMP
163-/*
164- * We tolerate a little inaccuracy to avoid ping-ponging the counter between
165- * CPUs
166- */
167-#define ACCT_THRESHOLD max(16, NR_CPUS * 2)
168-
169-static DEFINE_PER_CPU(long, committed_space);
170-
171-void vm_acct_memory(long pages)
172-{
173- long *local;
174-
175- preempt_disable();
176- local = &__get_cpu_var(committed_space);
177- *local += pages;
178- if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
179- atomic_long_add(*local, &vm_committed_space);
180- *local = 0;
181- }
182- preempt_enable();
183-}
184-
185-#ifdef CONFIG_HOTPLUG_CPU
186-
187-/* Drop the CPU's cached committed space back into the central pool. */
188-static int cpu_swap_callback(struct notifier_block *nfb,
189- unsigned long action,
190- void *hcpu)
191-{
192- long *committed;
193-
194- committed = &per_cpu(committed_space, (long)hcpu);
195- if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
196- atomic_long_add(*committed, &vm_committed_space);
197- *committed = 0;
198- drain_cpu_pagevecs((long)hcpu);
199- }
200- return NOTIFY_OK;
201-}
202-#endif /* CONFIG_HOTPLUG_CPU */
203-#endif /* CONFIG_SMP */
204-
205 /*
206 * Perform any setup for the swap system
207 */
208@@ -537,7 +494,4 @@ void __init swap_setup(void)
209 * Right now other parts of the system means that we
210 * _really_ don't want to cluster much more
211 */
212-#ifdef CONFIG_HOTPLUG_CPU
213- hotcpu_notifier(cpu_swap_callback, 0);
214-#endif
215 }