+From stable-bounces@linux.kernel.org Fri Mar 28 11:47:42 2008
+From: snitzer@gmail.com
+To: stable@kernel.org
+Date: Fri, 28 Mar 2008 14:42:43 -0400
+Message-Id: <1206729763-22578-3-git-send-email-snitzer@gmail.com>
+Cc: akpm@linux-foundation.org, torvalds@linux-foundation.org, dada1@cosmosbay.com
+Subject: alloc_percpu() fails to allocate percpu data
+
+From: Eric Dumazet <dada1@cosmosbay.com>
+
+upstream commit: be852795e1c8d3829ddf3cb1ce806113611fa555
+
+Some oprofile results obtained while using tbench on a 2x2 cpu machine were
+very surprising.
+
+For example, the loopback_xmit() function was using a high number of cpu
+cycles to perform the statistics updates, which are supposed to be really
+cheap since they use percpu data:
+
+ pcpu_lstats = netdev_priv(dev);
+ lb_stats = per_cpu_ptr(pcpu_lstats, smp_processor_id());
+ lb_stats->packets++; /* HERE : serious contention */
+ lb_stats->bytes += skb->len;
+
+struct pcpu_lstats is a small structure containing two longs. It appears
+that on my 32-bit platform, alloc_percpu(8) allocates a single cache line,
+instead of giving each cpu a separate cache line.
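+
+As an aside (not part of this patch), here is a minimal userspace sketch of
+the same false-sharing effect, with two threads standing in for two cpus
+whose stats landed on one cache line (compile with -pthread):
+
+ #include <pthread.h>
+ #include <stdio.h>
+
+ struct lstats { long packets; long bytes; };
+
+ /* both "per-cpu" slots fit in one 64-byte cache line: false sharing */
+ static struct lstats stats[2];
+
+ static void *worker(void *arg)
+ {
+ 	struct lstats *s = &stats[(long)arg];
+ 	long i;
+
+ 	for (i = 0; i < 100000000; i++)
+ 		s->packets++;	/* the line ping-pongs between cpus */
+ 	return NULL;
+ }
+
+ int main(void)
+ {
+ 	pthread_t t[2];
+ 	long i;
+
+ 	for (i = 0; i < 2; i++)
+ 		pthread_create(&t[i], NULL, worker, (void *)i);
+ 	for (i = 0; i < 2; i++)
+ 		pthread_join(t[i], NULL);
+ 	printf("%ld %ld\n", stats[0].packets, stats[1].packets);
+ 	return 0;
+ }
+
+Padding each element out to cache_line_size(), as the patch below does, puts
+the two counters on separate lines and the contention disappears.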
+
+Using the following patch gave me an impressive boost in various benchmarks
+(6% in tbench).
+(All percpu_counters hit this bug too.)
+
+The long-term fix (ie >= 2.6.26) would be to let each CPU allocate its own
+block of memory, so that we don't need to round up sizes to L1_CACHE_BYTES,
+or to merge the SGI stuff, of course...
+
+Note: SLUB vs SLAB is important here to *show* the improvement, since they
+don't have the same minimum allocation sizes (8 bytes vs 32 bytes). This
+could very well explain the regressions some people reported when they
+switched to SLUB.
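+
+To make the size arithmetic concrete (illustrative only, assuming 64-byte
+cache lines):
+
+ /*
+  * alloc_percpu(8) before this patch:
+  *   SLUB: kmalloc(8) returns an 8-byte object, so up to 8 cpus'
+  *         counters can pack into a single 64-byte cache line.
+  *   SLAB: the smallest kmalloc size class is 32 bytes, so only
+  *         2 cpus share a line, which partly hides the bug.
+  *
+  * after this patch (either allocator):
+  *   size = roundup(8, cache_line_size()) = 64, so each cpu's
+  *   counters get a private cache line.
+  */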
+
+Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
+Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Chris Wright <chrisw@sous-sol.org>
+---
+ mm/allocpercpu.c | 15 ++++++++++++++-
+ 1 file changed, 14 insertions(+), 1 deletion(-)
+
+This is appropriate for 2.6.24.y
+
+--- a/mm/allocpercpu.c
++++ b/mm/allocpercpu.c
+@@ -6,6 +6,10 @@
+ #include <linux/mm.h>
+ #include <linux/module.h>
+
++#ifndef cache_line_size
++#define cache_line_size() L1_CACHE_BYTES
++#endif
++
+ /**
+ * percpu_depopulate - depopulate per-cpu data for given cpu
+ * @__pdata: per-cpu data to depopulate
+@@ -52,6 +56,11 @@ void *percpu_populate(void *__pdata, siz
+ struct percpu_data *pdata = __percpu_disguise(__pdata);
+ int node = cpu_to_node(cpu);
+
++ /*
++ * We should make sure each CPU gets private memory.
++ */
++ size = roundup(size, cache_line_size());
++
+ BUG_ON(pdata->ptrs[cpu]);
+ if (node_online(node))
+ pdata->ptrs[cpu] = kmalloc_node(size, gfp|__GFP_ZERO, node);
+@@ -98,7 +107,11 @@ EXPORT_SYMBOL_GPL(__percpu_populate_mask
+ */
+ void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
+ {
+- void *pdata = kzalloc(nr_cpu_ids * sizeof(void *), gfp);
++ /*
++ * We allocate whole cache lines to avoid false sharing
++ */
++ size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size());
++ void *pdata = kzalloc(sz, gfp);
+ void *__pdata = __percpu_disguise(pdata);
+
+ if (unlikely(!pdata))
+From stable-bounces@linux.kernel.org Fri Mar 28 11:45:35 2008
+From: snitzer@gmail.com
+To: stable@kernel.org
+Date: Fri, 28 Mar 2008 14:42:42 -0400
+Message-Id: <1206729763-22578-2-git-send-email-snitzer@gmail.com>
+Cc: akpm@linux-foundation.org, clameter@sgi.com, torvalds@linux-foundation.org, davem@davemloft.net, dada1@cosmosbay.com
+Subject: PERCPU : __percpu_alloc_mask() can dynamically size percpu_data storage
+
+From: Eric Dumazet <dada1@cosmosbay.com>
+
+upstream commit: b3242151906372f30f57feaa43b4cac96a23edb1
+
+Instead of allocating a fixed-size array of NR_CPUS pointers for percpu_data,
+we can use nr_cpu_ids, which is generally < NR_CPUS.
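+
+To see the saving, a worked example (numbers are illustrative): a 32-bit
+kernel built with NR_CPUS=64 booted on a 4-way box, so nr_cpu_ids == 4:
+
+ /*
+  *   before: kzalloc(sizeof(struct percpu_data)) = 64 * 4 = 256 bytes
+  *   after:  kzalloc(nr_cpu_ids * sizeof(void *)) =  4 * 4 =  16 bytes
+  *
+  * pdata->ptrs[cpu] stays valid for cpu < nr_cpu_ids, even though the
+  * struct now declares only ptrs[1].
+  */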
+
+Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
+Cc: Christoph Lameter <clameter@sgi.com>
+Cc: "David S. Miller" <davem@davemloft.net>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Chris Wright <chrisw@sous-sol.org>
+---
+ include/linux/percpu.h | 2 +-
+ mm/allocpercpu.c | 2 +-
+ 2 files changed, 2 insertions(+), 2 deletions(-)
+
+This is appropriate for 2.6.24.y
+
+--- a/include/linux/percpu.h
++++ b/include/linux/percpu.h
+@@ -34,7 +34,7 @@
+ #ifdef CONFIG_SMP
+
+ struct percpu_data {
+- void *ptrs[NR_CPUS];
++ void *ptrs[1];
+ };
+
+ #define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata)
+--- a/mm/allocpercpu.c
++++ b/mm/allocpercpu.c
+@@ -98,7 +98,7 @@ EXPORT_SYMBOL_GPL(__percpu_populate_mask
+ */
+ void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
+ {
+- void *pdata = kzalloc(sizeof(struct percpu_data), gfp);
++ void *pdata = kzalloc(nr_cpu_ids * sizeof(void *), gfp);
+ void *__pdata = __percpu_disguise(pdata);
+
+ if (unlikely(!pdata))