--- /dev/null
+From 89e8a244b97e48f1f30e898b6f32acca477f2a13 Mon Sep 17 00:00:00 2001
+From: David Rientjes <rientjes@google.com>
+Date: Wed, 2 Nov 2011 13:38:39 -0700
+Subject: cpusets: avoid looping when storing to mems_allowed if one node remains set
+
+From: David Rientjes <rientjes@google.com>
+
+commit 89e8a244b97e48f1f30e898b6f32acca477f2a13 upstream.
+
+Stable note: Not tracked in Bugzilla. [get|put]_mems_allowed() is
+ extremely expensive and severely impacted page allocator performance.
+ This is part of a series of patches that reduce page allocator
+ overhead.
+
+{get,put}_mems_allowed() exist so that general kernel code may locklessly
+access a task's set of allowable nodes without having the chance that a
+concurrent write will cause the nodemask to be empty on configurations
+where MAX_NUMNODES > BITS_PER_LONG.
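+
+For reference, a condensed sketch of the lockless read side being described
+(the wrapper function is purely illustrative and not taken from any one
+mainline caller; the calls themselves are the read-side API of this era):
+
+  struct page *alloc_within_mems_allowed(gfp_t gfp_mask, unsigned int order)
+  {
+          struct page *page;
+
+          get_mems_allowed();     /* pin current->mems_allowed for this read */
+          page = __alloc_pages_nodemask(gfp_mask, order,
+                                        node_zonelist(numa_node_id(), gfp_mask),
+                                        &current->mems_allowed);
+          put_mems_allowed();     /* a concurrent writer may now proceed */
+
+          return page;
+  }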
+
+This could incur a significant delay, however, especially in low memory
+conditions because the page allocator is blocking and reclaim requires
+get_mems_allowed() itself. It is not atypical to see writes to
+cpuset.mems take over 2 seconds to complete, for example. In low memory
+conditions, this is problematic because it's one of the most important
+times to change cpuset.mems in the first place!
+
+The only way a task's set of allowable nodes may change is through cpusets,
+by writing to cpuset.mems or by attaching the task to a different cpuset.
+The current behaviour is to first set all the new nodes, wait until generic
+code is no longer reading the nodemask with get_mems_allowed(), and only
+then clear all the old nodes. This prevents the possibility that a reader
+will see an empty nodemask at the same time the writer is storing a new
+nodemask.
+
+If at least one node remains unchanged, though, it's possible to simply
+set all new nodes and then clear all the old nodes without waiting for
+readers. Changing a task's nodemask is protected by cgroup_mutex, so two
+threads can never change the same task's nodemask at the same time; the
+nodemask is therefore guaranteed to be fully stored before another thread
+changes it and determines whether a node remains set or not.
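+
+As a worked illustration (example node numbers assumed here, not taken from
+the changelog): with BITS_PER_LONG == 64, old = {0} and new = {0, 65}, the
+writer's two stores are
+
+  nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
+                  /* mask = {0, 65}; a racing reader sees {0} or {0, 65} */
+  tsk->mems_allowed = *newmems;
+                  /* mask = {0, 65}; node 0 is never cleared, so no
+                     word-by-word snapshot of the mask can be empty */
+
+Only when the old and new masks are disjoint, for example old = {0} and
+new = {65}, can a reader that loads one word of the mask before the first
+store and the other word after the second store compute an empty mask, and
+only in that case does the writer still have to wait for readers.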
+
+Signed-off-by: David Rientjes <rientjes@google.com>
+Cc: Miao Xie <miaox@cn.fujitsu.com>
+Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Cc: Nick Piggin <npiggin@kernel.dk>
+Cc: Paul Menage <paul@paulmenage.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/cpuset.c | 9 ++++++---
+ 1 file changed, 6 insertions(+), 3 deletions(-)
+
+--- a/kernel/cpuset.c
++++ b/kernel/cpuset.c
+@@ -949,6 +949,8 @@ static void cpuset_migrate_mm(struct mm_
+ static void cpuset_change_task_nodemask(struct task_struct *tsk,
+ nodemask_t *newmems)
+ {
++ bool masks_disjoint = !nodes_intersects(*newmems, tsk->mems_allowed);
++
+ repeat:
+ /*
+ * Allow tasks that have access to memory reserves because they have
+@@ -963,7 +965,6 @@ repeat:
+ nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
+ mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
+
+-
+ /*
+ * ensure checking ->mems_allowed_change_disable after setting all new
+ * allowed nodes.
+@@ -980,9 +981,11 @@ repeat:
+
+ /*
+ * Allocation of memory is very fast, we needn't sleep when waiting
+- * for the read-side.
++ * for the read-side. No wait is necessary, however, if at least one
++ * node remains unchanged.
+ */
+- while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
++ while (masks_disjoint &&
++ ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
+ task_unlock(tsk);
+ if (!task_curr(tsk))
+ yield();
--- /dev/null
+From b246272ecc5ac68c743b15c9e41a2275f7ce70e2 Mon Sep 17 00:00:00 2001
+From: David Rientjes <rientjes@google.com>
+Date: Mon, 19 Dec 2011 17:11:52 -0800
+Subject: cpusets: stall when updating mems_allowed for mempolicy or disjoint nodemask
+
+From: David Rientjes <rientjes@google.com>
+
+commit b246272ecc5ac68c743b15c9e41a2275f7ce70e2 upstream.
+
+Stable note: Not tracked in Bugzilla. [get|put]_mems_allowed() is extremely
+ expensive and severely impacted page allocator performance. This is
+ part of a series of patches that reduce page allocator overhead.
+
+Kernels where MAX_NUMNODES > BITS_PER_LONG may temporarily see an empty
+nodemask in a tsk's mempolicy if its previous nodemask is remapped onto a
+new set of allowed cpuset nodes where the two nodemasks, as a result of
+the remap, are now disjoint.
+
+c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing
+cpuset's mems") adds get_mems_allowed() to prevent the set of allowed
+nodes from changing for a thread. This causes any update to a set of
+allowed nodes to stall until put_mems_allowed() is called.
+
+This stall is unnecessary, however, if at least one node remains unchanged
+in the update to the set of allowed nodes. This was addressed by
+89e8a244b97e ("cpusets: avoid looping when storing to mems_allowed if one
+node remains set"), but it's still possible that an empty nodemask may be
+read from a mempolicy because the old nodemask may be remapped to the new
+nodemask during rebind. To prevent this, only avoid the stall if there is
+no mempolicy for the thread being changed.
+
+This is a temporary solution until all reads from mempolicy nodemasks can
+be guaranteed to not be empty without the get_mems_allowed()
+synchronization.
+
+Also moves the check for nodemask intersection inside task_lock() so that
+tsk->mems_allowed cannot change. This ensures that nothing can set this
+tsk's mems_allowed out from under us and also protects tsk->mempolicy.
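+
+For reference, the writer path with both cpuset patches applied condenses to
+roughly the following sketch (the OOM-victim checks, the mpol_rebind_task()
+calls and the exact retry structure of cpuset_change_task_nodemask() are
+omitted or simplified here):
+
+  static void change_task_nodemask_sketch(struct task_struct *tsk,
+                                          nodemask_t *newmems)
+  {
+          bool need_loop;
+
+          task_lock(tsk);
+          /*
+           * Both tests are stable under task_lock(): nothing can change
+           * tsk->mems_allowed or tsk->mempolicy underneath us here.
+           */
+          need_loop = task_has_mempolicy(tsk) ||
+                      !nodes_intersects(*newmems, tsk->mems_allowed);
+
+          /* Add the new nodes first; a racing reader still sees the old ones. */
+          nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
+
+          /* Order the store of the new nodes before the reader check. */
+          smp_mb();
+
+          /* Wait for readers only if they could otherwise see an empty mask. */
+          while (need_loop &&
+                 ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
+                  task_unlock(tsk);
+                  if (!task_curr(tsk))
+                          yield();
+                  task_lock(tsk);
+          }
+
+          /* Finally drop the old nodes; readers now see exactly *newmems. */
+          tsk->mems_allowed = *newmems;
+          task_unlock(tsk);
+  }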
+
+Reported-by: Miao Xie <miaox@cn.fujitsu.com>
+Signed-off-by: David Rientjes <rientjes@google.com>
+Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Cc: Paul Menage <paul@paulmenage.org>
+Cc: Stephen Rothwell <sfr@canb.auug.org.au>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/cpuset.c | 29 ++++++++++++++++++++++++-----
+ 1 file changed, 24 insertions(+), 5 deletions(-)
+
+--- a/kernel/cpuset.c
++++ b/kernel/cpuset.c
+@@ -123,6 +123,19 @@ static inline struct cpuset *task_cs(str
+ struct cpuset, css);
+ }
+
++#ifdef CONFIG_NUMA
++static inline bool task_has_mempolicy(struct task_struct *task)
++{
++ return task->mempolicy;
++}
++#else
++static inline bool task_has_mempolicy(struct task_struct *task)
++{
++ return false;
++}
++#endif
++
++
+ /* bits in struct cpuset flags field */
+ typedef enum {
+ CS_CPU_EXCLUSIVE,
+@@ -949,7 +962,7 @@ static void cpuset_migrate_mm(struct mm_
+ static void cpuset_change_task_nodemask(struct task_struct *tsk,
+ nodemask_t *newmems)
+ {
+- bool masks_disjoint = !nodes_intersects(*newmems, tsk->mems_allowed);
++ bool need_loop;
+
+ repeat:
+ /*
+@@ -962,6 +975,14 @@ repeat:
+ return;
+
+ task_lock(tsk);
++ /*
++ * Determine if a loop is necessary if another thread is doing
++ * get_mems_allowed(). If at least one node remains unchanged and
++ * tsk does not have a mempolicy, then an empty nodemask will not be
++ * possible when mems_allowed is larger than a word.
++ */
++ need_loop = task_has_mempolicy(tsk) ||
++ !nodes_intersects(*newmems, tsk->mems_allowed);
+ nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
+ mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
+
+@@ -981,11 +1002,9 @@ repeat:
+
+ /*
+ * Allocation of memory is very fast, we needn't sleep when waiting
+- * for the read-side. No wait is necessary, however, if at least one
+- * node remains unchanged.
++ * for the read-side.
+ */
+- while (masks_disjoint &&
+- ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
++ while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
+ task_unlock(tsk);
+ if (!task_curr(tsk))
+ yield();
--- /dev/null
+From b95a2f2d486d0d768a92879c023a03757b9c7e58 Mon Sep 17 00:00:00 2001
+From: Johannes Weiner <jweiner@redhat.com>
+Date: Thu, 12 Jan 2012 17:18:06 -0800
+Subject: mm: vmscan: convert global reclaim to per-memcg LRU lists
+
+From: Johannes Weiner <jweiner@redhat.com>
+
+commit b95a2f2d486d0d768a92879c023a03757b9c7e58 upstream - WARNING: this is a substitute patch.
+
+Stable note: Not tracked in Bugzilla. This is a partial backport of an
+ upstream commit addressing a completely different issue
+ that accidentally contained an important fix. The workload
+ this patch helps was memcached when IO is started in the
+ background. memcached should stay resident but without this patch
+ it gets swapped. Sometimes this manifests as a drop in throughput
+ but mostly it was observed through /proc/vmstat.
+
+Commit [246e87a9: memcg: fix get_scan_count() for small targets] was meant
+to fix a problem whereby small scan targets on memcg were ignored causing
+priority to raise too sharply. It forced scanning to take place if the
+target was small, memcg or kswapd.
+
+From the time it was introduced it caused excessive reclaim by kswapd
+with workloads being pushed to swap that previously would have stayed
+resident. This was accidentally fixed in commit [b95a2f2d: mm: vmscan:
+convert global reclaim to per-memcg LRU lists] by making it harder for
+kswapd to force scan small targets but that patchset is not suitable for
+backporting. This was later changed again by commit [90126375: mm/vmscan:
+push lruvec pointer into get_scan_count()] into a format that looks
+like it would be a straight-forward backport but there is a subtle
+difference due to the use of lruvecs.
+
+The impact of the accidental fix is to make it harder for kswapd to force
+scan small targets by taking zone->all_unreclaimable into account. This
+patch is the closest equivalent available based on what is backported.
+
+---
+ mm/vmscan.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -1850,7 +1850,8 @@ static void get_scan_count(struct zone *
+ unsigned long nr_force_scan[2];
+
+ /* kswapd does zone balancing and needs to scan this zone */
+- if (scanning_global_lru(sc) && current_is_kswapd())
++ if (scanning_global_lru(sc) && current_is_kswapd() &&
++ zone->all_unreclaimable)
+ force_scan = true;
+ /* memcg may have small limit and need to avoid priority drop */
+ if (!scanning_global_lru(sc))
vmscan-activate-executable-pages-after-first-usage.patch
mm-vmscan.c-consider-swap-space-when-deciding-whether-to-continue-reclaim.patch
mm-test-pageswapbacked-in-lumpy-reclaim.patch
+mm-vmscan-convert-global-reclaim-to-per-memcg-lru-lists.patch
+cpusets-avoid-looping-when-storing-to-mems_allowed-if-one-node-remains-set.patch
+cpusets-stall-when-updating-mems_allowed-for-mempolicy-or-disjoint-nodemask.patch