--- /dev/null
+From d35be8bab9b0ce44bed4b9453f86ebf64062721e Mon Sep 17 00:00:00 2001
+From: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
+Date: Thu, 24 May 2012 19:46:26 +0530
+Subject: CPU hotplug, cpusets, suspend: Don't modify cpusets during suspend/resume
+
+From: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com>
+
+commit d35be8bab9b0ce44bed4b9453f86ebf64062721e upstream.
+
+In the event of CPU hotplug, the kernel modifies the cpusets' cpus_allowed
+masks as and when necessary to ensure that the tasks belonging to the cpusets
+have somewhere (online CPUs) to run. Regular CPU hotplug is destructive
+in the sense that the kernel doesn't remember the original cpuset
+configurations set by the user across hotplug operations.
+
+However, suspend/resume (which uses CPU hotplug) is a special case in which
+the kernel has the responsibility to restore the system (during resume), to
+exactly the same state it was in before suspend.
+
+In order to achieve that, do the following:
+
+1. Don't modify cpusets during suspend/resume. At all.
+ In particular, don't move the tasks from one cpuset to another, and
+ don't modify any cpuset's cpus_allowed mask. So, simply ignore cpusets
+ during the CPU hotplug operations that are carried out in the
+ suspend/resume path.
+
+2. However, cpusets and sched domains are related. We just want to avoid
+ altering cpusets alone. So, to keep the sched domains updated, build
+ a single sched domain (containing all active cpus) during each of the
+ CPU hotplug operations carried out in s/r path, effectively ignoring
+ the cpusets' cpus_allowed masks.
+
+ (Since userspace is frozen while doing all this, it will go unnoticed.)
+
+3. During the last CPU online operation during resume, build the sched
+ domains by looking up the (unaltered) cpusets' cpus_allowed masks.
+ That will bring back the system to the same original state as it was in
+ before suspend.
+
+Ultimately, this will not only solve the cpuset problem related to
+suspend/resume (i.e., restore the cpusets to exactly what they were
+before suspend, by not touching them at all) but also speed up
+suspend/resume, because we avoid running the cpuset update code for
+every CPU being offlined/onlined.
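+
+For reference, the suspend/resume path is distinguishable in a hotplug
+notifier because the actions issued on that path are the *_FROZEN
+variants, i.e. the plain actions with CPU_TASKS_FROZEN ORed in. A
+minimal sketch, for illustration only (my_cpu_callback is a
+hypothetical notifier, not part of this patch):
+
+<snip>
+static int my_cpu_callback(struct notifier_block *nfb,
+			   unsigned long action, void *hcpu)
+{
+	if (action & CPU_TASKS_FROZEN) {
+		/* suspend/resume path: userspace is frozen, so leave
+		 * the cpusets untouched */
+	} else {
+		/* regular hotplug: update cpusets as usual */
+	}
+	return NOTIFY_OK;
+}
+<snip>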
+
+Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Link: http://lkml.kernel.org/r/20120524141611.3692.20155.stgit@srivatsabhat.in.ibm.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/cpuset.c | 3 +++
+ kernel/sched/core.c | 40 ++++++++++++++++++++++++++++++++++++----
+ 2 files changed, 39 insertions(+), 4 deletions(-)
+
+--- a/kernel/cpuset.c
++++ b/kernel/cpuset.c
+@@ -2065,6 +2065,9 @@ static void scan_for_empty_cpusets(struc
+ * (of no affect) on systems that are actively using CPU hotplug
+ * but making no active use of cpusets.
+ *
++ * The only exception to this is suspend/resume, where we don't
++ * modify cpusets at all.
++ *
+ * This routine ensures that top_cpuset.cpus_allowed tracks
+ * cpu_active_mask on each CPU hotplug (cpuhp) event.
+ *
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -6937,34 +6937,66 @@ int __init sched_create_sysfs_power_savi
+ }
+ #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+
++static int num_cpus_frozen; /* used to mark begin/end of suspend/resume */
++
+ /*
+ * Update cpusets according to cpu_active mask. If cpusets are
+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper
+ * around partition_sched_domains().
++ *
++ * If we come here as part of a suspend/resume, don't touch cpusets because we
++ * want to restore them to their original state upon resume anyway.
+ */
+ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
+ void *hcpu)
+ {
+- switch (action & ~CPU_TASKS_FROZEN) {
++ switch (action) {
++ case CPU_ONLINE_FROZEN:
++ case CPU_DOWN_FAILED_FROZEN:
++
++ /*
++ * num_cpus_frozen tracks how many CPUs are involved in suspend
++ * resume sequence. As long as this is not the last online
++ * operation in the resume sequence, just build a single sched
++ * domain, ignoring cpusets.
++ */
++ num_cpus_frozen--;
++ if (likely(num_cpus_frozen)) {
++ partition_sched_domains(1, NULL, NULL);
++ break;
++ }
++
++ /*
++ * This is the last CPU online operation. So fall through and
++ * restore the original sched domains by considering the
++ * cpuset configurations.
++ */
++
+ case CPU_ONLINE:
+ case CPU_DOWN_FAILED:
+ cpuset_update_active_cpus();
+- return NOTIFY_OK;
++ break;
+ default:
+ return NOTIFY_DONE;
+ }
++ return NOTIFY_OK;
+ }
+
+ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
+ void *hcpu)
+ {
+- switch (action & ~CPU_TASKS_FROZEN) {
++ switch (action) {
+ case CPU_DOWN_PREPARE:
+ cpuset_update_active_cpus();
+- return NOTIFY_OK;
++ break;
++ case CPU_DOWN_PREPARE_FROZEN:
++ num_cpus_frozen++;
++ partition_sched_domains(1, NULL, NULL);
++ break;
+ default:
+ return NOTIFY_DONE;
+ }
++ return NOTIFY_OK;
+ }
+
+ void __init sched_init_smp(void)
--- /dev/null
+From 9dead5bbb825d7c25c0400e61de83075046322d0 Mon Sep 17 00:00:00 2001
+From: Matthew Garrett <mjg@redhat.com>
+Date: Thu, 26 Jul 2012 18:00:00 -0400
+Subject: efi: Build EFI stub with EFI-appropriate options
+
+From: Matthew Garrett <mjg@redhat.com>
+
+commit 9dead5bbb825d7c25c0400e61de83075046322d0 upstream.
+
+We can't rely on the red zone while we're still in a boot services
+environment, so we should build with -fno-red-zone to avoid stack
+corruption. Change the size of wchar at the same time to make string
+handling simpler.
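+
+Two background notes, for illustration only (not part of this patch):
+the x86-64 red zone is the 128 bytes below %rsp that the SysV ABI lets
+leaf functions use without adjusting the stack pointer, and firmware
+interrupt handlers that run during boot services may clobber it. And
+UEFI strings are 16-bit UCS-2, hence -fshort-wchar. A sketch of the
+latter assumption:
+
+<snip>
+typedef unsigned short efi_char16_t;	/* UCS-2, as used by UEFI */
+
+/* With -fshort-wchar, wchar_t is 16 bits, so this wide literal has
+ * the in-memory layout (2 bytes per character) that EFI expects: */
+static efi_char16_t hello[] = L"hello";
+<snip>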
+
+Signed-off-by: Matthew Garrett <mjg@redhat.com>
+Signed-off-by: Matt Fleming <matt.fleming@intel.com>
+Acked-by: Josh Boyer <jwboyer@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/boot/compressed/Makefile | 3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/arch/x86/boot/compressed/Makefile
++++ b/arch/x86/boot/compressed/Makefile
+@@ -28,6 +28,9 @@ VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)
+ $(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o \
+ $(obj)/piggy.o
+
++$(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
++$(obj)/efi_stub_$(BITS).o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
++
+ ifeq ($(CONFIG_EFI_STUB), y)
+ VMLINUX_OBJS += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o
+ endif
--- /dev/null
+From d6cf86d8f23253225fe2a763d627ecf7dfee9dae Mon Sep 17 00:00:00 2001
+From: Seiji Aguchi <seiji.aguchi@hds.com>
+Date: Tue, 24 Jul 2012 13:27:23 +0000
+Subject: efi: initialize efi.runtime_version to make query_variable_info/update_capsule workable
+
+From: Seiji Aguchi <seiji.aguchi@hds.com>
+
+commit d6cf86d8f23253225fe2a763d627ecf7dfee9dae upstream.
+
+The value of efi.runtime_version is checked before calling
+update_capsule()/query_variable_info(), as follows, but it is never
+initialized anywhere.
+
+<snip>
+static efi_status_t virt_efi_query_variable_info(u32 attr,
+ u64 *storage_space,
+ u64 *remaining_space,
+ u64 *max_variable_size)
+{
+ if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION)
+ return EFI_UNSUPPORTED;
+<snip>
+
+This patch initializes efi.runtime_version at boot time.
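+
+For reference, EFI system table revisions pack the major and minor
+numbers into a single 32-bit value, so the constant used in the check
+above is defined (in include/linux/efi.h) as:
+
+<snip>
+#define EFI_2_00_SYSTEM_TABLE_REVISION  ((2 << 16) | (00))
+<snip>
+
+and any revision value of 2.00 or later passes the comparison.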
+
+Signed-off-by: Seiji Aguchi <seiji.aguchi@hds.com>
+Acked-by: Matthew Garrett <mjg@redhat.com>
+Signed-off-by: Matt Fleming <matt.fleming@intel.com>
+Signed-off-by: Ivan Hu <ivan.hu@canonical.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/platform/efi/efi.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/x86/platform/efi/efi.c
++++ b/arch/x86/platform/efi/efi.c
+@@ -890,6 +890,7 @@ void __init efi_enter_virtual_mode(void)
+ *
+ * Call EFI services through wrapper functions.
+ */
++ efi.runtime_version = efi_systab.fw_revision;
+ efi.get_time = virt_efi_get_time;
+ efi.set_time = virt_efi_set_time;
+ efi.get_wakeup_time = virt_efi_get_wakeup_time;
--- /dev/null
+From 00442ad04a5eac08a98255697c510e708f6082e2 Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@suse.de>
+Date: Mon, 8 Oct 2012 16:29:20 -0700
+Subject: mempolicy: fix a memory corruption by refcount imbalance in alloc_pages_vma()
+
+From: Mel Gorman <mgorman@suse.de>
+
+commit 00442ad04a5eac08a98255697c510e708f6082e2 upstream.
+
+Commit cc9a6c877661 ("cpuset: mm: reduce large amounts of memory barrier
+related damage v3") introduced a potential memory corruption.
+shmem_alloc_page() uses a pseudo vma with one significant, unique
+combination of attributes: vma->vm_ops=NULL and vma->policy->flags &
+MPOL_F_SHARED.
+
+get_vma_policy() does NOT take a policy reference when vma->vm_ops=NULL,
+but mpol_cond_put() DOES drop one when a policy has MPOL_F_SHARED.
+Therefore, when a cpuset update race occurs, alloc_pages_vma() falls
+into the 'goto retry_cpuset' path, decrements the reference count and
+frees the policy prematurely.
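+
+A sketch, for illustration only, of the asymmetric pairing that makes
+this possible:
+
+<snip>
+	pol = get_vma_policy(current, vma, addr);
+	/* takes a reference only when vma->vm_ops->get_policy exists */
+	...
+	mpol_cond_put(pol);
+	/* drops a reference whenever pol->flags & MPOL_F_SHARED is set */
+<snip>
+
+With the shmem pseudo vma the put side fires without a matching get, so
+each pass around the retry loop frees one reference too many.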
+
+Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Reviewed-by: Christoph Lameter <cl@linux.com>
+Cc: Josh Boyer <jwboyer@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/mempolicy.c | 12 +++++++++++-
+ 1 file changed, 11 insertions(+), 1 deletion(-)
+
+--- a/mm/mempolicy.c
++++ b/mm/mempolicy.c
+@@ -1532,8 +1532,18 @@ struct mempolicy *get_vma_policy(struct
+ addr);
+ if (vpol)
+ pol = vpol;
+- } else if (vma->vm_policy)
++ } else if (vma->vm_policy) {
+ pol = vma->vm_policy;
++
++ /*
++ * shmem_alloc_page() passes MPOL_F_SHARED policy with
++ * a pseudo vma whose vma->vm_ops=NULL. Take a reference
++ * count on these policies which will be dropped by
++ * mpol_cond_put() later
++ */
++ if (mpol_needs_cond_ref(pol))
++ mpol_get(pol);
++ }
+ }
+ if (!pol)
+ pol = &default_policy;
--- /dev/null
+From b22d127a39ddd10d93deee3d96e643657ad53a49 Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@suse.de>
+Date: Mon, 8 Oct 2012 16:29:17 -0700
+Subject: mempolicy: fix a race in shared_policy_replace()
+
+From: Mel Gorman <mgorman@suse.de>
+
+commit b22d127a39ddd10d93deee3d96e643657ad53a49 upstream.
+
+shared_policy_replace()'s use of sp_alloc() is unsafe: 1) an sp_node
+cannot be dereferenced if sp->lock is not held, and 2) another thread can
+modify the sp_node between the spin_unlock (taken so a new sp node can
+be allocated) and the next spin_lock. The bug predates 2.6.12-rc2.
+
+Kosaki's original patch for this problem was to allocate an sp node and
+policy within shared_policy_replace() and initialise it when the lock is
+reacquired. I was not keen on this approach because it partially
+duplicates sp_alloc(). As the paths where sp->lock is taken are not that
+performance critical, this patch converts sp->lock to sp->mutex so it
+can sleep when calling sp_alloc().
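+
+The unsafe pattern, sketched for illustration (simplified from the code
+this patch removes):
+
+<snip>
+	spin_lock(&sp->lock);
+	n = sp_lookup(sp, start, end);	/* n is only valid under the lock */
+	...
+	spin_unlock(&sp->lock);		/* lock dropped to allocate ...   */
+	new2 = sp_alloc(end, n->end, n->policy); /* ... yet n is used here */
+	spin_lock(&sp->lock);		/* n may have changed meanwhile   */
+<snip>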
+
+[kosaki.motohiro@jp.fujitsu.com: Original patch]
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Reviewed-by: Christoph Lameter <cl@linux.com>
+Cc: Josh Boyer <jwboyer@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/mempolicy.h | 2 +-
+ mm/mempolicy.c | 37 ++++++++++++++++---------------------
+ 2 files changed, 17 insertions(+), 22 deletions(-)
+
+--- a/include/linux/mempolicy.h
++++ b/include/linux/mempolicy.h
+@@ -188,7 +188,7 @@ struct sp_node {
+
+ struct shared_policy {
+ struct rb_root root;
+- spinlock_t lock;
++ struct mutex mutex;
+ };
+
+ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol);
+--- a/mm/mempolicy.c
++++ b/mm/mempolicy.c
+@@ -2057,7 +2057,7 @@ bool __mpol_equal(struct mempolicy *a, s
+ */
+
+ /* lookup first element intersecting start-end */
+-/* Caller holds sp->lock */
++/* Caller holds sp->mutex */
+ static struct sp_node *
+ sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
+ {
+@@ -2121,13 +2121,13 @@ mpol_shared_policy_lookup(struct shared_
+
+ if (!sp->root.rb_node)
+ return NULL;
+- spin_lock(&sp->lock);
++ mutex_lock(&sp->mutex);
+ sn = sp_lookup(sp, idx, idx+1);
+ if (sn) {
+ mpol_get(sn->policy);
+ pol = sn->policy;
+ }
+- spin_unlock(&sp->lock);
++ mutex_unlock(&sp->mutex);
+ return pol;
+ }
+
+@@ -2167,10 +2167,10 @@ static struct sp_node *sp_alloc(unsigned
+ static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
+ unsigned long end, struct sp_node *new)
+ {
+- struct sp_node *n, *new2 = NULL;
++ struct sp_node *n;
++ int ret = 0;
+
+-restart:
+- spin_lock(&sp->lock);
++ mutex_lock(&sp->mutex);
+ n = sp_lookup(sp, start, end);
+ /* Take care of old policies in the same range. */
+ while (n && n->start < end) {
+@@ -2183,16 +2183,14 @@ restart:
+ } else {
+ /* Old policy spanning whole new range. */
+ if (n->end > end) {
++ struct sp_node *new2;
++ new2 = sp_alloc(end, n->end, n->policy);
+ if (!new2) {
+- spin_unlock(&sp->lock);
+- new2 = sp_alloc(end, n->end, n->policy);
+- if (!new2)
+- return -ENOMEM;
+- goto restart;
++ ret = -ENOMEM;
++ goto out;
+ }
+ n->end = start;
+ sp_insert(sp, new2);
+- new2 = NULL;
+ break;
+ } else
+ n->end = start;
+@@ -2203,12 +2201,9 @@ restart:
+ }
+ if (new)
+ sp_insert(sp, new);
+- spin_unlock(&sp->lock);
+- if (new2) {
+- mpol_put(new2->policy);
+- kmem_cache_free(sn_cache, new2);
+- }
+- return 0;
++out:
++ mutex_unlock(&sp->mutex);
++ return ret;
+ }
+
+ /**
+@@ -2226,7 +2221,7 @@ void mpol_shared_policy_init(struct shar
+ int ret;
+
+ sp->root = RB_ROOT; /* empty tree == default mempolicy */
+- spin_lock_init(&sp->lock);
++ mutex_init(&sp->mutex);
+
+ if (mpol) {
+ struct vm_area_struct pvma;
+@@ -2292,7 +2287,7 @@ void mpol_free_shared_policy(struct shar
+
+ if (!p->root.rb_node)
+ return;
+- spin_lock(&p->lock);
++ mutex_lock(&p->mutex);
+ next = rb_first(&p->root);
+ while (next) {
+ n = rb_entry(next, struct sp_node, nd);
+@@ -2301,7 +2296,7 @@ void mpol_free_shared_policy(struct shar
+ mpol_put(n->policy);
+ kmem_cache_free(sn_cache, n);
+ }
+- spin_unlock(&p->lock);
++ mutex_unlock(&p->mutex);
+ }
+
+ /* assumes fs == KERNEL_DS */
--- /dev/null
+From 63f74ca21f1fad36d075e063f06dcc6d39fe86b2 Mon Sep 17 00:00:00 2001
+From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Date: Mon, 8 Oct 2012 16:29:19 -0700
+Subject: mempolicy: fix refcount leak in mpol_set_shared_policy()
+
+From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+
+commit 63f74ca21f1fad36d075e063f06dcc6d39fe86b2 upstream.
+
+When shared_policy_replace() fails to allocate, new->policy is not freed
+correctly by mpol_set_shared_policy(). The problem is that the shared
+mempolicy code directly calls kmem_cache_free() in multiple places, where
+it is easy to make a mistake.
+
+This patch creates an sp_free() wrapper function and uses it. The bug
+predates git (IOW, it was introduced before 2.6.12-rc2).
+
+[mgorman@suse.de: Editted changelog]
+Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Reviewed-by: Christoph Lameter <cl@linux.com>
+Cc: Josh Boyer <jwboyer@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/mempolicy.c | 15 +++++++++------
+ 1 file changed, 9 insertions(+), 6 deletions(-)
+
+--- a/mm/mempolicy.c
++++ b/mm/mempolicy.c
+@@ -2131,12 +2131,17 @@ mpol_shared_policy_lookup(struct shared_
+ return pol;
+ }
+
++static void sp_free(struct sp_node *n)
++{
++ mpol_put(n->policy);
++ kmem_cache_free(sn_cache, n);
++}
++
+ static void sp_delete(struct shared_policy *sp, struct sp_node *n)
+ {
+ pr_debug("deleting %lx-l%lx\n", n->start, n->end);
+ rb_erase(&n->nd, &sp->root);
+- mpol_put(n->policy);
+- kmem_cache_free(sn_cache, n);
++ sp_free(n);
+ }
+
+ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
+@@ -2275,7 +2280,7 @@ int mpol_set_shared_policy(struct shared
+ }
+ err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
+ if (err && new)
+- kmem_cache_free(sn_cache, new);
++ sp_free(new);
+ return err;
+ }
+
+@@ -2292,9 +2297,7 @@ void mpol_free_shared_policy(struct shar
+ while (next) {
+ n = rb_entry(next, struct sp_node, nd);
+ next = rb_next(&n->nd);
+- rb_erase(&n->nd, &p->root);
+- mpol_put(n->policy);
+- kmem_cache_free(sn_cache, n);
++ sp_delete(p, n);
+ }
+ mutex_unlock(&p->mutex);
+ }
--- /dev/null
+From 869833f2c5c6e4dd09a5378cfc665ffb4615e5d2 Mon Sep 17 00:00:00 2001
+From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Date: Mon, 8 Oct 2012 16:29:16 -0700
+Subject: mempolicy: remove mempolicy sharing
+
+From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+
+commit 869833f2c5c6e4dd09a5378cfc665ffb4615e5d2 upstream.
+
+Dave Jones' system call fuzz testing tool "trinity" triggered the
+following bug report with slab debugging enabled:
+
+ =============================================================================
+ BUG numa_policy (Not tainted): Poison overwritten
+ -----------------------------------------------------------------------------
+
+ INFO: 0xffff880146498250-0xffff880146498250. First byte 0x6a instead of 0x6b
+ INFO: Allocated in mpol_new+0xa3/0x140 age=46310 cpu=6 pid=32154
+ __slab_alloc+0x3d3/0x445
+ kmem_cache_alloc+0x29d/0x2b0
+ mpol_new+0xa3/0x140
+ sys_mbind+0x142/0x620
+ system_call_fastpath+0x16/0x1b
+
+ INFO: Freed in __mpol_put+0x27/0x30 age=46268 cpu=6 pid=32154
+ __slab_free+0x2e/0x1de
+ kmem_cache_free+0x25a/0x260
+ __mpol_put+0x27/0x30
+ remove_vma+0x68/0x90
+ exit_mmap+0x118/0x140
+ mmput+0x73/0x110
+ exit_mm+0x108/0x130
+ do_exit+0x162/0xb90
+ do_group_exit+0x4f/0xc0
+ sys_exit_group+0x17/0x20
+ system_call_fastpath+0x16/0x1b
+
+ INFO: Slab 0xffffea0005192600 objects=27 used=27 fp=0x (null) flags=0x20000000004080
+ INFO: Object 0xffff880146498250 @offset=592 fp=0xffff88014649b9d0
+
+The problem is that the structure is being prematurely freed due to a
+reference count imbalance. In the following case mbind(addr, len) should
+replace the memory policies of both vma1 and vma2; they thus come to
+share the same mempolicy, and the new mempolicy has the MPOL_F_SHARED
+flag set.
+
+ +-------------------+-------------------+
+ | vma1 | vma2(shmem) |
+ +-------------------+-------------------+
+ | |
+ addr addr+len
+
+alloc_pages_vma() uses the get_vma_policy()/mpol_cond_put() pair to
+maintain the mempolicy reference count. The current rule is that
+get_vma_policy() only increments the refcount for a shmem VMA, and
+mpol_cond_put() only decrements the refcount if the policy has
+MPOL_F_SHARED.
+
+In the above case, vma1 is not a shmem vma, yet vma->policy has
+MPOL_F_SHARED! The reference count is therefore decreased even though it
+was never increased, every time alloc_pages_vma() is called. This has
+been broken since commit [52cd3b07: mempolicy: rework mempolicy
+Reference Counting] in 2008.
+
+There is another serious bug with the sharing of memory policies.
+Currently, the mempolicy rebind logic (called from cpuset rebinding)
+ignores the refcount of the mempolicy and overrides it forcibly. Thus,
+any mempolicy sharing may cause mempolicy corruption. The bug was
+introduced by commit [68860ec1: cpusets: automatic numa mempolicy
+rebinding].
+
+Ideally, the shared policy handling would be rewritten to either
+properly handle COW of the policy structures or at least reference count
+MPOL_F_SHARED based exclusively on information within the policy.
+However, this patch takes the easier approach of disabling any policy
+sharing between VMAs. Each new range allocated with sp_alloc will
+allocate a new policy, set the reference count to 1 and drop the
+reference count of the old policy. This increases the memory footprint
+but is not expected to be a major problem as mbind() is unlikely to be
+used for fine-grained ranges. It is also inefficient because it means
+we allocate a new policy even in cases where mbind_range() could use the
+new_policy passed to it. However, it is more straightforward and the
+change should be invisible to the user.
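+
+A sketch of the new ownership rule, for illustration only: each sp node
+now holds a private copy of the policy rather than a shared reference:
+
+<snip>
+	newpol = mpol_dup(pol);		/* private copy, refcount == 1  */
+	if (IS_ERR(newpol))
+		return NULL;		/* duplication failed           */
+	newpol->flags |= MPOL_F_SHARED;	/* dropped by mpol_cond_put()   */
+	n->policy = newpol;		/* the node owns its own policy */
+<snip>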
+
+[mgorman@suse.de: Edited changelog]
+Reported-by: Dave Jones <davej@redhat.com>
+Cc: Christoph Lameter <cl@linux.com>
+Reviewed-by: Christoph Lameter <cl@linux.com>
+Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Cc: Josh Boyer <jwboyer@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/mempolicy.c | 52 ++++++++++++++++++++++++++++++++++++++--------------
+ 1 file changed, 38 insertions(+), 14 deletions(-)
+
+--- a/mm/mempolicy.c
++++ b/mm/mempolicy.c
+@@ -607,24 +607,39 @@ check_range(struct mm_struct *mm, unsign
+ return first;
+ }
+
+-/* Apply policy to a single VMA */
+-static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
++/*
++ * Apply policy to a single VMA
++ * This must be called with the mmap_sem held for writing.
++ */
++static int vma_replace_policy(struct vm_area_struct *vma,
++ struct mempolicy *pol)
+ {
+- int err = 0;
+- struct mempolicy *old = vma->vm_policy;
++ int err;
++ struct mempolicy *old;
++ struct mempolicy *new;
+
+ pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
+ vma->vm_start, vma->vm_end, vma->vm_pgoff,
+ vma->vm_ops, vma->vm_file,
+ vma->vm_ops ? vma->vm_ops->set_policy : NULL);
+
+- if (vma->vm_ops && vma->vm_ops->set_policy)
++ new = mpol_dup(pol);
++ if (IS_ERR(new))
++ return PTR_ERR(new);
++
++ if (vma->vm_ops && vma->vm_ops->set_policy) {
+ err = vma->vm_ops->set_policy(vma, new);
+- if (!err) {
+- mpol_get(new);
+- vma->vm_policy = new;
+- mpol_put(old);
++ if (err)
++ goto err_out;
+ }
++
++ old = vma->vm_policy;
++ vma->vm_policy = new; /* protected by mmap_sem */
++ mpol_put(old);
++
++ return 0;
++ err_out:
++ mpol_put(new);
+ return err;
+ }
+
+@@ -676,7 +691,7 @@ static int mbind_range(struct mm_struct
+ if (err)
+ goto out;
+ }
+- err = policy_vma(vma, new_pol);
++ err = vma_replace_policy(vma, new_pol);
+ if (err)
+ goto out;
+ }
+@@ -2127,15 +2142,24 @@ static void sp_delete(struct shared_poli
+ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
+ struct mempolicy *pol)
+ {
+- struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
++ struct sp_node *n;
++ struct mempolicy *newpol;
+
++ n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
+ if (!n)
+ return NULL;
++
++ newpol = mpol_dup(pol);
++ if (IS_ERR(newpol)) {
++ kmem_cache_free(sn_cache, n);
++ return NULL;
++ }
++ newpol->flags |= MPOL_F_SHARED;
++
+ n->start = start;
+ n->end = end;
+- mpol_get(pol);
+- pol->flags |= MPOL_F_SHARED; /* for unref */
+- n->policy = pol;
++ n->policy = newpol;
++
+ return n;
+ }
+
--- /dev/null
+From 8d34694c1abf29df1f3c7317936b7e3e2e308d9b Mon Sep 17 00:00:00 2001
+From: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
+Date: Mon, 8 Oct 2012 16:29:14 -0700
+Subject: revert "mm: mempolicy: Let vma_merge and vma_split handle vma->vm_policy linkages"
+
+From: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
+
+commit 8d34694c1abf29df1f3c7317936b7e3e2e308d9b upstream.
+
+Commit 05f144a0d5c2 ("mm: mempolicy: Let vma_merge and vma_split handle
+vma->vm_policy linkages") removed the vma->vm_policy update code, but that
+update is the very purpose of mbind_range(). Now mbind_range() is
+virtually a no-op, and while it does not allow memory corruption, it is
+not the right fix. This patch is a revert.
+
+[mgorman@suse.de: Edited changelog]
+Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Cc: Christoph Lameter <cl@linux.com>
+Cc: Josh Boyer <jwboyer@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/mempolicy.c | 41 ++++++++++++++++++++++++-----------------
+ 1 file changed, 24 insertions(+), 17 deletions(-)
+
+--- a/mm/mempolicy.c
++++ b/mm/mempolicy.c
+@@ -607,6 +607,27 @@ check_range(struct mm_struct *mm, unsign
+ return first;
+ }
+
++/* Apply policy to a single VMA */
++static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
++{
++ int err = 0;
++ struct mempolicy *old = vma->vm_policy;
++
++ pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
++ vma->vm_start, vma->vm_end, vma->vm_pgoff,
++ vma->vm_ops, vma->vm_file,
++ vma->vm_ops ? vma->vm_ops->set_policy : NULL);
++
++ if (vma->vm_ops && vma->vm_ops->set_policy)
++ err = vma->vm_ops->set_policy(vma, new);
++ if (!err) {
++ mpol_get(new);
++ vma->vm_policy = new;
++ mpol_put(old);
++ }
++ return err;
++}
++
+ /* Step 2: apply policy to a range and do splits. */
+ static int mbind_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct mempolicy *new_pol)
+@@ -655,23 +676,9 @@ static int mbind_range(struct mm_struct
+ if (err)
+ goto out;
+ }
+-
+- /*
+- * Apply policy to a single VMA. The reference counting of
+- * policy for vma_policy linkages has already been handled by
+- * vma_merge and split_vma as necessary. If this is a shared
+- * policy then ->set_policy will increment the reference count
+- * for an sp node.
+- */
+- pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
+- vma->vm_start, vma->vm_end, vma->vm_pgoff,
+- vma->vm_ops, vma->vm_file,
+- vma->vm_ops ? vma->vm_ops->set_policy : NULL);
+- if (vma->vm_ops && vma->vm_ops->set_policy) {
+- err = vma->vm_ops->set_policy(vma, new_pol);
+- if (err)
+- goto out;
+- }
++ err = policy_vma(vma, new_pol);
++ if (err)
++ goto out;
+ }
+
+ out:
rcu-fix-day-one-dyntick-idle-stall-warning-bug.patch
r8169-config1-is-read-only-on-8168c-and-later.patch
r8169-8168c-and-later-require-bit-0x20-to-be-set-in-config2-for-pme-signaling.patch
+revert-mm-mempolicy-let-vma_merge-and-vma_split-handle-vma-vm_policy-linkages.patch
+mempolicy-remove-mempolicy-sharing.patch
+mempolicy-fix-a-race-in-shared_policy_replace.patch
+mempolicy-fix-refcount-leak-in-mpol_set_shared_policy.patch
+mempolicy-fix-a-memory-corruption-by-refcount-imbalance-in-alloc_pages_vma.patch
+efi-build-efi-stub-with-efi-appropriate-options.patch
+efi-initialize-efi.runtime_version-to-make-query_variable_info-update_capsule-workable.patch
+cpu-hotplug-cpusets-suspend-don-t-modify-cpusets-during-suspend-resume.patch