--- /dev/null
+From 5edee61edeaaebafe584f8fb7074c1ef4658596b Mon Sep 17 00:00:00 2001
+From: Tejun Heo <tj@kernel.org>
+Date: Tue, 16 Oct 2012 15:03:14 -0700
+Subject: cgroup: cgroup_subsys->fork() should be called after the task is added to css_set
+
+From: Tejun Heo <tj@kernel.org>
+
+commit 5edee61edeaaebafe584f8fb7074c1ef4658596b upstream.
+
+cgroup core has a bug which violates a basic rule about event
+notifications - when a new entity needs to be added, you add that to
+the notification list first and then make the new entity conform to
+the current state. If done in the reverse order, an event happening
+in between will be lost.
+
+cgroup_subsys->fork() is invoked way before the new task is added to
+the css_set. Currently, cgroup_freezer is the only user of ->fork()
+and uses it to make new tasks conform to the current state of the
+freezer. If FROZEN state is requested while fork is in progress
+between cgroup_fork_callbacks() and cgroup_post_fork(), the child
+could escape freezing - the cgroup isn't frozen when ->fork() is
+called and the freezer couldn't see the new task on the css_set.
+
+This patch moves cgroup_subsys->fork() invocation to
+cgroup_post_fork() after the new task is added to the css_set.
+cgroup_fork_callbacks() is removed.
+
+Because now a task may be migrated during cgroup_subsys->fork(),
+freezer_fork() is updated so that it adheres to the usual RCU locking
+and the rather pointless comment on why locking can be different there
+is removed (if it doesn't make anything simpler, why even bother?).
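+
+Schematically, the fork path after this change becomes (a simplified
+sketch, not the complete code):
+
+        cgroup_fork(child);       /* child starts on the parent's css_set */
+        /* ... child becomes visible on the tasklist ... */
+        cgroup_post_fork(child);  /* first link child on its css_set,
+                                     then call each ss->fork(child) */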
+
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: Rafael J. Wysocki <rjw@sisk.pl>
+[hq: Backported to 3.4:
+ - Adjust context
+ - Iterate over first CGROUP_BUILTIN_SUBSYS_COUNT elements of subsys]
+Signed-off-by: Qiang Huang <h.huangqiang@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/cgroup.h | 1
+ kernel/cgroup.c | 50 ++++++++++++++++++++----------------------------
+ kernel/cgroup_freezer.c | 13 +++---------
+ kernel/fork.c | 9 --------
+ 4 files changed, 26 insertions(+), 47 deletions(-)
+
+--- a/include/linux/cgroup.h
++++ b/include/linux/cgroup.h
+@@ -32,7 +32,6 @@ extern int cgroup_lock_is_held(void);
+ extern bool cgroup_lock_live_group(struct cgroup *cgrp);
+ extern void cgroup_unlock(void);
+ extern void cgroup_fork(struct task_struct *p);
+-extern void cgroup_fork_callbacks(struct task_struct *p);
+ extern void cgroup_post_fork(struct task_struct *p);
+ extern void cgroup_exit(struct task_struct *p, int run_callbacks);
+ extern int cgroupstats_build(struct cgroupstats *stats,
+--- a/kernel/cgroup.c
++++ b/kernel/cgroup.c
+@@ -4497,41 +4497,19 @@ void cgroup_fork(struct task_struct *chi
+ }
+
+ /**
+- * cgroup_fork_callbacks - run fork callbacks
+- * @child: the new task
+- *
+- * Called on a new task very soon before adding it to the
+- * tasklist. No need to take any locks since no-one can
+- * be operating on this task.
+- */
+-void cgroup_fork_callbacks(struct task_struct *child)
+-{
+- if (need_forkexit_callback) {
+- int i;
+- /*
+- * forkexit callbacks are only supported for builtin
+- * subsystems, and the builtin section of the subsys array is
+- * immutable, so we don't need to lock the subsys array here.
+- */
+- for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
+- struct cgroup_subsys *ss = subsys[i];
+- if (ss->fork)
+- ss->fork(child);
+- }
+- }
+-}
+-
+-/**
+ * cgroup_post_fork - called on a new task after adding it to the task list
+ * @child: the task in question
+ *
+- * Adds the task to the list running through its css_set if necessary.
+- * Has to be after the task is visible on the task list in case we race
+- * with the first call to cgroup_iter_start() - to guarantee that the
+- * new task ends up on its list.
++ * Adds the task to the list running through its css_set if necessary and
++ * call the subsystem fork() callbacks. Has to be after the task is
++ * visible on the task list in case we race with the first call to
++ * cgroup_iter_start() - to guarantee that the new task ends up on its
++ * list.
+ */
+ void cgroup_post_fork(struct task_struct *child)
+ {
++ int i;
++
+ /*
+ * use_task_css_set_links is set to 1 before we walk the tasklist
+ * under the tasklist_lock and we read it here after we added the child
+@@ -4551,7 +4529,21 @@ void cgroup_post_fork(struct task_struct
+ task_unlock(child);
+ write_unlock(&css_set_lock);
+ }
++
++ /*
++ * Call ss->fork(). This must happen after @child is linked on
++ * css_set; otherwise, @child might change state between ->fork()
++ * and addition to css_set.
++ */
++ if (need_forkexit_callback) {
++ for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
++ struct cgroup_subsys *ss = subsys[i];
++ if (ss->fork)
++ ss->fork(child);
++ }
++ }
+ }
++
+ /**
+ * cgroup_exit - detach cgroup from exiting task
+ * @tsk: pointer to task_struct of exiting process
+--- a/kernel/cgroup_freezer.c
++++ b/kernel/cgroup_freezer.c
+@@ -186,23 +186,15 @@ static void freezer_fork(struct task_str
+ {
+ struct freezer *freezer;
+
+- /*
+- * No lock is needed, since the task isn't on tasklist yet,
+- * so it can't be moved to another cgroup, which means the
+- * freezer won't be removed and will be valid during this
+- * function call. Nevertheless, apply RCU read-side critical
+- * section to suppress RCU lockdep false positives.
+- */
+ rcu_read_lock();
+ freezer = task_freezer(task);
+- rcu_read_unlock();
+
+ /*
+ * The root cgroup is non-freezable, so we can skip the
+ * following check.
+ */
+ if (!freezer->css.cgroup->parent)
+- return;
++ goto out;
+
+ spin_lock_irq(&freezer->lock);
+ BUG_ON(freezer->state == CGROUP_FROZEN);
+@@ -210,7 +202,10 @@ static void freezer_fork(struct task_str
+ /* Locking avoids race with FREEZING -> THAWED transitions. */
+ if (freezer->state == CGROUP_FREEZING)
+ freeze_task(task);
++
+ spin_unlock_irq(&freezer->lock);
++out:
++ rcu_read_unlock();
+ }
+
+ /*
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -1124,7 +1124,6 @@ static struct task_struct *copy_process(
+ {
+ int retval;
+ struct task_struct *p;
+- int cgroup_callbacks_done = 0;
+
+ if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
+ return ERR_PTR(-EINVAL);
+@@ -1383,12 +1382,6 @@ static struct task_struct *copy_process(
+ p->group_leader = p;
+ INIT_LIST_HEAD(&p->thread_group);
+
+- /* Now that the task is set up, run cgroup callbacks if
+- * necessary. We need to run them before the task is visible
+- * on the tasklist. */
+- cgroup_fork_callbacks(p);
+- cgroup_callbacks_done = 1;
+-
+ /* Need tasklist lock for parent etc handling! */
+ write_lock_irq(&tasklist_lock);
+
+@@ -1493,7 +1486,7 @@ bad_fork_cleanup_cgroup:
+ #endif
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_change_end(current);
+- cgroup_exit(p, cgroup_callbacks_done);
++ cgroup_exit(p, 0);
+ delayacct_tsk_free(p);
+ module_put(task_thread_info(p)->exec_domain->module);
+ bad_fork_cleanup_count:
--- /dev/null
+From 2235df8b82f6333f2f1a4c6e0acf80f19f591b55 Mon Sep 17 00:00:00 2001
+From: Tejun Heo <tj@kernel.org>
+Date: Tue, 25 Jun 2013 11:48:32 -0700
+Subject: cgroup: fix RCU accesses to task->cgroups
+
+From: Tejun Heo <tj@kernel.org>
+
+commit 14611e51a57df10240817d8ada510842faf0ec51 upstream.
+
+task->cgroups is a RCU pointer pointing to struct css_set. A task
+switches to a different css_set on cgroup migration but a css_set
+doesn't change once created and its pointers to cgroup_subsys_states
+aren't RCU protected.
+
+task_subsys_state[_check]() is the macro to acquire css given a task
+and subsys_id pair. It RCU-dereferences task->cgroups->subsys[] not
+task->cgroups, so the RCU pointer task->cgroups ends up being
+dereferenced without read_barrier_depends() after it. It's broken.
+
+Fix it by introducing task_css_set[_check]() which does
+RCU-dereference on task->cgroups. task_subsys_state[_check]() is
+reimplemented to directly dereference ->subsys[] of the css_set
+returned from task_css_set[_check]().
+
+This removes some of the sparse RCU warnings in cgroup.
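+
+For illustration, a typical use of the new helper under RCU (a simplified
+sketch, not code from this patch) would be:
+
+        struct css_set *cset;
+
+        rcu_read_lock();
+        cset = task_css_set(task);
+        /* cset and cset->subsys[] may be inspected inside this section */
+        rcu_read_unlock();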
+
+v2: Fixed unbalanced parenthesis and there's no need to use
+ rcu_dereference_raw() when !CONFIG_PROVE_RCU. Both spotted by Li.
+
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Reported-by: Fengguang Wu <fengguang.wu@intel.com>
+Acked-by: Li Zefan <lizefan@huawei.com>
+[bwh: Backported to 3.2:
+ - Adjust context
+ - Remove CONFIG_PROVE_RCU condition
+ - s/lockdep_is_held(&cgroup_mutex)/cgroup_lock_is_held()/]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Cc: Qiang Huang <h.huangqiang@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/cgroup.h | 52 ++++++++++++++++++++++++++++++++++++++++++-------
+ 1 file changed, 45 insertions(+), 7 deletions(-)
+
+--- a/include/linux/cgroup.h
++++ b/include/linux/cgroup.h
+@@ -513,16 +513,54 @@ static inline struct cgroup_subsys_state
+ return cgrp->subsys[subsys_id];
+ }
+
+-/*
+- * function to get the cgroup_subsys_state which allows for extra
+- * rcu_dereference_check() conditions, such as locks used during the
+- * cgroup_subsys::attach() methods.
++/**
++ * task_css_set_check - obtain a task's css_set with extra access conditions
++ * @task: the task to obtain css_set for
++ * @__c: extra condition expression to be passed to rcu_dereference_check()
++ *
++ * A task's css_set is RCU protected, initialized and exited while holding
++ * task_lock(), and can only be modified while holding both cgroup_mutex
++ * and task_lock() while the task is alive. This macro verifies that the
++ * caller is inside proper critical section and returns @task's css_set.
++ *
++ * The caller can also specify additional allowed conditions via @__c, such
++ * as locks used during the cgroup_subsys::attach() methods.
++ */
++#define task_css_set_check(task, __c) \
++ rcu_dereference_check((task)->cgroups, \
++ lockdep_is_held(&(task)->alloc_lock) || \
++ cgroup_lock_is_held() || (__c))
++
++/**
++ * task_subsys_state_check - obtain css for (task, subsys) w/ extra access conds
++ * @task: the target task
++ * @subsys_id: the target subsystem ID
++ * @__c: extra condition expression to be passed to rcu_dereference_check()
++ *
++ * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The
++ * synchronization rules are the same as task_css_set_check().
+ */
+ #define task_subsys_state_check(task, subsys_id, __c) \
+- rcu_dereference_check(task->cgroups->subsys[subsys_id], \
+- lockdep_is_held(&task->alloc_lock) || \
+- cgroup_lock_is_held() || (__c))
++ task_css_set_check((task), (__c))->subsys[(subsys_id)]
+
++/**
++ * task_css_set - obtain a task's css_set
++ * @task: the task to obtain css_set for
++ *
++ * See task_css_set_check().
++ */
++static inline struct css_set *task_css_set(struct task_struct *task)
++{
++ return task_css_set_check(task, false);
++}
++
++/**
++ * task_subsys_state - obtain css for (task, subsys)
++ * @task: the target task
++ * @subsys_id: the target subsystem ID
++ *
++ * See task_subsys_state_check().
++ */
+ static inline struct cgroup_subsys_state *
+ task_subsys_state(struct task_struct *task, int subsys_id)
+ {
--- /dev/null
+From da4eda14795e2e246fc340ba181a95b09ac5205c Mon Sep 17 00:00:00 2001
+From: Greg Edwards <gedwards@ddn.com>
+Date: Mon, 4 Nov 2013 09:08:12 -0700
+Subject: KVM: IOMMU: hva align mapping page size
+
+From: Greg Edwards <gedwards@ddn.com>
+
+commit 27ef63c7e97d1e5dddd85051c03f8d44cc887f34 upstream.
+
+When determining the page size we could use to map with the IOMMU, the
+page size should also be aligned with the hva, not just the gfn. The
+gfn may not reflect the real alignment within the hugetlbfs file.
+
+Most of the time, this works fine. However, if the hugetlbfs file is
+backed by non-contiguous huge pages, a multi-huge page memslot starts at
+an unaligned offset within the hugetlbfs file, and the gfn is aligned
+with respect to the huge page size, then kvm_host_page_size() will return
+the huge page size and we will use that to map with the IOMMU.
+
+When we later unpin that same memslot, the IOMMU returns the unmap size
+as the huge page size, and we happily unpin that many pfns in
+monotonically increasing order, not realizing we are spanning
+non-contiguous huge pages and partially unpin the wrong huge page.
+
+Ensure the IOMMU mapping page size is aligned with the hva corresponding
+to the gfn, which does reflect the alignment within the hugetlbfs file.
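+
+As an example with hypothetical numbers: a memslot whose gfn is 2 MiB
+aligned but whose hva is only 4 KiB aligned must not be mapped with a
+2 MiB IOMMU page, so the added loop shrinks the mapping size:
+
+        page_size = 1UL << 21;          /* 2 MiB from kvm_host_page_size() */
+        while (gfn_to_hva_memslot(slot, gfn) & (page_size - 1))
+                page_size >>= 1;        /* drops to 4 KiB in this example */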
+
+Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com>
+Signed-off-by: Greg Edwards <gedwards@ddn.com>
+Signed-off-by: Gleb Natapov <gleb@redhat.com>
+[bwh: Backported to 3.2: s/__gfn_to_hva_memslot/gfn_to_hva_memslot/]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Cc: Qiang Huang <h.huangqiang@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ virt/kvm/iommu.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/virt/kvm/iommu.c
++++ b/virt/kvm/iommu.c
+@@ -101,6 +101,10 @@ int kvm_iommu_map_pages(struct kvm *kvm,
+ while ((gfn << PAGE_SHIFT) & (page_size - 1))
+ page_size >>= 1;
+
++ /* Make sure hva is aligned to the page size we want to map */
++ while (gfn_to_hva_memslot(slot, gfn) & (page_size - 1))
++ page_size >>= 1;
++
+ /*
+ * Pin all pages we are about to map in memory. This is
+ * important because we unmap and unpin in 4kb steps later.
--- /dev/null
+From 1a14c25a0b523225ba47a983d094d1b253f2585a Mon Sep 17 00:00:00 2001
+From: Alexander Graf <agraf@suse.de>
+Date: Thu, 17 Jan 2013 13:50:25 +0100
+Subject: KVM: PPC: Emulate dcbf
+
+From: Alexander Graf <agraf@suse.de>
+
+commit d3286144c92ec876da9e30320afa875699b7e0f1 upstream.
+
+Guests can trigger MMIO exits using dcbf. Since we don't emulate cache
+incoherent MMIO, just do nothing and move on.
+
+Reported-by: Ben Collins <ben.c@servergy.com>
+Signed-off-by: Alexander Graf <agraf@suse.de>
+Tested-by: Ben Collins <ben.c@servergy.com>
+[bwh: Backported to 3.2: adjust context]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Cc: Qiang Huang <h.huangqiang@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/powerpc/kvm/emulate.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/arch/powerpc/kvm/emulate.c
++++ b/arch/powerpc/kvm/emulate.c
+@@ -36,6 +36,7 @@
+ #define OP_TRAP_64 2
+
+ #define OP_31_XOP_LWZX 23
++#define OP_31_XOP_DCBF 86
+ #define OP_31_XOP_LBZX 87
+ #define OP_31_XOP_STWX 151
+ #define OP_31_XOP_STBX 215
+@@ -373,6 +374,7 @@ int kvmppc_emulate_instruction(struct kv
+ kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS);
+ break;
+
++ case OP_31_XOP_DCBF:
+ case OP_31_XOP_DCBI:
+ /* Do nothing. The guest is performing dcbi because
+ * hardware DMA is not snooped by the dcache, but
--- /dev/null
+From 2b29a9fdcb92bfc6b6f4c412d71505869de61a56 Mon Sep 17 00:00:00 2001
+From: Dominik Dingel <dingel@linux.vnet.ibm.com>
+Date: Fri, 26 Jul 2013 15:04:00 +0200
+Subject: KVM: s390: move kvm_guest_enter,exit closer to sie
+
+From: Dominik Dingel <dingel@linux.vnet.ibm.com>
+
+commit 2b29a9fdcb92bfc6b6f4c412d71505869de61a56 upstream.
+
+Any uaccess between guest_enter and guest_exit could trigger a page fault;
+the page fault handler would then handle it as a guest fault and translate
+a user address as a guest address.
+
+Signed-off-by: Dominik Dingel <dingel@linux.vnet.ibm.com>
+Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
+CC: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+[hq: Backported to 3.4: adjust context]
+Signed-off-by: Qiang Huang <h.huangqiang@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/s390/kvm/kvm-s390.c | 12 +++++++-----
+ 1 file changed, 7 insertions(+), 5 deletions(-)
+
+--- a/arch/s390/kvm/kvm-s390.c
++++ b/arch/s390/kvm/kvm-s390.c
+@@ -525,13 +525,18 @@ static int __vcpu_run(struct kvm_vcpu *v
+ if (!kvm_is_ucontrol(vcpu->kvm))
+ kvm_s390_deliver_pending_interrupts(vcpu);
+
++ VCPU_EVENT(vcpu, 6, "entering sie flags %x",
++ atomic_read(&vcpu->arch.sie_block->cpuflags));
++
+ vcpu->arch.sie_block->icptcode = 0;
+ local_irq_disable();
+ kvm_guest_enter();
+ local_irq_enable();
+- VCPU_EVENT(vcpu, 6, "entering sie flags %x",
+- atomic_read(&vcpu->arch.sie_block->cpuflags));
+ rc = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs);
++ local_irq_disable();
++ kvm_guest_exit();
++ local_irq_enable();
++
+ if (rc) {
+ if (kvm_is_ucontrol(vcpu->kvm)) {
+ rc = SIE_INTERCEPT_UCONTROL;
+@@ -543,9 +548,6 @@ static int __vcpu_run(struct kvm_vcpu *v
+ }
+ VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
+ vcpu->arch.sie_block->icptcode);
+- local_irq_disable();
+- kvm_guest_exit();
+- local_irq_enable();
+
+ memcpy(&vcpu->run->s.regs.gprs[14], &vcpu->arch.sie_block->gg14, 16);
+ return rc;
--- /dev/null
+From 1bdb24f5ab52e64658f496a0dbfe04ffb56edaf6 Mon Sep 17 00:00:00 2001
+From: Jiang Liu <jiang.liu@huawei.com>
+Date: Tue, 31 Jul 2012 16:43:30 -0700
+Subject: mm/hotplug: correctly add new zone to all other nodes' zone lists
+
+From: Jiang Liu <jiang.liu@huawei.com>
+
+commit 08dff7b7d629807dbb1f398c68dd9cd58dd657a1 upstream.
+
+When online_pages() is called to add new memory to an empty zone, it
+rebuilds all zone lists by calling build_all_zonelists(). But there's a
+bug which prevents the new zone from being added to other nodes' zone lists.
+
+online_pages() {
+ build_all_zonelists()
+ .....
+ node_set_state(zone_to_nid(zone), N_HIGH_MEMORY)
+}
+
+Here the node of the zone is put into N_HIGH_MEMORY state after calling
+build_all_zonelists(), but build_all_zonelists() only adds zones from
+nodes in N_HIGH_MEMORY state to the fallback zone lists.
+build_all_zonelists()
+    ->__build_all_zonelists()
+        ->build_zonelists()
+            ->find_next_best_node()
+                ->for_each_node_state(n, N_HIGH_MEMORY)
+
+So memory in the new zone will never be used by other nodes, and it may
+cause strange behavior when the system is under memory pressure. So put
+the node into N_HIGH_MEMORY state before calling build_all_zonelists().
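+
+With the fix the order becomes, schematically:
+
+online_pages() {
+    node_set_state(zone_to_nid(zone), N_HIGH_MEMORY)
+    build_all_zonelists()
+    .....
+}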
+
+Signed-off-by: Jianguo Wu <wujianguo@huawei.com>
+Signed-off-by: Jiang Liu <liuj97@gmail.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Michal Hocko <mhocko@suse.cz>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Rusty Russell <rusty@rustcorp.com.au>
+Cc: Yinghai Lu <yinghai@kernel.org>
+Cc: Tony Luck <tony.luck@intel.com>
+Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
+Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Keping Chen <chenkeping@huawei.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+[bwh: Backported to 3.2: adjust context]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Cc: Qiang Huang <h.huangqiang@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memory_hotplug.c | 15 ++++++++-------
+ 1 file changed, 8 insertions(+), 7 deletions(-)
+
+--- a/mm/memory_hotplug.c
++++ b/mm/memory_hotplug.c
+@@ -515,19 +515,20 @@ int __ref online_pages(unsigned long pfn
+
+ zone->present_pages += onlined_pages;
+ zone->zone_pgdat->node_present_pages += onlined_pages;
+- if (need_zonelists_rebuild)
+- build_all_zonelists(zone);
+- else
+- zone_pcp_update(zone);
++ if (onlined_pages) {
++ node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
++ if (need_zonelists_rebuild)
++ build_all_zonelists(zone);
++ else
++ zone_pcp_update(zone);
++ }
+
+ mutex_unlock(&zonelists_mutex);
+
+ init_per_zone_wmark_min();
+
+- if (onlined_pages) {
++ if (onlined_pages)
+ kswapd_run(zone_to_nid(zone));
+- node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
+- }
+
+ vm_total_pages = nr_free_pagecache_pages();
+
--- /dev/null
+From 60cefed485a02bd99b6299dad70666fe49245da7 Mon Sep 17 00:00:00 2001
+From: Johannes Weiner <hannes@cmpxchg.org>
+Date: Thu, 29 Nov 2012 13:54:23 -0800
+Subject: mm: vmscan: fix endless loop in kswapd balancing
+
+From: Johannes Weiner <hannes@cmpxchg.org>
+
+commit 60cefed485a02bd99b6299dad70666fe49245da7 upstream.
+
+Kswapd does not in all places have the same criteria for a balanced
+zone. Zones are only being reclaimed when their high watermark is
+breached, but compaction checks loop over the zonelist again when the
+zone does not meet the low watermark plus two times the size of the
+allocation. This gets kswapd stuck in an endless loop over a small
+zone, like the DMA zone, where the high watermark is smaller than the
+compaction requirement.
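+
+As an illustration (made-up numbers): a 16 MB DMA zone has a high watermark
+of only a few dozen pages, while compaction for an order-9 request wants the
+low watermark plus 2 << 9 = 1024 extra free pages - more than such a zone
+can ever offer, so kswapd keeps looping over it.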
+
+Add a function, zone_balanced(), that checks the watermark, and, for
+higher order allocations, if compaction has enough free memory. Then
+use it uniformly to check for balanced zones.
+
+This makes sure that when the compaction watermark is not met, at least
+reclaim happens and progress is made - or the zone is declared
+unreclaimable at some point and skipped entirely.
+
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+Reported-by: George Spelvin <linux@horizon.com>
+Reported-by: Johannes Hirte <johannes.hirte@fem.tu-ilmenau.de>
+Reported-by: Tomas Racek <tracek@redhat.com>
+Tested-by: Johannes Hirte <johannes.hirte@fem.tu-ilmenau.de>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Cc: Mel Gorman <mel@csn.ul.ie>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+[hq: Backported to 3.4: adjust context]
+Signed-off-by: Qiang Huang <h.huangqiang@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/vmscan.c | 27 ++++++++++++++++++---------
+ 1 file changed, 18 insertions(+), 9 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2569,6 +2569,19 @@ static void age_active_anon(struct zone
+ } while (memcg);
+ }
+
++static bool zone_balanced(struct zone *zone, int order,
++ unsigned long balance_gap, int classzone_idx)
++{
++ if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
++ balance_gap, classzone_idx, 0))
++ return false;
++
++ if (COMPACTION_BUILD && order && !compaction_suitable(zone, order))
++ return false;
++
++ return true;
++}
++
+ /*
+ * pgdat_balanced is used when checking if a node is balanced for high-order
+ * allocations. Only zones that meet watermarks and are in a zone allowed
+@@ -2628,8 +2641,7 @@ static bool sleeping_prematurely(pg_data
+ continue;
+ }
+
+- if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
+- i, 0))
++ if (!zone_balanced(zone, order, 0, i))
+ all_zones_ok = false;
+ else
+ balanced += zone->present_pages;
+@@ -2741,8 +2753,7 @@ loop_again:
+ break;
+ }
+
+- if (!zone_watermark_ok_safe(zone, order,
+- high_wmark_pages(zone), 0, 0)) {
++ if (!zone_balanced(zone, order, 0, 0)) {
+ end_zone = i;
+ break;
+ } else {
+@@ -2817,9 +2828,8 @@ loop_again:
+ testorder = 0;
+
+ if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
+- !zone_watermark_ok_safe(zone, testorder,
+- high_wmark_pages(zone) + balance_gap,
+- end_zone, 0)) {
++ !zone_balanced(zone, testorder,
++ balance_gap, end_zone)) {
+ shrink_zone(priority, zone, &sc);
+
+ reclaim_state->reclaimed_slab = 0;
+@@ -2846,8 +2856,7 @@ loop_again:
+ continue;
+ }
+
+- if (!zone_watermark_ok_safe(zone, testorder,
+- high_wmark_pages(zone), end_zone, 0)) {
++ if (!zone_balanced(zone, testorder, 0, end_zone)) {
+ all_zones_ok = 0;
+ /*
+ * We are still under min water mark. This
--- /dev/null
+From 275ef3878f698941353780440fec6926107a320b Mon Sep 17 00:00:00 2001
+From: Jiri Olsa <jolsa@redhat.com>
+Date: Wed, 5 Sep 2012 19:51:33 +0200
+Subject: perf tools: Fix cache event name generation
+
+From: Jiri Olsa <jolsa@redhat.com>
+
+commit 275ef3878f698941353780440fec6926107a320b upstream.
+
+If the event name is specified with all 3 components, the last one
+overwrites the previous one while the name is composed within the
+parse_events_add_cache function.
+
+Fix this by properly advancing the string index.
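+
+A simplified sketch of the accumulating offset (variable names shortened,
+not the complete function):
+
+        n = snprintf(name, MAX_NAME_LEN, "%s", type);
+        for (i = 0; (i < 2) && op_result[i]; i++)
+                n += snprintf(name + n, MAX_NAME_LEN - n, "-%s", op_result[i]);
+        /* each component is appended after the previous one instead of
+           overwriting it at a stale offset */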
+
+Reported-by: Joel Uckelman <joel@lightboxtechnologies.com>
+Signed-off-by: Jiri Olsa <jolsa@redhat.com>
+Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
+Cc: Frederic Weisbecker <fweisbec@gmail.com>
+Cc: Ingo Molnar <mingo@elte.hu>
+Cc: Joel Uckelman <joel@lightboxtechnologies.com>
+Cc: Paul Mackerras <paulus@samba.org>
+Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
+LPU-Reference: 20120905175133.GA18352@krava.brq.redhat.com
+[ committer note: Remove the newline fix, done already in 42e1fb7 ]
+Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
+Cc: Vinson Lee <vlee@twopensource.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ tools/perf/util/parse-events.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/tools/perf/util/parse-events.c
++++ b/tools/perf/util/parse-events.c
+@@ -413,7 +413,7 @@ int parse_events_add_cache(struct list_h
+ for (i = 0; (i < 2) && (op_result[i]); i++) {
+ char *str = op_result[i];
+
+- snprintf(name + n, MAX_NAME_LEN - n, "-%s", str);
++ n += snprintf(name + n, MAX_NAME_LEN - n, "-%s", str);
+
+ if (cache_op == -1) {
+ cache_op = parse_aliases(str, hw_cache_op,
--- /dev/null
+From 42e1fb776087713b5482cd7cf6cac998fbdd6544 Mon Sep 17 00:00:00 2001
+From: Arnaldo Carvalho de Melo <acme@redhat.com>
+Date: Thu, 6 Sep 2012 14:43:28 -0300
+Subject: perf tools: Remove extraneous newline when parsing hardware cache events
+
+From: Arnaldo Carvalho de Melo <acme@redhat.com>
+
+commit 42e1fb776087713b5482cd7cf6cac998fbdd6544 upstream.
+
+Noticed while developing a 'perf test' entry to verify that
+perf_evsel__name works.
+
+Cc: David Ahern <dsahern@gmail.com>
+Cc: Frederic Weisbecker <fweisbec@gmail.com>
+Cc: Jiri Olsa <jolsa@redhat.com>
+Cc: Mike Galbraith <efault@gmx.de>
+Cc: Namhyung Kim <namhyung@gmail.com>
+Cc: Paul Mackerras <paulus@samba.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Stephane Eranian <eranian@google.com>
+Link: http://lkml.kernel.org/n/tip-xz6zgh38mp3cjnd2udh38z8f@git.kernel.org
+Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
+Cc: Vinson Lee <vlee@twopensource.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ tools/perf/util/parse-events.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/tools/perf/util/parse-events.c
++++ b/tools/perf/util/parse-events.c
+@@ -413,7 +413,7 @@ int parse_events_add_cache(struct list_h
+ for (i = 0; (i < 2) && (op_result[i]); i++) {
+ char *str = op_result[i];
+
+- snprintf(name + n, MAX_NAME_LEN - n, "-%s\n", str);
++ snprintf(name + n, MAX_NAME_LEN - n, "-%s", str);
+
+ if (cache_op == -1) {
+ cache_op = parse_aliases(str, hw_cache_op,
--- /dev/null
+From 7c4bf08d3fa22613b628fc967953f581564a13ad Mon Sep 17 00:00:00 2001
+From: Kees Cook <keescook@chromium.org>
+Date: Mon, 25 Feb 2013 21:32:25 +0000
+Subject: proc connector: reject unprivileged listener bumps
+
+From: Kees Cook <keescook@chromium.org>
+
+commit e70ab977991964a5a7ad1182799451d067e62669 upstream.
+
+While PROC_CN_MCAST_LISTEN/IGNORE is entirely advisory, it was possible
+for an unprivileged user to turn off notifications for all listeners by
+sending PROC_CN_MCAST_IGNORE. Instead, require the same privileges as
+required for a multicast bind.
+
+Signed-off-by: Kees Cook <keescook@chromium.org>
+Cc: Evgeniy Polyakov <zbr@ioremap.net>
+Cc: Matt Helsley <matthltc@us.ibm.com>
+Acked-by: Evgeniy Polyakov <zbr@ioremap.net>
+Acked-by: Matt Helsley <matthltc@us.ibm.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+[bwh: Backported to 3.2: adjust context]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Cc: Qiang Huang <h.huangqiang@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/connector/cn_proc.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/drivers/connector/cn_proc.c
++++ b/drivers/connector/cn_proc.c
+@@ -331,6 +331,12 @@ static void cn_proc_mcast_ctl(struct cn_
+ if (msg->len != sizeof(*mc_op))
+ return;
+
++ /* Can only change if privileged. */
++ if (!capable(CAP_NET_ADMIN)) {
++ err = EPERM;
++ goto out;
++ }
++
+ mc_op = (enum proc_cn_mcast_op*)msg->data;
+ switch (*mc_op) {
+ case PROC_CN_MCAST_LISTEN:
+@@ -343,6 +349,8 @@ static void cn_proc_mcast_ctl(struct cn_
+ err = EINVAL;
+ break;
+ }
++
++out:
+ cn_proc_ack(err, msg->seq, msg->ack);
+ }
+
--- /dev/null
+From 511c73bcf8e747cc95925a311dfeb630989db5a6 Mon Sep 17 00:00:00 2001
+From: Christian Borntraeger <borntraeger@de.ibm.com>
+Date: Tue, 2 Oct 2012 16:25:38 +0200
+Subject: s390/kvm: dont announce RRBM support
+
+From: Christian Borntraeger <borntraeger@de.ibm.com>
+
+commit 87cac8f879a5ecd7109dbe688087e8810b3364eb upstream.
+
+Newer kernels (linux-next with the transparent huge page patches)
+use rrbm if the feature is announced via feature bit 66.
+RRBM will cause intercepts, which KVM does not handle right now,
+causing an illegal instruction in the guest.
+The easy solution is to disable the feature bit for the guest.
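+
+(Facility bits are numbered from the most significant bit of the facility
+list, so facilities[1] covers bits 64-127 and bit 66 corresponds to the
+0x2000000000000000ULL bit; masking with 0x001c... instead of 0x201c...
+below clears exactly that bit.)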
+
+This fixes bugs like:
+Kernel BUG at 0000000000124c2a [verbose debug info unavailable]
+illegal operation: 0001 [#1] SMP
+Modules linked in: virtio_balloon virtio_net ipv6 autofs4
+CPU: 0 Not tainted 3.5.4 #1
+Process fmempig (pid: 659, task: 000000007b712fd0, ksp: 000000007bed3670)
+Krnl PSW : 0704d00180000000 0000000000124c2a (pmdp_clear_flush_young+0x5e/0x80)
+ R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:1 PM:0 EA:3
+ 00000000003cc000 0000000000000004 0000000000000000 0000000079800000
+ 0000000000040000 0000000000000000 000000007bed3918 000000007cf40000
+ 0000000000000001 000003fff7f00000 000003d281a94000 000000007bed383c
+ 000000007bed3918 00000000005ecbf8 00000000002314a6 000000007bed36e0
+ Krnl Code:>0000000000124c2a: b9810025 ogr %r2,%r5
+ 0000000000124c2e: 41343000 la %r3,0(%r4,%r3)
+ 0000000000124c32: a716fffa brct %r1,124c26
+ 0000000000124c36: b9010022 lngr %r2,%r2
+ 0000000000124c3a: e3d0f0800004 lg %r13,128(%r15)
+ 0000000000124c40: eb22003f000c srlg %r2,%r2,63
+[ 2150.713198] Call Trace:
+[ 2150.713223] ([<00000000002312c4>] page_referenced_one+0x6c/0x27c)
+[ 2150.713749] [<0000000000233812>] page_referenced+0x32a/0x410
+[...]
+
+CC: Alex Graf <agraf@suse.de>
+Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
+Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
+Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Cc: Qiang Huang <h.huangqiang@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/s390/kvm/kvm-s390.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/s390/kvm/kvm-s390.c
++++ b/arch/s390/kvm/kvm-s390.c
+@@ -904,7 +904,7 @@ static int __init kvm_s390_init(void)
+ }
+ memcpy(facilities, S390_lowcore.stfle_fac_list, 16);
+ facilities[0] &= 0xff00fff3f47c0000ULL;
+- facilities[1] &= 0x201c000000000000ULL;
++ facilities[1] &= 0x001c000000000000ULL;
+ return 0;
+ }
+
quota-fix-race-between-dqput-and-dquot_scan_active.patch
dma-ste_dma40-don-t-dereference-free-d-descriptor.patch
dm-mpath-fix-stalls-when-handling-invalid-ioctls.patch
+mm-vmscan-fix-endless-loop-in-kswapd-balancing.patch
+cgroup-cgroup_subsys-fork-should-be-called-after-the-task-is-added-to-css_set.patch
+kvm-s390-move-kvm_guest_enter-exit-closer-to-sie.patch
+s390-kvm-dont-announce-rrbm-support.patch
+kvm-ppc-emulate-dcbf.patch
+kvm-iommu-hva-align-mapping-page-size.patch
+proc-connector-reject-unprivileged-listener-bumps.patch
+cgroup-fix-rcu-accesses-to-task-cgroups.patch
+mm-hotplug-correctly-add-new-zone-to-all-other-nodes-zone-lists.patch
+perf-tools-remove-extraneous-newline-when-parsing-hardware-cache-events.patch
+perf-tools-fix-cache-event-name-generation.patch