--- /dev/null
+From 5edee61edeaaebafe584f8fb7074c1ef4658596b Mon Sep 17 00:00:00 2001
+From: Tejun Heo <tj@kernel.org>
+Date: Tue, 16 Oct 2012 15:03:14 -0700
+Subject: cgroup: cgroup_subsys->fork() should be called after the task is added to css_set
+
+From: Tejun Heo <tj@kernel.org>
+
+commit 5edee61edeaaebafe584f8fb7074c1ef4658596b upstream.
+
+cgroup core has a bug which violates a basic rule about event
+notifications - when a new entity needs to be added, you add that to
+the notification list first and then make the new entity conform to
+the current state. If done in the reverse order, an event happening
+in between will be lost.
+
+cgroup_subsys->fork() is invoked way before the new task is added to
+the css_set. Currently, cgroup_freezer is the only user of ->fork()
+and uses it to make new tasks conform to the current state of the
+freezer. If FROZEN state is requested while fork is in progress
+between cgroup_fork_callbacks() and cgroup_post_fork(), the child
+could escape freezing - the cgroup isn't frozen when ->fork() is
+called and the freezer couldn't see the new task on the css_set.
+
+This patch moves cgroup_subsys->fork() invocation to
+cgroup_post_fork() after the new task is added to the css_set.
+cgroup_fork_callbacks() is removed.
+
+Because now a task may be migrated during cgroup_subsys->fork(),
+freezer_fork() is updated so that it adheres to the usual RCU locking
+and the rather pointless comment on why locking can be different there
+is removed (if it doesn't make anything simpler, why even bother?).
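+
+Schematically, the fork path after this change becomes (a simplified
+sketch, not the complete code):
+
+        cgroup_fork(child);       /* child starts on the parent's css_set */
+        /* ... child becomes visible on the tasklist ... */
+        cgroup_post_fork(child);  /* first link child on its css_set,
+                                     then call each ss->fork(child) */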
+
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: Rafael J. Wysocki <rjw@sisk.pl>
+[hq: Backported to 3.4:
+ - Adjust context
+ - Iterate over first CGROUP_BUILTIN_SUBSYS_COUNT elements of subsys]
+Signed-off-by: Qiang Huang <h.huangqiang@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/cgroup.h | 1
+ kernel/cgroup.c | 50 ++++++++++++++++++++----------------------------
+ kernel/cgroup_freezer.c | 13 +++---------
+ kernel/fork.c | 9 --------
+ 4 files changed, 26 insertions(+), 47 deletions(-)
+
+--- a/include/linux/cgroup.h
++++ b/include/linux/cgroup.h
+@@ -32,7 +32,6 @@ extern int cgroup_lock_is_held(void);
+ extern bool cgroup_lock_live_group(struct cgroup *cgrp);
+ extern void cgroup_unlock(void);
+ extern void cgroup_fork(struct task_struct *p);
+-extern void cgroup_fork_callbacks(struct task_struct *p);
+ extern void cgroup_post_fork(struct task_struct *p);
+ extern void cgroup_exit(struct task_struct *p, int run_callbacks);
+ extern int cgroupstats_build(struct cgroupstats *stats,
+--- a/kernel/cgroup.c
++++ b/kernel/cgroup.c
+@@ -4497,41 +4497,19 @@ void cgroup_fork(struct task_struct *chi
+ }
+
+ /**
+- * cgroup_fork_callbacks - run fork callbacks
+- * @child: the new task
+- *
+- * Called on a new task very soon before adding it to the
+- * tasklist. No need to take any locks since no-one can
+- * be operating on this task.
+- */
+-void cgroup_fork_callbacks(struct task_struct *child)
+-{
+- if (need_forkexit_callback) {
+- int i;
+- /*
+- * forkexit callbacks are only supported for builtin
+- * subsystems, and the builtin section of the subsys array is
+- * immutable, so we don't need to lock the subsys array here.
+- */
+- for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
+- struct cgroup_subsys *ss = subsys[i];
+- if (ss->fork)
+- ss->fork(child);
+- }
+- }
+-}
+-
+-/**
+ * cgroup_post_fork - called on a new task after adding it to the task list
+ * @child: the task in question
+ *
+- * Adds the task to the list running through its css_set if necessary.
+- * Has to be after the task is visible on the task list in case we race
+- * with the first call to cgroup_iter_start() - to guarantee that the
+- * new task ends up on its list.
++ * Adds the task to the list running through its css_set if necessary and
++ * call the subsystem fork() callbacks. Has to be after the task is
++ * visible on the task list in case we race with the first call to
++ * cgroup_iter_start() - to guarantee that the new task ends up on its
++ * list.
+ */
+ void cgroup_post_fork(struct task_struct *child)
+ {
++ int i;
++
+ /*
+ * use_task_css_set_links is set to 1 before we walk the tasklist
+ * under the tasklist_lock and we read it here after we added the child
+@@ -4551,7 +4529,21 @@ void cgroup_post_fork(struct task_struct
+ task_unlock(child);
+ write_unlock(&css_set_lock);
+ }
++
++ /*
++ * Call ss->fork(). This must happen after @child is linked on
++ * css_set; otherwise, @child might change state between ->fork()
++ * and addition to css_set.
++ */
++ if (need_forkexit_callback) {
++ for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
++ struct cgroup_subsys *ss = subsys[i];
++ if (ss->fork)
++ ss->fork(child);
++ }
++ }
+ }
++
+ /**
+ * cgroup_exit - detach cgroup from exiting task
+ * @tsk: pointer to task_struct of exiting process
+--- a/kernel/cgroup_freezer.c
++++ b/kernel/cgroup_freezer.c
+@@ -186,23 +186,15 @@ static void freezer_fork(struct task_str
+ {
+ struct freezer *freezer;
+
+- /*
+- * No lock is needed, since the task isn't on tasklist yet,
+- * so it can't be moved to another cgroup, which means the
+- * freezer won't be removed and will be valid during this
+- * function call. Nevertheless, apply RCU read-side critical
+- * section to suppress RCU lockdep false positives.
+- */
+ rcu_read_lock();
+ freezer = task_freezer(task);
+- rcu_read_unlock();
+
+ /*
+ * The root cgroup is non-freezable, so we can skip the
+ * following check.
+ */
+ if (!freezer->css.cgroup->parent)
+- return;
++ goto out;
+
+ spin_lock_irq(&freezer->lock);
+ BUG_ON(freezer->state == CGROUP_FROZEN);
+@@ -210,7 +202,10 @@ static void freezer_fork(struct task_str
+ /* Locking avoids race with FREEZING -> THAWED transitions. */
+ if (freezer->state == CGROUP_FREEZING)
+ freeze_task(task);
++
+ spin_unlock_irq(&freezer->lock);
++out:
++ rcu_read_unlock();
+ }
+
+ /*
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -1124,7 +1124,6 @@ static struct task_struct *copy_process(
+ {
+ int retval;
+ struct task_struct *p;
+- int cgroup_callbacks_done = 0;
+
+ if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
+ return ERR_PTR(-EINVAL);
+@@ -1383,12 +1382,6 @@ static struct task_struct *copy_process(
+ p->group_leader = p;
+ INIT_LIST_HEAD(&p->thread_group);
+
+- /* Now that the task is set up, run cgroup callbacks if
+- * necessary. We need to run them before the task is visible
+- * on the tasklist. */
+- cgroup_fork_callbacks(p);
+- cgroup_callbacks_done = 1;
+-
+ /* Need tasklist lock for parent etc handling! */
+ write_lock_irq(&tasklist_lock);
+
+@@ -1493,7 +1486,7 @@ bad_fork_cleanup_cgroup:
+ #endif
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_change_end(current);
+- cgroup_exit(p, cgroup_callbacks_done);
++ cgroup_exit(p, 0);
+ delayacct_tsk_free(p);
+ module_put(task_thread_info(p)->exec_domain->module);
+ bad_fork_cleanup_count:
--- /dev/null
+From 2235df8b82f6333f2f1a4c6e0acf80f19f591b55 Mon Sep 17 00:00:00 2001
+From: Tejun Heo <tj@kernel.org>
+Date: Tue, 25 Jun 2013 11:48:32 -0700
+Subject: cgroup: fix RCU accesses to task->cgroups
+
+From: Tejun Heo <tj@kernel.org>
+
+commit 14611e51a57df10240817d8ada510842faf0ec51 upstream.
+
+task->cgroups is a RCU pointer pointing to struct css_set. A task
+switches to a different css_set on cgroup migration but a css_set
+doesn't change once created and its pointers to cgroup_subsys_states
+aren't RCU protected.
+
+task_subsys_state[_check]() is the macro to acquire css given a task
+and subsys_id pair. It RCU-dereferences task->cgroups->subsys[] not
+task->cgroups, so the RCU pointer task->cgroups ends up being
+dereferenced without read_barrier_depends() after it. It's broken.
+
+Fix it by introducing task_css_set[_check]() which does
+RCU-dereference on task->cgroups. task_subsys_state[_check]() is
+reimplemented to directly dereference ->subsys[] of the css_set
+returned from task_css_set[_check]().
+
+This removes some of the sparse RCU warnings in cgroup.
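+
+For illustration, a typical use of the new helper under RCU (a simplified
+sketch, not code from this patch) would be:
+
+        struct css_set *cset;
+
+        rcu_read_lock();
+        cset = task_css_set(task);
+        /* cset and cset->subsys[] may be inspected inside this section */
+        rcu_read_unlock();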
+
+v2: Fixed unbalanced parenthesis and there's no need to use
+ rcu_dereference_raw() when !CONFIG_PROVE_RCU. Both spotted by Li.
+
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Reported-by: Fengguang Wu <fengguang.wu@intel.com>
+Acked-by: Li Zefan <lizefan@huawei.com>
+[bwh: Backported to 3.2:
+ - Adjust context
+ - Remove CONFIG_PROVE_RCU condition
+ - s/lockdep_is_held(&cgroup_mutex)/cgroup_lock_is_held()/]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Cc: Qiang Huang <h.huangqiang@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/cgroup.h | 52 ++++++++++++++++++++++++++++++++++++++++++-------
+ 1 file changed, 45 insertions(+), 7 deletions(-)
+
+--- a/include/linux/cgroup.h
++++ b/include/linux/cgroup.h
+@@ -513,16 +513,54 @@ static inline struct cgroup_subsys_state
+ return cgrp->subsys[subsys_id];
+ }
+
+-/*
+- * function to get the cgroup_subsys_state which allows for extra
+- * rcu_dereference_check() conditions, such as locks used during the
+- * cgroup_subsys::attach() methods.
++/**
++ * task_css_set_check - obtain a task's css_set with extra access conditions
++ * @task: the task to obtain css_set for
++ * @__c: extra condition expression to be passed to rcu_dereference_check()
++ *
++ * A task's css_set is RCU protected, initialized and exited while holding
++ * task_lock(), and can only be modified while holding both cgroup_mutex
++ * and task_lock() while the task is alive. This macro verifies that the
++ * caller is inside proper critical section and returns @task's css_set.
++ *
++ * The caller can also specify additional allowed conditions via @__c, such
++ * as locks used during the cgroup_subsys::attach() methods.
++ */
++#define task_css_set_check(task, __c) \
++ rcu_dereference_check((task)->cgroups, \
++ lockdep_is_held(&(task)->alloc_lock) || \
++ cgroup_lock_is_held() || (__c))
++
++/**
++ * task_subsys_state_check - obtain css for (task, subsys) w/ extra access conds
++ * @task: the target task
++ * @subsys_id: the target subsystem ID
++ * @__c: extra condition expression to be passed to rcu_dereference_check()
++ *
++ * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The
++ * synchronization rules are the same as task_css_set_check().
+ */
+ #define task_subsys_state_check(task, subsys_id, __c) \
+- rcu_dereference_check(task->cgroups->subsys[subsys_id], \
+- lockdep_is_held(&task->alloc_lock) || \
+- cgroup_lock_is_held() || (__c))
++ task_css_set_check((task), (__c))->subsys[(subsys_id)]
+
++/**
++ * task_css_set - obtain a task's css_set
++ * @task: the task to obtain css_set for
++ *
++ * See task_css_set_check().
++ */
++static inline struct css_set *task_css_set(struct task_struct *task)
++{
++ return task_css_set_check(task, false);
++}
++
++/**
++ * task_subsys_state - obtain css for (task, subsys)
++ * @task: the target task
++ * @subsys_id: the target subsystem ID
++ *
++ * See task_subsys_state_check().
++ */
+ static inline struct cgroup_subsys_state *
+ task_subsys_state(struct task_struct *task, int subsys_id)
+ {
--- /dev/null
+From da4eda14795e2e246fc340ba181a95b09ac5205c Mon Sep 17 00:00:00 2001
+From: Greg Edwards <gedwards@ddn.com>
+Date: Mon, 4 Nov 2013 09:08:12 -0700
+Subject: KVM: IOMMU: hva align mapping page size
+
+From: Greg Edwards <gedwards@ddn.com>
+
+commit 27ef63c7e97d1e5dddd85051c03f8d44cc887f34 upstream.
+
+When determining the page size we could use to map with the IOMMU, the
+page size should also be aligned with the hva, not just the gfn. The
+gfn may not reflect the real alignment within the hugetlbfs file.
+
+Most of the time, this works fine. However, if the hugetlbfs file is
+backed by non-contiguous huge pages, a multi-huge page memslot starts at
+an unaligned offset within the hugetlbfs file, and the gfn is aligned
+with respect to the huge page size, then kvm_host_page_size() will return
+the huge page size and we will use that to map with the IOMMU.
+
+When we later unpin that same memslot, the IOMMU returns the unmap size
+as the huge page size, and we happily unpin that many pfns in
+monotonically increasing order, not realizing we are spanning
+non-contiguous huge pages and partially unpin the wrong huge page.
+
+Ensure the IOMMU mapping page size is aligned with the hva corresponding
+to the gfn, which does reflect the alignment within the hugetlbfs file.
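+
+As an example with hypothetical numbers: a memslot whose gfn is 2 MiB
+aligned but whose hva is only 4 KiB aligned must not be mapped with a
+2 MiB IOMMU page, so the added loop shrinks the mapping size:
+
+        page_size = 1UL << 21;          /* 2 MiB from kvm_host_page_size() */
+        while (gfn_to_hva_memslot(slot, gfn) & (page_size - 1))
+                page_size >>= 1;        /* drops to 4 KiB in this example */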
+
+Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com>
+Signed-off-by: Greg Edwards <gedwards@ddn.com>
+Signed-off-by: Gleb Natapov <gleb@redhat.com>
+[bwh: Backported to 3.2: s/__gfn_to_hva_memslot/gfn_to_hva_memslot/]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Cc: Qiang Huang <h.huangqiang@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ virt/kvm/iommu.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/virt/kvm/iommu.c
++++ b/virt/kvm/iommu.c
+@@ -101,6 +101,10 @@ int kvm_iommu_map_pages(struct kvm *kvm,
+ while ((gfn << PAGE_SHIFT) & (page_size - 1))
+ page_size >>= 1;
+
++ /* Make sure hva is aligned to the page size we want to map */
++ while (gfn_to_hva_memslot(slot, gfn) & (page_size - 1))
++ page_size >>= 1;
++
+ /*
+ * Pin all pages we are about to map in memory. This is
+ * important because we unmap and unpin in 4kb steps later.
--- /dev/null
+From 1a14c25a0b523225ba47a983d094d1b253f2585a Mon Sep 17 00:00:00 2001
+From: Alexander Graf <agraf@suse.de>
+Date: Thu, 17 Jan 2013 13:50:25 +0100
+Subject: KVM: PPC: Emulate dcbf
+
+From: Alexander Graf <agraf@suse.de>
+
+commit d3286144c92ec876da9e30320afa875699b7e0f1 upstream.
+
+Guests can trigger MMIO exits using dcbf. Since we don't emulate cache
+incoherent MMIO, just do nothing and move on.
+
+Reported-by: Ben Collins <ben.c@servergy.com>
+Signed-off-by: Alexander Graf <agraf@suse.de>
+Tested-by: Ben Collins <ben.c@servergy.com>
+[bwh: Backported to 3.2: adjust context]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Cc: Qiang Huang <h.huangqiang@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/powerpc/kvm/emulate.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/arch/powerpc/kvm/emulate.c
++++ b/arch/powerpc/kvm/emulate.c
+@@ -36,6 +36,7 @@
+ #define OP_TRAP_64 2
+
+ #define OP_31_XOP_LWZX 23
++#define OP_31_XOP_DCBF 86
+ #define OP_31_XOP_LBZX 87
+ #define OP_31_XOP_STWX 151
+ #define OP_31_XOP_STBX 215
+@@ -373,6 +374,7 @@ int kvmppc_emulate_instruction(struct kv
+ kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS);
+ break;
+
++ case OP_31_XOP_DCBF:
+ case OP_31_XOP_DCBI:
+ /* Do nothing. The guest is performing dcbi because
+ * hardware DMA is not snooped by the dcache, but
--- /dev/null
+From 2b29a9fdcb92bfc6b6f4c412d71505869de61a56 Mon Sep 17 00:00:00 2001
+From: Dominik Dingel <dingel@linux.vnet.ibm.com>
+Date: Fri, 26 Jul 2013 15:04:00 +0200
+Subject: KVM: s390: move kvm_guest_enter,exit closer to sie
+
+From: Dominik Dingel <dingel@linux.vnet.ibm.com>
+
+commit 2b29a9fdcb92bfc6b6f4c412d71505869de61a56 upstream.
+
+Any uaccess between guest_enter and guest_exit could trigger a page fault;
+the page fault handler would then handle it as a guest fault and translate
+a user address as a guest address.
+
+Signed-off-by: Dominik Dingel <dingel@linux.vnet.ibm.com>
+Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
+CC: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+[hq: Backported to 3.4: adjust context]
+Signed-off-by: Qiang Huang <h.huangqiang@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/s390/kvm/kvm-s390.c | 12 +++++++-----
+ 1 file changed, 7 insertions(+), 5 deletions(-)
+
+--- a/arch/s390/kvm/kvm-s390.c
++++ b/arch/s390/kvm/kvm-s390.c
+@@ -525,13 +525,18 @@ static int __vcpu_run(struct kvm_vcpu *v
+ if (!kvm_is_ucontrol(vcpu->kvm))
+ kvm_s390_deliver_pending_interrupts(vcpu);
+
++ VCPU_EVENT(vcpu, 6, "entering sie flags %x",
++ atomic_read(&vcpu->arch.sie_block->cpuflags));
++
+ vcpu->arch.sie_block->icptcode = 0;
+ local_irq_disable();
+ kvm_guest_enter();
+ local_irq_enable();
+- VCPU_EVENT(vcpu, 6, "entering sie flags %x",
+- atomic_read(&vcpu->arch.sie_block->cpuflags));
+ rc = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs);
++ local_irq_disable();
++ kvm_guest_exit();
++ local_irq_enable();
++
+ if (rc) {
+ if (kvm_is_ucontrol(vcpu->kvm)) {
+ rc = SIE_INTERCEPT_UCONTROL;
+@@ -543,9 +548,6 @@ static int __vcpu_run(struct kvm_vcpu *v
+ }
+ VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
+ vcpu->arch.sie_block->icptcode);
+- local_irq_disable();
+- kvm_guest_exit();
+- local_irq_enable();
+
+ memcpy(&vcpu->run->s.regs.gprs[14], &vcpu->arch.sie_block->gg14, 16);
+ return rc;
--- /dev/null
+From 1bdb24f5ab52e64658f496a0dbfe04ffb56edaf6 Mon Sep 17 00:00:00 2001
+From: Jiang Liu <jiang.liu@huawei.com>
+Date: Tue, 31 Jul 2012 16:43:30 -0700
+Subject: mm/hotplug: correctly add new zone to all other nodes' zone lists
+
+From: Jiang Liu <jiang.liu@huawei.com>
+
+commit 08dff7b7d629807dbb1f398c68dd9cd58dd657a1 upstream.
+
+When online_pages() is called to add new memory to an empty zone, it
+rebuilds all zone lists by calling build_all_zonelists(). But there's a
+bug which prevents the new zone from being added to other nodes' zone lists.
+
+online_pages() {
+ build_all_zonelists()
+ .....
+ node_set_state(zone_to_nid(zone), N_HIGH_MEMORY)
+}
+
+Here the node of the zone is put into N_HIGH_MEMORY state after calling
+build_all_zonelists(), but build_all_zonelists() only adds zones from
+nodes in N_HIGH_MEMORY state to the fallback zone lists.
+build_all_zonelists()
+    ->__build_all_zonelists()
+        ->build_zonelists()
+            ->find_next_best_node()
+                ->for_each_node_state(n, N_HIGH_MEMORY)
+
+So memory in the new zone will never be used by other nodes, and it may
+cause strange behavior when the system is under memory pressure. So put
+the node into N_HIGH_MEMORY state before calling build_all_zonelists().
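+
+With the fix the order becomes, schematically:
+
+online_pages() {
+    node_set_state(zone_to_nid(zone), N_HIGH_MEMORY)
+    build_all_zonelists()
+    .....
+}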
+
+Signed-off-by: Jianguo Wu <wujianguo@huawei.com>
+Signed-off-by: Jiang Liu <liuj97@gmail.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Michal Hocko <mhocko@suse.cz>
+Cc: Minchan Kim <minchan@kernel.org>
+Cc: Rusty Russell <rusty@rustcorp.com.au>
+Cc: Yinghai Lu <yinghai@kernel.org>
+Cc: Tony Luck <tony.luck@intel.com>
+Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
+Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Keping Chen <chenkeping@huawei.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+[bwh: Backported to 3.2: adjust context]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Cc: Qiang Huang <h.huangqiang@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memory_hotplug.c | 15 ++++++++-------
+ 1 file changed, 8 insertions(+), 7 deletions(-)
+
+--- a/mm/memory_hotplug.c
++++ b/mm/memory_hotplug.c
+@@ -515,19 +515,20 @@ int __ref online_pages(unsigned long pfn
+
+ zone->present_pages += onlined_pages;
+ zone->zone_pgdat->node_present_pages += onlined_pages;
+- if (need_zonelists_rebuild)
+- build_all_zonelists(zone);
+- else
+- zone_pcp_update(zone);
++ if (onlined_pages) {
++ node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
++ if (need_zonelists_rebuild)
++ build_all_zonelists(zone);
++ else
++ zone_pcp_update(zone);
++ }
+
+ mutex_unlock(&zonelists_mutex);
+
+ init_per_zone_wmark_min();
+
+- if (onlined_pages) {
++ if (onlined_pages)
+ kswapd_run(zone_to_nid(zone));
+- node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
+- }
+
+ vm_total_pages = nr_free_pagecache_pages();
+
--- /dev/null
+From 60cefed485a02bd99b6299dad70666fe49245da7 Mon Sep 17 00:00:00 2001
+From: Johannes Weiner <hannes@cmpxchg.org>
+Date: Thu, 29 Nov 2012 13:54:23 -0800
+Subject: mm: vmscan: fix endless loop in kswapd balancing
+
+From: Johannes Weiner <hannes@cmpxchg.org>
+
+commit 60cefed485a02bd99b6299dad70666fe49245da7 upstream.
+
+Kswapd does not in all places have the same criteria for a balanced
+zone. Zones are only being reclaimed when their high watermark is
+breached, but compaction checks loop over the zonelist again when the
+zone does not meet the low watermark plus two times the size of the
+allocation. This gets kswapd stuck in an endless loop over a small
+zone, like the DMA zone, where the high watermark is smaller than the
+compaction requirement.
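+
+As an illustration (made-up numbers): a 16 MB DMA zone has a high watermark
+of only a few dozen pages, while compaction for an order-9 request wants the
+low watermark plus 2 << 9 = 1024 extra free pages - more than such a zone
+can ever offer, so kswapd keeps looping over it.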
+
+Add a function, zone_balanced(), that checks the watermark, and, for
+higher order allocations, if compaction has enough free memory. Then
+use it uniformly to check for balanced zones.
+
+This makes sure that when the compaction watermark is not met, at least
+reclaim happens and progress is made - or the zone is declared
+unreclaimable at some point and skipped entirely.
+
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+Reported-by: George Spelvin <linux@horizon.com>
+Reported-by: Johannes Hirte <johannes.hirte@fem.tu-ilmenau.de>
+Reported-by: Tomas Racek <tracek@redhat.com>
+Tested-by: Johannes Hirte <johannes.hirte@fem.tu-ilmenau.de>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Cc: Mel Gorman <mel@csn.ul.ie>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+[hq: Backported to 3.4: adjust context]
+Signed-off-by: Qiang Huang <h.huangqiang@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/vmscan.c | 27 ++++++++++++++++++---------
+ 1 file changed, 18 insertions(+), 9 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2569,6 +2569,19 @@ static void age_active_anon(struct zone
+ } while (memcg);
+ }
+
++static bool zone_balanced(struct zone *zone, int order,
++ unsigned long balance_gap, int classzone_idx)
++{
++ if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
++ balance_gap, classzone_idx, 0))
++ return false;
++
++ if (COMPACTION_BUILD && order && !compaction_suitable(zone, order))
++ return false;
++
++ return true;
++}
++
+ /*
+ * pgdat_balanced is used when checking if a node is balanced for high-order
+ * allocations. Only zones that meet watermarks and are in a zone allowed
+@@ -2628,8 +2641,7 @@ static bool sleeping_prematurely(pg_data
+ continue;
+ }
+
+- if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
+- i, 0))
++ if (!zone_balanced(zone, order, 0, i))
+ all_zones_ok = false;
+ else
+ balanced += zone->present_pages;
+@@ -2741,8 +2753,7 @@ loop_again:
+ break;
+ }
+
+- if (!zone_watermark_ok_safe(zone, order,
+- high_wmark_pages(zone), 0, 0)) {
++ if (!zone_balanced(zone, order, 0, 0)) {
+ end_zone = i;
+ break;
+ } else {
+@@ -2817,9 +2828,8 @@ loop_again:
+ testorder = 0;
+
+ if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
+- !zone_watermark_ok_safe(zone, testorder,
+- high_wmark_pages(zone) + balance_gap,
+- end_zone, 0)) {
++ !zone_balanced(zone, testorder,
++ balance_gap, end_zone)) {
+ shrink_zone(priority, zone, &sc);
+
+ reclaim_state->reclaimed_slab = 0;
+@@ -2846,8 +2856,7 @@ loop_again:
+ continue;
+ }
+
+- if (!zone_watermark_ok_safe(zone, testorder,
+- high_wmark_pages(zone), end_zone, 0)) {
++ if (!zone_balanced(zone, testorder, 0, end_zone)) {
+ all_zones_ok = 0;
+ /*
+ * We are still under min water mark. This
--- /dev/null
+From 275ef3878f698941353780440fec6926107a320b Mon Sep 17 00:00:00 2001
+From: Jiri Olsa <jolsa@redhat.com>
+Date: Wed, 5 Sep 2012 19:51:33 +0200
+Subject: perf tools: Fix cache event name generation
+
+From: Jiri Olsa <jolsa@redhat.com>
+
+commit 275ef3878f698941353780440fec6926107a320b upstream.
+
+If the event name is specified with all 3 components, the last one
+overwrites the previous one while the name is composed within the
+parse_events_add_cache function.
+
+Fix this by properly advancing the string index.
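+
+A simplified sketch of the accumulating offset (variable names shortened,
+not the complete function):
+
+        n = snprintf(name, MAX_NAME_LEN, "%s", type);
+        for (i = 0; (i < 2) && op_result[i]; i++)
+                n += snprintf(name + n, MAX_NAME_LEN - n, "-%s", op_result[i]);
+        /* each component is appended after the previous one instead of
+           overwriting it at a stale offset */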
+
+Reported-by: Joel Uckelman <joel@lightboxtechnologies.com>
+Signed-off-by: Jiri Olsa <jolsa@redhat.com>
+Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
+Cc: Frederic Weisbecker <fweisbec@gmail.com>
+Cc: Ingo Molnar <mingo@elte.hu>
+Cc: Joel Uckelman <joel@lightboxtechnologies.com>
+Cc: Paul Mackerras <paulus@samba.org>
+Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
+LPU-Reference: 20120905175133.GA18352@krava.brq.redhat.com
+[ committer note: Remove the newline fix, done already in 42e1fb7 ]
+Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
+Cc: Vinson Lee <vlee@twopensource.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ tools/perf/util/parse-events.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/tools/perf/util/parse-events.c
++++ b/tools/perf/util/parse-events.c
+@@ -413,7 +413,7 @@ int parse_events_add_cache(struct list_h
+ for (i = 0; (i < 2) && (op_result[i]); i++) {
+ char *str = op_result[i];
+
+- snprintf(name + n, MAX_NAME_LEN - n, "-%s", str);
++ n += snprintf(name + n, MAX_NAME_LEN - n, "-%s", str);
+
+ if (cache_op == -1) {
+ cache_op = parse_aliases(str, hw_cache_op,
--- /dev/null
+From 42e1fb776087713b5482cd7cf6cac998fbdd6544 Mon Sep 17 00:00:00 2001
+From: Arnaldo Carvalho de Melo <acme@redhat.com>
+Date: Thu, 6 Sep 2012 14:43:28 -0300
+Subject: perf tools: Remove extraneous newline when parsing hardware cache events
+
+From: Arnaldo Carvalho de Melo <acme@redhat.com>
+
+commit 42e1fb776087713b5482cd7cf6cac998fbdd6544 upstream.
+
+Noticed while developing a 'perf test' entry to verify that
+perf_evsel__name works.
+
+Cc: David Ahern <dsahern@gmail.com>
+Cc: Frederic Weisbecker <fweisbec@gmail.com>
+Cc: Jiri Olsa <jolsa@redhat.com>
+Cc: Mike Galbraith <efault@gmx.de>
+Cc: Namhyung Kim <namhyung@gmail.com>
+Cc: Paul Mackerras <paulus@samba.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Stephane Eranian <eranian@google.com>
+Link: http://lkml.kernel.org/n/tip-xz6zgh38mp3cjnd2udh38z8f@git.kernel.org
+Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
+Cc: Vinson Lee <vlee@twopensource.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ tools/perf/util/parse-events.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/tools/perf/util/parse-events.c
++++ b/tools/perf/util/parse-events.c
+@@ -413,7 +413,7 @@ int parse_events_add_cache(struct list_h
+ for (i = 0; (i < 2) && (op_result[i]); i++) {
+ char *str = op_result[i];
+
+- snprintf(name + n, MAX_NAME_LEN - n, "-%s\n", str);
++ snprintf(name + n, MAX_NAME_LEN - n, "-%s", str);
+
+ if (cache_op == -1) {
+ cache_op = parse_aliases(str, hw_cache_op,
--- /dev/null
+From 7c4bf08d3fa22613b628fc967953f581564a13ad Mon Sep 17 00:00:00 2001
+From: Kees Cook <keescook@chromium.org>
+Date: Mon, 25 Feb 2013 21:32:25 +0000
+Subject: proc connector: reject unprivileged listener bumps
+
+From: Kees Cook <keescook@chromium.org>
+
+commit e70ab977991964a5a7ad1182799451d067e62669 upstream.
+
+While PROC_CN_MCAST_LISTEN/IGNORE is entirely advisory, it was possible
+for an unprivileged user to turn off notifications for all listeners by
+sending PROC_CN_MCAST_IGNORE. Instead, require the same privileges as
+required for a multicast bind.
+
+Signed-off-by: Kees Cook <keescook@chromium.org>
+Cc: Evgeniy Polyakov <zbr@ioremap.net>
+Cc: Matt Helsley <matthltc@us.ibm.com>
+Acked-by: Evgeniy Polyakov <zbr@ioremap.net>
+Acked-by: Matt Helsley <matthltc@us.ibm.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+[bwh: Backported to 3.2: adjust context]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Cc: Qiang Huang <h.huangqiang@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/connector/cn_proc.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/drivers/connector/cn_proc.c
++++ b/drivers/connector/cn_proc.c
+@@ -331,6 +331,12 @@ static void cn_proc_mcast_ctl(struct cn_
+ if (msg->len != sizeof(*mc_op))
+ return;
+
++ /* Can only change if privileged. */
++ if (!capable(CAP_NET_ADMIN)) {
++ err = EPERM;
++ goto out;
++ }
++
+ mc_op = (enum proc_cn_mcast_op*)msg->data;
+ switch (*mc_op) {
+ case PROC_CN_MCAST_LISTEN:
+@@ -343,6 +349,8 @@ static void cn_proc_mcast_ctl(struct cn_
+ err = EINVAL;
+ break;
+ }
++
++out:
+ cn_proc_ack(err, msg->seq, msg->ack);
+ }
+
--- /dev/null
+From 511c73bcf8e747cc95925a311dfeb630989db5a6 Mon Sep 17 00:00:00 2001
+From: Christian Borntraeger <borntraeger@de.ibm.com>
+Date: Tue, 2 Oct 2012 16:25:38 +0200
+Subject: s390/kvm: dont announce RRBM support
+
+From: Christian Borntraeger <borntraeger@de.ibm.com>
+
+commit 87cac8f879a5ecd7109dbe688087e8810b3364eb upstream.
+
+Newer kernels (linux-next with the transparent huge page patches)
+use rrbm if the feature is announced via feature bit 66.
+RRBM will cause intercepts, which KVM does not handle right now,
+causing an illegal instruction in the guest.
+The easy solution is to disable the feature bit for the guest.
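+
+(Facility bits are numbered from the most significant bit of the facility
+list, so facilities[1] covers bits 64-127 and bit 66 corresponds to the
+0x2000000000000000ULL bit; masking with 0x001c... instead of 0x201c...
+below clears exactly that bit.)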
+
+This fixes bugs like:
+Kernel BUG at 0000000000124c2a [verbose debug info unavailable]
+illegal operation: 0001 [#1] SMP
+Modules linked in: virtio_balloon virtio_net ipv6 autofs4
+CPU: 0 Not tainted 3.5.4 #1
+Process fmempig (pid: 659, task: 000000007b712fd0, ksp: 000000007bed3670)
+Krnl PSW : 0704d00180000000 0000000000124c2a (pmdp_clear_flush_young+0x5e/0x80)
+ R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:1 PM:0 EA:3
+ 00000000003cc000 0000000000000004 0000000000000000 0000000079800000
+ 0000000000040000 0000000000000000 000000007bed3918 000000007cf40000
+ 0000000000000001 000003fff7f00000 000003d281a94000 000000007bed383c
+ 000000007bed3918 00000000005ecbf8 00000000002314a6 000000007bed36e0
+ Krnl Code:>0000000000124c2a: b9810025 ogr %r2,%r5
+ 0000000000124c2e: 41343000 la %r3,0(%r4,%r3)
+ 0000000000124c32: a716fffa brct %r1,124c26
+ 0000000000124c36: b9010022 lngr %r2,%r2
+ 0000000000124c3a: e3d0f0800004 lg %r13,128(%r15)
+ 0000000000124c40: eb22003f000c srlg %r2,%r2,63
+[ 2150.713198] Call Trace:
+[ 2150.713223] ([<00000000002312c4>] page_referenced_one+0x6c/0x27c)
+[ 2150.713749] [<0000000000233812>] page_referenced+0x32a/0x410
+[...]
+
+CC: Alex Graf <agraf@suse.de>
+Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
+Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
+Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Cc: Qiang Huang <h.huangqiang@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/s390/kvm/kvm-s390.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/s390/kvm/kvm-s390.c
++++ b/arch/s390/kvm/kvm-s390.c
+@@ -904,7 +904,7 @@ static int __init kvm_s390_init(void)
+ }
+ memcpy(facilities, S390_lowcore.stfle_fac_list, 16);
+ facilities[0] &= 0xff00fff3f47c0000ULL;
+- facilities[1] &= 0x201c000000000000ULL;
++ facilities[1] &= 0x001c000000000000ULL;
+ return 0;
+ }
+
quota-fix-race-between-dqput-and-dquot_scan_active.patch
dma-ste_dma40-don-t-dereference-free-d-descriptor.patch
dm-mpath-fix-stalls-when-handling-invalid-ioctls.patch
+mm-vmscan-fix-endless-loop-in-kswapd-balancing.patch
+cgroup-cgroup_subsys-fork-should-be-called-after-the-task-is-added-to-css_set.patch
+kvm-s390-move-kvm_guest_enter-exit-closer-to-sie.patch
+s390-kvm-dont-announce-rrbm-support.patch
+kvm-ppc-emulate-dcbf.patch
+kvm-iommu-hva-align-mapping-page-size.patch
+proc-connector-reject-unprivileged-listener-bumps.patch
+cgroup-fix-rcu-accesses-to-task-cgroups.patch
+mm-hotplug-correctly-add-new-zone-to-all-other-nodes-zone-lists.patch
+perf-tools-remove-extraneous-newline-when-parsing-hardware-cache-events.patch
+perf-tools-fix-cache-event-name-generation.patch