From: Greg Kroah-Hartman
Date: Tue, 4 Mar 2014 19:47:35 +0000 (-0800)
Subject: 3.4-stable patches
X-Git-Tag: v3.10.33~7
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=2306115e47ed0924cc5bcdb05a16aba916e2523a;p=thirdparty%2Fkernel%2Fstable-queue.git

3.4-stable patches

added patches:
cgroup-cgroup_subsys-fork-should-be-called-after-the-task-is-added-to-css_set.patch
cgroup-fix-rcu-accesses-to-task-cgroups.patch
kvm-iommu-hva-align-mapping-page-size.patch
kvm-ppc-emulate-dcbf.patch
kvm-s390-move-kvm_guest_enter-exit-closer-to-sie.patch
mm-hotplug-correctly-add-new-zone-to-all-other-nodes-zone-lists.patch
mm-vmscan-fix-endless-loop-in-kswapd-balancing.patch
perf-tools-fix-cache-event-name-generation.patch
perf-tools-remove-extraneous-newline-when-parsing-hardware-cache-events.patch
proc-connector-reject-unprivileged-listener-bumps.patch
s390-kvm-dont-announce-rrbm-support.patch
---
diff --git a/queue-3.4/cgroup-cgroup_subsys-fork-should-be-called-after-the-task-is-added-to-css_set.patch b/queue-3.4/cgroup-cgroup_subsys-fork-should-be-called-after-the-task-is-added-to-css_set.patch
new file mode 100644
index 00000000000..965a9d1a6fe
--- /dev/null
+++ b/queue-3.4/cgroup-cgroup_subsys-fork-should-be-called-after-the-task-is-added-to-css_set.patch
@@ -0,0 +1,201 @@
+From 5edee61edeaaebafe584f8fb7074c1ef4658596b Mon Sep 17 00:00:00 2001
+From: Tejun Heo
+Date: Tue, 16 Oct 2012 15:03:14 -0700
+Subject: cgroup: cgroup_subsys->fork() should be called after the task is added to css_set
+
+From: Tejun Heo
+
+commit 5edee61edeaaebafe584f8fb7074c1ef4658596b upstream.
+
+cgroup core has a bug which violates a basic rule about event
+notifications - when a new entity needs to be added, you add that to
+the notification list first and then make the new entity conform to
+the current state. If done in the reverse order, an event happening
+in between will be lost.
+
+cgroup_subsys->fork() is invoked way before the new task is added to
+the css_set. Currently, cgroup_freezer is the only user of ->fork()
+and uses it to make new tasks conform to the current state of the
+freezer. If FROZEN state is requested while fork is in progress
+between cgroup_fork_callbacks() and cgroup_post_fork(), the child
+could escape freezing - the cgroup isn't frozen when ->fork() is
+called and the freezer couldn't see the new task on the css_set.
+
+This patch moves cgroup_subsys->fork() invocation to
+cgroup_post_fork() after the new task is added to the css_set.
+cgroup_fork_callbacks() is removed.
+
+Because now a task may be migrated during cgroup_subsys->fork(),
+freezer_fork() is updated so that it adheres to the usual RCU locking
+and the rather pointless comment on why locking can be different there
+is removed (if it doesn't make anything simpler, why even bother?).
+
+Signed-off-by: Tejun Heo
+Cc: Oleg Nesterov
+Cc: Rafael J. Wysocki
+[hq: Backported to 3.4:
+ - Adjust context
+ - Iterate over first CGROUP_BUILTIN_SUBSYS_COUNT elements of subsys]
+Signed-off-by: Qiang Huang
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ include/linux/cgroup.h | 1
+ kernel/cgroup.c | 50 ++++++++++++++++++++----------------------
+ kernel/cgroup_freezer.c | 13 +++---------
+ kernel/fork.c | 9 --------
+ 4 files changed, 26 insertions(+), 47 deletions(-)
+
+--- a/include/linux/cgroup.h
++++ b/include/linux/cgroup.h
+@@ -32,7 +32,6 @@ extern int cgroup_lock_is_held(void);
+ extern bool cgroup_lock_live_group(struct cgroup *cgrp);
+ extern void cgroup_unlock(void);
+ extern void cgroup_fork(struct task_struct *p);
+-extern void cgroup_fork_callbacks(struct task_struct *p);
+ extern void cgroup_post_fork(struct task_struct *p);
+ extern void cgroup_exit(struct task_struct *p, int run_callbacks);
+ extern int cgroupstats_build(struct cgroupstats *stats,
+--- a/kernel/cgroup.c
++++ b/kernel/cgroup.c
+@@ -4497,41 +4497,19 @@ void cgroup_fork(struct task_struct *chi
+ }
+
+ /**
+- * cgroup_fork_callbacks - run fork callbacks
+- * @child: the new task
+- *
+- * Called on a new task very soon before adding it to the
+- * tasklist. No need to take any locks since no-one can
+- * be operating on this task.
+- */
+-void cgroup_fork_callbacks(struct task_struct *child)
+-{
+-	if (need_forkexit_callback) {
+-		int i;
+-		/*
+-		 * forkexit callbacks are only supported for builtin
+-		 * subsystems, and the builtin section of the subsys array is
+-		 * immutable, so we don't need to lock the subsys array here.
+-		 */
+-		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
+-			struct cgroup_subsys *ss = subsys[i];
+-			if (ss->fork)
+-				ss->fork(child);
+-		}
+-	}
+-}
+-
+-/**
+ * cgroup_post_fork - called on a new task after adding it to the task list
+ * @child: the task in question
+ *
+- * Adds the task to the list running through its css_set if necessary.
+- * Has to be after the task is visible on the task list in case we race
+- * with the first call to cgroup_iter_start() - to guarantee that the
+- * new task ends up on its list.
++ * Adds the task to the list running through its css_set if necessary and
++ * call the subsystem fork() callbacks. Has to be after the task is
++ * visible on the task list in case we race with the first call to
++ * cgroup_iter_start() - to guarantee that the new task ends up on its
++ * list.
+ */
+ void cgroup_post_fork(struct task_struct *child)
+ {
++	int i;
++
+ /*
+ * use_task_css_set_links is set to 1 before we walk the tasklist
+ * under the tasklist_lock and we read it here after we added the child
+@@ -4551,7 +4529,21 @@ void cgroup_post_fork(struct task_struct
+ task_unlock(child);
+ write_unlock(&css_set_lock);
+ }
++
++	/*
++	 * Call ss->fork(). This must happen after @child is linked on
++	 * css_set; otherwise, @child might change state between ->fork()
++	 * and addition to css_set.
++	 */
++	if (need_forkexit_callback) {
++		for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
++			struct cgroup_subsys *ss = subsys[i];
++			if (ss->fork)
++				ss->fork(child);
++		}
++	}
+ }
++
+ /**
+ * cgroup_exit - detach cgroup from exiting task
+ * @tsk: pointer to task_struct of exiting process
+--- a/kernel/cgroup_freezer.c
++++ b/kernel/cgroup_freezer.c
+@@ -186,23 +186,15 @@ static void freezer_fork(struct task_str
+ {
+ struct freezer *freezer;
+
+-	/*
+-	 * No lock is needed, since the task isn't on tasklist yet,
+-	 * so it can't be moved to another cgroup, which means the
+-	 * freezer won't be removed and will be valid during this
+-	 * function call. Nevertheless, apply RCU read-side critical
+-	 * section to suppress RCU lockdep false positives.
+-	 */
+ rcu_read_lock();
+ freezer = task_freezer(task);
+-	rcu_read_unlock();
+
+ /*
+ * The root cgroup is non-freezable, so we can skip the
+ * following check.
+ */
+ if (!freezer->css.cgroup->parent)
+-		return;
++		goto out;
+
+ spin_lock_irq(&freezer->lock);
+ BUG_ON(freezer->state == CGROUP_FROZEN);
+@@ -210,7 +202,10 @@ static void freezer_fork(struct task_str
+ /* Locking avoids race with FREEZING -> THAWED transitions. */
+ if (freezer->state == CGROUP_FREEZING)
+ freeze_task(task);
++
+ spin_unlock_irq(&freezer->lock);
++out:
++	rcu_read_unlock();
+ }
+
+ /*
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -1124,7 +1124,6 @@ static struct task_struct *copy_process(
+ {
+ int retval;
+ struct task_struct *p;
+-	int cgroup_callbacks_done = 0;
+
+ if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
+ return ERR_PTR(-EINVAL);
+@@ -1383,12 +1382,6 @@ static struct task_struct *copy_process(
+ p->group_leader = p;
+ INIT_LIST_HEAD(&p->thread_group);
+
+-	/* Now that the task is set up, run cgroup callbacks if
+-	 * necessary. We need to run them before the task is visible
+-	 * on the tasklist. */
+-	cgroup_fork_callbacks(p);
+-	cgroup_callbacks_done = 1;
+-
+ /* Need tasklist lock for parent etc handling! */
+ write_lock_irq(&tasklist_lock);
+
+@@ -1493,7 +1486,7 @@ bad_fork_cleanup_cgroup:
+ #endif
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_change_end(current);
+-	cgroup_exit(p, cgroup_callbacks_done);
++	cgroup_exit(p, 0);
+ delayacct_tsk_free(p);
+ module_put(task_thread_info(p)->exec_domain->module);
+ bad_fork_cleanup_count:
diff --git a/queue-3.4/cgroup-fix-rcu-accesses-to-task-cgroups.patch b/queue-3.4/cgroup-fix-rcu-accesses-to-task-cgroups.patch
new file mode 100644
index 00000000000..105fa3c30c2
--- /dev/null
+++ b/queue-3.4/cgroup-fix-rcu-accesses-to-task-cgroups.patch
@@ -0,0 +1,108 @@
+From 2235df8b82f6333f2f1a4c6e0acf80f19f591b55 Mon Sep 17 00:00:00 2001
+From: Tejun Heo
+Date: Tue, 25 Jun 2013 11:48:32 -0700
+Subject: cgroup: fix RCU accesses to task->cgroups
+
+From: Tejun Heo
+
+commit 14611e51a57df10240817d8ada510842faf0ec51 upstream.
+
+task->cgroups is a RCU pointer pointing to struct css_set. A task
+switches to a different css_set on cgroup migration but a css_set
+doesn't change once created and its pointers to cgroup_subsys_states
+aren't RCU protected.
+
+task_subsys_state[_check]() is the macro to acquire css given a task
+and subsys_id pair. It RCU-dereferences task->cgroups->subsys[] not
+task->cgroups, so the RCU pointer task->cgroups ends up being
+dereferenced without read_barrier_depends() after it. It's broken.
+
+Fix it by introducing task_css_set[_check]() which does
+RCU-dereference on task->cgroups. task_subsys_state[_check]() is
+reimplemented to directly dereference ->subsys[] of the css_set
+returned from task_css_set[_check]().
+
+This removes some of the sparse RCU warnings in cgroup.
+
+v2: Fixed unbalanced parenthesis and there's no need to use
+ rcu_dereference_raw() when !CONFIG_PROVE_RCU. Both spotted by Li.
+
+Signed-off-by: Tejun Heo
+Reported-by: Fengguang Wu
+Acked-by: Li Zefan
+[bwh: Backported to 3.2:
+ - Adjust context
+ - Remove CONFIG_PROVE_RCU condition
+ - s/lockdep_is_held(&cgroup_mutex)/cgroup_lock_is_held()/]
+Signed-off-by: Ben Hutchings
+Cc: Qiang Huang
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ include/linux/cgroup.h | 52 ++++++++++++++++++++++++++++++++++++++++++-------
+ 1 file changed, 45 insertions(+), 7 deletions(-)
+
+--- a/include/linux/cgroup.h
++++ b/include/linux/cgroup.h
+@@ -513,16 +513,54 @@ static inline struct cgroup_subsys_state
+ return cgrp->subsys[subsys_id];
+ }
+
+-/*
+- * function to get the cgroup_subsys_state which allows for extra
+- * rcu_dereference_check() conditions, such as locks used during the
+- * cgroup_subsys::attach() methods.
++/**
++ * task_css_set_check - obtain a task's css_set with extra access conditions
++ * @task: the task to obtain css_set for
++ * @__c: extra condition expression to be passed to rcu_dereference_check()
++ *
++ * A task's css_set is RCU protected, initialized and exited while holding
++ * task_lock(), and can only be modified while holding both cgroup_mutex
++ * and task_lock() while the task is alive. This macro verifies that the
++ * caller is inside proper critical section and returns @task's css_set.
++ *
++ * The caller can also specify additional allowed conditions via @__c, such
++ * as locks used during the cgroup_subsys::attach() methods.
++ */
++#define task_css_set_check(task, __c) \
++	rcu_dereference_check((task)->cgroups, \
++		lockdep_is_held(&(task)->alloc_lock) || \
++		cgroup_lock_is_held() || (__c))
++
++/**
++ * task_subsys_state_check - obtain css for (task, subsys) w/ extra access conds
++ * @task: the target task
++ * @subsys_id: the target subsystem ID
++ * @__c: extra condition expression to be passed to rcu_dereference_check()
++ *
++ * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The
++ * synchronization rules are the same as task_css_set_check().
+ */
+ #define task_subsys_state_check(task, subsys_id, __c) \
+-	rcu_dereference_check(task->cgroups->subsys[subsys_id], \
+-		lockdep_is_held(&task->alloc_lock) || \
+-		cgroup_lock_is_held() || (__c))
++	task_css_set_check((task), (__c))->subsys[(subsys_id)]
+
++/**
++ * task_css_set - obtain a task's css_set
++ * @task: the task to obtain css_set for
++ *
++ * See task_css_set_check().
++ */
++static inline struct css_set *task_css_set(struct task_struct *task)
++{
++	return task_css_set_check(task, false);
++}
++
++/**
++ * task_subsys_state - obtain css for (task, subsys)
++ * @task: the target task
++ * @subsys_id: the target subsystem ID
++ *
++ * See task_subsys_state_check().
++ */
+ static inline struct cgroup_subsys_state *
+ task_subsys_state(struct task_struct *task, int subsys_id)
+ {
diff --git a/queue-3.4/kvm-iommu-hva-align-mapping-page-size.patch b/queue-3.4/kvm-iommu-hva-align-mapping-page-size.patch
new file mode 100644
index 00000000000..e8bb6d034ec
--- /dev/null
+++ b/queue-3.4/kvm-iommu-hva-align-mapping-page-size.patch
@@ -0,0 +1,52 @@
+From da4eda14795e2e246fc340ba181a95b09ac5205c Mon Sep 17 00:00:00 2001
+From: Greg Edwards
+Date: Mon, 4 Nov 2013 09:08:12 -0700
+Subject: KVM: IOMMU: hva align mapping page size
+
+From: Greg Edwards
+
+commit 27ef63c7e97d1e5dddd85051c03f8d44cc887f34 upstream.
+
+When determining the page size we could use to map with the IOMMU, the
+page size should also be aligned with the hva, not just the gfn. The
+gfn may not reflect the real alignment within the hugetlbfs file.
+
+Most of the time, this works fine. However, if the hugetlbfs file is
+backed by non-contiguous huge pages, a multi-huge page memslot starts at
+an unaligned offset within the hugetlbfs file, and the gfn is aligned
+with respect to the huge page size, kvm_host_page_size() will return the
+huge page size and we will use that to map with the IOMMU.
+
+When we later unpin that same memslot, the IOMMU returns the unmap size
+as the huge page size, and we happily unpin that many pfns in
+monotonically increasing order, not realizing we are spanning
+non-contiguous huge pages and partially unpin the wrong huge page.
+
+Ensure the IOMMU mapping page size is aligned with the hva corresponding
+to the gfn, which does reflect the alignment within the hugetlbfs file.
+
+Reviewed-by: Marcelo Tosatti
+Signed-off-by: Greg Edwards
+Signed-off-by: Gleb Natapov
+[bwh: Backported to 3.2: s/__gfn_to_hva_memslot/gfn_to_hva_memslot/]
+Signed-off-by: Ben Hutchings
+Cc: Qiang Huang
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ virt/kvm/iommu.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/virt/kvm/iommu.c
++++ b/virt/kvm/iommu.c
+@@ -101,6 +101,10 @@ int kvm_iommu_map_pages(struct kvm *kvm,
+ while ((gfn << PAGE_SHIFT) & (page_size - 1))
+ page_size >>= 1;
+
++	/* Make sure hva is aligned to the page size we want to map */
++	while (gfn_to_hva_memslot(slot, gfn) & (page_size - 1))
++		page_size >>= 1;
++
+ /*
+ * Pin all pages we are about to map in memory. This is
+ * important because we unmap and unpin in 4kb steps later.
diff --git a/queue-3.4/kvm-ppc-emulate-dcbf.patch b/queue-3.4/kvm-ppc-emulate-dcbf.patch
new file mode 100644
index 00000000000..8d5913e71d5
--- /dev/null
+++ b/queue-3.4/kvm-ppc-emulate-dcbf.patch
@@ -0,0 +1,42 @@
+From 1a14c25a0b523225ba47a983d094d1b253f2585a Mon Sep 17 00:00:00 2001
+From: Alexander Graf
+Date: Thu, 17 Jan 2013 13:50:25 +0100
+Subject: KVM: PPC: Emulate dcbf
+
+From: Alexander Graf
+
+commit d3286144c92ec876da9e30320afa875699b7e0f1 upstream.
+
+Guests can trigger MMIO exits using dcbf. Since we don't emulate cache
+incoherent MMIO, just do nothing and move on.
+
+Reported-by: Ben Collins
+Signed-off-by: Alexander Graf
+Tested-by: Ben Collins
+[bwh: Backported to 3.2: adjust context]
+Signed-off-by: Ben Hutchings
+Cc: Qiang Huang
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ arch/powerpc/kvm/emulate.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/arch/powerpc/kvm/emulate.c
++++ b/arch/powerpc/kvm/emulate.c
+@@ -36,6 +36,7 @@
+ #define OP_TRAP_64 2
+
+ #define OP_31_XOP_LWZX 23
++#define OP_31_XOP_DCBF 86
+ #define OP_31_XOP_LBZX 87
+ #define OP_31_XOP_STWX 151
+ #define OP_31_XOP_STBX 215
+@@ -373,6 +374,7 @@ int kvmppc_emulate_instruction(struct kv
+ kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS);
+ break;
+
++	case OP_31_XOP_DCBF:
+ case OP_31_XOP_DCBI:
+ /* Do nothing. The guest is performing dcbi because
+ * hardware DMA is not snooped by the dcache, but
diff --git a/queue-3.4/kvm-s390-move-kvm_guest_enter-exit-closer-to-sie.patch b/queue-3.4/kvm-s390-move-kvm_guest_enter-exit-closer-to-sie.patch
new file mode 100644
index 00000000000..c5a318057c0
--- /dev/null
+++ b/queue-3.4/kvm-s390-move-kvm_guest_enter-exit-closer-to-sie.patch
@@ -0,0 +1,58 @@
+From 2b29a9fdcb92bfc6b6f4c412d71505869de61a56 Mon Sep 17 00:00:00 2001
+From: Dominik Dingel
+Date: Fri, 26 Jul 2013 15:04:00 +0200
+Subject: KVM: s390: move kvm_guest_enter,exit closer to sie
+
+From: Dominik Dingel
+
+commit 2b29a9fdcb92bfc6b6f4c412d71505869de61a56 upstream.
+
+Any uaccess between guest_enter and guest_exit could trigger a page
+fault; the page fault handler would handle it as a guest fault and
+translate a user address as a guest address.
+
+Signed-off-by: Dominik Dingel
+Signed-off-by: Christian Borntraeger
+CC: stable@vger.kernel.org
+Signed-off-by: Paolo Bonzini
+[hq: Backported to 3.4: adjust context]
+Signed-off-by: Qiang Huang
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ arch/s390/kvm/kvm-s390.c | 12 +++++++-----
+ 1 file changed, 7 insertions(+), 5 deletions(-)
+
+--- a/arch/s390/kvm/kvm-s390.c
++++ b/arch/s390/kvm/kvm-s390.c
+@@ -525,13 +525,18 @@ static int __vcpu_run(struct kvm_vcpu *v
+ if (!kvm_is_ucontrol(vcpu->kvm))
+ kvm_s390_deliver_pending_interrupts(vcpu);
+
++	VCPU_EVENT(vcpu, 6, "entering sie flags %x",
++		atomic_read(&vcpu->arch.sie_block->cpuflags));
++
+ vcpu->arch.sie_block->icptcode = 0;
+ local_irq_disable();
+ kvm_guest_enter();
+ local_irq_enable();
+-	VCPU_EVENT(vcpu, 6, "entering sie flags %x",
+-		atomic_read(&vcpu->arch.sie_block->cpuflags));
+ rc = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs);
++	local_irq_disable();
++	kvm_guest_exit();
++	local_irq_enable();
++
+ if (rc) {
+ if (kvm_is_ucontrol(vcpu->kvm)) {
+ rc = SIE_INTERCEPT_UCONTROL;
+@@ -543,9 +548,6 @@ static int __vcpu_run(struct kvm_vcpu *v
+ }
+ VCPU_EVENT(vcpu, 6, "exit sie icptcode %d",
+ vcpu->arch.sie_block->icptcode);
+-	local_irq_disable();
+-	kvm_guest_exit();
+-	local_irq_enable();
+
+ memcpy(&vcpu->run->s.regs.gprs[14], &vcpu->arch.sie_block->gg14, 16);
+ return rc;
diff --git a/queue-3.4/mm-hotplug-correctly-add-new-zone-to-all-other-nodes-zone-lists.patch b/queue-3.4/mm-hotplug-correctly-add-new-zone-to-all-other-nodes-zone-lists.patch
new file mode 100644
index 00000000000..94b7f3960b7
--- /dev/null
+++ b/queue-3.4/mm-hotplug-correctly-add-new-zone-to-all-other-nodes-zone-lists.patch
@@ -0,0 +1,86 @@
+From 1bdb24f5ab52e64658f496a0dbfe04ffb56edaf6 Mon Sep 17 00:00:00 2001
+From: Jiang Liu
+Date: Tue, 31 Jul 2012 16:43:30 -0700
+Subject: mm/hotplug: correctly add new zone to all other nodes' zone lists
+
+From: Jiang Liu
+
+commit 08dff7b7d629807dbb1f398c68dd9cd58dd657a1 upstream.
+
+When online_pages() is called to add new memory to an empty zone, it
+rebuilds all zone lists by calling build_all_zonelists(). But there's a
+bug which prevents the new zone from being added to other nodes' zone
+lists.
+
+online_pages() {
+ build_all_zonelists()
+ .....
+ node_set_state(zone_to_nid(zone), N_HIGH_MEMORY)
+}
+
+Here the node of the zone is put into N_HIGH_MEMORY state after calling
+build_all_zonelists(), but build_all_zonelists() only adds zones from
+nodes in N_HIGH_MEMORY state to the fallback zone lists.
+build_all_zonelists()
+
+ ->__build_all_zonelists()
+ ->build_zonelists()
+ ->find_next_best_node()
+ ->for_each_node_state(n, N_HIGH_MEMORY)
+
+So memory in the new zone will never be used by other nodes, and it may
+cause strange behavior when the system is under memory pressure. So put
+the node into N_HIGH_MEMORY state before calling build_all_zonelists().
+
+Signed-off-by: Jianguo Wu
+Signed-off-by: Jiang Liu
+Cc: Mel Gorman
+Cc: Michal Hocko
+Cc: Minchan Kim
+Cc: Rusty Russell
+Cc: Yinghai Lu
+Cc: Tony Luck
+Cc: KAMEZAWA Hiroyuki
+Cc: KOSAKI Motohiro
+Cc: David Rientjes
+Cc: Keping Chen
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+[bwh: Backported to 3.2: adjust context]
+Signed-off-by: Ben Hutchings
+Cc: Qiang Huang
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ mm/memory_hotplug.c | 15 ++++++++-------
+ 1 file changed, 8 insertions(+), 7 deletions(-)
+
+--- a/mm/memory_hotplug.c
++++ b/mm/memory_hotplug.c
+@@ -515,19 +515,20 @@ int __ref online_pages(unsigned long pfn
+
+ zone->present_pages += onlined_pages;
+ zone->zone_pgdat->node_present_pages += onlined_pages;
+-	if (need_zonelists_rebuild)
+-		build_all_zonelists(zone);
+-	else
+-		zone_pcp_update(zone);
++	if (onlined_pages) {
++		node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
++		if (need_zonelists_rebuild)
++			build_all_zonelists(zone);
++		else
++			zone_pcp_update(zone);
++	}
+
+ mutex_unlock(&zonelists_mutex);
+
+ init_per_zone_wmark_min();
+
+-	if (onlined_pages) {
++	if (onlined_pages)
+ kswapd_run(zone_to_nid(zone));
+-		node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
+-	}
+
+ vm_total_pages = nr_free_pagecache_pages();
+
diff --git a/queue-3.4/mm-vmscan-fix-endless-loop-in-kswapd-balancing.patch b/queue-3.4/mm-vmscan-fix-endless-loop-in-kswapd-balancing.patch
new file mode 100644
index 00000000000..c471f4d5d61
--- /dev/null
+++ b/queue-3.4/mm-vmscan-fix-endless-loop-in-kswapd-balancing.patch
@@ -0,0 +1,107 @@
+From 60cefed485a02bd99b6299dad70666fe49245da7 Mon Sep 17 00:00:00 2001
+From: Johannes Weiner
+Date: Thu, 29 Nov 2012 13:54:23 -0800
+Subject: mm: vmscan: fix endless loop in kswapd balancing
+
+From: Johannes Weiner
+
+commit 60cefed485a02bd99b6299dad70666fe49245da7 upstream.
+
+Kswapd does not in all places have the same criteria for a balanced
+zone. Zones are only being reclaimed when their high watermark is
+breached, but compaction checks loop over the zonelist again when the
+zone does not meet the low watermark plus two times the size of the
+allocation. This gets kswapd stuck in an endless loop over a small
+zone, like the DMA zone, where the high watermark is smaller than the
+compaction requirement.
+
+Add a function, zone_balanced(), that checks the watermark, and, for
+higher order allocations, if compaction has enough free memory. Then
+use it uniformly to check for balanced zones.
+
+This makes sure that when the compaction watermark is not met, at least
+reclaim happens and progress is made - or the zone is declared
+unreclaimable at some point and skipped entirely.
+
+Signed-off-by: Johannes Weiner
+Reported-by: George Spelvin
+Reported-by: Johannes Hirte
+Reported-by: Tomas Racek
+Tested-by: Johannes Hirte
+Reviewed-by: Rik van Riel
+Cc: Mel Gorman
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+[hq: Backported to 3.4: adjust context]
+Signed-off-by: Qiang Huang
+Signed-off-by: Greg Kroah-Hartman
+
+
+---
+ mm/vmscan.c | 27 ++++++++++++++++++---------
+ 1 file changed, 18 insertions(+), 9 deletions(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -2569,6 +2569,19 @@ static void age_active_anon(struct zone
+ } while (memcg);
+ }
+
++static bool zone_balanced(struct zone *zone, int order,
++			  unsigned long balance_gap, int classzone_idx)
++{
++	if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
++				    balance_gap, classzone_idx, 0))
++		return false;
++
++	if (COMPACTION_BUILD && order && !compaction_suitable(zone, order))
++		return false;
++
++	return true;
++}
++
+ /*
+ * pgdat_balanced is used when checking if a node is balanced for high-order
+ * allocations. Only zones that meet watermarks and are in a zone allowed
+@@ -2628,8 +2641,7 @@ static bool sleeping_prematurely(pg_data
+ continue;
+ }
+
+-		if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
+-					    i, 0))
++		if (!zone_balanced(zone, order, 0, i))
+ all_zones_ok = false;
+ else
+ balanced += zone->present_pages;
+@@ -2741,8 +2753,7 @@ loop_again:
+ break;
+ }
+
+-			if (!zone_watermark_ok_safe(zone, order,
+-					high_wmark_pages(zone), 0, 0)) {
++			if (!zone_balanced(zone, order, 0, 0)) {
+ end_zone = i;
+ break;
+ } else {
+@@ -2817,9 +2828,8 @@ loop_again:
+ testorder = 0;
+
+ if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
+-			    !zone_watermark_ok_safe(zone, testorder,
+-					high_wmark_pages(zone) + balance_gap,
+-					end_zone, 0)) {
++			    !zone_balanced(zone, testorder,
++					   balance_gap, end_zone)) {
+ shrink_zone(priority, zone, &sc);
+
+ reclaim_state->reclaimed_slab = 0;
+@@ -2846,8 +2856,7 @@ loop_again:
+ continue;
+ }
+
+-			if (!zone_watermark_ok_safe(zone, testorder,
+-					high_wmark_pages(zone), end_zone, 0)) {
++			if (!zone_balanced(zone, testorder, 0, end_zone)) {
+ all_zones_ok = 0;
+ /*
+ * We are still under min water mark. This
diff --git a/queue-3.4/perf-tools-fix-cache-event-name-generation.patch b/queue-3.4/perf-tools-fix-cache-event-name-generation.patch
new file mode 100644
index 00000000000..df772e110af
--- /dev/null
+++ b/queue-3.4/perf-tools-fix-cache-event-name-generation.patch
@@ -0,0 +1,44 @@
+From 275ef3878f698941353780440fec6926107a320b Mon Sep 17 00:00:00 2001
+From: Jiri Olsa
+Date: Wed, 5 Sep 2012 19:51:33 +0200
+Subject: perf tools: Fix cache event name generation
+
+From: Jiri Olsa
+
+commit 275ef3878f698941353780440fec6926107a320b upstream.
+
+If the event name is specified with all 3 components, the last one
+overwrites the previous one during the name composing within the
+parse_events_add_cache function.
+
+Fixing this by properly adjusting the string index.
+
+Reported-by: Joel Uckelman
+Signed-off-by: Jiri Olsa
+Cc: Corey Ashford
+Cc: Frederic Weisbecker
+Cc: Ingo Molnar
+Cc: Joel Uckelman
+Cc: Paul Mackerras
+Cc: Peter Zijlstra
+LPU-Reference: 20120905175133.GA18352@krava.brq.redhat.com
+[ committer note: Remove the newline fix, done already in 42e1fb7 ]
+Signed-off-by: Arnaldo Carvalho de Melo
+Cc: Vinson Lee
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ tools/perf/util/parse-events.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/tools/perf/util/parse-events.c
++++ b/tools/perf/util/parse-events.c
+@@ -413,7 +413,7 @@ int parse_events_add_cache(struct list_h
+ for (i = 0; (i < 2) && (op_result[i]); i++) {
+ char *str = op_result[i];
+
+-		snprintf(name + n, MAX_NAME_LEN - n, "-%s", str);
++		n += snprintf(name + n, MAX_NAME_LEN - n, "-%s", str);
+
+ if (cache_op == -1) {
+ cache_op = parse_aliases(str, hw_cache_op,
diff --git a/queue-3.4/perf-tools-remove-extraneous-newline-when-parsing-hardware-cache-events.patch b/queue-3.4/perf-tools-remove-extraneous-newline-when-parsing-hardware-cache-events.patch
new file mode 100644
index 00000000000..eba84a7fa2f
--- /dev/null
+++ b/queue-3.4/perf-tools-remove-extraneous-newline-when-parsing-hardware-cache-events.patch
@@ -0,0 +1,40 @@
+From 42e1fb776087713b5482cd7cf6cac998fbdd6544 Mon Sep 17 00:00:00 2001
+From: Arnaldo Carvalho de Melo
+Date: Thu, 6 Sep 2012 14:43:28 -0300
+Subject: perf tools: Remove extraneous newline when parsing hardware cache events
+
+From: Arnaldo Carvalho de Melo
+
+commit 42e1fb776087713b5482cd7cf6cac998fbdd6544 upstream.
+
+Noticed while developing a 'perf test' entry to verify that
+perf_evsel__name works.
+
+Cc: David Ahern
+Cc: Frederic Weisbecker
+Cc: Jiri Olsa
+Cc: Mike Galbraith
+Cc: Namhyung Kim
+Cc: Paul Mackerras
+Cc: Peter Zijlstra
+Cc: Stephane Eranian
+Link: http://lkml.kernel.org/n/tip-xz6zgh38mp3cjnd2udh38z8f@git.kernel.org
+Signed-off-by: Arnaldo Carvalho de Melo
+Cc: Vinson Lee
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ tools/perf/util/parse-events.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/tools/perf/util/parse-events.c
++++ b/tools/perf/util/parse-events.c
+@@ -413,7 +413,7 @@ int parse_events_add_cache(struct list_h
+ for (i = 0; (i < 2) && (op_result[i]); i++) {
+ char *str = op_result[i];
+
+-		snprintf(name + n, MAX_NAME_LEN - n, "-%s\n", str);
++		snprintf(name + n, MAX_NAME_LEN - n, "-%s", str);
+
+ if (cache_op == -1) {
+ cache_op = parse_aliases(str, hw_cache_op,
diff --git a/queue-3.4/proc-connector-reject-unprivileged-listener-bumps.patch b/queue-3.4/proc-connector-reject-unprivileged-listener-bumps.patch
new file mode 100644
index 00000000000..fc8a48d4c56
--- /dev/null
+++ b/queue-3.4/proc-connector-reject-unprivileged-listener-bumps.patch
@@ -0,0 +1,53 @@
+From 7c4bf08d3fa22613b628fc967953f581564a13ad Mon Sep 17 00:00:00 2001
+From: Kees Cook
+Date: Mon, 25 Feb 2013 21:32:25 +0000
+Subject: proc connector: reject unprivileged listener bumps
+
+From: Kees Cook
+
+commit e70ab977991964a5a7ad1182799451d067e62669 upstream.
+
+While PROC_CN_MCAST_LISTEN/IGNORE is entirely advisory, it was possible
+for an unprivileged user to turn off notifications for all listeners by
+sending PROC_CN_MCAST_IGNORE. Instead, require the same privileges as
+required for a multicast bind.
+
+Signed-off-by: Kees Cook
+Cc: Evgeniy Polyakov
+Cc: Matt Helsley
+Acked-by: Evgeniy Polyakov
+Acked-by: Matt Helsley
+Signed-off-by: David S. Miller
+[bwh: Backported to 3.2: adjust context]
+Signed-off-by: Ben Hutchings
+Cc: Qiang Huang
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ drivers/connector/cn_proc.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/drivers/connector/cn_proc.c
++++ b/drivers/connector/cn_proc.c
+@@ -331,6 +331,12 @@ static void cn_proc_mcast_ctl(struct cn_
+ if (msg->len != sizeof(*mc_op))
+ return;
+
++	/* Can only change if privileged. */
++	if (!capable(CAP_NET_ADMIN)) {
++		err = EPERM;
++		goto out;
++	}
++
+ mc_op = (enum proc_cn_mcast_op*)msg->data;
+ switch (*mc_op) {
+ case PROC_CN_MCAST_LISTEN:
+@@ -343,6 +349,8 @@ static void cn_proc_mcast_ctl(struct cn_
+ err = EINVAL;
+ break;
+ }
++
++out:
+ cn_proc_ack(err, msg->seq, msg->ack);
+ }
+
diff --git a/queue-3.4/s390-kvm-dont-announce-rrbm-support.patch b/queue-3.4/s390-kvm-dont-announce-rrbm-support.patch
new file mode 100644
index 00000000000..edf4836907d
--- /dev/null
+++ b/queue-3.4/s390-kvm-dont-announce-rrbm-support.patch
@@ -0,0 +1,61 @@
+From 511c73bcf8e747cc95925a311dfeb630989db5a6 Mon Sep 17 00:00:00 2001
+From: Christian Borntraeger
+Date: Tue, 2 Oct 2012 16:25:38 +0200
+Subject: s390/kvm: dont announce RRBM support
+
+From: Christian Borntraeger
+
+commit 87cac8f879a5ecd7109dbe688087e8810b3364eb upstream.
+
+Newer kernels (linux-next with the transparent huge page patches)
+use rrbm if the feature is announced via feature bit 66.
+RRBM will cause intercepts, so KVM does not handle it right now,
+causing an illegal instruction in the guest.
+The easy solution is to disable the feature bit for the guest.
+
+This fixes bugs like:
+Kernel BUG at 0000000000124c2a [verbose debug info unavailable]
+illegal operation: 0001 [#1] SMP
+Modules linked in: virtio_balloon virtio_net ipv6 autofs4
+CPU: 0 Not tainted 3.5.4 #1
+Process fmempig (pid: 659, task: 000000007b712fd0, ksp: 000000007bed3670)
+Krnl PSW : 0704d00180000000 0000000000124c2a (pmdp_clear_flush_young+0x5e/0x80)
+ R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:1 PM:0 EA:3
+ 00000000003cc000 0000000000000004 0000000000000000 0000000079800000
+ 0000000000040000 0000000000000000 000000007bed3918 000000007cf40000
+ 0000000000000001 000003fff7f00000 000003d281a94000 000000007bed383c
+ 000000007bed3918 00000000005ecbf8 00000000002314a6 000000007bed36e0
+Krnl Code:>0000000000124c2a: b9810025 ogr %r2,%r5
+ 0000000000124c2e: 41343000 la %r3,0(%r4,%r3)
+ 0000000000124c32: a716fffa brct %r1,124c26
+ 0000000000124c36: b9010022 lngr %r2,%r2
+ 0000000000124c3a: e3d0f0800004 lg %r13,128(%r15)
+ 0000000000124c40: eb22003f000c srlg %r2,%r2,63
+[ 2150.713198] Call Trace:
+[ 2150.713223] ([<00000000002312c4>] page_referenced_one+0x6c/0x27c)
+[ 2150.713749] [<0000000000233812>] page_referenced+0x32a/0x410
+[...]
+
+CC: Alex Graf
+Signed-off-by: Martin Schwidefsky
+Signed-off-by: Christian Borntraeger
+Signed-off-by: Marcelo Tosatti
+Signed-off-by: Ben Hutchings
+Cc: Qiang Huang
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ arch/s390/kvm/kvm-s390.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/s390/kvm/kvm-s390.c
++++ b/arch/s390/kvm/kvm-s390.c
+@@ -904,7 +904,7 @@ static int __init kvm_s390_init(void)
+ }
+ memcpy(facilities, S390_lowcore.stfle_fac_list, 16);
+ facilities[0] &= 0xff00fff3f47c0000ULL;
+-	facilities[1] &= 0x201c000000000000ULL;
++	facilities[1] &= 0x001c000000000000ULL;
+ return 0;
+ }
+
diff --git a/queue-3.4/series b/queue-3.4/series
index 2c931cf0422..c1875848a45 100644
--- a/queue-3.4/series
+++ b/queue-3.4/series
@@ -40,3 +40,14 @@ selinux-bigendian-problems-with-filename-trans-rules.patch
 quota-fix-race-between-dqput-and-dquot_scan_active.patch
 dma-ste_dma40-don-t-dereference-free-d-descriptor.patch
 dm-mpath-fix-stalls-when-handling-invalid-ioctls.patch
+mm-vmscan-fix-endless-loop-in-kswapd-balancing.patch
+cgroup-cgroup_subsys-fork-should-be-called-after-the-task-is-added-to-css_set.patch
+kvm-s390-move-kvm_guest_enter-exit-closer-to-sie.patch
+s390-kvm-dont-announce-rrbm-support.patch
+kvm-ppc-emulate-dcbf.patch
+kvm-iommu-hva-align-mapping-page-size.patch
+proc-connector-reject-unprivileged-listener-bumps.patch
+cgroup-fix-rcu-accesses-to-task-cgroups.patch
+mm-hotplug-correctly-add-new-zone-to-all-other-nodes-zone-lists.patch
+perf-tools-remove-extraneous-newline-when-parsing-hardware-cache-events.patch
+perf-tools-fix-cache-event-name-generation.patch