3.4-stable patches
author		Greg Kroah-Hartman <gregkh@linuxfoundation.org>
		Tue, 18 Feb 2014 22:37:04 +0000 (14:37 -0800)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
		Tue, 18 Feb 2014 22:37:04 +0000 (14:37 -0800)
added patches:
ib-qib-convert-qib_user_sdma_pin_pages-to-use-get_user_pages_fast.patch
sched-nohz-fix-rq-cpu_load-calculations-some-more.patch
sched-nohz-fix-rq-cpu_load-calculations.patch

queue-3.4/ib-qib-convert-qib_user_sdma_pin_pages-to-use-get_user_pages_fast.patch [new file with mode: 0644]
queue-3.4/sched-nohz-fix-rq-cpu_load-calculations-some-more.patch [new file with mode: 0644]
queue-3.4/sched-nohz-fix-rq-cpu_load-calculations.patch [new file with mode: 0644]
queue-3.4/series

diff --git a/queue-3.4/ib-qib-convert-qib_user_sdma_pin_pages-to-use-get_user_pages_fast.patch b/queue-3.4/ib-qib-convert-qib_user_sdma_pin_pages-to-use-get_user_pages_fast.patch
new file mode 100644 (file)
index 0000000..15cf941
--- /dev/null
@@ -0,0 +1,60 @@
+From 603e7729920e42b3c2f4dbfab9eef4878cb6e8fa Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Fri, 4 Oct 2013 09:29:12 -0400
+Subject: IB/qib: Convert qib_user_sdma_pin_pages() to use get_user_pages_fast()
+
+From: Jan Kara <jack@suse.cz>
+
+commit 603e7729920e42b3c2f4dbfab9eef4878cb6e8fa upstream.
+
+qib_user_sdma_queue_pkts() gets called with mmap_sem held for
+writing. Except for get_user_pages() deep down in
+qib_user_sdma_pin_pages() we don't seem to need mmap_sem at all.
+Even more interestingly, qib_user_sdma_queue_pkts() (and also
+qib_user_sdma_coalesce(), called somewhat later) calls
+copy_from_user(), which can hit a page fault, and we then deadlock
+trying to take mmap_sem while handling that fault.
+
+So just make qib_user_sdma_pin_pages() use get_user_pages_fast() and
+leave mmap_sem locking for mm.
+
+This deadlock has actually been observed in the wild when the node
+is under memory pressure.
+
+Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Roland Dreier <roland@purestorage.com>
+[Backported to 3.4 (thanks to Ben Hutchings):
+ - Adjust context
+ - Adjust indentation and nr_pages argument in qib_user_sdma_pin_pages()]
+Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
+Signed-off-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/infiniband/hw/qib/qib_user_sdma.c |    6 +-----
+ 1 file changed, 1 insertion(+), 5 deletions(-)
+
+--- a/drivers/infiniband/hw/qib/qib_user_sdma.c
++++ b/drivers/infiniband/hw/qib/qib_user_sdma.c
+@@ -284,8 +284,7 @@ static int qib_user_sdma_pin_pages(const
+       int j;
+       int ret;
+-      ret = get_user_pages(current, current->mm, addr,
+-                           npages, 0, 1, pages, NULL);
++      ret = get_user_pages_fast(addr, npages, 0, pages);
+       if (ret != npages) {
+               int i;
+@@ -830,10 +829,7 @@ int qib_user_sdma_writev(struct qib_ctxt
+       while (dim) {
+               const int mxp = 8;
+-              down_write(&current->mm->mmap_sem);
+               ret = qib_user_sdma_queue_pkts(dd, pq, &list, iov, dim, mxp);
+-              up_write(&current->mm->mmap_sem);
+-
+               if (ret <= 0)
+                       goto done_unlock;
+               else {
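
To make the deadlock described above concrete, here is a minimal sketch of the pinning helper before and after the conversion. The helper name pin_user_buffer() and the surrounding fragment are illustrative only; the get_user_pages(), get_user_pages_fast() and mmap_sem usages follow the 3.4-era signatures quoted in the hunks above.

#include <linux/mm.h>
#include <linux/sched.h>

/*
 * Illustrative fragment, not the real qib code.
 *
 * Old call chain: the writev path took mmap_sem for writing around
 * qib_user_sdma_queue_pkts(), which eventually did both
 *
 *     get_user_pages(current, current->mm, addr, npages, 0, 1, pages, NULL);
 *
 * and copy_from_user().  A fault taken by copy_from_user() re-enters
 * the fault handler, which needs mmap_sem -> self-deadlock, seen in
 * practice under memory pressure.
 */
static int pin_user_buffer(unsigned long addr, int npages,
			   struct page **pages)
{
	/*
	 * get_user_pages_fast() does its own mm locking, so the caller
	 * holds no mmap_sem and copy_from_user() can fault safely.
	 * 3.4 signature: (start, nr_pages, write, pages); write == 0
	 * here, matching the converted qib_user_sdma_pin_pages().
	 */
	return get_user_pages_fast(addr, npages, 0, pages);
}

The two hunks above are exactly this change applied to qib_user_sdma_pin_pages() and to the writev path that used to take mmap_sem around it.
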
diff --git a/queue-3.4/sched-nohz-fix-rq-cpu_load-calculations-some-more.patch b/queue-3.4/sched-nohz-fix-rq-cpu_load-calculations-some-more.patch
new file mode 100644 (file)
index 0000000..5cb51f1
--- /dev/null
@@ -0,0 +1,141 @@
+From 5aaa0b7a2ed5b12692c9ffb5222182bd558d3146 Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Date: Thu, 17 May 2012 17:15:29 +0200
+Subject: sched/nohz: Fix rq->cpu_load calculations some more
+
+From: Peter Zijlstra <a.p.zijlstra@chello.nl>
+
+commit 5aaa0b7a2ed5b12692c9ffb5222182bd558d3146 upstream.
+
+Follow up on commit 556061b00 ("sched/nohz: Fix rq->cpu_load[]
+calculations"): while that commit fixed the busy case, it regressed
+the mostly idle case.
+
+Add a callback from the nohz exit to also age the rq->cpu_load[]
+array. This closes the hole where either there was no nohz load
+balance pass during the nohz, or there was a 'significant' amount of
+idle time between the last nohz balance and the nohz exit.
+
+So we'll update unconditionally from the tick so that we don't insert
+any accidental zero-load periods while busy, and we try to catch up
+from the nohz idle balance and the nohz exit. Both of these are still
+prone to missing a jiffy, but that has always been the case.
+
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Cc: pjt@google.com
+Cc: Venkatesh Pallipadi <venki@google.com>
+Link: http://lkml.kernel.org/n/tip-kt0trz0apodbf84ucjfdbr1a@git.kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Cc: Li Zefan <lizefan@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/sched.h    |    1 
+ kernel/sched/core.c      |   53 ++++++++++++++++++++++++++++++++++++++---------
+ kernel/time/tick-sched.c |    1 
+ 3 files changed, 45 insertions(+), 10 deletions(-)
+
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -144,6 +144,7 @@ extern unsigned long this_cpu_load(void)
+ extern void calc_global_load(unsigned long ticks);
++extern void update_cpu_load_nohz(void);
+ extern unsigned long get_parent_ip(unsigned long addr);
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -2649,25 +2649,32 @@ static void __update_cpu_load(struct rq
+       sched_avg_update(this_rq);
+ }
++#ifdef CONFIG_NO_HZ
++/*
++ * There is no sane way to deal with nohz on smp when using jiffies because the
++ * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
++ * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
++ *
++ * Therefore we cannot use the delta approach from the regular tick since that
++ * would seriously skew the load calculation. However we'll make do for those
++ * updates happening while idle (nohz_idle_balance) or coming out of idle
++ * (tick_nohz_idle_exit).
++ *
++ * This means we might still be one tick off for nohz periods.
++ */
++
+ /*
+  * Called from nohz_idle_balance() to update the load ratings before doing the
+  * idle balance.
+  */
+ void update_idle_cpu_load(struct rq *this_rq)
+ {
+-      unsigned long curr_jiffies = jiffies;
++      unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
+       unsigned long load = this_rq->load.weight;
+       unsigned long pending_updates;
+       /*
+-       * Bloody broken means of dealing with nohz, but better than nothing..
+-       * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
+-       * update and see 0 difference the one time and 2 the next, even though
+-       * we ticked at roughtly the same rate.
+-       *
+-       * Hence we only use this from nohz_idle_balance() and skip this
+-       * nonsense when called from the scheduler_tick() since that's
+-       * guaranteed a stable rate.
++       * bail if there's load or we're actually up-to-date.
+        */
+       if (load || curr_jiffies == this_rq->last_load_update_tick)
+               return;
+@@ -2679,12 +2686,38 @@ void update_idle_cpu_load(struct rq *thi
+ }
+ /*
++ * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
++ */
++void update_cpu_load_nohz(void)
++{
++      struct rq *this_rq = this_rq();
++      unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
++      unsigned long pending_updates;
++
++      if (curr_jiffies == this_rq->last_load_update_tick)
++              return;
++
++      raw_spin_lock(&this_rq->lock);
++      pending_updates = curr_jiffies - this_rq->last_load_update_tick;
++      if (pending_updates) {
++              this_rq->last_load_update_tick = curr_jiffies;
++              /*
++               * We were idle, this means load 0, the current load might be
++               * !0 due to remote wakeups and the sort.
++               */
++              __update_cpu_load(this_rq, 0, pending_updates);
++      }
++      raw_spin_unlock(&this_rq->lock);
++}
++#endif /* CONFIG_NO_HZ */
++
++/*
+  * Called from scheduler_tick()
+  */
+ static void update_cpu_load_active(struct rq *this_rq)
+ {
+       /*
+-       * See the mess in update_idle_cpu_load().
++       * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
+        */
+       this_rq->last_load_update_tick = jiffies;
+       __update_cpu_load(this_rq, this_rq->load.weight, 1);
+--- a/kernel/time/tick-sched.c
++++ b/kernel/time/tick-sched.c
+@@ -582,6 +582,7 @@ void tick_nohz_idle_exit(void)
+       /* Update jiffies first */
+       select_nohz_load_balancer(0);
+       tick_do_update_jiffies64(now);
++      update_cpu_load_nohz();
+ #ifndef CONFIG_VIRT_CPU_ACCOUNTING
+       /*
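
As a rough illustration of what "aging the rq->cpu_load[] array" on nohz exit does, here is a small self-contained userspace model (not kernel code) of the per-index averaging cpu_load[i] = (old * (2^i - 1) + new) / 2^i taken from __update_cpu_load() in the 3.4 tree. The function and variable names are made up for the example, and the real kernel folds missed jiffies in via decay_load_missed() and a precomputed table rather than this literal loop.

#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5

/*
 * Simplified model of __update_cpu_load(): index i is an exponential
 * average with factor (2^i - 1) / 2^i.  Applying it once per pending
 * jiffy with a new load of 0 is what the catch-up on nohz exit
 * amounts to for a cpu that really was idle.
 */
static void model_update(unsigned long cpu_load[CPU_LOAD_IDX_MAX],
			 unsigned long new_load,
			 unsigned long pending_updates)
{
	while (pending_updates--) {
		cpu_load[0] = new_load;	/* fast path for index 0 */
		for (int i = 1, scale = 2; i < CPU_LOAD_IDX_MAX;
		     i++, scale += scale)
			cpu_load[i] = (cpu_load[i] * (scale - 1) + new_load)
					/ scale;
	}
}

int main(void)
{
	unsigned long load[CPU_LOAD_IDX_MAX] = { 1024, 1024, 1024, 1024, 1024 };

	/* The cpu was busy, then slept 8 jiffies in nohz; on
	 * tick_nohz_idle_exit() the new update_cpu_load_nohz()
	 * catches up with load 0: */
	model_update(load, 0, 8);

	for (int i = 0; i < CPU_LOAD_IDX_MAX; i++)
		printf("cpu_load[%d] = %lu\n", i, load[i]);
	return 0;
}

The point of the patch is simply to make sure this catch-up also happens on the tick_nohz_idle_exit() path, not only when a nohz idle balance happens to run.
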
diff --git a/queue-3.4/sched-nohz-fix-rq-cpu_load-calculations.patch b/queue-3.4/sched-nohz-fix-rq-cpu_load-calculations.patch
new file mode 100644 (file)
index 0000000..9259e80
--- /dev/null
@@ -0,0 +1,145 @@
+From 556061b00c9f2fd6a5524b6bde823ef12f299ecf Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Date: Fri, 11 May 2012 17:31:26 +0200
+Subject: sched/nohz: Fix rq->cpu_load[] calculations
+
+From: Peter Zijlstra <a.p.zijlstra@chello.nl>
+
+commit 556061b00c9f2fd6a5524b6bde823ef12f299ecf upstream.
+
+While investigating why the load balancer was acting up, I found that
+the rq->cpu_load[] tables were completely screwy. A bit more digging
+revealed that the updates that got through were missing ticks followed
+by a catch-up of 2 ticks.
+
+The catch-up assumes the cpu was idle during that time (since only
+nohz can cause missed ticks and the machine is idle, etc.), which
+means that especially the higher indices were significantly lower
+than they ought to be.
+
+The reason for this is that it is not correct to compare against
+jiffies on every jiffy on any cpu other than the one that updates
+jiffies.
+
+This patch kludges around it by only doing the catch-up stuff from
+nohz_idle_balance() and doing the regular stuff unconditionally from
+the tick.
+
+Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
+Cc: pjt@google.com
+Cc: Venkatesh Pallipadi <venki@google.com>
+Link: http://lkml.kernel.org/n/tip-tp4kj18xdd5aj4vvj0qg55s2@git.kernel.org
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Cc: Li Zefan <lizefan@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/core.c  |   53 +++++++++++++++++++++++++++++++++++++--------------
+ kernel/sched/fair.c  |    2 -
+ kernel/sched/sched.h |    2 -
+ 3 files changed, 41 insertions(+), 16 deletions(-)
+
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -692,8 +692,6 @@ int tg_nop(struct task_group *tg, void *
+ }
+ #endif
+-void update_cpu_load(struct rq *this_rq);
+-
+ static void set_load_weight(struct task_struct *p)
+ {
+       int prio = p->static_prio - MAX_RT_PRIO;
+@@ -2620,22 +2618,13 @@ decay_load_missed(unsigned long load, un
+  * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+  * every tick. We fix it up based on jiffies.
+  */
+-void update_cpu_load(struct rq *this_rq)
++static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
++                            unsigned long pending_updates)
+ {
+-      unsigned long this_load = this_rq->load.weight;
+-      unsigned long curr_jiffies = jiffies;
+-      unsigned long pending_updates;
+       int i, scale;
+       this_rq->nr_load_updates++;
+-      /* Avoid repeated calls on same jiffy, when moving in and out of idle */
+-      if (curr_jiffies == this_rq->last_load_update_tick)
+-              return;
+-
+-      pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+-      this_rq->last_load_update_tick = curr_jiffies;
+-
+       /* Update our load: */
+       this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+       for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+@@ -2660,9 +2649,45 @@ void update_cpu_load(struct rq *this_rq)
+       sched_avg_update(this_rq);
+ }
++/*
++ * Called from nohz_idle_balance() to update the load ratings before doing the
++ * idle balance.
++ */
++void update_idle_cpu_load(struct rq *this_rq)
++{
++      unsigned long curr_jiffies = jiffies;
++      unsigned long load = this_rq->load.weight;
++      unsigned long pending_updates;
++
++      /*
++       * Bloody broken means of dealing with nohz, but better than nothing..
++       * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
++       * update and see 0 difference the one time and 2 the next, even though
++       * we ticked at roughtly the same rate.
++       *
++       * Hence we only use this from nohz_idle_balance() and skip this
++       * nonsense when called from the scheduler_tick() since that's
++       * guaranteed a stable rate.
++       */
++      if (load || curr_jiffies == this_rq->last_load_update_tick)
++              return;
++
++      pending_updates = curr_jiffies - this_rq->last_load_update_tick;
++      this_rq->last_load_update_tick = curr_jiffies;
++
++      __update_cpu_load(this_rq, load, pending_updates);
++}
++
++/*
++ * Called from scheduler_tick()
++ */
+ static void update_cpu_load_active(struct rq *this_rq)
+ {
+-      update_cpu_load(this_rq);
++      /*
++       * See the mess in update_idle_cpu_load().
++       */
++      this_rq->last_load_update_tick = jiffies;
++      __update_cpu_load(this_rq, this_rq->load.weight, 1);
+       calc_load_account_active(this_rq);
+ }
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -5042,7 +5042,7 @@ static void nohz_idle_balance(int this_c
+               raw_spin_lock_irq(&this_rq->lock);
+               update_rq_clock(this_rq);
+-              update_cpu_load(this_rq);
++              update_idle_cpu_load(this_rq);
+               raw_spin_unlock_irq(&this_rq->lock);
+               rebalance_domains(balance_cpu, CPU_IDLE);
+--- a/kernel/sched/sched.h
++++ b/kernel/sched/sched.h
+@@ -873,7 +873,7 @@ extern void resched_cpu(int cpu);
+ extern struct rt_bandwidth def_rt_bandwidth;
+ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
+-extern void update_cpu_load(struct rq *this_rq);
++extern void update_idle_cpu_load(struct rq *this_rq);
+ #ifdef CONFIG_CGROUP_CPUACCT
+ #include <linux/cgroup.h>
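
And to see why the pre-556061b00 behaviour hurt the higher indices, here is a small self-contained model (userspace C, same simplified averaging as above, hypothetical names) of a fully busy cpu whose jiffies reader drifts against the jiffies writer, so it sees tick deltas of {0,2} instead of {1,1} and folds the "missed" tick in as if it had been idle.

#include <stdio.h>

#define IDX 4	/* watch a slow index, cpu_load[4] */

/* One averaging step at index IDX (scale = 2^IDX), as in __update_cpu_load(). */
static unsigned long step(unsigned long old, unsigned long new_load)
{
	unsigned long scale = 1UL << IDX;

	return (old * (scale - 1) + new_load) / scale;
}

int main(void)
{
	unsigned long busy = 1024;	/* the cpu's real, constant load */
	unsigned long buggy = busy, fixed = busy;

	for (int t = 0; t < 100; t++) {
		/*
		 * Buggy scheme: every other tick the observed jiffies
		 * delta is 0 (no update), then 2; the extra tick is
		 * treated as a missed *idle* tick, i.e. a load of 0.
		 */
		if (t & 1) {
			buggy = step(buggy, 0);
			buggy = step(buggy, busy);
		}
		/* Fixed scheme: exactly one busy update per tick. */
		fixed = step(fixed, busy);
	}
	printf("cpu_load[%d]: buggy ~%lu, fixed %lu, true load %lu\n",
	       IDX, buggy, fixed, busy);
	return 0;
}

With these numbers the drifting reader converges to roughly half the true load at index 4, while the fixed per-tick path holds steady at 1024 -- the "significantly lower than they ought to be" effect the changelog describes.
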
diff --git a/queue-3.4/series b/queue-3.4/series
index a3671465e004c5d151f109ad56c3abfd8a54e6fa..b49f32ea2b614a218bffff7c7151da07797e4953 100644 (file)
@@ -11,3 +11,6 @@ dm-sysfs-fix-a-module-unload-race.patch
 ftrace-synchronize-setting-function_trace_op-with-ftrace_trace_function.patch
 ftrace-fix-synchronization-location-disabling-and-freeing-ftrace_ops.patch
 ftrace-have-function-graph-only-trace-based-on-global_ops-filters.patch
+sched-nohz-fix-rq-cpu_load-calculations.patch
+sched-nohz-fix-rq-cpu_load-calculations-some-more.patch
+ib-qib-convert-qib_user_sdma_pin_pages-to-use-get_user_pages_fast.patch