--- /dev/null
+From e5b5a61fcb3743f1dacf9e20d28f48423cecf0c1 Mon Sep 17 00:00:00 2001
+From: Ricardo Ribalda Delgado <ricardo.ribalda@gmail.com>
+Date: Fri, 31 Jul 2015 13:36:21 +0200
+Subject: leds/led-class: Add missing put_device()
+
+From: Ricardo Ribalda Delgado <ricardo.ribalda@gmail.com>
+
+commit e5b5a61fcb3743f1dacf9e20d28f48423cecf0c1 upstream.
+
+Devices found by class_find_device must be freed with put_device().
+Otherwise the reference count will not work properly.
+
+Fixes: a96aa64cb572 ("leds/led-class: Handle LEDs with the same name")
+Reported-by: Alan Tull <delicious.quinoa@gmail.com>
+Signed-off-by: Ricardo Ribalda Delgado <ricardo.ribalda@gmail.com>
+Signed-off-by: Jacek Anaszewski <j.anaszewski@samsung.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/leds/led-class.c | 7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+--- a/drivers/leds/led-class.c
++++ b/drivers/leds/led-class.c
+@@ -228,12 +228,15 @@ static int led_classdev_next_name(const
+ {
+ unsigned int i = 0;
+ int ret = 0;
++ struct device *dev;
+
+ strlcpy(name, init_name, len);
+
+- while (class_find_device(leds_class, NULL, name, match_name) &&
+- (ret < len))
++ while ((ret < len) &&
++ (dev = class_find_device(leds_class, NULL, name, match_name))) {
++ put_device(dev);
+ ret = snprintf(name, len, "%s_%u", init_name, ++i);
++ }
+
+ if (ret >= len)
+ return -ENOMEM;
--- /dev/null
+From 2338f73d407d5abe2036d92716ba25ef5279c3d2 Mon Sep 17 00:00:00 2001
+From: Takashi Iwai <tiwai@suse.de>
+Date: Mon, 7 Sep 2015 14:25:01 +0200
+Subject: leds:lp55xx: Correct Kconfig dependency for f/w user helper
+
+From: Takashi Iwai <tiwai@suse.de>
+
+commit 2338f73d407d5abe2036d92716ba25ef5279c3d2 upstream.
+
+The commit [b67893206fc0: leds:lp55xx: fix firmware loading error]
+tries to address the firmware file handling with user helper, but it
+sets a wrong Kconfig CONFIG_FW_LOADER_USER_HELPER_FALLBACK. Since the
+wrong option was enabled, the system got a regression -- it suffers
+from the unexpected long delays for non-present firmware files.
+
+This patch corrects the Kconfig dependency to the right one,
+CONFIG_FW_LOADER_USER_HELPER. This doesn't change the fallback
+behavior but only enables UMH when needed.
+
+Bugzilla: https://bugzilla.opensuse.org/show_bug.cgi?id=944661
+Fixes: b67893206fc0 ('leds:lp55xx: fix firmware loading error')
+Signed-off-by: Takashi Iwai <tiwai@suse.de>
+Signed-off-by: Jacek Anaszewski <j.anaszewski@samsung.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/leds/Kconfig | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/leds/Kconfig
++++ b/drivers/leds/Kconfig
+@@ -229,7 +229,7 @@ config LEDS_LP55XX_COMMON
+ tristate "Common Driver for TI/National LP5521/5523/55231/5562/8501"
+ depends on LEDS_LP5521 || LEDS_LP5523 || LEDS_LP5562 || LEDS_LP8501
+ select FW_LOADER
+- select FW_LOADER_USER_HELPER_FALLBACK
++ select FW_LOADER_USER_HELPER
+ help
+ This option supports common operations for LP5521/5523/55231/5562/8501
+ devices.
--- /dev/null
+From 484ebb3b8c8b27dd2171696462a3116edb9ff801 Mon Sep 17 00:00:00 2001
+From: Greg Thelen <gthelen@google.com>
+Date: Thu, 1 Oct 2015 15:37:05 -0700
+Subject: memcg: make mem_cgroup_read_stat() unsigned
+
+From: Greg Thelen <gthelen@google.com>
+
+commit 484ebb3b8c8b27dd2171696462a3116edb9ff801 upstream.
+
+mem_cgroup_read_stat() returns a page count by summing per cpu page
+counters. The summing is racy wrt. updates, so a transient negative
+sum is possible. Callers don't want negative values:
+
+ - mem_cgroup_wb_stats() doesn't want negative nr_dirty or nr_writeback.
+ This could confuse dirty throttling.
+
+ - oom reports and memory.stat shouldn't show confusing negative usage.
+
+ - tree_usage() already avoids negatives.
+
+Avoid returning negative page counts from mem_cgroup_read_stat() and
+convert it to unsigned.
+
+[akpm@linux-foundation.org: fix old typo while we're in there]
+Signed-off-by: Greg Thelen <gthelen@google.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memcontrol.c | 30 ++++++++++++++++++------------
+ 1 file changed, 18 insertions(+), 12 deletions(-)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -806,12 +806,14 @@ mem_cgroup_largest_soft_limit_node(struc
+ }
+
+ /*
++ * Return page count for single (non recursive) @memcg.
++ *
+ * Implementation Note: reading percpu statistics for memcg.
+ *
+ * Both of vmstat[] and percpu_counter has threshold and do periodic
+ * synchronization to implement "quick" read. There are trade-off between
+ * reading cost and precision of value. Then, we may have a chance to implement
+- * a periodic synchronizion of counter in memcg's counter.
++ * a periodic synchronization of counter in memcg's counter.
+ *
+ * But this _read() function is used for user interface now. The user accounts
+ * memory usage by memory cgroup and he _always_ requires exact value because
+@@ -821,17 +823,24 @@ mem_cgroup_largest_soft_limit_node(struc
+ *
+ * If there are kernel internal actions which can make use of some not-exact
+ * value, and reading all cpu value can be performance bottleneck in some
+- * common workload, threashold and synchonization as vmstat[] should be
++ * common workload, threshold and synchronization as vmstat[] should be
+ * implemented.
+ */
+-static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
+- enum mem_cgroup_stat_index idx)
++static unsigned long
++mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx)
+ {
+ long val = 0;
+ int cpu;
+
++ /* Per-cpu values can be negative, use a signed accumulator */
+ for_each_possible_cpu(cpu)
+ val += per_cpu(memcg->stat->count[idx], cpu);
++ /*
++ * Summing races with updates, so val may be negative. Avoid exposing
++ * transient negative values.
++ */
++ if (val < 0)
++ val = 0;
+ return val;
+ }
+
+@@ -1498,7 +1507,7 @@ void mem_cgroup_print_oom_info(struct me
+ for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
+ if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
+ continue;
+- pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
++ pr_cont(" %s:%luKB", mem_cgroup_stat_names[i],
+ K(mem_cgroup_read_stat(iter, i)));
+ }
+
+@@ -3119,14 +3128,11 @@ static unsigned long tree_stat(struct me
+ enum mem_cgroup_stat_index idx)
+ {
+ struct mem_cgroup *iter;
+- long val = 0;
++ unsigned long val = 0;
+
+- /* Per-cpu values can be negative, use a signed accumulator */
+ for_each_mem_cgroup_tree(iter, memcg)
+ val += mem_cgroup_read_stat(iter, idx);
+
+- if (val < 0) /* race ? */
+- val = 0;
+ return val;
+ }
+
+@@ -3469,7 +3475,7 @@ static int memcg_stat_show(struct seq_fi
+ for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
+ if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
+ continue;
+- seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
++ seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i],
+ mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
+ }
+
+@@ -3494,13 +3500,13 @@ static int memcg_stat_show(struct seq_fi
+ (u64)memsw * PAGE_SIZE);
+
+ for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
+- long long val = 0;
++ unsigned long long val = 0;
+
+ if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
+ continue;
+ for_each_mem_cgroup_tree(mi, memcg)
+ val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
+- seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
++ seq_printf(m, "total_%s %llu\n", mem_cgroup_stat_names[i], val);
+ }
+
+ for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
--- /dev/null
+From f9f9e7b776142fb1c0782cade004cc8e0147a199 Mon Sep 17 00:00:00 2001
+From: Tejun Heo <tj@kernel.org>
+Date: Wed, 16 Sep 2015 11:51:12 -0400
+Subject: Revert "cgroup: simplify threadgroup locking"
+
+From: Tejun Heo <tj@kernel.org>
+
+commit f9f9e7b776142fb1c0782cade004cc8e0147a199 upstream.
+
+This reverts commit b5ba75b5fc0e8404e2c50cb68f39bb6a53fc916f.
+
+d59cfc09c32a ("sched, cgroup: replace signal_struct->group_rwsem with
+a global percpu_rwsem") and b5ba75b5fc0e ("cgroup: simplify
+threadgroup locking") changed how cgroup synchronizes against task
+fork and exits so that it uses global percpu_rwsem instead of
+per-process rwsem; unfortunately, the write [un]lock paths of
+percpu_rwsem always involve synchronize_rcu_expedited() which turned
+out to be too expensive.
+
+Improvements for percpu_rwsem are scheduled to be merged in the coming
+v4.4-rc1 merge window which alleviates this issue. For now, revert
+the two commits to restore per-process rwsem. They will be re-applied
+for the v4.4-rc1 merge window.
+
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Link: http://lkml.kernel.org/g/55F8097A.7000206@de.ibm.com
+Reported-by: Christian Borntraeger <borntraeger@de.ibm.com>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/cgroup.c | 45 +++++++++++++++++++++++++++++++++------------
+ 1 file changed, 33 insertions(+), 12 deletions(-)
+
+--- a/kernel/cgroup.c
++++ b/kernel/cgroup.c
+@@ -2452,13 +2452,14 @@ static ssize_t __cgroup_procs_write(stru
+ if (!cgrp)
+ return -ENODEV;
+
+- percpu_down_write(&cgroup_threadgroup_rwsem);
++retry_find_task:
+ rcu_read_lock();
+ if (pid) {
+ tsk = find_task_by_vpid(pid);
+ if (!tsk) {
++ rcu_read_unlock();
+ ret = -ESRCH;
+- goto out_unlock_rcu;
++ goto out_unlock_cgroup;
+ }
+ } else {
+ tsk = current;
+@@ -2474,23 +2475,37 @@ static ssize_t __cgroup_procs_write(stru
+ */
+ if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
+ ret = -EINVAL;
+- goto out_unlock_rcu;
++ rcu_read_unlock();
++ goto out_unlock_cgroup;
+ }
+
+ get_task_struct(tsk);
+ rcu_read_unlock();
+
++ percpu_down_write(&cgroup_threadgroup_rwsem);
++ if (threadgroup) {
++ if (!thread_group_leader(tsk)) {
++ /*
++ * a race with de_thread from another thread's exec()
++ * may strip us of our leadership, if this happens,
++ * there is no choice but to throw this task away and
++ * try again; this is
++ * "double-double-toil-and-trouble-check locking".
++ */
++ percpu_up_write(&cgroup_threadgroup_rwsem);
++ put_task_struct(tsk);
++ goto retry_find_task;
++ }
++ }
++
+ ret = cgroup_procs_write_permission(tsk, cgrp, of);
+ if (!ret)
+ ret = cgroup_attach_task(cgrp, tsk, threadgroup);
+
+- put_task_struct(tsk);
+- goto out_unlock_threadgroup;
+-
+-out_unlock_rcu:
+- rcu_read_unlock();
+-out_unlock_threadgroup:
+ percpu_up_write(&cgroup_threadgroup_rwsem);
++
++ put_task_struct(tsk);
++out_unlock_cgroup:
+ cgroup_kn_unlock(of->kn);
+ return ret ?: nbytes;
+ }
+@@ -2635,8 +2650,6 @@ static int cgroup_update_dfl_csses(struc
+
+ lockdep_assert_held(&cgroup_mutex);
+
+- percpu_down_write(&cgroup_threadgroup_rwsem);
+-
+ /* look up all csses currently attached to @cgrp's subtree */
+ down_read(&css_set_rwsem);
+ css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
+@@ -2692,8 +2705,17 @@ static int cgroup_update_dfl_csses(struc
+ goto out_finish;
+ last_task = task;
+
++ percpu_down_write(&cgroup_threadgroup_rwsem);
++ /* raced against de_thread() from another thread? */
++ if (!thread_group_leader(task)) {
++ percpu_up_write(&cgroup_threadgroup_rwsem);
++ put_task_struct(task);
++ continue;
++ }
++
+ ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
+
++ percpu_up_write(&cgroup_threadgroup_rwsem);
+ put_task_struct(task);
+
+ if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
+@@ -2703,7 +2725,6 @@ static int cgroup_update_dfl_csses(struc
+
+ out_finish:
+ cgroup_migrate_finish(&preloaded_csets);
+- percpu_up_write(&cgroup_threadgroup_rwsem);
+ return ret;
+ }
+
--- /dev/null
+From 0c986253b939cc14c69d4adbe2b4121bdf4aa220 Mon Sep 17 00:00:00 2001
+From: Tejun Heo <tj@kernel.org>
+Date: Wed, 16 Sep 2015 11:51:12 -0400
+Subject: Revert "sched, cgroup: replace signal_struct->group_rwsem with a global percpu_rwsem"
+
+From: Tejun Heo <tj@kernel.org>
+
+commit 0c986253b939cc14c69d4adbe2b4121bdf4aa220 upstream.
+
+This reverts commit d59cfc09c32a2ae31f1c3bc2983a0cd79afb3f14.
+
+d59cfc09c32a ("sched, cgroup: replace signal_struct->group_rwsem with
+a global percpu_rwsem") and b5ba75b5fc0e ("cgroup: simplify
+threadgroup locking") changed how cgroup synchronizes against task
+fork and exits so that it uses global percpu_rwsem instead of
+per-process rwsem; unfortunately, the write [un]lock paths of
+percpu_rwsem always involve synchronize_rcu_expedited() which turned
+out to be too expensive.
+
+Improvements for percpu_rwsem are scheduled to be merged in the coming
+v4.4-rc1 merge window which alleviates this issue. For now, revert
+the two commits to restore per-process rwsem. They will be re-applied
+for the v4.4-rc1 merge window.
+
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Link: http://lkml.kernel.org/g/55F8097A.7000206@de.ibm.com
+Reported-by: Christian Borntraeger <borntraeger@de.ibm.com>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/cgroup-defs.h | 27 +--------------
+ include/linux/init_task.h | 8 ++++
+ include/linux/sched.h | 12 ++++++
+ kernel/cgroup.c | 77 ++++++++++++++++++++++++++++++++------------
+ kernel/fork.c | 4 ++
+ 5 files changed, 83 insertions(+), 45 deletions(-)
+
+--- a/include/linux/cgroup-defs.h
++++ b/include/linux/cgroup-defs.h
+@@ -463,31 +463,8 @@ struct cgroup_subsys {
+ unsigned int depends_on;
+ };
+
+-extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+-
+-/**
+- * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups
+- * @tsk: target task
+- *
+- * Called from threadgroup_change_begin() and allows cgroup operations to
+- * synchronize against threadgroup changes using a percpu_rw_semaphore.
+- */
+-static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
+-{
+- percpu_down_read(&cgroup_threadgroup_rwsem);
+-}
+-
+-/**
+- * cgroup_threadgroup_change_end - threadgroup exclusion for cgroups
+- * @tsk: target task
+- *
+- * Called from threadgroup_change_end(). Counterpart of
+- * cgroup_threadcgroup_change_begin().
+- */
+-static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
+-{
+- percpu_up_read(&cgroup_threadgroup_rwsem);
+-}
++void cgroup_threadgroup_change_begin(struct task_struct *tsk);
++void cgroup_threadgroup_change_end(struct task_struct *tsk);
+
+ #else /* CONFIG_CGROUPS */
+
+--- a/include/linux/init_task.h
++++ b/include/linux/init_task.h
+@@ -25,6 +25,13 @@
+ extern struct files_struct init_files;
+ extern struct fs_struct init_fs;
+
++#ifdef CONFIG_CGROUPS
++#define INIT_GROUP_RWSEM(sig) \
++ .group_rwsem = __RWSEM_INITIALIZER(sig.group_rwsem),
++#else
++#define INIT_GROUP_RWSEM(sig)
++#endif
++
+ #ifdef CONFIG_CPUSETS
+ #define INIT_CPUSET_SEQ(tsk) \
+ .mems_allowed_seq = SEQCNT_ZERO(tsk.mems_allowed_seq),
+@@ -48,6 +55,7 @@ extern struct fs_struct init_fs;
+ }, \
+ .cred_guard_mutex = \
+ __MUTEX_INITIALIZER(sig.cred_guard_mutex), \
++ INIT_GROUP_RWSEM(sig) \
+ }
+
+ extern struct nsproxy init_nsproxy;
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -754,6 +754,18 @@ struct signal_struct {
+ unsigned audit_tty_log_passwd;
+ struct tty_audit_buf *tty_audit_buf;
+ #endif
++#ifdef CONFIG_CGROUPS
++ /*
++ * group_rwsem prevents new tasks from entering the threadgroup and
++ * member tasks from exiting,a more specifically, setting of
++ * PF_EXITING. fork and exit paths are protected with this rwsem
++ * using threadgroup_change_begin/end(). Users which require
++ * threadgroup to remain stable should use threadgroup_[un]lock()
++ * which also takes care of exec path. Currently, cgroup is the
++ * only user.
++ */
++ struct rw_semaphore group_rwsem;
++#endif
+
+ oom_flags_t oom_flags;
+ short oom_score_adj; /* OOM kill score adjustment */
+--- a/kernel/cgroup.c
++++ b/kernel/cgroup.c
+@@ -46,7 +46,6 @@
+ #include <linux/slab.h>
+ #include <linux/spinlock.h>
+ #include <linux/rwsem.h>
+-#include <linux/percpu-rwsem.h>
+ #include <linux/string.h>
+ #include <linux/sort.h>
+ #include <linux/kmod.h>
+@@ -104,8 +103,6 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
+ */
+ static DEFINE_SPINLOCK(release_agent_path_lock);
+
+-struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+-
+ #define cgroup_assert_mutex_or_rcu_locked() \
+ rcu_lockdep_assert(rcu_read_lock_held() || \
+ lockdep_is_held(&cgroup_mutex), \
+@@ -870,6 +867,48 @@ static struct css_set *find_css_set(stru
+ return cset;
+ }
+
++void cgroup_threadgroup_change_begin(struct task_struct *tsk)
++{
++ down_read(&tsk->signal->group_rwsem);
++}
++
++void cgroup_threadgroup_change_end(struct task_struct *tsk)
++{
++ up_read(&tsk->signal->group_rwsem);
++}
++
++/**
++ * threadgroup_lock - lock threadgroup
++ * @tsk: member task of the threadgroup to lock
++ *
++ * Lock the threadgroup @tsk belongs to. No new task is allowed to enter
++ * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or
++ * change ->group_leader/pid. This is useful for cases where the threadgroup
++ * needs to stay stable across blockable operations.
++ *
++ * fork and exit explicitly call threadgroup_change_{begin|end}() for
++ * synchronization. While held, no new task will be added to threadgroup
++ * and no existing live task will have its PF_EXITING set.
++ *
++ * de_thread() does threadgroup_change_{begin|end}() when a non-leader
++ * sub-thread becomes a new leader.
++ */
++static void threadgroup_lock(struct task_struct *tsk)
++{
++ down_write(&tsk->signal->group_rwsem);
++}
++
++/**
++ * threadgroup_unlock - unlock threadgroup
++ * @tsk: member task of the threadgroup to unlock
++ *
++ * Reverse threadgroup_lock().
++ */
++static inline void threadgroup_unlock(struct task_struct *tsk)
++{
++ up_write(&tsk->signal->group_rwsem);
++}
++
+ static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
+ {
+ struct cgroup *root_cgrp = kf_root->kn->priv;
+@@ -2066,9 +2105,9 @@ static void cgroup_task_migrate(struct c
+ lockdep_assert_held(&css_set_rwsem);
+
+ /*
+- * We are synchronized through cgroup_threadgroup_rwsem against
+- * PF_EXITING setting such that we can't race against cgroup_exit()
+- * changing the css_set to init_css_set and dropping the old one.
++ * We are synchronized through threadgroup_lock() against PF_EXITING
++ * setting such that we can't race against cgroup_exit() changing the
++ * css_set to init_css_set and dropping the old one.
+ */
+ WARN_ON_ONCE(tsk->flags & PF_EXITING);
+ old_cset = task_css_set(tsk);
+@@ -2125,11 +2164,10 @@ static void cgroup_migrate_finish(struct
+ * @src_cset and add it to @preloaded_csets, which should later be cleaned
+ * up by cgroup_migrate_finish().
+ *
+- * This function may be called without holding cgroup_threadgroup_rwsem
+- * even if the target is a process. Threads may be created and destroyed
+- * but as long as cgroup_mutex is not dropped, no new css_set can be put
+- * into play and the preloaded css_sets are guaranteed to cover all
+- * migrations.
++ * This function may be called without holding threadgroup_lock even if the
++ * target is a process. Threads may be created and destroyed but as long
++ * as cgroup_mutex is not dropped, no new css_set can be put into play and
++ * the preloaded css_sets are guaranteed to cover all migrations.
+ */
+ static void cgroup_migrate_add_src(struct css_set *src_cset,
+ struct cgroup *dst_cgrp,
+@@ -2232,7 +2270,7 @@ err:
+ * @threadgroup: whether @leader points to the whole process or a single task
+ *
+ * Migrate a process or task denoted by @leader to @cgrp. If migrating a
+- * process, the caller must be holding cgroup_threadgroup_rwsem. The
++ * process, the caller must be holding threadgroup_lock of @leader. The
+ * caller is also responsible for invoking cgroup_migrate_add_src() and
+ * cgroup_migrate_prepare_dst() on the targets before invoking this
+ * function and following up with cgroup_migrate_finish().
+@@ -2360,7 +2398,7 @@ out_release_tset:
+ * @leader: the task or the leader of the threadgroup to be attached
+ * @threadgroup: attach the whole threadgroup?
+ *
+- * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
++ * Call holding cgroup_mutex and threadgroup_lock of @leader.
+ */
+ static int cgroup_attach_task(struct cgroup *dst_cgrp,
+ struct task_struct *leader, bool threadgroup)
+@@ -2482,7 +2520,7 @@ retry_find_task:
+ get_task_struct(tsk);
+ rcu_read_unlock();
+
+- percpu_down_write(&cgroup_threadgroup_rwsem);
++ threadgroup_lock(tsk);
+ if (threadgroup) {
+ if (!thread_group_leader(tsk)) {
+ /*
+@@ -2492,7 +2530,7 @@ retry_find_task:
+ * try again; this is
+ * "double-double-toil-and-trouble-check locking".
+ */
+- percpu_up_write(&cgroup_threadgroup_rwsem);
++ threadgroup_unlock(tsk);
+ put_task_struct(tsk);
+ goto retry_find_task;
+ }
+@@ -2502,7 +2540,7 @@ retry_find_task:
+ if (!ret)
+ ret = cgroup_attach_task(cgrp, tsk, threadgroup);
+
+- percpu_up_write(&cgroup_threadgroup_rwsem);
++ threadgroup_unlock(tsk);
+
+ put_task_struct(tsk);
+ out_unlock_cgroup:
+@@ -2705,17 +2743,17 @@ static int cgroup_update_dfl_csses(struc
+ goto out_finish;
+ last_task = task;
+
+- percpu_down_write(&cgroup_threadgroup_rwsem);
++ threadgroup_lock(task);
+ /* raced against de_thread() from another thread? */
+ if (!thread_group_leader(task)) {
+- percpu_up_write(&cgroup_threadgroup_rwsem);
++ threadgroup_unlock(task);
+ put_task_struct(task);
+ continue;
+ }
+
+ ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
+
+- percpu_up_write(&cgroup_threadgroup_rwsem);
++ threadgroup_unlock(task);
+ put_task_struct(task);
+
+ if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
+@@ -5034,7 +5072,6 @@ int __init cgroup_init(void)
+ unsigned long key;
+ int ssid, err;
+
+- BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
+
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -1146,6 +1146,10 @@ static int copy_signal(unsigned long clo
+ tty_audit_fork(sig);
+ sched_autogroup_fork(sig);
+
++#ifdef CONFIG_CGROUPS
++ init_rwsem(&sig->group_rwsem);
++#endif
++
+ sig->oom_score_adj = current->signal->oom_score_adj;
+ sig->oom_score_adj_min = current->signal->oom_score_adj_min;
+
--- /dev/null
+From adc0b7fbf6fe9967505c0254d9535ec7288186ae Mon Sep 17 00:00:00 2001
+From: Christian Borntraeger <borntraeger@de.ibm.com>
+Date: Mon, 28 Sep 2015 22:47:42 +0200
+Subject: s390/boot/decompression: disable floating point in decompressor
+
+From: Christian Borntraeger <borntraeger@de.ibm.com>
+
+commit adc0b7fbf6fe9967505c0254d9535ec7288186ae upstream.
+
+my gcc 5.1 used an ldgr instruction with a register != 0,2,4,6 for
+spilling/filling into a floating point register in our decompressor.
+
+This will cause an AFP-register data exception as the decompressor
+did not setup the additional floating point registers via cr0.
+That causes a program check loop that looked like a hang with
+one "Uncompressing Linux... " message (directly booted via kvm)
+or a loop of "Uncompressing Linux... " messages (when booted via
+zipl boot loader).
+
+The offending code in my build was
+
+ 48e400: e3 c0 af ff ff 71 lay %r12,-1(%r10)
+-->48e406: b3 c1 00 1c ldgr %f1,%r12
+ 48e40a: ec 6c 01 22 02 7f clij %r6,2,12,0x48e64e
+
+but gcc could do spilling into an fpr at any function. We can
+simply disable floating point support at that early stage.
+
+Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
+Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/s390/boot/compressed/Makefile | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/s390/boot/compressed/Makefile
++++ b/arch/s390/boot/compressed/Makefile
+@@ -10,7 +10,7 @@ targets += misc.o piggy.o sizes.h head.o
+
+ KBUILD_CFLAGS := -m64 -D__KERNEL__ $(LINUX_INCLUDE) -O2
+ KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
+-KBUILD_CFLAGS += $(cflags-y) -fno-delete-null-pointer-checks
++KBUILD_CFLAGS += $(cflags-y) -fno-delete-null-pointer-checks -msoft-float
+ KBUILD_CFLAGS += $(call cc-option,-mpacked-stack)
+ KBUILD_CFLAGS += $(call cc-option,-ffreestanding)
+
--- /dev/null
+From 8d4bd0ed0439dfc780aab801a085961925ed6838 Mon Sep 17 00:00:00 2001
+From: Martin Schwidefsky <schwidefsky@de.ibm.com>
+Date: Tue, 8 Sep 2015 15:25:39 +0200
+Subject: s390/compat: correct uc_sigmask of the compat signal frame
+
+From: Martin Schwidefsky <schwidefsky@de.ibm.com>
+
+commit 8d4bd0ed0439dfc780aab801a085961925ed6838 upstream.
+
+The uc_sigmask in the ucontext structure is an array of words to keep
+the 64 signal bits (or 1024 if you ask glibc but the kernel sigset_t
+only has 64 bits).
+
+For 64 bit the sigset_t contains a single 8 byte word, but for 31 bit
+there are two 4 byte words. The compat signal handler code uses a
+simple copy of the 64 bit sigset_t to the 31 bit compat_sigset_t.
+As s390 is a big-endian architecture this is incorrect, the two words
+in the 31 bit sigset_t array need to be swapped.
+
+Reported-by: Stefan Liebler <stli@linux.vnet.ibm.com>
+Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/s390/kernel/compat_signal.c | 27 +++++++++++++++++++++++----
+ 1 file changed, 23 insertions(+), 4 deletions(-)
+
+--- a/arch/s390/kernel/compat_signal.c
++++ b/arch/s390/kernel/compat_signal.c
+@@ -48,6 +48,19 @@ typedef struct
+ struct ucontext32 uc;
+ } rt_sigframe32;
+
++static inline void sigset_to_sigset32(unsigned long *set64,
++ compat_sigset_word *set32)
++{
++ set32[0] = (compat_sigset_word) set64[0];
++ set32[1] = (compat_sigset_word)(set64[0] >> 32);
++}
++
++static inline void sigset32_to_sigset(compat_sigset_word *set32,
++ unsigned long *set64)
++{
++ set64[0] = (unsigned long) set32[0] | ((unsigned long) set32[1] << 32);
++}
++
+ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
+ {
+ int err;
+@@ -303,10 +316,12 @@ COMPAT_SYSCALL_DEFINE0(sigreturn)
+ {
+ struct pt_regs *regs = task_pt_regs(current);
+ sigframe32 __user *frame = (sigframe32 __user *)regs->gprs[15];
++ compat_sigset_t cset;
+ sigset_t set;
+
+- if (__copy_from_user(&set.sig, &frame->sc.oldmask, _SIGMASK_COPY_SIZE32))
++ if (__copy_from_user(&cset.sig, &frame->sc.oldmask, _SIGMASK_COPY_SIZE32))
+ goto badframe;
++ sigset32_to_sigset(cset.sig, set.sig);
+ set_current_blocked(&set);
+ if (restore_sigregs32(regs, &frame->sregs))
+ goto badframe;
+@@ -323,10 +338,12 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn)
+ {
+ struct pt_regs *regs = task_pt_regs(current);
+ rt_sigframe32 __user *frame = (rt_sigframe32 __user *)regs->gprs[15];
++ compat_sigset_t cset;
+ sigset_t set;
+
+- if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
++ if (__copy_from_user(&cset, &frame->uc.uc_sigmask, sizeof(cset)))
+ goto badframe;
++ sigset32_to_sigset(cset.sig, set.sig);
+ set_current_blocked(&set);
+ if (compat_restore_altstack(&frame->uc.uc_stack))
+ goto badframe;
+@@ -397,7 +414,7 @@ static int setup_frame32(struct ksignal
+ return -EFAULT;
+
+ /* Create struct sigcontext32 on the signal stack */
+- memcpy(&sc.oldmask, &set->sig, _SIGMASK_COPY_SIZE32);
++ sigset_to_sigset32(set->sig, sc.oldmask);
+ sc.sregs = (__u32)(unsigned long __force) &frame->sregs;
+ if (__copy_to_user(&frame->sc, &sc, sizeof(frame->sc)))
+ return -EFAULT;
+@@ -458,6 +475,7 @@ static int setup_frame32(struct ksignal
+ static int setup_rt_frame32(struct ksignal *ksig, sigset_t *set,
+ struct pt_regs *regs)
+ {
++ compat_sigset_t cset;
+ rt_sigframe32 __user *frame;
+ unsigned long restorer;
+ size_t frame_size;
+@@ -505,11 +523,12 @@ static int setup_rt_frame32(struct ksign
+ store_sigregs();
+
+ /* Create ucontext on the signal stack. */
++ sigset_to_sigset32(set->sig, cset.sig);
+ if (__put_user(uc_flags, &frame->uc.uc_flags) ||
+ __put_user(0, &frame->uc.uc_link) ||
+ __compat_save_altstack(&frame->uc.uc_stack, regs->gprs[15]) ||
+ save_sigregs32(regs, &frame->uc.uc_mcontext) ||
+- __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)) ||
++ __copy_to_user(&frame->uc.uc_sigmask, &cset, sizeof(cset)) ||
+ save_sigregs_ext32(regs, &frame->uc.uc_mcontext_ext))
+ return -EFAULT;
+
--- /dev/null
+From 95913d97914f44db2b81271c2e2ebd4d2ac2df83 Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Tue, 29 Sep 2015 14:45:09 +0200
+Subject: sched/core: Fix TASK_DEAD race in finish_task_switch()
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit 95913d97914f44db2b81271c2e2ebd4d2ac2df83 upstream.
+
+So the problem this patch is trying to address is as follows:
+
+ CPU0 CPU1
+
+ context_switch(A, B)
+ ttwu(A)
+ LOCK A->pi_lock
+ A->on_cpu == 0
+ finish_task_switch(A)
+ prev_state = A->state <-.
+ WMB |
+ A->on_cpu = 0; |
+ UNLOCK rq0->lock |
+ | context_switch(C, A)
+ `-- A->state = TASK_DEAD
+ prev_state == TASK_DEAD
+ put_task_struct(A)
+ context_switch(A, C)
+ finish_task_switch(A)
+ A->state == TASK_DEAD
+ put_task_struct(A)
+
+The argument being that the WMB will allow the load of A->state on CPU0
+to cross over and observe CPU1's store of A->state, which will then
+result in a double-drop and use-after-free.
+
+Now the comment states (and this was true once upon a long time ago)
+that we need to observe A->state while holding rq->lock because that
+will order us against the wakeup; however the wakeup will not in fact
+acquire (that) rq->lock; it takes A->pi_lock these days.
+
+We can obviously fix this by upgrading the WMB to an MB, but that is
+expensive, so we'd rather avoid that.
+
+The alternative this patch takes is: smp_store_release(&A->on_cpu, 0),
+which avoids the MB on some archs, but not important ones like ARM.
+
+Reported-by: Oleg Nesterov <oleg@redhat.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: linux-kernel@vger.kernel.org
+Cc: manfred@colorfullife.com
+Cc: will.deacon@arm.com
+Fixes: e4a52bcb9a18 ("sched: Remove rq->lock from the first half of ttwu()")
+Link: http://lkml.kernel.org/r/20150929124509.GG3816@twins.programming.kicks-ass.net
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/core.c | 10 +++++-----
+ kernel/sched/sched.h | 5 +++--
+ 2 files changed, 8 insertions(+), 7 deletions(-)
+
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -2461,11 +2461,11 @@ static struct rq *finish_task_switch(str
+ * If a task dies, then it sets TASK_DEAD in tsk->state and calls
+ * schedule one last time. The schedule call will never return, and
+ * the scheduled task must drop that reference.
+- * The test for TASK_DEAD must occur while the runqueue locks are
+- * still held, otherwise prev could be scheduled on another cpu, die
+- * there before we look at prev->state, and then the reference would
+- * be dropped twice.
+- * Manfred Spraul <manfred@colorfullife.com>
++ *
++ * We must observe prev->state before clearing prev->on_cpu (in
++ * finish_lock_switch), otherwise a concurrent wakeup can get prev
++ * running on another CPU and we could rave with its RUNNING -> DEAD
++ * transition, resulting in a double drop.
+ */
+ prev_state = prev->state;
+ vtime_task_switch(prev);
+--- a/kernel/sched/sched.h
++++ b/kernel/sched/sched.h
+@@ -1091,9 +1091,10 @@ static inline void finish_lock_switch(st
+ * After ->on_cpu is cleared, the task can be moved to a different CPU.
+ * We must ensure this doesn't happen until the switch is completely
+ * finished.
++ *
++ * Pairs with the control dependency and rmb in try_to_wake_up().
+ */
+- smp_wmb();
+- prev->on_cpu = 0;
++ smp_store_release(&prev->on_cpu, 0);
+ #endif
+ #ifdef CONFIG_DEBUG_SPINLOCK
+ /* this is a valid case when another task releases the spinlock */
x86-pci-intel_mid_pci-work-around-for-irq0-assignment.patch
x86-paravirt-replace-the-paravirt-nop-with-a-bona-fide-empty-function.patch
x86-nmi-64-fix-a-paravirt-stack-clobbering-bug-in-the-nmi-code.patch
+use-warn_on_once-for-missing-x86_feature_nrips.patch
+x86-efi-fix-boot-crash-by-mapping-efi-memmap-entries-bottom-up-at-runtime-instead-of-top-down.patch
+x86-kexec-fix-kexec-crash-in-syscall-kexec_file_load.patch
+x86-process-add-proper-bound-checks-in-64bit-get_wchan.patch
+x86-mm-set-nx-on-gap-between-__ex_table-and-rodata.patch
+x86-xen-support-kexec-kdump-in-hvm-guests-by-doing-a-soft-reset.patch
+leds-lp55xx-correct-kconfig-dependency-for-f-w-user-helper.patch
+leds-led-class-add-missing-put_device.patch
+sched-core-fix-task_dead-race-in-finish_task_switch.patch
+s390-compat-correct-uc_sigmask-of-the-compat-signal-frame.patch
+s390-boot-decompression-disable-floating-point-in-decompressor.patch
+revert-cgroup-simplify-threadgroup-locking.patch
+revert-sched-cgroup-replace-signal_struct-group_rwsem-with-a-global-percpu_rwsem.patch
+memcg-make-mem_cgroup_read_stat-unsigned.patch
--- /dev/null
+From d2922422c48df93f3edff7d872ee4f3191fefb08 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Dirk=20M=C3=BCller?= <dmueller@suse.com>
+Date: Thu, 1 Oct 2015 13:43:42 +0200
+Subject: Use WARN_ON_ONCE for missing X86_FEATURE_NRIPS
+
+From: =?UTF-8?q?Dirk=20M=C3=BCller?= <dmueller@suse.com>
+
+commit d2922422c48df93f3edff7d872ee4f3191fefb08 upstream.
+
+The cpu feature flags are not ever going to change, so warning
+every time can cause a lot of kernel log spam
+(in our case more than 10GB/hour).
+
+The warning seems to only occur when nested virtualization is
+enabled, so it's probably triggered by a KVM bug. This is a
+sensible and safe change anyway, and the KVM bug fix might not
+be suitable for stable releases anyway.
+
+Signed-off-by: Dirk Mueller <dmueller@suse.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/svm.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/svm.c
++++ b/arch/x86/kvm/svm.c
+@@ -513,7 +513,7 @@ static void skip_emulated_instruction(st
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (svm->vmcb->control.next_rip != 0) {
+- WARN_ON(!static_cpu_has(X86_FEATURE_NRIPS));
++ WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
+ svm->next_rip = svm->vmcb->control.next_rip;
+ }
+
--- /dev/null
+From a5caa209ba9c29c6421292e7879d2387a2ef39c9 Mon Sep 17 00:00:00 2001
+From: Matt Fleming <matt.fleming@intel.com>
+Date: Fri, 25 Sep 2015 23:02:18 +0100
+Subject: x86/efi: Fix boot crash by mapping EFI memmap entries bottom-up at runtime, instead of top-down
+
+From: Matt Fleming <matt.fleming@intel.com>
+
+commit a5caa209ba9c29c6421292e7879d2387a2ef39c9 upstream.
+
+Beginning with UEFI v2.5 EFI_PROPERTIES_TABLE was introduced
+that signals that the firmware PE/COFF loader supports splitting
+code and data sections of PE/COFF images into separate EFI
+memory map entries. This allows the kernel to map those regions
+with strict memory protections, e.g. EFI_MEMORY_RO for code,
+EFI_MEMORY_XP for data, etc.
+
+Unfortunately, an unwritten requirement of this new feature is
+that the regions need to be mapped with the same offsets
+relative to each other as observed in the EFI memory map. If
+this is not done crashes like this may occur,
+
+ BUG: unable to handle kernel paging request at fffffffefe6086dd
+ IP: [<fffffffefe6086dd>] 0xfffffffefe6086dd
+ Call Trace:
+ [<ffffffff8104c90e>] efi_call+0x7e/0x100
+ [<ffffffff81602091>] ? virt_efi_set_variable+0x61/0x90
+ [<ffffffff8104c583>] efi_delete_dummy_variable+0x63/0x70
+ [<ffffffff81f4e4aa>] efi_enter_virtual_mode+0x383/0x392
+ [<ffffffff81f37e1b>] start_kernel+0x38a/0x417
+ [<ffffffff81f37495>] x86_64_start_reservations+0x2a/0x2c
+ [<ffffffff81f37582>] x86_64_start_kernel+0xeb/0xef
+
+Here 0xfffffffefe6086dd refers to an address the firmware
+expects to be mapped but which the OS never claimed was mapped.
+The issue is that included in these regions are relative
+addresses to other regions which were emitted by the firmware
+toolchain before the "splitting" of sections occurred at
+runtime.
+
+Needless to say, we don't satisfy this unwritten requirement on
+x86_64 and instead map the EFI memory map entries in reverse
+order. The above crash is almost certainly triggerable with any
+kernel newer than v3.13 because that's when we rewrote the EFI
+runtime region mapping code, in commit d2f7cbe7b26a ("x86/efi:
+Runtime services virtual mapping"). For kernel versions before
+v3.13 things may work by pure luck depending on the
+fragmentation of the kernel virtual address space at the time we
+map the EFI regions.
+
+Instead of mapping the EFI memory map entries in reverse order,
+where entry N has a higher virtual address than entry N+1, map
+them in the same order as they appear in the EFI memory map to
+preserve this relative offset between regions.
+
+This patch has been kept as small as possible with the intention
+that it should be applied aggressively to stable and
+distribution kernels. It is very much a bugfix rather than
+support for a new feature, since when EFI_PROPERTIES_TABLE is
+enabled we must map things as outlined above to even boot - we
+have no way of asking the firmware not to split the code/data
+regions.
+
+In fact, this patch doesn't even make use of the more strict
+memory protections available in UEFI v2.5. That will come later.
+
+Suggested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+Reported-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+Signed-off-by: Matt Fleming <matt.fleming@intel.com>
+Cc: Borislav Petkov <bp@suse.de>
+Cc: Chun-Yi <jlee@suse.com>
+Cc: Dave Young <dyoung@redhat.com>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: James Bottomley <JBottomley@Odin.com>
+Cc: Lee, Chun-Yi <jlee@suse.com>
+Cc: Leif Lindholm <leif.lindholm@linaro.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Matthew Garrett <mjg59@srcf.ucam.org>
+Cc: Mike Galbraith <efault@gmx.de>
+Cc: Peter Jones <pjones@redhat.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: linux-kernel@vger.kernel.org
+Link: http://lkml.kernel.org/r/1443218539-7610-2-git-send-email-matt@codeblueprint.co.uk
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/platform/efi/efi.c | 67 +++++++++++++++++++++++++++++++++++++++++++-
+ 1 file changed, 66 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/platform/efi/efi.c
++++ b/arch/x86/platform/efi/efi.c
+@@ -705,6 +705,70 @@ out:
+ }
+
+ /*
++ * Iterate the EFI memory map in reverse order because the regions
++ * will be mapped top-down. The end result is the same as if we had
++ * mapped things forward, but doesn't require us to change the
++ * existing implementation of efi_map_region().
++ */
++static inline void *efi_map_next_entry_reverse(void *entry)
++{
++ /* Initial call */
++ if (!entry)
++ return memmap.map_end - memmap.desc_size;
++
++ entry -= memmap.desc_size;
++ if (entry < memmap.map)
++ return NULL;
++
++ return entry;
++}
++
++/*
++ * efi_map_next_entry - Return the next EFI memory map descriptor
++ * @entry: Previous EFI memory map descriptor
++ *
++ * This is a helper function to iterate over the EFI memory map, which
++ * we do in different orders depending on the current configuration.
++ *
++ * To begin traversing the memory map @entry must be %NULL.
++ *
++ * Returns %NULL when we reach the end of the memory map.
++ */
++static void *efi_map_next_entry(void *entry)
++{
++ if (!efi_enabled(EFI_OLD_MEMMAP) && efi_enabled(EFI_64BIT)) {
++ /*
++ * Starting in UEFI v2.5 the EFI_PROPERTIES_TABLE
++ * config table feature requires us to map all entries
++ * in the same order as they appear in the EFI memory
++ * map. That is to say, entry N must have a lower
++ * virtual address than entry N+1. This is because the
++ * firmware toolchain leaves relative references in
++ * the code/data sections, which are split and become
++ * separate EFI memory regions. Mapping things
++ * out-of-order leads to the firmware accessing
++ * unmapped addresses.
++ *
++ * Since we need to map things this way whether or not
++ * the kernel actually makes use of
++ * EFI_PROPERTIES_TABLE, let's just switch to this
++ * scheme by default for 64-bit.
++ */
++ return efi_map_next_entry_reverse(entry);
++ }
++
++ /* Initial call */
++ if (!entry)
++ return memmap.map;
++
++ entry += memmap.desc_size;
++ if (entry >= memmap.map_end)
++ return NULL;
++
++ return entry;
++}
++
++/*
+ * Map the efi memory ranges of the runtime services and update new_mmap with
+ * virtual addresses.
+ */
+@@ -714,7 +778,8 @@ static void * __init efi_map_regions(int
+ unsigned long left = 0;
+ efi_memory_desc_t *md;
+
+- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
++ p = NULL;
++ while ((p = efi_map_next_entry(p))) {
+ md = p;
+ if (!(md->attribute & EFI_MEMORY_RUNTIME)) {
+ #ifdef CONFIG_X86_64
--- /dev/null
+From e3c41e37b0f4b18cbd4dac76cbeece5a7558b909 Mon Sep 17 00:00:00 2001
+From: "Lee, Chun-Yi" <joeyli.kernel@gmail.com>
+Date: Tue, 29 Sep 2015 20:58:57 +0800
+Subject: x86/kexec: Fix kexec crash in syscall kexec_file_load()
+
+From: "Lee, Chun-Yi" <joeyli.kernel@gmail.com>
+
+commit e3c41e37b0f4b18cbd4dac76cbeece5a7558b909 upstream.
+
+The original bug is a page fault crash that sometimes happens
+on big machines when preparing ELF headers:
+
+ BUG: unable to handle kernel paging request at ffffc90613fc9000
+ IP: [<ffffffff8103d645>] prepare_elf64_ram_headers_callback+0x165/0x260
+
+The bug is caused by us under-counting the number of memory ranges
+and subsequently not allocating enough ELF header space for them.
+The bug is typically masked on smaller systems, because the ELF header
+allocation is rounded up to the next page.
+
+This patch modifies the code in fill_up_crash_elf_data() by using
+walk_system_ram_res() instead of walk_system_ram_range() to correctly
+count the max number of crash memory ranges. That's because the
+walk_system_ram_range() filters out small memory regions that
+reside in the same page, but walk_system_ram_res() does not.
+
+Here's how I found the bug:
+
+After tracing prepare_elf64_headers() and prepare_elf64_ram_headers_callback(),
+the code uses walk_system_ram_res() to fill-in crash memory regions information
+to the program header, so it counts those small memory regions that
+reside in a page area.
+
+But, when the kernel was using walk_system_ram_range() in
+fill_up_crash_elf_data() to count the number of crash memory regions,
+it filters out small regions.
+
+I printed those small memory regions, for example:
+
+ kexec: Get nr_ram ranges. vaddr=0xffff880077592258 paddr=0x77592258, sz=0xdc0
+
+Based on the code in walk_system_ram_range(), this memory region
+will be filtered out:
+
+ pfn = (0x77592258 + 0x1000 - 1) >> 12 = 0x77593
+ end_pfn = (0x77592258 + 0xfc0 -1 + 1) >> 12 = 0x77593
+ end_pfn - pfn = 0x77593 - 0x77593 = 0 <=== if (end_pfn > pfn) is FALSE
+
+So, the max_nr_ranges that's counted by the kernel doesn't include
+small memory regions - causing us to under-allocate the required space.
+That causes the page fault crash that happens in a later code path
+when preparing ELF headers.
+
+This bug is not easy to reproduce on small machines that have few
+CPUs, because the allocated page aligned ELF buffer has more free
+space to cover those small memory regions' PT_LOAD headers.
+
+Signed-off-by: Lee, Chun-Yi <jlee@suse.com>
+Cc: Andy Lutomirski <luto@kernel.org>
+Cc: Baoquan He <bhe@redhat.com>
+Cc: Jiang Liu <jiang.liu@linux.intel.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mike Galbraith <efault@gmx.de>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Stephen Rothwell <sfr@canb.auug.org.au>
+Cc: Takashi Iwai <tiwai@suse.de>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Viresh Kumar <viresh.kumar@linaro.org>
+Cc: Vivek Goyal <vgoyal@redhat.com>
+Cc: kexec@lists.infradead.org
+Cc: linux-kernel@vger.kernel.org
+Link: http://lkml.kernel.org/r/1443531537-29436-1-git-send-email-jlee@suse.com
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kernel/crash.c | 7 +++----
+ 1 file changed, 3 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/kernel/crash.c
++++ b/arch/x86/kernel/crash.c
+@@ -185,10 +185,9 @@ void native_machine_crash_shutdown(struc
+ }
+
+ #ifdef CONFIG_KEXEC_FILE
+-static int get_nr_ram_ranges_callback(unsigned long start_pfn,
+- unsigned long nr_pfn, void *arg)
++static int get_nr_ram_ranges_callback(u64 start, u64 end, void *arg)
+ {
+- int *nr_ranges = arg;
++ unsigned int *nr_ranges = arg;
+
+ (*nr_ranges)++;
+ return 0;
+@@ -214,7 +213,7 @@ static void fill_up_crash_elf_data(struc
+
+ ced->image = image;
+
+- walk_system_ram_range(0, -1, &nr_ranges,
++ walk_system_ram_res(0, -1, &nr_ranges,
+ get_nr_ram_ranges_callback);
+
+ ced->max_nr_ranges = nr_ranges;
--- /dev/null
+From ab76f7b4ab2397ffdd2f1eb07c55697d19991d10 Mon Sep 17 00:00:00 2001
+From: Stephen Smalley <sds@tycho.nsa.gov>
+Date: Thu, 1 Oct 2015 09:04:22 -0400
+Subject: x86/mm: Set NX on gap between __ex_table and rodata
+
+From: Stephen Smalley <sds@tycho.nsa.gov>
+
+commit ab76f7b4ab2397ffdd2f1eb07c55697d19991d10 upstream.
+
+Unused space between the end of __ex_table and the start of
+rodata can be left W+x in the kernel page tables. Extend the
+setting of the NX bit to cover this gap by starting from
+text_end rather than rodata_start.
+
+ Before:
+ ---[ High Kernel Mapping ]---
+ 0xffffffff80000000-0xffffffff81000000 16M pmd
+ 0xffffffff81000000-0xffffffff81600000 6M ro PSE GLB x pmd
+ 0xffffffff81600000-0xffffffff81754000 1360K ro GLB x pte
+ 0xffffffff81754000-0xffffffff81800000 688K RW GLB x pte
+ 0xffffffff81800000-0xffffffff81a00000 2M ro PSE GLB NX pmd
+ 0xffffffff81a00000-0xffffffff81b3b000 1260K ro GLB NX pte
+ 0xffffffff81b3b000-0xffffffff82000000 4884K RW GLB NX pte
+ 0xffffffff82000000-0xffffffff82200000 2M RW PSE GLB NX pmd
+ 0xffffffff82200000-0xffffffffa0000000 478M pmd
+
+ After:
+ ---[ High Kernel Mapping ]---
+ 0xffffffff80000000-0xffffffff81000000 16M pmd
+ 0xffffffff81000000-0xffffffff81600000 6M ro PSE GLB x pmd
+ 0xffffffff81600000-0xffffffff81754000 1360K ro GLB x pte
+ 0xffffffff81754000-0xffffffff81800000 688K RW GLB NX pte
+ 0xffffffff81800000-0xffffffff81a00000 2M ro PSE GLB NX pmd
+ 0xffffffff81a00000-0xffffffff81b3b000 1260K ro GLB NX pte
+ 0xffffffff81b3b000-0xffffffff82000000 4884K RW GLB NX pte
+ 0xffffffff82000000-0xffffffff82200000 2M RW PSE GLB NX pmd
+ 0xffffffff82200000-0xffffffffa0000000 478M pmd
+
+Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov>
+Acked-by: Kees Cook <keescook@chromium.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mike Galbraith <efault@gmx.de>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: linux-kernel@vger.kernel.org
+Link: http://lkml.kernel.org/r/1443704662-3138-1-git-send-email-sds@tycho.nsa.gov
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/mm/init_64.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/x86/mm/init_64.c
++++ b/arch/x86/mm/init_64.c
+@@ -1132,7 +1132,7 @@ void mark_rodata_ro(void)
+ * has been zapped already via cleanup_highmem().
+ */
+ all_end = roundup((unsigned long)_brk_end, PMD_SIZE);
+- set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT);
++ set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT);
+
+ rodata_test();
+
--- /dev/null
+From eddd3826a1a0190e5235703d1e666affa4d13b96 Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed, 30 Sep 2015 08:38:22 +0000
+Subject: x86/process: Add proper bound checks in 64bit get_wchan()
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit eddd3826a1a0190e5235703d1e666affa4d13b96 upstream.
+
+Dmitry Vyukov reported the following using trinity and the memory
+error detector AddressSanitizer
+(https://code.google.com/p/address-sanitizer/wiki/AddressSanitizerForKernel).
+
+[ 124.575597] ERROR: AddressSanitizer: heap-buffer-overflow on
+address ffff88002e280000
+[ 124.576801] ffff88002e280000 is located 131938492886538 bytes to
+the left of 28857600-byte region [ffffffff81282e0a, ffffffff82e0830a)
+[ 124.578633] Accessed by thread T10915:
+[ 124.579295] inlined in describe_heap_address
+./arch/x86/mm/asan/report.c:164
+[ 124.579295] #0 ffffffff810dd277 in asan_report_error
+./arch/x86/mm/asan/report.c:278
+[ 124.580137] #1 ffffffff810dc6a0 in asan_check_region
+./arch/x86/mm/asan/asan.c:37
+[ 124.581050] #2 ffffffff810dd423 in __tsan_read8 ??:0
+[ 124.581893] #3 ffffffff8107c093 in get_wchan
+./arch/x86/kernel/process_64.c:444
+
+The address checks in the 64bit implementation of get_wchan() are
+wrong in several ways:
+
+ - The lower bound of the stack is not the start of the stack
+ page. It's the start of the stack page plus sizeof (struct
+ thread_info)
+
+ - The upper bound must be:
+
+ top_of_stack - TOP_OF_KERNEL_STACK_PADDING - 2 * sizeof(unsigned long).
+
+ The 2 * sizeof(unsigned long) is required because the stack pointer
+ points at the frame pointer. The layout on the stack is: ... IP FP
+ ... IP FP. So we need to make sure that both IP and FP are in the
+ bounds.
+
+Fix the bound checks and get rid of the mix of numeric constants, u64
+and unsigned long. Making all unsigned long allows us to use the same
+function for 32bit as well.
+
+Use READ_ONCE() when accessing the stack. This does not prevent a
+concurrent wakeup of the task and the stack changing, but at least it
+avoids TOCTOU.
+
+Also check task state at the end of the loop. Again that does not
+prevent concurrent changes, but it avoids walking for nothing.
+
+Add proper comments while at it.
+
+Reported-by: Dmitry Vyukov <dvyukov@google.com>
+Reported-by: Sasha Levin <sasha.levin@oracle.com>
+Based-on-patch-from: Wolfram Gloger <wmglo@dent.med.uni-muenchen.de>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov <bp@alien8.de>
+Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
+Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+Cc: Andy Lutomirski <luto@amacapital.net>
+Cc: Andrey Konovalov <andreyknvl@google.com>
+Cc: Kostya Serebryany <kcc@google.com>
+Cc: Alexander Potapenko <glider@google.com>
+Cc: kasan-dev <kasan-dev@googlegroups.com>
+Cc: Denys Vlasenko <dvlasenk@redhat.com>
+Cc: Andi Kleen <ak@linux.intel.com>
+Cc: Wolfram Gloger <wmglo@dent.med.uni-muenchen.de>
+Link: http://lkml.kernel.org/r/20150930083302.694788319@linutronix.de
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kernel/process_64.c | 52 ++++++++++++++++++++++++++++++++++---------
+ 1 file changed, 42 insertions(+), 10 deletions(-)
+
+--- a/arch/x86/kernel/process_64.c
++++ b/arch/x86/kernel/process_64.c
+@@ -497,27 +497,59 @@ void set_personality_ia32(bool x32)
+ }
+ EXPORT_SYMBOL_GPL(set_personality_ia32);
+
++/*
++ * Called from fs/proc with a reference on @p to find the function
++ * which called into schedule(). This needs to be done carefully
++ * because the task might wake up and we might look at a stack
++ * changing under us.
++ */
+ unsigned long get_wchan(struct task_struct *p)
+ {
+- unsigned long stack;
+- u64 fp, ip;
++ unsigned long start, bottom, top, sp, fp, ip;
+ int count = 0;
+
+ if (!p || p == current || p->state == TASK_RUNNING)
+ return 0;
+- stack = (unsigned long)task_stack_page(p);
+- if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
++
++ start = (unsigned long)task_stack_page(p);
++ if (!start)
+ return 0;
+- fp = *(u64 *)(p->thread.sp);
++
++ /*
++ * Layout of the stack page:
++ *
++ * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long)
++ * PADDING
++ * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING
++ * stack
++ * ----------- bottom = start + sizeof(thread_info)
++ * thread_info
++ * ----------- start
++ *
++ * The tasks stack pointer points at the location where the
++ * framepointer is stored. The data on the stack is:
++ * ... IP FP ... IP FP
++ *
++ * We need to read FP and IP, so we need to adjust the upper
++ * bound by another unsigned long.
++ */
++ top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
++ top -= 2 * sizeof(unsigned long);
++ bottom = start + sizeof(struct thread_info);
++
++ sp = READ_ONCE(p->thread.sp);
++ if (sp < bottom || sp > top)
++ return 0;
++
++ fp = READ_ONCE(*(unsigned long *)sp);
+ do {
+- if (fp < (unsigned long)stack ||
+- fp >= (unsigned long)stack+THREAD_SIZE)
++ if (fp < bottom || fp > top)
+ return 0;
+- ip = *(u64 *)(fp+8);
++ ip = READ_ONCE(*(unsigned long *)(fp + sizeof(unsigned long)));
+ if (!in_sched_functions(ip))
+ return ip;
+- fp = *(u64 *)fp;
+- } while (count++ < 16);
++ fp = READ_ONCE(*(unsigned long *)fp);
++ } while (count++ < 16 && p->state != TASK_RUNNING);
+ return 0;
+ }
+
--- /dev/null
+From 0b34a166f291d255755be46e43ed5497cdd194f2 Mon Sep 17 00:00:00 2001
+From: Vitaly Kuznetsov <vkuznets@redhat.com>
+Date: Fri, 25 Sep 2015 11:59:52 +0200
+Subject: x86/xen: Support kexec/kdump in HVM guests by doing a soft reset
+
+From: Vitaly Kuznetsov <vkuznets@redhat.com>
+
+commit 0b34a166f291d255755be46e43ed5497cdd194f2 upstream.
+
+Currently there is a number of issues preventing PVHVM Xen guests from
+doing successful kexec/kdump:
+
+ - Bound event channels.
+ - Registered vcpu_info.
+ - PIRQ/emuirq mappings.
+ - shared_info frame after XENMAPSPACE_shared_info operation.
+ - Active grant mappings.
+
+Basically, newly booted kernel stumbles upon already set up Xen
+interfaces and there is no way to reestablish them. In Xen-4.7 a new
+feature called 'soft reset' is coming. A guest performing kexec/kdump
+operation is supposed to call SCHEDOP_shutdown hypercall with
+SHUTDOWN_soft_reset reason before jumping to new kernel. Hypervisor
+(with some help from toolstack) will do full domain cleanup (but
+keeping its memory and vCPU contexts intact) returning the guest to
+the state it had when it was first booted and thus allowing it to
+start over.
+
+Doing SHUTDOWN_soft_reset on Xen hypervisors which don't support it is
+probably OK as by default all unknown shutdown reasons cause domain
+destroy with a message in toolstack log: 'Unknown shutdown reason code
+5. Destroying domain.' which gives a clue to what the problem is and
+eliminates false expectations.
+
+Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
+Signed-off-by: David Vrabel <david.vrabel@citrix.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/xen/enlighten.c | 23 +++++++++++++++++++++++
+ include/xen/interface/sched.h | 8 ++++++++
+ 2 files changed, 31 insertions(+)
+
+--- a/arch/x86/xen/enlighten.c
++++ b/arch/x86/xen/enlighten.c
+@@ -33,6 +33,10 @@
+ #include <linux/memblock.h>
+ #include <linux/edd.h>
+
++#ifdef CONFIG_KEXEC_CORE
++#include <linux/kexec.h>
++#endif
++
+ #include <xen/xen.h>
+ #include <xen/events.h>
+ #include <xen/interface/xen.h>
+@@ -1800,6 +1804,21 @@ static struct notifier_block xen_hvm_cpu
+ .notifier_call = xen_hvm_cpu_notify,
+ };
+
++#ifdef CONFIG_KEXEC_CORE
++static void xen_hvm_shutdown(void)
++{
++ native_machine_shutdown();
++ if (kexec_in_progress)
++ xen_reboot(SHUTDOWN_soft_reset);
++}
++
++static void xen_hvm_crash_shutdown(struct pt_regs *regs)
++{
++ native_machine_crash_shutdown(regs);
++ xen_reboot(SHUTDOWN_soft_reset);
++}
++#endif
++
+ static void __init xen_hvm_guest_init(void)
+ {
+ if (xen_pv_domain())
+@@ -1819,6 +1838,10 @@ static void __init xen_hvm_guest_init(vo
+ x86_init.irqs.intr_init = xen_init_IRQ;
+ xen_hvm_init_time_ops();
+ xen_hvm_init_mmu_ops();
++#ifdef CONFIG_KEXEC_CORE
++ machine_ops.shutdown = xen_hvm_shutdown;
++ machine_ops.crash_shutdown = xen_hvm_crash_shutdown;
++#endif
+ }
+ #endif
+
+--- a/include/xen/interface/sched.h
++++ b/include/xen/interface/sched.h
+@@ -107,5 +107,13 @@ struct sched_watchdog {
+ #define SHUTDOWN_suspend 2 /* Clean up, save suspend info, kill. */
+ #define SHUTDOWN_crash 3 /* Tell controller we've crashed. */
+ #define SHUTDOWN_watchdog 4 /* Restart because watchdog time expired. */
++/*
++ * Domain asked to perform 'soft reset' for it. The expected behavior is to
++ * reset internal Xen state for the domain returning it to the point where it
++ * was created but leaving the domain's memory contents and vCPU contexts
++ * intact. This will allow the domain to start over and set up all Xen specific
++ * interfaces again.
++ */
++#define SHUTDOWN_soft_reset 5
+
+ #endif /* __XEN_PUBLIC_SCHED_H__ */