]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
3.12-stable patches
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 6 Feb 2014 23:24:35 +0000 (15:24 -0800)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 6 Feb 2014 23:24:35 +0000 (15:24 -0800)
added patches:
arch-sh-kernel-kgdb.c-add-missing-include-linux-sched.h.patch
audit-correct-a-type-mismatch-in-audit_syscall_exit.patch
audit-reset-audit-backlog-wait-time-after-error-recovery.patch
ftrace-fix-synchronization-location-disabling-and-freeing-ftrace_ops.patch
ftrace-have-function-graph-only-trace-based-on-global_ops-filters.patch
fuse-fix-pipe_buf_operations.patch
intel-iommu-fix-off-by-one-in-pagetable-freeing.patch
mm-memcg-iteration-skip-memcgs-not-yet-fully-initialized.patch
mm-memory-failure.c-shift-page-lock-from-head-page-to-tail-page-after-thp-split.patch
mm-page-writeback.c-do-not-count-anon-pages-as-dirtyable-memory.patch
mm-page-writeback.c-fix-dirty_balance_reserve-subtraction-from-dirtyable-memory.patch
revert-eisa-initialize-device-before-its-resources.patch
selinux-fix-memory-leak-upon-loading-policy.patch
tracing-check-if-tracing-is-enabled-in-trace_puts.patch
tracing-have-trace-buffer-point-back-to-trace_array.patch
xen-pvhvm-if-xen_platform_pci-0-is-set-don-t-blow-up-v4.patch

17 files changed:
queue-3.12/arch-sh-kernel-kgdb.c-add-missing-include-linux-sched.h.patch [new file with mode: 0644]
queue-3.12/audit-correct-a-type-mismatch-in-audit_syscall_exit.patch [new file with mode: 0644]
queue-3.12/audit-reset-audit-backlog-wait-time-after-error-recovery.patch [new file with mode: 0644]
queue-3.12/ftrace-fix-synchronization-location-disabling-and-freeing-ftrace_ops.patch [new file with mode: 0644]
queue-3.12/ftrace-have-function-graph-only-trace-based-on-global_ops-filters.patch [new file with mode: 0644]
queue-3.12/fuse-fix-pipe_buf_operations.patch [new file with mode: 0644]
queue-3.12/intel-iommu-fix-off-by-one-in-pagetable-freeing.patch [new file with mode: 0644]
queue-3.12/mm-memcg-iteration-skip-memcgs-not-yet-fully-initialized.patch [new file with mode: 0644]
queue-3.12/mm-memory-failure.c-shift-page-lock-from-head-page-to-tail-page-after-thp-split.patch [new file with mode: 0644]
queue-3.12/mm-page-writeback.c-do-not-count-anon-pages-as-dirtyable-memory.patch [new file with mode: 0644]
queue-3.12/mm-page-writeback.c-fix-dirty_balance_reserve-subtraction-from-dirtyable-memory.patch [new file with mode: 0644]
queue-3.12/revert-eisa-initialize-device-before-its-resources.patch [new file with mode: 0644]
queue-3.12/selinux-fix-memory-leak-upon-loading-policy.patch [new file with mode: 0644]
queue-3.12/series [new file with mode: 0644]
queue-3.12/tracing-check-if-tracing-is-enabled-in-trace_puts.patch [new file with mode: 0644]
queue-3.12/tracing-have-trace-buffer-point-back-to-trace_array.patch [new file with mode: 0644]
queue-3.12/xen-pvhvm-if-xen_platform_pci-0-is-set-don-t-blow-up-v4.patch [new file with mode: 0644]

diff --git a/queue-3.12/arch-sh-kernel-kgdb.c-add-missing-include-linux-sched.h.patch b/queue-3.12/arch-sh-kernel-kgdb.c-add-missing-include-linux-sched.h.patch
new file mode 100644 (file)
index 0000000..27a040e
--- /dev/null
@@ -0,0 +1,43 @@
+From 53a52f17d96c8d47c79a7dafa81426317e89c7c1 Mon Sep 17 00:00:00 2001
+From: Wanlong Gao <gaowanlong@cn.fujitsu.com>
+Date: Tue, 21 Jan 2014 15:48:41 -0800
+Subject: arch/sh/kernel/kgdb.c: add missing #include <linux/sched.h>
+
+From: Wanlong Gao <gaowanlong@cn.fujitsu.com>
+
+commit 53a52f17d96c8d47c79a7dafa81426317e89c7c1 upstream.
+
+  arch/sh/kernel/kgdb.c: In function 'sleeping_thread_to_gdb_regs':
+  arch/sh/kernel/kgdb.c:225:32: error: implicit declaration of function 'task_stack_page' [-Werror=implicit-function-declaration]
+  arch/sh/kernel/kgdb.c:242:23: error: dereferencing pointer to incomplete type
+  arch/sh/kernel/kgdb.c:243:22: error: dereferencing pointer to incomplete type
+  arch/sh/kernel/kgdb.c: In function 'singlestep_trap_handler':
+  arch/sh/kernel/kgdb.c:310:27: error: 'SIGTRAP' undeclared (first use in this function)
+  arch/sh/kernel/kgdb.c:310:27: note: each undeclared identifier is reported only once for each function it appears in
+
+This was introduced by commit 16559ae48c76 ("kgdb: remove #include
+<linux/serial_8250.h> from kgdb.h").
+
+[geert@linux-m68k.org: reworded and reformatted]
+Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
+Signed-off-by: Geert Uytterhoeven <geert+renesas@linux-m68k.org>
+Reported-by: Fengguang Wu <fengguang.wu@intel.com>
+Acked-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/sh/kernel/kgdb.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/sh/kernel/kgdb.c
++++ b/arch/sh/kernel/kgdb.c
+@@ -13,6 +13,7 @@
+ #include <linux/kdebug.h>
+ #include <linux/irq.h>
+ #include <linux/io.h>
++#include <linux/sched.h>
+ #include <asm/cacheflush.h>
+ #include <asm/traps.h>
diff --git a/queue-3.12/audit-correct-a-type-mismatch-in-audit_syscall_exit.patch b/queue-3.12/audit-correct-a-type-mismatch-in-audit_syscall_exit.patch
new file mode 100644 (file)
index 0000000..2d680e4
--- /dev/null
@@ -0,0 +1,39 @@
+From 06bdadd7634551cfe8ce071fe44d0311b3033d9e Mon Sep 17 00:00:00 2001
+From: AKASHI Takahiro <takahiro.akashi@linaro.org>
+Date: Mon, 13 Jan 2014 13:33:09 -0800
+Subject: audit: correct a type mismatch in audit_syscall_exit()
+
+From: AKASHI Takahiro <takahiro.akashi@linaro.org>
+
+commit 06bdadd7634551cfe8ce071fe44d0311b3033d9e upstream.
+
+audit_syscall_exit() saves a result of regs_return_value() in intermediate
+"int" variable and passes it to __audit_syscall_exit(), which expects its
+second argument as a "long" value.  This will result in truncating the
+value returned by a system call and making a wrong audit record.
+
+I don't know why gcc compiler doesn't complain about this, but anyway it
+causes a problem at runtime on arm64 (and probably most 64-bit archs).
+
+Signed-off-by: AKASHI Takahiro <takahiro.akashi@linaro.org>
+Cc: Al Viro <viro@zeniv.linux.org.uk>
+Cc: Eric Paris <eparis@redhat.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Eric Paris <eparis@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/audit.h |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/include/linux/audit.h
++++ b/include/linux/audit.h
+@@ -135,7 +135,7 @@ static inline void audit_syscall_exit(vo
+ {
+       if (unlikely(current->audit_context)) {
+               int success = is_syscall_success(pt_regs);
+-              int return_code = regs_return_value(pt_regs);
++              long return_code = regs_return_value(pt_regs);
+               __audit_syscall_exit(success, return_code);
+       }
diff --git a/queue-3.12/audit-reset-audit-backlog-wait-time-after-error-recovery.patch b/queue-3.12/audit-reset-audit-backlog-wait-time-after-error-recovery.patch
new file mode 100644 (file)
index 0000000..351efcb
--- /dev/null
@@ -0,0 +1,48 @@
+From e789e561a50de0aaa8c695662d97aaa5eac9d55f Mon Sep 17 00:00:00 2001
+From: Richard Guy Briggs <rgb@redhat.com>
+Date: Thu, 12 Sep 2013 23:03:51 -0400
+Subject: audit: reset audit backlog wait time after error recovery
+
+From: Richard Guy Briggs <rgb@redhat.com>
+
+commit e789e561a50de0aaa8c695662d97aaa5eac9d55f upstream.
+
+When the audit queue overflows and times out (audit_backlog_wait_time), the
+audit queue overflow timeout is set to zero.  Once the audit queue overflow
+timeout condition recovers, the timeout should be reset to the original value.
+
+See also:
+       https://lkml.org/lkml/2013/9/2/473
+
+Signed-off-by: Luiz Capitulino <lcapitulino@redhat.com>
+Signed-off-by: Dan Duval <dan.duval@oracle.com>
+Signed-off-by: Chuck Anderson <chuck.anderson@oracle.com>
+Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
+Signed-off-by: Eric Paris <eparis@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/audit.c |    5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/kernel/audit.c
++++ b/kernel/audit.c
+@@ -103,7 +103,8 @@ static int audit_rate_limit;
+ /* Number of outstanding audit_buffers allowed. */
+ static int    audit_backlog_limit = 64;
+-static int    audit_backlog_wait_time = 60 * HZ;
++#define AUDIT_BACKLOG_WAIT_TIME (60 * HZ)
++static int    audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
+ static int    audit_backlog_wait_overflow = 0;
+ /* The identity of the user shutting down the audit system. */
+@@ -1135,6 +1136,8 @@ struct audit_buffer *audit_log_start(str
+               return NULL;
+       }
++      audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
++
+       ab = audit_buffer_alloc(ctx, gfp_mask, type);
+       if (!ab) {
+               audit_log_lost("out of memory in audit_log_start");
diff --git a/queue-3.12/ftrace-fix-synchronization-location-disabling-and-freeing-ftrace_ops.patch b/queue-3.12/ftrace-fix-synchronization-location-disabling-and-freeing-ftrace_ops.patch
new file mode 100644 (file)
index 0000000..f505f78
--- /dev/null
@@ -0,0 +1,111 @@
+From a4c35ed241129dd142be4cadb1e5a474a56d5464 Mon Sep 17 00:00:00 2001
+From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
+Date: Mon, 13 Jan 2014 12:56:21 -0500
+Subject: ftrace: Fix synchronization location disabling and freeing ftrace_ops
+
+From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
+
+commit a4c35ed241129dd142be4cadb1e5a474a56d5464 upstream.
+
+The synchronization needed after ftrace_ops are unregistered must happen
+after the callback is disabled from being called by functions.
+
+The current location happens after the function is being removed from the
+internal lists, but not after the function callbacks were disabled, leaving
+the functions susceptible of being called after their callbacks are freed.
+
+This affects perf and any external users of function tracing (LTTng and
+SystemTap).
+
+Fixes: cdbe61bfe704 "ftrace: Allow dynamically allocated function tracers"
+Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/trace/ftrace.c |   58 +++++++++++++++++++++++++++-----------------------
+ 1 file changed, 32 insertions(+), 26 deletions(-)
+
+--- a/kernel/trace/ftrace.c
++++ b/kernel/trace/ftrace.c
+@@ -447,20 +447,6 @@ static int __unregister_ftrace_function(
+       } else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
+               ret = remove_ftrace_list_ops(&ftrace_control_list,
+                                            &control_ops, ops);
+-              if (!ret) {
+-                      /*
+-                       * The ftrace_ops is now removed from the list,
+-                       * so there'll be no new users. We must ensure
+-                       * all current users are done before we free
+-                       * the control data.
+-                       * Note synchronize_sched() is not enough, as we
+-                       * use preempt_disable() to do RCU, but the function
+-                       * tracer can be called where RCU is not active
+-                       * (before user_exit()).
+-                       */
+-                      schedule_on_each_cpu(ftrace_sync);
+-                      control_ops_free(ops);
+-              }
+       } else
+               ret = remove_ftrace_ops(&ftrace_ops_list, ops);
+@@ -470,17 +456,6 @@ static int __unregister_ftrace_function(
+       if (ftrace_enabled)
+               update_ftrace_function();
+-      /*
+-       * Dynamic ops may be freed, we must make sure that all
+-       * callers are done before leaving this function.
+-       *
+-       * Again, normal synchronize_sched() is not good enough.
+-       * We need to do a hard force of sched synchronization.
+-       */
+-      if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
+-              schedule_on_each_cpu(ftrace_sync);
+-
+-
+       return 0;
+ }
+@@ -2164,10 +2139,41 @@ static int ftrace_shutdown(struct ftrace
+               command |= FTRACE_UPDATE_TRACE_FUNC;
+       }
+-      if (!command || !ftrace_enabled)
++      if (!command || !ftrace_enabled) {
++              /*
++               * If these are control ops, they still need their
++               * per_cpu field freed. Since, function tracing is
++               * not currently active, we can just free them
++               * without synchronizing all CPUs.
++               */
++              if (ops->flags & FTRACE_OPS_FL_CONTROL)
++                      control_ops_free(ops);
+               return 0;
++      }
+       ftrace_run_update_code(command);
++
++      /*
++       * Dynamic ops may be freed, we must make sure that all
++       * callers are done before leaving this function.
++       * The same goes for freeing the per_cpu data of the control
++       * ops.
++       *
++       * Again, normal synchronize_sched() is not good enough.
++       * We need to do a hard force of sched synchronization.
++       * This is because we use preempt_disable() to do RCU, but
++       * the function tracers can be called where RCU is not watching
++       * (like before user_exit()). We can not rely on the RCU
++       * infrastructure to do the synchronization, thus we must do it
++       * ourselves.
++       */
++      if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) {
++              schedule_on_each_cpu(ftrace_sync);
++
++              if (ops->flags & FTRACE_OPS_FL_CONTROL)
++                      control_ops_free(ops);
++      }
++
+       return 0;
+ }
diff --git a/queue-3.12/ftrace-have-function-graph-only-trace-based-on-global_ops-filters.patch b/queue-3.12/ftrace-have-function-graph-only-trace-based-on-global_ops-filters.patch
new file mode 100644 (file)
index 0000000..cb7fc11
--- /dev/null
@@ -0,0 +1,184 @@
+From 23a8e8441a0a74dd612edf81dc89d1600bc0a3d1 Mon Sep 17 00:00:00 2001
+From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
+Date: Mon, 13 Jan 2014 10:30:23 -0500
+Subject: ftrace: Have function graph only trace based on global_ops filters
+
+From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
+
+commit 23a8e8441a0a74dd612edf81dc89d1600bc0a3d1 upstream.
+
+Doing some different tests, I discovered that function graph tracing, when
+filtered via the set_ftrace_filter and set_ftrace_notrace files, does
+not always keep with them if another function ftrace_ops is registered
+to trace functions.
+
+The reason is that function graph just happens to trace all functions
+that the function tracer enables. When there was only one user of
+function tracing, the function graph tracer did not need to worry about
+being called by functions that it did not want to trace. But now that there
+are other users, this becomes a problem.
+
+For example, one just needs to do the following:
+
+ # cd /sys/kernel/debug/tracing
+ # echo schedule > set_ftrace_filter
+ # echo function_graph > current_tracer
+ # cat trace
+[..]
+ 0)               |  schedule() {
+ ------------------------------------------
+ 0)    <idle>-0    =>   rcu_pre-7
+ ------------------------------------------
+
+ 0) ! 2980.314 us |  }
+ 0)               |  schedule() {
+ ------------------------------------------
+ 0)   rcu_pre-7    =>    <idle>-0
+ ------------------------------------------
+
+ 0) + 20.701 us   |  }
+
+ # echo 1 > /proc/sys/kernel/stack_tracer_enabled
+ # cat trace
+[..]
+ 1) + 20.825 us   |      }
+ 1) + 21.651 us   |    }
+ 1) + 30.924 us   |  } /* SyS_ioctl */
+ 1)               |  do_page_fault() {
+ 1)               |    __do_page_fault() {
+ 1)   0.274 us    |      down_read_trylock();
+ 1)   0.098 us    |      find_vma();
+ 1)               |      handle_mm_fault() {
+ 1)               |        _raw_spin_lock() {
+ 1)   0.102 us    |          preempt_count_add();
+ 1)   0.097 us    |          do_raw_spin_lock();
+ 1)   2.173 us    |        }
+ 1)               |        do_wp_page() {
+ 1)   0.079 us    |          vm_normal_page();
+ 1)   0.086 us    |          reuse_swap_page();
+ 1)   0.076 us    |          page_move_anon_rmap();
+ 1)               |          unlock_page() {
+ 1)   0.082 us    |            page_waitqueue();
+ 1)   0.086 us    |            __wake_up_bit();
+ 1)   1.801 us    |          }
+ 1)   0.075 us    |          ptep_set_access_flags();
+ 1)               |          _raw_spin_unlock() {
+ 1)   0.098 us    |            do_raw_spin_unlock();
+ 1)   0.105 us    |            preempt_count_sub();
+ 1)   1.884 us    |          }
+ 1)   9.149 us    |        }
+ 1) + 13.083 us   |      }
+ 1)   0.146 us    |      up_read();
+
+When the stack tracer was enabled, it enabled all functions to be traced, which
+now the function graph tracer also traces. This is a side effect that should
+not occur.
+
+To fix this a test is added when the function tracing is changed, as well as when
+the graph tracer is enabled, to see if anything other than the ftrace global_ops
+function tracer is enabled. If so, then the graph tracer calls a test trampoline
+that will look at the function that is being traced and compare it with the
+filters defined by the global_ops.
+
+As an optimization, if there's no other function tracers registered, or if
+the only registered function tracers also use the global ops, the function
+graph infrastructure will call the registered function graph callback directly
+and not go through the test trampoline.
+
+Fixes: d2d45c7a03a2 "tracing: Have stack_tracer use a separate list of functions"
+Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/trace/ftrace.c |   45 ++++++++++++++++++++++++++++++++++++++++++++-
+ 1 file changed, 44 insertions(+), 1 deletion(-)
+
+--- a/kernel/trace/ftrace.c
++++ b/kernel/trace/ftrace.c
+@@ -278,6 +278,12 @@ static void update_global_ops(void)
+       global_ops.func = func;
+ }
++#ifdef CONFIG_FUNCTION_GRAPH_TRACER
++static void update_function_graph_func(void);
++#else
++static inline void update_function_graph_func(void) { }
++#endif
++
+ static void update_ftrace_function(void)
+ {
+       ftrace_func_t func;
+@@ -325,6 +331,8 @@ static int remove_ftrace_ops(struct ftra
+ {
+       struct ftrace_ops **p;
++      update_function_graph_func();
++
+       /*
+        * If we are removing the last function, then simply point
+        * to the ftrace_stub.
+@@ -4777,6 +4785,7 @@ int ftrace_graph_entry_stub(struct ftrac
+ trace_func_graph_ret_t ftrace_graph_return =
+                       (trace_func_graph_ret_t)ftrace_stub;
+ trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
++static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub;
+ /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
+ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
+@@ -4918,6 +4927,30 @@ static struct ftrace_ops fgraph_ops __re
+                               FTRACE_OPS_FL_RECURSION_SAFE,
+ };
++static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)
++{
++      if (!ftrace_ops_test(&global_ops, trace->func, NULL))
++              return 0;
++      return __ftrace_graph_entry(trace);
++}
++
++/*
++ * The function graph tracer should only trace the functions defined
++ * by set_ftrace_filter and set_ftrace_notrace. If another function
++ * tracer ops is registered, the graph tracer requires testing the
++ * function against the global ops, and not just trace any function
++ * that any ftrace_ops registered.
++ */
++static void update_function_graph_func(void)
++{
++      if (ftrace_ops_list == &ftrace_list_end ||
++          (ftrace_ops_list == &global_ops &&
++           global_ops.next == &ftrace_list_end))
++              ftrace_graph_entry = __ftrace_graph_entry;
++      else
++              ftrace_graph_entry = ftrace_graph_entry_test;
++}
++
+ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
+                       trace_func_graph_ent_t entryfunc)
+ {
+@@ -4942,7 +4975,16 @@ int register_ftrace_graph(trace_func_gra
+       }
+       ftrace_graph_return = retfunc;
+-      ftrace_graph_entry = entryfunc;
++
++      /*
++       * Update the indirect function to the entryfunc, and the
++       * function that gets called to the entry_test first. Then
++       * call the update fgraph entry function to determine if
++       * the entryfunc should be called directly or not.
++       */
++      __ftrace_graph_entry = entryfunc;
++      ftrace_graph_entry = ftrace_graph_entry_test;
++      update_function_graph_func();
+       ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET);
+@@ -4961,6 +5003,7 @@ void unregister_ftrace_graph(void)
+       ftrace_graph_active--;
+       ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
+       ftrace_graph_entry = ftrace_graph_entry_stub;
++      __ftrace_graph_entry = ftrace_graph_entry_stub;
+       ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET);
+       unregister_pm_notifier(&ftrace_suspend_notifier);
+       unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
diff --git a/queue-3.12/fuse-fix-pipe_buf_operations.patch b/queue-3.12/fuse-fix-pipe_buf_operations.patch
new file mode 100644 (file)
index 0000000..d9d9403
--- /dev/null
@@ -0,0 +1,152 @@
+From 28a625cbc2a14f17b83e47ef907b2658576a32aa Mon Sep 17 00:00:00 2001
+From: Miklos Szeredi <mszeredi@suse.cz>
+Date: Wed, 22 Jan 2014 19:36:57 +0100
+Subject: fuse: fix pipe_buf_operations
+
+From: Miklos Szeredi <mszeredi@suse.cz>
+
+commit 28a625cbc2a14f17b83e47ef907b2658576a32aa upstream.
+
+Having this struct in module memory could Oops if the module is
+unloaded while the buffer still persists in a pipe.
+
+Since sock_pipe_buf_ops is essentially the same as fuse_dev_pipe_buf_steal
+merge them into nosteal_pipe_buf_ops (this is the same as
+default_pipe_buf_ops except stealing the page from the buffer is not
+allowed).
+
+Reported-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/fuse/dev.c             |   22 +++++-----------------
+ fs/splice.c               |   18 ++++++++++++++++++
+ include/linux/pipe_fs_i.h |    2 ++
+ net/core/skbuff.c         |   32 +-------------------------------
+ 4 files changed, 26 insertions(+), 48 deletions(-)
+
+--- a/fs/fuse/dev.c
++++ b/fs/fuse/dev.c
+@@ -1296,22 +1296,6 @@ static ssize_t fuse_dev_read(struct kioc
+       return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs));
+ }
+-static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe,
+-                                 struct pipe_buffer *buf)
+-{
+-      return 1;
+-}
+-
+-static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = {
+-      .can_merge = 0,
+-      .map = generic_pipe_buf_map,
+-      .unmap = generic_pipe_buf_unmap,
+-      .confirm = generic_pipe_buf_confirm,
+-      .release = generic_pipe_buf_release,
+-      .steal = fuse_dev_pipe_buf_steal,
+-      .get = generic_pipe_buf_get,
+-};
+-
+ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
+                                   struct pipe_inode_info *pipe,
+                                   size_t len, unsigned int flags)
+@@ -1358,7 +1342,11 @@ static ssize_t fuse_dev_splice_read(stru
+               buf->page = bufs[page_nr].page;
+               buf->offset = bufs[page_nr].offset;
+               buf->len = bufs[page_nr].len;
+-              buf->ops = &fuse_dev_pipe_buf_ops;
++              /*
++               * Need to be careful about this.  Having buf->ops in module
++               * code can Oops if the buffer persists after module unload.
++               */
++              buf->ops = &nosteal_pipe_buf_ops;
+               pipe->nrbufs++;
+               page_nr++;
+--- a/fs/splice.c
++++ b/fs/splice.c
+@@ -555,6 +555,24 @@ static const struct pipe_buf_operations
+       .get = generic_pipe_buf_get,
+ };
++static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
++                                  struct pipe_buffer *buf)
++{
++      return 1;
++}
++
++/* Pipe buffer operations for a socket and similar. */
++const struct pipe_buf_operations nosteal_pipe_buf_ops = {
++      .can_merge = 0,
++      .map = generic_pipe_buf_map,
++      .unmap = generic_pipe_buf_unmap,
++      .confirm = generic_pipe_buf_confirm,
++      .release = generic_pipe_buf_release,
++      .steal = generic_pipe_buf_nosteal,
++      .get = generic_pipe_buf_get,
++};
++EXPORT_SYMBOL(nosteal_pipe_buf_ops);
++
+ static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
+                           unsigned long vlen, loff_t offset)
+ {
+--- a/include/linux/pipe_fs_i.h
++++ b/include/linux/pipe_fs_i.h
+@@ -157,6 +157,8 @@ int generic_pipe_buf_confirm(struct pipe
+ int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *);
+ void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *);
++extern const struct pipe_buf_operations nosteal_pipe_buf_ops;
++
+ /* for F_SETPIPE_SZ and F_GETPIPE_SZ */
+ long pipe_fcntl(struct file *, unsigned int, unsigned long arg);
+ struct pipe_inode_info *get_pipe_info(struct file *file);
+--- a/net/core/skbuff.c
++++ b/net/core/skbuff.c
+@@ -74,36 +74,6 @@
+ struct kmem_cache *skbuff_head_cache __read_mostly;
+ static struct kmem_cache *skbuff_fclone_cache __read_mostly;
+-static void sock_pipe_buf_release(struct pipe_inode_info *pipe,
+-                                struct pipe_buffer *buf)
+-{
+-      put_page(buf->page);
+-}
+-
+-static void sock_pipe_buf_get(struct pipe_inode_info *pipe,
+-                              struct pipe_buffer *buf)
+-{
+-      get_page(buf->page);
+-}
+-
+-static int sock_pipe_buf_steal(struct pipe_inode_info *pipe,
+-                             struct pipe_buffer *buf)
+-{
+-      return 1;
+-}
+-
+-
+-/* Pipe buffer operations for a socket. */
+-static const struct pipe_buf_operations sock_pipe_buf_ops = {
+-      .can_merge = 0,
+-      .map = generic_pipe_buf_map,
+-      .unmap = generic_pipe_buf_unmap,
+-      .confirm = generic_pipe_buf_confirm,
+-      .release = sock_pipe_buf_release,
+-      .steal = sock_pipe_buf_steal,
+-      .get = sock_pipe_buf_get,
+-};
+-
+ /**
+  *    skb_panic - private function for out-of-line support
+  *    @skb:   buffer
+@@ -1800,7 +1770,7 @@ int skb_splice_bits(struct sk_buff *skb,
+               .partial = partial,
+               .nr_pages_max = MAX_SKB_FRAGS,
+               .flags = flags,
+-              .ops = &sock_pipe_buf_ops,
++              .ops = &nosteal_pipe_buf_ops,
+               .spd_release = sock_spd_release,
+       };
+       struct sk_buff *frag_iter;
diff --git a/queue-3.12/intel-iommu-fix-off-by-one-in-pagetable-freeing.patch b/queue-3.12/intel-iommu-fix-off-by-one-in-pagetable-freeing.patch
new file mode 100644 (file)
index 0000000..86e14c0
--- /dev/null
@@ -0,0 +1,61 @@
+From 08336fd218e087cc4fcc458e6b6dcafe8702b098 Mon Sep 17 00:00:00 2001
+From: Alex Williamson <alex.williamson@redhat.com>
+Date: Tue, 21 Jan 2014 15:48:18 -0800
+Subject: intel-iommu: fix off-by-one in pagetable freeing
+
+From: Alex Williamson <alex.williamson@redhat.com>
+
+commit 08336fd218e087cc4fcc458e6b6dcafe8702b098 upstream.
+
+dma_pte_free_level() has an off-by-one error when checking whether a pte
+is completely covered by a range.  Take for example the case of
+attempting to free pfn 0x0 - 0x1ff, ie.  512 entries covering the first
+2M superpage.
+
+The level_size() is 0x200 and we test:
+
+  static void dma_pte_free_level(...
+       ...
+
+       if (!(0 > 0 || 0x1ff < 0 + 0x200)) {
+               ...
+       }
+
+Clearly the 2nd test is true, which means we fail to take the branch to
+clear and free the pagetable entry.  As a result, we're leaking
+pagetables and failing to install new pages over the range.
+
+This was found with a PCI device assigned to a QEMU guest using vfio-pci
+without a VGA device present.  The first 1M of guest address space is
+mapped with various combinations of 4K pages, but eventually the range
+is entirely freed and replaced with a 2M contiguous mapping.
+intel-iommu errors out with something like:
+
+  ERROR: DMA PTE for vPFN 0x0 already set (to 5c2b8003 not 849c00083)
+
+In this case 5c2b8003 is the pointer to the previous leaf page that was
+neither freed nor cleared and 849c00083 is the superpage entry that
+we're trying to replace it with.
+
+Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
+Cc: David Woodhouse <dwmw2@infradead.org>
+Cc: Joerg Roedel <joro@8bytes.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/iommu/intel-iommu.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/iommu/intel-iommu.c
++++ b/drivers/iommu/intel-iommu.c
+@@ -917,7 +917,7 @@ static void dma_pte_free_level(struct dm
+               /* If range covers entire pagetable, free it */
+               if (!(start_pfn > level_pfn ||
+-                    last_pfn < level_pfn + level_size(level))) {
++                    last_pfn < level_pfn + level_size(level) - 1)) {
+                       dma_clear_pte(pte);
+                       domain_flush_cache(domain, pte, sizeof(*pte));
+                       free_pgtable_page(level_pte);
diff --git a/queue-3.12/mm-memcg-iteration-skip-memcgs-not-yet-fully-initialized.patch b/queue-3.12/mm-memcg-iteration-skip-memcgs-not-yet-fully-initialized.patch
new file mode 100644 (file)
index 0000000..81c002c
--- /dev/null
@@ -0,0 +1,41 @@
+From d8ad30559715ce97afb7d1a93a12fd90e8fff312 Mon Sep 17 00:00:00 2001
+From: Hugh Dickins <hughd@google.com>
+Date: Thu, 23 Jan 2014 15:53:32 -0800
+Subject: mm/memcg: iteration skip memcgs not yet fully initialized
+
+From: Hugh Dickins <hughd@google.com>
+
+commit d8ad30559715ce97afb7d1a93a12fd90e8fff312 upstream.
+
+It is surprising that the mem_cgroup iterator can return memcgs which
+have not yet been fully initialized.  By accident (or trial and error?)
+this appears not to present an actual problem; but it may be better to
+prevent such surprises, by skipping memcgs not yet online.
+
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Cc: Tejun Heo <tj@kernel.org>
+Acked-by: Michal Hocko <mhocko@suse.cz>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memcontrol.c |    6 ++----
+ 1 file changed, 2 insertions(+), 4 deletions(-)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -1081,10 +1081,8 @@ skip_node:
+        * protected by css_get and the tree walk is rcu safe.
+        */
+       if (next_css) {
+-              struct mem_cgroup *mem = mem_cgroup_from_css(next_css);
+-
+-              if (css_tryget(&mem->css))
+-                      return mem;
++              if ((next_css->flags & CSS_ONLINE) && css_tryget(next_css))
++                      return mem_cgroup_from_css(next_css);
+               else {
+                       prev_css = next_css;
+                       goto skip_node;
diff --git a/queue-3.12/mm-memory-failure.c-shift-page-lock-from-head-page-to-tail-page-after-thp-split.patch b/queue-3.12/mm-memory-failure.c-shift-page-lock-from-head-page-to-tail-page-after-thp-split.patch
new file mode 100644 (file)
index 0000000..8610322
--- /dev/null
@@ -0,0 +1,141 @@
+From 54b9dd14d09f24927285359a227aa363ce46089e Mon Sep 17 00:00:00 2001
+From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Date: Thu, 23 Jan 2014 15:53:14 -0800
+Subject: mm/memory-failure.c: shift page lock from head page to tail page after thp split
+
+From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+
+commit 54b9dd14d09f24927285359a227aa363ce46089e upstream.
+
+After thp split in hwpoison_user_mappings(), we hold page lock on the
+raw error page only between try_to_unmap, hence we are in danger of race
+condition.
+
+I found in the RHEL7 MCE-relay testing that we have "bad page" error
+when a memory error happens on a thp tail page used by qemu-kvm:
+
+  Triggering MCE exception on CPU 10
+  mce: [Hardware Error]: Machine check events logged
+  MCE exception done on CPU 10
+  MCE 0x38c535: Killing qemu-kvm:8418 due to hardware memory corruption
+  MCE 0x38c535: dirty LRU page recovery: Recovered
+  qemu-kvm[8418]: segfault at 20 ip 00007ffb0f0f229a sp 00007fffd6bc5240 error 4 in qemu-kvm[7ffb0ef14000+420000]
+  BUG: Bad page state in process qemu-kvm  pfn:38c400
+  page:ffffea000e310000 count:0 mapcount:0 mapping:          (null) index:0x7ffae3c00
+  page flags: 0x2fffff0008001d(locked|referenced|uptodate|dirty|swapbacked)
+  Modules linked in: hwpoison_inject mce_inject vhost_net macvtap macvlan ...
+  CPU: 0 PID: 8418 Comm: qemu-kvm Tainted: G   M        --------------   3.10.0-54.0.1.el7.mce_test_fixed.x86_64 #1
+  Hardware name: NEC NEC Express5800/R120b-1 [N8100-1719F]/MS-91E7-001, BIOS 4.6.3C19 02/10/2011
+  Call Trace:
+    dump_stack+0x19/0x1b
+    bad_page.part.59+0xcf/0xe8
+    free_pages_prepare+0x148/0x160
+    free_hot_cold_page+0x31/0x140
+    free_hot_cold_page_list+0x46/0xa0
+    release_pages+0x1c1/0x200
+    free_pages_and_swap_cache+0xad/0xd0
+    tlb_flush_mmu.part.46+0x4c/0x90
+    tlb_finish_mmu+0x55/0x60
+    exit_mmap+0xcb/0x170
+    mmput+0x67/0xf0
+    vhost_dev_cleanup+0x231/0x260 [vhost_net]
+    vhost_net_release+0x3f/0x90 [vhost_net]
+    __fput+0xe9/0x270
+    ____fput+0xe/0x10
+    task_work_run+0xc4/0xe0
+    do_exit+0x2bb/0xa40
+    do_group_exit+0x3f/0xa0
+    get_signal_to_deliver+0x1d0/0x6e0
+    do_signal+0x48/0x5e0
+    do_notify_resume+0x71/0xc0
+    retint_signal+0x48/0x8c
+
+The reason of this bug is that a page fault happens before unlocking the
+head page at the end of memory_failure().  This strange page fault is
+trying to access to address 0x20 and I'm not sure why qemu-kvm does
+this, but anyway as a result the SIGSEGV makes qemu-kvm exit and on the
+way we catch the bad page bug/warning because we try to free a locked
+page (which was the former head page.)
+
+To fix this, this patch suggests to shift page lock from head page to
+tail page just after thp split.  SIGSEGV still happens, but it affects
+only error affected VMs, not a whole system.
+
+Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Cc: Andi Kleen <andi@firstfloor.org>
+Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memory-failure.c |   21 +++++++++++----------
+ 1 file changed, 11 insertions(+), 10 deletions(-)
+
+--- a/mm/memory-failure.c
++++ b/mm/memory-failure.c
+@@ -856,14 +856,14 @@ static int page_action(struct page_state
+  * the pages and send SIGBUS to the processes if the data was dirty.
+  */
+ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
+-                                int trapno, int flags)
++                                int trapno, int flags, struct page **hpagep)
+ {
+       enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
+       struct address_space *mapping;
+       LIST_HEAD(tokill);
+       int ret;
+       int kill = 1, forcekill;
+-      struct page *hpage = compound_head(p);
++      struct page *hpage = *hpagep;
+       struct page *ppage;
+       if (PageReserved(p) || PageSlab(p))
+@@ -942,11 +942,14 @@ static int hwpoison_user_mappings(struct
+                        * We pinned the head page for hwpoison handling,
+                        * now we split the thp and we are interested in
+                        * the hwpoisoned raw page, so move the refcount
+-                       * to it.
++                       * to it. Similarly, page lock is shifted.
+                        */
+                       if (hpage != p) {
+                               put_page(hpage);
+                               get_page(p);
++                              lock_page(p);
++                              unlock_page(hpage);
++                              *hpagep = p;
+                       }
+                       /* THP is split, so ppage should be the real poisoned page. */
+                       ppage = p;
+@@ -964,17 +967,11 @@ static int hwpoison_user_mappings(struct
+       if (kill)
+               collect_procs(ppage, &tokill);
+-      if (hpage != ppage)
+-              lock_page(ppage);
+-
+       ret = try_to_unmap(ppage, ttu);
+       if (ret != SWAP_SUCCESS)
+               printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
+                               pfn, page_mapcount(ppage));
+-      if (hpage != ppage)
+-              unlock_page(ppage);
+-
+       /*
+        * Now that the dirty bit has been propagated to the
+        * struct page and all unmaps done we can decide if
+@@ -1193,8 +1190,12 @@ int memory_failure(unsigned long pfn, in
+       /*
+        * Now take care of user space mappings.
+        * Abort on fail: __delete_from_page_cache() assumes unmapped page.
++       *
++       * When the raw error page is thp tail page, hpage points to the raw
++       * page after thp split.
+        */
+-      if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) {
++      if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
++          != SWAP_SUCCESS) {
+               printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
+               res = -EBUSY;
+               goto out;
diff --git a/queue-3.12/mm-page-writeback.c-do-not-count-anon-pages-as-dirtyable-memory.patch b/queue-3.12/mm-page-writeback.c-do-not-count-anon-pages-as-dirtyable-memory.patch
new file mode 100644 (file)
index 0000000..8685c56
--- /dev/null
@@ -0,0 +1,125 @@
+From a1c3bfb2f67ef766de03f1f56bdfff9c8595ab14 Mon Sep 17 00:00:00 2001
+From: Johannes Weiner <hannes@cmpxchg.org>
+Date: Wed, 29 Jan 2014 14:05:41 -0800
+Subject: mm/page-writeback.c: do not count anon pages as dirtyable memory
+
+From: Johannes Weiner <hannes@cmpxchg.org>
+
+commit a1c3bfb2f67ef766de03f1f56bdfff9c8595ab14 upstream.
+
+The VM is currently heavily tuned to avoid swapping.  Whether that is
+good or bad is a separate discussion, but as long as the VM won't swap
+to make room for dirty cache, we can not consider anonymous pages when
+calculating the amount of dirtyable memory, the baseline to which
+dirty_background_ratio and dirty_ratio are applied.
+
+A simple workload that occupies a significant size (40+%, depending on
+memory layout, storage speeds etc.) of memory with anon/tmpfs pages and
+uses the remainder for a streaming writer demonstrates this problem.  In
+that case, the actual cache pages are a small fraction of what is
+considered dirtyable overall, which results in an relatively large
+portion of the cache pages to be dirtied.  As kswapd starts rotating
+these, random tasks enter direct reclaim and stall on IO.
+
+Only consider free pages and file pages dirtyable.
+
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+Reported-by: Tejun Heo <tj@kernel.org>
+Tested-by: Tejun Heo <tj@kernel.org>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Wu Fengguang <fengguang.wu@intel.com>
+Reviewed-by: Michal Hocko <mhocko@suse.cz>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/vmstat.h |    2 --
+ mm/internal.h          |    1 -
+ mm/page-writeback.c    |    6 ++++--
+ mm/vmscan.c            |   23 +----------------------
+ 4 files changed, 5 insertions(+), 27 deletions(-)
+
+--- a/include/linux/vmstat.h
++++ b/include/linux/vmstat.h
+@@ -142,8 +142,6 @@ static inline unsigned long zone_page_st
+       return x;
+ }
+-extern unsigned long global_reclaimable_pages(void);
+-
+ #ifdef CONFIG_NUMA
+ /*
+  * Determine the per node value of a stat item. This function
+--- a/mm/internal.h
++++ b/mm/internal.h
+@@ -85,7 +85,6 @@ extern unsigned long highest_memmap_pfn;
+  */
+ extern int isolate_lru_page(struct page *page);
+ extern void putback_lru_page(struct page *page);
+-extern unsigned long zone_reclaimable_pages(struct zone *zone);
+ extern bool zone_reclaimable(struct zone *zone);
+ /*
+--- a/mm/page-writeback.c
++++ b/mm/page-writeback.c
+@@ -205,7 +205,8 @@ static unsigned long zone_dirtyable_memo
+       nr_pages = zone_page_state(zone, NR_FREE_PAGES);
+       nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
+-      nr_pages += zone_reclaimable_pages(zone);
++      nr_pages += zone_page_state(zone, NR_INACTIVE_FILE);
++      nr_pages += zone_page_state(zone, NR_ACTIVE_FILE);
+       return nr_pages;
+ }
+@@ -258,7 +259,8 @@ static unsigned long global_dirtyable_me
+       x = global_page_state(NR_FREE_PAGES);
+       x -= min(x, dirty_balance_reserve);
+-      x += global_reclaimable_pages();
++      x += global_page_state(NR_INACTIVE_FILE);
++      x += global_page_state(NR_ACTIVE_FILE);
+       if (!vm_highmem_is_dirtyable)
+               x -= highmem_dirtyable_memory(x);
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -147,7 +147,7 @@ static bool global_reclaim(struct scan_c
+ }
+ #endif
+-unsigned long zone_reclaimable_pages(struct zone *zone)
++static unsigned long zone_reclaimable_pages(struct zone *zone)
+ {
+       int nr;
+@@ -3297,27 +3297,6 @@ void wakeup_kswapd(struct zone *zone, in
+       wake_up_interruptible(&pgdat->kswapd_wait);
+ }
+-/*
+- * The reclaimable count would be mostly accurate.
+- * The less reclaimable pages may be
+- * - mlocked pages, which will be moved to unevictable list when encountered
+- * - mapped pages, which may require several travels to be reclaimed
+- * - dirty pages, which is not "instantly" reclaimable
+- */
+-unsigned long global_reclaimable_pages(void)
+-{
+-      int nr;
+-
+-      nr = global_page_state(NR_ACTIVE_FILE) +
+-           global_page_state(NR_INACTIVE_FILE);
+-
+-      if (get_nr_swap_pages() > 0)
+-              nr += global_page_state(NR_ACTIVE_ANON) +
+-                    global_page_state(NR_INACTIVE_ANON);
+-
+-      return nr;
+-}
+-
+ #ifdef CONFIG_HIBERNATION
+ /*
+  * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
diff --git a/queue-3.12/mm-page-writeback.c-fix-dirty_balance_reserve-subtraction-from-dirtyable-memory.patch b/queue-3.12/mm-page-writeback.c-fix-dirty_balance_reserve-subtraction-from-dirtyable-memory.patch
new file mode 100644 (file)
index 0000000..7cd2e50
--- /dev/null
@@ -0,0 +1,147 @@
+From a804552b9a15c931cfc2a92a2e0aed1add8b580a Mon Sep 17 00:00:00 2001
+From: Johannes Weiner <hannes@cmpxchg.org>
+Date: Wed, 29 Jan 2014 14:05:39 -0800
+Subject: mm/page-writeback.c: fix dirty_balance_reserve subtraction from dirtyable memory
+
+From: Johannes Weiner <hannes@cmpxchg.org>
+
+commit a804552b9a15c931cfc2a92a2e0aed1add8b580a upstream.
+
+Tejun reported stuttering and latency spikes on a system where random
+tasks would enter direct reclaim and get stuck on dirty pages.  Around
+50% of memory was occupied by tmpfs backed by an SSD, and another disk
+(rotating) was reading and writing at max speed to shrink a partition.
+
+: The problem was pretty ridiculous.  It's a 8gig machine w/ one ssd and 10k
+: rpm harddrive and I could reliably reproduce constant stuttering every
+: several seconds for as long as buffered IO was going on on the hard drive
+: either with tmpfs occupying somewhere above 4gig or a test program which
+: allocates about the same amount of anon memory.  Although swap usage was
+: zero, turning off swap also made the problem go away too.
+:
+: The trigger conditions seem quite plausible - high anon memory usage w/
+: heavy buffered IO and swap configured - and it's highly likely that this
+: is happening in the wild too.  (this can happen with copying large files
+: to usb sticks too, right?)
+
+This patch (of 2):
+
+The dirty_balance_reserve is an approximation of the fraction of free
+pages that the page allocator does not make available for page cache
+allocations.  As a result, it has to be taken into account when
+calculating the amount of "dirtyable memory", the baseline to which
+dirty_background_ratio and dirty_ratio are applied.
+
+However, currently the reserve is subtracted from the sum of free and
+reclaimable pages, which is non-sensical and leads to erroneous results
+when the system is dominated by unreclaimable pages and the
+dirty_balance_reserve is bigger than free+reclaimable.  In that case, at
+least the already allocated cache should be considered dirtyable.
+
+Fix the calculation by subtracting the reserve from the amount of free
+pages, then adding the reclaimable pages on top.
+
+[akpm@linux-foundation.org: fix CONFIG_HIGHMEM build]
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+Reported-by: Tejun Heo <tj@kernel.org>
+Tested-by: Tejun Heo <tj@kernel.org>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Wu Fengguang <fengguang.wu@intel.com>
+Reviewed-by: Michal Hocko <mhocko@suse.cz>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/page-writeback.c |   55 ++++++++++++++++++++++------------------------------
+ 1 file changed, 24 insertions(+), 31 deletions(-)
+
+--- a/mm/page-writeback.c
++++ b/mm/page-writeback.c
+@@ -191,6 +191,25 @@ static unsigned long writeout_period_tim
+  * global dirtyable memory first.
+  */
++/**
++ * zone_dirtyable_memory - number of dirtyable pages in a zone
++ * @zone: the zone
++ *
++ * Returns the zone's number of pages potentially available for dirty
++ * page cache.  This is the base value for the per-zone dirty limits.
++ */
++static unsigned long zone_dirtyable_memory(struct zone *zone)
++{
++      unsigned long nr_pages;
++
++      nr_pages = zone_page_state(zone, NR_FREE_PAGES);
++      nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
++
++      nr_pages += zone_reclaimable_pages(zone);
++
++      return nr_pages;
++}
++
+ static unsigned long highmem_dirtyable_memory(unsigned long total)
+ {
+ #ifdef CONFIG_HIGHMEM
+@@ -198,11 +217,9 @@ static unsigned long highmem_dirtyable_m
+       unsigned long x = 0;
+       for_each_node_state(node, N_HIGH_MEMORY) {
+-              struct zone *z =
+-                      &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
++              struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
+-              x += zone_page_state(z, NR_FREE_PAGES) +
+-                   zone_reclaimable_pages(z) - z->dirty_balance_reserve;
++              x += zone_dirtyable_memory(z);
+       }
+       /*
+        * Unreclaimable memory (kernel memory or anonymous memory
+@@ -238,9 +255,11 @@ static unsigned long global_dirtyable_me
+ {
+       unsigned long x;
+-      x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
++      x = global_page_state(NR_FREE_PAGES);
+       x -= min(x, dirty_balance_reserve);
++      x += global_reclaimable_pages();
++
+       if (!vm_highmem_is_dirtyable)
+               x -= highmem_dirtyable_memory(x);
+@@ -289,32 +308,6 @@ void global_dirty_limits(unsigned long *
+ }
+ /**
+- * zone_dirtyable_memory - number of dirtyable pages in a zone
+- * @zone: the zone
+- *
+- * Returns the zone's number of pages potentially available for dirty
+- * page cache.  This is the base value for the per-zone dirty limits.
+- */
+-static unsigned long zone_dirtyable_memory(struct zone *zone)
+-{
+-      /*
+-       * The effective global number of dirtyable pages may exclude
+-       * highmem as a big-picture measure to keep the ratio between
+-       * dirty memory and lowmem reasonable.
+-       *
+-       * But this function is purely about the individual zone and a
+-       * highmem zone can hold its share of dirty pages, so we don't
+-       * care about vm_highmem_is_dirtyable here.
+-       */
+-      unsigned long nr_pages = zone_page_state(zone, NR_FREE_PAGES) +
+-              zone_reclaimable_pages(zone);
+-
+-      /* don't allow this to underflow */
+-      nr_pages -= min(nr_pages, zone->dirty_balance_reserve);
+-      return nr_pages;
+-}
+-
+-/**
+  * zone_dirty_limit - maximum number of dirty pages allowed in a zone
+  * @zone: the zone
+  *
diff --git a/queue-3.12/revert-eisa-initialize-device-before-its-resources.patch b/queue-3.12/revert-eisa-initialize-device-before-its-resources.patch
new file mode 100644 (file)
index 0000000..6de968b
--- /dev/null
@@ -0,0 +1,96 @@
+From 765ee51f9a3f652959b4c7297d198a28e37952b4 Mon Sep 17 00:00:00 2001
+From: Bjorn Helgaas <bhelgaas@google.com>
+Date: Fri, 17 Jan 2014 14:57:29 -0700
+Subject: Revert "EISA: Initialize device before its resources"
+
+From: Bjorn Helgaas <bhelgaas@google.com>
+
+commit 765ee51f9a3f652959b4c7297d198a28e37952b4 upstream.
+
+This reverts commit 26abfeed4341872364386c6a52b9acef8c81a81a.
+
+In the eisa_probe() force_probe path, if we were unable to request slot
+resources (e.g., [io 0x800-0x8ff]), we skipped the slot with "Cannot
+allocate resource for EISA slot %d" before reading the EISA signature in
+eisa_init_device().
+
+Commit 26abfeed4341 moved eisa_init_device() earlier, so we tried to read
+the EISA signature before requesting the slot resources, and this caused
+hangs during boot.
+
+Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1251816
+Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/eisa/eisa-bus.c |   26 +++++++++++++++-----------
+ 1 file changed, 15 insertions(+), 11 deletions(-)
+
+--- a/drivers/eisa/eisa-bus.c
++++ b/drivers/eisa/eisa-bus.c
+@@ -275,11 +275,13 @@ static int __init eisa_request_resources
+               }
+               
+               if (slot) {
++                      edev->res[i].name  = NULL;
+                       edev->res[i].start = SLOT_ADDRESS(root, slot)
+                                            + (i * 0x400);
+                       edev->res[i].end   = edev->res[i].start + 0xff;
+                       edev->res[i].flags = IORESOURCE_IO;
+               } else {
++                      edev->res[i].name  = NULL;
+                       edev->res[i].start = SLOT_ADDRESS(root, slot)
+                                            + EISA_VENDOR_ID_OFFSET;
+                       edev->res[i].end   = edev->res[i].start + 3;
+@@ -326,19 +328,20 @@ static int __init eisa_probe(struct eisa
+               return -ENOMEM;
+       }
+               
+-      if (eisa_init_device(root, edev, 0)) {
++      if (eisa_request_resources(root, edev, 0)) {
++              dev_warn(root->dev,
++                       "EISA: Cannot allocate resource for mainboard\n");
+               kfree(edev);
+               if (!root->force_probe)
+-                      return -ENODEV;
++                      return -EBUSY;
+               goto force_probe;
+       }
+-      if (eisa_request_resources(root, edev, 0)) {
+-              dev_warn(root->dev,
+-                       "EISA: Cannot allocate resource for mainboard\n");
++      if (eisa_init_device(root, edev, 0)) {
++              eisa_release_resources(edev);
+               kfree(edev);
+               if (!root->force_probe)
+-                      return -EBUSY;
++                      return -ENODEV;
+               goto force_probe;
+       }
+@@ -361,11 +364,6 @@ static int __init eisa_probe(struct eisa
+                       continue;
+               }
+-              if (eisa_init_device(root, edev, i)) {
+-                      kfree(edev);
+-                      continue;
+-              }
+-
+               if (eisa_request_resources(root, edev, i)) {
+                       dev_warn(root->dev,
+                                "Cannot allocate resource for EISA slot %d\n",
+@@ -373,6 +371,12 @@ static int __init eisa_probe(struct eisa
+                       kfree(edev);
+                       continue;
+               }
++
++              if (eisa_init_device(root, edev, i)) {
++                      eisa_release_resources(edev);
++                      kfree(edev);
++                      continue;
++              }
+               if (edev->state == (EISA_CONFIG_ENABLED | EISA_CONFIG_FORCED))
+                       enabled_str = " (forced enabled)";
diff --git a/queue-3.12/selinux-fix-memory-leak-upon-loading-policy.patch b/queue-3.12/selinux-fix-memory-leak-upon-loading-policy.patch
new file mode 100644 (file)
index 0000000..373ee43
--- /dev/null
@@ -0,0 +1,79 @@
+From 8ed814602876bec9bad2649ca17f34b499357a1c Mon Sep 17 00:00:00 2001
+From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
+Date: Mon, 6 Jan 2014 21:28:15 +0900
+Subject: SELinux: Fix memory leak upon loading policy
+
+From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
+
+commit 8ed814602876bec9bad2649ca17f34b499357a1c upstream.
+
+Hello.
+
+I got below leak with linux-3.10.0-54.0.1.el7.x86_64 .
+
+[  681.903890] kmemleak: 5538 new suspected memory leaks (see /sys/kernel/debug/kmemleak)
+
+Below is a patch, but I don't know whether we need special handing for undoing
+ebitmap_set_bit() call.
+----------
+>>From fe97527a90fe95e2239dfbaa7558f0ed559c0992 Mon Sep 17 00:00:00 2001
+From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
+Date: Mon, 6 Jan 2014 16:30:21 +0900
+Subject: SELinux: Fix memory leak upon loading policy
+
+Commit 2463c26d "SELinux: put name based create rules in a hashtable" did not
+check return value from hashtab_insert() in filename_trans_read(). It leaks
+memory if hashtab_insert() returns error.
+
+  unreferenced object 0xffff88005c9160d0 (size 8):
+    comm "systemd", pid 1, jiffies 4294688674 (age 235.265s)
+    hex dump (first 8 bytes):
+      57 0b 00 00 6b 6b 6b a5                          W...kkk.
+    backtrace:
+      [<ffffffff816604ae>] kmemleak_alloc+0x4e/0xb0
+      [<ffffffff811cba5e>] kmem_cache_alloc_trace+0x12e/0x360
+      [<ffffffff812aec5d>] policydb_read+0xd1d/0xf70
+      [<ffffffff812b345c>] security_load_policy+0x6c/0x500
+      [<ffffffff812a623c>] sel_write_load+0xac/0x750
+      [<ffffffff811eb680>] vfs_write+0xc0/0x1f0
+      [<ffffffff811ec08c>] SyS_write+0x4c/0xa0
+      [<ffffffff81690419>] system_call_fastpath+0x16/0x1b
+      [<ffffffffffffffff>] 0xffffffffffffffff
+
+However, we should not return EEXIST error to the caller, or the systemd will
+show below message and the boot sequence freezes.
+
+  systemd[1]: Failed to load SELinux policy. Freezing.
+
+Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
+Acked-by: Eric Paris <eparis@redhat.com>
+Signed-off-by: Paul Moore <pmoore@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ security/selinux/ss/policydb.c |   14 +++++++++++++-
+ 1 file changed, 13 insertions(+), 1 deletion(-)
+
+--- a/security/selinux/ss/policydb.c
++++ b/security/selinux/ss/policydb.c
+@@ -1941,7 +1941,19 @@ static int filename_trans_read(struct po
+               if (rc)
+                       goto out;
+-              hashtab_insert(p->filename_trans, ft, otype);
++              rc = hashtab_insert(p->filename_trans, ft, otype);
++              if (rc) {
++                      /*
++                       * Do not return -EEXIST to the caller, or the system
++                       * will not boot.
++                       */
++                      if (rc != -EEXIST)
++                              goto out;
++                      /* But free memory to avoid memory leak. */
++                      kfree(ft);
++                      kfree(name);
++                      kfree(otype);
++              }
+       }
+       hash_eval(p->filename_trans, "filenametr");
+       return 0;
diff --git a/queue-3.12/series b/queue-3.12/series
new file mode 100644 (file)
index 0000000..0ac9d1d
--- /dev/null
@@ -0,0 +1,16 @@
+selinux-fix-memory-leak-upon-loading-policy.patch
+ftrace-have-function-graph-only-trace-based-on-global_ops-filters.patch
+ftrace-fix-synchronization-location-disabling-and-freeing-ftrace_ops.patch
+tracing-have-trace-buffer-point-back-to-trace_array.patch
+tracing-check-if-tracing-is-enabled-in-trace_puts.patch
+arch-sh-kernel-kgdb.c-add-missing-include-linux-sched.h.patch
+intel-iommu-fix-off-by-one-in-pagetable-freeing.patch
+revert-eisa-initialize-device-before-its-resources.patch
+fuse-fix-pipe_buf_operations.patch
+audit-reset-audit-backlog-wait-time-after-error-recovery.patch
+audit-correct-a-type-mismatch-in-audit_syscall_exit.patch
+xen-pvhvm-if-xen_platform_pci-0-is-set-don-t-blow-up-v4.patch
+mm-memory-failure.c-shift-page-lock-from-head-page-to-tail-page-after-thp-split.patch
+mm-memcg-iteration-skip-memcgs-not-yet-fully-initialized.patch
+mm-page-writeback.c-fix-dirty_balance_reserve-subtraction-from-dirtyable-memory.patch
+mm-page-writeback.c-do-not-count-anon-pages-as-dirtyable-memory.patch
diff --git a/queue-3.12/tracing-check-if-tracing-is-enabled-in-trace_puts.patch b/queue-3.12/tracing-check-if-tracing-is-enabled-in-trace_puts.patch
new file mode 100644 (file)
index 0000000..6f6114b
--- /dev/null
@@ -0,0 +1,44 @@
+From 3132e107d608f8753240d82d61303c500fd515b4 Mon Sep 17 00:00:00 2001
+From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
+Date: Thu, 23 Jan 2014 12:27:59 -0500
+Subject: tracing: Check if tracing is enabled in trace_puts()
+
+From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
+
+commit 3132e107d608f8753240d82d61303c500fd515b4 upstream.
+
+If trace_puts() is used very early in boot up, it can crash the machine
+if it is called before the ring buffer is allocated. If a trace_printk()
+is used with no arguments, then it will be converted into a trace_puts()
+and suffer the same fate.
+
+Fixes: 09ae72348ecc "tracing: Add trace_puts() for even faster trace_printk() tracing"
+Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/trace/trace.c |    6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/kernel/trace/trace.c
++++ b/kernel/trace/trace.c
+@@ -435,6 +435,9 @@ int __trace_puts(unsigned long ip, const
+       unsigned long irq_flags;
+       int alloc;
++      if (unlikely(tracing_selftest_running || tracing_disabled))
++              return 0;
++
+       alloc = sizeof(*entry) + size + 2; /* possible \n added */
+       local_save_flags(irq_flags);
+@@ -475,6 +478,9 @@ int __trace_bputs(unsigned long ip, cons
+       unsigned long irq_flags;
+       int size = sizeof(struct bputs_entry);
++      if (unlikely(tracing_selftest_running || tracing_disabled))
++              return 0;
++
+       local_save_flags(irq_flags);
+       buffer = global_trace.trace_buffer.buffer;
+       event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
diff --git a/queue-3.12/tracing-have-trace-buffer-point-back-to-trace_array.patch b/queue-3.12/tracing-have-trace-buffer-point-back-to-trace_array.patch
new file mode 100644 (file)
index 0000000..0665f55
--- /dev/null
@@ -0,0 +1,36 @@
+From dced341b2d4f06668efaab33f88de5d287c0f45b Mon Sep 17 00:00:00 2001
+From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
+Date: Tue, 14 Jan 2014 10:19:46 -0500
+Subject: tracing: Have trace buffer point back to trace_array
+
+From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org>
+
+commit dced341b2d4f06668efaab33f88de5d287c0f45b upstream.
+
+The trace buffer has a descriptor pointer that goes back to the trace
+array. But it was never assigned. Luckily, nothing uses it (yet), but
+it will in the future.
+
+Although nothing currently uses this, if any of the new features get
+backported to older kernels, and because this is such a simple change,
+I'm marking it for stable too.
+
+Fixes: 12883efb670c "tracing: Consolidate max_tr into main trace_array structure"
+Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/trace/trace.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/kernel/trace/trace.c
++++ b/kernel/trace/trace.c
+@@ -5872,6 +5872,8 @@ allocate_trace_buffer(struct trace_array
+       rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
++      buf->tr = tr;
++
+       buf->buffer = ring_buffer_alloc(size, rb_flags);
+       if (!buf->buffer)
+               return -ENOMEM;
diff --git a/queue-3.12/xen-pvhvm-if-xen_platform_pci-0-is-set-don-t-blow-up-v4.patch b/queue-3.12/xen-pvhvm-if-xen_platform_pci-0-is-set-don-t-blow-up-v4.patch
new file mode 100644 (file)
index 0000000..f1daa06
--- /dev/null
@@ -0,0 +1,334 @@
+From 51c71a3bbaca868043cc45b3ad3786dd48a90235 Mon Sep 17 00:00:00 2001
+From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+Date: Tue, 26 Nov 2013 15:05:40 -0500
+Subject: xen/pvhvm: If xen_platform_pci=0 is set don't blow up (v4).
+
+From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+
+commit 51c71a3bbaca868043cc45b3ad3786dd48a90235 upstream.
+
+The user has the option of disabling the platform driver:
+00:02.0 Unassigned class [ff80]: XenSource, Inc. Xen Platform Device (rev 01)
+
+which is used to unplug the emulated drivers (IDE, Realtek 8169, etc)
+and allow the PV drivers to take over. If the user wishes
+to disable that they can set:
+
+  xen_platform_pci=0
+  (in the guest config file)
+
+or
+  xen_emul_unplug=never
+  (on the Linux command line)
+
+except it does not work properly. The PV drivers still try to
+load and since the Xen platform driver is not run - and it
+has not initialized the grant tables, most of the PV drivers
+stumble upon:
+
+input: Xen Virtual Keyboard as /devices/virtual/input/input5
+input: Xen Virtual Pointer as /devices/virtual/input/input6M
+------------[ cut here ]------------
+kernel BUG at /home/konrad/ssd/konrad/linux/drivers/xen/grant-table.c:1206!
+invalid opcode: 0000 [#1] SMP
+Modules linked in: xen_kbdfront(+) xenfs xen_privcmd
+CPU: 6 PID: 1389 Comm: modprobe Not tainted 3.13.0-rc1upstream-00021-ga6c892b-dirty #1
+Hardware name: Xen HVM domU, BIOS 4.4-unstable 11/26/2013
+RIP: 0010:[<ffffffff813ddc40>]  [<ffffffff813ddc40>] get_free_entries+0x2e0/0x300
+Call Trace:
+ [<ffffffff8150d9a3>] ? evdev_connect+0x1e3/0x240
+ [<ffffffff813ddd0e>] gnttab_grant_foreign_access+0x2e/0x70
+ [<ffffffffa0010081>] xenkbd_connect_backend+0x41/0x290 [xen_kbdfront]
+ [<ffffffffa0010a12>] xenkbd_probe+0x2f2/0x324 [xen_kbdfront]
+ [<ffffffff813e5757>] xenbus_dev_probe+0x77/0x130
+ [<ffffffff813e7217>] xenbus_frontend_dev_probe+0x47/0x50
+ [<ffffffff8145e9a9>] driver_probe_device+0x89/0x230
+ [<ffffffff8145ebeb>] __driver_attach+0x9b/0xa0
+ [<ffffffff8145eb50>] ? driver_probe_device+0x230/0x230
+ [<ffffffff8145eb50>] ? driver_probe_device+0x230/0x230
+ [<ffffffff8145cf1c>] bus_for_each_dev+0x8c/0xb0
+ [<ffffffff8145e7d9>] driver_attach+0x19/0x20
+ [<ffffffff8145e260>] bus_add_driver+0x1a0/0x220
+ [<ffffffff8145f1ff>] driver_register+0x5f/0xf0
+ [<ffffffff813e55c5>] xenbus_register_driver_common+0x15/0x20
+ [<ffffffff813e76b3>] xenbus_register_frontend+0x23/0x40
+ [<ffffffffa0015000>] ? 0xffffffffa0014fff
+ [<ffffffffa001502b>] xenkbd_init+0x2b/0x1000 [xen_kbdfront]
+ [<ffffffff81002049>] do_one_initcall+0x49/0x170
+
+.. snip..
+
+which is hardly nice. This patch fixes this by having each
+PV driver check for:
+ - if running in PV, then it is fine to execute (as that is their
+   native environment).
+ - if running in HVM, check if user wanted 'xen_emul_unplug=never',
+   in which case bail out and don't load any PV drivers.
+ - if running in HVM, and if PCI device 5853:0001 (xen_platform_pci)
+   does not exist, then bail out and not load PV drivers.
+ - (v2) if running in HVM, and if the user wanted 'xen_emul_unplug=ide-disks',
+   then bail out for all PV devices _except_ the block one.
+   Ditto for the network one ('nics').
+ - (v2) if running in HVM, and if the user wanted 'xen_emul_unplug=unnecessary'
+   then load block PV driver, and also setup the legacy IDE paths.
+   In (v3) make it actually load PV drivers.
+
+Reported-by: Sander Eikelenboom <linux@eikelenboom.it>
+Reported-by: Anthony PERARD <anthony.perard@citrix.com>
+Reported-and-Tested-by: Fabio Fantoni <fabio.fantoni@m2r.biz>
+Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+[v2: Add extra logic to handle the myriad ways 'xen_emul_unplug'
+can be used per Ian and Stefano suggestion]
+[v3: Make the unnecessary case work properly]
+[v4: s/disks/ide-disks/ spotted by Fabio]
+Reviewed-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
+Acked-by: Bjorn Helgaas <bhelgaas@google.com> [for PCI parts]
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/xen/platform-pci-unplug.c         |   74 +++++++++++++++++++++++++++++
+ drivers/block/xen-blkfront.c               |    4 -
+ drivers/char/tpm/xen-tpmfront.c            |    4 +
+ drivers/input/misc/xen-kbdfront.c          |    4 +
+ drivers/net/xen-netfront.c                 |    2 
+ drivers/pci/xen-pcifront.c                 |    4 +
+ drivers/video/xen-fbfront.c                |    4 +
+ drivers/xen/xenbus/xenbus_probe_frontend.c |    2 
+ include/xen/platform_pci.h                 |   23 +++++++++
+ 9 files changed, 117 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/xen/platform-pci-unplug.c
++++ b/arch/x86/xen/platform-pci-unplug.c
+@@ -69,6 +69,80 @@ static int check_platform_magic(void)
+       return 0;
+ }
++bool xen_has_pv_devices()
++{
++      if (!xen_domain())
++              return false;
++
++      /* PV domains always have them. */
++      if (xen_pv_domain())
++              return true;
++
++      /* And user has xen_platform_pci=0 set in guest config as
++       * driver did not modify the value. */
++      if (xen_platform_pci_unplug == 0)
++              return false;
++
++      if (xen_platform_pci_unplug & XEN_UNPLUG_NEVER)
++              return false;
++
++      if (xen_platform_pci_unplug & XEN_UNPLUG_ALL)
++              return true;
++
++      /* This is an odd one - we are going to run legacy
++       * and PV drivers at the same time. */
++      if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY)
++              return true;
++
++      /* And the caller has to follow with xen_pv_{disk,nic}_devices
++       * to be certain which driver can load. */
++      return false;
++}
++EXPORT_SYMBOL_GPL(xen_has_pv_devices);
++
++static bool __xen_has_pv_device(int state)
++{
++      /* HVM domains might or might not */
++      if (xen_hvm_domain() && (xen_platform_pci_unplug & state))
++              return true;
++
++      return xen_has_pv_devices();
++}
++
++bool xen_has_pv_nic_devices(void)
++{
++      return __xen_has_pv_device(XEN_UNPLUG_ALL_NICS | XEN_UNPLUG_ALL);
++}
++EXPORT_SYMBOL_GPL(xen_has_pv_nic_devices);
++
++bool xen_has_pv_disk_devices(void)
++{
++      return __xen_has_pv_device(XEN_UNPLUG_ALL_IDE_DISKS |
++                                 XEN_UNPLUG_AUX_IDE_DISKS | XEN_UNPLUG_ALL);
++}
++EXPORT_SYMBOL_GPL(xen_has_pv_disk_devices);
++
++/*
++ * This one is odd - it determines whether you want to run PV _and_
++ * legacy (IDE) drivers together. This combination is only possible
++ * under HVM.
++ */
++bool xen_has_pv_and_legacy_disk_devices(void)
++{
++      if (!xen_domain())
++              return false;
++
++      /* N.B. This is only ever used in HVM mode */
++      if (xen_pv_domain())
++              return false;
++
++      if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY)
++              return true;
++
++      return false;
++}
++EXPORT_SYMBOL_GPL(xen_has_pv_and_legacy_disk_devices);
++
+ void xen_unplug_emulated_devices(void)
+ {
+       int r;
+--- a/drivers/block/xen-blkfront.c
++++ b/drivers/block/xen-blkfront.c
+@@ -1278,7 +1278,7 @@ static int blkfront_probe(struct xenbus_
+               char *type;
+               int len;
+               /* no unplug has been done: do not hook devices != xen vbds */
+-              if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) {
++              if (xen_has_pv_and_legacy_disk_devices()) {
+                       int major;
+                       if (!VDEV_IS_EXTENDED(vdevice))
+@@ -2022,7 +2022,7 @@ static int __init xlblk_init(void)
+       if (!xen_domain())
+               return -ENODEV;
+-      if (xen_hvm_domain() && !xen_platform_pci_unplug)
++      if (!xen_has_pv_disk_devices())
+               return -ENODEV;
+       if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
+--- a/drivers/char/tpm/xen-tpmfront.c
++++ b/drivers/char/tpm/xen-tpmfront.c
+@@ -17,6 +17,7 @@
+ #include <xen/xenbus.h>
+ #include <xen/page.h>
+ #include "tpm.h"
++#include <xen/platform_pci.h>
+ struct tpm_private {
+       struct tpm_chip *chip;
+@@ -423,6 +424,9 @@ static int __init xen_tpmfront_init(void
+       if (!xen_domain())
+               return -ENODEV;
++      if (!xen_has_pv_devices())
++              return -ENODEV;
++
+       return xenbus_register_frontend(&tpmfront_driver);
+ }
+ module_init(xen_tpmfront_init);
+--- a/drivers/input/misc/xen-kbdfront.c
++++ b/drivers/input/misc/xen-kbdfront.c
+@@ -29,6 +29,7 @@
+ #include <xen/interface/io/fbif.h>
+ #include <xen/interface/io/kbdif.h>
+ #include <xen/xenbus.h>
++#include <xen/platform_pci.h>
+ struct xenkbd_info {
+       struct input_dev *kbd;
+@@ -380,6 +381,9 @@ static int __init xenkbd_init(void)
+       if (xen_initial_domain())
+               return -ENODEV;
++      if (!xen_has_pv_devices())
++              return -ENODEV;
++
+       return xenbus_register_frontend(&xenkbd_driver);
+ }
+--- a/drivers/net/xen-netfront.c
++++ b/drivers/net/xen-netfront.c
+@@ -2070,7 +2070,7 @@ static int __init netif_init(void)
+       if (!xen_domain())
+               return -ENODEV;
+-      if (xen_hvm_domain() && !xen_platform_pci_unplug)
++      if (!xen_has_pv_nic_devices())
+               return -ENODEV;
+       pr_info("Initialising Xen virtual ethernet driver\n");
+--- a/drivers/pci/xen-pcifront.c
++++ b/drivers/pci/xen-pcifront.c
+@@ -20,6 +20,7 @@
+ #include <linux/workqueue.h>
+ #include <linux/bitops.h>
+ #include <linux/time.h>
++#include <xen/platform_pci.h>
+ #include <asm/xen/swiotlb-xen.h>
+ #define INVALID_GRANT_REF (0)
+@@ -1138,6 +1139,9 @@ static int __init pcifront_init(void)
+       if (!xen_pv_domain() || xen_initial_domain())
+               return -ENODEV;
++      if (!xen_has_pv_devices())
++              return -ENODEV;
++
+       pci_frontend_registrar(1 /* enable */);
+       return xenbus_register_frontend(&xenpci_driver);
+--- a/drivers/video/xen-fbfront.c
++++ b/drivers/video/xen-fbfront.c
+@@ -35,6 +35,7 @@
+ #include <xen/interface/io/fbif.h>
+ #include <xen/interface/io/protocols.h>
+ #include <xen/xenbus.h>
++#include <xen/platform_pci.h>
+ struct xenfb_info {
+       unsigned char           *fb;
+@@ -699,6 +700,9 @@ static int __init xenfb_init(void)
+       if (xen_initial_domain())
+               return -ENODEV;
++      if (!xen_has_pv_devices())
++              return -ENODEV;
++
+       return xenbus_register_frontend(&xenfb_driver);
+ }
+--- a/drivers/xen/xenbus/xenbus_probe_frontend.c
++++ b/drivers/xen/xenbus/xenbus_probe_frontend.c
+@@ -496,7 +496,7 @@ subsys_initcall(xenbus_probe_frontend_in
+ #ifndef MODULE
+ static int __init boot_wait_for_devices(void)
+ {
+-      if (xen_hvm_domain() && !xen_platform_pci_unplug)
++      if (!xen_has_pv_devices())
+               return -ENODEV;
+       ready_to_wait_for_devices = 1;
+--- a/include/xen/platform_pci.h
++++ b/include/xen/platform_pci.h
+@@ -48,4 +48,27 @@ static inline int xen_must_unplug_disks(
+ extern int xen_platform_pci_unplug;
++#if defined(CONFIG_XEN_PVHVM)
++extern bool xen_has_pv_devices(void);
++extern bool xen_has_pv_disk_devices(void);
++extern bool xen_has_pv_nic_devices(void);
++extern bool xen_has_pv_and_legacy_disk_devices(void);
++#else
++static inline bool xen_has_pv_devices(void)
++{
++      return IS_ENABLED(CONFIG_XEN);
++}
++static inline bool xen_has_pv_disk_devices(void)
++{
++      return IS_ENABLED(CONFIG_XEN);
++}
++static inline bool xen_has_pv_nic_devices(void)
++{
++      return IS_ENABLED(CONFIG_XEN);
++}
++static inline bool xen_has_pv_and_legacy_disk_devices(void)
++{
++      return false;
++}
++#endif
+ #endif /* _XEN_PLATFORM_PCI_H */