From 2f6f46f6e173946d9c5809418ff2725d3493c4c3 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 6 Feb 2014 16:44:58 -0800 Subject: [PATCH] 3.13-stable patches added patches: arch-sh-kernel-kgdb.c-add-missing-include-linux-sched.h.patch audit-correct-a-type-mismatch-in-audit_syscall_exit.patch audit-reset-audit-backlog-wait-time-after-error-recovery.patch ftrace-fix-synchronization-location-disabling-and-freeing-ftrace_ops.patch ftrace-have-function-graph-only-trace-based-on-global_ops-filters.patch fuse-fix-pipe_buf_operations.patch intel-iommu-fix-off-by-one-in-pagetable-freeing.patch memcg-fix-css-reference-leak-and-endless-loop-in-mem_cgroup_iter.patch memcg-fix-endless-loop-caused-by-mem_cgroup_iter.patch mm-don-t-lose-the-soft_dirty-flag-on-mprotect.patch mm-ignore-vm_softdirty-on-vma-merging.patch mm-memcg-iteration-skip-memcgs-not-yet-fully-initialized.patch mm-memory-failure.c-shift-page-lock-from-head-page-to-tail-page-after-thp-split.patch mm-munlock-fix-potential-race-with-thp-page-split.patch mm-page-writeback.c-do-not-count-anon-pages-as-dirtyable-memory.patch mm-page-writeback.c-fix-dirty_balance_reserve-subtraction-from-dirtyable-memory.patch mmc-atmel-mci-fix-timeout-errors-in-sdio-mode-when-using-dma.patch mmc-core-sd-implement-proper-support-for-sd3.0-au-sizes.patch mmc-fix-host-release-issue-after-discard-operation.patch revert-eisa-initialize-device-before-its-resources.patch selinux-fix-memory-leak-upon-loading-policy.patch tracing-check-if-tracing-is-enabled-in-trace_puts.patch tracing-have-trace-buffer-point-back-to-trace_array.patch xen-pvhvm-if-xen_platform_pci-0-is-set-don-t-blow-up-v4.patch --- ....c-add-missing-include-linux-sched.h.patch | 43 +++ ...-type-mismatch-in-audit_syscall_exit.patch | 39 ++ ...cklog-wait-time-after-error-recovery.patch | 48 +++ ...ion-disabling-and-freeing-ftrace_ops.patch | 111 ++++++ ...ly-trace-based-on-global_ops-filters.patch | 184 ++++++++++ queue-3.13/fuse-fix-pipe_buf_operations.patch | 152 ++++++++ ...-fix-off-by-one-in-pagetable-freeing.patch | 61 ++++ ...-and-endless-loop-in-mem_cgroup_iter.patch | 79 +++++ ...dless-loop-caused-by-mem_cgroup_iter.patch | 98 +++++ ...lose-the-soft_dirty-flag-on-mprotect.patch | 52 +++ ...m-ignore-vm_softdirty-on-vma-merging.patch | 83 +++++ ...kip-memcgs-not-yet-fully-initialized.patch | 41 +++ ...ad-page-to-tail-page-after-thp-split.patch | 141 ++++++++ ...x-potential-race-with-thp-page-split.patch | 207 +++++++++++ ...count-anon-pages-as-dirtyable-memory.patch | 125 +++++++ ...ve-subtraction-from-dirtyable-memory.patch | 147 ++++++++ ...t-errors-in-sdio-mode-when-using-dma.patch | 50 +++ ...nt-proper-support-for-sd3.0-au-sizes.patch | 98 +++++ ...elease-issue-after-discard-operation.patch | 66 ++++ ...itialize-device-before-its-resources.patch | 96 +++++ ...-fix-memory-leak-upon-loading-policy.patch | 79 +++++ queue-3.13/series | 24 ++ ...-if-tracing-is-enabled-in-trace_puts.patch | 44 +++ ...ace-buffer-point-back-to-trace_array.patch | 36 ++ ...atform_pci-0-is-set-don-t-blow-up-v4.patch | 334 ++++++++++++++++++ 25 files changed, 2438 insertions(+) create mode 100644 queue-3.13/arch-sh-kernel-kgdb.c-add-missing-include-linux-sched.h.patch create mode 100644 queue-3.13/audit-correct-a-type-mismatch-in-audit_syscall_exit.patch create mode 100644 queue-3.13/audit-reset-audit-backlog-wait-time-after-error-recovery.patch create mode 100644 queue-3.13/ftrace-fix-synchronization-location-disabling-and-freeing-ftrace_ops.patch create mode 100644 
queue-3.13/ftrace-have-function-graph-only-trace-based-on-global_ops-filters.patch create mode 100644 queue-3.13/fuse-fix-pipe_buf_operations.patch create mode 100644 queue-3.13/intel-iommu-fix-off-by-one-in-pagetable-freeing.patch create mode 100644 queue-3.13/memcg-fix-css-reference-leak-and-endless-loop-in-mem_cgroup_iter.patch create mode 100644 queue-3.13/memcg-fix-endless-loop-caused-by-mem_cgroup_iter.patch create mode 100644 queue-3.13/mm-don-t-lose-the-soft_dirty-flag-on-mprotect.patch create mode 100644 queue-3.13/mm-ignore-vm_softdirty-on-vma-merging.patch create mode 100644 queue-3.13/mm-memcg-iteration-skip-memcgs-not-yet-fully-initialized.patch create mode 100644 queue-3.13/mm-memory-failure.c-shift-page-lock-from-head-page-to-tail-page-after-thp-split.patch create mode 100644 queue-3.13/mm-munlock-fix-potential-race-with-thp-page-split.patch create mode 100644 queue-3.13/mm-page-writeback.c-do-not-count-anon-pages-as-dirtyable-memory.patch create mode 100644 queue-3.13/mm-page-writeback.c-fix-dirty_balance_reserve-subtraction-from-dirtyable-memory.patch create mode 100644 queue-3.13/mmc-atmel-mci-fix-timeout-errors-in-sdio-mode-when-using-dma.patch create mode 100644 queue-3.13/mmc-core-sd-implement-proper-support-for-sd3.0-au-sizes.patch create mode 100644 queue-3.13/mmc-fix-host-release-issue-after-discard-operation.patch create mode 100644 queue-3.13/revert-eisa-initialize-device-before-its-resources.patch create mode 100644 queue-3.13/selinux-fix-memory-leak-upon-loading-policy.patch create mode 100644 queue-3.13/tracing-check-if-tracing-is-enabled-in-trace_puts.patch create mode 100644 queue-3.13/tracing-have-trace-buffer-point-back-to-trace_array.patch create mode 100644 queue-3.13/xen-pvhvm-if-xen_platform_pci-0-is-set-don-t-blow-up-v4.patch diff --git a/queue-3.13/arch-sh-kernel-kgdb.c-add-missing-include-linux-sched.h.patch b/queue-3.13/arch-sh-kernel-kgdb.c-add-missing-include-linux-sched.h.patch new file mode 100644 index 00000000000..27a040ee275 --- /dev/null +++ b/queue-3.13/arch-sh-kernel-kgdb.c-add-missing-include-linux-sched.h.patch @@ -0,0 +1,43 @@ +From 53a52f17d96c8d47c79a7dafa81426317e89c7c1 Mon Sep 17 00:00:00 2001 +From: Wanlong Gao +Date: Tue, 21 Jan 2014 15:48:41 -0800 +Subject: arch/sh/kernel/kgdb.c: add missing #include + +From: Wanlong Gao + +commit 53a52f17d96c8d47c79a7dafa81426317e89c7c1 upstream. + + arch/sh/kernel/kgdb.c: In function 'sleeping_thread_to_gdb_regs': + arch/sh/kernel/kgdb.c:225:32: error: implicit declaration of function 'task_stack_page' [-Werror=implicit-function-declaration] + arch/sh/kernel/kgdb.c:242:23: error: dereferencing pointer to incomplete type + arch/sh/kernel/kgdb.c:243:22: error: dereferencing pointer to incomplete type + arch/sh/kernel/kgdb.c: In function 'singlestep_trap_handler': + arch/sh/kernel/kgdb.c:310:27: error: 'SIGTRAP' undeclared (first use in this function) + arch/sh/kernel/kgdb.c:310:27: note: each undeclared identifier is reported only once for each function it appears in + +This was introduced by commit 16559ae48c76 ("kgdb: remove #include + from kgdb.h"). 
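+ +(A note for context, not part of the upstream changelog: every symbol in the error list above reaches arch/sh/kernel/kgdb.c only through <linux/sched.h> -- task_stack_page() is defined there, and SIGTRAP is believed to come in via the <linux/signal.h> include it pulls in -- so a file that relied on kgdb.h providing it transitively must now contain + + #include <linux/sched.h> + +which is exactly the one-line change below.)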
+ +[geert@linux-m68k.org: reworded and reformatted] +Signed-off-by: Wanlong Gao +Signed-off-by: Geert Uytterhoeven +Reported-by: Fengguang Wu +Acked-by: Greg Kroah-Hartman +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + arch/sh/kernel/kgdb.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/arch/sh/kernel/kgdb.c ++++ b/arch/sh/kernel/kgdb.c +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include + #include + diff --git a/queue-3.13/audit-correct-a-type-mismatch-in-audit_syscall_exit.patch b/queue-3.13/audit-correct-a-type-mismatch-in-audit_syscall_exit.patch new file mode 100644 index 00000000000..dab9dfbaab3 --- /dev/null +++ b/queue-3.13/audit-correct-a-type-mismatch-in-audit_syscall_exit.patch @@ -0,0 +1,39 @@ +From 06bdadd7634551cfe8ce071fe44d0311b3033d9e Mon Sep 17 00:00:00 2001 +From: AKASHI Takahiro +Date: Mon, 13 Jan 2014 13:33:09 -0800 +Subject: audit: correct a type mismatch in audit_syscall_exit() + +From: AKASHI Takahiro + +commit 06bdadd7634551cfe8ce071fe44d0311b3033d9e upstream. + +audit_syscall_exit() saves a result of regs_return_value() in an intermediate +"int" variable and passes it to __audit_syscall_exit(), which expects its +second argument as a "long" value. This will result in truncating the +value returned by a system call and making a wrong audit record. + +I don't know why the gcc compiler doesn't complain about this, but anyway it +causes a problem at runtime on arm64 (and probably most 64-bit archs). + +Signed-off-by: AKASHI Takahiro +Cc: Al Viro +Cc: Eric Paris +Signed-off-by: Andrew Morton +Signed-off-by: Eric Paris +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/audit.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/include/linux/audit.h ++++ b/include/linux/audit.h +@@ -137,7 +137,7 @@ static inline void audit_syscall_exit(vo + { + if (unlikely(current->audit_context)) { + int success = is_syscall_success(pt_regs); +- int return_code = regs_return_value(pt_regs); ++ long return_code = regs_return_value(pt_regs); + + __audit_syscall_exit(success, return_code); + } diff --git a/queue-3.13/audit-reset-audit-backlog-wait-time-after-error-recovery.patch b/queue-3.13/audit-reset-audit-backlog-wait-time-after-error-recovery.patch new file mode 100644 index 00000000000..c9ad481de71 --- /dev/null +++ b/queue-3.13/audit-reset-audit-backlog-wait-time-after-error-recovery.patch @@ -0,0 +1,48 @@ +From e789e561a50de0aaa8c695662d97aaa5eac9d55f Mon Sep 17 00:00:00 2001 +From: Richard Guy Briggs +Date: Thu, 12 Sep 2013 23:03:51 -0400 +Subject: audit: reset audit backlog wait time after error recovery + +From: Richard Guy Briggs + +commit e789e561a50de0aaa8c695662d97aaa5eac9d55f upstream. + +When the audit queue overflows and times out (audit_backlog_wait_time), the +audit queue overflow timeout is set to zero. Once the audit queue overflow +timeout condition recovers, the timeout should be reset to the original value. + +See also: + https://lkml.org/lkml/2013/9/2/473 + +Signed-off-by: Luiz Capitulino +Signed-off-by: Dan Duval +Signed-off-by: Chuck Anderson +Signed-off-by: Richard Guy Briggs +Signed-off-by: Eric Paris +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/audit.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +--- a/kernel/audit.c ++++ b/kernel/audit.c +@@ -102,7 +102,8 @@ static int audit_rate_limit; + + /* Number of outstanding audit_buffers allowed.
*/ + static int audit_backlog_limit = 64; +-static int audit_backlog_wait_time = 60 * HZ; ++#define AUDIT_BACKLOG_WAIT_TIME (60 * HZ) ++static int audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; + static int audit_backlog_wait_overflow = 0; + + /* The identity of the user shutting down the audit system. */ +@@ -1239,6 +1240,8 @@ struct audit_buffer *audit_log_start(str + return NULL; + } + ++ audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; ++ + ab = audit_buffer_alloc(ctx, gfp_mask, type); + if (!ab) { + audit_log_lost("out of memory in audit_log_start"); diff --git a/queue-3.13/ftrace-fix-synchronization-location-disabling-and-freeing-ftrace_ops.patch b/queue-3.13/ftrace-fix-synchronization-location-disabling-and-freeing-ftrace_ops.patch new file mode 100644 index 00000000000..f505f789a6b --- /dev/null +++ b/queue-3.13/ftrace-fix-synchronization-location-disabling-and-freeing-ftrace_ops.patch @@ -0,0 +1,111 @@ +From a4c35ed241129dd142be4cadb1e5a474a56d5464 Mon Sep 17 00:00:00 2001 +From: "Steven Rostedt (Red Hat)" +Date: Mon, 13 Jan 2014 12:56:21 -0500 +Subject: ftrace: Fix synchronization location disabling and freeing ftrace_ops + +From: "Steven Rostedt (Red Hat)" + +commit a4c35ed241129dd142be4cadb1e5a474a56d5464 upstream. + +The synchronization needed after ftrace_ops are unregistered must happen +after the callback is disabled from being called by functions. + +The current location happens after the function is being removed from the +internal lists, but not after the function callbacks were disabled, leaving +the functions susceptible to being called after their callbacks are freed. + +This affects perf and any external users of function tracing (LTTng and +SystemTap). + +Fixes: cdbe61bfe704 "ftrace: Allow dynamically allocated function tracers" +Signed-off-by: Steven Rostedt +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/trace/ftrace.c | 58 +++++++++++++++++++++++++++----------------------- + 1 file changed, 32 insertions(+), 26 deletions(-) + +--- a/kernel/trace/ftrace.c ++++ b/kernel/trace/ftrace.c +@@ -447,20 +447,6 @@ static int __unregister_ftrace_function( + } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { + ret = remove_ftrace_list_ops(&ftrace_control_list, + &control_ops, ops); +- if (!ret) { +- /* +- * The ftrace_ops is now removed from the list, +- * so there'll be no new users. We must ensure +- * all current users are done before we free +- * the control data. +- * Note synchronize_sched() is not enough, as we +- * use preempt_disable() to do RCU, but the function +- * tracer can be called where RCU is not active +- * (before user_exit()). +- */ +- schedule_on_each_cpu(ftrace_sync); +- control_ops_free(ops); +- } + } else + ret = remove_ftrace_ops(&ftrace_ops_list, ops); + +@@ -470,17 +456,6 @@ static int __unregister_ftrace_function( + if (ftrace_enabled) + update_ftrace_function(); + +- /* +- * Dynamic ops may be freed, we must make sure that all +- * callers are done before leaving this function. +- * +- * Again, normal synchronize_sched() is not good enough. +- * We need to do a hard force of sched synchronization. +- */ +- if (ops->flags & FTRACE_OPS_FL_DYNAMIC) +- schedule_on_each_cpu(ftrace_sync); +- +- + return 0; + } + +@@ -2164,10 +2139,41 @@ static int ftrace_shutdown(struct ftrace + command |= FTRACE_UPDATE_TRACE_FUNC; + } + +- if (!command || !ftrace_enabled) ++ if (!command || !ftrace_enabled) { ++ /* ++ * If these are control ops, they still need their ++ * per_cpu field freed.
Since function tracing is ++ * not currently active, we can just free them ++ * without synchronizing all CPUs. ++ */ ++ if (ops->flags & FTRACE_OPS_FL_CONTROL) ++ control_ops_free(ops); + return 0; ++ } + + ftrace_run_update_code(command); ++ ++ /* ++ * Dynamic ops may be freed, we must make sure that all ++ * callers are done before leaving this function. ++ * The same goes for freeing the per_cpu data of the control ++ * ops. ++ * ++ * Again, normal synchronize_sched() is not good enough. ++ * We need to do a hard force of sched synchronization. ++ * This is because we use preempt_disable() to do RCU, but ++ * the function tracers can be called where RCU is not watching ++ * (like before user_exit()). We can not rely on the RCU ++ * infrastructure to do the synchronization, thus we must do it ++ * ourselves. ++ */ ++ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) { ++ schedule_on_each_cpu(ftrace_sync); ++ ++ if (ops->flags & FTRACE_OPS_FL_CONTROL) ++ control_ops_free(ops); ++ } ++ + return 0; + } + diff --git a/queue-3.13/ftrace-have-function-graph-only-trace-based-on-global_ops-filters.patch b/queue-3.13/ftrace-have-function-graph-only-trace-based-on-global_ops-filters.patch new file mode 100644 index 00000000000..9ae9d2d1c76 --- /dev/null +++ b/queue-3.13/ftrace-have-function-graph-only-trace-based-on-global_ops-filters.patch @@ -0,0 +1,184 @@ +From 23a8e8441a0a74dd612edf81dc89d1600bc0a3d1 Mon Sep 17 00:00:00 2001 +From: "Steven Rostedt (Red Hat)" +Date: Mon, 13 Jan 2014 10:30:23 -0500 +Subject: ftrace: Have function graph only trace based on global_ops filters + +From: "Steven Rostedt (Red Hat)" + +commit 23a8e8441a0a74dd612edf81dc89d1600bc0a3d1 upstream. + +Doing some different tests, I discovered that function graph tracing, when +filtered via the set_ftrace_filter and set_ftrace_notrace files, does +not always keep to them if another function ftrace_ops is registered +to trace functions. + +The reason is that function graph just happens to trace all functions +that the function tracer enables. When there was only one user of +function tracing, the function graph tracer did not need to worry about +being called by functions that it did not want to trace. But now that there +are other users, this becomes a problem. + +For example, one just needs to do the following: + + # cd /sys/kernel/debug/tracing + # echo schedule > set_ftrace_filter + # echo function_graph > current_tracer + # cat trace +[..] + 0) | schedule() { + ------------------------------------------ + 0) -0 => rcu_pre-7 + ------------------------------------------ + + 0) ! 2980.314 us | } + 0) | schedule() { + ------------------------------------------ + 0) rcu_pre-7 => -0 + ------------------------------------------ + + 0) + 20.701 us | } + + # echo 1 > /proc/sys/kernel/stack_tracer_enabled + # cat trace +[..]
+ 1) + 20.825 us | } + 1) + 21.651 us | } + 1) + 30.924 us | } /* SyS_ioctl */ + 1) | do_page_fault() { + 1) | __do_page_fault() { + 1) 0.274 us | down_read_trylock(); + 1) 0.098 us | find_vma(); + 1) | handle_mm_fault() { + 1) | _raw_spin_lock() { + 1) 0.102 us | preempt_count_add(); + 1) 0.097 us | do_raw_spin_lock(); + 1) 2.173 us | } + 1) | do_wp_page() { + 1) 0.079 us | vm_normal_page(); + 1) 0.086 us | reuse_swap_page(); + 1) 0.076 us | page_move_anon_rmap(); + 1) | unlock_page() { + 1) 0.082 us | page_waitqueue(); + 1) 0.086 us | __wake_up_bit(); + 1) 1.801 us | } + 1) 0.075 us | ptep_set_access_flags(); + 1) | _raw_spin_unlock() { + 1) 0.098 us | do_raw_spin_unlock(); + 1) 0.105 us | preempt_count_sub(); + 1) 1.884 us | } + 1) 9.149 us | } + 1) + 13.083 us | } + 1) 0.146 us | up_read(); + +When the stack tracer was enabled, it enabled all functions to be traced, and +the function graph tracer now traced them as well. This is a side effect that +should not occur. + +To fix this, a test is added when the function tracing is changed, as well as when +the graph tracer is enabled, to see if anything other than the ftrace global_ops +function tracer is enabled. If so, then the graph tracer calls a test trampoline +that will look at the function that is being traced and compare it with the +filters defined by the global_ops. + +As an optimization, if there are no other function tracers registered, or if +the only registered function tracers also use the global ops, the function +graph infrastructure will call the registered function graph callback directly +and not go through the test trampoline. + +Fixes: d2d45c7a03a2 "tracing: Have stack_tracer use a separate list of functions" +Signed-off-by: Steven Rostedt +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/trace/ftrace.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 44 insertions(+), 1 deletion(-) + +--- a/kernel/trace/ftrace.c ++++ b/kernel/trace/ftrace.c +@@ -278,6 +278,12 @@ static void update_global_ops(void) + global_ops.func = func; + } + ++#ifdef CONFIG_FUNCTION_GRAPH_TRACER ++static void update_function_graph_func(void); ++#else ++static inline void update_function_graph_func(void) { } ++#endif ++ + static void update_ftrace_function(void) + { + ftrace_func_t func; +@@ -325,6 +331,8 @@ static int remove_ftrace_ops(struct ftra + { + struct ftrace_ops **p; + ++ update_function_graph_func(); ++ + /* + * If we are removing the last function, then simply point + * to the ftrace_stub. +@@ -4862,6 +4870,7 @@ int ftrace_graph_entry_stub(struct ftrac + trace_func_graph_ret_t ftrace_graph_return = + (trace_func_graph_ret_t)ftrace_stub; + trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; ++static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub; + + /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ + static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) +@@ -5003,6 +5012,30 @@ static struct ftrace_ops fgraph_ops __re + FTRACE_OPS_FL_RECURSION_SAFE, + }; + ++static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) ++{ ++ if (!ftrace_ops_test(&global_ops, trace->func, NULL)) ++ return 0; ++ return __ftrace_graph_entry(trace); ++} ++ ++/* ++ * The function graph tracer should only trace the functions defined ++ * by set_ftrace_filter and set_ftrace_notrace.
If another function ++ * tracer ops is registered, the graph tracer requires testing the ++ * function against the global ops, and not just trace any function ++ * that any ftrace_ops has registered. ++ */ ++static void update_function_graph_func(void) ++{ ++ if (ftrace_ops_list == &ftrace_list_end || ++ (ftrace_ops_list == &global_ops && ++ global_ops.next == &ftrace_list_end)) ++ ftrace_graph_entry = __ftrace_graph_entry; ++ else ++ ftrace_graph_entry = ftrace_graph_entry_test; ++} ++ + int register_ftrace_graph(trace_func_graph_ret_t retfunc, + trace_func_graph_ent_t entryfunc) + { +@@ -5027,7 +5060,16 @@ int register_ftrace_graph(trace_func_gra + } + + ftrace_graph_return = retfunc; +- ftrace_graph_entry = entryfunc; ++ ++ /* ++ * Update the indirect function to the entryfunc, and the ++ * function that gets called to the entry_test first. Then ++ * call the update fgraph entry function to determine if ++ * the entryfunc should be called directly or not. ++ */ ++ __ftrace_graph_entry = entryfunc; ++ ftrace_graph_entry = ftrace_graph_entry_test; ++ update_function_graph_func(); + + ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET); + +@@ -5046,6 +5088,7 @@ void unregister_ftrace_graph(void) + ftrace_graph_active--; + ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; + ftrace_graph_entry = ftrace_graph_entry_stub; ++ __ftrace_graph_entry = ftrace_graph_entry_stub; + ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET); + unregister_pm_notifier(&ftrace_suspend_notifier); + unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); diff --git a/queue-3.13/fuse-fix-pipe_buf_operations.patch b/queue-3.13/fuse-fix-pipe_buf_operations.patch new file mode 100644 index 00000000000..0b5abd01c7d --- /dev/null +++ b/queue-3.13/fuse-fix-pipe_buf_operations.patch @@ -0,0 +1,152 @@ +From 28a625cbc2a14f17b83e47ef907b2658576a32aa Mon Sep 17 00:00:00 2001 +From: Miklos Szeredi +Date: Wed, 22 Jan 2014 19:36:57 +0100 +Subject: fuse: fix pipe_buf_operations + +From: Miklos Szeredi + +commit 28a625cbc2a14f17b83e47ef907b2658576a32aa upstream. + +Having this struct in module memory could Oops if the module is +unloaded while the buffer still persists in a pipe. + +Since sock_pipe_buf_ops is essentially the same as fuse_dev_pipe_buf_ops, +merge them into nosteal_pipe_buf_ops (this is the same as +default_pipe_buf_ops except stealing the page from the buffer is not +allowed).
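+ +To sketch the failure mode (an illustration, not code from this patch): a pipe buffer can outlive the splice() call that filled it, so with the old ops a sequence like + + buf->ops = &fuse_dev_pipe_buf_ops; /* points into fuse module text */ + /* ... data stays queued in the pipe, fuse module is unloaded ... */ + buf->ops->release(pipe, buf); /* call through a dangling pointer */ + +can end in an Oops. Once the ops live in core kernel code (fs/splice.c), the buffer no longer depends on the module's lifetime.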
+ +Reported-by: Al Viro +Signed-off-by: Miklos Szeredi +Signed-off-by: Greg Kroah-Hartman + +--- + fs/fuse/dev.c | 22 +++++----------------- + fs/splice.c | 18 ++++++++++++++++++ + include/linux/pipe_fs_i.h | 2 ++ + net/core/skbuff.c | 32 +------------------------------- + 4 files changed, 26 insertions(+), 48 deletions(-) + +--- a/fs/fuse/dev.c ++++ b/fs/fuse/dev.c +@@ -1296,22 +1296,6 @@ static ssize_t fuse_dev_read(struct kioc + return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs)); + } + +-static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe, +- struct pipe_buffer *buf) +-{ +- return 1; +-} +- +-static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = { +- .can_merge = 0, +- .map = generic_pipe_buf_map, +- .unmap = generic_pipe_buf_unmap, +- .confirm = generic_pipe_buf_confirm, +- .release = generic_pipe_buf_release, +- .steal = fuse_dev_pipe_buf_steal, +- .get = generic_pipe_buf_get, +-}; +- + static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +@@ -1358,7 +1342,11 @@ static ssize_t fuse_dev_splice_read(stru + buf->page = bufs[page_nr].page; + buf->offset = bufs[page_nr].offset; + buf->len = bufs[page_nr].len; +- buf->ops = &fuse_dev_pipe_buf_ops; ++ /* ++ * Need to be careful about this. Having buf->ops in module ++ * code can Oops if the buffer persists after module unload. ++ */ ++ buf->ops = &nosteal_pipe_buf_ops; + + pipe->nrbufs++; + page_nr++; +--- a/fs/splice.c ++++ b/fs/splice.c +@@ -555,6 +555,24 @@ static const struct pipe_buf_operations + .get = generic_pipe_buf_get, + }; + ++static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, ++ struct pipe_buffer *buf) ++{ ++ return 1; ++} ++ ++/* Pipe buffer operations for a socket and similar. */ ++const struct pipe_buf_operations nosteal_pipe_buf_ops = { ++ .can_merge = 0, ++ .map = generic_pipe_buf_map, ++ .unmap = generic_pipe_buf_unmap, ++ .confirm = generic_pipe_buf_confirm, ++ .release = generic_pipe_buf_release, ++ .steal = generic_pipe_buf_nosteal, ++ .get = generic_pipe_buf_get, ++}; ++EXPORT_SYMBOL(nosteal_pipe_buf_ops); ++ + static ssize_t kernel_readv(struct file *file, const struct iovec *vec, + unsigned long vlen, loff_t offset) + { +--- a/include/linux/pipe_fs_i.h ++++ b/include/linux/pipe_fs_i.h +@@ -157,6 +157,8 @@ int generic_pipe_buf_confirm(struct pipe + int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); + void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); + ++extern const struct pipe_buf_operations nosteal_pipe_buf_ops; ++ + /* for F_SETPIPE_SZ and F_GETPIPE_SZ */ + long pipe_fcntl(struct file *, unsigned int, unsigned long arg); + struct pipe_inode_info *get_pipe_info(struct file *file); +--- a/net/core/skbuff.c ++++ b/net/core/skbuff.c +@@ -74,36 +74,6 @@ + struct kmem_cache *skbuff_head_cache __read_mostly; + static struct kmem_cache *skbuff_fclone_cache __read_mostly; + +-static void sock_pipe_buf_release(struct pipe_inode_info *pipe, +- struct pipe_buffer *buf) +-{ +- put_page(buf->page); +-} +- +-static void sock_pipe_buf_get(struct pipe_inode_info *pipe, +- struct pipe_buffer *buf) +-{ +- get_page(buf->page); +-} +- +-static int sock_pipe_buf_steal(struct pipe_inode_info *pipe, +- struct pipe_buffer *buf) +-{ +- return 1; +-} +- +- +-/* Pipe buffer operations for a socket. 
*/ +-static const struct pipe_buf_operations sock_pipe_buf_ops = { +- .can_merge = 0, +- .map = generic_pipe_buf_map, +- .unmap = generic_pipe_buf_unmap, +- .confirm = generic_pipe_buf_confirm, +- .release = sock_pipe_buf_release, +- .steal = sock_pipe_buf_steal, +- .get = sock_pipe_buf_get, +-}; +- + /** + * skb_panic - private function for out-of-line support + * @skb: buffer +@@ -1830,7 +1800,7 @@ int skb_splice_bits(struct sk_buff *skb, + .partial = partial, + .nr_pages_max = MAX_SKB_FRAGS, + .flags = flags, +- .ops = &sock_pipe_buf_ops, ++ .ops = &nosteal_pipe_buf_ops, + .spd_release = sock_spd_release, + }; + struct sk_buff *frag_iter; diff --git a/queue-3.13/intel-iommu-fix-off-by-one-in-pagetable-freeing.patch b/queue-3.13/intel-iommu-fix-off-by-one-in-pagetable-freeing.patch new file mode 100644 index 00000000000..86e14c028e5 --- /dev/null +++ b/queue-3.13/intel-iommu-fix-off-by-one-in-pagetable-freeing.patch @@ -0,0 +1,61 @@ +From 08336fd218e087cc4fcc458e6b6dcafe8702b098 Mon Sep 17 00:00:00 2001 +From: Alex Williamson +Date: Tue, 21 Jan 2014 15:48:18 -0800 +Subject: intel-iommu: fix off-by-one in pagetable freeing + +From: Alex Williamson + +commit 08336fd218e087cc4fcc458e6b6dcafe8702b098 upstream. + +dma_pte_free_level() has an off-by-one error when checking whether a pte +is completely covered by a range. Take for example the case of +attempting to free pfn 0x0 - 0x1ff, ie. 512 entries covering the first +2M superpage. + +The level_size() is 0x200 and we test: + + static void dma_pte_free_level(... + ... + + if (!(0 > 0 || 0x1ff < 0 + 0x200)) { + ... + } + +Clearly the 2nd test is true, which means we fail to take the branch to +clear and free the pagetable entry. As a result, we're leaking +pagetables and failing to install new pages over the range. + +This was found with a PCI device assigned to a QEMU guest using vfio-pci +without a VGA device present. The first 1M of guest address space is +mapped with various combinations of 4K pages, but eventually the range +is entirely freed and replaced with a 2M contiguous mapping. +intel-iommu errors out with something like: + + ERROR: DMA PTE for vPFN 0x0 already set (to 5c2b8003 not 849c00083) + +In this case 5c2b8003 is the pointer to the previous leaf page that was +neither freed nor cleared and 849c00083 is the superpage entry that +we're trying to replace it with. 
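+ +Spelling out both evaluations (a worked illustration, not part of the patch): with start_pfn = 0, last_pfn = 0x1ff, level_pfn = 0 and level_size(level) = 0x200, the old check computes + + if (!(0 > 0 || 0x1ff < 0x200)) /* second test true, free skipped */ + +while the fixed check, treating last_pfn as inclusive, computes + + if (!(0 > 0 || 0x1ff < 0x1ff)) /* both tests false, pagetable freed */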
+ +Signed-off-by: Alex Williamson +Cc: David Woodhouse +Cc: Joerg Roedel +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/iommu/intel-iommu.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/iommu/intel-iommu.c ++++ b/drivers/iommu/intel-iommu.c +@@ -917,7 +917,7 @@ static void dma_pte_free_level(struct dm + + /* If range covers entire pagetable, free it */ + if (!(start_pfn > level_pfn || +- last_pfn < level_pfn + level_size(level))) { ++ last_pfn < level_pfn + level_size(level) - 1)) { + dma_clear_pte(pte); + domain_flush_cache(domain, pte, sizeof(*pte)); + free_pgtable_page(level_pte); diff --git a/queue-3.13/memcg-fix-css-reference-leak-and-endless-loop-in-mem_cgroup_iter.patch b/queue-3.13/memcg-fix-css-reference-leak-and-endless-loop-in-mem_cgroup_iter.patch new file mode 100644 index 00000000000..a3490fc0638 --- /dev/null +++ b/queue-3.13/memcg-fix-css-reference-leak-and-endless-loop-in-mem_cgroup_iter.patch @@ -0,0 +1,79 @@ +From 0eef615665ede1e0d603ea9ecca88c1da6f02234 Mon Sep 17 00:00:00 2001 +From: Michal Hocko +Date: Thu, 23 Jan 2014 15:53:37 -0800 +Subject: memcg: fix css reference leak and endless loop in mem_cgroup_iter + +From: Michal Hocko + +commit 0eef615665ede1e0d603ea9ecca88c1da6f02234 upstream. + +Commit 19f39402864e ("memcg: simplify mem_cgroup_iter") has reorganized +mem_cgroup_iter code in order to simplify it. A part of that change was +dropping an optimization which didn't call css_tryget on the root of the +walked tree. The patch however didn't change the css_put part in +mem_cgroup_iter which excludes root. + +This wasn't an issue at the time because __mem_cgroup_iter_next bailed +out for root early without taking a reference as cgroup iterators +(css_next_descendant_pre) didn't visit root themselves. + +Nevertheless cgroup iterators have been reworked to visit root by commit +bd8815a6d802 ("cgroup: make css_for_each_descendant() and friends +include the origin css in the iteration") when the root bypass has been +dropped in __mem_cgroup_iter_next. This means that css_put is not +called for root and so the css, along with the mem_cgroup and other cgroup +internal objects tied by css lifetime, are never freed. + +Fix the issue by reintroducing the root check in __mem_cgroup_iter_next and +not taking a css reference for it. + +This reference counting magic also protects us from another issue, an +endless loop reported by Hugh Dickins when reclaim races with root +removal and css_tryget called by iterator internally would fail. There +would be no other nodes to visit so __mem_cgroup_iter_next would return +NULL and mem_cgroup_iter would interpret it as "start looping from root +again" and so mem_cgroup_iter would loop forever internally. + +Signed-off-by: Michal Hocko +Reported-by: Hugh Dickins +Tested-by: Hugh Dickins +Cc: Johannes Weiner +Cc: Greg Thelen +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/memcontrol.c | 18 +++++++++++++----- + 1 file changed, 13 insertions(+), 5 deletions(-) + +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -1098,14 +1098,22 @@ skip_node: + * skipped and we should continue the tree walk. + * last_visited css is safe to use because it is + * protected by css_get and the tree walk is rcu safe.
++ * ++ * We do not take a reference on the root of the tree walk ++ * because we might race with the root removal when it would ++ * be the only node in the iterated hierarchy and mem_cgroup_iter ++ * would end up in an endless loop because it expects that at ++ * least one valid node will be returned. Root cannot disappear ++ * because caller of the iterator should hold it already so ++ * skipping css reference should be safe. + */ + if (next_css) { +- if ((next_css->flags & CSS_ONLINE) && css_tryget(next_css)) ++ if ((next_css->flags & CSS_ONLINE) && ++ (next_css == &root->css || css_tryget(next_css))) + return mem_cgroup_from_css(next_css); +- else { +- prev_css = next_css; +- goto skip_node; +- } ++ ++ prev_css = next_css; ++ goto skip_node; + } + + return NULL; diff --git a/queue-3.13/memcg-fix-endless-loop-caused-by-mem_cgroup_iter.patch b/queue-3.13/memcg-fix-endless-loop-caused-by-mem_cgroup_iter.patch new file mode 100644 index 00000000000..f1064cdc891 --- /dev/null +++ b/queue-3.13/memcg-fix-endless-loop-caused-by-mem_cgroup_iter.patch @@ -0,0 +1,98 @@ +From ecc736fc3c71c411a9d201d8588c9e7e049e5d8c Mon Sep 17 00:00:00 2001 +From: Michal Hocko +Date: Thu, 23 Jan 2014 15:53:35 -0800 +Subject: memcg: fix endless loop caused by mem_cgroup_iter + +From: Michal Hocko + +commit ecc736fc3c71c411a9d201d8588c9e7e049e5d8c upstream. + +Hugh has reported an endless loop when the hardlimit reclaim sees the +same group all the time. This might happen when the reclaim races with +the memcg removal. + +shrink_zone + [rmdir root] + mem_cgroup_iter(root, NULL, reclaim) + // prev = NULL + rcu_read_lock() + mem_cgroup_iter_load + last_visited = iter->last_visited // gets root || NULL + css_tryget(last_visited) // failed + last_visited = NULL [1] + memcg = root = __mem_cgroup_iter_next(root, NULL) + mem_cgroup_iter_update + iter->last_visited = root; + reclaim->generation = iter->generation + + mem_cgroup_iter(root, root, reclaim) + // prev = root + rcu_read_lock + mem_cgroup_iter_load + last_visited = iter->last_visited // gets root + css_tryget(last_visited) // failed + [1] + +The issue seemed to be introduced by commit 5f5781619718 ("memcg: relax +memcg iter caching") which has replaced unconditional css_get/css_put by +css_tryget/css_put for the cached iterator. + +This patch fixes the issue by skipping css_tryget on the root of the +tree walk in mem_cgroup_iter_load and symmetrically doesn't release it +in mem_cgroup_iter_update. + +Signed-off-by: Michal Hocko +Reported-by: Hugh Dickins +Tested-by: Hugh Dickins +Cc: Johannes Weiner +Cc: Greg Thelen +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/memcontrol.c | 17 ++++++++++++++--- + 1 file changed, 14 insertions(+), 3 deletions(-) + +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -1139,7 +1139,15 @@ mem_cgroup_iter_load(struct mem_cgroup_r + if (iter->last_dead_count == *sequence) { + smp_rmb(); + position = iter->last_visited; +- if (position && !css_tryget(&position->css)) ++ ++ /* ++ * We cannot take a reference to root because we might race ++ * with root removal and returning NULL would end up in ++ * an endless loop on the iterator user level when root ++ * would be returned all the time. 
++ */ + if (position && position != root && ++ !css_tryget(&position->css)) + position = NULL; + } + return position; +@@ -1148,9 +1156,11 @@ mem_cgroup_iter_load(struct mem_cgroup_r + static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, + struct mem_cgroup *last_visited, + struct mem_cgroup *new_position, ++ struct mem_cgroup *root, + int sequence) + { +- if (last_visited) ++ /* root reference counting symmetric to mem_cgroup_iter_load */ ++ if (last_visited && last_visited != root) + css_put(&last_visited->css); + /* + * We store the sequence count from the time @last_visited was +@@ -1225,7 +1235,8 @@ struct mem_cgroup *mem_cgroup_iter(struc + memcg = __mem_cgroup_iter_next(root, last_visited); + + if (reclaim) { +- mem_cgroup_iter_update(iter, last_visited, memcg, seq); ++ mem_cgroup_iter_update(iter, last_visited, memcg, root, ++ seq); + + if (!memcg) + iter->generation++; diff --git a/queue-3.13/mm-don-t-lose-the-soft_dirty-flag-on-mprotect.patch b/queue-3.13/mm-don-t-lose-the-soft_dirty-flag-on-mprotect.patch new file mode 100644 index 00000000000..ddcb84c2db0 --- /dev/null +++ b/queue-3.13/mm-don-t-lose-the-soft_dirty-flag-on-mprotect.patch @@ -0,0 +1,52 @@ +From 24f91eba18bbfdb27e71a1aae5b3a61b67fcd091 Mon Sep 17 00:00:00 2001 +From: Andrey Vagin +Date: Thu, 30 Jan 2014 15:46:10 -0800 +Subject: mm: don't lose the SOFT_DIRTY flag on mprotect + +From: Andrey Vagin + +commit 24f91eba18bbfdb27e71a1aae5b3a61b67fcd091 upstream. + +The SOFT_DIRTY bit shows that the content of memory was changed after a +defined point in the past. mprotect() doesn't change the content of +memory, so it must not change the SOFT_DIRTY bit. + +This bug causes a malfunction: on the first iteration all pages are +dumped. On other iterations only pages with the SOFT_DIRTY bit are +dumped. So if the SOFT_DIRTY bit is cleared from a page by mistake, the +page is not dumped and its content will be restored incorrectly. + +This patch does nothing with _PAGE_SWP_SOFT_DIRTY, because pte_modify() +is called only for present pages. + +Fixes commit 0f8975ec4db2 ("mm: soft-dirty bits for user memory changes +tracking"). + +Signed-off-by: Andrey Vagin +Acked-by: Cyrill Gorcunov +Cc: Thomas Gleixner +Cc: Ingo Molnar +Cc: "H. Peter Anvin" +Cc: Pavel Emelyanov +Cc: Borislav Petkov +Cc: Wen Congyang +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/include/asm/pgtable_types.h | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/arch/x86/include/asm/pgtable_types.h ++++ b/arch/x86/include/asm/pgtable_types.h +@@ -121,7 +121,8 @@ + + /* Set of bits not changed in pte_modify */ + #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ +- _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) ++ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ ++ _PAGE_SOFT_DIRTY) + #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) + + #define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) diff --git a/queue-3.13/mm-ignore-vm_softdirty-on-vma-merging.patch b/queue-3.13/mm-ignore-vm_softdirty-on-vma-merging.patch new file mode 100644 index 00000000000..b33ab868020 --- /dev/null +++ b/queue-3.13/mm-ignore-vm_softdirty-on-vma-merging.patch @@ -0,0 +1,83 @@ +From 34228d473efe764d4db7c0536375f0c993e6e06a Mon Sep 17 00:00:00 2001 +From: Cyrill Gorcunov +Date: Thu, 23 Jan 2014 15:53:42 -0800 +Subject: mm: ignore VM_SOFTDIRTY on VMA merging + +From: Cyrill Gorcunov + +commit 34228d473efe764d4db7c0536375f0c993e6e06a upstream.
+ +The VM_SOFTDIRTY bit affects the vma merge routine: if two VMAs have all bits +in vm_flags matched except the dirty bit, the kernel can no longer merge them +and this forces the kernel to generate new VMAs instead. + +It finally may lead to the situation when a userspace application reaches the +vm.max_map_count limit and, in the worst case, crashes + + | (gimp:11768): GLib-ERROR **: gmem.c:110: failed to allocate 4096 bytes + | + | (file-tiff-load:12038): LibGimpBase-WARNING **: file-tiff-load: gimp_wire_read(): error + | xinit: connection to X server lost + | + | waiting for X server to shut down + | /usr/lib64/gimp/2.0/plug-ins/file-tiff-load terminated: Hangup + | /usr/lib64/gimp/2.0/plug-ins/script-fu terminated: Hangup + | /usr/lib64/gimp/2.0/plug-ins/script-fu terminated: Hangup + + https://bugzilla.kernel.org/show_bug.cgi?id=67651 + https://bugzilla.gnome.org/show_bug.cgi?id=719619#c0 + +The initial problem came from a missed VM_SOFTDIRTY in the do_brk() routine, but +even if we would set up VM_SOFTDIRTY here, there is still a way to +prevent VMAs from merging: one can call + + | echo 4 > /proc/$PID/clear_refs + +and clear all VM_SOFTDIRTY over all VMAs presented in the memory map, then +a new do_brk() will try to extend the old VMA and find that the dirty bit doesn't +match, thus a new VMA will be generated. + +As discussed with Pavel, the right approach should be to ignore the +VM_SOFTDIRTY bit when we're trying to merge VMAs and, if the merge succeeds, +we mark the extended VMA with the dirty bit where needed. + +Signed-off-by: Cyrill Gorcunov +Reported-by: Bastian Hougaard +Reported-by: Mel Gorman +Cc: Pavel Emelyanov +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/mmap.c | 12 ++++++++++-- + 1 file changed, 10 insertions(+), 2 deletions(-) + +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -893,7 +893,15 @@ again: remove_next = 1 + (end > next-> + static inline int is_mergeable_vma(struct vm_area_struct *vma, + struct file *file, unsigned long vm_flags) + { +- if (vma->vm_flags ^ vm_flags) ++ /* ++ * VM_SOFTDIRTY should not prevent VMA merging, if we ++ * match the flags but dirty bit -- the caller should mark ++ * merged VMA as dirty. If dirty bit won't be excluded from ++ * comparison, we increase pressure on the memory system forcing ++ * the kernel to generate new VMAs when old one could be ++ * extended instead. ++ */ ++ if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY) + return 0; + if (vma->vm_file != file) + return 0; +@@ -1082,7 +1090,7 @@ static int anon_vma_compatible(struct vm + return a->vm_end == b->vm_start && + mpol_equal(vma_policy(a), vma_policy(b)) && + a->vm_file == b->vm_file && +- !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) && ++ !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) && + b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); + } + diff --git a/queue-3.13/mm-memcg-iteration-skip-memcgs-not-yet-fully-initialized.patch b/queue-3.13/mm-memcg-iteration-skip-memcgs-not-yet-fully-initialized.patch new file mode 100644 index 00000000000..b8ff8cad910 --- /dev/null +++ b/queue-3.13/mm-memcg-iteration-skip-memcgs-not-yet-fully-initialized.patch @@ -0,0 +1,41 @@ +From d8ad30559715ce97afb7d1a93a12fd90e8fff312 Mon Sep 17 00:00:00 2001 +From: Hugh Dickins +Date: Thu, 23 Jan 2014 15:53:32 -0800 +Subject: mm/memcg: iteration skip memcgs not yet fully initialized + +From: Hugh Dickins + +commit d8ad30559715ce97afb7d1a93a12fd90e8fff312 upstream.
+ +It is surprising that the mem_cgroup iterator can return memcgs which +have not yet been fully initialized. By accident (or trial and error?) +this appears not to present an actual problem; but it may be better to +prevent such surprises, by skipping memcgs not yet online. + +Signed-off-by: Hugh Dickins +Cc: Tejun Heo +Acked-by: Michal Hocko +Cc: Johannes Weiner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/memcontrol.c | 6 ++---- + 1 file changed, 2 insertions(+), 4 deletions(-) + +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -1100,10 +1100,8 @@ skip_node: + * protected by css_get and the tree walk is rcu safe. + */ + if (next_css) { +- struct mem_cgroup *mem = mem_cgroup_from_css(next_css); +- +- if (css_tryget(&mem->css)) +- return mem; ++ if ((next_css->flags & CSS_ONLINE) && css_tryget(next_css)) ++ return mem_cgroup_from_css(next_css); + else { + prev_css = next_css; + goto skip_node; diff --git a/queue-3.13/mm-memory-failure.c-shift-page-lock-from-head-page-to-tail-page-after-thp-split.patch b/queue-3.13/mm-memory-failure.c-shift-page-lock-from-head-page-to-tail-page-after-thp-split.patch new file mode 100644 index 00000000000..8610322f726 --- /dev/null +++ b/queue-3.13/mm-memory-failure.c-shift-page-lock-from-head-page-to-tail-page-after-thp-split.patch @@ -0,0 +1,141 @@ +From 54b9dd14d09f24927285359a227aa363ce46089e Mon Sep 17 00:00:00 2001 +From: Naoya Horiguchi +Date: Thu, 23 Jan 2014 15:53:14 -0800 +Subject: mm/memory-failure.c: shift page lock from head page to tail page after thp split + +From: Naoya Horiguchi + +commit 54b9dd14d09f24927285359a227aa363ce46089e upstream. + +After thp split in hwpoison_user_mappings(), we hold the page lock on the +raw error page only around try_to_unmap, hence we are in danger of a race +condition. + +I found in the RHEL7 MCE-relay testing that we have "bad page" error +when a memory error happens on a thp tail page used by qemu-kvm: + + Triggering MCE exception on CPU 10 + mce: [Hardware Error]: Machine check events logged + MCE exception done on CPU 10 + MCE 0x38c535: Killing qemu-kvm:8418 due to hardware memory corruption + MCE 0x38c535: dirty LRU page recovery: Recovered + qemu-kvm[8418]: segfault at 20 ip 00007ffb0f0f229a sp 00007fffd6bc5240 error 4 in qemu-kvm[7ffb0ef14000+420000] + BUG: Bad page state in process qemu-kvm pfn:38c400 + page:ffffea000e310000 count:0 mapcount:0 mapping: (null) index:0x7ffae3c00 + page flags: 0x2fffff0008001d(locked|referenced|uptodate|dirty|swapbacked) + Modules linked in: hwpoison_inject mce_inject vhost_net macvtap macvlan ...
+ CPU: 0 PID: 8418 Comm: qemu-kvm Tainted: G M -------------- 3.10.0-54.0.1.el7.mce_test_fixed.x86_64 #1 + Hardware name: NEC NEC Express5800/R120b-1 [N8100-1719F]/MS-91E7-001, BIOS 4.6.3C19 02/10/2011 + Call Trace: + dump_stack+0x19/0x1b + bad_page.part.59+0xcf/0xe8 + free_pages_prepare+0x148/0x160 + free_hot_cold_page+0x31/0x140 + free_hot_cold_page_list+0x46/0xa0 + release_pages+0x1c1/0x200 + free_pages_and_swap_cache+0xad/0xd0 + tlb_flush_mmu.part.46+0x4c/0x90 + tlb_finish_mmu+0x55/0x60 + exit_mmap+0xcb/0x170 + mmput+0x67/0xf0 + vhost_dev_cleanup+0x231/0x260 [vhost_net] + vhost_net_release+0x3f/0x90 [vhost_net] + __fput+0xe9/0x270 + ____fput+0xe/0x10 + task_work_run+0xc4/0xe0 + do_exit+0x2bb/0xa40 + do_group_exit+0x3f/0xa0 + get_signal_to_deliver+0x1d0/0x6e0 + do_signal+0x48/0x5e0 + do_notify_resume+0x71/0xc0 + retint_signal+0x48/0x8c + +The reason for this bug is that a page fault happens before unlocking the +head page at the end of memory_failure(). This strange page fault is +trying to access address 0x20 and I'm not sure why qemu-kvm does +this, but anyway as a result the SIGSEGV makes qemu-kvm exit and on the +way we catch the bad page bug/warning because we try to free a locked +page (which was the former head page). + +To fix this, this patch shifts the page lock from the head page to the +tail page just after the thp split. SIGSEGV still happens, but it affects +only the error-affected VMs, not the whole system. + +Signed-off-by: Naoya Horiguchi +Cc: Andi Kleen +Cc: Wanpeng Li +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/memory-failure.c | 21 +++++++++++---------- + 1 file changed, 11 insertions(+), 10 deletions(-) + +--- a/mm/memory-failure.c ++++ b/mm/memory-failure.c +@@ -856,14 +856,14 @@ static int page_action(struct page_state + * the pages and send SIGBUS to the processes if the data was dirty. + */ + static int hwpoison_user_mappings(struct page *p, unsigned long pfn, +- int trapno, int flags) ++ int trapno, int flags, struct page **hpagep) + { + enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; + struct address_space *mapping; + LIST_HEAD(tokill); + int ret; + int kill = 1, forcekill; +- struct page *hpage = compound_head(p); ++ struct page *hpage = *hpagep; + struct page *ppage; + + if (PageReserved(p) || PageSlab(p)) +@@ -942,11 +942,14 @@ static int hwpoison_user_mappings(struct + * We pinned the head page for hwpoison handling, + * now we split the thp and we are interested in + * the hwpoisoned raw page, so move the refcount +- * to it. ++ * to it. Similarly, page lock is shifted. + */ + if (hpage != p) { + put_page(hpage); + get_page(p); ++ lock_page(p); ++ unlock_page(hpage); ++ *hpagep = p; + } + /* THP is split, so ppage should be the real poisoned page. */ + ppage = p; +@@ -964,17 +967,11 @@ static int hwpoison_user_mappings(struct + if (kill) + collect_procs(ppage, &tokill); + +- if (hpage != ppage) +- lock_page(ppage); +- + ret = try_to_unmap(ppage, ttu); + if (ret != SWAP_SUCCESS) + printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", + pfn, page_mapcount(ppage)); + +- if (hpage != ppage) +- unlock_page(ppage); +- + /* + * Now that the dirty bit has been propagated to the + * struct page and all unmaps done we can decide if +@@ -1193,8 +1190,12 @@ int memory_failure(unsigned long pfn, in + /* + * Now take care of user space mappings. + * Abort on fail: __delete_from_page_cache() assumes unmapped page.
++ * ++ * When the raw error page is thp tail page, hpage points to the raw ++ * page after thp split. + */ +- if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) { ++ if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) ++ != SWAP_SUCCESS) { + printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); + res = -EBUSY; + goto out; diff --git a/queue-3.13/mm-munlock-fix-potential-race-with-thp-page-split.patch b/queue-3.13/mm-munlock-fix-potential-race-with-thp-page-split.patch new file mode 100644 index 00000000000..040801de0ed --- /dev/null +++ b/queue-3.13/mm-munlock-fix-potential-race-with-thp-page-split.patch @@ -0,0 +1,207 @@ +From 01cc2e58697e34c6ee9a40fb6cebc18bf5a1923f Mon Sep 17 00:00:00 2001 +From: Vlastimil Babka +Date: Thu, 23 Jan 2014 15:52:50 -0800 +Subject: mm: munlock: fix potential race with THP page split + +From: Vlastimil Babka + +commit 01cc2e58697e34c6ee9a40fb6cebc18bf5a1923f upstream. + +Since commit ff6a6da60b89 ("mm: accelerate munlock() treatment of THP +pages") munlock skips tail pages of a munlocked THP page. There is some +attempt to prevent bad consequences of racing with a THP page split, but +code inspection indicates that there are two problems that may lead to a +non-fatal, yet wrong outcome. + +First, __split_huge_page_refcount() copies flags including PageMlocked +from the head page to the tail pages. Clearing PageMlocked by +munlock_vma_page() in the middle of this operation might result in part +of tail pages left with PageMlocked flag. As the head page still +appears to be a THP page until all tail pages are processed, +munlock_vma_page() might think it munlocked the whole THP page and skip +all the former tail pages. Before ff6a6da60, those pages would be +cleared in further iterations of munlock_vma_pages_range(), but NR_MLOCK +would still become undercounted (related to the next point). + +Second, NR_MLOCK accounting is based on call to hpage_nr_pages() after +the PageMlocked is cleared. The accounting might also become +inconsistent due to race with __split_huge_page_refcount() + +- undercount when HUGE_PMD_NR is subtracted, but some tail pages are + left with PageMlocked set and counted again (only possible before + ff6a6da60) + +- overcount when hpage_nr_pages() sees a normal page (split has already + finished), but the parallel split has meanwhile cleared PageMlocked from + additional tail pages + +This patch prevents both problems by extending the scope of lru_lock in +munlock_vma_page(). This is convenient because: + +- __split_huge_page_refcount() takes lru_lock for its whole operation + +- munlock_vma_page() typically takes lru_lock anyway for page isolation + +As this becomes a second function where page isolation is done with +lru_lock already held, factor this out to a new +__munlock_isolate_lru_page() function and clean up the code around. + +[akpm@linux-foundation.org: avoid a coding-style ugly] +Signed-off-by: Vlastimil Babka +Cc: Sasha Levin +Cc: Michel Lespinasse +Cc: Andrea Arcangeli +Cc: Rik van Riel +Cc: Mel Gorman +Cc: Hugh Dickins +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/mlock.c | 104 +++++++++++++++++++++++++++++++++++-------------------------- + 1 file changed, 60 insertions(+), 44 deletions(-) + +--- a/mm/mlock.c ++++ b/mm/mlock.c +@@ -91,6 +91,26 @@ void mlock_vma_page(struct page *page) + } + + /* ++ * Isolate a page from LRU with optional get_page() pin. ++ * Assumes lru_lock already held and page already pinned.
++ */ ++static bool __munlock_isolate_lru_page(struct page *page, bool getpage) ++{ ++ if (PageLRU(page)) { ++ struct lruvec *lruvec; ++ ++ lruvec = mem_cgroup_page_lruvec(page, page_zone(page)); ++ if (getpage) ++ get_page(page); ++ ClearPageLRU(page); ++ del_page_from_lru_list(page, lruvec, page_lru(page)); ++ return true; ++ } ++ ++ return false; ++} ++ ++/* + * Finish munlock after successful page isolation + * + * Page must be locked. This is a wrapper for try_to_munlock() +@@ -126,9 +146,9 @@ static void __munlock_isolated_page(stru + static void __munlock_isolation_failed(struct page *page) + { + if (PageUnevictable(page)) +- count_vm_event(UNEVICTABLE_PGSTRANDED); ++ __count_vm_event(UNEVICTABLE_PGSTRANDED); + else +- count_vm_event(UNEVICTABLE_PGMUNLOCKED); ++ __count_vm_event(UNEVICTABLE_PGMUNLOCKED); + } + + /** +@@ -152,28 +172,34 @@ static void __munlock_isolation_failed(s + unsigned int munlock_vma_page(struct page *page) + { + unsigned int nr_pages; ++ struct zone *zone = page_zone(page); + + BUG_ON(!PageLocked(page)); + +- if (TestClearPageMlocked(page)) { +- nr_pages = hpage_nr_pages(page); +- mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); +- if (!isolate_lru_page(page)) +- __munlock_isolated_page(page); +- else +- __munlock_isolation_failed(page); +- } else { +- nr_pages = hpage_nr_pages(page); +- } +- + /* +- * Regardless of the original PageMlocked flag, we determine nr_pages +- * after touching the flag. This leaves a possible race with a THP page +- * split, such that a whole THP page was munlocked, but nr_pages == 1. +- * Returning a smaller mask due to that is OK, the worst that can +- * happen is subsequent useless scanning of the former tail pages. +- * The NR_MLOCK accounting can however become broken. ++ * Serialize with any parallel __split_huge_page_refcount() which ++ * might otherwise copy PageMlocked to part of the tail pages before ++ * we clear it in the head page. It also stabilizes hpage_nr_pages(). + */ ++ spin_lock_irq(&zone->lru_lock); ++ ++ nr_pages = hpage_nr_pages(page); ++ if (!TestClearPageMlocked(page)) ++ goto unlock_out; ++ ++ __mod_zone_page_state(zone, NR_MLOCK, -nr_pages); ++ ++ if (__munlock_isolate_lru_page(page, true)) { ++ spin_unlock_irq(&zone->lru_lock); ++ __munlock_isolated_page(page); ++ goto out; ++ } ++ __munlock_isolation_failed(page); ++ ++unlock_out: ++ spin_unlock_irq(&zone->lru_lock); ++ ++out: + return nr_pages - 1; + } + +@@ -310,34 +336,24 @@ static void __munlock_pagevec(struct pag + struct page *page = pvec->pages[i]; + + if (TestClearPageMlocked(page)) { +- struct lruvec *lruvec; +- int lru; +- +- if (PageLRU(page)) { +- lruvec = mem_cgroup_page_lruvec(page, zone); +- lru = page_lru(page); +- /* +- * We already have pin from follow_page_mask() +- * so we can spare the get_page() here. +- */ +- ClearPageLRU(page); +- del_page_from_lru_list(page, lruvec, lru); +- } else { +- __munlock_isolation_failed(page); +- goto skip_munlock; +- } +- +- } else { +-skip_munlock: + /* +- * We won't be munlocking this page in the next phase +- * but we still need to release the follow_page_mask() +- * pin. We cannot do it under lru_lock however. If it's +- * the last pin, __page_cache_release would deadlock. ++ * We already have pin from follow_page_mask() ++ * so we can spare the get_page() here. 
+ */ +- pagevec_add(&pvec_putback, pvec->pages[i]); +- pvec->pages[i] = NULL; ++ if (__munlock_isolate_lru_page(page, false)) ++ continue; ++ else ++ __munlock_isolation_failed(page); + } ++ ++ /* ++ * We won't be munlocking this page in the next phase ++ * but we still need to release the follow_page_mask() ++ * pin. We cannot do it under lru_lock however. If it's ++ * the last pin, __page_cache_release() would deadlock. ++ */ ++ pagevec_add(&pvec_putback, pvec->pages[i]); ++ pvec->pages[i] = NULL; + } + delta_munlocked = -nr + pagevec_count(&pvec_putback); + __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); diff --git a/queue-3.13/mm-page-writeback.c-do-not-count-anon-pages-as-dirtyable-memory.patch b/queue-3.13/mm-page-writeback.c-do-not-count-anon-pages-as-dirtyable-memory.patch new file mode 100644 index 00000000000..8685c56cf8e --- /dev/null +++ b/queue-3.13/mm-page-writeback.c-do-not-count-anon-pages-as-dirtyable-memory.patch @@ -0,0 +1,125 @@ +From a1c3bfb2f67ef766de03f1f56bdfff9c8595ab14 Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 29 Jan 2014 14:05:41 -0800 +Subject: mm/page-writeback.c: do not count anon pages as dirtyable memory + +From: Johannes Weiner + +commit a1c3bfb2f67ef766de03f1f56bdfff9c8595ab14 upstream. + +The VM is currently heavily tuned to avoid swapping. Whether that is +good or bad is a separate discussion, but as long as the VM won't swap +to make room for dirty cache, we cannot consider anonymous pages when +calculating the amount of dirtyable memory, the baseline to which +dirty_background_ratio and dirty_ratio are applied. + +A simple workload that occupies a significant size (40+%, depending on +memory layout, storage speeds etc.) of memory with anon/tmpfs pages and +uses the remainder for a streaming writer demonstrates this problem. In +that case, the actual cache pages are a small fraction of what is +considered dirtyable overall, which results in a relatively large +portion of the cache pages being dirtied. As kswapd starts rotating +these, random tasks enter direct reclaim and stall on IO. + +Only consider free pages and file pages dirtyable. + +Signed-off-by: Johannes Weiner +Reported-by: Tejun Heo +Tested-by: Tejun Heo +Reviewed-by: Rik van Riel +Cc: Mel Gorman +Cc: Wu Fengguang +Reviewed-by: Michal Hocko +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/vmstat.h | 2 -- + mm/internal.h | 1 - + mm/page-writeback.c | 6 ++++-- + mm/vmscan.c | 23 +---------------------- + 4 files changed, 5 insertions(+), 27 deletions(-) + +--- a/include/linux/vmstat.h ++++ b/include/linux/vmstat.h +@@ -142,8 +142,6 @@ static inline unsigned long zone_page_st + return x; + } + +-extern unsigned long global_reclaimable_pages(void); +- + #ifdef CONFIG_NUMA + /* + * Determine the per node value of a stat item.
This function +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -85,7 +85,6 @@ extern unsigned long highest_memmap_pfn; + */ + extern int isolate_lru_page(struct page *page); + extern void putback_lru_page(struct page *page); +-extern unsigned long zone_reclaimable_pages(struct zone *zone); + extern bool zone_reclaimable(struct zone *zone); + + /* +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -205,7 +205,8 @@ static unsigned long zone_dirtyable_memo + nr_pages = zone_page_state(zone, NR_FREE_PAGES); + nr_pages -= min(nr_pages, zone->dirty_balance_reserve); + +- nr_pages += zone_reclaimable_pages(zone); ++ nr_pages += zone_page_state(zone, NR_INACTIVE_FILE); ++ nr_pages += zone_page_state(zone, NR_ACTIVE_FILE); + + return nr_pages; + } +@@ -258,7 +259,8 @@ static unsigned long global_dirtyable_me + x = global_page_state(NR_FREE_PAGES); + x -= min(x, dirty_balance_reserve); + +- x += global_reclaimable_pages(); ++ x += global_page_state(NR_INACTIVE_FILE); ++ x += global_page_state(NR_ACTIVE_FILE); + + if (!vm_highmem_is_dirtyable) + x -= highmem_dirtyable_memory(x); +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -147,7 +147,7 @@ static bool global_reclaim(struct scan_c + } + #endif + +-unsigned long zone_reclaimable_pages(struct zone *zone) ++static unsigned long zone_reclaimable_pages(struct zone *zone) + { + int nr; + +@@ -3297,27 +3297,6 @@ void wakeup_kswapd(struct zone *zone, in + wake_up_interruptible(&pgdat->kswapd_wait); + } + +-/* +- * The reclaimable count would be mostly accurate. +- * The less reclaimable pages may be +- * - mlocked pages, which will be moved to unevictable list when encountered +- * - mapped pages, which may require several travels to be reclaimed +- * - dirty pages, which is not "instantly" reclaimable +- */ +-unsigned long global_reclaimable_pages(void) +-{ +- int nr; +- +- nr = global_page_state(NR_ACTIVE_FILE) + +- global_page_state(NR_INACTIVE_FILE); +- +- if (get_nr_swap_pages() > 0) +- nr += global_page_state(NR_ACTIVE_ANON) + +- global_page_state(NR_INACTIVE_ANON); +- +- return nr; +-} +- + #ifdef CONFIG_HIBERNATION + /* + * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of diff --git a/queue-3.13/mm-page-writeback.c-fix-dirty_balance_reserve-subtraction-from-dirtyable-memory.patch b/queue-3.13/mm-page-writeback.c-fix-dirty_balance_reserve-subtraction-from-dirtyable-memory.patch new file mode 100644 index 00000000000..7cd2e5055f1 --- /dev/null +++ b/queue-3.13/mm-page-writeback.c-fix-dirty_balance_reserve-subtraction-from-dirtyable-memory.patch @@ -0,0 +1,147 @@ +From a804552b9a15c931cfc2a92a2e0aed1add8b580a Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 29 Jan 2014 14:05:39 -0800 +Subject: mm/page-writeback.c: fix dirty_balance_reserve subtraction from dirtyable memory + +From: Johannes Weiner + +commit a804552b9a15c931cfc2a92a2e0aed1add8b580a upstream. + +Tejun reported stuttering and latency spikes on a system where random +tasks would enter direct reclaim and get stuck on dirty pages. Around +50% of memory was occupied by tmpfs backed by an SSD, and another disk +(rotating) was reading and writing at max speed to shrink a partition. + +: The problem was pretty ridiculous. It's a 8gig machine w/ one ssd and 10k +: rpm harddrive and I could reliably reproduce constant stuttering every +: several seconds for as long as buffered IO was going on on the hard drive +: either with tmpfs occupying somewhere above 4gig or a test program which +: allocates about the same amount of anon memory. 
Although swap usage was +: zero, turning off swap also made the problem go away too. +: +: The trigger conditions seem quite plausible - high anon memory usage w/ +: heavy buffered IO and swap configured - and it's highly likely that this +: is happening in the wild too. (this can happen with copying large files +: to usb sticks too, right?) + +This patch (of 2): + +The dirty_balance_reserve is an approximation of the fraction of free +pages that the page allocator does not make available for page cache +allocations. As a result, it has to be taken into account when +calculating the amount of "dirtyable memory", the baseline to which +dirty_background_ratio and dirty_ratio are applied. + +However, currently the reserve is subtracted from the sum of free and +reclaimable pages, which is non-sensical and leads to erroneous results +when the system is dominated by unreclaimable pages and the +dirty_balance_reserve is bigger than free+reclaimable. In that case, at +least the already allocated cache should be considered dirtyable. + +Fix the calculation by subtracting the reserve from the amount of free +pages, then adding the reclaimable pages on top. + +[akpm@linux-foundation.org: fix CONFIG_HIGHMEM build] +Signed-off-by: Johannes Weiner +Reported-by: Tejun Heo +Tested-by: Tejun Heo +Reviewed-by: Rik van Riel +Cc: Mel Gorman +Cc: Wu Fengguang +Reviewed-by: Michal Hocko +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/page-writeback.c | 55 ++++++++++++++++++++++------------------------------ + 1 file changed, 24 insertions(+), 31 deletions(-) + +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -191,6 +191,25 @@ static unsigned long writeout_period_tim + * global dirtyable memory first. + */ + ++/** ++ * zone_dirtyable_memory - number of dirtyable pages in a zone ++ * @zone: the zone ++ * ++ * Returns the zone's number of pages potentially available for dirty ++ * page cache. This is the base value for the per-zone dirty limits. ++ */ ++static unsigned long zone_dirtyable_memory(struct zone *zone) ++{ ++ unsigned long nr_pages; ++ ++ nr_pages = zone_page_state(zone, NR_FREE_PAGES); ++ nr_pages -= min(nr_pages, zone->dirty_balance_reserve); ++ ++ nr_pages += zone_reclaimable_pages(zone); ++ ++ return nr_pages; ++} ++ + static unsigned long highmem_dirtyable_memory(unsigned long total) + { + #ifdef CONFIG_HIGHMEM +@@ -198,11 +217,9 @@ static unsigned long highmem_dirtyable_m + unsigned long x = 0; + + for_each_node_state(node, N_HIGH_MEMORY) { +- struct zone *z = +- &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; ++ struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; + +- x += zone_page_state(z, NR_FREE_PAGES) + +- zone_reclaimable_pages(z) - z->dirty_balance_reserve; ++ x += zone_dirtyable_memory(z); + } + /* + * Unreclaimable memory (kernel memory or anonymous memory +@@ -238,9 +255,11 @@ static unsigned long global_dirtyable_me + { + unsigned long x; + +- x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); ++ x = global_page_state(NR_FREE_PAGES); + x -= min(x, dirty_balance_reserve); + ++ x += global_reclaimable_pages(); ++ + if (!vm_highmem_is_dirtyable) + x -= highmem_dirtyable_memory(x); + +@@ -289,32 +308,6 @@ void global_dirty_limits(unsigned long * + } + + /** +- * zone_dirtyable_memory - number of dirtyable pages in a zone +- * @zone: the zone +- * +- * Returns the zone's number of pages potentially available for dirty +- * page cache. This is the base value for the per-zone dirty limits. 
+- */ +-static unsigned long zone_dirtyable_memory(struct zone *zone) +-{ +- /* +- * The effective global number of dirtyable pages may exclude +- * highmem as a big-picture measure to keep the ratio between +- * dirty memory and lowmem reasonable. +- * +- * But this function is purely about the individual zone and a +- * highmem zone can hold its share of dirty pages, so we don't +- * care about vm_highmem_is_dirtyable here. +- */ +- unsigned long nr_pages = zone_page_state(zone, NR_FREE_PAGES) + +- zone_reclaimable_pages(zone); +- +- /* don't allow this to underflow */ +- nr_pages -= min(nr_pages, zone->dirty_balance_reserve); +- return nr_pages; +-} +- +-/** + * zone_dirty_limit - maximum number of dirty pages allowed in a zone + * @zone: the zone + * diff --git a/queue-3.13/mmc-atmel-mci-fix-timeout-errors-in-sdio-mode-when-using-dma.patch b/queue-3.13/mmc-atmel-mci-fix-timeout-errors-in-sdio-mode-when-using-dma.patch new file mode 100644 index 00000000000..c66e08a984d --- /dev/null +++ b/queue-3.13/mmc-atmel-mci-fix-timeout-errors-in-sdio-mode-when-using-dma.patch @@ -0,0 +1,50 @@ +From 66b512eda74d59b17eac04c4da1b38d82059e6c9 Mon Sep 17 00:00:00 2001 +From: Ludovic Desroches +Date: Wed, 20 Nov 2013 16:01:11 +0100 +Subject: mmc: atmel-mci: fix timeout errors in SDIO mode when using DMA + +From: Ludovic Desroches + +commit 66b512eda74d59b17eac04c4da1b38d82059e6c9 upstream. + +With some SDIO devices, timeout errors can happen when reading data. +To solve this issue, the DMA transfer has to be activated before sending +the command to the device. This order is incorrect in PDC mode. So we +have to take care if we are using DMA or PDC to know when to send the +MMC command. + +Signed-off-by: Ludovic Desroches +Acked-by: Nicolas Ferre +Signed-off-by: Chris Ball +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/mmc/host/atmel-mci.c | 13 ++++++++++++- + 1 file changed, 12 insertions(+), 1 deletion(-) + +--- a/drivers/mmc/host/atmel-mci.c ++++ b/drivers/mmc/host/atmel-mci.c +@@ -1192,11 +1192,22 @@ static void atmci_start_request(struct a + iflags |= ATMCI_CMDRDY; + cmd = mrq->cmd; + cmdflags = atmci_prepare_command(slot->mmc, cmd); +- atmci_send_command(host, cmd, cmdflags); ++ ++ /* ++ * DMA transfer should be started before sending the command to avoid ++ * unexpected errors especially for read operations in SDIO mode. ++ * Unfortunately, in PDC mode, command has to be sent before starting ++ * the transfer. ++ */ ++ if (host->submit_data != &atmci_submit_data_dma) ++ atmci_send_command(host, cmd, cmdflags); + + if (data) + host->submit_data(host, data); + ++ if (host->submit_data == &atmci_submit_data_dma) ++ atmci_send_command(host, cmd, cmdflags); ++ + if (mrq->stop) { + host->stop_cmdr = atmci_prepare_command(slot->mmc, mrq->stop); + host->stop_cmdr |= ATMCI_CMDR_STOP_XFER; diff --git a/queue-3.13/mmc-core-sd-implement-proper-support-for-sd3.0-au-sizes.patch b/queue-3.13/mmc-core-sd-implement-proper-support-for-sd3.0-au-sizes.patch new file mode 100644 index 00000000000..a25ccca0fe2 --- /dev/null +++ b/queue-3.13/mmc-core-sd-implement-proper-support-for-sd3.0-au-sizes.patch @@ -0,0 +1,98 @@ +From 9288cac05405a7da406097a44721aa4004609b4d Mon Sep 17 00:00:00 2001 +From: Wolfram Sang +Date: Tue, 26 Nov 2013 02:16:25 +0100 +Subject: mmc: core: sd: implement proper support for sd3.0 au sizes + +From: Wolfram Sang + +commit 9288cac05405a7da406097a44721aa4004609b4d upstream. + +This reverts and updates commit 77776fd0a4cc541b9 ("mmc: sd: fix the +maximum au_size for SD3.0"). 
The au_size for SD3.0 cannot be achieved +by a simple bit shift, so this needs to be implemented differently. +Also, don't print the warning in case of 0 since 'not defined' is +different from 'invalid'. + +Signed-off-by: Wolfram Sang +Acked-by: Jaehoon Chung +Reviewed-by: H Hartley Sweeten +Signed-off-by: Chris Ball +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/mmc/core/sd.c | 37 ++++++++++++++++++++++--------------- + 1 file changed, 22 insertions(+), 15 deletions(-) + +--- a/drivers/mmc/core/sd.c ++++ b/drivers/mmc/core/sd.c +@@ -11,6 +11,7 @@ + */ + + #include ++#include + #include + #include + #include +@@ -45,6 +46,13 @@ static const unsigned int tacc_mant[] = + 35, 40, 45, 50, 55, 60, 70, 80, + }; + ++static const unsigned int sd_au_size[] = { ++ 0, SZ_16K / 512, SZ_32K / 512, SZ_64K / 512, ++ SZ_128K / 512, SZ_256K / 512, SZ_512K / 512, SZ_1M / 512, ++ SZ_2M / 512, SZ_4M / 512, SZ_8M / 512, (SZ_8M + SZ_4M) / 512, ++ SZ_16M / 512, (SZ_16M + SZ_8M) / 512, SZ_32M / 512, SZ_64M / 512, ++}; ++ + #define UNSTUFF_BITS(resp,start,size) \ + ({ \ + const int __size = size; \ +@@ -216,7 +224,7 @@ static int mmc_decode_scr(struct mmc_car + static int mmc_read_ssr(struct mmc_card *card) + { + unsigned int au, es, et, eo; +- int err, i, max_au; ++ int err, i; + u32 *ssr; + + if (!(card->csd.cmdclass & CCC_APP_SPEC)) { +@@ -240,26 +248,25 @@ static int mmc_read_ssr(struct mmc_card + for (i = 0; i < 16; i++) + ssr[i] = be32_to_cpu(ssr[i]); + +- /* SD3.0 increases max AU size to 64MB (0xF) from 4MB (0x9) */ +- max_au = card->scr.sda_spec3 ? 0xF : 0x9; +- + /* + * UNSTUFF_BITS only works with four u32s so we have to offset the + * bitfield positions accordingly. + */ + au = UNSTUFF_BITS(ssr, 428 - 384, 4); +- if (au > 0 && au <= max_au) { +- card->ssr.au = 1 << (au + 4); +- es = UNSTUFF_BITS(ssr, 408 - 384, 16); +- et = UNSTUFF_BITS(ssr, 402 - 384, 6); +- eo = UNSTUFF_BITS(ssr, 400 - 384, 2); +- if (es && et) { +- card->ssr.erase_timeout = (et * 1000) / es; +- card->ssr.erase_offset = eo * 1000; ++ if (au) { ++ if (au <= 9 || card->scr.sda_spec3) { ++ card->ssr.au = sd_au_size[au]; ++ es = UNSTUFF_BITS(ssr, 408 - 384, 16); ++ et = UNSTUFF_BITS(ssr, 402 - 384, 6); ++ if (es && et) { ++ eo = UNSTUFF_BITS(ssr, 400 - 384, 2); ++ card->ssr.erase_timeout = (et * 1000) / es; ++ card->ssr.erase_offset = eo * 1000; ++ } ++ } else { ++ pr_warning("%s: SD Status: Invalid Allocation Unit size.\n", ++ mmc_hostname(card->host)); + } +- } else { +- pr_warning("%s: SD Status: Invalid Allocation Unit " +- "size.\n", mmc_hostname(card->host)); + } + out: + kfree(ssr); diff --git a/queue-3.13/mmc-fix-host-release-issue-after-discard-operation.patch b/queue-3.13/mmc-fix-host-release-issue-after-discard-operation.patch new file mode 100644 index 00000000000..22eb886387f --- /dev/null +++ b/queue-3.13/mmc-fix-host-release-issue-after-discard-operation.patch @@ -0,0 +1,66 @@ +From f662ae48ae67dfd42739e65750274fe8de46240a Mon Sep 17 00:00:00 2001 +From: Ray Jui +Date: Sat, 26 Oct 2013 11:03:44 -0700 +Subject: mmc: fix host release issue after discard operation + +From: Ray Jui + +commit f662ae48ae67dfd42739e65750274fe8de46240a upstream. + +Under function mmc_blk_issue_rq, after an MMC discard operation, +the MMC request data structure may be freed in memory. Later in +the same function, the check of req->cmd_flags & MMC_REQ_SPECIAL_MASK +is dangerous and invalid. It causes the MMC host not to be released +when it should. 
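The bug has the classic use-after-free shape: a field of an object is tested after an earlier call may already have freed that object. The remedy is to snapshot the fields you will need before the call that can free it. A minimal stand-alone C sketch of that pattern (illustrative only; the names are loosely borrowed from the driver, and free() stands in for block-layer completion of the request):

----------
#include <stdio.h>
#include <stdlib.h>

struct request {
	unsigned int cmd_flags;
};

#define REQ_DISCARD (1u << 7)

/* Stands in for mmc_blk_issue_discard_rq(): the block layer may
 * complete and free the request during the discard. */
static void issue_discard(struct request *req)
{
	free(req);
}

static void issue_rq(struct request *req)
{
	/* Snapshot the flags before req can go away. */
	unsigned int cmd_flags = req ? req->cmd_flags : 0;

	if (cmd_flags & REQ_DISCARD)
		issue_discard(req);

	/* Post-processing consults the cached copy only; reading
	 * req->cmd_flags here would be a use-after-free. */
	if (cmd_flags & REQ_DISCARD)
		printf("release host after special request\n");
}

int main(void)
{
	struct request *req = malloc(sizeof(*req));

	if (!req)
		return 1;
	req->cmd_flags = REQ_DISCARD;
	issue_rq(req);
	return 0;
}
----------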
+ +This patch fixes the issue by marking the special request down before +the discard/flush operation. + +Reported by: Harold (SoonYeal) Yang +Signed-off-by: Ray Jui +Reviewed-by: Seungwon Jeon +Acked-by: Seungwon Jeon +Signed-off-by: Chris Ball +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/mmc/card/block.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +--- a/drivers/mmc/card/block.c ++++ b/drivers/mmc/card/block.c +@@ -1959,6 +1959,7 @@ static int mmc_blk_issue_rq(struct mmc_q + struct mmc_card *card = md->queue.card; + struct mmc_host *host = card->host; + unsigned long flags; ++ unsigned int cmd_flags = req ? req->cmd_flags : 0; + + if (req && !mq->mqrq_prev->req) + /* claim host only for the first request */ +@@ -1974,7 +1975,7 @@ static int mmc_blk_issue_rq(struct mmc_q + } + + mq->flags &= ~MMC_QUEUE_NEW_REQUEST; +- if (req && req->cmd_flags & REQ_DISCARD) { ++ if (cmd_flags & REQ_DISCARD) { + /* complete ongoing async transfer before issuing discard */ + if (card->host->areq) + mmc_blk_issue_rw_rq(mq, NULL); +@@ -1983,7 +1984,7 @@ static int mmc_blk_issue_rq(struct mmc_q + ret = mmc_blk_issue_secdiscard_rq(mq, req); + else + ret = mmc_blk_issue_discard_rq(mq, req); +- } else if (req && req->cmd_flags & REQ_FLUSH) { ++ } else if (cmd_flags & REQ_FLUSH) { + /* complete ongoing async transfer before issuing flush */ + if (card->host->areq) + mmc_blk_issue_rw_rq(mq, NULL); +@@ -1999,7 +2000,7 @@ static int mmc_blk_issue_rq(struct mmc_q + + out: + if ((!req && !(mq->flags & MMC_QUEUE_NEW_REQUEST)) || +- (req && (req->cmd_flags & MMC_REQ_SPECIAL_MASK))) ++ (cmd_flags & MMC_REQ_SPECIAL_MASK)) + /* + * Release host when there are no more requests + * and after special request(discard, flush) is done. diff --git a/queue-3.13/revert-eisa-initialize-device-before-its-resources.patch b/queue-3.13/revert-eisa-initialize-device-before-its-resources.patch new file mode 100644 index 00000000000..6de968b6a61 --- /dev/null +++ b/queue-3.13/revert-eisa-initialize-device-before-its-resources.patch @@ -0,0 +1,96 @@ +From 765ee51f9a3f652959b4c7297d198a28e37952b4 Mon Sep 17 00:00:00 2001 +From: Bjorn Helgaas +Date: Fri, 17 Jan 2014 14:57:29 -0700 +Subject: Revert "EISA: Initialize device before its resources" + +From: Bjorn Helgaas + +commit 765ee51f9a3f652959b4c7297d198a28e37952b4 upstream. + +This reverts commit 26abfeed4341872364386c6a52b9acef8c81a81a. + +In the eisa_probe() force_probe path, if we were unable to request slot +resources (e.g., [io 0x800-0x8ff]), we skipped the slot with "Cannot +allocate resource for EISA slot %d" before reading the EISA signature in +eisa_init_device(). + +Commit 26abfeed4341 moved eisa_init_device() earlier, so we tried to read +the EISA signature before requesting the slot resources, and this caused +hangs during boot. 
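The ordering rule being restored — claim the slot's I/O resources first, only then touch the hardware to read the signature, and unwind in reverse order on failure — can be sketched in a few lines of stand-alone C. This is hypothetical and heavily condensed; the stubs merely stand in for eisa_request_resources() and eisa_init_device(), and the error codes mirror the patch:

----------
#include <stdio.h>

#define EBUSY  16
#define ENODEV 19

/* Reserve the slot's I/O range; must succeed before we may touch
 * the hardware at all. */
static int request_resources(int slot)
{
	(void)slot;
	return 0;
}

static void release_resources(int slot)
{
	(void)slot;
}

/* Reads the EISA signature -- only safe once the resources above
 * are ours. */
static int init_device(int slot)
{
	(void)slot;
	return 0;
}

static int probe_slot(int slot)
{
	if (request_resources(slot)) {
		fprintf(stderr, "cannot allocate resource for slot %d\n", slot);
		return -EBUSY;
	}
	if (init_device(slot)) {
		release_resources(slot);	/* unwind in reverse order */
		return -ENODEV;
	}
	return 0;
}

int main(void)
{
	return probe_slot(0) ? 1 : 0;
}
----------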
+ +Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1251816 +Signed-off-by: Bjorn Helgaas +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/eisa/eisa-bus.c | 26 +++++++++++++++----------- + 1 file changed, 15 insertions(+), 11 deletions(-) + +--- a/drivers/eisa/eisa-bus.c ++++ b/drivers/eisa/eisa-bus.c +@@ -275,11 +275,13 @@ static int __init eisa_request_resources + } + + if (slot) { ++ edev->res[i].name = NULL; + edev->res[i].start = SLOT_ADDRESS(root, slot) + + (i * 0x400); + edev->res[i].end = edev->res[i].start + 0xff; + edev->res[i].flags = IORESOURCE_IO; + } else { ++ edev->res[i].name = NULL; + edev->res[i].start = SLOT_ADDRESS(root, slot) + + EISA_VENDOR_ID_OFFSET; + edev->res[i].end = edev->res[i].start + 3; +@@ -326,19 +328,20 @@ static int __init eisa_probe(struct eisa + return -ENOMEM; + } + +- if (eisa_init_device(root, edev, 0)) { ++ if (eisa_request_resources(root, edev, 0)) { ++ dev_warn(root->dev, ++ "EISA: Cannot allocate resource for mainboard\n"); + kfree(edev); + if (!root->force_probe) +- return -ENODEV; ++ return -EBUSY; + goto force_probe; + } + +- if (eisa_request_resources(root, edev, 0)) { +- dev_warn(root->dev, +- "EISA: Cannot allocate resource for mainboard\n"); ++ if (eisa_init_device(root, edev, 0)) { ++ eisa_release_resources(edev); + kfree(edev); + if (!root->force_probe) +- return -EBUSY; ++ return -ENODEV; + goto force_probe; + } + +@@ -361,11 +364,6 @@ static int __init eisa_probe(struct eisa + continue; + } + +- if (eisa_init_device(root, edev, i)) { +- kfree(edev); +- continue; +- } +- + if (eisa_request_resources(root, edev, i)) { + dev_warn(root->dev, + "Cannot allocate resource for EISA slot %d\n", +@@ -373,6 +371,12 @@ static int __init eisa_probe(struct eisa + kfree(edev); + continue; + } ++ ++ if (eisa_init_device(root, edev, i)) { ++ eisa_release_resources(edev); ++ kfree(edev); ++ continue; ++ } + + if (edev->state == (EISA_CONFIG_ENABLED | EISA_CONFIG_FORCED)) + enabled_str = " (forced enabled)"; diff --git a/queue-3.13/selinux-fix-memory-leak-upon-loading-policy.patch b/queue-3.13/selinux-fix-memory-leak-upon-loading-policy.patch new file mode 100644 index 00000000000..373ee438ed0 --- /dev/null +++ b/queue-3.13/selinux-fix-memory-leak-upon-loading-policy.patch @@ -0,0 +1,79 @@ +From 8ed814602876bec9bad2649ca17f34b499357a1c Mon Sep 17 00:00:00 2001 +From: Tetsuo Handa +Date: Mon, 6 Jan 2014 21:28:15 +0900 +Subject: SELinux: Fix memory leak upon loading policy + +From: Tetsuo Handa + +commit 8ed814602876bec9bad2649ca17f34b499357a1c upstream. + +Hello. + +I got below leak with linux-3.10.0-54.0.1.el7.x86_64 . + +[ 681.903890] kmemleak: 5538 new suspected memory leaks (see /sys/kernel/debug/kmemleak) + +Below is a patch, but I don't know whether we need special handing for undoing +ebitmap_set_bit() call. +---------- +>>From fe97527a90fe95e2239dfbaa7558f0ed559c0992 Mon Sep 17 00:00:00 2001 +From: Tetsuo Handa +Date: Mon, 6 Jan 2014 16:30:21 +0900 +Subject: SELinux: Fix memory leak upon loading policy + +Commit 2463c26d "SELinux: put name based create rules in a hashtable" did not +check return value from hashtab_insert() in filename_trans_read(). It leaks +memory if hashtab_insert() returns error. + + unreferenced object 0xffff88005c9160d0 (size 8): + comm "systemd", pid 1, jiffies 4294688674 (age 235.265s) + hex dump (first 8 bytes): + 57 0b 00 00 6b 6b 6b a5 W...kkk. 
+ backtrace: + [] kmemleak_alloc+0x4e/0xb0 + [] kmem_cache_alloc_trace+0x12e/0x360 + [] policydb_read+0xd1d/0xf70 + [] security_load_policy+0x6c/0x500 + [] sel_write_load+0xac/0x750 + [] vfs_write+0xc0/0x1f0 + [] SyS_write+0x4c/0xa0 + [] system_call_fastpath+0x16/0x1b + [] 0xffffffffffffffff + +However, we should not return EEXIST error to the caller, or the systemd will +show below message and the boot sequence freezes. + + systemd[1]: Failed to load SELinux policy. Freezing. + +Signed-off-by: Tetsuo Handa +Acked-by: Eric Paris +Signed-off-by: Paul Moore +Signed-off-by: Greg Kroah-Hartman + +--- + security/selinux/ss/policydb.c | 14 +++++++++++++- + 1 file changed, 13 insertions(+), 1 deletion(-) + +--- a/security/selinux/ss/policydb.c ++++ b/security/selinux/ss/policydb.c +@@ -1941,7 +1941,19 @@ static int filename_trans_read(struct po + if (rc) + goto out; + +- hashtab_insert(p->filename_trans, ft, otype); ++ rc = hashtab_insert(p->filename_trans, ft, otype); ++ if (rc) { ++ /* ++ * Do not return -EEXIST to the caller, or the system ++ * will not boot. ++ */ ++ if (rc != -EEXIST) ++ goto out; ++ /* But free memory to avoid memory leak. */ ++ kfree(ft); ++ kfree(name); ++ kfree(otype); ++ } + } + hash_eval(p->filename_trans, "filenametr"); + return 0; diff --git a/queue-3.13/series b/queue-3.13/series index 69bf3fdb414..d57b593b845 100644 --- a/queue-3.13/series +++ b/queue-3.13/series @@ -1 +1,25 @@ mei-mei_hbm_dispatch-returns-void.patch +selinux-fix-memory-leak-upon-loading-policy.patch +ftrace-have-function-graph-only-trace-based-on-global_ops-filters.patch +ftrace-fix-synchronization-location-disabling-and-freeing-ftrace_ops.patch +tracing-have-trace-buffer-point-back-to-trace_array.patch +tracing-check-if-tracing-is-enabled-in-trace_puts.patch +arch-sh-kernel-kgdb.c-add-missing-include-linux-sched.h.patch +intel-iommu-fix-off-by-one-in-pagetable-freeing.patch +revert-eisa-initialize-device-before-its-resources.patch +fuse-fix-pipe_buf_operations.patch +audit-reset-audit-backlog-wait-time-after-error-recovery.patch +audit-correct-a-type-mismatch-in-audit_syscall_exit.patch +xen-pvhvm-if-xen_platform_pci-0-is-set-don-t-blow-up-v4.patch +mm-memory-failure.c-shift-page-lock-from-head-page-to-tail-page-after-thp-split.patch +mm-memcg-iteration-skip-memcgs-not-yet-fully-initialized.patch +mm-page-writeback.c-fix-dirty_balance_reserve-subtraction-from-dirtyable-memory.patch +mm-page-writeback.c-do-not-count-anon-pages-as-dirtyable-memory.patch +mm-munlock-fix-potential-race-with-thp-page-split.patch +memcg-fix-endless-loop-caused-by-mem_cgroup_iter.patch +memcg-fix-css-reference-leak-and-endless-loop-in-mem_cgroup_iter.patch +mm-ignore-vm_softdirty-on-vma-merging.patch +mm-don-t-lose-the-soft_dirty-flag-on-mprotect.patch +mmc-fix-host-release-issue-after-discard-operation.patch +mmc-atmel-mci-fix-timeout-errors-in-sdio-mode-when-using-dma.patch +mmc-core-sd-implement-proper-support-for-sd3.0-au-sizes.patch diff --git a/queue-3.13/tracing-check-if-tracing-is-enabled-in-trace_puts.patch b/queue-3.13/tracing-check-if-tracing-is-enabled-in-trace_puts.patch new file mode 100644 index 00000000000..c6dc9752d90 --- /dev/null +++ b/queue-3.13/tracing-check-if-tracing-is-enabled-in-trace_puts.patch @@ -0,0 +1,44 @@ +From 3132e107d608f8753240d82d61303c500fd515b4 Mon Sep 17 00:00:00 2001 +From: "Steven Rostedt (Red Hat)" +Date: Thu, 23 Jan 2014 12:27:59 -0500 +Subject: tracing: Check if tracing is enabled in trace_puts() + +From: "Steven Rostedt (Red Hat)" + +commit 
3132e107d608f8753240d82d61303c500fd515b4 upstream. + +If trace_puts() is used very early in boot up, it can crash the machine +if it is called before the ring buffer is allocated. If a trace_printk() +is used with no arguments, then it will be converted into a trace_puts() +and suffer the same fate. + +Fixes: 09ae72348ecc "tracing: Add trace_puts() for even faster trace_printk() tracing" +Signed-off-by: Steven Rostedt +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/trace/trace.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -455,6 +455,9 @@ int __trace_puts(unsigned long ip, const + unsigned long irq_flags; + int alloc; + ++ if (unlikely(tracing_selftest_running || tracing_disabled)) ++ return 0; ++ + alloc = sizeof(*entry) + size + 2; /* possible \n added */ + + local_save_flags(irq_flags); +@@ -495,6 +498,9 @@ int __trace_bputs(unsigned long ip, cons + unsigned long irq_flags; + int size = sizeof(struct bputs_entry); + ++ if (unlikely(tracing_selftest_running || tracing_disabled)) ++ return 0; ++ + local_save_flags(irq_flags); + buffer = global_trace.trace_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, diff --git a/queue-3.13/tracing-have-trace-buffer-point-back-to-trace_array.patch b/queue-3.13/tracing-have-trace-buffer-point-back-to-trace_array.patch new file mode 100644 index 00000000000..a88730c2732 --- /dev/null +++ b/queue-3.13/tracing-have-trace-buffer-point-back-to-trace_array.patch @@ -0,0 +1,36 @@ +From dced341b2d4f06668efaab33f88de5d287c0f45b Mon Sep 17 00:00:00 2001 +From: "Steven Rostedt (Red Hat)" +Date: Tue, 14 Jan 2014 10:19:46 -0500 +Subject: tracing: Have trace buffer point back to trace_array + +From: "Steven Rostedt (Red Hat)" + +commit dced341b2d4f06668efaab33f88de5d287c0f45b upstream. + +The trace buffer has a descriptor pointer that goes back to the trace +array. But it was never assigned. Luckily, nothing uses it (yet), but +it will in the future. + +Although nothing currently uses this, if any of the new features get +backported to older kernels, and because this is such a simple change, +I'm marking it for stable too. + +Fixes: 12883efb670c "tracing: Consolidate max_tr into main trace_array structure" +Signed-off-by: Steven Rostedt +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/trace/trace.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -5883,6 +5883,8 @@ allocate_trace_buffer(struct trace_array + + rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; + ++ buf->tr = tr; ++ + buf->buffer = ring_buffer_alloc(size, rb_flags); + if (!buf->buffer) + return -ENOMEM; diff --git a/queue-3.13/xen-pvhvm-if-xen_platform_pci-0-is-set-don-t-blow-up-v4.patch b/queue-3.13/xen-pvhvm-if-xen_platform_pci-0-is-set-don-t-blow-up-v4.patch new file mode 100644 index 00000000000..4d9e958880f --- /dev/null +++ b/queue-3.13/xen-pvhvm-if-xen_platform_pci-0-is-set-don-t-blow-up-v4.patch @@ -0,0 +1,334 @@ +From 51c71a3bbaca868043cc45b3ad3786dd48a90235 Mon Sep 17 00:00:00 2001 +From: Konrad Rzeszutek Wilk +Date: Tue, 26 Nov 2013 15:05:40 -0500 +Subject: xen/pvhvm: If xen_platform_pci=0 is set don't blow up (v4). + +From: Konrad Rzeszutek Wilk + +commit 51c71a3bbaca868043cc45b3ad3786dd48a90235 upstream. + +The user has the option of disabling the platform driver: +00:02.0 Unassigned class [ff80]: XenSource, Inc. 
Xen Platform Device (rev 01) + +which is used to unplug the emulated drivers (IDE, Realtek 8169, etc) +and allow the PV drivers to take over. If the user wishes +to disable that they can set: + + xen_platform_pci=0 + (in the guest config file) + +or + xen_emul_unplug=never + (on the Linux command line) + +except it does not work properly. The PV drivers still try to +load and since the Xen platform driver is not run - and it +has not initialized the grant tables, most of the PV drivers +stumble upon: + +input: Xen Virtual Keyboard as /devices/virtual/input/input5 +input: Xen Virtual Pointer as /devices/virtual/input/input6M +------------[ cut here ]------------ +kernel BUG at /home/konrad/ssd/konrad/linux/drivers/xen/grant-table.c:1206! +invalid opcode: 0000 [#1] SMP +Modules linked in: xen_kbdfront(+) xenfs xen_privcmd +CPU: 6 PID: 1389 Comm: modprobe Not tainted 3.13.0-rc1upstream-00021-ga6c892b-dirty #1 +Hardware name: Xen HVM domU, BIOS 4.4-unstable 11/26/2013 +RIP: 0010:[] [] get_free_entries+0x2e0/0x300 +Call Trace: + [] ? evdev_connect+0x1e3/0x240 + [] gnttab_grant_foreign_access+0x2e/0x70 + [] xenkbd_connect_backend+0x41/0x290 [xen_kbdfront] + [] xenkbd_probe+0x2f2/0x324 [xen_kbdfront] + [] xenbus_dev_probe+0x77/0x130 + [] xenbus_frontend_dev_probe+0x47/0x50 + [] driver_probe_device+0x89/0x230 + [] __driver_attach+0x9b/0xa0 + [] ? driver_probe_device+0x230/0x230 + [] ? driver_probe_device+0x230/0x230 + [] bus_for_each_dev+0x8c/0xb0 + [] driver_attach+0x19/0x20 + [] bus_add_driver+0x1a0/0x220 + [] driver_register+0x5f/0xf0 + [] xenbus_register_driver_common+0x15/0x20 + [] xenbus_register_frontend+0x23/0x40 + [] ? 0xffffffffa0014fff + [] xenkbd_init+0x2b/0x1000 [xen_kbdfront] + [] do_one_initcall+0x49/0x170 + +.. snip.. + +which is hardly nice. This patch fixes this by having each +PV driver check for: + - if running in PV, then it is fine to execute (as that is their + native environment). + - if running in HVM, check if user wanted 'xen_emul_unplug=never', + in which case bail out and don't load any PV drivers. + - if running in HVM, and if PCI device 5853:0001 (xen_platform_pci) + does not exist, then bail out and not load PV drivers. + - (v2) if running in HVM, and if the user wanted 'xen_emul_unplug=ide-disks', + then bail out for all PV devices _except_ the block one. + Ditto for the network one ('nics'). + - (v2) if running in HVM, and if the user wanted 'xen_emul_unplug=unnecessary' + then load block PV driver, and also setup the legacy IDE paths. + In (v3) make it actually load PV drivers. 
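Condensed to its decision logic, the new helper amounts to the stand-alone sketch below. This is hypothetical: the globals and flag values are invented, whereas the real code keys off xen_domain()/xen_pv_domain() and the xen_platform_pci_unplug flags (XEN_UNPLUG_NEVER, XEN_UNPLUG_ALL, ...):

----------
#include <stdbool.h>
#include <stdio.h>

#define UNPLUG_NEVER (1u << 0)
#define UNPLUG_ALL   (1u << 1)

static bool pv_domain;			 /* fully paravirtualized guest? */
static unsigned int unplug = UNPLUG_ALL; /* what the platform driver unplugged */

static bool has_pv_devices(void)
{
	if (pv_domain)
		return true;		/* PV guests always have PV devices */
	if (unplug & UNPLUG_NEVER)
		return false;		/* user asked to keep emulated devices */
	return unplug & UNPLUG_ALL;	/* platform driver ran and unplugged */
}

/* Every frontend init gets the same guard, so a guest booted with
 * xen_platform_pci=0 bails out long before it can trip over the
 * uninitialized grant tables. */
static int frontend_init(void)
{
	if (!has_pv_devices())
		return -19;		/* -ENODEV */
	printf("registering PV frontend\n");
	return 0;
}

int main(void)
{
	return frontend_init() ? 1 : 0;
}
----------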
+ +Reported-by: Sander Eikelenboom +Reported-and-Tested-by: Fabio Fantoni +Signed-off-by: Konrad Rzeszutek Wilk +[v2: Add extra logic to handle the myrid ways 'xen_emul_unplug' +can be used per Ian and Stefano suggestion] +[v3: Make the unnecessary case work properly] +[v4: s/disks/ide-disks/ spotted by Fabio] +Reviewed-by: Stefano Stabellini +Acked-by: Bjorn Helgaas [for PCI parts] +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/xen/platform-pci-unplug.c | 74 +++++++++++++++++++++++++++++ + drivers/block/xen-blkfront.c | 4 - + drivers/char/tpm/xen-tpmfront.c | 4 + + drivers/input/misc/xen-kbdfront.c | 4 + + drivers/net/xen-netfront.c | 2 + drivers/pci/xen-pcifront.c | 4 + + drivers/video/xen-fbfront.c | 4 + + drivers/xen/xenbus/xenbus_probe_frontend.c | 2 + include/xen/platform_pci.h | 23 +++++++++ + 9 files changed, 117 insertions(+), 4 deletions(-) + +--- a/arch/x86/xen/platform-pci-unplug.c ++++ b/arch/x86/xen/platform-pci-unplug.c +@@ -69,6 +69,80 @@ static int check_platform_magic(void) + return 0; + } + ++bool xen_has_pv_devices() ++{ ++ if (!xen_domain()) ++ return false; ++ ++ /* PV domains always have them. */ ++ if (xen_pv_domain()) ++ return true; ++ ++ /* And user has xen_platform_pci=0 set in guest config as ++ * driver did not modify the value. */ ++ if (xen_platform_pci_unplug == 0) ++ return false; ++ ++ if (xen_platform_pci_unplug & XEN_UNPLUG_NEVER) ++ return false; ++ ++ if (xen_platform_pci_unplug & XEN_UNPLUG_ALL) ++ return true; ++ ++ /* This is an odd one - we are going to run legacy ++ * and PV drivers at the same time. */ ++ if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) ++ return true; ++ ++ /* And the caller has to follow with xen_pv_{disk,nic}_devices ++ * to be certain which driver can load. */ ++ return false; ++} ++EXPORT_SYMBOL_GPL(xen_has_pv_devices); ++ ++static bool __xen_has_pv_device(int state) ++{ ++ /* HVM domains might or might not */ ++ if (xen_hvm_domain() && (xen_platform_pci_unplug & state)) ++ return true; ++ ++ return xen_has_pv_devices(); ++} ++ ++bool xen_has_pv_nic_devices(void) ++{ ++ return __xen_has_pv_device(XEN_UNPLUG_ALL_NICS | XEN_UNPLUG_ALL); ++} ++EXPORT_SYMBOL_GPL(xen_has_pv_nic_devices); ++ ++bool xen_has_pv_disk_devices(void) ++{ ++ return __xen_has_pv_device(XEN_UNPLUG_ALL_IDE_DISKS | ++ XEN_UNPLUG_AUX_IDE_DISKS | XEN_UNPLUG_ALL); ++} ++EXPORT_SYMBOL_GPL(xen_has_pv_disk_devices); ++ ++/* ++ * This one is odd - it determines whether you want to run PV _and_ ++ * legacy (IDE) drivers together. This combination is only possible ++ * under HVM. ++ */ ++bool xen_has_pv_and_legacy_disk_devices(void) ++{ ++ if (!xen_domain()) ++ return false; ++ ++ /* N.B. 
This is only ever used in HVM mode */ ++ if (xen_pv_domain()) ++ return false; ++ ++ if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) ++ return true; ++ ++ return false; ++} ++EXPORT_SYMBOL_GPL(xen_has_pv_and_legacy_disk_devices); ++ + void xen_unplug_emulated_devices(void) + { + int r; +--- a/drivers/block/xen-blkfront.c ++++ b/drivers/block/xen-blkfront.c +@@ -1356,7 +1356,7 @@ static int blkfront_probe(struct xenbus_ + char *type; + int len; + /* no unplug has been done: do not hook devices != xen vbds */ +- if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) { ++ if (xen_has_pv_and_legacy_disk_devices()) { + int major; + + if (!VDEV_IS_EXTENDED(vdevice)) +@@ -2079,7 +2079,7 @@ static int __init xlblk_init(void) + if (!xen_domain()) + return -ENODEV; + +- if (xen_hvm_domain() && !xen_platform_pci_unplug) ++ if (!xen_has_pv_disk_devices()) + return -ENODEV; + + if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) { +--- a/drivers/char/tpm/xen-tpmfront.c ++++ b/drivers/char/tpm/xen-tpmfront.c +@@ -17,6 +17,7 @@ + #include + #include + #include "tpm.h" ++#include + + struct tpm_private { + struct tpm_chip *chip; +@@ -421,6 +422,9 @@ static int __init xen_tpmfront_init(void + if (!xen_domain()) + return -ENODEV; + ++ if (!xen_has_pv_devices()) ++ return -ENODEV; ++ + return xenbus_register_frontend(&tpmfront_driver); + } + module_init(xen_tpmfront_init); +--- a/drivers/input/misc/xen-kbdfront.c ++++ b/drivers/input/misc/xen-kbdfront.c +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + + struct xenkbd_info { + struct input_dev *kbd; +@@ -380,6 +381,9 @@ static int __init xenkbd_init(void) + if (xen_initial_domain()) + return -ENODEV; + ++ if (!xen_has_pv_devices()) ++ return -ENODEV; ++ + return xenbus_register_frontend(&xenkbd_driver); + } + +--- a/drivers/net/xen-netfront.c ++++ b/drivers/net/xen-netfront.c +@@ -2079,7 +2079,7 @@ static int __init netif_init(void) + if (!xen_domain()) + return -ENODEV; + +- if (xen_hvm_domain() && !xen_platform_pci_unplug) ++ if (!xen_has_pv_nic_devices()) + return -ENODEV; + + pr_info("Initialising Xen virtual ethernet driver\n"); +--- a/drivers/pci/xen-pcifront.c ++++ b/drivers/pci/xen-pcifront.c +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + + #include + #define INVALID_GRANT_REF (0) +@@ -1138,6 +1139,9 @@ static int __init pcifront_init(void) + if (!xen_pv_domain() || xen_initial_domain()) + return -ENODEV; + ++ if (!xen_has_pv_devices()) ++ return -ENODEV; ++ + pci_frontend_registrar(1 /* enable */); + + return xenbus_register_frontend(&xenpci_driver); +--- a/drivers/video/xen-fbfront.c ++++ b/drivers/video/xen-fbfront.c +@@ -35,6 +35,7 @@ + #include + #include + #include ++#include + + struct xenfb_info { + unsigned char *fb; +@@ -699,6 +700,9 @@ static int __init xenfb_init(void) + if (xen_initial_domain()) + return -ENODEV; + ++ if (!xen_has_pv_devices()) ++ return -ENODEV; ++ + return xenbus_register_frontend(&xenfb_driver); + } + +--- a/drivers/xen/xenbus/xenbus_probe_frontend.c ++++ b/drivers/xen/xenbus/xenbus_probe_frontend.c +@@ -496,7 +496,7 @@ subsys_initcall(xenbus_probe_frontend_in + #ifndef MODULE + static int __init boot_wait_for_devices(void) + { +- if (xen_hvm_domain() && !xen_platform_pci_unplug) ++ if (!xen_has_pv_devices()) + return -ENODEV; + + ready_to_wait_for_devices = 1; +--- a/include/xen/platform_pci.h ++++ b/include/xen/platform_pci.h +@@ -48,4 +48,27 @@ static inline int xen_must_unplug_disks( + + extern int xen_platform_pci_unplug; + ++#if defined(CONFIG_XEN_PVHVM) ++extern bool 
xen_has_pv_devices(void); ++extern bool xen_has_pv_disk_devices(void); ++extern bool xen_has_pv_nic_devices(void); ++extern bool xen_has_pv_and_legacy_disk_devices(void); ++#else ++static inline bool xen_has_pv_devices(void) ++{ ++ return IS_ENABLED(CONFIG_XEN); ++} ++static inline bool xen_has_pv_disk_devices(void) ++{ ++ return IS_ENABLED(CONFIG_XEN); ++} ++static inline bool xen_has_pv_nic_devices(void) ++{ ++ return IS_ENABLED(CONFIG_XEN); ++} ++static inline bool xen_has_pv_and_legacy_disk_devices(void) ++{ ++ return false; ++} ++#endif + #endif /* _XEN_PLATFORM_PCI_H */ -- 2.47.2
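Taken together, the two mm/page-writeback.c fixes queued above change the dirtyable-memory baseline to "free pages minus the dirty balance reserve (clamped at zero), plus file LRU pages only". A stand-alone C sketch of the resulting arithmetic, with invented field names in place of the kernel's zone_page_state() counters:

----------
#include <stdio.h>

/* Invented per-zone counters; the kernel reads these via
 * zone_page_state(). */
struct zone {
	unsigned long free_pages;
	unsigned long dirty_balance_reserve;
	unsigned long inactive_file;
	unsigned long active_file;
};

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

static unsigned long zone_dirtyable_memory(const struct zone *z)
{
	/* Subtract the reserve from free pages only (clamped), so a
	 * reserve larger than free memory cannot erase the already
	 * allocated cache from the baseline... */
	unsigned long nr = z->free_pages;

	nr -= min_ul(nr, z->dirty_balance_reserve);

	/* ...then add the file LRU on top.  Anonymous pages are left
	 * out: the VM will not swap them to make room for dirty
	 * cache, so they must not count as dirtyable. */
	return nr + z->inactive_file + z->active_file;
}

int main(void)
{
	struct zone z = {
		.free_pages            = 1000,
		.dirty_balance_reserve = 4000,	/* bigger than free */
		.inactive_file         = 2000,
		.active_file           = 500,
	};

	/* Prints 2500: the file cache still counts as dirtyable even
	 * though the reserve swallows all free pages. */
	printf("dirtyable: %lu pages\n", zone_dirtyable_memory(&z));
	return 0;
}
----------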