From: Greg Kroah-Hartman Date: Thu, 6 Feb 2014 23:24:35 +0000 (-0800) Subject: 3.12-stable patches X-Git-Tag: v3.4.80~87 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=864545ded606df552eaaf784dc04f79b9e19b97f;p=thirdparty%2Fkernel%2Fstable-queue.git 3.12-stable patches added patches: arch-sh-kernel-kgdb.c-add-missing-include-linux-sched.h.patch audit-correct-a-type-mismatch-in-audit_syscall_exit.patch audit-reset-audit-backlog-wait-time-after-error-recovery.patch ftrace-fix-synchronization-location-disabling-and-freeing-ftrace_ops.patch ftrace-have-function-graph-only-trace-based-on-global_ops-filters.patch fuse-fix-pipe_buf_operations.patch intel-iommu-fix-off-by-one-in-pagetable-freeing.patch mm-memcg-iteration-skip-memcgs-not-yet-fully-initialized.patch mm-memory-failure.c-shift-page-lock-from-head-page-to-tail-page-after-thp-split.patch mm-page-writeback.c-do-not-count-anon-pages-as-dirtyable-memory.patch mm-page-writeback.c-fix-dirty_balance_reserve-subtraction-from-dirtyable-memory.patch revert-eisa-initialize-device-before-its-resources.patch selinux-fix-memory-leak-upon-loading-policy.patch tracing-check-if-tracing-is-enabled-in-trace_puts.patch tracing-have-trace-buffer-point-back-to-trace_array.patch xen-pvhvm-if-xen_platform_pci-0-is-set-don-t-blow-up-v4.patch --- diff --git a/queue-3.12/arch-sh-kernel-kgdb.c-add-missing-include-linux-sched.h.patch b/queue-3.12/arch-sh-kernel-kgdb.c-add-missing-include-linux-sched.h.patch new file mode 100644 index 00000000000..27a040ee275 --- /dev/null +++ b/queue-3.12/arch-sh-kernel-kgdb.c-add-missing-include-linux-sched.h.patch @@ -0,0 +1,43 @@ +From 53a52f17d96c8d47c79a7dafa81426317e89c7c1 Mon Sep 17 00:00:00 2001 +From: Wanlong Gao +Date: Tue, 21 Jan 2014 15:48:41 -0800 +Subject: arch/sh/kernel/kgdb.c: add missing #include + +From: Wanlong Gao + +commit 53a52f17d96c8d47c79a7dafa81426317e89c7c1 upstream. 
+ + arch/sh/kernel/kgdb.c: In function 'sleeping_thread_to_gdb_regs': + arch/sh/kernel/kgdb.c:225:32: error: implicit declaration of function 'task_stack_page' [-Werror=implicit-function-declaration] + arch/sh/kernel/kgdb.c:242:23: error: dereferencing pointer to incomplete type + arch/sh/kernel/kgdb.c:243:22: error: dereferencing pointer to incomplete type + arch/sh/kernel/kgdb.c: In function 'singlestep_trap_handler': + arch/sh/kernel/kgdb.c:310:27: error: 'SIGTRAP' undeclared (first use in this function) + arch/sh/kernel/kgdb.c:310:27: note: each undeclared identifier is reported only once for each function it appears in + +This was introduced by commit 16559ae48c76 ("kgdb: remove #include + from kgdb.h"). + +[geert@linux-m68k.org: reworded and reformatted] +Signed-off-by: Wanlong Gao +Signed-off-by: Geert Uytterhoeven +Reported-by: Fengguang Wu +Acked-by: Greg Kroah-Hartman +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + arch/sh/kernel/kgdb.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/arch/sh/kernel/kgdb.c ++++ b/arch/sh/kernel/kgdb.c +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include + #include + diff --git a/queue-3.12/audit-correct-a-type-mismatch-in-audit_syscall_exit.patch b/queue-3.12/audit-correct-a-type-mismatch-in-audit_syscall_exit.patch new file mode 100644 index 00000000000..2d680e4f2c7 --- /dev/null +++ b/queue-3.12/audit-correct-a-type-mismatch-in-audit_syscall_exit.patch @@ -0,0 +1,39 @@ +From 06bdadd7634551cfe8ce071fe44d0311b3033d9e Mon Sep 17 00:00:00 2001 +From: AKASHI Takahiro +Date: Mon, 13 Jan 2014 13:33:09 -0800 +Subject: audit: correct a type mismatch in audit_syscall_exit() + +From: AKASHI Takahiro + +commit 06bdadd7634551cfe8ce071fe44d0311b3033d9e upstream. + +audit_syscall_exit() saves a result of regs_return_value() in intermediate +"int" variable and passes it to __audit_syscall_exit(), which expects its +second argument as a "long" value. 
This will result in truncating the +value returned by a system call and making a wrong audit record. + +I don't know why gcc compiler doesn't complain about this, but anyway it +causes a problem at runtime on arm64 (and probably most 64-bit archs). + +Signed-off-by: AKASHI Takahiro +Cc: Al Viro +Cc: Eric Paris +Signed-off-by: Andrew Morton +Signed-off-by: Eric Paris +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/audit.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/include/linux/audit.h ++++ b/include/linux/audit.h +@@ -135,7 +135,7 @@ static inline void audit_syscall_exit(vo + { + if (unlikely(current->audit_context)) { + int success = is_syscall_success(pt_regs); +- int return_code = regs_return_value(pt_regs); ++ long return_code = regs_return_value(pt_regs); + + __audit_syscall_exit(success, return_code); + } diff --git a/queue-3.12/audit-reset-audit-backlog-wait-time-after-error-recovery.patch b/queue-3.12/audit-reset-audit-backlog-wait-time-after-error-recovery.patch new file mode 100644 index 00000000000..351efcb07aa --- /dev/null +++ b/queue-3.12/audit-reset-audit-backlog-wait-time-after-error-recovery.patch @@ -0,0 +1,48 @@ +From e789e561a50de0aaa8c695662d97aaa5eac9d55f Mon Sep 17 00:00:00 2001 +From: Richard Guy Briggs +Date: Thu, 12 Sep 2013 23:03:51 -0400 +Subject: audit: reset audit backlog wait time after error recovery + +From: Richard Guy Briggs + +commit e789e561a50de0aaa8c695662d97aaa5eac9d55f upstream. + +When the audit queue overflows and times out (audit_backlog_wait_time), the +audit queue overflow timeout is set to zero. Once the audit queue overflow +timeout condition recovers, the timeout should be reset to the original value. 
+ +See also: + https://lkml.org/lkml/2013/9/2/473 + +Signed-off-by: Luiz Capitulino +Signed-off-by: Dan Duval +Signed-off-by: Chuck Anderson +Signed-off-by: Richard Guy Briggs +Signed-off-by: Eric Paris +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/audit.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +--- a/kernel/audit.c ++++ b/kernel/audit.c +@@ -103,7 +103,8 @@ static int audit_rate_limit; + + /* Number of outstanding audit_buffers allowed. */ + static int audit_backlog_limit = 64; +-static int audit_backlog_wait_time = 60 * HZ; ++#define AUDIT_BACKLOG_WAIT_TIME (60 * HZ) ++static int audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; + static int audit_backlog_wait_overflow = 0; + + /* The identity of the user shutting down the audit system. */ +@@ -1135,6 +1136,8 @@ struct audit_buffer *audit_log_start(str + return NULL; + } + ++ audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; ++ + ab = audit_buffer_alloc(ctx, gfp_mask, type); + if (!ab) { + audit_log_lost("out of memory in audit_log_start"); diff --git a/queue-3.12/ftrace-fix-synchronization-location-disabling-and-freeing-ftrace_ops.patch b/queue-3.12/ftrace-fix-synchronization-location-disabling-and-freeing-ftrace_ops.patch new file mode 100644 index 00000000000..f505f789a6b --- /dev/null +++ b/queue-3.12/ftrace-fix-synchronization-location-disabling-and-freeing-ftrace_ops.patch @@ -0,0 +1,111 @@ +From a4c35ed241129dd142be4cadb1e5a474a56d5464 Mon Sep 17 00:00:00 2001 +From: "Steven Rostedt (Red Hat)" +Date: Mon, 13 Jan 2014 12:56:21 -0500 +Subject: ftrace: Fix synchronization location disabling and freeing ftrace_ops + +From: "Steven Rostedt (Red Hat)" + +commit a4c35ed241129dd142be4cadb1e5a474a56d5464 upstream. + +The synchronization needed after ftrace_ops are unregistered must happen +after the callback is disabled from being called by functions. 
+ +The current location happens after the function is being removed from the +internal lists, but not after the function callbacks were disabled, leaving +the functions susceptible of being called after their callbacks are freed. + +This affects perf and any external users of function tracing (LTTng and +SystemTap). + +Fixes: cdbe61bfe704 "ftrace: Allow dynamically allocated function tracers" +Signed-off-by: Steven Rostedt +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/trace/ftrace.c | 58 +++++++++++++++++++++++++++----------------------- + 1 file changed, 32 insertions(+), 26 deletions(-) + +--- a/kernel/trace/ftrace.c ++++ b/kernel/trace/ftrace.c +@@ -447,20 +447,6 @@ static int __unregister_ftrace_function( + } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { + ret = remove_ftrace_list_ops(&ftrace_control_list, + &control_ops, ops); +- if (!ret) { +- /* +- * The ftrace_ops is now removed from the list, +- * so there'll be no new users. We must ensure +- * all current users are done before we free +- * the control data. +- * Note synchronize_sched() is not enough, as we +- * use preempt_disable() to do RCU, but the function +- * tracer can be called where RCU is not active +- * (before user_exit()). +- */ +- schedule_on_each_cpu(ftrace_sync); +- control_ops_free(ops); +- } + } else + ret = remove_ftrace_ops(&ftrace_ops_list, ops); + +@@ -470,17 +456,6 @@ static int __unregister_ftrace_function( + if (ftrace_enabled) + update_ftrace_function(); + +- /* +- * Dynamic ops may be freed, we must make sure that all +- * callers are done before leaving this function. +- * +- * Again, normal synchronize_sched() is not good enough. +- * We need to do a hard force of sched synchronization. 
+- */ +- if (ops->flags & FTRACE_OPS_FL_DYNAMIC) +- schedule_on_each_cpu(ftrace_sync); +- +- + return 0; + } + +@@ -2164,10 +2139,41 @@ static int ftrace_shutdown(struct ftrace + command |= FTRACE_UPDATE_TRACE_FUNC; + } + +- if (!command || !ftrace_enabled) ++ if (!command || !ftrace_enabled) { ++ /* ++ * If these are control ops, they still need their ++ * per_cpu field freed. Since, function tracing is ++ * not currently active, we can just free them ++ * without synchronizing all CPUs. ++ */ ++ if (ops->flags & FTRACE_OPS_FL_CONTROL) ++ control_ops_free(ops); + return 0; ++ } + + ftrace_run_update_code(command); ++ ++ /* ++ * Dynamic ops may be freed, we must make sure that all ++ * callers are done before leaving this function. ++ * The same goes for freeing the per_cpu data of the control ++ * ops. ++ * ++ * Again, normal synchronize_sched() is not good enough. ++ * We need to do a hard force of sched synchronization. ++ * This is because we use preempt_disable() to do RCU, but ++ * the function tracers can be called where RCU is not watching ++ * (like before user_exit()). We can not rely on the RCU ++ * infrastructure to do the synchronization, thus we must do it ++ * ourselves. 
++ */ ++ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) { ++ schedule_on_each_cpu(ftrace_sync); ++ ++ if (ops->flags & FTRACE_OPS_FL_CONTROL) ++ control_ops_free(ops); ++ } ++ + return 0; + } + diff --git a/queue-3.12/ftrace-have-function-graph-only-trace-based-on-global_ops-filters.patch b/queue-3.12/ftrace-have-function-graph-only-trace-based-on-global_ops-filters.patch new file mode 100644 index 00000000000..cb7fc1165bf --- /dev/null +++ b/queue-3.12/ftrace-have-function-graph-only-trace-based-on-global_ops-filters.patch @@ -0,0 +1,184 @@ +From 23a8e8441a0a74dd612edf81dc89d1600bc0a3d1 Mon Sep 17 00:00:00 2001 +From: "Steven Rostedt (Red Hat)" +Date: Mon, 13 Jan 2014 10:30:23 -0500 +Subject: ftrace: Have function graph only trace based on global_ops filters + +From: "Steven Rostedt (Red Hat)" + +commit 23a8e8441a0a74dd612edf81dc89d1600bc0a3d1 upstream. + +Doing some different tests, I discovered that function graph tracing, when +filtered via the set_ftrace_filter and set_ftrace_notrace files, does +not always keep with them if another function ftrace_ops is registered +to trace functions. + +The reason is that function graph just happens to trace all functions +that the function tracer enables. When there was only one user of +function tracing, the function graph tracer did not need to worry about +being called by functions that it did not want to trace. But now that there +are other users, this becomes a problem. + +For example, one just needs to do the following: + + # cd /sys/kernel/debug/tracing + # echo schedule > set_ftrace_filter + # echo function_graph > current_tracer + # cat trace +[..] + 0) | schedule() { + ------------------------------------------ + 0) -0 => rcu_pre-7 + ------------------------------------------ + + 0) ! 
2980.314 us | } + 0) | schedule() { + ------------------------------------------ + 0) rcu_pre-7 => -0 + ------------------------------------------ + + 0) + 20.701 us | } + + # echo 1 > /proc/sys/kernel/stack_tracer_enabled + # cat trace +[..] + 1) + 20.825 us | } + 1) + 21.651 us | } + 1) + 30.924 us | } /* SyS_ioctl */ + 1) | do_page_fault() { + 1) | __do_page_fault() { + 1) 0.274 us | down_read_trylock(); + 1) 0.098 us | find_vma(); + 1) | handle_mm_fault() { + 1) | _raw_spin_lock() { + 1) 0.102 us | preempt_count_add(); + 1) 0.097 us | do_raw_spin_lock(); + 1) 2.173 us | } + 1) | do_wp_page() { + 1) 0.079 us | vm_normal_page(); + 1) 0.086 us | reuse_swap_page(); + 1) 0.076 us | page_move_anon_rmap(); + 1) | unlock_page() { + 1) 0.082 us | page_waitqueue(); + 1) 0.086 us | __wake_up_bit(); + 1) 1.801 us | } + 1) 0.075 us | ptep_set_access_flags(); + 1) | _raw_spin_unlock() { + 1) 0.098 us | do_raw_spin_unlock(); + 1) 0.105 us | preempt_count_sub(); + 1) 1.884 us | } + 1) 9.149 us | } + 1) + 13.083 us | } + 1) 0.146 us | up_read(); + +When the stack tracer was enabled, it enabled all functions to be traced, which +now the function graph tracer also traces. This is a side effect that should +not occur. + +To fix this a test is added when the function tracing is changed, as well as when +the graph tracer is enabled, to see if anything other than the ftrace global_ops +function tracer is enabled. If so, then the graph tracer calls a test trampoline +that will look at the function that is being traced and compare it with the +filters defined by the global_ops. + +As an optimization, if there's no other function tracers registered, or if +the only registered function tracers also use the global ops, the function +graph infrastructure will call the registered function graph callback directly +and not go through the test trampoline. 
+ +Fixes: d2d45c7a03a2 "tracing: Have stack_tracer use a separate list of functions" +Signed-off-by: Steven Rostedt +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/trace/ftrace.c | 45 ++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 44 insertions(+), 1 deletion(-) + +--- a/kernel/trace/ftrace.c ++++ b/kernel/trace/ftrace.c +@@ -278,6 +278,12 @@ static void update_global_ops(void) + global_ops.func = func; + } + ++#ifdef CONFIG_FUNCTION_GRAPH_TRACER ++static void update_function_graph_func(void); ++#else ++static inline void update_function_graph_func(void) { } ++#endif ++ + static void update_ftrace_function(void) + { + ftrace_func_t func; +@@ -325,6 +331,8 @@ static int remove_ftrace_ops(struct ftra + { + struct ftrace_ops **p; + ++ update_function_graph_func(); ++ + /* + * If we are removing the last function, then simply point + * to the ftrace_stub. +@@ -4777,6 +4785,7 @@ int ftrace_graph_entry_stub(struct ftrac + trace_func_graph_ret_t ftrace_graph_return = + (trace_func_graph_ret_t)ftrace_stub; + trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; ++static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub; + + /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ + static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) +@@ -4918,6 +4927,30 @@ static struct ftrace_ops fgraph_ops __re + FTRACE_OPS_FL_RECURSION_SAFE, + }; + ++static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace) ++{ ++ if (!ftrace_ops_test(&global_ops, trace->func, NULL)) ++ return 0; ++ return __ftrace_graph_entry(trace); ++} ++ ++/* ++ * The function graph tracer should only trace the functions defined ++ * by set_ftrace_filter and set_ftrace_notrace. If another function ++ * tracer ops is registered, the graph tracer requires testing the ++ * function against the global ops, and not just trace any function ++ * that any ftrace_ops registered. 
++ */ ++static void update_function_graph_func(void) ++{ ++ if (ftrace_ops_list == &ftrace_list_end || ++ (ftrace_ops_list == &global_ops && ++ global_ops.next == &ftrace_list_end)) ++ ftrace_graph_entry = __ftrace_graph_entry; ++ else ++ ftrace_graph_entry = ftrace_graph_entry_test; ++} ++ + int register_ftrace_graph(trace_func_graph_ret_t retfunc, + trace_func_graph_ent_t entryfunc) + { +@@ -4942,7 +4975,16 @@ int register_ftrace_graph(trace_func_gra + } + + ftrace_graph_return = retfunc; +- ftrace_graph_entry = entryfunc; ++ ++ /* ++ * Update the indirect function to the entryfunc, and the ++ * function that gets called to the entry_test first. Then ++ * call the update fgraph entry function to determine if ++ * the entryfunc should be called directly or not. ++ */ ++ __ftrace_graph_entry = entryfunc; ++ ftrace_graph_entry = ftrace_graph_entry_test; ++ update_function_graph_func(); + + ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET); + +@@ -4961,6 +5003,7 @@ void unregister_ftrace_graph(void) + ftrace_graph_active--; + ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; + ftrace_graph_entry = ftrace_graph_entry_stub; ++ __ftrace_graph_entry = ftrace_graph_entry_stub; + ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET); + unregister_pm_notifier(&ftrace_suspend_notifier); + unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); diff --git a/queue-3.12/fuse-fix-pipe_buf_operations.patch b/queue-3.12/fuse-fix-pipe_buf_operations.patch new file mode 100644 index 00000000000..d9d94038235 --- /dev/null +++ b/queue-3.12/fuse-fix-pipe_buf_operations.patch @@ -0,0 +1,152 @@ +From 28a625cbc2a14f17b83e47ef907b2658576a32aa Mon Sep 17 00:00:00 2001 +From: Miklos Szeredi +Date: Wed, 22 Jan 2014 19:36:57 +0100 +Subject: fuse: fix pipe_buf_operations + +From: Miklos Szeredi + +commit 28a625cbc2a14f17b83e47ef907b2658576a32aa upstream. 
+ +Having this struct in module memory could Oops if the module is +unloaded while the buffer still persists in a pipe. + +Since sock_pipe_buf_ops is essentially the same as fuse_dev_pipe_buf_steal +merge them into nosteal_pipe_buf_ops (this is the same as +default_pipe_buf_ops except stealing the page from the buffer is not +allowed). + +Reported-by: Al Viro +Signed-off-by: Miklos Szeredi +Signed-off-by: Greg Kroah-Hartman + +--- + fs/fuse/dev.c | 22 +++++----------------- + fs/splice.c | 18 ++++++++++++++++++ + include/linux/pipe_fs_i.h | 2 ++ + net/core/skbuff.c | 32 +------------------------------- + 4 files changed, 26 insertions(+), 48 deletions(-) + +--- a/fs/fuse/dev.c ++++ b/fs/fuse/dev.c +@@ -1296,22 +1296,6 @@ static ssize_t fuse_dev_read(struct kioc + return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs)); + } + +-static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe, +- struct pipe_buffer *buf) +-{ +- return 1; +-} +- +-static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = { +- .can_merge = 0, +- .map = generic_pipe_buf_map, +- .unmap = generic_pipe_buf_unmap, +- .confirm = generic_pipe_buf_confirm, +- .release = generic_pipe_buf_release, +- .steal = fuse_dev_pipe_buf_steal, +- .get = generic_pipe_buf_get, +-}; +- + static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +@@ -1358,7 +1342,11 @@ static ssize_t fuse_dev_splice_read(stru + buf->page = bufs[page_nr].page; + buf->offset = bufs[page_nr].offset; + buf->len = bufs[page_nr].len; +- buf->ops = &fuse_dev_pipe_buf_ops; ++ /* ++ * Need to be careful about this. Having buf->ops in module ++ * code can Oops if the buffer persists after module unload. 
++ */ ++ buf->ops = &nosteal_pipe_buf_ops; + + pipe->nrbufs++; + page_nr++; +--- a/fs/splice.c ++++ b/fs/splice.c +@@ -555,6 +555,24 @@ static const struct pipe_buf_operations + .get = generic_pipe_buf_get, + }; + ++static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, ++ struct pipe_buffer *buf) ++{ ++ return 1; ++} ++ ++/* Pipe buffer operations for a socket and similar. */ ++const struct pipe_buf_operations nosteal_pipe_buf_ops = { ++ .can_merge = 0, ++ .map = generic_pipe_buf_map, ++ .unmap = generic_pipe_buf_unmap, ++ .confirm = generic_pipe_buf_confirm, ++ .release = generic_pipe_buf_release, ++ .steal = generic_pipe_buf_nosteal, ++ .get = generic_pipe_buf_get, ++}; ++EXPORT_SYMBOL(nosteal_pipe_buf_ops); ++ + static ssize_t kernel_readv(struct file *file, const struct iovec *vec, + unsigned long vlen, loff_t offset) + { +--- a/include/linux/pipe_fs_i.h ++++ b/include/linux/pipe_fs_i.h +@@ -157,6 +157,8 @@ int generic_pipe_buf_confirm(struct pipe + int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); + void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); + ++extern const struct pipe_buf_operations nosteal_pipe_buf_ops; ++ + /* for F_SETPIPE_SZ and F_GETPIPE_SZ */ + long pipe_fcntl(struct file *, unsigned int, unsigned long arg); + struct pipe_inode_info *get_pipe_info(struct file *file); +--- a/net/core/skbuff.c ++++ b/net/core/skbuff.c +@@ -74,36 +74,6 @@ + struct kmem_cache *skbuff_head_cache __read_mostly; + static struct kmem_cache *skbuff_fclone_cache __read_mostly; + +-static void sock_pipe_buf_release(struct pipe_inode_info *pipe, +- struct pipe_buffer *buf) +-{ +- put_page(buf->page); +-} +- +-static void sock_pipe_buf_get(struct pipe_inode_info *pipe, +- struct pipe_buffer *buf) +-{ +- get_page(buf->page); +-} +- +-static int sock_pipe_buf_steal(struct pipe_inode_info *pipe, +- struct pipe_buffer *buf) +-{ +- return 1; +-} +- +- +-/* Pipe buffer operations for a socket. 
*/ +-static const struct pipe_buf_operations sock_pipe_buf_ops = { +- .can_merge = 0, +- .map = generic_pipe_buf_map, +- .unmap = generic_pipe_buf_unmap, +- .confirm = generic_pipe_buf_confirm, +- .release = sock_pipe_buf_release, +- .steal = sock_pipe_buf_steal, +- .get = sock_pipe_buf_get, +-}; +- + /** + * skb_panic - private function for out-of-line support + * @skb: buffer +@@ -1800,7 +1770,7 @@ int skb_splice_bits(struct sk_buff *skb, + .partial = partial, + .nr_pages_max = MAX_SKB_FRAGS, + .flags = flags, +- .ops = &sock_pipe_buf_ops, ++ .ops = &nosteal_pipe_buf_ops, + .spd_release = sock_spd_release, + }; + struct sk_buff *frag_iter; diff --git a/queue-3.12/intel-iommu-fix-off-by-one-in-pagetable-freeing.patch b/queue-3.12/intel-iommu-fix-off-by-one-in-pagetable-freeing.patch new file mode 100644 index 00000000000..86e14c028e5 --- /dev/null +++ b/queue-3.12/intel-iommu-fix-off-by-one-in-pagetable-freeing.patch @@ -0,0 +1,61 @@ +From 08336fd218e087cc4fcc458e6b6dcafe8702b098 Mon Sep 17 00:00:00 2001 +From: Alex Williamson +Date: Tue, 21 Jan 2014 15:48:18 -0800 +Subject: intel-iommu: fix off-by-one in pagetable freeing + +From: Alex Williamson + +commit 08336fd218e087cc4fcc458e6b6dcafe8702b098 upstream. + +dma_pte_free_level() has an off-by-one error when checking whether a pte +is completely covered by a range. Take for example the case of +attempting to free pfn 0x0 - 0x1ff, ie. 512 entries covering the first +2M superpage. + +The level_size() is 0x200 and we test: + + static void dma_pte_free_level(... + ... + + if (!(0 > 0 || 0x1ff < 0 + 0x200)) { + ... + } + +Clearly the 2nd test is true, which means we fail to take the branch to +clear and free the pagetable entry. As a result, we're leaking +pagetables and failing to install new pages over the range. + +This was found with a PCI device assigned to a QEMU guest using vfio-pci +without a VGA device present. 
The first 1M of guest address space is +mapped with various combinations of 4K pages, but eventually the range +is entirely freed and replaced with a 2M contiguous mapping. +intel-iommu errors out with something like: + + ERROR: DMA PTE for vPFN 0x0 already set (to 5c2b8003 not 849c00083) + +In this case 5c2b8003 is the pointer to the previous leaf page that was +neither freed nor cleared and 849c00083 is the superpage entry that +we're trying to replace it with. + +Signed-off-by: Alex Williamson +Cc: David Woodhouse +Cc: Joerg Roedel +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/iommu/intel-iommu.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/iommu/intel-iommu.c ++++ b/drivers/iommu/intel-iommu.c +@@ -917,7 +917,7 @@ static void dma_pte_free_level(struct dm + + /* If range covers entire pagetable, free it */ + if (!(start_pfn > level_pfn || +- last_pfn < level_pfn + level_size(level))) { ++ last_pfn < level_pfn + level_size(level) - 1)) { + dma_clear_pte(pte); + domain_flush_cache(domain, pte, sizeof(*pte)); + free_pgtable_page(level_pte); diff --git a/queue-3.12/mm-memcg-iteration-skip-memcgs-not-yet-fully-initialized.patch b/queue-3.12/mm-memcg-iteration-skip-memcgs-not-yet-fully-initialized.patch new file mode 100644 index 00000000000..81c002cd430 --- /dev/null +++ b/queue-3.12/mm-memcg-iteration-skip-memcgs-not-yet-fully-initialized.patch @@ -0,0 +1,41 @@ +From d8ad30559715ce97afb7d1a93a12fd90e8fff312 Mon Sep 17 00:00:00 2001 +From: Hugh Dickins +Date: Thu, 23 Jan 2014 15:53:32 -0800 +Subject: mm/memcg: iteration skip memcgs not yet fully initialized + +From: Hugh Dickins + +commit d8ad30559715ce97afb7d1a93a12fd90e8fff312 upstream. + +It is surprising that the mem_cgroup iterator can return memcgs which +have not yet been fully initialized. By accident (or trial and error?) 
+this appears not to present an actual problem; but it may be better to +prevent such surprises, by skipping memcgs not yet online. + +Signed-off-by: Hugh Dickins +Cc: Tejun Heo +Acked-by: Michal Hocko +Cc: Johannes Weiner +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/memcontrol.c | 6 ++---- + 1 file changed, 2 insertions(+), 4 deletions(-) + +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -1081,10 +1081,8 @@ skip_node: + * protected by css_get and the tree walk is rcu safe. + */ + if (next_css) { +- struct mem_cgroup *mem = mem_cgroup_from_css(next_css); +- +- if (css_tryget(&mem->css)) +- return mem; ++ if ((next_css->flags & CSS_ONLINE) && css_tryget(next_css)) ++ return mem_cgroup_from_css(next_css); + else { + prev_css = next_css; + goto skip_node; diff --git a/queue-3.12/mm-memory-failure.c-shift-page-lock-from-head-page-to-tail-page-after-thp-split.patch b/queue-3.12/mm-memory-failure.c-shift-page-lock-from-head-page-to-tail-page-after-thp-split.patch new file mode 100644 index 00000000000..8610322f726 --- /dev/null +++ b/queue-3.12/mm-memory-failure.c-shift-page-lock-from-head-page-to-tail-page-after-thp-split.patch @@ -0,0 +1,141 @@ +From 54b9dd14d09f24927285359a227aa363ce46089e Mon Sep 17 00:00:00 2001 +From: Naoya Horiguchi +Date: Thu, 23 Jan 2014 15:53:14 -0800 +Subject: mm/memory-failure.c: shift page lock from head page to tail page after thp split + +From: Naoya Horiguchi + +commit 54b9dd14d09f24927285359a227aa363ce46089e upstream. + +After thp split in hwpoison_user_mappings(), we hold page lock on the +raw error page only between try_to_unmap, hence we are in danger of race +condition. 
+ +I found in the RHEL7 MCE-relay testing that we have "bad page" error +when a memory error happens on a thp tail page used by qemu-kvm: + + Triggering MCE exception on CPU 10 + mce: [Hardware Error]: Machine check events logged + MCE exception done on CPU 10 + MCE 0x38c535: Killing qemu-kvm:8418 due to hardware memory corruption + MCE 0x38c535: dirty LRU page recovery: Recovered + qemu-kvm[8418]: segfault at 20 ip 00007ffb0f0f229a sp 00007fffd6bc5240 error 4 in qemu-kvm[7ffb0ef14000+420000] + BUG: Bad page state in process qemu-kvm pfn:38c400 + page:ffffea000e310000 count:0 mapcount:0 mapping: (null) index:0x7ffae3c00 + page flags: 0x2fffff0008001d(locked|referenced|uptodate|dirty|swapbacked) + Modules linked in: hwpoison_inject mce_inject vhost_net macvtap macvlan ... + CPU: 0 PID: 8418 Comm: qemu-kvm Tainted: G M -------------- 3.10.0-54.0.1.el7.mce_test_fixed.x86_64 #1 + Hardware name: NEC NEC Express5800/R120b-1 [N8100-1719F]/MS-91E7-001, BIOS 4.6.3C19 02/10/2011 + Call Trace: + dump_stack+0x19/0x1b + bad_page.part.59+0xcf/0xe8 + free_pages_prepare+0x148/0x160 + free_hot_cold_page+0x31/0x140 + free_hot_cold_page_list+0x46/0xa0 + release_pages+0x1c1/0x200 + free_pages_and_swap_cache+0xad/0xd0 + tlb_flush_mmu.part.46+0x4c/0x90 + tlb_finish_mmu+0x55/0x60 + exit_mmap+0xcb/0x170 + mmput+0x67/0xf0 + vhost_dev_cleanup+0x231/0x260 [vhost_net] + vhost_net_release+0x3f/0x90 [vhost_net] + __fput+0xe9/0x270 + ____fput+0xe/0x10 + task_work_run+0xc4/0xe0 + do_exit+0x2bb/0xa40 + do_group_exit+0x3f/0xa0 + get_signal_to_deliver+0x1d0/0x6e0 + do_signal+0x48/0x5e0 + do_notify_resume+0x71/0xc0 + retint_signal+0x48/0x8c + +The reason of this bug is that a page fault happens before unlocking the +head page at the end of memory_failure(). 
This strange page fault is +trying to access to address 0x20 and I'm not sure why qemu-kvm does +this, but anyway as a result the SIGSEGV makes qemu-kvm exit and on the +way we catch the bad page bug/warning because we try to free a locked +page (which was the former head page.) + +To fix this, this patch suggests to shift page lock from head page to +tail page just after thp split. SIGSEGV still happens, but it affects +only error affected VMs, not a whole system. + +Signed-off-by: Naoya Horiguchi +Cc: Andi Kleen +Cc: Wanpeng Li +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/memory-failure.c | 21 +++++++++++---------- + 1 file changed, 11 insertions(+), 10 deletions(-) + +--- a/mm/memory-failure.c ++++ b/mm/memory-failure.c +@@ -856,14 +856,14 @@ static int page_action(struct page_state + * the pages and send SIGBUS to the processes if the data was dirty. + */ + static int hwpoison_user_mappings(struct page *p, unsigned long pfn, +- int trapno, int flags) ++ int trapno, int flags, struct page **hpagep) + { + enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; + struct address_space *mapping; + LIST_HEAD(tokill); + int ret; + int kill = 1, forcekill; +- struct page *hpage = compound_head(p); ++ struct page *hpage = *hpagep; + struct page *ppage; + + if (PageReserved(p) || PageSlab(p)) +@@ -942,11 +942,14 @@ static int hwpoison_user_mappings(struct + * We pinned the head page for hwpoison handling, + * now we split the thp and we are interested in + * the hwpoisoned raw page, so move the refcount +- * to it. ++ * to it. Similarly, page lock is shifted. + */ + if (hpage != p) { + put_page(hpage); + get_page(p); ++ lock_page(p); ++ unlock_page(hpage); ++ *hpagep = p; + } + /* THP is split, so ppage should be the real poisoned page. 
*/ + ppage = p; +@@ -964,17 +967,11 @@ static int hwpoison_user_mappings(struct + if (kill) + collect_procs(ppage, &tokill); + +- if (hpage != ppage) +- lock_page(ppage); +- + ret = try_to_unmap(ppage, ttu); + if (ret != SWAP_SUCCESS) + printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", + pfn, page_mapcount(ppage)); + +- if (hpage != ppage) +- unlock_page(ppage); +- + /* + * Now that the dirty bit has been propagated to the + * struct page and all unmaps done we can decide if +@@ -1193,8 +1190,12 @@ int memory_failure(unsigned long pfn, in + /* + * Now take care of user space mappings. + * Abort on fail: __delete_from_page_cache() assumes unmapped page. ++ * ++ * When the raw error page is thp tail page, hpage points to the raw ++ * page after thp split. + */ +- if (hwpoison_user_mappings(p, pfn, trapno, flags) != SWAP_SUCCESS) { ++ if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage) ++ != SWAP_SUCCESS) { + printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); + res = -EBUSY; + goto out; diff --git a/queue-3.12/mm-page-writeback.c-do-not-count-anon-pages-as-dirtyable-memory.patch b/queue-3.12/mm-page-writeback.c-do-not-count-anon-pages-as-dirtyable-memory.patch new file mode 100644 index 00000000000..8685c56cf8e --- /dev/null +++ b/queue-3.12/mm-page-writeback.c-do-not-count-anon-pages-as-dirtyable-memory.patch @@ -0,0 +1,125 @@ +From a1c3bfb2f67ef766de03f1f56bdfff9c8595ab14 Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 29 Jan 2014 14:05:41 -0800 +Subject: mm/page-writeback.c: do not count anon pages as dirtyable memory + +From: Johannes Weiner + +commit a1c3bfb2f67ef766de03f1f56bdfff9c8595ab14 upstream. + +The VM is currently heavily tuned to avoid swapping. 
Whether that is +good or bad is a separate discussion, but as long as the VM won't swap +to make room for dirty cache, we can not consider anonymous pages when +calculating the amount of dirtyable memory, the baseline to which +dirty_background_ratio and dirty_ratio are applied. + +A simple workload that occupies a significant size (40+%, depending on +memory layout, storage speeds etc.) of memory with anon/tmpfs pages and +uses the remainder for a streaming writer demonstrates this problem. In +that case, the actual cache pages are a small fraction of what is +considered dirtyable overall, which results in a relatively large +portion of the cache pages to be dirtied. As kswapd starts rotating +these, random tasks enter direct reclaim and stall on IO. + +Only consider free pages and file pages dirtyable. + +Signed-off-by: Johannes Weiner +Reported-by: Tejun Heo +Tested-by: Tejun Heo +Reviewed-by: Rik van Riel +Cc: Mel Gorman +Cc: Wu Fengguang +Reviewed-by: Michal Hocko +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/vmstat.h | 2 -- + mm/internal.h | 1 - + mm/page-writeback.c | 6 ++++-- + mm/vmscan.c | 23 +---------------------- + 4 files changed, 5 insertions(+), 27 deletions(-) + +--- a/include/linux/vmstat.h ++++ b/include/linux/vmstat.h +@@ -142,8 +142,6 @@ static inline unsigned long zone_page_st + return x; + } + +-extern unsigned long global_reclaimable_pages(void); +- + #ifdef CONFIG_NUMA + /* + * Determine the per node value of a stat item.
This function +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -85,7 +85,6 @@ extern unsigned long highest_memmap_pfn; + */ + extern int isolate_lru_page(struct page *page); + extern void putback_lru_page(struct page *page); +-extern unsigned long zone_reclaimable_pages(struct zone *zone); + extern bool zone_reclaimable(struct zone *zone); + + /* +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -205,7 +205,8 @@ static unsigned long zone_dirtyable_memo + nr_pages = zone_page_state(zone, NR_FREE_PAGES); + nr_pages -= min(nr_pages, zone->dirty_balance_reserve); + +- nr_pages += zone_reclaimable_pages(zone); ++ nr_pages += zone_page_state(zone, NR_INACTIVE_FILE); ++ nr_pages += zone_page_state(zone, NR_ACTIVE_FILE); + + return nr_pages; + } +@@ -258,7 +259,8 @@ static unsigned long global_dirtyable_me + x = global_page_state(NR_FREE_PAGES); + x -= min(x, dirty_balance_reserve); + +- x += global_reclaimable_pages(); ++ x += global_page_state(NR_INACTIVE_FILE); ++ x += global_page_state(NR_ACTIVE_FILE); + + if (!vm_highmem_is_dirtyable) + x -= highmem_dirtyable_memory(x); +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -147,7 +147,7 @@ static bool global_reclaim(struct scan_c + } + #endif + +-unsigned long zone_reclaimable_pages(struct zone *zone) ++static unsigned long zone_reclaimable_pages(struct zone *zone) + { + int nr; + +@@ -3297,27 +3297,6 @@ void wakeup_kswapd(struct zone *zone, in + wake_up_interruptible(&pgdat->kswapd_wait); + } + +-/* +- * The reclaimable count would be mostly accurate. 
+- * The less reclaimable pages may be +- * - mlocked pages, which will be moved to unevictable list when encountered +- * - mapped pages, which may require several travels to be reclaimed +- * - dirty pages, which is not "instantly" reclaimable +- */ +-unsigned long global_reclaimable_pages(void) +-{ +- int nr; +- +- nr = global_page_state(NR_ACTIVE_FILE) + +- global_page_state(NR_INACTIVE_FILE); +- +- if (get_nr_swap_pages() > 0) +- nr += global_page_state(NR_ACTIVE_ANON) + +- global_page_state(NR_INACTIVE_ANON); +- +- return nr; +-} +- + #ifdef CONFIG_HIBERNATION + /* + * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of diff --git a/queue-3.12/mm-page-writeback.c-fix-dirty_balance_reserve-subtraction-from-dirtyable-memory.patch b/queue-3.12/mm-page-writeback.c-fix-dirty_balance_reserve-subtraction-from-dirtyable-memory.patch new file mode 100644 index 00000000000..7cd2e5055f1 --- /dev/null +++ b/queue-3.12/mm-page-writeback.c-fix-dirty_balance_reserve-subtraction-from-dirtyable-memory.patch @@ -0,0 +1,147 @@ +From a804552b9a15c931cfc2a92a2e0aed1add8b580a Mon Sep 17 00:00:00 2001 +From: Johannes Weiner +Date: Wed, 29 Jan 2014 14:05:39 -0800 +Subject: mm/page-writeback.c: fix dirty_balance_reserve subtraction from dirtyable memory + +From: Johannes Weiner + +commit a804552b9a15c931cfc2a92a2e0aed1add8b580a upstream. + +Tejun reported stuttering and latency spikes on a system where random +tasks would enter direct reclaim and get stuck on dirty pages. Around +50% of memory was occupied by tmpfs backed by an SSD, and another disk +(rotating) was reading and writing at max speed to shrink a partition. + +: The problem was pretty ridiculous. 
It's a 8gig machine w/ one ssd and 10k +: rpm harddrive and I could reliably reproduce constant stuttering every +: several seconds for as long as buffered IO was going on on the hard drive +: either with tmpfs occupying somewhere above 4gig or a test program which +: allocates about the same amount of anon memory. Although swap usage was +: zero, turning off swap also made the problem go away too. +: +: The trigger conditions seem quite plausible - high anon memory usage w/ +: heavy buffered IO and swap configured - and it's highly likely that this +: is happening in the wild too. (this can happen with copying large files +: to usb sticks too, right?) + +This patch (of 2): + +The dirty_balance_reserve is an approximation of the fraction of free +pages that the page allocator does not make available for page cache +allocations. As a result, it has to be taken into account when +calculating the amount of "dirtyable memory", the baseline to which +dirty_background_ratio and dirty_ratio are applied. + +However, currently the reserve is subtracted from the sum of free and +reclaimable pages, which is non-sensical and leads to erroneous results +when the system is dominated by unreclaimable pages and the +dirty_balance_reserve is bigger than free+reclaimable. In that case, at +least the already allocated cache should be considered dirtyable. + +Fix the calculation by subtracting the reserve from the amount of free +pages, then adding the reclaimable pages on top. 
+ +[akpm@linux-foundation.org: fix CONFIG_HIGHMEM build] +Signed-off-by: Johannes Weiner +Reported-by: Tejun Heo +Tested-by: Tejun Heo +Reviewed-by: Rik van Riel +Cc: Mel Gorman +Cc: Wu Fengguang +Reviewed-by: Michal Hocko +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/page-writeback.c | 55 ++++++++++++++++++++++------------------------------ + 1 file changed, 24 insertions(+), 31 deletions(-) + +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -191,6 +191,25 @@ static unsigned long writeout_period_tim + * global dirtyable memory first. + */ + ++/** ++ * zone_dirtyable_memory - number of dirtyable pages in a zone ++ * @zone: the zone ++ * ++ * Returns the zone's number of pages potentially available for dirty ++ * page cache. This is the base value for the per-zone dirty limits. ++ */ ++static unsigned long zone_dirtyable_memory(struct zone *zone) ++{ ++ unsigned long nr_pages; ++ ++ nr_pages = zone_page_state(zone, NR_FREE_PAGES); ++ nr_pages -= min(nr_pages, zone->dirty_balance_reserve); ++ ++ nr_pages += zone_reclaimable_pages(zone); ++ ++ return nr_pages; ++} ++ + static unsigned long highmem_dirtyable_memory(unsigned long total) + { + #ifdef CONFIG_HIGHMEM +@@ -198,11 +217,9 @@ static unsigned long highmem_dirtyable_m + unsigned long x = 0; + + for_each_node_state(node, N_HIGH_MEMORY) { +- struct zone *z = +- &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; ++ struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; + +- x += zone_page_state(z, NR_FREE_PAGES) + +- zone_reclaimable_pages(z) - z->dirty_balance_reserve; ++ x += zone_dirtyable_memory(z); + } + /* + * Unreclaimable memory (kernel memory or anonymous memory +@@ -238,9 +255,11 @@ static unsigned long global_dirtyable_me + { + unsigned long x; + +- x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); ++ x = global_page_state(NR_FREE_PAGES); + x -= min(x, dirty_balance_reserve); + ++ x += global_reclaimable_pages(); ++ + 
if (!vm_highmem_is_dirtyable) + x -= highmem_dirtyable_memory(x); + +@@ -289,32 +308,6 @@ void global_dirty_limits(unsigned long * + } + + /** +- * zone_dirtyable_memory - number of dirtyable pages in a zone +- * @zone: the zone +- * +- * Returns the zone's number of pages potentially available for dirty +- * page cache. This is the base value for the per-zone dirty limits. +- */ +-static unsigned long zone_dirtyable_memory(struct zone *zone) +-{ +- /* +- * The effective global number of dirtyable pages may exclude +- * highmem as a big-picture measure to keep the ratio between +- * dirty memory and lowmem reasonable. +- * +- * But this function is purely about the individual zone and a +- * highmem zone can hold its share of dirty pages, so we don't +- * care about vm_highmem_is_dirtyable here. +- */ +- unsigned long nr_pages = zone_page_state(zone, NR_FREE_PAGES) + +- zone_reclaimable_pages(zone); +- +- /* don't allow this to underflow */ +- nr_pages -= min(nr_pages, zone->dirty_balance_reserve); +- return nr_pages; +-} +- +-/** + * zone_dirty_limit - maximum number of dirty pages allowed in a zone + * @zone: the zone + * diff --git a/queue-3.12/revert-eisa-initialize-device-before-its-resources.patch b/queue-3.12/revert-eisa-initialize-device-before-its-resources.patch new file mode 100644 index 00000000000..6de968b6a61 --- /dev/null +++ b/queue-3.12/revert-eisa-initialize-device-before-its-resources.patch @@ -0,0 +1,96 @@ +From 765ee51f9a3f652959b4c7297d198a28e37952b4 Mon Sep 17 00:00:00 2001 +From: Bjorn Helgaas +Date: Fri, 17 Jan 2014 14:57:29 -0700 +Subject: Revert "EISA: Initialize device before its resources" + +From: Bjorn Helgaas + +commit 765ee51f9a3f652959b4c7297d198a28e37952b4 upstream. + +This reverts commit 26abfeed4341872364386c6a52b9acef8c81a81a. 
+ +In the eisa_probe() force_probe path, if we were unable to request slot +resources (e.g., [io 0x800-0x8ff]), we skipped the slot with "Cannot +allocate resource for EISA slot %d" before reading the EISA signature in +eisa_init_device(). + +Commit 26abfeed4341 moved eisa_init_device() earlier, so we tried to read +the EISA signature before requesting the slot resources, and this caused +hangs during boot. + +Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1251816 +Signed-off-by: Bjorn Helgaas +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/eisa/eisa-bus.c | 26 +++++++++++++++----------- + 1 file changed, 15 insertions(+), 11 deletions(-) + +--- a/drivers/eisa/eisa-bus.c ++++ b/drivers/eisa/eisa-bus.c +@@ -275,11 +275,13 @@ static int __init eisa_request_resources + } + + if (slot) { ++ edev->res[i].name = NULL; + edev->res[i].start = SLOT_ADDRESS(root, slot) + + (i * 0x400); + edev->res[i].end = edev->res[i].start + 0xff; + edev->res[i].flags = IORESOURCE_IO; + } else { ++ edev->res[i].name = NULL; + edev->res[i].start = SLOT_ADDRESS(root, slot) + + EISA_VENDOR_ID_OFFSET; + edev->res[i].end = edev->res[i].start + 3; +@@ -326,19 +328,20 @@ static int __init eisa_probe(struct eisa + return -ENOMEM; + } + +- if (eisa_init_device(root, edev, 0)) { ++ if (eisa_request_resources(root, edev, 0)) { ++ dev_warn(root->dev, ++ "EISA: Cannot allocate resource for mainboard\n"); + kfree(edev); + if (!root->force_probe) +- return -ENODEV; ++ return -EBUSY; + goto force_probe; + } + +- if (eisa_request_resources(root, edev, 0)) { +- dev_warn(root->dev, +- "EISA: Cannot allocate resource for mainboard\n"); ++ if (eisa_init_device(root, edev, 0)) { ++ eisa_release_resources(edev); + kfree(edev); + if (!root->force_probe) +- return -EBUSY; ++ return -ENODEV; + goto force_probe; + } + +@@ -361,11 +364,6 @@ static int __init eisa_probe(struct eisa + continue; + } + +- if (eisa_init_device(root, edev, i)) { +- kfree(edev); +- continue; +- } +- + if 
(eisa_request_resources(root, edev, i)) { + dev_warn(root->dev, + "Cannot allocate resource for EISA slot %d\n", +@@ -373,6 +371,12 @@ static int __init eisa_probe(struct eisa + kfree(edev); + continue; + } ++ ++ if (eisa_init_device(root, edev, i)) { ++ eisa_release_resources(edev); ++ kfree(edev); ++ continue; ++ } + + if (edev->state == (EISA_CONFIG_ENABLED | EISA_CONFIG_FORCED)) + enabled_str = " (forced enabled)"; diff --git a/queue-3.12/selinux-fix-memory-leak-upon-loading-policy.patch b/queue-3.12/selinux-fix-memory-leak-upon-loading-policy.patch new file mode 100644 index 00000000000..373ee438ed0 --- /dev/null +++ b/queue-3.12/selinux-fix-memory-leak-upon-loading-policy.patch @@ -0,0 +1,79 @@ +From 8ed814602876bec9bad2649ca17f34b499357a1c Mon Sep 17 00:00:00 2001 +From: Tetsuo Handa +Date: Mon, 6 Jan 2014 21:28:15 +0900 +Subject: SELinux: Fix memory leak upon loading policy + +From: Tetsuo Handa + +commit 8ed814602876bec9bad2649ca17f34b499357a1c upstream. + +Hello. + +I got below leak with linux-3.10.0-54.0.1.el7.x86_64 . + +[ 681.903890] kmemleak: 5538 new suspected memory leaks (see /sys/kernel/debug/kmemleak) + +Below is a patch, but I don't know whether we need special handling for undoing +ebitmap_set_bit() call. +---------- +>>From fe97527a90fe95e2239dfbaa7558f0ed559c0992 Mon Sep 17 00:00:00 2001 +From: Tetsuo Handa +Date: Mon, 6 Jan 2014 16:30:21 +0900 +Subject: SELinux: Fix memory leak upon loading policy + +Commit 2463c26d "SELinux: put name based create rules in a hashtable" did not +check return value from hashtab_insert() in filename_trans_read(). It leaks +memory if hashtab_insert() returns error. + + unreferenced object 0xffff88005c9160d0 (size 8): + comm "systemd", pid 1, jiffies 4294688674 (age 235.265s) + hex dump (first 8 bytes): + 57 0b 00 00 6b 6b 6b a5 W...kkk.
+ backtrace: + [] kmemleak_alloc+0x4e/0xb0 + [] kmem_cache_alloc_trace+0x12e/0x360 + [] policydb_read+0xd1d/0xf70 + [] security_load_policy+0x6c/0x500 + [] sel_write_load+0xac/0x750 + [] vfs_write+0xc0/0x1f0 + [] SyS_write+0x4c/0xa0 + [] system_call_fastpath+0x16/0x1b + [] 0xffffffffffffffff + +However, we should not return EEXIST error to the caller, or the systemd will +show below message and the boot sequence freezes. + + systemd[1]: Failed to load SELinux policy. Freezing. + +Signed-off-by: Tetsuo Handa +Acked-by: Eric Paris +Signed-off-by: Paul Moore +Signed-off-by: Greg Kroah-Hartman + +--- + security/selinux/ss/policydb.c | 14 +++++++++++++- + 1 file changed, 13 insertions(+), 1 deletion(-) + +--- a/security/selinux/ss/policydb.c ++++ b/security/selinux/ss/policydb.c +@@ -1941,7 +1941,19 @@ static int filename_trans_read(struct po + if (rc) + goto out; + +- hashtab_insert(p->filename_trans, ft, otype); ++ rc = hashtab_insert(p->filename_trans, ft, otype); ++ if (rc) { ++ /* ++ * Do not return -EEXIST to the caller, or the system ++ * will not boot. ++ */ ++ if (rc != -EEXIST) ++ goto out; ++ /* But free memory to avoid memory leak. 
*/ ++ kfree(ft); ++ kfree(name); ++ kfree(otype); ++ } + } + hash_eval(p->filename_trans, "filenametr"); + return 0; diff --git a/queue-3.12/series b/queue-3.12/series new file mode 100644 index 00000000000..0ac9d1de8d6 --- /dev/null +++ b/queue-3.12/series @@ -0,0 +1,16 @@ +selinux-fix-memory-leak-upon-loading-policy.patch +ftrace-have-function-graph-only-trace-based-on-global_ops-filters.patch +ftrace-fix-synchronization-location-disabling-and-freeing-ftrace_ops.patch +tracing-have-trace-buffer-point-back-to-trace_array.patch +tracing-check-if-tracing-is-enabled-in-trace_puts.patch +arch-sh-kernel-kgdb.c-add-missing-include-linux-sched.h.patch +intel-iommu-fix-off-by-one-in-pagetable-freeing.patch +revert-eisa-initialize-device-before-its-resources.patch +fuse-fix-pipe_buf_operations.patch +audit-reset-audit-backlog-wait-time-after-error-recovery.patch +audit-correct-a-type-mismatch-in-audit_syscall_exit.patch +xen-pvhvm-if-xen_platform_pci-0-is-set-don-t-blow-up-v4.patch +mm-memory-failure.c-shift-page-lock-from-head-page-to-tail-page-after-thp-split.patch +mm-memcg-iteration-skip-memcgs-not-yet-fully-initialized.patch +mm-page-writeback.c-fix-dirty_balance_reserve-subtraction-from-dirtyable-memory.patch +mm-page-writeback.c-do-not-count-anon-pages-as-dirtyable-memory.patch diff --git a/queue-3.12/tracing-check-if-tracing-is-enabled-in-trace_puts.patch b/queue-3.12/tracing-check-if-tracing-is-enabled-in-trace_puts.patch new file mode 100644 index 00000000000..6f6114b7860 --- /dev/null +++ b/queue-3.12/tracing-check-if-tracing-is-enabled-in-trace_puts.patch @@ -0,0 +1,44 @@ +From 3132e107d608f8753240d82d61303c500fd515b4 Mon Sep 17 00:00:00 2001 +From: "Steven Rostedt (Red Hat)" +Date: Thu, 23 Jan 2014 12:27:59 -0500 +Subject: tracing: Check if tracing is enabled in trace_puts() + +From: "Steven Rostedt (Red Hat)" + +commit 3132e107d608f8753240d82d61303c500fd515b4 upstream. 
+ +If trace_puts() is used very early in boot up, it can crash the machine +if it is called before the ring buffer is allocated. If a trace_printk() +is used with no arguments, then it will be converted into a trace_puts() +and suffer the same fate. + +Fixes: 09ae72348ecc "tracing: Add trace_puts() for even faster trace_printk() tracing" +Signed-off-by: Steven Rostedt +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/trace/trace.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -435,6 +435,9 @@ int __trace_puts(unsigned long ip, const + unsigned long irq_flags; + int alloc; + ++ if (unlikely(tracing_selftest_running || tracing_disabled)) ++ return 0; ++ + alloc = sizeof(*entry) + size + 2; /* possible \n added */ + + local_save_flags(irq_flags); +@@ -475,6 +478,9 @@ int __trace_bputs(unsigned long ip, cons + unsigned long irq_flags; + int size = sizeof(struct bputs_entry); + ++ if (unlikely(tracing_selftest_running || tracing_disabled)) ++ return 0; ++ + local_save_flags(irq_flags); + buffer = global_trace.trace_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, diff --git a/queue-3.12/tracing-have-trace-buffer-point-back-to-trace_array.patch b/queue-3.12/tracing-have-trace-buffer-point-back-to-trace_array.patch new file mode 100644 index 00000000000..0665f55bf5a --- /dev/null +++ b/queue-3.12/tracing-have-trace-buffer-point-back-to-trace_array.patch @@ -0,0 +1,36 @@ +From dced341b2d4f06668efaab33f88de5d287c0f45b Mon Sep 17 00:00:00 2001 +From: "Steven Rostedt (Red Hat)" +Date: Tue, 14 Jan 2014 10:19:46 -0500 +Subject: tracing: Have trace buffer point back to trace_array + +From: "Steven Rostedt (Red Hat)" + +commit dced341b2d4f06668efaab33f88de5d287c0f45b upstream. + +The trace buffer has a descriptor pointer that goes back to the trace +array. But it was never assigned. Luckily, nothing uses it (yet), but +it will in the future. 
+ +Although nothing currently uses this, if any of the new features get +backported to older kernels, and because this is such a simple change, +I'm marking it for stable too. + +Fixes: 12883efb670c "tracing: Consolidate max_tr into main trace_array structure" +Signed-off-by: Steven Rostedt +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/trace/trace.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -5872,6 +5872,8 @@ allocate_trace_buffer(struct trace_array + + rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; + ++ buf->tr = tr; ++ + buf->buffer = ring_buffer_alloc(size, rb_flags); + if (!buf->buffer) + return -ENOMEM; diff --git a/queue-3.12/xen-pvhvm-if-xen_platform_pci-0-is-set-don-t-blow-up-v4.patch b/queue-3.12/xen-pvhvm-if-xen_platform_pci-0-is-set-don-t-blow-up-v4.patch new file mode 100644 index 00000000000..f1daa06c2be --- /dev/null +++ b/queue-3.12/xen-pvhvm-if-xen_platform_pci-0-is-set-don-t-blow-up-v4.patch @@ -0,0 +1,334 @@ +From 51c71a3bbaca868043cc45b3ad3786dd48a90235 Mon Sep 17 00:00:00 2001 +From: Konrad Rzeszutek Wilk +Date: Tue, 26 Nov 2013 15:05:40 -0500 +Subject: xen/pvhvm: If xen_platform_pci=0 is set don't blow up (v4). + +From: Konrad Rzeszutek Wilk + +commit 51c71a3bbaca868043cc45b3ad3786dd48a90235 upstream. + +The user has the option of disabling the platform driver: +00:02.0 Unassigned class [ff80]: XenSource, Inc. Xen Platform Device (rev 01) + +which is used to unplug the emulated drivers (IDE, Realtek 8169, etc) +and allow the PV drivers to take over. If the user wishes +to disable that they can set: + + xen_platform_pci=0 + (in the guest config file) + +or + xen_emul_unplug=never + (on the Linux command line) + +except it does not work properly. 
The PV drivers still try to +load and since the Xen platform driver is not run - and it +has not initialized the grant tables, most of the PV drivers +stumble upon: + +input: Xen Virtual Keyboard as /devices/virtual/input/input5 +input: Xen Virtual Pointer as /devices/virtual/input/input6M +------------[ cut here ]------------ +kernel BUG at /home/konrad/ssd/konrad/linux/drivers/xen/grant-table.c:1206! +invalid opcode: 0000 [#1] SMP +Modules linked in: xen_kbdfront(+) xenfs xen_privcmd +CPU: 6 PID: 1389 Comm: modprobe Not tainted 3.13.0-rc1upstream-00021-ga6c892b-dirty #1 +Hardware name: Xen HVM domU, BIOS 4.4-unstable 11/26/2013 +RIP: 0010:[] [] get_free_entries+0x2e0/0x300 +Call Trace: + [] ? evdev_connect+0x1e3/0x240 + [] gnttab_grant_foreign_access+0x2e/0x70 + [] xenkbd_connect_backend+0x41/0x290 [xen_kbdfront] + [] xenkbd_probe+0x2f2/0x324 [xen_kbdfront] + [] xenbus_dev_probe+0x77/0x130 + [] xenbus_frontend_dev_probe+0x47/0x50 + [] driver_probe_device+0x89/0x230 + [] __driver_attach+0x9b/0xa0 + [] ? driver_probe_device+0x230/0x230 + [] ? driver_probe_device+0x230/0x230 + [] bus_for_each_dev+0x8c/0xb0 + [] driver_attach+0x19/0x20 + [] bus_add_driver+0x1a0/0x220 + [] driver_register+0x5f/0xf0 + [] xenbus_register_driver_common+0x15/0x20 + [] xenbus_register_frontend+0x23/0x40 + [] ? 0xffffffffa0014fff + [] xenkbd_init+0x2b/0x1000 [xen_kbdfront] + [] do_one_initcall+0x49/0x170 + +.. snip.. + +which is hardly nice. This patch fixes this by having each +PV driver check for: + - if running in PV, then it is fine to execute (as that is their + native environment). + - if running in HVM, check if user wanted 'xen_emul_unplug=never', + in which case bail out and don't load any PV drivers. + - if running in HVM, and if PCI device 5853:0001 (xen_platform_pci) + does not exist, then bail out and not load PV drivers. + - (v2) if running in HVM, and if the user wanted 'xen_emul_unplug=ide-disks', + then bail out for all PV devices _except_ the block one. 
+ Ditto for the network one ('nics'). + - (v2) if running in HVM, and if the user wanted 'xen_emul_unplug=unnecessary' + then load block PV driver, and also setup the legacy IDE paths. + In (v3) make it actually load PV drivers. + +Reported-by: Sander Eikelenboom +Reported-and-Tested-by: Fabio Fantoni +Signed-off-by: Konrad Rzeszutek Wilk +[v2: Add extra logic to handle the myriad ways 'xen_emul_unplug' +can be used per Ian and Stefano suggestion] +[v3: Make the unnecessary case work properly] +[v4: s/disks/ide-disks/ spotted by Fabio] +Reviewed-by: Stefano Stabellini +Acked-by: Bjorn Helgaas [for PCI parts] +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/xen/platform-pci-unplug.c | 74 +++++++++++++++++++++++++++++ + drivers/block/xen-blkfront.c | 4 - + drivers/char/tpm/xen-tpmfront.c | 4 + + drivers/input/misc/xen-kbdfront.c | 4 + + drivers/net/xen-netfront.c | 2 + drivers/pci/xen-pcifront.c | 4 + + drivers/video/xen-fbfront.c | 4 + + drivers/xen/xenbus/xenbus_probe_frontend.c | 2 + include/xen/platform_pci.h | 23 +++++++++ + 9 files changed, 117 insertions(+), 4 deletions(-) + +--- a/arch/x86/xen/platform-pci-unplug.c ++++ b/arch/x86/xen/platform-pci-unplug.c +@@ -69,6 +69,80 @@ static int check_platform_magic(void) + return 0; + } + ++bool xen_has_pv_devices() ++{ ++ if (!xen_domain()) ++ return false; ++ ++ /* PV domains always have them. */ ++ if (xen_pv_domain()) ++ return true; ++ ++ /* And user has xen_platform_pci=0 set in guest config as ++ * driver did not modify the value. */ ++ if (xen_platform_pci_unplug == 0) ++ return false; ++ ++ if (xen_platform_pci_unplug & XEN_UNPLUG_NEVER) ++ return false; ++ ++ if (xen_platform_pci_unplug & XEN_UNPLUG_ALL) ++ return true; ++ ++ /* This is an odd one - we are going to run legacy ++ * and PV drivers at the same time. */ ++ if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) ++ return true; ++ ++ /* And the caller has to follow with xen_pv_{disk,nic}_devices ++ * to be certain which driver can load.
*/ ++ return false; ++} ++EXPORT_SYMBOL_GPL(xen_has_pv_devices); ++ ++static bool __xen_has_pv_device(int state) ++{ ++ /* HVM domains might or might not */ ++ if (xen_hvm_domain() && (xen_platform_pci_unplug & state)) ++ return true; ++ ++ return xen_has_pv_devices(); ++} ++ ++bool xen_has_pv_nic_devices(void) ++{ ++ return __xen_has_pv_device(XEN_UNPLUG_ALL_NICS | XEN_UNPLUG_ALL); ++} ++EXPORT_SYMBOL_GPL(xen_has_pv_nic_devices); ++ ++bool xen_has_pv_disk_devices(void) ++{ ++ return __xen_has_pv_device(XEN_UNPLUG_ALL_IDE_DISKS | ++ XEN_UNPLUG_AUX_IDE_DISKS | XEN_UNPLUG_ALL); ++} ++EXPORT_SYMBOL_GPL(xen_has_pv_disk_devices); ++ ++/* ++ * This one is odd - it determines whether you want to run PV _and_ ++ * legacy (IDE) drivers together. This combination is only possible ++ * under HVM. ++ */ ++bool xen_has_pv_and_legacy_disk_devices(void) ++{ ++ if (!xen_domain()) ++ return false; ++ ++ /* N.B. This is only ever used in HVM mode */ ++ if (xen_pv_domain()) ++ return false; ++ ++ if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) ++ return true; ++ ++ return false; ++} ++EXPORT_SYMBOL_GPL(xen_has_pv_and_legacy_disk_devices); ++ + void xen_unplug_emulated_devices(void) + { + int r; +--- a/drivers/block/xen-blkfront.c ++++ b/drivers/block/xen-blkfront.c +@@ -1278,7 +1278,7 @@ static int blkfront_probe(struct xenbus_ + char *type; + int len; + /* no unplug has been done: do not hook devices != xen vbds */ +- if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) { ++ if (xen_has_pv_and_legacy_disk_devices()) { + int major; + + if (!VDEV_IS_EXTENDED(vdevice)) +@@ -2022,7 +2022,7 @@ static int __init xlblk_init(void) + if (!xen_domain()) + return -ENODEV; + +- if (xen_hvm_domain() && !xen_platform_pci_unplug) ++ if (!xen_has_pv_disk_devices()) + return -ENODEV; + + if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) { +--- a/drivers/char/tpm/xen-tpmfront.c ++++ b/drivers/char/tpm/xen-tpmfront.c +@@ -17,6 +17,7 @@ + #include + #include + #include "tpm.h" ++#include + + struct 
tpm_private { + struct tpm_chip *chip; +@@ -423,6 +424,9 @@ static int __init xen_tpmfront_init(void + if (!xen_domain()) + return -ENODEV; + ++ if (!xen_has_pv_devices()) ++ return -ENODEV; ++ + return xenbus_register_frontend(&tpmfront_driver); + } + module_init(xen_tpmfront_init); +--- a/drivers/input/misc/xen-kbdfront.c ++++ b/drivers/input/misc/xen-kbdfront.c +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + + struct xenkbd_info { + struct input_dev *kbd; +@@ -380,6 +381,9 @@ static int __init xenkbd_init(void) + if (xen_initial_domain()) + return -ENODEV; + ++ if (!xen_has_pv_devices()) ++ return -ENODEV; ++ + return xenbus_register_frontend(&xenkbd_driver); + } + +--- a/drivers/net/xen-netfront.c ++++ b/drivers/net/xen-netfront.c +@@ -2070,7 +2070,7 @@ static int __init netif_init(void) + if (!xen_domain()) + return -ENODEV; + +- if (xen_hvm_domain() && !xen_platform_pci_unplug) ++ if (!xen_has_pv_nic_devices()) + return -ENODEV; + + pr_info("Initialising Xen virtual ethernet driver\n"); +--- a/drivers/pci/xen-pcifront.c ++++ b/drivers/pci/xen-pcifront.c +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + + #include + #define INVALID_GRANT_REF (0) +@@ -1138,6 +1139,9 @@ static int __init pcifront_init(void) + if (!xen_pv_domain() || xen_initial_domain()) + return -ENODEV; + ++ if (!xen_has_pv_devices()) ++ return -ENODEV; ++ + pci_frontend_registrar(1 /* enable */); + + return xenbus_register_frontend(&xenpci_driver); +--- a/drivers/video/xen-fbfront.c ++++ b/drivers/video/xen-fbfront.c +@@ -35,6 +35,7 @@ + #include + #include + #include ++#include + + struct xenfb_info { + unsigned char *fb; +@@ -699,6 +700,9 @@ static int __init xenfb_init(void) + if (xen_initial_domain()) + return -ENODEV; + ++ if (!xen_has_pv_devices()) ++ return -ENODEV; ++ + return xenbus_register_frontend(&xenfb_driver); + } + +--- a/drivers/xen/xenbus/xenbus_probe_frontend.c ++++ b/drivers/xen/xenbus/xenbus_probe_frontend.c +@@ -496,7 +496,7 @@ 
subsys_initcall(xenbus_probe_frontend_in + #ifndef MODULE + static int __init boot_wait_for_devices(void) + { +- if (xen_hvm_domain() && !xen_platform_pci_unplug) ++ if (!xen_has_pv_devices()) + return -ENODEV; + + ready_to_wait_for_devices = 1; +--- a/include/xen/platform_pci.h ++++ b/include/xen/platform_pci.h +@@ -48,4 +48,27 @@ static inline int xen_must_unplug_disks( + + extern int xen_platform_pci_unplug; + ++#if defined(CONFIG_XEN_PVHVM) ++extern bool xen_has_pv_devices(void); ++extern bool xen_has_pv_disk_devices(void); ++extern bool xen_has_pv_nic_devices(void); ++extern bool xen_has_pv_and_legacy_disk_devices(void); ++#else ++static inline bool xen_has_pv_devices(void) ++{ ++ return IS_ENABLED(CONFIG_XEN); ++} ++static inline bool xen_has_pv_disk_devices(void) ++{ ++ return IS_ENABLED(CONFIG_XEN); ++} ++static inline bool xen_has_pv_nic_devices(void) ++{ ++ return IS_ENABLED(CONFIG_XEN); ++} ++static inline bool xen_has_pv_and_legacy_disk_devices(void) ++{ ++ return false; ++} ++#endif + #endif /* _XEN_PLATFORM_PCI_H */