From b3071c51e89007177e916ee2dd88b4967cb16f04 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 14 Apr 2013 18:18:17 -0700 Subject: [PATCH] 3.8-stable patches added patches: ftrace-move-ftrace_filter_lseek-out-of-config_dynamic_ftrace-section.patch sched_clock-prevent-64bit-inatomicity-on-32bit-systems.patch udl-handle-edid-failure-properly.patch x86-mm-paravirt-fix-vmalloc_fault-oops-during-lazy-mmu-updates.patch x86-mm-patch-out-arch_flush_lazy_mmu_mode-when-running-on-bare-metal.patch --- ...out-of-config_dynamic_ftrace-section.patch | 92 ++++++++++++ ...t-64bit-inatomicity-on-32bit-systems.patch | 141 ++++++++++++++++++ queue-3.8/series | 5 + .../udl-handle-edid-failure-properly.patch | 31 ++++ ...c_fault-oops-during-lazy-mmu-updates.patch | 88 +++++++++++ ..._mmu_mode-when-running-on-bare-metal.patch | 138 +++++++++++++++++ 6 files changed, 495 insertions(+) create mode 100644 queue-3.8/ftrace-move-ftrace_filter_lseek-out-of-config_dynamic_ftrace-section.patch create mode 100644 queue-3.8/sched_clock-prevent-64bit-inatomicity-on-32bit-systems.patch create mode 100644 queue-3.8/udl-handle-edid-failure-properly.patch create mode 100644 queue-3.8/x86-mm-paravirt-fix-vmalloc_fault-oops-during-lazy-mmu-updates.patch create mode 100644 queue-3.8/x86-mm-patch-out-arch_flush_lazy_mmu_mode-when-running-on-bare-metal.patch diff --git a/queue-3.8/ftrace-move-ftrace_filter_lseek-out-of-config_dynamic_ftrace-section.patch b/queue-3.8/ftrace-move-ftrace_filter_lseek-out-of-config_dynamic_ftrace-section.patch new file mode 100644 index 00000000000..ee06d39a6c4 --- /dev/null +++ b/queue-3.8/ftrace-move-ftrace_filter_lseek-out-of-config_dynamic_ftrace-section.patch @@ -0,0 +1,92 @@ +From 7f49ef69db6bbf756c0abca7e9b65b32e999eec8 Mon Sep 17 00:00:00 2001 +From: "Steven Rostedt (Red Hat)" +Date: Fri, 12 Apr 2013 16:40:13 -0400 +Subject: ftrace: Move ftrace_filter_lseek out of CONFIG_DYNAMIC_FTRACE section + +From: "Steven Rostedt (Red Hat)" + +commit 
7f49ef69db6bbf756c0abca7e9b65b32e999eec8 upstream. + +As ftrace_filter_lseek is now used with ftrace_pid_fops, it needs to +be moved out of the #ifdef CONFIG_DYNAMIC_FTRACE section as the +ftrace_pid_fops is defined when DYNAMIC_FTRACE is not. + +Signed-off-by: Steven Rostedt +Cc: Namhyung Kim +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/ftrace.h | 3 ++- + kernel/trace/ftrace.c | 28 ++++++++++++++-------------- + 2 files changed, 16 insertions(+), 15 deletions(-) + +--- a/include/linux/ftrace.h ++++ b/include/linux/ftrace.h +@@ -394,7 +394,6 @@ ssize_t ftrace_filter_write(struct file + size_t cnt, loff_t *ppos); + ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos); +-loff_t ftrace_filter_lseek(struct file *file, loff_t offset, int whence); + int ftrace_regex_release(struct inode *inode, struct file *file); + + void __init +@@ -567,6 +566,8 @@ static inline int + ftrace_regex_release(struct inode *inode, struct file *file) { return -ENODEV; } + #endif /* CONFIG_DYNAMIC_FTRACE */ + ++loff_t ftrace_filter_lseek(struct file *file, loff_t offset, int whence); ++ + /* totally disable ftrace - can not re-enable after this */ + void ftrace_kill(void); + +--- a/kernel/trace/ftrace.c ++++ b/kernel/trace/ftrace.c +@@ -1027,6 +1027,19 @@ static __init void ftrace_profile_debugf + + static struct pid * const ftrace_swapper_pid = &init_struct_pid; + ++loff_t ++ftrace_filter_lseek(struct file *file, loff_t offset, int whence) ++{ ++ loff_t ret; ++ ++ if (file->f_mode & FMODE_READ) ++ ret = seq_lseek(file, offset, whence); ++ else ++ file->f_pos = ret = 1; ++ ++ return ret; ++} ++ + #ifdef CONFIG_DYNAMIC_FTRACE + + #ifndef CONFIG_FTRACE_MCOUNT_RECORD +@@ -2589,7 +2602,7 @@ static void ftrace_filter_reset(struct f + * routine, you can use ftrace_filter_write() for the write + * routine if @flag has FTRACE_ITER_FILTER set, or + * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. 
+- * ftrace_regex_lseek() should be used as the lseek routine, and ++ * ftrace_filter_lseek() should be used as the lseek routine, and + * release must call ftrace_regex_release(). + */ + int +@@ -2673,19 +2686,6 @@ ftrace_notrace_open(struct inode *inode, + inode, file); + } + +-loff_t +-ftrace_filter_lseek(struct file *file, loff_t offset, int whence) +-{ +- loff_t ret; +- +- if (file->f_mode & FMODE_READ) +- ret = seq_lseek(file, offset, whence); +- else +- file->f_pos = ret = 1; +- +- return ret; +-} +- + static int ftrace_match(char *str, char *regex, int len, int type) + { + int matched = 0; diff --git a/queue-3.8/sched_clock-prevent-64bit-inatomicity-on-32bit-systems.patch b/queue-3.8/sched_clock-prevent-64bit-inatomicity-on-32bit-systems.patch new file mode 100644 index 00000000000..cc2685ffa95 --- /dev/null +++ b/queue-3.8/sched_clock-prevent-64bit-inatomicity-on-32bit-systems.patch @@ -0,0 +1,141 @@ +From a1cbcaa9ea87b87a96b9fc465951dcf36e459ca2 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Sat, 6 Apr 2013 10:10:27 +0200 +Subject: sched_clock: Prevent 64bit inatomicity on 32bit systems + +From: Thomas Gleixner + +commit a1cbcaa9ea87b87a96b9fc465951dcf36e459ca2 upstream. + +The sched_clock_remote() implementation has the following inatomicity +problem on 32bit systems when accessing the remote scd->clock, which +is a 64bit value. + +CPU0 CPU1 + +sched_clock_local() sched_clock_remote(CPU0) +... + remote_clock = scd[CPU0]->clock + read_low32bit(scd[CPU0]->clock) +cmpxchg64(scd->clock,...) + read_high32bit(scd[CPU0]->clock) + +While the update of scd->clock is using an atomic64 mechanism, the +readout on the remote cpu is not, which can cause completely bogus +readouts. + +It is a quite rare problem, because it requires the update to hit the +narrow race window between the low/high readout and the update must go +across the 32bit boundary. 
+ +The resulting misbehaviour is, that CPU1 will see the sched_clock on +CPU0 ~4 seconds ahead of its own and update CPU1's sched_clock value +to this bogus timestamp. This stays that way due to the clamping +implementation for about 4 seconds until the synchronization with +CLOCK_MONOTONIC undoes the problem. + +The issue is hard to observe, because it might only result in a less +accurate SCHED_OTHER timeslicing behaviour. To create observable +damage on realtime scheduling classes, it is necessary that the bogus +update of CPU1 sched_clock happens in the context of a realtime +thread, which then gets charged 4 seconds of RT runtime, which results +in the RT throttler mechanism to trigger and prevent scheduling of RT +tasks for a little less than 4 seconds. So this is quite unlikely as +well. + +The issue was quite hard to decode as the reproduction time is between +2 days and 3 weeks and intrusive tracing makes it less likely, but the +following trace recorded with trace_clock=global, which uses +sched_clock_local(), gave the final hint: + + -0 0d..30 400269.477150: hrtimer_cancel: hrtimer=0xf7061e80 + -0 0d..30 400269.477151: hrtimer_start: hrtimer=0xf7061e80 ... +irq/20-S-587 1d..32 400273.772118: sched_wakeup: comm= ... target_cpu=0 + -0 0dN.30 400273.772118: hrtimer_cancel: hrtimer=0xf7061e80 + +What happens is that CPU0 goes idle and invokes +sched_clock_idle_sleep_event() which invokes sched_clock_local() and +CPU1 runs a remote wakeup for CPU0 at the same time, which invokes +sched_clock_remote(). The time jump gets propagated to CPU0 via +sched_clock_remote() and stays stale on both cores for ~4 seconds. + +There are only two other possibilities, which could cause a stale +sched clock: + +1) ktime_get() which reads out CLOCK_MONOTONIC returns a sporadic + wrong value. + +2) sched_clock() which reads the TSC returns a sporadic wrong value. + +#1 can be excluded because sched_clock would continue to increase for + one jiffy and then go stale. 
+ +#2 can be excluded because it would not make the clock jump + forward. It would just result in a stale sched_clock for one jiffy. + +After quite some brain twisting and finding the same pattern on other +traces, sched_clock_remote() remained the only place which could cause +such a problem and as explained above it's indeed racy on 32bit +systems. + +So while on 64bit systems the readout is atomic, we need to verify the +remote readout on 32bit machines. We need to protect the local->clock +readout in sched_clock_remote() on 32bit as well because an NMI could +hit between the low and the high readout, call sched_clock_local() and +modify local->clock. + +Thanks to Siegfried Wulsch for bearing with my debug requests and +going through the tedious tasks of running a bunch of reproducer +systems to generate the debug information which let me decode the +issue. + +Reported-by: Siegfried Wulsch +Acked-by: Peter Zijlstra +Cc: Steven Rostedt +Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1304051544160.21884@ionos +Signed-off-by: Thomas Gleixner +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/sched/clock.c | 26 ++++++++++++++++++++++++++ + 1 file changed, 26 insertions(+) + +--- a/kernel/sched/clock.c ++++ b/kernel/sched/clock.c +@@ -176,10 +176,36 @@ static u64 sched_clock_remote(struct sch + u64 this_clock, remote_clock; + u64 *ptr, old_val, val; + ++#if BITS_PER_LONG != 64 ++again: ++ /* ++ * Careful here: The local and the remote clock values need to ++ * be read out atomic as we need to compare the values and ++ * then update either the local or the remote side. So the ++ * cmpxchg64 below only protects one readout. ++ * ++ * We must reread via sched_clock_local() in the retry case on ++ * 32bit as an NMI could use sched_clock_local() via the ++ * tracer and hit between the readout of ++ * the low32bit and the high 32bit portion. 
++ */ ++ this_clock = sched_clock_local(my_scd); ++ /* ++ * We must enforce atomic readout on 32bit, otherwise the ++ * update on the remote cpu can hit inbetween the readout of ++ * the low32bit and the high 32bit portion. ++ */ ++ remote_clock = cmpxchg64(&scd->clock, 0, 0); ++#else ++ /* ++ * On 64bit the read of [my]scd->clock is atomic versus the ++ * update, so we can avoid the above 32bit dance. ++ */ + sched_clock_local(my_scd); + again: + this_clock = my_scd->clock; + remote_clock = scd->clock; ++#endif + + /* + * Use the opportunity that we have both locks diff --git a/queue-3.8/series b/queue-3.8/series index 8397448a380..64b4ae53ca2 100644 --- a/queue-3.8/series +++ b/queue-3.8/series @@ -19,3 +19,8 @@ kobject-fix-kset_find_obj-race-with-concurrent-last-kobject_put.patch gpio-fix-wrong-checking-condition-for-gpio-range.patch x86-32-fix-possible-incomplete-tlb-invalidate-with-pae-pagetables.patch tracing-fix-possible-null-pointer-dereferences.patch +udl-handle-edid-failure-properly.patch +ftrace-move-ftrace_filter_lseek-out-of-config_dynamic_ftrace-section.patch +sched_clock-prevent-64bit-inatomicity-on-32bit-systems.patch +x86-mm-paravirt-fix-vmalloc_fault-oops-during-lazy-mmu-updates.patch +x86-mm-patch-out-arch_flush_lazy_mmu_mode-when-running-on-bare-metal.patch diff --git a/queue-3.8/udl-handle-edid-failure-properly.patch b/queue-3.8/udl-handle-edid-failure-properly.patch new file mode 100644 index 00000000000..ebb762b59c1 --- /dev/null +++ b/queue-3.8/udl-handle-edid-failure-properly.patch @@ -0,0 +1,31 @@ +From 1baee58638fc58248625255f5c5fcdb987f11b1f Mon Sep 17 00:00:00 2001 +From: Dave Airlie +Date: Fri, 12 Apr 2013 13:25:20 +1000 +Subject: udl: handle EDID failure properly. + +From: Dave Airlie + +commit 1baee58638fc58248625255f5c5fcdb987f11b1f upstream. + +Don't oops seems proper. 
+ +Signed-off-by: Dave Airlie +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/gpu/drm/udl/udl_connector.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/gpu/drm/udl/udl_connector.c ++++ b/drivers/gpu/drm/udl/udl_connector.c +@@ -61,6 +61,10 @@ static int udl_get_modes(struct drm_conn + int ret; + + edid = (struct edid *)udl_get_edid(udl); ++ if (!edid) { ++ drm_mode_connector_update_edid_property(connector, NULL); ++ return 0; ++ } + + /* + * We only read the main block, but if the monitor reports extension diff --git a/queue-3.8/x86-mm-paravirt-fix-vmalloc_fault-oops-during-lazy-mmu-updates.patch b/queue-3.8/x86-mm-paravirt-fix-vmalloc_fault-oops-during-lazy-mmu-updates.patch new file mode 100644 index 00000000000..206cc639fe2 --- /dev/null +++ b/queue-3.8/x86-mm-paravirt-fix-vmalloc_fault-oops-during-lazy-mmu-updates.patch @@ -0,0 +1,88 @@ +From 1160c2779b826c6f5c08e5cc542de58fd1f667d5 Mon Sep 17 00:00:00 2001 +From: Samu Kallio +Date: Sat, 23 Mar 2013 09:36:35 -0400 +Subject: x86, mm, paravirt: Fix vmalloc_fault oops during lazy MMU updates + +From: Samu Kallio + +commit 1160c2779b826c6f5c08e5cc542de58fd1f667d5 upstream. + +In paravirtualized x86_64 kernels, vmalloc_fault may cause an oops +when lazy MMU updates are enabled, because set_pgd effects are being +deferred. + +One instance of this problem is during process mm cleanup with memory +cgroups enabled. The chain of events is as follows: + +- zap_pte_range enables lazy MMU updates +- zap_pte_range eventually calls mem_cgroup_charge_statistics, + which accesses the vmalloc'd mem_cgroup per-cpu stat area +- vmalloc_fault is triggered which tries to sync the corresponding + PGD entry with set_pgd, but the update is deferred +- vmalloc_fault oopses due to a mismatch in the PUD entries + +The OOPs usually looks as so: + +------------[ cut here ]------------ +kernel BUG at arch/x86/mm/fault.c:396! +invalid opcode: 0000 [#1] SMP +.. snip .. 
+CPU 1 +Pid: 10866, comm: httpd Not tainted 3.6.10-4.fc18.x86_64 #1 +RIP: e030:[] [] vmalloc_fault+0x11f/0x208 +.. snip .. +Call Trace: + [] do_page_fault+0x399/0x4b0 + [] ? xen_mc_extend_args+0xec/0x110 + [] page_fault+0x25/0x30 + [] ? mem_cgroup_charge_statistics.isra.13+0x13/0x50 + [] __mem_cgroup_uncharge_common+0xd8/0x350 + [] mem_cgroup_uncharge_page+0x57/0x60 + [] page_remove_rmap+0xe0/0x150 + [] ? vm_normal_page+0x1a/0x80 + [] unmap_single_vma+0x531/0x870 + [] unmap_vmas+0x52/0xa0 + [] ? pte_mfn_to_pfn+0x72/0x100 + [] exit_mmap+0x98/0x170 + [] ? __raw_callee_save_xen_pmd_val+0x11/0x1e + [] mmput+0x83/0xf0 + [] exit_mm+0x104/0x130 + [] do_exit+0x15a/0x8c0 + [] do_group_exit+0x3f/0xa0 + [] sys_exit_group+0x17/0x20 + [] system_call_fastpath+0x16/0x1b + +Calling arch_flush_lazy_mmu_mode immediately after set_pgd makes the +changes visible to the consistency checks. + +RedHat-Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=914737 +Tested-by: Josh Boyer +Reported-and-Tested-by: Krishna Raman +Signed-off-by: Samu Kallio +Link: http://lkml.kernel.org/r/1364045796-10720-1-git-send-email-konrad.wilk@oracle.com +Tested-by: Konrad Rzeszutek Wilk +Signed-off-by: Konrad Rzeszutek Wilk +Signed-off-by: H. 
Peter Anvin +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/mm/fault.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +--- a/arch/x86/mm/fault.c ++++ b/arch/x86/mm/fault.c +@@ -378,10 +378,12 @@ static noinline __kprobes int vmalloc_fa + if (pgd_none(*pgd_ref)) + return -1; + +- if (pgd_none(*pgd)) ++ if (pgd_none(*pgd)) { + set_pgd(pgd, *pgd_ref); +- else ++ arch_flush_lazy_mmu_mode(); ++ } else { + BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); ++ } + + /* + * Below here mismatches are bugs because these lower tables diff --git a/queue-3.8/x86-mm-patch-out-arch_flush_lazy_mmu_mode-when-running-on-bare-metal.patch b/queue-3.8/x86-mm-patch-out-arch_flush_lazy_mmu_mode-when-running-on-bare-metal.patch new file mode 100644 index 00000000000..dbf0ab92099 --- /dev/null +++ b/queue-3.8/x86-mm-patch-out-arch_flush_lazy_mmu_mode-when-running-on-bare-metal.patch @@ -0,0 +1,138 @@ +From 511ba86e1d386f671084b5d0e6f110bb30b8eeb2 Mon Sep 17 00:00:00 2001 +From: Boris Ostrovsky +Date: Sat, 23 Mar 2013 09:36:36 -0400 +Subject: x86, mm: Patch out arch_flush_lazy_mmu_mode() when running on bare metal + +From: Boris Ostrovsky + +commit 511ba86e1d386f671084b5d0e6f110bb30b8eeb2 upstream. + +Invoking arch_flush_lazy_mmu_mode() results in calls to +preempt_enable()/disable() which may have performance impact. + +Since lazy MMU is not used on bare metal we can patch away +arch_flush_lazy_mmu_mode() so that it is never called in such +environment. + +[ hpa: the previous patch "Fix vmalloc_fault oops during lazy MMU + updates" may cause a minor performance regression on + bare metal. This patch resolves that performance regression. It is + somewhat unclear to me if this is a good -stable candidate. 
] + +Signed-off-by: Boris Ostrovsky +Link: http://lkml.kernel.org/r/1364045796-10720-2-git-send-email-konrad.wilk@oracle.com +Tested-by: Josh Boyer +Tested-by: Konrad Rzeszutek Wilk +Acked-by: Borislav Petkov +Signed-off-by: Konrad Rzeszutek Wilk +Signed-off-by: H. Peter Anvin +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/include/asm/paravirt.h | 5 ++++- + arch/x86/include/asm/paravirt_types.h | 2 ++ + arch/x86/kernel/paravirt.c | 25 +++++++++++++------------ + arch/x86/lguest/boot.c | 1 + + arch/x86/xen/mmu.c | 1 + + 5 files changed, 21 insertions(+), 13 deletions(-) + +--- a/arch/x86/include/asm/paravirt.h ++++ b/arch/x86/include/asm/paravirt.h +@@ -703,7 +703,10 @@ static inline void arch_leave_lazy_mmu_m + PVOP_VCALL0(pv_mmu_ops.lazy_mode.leave); + } + +-void arch_flush_lazy_mmu_mode(void); ++static inline void arch_flush_lazy_mmu_mode(void) ++{ ++ PVOP_VCALL0(pv_mmu_ops.lazy_mode.flush); ++} + + static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, + phys_addr_t phys, pgprot_t flags) +--- a/arch/x86/include/asm/paravirt_types.h ++++ b/arch/x86/include/asm/paravirt_types.h +@@ -91,6 +91,7 @@ struct pv_lazy_ops { + /* Set deferred update mode, used for batching operations. 
*/ + void (*enter)(void); + void (*leave)(void); ++ void (*flush)(void); + }; + + struct pv_time_ops { +@@ -679,6 +680,7 @@ void paravirt_end_context_switch(struct + + void paravirt_enter_lazy_mmu(void); + void paravirt_leave_lazy_mmu(void); ++void paravirt_flush_lazy_mmu(void); + + void _paravirt_nop(void); + u32 _paravirt_ident_32(u32); +--- a/arch/x86/kernel/paravirt.c ++++ b/arch/x86/kernel/paravirt.c +@@ -263,6 +263,18 @@ void paravirt_leave_lazy_mmu(void) + leave_lazy(PARAVIRT_LAZY_MMU); + } + ++void paravirt_flush_lazy_mmu(void) ++{ ++ preempt_disable(); ++ ++ if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { ++ arch_leave_lazy_mmu_mode(); ++ arch_enter_lazy_mmu_mode(); ++ } ++ ++ preempt_enable(); ++} ++ + void paravirt_start_context_switch(struct task_struct *prev) + { + BUG_ON(preemptible()); +@@ -292,18 +304,6 @@ enum paravirt_lazy_mode paravirt_get_laz + return this_cpu_read(paravirt_lazy_mode); + } + +-void arch_flush_lazy_mmu_mode(void) +-{ +- preempt_disable(); +- +- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { +- arch_leave_lazy_mmu_mode(); +- arch_enter_lazy_mmu_mode(); +- } +- +- preempt_enable(); +-} +- + struct pv_info pv_info = { + .name = "bare hardware", + .paravirt_enabled = 0, +@@ -475,6 +475,7 @@ struct pv_mmu_ops pv_mmu_ops = { + .lazy_mode = { + .enter = paravirt_nop, + .leave = paravirt_nop, ++ .flush = paravirt_nop, + }, + + .set_fixmap = native_set_fixmap, +--- a/arch/x86/lguest/boot.c ++++ b/arch/x86/lguest/boot.c +@@ -1333,6 +1333,7 @@ __init void lguest_init(void) + pv_mmu_ops.read_cr3 = lguest_read_cr3; + pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; + pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode; ++ pv_mmu_ops.lazy_mode.flush = paravirt_flush_lazy_mmu; + pv_mmu_ops.pte_update = lguest_pte_update; + pv_mmu_ops.pte_update_defer = lguest_pte_update; + +--- a/arch/x86/xen/mmu.c ++++ b/arch/x86/xen/mmu.c +@@ -2190,6 +2190,7 @@ static const struct pv_mmu_ops xen_mmu_o + .lazy_mode = { + .enter = 
paravirt_enter_lazy_mmu, + .leave = xen_leave_lazy_mmu, ++ .flush = paravirt_flush_lazy_mmu, + }, + + .set_fixmap = xen_set_fixmap, -- 2.47.3