From 8a408374bcc635d07c4fb172aec5c0607d40f802 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 13 Oct 2015 15:22:03 -0700 Subject: [PATCH] 4.1-stable patches added patches: leds-led-class-add-missing-put_device.patch s390-boot-decompression-disable-floating-point-in-decompressor.patch s390-compat-correct-uc_sigmask-of-the-compat-signal-frame.patch sched-core-fix-task_dead-race-in-finish_task_switch.patch use-warn_on_once-for-missing-x86_feature_nrips.patch x86-efi-fix-boot-crash-by-mapping-efi-memmap-entries-bottom-up-at-runtime-instead-of-top-down.patch x86-kexec-fix-kexec-crash-in-syscall-kexec_file_load.patch x86-mm-set-nx-on-gap-between-__ex_table-and-rodata.patch x86-process-add-proper-bound-checks-in-64bit-get_wchan.patch x86-xen-support-kexec-kdump-in-hvm-guests-by-doing-a-soft-reset.patch --- ...eds-led-class-add-missing-put_device.patch | 42 +++++ ...sable-floating-point-in-decompressor.patch | 47 +++++ ...c_sigmask-of-the-compat-signal-frame.patch | 108 +++++++++++ ...task_dead-race-in-finish_task_switch.patch | 98 ++++++++++ queue-4.1/series | 10 + ...n_once-for-missing-x86_feature_nrips.patch | 37 ++++ ...om-up-at-runtime-instead-of-top-down.patch | 173 ++++++++++++++++++ ...xec-crash-in-syscall-kexec_file_load.patch | 103 +++++++++++ ...on-gap-between-__ex_table-and-rodata.patch | 64 +++++++ ...oper-bound-checks-in-64bit-get_wchan.patch | 152 +++++++++++++++ ...-in-hvm-guests-by-doing-a-soft-reset.patch | 105 +++++++++++ 11 files changed, 939 insertions(+) create mode 100644 queue-4.1/leds-led-class-add-missing-put_device.patch create mode 100644 queue-4.1/s390-boot-decompression-disable-floating-point-in-decompressor.patch create mode 100644 queue-4.1/s390-compat-correct-uc_sigmask-of-the-compat-signal-frame.patch create mode 100644 queue-4.1/sched-core-fix-task_dead-race-in-finish_task_switch.patch create mode 100644 queue-4.1/use-warn_on_once-for-missing-x86_feature_nrips.patch create mode 100644 queue-4.1/x86-efi-fix-boot-crash-by-mapping-efi-memmap-entries-bottom-up-at-runtime-instead-of-top-down.patch create mode 100644 queue-4.1/x86-kexec-fix-kexec-crash-in-syscall-kexec_file_load.patch create mode 100644 queue-4.1/x86-mm-set-nx-on-gap-between-__ex_table-and-rodata.patch create mode 100644 queue-4.1/x86-process-add-proper-bound-checks-in-64bit-get_wchan.patch create mode 100644 queue-4.1/x86-xen-support-kexec-kdump-in-hvm-guests-by-doing-a-soft-reset.patch diff --git a/queue-4.1/leds-led-class-add-missing-put_device.patch b/queue-4.1/leds-led-class-add-missing-put_device.patch new file mode 100644 index 00000000000..5de63a0ff11 --- /dev/null +++ b/queue-4.1/leds-led-class-add-missing-put_device.patch @@ -0,0 +1,42 @@ +From e5b5a61fcb3743f1dacf9e20d28f48423cecf0c1 Mon Sep 17 00:00:00 2001 +From: Ricardo Ribalda Delgado +Date: Fri, 31 Jul 2015 13:36:21 +0200 +Subject: leds/led-class: Add missing put_device() + +From: Ricardo Ribalda Delgado + +commit e5b5a61fcb3743f1dacf9e20d28f48423cecf0c1 upstream. + +Devices found by class_find_device must be freed with put_device(). +Otherwise the reference count will not work properly. 
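+
+For background, class_find_device() returns its match with a reference
+held on the embedded struct device, so every successful lookup has to
+be balanced by a put. Minimal sketch of the pattern (illustration only,
+reusing the identifiers from led-class.c):
+
+  struct device *dev;
+
+  dev = class_find_device(leds_class, NULL, name, match_name);
+  if (dev) {
+          /* ... inspect the existing device ... */
+          put_device(dev); /* balance the reference from the lookup */
+  }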
+ +Fixes: a96aa64cb572 ("leds/led-class: Handle LEDs with the same name") +Reported-by: Alan Tull +Signed-off-by: Ricardo Ribalda Delgado +Signed-off-by: Jacek Anaszewski +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/leds/led-class.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +--- a/drivers/leds/led-class.c ++++ b/drivers/leds/led-class.c +@@ -223,12 +223,15 @@ static int led_classdev_next_name(const + { + unsigned int i = 0; + int ret = 0; ++ struct device *dev; + + strlcpy(name, init_name, len); + +- while (class_find_device(leds_class, NULL, name, match_name) && +- (ret < len)) ++ while ((ret < len) && ++ (dev = class_find_device(leds_class, NULL, name, match_name))) { ++ put_device(dev); + ret = snprintf(name, len, "%s_%u", init_name, ++i); ++ } + + if (ret >= len) + return -ENOMEM; diff --git a/queue-4.1/s390-boot-decompression-disable-floating-point-in-decompressor.patch b/queue-4.1/s390-boot-decompression-disable-floating-point-in-decompressor.patch new file mode 100644 index 00000000000..514b1e25f96 --- /dev/null +++ b/queue-4.1/s390-boot-decompression-disable-floating-point-in-decompressor.patch @@ -0,0 +1,47 @@ +From adc0b7fbf6fe9967505c0254d9535ec7288186ae Mon Sep 17 00:00:00 2001 +From: Christian Borntraeger +Date: Mon, 28 Sep 2015 22:47:42 +0200 +Subject: s390/boot/decompression: disable floating point in decompressor + +From: Christian Borntraeger + +commit adc0b7fbf6fe9967505c0254d9535ec7288186ae upstream. + +my gcc 5.1 used an ldgr instruction with a register != 0,2,4,6 for +spilling/filling into a floating point register in our decompressor. + +This will cause an AFP-register data exception as the decompressor +did not setup the additional floating point registers via cr0. +That causes a program check loop that looked like a hang with +one "Uncompressing Linux... " message (directly booted via kvm) +or a loop of "Uncompressing Linux... " messages (when booted via +zipl boot loader). + +The offending code in my build was + + 48e400: e3 c0 af ff ff 71 lay %r12,-1(%r10) +-->48e406: b3 c1 00 1c ldgr %f1,%r12 + 48e40a: ec 6c 01 22 02 7f clij %r6,2,12,0x48e64e + +but gcc could do spilling into an fpr at any function. We can +simply disable floating point support at that early stage. 
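+
+As a quick sanity check after this change, the decompressor can be
+scanned for leftover floating point spills, e.g. (path as in a typical
+build tree, adjust to your setup):
+
+  objdump -d arch/s390/boot/compressed/vmlinux | grep ldgr
+
+which should print nothing once -msoft-float is in effect.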
+ +Signed-off-by: Christian Borntraeger +Acked-by: Heiko Carstens +Signed-off-by: Greg Kroah-Hartman + +--- + arch/s390/boot/compressed/Makefile | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/s390/boot/compressed/Makefile ++++ b/arch/s390/boot/compressed/Makefile +@@ -10,7 +10,7 @@ targets += misc.o piggy.o sizes.h head.o + + KBUILD_CFLAGS := -m64 -D__KERNEL__ $(LINUX_INCLUDE) -O2 + KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING +-KBUILD_CFLAGS += $(cflags-y) -fno-delete-null-pointer-checks ++KBUILD_CFLAGS += $(cflags-y) -fno-delete-null-pointer-checks -msoft-float + KBUILD_CFLAGS += $(call cc-option,-mpacked-stack) + KBUILD_CFLAGS += $(call cc-option,-ffreestanding) + diff --git a/queue-4.1/s390-compat-correct-uc_sigmask-of-the-compat-signal-frame.patch b/queue-4.1/s390-compat-correct-uc_sigmask-of-the-compat-signal-frame.patch new file mode 100644 index 00000000000..2fdc513fa3c --- /dev/null +++ b/queue-4.1/s390-compat-correct-uc_sigmask-of-the-compat-signal-frame.patch @@ -0,0 +1,108 @@ +From 8d4bd0ed0439dfc780aab801a085961925ed6838 Mon Sep 17 00:00:00 2001 +From: Martin Schwidefsky +Date: Tue, 8 Sep 2015 15:25:39 +0200 +Subject: s390/compat: correct uc_sigmask of the compat signal frame + +From: Martin Schwidefsky + +commit 8d4bd0ed0439dfc780aab801a085961925ed6838 upstream. + +The uc_sigmask in the ucontext structure is an array of words to keep +the 64 signal bits (or 1024 if you ask glibc but the kernel sigset_t +only has 64 bits). + +For 64 bit the sigset_t contains a single 8 byte word, but for 31 bit +there are two 4 byte words. The compat signal handler code uses a +simple copy of the 64 bit sigset_t to the 31 bit compat_sigset_t. +As s390 is a big-endian architecture this is incorrect, the two words +in the 31 bit sigset_t array need to be swapped. 
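+
+As a worked example (values made up for illustration), take the 64 bit
+mask 0x00000000deadbeef, i.e. only bits in the low word set:
+
+  set64[0] = 0x00000000deadbeef
+  set32[0] = 0xdeadbeef  (low 32 bits)
+  set32[1] = 0x00000000  (high 32 bits)
+
+A plain 8 byte copy on big-endian s390 would instead land the high
+word 0x00000000 in set32[0] and wipe out the blocked signals.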
+ +Reported-by: Stefan Liebler +Signed-off-by: Martin Schwidefsky +Signed-off-by: Greg Kroah-Hartman + +--- + arch/s390/kernel/compat_signal.c | 27 +++++++++++++++++++++++---- + 1 file changed, 23 insertions(+), 4 deletions(-) + +--- a/arch/s390/kernel/compat_signal.c ++++ b/arch/s390/kernel/compat_signal.c +@@ -48,6 +48,19 @@ typedef struct + struct ucontext32 uc; + } rt_sigframe32; + ++static inline void sigset_to_sigset32(unsigned long *set64, ++ compat_sigset_word *set32) ++{ ++ set32[0] = (compat_sigset_word) set64[0]; ++ set32[1] = (compat_sigset_word)(set64[0] >> 32); ++} ++ ++static inline void sigset32_to_sigset(compat_sigset_word *set32, ++ unsigned long *set64) ++{ ++ set64[0] = (unsigned long) set32[0] | ((unsigned long) set32[1] << 32); ++} ++ + int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) + { + int err; +@@ -303,10 +316,12 @@ COMPAT_SYSCALL_DEFINE0(sigreturn) + { + struct pt_regs *regs = task_pt_regs(current); + sigframe32 __user *frame = (sigframe32 __user *)regs->gprs[15]; ++ compat_sigset_t cset; + sigset_t set; + +- if (__copy_from_user(&set.sig, &frame->sc.oldmask, _SIGMASK_COPY_SIZE32)) ++ if (__copy_from_user(&cset.sig, &frame->sc.oldmask, _SIGMASK_COPY_SIZE32)) + goto badframe; ++ sigset32_to_sigset(cset.sig, set.sig); + set_current_blocked(&set); + if (restore_sigregs32(regs, &frame->sregs)) + goto badframe; +@@ -323,10 +338,12 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn) + { + struct pt_regs *regs = task_pt_regs(current); + rt_sigframe32 __user *frame = (rt_sigframe32 __user *)regs->gprs[15]; ++ compat_sigset_t cset; + sigset_t set; + +- if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) ++ if (__copy_from_user(&cset, &frame->uc.uc_sigmask, sizeof(cset))) + goto badframe; ++ sigset32_to_sigset(cset.sig, set.sig); + set_current_blocked(&set); + if (compat_restore_altstack(&frame->uc.uc_stack)) + goto badframe; +@@ -397,7 +414,7 @@ static int setup_frame32(struct ksignal + return -EFAULT; + + /* Create struct sigcontext32 on the signal stack */ +- memcpy(&sc.oldmask, &set->sig, _SIGMASK_COPY_SIZE32); ++ sigset_to_sigset32(set->sig, sc.oldmask); + sc.sregs = (__u32)(unsigned long __force) &frame->sregs; + if (__copy_to_user(&frame->sc, &sc, sizeof(frame->sc))) + return -EFAULT; +@@ -458,6 +475,7 @@ static int setup_frame32(struct ksignal + static int setup_rt_frame32(struct ksignal *ksig, sigset_t *set, + struct pt_regs *regs) + { ++ compat_sigset_t cset; + rt_sigframe32 __user *frame; + unsigned long restorer; + size_t frame_size; +@@ -505,11 +523,12 @@ static int setup_rt_frame32(struct ksign + store_sigregs(); + + /* Create ucontext on the signal stack. 
*/ ++ sigset_to_sigset32(set->sig, cset.sig); + if (__put_user(uc_flags, &frame->uc.uc_flags) || + __put_user(0, &frame->uc.uc_link) || + __compat_save_altstack(&frame->uc.uc_stack, regs->gprs[15]) || + save_sigregs32(regs, &frame->uc.uc_mcontext) || +- __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)) || ++ __copy_to_user(&frame->uc.uc_sigmask, &cset, sizeof(cset)) || + save_sigregs_ext32(regs, &frame->uc.uc_mcontext_ext)) + return -EFAULT; + diff --git a/queue-4.1/sched-core-fix-task_dead-race-in-finish_task_switch.patch b/queue-4.1/sched-core-fix-task_dead-race-in-finish_task_switch.patch new file mode 100644 index 00000000000..6c3d7f9d554 --- /dev/null +++ b/queue-4.1/sched-core-fix-task_dead-race-in-finish_task_switch.patch @@ -0,0 +1,98 @@ +From 95913d97914f44db2b81271c2e2ebd4d2ac2df83 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra +Date: Tue, 29 Sep 2015 14:45:09 +0200 +Subject: sched/core: Fix TASK_DEAD race in finish_task_switch() + +From: Peter Zijlstra + +commit 95913d97914f44db2b81271c2e2ebd4d2ac2df83 upstream. + +So the problem this patch is trying to address is as follows: + + CPU0 CPU1 + + context_switch(A, B) + ttwu(A) + LOCK A->pi_lock + A->on_cpu == 0 + finish_task_switch(A) + prev_state = A->state <-. + WMB | + A->on_cpu = 0; | + UNLOCK rq0->lock | + | context_switch(C, A) + `-- A->state = TASK_DEAD + prev_state == TASK_DEAD + put_task_struct(A) + context_switch(A, C) + finish_task_switch(A) + A->state == TASK_DEAD + put_task_struct(A) + +The argument being that the WMB will allow the load of A->state on CPU0 +to cross over and observe CPU1's store of A->state, which will then +result in a double-drop and use-after-free. + +Now the comment states (and this was true once upon a long time ago) +that we need to observe A->state while holding rq->lock because that +will order us against the wakeup; however the wakeup will not in fact +acquire (that) rq->lock; it takes A->pi_lock these days. + +We can obviously fix this by upgrading the WMB to an MB, but that is +expensive, so we'd rather avoid that. + +The alternative this patch takes is: smp_store_release(&A->on_cpu, 0), +which avoids the MB on some archs, but not important ones like ARM. + +Reported-by: Oleg Nesterov +Signed-off-by: Peter Zijlstra (Intel) +Acked-by: Linus Torvalds +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: linux-kernel@vger.kernel.org +Cc: manfred@colorfullife.com +Cc: will.deacon@arm.com +Fixes: e4a52bcb9a18 ("sched: Remove rq->lock from the first half of ttwu()") +Link: http://lkml.kernel.org/r/20150929124509.GG3816@twins.programming.kicks-ass.net +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/sched/core.c | 10 +++++----- + kernel/sched/sched.h | 5 +++-- + 2 files changed, 8 insertions(+), 7 deletions(-) + +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -2217,11 +2217,11 @@ static struct rq *finish_task_switch(str + * If a task dies, then it sets TASK_DEAD in tsk->state and calls + * schedule one last time. The schedule call will never return, and + * the scheduled task must drop that reference. +- * The test for TASK_DEAD must occur while the runqueue locks are +- * still held, otherwise prev could be scheduled on another cpu, die +- * there before we look at prev->state, and then the reference would +- * be dropped twice. 
+- * Manfred Spraul ++ * ++ * We must observe prev->state before clearing prev->on_cpu (in ++ * finish_lock_switch), otherwise a concurrent wakeup can get prev ++ * running on another CPU and we could rave with its RUNNING -> DEAD ++ * transition, resulting in a double drop. + */ + prev_state = prev->state; + vtime_task_switch(prev); +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1068,9 +1068,10 @@ static inline void finish_lock_switch(st + * After ->on_cpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. ++ * ++ * Pairs with the control dependency and rmb in try_to_wake_up(). + */ +- smp_wmb(); +- prev->on_cpu = 0; ++ smp_store_release(&prev->on_cpu, 0); + #endif + #ifdef CONFIG_DEBUG_SPINLOCK + /* this is a valid case when another task releases the spinlock */ diff --git a/queue-4.1/series b/queue-4.1/series index 8aa7a042bd0..7f3c593a314 100644 --- a/queue-4.1/series +++ b/queue-4.1/series @@ -40,3 +40,13 @@ x86-alternatives-make-optimize_nops-interrupt-safe-and-synced.patch x86-platform-fix-geode-lx-timekeeping-in-the-generic-x86-build.patch x86-paravirt-replace-the-paravirt-nop-with-a-bona-fide-empty-function.patch x86-nmi-64-fix-a-paravirt-stack-clobbering-bug-in-the-nmi-code.patch +use-warn_on_once-for-missing-x86_feature_nrips.patch +x86-efi-fix-boot-crash-by-mapping-efi-memmap-entries-bottom-up-at-runtime-instead-of-top-down.patch +x86-kexec-fix-kexec-crash-in-syscall-kexec_file_load.patch +x86-process-add-proper-bound-checks-in-64bit-get_wchan.patch +x86-mm-set-nx-on-gap-between-__ex_table-and-rodata.patch +x86-xen-support-kexec-kdump-in-hvm-guests-by-doing-a-soft-reset.patch +leds-led-class-add-missing-put_device.patch +sched-core-fix-task_dead-race-in-finish_task_switch.patch +s390-compat-correct-uc_sigmask-of-the-compat-signal-frame.patch +s390-boot-decompression-disable-floating-point-in-decompressor.patch diff --git a/queue-4.1/use-warn_on_once-for-missing-x86_feature_nrips.patch b/queue-4.1/use-warn_on_once-for-missing-x86_feature_nrips.patch new file mode 100644 index 00000000000..a02cd2769ef --- /dev/null +++ b/queue-4.1/use-warn_on_once-for-missing-x86_feature_nrips.patch @@ -0,0 +1,37 @@ +From d2922422c48df93f3edff7d872ee4f3191fefb08 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Dirk=20M=C3=BCller?= +Date: Thu, 1 Oct 2015 13:43:42 +0200 +Subject: Use WARN_ON_ONCE for missing X86_FEATURE_NRIPS + +From: =?UTF-8?q?Dirk=20M=C3=BCller?= + +commit d2922422c48df93f3edff7d872ee4f3191fefb08 upstream. + +The cpu feature flags are not ever going to change, so warning +everytime can cause a lot of kernel log spam +(in our case more than 10GB/hour). + +The warning seems to only occur when nested virtualization is +enabled, so it's probably triggered by a KVM bug. This is a +sensible and safe change anyway, and the KVM bug fix might not +be suitable for stable releases anyway. 
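+
+For reference, WARN_ON_ONCE() differs from WARN_ON() only in latching
+after the first hit. Simplified sketch of its shape (the real macro in
+include/asm-generic/bug.h adds section annotations and return-value
+plumbing):
+
+  #define WARN_ON_ONCE(condition) ({            \
+          static bool __warned;                 \
+          int __ret = !!(condition);            \
+                                                \
+          if (unlikely(__ret) && !__warned) {   \
+                  __warned = true;              \
+                  WARN_ON(1);                   \
+          }                                     \
+          unlikely(__ret);                      \
+  })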
+ +Signed-off-by: Dirk Mueller +Signed-off-by: Paolo Bonzini +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kvm/svm.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/kvm/svm.c ++++ b/arch/x86/kvm/svm.c +@@ -512,7 +512,7 @@ static void skip_emulated_instruction(st + struct vcpu_svm *svm = to_svm(vcpu); + + if (svm->vmcb->control.next_rip != 0) { +- WARN_ON(!static_cpu_has(X86_FEATURE_NRIPS)); ++ WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS)); + svm->next_rip = svm->vmcb->control.next_rip; + } + diff --git a/queue-4.1/x86-efi-fix-boot-crash-by-mapping-efi-memmap-entries-bottom-up-at-runtime-instead-of-top-down.patch b/queue-4.1/x86-efi-fix-boot-crash-by-mapping-efi-memmap-entries-bottom-up-at-runtime-instead-of-top-down.patch new file mode 100644 index 00000000000..10d6214686d --- /dev/null +++ b/queue-4.1/x86-efi-fix-boot-crash-by-mapping-efi-memmap-entries-bottom-up-at-runtime-instead-of-top-down.patch @@ -0,0 +1,173 @@ +From a5caa209ba9c29c6421292e7879d2387a2ef39c9 Mon Sep 17 00:00:00 2001 +From: Matt Fleming +Date: Fri, 25 Sep 2015 23:02:18 +0100 +Subject: x86/efi: Fix boot crash by mapping EFI memmap entries bottom-up at runtime, instead of top-down + +From: Matt Fleming + +commit a5caa209ba9c29c6421292e7879d2387a2ef39c9 upstream. + +Beginning with UEFI v2.5 EFI_PROPERTIES_TABLE was introduced +that signals that the firmware PE/COFF loader supports splitting +code and data sections of PE/COFF images into separate EFI +memory map entries. This allows the kernel to map those regions +with strict memory protections, e.g. EFI_MEMORY_RO for code, +EFI_MEMORY_XP for data, etc. + +Unfortunately, an unwritten requirement of this new feature is +that the regions need to be mapped with the same offsets +relative to each other as observed in the EFI memory map. If +this is not done crashes like this may occur, + + BUG: unable to handle kernel paging request at fffffffefe6086dd + IP: [] 0xfffffffefe6086dd + Call Trace: + [] efi_call+0x7e/0x100 + [] ? virt_efi_set_variable+0x61/0x90 + [] efi_delete_dummy_variable+0x63/0x70 + [] efi_enter_virtual_mode+0x383/0x392 + [] start_kernel+0x38a/0x417 + [] x86_64_start_reservations+0x2a/0x2c + [] x86_64_start_kernel+0xeb/0xef + +Here 0xfffffffefe6086dd refers to an address the firmware +expects to be mapped but which the OS never claimed was mapped. +The issue is that included in these regions are relative +addresses to other regions which were emitted by the firmware +toolchain before the "splitting" of sections occurred at +runtime. + +Needless to say, we don't satisfy this unwritten requirement on +x86_64 and instead map the EFI memory map entries in reverse +order. The above crash is almost certainly triggerable with any +kernel newer than v3.13 because that's when we rewrote the EFI +runtime region mapping code, in commit d2f7cbe7b26a ("x86/efi: +Runtime services virtual mapping"). For kernel versions before +v3.13 things may work by pure luck depending on the +fragmentation of the kernel virtual address space at the time we +map the EFI regions. + +Instead of mapping the EFI memory map entries in reverse order, +where entry N has a higher virtual address than entry N+1, map +them in the same order as they appear in the EFI memory map to +preserve this relative offset between regions. + +This patch has been kept as small as possible with the intention +that it should be applied aggressively to stable and +distribution kernels. 
It is very much a bugfix rather than +support for a new feature, since when EFI_PROPERTIES_TABLE is +enabled we must map things as outlined above to even boot - we +have no way of asking the firmware not to split the code/data +regions. + +In fact, this patch doesn't even make use of the more strict +memory protections available in UEFI v2.5. That will come later. + +Suggested-by: Ard Biesheuvel +Reported-by: Ard Biesheuvel +Signed-off-by: Matt Fleming +Cc: Borislav Petkov +Cc: Chun-Yi +Cc: Dave Young +Cc: H. Peter Anvin +Cc: James Bottomley +Cc: Lee, Chun-Yi +Cc: Leif Lindholm +Cc: Linus Torvalds +Cc: Matthew Garrett +Cc: Mike Galbraith +Cc: Peter Jones +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: linux-kernel@vger.kernel.org +Link: http://lkml.kernel.org/r/1443218539-7610-2-git-send-email-matt@codeblueprint.co.uk +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/platform/efi/efi.c | 67 +++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 66 insertions(+), 1 deletion(-) + +--- a/arch/x86/platform/efi/efi.c ++++ b/arch/x86/platform/efi/efi.c +@@ -679,6 +679,70 @@ out: + } + + /* ++ * Iterate the EFI memory map in reverse order because the regions ++ * will be mapped top-down. The end result is the same as if we had ++ * mapped things forward, but doesn't require us to change the ++ * existing implementation of efi_map_region(). ++ */ ++static inline void *efi_map_next_entry_reverse(void *entry) ++{ ++ /* Initial call */ ++ if (!entry) ++ return memmap.map_end - memmap.desc_size; ++ ++ entry -= memmap.desc_size; ++ if (entry < memmap.map) ++ return NULL; ++ ++ return entry; ++} ++ ++/* ++ * efi_map_next_entry - Return the next EFI memory map descriptor ++ * @entry: Previous EFI memory map descriptor ++ * ++ * This is a helper function to iterate over the EFI memory map, which ++ * we do in different orders depending on the current configuration. ++ * ++ * To begin traversing the memory map @entry must be %NULL. ++ * ++ * Returns %NULL when we reach the end of the memory map. ++ */ ++static void *efi_map_next_entry(void *entry) ++{ ++ if (!efi_enabled(EFI_OLD_MEMMAP) && efi_enabled(EFI_64BIT)) { ++ /* ++ * Starting in UEFI v2.5 the EFI_PROPERTIES_TABLE ++ * config table feature requires us to map all entries ++ * in the same order as they appear in the EFI memory ++ * map. That is to say, entry N must have a lower ++ * virtual address than entry N+1. This is because the ++ * firmware toolchain leaves relative references in ++ * the code/data sections, which are split and become ++ * separate EFI memory regions. Mapping things ++ * out-of-order leads to the firmware accessing ++ * unmapped addresses. ++ * ++ * Since we need to map things this way whether or not ++ * the kernel actually makes use of ++ * EFI_PROPERTIES_TABLE, let's just switch to this ++ * scheme by default for 64-bit. ++ */ ++ return efi_map_next_entry_reverse(entry); ++ } ++ ++ /* Initial call */ ++ if (!entry) ++ return memmap.map; ++ ++ entry += memmap.desc_size; ++ if (entry >= memmap.map_end) ++ return NULL; ++ ++ return entry; ++} ++ ++/* + * Map the efi memory ranges of the runtime services and update new_mmap with + * virtual addresses. 
+ */ +@@ -688,7 +752,8 @@ static void * __init efi_map_regions(int + unsigned long left = 0; + efi_memory_desc_t *md; + +- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { ++ p = NULL; ++ while ((p = efi_map_next_entry(p))) { + md = p; + if (!(md->attribute & EFI_MEMORY_RUNTIME)) { + #ifdef CONFIG_X86_64 diff --git a/queue-4.1/x86-kexec-fix-kexec-crash-in-syscall-kexec_file_load.patch b/queue-4.1/x86-kexec-fix-kexec-crash-in-syscall-kexec_file_load.patch new file mode 100644 index 00000000000..b6f584f9bc9 --- /dev/null +++ b/queue-4.1/x86-kexec-fix-kexec-crash-in-syscall-kexec_file_load.patch @@ -0,0 +1,103 @@ +From e3c41e37b0f4b18cbd4dac76cbeece5a7558b909 Mon Sep 17 00:00:00 2001 +From: "Lee, Chun-Yi" +Date: Tue, 29 Sep 2015 20:58:57 +0800 +Subject: x86/kexec: Fix kexec crash in syscall kexec_file_load() + +From: "Lee, Chun-Yi" + +commit e3c41e37b0f4b18cbd4dac76cbeece5a7558b909 upstream. + +The original bug is a page fault crash that sometimes happens +on big machines when preparing ELF headers: + + BUG: unable to handle kernel paging request at ffffc90613fc9000 + IP: [] prepare_elf64_ram_headers_callback+0x165/0x260 + +The bug is caused by us under-counting the number of memory ranges +and subsequently not allocating enough ELF header space for them. +The bug is typically masked on smaller systems, because the ELF header +allocation is rounded up to the next page. + +This patch modifies the code in fill_up_crash_elf_data() by using +walk_system_ram_res() instead of walk_system_ram_range() to correctly +count the max number of crash memory ranges. That's because the +walk_system_ram_range() filters out small memory regions that +reside in the same page, but walk_system_ram_res() does not. + +Here's how I found the bug: + +After tracing prepare_elf64_headers() and prepare_elf64_ram_headers_callback(), +the code uses walk_system_ram_res() to fill-in crash memory regions information +to the program header, so it counts those small memory regions that +reside in a page area. + +But, when the kernel was using walk_system_ram_range() in +fill_up_crash_elf_data() to count the number of crash memory regions, +it filters out small regions. + +I printed those small memory regions, for example: + + kexec: Get nr_ram ranges. vaddr=0xffff880077592258 paddr=0x77592258, sz=0xdc0 + +Based on the code in walk_system_ram_range(), this memory region +will be filtered out: + + pfn = (0x77592258 + 0x1000 - 1) >> 12 = 0x77593 + end_pfn = (0x77592258 + 0xfc0 -1 + 1) >> 12 = 0x77593 + end_pfn - pfn = 0x77593 - 0x77593 = 0 <=== if (end_pfn > pfn) is FALSE + +So, the max_nr_ranges that's counted by the kernel doesn't include +small memory regions - causing us to under-allocate the required space. +That causes the page fault crash that happens in a later code path +when preparing ELF headers. + +This bug is not easy to reproduce on small machines that have few +CPUs, because the allocated page aligned ELF buffer has more free +space to cover those small memory regions' PT_LOAD headers. 
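+
+The filtering comes down to the pfn rounding inside
+walk_system_ram_range(). Condensed sketch of the relevant check
+(simplified from kernel/resource.c):
+
+  pfn     = PFN_UP(res.start);      /* round region start up */
+  end_pfn = PFN_DOWN(res.end + 1);  /* round region end down */
+  if (end_pfn > pfn)                /* sub-page regions fail this test */
+          ret = (*func)(pfn, end_pfn - pfn, arg);
+
+walk_system_ram_res() hands the callback the raw [start, end] resource
+instead, so nothing is lost to the rounding.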
+ +Signed-off-by: Lee, Chun-Yi +Cc: Andy Lutomirski +Cc: Baoquan He +Cc: Jiang Liu +Cc: Linus Torvalds +Cc: Mike Galbraith +Cc: Peter Zijlstra +Cc: Stephen Rothwell +Cc: Takashi Iwai +Cc: Thomas Gleixner +Cc: Viresh Kumar +Cc: Vivek Goyal +Cc: kexec@lists.infradead.org +Cc: linux-kernel@vger.kernel.org +Link: http://lkml.kernel.org/r/1443531537-29436-1-git-send-email-jlee@suse.com +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kernel/crash.c | 7 +++---- + 1 file changed, 3 insertions(+), 4 deletions(-) + +--- a/arch/x86/kernel/crash.c ++++ b/arch/x86/kernel/crash.c +@@ -184,10 +184,9 @@ void native_machine_crash_shutdown(struc + } + + #ifdef CONFIG_KEXEC_FILE +-static int get_nr_ram_ranges_callback(unsigned long start_pfn, +- unsigned long nr_pfn, void *arg) ++static int get_nr_ram_ranges_callback(u64 start, u64 end, void *arg) + { +- int *nr_ranges = arg; ++ unsigned int *nr_ranges = arg; + + (*nr_ranges)++; + return 0; +@@ -213,7 +212,7 @@ static void fill_up_crash_elf_data(struc + + ced->image = image; + +- walk_system_ram_range(0, -1, &nr_ranges, ++ walk_system_ram_res(0, -1, &nr_ranges, + get_nr_ram_ranges_callback); + + ced->max_nr_ranges = nr_ranges; diff --git a/queue-4.1/x86-mm-set-nx-on-gap-between-__ex_table-and-rodata.patch b/queue-4.1/x86-mm-set-nx-on-gap-between-__ex_table-and-rodata.patch new file mode 100644 index 00000000000..e8fbd1092a7 --- /dev/null +++ b/queue-4.1/x86-mm-set-nx-on-gap-between-__ex_table-and-rodata.patch @@ -0,0 +1,64 @@ +From ab76f7b4ab2397ffdd2f1eb07c55697d19991d10 Mon Sep 17 00:00:00 2001 +From: Stephen Smalley +Date: Thu, 1 Oct 2015 09:04:22 -0400 +Subject: x86/mm: Set NX on gap between __ex_table and rodata + +From: Stephen Smalley + +commit ab76f7b4ab2397ffdd2f1eb07c55697d19991d10 upstream. + +Unused space between the end of __ex_table and the start of +rodata can be left W+x in the kernel page tables. Extend the +setting of the NX bit to cover this gap by starting from +text_end rather than rodata_start. 
+ + Before: + ---[ High Kernel Mapping ]--- + 0xffffffff80000000-0xffffffff81000000 16M pmd + 0xffffffff81000000-0xffffffff81600000 6M ro PSE GLB x pmd + 0xffffffff81600000-0xffffffff81754000 1360K ro GLB x pte + 0xffffffff81754000-0xffffffff81800000 688K RW GLB x pte + 0xffffffff81800000-0xffffffff81a00000 2M ro PSE GLB NX pmd + 0xffffffff81a00000-0xffffffff81b3b000 1260K ro GLB NX pte + 0xffffffff81b3b000-0xffffffff82000000 4884K RW GLB NX pte + 0xffffffff82000000-0xffffffff82200000 2M RW PSE GLB NX pmd + 0xffffffff82200000-0xffffffffa0000000 478M pmd + + After: + ---[ High Kernel Mapping ]--- + 0xffffffff80000000-0xffffffff81000000 16M pmd + 0xffffffff81000000-0xffffffff81600000 6M ro PSE GLB x pmd + 0xffffffff81600000-0xffffffff81754000 1360K ro GLB x pte + 0xffffffff81754000-0xffffffff81800000 688K RW GLB NX pte + 0xffffffff81800000-0xffffffff81a00000 2M ro PSE GLB NX pmd + 0xffffffff81a00000-0xffffffff81b3b000 1260K ro GLB NX pte + 0xffffffff81b3b000-0xffffffff82000000 4884K RW GLB NX pte + 0xffffffff82000000-0xffffffff82200000 2M RW PSE GLB NX pmd + 0xffffffff82200000-0xffffffffa0000000 478M pmd + +Signed-off-by: Stephen Smalley +Acked-by: Kees Cook +Cc: Linus Torvalds +Cc: Mike Galbraith +Cc: Peter Zijlstra +Cc: Thomas Gleixner +Cc: linux-kernel@vger.kernel.org +Link: http://lkml.kernel.org/r/1443704662-3138-1-git-send-email-sds@tycho.nsa.gov +Signed-off-by: Ingo Molnar +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/mm/init_64.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/mm/init_64.c ++++ b/arch/x86/mm/init_64.c +@@ -1132,7 +1132,7 @@ void mark_rodata_ro(void) + * has been zapped already via cleanup_highmem(). + */ + all_end = roundup((unsigned long)_brk_end, PMD_SIZE); +- set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT); ++ set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT); + + rodata_test(); + diff --git a/queue-4.1/x86-process-add-proper-bound-checks-in-64bit-get_wchan.patch b/queue-4.1/x86-process-add-proper-bound-checks-in-64bit-get_wchan.patch new file mode 100644 index 00000000000..b317e12cd86 --- /dev/null +++ b/queue-4.1/x86-process-add-proper-bound-checks-in-64bit-get_wchan.patch @@ -0,0 +1,152 @@ +From eddd3826a1a0190e5235703d1e666affa4d13b96 Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Wed, 30 Sep 2015 08:38:22 +0000 +Subject: x86/process: Add proper bound checks in 64bit get_wchan() + +From: Thomas Gleixner + +commit eddd3826a1a0190e5235703d1e666affa4d13b96 upstream. + +Dmitry Vyukov reported the following using trinity and the memory +error detector AddressSanitizer +(https://code.google.com/p/address-sanitizer/wiki/AddressSanitizerForKernel). + +[ 124.575597] ERROR: AddressSanitizer: heap-buffer-overflow on +address ffff88002e280000 +[ 124.576801] ffff88002e280000 is located 131938492886538 bytes to +the left of 28857600-byte region [ffffffff81282e0a, ffffffff82e0830a) +[ 124.578633] Accessed by thread T10915: +[ 124.579295] inlined in describe_heap_address +./arch/x86/mm/asan/report.c:164 +[ 124.579295] #0 ffffffff810dd277 in asan_report_error +./arch/x86/mm/asan/report.c:278 +[ 124.580137] #1 ffffffff810dc6a0 in asan_check_region +./arch/x86/mm/asan/asan.c:37 +[ 124.581050] #2 ffffffff810dd423 in __tsan_read8 ??:0 +[ 124.581893] #3 ffffffff8107c093 in get_wchan +./arch/x86/kernel/process_64.c:444 + +The address checks in the 64bit implementation of get_wchan() are +wrong in several ways: + + - The lower bound of the stack is not the start of the stack + page. 
It's the start of the stack page plus sizeof (struct + thread_info) + + - The upper bound must be: + + top_of_stack - TOP_OF_KERNEL_STACK_PADDING - 2 * sizeof(unsigned long). + + The 2 * sizeof(unsigned long) is required because the stack pointer + points at the frame pointer. The layout on the stack is: ... IP FP + ... IP FP. So we need to make sure that both IP and FP are in the + bounds. + +Fix the bound checks and get rid of the mix of numeric constants, u64 +and unsigned long. Making all unsigned long allows us to use the same +function for 32bit as well. + +Use READ_ONCE() when accessing the stack. This does not prevent a +concurrent wakeup of the task and the stack changing, but at least it +avoids TOCTOU. + +Also check task state at the end of the loop. Again that does not +prevent concurrent changes, but it avoids walking for nothing. + +Add proper comments while at it. + +Reported-by: Dmitry Vyukov +Reported-by: Sasha Levin +Based-on-patch-from: Wolfram Gloger +Signed-off-by: Thomas Gleixner +Reviewed-by: Borislav Petkov +Reviewed-by: Dmitry Vyukov +Cc: Andrey Ryabinin +Cc: Andy Lutomirski +Cc: Andrey Konovalov +Cc: Kostya Serebryany +Cc: Alexander Potapenko +Cc: kasan-dev +Cc: Denys Vlasenko +Cc: Andi Kleen +Cc: Wolfram Gloger +Link: http://lkml.kernel.org/r/20150930083302.694788319@linutronix.de +Signed-off-by: Thomas Gleixner +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/kernel/process_64.c | 52 ++++++++++++++++++++++++++++++++++--------- + 1 file changed, 42 insertions(+), 10 deletions(-) + +--- a/arch/x86/kernel/process_64.c ++++ b/arch/x86/kernel/process_64.c +@@ -499,27 +499,59 @@ void set_personality_ia32(bool x32) + } + EXPORT_SYMBOL_GPL(set_personality_ia32); + ++/* ++ * Called from fs/proc with a reference on @p to find the function ++ * which called into schedule(). This needs to be done carefully ++ * because the task might wake up and we might look at a stack ++ * changing under us. ++ */ + unsigned long get_wchan(struct task_struct *p) + { +- unsigned long stack; +- u64 fp, ip; ++ unsigned long start, bottom, top, sp, fp, ip; + int count = 0; + + if (!p || p == current || p->state == TASK_RUNNING) + return 0; +- stack = (unsigned long)task_stack_page(p); +- if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE) ++ ++ start = (unsigned long)task_stack_page(p); ++ if (!start) + return 0; +- fp = *(u64 *)(p->thread.sp); ++ ++ /* ++ * Layout of the stack page: ++ * ++ * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long) ++ * PADDING ++ * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING ++ * stack ++ * ----------- bottom = start + sizeof(thread_info) ++ * thread_info ++ * ----------- start ++ * ++ * The tasks stack pointer points at the location where the ++ * framepointer is stored. The data on the stack is: ++ * ... IP FP ... IP FP ++ * ++ * We need to read FP and IP, so we need to adjust the upper ++ * bound by another unsigned long. 
++ */ ++ top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; ++ top -= 2 * sizeof(unsigned long); ++ bottom = start + sizeof(struct thread_info); ++ ++ sp = READ_ONCE(p->thread.sp); ++ if (sp < bottom || sp > top) ++ return 0; ++ ++ fp = READ_ONCE(*(unsigned long *)sp); + do { +- if (fp < (unsigned long)stack || +- fp >= (unsigned long)stack+THREAD_SIZE) ++ if (fp < bottom || fp > top) + return 0; +- ip = *(u64 *)(fp+8); ++ ip = READ_ONCE(*(unsigned long *)(fp + sizeof(unsigned long))); + if (!in_sched_functions(ip)) + return ip; +- fp = *(u64 *)fp; +- } while (count++ < 16); ++ fp = READ_ONCE(*(unsigned long *)fp); ++ } while (count++ < 16 && p->state != TASK_RUNNING); + return 0; + } + diff --git a/queue-4.1/x86-xen-support-kexec-kdump-in-hvm-guests-by-doing-a-soft-reset.patch b/queue-4.1/x86-xen-support-kexec-kdump-in-hvm-guests-by-doing-a-soft-reset.patch new file mode 100644 index 00000000000..f2b8592a2be --- /dev/null +++ b/queue-4.1/x86-xen-support-kexec-kdump-in-hvm-guests-by-doing-a-soft-reset.patch @@ -0,0 +1,105 @@ +From 0b34a166f291d255755be46e43ed5497cdd194f2 Mon Sep 17 00:00:00 2001 +From: Vitaly Kuznetsov +Date: Fri, 25 Sep 2015 11:59:52 +0200 +Subject: x86/xen: Support kexec/kdump in HVM guests by doing a soft reset + +From: Vitaly Kuznetsov + +commit 0b34a166f291d255755be46e43ed5497cdd194f2 upstream. + +Currently there is a number of issues preventing PVHVM Xen guests from +doing successful kexec/kdump: + + - Bound event channels. + - Registered vcpu_info. + - PIRQ/emuirq mappings. + - shared_info frame after XENMAPSPACE_shared_info operation. + - Active grant mappings. + +Basically, newly booted kernel stumbles upon already set up Xen +interfaces and there is no way to reestablish them. In Xen-4.7 a new +feature called 'soft reset' is coming. A guest performing kexec/kdump +operation is supposed to call SCHEDOP_shutdown hypercall with +SHUTDOWN_soft_reset reason before jumping to new kernel. Hypervisor +(with some help from toolstack) will do full domain cleanup (but +keeping its memory and vCPU contexts intact) returning the guest to +the state it had when it was first booted and thus allowing it to +start over. + +Doing SHUTDOWN_soft_reset on Xen hypervisors which don't support it is +probably OK as by default all unknown shutdown reasons cause domain +destroy with a message in toolstack log: 'Unknown shutdown reason code +5. Destroying domain.' which gives a clue to what the problem is and +eliminates false expectations. 
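+
+The hypercall issued on the kexec path is the ordinary shutdown one,
+just with the new reason code; the existing xen_reboot() helper used
+below essentially reduces to:
+
+  struct sched_shutdown r = { .reason = SHUTDOWN_soft_reset };
+
+  if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
+          BUG();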
+ +Signed-off-by: Vitaly Kuznetsov +Signed-off-by: David Vrabel +Signed-off-by: Greg Kroah-Hartman + +--- + arch/x86/xen/enlighten.c | 23 +++++++++++++++++++++++ + include/xen/interface/sched.h | 8 ++++++++ + 2 files changed, 31 insertions(+) + +--- a/arch/x86/xen/enlighten.c ++++ b/arch/x86/xen/enlighten.c +@@ -33,6 +33,10 @@ + #include + #include + ++#ifdef CONFIG_KEXEC_CORE ++#include ++#endif ++ + #include + #include + #include +@@ -1798,6 +1802,21 @@ static struct notifier_block xen_hvm_cpu + .notifier_call = xen_hvm_cpu_notify, + }; + ++#ifdef CONFIG_KEXEC_CORE ++static void xen_hvm_shutdown(void) ++{ ++ native_machine_shutdown(); ++ if (kexec_in_progress) ++ xen_reboot(SHUTDOWN_soft_reset); ++} ++ ++static void xen_hvm_crash_shutdown(struct pt_regs *regs) ++{ ++ native_machine_crash_shutdown(regs); ++ xen_reboot(SHUTDOWN_soft_reset); ++} ++#endif ++ + static void __init xen_hvm_guest_init(void) + { + if (xen_pv_domain()) +@@ -1817,6 +1836,10 @@ static void __init xen_hvm_guest_init(vo + x86_init.irqs.intr_init = xen_init_IRQ; + xen_hvm_init_time_ops(); + xen_hvm_init_mmu_ops(); ++#ifdef CONFIG_KEXEC_CORE ++ machine_ops.shutdown = xen_hvm_shutdown; ++ machine_ops.crash_shutdown = xen_hvm_crash_shutdown; ++#endif + } + #endif + +--- a/include/xen/interface/sched.h ++++ b/include/xen/interface/sched.h +@@ -107,5 +107,13 @@ struct sched_watchdog { + #define SHUTDOWN_suspend 2 /* Clean up, save suspend info, kill. */ + #define SHUTDOWN_crash 3 /* Tell controller we've crashed. */ + #define SHUTDOWN_watchdog 4 /* Restart because watchdog time expired. */ ++/* ++ * Domain asked to perform 'soft reset' for it. The expected behavior is to ++ * reset internal Xen state for the domain returning it to the point where it ++ * was created but leaving the domain's memory contents and vCPU contexts ++ * intact. This will allow the domain to start over and set up all Xen specific ++ * interfaces again. ++ */ ++#define SHUTDOWN_soft_reset 5 + + #endif /* __XEN_PUBLIC_SCHED_H__ */ -- 2.47.3