--- /dev/null
+From 95913d97914f44db2b81271c2e2ebd4d2ac2df83 Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Tue, 29 Sep 2015 14:45:09 +0200
+Subject: sched/core: Fix TASK_DEAD race in finish_task_switch()
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+commit 95913d97914f44db2b81271c2e2ebd4d2ac2df83 upstream.
+
+So the problem this patch is trying to address is as follows:
+
+  CPU0                                  CPU1
+
+  context_switch(A, B)
+                                        ttwu(A)
+                                          LOCK A->pi_lock
+                                          A->on_cpu == 0
+  finish_task_switch(A)
+    prev_state = A->state  <-.
+    WMB                      |
+    A->on_cpu = 0;           |
+    UNLOCK rq0->lock         |
+                             |          context_switch(C, A)
+                             `--          A->state = TASK_DEAD
+    prev_state == TASK_DEAD
+      put_task_struct(A)
+                                        context_switch(A, C)
+                                        finish_task_switch(A)
+                                          A->state == TASK_DEAD
+                                            put_task_struct(A)
+
+The argument being that the WMB will allow the load of A->state on CPU0
+to cross over and observe CPU1's store of A->state, which will then
+result in a double-drop and use-after-free.
+
+Now the comment states (and this was true once upon a long time ago)
+that we need to observe A->state while holding rq->lock because that
+will order us against the wakeup; however the wakeup will not in fact
+acquire (that) rq->lock; it takes A->pi_lock these days.
+
+We can obviously fix this by upgrading the WMB to an MB, but that is
+expensive, so we'd rather avoid that.
+
+The alternative this patch takes is: smp_store_release(&A->on_cpu, 0),
+which avoids the MB on some archs, but not important ones like ARM.
+
+Reported-by: Oleg Nesterov <oleg@redhat.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: linux-kernel@vger.kernel.org
+Cc: manfred@colorfullife.com
+Cc: will.deacon@arm.com
+Fixes: e4a52bcb9a18 ("sched: Remove rq->lock from the first half of ttwu()")
+Link: http://lkml.kernel.org/r/20150929124509.GG3816@twins.programming.kicks-ass.net
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/sched/core.c | 10 +++++-----
+ kernel/sched/sched.h | 5 +++--
+ 2 files changed, 8 insertions(+), 7 deletions(-)
+
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -2136,11 +2136,11 @@ static void finish_task_switch(struct rq
+ * If a task dies, then it sets TASK_DEAD in tsk->state and calls
+ * schedule one last time. The schedule call will never return, and
+ * the scheduled task must drop that reference.
+- * The test for TASK_DEAD must occur while the runqueue locks are
+- * still held, otherwise prev could be scheduled on another cpu, die
+- * there before we look at prev->state, and then the reference would
+- * be dropped twice.
+- * Manfred Spraul <manfred@colorfullife.com>
++ *
++ * We must observe prev->state before clearing prev->on_cpu (in
++ * finish_lock_switch), otherwise a concurrent wakeup can get prev
++ * running on another CPU and we could race with its RUNNING -> DEAD
++ * transition, resulting in a double drop.
+ */
+ prev_state = prev->state;
+ vtime_task_switch(prev);
+--- a/kernel/sched/sched.h
++++ b/kernel/sched/sched.h
+@@ -994,9 +994,10 @@ static inline void finish_lock_switch(st
+ * After ->on_cpu is cleared, the task can be moved to a different CPU.
+ * We must ensure this doesn't happen until the switch is completely
+ * finished.
++ *
++ * Pairs with the control dependency and rmb in try_to_wake_up().
+ */
+- smp_wmb();
+- prev->on_cpu = 0;
++ smp_store_release(&prev->on_cpu, 0);
+ #endif
+ #ifdef CONFIG_DEBUG_SPINLOCK
+ /* this is a valid case when another task releases the spinlock */
x86-platform-fix-geode-lx-timekeeping-in-the-generic-x86-build.patch
x86-paravirt-replace-the-paravirt-nop-with-a-bona-fide-empty-function.patch
x86-nmi-64-fix-a-paravirt-stack-clobbering-bug-in-the-nmi-code.patch
+use-warn_on_once-for-missing-x86_feature_nrips.patch
+x86-efi-fix-boot-crash-by-mapping-efi-memmap-entries-bottom-up-at-runtime-instead-of-top-down.patch
+x86-mm-set-nx-on-gap-between-__ex_table-and-rodata.patch
+x86-xen-support-kexec-kdump-in-hvm-guests-by-doing-a-soft-reset.patch
+sched-core-fix-task_dead-race-in-finish_task_switch.patch
--- /dev/null
+From d2922422c48df93f3edff7d872ee4f3191fefb08 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Dirk=20M=C3=BCller?= <dmueller@suse.com>
+Date: Thu, 1 Oct 2015 13:43:42 +0200
+Subject: Use WARN_ON_ONCE for missing X86_FEATURE_NRIPS
+
+From: =?UTF-8?q?Dirk=20M=C3=BCller?= <dmueller@suse.com>
+
+commit d2922422c48df93f3edff7d872ee4f3191fefb08 upstream.
+
+The cpu feature flags are not ever going to change, so warning
+every time can cause a lot of kernel log spam
+(in our case more than 10GB/hour).
+
+The warning seems to only occur when nested virtualization is
+enabled, so it's probably triggered by a KVM bug. This is a
+sensible and safe change anyway, and the KVM bug fix might not
+be suitable for stable releases anyway.
+
+Signed-off-by: Dirk Mueller <dmueller@suse.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/svm.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/svm.c
++++ b/arch/x86/kvm/svm.c
+@@ -496,7 +496,7 @@ static void skip_emulated_instruction(st
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (svm->vmcb->control.next_rip != 0) {
+- WARN_ON(!static_cpu_has(X86_FEATURE_NRIPS));
++ WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
+ svm->next_rip = svm->vmcb->control.next_rip;
+ }
+
--- /dev/null
+From a5caa209ba9c29c6421292e7879d2387a2ef39c9 Mon Sep 17 00:00:00 2001
+From: Matt Fleming <matt.fleming@intel.com>
+Date: Fri, 25 Sep 2015 23:02:18 +0100
+Subject: x86/efi: Fix boot crash by mapping EFI memmap entries bottom-up at runtime, instead of top-down
+
+From: Matt Fleming <matt.fleming@intel.com>
+
+commit a5caa209ba9c29c6421292e7879d2387a2ef39c9 upstream.
+
+Beginning with UEFI v2.5 EFI_PROPERTIES_TABLE was introduced
+that signals that the firmware PE/COFF loader supports splitting
+code and data sections of PE/COFF images into separate EFI
+memory map entries. This allows the kernel to map those regions
+with strict memory protections, e.g. EFI_MEMORY_RO for code,
+EFI_MEMORY_XP for data, etc.
+
+Unfortunately, an unwritten requirement of this new feature is
+that the regions need to be mapped with the same offsets
+relative to each other as observed in the EFI memory map. If
+this is not done, crashes like this may occur:
+
+ BUG: unable to handle kernel paging request at fffffffefe6086dd
+ IP: [<fffffffefe6086dd>] 0xfffffffefe6086dd
+ Call Trace:
+ [<ffffffff8104c90e>] efi_call+0x7e/0x100
+ [<ffffffff81602091>] ? virt_efi_set_variable+0x61/0x90
+ [<ffffffff8104c583>] efi_delete_dummy_variable+0x63/0x70
+ [<ffffffff81f4e4aa>] efi_enter_virtual_mode+0x383/0x392
+ [<ffffffff81f37e1b>] start_kernel+0x38a/0x417
+ [<ffffffff81f37495>] x86_64_start_reservations+0x2a/0x2c
+ [<ffffffff81f37582>] x86_64_start_kernel+0xeb/0xef
+
+Here 0xfffffffefe6086dd refers to an address the firmware
+expects to be mapped but which the OS never claimed was mapped.
+The issue is that included in these regions are relative
+addresses to other regions which were emitted by the firmware
+toolchain before the "splitting" of sections occurred at
+runtime.
+
+Needless to say, we don't satisfy this unwritten requirement on
+x86_64 and instead map the EFI memory map entries in reverse
+order. The above crash is almost certainly triggerable with any
+kernel newer than v3.13 because that's when we rewrote the EFI
+runtime region mapping code, in commit d2f7cbe7b26a ("x86/efi:
+Runtime services virtual mapping"). For kernel versions before
+v3.13 things may work by pure luck depending on the
+fragmentation of the kernel virtual address space at the time we
+map the EFI regions.
+
+Instead of mapping the EFI memory map entries in reverse order,
+where entry N has a higher virtual address than entry N+1, map
+them in the same order as they appear in the EFI memory map to
+preserve this relative offset between regions.
+
+This patch has been kept as small as possible with the intention
+that it should be applied aggressively to stable and
+distribution kernels. It is very much a bugfix rather than
+support for a new feature, since when EFI_PROPERTIES_TABLE is
+enabled we must map things as outlined above to even boot - we
+have no way of asking the firmware not to split the code/data
+regions.
+
+In fact, this patch doesn't even make use of the more strict
+memory protections available in UEFI v2.5. That will come later.
+
+Suggested-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+Reported-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
+Signed-off-by: Matt Fleming <matt.fleming@intel.com>
+Cc: Borislav Petkov <bp@suse.de>
+Cc: Chun-Yi <jlee@suse.com>
+Cc: Dave Young <dyoung@redhat.com>
+Cc: H. Peter Anvin <hpa@zytor.com>
+Cc: James Bottomley <JBottomley@Odin.com>
+Cc: Lee, Chun-Yi <jlee@suse.com>
+Cc: Leif Lindholm <leif.lindholm@linaro.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Matthew Garrett <mjg59@srcf.ucam.org>
+Cc: Mike Galbraith <efault@gmx.de>
+Cc: Peter Jones <pjones@redhat.com>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: linux-kernel@vger.kernel.org
+Link: http://lkml.kernel.org/r/1443218539-7610-2-git-send-email-matt@codeblueprint.co.uk
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/platform/efi/efi.c | 67 +++++++++++++++++++++++++++++++++++++++++++-
+ 1 file changed, 66 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/platform/efi/efi.c
++++ b/arch/x86/platform/efi/efi.c
+@@ -961,6 +961,70 @@ out:
+ }
+
+ /*
++ * Iterate the EFI memory map in reverse order because the regions
++ * will be mapped top-down. The end result is the same as if we had
++ * mapped things forward, but doesn't require us to change the
++ * existing implementation of efi_map_region().
++ */
++static inline void *efi_map_next_entry_reverse(void *entry)
++{
++ /* Initial call */
++ if (!entry)
++ return memmap.map_end - memmap.desc_size;
++
++ entry -= memmap.desc_size;
++ if (entry < memmap.map)
++ return NULL;
++
++ return entry;
++}
++
++/*
++ * efi_map_next_entry - Return the next EFI memory map descriptor
++ * @entry: Previous EFI memory map descriptor
++ *
++ * This is a helper function to iterate over the EFI memory map, which
++ * we do in different orders depending on the current configuration.
++ *
++ * To begin traversing the memory map @entry must be %NULL.
++ *
++ * Returns %NULL when we reach the end of the memory map.
++ */
++static void *efi_map_next_entry(void *entry)
++{
++ if (!efi_enabled(EFI_OLD_MEMMAP) && efi_enabled(EFI_64BIT)) {
++ /*
++ * Starting in UEFI v2.5 the EFI_PROPERTIES_TABLE
++ * config table feature requires us to map all entries
++ * in the same order as they appear in the EFI memory
++ * map. That is to say, entry N must have a lower
++ * virtual address than entry N+1. This is because the
++ * firmware toolchain leaves relative references in
++ * the code/data sections, which are split and become
++ * separate EFI memory regions. Mapping things
++ * out-of-order leads to the firmware accessing
++ * unmapped addresses.
++ *
++ * Since we need to map things this way whether or not
++ * the kernel actually makes use of
++ * EFI_PROPERTIES_TABLE, let's just switch to this
++ * scheme by default for 64-bit.
++ */
++ return efi_map_next_entry_reverse(entry);
++ }
++
++ /* Initial call */
++ if (!entry)
++ return memmap.map;
++
++ entry += memmap.desc_size;
++ if (entry >= memmap.map_end)
++ return NULL;
++
++ return entry;
++}
++
++/*
+ * Map the efi memory ranges of the runtime services and update new_mmap with
+ * virtual addresses.
+ */
+@@ -970,7 +1034,8 @@ static void * __init efi_map_regions(int
+ unsigned long left = 0;
+ efi_memory_desc_t *md;
+
+- for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
++ p = NULL;
++ while ((p = efi_map_next_entry(p))) {
+ md = p;
+ if (!(md->attribute & EFI_MEMORY_RUNTIME)) {
+ #ifdef CONFIG_X86_64
--- /dev/null
+From ab76f7b4ab2397ffdd2f1eb07c55697d19991d10 Mon Sep 17 00:00:00 2001
+From: Stephen Smalley <sds@tycho.nsa.gov>
+Date: Thu, 1 Oct 2015 09:04:22 -0400
+Subject: x86/mm: Set NX on gap between __ex_table and rodata
+
+From: Stephen Smalley <sds@tycho.nsa.gov>
+
+commit ab76f7b4ab2397ffdd2f1eb07c55697d19991d10 upstream.
+
+Unused space between the end of __ex_table and the start of
+rodata can be left W+x in the kernel page tables. Extend the
+setting of the NX bit to cover this gap by starting from
+text_end rather than rodata_start.
+
+ Before:
+ ---[ High Kernel Mapping ]---
+ 0xffffffff80000000-0xffffffff81000000 16M pmd
+ 0xffffffff81000000-0xffffffff81600000 6M ro PSE GLB x pmd
+ 0xffffffff81600000-0xffffffff81754000 1360K ro GLB x pte
+ 0xffffffff81754000-0xffffffff81800000 688K RW GLB x pte
+ 0xffffffff81800000-0xffffffff81a00000 2M ro PSE GLB NX pmd
+ 0xffffffff81a00000-0xffffffff81b3b000 1260K ro GLB NX pte
+ 0xffffffff81b3b000-0xffffffff82000000 4884K RW GLB NX pte
+ 0xffffffff82000000-0xffffffff82200000 2M RW PSE GLB NX pmd
+ 0xffffffff82200000-0xffffffffa0000000 478M pmd
+
+ After:
+ ---[ High Kernel Mapping ]---
+ 0xffffffff80000000-0xffffffff81000000 16M pmd
+ 0xffffffff81000000-0xffffffff81600000 6M ro PSE GLB x pmd
+ 0xffffffff81600000-0xffffffff81754000 1360K ro GLB x pte
+ 0xffffffff81754000-0xffffffff81800000 688K RW GLB NX pte
+ 0xffffffff81800000-0xffffffff81a00000 2M ro PSE GLB NX pmd
+ 0xffffffff81a00000-0xffffffff81b3b000 1260K ro GLB NX pte
+ 0xffffffff81b3b000-0xffffffff82000000 4884K RW GLB NX pte
+ 0xffffffff82000000-0xffffffff82200000 2M RW PSE GLB NX pmd
+ 0xffffffff82200000-0xffffffffa0000000 478M pmd
+
+Signed-off-by: Stephen Smalley <sds@tycho.nsa.gov>
+Acked-by: Kees Cook <keescook@chromium.org>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Mike Galbraith <efault@gmx.de>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: linux-kernel@vger.kernel.org
+Link: http://lkml.kernel.org/r/1443704662-3138-1-git-send-email-sds@tycho.nsa.gov
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/mm/init_64.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/x86/mm/init_64.c
++++ b/arch/x86/mm/init_64.c
+@@ -1131,7 +1131,7 @@ void mark_rodata_ro(void)
+ * has been zapped already via cleanup_highmem().
+ */
+ all_end = roundup((unsigned long)_brk_end, PMD_SIZE);
+- set_memory_nx(rodata_start, (all_end - rodata_start) >> PAGE_SHIFT);
++ set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT);
+
+ rodata_test();
+
--- /dev/null
+From 0b34a166f291d255755be46e43ed5497cdd194f2 Mon Sep 17 00:00:00 2001
+From: Vitaly Kuznetsov <vkuznets@redhat.com>
+Date: Fri, 25 Sep 2015 11:59:52 +0200
+Subject: x86/xen: Support kexec/kdump in HVM guests by doing a soft reset
+
+From: Vitaly Kuznetsov <vkuznets@redhat.com>
+
+commit 0b34a166f291d255755be46e43ed5497cdd194f2 upstream.
+
+Currently there are a number of issues preventing PVHVM Xen guests from
+doing successful kexec/kdump:
+
+ - Bound event channels.
+ - Registered vcpu_info.
+ - PIRQ/emuirq mappings.
+ - shared_info frame after XENMAPSPACE_shared_info operation.
+ - Active grant mappings.
+
+Basically, the newly booted kernel stumbles upon already set up Xen
+interfaces and there is no way to reestablish them. In Xen-4.7 a new
+feature called 'soft reset' is coming. A guest performing kexec/kdump
+operation is supposed to call SCHEDOP_shutdown hypercall with
+SHUTDOWN_soft_reset reason before jumping to new kernel. Hypervisor
+(with some help from toolstack) will do full domain cleanup (but
+keeping its memory and vCPU contexts intact) returning the guest to
+the state it had when it was first booted and thus allowing it to
+start over.
+
+Doing SHUTDOWN_soft_reset on Xen hypervisors which don't support it is
+probably OK as by default all unknown shutdown reasons cause domain
+destroy with a message in toolstack log: 'Unknown shutdown reason code
+5. Destroying domain.' which gives a clue to what the problem is and
+eliminates false expectations.
+
+Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
+Signed-off-by: David Vrabel <david.vrabel@citrix.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/xen/enlighten.c | 23 +++++++++++++++++++++++
+ include/xen/interface/sched.h | 8 ++++++++
+ 2 files changed, 31 insertions(+)
+
+--- a/arch/x86/xen/enlighten.c
++++ b/arch/x86/xen/enlighten.c
+@@ -33,6 +33,10 @@
+ #include <linux/memblock.h>
+ #include <linux/edd.h>
+
++#ifdef CONFIG_KEXEC_CORE
++#include <linux/kexec.h>
++#endif
++
+ #include <xen/xen.h>
+ #include <xen/events.h>
+ #include <xen/interface/xen.h>
+@@ -1844,6 +1848,21 @@ static struct notifier_block xen_hvm_cpu
+ .notifier_call = xen_hvm_cpu_notify,
+ };
+
++#ifdef CONFIG_KEXEC_CORE
++static void xen_hvm_shutdown(void)
++{
++ native_machine_shutdown();
++ if (kexec_in_progress)
++ xen_reboot(SHUTDOWN_soft_reset);
++}
++
++static void xen_hvm_crash_shutdown(struct pt_regs *regs)
++{
++ native_machine_crash_shutdown(regs);
++ xen_reboot(SHUTDOWN_soft_reset);
++}
++#endif
++
+ static void __init xen_hvm_guest_init(void)
+ {
+ init_hvm_pv_info();
+@@ -1860,6 +1879,10 @@ static void __init xen_hvm_guest_init(vo
+ x86_init.irqs.intr_init = xen_init_IRQ;
+ xen_hvm_init_time_ops();
+ xen_hvm_init_mmu_ops();
++#ifdef CONFIG_KEXEC_CORE
++ machine_ops.shutdown = xen_hvm_shutdown;
++ machine_ops.crash_shutdown = xen_hvm_crash_shutdown;
++#endif
+ }
+
+ static uint32_t __init xen_hvm_platform(void)
+--- a/include/xen/interface/sched.h
++++ b/include/xen/interface/sched.h
+@@ -107,5 +107,13 @@ struct sched_watchdog {
+ #define SHUTDOWN_suspend 2 /* Clean up, save suspend info, kill. */
+ #define SHUTDOWN_crash 3 /* Tell controller we've crashed. */
+ #define SHUTDOWN_watchdog 4 /* Restart because watchdog time expired. */
++/*
++ * Domain asked to perform 'soft reset' for it. The expected behavior is to
++ * reset internal Xen state for the domain returning it to the point where it
++ * was created but leaving the domain's memory contents and vCPU contexts
++ * intact. This will allow the domain to start over and set up all Xen specific
++ * interfaces again.
++ */
++#define SHUTDOWN_soft_reset 5
+
+ #endif /* __XEN_PUBLIC_SCHED_H__ */