+++ /dev/null
-From 44889942b6eb356eab27ce25fe10701adfec7776 Mon Sep 17 00:00:00 2001
-From: Ladi Prosek <lprosek@redhat.com>
-Date: Fri, 22 Sep 2017 07:53:15 +0200
-Subject: KVM: nVMX: fix HOST_CR3/HOST_CR4 cache
-
-From: Ladi Prosek <lprosek@redhat.com>
-
-commit 44889942b6eb356eab27ce25fe10701adfec7776 upstream.
-
-For nested virt we maintain multiple VMCS that can run on a vCPU. So it is
-incorrect to keep vmcs_host_cr3 and vmcs_host_cr4, whose purpose is caching
-the value of the rarely changing HOST_CR3 and HOST_CR4 VMCS fields, in
-vCPU-wide data structures.
-
-Hyper-V nested on KVM runs into this consistently for me with PCID enabled.
-CR3 is updated with a new value, unlikely(cr3 != vmx->host_state.vmcs_host_cr3)
-fires, and the currently loaded VMCS is updated. Then we switch from L2 to
-L1 and the next exit reverts CR3 to its old value.
-
-Fixes: d6e41f1151fe ("x86/mm, KVM: Teach KVM's VMX code that CR3 isn't a constant")
-Signed-off-by: Ladi Prosek <lprosek@redhat.com>
-Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- arch/x86/kvm/vmx.c | 16 ++++++++--------
- 1 file changed, 8 insertions(+), 8 deletions(-)
-
---- a/arch/x86/kvm/vmx.c
-+++ b/arch/x86/kvm/vmx.c
-@@ -191,6 +191,8 @@ struct loaded_vmcs {
- struct vmcs *shadow_vmcs;
- int cpu;
- int launched;
-+ unsigned long vmcs_host_cr3; /* May not match real cr3 */
-+ unsigned long vmcs_host_cr4; /* May not match real cr4 */
- struct list_head loaded_vmcss_on_cpu_link;
- };
-
-@@ -573,8 +575,6 @@ struct vcpu_vmx {
- int gs_ldt_reload_needed;
- int fs_reload_needed;
- u64 msr_host_bndcfgs;
-- unsigned long vmcs_host_cr3; /* May not match real cr3 */
-- unsigned long vmcs_host_cr4; /* May not match real cr4 */
- } host_state;
- struct {
- int vm86_active;
-@@ -4871,12 +4871,12 @@ static void vmx_set_constant_host_state(
- */
- cr3 = read_cr3();
- vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
-- vmx->host_state.vmcs_host_cr3 = cr3;
-+ vmx->loaded_vmcs->vmcs_host_cr3 = cr3;
-
- /* Save the most likely value for this task's CR4 in the VMCS. */
- cr4 = cr4_read_shadow();
- vmcs_writel(HOST_CR4, cr4); /* 22.2.3, 22.2.5 */
-- vmx->host_state.vmcs_host_cr4 = cr4;
-+ vmx->loaded_vmcs->vmcs_host_cr4 = cr4;
-
- vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
- #ifdef CONFIG_X86_64
-@@ -8874,15 +8874,15 @@ static void __noclone vmx_vcpu_run(struc
- vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
-
- cr3 = __get_current_cr3_fast();
-- if (unlikely(cr3 != vmx->host_state.vmcs_host_cr3)) {
-+ if (unlikely(cr3 != vmx->loaded_vmcs->vmcs_host_cr3)) {
- vmcs_writel(HOST_CR3, cr3);
-- vmx->host_state.vmcs_host_cr3 = cr3;
-+ vmx->loaded_vmcs->vmcs_host_cr3 = cr3;
- }
-
- cr4 = cr4_read_shadow();
-- if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
-+ if (unlikely(cr4 != vmx->loaded_vmcs->vmcs_host_cr4)) {
- vmcs_writel(HOST_CR4, cr4);
-- vmx->host_state.vmcs_host_cr4 = cr4;
-+ vmx->loaded_vmcs->vmcs_host_cr4 = cr4;
- }
-
- /* When single-stepping over STI and MOV SS, we must clear the
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
-@@ -1201,6 +1201,11 @@ static inline bool cpu_has_vmx_invvpid_g
+@@ -1199,6 +1199,11 @@ static inline bool cpu_has_vmx_invvpid_g
return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
}
static inline bool cpu_has_vmx_ept(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
-@@ -6445,8 +6450,10 @@ static __init int hardware_setup(void)
+@@ -6434,8 +6439,10 @@ static __init int hardware_setup(void)
if (boot_cpu_has(X86_FEATURE_NX))
kvm_enable_efer_bits(EFER_NX);
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
-@@ -3818,6 +3818,12 @@ static void vmx_flush_tlb(struct kvm_vcp
+@@ -3816,6 +3816,12 @@ static void vmx_flush_tlb(struct kvm_vcp
__vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid);
}
static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
{
ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
-@@ -8505,6 +8511,7 @@ static void vmx_set_virtual_x2apic_mode(
+@@ -8494,6 +8500,7 @@ static void vmx_set_virtual_x2apic_mode(
} else {
sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
}
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
-@@ -8530,8 +8537,10 @@ static void vmx_set_apic_access_page_add
+@@ -8519,8 +8526,10 @@ static void vmx_set_apic_access_page_add
*/
if (!is_guest_mode(vcpu) ||
!nested_cpu_has2(get_vmcs12(&vmx->vcpu),
}
static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
-@@ -10110,6 +10119,9 @@ static void prepare_vmcs02(struct kvm_vc
+@@ -10093,6 +10102,9 @@ static void prepare_vmcs02(struct kvm_vc
if (nested_cpu_has_ept(vmcs12)) {
kvm_mmu_unload(vcpu);
nested_ept_init_mmu_context(vcpu);
}
if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
-@@ -10850,6 +10862,10 @@ static void nested_vmx_vmexit(struct kvm
+@@ -10833,6 +10845,10 @@ static void nested_vmx_vmexit(struct kvm
vmx->nested.change_vmcs01_virtual_x2apic_mode = false;
vmx_set_virtual_x2apic_mode(vcpu,
vcpu->arch.apic_base & X2APIC_ENABLE);
+++ /dev/null
-From e73ad5ff2f76da25390e9607cb549691639330c3 Mon Sep 17 00:00:00 2001
-From: Andy Lutomirski <luto@kernel.org>
-Date: Mon, 22 May 2017 15:30:03 -0700
-Subject: mm, x86/mm: Make the batched unmap TLB flush API more generic
-
-From: Andy Lutomirski <luto@kernel.org>
-
-commit e73ad5ff2f76da25390e9607cb549691639330c3 upstream.
-
-try_to_unmap_flush() used to open-code a rather x86-centric flush
-sequence: local_flush_tlb() + flush_tlb_others(). Rearrange the
-code so that the arch (only x86 for now) provides
-arch_tlbbatch_add_mm() and arch_tlbbatch_flush() and the core code
-calls those functions instead.
-
-I'll want this for x86 because, to enable address space ids, I can't
-support the flush_tlb_others() mode used by the existing
-try_to_unmap_flush() implementation with good performance. I can
-support the new API fairly easily, though.
-
-I imagine that other architectures may be in a similar position.
-Architectures with strong remote flush primitives (arm64?) may have
-even worse performance problems with flush_tlb_others() the way that
-try_to_unmap_flush() uses it.
-
-Signed-off-by: Andy Lutomirski <luto@kernel.org>
-Acked-by: Kees Cook <keescook@chromium.org>
-Cc: Andrew Morton <akpm@linux-foundation.org>
-Cc: Borislav Petkov <bpetkov@suse.de>
-Cc: Dave Hansen <dave.hansen@intel.com>
-Cc: Linus Torvalds <torvalds@linux-foundation.org>
-Cc: Mel Gorman <mgorman@suse.de>
-Cc: Michal Hocko <mhocko@suse.com>
-Cc: Nadav Amit <nadav.amit@gmail.com>
-Cc: Nadav Amit <namit@vmware.com>
-Cc: Peter Zijlstra <peterz@infradead.org>
-Cc: Rik van Riel <riel@redhat.com>
-Cc: Sasha Levin <sasha.levin@oracle.com>
-Cc: Thomas Gleixner <tglx@linutronix.de>
-Cc: linux-mm@kvack.org
-Link: http://lkml.kernel.org/r/19f25a8581f9fb77876b7ff3b001f89835e34ea3.1495492063.git.luto@kernel.org
-Signed-off-by: Ingo Molnar <mingo@kernel.org>
-Signed-off-by: Eduardo Valentin <eduval@amazon.com>
-Signed-off-by: Eduardo Valentin <edubezval@gmail.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- arch/x86/include/asm/tlbbatch.h | 16 ++++++++++++++++
- arch/x86/include/asm/tlbflush.h | 8 ++++++++
- arch/x86/mm/tlb.c | 17 +++++++++++++++++
- include/linux/sched.h | 15 +++++++++++----
- mm/rmap.c | 16 ++--------------
- 5 files changed, 54 insertions(+), 18 deletions(-)
- create mode 100644 arch/x86/include/asm/tlbbatch.h
-
---- /dev/null
-+++ b/arch/x86/include/asm/tlbbatch.h
-@@ -0,0 +1,16 @@
-+#ifndef _ARCH_X86_TLBBATCH_H
-+#define _ARCH_X86_TLBBATCH_H
-+
-+#include <linux/cpumask.h>
-+
-+#ifdef CONFIG_SMP
-+struct arch_tlbflush_unmap_batch {
-+ /*
-+ * Each bit set is a CPU that potentially has a TLB entry for one of
-+ * the PFNs being flushed.
-+ */
-+ struct cpumask cpumask;
-+};
-+#endif
-+
-+#endif /* _ARCH_X86_TLBBATCH_H */
---- a/arch/x86/include/asm/tlbflush.h
-+++ b/arch/x86/include/asm/tlbflush.h
-@@ -327,6 +327,14 @@ static inline void reset_lazy_tlbstate(v
- this_cpu_write(cpu_tlbstate.active_mm, &init_mm);
- }
-
-+static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
-+ struct mm_struct *mm)
-+{
-+ cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
-+}
-+
-+extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
-+
- #endif /* SMP */
-
- #ifndef CONFIG_PARAVIRT
---- a/arch/x86/mm/tlb.c
-+++ b/arch/x86/mm/tlb.c
-@@ -410,6 +410,23 @@ void flush_tlb_kernel_range(unsigned lon
- }
- }
-
-+void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
-+{
-+ int cpu = get_cpu();
-+
-+ if (cpumask_test_cpu(cpu, &batch->cpumask)) {
-+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
-+ local_flush_tlb();
-+ trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
-+ }
-+
-+ if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
-+ flush_tlb_others(&batch->cpumask, NULL, 0, TLB_FLUSH_ALL);
-+ cpumask_clear(&batch->cpumask);
-+
-+ put_cpu();
-+}
-+
- static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
- size_t count, loff_t *ppos)
- {
---- a/include/linux/sched.h
-+++ b/include/linux/sched.h
-@@ -1463,15 +1463,22 @@ enum perf_event_task_context {
- perf_nr_task_contexts,
- };
-
-+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
-+#include <asm/tlbbatch.h>
-+#endif
-+
- /* Track pages that require TLB flushes */
- struct tlbflush_unmap_batch {
- /*
-- * Each bit set is a CPU that potentially has a TLB entry for one of
-- * the PFNs being flushed. See set_tlb_ubc_flush_pending().
-+ * The arch code makes the following promise: generic code can modify a
-+ * PTE, then call arch_tlbbatch_add_mm() (which internally provides all
-+ * needed barriers), then call arch_tlbbatch_flush(), and the entries
-+ * will be flushed on all CPUs by the time that arch_tlbbatch_flush()
-+ * returns.
- */
-- struct cpumask cpumask;
-+ struct arch_tlbflush_unmap_batch arch;
-
-- /* True if any bit in cpumask is set */
-+ /* True if a flush is needed. */
- bool flush_required;
-
- /*
---- a/mm/rmap.c
-+++ b/mm/rmap.c
-@@ -578,25 +578,13 @@ void page_unlock_anon_vma_read(struct an
- void try_to_unmap_flush(void)
- {
- struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc;
-- int cpu;
-
- if (!tlb_ubc->flush_required)
- return;
-
-- cpu = get_cpu();
--
-- if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) {
-- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
-- local_flush_tlb();
-- trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
-- }
--
-- if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids)
-- flush_tlb_others(&tlb_ubc->cpumask, NULL, 0, TLB_FLUSH_ALL);
-- cpumask_clear(&tlb_ubc->cpumask);
-+ arch_tlbbatch_flush(&tlb_ubc->arch);
- tlb_ubc->flush_required = false;
- tlb_ubc->writable = false;
-- put_cpu();
- }
-
- /* Flush iff there are potentially writable TLB entries that can race with IO */
-@@ -613,7 +601,7 @@ static void set_tlb_ubc_flush_pending(st
- {
- struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc;
-
-- cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm));
-+ arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
- tlb_ubc->flush_required = true;
-
- /*
cxl-check-if-vphb-exists-before-iterating-over-afu-devices.patch
arm64-initialise-high_memory-global-variable-earlier.patch
-x86-entry-unwind-create-stack-frames-for-saved-interrupt-registers.patch
-x86-mm-reimplement-flush_tlb_page-using-flush_tlb_mm_range.patch
-x86-mm-reduce-indentation-in-flush_tlb_func.patch
-mm-x86-mm-make-the-batched-unmap-tlb-flush-api-more-generic.patch
-x86-mm-pass-flush_tlb_info-to-flush_tlb_others-etc.patch
-x86-mm-change-the-leave_mm-condition-for-local-tlb-flushes.patch
-x86-mm-refactor-flush_tlb_mm_range-to-merge-local-and-remote-cases.patch
-x86-mm-use-new-merged-flush-logic-in-arch_tlbbatch_flush.patch
-x86-mm-remove-the-up-asm-tlbflush.h-code-always-use-the-formerly-smp-code.patch
-x86-mm-rework-lazy-tlb-to-track-the-actual-loaded-mm.patch
-x86-mm-be-more-consistent-wrt-page_shift-vs-page_size-in-tlb-flush-code.patch
-x86-mm-kvm-teach-kvm-s-vmx-code-that-cr3-isn-t-a-constant.patch
-kvm-nvmx-fix-host_cr3-host_cr4-cache.patch
alsa-hda-add-support-for-docking-station-for-hp-820-g2.patch
alsa-hda-add-support-for-docking-station-for-hp-840-g3.patch
kvm-fix-usage-of-uninit-spinlock-in-avic_vm_destroy.patch
tcp-fix-under-evaluated-ssthresh-in-tcp-vegas.patch
rtc-set-the-alarm-to-the-next-expiring-timer.patch
cpuidle-fix-broadcast-control-when-broadcast-can-not-be-entered.patch
-x86-kvm-vmx-simplify-segment_base.patch
-x86-unify-tss_struct.patch
-x86-kvm-vmx-defer-tr-reload-after-vm-exit.patch
-x86-kvm-vmx-remove-unused-variable-in-segment_base.patch
thermal-hisilicon-handle-return-value-of-clk_prepare_enable.patch
thermal-drivers-hisi-fix-missing-interrupt-enablement.patch
thermal-drivers-hisi-fix-kernel-panic-on-alarm-interrupt.patch
+++ /dev/null
-From 946c191161cef10c667b5ee3179db1714fa5b7c0 Mon Sep 17 00:00:00 2001
-From: Josh Poimboeuf <jpoimboe@redhat.com>
-Date: Thu, 20 Oct 2016 11:34:40 -0500
-Subject: x86/entry/unwind: Create stack frames for saved interrupt registers
-
-From: Josh Poimboeuf <jpoimboe@redhat.com>
-
-commit 946c191161cef10c667b5ee3179db1714fa5b7c0 upstream.
-
-With frame pointers, when a task is interrupted, its stack is no longer
-completely reliable because the function could have been interrupted
-before it had a chance to save the previous frame pointer on the stack.
-So the caller of the interrupted function could get skipped by a stack
-trace.
-
-This is problematic for live patching, which needs to know whether a
-stack trace of a sleeping task can be relied upon. There's currently no
-way to detect if a sleeping task was interrupted by a page fault
-exception or preemption before it went to sleep.
-
-Another issue is that when dumping the stack of an interrupted task, the
-unwinder has no way of knowing where the saved pt_regs registers are, so
-it can't print them.
-
-This solves those issues by encoding the pt_regs pointer in the frame
-pointer on entry from an interrupt or an exception.
-
-This patch also updates the unwinder to be able to decode it, because
-otherwise the unwinder would be broken by this change.
-
-Note that this causes a change in the behavior of the unwinder: each
-instance of a pt_regs on the stack is now considered a "frame". So
-callers of unwind_get_return_address() will now get an occasional
-'regs->ip' address that would have previously been skipped over.
-
-Suggested-by: Andy Lutomirski <luto@amacapital.net>
-Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
-Cc: Andy Lutomirski <luto@kernel.org>
-Cc: Borislav Petkov <bp@alien8.de>
-Cc: Brian Gerst <brgerst@gmail.com>
-Cc: Denys Vlasenko <dvlasenk@redhat.com>
-Cc: H. Peter Anvin <hpa@zytor.com>
-Cc: Linus Torvalds <torvalds@linux-foundation.org>
-Cc: Peter Zijlstra <peterz@infradead.org>
-Cc: Thomas Gleixner <tglx@linutronix.de>
-Link: http://lkml.kernel.org/r/8b9f84a21e39d249049e0547b559ff8da0df0988.1476973742.git.jpoimboe@redhat.com
-Signed-off-by: Ingo Molnar <mingo@kernel.org>
-Signed-off-by: Eduardo Valentin <eduval@amazon.com>
-Signed-off-by: Eduardo Valentin <edubezval@gmail.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- arch/x86/entry/calling.h | 20 ++++++++++
- arch/x86/entry/entry_32.S | 33 +++++++++++++++--
- arch/x86/entry/entry_64.S | 10 +++--
- arch/x86/include/asm/unwind.h | 16 ++++++++
- arch/x86/kernel/unwind_frame.c | 76 ++++++++++++++++++++++++++++++++++++-----
- 5 files changed, 139 insertions(+), 16 deletions(-)
-
---- a/arch/x86/entry/calling.h
-+++ b/arch/x86/entry/calling.h
-@@ -201,6 +201,26 @@ For 32-bit we have the following convent
- .byte 0xf1
- .endm
-
-+/*
-+ * This is a sneaky trick to help the unwinder find pt_regs on the stack. The
-+ * frame pointer is replaced with an encoded pointer to pt_regs. The encoding
-+ * is just setting the LSB, which makes it an invalid stack address and is also
-+ * a signal to the unwinder that it's a pt_regs pointer in disguise.
-+ *
-+ * NOTE: This macro must be used *after* SAVE_EXTRA_REGS because it corrupts
-+ * the original rbp.
-+ */
-+.macro ENCODE_FRAME_POINTER ptregs_offset=0
-+#ifdef CONFIG_FRAME_POINTER
-+ .if \ptregs_offset
-+ leaq \ptregs_offset(%rsp), %rbp
-+ .else
-+ mov %rsp, %rbp
-+ .endif
-+ orq $0x1, %rbp
-+#endif
-+.endm
-+
- #endif /* CONFIG_X86_64 */
-
- /*
---- a/arch/x86/entry/entry_32.S
-+++ b/arch/x86/entry/entry_32.S
-@@ -175,6 +175,22 @@
- SET_KERNEL_GS %edx
- .endm
-
-+/*
-+ * This is a sneaky trick to help the unwinder find pt_regs on the stack. The
-+ * frame pointer is replaced with an encoded pointer to pt_regs. The encoding
-+ * is just setting the LSB, which makes it an invalid stack address and is also
-+ * a signal to the unwinder that it's a pt_regs pointer in disguise.
-+ *
-+ * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the
-+ * original rbp.
-+ */
-+.macro ENCODE_FRAME_POINTER
-+#ifdef CONFIG_FRAME_POINTER
-+ mov %esp, %ebp
-+ orl $0x1, %ebp
-+#endif
-+.endm
-+
- .macro RESTORE_INT_REGS
- popl %ebx
- popl %ecx
-@@ -624,6 +640,7 @@ common_interrupt:
- ASM_CLAC
- addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */
- SAVE_ALL
-+ ENCODE_FRAME_POINTER
- TRACE_IRQS_OFF
- movl %esp, %eax
- call do_IRQ
-@@ -635,6 +652,7 @@ ENTRY(name) \
- ASM_CLAC; \
- pushl $~(nr); \
- SAVE_ALL; \
-+ ENCODE_FRAME_POINTER; \
- TRACE_IRQS_OFF \
- movl %esp, %eax; \
- call fn; \
-@@ -769,6 +787,7 @@ END(spurious_interrupt_bug)
- ENTRY(xen_hypervisor_callback)
- pushl $-1 /* orig_ax = -1 => not a system call */
- SAVE_ALL
-+ ENCODE_FRAME_POINTER
- TRACE_IRQS_OFF
-
- /*
-@@ -823,6 +842,7 @@ ENTRY(xen_failsafe_callback)
- jmp iret_exc
- 5: pushl $-1 /* orig_ax = -1 => not a system call */
- SAVE_ALL
-+ ENCODE_FRAME_POINTER
- jmp ret_from_exception
-
- .section .fixup, "ax"
-@@ -1047,6 +1067,7 @@ error_code:
- pushl %edx
- pushl %ecx
- pushl %ebx
-+ ENCODE_FRAME_POINTER
- cld
- movl $(__KERNEL_PERCPU), %ecx
- movl %ecx, %fs
-@@ -1079,6 +1100,7 @@ ENTRY(debug)
- ASM_CLAC
- pushl $-1 # mark this as an int
- SAVE_ALL
-+ ENCODE_FRAME_POINTER
- xorl %edx, %edx # error code 0
- movl %esp, %eax # pt_regs pointer
-
-@@ -1094,11 +1116,11 @@ ENTRY(debug)
-
- .Ldebug_from_sysenter_stack:
- /* We're on the SYSENTER stack. Switch off. */
-- movl %esp, %ebp
-+ movl %esp, %ebx
- movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
- TRACE_IRQS_OFF
- call do_debug
-- movl %ebp, %esp
-+ movl %ebx, %esp
- jmp ret_from_exception
- END(debug)
-
-@@ -1121,6 +1143,7 @@ ENTRY(nmi)
-
- pushl %eax # pt_regs->orig_ax
- SAVE_ALL
-+ ENCODE_FRAME_POINTER
- xorl %edx, %edx # zero error code
- movl %esp, %eax # pt_regs pointer
-
-@@ -1139,10 +1162,10 @@ ENTRY(nmi)
- * We're on the SYSENTER stack. Switch off. No one (not even debug)
- * is using the thread stack right now, so it's safe for us to use it.
- */
-- movl %esp, %ebp
-+ movl %esp, %ebx
- movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
- call do_nmi
-- movl %ebp, %esp
-+ movl %ebx, %esp
- jmp restore_all_notrace
-
- #ifdef CONFIG_X86_ESPFIX32
-@@ -1159,6 +1182,7 @@ nmi_espfix_stack:
- .endr
- pushl %eax
- SAVE_ALL
-+ ENCODE_FRAME_POINTER
- FIXUP_ESPFIX_STACK # %eax == %esp
- xorl %edx, %edx # zero error code
- call do_nmi
-@@ -1172,6 +1196,7 @@ ENTRY(int3)
- ASM_CLAC
- pushl $-1 # mark this as an int
- SAVE_ALL
-+ ENCODE_FRAME_POINTER
- TRACE_IRQS_OFF
- xorl %edx, %edx # zero error code
- movl %esp, %eax # pt_regs pointer
---- a/arch/x86/entry/entry_64.S
-+++ b/arch/x86/entry/entry_64.S
-@@ -469,6 +469,7 @@ END(irq_entries_start)
- ALLOC_PT_GPREGS_ON_STACK
- SAVE_C_REGS
- SAVE_EXTRA_REGS
-+ ENCODE_FRAME_POINTER
-
- testb $3, CS(%rsp)
- jz 1f
-@@ -985,6 +986,7 @@ ENTRY(xen_failsafe_callback)
- ALLOC_PT_GPREGS_ON_STACK
- SAVE_C_REGS
- SAVE_EXTRA_REGS
-+ ENCODE_FRAME_POINTER
- jmp error_exit
- END(xen_failsafe_callback)
-
-@@ -1028,6 +1030,7 @@ ENTRY(paranoid_entry)
- cld
- SAVE_C_REGS 8
- SAVE_EXTRA_REGS 8
-+ ENCODE_FRAME_POINTER 8
- movl $1, %ebx
- movl $MSR_GS_BASE, %ecx
- rdmsr
-@@ -1075,6 +1078,7 @@ ENTRY(error_entry)
- cld
- SAVE_C_REGS 8
- SAVE_EXTRA_REGS 8
-+ ENCODE_FRAME_POINTER 8
- xorl %ebx, %ebx
- testb $3, CS+8(%rsp)
- jz .Lerror_kernelspace
-@@ -1259,6 +1263,7 @@ ENTRY(nmi)
- pushq %r13 /* pt_regs->r13 */
- pushq %r14 /* pt_regs->r14 */
- pushq %r15 /* pt_regs->r15 */
-+ ENCODE_FRAME_POINTER
-
- /*
- * At this point we no longer need to worry about stack damage
-@@ -1272,11 +1277,10 @@ ENTRY(nmi)
-
- /*
- * Return back to user mode. We must *not* do the normal exit
-- * work, because we don't want to enable interrupts. Fortunately,
-- * do_nmi doesn't modify pt_regs.
-+ * work, because we don't want to enable interrupts.
- */
- SWAPGS
-- jmp restore_c_regs_and_iret
-+ jmp restore_regs_and_iret
-
- .Lnmi_from_kernel:
- /*
---- a/arch/x86/include/asm/unwind.h
-+++ b/arch/x86/include/asm/unwind.h
-@@ -13,6 +13,7 @@ struct unwind_state {
- int graph_idx;
- #ifdef CONFIG_FRAME_POINTER
- unsigned long *bp;
-+ struct pt_regs *regs;
- #else
- unsigned long *sp;
- #endif
-@@ -47,7 +48,15 @@ unsigned long *unwind_get_return_address
- if (unwind_done(state))
- return NULL;
-
-- return state->bp + 1;
-+ return state->regs ? &state->regs->ip : state->bp + 1;
-+}
-+
-+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
-+{
-+ if (unwind_done(state))
-+ return NULL;
-+
-+ return state->regs;
- }
-
- #else /* !CONFIG_FRAME_POINTER */
-@@ -57,6 +66,11 @@ unsigned long *unwind_get_return_address
- {
- return NULL;
- }
-+
-+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
-+{
-+ return NULL;
-+}
-
- #endif /* CONFIG_FRAME_POINTER */
-
---- a/arch/x86/kernel/unwind_frame.c
-+++ b/arch/x86/kernel/unwind_frame.c
-@@ -14,6 +14,9 @@ unsigned long unwind_get_return_address(
- if (unwind_done(state))
- return 0;
-
-+ if (state->regs && user_mode(state->regs))
-+ return 0;
-+
- addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, *addr_p,
- addr_p);
-
-@@ -21,6 +24,20 @@ unsigned long unwind_get_return_address(
- }
- EXPORT_SYMBOL_GPL(unwind_get_return_address);
-
-+/*
-+ * This determines if the frame pointer actually contains an encoded pointer to
-+ * pt_regs on the stack. See ENCODE_FRAME_POINTER.
-+ */
-+static struct pt_regs *decode_frame_pointer(unsigned long *bp)
-+{
-+ unsigned long regs = (unsigned long)bp;
-+
-+ if (!(regs & 0x1))
-+ return NULL;
-+
-+ return (struct pt_regs *)(regs & ~0x1);
-+}
-+
- static bool update_stack_state(struct unwind_state *state, void *addr,
- size_t len)
- {
-@@ -43,26 +60,59 @@ static bool update_stack_state(struct un
-
- bool unwind_next_frame(struct unwind_state *state)
- {
-- unsigned long *next_bp;
-+ struct pt_regs *regs;
-+ unsigned long *next_bp, *next_frame;
-+ size_t next_len;
-
- if (unwind_done(state))
- return false;
-
-- next_bp = (unsigned long *)*state->bp;
-+ /* have we reached the end? */
-+ if (state->regs && user_mode(state->regs))
-+ goto the_end;
-+
-+ /* get the next frame pointer */
-+ if (state->regs)
-+ next_bp = (unsigned long *)state->regs->bp;
-+ else
-+ next_bp = (unsigned long *)*state->bp;
-+
-+ /* is the next frame pointer an encoded pointer to pt_regs? */
-+ regs = decode_frame_pointer(next_bp);
-+ if (regs) {
-+ next_frame = (unsigned long *)regs;
-+ next_len = sizeof(*regs);
-+ } else {
-+ next_frame = next_bp;
-+ next_len = FRAME_HEADER_SIZE;
-+ }
-
- /* make sure the next frame's data is accessible */
-- if (!update_stack_state(state, next_bp, FRAME_HEADER_SIZE))
-+ if (!update_stack_state(state, next_frame, next_len))
- return false;
--
- /* move to the next frame */
-- state->bp = next_bp;
-+ if (regs) {
-+ state->regs = regs;
-+ state->bp = NULL;
-+ } else {
-+ state->bp = next_bp;
-+ state->regs = NULL;
-+ }
-+
- return true;
-+
-+the_end:
-+ state->stack_info.type = STACK_TYPE_UNKNOWN;
-+ return false;
- }
- EXPORT_SYMBOL_GPL(unwind_next_frame);
-
- void __unwind_start(struct unwind_state *state, struct task_struct *task,
- struct pt_regs *regs, unsigned long *first_frame)
- {
-+ unsigned long *bp, *frame;
-+ size_t len;
-+
- memset(state, 0, sizeof(*state));
- state->task = task;
-
-@@ -73,12 +123,22 @@ void __unwind_start(struct unwind_state
- }
-
- /* set up the starting stack frame */
-- state->bp = get_frame_pointer(task, regs);
-+ bp = get_frame_pointer(task, regs);
-+ regs = decode_frame_pointer(bp);
-+ if (regs) {
-+ state->regs = regs;
-+ frame = (unsigned long *)regs;
-+ len = sizeof(*regs);
-+ } else {
-+ state->bp = bp;
-+ frame = bp;
-+ len = FRAME_HEADER_SIZE;
-+ }
-
- /* initialize stack info and make sure the frame data is accessible */
-- get_stack_info(state->bp, state->task, &state->stack_info,
-+ get_stack_info(frame, state->task, &state->stack_info,
- &state->stack_mask);
-- update_stack_state(state, state->bp, FRAME_HEADER_SIZE);
-+ update_stack_state(state, frame, len);
-
- /*
- * The caller can provide the address of the first frame directly
+++ /dev/null
-From b7ffc44d5b2ea163899d09289ca7743d5c32e926 Mon Sep 17 00:00:00 2001
-From: Andy Lutomirski <luto@kernel.org>
-Date: Mon, 20 Feb 2017 08:56:14 -0800
-Subject: x86/kvm/vmx: Defer TR reload after VM exit
-
-From: Andy Lutomirski <luto@kernel.org>
-
-commit b7ffc44d5b2ea163899d09289ca7743d5c32e926 upstream.
-
-Intel's VMX is daft and resets the hidden TSS limit register to 0x67
-on VMX reload, and the 0x67 is not configurable. KVM currently
-reloads TR using the LTR instruction on every exit, but this is quite
-slow because LTR is serializing.
-
-The 0x67 limit is entirely harmless unless ioperm() is in use, so
-defer the reload until a task using ioperm() is actually running.
-
-Here's some poorly done benchmarking using kvm-unit-tests:
-
-Before:
-
-cpuid 1313
-vmcall 1195
-mov_from_cr8 11
-mov_to_cr8 17
-inl_from_pmtimer 6770
-inl_from_qemu 6856
-inl_from_kernel 2435
-outl_to_kernel 1402
-
-After:
-
-cpuid 1291
-vmcall 1181
-mov_from_cr8 11
-mov_to_cr8 16
-inl_from_pmtimer 6457
-inl_from_qemu 6209
-inl_from_kernel 2339
-outl_to_kernel 1391
-
-Signed-off-by: Andy Lutomirski <luto@kernel.org>
-[Force-reload TR in invalidate_tss_limit. - Paolo]
-Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
-Signed-off-by: Eduardo Valentin <eduval@amazon.com>
-Signed-off-by: Eduardo Valentin <edubezval@gmail.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- arch/x86/include/asm/desc.h | 48 ++++++++++++++++++++++++++++++++++++++++++++
- arch/x86/kernel/ioport.c | 5 ++++
- arch/x86/kernel/process.c | 10 +++++++++
- arch/x86/kvm/vmx.c | 23 ++++++++-------------
- 4 files changed, 72 insertions(+), 14 deletions(-)
-
---- a/arch/x86/include/asm/desc.h
-+++ b/arch/x86/include/asm/desc.h
-@@ -213,6 +213,54 @@ static inline void native_load_tr_desc(v
- asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
- }
-
-+static inline void force_reload_TR(void)
-+{
-+ struct desc_struct *d = get_cpu_gdt_table(smp_processor_id());
-+ tss_desc tss;
-+
-+ memcpy(&tss, &d[GDT_ENTRY_TSS], sizeof(tss_desc));
-+
-+ /*
-+ * LTR requires an available TSS, and the TSS is currently
-+ * busy. Make it be available so that LTR will work.
-+ */
-+ tss.type = DESC_TSS;
-+ write_gdt_entry(d, GDT_ENTRY_TSS, &tss, DESC_TSS);
-+
-+ load_TR_desc();
-+}
-+
-+DECLARE_PER_CPU(bool, need_tr_refresh);
-+
-+static inline void refresh_TR(void)
-+{
-+ WARN_ON(preemptible());
-+
-+ if (unlikely(this_cpu_read(need_tr_refresh))) {
-+ force_reload_TR();
-+ this_cpu_write(need_tr_refresh, false);
-+ }
-+}
-+
-+/*
-+ * If you do something evil that corrupts the cached TSS limit (I'm looking
-+ * at you, VMX exits), call this function.
-+ *
-+ * The optimization here is that the TSS limit only matters for Linux if the
-+ * IO bitmap is in use. If the TSS limit gets forced to its minimum value,
-+ * everything works except that IO bitmap will be ignored and all CPL 3 IO
-+ * instructions will #GP, which is exactly what we want for normal tasks.
-+ */
-+static inline void invalidate_tss_limit(void)
-+{
-+ WARN_ON(preemptible());
-+
-+ if (unlikely(test_thread_flag(TIF_IO_BITMAP)))
-+ force_reload_TR();
-+ else
-+ this_cpu_write(need_tr_refresh, true);
-+}
-+
- static inline void native_load_gdt(const struct desc_ptr *dtr)
- {
- asm volatile("lgdt %0"::"m" (*dtr));
---- a/arch/x86/kernel/ioport.c
-+++ b/arch/x86/kernel/ioport.c
-@@ -16,6 +16,7 @@
- #include <linux/syscalls.h>
- #include <linux/bitmap.h>
- #include <asm/syscalls.h>
-+#include <asm/desc.h>
-
- /*
- * this changes the io permissions bitmap in the current task.
-@@ -45,6 +46,10 @@ asmlinkage long sys_ioperm(unsigned long
- memset(bitmap, 0xff, IO_BITMAP_BYTES);
- t->io_bitmap_ptr = bitmap;
- set_thread_flag(TIF_IO_BITMAP);
-+
-+ preempt_disable();
-+ refresh_TR();
-+ preempt_enable();
- }
-
- /*
---- a/arch/x86/kernel/process.c
-+++ b/arch/x86/kernel/process.c
-@@ -33,6 +33,7 @@
- #include <asm/mce.h>
- #include <asm/vm86.h>
- #include <asm/switch_to.h>
-+#include <asm/desc.h>
-
- /*
- * per-CPU TSS segments. Threads are completely 'soft' on Linux,
-@@ -82,6 +83,9 @@ void idle_notifier_unregister(struct not
- EXPORT_SYMBOL_GPL(idle_notifier_unregister);
- #endif
-
-+DEFINE_PER_CPU(bool, need_tr_refresh);
-+EXPORT_PER_CPU_SYMBOL_GPL(need_tr_refresh);
-+
- /*
- * this gets called so that we can store lazy state into memory and copy the
- * current task into the new thread.
-@@ -227,6 +231,12 @@ void __switch_to_xtra(struct task_struct
- */
- memcpy(tss->io_bitmap, next->io_bitmap_ptr,
- max(prev->io_bitmap_max, next->io_bitmap_max));
-+
-+ /*
-+ * Make sure that the TSS limit is correct for the CPU
-+ * to notice the IO bitmap.
-+ */
-+ refresh_TR();
- } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
- /*
- * Clear any possible leftover bits:
---- a/arch/x86/kvm/vmx.c
-+++ b/arch/x86/kvm/vmx.c
-@@ -1959,19 +1959,6 @@ static void add_atomic_switch_msr(struct
- m->host[i].value = host_val;
- }
-
--static void reload_tss(void)
--{
-- /*
-- * VT restores TR but not its size. Useless.
-- */
-- struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
-- struct desc_struct *descs;
--
-- descs = (void *)gdt->address;
-- descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
-- load_TR_desc();
--}
--
- static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
- {
- u64 guest_efer = vmx->vcpu.arch.efer;
-@@ -2141,7 +2128,7 @@ static void __vmx_load_host_state(struct
- loadsegment(es, vmx->host_state.es_sel);
- }
- #endif
-- reload_tss();
-+ invalidate_tss_limit();
- #ifdef CONFIG_X86_64
- wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
- #endif
-@@ -2265,6 +2252,14 @@ static void vmx_vcpu_load(struct kvm_vcp
- vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
- vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */
-
-+ /*
-+ * VM exits change the host TR limit to 0x67 after a VM
-+ * exit. This is okay, since 0x67 covers everything except
-+ * the IO bitmap and we have code to handle the IO bitmap
-+ * being lost after a VM exit.
-+ */
-+ BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67);
-+
- rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
- vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
-
+++ /dev/null
-From 0fce546f9f07b94ccc9de09cf48d35e18946d2fa Mon Sep 17 00:00:00 2001
-From: =?UTF-8?q?J=C3=A9r=C3=A9my=20Lefaure?= <jeremy.lefaure@lse.epita.fr>
-Date: Sat, 25 Feb 2017 17:46:53 -0500
-Subject: x86/kvm/vmx: remove unused variable in segment_base()
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-From: Jérémy Lefaure <jeremy.lefaure@lse.epita.fr>
-
-commit 0fce546f9f07b94ccc9de09cf48d35e18946d2fa upstream.
-
-The pointer 'struct desc_struct *d' is unused since commit 8c2e41f7ae12
-("x86/kvm/vmx: Simplify segment_base()") so let's remove it.
-
-Signed-off-by: Jérémy Lefaure <jeremy.lefaure@lse.epita.fr>
-Reviewed-by: David Hildenbrand <david@redhat.com>
-Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
-Signed-off-by: Eduardo Valentin <eduval@amazon.com>
-Signed-off-by: Eduardo Valentin <edubezval@gmail.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- arch/x86/kvm/vmx.c | 1 -
- 1 file changed, 1 deletion(-)
-
---- a/arch/x86/kvm/vmx.c
-+++ b/arch/x86/kvm/vmx.c
-@@ -2016,7 +2016,6 @@ static bool update_transition_efer(struc
- static unsigned long segment_base(u16 selector)
- {
- struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
-- struct desc_struct *d;
- struct desc_struct *table;
- unsigned long v;
-
+++ /dev/null
-From 8c2e41f7ae1234c192ef497472ad306227c77c03 Mon Sep 17 00:00:00 2001
-From: Andy Lutomirski <luto@kernel.org>
-Date: Mon, 20 Feb 2017 08:56:12 -0800
-Subject: x86/kvm/vmx: Simplify segment_base()
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-From: Andy Lutomirski <luto@kernel.org>
-
-commit 8c2e41f7ae1234c192ef497472ad306227c77c03 upstream.
-
-Use actual pointer types for pointers (instead of unsigned long) and
-replace hardcoded constants with the appropriate self-documenting
-macros.
-
-The function is still a bit messy, but this seems a lot better than
-before to me.
-
-This is mostly borrowed from a patch by Thomas Garnier.
-
-Cc: Thomas Garnier <thgarnie@google.com>
-Cc: Jim Mattson <jmattson@google.com>
-Cc: Radim Krčmář <rkrcmar@redhat.com>
-Cc: Paolo Bonzini <pbonzini@redhat.com>
-Signed-off-by: Andy Lutomirski <luto@kernel.org>
-Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
-Signed-off-by: Eduardo Valentin <eduval@amazon.com>
-Signed-off-by: Eduardo Valentin <edubezval@gmail.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- arch/x86/kvm/vmx.c | 19 +++++++------------
- 1 file changed, 7 insertions(+), 12 deletions(-)
-
---- a/arch/x86/kvm/vmx.c
-+++ b/arch/x86/kvm/vmx.c
-@@ -2030,28 +2030,23 @@ static unsigned long segment_base(u16 se
- {
- struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
- struct desc_struct *d;
-- unsigned long table_base;
-+ struct desc_struct *table;
- unsigned long v;
-
-- if (!(selector & ~3))
-+ if (!(selector & ~SEGMENT_RPL_MASK))
- return 0;
-
-- table_base = gdt->address;
-+ table = (struct desc_struct *)gdt->address;
-
-- if (selector & 4) { /* from ldt */
-+ if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
- u16 ldt_selector = kvm_read_ldt();
-
-- if (!(ldt_selector & ~3))
-+ if (!(ldt_selector & ~SEGMENT_RPL_MASK))
- return 0;
-
-- table_base = segment_base(ldt_selector);
-+ table = (struct desc_struct *)segment_base(ldt_selector);
- }
-- d = (struct desc_struct *)(table_base + (selector & ~7));
-- v = get_desc_base(d);
--#ifdef CONFIG_X86_64
-- if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
-- v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
--#endif
-+ v = get_desc_base(&table[selector >> 3]);
- return v;
- }
-
+++ /dev/null
-From be4ffc0d787fafb22b89a2f29e71fea3b119205e Mon Sep 17 00:00:00 2001
-From: Andy Lutomirski <luto@kernel.org>
-Date: Sun, 28 May 2017 10:00:16 -0700
-Subject: x86/mm: Be more consistent wrt PAGE_SHIFT vs PAGE_SIZE in tlb flush code
-
-From: Andy Lutomirski <luto@kernel.org>
-
-commit be4ffc0d787fafb22b89a2f29e71fea3b119205e upstream.
-
-Nadav pointed out that some code used PAGE_SIZE and other code used
-PAGE_SHIFT. Use PAGE_SHIFT instead of multiplying or dividing by
-PAGE_SIZE.
-
-Requested-by: Nadav Amit <nadav.amit@gmail.com>
-Signed-off-by: Andy Lutomirski <luto@kernel.org>
-Cc: Andrew Morton <akpm@linux-foundation.org>
-Cc: Arjan van de Ven <arjan@linux.intel.com>
-Cc: Borislav Petkov <bpetkov@suse.de>
-Cc: Dave Hansen <dave.hansen@intel.com>
-Cc: Linus Torvalds <torvalds@linux-foundation.org>
-Cc: Mel Gorman <mgorman@suse.de>
-Cc: Michal Hocko <mhocko@suse.com>
-Cc: Nadav Amit <nadav.amit@gmail.com>
-Cc: Nadav Amit <namit@vmware.com>
-Cc: Peter Zijlstra <peterz@infradead.org>
-Cc: Rik van Riel <riel@redhat.com>
-Cc: Thomas Gleixner <tglx@linutronix.de>
-Cc: linux-mm@kvack.org
-Signed-off-by: Ingo Molnar <mingo@kernel.org>
-Signed-off-by: Eduardo Valentin <eduval@amazon.com>
-Signed-off-by: Eduardo Valentin <edubezval@gmail.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- arch/x86/mm/tlb.c | 5 ++---
- 1 file changed, 2 insertions(+), 3 deletions(-)
-
---- a/arch/x86/mm/tlb.c
-+++ b/arch/x86/mm/tlb.c
-@@ -220,8 +220,7 @@ static void flush_tlb_func_common(const
- trace_tlb_flush(reason, TLB_FLUSH_ALL);
- } else {
- unsigned long addr;
-- unsigned long nr_pages =
-- (f->end - f->start) / PAGE_SIZE;
-+ unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT;
- addr = f->start;
- while (addr < f->end) {
- __flush_tlb_single(addr);
-@@ -374,7 +373,7 @@ void flush_tlb_kernel_range(unsigned lon
-
- /* Balance as user space task's flush, a bit conservative */
- if (end == TLB_FLUSH_ALL ||
-- (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) {
-+ (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) {
- on_each_cpu(do_flush_tlb_all, NULL, 1);
- } else {
- struct flush_tlb_info info;
+++ /dev/null
-From 59f537c1dea04287165bb11407921e095250dc80 Mon Sep 17 00:00:00 2001
-From: Andy Lutomirski <luto@kernel.org>
-Date: Sun, 28 May 2017 10:00:11 -0700
-Subject: x86/mm: Change the leave_mm() condition for local TLB flushes
-
-From: Andy Lutomirski <luto@kernel.org>
-
-commit 59f537c1dea04287165bb11407921e095250dc80 upstream.
-
-On a remote TLB flush, we leave_mm() if we're TLBSTATE_LAZY. For a
-local flush_tlb_mm_range(), we leave_mm() if !current->mm. These
-are approximately the same condition -- the scheduler sets lazy TLB
-mode when switching to a thread with no mm.
-
-I'm about to merge the local and remote flush code, but for ease of
-verifying and bisecting the patch, I want the local and remote flush
-behavior to match first. This patch changes the local code to match
-the remote code.
-
-Signed-off-by: Andy Lutomirski <luto@kernel.org>
-Acked-by: Rik van Riel <riel@redhat.com>
-Cc: Andrew Morton <akpm@linux-foundation.org>
-Cc: Arjan van de Ven <arjan@linux.intel.com>
-Cc: Borislav Petkov <bpetkov@suse.de>
-Cc: Dave Hansen <dave.hansen@intel.com>
-Cc: Linus Torvalds <torvalds@linux-foundation.org>
-Cc: Mel Gorman <mgorman@suse.de>
-Cc: Michal Hocko <mhocko@suse.com>
-Cc: Nadav Amit <nadav.amit@gmail.com>
-Cc: Nadav Amit <namit@vmware.com>
-Cc: Peter Zijlstra <peterz@infradead.org>
-Cc: Rik van Riel <riel@redhat.com>
-Cc: Thomas Gleixner <tglx@linutronix.de>
-Cc: linux-mm@kvack.org
-Signed-off-by: Ingo Molnar <mingo@kernel.org>
-Signed-off-by: Eduardo Valentin <eduval@amazon.com>
-Signed-off-by: Eduardo Valentin <edubezval@gmail.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- arch/x86/mm/tlb.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
---- a/arch/x86/mm/tlb.c
-+++ b/arch/x86/mm/tlb.c
-@@ -328,7 +328,7 @@ void flush_tlb_mm_range(struct mm_struct
- goto out;
- }
-
-- if (!current->mm) {
-+ if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
- leave_mm(smp_processor_id());
-
- /* Synchronize with switch_mm. */
+++ /dev/null
-From d6e41f1151feeb118eee776c09323aceb4a415d9 Mon Sep 17 00:00:00 2001
-From: Andy Lutomirski <luto@kernel.org>
-Date: Sun, 28 May 2017 10:00:17 -0700
-Subject: x86/mm, KVM: Teach KVM's VMX code that CR3 isn't a constant
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-From: Andy Lutomirski <luto@kernel.org>
-
-commit d6e41f1151feeb118eee776c09323aceb4a415d9 upstream.
-
-When PCID is enabled, CR3's PCID bits can change during context
-switches, so KVM won't be able to treat CR3 as a per-mm constant any
-more.
-
-I structured this like the existing CR4 handling. Under ordinary
-circumstances (PCID disabled or if the current PCID and the value
-that's already in the VMCS match), then we won't do an extra VMCS
-write, and we'll never do an extra direct CR3 read. The overhead
-should be minimal.
-
-I disallowed using the new helper in non-atomic context because
-PCID support will cause CR3 to stop being constant in non-atomic
-process context.
-
-(Frankly, it also scares me a bit that KVM ever treated CR3 as
-constant, but it looks like it was okay before.)
-
-Signed-off-by: Andy Lutomirski <luto@kernel.org>
-Cc: Andrew Morton <akpm@linux-foundation.org>
-Cc: Arjan van de Ven <arjan@linux.intel.com>
-Cc: Borislav Petkov <bpetkov@suse.de>
-Cc: Dave Hansen <dave.hansen@intel.com>
-Cc: Linus Torvalds <torvalds@linux-foundation.org>
-Cc: Mel Gorman <mgorman@suse.de>
-Cc: Michal Hocko <mhocko@suse.com>
-Cc: Nadav Amit <nadav.amit@gmail.com>
-Cc: Nadav Amit <namit@vmware.com>
-Cc: Paolo Bonzini <pbonzini@redhat.com>
-Cc: Peter Zijlstra <peterz@infradead.org>
-Cc: Radim Krčmář <rkrcmar@redhat.com>
-Cc: Rik van Riel <riel@redhat.com>
-Cc: Thomas Gleixner <tglx@linutronix.de>
-Cc: kvm@vger.kernel.org
-Cc: linux-mm@kvack.org
-Signed-off-by: Ingo Molnar <mingo@kernel.org>
-Signed-off-by: Eduardo Valentin <eduval@amazon.com>
-Signed-off-by: Eduardo Valentin <edubezval@gmail.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- arch/x86/include/asm/mmu_context.h | 19 +++++++++++++++++++
- arch/x86/kvm/vmx.c | 25 +++++++++++++++++++++----
- 2 files changed, 40 insertions(+), 4 deletions(-)
-
---- a/arch/x86/include/asm/mmu_context.h
-+++ b/arch/x86/include/asm/mmu_context.h
-@@ -268,4 +268,23 @@ static inline bool arch_pte_access_permi
- {
- return __pkru_allows_pkey(pte_flags_pkey(pte_flags(pte)), write);
- }
-+
-+/*
-+ * This can be used from process context to figure out what the value of
-+ * CR3 is without needing to do a (slow) read_cr3().
-+ *
-+ * It's intended to be used for code like KVM that sneakily changes CR3
-+ * and needs to restore it. It needs to be used very carefully.
-+ */
-+static inline unsigned long __get_current_cr3_fast(void)
-+{
-+ unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
-+
-+ /* For now, be very restrictive about when this can be called. */
-+ VM_WARN_ON(in_nmi() || !in_atomic());
-+
-+ VM_BUG_ON(cr3 != read_cr3());
-+ return cr3;
-+}
-+
- #endif /* _ASM_X86_MMU_CONTEXT_H */
---- a/arch/x86/kvm/vmx.c
-+++ b/arch/x86/kvm/vmx.c
-@@ -48,6 +48,7 @@
- #include <asm/kexec.h>
- #include <asm/apic.h>
- #include <asm/irq_remapping.h>
-+#include <asm/mmu_context.h>
-
- #include "trace.h"
- #include "pmu.h"
-@@ -572,6 +573,7 @@ struct vcpu_vmx {
- int gs_ldt_reload_needed;
- int fs_reload_needed;
- u64 msr_host_bndcfgs;
-+ unsigned long vmcs_host_cr3; /* May not match real cr3 */
- unsigned long vmcs_host_cr4; /* May not match real cr4 */
- } host_state;
- struct {
-@@ -4857,10 +4859,19 @@ static void vmx_set_constant_host_state(
- u32 low32, high32;
- unsigned long tmpl;
- struct desc_ptr dt;
-- unsigned long cr4;
-+ unsigned long cr0, cr3, cr4;
-
-- vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS); /* 22.2.3 */
-- vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */
-+ cr0 = read_cr0();
-+ WARN_ON(cr0 & X86_CR0_TS);
-+ vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */
-+
-+ /*
-+ * Save the most likely value for this task's CR3 in the VMCS.
-+ * We can't use __get_current_cr3_fast() because we're not atomic.
-+ */
-+ cr3 = read_cr3();
-+ vmcs_writel(HOST_CR3, cr3); /* 22.2.3 FIXME: shadow tables */
-+ vmx->host_state.vmcs_host_cr3 = cr3;
-
- /* Save the most likely value for this task's CR4 in the VMCS. */
- cr4 = cr4_read_shadow();
-@@ -8836,7 +8847,7 @@ void vmx_arm_hv_timer(struct kvm_vcpu *v
- static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
- {
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-- unsigned long debugctlmsr, cr4;
-+ unsigned long debugctlmsr, cr3, cr4;
-
- /* Record the guest's net vcpu time for enforced NMI injections. */
- if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
-@@ -8862,6 +8873,12 @@ static void __noclone vmx_vcpu_run(struc
- if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
- vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
-
-+ cr3 = __get_current_cr3_fast();
-+ if (unlikely(cr3 != vmx->host_state.vmcs_host_cr3)) {
-+ vmcs_writel(HOST_CR3, cr3);
-+ vmx->host_state.vmcs_host_cr3 = cr3;
-+ }
-+
- cr4 = cr4_read_shadow();
- if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
- vmcs_writel(HOST_CR4, cr4);
+++ /dev/null
-From a2055abe9c6789cedef29abbdaa488a087faccc3 Mon Sep 17 00:00:00 2001
-From: Andy Lutomirski <luto@kernel.org>
-Date: Sun, 28 May 2017 10:00:10 -0700
-Subject: x86/mm: Pass flush_tlb_info to flush_tlb_others() etc
-
-From: Andy Lutomirski <luto@kernel.org>
-
-commit a2055abe9c6789cedef29abbdaa488a087faccc3 upstream.
-
-Rather than passing all the contents of flush_tlb_info to
-flush_tlb_others(), pass a pointer to the structure directly. For
-consistency, this also removes the unnecessary cpu parameter from
-uv_flush_tlb_others() to make its signature match the other
-*flush_tlb_others() functions.
-
-This serves two purposes:
-
- - It will dramatically simplify future patches that change struct
- flush_tlb_info, which I'm planning to do.
-
- - struct flush_tlb_info is an adequate description of what to do
- for a local flush, too, so by reusing it we can remove duplicated
- code between local and remote flushes in a future patch.
-
-Signed-off-by: Andy Lutomirski <luto@kernel.org>
-Acked-by: Rik van Riel <riel@redhat.com>
-Cc: Andrew Morton <akpm@linux-foundation.org>
-Cc: Borislav Petkov <bpetkov@suse.de>
-Cc: Dave Hansen <dave.hansen@intel.com>
-Cc: Linus Torvalds <torvalds@linux-foundation.org>
-Cc: Mel Gorman <mgorman@suse.de>
-Cc: Michal Hocko <mhocko@suse.com>
-Cc: Nadav Amit <nadav.amit@gmail.com>
-Cc: Nadav Amit <namit@vmware.com>
-Cc: Peter Zijlstra <peterz@infradead.org>
-Cc: Rik van Riel <riel@redhat.com>
-Cc: Thomas Gleixner <tglx@linutronix.de>
-Cc: linux-mm@kvack.org
-[ Fix build warning. ]
-Signed-off-by: Ingo Molnar <mingo@kernel.org>
-Signed-off-by: Eduardo Valentin <eduval@amazon.com>
-Signed-off-by: Eduardo Valentin <edubezval@gmail.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- arch/x86/include/asm/paravirt.h | 6 --
- arch/x86/include/asm/paravirt_types.h | 5 --
- arch/x86/include/asm/tlbflush.h | 19 +++++---
- arch/x86/include/asm/uv/uv.h | 11 ++---
- arch/x86/mm/tlb.c | 72 ++++++++++++++++++----------------
- arch/x86/platform/uv/tlb_uv.c | 10 +---
- arch/x86/xen/mmu.c | 10 ++--
- 7 files changed, 68 insertions(+), 65 deletions(-)
-
---- a/arch/x86/include/asm/paravirt.h
-+++ b/arch/x86/include/asm/paravirt.h
-@@ -317,11 +317,9 @@ static inline void __flush_tlb_single(un
- }
-
- static inline void flush_tlb_others(const struct cpumask *cpumask,
-- struct mm_struct *mm,
-- unsigned long start,
-- unsigned long end)
-+ const struct flush_tlb_info *info)
- {
-- PVOP_VCALL4(pv_mmu_ops.flush_tlb_others, cpumask, mm, start, end);
-+ PVOP_VCALL2(pv_mmu_ops.flush_tlb_others, cpumask, info);
- }
-
- static inline int paravirt_pgd_alloc(struct mm_struct *mm)
---- a/arch/x86/include/asm/paravirt_types.h
-+++ b/arch/x86/include/asm/paravirt_types.h
-@@ -51,6 +51,7 @@ struct mm_struct;
- struct desc_struct;
- struct task_struct;
- struct cpumask;
-+struct flush_tlb_info;
-
- /*
- * Wrapper type for pointers to code which uses the non-standard
-@@ -225,9 +226,7 @@ struct pv_mmu_ops {
- void (*flush_tlb_kernel)(void);
- void (*flush_tlb_single)(unsigned long addr);
- void (*flush_tlb_others)(const struct cpumask *cpus,
-- struct mm_struct *mm,
-- unsigned long start,
-- unsigned long end);
-+ const struct flush_tlb_info *info);
-
- /* Hooks for allocating and freeing a pagetable top-level */
- int (*pgd_alloc)(struct mm_struct *mm);
---- a/arch/x86/include/asm/tlbflush.h
-+++ b/arch/x86/include/asm/tlbflush.h
-@@ -211,12 +211,18 @@ static inline void __flush_tlb_one(unsig
- * - flush_tlb_page(vma, vmaddr) flushes one page
- * - flush_tlb_range(vma, start, end) flushes a range of pages
- * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
-- * - flush_tlb_others(cpumask, mm, start, end) flushes TLBs on other cpus
-+ * - flush_tlb_others(cpumask, info) flushes TLBs on other cpus
- *
- * ..but the i386 has somewhat limited tlb flushing capabilities,
- * and page-granular flushes are available only on i486 and up.
- */
-
-+struct flush_tlb_info {
-+ struct mm_struct *mm;
-+ unsigned long start;
-+ unsigned long end;
-+};
-+
- #ifndef CONFIG_SMP
-
- /* "_up" is for UniProcessor.
-@@ -275,9 +281,7 @@ static inline void flush_tlb_mm_range(st
- }
-
- static inline void native_flush_tlb_others(const struct cpumask *cpumask,
-- struct mm_struct *mm,
-- unsigned long start,
-- unsigned long end)
-+ const struct flush_tlb_info *info)
- {
- }
-
-@@ -315,8 +319,7 @@ static inline void flush_tlb_page(struct
- }
-
- void native_flush_tlb_others(const struct cpumask *cpumask,
-- struct mm_struct *mm,
-- unsigned long start, unsigned long end);
-+ const struct flush_tlb_info *info);
-
- #define TLBSTATE_OK 1
- #define TLBSTATE_LAZY 2
-@@ -338,8 +341,8 @@ extern void arch_tlbbatch_flush(struct a
- #endif /* SMP */
-
- #ifndef CONFIG_PARAVIRT
--#define flush_tlb_others(mask, mm, start, end) \
-- native_flush_tlb_others(mask, mm, start, end)
-+#define flush_tlb_others(mask, info) \
-+ native_flush_tlb_others(mask, info)
- #endif
-
- #endif /* _ASM_X86_TLBFLUSH_H */
---- a/arch/x86/include/asm/uv/uv.h
-+++ b/arch/x86/include/asm/uv/uv.h
-@@ -1,6 +1,8 @@
- #ifndef _ASM_X86_UV_UV_H
- #define _ASM_X86_UV_UV_H
-
-+#include <asm/tlbflush.h>
-+
- enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
-
- struct cpumask;
-@@ -14,10 +16,7 @@ extern void uv_cpu_init(void);
- extern void uv_nmi_init(void);
- extern void uv_system_init(void);
- extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
-- struct mm_struct *mm,
-- unsigned long start,
-- unsigned long end,
-- unsigned int cpu);
-+ const struct flush_tlb_info *info);
-
- #else /* X86_UV */
-
-@@ -26,8 +25,8 @@ static inline int is_uv_system(void) { r
- static inline void uv_cpu_init(void) { }
- static inline void uv_system_init(void) { }
- static inline const struct cpumask *
--uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm,
-- unsigned long start, unsigned long end, unsigned int cpu)
-+uv_flush_tlb_others(const struct cpumask *cpumask,
-+ const struct flush_tlb_info *info)
- { return cpumask; }
-
- #endif /* X86_UV */
---- a/arch/x86/mm/tlb.c
-+++ b/arch/x86/mm/tlb.c
-@@ -30,12 +30,6 @@
-
- #ifdef CONFIG_SMP
-
--struct flush_tlb_info {
-- struct mm_struct *flush_mm;
-- unsigned long flush_start;
-- unsigned long flush_end;
--};
--
- /*
- * We cannot call mmdrop() because we are in interrupt context,
- * instead update mm->cpu_vm_mask.
-@@ -229,11 +223,11 @@ void switch_mm_irqs_off(struct mm_struct
- */
- static void flush_tlb_func(void *info)
- {
-- struct flush_tlb_info *f = info;
-+ const struct flush_tlb_info *f = info;
-
- inc_irq_stat(irq_tlb_count);
-
-- if (f->flush_mm && f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))
-+ if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.active_mm))
- return;
-
- count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
-@@ -243,15 +237,15 @@ static void flush_tlb_func(void *info)
- return;
- }
-
-- if (f->flush_end == TLB_FLUSH_ALL) {
-+ if (f->end == TLB_FLUSH_ALL) {
- local_flush_tlb();
- trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL);
- } else {
- unsigned long addr;
- unsigned long nr_pages =
-- (f->flush_end - f->flush_start) / PAGE_SIZE;
-- addr = f->flush_start;
-- while (addr < f->flush_end) {
-+ (f->end - f->start) / PAGE_SIZE;
-+ addr = f->start;
-+ while (addr < f->end) {
- __flush_tlb_single(addr);
- addr += PAGE_SIZE;
- }
-@@ -260,38 +254,38 @@ static void flush_tlb_func(void *info)
- }
-
- void native_flush_tlb_others(const struct cpumask *cpumask,
-- struct mm_struct *mm, unsigned long start,
-- unsigned long end)
-+ const struct flush_tlb_info *info)
- {
-- struct flush_tlb_info info;
--
-- info.flush_mm = mm;
-- info.flush_start = start;
-- info.flush_end = end;
--
- count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
-- if (end == TLB_FLUSH_ALL)
-+ if (info->end == TLB_FLUSH_ALL)
- trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
- else
- trace_tlb_flush(TLB_REMOTE_SEND_IPI,
-- (end - start) >> PAGE_SHIFT);
-+ (info->end - info->start) >> PAGE_SHIFT);
-
- if (is_uv_system()) {
- unsigned int cpu;
-
- cpu = smp_processor_id();
-- cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu);
-+ cpumask = uv_flush_tlb_others(cpumask, info);
- if (cpumask)
- smp_call_function_many(cpumask, flush_tlb_func,
-- &info, 1);
-+ (void *)info, 1);
- return;
- }
-- smp_call_function_many(cpumask, flush_tlb_func, &info, 1);
-+ smp_call_function_many(cpumask, flush_tlb_func,
-+ (void *)info, 1);
- }
-
- void flush_tlb_current_task(void)
- {
- struct mm_struct *mm = current->mm;
-+ struct flush_tlb_info info = {
-+ .mm = mm,
-+ .start = 0UL,
-+ .end = TLB_FLUSH_ALL,
-+ };
-+
-
- preempt_disable();
-
-@@ -302,7 +296,7 @@ void flush_tlb_current_task(void)
-
- trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
- if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
-- flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
-+ flush_tlb_others(mm_cpumask(mm), &info);
- preempt_enable();
- }
-
-@@ -322,6 +316,7 @@ void flush_tlb_mm_range(struct mm_struct
- unsigned long end, unsigned long vmflag)
- {
- unsigned long addr;
-+ struct flush_tlb_info info;
- /* do a global flush by default */
- unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
-
-@@ -362,15 +357,20 @@ void flush_tlb_mm_range(struct mm_struct
- }
- trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush);
- out:
-+ info.mm = mm;
- if (base_pages_to_flush == TLB_FLUSH_ALL) {
-- start = 0UL;
-- end = TLB_FLUSH_ALL;
-+ info.start = 0UL;
-+ info.end = TLB_FLUSH_ALL;
-+ } else {
-+ info.start = start;
-+ info.end = end;
- }
- if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
-- flush_tlb_others(mm_cpumask(mm), mm, start, end);
-+ flush_tlb_others(mm_cpumask(mm), &info);
- preempt_enable();
- }
-
-+
- static void do_flush_tlb_all(void *info)
- {
- count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
-@@ -391,7 +391,7 @@ static void do_kernel_range_flush(void *
- unsigned long addr;
-
- /* flush range by one by one 'invlpg' */
-- for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE)
-+ for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
- __flush_tlb_single(addr);
- }
-
-@@ -404,14 +404,20 @@ void flush_tlb_kernel_range(unsigned lon
- on_each_cpu(do_flush_tlb_all, NULL, 1);
- } else {
- struct flush_tlb_info info;
-- info.flush_start = start;
-- info.flush_end = end;
-+ info.start = start;
-+ info.end = end;
- on_each_cpu(do_kernel_range_flush, &info, 1);
- }
- }
-
- void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
- {
-+ struct flush_tlb_info info = {
-+ .mm = NULL,
-+ .start = 0UL,
-+ .end = TLB_FLUSH_ALL,
-+ };
-+
- int cpu = get_cpu();
-
- if (cpumask_test_cpu(cpu, &batch->cpumask)) {
-@@ -421,7 +427,7 @@ void arch_tlbbatch_flush(struct arch_tlb
- }
-
- if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
-- flush_tlb_others(&batch->cpumask, NULL, 0, TLB_FLUSH_ALL);
-+ flush_tlb_others(&batch->cpumask, &info);
- cpumask_clear(&batch->cpumask);
-
- put_cpu();
---- a/arch/x86/platform/uv/tlb_uv.c
-+++ b/arch/x86/platform/uv/tlb_uv.c
-@@ -1110,11 +1110,9 @@ static int set_distrib_bits(struct cpuma
- * done. The returned pointer is valid till preemption is re-enabled.
- */
- const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
-- struct mm_struct *mm,
-- unsigned long start,
-- unsigned long end,
-- unsigned int cpu)
-+ const struct flush_tlb_info *info)
- {
-+ unsigned int cpu = smp_processor_id();
- int locals = 0;
- int remotes = 0;
- int hubs = 0;
-@@ -1171,8 +1169,8 @@ const struct cpumask *uv_flush_tlb_other
-
- record_send_statistics(stat, locals, hubs, remotes, bau_desc);
-
-- if (!end || (end - start) <= PAGE_SIZE)
-- bau_desc->payload.address = start;
-+ if (!info->end || (info->end - info->start) <= PAGE_SIZE)
-+ bau_desc->payload.address = info->start;
- else
- bau_desc->payload.address = TLB_FLUSH_ALL;
- bau_desc->payload.sending_cpu = cpu;
---- a/arch/x86/xen/mmu.c
-+++ b/arch/x86/xen/mmu.c
-@@ -1372,8 +1372,7 @@ static void xen_flush_tlb_single(unsigne
- }
-
- static void xen_flush_tlb_others(const struct cpumask *cpus,
-- struct mm_struct *mm, unsigned long start,
-- unsigned long end)
-+ const struct flush_tlb_info *info)
- {
- struct {
- struct mmuext_op op;
-@@ -1385,7 +1384,7 @@ static void xen_flush_tlb_others(const s
- } *args;
- struct multicall_space mcs;
-
-- trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);
-+ trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end);
-
- if (cpumask_empty(cpus))
- return; /* nothing to do */
-@@ -1399,9 +1398,10 @@ static void xen_flush_tlb_others(const s
- cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
-
- args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
-- if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
-+ if (info->end != TLB_FLUSH_ALL &&
-+ (info->end - info->start) <= PAGE_SIZE) {
- args->op.cmd = MMUEXT_INVLPG_MULTI;
-- args->op.arg1.linear_addr = start;
-+ args->op.arg1.linear_addr = info->start;
- }
-
- MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
+++ /dev/null
-From b3b90e5af7976e46541f5029a369c9c38c5e4cea Mon Sep 17 00:00:00 2001
-From: Andy Lutomirski <luto@kernel.org>
-Date: Mon, 22 May 2017 15:30:02 -0700
-Subject: x86/mm: Reduce indentation in flush_tlb_func()
-
-From: Andy Lutomirski <luto@kernel.org>
-
-commit b3b90e5af7976e46541f5029a369c9c38c5e4cea upstream.
-
-The leave_mm() case can just exit the function early so we don't
-need to indent the entire remainder of the function.
-
-Signed-off-by: Andy Lutomirski <luto@kernel.org>
-Acked-by: Kees Cook <keescook@chromium.org>
-Cc: Andrew Morton <akpm@linux-foundation.org>
-Cc: Borislav Petkov <bpetkov@suse.de>
-Cc: Dave Hansen <dave.hansen@intel.com>
-Cc: Linus Torvalds <torvalds@linux-foundation.org>
-Cc: Mel Gorman <mgorman@suse.de>
-Cc: Michal Hocko <mhocko@suse.com>
-Cc: Nadav Amit <nadav.amit@gmail.com>
-Cc: Nadav Amit <namit@vmware.com>
-Cc: Peter Zijlstra <peterz@infradead.org>
-Cc: Rik van Riel <riel@redhat.com>
-Cc: Thomas Gleixner <tglx@linutronix.de>
-Cc: linux-mm@kvack.org
-Link: http://lkml.kernel.org/r/97901ddcc9821d7bc7b296d2918d1179f08aaf22.1495492063.git.luto@kernel.org
-Signed-off-by: Ingo Molnar <mingo@kernel.org>
-Signed-off-by: Eduardo Valentin <eduval@amazon.com>
-Signed-off-by: Eduardo Valentin <edubezval@gmail.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- arch/x86/mm/tlb.c | 34 ++++++++++++++++++----------------
- 1 file changed, 18 insertions(+), 16 deletions(-)
-
---- a/arch/x86/mm/tlb.c
-+++ b/arch/x86/mm/tlb.c
-@@ -237,24 +237,26 @@ static void flush_tlb_func(void *info)
- return;
-
- count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
-- if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
-- if (f->flush_end == TLB_FLUSH_ALL) {
-- local_flush_tlb();
-- trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL);
-- } else {
-- unsigned long addr;
-- unsigned long nr_pages =
-- (f->flush_end - f->flush_start) / PAGE_SIZE;
-- addr = f->flush_start;
-- while (addr < f->flush_end) {
-- __flush_tlb_single(addr);
-- addr += PAGE_SIZE;
-- }
-- trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages);
-- }
-- } else
-+
-+ if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
- leave_mm(smp_processor_id());
-+ return;
-+ }
-
-+ if (f->flush_end == TLB_FLUSH_ALL) {
-+ local_flush_tlb();
-+ trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL);
-+ } else {
-+ unsigned long addr;
-+ unsigned long nr_pages =
-+ (f->flush_end - f->flush_start) / PAGE_SIZE;
-+ addr = f->flush_start;
-+ while (addr < f->flush_end) {
-+ __flush_tlb_single(addr);
-+ addr += PAGE_SIZE;
-+ }
-+ trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages);
-+ }
- }
-
- void native_flush_tlb_others(const struct cpumask *cpumask,
+++ /dev/null
-From 454bbad9793f59f5656ce5971ee473a8be736ef5 Mon Sep 17 00:00:00 2001
-From: Andy Lutomirski <luto@kernel.org>
-Date: Sun, 28 May 2017 10:00:12 -0700
-Subject: x86/mm: Refactor flush_tlb_mm_range() to merge local and remote cases
-
-From: Andy Lutomirski <luto@kernel.org>
-
-commit 454bbad9793f59f5656ce5971ee473a8be736ef5 upstream.
-
-The local flush path is very similar to the remote flush path.
-Merge them.
-
-This is intended to make no difference to behavior whatsoever. It
-removes some code and will make future changes to the flushing
-mechanics simpler.
-
-This patch does remove one small optimization: flush_tlb_mm_range()
-now has an unconditional smp_mb() instead of using MOV to CR3 or
-INVLPG as a full barrier when applicable. I think this is okay for
-a few reasons. First, smp_mb() is quite cheap compared to the cost
-of a TLB flush. Second, this rearrangement makes a bigger
-optimization available: with some work on the SMP function call
-code, we could do the local and remote flushes in parallel. Third,
-I'm planning a rework of the TLB flush algorithm that will require
-an atomic operation at the beginning of each flush, and that
-operation will replace the smp_mb().
-
-Signed-off-by: Andy Lutomirski <luto@kernel.org>
-Cc: Andrew Morton <akpm@linux-foundation.org>
-Cc: Arjan van de Ven <arjan@linux.intel.com>
-Cc: Borislav Petkov <bpetkov@suse.de>
-Cc: Dave Hansen <dave.hansen@intel.com>
-Cc: Linus Torvalds <torvalds@linux-foundation.org>
-Cc: Mel Gorman <mgorman@suse.de>
-Cc: Michal Hocko <mhocko@suse.com>
-Cc: Nadav Amit <nadav.amit@gmail.com>
-Cc: Nadav Amit <namit@vmware.com>
-Cc: Peter Zijlstra <peterz@infradead.org>
-Cc: Rik van Riel <riel@redhat.com>
-Cc: Thomas Gleixner <tglx@linutronix.de>
-Cc: linux-mm@kvack.org
-Signed-off-by: Ingo Molnar <mingo@kernel.org>
-Signed-off-by: Eduardo Valentin <eduval@amazon.com>
-Signed-off-by: Eduardo Valentin <edubezval@gmail.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- arch/x86/include/asm/tlbflush.h | 1
- arch/x86/mm/tlb.c | 111 +++++++++++++++++-----------------------
- 2 files changed, 48 insertions(+), 64 deletions(-)
-
---- a/arch/x86/include/asm/tlbflush.h
-+++ b/arch/x86/include/asm/tlbflush.h
-@@ -216,7 +216,6 @@ static inline void __flush_tlb_one(unsig
- * ..but the i386 has somewhat limited tlb flushing capabilities,
- * and page-granular flushes are available only on i486 and up.
- */
--
- struct flush_tlb_info {
- struct mm_struct *mm;
- unsigned long start;
---- a/arch/x86/mm/tlb.c
-+++ b/arch/x86/mm/tlb.c
-@@ -216,22 +216,9 @@ void switch_mm_irqs_off(struct mm_struct
- * write/read ordering problems.
- */
-
--/*
-- * TLB flush funcation:
-- * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
-- * 2) Leave the mm if we are in the lazy tlb mode.
-- */
--static void flush_tlb_func(void *info)
-+static void flush_tlb_func_common(const struct flush_tlb_info *f,
-+ bool local, enum tlb_flush_reason reason)
- {
-- const struct flush_tlb_info *f = info;
--
-- inc_irq_stat(irq_tlb_count);
--
-- if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.active_mm))
-- return;
--
-- count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
--
- if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
- leave_mm(smp_processor_id());
- return;
-@@ -239,7 +226,9 @@ static void flush_tlb_func(void *info)
-
- if (f->end == TLB_FLUSH_ALL) {
- local_flush_tlb();
-- trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL);
-+ if (local)
-+ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
-+ trace_tlb_flush(reason, TLB_FLUSH_ALL);
- } else {
- unsigned long addr;
- unsigned long nr_pages =
-@@ -249,10 +238,32 @@ static void flush_tlb_func(void *info)
- __flush_tlb_single(addr);
- addr += PAGE_SIZE;
- }
-- trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages);
-+ if (local)
-+ count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages);
-+ trace_tlb_flush(reason, nr_pages);
- }
- }
-
-+static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
-+{
-+ const struct flush_tlb_info *f = info;
-+
-+ flush_tlb_func_common(f, true, reason);
-+}
-+
-+static void flush_tlb_func_remote(void *info)
-+{
-+ const struct flush_tlb_info *f = info;
-+
-+ inc_irq_stat(irq_tlb_count);
-+
-+ if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.active_mm))
-+ return;
-+
-+ count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
-+ flush_tlb_func_common(f, false, TLB_REMOTE_SHOOTDOWN);
-+}
-+
- void native_flush_tlb_others(const struct cpumask *cpumask,
- const struct flush_tlb_info *info)
- {
-@@ -269,11 +280,11 @@ void native_flush_tlb_others(const struc
- cpu = smp_processor_id();
- cpumask = uv_flush_tlb_others(cpumask, info);
- if (cpumask)
-- smp_call_function_many(cpumask, flush_tlb_func,
-+ smp_call_function_many(cpumask, flush_tlb_func_remote,
- (void *)info, 1);
- return;
- }
-- smp_call_function_many(cpumask, flush_tlb_func,
-+ smp_call_function_many(cpumask, flush_tlb_func_remote,
- (void *)info, 1);
- }
-
-@@ -315,59 +326,33 @@ static unsigned long tlb_single_page_flu
- void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
- unsigned long end, unsigned long vmflag)
- {
-- unsigned long addr;
-- struct flush_tlb_info info;
-- /* do a global flush by default */
-- unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
--
-- preempt_disable();
-- if (current->active_mm != mm) {
-- /* Synchronize with switch_mm. */
-- smp_mb();
-+ int cpu;
-
-- goto out;
-- }
--
-- if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
-- leave_mm(smp_processor_id());
--
-- /* Synchronize with switch_mm. */
-- smp_mb();
-+ struct flush_tlb_info info = {
-+ .mm = mm,
-+ };
-
-- goto out;
-- }
-+ cpu = get_cpu();
-
-- if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
-- base_pages_to_flush = (end - start) >> PAGE_SHIFT;
-+ /* Synchronize with switch_mm. */
-+ smp_mb();
-
-- /*
-- * Both branches below are implicit full barriers (MOV to CR or
-- * INVLPG) that synchronize with switch_mm.
-- */
-- if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
-- base_pages_to_flush = TLB_FLUSH_ALL;
-- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
-- local_flush_tlb();
-+ /* Should we flush just the requested range? */
-+ if ((end != TLB_FLUSH_ALL) &&
-+ !(vmflag & VM_HUGETLB) &&
-+ ((end - start) >> PAGE_SHIFT) <= tlb_single_page_flush_ceiling) {
-+ info.start = start;
-+ info.end = end;
- } else {
-- /* flush range by one by one 'invlpg' */
-- for (addr = start; addr < end; addr += PAGE_SIZE) {
-- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE);
-- __flush_tlb_single(addr);
-- }
-- }
-- trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush);
--out:
-- info.mm = mm;
-- if (base_pages_to_flush == TLB_FLUSH_ALL) {
- info.start = 0UL;
- info.end = TLB_FLUSH_ALL;
-- } else {
-- info.start = start;
-- info.end = end;
- }
-- if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
-+
-+ if (mm == current->active_mm)
-+ flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
-+ if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), &info);
-- preempt_enable();
-+ put_cpu();
- }
-
-
+++ /dev/null
-From ca6c99c0794875c6d1db6e22f246699691ab7e6b Mon Sep 17 00:00:00 2001
-From: Andy Lutomirski <luto@kernel.org>
-Date: Mon, 22 May 2017 15:30:01 -0700
-Subject: x86/mm: Reimplement flush_tlb_page() using flush_tlb_mm_range()
-
-From: Andy Lutomirski <luto@kernel.org>
-
-commit ca6c99c0794875c6d1db6e22f246699691ab7e6b upstream.
-
-flush_tlb_page() was very similar to flush_tlb_mm_range() except that
-it had a couple of issues:
-
- - It was missing an smp_mb() in the case where
- current->active_mm != mm. (This is a longstanding bug reported by Nadav Amit)
-
- - It was missing tracepoints and vm counter updates.
-
-The only reason that I can see for keeping it as a separate
-function is that it could avoid a few branches that
-flush_tlb_mm_range() needs to decide to flush just one page. This
-hardly seems worthwhile. If we decide we want to get rid of those
-branches again, a better way would be to introduce an
-__flush_tlb_mm_range() helper and make both flush_tlb_page() and
-flush_tlb_mm_range() use it.
-
-Signed-off-by: Andy Lutomirski <luto@kernel.org>
-Acked-by: Kees Cook <keescook@chromium.org>
-Cc: Andrew Morton <akpm@linux-foundation.org>
-Cc: Borislav Petkov <bpetkov@suse.de>
-Cc: Dave Hansen <dave.hansen@intel.com>
-Cc: Linus Torvalds <torvalds@linux-foundation.org>
-Cc: Mel Gorman <mgorman@suse.de>
-Cc: Michal Hocko <mhocko@suse.com>
-Cc: Nadav Amit <nadav.amit@gmail.com>
-Cc: Nadav Amit <namit@vmware.com>
-Cc: Peter Zijlstra <peterz@infradead.org>
-Cc: Rik van Riel <riel@redhat.com>
-Cc: Thomas Gleixner <tglx@linutronix.de>
-Cc: linux-mm@kvack.org
-Link: http://lkml.kernel.org/r/3cc3847cf888d8907577569b8bac3f01992ef8f9.1495492063.git.luto@kernel.org
-Signed-off-by: Ingo Molnar <mingo@kernel.org>
-Signed-off-by: Eduardo Valentin <eduval@amazon.com>
-Signed-off-by: Eduardo Valentin <edubezval@gmail.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- arch/x86/include/asm/tlbflush.h | 5 ++++-
- arch/x86/mm/tlb.c | 27 ---------------------------
- 2 files changed, 4 insertions(+), 28 deletions(-)
-
---- a/arch/x86/include/asm/tlbflush.h
-+++ b/arch/x86/include/asm/tlbflush.h
-@@ -304,12 +304,15 @@ static inline void flush_tlb_kernel_rang
-
- extern void flush_tlb_all(void);
- extern void flush_tlb_current_task(void);
--extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
- extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
- unsigned long end, unsigned long vmflag);
- extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
-
- #define flush_tlb() flush_tlb_current_task()
-+static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
-+{
-+ flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE);
-+}
-
- void native_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm,
---- a/arch/x86/mm/tlb.c
-+++ b/arch/x86/mm/tlb.c
-@@ -369,33 +369,6 @@ out:
- preempt_enable();
- }
-
--void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
--{
-- struct mm_struct *mm = vma->vm_mm;
--
-- preempt_disable();
--
-- if (current->active_mm == mm) {
-- if (current->mm) {
-- /*
-- * Implicit full barrier (INVLPG) that synchronizes
-- * with switch_mm.
-- */
-- __flush_tlb_one(start);
-- } else {
-- leave_mm(smp_processor_id());
--
-- /* Synchronize with switch_mm. */
-- smp_mb();
-- }
-- }
--
-- if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
-- flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE);
--
-- preempt_enable();
--}
--
- static void do_flush_tlb_all(void *info)
- {
- count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+++ /dev/null
-From ce4a4e565f5264909a18c733b864c3f74467f69e Mon Sep 17 00:00:00 2001
-From: Andy Lutomirski <luto@kernel.org>
-Date: Sun, 28 May 2017 10:00:14 -0700
-Subject: x86/mm: Remove the UP asm/tlbflush.h code, always use the (formerly) SMP code
-
-From: Andy Lutomirski <luto@kernel.org>
-
-commit ce4a4e565f5264909a18c733b864c3f74467f69e upstream.
-
-The UP asm/tlbflush.h generates somewhat nicer code than the SMP version.
-Aside from that, it's fallen quite a bit behind the SMP code:
-
- - flush_tlb_mm_range() didn't flush individual pages if the range
- was small.
-
- - The lazy TLB code was much weaker. This usually wouldn't matter,
- but, if a kernel thread flushed its lazy "active_mm" more than
- once (due to reclaim or similar), it wouldn't be unlazied and
- would instead pointlessly flush repeatedly.
-
- - Tracepoints were missing.
-
-Aside from that, simply having the UP code around was a maintenance
-burden, since it means that any change to the TLB flush code had to
-make sure not to break it.
-
-Simplify everything by deleting the UP code.
-
-Signed-off-by: Andy Lutomirski <luto@kernel.org>
-Cc: Andrew Morton <akpm@linux-foundation.org>
-Cc: Arjan van de Ven <arjan@linux.intel.com>
-Cc: Borislav Petkov <bpetkov@suse.de>
-Cc: Dave Hansen <dave.hansen@intel.com>
-Cc: Linus Torvalds <torvalds@linux-foundation.org>
-Cc: Mel Gorman <mgorman@suse.de>
-Cc: Michal Hocko <mhocko@suse.com>
-Cc: Nadav Amit <nadav.amit@gmail.com>
-Cc: Nadav Amit <namit@vmware.com>
-Cc: Peter Zijlstra <peterz@infradead.org>
-Cc: Rik van Riel <riel@redhat.com>
-Cc: Thomas Gleixner <tglx@linutronix.de>
-Cc: linux-mm@kvack.org
-Signed-off-by: Ingo Molnar <mingo@kernel.org>
-Signed-off-by: Eduardo Valentin <eduval@amazon.com>
-Signed-off-by: Eduardo Valentin <edubezval@gmail.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- arch/x86/Kconfig | 2
- arch/x86/include/asm/hardirq.h | 2
- arch/x86/include/asm/mmu.h | 6 --
- arch/x86/include/asm/mmu_context.h | 2
- arch/x86/include/asm/tlbbatch.h | 2
- arch/x86/include/asm/tlbflush.h | 81 -------------------------------------
- arch/x86/mm/init.c | 2
- arch/x86/mm/tlb.c | 17 -------
- 8 files changed, 5 insertions(+), 109 deletions(-)
-
---- a/arch/x86/Kconfig
-+++ b/arch/x86/Kconfig
-@@ -45,7 +45,7 @@ config X86
- select ARCH_USE_CMPXCHG_LOCKREF if X86_64
- select ARCH_USE_QUEUED_RWLOCKS
- select ARCH_USE_QUEUED_SPINLOCKS
-- select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
-+ select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
- select ARCH_WANTS_DYNAMIC_TASK_STRUCT
- select ARCH_WANT_FRAME_POINTERS
- select ARCH_WANT_IPC_PARSE_VERSION if X86_32
---- a/arch/x86/include/asm/hardirq.h
-+++ b/arch/x86/include/asm/hardirq.h
-@@ -22,8 +22,8 @@ typedef struct {
- #ifdef CONFIG_SMP
- unsigned int irq_resched_count;
- unsigned int irq_call_count;
-- unsigned int irq_tlb_count;
- #endif
-+ unsigned int irq_tlb_count;
- #ifdef CONFIG_X86_THERMAL_VECTOR
- unsigned int irq_thermal_count;
- #endif
---- a/arch/x86/include/asm/mmu.h
-+++ b/arch/x86/include/asm/mmu.h
-@@ -33,12 +33,6 @@ typedef struct {
- #endif
- } mm_context_t;
-
--#ifdef CONFIG_SMP
- void leave_mm(int cpu);
--#else
--static inline void leave_mm(int cpu)
--{
--}
--#endif
-
- #endif /* _ASM_X86_MMU_H */
---- a/arch/x86/include/asm/mmu_context.h
-+++ b/arch/x86/include/asm/mmu_context.h
-@@ -99,10 +99,8 @@ static inline void load_mm_ldt(struct mm
-
- static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
- {
--#ifdef CONFIG_SMP
- if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
- this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
--#endif
- }
-
- static inline int init_new_context(struct task_struct *tsk,
---- a/arch/x86/include/asm/tlbbatch.h
-+++ b/arch/x86/include/asm/tlbbatch.h
-@@ -3,7 +3,6 @@
-
- #include <linux/cpumask.h>
-
--#ifdef CONFIG_SMP
- struct arch_tlbflush_unmap_batch {
- /*
- * Each bit set is a CPU that potentially has a TLB entry for one of
-@@ -11,6 +10,5 @@ struct arch_tlbflush_unmap_batch {
- */
- struct cpumask cpumask;
- };
--#endif
-
- #endif /* _ARCH_X86_TLBBATCH_H */
---- a/arch/x86/include/asm/tlbflush.h
-+++ b/arch/x86/include/asm/tlbflush.h
-@@ -7,6 +7,7 @@
- #include <asm/processor.h>
- #include <asm/cpufeature.h>
- #include <asm/special_insns.h>
-+#include <asm/smp.h>
-
- static inline void __invpcid(unsigned long pcid, unsigned long addr,
- unsigned long type)
-@@ -65,10 +66,8 @@ static inline void invpcid_flush_all_non
- #endif
-
- struct tlb_state {
--#ifdef CONFIG_SMP
- struct mm_struct *active_mm;
- int state;
--#endif
-
- /*
- * Access to this CR4 shadow and to H/W CR4 is protected by
-@@ -222,82 +221,6 @@ struct flush_tlb_info {
- unsigned long end;
- };
-
--#ifndef CONFIG_SMP
--
--/* "_up" is for UniProcessor.
-- *
-- * This is a helper for other header functions. *Not* intended to be called
-- * directly. All global TLB flushes need to either call this, or to bump the
-- * vm statistics themselves.
-- */
--static inline void __flush_tlb_up(void)
--{
-- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
-- __flush_tlb();
--}
--
--static inline void flush_tlb_all(void)
--{
-- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
-- __flush_tlb_all();
--}
--
--static inline void flush_tlb(void)
--{
-- __flush_tlb_up();
--}
--
--static inline void local_flush_tlb(void)
--{
-- __flush_tlb_up();
--}
--
--static inline void flush_tlb_mm(struct mm_struct *mm)
--{
-- if (mm == current->active_mm)
-- __flush_tlb_up();
--}
--
--static inline void flush_tlb_page(struct vm_area_struct *vma,
-- unsigned long addr)
--{
-- if (vma->vm_mm == current->active_mm)
-- __flush_tlb_one(addr);
--}
--
--static inline void flush_tlb_range(struct vm_area_struct *vma,
-- unsigned long start, unsigned long end)
--{
-- if (vma->vm_mm == current->active_mm)
-- __flush_tlb_up();
--}
--
--static inline void flush_tlb_mm_range(struct mm_struct *mm,
-- unsigned long start, unsigned long end, unsigned long vmflag)
--{
-- if (mm == current->active_mm)
-- __flush_tlb_up();
--}
--
--static inline void native_flush_tlb_others(const struct cpumask *cpumask,
-- const struct flush_tlb_info *info)
--{
--}
--
--static inline void reset_lazy_tlbstate(void)
--{
--}
--
--static inline void flush_tlb_kernel_range(unsigned long start,
-- unsigned long end)
--{
-- flush_tlb_all();
--}
--
--#else /* SMP */
--
--#include <asm/smp.h>
--
- #define local_flush_tlb() __flush_tlb()
-
- #define flush_tlb_mm(mm) flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL)
-@@ -337,8 +260,6 @@ static inline void arch_tlbbatch_add_mm(
-
- extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
-
--#endif /* SMP */
--
- #ifndef CONFIG_PARAVIRT
- #define flush_tlb_others(mask, info) \
- native_flush_tlb_others(mask, info)
---- a/arch/x86/mm/init.c
-+++ b/arch/x86/mm/init.c
-@@ -764,10 +764,8 @@ void __init zone_sizes_init(void)
- }
-
- DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
--#ifdef CONFIG_SMP
- .active_mm = &init_mm,
- .state = 0,
--#endif
- .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
- };
- EXPORT_SYMBOL_GPL(cpu_tlbstate);
---- a/arch/x86/mm/tlb.c
-+++ b/arch/x86/mm/tlb.c
-@@ -15,7 +15,7 @@
- #include <linux/debugfs.h>
-
- /*
-- * Smarter SMP flushing macros.
-+ * TLB flushing, formerly SMP-only
- * c/o Linus Torvalds.
- *
- * These mean you can really definitely utterly forget about
-@@ -28,8 +28,6 @@
- * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
- */
-
--#ifdef CONFIG_SMP
--
- /*
- * We cannot call mmdrop() because we are in interrupt context,
- * instead update mm->cpu_vm_mask.
-@@ -53,8 +51,6 @@ void leave_mm(int cpu)
- }
- EXPORT_SYMBOL_GPL(leave_mm);
-
--#endif /* CONFIG_SMP */
--
- void switch_mm(struct mm_struct *prev, struct mm_struct *next,
- struct task_struct *tsk)
- {
-@@ -85,10 +81,8 @@ void switch_mm_irqs_off(struct mm_struct
- set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
- }
-
--#ifdef CONFIG_SMP
- this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
- this_cpu_write(cpu_tlbstate.active_mm, next);
--#endif
-
- cpumask_set_cpu(cpu, mm_cpumask(next));
-
-@@ -146,9 +140,7 @@ void switch_mm_irqs_off(struct mm_struct
- if (unlikely(prev->context.ldt != next->context.ldt))
- load_mm_ldt(next);
- #endif
-- }
--#ifdef CONFIG_SMP
-- else {
-+ } else {
- this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
- BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
-
-@@ -175,11 +167,8 @@ void switch_mm_irqs_off(struct mm_struct
- load_mm_ldt(next);
- }
- }
--#endif
- }
-
--#ifdef CONFIG_SMP
--
- /*
- * The flush IPI assumes that a thread switch happens in this order:
- * [cpu0: the cpu that switches]
-@@ -459,5 +448,3 @@ static int __init create_tlb_single_page
- return 0;
- }
- late_initcall(create_tlb_single_page_flush_ceiling);
--
--#endif /* CONFIG_SMP */
+++ /dev/null
-From 3d28ebceaffab40f30afa87e33331560148d7b8b Mon Sep 17 00:00:00 2001
-From: Andy Lutomirski <luto@kernel.org>
-Date: Sun, 28 May 2017 10:00:15 -0700
-Subject: x86/mm: Rework lazy TLB to track the actual loaded mm
-
-From: Andy Lutomirski <luto@kernel.org>
-
-commit 3d28ebceaffab40f30afa87e33331560148d7b8b upstream.
-
-Lazy TLB state is currently managed in a rather baroque manner.
-AFAICT, there are three possible states:
-
- - Non-lazy. This means that we're running a user thread or a
- kernel thread that has called use_mm(). current->mm ==
- current->active_mm == cpu_tlbstate.active_mm and
- cpu_tlbstate.state == TLBSTATE_OK.
-
- - Lazy with user mm. We're running a kernel thread without an mm
- and we're borrowing an mm_struct. We have current->mm == NULL,
- current->active_mm == cpu_tlbstate.active_mm, cpu_tlbstate.state
- != TLBSTATE_OK (i.e. TLBSTATE_LAZY or 0). The current cpu is set
- in mm_cpumask(current->active_mm). CR3 points to
- current->active_mm->pgd. The TLB is up to date.
-
- - Lazy with init_mm. This happens when we call leave_mm(). We
- have current->mm == NULL, current->active_mm ==
- cpu_tlbstate.active_mm, but that mm is only relevant insofar as
- the scheduler is tracking it for refcounting. cpu_tlbstate.state
- != TLBSTATE_OK. The current cpu is clear in
- mm_cpumask(current->active_mm). CR3 points to swapper_pg_dir,
- i.e. init_mm->pgd.
-
-This patch simplifies the situation. Other than perf, x86 stops
-caring about current->active_mm at all. We have
-cpu_tlbstate.loaded_mm pointing to the mm that CR3 references. The
-TLB is always up to date for that mm. leave_mm() just switches us
-to init_mm. There are no longer any special cases for mm_cpumask,
-and switch_mm() switches mms without worrying about laziness.
-
-After this patch, cpu_tlbstate.state serves only to tell the TLB
-flush code whether it may switch to init_mm instead of doing a
-normal flush.
-
-This makes fairly extensive changes to xen_exit_mmap(), which used
-to look a bit like black magic.
-
-Perf is unchanged. With or without this change, perf may behave a bit
-erratically if it tries to read user memory in kernel thread context.
-We should build on this patch to teach perf to never look at user
-memory when cpu_tlbstate.loaded_mm != current->mm.
-
-Signed-off-by: Andy Lutomirski <luto@kernel.org>
-Cc: Andrew Morton <akpm@linux-foundation.org>
-Cc: Arjan van de Ven <arjan@linux.intel.com>
-Cc: Borislav Petkov <bpetkov@suse.de>
-Cc: Dave Hansen <dave.hansen@intel.com>
-Cc: Linus Torvalds <torvalds@linux-foundation.org>
-Cc: Mel Gorman <mgorman@suse.de>
-Cc: Michal Hocko <mhocko@suse.com>
-Cc: Nadav Amit <nadav.amit@gmail.com>
-Cc: Nadav Amit <namit@vmware.com>
-Cc: Peter Zijlstra <peterz@infradead.org>
-Cc: Rik van Riel <riel@redhat.com>
-Cc: Thomas Gleixner <tglx@linutronix.de>
-Cc: linux-mm@kvack.org
-Signed-off-by: Ingo Molnar <mingo@kernel.org>
-Signed-off-by: Eduardo Valentin <eduval@amazon.com>
-Signed-off-by: Eduardo Valentin <edubezval@gmail.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- arch/x86/events/core.c | 3
- arch/x86/include/asm/tlbflush.h | 12 +-
- arch/x86/kernel/ldt.c | 7 -
- arch/x86/mm/init.c | 2
- arch/x86/mm/tlb.c | 216 ++++++++++++++++++++--------------------
- arch/x86/xen/mmu.c | 51 ++++-----
- 6 files changed, 147 insertions(+), 144 deletions(-)
-
---- a/arch/x86/events/core.c
-+++ b/arch/x86/events/core.c
-@@ -2100,8 +2100,7 @@ static int x86_pmu_event_init(struct per
-
- static void refresh_pce(void *ignored)
- {
-- if (current->active_mm)
-- load_mm_cr4(current->active_mm);
-+ load_mm_cr4(this_cpu_read(cpu_tlbstate.loaded_mm));
- }
-
- static void x86_pmu_event_mapped(struct perf_event *event)
---- a/arch/x86/include/asm/tlbflush.h
-+++ b/arch/x86/include/asm/tlbflush.h
-@@ -66,7 +66,13 @@ static inline void invpcid_flush_all_non
- #endif
-
- struct tlb_state {
-- struct mm_struct *active_mm;
-+ /*
-+ * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts
-+ * are on. This means that it may not match current->active_mm,
-+ * which will contain the previous user mm when we're in lazy TLB
-+ * mode even if we've already switched back to swapper_pg_dir.
-+ */
-+ struct mm_struct *loaded_mm;
- int state;
-
- /*
-@@ -249,7 +255,9 @@ void native_flush_tlb_others(const struc
- static inline void reset_lazy_tlbstate(void)
- {
- this_cpu_write(cpu_tlbstate.state, 0);
-- this_cpu_write(cpu_tlbstate.active_mm, &init_mm);
-+ this_cpu_write(cpu_tlbstate.loaded_mm, &init_mm);
-+
-+ WARN_ON(read_cr3() != __pa_symbol(swapper_pg_dir));
- }
-
- static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
---- a/arch/x86/kernel/ldt.c
-+++ b/arch/x86/kernel/ldt.c
-@@ -23,14 +23,15 @@
- #include <asm/syscalls.h>
-
- /* context.lock is held for us, so we don't need any locking. */
--static void flush_ldt(void *current_mm)
-+static void flush_ldt(void *__mm)
- {
-+ struct mm_struct *mm = __mm;
- mm_context_t *pc;
-
-- if (current->active_mm != current_mm)
-+ if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
- return;
-
-- pc = ¤t->active_mm->context;
-- pc = &current->active_mm->context;
- set_ldt(pc->ldt->entries, pc->ldt->size);
- }
-
---- a/arch/x86/mm/init.c
-+++ b/arch/x86/mm/init.c
-@@ -764,7 +764,7 @@ void __init zone_sizes_init(void)
- }
-
- DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
-- .active_mm = &init_mm,
-+ .loaded_mm = &init_mm,
- .state = 0,
- .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
- };
---- a/arch/x86/mm/tlb.c
-+++ b/arch/x86/mm/tlb.c
-@@ -28,26 +28,25 @@
- * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
- */
-
--/*
-- * We cannot call mmdrop() because we are in interrupt context,
-- * instead update mm->cpu_vm_mask.
-- */
- void leave_mm(int cpu)
- {
-- struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm);
-+ struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
-+
-+ /*
-+ * It's plausible that we're in lazy TLB mode while our mm is init_mm.
-+ * If so, our callers still expect us to flush the TLB, but there
-+ * aren't any user TLB entries in init_mm to worry about.
-+ *
-+ * This needs to happen before any other sanity checks due to
-+ * intel_idle's shenanigans.
-+ */
-+ if (loaded_mm == &init_mm)
-+ return;
-+
- if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
- BUG();
-- if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
-- cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
-- load_cr3(swapper_pg_dir);
-- /*
-- * This gets called in the idle path where RCU
-- * functions differently. Tracing normally
-- * uses RCU, so we have to call the tracepoint
-- * specially here.
-- */
-- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
-- }
-+
-+ switch_mm(NULL, &init_mm, NULL);
- }
- EXPORT_SYMBOL_GPL(leave_mm);
-
-@@ -65,108 +64,109 @@ void switch_mm_irqs_off(struct mm_struct
- struct task_struct *tsk)
- {
- unsigned cpu = smp_processor_id();
-+ struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
-
-- if (likely(prev != next)) {
-- if (IS_ENABLED(CONFIG_VMAP_STACK)) {
-- /*
-- * If our current stack is in vmalloc space and isn't
-- * mapped in the new pgd, we'll double-fault. Forcibly
-- * map it.
-- */
-- unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
--
-- pgd_t *pgd = next->pgd + stack_pgd_index;
--
-- if (unlikely(pgd_none(*pgd)))
-- set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
-- }
-+ /*
-+ * NB: The scheduler will call us with prev == next when
-+ * switching from lazy TLB mode to normal mode if active_mm
-+ * isn't changing. When this happens, there is no guarantee
-+ * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next.
-+ *
-+ * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
-+ */
-
-- this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
-- this_cpu_write(cpu_tlbstate.active_mm, next);
-+ this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
-
-- cpumask_set_cpu(cpu, mm_cpumask(next));
-+ if (real_prev == next) {
-+ /*
-+ * There's nothing to do: we always keep the per-mm control
-+ * regs in sync with cpu_tlbstate.loaded_mm. Just
-+ * sanity-check mm_cpumask.
-+ */
-+ if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next))))
-+ cpumask_set_cpu(cpu, mm_cpumask(next));
-+ return;
-+ }
-
-+ if (IS_ENABLED(CONFIG_VMAP_STACK)) {
- /*
-- * Re-load page tables.
-- *
-- * This logic has an ordering constraint:
-- *
-- * CPU 0: Write to a PTE for 'next'
-- * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
-- * CPU 1: set bit 1 in next's mm_cpumask
-- * CPU 1: load from the PTE that CPU 0 writes (implicit)
-- *
-- * We need to prevent an outcome in which CPU 1 observes
-- * the new PTE value and CPU 0 observes bit 1 clear in
-- * mm_cpumask. (If that occurs, then the IPI will never
-- * be sent, and CPU 0's TLB will contain a stale entry.)
-- *
-- * The bad outcome can occur if either CPU's load is
-- * reordered before that CPU's store, so both CPUs must
-- * execute full barriers to prevent this from happening.
-- *
-- * Thus, switch_mm needs a full barrier between the
-- * store to mm_cpumask and any operation that could load
-- * from next->pgd. TLB fills are special and can happen
-- * due to instruction fetches or for no reason at all,
-- * and neither LOCK nor MFENCE orders them.
-- * Fortunately, load_cr3() is serializing and gives the
-- * ordering guarantee we need.
-- *
-+ * If our current stack is in vmalloc space and isn't
-+ * mapped in the new pgd, we'll double-fault. Forcibly
-+ * map it.
- */
-- load_cr3(next->pgd);
-+ unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
-
-- trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
-+ pgd_t *pgd = next->pgd + stack_pgd_index;
-
-- /* Stop flush ipis for the previous mm */
-- cpumask_clear_cpu(cpu, mm_cpumask(prev));
-+ if (unlikely(pgd_none(*pgd)))
-+ set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
-+ }
-
-- /* Load per-mm CR4 state */
-- load_mm_cr4(next);
-+ this_cpu_write(cpu_tlbstate.loaded_mm, next);
-
--#ifdef CONFIG_MODIFY_LDT_SYSCALL
-- /*
-- * Load the LDT, if the LDT is different.
-- *
-- * It's possible that prev->context.ldt doesn't match
-- * the LDT register. This can happen if leave_mm(prev)
-- * was called and then modify_ldt changed
-- * prev->context.ldt but suppressed an IPI to this CPU.
-- * In this case, prev->context.ldt != NULL, because we
-- * never set context.ldt to NULL while the mm still
-- * exists. That means that next->context.ldt !=
-- * prev->context.ldt, because mms never share an LDT.
-- */
-- if (unlikely(prev->context.ldt != next->context.ldt))
-- load_mm_ldt(next);
--#endif
-- } else {
-- this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
-- BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
-+ WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
-+ cpumask_set_cpu(cpu, mm_cpumask(next));
-
-- if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
-- /*
-- * On established mms, the mm_cpumask is only changed
-- * from irq context, from ptep_clear_flush() while in
-- * lazy tlb mode, and here. Irqs are blocked during
-- * schedule, protecting us from simultaneous changes.
-- */
-- cpumask_set_cpu(cpu, mm_cpumask(next));
-+ /*
-+ * Re-load page tables.
-+ *
-+ * This logic has an ordering constraint:
-+ *
-+ * CPU 0: Write to a PTE for 'next'
-+ * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
-+ * CPU 1: set bit 1 in next's mm_cpumask
-+ * CPU 1: load from the PTE that CPU 0 writes (implicit)
-+ *
-+ * We need to prevent an outcome in which CPU 1 observes
-+ * the new PTE value and CPU 0 observes bit 1 clear in
-+ * mm_cpumask. (If that occurs, then the IPI will never
-+ * be sent, and CPU 0's TLB will contain a stale entry.)
-+ *
-+ * The bad outcome can occur if either CPU's load is
-+ * reordered before that CPU's store, so both CPUs must
-+ * execute full barriers to prevent this from happening.
-+ *
-+ * Thus, switch_mm needs a full barrier between the
-+ * store to mm_cpumask and any operation that could load
-+ * from next->pgd. TLB fills are special and can happen
-+ * due to instruction fetches or for no reason at all,
-+ * and neither LOCK nor MFENCE orders them.
-+ * Fortunately, load_cr3() is serializing and gives the
-+ * ordering guarantee we need.
-+ */
-+ load_cr3(next->pgd);
-+
-+ /*
-+ * This gets called via leave_mm() in the idle path where RCU
-+ * functions differently. Tracing normally uses RCU, so we have to
-+ * call the tracepoint specially here.
-+ */
-+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
-+
-+ /* Stop flush ipis for the previous mm */
-+ WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
-+ real_prev != &init_mm);
-+ cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
-
-- /*
-- * We were in lazy tlb mode and leave_mm disabled
-- * tlb flush IPI delivery. We must reload CR3
-- * to make sure to use no freed page tables.
-- *
-- * As above, load_cr3() is serializing and orders TLB
-- * fills with respect to the mm_cpumask write.
-- */
-- load_cr3(next->pgd);
-- trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
-- load_mm_cr4(next);
-- load_mm_ldt(next);
-- }
-- }
-+ /* Load per-mm CR4 state */
-+ load_mm_cr4(next);
-+
-+#ifdef CONFIG_MODIFY_LDT_SYSCALL
-+ /*
-+ * Load the LDT, if the LDT is different.
-+ *
-+ * It's possible that prev->context.ldt doesn't match
-+ * the LDT register. This can happen if leave_mm(prev)
-+ * was called and then modify_ldt changed
-+ * prev->context.ldt but suppressed an IPI to this CPU.
-+ * In this case, prev->context.ldt != NULL, because we
-+ * never set context.ldt to NULL while the mm still
-+ * exists. That means that next->context.ldt !=
-+ * prev->context.ldt, because mms never share an LDT.
-+ */
-+ if (unlikely(real_prev->context.ldt != next->context.ldt))
-+ load_mm_ldt(next);
-+#endif
- }
-
- /*
-@@ -246,7 +246,7 @@ static void flush_tlb_func_remote(void *
-
- inc_irq_stat(irq_tlb_count);
-
-- if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.active_mm))
-+ if (f->mm && f->mm != this_cpu_read(cpu_tlbstate.loaded_mm))
- return;
-
- count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
-@@ -337,7 +337,7 @@ void flush_tlb_mm_range(struct mm_struct
- info.end = TLB_FLUSH_ALL;
- }
-
-- if (mm == current->active_mm)
-+ if (mm == this_cpu_read(cpu_tlbstate.loaded_mm))
- flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN);
- if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), &info);
---- a/arch/x86/xen/mmu.c
-+++ b/arch/x86/xen/mmu.c
-@@ -998,37 +998,32 @@ static void xen_dup_mmap(struct mm_struc
- spin_unlock(&mm->page_table_lock);
- }
-
--
--#ifdef CONFIG_SMP
--/* Another cpu may still have their %cr3 pointing at the pagetable, so
-- we need to repoint it somewhere else before we can unpin it. */
--static void drop_other_mm_ref(void *info)
-+static void drop_mm_ref_this_cpu(void *info)
- {
- struct mm_struct *mm = info;
-- struct mm_struct *active_mm;
--
-- active_mm = this_cpu_read(cpu_tlbstate.active_mm);
-
-- if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
-+ if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm)
- leave_mm(smp_processor_id());
-
-- /* If this cpu still has a stale cr3 reference, then make sure
-- it has been flushed. */
-+ /*
-+ * If this cpu still has a stale cr3 reference, then make sure
-+ * it has been flushed.
-+ */
- if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
-- load_cr3(swapper_pg_dir);
-+ xen_mc_flush();
- }
-
-+#ifdef CONFIG_SMP
-+/*
-+ * Another cpu may still have their %cr3 pointing at the pagetable, so
-+ * we need to repoint it somewhere else before we can unpin it.
-+ */
- static void xen_drop_mm_ref(struct mm_struct *mm)
- {
- cpumask_var_t mask;
- unsigned cpu;
-
-- if (current->active_mm == mm) {
-- if (current->mm == mm)
-- load_cr3(swapper_pg_dir);
-- else
-- leave_mm(smp_processor_id());
-- }
-+ drop_mm_ref_this_cpu(mm);
-
- /* Get the "official" set of cpus referring to our pagetable. */
- if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
-@@ -1036,31 +1031,31 @@ static void xen_drop_mm_ref(struct mm_st
- if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
- && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
- continue;
-- smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
-+ smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1);
- }
- return;
- }
- cpumask_copy(mask, mm_cpumask(mm));
-
-- /* It's possible that a vcpu may have a stale reference to our
-- cr3, because its in lazy mode, and it hasn't yet flushed
-- its set of pending hypercalls yet. In this case, we can
-- look at its actual current cr3 value, and force it to flush
-- if needed. */
-+ /*
-+ * It's possible that a vcpu may have a stale reference to our
-+ * cr3, because its in lazy mode, and it hasn't yet flushed
-+ * its set of pending hypercalls yet. In this case, we can
-+ * look at its actual current cr3 value, and force it to flush
-+ * if needed.
-+ */
- for_each_online_cpu(cpu) {
- if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
- cpumask_set_cpu(cpu, mask);
- }
-
-- if (!cpumask_empty(mask))
-- smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
-+ smp_call_function_many(mask, drop_mm_ref_this_cpu, mm, 1);
- free_cpumask_var(mask);
- }
- #else
- static void xen_drop_mm_ref(struct mm_struct *mm)
- {
-- if (current->active_mm == mm)
-- load_cr3(swapper_pg_dir);
-+ drop_mm_ref_this_cpu(mm);
- }
- #endif
-
+++ /dev/null
-From 3f79e4c7c9c2f5c30751ea5c8dd9fd1d56b81947 Mon Sep 17 00:00:00 2001
-From: Andy Lutomirski <luto@kernel.org>
-Date: Sun, 28 May 2017 10:00:13 -0700
-Subject: x86/mm: Use new merged flush logic in arch_tlbbatch_flush()
-
-From: Andy Lutomirski <luto@kernel.org>
-
-commit 3f79e4c7c9c2f5c30751ea5c8dd9fd1d56b81947 upstream.
-
-Now there's only one copy of the local tlb flush logic for
-non-kernel pages on SMP kernels.
-
-The only functional change is that arch_tlbbatch_flush() will now
-leave_mm() on the local CPU if that CPU is in the batch and is in
-TLBSTATE_LAZY mode.
-
-Signed-off-by: Andy Lutomirski <luto@kernel.org>
-Cc: Andrew Morton <akpm@linux-foundation.org>
-Cc: Arjan van de Ven <arjan@linux.intel.com>
-Cc: Borislav Petkov <bpetkov@suse.de>
-Cc: Dave Hansen <dave.hansen@intel.com>
-Cc: Linus Torvalds <torvalds@linux-foundation.org>
-Cc: Mel Gorman <mgorman@suse.de>
-Cc: Michal Hocko <mhocko@suse.com>
-Cc: Nadav Amit <nadav.amit@gmail.com>
-Cc: Nadav Amit <namit@vmware.com>
-Cc: Peter Zijlstra <peterz@infradead.org>
-Cc: Rik van Riel <riel@redhat.com>
-Cc: Thomas Gleixner <tglx@linutronix.de>
-Cc: linux-mm@kvack.org
-Signed-off-by: Ingo Molnar <mingo@kernel.org>
-Signed-off-by: Eduardo Valentin <eduval@amazon.com>
-Signed-off-by: Eduardo Valentin <edubezval@gmail.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- arch/x86/mm/tlb.c | 8 ++------
- 1 file changed, 2 insertions(+), 6 deletions(-)
-
---- a/arch/x86/mm/tlb.c
-+++ b/arch/x86/mm/tlb.c
-@@ -405,12 +405,8 @@ void arch_tlbbatch_flush(struct arch_tlb
-
- int cpu = get_cpu();
-
-- if (cpumask_test_cpu(cpu, &batch->cpumask)) {
-- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
-- local_flush_tlb();
-- trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
-- }
--
-+ if (cpumask_test_cpu(cpu, &batch->cpumask))
-+ flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN);
- if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
- flush_tlb_others(&batch->cpumask, &info);
- cpumask_clear(&batch->cpumask);
+++ /dev/null
-From ca241c75037b32e0216a68e39ad2801d04fa1f87 Mon Sep 17 00:00:00 2001
-From: Glauber de Oliveira Costa <gcosta@redhat.com>
-Date: Wed, 30 Jan 2008 13:31:31 +0100
-Subject: x86: unify tss_struct
-
-From: Glauber de Oliveira Costa <gcosta@redhat.com>
-
-commit ca241c75037b32e0216a68e39ad2801d04fa1f87 upstream.
-
-Although slightly different, the tss_struct is very similar in x86_64 and
-i386. The really different part, which matches the hardware view of it, is
-now called x86_hw_tss, and each of the architectures provides its own.
-It's then used as a field in the outer tss_struct.
-
-Signed-off-by: Glauber de Oliveira Costa <gcosta@redhat.com>
-Signed-off-by: Ingo Molnar <mingo@elte.hu>
-Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
-Signed-off-by: Eduardo Valentin <eduval@amazon.com>
-Signed-off-by: Eduardo Valentin <edubezval@gmail.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- arch/x86/include/asm/processor.h | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
---- a/arch/x86/include/asm/processor.h
-+++ b/arch/x86/include/asm/processor.h
-@@ -272,7 +272,7 @@ struct x86_hw_tss {
- u16 reserved5;
- u16 io_bitmap_base;
-
--} __attribute__((packed)) ____cacheline_aligned;
-+} __attribute__((packed));
- #endif
-
- /*