git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
3.16-stable patches
author    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Thu, 23 Oct 2014 06:47:14 +0000 (14:47 +0800)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Thu, 23 Oct 2014 06:47:14 +0000 (14:47 +0800)
added patches:
kvm-do-not-bias-the-generation-number-in-kvm_current_mmio_generation.patch
kvm-don-t-take-vcpu-mutex-for-obviously-invalid-vcpu-ioctls.patch
kvm-fix-potentially-corrupt-mmio-cache.patch
kvm-s390-unintended-fallthrough-for-external-call.patch
spi-dw-mid-check-that-dma-was-inited-before-exit.patch
spi-dw-mid-respect-8-bit-mode.patch
x86-intel-quark-switch-off-cr4.pge-so-tlb-flush-uses-cr3-instead.patch
x86-kvm-vmx-preserve-cr4-across-vm-entry.patch

queue-3.16/kvm-do-not-bias-the-generation-number-in-kvm_current_mmio_generation.patch [new file with mode: 0644]
queue-3.16/kvm-don-t-take-vcpu-mutex-for-obviously-invalid-vcpu-ioctls.patch [new file with mode: 0644]
queue-3.16/kvm-fix-potentially-corrupt-mmio-cache.patch [new file with mode: 0644]
queue-3.16/kvm-s390-unintended-fallthrough-for-external-call.patch [new file with mode: 0644]
queue-3.16/series
queue-3.16/spi-dw-mid-check-that-dma-was-inited-before-exit.patch [new file with mode: 0644]
queue-3.16/spi-dw-mid-respect-8-bit-mode.patch [new file with mode: 0644]
queue-3.16/x86-intel-quark-switch-off-cr4.pge-so-tlb-flush-uses-cr3-instead.patch [new file with mode: 0644]
queue-3.16/x86-kvm-vmx-preserve-cr4-across-vm-entry.patch [new file with mode: 0644]

diff --git a/queue-3.16/kvm-do-not-bias-the-generation-number-in-kvm_current_mmio_generation.patch b/queue-3.16/kvm-do-not-bias-the-generation-number-in-kvm_current_mmio_generation.patch
new file mode 100644 (file)
index 0000000..93da79e
--- /dev/null
@@ -0,0 +1,55 @@
+From 00f034a12fdd81210d58116326d92780aac5c238 Mon Sep 17 00:00:00 2001
+From: Paolo Bonzini <pbonzini@redhat.com>
+Date: Wed, 20 Aug 2014 14:29:21 +0200
+Subject: KVM: do not bias the generation number in kvm_current_mmio_generation
+
+From: Paolo Bonzini <pbonzini@redhat.com>
+
+commit 00f034a12fdd81210d58116326d92780aac5c238 upstream.
+
+The next patch will give a meaning (a la seqcount) to the low bit of the
+generation number.  Ensure that it matches between kvm->memslots->generation
+and kvm_current_mmio_generation().
+
+Reviewed-by: David Matlack <dmatlack@google.com>
+Reviewed-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/mmu.c  |    7 +------
+ virt/kvm/kvm_main.c |    7 +++++++
+ 2 files changed, 8 insertions(+), 6 deletions(-)
+
+--- a/arch/x86/kvm/mmu.c
++++ b/arch/x86/kvm/mmu.c
+@@ -240,12 +240,7 @@ static unsigned int get_mmio_spte_genera
+ static unsigned int kvm_current_mmio_generation(struct kvm *kvm)
+ {
+-      /*
+-       * Init kvm generation close to MMIO_MAX_GEN to easily test the
+-       * code of handling generation number wrap-around.
+-       */
+-      return (kvm_memslots(kvm)->generation +
+-                    MMIO_MAX_GEN - 150) & MMIO_GEN_MASK;
++      return kvm_memslots(kvm)->generation & MMIO_GEN_MASK;
+ }
+ static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn,
+--- a/virt/kvm/kvm_main.c
++++ b/virt/kvm/kvm_main.c
+@@ -472,6 +472,13 @@ static struct kvm *kvm_create_vm(unsigne
+       kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+       if (!kvm->memslots)
+               goto out_err_no_srcu;
++
++      /*
++       * Init kvm generation close to the maximum to easily test the
++       * code of handling generation number wrap-around.
++       */
++      kvm->memslots->generation = -150;
++
+       kvm_init_memslots_id(kvm);
+       if (init_srcu_struct(&kvm->srcu))
+               goto out_err_no_srcu;
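
With the bias moved into kvm_create_vm(), the masked MMIO generation still starts close to its maximum value, so the wrap-around handling keeps being exercised early in a VM's life. A small standalone sketch of the arithmetic (illustrative only; it assumes a 19-bit generation mask as in the 3.16 mmu code, and the exact width only changes the printed numbers):

    #include <stdio.h>

    #define MMIO_GEN_SHIFT  19
    #define MMIO_GEN_MASK   ((1u << MMIO_GEN_SHIFT) - 1)

    int main(void)
    {
            /* kvm->memslots->generation = -150 wraps to a huge 64-bit value... */
            unsigned long long generation = -150ULL;

            /* ...but masking it, as kvm_current_mmio_generation() now does,
             * lands only ~150 increments short of the wrap-around point. */
            unsigned int mmio_gen = generation & MMIO_GEN_MASK;

            printf("initial mmio generation: %u (max %u)\n", mmio_gen, MMIO_GEN_MASK);
            return 0;
    }
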
diff --git a/queue-3.16/kvm-don-t-take-vcpu-mutex-for-obviously-invalid-vcpu-ioctls.patch b/queue-3.16/kvm-don-t-take-vcpu-mutex-for-obviously-invalid-vcpu-ioctls.patch
new file mode 100644 (file)
index 0000000..3256ad4
--- /dev/null
@@ -0,0 +1,46 @@
+From 2ea75be3219571d0ec009ce20d9971e54af96e09 Mon Sep 17 00:00:00 2001
+From: David Matlack <dmatlack@google.com>
+Date: Fri, 19 Sep 2014 16:03:25 -0700
+Subject: kvm: don't take vcpu mutex for obviously invalid vcpu ioctls
+
+From: David Matlack <dmatlack@google.com>
+
+commit 2ea75be3219571d0ec009ce20d9971e54af96e09 upstream.
+
+vcpu ioctls can hang the calling thread if issued while a vcpu is running.
+However, invalid ioctls can happen when userspace tries to probe what kind
+of file descriptor it has (e.g. isatty() calls ioctl(TCGETS)); in that case,
+we know the ioctl is going to be rejected as invalid anyway and we can
+fail before trying to take the vcpu mutex.
+
+This patch does not change functionality, it just makes invalid ioctls
+fail faster.
+
+Signed-off-by: David Matlack <dmatlack@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ virt/kvm/kvm_main.c |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/virt/kvm/kvm_main.c
++++ b/virt/kvm/kvm_main.c
+@@ -52,6 +52,7 @@
+ #include <asm/processor.h>
+ #include <asm/io.h>
++#include <asm/ioctl.h>
+ #include <asm/uaccess.h>
+ #include <asm/pgtable.h>
+@@ -1989,6 +1990,9 @@ static long kvm_vcpu_ioctl(struct file *
+       if (vcpu->kvm->mm != current->mm)
+               return -EIO;
++      if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
++              return -EINVAL;
++
+ #if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
+       /*
+        * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
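
The fast-fail check works because every KVM vcpu ioctl encodes the KVMIO magic (0xAE) in its type byte, while stray probes such as isatty()'s TCGETS carry a different type and can be bounced before the vcpu mutex is even looked at. A hedged user-space sketch of the decoding (only stock Linux uapi headers and constants are assumed):

    #include <stdio.h>
    #include <sys/ioctl.h>      /* TCGETS (via <asm/ioctls.h>) */
    #include <linux/ioctl.h>    /* _IOC_TYPE() */
    #include <linux/kvm.h>      /* KVMIO, KVM_RUN */

    int main(void)
    {
            /* isatty() issues TCGETS; its type byte is not KVMIO, so
             * kvm_vcpu_ioctl() can return -EINVAL without taking the lock. */
            printf("TCGETS type=0x%x  KVM_RUN type=0x%x  KVMIO=0x%x\n",
                   (unsigned int)_IOC_TYPE(TCGETS),
                   (unsigned int)_IOC_TYPE(KVM_RUN),
                   KVMIO);
            return 0;
    }
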
diff --git a/queue-3.16/kvm-fix-potentially-corrupt-mmio-cache.patch b/queue-3.16/kvm-fix-potentially-corrupt-mmio-cache.patch
new file mode 100644 (file)
index 0000000..ba49f33
--- /dev/null
@@ -0,0 +1,191 @@
+From ee3d1570b58677885b4552bce8217fda7b226a68 Mon Sep 17 00:00:00 2001
+From: David Matlack <dmatlack@google.com>
+Date: Mon, 18 Aug 2014 15:46:06 -0700
+Subject: kvm: fix potentially corrupt mmio cache
+
+From: David Matlack <dmatlack@google.com>
+
+commit ee3d1570b58677885b4552bce8217fda7b226a68 upstream.
+
+vcpu exits and memslot mutations can run concurrently as long as the
+vcpu does not acquire the slots mutex. Thus it is theoretically possible
+for memslots to change underneath a vcpu that is handling an exit.
+
+If we increment the memslot generation number again after
+synchronize_srcu_expedited(), vcpus can safely cache memslot generation
+without maintaining a single rcu_dereference through an entire vm exit.
+And much of the x86/kvm code does not maintain a single rcu_dereference
+of the current memslots during each exit.
+
+We can prevent the following case:
+
+   vcpu (CPU 0)                             | thread (CPU 1)
+--------------------------------------------+--------------------------
+1  vm exit                                  |
+2  srcu_read_unlock(&kvm->srcu)             |
+3  decide to cache something based on       |
+     old memslots                           |
+4                                           | change memslots
+                                            | (increments generation)
+5                                           | synchronize_srcu(&kvm->srcu);
+6  retrieve generation # from new memslots  |
+7  tag cache with new memslot generation    |
+8  srcu_read_unlock(&kvm->srcu)             |
+...                                         |
+   <action based on cache occurs even       |
+    though the caching decision was based   |
+    on the old memslots>                    |
+...                                         |
+   <action *continues* to occur until next  |
+    memslot generation change, which may    |
+    be never>                               |
+                                            |
+
+By incrementing the generation after synchronizing with kvm->srcu readers,
+we ensure that the generation retrieved in (6) will become invalid soon
+after (8).
+
+Keeping the existing increment is not strictly necessary, but we
+do keep it and just move it for consistency from update_memslots to
+install_new_memslots.  It invalidates old cached MMIOs immediately,
+instead of having to wait for the end of synchronize_srcu_expedited,
+which makes the code more clearly correct in case CPU 1 is preempted
+right after synchronize_srcu() returns.
+
+To avoid halving the generation space in SPTEs, always presume that the
+low bit of the generation is zero when reconstructing a generation number
+out of an SPTE.  This effectively disables MMIO caching in SPTEs during
+the call to synchronize_srcu_expedited.  Using the low bit this way is
+somewhat like a seqcount---where the protected thing is a cache, and
+instead of retrying we can simply punt if we observe the low bit to be 1.
+
+Signed-off-by: David Matlack <dmatlack@google.com>
+Reviewed-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
+Reviewed-by: David Matlack <dmatlack@google.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ Documentation/virtual/kvm/mmu.txt |   14 ++++++++++++++
+ arch/x86/kvm/mmu.c                |   20 ++++++++++++--------
+ virt/kvm/kvm_main.c               |   23 ++++++++++++++++-------
+ 3 files changed, 42 insertions(+), 15 deletions(-)
+
+--- a/Documentation/virtual/kvm/mmu.txt
++++ b/Documentation/virtual/kvm/mmu.txt
+@@ -425,6 +425,20 @@ fault through the slow path.
+ Since only 19 bits are used to store generation-number on mmio spte, all
+ pages are zapped when there is an overflow.
++Unfortunately, a single memory access might access kvm_memslots(kvm) multiple
++times, the last one happening when the generation number is retrieved and
++stored into the MMIO spte.  Thus, the MMIO spte might be created based on
++out-of-date information, but with an up-to-date generation number.
++
++To avoid this, the generation number is incremented again after synchronize_srcu
++returns; thus, the low bit of kvm_memslots(kvm)->generation is only 1 during a
++memslot update, while some SRCU readers might be using the old copy.  We do not
+want to use MMIO sptes created with an odd generation number, and we can do
++this without losing a bit in the MMIO spte.  The low bit of the generation
++is not stored in MMIO spte, and presumed zero when it is extracted out of the
++spte.  If KVM is unlucky and creates an MMIO spte while the low bit is 1,
++the next access to the spte will always be a cache miss.
++
+ Further reading
+ ===============
+--- a/arch/x86/kvm/mmu.c
++++ b/arch/x86/kvm/mmu.c
+@@ -199,16 +199,20 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio
+ EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
+ /*
+- * spte bits of bit 3 ~ bit 11 are used as low 9 bits of generation number,
+- * the bits of bits 52 ~ bit 61 are used as high 10 bits of generation
+- * number.
++ * the low bit of the generation number is always presumed to be zero.
++ * This disables mmio caching during memslot updates.  The concept is
++ * similar to a seqcount but instead of retrying the access we just punt
++ * and ignore the cache.
++ *
++ * spte bits 3-11 are used as bits 1-9 of the generation number,
++ * the bits 52-61 are used as bits 10-19 of the generation number.
+  */
+-#define MMIO_SPTE_GEN_LOW_SHIFT               3
++#define MMIO_SPTE_GEN_LOW_SHIFT               2
+ #define MMIO_SPTE_GEN_HIGH_SHIFT      52
+-#define MMIO_GEN_SHIFT                        19
+-#define MMIO_GEN_LOW_SHIFT            9
+-#define MMIO_GEN_LOW_MASK             ((1 << MMIO_GEN_LOW_SHIFT) - 1)
++#define MMIO_GEN_SHIFT                        20
++#define MMIO_GEN_LOW_SHIFT            10
++#define MMIO_GEN_LOW_MASK             ((1 << MMIO_GEN_LOW_SHIFT) - 2)
+ #define MMIO_GEN_MASK                 ((1 << MMIO_GEN_SHIFT) - 1)
+ #define MMIO_MAX_GEN                  ((1 << MMIO_GEN_SHIFT) - 1)
+@@ -4433,7 +4437,7 @@ void kvm_mmu_invalidate_mmio_sptes(struc
+        * The very rare case: if the generation-number is round,
+        * zap all shadow pages.
+        */
+-      if (unlikely(kvm_current_mmio_generation(kvm) >= MMIO_MAX_GEN)) {
++      if (unlikely(kvm_current_mmio_generation(kvm) == 0)) {
+               printk_ratelimited(KERN_INFO "kvm: zapping shadow pages for mmio generation wraparound\n");
+               kvm_mmu_invalidate_zap_all_pages(kvm);
+       }
+--- a/virt/kvm/kvm_main.c
++++ b/virt/kvm/kvm_main.c
+@@ -95,8 +95,6 @@ static int hardware_enable_all(void);
+ static void hardware_disable_all(void);
+ static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
+-static void update_memslots(struct kvm_memslots *slots,
+-                          struct kvm_memory_slot *new, u64 last_generation);
+ static void kvm_release_pfn_dirty(pfn_t pfn);
+ static void mark_page_dirty_in_slot(struct kvm *kvm,
+@@ -685,8 +683,7 @@ static void sort_memslots(struct kvm_mem
+ }
+ static void update_memslots(struct kvm_memslots *slots,
+-                          struct kvm_memory_slot *new,
+-                          u64 last_generation)
++                          struct kvm_memory_slot *new)
+ {
+       if (new) {
+               int id = new->id;
+@@ -697,8 +694,6 @@ static void update_memslots(struct kvm_m
+               if (new->npages != npages)
+                       sort_memslots(slots);
+       }
+-
+-      slots->generation = last_generation + 1;
+ }
+ static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
+@@ -720,10 +715,24 @@ static struct kvm_memslots *install_new_
+ {
+       struct kvm_memslots *old_memslots = kvm->memslots;
+-      update_memslots(slots, new, kvm->memslots->generation);
++      /*
++       * Set the low bit in the generation, which disables SPTE caching
++       * until the end of synchronize_srcu_expedited.
++       */
++      WARN_ON(old_memslots->generation & 1);
++      slots->generation = old_memslots->generation + 1;
++
++      update_memslots(slots, new);
+       rcu_assign_pointer(kvm->memslots, slots);
+       synchronize_srcu_expedited(&kvm->srcu);
++      /*
++       * Increment the new memslot generation a second time. This prevents
++       * vm exits that race with memslot updates from caching a memslot
++       * generation that will (potentially) be valid forever.
++       */
++      slots->generation++;
++
+       kvm_arch_memslots_updated(kvm);
+       return old_memslots;
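
Stripped of the SPTE bit layout, the seqcount-like trick is small: the low generation bit is never stored in the spte, so anything cached while kvm_memslots(kvm)->generation is odd (i.e. while an update is in flight) can never match the value readers see after install_new_memslots() bumps the generation a second time. A simplified standalone model (not the kernel code):

    #include <stdio.h>

    /* Store side: the low bit is presumed zero, so an odd generation is
     * effectively recorded as generation - 1. */
    static unsigned int mmio_spte_gen(unsigned int memslots_gen)
    {
            return memslots_gen & ~1u;
    }

    int main(void)
    {
            unsigned int gen = 41;                    /* odd: update in flight */
            unsigned int cached = mmio_spte_gen(gen); /* stored as 40 */

            gen++;  /* the second increment after synchronize_srcu_expedited() */

            printf("cached=%u current=%u -> %s\n", cached, gen,
                   cached == gen ? "hit" : "guaranteed miss");
            return 0;
    }
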
diff --git a/queue-3.16/kvm-s390-unintended-fallthrough-for-external-call.patch b/queue-3.16/kvm-s390-unintended-fallthrough-for-external-call.patch
new file mode 100644 (file)
index 0000000..d471e08
--- /dev/null
@@ -0,0 +1,29 @@
+From f346026e55f1efd3949a67ddd1dcea7c1b9a615e Mon Sep 17 00:00:00 2001
+From: Christian Borntraeger <borntraeger@de.ibm.com>
+Date: Wed, 3 Sep 2014 16:21:32 +0200
+Subject: KVM: s390: unintended fallthrough for external call
+
+From: Christian Borntraeger <borntraeger@de.ibm.com>
+
+commit f346026e55f1efd3949a67ddd1dcea7c1b9a615e upstream.
+
+We must not fall through if the conditions for an external call are not met.
+
+Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
+Reviewed-by: Thomas Huth <thuth@linux.vnet.ibm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/s390/kvm/interrupt.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/s390/kvm/interrupt.c
++++ b/arch/s390/kvm/interrupt.c
+@@ -85,6 +85,7 @@ static int __interrupt_is_deliverable(st
+                       return 0;
+               if (vcpu->arch.sie_block->gcr[0] & 0x2000ul)
+                       return 1;
++              return 0;
+       case KVM_S390_INT_EMERGENCY:
+               if (psw_extint_disabled(vcpu))
+                       return 0;
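
The single added line matters because __interrupt_is_deliverable() is a switch over interrupt types: without it, an external call whose control-register conditions are not satisfied falls through into the emergency-signal checks and may be reported as deliverable. A reduced, self-contained illustration of that shape (placeholder names, not the s390 source):

    enum irq_type { EXTERNAL_CALL, EMERGENCY_SIGNAL };

    static int deliverable(enum irq_type type, int ext_disabled,
                           int extcall_enabled, int emergency_enabled)
    {
            switch (type) {
            case EXTERNAL_CALL:
                    if (ext_disabled)
                            return 0;
                    if (extcall_enabled)
                            return 1;
                    return 0;        /* the fix: stop here instead of falling
                                      * through into the emergency checks */
            case EMERGENCY_SIGNAL:
                    if (ext_disabled)
                            return 0;
                    return emergency_enabled;
            }
            return 0;
    }

    int main(void)
    {
            /* external-call conditions not met: must be 0, not the emergency result */
            return deliverable(EXTERNAL_CALL, 0, 0, 1) == 0 ? 0 : 1;
    }
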
diff --git a/queue-3.16/series b/queue-3.16/series
index 3eb1f9019f07f2a8f88f6b2f0ad5053a5140115f..f05f188837231ca385301a3eb744ccd0ba96b0bd 100644 (file)
@@ -11,3 +11,11 @@ fs-add-a-missing-permission-check-to-do_umount.patch
 usb-pch_udc-usb-gadget-device-support-for-intel-quark-x1000.patch
 pci_ids-add-support-for-intel-quark-ilb.patch
 kvm-x86-fix-stale-mmio-cache-bug.patch
+kvm-fix-potentially-corrupt-mmio-cache.patch
+kvm-do-not-bias-the-generation-number-in-kvm_current_mmio_generation.patch
+kvm-s390-unintended-fallthrough-for-external-call.patch
+kvm-don-t-take-vcpu-mutex-for-obviously-invalid-vcpu-ioctls.patch
+x86-kvm-vmx-preserve-cr4-across-vm-entry.patch
+x86-intel-quark-switch-off-cr4.pge-so-tlb-flush-uses-cr3-instead.patch
+spi-dw-mid-respect-8-bit-mode.patch
+spi-dw-mid-check-that-dma-was-inited-before-exit.patch
diff --git a/queue-3.16/spi-dw-mid-check-that-dma-was-inited-before-exit.patch b/queue-3.16/spi-dw-mid-check-that-dma-was-inited-before-exit.patch
new file mode 100644 (file)
index 0000000..0b6d76b
--- /dev/null
@@ -0,0 +1,32 @@
+From fb57862ead652454ceeb659617404c5f13bc34b5 Mon Sep 17 00:00:00 2001
+From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+Date: Fri, 12 Sep 2014 15:11:58 +0300
+Subject: spi: dw-mid: check that DMA was inited before exit
+
+From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+
+commit fb57862ead652454ceeb659617404c5f13bc34b5 upstream.
+
+If the driver was compiled with DMA support, but DMA channels weren't acquired
+for some reason, mid_spi_dma_exit() will crash the kernel.
+
+Fixes: 7063c0d942a1 (spi/dw_spi: add DMA support)
+Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+Signed-off-by: Mark Brown <broonie@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/spi/spi-dw-mid.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/drivers/spi/spi-dw-mid.c
++++ b/drivers/spi/spi-dw-mid.c
+@@ -89,6 +89,8 @@ err_exit:
+ static void mid_spi_dma_exit(struct dw_spi *dws)
+ {
++      if (!dws->dma_inited)
++              return;
+       dma_release_channel(dws->txchan);
+       dma_release_channel(dws->rxchan);
+ }
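
The crash scenario is plain: with DMA support compiled in but channel allocation having failed, dws->dma_inited is still 0 and the channel pointers were never set, so calling dma_release_channel() on them dereferences junk. A generic, self-contained illustration of the guard pattern (not the driver code; chan_release() merely stands in for dma_release_channel(), which dereferences its argument):

    #include <stddef.h>
    #include <stdio.h>

    struct chan { int busy; };

    struct dev {
            int          dma_inited;
            struct chan *txchan;
            struct chan *rxchan;
    };

    static void chan_release(struct chan *c)
    {
            c->busy = 0;            /* dereferences c */
    }

    static void dma_exit(struct dev *d)
    {
            if (!d->dma_inited)     /* the added guard */
                    return;
            chan_release(d->txchan);
            chan_release(d->rxchan);
    }

    int main(void)
    {
            struct dev d = { .dma_inited = 0, .txchan = NULL, .rxchan = NULL };

            dma_exit(&d);           /* safe no-op; without the guard this would
                                     * dereference a NULL (or stale) pointer */
            puts("exit path skipped safely");
            return 0;
    }
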
diff --git a/queue-3.16/spi-dw-mid-respect-8-bit-mode.patch b/queue-3.16/spi-dw-mid-respect-8-bit-mode.patch
new file mode 100644 (file)
index 0000000..a586a87
--- /dev/null
@@ -0,0 +1,40 @@
+From b41583e7299046abdc578c33f25ed83ee95b9b31 Mon Sep 17 00:00:00 2001
+From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+Date: Thu, 18 Sep 2014 20:08:51 +0300
+Subject: spi: dw-mid: respect 8 bit mode
+
+From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+
+commit b41583e7299046abdc578c33f25ed83ee95b9b31 upstream.
+
+In case of 8-bit mode and DMA usage we end up with every second byte written as
+0. We have to respect the bits_per_word setting, which is what this patch does.
+
+Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
+Signed-off-by: Mark Brown <broonie@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/spi/spi-dw-mid.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/spi/spi-dw-mid.c
++++ b/drivers/spi/spi-dw-mid.c
+@@ -136,7 +136,7 @@ static int mid_spi_dma_transfer(struct d
+       txconf.dst_addr = dws->dma_addr;
+       txconf.dst_maxburst = LNW_DMA_MSIZE_16;
+       txconf.src_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES;
+-      txconf.dst_addr_width = DMA_SLAVE_BUSWIDTH_2_BYTES;
++      txconf.dst_addr_width = dws->dma_width;
+       txconf.device_fc = false;
+       txchan->device->device_control(txchan, DMA_SLAVE_CONFIG,
+@@ -159,7 +159,7 @@ static int mid_spi_dma_transfer(struct d
+       rxconf.src_addr = dws->dma_addr;
+       rxconf.src_maxburst = LNW_DMA_MSIZE_16;
+       rxconf.dst_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES;
+-      rxconf.src_addr_width = DMA_SLAVE_BUSWIDTH_2_BYTES;
++      rxconf.src_addr_width = dws->dma_width;
+       rxconf.device_fc = false;
+       rxchan->device->device_control(rxchan, DMA_SLAVE_CONFIG,
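
Hardcoding a 2-byte slave width corrupts 8-bit transfers because the DMA engine then moves 16 bits per FIFO slot even though the SPI core queued 8-bit frames, so every other byte reaching the wire is 0. A hedged sketch of the mapping the fix effectively restores by using dws->dma_width (the helper and constants below are illustrative stand-ins, not the driver's code):

    #include <stdio.h>

    /* Local stand-ins for the dmaengine bus-width constants. */
    enum buswidth { BUSWIDTH_1_BYTE = 1, BUSWIDTH_2_BYTES = 2 };

    static enum buswidth dma_width_for(unsigned int bits_per_word)
    {
            return bits_per_word <= 8 ? BUSWIDTH_1_BYTE : BUSWIDTH_2_BYTES;
    }

    int main(void)
    {
            printf("8-bit frames  -> %d-byte DMA accesses\n", dma_width_for(8));
            printf("16-bit frames -> %d-byte DMA accesses\n", dma_width_for(16));
            return 0;
    }
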
diff --git a/queue-3.16/x86-intel-quark-switch-off-cr4.pge-so-tlb-flush-uses-cr3-instead.patch b/queue-3.16/x86-intel-quark-switch-off-cr4.pge-so-tlb-flush-uses-cr3-instead.patch
new file mode 100644 (file)
index 0000000..7f94b69
--- /dev/null
@@ -0,0 +1,53 @@
+From ee1b5b165c0a2f04d2107e634e51f05d0eb107de Mon Sep 17 00:00:00 2001
+From: Bryan O'Donoghue <pure.logic@nexus-software.ie>
+Date: Wed, 24 Sep 2014 00:26:24 +0100
+Subject: x86/intel/quark: Switch off CR4.PGE so TLB flush uses CR3 instead
+
+From: Bryan O'Donoghue <pure.logic@nexus-software.ie>
+
+commit ee1b5b165c0a2f04d2107e634e51f05d0eb107de upstream.
+
+Quark X1000 advertises PGE via the standard CPUID method, and PGE bits
+exist in Quark X1000's PTEs. However, in order to flush an individual
+PTE it is necessary to reload CR3 irrespective of the PTE.PGE bit.
+
+See Quark Core_DevMan_001.pdf section 6.4.11
+
+This bug was fixed in Galileo kernels; unfixed vanilla kernels are expected to
+crash and burn on this platform.
+
+Signed-off-by: Bryan O'Donoghue <pure.logic@nexus-software.ie>
+Cc: Borislav Petkov <bp@alien8.de>
+Link: http://lkml.kernel.org/r/1411514784-14885-1-git-send-email-pure.logic@nexus-software.ie
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kernel/cpu/intel.c |   15 +++++++++++++++
+ 1 file changed, 15 insertions(+)
+
+--- a/arch/x86/kernel/cpu/intel.c
++++ b/arch/x86/kernel/cpu/intel.c
+@@ -144,6 +144,21 @@ static void early_init_intel(struct cpui
+                       setup_clear_cpu_cap(X86_FEATURE_ERMS);
+               }
+       }
++
++      /*
++       * Intel Quark Core DevMan_001.pdf section 6.4.11
++       * "The operating system also is required to invalidate (i.e., flush)
++       *  the TLB when any changes are made to any of the page table entries.
++       *  The operating system must reload CR3 to cause the TLB to be flushed"
++       *
++       * As a result cpu_has_pge() in arch/x86/include/asm/tlbflush.h should
+       * be false so that __flush_tlb_all() causes CR3 instead of CR4.PGE
++       * to be modified
++       */
++      if (c->x86 == 5 && c->x86_model == 9) {
++              pr_info("Disabling PGE capability bit\n");
++              setup_clear_cpu_cap(X86_FEATURE_PGE);
++      }
+ }
+ #ifdef CONFIG_X86_32
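
Clearing X86_FEATURE_PGE is enough because the generic flush helpers branch on that capability: with PGE reported, a full flush is done by toggling CR4.PGE, otherwise by reloading CR3, which is the only variant Quark X1000 honours. A simplified standalone model of that dispatch (patterned after __flush_tlb_all() in arch/x86/include/asm/tlbflush.h, not copied from it):

    #include <stdio.h>

    static void flush_tlb_global(void) { puts("toggle CR4.PGE"); } /* flushes global pages too */
    static void flush_tlb_local(void)  { puts("reload CR3"); }     /* non-global flush */

    static void flush_tlb_all_model(int cpu_has_pge)
    {
            if (cpu_has_pge)
                    flush_tlb_global();
            else
                    flush_tlb_local();
    }

    int main(void)
    {
            /* Quark X1000 after the patch: PGE cleared, full flush reloads CR3 */
            flush_tlb_all_model(0);
            return 0;
    }
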
diff --git a/queue-3.16/x86-kvm-vmx-preserve-cr4-across-vm-entry.patch b/queue-3.16/x86-kvm-vmx-preserve-cr4-across-vm-entry.patch
new file mode 100644 (file)
index 0000000..ab8cb06
--- /dev/null
@@ -0,0 +1,82 @@
+From d974baa398f34393db76be45f7d4d04fbdbb4a0a Mon Sep 17 00:00:00 2001
+From: Andy Lutomirski <luto@amacapital.net>
+Date: Wed, 8 Oct 2014 09:02:13 -0700
+Subject: x86,kvm,vmx: Preserve CR4 across VM entry
+
+From: Andy Lutomirski <luto@amacapital.net>
+
+commit d974baa398f34393db76be45f7d4d04fbdbb4a0a upstream.
+
+CR4 isn't constant; at least the TSD and PCE bits can vary.
+
+TBH, treating CR0 and CR3 as constant scares me a bit, too, but it looks
+like it's correct.
+
+This adds a branch and a read from cr4 to each vm entry.  Because it is
+extremely likely that consecutive entries into the same vcpu will have
+the same host cr4 value, this fixes up the vmcs instead of restoring cr4
+after the fact.  A subsequent patch will add a kernel-wide cr4 shadow,
+reducing the overhead in the common case to just two memory reads and a
+branch.
+
+Signed-off-by: Andy Lutomirski <luto@amacapital.net>
+Acked-by: Paolo Bonzini <pbonzini@redhat.com>
+Cc: Petr Matousek <pmatouse@redhat.com>
+Cc: Gleb Natapov <gleb@kernel.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/kvm/vmx.c |   16 ++++++++++++++--
+ 1 file changed, 14 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/kvm/vmx.c
++++ b/arch/x86/kvm/vmx.c
+@@ -450,6 +450,7 @@ struct vcpu_vmx {
+               int           gs_ldt_reload_needed;
+               int           fs_reload_needed;
+               u64           msr_host_bndcfgs;
++              unsigned long vmcs_host_cr4;    /* May not match real cr4 */
+       } host_state;
+       struct {
+               int vm86_active;
+@@ -4218,11 +4219,16 @@ static void vmx_set_constant_host_state(
+       u32 low32, high32;
+       unsigned long tmpl;
+       struct desc_ptr dt;
++      unsigned long cr4;
+       vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS);  /* 22.2.3 */
+-      vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
+       vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
++      /* Save the most likely value for this task's CR4 in the VMCS. */
++      cr4 = read_cr4();
++      vmcs_writel(HOST_CR4, cr4);                     /* 22.2.3, 22.2.5 */
++      vmx->host_state.vmcs_host_cr4 = cr4;
++
+       vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
+ #ifdef CONFIG_X86_64
+       /*
+@@ -7336,7 +7342,7 @@ static void atomic_switch_perf_msrs(stru
+ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
+ {
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+-      unsigned long debugctlmsr;
++      unsigned long debugctlmsr, cr4;
+       /* Record the guest's net vcpu time for enforced NMI injections. */
+       if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
+@@ -7357,6 +7363,12 @@ static void __noclone vmx_vcpu_run(struc
+       if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
+               vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
++      cr4 = read_cr4();
++      if (unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) {
++              vmcs_writel(HOST_CR4, cr4);
++              vmx->host_state.vmcs_host_cr4 = cr4;
++      }
++
+       /* When single-stepping over STI and MOV SS, we must clear the
+        * corresponding interruptibility bits in the guest state. Otherwise
+        * vmentry fails as it then expects bit 14 (BS) in pending debug